From 512d31510694ad922607fc395ad76993e435c31a Mon Sep 17 00:00:00 2001
From: Qiao Ma
Date: Sun, 5 Mar 2023 22:29:38 +0800
Subject: [PATCH 1/2] first commit

The source code comes from MLNX_OFED_LINUX-5.8-1.1.2.1-alios7.2-aarch64.tgz.

Signed-off-by: Qiao Ma
---
 licenses/BSD 3-Clause | 13 + licenses/GPL-2.0 | 359 + mlnx-ofa_kernel.spec | 723 + src/mlnx-ofa_kernel-5.8/.gitignore | 112 + src/mlnx-ofa_kernel-5.8/COPYING | 356 + .../Documentation/infiniband/tag_matching.txt | 50 + .../Documentation/release_notes-storage.txt | 170 + src/mlnx-ofa_kernel-5.8/LINUX_BASE_BRANCH | 1 + src/mlnx-ofa_kernel-5.8/Makefile | 1 + src/mlnx-ofa_kernel-5.8/Module.supported | 74 + src/mlnx-ofa_kernel-5.8/README | 159 + .../0001-BACKPORT-block-blk-mq-rdma.c.patch | 52 + ...02-BACKPORT-drivers-base-auxiliary.c.patch | 125 + ...KPORT-drivers-infiniband-core-addr.c.patch | 253 + ...PORT-drivers-infiniband-core-cache.c.patch | 72 + ...ORT-drivers-infiniband-core-cgroup.c.patch | 25 + ...ACKPORT-drivers-infiniband-core-cm.c.patch | 376 + ...T-drivers-infiniband-core-cm_trace.c.patch | 18 + ...CKPORT-drivers-infiniband-core-cma.c.patch | 390 + ...ivers-infiniband-core-cma_configfs.c.patch | 245 + ...-drivers-infiniband-core-cma_trace.h.patch | 24 + ...-drivers-infiniband-core-core_priv.h.patch | 93 + ...T-drivers-infiniband-core-counters.c.patch | 47 + ...ACKPORT-drivers-infiniband-core-cq.c.patch | 206 + ...ORT-drivers-infiniband-core-device.c.patch | 309 + ...KPORT-drivers-infiniband-core-iwcm.c.patch | 85 + ...-drivers-infiniband-core-iwpm_util.c.patch | 123 + ...CKPORT-drivers-infiniband-core-lag.c.patch | 32 + ...CKPORT-drivers-infiniband-core-mad.c.patch | 445 + ...T-drivers-infiniband-core-mad_rmpp.c.patch | 24 + ...RT-drivers-infiniband-core-netlink.c.patch | 123 + ...PORT-drivers-infiniband-core-nldev.c.patch | 633 + ...T-drivers-infiniband-core-peer_mem.c.patch | 163 + ...-drivers-infiniband-core-rdma_core.c.patch | 86 + ...vers-infiniband-core-roce_gid_mgmt.c.patch | 397 + ...T-drivers-infiniband-core-sa_query.c.patch | 87 + ...PORT-drivers-infiniband-core-trace.c.patch | 19 + ...KPORT-drivers-infiniband-core-ucma.c.patch | 104 + ...-drivers-infiniband-core-ud_header.c.patch | 19 + ...rivers-infiniband-core-umem_dmabuf.c.patch | 97 + ...T-drivers-infiniband-core-umem_odp.c.patch | 1348 ++ ...T-drivers-infiniband-core-user_mad.c.patch | 172 + ...ORT-drivers-infiniband-core-uverbs.h.patch | 18 + ...drivers-infiniband-core-uverbs_cmd.c.patch | 58 + ...ivers-infiniband-core-uverbs_ioctl.c.patch | 40 + ...rivers-infiniband-core-uverbs_main.c.patch | 209 + ...rivers-infiniband-core-uverbs_uapi.c.patch | 35 + ...PORT-drivers-infiniband-core-verbs.c.patch | 133 + ...-drivers-infiniband-debug-memtrack.c.patch | 38 + ...-drivers-infiniband-hw-mlx5-Makefile.patch | 26 + ...PORT-drivers-infiniband-hw-mlx5-cq.c.patch | 79 + ...RT-drivers-infiniband-hw-mlx5-devx.c.patch | 42 + ...rivers-infiniband-hw-mlx5-doorbell.c.patch | 47 + ...PORT-drivers-infiniband-hw-mlx5-fs.c.patch | 45 + ...drivers-infiniband-hw-mlx5-ib_virt.c.patch | 25 + ...RT-drivers-infiniband-hw-mlx5-main.c.patch | 96 + ...ORT-drivers-infiniband-hw-mlx5-mem.c.patch | 22 + ...drivers-infiniband-hw-mlx5-mlx5_ib.h.patch | 66 + ...PORT-drivers-infiniband-hw-mlx5-mr.c.patch | 230 + ...ORT-drivers-infiniband-hw-mlx5-odp.c.patch | 293 + ...PORT-drivers-infiniband-hw-mlx5-qp.c.patch | 116 + ...ORT-drivers-infiniband-hw-mlx5-srq.c.patch | 31 + ...drivers-infiniband-hw-mlx5-srq_cmd.c.patch | 19 + ...PORT-drivers-infiniband-hw-mlx5-wr.c.patch | 18 + 
...drivers-infiniband-ulp-ipoib-ipoib.h.patch | 68 + ...vers-infiniband-ulp-ipoib-ipoib_cm.c.patch | 105 + ...-infiniband-ulp-ipoib-ipoib_ethtool..patch | 160 + ...vers-infiniband-ulp-ipoib-ipoib_fs.c.patch | 104 + ...vers-infiniband-ulp-ipoib-ipoib_ib.c.patch | 113 + ...rs-infiniband-ulp-ipoib-ipoib_main.c.patch | 773 + ...-infiniband-ulp-ipoib-ipoib_multicas.patch | 29 + ...-infiniband-ulp-ipoib-ipoib_netlink..patch | 79 + ...rs-infiniband-ulp-ipoib-ipoib_vlan.c.patch | 66 + ...ers-infiniband-ulp-iser-iscsi_iser.c.patch | 264 + ...ers-infiniband-ulp-iser-iscsi_iser.h.patch | 82 + ...-infiniband-ulp-iser-iser_initiator..patch | 83 + ...rs-infiniband-ulp-iser-iser_memory.c.patch | 449 + ...vers-infiniband-ulp-isert-ib_isert.c.patch | 272 + ...-drivers-infiniband-ulp-srp-ib_srp.h.patch | 108 + ...-net-ethernet-mellanox-mlx5-core-Mak.patch | 43 + ...-net-ethernet-mellanox-mlx5-core-acc.patch | 41 + ...-net-ethernet-mellanox-mlx5-core-acc.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-all.patch | 31 + ...-net-ethernet-mellanox-mlx5-core-cmd.patch | 174 + ...-net-ethernet-mellanox-mlx5-core-cq..patch | 30 + ...-net-ethernet-mellanox-mlx5-core-crd.patch | 54 + ...-net-ethernet-mellanox-mlx5-core-dev.patch | 353 + ...-net-ethernet-mellanox-mlx5-core-dia.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-dia.patch | 88 + ...-net-ethernet-mellanox-mlx5-core-dia.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-dia.patch | 81 + ...-net-ethernet-mellanox-mlx5-core-dia.patch | 101 + ...-net-ethernet-mellanox-mlx5-core-dia.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-en..patch | 672 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 114 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 58 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 98 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 45 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 202 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 39 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 19 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 136 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 57 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 84 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 68 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 157 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 29 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 687 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 216 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 155 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 48 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 20 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 75 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 36 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 193 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 49 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 291 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 183 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 98 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 73 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 27 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 94 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 414 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 47 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 274 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 58 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 188 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 90 + 
...-net-ethernet-mellanox-mlx5-core-en-.patch | 84 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 37 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 97 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 62 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 43 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 212 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 40 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 40 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 301 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 95 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 93 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 86 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 100 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 34 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 110 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 73 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 83 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 81 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 28 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 96 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 22 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 166 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 1271 ++ ...-net-ethernet-mellanox-mlx5-core-en_.patch | 110 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 209 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 689 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 52 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 887 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 38 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 531 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 147 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 92 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 76 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 177 + ...-net-ethernet-mellanox-mlx5-core-eq..patch | 176 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 23 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 23 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 230 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 148 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 302 + ...-net-ethernet-mellanox-mlx5-core-fpg.patch | 20 + ...-net-ethernet-mellanox-mlx5-core-fpg.patch | 33 + ...-net-ethernet-mellanox-mlx5-core-fpg.patch | 76 + ...-net-ethernet-mellanox-mlx5-core-fpg.patch | 36 + ...-net-ethernet-mellanox-mlx5-core-fs_.patch | 262 + ...-net-ethernet-mellanox-mlx5-core-fs_.patch | 43 + ...-net-ethernet-mellanox-mlx5-core-fs_.patch | 187 + ...-net-ethernet-mellanox-mlx5-core-fw..patch | 60 + ...-net-ethernet-mellanox-mlx5-core-fw_.patch | 209 + ...-net-ethernet-mellanox-mlx5-core-fw_.patch | 20 + ...-net-ethernet-mellanox-mlx5-core-hea.patch | 344 + ...-net-ethernet-mellanox-mlx5-core-ipo.patch | 193 + ...-net-ethernet-mellanox-mlx5-core-ipo.patch | 205 + ...-net-ethernet-mellanox-mlx5-core-ipo.patch | 33 + ...-net-ethernet-mellanox-mlx5-core-ipo.patch | 73 + ...-net-ethernet-mellanox-mlx5-core-lag.patch | 34 + ...-net-ethernet-mellanox-mlx5-core-lag.patch | 559 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 596 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 40 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 32 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 20 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 71 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 26 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 27 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 33 + ...-net-ethernet-mellanox-mlx5-core-lib.patch | 199 + 
...-net-ethernet-mellanox-mlx5-core-mai.patch | 539 + ...-net-ethernet-mellanox-mlx5-core-mlx.patch | 70 + ...-net-ethernet-mellanox-mlx5-core-por.patch | 26 + ...-net-ethernet-mellanox-mlx5-core-sri.patch | 125 + ...-net-ethernet-mellanox-mlx5-core-ste.patch | 35 + ...-net-ethernet-mellanox-mlx5-core-ste.patch | 23 + ...-net-ethernet-mellanox-mlx5-core-uar.patch | 24 + ...-net-ethernet-mellanox-mlx5-core-vpo.patch | 18 + ...-net-ethernet-mellanox-mlx5-core-wq..patch | 22 + ...-net-ethernet-mellanox-mlxfw-mlxfw.h.patch | 82 + ...-net-ethernet-mellanox-mlxfw-mlxfw_f.patch | 621 + ...-net-ethernet-mellanox-mlxfw-mlxfw_m.patch | 18 + ...0-BACKPORT-drivers-nvme-host-ioctl.c.patch | 150 + ...ACKPORT-drivers-nvme-host-nvfs-dma.h.patch | 183 + ...CKPORT-drivers-nvme-host-nvfs-rdma.h.patch | 47 + ...217-BACKPORT-drivers-nvme-host-tcp.c.patch | 391 + ...218-BACKPORT-drivers-nvme-host-zns.c.patch | 24 + ...PORT-drivers-nvme-target-admin-cmd.c.patch | 159 + ...KPORT-drivers-nvme-target-configfs.c.patch | 387 + ...-BACKPORT-drivers-nvme-target-core.c.patch | 198 + ...22-BACKPORT-drivers-nvme-target-fc.c.patch | 109 + ...ACKPORT-drivers-nvme-target-fcloop.c.patch | 28 + ...RT-drivers-nvme-target-io-cmd-bdev.c.patch | 436 + ...RT-drivers-nvme-target-io-cmd-file.c.patch | 109 + ...BACKPORT-drivers-nvme-target-nvmet.h.patch | 100 + ...KPORT-drivers-nvme-target-passthru.c.patch | 137 + ...-BACKPORT-drivers-nvme-target-rdma.c.patch | 90 + ...T-drivers-nvme-target-rdma_offload.c.patch | 61 + ...1-BACKPORT-drivers-nvme-target-tcp.c.patch | 96 + ...BACKPORT-drivers-nvme-target-trace.h.patch | 21 + ...33-BACKPORT-drivers-scsi-scsi_priv.h.patch | 41 + ...RT-drivers-scsi-scsi_transport_srp.c.patch | 54 + ...CKPORT-include-linux-auxiliary_bus.h.patch | 42 + ...BACKPORT-include-linux-mlx5-driver.h.patch | 129 + ...ACKPORT-include-linux-mlx5-eswitch.h.patch | 22 + ...238-BACKPORT-include-linux-mlx5-fs.h.patch | 18 + ...9-BACKPORT-include-linux-mlx5-port.h.patch | 39 + ...PORT-include-linux-mod_devicetable.h.patch | 29 + ...KPORT-include-linux-nvme-fc-driver.h.patch | 44 + ...2-BACKPORT-include-linux-nvme-rdma.h.patch | 27 + .../0243-BACKPORT-include-linux-nvme.h.patch | 27 + ...PORT-include-linux-sunrpc-svc_rdma.h.patch | 189 + ...0245-BACKPORT-include-rdma-ib_addr.h.patch | 92 + ...0246-BACKPORT-include-rdma-ib_umem.h.patch | 126 + ...-BACKPORT-include-rdma-ib_umem_odp.h.patch | 201 + ...248-BACKPORT-include-rdma-ib_verbs.h.patch | 431 + .../0249-BACKPORT-include-rdma-lag.h.patch | 22 + ...BACKPORT-include-rdma-rdma_counter.h.patch | 37 + ...BACKPORT-include-rdma-rdma_netlink.h.patch | 34 + ...BACKPORT-include-rdma-uverbs_ioctl.h.patch | 41 + ...CKPORT-include-trace-events-ib_mad.h.patch | 54 + ...KPORT-include-trace-events-ib_umad.h.patch | 23 + ...KPORT-include-trace-events-rpcrdma.h.patch | 35 + .../0256-BACKPORT-net-mlxdevm-mlxdevm.c.patch | 178 + ...RT-net-sunrpc-xprtrdma-backchannel.c.patch | 249 + ...KPORT-net-sunrpc-xprtrdma-frwr_ops.c.patch | 237 + ...ACKPORT-net-sunrpc-xprtrdma-module.c.patch | 29 + ...KPORT-net-sunrpc-xprtrdma-rpc_rdma.c.patch | 460 + ...KPORT-net-sunrpc-xprtrdma-svc_rdma.c.patch | 55 + ...nrpc-xprtrdma-svc_rdma_backchannel.c.patch | 172 + ...T-net-sunrpc-xprtrdma-svc_rdma_pcl.c.patch | 62 + ...-sunrpc-xprtrdma-svc_rdma_recvfrom.c.patch | 1013 + ...et-sunrpc-xprtrdma-svc_rdma_sendto.c.patch | 813 + ...sunrpc-xprtrdma-svc_rdma_transport.c.patch | 278 + ...PORT-net-sunrpc-xprtrdma-transport.c.patch | 407 + ...KPORT-drivers-infiniband-core-umem.c.patch | 683 + 
...RT-net-sunrpc-xprtrdma-svc_rdma_rw.c.patch | 939 + ...BACKPORT-net-sunrpc-xprtrdma-verbs.c.patch | 297 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 3011 +++ ...PORT-net-sunrpc-xprtrdma-xprt_rdma.h.patch | 172 + ...PORT-drivers-infiniband-core-sysfs.c.patch | 79 + ...ORT-drivers-nvme-host-fault_inject.c.patch | 22 + ...-net-ethernet-mellanox-mlx5-core-dev.patch | 33 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 1411 ++ ...2-BACKPORT-drivers-nvme-host-trace.h.patch | 46 + ...-net-ethernet-mellanox-mlx5-core-dev.patch | 797 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 24 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 322 + ...-net-ethernet-mellanox-mlx5-core-lag.patch | 22 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 111 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 385 + ...-net-ethernet-mellanox-mlx5-core-sf-.patch | 37 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 33 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 19 + ...-net-ethernet-mellanox-mlx5-core-lag.patch | 199 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 50 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 19 + ...-net-ethernet-mellanox-mlx5-core-lag.patch | 20 + ...-net-ethernet-mellanox-mlx5-core-pci.patch | 156 + ...-net-ethernet-mellanox-mlx5-core-esw.patch | 18 + ...-net-ethernet-mellanox-mlx5-core-lag.patch | 39 + ...ACKPORT-drivers-infiniband-core-rw.c.patch | 249 + ...-net-ethernet-mellanox-mlx5-core-lag.patch | 33 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 30 + ...-net-ethernet-mellanox-mlx5-core-irq.patch | 75 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 34 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 22 + ...-net-ethernet-mellanox-mlx5-core-mlx.patch | 66 + ...-net-ethernet-mellanox-mlx5-core-sf-.patch | 27 + ...-net-ethernet-mellanox-mlx5-core-sf-.patch | 21 + ...-net-ethernet-mellanox-mlx5-core-en_.patch | 119 + ...rivers-infiniband-hw-mlx5-main_ext.c.patch | 81 + ...ers-infiniband-ulp-iser-iser_verbs.c.patch | 22 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 67 + ...-drivers-infiniband-ulp-srp-ib_srp.c.patch | 968 + ...-net-ethernet-mellanox-mlx5-core-ecp.patch | 54 + .../0289-BACKPORT-include-net-psample.h.patch | 19 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 51 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 40 + ...-net-ethernet-mellanox-mlx5-core-sri.patch | 120 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 24 + .../0292-BACKPORT-include-rdma-ib.h.patch | 20 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 64 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 40 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 98 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 29 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 25 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 27 + ...-BACKPORT-drivers-nvme-target-loop.c.patch | 214 + ...07-BACKPORT-drivers-nvme-host-nvme.h.patch | 358 + ...08-BACKPORT-drivers-nvme-host-core.c.patch | 1869 ++ ...0309-BACKPORT-drivers-nvme-host-fc.c.patch | 340 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 46 + ...CKPORT-drivers-nvme-host-multipath.c.patch | 422 + ...11-BACKPORT-drivers-nvme-host-rdma.c.patch | 682 + ...BACKPORT-drivers-nvme-host-fabrics.c.patch | 106 + ...313-BACKPORT-drivers-nvme-host-pci.c.patch | 1791 ++ ...4-BACKPORT-drivers-nvme-target-zns.c.patch | 192 + ...vers-infiniband-ulp-isert-ib_isert.h.patch | 34 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 27 + ...-net-ethernet-mellanox-mlx5-core-en-.patch | 21 + 
...-net-ethernet-mellanox-mlx5-core-sf-.patch | 40 + ...-net-ethernet-mellanox-mlx5-core-sf-.patch | 34 + src/mlnx-ofa_kernel-5.8/block/blk-mq-rdma.c | 44 + src/mlnx-ofa_kernel-5.8/code-metrics.txt | 9 + src/mlnx-ofa_kernel-5.8/compat/Makefile.am | 0 src/mlnx-ofa_kernel-5.8/compat/Makefile.real | 64 + src/mlnx-ofa_kernel-5.8/compat/autogen.sh | 20 + src/mlnx-ofa_kernel-5.8/compat/bitmap.c | 59 + src/mlnx-ofa_kernel-5.8/compat/build/Makefile | 48 + src/mlnx-ofa_kernel-5.8/compat/cls_api.c | 216 + src/mlnx-ofa_kernel-5.8/compat/compat-3.11.c | 369 + src/mlnx-ofa_kernel-5.8/compat/compat-3.12.c | 98 + src/mlnx-ofa_kernel-5.8/compat/compat-3.13.c | 45 + src/mlnx-ofa_kernel-5.8/compat/compat-3.15.c | 13 + src/mlnx-ofa_kernel-5.8/compat/compat-3.16.c | 70 + src/mlnx-ofa_kernel-5.8/compat/compat-3.18.c | 13 + src/mlnx-ofa_kernel-5.8/compat/compat-4.1.c | 48 + .../compat/compat_atomic.c | 35 + .../compat/compat_firmware_class.c | 764 + .../compat/config/.gitignore | 8 + .../compat/config/build-linux.m4 | 735 + .../compat/config/parallel-build.m4 | 155 + src/mlnx-ofa_kernel-5.8/compat/config/rdma.m4 | 15838 ++++++++++++++++ .../compat/config/warning_filter.sh | 20 + src/mlnx-ofa_kernel-5.8/compat/configure.ac | 56 + src/mlnx-ofa_kernel-5.8/compat/cordic.c | 105 + src/mlnx-ofa_kernel-5.8/compat/crc8.c | 93 + src/mlnx-ofa_kernel-5.8/compat/dim.c | 83 + src/mlnx-ofa_kernel-5.8/compat/exec.c | 15 + src/mlnx-ofa_kernel-5.8/compat/file.c | 49 + .../compat/flow_dissector.c | 1483 ++ src/mlnx-ofa_kernel-5.8/compat/flow_offload.c | 263 + src/mlnx-ofa_kernel-5.8/compat/idr.c | 57 + .../compat/interval_tree.c | 18 + src/mlnx-ofa_kernel-5.8/compat/kfifo.c | 629 + src/mlnx-ofa_kernel-5.8/compat/kstrtox.c | 249 + src/mlnx-ofa_kernel-5.8/compat/macsec.c | 72 + src/mlnx-ofa_kernel-5.8/compat/main.c | 104 + src/mlnx-ofa_kernel-5.8/compat/mm_util.c | 21 + src/mlnx-ofa_kernel-5.8/compat/mmu_notifier.c | 21 + src/mlnx-ofa_kernel-5.8/compat/net_dim.c | 246 + .../compat/nf_flow_table_core.c | 674 + .../compat/nf_flow_table_offload.c | 924 + src/mlnx-ofa_kernel-5.8/compat/output_core.c | 27 + src/mlnx-ofa_kernel-5.8/compat/pci.c | 393 + src/mlnx-ofa_kernel-5.8/compat/rdma_dim.c | 108 + src/mlnx-ofa_kernel-5.8/compat/rhashtable.c | 2344 +++ src/mlnx-ofa_kernel-5.8/compat/sch_codel.c | 309 + src/mlnx-ofa_kernel-5.8/compat/sch_fq_codel.c | 662 + .../compat/scripts/compat_firmware_install | 21 + .../compat/scripts/gen-compat-autoconf.sh | 105 + .../compat/scripts/gen-compat-config.sh | 77 + .../compat/scripts/skip-colors | 2 + src/mlnx-ofa_kernel-5.8/compat/string.c | 39 + src/mlnx-ofa_kernel-5.8/compat/syscall.c | 18 + src/mlnx-ofa_kernel-5.8/compat/uuid.c | 89 + src/mlnx-ofa_kernel-5.8/compat/xarray.c | 2195 +++ src/mlnx-ofa_kernel-5.8/compat/xz_crc32.c | 66 + src/mlnx-ofa_kernel-5.8/compat/xz_dec_bcj.c | 578 + src/mlnx-ofa_kernel-5.8/compat/xz_dec_lzma2.c | 1179 ++ .../compat/xz_dec_stream.c | 825 + src/mlnx-ofa_kernel-5.8/compat/xz_dec_syms.c | 39 + src/mlnx-ofa_kernel-5.8/compat/xz_lzma2.h | 204 + src/mlnx-ofa_kernel-5.8/compat/xz_private.h | 163 + src/mlnx-ofa_kernel-5.8/compat/xz_stream.h | 62 + src/mlnx-ofa_kernel-5.8/compat_base | 1 + src/mlnx-ofa_kernel-5.8/compat_base_tree | 1 + .../compat_base_tree_version | 1 + src/mlnx-ofa_kernel-5.8/compat_version | 1 + src/mlnx-ofa_kernel-5.8/configure | 1 + src/mlnx-ofa_kernel-5.8/debian/changelog | 5 + src/mlnx-ofa_kernel-5.8/debian/compat | 1 + src/mlnx-ofa_kernel-5.8/debian/control | 30 + .../debian/control.no_dkms | 23 + src/mlnx-ofa_kernel-5.8/debian/copyright | 19 + 
.../debian/mlnx-ofed-kernel-dkms.postinst | 43 + .../debian/mlnx-ofed-kernel-dkms.prerm | 43 + .../debian/mlnx-ofed-kernel-modules.postinst | 17 + .../debian/mlnx-ofed-kernel-modules.prerm | 10 + .../debian/mlnx-ofed-kernel-utils.examples | 2 + .../debian/mlnx-ofed-kernel-utils.postinst | 47 + .../debian/mlnx-ofed-kernel-utils.postrm | 14 + .../debian/mlnx-ofed-kernel-utils.prerm | 23 + src/mlnx-ofa_kernel-5.8/debian/rules | 218 + src/mlnx-ofa_kernel-5.8/debian/source/format | 1 + .../devtools/add_metadata.sh | 475 + .../devtools/verify_metadata.sh | 241 + src/mlnx-ofa_kernel-5.8/drivers/base/Makefile | 1 + .../drivers/base/auxiliary.c | 423 + .../drivers/infiniband/Kconfig | 111 + .../drivers/infiniband/Makefile | 5 + .../drivers/infiniband/core/Makefile | 55 + .../drivers/infiniband/core/addr.c | 890 + .../drivers/infiniband/core/agent.c | 221 + .../drivers/infiniband/core/agent.h | 51 + .../drivers/infiniband/core/cache.c | 1729 ++ .../drivers/infiniband/core/cgroup.c | 53 + .../drivers/infiniband/core/cm.c | 4548 +++++ .../drivers/infiniband/core/cm_msgs.h | 68 + .../drivers/infiniband/core/cm_trace.c | 15 + .../drivers/infiniband/core/cm_trace.h | 414 + .../drivers/infiniband/core/cma.c | 5475 ++++++ .../drivers/infiniband/core/cma_configfs.c | 367 + .../drivers/infiniband/core/cma_priv.h | 139 + .../drivers/infiniband/core/cma_trace.c | 16 + .../drivers/infiniband/core/cma_trace.h | 361 + .../drivers/infiniband/core/core_priv.h | 377 + .../drivers/infiniband/core/counters.c | 669 + .../drivers/infiniband/core/cq.c | 512 + .../drivers/infiniband/core/device.c | 2901 +++ .../drivers/infiniband/core/ib_addr_dummy.c | 60 + .../drivers/infiniband/core/ib_core_uverbs.c | 372 + .../drivers/infiniband/core/ib_mad_dummy.c | 60 + .../drivers/infiniband/core/ib_peer_mem.h | 65 + .../drivers/infiniband/core/ib_sa_dummy.c | 60 + .../drivers/infiniband/core/ib_ucm_dummy.c | 61 + .../drivers/infiniband/core/iwcm.c | 1223 ++ .../drivers/infiniband/core/iwcm.h | 62 + .../drivers/infiniband/core/iwpm_msg.c | 846 + .../drivers/infiniband/core/iwpm_util.c | 793 + .../drivers/infiniband/core/iwpm_util.h | 266 + .../drivers/infiniband/core/lag.c | 138 + .../drivers/infiniband/core/mad.c | 4145 ++++ .../drivers/infiniband/core/mad_priv.h | 274 + .../drivers/infiniband/core/mad_rmpp.c | 960 + .../drivers/infiniband/core/mad_rmpp.h | 58 + .../drivers/infiniband/core/mr_pool.c | 82 + .../drivers/infiniband/core/multicast.c | 906 + .../drivers/infiniband/core/netlink.c | 334 + .../drivers/infiniband/core/nldev.c | 2545 +++ .../drivers/infiniband/core/opa_smi.h | 78 + .../drivers/infiniband/core/packer.c | 201 + .../drivers/infiniband/core/peer_mem.c | 690 + .../drivers/infiniband/core/rdma_core.c | 1015 + .../drivers/infiniband/core/rdma_core.h | 191 + .../drivers/infiniband/core/restrack.c | 353 + .../drivers/infiniband/core/restrack.h | 36 + .../drivers/infiniband/core/roce_gid_mgmt.c | 938 + .../drivers/infiniband/core/rw.c | 762 + .../drivers/infiniband/core/sa.h | 64 + .../drivers/infiniband/core/sa_query.c | 2360 +++ .../drivers/infiniband/core/security.c | 750 + .../drivers/infiniband/core/smi.c | 338 + .../drivers/infiniband/core/smi.h | 90 + .../drivers/infiniband/core/sysfs.c | 1506 ++ .../drivers/infiniband/core/trace.c | 12 + .../drivers/infiniband/core/ucma.c | 1896 ++ .../drivers/infiniband/core/ud_header.c | 547 + .../drivers/infiniband/core/umem.c | 369 + .../drivers/infiniband/core/umem_dmabuf.c | 231 + .../drivers/infiniband/core/umem_odp.c | 517 + .../drivers/infiniband/core/user_mad.c | 
1497 ++ .../drivers/infiniband/core/uverbs.h | 322 + .../drivers/infiniband/core/uverbs_cmd.c | 4044 ++++ .../drivers/infiniband/core/uverbs_ioctl.c | 836 + .../drivers/infiniband/core/uverbs_main.c | 1317 ++ .../drivers/infiniband/core/uverbs_marshall.c | 215 + .../infiniband/core/uverbs_std_types.c | 269 + .../core/uverbs_std_types_async_fd.c | 79 + .../core/uverbs_std_types_counters.c | 161 + .../infiniband/core/uverbs_std_types_cq.c | 222 + .../infiniband/core/uverbs_std_types_device.c | 503 + .../infiniband/core/uverbs_std_types_dm.c | 116 + .../core/uverbs_std_types_flow_action.c | 447 + .../infiniband/core/uverbs_std_types_mr.c | 385 + .../infiniband/core/uverbs_std_types_qp.c | 380 + .../infiniband/core/uverbs_std_types_srq.c | 234 + .../infiniband/core/uverbs_std_types_wq.c | 194 + .../drivers/infiniband/core/uverbs_uapi.c | 734 + .../drivers/infiniband/core/verbs.c | 3055 +++ .../drivers/infiniband/core/verbs_nvmf.c | 148 + .../drivers/infiniband/debug/Makefile | 3 + .../drivers/infiniband/debug/memtrack.c | 1428 ++ .../drivers/infiniband/debug/memtrack.h | 110 + .../drivers/infiniband/debug/mtrack.h | 1032 + .../drivers/infiniband/hw/Makefile | 18 + .../drivers/infiniband/hw/amso1100/Makefile | 3 + .../drivers/infiniband/hw/amso1100/main.c | 59 + .../drivers/infiniband/hw/bnxt_re/Makefile | 3 + .../drivers/infiniband/hw/bnxt_re/main.c | 59 + .../drivers/infiniband/hw/cxgb3/Makefile | 3 + .../drivers/infiniband/hw/cxgb3/main.c | 59 + .../drivers/infiniband/hw/cxgb4/Makefile | 3 + .../drivers/infiniband/hw/cxgb4/main.c | 59 + .../drivers/infiniband/hw/efa/Makefile | 3 + .../drivers/infiniband/hw/efa/efa_main.c | 59 + .../drivers/infiniband/hw/ehca/Makefile | 1 + .../drivers/infiniband/hw/ehca/ib_ehca.c | 59 + .../drivers/infiniband/hw/hfi1/Makefile | 3 + .../drivers/infiniband/hw/hfi1/main.c | 59 + .../drivers/infiniband/hw/hns/Makefile | 11 + .../infiniband/hw/hns/hns_roce_hw_v1.c | 59 + .../infiniband/hw/hns/hns_roce_hw_v2.c | 59 + .../drivers/infiniband/hw/hns/main.c | 59 + .../drivers/infiniband/hw/i40iw/Makefile | 3 + .../drivers/infiniband/hw/i40iw/main.c | 59 + .../drivers/infiniband/hw/ipath/Makefile | 4 + .../drivers/infiniband/hw/ipath/ib_ipath.c | 59 + .../drivers/infiniband/hw/irdma/Makefile | 3 + .../drivers/infiniband/hw/irdma/irdma_dummy.c | 59 + .../drivers/infiniband/hw/mlx4/Makefile | 3 + .../drivers/infiniband/hw/mlx4/main.c | 63 + .../drivers/infiniband/hw/mlx5/Kconfig | 9 + .../drivers/infiniband/hw/mlx5/Makefile | 33 + .../drivers/infiniband/hw/mlx5/ah.c | 159 + .../drivers/infiniband/hw/mlx5/cmd.c | 234 + .../drivers/infiniband/hw/mlx5/cmd.h | 62 + .../drivers/infiniband/hw/mlx5/cong.c | 464 + .../drivers/infiniband/hw/mlx5/counters.c | 931 + .../drivers/infiniband/hw/mlx5/counters.h | 17 + .../drivers/infiniband/hw/mlx5/cq.c | 1439 ++ .../drivers/infiniband/hw/mlx5/devx.c | 3286 ++++ .../drivers/infiniband/hw/mlx5/devx.h | 45 + .../drivers/infiniband/hw/mlx5/dm.c | 600 + .../drivers/infiniband/hw/mlx5/dm.h | 68 + .../drivers/infiniband/hw/mlx5/doorbell.c | 106 + .../drivers/infiniband/hw/mlx5/fs.c | 2819 +++ .../drivers/infiniband/hw/mlx5/fs.h | 29 + .../drivers/infiniband/hw/mlx5/gsi.c | 497 + .../drivers/infiniband/hw/mlx5/ib_rep.c | 251 + .../drivers/infiniband/hw/mlx5/ib_rep.h | 63 + .../drivers/infiniband/hw/mlx5/ib_virt.c | 225 + .../drivers/infiniband/hw/mlx5/mad.c | 646 + .../drivers/infiniband/hw/mlx5/main.c | 4893 +++++ .../drivers/infiniband/hw/mlx5/main_ext.c | 1725 ++ .../drivers/infiniband/hw/mlx5/mem.c | 296 + 
.../drivers/infiniband/hw/mlx5/mlx5_ib.h | 1741 ++ .../drivers/infiniband/hw/mlx5/mlx5_ib_ext.h | 192 + .../drivers/infiniband/hw/mlx5/mlx5_ib_nvmf.h | 85 + .../drivers/infiniband/hw/mlx5/mr.c | 3109 +++ .../drivers/infiniband/hw/mlx5/nvmf.c | 552 + .../drivers/infiniband/hw/mlx5/odp.c | 1824 ++ .../drivers/infiniband/hw/mlx5/qos.c | 133 + .../drivers/infiniband/hw/mlx5/qp.c | 5958 ++++++ .../drivers/infiniband/hw/mlx5/qp.h | 47 + .../drivers/infiniband/hw/mlx5/qp_nvmf.c | 75 + .../drivers/infiniband/hw/mlx5/qpc.c | 663 + .../drivers/infiniband/hw/mlx5/restrack.c | 179 + .../drivers/infiniband/hw/mlx5/restrack.h | 13 + .../drivers/infiniband/hw/mlx5/srq.c | 482 + .../drivers/infiniband/hw/mlx5/srq.h | 96 + .../drivers/infiniband/hw/mlx5/srq_cmd.c | 833 + .../drivers/infiniband/hw/mlx5/srq_nvmf.c | 158 + .../drivers/infiniband/hw/mlx5/srq_nvmf.h | 40 + .../drivers/infiniband/hw/mlx5/std_types.c | 222 + .../drivers/infiniband/hw/mlx5/wr.c | 1555 ++ .../drivers/infiniband/hw/mlx5/wr.h | 76 + .../drivers/infiniband/hw/mthca/Makefile | 3 + .../drivers/infiniband/hw/mthca/main.c | 59 + .../drivers/infiniband/hw/nes/Makefile | 3 + .../drivers/infiniband/hw/nes/main.c | 59 + .../drivers/infiniband/hw/ocrdma/Makefile | 1 + .../drivers/infiniband/hw/ocrdma/ocrdma.c | 59 + .../drivers/infiniband/hw/qedr/Makefile | 3 + .../drivers/infiniband/hw/qedr/main.c | 59 + .../drivers/infiniband/hw/qib/Makefile | 3 + .../drivers/infiniband/hw/qib/main.c | 59 + .../drivers/infiniband/hw/usnic/Makefile | 3 + .../drivers/infiniband/hw/usnic/main.c | 59 + .../drivers/infiniband/hw/vmw_pvrdma/Makefile | 3 + .../drivers/infiniband/hw/vmw_pvrdma/main.c | 59 + .../drivers/infiniband/sw/Makefile | 4 + .../drivers/infiniband/sw/rdmavt/Makefile | 3 + .../drivers/infiniband/sw/rdmavt/main.c | 59 + .../drivers/infiniband/sw/rxe/Makefile | 24 + .../infiniband/sw/rxe/rdma_rxe_dummy.c | 61 + .../drivers/infiniband/sw/siw/Makefile | 3 + .../drivers/infiniband/sw/siw/main.c | 59 + .../drivers/infiniband/ulp/Makefile | 12 + .../drivers/infiniband/ulp/ipoib/Kconfig | 50 + .../drivers/infiniband/ulp/ipoib/Makefile | 13 + .../drivers/infiniband/ulp/ipoib/ipoib.h | 868 + .../drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1708 ++ .../infiniband/ulp/ipoib/ipoib_ethtool.c | 341 + .../drivers/infiniband/ulp/ipoib/ipoib_fs.c | 251 + .../infiniband/ulp/ipoib/ipoib_genetlink.c | 284 + .../drivers/infiniband/ulp/ipoib/ipoib_ib.c | 1397 ++ .../drivers/infiniband/ulp/ipoib/ipoib_main.c | 2886 +++ .../infiniband/ulp/ipoib/ipoib_multicast.c | 1055 + .../infiniband/ulp/ipoib/ipoib_netlink.c | 193 + .../infiniband/ulp/ipoib/ipoib_verbs.c | 298 + .../drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 294 + .../infiniband/ulp/ipoib_1.5.3/Kconfig | 50 + .../infiniband/ulp/ipoib_1.5.3/Makefile | 12 + .../infiniband/ulp/ipoib_1.5.3/inet_lro.c | 605 + .../infiniband/ulp/ipoib_1.5.3/ipoib.h | 869 + .../infiniband/ulp/ipoib_1.5.3/ipoib_cm.c | 1675 ++ .../ulp/ipoib_1.5.3/ipoib_ethtool.c | 238 + .../infiniband/ulp/ipoib_1.5.3/ipoib_fs.c | 297 + .../infiniband/ulp/ipoib_1.5.3/ipoib_ib.c | 1163 ++ .../infiniband/ulp/ipoib_1.5.3/ipoib_main.c | 2067 ++ .../ulp/ipoib_1.5.3/ipoib_multicast.c | 1094 ++ .../infiniband/ulp/ipoib_1.5.3/ipoib_verbs.c | 305 + .../infiniband/ulp/ipoib_1.5.3/ipoib_vlan.c | 228 + .../drivers/infiniband/ulp/iser/Kconfig | 13 + .../drivers/infiniband/ulp/iser/Makefile | 31 + .../infiniband/ulp/iser/Module.supported | 1 + .../drivers/infiniband/ulp/iser/_makefile_ | 219 + .../drivers/infiniband/ulp/iser/autogen.sh | 9 + 
.../infiniband/ulp/iser/common.postinst | 296 + .../infiniband/ulp/iser/debian/changelog | 5 + .../drivers/infiniband/ulp/iser/debian/compat | 1 + .../infiniband/ulp/iser/debian/control | 17 + .../ulp/iser/debian/control.no_dkms | 14 + .../infiniband/ulp/iser/debian/copyright | 19 + .../ulp/iser/debian/iser-dkms.postinst | 43 + .../ulp/iser/debian/iser-dkms.prerm | 13 + .../drivers/infiniband/ulp/iser/debian/rules | 109 + .../infiniband/ulp/iser/debian/source/format | 1 + .../drivers/infiniband/ulp/iser/dkms.conf | 19 + .../infiniband/ulp/iser/ib_iser_dummy.c | 60 + .../drivers/infiniband/ulp/iser/iscsi_iser.c | 1086 ++ .../drivers/infiniband/ulp/iser/iscsi_iser.h | 585 + .../infiniband/ulp/iser/iser_initiator.c | 771 + .../drivers/infiniband/ulp/iser/iser_memory.c | 382 + .../drivers/infiniband/ulp/iser/iser_spec_ | 216 + .../drivers/infiniband/ulp/iser/iser_verbs.c | 970 + .../infiniband/ulp/iser/tools/sign-modules | 58 + .../drivers/infiniband/ulp/isert/Kconfig | 6 + .../drivers/infiniband/ulp/isert/Makefile | 29 + .../infiniband/ulp/isert/Module.supported | 1 + .../drivers/infiniband/ulp/isert/_makefile_ | 161 + .../drivers/infiniband/ulp/isert/autogen.sh | 9 + .../infiniband/ulp/isert/common.postinst | 296 + .../infiniband/ulp/isert/debian/changelog | 5 + .../infiniband/ulp/isert/debian/compat | 1 + .../infiniband/ulp/isert/debian/control | 17 + .../ulp/isert/debian/control.no_dkms | 14 + .../infiniband/ulp/isert/debian/copyright | 19 + .../ulp/isert/debian/isert-dkms.postinst | 43 + .../ulp/isert/debian/isert-dkms.prerm | 13 + .../drivers/infiniband/ulp/isert/debian/rules | 109 + .../infiniband/ulp/isert/debian/source/format | 1 + .../drivers/infiniband/ulp/isert/dkms.conf | 19 + .../drivers/infiniband/ulp/isert/ib_isert.c | 2703 +++ .../drivers/infiniband/ulp/isert/ib_isert.h | 215 + .../infiniband/ulp/isert/ib_isert_dummy.c | 60 + .../drivers/infiniband/ulp/isert/isert_spec_ | 216 + .../infiniband/ulp/isert/tools/sign-modules | 58 + .../drivers/infiniband/ulp/opa_vnic/Makefile | 3 + .../drivers/infiniband/ulp/opa_vnic/main.c | 59 + .../drivers/infiniband/ulp/rtrs/Makefile | 8 + .../infiniband/ulp/rtrs/rtrs-clt_dummy.c | 59 + .../infiniband/ulp/rtrs/rtrs-core_dummy.c | 59 + .../infiniband/ulp/rtrs/rtrs-srv_dummy.c | 59 + .../drivers/infiniband/ulp/srp/Kbuild | 26 + .../drivers/infiniband/ulp/srp/Kconfig | 13 + .../infiniband/ulp/srp/Module.supported | 2 + .../drivers/infiniband/ulp/srp/_makefile_ | 187 + .../drivers/infiniband/ulp/srp/autogen.sh | 14 + .../infiniband/ulp/srp/common.postinst | 296 + .../infiniband/ulp/srp/debian/changelog | 5 + .../drivers/infiniband/ulp/srp/debian/compat | 1 + .../drivers/infiniband/ulp/srp/debian/control | 17 + .../infiniband/ulp/srp/debian/control.no_dkms | 14 + .../infiniband/ulp/srp/debian/copyright | 19 + .../drivers/infiniband/ulp/srp/debian/rules | 110 + .../infiniband/ulp/srp/debian/source/format | 1 + .../ulp/srp/debian/srp-dkms.postinst | 53 + .../infiniband/ulp/srp/debian/srp-dkms.prerm | 21 + .../ulp/srp/debian/srp-modules.postinst | 16 + .../ulp/srp/debian/srp-modules.prerm | 15 + .../drivers/infiniband/ulp/srp/dkms.conf | 22 + .../drivers/infiniband/ulp/srp/ib_srp.c | 4157 ++++ .../drivers/infiniband/ulp/srp/ib_srp.h | 340 + .../drivers/infiniband/ulp/srp/ib_srp_dummy.c | 61 + .../drivers/infiniband/ulp/srp/srp_spec_ | 221 + .../infiniband/ulp/srp/tools/sign-modules | 58 + .../drivers/infiniband/ulp/srpt/Makefile | 3 + .../drivers/infiniband/ulp/srpt/main.c | 59 + .../drivers/infiniband/ulp/xsigo/Makefile | 4 + 
.../infiniband/ulp/xsigo/xscore/Makefile | 2 + .../infiniband/ulp/xsigo/xscore/main.c | 59 + .../infiniband/ulp/xsigo/xsvhba/Makefile | 2 + .../infiniband/ulp/xsigo/xsvhba/main.c | 59 + .../infiniband/ulp/xsigo/xsvnic/Makefile | 2 + .../infiniband/ulp/xsigo/xsvnic/main.c | 59 + .../drivers/infiniband/ulp/xsigo/xve/Makefile | 2 + .../drivers/infiniband/ulp/xsigo/xve/main.c | 59 + .../drivers/net/ethernet/mellanox/Kconfig | 27 + .../drivers/net/ethernet/mellanox/Makefile | 10 + .../net/ethernet/mellanox/mlx5/core/Kconfig | 244 + .../net/ethernet/mellanox/mlx5/core/Makefile | 130 + .../ethernet/mellanox/mlx5/core/accel/accel.h | 36 + .../ethernet/mellanox/mlx5/core/accel/ipsec.c | 179 + .../ethernet/mellanox/mlx5/core/accel/ipsec.h | 96 + .../mellanox/mlx5/core/accel/ipsec_offload.c | 456 + .../mellanox/mlx5/core/accel/ipsec_offload.h | 38 + .../ethernet/mellanox/mlx5/core/accel/tls.c | 125 + .../ethernet/mellanox/mlx5/core/accel/tls.h | 156 + .../net/ethernet/mellanox/mlx5/core/alloc.c | 316 + .../net/ethernet/mellanox/mlx5/core/cmd.c | 2641 +++ .../net/ethernet/mellanox/mlx5/core/compat.c | 162 + .../net/ethernet/mellanox/mlx5/core/compat.h | 18 + .../net/ethernet/mellanox/mlx5/core/cq.c | 230 + .../net/ethernet/mellanox/mlx5/core/crdump.c | 322 + .../net/ethernet/mellanox/mlx5/core/debugfs.c | 582 + .../net/ethernet/mellanox/mlx5/core/dev.c | 667 + .../net/ethernet/mellanox/mlx5/core/devlink.c | 1070 ++ .../net/ethernet/mellanox/mlx5/core/devlink.h | 57 + .../ethernet/mellanox/mlx5/core/diag/crdump.c | 115 + .../mellanox/mlx5/core/diag/diag_cnt.c | 737 + .../mellanox/mlx5/core/diag/diag_cnt.h | 48 + .../mlx5/core/diag/en_rep_tracepoint.h | 54 + .../mlx5/core/diag/en_tc_tracepoint.c | 58 + .../mlx5/core/diag/en_tc_tracepoint.h | 114 + .../mellanox/mlx5/core/diag/fs_tracepoint.c | 280 + .../mellanox/mlx5/core/diag/fs_tracepoint.h | 323 + .../mellanox/mlx5/core/diag/fw_tracer.c | 1153 ++ .../mellanox/mlx5/core/diag/fw_tracer.h | 196 + .../mlx5/core/diag/fw_tracer_tracepoint.h | 79 + .../mellanox/mlx5/core/diag/rsc_dump.c | 311 + .../mellanox/mlx5/core/diag/rsc_dump.h | 27 + .../net/ethernet/mellanox/mlx5/core/ecpf.c | 741 + .../net/ethernet/mellanox/mlx5/core/ecpf.h | 26 + .../net/ethernet/mellanox/mlx5/core/en.h | 1376 ++ .../net/ethernet/mellanox/mlx5/core/en/aso.c | 449 + .../net/ethernet/mellanox/mlx5/core/en/aso.h | 152 + .../ethernet/mellanox/mlx5/core/en/channels.c | 46 + .../ethernet/mellanox/mlx5/core/en/channels.h | 16 + .../ethernet/mellanox/mlx5/core/en/dcbnl.h | 55 + .../ethernet/mellanox/mlx5/core/en/devlink.c | 69 + .../ethernet/mellanox/mlx5/core/en/devlink.h | 21 + .../net/ethernet/mellanox/mlx5/core/en/fs.h | 260 + .../mellanox/mlx5/core/en/fs_tt_redirect.c | 605 + .../mellanox/mlx5/core/en/fs_tt_redirect.h | 26 + .../ethernet/mellanox/mlx5/core/en/health.c | 339 + .../ethernet/mellanox/mlx5/core/en/health.h | 58 + .../mellanox/mlx5/core/en/hv_vhca_stats.c | 159 + .../mellanox/mlx5/core/en/hv_vhca_stats.h | 18 + .../mellanox/mlx5/core/en/ipsec_aso.c | 130 + .../mellanox/mlx5/core/en/ipsec_aso.h | 22 + .../ethernet/mellanox/mlx5/core/en/mapping.c | 263 + .../ethernet/mellanox/mlx5/core/en/mapping.h | 32 + .../ethernet/mellanox/mlx5/core/en/mod_hdr.c | 215 + .../ethernet/mellanox/mlx5/core/en/mod_hdr.h | 57 + .../mellanox/mlx5/core/en/monitor_stats.c | 151 + .../mellanox/mlx5/core/en/monitor_stats.h | 12 + .../ethernet/mellanox/mlx5/core/en/params.c | 857 + .../ethernet/mellanox/mlx5/core/en/params.h | 171 + .../net/ethernet/mellanox/mlx5/core/en/port.c | 594 + 
.../net/ethernet/mellanox/mlx5/core/en/port.h | 78 + .../mellanox/mlx5/core/en/port_buffer.c | 362 + .../mellanox/mlx5/core/en/port_buffer.h | 74 + .../net/ethernet/mellanox/mlx5/core/en/ptp.c | 837 + .../net/ethernet/mellanox/mlx5/core/en/ptp.h | 102 + .../net/ethernet/mellanox/mlx5/core/en/qos.c | 1140 ++ .../net/ethernet/mellanox/mlx5/core/en/qos.h | 53 + .../ethernet/mellanox/mlx5/core/en/rep/bond.c | 351 + .../mellanox/mlx5/core/en/rep/bridge.c | 536 + .../mellanox/mlx5/core/en/rep/bridge.h | 21 + .../mellanox/mlx5/core/en/rep/meter.c | 211 + .../mellanox/mlx5/core/en/rep/meter.h | 18 + .../mellanox/mlx5/core/en/rep/neigh.c | 398 + .../mellanox/mlx5/core/en/rep/neigh.h | 35 + .../mellanox/mlx5/core/en/rep/sysfs.c | 309 + .../mellanox/mlx5/core/en/rep/sysfs.h | 12 + .../ethernet/mellanox/mlx5/core/en/rep/tc.c | 902 + .../ethernet/mellanox/mlx5/core/en/rep/tc.h | 73 + .../mellanox/mlx5/core/en/reporter_rx.c | 782 + .../mellanox/mlx5/core/en/reporter_tx.c | 615 + .../net/ethernet/mellanox/mlx5/core/en/rqt.c | 170 + .../net/ethernet/mellanox/mlx5/core/en/rqt.h | 42 + .../net/ethernet/mellanox/mlx5/core/en/rss.c | 606 + .../net/ethernet/mellanox/mlx5/core/en/rss.h | 50 + .../ethernet/mellanox/mlx5/core/en/rx_res.c | 728 + .../ethernet/mellanox/mlx5/core/en/rx_res.h | 72 + .../mellanox/mlx5/core/en/tc/act/accept.c | 31 + .../mellanox/mlx5/core/en/tc/act/act.c | 175 + .../mellanox/mlx5/core/en/tc/act/act.h | 115 + .../mellanox/mlx5/core/en/tc/act/csum.c | 62 + .../mellanox/mlx5/core/en/tc/act/ct.c | 108 + .../mellanox/mlx5/core/en/tc/act/drop.c | 30 + .../mellanox/mlx5/core/en/tc/act/goto.c | 123 + .../mellanox/mlx5/core/en/tc/act/mark.c | 36 + .../mellanox/mlx5/core/en/tc/act/mirred.c | 317 + .../mellanox/mlx5/core/en/tc/act/mirred_nic.c | 51 + .../mellanox/mlx5/core/en/tc/act/mpls.c | 98 + .../mellanox/mlx5/core/en/tc/act/pedit.c | 164 + .../mellanox/mlx5/core/en/tc/act/pedit.h | 31 + .../mellanox/mlx5/core/en/tc/act/police.c | 143 + .../mellanox/mlx5/core/en/tc/act/prio.c | 46 + .../mellanox/mlx5/core/en/tc/act/ptype.c | 36 + .../mlx5/core/en/tc/act/redirect_ingress.c | 78 + .../mellanox/mlx5/core/en/tc/act/sample.c | 71 + .../mellanox/mlx5/core/en/tc/act/sample.h | 14 + .../mellanox/mlx5/core/en/tc/act/trap.c | 38 + .../mellanox/mlx5/core/en/tc/act/tun.c | 63 + .../mellanox/mlx5/core/en/tc/act/vlan.c | 222 + .../mellanox/mlx5/core/en/tc/act/vlan.h | 29 + .../mlx5/core/en/tc/act/vlan_mangle.c | 86 + .../ethernet/mellanox/mlx5/core/en/tc/ct_fs.h | 49 + .../mellanox/mlx5/core/en/tc/ct_fs_dmfs.c | 79 + .../mellanox/mlx5/core/en/tc/ct_fs_smfs.c | 380 + .../mellanox/mlx5/core/en/tc/int_port.c | 506 + .../mellanox/mlx5/core/en/tc/int_port.h | 65 + .../ethernet/mellanox/mlx5/core/en/tc/meter.c | 831 + .../ethernet/mellanox/mlx5/core/en/tc/meter.h | 87 + .../mellanox/mlx5/core/en/tc/post_act.c | 184 + .../mellanox/mlx5/core/en/tc/post_act.h | 43 + .../mellanox/mlx5/core/en/tc/post_meter.c | 209 + .../mellanox/mlx5/core/en/tc/post_meter.h | 29 + .../mellanox/mlx5/core/en/tc/sample.c | 655 + .../mellanox/mlx5/core/en/tc/sample.h | 66 + .../ethernet/mellanox/mlx5/core/en/tc_ct.c | 2394 +++ .../ethernet/mellanox/mlx5/core/en/tc_ct.h | 255 + .../ethernet/mellanox/mlx5/core/en/tc_priv.h | 214 + .../ethernet/mellanox/mlx5/core/en/tc_tun.c | 995 + .../ethernet/mellanox/mlx5/core/en/tc_tun.h | 120 + .../mellanox/mlx5/core/en/tc_tun_encap.c | 1781 ++ .../mellanox/mlx5/core/en/tc_tun_encap.h | 41 + .../mellanox/mlx5/core/en/tc_tun_geneve.c | 375 + .../mellanox/mlx5/core/en/tc_tun_gre.c | 98 + 
.../mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 128 + .../mellanox/mlx5/core/en/tc_tun_vxlan.c | 163 + .../net/ethernet/mellanox/mlx5/core/en/tir.c | 207 + .../net/ethernet/mellanox/mlx5/core/en/tir.h | 58 + .../net/ethernet/mellanox/mlx5/core/en/trap.c | 322 + .../net/ethernet/mellanox/mlx5/core/en/trap.h | 37 + .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 479 + .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 592 + .../net/ethernet/mellanox/mlx5/core/en/xdp.h | 187 + .../ethernet/mellanox/mlx5/core/en/xsk/pool.c | 220 + .../ethernet/mellanox/mlx5/core/en/xsk/pool.h | 27 + .../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 112 + .../ethernet/mellanox/mlx5/core/en/xsk/rx.h | 52 + .../mellanox/mlx5/core/en/xsk/setup.c | 185 + .../mellanox/mlx5/core/en/xsk/setup.h | 21 + .../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 126 + .../ethernet/mellanox/mlx5/core/en/xsk/tx.h | 27 + .../mellanox/mlx5/core/en_accel/en_accel.h | 220 + .../mellanox/mlx5/core/en_accel/fs_tcp.c | 402 + .../mellanox/mlx5/core/en_accel/fs_tcp.h | 27 + .../mellanox/mlx5/core/en_accel/ipsec.c | 1034 + .../mellanox/mlx5/core/en_accel/ipsec.h | 189 + .../mellanox/mlx5/core/en_accel/ipsec_fs.c | 939 + .../mellanox/mlx5/core/en_accel/ipsec_fs.h | 26 + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 657 + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 186 + .../mellanox/mlx5/core/en_accel/ipsec_stats.c | 191 + .../mellanox/mlx5/core/en_accel/ktls.c | 140 + .../mellanox/mlx5/core/en_accel/ktls.h | 69 + .../mellanox/mlx5/core/en_accel/ktls_rx.c | 747 + .../mellanox/mlx5/core/en_accel/ktls_tx.c | 496 + .../mellanox/mlx5/core/en_accel/ktls_txrx.c | 119 + .../mellanox/mlx5/core/en_accel/ktls_txrx.h | 74 + .../mellanox/mlx5/core/en_accel/ktls_utils.h | 86 + .../mellanox/mlx5/core/en_accel/macsec.c | 1986 ++ .../mellanox/mlx5/core/en_accel/macsec.h | 73 + .../mellanox/mlx5/core/en_accel/macsec_fs.c | 2027 ++ .../mellanox/mlx5/core/en_accel/macsec_fs.h | 59 + .../mlx5/core/en_accel/macsec_stats.c | 72 + .../mellanox/mlx5/core/en_accel/tls.c | 247 + .../mellanox/mlx5/core/en_accel/tls.h | 132 + .../mellanox/mlx5/core/en_accel/tls_rxtx.c | 390 + .../mellanox/mlx5/core/en_accel/tls_rxtx.h | 91 + .../mellanox/mlx5/core/en_accel/tls_stats.c | 105 + .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 732 + .../ethernet/mellanox/mlx5/core/en_common.c | 175 + .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 1293 ++ .../ethernet/mellanox/mlx5/core/en_debugfs.c | 193 + .../net/ethernet/mellanox/mlx5/core/en_diag.c | 302 + .../net/ethernet/mellanox/mlx5/core/en_dim.c | 64 + .../net/ethernet/mellanox/mlx5/core/en_ecn.c | 1259 ++ .../net/ethernet/mellanox/mlx5/core/en_ecn.h | 176 + .../ethernet/mellanox/mlx5/core/en_ethtool.c | 2580 +++ .../net/ethernet/mellanox/mlx5/core/en_fs.c | 1357 ++ .../mellanox/mlx5/core/en_fs_ethtool.c | 1035 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 6269 ++++++ .../net/ethernet/mellanox/mlx5/core/en_rep.c | 1739 ++ .../net/ethernet/mellanox/mlx5/core/en_rep.h | 298 + .../net/ethernet/mellanox/mlx5/core/en_rx.c | 2706 +++ .../ethernet/mellanox/mlx5/core/en_selftest.c | 371 + .../ethernet/mellanox/mlx5/core/en_stats.c | 2404 +++ .../ethernet/mellanox/mlx5/core/en_stats.h | 477 + .../ethernet/mellanox/mlx5/core/en_sysfs.c | 1608 ++ .../net/ethernet/mellanox/mlx5/core/en_tc.c | 6112 ++++++ .../net/ethernet/mellanox/mlx5/core/en_tc.h | 397 + .../net/ethernet/mellanox/mlx5/core/en_tx.c | 1058 ++ .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 264 + .../net/ethernet/mellanox/mlx5/core/eq.c | 1256 ++ 
.../ethernet/mellanox/mlx5/core/esw/Makefile | 2 + .../mellanox/mlx5/core/esw/acl/egress_lgcy.c | 288 + .../mellanox/mlx5/core/esw/acl/egress_ofld.c | 266 + .../mellanox/mlx5/core/esw/acl/helper.c | 171 + .../mellanox/mlx5/core/esw/acl/helper.h | 28 + .../mellanox/mlx5/core/esw/acl/ingress_lgcy.c | 366 + .../mellanox/mlx5/core/esw/acl/ingress_ofld.c | 414 + .../mellanox/mlx5/core/esw/acl/lgcy.h | 17 + .../mellanox/mlx5/core/esw/acl/ofld.h | 44 + .../ethernet/mellanox/mlx5/core/esw/bridge.c | 1605 ++ .../ethernet/mellanox/mlx5/core/esw/bridge.h | 69 + .../mellanox/mlx5/core/esw/bridge_priv.h | 63 + .../ethernet/mellanox/mlx5/core/esw/debugfs.c | 182 + .../mellanox/mlx5/core/esw/devlink_port.c | 219 + .../mellanox/mlx5/core/esw/devm_port.c | 144 + .../mlx5/core/esw/diag/bridge_tracepoint.h | 120 + .../mlx5/core/esw/diag/qos_tracepoint.h | 123 + .../mellanox/mlx5/core/esw/indir_table.c | 523 + .../mellanox/mlx5/core/esw/indir_table.h | 77 + .../ethernet/mellanox/mlx5/core/esw/ipsec.c | 790 + .../ethernet/mellanox/mlx5/core/esw/ipsec.h | 56 + .../ethernet/mellanox/mlx5/core/esw/legacy.c | 529 + .../ethernet/mellanox/mlx5/core/esw/legacy.h | 22 + .../mellanox/mlx5/core/esw/pet_offloads.c | 406 + .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 1136 ++ .../net/ethernet/mellanox/mlx5/core/esw/qos.h | 84 + .../mellanox/mlx5/core/esw/vf_meter.c | 477 + .../mellanox/mlx5/core/esw/vf_meter.h | 18 + .../mellanox/mlx5/core/esw/vporttbl.c | 140 + .../net/ethernet/mellanox/mlx5/core/eswitch.c | 2731 +++ .../net/ethernet/mellanox/mlx5/core/eswitch.h | 1016 + .../mlx5/core/eswitch_devlink_compat.c | 451 + .../mellanox/mlx5/core/eswitch_offloads.c | 4471 +++++ .../mlx5/core/eswitch_offloads_termtbl.c | 335 + .../net/ethernet/mellanox/mlx5/core/events.c | 449 + .../ethernet/mellanox/mlx5/core/fpga/cmd.c | 235 + .../ethernet/mellanox/mlx5/core/fpga/cmd.h | 91 + .../ethernet/mellanox/mlx5/core/fpga/conn.c | 1001 + .../ethernet/mellanox/mlx5/core/fpga/conn.h | 96 + .../ethernet/mellanox/mlx5/core/fpga/core.c | 376 + .../ethernet/mellanox/mlx5/core/fpga/core.h | 114 + .../ethernet/mellanox/mlx5/core/fpga/ipsec.c | 1583 ++ .../ethernet/mellanox/mlx5/core/fpga/ipsec.h | 62 + .../ethernet/mellanox/mlx5/core/fpga/sdk.c | 170 + .../ethernet/mellanox/mlx5/core/fpga/sdk.h | 214 + .../ethernet/mellanox/mlx5/core/fpga/tls.c | 622 + .../ethernet/mellanox/mlx5/core/fpga/tls.h | 74 + .../net/ethernet/mellanox/mlx5/core/fs_cmd.c | 1114 ++ .../net/ethernet/mellanox/mlx5/core/fs_cmd.h | 121 + .../net/ethernet/mellanox/mlx5/core/fs_core.c | 3624 ++++ .../net/ethernet/mellanox/mlx5/core/fs_core.h | 357 + .../ethernet/mellanox/mlx5/core/fs_counters.c | 807 + .../ethernet/mellanox/mlx5/core/fs_ft_pool.c | 85 + .../ethernet/mellanox/mlx5/core/fs_ft_pool.h | 21 + .../net/ethernet/mellanox/mlx5/core/fw.c | 952 + .../net/ethernet/mellanox/mlx5/core/fw_exp.c | 52 + .../ethernet/mellanox/mlx5/core/fw_reset.c | 775 + .../ethernet/mellanox/mlx5/core/fw_reset.h | 23 + .../net/ethernet/mellanox/mlx5/core/health.c | 964 + .../mellanox/mlx5/core/ipoib/ethtool.c | 278 + .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 785 + .../ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 121 + .../mellanox/mlx5/core/ipoib/ipoib_vlan.c | 359 + .../mellanox/mlx5/core/irq_affinity.c | 229 + .../ethernet/mellanox/mlx5/core/lag/debugfs.c | 175 + .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 1758 ++ .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 126 + .../net/ethernet/mellanox/mlx5/core/lag/mp.c | 371 + .../net/ethernet/mellanox/mlx5/core/lag/mp.h | 42 + 
.../mellanox/mlx5/core/lag/port_sel.c | 641 + .../mellanox/mlx5/core/lag/port_sel.h | 50 + .../net/ethernet/mellanox/mlx5/core/lib/aso.c | 433 + .../net/ethernet/mellanox/mlx5/core/lib/aso.h | 78 + .../ethernet/mellanox/mlx5/core/lib/clock.c | 1021 + .../ethernet/mellanox/mlx5/core/lib/clock.h | 121 + .../ethernet/mellanox/mlx5/core/lib/crypto.c | 73 + .../ethernet/mellanox/mlx5/core/lib/devcom.c | 262 + .../ethernet/mellanox/mlx5/core/lib/devcom.h | 48 + .../net/ethernet/mellanox/mlx5/core/lib/dm.c | 307 + .../net/ethernet/mellanox/mlx5/core/lib/eq.h | 109 + .../mellanox/mlx5/core/lib/fs_chains.c | 810 + .../mellanox/mlx5/core/lib/fs_chains.h | 96 + .../ethernet/mellanox/mlx5/core/lib/fs_ttc.c | 608 + .../ethernet/mellanox/mlx5/core/lib/fs_ttc.h | 72 + .../ethernet/mellanox/mlx5/core/lib/geneve.c | 157 + .../ethernet/mellanox/mlx5/core/lib/geneve.h | 33 + .../net/ethernet/mellanox/mlx5/core/lib/gid.c | 152 + .../net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 + .../net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 + .../ethernet/mellanox/mlx5/core/lib/hv_vhca.c | 371 + .../ethernet/mellanox/mlx5/core/lib/hv_vhca.h | 101 + .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 108 + .../ethernet/mellanox/mlx5/core/lib/mpfs.c | 211 + .../ethernet/mellanox/mlx5/core/lib/mpfs.h | 92 + .../ethernet/mellanox/mlx5/core/lib/pci_vsc.c | 316 + .../ethernet/mellanox/mlx5/core/lib/pci_vsc.h | 32 + .../mellanox/mlx5/core/lib/port_tun.c | 187 + .../mellanox/mlx5/core/lib/port_tun.h | 24 + .../net/ethernet/mellanox/mlx5/core/lib/sf.h | 45 + .../ethernet/mellanox/mlx5/core/lib/smfs.c | 68 + .../ethernet/mellanox/mlx5/core/lib/smfs.h | 36 + .../ethernet/mellanox/mlx5/core/lib/tout.c | 159 + .../ethernet/mellanox/mlx5/core/lib/tout.h | 42 + .../ethernet/mellanox/mlx5/core/lib/vxlan.c | 197 + .../ethernet/mellanox/mlx5/core/lib/vxlan.h | 70 + .../net/ethernet/mellanox/mlx5/core/main.c | 2469 +++ .../net/ethernet/mellanox/mlx5/core/mcg.c | 63 + .../ethernet/mellanox/mlx5/core/mlx5_core.h | 483 + .../ethernet/mellanox/mlx5/core/mlx5_devm.c | 860 + .../ethernet/mellanox/mlx5/core/mlx5_devm.h | 50 + .../mellanox/mlx5/core/mlx5_esw_devm.h | 45 + .../ethernet/mellanox/mlx5/core/mlx5_irq.h | 66 + .../net/ethernet/mellanox/mlx5/core/mr.c | 125 + .../ethernet/mellanox/mlx5/core/mst_dump.c | 7490 ++++++++ .../ethernet/mellanox/mlx5/core/pagealloc.c | 917 + .../net/ethernet/mellanox/mlx5/core/params.c | 197 + .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 808 + .../net/ethernet/mellanox/mlx5/core/pci_irq.h | 40 + .../net/ethernet/mellanox/mlx5/core/pd.c | 60 + .../net/ethernet/mellanox/mlx5/core/port.c | 1455 ++ .../net/ethernet/mellanox/mlx5/core/qos.c | 85 + .../net/ethernet/mellanox/mlx5/core/qos.h | 30 + .../net/ethernet/mellanox/mlx5/core/rdma.c | 186 + .../net/ethernet/mellanox/mlx5/core/rdma.h | 20 + .../net/ethernet/mellanox/mlx5/core/rl.c | 399 + .../net/ethernet/mellanox/mlx5/core/sf/cmd.c | 49 + .../mellanox/mlx5/core/sf/dev/cfg_driver.c | 263 + .../mellanox/mlx5/core/sf/dev/cfg_driver.h | 22 + .../ethernet/mellanox/mlx5/core/sf/dev/dev.c | 381 + .../ethernet/mellanox/mlx5/core/sf/dev/dev.h | 64 + .../mlx5/core/sf/dev/diag/dev_tracepoint.h | 58 + .../mellanox/mlx5/core/sf/dev/driver.c | 133 + .../ethernet/mellanox/mlx5/core/sf/devlink.c | 597 + .../mlx5/core/sf/diag/sf_tracepoint.h | 173 + .../mlx5/core/sf/diag/vhca_tracepoint.h | 40 + .../ethernet/mellanox/mlx5/core/sf/hw_table.c | 364 + .../mlx5/core/sf/mlx5_ifc_vhca_event.h | 82 + .../net/ethernet/mellanox/mlx5/core/sf/priv.h | 22 + 
.../net/ethernet/mellanox/mlx5/core/sf/sf.h | 68 + .../mellanox/mlx5/core/sf/vhca_event.c | 191 + .../mellanox/mlx5/core/sf/vhca_event.h | 56 + .../net/ethernet/mellanox/mlx5/core/sriov.c | 324 + .../ethernet/mellanox/mlx5/core/sriov_sysfs.c | 1589 ++ .../mellanox/mlx5/core/steering/Makefile | 2 + .../mellanox/mlx5/core/steering/dr_action.c | 2017 ++ .../mellanox/mlx5/core/steering/dr_buddy.c | 168 + .../mellanox/mlx5/core/steering/dr_cmd.c | 875 + .../mellanox/mlx5/core/steering/dr_dbg.c | 658 + .../mellanox/mlx5/core/steering/dr_dbg.h | 15 + .../mellanox/mlx5/core/steering/dr_domain.c | 585 + .../mellanox/mlx5/core/steering/dr_fw.c | 171 + .../mellanox/mlx5/core/steering/dr_icm_pool.c | 752 + .../mellanox/mlx5/core/steering/dr_matcher.c | 1108 ++ .../mellanox/mlx5/core/steering/dr_rule.c | 1341 ++ .../mellanox/mlx5/core/steering/dr_send.c | 1353 ++ .../mellanox/mlx5/core/steering/dr_ste.c | 1415 ++ .../mellanox/mlx5/core/steering/dr_ste.h | 209 + .../mellanox/mlx5/core/steering/dr_ste_v0.c | 1995 ++ .../mellanox/mlx5/core/steering/dr_ste_v1.c | 2483 +++ .../mellanox/mlx5/core/steering/dr_ste_v2.c | 183 + .../mellanox/mlx5/core/steering/dr_table.c | 316 + .../mellanox/mlx5/core/steering/dr_types.h | 1586 ++ .../mellanox/mlx5/core/steering/fs_dr.c | 817 + .../mellanox/mlx5/core/steering/fs_dr.h | 69 + .../mellanox/mlx5/core/steering/mlx5_ifc_dr.h | 661 + .../mlx5/core/steering/mlx5_ifc_dr_ste_v1.h | 434 + .../mellanox/mlx5/core/steering/mlx5dr.h | 187 + .../ethernet/mellanox/mlx5/core/transobj.c | 505 + .../net/ethernet/mellanox/mlx5/core/uar.c | 407 + .../net/ethernet/mellanox/mlx5/core/vport.c | 1221 ++ .../net/ethernet/mellanox/mlx5/core/wq.c | 261 + .../net/ethernet/mellanox/mlx5/core/wq.h | 308 + .../net/ethernet/mellanox/mlxfw/Kconfig | 15 + .../net/ethernet/mellanox/mlxfw/Makefile | 3 + .../net/ethernet/mellanox/mlxfw/mlxfw.h | 115 + .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 439 + .../net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c | 592 + .../net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h | 35 + .../ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h | 29 + .../mellanox/mlxfw/mlxfw_mfa2_format.h | 73 + .../ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h | 67 + .../mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c | 98 + .../mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h | 41 + .../net/ethernet/mellanox/mlxsw/Makefile | 3 + .../ethernet/mellanox/mlxsw/spectrum_main.c | 58 + src/mlnx-ofa_kernel-5.8/drivers/nvme/Kconfig | 7 + src/mlnx-ofa_kernel-5.8/drivers/nvme/Makefile | 24 + .../drivers/nvme/Module.supported | 11 + .../drivers/nvme/_makefile_ | 196 + .../drivers/nvme/autogen.sh | 11 + .../drivers/nvme/common.postinst | 296 + .../drivers/nvme/debian/changelog | 5 + .../drivers/nvme/debian/compat | 1 + .../drivers/nvme/debian/control | 17 + .../drivers/nvme/debian/control.no_dkms | 14 + .../drivers/nvme/debian/copyright | 19 + .../nvme/debian/mlnx-nvme-dkms.postinst | 43 + .../drivers/nvme/debian/mlnx-nvme-dkms.prerm | 11 + .../drivers/nvme/debian/rules | 109 + .../drivers/nvme/debian/source/format | 1 + .../drivers/nvme/dkms.conf | 62 + .../drivers/nvme/host/Kconfig | 85 + .../drivers/nvme/host/Makefile | 66 + .../drivers/nvme/host/core.c | 4891 +++++ .../drivers/nvme/host/fabrics.c | 1205 ++ .../drivers/nvme/host/fabrics.h | 203 + .../drivers/nvme/host/fault_inject.c | 82 + .../drivers/nvme/host/fc.c | 4052 ++++ .../drivers/nvme/host/fc.h | 227 + .../drivers/nvme/host/hwmon.c | 267 + .../drivers/nvme/host/ioctl.c | 497 + .../drivers/nvme/host/multipath.c | 921 + .../drivers/nvme/host/nvfs-dma.c | 43 + 
.../drivers/nvme/host/nvfs-dma.h | 112 + .../drivers/nvme/host/nvfs-rdma.c | 43 + .../drivers/nvme/host/nvfs-rdma.h | 121 + .../drivers/nvme/host/nvfs.h | 103 + .../drivers/nvme/host/nvme-core_dummy.c | 60 + .../drivers/nvme/host/nvme-fabrics_dummy.c | 60 + .../drivers/nvme/host/nvme-fc_dummy.c | 60 + .../drivers/nvme/host/nvme-rdma_dummy.c | 60 + .../drivers/nvme/host/nvme.h | 940 + .../drivers/nvme/host/nvme_dummy.c | 60 + .../drivers/nvme/host/nvme_snap_vfio_pci.c | 366 + .../drivers/nvme/host/passthru.c | 49 + .../drivers/nvme/host/passthru.h | 15 + .../drivers/nvme/host/pci.c | 3829 ++++ .../drivers/nvme/host/qla2xxx_dummy.c | 60 + .../drivers/nvme/host/rdma.c | 2581 +++ .../drivers/nvme/host/tcm_qla2xxx_dummy.c | 60 + .../drivers/nvme/host/tcp.c | 2693 +++ .../drivers/nvme/host/trace.c | 325 + .../drivers/nvme/host/trace.h | 175 + .../drivers/nvme/host/zns.c | 250 + .../drivers/nvme/lpfc/Makefile | 11 + .../drivers/nvme/lpfc/lpfc_dummy.c | 60 + .../drivers/nvme/mlnx-nvme_spec_ | 226 + .../drivers/nvme/target/Kconfig | 85 + .../drivers/nvme/target/Makefile | 42 + .../drivers/nvme/target/admin-cmd.c | 1078 ++ .../drivers/nvme/target/configfs.c | 2277 +++ .../drivers/nvme/target/core.c | 1863 ++ .../drivers/nvme/target/discovery.c | 407 + .../drivers/nvme/target/fabrics-cmd.c | 338 + .../drivers/nvme/target/fc.c | 2948 +++ .../drivers/nvme/target/fcloop.c | 1654 ++ .../drivers/nvme/target/io-cmd-bdev.c | 463 + .../drivers/nvme/target/io-cmd-file.c | 410 + .../drivers/nvme/target/loop.c | 744 + .../drivers/nvme/target/nvme-fcloop_dummy.c | 60 + .../drivers/nvme/target/nvme-loop_dummy.c | 60 + .../drivers/nvme/target/nvmet-fc_dummy.c | 60 + .../drivers/nvme/target/nvmet-rdma_dummy.c | 60 + .../drivers/nvme/target/nvmet.h | 756 + .../drivers/nvme/target/nvmet_dummy.c | 60 + .../drivers/nvme/target/passthru.c | 653 + .../drivers/nvme/target/rdma.c | 2279 +++ .../drivers/nvme/target/rdma_offload.c | 1127 ++ .../drivers/nvme/target/rdma_offload.h | 132 + .../drivers/nvme/target/tcp.c | 1879 ++ .../drivers/nvme/target/trace.c | 235 + .../drivers/nvme/target/trace.h | 165 + .../drivers/nvme/target/zns.c | 625 + .../drivers/nvme/tools/sign-modules | 58 + src/mlnx-ofa_kernel-5.8/drivers/scsi/Makefile | 1 + .../drivers/scsi/scsi_priv.h | 193 + .../drivers/scsi/scsi_transport_srp.c | 905 + .../drivers/vdpa/mlx5/Makefile | 3 + .../drivers/vdpa/mlx5/vdpa_main.c | 58 + src/mlnx-ofa_kernel-5.8/fs/cifs/Makefile | 3 + src/mlnx-ofa_kernel-5.8/fs/cifs/cifs_main.c | 58 + .../include/asm-generic/bug.h | 12 + .../include/linux/auxiliary_bus.h | 251 + src/mlnx-ofa_kernel-5.8/include/linux/bit.h | 14 + .../include/linux/bitfield.h | 165 + .../include/linux/bitmap.h | 41 + .../include/linux/bitops.h | 17 + src/mlnx-ofa_kernel-5.8/include/linux/bits.h | 24 + .../include/linux/blk-mq-pci.h | 43 + .../include/linux/blk-mq-rdma.h | 20 + .../include/linux/blk-mq.h | 172 + .../include/linux/blk_types.h | 12 + .../include/linux/blkdev.h | 95 + src/mlnx-ofa_kernel-5.8/include/linux/bpf.h | 56 + .../include/linux/bpf_trace.h | 11 + .../include/linux/build_bug.h | 17 + src/mlnx-ofa_kernel-5.8/include/linux/cdev.h | 48 + .../include/linux/cgroup_rdma.h | 10 + .../include/linux/compat-2.6.h | 81 + .../include/linux/compat-3.10.h | 8 + .../include/linux/compat-3.12.h | 16 + .../include/linux/compat-3.15.h | 13 + .../include/linux/compat-4.0.h | 26 + .../include/linux/compat-4.1.h | 23 + .../include/linux/compat-4.10.h | 102 + .../include/linux/compat_fix.h | 55 + .../include/linux/compiler-clang.h | 26 + 
.../include/linux/compiler-gcc.h | 26 + .../include/linux/compiler-intel.h | 17 + .../include/linux/compiler.h | 54 + .../include/linux/compiler_attributes.h | 28 + src/mlnx-ofa_kernel-5.8/include/linux/dcbnl.h | 54 + .../include/linux/device.h | 114 + src/mlnx-ofa_kernel-5.8/include/linux/dim.h | 351 + .../include/linux/ethtool.h | 272 + .../include/linux/export.h | 16 + .../include/linux/filter.h | 61 + .../include/linux/firmware.h | 13 + src/mlnx-ofa_kernel-5.8/include/linux/fs.h | 22 + src/mlnx-ofa_kernel-5.8/include/linux/gfp.h | 26 + .../include/linux/hashtable.h | 124 + src/mlnx-ofa_kernel-5.8/include/linux/idr.h | 20 + .../include/linux/if_ether.h | 15 + .../include/linux/if_link.h | 26 + .../include/linux/if_vlan.h | 41 + .../include/linux/indirect_call_wrapper.h | 71 + src/mlnx-ofa_kernel-5.8/include/linux/inet.h | 122 + .../include/linux/inet_lro.h | 10 + .../include/linux/inetdevice.h | 54 + .../include/linux/interval_tree.h | 16 + .../include/linux/irq_poll.h | 10 + .../include/linux/kconfig.h | 12 + .../include/linux/kern_levels.h | 22 + .../include/linux/kernel.h | 31 + src/mlnx-ofa_kernel-5.8/include/linux/kmod.h | 33 + src/mlnx-ofa_kernel-5.8/include/linux/kref.h | 16 + src/mlnx-ofa_kernel-5.8/include/linux/list.h | 32 + src/mlnx-ofa_kernel-5.8/include/linux/llist.h | 13 + .../include/linux/lockdep.h | 15 + src/mlnx-ofa_kernel-5.8/include/linux/log2.h | 37 + .../include/linux/mlx5/accel.h | 170 + .../include/linux/mlx5/cq.h | 207 + .../include/linux/mlx5/device.h | 1554 ++ .../include/linux/mlx5/doorbell.h | 60 + .../include/linux/mlx5/driver.h | 1571 ++ .../include/linux/mlx5/eq.h | 63 + .../include/linux/mlx5/eswitch.h | 220 + .../include/linux/mlx5/fs.h | 321 + .../include/linux/mlx5/fs_helpers.h | 142 + .../include/linux/mlx5/macsec.h | 9 + .../include/linux/mlx5/mlx5_ifc.h | 12470 ++++++++++++ .../include/linux/mlx5/mlx5_ifc_fpga.h | 616 + .../include/linux/mlx5/mlx5_ifc_vdpa.h | 168 + .../include/linux/mlx5/mpfs.h | 18 + .../include/linux/mlx5/nvmf.h | 112 + .../include/linux/mlx5/port.h | 250 + .../include/linux/mlx5/qp.h | 582 + .../include/linux/mlx5/rsc_dump.h | 51 + .../include/linux/mlx5/transobj.h | 89 + .../include/linux/mlx5/vport.h | 138 + src/mlnx-ofa_kernel-5.8/include/linux/mm.h | 108 + .../include/linux/mmu_notifier.h | 18 + .../include/linux/mod_devicetable.h | 912 + .../include/linux/module.h | 19 + src/mlnx-ofa_kernel-5.8/include/linux/net.h | 25 + .../include/linux/netdev_features.h | 19 + .../include/linux/netdevice.h | 338 + .../include/linux/nodemask.h | 17 + .../include/linux/nospec.h | 70 + .../include/linux/nvme-fc-driver.h | 1052 + .../include/linux/nvme-fc.h | 438 + .../include/linux/nvme-pci.h | 16 + .../include/linux/nvme-peer.h | 64 + .../include/linux/nvme-rdma.h | 89 + src/mlnx-ofa_kernel-5.8/include/linux/nvme.h | 1677 ++ .../include/linux/overflow.h | 307 + .../include/linux/page_ref.h | 35 + .../include/linux/pci-p2pdma.h | 104 + src/mlnx-ofa_kernel-5.8/include/linux/pci.h | 121 + .../include/linux/pci_regs.h | 53 + .../include/linux/pm_qos.h | 85 + src/mlnx-ofa_kernel-5.8/include/linux/poll.h | 9 + .../include/linux/radix-tree.h | 23 + .../include/linux/rbtree.h | 32 + .../include/linux/rculist.h | 35 + .../include/linux/rcupdate.h | 55 + .../include/linux/refcount.h | 48 + .../include/linux/rhashtable.h | 2150 +++ .../include/linux/scatterlist.h | 152 + src/mlnx-ofa_kernel-5.8/include/linux/sched.h | 23 + .../include/linux/sched/mm.h | 36 + .../include/linux/sched/signal.h | 10 + .../include/linux/sched/task.h | 10 + 
src/mlnx-ofa_kernel-5.8/include/linux/sdt.h | 16 + .../include/linux/seq_file.h | 23 + .../include/linux/skbuff.h | 33 + src/mlnx-ofa_kernel-5.8/include/linux/slab.h | 47 + .../include/linux/stddef.h | 27 + .../include/linux/string.h | 54 + .../include/linux/sunrpc/auth.h | 13 + .../include/linux/sunrpc/rpc_rdma.h | 191 + .../include/linux/sunrpc/rpc_rdma_cid.h | 24 + .../include/linux/sunrpc/svc_rdma.h | 225 + .../include/linux/sunrpc/svc_rdma_pcl.h | 128 + .../include/linux/sunrpc/xprtrdma.h | 73 + src/mlnx-ofa_kernel-5.8/include/linux/sysfs.h | 30 + .../include/linux/t10-pi.h | 231 + .../include/linux/timekeeping.h | 15 + src/mlnx-ofa_kernel-5.8/include/linux/types.h | 27 + .../include/linux/uaccess.h | 12 + src/mlnx-ofa_kernel-5.8/include/linux/units.h | 94 + src/mlnx-ofa_kernel-5.8/include/linux/uuid.h | 73 + .../include/linux/xarray.h | 1836 ++ src/mlnx-ofa_kernel-5.8/include/linux/xz.h | 284 + .../include/net/addrconf.h | 57 + src/mlnx-ofa_kernel-5.8/include/net/bareudp.h | 19 + src/mlnx-ofa_kernel-5.8/include/net/bonding.h | 184 + src/mlnx-ofa_kernel-5.8/include/net/devlink.h | 210 + src/mlnx-ofa_kernel-5.8/include/net/dst.h | 23 + .../include/net/dst_metadata.h | 102 + .../include/net/flow_dissector.h | 537 + .../include/net/flow_keys.h | 26 + .../include/net/flow_offload.h | 391 + src/mlnx-ofa_kernel-5.8/include/net/geneve.h | 21 + src/mlnx-ofa_kernel-5.8/include/net/gre.h | 22 + src/mlnx-ofa_kernel-5.8/include/net/ip_fib.h | 24 + .../include/net/ip_tunnels.h | 158 + src/mlnx-ofa_kernel-5.8/include/net/ipv6.h | 24 + .../include/net/ipv6_stubs.h | 10 + src/mlnx-ofa_kernel-5.8/include/net/macsec.h | 18 + src/mlnx-ofa_kernel-5.8/include/net/mlxdevm.h | 449 + .../include/net/netfilter/nf_flow_table.h | 14 + .../net/netfilter/nf_flow_table_4_18.h | 235 + src/mlnx-ofa_kernel-5.8/include/net/netlink.h | 39 + src/mlnx-ofa_kernel-5.8/include/net/pkt_cls.h | 230 + src/mlnx-ofa_kernel-5.8/include/net/psample.h | 26 + src/mlnx-ofa_kernel-5.8/include/net/sock.h | 45 + .../include/net/switchdev.h | 63 + .../include/net/tc_act/tc_csum.h | 34 + .../include/net/tc_act/tc_ct.h | 17 + .../include/net/tc_act/tc_ct_4_18.h | 94 + .../include/net/tc_act/tc_gact.h | 120 + .../include/net/tc_act/tc_mirred.h | 107 + .../include/net/tc_act/tc_mpls.h | 10 + .../include/net/tc_act/tc_pedit.h | 96 + .../include/net/tc_act/tc_tunnel_key.h | 212 + .../include/net/tc_act/tc_vlan.h | 65 + src/mlnx-ofa_kernel-5.8/include/net/tls.h | 27 + src/mlnx-ofa_kernel-5.8/include/net/vxlan.h | 33 + src/mlnx-ofa_kernel-5.8/include/net/xdp.h | 17 + src/mlnx-ofa_kernel-5.8/include/net/xfrm.h | 12 + src/mlnx-ofa_kernel-5.8/include/rdma/ib.h | 81 + .../include/rdma/ib_addr.h | 270 + .../include/rdma/ib_cache.h | 118 + src/mlnx-ofa_kernel-5.8/include/rdma/ib_cm.h | 574 + .../include/rdma/ib_hdrs.h | 307 + src/mlnx-ofa_kernel-5.8/include/rdma/ib_mad.h | 819 + .../include/rdma/ib_marshall.h | 28 + .../include/rdma/ib_pack.h | 284 + src/mlnx-ofa_kernel-5.8/include/rdma/ib_pma.h | 130 + src/mlnx-ofa_kernel-5.8/include/rdma/ib_sa.h | 609 + src/mlnx-ofa_kernel-5.8/include/rdma/ib_smi.h | 158 + .../include/rdma/ib_sysfs.h | 37 + .../include/rdma/ib_umem.h | 230 + .../include/rdma/ib_umem_odp.h | 115 + .../include/rdma/ib_verbs.h | 4851 +++++ .../include/rdma/ib_verbs_nvmf.h | 63 + .../include/rdma/ib_verbs_nvmf_def.h | 53 + src/mlnx-ofa_kernel-5.8/include/rdma/iba.h | 146 + .../include/rdma/ibta_vol1_c12.h | 219 + src/mlnx-ofa_kernel-5.8/include/rdma/iw_cm.h | 226 + .../include/rdma/iw_portmap.h | 65 + 
src/mlnx-ofa_kernel-5.8/include/rdma/lag.h | 23 + .../include/rdma/mr_pool.h | 17 + .../include/rdma/opa_addr.h | 91 + .../include/rdma/opa_port_info.h | 385 + .../include/rdma/opa_smi.h | 124 + .../include/rdma/opa_vnic.h | 97 + .../include/rdma/peer_mem.h | 175 + .../include/rdma/rdma_cm.h | 394 + .../include/rdma/rdma_cm_ib.h | 27 + .../include/rdma/rdma_counter.h | 68 + .../include/rdma/rdma_netlink.h | 125 + .../include/rdma/rdma_vt.h | 532 + .../include/rdma/rdmavt_cq.h | 67 + .../include/rdma/rdmavt_mr.h | 155 + .../include/rdma/rdmavt_qp.h | 1003 + .../include/rdma/restrack.h | 186 + src/mlnx-ofa_kernel-5.8/include/rdma/rw.h | 73 + .../include/rdma/signature.h | 124 + .../include/rdma/tid_rdma_defs.h | 108 + .../include/rdma/uverbs_ioctl.h | 1016 + .../include/rdma/uverbs_named_ioctl.h | 97 + .../include/rdma/uverbs_std_types.h | 178 + .../include/rdma/uverbs_types.h | 184 + src/mlnx-ofa_kernel-5.8/include/scsi/iser.h | 78 + src/mlnx-ofa_kernel-5.8/include/scsi/scsi.h | 12 + .../include/scsi/scsi_device.h | 20 + .../include/scsi/scsi_transport_srp.h | 145 + src/mlnx-ofa_kernel-5.8/include/scsi/srp.h | 310 + .../include/trace/events/ib_mad.h | 390 + .../include/trace/events/ib_umad.h | 126 + .../include/trace/events/rdma.h | 168 + .../include/trace/events/rdma_core.h | 394 + .../include/trace/events/rpcrdma.h | 2243 +++ .../include/trace/events/sunrpc_base.h | 18 + .../include/uapi/linux/devlink.h | 99 + .../include/uapi/linux/eventpoll.h | 18 + .../include/uapi/linux/net_tstamp.h | 12 + .../include/uapi/linux/nvme_ioctl.h | 104 + .../include/uapi/linux/pkt_cls.h | 294 + .../include/uapi/linux/tc_act/tc_ct.h | 18 + .../include/uapi/linux/tc_act/tc_ct_4_18.h | 43 + .../include/uapi/linux/tc_act/tc_pedit.h | 79 + .../include/uapi/linux/tc_act/tc_tunnel_key.h | 75 + .../include/uapi/mlxdevm/mlxdevm_netlink.h | 206 + .../include/uapi/rdma/bnxt_re-abi.h | 126 + .../include/uapi/rdma/cxgb4-abi.h | 115 + .../include/uapi/rdma/efa-abi.h | 133 + .../include/uapi/rdma/hfi/hfi1_ioctl.h | 174 + .../include/uapi/rdma/hfi/hfi1_user.h | 268 + .../include/uapi/rdma/hns-abi.h | 99 + .../include/uapi/rdma/ib_user_ioctl_cmds.h | 385 + .../include/uapi/rdma/ib_user_ioctl_verbs.h | 270 + .../include/uapi/rdma/ib_user_mad.h | 239 + .../include/uapi/rdma/ib_user_sa.h | 77 + .../include/uapi/rdma/ib_user_verbs.h | 1301 ++ .../include/uapi/rdma/irdma-abi.h | 111 + .../include/uapi/rdma/mlx4-abi.h | 191 + .../include/uapi/rdma/mlx5-abi.h | 523 + .../include/uapi/rdma/mlx5_user_ioctl_cmds.h | 361 + .../include/uapi/rdma/mlx5_user_ioctl_verbs.h | 127 + .../include/uapi/rdma/mthca-abi.h | 112 + .../include/uapi/rdma/ocrdma-abi.h | 152 + .../include/uapi/rdma/qedr-abi.h | 174 + .../include/uapi/rdma/rdma_netlink.h | 595 + .../include/uapi/rdma/rdma_user_cm.h | 341 + .../include/uapi/rdma/rdma_user_ioctl.h | 91 + .../include/uapi/rdma/rdma_user_ioctl_cmds.h | 87 + .../include/uapi/rdma/rdma_user_rxe.h | 223 + .../include/uapi/rdma/rvt-abi.h | 66 + .../include/uapi/rdma/siw-abi.h | 186 + .../include/uapi/rdma/vmw_pvrdma-abi.h | 310 + src/mlnx-ofa_kernel-5.8/makefile | 1 + src/mlnx-ofa_kernel-5.8/net/9p/9pnet_rdma.c | 59 + src/mlnx-ofa_kernel-5.8/net/9p/Makefile | 1 + src/mlnx-ofa_kernel-5.8/net/mlxdevm/Makefile | 1 + src/mlnx-ofa_kernel-5.8/net/mlxdevm/mlxdevm.c | 2463 +++ src/mlnx-ofa_kernel-5.8/net/rds/Makefile | 3 + .../net/rds/rds_rdma_dummy.c | 61 + src/mlnx-ofa_kernel-5.8/net/sched/Makefile | 16 + .../net/sched/act_ct_4_18.c | 1562 ++ src/mlnx-ofa_kernel-5.8/net/sched/act_pedit.c | 668 + 
.../net/sched/act_tunnel_key.c | 605 + src/mlnx-ofa_kernel-5.8/net/sched/act_vlan.c | 301 + .../net/sched/cls_flower_4_18.c | 2250 +++ .../net/sched/cls_flower_compat.c | 4554 +++++ src/mlnx-ofa_kernel-5.8/net/smc/Makefile | 5 + .../net/smc/smc_diag_dummy.c | 59 + src/mlnx-ofa_kernel-5.8/net/smc/smc_dummy.c | 59 + .../net/sunrpc/xprtrdma/Makefile | 50 + .../net/sunrpc/xprtrdma/Module.supported | 3 + .../net/sunrpc/xprtrdma/_makefile_ | 161 + .../net/sunrpc/xprtrdma/autogen.sh | 11 + .../net/sunrpc/xprtrdma/backchannel.c | 282 + .../net/sunrpc/xprtrdma/common.postinst | 296 + .../net/sunrpc/xprtrdma/debian/changelog | 5 + .../net/sunrpc/xprtrdma/debian/compat | 1 + .../net/sunrpc/xprtrdma/debian/control | 17 + .../sunrpc/xprtrdma/debian/control.no_dkms | 14 + .../net/sunrpc/xprtrdma/debian/copyright | 19 + .../debian/mlnx-nfsrdma-dkms.postinst | 43 + .../xprtrdma/debian/mlnx-nfsrdma-dkms.prerm | 11 + .../net/sunrpc/xprtrdma/debian/rules | 108 + .../net/sunrpc/xprtrdma/debian/source/format | 1 + .../net/sunrpc/xprtrdma/dkms.conf | 28 + .../net/sunrpc/xprtrdma/frwr_ops.c | 729 + .../net/sunrpc/xprtrdma/mlnx-nfsrdma_spec_ | 216 + .../net/sunrpc/xprtrdma/module.c | 52 + .../net/sunrpc/xprtrdma/nvfs.h | 103 + .../net/sunrpc/xprtrdma/nvfs_rpc_rdma.c | 43 + .../net/sunrpc/xprtrdma/nvfs_rpc_rdma.h | 59 + .../net/sunrpc/xprtrdma/rpc_rdma.c | 1505 ++ .../net/sunrpc/xprtrdma/rpcrdma_dummy.c | 59 + .../net/sunrpc/xprtrdma/svc_rdma.c | 300 + .../sunrpc/xprtrdma/svc_rdma_backchannel.c | 293 + .../net/sunrpc/xprtrdma/svc_rdma_pcl.c | 306 + .../net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 869 + .../net/sunrpc/xprtrdma/svc_rdma_rw.c | 1165 ++ .../net/sunrpc/xprtrdma/svc_rdma_sendto.c | 1044 + .../net/sunrpc/xprtrdma/svc_rdma_transport.c | 610 + .../net/sunrpc/xprtrdma/svcrdma_dummy.c | 76 + .../net/sunrpc/xprtrdma/tools/sign-modules | 58 + .../net/sunrpc/xprtrdma/transport.c | 806 + .../net/sunrpc/xprtrdma/verbs.c | 1401 ++ .../net/sunrpc/xprtrdma/xprt_rdma.h | 601 + .../net/sunrpc/xprtrdma/xprtrdma_dummy.c | 76 + .../ofed_scripts/82-net-setup-link.rules | 12 + .../ofed_scripts/83-mlnx-sf-name.rules | 7 + .../ofed_scripts/90-ib.rules | 27 + src/mlnx-ofa_kernel-5.8/ofed_scripts/Makefile | 61 + .../ofed_scripts/autoversion.sh | 32 + .../ofed_scripts/auxdev-sf-netdev-rename | 39 + .../ofed_scripts/backports_copy_patches.sh | 122 + .../ofed_scripts/backports_fixup_changes.sh | 184 + .../ofed_scripts/check_ofed_kernel.sh | 155 + .../ofed_scripts/checkout_files | 48 + src/mlnx-ofa_kernel-5.8/ofed_scripts/cleanup | 87 + .../ofed_scripts/common.postinst | 296 + .../ofed_scripts/configure | 2007 ++ .../ofed_scripts/convert_to_mlnx_en | 65 + .../deb_mlnx_en_service_install_helper | 42 + .../ofed_scripts/dkms_build_ulps.sh | 75 + .../ofed_scripts/dkms_ofed | 217 + .../ofed_scripts/dkms_ofed_post_build.sh | 72 + .../ofed_scripts/gen-compat-autoconf.sh | 116 + .../ofed_scripts/gen-compat-config.sh | 789 + .../ofed_scripts/generate_dkms_conf.sh | 98 + .../generate_mlnx_en_dkms_conf.sh | 89 + .../ofed_scripts/get_backport_dir.sh | 238 + .../ofed_scripts/get_backports_per_path.sh | 114 + .../ofed_scripts/ib_ipoib.conf | 6 + .../ofed_scripts/ibdev2netdev | 319 + .../ofed_scripts/install_helper | 45 + src/mlnx-ofa_kernel-5.8/ofed_scripts/makefile | 361 + .../ofed_scripts/makefile.packaging | 184 + .../ofed_scripts/mlnx-bf.conf | 1 + .../mlnx-ofed-kernel-utils.openibd.init | 1720 ++ .../mlnx-ofed-kernel-utils.openibd.upstart | 42 + .../ofed_scripts/mlnx.conf | 4 + .../ofed_scripts/mlnx_bf_assign_ct_cores.sh | 45 + 
.../ofed_scripts/mlnx_conf_mgr.sh | 254 + .../ofed_scripts/mlnx_en/LICENSE | 14 + .../mlnx_en/MLNX_EN_Linux_README.txt | 415 + .../mlnx_en/MLNX_EN_Linux_Release_Notes.txt | 274 + .../ofed_scripts/mlnx_en/Makefile.mlnx_en | 23 + .../ofed_scripts/mlnx_en/debian/changelog | 5 + .../ofed_scripts/mlnx_en/debian/compat | 1 + .../ofed_scripts/mlnx_en/debian/control | 27 + .../mlnx_en/debian/control.no_dkms | 20 + .../ofed_scripts/mlnx_en/debian/copyright | 19 + .../mlnx_en/debian/mlnx-en-dkms.postinst | 21 + .../mlnx_en/debian/mlnx-en-dkms.prerm | 13 + .../mlnx_en/debian/mlnx-en-utils.postinst | 15 + .../mlnx_en/debian/mlnx-en-utils.prerm | 14 + .../mlnx_en/debian/mlnx-en.upstart | 42 + .../ofed_scripts/mlnx_en/debian/rules | 156 + .../ofed_scripts/mlnx_en/debian/source/format | 1 + .../ofed_scripts/mlnx_en/kmodtool.rh5 | 346 + .../ofed_scripts/mlnx_en/kmodtool.rh7 | 270 + .../ofed_scripts/mlnx_en/kmp-tool.sh | 243 + .../ofed_scripts/mlnx_en/makefile.mlnx_en | 213 + .../ofed_scripts/mlnx_en/mlnx-en.conf | 15 + .../ofed_scripts/mlnx_en/mlnx-en.d | 767 + .../ofed_scripts/mlnx_en/mlnx-en.d.service | 18 + .../ofed_scripts/mlnx_en/mlnx-en.d_deb | 665 + .../ofed_scripts/mlnx_en/mlnx_en_patch.sh | 842 + .../ofed_scripts/mlnx_en/mlx4.files | 5 + .../ofed_scripts/mlnx_en/mlx4.files.sles10 | 3 + .../ofed_scripts/mlnx_en/mlx4.files.sles11 | 3 + .../ofed_scripts/mlnx_en/mlx4_core.conf | 1 + .../ofed_scripts/mlnx_en/mlx4_en.7 | 171 + .../ofed_scripts/mlnx_en/mlx4_en.conf | 1 + .../mlnx_en/mlx4_ib-dummy/Makefile | 3 + .../ofed_scripts/mlnx_en/mlx4_ib-dummy/main.c | 59 + .../ofed_scripts/mlnx_en/mlx4_ib.conf | 1 + .../mlnx_en/mlx5_ib-dummy/Makefile | 3 + .../ofed_scripts/mlnx_en/mlx5_ib-dummy/main.c | 59 + .../ofed_scripts/mlnx_en/scripts/install.sh | 973 + .../mlnx_en/scripts/mlnx_en_uninstall.sh | 37 + .../mlnx_en/scripts/mlnx_fw_updater.pl | 291 + .../ofed_scripts/mlnx_interface_mgr.sh | 462 + .../ofed_scripts/mlnx_interface_mgr@.service | 7 + .../ofed_scripts/mlnx_interface_mgr_deb.sh | 453 + .../ofed_scripts/mlx5.conf | 25 + src/mlnx-ofa_kernel-5.8/ofed_scripts/mlxnet | 211 + .../ofed_scripts/mlxnet.conf | 3 + .../ofed_scripts/net-interfaces | 154 + .../ofed_scripts/nfs_header_checkout_files | 22 + .../ofed_scripts/odp_stat.sh | 83 + .../ofed_scripts/ofed_checkout.sh | 64 + .../ofed_scripts/ofed_format_patch.sh | 56 + .../ofed_scripts/ofed_get_patches.sh | 43 + .../ofed_scripts/ofed_makedist.sh | 129 + .../ofed_scripts/ofed_patch.sh | 338 + .../ofed_scripts/openib.conf | 45 + src/mlnx-ofa_kernel-5.8/ofed_scripts/openibd | 1913 ++ .../ofed_scripts/openibd.service | 18 + .../ofed_scripts/pre_build.sh | 51 + .../ofed_scripts/setup_mr_cache.sh | 89 + .../ofed_scripts/sf-rep-netdev-rename | 22 + src/mlnx-ofa_kernel-5.8/ofed_scripts/strip.sh | 45 + .../ofed_scripts/tools/sign-modules | 84 + .../ofed_scripts/truescale.cmds | 265 + .../unifdef_tool/get_ofed_basecode.sh | 179 + .../help_scripts/build_defs_file.sh | 98 + .../help_scripts/handle_config_h.sh | 6 + .../help_scripts/handle_configure_ac.sh | 4 + .../help_scripts/split_config_h.sh | 25 + .../ofed_scripts/validate_backport_patches.sh | 59 + .../ofed_scripts/vf-net-link-name.sh | 198 + src/mlnx-ofa_kernel-5.8/scripts/admin_rdma.sh | 471 + src/mlnx-ofa_kernel-5.8/scripts/keep_files | 1 + .../scripts/refreshBackports.sh | 108 + .../scripts/setlocalversion | 55 + 1604 files changed, 554724 insertions(+) create mode 100644 licenses/BSD 3-Clause create mode 100644 licenses/GPL-2.0 create mode 100644 mlnx-ofa_kernel.spec create mode 100644 
src/mlnx-ofa_kernel-5.8/.gitignore create mode 100644 src/mlnx-ofa_kernel-5.8/COPYING create mode 100644 src/mlnx-ofa_kernel-5.8/Documentation/infiniband/tag_matching.txt create mode 100644 src/mlnx-ofa_kernel-5.8/Documentation/release_notes-storage.txt create mode 100644 src/mlnx-ofa_kernel-5.8/LINUX_BASE_BRANCH create mode 120000 src/mlnx-ofa_kernel-5.8/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/Module.supported create mode 100644 src/mlnx-ofa_kernel-5.8/README create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0001-BACKPORT-block-blk-mq-rdma.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0002-BACKPORT-drivers-base-auxiliary.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0003-BACKPORT-drivers-infiniband-core-addr.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0004-BACKPORT-drivers-infiniband-core-cache.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0005-BACKPORT-drivers-infiniband-core-cgroup.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0006-BACKPORT-drivers-infiniband-core-cm.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0007-BACKPORT-drivers-infiniband-core-cm_trace.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0008-BACKPORT-drivers-infiniband-core-cma.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0009-BACKPORT-drivers-infiniband-core-cma_configfs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0010-BACKPORT-drivers-infiniband-core-cma_trace.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0011-BACKPORT-drivers-infiniband-core-core_priv.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0012-BACKPORT-drivers-infiniband-core-counters.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0013-BACKPORT-drivers-infiniband-core-cq.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0014-BACKPORT-drivers-infiniband-core-device.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0015-BACKPORT-drivers-infiniband-core-iwcm.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0016-BACKPORT-drivers-infiniband-core-iwpm_util.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0017-BACKPORT-drivers-infiniband-core-lag.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0018-BACKPORT-drivers-infiniband-core-mad.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0019-BACKPORT-drivers-infiniband-core-mad_rmpp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0020-BACKPORT-drivers-infiniband-core-netlink.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0021-BACKPORT-drivers-infiniband-core-nldev.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0022-BACKPORT-drivers-infiniband-core-peer_mem.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0023-BACKPORT-drivers-infiniband-core-rdma_core.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0024-BACKPORT-drivers-infiniband-core-roce_gid_mgmt.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0025-BACKPORT-drivers-infiniband-core-sa_query.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0026-BACKPORT-drivers-infiniband-core-trace.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0027-BACKPORT-drivers-infiniband-core-ucma.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0028-BACKPORT-drivers-infiniband-core-ud_header.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0030-BACKPORT-drivers-infiniband-core-umem_dmabuf.c.patch create 
mode 100644 src/mlnx-ofa_kernel-5.8/backports/0031-BACKPORT-drivers-infiniband-core-umem_odp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0032-BACKPORT-drivers-infiniband-core-user_mad.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0033-BACKPORT-drivers-infiniband-core-uverbs.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0034-BACKPORT-drivers-infiniband-core-uverbs_cmd.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0035-BACKPORT-drivers-infiniband-core-uverbs_ioctl.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0036-BACKPORT-drivers-infiniband-core-uverbs_main.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0037-BACKPORT-drivers-infiniband-core-uverbs_uapi.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0038-BACKPORT-drivers-infiniband-core-verbs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0039-BACKPORT-drivers-infiniband-debug-memtrack.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0040-BACKPORT-drivers-infiniband-hw-mlx5-Makefile.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0041-BACKPORT-drivers-infiniband-hw-mlx5-cq.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0042-BACKPORT-drivers-infiniband-hw-mlx5-devx.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0043-BACKPORT-drivers-infiniband-hw-mlx5-doorbell.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0044-BACKPORT-drivers-infiniband-hw-mlx5-fs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0045-BACKPORT-drivers-infiniband-hw-mlx5-ib_virt.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0046-BACKPORT-drivers-infiniband-hw-mlx5-main.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0047-BACKPORT-drivers-infiniband-hw-mlx5-mem.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0048-BACKPORT-drivers-infiniband-hw-mlx5-mlx5_ib.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0049-BACKPORT-drivers-infiniband-hw-mlx5-mr.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0050-BACKPORT-drivers-infiniband-hw-mlx5-odp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0051-BACKPORT-drivers-infiniband-hw-mlx5-qp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0052-BACKPORT-drivers-infiniband-hw-mlx5-srq.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0053-BACKPORT-drivers-infiniband-hw-mlx5-srq_cmd.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0054-BACKPORT-drivers-infiniband-hw-mlx5-wr.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0055-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0056-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_cm.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0057-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ethtool..patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0058-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_fs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0059-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ib.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0060-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_main.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0061-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_multicas.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0062-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_netlink..patch create mode 100644 
src/mlnx-ofa_kernel-5.8/backports/0063-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_vlan.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0064-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0065-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0066-BACKPORT-drivers-infiniband-ulp-iser-iser_initiator..patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0067-BACKPORT-drivers-infiniband-ulp-iser-iser_memory.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0068-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0070-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0071-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-Mak.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0072-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0073-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0074-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-all.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0075-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cmd.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0076-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cq..patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0077-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-crd.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0078-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0080-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0081-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0082-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0083-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0084-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0085-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0086-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en..patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0087-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0088-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0089-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0090-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0091-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0092-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0093-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0094-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 
src/mlnx-ofa_kernel-5.8/backports/0095-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0096-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0097-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0098-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0099-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0100-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0101-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0102-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0103-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0104-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0105-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0106-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0107-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0108-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0109-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0110-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0111-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0112-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0113-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0114-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0115-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0116-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0117-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0118-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0119-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0120-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0121-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0122-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0123-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0124-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 
src/mlnx-ofa_kernel-5.8/backports/0125-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0126-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0127-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0128-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0129-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0130-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0131-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0132-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0133-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0134-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0135-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0136-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0137-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0138-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0139-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0140-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0141-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0142-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0143-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0144-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0145-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0146-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0147-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0149-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0150-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0151-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0152-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0153-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0154-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0155-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 
src/mlnx-ofa_kernel-5.8/backports/0157-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0159-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0160-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-eq..patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0161-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0163-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0164-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0165-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0166-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0167-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0168-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0169-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0170-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0171-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0172-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0173-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0174-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw..patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0175-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0176-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0177-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-hea.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0178-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0179-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0180-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0181-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0182-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0183-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0184-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0185-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0186-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0187-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0188-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 
src/mlnx-ofa_kernel-5.8/backports/0189-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0190-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0191-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0192-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0193-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mai.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0195-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0197-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-por.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0198-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0199-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0200-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0201-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-uar.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0202-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-vpo.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0203-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-wq..patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0204-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0205-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_f.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0206-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_m.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0210-BACKPORT-drivers-nvme-host-ioctl.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0212-BACKPORT-drivers-nvme-host-nvfs-dma.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0213-BACKPORT-drivers-nvme-host-nvfs-rdma.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0217-BACKPORT-drivers-nvme-host-tcp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0218-BACKPORT-drivers-nvme-host-zns.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0219-BACKPORT-drivers-nvme-target-admin-cmd.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0220-BACKPORT-drivers-nvme-target-configfs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0221-BACKPORT-drivers-nvme-target-core.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0222-BACKPORT-drivers-nvme-target-fc.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0223-BACKPORT-drivers-nvme-target-fcloop.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0224-BACKPORT-drivers-nvme-target-io-cmd-bdev.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0225-BACKPORT-drivers-nvme-target-io-cmd-file.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0227-BACKPORT-drivers-nvme-target-nvmet.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0228-BACKPORT-drivers-nvme-target-passthru.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0229-BACKPORT-drivers-nvme-target-rdma.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0230-BACKPORT-drivers-nvme-target-rdma_offload.c.patch create mode 100644 
src/mlnx-ofa_kernel-5.8/backports/0231-BACKPORT-drivers-nvme-target-tcp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0232-BACKPORT-drivers-nvme-target-trace.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0233-BACKPORT-drivers-scsi-scsi_priv.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0234-BACKPORT-drivers-scsi-scsi_transport_srp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0235-BACKPORT-include-linux-auxiliary_bus.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0236-BACKPORT-include-linux-mlx5-driver.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0237-BACKPORT-include-linux-mlx5-eswitch.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0238-BACKPORT-include-linux-mlx5-fs.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0239-BACKPORT-include-linux-mlx5-port.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0240-BACKPORT-include-linux-mod_devicetable.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0241-BACKPORT-include-linux-nvme-fc-driver.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0242-BACKPORT-include-linux-nvme-rdma.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0243-BACKPORT-include-linux-nvme.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0244-BACKPORT-include-linux-sunrpc-svc_rdma.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0245-BACKPORT-include-rdma-ib_addr.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0246-BACKPORT-include-rdma-ib_umem.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0247-BACKPORT-include-rdma-ib_umem_odp.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0248-BACKPORT-include-rdma-ib_verbs.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0249-BACKPORT-include-rdma-lag.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0250-BACKPORT-include-rdma-rdma_counter.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0251-BACKPORT-include-rdma-rdma_netlink.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0252-BACKPORT-include-rdma-uverbs_ioctl.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0253-BACKPORT-include-trace-events-ib_mad.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0254-BACKPORT-include-trace-events-ib_umad.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0255-BACKPORT-include-trace-events-rpcrdma.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0256-BACKPORT-net-mlxdevm-mlxdevm.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0257-BACKPORT-net-sunrpc-xprtrdma-backchannel.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0258-BACKPORT-net-sunrpc-xprtrdma-frwr_ops.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0259-BACKPORT-net-sunrpc-xprtrdma-module.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0260-BACKPORT-net-sunrpc-xprtrdma-rpc_rdma.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0261-BACKPORT-net-sunrpc-xprtrdma-svc_rdma.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0262-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_backchannel.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0263-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_pcl.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0264-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_recvfrom.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0266-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_sendto.c.patch 
create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0267-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_transport.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0268-BACKPORT-net-sunrpc-xprtrdma-transport.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-drivers-infiniband-core-umem.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_rw.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-verbs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-net-sunrpc-xprtrdma-xprt_rdma.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-infiniband-core-sysfs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-nvme-host-fault_inject.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-nvme-host-trace.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-pci.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-infiniband-core-rw.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch create mode 100644 
src/mlnx-ofa_kernel-5.8/backports/0280-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0281-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-irq.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0282-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0283-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0284-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0285-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0286-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0287-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-hw-mlx5-main_ext.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-ulp-iser-iser_verbs.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ecp.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-include-net-psample.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0290-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-include-rdma-ib.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0293-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0294-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0295-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0296-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0297-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0298-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0306-BACKPORT-drivers-nvme-target-loop.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0307-BACKPORT-drivers-nvme-host-nvme.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0308-BACKPORT-drivers-nvme-host-core.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0309-BACKPORT-drivers-nvme-host-fc.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-nvme-host-multipath.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0311-BACKPORT-drivers-nvme-host-rdma.c.patch 
create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0312-BACKPORT-drivers-nvme-host-fabrics.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0313-BACKPORT-drivers-nvme-host-pci.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0314-BACKPORT-drivers-nvme-target-zns.c.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.h.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0316-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0318-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/backports/0319-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch create mode 100644 src/mlnx-ofa_kernel-5.8/block/blk-mq-rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/code-metrics.txt create mode 100644 src/mlnx-ofa_kernel-5.8/compat/Makefile.am create mode 100644 src/mlnx-ofa_kernel-5.8/compat/Makefile.real create mode 100755 src/mlnx-ofa_kernel-5.8/compat/autogen.sh create mode 100644 src/mlnx-ofa_kernel-5.8/compat/bitmap.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/build/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/compat/cls_api.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat-3.11.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat-3.12.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat-3.13.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat-3.15.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat-3.16.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat-3.18.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat-4.1.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat_atomic.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/compat_firmware_class.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/config/.gitignore create mode 100644 src/mlnx-ofa_kernel-5.8/compat/config/build-linux.m4 create mode 100644 src/mlnx-ofa_kernel-5.8/compat/config/parallel-build.m4 create mode 100644 src/mlnx-ofa_kernel-5.8/compat/config/rdma.m4 create mode 100755 src/mlnx-ofa_kernel-5.8/compat/config/warning_filter.sh create mode 100644 src/mlnx-ofa_kernel-5.8/compat/configure.ac create mode 100644 src/mlnx-ofa_kernel-5.8/compat/cordic.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/crc8.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/dim.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/exec.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/file.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/flow_dissector.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/flow_offload.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/idr.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/interval_tree.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/kfifo.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/kstrtox.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/macsec.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/mm_util.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/mmu_notifier.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/net_dim.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_core.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_offload.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/output_core.c create 
mode 100644 src/mlnx-ofa_kernel-5.8/compat/pci.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/rdma_dim.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/rhashtable.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/sch_codel.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/sch_fq_codel.c create mode 100755 src/mlnx-ofa_kernel-5.8/compat/scripts/compat_firmware_install create mode 100755 src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-autoconf.sh create mode 100755 src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-config.sh create mode 100755 src/mlnx-ofa_kernel-5.8/compat/scripts/skip-colors create mode 100644 src/mlnx-ofa_kernel-5.8/compat/string.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/syscall.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/uuid.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xarray.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_crc32.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_dec_bcj.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_dec_lzma2.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_dec_stream.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_dec_syms.c create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_lzma2.h create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_private.h create mode 100644 src/mlnx-ofa_kernel-5.8/compat/xz_stream.h create mode 100644 src/mlnx-ofa_kernel-5.8/compat_base create mode 100644 src/mlnx-ofa_kernel-5.8/compat_base_tree create mode 100644 src/mlnx-ofa_kernel-5.8/compat_base_tree_version create mode 100644 src/mlnx-ofa_kernel-5.8/compat_version create mode 120000 src/mlnx-ofa_kernel-5.8/configure create mode 100644 src/mlnx-ofa_kernel-5.8/debian/changelog create mode 100644 src/mlnx-ofa_kernel-5.8/debian/compat create mode 100644 src/mlnx-ofa_kernel-5.8/debian/control create mode 100644 src/mlnx-ofa_kernel-5.8/debian/control.no_dkms create mode 100644 src/mlnx-ofa_kernel-5.8/debian/copyright create mode 100755 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.prerm create mode 100644 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.examples create mode 100755 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postrm create mode 100755 src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/debian/rules create mode 100644 src/mlnx-ofa_kernel-5.8/debian/source/format create mode 100755 src/mlnx-ofa_kernel-5.8/devtools/add_metadata.sh create mode 100755 src/mlnx-ofa_kernel-5.8/devtools/verify_metadata.sh create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/base/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/base/auxiliary.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/addr.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cache.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cgroup.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_msgs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_configfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_priv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/core_priv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/counters.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/device.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_addr_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_core_uverbs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_mad_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_peer_mem.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_sa_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_ucm_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_msg.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/lag.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_priv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mr_pool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/multicast.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/netlink.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/nldev.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/opa_smi.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/packer.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/peer_mem.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/roce_gid_mgmt.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rw.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa_query.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/security.c 
create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sysfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/trace.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ucma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ud_header.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_dmabuf.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_odp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/user_mad.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_ioctl.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_marshall.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_async_fd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_counters.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_cq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_device.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_dm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_flow_action.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_mr.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_qp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_srq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_wq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_uapi.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs_nvmf.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/mtrack.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/Makefile create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/efa_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/ib_ehca.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v1.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v2.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/ib_ipath.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/irdma_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ah.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cong.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/doorbell.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/gsi.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_virt.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mad.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main_ext.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mem.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_ext.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_nvmf.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mr.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/nvmf.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/odp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qos.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp_nvmf.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qpc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/std_types.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/ocrdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rxe/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rxe/rdma_rxe_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_cm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_fs.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_genetlink.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ib.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_multicast.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_netlink.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_verbs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_vlan.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/inet_lro.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_cm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ethtool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_fs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ib.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_multicast.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_verbs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_vlan.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Module.supported create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/_makefile_ create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/autogen.sh create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/common.postinst create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/changelog create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/compat create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control.no_dkms create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/copyright create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/rules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/source/format create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/dkms.conf create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/ib_iser_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_initiator.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_memory.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_spec_ create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_verbs.c create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/tools/sign-modules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Module.supported create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/_makefile_ create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/autogen.sh create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/common.postinst create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/changelog create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/compat create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control.no_dkms create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/copyright create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/rules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/source/format create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/dkms.conf create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/isert_spec_ create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/tools/sign-modules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-clt_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-core_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-srv_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Kbuild create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Module.supported create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/_makefile_ create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/autogen.sh create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/common.postinst create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/changelog create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/compat create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control.no_dkms create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/copyright create mode 100755 
src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/rules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/source/format create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.prerm create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/dkms.conf create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/srp_spec_ create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/tools/sign-modules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/accel.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/alloc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/crdump.c create mode 
100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/dev.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.h create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/accept.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c create 
mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/csum.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/drop.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/goto.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mark.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred_nic.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/redirect_ingress.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/trap.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/tun.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan_mangle.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_dmfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_stats.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_common.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_debugfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_diag.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_sysfs.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_priv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/debugfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devm_port.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/pet_offloads.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_devlink_compat.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/events.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_exp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/health.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/sf.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mcg.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_esw_devm.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mr.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mst_dump.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/params.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/port.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rl.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov_sysfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_buddy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr_ste_v1.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/transobj.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/uar.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/vport.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/spectrum_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/Module.supported create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/nvme/_makefile_ create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/nvme/autogen.sh create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/nvme/common.postinst create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/changelog create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/compat create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control.no_dkms create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/copyright create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/rules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/source/format create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/dkms.conf create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/core.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fault_inject.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/hwmon.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/ioctl.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/multipath.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-core_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fabrics_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fc_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-rdma_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_snap_vfio_pci.c create mode 100644 
src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/pci.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/qla2xxx_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcm_qla2xxx_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/host/zns.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/lpfc_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/mlnx-nvme_spec_ create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Kconfig create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/admin-cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/configfs.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/core.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/discovery.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fabrics-cmd.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fc.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fcloop.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-bdev.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-file.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/loop.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-fcloop_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-loop_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-fc_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-rdma_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/passthru.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/tcp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/nvme/target/zns.c create mode 100755 src/mlnx-ofa_kernel-5.8/drivers/nvme/tools/sign-modules create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/scsi/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_priv.h create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_transport_srp.c create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/vdpa_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/fs/cifs/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/fs/cifs/cifs_main.c create mode 100644 src/mlnx-ofa_kernel-5.8/include/asm-generic/bug.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/auxiliary_bus.h create mode 100644 
src/mlnx-ofa_kernel-5.8/include/linux/bit.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/bitfield.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/bitmap.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/bitops.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/bits.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-pci.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/blk-mq.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/blk_types.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/blkdev.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/bpf.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/bpf_trace.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/build_bug.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/cdev.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/cgroup_rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat-2.6.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat-3.10.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat-3.12.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat-3.15.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat-4.0.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat-4.1.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat-4.10.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compat_fix.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compiler-clang.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compiler-gcc.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compiler-intel.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compiler.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/compiler_attributes.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/dcbnl.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/device.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/dim.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/ethtool.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/export.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/filter.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/firmware.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/fs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/gfp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/hashtable.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/idr.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/if_ether.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/if_link.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/if_vlan.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/indirect_call_wrapper.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/inet.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/inet_lro.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/inetdevice.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/interval_tree.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/irq_poll.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/kconfig.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/kern_levels.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/kernel.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/kmod.h create mode 100644 
src/mlnx-ofa_kernel-5.8/include/linux/kref.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/list.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/llist.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/lockdep.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/log2.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/accel.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/cq.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/device.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/doorbell.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/driver.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/eq.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/eswitch.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/fs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/fs_helpers.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/macsec.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/mlx5_ifc.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/mlx5_ifc_fpga.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/mlx5_ifc_vdpa.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/mpfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/nvmf.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/port.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/qp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/rsc_dump.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/transobj.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mlx5/vport.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mm.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mmu_notifier.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/mod_devicetable.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/module.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/net.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/netdev_features.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/netdevice.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nodemask.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nospec.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nvme-fc-driver.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nvme-fc.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nvme-pci.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nvme-peer.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nvme-rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/nvme.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/overflow.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/page_ref.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/pci-p2pdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/pci.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/pci_regs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/pm_qos.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/poll.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/radix-tree.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/rbtree.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/rculist.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/rcupdate.h create mode 100644 
src/mlnx-ofa_kernel-5.8/include/linux/refcount.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/rhashtable.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/scatterlist.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sched.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sched/mm.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sched/signal.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sched/task.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sdt.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/seq_file.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/skbuff.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/slab.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/stddef.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/string.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sunrpc/auth.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sunrpc/rpc_rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sunrpc/rpc_rdma_cid.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sunrpc/svc_rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sunrpc/svc_rdma_pcl.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sunrpc/xprtrdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/sysfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/t10-pi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/timekeeping.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/types.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/uaccess.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/units.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/uuid.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/xarray.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/linux/xz.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/addrconf.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/bareudp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/bonding.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/devlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/dst.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/dst_metadata.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/flow_dissector.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/flow_keys.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/flow_offload.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/geneve.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/gre.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/ip_fib.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/ip_tunnels.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/ipv6.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/ipv6_stubs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/macsec.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/mlxdevm.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/netfilter/nf_flow_table.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/netfilter/nf_flow_table_4_18.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/netlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/pkt_cls.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/psample.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/sock.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/switchdev.h create mode 100644 
src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_csum.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_ct.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_ct_4_18.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_gact.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_mirred.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_mpls.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_pedit.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_tunnel_key.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tc_act/tc_vlan.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/tls.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/vxlan.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/xdp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/net/xfrm.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_addr.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_cache.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_cm.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_hdrs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_mad.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_marshall.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_pack.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_pma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_sa.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_smi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_sysfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_umem.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_umem_odp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_verbs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_verbs_nvmf.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ib_verbs_nvmf_def.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/iba.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/ibta_vol1_c12.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/iw_cm.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/iw_portmap.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/lag.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/mr_pool.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/opa_addr.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/opa_port_info.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/opa_smi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/opa_vnic.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/peer_mem.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdma_cm.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdma_cm_ib.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdma_counter.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdma_netlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdma_vt.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdmavt_cq.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdmavt_mr.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rdmavt_qp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/restrack.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/rw.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/signature.h create mode 100644 
src/mlnx-ofa_kernel-5.8/include/rdma/tid_rdma_defs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/uverbs_ioctl.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/uverbs_named_ioctl.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/uverbs_std_types.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/rdma/uverbs_types.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/scsi/iser.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/scsi/scsi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/scsi/scsi_device.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/scsi/scsi_transport_srp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/scsi/srp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/trace/events/ib_mad.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/trace/events/ib_umad.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/trace/events/rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/trace/events/rdma_core.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/trace/events/rpcrdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/trace/events/sunrpc_base.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/devlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/eventpoll.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/net_tstamp.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/nvme_ioctl.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/pkt_cls.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/tc_act/tc_ct.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/tc_act/tc_ct_4_18.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/tc_act/tc_pedit.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/linux/tc_act/tc_tunnel_key.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/mlxdevm/mlxdevm_netlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/bnxt_re-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/cxgb4-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/efa-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/hfi/hfi1_ioctl.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/hfi/hfi1_user.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/hns-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/ib_user_ioctl_cmds.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/ib_user_ioctl_verbs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/ib_user_mad.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/ib_user_sa.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/ib_user_verbs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/irdma-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/mlx4-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/mlx5-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/mlx5_user_ioctl_cmds.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/mlx5_user_ioctl_verbs.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/mthca-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/ocrdma-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/qedr-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/rdma_netlink.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/rdma_user_cm.h create mode 100644 
src/mlnx-ofa_kernel-5.8/include/uapi/rdma/rdma_user_ioctl.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/rdma_user_ioctl_cmds.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/rdma_user_rxe.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/rvt-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/siw-abi.h create mode 100644 src/mlnx-ofa_kernel-5.8/include/uapi/rdma/vmw_pvrdma-abi.h create mode 120000 src/mlnx-ofa_kernel-5.8/makefile create mode 100644 src/mlnx-ofa_kernel-5.8/net/9p/9pnet_rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/9p/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/net/mlxdevm/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/net/mlxdevm/mlxdevm.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/rds/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/net/rds/rds_rdma_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sched/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/net/sched/act_ct_4_18.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sched/act_pedit.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sched/act_tunnel_key.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sched/act_vlan.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sched/cls_flower_4_18.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sched/cls_flower_compat.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/smc/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/net/smc/smc_diag_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/smc/smc_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/Module.supported create mode 100755 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/_makefile_ create mode 100755 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/autogen.sh create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/backchannel.c create mode 100755 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/common.postinst create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/changelog create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/compat create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/control create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/control.no_dkms create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/copyright create mode 100755 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/mlnx-nfsrdma-dkms.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/mlnx-nfsrdma-dkms.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/rules create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/debian/source/format create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/dkms.conf create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/frwr_ops.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/mlnx-nfsrdma_spec_ create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/module.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/nvfs.h create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/nvfs_rpc_rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/rpc_rdma.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/rpcrdma_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svc_rdma.c create mode 100644 
src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svc_rdma_backchannel.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svc_rdma_pcl.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svc_rdma_rw.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svc_rdma_sendto.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svc_rdma_transport.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/svcrdma_dummy.c create mode 100755 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/tools/sign-modules create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/transport.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/verbs.c create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/xprt_rdma.h create mode 100644 src/mlnx-ofa_kernel-5.8/net/sunrpc/xprtrdma/xprtrdma_dummy.c create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/82-net-setup-link.rules create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/83-mlnx-sf-name.rules create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/90-ib.rules create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/Makefile create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/autoversion.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/auxdev-sf-netdev-rename create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/backports_copy_patches.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/backports_fixup_changes.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/check_ofed_kernel.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/checkout_files create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/cleanup create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/common.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/configure create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/convert_to_mlnx_en create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/deb_mlnx_en_service_install_helper create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/dkms_build_ulps.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/dkms_ofed create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/dkms_ofed_post_build.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/gen-compat-autoconf.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/gen-compat-config.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/generate_dkms_conf.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/generate_mlnx_en_dkms_conf.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/get_backport_dir.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/get_backports_per_path.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/ib_ipoib.conf create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/ibdev2netdev create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/install_helper create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/makefile create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/makefile.packaging create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx-bf.conf create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx-ofed-kernel-utils.openibd.init create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx-ofed-kernel-utils.openibd.upstart create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx.conf create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_bf_assign_ct_cores.sh create mode 100755 
src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_conf_mgr.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/LICENSE create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/MLNX_EN_Linux_README.txt create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/MLNX_EN_Linux_Release_Notes.txt create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/Makefile.mlnx_en create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/changelog create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/compat create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/control create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/control.no_dkms create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/copyright create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/mlnx-en-dkms.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/mlnx-en-dkms.prerm create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/mlnx-en-utils.postinst create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/mlnx-en-utils.prerm create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/mlnx-en.upstart create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/rules create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/debian/source/format create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/kmodtool.rh5 create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/kmodtool.rh7 create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/kmp-tool.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/makefile.mlnx_en create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlnx-en.conf create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlnx-en.d create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlnx-en.d.service create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlnx-en.d_deb create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlnx_en_patch.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4.files create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4.files.sles10 create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4.files.sles11 create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4_core.conf create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4_en.7 create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4_en.conf create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4_ib-dummy/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4_ib-dummy/main.c create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx4_ib.conf create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx5_ib-dummy/Makefile create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/mlx5_ib-dummy/main.c create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/scripts/install.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/scripts/mlnx_en_uninstall.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_en/scripts/mlnx_fw_updater.pl create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_interface_mgr.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_interface_mgr@.service create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlnx_interface_mgr_deb.sh create mode 100644 
src/mlnx-ofa_kernel-5.8/ofed_scripts/mlx5.conf create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlxnet create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/mlxnet.conf create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/net-interfaces create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/nfs_header_checkout_files create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/odp_stat.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/ofed_checkout.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/ofed_format_patch.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/ofed_get_patches.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/ofed_makedist.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/ofed_patch.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/openib.conf create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/openibd create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/openibd.service create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/pre_build.sh create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/setup_mr_cache.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/sf-rep-netdev-rename create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/strip.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/tools/sign-modules create mode 100644 src/mlnx-ofa_kernel-5.8/ofed_scripts/truescale.cmds create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/unifdef_tool/get_ofed_basecode.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/unifdef_tool/help_scripts/build_defs_file.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/unifdef_tool/help_scripts/handle_config_h.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/unifdef_tool/help_scripts/handle_configure_ac.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/unifdef_tool/help_scripts/split_config_h.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/validate_backport_patches.sh create mode 100755 src/mlnx-ofa_kernel-5.8/ofed_scripts/vf-net-link-name.sh create mode 100755 src/mlnx-ofa_kernel-5.8/scripts/admin_rdma.sh create mode 100644 src/mlnx-ofa_kernel-5.8/scripts/keep_files create mode 100755 src/mlnx-ofa_kernel-5.8/scripts/refreshBackports.sh create mode 100755 src/mlnx-ofa_kernel-5.8/scripts/setlocalversion diff --git a/licenses/BSD 3-Clause b/licenses/BSD 3-Clause new file mode 100644 index 0000000..7ba4209 --- /dev/null +++ b/licenses/BSD 3-Clause @@ -0,0 +1,13 @@ +BSD 3-Clause License + +Copyright (c) 2023, Full name + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1.Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2.Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3.Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/licenses/GPL-2.0 b/licenses/GPL-2.0 new file mode 100644 index 0000000..ff0812f --- /dev/null +++ b/licenses/GPL-2.0 @@ -0,0 +1,359 @@ +Valid-License-Identifier: GPL-2.0 +Valid-License-Identifier: GPL-2.0-only +Valid-License-Identifier: GPL-2.0+ +Valid-License-Identifier: GPL-2.0-or-later +SPDX-URL: https://spdx.org/licenses/GPL-2.0.html +Usage-Guide: + To use this license in source code, put one of the following SPDX + tag/value pairs into a comment according to the placement + guidelines in the licensing rules documentation. + For 'GNU General Public License (GPL) version 2 only' use: + SPDX-License-Identifier: GPL-2.0 + or + SPDX-License-Identifier: GPL-2.0-only + For 'GNU General Public License (GPL) version 2 or any later version' use: + SPDX-License-Identifier: GPL-2.0+ + or + SPDX-License-Identifier: GPL-2.0-or-later +License-Text: + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/mlnx-ofa_kernel.spec b/mlnx-ofa_kernel.spec new file mode 100644 index 0000000..e3c24c4 --- /dev/null +++ b/mlnx-ofa_kernel.spec @@ -0,0 +1,723 @@ +# +# Copyright (c) 2012 Mellanox Technologies. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. 
+# +# + +# KMP is disabled by default +%{!?KMP: %global KMP 0} + +%global WITH_SYSTEMD %(if ( test -d "%{_unitdir}" > /dev/null); then echo -n '1'; else echo -n '0'; fi) + +%{!?configure_options: %global configure_options --with-core-mod --with-user_mad-mod --with-user_access-mod --with-addr_trans-mod --with-mlx5-mod --with-mlxfw-mod --with-ipoib-mod} + +%global MEMTRACK %(if ( echo %{configure_options} | grep "with-memtrack" > /dev/null ); then echo -n '1'; else echo -n '0'; fi) +%global MADEYE %(if ( echo %{configure_options} | grep "with-madeye-mod" > /dev/null ); then echo -n '1'; else echo -n '0'; fi) + +%global WINDRIVER %(if (grep -qiE "Wind River" /etc/issue /etc/*release* 2>/dev/null); then echo -n '1'; else echo -n '0'; fi) +%global POWERKVM %(if (grep -qiE "powerkvm" /etc/issue /etc/*release* 2>/dev/null); then echo -n '1'; else echo -n '0'; fi) +%global BLUENIX %(if (grep -qiE "Bluenix" /etc/issue /etc/*release* 2>/dev/null); then echo -n '1'; else echo -n '0'; fi) +%global XENSERVER65 %(if (grep -qiE "XenServer.*6\.5" /etc/issue /etc/*release* 2>/dev/null); then echo -n '1'; else echo -n '0'; fi) + +%global IS_RHEL_VENDOR "%{_vendor}" == "redhat" || ("%{_vendor}" == "bclinux") || ("%{_vendor}" == "openEuler") +%global KMOD_PREAMBLE "%{_vendor}" != "openEuler" + +# MarinerOS 1.0 sets -fPIE in the hardening cflags +# (in the gcc specs file). +# This seems to break only this package and not other kernel packages. +%if "%{_vendor}" == "mariner" +%global _hardened_cflags %{nil} +%endif + +%{!?KVERSION: %global KVERSION %(uname -r)} +%global kernel_version %{KVERSION} +%global krelver %(echo -n %{KVERSION} | sed -e 's/-/_/g') +# take path to kernel sources if provided, otherwise look in default location (for non KMP rpms). +%{!?K_SRC: %global K_SRC /lib/modules/%{KVERSION}/build} + +# Select packages to build + +# Kernel module packages to be included into kernel-ib +%global build_ipoib %(if ( echo %{configure_options} | grep "with-ipoib-mod" > /dev/null ); then echo -n '1'; else echo -n '0'; fi) +%global build_oiscsi %(if ( echo %{configure_options} | grep "with-iscsi-mod" > /dev/null ); then echo -n '1'; else echo -n '0'; fi) +%global build_mlx5 %(if ( echo %{configure_options} | grep "with-mlx5-mod" > /dev/null ); then echo -n '1'; else echo -n '0'; fi) + +%{!?LIB_MOD_DIR: %global LIB_MOD_DIR /lib/modules/%{KVERSION}/updates} + +%{!?IB_CONF_DIR: %global IB_CONF_DIR /etc/infiniband} + +%{!?KERNEL_SOURCES: %global KERNEL_SOURCES /lib/modules/%{KVERSION}/source} + +%{!?_name: %global _name mlnx-ofa_kernel} +%{!?_version: %global _version 5.8} +%{!?_release: %global _release OFED.5.8.1.1.2.1} +%global _kmp_rel %{_release}%{?_kmp_build_num}%{?_dist} + +%global utils_pname %{_name} +%global devel_pname %{_name}-devel +%global non_kmp_pname %{_name}-modules + +Summary: Infiniband HCA Driver +Name: %{_name} +Version: %{_version} +Release: %{_release}%{?_dist} +License: GPLv2 +Url: http://www.mellanox.com/ +Group: System Environment/Base +Source: %{_name}-%{_version}.tgz +BuildRoot: %{?build_root:%{build_root}}%{!?build_root:/var/tmp/OFED} +Vendor: Mellanox Technologies +Obsoletes: kernel-ib +Obsoletes: mlnx-en +Obsoletes: mlnx_en +Obsoletes: mlnx-en-utils +Obsoletes: kmod-mlnx-en +Obsoletes: mlnx-en-kmp-default +Obsoletes: mlnx-en-kmp-xen +Obsoletes: mlnx-en-kmp-trace +Obsoletes: mlnx-en-doc +Obsoletes: mlnx-en-debuginfo +Obsoletes: mlnx-en-sources +Requires: mlnx-tools >= 5.2.0 +Requires: coreutils +Requires: pciutils +Requires: grep +Requires: procps +Requires: module-init-tools 
+Requires: lsof +%if "%{KMP}" == "1" +BuildRequires: %kernel_module_package_buildreqs +BuildRequires: /usr/bin/perl +%endif +%description +InfiniBand "verbs", Access Layer and ULPs. +Utilities rpm. +The driver sources are located at: http://www.mellanox.com/downloads/ofed/mlnx-ofa_kernel-5.8-1.1.2.tgz + + +# build KMP rpms? +%if "%{KMP}" == "1" +%global kernel_release() $(make -s -C %{1} kernelrelease M=$PWD) +# prep file list for kmp rpm +%(cat > %{_builddir}/kmp.files << EOF +%defattr(644,root,root,755) +/lib/modules/%2-%1 +%if %{IS_RHEL_VENDOR} +%config(noreplace) %{_sysconfdir}/depmod.d/zz01-%{_name}-*.conf +%endif +EOF) +%(echo "Obsoletes: kmod-mlnx-rdma-rxe, mlnx-rdma-rxe-kmp" >> %{_builddir}/preamble) +%if %KMOD_PREAMBLE +%kernel_module_package -f %{_builddir}/kmp.files -r %{_kmp_rel} -p %{_builddir}/preamble +%else +%kernel_module_package -f %{_builddir}/kmp.files -r %{_kmp_rel} +%endif +%else # not KMP +%global kernel_source() %{K_SRC} +%global kernel_release() %{KVERSION} +%global flavors_to_build default +%package -n %{non_kmp_pname} +Obsoletes: kernel-ib +Obsoletes: mlnx-en +Obsoletes: mlnx_en +Obsoletes: mlnx-en-utils +Obsoletes: kmod-mlnx-en +Obsoletes: mlnx-en-kmp-default +Obsoletes: mlnx-en-kmp-xen +Obsoletes: mlnx-en-kmp-trace +Obsoletes: mlnx-en-doc +Obsoletes: mlnx-en-debuginfo +Obsoletes: mlnx-en-sources +Obsoletes: mlnx-rdma-rxe +Version: %{_version} +Release: %{_release}.kver.%{krelver} +Summary: Infiniband Driver and ULPs kernel modules +Group: System Environment/Libraries +%description -n %{non_kmp_pname} +Core, HW and ULPs kernel modules +Non-KMP format kernel modules rpm. +The driver sources are located at: http://www.mellanox.com/downloads/ofed/mlnx-ofa_kernel-5.8-1.1.2.tgz +%endif #end if "%{KMP}" == "1" + +%package -n %{devel_pname} +Version: %{_version} +# build KMP rpms? +%if "%{KMP}" == "1" +Release: %{_release}%{?_dist} +%else +Release: %{_release}.kver.%{krelver} +%endif +Obsoletes: kernel-ib-devel +Obsoletes: kernel-ib +Obsoletes: mlnx-en +Obsoletes: mlnx_en +Obsoletes: mlnx-en-utils +Obsoletes: kmod-mlnx-en +Obsoletes: mlnx-en-kmp-default +Obsoletes: mlnx-en-kmp-xen +Obsoletes: mlnx-en-kmp-trace +Obsoletes: mlnx-en-doc +Obsoletes: mlnx-en-debuginfo +Obsoletes: mlnx-en-sources +Requires: coreutils +Requires: pciutils +Requires(post): %{_sbindir}/update-alternatives +Requires(postun): %{_sbindir}/update-alternatives +Summary: Infiniband Driver and ULPs kernel modules sources +Group: System Environment/Libraries +%description -n %{devel_pname} +Core, HW and ULPs kernel modules sources +The driver sources are located at: http://www.mellanox.com/downloads/ofed/mlnx-ofa_kernel-5.8-1.1.2.tgz + +%package source +Summary: Source of the MLNX_OFED main kernel driver +Group: System Environment/Libraries +%description source +Source of the mlnx-ofa_kernel modules. + +You should probably only install this package if you want to view the +sourecs of driver. Use the -devel package if you want to build other +drivers against it. + +# +# setup module sign scripts if paths to the keys are given +# +%global WITH_MOD_SIGN %(if ( test -f "$MODULE_SIGN_PRIV_KEY" && test -f "$MODULE_SIGN_PUB_KEY" ); \ + then \ + echo -n '1'; \ + else \ + echo -n '0'; fi) + +%if "%{WITH_MOD_SIGN}" == "1" +# call module sign script +%global __modsign_install_post \ + %{_builddir}/$NAME-$VERSION/source/ofed_scripts/tools/sign-modules %{buildroot}/lib/modules/ %{kernel_source default} || exit 1 \ +%{nil} + +%global __debug_package 1 +%global buildsubdir %{_name}-%{version} +# Disgusting hack alert! 
We need to ensure we sign modules *after* all +# invocations of strip occur, which is in __debug_install_post if +# find-debuginfo.sh runs, and __os_install_post if not. +# +%global __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + %{__modsign_install_post} \ +%{nil} + +%endif # end of setup module sign scripts +# +%if "%{_vendor}" == "suse" +%debug_package +%endif + +%if %{IS_RHEL_VENDOR} +%global __find_requires %{nil} +%endif + +# set modules dir +%if %{IS_RHEL_VENDOR} +%if 0%{?fedora} +%global install_mod_dir updates +%else +%global install_mod_dir extra/%{_name} +%endif +%endif + +%if "%{_vendor}" == "suse" +%global install_mod_dir updates +%endif + +%{!?install_mod_dir: %global install_mod_dir updates} + +%prep +%setup -n %{_name}-%{_version} +set -- * +mkdir source +mv "$@" source/ +mkdir obj + +%build +export EXTRA_CFLAGS='-DVERSION=\"%version\"' +export INSTALL_MOD_DIR=%{install_mod_dir} +export CONF_OPTIONS="%{configure_options}" +for flavor in %flavors_to_build; do + export KSRC=%{kernel_source $flavor} + export KVERSION=%{kernel_release $KSRC} + export LIB_MOD_DIR=/lib/modules/$KVERSION/$INSTALL_MOD_DIR + rm -rf obj/$flavor + cp -a source obj/$flavor + cd $PWD/obj/$flavor + find compat -type f -exec touch -t 200012201010 '{}' \; || true + ./configure --build-dummy-mods --prefix=%{_prefix} --kernel-version $KVERSION --kernel-sources $KSRC --modules-dir $LIB_MOD_DIR $CONF_OPTIONS %{?_smp_mflags} + make %{?_smp_mflags} kernel + make build_py_scripts + cd - +done + +%install +export RECORD_PY_FILES=1 +export INSTALL_MOD_PATH=%{buildroot} +export INSTALL_MOD_DIR=%{install_mod_dir} +export NAME=%{name} +export VERSION=%{version} +export PREFIX=%{_prefix} +for flavor in %flavors_to_build; do + export KSRC=%{kernel_source $flavor} + export KVERSION=%{kernel_release $KSRC} + cd $PWD/obj/$flavor + make install_modules KERNELRELEASE=$KVERSION + # install script and configuration files + make install_scripts + mkdir -p %{_builddir}/src/$NAME/$flavor + cp -ar include/ %{_builddir}/src/$NAME/$flavor + cp -ar config* %{_builddir}/src/$NAME/$flavor + cp -ar compat* %{_builddir}/src/$NAME/$flavor + cp -ar ofed_scripts %{_builddir}/src/$NAME/$flavor + + modsyms=`find . -name Module.symvers -o -name Modules.symvers` + if [ -n "$modsyms" ]; then + for modsym in $modsyms + do + cat $modsym >> %{_builddir}/src/$NAME/$flavor/Module.symvers + done + else + ./ofed_scripts/create_Module.symvers.sh + cp ./Module.symvers %{_builddir}/src/$NAME/$flavor/Module.symvers + fi + # Cleanup unnecessary kernel-generated module dependency files. + find $INSTALL_MOD_PATH/lib/modules -iname 'modules.*' -exec rm {} \; + cd - +done + +# Set the module(s) to be executable, so that they will be stripped when packaged. +find %{buildroot} \( -type f -name '*.ko' -o -name '*ko.gz' \) -exec %{__chmod} u+x \{\} \; + +%if %{IS_RHEL_VENDOR} +%if ! 
0%{?fedora} +%{__install} -d %{buildroot}%{_sysconfdir}/depmod.d/ +for module in `find %{buildroot}/ -name '*.ko' -o -name '*.ko.gz' | sort` +do +ko_name=${module##*/} +mod_name=${ko_name/.ko*/} +mod_path=${module/*%{_name}} +mod_path=${mod_path/\/${ko_name}} +echo "override ${mod_name} * weak-updates/%{_name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz01-%{_name}-${mod_name}.conf +echo "override ${mod_name} * extra/%{_name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz01-%{_name}-${mod_name}.conf +done +%endif +%endif + +# copy sources +mkdir -p %{buildroot}/%{_prefix}/src/ofa_kernel-%{version} +mkdir -p %{buildroot}/%{_prefix}/src/ofa_kernel/%{_arch} +cp -a %{_builddir}/%{name}-%{version}/source %{buildroot}/%{_prefix}/src/ofa_kernel-%{version}/source +ln -s ofa_kernel-%{version}/source %{buildroot}/%{_prefix}/src/mlnx-ofa_kernel-%{version} +cp -a %{_builddir}/src/%{name}/* %{buildroot}/%{_prefix}/src/ofa_kernel/%{_arch}/%{KVERSION} +# Fix path of BACKPORT_INCLUDES +sed -i -e "s@=-I.*backport_includes@=-I/usr/src/ofa_kernel-$VERSION/backport_includes@" %{buildroot}/%{_prefix}/src/ofa_kernel/%{_arch}/%{KVERSION}/configure.mk.kernel || true +rm -rf %{_builddir}/src + +INFO=${RPM_BUILD_ROOT}/etc/infiniband/info +/bin/rm -f ${INFO} +mkdir -p ${RPM_BUILD_ROOT}/etc/infiniband +touch ${INFO} + +cat >> ${INFO} << EOFINFO +#!/bin/bash + +echo prefix=%{_prefix} +echo Kernel=%{KVERSION} +echo +echo "Configure options: %{configure_options}" +echo +EOFINFO + +chmod +x ${INFO} > /dev/null 2>&1 + +%if "%{WITH_SYSTEMD}" == "1" +install -d %{buildroot}%{_unitdir} +install -d %{buildroot}/etc/systemd/system +install -m 0644 %{_builddir}/$NAME-$VERSION/source/ofed_scripts/openibd.service %{buildroot}%{_unitdir} +install -m 0644 %{_builddir}/$NAME-$VERSION/source/ofed_scripts/mlnx_interface_mgr\@.service %{buildroot}/etc/systemd/system +%endif + +install -d %{buildroot}/bin +install -m 0755 %{_builddir}/$NAME-$VERSION/source/ofed_scripts/mlnx_conf_mgr.sh %{buildroot}/bin/ +%if "%{WINDRIVER}" == "0" && "%{BLUENIX}" == "0" +install -m 0755 %{_builddir}/$NAME-$VERSION/source/ofed_scripts/mlnx_interface_mgr.sh %{buildroot}/bin/ +%else +# Wind River and Mellanox Bluenix are rpm based, however, interfaces management is done in Debian style +install -d %{buildroot}/usr/sbin +install -m 0755 %{_builddir}/$NAME-$VERSION/source/ofed_scripts/mlnx_interface_mgr_deb.sh %{buildroot}/bin/mlnx_interface_mgr.sh +install -m 0755 %{_builddir}/$NAME-$VERSION/source/ofed_scripts/net-interfaces %{buildroot}/usr/sbin +%endif + +# Install ibroute utilities +# TBD: move these utilities into standalone package +install -d %{buildroot}%{_sbindir} + +# update /etc/init.d/openibd header +is_euler=`grep 'NAME=".*Euler' /etc/os-release 2>/dev/null || :` +if [[ -f /etc/redhat-release || -f /etc/rocks-release || "$is_euler" != '' ]]; then +perl -i -ne 'if (m@^#!/bin/bash@) { + print q@#!/bin/bash +# +# Bring up/down openib +# +# chkconfig: 2345 05 95 +# description: Activates/Deactivates InfiniBand Driver to \ +# start at boot time. 
+# +### BEGIN INIT INFO +# Provides: openibd +### END INIT INFO +@; + } else { + print; + }' %{buildroot}/etc/init.d/openibd +fi + +if [ -f /etc/SuSE-release ] || grep -qwi SLES /etc/os-release 2>/dev/null; then + local_fs='$local_fs' + openiscsi='' + %if %{build_oiscsi} + openiscsi='open-iscsi' + %endif + perl -i -ne "if (m@^#!/bin/bash@) { + print q@#!/bin/bash +### BEGIN INIT INFO +# Provides: openibd +# Required-Start: $local_fs +# Required-Stop: opensmd $openiscsi +# Default-Start: 2 3 5 +# Default-Stop: 0 1 2 6 +# Description: Activates/Deactivates InfiniBand Driver to \ +# start at boot time. +### END INIT INFO +@; + } else { + print; + }" %{buildroot}/etc/init.d/openibd +fi + +%if %{build_ipoib} +case $(uname -m) in + i[3-6]86) + # Decrease send/receive queue sizes on 32-bit arcitecture + echo "options ib_ipoib send_queue_size=64 recv_queue_size=128" >> %{buildroot}/etc/modprobe.d/ib_ipoib.conf + ;; +esac +%endif + +%clean +rm -rf %{buildroot} + + +%if "%{KMP}" != "1" +%post -n %{non_kmp_pname} +/sbin/depmod %{KVERSION} +# W/A for OEL6.7/7.x inbox modules get locked in memory +# in dmesg we get: Module mlx4_core locked in memory until next boot +if (grep -qiE "Oracle.*(6.([7-9]|10)| 7)" /etc/issue /etc/*release* 2>/dev/null); then + /sbin/dracut --force +fi + +%postun -n %{non_kmp_pname} +if [ $1 = 0 ]; then # 1 : Erase, not upgrade + /sbin/depmod %{KVERSION} + # W/A for OEL6.7/7.x inbox modules get locked in memory + # in dmesg we get: Module mlx4_core locked in memory until next boot + if (grep -qiE "Oracle.*(6.([7-9]|10)| 7)" /etc/issue /etc/*release* 2>/dev/null); then + /sbin/dracut --force + fi +fi +%endif # end KMP=1 + +%post -n %{utils_pname} +if [ $1 -eq 1 ]; then # 1 : This package is being installed +############################################################################################################# +is_euler=`grep 'NAME=".*Euler' /etc/os-release 2>/dev/null || :` +is_kylin=`grep 'NAME=".*Kylin' /etc/os-release 2>/dev/null || :` +if [[ -f /etc/redhat-release || -f /etc/rocks-release || -f /etc/UnionTech-release || -f /etc/ctyunos-release || "$is_euler" != '' || "$is_kylin" != '' ]]; then + /sbin/chkconfig openibd off >/dev/null 2>&1 || true + /usr/bin/systemctl disable openibd >/dev/null 2>&1 || true + /sbin/chkconfig --del openibd >/dev/null 2>&1 || true + +%if "%{WITH_SYSTEMD}" != "1" + /sbin/chkconfig --add openibd >/dev/null 2>&1 || true + /sbin/chkconfig openibd on >/dev/null 2>&1 || true +%else + /usr/bin/systemctl enable openibd >/dev/null 2>&1 || true +%endif +fi + +if [ -f /etc/SuSE-release ] || grep -qwi SLES /etc/os-release 2>/dev/null; then + /sbin/chkconfig openibd off >/dev/null 2>&1 || true + /usr/bin/systemctl disable openibd >/dev/null 2>&1 || true + /sbin/insserv -r openibd >/dev/null 2>&1 || true + +%if "%{WITH_SYSTEMD}" != "1" + /sbin/insserv openibd >/dev/null 2>&1 || true + /sbin/chkconfig openibd on >/dev/null 2>&1 || true +%else + /usr/bin/systemctl enable openibd >/dev/null 2>&1 || true +%endif +fi + +%if "%{WINDRIVER}" == "1" || "%{BLUENIX}" == "1" +/usr/sbin/update-rc.d openibd defaults || true +%endif + +%if "%{POWERKVM}" == "1" +/usr/bin/systemctl disable openibd >/dev/null 2>&1 || true +/usr/bin/systemctl enable openibd >/dev/null 2>&1 || true +%endif + +%if "%{WITH_SYSTEMD}" == "1" +/usr/bin/systemctl daemon-reload >/dev/null 2>&1 || : +cat /proc/sys/kernel/random/boot_id 2>/dev/null | sed -e 's/-//g' > /var/run/openibd.bootid || true +test -s /var/run/openibd.bootid || echo manual > /var/run/openibd.bootid || true +%endif + +# 
Comment core modules loading hack +if [ -e /etc/modprobe.conf.dist ]; then + sed -i -r -e 's/^(\s*install ib_core.*)/#MLX# \1/' /etc/modprobe.conf.dist + sed -i -r -e 's/^(\s*alias ib.*)/#MLX# \1/' /etc/modprobe.conf.dist +fi + +%if %{build_ipoib} +if [ -e /etc/modprobe.d/ipv6 ]; then + sed -i -r -e 's/^(\s*install ipv6.*)/#MLX# \1/' /etc/modprobe.d/ipv6 +fi +%endif + +# Update limits.conf (but not for Containers) +if [ ! -e "/.dockerenv" ] && ! (grep -q docker /proc/self/cgroup 2>/dev/null); then + if [ -e /etc/security/limits.conf ]; then + LIMITS_UPDATED=0 + if ! (grep -qE "soft.*memlock" /etc/security/limits.conf 2>/dev/null); then + echo "* soft memlock unlimited" >> /etc/security/limits.conf + LIMITS_UPDATED=1 + fi + if ! (grep -qE "hard.*memlock" /etc/security/limits.conf 2>/dev/null); then + echo "* hard memlock unlimited" >> /etc/security/limits.conf + LIMITS_UPDATED=1 + fi + if [ $LIMITS_UPDATED -eq 1 ]; then + echo "Configured /etc/security/limits.conf" + fi + fi +fi + +# Make IPoIB interfaces be unmanaged on XenServer +if (grep -qi xenserver /etc/issue /etc/*-release 2>/dev/null); then + IPOIB_PNUM=$(lspci -d 15b3: 2>/dev/null | wc -l 2>/dev/null) + IPOIB_PNUM=$(($IPOIB_PNUM * 2)) + for i in $(seq 1 $IPOIB_PNUM) + do + uuid=$(xe pif-list 2>/dev/null | grep -B2 ib${i} | grep uuid | cut -d : -f 2 | sed -e 's/ //g') + if [ "X${uuid}" != "X" ]; then + xe pif-forget uuid=${uuid} >/dev/null 2>&1 || true + fi + done +fi + +fi # 1 : closed +# END of post + +%preun -n %{utils_pname} +is_euler=`grep 'NAME=".*Euler' /etc/os-release 2>/dev/null || :` +is_kylin=`grep 'NAME=".*Kylin' /etc/os-release 2>/dev/null || :` +if [ $1 = 0 ]; then # 1 : Erase, not upgrade + if [[ -f /etc/redhat-release || -f /etc/rocks-release || -f /etc/UnionTech-release || "$is_euler" != '' || "$is_kylin" != '' ]]; then + /sbin/chkconfig openibd off >/dev/null 2>&1 || true + /usr/bin/systemctl disable openibd >/dev/null 2>&1 || true + /sbin/chkconfig --del openibd >/dev/null 2>&1 || true + fi + if [ -f /etc/SuSE-release ] || grep -qwi SLES /etc/os-release 2>/dev/null; then + /sbin/chkconfig openibd off >/dev/null 2>&1 || true + /usr/bin/systemctl disable openibd >/dev/null 2>&1 || true + /sbin/insserv -r openibd >/dev/null 2>&1 || true + fi + if [ -f /etc/debian_version ]; then + if ! 
( /usr/sbin/update-rc.d openibd remove > /dev/null 2>&1 ); then + true + fi + fi +%if "%{WINDRIVER}" == "1" || "%{BLUENIX}" == "1" +/usr/sbin/update-rc.d -f openibd remove || true +%endif + +%if "%{POWERKVM}" == "1" +/usr/bin/systemctl disable openibd >/dev/null 2>&1 || true +%endif +fi + +%postun -n %{utils_pname} +%if "%{WITH_SYSTEMD}" == "1" +/usr/bin/systemctl daemon-reload >/dev/null 2>&1 || : +%endif + +# Uncomment core modules loading hack +if [ -e /etc/modprobe.conf.dist ]; then + sed -i -r -e 's/^#MLX# (.*)/\1/' /etc/modprobe.conf.dist +fi + +%if %{build_ipoib} +if [ -e /etc/modprobe.d/ipv6 ]; then + sed -i -r -e 's/^#MLX# (.*)/\1/' /etc/modprobe.d/ipv6 +fi +%endif + +#end of post uninstall + +%post -n %{devel_pname} +if [ -d "%{_prefix}/src/ofa_kernel/default" -a $1 -gt 1 ]; then + touch %{_prefix}/src/ofa_kernel/%{_arch}/%{KVERSION}.missing_link + # Will run update-alternatives in posttrans +else + update-alternatives --install \ + %{_prefix}/src/ofa_kernel/default \ + ofa_kernel_headers \ + %{_prefix}/src/ofa_kernel/%{_arch}/%{KVERSION} \ + 20 +fi + +%posttrans -n %{devel_pname} +symlink="%{_prefix}/src/ofa_kernel/default" +# Should only be used for upgrading from pre-5.5-0.2.6.0 packages: +# At the time of upgrade there was still a directory, so postpone +# generating the alternative symlink to that point: +for flag_file in %{_prefix}/src/ofa_kernel/*/*.missing_link; do + dir=${flag_file%.missing_link} + if [ ! -d "$dir" ]; then + # Directory is no longer there. Nothing left to handle + rm -f "$flag_file" + continue + fi + if [ -d "$symlink" ]; then + echo "%{devel_pname}-%{version}: $symlink is still a non-empty directory. Deleting in preparation for a symlink." + rm -rf "$symlink" + fi + update-alternatives --install \ + "$symlink" \ + ofa_kernel_headers \ + "$dir" \ + 20 + rm -f "$flag_file" +done + +%postun -n %{devel_pname} +update-alternatives --remove \ + ofa_kernel_headers \ + %{_prefix}/src/ofa_kernel/%{_arch}/%{KVERSION} \ + +%files -n %{utils_pname} +%defattr(-,root,root,-) +%doc source/ofed_scripts/82-net-setup-link.rules source/ofed_scripts/vf-net-link-name.sh +%if "%{KMP}" == "1" +%if %{IS_RHEL_VENDOR} +%endif # end rh +%endif # end KMP=1 +%dir /etc/infiniband +%config(noreplace) /etc/infiniband/openib.conf +%config(noreplace) /etc/infiniband/mlx5.conf +/etc/infiniband/info +/etc/init.d/openibd +%if "%{WITH_SYSTEMD}" == "1" +%{_unitdir}/openibd.service +/etc/systemd/system/mlnx_interface_mgr@.service +%endif +/lib/udev/sf-rep-netdev-rename +/lib/udev/auxdev-sf-netdev-rename +/usr/sbin/setup_mr_cache.sh +/usr/sbin/odp_stat.sh +%_datadir/mlnx_ofed/mlnx_bf_assign_ct_cores.sh +%config(noreplace) /etc/modprobe.d/mlnx.conf +%config(noreplace) /etc/modprobe.d/mlnx-bf.conf +%{_sbindir}/* +/lib/udev/rules.d/83-mlnx-sf-name.rules +/lib/udev/rules.d/90-ib.rules +/bin/mlnx_interface_mgr.sh +/bin/mlnx_conf_mgr.sh +%if "%{WINDRIVER}" == "1" || "%{BLUENIX}" == "1" +/usr/sbin/net-interfaces +%endif +%if %{build_ipoib} +%config(noreplace) /etc/modprobe.d/ib_ipoib.conf +%endif +%if %{build_mlx5} +%{_sbindir}/ibdev2netdev +%endif + +%if "%{KMP}" != "1" +%files -n %{non_kmp_pname} +/lib/modules/%{KVERSION}/%{install_mod_dir}/ +%if %{IS_RHEL_VENDOR} +%if ! 
0%{?fedora} +%config(noreplace) %{_sysconfdir}/depmod.d/zz01-%{_name}-*.conf +%endif +%endif +%endif + +%files -n %{devel_pname} +%defattr(-,root,root,-) +%{_prefix}/src/ofa_kernel/%{_arch}/%{KVERSION} + +%files source +%defattr(-,root,root,-) +%{_prefix}/src/ofa_kernel-%version/source +%{_prefix}/src/mlnx-ofa_kernel-%version + +%changelog +* Thu Jun 18 2015 Alaa Hleihel +- Renamed kernel-ib package to mlnx-ofa_kernel-modules +* Thu Apr 10 2014 Alaa Hleihel +- Add QoS utils. +* Thu Mar 13 2014 Alaa Hleihel +- Use one spec for KMP and non-KMP OS's. +* Tue Apr 24 2012 Vladimir Sokolovsky +- Remove FC support +* Tue Mar 6 2012 Vladimir Sokolovsky +- Add weak updates support +* Wed Jul 6 2011 Vladimir Sokolovsky +- Add KMP support +* Mon Oct 4 2010 Vladimir Sokolovsky +- Add mlx4_fc and mlx4_vnic support +* Mon May 10 2010 Vladimir Sokolovsky +- Support install macro that removes RPM_BUILD_ROOT +* Thu Feb 4 2010 Vladimir Sokolovsky +- Added ibdev2netdev script +* Mon Sep 8 2008 Vladimir Sokolovsky +- Added nfsrdma support +* Wed Aug 13 2008 Vladimir Sokolovsky +- Added mlx4_en support +* Tue Aug 21 2007 Vladimir Sokolovsky +- Added %build macro +* Sun Jan 28 2007 Vladimir Sokolovsky +- Created spec file for kernel-ib diff --git a/src/mlnx-ofa_kernel-5.8/.gitignore b/src/mlnx-ofa_kernel-5.8/.gitignore new file mode 100644 index 0000000..2e58755 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/.gitignore @@ -0,0 +1,112 @@ +# +# NOTE! Don't add files that are generated in specific +# subdirectories here. Add them in the ".gitignore" file +# in that subdirectory instead. +# +# NOTE! Please use 'git ls-files -i --exclude-standard' +# command after changing this file, to see if there are +# any tracked files which get ignored after the change. +# +# Normal rules +# +.* +*.o +*.o.* +*.a +*.s +*.ko +*.ko.unsigned +*.so +*.so.dbg +*.mod.c +*.i +*.lst +*.symtypes +*.order +modules.builtin +*.elf +*.bin +*.gz +*.bz2 +*.lzma +*.xz +*.lzo +#*.patch +*.gcno +*.mod + + +# +# Top-level generic files +# +/tags +/TAGS +/linux +/vmlinux +/vmlinuz +/System.map +/Module.markers +/Module.symvers +bp2/ + +# +# git files that we don't want to ignore even it they are dot-files +# +!.gitignore +!.mailmap + +# +# Generated include files +# +include/config +include/linux/version.h +include/generated +arch/*/include/generated + +# stgit generated dirs +patches-* + +# quilt's files +patches +series + +# cscope files +cscope.* +ncscope.* + +# gnu global files +GPATH +GRTAGS +GSYMS +GTAGS + +*.orig +*~ +\#*# +configure.mk.kernel +compat.config +include/linux/autoconf.h +include/generated/autoconf.h +include/linux/compat_autoconf.h +backports_applied +openib.conf.tmp +ofed_scripts/utils/build/ + +# compat/autotools stauff +compat/COPYING +compat/Makefile.in +compat/INSTALL +compat/aclocal.m4 +compat/autom4te.cache/ +compat/confdefs.h +compat/config.h.in +compat/config.log +compat/build/Module.symvers +compat/build/output.log +compat/config.h +compat/config.status +compat/stamp-h1 + +compat/Makefile +compat/configure + diff --git a/src/mlnx-ofa_kernel-5.8/COPYING b/src/mlnx-ofa_kernel-5.8/COPYING new file mode 100644 index 0000000..ca442d3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/COPYING @@ -0,0 +1,356 @@ + + NOTE! This copyright does *not* cover user programs that use kernel + services by normal system calls - this is merely considered normal use + of the kernel, and does *not* fall under the heading of "derived work". 
+ Also note that the GPL below is copyrighted by the Free Software + Foundation, but the instance of code that it refers to (the Linux + kernel) is copyrighted by me and others who actually wrote it. + + Also note that the only valid version of the GPL as far as the kernel + is concerned is _this_ particular version of the license (ie v2, not + v2.2 or v3.x or whatever), unless explicitly otherwise stated. + + Linus Torvalds + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. 
+ +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/src/mlnx-ofa_kernel-5.8/Documentation/infiniband/tag_matching.txt b/src/mlnx-ofa_kernel-5.8/Documentation/infiniband/tag_matching.txt new file mode 100644 index 0000000..7debeb6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/Documentation/infiniband/tag_matching.txt @@ -0,0 +1,50 @@ +Hardware tag matching logic + +The MPI standard defines a set of rules, known as tag-matching, for matching +source send operations to destination receives. The following parameters must +match the following source and destination parameters: +* Communicator +* User tag - wild card may be specified by the receiver +* Source rank – wild car may be specified by the receiver +* Destination rank – wild +The ordering rules require that when more than one pair of send and receive +message envelopes may match, the pair that includes the earliest posted-send +and the earliest posted-receive is the pair that must be used to satisfy the +matching operation. However, this doesn’t imply that tags are consumed in +the order they are created, e.g., a later generated tag may be consumed, if +earlier tags can’t be used to satisfy the matching rules. + +When a message is sent from the sender to the receiver, the communication +library may attempt to process the operation either after or before the +corresponding matching receive is posted. If a matching receive is posted, +this is an expected message, otherwise it is called an unexpected message. +Implementations frequently use different matching schemes for these two +different matching instances. + +To keep MPI library memory footprint down, MPI implementations typically use +two different protocols for this purpose: + +1. The Eager protocol- the complete message is sent when the send is +processed by the sender. A completion send is received in the send_cq +notifying that the buffer can be reused. + +2. The Rendezvous Protocol - the sender sends the tag-matching header, +and perhaps a portion of data when first notifying the receiver. When the +corresponding buffer is posted, the responder will use the information from +the header to initiate an RDMA read operation directly to the matching buffer. +A FIN message needs to be received in order for the buffer to be reused. + +Tag matching implementation + +There are two types of matching objects used, the posted receive list and the +unexpected message list. The application posts receive buffers through calls +to the MPI receive routines in the posted receive list and posts send messages +using the MPI send routines. 
The head of the posted receive list may be +maintained by the hardware, with the software expected to shadow this list. + +When send is initiated and arrives at the receive side, if there is no +pre-posted receive for this arriving message, it is passed to the software and +placed in the unexpected message list. Otherwise the match is processed, +including rendezvous processing, if appropriate, delivering the data to the +specified receive buffer. This allows overlapping receive-side MPI tag +matching with computation. diff --git a/src/mlnx-ofa_kernel-5.8/Documentation/release_notes-storage.txt b/src/mlnx-ofa_kernel-5.8/Documentation/release_notes-storage.txt new file mode 100644 index 0000000..1536d40 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/Documentation/release_notes-storage.txt @@ -0,0 +1,170 @@ + Mellanox Technologies + +=============================================================================== + Mellanox Storage drivers for Linux + Versions: + iSER-1.6 + SRP-1.5 + Last Modified on Mar, 2015 +=============================================================================== + +=============================================================================== +Table of Contents +=============================================================================== +1. Overview +2. Changes in This Release +3. Supported Platforms and Operating Systems +4. Supported HCAs +5. Resolved Issues +6. Known Issues + +=============================================================================== +1. Overview +=============================================================================== +These are the release notes of Mellanox OFED Storage drivers for Linux + +=============================================================================== +2. Changes in This Release +=============================================================================== +iSER: + - DIF GA + - Connection management refactor + - Bug fixes + - IO path enhancements + +SRP: + - FRWR support + - Merge with upstream + - Bug fixes + +=============================================================================== +3. Supported Platforms and Operating Systems +=============================================================================== +Supported platform is x86_64. +Requires MLNX-OFED-2.4-1.0.0 + +Full Cycle: + RHEL7.0: iSER RoCE, iSER IB, SRP + RHEL6.4: iSER IB, iSER RoCE + RHEL6.5: iSER IB, iSER RoCE, SRP + RHEL6.6: iSER IB, iSER RoCE, SRP + OEL6.6 (UEK kernel): iSER IB, iSER RoCE + Sles12: iSER IB, iSER RoCE, SRP + Ubuntu14.04: iSER IB, iSER RoCE + +Partial Cycle: + RHEL7.0 INBOX*: iSER RoCE, iSER IB, SRP + RHEL6.3: iSER IB, iSER RoCE, SRP + RHEL6.4: SRP + OEL6.4 (UEK kernel): iSER IB, SRP + OEL6.5 (UEK kernel): iSER IB, iSER RoCE, SRP + OEL6.6 (UEK kernel): SRP + Sles11SP2: iSER IB, iSER RoCE, SRP + Sles11SP3: iSER IB, iSER RoCE, SRP + Ubuntu12.04.4 (kernel 3.11): iSER IB, iSER RoCE, SRP + Ubuntu14.04: SRP + FC19: iSER IB, iSER RoCE, SRP + FC21: iSER IB, iSER RoCE, SRP + + +* INBOX means the distro RDMA stack, without the need to install MLNX_OFED. + +=============================================================================== +4. Supported HCAs +=============================================================================== +* Connect-IB FW version 10.10.3000 and above. +* ConnectX-3 (+Pro) FW version 2.31.5050 and above. + +For official firmware versions please see: +http://www.mellanox.com/content/pages.php?pg=firmware_download + +=============================================================================== +5. 
Resolved Issues +=============================================================================== +iSER: + - SM LID reassign during traffic on OEL6.4 uek kernel with virtual function + generates soft lockup trace. (#435775) + + - Unloading iser module during connection establishment sequence may cause + unexpected behavior. (#439838) + + - RoCE: port toggling may cause traffic disruption on the un-toggled port. + (#440464) + + - RoCE: unloading mlx4_en while open iser sessions exist may cause resources + not to be freed until unloading mlx4_ib as well. (#440458) + + - Multi-target login-logout loop may result in failure (system hang) + (#441104) + + - Sles12 inbox: PI module parameters are missing (PI not enabled) (#441106) + +=============================================================================== +6. Known Issues +=============================================================================== + - SLES12: multipathd may segfault + +SRP: + - The driver is tested with Storage target vendors recommendations for + multipath.conf extensions (ZFS, DDN, TMS, Nimbus, NetApp). + + - Unloading ib_srp with many open sessions while ports are down may take long + time (#440471) + + - MLNX_OFED SRP installation breaks ibmvstgt and ibmvscsi symbol resolution + in RHEL7.0 (#517635) + + - SRP interop known issues: + + * DDN Storage Fusion 10000 target + - DDN does not accept non-default P_Key connection establishment. + (#489952) + + * Oracle Sun ZFS storage 7420 + - ZFS does not accept non-default P_Key connection establishment. + (#489952) + + * Ungraceful power cycle of an initiator connected with Targets DDN, + Nimbus, NetApp, may result in temporary "stale connection" messages + when initiator reconnects. (#517641) + + - SLESL12: Lost path to storage as result of read only file system (#466595) + + - FW timeout during IO and then FW reset will cause ib interfaces to fail + (#510515) + +iSER: + - On SLES, the ib_iser module does not get loaded on boot. (#489944) + workaround: + Add a dummy interface using iscsiadm: + - iscsiadm -m iface -I ib_iser -o new + - iscsiadm -m iface -I ib_iser -o update -n iface.transport_name -v ib_iser + + - Ubuntu12 requires updating the user space open-iscsi to v2.0.873 (#489947) + + - Initiator does not respect interface parameter while logging in. (#489945) + workaround: + config each interface on different subnet. + + - ZFS appliance: connection establishment occurs twice which may cause iSER + to log a stack trace. (#489940) + + - iscsid may hang if target crashes during logout sequence. + reproducible with TCP. (#489942) + + - iscsid v2.0.873 can enter an endless loop on bind error. (#489941) + fix pending: + https://groups.google.com/forum/#!searchin/open-iscsi/Fix$20infinite$20loop$20when$20conn$20/open-iscsi/zgLXgf28LVE/e3qkz8ZzwHMJ + + - SLES12: login with PI disabled, followed by a logout and re-login with PI + enabled without flushing multipath might cause the block layer to panic. + (#440756) + + - Rarely In IB device catastrophic error scenario iscsi/iser initator might + not fully recover and result in a hang. 
(#489943) + + - Ubuntu14.04: Stress login/logout might cause block layer to invoke a WARN + trace (#453232) + + - Initiator can't recover a session after FW internal error on target while + there is an open session (#501232) diff --git a/src/mlnx-ofa_kernel-5.8/LINUX_BASE_BRANCH b/src/mlnx-ofa_kernel-5.8/LINUX_BASE_BRANCH new file mode 100644 index 0000000..8f70812 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/LINUX_BASE_BRANCH @@ -0,0 +1 @@ +754e0b0e35608ed5206d6a67a791563c631cec07 diff --git a/src/mlnx-ofa_kernel-5.8/Makefile b/src/mlnx-ofa_kernel-5.8/Makefile new file mode 120000 index 0000000..bebfaf5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/Makefile @@ -0,0 +1 @@ +ofed_scripts/Makefile \ No newline at end of file diff --git a/src/mlnx-ofa_kernel-5.8/Module.supported b/src/mlnx-ofa_kernel-5.8/Module.supported new file mode 100644 index 0000000..76eafb6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/Module.supported @@ -0,0 +1,74 @@ +xprtrdma.ko external +svcrdma.ko external +rds_rdma.ko external +9pnet_rdma.ko external +mlx5_core.ko external +mlx4_en.ko external +mlx4_core.ko external +mlx4_vnic.ko external +ib_mad.ko external +iw_cm.ko external +rdma_cm.ko external +ib_uverbs.ko external +ib_cm.ko external +ib_core.ko external +ib_umad.ko external +ib_addr.ko external +rdma_ucm.ko external +ib_ucm.ko external +ib_sa.ko external +iw_cxgb4.ko external +ib_mthca.ko external +mlx5_ib.ko external +mlx4_ib.ko external +iw_c2.ko external +ib_qib.ko external +iw_cxgb3.ko external +iw_nes.ko external +ib_ipath.ko external +ib_srp.ko external +ib_srpt.ko external +ib_sdp.ko external +scsi_transport_srp.ko external +ib_iser.ko external +ib_isert.ko external +ib_ipoib.ko external +mlx_compat.ko external +memtrack.ko external +ib_ehca.ko external +ocrdma.ko external +ib_netlink.ko external +hfi1.ko external +hns-roce.ko external +i40iw.ko external +qedr.ko external +usnic_verbs.ko external +rdmavt.ko external +rdma_rxe.ko external +nvme-core.ko external +nvme.ko external +nvme-fabrics.ko external +nvme-rdma.ko external +nvme-fc.ko external +nvmet.ko external +nvme-loop.ko external +nvmet-rdma.ko external +nvmet-fc.ko external +nvme-fcloop.ko external +rpcrdma.ko external +xscore.ko external +xsvnic.ko external +xve.ko external +xsvhba.ko external +bnxt_re.ko external +vmw_pvrdma.ko external +smc.ko external +smc_diag.ko external +lpfc.ko external +opa_vnic.ko external +mlxfw.ko external +mlx5_fpga_tools.ko external +act_vlan.ko external +act_tunnel_key.ko external +cls_flower.ko external +eth_ipoib.ko external diff --git a/src/mlnx-ofa_kernel-5.8/README b/src/mlnx-ofa_kernel-5.8/README new file mode 100644 index 0000000..1c297cd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/README @@ -0,0 +1,159 @@ +Content: +1. Overview +2. Directory structure +3. HOWTO add a new feature patch(es) +4. HOWTO add backport patches +5. Check compilation +6. Submitting to Gerrit + + +Overview: +--------- +The general idea is based on compat-wireless project https://lkml.org/lkml/2011/9/9/327 +Compat Wiki: +https://backports.wiki.kernel.org/index.php/Main_Page +https://backports.wiki.kernel.org/index.php/Documentation + +Also it utilizes autotools. 
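+
+To give a rough idea of what this compat layer produces (an illustrative
+sketch only, not a file from this tree): configure/autotools probe the
+target kernel and emit HAVE_* macros, and backported code then selects
+between the new and the old kernel API at compile time. The macro and
+helper names below are examples, not definitions taken from
+compat/config/rdma.m4; see the real cma.c example in the
+"HOWTO add backport patches" section further down.
+
+	#include <linux/netdevice.h>
+	#include <net/route.h>
+
+	/* HAVE_NETDEV_GET_PRIO_TC_MAP is a hypothetical flag that configure
+	 * would define when the running kernel exports netdev_get_prio_tc_map().
+	 */
+	static u8 example_tos_to_sl(struct net_device *ndev, u8 tos)
+	{
+	#ifdef HAVE_NETDEV_GET_PRIO_TC_MAP
+		return netdev_get_prio_tc_map(ndev, rt_tos2priority(tos));
+	#else
+		return tos >> 5;	/* older kernels: derive the SL from the ToS bits directly */
+	#endif
+	}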
+ +Directory structure: +-------------------- +MLNX_OFED-3.x (mlnx_rdma) git tree structure: +Compat files: + compat/ + include/linux/compat*.h + ofed_scripts/gen-compat-autoconf.sh + ofed_scripts/gen-compat-config.sh + include/linux/*.h + +Autotools: + compat/config + compat/autogen.sh + compat/configure.ac + +Feautes (Each feautre should be a separate directory): + features/- + +Backport patches: + backports/ + +Kernel subtree based on upstream kernel: + include/ + drivers/ + net/ + +--------------------- + +Compilation procedure: + +# cd mlnx_rdma +# git checkout master + +# ./configure +E.g.: ./configure --with-core-mod --with-user_mad-mod --with-user_access-mod --with-addr_trans-mod --with-mlx4-mod --with-mlx4_en-mod --with-mlx5-mod --with-ipoib-mod --with-srp-mod --with-rds-mod --with-iser-mod + +Note: If "backports_applied" file exist then backport patches will not be applied. + +# make distclean +# make +# make install + +HOWTO add a new feature/fix: +---------------------------- +Start with updated branch (equal to “masterâ€). + +Add the new code. +Check compilation on the latest kernel base. +# ./configure --without-backport-patches + +Note: Provide path to kernel sources if needed: --kernel-version --kernel-sources + +# make distclean +# make +# make install + +Check your feature. +Commit the changes. +Add backports if required. +See "Check compilation". + +Note: Make sure your commits pass linux/scripts/checkpatch.pl + Backport patches may not pass this check + + +HOWTO add backport patches: +--------------------------- +Start with updated branch (eg: equal to “masterâ€). + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +!!! NOTE: BACKPORT PATCHES MAY CHANGE FILES UNDER FOLLOWING DIRECTORIES !!! +!!! include/rdma/ !!! +!!! include/linux/mlx4/ !!! +!!! include/linux/mlx5/ !!! +!!! drivers/ !!! +!!! net/ !!! +!!! AND SHOULD NOT TOUCH FILES UNDER ofed_scripts, compat and compat headers under include/linux/ !!! +!!! CHANGES TO THE FILES UNDER ofed_scripts, compat and include/linux/compat*.h SHOULD BE COMMITTED DIRECTLY !!! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +$ /bin/rm -f backports_applied + +Run configure (check parameters) +$ ./configure --with-core-mod --with-user_mad-mod --with-user_access-mod --with-addr_trans-mod --with-mlx4-mod --with-mlx4_en-mod --with-mlx5-mod --with-ipoib-mod --with-srp-mod --with-rds-mod --with-iser-mod + +All backports are applied. + +$ make KBUILD_NOCMDDEP=1 2>&1 | tee log + +In case of undefined variable/function refer to: +https://backports.wiki.kernel.org/index.php/Documentation/compat + +If some functionality was backported to some supported Distro need to add CONFIG_COMPAT_... flag (or even better, use autotools +and define flags like HAVE_... see compat/config/rdma.m4): +For example, "netdev_get_prio_tc_map" was added in 2.6.39 but backported to RHEL6.[23] +ifeq ($(RHEL_MAJOR),6) + CFLAGS += -DCONFIG_COMPAT_IS_PRIO_TC_MAP + ... 
+
+diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
+index 0bf5a9d..f0f75b6 100644
+--- a/drivers/infiniband/core/cma.c
++++ b/drivers/infiniband/core/cma.c
+@@ -1844,10 +1844,14 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+ 	route->path_rec->reversible = 1;
+ 	route->path_rec->pkey = cpu_to_be16(0xffff);
+ 	route->path_rec->mtu_selector = IB_SA_EQ;
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)) || defined(CONFIG_COMPAT_IS_PRIO_TC_MAP)
+ 	route->path_rec->sl = netdev_get_prio_tc_map(
+ 			ndev->priv_flags & IFF_802_1Q_VLAN ?
+ 				vlan_dev_real_dev(ndev) : ndev,
+ 			rt_tos2priority(id_priv->tos));
++#else
++	route->path_rec->sl = id_priv->tos >> 5;
++#endif
+
+Create a new backport patch from the diff above:
+1. Commit the change.
+2. Squash/fixup the commit into the original commit that backported the relevant module (unless
+you are backporting a new module).
+3. Get the updated/new patches:
+$ ./ofed_scripts/ofed_get_patches.sh
+Change the branch back to the original (e.g. "master").
+Update/add the new relevant patches under the "backports" directory:
+$ cp -a ./backports_new/0XXX-.patch backports
+$ git add backports
+$ git commit -s -m "BACKPORTS: Added RHEL6.[23] support" backports/
+
+
+Check compilation:
+------------------
+Use your own user and not "root".
+$ cd /mswg/projects/art
+$ git_url= git_branch= ./build mlnx_rdma
+
+
+Submitting to Gerrit:
+---------------------
+Please refer to
+http://wiki.lab.mtl.com/tiki/tiki-index.php?page=Code+Review&structure=OFED-LE#Gerrit
diff --git a/src/mlnx-ofa_kernel-5.8/backports/0001-BACKPORT-block-blk-mq-rdma.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0001-BACKPORT-block-blk-mq-rdma.c.patch
new file mode 100644
index 0000000..8b8bf61
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/backports/0001-BACKPORT-block-blk-mq-rdma.c.patch
@@ -0,0 +1,52 @@
+From: Valentine Fatiev
+Subject: [PATCH] BACKPORT: block/blk-mq-rdma.c
+
+Change-Id: Ic7b3baaebf1cdf4ec438b81dd71cb01bd0e2dc4e
+---
+ block/blk-mq-rdma.c | 26 ++++++++++++++++++++++++++
+ 1 file changed, 26 insertions(+)
+
+--- a/block/blk-mq-rdma.c
++++ b/block/blk-mq-rdma.c
+@@ -1,3 +1,4 @@
++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_MAP
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  * Copyright (c) 2017 Sagi Grimberg.
+@@ -21,6 +22,7 @@
+  * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
+  * vector, we fallback to the naive mapping.
+ */ ++#ifdef HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP + int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, + struct ib_device *dev, int first_vec) + { +@@ -41,4 +43,28 @@ int blk_mq_rdma_map_queues(struct blk_mq + fallback: + return blk_mq_map_queues(map); + } ++#else ++int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, ++ struct ib_device *dev, int first_vec) ++{ ++ const struct cpumask *mask; ++ unsigned int queue, cpu; ++ ++ for (queue = 0; queue < set->nr_hw_queues; queue++) { ++ mask = ib_get_vector_affinity(dev, first_vec + queue); ++ if (!mask) ++ goto fallback; ++ ++ for_each_cpu(cpu, mask) ++ set->map[0].mq_map[cpu] = queue; ++ } ++ ++ return 0; ++ ++fallback: ++ return blk_mq_map_queues(&set->map[0]); ++} ++#endif + EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); ++ ++#endif /* HAVE_BLK_MQ_TAG_SET_HAS_MAP */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0002-BACKPORT-drivers-base-auxiliary.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0002-BACKPORT-drivers-base-auxiliary.c.patch new file mode 100644 index 0000000..3eccb17 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0002-BACKPORT-drivers-base-auxiliary.c.patch @@ -0,0 +1,125 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/base/auxiliary.c + +Change-Id: I33fdc1295e95f46a582590e310a0eb0365e6ddfc +--- + drivers/base/auxiliary.c | 64 ++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 61 insertions(+), 3 deletions(-) + +--- a/drivers/base/auxiliary.c ++++ b/drivers/base/auxiliary.c +@@ -19,7 +19,10 @@ + #include + #include + #include ++#include ++#ifdef CONFIG_COMPAT_AUXILIARY_EXTERNAL_INIT + #include "base.h" ++#endif + + /** + * DOC: PURPOSE +@@ -211,27 +214,53 @@ static int auxiliary_bus_probe(struct de + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + int ret; + ++#ifdef HAVE_DEV_PM_DOMAIN_ATTACH + ret = dev_pm_domain_attach(dev, true); ++ ++ /* In case of old kernels 4.17 and below do nothing in case of ++ * failure of ENODEV */ ++ if (ret == -ENODEV) ++ ret = 0; ++ + if (ret) { + dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); + return ret; + } ++#else ++ acpi_dev_pm_attach(dev, true); ++#endif + + ret = auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); + if (ret) ++#ifdef HAVE_DEV_PM_DOMAIN_ATTACH + dev_pm_domain_detach(dev, true); ++#else ++ acpi_dev_pm_detach(dev, true); ++#endif + + return ret; + } + ++#ifdef HAVE_BUS_TYPE_REMOVE_RETURN_VOID + static void auxiliary_bus_remove(struct device *dev) ++#else ++static int auxiliary_bus_remove(struct device *dev) ++#endif + { + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + + if (auxdrv->remove) + auxdrv->remove(auxdev); ++#ifdef HAVE_DEV_PM_DOMAIN_ATTACH + dev_pm_domain_detach(dev, true); ++#else ++ acpi_dev_pm_detach(dev, true); ++#endif ++ ++#ifndef HAVE_BUS_TYPE_REMOVE_RETURN_VOID ++ return 0; ++#endif + } + + static void auxiliary_bus_shutdown(struct device *dev) +@@ -353,9 +382,17 @@ EXPORT_SYMBOL_GPL(__auxiliary_device_add + * if it does. If the callback returns non-zero, this function will + * return to the caller and not iterate over any more devices. 
+ */ +-struct auxiliary_device *auxiliary_find_device(struct device *start, +- const void *data, +- int (*match)(struct device *dev, const void *data)) ++#if defined(HAVE_LINUX_DEVICE_BUS_H) || defined(HAVE_BUS_FIND_DEVICE_GET_CONST) ++struct auxiliary_device * ++auxiliary_find_device(struct device *start, ++ const void *data, ++ int (*match)(struct device *dev, const void *data)) ++#else ++struct auxiliary_device * ++auxiliary_find_device(struct device *start, ++ void *data, ++ int (*match)(struct device *dev, void *data)) ++#endif /* HAVE_BUS_FIND_DEVICE_GET_CONST || HAVE_LINUX_DEVICE_BUS_H */ + { + struct device *dev; + +@@ -417,7 +454,28 @@ void auxiliary_driver_unregister(struct + } + EXPORT_SYMBOL_GPL(auxiliary_driver_unregister); + ++#ifdef CONFIG_COMPAT_AUXILIARY_EXTERNAL_INIT + void __init auxiliary_bus_init(void) + { + WARN_ON(bus_register(&auxiliary_bus_type)); + } ++#else ++static int __init auxiliary_bus_init(void) ++{ ++ return bus_register(&auxiliary_bus_type); ++} ++ ++static void __exit auxiliary_bus_exit(void) ++{ ++ bus_unregister(&auxiliary_bus_type); ++} ++ ++module_init(auxiliary_bus_init); ++module_exit(auxiliary_bus_exit); ++ ++MODULE_LICENSE("GPL v2"); ++MODULE_DESCRIPTION("Auxiliary Bus"); ++MODULE_INFO(supported, "external"); ++MODULE_AUTHOR("David Ertman "); ++MODULE_AUTHOR("Kiran Patil "); ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0003-BACKPORT-drivers-infiniband-core-addr.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0003-BACKPORT-drivers-infiniband-core-addr.c.patch new file mode 100644 index 0000000..aeb6581 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0003-BACKPORT-drivers-infiniband-core-addr.c.patch @@ -0,0 +1,253 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/addr.c + +Change-Id: Ia1ff9d323d0e66564792cda8bd9154c96f91d3c6 +--- + drivers/infiniband/core/addr.c | 102 +++++++++++++++++++++++++++++---- + 1 file changed, 92 insertions(+), 10 deletions(-) + +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -42,7 +42,9 @@ + #include + #include + #include ++#ifdef HAVE_IPV6_STUBS_H + #include ++#endif + #include + #include + #include +@@ -50,6 +52,8 @@ + #include + #include + #include ++#include ++#include + + #include "core_priv.h" + +@@ -76,9 +80,13 @@ static struct workqueue_struct *addr_wq; + + static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, ++#ifdef HAVE_NLA_POLICY_HAS_VALIDATION_TYPE + .len = sizeof(struct rdma_nla_ls_gid), + .validation_type = NLA_VALIDATE_MIN, + .min = sizeof(struct rdma_nla_ls_gid)}, ++#else ++ .len = sizeof(struct rdma_nla_ls_gid)}, ++#endif + }; + + static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) +@@ -88,9 +96,12 @@ static inline bool ib_nl_is_good_ip_resp + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return false; +- ++#ifdef HAVE_NLA_PARSE_DEPRECATED + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), +- nlmsg_len(nlh), ib_nl_addr_policy, NULL); ++#else ++ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), ++#endif ++ nlmsg_len(nlh), ib_nl_addr_policy, NULL); + if (ret) + return false; + +@@ -131,8 +142,12 @@ static void ib_nl_process_good_ip_rsep(c + } + + int ib_nl_handle_ip_res_resp(struct sk_buff *skb, ++#ifdef HAVE_NETLINK_EXT_ACK + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++ struct nlmsghdr *nlh) ++#endif + { + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk)) +@@ -179,7 +194,8 @@ static int 
ib_nl_ip_send_msg(struct rdma + } + + /* Construct the family header first */ +- header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); ++ header = (struct rdma_ls_ip_resolve_header *) ++ skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header->ifindex = dev_addr->bound_dev_if; + nla_put(skb, attrtype, size, daddr); + +@@ -251,17 +267,26 @@ rdma_find_ndev_for_src_ip_rcu(struct net + + switch (src_in->sa_family) { + case AF_INET: ++#ifdef HAVE___IP_DEV_FIND + dev = __ip_dev_find(net, + ((const struct sockaddr_in *)src_in)->sin_addr.s_addr, + false); + if (dev) + ret = 0; ++#else ++ dev = ip_dev_find(net, ++ ((const struct sockaddr_in *)src_in)->sin_addr.s_addr); ++ if(dev) { ++ dev_put(dev); ++ ret = 0; ++ } ++#endif + break; + #if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + for_each_netdev_rcu(net, dev) { + if (ipv6_chk_addr(net, +- &((const struct sockaddr_in6 *)src_in)->sin6_addr, ++ &((const struct sockaddr_in6 *)src_in)->sin6_addr, + dev, 1)) { + ret = 0; + break; +@@ -347,14 +372,24 @@ static int dst_fetch_ha(const struct dst + return ret; + } + ++#ifdef HAVE_RT_USES_GATEWAY + static bool has_gateway(const struct dst_entry *dst, sa_family_t family) ++#else ++static bool has_gateway(const struct dst_entry *dst, const void *daddr, sa_family_t family) ++#endif + { + struct rtable *rt; + struct rt6_info *rt6; + + if (family == AF_INET) { + rt = container_of(dst, struct rtable, dst); ++#ifdef HAVE_RT_USES_GATEWAY + return rt->rt_uses_gateway; ++#elif defined (HAVE_RT_GW_FAMILY) ++ return rt->rt_gw_family == AF_INET; ++#else ++ return (rt->rt_gateway != *(__be32 *)daddr); ++#endif + } + + rt6 = container_of(dst, struct rt6_info, dst); +@@ -375,8 +410,12 @@ static int fetch_ha(const struct dst_ent + + might_sleep(); + ++#ifndef HAVE_RT_USES_GATEWAY ++ if (seq && has_gateway(dst, daddr, family) && dst->dev->type == ARPHRD_INFINIBAND) ++#else + /* If we have a gateway in IB mode then it must be an IB network */ + if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) ++#endif + return ib_nl_fetch_ha(dev_addr, daddr, seq, family); + else + return dst_fetch_ha(dst, dev_addr, daddr); +@@ -406,6 +445,7 @@ static int addr4_resolve(struct sockaddr + if (ret) + return ret; + ++ + src_in->sin_addr.s_addr = fl4.saddr; + + addr->hoplimit = ip4_dst_hoplimit(&rt->dst); +@@ -423,17 +463,41 @@ static int addr6_resolve(struct sockaddr + struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock; + const struct sockaddr_in6 *dst_in = + (const struct sockaddr_in6 *)dst_sock; +- struct flowi6 fl6; ++ struct flowi6 fl6; + struct dst_entry *dst; ++#ifndef HAVE_IPV6_DST_LOOKUP_FLOW ++ int ret; ++#endif ++ ++#ifdef HAVE_IPV6_MOD_ENABLED ++ if (!ipv6_mod_enabled()) ++ return -EADDRNOTAVAIL; ++#endif + + memset(&fl6, 0, sizeof fl6); + fl6.daddr = dst_in->sin6_addr; + fl6.saddr = src_in->sin6_addr; + fl6.flowi6_oif = addr->bound_dev_if; + +- dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL); +- if (IS_ERR(dst)) +- return PTR_ERR(dst); ++#ifdef HAVE_IPV6_DST_LOOKUP_FLOW ++ dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL); ++ if (IS_ERR(dst)) ++ return PTR_ERR(dst); ++#elif defined(HAVE_IPV6_DST_LOOKUP_TAKES_NET) ++ ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6); ++ if (ret < 0) ++ return ret; ++#else /* HAVE_IPV6_DST_LOOKUP_TAKES_NET */ ++ dst = ip6_route_output(addr->net, NULL, &fl6); ++ if ((ret = dst->error)) ++ return ret; ++ if (ipv6_addr_any(&src_in->sin6_addr)) { ++ ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, ++ &fl6.daddr, 0, 
&fl6.saddr); ++ if (ret) ++ goto put; ++ } ++#endif /* HAVE_IPV6_DST_LOOKUP_FLOW */ + + if (ipv6_addr_any(&src_in->sin6_addr)) + src_in->sin6_addr = fl6.saddr; +@@ -442,8 +506,13 @@ static int addr6_resolve(struct sockaddr + + *pdst = dst; + return 0; ++#if (!defined(HAVE_IPV6_DST_LOOKUP_TAKES_NET) && !defined(HAVE_IPV6_DST_LOOKUP_FLOW)) ++put: ++ dst_release(dst); ++ return ret; ++#endif + } +-#else ++#else /* IS_ENABLED(CONFIG_IPV6) */ + static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, + struct rdma_dev_addr *addr, +@@ -451,7 +520,7 @@ static int addr6_resolve(struct sockaddr + { + return -EADDRNOTAVAIL; + } +-#endif ++#endif /*IS_ENABLED(CONFIG_IPV6) */ + + static int addr_resolve_neigh(const struct dst_entry *dst, + const struct sockaddr *dst_in, +@@ -478,6 +547,15 @@ static int copy_src_l2_addr(struct rdma_ + const struct net_device *ndev) + { + int ret = 0; ++#ifndef HAVE_RT_USES_GATEWAY ++ const struct sockaddr_in *dst_in4 = ++ (const struct sockaddr_in *)dst_in; ++ const struct sockaddr_in6 *dst_in6 = ++ (const struct sockaddr_in6 *)dst_in; ++ const void *daddr = (dst_in->sa_family == AF_INET) ? ++ (const void *)&dst_in4->sin_addr.s_addr : ++ (const void *)&dst_in6->sin6_addr; ++#endif + + if (dst->dev->flags & IFF_LOOPBACK) + ret = rdma_translate_ip(dst_in, dev_addr); +@@ -489,7 +567,11 @@ static int copy_src_l2_addr(struct rdma_ + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ ++#ifdef HAVE_RT_USES_GATEWAY + if (has_gateway(dst, dst_in->sa_family) && ++#else ++ if (has_gateway(dst, daddr, dst_in->sa_family) && ++#endif + ndev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? + RDMA_NETWORK_IPV4 : diff --git a/src/mlnx-ofa_kernel-5.8/backports/0004-BACKPORT-drivers-infiniband-core-cache.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0004-BACKPORT-drivers-infiniband-core-cache.c.patch new file mode 100644 index 0000000..f3c6c7a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0004-BACKPORT-drivers-infiniband-core-cache.c.patch @@ -0,0 +1,72 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cache.c + +Change-Id: I8f608b1a88ee9c8e551139ee6feb4a624a59527e +--- + drivers/infiniband/core/cache.c | 23 +++++++++++++++++++++-- + 1 file changed, 21 insertions(+), 2 deletions(-) + +--- a/drivers/infiniband/core/cache.c ++++ b/drivers/infiniband/core/cache.c +@@ -1401,10 +1401,19 @@ struct net_device *rdma_read_gid_attr_nd + } + EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); + ++#ifdef HAVE_NETDEV_WALK_ALL_LOWER_DEV_RCU + static int get_lower_dev_vlan(struct net_device *lower_dev, +- struct netdev_nested_priv *priv) ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT ++ struct netdev_nested_priv *priv) ++#else ++ void *data) ++#endif + { ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + u16 *vlan_id = (u16 *)priv->data; ++#else ++ u16 *vlan_id = data; ++#endif + + if (is_vlan_dev(lower_dev)) + *vlan_id = vlan_dev_vlan_id(lower_dev); +@@ -1414,6 +1423,7 @@ static int get_lower_dev_vlan(struct net + */ + return 1; + } ++#endif + + /** + * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address +@@ -1430,9 +1440,11 @@ static int get_lower_dev_vlan(struct net + int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, + u16 *vlan_id, u8 *smac) + { ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + struct netdev_nested_priv priv = { + .data = (void *)vlan_id, + }; ++#endif + struct net_device *ndev; + + rcu_read_lock(); +@@ -1448,12 +1460,19 @@ int 
rdma_read_gid_l2_fields(const struct + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + } else { ++#ifdef HAVE_NETDEV_WALK_ALL_LOWER_DEV_RCU + /* If the netdev is upper device and if it's lower + * device is vlan device, consider vlan id of the + * the lower vlan device for this gid entry. + */ + netdev_walk_all_lower_dev_rcu(attr->ndev, +- get_lower_dev_vlan, &priv); ++ get_lower_dev_vlan, ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT ++ &priv); ++#else ++ vlan_id); ++#endif ++#endif + } + } + rcu_read_unlock(); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0005-BACKPORT-drivers-infiniband-core-cgroup.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0005-BACKPORT-drivers-infiniband-core-cgroup.c.patch new file mode 100644 index 0000000..a7b0b64 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0005-BACKPORT-drivers-infiniband-core-cgroup.c.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cgroup.c + +Change-Id: I22b5ced2318373bdc0fc2fb263347a5543a2c6c0 +--- + drivers/infiniband/core/cgroup.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/infiniband/core/cgroup.c ++++ b/drivers/infiniband/core/cgroup.c +@@ -3,6 +3,8 @@ + * Copyright (C) 2016 Parav Pandit + */ + ++#ifdef HAVE_CGROUP_RDMA_H ++ + #include "core_priv.h" + + /** +@@ -51,3 +53,5 @@ void ib_rdmacg_uncharge(struct ib_rdmacg + resource_index); + } + EXPORT_SYMBOL(ib_rdmacg_uncharge); ++ ++#endif /* HAVE_CGROUP_RDMA_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0006-BACKPORT-drivers-infiniband-core-cm.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0006-BACKPORT-drivers-infiniband-core-cm.c.patch new file mode 100644 index 0000000..e3d136a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0006-BACKPORT-drivers-infiniband-core-cm.c.patch @@ -0,0 +1,376 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cm.c + +Change-Id: I2759a5278e1fe88c9e8bc2a9fa64a69c2afd41f1 +--- + drivers/infiniband/core/cm.c | 73 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 73 insertions(+) + +--- a/drivers/infiniband/core/cm.c ++++ b/drivers/infiniband/core/cm.c +@@ -28,11 +28,16 @@ + #include + #include "cm_msgs.h" + #include "core_priv.h" ++#ifdef HAVE_TRACE_EVENTS_H + #include "cm_trace.h" ++#endif + + MODULE_AUTHOR("Sean Hefty"); + MODULE_DESCRIPTION("InfiniBand CM"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + static const char * const ibcm_rej_reason_strs[] = { + [IB_CM_REJ_NO_QP] = "no QP", +@@ -1569,7 +1574,9 @@ int ib_send_cm_req(struct ib_cm_id *cm_i + cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); + cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_req(&cm_id_priv->id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; +@@ -1616,9 +1623,11 @@ static int cm_issue_rej(struct cm_port * + IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); + } + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_issue_rej( + IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg), + IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_response_msg(msg); +@@ -1970,7 +1979,9 @@ static void cm_dup_req_handler(struct cm + } + spin_unlock_irq(&cm_id_priv->lock); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + 
trace_icm_send_dup_req(&cm_id_priv->id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; +@@ -2135,7 +2146,9 @@ static int cm_req_handler(struct cm_work + + listen_cm_id_priv = cm_match_req(work, cm_id_priv); + if (!listen_cm_id_priv) { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_no_listener_err(&cm_id_priv->id); ++#endif + cm_id_priv->id.state = IB_CM_IDLE; + ret = -EINVAL; + goto destroy; +@@ -2287,7 +2300,9 @@ int ib_send_cm_rep(struct ib_cm_id *cm_i + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REQ_RCVD && + cm_id->state != IB_CM_MRA_REQ_SENT) { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); ++#endif + ret = -EINVAL; + goto out; + } +@@ -2303,7 +2318,9 @@ int ib_send_cm_rep(struct ib_cm_id *cm_i + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_rep(cm_id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; +@@ -2364,7 +2381,9 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_i + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REP_RCVD && + cm_id->state != IB_CM_MRA_REP_SENT) { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_cm_rtu_err(cm_id); ++#endif + ret = -EINVAL; + goto error; + } +@@ -2378,7 +2397,9 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_i + cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_rtu(cm_id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +@@ -2460,7 +2481,9 @@ static void cm_dup_rep_handler(struct cm + goto unlock; + spin_unlock_irq(&cm_id_priv->lock); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_dup_rep(&cm_id_priv->id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; +@@ -2484,8 +2507,10 @@ static int cm_rep_handler(struct cm_work + cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); + if (!cm_id_priv) { + cm_dup_rep_handler(work); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_remote_no_priv_err( + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); ++#endif + return -EINVAL; + } + +@@ -2498,10 +2523,12 @@ static int cm_rep_handler(struct cm_work + break; + default: + ret = -EINVAL; ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_rep_unknown_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), + cm_id_priv->id.state); ++#endif + spin_unlock_irq(&cm_id_priv->lock); + goto error; + } +@@ -2518,8 +2545,10 @@ static int cm_rep_handler(struct cm_work + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + ret = -EINVAL; ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_insert_failed_err( + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); ++#endif + goto error; + } + /* Check for a stale connection. 
*/ +@@ -2535,9 +2564,11 @@ static int cm_rep_handler(struct cm_work + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, + NULL, 0); + ret = -EINVAL; ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_staleconn_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); ++#endif + + if (cur_cm_id_priv) { + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); +@@ -2663,7 +2694,9 @@ static int cm_send_dreq_locked(struct cm + return -EINVAL; + + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_dreq_skipped(&cm_id_priv->id); ++#endif + return -EINVAL; + } + +@@ -2682,7 +2715,9 @@ static int cm_send_dreq_locked(struct cm + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_dreq(&cm_id_priv->id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_enter_timewait(cm_id_priv); +@@ -2737,7 +2772,9 @@ static int cm_send_drep_locked(struct cm + return -EINVAL; + + if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_drep_err(&cm_id_priv->id); ++#endif + kfree(private_data); + return -EINVAL; + } +@@ -2752,7 +2789,9 @@ static int cm_send_drep_locked(struct cm + cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_drep(&cm_id_priv->id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_free_msg(msg); +@@ -2802,9 +2841,11 @@ static int cm_issue_drep(struct cm_port + IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_issue_drep( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_response_msg(msg); +@@ -2826,9 +2867,11 @@ static int cm_dreq_handler(struct cm_wor + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); + cm_issue_drep(work->port, work->mad_recv_wc); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_no_priv_err( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); ++#endif + return -EINVAL; + } + +@@ -2872,7 +2915,9 @@ static int cm_dreq_handler(struct cm_wor + [CM_DREQ_COUNTER]); + goto unlock; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_dreq_unknown_err(&cm_id_priv->id); ++#endif + goto unlock; + } + cm_id_priv->id.state = IB_CM_DREQ_RCVD; +@@ -2957,11 +3002,15 @@ static int cm_send_rej_locked(struct cm_ + state); + break; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_unknown_rej_err(&cm_id_priv->id); ++#endif + return -EINVAL; + } + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_rej(&cm_id_priv->id, reason); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_free_msg(msg); +@@ -3070,7 +3119,9 @@ static int cm_rej_handler(struct cm_work + } + fallthrough; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_rej_unknown_err(&cm_id_priv->id); ++#endif + 
spin_unlock_irq(&cm_id_priv->lock); + goto out; + } +@@ -3126,7 +3177,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_i + } + fallthrough; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_mra_unknown_err(&cm_id_priv->id); ++#endif + ret = -EINVAL; + goto error_unlock; + } +@@ -3141,7 +3194,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_i + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + msg_response, service_timeout, + private_data, private_data_len); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_mra(cm_id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto error_free_msg; +@@ -3233,7 +3288,9 @@ static int cm_mra_handler(struct cm_work + [CM_MRA_COUNTER]); + fallthrough; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_mra_unknown_err(&cm_id_priv->id); ++#endif + goto out; + } + +@@ -3522,7 +3579,9 @@ int ib_send_cm_sidr_req(struct ib_cm_id + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_sidr_req(&cm_id_priv->id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; +@@ -3684,7 +3743,9 @@ static int cm_send_sidr_rep_locked(struc + + cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, + param); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_send_sidr_rep(&cm_id_priv->id); ++#endif + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_free_msg(msg); +@@ -3786,7 +3847,9 @@ static void cm_process_send_error(struct + wc_status == IB_WC_WR_FLUSH_ERR) + goto out_unlock; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_mad_send_err(state, wc_status); ++#endif + switch (state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: +@@ -3901,7 +3964,9 @@ static void cm_work_handler(struct work_ + ret = cm_timewait_handler(work); + break; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_handler_err(work->cm_event.event); ++#endif + ret = -EINVAL; + break; + } +@@ -3936,7 +4001,9 @@ static int cm_establish(struct ib_cm_id + ret = -EISCONN; + break; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_establish_err(cm_id); ++#endif + ret = -EINVAL; + break; + } +@@ -4124,7 +4191,9 @@ static int cm_init_qp_init_attr(struct c + ret = 0; + break; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_qp_init_err(&cm_id_priv->id); ++#endif + ret = -EINVAL; + break; + } +@@ -4177,7 +4246,9 @@ static int cm_init_qp_rtr_attr(struct cm + ret = 0; + break; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_qp_rtr_err(&cm_id_priv->id); ++#endif + ret = -EINVAL; + break; + } +@@ -4239,7 +4310,9 @@ static int cm_init_qp_rts_attr(struct cm + ret = 0; + break; + default: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_icm_qp_rts_err(&cm_id_priv->id); ++#endif + ret = -EINVAL; + break; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0007-BACKPORT-drivers-infiniband-core-cm_trace.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0007-BACKPORT-drivers-infiniband-core-cm_trace.c.patch new file mode 100644 index 0000000..1aaa5cf --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0007-BACKPORT-drivers-infiniband-core-cm_trace.c.patch @@ -0,0 +1,18 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cm_trace.c + +Change-Id: I16f0c2399d1f36e5cec310156e7210066263ad57 +--- + drivers/infiniband/core/cm_trace.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/core/cm_trace.c ++++ b/drivers/infiniband/core/cm_trace.c +@@ -11,5 +11,6 @@ + #include "cma_priv.h" + + #define CREATE_TRACE_POINTS +- ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + #include "cm_trace.h" ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0008-BACKPORT-drivers-infiniband-core-cma.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0008-BACKPORT-drivers-infiniband-core-cma.c.patch new file mode 100644 index 0000000..192152a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0008-BACKPORT-drivers-infiniband-core-cma.c.patch @@ -0,0 +1,390 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cma.c + +Change-Id: I57ffea877bd35e565e93dbfd51fc81a8ff594de8 +--- + drivers/infiniband/core/cma.c | 97 +++++++++++++++++++++++++++++++++-- + 1 file changed, 92 insertions(+), 5 deletions(-) + +--- a/drivers/infiniband/core/cma.c ++++ b/drivers/infiniband/core/cma.c +@@ -39,11 +39,16 @@ + + #include "core_priv.h" + #include "cma_priv.h" ++#ifdef HAVE_TRACE_EVENTS_H + #include "cma_trace.h" ++#endif + + MODULE_AUTHOR("Sean Hefty"); + MODULE_DESCRIPTION("Generic RDMA CM Agent"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + #define CMA_CM_RESPONSE_TIMEOUT 22 + #define CMA_MAX_CM_RETRIES 15 +@@ -175,6 +180,7 @@ static struct rb_root id_table = RB_ROOT + /* Serialize operations of id_table tree */ + static DEFINE_SPINLOCK(id_table_lock); + static struct workqueue_struct *cma_wq; ++ + static struct workqueue_struct *cma_netevent_wq; + static unsigned int cma_pernet_id; + +@@ -244,6 +250,7 @@ static struct rdma_bind_list *cma_ps_fin + struct xarray *xa = cma_pernet_xa(net, ps); + + return xa_load(xa, snum); ++ + } + + static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, +@@ -602,7 +609,9 @@ static void _cma_attach_to_dev(struct rd + rdma_node_get_transport(cma_dev->device->node_type); + list_add_tail(&id_priv->device_item, &cma_dev->id_list); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_id_attach(id_priv, cma_dev->device); ++#endif + } + + static void cma_attach_to_dev(struct rdma_id_private *id_priv, +@@ -1114,12 +1123,16 @@ int rdma_create_qp(struct rdma_cm_id *id + id->qp = qp; + id_priv->qp_num = qp->qp_num; + id_priv->srq = (qp->srq != NULL); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); ++#endif + return 0; + out_destroy: + ib_destroy_qp(qp); + out_err: ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); ++#endif + return ret; + } + EXPORT_SYMBOL(rdma_create_qp); +@@ -1129,7 +1142,9 @@ void rdma_destroy_qp(struct rdma_cm_id * + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_qp_destroy(id_priv); ++#endif + mutex_lock(&id_priv->qp_mutex); + ib_destroy_qp(id_priv->id.qp); + id_priv->id.qp = NULL; +@@ -1579,7 +1594,12 @@ static bool validate_ipv4_net_dev(struct + fl4.saddr = saddr; 
+ + rcu_read_lock(); ++ ++#ifdef HAVE_FIB_LOOKUP_4_PARAMS + err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ++#else ++ err = fib_lookup(dev_net(net_dev), &fl4, &res); ++#endif + ret = err == 0 && FIB_RES_DEV(res) == net_dev; + rcu_read_unlock(); + +@@ -1595,7 +1615,11 @@ static bool validate_ipv6_net_dev(struct + IPV6_ADDR_LINKLOCAL; + struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, + &src_addr->sin6_addr, net_dev->ifindex, ++#ifdef HAVE_RT6_LOOKUP_TAKES_6_PARAMS + NULL, strict); ++#else ++ strict); ++#endif + bool ret; + + if (!rt) +@@ -1781,13 +1805,14 @@ static struct rdma_id_private *cma_find_ + const struct net_device *net_dev) + { + struct rdma_id_private *id_priv, *id_priv_dev; ++ COMPAT_HL_NODE + + lockdep_assert_held(&lock); + + if (!bind_list) + return ERR_PTR(-EINVAL); + +- hlist_for_each_entry(id_priv, &bind_list->owners, node) { ++ compat_hlist_for_each_entry(id_priv, &bind_list->owners, node) { + if (cma_match_private_data(id_priv, ib_event->private_data)) { + if (id_priv->id.device == cm_id->device && + cma_match_net_dev(&id_priv->id, net_dev, req)) +@@ -2061,7 +2086,9 @@ static void destroy_id_handler_unlock(st + enum rdma_cm_state state; + unsigned long flags; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_id_destroy(id_priv); ++#endif + + /* + * Setting the state to destroyed under the handler mutex provides a +@@ -2100,7 +2127,9 @@ static int cma_rep_recv(struct rdma_id_p + if (ret) + goto reject; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_rtu(id_priv); ++#endif + ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); + if (ret) + goto reject; +@@ -2109,7 +2138,9 @@ static int cma_rep_recv(struct rdma_id_p + reject: + pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. 
status %d\n", ret); + cma_modify_qp_err(id_priv); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_rej(id_priv); ++#endif + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +@@ -2139,9 +2170,13 @@ static int cma_cm_event_handler(struct r + + lockdep_assert_held(&id_priv->handler_mutex); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_event_handler(id_priv, event); ++#endif + ret = id_priv->id.event_handler(&id_priv->id, event); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_event_done(id_priv, event, ret); ++#endif + return ret; + } + +@@ -2170,7 +2205,9 @@ static int cma_ib_handler(struct ib_cm_i + case IB_CM_REP_RECEIVED: + if (state == RDMA_CM_CONNECT && + (id_priv->id.qp_type != IB_QPT_UD)) { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_mra(id_priv); ++#endif + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + } + if (id_priv->id.qp) { +@@ -2381,7 +2418,9 @@ static int cma_ib_req_handler(struct ib_ + if (IS_ERR(listen_id)) + return PTR_ERR(listen_id); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_req_handler(listen_id, ib_event->event); ++#endif + if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { + ret = -EINVAL; + goto net_dev_put; +@@ -2432,7 +2471,9 @@ static int cma_ib_req_handler(struct ib_ + + if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && + conn_id->id.qp_type != IB_QPT_UD) { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_mra(cm_id->context); ++#endif + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + } + mutex_unlock(&conn_id->handler_mutex); +@@ -2676,7 +2717,9 @@ static int cma_listen_handler(struct rdm + + id->context = id_priv->id.context; + id->event_handler = id_priv->id.event_handler; ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_event_handler(id_priv, event); ++#endif + return id_priv->id.event_handler(id, event); + } + +@@ -3190,10 +3233,19 @@ struct iboe_prio_tc_map { + bool found; + }; + ++#ifdef HAVE_NETDEV_WALK_ALL_LOWER_DEV_RCU + static int get_lower_vlan_dev_tc(struct net_device *dev, +- struct netdev_nested_priv *priv) ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT ++ struct netdev_nested_priv *priv) ++#else ++ void *data) ++#endif + { ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; ++#else ++ struct iboe_prio_tc_map *map = data; ++#endif + + if (is_vlan_dev(dev)) + map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); +@@ -3207,24 +3259,36 @@ static int get_lower_vlan_dev_tc(struct + map->found = true; + return 1; + } ++#endif + + static int iboe_tos_to_sl(struct net_device *ndev, int tos) + { + struct iboe_prio_tc_map prio_tc_map = {}; + int prio = rt_tos2priority(tos); ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + struct netdev_nested_priv priv; ++#endif + + /* If VLAN device, get it directly from the VLAN netdev */ +- if (is_vlan_dev(ndev)) ++ if (is_vlan_dev(ndev)) { + return get_vlan_ndev_tc(ndev, prio); ++ } + + prio_tc_map.input_prio = prio; ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + priv.data = (void *)&prio_tc_map; ++#endif ++#ifdef HAVE_NETDEV_WALK_ALL_LOWER_DEV_RCU + rcu_read_lock(); + netdev_walk_all_lower_dev_rcu(ndev, + get_lower_vlan_dev_tc, ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + &priv); ++#else ++ &prio_tc_map); ++#endif + rcu_read_unlock(); ++#endif + 
/* If map is found from lower device, use it; Otherwise + * continue with the current netdevice to get priority to tc map. + */ +@@ -3779,10 +3843,11 @@ static int cma_port_is_unique(struct rdm + struct sockaddr *daddr = cma_dst_addr(id_priv); + struct sockaddr *saddr = cma_src_addr(id_priv); + __be16 dport = cma_port(daddr); ++ COMPAT_HL_NODE + + lockdep_assert_held(&lock); + +- hlist_for_each_entry(cur_id, &bind_list->owners, node) { ++ compat_hlist_for_each_entry(cur_id, &bind_list->owners, node) { + struct sockaddr *cur_daddr = cma_dst_addr(cur_id); + struct sockaddr *cur_saddr = cma_src_addr(cur_id); + __be16 cur_dport = cma_port(cur_daddr); +@@ -3869,11 +3934,12 @@ static int cma_check_port(struct rdma_bi + { + struct rdma_id_private *cur_id; + struct sockaddr *addr, *cur_addr; ++ COMPAT_HL_NODE + + lockdep_assert_held(&lock); + + addr = cma_src_addr(id_priv); +- hlist_for_each_entry(cur_id, &bind_list->owners, node) { ++ compat_hlist_for_each_entry(cur_id, &bind_list->owners, node) { + if (id_priv == cur_id) + continue; + +@@ -4277,7 +4343,9 @@ static int cma_resolve_ib_udp(struct rdm + req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); + req.max_cm_retries = CMA_MAX_CM_RETRIES; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_sidr_req(id_priv); ++#endif + ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); +@@ -4354,7 +4422,9 @@ static int cma_connect_ib(struct rdma_id + req.ece.vendor_id = id_priv->ece.vendor_id; + req.ece.attr_mod = id_priv->ece.attr_mod; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_req(id_priv); ++#endif + ret = ib_send_cm_req(id_priv->cm_id.ib, &req); + out: + if (ret && !IS_ERR(id)) { +@@ -4528,7 +4598,9 @@ static int cma_accept_ib(struct rdma_id_ + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_rep(id_priv); ++#endif + ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); + out: + return ret; +@@ -4582,7 +4654,9 @@ static int cma_send_sidr_rep(struct rdma + rep.private_data = private_data; + rep.private_data_len = private_data_len; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_sidr_rep(id_priv); ++#endif + return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); + } + +@@ -4719,7 +4793,9 @@ int rdma_reject(struct rdma_cm_id *id, c + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, + private_data, private_data_len); + } else { ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_send_rej(id_priv); ++#endif + ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, + private_data, private_data_len); + } +@@ -4748,6 +4824,7 @@ int rdma_disconnect(struct rdma_cm_id *i + if (ret) + goto out; + /* Initiate or respond to a disconnect. 
*/ ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_disconnect(id_priv); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { + if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) +@@ -4755,6 +4832,10 @@ int rdma_disconnect(struct rdma_cm_id *i + } else { + trace_cm_sent_dreq(id_priv); + } ++#else ++ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) ++ ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); ++#endif + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); + } else +@@ -5231,7 +5312,9 @@ static void cma_send_device_removal_put( + */ + cma_id_put(id_priv); + mutex_unlock(&id_priv->handler_mutex); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_id_destroy(id_priv); ++#endif + _destroy_id(id_priv, state); + return; + } +@@ -5337,7 +5420,9 @@ static int cma_add_one(struct ib_device + } + mutex_unlock(&lock); + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_add_one(device); ++#endif + return 0; + + free_listen: +@@ -5359,7 +5444,9 @@ static void cma_remove_one(struct ib_dev + { + struct cma_device *cma_dev = client_data; + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_cm_remove_one(device); ++#endif + + mutex_lock(&lock); + list_del(&cma_dev->list); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0009-BACKPORT-drivers-infiniband-core-cma_configfs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0009-BACKPORT-drivers-infiniband-core-cma_configfs.c.patch new file mode 100644 index 0000000..d79a5b7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0009-BACKPORT-drivers-infiniband-core-cma_configfs.c.patch @@ -0,0 +1,245 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cma_configfs.c + +Change-Id: Idb75744ab42c657a5df47d912dc4742334b5ddde +--- + drivers/infiniband/core/cma_configfs.c | 135 +++++++++++++++++++++++-- + 1 file changed, 126 insertions(+), 9 deletions(-) + +--- a/drivers/infiniband/core/cma_configfs.c ++++ b/drivers/infiniband/core/cma_configfs.c +@@ -38,6 +38,10 @@ + #include "core_priv.h" + #include "cma_priv.h" + ++#ifndef CONFIGFS_ATTR ++#define HAVE_OLD_CONFIGFS_API ++#endif ++ + struct cma_device; + + struct cma_dev_group; +@@ -55,6 +59,23 @@ struct cma_dev_group { + struct cma_dev_port_group *ports; + }; + ++#ifdef HAVE_OLD_CONFIGFS_API ++struct cma_configfs_attr { ++ struct configfs_attribute attr; ++ ssize_t (*show)(struct config_item *item, ++ char *buf); ++ ssize_t (*store)(struct config_item *item, ++ const char *buf, size_t count); ++}; ++#define CONFIGFS_ATTR(dummy, _name) \ ++static struct cma_configfs_attr attr_##_name = \ ++ __CONFIGFS_ATTR(_name, S_IRUGO | S_IWUSR, _name##_show, _name##_store) ++ ++#define CONFIGFS_ATTR_ADD(name) &name.attr ++#else ++#define CONFIGFS_ATTR_ADD(name) &name ++#endif /* HAVE_OLD_CONFIGFS_API */ ++ + static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) + { + struct config_group *group; +@@ -71,6 +92,34 @@ static bool filter_by_name(struct ib_dev + return !strcmp(dev_name(&ib_dev->dev), cookie); + } + ++#ifdef HAVE_OLD_CONFIGFS_API ++static ssize_t cma_configfs_attr_show(struct config_item *item, ++ struct configfs_attribute *attr, ++ char *buf) ++{ ++ struct cma_configfs_attr *ca = ++ container_of(attr, struct cma_configfs_attr, attr); ++ ++ if (ca->show) ++ return ca->show(item, buf); ++ ++ return -EINVAL; ++} ++ ++static ssize_t cma_configfs_attr_store(struct config_item *item, ++ struct 
configfs_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct cma_configfs_attr *ca = ++ container_of(attr, struct cma_configfs_attr, attr); ++ ++ if (ca->store) ++ return ca->store(item, buf, count); ++ ++ return -EINVAL; ++} ++#endif /* HAVE_OLD_CONFIGFS_API */ ++ + static int cma_configfs_params_get(struct config_item *item, + struct cma_device **pcma_dev, + struct cma_dev_port_group **pgroup) +@@ -187,12 +236,23 @@ static ssize_t default_roce_tos_store(st + CONFIGFS_ATTR(, default_roce_tos); + + static struct configfs_attribute *cma_configfs_attributes[] = { +- &attr_default_roce_mode, +- &attr_default_roce_tos, ++ CONFIGFS_ATTR_ADD(attr_default_roce_mode), ++ CONFIGFS_ATTR_ADD(attr_default_roce_tos), + NULL, + }; + +-static const struct config_item_type cma_port_group_type = { ++#ifdef HAVE_OLD_CONFIGFS_API ++static struct configfs_item_operations cma_item_ops = { ++ .show_attribute = cma_configfs_attr_show, ++ .store_attribute = cma_configfs_attr_store, ++}; ++#else /* HAVE_OLD_CONFIGFS_API */ ++static struct configfs_item_operations cma_item_ops = { ++}; ++#endif ++ ++static struct config_item_type cma_port_group_type = { ++ .ct_item_ops = &cma_item_ops, + .ct_attrs = cma_configfs_attributes, + .ct_owner = THIS_MODULE + }; +@@ -217,6 +277,14 @@ static int make_cma_ports(struct cma_dev + if (!ports) + return -ENOMEM; + ++#ifndef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST ++ cma_dev_group->ports_group.default_groups = kcalloc((ports_num + 1), ++ sizeof(struct config_group *), ++ GFP_KERNEL); ++ if (!cma_dev_group->ports_group.default_groups) ++ return -ENOMEM; ++#endif ++ + for (i = 0; i < ports_num; i++) { + char port_str[10]; + +@@ -226,10 +294,16 @@ static int make_cma_ports(struct cma_dev + config_group_init_type_name(&ports[i].group, + port_str, + &cma_port_group_type); ++#ifdef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST + configfs_add_default_group(&ports[i].group, + &cma_dev_group->ports_group); +- ++#else ++ cma_dev_group->ports_group.default_groups[i] = &ports[i].group; ++#endif + } ++#ifndef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST ++ cma_dev_group->ports_group.default_groups[i] = NULL; ++#endif + cma_dev_group->ports = ports; + return 0; + } +@@ -261,7 +335,7 @@ static struct configfs_item_operations c + .release = release_cma_ports_group + }; + +-static const struct config_item_type cma_ports_group_type = { ++static struct config_item_type cma_ports_group_type = { + .ct_item_ops = &cma_ports_item_ops, + .ct_owner = THIS_MODULE + }; +@@ -270,7 +344,7 @@ static struct configfs_item_operations c + .release = release_cma_dev + }; + +-static const struct config_item_type cma_device_group_type = { ++static struct config_item_type cma_device_group_type = { + .ct_item_ops = &cma_device_item_ops, + .ct_owner = THIS_MODULE + }; +@@ -293,6 +367,15 @@ static struct config_group *make_cma_dev + goto fail; + } + ++#ifndef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST ++ cma_dev_group->device_group.default_groups = kzalloc(sizeof(struct config_group *) * 2, ++ GFP_KERNEL); ++ if (!cma_dev_group->device_group.default_groups) { ++ err = -ENOMEM; ++ goto fail; ++ } ++#endif ++ + strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + + config_group_init_type_name(&cma_dev_group->ports_group, "ports", +@@ -300,16 +383,29 @@ static struct config_group *make_cma_dev + + err = make_cma_ports(cma_dev_group, cma_dev); + if (err) ++#ifdef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST + goto fail; ++#else ++ goto fail_free; ++#endif + + config_group_init_type_name(&cma_dev_group->device_group, name, + 
&cma_device_group_type); +- configfs_add_default_group(&cma_dev_group->ports_group, +- &cma_dev_group->device_group); ++#ifdef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST ++ configfs_add_default_group(&cma_dev_group->ports_group, ++ &cma_dev_group->device_group); ++#else ++ cma_dev_group->device_group.default_groups[0] = &cma_dev_group->ports_group; ++ cma_dev_group->device_group.default_groups[1] = NULL; ++#endif + + cma_dev_put(cma_dev); + return &cma_dev_group->device_group; + ++#ifndef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST ++fail_free: ++ kfree(cma_dev_group->device_group.default_groups); ++#endif + fail: + if (cma_dev) + cma_dev_put(cma_dev); +@@ -324,8 +420,29 @@ static void drop_cma_dev(struct config_g + struct cma_dev_group *cma_dev_group = + container_of(group, struct cma_dev_group, device_group); + ++#ifdef HAVE_CONFIGFS_DEFAULT_GROUPS_LIST + configfs_remove_default_groups(&cma_dev_group->ports_group); + configfs_remove_default_groups(&cma_dev_group->device_group); ++#else ++ struct config_item *temp_item; ++ int i; ++ ++ for (i = 0; cma_dev_group->ports_group.default_groups[i]; i++) { ++ temp_item = ++ &cma_dev_group->ports_group.default_groups[i]->cg_item; ++ cma_dev_group->ports_group.default_groups[i] = NULL; ++ config_item_put(temp_item); ++ } ++ kfree(cma_dev_group->ports_group.default_groups); ++ ++ for (i = 0; cma_dev_group->device_group.default_groups[i]; i++) { ++ temp_item = ++ &cma_dev_group->device_group.default_groups[i]->cg_item; ++ cma_dev_group->device_group.default_groups[i] = NULL; ++ config_item_put(temp_item); ++ } ++ kfree(cma_dev_group->device_group.default_groups); ++#endif + config_item_put(item); + } + +@@ -334,7 +451,7 @@ static struct configfs_group_operations + .drop_item = drop_cma_dev, + }; + +-static const struct config_item_type cma_subsys_type = { ++static struct config_item_type cma_subsys_type = { + .ct_group_ops = &cma_subsys_group_ops, + .ct_owner = THIS_MODULE, + }; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0010-BACKPORT-drivers-infiniband-core-cma_trace.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0010-BACKPORT-drivers-infiniband-core-cma_trace.h.patch new file mode 100644 index 0000000..f49cc81 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0010-BACKPORT-drivers-infiniband-core-cma_trace.h.patch @@ -0,0 +1,24 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cma_trace.h + +Change-Id: I4b13aeb490d1a672b92e0af28e4efa3882e774f9 +--- + drivers/infiniband/core/cma_trace.h | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/infiniband/core/cma_trace.h ++++ b/drivers/infiniband/core/cma_trace.h +@@ -6,6 +6,7 @@ + * + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
+ */ ++#ifdef HAVE_TRACE_EVENTS_H + + #undef TRACE_SYSTEM + #define TRACE_SYSTEM rdma_cma +@@ -359,3 +360,5 @@ DEFINE_CMA_CLIENT_EVENT(remove_one); + #define TRACE_INCLUDE_FILE cma_trace + + #include ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0011-BACKPORT-drivers-infiniband-core-core_priv.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0011-BACKPORT-drivers-infiniband-core-core_priv.h.patch new file mode 100644 index 0000000..d644a48 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0011-BACKPORT-drivers-infiniband-core-core_priv.h.patch @@ -0,0 +1,93 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/core_priv.h + +Change-Id: Ie1b1fbc2aa6d8bc4cc8d9867510a2c68f7f6ac26 +--- + drivers/infiniband/core/core_priv.h | 42 +++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + +--- a/drivers/infiniband/core/core_priv.h ++++ b/drivers/infiniband/core/core_priv.h +@@ -40,6 +40,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -150,6 +151,7 @@ void ib_cache_cleanup_one(struct ib_devi + void ib_cache_release_one(struct ib_device *device); + void ib_dispatch_event_clients(struct ib_event *event); + ++#ifdef HAVE_CGROUP_RDMA_H + #ifdef CONFIG_CGROUP_RDMA + void ib_device_register_rdmacg(struct ib_device *device); + void ib_device_unregister_rdmacg(struct ib_device *device); +@@ -183,11 +185,39 @@ static inline void ib_rdmacg_uncharge(st + { + } + #endif ++#endif /* HAVE_CGROUP_RDMA_H */ + + static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) + { ++#if defined(HAVE_NETDEV_HAS_UPPER_DEV_ALL_RCU) + return netdev_has_upper_dev_all_rcu(dev, upper); ++#elif defined(HAVE_NETDEV_FOR_EACH_ALL_UPPER_DEV_RCU) ++ struct net_device *_upper = NULL; ++ struct list_head *iter; ++ ++ netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) ++ if (_upper == upper) ++ break; ++ ++ return _upper == upper; ++#else ++ struct net_device *rdev_upper; ++ struct net_device *master; ++ bool ret; ++ ++ if (!upper || !dev) ++ ret = false; ++ ++ rdev_upper = rdma_vlan_dev_real_dev(upper); ++ master = netdev_master_upper_dev_get_rcu(dev); ++ ++ ret = (upper == master) || ++ (rdev_upper && (rdev_upper == master)) || ++ (rdev_upper == dev); ++ ++ return ret; ++#endif + } + + int addr_init(void); +@@ -203,14 +233,26 @@ void rdma_nl_init(void); + void rdma_nl_exit(void); + + int ib_nl_handle_resolve_resp(struct sk_buff *skb, ++#ifdef HAVE_NETLINK_EXT_ACK + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); ++#else ++ struct nlmsghdr *nlh); ++#endif + int ib_nl_handle_set_timeout(struct sk_buff *skb, ++#ifdef HAVE_NETLINK_EXT_ACK + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); ++#else ++ struct nlmsghdr *nlh); ++#endif + int ib_nl_handle_ip_res_resp(struct sk_buff *skb, ++#ifdef HAVE_NETLINK_EXT_ACK + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); ++#else ++ struct nlmsghdr *nlh); ++#endif + + void ib_get_cached_subnet_prefix(struct ib_device *device, + u32 port_num, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0012-BACKPORT-drivers-infiniband-core-counters.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0012-BACKPORT-drivers-infiniband-core-counters.c.patch new file mode 100644 index 0000000..e5269bc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0012-BACKPORT-drivers-infiniband-core-counters.c.patch @@ -0,0 +1,47 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/counters.c + +Change-Id: I3b1ff54bb1f1e37d29316978f6670261a727df91 +--- + 
drivers/infiniband/core/counters.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +--- a/drivers/infiniband/core/counters.c ++++ b/drivers/infiniband/core/counters.c +@@ -40,8 +40,12 @@ static int __counter_set_mode(struct rdm + * + */ + int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, +- enum rdma_nl_counter_mask mask, +- struct netlink_ext_ack *extack) ++ enum rdma_nl_counter_mask mask ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack) ++#else ++ ) ++#endif + { + struct rdma_port_counter *port_counter; + enum rdma_nl_counter_mode mode; +@@ -63,15 +67,20 @@ int rdma_counter_set_auto_mode(struct ib + ret = 0; + goto out; + } +- + ret = __counter_set_mode(port_counter, mode, mask); + + out: + mutex_unlock(&port_counter->lock); + if (ret == -EBUSY) ++#ifdef HAVE_NETLINK_EXT_ACK + NL_SET_ERR_MSG( + extack, + "Modifying auto mode is not allowed when there is a bound QP"); ++#else ++ netdev_warn(ib_device_get_netdev(dev, port), ++ "Modifying auto mode is not allowed when there is a bound QP\n"); ++ ++#endif + return ret; + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0013-BACKPORT-drivers-infiniband-core-cq.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0013-BACKPORT-drivers-infiniband-core-cq.c.patch new file mode 100644 index 0000000..f529245 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0013-BACKPORT-drivers-infiniband-core-cq.c.patch @@ -0,0 +1,206 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/cq.c + +Change-Id: I6b8815ed66d0f138f864b475759c7c1716facee1 +--- + drivers/infiniband/core/cq.c | 71 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 71 insertions(+) + +--- a/drivers/infiniband/core/cq.c ++++ b/drivers/infiniband/core/cq.c +@@ -9,7 +9,10 @@ + + #include "core_priv.h" + ++#include ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + #include ++#endif + /* Max size for shared CQ, may require tuning */ + #define IB_MAX_SHARED_CQ_SZ 4096U + +@@ -47,7 +50,9 @@ static void ib_cq_rdma_dim_work(struct w + + dim->state = DIM_START_MEASURE; + ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_modify(cq, comps, usec); ++#endif + cq->device->ops.modify_cq(cq, comps, usec); + } + +@@ -81,6 +86,7 @@ static void rdma_dim_destroy(struct ib_c + kfree(cq->dim); + } + ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) + { + int rc; +@@ -89,21 +95,30 @@ static int __poll_cq(struct ib_cq *cq, i + trace_cq_poll(cq, num_entries, rc); + return rc; + } ++#endif + + static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, + int batch) + { + int i, n, completed = 0; + ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_process(cq); ++#endif + + /* + * budget might be (-1) if the caller does not + * want to bound this call, thus we need unsigned + * minimum here. 
+ */ ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER ++ trace_cq_process(cq); + while ((n = __poll_cq(cq, min_t(u32, batch, + budget - completed), wcs)) > 0) { ++#else ++ while ((n = ib_poll_cq(cq, min_t(u32, batch, ++ budget - completed), wcs)) > 0) { ++#endif + for (i = 0; i < n; i++) { + struct ib_wc *wc = &wcs[i]; + +@@ -149,6 +164,8 @@ static void ib_cq_completion_direct(stru + WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); + } + ++#if defined(HAVE_IRQ_POLL_H) ++#if IS_ENABLED(CONFIG_IRQ_POLL) + static int ib_poll_handler(struct irq_poll *iop, int budget) + { + struct ib_cq *cq = container_of(iop, struct ib_cq, iop); +@@ -159,7 +176,9 @@ static int ib_poll_handler(struct irq_po + if (completed < budget) { + irq_poll_complete(&cq->iop); + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_reschedule(cq); ++#endif + irq_poll_sched(&cq->iop); + } + } +@@ -172,9 +191,36 @@ static int ib_poll_handler(struct irq_po + + static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) + { ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_schedule(cq); ++#endif + irq_poll_sched(&cq->iop); + } ++#endif /*CONFIG_IRQ_POLL*/ ++#else /*HAVE_IRQ_POLL_H*/ ++static int ib_poll_handler(struct blk_iopoll *iop, int budget) ++{ ++ struct ib_cq *cq = container_of(iop, struct ib_cq, iop); ++ int completed; ++ ++ completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); ++ if (completed < budget) { ++ blk_iopoll_complete(&cq->iop); ++ if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { ++ if (!blk_iopoll_sched_prep(&cq->iop)) ++ blk_iopoll_sched(&cq->iop); ++ } ++ } ++ ++ return completed; ++} ++ ++static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) ++{ ++ if (!blk_iopoll_sched_prep(&cq->iop)) ++ blk_iopoll_sched(&cq->iop); ++} ++#endif /*HAVE_IRQ_POLL_H*/ + + static void ib_cq_poll_work(struct work_struct *work) + { +@@ -192,7 +238,9 @@ static void ib_cq_poll_work(struct work_ + + static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) + { ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_schedule(cq); ++#endif + queue_work(cq->comp_wq, &cq->work); + } + +@@ -249,12 +297,21 @@ struct ib_cq *__ib_alloc_cq(struct ib_de + case IB_POLL_DIRECT: + cq->comp_handler = ib_cq_completion_direct; + break; ++#if IS_ENABLED(CONFIG_IRQ_POLL) || !defined(HAVE_IRQ_POLL_H) + case IB_POLL_SOFTIRQ: + cq->comp_handler = ib_cq_completion_softirq; + ++#if defined(HAVE_IRQ_POLL_H) ++#if IS_ENABLED(CONFIG_IRQ_POLL) + irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); ++#endif ++#else ++ blk_iopoll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); ++ blk_iopoll_enable(&cq->iop); ++#endif + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; ++#endif + case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; +@@ -272,7 +329,9 @@ struct ib_cq *__ib_alloc_cq(struct ib_de + rdma_restrack_dontrack(&cq->res); + else + rdma_restrack_add(&cq->res); ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); ++#endif + return cq; + + out_destroy_cq: +@@ -283,7 +342,9 @@ out_free_wc: + kfree(cq->wc); + out_free_cq: + kfree(cq); ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); ++#endif + return ERR_PTR(ret); + } + EXPORT_SYMBOL(__ib_alloc_cq); +@@ -332,9 +393,17 @@ void ib_free_cq(struct ib_cq *cq) + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; ++#if 
IS_ENABLED(CONFIG_IRQ_POLL) || !defined(HAVE_IRQ_POLL_H) + case IB_POLL_SOFTIRQ: ++#if defined(HAVE_IRQ_POLL_H) ++#if IS_ENABLED(CONFIG_IRQ_POLL) + irq_poll_disable(&cq->iop); ++#endif ++#else ++ blk_iopoll_disable(&cq->iop); ++#endif + break; ++#endif + case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: + cancel_work_sync(&cq->work); +@@ -344,7 +413,9 @@ void ib_free_cq(struct ib_cq *cq) + } + + rdma_dim_destroy(cq); ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_free(cq); ++#endif + ret = cq->device->ops.destroy_cq(cq, NULL); + WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); + rdma_restrack_del(&cq->res); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0014-BACKPORT-drivers-infiniband-core-device.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0014-BACKPORT-drivers-infiniband-core-device.c.patch new file mode 100644 index 0000000..46e147f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0014-BACKPORT-drivers-infiniband-core-device.c.patch @@ -0,0 +1,309 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/device.c + +Change-Id: I78a0cbfef6bc77d7572cb71653cf76f74437d370 +--- + drivers/infiniband/core/device.c | 114 +++++++++++++++++++++++++++---- + 1 file changed, 102 insertions(+), 12 deletions(-) + +--- a/drivers/infiniband/core/device.c ++++ b/drivers/infiniband/core/device.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #include "core_priv.h" + #include "restrack.h" +@@ -53,6 +54,9 @@ + MODULE_AUTHOR("Roland Dreier"); + MODULE_DESCRIPTION("core kernel InfiniBand API"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + struct workqueue_struct *ib_comp_wq; + struct workqueue_struct *ib_comp_unbound_wq; +@@ -186,11 +190,17 @@ static DECLARE_HASHTABLE(ndev_hash, 5); + static void free_netdevs(struct ib_device *ib_dev); + static void ib_unregister_work(struct work_struct *work); + static void __ib_unregister_device(struct ib_device *device); ++#if defined(HAVE_REGISTER_BLOCKING_LSM_NOTIFIER) || defined (HAVE_REGISTER_LSM_NOTIFIER) + static int ib_security_change(struct notifier_block *nb, unsigned long event, + void *lsm_data); + static void ib_policy_change_task(struct work_struct *work); + static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); + ++static struct notifier_block ibdev_lsm_nb = { ++ .notifier_call = ib_security_change, ++}; ++#endif ++ + static void __ibdev_printk(const char *level, const struct ib_device *ibdev, + struct va_format *vaf) + { +@@ -251,10 +261,6 @@ define_ibdev_printk_level(ibdev_warn, KE + define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); + define_ibdev_printk_level(ibdev_info, KERN_INFO); + +-static struct notifier_block ibdev_lsm_nb = { +- .notifier_call = ib_security_change, +-}; +- + static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net); + +@@ -454,17 +460,32 @@ static int alloc_name(struct ib_device * + { + struct ib_device *device; + unsigned long index; +- struct ida inuse; +- int rc; + int i; ++#ifdef HAVE_IDA_ALLOC ++ struct ida inuse; ++ int rc; ++#else ++ unsigned long *inuse; + ++ inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); ++ if (!inuse) ++ return -ENOMEM; ++#endif ++#ifdef HAVE_LOCKUP_ASSERT_HELD_EXCLUSIVE ++ lockdep_assert_held_exclusive(&devices_rwsem); ++#elif defined(HAVE_LOCKUP_ASSERT_HELD_WRITE) + lockdep_assert_held_write(&devices_rwsem); +- ida_init(&inuse); ++#endif ++ ++#ifdef HAVE_IDA_ALLOC ++ ida_init(&inuse); ++#endif + xa_for_each (&devices, 
index, device) { + char buf[IB_DEVICE_NAME_MAX]; + + if (sscanf(dev_name(&device->dev), name, &i) != 1) + continue; ++#ifdef HAVE_IDA_ALLOC + if (i < 0 || i >= INT_MAX) + continue; + snprintf(buf, sizeof buf, name, i); +@@ -484,6 +505,17 @@ static int alloc_name(struct ib_device * + out: + ida_destroy(&inuse); + return rc; ++#else ++ if (i < 0 || i >= PAGE_SIZE * 8) ++ continue; ++ snprintf(buf, sizeof buf, name, i); ++ if (!strcmp(buf, dev_name(&device->dev))) ++ set_bit(i, inuse); ++ } ++ i = find_first_zero_bit(inuse, PAGE_SIZE * 8); ++ free_page((unsigned long) inuse); ++ return dev_set_name(&ibdev->dev, name, i); ++#endif + } + + static void ib_device_release(struct device *device) +@@ -879,6 +911,7 @@ void ib_get_device_fw_str(struct ib_devi + } + EXPORT_SYMBOL(ib_get_device_fw_str); + ++#if defined(HAVE_REGISTER_BLOCKING_LSM_NOTIFIER) || defined (HAVE_REGISTER_LSM_NOTIFIER) + static void ib_policy_change_task(struct work_struct *work) + { + struct ib_device *dev; +@@ -908,6 +941,7 @@ static int ib_security_change(struct not + + return NOTIFY_OK; + } ++#endif /* HAVE_REGISTER_BLOCKING_LSM_NOTIFIER */ + + static void compatdev_release(struct device *dev) + { +@@ -1372,6 +1406,7 @@ int ib_register_device(struct ib_device + if (ret) + return ret; + ++#ifdef HAVE_DEVICE_DMA_OPS + /* + * If the caller does not provide a DMA capable device then the IB core + * will set up ib_sge and scatterlist structures that stash the kernel +@@ -1379,6 +1414,17 @@ int ib_register_device(struct ib_device + */ + WARN_ON(dma_device && !dma_device->dma_parms); + device->dma_device = dma_device; ++#else /* HAVE_DEVICE_DMA_OPS */ ++ WARN_ON_ONCE(!device->dev.parent && !device->dma_device); ++ WARN_ON_ONCE(device->dev.parent && device->dma_device ++ && device->dev.parent != device->dma_device); ++ if (!device->dev.parent) ++ device->dev.parent = device->dma_device; ++ if (!device->dma_device) ++ device->dma_device = device->dev.parent; ++ /* Setup default max segment size for all IB devices */ ++ dma_set_max_seg_size(device->dma_device, SZ_2G); ++#endif /* HAVE_DEVICE_DMA_OPS */ + + ret = setup_device(device); + if (ret) +@@ -1397,7 +1443,9 @@ int ib_register_device(struct ib_device + if (ret) + goto cache_cleanup; + ++#ifdef HAVE_CGROUP_RDMA_H + ib_device_register_rdmacg(device); ++#endif + + rdma_counter_init(device); + +@@ -1463,7 +1511,9 @@ dev_cleanup: + device_del(&device->dev); + cg_cleanup: + dev_set_uevent_suppress(&device->dev, false); ++#ifdef HAVE_CGROUP_RDMA_H + ib_device_unregister_rdmacg(device); ++#endif + cache_cleanup: + ib_cache_cleanup_one(device); + return ret; +@@ -1491,7 +1541,9 @@ static void __ib_unregister_device(struc + + ib_free_port_attrs(&ib_dev->coredev); + device_del(&ib_dev->dev); ++#ifdef HAVE_CGROUP_RDMA_H + ib_device_unregister_rdmacg(ib_dev); ++#endif + ib_cache_cleanup_one(ib_dev); + + /* +@@ -1610,6 +1662,7 @@ static void ib_unregister_work(struct wo + * Drivers using this API must use ib_unregister_driver before module unload + * to ensure that all scheduled unregistrations have completed. 
+ */ ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + void ib_unregister_device_queued(struct ib_device *ib_dev) + { + WARN_ON(!refcount_read(&ib_dev->refcount)); +@@ -1619,6 +1672,7 @@ void ib_unregister_device_queued(struct + put_device(&ib_dev->dev); + } + EXPORT_SYMBOL(ib_unregister_device_queued); ++#endif + + /* + * The caller must pass in a device that has the kref held and the refcount +@@ -2269,9 +2323,10 @@ struct ib_device *ib_device_get_by_netde + { + struct ib_device *res = NULL; + struct ib_port_data *cur; ++ COMPAT_HL_NODE; + + rcu_read_lock(); +- hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, ++ compat_hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, + (uintptr_t)ndev) { + if (rcu_access_pointer(cur->netdev) == ndev && + (driver_id == RDMA_DRIVER_UNKNOWN || +@@ -2685,6 +2740,9 @@ void ib_set_device_ops(struct ib_device + SET_DEVICE_OP(dev_ops, get_vf_config); + SET_DEVICE_OP(dev_ops, get_vf_guid); + SET_DEVICE_OP(dev_ops, get_vf_stats); ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ SET_DEVICE_OP(dev_ops, invalidate_range); ++#endif + SET_DEVICE_OP(dev_ops, iw_accept); + SET_DEVICE_OP(dev_ops, iw_add_ref); + SET_DEVICE_OP(dev_ops, iw_connect); +@@ -2795,14 +2853,28 @@ static int __init ib_core_init(void) + goto err; + + ib_comp_wq = alloc_workqueue("ib-comp-wq", +- WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); ++ 0 ++ | WQ_HIGHPRI ++ | WQ_MEM_RECLAIM ++ | WQ_SYSFS ++#if defined(HAVE_WQ_NON_REENTRANT) ++ | WQ_NON_REENTRANT ++#endif ++ , 0); + if (!ib_comp_wq) + goto err_unbound; + + ib_comp_unbound_wq = + alloc_workqueue("ib-comp-unb-wq", +- WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | +- WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); ++ 0 ++ | WQ_UNBOUND ++ | WQ_HIGHPRI ++ | WQ_MEM_RECLAIM ++ | WQ_SYSFS ++#if defined(HAVE_WQ_NON_REENTRANT) ++ | WQ_NON_REENTRANT ++#endif ++ , WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_unbound_wq) + goto err_comp; + +@@ -2832,11 +2904,17 @@ static int __init ib_core_init(void) + goto err_mad; + } + ++#if defined(HAVE_REGISTER_BLOCKING_LSM_NOTIFIER) || defined(HAVE_REGISTER_LSM_NOTIFIER) ++#ifdef HAVE_REGISTER_BLOCKING_LSM_NOTIFIER + ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); ++#elif defined(HAVE_REGISTER_LSM_NOTIFIER) ++ ret = register_lsm_notifier(&ibdev_lsm_nb); ++#endif /* HAVE_REGISTER_BLOCKING_LSM_NOTIFIER */ + if (ret) { + pr_warn("Couldn't register LSM notifier. ret %d\n", ret); + goto err_sa; + } ++#endif + + ret = register_pernet_device(&rdma_dev_net_ops); + if (ret) { +@@ -2851,9 +2929,15 @@ static int __init ib_core_init(void) + return 0; + + err_compat: ++#if defined(HAVE_REGISTER_BLOCKING_LSM_NOTIFIER) || defined(HAVE_REGISTER_LSM_NOTIFIER) ++#ifdef HAVE_REGISTER_BLOCKING_LSM_NOTIFIER + unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ++#elif defined(HAVE_REGISTER_LSM_NOTIFIER) ++ unregister_lsm_notifier(&ibdev_lsm_nb); ++#endif /* HAVE_REGISTER_BLOCKING_LSM_NOTIFIER */ + err_sa: + ib_sa_cleanup(); ++#endif + err_mad: + ib_mad_cleanup(); + err_addr: +@@ -2877,7 +2961,11 @@ static void __exit ib_core_cleanup(void) + nldev_exit(); + rdma_nl_unregister(RDMA_NL_LS); + unregister_pernet_device(&rdma_dev_net_ops); ++#ifdef HAVE_REGISTER_BLOCKING_LSM_NOTIFIER + unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ++#elif defined(HAVE_REGISTER_LSM_NOTIFIER) ++ unregister_lsm_notifier(&ibdev_lsm_nb); ++#endif + ib_sa_cleanup(); + ib_mad_cleanup(); + addr_cleanup(); +@@ -2887,7 +2975,9 @@ static void __exit ib_core_cleanup(void) + destroy_workqueue(ib_comp_wq); + /* Make sure that any pending umem accounting work is done. 
*/ + destroy_workqueue(ib_wq); ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + destroy_workqueue(ib_unreg_wq); ++#endif + WARN_ON(!xa_empty(&clients)); + WARN_ON(!xa_empty(&devices)); + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0015-BACKPORT-drivers-infiniband-core-iwcm.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0015-BACKPORT-drivers-infiniband-core-iwcm.c.patch new file mode 100644 index 0000000..b693db0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0015-BACKPORT-drivers-infiniband-core-iwcm.c.patch @@ -0,0 +1,85 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/iwcm.c + +Change-Id: I4237de312a416df68e50caa7a67a429cc41a2034 +--- + drivers/infiniband/core/iwcm.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/drivers/infiniband/core/iwcm.c ++++ b/drivers/infiniband/core/iwcm.c +@@ -58,6 +58,9 @@ + MODULE_AUTHOR("Tom Tucker"); + MODULE_DESCRIPTION("iWARP CM"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + static const char * const iwcm_rej_reason_strs[] = { + [ECONNRESET] = "reset by remote host", +@@ -102,6 +105,7 @@ struct iwcm_work { + + static unsigned int default_backlog = 256; + ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + static struct ctl_table_header *iwcm_ctl_table_hdr; + static struct ctl_table iwcm_ctl_table[] = { + { +@@ -113,6 +117,7 @@ static struct ctl_table iwcm_ctl_table[] + }, + { } + }; ++#endif + + /* + * The following services provide a mechanism for pre-allocating iwcm_work +@@ -480,6 +485,7 @@ static void iw_cm_check_wildcard(struct + cm4_outaddr->sin_addr = cm4_addr->sin_addr; + } + } else { ++#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr; + + if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) { +@@ -490,6 +496,7 @@ static void iw_cm_check_wildcard(struct + + cm6_outaddr->sin6_addr = cm6_addr->sin6_addr; + } ++#endif + } + } + +@@ -1192,18 +1199,22 @@ static int __init iw_cm_init(void) + if (!iwcm_wq) + goto err_alloc; + ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", + iwcm_ctl_table); + if (!iwcm_ctl_table_hdr) { + pr_err("iw_cm: couldn't register sysctl paths\n"); + goto err_sysctl; + } ++#endif + + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + return 0; + ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + err_sysctl: + destroy_workqueue(iwcm_wq); ++#endif + err_alloc: + iwpm_exit(RDMA_NL_IWCM); + return -ENOMEM; +@@ -1211,7 +1222,9 @@ err_alloc: + + static void __exit iw_cm_cleanup(void) + { ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + rdma_nl_unregister(RDMA_NL_IWCM); ++#endif + unregister_net_sysctl_table(iwcm_ctl_table_hdr); + destroy_workqueue(iwcm_wq); + iwpm_exit(RDMA_NL_IWCM); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0016-BACKPORT-drivers-infiniband-core-iwpm_util.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0016-BACKPORT-drivers-infiniband-core-iwpm_util.c.patch new file mode 100644 index 0000000..a1de80b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0016-BACKPORT-drivers-infiniband-core-iwpm_util.c.patch @@ -0,0 +1,123 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/iwpm_util.c + +Change-Id: I984f3b6ce0da2c5e1a5c102990e6e254c4ad4afb +--- + drivers/infiniband/core/iwpm_util.c | 32 +++++++++++++++++++++-------- + 1 file changed, 24 insertions(+), 8 deletions(-) + +--- a/drivers/infiniband/core/iwpm_util.c ++++ b/drivers/infiniband/core/iwpm_util.c +@@ -156,6 +156,7 @@ int iwpm_remove_mapinfo(struct 
sockaddr_ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info = NULL; ++ COMPAT_HL_NODE + unsigned long flags; + int ret = -EINVAL; + +@@ -167,7 +168,7 @@ int iwpm_remove_mapinfo(struct sockaddr_ + if (!hash_bucket_head) + goto remove_mapinfo_exit; + +- hlist_for_each_entry_safe(map_info, tmp_hlist_node, ++ compat_hlist_for_each_entry_safe(map_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, +@@ -189,13 +190,14 @@ static void free_hash_bucket(void) + { + struct hlist_node *tmp_hlist_node; + struct iwpm_mapping_info *map_info; ++ COMPAT_HL_NODE + unsigned long flags; + int i; + + /* remove all the mapinfo data from the list */ + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { +- hlist_for_each_entry_safe(map_info, tmp_hlist_node, ++ compat_hlist_for_each_entry_safe(map_info, tmp_hlist_node, + &iwpm_hash_bucket[i], hlist_node) { + + hlist_del_init(&map_info->hlist_node); +@@ -212,13 +214,14 @@ static void free_reminfo_bucket(void) + { + struct hlist_node *tmp_hlist_node; + struct iwpm_remote_info *rem_info; ++ COMPAT_HL_NODE + unsigned long flags; + int i; + + /* remove all the remote info from the list */ + spin_lock_irqsave(&iwpm_reminfo_lock, flags); + for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) { +- hlist_for_each_entry_safe(rem_info, tmp_hlist_node, ++ compat_hlist_for_each_entry_safe(rem_info, tmp_hlist_node, + &iwpm_reminfo_bucket[i], hlist_node) { + + hlist_del_init(&rem_info->hlist_node); +@@ -270,6 +273,7 @@ int iwpm_get_remote_info(struct sockaddr + struct hlist_head *hash_bucket_head; + struct iwpm_remote_info *rem_info = NULL; + unsigned long flags; ++ COMPAT_HL_NODE + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_reminfo_lock, flags); +@@ -279,7 +283,7 @@ int iwpm_get_remote_info(struct sockaddr + mapped_rem_addr); + if (!hash_bucket_head) + goto get_remote_info_exit; +- hlist_for_each_entry_safe(rem_info, tmp_hlist_node, ++ compat_hlist_for_each_entry_safe(rem_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr, +@@ -462,13 +466,24 @@ int iwpm_parse_nlmsg(struct netlink_call + int ret; + const char *err_str = ""; + +- ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, +- nlmsg_policy, NULL); ++#ifdef CONFIG_COMPAT_IS_NLMSG_VALIDATE_NOT_CONST_NLMSGHDR ++ ret = nlmsg_validate((struct nlmsghdr *)cb->nlh, nlh_len, policy_max-1, nlmsg_policy, ++ NULL); ++#elif defined(HAVE_NLMSG_VALIDATE_DEPRECATED) ++ ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, ++ nlmsg_policy, NULL); ++#else ++ ret = nlmsg_validate(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, NULL); ++#endif + if (ret) { + err_str = "Invalid attribute"; + goto parse_nlmsg_error; + } ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, ++#else ++ ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max - 1, ++#endif + nlmsg_policy, NULL); + if (ret) { + err_str = "Unable to parse the nlmsg"; +@@ -643,8 +658,9 @@ int iwpm_send_mapinfo(u8 nl_client, int + int skb_num = 0, mapping_num = 0; + int i = 0, nlmsg_bytes = 0; + unsigned long flags; ++ COMPAT_HL_NODE + const char *err_str = ""; +- int ret; ++ int ret = 0; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { +@@ -656,7 +672,7 @@ int iwpm_send_mapinfo(u8 nl_client, int + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + ret = -EINVAL; + for (i = 0; i 
< IWPM_MAPINFO_HASH_SIZE; i++) { +- hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], ++ compat_hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], + hlist_node) { + if (map_info->nl_client != nl_client) + continue; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0017-BACKPORT-drivers-infiniband-core-lag.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0017-BACKPORT-drivers-infiniband-core-lag.c.patch new file mode 100644 index 0000000..4614d20 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0017-BACKPORT-drivers-infiniband-core-lag.c.patch @@ -0,0 +1,32 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/lag.c + +Change-Id: I823acc086ab4bcdf1b6abfd882c0579c9538b8ad +--- + drivers/infiniband/core/lag.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/core/lag.c ++++ b/drivers/infiniband/core/lag.c +@@ -91,9 +91,13 @@ static struct net_device *rdma_get_xmit_ + return ERR_PTR(-ENOMEM); + + rcu_read_lock(); ++#ifdef HAVE_NETDEV_GET_XMIT_SLAVE + slave = netdev_get_xmit_slave(master, skb, + !!(device->lag_flags & + RDMA_LAG_FLAGS_HASH_ALL_SLAVES)); ++#else ++ slave = NULL; ++#endif + if (slave) + dev_hold(slave); + rcu_read_unlock(); +@@ -130,7 +134,6 @@ struct net_device *rdma_lag_get_ah_roce_ + + if (!netif_is_bond_master(master)) + goto put; +- + slave = rdma_get_xmit_slave_udp(device, master, ah_attr, flags); + put: + dev_put(master); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0018-BACKPORT-drivers-infiniband-core-mad.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0018-BACKPORT-drivers-infiniband-core-mad.c.patch new file mode 100644 index 0000000..17d7203 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0018-BACKPORT-drivers-infiniband-core-mad.c.patch @@ -0,0 +1,445 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/mad.c + +Change-Id: Iee59cdcd26b190f6e365f004acd5eb430bbbf318 +--- + drivers/infiniband/core/mad.c | 134 ++++++++++++++++++++++++++++------ + 1 file changed, 112 insertions(+), 22 deletions(-) + +--- a/drivers/infiniband/core/mad.c ++++ b/drivers/infiniband/core/mad.c +@@ -55,13 +55,18 @@ + #include "opa_smi.h" + #include "agent.h" + ++#ifndef MLX_DISABLE_TRACEPOINTS + #define CREATE_TRACE_POINTS + #include + + #ifdef CONFIG_TRACEPOINTS + static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_qp_info *qp_info, ++#ifdef HAVE_TRACE_EVENTS_H + struct trace_event_raw_ib_mad_send_template *entry) ++#else ++ struct ftrace_raw_ib_mad_send_template *entry) ++#endif + { + u16 pkey; + struct ib_device *dev = qp_info->port_priv->device; +@@ -80,6 +85,7 @@ static void create_mad_addr_info(struct + entry->dlid = rdma_ah_get_dlid(&attr); + } + #endif ++#endif + + static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; + static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; +@@ -151,12 +157,19 @@ static int send_sa_cc_mad(struct ib_mad_ + /* + * Timeout FIFO functions - implements FIFO with timeout mechanism + */ +- ++#ifdef HAVE_TIMER_SETUP + static void activate_timeout_handler_task(struct timer_list *t) ++#else ++static void activate_timeout_handler_task(unsigned long data) ++#endif + { + struct to_fifo *tf; + ++#ifdef HAVE_TIMER_SETUP + tf = from_timer(tf, t, timer); ++#else ++ tf = (struct to_fifo *)data; ++#endif + del_timer(&tf->timer); + queue_work(tf->workq, &tf->work); + } +@@ -264,8 +277,16 @@ static struct to_fifo *tf_create(void) + spin_lock_init(&tf->lists_lock); + INIT_LIST_HEAD(&tf->to_head); + INIT_LIST_HEAD(&tf->fifo_head); ++#ifdef 
HAVE_TIMER_SETUP + timer_setup(&tf->timer, activate_timeout_handler_task, 0); ++#else ++ init_timer(&tf->timer); ++#endif + INIT_WORK(&tf->work, timeout_handler_task); ++#ifndef HAVE_TIMER_SETUP ++ tf->timer.data = (unsigned long)tf; ++ tf->timer.function = activate_timeout_handler_task; ++#endif + tf->timer.expires = jiffies; + tf->stop_enqueue = 0; + tf->num_items = 0; +@@ -812,30 +833,46 @@ struct ib_mad_agent *ib_register_mad_age + /* Validate parameters */ + qpn = get_spl_qp_index(qp_type); + if (qpn == -1) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n", + __func__, qp_type); ++#else ++ dev_notice(&device->dev,"%s: invalid QP Type %d\n",__func__, qp_type); ++#endif + goto error1; + } + + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: invalid RMPP Version %u\n", + __func__, rmpp_version); ++#else ++ dev_notice(&device->dev,"%s: invalid RMPP Version%u\n",__func__, rmpp_version); ++#endif + goto error1; + } + + /* Validate MAD registration request if supplied */ + if (mad_reg_req) { + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: invalid Class Version %u\n", + __func__, + mad_reg_req->mgmt_class_version); ++#else ++ dev_notice(&device->dev,"%s: invalid Class Version %u\n",__func__, mad_reg_req->mgmt_class_version); ++#endif + goto error1; + } + if (!recv_handler) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: no recv_handler\n", __func__); ++#else ++ dev_notice(&device->dev,"%s: no recv_handler\n",__func__); ++#endif + goto error1; + } + if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { +@@ -845,9 +882,13 @@ struct ib_mad_agent *ib_register_mad_age + */ + if (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0x%x\n", + __func__, mad_reg_req->mgmt_class); ++#else ++ dev_notice(&device->dev,"%s: Invalid Mgmt Class 0x%x\n",__func__, mad_reg_req->mgmt_class); ++#endif + goto error1; + } + } else if (mad_reg_req->mgmt_class == 0) { +@@ -855,9 +896,13 @@ struct ib_mad_agent *ib_register_mad_age + * Class 0 is reserved in IBA and is used for + * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + */ ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0\n", + __func__); ++#else ++ dev_notice(&device->dev,"%s: Invalid Mgmt Class 0\n",__func__); ++#endif + goto error1; + } else if (is_vendor_class(mad_reg_req->mgmt_class)) { + /* +@@ -865,19 +910,27 @@ struct ib_mad_agent *ib_register_mad_age + * ensure supplied OUI is not zero + */ + if (!is_vendor_oui(mad_reg_req->oui)) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: No OUI specified for class 0x%x\n", + __func__, + mad_reg_req->mgmt_class); ++#else ++ dev_notice(&device->dev,"%s: No OUI specified for class 0x%x\n",__func__, mad_reg_req->mgmt_class); ++#endif + goto error1; + } + } + /* Make sure class supplied is consistent with RMPP */ + if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { + if (rmpp_version) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: RMPP version for non-RMPP class 0x%x\n", + __func__, mad_reg_req->mgmt_class); ++#else ++ dev_notice(&device->dev,"%s: RMPP version for non-RMPP class 0x%x\n",__func__, mad_reg_req->mgmt_class); ++#endif + goto error1; + } + } +@@ -888,9 +941,13 @@ 
struct ib_mad_agent *ib_register_mad_age + IB_MGMT_CLASS_SUBN_LID_ROUTED) && + (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: Invalid SM QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); ++#else ++ dev_notice(&device->dev,"%s: Invalid SM QP type: class 0x%x\n",__func__, mad_reg_req->mgmt_class); ++#endif + goto error1; + } + } else { +@@ -898,9 +955,13 @@ struct ib_mad_agent *ib_register_mad_age + IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_reg_req->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, + "%s: Invalid GS QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); ++#else ++ dev_notice(&device->dev,"%s: Invalid GS QP type: class 0x%x\n",__func__, mad_reg_req->mgmt_class); ++#endif + goto error1; + } + } +@@ -915,8 +976,12 @@ struct ib_mad_agent *ib_register_mad_age + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %u\n", + __func__, port_num); ++#else ++ dev_notice(&device->dev,"%s: Invalid port %d\n",__func__, port_num); ++#endif + ret = ERR_PTR(-ENODEV); + goto error1; + } +@@ -925,8 +990,12 @@ struct ib_mad_agent *ib_register_mad_age + * will not have QP0. + */ + if (!port_priv->qp_info[qpn].qp) { ++#ifdef RATELIMIT_STATE_INIT + dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n", + __func__, qpn); ++#else ++ dev_notice(&device->dev,"%s: QP %d not supported\n",__func__, qpn); ++#endif + ret = ERR_PTR(-EPROTONOSUPPORT); + goto error1; + } +@@ -973,7 +1042,6 @@ struct ib_mad_agent *ib_register_mad_age + ret = ERR_PTR(ret2); + goto error4; + } +- + /* + * The mlx4 driver uses the top byte to distinguish which virtual + * function generated the MAD, so we must avoid using it. +@@ -990,6 +1058,7 @@ struct ib_mad_agent *ib_register_mad_age + * Make sure MAD registration (if supplied) + * is non overlapping with any existing ones + */ ++ + spin_lock_irq(&port_priv->reg_lock); + if (mad_reg_req) { + mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class); +@@ -1017,19 +1086,20 @@ struct ib_mad_agent *ib_register_mad_age + if (is_vendor_method_in_use( + vendor_class, + mad_reg_req)) +- goto error6; ++ goto error6; + } + } + ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); + } + if (ret2) { + ret = ERR_PTR(ret2); +- goto error6; ++ goto error6; + } + } + spin_unlock_irq(&port_priv->reg_lock); +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_create_agent(mad_agent_priv); ++#endif + return &mad_agent_priv->agent; + error6: + spin_unlock_irq(&port_priv->reg_lock); +@@ -1054,10 +1124,10 @@ static inline void deref_mad_agent(struc + static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) + { + struct ib_mad_port_private *port_priv; +- ++#ifndef MLX_DISABLE_TRACEPOINTS + /* Note that we could still be handling received MADs */ + trace_ib_mad_unregister_agent(mad_agent_priv); +- ++#endif + /* + * Canceling all sends results in dropping received response + * MADs, preventing us from queuing additional work +@@ -1296,9 +1366,9 @@ static int handle_outgoing_dr_smp(struct + */ + if (opa && smp->class_version == OPA_SM_CLASS_VERSION) { + u32 opa_drslid; +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_handle_out_opa_smi(opa_smp); +- ++#endif + if ((opa_get_smp_direction(opa_smp) + ? 
opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == + OPA_LID_PERMISSIVE && +@@ -1324,8 +1394,9 @@ static int handle_outgoing_dr_smp(struct + opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) + goto out; + } else { ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_handle_out_ib_smi(smp); +- ++#endif + if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == + IB_LID_PERMISSIVE && + smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == +@@ -1714,7 +1785,9 @@ int ib_send_mad(struct ib_mad_send_wr_pr + } + } + if (qp_info->send_queue.count < qp_info->send_queue.max_active) { ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_ib_send_mad(mad_send_wr, qp_info); ++#endif + ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, + NULL); + list = &qp_info->send_queue.list; +@@ -2340,7 +2413,6 @@ out: + deref_mad_agent(mad_agent); + mad_agent = NULL; + } +- + return mad_agent; + } + +@@ -2595,9 +2667,9 @@ static enum smi_action handle_ib_smi(con + { + enum smi_forward_action retsmi; + struct ib_smp *smp = (struct ib_smp *)recv->mad; +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_handle_ib_smi(smp); +- ++#endif + if (smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, +@@ -2682,9 +2754,9 @@ handle_opa_smi(struct ib_mad_port_privat + { + enum smi_forward_action retsmi; + struct opa_smp *smp = (struct opa_smp *)recv->mad; +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_handle_opa_smi(smp); +- ++#endif + if (opa_smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, +@@ -2805,10 +2877,10 @@ static void ib_mad_recv_done(struct ib_c + /* Validate MAD */ + if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) + goto out; +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_recv_done_handler(qp_info, wc, + (struct ib_mad_hdr *)recv->mad); +- ++#endif + mad_size = recv->mad_size; + response = alloc_mad_private(mad_size, GFP_KERNEL); + if (!response) +@@ -2855,7 +2927,9 @@ static void ib_mad_recv_done(struct ib_c + + mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); + if (mad_agent) { ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_recv_done_agent(mad_agent); ++#endif + ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); + /* + * recv is freed up in error cases in ib_mad_complete_recv +@@ -3025,10 +3099,10 @@ static void ib_mad_send_done(struct ib_c + mad_list); + send_queue = mad_list->mad_queue; + qp_info = send_queue->qp_info; +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv); + trace_ib_mad_send_done_handler(mad_send_wr, wc); +- ++#endif + retry: + ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, + mad_send_wr->header_mapping, +@@ -3057,7 +3131,9 @@ retry: + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + + if (queued_send_wr) { ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_send_done_resend(queued_send_wr, qp_info); ++#endif + ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, + NULL); + if (ret) { +@@ -3105,7 +3181,9 @@ static bool ib_mad_send_error(struct ib_ + if (mad_send_wr->retry) { + /* Repost send */ + mad_send_wr->retry = 0; ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_mad_error_handler(mad_send_wr, qp_info); ++#endif + ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, + NULL); + if (!ret) +@@ -4040,10 +4118,14 @@ static ssize_t sa_cc_attr_show(struct ko + return sa->show(cc_obj, buf); + } + ++#ifdef CONFIG_COMPAT_IS_CONST_KOBJECT_SYSFS_OPS + static 
const struct sysfs_ops sa_cc_sysfs_ops = { +- .show = sa_cc_attr_show, +- .store = sa_cc_attr_store, +-}; ++#else ++ static struct sysfs_ops sa_cc_sysfs_ops = { ++#endif ++ .show = sa_cc_attr_show, ++ .store = sa_cc_attr_store, ++ }; + + #define SA_CC_ATTR(_name) struct sa_cc_attribute sa_cc_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) +@@ -4060,12 +4142,20 @@ static struct attribute *sa_cc_default_a + &sa_cc_attr_drops.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(sa_cc_default); ++#endif + + static struct kobj_type sa_cc_type = { + .sysfs_ops = &sa_cc_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = sa_cc_default_groups ++#else + .default_attrs = sa_cc_default_attrs ++#endif + }; + ++ + static void cleanup_sa_cc_sysfs_ports(struct sa_cc_data *cc_obj) + { + kobject_put(&cc_obj->kobj); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0019-BACKPORT-drivers-infiniband-core-mad_rmpp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0019-BACKPORT-drivers-infiniband-core-mad_rmpp.c.patch new file mode 100644 index 0000000..45bcfab --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0019-BACKPORT-drivers-infiniband-core-mad_rmpp.c.patch @@ -0,0 +1,24 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/mad_rmpp.c + +Change-Id: Ib90113e87affa04b1817e47b209f2e1cc10c5f9f +--- + drivers/infiniband/core/mad_rmpp.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/infiniband/core/mad_rmpp.c ++++ b/drivers/infiniband/core/mad_rmpp.c +@@ -91,8 +91,13 @@ void ib_cancel_rmpp_recvs(struct ib_mad_ + + spin_lock_irqsave(&agent->lock, flags); + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { ++#ifdef HAVE___CANCEL_DELAYED_WORK ++ __cancel_delayed_work(&rmpp_recv->timeout_work); ++ __cancel_delayed_work(&rmpp_recv->cleanup_work); ++#else + cancel_delayed_work(&rmpp_recv->timeout_work); + cancel_delayed_work(&rmpp_recv->cleanup_work); ++#endif + } + spin_unlock_irqrestore(&agent->lock, flags); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0020-BACKPORT-drivers-infiniband-core-netlink.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0020-BACKPORT-drivers-infiniband-core-netlink.c.patch new file mode 100644 index 0000000..8099c59 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0020-BACKPORT-drivers-infiniband-core-netlink.c.patch @@ -0,0 +1,123 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/netlink.c + +Change-Id: I90f3d938c9cf861bf6b5437ca0c52b17ba430b76 +--- + drivers/infiniband/core/netlink.c | 42 +++++++++++++++++++++++++------ + 1 file changed, 35 insertions(+), 7 deletions(-) + +--- a/drivers/infiniband/core/netlink.c ++++ b/drivers/infiniband/core/netlink.c +@@ -45,6 +45,8 @@ + #include + #include "core_priv.h" + ++ ++ + static struct { + const struct rdma_nl_cbs *cb_table; + /* Synchronizes between ongoing netlink commands and netlink client +@@ -152,8 +154,12 @@ int ibnl_put_attr(struct sk_buff *skb, s + } + EXPORT_SYMBOL(ibnl_put_attr); + ++#ifdef HAVE_NETLINK_EXT_ACK + static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ++#endif + { + int type = nlh->nlmsg_type; + unsigned int index = RDMA_NL_GET_CLIENT(type); +@@ -170,7 +176,7 @@ static int rdma_nl_rcv_msg(struct sk_buf + goto done; + + if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && +- !netlink_capable(skb, CAP_NET_ADMIN)) { ++ !netlink_capable(skb, CAP_NET_ADMIN)) { + 
err = -EPERM; + goto done; + } +@@ -181,7 +187,11 @@ static int rdma_nl_rcv_msg(struct sk_buf + */ + if (index == RDMA_NL_LS) { + if (cb_table[op].doit) ++#ifdef HAVE_NETLINK_EXT_ACK + err = cb_table[op].doit(skb, nlh, extack); ++#else ++ err = cb_table[op].doit(skb, nlh); ++#endif + goto done; + } + /* FIXME: Convert IWCM to properly handle doit callbacks */ +@@ -195,7 +205,11 @@ static int rdma_nl_rcv_msg(struct sk_buf + } + + if (cb_table[op].doit) +- err = cb_table[op].doit(skb, nlh, extack); ++#ifdef HAVE_NETLINK_EXT_ACK ++ err = cb_table[op].doit(skb, nlh, extack); ++#else ++ err = cb_table[op].doit(skb, nlh); ++#endif + done: + up_read(&rdma_nl_types[index].sem); + return err; +@@ -208,10 +222,16 @@ done: + * for that consumer only. + */ + static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, ++#ifdef HAVE_NETLINK_EXT_ACK + struct nlmsghdr *, + struct netlink_ext_ack *)) ++#else ++ struct nlmsghdr *)) ++#endif + { ++#ifdef HAVE_NETLINK_EXT_ACK + struct netlink_ext_ack extack = {}; ++#endif + struct nlmsghdr *nlh; + int err; + +@@ -239,13 +259,21 @@ static int rdma_nl_rcv_skb(struct sk_buf + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + ++#ifdef HAVE_NETLINK_EXT_ACK + err = cb(skb, nlh, &extack); ++#else ++ err = cb(skb, nlh); ++#endif + if (err == -EINTR) + goto skip; + + ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) ++#ifdef HAVE_NETLINK_EXT_ACK + netlink_ack(skb, nlh, err, &extack); ++#else ++ netlink_ack(skb, nlh, err); ++#endif + + skip: + msglen = NLMSG_ALIGN(nlh->nlmsg_len); +@@ -312,12 +340,12 @@ void rdma_nl_exit(void) + int rdma_nl_net_init(struct rdma_dev_net *rnet) + { + struct net *net = read_pnet(&rnet->net); +- struct netlink_kernel_cfg cfg = { +- .input = rdma_nl_rcv, +- }; +- struct sock *nls; ++ struct netlink_kernel_cfg cfg = { ++ .input = rdma_nl_rcv, ++ }; ++ struct sock *nls; + +- nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); ++ nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); + if (!nls) + return -ENOMEM; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0021-BACKPORT-drivers-infiniband-core-nldev.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0021-BACKPORT-drivers-infiniband-core-nldev.c.patch new file mode 100644 index 0000000..489854c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0021-BACKPORT-drivers-infiniband-core-nldev.c.patch @@ -0,0 +1,633 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/nldev.c + +Change-Id: I9e3f9f5413dd1c87639260a189e3812c0506cc97 +--- + drivers/infiniband/core/nldev.c | 250 +++++++++++++++++++++++++++----- + 1 file changed, 217 insertions(+), 33 deletions(-) + +--- a/drivers/infiniband/core/nldev.c ++++ b/drivers/infiniband/core/nldev.c +@@ -68,9 +68,9 @@ static const struct nla_policy nldev_pol + [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, +- [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, +- [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, +- [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, ++ [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, ++ [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, ++ [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, + [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, +@@ -366,9 +366,12 @@ static int fill_res_info_entry(struct sk + const char *name, u64 curr) + { + struct nlattr *entry_attr; +- 
++#ifdef HAVE_NLA_NEST_START_NOFLAG + entry_attr = nla_nest_start_noflag(msg, + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); ++#else ++ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); ++#endif + if (!entry_attr) + return -EMSGSIZE; + +@@ -404,7 +407,11 @@ static int fill_res_info(struct sk_buff + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + ++#ifdef HAVE_NLA_NEST_START_NOFLAG + table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); ++#else ++ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); ++#endif + if (!table_attr) + return -EMSGSIZE; + +@@ -1012,8 +1019,12 @@ static int fill_res_counter_entry(struct + return 0; + } + ++#ifdef HAVE_NETLINK_EXT_ACK + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ++#endif + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; +@@ -1021,8 +1032,16 @@ static int nldev_get_doit(struct sk_buff + u32 index; + int err; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#else ++ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif/*HAVE_NLMSG_PARSE_DEPRECATED*/ ++#ifdef HAVE_NETLINK_EXT_ACK + nldev_policy, extack); ++#else ++ nldev_policy, NULL); ++#endif + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + +@@ -1058,16 +1077,29 @@ err: + return err; + } + ++#ifdef HAVE_NETLINK_EXT_ACK + static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ++#endif + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + ++#ifdef HAVE_NETLINK_EXT_ACK ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); ++#else ++ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++ nldev_policy, extack); ++#endif /*HAVE_NLMSG_PARSE_DEPRECATED*/ ++#else ++ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NULL); ++#endif + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + +@@ -1079,8 +1111,13 @@ static int nldev_set_doit(struct sk_buff + if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { + char name[IB_DEVICE_NAME_MAX] = {}; + ++#ifdef HAVE_NLA_STRSCPY + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + IB_DEVICE_NAME_MAX); ++#else ++ nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], ++ IB_DEVICE_NAME_MAX); ++#endif + if (strlen(name) == 0) { + err = -EINVAL; + goto done; +@@ -1148,8 +1185,12 @@ static int nldev_get_dumpit(struct sk_bu + return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); + } + ++#ifdef HAVE_NETLINK_EXT_ACK + static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ++#endif + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; +@@ -1158,8 +1199,16 @@ static int nldev_port_get_doit(struct sk + u32 port; + int err; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +- nldev_policy, extack); ++#else ++ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif /*HAVE_NLMSG_PARSE_DEPRECATED*/ ++#ifdef HAVE_NETLINK_EXT_ACK ++ nldev_policy, extack); ++#else ++ nldev_policy, NULL); ++#endif + if (err || + 
!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) +@@ -1193,7 +1242,7 @@ static int nldev_port_get_doit(struct sk + nlmsg_end(msg, nlh); + ib_device_put(device); + +- return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); ++ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + + err_free: + nlmsg_free(msg); +@@ -1214,7 +1263,11 @@ static int nldev_port_get_dumpit(struct + int err; + unsigned int p; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#else ++ err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif + nldev_policy, NULL); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; +@@ -1260,8 +1313,12 @@ out: + return skb->len; + } + ++#ifdef HAVE_NETLINK_EXT_ACK + static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ++#endif + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; +@@ -1269,8 +1326,17 @@ static int nldev_res_get_doit(struct sk_ + u32 index; + int ret; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +- nldev_policy, extack); ++#else ++ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif /*HAVE_NLMSG_PARSE_DEPRECATED*/ ++#ifdef HAVE_NETLINK_EXT_ACK ++ nldev_policy, extack); ++#else ++ nldev_policy, NULL); ++#endif ++ + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + +@@ -1295,7 +1361,7 @@ static int nldev_res_get_doit(struct sk_ + + nlmsg_end(msg, nlh); + ib_device_put(device); +- return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); ++ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + + err_free: + nlmsg_free(msg); +@@ -1399,7 +1465,9 @@ static const struct nldev_fill_res_entry + }; + + static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, ++#ifdef HAVE_NETLINK_EXT_ACK + struct netlink_ext_ack *extack, ++#endif + enum rdma_restrack_type res_type, + res_fill_func_t fill_func) + { +@@ -1412,7 +1480,11 @@ static int res_get_common_doit(struct sk + struct sk_buff *msg; + int ret; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#else ++ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) + return -EINVAL; +@@ -1449,7 +1521,7 @@ static int res_get_common_doit(struct sk + goto err_get; + } + +- nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, ++ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(nlh->nlmsg_type)), + 0, 0); +@@ -1459,7 +1531,7 @@ static int res_get_common_doit(struct sk + goto err_free; + } + +- has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); ++ has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); + + ret = fill_func(msg, has_cap_net_admin, res, port); + if (ret) +@@ -1499,7 +1571,11 @@ static int res_get_common_dumpit(struct + u32 index, port = 0; + bool filled = false; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#else ++ err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif + nldev_policy, NULL); + /* + * Right now, we are expecting the device index to get res information, +@@ -1538,13 
+1614,17 @@ static int res_get_common_dumpit(struct + goto err; + } + ++#ifdef HAVE_NLA_NEST_START_NOFLAG + table_attr = nla_nest_start_noflag(skb, fe->nldev_attr); ++#else ++ table_attr = nla_nest_start(skb, fe->nldev_attr); ++#endif + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + +- has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); ++ has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); + + rt = &device->res[res_type]; + xa_lock(&rt->xa); +@@ -1561,7 +1641,11 @@ static int res_get_common_dumpit(struct + + filled = true; + ++#ifdef HAVE_NLA_NEST_START_NOFLAG + entry_attr = nla_nest_start_noflag(skb, fe->entry); ++#else ++ entry_attr = nla_nest_start(skb, fe->entry); ++#endif + if (!entry_attr) { + ret = -EMSGSIZE; + rdma_restrack_put(res); +@@ -1603,7 +1687,6 @@ msg_full: + + res_err: + nla_nest_cancel(skb, table_attr); +- + err: + nlmsg_cancel(skb, nlh); + +@@ -1612,6 +1695,7 @@ err_index: + return ret; + } + ++#ifdef HAVE_NETLINK_EXT_ACK + #define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ + struct netlink_callback *cb) \ +@@ -1626,6 +1710,21 @@ err_index: + return res_get_common_doit(skb, nlh, extack, type, \ + fill_res_##name##_entry); \ + } ++#else ++#define RES_GET_FUNCS(name, type) \ ++ static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ ++ struct netlink_callback *cb) \ ++ { \ ++ return res_get_common_dumpit(skb, cb, type, \ ++ fill_res_##name##_entry); \ ++ } \ ++ static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ ++ struct nlmsghdr *nlh) \ ++ { \ ++ return res_get_common_doit(skb, nlh, type, \ ++ fill_res_##name##_entry); \ ++ } ++#endif + + RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); + RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); +@@ -1674,8 +1773,11 @@ void rdma_link_unregister(struct rdma_li + } + EXPORT_SYMBOL(rdma_link_unregister); + +-static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; +@@ -1685,20 +1787,35 @@ static int nldev_newlink(struct sk_buff + char type[IFNAMSIZ]; + int err; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#else ++ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + ++#ifdef HAVE_NLA_STRSCPY + nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); ++#else ++ nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], ++ sizeof(ibdev_name)); ++#endif + if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) + return -EINVAL; + ++#ifdef HAVE_NLA_STRSCPY + nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); ++#else ++ nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); ++ nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], ++ sizeof(ndev_name)); ++#endif + + ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); + if (!ndev) +@@ -1721,15 +1838,22 @@ static int nldev_newlink(struct sk_buff + return err; + } + +-static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int 
nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + ++#ifdef HAVE_NLMSG_PARSE_DEPRECATED + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#else ++ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, ++#endif + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; +@@ -1748,8 +1872,11 @@ static int nldev_dellink(struct sk_buff + return 0; + } + +-static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; +@@ -1764,8 +1891,13 @@ static int nldev_get_chardev(struct sk_b + if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) + return -EINVAL; + ++#ifdef HAVE_NLA_STRSCPY + nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)); ++#else ++ nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], ++ sizeof(client_name)); ++#endif + + if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); +@@ -1832,8 +1964,11 @@ out_put: + return err; + } + +-static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct sk_buff *msg; +@@ -1876,8 +2011,11 @@ static int nldev_sys_get_doit(struct sk_ + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + } + +-static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + u8 enable; +@@ -1898,7 +2036,9 @@ static int nldev_set_sys_set_doit(struct + } + + static int nldev_stat_set_mode_doit(struct sk_buff *msg, ++#ifdef HAVE_NETLINK_EXT_ACK + struct netlink_ext_ack *extack, ++#endif + struct nlattr *tb[], + struct ib_device *device, u32 port) + { +@@ -1915,7 +2055,11 @@ static int nldev_stat_set_mode_doit(stru + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32( + tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); +- return rdma_counter_set_auto_mode(device, port, mask, extack); ++ return rdma_counter_set_auto_mode(device, port, mask ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + } + + if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) +@@ -1991,8 +2135,11 @@ out: + return ret; + } + +-static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; +@@ -2039,7 +2186,11 @@ static int nldev_stat_set_doit(struct sk + } + + if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) { +- ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port); ++ ret = nldev_stat_set_mode_doit(msg, ++#ifdef HAVE_NETLINK_EXT_ACK ++ 
extack, ++#endif ++ tb, device, port); + if (ret) + goto err_free_msg; + } +@@ -2061,8 +2212,11 @@ err_put_device: + return ret; + } + +-static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; +@@ -2118,7 +2272,7 @@ static int nldev_stat_del_doit(struct sk + + nlmsg_end(msg, nlh); + ib_device_put(device); +- return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); ++ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + + err_fill: + nlmsg_free(msg); +@@ -2129,8 +2283,12 @@ err: + + static int stat_get_doit_default_counter(struct sk_buff *skb, + struct nlmsghdr *nlh, ++#ifdef HAVE_NETLINK_EXT_ACK + struct netlink_ext_ack *extack, + struct nlattr *tb[]) ++#else ++ struct nlattr *tb[]) ++#endif + { + struct rdma_hw_stats *stats; + struct nlattr *table_attr; +@@ -2221,7 +2379,11 @@ err: + } + + static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, ++#ifdef HAVE_NETLINK_EXT_ACK + struct netlink_ext_ack *extack, struct nlattr *tb[]) ++#else ++ struct nlattr *tb[]) ++#endif + + { + static enum rdma_nl_counter_mode mode; +@@ -2232,7 +2394,11 @@ static int stat_get_doit_qp(struct sk_bu + int ret; + + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) ++#ifdef HAVE_NETLINK_EXT_ACK + return nldev_res_get_counter_doit(skb, nlh, extack); ++#else ++ return nldev_res_get_counter_doit(skb, nlh); ++#endif + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) +@@ -2288,8 +2454,11 @@ err: + return ret; + } + +-static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; +@@ -2300,14 +2469,26 @@ static int nldev_stat_get_doit(struct sk + return -EINVAL; + + if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) ++#ifdef HAVE_NETLINK_EXT_ACK + return stat_get_doit_default_counter(skb, nlh, extack, tb); ++#else ++ return stat_get_doit_default_counter(skb, nlh, tb); ++#endif + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: ++#ifdef HAVE_NETLINK_EXT_ACK + ret = stat_get_doit_qp(skb, nlh, extack, tb); ++#else ++ ret = stat_get_doit_qp(skb, nlh, tb); ++#endif + break; + case RDMA_NLDEV_ATTR_RES_MR: ++#ifdef HAVE_NETLINK_EXT_ACK + ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR, ++#else ++ ret = res_get_common_doit(skb, nlh, RDMA_RESTRACK_MR, ++#endif + fill_stat_mr_entry); + break; + default: +@@ -2346,8 +2527,11 @@ static int nldev_stat_get_dumpit(struct + } + + static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, +- struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack) ++ struct nlmsghdr *nlh ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; + struct rdma_hw_stats *stats; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0022-BACKPORT-drivers-infiniband-core-peer_mem.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0022-BACKPORT-drivers-infiniband-core-peer_mem.c.patch new file mode 100644 index 0000000..e30fad2 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0022-BACKPORT-drivers-infiniband-core-peer_mem.c.patch @@ -0,0 +1,163 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/peer_mem.c + +Change-Id: I558f4e0e855687c2a9adcb6de60f43a6b5d42583 +--- + drivers/infiniband/core/peer_mem.c | 66 +++++++++++++++++++++++++++++- + 1 file changed, 64 insertions(+), 2 deletions(-) + +--- a/drivers/infiniband/core/peer_mem.c ++++ b/drivers/infiniband/core/peer_mem.c +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + #include "ib_peer_mem.h" + + static DEFINE_MUTEX(peer_memory_mutex); +@@ -331,21 +332,41 @@ static void ib_unmap_peer_client(struct + } + + if (to_state == UMEM_PEER_UNMAPPED) { ++#ifdef HAVE_SG_APPEND_TABLE + peer_mem->dma_unmap(&umem_p->umem.sgt_append.sgt, ++#else ++ peer_mem->dma_unmap(&umem_p->umem.sg_head, ++#endif + umem_p->peer_client_context, ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER + umem_p->umem.ibdev->dma_device); ++#else ++ umem_p->umem.context->device->dma_device); ++#endif ++#ifdef HAVE_SG_APPEND_TABLE + peer_mem->put_pages(&umem_p->umem.sgt_append.sgt, ++#else ++ peer_mem->put_pages(&umem_p->umem.sg_head, ++#endif + umem_p->peer_client_context); + } + ++#ifdef HAVE_SG_APPEND_TABLE + memset(&umem->sgt_append.sgt, 0, sizeof(umem->sgt_append.sgt)); ++#else ++ memset(&umem->sg_head, 0, sizeof(umem->sg_head)); ++#endif + atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs); + } + + if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) || + (cur_state == UMEM_PEER_INVALIDATED && + to_state == UMEM_PEER_UNMAPPED)) { ++#ifdef HAVE_SG_APPEND_TABLE + atomic64_add(umem->sgt_append.sgt.nents, ++#else ++ atomic64_add(umem->sg_head.nents, ++#endif + &ib_peer_client->stats.num_dereg_pages); + atomic64_add(umem->length, + &ib_peer_client->stats.num_dereg_bytes); +@@ -517,8 +538,11 @@ static void fix_peer_sgls(struct ib_umem + struct ib_umem *umem = &umem_p->umem; + struct scatterlist *sg; + int i; +- ++#ifdef HAVE_SG_APPEND_TABLE + for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { ++#else ++ for_each_sg(umem_p->umem.sg_head.sgl, sg, umem_p->umem.nmap, i) { ++#endif + if (i == 0) { + unsigned long offset; + +@@ -534,7 +558,11 @@ static void fix_peer_sgls(struct ib_umem + sg->length -= offset; + } + ++#ifdef HAVE_SG_APPEND_TABLE + if (i == umem_p->umem.sgt_append.sgt.nents - 1) { ++#else ++ if (i == umem_p->umem.nmap - 1) { ++#endif + unsigned long trim; + + umem_p->last_sg = sg; +@@ -573,7 +601,11 @@ struct ib_umem *ib_peer_umem_get(struct + + kref_init(&umem_p->kref); + umem_p->umem = *old_umem; ++#ifdef HAVE_SG_APPEND_TABLE + memset(&umem_p->umem.sgt_append.sgt, 0, sizeof(umem_p->umem.sgt_append.sgt)); ++#else ++ memset(&umem_p->umem.sg_head, 0, sizeof(umem_p->umem.sg_head)); ++#endif + umem_p->umem.is_peer = 1; + umem_p->ib_peer_client = ib_peer_client; + umem_p->peer_client_context = peer_client_context; +@@ -605,10 +637,22 @@ struct ib_umem *ib_peer_umem_get(struct + if (ret) + goto err_xa; + ++#ifdef HAVE_SG_APPEND_TABLE + ret = ib_peer_client->peer_mem->dma_map(&umem_p->umem.sgt_append.sgt, ++#else ++ ret = ib_peer_client->peer_mem->dma_map(&umem_p->umem.sg_head, ++#endif + peer_client_context, ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER + umem_p->umem.ibdev->dma_device, ++#else ++ umem_p->umem.context->device->dma_device, ++#endif ++#ifdef HAVE_SG_APPEND_TABLE + 0, &umem_p->umem.sgt_append.sgt.nents); ++#else ++ 0, &umem_p->umem.nmap); ++#endif + if (ret) + goto err_pages; + +@@ -618,7 +662,11 @@ struct ib_umem 
*ib_peer_umem_get(struct + fix_peer_sgls(umem_p, peer_page_size); + + umem_p->mapped_state = UMEM_PEER_MAPPED; ++#ifdef HAVE_SG_APPEND_TABLE + atomic64_add(umem_p->umem.sgt_append.sgt.nents, &ib_peer_client->stats.num_reg_pages); ++#else ++ atomic64_add(umem_p->umem.nmap, &ib_peer_client->stats.num_reg_pages); ++#endif + atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes); + atomic64_inc(&ib_peer_client->stats.num_alloc_mrs); + +@@ -639,7 +687,11 @@ struct ib_umem *ib_peer_umem_get(struct + return &umem_p->umem; + + err_pages: ++#ifdef HAVE_SG_APPEND_TABLE + ib_peer_client->peer_mem->put_pages(&umem_p->umem.sgt_append.sgt, ++#else ++ ib_peer_client->peer_mem->put_pages(&umem_p->umem.sg_head, ++#endif + umem_p->peer_client_context); + err_xa: + if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) +@@ -672,7 +724,17 @@ void ib_peer_umem_release(struct ib_umem + umem_p->ib_peer_client = NULL; + + /* Must match ib_umem_release() */ +- atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); ++#ifdef HAVE_ATOMIC_PINNED_VM ++ atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); ++#else ++ down_write(&umem->owning_mm->mmap_sem); ++#ifdef HAVE_PINNED_VM ++ umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); ++#else ++ umem->owning_mm->locked_vm -= ib_umem_num_pages(umem); ++#endif /* HAVE_PINNED_VM */ ++ up_write(&umem->owning_mm->mmap_sem); ++#endif /*HAVE_ATOMIC_PINNED_VM*/ + mmdrop(umem->owning_mm); + + kref_put(&umem_p->kref, ib_peer_umem_kref_release); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0023-BACKPORT-drivers-infiniband-core-rdma_core.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0023-BACKPORT-drivers-infiniband-core-rdma_core.c.patch new file mode 100644 index 0000000..3253982 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0023-BACKPORT-drivers-infiniband-core-rdma_core.c.patch @@ -0,0 +1,86 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/rdma_core.c + +Change-Id: If107ff28d8fe5101bdfe8f17b1c704d78d4f369c +--- + drivers/infiniband/core/rdma_core.c | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +--- a/drivers/infiniband/core/rdma_core.c ++++ b/drivers/infiniband/core/rdma_core.c +@@ -74,7 +74,13 @@ static int uverbs_try_lock_object(struct + */ + switch (mode) { + case UVERBS_LOOKUP_READ: ++#ifdef HAVE_ATOMIC_FETCH_ADD_UNLESS + return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ? ++#elif defined(HAVE___ATOMIC_ADD_UNLESS) ++ return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ? ++#else ++ return atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ? 
++#endif + -EBUSY : 0; + case UVERBS_LOOKUP_WRITE: + /* lock is exclusive */ +@@ -435,15 +441,19 @@ alloc_begin_idr_uobject(const struct uve + if (ret) + goto uobj_put; + ++#ifdef HAVE_CGROUP_RDMA_H + ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto remove; ++#endif + + return uobj; + ++#ifdef HAVE_CGROUP_RDMA_H + remove: + xa_erase(&attrs->ufile->idr, uobj->id); ++#endif + uobj_put: + uverbs_uobject_put(uobj); + return ERR_PTR(ret); +@@ -522,9 +532,10 @@ struct ib_uobject *rdma_alloc_begin_uobj + + static void alloc_abort_idr_uobject(struct ib_uobject *uobj) + { ++#ifdef HAVE_CGROUP_RDMA_H + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); +- ++#endif + xa_erase(&uobj->ufile->idr, uobj->id); + } + +@@ -543,15 +554,17 @@ static int __must_check destroy_hw_idr_u + if (why == RDMA_REMOVE_ABORT) + return 0; + ++#ifdef HAVE_CGROUP_RDMA_H + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); +- ++#endif + return 0; + } + + static void remove_handle_idr_uobject(struct ib_uobject *uobj) + { + xa_erase(&uobj->ufile->idr, uobj->id); ++ + /* Matches the kref in alloc_commit_idr_uobject */ + uverbs_uobject_put(uobj); + } +@@ -865,8 +878,10 @@ static void ufile_destroy_ucontext(struc + ib_dev->ops.disassociate_ucontext(ucontext); + } + ++#ifdef HAVE_CGROUP_RDMA_H + ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_HANDLE); ++#endif + + rdma_restrack_del(&ucontext->res); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0024-BACKPORT-drivers-infiniband-core-roce_gid_mgmt.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0024-BACKPORT-drivers-infiniband-core-roce_gid_mgmt.c.patch new file mode 100644 index 0000000..d3ade2d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0024-BACKPORT-drivers-infiniband-core-roce_gid_mgmt.c.patch @@ -0,0 +1,397 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/roce_gid_mgmt.c + +Change-Id: I198576d997dd95931f9811cabfefaf2dbe102eae +--- + drivers/infiniband/core/roce_gid_mgmt.c | 188 ++++++++++++++++++++---- + 1 file changed, 156 insertions(+), 32 deletions(-) + +--- a/drivers/infiniband/core/roce_gid_mgmt.c ++++ b/drivers/infiniband/core/roce_gid_mgmt.c +@@ -37,11 +37,20 @@ + + /* For in6_dev_get/in6_dev_put */ + #include ++#ifdef MLX_USE_LAG_COMPAT ++#define MLX_IMPL_LAG_EVENTS ++#endif + #include + + #include + #include + ++#if defined(MLX_USE_LAG_COMPAT) || \ ++ (defined(HAVE_NETDEV_NOTIFIER_CHANGEUPPER_INFO) && \ ++ (defined(HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU) || defined(HAVE_NETDEV_FOR_EACH_ALL_UPPER_DEV_RCU))) ++#define USE_UPPER_INFO ++#endif ++ + static struct workqueue_struct *gid_cache_wq; + + enum gid_op_type { +@@ -334,7 +343,9 @@ static void bond_delete_netdev_default_g + static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, + u32 port, struct net_device *ndev) + { ++#ifndef HAVE_FOR_IFA + const struct in_ifaddr *ifa; ++#endif + struct in_device *in_dev; + struct sin_list { + struct list_head list; +@@ -354,7 +365,11 @@ static void enum_netdev_ipv4_ips(struct + return; + } + ++#ifndef HAVE_FOR_IFA + in_dev_for_each_ifa_rcu(ifa, in_dev) { ++#else ++ for_ifa(in_dev) { ++#endif + struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + + if (!entry) +@@ -364,6 +379,9 @@ static void enum_netdev_ipv4_ips(struct + entry->ip.sin_addr.s_addr = ifa->ifa_address; + list_add_tail(&entry->list, &sin_list); + } ++#ifdef HAVE_FOR_IFA ++ endfor_ifa(in_dev); ++#endif + + 
rcu_read_unlock(); + +@@ -375,6 +393,7 @@ static void enum_netdev_ipv4_ips(struct + } + } + ++#if IS_ENABLED(CONFIG_IPV6) + static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, + u32 port, struct net_device *ndev) + { +@@ -420,13 +439,15 @@ static void enum_netdev_ipv6_ips(struct + kfree(sin6_iter); + } + } ++#endif + + static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, + struct net_device *ndev) + { + enum_netdev_ipv4_ips(ib_dev, port, ndev); +- if (IS_ENABLED(CONFIG_IPV6)) +- enum_netdev_ipv6_ips(ib_dev, port, ndev); ++#if IS_ENABLED(CONFIG_IPV6) ++ enum_netdev_ipv6_ips(ib_dev, port, ndev); ++#endif + } + + static void add_netdev_ips(struct ib_device *ib_dev, u32 port, +@@ -441,27 +462,6 @@ static void del_netdev_ips(struct ib_dev + ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); + } + +-/** +- * del_default_gids - Delete default GIDs of the event/cookie netdevice +- * @ib_dev: RDMA device pointer +- * @port: Port of the RDMA device whose GID table to consider +- * @rdma_ndev: Unused rdma netdevice +- * @cookie: Pointer to event netdevice +- * +- * del_default_gids() deletes the default GIDs of the event/cookie netdevice. +- */ +-static void del_default_gids(struct ib_device *ib_dev, u32 port, +- struct net_device *rdma_ndev, void *cookie) +-{ +- struct net_device *cookie_ndev = cookie; +- unsigned long gid_type_mask; +- +- gid_type_mask = roce_gid_type_mask_support(ib_dev, port); +- +- ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, +- IB_CACHE_GID_DEFAULT_MODE_DELETE); +-} +- + static void add_default_gids(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) + { +@@ -485,7 +485,9 @@ static void enum_all_gids_of_dev_cb(stru + * our feet + */ + rtnl_lock(); ++#ifdef HAVE_RTNETLINK_NET_RWSEM + down_read(&net_rwsem); ++#endif + for_each_net(net) + for_each_netdev(net, ndev) { + /* +@@ -501,7 +503,9 @@ static void enum_all_gids_of_dev_cb(stru + rdma_ndev, ndev)) + _add_netdev_ips(ib_dev, port, ndev); + } ++#ifdef HAVE_RTNETLINK_NET_RWSEM + up_read(&net_rwsem); ++#endif + rtnl_unlock(); + } + +@@ -533,16 +537,27 @@ static void callback_for_addr_gid_device + &parsed->gid_attr); + } + ++#ifdef USE_UPPER_INFO ++ ++#ifdef HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU + struct upper_list { + struct list_head list; + struct net_device *upper; + }; + + static int netdev_upper_walk(struct net_device *upper, +- struct netdev_nested_priv *priv) ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT ++ struct netdev_nested_priv *priv) ++#else ++ void *data) ++#endif + { + struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + struct list_head *upper_list = (struct list_head *)priv->data; ++#else ++ struct list_head *upper_list = data; ++#endif + + if (!entry) + return 0; +@@ -553,6 +568,7 @@ static int netdev_upper_walk(struct net_ + + return 0; + } ++#endif /* HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU */ + + static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, + void *cookie, +@@ -561,15 +577,64 @@ static void handle_netdev_upper(struct i + struct net_device *ndev)) + { + struct net_device *ndev = cookie; ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + struct netdev_nested_priv priv; ++#endif ++#ifndef HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU ++ struct upper_list { ++ struct list_head list; ++ struct net_device *upper; ++ }; ++ struct net_device *upper; ++#ifndef MLX_USE_LAG_COMPAT ++ struct list_head *iter; ++#endif ++#endif /* HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU */ + struct upper_list *upper_iter; + 
struct upper_list *upper_temp; + LIST_HEAD(upper_list); + ++#ifdef MLX_USE_LAG_COMPAT ++ rtnl_lock(); ++#endif ++ ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + priv.data = &upper_list; ++#endif + rcu_read_lock(); +- netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); ++#ifndef HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU ++#ifndef MLX_USE_LAG_COMPAT ++ netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) { ++ struct upper_list *entry; ++#else ++ for_each_netdev(dev_net(ndev), upper) { ++ struct upper_list *entry; ++ ++ if (!rdma_is_upper_dev_rcu(ndev, upper)) ++ continue; ++#endif ++ entry = kmalloc(sizeof(*entry), GFP_ATOMIC); ++ if (!entry) { ++ pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n"); ++ continue; ++ } ++ ++ list_add_tail(&entry->list, &upper_list); ++ dev_hold(upper); ++ entry->upper = upper; ++ } ++#else /* HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU */ ++ netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT ++ &priv); ++#else ++ &upper_list); ++#endif ++#endif /* HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU */ + rcu_read_unlock(); ++#ifdef MLX_USE_LAG_COMPAT ++ rtnl_unlock(); ++#endif + + handle_netdev(ib_dev, port, ndev); + list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, +@@ -598,6 +663,7 @@ static void add_netdev_upper_ips(struct + { + handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); + } ++#endif /* USE_UPPER_INFO */ + + static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, +@@ -673,11 +739,33 @@ static const struct netdev_event_work_cm + .filter = is_eth_port_of_netdev_filter + }; + +-static const struct netdev_event_work_cmd add_cmd_upper_ips = { +- .cb = add_netdev_upper_ips, +- .filter = is_eth_port_of_netdev_filter ++static const struct netdev_event_work_cmd bonding_default_add_cmd = { ++ .cb = add_default_gids, ++ .filter = is_upper_ndev_bond_master_filter + }; + ++#ifdef USE_UPPER_INFO ++/** ++ * del_default_gids - Delete default GIDs of the event/cookie netdevice ++ * @ib_dev: RDMA device pointer ++ * @port: Port of the RDMA device whose GID table to consider ++ * @rdma_ndev: Unused rdma netdevice ++ * @cookie: Pointer to event netdevice ++ * ++ * del_default_gids() deletes the default GIDs of the event/cookie netdevice. 
++ */ ++static void del_default_gids(struct ib_device *ib_dev, u32 port, ++ struct net_device *rdma_ndev, void *cookie) ++{ ++ struct net_device *cookie_ndev = cookie; ++ unsigned long gid_type_mask; ++ ++ gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ++ ++ ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, ++ IB_CACHE_GID_DEFAULT_MODE_DELETE); ++} ++ + static void + ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +@@ -693,9 +781,9 @@ ndev_event_unlink(struct netdev_notifier + cmds[1] = add_cmd; + } + +-static const struct netdev_event_work_cmd bonding_default_add_cmd = { +- .cb = add_default_gids, +- .filter = is_upper_ndev_bond_master_filter ++static const struct netdev_event_work_cmd add_cmd_upper_ips = { ++ .cb = add_netdev_upper_ips, ++ .filter = is_eth_port_of_netdev_filter + }; + + static void +@@ -736,6 +824,7 @@ static void netdevice_event_changeupper( + else + ndev_event_unlink(changeupper_info, cmds); + } ++#endif + + static const struct netdev_event_work_cmd add_default_gid_cmd = { + .cb = add_default_gids, +@@ -758,9 +847,22 @@ static int netdevice_event(struct notifi + .filter = is_eth_port_of_netdev_filter + }; + static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { ++#ifdef USE_UPPER_INFO + .cb = del_netdev_upper_ips, .filter = upper_device_filter}; +- struct net_device *ndev = netdev_notifier_info_to_dev(ptr); ++#else ++ .cb = del_netdev_ips, .filter = upper_device_filter}; ++#endif + struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; ++ struct net_device *ndev; ++ ++#ifdef MLX_USE_LAG_COMPAT ++ if (event == NETDEV_CHANGEUPPER || event == NETDEV_CHANGELOWERSTATE) ++ ndev = netdev_notifier_info_to_dev_v2(ptr); ++ else ++ ndev = netdev_notifier_info_to_dev(ptr); ++#else ++ ndev = netdev_notifier_info_to_dev(ptr); ++#endif + + if (ndev->type != ARPHRD_ETHER) + return NOTIFY_DONE; +@@ -768,6 +870,9 @@ static int netdevice_event(struct notifi + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UP: ++#ifndef USE_UPPER_INFO ++ case NETDEV_JOIN: ++#endif + cmds[0] = bonding_default_del_cmd_join; + cmds[1] = add_default_gid_cmd; + cmds[2] = add_cmd; +@@ -788,18 +893,24 @@ static int netdevice_event(struct notifi + } + break; + ++#ifdef USE_UPPER_INFO + case NETDEV_CHANGEUPPER: + netdevice_event_changeupper(ndev, + container_of(ptr, struct netdev_notifier_changeupper_info, info), + cmds); + break; ++#endif + + case NETDEV_BONDING_FAILOVER: + cmds[0] = bonding_event_ips_del_cmd; + /* Add default GIDs of the bond device */ + cmds[1] = bonding_default_add_cmd; ++#ifdef USE_UPPER_INFO + /* Add IP based GIDs of the bond device */ + cmds[2] = add_cmd_upper_ips; ++#else ++ cmds[2] = add_cmd; ++#endif + break; + + default: +@@ -904,6 +1015,13 @@ static struct notifier_block nb_inet6add + .notifier_call = inet6addr_event + }; + ++#ifdef MLX_USE_LAG_COMPAT ++static void roce_lag_compat_netdev_event(unsigned long event, void *ptr) ++{ ++ nb_netdevice.notifier_call(&nb_netdevice, event, ptr); ++} ++#endif ++ + int __init roce_gid_mgmt_init(void) + { + gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); +@@ -920,11 +1038,17 @@ int __init roce_gid_mgmt_init(void) + */ + register_netdevice_notifier(&nb_netdevice); + ++#ifdef MLX_USE_LAG_COMPAT ++ mlx_lag_compat_events_open(roce_lag_compat_netdev_event); ++#endif + return 0; + } + + void __exit roce_gid_mgmt_cleanup(void) + { ++#ifdef MLX_USE_LAG_COMPAT ++ mlx_lag_compat_events_close(); ++#endif + if 
(IS_ENABLED(CONFIG_IPV6)) + unregister_inet6addr_notifier(&nb_inet6addr); + unregister_inetaddr_notifier(&nb_inetaddr); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0025-BACKPORT-drivers-infiniband-core-sa_query.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0025-BACKPORT-drivers-infiniband-core-sa_query.c.patch new file mode 100644 index 0000000..6715ac0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0025-BACKPORT-drivers-infiniband-core-sa_query.c.patch @@ -0,0 +1,87 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/sa_query.c + +Change-Id: I53449d1b2c1ab4a4fec05166682aeeb205560b57 +--- + drivers/infiniband/core/sa_query.c | 30 +++++++++++++++++++++++++++--- + 1 file changed, 27 insertions(+), 3 deletions(-) + +--- a/drivers/infiniband/core/sa_query.c ++++ b/drivers/infiniband/core/sa_query.c +@@ -714,10 +714,17 @@ static void ib_nl_set_path_rec_attrs(str + query->mad_buf->context[1] = NULL; + + /* Construct the family header first */ +- header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); ++ header = (struct rdma_ls_resolve_header *) ++ skb_put(skb, NLMSG_ALIGN(sizeof(*header))); ++#ifdef HAVE_STRSCPY_PAD + strscpy_pad(header->device_name, + dev_name(&query->port->agent->device->dev), + LS_DEVICE_NAME_MAX); ++#else ++ strncpy(header->device_name, ++ dev_name(&query->port->agent->device->dev), ++ LS_DEVICE_NAME_MAX); ++#endif + header->port_num = query->port->port_num; + + if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && +@@ -995,8 +1002,12 @@ static void ib_nl_request_timeout(struct + } + + int ib_nl_handle_set_timeout(struct sk_buff *skb, ++#ifdef HAVE_NETLINK_EXT_ACK + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++ struct nlmsghdr *nlh) ++#endif + { + int timeout, delta, abs_delta; + const struct nlattr *attr; +@@ -1007,11 +1018,15 @@ int ib_nl_handle_set_timeout(struct sk_b + int ret; + + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || +- !(NETLINK_CB(skb).sk)) ++ !(NETLINK_CB(skb).sk)) + return -EPERM; +- ++#ifdef HAVE_NLA_PARSE_DEPRECATED + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy, NULL); ++#else ++ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), ++ nlmsg_len(nlh), ib_nl_policy, NULL); ++#endif /*HAVE_NLA_PARSE_DEPRECATED*/ + attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; + if (ret || !attr) + goto settimeout_out; +@@ -1062,8 +1077,13 @@ static inline int ib_nl_is_good_resolve_ + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return 0; + ++#ifdef HAVE_NLA_PARSE_DEPRECATED + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy, NULL); ++#else ++ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), ++ nlmsg_len(nlh), ib_nl_policy, NULL); ++#endif + if (ret) + return 0; + +@@ -1071,8 +1091,12 @@ static inline int ib_nl_is_good_resolve_ + } + + int ib_nl_handle_resolve_resp(struct sk_buff *skb, ++#ifdef HAVE_NETLINK_EXT_ACK + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) ++#else ++ struct nlmsghdr *nlh) ++#endif + { + unsigned long flags; + struct ib_sa_query *query; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0026-BACKPORT-drivers-infiniband-core-trace.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0026-BACKPORT-drivers-infiniband-core-trace.c.patch new file mode 100644 index 0000000..efdaa3d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0026-BACKPORT-drivers-infiniband-core-trace.c.patch @@ -0,0 +1,19 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/trace.c + 
+Change-Id: Ie9e1da1897c7d4ce708a82b0742459cbb48336d6 +--- + drivers/infiniband/core/trace.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/infiniband/core/trace.c ++++ b/drivers/infiniband/core/trace.c +@@ -7,6 +7,8 @@ + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + */ + ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + #define CREATE_TRACE_POINTS + + #include ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0027-BACKPORT-drivers-infiniband-core-ucma.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0027-BACKPORT-drivers-infiniband-core-ucma.c.patch new file mode 100644 index 0000000..f1c6a82 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0027-BACKPORT-drivers-infiniband-core-ucma.c.patch @@ -0,0 +1,104 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/ucma.c + +Change-Id: Ife723b66e530461be468b12b3192b71145e2a655 +--- + drivers/infiniband/core/ucma.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +--- a/drivers/infiniband/core/ucma.c ++++ b/drivers/infiniband/core/ucma.c +@@ -43,9 +43,10 @@ + #include + #include + #include +- ++#ifdef HAVE_ARRAY_INDEX_NOSPEC + #include +- ++#endif ++#include + #include + #include + #include +@@ -59,9 +60,13 @@ + MODULE_AUTHOR("Sean Hefty"); + MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + static unsigned int max_backlog = 1024; + ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + static struct ctl_table_header *ucma_ctl_table_hdr; + static struct ctl_table ucma_ctl_table[] = { + { +@@ -73,6 +78,7 @@ static struct ctl_table ucma_ctl_table[] + }, + { } + }; ++#endif + + struct ucma_file { + struct mutex mut; +@@ -1733,8 +1739,9 @@ static ssize_t ucma_write(struct file *f + + if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) + return -EINVAL; ++#ifdef HAVE_ARRAY_INDEX_NOSPEC + hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table)); +- ++#endif + if (hdr.in + sizeof(hdr) > len) + return -EINVAL; + +@@ -1785,7 +1792,11 @@ static int ucma_open(struct inode *inode + filp->private_data = file; + file->filp = filp; + ++#ifdef HAVE_STREAM_OPEN + return stream_open(inode, filp); ++#else ++ return nonseekable_open(inode, filp); ++#endif + } + + static int ucma_close(struct inode *inode, struct file *filp) +@@ -1863,12 +1874,14 @@ static int __init ucma_init(void) + goto err1; + } + ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); + if (!ucma_ctl_table_hdr) { + pr_err("rdma_ucm: couldn't register sysctl paths\n"); + ret = -ENOMEM; + goto err2; + } ++#endif + + ret = ib_register_client(&rdma_cma_client); + if (ret) +@@ -1877,8 +1890,10 @@ static int __init ucma_init(void) + return 0; + err3: + unregister_net_sysctl_table(ucma_ctl_table_hdr); ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + err2: + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); ++#endif + err1: + misc_deregister(&ucma_misc); + return ret; +@@ -1887,7 +1902,9 @@ err1: + static void __exit ucma_cleanup(void) + { + ib_unregister_client(&rdma_cma_client); ++#ifndef CONFIG_SYSCTL_SYSCALL_CHECK + unregister_net_sysctl_table(ucma_ctl_table_hdr); ++#endif + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); + misc_deregister(&ucma_misc); + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0028-BACKPORT-drivers-infiniband-core-ud_header.c.patch 
b/src/mlnx-ofa_kernel-5.8/backports/0028-BACKPORT-drivers-infiniband-core-ud_header.c.patch new file mode 100644 index 0000000..874cc62 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0028-BACKPORT-drivers-infiniband-core-ud_header.c.patch @@ -0,0 +1,19 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/ud_header.c + +Change-Id: Ic91aca1fd40986395746a437170d567a492a0396 +--- + drivers/infiniband/core/ud_header.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/infiniband/core/ud_header.c ++++ b/drivers/infiniband/core/ud_header.c +@@ -34,7 +34,7 @@ + #include + #include + #include +-#include ++#include + #include + + #include diff --git a/src/mlnx-ofa_kernel-5.8/backports/0030-BACKPORT-drivers-infiniband-core-umem_dmabuf.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0030-BACKPORT-drivers-infiniband-core-umem_dmabuf.c.patch new file mode 100644 index 0000000..54f34d1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0030-BACKPORT-drivers-infiniband-core-umem_dmabuf.c.patch @@ -0,0 +1,97 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/umem_dmabuf.c + +Change-Id: I28cf6c15f74ea9f208c9641ac9a469538d63df1c +--- + drivers/infiniband/core/umem_dmabuf.c | 31 +++++++++++++++++++++++++++ + 1 file changed, 31 insertions(+) + +--- a/drivers/infiniband/core/umem_dmabuf.c ++++ b/drivers/infiniband/core/umem_dmabuf.c +@@ -3,6 +3,7 @@ + * Copyright (c) 2020 Intel Corporation. All rights reserved. + */ + ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + #include + #include + #include +@@ -10,15 +11,22 @@ + + #include "uverbs.h" + ++#ifdef MODULE_IMPORT_NS + MODULE_IMPORT_NS(DMA_BUF); ++#endif + + int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) + { + struct sg_table *sgt; + struct scatterlist *sg; ++#ifndef HAVE_DMA_RESV_WAIT_TIMEOUT //forwardport + struct dma_fence *fence; ++#endif + unsigned long start, end, cur = 0; + unsigned int nmap = 0; ++#ifdef HAVE_DMA_RESV_WAIT_TIMEOUT ++ long ret; ++#endif + int i; + + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); +@@ -58,8 +66,14 @@ int ib_umem_dmabuf_map_pages(struct ib_u + cur += sg_dma_len(sg); + } + ++#ifdef HAVE_SG_APPEND_TABLE + umem_dmabuf->umem.sgt_append.sgt.sgl = umem_dmabuf->first_sg; + umem_dmabuf->umem.sgt_append.sgt.nents = nmap; ++#else ++ umem_dmabuf->umem.sg_head.sgl = umem_dmabuf->first_sg; ++ umem_dmabuf->umem.sg_head.nents = nmap; ++ umem_dmabuf->umem.nmap = nmap; ++#endif + umem_dmabuf->sgt = sgt; + + wait_fence: +@@ -68,11 +82,25 @@ wait_fence: + * may be not up-to-date. Wait for the exporter to finish + * the migration. 
+ */ ++#ifdef HAVE_DMA_RESV_WAIT_TIMEOUT //forwardport ++ ret = dma_resv_wait_timeout(umem_dmabuf->attach->dmabuf->resv, false, ++ false, MAX_SCHEDULE_TIMEOUT); ++ if (ret < 0) ++ return ret; ++ if (ret == 0) ++ return -ETIMEDOUT; ++ return 0; ++#else /* HAVE_DMA_RESV_WAIT_TIMEOUT */ ++#ifdef HAVE_DMA_RESV_EXCL_FENCE + fence = dma_resv_excl_fence(umem_dmabuf->attach->dmabuf->resv); ++#else ++ fence = dma_resv_get_excl(umem_dmabuf->attach->dmabuf->resv); ++#endif + if (fence) + return dma_fence_wait(fence, false); + + return 0; ++#endif /* HAVE_DMA_RESV_WAIT_TIMEOUT */ + } + EXPORT_SYMBOL(ib_umem_dmabuf_map_pages); + +@@ -176,7 +204,9 @@ ib_umem_dmabuf_unsupported_move_notify(s + } + + static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { ++#ifdef HAVE_DMA_BUF_ATTACH_OPS_ALLOW_PEER2PEER + .allow_peer2peer = true, ++#endif + .move_notify = ib_umem_dmabuf_unsupported_move_notify, + }; + +@@ -229,3 +259,4 @@ void ib_umem_dmabuf_release(struct ib_um + dma_buf_put(dmabuf); + kfree(umem_dmabuf); + } ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0031-BACKPORT-drivers-infiniband-core-umem_odp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0031-BACKPORT-drivers-infiniband-core-umem_odp.c.patch new file mode 100644 index 0000000..f9bd12c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0031-BACKPORT-drivers-infiniband-core-umem_odp.c.patch @@ -0,0 +1,1348 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/umem_odp.c + +Change-Id: I499d44dd95625904120a8f4428ece9ef86b98ff3 +--- + drivers/infiniband/core/umem_odp.c | 1082 +++++++++++++++++++++++++++- + 1 file changed, 1081 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/core/umem_odp.c ++++ b/drivers/infiniband/core/umem_odp.c +@@ -40,7 +40,9 @@ + #include + #include + #include ++#if defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + #include ++#endif + #include + + #include +@@ -49,29 +51,503 @@ + + #include "uverbs.h" + ++#if defined(HAVE_INTERVAL_TREE_TAKES_RB_ROOT) ++#ifdef HAVE_RB_ROOT_CACHED ++#undef HAVE_RB_ROOT_CACHED ++#endif ++#endif ++ ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, + const struct mmu_interval_notifier_ops *ops) ++#else ++static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) ++#endif ++{ ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ mutex_lock(&umem_odp->umem_mutex); ++ if (umem_odp->notifiers_count++ == 0) ++ /* ++ * Initialize the completion object for waiting on ++ * notifiers. Since notifier_count is zero, no one should be ++ * waiting right now. ++ */ ++ reinit_completion(&umem_odp->notifier_completion); ++ mutex_unlock(&umem_odp->umem_mutex); ++} ++ ++static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) ++{ ++ mutex_lock(&umem_odp->umem_mutex); ++ /* ++ * This sequence increase will notify the QP page fault that the page ++ * that is going to be mapped in the spte could have been freed. ++ */ ++ ++umem_odp->notifiers_seq; ++ if (--umem_odp->notifiers_count == 0) ++ complete_all(&umem_odp->notifier_completion); ++ mutex_unlock(&umem_odp->umem_mutex); ++} ++ ++#ifndef HAVE_RB_ROOT_CACHED ++static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, ++ u64 start, u64 end, void *cookie) ++{ ++ /* ++ * Increase the number of notifiers running, to ++ * prevent any further fault handling on this MR. 
++ */ ++ ib_umem_notifier_start_account(umem_odp); ++ complete_all(&umem_odp->notifier_completion); ++ umem_odp->umem.context->device->ops.invalidate_range( ++ umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); ++ return 0; ++} ++#endif ++static void ib_umem_notifier_release(struct mmu_notifier *mn, ++ struct mm_struct *mm) ++{ ++ struct ib_ucontext_per_mm *per_mm = ++ container_of(mn, struct ib_ucontext_per_mm, mn); ++#ifdef HAVE_RB_ROOT_CACHED ++ struct rb_node *node; ++#endif ++ ++ down_read(&per_mm->umem_rwsem); ++#ifdef HAVE_RB_ROOT_CACHED ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ if (!per_mm->mn.users) ++#else ++ if (!per_mm->active) ++#endif ++ goto out; ++ ++ for (node = rb_first_cached(&per_mm->umem_tree); node; ++ node = rb_next(node)) { ++ struct ib_umem_odp *umem_odp = ++ rb_entry(node, struct ib_umem_odp, interval_tree.rb); ++ ++ /* ++ * Increase the number of notifiers running, to prevent any ++ * further fault handling on this MR. ++ */ ++ ib_umem_notifier_start_account(umem_odp); ++ complete_all(&umem_odp->notifier_completion); ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ umem_odp->umem.ibdev->ops.invalidate_range( ++#else ++ umem_odp->umem.context->device->ops.invalidate_range( ++#endif ++ umem_odp, ib_umem_start(umem_odp), ++ ib_umem_end(umem_odp)); ++ } ++ ++out: ++#else ++ if (per_mm->active) ++ rbt_ib_umem_for_each_in_range( ++ &per_mm->umem_tree, 0, ULLONG_MAX, ++#if defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT) ++ ib_umem_notifier_release_trampoline, true, NULL); ++#else ++ ib_umem_notifier_release_trampoline, NULL); ++#endif ++#endif ++ up_read(&per_mm->umem_rwsem); ++} ++ ++#if defined(HAVE_INVALIDATE_PAGE) ++static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, ++ u64 end, void *cookie) ++{ ++ ib_umem_notifier_start_account(item); ++ item->umem.context->device->ops.invalidate_range(item, start, start + PAGE_SIZE); ++ ib_umem_notifier_end_account(item); ++ return 0; ++} ++ ++static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, ++ struct mm_struct *mm, ++ unsigned long address) ++{ ++ struct ib_ucontext_per_mm *per_mm = ++ container_of(mn, struct ib_ucontext_per_mm, mn); ++ ++ down_read(&per_mm->umem_rwsem); ++ if (per_mm->active) ++ rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, address, ++ address + PAGE_SIZE, ++ invalidate_page_trampoline, NULL); ++ ++ up_read(&per_mm->umem_rwsem); ++} ++#endif ++ ++static int invalidate_range_start_trampoline(struct ib_umem_odp *item, ++ u64 start, u64 end, void *cookie) ++{ ++ ib_umem_notifier_start_account(item); ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ item->umem.ibdev->ops.invalidate_range(item, start, end); ++#else ++ item->umem.context->device->ops.invalidate_range(item, start, end); ++#endif ++ return 0; ++} ++ ++#ifdef HAVE_MMU_NOTIFIER_RANGE_STRUCT ++static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, ++ const struct mmu_notifier_range *range) ++#else ++#ifdef HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE ++ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, ++ struct mm_struct *mm, ++ unsigned long start, ++ unsigned long end, ++ bool blockable) ++#else ++static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, ++ struct mm_struct *mm, ++ unsigned long start, ++ unsigned long end) ++#endif ++#endif /*HAVE_MMU_NOTIFIER_RANGE_STRUCT*/ + { ++ struct ib_ucontext_per_mm *per_mm = ++ container_of(mn, struct ib_ucontext_per_mm, mn); ++ ++#if 
defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT) ++ int rc; ++#ifdef HAVE_MMU_NOTIFIER_RANGE_BLOCKABLE ++ if (mmu_notifier_range_blockable(range)) ++#else ++#ifdef HAVE_MMU_NOTIFIER_RANGE_STRUCT ++ if (range->blockable) ++#else ++ if (blockable) ++#endif ++#endif ++ down_read(&per_mm->umem_rwsem); ++ else if (!down_read_trylock(&per_mm->umem_rwsem)) ++ return -EAGAIN; ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ if (!per_mm->mn.users) { ++#else ++ if (!per_mm->active) { ++#endif ++ up_read(&per_mm->umem_rwsem); ++ /* ++ * At this point users is permanently zero and visible to this ++ * CPU without a lock, that fact is relied on to skip the unlock ++ * in range_end. ++ */ ++ return 0; ++ } ++ ++#ifdef HAVE_MMU_NOTIFIER_RANGE_BLOCKABLE ++ rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, ++ range->end, ++ invalidate_range_start_trampoline, ++ mmu_notifier_range_blockable(range), ++ NULL); ++ if (rc) ++ up_read(&per_mm->umem_rwsem); ++#else ++#ifdef HAVE_MMU_NOTIFIER_RANGE_STRUCT ++ rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, ++ range->end, ++ invalidate_range_start_trampoline, ++ range->blockable, NULL); ++#else ++ rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, ++ invalidate_range_start_trampoline, ++ blockable, NULL); ++#endif ++#endif //HAVE_MMU_NOTIFIER_RANGE_BLOCKABLE ++ return rc; ++#else /*defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT)*/ ++ ++ //ib_ucontext_notifier_start_account(context); ++ down_read(&per_mm->umem_rwsem); ++ ++ if (!per_mm->active) { ++ up_read(&per_mm->umem_rwsem); ++ /* ++ * At this point active is permanently set and visible to this ++ * CPU without a lock, that fact is relied on to skip the unlock ++ * in range_end. 
++ */ ++ return; ++ } ++ rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, ++ invalidate_range_start_trampoline, ++ NULL); ++ ++#endif ++} ++ ++static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, ++ u64 end, void *cookie) ++{ ++ ib_umem_notifier_end_account(item); ++ return 0; ++} ++ ++static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, ++#ifdef HAVE_MMU_NOTIFIER_RANGE_STRUCT ++ const struct mmu_notifier_range *range) ++#else ++ struct mm_struct *mm, ++ unsigned long start, ++ unsigned long end) ++#endif ++{ ++ struct ib_ucontext_per_mm *per_mm = ++ container_of(mn, struct ib_ucontext_per_mm, mn); ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ if (unlikely(!per_mm->mn.users)) ++#else ++ if (unlikely(!per_mm->active)) ++#endif ++ return; ++ ++ rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, ++#ifdef HAVE_MMU_NOTIFIER_RANGE_STRUCT ++ range->start, ++ range->end, ++#else ++ start, ++ end, ++#endif/*HAVE_MMU_NOTIFIER_RANGE_STRUCT*/ ++#if defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT) ++ invalidate_range_end_trampoline, true, NULL); ++#else ++ invalidate_range_end_trampoline, NULL); ++#endif/* defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT) */ ++ up_read(&per_mm->umem_rwsem); ++} ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm) ++#else ++static const struct mmu_notifier_ops ib_umem_notifiers = { ++ .release = ib_umem_notifier_release, ++ .invalidate_range_start = ib_umem_notifier_invalidate_range_start, ++ .invalidate_range_end = ib_umem_notifier_invalidate_range_end, ++#if defined(HAVE_INVALIDATE_PAGE) ++ .invalidate_page = ib_umem_notifier_invalidate_page, ++#endif ++ ++}; ++ ++static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) ++{ ++ struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; ++ ++ down_write(&per_mm->umem_rwsem); ++ interval_tree_remove(&umem_odp->interval_tree, &per_mm->umem_tree); ++ complete_all(&umem_odp->notifier_completion); ++ up_write(&per_mm->umem_rwsem); ++} ++ ++static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, ++ struct mm_struct *mm) ++#endif ++{ ++ struct ib_ucontext_per_mm *per_mm; ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ int ret; ++#endif ++ per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); ++ if (!per_mm) ++ return ERR_PTR(-ENOMEM); ++ ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ per_mm->context = ctx; ++ per_mm->mm = mm; ++#endif ++#ifndef HAVE_INTERVAL_TREE_TAKES_RB_ROOT ++ per_mm->umem_tree = RB_ROOT_CACHED; ++#else ++ per_mm->umem_tree = RB_ROOT; ++#endif ++ init_rwsem(&per_mm->umem_rwsem); ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ WARN_ON(mm != current->mm); ++#else ++ per_mm->active = true; ++#endif ++ rcu_read_lock(); ++ per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); ++ rcu_read_unlock(); ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ return &per_mm->mn; ++#else ++ WARN_ON(mm != current->mm); ++ ++ per_mm->mn.ops = &ib_umem_notifiers; ++ ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); ++ if (ret) { ++ dev_err(&ctx->device->dev, ++ "Failed to register mmu_notifier %d\n", ret); ++ goto out_pid; ++ } ++ ++ list_add(&per_mm->ucontext_list, &ctx->per_mm_list); ++ return per_mm; ++ ++out_pid: ++ put_pid(per_mm->tgid); ++ kfree(per_mm); ++ return ERR_PTR(ret); ++} ++ ++static struct ib_ucontext_per_mm *get_per_mm(struct ib_umem_odp *umem_odp) 
++{ ++ struct ib_ucontext *ctx = umem_odp->umem.context; ++ struct ib_ucontext_per_mm *per_mm; ++ ++ lockdep_assert_held(&ctx->per_mm_list_lock); ++ ++ /* ++ * Generally speaking we expect only one or two per_mm in this list, ++ * so no reason to optimize this search today. ++ */ ++ list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { ++ if (per_mm->mm == umem_odp->umem.owning_mm) ++ return per_mm; ++ } ++ ++ return alloc_per_mm(ctx, umem_odp->umem.owning_mm); ++#ifdef HAVE_MMU_NOTIFIER_CALL_SRCU ++} ++ ++static void free_per_mm(struct rcu_head *rcu) ++{ ++ kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); ++#endif ++#endif ++} ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++static void ib_umem_free_notifier(struct mmu_notifier *mn) ++#else ++static void put_per_mm(struct ib_umem_odp *umem_odp) ++#endif ++{ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ struct ib_ucontext_per_mm *per_mm = ++ container_of(mn, struct ib_ucontext_per_mm, mn); ++#else ++ struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; ++ struct ib_ucontext *ctx = umem_odp->umem.context; ++ bool need_free; ++ ++ mutex_lock(&ctx->per_mm_list_lock); ++ umem_odp->per_mm = NULL; ++ per_mm->odp_mrs_count--; ++ need_free = per_mm->odp_mrs_count == 0; ++ if (need_free) ++ list_del(&per_mm->ucontext_list); ++ mutex_unlock(&ctx->per_mm_list_lock); ++ ++ if (!need_free) ++ return; ++ ++ down_write(&per_mm->umem_rwsem); ++ per_mm->active = false; ++ up_write(&per_mm->umem_rwsem); ++#endif ++#ifndef HAVE_INTERVAL_TREE_TAKES_RB_ROOT ++ WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); ++#endif ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++#ifdef HAVE_MMU_NOTIFIER_UNREGISTER_NO_RELEASE ++ mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); ++#else ++ mmu_notifier_unregister(&per_mm->mn, per_mm->mm); ++#endif ++#endif ++ ++ put_pid(per_mm->tgid); ++#if defined(HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER) || !defined(HAVE_MMU_NOTIFIER_CALL_SRCU) ++ kfree(per_mm); ++#else ++ mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); ++#endif ++} ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++static const struct mmu_notifier_ops ib_umem_notifiers = { ++ .release = ib_umem_notifier_release, ++ .invalidate_range_start = ib_umem_notifier_invalidate_range_start, ++ .invalidate_range_end = ib_umem_notifier_invalidate_range_end, ++ .alloc_notifier = ib_umem_alloc_notifier, ++ .free_notifier = ib_umem_free_notifier, ++}; ++ ++static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp) ++#else ++static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, ++ struct ib_ucontext_per_mm *per_mm) ++#endif ++{ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ struct ib_ucontext_per_mm *per_mm; ++ struct mmu_notifier *mn; ++#else ++ struct ib_ucontext *ctx = umem_odp->umem.context; ++#endif ++#endif + int ret; + + umem_odp->umem.is_odp = 1; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + mutex_init(&umem_odp->umem_mutex); ++#endif + + if (!umem_odp->is_implicit_odp) { + size_t page_size = 1UL << umem_odp->page_shift; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + unsigned long start; + unsigned long end; ++#if defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + size_t ndmas, npfns; ++#else ++ size_t pages; ++#endif ++#else ++ size_t pages; ++#endif + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + start = ALIGN_DOWN(umem_odp->umem.address, page_size); ++#else ++ umem_odp->interval_tree.start = ++ ALIGN_DOWN(umem_odp->umem.address, page_size); ++#endif + if (check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, 
++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + &end)) ++#else ++ &umem_odp->interval_tree.last)) ++#endif + return -EOVERFLOW; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) ++#else ++ umem_odp->interval_tree.last = ++ ALIGN(umem_odp->interval_tree.last, page_size); ++ if (unlikely(umem_odp->interval_tree.last < page_size)) ++#endif + return -EOVERFLOW; + ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + ndmas = (end - start) >> umem_odp->page_shift; + if (!ndmas) + return -EINVAL; +@@ -104,6 +580,99 @@ out_pfn_list: + kvfree(umem_odp->pfn_list); + return ret; + } ++#else ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ pages = (end - start) >> umem_odp->page_shift; ++#else ++ pages = (umem_odp->interval_tree.last - ++ umem_odp->interval_tree.start) >> ++ umem_odp->page_shift; ++#endif ++ if (!pages) ++ return -EINVAL; ++ ++ /* ++ * Note that the representation of the intervals in the ++ * interval tree considers the ending point as contained in ++ * the interval. ++ */ ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ umem_odp->interval_tree.last--; ++#endif ++ ++ umem_odp->page_list = kvcalloc( ++ pages, sizeof(*umem_odp->page_list), GFP_KERNEL); ++ if (!umem_odp->page_list) ++ return -ENOMEM; ++ ++ umem_odp->dma_list = kvcalloc( ++ pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); ++ if (!umem_odp->dma_list) { ++ ret = -ENOMEM; ++ goto out_page_list; ++ } ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ ret = mmu_interval_notifier_insert(&umem_odp->notifier, ++ umem_odp->umem.owning_mm, ++ start, end - start, ops); ++ if (ret) ++ goto out_dma_list; ++#else ++ } ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm); ++ if (IS_ERR(mn)) { ++ ret = PTR_ERR(mn); ++ goto out_dma_list; ++#else ++ mutex_lock(&ctx->per_mm_list_lock); ++ if (!per_mm) { ++ per_mm = get_per_mm(umem_odp); ++ if (IS_ERR(per_mm)) { ++ ret = PTR_ERR(per_mm); ++ goto out_unlock; ++ } ++#endif ++ } ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ umem_odp->per_mm = per_mm = ++ container_of(mn, struct ib_ucontext_per_mm, mn); ++#else ++ umem_odp->per_mm = per_mm; ++ per_mm->odp_mrs_count++; ++ mutex_unlock(&ctx->per_mm_list_lock); ++#endif ++#endif ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ mutex_init(&umem_odp->umem_mutex); ++ init_completion(&umem_odp->notifier_completion); ++ ++ if (!umem_odp->is_implicit_odp) { ++ down_write(&per_mm->umem_rwsem); ++ interval_tree_insert(&umem_odp->interval_tree, ++ &per_mm->umem_tree); ++ up_write(&per_mm->umem_rwsem); ++#endif ++ } ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ mmgrab(umem_odp->umem.owning_mm); ++#endif ++ ++ return 0; ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++out_dma_list: ++#else ++out_unlock: ++ mutex_unlock(&ctx->per_mm_list_lock); ++#endif ++ kvfree(umem_odp->dma_list); ++out_page_list: ++ kvfree(umem_odp->page_list); ++ return ret; ++} ++#endif + + /** + * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem +@@ -115,9 +684,18 @@ out_pfn_list: + * @device: IB device to create UMEM + * @access: ib_reg_mr access flags + */ ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, ++#else ++struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, ++#endif + int access) + { ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ struct ib_ucontext *context = ++ container_of(udata, struct uverbs_attr_bundle, driver_udata) ++ ->context; ++#endif + struct ib_umem *umem; + struct 
ib_umem_odp *umem_odp; + int ret; +@@ -125,20 +703,44 @@ struct ib_umem_odp *ib_umem_odp_alloc_im + if (access & IB_ACCESS_HUGETLB) + return ERR_PTR(-EINVAL); + ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ if (!context) ++ return ERR_PTR(-EIO); ++ if (WARN_ON_ONCE(!context->device->ops.invalidate_range)) ++ return ERR_PTR(-EINVAL); ++#endif + umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + umem = &umem_odp->umem; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem->ibdev = device; ++#else ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ umem->ibdev = context->device; ++#else ++ umem->context = context; ++#endif ++#endif + umem->writable = ib_access_writable(access); + umem->owning_mm = current->mm; + umem_odp->is_implicit_odp = 1; + umem_odp->page_shift = PAGE_SHIFT; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + ret = ib_init_umem_odp(umem_odp, NULL); ++#else ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ ret = ib_init_umem_odp(umem_odp); ++#else ++ ret = ib_init_umem_odp(umem_odp, NULL); ++#endif ++#endif + if (ret) { ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + put_pid(umem_odp->tgid); ++#endif + kfree(umem_odp); + return ERR_PTR(ret); + } +@@ -156,10 +758,15 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_implicit + * @size: The length of the userspace VA + * @ops: MMU interval ops, currently only @invalidate + */ ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem_odp * + ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr, + size_t size, + const struct mmu_interval_notifier_ops *ops) ++#else ++struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, ++ unsigned long addr, size_t size) ++#endif + { + /* + * Caller must ensure that root cannot be freed during the call to +@@ -176,12 +783,17 @@ ib_umem_odp_alloc_child(struct ib_umem_o + if (!odp_data) + return ERR_PTR(-ENOMEM); + umem = &odp_data->umem; ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER + umem->ibdev = root->umem.ibdev; ++#else ++ umem->context = root->umem.context; ++#endif + umem->length = size; + umem->address = addr; + umem->writable = root->umem.writable; + umem->owning_mm = root->umem.owning_mm; + odp_data->page_shift = PAGE_SHIFT; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + odp_data->notifier.ops = ops; + + /* +@@ -191,14 +803,27 @@ ib_umem_odp_alloc_child(struct ib_umem_o + if (!mmget_not_zero(umem->owning_mm)) { + ret = -EFAULT; + goto out_free; ++#else ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ ret = ib_init_umem_odp(odp_data); ++#else ++ ret = ib_init_umem_odp(odp_data, root->per_mm); ++#endif ++ if (ret) { ++ kfree(odp_data); ++ return ERR_PTR(ret); ++#endif + } ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + + odp_data->tgid = get_pid(root->tgid); + ret = ib_init_umem_odp(odp_data, ops); + if (ret) + goto out_tgid; + mmput(umem->owning_mm); ++#endif + return odp_data; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + + out_tgid: + put_pid(odp_data->tgid); +@@ -206,6 +831,7 @@ out_tgid: + out_free: + kfree(odp_data); + return ERR_PTR(ret); ++#endif + } + EXPORT_SYMBOL(ib_umem_odp_alloc_child); + +@@ -222,41 +848,95 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child); + * pinning, instead, stores the mm for future page fault handling in + * conjunction with MMU notifiers. 
+ */ ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, + unsigned long addr, size_t size, int access, + const struct mmu_interval_notifier_ops *ops) ++#else ++struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, ++ size_t size, int access) ++#endif + { + struct ib_umem_odp *umem_odp; ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ struct ib_ucontext *context; ++#endif + int ret; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND))) ++#else ++ if (!udata) ++ return ERR_PTR(-EIO); ++ ++ context = container_of(udata, struct uverbs_attr_bundle, driver_udata) ++ ->context; ++ if (!context) ++ return ERR_PTR(-EIO); ++ ++ if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) || ++ WARN_ON_ONCE(!context->device->ops.invalidate_range)) ++#endif + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem_odp->umem.ibdev = device; ++#else ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ umem_odp->umem.ibdev = context->device; ++#else ++ umem_odp->umem.context = context; ++#endif ++#endif + umem_odp->umem.length = size; + umem_odp->umem.address = addr; + umem_odp->umem.writable = ib_access_writable(access); + umem_odp->umem.owning_mm = current->mm; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem_odp->notifier.ops = ops; ++#endif + + umem_odp->page_shift = PAGE_SHIFT; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + #ifdef CONFIG_HUGETLB_PAGE + if (access & IB_ACCESS_HUGETLB) + umem_odp->page_shift = HPAGE_SHIFT; + #endif ++#else ++ if (access & IB_ACCESS_HUGETLB) { ++ ret = -EINVAL; ++ goto err_free; ++ } ++#endif + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + ret = ib_init_umem_odp(umem_odp, ops); ++#else ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ ret = ib_init_umem_odp(umem_odp); ++#else ++ ret = ib_init_umem_odp(umem_odp, NULL); ++#endif ++#endif + if (ret) ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + goto err_put_pid; ++#else ++ goto err_free; ++#endif + return umem_odp; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + err_put_pid: + put_pid(umem_odp->tgid); ++#else ++err_free: ++#endif + kfree(umem_odp); + return ERR_PTR(ret); + } +@@ -264,6 +944,10 @@ EXPORT_SYMBOL(ib_umem_odp_get); + + void ib_umem_odp_release(struct ib_umem_odp *umem_odp) + { ++#if !defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER) ++ struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; ++ ++#endif + /* + * Ensure that no more pages are mapped in the umem. 
+ * +@@ -275,11 +959,47 @@ void ib_umem_odp_release(struct ib_umem_ + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + mutex_unlock(&umem_odp->umem_mutex); ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ remove_umem_from_per_mm(umem_odp); ++#endif ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + mmu_interval_notifier_remove(&umem_odp->notifier); ++#endif + kvfree(umem_odp->dma_list); ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + kvfree(umem_odp->pfn_list); ++#else ++ kvfree(umem_odp->page_list); ++#endif + } ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ put_per_mm(umem_odp); ++#endif ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + put_pid(umem_odp->tgid); ++#else ++ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ down_write(&per_mm->umem_rwsem); ++ if (!umem_odp->is_implicit_odp) { ++ interval_tree_remove(&umem_odp->interval_tree, ++ &per_mm->umem_tree); ++ complete_all(&umem_odp->notifier_completion); ++ } ++ /* ++ * NOTE! mmu_notifier_unregister() can happen between a start/end ++ * callback, resulting in a missing end, and thus an unbalanced ++ * lock. This doesn't really matter to us since we are about to kfree ++ * the memory that holds the lock, however LOCKDEP doesn't like this. ++ * Thus we call the mmu_notifier_put under the rwsem and test the ++ * internal users count to reliably see if we are past this point. ++ */ ++ mmu_notifier_put(&per_mm->mn); ++ up_write(&per_mm->umem_rwsem); ++#endif ++ ++ mmdrop(umem_odp->umem.owning_mm); ++#endif + kfree(umem_odp); + } + EXPORT_SYMBOL(ib_umem_odp_release); +@@ -295,6 +1015,7 @@ EXPORT_SYMBOL(ib_umem_odp_release); + * The function returns -EFAULT if the DMA mapping operation fails. + * + */ ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + static int ib_umem_odp_map_dma_single_page( + struct ib_umem_odp *umem_odp, + unsigned int dma_index, +@@ -372,7 +1093,11 @@ int ib_umem_odp_map_dma_and_lock(struct + * mmget_not_zero will fail in this case. 
+ */ + owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID); ++#ifdef HAVE_MMPUT_ASYNC_EXPORTED /* Forward port */ + if (!owning_process || !mmget_not_zero(owning_mm)) { ++#else ++ if (!owning_process) { ++#endif + ret = -EINVAL; + goto out_put_task; + } +@@ -399,10 +1124,19 @@ retry: + mmap_read_lock(owning_mm); + ret = hmm_range_fault(&range); + mmap_read_unlock(owning_mm); ++#ifdef CONFIG_COMPAT_HMM_RANGE_FAULT_RETURNS_INT + if (unlikely(ret)) { + if (ret == -EBUSY && !time_after(jiffies, timeout)) ++#else ++ if (unlikely(ret <= 0)) { ++ if ((ret == 0 || ret == -EBUSY) && !time_after(jiffies, timeout)) ++#endif + goto retry; ++#ifdef HAVE_MMPUT_ASYNC_EXPORTED /* Forward port */ + goto out_put_mm; ++#else ++ goto out_put_task; ++#endif + } + + start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift; +@@ -461,23 +1195,316 @@ retry: + else + mutex_unlock(&umem_odp->umem_mutex); + ++#ifdef HAVE_MMPUT_ASYNC_EXPORTED /* Forward port */ + out_put_mm: +- mmput(owning_mm); ++ mmput_async(owning_mm); ++#endif + out_put_task: + if (owning_process) + put_task_struct(owning_process); + return ret; + } + EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); ++#else ++static int ib_umem_odp_map_dma_single_page( ++ struct ib_umem_odp *umem_odp, ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ unsigned int page_index, ++#else ++ int page_index, ++#endif ++ struct page *page, ++ u64 access_mask, ++ unsigned long current_seq) ++{ ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ struct ib_device *dev = umem_odp->umem.ibdev; ++#else ++ struct ib_ucontext *context = umem_odp->umem.context; ++ struct ib_device *dev = context->device; ++#endif ++ dma_addr_t dma_addr; ++ int ret = 0; ++ ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) { ++#else ++ if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { ++#endif ++ ret = -EAGAIN; ++ goto out; ++ } ++ if (!(umem_odp->dma_list[page_index])) { ++ dma_addr = ++ ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), ++ DMA_BIDIRECTIONAL); ++ if (ib_dma_mapping_error(dev, dma_addr)) { ++ ret = -EFAULT; ++ goto out; ++ } ++ umem_odp->dma_list[page_index] = dma_addr | access_mask; ++ umem_odp->page_list[page_index] = page; ++ umem_odp->npages++; ++ } else if (umem_odp->page_list[page_index] == page) { ++ umem_odp->dma_list[page_index] |= access_mask; ++ } else { ++ /* ++ * This is a race here where we could have done: ++ * ++ * CPU0 CPU1 ++ * get_user_pages() ++ * invalidate() ++ * page_fault() ++ * mutex_lock(umem_mutex) ++ * page from GUP != page in ODP ++ * ++ * It should be prevented by the retry test above as reading ++ * the seq number should be reliable under the ++ * umem_mutex. Thus something is really not working right if ++ * things get here. ++ */ ++ WARN(true, ++ "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", ++ umem_odp->page_list[page_index], page); ++ ret = -EAGAIN; ++ } ++ ++out: ++#ifdef HAVE_PUT_USER_PAGES_DIRTY_LOCK_2_PARAMS ++ put_user_page(page); ++#else ++ put_page(page); ++#endif ++ return ret; ++} ++ ++/** ++ * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. ++ * ++ * Pins the range of pages passed in the argument, and maps them to ++ * DMA addresses. The DMA addresses of the mapped pages is updated in ++ * umem_odp->dma_list. ++ * ++ * Returns the number of pages mapped in success, negative error code ++ * for failure. 
++ * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
++ * the function from completing its task.
++ * An -ENOENT error code indicates that userspace process is being terminated
++ * and mm was already destroyed.
++ * @umem_odp: the umem to map and pin
++ * @user_virt: the address from which we need to map.
++ * @bcnt: the minimal number of bytes to pin and map. The mapping might be
++ * bigger due to alignment, and may also be smaller in case of an error
++ * pinning or mapping a page. The actual number of pages mapped is returned
++ * in the return value.
++ * @access_mask: bit mask of the requested access permissions for the given
++ * range.
++ * @current_seq: the MMU notifiers sequence value for synchronization with
++ * invalidations. The sequence number is read from
++ * umem_odp->notifiers_seq before calling this function
++ */
++int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
++ u64 bcnt, u64 access_mask,
++ unsigned long current_seq)
++{
++ struct task_struct *owning_process = NULL;
++ struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
++ struct page **local_page_list = NULL;
++ u64 page_mask, off;
++ int j, k, ret = 0, start_idx, npages = 0;
++ unsigned int flags = 0, page_shift;
++ phys_addr_t p = 0;
++
++ if (access_mask == 0)
++ return -EINVAL;
++
++ if (user_virt < ib_umem_start(umem_odp) ||
++ user_virt + bcnt > ib_umem_end(umem_odp))
++ return -EFAULT;
++
++ local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
++ if (!local_page_list)
++ return -ENOMEM;
++
++ page_shift = umem_odp->page_shift;
++ page_mask = ~(BIT(page_shift) - 1);
++ off = user_virt & (~page_mask);
++ user_virt = user_virt & page_mask;
++ bcnt += off; /* Charge for the first page offset as well. */
++
++ /*
++ * owning_process is allowed to be NULL, this means somehow the mm is
++ * existing beyond the lifetime of the originating process. Presumably
++ * mmget_not_zero will fail in this case.
++ */
++#ifdef HAVE_MMU_INTERVAL_NOTIFIER
++ owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
++#else
++ owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
++#endif
++ if (!owning_process || !mmget_not_zero(owning_mm)) {
++ ret = -EINVAL;
++ goto out_put_task;
++ }
++
++ if (access_mask & ODP_WRITE_ALLOWED_BIT)
++ flags |= FOLL_WRITE;
++
++ start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
++ k = start_idx;
++
++ while (bcnt > 0) {
++ const size_t gup_num_pages = min_t(size_t,
++#ifdef HAVE_MMU_INTERVAL_NOTIFIER
++ ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
++#else
++ (bcnt + BIT(page_shift) - 1) >> page_shift,
++#endif
++ PAGE_SIZE / sizeof(struct page *));
++#ifdef HAVE_MMAP_READ_LOCK
++ mmap_read_lock(owning_mm);
++#else
++ down_read(&owning_mm->mmap_sem);
++#endif
++ /*
++ * Note: this might result in redundant page getting. We can
++ * avoid this by checking dma_list to be 0 before calling
++ * get_user_pages. However, this makes the code much more
++ * complex (and doesn't gain us much performance in most use
++ * cases). 
++ */ ++#ifdef HAVE_GET_USER_PAGES_REMOTE_7_PARAMS_AND_SECOND_INT ++ npages = get_user_pages_remote(owning_mm, ++ user_virt, gup_num_pages, ++ flags, local_page_list, NULL, NULL); ++#elif defined(HAVE_GET_USER_PAGES_REMOTE_8_PARAMS) || defined(HAVE_GET_USER_PAGES_REMOTE_7_PARAMS) || defined(HAVE_GET_USER_PAGES_REMOTE_8_PARAMS_W_LOCKED) ++ npages = get_user_pages_remote(owning_process, owning_mm, ++ user_virt, gup_num_pages, ++#ifdef HAVE_GET_USER_PAGES_GUP_FLAGS ++#ifdef HAVE_GET_USER_PAGES_REMOTE_8_PARAMS_W_LOCKED ++ flags, local_page_list, NULL, NULL); ++#else ++ flags, local_page_list, NULL); ++#endif ++#else ++ access_mask & ODP_WRITE_ALLOWED_BIT, 0, ++ local_page_list, NULL); ++#endif ++#else ++ npages = get_user_pages(owning_process, owning_mm, ++ user_virt, gup_num_pages, ++#ifdef HAVE_GET_USER_PAGES_7_PARAMS ++ flags, local_page_list, NULL); ++#else ++ access_mask & ODP_WRITE_ALLOWED_BIT, ++ 0, local_page_list, NULL); ++#endif ++#endif ++ ++#ifdef HAVE_MMAP_READ_LOCK ++ mmap_read_unlock(owning_mm); ++#else ++ up_read(&owning_mm->mmap_sem); ++#endif ++ if (npages < 0) { ++ if (npages != -EAGAIN) ++ pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages); ++ else ++ pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages); ++ break; ++ } ++ ++ bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); ++ mutex_lock(&umem_odp->umem_mutex); ++ for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { ++ if (user_virt & ~page_mask) { ++ p += PAGE_SIZE; ++ if (page_to_phys(local_page_list[j]) != p) { ++ ret = -EFAULT; ++ break; ++ } ++#ifdef HAVE_PUT_USER_PAGES_DIRTY_LOCK_2_PARAMS ++ put_user_page(local_page_list[j]); ++#else ++ put_page(local_page_list[j]); ++#endif ++ continue; ++ } ++ ++ ret = ib_umem_odp_map_dma_single_page( ++ umem_odp, k, local_page_list[j], ++ access_mask, current_seq); ++ if (ret < 0) { ++ if (ret != -EAGAIN) ++ pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); ++ else ++ pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); ++ break; ++ } ++ ++ p = page_to_phys(local_page_list[j]); ++ k++; ++ } ++ mutex_unlock(&umem_odp->umem_mutex); ++ ++ if (ret < 0) { ++ /* ++ * Release pages, remembering that the first page ++ * to hit an error was already released by ++ * ib_umem_odp_map_dma_single_page(). 
++ */ ++#if defined(HAVE_RELEASE_PAGES) || defined(HAVE_PUT_USER_PAGES_DIRTY_LOCK_2_PARAMS) ++ if (npages - (j + 1) > 0) ++#ifdef HAVE_RELEASE_PAGES ++ release_pages(&local_page_list[j+1], ++ npages - (j + 1)); ++#else ++ put_user_pages(&local_page_list[j+1], ++ npages - (j + 1)); ++#endif ++#else ++ for (++j; j < npages; ++j) ++ put_page(local_page_list[j]); ++#endif ++ break; ++ } ++ } ++ ++ if (ret >= 0) { ++ if (npages < 0 && k == start_idx) ++ ret = npages; ++ else ++ ret = k - start_idx; ++ } ++ ++#ifdef HAVE_MMPUT_ASYNC_EXPORTED /* Forward port */ ++ mmput_async(owning_mm); ++#else ++ mmput(owning_mm); ++#endif ++out_put_task: ++ if (owning_process) ++ put_task_struct(owning_process); ++ free_page((unsigned long)local_page_list); ++ return ret; ++} ++EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); ++#endif + + void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, + u64 bound) + { ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + dma_addr_t dma_addr; + dma_addr_t dma; ++#endif + int idx; + u64 addr; ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER + struct ib_device *dev = umem_odp->umem.ibdev; ++#else ++ struct ib_device *dev = umem_odp->umem.context->device; ++#endif + + lockdep_assert_held(&umem_odp->umem_mutex); + +@@ -485,6 +1512,7 @@ void ib_umem_odp_unmap_dma_pages(struct + bound = min_t(u64, bound, ib_umem_end(umem_odp)); + for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + dma = umem_odp->dma_list[idx]; + + /* The access flags guaranteed a valid DMA address in case was NULL */ +@@ -493,6 +1521,13 @@ void ib_umem_odp_unmap_dma_pages(struct + struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); + + dma_addr = dma & ODP_DMA_ADDR_MASK; ++#else ++ if (umem_odp->page_list[idx]) { ++ struct page *page = umem_odp->page_list[idx]; ++ dma_addr_t dma = umem_odp->dma_list[idx]; ++ dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; ++ ++#endif + ib_dma_unmap_page(dev, dma_addr, + BIT(umem_odp->page_shift), + DMA_BIDIRECTIONAL); +@@ -509,9 +1544,54 @@ void ib_umem_odp_unmap_dma_pages(struct + */ + set_page_dirty(head_page); + } ++#if ! (defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT)) ++ umem_odp->page_list[idx] = NULL; ++#endif + umem_odp->dma_list[idx] = 0; + umem_odp->npages--; + } + } + } + EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); ++ ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++/* @last is not a part of the interval. See comment for function ++ * node_last. 
++ */ ++#ifndef HAVE_INTERVAL_TREE_TAKES_RB_ROOT ++int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, ++#else ++int rbt_ib_umem_for_each_in_range(struct rb_root *root, ++#endif ++ u64 start, u64 last, ++ umem_call_back cb, ++#if defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT) ++ bool blockable, ++#endif ++ void *cookie) ++{ ++ int ret_val = 0; ++ struct interval_tree_node *node, *next; ++ struct ib_umem_odp *umem; ++ ++ if (unlikely(start == last)) ++ return ret_val; ++ ++ for (node = interval_tree_iter_first(root, start, last - 1); ++ node; node = next) { ++ /* TODO move the blockable decision up to the callback */ ++#if defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT) ++ if (!blockable) ++ return -EAGAIN; ++#endif ++ next = interval_tree_iter_next(node, start, last - 1); ++ umem = container_of(node, struct ib_umem_odp, interval_tree); ++ ret_val = cb(umem, start, last, cookie) || ret_val; ++ } ++ ++ return ret_val; ++} ++#endif ++#ifndef HAVE_RB_ROOT_CACHED ++EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0032-BACKPORT-drivers-infiniband-core-user_mad.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0032-BACKPORT-drivers-infiniband-core-user_mad.c.patch new file mode 100644 index 0000000..2690e17 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0032-BACKPORT-drivers-infiniband-core-user_mad.c.patch @@ -0,0 +1,172 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/user_mad.c + +Change-Id: I973ac5264866b30e266633064f42dd620a685cb2 +--- + drivers/infiniband/core/user_mad.c | 57 ++++++++++++++++++++++++++---- + 1 file changed, 51 insertions(+), 6 deletions(-) + +--- a/drivers/infiniband/core/user_mad.c ++++ b/drivers/infiniband/core/user_mad.c +@@ -65,6 +65,9 @@ + MODULE_AUTHOR("Roland Dreier"); + MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + enum { + IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, +@@ -143,7 +146,11 @@ static const dev_t base_issm_dev = MKDEV + static dev_t dynamic_umad_dev; + static dev_t dynamic_issm_dev; + ++#ifdef HAVE_IDA_ALLOC_MAX + static DEFINE_IDA(umad_ida); ++#else ++static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); ++#endif + + static int ib_umad_add_one(struct ib_device *device); + static void ib_umad_remove_one(struct ib_device *device, void *client_data); +@@ -342,9 +349,9 @@ static ssize_t copy_recv_mad(struct ib_u + return -EFAULT; + } + } +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr); +- ++#endif + return hdr_size(file) + packet->length; + } + +@@ -363,10 +370,10 @@ static ssize_t copy_send_mad(struct ib_u + + if (copy_to_user(buf, packet->mad.data, packet->length)) + return -EFAULT; +- ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_umad_read_send(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); +- ++#endif + return size; + } + +@@ -532,8 +539,10 @@ static ssize_t ib_umad_write(struct file + + mutex_lock(&file->mutex); + ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_ib_umad_write(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); ++#endif + + agent = __get_agent(file, packet->mad.hdr.id); + if (!agent) { +@@ -1019,8 +1028,11 @@ static int ib_umad_open(struct inode *in + filp->private_data = file; + + list_add_tail(&file->port_list, &port->file_list); +- ++#ifdef HAVE_STREAM_OPEN + 
stream_open(inode, filp); ++#else ++ nonseekable_open(inode, filp); ++#endif + out: + mutex_unlock(&port->file_mutex); + return ret; +@@ -1232,6 +1244,7 @@ static char *umad_devnode(struct device + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); + } + ++#ifdef HAVE_CLASS_GROUPS + static ssize_t abi_version_show(struct class *class, + struct class_attribute *attr, char *buf) + { +@@ -1244,14 +1257,21 @@ static struct attribute *umad_class_attr + NULL, + }; + ATTRIBUTE_GROUPS(umad_class); ++#else ++static CLASS_ATTR_STRING(abi_version, S_IRUGO, ++ __stringify(IB_USER_MAD_ABI_VERSION)); ++#endif/*HAVE_CLASS_GROUPS*/ + + static struct class umad_class = { + .name = "infiniband_mad", + .devnode = umad_devnode, ++#ifdef HAVE_CLASS_GROUPS + .class_groups = umad_class_groups, ++#endif + .dev_groups = umad_class_dev_groups, + }; + ++ + static void ib_umad_release_port(struct device *device) + { + struct ib_umad_port *port = dev_get_drvdata(device); +@@ -1281,10 +1301,19 @@ static int ib_umad_init_port(struct ib_d + dev_t base_issm; + int ret; + ++#ifdef HAVE_IDA_ALLOC_MAX + devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL); + if (devnum < 0) + return -1; ++#else ++ devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); ++ if (devnum >= IB_UMAD_MAX_PORTS) ++ return -1; ++#endif + port->dev_num = devnum; ++#ifndef HAVE_IDA_ALLOC_MAX ++ set_bit(devnum, dev_map); ++#endif + if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { + base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; +@@ -1327,7 +1356,11 @@ err_dev: + cdev_device_del(&port->cdev, &port->dev); + err_cdev: + put_device(&port->dev); ++#ifndef HAVE_IDA_ALLOC_MAX ++ clear_bit(devnum, dev_map); ++#else + ida_free(&umad_ida, devnum); ++#endif + return ret; + } + +@@ -1358,8 +1391,11 @@ static void ib_umad_kill_port(struct ib_ + mutex_unlock(&port->file_mutex); + + cdev_device_del(&port->sm_cdev, &port->sm_dev); ++#ifndef HAVE_IDA_ALLOC_MAX ++ clear_bit(port->dev_num, dev_map); ++#else + ida_free(&umad_ida, port->dev_num); +- ++#endif + /* balances device_initialize() */ + put_device(&port->sm_dev); + put_device(&port->dev); +@@ -1455,6 +1491,15 @@ static int __init ib_umad_init(void) + goto out_chrdev; + } + ++#ifndef HAVE_CLASS_GROUPS ++ ret = class_create_file(&umad_class, &class_attr_abi_version.attr); ++ if (ret) { ++ pr_err("couldn't create abi_version attribute\n"); ++ goto out_class; ++ } ++#endif/*HAVE_CLASS_GROUPS*/ ++ ++ + ret = ib_register_client(&umad_client); + if (ret) + goto out_class; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0033-BACKPORT-drivers-infiniband-core-uverbs.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0033-BACKPORT-drivers-infiniband-core-uverbs.h.patch new file mode 100644 index 0000000..38e1f33 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0033-BACKPORT-drivers-infiniband-core-uverbs.h.patch @@ -0,0 +1,18 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/uverbs.h + +Change-Id: If360cefbe79217c8e0c538f79abd605c5c9635fb +--- + drivers/infiniband/core/uverbs.h | 1 - + 1 file changed, 1 deletion(-) + +--- a/drivers/infiniband/core/uverbs.h ++++ b/drivers/infiniband/core/uverbs.h +@@ -158,7 +158,6 @@ struct ib_uverbs_file { + struct mutex umap_lock; + struct list_head umaps; + struct page *disassociate_page; +- + struct xarray idr; + }; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0034-BACKPORT-drivers-infiniband-core-uverbs_cmd.c.patch 
b/src/mlnx-ofa_kernel-5.8/backports/0034-BACKPORT-drivers-infiniband-core-uverbs_cmd.c.patch new file mode 100644 index 0000000..de6b324 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0034-BACKPORT-drivers-infiniband-core-uverbs_cmd.c.patch @@ -0,0 +1,58 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/uverbs_cmd.c + +Change-Id: I62711b8ce18981a6b191649e805d4a7f914099ba +--- + drivers/infiniband/core/uverbs_cmd.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/core/uverbs_cmd.c ++++ b/drivers/infiniband/core/uverbs_cmd.c +@@ -217,6 +217,10 @@ int ib_alloc_ucontext(struct uverbs_attr + ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext); + if (!ucontext) + return -ENOMEM; ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ mutex_init(&ucontext->per_mm_list_lock); ++ INIT_LIST_HEAD(&ucontext->per_mm_list); ++#endif + + ucontext->device = ib_dev; + ucontext->ufile = ufile; +@@ -242,10 +246,12 @@ int ib_init_ucontext(struct uverbs_attr_ + goto err; + } + ++#ifdef HAVE_CGROUP_RDMA_H + ret = ib_rdmacg_try_charge(&ucontext->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_HANDLE); + if (ret) + goto err; ++#endif + + ret = ucontext->device->ops.alloc_ucontext(ucontext, + &attrs->driver_udata); +@@ -265,8 +271,10 @@ int ib_init_ucontext(struct uverbs_attr_ + return 0; + + err_uncharge: ++#ifdef HAVE_CGROUP_RDMA_H + ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_HANDLE); ++#endif + err: + mutex_unlock(&file->ucontext_lock); + up_read(&file->hw_destroy_rwsem); +@@ -3357,7 +3365,11 @@ static int __uverbs_create_xsrq(struct u + struct ib_srq *srq; + struct ib_srq_init_attr attr; + int ret; +- struct ib_uobject *xrcd_uobj; ++#ifndef uninitialized_var ++ struct ib_uobject *xrcd_uobj; ++#else ++ struct ib_uobject *uninitialized_var(xrcd_uobj); ++#endif + struct ib_device *ib_dev; + + obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0035-BACKPORT-drivers-infiniband-core-uverbs_ioctl.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0035-BACKPORT-drivers-infiniband-core-uverbs_ioctl.c.patch new file mode 100644 index 0000000..92c01c7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0035-BACKPORT-drivers-infiniband-core-uverbs_ioctl.c.patch @@ -0,0 +1,40 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/uverbs_ioctl.c + +Change-Id: Idfd9543249daa36c3d547506ea9f022ac2f58600 +--- + drivers/infiniband/core/uverbs_ioctl.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/core/uverbs_ioctl.c ++++ b/drivers/infiniband/core/uverbs_ioctl.c +@@ -128,7 +128,11 @@ __malloc void *_uverbs_alloc(struct uver + res = (void *)pbundle->internal_buffer + pbundle->internal_used; + pbundle->internal_used = + ALIGN(new_used, sizeof(*pbundle->internal_buffer)); ++#ifdef HAVE_WANT_INIT_ON_ALLOC + if (want_init_on_alloc(flags)) ++#else ++ if (flags & __GFP_ZERO) ++#endif + memset(res, 0, size); + return res; + } +@@ -553,11 +557,16 @@ static int ib_uverbs_cmd_verbs(struct ib + + if (unlikely(hdr->driver_id != uapi->driver_id)) + return -EINVAL; +- ++#ifdef HAVE_RADIX_TREE_ITER_LOOKUP + slot = radix_tree_iter_lookup( + &uapi->radix, &attrs_iter, + uapi_key_obj(hdr->object_id) | + uapi_key_ioctl_method(hdr->method_id)); ++#else ++ radix_tree_iter_init(&attrs_iter, uapi_key_obj(hdr->object_id) | ++ uapi_key_ioctl_method(hdr->method_id)); ++ slot = radix_tree_next_chunk(&uapi->radix, 
&attrs_iter, RADIX_TREE_ITER_CONTIG); ++#endif + if (unlikely(!slot)) + return -EPROTONOSUPPORT; + method_elm = rcu_dereference_protected(*slot, true); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0036-BACKPORT-drivers-infiniband-core-uverbs_main.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0036-BACKPORT-drivers-infiniband-core-uverbs_main.c.patch new file mode 100644 index 0000000..fa13ca4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0036-BACKPORT-drivers-infiniband-core-uverbs_main.c.patch @@ -0,0 +1,209 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/uverbs_main.c + +Change-Id: I1c4b5fb0cba8b5480beb8b3b2794b67f97ad2f94 +--- + drivers/infiniband/core/uverbs_main.c | 68 +++++++++++++++++++++++++-- + 1 file changed, 64 insertions(+), 4 deletions(-) + +--- a/drivers/infiniband/core/uverbs_main.c ++++ b/drivers/infiniband/core/uverbs_main.c +@@ -60,6 +60,9 @@ + MODULE_AUTHOR("Roland Dreier"); + MODULE_DESCRIPTION("InfiniBand userspace verbs access"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + enum { + IB_UVERBS_MAJOR = 231, +@@ -74,7 +77,11 @@ enum { + static dev_t dynamic_uverbs_dev; + static struct class *uverbs_class; + ++#ifndef HAVE_IDA_ALLOC_MAX ++static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); ++#else + static DEFINE_IDA(uverbs_ida); ++#endif + static int ib_uverbs_add_one(struct ib_device *device); + static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); + +@@ -90,7 +97,6 @@ struct ib_ucontext *ib_uverbs_get_uconte + * get_context, we get NULL or valid ucontext. + */ + struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext); +- + if (!srcu_dereference(ufile->device->ib_dev, + &ufile->device->disassociate_srcu)) + return ERR_PTR(-EIO); +@@ -522,8 +528,14 @@ static ssize_t verify_hdr(struct ib_uver + if (hdr->out_words * 8 < method_elm->resp_size) + return -ENOSPC; + ++#ifdef HAVE_ACCESS_OK_HAS_3_PARAMS ++ if (!access_ok(VERIFY_WRITE, ++ u64_to_user_ptr(ex_hdr->response), ++ (hdr->out_words + ex_hdr->provider_out_words) * 8)) ++#else + if (!access_ok(u64_to_user_ptr(ex_hdr->response), + (hdr->out_words + ex_hdr->provider_out_words) * 8)) ++#endif + return -EFAULT; + } else { + if (hdr->out_words || ex_hdr->provider_out_words) +@@ -766,11 +778,20 @@ static void rdma_umap_close(struct vm_ar + * Once the zap_vma_ptes has been called touches to the VMA will come here and + * we return a dummy writable zero page for all the pfns. + */ ++#ifdef HAVE_VM_FAULT_T ++#ifdef HAVE_VM_OPERATIONS_STRUCT_HAS_FAULT + static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) ++#else ++static int rdma_umap_fault(struct vm_fault *vmf) ++#endif/*HAVE_VM_OPERATIONS_STRUCT_HAS_FAULT*/ + { + struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data; + struct rdma_umap_priv *priv = vmf->vma->vm_private_data; ++#ifdef HAVE_VM_OPERATIONS_STRUCT_HAS_FAULT + vm_fault_t ret = 0; ++#else ++ int ret = 0; ++#endif + + if (!priv) + return VM_FAULT_SIGBUS; +@@ -801,11 +822,13 @@ static vm_fault_t rdma_umap_fault(struct + + return ret; + } +- ++#endif + static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, ++#ifdef HAVE_VM_FAULT_T + .fault = rdma_umap_fault, ++#endif + }; + + void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) +@@ -847,7 +870,11 @@ void uverbs_user_mmap_disassociate(struc + * at a time to get the lock ordering right. Typically there + * will only be one mm, so no big deal. 
+ */ ++#ifdef HAVE_MMAP_READ_LOCK + mmap_read_lock(mm); ++#else ++ down_read(&mm->mmap_sem); ++#endif + mutex_lock(&ufile->umap_lock); + list_for_each_entry_safe (priv, next_priv, &ufile->umaps, + list) { +@@ -864,9 +891,18 @@ void uverbs_user_mmap_disassociate(struc + rdma_user_mmap_entry_put(priv->entry); + priv->entry = NULL; + } ++#ifndef HAVE_VM_FAULT_T ++ vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); ++ vma->vm_ops = NULL; ++ kfree(priv); ++#endif + } + mutex_unlock(&ufile->umap_lock); +- mmap_read_unlock(mm); ++#ifdef HAVE_MMAP_READ_LOCK ++ mmap_read_unlock(mm); ++#else ++ up_read(&mm->mmap_sem); ++#endif + mmput(mm); + } + } +@@ -947,7 +983,11 @@ static int ib_uverbs_open(struct inode * + + setup_ufile_idr_uobject(file); + ++#ifdef HAVE_STREAM_OPEN + return stream_open(inode, filp); ++#else ++ return nonseekable_open(inode, filp); ++#endif + + err_module: + module_put(ib_dev->ops.owner); +@@ -1133,13 +1173,21 @@ static int ib_uverbs_add_one(struct ib_d + rcu_assign_pointer(uverbs_dev->ib_dev, device); + uverbs_dev->num_comp_vectors = device->num_comp_vectors; + ++#ifdef HAVE_IDA_ALLOC_MAX + devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, + GFP_KERNEL); + if (devnum < 0) { ++#else ++ devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); ++ if (devnum >= IB_UVERBS_MAX_DEVICES) { ++#endif + ret = -ENOMEM; + goto err; + } + uverbs_dev->devnum = devnum; ++#ifndef HAVE_IDA_ALLOC_MAX ++ set_bit(devnum, dev_map); ++#endif + if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) + base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; + else +@@ -1164,7 +1212,11 @@ static int ib_uverbs_add_one(struct ib_d + return 0; + + err_uapi: ++#ifndef HAVE_IDA_ALLOC_MAX ++ clear_bit(devnum, dev_map); ++#else + ida_free(&uverbs_ida, devnum); ++#endif + err: + if (refcount_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); +@@ -1197,7 +1249,6 @@ static void ib_uverbs_free_hw_resources( + + uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE); + kref_put(&file->ref, ib_uverbs_release_file); +- + mutex_lock(&uverbs_dev->lists_mutex); + } + mutex_unlock(&uverbs_dev->lists_mutex); +@@ -1211,7 +1262,11 @@ static void ib_uverbs_remove_one(struct + int wait_clients = 1; + + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); ++#ifndef HAVE_IDA_ALLOC_MAX ++ clear_bit(uverbs_dev->devnum, dev_map); ++#else + ida_free(&uverbs_ida, uverbs_dev->devnum); ++#endif + + if (device->ops.disassociate_ucontext) { + /* We disassociate HW resources and immediately return. 
+@@ -1274,6 +1329,7 @@ static int __init ib_uverbs_init(void) + uverbs_class->devnode = uverbs_devnode; + + ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); ++ + if (ret) { + pr_err("user_verbs: couldn't create abi_version attribute\n"); + goto out_class; +@@ -1310,7 +1366,11 @@ static void __exit ib_uverbs_cleanup(voi + IB_UVERBS_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER + mmu_notifier_synchronize(); ++#endif ++#endif + } + + module_init(ib_uverbs_init); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0037-BACKPORT-drivers-infiniband-core-uverbs_uapi.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0037-BACKPORT-drivers-infiniband-core-uverbs_uapi.c.patch new file mode 100644 index 0000000..cf70f17 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0037-BACKPORT-drivers-infiniband-core-uverbs_uapi.c.patch @@ -0,0 +1,35 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/uverbs_uapi.c + +Change-Id: Ib079f778833cbd6bc484a12fb1dec810a0d76003 +--- + drivers/infiniband/core/uverbs_uapi.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/core/uverbs_uapi.c ++++ b/drivers/infiniband/core/uverbs_uapi.c +@@ -478,7 +478,11 @@ static void uapi_remove_range(struct uve + if (iter.index > last) + return; + kfree(rcu_dereference_protected(*slot, true)); ++#if defined(HAVE_RADIX_TREE_ITER_DELETE) && defined (HAVE_RADIX_TREE_ITER_DELETE_EXPORTED) + radix_tree_iter_delete(&uapi->radix, &iter, slot); ++#else ++ radix_tree_delete(&uapi->radix, iter.index); ++#endif + } + } + +@@ -567,7 +571,11 @@ again: + + if (method_elm->disabled) { + kfree(method_elm); +- radix_tree_iter_delete(&uapi->radix, &iter, slot); ++#if defined(HAVE_RADIX_TREE_ITER_DELETE) && defined (HAVE_RADIX_TREE_ITER_DELETE_EXPORTED) ++ radix_tree_iter_delete(&uapi->radix, &iter, slot); ++#else ++ radix_tree_delete(&uapi->radix, iter.index); ++#endif + } + continue; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0038-BACKPORT-drivers-infiniband-core-verbs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0038-BACKPORT-drivers-infiniband-core-verbs.c.patch new file mode 100644 index 0000000..0066a5f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0038-BACKPORT-drivers-infiniband-core-verbs.c.patch @@ -0,0 +1,133 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/verbs.c + +Change-Id: Ie3c069ffd673e868e8127d0ecf06dc9c02baaf0f +--- + drivers/infiniband/core/verbs.c | 46 +++++++++++++++++++++++++++++++++ + 1 file changed, 46 insertions(+) + +--- a/drivers/infiniband/core/verbs.c ++++ b/drivers/infiniband/core/verbs.c +@@ -53,7 +53,9 @@ + #include + + #include "core_priv.h" ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + #include ++#endif + + static int ib_resolve_eth_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr); +@@ -1913,7 +1915,11 @@ int ib_get_eth_speed(struct ib_device *d + int rc; + u32 netdev_speed; + struct net_device *netdev; ++#ifdef HAVE___ETHTOOL_GET_LINK_KSETTINGS + struct ethtool_link_ksettings lksettings; ++#else ++ struct ethtool_cmd lksettings; ++#endif + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; +@@ -1923,13 +1929,22 @@ int ib_get_eth_speed(struct ib_device *d + return -ENODEV; + + rtnl_lock(); ++#ifdef HAVE___ETHTOOL_GET_LINK_KSETTINGS + rc = __ethtool_get_link_ksettings(netdev, &lksettings); ++#else ++ 
rc = __ethtool_get_settings(netdev, &lksettings); ++#endif + rtnl_unlock(); + + dev_put(netdev); + ++#ifdef HAVE___ETHTOOL_GET_LINK_KSETTINGS + if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) { + netdev_speed = lksettings.base.speed; ++#else ++ if (!rc) { ++ netdev_speed = ethtool_cmd_speed(&lksettings); ++#endif + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %u\n", netdev->name, +@@ -2209,7 +2224,9 @@ int ib_dereg_mr_user(struct ib_mr *mr, s + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; + int ret; + ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_mr_dereg(mr); ++#endif + rdma_restrack_del(&mr->res); + ret = mr->device->ops.dereg_mr(mr, udata); + if (!ret) { +@@ -2268,7 +2285,9 @@ struct ib_mr *ib_alloc_mr(struct ib_pd * + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); + out: ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_mr_alloc(pd, mr_type, max_num_sg, mr); ++#endif + return mr; + } + EXPORT_SYMBOL(ib_alloc_mr); +@@ -2329,7 +2348,9 @@ struct ib_mr *ib_alloc_mr_integrity(stru + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); + out: ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr); ++#endif + return mr; + } + EXPORT_SYMBOL(ib_alloc_mr_integrity); +@@ -2865,7 +2886,9 @@ void ib_drain_sq(struct ib_qp *qp) + qp->device->ops.drain_sq(qp); + else + __ib_drain_sq(qp); ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_drain_complete(qp->send_cq); ++#endif + } + EXPORT_SYMBOL(ib_drain_sq); + +@@ -2894,7 +2917,9 @@ void ib_drain_rq(struct ib_qp *qp) + qp->device->ops.drain_rq(qp); + else + __ib_drain_rq(qp); ++#ifdef HAVE_TRACE_EVENTS_RDMA_CORE_HEADER + trace_cq_drain_complete(qp->recv_cq); ++#endif + } + EXPORT_SYMBOL(ib_drain_rq); + +@@ -3005,6 +3030,27 @@ bool __rdma_block_iter_next(struct ib_bl + } + EXPORT_SYMBOL(__rdma_block_iter_next); + ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++int rdma_uninit_netdev(struct ib_device *device, struct net_device *netdev, ++ u8 port_num, enum rdma_netdev_t type, int force_fail) ++{ ++ struct rdma_netdev_alloc_params params; ++ int rc; ++ ++ if (!device->ops.rdma_netdev_get_params || force_fail) ++ return -EOPNOTSUPP; ++ ++ rc = device->ops.rdma_netdev_get_params(device, port_num, type, ¶ms); ++ if (rc) ++ return rc; ++ ++ params.uninitialize_rdma_netdev(netdev); ++ ++ return rc; ++} ++EXPORT_SYMBOL(rdma_uninit_netdev); ++#endif ++ + /** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for the drivers. 
diff --git a/src/mlnx-ofa_kernel-5.8/backports/0039-BACKPORT-drivers-infiniband-debug-memtrack.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0039-BACKPORT-drivers-infiniband-debug-memtrack.c.patch new file mode 100644 index 0000000..7e23b81 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0039-BACKPORT-drivers-infiniband-debug-memtrack.c.patch @@ -0,0 +1,38 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/debug/memtrack.c + +Change-Id: I6a70b705714c8667494e085fd525d329efe00180 +--- + drivers/infiniband/debug/memtrack.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/infiniband/debug/memtrack.c ++++ b/drivers/infiniband/debug/memtrack.c +@@ -1124,9 +1124,15 @@ static ssize_t memtrack_read(struct file + } + } + ++#ifdef HAVE_PROC_OPS_STRUCT + static const struct proc_ops memtrack_proc_ops = { + .proc_read = memtrack_read, + }; ++#else ++static const struct file_operations memtrack_proc_fops = { ++ .read = memtrack_read, ++}; ++#endif + + static const char *memtrack_proc_entry_name = "mt_memtrack"; + +@@ -1145,7 +1151,11 @@ static int create_procfs_tree(void) + + for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { + if (bit_mask & track_mask) { ++#ifdef HAVE_PROC_OPS_STRUCT + proc_ent = proc_create_data(rsc_names[i], S_IRUGO, memtrack_tree, &memtrack_proc_ops, NULL); ++#else ++ proc_ent = proc_create_data(rsc_names[i], S_IRUGO, memtrack_tree, &memtrack_proc_fops, NULL); ++#endif + if (!proc_ent) { + printk(KERN_INFO "Warning: Cannot create /proc/%s/%s\n", + memtrack_proc_entry_name, rsc_names[i]); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0040-BACKPORT-drivers-infiniband-hw-mlx5-Makefile.patch b/src/mlnx-ofa_kernel-5.8/backports/0040-BACKPORT-drivers-infiniband-hw-mlx5-Makefile.patch new file mode 100644 index 0000000..a09fb8d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0040-BACKPORT-drivers-infiniband-hw-mlx5-Makefile.patch @@ -0,0 +1,26 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/Makefile + +Change-Id: I3f0d982a2bb55b6f102100645d11bda16005e935 +--- + drivers/infiniband/hw/mlx5/Makefile | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/drivers/infiniband/hw/mlx5/Makefile ++++ b/drivers/infiniband/hw/mlx5/Makefile +@@ -27,7 +27,11 @@ mlx5_ib-y := ah.o \ + + mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o + mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o +-mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ +- fs.o \ +- qos.o \ +- std_types.o ++ ++ifneq ($(CONFIG_INFINIBAND_USER_ACCESS),) ++ mlx5_ib-y += devx.o \ ++ fs.o \ ++ qos.o \ ++ std_types.o ++endif ++ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0041-BACKPORT-drivers-infiniband-hw-mlx5-cq.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0041-BACKPORT-drivers-infiniband-hw-mlx5-cq.c.patch new file mode 100644 index 0000000..a76d4a1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0041-BACKPORT-drivers-infiniband-hw-mlx5-cq.c.patch @@ -0,0 +1,79 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/cq.c + +Change-Id: Ib7ab6c9a7ce534a3a5dcf192e43e46087792c1ca +--- + drivers/infiniband/hw/mlx5/cq.c | 26 ++++++++++++++++++++++++-- + 1 file changed, 24 insertions(+), 2 deletions(-) + +--- a/drivers/infiniband/hw/mlx5/cq.c ++++ b/drivers/infiniband/hw/mlx5/cq.c +@@ -747,9 +747,16 @@ static int create_cq_user(struct mlx5_ib + + *cqe_size = ucmd.cqe_size; + +- cq->buf.umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr, ++ cq->buf.umem = ++#ifdef 
HAVE_MMU_INTERVAL_NOTIFIER ++ ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 0); ++#else ++ ib_umem_get_peer(udata, ucmd.buf_addr, ++ entries * ucmd.cqe_size, ++ IB_ACCESS_LOCAL_WRITE, 0); ++#endif + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; +@@ -763,7 +770,7 @@ static int create_cq_user(struct mlx5_ib + goto err_umem; + } + +- err = mlx5_ib_db_map_user(context, ucmd.db_addr, &cq->db); ++ err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &cq->db); + if (err) + goto err_umem; + +@@ -952,8 +959,13 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; ++#ifndef uninitialized_var + int index; + int inlen; ++#else ++ int uninitialized_var(index); ++ int uninitialized_var(inlen); ++#endif + u32 *cqb = NULL; + void *cqc; + int cqe_size; +@@ -1170,9 +1182,15 @@ static int resize_user(struct mlx5_ib_de + if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) + return -EINVAL; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr, + (size_t)ucmd.cqe_size * entries, + IB_ACCESS_LOCAL_WRITE, 0); ++#else ++ umem = ib_umem_get_peer(udata, ucmd.buf_addr, ++ (size_t)ucmd.cqe_size * entries, ++ IB_ACCESS_LOCAL_WRITE, 0); ++#endif + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; +@@ -1273,7 +1291,11 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq + unsigned int page_offset_quantized = 0; + unsigned int page_shift; + int inlen; ++#ifndef uninitialized_var + int cqe_size; ++#else ++ int uninitialized_var(cqe_size); ++#endif + unsigned long flags; + + if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0042-BACKPORT-drivers-infiniband-hw-mlx5-devx.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0042-BACKPORT-drivers-infiniband-hw-mlx5-devx.c.patch new file mode 100644 index 0000000..4ef37c2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0042-BACKPORT-drivers-infiniband-hw-mlx5-devx.c.patch @@ -0,0 +1,42 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/devx.c + +Change-Id: I1137d2693979322aa87544ed0c5b05b0118d4f1a +--- + drivers/infiniband/hw/mlx5/devx.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/infiniband/hw/mlx5/devx.c ++++ b/drivers/infiniband/hw/mlx5/devx.c +@@ -19,6 +19,7 @@ + #include "ib_rep.h" + #include "devx.h" + #include "qp.h" ++#include + #include + + #define UVERBS_MODULE_NAME mlx5_ib +@@ -2378,7 +2379,11 @@ static int devx_umem_get(struct mlx5_ib_ + if (err) + return err; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + obj->umem = ib_umem_get_peer(&dev->ib_dev, addr, size, access, 0); ++#else ++ obj->umem = ib_umem_get_peer(&attrs->driver_udata, addr, size, access, 0); ++#endif + if (IS_ERR(obj->umem)) + return PTR_ERR(obj->umem); + return 0; +@@ -2841,7 +2846,11 @@ static ssize_t devx_async_event_read(str + { + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub; ++#ifndef uninitialized_var + struct devx_async_event_data *event; ++#else ++ struct devx_async_event_data *uninitialized_var(event); ++#endif + int ret = 0; + size_t eventsz; + bool omit_data; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0043-BACKPORT-drivers-infiniband-hw-mlx5-doorbell.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0043-BACKPORT-drivers-infiniband-hw-mlx5-doorbell.c.patch new file mode 100644 index 0000000..cd99ad4 --- /dev/null 
+++ b/src/mlnx-ofa_kernel-5.8/backports/0043-BACKPORT-drivers-infiniband-hw-mlx5-doorbell.c.patch @@ -0,0 +1,47 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/doorbell.c + +Change-Id: I86d3dec149fe44a03858c3c3d1246109799899d8 +--- + drivers/infiniband/hw/mlx5/doorbell.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/hw/mlx5/doorbell.c ++++ b/drivers/infiniband/hw/mlx5/doorbell.c +@@ -45,7 +45,8 @@ struct mlx5_ib_user_db_page { + struct mm_struct *mm; + }; + +-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, ++int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, ++ struct ib_udata *udata, unsigned long virt, + struct mlx5_db *db) + { + struct mlx5_ib_user_db_page *page; +@@ -67,8 +68,13 @@ int mlx5_ib_db_map_user(struct mlx5_ib_u + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + ib_umem_get_peer(context->ibucontext.device, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); ++#else ++ ib_umem_get_peer(udata, virt & PAGE_MASK, ++ PAGE_SIZE, 0, 0); ++#endif + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); +@@ -80,8 +86,12 @@ int mlx5_ib_db_map_user(struct mlx5_ib_u + list_add(&page->list, &context->db_page_list); + + found: ++#ifdef HAVE_SG_APPEND_TABLE + db->dma = sg_dma_address(page->umem->sgt_append.sgt.sgl) + + (virt & ~PAGE_MASK); ++#else ++ db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); ++#endif + db->u.user_page = page; + ++page->refcnt; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0044-BACKPORT-drivers-infiniband-hw-mlx5-fs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0044-BACKPORT-drivers-infiniband-hw-mlx5-fs.c.patch new file mode 100644 index 0000000..9ca9a1f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0044-BACKPORT-drivers-infiniband-hw-mlx5-fs.c.patch @@ -0,0 +1,45 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/fs.c + +Change-Id: I43e90ca3d2e90fee7f610dc24a2cc70ae83e189e +--- + drivers/infiniband/hw/mlx5/fs.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/drivers/infiniband/hw/mlx5/fs.c ++++ b/drivers/infiniband/hw/mlx5/fs.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include "mlx5_ib.h" + #include "counters.h" + #include "devx.h" +@@ -747,7 +748,7 @@ static struct mlx5_ib_flow_prio *get_flo + int max_table_size; + int num_entries; + int num_groups; +- bool esw_encap; ++ bool esw_encap = false; + u32 flags = 0; + int priority; + +@@ -1449,7 +1450,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio = NULL; + int max_table_size = 0; +- bool esw_encap; ++ bool esw_encap = false; + u32 flags = 0; + int priority; + +@@ -2083,7 +2084,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD + struct mlx5_flow_context flow_context = {.flow_tag = + MLX5_FS_DEFAULT_FLOW_TAG}; + u32 *offset_attr, offset = 0, counter_id = 0; +- int dest_id, dest_type = -1, inlen, len, ret, i; ++ int dest_id = -1, dest_type = -1, inlen, len, ret, i; + struct mlx5_ib_flow_handler *flow_handler; + struct mlx5_ib_flow_matcher *fs_matcher; + struct ib_uobject **arr_flow_actions; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0045-BACKPORT-drivers-infiniband-hw-mlx5-ib_virt.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0045-BACKPORT-drivers-infiniband-hw-mlx5-ib_virt.c.patch new file mode 100644 index 0000000..26c0e36 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0045-BACKPORT-drivers-infiniband-hw-mlx5-ib_virt.c.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/ib_virt.c + +Change-Id: I705c86c80cdb9b43c17d667400d92e1b4181eb10 +--- + drivers/infiniband/hw/mlx5/ib_virt.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/infiniband/hw/mlx5/ib_virt.c ++++ b/drivers/infiniband/hw/mlx5/ib_virt.c +@@ -149,6 +149,7 @@ ex: + return err; + } + ++#ifdef HAVE_IFLA_VF_IB_NODE_PORT_GUID + static int set_vf_node_guid(struct ib_device *device, int vf, u32 port, + u64 guid) + { +@@ -223,3 +224,6 @@ int mlx5_ib_get_vf_guid(struct ib_device + + return 0; + } ++ ++#endif ++ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0046-BACKPORT-drivers-infiniband-hw-mlx5-main.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0046-BACKPORT-drivers-infiniband-hw-mlx5-main.c.patch new file mode 100644 index 0000000..0a6112e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0046-BACKPORT-drivers-infiniband-hw-mlx5-main.c.patch @@ -0,0 +1,96 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/main.c + +Change-Id: Ia4e7f7c6ba5e6c94797a77cb8d8228b195f8b5b3 +--- + drivers/infiniband/hw/mlx5/main.c | 22 +++++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/hw/mlx5/main.c ++++ b/drivers/infiniband/hw/mlx5/main.c +@@ -65,6 +65,9 @@ MODULE_ALIAS("auxiliary:mlx5_core.rdma") + MODULE_ALIAS("auxiliary:mlx5_core.multiport"); + MODULE_ALIAS("auxiliary:mlx5_core.rdma-rep"); + ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + struct mlx5_ib_event_work { + struct work_struct work; + union { +@@ -2204,6 +2207,12 @@ static void mlx5_ib_dealloc_ucontext(str + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_bfreg_info *bfregi; + ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ mutex_lock(&ibcontext->per_mm_list_lock); ++ WARN_ON(!list_empty(&ibcontext->per_mm_list)); ++ mutex_unlock(&ibcontext->per_mm_list_lock); ++#endif ++ + bfregi = &context->bfregi; + mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); + +@@ -2258,9 +2267,11 @@ static int get_extended_index(unsigned l + } + + ++#if defined(HAVE_PUT_TASK_STRUCT_EXPORTED) && defined (HAVE_GET_TASK_PID_EXPORTED) && defined(HAVE_GET_PID_TASK_EXPORTED) + static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) + { + } ++#endif + + static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) + { +@@ -2504,6 +2515,7 @@ static int mlx5_ib_mmap(struct ib_uconte + if (!dev->wc_support) + return -EPERM; + fallthrough; ++ + case MLX5_IB_MMAP_NC_PAGE: + case MLX5_IB_MMAP_REGULAR_PAGE: + return uar_mmap(dev, command, vma, context); +@@ -3827,7 +3839,9 @@ static const struct uapi_definition mlx5 + static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) + { + mlx5_ib_cleanup_multiport_master(dev); ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + WARN_ON(!xa_empty(&dev->odp_mkeys)); ++#endif + mutex_destroy(&dev->cap_mask_mutex); + WARN_ON(!xa_empty(&dev->sig_mrs)); + WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES)); +@@ -3931,7 +3945,9 @@ static const struct ib_device_ops mlx5_i + .destroy_qp = mlx5_ib_destroy_qp, + .destroy_srq = mlx5_ib_destroy_srq, + .detach_mcast = mlx5_ib_mcg_detach, ++#if defined(HAVE_PUT_TASK_STRUCT_EXPORTED) && defined (HAVE_GET_TASK_PID_EXPORTED) && defined(HAVE_GET_PID_TASK_EXPORTED) + .disassociate_ucontext = mlx5_ib_disassociate_ucontext, ++#endif + .drain_rq = 
mlx5_ib_drain_rq, + .drain_sq = mlx5_ib_drain_sq, + .device_group = &mlx5_attr_group, +@@ -3961,7 +3977,9 @@ static const struct ib_device_ops mlx5_i + .query_srq = mlx5_ib_query_srq, + .query_ucontext = mlx5_ib_query_ucontext, + .reg_user_mr = mlx5_ib_reg_user_mr, ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + .reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf, ++#endif + .req_notify_cq = mlx5_ib_arm_cq, + .rereg_user_mr = mlx5_ib_rereg_user_mr, + .resize_cq = mlx5_ib_resize_cq, +@@ -3992,9 +4010,11 @@ static const struct ib_device_ops mlx5_i + + static const struct ib_device_ops mlx5_ib_dev_sriov_ops = { + .get_vf_config = mlx5_ib_get_vf_config, +- .get_vf_guid = mlx5_ib_get_vf_guid, + .get_vf_stats = mlx5_ib_get_vf_stats, ++#ifdef HAVE_IFLA_VF_IB_NODE_PORT_GUID ++ .get_vf_guid = mlx5_ib_get_vf_guid, + .set_vf_guid = mlx5_ib_set_vf_guid, ++#endif + .set_vf_link_state = mlx5_ib_set_vf_link_state, + }; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0047-BACKPORT-drivers-infiniband-hw-mlx5-mem.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0047-BACKPORT-drivers-infiniband-hw-mlx5-mem.c.patch new file mode 100644 index 0000000..d4c7e6f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0047-BACKPORT-drivers-infiniband-hw-mlx5-mem.c.patch @@ -0,0 +1,22 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/mem.c + +Change-Id: I89a447f5220cf140305aa19e8e51fe3e19cee63f +--- + drivers/infiniband/hw/mlx5/mem.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/infiniband/hw/mlx5/mem.c ++++ b/drivers/infiniband/hw/mlx5/mem.c +@@ -220,7 +220,11 @@ int mlx5_ib_test_wc(struct mlx5_ib_dev * + if (!dev->mdev->roce.roce_en && + port_type_cap == MLX5_CAP_PORT_TYPE_ETH) { + if (mlx5_core_is_pf(dev->mdev)) ++#ifdef arch_can_pci_mmap_wc + dev->wc_support = arch_can_pci_mmap_wc(); ++#else ++ dev->wc_support = true; ++#endif + return 0; + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0048-BACKPORT-drivers-infiniband-hw-mlx5-mlx5_ib.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0048-BACKPORT-drivers-infiniband-hw-mlx5-mlx5_ib.h.patch new file mode 100644 index 0000000..7b83955 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0048-BACKPORT-drivers-infiniband-hw-mlx5-mlx5_ib.h.patch @@ -0,0 +1,66 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/mlx5_ib.h + +Change-Id: If385cc6e3ca1e4c8ecbd11dd4422841c33a69f8f +--- + drivers/infiniband/hw/mlx5/mlx5_ib.h | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h ++++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h +@@ -656,7 +656,7 @@ struct mlx5_ib_mkey { + u32 key; + enum mlx5_mkey_type type; + unsigned int ndescs; +- struct wait_queue_head wait; ++ wait_queue_head_t wait; + refcount_t usecount; + }; + +@@ -1283,7 +1283,8 @@ to_mmmap(struct rdma_user_mmap_entry *rd + struct mlx5_user_mmap_entry, rdma_entry); + } + +-int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, ++int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, ++ struct ib_udata *udata, unsigned long virt, + struct mlx5_db *db); + void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); + void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +@@ -1348,6 +1349,7 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr + int page_shift, int flags); + int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); + struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, ++ struct 
ib_udata *udata, + int access_flags); + void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); + void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); +@@ -1422,6 +1424,10 @@ int mlx5r_odp_create_eq(struct mlx5_ib_d + void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); + int __init mlx5_ib_odp_init(void); + void mlx5_ib_odp_cleanup(void); ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, ++ unsigned long end); ++#endif + void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); + void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags); +@@ -1460,10 +1466,17 @@ static inline int mlx5_ib_init_dmabuf_mr + { + return -EOPNOTSUPP; + } ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, ++ unsigned long start, ++ unsigned long end){}; ++#endif + #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + extern const struct mmu_interval_notifier_ops mlx5_mn_ops; + ++#endif + /* Needed for rep profile */ + void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0049-BACKPORT-drivers-infiniband-hw-mlx5-mr.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0049-BACKPORT-drivers-infiniband-hw-mlx5-mr.c.patch new file mode 100644 index 0000000..2eee893 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0049-BACKPORT-drivers-infiniband-hw-mlx5-mr.c.patch @@ -0,0 +1,230 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/mr.c + +Change-Id: I2fb7d75fcbc1a34d2783042afe265b37b5f09fae +--- + drivers/infiniband/hw/mlx5/mr.c | 70 ++++++++++++++++++++++++++++----- + 1 file changed, 61 insertions(+), 9 deletions(-) + +--- a/drivers/infiniband/hw/mlx5/mr.c ++++ b/drivers/infiniband/hw/mlx5/mr.c +@@ -40,7 +40,9 @@ + #include + #include + #include ++#ifdef HAVE_DMA_RESV_H + #include ++#endif + #include + #include + #include +@@ -601,12 +603,19 @@ static void clean_keys(struct mlx5_ib_de + kfree(mr); + } + } +- ++#ifdef HAVE_TIMER_SETUP + static void delay_time_func(struct timer_list *t) ++#else ++static void delay_time_func(unsigned long ctx) ++#endif + { +- struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); ++#ifdef HAVE_TIMER_SETUP ++ struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); ++#else ++ struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; ++#endif + +- WRITE_ONCE(dev->fill_delay, 0); ++ WRITE_ONCE(dev->fill_delay, 0); + } + + int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +@@ -624,7 +633,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_de + } + + mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); ++#ifdef HAVE_TIMER_SETUP + timer_setup(&dev->delay_timer, delay_time_func, 0); ++#else ++ setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); ++#endif + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); +@@ -936,8 +949,9 @@ static void *mlx5_ib_alloc_xlt(size_t *n + size_t size; + void *res = NULL; + ++#ifdef HAVE_STATIC_ASSERT + static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0); +- ++#endif + /* + * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the + * allocation can't trigger any kind of reclaim. 
+@@ -1097,13 +1111,14 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr + return -ENOMEM; + pages_iter = sg.length / desc_size; + orig_sg_length = sg.length; +- ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) { + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + size_t max_pages = ib_umem_odp_num_pages(odp) - idx; + + pages_to_map = min_t(size_t, pages_to_map, max_pages); + } ++#endif + + wr.page_shift = page_shift; + +@@ -1163,8 +1178,12 @@ int mlx5_ib_update_mr_pas(struct mlx5_ib + orig_sg_length = sg.length; + + cur_mtt = mtt; ++#ifdef HAVE_SG_APPEND_TABLE + rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter, + mr->umem->sgt_append.sgt.nents, ++#else ++ rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap, ++#endif + BIT(mr->page_shift)) { + if (cur_mtt == (void *)mtt + sg.length) { + dma_sync_single_for_device(ddev, sg.addr, sg.length, +@@ -1442,6 +1461,7 @@ static struct ib_mr *create_real_mr(stru + return &mr->ibmr; + } + ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata) +@@ -1463,7 +1483,7 @@ static struct ib_mr *create_user_odp_mr( + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return ERR_PTR(-EINVAL); + +- mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); ++ mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags); + if (IS_ERR(mr)) + return ERR_CAST(mr); + return &mr->ibmr; +@@ -1473,8 +1493,12 @@ static struct ib_mr *create_user_odp_mr( + if (!mlx5_ib_can_load_pas_with_umr(dev, length)) + return ERR_PTR(-EINVAL); + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, + &mlx5_mn_ops); ++#else ++ odp = ib_umem_odp_get(udata, start, length, access_flags); ++#endif + if (IS_ERR(odp)) + return ERR_CAST(odp); + +@@ -1499,6 +1523,7 @@ err_dereg_mr: + mlx5_ib_dereg_mr(&mr->ibmr, NULL); + return ERR_PTR(err); + } ++#endif + + struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, int access_flags, +@@ -1512,17 +1537,25 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct + + mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, iova, length, access_flags); +- ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if ((access_flags & IB_ACCESS_ON_DEMAND) && (dev->profile != &raw_eth_profile)) + return create_user_odp_mr(pd, start, length, iova, access_flags, + udata); ++#endif ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem = ib_umem_get_peer(&dev->ib_dev, start, length, access_flags, ++#else ++ umem = ib_umem_get_peer(udata, start, length, access_flags, ++#endif ++ + IB_PEER_MEM_INVAL_SUPP); + if (IS_ERR(umem)) + return ERR_CAST(umem); + return create_real_mr(pd, umem, iova, access_flags); + } + ++ ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) + { + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; +@@ -1538,10 +1571,11 @@ static void mlx5_ib_dmabuf_invalidate_cb + } + + static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { ++#ifdef HAVE_DMA_BUF_ATTACH_OPS_ALLOW_PEER2PEER + .allow_peer2peer = 1, ++#endif + .move_notify = mlx5_ib_dmabuf_invalidate_cb, + }; +- + struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags, +@@ -1597,6 +1631,7 @@ err_dereg_mr: + mlx5_ib_dereg_mr(&mr->ibmr, NULL); + return 
ERR_PTR(err); + } ++#endif + + /** + * revoke_mr - Fence all DMA on the MR +@@ -1809,8 +1844,11 @@ struct ib_mr *mlx5_ib_rereg_user_mr(stru + can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { + struct ib_umem *new_umem; + unsigned long page_size; +- ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + new_umem = ib_umem_get_peer(&dev->ib_dev, start, length, ++#else ++ new_umem = ib_umem_get_peer(udata, start, length, ++#endif + new_access_flags, + IB_PEER_MEM_INVAL_SUPP); + if (IS_ERR(new_umem)) +@@ -2912,10 +2950,17 @@ static struct attribute *order_default_a + &order_attr_size.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(order_default); ++#endif + + static struct kobj_type order_type = { + .sysfs_ops = &order_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = order_default_groups ++#else + .default_attrs = order_default_attrs ++#endif + }; + + +@@ -3044,10 +3089,17 @@ static struct attribute *cache_default_a + &cache_attr_rel_timeout.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(cache_default); ++#endif + + static struct kobj_type cache_type = { + .sysfs_ops = &cache_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = cache_default_groups ++#else + .default_attrs = cache_default_attrs ++#endif + }; + + static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0050-BACKPORT-drivers-infiniband-hw-mlx5-odp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0050-BACKPORT-drivers-infiniband-hw-mlx5-odp.c.patch new file mode 100644 index 0000000..7d73cdd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0050-BACKPORT-drivers-infiniband-hw-mlx5-odp.c.patch @@ -0,0 +1,293 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/odp.c + +Change-Id: I7108ba2f71f205e4c22d6b6a6147065af521c30c +--- + drivers/infiniband/hw/mlx5/odp.c | 120 +++++++++++++++++++++++++++++++ + 1 file changed, 120 insertions(+) + +--- a/drivers/infiniband/hw/mlx5/odp.c ++++ b/drivers/infiniband/hw/mlx5/odp.c +@@ -32,9 +32,16 @@ + + #include + #include ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) ++#ifndef HAVE_MMPUT_ASYNC_EXPORTED ++#include ++#endif ++#endif + #include ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + #include + #include ++#endif + + #include "mlx5_ib.h" + #include "cmd.h" +@@ -223,27 +230,42 @@ static void destroy_unused_implicit_chil + queue_work(system_unbound_wq, &mr->odp_destroy.work); + } + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) ++#else ++void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, ++ unsigned long end) ++#endif + { ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem_odp *umem_odp = + container_of(mni, struct ib_umem_odp, notifier); ++#endif + struct mlx5_ib_mr *mr; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / + sizeof(struct mlx5_mtt)) - 1; + u64 idx = 0, blk_start_idx = 0; + u64 invalidations = 0; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + unsigned long start; + unsigned long end; ++#endif + int in_block = 0; + u64 addr; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++#ifdef HAVE_MMU_NOTIFIER_RANGE_BLOCKABLE + if (!mmu_notifier_range_blockable(range)) + return false; + ++#endif ++#endif /* HAVE_MMU_INTERVAL_NOTIFIER */ + mutex_lock(&umem_odp->umem_mutex); ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + mmu_interval_set_seq(mni, 
cur_seq); ++#endif + /* + * If npages is zero then umem_odp->private may not be setup yet. This + * does not complete until after the first page is mapped for DMA. +@@ -252,8 +274,13 @@ static bool mlx5_ib_invalidate_range(str + goto out; + mr = umem_odp->private; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + start = max_t(u64, ib_umem_start(umem_odp), range->start); + end = min_t(u64, ib_umem_end(umem_odp), range->end); ++#else ++ start = max_t(u64, ib_umem_start(umem_odp), start); ++ end = min_t(u64, ib_umem_end(umem_odp), end); ++#endif + + /* + * Iteration one - zap the HW's MTTs. The notifiers_count ensures that +@@ -310,12 +337,16 @@ static bool mlx5_ib_invalidate_range(str + destroy_unused_implicit_child_mr(mr); + out: + mutex_unlock(&umem_odp->umem_mutex); ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + return true; ++#endif + } + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + const struct mmu_interval_notifier_ops mlx5_mn_ops = { + .invalidate = mlx5_ib_invalidate_range, + }; ++#endif + + static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) + { +@@ -414,7 +445,11 @@ static struct mlx5_ib_mr *implicit_get_c + + odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), + idx * MLX5_IMR_MTT_SIZE, ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); ++#else ++ MLX5_IMR_MTT_SIZE); ++#endif + if (IS_ERR(odp)) + return ERR_CAST(odp); + +@@ -478,6 +513,7 @@ out_mr: + } + + struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, ++ struct ib_udata *udata, + int access_flags) + { + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); +@@ -489,7 +525,11 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implici + MLX5_IMR_MTT_ENTRIES * PAGE_SIZE)) + return ERR_PTR(-EOPNOTSUPP); + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); ++#else ++ umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); ++#endif + if (IS_ERR(umem_odp)) + return ERR_CAST(umem_odp); + +@@ -552,11 +592,23 @@ static int pagefault_real_mr(struct mlx5 + u64 user_va, size_t bcnt, u32 *bytes_mapped, + u32 flags) + { ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + int page_shift, ret, np; ++#else ++ int current_seq, page_shift, ret, np; ++#endif + bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && !defined(HAVE_HMM_RANGE_FAULT_SUPPORT) ++ unsigned long current_seq; ++#endif + u64 access_mask; + u64 start_idx; ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT); ++#ifndef HAVE_MMPUT_ASYNC_EXPORTED ++ struct mm_struct *owning_mm = odp->umem.owning_mm; ++#endif ++#endif + u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC; + + if (flags & MLX5_PF_FLAGS_ENABLE) +@@ -569,16 +621,61 @@ static int pagefault_real_mr(struct mlx5 + if (odp->umem.writable && !downgrade) + access_mask |= ODP_WRITE_ALLOWED_BIT; + ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) ++#ifndef HAVE_MMPUT_ASYNC_EXPORTED ++ if (!mmget_not_zero(owning_mm)) ++ return -EINVAL; ++#endif + np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault); ++ if (np < 0) { ++#ifndef HAVE_MMPUT_ASYNC_EXPORTED ++ mmput(owning_mm); ++#endif ++ return np; ++ } ++#else ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ current_seq = mmu_interval_read_begin(&odp->notifier); ++#else ++ current_seq = READ_ONCE(odp->notifiers_seq); ++ smp_rmb(); ++#endif ++ ++ np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask, ++ current_seq); + if (np < 0) + return np; ++#endif + ++#if 
defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + /* + * No need to check whether the MTTs really belong to this MR, since + * ib_umem_odp_map_dma_and_lock already checks this. + */ + ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags); + mutex_unlock(&odp->umem_mutex); ++#ifndef HAVE_MMPUT_ASYNC_EXPORTED ++ mmput(owning_mm); ++#endif ++#else ++ mutex_lock(&odp->umem_mutex); ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ if (!mmu_interval_read_retry(&odp->notifier, current_seq)) { ++#else ++ if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { ++#endif ++ /* ++ * No need to check whether the MTTs really belong to ++ * this MR, since ib_umem_odp_map_dma_pages already ++ * checks this. ++ */ ++ ret = mlx5_ib_update_xlt(mr, start_idx, np, ++ page_shift, xlt_flags); ++ } else { ++ ret = -EAGAIN; ++ } ++ mutex_unlock(&odp->umem_mutex); ++#endif + + if (ret < 0) { + if (ret != -EAGAIN) +@@ -597,6 +694,20 @@ static int pagefault_real_mr(struct mlx5 + return np << (page_shift - PAGE_SHIFT); + + out: ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ if (ret == -EAGAIN) { ++ unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); ++ ++ if (!wait_for_completion_timeout(&odp->notifier_completion, ++ timeout)) { ++ mlx5_ib_warn( ++ mr_to_mdev(mr), ++ "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n", ++ current_seq, odp->notifiers_seq, ++ odp->notifiers_count); ++ } ++ } ++#endif + return ret; + } + +@@ -686,6 +797,7 @@ out: + return ret; + } + ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, + u32 *bytes_mapped, u32 flags) + { +@@ -723,6 +835,7 @@ static int pagefault_dmabuf_mr(struct ml + + return ib_umem_num_pages(mr->umem); + } ++#endif + + /* + * Returns: +@@ -741,8 +854,10 @@ static int pagefault_mr(struct mlx5_ib_m + if (unlikely(io_virt < mr->ibmr.iova)) + return -EFAULT; + ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + if (mr->umem->is_dmabuf) + return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); ++#endif + + if (!odp->is_implicit_odp) { + u64 user_va; +@@ -770,6 +885,7 @@ int mlx5_ib_init_odp_mr(struct mlx5_ib_m + return ret >= 0 ? 0 : ret; + } + ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr) + { + int ret; +@@ -779,6 +895,7 @@ int mlx5_ib_init_dmabuf_mr(struct mlx5_i + + return ret >= 0 ? 
0 : ret; + } ++#endif + + struct pf_frame { + struct pf_frame *next; +@@ -1615,6 +1732,9 @@ void mlx5_odp_init_mr_cache_entry(struct + + static const struct ib_device_ops mlx5_ib_dev_odp_ops = { + .advise_mr = mlx5_ib_advise_mr, ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ .invalidate_range = mlx5_ib_invalidate_range, ++#endif + }; + + int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0051-BACKPORT-drivers-infiniband-hw-mlx5-qp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0051-BACKPORT-drivers-infiniband-hw-mlx5-qp.c.patch new file mode 100644 index 0000000..696f97f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0051-BACKPORT-drivers-infiniband-hw-mlx5-qp.c.patch @@ -0,0 +1,116 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/qp.c + +Change-Id: I4f8290773fff6c89b3b80655e052f86253332b8c +--- + drivers/infiniband/hw/mlx5/qp.c | 37 +++++++++++++++++++++++++++------ + 1 file changed, 31 insertions(+), 6 deletions(-) + +--- a/drivers/infiniband/hw/mlx5/qp.c ++++ b/drivers/infiniband/hw/mlx5/qp.c +@@ -952,8 +952,14 @@ static int create_user_rq(struct mlx5_ib + if (!ucmd->buf_addr) + return -EINVAL; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + rwq->umem = ib_umem_get_peer(&dev->ib_dev, ucmd->buf_addr, + rwq->buf_size, 0, 0); ++#else ++ rwq->umem = ib_umem_get_peer(udata, ucmd->buf_addr, ++ rwq->buf_size, 0, 0); ++#endif ++ + if (IS_ERR(rwq->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(rwq->umem); +@@ -981,7 +987,7 @@ static int create_user_rq(struct mlx5_ib + ib_umem_num_pages(rwq->umem), page_size, rwq->rq_num_pas, + offset); + +- err = mlx5_ib_db_map_user(ucontext, ucmd->db_addr, &rwq->db); ++ err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_umem; +@@ -1064,8 +1070,13 @@ static int _create_user_qp(struct mlx5_i + if (ucmd->buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd->buf_addr; + ubuffer->umem = ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr, + ubuffer->buf_size, 0, 0); ++#else ++ ib_umem_get_peer(udata, ubuffer->buf_addr, ++ ubuffer->buf_size, 0, 0); ++#endif + if (IS_ERR(ubuffer->umem)) { + err = PTR_ERR(ubuffer->umem); + goto err_bfreg; +@@ -1108,7 +1119,7 @@ static int _create_user_qp(struct mlx5_i + resp->bfreg_index = MLX5_IB_INVALID_BFREG; + qp->bfregn = bfregn; + +- err = mlx5_ib_db_map_user(context, ucmd->db_addr, &qp->db); ++ err = mlx5_ib_db_map_user(context, udata, ucmd->db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; +@@ -1419,8 +1430,13 @@ static int create_raw_packet_qp_sq(struc + if (ts_format < 0) + return ts_format; + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + sq->ubuffer.umem = ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr, + ubuffer->buf_size, 0, 0); ++#else ++ sq->ubuffer.umem = ib_umem_get_peer(udata, ubuffer->buf_addr, ++ ubuffer->buf_size, 0, 0); ++#endif + if (IS_ERR(sq->ubuffer.umem)) + return PTR_ERR(sq->ubuffer.umem); + page_size = mlx5_umem_find_best_quantized_pgoff( +@@ -2145,10 +2161,10 @@ static int create_dci(struct mlx5_ib_dev + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + struct mlx5_ib_qp_base *base; +- int ts_format; ++ int ts_format = 0; + int mlx5_st; + void *qpc; +- u32 *in; ++ u32 *in = NULL; + int err; + + spin_lock_init(&qp->sq.lock); +@@ -2306,10 +2322,10 @@ static int create_user_qp(struct mlx5_ib + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + struct mlx5_ib_qp_base *base; 
+- int ts_format; ++ int ts_format = 0; + int mlx5_st; + void *qpc; +- u32 *in; ++ u32 *in = NULL; + int err; + + spin_lock_init(&qp->sq.lock); +@@ -5826,9 +5842,18 @@ static void handle_drain_completion(stru + if (triggered) { + /* Wait for any scheduled/running task to be ended */ + switch (cq->poll_ctx) { ++#if IS_ENABLED(CONFIG_IRQ_POLL) || !defined(HAVE_IRQ_POLL_H) + case IB_POLL_SOFTIRQ: ++#if defined(HAVE_IRQ_POLL_H) ++#if IS_ENABLED(CONFIG_IRQ_POLL) + irq_poll_disable(&cq->iop); + irq_poll_enable(&cq->iop); ++#endif ++#else ++ blk_iopoll_disable(&cq->iop); ++ blk_iopoll_enable(&cq->iop); ++#endif ++#endif + break; + case IB_POLL_WORKQUEUE: + cancel_work_sync(&cq->work); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0052-BACKPORT-drivers-infiniband-hw-mlx5-srq.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0052-BACKPORT-drivers-infiniband-hw-mlx5-srq.c.patch new file mode 100644 index 0000000..04cbdf3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0052-BACKPORT-drivers-infiniband-hw-mlx5-srq.c.patch @@ -0,0 +1,31 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/srq.c + +Change-Id: I52d8219e1d58d0e85875b0eb6d5ea7f9d33f59e1 +--- + drivers/infiniband/hw/mlx5/srq.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/hw/mlx5/srq.c ++++ b/drivers/infiniband/hw/mlx5/srq.c +@@ -76,7 +76,11 @@ static int create_srq_user(struct ib_pd + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + srq->umem = ib_umem_get_peer(pd->device, ucmd.buf_addr, buf_size, 0, 0); ++#else ++ srq->umem = ib_umem_get_peer(udata, ucmd.buf_addr, buf_size, 0, 0); ++#endif + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); +@@ -84,7 +88,7 @@ static int create_srq_user(struct ib_pd + } + in->umem = srq->umem; + +- err = mlx5_ib_db_map_user(ucontext, ucmd.db_addr, &srq->db); ++ err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_umem; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0053-BACKPORT-drivers-infiniband-hw-mlx5-srq_cmd.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0053-BACKPORT-drivers-infiniband-hw-mlx5-srq_cmd.c.patch new file mode 100644 index 0000000..64c7eb5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0053-BACKPORT-drivers-infiniband-hw-mlx5-srq_cmd.c.patch @@ -0,0 +1,19 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/srq_cmd.c + +Change-Id: I7a6f1c133f834a3a39d88aec4cc7e5e0a99636fe +--- + drivers/infiniband/hw/mlx5/srq_cmd.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/infiniband/hw/mlx5/srq_cmd.c ++++ b/drivers/infiniband/hw/mlx5/srq_cmd.c +@@ -481,7 +481,7 @@ static int create_xrq_cmd(struct mlx5_ib + void *xrqc; + void *wq; + void *pas; +- int pas_size, rq_pas_size; ++ int pas_size, rq_pas_size = 0; + int inlen; + int err; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0054-BACKPORT-drivers-infiniband-hw-mlx5-wr.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0054-BACKPORT-drivers-infiniband-hw-mlx5-wr.c.patch new file mode 100644 index 0000000..2c971cd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0054-BACKPORT-drivers-infiniband-hw-mlx5-wr.c.patch @@ -0,0 +1,18 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/wr.c + +Change-Id: Ib34ec9ec88f0dc773a11334a81603496849b753d +--- + drivers/infiniband/hw/mlx5/wr.c | 1 + + 1 file changed, 1 
insertion(+) + +--- a/drivers/infiniband/hw/mlx5/wr.c ++++ b/drivers/infiniband/hw/mlx5/wr.c +@@ -1371,6 +1371,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + fallthrough; ++ + case IB_QPT_RC: + err = handle_qpt_rc(dev, qp, wr, &ctrl, &seg, &size, + &cur_edge, &idx, nreq, fence, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0055-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0055-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib.h.patch new file mode 100644 index 0000000..4906cde --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0055-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib.h.patch @@ -0,0 +1,68 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib.h + +Change-Id: I1b6d4578cd1cce7f8d55e7626eb13b7f42949758 +--- + drivers/infiniband/ulp/ipoib/ipoib.h | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +--- a/drivers/infiniband/ulp/ipoib/ipoib.h ++++ b/drivers/infiniband/ulp/ipoib/ipoib.h +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -52,6 +53,9 @@ + #include + #include + #include ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++#include ++#endif + /* constants */ + + enum ipoib_flush_level { +@@ -117,8 +121,19 @@ enum { + IPOIB_NON_CHILD = 0, + IPOIB_LEGACY_CHILD = 1, + IPOIB_RTNL_CHILD = 2, ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ IPOIB_MAX_LRO_DESCRIPTORS = 8, ++ IPOIB_LRO_MAX_AGGR = 64, ++#endif + }; + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++struct ipoib_lro { ++ struct net_lro_mgr lro_mgr; ++ struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; ++}; ++#endif ++ + #define IPOIB_OP_RECV (1ul << 31) + #ifdef CONFIG_INFINIBAND_IPOIB_CM + #define IPOIB_OP_CM (1ul << 30) +@@ -428,6 +443,12 @@ struct ipoib_dev_priv { + u32 recvq_size; + unsigned int max_send_sge; + const struct net_device_ops *rn_ops; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct ipoib_lro lro; ++#endif ++#ifndef HAVE_NDO_GET_STATS64 ++ struct net_device_stats ret_stats; ++#endif + }; + + struct ipoib_ah { +@@ -864,5 +885,6 @@ extern int ipoib_debug_level; + #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ + + #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) ++extern int ipoib_enhanced_enabled; + + #endif /* _IPOIB_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0056-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_cm.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0056-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_cm.c.patch new file mode 100644 index 0000000..6962ea9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0056-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_cm.c.patch @@ -0,0 +1,105 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_cm.c + +Change-Id: I931ba64b05dedee321abcb58730ef8fc3c5c413a +--- + drivers/infiniband/ulp/ipoib/ipoib_cm.c | 43 +++++++++++++++++++++---- + 1 file changed, 36 insertions(+), 7 deletions(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c +@@ -39,7 +39,9 @@ + #include + #include + #include ++#ifdef HAVE_MEMALLOC_NOIO_SAVE + #include ++#endif + + #include "ipoib.h" + +@@ -1079,7 +1081,11 @@ static struct ib_qp *ipoib_cm_create_tx_ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = tx, ++#ifdef HAVE_MEMALLOC_NOIO_SAVE + .create_flags = 0 ++#else ++ .create_flags = IB_QP_CREATE_USE_GFP_NOIO ++#endif + }; + struct ib_qp *tx_qp; + +@@ -1088,6 +1094,12 @@ static struct ib_qp 
*ipoib_cm_create_tx_ + MAX_SKB_FRAGS + 1); + + tx_qp = ib_create_qp(priv->pd, &attr); ++#ifndef HAVE_MEMALLOC_NOIO_SAVE ++ if (PTR_ERR(tx_qp)) { ++ attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO; ++ tx_qp = ib_create_qp(priv->pd, &attr); ++ } ++#endif + tx->max_send_sge = attr.cap.max_send_sge; + return tx_qp; + } +@@ -1154,19 +1166,30 @@ static int ipoib_cm_tx_init(struct ipoib + struct sa_path_rec *pathrec) + { + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); +- unsigned int noio_flag; +- int ret; ++#ifdef HAVE_MEMALLOC_NOIO_SAVE ++ unsigned int noio_flag; ++#endif ++ int ret; + +- noio_flag = memalloc_noio_save(); +- p->tx_ring = vzalloc(array_size(priv->sendq_size, sizeof(*p->tx_ring))); ++#ifdef HAVE_MEMALLOC_NOIO_SAVE ++ noio_flag = memalloc_noio_save(); ++ p->tx_ring = vzalloc(array_size(priv->sendq_size, sizeof(*p->tx_ring))); ++#else ++ p->tx_ring = __vmalloc(priv->sendq_size * sizeof(*p->tx_ring), ++ GFP_NOIO, PAGE_KERNEL); ++#endif + if (!p->tx_ring) { +- memalloc_noio_restore(noio_flag); ++#ifdef HAVE_MEMALLOC_NOIO_SAVE ++ memalloc_noio_restore(noio_flag); ++#endif + ret = -ENOMEM; + goto err_tx; + } + + p->qp = ipoib_cm_create_tx_qp(p->dev, p); +- memalloc_noio_restore(noio_flag); ++#ifdef HAVE_MEMALLOC_NOIO_SAVE ++ memalloc_noio_restore(noio_flag); ++#endif + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to create tx qp: %d\n", ret); +@@ -1630,7 +1653,9 @@ int ipoib_cm_dev_init(struct net_device + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int max_srq_sge, i; ++#ifdef HAVE_DEV_ADDR_MOD + u8 addr; ++#endif + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); +@@ -1684,8 +1709,12 @@ int ipoib_cm_dev_init(struct net_device + } + } + ++#ifdef HAVE_DEV_ADDR_MOD + addr = IPOIB_FLAGS_RC; + dev_addr_mod(dev, 0, &addr, 1); ++#else ++ priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; ++#endif + return 0; + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0057-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ethtool..patch b/src/mlnx-ofa_kernel-5.8/backports/0057-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ethtool..patch new file mode 100644 index 0000000..d9947ca --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0057-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ethtool..patch @@ -0,0 +1,160 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_ethtool.c + +Change-Id: I82df225cbec7991170ccfa59f9f1efa2364d8fc1 +--- + drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 66 ++++++++++++++++++++ + 1 file changed, 66 insertions(+) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +@@ -59,9 +59,13 @@ static const struct ipoib_stats ipoib_gs + #define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) + + static int ipoib_set_ring_param(struct net_device *dev, ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *ringparam) ++#endif + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned int new_recvq_size, new_sendq_size; +@@ -102,7 +106,11 @@ static int ipoib_set_ring_param(struct n + priv_current_flags = priv->flags; + dev_current_flags = dev->flags; + ++#ifdef HAVE_DEV_CHANGE_FLAGS_HAS_3_PARAMS + dev_change_flags(dev, dev->flags & ~IFF_UP, NULL); ++#else ++ dev_change_flags(dev, dev->flags & ~IFF_UP); ++#endif + priv->rn_ops->ndo_uninit(dev); + + do { +@@ -138,16 +146,24 @@ static int 
ipoib_set_ring_param(struct n + dev->name, new_recvq_size, new_sendq_size); + + if (dev_current_flags & IFF_UP) ++#ifdef HAVE_DEV_CHANGE_FLAGS_HAS_3_PARAMS + dev_change_flags(dev, dev_current_flags, NULL); ++#else ++ dev_change_flags(dev, dev_current_flags); ++#endif + } + + return 0; + } + + static void ipoib_get_ring_param(struct net_device *dev, ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *ringparam) ++#endif + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + +@@ -175,9 +191,13 @@ static void ipoib_get_drvinfo(struct net + } + + static int ipoib_get_coalesce(struct net_device *dev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + +@@ -188,9 +208,13 @@ static int ipoib_get_coalesce(struct net + } + + static int ipoib_set_coalesce(struct net_device *dev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; +@@ -216,6 +240,39 @@ static int ipoib_set_coalesce(struct net + + return 0; + } ++ ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++static int ipoib_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd) ++{ ++ struct ipoib_dev_priv *priv = ipoib_priv(dev); ++ struct ib_port_attr attr; ++ char *speed = ""; ++ int rate;/* in deci-Gb/sec */ ++ int ret; ++ ++ ret = ib_query_port(priv->ca, priv->port, &attr); ++ if (ret) ++ return ret; ++ ++ ecmd->duplex = DUPLEX_FULL; ++ ecmd->autoneg = AUTONEG_DISABLE; ++ ecmd->phy_address = 255; ++ ecmd->port = PORT_OTHER;/* till define IB port type */ ++ ++ ib_active_speed_enum_to_rate(attr.active_speed, ++ &rate, ++ &speed); ++ ++ rate *= ib_width_enum_to_int(attr.active_width); ++ if (rate < 0) ++ rate = -1; ++ ++ ethtool_cmd_speed_set(ecmd, rate * 100); ++ ++ return 0; ++} ++#endif ++ + static void ipoib_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats __always_unused *stats, + u64 *data) +@@ -282,6 +339,7 @@ static inline int ib_speed_enum_to_int(i + return SPEED_UNKNOWN; + } + ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + static int ipoib_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) + { +@@ -319,14 +377,22 @@ static int ipoib_get_link_ksettings(stru + + return 0; + } ++#endif + + static const struct ethtool_ops ipoib_ethtool_ops = { ++#ifdef HAVE_SUPPORTED_COALESCE_PARAM + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS | + ETHTOOL_COALESCE_RX_MAX_FRAMES, ++#endif ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + .get_link_ksettings = ipoib_get_link_ksettings, ++#endif + .get_drvinfo = ipoib_get_drvinfo, + .get_coalesce = ipoib_get_coalesce, + .set_coalesce = ipoib_set_coalesce, ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++ .get_settings = ipoib_get_settings, ++#endif + .get_strings = ipoib_get_strings, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_sset_count = ipoib_get_sset_count, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0058-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_fs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0058-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_fs.c.patch new file mode 100644 index 
0000000..b4d9908 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0058-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_fs.c.patch @@ -0,0 +1,104 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_fs.c + +Change-Id: I3f4f0880c32774d6765e4ede3521500b02b2ed91 +--- + drivers/infiniband/ulp/ipoib/ipoib_fs.c | 66 ++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c +@@ -124,6 +124,7 @@ static int ipoib_mcg_seq_show(struct seq + return 0; + } + ++#ifdef DEFINE_SEQ_ATTRIBUTE + static const struct seq_operations ipoib_mcg_sops = { + .start = ipoib_mcg_seq_start, + .next = ipoib_mcg_seq_next, +@@ -133,6 +134,38 @@ static const struct seq_operations ipoib + + DEFINE_SEQ_ATTRIBUTE(ipoib_mcg); + ++#else ++static const struct seq_operations ipoib_mcg_seq_ops = { ++ .start = ipoib_mcg_seq_start, ++ .next = ipoib_mcg_seq_next, ++ .stop = ipoib_mcg_seq_stop, ++ .show = ipoib_mcg_seq_show, ++}; ++ ++static int ipoib_mcg_open(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq; ++ int ret; ++ ++ ret = seq_open(file, &ipoib_mcg_seq_ops); ++ if (ret) ++ return ret; ++ ++ seq = file->private_data; ++ seq->private = inode->i_private; ++ ++ return 0; ++} ++ ++static const struct file_operations ipoib_mcg_fops = { ++ .owner = THIS_MODULE, ++ .open = ipoib_mcg_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release ++}; ++#endif ++ + static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos) + { + struct ipoib_path_iter *iter; +@@ -207,7 +240,7 @@ static int ipoib_path_seq_show(struct se + + return 0; + } +- ++#ifdef DEFINE_SEQ_ATTRIBUTE + static const struct seq_operations ipoib_path_sops = { + .start = ipoib_path_seq_start, + .next = ipoib_path_seq_next, +@@ -216,6 +249,37 @@ static const struct seq_operations ipoib + }; + + DEFINE_SEQ_ATTRIBUTE(ipoib_path); ++#else ++static const struct seq_operations ipoib_path_seq_ops = { ++ .start = ipoib_path_seq_start, ++ .next = ipoib_path_seq_next, ++ .stop = ipoib_path_seq_stop, ++ .show = ipoib_path_seq_show, ++}; ++ ++static int ipoib_path_open(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq; ++ int ret; ++ ++ ret = seq_open(file, &ipoib_path_seq_ops); ++ if (ret) ++ return ret; ++ ++ seq = file->private_data; ++ seq->private = inode->i_private; ++ ++ return 0; ++} ++ ++static const struct file_operations ipoib_path_fops = { ++ .owner = THIS_MODULE, ++ .open = ipoib_path_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release ++}; ++#endif + + void ipoib_create_debug_files(struct net_device *dev) + { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0059-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ib.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0059-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ib.c.patch new file mode 100644 index 0000000..c5f2bf1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0059-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_ib.c.patch @@ -0,0 +1,113 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_ib.c + +Change-Id: I183263414706486820000fcd8fa0e4f127a86042 +--- + drivers/infiniband/ulp/ipoib/ipoib_ib.c | 37 +++++++++++++++++++++---- + 1 file changed, 32 insertions(+), 5 deletions(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c +@@ -202,7 +202,6 @@ static inline void ipoib_create_repath_e + else + kfree(arp_repath); 
+ } +- + static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); +@@ -283,7 +282,6 @@ static void ipoib_ib_handle_rx_wc(struct + } + + skb_pull(skb, IB_GRH_BYTES); +- + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_add_pseudo_hdr(skb); + +@@ -299,8 +297,14 @@ static void ipoib_ib_handle_rx_wc(struct + if ((dev->features & NETIF_F_RXCSUM) && + likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) + skb->ip_summed = CHECKSUM_UNNECESSARY; +- ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (dev->features & NETIF_F_LRO) ++ lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); ++ else ++ netif_receive_skb(skb); ++#else + napi_gro_receive(&priv->recv_napi, skb); ++#endif + + repost: + if (unlikely(ipoib_ib_post_receive(dev, wr_id))) +@@ -329,8 +333,12 @@ int ipoib_dma_map_tx(struct ib_device *c + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, + skb_frag_page(frag), +- skb_frag_off(frag), +- skb_frag_size(frag), ++#ifdef HAVE_SKB_FRAG_OFF ++ skb_frag_off(frag), ++ skb_frag_size(frag), ++#else ++ frag->page_offset, skb_frag_size(frag), ++#endif + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; +@@ -521,6 +529,10 @@ poll_more: + } + + if (done < budget) { ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (dev->features & NETIF_F_LRO) ++ lro_flush_all(&priv->lro.lro_mgr); ++#endif + napi_complete(napi); + if (unlikely(ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | +@@ -1141,11 +1153,17 @@ static bool ipoib_dev_addr_changed_valid + { + union ib_gid search_gid; + union ib_gid gid0; ++#ifndef HAVE_DEV_ADDR_MOD ++ union ib_gid *netdev_gid; ++#endif + int err; + u16 index; + u32 port; + bool ret = false; + ++#ifndef HAVE_DEV_ADDR_MOD ++ netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4); ++#endif + if (rdma_query_gid(priv->ca, priv->port, 0, &gid0)) + return false; + +@@ -1155,8 +1173,12 @@ static bool ipoib_dev_addr_changed_valid + * to do it later + */ + priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix; ++#ifdef HAVE_DEV_ADDR_MOD + dev_addr_mod(priv->dev, 4, (u8 *)&gid0.global.subnet_prefix, + sizeof(gid0.global.subnet_prefix)); ++#else ++ netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix; ++#endif + search_gid.global.subnet_prefix = gid0.global.subnet_prefix; + + search_gid.global.interface_id = priv->local_gid.global.interface_id; +@@ -1218,8 +1240,13 @@ static bool ipoib_dev_addr_changed_valid + if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) { + memcpy(&priv->local_gid, &gid0, + sizeof(priv->local_gid)); ++#ifdef HAVE_DEV_ADDR_MOD + dev_addr_mod(priv->dev, 4, (u8 *)&gid0, + sizeof(priv->local_gid)); ++#else ++ memcpy(priv->dev->dev_addr + 4, &gid0, ++ sizeof(priv->local_gid)); ++#endif + ret = true; + } + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0060-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_main.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0060-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_main.c.patch new file mode 100644 index 0000000..b50be1d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0060-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_main.c.patch @@ -0,0 +1,773 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_main.c + +Change-Id: Icd772e7f5a73b8d1819c95f06813958fc04cf988 +--- + drivers/infiniband/ulp/ipoib/ipoib_main.c | 365 ++++++++++++++++++++-- + 1 file changed, 335 insertions(+), 30 deletions(-) + +--- 
a/drivers/infiniband/ulp/ipoib/ipoib_main.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c +@@ -56,6 +56,9 @@ + MODULE_AUTHOR("Roland Dreier"); + MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; + int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; +@@ -101,7 +104,9 @@ static struct net_device *ipoib_get_net_ + struct ib_device *dev, u32 port, u16 pkey, + const union ib_gid *gid, const struct sockaddr *addr, + void *client_data); ++#if LINUX_VERSION_CODE > KERNEL_VERSION(4, 7, 0) + static int ipoib_set_mac(struct net_device *dev, void *addr); ++#endif + static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, + int cmd); + +@@ -116,8 +121,12 @@ static struct ib_client ipoib_client = { + static int ipoib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) + { ++#ifdef HAVE_NETDEV_NOTIFIER_INFO + struct netdev_notifier_info *ni = ptr; + struct net_device *dev = ni->dev; ++#else ++ struct net_device *dev = ptr; ++#endif + + if (dev->netdev_ops->ndo_open != ipoib_open) + return NOTIFY_DONE; +@@ -169,7 +178,11 @@ int ipoib_open(struct net_device *dev) + if (flags & IFF_UP) + continue; + ++#ifdef HAVE_DEV_CHANGE_FLAGS_HAS_3_PARAMS + dev_change_flags(cpriv->dev, flags | IFF_UP, NULL); ++#else ++ dev_change_flags(cpriv->dev, flags | IFF_UP); ++#endif + } + up_read(&priv->vlan_rwsem); + } else if (priv->parent) { +@@ -214,7 +227,11 @@ static int ipoib_stop(struct net_device + if (!(flags & IFF_UP)) + continue; + ++#ifdef HAVE_DEV_CHANGE_FLAGS_HAS_3_PARAMS + dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL); ++#else ++ dev_change_flags(cpriv->dev, flags & ~IFF_UP); ++#endif + } + up_read(&priv->vlan_rwsem); + } +@@ -265,14 +282,21 @@ static int ipoib_change_mtu(struct net_d + "link layer MTU - 4 (%u)\n", priv->mcast_mtu); + + new_mtu = min(priv->mcast_mtu, priv->admin_mtu); +- ++#ifdef HAVE_NDO_CHANGE_MTU_EXTENDED ++ if (priv->rn_ops->extended.ndo_change_mtu) { ++#else + if (priv->rn_ops->ndo_change_mtu) { ++#endif + bool carrier_status = netif_carrier_ok(dev); + + netif_carrier_off(dev); + + /* notify lower level on the real mtu */ ++#ifdef HAVE_NDO_CHANGE_MTU_EXTENDED ++ ret = priv->rn_ops->extended.ndo_change_mtu(dev, new_mtu); ++#else + ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); ++#endif + + if (carrier_status) + netif_carrier_on(dev); +@@ -283,15 +307,42 @@ static int ipoib_change_mtu(struct net_d + return ret; + } + ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID + static void ipoib_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats) ++#elif defined(HAVE_NDO_GET_STATS64) ++static struct rtnl_link_stats64 * ipoib_get_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *stats) ++#else ++static struct net_device_stats * ipoib_get_stats(struct net_device *dev) ++#endif + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + ++#if !defined(HAVE_NDO_GET_STATS64) && !defined(HAVE_NDO_GET_STATS64_RET_VOID) ++ struct net_device_stats *stats = &priv->ret_stats; ++#endif ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID + if (priv->rn_ops->ndo_get_stats64) + priv->rn_ops->ndo_get_stats64(dev, stats); + else + netdev_stats_to_stats64(stats, &dev->stats); ++#elif defined(HAVE_NDO_GET_STATS64) ++ if (priv->rn_ops->ndo_get_stats64) { ++ return priv->rn_ops->ndo_get_stats64(dev, stats); ++ } else { ++ netdev_stats_to_stats64(stats, ++ &dev->stats); ++ return stats; ++ } ++#else ++ if 
(priv->rn_ops->ndo_get_stats) { ++ return priv->rn_ops->ndo_get_stats(dev); ++ } else { ++ memcpy(stats, &dev->stats, sizeof(priv->ret_stats)); ++ return stats; ++ } ++#endif + } + + /* Called with an RCU read lock taken */ +@@ -310,9 +361,21 @@ static bool ipoib_is_dev_match_addr_rcu( + if (!in_dev) + return false; + ++#ifdef HAVE_INET_CONFIRM_ADDR_EXPORTED ++#ifdef HAVE_INET_CONFIRM_ADDR_5_PARAMS + ret_addr = inet_confirm_addr(net, in_dev, 0, + addr_in->sin_addr.s_addr, + RT_SCOPE_HOST); ++#else ++ ret_addr = inet_confirm_addr(in_dev, 0, ++ addr_in->sin_addr.s_addr, ++ RT_SCOPE_HOST); ++#endif ++#else ++ ret_addr = confirm_addr_indev(in_dev, 0, ++ addr_in->sin_addr.s_addr, ++ RT_SCOPE_HOST); ++#endif + in_dev_put(in_dev); + if (ret_addr) + return true; +@@ -357,10 +420,19 @@ struct ipoib_walk_data { + struct net_device *result; + }; + ++#ifdef HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU + static int ipoib_upper_walk(struct net_device *upper, +- struct netdev_nested_priv *priv) ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT ++ struct netdev_nested_priv *priv) ++#else ++ void *_data) ++#endif + { ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; ++#else ++ struct ipoib_walk_data *data = _data; ++#endif + int ret = 0; + + if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { +@@ -371,6 +443,7 @@ static int ipoib_upper_walk(struct net_d + + return ret; + } ++#endif + + /** + * ipoib_get_net_dev_match_addr - Find a net_device matching +@@ -385,12 +458,19 @@ static int ipoib_upper_walk(struct net_d + static struct net_device *ipoib_get_net_dev_match_addr( + const struct sockaddr *addr, struct net_device *dev) + { ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + struct netdev_nested_priv priv; ++#endif + struct ipoib_walk_data data = { + .addr = addr, + }; ++#ifndef HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU ++ struct net_device *upper; ++#endif + ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT + priv.data = (void *)&data; ++#endif + rcu_read_lock(); + if (ipoib_is_dev_match_addr_rcu(addr, dev)) { + dev_hold(dev); +@@ -398,7 +478,23 @@ static struct net_device *ipoib_get_net_ + goto out; + } + +- netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); ++#ifdef HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU ++ netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, ++#ifdef HAVE_NETDEV_NESTED_PRIV_STRUCT ++ &priv); ++#else ++ &data); ++#endif ++#else ++ for_each_netdev(&init_net, upper) { ++ if (ipoib_is_dev_match_addr_rcu(addr, upper)) { ++ dev_hold(upper); ++ data.result = upper; ++ break; ++ } ++ } ++#endif ++ + out: + rcu_read_unlock(); + return data.result; +@@ -727,7 +823,7 @@ static void push_pseudo_header(struct sk + { + struct ipoib_pseudo_header *phdr; + +- phdr = skb_push(skb, sizeof(*phdr)); ++ phdr = (struct ipoib_pseudo_header *)skb_push(skb, sizeof(*phdr)); + memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); + } + +@@ -1284,15 +1380,21 @@ unref: + return NETDEV_TX_OK; + } + +-static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) ++#ifdef HAVE_NDO_TX_TIMEOUT_GET_2_PARAMS ++ static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) ++#else ++ static void ipoib_timeout(struct net_device *dev) ++#endif + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); ++#ifdef HAVE_NDO_TX_TIMEOUT_GET_2_PARAMS + struct rdma_netdev *rn = netdev_priv(dev); + + if (rn->tx_timeout) { + rn->tx_timeout(dev, txqueue); + return; + } ++#endif + ipoib_warn(priv, "transmit timeout: latency %d msecs\n", + jiffies_to_msecs(jiffies - dev_trans_start(dev))); + 
ipoib_warn(priv, +@@ -1339,7 +1441,13 @@ static int ipoib_hard_header(struct sk_b + { + struct ipoib_header *header; + +- header = skb_push(skb, sizeof(*header)); ++ if (unlikely(dev->hard_header_len != IPOIB_HARD_LEN)) { ++ printk_once(KERN_DEBUG "%s: dropping packet, incorrect header length %u\n", ++ dev->name, dev->hard_header_len); ++ return -EINVAL; ++ } ++ ++ header = (struct ipoib_header *)skb_push(skb, sizeof *header); + + header->proto = htons(type); + header->reserved = 0; +@@ -1378,6 +1486,69 @@ static int ipoib_get_iflink(const struct + return priv->parent->ifindex; + } + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++static int get_skb_hdr(struct sk_buff *skb, void **iphdr, ++ void **tcph, u64 *hdr_flags, void *priv) ++{ ++ unsigned int ip_len; ++ struct iphdr *iph; ++ ++ if (unlikely(skb->protocol != htons(ETH_P_IP))) ++ return -1; ++ ++ /* ++ * In the future we may add an else clause that verifies the ++ * checksum and allows devices which do not calculate checksum ++ * to use LRO. ++ */ ++ if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) ++ return -1; ++ ++ /* Check for non-TCP packet */ ++ skb_reset_network_header(skb); ++ iph = ip_hdr(skb); ++ if (iph->protocol != IPPROTO_TCP) ++ return -1; ++ ++ ip_len = ip_hdrlen(skb); ++ skb_set_transport_header(skb, ip_len); ++ *tcph = tcp_hdr(skb); ++ ++ /* check if IP header and TCP header are complete */ ++ if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) ++ return -1; ++ ++ *hdr_flags = LRO_IPV4 | LRO_TCP; ++ *iphdr = iph; ++ ++ return 0; ++} ++ ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++static void ipoib_lro_setup(struct ipoib_dev_priv *priv) ++{ ++ priv->lro.lro_mgr.max_aggr = IPOIB_LRO_MAX_AGGR; ++ priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS; ++ priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc; ++ priv->lro.lro_mgr.get_skb_header = get_skb_hdr; ++ priv->lro.lro_mgr.features = LRO_F_NAPI; ++ priv->lro.lro_mgr.dev = priv->dev; ++ priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; ++} ++#endif ++ ++void set_lro_features_bit(struct ipoib_dev_priv *priv) ++{ ++ u64 hw_support_lro = 0; ++ hw_support_lro = priv->dev->hw_features & NETIF_F_RXCSUM; ++ if (hw_support_lro) { ++ priv->dev->features |= NETIF_F_LRO; ++ priv->dev->hw_features |= NETIF_F_LRO; ++ priv->dev->wanted_features |= NETIF_F_LRO; ++ } ++} ++#endif ++ + static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) + { + /* +@@ -1830,7 +2001,9 @@ static void ipoib_dev_uninit_default(str + static int ipoib_dev_init_default(struct net_device *dev) + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); ++#ifdef HAVE_DEV_ADDR_MOD + u8 addr_mod[3]; ++#endif + + ipoib_napi_add(dev); + +@@ -1841,6 +2014,10 @@ static int ipoib_dev_init_default(struct + if (!priv->rx_ring) + goto out; + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ ipoib_lro_setup(priv); ++#endif ++ + priv->tx_ring = vzalloc(array_size(priv->sendq_size, + sizeof(*priv->tx_ring))); + if (!priv->tx_ring) { +@@ -1858,10 +2035,16 @@ static int ipoib_dev_init_default(struct + } + + /* after qp created set dev address */ ++#ifdef HAVE_DEV_ADDR_MOD + addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff; + addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff; + addr_mod[2] = (priv->qp->qp_num) & 0xff; + dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod)); ++#else ++ priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; ++ priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; ++ priv->dev->dev_addr[3] = (priv->qp->qp_num) & 0xff; ++#endif + + return 0; + +@@ -1881,10 +2064,17 @@ static int 
ipoib_ioctl(struct net_device + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + ++#ifdef HAVE_NDO_ETH_IOCTL + if (!priv->rn_ops->ndo_eth_ioctl) + return -EOPNOTSUPP; + + return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd); ++#else ++ if (!priv->rn_ops->ndo_do_ioctl) ++ return -EOPNOTSUPP; ++ ++ return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd); ++#endif + } + + static int ipoib_dev_init(struct net_device *dev) +@@ -1968,7 +2158,11 @@ static void ipoib_parent_unregister_pre( + * running ensures the it will not add more work. + */ + rtnl_lock(); ++#ifdef HAVE_DEV_CHANGE_FLAGS_HAS_3_PARAMS + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL); ++#else ++ dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); ++#endif + rtnl_unlock(); + + /* ipoib_event() cannot be running once this returns */ +@@ -1986,12 +2180,12 @@ static void ipoib_set_dev_features(struc + priv->hca_caps = priv->ca->attrs.device_cap_flags; + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { +- priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; ++ priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + +- if (priv->hca_caps & IB_DEVICE_UD_TSO) +- priv->dev->hw_features |= NETIF_F_TSO; ++ if (priv->hca_caps & IB_DEVICE_UD_TSO) ++ priv->dev->hw_features |= NETIF_F_TSO; + +- priv->dev->features |= priv->dev->hw_features; ++ priv->dev->features |= priv->dev->hw_features; + } + } + +@@ -2022,13 +2216,19 @@ static int ipoib_parent_init(struct net_ + priv->ca->name, priv->port, result); + return result; + } ++#ifdef HAVE_DEV_ADDR_MOD + dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid)); ++#else ++ memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, ++ sizeof(union ib_gid)); ++#endif + + SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); + priv->dev->dev_port = priv->port - 1; + /* Let's set this one too for backwards compatibility. */ + priv->dev->dev_id = priv->port - 1; + ++ + return 0; + } + +@@ -2043,8 +2243,13 @@ static void ipoib_child_init(struct net_ + memcpy(&priv->local_gid, priv->dev->dev_addr + 4, + sizeof(priv->local_gid)); + else { ++#ifdef HAVE_DEV_ADDR_MOD + __dev_addr_set(priv->dev, ppriv->dev->dev_addr, + INFINIBAND_ALEN); ++#else ++ memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, ++ INFINIBAND_ALEN); ++#endif + memcpy(&priv->local_gid, &ppriv->local_gid, + sizeof(priv->local_gid)); + } +@@ -2072,7 +2277,9 @@ static int ipoib_ndo_init(struct net_dev + ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = ndev->mtu; + rn->mtu = priv->mcast_mtu; +- ndev->max_mtu = IPOIB_CM_MTU; ++#ifdef HAVE_NET_DEVICE_MIN_MAX_MTU ++ ndev->max_mtu = IPOIB_CM_MTU; ++#endif + + ndev->neigh_priv_len = sizeof(struct ipoib_neigh); + +@@ -2118,6 +2325,7 @@ static void ipoib_ndo_uninit(struct net_ + * ipoib_remove_one guarantees the children are removed before the + * parent, and that is the only place where a parent can be removed. 
+ */ ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + WARN_ON(!list_empty(&priv->child_intfs)); + + if (priv->parent) { +@@ -2127,6 +2335,7 @@ static void ipoib_ndo_uninit(struct net_ + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); + } ++#endif + + ipoib_neigh_hash_uninit(dev); + +@@ -2167,6 +2376,7 @@ static int ipoib_get_vf_config(struct ne + return 0; + } + ++#ifdef HAVE_NDO_SET_VF_GUID + static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); +@@ -2176,7 +2386,9 @@ static int ipoib_set_vf_guid(struct net_ + + return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); + } ++#endif + ++#ifdef HAVE_NDO_GET_VF_GUID + static int ipoib_get_vf_guid(struct net_device *dev, int vf, + struct ifla_vf_guid *node_guid, + struct ifla_vf_guid *port_guid) +@@ -2185,7 +2397,9 @@ static int ipoib_get_vf_guid(struct net_ + + return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid); + } ++#endif + ++#ifdef HAVE_NDO_GET_VF_STATS + static int ipoib_get_vf_stats(struct net_device *dev, int vf, + struct ifla_vf_stats *vf_stats) + { +@@ -2193,6 +2407,7 @@ static int ipoib_get_vf_stats(struct net + + return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); + } ++#endif + + static int ipoib_set_vf_local_mac(struct net_device *dev, void *addr) + { +@@ -2218,20 +2433,45 @@ static const struct net_device_ops ipoib + .ndo_uninit = ipoib_ndo_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, +- .ndo_change_mtu = ipoib_change_mtu, ++#ifdef HAVE_NDO_CHANGE_MTU_EXTENDED ++ .extended.ndo_change_mtu = ipoib_change_mtu, ++#else ++ .ndo_change_mtu = ipoib_change_mtu, ++#endif + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, +- .ndo_get_iflink = ipoib_get_iflink, +- .ndo_set_vf_link_state = ipoib_set_vf_link_state, +- .ndo_get_vf_config = ipoib_get_vf_config, +- .ndo_get_vf_stats = ipoib_get_vf_stats, +- .ndo_get_vf_guid = ipoib_get_vf_guid, +- .ndo_set_vf_guid = ipoib_set_vf_guid, +- .ndo_set_mac_address = ipoib_set_mac, +- .ndo_get_stats64 = ipoib_get_stats, +- .ndo_eth_ioctl = ipoib_ioctl, ++ .ndo_get_iflink = ipoib_get_iflink, ++ .ndo_set_vf_link_state = ipoib_set_vf_link_state, ++ .ndo_get_vf_config = ipoib_get_vf_config, ++#ifdef HAVE_NDO_GET_VF_STATS ++ .ndo_get_vf_stats = ipoib_get_vf_stats, ++#endif ++#ifdef HAVE_NDO_GET_VF_GUID ++ .ndo_get_vf_guid = ipoib_get_vf_guid, ++#endif ++#ifdef HAVE_NDO_SET_VF_GUID ++ .ndo_set_vf_guid = ipoib_set_vf_guid, ++#endif ++#if LINUX_VERSION_CODE > KERNEL_VERSION(4, 7, 0) ++ .ndo_set_mac_address = ipoib_set_mac, ++#endif ++#if defined(HAVE_NDO_GET_STATS64) || defined(HAVE_NDO_GET_STATS64_RET_VOID) ++ .ndo_get_stats64 = ipoib_get_stats, ++#else ++ .ndo_get_stats = ipoib_get_stats, ++#endif ++ ++#ifdef HAVE_NDO_ETH_IOCTL ++ .ndo_eth_ioctl = ipoib_ioctl, ++#else ++ .ndo_do_ioctl = ipoib_ioctl, ++#endif ++ ++#ifdef HAVE_NET_DEVICE_OPS_EXTENDED ++ .ndo_size = sizeof(struct net_device_ops), ++#endif + }; + + static const struct net_device_ops ipoib_netdev_ops_vf = { +@@ -2239,15 +2479,32 @@ static const struct net_device_ops ipoib + .ndo_uninit = ipoib_ndo_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, ++#ifdef HAVE_NDO_CHANGE_MTU_EXTENDED ++ .extended.ndo_change_mtu = ipoib_change_mtu, ++#else + .ndo_change_mtu = ipoib_change_mtu, ++#endif + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + 
.ndo_set_rx_mode = ipoib_set_mcast_list, + .ndo_get_iflink = ipoib_get_iflink, + .ndo_set_mac_address = ipoib_set_vf_local_mac, ++#if defined(HAVE_NDO_GET_STATS64) || defined(HAVE_NDO_GET_STATS64_RET_VOID) + .ndo_get_stats64 = ipoib_get_stats, +- .ndo_eth_ioctl = ipoib_ioctl, ++#else ++ .ndo_get_stats = ipoib_get_stats, ++#endif ++ ++#ifdef HAVE_NDO_ETH_IOCTL ++ .ndo_eth_ioctl = ipoib_ioctl, ++#else ++ .ndo_do_ioctl = ipoib_ioctl, ++#endif ++ ++#ifdef HAVE_NET_DEVICE_OPS_EXTENDED ++ .ndo_size = sizeof(struct net_device_ops), ++#endif + }; + + static const struct net_device_ops ipoib_netdev_default_pf = { +@@ -2274,7 +2531,7 @@ void ipoib_setup_common(struct net_devic + dev->tx_queue_len = ipoib_sendq_size * 2; + dev->features = (NETIF_F_VLAN_CHALLENGED | + NETIF_F_HIGHDMA); +- netif_keep_dst(dev); ++ netif_keep_dst(dev); + + memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + +@@ -2283,7 +2540,9 @@ void ipoib_setup_common(struct net_devic + * consistently to unify all the various unregister paths, including + * those connected to rtnl_link_ops which require it. + */ ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + dev->needs_free_netdev = true; ++#endif + } + + static void ipoib_build_priv(struct net_device *dev) +@@ -2370,9 +2629,10 @@ int ipoib_intf_init(struct ib_device *hc + * being set, so we force it to NULL here and handle manually until it + * is safe to turn on. + */ ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + priv->next_priv_destructor = dev->priv_destructor; + dev->priv_destructor = NULL; +- ++#endif + ipoib_build_priv(dev); + + return 0; +@@ -2410,7 +2670,7 @@ void ipoib_intf_free(struct net_device * + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); +- ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + dev->priv_destructor = priv->next_priv_destructor; + if (dev->priv_destructor) + dev->priv_destructor(dev); +@@ -2420,7 +2680,7 @@ void ipoib_intf_free(struct net_device * + * attempt to call priv_destructor twice, prevent that from happening. + */ + dev->priv_destructor = NULL; +- ++#endif + /* unregister/destroy is very complicated. Make bugs more obvious. 
*/ + rn->clnt_priv = NULL; + +@@ -2485,7 +2745,11 @@ static void set_base_guid(struct ipoib_d + memcpy(&priv->local_gid.global.interface_id, + &gid->global.interface_id, + sizeof(gid->global.interface_id)); ++#ifdef HAVE_DEV_ADDR_MOD + dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid)); ++#else ++ memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid)); ++#endif + clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + netif_addr_unlock_bh(netdev); +@@ -2498,6 +2762,7 @@ static void set_base_guid(struct ipoib_d + } + } + ++#if LINUX_VERSION_CODE > KERNEL_VERSION(4, 7, 0) + static int ipoib_check_lladdr(struct net_device *dev, + struct sockaddr_storage *ss) + { +@@ -2523,7 +2788,7 @@ static int ipoib_set_mac(struct net_devi + { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sockaddr_storage *ss = addr; +- int ret; ++ int ret = 0; + + if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) + return -EBUSY; +@@ -2538,6 +2803,7 @@ static int ipoib_set_mac(struct net_devi + + return 0; + } ++#endif + + static ssize_t ipoib_set_mac_using_sysfs(struct device *dev, + struct device_attribute *attr, +@@ -2686,14 +2952,22 @@ static struct net_device *ipoib_add_port + if (!rc && ops->priv_size < params.sizeof_priv) + ops->priv_size = params.sizeof_priv; + } ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ /* force lro on the dev->features, because the function ++ * register_netdev disable it according to our private lro ++ */ ++ set_lro_features_bit(priv); ++#endif ++ + /* + * We cannot set priv_destructor before register_netdev because we + * need priv to be always valid during the error flow to execute + * ipoib_parent_unregister_pre(). Instead handle it manually and only + * enter priv_destructor mode once we are completely registered. + */ ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + ndev->priv_destructor = ipoib_intf_free; +- ++#endif + if (ipoib_intercept_dev_id_attr(ndev)) + goto sysfs_failed; + if (ipoib_cm_add_mode_attr(ndev)) +@@ -2764,11 +3038,42 @@ static void ipoib_remove_one(struct ib_d + + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, + list) +- unregister_netdevice_queue(cpriv->dev, &head); ++#ifdef HAVE_NET_DEVICE_HAS_CLOSE_LIST ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++ if (cpriv->dev->reg_state == NETREG_REGISTERED) ++#endif ++ unregister_netdevice_queue(cpriv->dev, &head); + unregister_netdevice_queue(priv->dev, &head); ++#else ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++ if (cpriv->dev->reg_state == NETREG_REGISTERED) ++#endif ++ unregister_netdevice(cpriv->dev); ++ unregister_netdevice(priv->dev); ++#endif ++#ifdef HAVE_NET_DEVICE_HAS_CLOSE_LIST + unregister_netdevice_many(&head); +- ++#endif + rtnl_unlock(); ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++ /* Free parent resources after rtnl_unlock to ++ * avoid ipoib_get_iflink panic. 
++ */ ++ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, ++ list) ++ { ++ down_write(&priv->vlan_rwsem); ++ list_del(&cpriv->list); ++ up_write(&priv->vlan_rwsem); ++ rdma_uninit_netdev(cpriv->ca, cpriv->dev, cpriv->port, ++ RDMA_NETDEV_IPOIB, !ipoib_enhanced_enabled); ++ ipoib_intf_free(cpriv->dev); ++ } ++ ++ rdma_uninit_netdev(priv->ca, priv->dev, priv->port, ++ RDMA_NETDEV_IPOIB, !ipoib_enhanced_enabled); ++ ipoib_intf_free(priv->dev); ++#endif + } + + kfree(dev_list); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0061-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_multicas.patch b/src/mlnx-ofa_kernel-5.8/backports/0061-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_multicas.patch new file mode 100644 index 0000000..0782389 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0061-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_multicas.patch @@ -0,0 +1,29 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_multicast.c + +Change-Id: Ibc2222aee5ab3a28759f03dc464cf52a82c809f5 +--- + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +@@ -902,13 +902,14 @@ void ipoib_mcast_restart_task(struct wor + clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + + /* Mark all of the entries that are found or don't exist */ +- netdev_for_each_mc_addr(ha, dev) { ++ netdev_for_each_mc_addr(ha, dev) { ++ + union ib_gid mgid; + +- if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) +- continue; ++ if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) ++ continue; + +- memcpy(mgid.raw, ha->addr + 4, sizeof(mgid)); ++ memcpy(mgid.raw, ha->addr + 4, sizeof(mgid)); + + mcast = __ipoib_mcast_find(dev, &mgid); + if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0062-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_netlink..patch b/src/mlnx-ofa_kernel-5.8/backports/0062-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_netlink..patch new file mode 100644 index 0000000..0c8c2b4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0062-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_netlink..patch @@ -0,0 +1,79 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_netlink.c + +Change-Id: I3719fbc90d8b2c24b9409838886a43f5323edc0d +--- + drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 25 +++++++++++++++++++- + 1 file changed, 24 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +@@ -64,9 +64,14 @@ nla_put_failure: + return -EMSGSIZE; + } + ++#if defined(HAVE_RTNL_LINK_OPS_NEWLINK_5_PARAMS) + static int ipoib_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) ++#else ++static int ipoib_changelink(struct net_device *dev, struct nlattr *tb[], ++ struct nlattr *data[]) ++#endif + { + u16 mode, umcast; + int ret = 0; +@@ -93,9 +98,17 @@ out_err: + return ret; + } + ++#if defined(HAVE_RTNL_LINK_OPS_NEWLINK_5_PARAMS) + static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) ++#elif defined(HAVE_RTNL_LINK_OPS_NEWLINK_4_PARAMS) ++static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[]) ++#else ++static int 
ipoib_new_child_link(struct net_device *dev, ++ struct nlattr *tb[], struct nlattr *data[]) ++#endif + { + struct net_device *pdev; + struct ipoib_dev_priv *ppriv; +@@ -104,8 +117,12 @@ static int ipoib_new_child_link(struct n + + if (!tb[IFLA_LINK]) + return -EINVAL; +- ++#ifdef HAVE_RTNL_LINK_OPS_NEWLINK_4_PARAMS + pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); ++#else ++ pdev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK])); ++#endif ++ + if (!pdev || pdev->type != ARPHRD_INFINIBAND) + return -ENODEV; + +@@ -134,7 +151,11 @@ static int ipoib_new_child_link(struct n + return err; + + if (data) { ++#if defined(HAVE_RTNL_LINK_OPS_NEWLINK_5_PARAMS) + err = ipoib_changelink(dev, tb, data, extack); ++#else ++ err = ipoib_changelink(dev, tb, data); ++#endif + if (err) { + unregister_netdevice(dev); + return err; +@@ -163,7 +184,9 @@ static size_t ipoib_get_size(const struc + + static struct rtnl_link_ops ipoib_link_ops __read_mostly = { + .kind = "ipoib", ++#ifdef HAVE_STRUCT_LINK_OPS_IPOIB_LINK_OPS_HAS_NETNS_REFUND + .netns_refund = true, ++#endif + .maxtype = IFLA_IPOIB_MAX, + .policy = ipoib_policy, + .priv_size = sizeof(struct ipoib_dev_priv), diff --git a/src/mlnx-ofa_kernel-5.8/backports/0063-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_vlan.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0063-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_vlan.c.patch new file mode 100644 index 0000000..99980ea --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0063-BACKPORT-drivers-infiniband-ulp-ipoib-ipoib_vlan.c.patch @@ -0,0 +1,66 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/ipoib/ipoib_vlan.c + +Change-Id: I66b98bcfbfde3bff1c48f5f389568af3e2483810 +--- + drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 20 +++++++++++++++++--- + 1 file changed, 17 insertions(+), 3 deletions(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +@@ -105,8 +105,9 @@ int __ipoib_vlan_add(struct ipoib_dev_pr + * We do not need to touch priv if register_netdevice fails, so just + * always use this flow. + */ ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + ndev->priv_destructor = ipoib_intf_free; +- ++#endif + /* + * Racing with unregister of the parent must be prevented by the + * caller. 
+@@ -160,8 +161,14 @@ sysfs_failed: + return -ENOMEM; + + out_early: ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + if (ndev->priv_destructor) + ndev->priv_destructor(ndev); ++#else ++ rdma_uninit_netdev(priv->ca, priv->dev, priv->port, ++ RDMA_NETDEV_IPOIB, !ipoib_enhanced_enabled); ++ ipoib_intf_free(ndev); ++#endif + return result; + } + +@@ -228,13 +235,15 @@ static void ipoib_vlan_delete_task(struc + struct ipoib_vlan_delete_work *pwork = + container_of(work, struct ipoib_vlan_delete_work, work); + struct net_device *dev = pwork->dev; ++ struct ipoib_dev_priv *priv = NULL; ++ struct ipoib_dev_priv *ppriv = NULL; + + rtnl_lock(); + + /* Unregistering tasks can race with another task or parent removal */ + if (dev->reg_state == NETREG_REGISTERED) { +- struct ipoib_dev_priv *priv = ipoib_priv(dev); +- struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); ++ priv = ipoib_priv(dev); ++ ppriv = ipoib_priv(priv->parent); + + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); +@@ -242,6 +251,11 @@ static void ipoib_vlan_delete_task(struc + + rtnl_unlock(); + ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++ rdma_uninit_netdev(priv->ca, priv->dev, priv->port, ++ RDMA_NETDEV_IPOIB, !ipoib_enhanced_enabled); ++ ipoib_intf_free(priv->dev); ++#endif + kfree(pwork); + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0064-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0064-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.c.patch new file mode 100644 index 0000000..bee5e41 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0064-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.c.patch @@ -0,0 +1,264 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/iser/iscsi_iser.c + +Change-Id: I14c334184582c18ab629558c244fe4e27b605458 +--- + drivers/infiniband/ulp/iser/iscsi_iser.c | 128 ++++++++++++++++++++++- + 1 file changed, 126 insertions(+), 2 deletions(-) + +--- a/drivers/infiniband/ulp/iser/iscsi_iser.c ++++ b/drivers/infiniband/ulp/iser/iscsi_iser.c +@@ -76,6 +76,9 @@ + + MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); + + static struct scsi_host_template iscsi_iser_sht; +@@ -113,6 +116,12 @@ bool iser_pi_enable = false; + module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO); + MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + ++#ifndef HAVE_SCSI_CMND_PROT_FLAGS ++int iser_pi_guard; ++module_param_named(pi_guard, iser_pi_guard, int, S_IRUGO); ++MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)"); ++#endif ++ + static int iscsi_iser_set(const char *val, const struct kernel_param *kp) + { + int ret; +@@ -221,6 +230,27 @@ int iser_initialize_task_headers(struct + return 0; + } + ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++/** ++ * set_last_ping_on_nopout_task() ++ * @task: iscsi task ++ * ++ * Workaround libiscsi not setting iscsi_conn->last_ping ++ * in case of failure. 
++ * fixed in kernels > 4.3 ++ */ ++static inline void ++set_last_ping_on_nopout_task(struct iscsi_task *task) ++{ ++ u8 task_opcode = (task->hdr->opcode & ISCSI_OPCODE_MASK); ++ ++ if (task_opcode == ISCSI_OP_NOOP_OUT) ++ task->conn->last_ping = jiffies; ++} ++#endif ++ ++ + /** + * iscsi_iser_task_init() - Initialize iscsi-iser task + * @task: iscsi task +@@ -237,8 +267,18 @@ static int iscsi_iser_task_init(struct i + + ret = iser_initialize_task_headers(task, &iser_task->desc); + if (ret) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++ u8 task_opcode = (task->hdr->opcode & ISCSI_OPCODE_MASK); ++ ++ iser_err("Failed to init task %p, opcode %d, err = %d", ++ iser_task, task_opcode, ret); ++ ++ set_last_ping_on_nopout_task(task); ++#else + iser_err("Failed to init task %p, err = %d\n", + iser_task, ret); ++#endif ++ + return ret; + } + +@@ -272,6 +312,17 @@ static int iscsi_iser_mtask_xmit(struct + iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt); + + error = iser_send_control(conn, task); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++ if (error) { ++ u8 task_opcode = (task->hdr->opcode & ISCSI_OPCODE_MASK); ++ ++ iser_err("Failed to send task %p, opcode %d, err = %d", ++ task->dd_data, task_opcode, error); ++ ++ set_last_ping_on_nopout_task(task); ++ ++ } ++#endif + + /* since iser xmits control with zero copy, tasks can not be recycled + * right after sending them. +@@ -493,7 +544,9 @@ static int iscsi_iser_conn_bind(struct i + iser_conn->iscsi_conn = conn; + + out: ++#ifdef HAVE_ISCSI_PUT_ENDPOINT + iscsi_put_endpoint(ep); ++#endif + mutex_unlock(&iser_conn->state_mutex); + return error; + } +@@ -568,7 +621,11 @@ static void iscsi_iser_session_destroy(s + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + + iscsi_session_teardown(cls_session); ++#ifdef HAVE_ISCSI_HOST_REMOVE_2_PARAMS ++ iscsi_host_remove(shost, false); ++#else + iscsi_host_remove(shost); ++#endif + iscsi_host_free(shost); + } + +@@ -578,7 +635,9 @@ static inline unsigned int iser_dif_prot + + if (prot_caps & IB_PROT_T10DIF_TYPE_1) + ret |= SHOST_DIF_TYPE1_PROTECTION | ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS + SHOST_DIX_TYPE0_PROTECTION | ++#endif + SHOST_DIX_TYPE1_PROTECTION; + if (prot_caps & IB_PROT_T10DIF_TYPE_2) + ret |= SHOST_DIF_TYPE2_PROTECTION | +@@ -646,13 +705,20 @@ iscsi_iser_session_create(struct iscsi_e + + shost->sg_prot_tablesize = shost->sg_tablesize; + scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS + scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP | + SHOST_DIX_GUARD_CRC); ++#else ++ if (iser_pi_guard) ++ scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); ++ else ++ scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); ++#endif + } +- ++#ifdef HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + shost->virt_boundary_mask = SZ_4K - 1; +- ++#endif + if (iscsi_host_add(shost, ib_dev->dev.parent)) { + mutex_unlock(&iser_conn->state_mutex); + goto free_host; +@@ -685,7 +751,11 @@ iscsi_iser_session_create(struct iscsi_e + return cls_session; + + remove_host: ++#ifdef HAVE_ISCSI_HOST_REMOVE_2_PARAMS ++ iscsi_host_remove(shost, false); ++#else + iscsi_host_remove(shost); ++#endif + free_host: + iscsi_host_free(shost); + return NULL; +@@ -756,7 +826,13 @@ static void iscsi_iser_conn_get_stats(st + stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; ++#ifdef HAVE_VIRT_BOUNDARY + stats->custom_length = 
0; ++#else ++ stats->custom_length = 1; ++ strcpy(stats->custom[0].desc, "fmr_unalign_cnt"); ++ stats->custom[0].value = conn->fmr_unalign_cnt; ++#endif + } + + static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, +@@ -956,21 +1032,60 @@ static umode_t iser_attr_is_visible(int + return 0; + } + ++#ifdef USE_SLAVE_ALLOC_HANDLER ++static int iscsi_iser_slave_alloc(struct scsi_device *sdev) ++{ ++ struct iscsi_session *session; ++ struct iser_conn *iser_conn; ++ struct ib_device *ib_dev; ++ ++ mutex_lock(&unbind_iser_conn_mutex); ++ ++ session = starget_to_session(scsi_target(sdev))->dd_data; ++ iser_conn = session->leadconn->dd_data; ++ if (!iser_conn) { ++ mutex_unlock(&unbind_iser_conn_mutex); ++ return -ENOTCONN; ++ } ++ ib_dev = iser_conn->ib_conn.device->ib_device; ++ ++ if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) ++ blk_queue_virt_boundary(sdev->request_queue, ~(SZ_4K - 1)); ++ ++ mutex_unlock(&unbind_iser_conn_mutex); ++ return 0; ++} ++#endif ++ + static struct scsi_host_template iscsi_iser_sht = { + .module = THIS_MODULE, + .name = "iSCSI Initiator over iSER", + .queuecommand = iscsi_queuecommand, ++#ifdef HAVE_SCSI_CHANGE_QUEUE_DEPTH + .change_queue_depth = scsi_change_queue_depth, ++#else ++ .change_queue_depth = iscsi_change_queue_depth, ++#endif + .sg_tablesize = ISCSI_ISER_DEF_SG_TABLESIZE, + .cmd_per_lun = ISER_DEF_CMD_PER_LUN, ++#ifdef HAVE_ISCSI_EH_CMD_TIMED_OUT + .eh_timed_out = iscsi_eh_cmd_timed_out, ++#endif + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler= iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, ++#ifdef ENABLE_CLUSTERING ++ .use_clustering = ENABLE_CLUSTERING, ++#endif ++#ifdef USE_SLAVE_ALLOC_HANDLER ++ .slave_alloc = iscsi_iser_slave_alloc, ++#endif + .proc_name = "iscsi_iser", + .this_id = -1, ++#ifdef HAVE_SCSI_HOST_TEMPLATE_TRACK_QUEUE_DEPTH + .track_queue_depth = 1, ++#endif + }; + + static struct iscsi_transport iscsi_iser_transport = { +@@ -983,7 +1098,9 @@ static struct iscsi_transport iscsi_iser + /* connection management */ + .create_conn = iscsi_iser_conn_create, + .bind_conn = iscsi_iser_conn_bind, ++#ifdef HAVE_ISCSI_CONN_UNBIND + .unbind_conn = iscsi_conn_unbind, ++#endif + .destroy_conn = iscsi_conn_teardown, + .attr_is_visible = iser_attr_is_visible, + .set_param = iscsi_iser_set_param, +@@ -1017,6 +1134,13 @@ static int __init iser_init(void) + + iser_dbg("Starting iSER datamover...\n"); + ++#ifndef HAVE_SCSI_CMND_PROT_FLAGS ++ if (iser_pi_guard < 0 || iser_pi_guard > 1) { ++ iser_err("Invalid pi_guard value of %d\n", iser_pi_guard); ++ return -EINVAL; ++ } ++#endif ++ + memset(&ig, 0, sizeof(struct iser_global)); + + ig.desc_cache = kmem_cache_create("iser_descriptors", diff --git a/src/mlnx-ofa_kernel-5.8/backports/0065-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0065-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.h.patch new file mode 100644 index 0000000..10eab6d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0065-BACKPORT-drivers-infiniband-ulp-iser-iscsi_iser.h.patch @@ -0,0 +1,82 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/iser/iscsi_iser.h + +Change-Id: Ia7e8f94d14cd6d93e4514bae5533de32b408fd81 +--- + drivers/infiniband/ulp/iser/iscsi_iser.h | 40 ++++++++++++++++++++++++ + 1 file changed, 40 insertions(+) + +--- a/drivers/infiniband/ulp/iser/iscsi_iser.h ++++ b/drivers/infiniband/ulp/iser/iscsi_iser.h +@@ -59,6 +59,7 @@ + #include + 
#include + #include ++#include + + #include + #include +@@ -67,10 +68,42 @@ + #include + #include + ++#if defined(CONFIG_COMPAT_RHEL_7_3) || defined(CONFIG_COMPAT_RHEL_7_2) ++#undef HAVE_BLK_QUEUE_VIRT_BOUNDARY ++#endif ++ ++#if defined(HAVE_BLK_QUEUE_VIRT_BOUNDARY) && \ ++ !defined(HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK) ++#define USE_SLAVE_ALLOC_HANDLER 1 ++#endif ++ ++#if defined(HAVE_BLK_QUEUE_VIRT_BOUNDARY) || \ ++ defined(HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK) ++#define HAVE_VIRT_BOUNDARY 1 ++#endif ++ + #define DRV_NAME "iser" + #define PFX DRV_NAME ": " + #define DRV_VER "1.6" + ++#ifndef HAVE_SCSI_TRANSFER_LENGTH ++static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) ++{ ++ unsigned int xfer_len = scsi_bufflen(scmd); ++ unsigned int prot_op = scsi_get_prot_op(scmd); ++ unsigned int sector_size = scmd->device->sector_size; ++ ++ switch (prot_op) { ++ case SCSI_PROT_NORMAL: ++ case SCSI_PROT_WRITE_STRIP: ++ case SCSI_PROT_READ_INSERT: ++ return xfer_len; ++ } ++ ++ return xfer_len + (xfer_len >> ilog2(sector_size)) * 8; ++} ++#endif ++ + #define iser_dbg(fmt, arg...) \ + do { \ + if (unlikely(iser_debug_level > 2)) \ +@@ -198,6 +231,10 @@ struct iser_data_buf { + int size; + unsigned long data_len; + int dma_nents; ++#ifndef HAVE_VIRT_BOUNDARY ++ struct scatterlist *orig_sg; ++ unsigned int orig_size; ++#endif + }; + + /* fwd declarations */ +@@ -478,6 +515,9 @@ struct iser_global { + extern struct iser_global ig; + extern int iser_debug_level; + extern bool iser_pi_enable; ++#ifndef HAVE_SCSI_CMND_PROT_FLAGS ++extern int iser_pi_guard; ++#endif + extern unsigned int iser_max_sectors; + extern bool iser_always_reg; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0066-BACKPORT-drivers-infiniband-ulp-iser-iser_initiator..patch b/src/mlnx-ofa_kernel-5.8/backports/0066-BACKPORT-drivers-infiniband-ulp-iser-iser_initiator..patch new file mode 100644 index 0000000..ed60637 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0066-BACKPORT-drivers-infiniband-ulp-iser-iser_initiator..patch @@ -0,0 +1,83 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/iser/iser_initiator.c + +Change-Id: I4770154627874a4bc0ecabd7f144cd2e2be4dbbb +--- + drivers/infiniband/ulp/iser/iser_initiator.c | 41 ++++++++++++++++---- + 1 file changed, 33 insertions(+), 8 deletions(-) + +--- a/drivers/infiniband/ulp/iser/iser_initiator.c ++++ b/drivers/infiniband/ulp/iser/iser_initiator.c +@@ -37,7 +37,6 @@ + #include + #include + #include +- + #include "iscsi_iser.h" + + /* Register user buffer memory and initialize passive rdma +@@ -643,7 +642,6 @@ static int iser_check_remote_inv(struct + return 0; + } + +- + void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) + { + struct ib_conn *ib_conn = wc->qp->qp_context; +@@ -746,12 +744,27 @@ void iser_task_rdma_init(struct iscsi_is + void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) + { + int prot_count = scsi_prot_sg_count(iser_task->sc); ++#ifndef HAVE_VIRT_BOUNDARY ++ bool is_rdma_data_aligned; ++#endif + + if (iser_task->dir[ISER_DIR_IN]) { ++#ifndef HAVE_VIRT_BOUNDARY ++ is_rdma_data_aligned = true; ++ if (iser_task->data[ISER_DIR_IN].orig_sg) { ++ iser_finalize_rdma_unaligned_sg(iser_task, ++ &iser_task->data[ISER_DIR_IN], ++ ISER_DIR_IN); ++ is_rdma_data_aligned = false; ++ } ++#endif + iser_unreg_mem_fastreg(iser_task, ISER_DIR_IN); +- iser_dma_unmap_task_data(iser_task, +- &iser_task->data[ISER_DIR_IN], +- DMA_FROM_DEVICE); ++#ifndef HAVE_VIRT_BOUNDARY ++ if (is_rdma_data_aligned) ++#endif ++ 
iser_dma_unmap_task_data(iser_task, ++ &iser_task->data[ISER_DIR_IN], ++ DMA_FROM_DEVICE); + if (prot_count) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_IN], +@@ -759,10 +772,22 @@ void iser_task_rdma_finalize(struct iscs + } + + if (iser_task->dir[ISER_DIR_OUT]) { ++#ifndef HAVE_VIRT_BOUNDARY ++ is_rdma_data_aligned = true; ++ if (iser_task->data[ISER_DIR_OUT].orig_sg) { ++ iser_finalize_rdma_unaligned_sg(iser_task, ++ &iser_task->data[ISER_DIR_OUT], ++ ISER_DIR_OUT); ++ is_rdma_data_aligned = false; ++ } ++#endif + iser_unreg_mem_fastreg(iser_task, ISER_DIR_OUT); +- iser_dma_unmap_task_data(iser_task, +- &iser_task->data[ISER_DIR_OUT], +- DMA_TO_DEVICE); ++#ifndef HAVE_VIRT_BOUNDARY ++ if (is_rdma_data_aligned) ++#endif ++ iser_dma_unmap_task_data(iser_task, ++ &iser_task->data[ISER_DIR_OUT], ++ DMA_TO_DEVICE); + if (prot_count) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_OUT], diff --git a/src/mlnx-ofa_kernel-5.8/backports/0067-BACKPORT-drivers-infiniband-ulp-iser-iser_memory.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0067-BACKPORT-drivers-infiniband-ulp-iser-iser_memory.c.patch new file mode 100644 index 0000000..8e346c0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0067-BACKPORT-drivers-infiniband-ulp-iser-iser_memory.c.patch @@ -0,0 +1,449 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/iser/iser_memory.c + +Change-Id: Ic4917cae235ce379df3e3603939d6f1a1572aa54 +--- + drivers/infiniband/ulp/iser/iser_memory.c | 348 +++++++++++++++++++++- + 1 file changed, 347 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/ulp/iser/iser_memory.c ++++ b/drivers/infiniband/ulp/iser/iser_memory.c +@@ -39,6 +39,216 @@ + + #include "iscsi_iser.h" + ++#ifndef HAVE_VIRT_BOUNDARY ++#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~(SZ_4K - 1)) == 0) ++static void iser_free_bounce_sg(struct iser_data_buf *data) ++{ ++ struct scatterlist *sg; ++ int count; ++ ++ for_each_sg (data->sg, sg, data->size, count) ++ __free_page(sg_page(sg)); ++ ++ kfree(data->sg); ++ ++ data->sg = data->orig_sg; ++ data->size = data->orig_size; ++ data->orig_sg = NULL; ++ data->orig_size = 0; ++} ++ ++static int iser_alloc_bounce_sg(struct iser_data_buf *data) ++{ ++ struct scatterlist *sg; ++ struct page *page; ++ unsigned long length = data->data_len; ++ int i = 0, nents = DIV_ROUND_UP(length, PAGE_SIZE); ++ ++ sg = kcalloc(nents, sizeof(*sg), GFP_ATOMIC); ++ if (!sg) ++ goto err; ++ ++ sg_init_table(sg, nents); ++ while (length) { ++ u32 page_len = min_t(u32, length, PAGE_SIZE); ++ ++ page = alloc_page(GFP_ATOMIC); ++ if (!page) ++ goto err; ++ ++ sg_set_page(&sg[i], page, page_len, 0); ++ length -= page_len; ++ i++; ++ } ++ ++ data->orig_sg = data->sg; ++ data->orig_size = data->size; ++ data->sg = sg; ++ data->size = nents; ++ ++ return 0; ++ ++err: ++ for (; i > 0; i--) ++ __free_page(sg_page(&sg[i - 1])); ++ kfree(sg); ++ ++ return -ENOMEM; ++} ++ ++static void iser_copy_bounce(struct iser_data_buf *data, bool to_buffer) ++{ ++ struct scatterlist *osg, *bsg = data->sg; ++ void *oaddr, *baddr; ++ unsigned int left = data->data_len; ++ unsigned int bsg_off = 0; ++ int i; ++ ++ for_each_sg (data->orig_sg, osg, data->orig_size, i) { ++ unsigned int copy_len, osg_off = 0; ++ ++ oaddr = kmap_atomic(sg_page(osg)) + osg->offset; ++ copy_len = min(left, osg->length); ++ while (copy_len) { ++ unsigned int len = min(copy_len, bsg->length - bsg_off); ++ ++ baddr = kmap_atomic(sg_page(bsg)) + bsg->offset; ++ ++ if (to_buffer) ++ memcpy(baddr + 
bsg_off, oaddr + osg_off, len); ++ else ++ memcpy(oaddr + osg_off, baddr + bsg_off, len); ++ ++ kunmap_atomic(baddr - bsg->offset); ++ osg_off += len; ++ bsg_off += len; ++ copy_len -= len; ++ ++ if (bsg_off >= bsg->length) { ++ bsg = sg_next(bsg); ++ bsg_off = 0; ++ } ++ } ++ kunmap_atomic(oaddr - osg->offset); ++ left -= osg_off; ++ } ++} ++ ++static inline void iser_copy_from_bounce(struct iser_data_buf *data) ++{ ++ iser_copy_bounce(data, false); ++} ++ ++static inline void iser_copy_to_bounce(struct iser_data_buf *data) ++{ ++ iser_copy_bounce(data, true); ++} ++ ++/** ++ * iser_start_rdma_unaligned_sg ++ */ ++static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, ++ struct iser_data_buf *data, ++ enum iser_data_dir cmd_dir) ++{ ++ struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; ++ int rc; ++ ++ rc = iser_alloc_bounce_sg(data); ++ if (rc) { ++ iser_err("Failed to allocate bounce for data len %lu\n", ++ data->data_len); ++ return rc; ++ } ++ ++ if (cmd_dir == ISER_DIR_OUT) ++ iser_copy_to_bounce(data); ++ ++ data->dma_nents = ib_dma_map_sg( ++ dev, data->sg, data->size, ++ (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); ++ if (!data->dma_nents) { ++ iser_err("Got dma_nents %d, something went wrong...\n", ++ data->dma_nents); ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ return 0; ++err: ++ iser_free_bounce_sg(data); ++ return rc; ++} ++ ++/** ++ * iser_finalize_rdma_unaligned_sg ++ */ ++void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, ++ struct iser_data_buf *data, ++ enum iser_data_dir cmd_dir) ++{ ++ struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; ++ ++ ib_dma_unmap_sg(dev, data->sg, data->size, ++ (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : ++ DMA_FROM_DEVICE); ++ ++ if (cmd_dir == ISER_DIR_IN) ++ iser_copy_from_bounce(data); ++ ++ iser_free_bounce_sg(data); ++} ++ ++/** ++ * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned ++ * for RDMA sub-list of a scatter-gather list of memory buffers, and returns ++ * the number of entries which are aligned correctly. Supports the case where ++ * consecutive SG elements are actually fragments of the same physical page. ++ */ ++static int iser_data_buf_aligned_len(struct iser_data_buf *data, ++ struct ib_device *ibdev, ++ unsigned sg_tablesize) ++{ ++ struct scatterlist *sg, *sgl, *next_sg = NULL; ++ u64 start_addr, end_addr; ++ int i, ret_len, start_check = 0; ++ ++ if (data->dma_nents == 1) ++ return 1; ++ ++ sgl = data->sg; ++ start_addr = sg_dma_address(sgl); ++ ++ for_each_sg (sgl, sg, data->dma_nents, i) { ++ if (start_check && !IS_4K_ALIGNED(start_addr)) ++ break; ++ ++ next_sg = sg_next(sg); ++ if (!next_sg) ++ break; ++ ++ end_addr = start_addr + sg_dma_len(sg); ++ start_addr = sg_dma_address(next_sg); ++ ++ if (end_addr == start_addr) { ++ start_check = 0; ++ continue; ++ } else ++ start_check = 1; ++ ++ if (!IS_4K_ALIGNED(end_addr)) ++ break; ++ } ++ ret_len = (next_sg) ? 
i : i + 1; ++ ++ if (unlikely(ret_len != data->dma_nents)) ++ iser_warn("rdma alignment violation (%d/%d aligned)\n", ret_len, ++ data->dma_nents); ++ ++ return ret_len; ++} ++#endif ++ + void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc) + { + iser_err_comp(wc, "memreg"); +@@ -70,6 +280,65 @@ static void iser_reg_desc_put_fr(struct + spin_unlock_irqrestore(&fr_pool->lock, flags); + } + ++#ifndef HAVE_VIRT_BOUNDARY ++static void iser_data_buf_dump(struct iser_data_buf *data, ++ struct ib_device *ibdev) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ for_each_sg(data->sg, sg, data->dma_nents, i) ++ iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " ++ "off:0x%x sz:0x%x dma_len:0x%x\n", ++ i, (unsigned long)sg_dma_address(sg), ++ sg_page(sg), sg->offset, sg->length, sg_dma_len(sg)); ++} ++ ++static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, ++ struct iser_data_buf *mem, ++ enum iser_data_dir cmd_dir) ++{ ++ struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; ++ struct iser_device *device = iser_task->iser_conn->ib_conn.device; ++ ++ iscsi_conn->fmr_unalign_cnt++; ++ ++ if (iser_debug_level > 0) ++ iser_data_buf_dump(mem, device->ib_device); ++ ++ /* unmap the command data before accessing it */ ++ iser_dma_unmap_task_data(iser_task, mem, ++ (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : ++ DMA_FROM_DEVICE); ++ ++ /* allocate copy buf, if we are writing, copy the */ ++ /* unaligned scatterlist, dma map the copy */ ++ if (iser_start_rdma_unaligned_sg(iser_task, mem, cmd_dir) != 0) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int iser_handle_unaligned_buf(struct iscsi_iser_task *task, ++ struct iser_data_buf *mem, ++ enum iser_data_dir dir) ++{ ++ struct iser_conn *iser_conn = task->iser_conn; ++ struct iser_device *device = iser_conn->ib_conn.device; ++ int err, aligned_len; ++ ++ aligned_len = iser_data_buf_aligned_len(mem, device->ib_device, ++ iser_conn->scsi_sg_tablesize); ++ if (aligned_len != mem->dma_nents) { ++ err = fall_to_bounce_buf(task, mem, dir); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++#endif ++ + int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, +@@ -155,8 +424,21 @@ static void iser_set_dif_domain(struct s + struct ib_sig_domain *domain) + { + domain->sig_type = IB_SIG_TYPE_T10_DIF; ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS + domain->sig.dif.pi_interval = scsi_prot_interval(sc); ++#ifdef HAVE_T10_PI_REF_TAG ++#ifdef HAVE_SCSI_CMD_TO_RQ + domain->sig.dif.ref_tag = t10_pi_ref_tag(scsi_cmd_to_rq(sc)); ++#else ++ domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request); ++#endif ++#else ++ domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc); ++#endif ++#else ++ domain->sig.dif.pi_interval = sc->device->sector_size; ++ domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff; ++#endif + /* + * At the moment we hard code those, but in the future + * we will take them from sc. 
+@@ -164,8 +446,14 @@ static void iser_set_dif_domain(struct s + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS + if (sc->prot_flags & SCSI_PROT_REF_INCREMENT) + domain->sig.dif.ref_remap = true; ++#else ++ if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 || ++ scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2) ++ domain->sig.dif.ref_remap = true; ++#endif + } + + static int iser_set_sig_attrs(struct scsi_cmnd *sc, +@@ -182,16 +470,30 @@ static int iser_set_sig_attrs(struct scs + case SCSI_PROT_WRITE_STRIP: + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, &sig_attrs->mem); ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS ++ /* WA for #963642: DIX always use SCSI_PROT_IP_CHECKSUM */ ++ sc->prot_flags |= SCSI_PROT_IP_CHECKSUM; + sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? + IB_T10DIF_CSUM : IB_T10DIF_CRC; ++#else ++ sig_attrs->mem.sig.dif.bg_type = ++ iser_pi_guard ? IB_T10DIF_CSUM : IB_T10DIF_CRC; ++#endif + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + iser_set_dif_domain(sc, &sig_attrs->wire); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + iser_set_dif_domain(sc, &sig_attrs->mem); ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS ++ /* WA for #963642: DIX always use SCSI_PROT_IP_CHECKSUM */ ++ sc->prot_flags |= SCSI_PROT_IP_CHECKSUM; + sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? + IB_T10DIF_CSUM : IB_T10DIF_CRC; ++#else ++ sig_attrs->mem.sig.dif.bg_type = ++ iser_pi_guard ? IB_T10DIF_CSUM : IB_T10DIF_CRC; ++#endif + break; + default: + iser_err("Unsupported PI operation %d\n", +@@ -202,6 +504,7 @@ static int iser_set_sig_attrs(struct scs + return 0; + } + ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS + static inline void iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) + { + *mask = 0; +@@ -210,6 +513,30 @@ static inline void iser_set_prot_checks( + if (sc->prot_flags & SCSI_PROT_GUARD_CHECK) + *mask |= IB_SIG_CHECK_GUARD; + } ++#else ++static int ++iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) ++{ ++ switch (scsi_get_prot_type(sc)) { ++ case SCSI_PROT_DIF_TYPE0: ++ *mask = 0x0; ++ break; ++ case SCSI_PROT_DIF_TYPE1: ++ case SCSI_PROT_DIF_TYPE2: ++ *mask = IB_SIG_CHECK_GUARD | IB_SIG_CHECK_REFTAG; ++ break; ++ case SCSI_PROT_DIF_TYPE3: ++ *mask = IB_SIG_CHECK_GUARD; ++ break; ++ default: ++ iser_err("Unsupported protection type %d\n", ++ scsi_get_prot_type(sc)); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif + + static inline void iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr, + struct ib_cqe *cqe, struct ib_send_wr *next_wr) +@@ -239,8 +566,13 @@ static int iser_reg_sig_mr(struct iscsi_ + ret = iser_set_sig_attrs(iser_task->sc, sig_attrs); + if (ret) + goto err; +- ++#ifdef HAVE_SCSI_CMND_PROT_FLAGS + iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); ++#else ++ ret = iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); ++ if (ret) ++ goto err; ++#endif + + if (rsc->mr_valid) + iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr); +@@ -345,12 +677,22 @@ int iser_reg_mem_fastreg(struct iscsi_is + bool all_imm) + { + struct ib_conn *ib_conn = &task->iser_conn->ib_conn; ++#ifndef HAVE_VIRT_BOUNDARY ++ struct ib_device *ib_dev = ib_conn->device->ib_device; ++#endif + struct iser_data_buf *mem = &task->data[dir]; + struct iser_mem_reg *reg = &task->rdma_reg[dir]; + struct iser_fr_desc *desc = NULL; + bool use_dma_key; + int err; + ++#ifndef HAVE_VIRT_BOUNDARY ++ if 
(!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) { ++ err = iser_handle_unaligned_buf(task, mem, dir); ++ if (unlikely(err)) ++ return err; ++ } ++#endif + use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) && + scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; + +@@ -377,6 +719,10 @@ int iser_reg_mem_fastreg(struct iscsi_is + err_reg: + if (desc) + iser_reg_desc_put_fr(ib_conn, desc); ++#ifndef HAVE_VIRT_BOUNDARY ++ if (mem->orig_sg) ++ iser_free_bounce_sg(mem); ++#endif + + return err; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0068-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0068-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.c.patch new file mode 100644 index 0000000..06c67ee --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0068-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.c.patch @@ -0,0 +1,272 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/isert/ib_isert.c + +Change-Id: Ia3ecdd9622e54e8a4166b6a4fb82e5be67e83f9c +--- + drivers/infiniband/ulp/isert/ib_isert.c | 123 +++++++++++++++++++++++- + 1 file changed, 120 insertions(+), 3 deletions(-) + +--- a/drivers/infiniband/ulp/isert/ib_isert.c ++++ b/drivers/infiniband/ulp/isert/ib_isert.c +@@ -1077,8 +1077,14 @@ isert_handle_scsi_cmd(struct isert_conn + unsol_data = cmd->unsolicited_data; + data_len = cmd->se_cmd.data_length; + ++#ifdef HAVE_SE_CMD_TRANSPORT_COMPLETE_CALLBACK_HAS_THREE_PARAM + if (imm_data && imm_data_len == data_len) + cmd->se_cmd.se_cmd_flags |= SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; ++#else ++ if (imm_data && imm_data_len == data_len && ++ !(cmd->se_cmd.se_cmd_flags & SCF_COMPARE_AND_WRITE)) ++ cmd->se_cmd.se_cmd_flags |= SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; ++#endif + rc = iscsit_process_scsi_cmd(conn, cmd, hdr); + if (rc < 0) { + return 0; +@@ -1090,7 +1096,12 @@ isert_handle_scsi_cmd(struct isert_conn + if (!imm_data) + return 0; + ++#ifdef HAVE_SE_CMD_TRANSPORT_COMPLETE_CALLBACK_HAS_THREE_PARAM + if (imm_data_len != data_len) { ++#else ++ if (imm_data_len != data_len || ++ (cmd->se_cmd.se_cmd_flags & SCF_COMPARE_AND_WRITE)) { ++#endif + sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); + sg_copy_from_buffer(cmd->se_cmd.t_data_sg, sg_nents, + isert_get_data(rx_desc), imm_data_len); +@@ -1119,9 +1130,17 @@ sequence_cmd: + rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); + + if (!rc && !dump_payload && unsol_data) ++#ifdef HAVE_ISCSIT_SET_UNSOLICITED_DATAOUT + iscsit_set_unsolicited_dataout(cmd); ++#else ++ iscsit_set_unsoliticed_dataout(cmd); ++#endif + else if (dump_payload && imm_data) ++#ifdef HAVE_TARGET_PUT_SESS_CMD_HAS_1_PARAM + target_put_sess_cmd(&cmd->se_cmd); ++#else ++ target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd); ++#endif + + return 0; + } +@@ -1468,8 +1487,11 @@ isert_put_cmd(struct isert_cmd *isert_cm + if (comp_err && + cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { + struct se_cmd *se_cmd = &cmd->se_cmd; +- ++#ifdef HAVE_TARGET_PUT_SESS_CMD_HAS_1_PARAM + target_put_sess_cmd(se_cmd); ++#else ++ target_put_sess_cmd(se_cmd->se_sess, se_cmd); ++#endif + } + } + +@@ -1570,12 +1592,20 @@ isert_check_pi_status(struct se_cmd *se_ + } + sec_offset_err = mr_status.sig_err.sig_err_offset; + do_div(sec_offset_err, block_size); ++#ifdef HAVE_SE_CMD_HAS_SENSE_INFO + se_cmd->sense_info = sec_offset_err + se_cmd->t_task_lba; ++#else ++ se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba; ++#endif + + isert_err("PI error found type %d at sector 0x%llx " + "expected 0x%x vs 
actual 0x%x\n", + mr_status.sig_err.err_type, ++#ifdef HAVE_SE_CMD_HAS_SENSE_INFO + (unsigned long long)se_cmd->sense_info, ++#else ++ (unsigned long long)se_cmd->bad_sector, ++#endif + mr_status.sig_err.expected, + mr_status.sig_err.actual); + ret = 1; +@@ -1862,16 +1892,27 @@ isert_get_sup_prot_ops(struct iscsi_conn + { + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; +- ++/* ++ * In older versions of the kernel conn->tpg->tpg_attrib.t10_pi is not ++ * initialized before calling this function. And there is no option to ++ * test it in rdma.m4 because the behavior is hide in the iscsi_target_mod ++ * module. See the commit 23a548ee656c ("iscsi,iser-target: Expose ++ * supported protection ops according to t10_pi") in the upstream kernel. ++ */ ++#if defined(CONFIG_COMPAT_RHEL_7_2) \ ++ || (LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0)) + if (conn->tpg->tpg_attrib.t10_pi) { ++#endif + if (device->pi_capable) { + isert_info("conn %p PI offload enabled\n", isert_conn); + isert_conn->pi_support = true; + isert_conn->sig_pipeline = device->sig_pipeline; + return TARGET_PROT_ALL; + } ++#if defined(CONFIG_COMPAT_RHEL_7_2) \ ++ || (LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0)) + } +- ++#endif + isert_info("conn %p PI offload disabled\n", isert_conn); + isert_conn->pi_support = false; + isert_conn->sig_pipeline = false; +@@ -2425,11 +2466,47 @@ isert_set_conn_info(struct iscsi_np *np, + { + struct rdma_cm_id *cm_id = isert_conn->cm_id; + struct rdma_route *cm_route = &cm_id->route; ++#ifndef HAVE_ISCSI_CONN_LOGIN_SOCKADDR ++ struct sockaddr_in *sock_in; ++ struct sockaddr_in6 *sock_in6; ++#endif + + conn->login_family = np->np_sockaddr.ss_family; + ++#ifdef HAVE_ISCSI_CONN_LOGIN_SOCKADDR + conn->login_sockaddr = cm_route->addr.dst_addr; + conn->local_sockaddr = cm_route->addr.src_addr; ++#else ++ if (np->np_sockaddr.ss_family == AF_INET6) { ++ sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr; ++ snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c", ++ &sock_in6->sin6_addr.in6_u); ++ conn->login_port = ntohs(sock_in6->sin6_port); ++ ++ sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr; ++#ifdef HAVE_ISCSI_CONN_LOCAL_SOCKADDR ++ memcpy(&conn->local_sockaddr , &sock_in6, sizeof(sock_in6)); ++#else ++ snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c", ++ &sock_in6->sin6_addr.in6_u); ++ conn->local_port = ntohs(sock_in6->sin6_port); ++#endif /* HAVE_ISCSI_CONN_LOCAL_SOCKADDR */ ++ } else { ++ sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr; ++ sprintf(conn->login_ip, "%pI4", ++ &sock_in->sin_addr.s_addr); ++ conn->login_port = ntohs(sock_in->sin_port); ++ ++ sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr; ++#ifdef HAVE_ISCSI_CONN_LOCAL_SOCKADDR ++ memcpy(&conn->local_sockaddr , &sock_in, sizeof(sock_in)); ++#else ++ sprintf(conn->local_ip, "%pI4", ++ &sock_in->sin_addr.s_addr); ++ conn->local_port = ntohs(sock_in->sin_port); ++#endif /* HAVE_ISCSI_CONN_LOCAL_SOCKADDR */ ++ } ++#endif /* HAVE_ISCSI_CONN_LOGIN_SOCKADDR */ + } + + static int +@@ -2558,7 +2635,11 @@ isert_wait4cmds(struct iscsi_conn *conn) + isert_info("iscsi_conn %p\n", conn); + + if (conn->sess) { ++#ifdef HAVE_TARGET_STOP_SESSION + target_stop_session(conn->sess->se_sess); ++#else ++ target_sess_cmd_list_set_waiting(conn->sess->se_sess); ++#endif + target_wait_for_sess_cmds(conn->sess->se_sess); + } + } +@@ -2598,6 +2679,7 @@ isert_put_unsol_pending_cmds(struct iscs + } + } + ++#ifdef CONFIG_COMPAT_ISCSIT_WAIT_CONN + static void 
isert_wait_conn(struct iscsi_conn *conn) + { + struct isert_conn *isert_conn = conn->context; +@@ -2615,15 +2697,40 @@ static void isert_wait_conn(struct iscsi + + queue_work(isert_release_wq, &isert_conn->release_work); + } ++#endif + + static void isert_free_conn(struct iscsi_conn *conn) + { + struct isert_conn *isert_conn = conn->context; + ++#ifndef CONFIG_COMPAT_ISCSIT_WAIT_CONN ++ mutex_lock(&isert_conn->mutex); ++ if (isert_conn->state == ISER_CONN_INIT) { ++ mutex_unlock(&isert_conn->mutex); ++ goto out; ++ } ++ isert_conn_terminate(isert_conn); ++ mutex_unlock(&isert_conn->mutex); ++ ++ /* ++ * Only drain qp if the isert_conn made it ++ * into full feature phase.. ++ */ ++ if (isert_conn->state == ISER_CONN_FULL_FEATURE) { ++ ib_drain_qp(isert_conn->qp); ++ isert_put_unsol_pending_cmds(conn); ++ isert_wait4cmds(conn); ++ isert_wait4logout(isert_conn); ++ } ++ queue_work(isert_release_wq, &isert_conn->release_work); ++out: ++#else + ib_drain_qp(isert_conn->qp); ++#endif + isert_put_conn(isert_conn); + } + ++#ifdef HAVE_ISCSIT_TRANSPORT_ISCSIT_GET_RX_PDU + static void isert_get_rx_pdu(struct iscsi_conn *conn) + { + struct completion comp; +@@ -2632,17 +2739,22 @@ static void isert_get_rx_pdu(struct iscs + + wait_for_completion_interruptible(&comp); + } ++#endif + + static struct iscsit_transport iser_target_transport = { + .name = "IB/iSER", + .transport_type = ISCSI_INFINIBAND, ++#ifdef HAVE_ISCSIT_TRANSPORT_RDMA_SHUTDOWN + .rdma_shutdown = true, ++#endif + .priv_size = sizeof(struct isert_cmd), + .owner = THIS_MODULE, + .iscsit_setup_np = isert_setup_np, + .iscsit_accept_np = isert_accept_np, + .iscsit_free_np = isert_free_np, ++#ifdef CONFIG_COMPAT_ISCSIT_WAIT_CONN + .iscsit_wait_conn = isert_wait_conn, ++#endif + .iscsit_free_conn = isert_free_conn, + .iscsit_get_login_rx = isert_get_login_rx, + .iscsit_put_login_tx = isert_put_login_tx, +@@ -2652,7 +2764,9 @@ static struct iscsit_transport iser_targ + .iscsit_queue_data_in = isert_put_datain, + .iscsit_queue_status = isert_put_response, + .iscsit_aborted_task = isert_aborted_task, ++#ifdef HAVE_ISCSIT_TRANSPORT_ISCSIT_GET_RX_PDU + .iscsit_get_rx_pdu = isert_get_rx_pdu, ++#endif + .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, + }; + +@@ -2698,6 +2812,9 @@ static void __exit isert_exit(void) + MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); + MODULE_AUTHOR("nab@Linux-iSCSI.org"); + MODULE_LICENSE("GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + module_init(isert_init); + module_exit(isert_exit); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0070-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0070-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.h.patch new file mode 100644 index 0000000..dc2d007 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0070-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.h.patch @@ -0,0 +1,108 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/srp/ib_srp.h + +Change-Id: Ic69527700e22be62075fa72adb459bb66a96be8c +--- + drivers/infiniband/ulp/srp/ib_srp.h | 50 +++++++++++++++++++++++++++++ + 1 file changed, 50 insertions(+) + +--- a/drivers/infiniband/ulp/srp/ib_srp.h ++++ b/drivers/infiniband/ulp/srp/ib_srp.h +@@ -91,6 +91,38 @@ enum srp_iu_type { + SRP_IU_RSP, + }; + ++#if !(defined(RHEL_MAJOR) && RHEL_MAJOR -0 == 7 && \ ++ !defined(HAVE_SCSI_HOST_TEMPLATE_USE_HOST_WIDE_TAGS)) ++#define HAVE_BLK_TAGS 1 ++#endif ++ ++#if defined(HAVE_BLK_QUEUE_VIRT_BOUNDARY) && \ ++ 
!defined(HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK) ++#define USE_SLAVE_ALLOC_HANDLER 1 ++#endif ++ ++#if defined(HAVE_BLK_QUEUE_VIRT_BOUNDARY) || \ ++ defined(HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK) ++#define HAVE_VIRT_BOUNDARY 1 ++#endif ++ ++#ifndef HAVE_BLK_TAGS ++static inline u32 build_srp_tag(u16 ch, u16 req_idx) ++{ ++ return ch << 16 | req_idx; ++} ++ ++static inline u16 srp_tag_ch(u32 tag) ++{ ++ return tag >> 16; ++} ++ ++static inline u16 srp_tag_idx(u32 tag) ++{ ++ return tag & ((1 << 16) - 1); ++} ++#endif ++ + /* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. +@@ -121,12 +153,18 @@ struct srp_host { + }; + + struct srp_request { ++#ifndef HAVE_BLK_TAGS ++ struct list_head list; ++#endif + struct scsi_cmnd *scmnd; + struct srp_iu *cmd; + struct srp_fr_desc **fr_list; + struct srp_direct_buf *indirect_desc; + dma_addr_t indirect_dma_addr; + short nmdesc; ++#ifndef HAVE_BLK_TAGS ++ uint32_t tag; ++#endif + struct ib_cqe reg_cqe; + }; + +@@ -139,6 +177,9 @@ struct srp_request { + struct srp_rdma_ch { + /* These are RW in the hot path, and commonly used together */ + struct list_head free_tx; ++#ifndef HAVE_BLK_TAGS ++ struct list_head free_reqs; ++#endif + spinlock_t lock; + s32 req_lim; + +@@ -174,6 +215,9 @@ struct srp_rdma_ch { + + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ struct srp_request *req_ring; ++#endif + int comp_vector; + + u64 tsk_mgmt_tag; +@@ -195,6 +239,9 @@ struct srp_target_port { + u32 global_rkey; + struct srp_rdma_ch *ch; + struct net *net; ++#ifndef HAVE_BLK_TAGS ++ int *mq_map; ++#endif + u32 ch_count; + u32 lkey; + enum srp_target_state state; +@@ -219,6 +266,9 @@ struct srp_target_port { + int mr_pool_size; + int mr_per_cmd; + int queue_size; ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ int req_ring_size; ++#endif + int comp_vector; + int tl_retry_count; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0071-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-Mak.patch b/src/mlnx-ofa_kernel-5.8/backports/0071-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-Mak.patch new file mode 100644 index 0000000..714c474 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0071-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-Mak.patch @@ -0,0 +1,43 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/Makefile + +Change-Id: Iab23f4328b0d0808cbd31dd08483926c71c8a33d +--- + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile ++++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile +@@ -21,6 +21,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw + diag/diag_cnt.o params.o fw_exp.o lib/tout.o eswitch_devlink_compat.o \ + ecpf.o lib/aso.o + ++mlx5_core-y += compat.o + # + # Netdev basic + # +@@ -77,8 +78,12 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH) += es + mlx5_core-$(CONFIG_MLX5_BRIDGE) += esw/bridge.o en/rep/bridge.o + + mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o +-mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o +-mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o ++ifneq ($(CONFIG_VXLAN),) ++ mlx5_core-y += lib/vxlan.o ++endif ++ifneq ($(CONFIG_PTP_1588_CLOCK),) ++ mlx5_core-y += lib/clock.o ++endif + mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o + mlx5_core-$(CONFIG_MLXDEVM) += mlx5_devm.o esw/devm_port.o + +@@ -97,6 +102,9 @@ mlx5_core-$(CONFIG_MLX5_ACCEL) += l + + 
mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o + ++mlx5_core-$(CONFIG_MLX5_EN_MACSEC) += en_accel/macsec.o en_accel/macsec_fs.o \ ++ en_accel/macsec_stats.o ++ + mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \ + en_accel/ipsec_stats.o en_accel/ipsec_fs.o esw/ipsec.o \ + en/ipsec_aso.o diff --git a/src/mlnx-ofa_kernel-5.8/backports/0072-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch b/src/mlnx-ofa_kernel-5.8/backports/0072-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch new file mode 100644 index 0000000..cc5edfa --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0072-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch @@ -0,0 +1,41 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c + +Change-Id: Iad0fa5bb2db516324fda88387f0b390a6f0d1ad2 +--- + drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c +@@ -31,6 +31,8 @@ + * + */ + ++#ifdef HAVE_UAPI_LINUX_TLS_H ++ + #include + + #include "accel/tls.h" +@@ -101,6 +103,7 @@ int mlx5_ktls_create_key(struct mlx5_cor + sz_bytes = sizeof(info->key); + break; + } ++#ifdef TLS_CIPHER_AES_GCM_256 + case TLS_CIPHER_AES_GCM_256: { + struct tls12_crypto_info_aes_gcm_256 *info = + (struct tls12_crypto_info_aes_gcm_256 *)crypto_info; +@@ -109,6 +112,7 @@ int mlx5_ktls_create_key(struct mlx5_cor + sz_bytes = sizeof(info->key); + break; + } ++#endif + default: + return -EINVAL; + } +@@ -123,3 +127,5 @@ void mlx5_ktls_destroy_key(struct mlx5_c + mlx5_destroy_encryption_key(mdev, key_id); + } + #endif ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0073-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch b/src/mlnx-ofa_kernel-5.8/backports/0073-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch new file mode 100644 index 0000000..4a8b32b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0073-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-acc.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h + +Change-Id: I6ba8cae67da7e2e92a9d302a21e34495aff2a9c8 +--- + drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h +@@ -31,6 +31,8 @@ + * + */ + ++#ifdef HAVE_UAPI_LINUX_TLS_H ++ + #ifndef __MLX5_ACCEL_TLS_H__ + #define __MLX5_ACCEL_TLS_H__ + +@@ -154,3 +156,5 @@ static inline void mlx5_accel_tls_cleanu + #endif + + #endif /* __MLX5_ACCEL_TLS_H__ */ ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0074-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-all.patch b/src/mlnx-ofa_kernel-5.8/backports/0074-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-all.patch new file mode 100644 index 0000000..da86ced --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0074-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-all.patch @@ -0,0 +1,31 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/alloc.c + +Change-Id: Ia13dc1726e4514a8ac3341f45738359f37af0423 +--- + drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c ++++ 
b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +@@ -61,11 +61,18 @@ static void *mlx5_dma_zalloc_coherent_no + int original_node; + void *cpu_handle; + ++ /* WA for kernels that don't use numa_mem_id in alloc_pages_node */ ++ if (node == NUMA_NO_NODE) ++ node = numa_mem_id(); ++ + mutex_lock(&priv->alloc_mutex); + original_node = dev_to_node(device); + set_dev_node(device, node); +- cpu_handle = dma_alloc_coherent(device, size, dma_handle, +- GFP_KERNEL); ++#ifdef HAVE_DMA_ZALLOC_COHERENT ++ cpu_handle = dma_zalloc_coherent(device, size, dma_handle, GFP_KERNEL); ++#else ++ cpu_handle = dma_alloc_coherent(device, size, dma_handle, GFP_KERNEL); ++#endif + set_dev_node(device, original_node); + mutex_unlock(&priv->alloc_mutex); + return cpu_handle; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0075-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cmd.patch b/src/mlnx-ofa_kernel-5.8/backports/0075-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cmd.patch new file mode 100644 index 0000000..12e876d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0075-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cmd.patch @@ -0,0 +1,174 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/cmd.c + +Change-Id: I6aa37a0cce724be92c0e21a67252a718a5d1d0d8 +--- + drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 62 ++++++++++++++++++- + 1 file changed, 61 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +@@ -1030,7 +1030,11 @@ static void cmd_work_handler(struct work + lay->status_own = CMD_OWNER_HW; + set_signature(ent, !cmd->checksum_disabled); + dump_command(dev, ent, 1); ++#ifdef HAVE_KTIME_GET_NS + ent->ts1 = ktime_get_ns(); ++#else ++ ktime_get_ts(&ent->ts1); ++#endif + cmd_mode = cmd->mode; + + if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout)) +@@ -1190,6 +1194,9 @@ static int mlx5_cmd_invoke(struct mlx5_c + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + struct mlx5_cmd_stats *stats; ++#ifndef HAVE_KTIME_GET_NS ++ ktime_t t1, t2, delta; ++#endif + int err = 0; + s64 ds; + u16 op; +@@ -1231,7 +1238,14 @@ static int mlx5_cmd_invoke(struct mlx5_c + if (err == -ETIMEDOUT || err == -ECANCELED) + goto out_free; + ++#ifdef HAVE_KTIME_GET_NS + ds = ent->ts2 - ent->ts1; ++#else ++ t1 = timespec_to_ktime(ent->ts1); ++ t2 = timespec_to_ktime(ent->ts2); ++ delta = ktime_sub(t2, t1); ++ ds = ktime_to_ns(delta); ++#endif + op = MLX5_GET(mbox_in, in->first.data, opcode); + if (op < MLX5_CMD_OP_MAX) { + stats = &cmd->stats[op]; +@@ -1360,13 +1374,22 @@ static struct mlx5_cmd_mailbox *alloc_cm + if (!mailbox) + return ERR_PTR(-ENOMEM); + ++#ifdef HAVE_DMA_POOL_ZALLOC + mailbox->buf = dma_pool_zalloc(dev->cmd.pool, flags, ++#elif defined(HAVE_PCI_POOL_ZALLOC) ++ mailbox->buf = pci_pool_zalloc(dev->cmd.pool, flags, ++#else ++ mailbox->buf = pci_pool_alloc(dev->cmd.pool, flags, ++#endif + &mailbox->dma); + if (!mailbox->buf) { + mlx5_core_dbg(dev, "failed allocation\n"); + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } ++#if !defined(HAVE_PCI_POOL_ZALLOC) && !defined(HAVE_DMA_POOL_ZALLOC) ++ memset(mailbox->buf, 0, sizeof(struct mlx5_cmd_prot_block)); ++#endif + mailbox->next = NULL; + + return mailbox; +@@ -1656,6 +1679,9 @@ static void mlx5_cmd_comp_handler(struct + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + mlx5_cmd_cbk_t callback; ++#ifndef HAVE_KTIME_GET_NS ++ ktime_t t1, t2, delta; ++#endif + void *context; + 
int err; + int i; +@@ -1692,7 +1718,11 @@ static void mlx5_cmd_comp_handler(struct + continue; + } + ++#ifdef HAVE___CANCEL_DELAYED_WORK ++ if (ent->callback && __cancel_delayed_work(&ent->cb_timeout_work)) ++#else + if (ent->callback && cancel_delayed_work(&ent->cb_timeout_work)) ++#endif + cmd_ent_put(ent); /* timeout work was canceled */ + + if (comp_type != MLX5_CMD_COMP_TYPE_FORCED || /* Real FW completion */ +@@ -1700,7 +1730,11 @@ static void mlx5_cmd_comp_handler(struct + !opcode_allowed(cmd, ent->op)) + cmd_ent_put(ent); + ++#ifdef HAVE_KTIME_GET_NS + ent->ts2 = ktime_get_ns(); ++#else ++ ktime_get_ts(&ent->ts2); ++#endif + memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out)); + dump_command(dev, ent, 0); + +@@ -1718,7 +1752,14 @@ static void mlx5_cmd_comp_handler(struct + } + + if (ent->callback) { ++#ifdef HAVE_KTIME_GET_NS + ds = ent->ts2 - ent->ts1; ++#else ++ t1 = timespec_to_ktime(ent->ts1); ++ t2 = timespec_to_ktime(ent->ts2); ++ delta = ktime_sub(t2, t1); ++ ds = ktime_to_ns(delta); ++#endif + if (ent->op < MLX5_CMD_OP_MAX) { + stats = &cmd->stats[ent->op]; + spin_lock_irqsave(&stats->lock, flags); +@@ -2169,7 +2210,11 @@ static void create_msg_cache(struct mlx5 + + static int alloc_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd) + { +- cmd->cmd_alloc_buf = dma_alloc_coherent(mlx5_core_dma_dev(dev), MLX5_ADAPTER_PAGE_SIZE, ++#ifdef HAVE_DMA_ZALLOC_COHERENT ++ cmd->cmd_alloc_buf = dma_zalloc_coherent(mlx5_core_dma_dev(dev), MLX5_ADAPTER_PAGE_SIZE, ++#else ++ cmd->cmd_alloc_buf = dma_alloc_coherent(mlx5_core_dma_dev(dev), MLX5_ADAPTER_PAGE_SIZE, ++#endif + &cmd->alloc_dma, GFP_KERNEL); + if (!cmd->cmd_alloc_buf) + return -ENOMEM; +@@ -2184,7 +2229,11 @@ static int alloc_cmd_page(struct mlx5_co + + dma_free_coherent(mlx5_core_dma_dev(dev), MLX5_ADAPTER_PAGE_SIZE, cmd->cmd_alloc_buf, + cmd->alloc_dma); ++#ifdef HAVE_DMA_ZALLOC_COHERENT ++ cmd->cmd_alloc_buf = dma_zalloc_coherent(mlx5_core_dma_dev(dev), ++#else + cmd->cmd_alloc_buf = dma_alloc_coherent(mlx5_core_dma_dev(dev), ++#endif + 2 * MLX5_ADAPTER_PAGE_SIZE - 1, + &cmd->alloc_dma, GFP_KERNEL); + if (!cmd->cmd_alloc_buf) +@@ -2545,7 +2594,11 @@ static ssize_t real_miss_store(struct de + return count; + } + ++#ifdef CONFIG_COMPAT_IS_CONST_KOBJECT_SYSFS_OPS + static const struct sysfs_ops cmd_cache_sysfs_ops = { ++#else ++static struct sysfs_ops cmd_cache_sysfs_ops = { ++#endif + .show = cmd_cache_attr_show, + .store = cmd_cache_attr_store, + }; +@@ -2573,10 +2626,17 @@ static struct attribute *cmd_cache_defau + &cmd_cache_attr_total_commands.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(cmd_cache_default); ++#endif + + static struct kobj_type cmd_cache_type = { + .sysfs_ops = &cmd_cache_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = cmd_cache_default_groups ++#else + .default_attrs = cmd_cache_default_attrs ++#endif + }; + + static DEVICE_ATTR(real_miss, 0600, real_miss_show, real_miss_store); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0076-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cq..patch b/src/mlnx-ofa_kernel-5.8/backports/0076-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cq..patch new file mode 100644 index 0000000..6579203 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0076-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-cq..patch @@ -0,0 +1,30 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/cq.c + +Change-Id: Icec6775c03a3bc7642e01a5ab66be405451020e8 +--- + 
drivers/net/ethernet/mellanox/mlx5/core/cq.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c +@@ -42,11 +42,19 @@ + #define TASKLET_MAX_TIME 2 + #define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME) + ++#ifdef HAVE_TASKLET_SETUP + void mlx5_cq_tasklet_cb(struct tasklet_struct *t) ++#else ++void mlx5_cq_tasklet_cb(unsigned long data) ++#endif + { + unsigned long flags; + unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES; ++#ifdef HAVE_TASKLET_SETUP + struct mlx5_eq_tasklet *ctx = from_tasklet(ctx, t, task); ++#else ++ struct mlx5_eq_tasklet *ctx = (struct mlx5_eq_tasklet *)data; ++#endif + struct mlx5_core_cq *mcq; + struct mlx5_core_cq *temp; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0077-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-crd.patch b/src/mlnx-ofa_kernel-5.8/backports/0077-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-crd.patch new file mode 100644 index 0000000..8411d8c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0077-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-crd.patch @@ -0,0 +1,54 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/crdump.c + +Change-Id: Ia5d7cee3cd0eacf2f03ac73279e5995aa402aa5a +--- + .../net/ethernet/mellanox/mlx5/core/crdump.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/crdump.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/crdump.c +@@ -163,17 +163,31 @@ static int mlx5_crdump_open(struct inode + return ret; + + seq = file->private_data; ++#ifdef HAVE_PDE_DATA + seq->private = pde_data(inode); ++#else ++ seq->private = PDE_DATA(inode); ++#endif + + return 0; + } + ++#ifdef HAVE_PROC_OPS_STRUCT + static const struct proc_ops mlx5_crdump_ops = { + .proc_open = mlx5_crdump_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release + }; ++#else ++static const struct file_operations mlx5_crdump_fops = { ++ .owner = THIS_MODULE, ++ .open = mlx5_crdump_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release ++}; ++#endif + + int mlx5_cr_protected_capture(struct mlx5_core_dev *dev) + { +@@ -291,7 +305,11 @@ int mlx5_crdump_init(struct mlx5_core_de + + if (mlx5_crdump_dir) + if (!proc_create_data(pci_name(dev->pdev), S_IRUGO, ++#ifdef HAVE_PROC_OPS_STRUCT + mlx5_crdump_dir, &mlx5_crdump_ops, ++#else ++ mlx5_crdump_dir, &mlx5_crdump_fops, ++#endif + crdump)) { + pr_warn("failed creating proc file\n"); + goto clean_mem; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0078-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch b/src/mlnx-ofa_kernel-5.8/backports/0078-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch new file mode 100644 index 0000000..8783d8d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0078-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch @@ -0,0 +1,353 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/dev.c + +Change-Id: Iaf4527dc09607aa9451d4346686d2e0d34b4e51a +--- + drivers/net/ethernet/mellanox/mlx5/core/dev.c | 161 ++++++++++++++++-- + 1 file changed, 151 insertions(+), 10 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c +@@ -112,8 +112,10 @@ bool mlx5_eth_supported(struct mlx5_core + return true; + } + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH + static bool is_eth_enabled(struct 
mlx5_core_dev *dev) + { ++#ifdef HAVE_DEVLINK_PARAM_REGISTER + union devlink_param_value val; + int err; + +@@ -121,7 +123,11 @@ static bool is_eth_enabled(struct mlx5_c + DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + &val); + return err ? false : val.vbool; ++#else ++ return true; ++#endif + } ++#endif + + bool mlx5_vnet_supported(struct mlx5_core_dev *dev) + { +@@ -145,8 +151,10 @@ bool mlx5_vnet_supported(struct mlx5_cor + return true; + } + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH + static bool is_vnet_enabled(struct mlx5_core_dev *dev) + { ++#ifdef HAVE_DEVLINK_PARAM_REGISTER + union devlink_param_value val; + int err; + +@@ -154,7 +162,11 @@ static bool is_vnet_enabled(struct mlx5_ + DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + &val); + return err ? false : val.vbool; ++#else ++ return true; ++#endif + } ++#endif + + static bool is_ib_rep_supported(struct mlx5_core_dev *dev) + { +@@ -219,16 +231,22 @@ bool mlx5_rdma_supported(struct mlx5_cor + return true; + } + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH + static bool is_ib_enabled(struct mlx5_core_dev *dev) + { +- union devlink_param_value val; +- int err; +- +- err = devlink_param_driverinit_value_get(priv_to_devlink(dev), +- DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, +- &val); +- return err ? false : val.vbool; ++#ifdef HAVE_DEVLINK_PARAM_REGISTER ++ union devlink_param_value val; ++ int err; ++ ++ err = devlink_param_driverinit_value_get(priv_to_devlink(dev), ++ DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, ++ &val); ++ return err ? false : val.vbool; ++#else ++ return true; ++#endif + } ++#endif + + enum { + MLX5_INTERFACE_PROTOCOL_ETH, +@@ -248,13 +266,22 @@ static const struct mlx5_adev_device { + } mlx5_adev_devices[] = { + [MLX5_INTERFACE_PROTOCOL_VNET] = { .suffix = "vnet", + .is_supported = &mlx5_vnet_supported, +- .is_enabled = &is_vnet_enabled }, ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH ++ .is_enabled = &is_vnet_enabled ++#endif ++ }, + [MLX5_INTERFACE_PROTOCOL_IB] = { .suffix = "rdma", + .is_supported = &mlx5_rdma_supported, +- .is_enabled = &is_ib_enabled }, ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH ++ .is_enabled = &is_ib_enabled ++#endif ++ }, + [MLX5_INTERFACE_PROTOCOL_ETH] = { .suffix = "eth", + .is_supported = &mlx5_eth_supported, +- .is_enabled = &is_eth_enabled }, ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH ++ .is_enabled = &is_eth_enabled ++#endif ++ }, + [MLX5_INTERFACE_PROTOCOL_ETH_REP] = { .suffix = "eth-rep", + .is_supported = &is_eth_rep_supported }, + [MLX5_INTERFACE_PROTOCOL_IB_REP] = { .suffix = "rdma-rep", +@@ -265,12 +292,20 @@ static const struct mlx5_adev_device { + + int mlx5_adev_idx_alloc(void) + { ++#ifdef HAVE_IDA_ALLOC + return ida_alloc(&mlx5_adev_ida, GFP_KERNEL); ++#else ++ return ida_simple_get(&mlx5_adev_ida,0, 0, GFP_KERNEL); ++#endif + } + + void mlx5_adev_idx_free(int idx) + { ++#ifdef HAVE_IDA_FREE + ida_free(&mlx5_adev_ida, idx); ++#else ++ ida_simple_remove(&mlx5_adev_ida, idx); ++#endif + } + + int mlx5_adev_init(struct mlx5_core_dev *dev) +@@ -344,13 +379,27 @@ static void del_adev(struct auxiliary_de + + int mlx5_attach_device(struct mlx5_core_dev *dev) + { ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER) //forward port ++ struct devlink *devlink = priv_to_devlink(dev); ++#endif + struct mlx5_priv *priv = &dev->priv; + struct auxiliary_device *adev; + struct auxiliary_driver *adrv; + int ret = 0, i; + ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ 
devl_assert_locked(priv_to_devlink(dev)); ++#else ++ devl_lock(devlink); ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER*/ ++#endif /* HAVE_DEVL_PORT_REGISTER */ + mutex_lock(&mlx5_intf_mutex); + priv->flags &= ~MLX5_PRIV_FLAGS_DETACH; ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ priv->flags |= MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW; ++#endif + for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) { + if (!priv->adev[i]) { + bool is_supported = false; +@@ -398,19 +447,40 @@ int mlx5_attach_device(struct mlx5_core_ + break; + } + } ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ priv->flags &= ~MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW; ++#endif + mutex_unlock(&mlx5_intf_mutex); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER) //forward port ++ devl_unlock(devlink); ++#endif + return ret; + } + + void mlx5_detach_device(struct mlx5_core_dev *dev) + { ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER) //forward port ++ struct devlink *devlink = priv_to_devlink(dev); ++#endif + struct mlx5_priv *priv = &dev->priv; + struct auxiliary_device *adev; + struct auxiliary_driver *adrv; + pm_message_t pm = {}; + int i; + ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_assert_locked(priv_to_devlink(dev)); ++#else ++ devl_lock(devlink); ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER */ ++#endif /* HAVE_DEVL_PORT_REGISTER */ + mutex_lock(&mlx5_intf_mutex); ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ priv->flags |= MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW; ++#endif + for (i = ARRAY_SIZE(mlx5_adev_devices) - 1; i >= 0; i--) { + if (!priv->adev[i]) + continue; +@@ -439,18 +509,41 @@ skip_suspend: + del_adev(&priv->adev[i]->adev); + priv->adev[i] = NULL; + } ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ priv->flags &= ~MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW; ++#endif + priv->flags |= MLX5_PRIV_FLAGS_DETACH; + mutex_unlock(&mlx5_intf_mutex); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER) //forward port ++ devl_unlock(devlink); ++#endif + } + + int mlx5_register_device(struct mlx5_core_dev *dev) + { ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER) //forward port ++ struct devlink *devlink; ++#endif + int ret; + ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_assert_locked(priv_to_devlink(dev)); ++#else ++ devlink = priv_to_devlink(dev); ++ devl_lock(devlink); ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER */ ++#endif /* HAVE_DEVL_PORT_REGISTER */ + mutex_lock(&mlx5_intf_mutex); + dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + ret = mlx5_rescan_drivers_locked(dev); + mutex_unlock(&mlx5_intf_mutex); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER) //forward port ++ devl_unlock(devlink); ++#endif + if (ret) + mlx5_unregister_device(dev); + +@@ -459,11 +552,25 @@ int mlx5_register_device(struct mlx5_cor + + void mlx5_unregister_device(struct mlx5_core_dev *dev) + { ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER ++ devl_assert_locked(priv_to_devlink(dev)); ++#else ++ struct devlink *devlink; ++ ++ devlink = priv_to_devlink(dev); ++ devl_lock(devlink); ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER */ ++#endif /* HAVE_DEVL_PORT_REGISTER */ + mutex_lock(&mlx5_intf_mutex); + dev->priv.flags = MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + dev->priv.flags &= ~MLX5_PRIV_FLAGS_DETACH; + 
mlx5_rescan_drivers_locked(dev); + mutex_unlock(&mlx5_intf_mutex); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER) //forward port ++ devl_unlock(devlink); ++#endif + } + + static int add_drivers(struct mlx5_core_dev *dev) +@@ -536,16 +643,32 @@ del_adev: + int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev) + { + struct mlx5_priv *priv = &dev->priv; ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ int err = 0; ++#endif + + lockdep_assert_held(&mlx5_intf_mutex); + if (priv->flags & MLX5_PRIV_FLAGS_DETACH) + return 0; + ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ priv->flags |= MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW; ++#endif + delete_drivers(dev); + if (priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ goto out; ++ ++ err = add_drivers(dev); ++ ++out: ++ priv->flags &= ~MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW; ++ return err; ++#else + return 0; + + return add_drivers(dev); ++#endif + } + + bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev) +@@ -591,7 +714,11 @@ static struct mlx5_core_dev *is_mlx5_cor + return (struct mlx5_core_dev *)pci_get_drvdata(pdev); + } + ++#if defined(HAVE_LINUX_DEVICE_BUS_H) || defined(HAVE_BUS_FIND_DEVICE_GET_CONST) + static int next_phys_dev(struct device *dev, const void *data) ++#else ++static int next_phys_dev(struct device *dev, void *data) ++#endif /* HAVE_BUS_FIND_DEVICE_GET_CONST || HAVE_LINUX_DEVICE_BUS_H */ + { + struct mlx5_core_dev *mdev = is_mlx5_core_dev(dev, (struct mlx5_core_dev *)data); + +@@ -601,7 +728,11 @@ static int next_phys_dev(struct device * + return _next_phys_dev(mdev, data); + } + ++#if defined(HAVE_LINUX_DEVICE_BUS_H) || defined(HAVE_BUS_FIND_DEVICE_GET_CONST) + static int next_phys_dev_lag(struct device *dev, const void *data) ++#else ++static int next_phys_dev_lag(struct device *dev, void *data) ++#endif /* HAVE_BUS_FIND_DEVICE_GET_CONST || HAVE_LINUX_DEVICE_BUS_H */ + { + struct mlx5_core_dev *mdev = is_mlx5_core_dev(dev, (struct mlx5_core_dev *)data); + +@@ -617,14 +748,24 @@ static int next_phys_dev_lag(struct devi + return _next_phys_dev(mdev, data); + } + ++#if defined(HAVE_LINUX_DEVICE_BUS_H) || defined(HAVE_BUS_FIND_DEVICE_GET_CONST) + static struct device *pci_find_dev(void *data, + int (*match)(struct device *dev, const void *data)) ++#else ++static struct device *pci_find_dev(void *data, ++ int (*match)(struct device *dev, void *data)) ++#endif /* HAVE_BUS_FIND_DEVICE_GET_CONST || HAVE_LINUX_DEVICE_BUS_H */ + { + return bus_find_device(&pci_bus_type, NULL, data, match); + } + ++#if defined(HAVE_LINUX_DEVICE_BUS_H) || defined(HAVE_BUS_FIND_DEVICE_GET_CONST) ++struct mlx5_core_dev *mlx5_get_next_dev(struct mlx5_core_dev *dev, ++ int (*match)(struct device *dev, const void *data)) ++#else + struct mlx5_core_dev *mlx5_get_next_dev(struct mlx5_core_dev *dev, +- int (*match)(struct device *dev, const void *data)) ++ int (*match)(struct device *dev, void *data)) ++#endif /* HAVE_BUS_FIND_DEVICE_GET_CONST || HAVE_LINUX_DEVICE_BUS_H */ + { + struct device *next; + if (!mlx5_core_is_pf(dev)) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0080-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch b/src/mlnx-ofa_kernel-5.8/backports/0080-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch new file mode 100644 index 0000000..188f0ec --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0080-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: 
[PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c + +Change-Id: I26b98a23aeb7fc470e8439414f4a528960cccb2e +--- + .../net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2019 Mellanox Technologies. */ + ++#ifndef MLX_DISABLE_TRACEPOINTS + #define CREATE_TRACE_POINTS + #include "en_tc_tracepoint.h" + +@@ -56,3 +57,5 @@ const char *parse_action(struct trace_se + trace_seq_putc(p, 0); + return ret; + } ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0081-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch b/src/mlnx-ofa_kernel-5.8/backports/0081-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch new file mode 100644 index 0000000..e084847 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0081-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch @@ -0,0 +1,88 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h + +Change-Id: I3d48c9d00301027bbf1ab8c035fe71adede59436 +--- + .../mlx5/core/diag/en_tc_tracepoint.h | 28 +++++++++++++++++++ + 1 file changed, 28 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ + /* Copyright (c) 2019 Mellanox Technologies. */ + ++#ifndef MLX_DISABLE_TRACEPOINTS + #undef TRACE_SYSTEM + #define TRACE_SYSTEM mlx5 + +@@ -27,10 +28,18 @@ DECLARE_EVENT_CLASS(mlx5e_flower_templat + TP_ARGS(f), + TP_STRUCT__entry(__field(void *, cookie) + __field(unsigned int, num) ++#ifdef HAVE_TC_SETUP_FLOW_ACTION + __dynamic_array(int, ids, f->rule ? + f->rule->action.num_entries : 0) ++#else ++ __dynamic_array(int, ids, ++ tcf_exts_num_actions(f->exts)) ++ __dynamic_array(char, actions, ++ tcf_exts_num_actions(f->exts) * sizeof(struct flow_action_entry)) ++#endif + ), + TP_fast_assign(__entry->cookie = (void *)f->cookie; ++#ifdef HAVE_TC_SETUP_FLOW_ACTION + __entry->num = (f->rule ? + f->rule->action.num_entries : 0); + if (__entry->num) +@@ -38,6 +47,15 @@ DECLARE_EVENT_CLASS(mlx5e_flower_templat + f->rule->action.entries, + f->rule->action.num_entries); + ), ++#else ++ __entry->num = tcf_exts_num_actions(f->exts); ++ tc_setup_flow_action(__get_dynamic_array(actions), f->exts); ++ if (__entry->num) ++ put_ids_to_array(__get_dynamic_array(ids), ++ __get_dynamic_array(actions), ++ __entry->num); ++ ), ++#endif + TP_printk("cookie=%p actions= %s\n", + __entry->cookie, __entry->num ? 
+ __parse_action(__get_dynamic_array(ids), +@@ -59,19 +77,27 @@ TRACE_EVENT(mlx5e_stats_flower, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f), + TP_STRUCT__entry(__field(void *, cookie) ++#ifdef HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD + __field(u64, bytes) + __field(u64, packets) + __field(u64, lastused) ++#endif + ), + TP_fast_assign(__entry->cookie = (void *)f->cookie; ++#ifdef HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD + __entry->bytes = f->stats.bytes; + __entry->packets = f->stats.pkts; + __entry->lastused = f->stats.lastused; ++#endif + ), ++#ifdef HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD + TP_printk("cookie=%p bytes=%llu packets=%llu lastused=%llu\n", + __entry->cookie, __entry->bytes, + __entry->packets, __entry->lastused + ) ++#else ++ TP_printk("cookie=%p\n", __entry->cookie) ++#endif + ); + + TRACE_EVENT(mlx5e_tc_update_neigh_used_value, +@@ -112,3 +138,5 @@ TRACE_EVENT(mlx5e_tc_update_neigh_used_v + #undef TRACE_INCLUDE_FILE + #define TRACE_INCLUDE_FILE en_tc_tracepoint + #include ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0082-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch b/src/mlnx-ofa_kernel-5.8/backports/0082-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch new file mode 100644 index 0000000..ec441cd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0082-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c + +Change-Id: Ibf737275631c442d2dead5302f3484b16b157710 +--- + drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +@@ -30,6 +30,8 @@ + * SOFTWARE. + */ + ++#ifndef MLX_DISABLE_TRACEPOINTS ++ + #define CREATE_TRACE_POINTS + + #include "fs_tracepoint.h" +@@ -278,3 +280,4 @@ EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fte + EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_rule); + EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_rule); + ++#endif /* MLX_DISABLE_TRACEPOINTS */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0083-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch b/src/mlnx-ofa_kernel-5.8/backports/0083-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch new file mode 100644 index 0000000..ae1424c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0083-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch @@ -0,0 +1,81 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h + +Change-Id: I1b4b8b1165753c8bd4e04b2724ad75a440107204 +--- + .../mellanox/mlx5/core/diag/fs_tracepoint.h | 38 ++++++++++++++----- + 1 file changed, 28 insertions(+), 10 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h +@@ -30,6 +30,22 @@ + * SOFTWARE. + */ + ++ ++/* Before kernel 3.14 there was a bug in kernel/trace/trace_export.c that ++ * always used %d format which causes many compilation warnings. ++ * Cast the size to int for such kernels to suppress those warnings. 
++ */ ++#ifdef MLX_SIZE_CAST_WA ++#undef MLX_SIZE_CAST_WA ++#endif ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0)) ++#define MLX_SIZE_CAST_WA (int) ++#else ++#define MLX_SIZE_CAST_WA ++#endif ++ ++#ifndef MLX_DISABLE_TRACEPOINTS ++ + #if !defined(_MLX5_FS_TP_) || defined(TRACE_HEADER_MULTI_READ) + #define _MLX5_FS_TP_ + +@@ -106,9 +122,9 @@ TRACE_EVENT(mlx5_fs_add_fg, + __field(u32, end_index) + __field(u32, id) + __field(u8, mask_enable) +- __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) +- __array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) +- __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) ++ __array(u32, mask_outer, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) ++ __array(u32, mask_inner, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) ++ __array(u32, mask_misc, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_misc)) + ), + TP_fast_assign( + __entry->fg = fg; +@@ -190,12 +206,12 @@ TRACE_EVENT(mlx5_fs_set_fte, + __field(u32, flow_source) + __field(u8, mask_enable) + __field(int, new_fte) +- __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) +- __array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) +- __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) +- __array(u32, value_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) +- __array(u32, value_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) +- __array(u32, value_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) ++ __array(u32, mask_outer, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) ++ __array(u32, mask_inner, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) ++ __array(u32, mask_misc, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_misc)) ++ __array(u32, value_outer, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) ++ __array(u32, value_inner, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) ++ __array(u32, value_misc, MLX_SIZE_CAST_WA MLX5_ST_SZ_DW(fte_match_set_misc)) + ), + TP_fast_assign( + __entry->fte = fte; +@@ -279,7 +295,7 @@ TRACE_EVENT(mlx5_fs_add_rule, + __field(u32, sw_action) + __field(u32, index) + __field(u32, counter_id) +- __array(u8, destination, sizeof(struct mlx5_flow_destination)) ++ __array(u8, destination, MLX_SIZE_CAST_WA sizeof(struct mlx5_flow_destination)) + ), + TP_fast_assign( + __entry->rule = rule; +@@ -321,3 +337,5 @@ TRACE_EVENT(mlx5_fs_del_rule, + #undef TRACE_INCLUDE_FILE + #define TRACE_INCLUDE_FILE fs_tracepoint + #include ++ ++#endif /* MLX_DISABLE_TRACEPOINTS */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0084-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch b/src/mlnx-ofa_kernel-5.8/backports/0084-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch new file mode 100644 index 0000000..fc58902 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0084-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch @@ -0,0 +1,101 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c + +Change-Id: I32a564e0d1563ba56c4ce7136207259825ed0084 +--- + .../mellanox/mlx5/core/diag/fw_tracer.c | 26 ++++++++++++++++--- + 1 file changed, 22 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +@@ -29,11 +29,13 @@ + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ ++#ifndef MLX_DISABLE_TRACEPOINTS + #define CREATE_TRACE_POINTS +-#include "lib/eq.h" +-#include "fw_tracer.h" + #include "fw_tracer_tracepoint.h" ++#endif ++#include "fw_tracer.h" + ++#include "lib/eq.h" + static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer) + { + u32 *string_db_base_address_out = tracer->str_db.base_address_out; +@@ -435,8 +437,9 @@ static struct tracer_string_format *mlx5 + u8 event_id, u32 tmsn) + { + struct tracer_string_format *message; ++ COMPAT_HL_NODE + +- hlist_for_each_entry(message, head, hlist) ++ compat_hlist_for_each_entry(message, head, hlist) + if (message->event_id == event_id && message->tmsn == tmsn) + return message; + +@@ -520,10 +523,11 @@ static void mlx5_fw_tracer_clean_print_h + { + struct tracer_string_format *str_frmt; + struct hlist_node *n; ++ COMPAT_HL_NODE + int i; + + for (i = 0; i < MESSAGE_HASH_SIZE; i++) { +- hlist_for_each_entry_safe(str_frmt, n, &tracer->hash[i], hlist) ++ compat_hlist_for_each_entry_safe(str_frmt, n, &tracer->hash[i], hlist) + mlx5_tracer_clean_message(str_frmt); + } + } +@@ -548,7 +552,11 @@ static void mlx5_fw_tracer_save_trace(st + trace_data->timestamp = timestamp; + trace_data->lost = lost; + trace_data->event_id = event_id; ++#ifdef HAVE_STRSCPY_PAD + strscpy_pad(trace_data->msg, msg, TRACE_STR_MSG); ++#else ++ strncpy(trace_data->msg, msg, TRACE_STR_MSG); ++#endif + + tracer->st_arr.saved_traces_index = + (tracer->st_arr.saved_traces_index + 1) & (SAVED_TRACES_NUM - 1); +@@ -571,8 +579,14 @@ void mlx5_tracer_print_trace(struct trac + str_frmt->params[5], + str_frmt->params[6]); + ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost, + str_frmt->event_id, tmp); ++#else ++ pr_debug("%s %llu %d %d %s\n", dev_name(&dev->pdev->dev), ++ trace_timestamp, str_frmt->lost, ++ str_frmt->event_id, tmp); ++#endif + + mlx5_fw_tracer_save_trace(dev->tracer, trace_timestamp, + str_frmt->lost, str_frmt->event_id, tmp); +@@ -827,6 +841,7 @@ static void mlx5_fw_tracer_ownership_cha + mlx5_fw_tracer_start(tracer); + } + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + static int mlx5_fw_tracer_set_core_dump_reg(struct mlx5_core_dev *dev, + u32 *in, int size_in) + { +@@ -929,6 +944,7 @@ unlock: + mutex_unlock(&tracer->st_arr.lock); + return err; + } ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + /* Create software resources (Buffers, etc ..) 
*/ + struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) +@@ -1150,4 +1166,6 @@ static int fw_tracer_event(struct notifi + return NOTIFY_OK; + } + ++#ifndef MLX_DISABLE_TRACEPOINTS + EXPORT_TRACEPOINT_SYMBOL(mlx5_fw); ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0085-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch b/src/mlnx-ofa_kernel-5.8/backports/0085-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch new file mode 100644 index 0000000..a3e11bb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0085-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dia.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h + +Change-Id: I6d66e3bfc0a35573d661976c147a198b1b3607dd +--- + drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h +@@ -188,9 +188,10 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_cr + int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer); + void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer); + void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer); ++int mlx5_fw_tracer_reload(struct mlx5_fw_tracer *tracer); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev); + int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer, +- struct devlink_fmsg *fmsg); +-int mlx5_fw_tracer_reload(struct mlx5_fw_tracer *tracer); +- ++ struct devlink_fmsg *fmsg); ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + #endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0086-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en..patch b/src/mlnx-ofa_kernel-5.8/backports/0086-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en..patch new file mode 100644 index 0000000..d917c79 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0086-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en..patch @@ -0,0 +1,672 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en.h + +Change-Id: I75a9db15426bd2e37cd05bb32ba324c16fa4b7b0 +--- + drivers/net/ethernet/mellanox/mlx5/core/en.h | 258 +++++++++++++++++-- + 1 file changed, 239 insertions(+), 19 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h +@@ -32,9 +32,13 @@ + #ifndef __MLX5_EN_H__ + #define __MLX5_EN_H__ + ++#ifdef HAVE_XDP_SUPPORT ++#include ++#endif + #include + #include + #include ++#include + #include + #include + #include +@@ -45,12 +49,17 @@ + #include + #include + #include ++#include ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + #include ++#endif + #include + #include + #include + #include ++#ifdef HAVE_BITS_H + #include ++#endif + #include "wq.h" + #include "mlx5_core.h" + #include "en_stats.h" +@@ -60,9 +69,20 @@ + #include "lib/hv_vhca.h" + #include "lib/clock.h" + #include "en/rx_res.h" ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++#include ++#else ++#include ++#endif + ++#ifndef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++/* The intention is to pass NULL for backports of old kernels */ ++struct devlink_health_reporter {}; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + extern const struct net_device_ops mlx5e_netdev_ops; ++#ifdef HAVE_NET_PAGE_POOL_H + struct page_pool; ++#endif + + #define MLX5E_METADATA_ETHER_TYPE (0x8CE4) + #define 
MLX5E_METADATA_ETHER_LEN 8 +@@ -286,14 +306,22 @@ enum mlx5e_priv_flag { + MLX5E_PFLAG_RX_CQE_BASED_MODER, + MLX5E_PFLAG_TX_CQE_BASED_MODER, + MLX5E_PFLAG_RX_CQE_COMPRESS, ++ MLX5E_PFLAG_TX_CQE_COMPRESS, + MLX5E_PFLAG_RX_STRIDING_RQ, + MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, ++#ifdef HAVE_XDP_SUPPORT + MLX5E_PFLAG_XDP_TX_MPWQE, ++#endif + MLX5E_PFLAG_SKB_TX_MPWQE, + MLX5E_PFLAG_TX_PORT_TS, + MLX5E_PFLAG_DROPLESS_RQ, + MLX5E_PFLAG_PER_CH_STATS, ++ /* OFED-specific private flags */ ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ MLX5E_PFLAG_HWLRO, ++#endif + MLX5E_PFLAG_TX_XDP_CSUM, ++ MLX5E_PFLAG_SKB_XMIT_MORE, + MLX5E_NUM_PFLAGS, /* Keep last */ + }; + +@@ -329,27 +357,38 @@ struct mlx5e_params { + u8 log_rx_page_cache_mult; + u16 num_channels; + struct { ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + u16 mode; ++#endif + u8 num_tc; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + struct { + u64 max_rate[TC_MAX_QUEUE]; + u32 hw_id[TC_MAX_QUEUE]; + } channel; ++#endif + } mqprio; + bool rx_cqe_compress_def; + bool tunneled_offload_en; + struct dim_cq_moder rx_cq_moderation; + struct dim_cq_moder tx_cq_moderation; + struct mlx5e_packet_merge_param packet_merge; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ bool lro_en; ++#endif + u8 tx_min_inline_mode; + bool vlan_strip_disable; + bool scatter_fcs_en; + bool rx_dim_enabled; + bool tx_dim_enabled; + u32 pflags; ++#ifdef HAVE_XDP_SUPPORT + struct bpf_prog *xdp_prog; ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_xsk *xsk; ++#endif + unsigned int sw_mtu; + int hard_mtu; + bool ptp_rx; +@@ -361,8 +400,12 @@ struct mlx5e_params { + + static inline u8 mlx5e_get_dcb_num_tc(struct mlx5e_params *params) + { ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + return params->mqprio.mode == TC_MQPRIO_MODE_DCB ? + params->mqprio.num_tc : 1; ++#else ++ return params->mqprio.num_tc; ++#endif + } + + enum { +@@ -374,7 +417,8 @@ enum { + MLX5E_RQ_STATE_FPGA_TLS, /* FPGA TLS enabled */ + MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, /* set when mini_cqe_resp_stride_index cap is used */ + MLX5E_RQ_STATE_SHAMPO, /* set when SHAMPO cap is used */ +- MLX5E_RQ_STATE_CACHE_REDUCE_PENDING ++ MLX5E_RQ_STATE_CACHE_REDUCE_PENDING, ++ MLX5E_RQ_STATE_SKB_XMIT_MORE + }; + + struct mlx5e_cq { +@@ -386,6 +430,9 @@ struct mlx5e_cq { + struct napi_struct *napi; + struct mlx5_core_cq mcq; + struct mlx5e_ch_stats *ch_stats; ++#ifndef HAVE_NAPI_STATE_MISSED ++ unsigned long *ch_flags; ++#endif + + /* control */ + struct net_device *netdev; +@@ -425,6 +472,7 @@ enum { + MLX5E_SQ_STATE_PENDING_XSK_TX, + MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, + MLX5E_SQ_STATE_TX_XDP_CSUM, ++ MLX5E_SQ_STATE_SKB_XMIT_MORE, + }; + + struct mlx5e_tx_mpwqe { +@@ -466,6 +514,7 @@ struct mlx5e_txqsq { + struct mlx5e_tx_mpwqe mpwqe; + + struct mlx5e_cq cq; ++ struct mlx5e_cq_decomp cqd; + + /* read only */ + struct mlx5_wq_cyc wq; +@@ -506,13 +555,21 @@ struct mlx5e_dma_info { + u32 refcnt_bias; + union { + struct page *page; ++#ifdef HAVE_XSK_BUFF_ALLOC + struct xdp_buff *xsk; ++#else ++ struct { ++ u64 handle; ++ void *data; ++ } xsk; ++#endif + }; + }; + + /* XDP packets can be transmitted in different ways. On completion, we need to + * distinguish between them to clean up things in a proper way. + */ ++#ifdef HAVE_XDP_SUPPORT + enum mlx5e_xdp_xmit_mode { + /* An xdp_frame was transmitted due to either XDP_REDIRECT from another + * device or XDP_TX from an XSK RQ. 
The frame has to be unmapped and +@@ -526,7 +583,7 @@ enum mlx5e_xdp_xmit_mode { + MLX5E_XDP_XMIT_MODE_PAGE, + + /* No xdp_frame was created at all, the transmit happened from a UMEM +- * page. The UMEM Completion Ring producer pointer has to be increased. ++ * page. The UMEM Completion Ring producer pointer has to be increased. + */ + MLX5E_XDP_XMIT_MODE_XSK, + }; +@@ -544,6 +601,7 @@ struct mlx5e_xdp_info { + } page; + }; + }; ++#endif /* HAVE_XDP_SUPPORT */ + + struct mlx5e_xmit_data { + dma_addr_t dma_addr; +@@ -551,6 +609,7 @@ struct mlx5e_xmit_data { + u32 len; + }; + ++#ifdef HAVE_XDP_SUPPORT + struct mlx5e_xdp_info_fifo { + struct mlx5e_xdp_info *xi; + u32 *cc; +@@ -581,7 +640,13 @@ struct mlx5e_xdpsq { + struct mlx5e_cq cq; + + /* read only */ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *xsk_pool; ++#else ++ struct xdp_umem *umem; ++#endif ++#endif + struct mlx5_wq_cyc wq; + struct mlx5e_xdpsq_stats *stats; + mlx5e_fp_xmit_xdp_frame_check xmit_xdp_frame_check; +@@ -604,6 +669,8 @@ struct mlx5e_xdpsq { + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5e_channel *channel; + } ____cacheline_aligned_in_smp; ++#endif /* HAVE_XDP_SUPPORT */ ++ + + struct mlx5e_ktls_resync_resp; + +@@ -648,8 +715,25 @@ struct mlx5e_umr_dma_info { + struct mlx5e_mpw_info { + struct mlx5e_umr_dma_info umr; + u16 consumed_strides; ++#ifdef HAVE_XDP_SUPPORT + DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); ++#endif ++}; ++ ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++#define IS_HW_LRO(params) \ ++ ((params)->lro_en && MLX5E_GET_PFLAG(params, MLX5E_PFLAG_HWLRO)) ++ ++#define IS_SW_LRO(params) \ ++ ((params)->lro_en && !MLX5E_GET_PFLAG(params, MLX5E_PFLAG_HWLRO)) ++ ++/* SW LRO defines for MLX5 */ ++#define MLX5E_LRO_MAX_DESC 32 ++struct mlx5e_sw_lro { ++ struct net_lro_mgr lro_mgr; ++ struct net_lro_desc lro_desc[MLX5E_LRO_MAX_DESC]; + }; ++#endif + + #define MLX5E_MAX_RX_FRAGS 4 + +@@ -686,7 +770,8 @@ static inline void mlx5e_put_page(struct + } + + struct mlx5e_rq; +-typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*); ++typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*, ++ bool xmit_more); + typedef struct sk_buff * + (*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); +@@ -763,6 +848,9 @@ struct mlx5e_rq { + + }; + struct { ++#if !defined(HAVE_XSK_BUFF_ALLOC) && defined(HAVE_XSK_ZERO_COPY_SUPPORT) ++ u16 umem_headroom; ++#endif + u16 headroom; + u32 frame0_sz; + u8 map_dir; /* dma map direction */ +@@ -792,15 +880,29 @@ struct mlx5e_rq { + + struct mlx5e_dim dim_obj; /* Adaptive Moderation */ + ++#ifdef HAVE_XDP_SUPPORT + /* XDP */ + struct bpf_prog __rcu *xdp_prog; + struct mlx5e_xdpsq *xdpsq; ++#endif + DECLARE_BITMAP(flags, 8); ++#ifdef HAVE_NET_PAGE_POOL_H + struct page_pool *page_pool; +- ++#endif ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_sw_lro *sw_lro; ++#endif + /* AF_XDP zero-copy */ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifndef HAVE_XSK_BUFF_ALLOC ++ struct zero_copy_allocator zca; ++#endif ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *xsk_pool; +- ++#else ++ struct xdp_umem *umem; ++#endif ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ + struct work_struct recover_work; + + /* control */ +@@ -813,8 +915,10 @@ struct mlx5e_rq { + u32 umr_mkey; + struct mlx5e_dma_info wqe_overflow; + ++#ifdef HAVE_XDP_RXQ_INFO + /* XDP read-mostly */ + struct xdp_rxq_info xdp_rxq; ++#endif + cqe_ts_to_ns 
ptp_cyc2time; + } ____cacheline_aligned_in_smp; + +@@ -823,14 +927,24 @@ enum mlx5e_channel_state { + MLX5E_CHANNEL_NUM_STATES + }; + ++#ifndef HAVE_NAPI_STATE_MISSED ++enum channel_flags { ++ MLX5E_CHANNEL_NAPI_SCHED = 1, ++}; ++#endif ++ + struct mlx5e_channel { + /* data path */ + struct mlx5e_rq rq; ++#ifdef HAVE_XDP_SUPPORT + struct mlx5e_xdpsq rq_xdpsq; ++#endif + struct mlx5e_txqsq sq[MLX5E_MAX_NUM_TC]; + struct mlx5e_icosq icosq; /* internal control operations */ + struct mlx5e_txqsq __rcu * __rcu *qos_sqs; ++#ifdef HAVE_XDP_SUPPORT + bool xdp; ++#endif + struct napi_struct napi; + struct device *pdev; + struct net_device *netdev; +@@ -838,14 +952,18 @@ struct mlx5e_channel { + u16 qos_sqs_size; + u8 num_tc; + u8 lag_port; +- ++#ifndef HAVE_NAPI_STATE_MISSED ++ unsigned long flags; ++#endif ++#ifdef HAVE_XDP_SUPPORT + /* XDP_REDIRECT */ + struct mlx5e_xdpsq xdpsq; +- ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + /* AF_XDP zero-copy */ + struct mlx5e_rq xskrq; + struct mlx5e_xdpsq xsksq; +- ++#endif + /* Async ICOSQ */ + struct mlx5e_icosq async_icosq; + /* async_icosq can be accessed from any CPU - the spinlock protects it. */ +@@ -880,10 +998,16 @@ struct mlx5e_channel_stats { + struct mlx5e_ch_stats ch; + struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC]; + struct mlx5e_rq_stats rq; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_rq_stats xskrq; ++#endif ++#ifdef HAVE_XDP_SUPPORT + struct mlx5e_xdpsq_stats rq_xdpsq; + struct mlx5e_xdpsq_stats xdpsq; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_xdpsq_stats xsksq; ++#endif ++#endif + } ____cacheline_aligned_in_smp; + + struct mlx5e_ptp_stats { +@@ -923,6 +1047,7 @@ struct mlx5e_hv_vhca_stats_agent { + }; + #endif + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_xsk { + /* XSK buffer pools are stored separately from channels, + * because we don't want to lose them when channels are +@@ -930,10 +1055,15 @@ struct mlx5e_xsk { + * distinguish between zero-copy and non-zero-copy UMEMs, so + * rely on our mechanism. 
+ */ ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool **pools; ++#else ++ struct xdp_umem **umems; ++#endif + u16 refcnt; + bool ever_used; + }; ++#endif + + /* Temporary storage for variables that are allocated when struct mlx5e_priv is + * initialized, and used where we can't allocate them because that functions +@@ -956,9 +1086,9 @@ struct mlx5e_htb { + DECLARE_HASHTABLE(qos_tc2node, order_base_2(MLX5E_QOS_MAX_LEAF_NODES)); + DECLARE_BITMAP(qos_used_qids, MLX5E_QOS_MAX_LEAF_NODES); + struct mlx5e_sq_stats **qos_sq_stats; +- u16 max_qos_sqs; +- u16 maj_id; +- u16 defcls; ++ u32 max_qos_sqs; ++ u32 maj_id; ++ u32 defcls; + struct mlx5e_select_queue_params *final_selq; + }; + +@@ -1011,7 +1141,13 @@ struct mlx5e_priv { + struct mlx5e_ptp_stats ptp_stats; + u16 stats_nch; + u16 max_nch; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_sw_lro sw_lro[MLX5E_MAX_NUM_CHANNELS]; ++#endif + u8 max_opened_tc; ++#if !defined(HAVE_NDO_GET_STATS64) && !defined(HAVE_NDO_GET_STATS64_RET_VOID) ++ struct net_device_stats netdev_stats; ++#endif + u8 shared_rq:1; + bool tx_ptp_opened; + bool rx_ptp_opened; +@@ -1021,7 +1157,9 @@ struct mlx5e_priv { + struct notifier_block events_nb; + struct notifier_block blocking_events_nb; + ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + struct udp_tunnel_nic_info nic_info; ++#endif + #ifdef CONFIG_MLX5_CORE_EN_DCB + struct mlx5e_dcbx dcbx; + #endif +@@ -1041,7 +1179,9 @@ struct mlx5e_priv { + struct mlx5e_delay_drop delay_drop; + struct devlink_health_reporter *tx_reporter; + struct devlink_health_reporter *rx_reporter; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_xsk xsk; ++#endif + #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) + struct mlx5e_hv_vhca_stats_agent stats_agent; + #endif +@@ -1095,7 +1235,9 @@ struct mlx5e_profile { + void mlx5e_create_debugfs(struct mlx5e_priv *priv); + void mlx5e_destroy_debugfs(struct mlx5e_priv *priv); + ++#ifdef __ETHTOOL_DECLARE_LINK_MODE_MASK + void mlx5e_build_ptys2ethtool_map(void); ++#endif + + bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev); + +@@ -1103,10 +1245,26 @@ void mlx5e_shampo_dealloc_hd(struct mlx5 + int mlx5e_sysfs_create(struct net_device *dev); + void mlx5e_sysfs_remove(struct net_device *dev); + ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) + int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, +- struct tc_mqprio_qopt_offload *mqprio); ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD ++ struct tc_mqprio_qopt_offload *mqprio ++#else ++ struct tc_mqprio_qopt *mqprio ++#endif ++); ++#else ++int mlx5e_setup_tc(struct net_device *netdev, u8 tc); ++#endif + ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID + void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); ++#elif defined(HAVE_NDO_GET_STATS64) ++struct rtnl_link_stats64 * mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); ++#else ++struct net_device_stats * mlx5e_get_stats(struct net_device *dev); ++#endif ++ + void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s); + + void mlx5e_init_l2_addr(struct mlx5e_priv *priv); +@@ -1119,6 +1277,7 @@ void mlx5e_set_rx_mode_work(struct work_ + int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); + int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr); + int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val, bool rx_filter); ++int mlx5e_modify_tx_cqe_compression_locked(struct mlx5e_priv *priv, bool val); + + int mlx5e_vlan_rx_add_vid(struct 
net_device *dev, __always_unused __be16 proto, + u16 vid); +@@ -1134,6 +1293,9 @@ struct mlx5e_create_cq_param { + struct mlx5e_ch_stats *ch_stats; + int node; + int ix; ++#ifndef HAVE_NAPI_STATE_MISSED ++ unsigned long *ch_flags; ++#endif + }; + int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time); + void mlx5e_close_rq(struct mlx5e_priv *priv, struct mlx5e_rq *rq); +@@ -1145,12 +1307,22 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, + void mlx5e_destroy_rq(struct mlx5e_rq *rq); + + struct mlx5e_sq_param; +-int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, +- struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool, +- struct mlx5e_xdpsq *sq, bool is_redirect); ++#ifdef HAVE_XDP_SUPPORT + void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq); ++int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, ++ struct mlx5e_sq_param *param, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *xsk_pool, ++#else ++ struct xdp_umem *umem, ++#endif ++#endif ++ struct mlx5e_xdpsq *sq, ++ bool is_redirect); + void mlx5e_activate_xdpsq(struct mlx5e_xdpsq *sq); + void mlx5e_deactivate_xdpsq(struct mlx5e_xdpsq *sq); ++#endif + + struct mlx5e_cq_param; + int mlx5e_open_cq(struct mlx5e_priv *priv, struct dim_cq_moder moder, +@@ -1239,6 +1411,8 @@ void mlx5e_destroy_mdev_resources(struct + int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, + bool enable_mc_lb); + int mlx5e_modify_tirs_packet_merge(struct mlx5e_priv *priv); ++int mlx5e_modify_tirs_packet_merge_ctx(struct mlx5e_priv *priv, void *context); ++int mlx5e_update_lro(struct net_device *netdev, bool enable); + void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc); + + /* common netdev helpers */ +@@ -1267,7 +1441,9 @@ int mlx5e_set_dev_port_mtu(struct mlx5e_ + int mlx5e_set_dev_port_mtu_ctx(struct mlx5e_priv *priv, void *context); + int mlx5e_change_mtu(struct net_device *netdev, int new_mtu, + mlx5e_fp_preactivate preactivate); ++#if defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) && defined(HAVE_UDP_TUNNEL_NIC_INFO) + void mlx5e_vxlan_set_netdev_info(struct mlx5e_priv *priv); ++#endif + + /* ethtool helpers */ + void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, +@@ -1286,19 +1462,36 @@ void mlx5e_ethtool_get_channels(struct m + int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch); + int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal); ++#else ++ struct ethtool_coalesce *coal); ++#endif + int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack); ++#else ++ struct ethtool_coalesce *coal); ++#endif ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv, + struct ethtool_link_ksettings *link_ksettings); + int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv, + const struct ethtool_link_ksettings *link_ksettings); +-int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc); +-int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, +- const u8 hfunc); ++#endif ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++int mlx5e_get_settings(struct net_device *netdev, struct ethtool_cmd *cmd); ++int mlx5e_set_settings(struct 
net_device *netdev, struct ethtool_cmd *cmd); ++#endif ++int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, ++ u8 *hfunc); ++ ++int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, ++ const u8 *key, const u8 hfunc); ++ + int mlx5e_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, + u32 *rule_locs); + int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd); +@@ -1308,6 +1501,12 @@ int mlx5e_ethtool_get_ts_info(struct mlx + struct ethtool_ts_info *info); + int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, + struct ethtool_flash *flash); ++#ifdef HAVE_TC_SETUP_CB_EGDEV_REGISTER ++#ifndef HAVE_TC_BLOCK_OFFLOAD ++int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, ++ void *type_data); ++#endif ++#endif + void mlx5e_ethtool_get_pauseparam(struct mlx5e_priv *priv, + struct ethtool_pauseparam *pauseparam); + int mlx5e_ethtool_set_pauseparam(struct mlx5e_priv *priv, +@@ -1336,7 +1535,11 @@ int mlx5e_netdev_change_profile(struct m + const struct mlx5e_profile *new_profile, void *new_ppriv); + void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv); + void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv); +-void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu); ++void mlx5e_build_nic_params(struct mlx5e_priv *priv, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ struct mlx5e_xsk *xsk, ++#endif ++ u16 mtu); + void mlx5e_build_txq_maps(struct mlx5e_priv *priv); + + int mlx5e_get_dump_flag(struct net_device *netdev, struct ethtool_dump *dump); +@@ -1353,19 +1556,36 @@ static inline bool mlx5e_dropless_rq_sup + void mlx5e_rx_dim_work(struct work_struct *work); + void mlx5e_tx_dim_work(struct work_struct *work); + ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + int mlx5e_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings); + int mlx5e_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *link_ksettings); ++#endif ++ ++#if defined(HAVE_NDO_UDP_TUNNEL_ADD) || defined(HAVE_NDO_UDP_TUNNEL_ADD_EXTENDED) ++void mlx5e_add_vxlan_port(struct net_device *netdev, struct udp_tunnel_info *ti); ++void mlx5e_del_vxlan_port(struct net_device *netdev, struct udp_tunnel_info *ti); ++#elif defined(HAVE_NDO_ADD_VXLAN_PORT) ++void mlx5e_add_vxlan_port(struct net_device *netdev, sa_family_t sa_family, __be16 port); ++void mlx5e_del_vxlan_port(struct net_device *netdev, sa_family_t sa_family, __be16 port); ++#endif ++ + netdev_features_t mlx5e_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features); ++ ++netdev_features_t mlx5e_features_check(struct sk_buff *skb, struct net_device *netdev, ++ netdev_features_t features); ++ + int mlx5e_set_features(struct net_device *netdev, netdev_features_t features); + #ifdef CONFIG_MLX5_ESWITCH + int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac); + int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); + int mlx5e_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); ++#ifdef HAVE_NDO_GET_VF_STATS + int mlx5e_get_vf_stats(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats); ++#endif + bool mlx5e_is_rep_shared_rq(const struct mlx5e_priv *priv); + #endif + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0087-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0087-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..9d819ec 
--- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0087-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,114 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c + +Change-Id: Ie5decc4c71e2dcc52b33296b2a6e29b1e42d1ede +--- + .../ethernet/mellanox/mlx5/core/en/devlink.c | 61 ++++++++++++++++++- + 1 file changed, 59 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +@@ -4,6 +4,7 @@ + #include "en/devlink.h" + #include "eswitch.h" + ++#if defined(HAVE_DEVLINK_PORT_ATRRS_SET_GET_2_PARAMS) || defined(HAVE_DEVLINK_PORT_ATRRS_SET_GET_7_PARAMS) + static void + mlx5e_devlink_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) + { +@@ -13,14 +14,20 @@ mlx5e_devlink_get_port_parent_id(struct + ppid->id_len = sizeof(parent_id); + memcpy(ppid->id, &parent_id, sizeof(parent_id)); + } ++#endif + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + int mlx5e_devlink_port_register(struct mlx5e_priv *priv) + { + struct devlink *devlink = priv_to_devlink(priv->mdev); +- struct devlink_port_attrs attrs = {}; +- struct netdev_phys_item_id ppid = {}; + struct devlink_port *dl_port; + unsigned int dl_port_index; ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ int ret; ++#endif ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_2_PARAMS ++ struct devlink_port_attrs attrs = {}; ++ struct netdev_phys_item_id ppid = {}; + + if (mlx5_core_is_pf(priv->mdev)) { + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; +@@ -40,8 +47,48 @@ int mlx5e_devlink_port_register(struct m + dl_port = mlx5e_devlink_get_dl_port(priv); + memset(dl_port, 0, sizeof(*dl_port)); + devlink_port_attrs_set(dl_port, &attrs); ++#else ++ dl_port = mlx5e_devlink_get_dl_port(priv); ++ memset(dl_port, 0, sizeof(*dl_port)); ++ if (mlx5_core_is_pf(priv->mdev)) { ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_7_PARAMS ++ struct netdev_phys_item_id ppid = {}; + ++ if (MLX5_ESWITCH_MANAGER(priv->mdev)) ++ mlx5e_devlink_get_port_parent_id(priv->mdev, &ppid); ++#endif ++ dl_port_index = mlx5_esw_vport_to_devlink_port_index(priv->mdev, ++ MLX5_VPORT_UPLINK); ++ devlink_port_attrs_set(dl_port, ++ DEVLINK_PORT_FLAVOUR_PHYSICAL, ++ mlx5_get_dev_index(priv->mdev), ++ false, 0 ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_7_PARAMS ++ ,&ppid.id[0], ppid.id_len ++#endif ++ ); ++ } else { ++ dl_port_index = mlx5_esw_vport_to_devlink_port_index(priv->mdev, 0); ++ devlink_port_attrs_set(dl_port, ++ DEVLINK_PORT_FLAVOUR_VIRTUAL, ++ 0, false , 0 ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_7_PARAMS ++ ,NULL, 0 ++#endif ++ ); ++ } ++#endif ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW)) ++ devl_lock(devlink); ++ ret = devl_port_register(devlink, dl_port, dl_port_index); ++ if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW)) ++ devl_unlock(devlink); ++ ++ return ret; ++#else + return devlink_port_register(devlink, dl_port, dl_port_index); ++#endif + } + + void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv) +@@ -54,8 +101,17 @@ void mlx5e_devlink_port_type_eth_set(str + void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv) + { + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ struct devlink *devlink = priv_to_devlink(priv->mdev); + ++ if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW)) ++ devl_lock(devlink); ++ 
devl_port_unregister(dl_port); ++ if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW)) ++ devl_unlock(devlink); ++#else + devlink_port_unregister(dl_port); ++#endif + } + + struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev) +@@ -67,3 +123,4 @@ struct devlink_port *mlx5e_get_devlink_p + + return mlx5e_devlink_get_dl_port(priv); + } ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0088-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0088-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..c9366a6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0088-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,58 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/fs.h + +Change-Id: Ieaa56b510a910bc02a74ae6eeb82717427f9605c +--- + drivers/net/ethernet/mellanox/mlx5/core/en/fs.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +@@ -6,6 +6,7 @@ + + #include "mod_hdr.h" + #include "lib/fs_ttc.h" ++#include + + struct mlx5e_post_act; + +@@ -188,6 +189,7 @@ static inline int mlx5e_ethtool_get_rxnf + #endif /* CONFIG_MLX5_EN_RXNFC */ + + #ifdef CONFIG_MLX5_EN_ARFS ++#ifndef HAVE_NET_FLOW_KEYS_H + struct mlx5e_arfs_tables; + + int mlx5e_arfs_create_tables(struct mlx5e_priv *priv); +@@ -202,6 +204,7 @@ static inline void mlx5e_arfs_destroy_ta + static inline int mlx5e_arfs_enable(struct mlx5e_priv *priv) { return -EOPNOTSUPP; } + static inline int mlx5e_arfs_disable(struct mlx5e_priv *priv) { return -EOPNOTSUPP; } + #endif ++#endif + + #ifdef CONFIG_MLX5_EN_TLS + struct mlx5e_accel_fs_tcp; +@@ -224,8 +227,10 @@ struct mlx5e_flow_steering { + struct mlx5_ttc_table *ttc; + struct mlx5_ttc_table *inner_ttc; + #ifdef CONFIG_MLX5_EN_ARFS ++#ifndef HAVE_NET_FLOW_KEYS_H + struct mlx5e_arfs_tables *arfs; + #endif ++#endif + #ifdef CONFIG_MLX5_EN_TLS + struct mlx5e_accel_fs_tcp *accel_tcp; + #endif +@@ -251,10 +256,12 @@ void mlx5e_destroy_flow_steering(struct + int mlx5e_fs_init(struct mlx5e_priv *priv); + void mlx5e_fs_cleanup(struct mlx5e_priv *priv); + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + int mlx5e_add_vlan_trap(struct mlx5e_priv *priv, int trap_id, int tir_num); + void mlx5e_remove_vlan_trap(struct mlx5e_priv *priv); + int mlx5e_add_mac_trap(struct mlx5e_priv *priv, int trap_id, int tir_num); + void mlx5e_remove_mac_trap(struct mlx5e_priv *priv); ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + + #endif /* __MLX5E_FLOW_STEER_H__ */ + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0089-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0089-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..b21cb55 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0089-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,98 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/health.c + +Change-Id: I6c78bae61517f0a8c65de7357947e32a1313ad33 +--- + .../ethernet/mellanox/mlx5/core/en/health.c | 32 ++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c +@@ -5,6 +5,7 @@ + #include "lib/eq.h" + #include "lib/mlx5.h" + ++#ifdef 
HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + int mlx5e_health_fmsg_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name) + { + int err; +@@ -134,6 +135,7 @@ int mlx5e_health_eq_diag_fmsg(struct mlx + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); + } ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + void mlx5e_health_create_reporters(struct mlx5e_priv *priv) + { +@@ -149,12 +151,14 @@ void mlx5e_health_destroy_reporters(stru + + void mlx5e_health_channels_update(struct mlx5e_priv *priv) + { ++#ifdef HAVE_DEVLINK_HEALTH_REPORTER_STATE_UPDATE + if (priv->tx_reporter) + devlink_health_reporter_state_update(priv->tx_reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); + if (priv->rx_reporter) + devlink_health_reporter_state_update(priv->rx_reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); ++#endif /* HAVE_DEVLINK_HEALTH_REPORTER_STATE_UPDATE */ + } + + int mlx5e_health_sq_to_ready(struct mlx5_core_dev *mdev, struct net_device *dev, u32 sqn) +@@ -231,9 +235,24 @@ int mlx5e_health_report(struct mlx5e_pri + if (!reporter) + return err_ctx->recover(err_ctx->ctx); + +- return devlink_health_report(reporter, err_str, err_ctx); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++ return devlink_health_report(reporter, err_str, err_ctx); ++#else ++ return 0; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + } + ++#ifndef HAVE_DEVLINK_FMSG_BINARY_PUT ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++ static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, ++ u16 value_len) ++ { ++ return -EOPNOTSUPP; ++ } ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ ++#endif ++ ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + #define MLX5_HEALTH_DEVLINK_MAX_SIZE 1024 + static int mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg *fmsg, + const void *value, u32 value_len) +@@ -271,7 +290,11 @@ int mlx5e_health_rsc_fmsg_dump(struct ml + if (!page) + return -ENOMEM; + ++#ifdef HAVE_DEVLINK_FMSG_BINARY_PAIR_NEST_START + err = devlink_fmsg_binary_pair_nest_start(fmsg, "data"); ++#else ++ err = devlink_fmsg_arr_pair_nest_start(fmsg, "data"); ++#endif + if (err) + goto free_page; + +@@ -296,7 +319,12 @@ int mlx5e_health_rsc_fmsg_dump(struct ml + + destroy_cmd: + mlx5_rsc_dump_cmd_destroy(cmd); ++#ifdef HAVE_DEVLINK_FMSG_BINARY_PAIR_NEST_START + end_err = devlink_fmsg_binary_pair_nest_end(fmsg); ++#else ++ end_err = devlink_fmsg_arr_pair_nest_end(fmsg); ++#endif ++ + if (end_err) + err = end_err; + free_page: +@@ -337,3 +365,5 @@ int mlx5e_health_queue_dump(struct mlx5e + + return devlink_fmsg_obj_nest_end(fmsg); + } ++ ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0090-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0090-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..8b3001f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0090-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,45 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/health.h + +Change-Id: Ia7a5ed95f756d3942c668c17fb1475c0aab10a03 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/health.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h +@@ -19,11 +19,13 @@ void mlx5e_reporter_tx_destroy(struct ml + void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq); + int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq 
*sq); + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg); + int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg); + int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg); + int mlx5e_health_fmsg_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name); + int mlx5e_health_fmsg_named_obj_nest_end(struct devlink_fmsg *fmsg); ++#endif + + void mlx5e_reporter_rx_create(struct mlx5e_priv *priv); + void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv); +@@ -37,7 +39,9 @@ void mlx5e_reporter_icosq_resume_recover + + struct mlx5e_err_ctx { + int (*recover)(void *ctx); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + int (*dump)(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, void *ctx); ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + void *ctx; + }; + +@@ -51,8 +55,10 @@ int mlx5e_health_report(struct mlx5e_pri + void mlx5e_health_create_reporters(struct mlx5e_priv *priv); + void mlx5e_health_destroy_reporters(struct mlx5e_priv *priv); + void mlx5e_health_channels_update(struct mlx5e_priv *priv); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + int mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv *priv, struct mlx5_rsc_key *key, + struct devlink_fmsg *fmsg); + int mlx5e_health_queue_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, + int queue_idx, char *lbl); + #endif ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0091-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0091-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..18163a9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0091-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,202 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/params.c + +Change-Id: Ifc5d6c4a8f87311d15cebbb6d9d5bd84dcc1a8c6 +--- + .../ethernet/mellanox/mlx5/core/en/params.c | 61 ++++++++++++++++--- + 1 file changed, 53 insertions(+), 8 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +@@ -7,26 +7,40 @@ + #include "en_accel/en_accel.h" + #include "accel/ipsec.h" + #include "fpga/ipsec.h" ++#include "en_accel/tls.h" + ++#ifdef HAVE_XDP_SUPPORT + static bool mlx5e_rx_is_xdp(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) + { + return params->xdp_prog || xsk; + } ++#endif + + u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) + { +- u16 headroom; ++ u16 headroom = 0; + ++#ifdef HAVE_XSK_BUFF_ALLOC + if (xsk) + return xsk->headroom; +- ++#endif + headroom = NET_IP_ALIGN; +- if (mlx5e_rx_is_xdp(params, xsk)) ++ ++#ifdef HAVE_XDP_SUPPORT ++ if (mlx5e_rx_is_xdp(params, xsk)) { + headroom += XDP_PACKET_HEADROOM; +- else ++#ifndef HAVE_XSK_BUFF_ALLOC ++ if (xsk) ++ headroom += xsk->headroom; ++#endif ++ } else { ++#endif /* HAVE_XDP_SUPPORT */ + headroom += MLX5_RX_HEADROOM; ++#ifdef HAVE_XDP_SUPPORT ++ } ++#endif + + return headroom; + } +@@ -58,8 +72,10 @@ static u32 mlx5e_rx_get_linear_frag_sz(s + * The latter is important, because frames may come in a random order, + * and we will have trouble assemblying a real page of multiple frames. 
+ */ ++#ifdef HAVE_XDP_SUPPORT + if (mlx5e_rx_is_xdp(params, xsk)) + frag_sz = max_t(u32, frag_sz, PAGE_SIZE); ++#endif + + /* Even if we can go with a smaller fragment size, we must not put + * multiple packets into a single frame. +@@ -87,8 +103,12 @@ bool mlx5e_rx_is_linear_skb(struct mlx5e + u32 linear_frag_sz = max(mlx5e_rx_get_linear_frag_sz(params, xsk), + mlx5e_rx_get_linear_frag_sz(params, NULL)); + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ return !IS_HW_LRO(params) && linear_frag_sz <= PAGE_SIZE; ++#else + return params->packet_merge.type == MLX5E_PACKET_MERGE_NONE && + linear_frag_sz <= PAGE_SIZE; ++#endif + } + + static bool mlx5e_verify_rx_mpwqe_strides(struct mlx5_core_dev *mdev, +@@ -320,12 +340,14 @@ int mlx5e_mpwrq_validate_regular(struct + if (mlx5_fpga_is_ipsec_device(mdev)) + return -EOPNOTSUPP; + ++#ifdef HAVE_XDP_SUPPORT + if (params->xdp_prog && !mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL)) + return -EINVAL; ++#endif + + return 0; + } +- ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + int mlx5e_mpwrq_validate_xsk(struct mlx5_core_dev *mdev, struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) + { +@@ -340,6 +362,7 @@ int mlx5e_mpwrq_validate_xsk(struct mlx5 + + return 0; + } ++#endif + + void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +@@ -392,6 +415,9 @@ void mlx5e_build_create_cq_param(struct + .ch_stats = c->stats, + .node = cpu_to_node(c->cpu), + .ix = c->ix, ++#ifndef HAVE_NAPI_STATE_MISSED ++ .ch_flags = &c->flags, ++#endif + }; + } + +@@ -537,11 +563,18 @@ void mlx5e_build_aso_cq_param(struct mlx + + static u8 rq_end_pad_mode(struct mlx5_core_dev *mdev, struct mlx5e_params *params) + { +- bool lro_en = params->packet_merge.type == MLX5E_PACKET_MERGE_LRO; ++#ifdef HAVE_PCIE_RELAXED_ORDERING_ENABLED + bool ro = pcie_relaxed_ordering_enabled(mdev->pdev) && + MLX5_CAP_GEN(mdev, relaxed_ordering_write); +- +- return ro && lro_en ? ++#else ++ bool ro = MLX5_CAP_GEN(mdev, relaxed_ordering_write); ++#endif ++ ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ return ro && IS_HW_LRO(params)? ++#else ++ return ro && (params->packet_merge.type == MLX5E_PACKET_MERGE_LRO) ? ++#endif + MLX5_WQ_END_PAD_MODE_NONE : MLX5_WQ_END_PAD_MODE_ALIGN; + } + +@@ -637,6 +670,8 @@ void mlx5e_build_tx_cq_param(struct mlx5 + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, log_cq_size, params->log_sq_size); ++ if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_TX_CQE_COMPRESS)) ++ MLX5_SET(cqc, cqc, cqe_comp_en, 1); + + mlx5e_build_common_cq_param(mdev, param); + param->cq_period_mode = params->tx_cq_moderation.cq_period_mode; +@@ -772,8 +807,10 @@ static u8 mlx5e_build_icosq_log_wq_sz(st + * doesn't affect its return value, as long as params->xdp_prog != NULL, + * so we can just multiply by 2. 
+ */ ++#ifdef HAVE_XDP_SUPPORT + if (params->xdp_prog) + wqebbs *= 2; ++#endif + + if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) + wqebbs += mlx5e_shampo_icosq_sz(mdev, params, rqp); +@@ -783,8 +820,10 @@ static u8 mlx5e_build_icosq_log_wq_sz(st + + static u8 mlx5e_build_async_icosq_log_wq_sz(struct mlx5_core_dev *mdev) + { ++#ifdef HAVE_UAPI_LINUX_TLS_H + if (mlx5e_accel_is_ktls_rx(mdev)) + return MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; ++#endif + + return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE; + } +@@ -812,14 +851,17 @@ static void mlx5e_build_async_icosq_para + + mlx5e_build_sq_param_common(mdev, param); + param->stop_room = mlx5e_stop_room_for_wqe(mdev, 1); /* for XSK NOP */ ++#ifdef HAVE_UAPI_LINUX_TLS_H + param->is_tls = mlx5e_accel_is_ktls_rx(mdev); + if (param->is_tls) + param->stop_room += mlx5e_stop_room_for_wqe(mdev, 1); /* for TLS RX resync NOP */ ++#endif + MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(mdev, reg_umr_sq)); + MLX5_SET(wq, wq, log_wq_sz, log_wq_size); + mlx5e_build_ico_cq_param(mdev, log_wq_size, ¶m->cqp); + } + ++#ifdef HAVE_XDP_SUPPORT + void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_sq_param *param) +@@ -832,6 +874,7 @@ void mlx5e_build_xdpsq_param(struct mlx5 + param->is_mpw = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_XDP_TX_MPWQE); + mlx5e_build_tx_cq_param(mdev, params, ¶m->cqp); + } ++#endif + + int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, +@@ -849,7 +892,9 @@ int mlx5e_build_channel_param(struct mlx + async_icosq_log_wq_sz = mlx5e_build_async_icosq_log_wq_sz(mdev); + + mlx5e_build_sq_param(mdev, params, &cparam->txq_sq); ++#ifdef HAVE_XDP_SUPPORT + mlx5e_build_xdpsq_param(mdev, params, &cparam->xdp_sq); ++#endif + mlx5e_build_icosq_param(mdev, icosq_log_wq_sz, &cparam->icosq); + mlx5e_build_async_icosq_param(mdev, async_icosq_log_wq_sz, &cparam->async_icosq); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0092-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0092-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..2a64d00 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0092-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,39 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c + +Change-Id: I68792188369355fce76ff40844e600b15cd734e4 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +@@ -31,6 +31,7 @@ + */ + #include "port_buffer.h" + ++#ifdef CONFIG_MLX5_CORE_EN_DCB + int mlx5e_port_query_buffer(struct mlx5e_priv *priv, + struct mlx5e_port_buffer *port_buffer) + { +@@ -279,7 +280,14 @@ int mlx5e_port_manual_buffer_config(stru + int i; + + mlx5e_dbg(HW, priv, "%s: change=%x\n", __func__, change); ++#if defined(HAVE_NET_DEVICE_MIN_MAX_MTU) + max_mtu = max_t(unsigned int, priv->netdev->max_mtu, MINIMUM_MAX_MTU); ++#elif defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ max_mtu = max_t(unsigned int, priv->netdev->extended->min_mtu, MINIMUM_MAX_MTU); ++#else ++ max_mtu = mtu; ++#endif ++ + + err = mlx5e_port_query_buffer(priv, &port_buffer); + if (err) +@@ -360,3 +368,4 @@ int mlx5e_port_manual_buffer_config(stru + + return err; + } ++#endif diff --git 
a/src/mlnx-ofa_kernel-5.8/backports/0093-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0093-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..72c1b65 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0093-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,19 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h + +Change-Id: I6006043fe5c14b7c932ed4a5faacddc18ba4c0fe +--- + drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h +@@ -32,6 +32,7 @@ + #ifndef __MLX5_EN_PORT_BUFFER_H__ + #define __MLX5_EN_PORT_BUFFER_H__ + ++#include + #include "en.h" + #include "port.h" + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0094-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0094-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..9294468 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0094-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,136 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c + +Change-Id: I8724cbabce535e831c07f3a808fd50bf03520e31 +--- + .../net/ethernet/mellanox/mlx5/core/en/ptp.c | 53 ++++++++++++++++++- + 1 file changed, 51 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +@@ -39,10 +39,19 @@ static void mlx5e_skb_cb_hwtstamp_tx(str + struct mlx5e_ptp_cq_stats *cq_stats) + { + struct skb_shared_hwtstamps hwts = {}; ++#ifndef HAVE_KTIME_UNION_TV64 + ktime_t diff; ++#else ++ s64 diff; ++#endif + ++#ifndef HAVE_KTIME_UNION_TV64 + diff = abs(mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp - + mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp); ++#else ++ diff = abs(mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp.tv64 - ++ mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp.tv64); ++#endif + + /* Maximal allowed diff is 1 / 128 second */ + if (diff > (NSEC_PER_SEC >> 7)) { +@@ -71,8 +80,13 @@ void mlx5e_skb_cb_hwtstamp_handler(struc + /* If both CQEs arrive, check and report the port tstamp, and clear skb cb as + * skb soon to be released. 
+ */ ++#ifndef HAVE_KTIME_UNION_TV64 + if (!mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp || + !mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp) ++#else ++ if (!mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp.tv64 || ++ !mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp.tv64) ++#endif + return; + + mlx5e_skb_cb_hwtstamp_tx(skb, cq_stats); +@@ -119,7 +133,11 @@ static void mlx5e_ptp_handle_ts_cqe(stru + ptpsq->cq_stats->cqe++; + + out: ++#ifdef HAVE_NAPI_CONSUME_SKB + napi_consume_skb(skb, budget); ++#else ++ dev_kfree_skb(skb); ++#endif + } + + static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) +@@ -162,6 +180,9 @@ static int mlx5e_ptp_napi_poll(struct na + rcu_read_lock(); + + ch_stats->poll++; ++#ifndef HAVE_NAPI_STATE_MISSED ++ clear_bit(MLX5E_CHANNEL_NAPI_SCHED, &c->flags); ++#endif + + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) { + for (i = 0; i < c->num_tc; i++) { +@@ -183,8 +204,18 @@ static int mlx5e_ptp_napi_poll(struct na + goto out; + } + ++#ifdef HAVE_NAPI_STATE_MISSED + if (unlikely(!napi_complete_done(napi, work_done))) + goto out; ++#else ++ napi_complete_done(napi, work_done); ++ ++ /* avoid losing completion event during/after polling cqs */ ++ if (test_bit(MLX5E_CHANNEL_NAPI_SCHED, &c->flags)) { ++ napi_schedule(napi); ++ goto out; ++ } ++#endif + + ch_stats->arm++; + +@@ -398,6 +429,9 @@ static int mlx5e_ptp_open_tx_cqs(struct + ccp.ch_stats = c->stats; + ccp.napi = &c->napi; + ccp.ix = MLX5E_PTP_CHANNEL_IX; ++#ifndef HAVE_NAPI_STATE_MISSED ++ ccp.ch_flags = &c->flags; ++#endif + + cq_param = &cparams->txq_sq_param.cqp; + +@@ -455,7 +489,16 @@ static int mlx5e_init_ptp_rq(struct mlx5 + if (err) + return err; + +- return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, 0); ++#ifdef HAVE_XDP_SUPPORT ++#ifdef HAVE_XDP_RXQ_INFO ++#ifdef HAVE_XDP_RXQ_INFO_REG_4_PARAMS ++ err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, 0); ++#else ++ err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix); ++#endif ++#endif ++#endif /* HAVE_XDP_SUPPORT */ ++ return err; + } + + static int mlx5e_ptp_open_rq(struct mlx5e_ptp *c, +@@ -471,7 +514,9 @@ static int mlx5e_ptp_open_rq(struct mlx5 + ccp.ch_stats = c->stats; + ccp.napi = &c->napi; + ccp.ix = MLX5E_PTP_CHANNEL_IX; +- ++#ifndef HAVE_NAPI_STATE_MISSED ++ ccp.ch_flags = &c->flags; ++#endif + err = mlx5e_init_ptp_rq(c, params, &c->rq); + if (err) + return err; +@@ -516,7 +561,11 @@ static void mlx5e_ptp_build_rq_param(str + + params->rq_wq_type = MLX5_WQ_TYPE_CYCLIC; + mlx5e_init_rq_type_params(mdev, params); ++#ifdef HAVE_NET_DEVICE_MIN_MAX_MTU + params->sw_mtu = netdev->max_mtu; ++#elif defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ params->sw_mtu = netdev->extended->max_mtu; ++#endif + mlx5e_build_rq_param(mdev, params, NULL, q_counter, rq_params); + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0095-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0095-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..2085384 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0095-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,57 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h + +Change-Id: I883421b87de3661b098a8e52da3324ca0f2f57e5 +--- + .../net/ethernet/mellanox/mlx5/core/en/ptp.h | 22 ++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +@@ 
-40,6 +40,9 @@ struct mlx5e_ptp { + + /* data path - accessed per napi poll */ + struct mlx5e_ch_stats *stats; ++#ifndef HAVE_NAPI_STATE_MISSED ++ unsigned long flags; ++#endif + + /* control */ + struct mlx5e_priv *priv; +@@ -50,14 +53,26 @@ struct mlx5e_ptp { + + static inline bool mlx5e_use_ptpsq(struct sk_buff *skb) + { ++#if defined(HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_3_PARAMS) || defined(HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_2_PARAMS) + struct flow_keys fk; ++#endif + + if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + return false; + +- if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) +- return false; ++#ifdef HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_3_PARAMS ++ if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) ++#elif defined(HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_2_PARAMS) ++ if (!skb_flow_dissect_flow_keys(skb, &fk)) ++#endif ++#ifdef HAVE_PTP_CLASSIFY_RAW ++ return unlikely(vlan_get_protocol(skb) == htons(ETH_P_1588) || ++ ptp_classify_raw(skb) != PTP_CLASS_NONE); ++#else ++ return unlikely(vlan_get_protocol(skb) == htons(ETH_P_1588)); ++#endif + ++#if defined(HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_3_PARAMS) || defined(HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_2_PARAMS) + if (fk.basic.n_proto == htons(ETH_P_1588)) + return true; + +@@ -67,6 +82,7 @@ static inline bool mlx5e_use_ptpsq(struc + + return (fk.basic.ip_proto == IPPROTO_UDP && + fk.ports.dst == htons(PTP_EV_PORT)); ++#endif + } + + static inline bool mlx5e_ptpsq_fifo_has_room(struct mlx5e_txqsq *sq) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0096-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0096-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..c4eb88a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0096-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,84 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/qos.c + +Change-Id: I9061aac2497f2640e56edfa873dd18d5785740f7 +--- + .../net/ethernet/mellanox/mlx5/core/en/qos.c | 25 ++++++++++++++++++- + 1 file changed, 24 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +@@ -5,6 +5,7 @@ + #include "en.h" + #include "params.h" + #include "../qos.h" ++#include + + #define BYTES_IN_MBIT 125000 + +@@ -497,7 +498,7 @@ static void mlx5e_qos_deactivate_all_que + int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls, + struct netlink_ext_ack *extack) + { +- struct mlx5e_select_queue_params *selq; ++ struct mlx5e_select_queue_params *selq = NULL; + struct mlx5e_qos_node *root; + bool opened; + int err; +@@ -808,8 +809,13 @@ static void mlx5e_reset_qdisc(struct net + spin_unlock_bh(qdisc_lock(qdisc)); + } + ++#ifndef HAVE_TC_HTB_COMMAND_HAS_MOVED_QID + int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 *classid, + struct netlink_ext_ack *extack) ++#else ++int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid, ++ u16 *new_qid, struct netlink_ext_ack *extack) ++#endif + { + struct mlx5e_qos_node *node; + struct netdev_queue *txq; +@@ -817,9 +823,17 @@ int mlx5e_htb_leaf_del(struct mlx5e_priv + bool opened; + int err; + ++#ifndef HAVE_TC_HTB_COMMAND_HAS_MOVED_QID /*will be base code next rebase*/ + qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL classid %04x\n", *classid); + + node = mlx5e_sw_node_find(priv, *classid); ++#else ++ qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL classid %04x\n", classid); ++ ++ *old_qid = 
*new_qid = 0; ++ ++ node = mlx5e_sw_node_find(priv, classid); ++#endif + if (!node) + return -ENOENT; + +@@ -837,7 +851,11 @@ int mlx5e_htb_leaf_del(struct mlx5e_priv + err = mlx5_qos_destroy_node(priv->mdev, node->hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", ++#ifndef HAVE_TC_HTB_COMMAND_HAS_MOVED_QID + node->hw_id, *classid, err); ++#else ++ node->hw_id, classid, err); ++#endif + + mlx5e_sw_node_delete(priv, node); + +@@ -899,7 +917,12 @@ int mlx5e_htb_leaf_del(struct mlx5e_priv + if (opened) + mlx5e_reactivate_qos_sq(priv, moved_qid, txq); + ++#ifndef HAVE_TC_HTB_COMMAND_HAS_MOVED_QID + *classid = node->classid; ++#else ++ *old_qid = mlx5e_qid_from_qos(&priv->channels, moved_qid); ++ *new_qid = mlx5e_qid_from_qos(&priv->channels, qid); ++#endif + return 0; + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0097-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0097-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..f0330ba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0097-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,68 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c + +Change-Id: Ie0501083e42f564e285aa9311c4fac5739d1d26f +--- + .../net/ethernet/mellanox/mlx5/core/en/rep/bond.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c +@@ -3,8 +3,10 @@ + + #include + #include ++#include ++#ifdef HAVE_NET_LAG_H + #include +- ++#endif + #include "mlx5_core.h" + #include "eswitch.h" + #include "esw/acl/ofld.h" +@@ -193,9 +195,11 @@ static void mlx5e_rep_changelowerstate_e + struct mlx5e_rep_priv *rpriv; + struct net_device *lag_dev; + struct mlx5e_priv *priv; ++#ifdef HAVE_NETDEV_FOR_EACH_LOWER_DEV + struct list_head *iter; + struct net_device *dev; + u16 acl_vport_num; ++#endif + u16 fwd_vport_num; + int err; + +@@ -212,9 +216,12 @@ static void mlx5e_rep_changelowerstate_e + if (!lag_dev) + return; + ++#ifdef HAVE_NET_LAG_PORT_DEV_TXABLE + netdev_dbg(netdev, "lag_dev(%s)'s slave vport(%d) is txable(%d)\n", + lag_dev->name, fwd_vport_num, net_lag_port_dev_txable(netdev)); ++#endif + ++#ifdef HAVE_NETDEV_FOR_EACH_LOWER_DEV + /* Point everyone's egress acl to the vport of the active representor */ + netdev_for_each_lower_dev(lag_dev, dev, iter) { + priv = netdev_priv(dev); +@@ -235,6 +242,7 @@ static void mlx5e_rep_changelowerstate_e + acl_vport_num, err); + } + } ++#endif + + /* Insert new rx_rule for unique bond_metadata, save it as active vport's + * rx_rule with new destination as active vport's root_ft +@@ -315,6 +323,11 @@ int mlx5e_rep_bond_init(struct mlx5e_rep + if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch)) + goto out; + ++#ifndef HAVE_NETDEV_FOR_EACH_LOWER_DEV ++ netdev_err(netdev, "VF representors bonding is NOT supported\n"); ++ goto out; ++#endif ++ + uplink_priv->bond = kvzalloc(sizeof(*uplink_priv->bond), GFP_KERNEL); + if (!uplink_priv->bond) { + ret = -ENOMEM; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0098-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0098-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..0274869 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0098-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,157 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c + +Change-Id: Ice15adb0af0b57a99e5b6c9a4acd24c5fc8c6cde +--- + .../mellanox/mlx5/core/en/rep/bridge.c | 41 ++++++++++++++++++- + 1 file changed, 40 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c +@@ -83,6 +83,7 @@ mlx5_esw_bridge_rep_vport_num_vhca_id_ge + return dev; + } + ++#ifdef HAVE_SWITCHDEV_PORT_ATTR_SET + static struct net_device * + mlx5_esw_bridge_lower_rep_vport_num_vhca_id_get(struct net_device *dev, struct mlx5_eswitch *esw, + u16 *vport_num, u16 *esw_owner_vhca_id) +@@ -108,6 +109,7 @@ mlx5_esw_bridge_lower_rep_vport_num_vhca + + return NULL; + } ++#endif + + static bool mlx5_esw_bridge_is_local(struct net_device *dev, struct net_device *rep, + struct mlx5_eswitch *esw) +@@ -190,7 +192,10 @@ mlx5_esw_bridge_port_obj_add(struct net_ + const struct switchdev_obj *obj = port_obj_info->obj; + const struct switchdev_obj_port_vlan *vlan; + u16 vport_num, esw_owner_vhca_id; +- int err; ++#ifndef HAVE_STRUCT_SWITCHDEV_OBJ_PORT_VLAN_VID ++ u16 vid = 0; ++#endif ++ int err = 0; + + if (!mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, br_offloads->esw, &vport_num, + &esw_owner_vhca_id)) +@@ -201,8 +206,15 @@ mlx5_esw_bridge_port_obj_add(struct net_ + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); ++#ifdef HAVE_STRUCT_SWITCHDEV_OBJ_PORT_VLAN_VID + err = mlx5_esw_bridge_port_vlan_add(vport_num, esw_owner_vhca_id, vlan->vid, + vlan->flags, br_offloads, extack); ++#else ++ for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { ++ err = mlx5_esw_bridge_port_vlan_add(vport_num, esw_owner_vhca_id, vid, ++ vlan->flags, br_offloads, extack); ++ } ++#endif + break; + default: + return -EOPNOTSUPP; +@@ -218,6 +230,9 @@ mlx5_esw_bridge_port_obj_del(struct net_ + const struct switchdev_obj *obj = port_obj_info->obj; + const struct switchdev_obj_port_vlan *vlan; + u16 vport_num, esw_owner_vhca_id; ++#ifndef HAVE_STRUCT_SWITCHDEV_OBJ_PORT_VLAN_VID ++ u16 vid = 0; ++#endif + + if (!mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, br_offloads->esw, &vport_num, + &esw_owner_vhca_id)) +@@ -228,7 +243,13 @@ mlx5_esw_bridge_port_obj_del(struct net_ + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); ++#ifdef HAVE_STRUCT_SWITCHDEV_OBJ_PORT_VLAN_VID + mlx5_esw_bridge_port_vlan_del(vport_num, esw_owner_vhca_id, vlan->vid, br_offloads); ++#else ++ for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { ++ mlx5_esw_bridge_port_vlan_del(vport_num, esw_owner_vhca_id, vid, br_offloads); ++ } ++#endif + break; + default: + return -EOPNOTSUPP; +@@ -236,6 +257,7 @@ mlx5_esw_bridge_port_obj_del(struct net_ + return 0; + } + ++#ifdef HAVE_SWITCHDEV_PORT_ATTR_SET + static int + mlx5_esw_bridge_port_obj_attr_set(struct net_device *dev, + struct switchdev_notifier_port_attr_info *port_attr_info, +@@ -254,7 +276,11 @@ mlx5_esw_bridge_port_obj_attr_set(struct + + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: ++#ifdef HAVE_STRUCT_SWITCHDEV_BRPORT_FLAGS + if (attr->u.brport_flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) { ++#else ++ if (attr->u.brport_flags & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) { ++#endif + NL_SET_ERR_MSG_MOD(extack, "Flag is not supported"); + err 
= -EINVAL; + } +@@ -275,6 +301,7 @@ mlx5_esw_bridge_port_obj_attr_set(struct + + return err; + } ++#endif + + static int mlx5_esw_bridge_event_blocking(struct notifier_block *nb, + unsigned long event, void *ptr) +@@ -292,9 +319,11 @@ static int mlx5_esw_bridge_event_blockin + case SWITCHDEV_PORT_OBJ_DEL: + err = mlx5_esw_bridge_port_obj_del(dev, ptr, br_offloads); + break; ++#ifdef HAVE_SWITCHDEV_PORT_ATTR_SET + case SWITCHDEV_PORT_ATTR_SET: + err = mlx5_esw_bridge_port_obj_attr_set(dev, ptr, br_offloads); + break; ++#endif + default: + err = 0; + } +@@ -383,11 +412,13 @@ static int mlx5_esw_bridge_switchdev_eve + u16 vport_num, esw_owner_vhca_id; + struct net_device *upper, *rep; + ++#ifdef HAVE_SWITCHDEV_PORT_ATTR_SET + if (event == SWITCHDEV_PORT_ATTR_SET) { + int err = mlx5_esw_bridge_port_obj_attr_set(dev, ptr, br_offloads); + + return notifier_from_errno(err); + } ++#endif + + upper = netdev_master_upper_dev_get_rcu(dev); + if (!upper) +@@ -491,7 +522,11 @@ void mlx5e_rep_bridge_init(struct mlx5e_ + } + + br_offloads->netdev_nb.notifier_call = mlx5_esw_bridge_switchdev_port_event; ++#ifdef HAVE_UNREGISTER_NETDEVICE_NOTIFIER_NET + err = register_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb); ++#else ++ err = register_netdevice_notifier(&br_offloads->netdev_nb); ++#endif + if (err) { + esw_warn(mdev, "Failed to register bridge offloads netdevice notifier (err=%d)\n", + err); +@@ -526,7 +561,11 @@ void mlx5e_rep_bridge_cleanup(struct mlx + return; + + cancel_delayed_work_sync(&br_offloads->update_work); ++#ifdef HAVE_UNREGISTER_NETDEVICE_NOTIFIER_NET + unregister_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb); ++#else ++ unregister_netdevice_notifier(&br_offloads->netdev_nb); ++#endif + unregister_switchdev_blocking_notifier(&br_offloads->nb_blk); + unregister_switchdev_notifier(&br_offloads->nb); + destroy_workqueue(br_offloads->wq); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0099-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0099-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..2866381 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0099-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,29 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c + +Change-Id: I8482b6fea345f060004e0a9d2a1f4e672e155a50 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include "neigh.h" + #include "tc.h" + #include "en_rep.h" +@@ -150,7 +151,9 @@ static void mlx5e_rep_neigh_update(struc + + neigh_connected = (nud_state & NUD_VALID) && !dead; + ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected); ++#endif + + if (!same_dev) + goto out; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0100-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0100-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..0cb3ac4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0100-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,687 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c + 
+Change-Id: I876994319d1057ee61f423561e60382989cc04d4 +--- + .../ethernet/mellanox/mlx5/core/en/rep/tc.c | 410 +++++++++++++++++- + 1 file changed, 402 insertions(+), 8 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +@@ -26,8 +26,9 @@ + struct mlx5e_rep_indr_block_priv { + struct net_device *netdev; + struct mlx5e_rep_priv *rpriv; ++#if defined( HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + enum flow_block_binder_type binder_type; +- ++#endif + struct list_head list; + }; + +@@ -129,10 +130,92 @@ unlock: + mlx5e_put_flow_list(priv, &flow_list); + } + ++#if defined(HAVE_TC_FLOWER_OFFLOAD) || defined(HAVE_FLOW_CLS_OFFLOAD) + static int ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#if defined( HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv, ++#else ++mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, ++#endif + struct flow_cls_offload *cls_flower, int flags) +-{ ++#else ++mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, ++ u32 handle, ++#ifdef HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ u32 chain_index, ++#endif ++ __be16 proto, ++ struct tc_to_netdev *tc, int flags) ++#endif ++{ ++#if !defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) && !defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++ struct tc_cls_flower_offload *cls_flower = tc->cls_flower; ++#endif ++ ++#ifndef HAVE_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 ++#ifdef HAVE_TC_BLOCK_OFFLOAD ++ if (cls_flower->common.chain_index) ++#else ++ struct mlx5e_priv *priv = netdev_priv(dev); ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++ if (!is_classid_clsact_ingress(cls_flower->common.classid) || ++ cls_flower->common.chain_index) ++#else ++ if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS) || ++#ifdef HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ chain_index) ++#else ++ 0) ++#endif ++#endif ++#endif ++ return -EOPNOTSUPP; ++#endif ++ ++#if defined(HAVE_TC_TO_NETDEV_EGRESS_DEV) || defined(HAVE_TC_CLS_FLOWER_OFFLOAD_EGRESS_DEV) ++#ifndef HAVE_TC_SETUP_CB_EGDEV_REGISTER ++#if !defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) || defined(HAVE_TC_CLS_FLOWER_OFFLOAD_EGRESS_DEV) ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++ if (cls_flower->egress_dev) { ++#else ++ if (tc->egress_dev) { ++#endif ++ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++ struct mlx5e_rep_priv * uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); ++ struct net_device *uplink_dev = uplink_rpriv->netdev; ++ int err; ++#if defined(HAVE_TC_BLOCK_OFFLOAD) && \ ++ (defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || \ ++ defined(HAVE_NDO_SETUP_TC_RH_EXTENDED)) ++ struct net_device *dev = priv->netdev; ++#endif ++ ++ flags = (flags & (~MLX5_TC_FLAG(INGRESS))) | MLX5_TC_FLAG(EGRESS); ++ ++ if (uplink_dev != dev) { ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) ++ err = dev->netdev_ops->ndo_setup_tc(uplink_dev, TC_SETUP_CLSFLOWER, ++ cls_flower); ++#elif defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++ err = dev->netdev_ops->extended.ndo_setup_tc_rh(uplink_dev, ++ TC_SETUP_CLSFLOWER, ++ cls_flower); ++ ++#else ++ err = dev->netdev_ops->ndo_setup_tc(uplink_dev, handle, ++#ifdef HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ chain_index, ++#endif ++ proto, tc); ++#endif ++ return err; ++ } ++ } ++#endif /* !HAVE_NDO_SETUP_TC_RH_EXTENDED || HAVE_TC_CLS_FLOWER_OFFLOAD_EGRESS_DEV */ ++#endif /* 
!HAVE_TC_SETUP_CB_EGDEV_REGISTER */ ++#endif /* HAVE_TC_TO_NETDEV_EGRESS_DEV || HAVE_TC_CLS_FLOWER_OFFLOAD_EGRESS_DEV */ ++ + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return mlx5e_configure_flower(priv->netdev, priv, cls_flower, +@@ -140,14 +223,18 @@ mlx5e_rep_setup_tc_cls_flower(struct mlx + case FLOW_CLS_DESTROY: + return mlx5e_delete_flower(priv->netdev, priv, cls_flower, + flags); ++#ifdef HAVE_TC_CLSFLOWER_STATS + case FLOW_CLS_STATS: + return mlx5e_stats_flower(priv->netdev, priv, cls_flower, + flags); ++#endif + default: + return -EOPNOTSUPP; + } + } ++#endif /* defined(HAVE_TC_FLOWER_OFFLOAD) */ + ++#ifdef HAVE_TC_CLSMATCHALL_STATS + static + int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *ma) +@@ -164,6 +251,9 @@ int mlx5e_rep_setup_tc_cls_matchall(stru + return -EOPNOTSUPP; + } + } ++#endif /* HAVE_TC_CLSMATCHALL_STATS */ ++ ++#if defined(HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_CLS_OFFLOAD) + + static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +@@ -177,13 +267,76 @@ static int mlx5e_rep_setup_tc_cb(enum tc + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags); ++#ifdef HAVE_TC_CLSMATCHALL_STATS + case TC_SETUP_CLSMATCHALL: + return mlx5e_rep_setup_tc_cls_matchall(priv, type_data); ++#endif ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++#ifdef HAVE_FLOW_CLS_OFFLOAD ++static LIST_HEAD(mlx5e_rep_block_cb_list); ++#endif ++ ++#ifndef HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE ++static int mlx5e_rep_setup_tc_block(struct net_device *dev, ++ struct tc_block_offload *f) ++{ ++ struct mlx5e_priv *priv = netdev_priv(dev); ++#ifdef HAVE_FLOW_CLS_OFFLOAD ++ struct flow_block_cb *block_cb; ++#endif ++ ++ if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) ++ return -EOPNOTSUPP; ++ ++#ifdef HAVE_FLOW_CLS_OFFLOAD ++ f->driver_block_list = &mlx5e_rep_block_cb_list; ++#endif ++ ++ switch (f->command) { ++ case TC_BLOCK_BIND: ++#ifdef HAVE_FLOW_CLS_OFFLOAD ++ block_cb = flow_block_cb_alloc(mlx5e_rep_setup_tc_cb, priv, priv, NULL); ++#else ++ return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb, ++#ifdef HAVE_TC_BLOCK_OFFLOAD_EXTACK ++ priv, priv, f->extack); ++#else ++ ++ priv, priv); ++#endif ++#endif /* HAVE_FLOW_CLS_OFFLOAD */ ++#ifdef HAVE_FLOW_CLS_OFFLOAD ++ if (IS_ERR(block_cb)) { ++ return -ENOENT; ++ } ++ flow_block_cb_add(block_cb, f); ++ list_add_tail(&block_cb->driver_list, f->driver_block_list); ++ return 0; ++#endif ++ case TC_BLOCK_UNBIND: ++#ifndef HAVE_FLOW_CLS_OFFLOAD ++ tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv); ++#else ++ block_cb = flow_block_cb_lookup(f->block, mlx5e_rep_setup_tc_cb, priv); ++ if (!block_cb) ++ return -ENOENT; ++ ++ flow_block_cb_remove(block_cb, f); ++ list_del(&block_cb->driver_list); ++#endif ++ return 0; + default: + return -EOPNOTSUPP; + } + } ++#endif /* HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE */ ++#endif /* HAVE_TC_BLOCK_OFFLOAD */ + ++#ifdef HAVE_TC_SETUP_FT + static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) + { +@@ -228,32 +381,108 @@ static int mlx5e_rep_setup_ft_cb(enum tc + return -EOPNOTSUPP; + } + } ++#endif + ++#if defined(HAVE_TC_FLOWER_OFFLOAD) || defined(HAVE_FLOW_CLS_OFFLOAD) + static LIST_HEAD(mlx5e_rep_block_tc_cb_list); + static LIST_HEAD(mlx5e_rep_block_ft_cb_list); ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) + int mlx5e_rep_setup_tc(struct net_device *dev, enum 
tc_setup_type type, + void *type_data) ++#else ++int mlx5e_rep_setup_tc(struct net_device *dev, u32 handle, ++#ifdef HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ u32 chain_index, __be16 proto, ++#else ++ __be16 proto, ++#endif ++ struct tc_to_netdev *tc) ++#endif + { ++#ifdef HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE + struct mlx5e_priv *priv = netdev_priv(dev); +- struct flow_block_offload *f = type_data; ++#endif + ++#if !defined(HAVE_TC_BLOCK_OFFLOAD) && ! defined(HAVE_FLOW_BLOCK_OFFLOAD) ++ unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD); ++#endif ++#ifdef HAVE_UNLOCKED_DRIVER_CB ++ struct flow_block_offload *f = type_data; + f->unlocked_driver_cb = true; ++#endif + ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) + switch (type) { ++#else ++ switch (tc->type) { ++#endif ++#if defined(HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + case TC_SETUP_BLOCK: ++#ifdef HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE + return flow_block_cb_setup_simple(type_data, + &mlx5e_rep_block_tc_cb_list, + mlx5e_rep_setup_tc_cb, + priv, priv, true); ++#else ++ return mlx5e_rep_setup_tc_block(dev, type_data); ++#endif /* HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE */ ++#else /* HAVE_TC_BLOCK_OFFLOAD || HAVE_FLOW_BLOCK_OFFLOAD */ ++ case TC_SETUP_CLSFLOWER: ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++ return mlx5e_rep_setup_tc_cls_flower(dev, type_data, flags); ++#else ++ return mlx5e_rep_setup_tc_cls_flower(dev, handle, ++#ifdef HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ chain_index, ++#endif /* HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX */ ++ proto, tc, flags); ++#endif /* HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE || HAVE_NDO_SETUP_TC_RH_EXTENDED */ ++#endif /* HAVE_TC_BLOCK_OFFLOAD || HAVE_FLOW_BLOCK_OFFLOAD */ ++#ifdef HAVE_TC_SETUP_FT + case TC_SETUP_FT: + return flow_block_cb_setup_simple(type_data, + &mlx5e_rep_block_ft_cb_list, + mlx5e_rep_setup_ft_cb, + priv, priv, true); ++#endif /* HAVE_TC_SETUP_FT */ ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++#endif ++ ++#ifdef HAVE_TC_SETUP_CB_EGDEV_REGISTER ++#ifdef HAVE_TC_BLOCK_OFFLOAD ++int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data, ++ void *cb_priv) ++{ ++ unsigned long flags = MLX5_TC_FLAG(EGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD); ++ struct mlx5e_priv *priv = cb_priv; ++ ++#ifdef HAVE_TC_INDR_API ++ /* some rhel kernels have indirect offload and egdev, ++ * so dont use egdev. e.g. 
rhel8.0 ++ */ ++ return -EOPNOTSUPP; ++#endif ++ ++ switch (type) { ++ case TC_SETUP_CLSFLOWER: ++ return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags); + default: + return -EOPNOTSUPP; + } + } ++#else ++int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, ++ void *cb_priv) ++{ ++ struct net_device *dev = cb_priv; ++ ++ return mlx5e_setup_tc(dev, type, type_data); ++} ++#endif ++#endif + + int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv) + { +@@ -299,6 +528,7 @@ int mlx5e_rep_tc_event_port_affinity(str + return NOTIFY_OK; + } + ++#if defined( HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + static struct mlx5e_rep_indr_block_priv * + mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv, + struct net_device *netdev, +@@ -364,6 +594,7 @@ static int mlx5e_rep_indr_setup_tc_cb(en + } + } + ++#ifdef HAVE_TC_SETUP_FT + static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type, + void *type_data, void *indr_priv) + { +@@ -409,7 +640,9 @@ static int mlx5e_rep_indr_setup_ft_cb(en + return -EOPNOTSUPP; + } + } ++#endif + ++#ifdef HAVE_FLOW_BLOCK_CB_ALLOC + static void mlx5e_rep_indr_block_unbind(void *cb_priv) + { + struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv; +@@ -417,6 +650,7 @@ static void mlx5e_rep_indr_block_unbind( + list_del(&indr_priv->list); + kfree(indr_priv); + } ++#endif + + static LIST_HEAD(mlx5e_block_cb_list); + +@@ -438,8 +672,12 @@ mlx5e_rep_indr_setup_block(struct net_de + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + bool is_ovs_int_port = netif_is_ovs_master(netdev); +- struct mlx5e_rep_indr_block_priv *indr_priv; ++ struct mlx5e_rep_indr_block_priv *indr_priv = NULL; ++#ifdef HAVE_FLOW_BLOCK_CB_ALLOC + struct flow_block_cb *block_cb; ++#else ++ int err = 0; ++#endif + + if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && + !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev) && +@@ -462,8 +700,12 @@ mlx5e_rep_indr_setup_block(struct net_de + if (is_ovs_int_port && !mlx5e_tc_int_port_supported(esw)) + return -EOPNOTSUPP; + ++#ifdef HAVE_UNLOCKED_DRIVER_CB + f->unlocked_driver_cb = true; ++#endif ++#ifdef HAVE_FLOW_BLOCK_OFFLOAD + f->driver_block_list = &mlx5e_block_cb_list; ++#endif + + switch (f->command) { + case FLOW_BLOCK_BIND: +@@ -481,10 +723,16 @@ mlx5e_rep_indr_setup_block(struct net_de + list_add(&indr_priv->list, + &rpriv->uplink_priv.tc_indr_block_priv_list); + ++#ifdef HAVE_FLOW_BLOCK_CB_ALLOC ++#ifdef HAVE_FLOW_INDR_BLOCK_CB_ALLOC + block_cb = flow_indr_block_cb_alloc(setup_cb, indr_priv, indr_priv, + mlx5e_rep_indr_block_unbind, + f, netdev, sch, data, rpriv, + cleanup); ++#else ++ block_cb = flow_block_cb_alloc(setup_cb, indr_priv, indr_priv, ++ mlx5e_rep_indr_block_unbind); ++#endif + if (IS_ERR(block_cb)) { + list_del(&indr_priv->list); + kfree(indr_priv); +@@ -494,17 +742,42 @@ mlx5e_rep_indr_setup_block(struct net_de + list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list); + + return 0; ++#else ++ err = tcf_block_cb_register(f->block, ++ mlx5e_rep_indr_setup_tc_cb, ++ indr_priv, indr_priv ++#ifdef HAVE_TC_BLOCK_OFFLOAD_EXTACK ++ , f->extack ++#endif ++ ); ++ if (err) { ++ list_del(&indr_priv->list); ++ kfree(indr_priv); ++ } ++ ++ return err; ++#endif ++ + case FLOW_BLOCK_UNBIND: + indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev, f->binder_type); + if (!indr_priv) + return -ENOENT; + ++#ifdef HAVE_FLOW_BLOCK_CB_ALLOC + block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv); + if (!block_cb) + 
return -ENOENT; + + flow_indr_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); ++#else ++ tcf_block_cb_unregister(f->block, ++ mlx5e_rep_indr_setup_tc_cb, ++ indr_priv); ++ list_del(&indr_priv->list); ++ kfree(indr_priv); ++#endif ++ + return 0; + default: + return -EOPNOTSUPP; +@@ -512,6 +785,7 @@ mlx5e_rep_indr_setup_block(struct net_de + return 0; + } + ++#ifdef HAVE_FLOW_OFFLOAD_ACTION + static int + mlx5e_rep_indr_replace_act(struct mlx5e_rep_priv *rpriv, + struct flow_offload_action *fl_act) +@@ -609,6 +883,7 @@ mlx5e_rep_indr_setup_act(struct mlx5e_re + return -EOPNOTSUPP; + } + } ++#endif /* HAVE_FLOW_OFFLOAD_ACTION */ + + static int + mlx5e_rep_indr_no_dev_setup(struct mlx5e_rep_priv *rpriv, +@@ -619,19 +894,36 @@ mlx5e_rep_indr_no_dev_setup(struct mlx5e + return -EOPNOTSUPP; + + switch (type) { ++#ifdef HAVE_FLOW_OFFLOAD_ACTION + case TC_SETUP_ACT: + return mlx5e_rep_indr_setup_act(rpriv, data); ++#endif + default: + return -EOPNOTSUPP; + } + } + + static ++#ifdef HAVE_FLOW_INDR_BLOCK_BIND_CB_T_7_PARAMS + int mlx5e_rep_indr_setup_cb(struct net_device *netdev, struct Qdisc *sch, void *cb_priv, +- enum tc_setup_type type, void *type_data, +- void *data, +- void (*cleanup)(struct flow_block_cb *block_cb)) +-{ ++#else ++int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv, ++#endif ++ enum tc_setup_type type, void *type_data ++#if !defined(HAVE_FLOW_INDR_BLOCK_BIND_CB_T_4_PARAMS) && defined(HAVE_FLOW_INDR_DEV_REGISTER) ++ , void *data, ++ void (*cleanup)(struct flow_block_cb *block_cb) ++#endif ++ ) ++{ ++#ifndef HAVE_FLOW_INDR_BLOCK_BIND_CB_T_7_PARAMS ++ struct Qdisc *sch = NULL; ++#endif ++#if defined(HAVE_FLOW_INDR_BLOCK_BIND_CB_T_4_PARAMS) || !defined(HAVE_FLOW_INDR_DEV_REGISTER) ++ void *data = NULL; ++ void *cleanup = NULL; ++#endif ++ + if (!netdev) + return mlx5e_rep_indr_no_dev_setup(cb_priv, type, data); + +@@ -640,15 +932,78 @@ int mlx5e_rep_indr_setup_cb(struct net_d + return mlx5e_rep_indr_setup_block(netdev, sch, cb_priv, type_data, + mlx5e_rep_indr_setup_tc_cb, + data, cleanup); ++#ifdef HAVE_TC_SETUP_FT + case TC_SETUP_FT: + return mlx5e_rep_indr_setup_block(netdev, sch, cb_priv, type_data, + mlx5e_rep_indr_setup_ft_cb, + data, cleanup); ++#endif + default: + return -EOPNOTSUPP; + } + } + ++#ifndef HAVE_FLOW_INDR_DEV_REGISTER ++static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv, ++ struct net_device *netdev) ++{ ++ int err; ++ ++ err = __flow_indr_block_cb_register(netdev, rpriv, ++ mlx5e_rep_indr_setup_cb, ++ rpriv); ++ if (err) { ++ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); ++ ++ mlx5_core_err(priv->mdev, "Failed to register remote block notifier for %s err=%d\n", ++ netdev_name(netdev), err); ++ } ++ return err; ++} ++ ++static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv, ++ struct net_device *netdev) ++{ ++ __flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_cb, ++ rpriv); ++} ++ ++void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv) ++{ ++ struct mlx5e_rep_indr_block_priv *cb_priv, *temp; ++ struct list_head *head = &rpriv->uplink_priv.tc_indr_block_priv_list; ++ ++ list_for_each_entry_safe(cb_priv, temp, head, list) { ++ mlx5e_rep_indr_unregister_block(rpriv, cb_priv->netdev); ++ kfree(cb_priv); ++ } ++} ++ ++static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb, ++ unsigned long event, void *ptr) ++{ ++ struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, ++ uplink_priv.netdevice_nb); ++ struct mlx5e_priv *priv = 
netdev_priv(rpriv->netdev); ++ struct net_device *netdev = netdev_notifier_info_to_dev(ptr); ++ ++ if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && ++ !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev) && ++ !netif_is_ovs_master(netdev)) ++ return NOTIFY_OK; ++ ++ switch (event) { ++ case NETDEV_REGISTER: ++ mlx5e_rep_indr_register_block(rpriv, netdev); ++ break; ++ case NETDEV_UNREGISTER: ++ mlx5e_rep_indr_unregister_block(rpriv, netdev); ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif /* HAVE_FLOW_INDR_DEV_REGISTER */ ++ + int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) + { + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; +@@ -656,15 +1011,36 @@ int mlx5e_rep_tc_netdevice_event_registe + /* init indirect block notifications */ + INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list); + ++#ifdef HAVE_FLOW_INDR_DEV_REGISTER + return flow_indr_dev_register(mlx5e_rep_indr_setup_cb, rpriv); ++#else ++ uplink_priv->netdevice_nb.notifier_call = mlx5e_nic_rep_netdevice_event; ++ return register_netdevice_notifier_dev_net(rpriv->netdev, ++ &uplink_priv->netdevice_nb, ++ &uplink_priv->netdevice_nn); ++#endif + } + + void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) + { ++#ifndef HAVE_FLOW_INDR_DEV_REGISTER ++ struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; ++ ++ /* clean indirect TC block notifications */ ++ unregister_netdevice_notifier_dev_net(rpriv->netdev, ++ &uplink_priv->netdevice_nb, ++ &uplink_priv->netdevice_nn); ++#else + flow_indr_dev_unregister(mlx5e_rep_indr_setup_cb, rpriv, ++#ifdef HAVE_FLOW_INDR_DEV_UNREGISTER_FLOW_SETUP_CB_T ++ mlx5e_rep_indr_setup_tc_cb); ++#else + mlx5e_rep_indr_block_unbind); +-} +- ++#endif ++#endif ++ } ++#endif /* HAVE_TC_BLOCK_OFFLOAD || HAVE_FLOW_BLOCK_OFFLOAD */ ++ + static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5e_tc_update_priv *tc_priv, + u32 tunnel_id) +@@ -710,13 +1086,21 @@ static bool mlx5e_restore_tunnel(struct + if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + tun_dst = __ip_tun_set_dst(key.enc_ipv4.src, key.enc_ipv4.dst, + key.enc_ip.tos, key.enc_ip.ttl, ++#ifdef HAVE___IP_TUN_SET_DST_7_PARAMS ++ TUNNEL_KEY, ++#else + key.enc_tp.dst, TUNNEL_KEY, ++#endif + key32_to_tunnel_id(key.enc_key_id.keyid), + enc_opts.key.len); + } else if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + tun_dst = __ipv6_tun_set_dst(&key.enc_ipv6.src, &key.enc_ipv6.dst, + key.enc_ip.tos, key.enc_ip.ttl, ++#ifdef HAVE___IP_TUN_SET_DST_7_PARAMS ++ 0, TUNNEL_KEY, ++#else + key.enc_tp.dst, 0, TUNNEL_KEY, ++#endif + key32_to_tunnel_id(key.enc_key_id.keyid), + enc_opts.key.len); + } else { +@@ -734,10 +1118,16 @@ static bool mlx5e_restore_tunnel(struct + tun_dst->u.tun_info.key.tp_src = key.enc_tp.src; + + if (enc_opts.key.len) ++#ifdef HAVE_IP_TUNNEL_INFO_OPTS_SET_4_PARAMS + ip_tunnel_info_opts_set(&tun_dst->u.tun_info, + enc_opts.key.data, + enc_opts.key.len, + enc_opts.key.dst_opt_type); ++#else ++ ip_tunnel_info_opts_set(&tun_dst->u.tun_info, ++ enc_opts.key.data, ++ enc_opts.key.len); ++#endif + + skb_dst_set(skb, (struct dst_entry *)tun_dst); + dev = dev_get_by_index(&init_net, key.filter_ifindex); +@@ -760,7 +1150,9 @@ static bool mlx5e_restore_skb_chain(stru + struct mlx5e_tc_update_priv *tc_priv) + { + struct mlx5e_priv *priv = netdev_priv(skb->dev); ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++#endif + u32 tunnel_id = (reg_c1 >> 
ESW_TUN_OFFSET) & TUNNEL_ID_MASK; + + #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +@@ -786,8 +1178,10 @@ static bool mlx5e_restore_skb_chain(stru + #endif /* CONFIG_NET_TC_SKB_EXT */ + return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id); + ++#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + out_incr_rx_counter: + atomic_inc(&esw->dev->priv.ct_debugfs->stats.rx_dropped); ++#endif + + return false; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0101-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0101-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..13819e9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0101-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,216 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c + +Change-Id: I6b43de15432b5c21f81cf816d69a2cff5ad9836b +--- + .../mellanox/mlx5/core/en/reporter_rx.c | 88 ++++++++++++++----- + 1 file changed, 68 insertions(+), 20 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +@@ -8,6 +8,7 @@ + #include "ptp.h" + #include "lib/tout.h" + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state) + { + int outlen = MLX5_ST_SZ_BYTES(query_rq_out); +@@ -30,6 +31,7 @@ out: + kvfree(out); + return err; + } ++#endif + + static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq) + { +@@ -76,8 +78,10 @@ static int mlx5e_rx_reporter_err_icosq_c + + /* mlx5e_close_rq cancels this work before RQ and ICOSQ are killed. */ + rq = &icosq->channel->rq; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (test_bit(MLX5E_RQ_STATE_ENABLED, &icosq->channel->xskrq.state)) + xskrq = &icosq->channel->xskrq; ++#endif + mdev = icosq->channel->mdev; + dev = icosq->channel->netdev; + err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state); +@@ -197,22 +201,7 @@ static int mlx5e_rx_reporter_timeout_rec + return err; + } + +-static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) +-{ +- return err_ctx->recover(err_ctx->ctx); +-} +- +-static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter, +- void *context, +- struct netlink_ext_ack *extack) +-{ +- struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); +- struct mlx5e_err_ctx *err_ctx = context; +- +- return err_ctx ? mlx5e_rx_reporter_recover_from_ctx(err_ctx) : +- mlx5e_health_recover_channels(priv); +-} +- ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + static int mlx5e_reporter_icosq_diagnose(struct mlx5e_icosq *icosq, u8 hw_state, + struct devlink_fmsg *fmsg) + { +@@ -332,6 +321,25 @@ mlx5e_rx_reporter_build_diagnose_output_ + return 0; + } + ++static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) ++{ ++ return err_ctx->recover(err_ctx->ctx); ++} ++ ++static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter, ++ void *context ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) ++{ ++ struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); ++ struct mlx5e_err_ctx *err_ctx = context; ++ ++ return err_ctx ? 
mlx5e_rx_reporter_recover_from_ctx(err_ctx) : ++ mlx5e_health_recover_channels(priv); ++} ++ + static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq, + struct devlink_fmsg *fmsg) + { +@@ -465,8 +473,11 @@ static int mlx5e_rx_reporter_build_diagn + } + + static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter, +- struct devlink_fmsg *fmsg, +- struct netlink_ext_ack *extack) ++ struct devlink_fmsg *fmsg ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; +@@ -679,8 +690,11 @@ static int mlx5e_rx_reporter_dump_from_c + } + + static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter, +- struct devlink_fmsg *fmsg, void *context, +- struct netlink_ext_ack *extack) ++ struct devlink_fmsg *fmsg, void *context ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_err_ctx *err_ctx = context; +@@ -689,6 +703,8 @@ static int mlx5e_rx_reporter_dump(struct + mlx5e_rx_reporter_dump_all_rqs(priv, fmsg); + } + ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ ++ + void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) + { + char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {}; +@@ -699,7 +715,9 @@ void mlx5e_reporter_rx_timeout(struct ml + + err_ctx.ctx = rq; + err_ctx.recover = mlx5e_rx_reporter_timeout_recover; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + err_ctx.dump = mlx5e_rx_reporter_dump_rq; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + if (icosq) + snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn); +@@ -718,7 +736,9 @@ void mlx5e_reporter_rq_cqe_err(struct ml + + err_ctx.ctx = rq; + err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + err_ctx.dump = mlx5e_rx_reporter_dump_rq; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + snprintf(err_str, sizeof(err_str), "ERR CQE on RQ: 0x%x", rq->rqn); + + mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); +@@ -732,7 +752,9 @@ void mlx5e_reporter_icosq_cqe_err(struct + + err_ctx.ctx = icosq; + err_ctx.recover = mlx5e_rx_reporter_err_icosq_cqe_recover; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + err_ctx.dump = mlx5e_rx_reporter_dump_icosq; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + snprintf(err_str, sizeof(err_str), "ERR CQE on ICOSQ: 0x%x", icosq->sqn); + + mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); +@@ -748,6 +770,7 @@ void mlx5e_reporter_icosq_resume_recover + mutex_unlock(&c->icosq_recovery_lock); + } + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { + .name = "rx", + .recover = mlx5e_rx_reporter_recover, +@@ -756,20 +779,39 @@ static const struct devlink_health_repor + }; + + #define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) + { ++#ifndef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++ priv->rx_reporter = NULL; ++#else ++#ifndef HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE ++ struct devlink *devlink = priv_to_devlink(priv->mdev); ++#endif ++#ifdef HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); ++#endif + struct devlink_health_reporter *reporter; + ++#ifdef 
HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE + reporter = devlink_port_health_reporter_create(dl_port, &mlx5_rx_reporter_ops, + MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); ++#else ++ reporter = devlink_health_reporter_create(devlink, ++ &mlx5_rx_reporter_ops, ++#ifdef HAVE_DEVLINK_HEALTH_REPORTER_CREATE_5_ARGS ++ MLX5E_REPORTER_RX_GRACEFUL_PERIOD, ++#endif ++ true, priv); ++#endif + if (IS_ERR(reporter)) { + netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", + PTR_ERR(reporter)); + return; + } + priv->rx_reporter = reporter; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + } + + void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv) +@@ -777,6 +819,12 @@ void mlx5e_reporter_rx_destroy(struct ml + if (!priv->rx_reporter) + return; + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++#ifdef HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE + devlink_port_health_reporter_destroy(priv->rx_reporter); ++#else ++ devlink_health_reporter_destroy(priv->rx_reporter); ++#endif /* HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE */ ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + priv->rx_reporter = NULL; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0102-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0102-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..c556356 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0102-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,155 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c + +Change-Id: I976687d21f7fe9e5c58b327dc844cd313c542c03 +--- + .../mellanox/mlx5/core/en/reporter_tx.c | 57 ++++++++++++++++--- + 1 file changed, 50 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +@@ -127,6 +127,7 @@ static int mlx5e_tx_reporter_timeout_rec + return err; + } + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + /* state lock cannot be grabbed within this function. + * It can cause a dead lock or a read-after-free. 
+ */ +@@ -136,8 +137,11 @@ static int mlx5e_tx_reporter_recover_fro + } + + static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, +- void *context, +- struct netlink_ext_ack *extack) ++ void *context ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_err_ctx *err_ctx = context; +@@ -352,8 +356,11 @@ out: + } + + static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, +- struct devlink_fmsg *fmsg, +- struct netlink_ext_ack *extack) ++ struct devlink_fmsg *fmsg ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; +@@ -536,8 +543,11 @@ static int mlx5e_tx_reporter_dump_from_c + } + + static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, +- struct devlink_fmsg *fmsg, void *context, +- struct netlink_ext_ack *extack) ++ struct devlink_fmsg *fmsg, void *context ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_err_ctx *err_ctx = context; +@@ -545,6 +555,7 @@ static int mlx5e_tx_reporter_dump(struct + return err_ctx ? mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : + mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); + } ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) + { +@@ -554,7 +565,9 @@ void mlx5e_reporter_tx_err_cqe(struct ml + + err_ctx.ctx = sq; + err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; +- err_ctx.dump = mlx5e_tx_reporter_dump_sq; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++ err_ctx.dump = mlx5e_tx_reporter_dump_sq; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); + + mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); +@@ -570,7 +583,9 @@ int mlx5e_reporter_tx_timeout(struct mlx + to_ctx.sq = sq; + err_ctx.ctx = &to_ctx; + err_ctx.recover = mlx5e_tx_reporter_timeout_recover; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + err_ctx.dump = mlx5e_tx_reporter_timeout_dump; ++#endif + snprintf(err_str, sizeof(err_str), + "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", + sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, +@@ -580,22 +595,43 @@ int mlx5e_reporter_tx_timeout(struct mlx + return to_ctx.status; + } + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { + .name = "tx", + .recover = mlx5e_tx_reporter_recover, + .diagnose = mlx5e_tx_reporter_diagnose, + .dump = mlx5e_tx_reporter_dump, + }; ++#endif + + #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 + + void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) + { ++#ifndef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++ priv->tx_reporter = NULL; ++#else ++#ifdef HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); ++#endif + struct devlink_health_reporter *reporter; + ++#ifndef HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE ++ struct mlx5_core_dev *mdev = priv->mdev; ++ struct devlink *devlink; ++#endif ++#ifdef HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE + reporter = devlink_port_health_reporter_create(dl_port, 
&mlx5_tx_reporter_ops, + MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); ++#else ++ devlink = priv_to_devlink(mdev); ++ reporter = ++ devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, ++#ifdef HAVE_DEVLINK_HEALTH_REPORTER_CREATE_5_ARGS ++ MLX5_REPORTER_TX_GRACEFUL_PERIOD, ++#endif ++ true, priv); ++#endif + if (IS_ERR(reporter)) { + netdev_warn(priv->netdev, + "Failed to create tx reporter, err = %ld\n", +@@ -603,6 +639,7 @@ void mlx5e_reporter_tx_create(struct mlx + return; + } + priv->tx_reporter = reporter; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + } + + void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) +@@ -610,6 +647,12 @@ void mlx5e_reporter_tx_destroy(struct ml + if (!priv->tx_reporter) + return; + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++#ifdef HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE + devlink_port_health_reporter_destroy(priv->tx_reporter); ++#else ++ devlink_health_reporter_destroy(priv->tx_reporter); ++#endif /* HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE */ ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + priv->tx_reporter = NULL; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0103-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0103-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..321be69 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0103-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,48 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c + +Change-Id: Ife9b7993c6fec02368b2c39d7ae7118cab73b3bd +--- + .../ethernet/mellanox/mlx5/core/en/tc/sample.c | 16 +++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c +@@ -324,6 +324,7 @@ sample_restore_put(struct mlx5e_tc_psamp + + void mlx5e_tc_sample_skb(struct sk_buff *skb, struct mlx5_mapped_obj *mapped_obj) + { ++#if defined(HAVE_STRUCT_PSAMPLE_METADATA) + u32 trunc_size = mapped_obj->sample.trunc_size; + struct psample_group psample_group = {}; + struct psample_metadata md = {}; +@@ -335,6 +336,19 @@ void mlx5e_tc_sample_skb(struct sk_buff + skb_push(skb, skb->mac_len); + + psample_sample_packet(&psample_group, skb, mapped_obj->sample.rate, &md); ++#else ++ u32 trunc_size = mapped_obj->sample.trunc_size; ++ struct psample_group psample_group = {}; ++ int iif = skb->dev->ifindex; ++ u32 size; ++ ++ size = trunc_size ? 
min(trunc_size, skb->len) : skb->len; ++ psample_group.group_num = mapped_obj->sample.group_id; ++ psample_group.net = &init_net; ++ skb_push(skb, skb->mac_len); ++ ++ psample_sample_packet(&psample_group, skb, size, iif, 0, mapped_obj->sample.rate); ++#endif + } + + static int +@@ -478,7 +492,7 @@ mlx5e_tc_sample_offload(struct mlx5e_tc_ + struct mlx5_flow_attr *pre_attr; + u32 tunnel_id = attr->tunnel_id; + struct mlx5_eswitch *esw; +- u32 default_tbl_id; ++ u32 default_tbl_id = 0; + u32 obj_id; + int err; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0104-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0104-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..8faf32e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0104-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,20 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h + +Change-Id: I3ec3bd68afda1ff91686fd3a4ff76e6a2ab81023 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h +@@ -4,7 +4,7 @@ + #ifndef __MLX5_EN_TC_SAMPLE_H__ + #define __MLX5_EN_TC_SAMPLE_H__ + +-#include "eswitch.h" ++#include "../../eswitch.h" + + struct mlx5_flow_attr; + struct mlx5e_tc_psample; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0105-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0105-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..bfb486d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0105-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,75 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c + +Change-Id: Ideb51730a74a4a23c43aab43595b8072c9dbfb31 +--- + .../ethernet/mellanox/mlx5/core/en/tc_ct.c | 19 ++++++++++++++----- + 1 file changed, 14 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +@@ -786,7 +786,9 @@ mlx5_tc_ct_entry_create_mod_hdr(struct m + } + + ct_state |= MLX5_CT_STATE_ESTABLISHED_BIT | MLX5_CT_STATE_TRK_BIT; ++#ifdef HAVE_FLOW_ACTION_CT_METADATA_ORIG_DIR + ct_state |= meta->ct_metadata.orig_dir ? 
0 : MLX5_CT_STATE_REPLY_BIT; ++#endif + err = mlx5_tc_ct_entry_set_registers(ct_priv, &mod_acts, + ct_state, + meta->ct_metadata.mark, +@@ -1121,9 +1123,9 @@ err_orig: + + static int + mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, +- struct flow_cls_offload *flow) ++ struct flow_cls_offload1 *flow) + { +- struct flow_rule *flow_rule = flow_cls_offload_flow_rule(flow); ++ struct flow_rule *flow_rule = flow_cls_offload_flow_rule1(flow); + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; + struct flow_action_entry *meta_action; + unsigned long cookie = flow->cookie; +@@ -1224,7 +1226,7 @@ err_set: + + static int + mlx5_tc_ct_block_flow_offload_del(struct mlx5_ct_ft *ft, +- struct flow_cls_offload *flow) ++ struct flow_cls_offload1 *flow) + { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; + unsigned long cookie = flow->cookie; +@@ -1252,7 +1254,7 @@ mlx5_tc_ct_block_flow_offload_del(struct + + static int + mlx5_tc_ct_block_flow_offload_stats(struct mlx5_ct_ft *ft, +- struct flow_cls_offload *f) ++ struct flow_cls_offload1 *f) + { + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; + unsigned long cookie = f->cookie; +@@ -1274,8 +1276,15 @@ mlx5_tc_ct_block_flow_offload_stats(stru + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_fc_query_cached(entry->counter->counter, &bytes, &packets, &lastuse); ++#ifdef HAVE_FLOW_STATS_UPDATE_6_PARAMS + flow_stats_update(&f->stats, bytes, packets, 0, lastuse, + FLOW_ACTION_HW_STATS_DELAYED); ++#elif defined(HAVE_FLOW_STATS_UPDATE_5_PARAMS) ++ flow_stats_update(&f->stats, bytes, packets, lastuse, ++ FLOW_ACTION_HW_STATS_DELAYED); ++#else ++ flow_stats_update(&f->stats, bytes, packets, lastuse); ++#endif + + mlx5_tc_ct_entry_put(entry); + return 0; +@@ -1285,7 +1294,7 @@ static int + mlx5_tc_ct_block_flow_offload(enum tc_setup_type type, void *type_data, + void *cb_priv) + { +- struct flow_cls_offload *f = type_data; ++ struct flow_cls_offload1 *f = type_data; + struct mlx5_ct_ft *ft = cb_priv; + + if (type != TC_SETUP_CLSFLOWER) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0106-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0106-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..9139533 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0106-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,36 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h + +Change-Id: I81ac7dbd60cee74349caa353786e0608582a66b6 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h +@@ -8,13 +8,16 @@ + #include + #include + ++#ifndef CONFIG_COMPAT_CLS_FLOWER_MOD + #include "en.h" ++#endif + + struct mlx5_flow_attr; + struct mlx5e_tc_mod_hdr_acts; + struct mlx5_rep_uplink_priv; + struct mlx5e_tc_flow; + struct mlx5e_priv; ++struct mlx5e_post_act; + + struct mlx5_fs_chains; + struct mlx5_tc_ct_priv; +@@ -147,6 +150,8 @@ mlx5_tc_ct_set_ct_clear_regs(struct mlx5 + + #else /* CONFIG_MLX5_TC_CT */ + ++#include ++ + static inline struct mlx5_tc_ct_priv * + mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, + struct mod_hdr_tbl *mod_hdr, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0107-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch 
b/src/mlnx-ofa_kernel-5.8/backports/0107-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..c9170d4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0107-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,193 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c + +Change-Id: I3b7db6db18fb5549420d904851a18e1907d07050 +--- + .../ethernet/mellanox/mlx5/core/en/tc_tun.c | 55 +++++++++++++++++-- + 1 file changed, 50 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ + /* Copyright (c) 2018 Mellanox Technologies. */ + ++#ifdef HAVE_TCF_TUNNEL_INFO + #include + #include + #include +@@ -37,15 +38,25 @@ static void mlx5e_tc_tun_route_attr_clea + + struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev) + { ++#if !defined(HAVE_TC_INDR_API) && !defined(CONFIG_COMPAT_KERNEL_4_14) ++/* in old kernels with egdev we don't pass the netdev so the filter_dev here ++ * is actually priv->netdev. we only assume and support vxlan */ ++ return &vxlan_tunnel; ++#endif ++ + if (netif_is_vxlan(tunnel_dev)) + return &vxlan_tunnel; ++#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_OPTS + else if (netif_is_geneve(tunnel_dev)) + return &geneve_tunnel; ++#endif + else if (netif_is_gretap(tunnel_dev) || + netif_is_ip6gretap(tunnel_dev)) + return &gre_tunnel; ++#if defined(HAVE_NET_BAREUDP_H) && defined(HAVE_FLOW_DISSECTOR_MPLS_LSE) + else if (netif_is_bareudp(tunnel_dev)) + return &mplsoudp_tunnel; ++#endif + else + return NULL; + } +@@ -85,7 +96,11 @@ static int get_route_and_out_devs(struct + * it's a LAG device, use the uplink + */ + *route_dev = dev; ++#ifdef HAVE_NETDEV_PORT_SAME_PARENT_ID + if (!netdev_port_same_parent_id(priv->netdev, real_dev) || ++#else ++ if (!switchdev_port_same_parent_id(priv->netdev, real_dev) || ++#endif + dst_is_lag_dev || is_vlan_dev(*route_dev) || + netif_is_ovs_master(*route_dev)) + *out_dev = uplink_dev; +@@ -140,7 +155,11 @@ static int mlx5e_route_lookup_ipv4_get(s + goto err_rt_release; + } + +- if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) { ++#ifdef HAVE_RT_GW_FAMILY ++ if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) { ++#else ++ if (mlx5_lag_is_multipath(mdev) && !rt->rt_uses_gateway) { ++#endif + ret = -ENETUNREACH; + goto err_rt_release; + } +@@ -441,24 +460,33 @@ release_neigh: + return err; + } + +-#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + static int mlx5e_route_lookup_ipv6_get(struct mlx5e_priv *priv, + struct net_device *dev, + struct mlx5e_tc_tun_route_attr *attr) + { + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(dev); + struct net_device *route_dev; ++ struct dst_entry *dst = NULL; + struct net_device *out_dev; +- struct dst_entry *dst; + struct neighbour *n; ++#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + int ret; + + if (tunnel && tunnel->get_remote_ifindex) + attr->fl.fl6.flowi6_oif = tunnel->get_remote_ifindex(dev); ++#if defined(HAVE_IPV6_DST_LOOKUP_FLOW) || defined(HAVE_IPV6_DST_LOOKUP_FLOW_ADDR_CONF) + dst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(dev), NULL, &attr->fl.fl6, + NULL); + if (IS_ERR(dst)) + return PTR_ERR(dst); ++#elif defined(HAVE_IPV6_DST_LOOKUP_TAKES_NET) ++ ret = ipv6_stub->ipv6_dst_lookup(dev_net(dev), NULL, &dst, ++ &attr->fl.fl6); ++ if (ret < 0) ++ return ret; ++#endif ++ if (!dst) ++ 
return -EOPNOTSUPP; + + if (!attr->ttl) + attr->ttl = ip6_dst_hoplimit(dst); +@@ -466,6 +494,9 @@ static int mlx5e_route_lookup_ipv6_get(s + ret = get_route_and_out_devs(priv, dst->dev, &route_dev, &out_dev); + if (ret < 0) + goto err_dst_release; ++#else ++ return -EOPNOTSUPP; ++#endif + + dev_hold(route_dev); + n = dst_neigh_lookup(dst, &attr->fl.fl6.daddr); +@@ -707,7 +738,6 @@ release_neigh: + mlx5e_route_lookup_ipv6_put(&attr); + return err; + } +-#endif + + int mlx5e_tc_tun_route_lookup(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, +@@ -809,9 +839,13 @@ int mlx5e_tc_tun_parse(struct net_device + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#else ++ struct netlink_ext_ack *extack = NULL; ++#endif + int err = 0; +- ++ + if (!tunnel) { + netdev_warn(priv->netdev, + "decapsulation offload is not supported for %s net device\n", +@@ -825,6 +859,7 @@ int mlx5e_tc_tun_parse(struct net_device + if (tunnel->parse_udp_ports) { + err = tunnel->parse_udp_ports(priv, spec, f, + headers_c, headers_v); ++ + if (err) + goto out; + } +@@ -832,6 +867,7 @@ int mlx5e_tc_tun_parse(struct net_device + if (tunnel->parse_tunnel) { + err = tunnel->parse_tunnel(priv, spec, f, + headers_c, headers_v); ++ + if (err) + goto out; + } +@@ -945,16 +981,21 @@ int mlx5e_tc_tun_parse_udp_ports(struct + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) ++ + { + struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct flow_match_ports enc_ports; + + /* Full udp dst port must be given */ + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "UDP tunnel decap filter must include enc_dst_port condition"); ++#endif + netdev_warn(priv->netdev, + "UDP tunnel decap filter must include enc_dst_port condition\n"); + return -EOPNOTSUPP; +@@ -964,8 +1005,10 @@ int mlx5e_tc_tun_parse_udp_ports(struct + + if (memchr_inv(&enc_ports.mask->dst, 0xff, + sizeof(enc_ports.mask->dst))) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "UDP tunnel decap filter must match enc_dst_port fully"); ++#endif + netdev_warn(priv->netdev, + "UDP tunnel decap filter must match enc_dst_port fully\n"); + return -EOPNOTSUPP; +@@ -993,3 +1036,5 @@ int mlx5e_tc_tun_parse_udp_ports(struct + + return 0; + } ++ ++#endif /* HAVE_TCF_TUNNEL_INFO */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0108-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0108-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..4ec8348 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0108-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,49 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h + +Change-Id: I83f767cb28e10ecd80f5d90506dced476ac0c5bd +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h +@@ -8,10 +8,12 @@ + #include + #include + #include +-#include "en.h" +-#include "en_rep.h" ++#include "../en.h" ++#include "../en_rep.h" ++#include "../eswitch.h" + 
+ #ifdef CONFIG_MLX5_ESWITCH ++#ifdef HAVE_TCF_TUNNEL_INFO + + enum { + MLX5E_TC_TUNNEL_TYPE_UNKNOWN, +@@ -36,6 +38,7 @@ struct mlx5e_tc_tunnel { + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack); ++ + int (*generate_ip_tun_hdr)(char buf[], + __u8 *ip_proto, + struct mlx5e_encap_entry *e); +@@ -44,6 +47,7 @@ struct mlx5e_tc_tunnel { + struct flow_cls_offload *f, + void *headers_c, + void *headers_v); ++ + int (*parse_tunnel)(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, +@@ -115,6 +119,7 @@ int mlx5e_tc_tun_parse_udp_ports(struct + bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a, + struct mlx5e_encap_key *b); + ++#endif /* HAVE_TCF_TUNNEL_INFO */ + #endif /* CONFIG_MLX5_ESWITCH */ + + #endif //__MLX5_EN_TC_TUNNEL_H__ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0109-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0109-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..bfc6519 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0109-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,291 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c + +Change-Id: I006d73768393f3a6c4fb854bed06586660187ab2 +--- + .../mellanox/mlx5/core/en/tc_tun_encap.c | 53 +++++++++++++++++++ + 1 file changed, 53 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +@@ -1,14 +1,20 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2021 Mellanox Technologies. */ + ++#include ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + #include ++#endif ++#ifdef HAVE_FIB_INFO_NH + #include ++#endif + #include "tc_tun_encap.h" + #include "en_tc.h" + #include "tc_tun.h" + #include "rep/tc.h" + #include "diag/en_tc_tracepoint.h" + ++#ifdef HAVE_TCF_TUNNEL_INFO + enum { + MLX5E_ROUTE_ENTRY_VALID = BIT(0), + }; +@@ -312,6 +318,7 @@ void mlx5e_take_all_encap_flows(struct m + /* Takes reference to all flows attached to route and adds the flows to + * flow_list using 'tmp_list' list_head in mlx5e_tc_flow. 
+ */ ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + static void mlx5e_take_all_route_decap_flows(struct mlx5e_route_entry *r, + struct list_head *flow_list) + { +@@ -320,6 +327,7 @@ static void mlx5e_take_all_route_decap_f + list_for_each_entry(flow, &r->decap_flows, decap_routes) + mlx5e_take_tmp_flow(flow, flow_list, 0); + } ++#endif + + typedef bool (match_cb)(struct mlx5e_encap_entry *); + +@@ -407,9 +415,11 @@ void mlx5e_tc_update_neigh_used_value(st + if (m_neigh->family == AF_INET) + tbl = &arp_tbl; + #if IS_ENABLED(CONFIG_IPV6) ++#if defined(HAVE_IPV6_STUBS_H) + else if (m_neigh->family == AF_INET6) + tbl = ipv6_stub->nd_tbl; + #endif ++#endif + else + return; + +@@ -450,7 +460,9 @@ void mlx5e_tc_update_neigh_used_value(st + } + } + ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used); ++#endif + + if (neigh_used) { + nhe->reported_lastuse = jiffies; +@@ -719,6 +731,7 @@ out: + return err; + } + ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + static int mlx5e_update_vf_tunnel(struct mlx5_eswitch *esw, + struct mlx5_esw_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, +@@ -753,6 +766,7 @@ out: + dev_put(route_dev); + return err; + } ++#endif + + static unsigned int mlx5e_route_tbl_get_last_update(struct mlx5e_priv *priv) + { +@@ -1030,6 +1044,7 @@ static void mlx5e_route_dealloc(struct m + kfree_rcu(r, rcu); + } + ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + static void mlx5e_route_put(struct mlx5e_priv *priv, struct mlx5e_route_entry *r) + { + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; +@@ -1042,6 +1057,7 @@ static void mlx5e_route_put(struct mlx5e + + mlx5e_route_dealloc(priv, r); + } ++#endif + + static void mlx5e_route_put_locked(struct mlx5e_priv *priv, struct mlx5e_route_entry *r) + { +@@ -1119,6 +1135,7 @@ mlx5e_route_get_create(struct mlx5e_priv + return r; + } + ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + static struct mlx5e_route_entry * + mlx5e_route_lookup_for_update(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key) + { +@@ -1181,6 +1198,7 @@ mlx5e_route_enqueue_update(struct mlx5e_ + + return 0; + } ++#endif + + int mlx5e_attach_decap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +@@ -1221,6 +1239,7 @@ int mlx5e_attach_decap_route(struct mlx5 + /* Routing changed concurrently. FIB event handler might have missed new + * entry, schedule update. + */ ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + if (tbl_time_before != tbl_time_after) { + err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE); + if (err) { +@@ -1228,6 +1247,7 @@ int mlx5e_attach_decap_route(struct mlx5 + goto out; + } + } ++#endif + + flow->decap_route = r; + list_add(&flow->decap_routes, &r->decap_flows); +@@ -1283,6 +1303,7 @@ static int mlx5e_attach_encap_route(stru + /* Routing changed concurrently. FIB event handler might have missed new + * entry, schedule update. 
+ */ ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + if (tbl_time_before != tbl_time_after) { + err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE); + if (err) { +@@ -1290,6 +1311,7 @@ static int mlx5e_attach_encap_route(stru + return err; + } + } ++#endif + + flow->encap_routes[out_index].r = r; + if (new_encap_entry) +@@ -1347,6 +1369,7 @@ static void mlx5e_detach_encap_route(str + mlx5e_route_dealloc(priv, r); + } + ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + static void mlx5e_invalidate_encap(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct list_head *encap_flows) +@@ -1618,7 +1641,9 @@ out: + dev_put(event_data->ul_dev); + kfree(event_data); + } ++#endif + ++#ifdef HAVE_FIB_NOTIFIER_INFO_HAS_FAMILY + static struct mlx5e_tc_fib_event_data * + mlx5e_init_fib_work_ipv4(struct mlx5e_priv *priv, + struct net_device *ul_dev, +@@ -1630,15 +1655,19 @@ mlx5e_init_fib_work_ipv4(struct mlx5e_pr + struct mlx5e_tc_fib_event_data *fib_work; + struct mlx5e_route_entry *r; + struct mlx5e_route_key key; ++#ifdef HAVE_FIB_INFO_NH + struct net_device *fib_dev; ++#endif + + fen_info = container_of(info, struct fib_entry_notifier_info, info); ++#ifdef HAVE_FIB_INFO_NH + if (fen_info->fi->nh) + return NULL; + fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev; + if (!fib_dev || fib_dev->netdev_ops != &mlx5e_netdev_ops || + fen_info->dst_len != 32) + return NULL; ++#endif + + fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC); + if (!fib_work) +@@ -1671,24 +1700,31 @@ mlx5e_init_fib_work_ipv6(struct mlx5e_pr + unsigned long event, + struct fib_notifier_info *info) + { ++#ifdef HAVE_FIB6_ENTRY_NOTIFIER_INFO + struct fib6_entry_notifier_info *fen_info; + struct mlx5e_tc_fib_event_data *fib_work; + struct mlx5e_route_entry *r; + struct mlx5e_route_key key; ++#ifdef HAVE_FIB6_INFO_NH_DEV + struct net_device *fib_dev; ++#endif + + fen_info = container_of(info, struct fib6_entry_notifier_info, info); ++#ifdef HAVE_FIB6_INFO_NH_DEV + fib_dev = fib6_info_nh_dev(fen_info->rt); + if (fib_dev->netdev_ops != &mlx5e_netdev_ops || + fen_info->rt->fib6_dst.plen != 128) + return NULL; ++#endif + + fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC); + if (!fib_work) + return ERR_PTR(-ENOMEM); + ++#ifdef HAVE_FIB6_INFO_IN_FIB6_ENTRY_NOTIFIER_INFO + memcpy(&key.endpoint_ip.v6, &fen_info->rt->fib6_dst.addr, + sizeof(fen_info->rt->fib6_dst.addr)); ++#endif + key.ip_version = 6; + + /* Can't fail after this point because releasing reference to r +@@ -1705,11 +1741,14 @@ mlx5e_init_fib_work_ipv6(struct mlx5e_pr + + out: + kfree(fib_work); ++#endif + return NULL; + } ++#endif + + static int mlx5e_tc_tun_fib_event(struct notifier_block *nb, unsigned long event, void *ptr) + { ++#ifdef HAVE_FIB_NOTIFIER_HEADER_FILE + struct mlx5e_tc_fib_event_data *fib_work; + struct fib_notifier_info *info = ptr; + struct mlx5e_tc_tun_encap *encap; +@@ -1724,17 +1763,21 @@ static int mlx5e_tc_tun_fib_event(struct + switch (event) { + case FIB_EVENT_ENTRY_REPLACE: + case FIB_EVENT_ENTRY_DEL: ++#ifdef HAVE_FIB_NOTIFIER_INFO_HAS_FAMILY + if (info->family == AF_INET) + fib_work = mlx5e_init_fib_work_ipv4(priv, ul_dev, encap, event, info); + else if (info->family == AF_INET6) + fib_work = mlx5e_init_fib_work_ipv6(priv, ul_dev, encap, event, info); + else + return NOTIFY_DONE; ++#endif + + if (!IS_ERR_OR_NULL(fib_work)) { + queue_work(priv->wq, &fib_work->work); + } else if (IS_ERR(fib_work)) { ++#ifdef HAVE_NETDEV_NOTIFIER_INFO_EXTACK + NL_SET_ERR_MSG_MOD(info->extack, "Failed to init fib work"); ++#endif + 
mlx5_core_warn(priv->mdev, "Failed to init fib work, %ld\n", + PTR_ERR(fib_work)); + } +@@ -1743,6 +1786,7 @@ static int mlx5e_tc_tun_fib_event(struct + default: + return NOTIFY_DONE; + } ++#endif + + return NOTIFY_DONE; + } +@@ -1760,8 +1804,12 @@ struct mlx5e_tc_tun_encap *mlx5e_tc_tun_ + encap->fib_nb.notifier_call = mlx5e_tc_tun_fib_event; + spin_lock_init(&encap->route_lock); + hash_init(encap->route_tbl); ++#if defined(HAVE_REGISTER_FIB_NOTIFIER_HAS_4_PARAMS) + err = register_fib_notifier(dev_net(priv->netdev), &encap->fib_nb, + NULL, NULL); ++#else ++ err = register_fib_notifier(&encap->fib_nb, NULL); ++#endif + if (err) { + kvfree(encap); + return ERR_PTR(err); +@@ -1775,7 +1823,12 @@ void mlx5e_tc_tun_cleanup(struct mlx5e_t + if (!encap) + return; + ++#if defined(HAVE_REGISTER_FIB_NOTIFIER_HAS_4_PARAMS) + unregister_fib_notifier(dev_net(encap->priv->netdev), &encap->fib_nb); ++#else ++ unregister_fib_notifier(&encap->fib_nb); ++#endif + flush_workqueue(encap->priv->wq); /* flush fib event works */ + kvfree(encap); + } ++#endif /* HAVE_TCF_TUNNEL_INFO */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0110-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0110-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..92b84a3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0110-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,183 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c + +Change-Id: Ieef58a0e70fb17f456f22cbc9d8ece323c251e5e +--- + .../mellanox/mlx5/core/en/tc_tun_geneve.c | 38 ++++++++++++++++++- + 1 file changed, 36 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2018 Mellanox Technologies. */ + ++#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_OPTS ++ + #include + #include "lib/geneve.h" + #include "en/tc_tun.h" +@@ -21,9 +23,12 @@ static int mlx5e_tc_tun_calc_hlen_geneve + + static int mlx5e_tc_tun_check_udp_dport_geneve(struct mlx5e_priv *priv, + struct flow_cls_offload *f) ++ + { + struct flow_rule *rule = flow_cls_offload_flow_rule(f); +- struct netlink_ext_ack *extack = f->common.extack; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK ++ struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct flow_match_ports enc_ports; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) +@@ -35,8 +40,10 @@ static int mlx5e_tc_tun_check_udp_dport_ + * port, so udp dst port must match. 
+ */ + if (be16_to_cpu(enc_ports.key->dst) != GENEVE_UDP_PORT) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matched UDP dst port is not registered as a GENEVE port"); ++#endif + netdev_warn(priv->netdev, + "UDP port %d is not registered as a GENEVE port\n", + be16_to_cpu(enc_ports.key->dst)); +@@ -55,6 +62,7 @@ static int mlx5e_tc_tun_parse_udp_ports_ + int err; + + err = mlx5e_tc_tun_parse_udp_ports(priv, spec, f, headers_c, headers_v); ++ + if (err) + return err; + +@@ -125,7 +133,9 @@ static int mlx5e_tc_tun_parse_geneve_vni + struct flow_cls_offload *f) + { + struct flow_rule *rule = flow_cls_offload_flow_rule(f); +- struct netlink_ext_ack *extack = f->common.extack; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK ++ struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct flow_match_enc_keyid enc_keyid; + void *misc_c, *misc_v; + +@@ -141,7 +151,9 @@ static int mlx5e_tc_tun_parse_geneve_vni + return 0; + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, ft_field_support.outer_geneve_vni)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Matching on GENEVE VNI is not supported"); ++#endif + netdev_warn(priv->netdev, "Matching on GENEVE VNI is not supported\n"); + return -EOPNOTSUPP; + } +@@ -159,7 +171,9 @@ static int mlx5e_tc_tun_parse_geneve_opt + u8 max_tlv_option_data_len = MLX5_CAP_GEN(priv->mdev, max_geneve_tlv_option_data_len); + u8 max_tlv_options = MLX5_CAP_GEN(priv->mdev, max_geneve_tlv_options); + struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#endif + void *misc_c, *misc_v, *misc_3_c, *misc_3_v; + struct geneve_opt *option_key, *option_mask; + __be32 opt_data_key = 0, opt_data_mask = 0; +@@ -179,8 +193,10 @@ static int mlx5e_tc_tun_parse_geneve_opt + if (memchr_inv(&enc_opts.mask->data, 0, sizeof(enc_opts.mask->data)) && + !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.geneve_tlv_option_0_data)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options is not supported"); ++#endif + netdev_warn(priv->netdev, + "Matching on GENEVE options is not supported\n"); + return -EOPNOTSUPP; +@@ -189,8 +205,10 @@ static int mlx5e_tc_tun_parse_geneve_opt + /* make sure that we're talking about GENEVE options */ + + if (enc_opts.key->dst_opt_type != TUNNEL_GENEVE_OPT) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: option type is not GENEVE"); ++#endif + netdev_warn(priv->netdev, + "Matching on GENEVE options: option type is not GENEVE\n"); + return -EOPNOTSUPP; +@@ -199,7 +217,9 @@ static int mlx5e_tc_tun_parse_geneve_opt + if (enc_opts.mask->len && + !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_geneve_opt_len)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Matching on GENEVE options len is not supported"); ++#endif + netdev_warn(priv->netdev, + "Matching on GENEVE options len is not supported\n"); + return -EOPNOTSUPP; +@@ -212,8 +232,10 @@ static int mlx5e_tc_tun_parse_geneve_opt + */ + + if ((enc_opts.key->len / 4) > ((max_tlv_option_data_len + 1) * max_tlv_options)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: unsupported options len"); ++#endif + netdev_warn(priv->netdev, + "Matching on GENEVE options: unsupported options len (len=%d)\n", + enc_opts.key->len); +@@ -232,8 +254,10 @@ static int mlx5e_tc_tun_parse_geneve_opt + return 0; + + if 
(option_key->length > max_tlv_option_data_len) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: unsupported option len"); ++#endif + netdev_warn(priv->netdev, + "Matching on GENEVE options: unsupported option len (key=%d, mask=%d)\n", + option_key->length, option_mask->length); +@@ -242,8 +266,10 @@ static int mlx5e_tc_tun_parse_geneve_opt + + /* data can't be all 0 - fail to offload such rule */ + if (!memchr_inv(option_key->opt_data, 0, option_key->length * 4)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: can't match on 0 data field"); ++#endif + netdev_warn(priv->netdev, + "Matching on GENEVE options: can't match on 0 data field\n"); + return -EOPNOTSUPP; +@@ -252,8 +278,10 @@ static int mlx5e_tc_tun_parse_geneve_opt + /* add new GENEVE TLV options object */ + res = mlx5_geneve_tlv_option_add(priv->mdev->geneve, option_key); + if (res) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: failed creating TLV opt object"); ++#endif + netdev_warn(priv->netdev, + "Matching on GENEVE options: failed creating TLV opt object (class:type:len = 0x%x:0x%x:%d)\n", + be16_to_cpu(option_key->opt_class), +@@ -290,12 +318,16 @@ static int mlx5e_tc_tun_parse_geneve_par + { + void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#endif + + /* match on OAM - packets with OAM bit on should NOT be offloaded */ + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, ft_field_support.outer_geneve_oam)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Matching on GENEVE OAM is not supported"); ++#endif + netdev_warn(priv->netdev, "Matching on GENEVE OAM is not supported\n"); + return -EOPNOTSUPP; + } +@@ -373,3 +405,5 @@ struct mlx5e_tc_tunnel geneve_tunnel = { + .parse_tunnel = mlx5e_tc_tun_parse_geneve, + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_geneve, + }; ++ ++#endif /* HAVE_FLOW_DISSECTOR_KEY_ENC_OPTS */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0111-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0111-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..85c5e4d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0111-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c + +Change-Id: I61e04f475e71876acbd3a4fab72cf82f738dd95a +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2018 Mellanox Technologies. 
*/ + ++#ifdef HAVE_TCF_TUNNEL_INFO ++ + #include + #include "en/tc_tun.h" + +@@ -96,3 +98,4 @@ struct mlx5e_tc_tunnel gre_tunnel = { + .parse_tunnel = mlx5e_tc_tun_parse_gretap, + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_generic, + }; ++#endif /* HAVE_TCF_TUNNEL_INFO */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0112-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0112-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..68be4c9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0112-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,25 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c + +Change-Id: I5a4e8ba9a095fe7e3c035d7edc52b8e794f60f9b +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2018 Mellanox Technologies. */ + ++#if defined(HAVE_NET_BAREUDP_H) && defined(HAVE_FLOW_DISSECTOR_MPLS_LSE) ++ + #include + #include + #include "en/tc_tun.h" +@@ -126,3 +128,4 @@ struct mlx5e_tc_tunnel mplsoudp_tunnel = + .parse_tunnel = parse_tunnel, + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_generic, + }; ++#endif /* HAVE_NET_BAREUDP_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0113-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0113-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..d690a99 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0113-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,98 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c + +Change-Id: I9d7c3152268794515b9e240f96b4323bbc631357 +--- + .../mellanox/mlx5/core/en/tc_tun_vxlan.c | 25 ++++++++++++++++++- + 1 file changed, 24 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2018 Mellanox Technologies. 
*/ + ++#ifdef HAVE_TCF_TUNNEL_INFO ++ + #include + #include "lib/vxlan.h" + #include "en/tc_tun.h" +@@ -17,9 +19,12 @@ static int mlx5e_tc_tun_calc_hlen_vxlan( + + static int mlx5e_tc_tun_check_udp_dport_vxlan(struct mlx5e_priv *priv, + struct flow_cls_offload *f) ++ + { + struct flow_rule *rule = flow_cls_offload_flow_rule(f); +- struct netlink_ext_ack *extack = f->common.extack; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK ++ struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct flow_match_ports enc_ports; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) +@@ -31,8 +36,10 @@ static int mlx5e_tc_tun_check_udp_dport_ + + if (!mlx5_vxlan_lookup_port(priv->mdev->vxlan, + be16_to_cpu(enc_ports.key->dst))) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matched UDP dst port is not registered as a VXLAN port"); ++#endif + netdev_warn(priv->netdev, + "UDP port %d is not registered as a VXLAN port\n", + be16_to_cpu(enc_ports.key->dst)); +@@ -67,8 +74,10 @@ static int mlx5e_tc_tun_init_encap_attr_ + e->tunnel = &vxlan_tunnel; + + if (!mlx5_vxlan_lookup_port(priv->mdev->vxlan, dst_port)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "vxlan udp dport was not registered with the HW"); ++#endif + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", + dst_port); +@@ -105,7 +114,9 @@ static int mlx5e_tc_tun_parse_vxlan(stru + void *headers_v) + { + struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct flow_match_enc_keyid enc_keyid; + void *misc_c, *misc_v; + +@@ -124,8 +135,10 @@ static int mlx5e_tc_tun_parse_vxlan(stru + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_vxlan_vni)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on VXLAN VNI is not supported"); ++#endif + netdev_warn(priv->netdev, + "Matching on VXLAN VNI is not supported\n"); + return -EOPNOTSUPP; +@@ -145,6 +158,15 @@ static int mlx5e_tc_tun_get_remote_ifind + { + const struct vxlan_dev *vxlan = netdev_priv(mirred_dev); + const struct vxlan_rdst *dst = &vxlan->default_dst; ++ int i; ++ ++ for (i = 0; i < 5; i++) { ++ if ((int) dst->remote_ifindex >= 0) ++ break; ++ msleep(1); ++ } ++ if ((int) dst->remote_ifindex < 0) ++ return 0; + + return dst->remote_ifindex; + } +@@ -161,3 +183,4 @@ struct mlx5e_tc_tunnel vxlan_tunnel = { + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_generic, + .get_remote_ifindex = mlx5e_tc_tun_get_remote_ifindex, + }; ++#endif /* HAVE_TCF_TUNNEL_INFO */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0114-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0114-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..303a706 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0114-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,73 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/trap.c + +Change-Id: I04a85c22668de7ccf0c68bb36262d728855f7a26 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/trap.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2020 Mellanox Technologies */ + ++#ifdef 
HAVE_DEVLINK_TRAP_SUPPORT ++ + #include + #include "en/txrx.h" + #include "en/params.h" +@@ -53,7 +55,11 @@ static void mlx5e_init_trap_rq(struct ml + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->stats = &priv->trap_stats.rq; + rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); ++#ifdef HAVE_XDP_SUPPORT ++#ifdef HAVE_XDP_RXQ_INFO + xdp_rxq_info_unused(&rq->xdp_rxq); ++#endif ++#endif + mlx5e_rq_set_trap_handlers(rq, params); + } + +@@ -225,11 +231,13 @@ static int mlx5e_handle_action_trap(stru + if (err) + goto err_out; + break; ++#ifdef HAVE_DEVLINK_TRAP_DMAC_FILTER + case DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER: + err = mlx5e_add_mac_trap(priv, trap_id, mlx5e_trap_get_tirn(priv->en_trap)); + if (err) + goto err_out; + break; ++#endif + default: + netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id); + err = -EINVAL; +@@ -249,9 +257,11 @@ static int mlx5e_handle_action_drop(stru + case DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER: + mlx5e_remove_vlan_trap(priv); + break; ++#ifdef HAVE_DEVLINK_TRAP_DMAC_FILTER + case DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER: + mlx5e_remove_mac_trap(priv); + break; ++#endif + default: + netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id); + return -EINVAL; +@@ -305,7 +315,9 @@ static int mlx5e_apply_trap(struct mlx5e + + static const int mlx5e_traps_arr[] = { + DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER, ++#ifdef HAVE_DEVLINK_TRAP_DMAC_FILTER + DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER, ++#endif + }; + + int mlx5e_apply_traps(struct mlx5e_priv *priv, bool enable) +@@ -320,3 +332,5 @@ int mlx5e_apply_traps(struct mlx5e_priv + } + return 0; + } ++ ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0115-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0115-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..60faa41 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0115-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,27 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/trap.h + +Change-Id: I198c63243d2b896d229250aeb89265720f2b1160 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/trap.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h +@@ -29,9 +29,16 @@ struct mlx5e_trap { + struct mlx5e_rq_param rq_param; + }; + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + void mlx5e_close_trap(struct mlx5e_trap *trap); + void mlx5e_deactivate_trap(struct mlx5e_priv *priv); + int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx); + int mlx5e_apply_traps(struct mlx5e_priv *priv, bool enable); ++#else ++void mlx5e_close_trap(struct mlx5e_trap *trap) {} ++void mlx5e_deactivate_trap(struct mlx5e_priv *priv) {} ++int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx) { return 0; } ++int mlx5e_apply_traps(struct mlx5e_priv *priv, bool enable) { return 0; } ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + + #endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0116-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0116-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..e9229a2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0116-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 
+1,94 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h + +Change-Id: I26abe6ad6a2b2b07cc594ff716334b85bb6fe11d +--- + .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 45 +++++++++++++------ + 1 file changed, 31 insertions(+), 14 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +@@ -9,18 +9,7 @@ + + #define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + +-/* The mult of MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS +- * (16 * 4 == 64) does not fit in the 6-bit DS field of Ctrl Segment. +- * We use a bound lower that MLX5_SEND_WQE_MAX_WQEBBS to let a +- * full-session WQE be cache-aligned. +- */ +-#if L1_CACHE_BYTES < 128 +-#define MLX5E_TX_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 1) +-#else +-#define MLX5E_TX_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 2) +-#endif +- +-#define MLX5E_TX_MPW_MAX_NUM_DS (MLX5E_TX_MPW_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS) ++#define MLX5_XMIT_MORE_SKB_CB 0xa + + #define INL_HDR_START_SZ (sizeof(((struct mlx5_wqe_eth_seg *)NULL)->inline_hdr.start)) + +@@ -37,7 +26,7 @@ enum mlx5e_icosq_wqe_type { + MLX5E_ICOSQ_WQE_NOP, + MLX5E_ICOSQ_WQE_UMR_RX, + MLX5E_ICOSQ_WQE_SHAMPO_HD_UMR, +-#ifdef CONFIG_MLX5_EN_TLS ++#ifdef HAVE_KTLS_RX_SUPPORT + MLX5E_ICOSQ_WQE_UMR_TLS, + MLX5E_ICOSQ_WQE_SET_PSV_TLS, + MLX5E_ICOSQ_WQE_GET_PSV_TLS, +@@ -69,8 +58,27 @@ void mlx5e_free_rx_descs(struct mlx5e_rq + void mlx5e_free_rx_in_progress_descs(struct mlx5e_rq *rq); + + /* TX */ ++#ifdef HAVE_NDO_SELECT_QUEUE_HAS_3_PARMS_NO_FALLBACK + u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev); ++ ++#elif defined(HAVE_SELECT_QUEUE_FALLBACK_T) ++ ++u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, ++#ifdef HAVE_SELECT_QUEUE_FALLBACK_T ++#ifdef HAVE_SELECT_QUEUE_NET_DEVICE ++ struct net_device *sb_dev, ++#else ++ void *accel_priv, ++#endif /* HAVE_SELECT_QUEUE_NET_DEVICE */ ++ select_queue_fallback_t fallback); ++#else ++ void *accel_priv); ++#endif ++#else /* HAVE_SELECT_QUEUE_FALLBACK_T */ ++u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb); ++#endif ++ + netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev); + bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget); + void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq); +@@ -240,7 +248,11 @@ mlx5e_notify_hw(struct mlx5_wq_cyc *wq, + { + ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; + /* ensure wqe is visible to device before updating doorbell record */ ++#ifdef dma_wmb + dma_wmb(); ++#else ++ wmb(); ++#endif + + *wq->db = cpu_to_be32(pc); + +@@ -432,8 +444,13 @@ mlx5e_set_eseg_swp(struct sk_buff *skb, + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; ++ + case IPPROTO_TCP: +- eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; ++#ifdef HAVE_SKB_INNER_TRANSPORT_OFFSET ++ eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; ++#else ++ eseg->swp_inner_l4_offset = (skb_inner_transport_header(skb) - skb->data) / 2; ++#endif /*HAVE_SKB_INNER_TRANSPORT_OFFSET*/ + break; + } + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0117-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0117-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..e165f30 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0117-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,414 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c + +Change-Id: Iadf0cd7110da5fb771033d34fe4ad2c56a60d230 +--- + .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 231 ++++++++++++++++-- + 1 file changed, 213 insertions(+), 18 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +@@ -30,9 +30,18 @@ + * SOFTWARE. + */ + ++#ifdef HAVE_XDP_SUPPORT + #include ++#ifdef HAVE_NET_PAGE_POOL_H + #include ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_XDP_SOCK_DRV_H + #include ++#else ++#include ++#endif ++#endif + #include "en/xdp.h" + #include "en/params.h" + +@@ -65,13 +74,18 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq * + struct xdp_frame *xdpf; + dma_addr_t dma_addr; + ++#ifdef HAVE_XDP_CONVERT_BUFF_TO_FRAME + xdpf = xdp_convert_buff_to_frame(xdp); ++#else ++ xdpf = convert_to_xdp_frame(xdp); ++#endif + if (unlikely(!xdpf)) + return false; + + xdptxd.data = xdpf->data; + xdptxd.len = xdpf->len; + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { + /* The xdp_buff was in the UMEM and was copied into a newly + * allocated page. The UMEM page was returned via the ZCA, and +@@ -97,7 +111,9 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq * + xdptxd.dma_addr = dma_addr; + xdpi.frame.xdpf = xdpf; + xdpi.frame.dma_addr = dma_addr; +- } else { ++ } else ++#endif ++ { + /* Driver assumes that xdp_convert_buff_to_frame returns + * an xdp_frame that points to the same memory region as + * the original xdp_buff. It allows to map the memory only +@@ -125,12 +141,28 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *r + { + struct bpf_prog *prog = rcu_dereference(rq->xdp_prog); + u32 act; ++#ifdef HAVE_XDP_SUPPORT + int err; ++#endif + + if (!prog) + return false; + + act = bpf_prog_run_xdp(prog, xdp); ++ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifndef HAVE_XSK_BUFF_ALLOC ++ if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { ++ u64 off = xdp->data - xdp->data_hard_start; ++ ++#ifdef HAVE_XSK_UMEM_ADJUST_OFFSET ++ xdp->handle = xsk_umem_adjust_offset(rq->umem, xdp->handle, off); ++#else ++ xdp->handle = xdp->handle + off; ++#endif ++ } ++#endif ++#endif + switch (act) { + case XDP_PASS: + *len = xdp->data_end - xdp->data; +@@ -140,28 +172,42 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *r + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ + return true; ++#ifdef HAVE_XDP_SUPPORT + case XDP_REDIRECT: ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL) { ++#endif + page_ref_sub(di->page, di->refcnt_bias); + di->refcnt_bias = 0; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + } ++#endif + /* When XDP enabled then page-refcnt==1 here */ + err = xdp_do_redirect(rq->netdev, xdp, prog); + if (unlikely(err)) + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); + __set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL) ++#endif + mlx5e_page_dma_unmap(rq, di); + rq->stats->xdp_redirect++; + return true; ++#endif + default: ++#ifdef HAVE_BPF_WARN_IVALID_XDP_ACTION_GET_3_PARAMS + bpf_warn_invalid_xdp_action(rq->netdev, prog, act); ++#else ++ bpf_warn_invalid_xdp_action(act); ++#endif + fallthrough; + case XDP_ABORTED: + xdp_abort: ++#if defined(HAVE_TRACE_XDP_EXCEPTION) && !defined(MLX_DISABLE_TRACEPOINTS) + 
trace_xdp_exception(rq->netdev, prog, act); + fallthrough; ++#endif + case XDP_DROP: + rq->stats->xdp_drop++; + return true; +@@ -378,9 +424,15 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq + + static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, + struct mlx5e_xdp_wqe_info *wi, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + u32 *xsk_frames, +- bool recycle, +- struct xdp_frame_bulk *bq) ++#endif ++ bool recycle ++#ifdef HAVE_XDP_FRAME_BULK ++ , struct xdp_frame_bulk *bq) ++#else ++ ) ++#endif + { + struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; + u16 i; +@@ -393,16 +445,25 @@ static void mlx5e_free_xdpsq_desc(struct + /* XDP_TX from the XSK RQ and XDP_REDIRECT */ + dma_unmap_single(sq->pdev, xdpi.frame.dma_addr, + xdpi.frame.xdpf->len, DMA_TO_DEVICE); ++#ifdef HAVE_XDP_FRAME_BULK + xdp_return_frame_bulk(xdpi.frame.xdpf, bq); ++#elif defined(HAVE_XDP_FRAME) ++ xdp_return_frame(xdpi.frame.xdpf); ++#else ++ /* Assumes order0 page*/ ++ put_page(virt_to_page(xdpi.frame.xdpf->data)); ++#endif + break; + case MLX5E_XDP_XMIT_MODE_PAGE: + /* XDP_TX from the regular RQ */ + mlx5e_page_release_dynamic(xdpi.page.rq, &xdpi.page.di, recycle); + break; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + case MLX5E_XDP_XMIT_MODE_XSK: + /* AF_XDP send */ + (*xsk_frames)++; + break; ++#endif + default: + WARN_ON_ONCE(true); + } +@@ -411,14 +472,20 @@ static void mlx5e_free_xdpsq_desc(struct + + bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) + { ++#ifdef HAVE_XDP_FRAME_BULK + struct xdp_frame_bulk bq; ++#endif + struct mlx5e_xdpsq *sq; + struct mlx5_cqe64 *cqe; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + u32 xsk_frames = 0; ++#endif + u16 sqcc; + int i; + ++#ifdef HAVE_XDP_FRAME_BULK + xdp_frame_bulk_init(&bq); ++#endif + + sq = container_of(cq, struct mlx5e_xdpsq, cq); + +@@ -451,7 +518,15 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq + + sqcc += wi->num_wqebbs; + +- mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true, &bq); ++ mlx5e_free_xdpsq_desc(sq, wi ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ , &xsk_frames ++#endif ++ , true ++#ifdef HAVE_XDP_FRAME_BULK ++ , &bq ++#endif ++ ); + } while (!last_wqe); + + if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { +@@ -464,10 +539,18 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq + } + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + ++#ifdef HAVE_XDP_FRAME_BULK + xdp_flush_frame_bulk(&bq); ++#endif + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xsk_frames) ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + xsk_tx_completed(sq->xsk_pool, xsk_frames); ++#else ++ xsk_umem_complete_tx(sq->umem, xsk_frames); ++#endif ++#endif + + sq->stats->cqes += i; + +@@ -482,12 +565,18 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq + + void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) + { ++#ifdef HAVE_XDP_FRAME_BULK + struct xdp_frame_bulk bq; ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + u32 xsk_frames = 0; ++#endif + ++#ifdef HAVE_XDP_FRAME_BULK + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ ++#endif + + while (sq->cc != sq->pc) { + struct mlx5e_xdp_wqe_info *wi; +@@ -498,16 +587,57 @@ void mlx5e_free_xdpsq_descs(struct mlx5e + + sq->cc += wi->num_wqebbs; + +- mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false, &bq); ++ mlx5e_free_xdpsq_desc(sq, wi ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ , &xsk_frames ++#endif ++ , false ++#ifdef HAVE_XDP_FRAME_BULK ++ , &bq ++#endif ++ ); + } + ++#ifdef HAVE_XDP_FRAME_BULK + xdp_flush_frame_bulk(&bq); ++ + rcu_read_unlock(); ++#endif + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xsk_frames) ++#ifdef 
HAVE_NETDEV_BPF_XSK_BUFF_POOL + xsk_tx_completed(sq->xsk_pool, xsk_frames); ++#else ++ xsk_umem_complete_tx(sq->umem, xsk_frames); ++#endif ++#endif ++} ++ ++void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq) ++{ ++ struct mlx5e_xdpsq *xdpsq = rq->xdpsq; ++ ++ if (xdpsq->mpwqe.wqe) ++ mlx5e_xdp_mpwqe_complete(xdpsq); ++ ++ mlx5e_xmit_xdp_doorbell(xdpsq); ++ if (test_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags)) { ++ xdp_do_flush_map(); ++ __clear_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); ++ } + } + ++void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw) ++{ ++ sq->xmit_xdp_frame_check = is_mpw ? ++ mlx5e_xmit_xdp_frame_check_mpwqe : mlx5e_xmit_xdp_frame_check; ++ sq->xmit_xdp_frame = is_mpw ? ++ mlx5e_xmit_xdp_frame_mpwqe : mlx5e_xmit_xdp_frame; ++} ++ ++#ifdef HAVE_NDO_XDP_XMIT ++#ifndef HAVE_NDO_XDP_FLUSH + int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) + { +@@ -568,25 +698,90 @@ int mlx5e_xdp_xmit(struct net_device *de + return nxmit; + } + +-void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq) ++#else ++int mlx5e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) + { +- struct mlx5e_xdpsq *xdpsq = rq->xdpsq; ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ struct mlx5e_xmit_data xdptxd; ++ struct mlx5e_xdp_info xdpi; ++ struct xdp_frame *xdpf; ++ struct mlx5e_xdpsq *sq; ++ int sq_num; ++ int err = 0; + +- if (xdpsq->mpwqe.wqe) +- mlx5e_xdp_mpwqe_complete(xdpsq); ++ /* this flag is sufficient, no need to test internal sq state */ ++ if (unlikely(!mlx5e_xdp_tx_is_enabled(priv))) ++ return -ENETDOWN; + +- mlx5e_xmit_xdp_doorbell(xdpsq); ++ sq_num = smp_processor_id(); + +- if (test_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags)) { +- xdp_do_flush_map(); +- __clear_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); ++ if (unlikely(sq_num >= priv->channels.num)) ++ return -ENXIO; ++ ++ sq = &priv->channels.c[sq_num]->xdpsq; ++ ++ xdpf = convert_to_xdp_frame(xdp); ++ ++ if (unlikely(!xdpf)) ++ return -EINVAL; ++ ++ xdptxd.data = xdpf->data; ++ xdptxd.len = xdpf->len; ++ ++ xdptxd.dma_addr = dma_map_single(sq->pdev, xdptxd.data, ++ xdptxd.len, DMA_TO_DEVICE); ++ ++ if (unlikely(dma_mapping_error(sq->pdev, xdptxd.dma_addr))) { ++ err = -ENOMEM; ++ goto err_release_page; + } ++ ++ xdpi.mode = MLX5E_XDP_XMIT_MODE_FRAME; ++ xdpi.frame.xdpf = xdpf; ++ xdpi.frame.dma_addr = xdptxd.dma_addr; ++ ++ if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, 0))) { ++ dma_unmap_single(sq->pdev, xdptxd.dma_addr, ++ xdptxd.len, DMA_TO_DEVICE); ++ err = -ENOSPC; ++ goto err_release_page; ++ } ++ ++ return 0; ++ ++err_release_page: ++#ifdef HAVE_XDP_FRAME ++ xdp_return_frame_rx_napi(xdpf); ++#else ++ /* Assumes order0 page */ ++ put_page(virt_to_page(xdpf->data)); ++#endif ++ ++ return err; + } + +-void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw) ++void mlx5e_xdp_flush(struct net_device *dev) + { +- sq->xmit_xdp_frame_check = is_mpw ? +- mlx5e_xmit_xdp_frame_check_mpwqe : mlx5e_xmit_xdp_frame_check; +- sq->xmit_xdp_frame = is_mpw ? 
+- mlx5e_xmit_xdp_frame_mpwqe : mlx5e_xmit_xdp_frame; ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ struct mlx5e_xdpsq *sq; ++ int sq_num; ++ ++ /* this flag is sufficient, no need to test internal sq state */ ++ if (unlikely(!mlx5e_xdp_tx_is_enabled(priv))) ++ return; ++ ++ sq_num = smp_processor_id(); ++ ++ if (unlikely(sq_num >= priv->channels.num)) ++ return; ++ ++ sq = &priv->channels.c[sq_num]->xdpsq; ++ ++ if (sq->mpwqe.wqe) ++ mlx5e_xdp_mpwqe_complete(sq); ++ mlx5e_xmit_xdp_doorbell(sq); + } ++#endif ++#endif ++#endif ++ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0118-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0118-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..2ad4415 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0118-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,47 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h + +Change-Id: Ib48b0889eba748480d35057bc445ec1977696ae6 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +@@ -32,6 +32,7 @@ + #ifndef __MLX5_EN_XDP_H__ + #define __MLX5_EN_XDP_H__ + ++#ifdef HAVE_XDP_SUPPORT + #include + + #include "en.h" +@@ -54,8 +55,15 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq + void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); + void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw); + void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq); ++#ifdef HAVE_NDO_XDP_XMIT ++#ifndef HAVE_NDO_XDP_FLUSH + int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); ++#else ++int mlx5e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); ++void mlx5e_xdp_flush(struct net_device *dev); ++#endif ++#endif + + INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, + struct mlx5e_xmit_data *xdptxd, +@@ -103,7 +111,6 @@ static inline void mlx5e_xmit_xdp_doorbe + sq->doorbell_cseg = NULL; + } + } +- + /* Enable inline WQEs to shift some load from a congested HCA (HW) to + * a less congested cpu (SW). + */ +@@ -185,3 +192,4 @@ mlx5e_xdpi_fifo_pop(struct mlx5e_xdp_inf + return fifo->xi[(*fifo->cc)++ & fifo->mask]; + } + #endif ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0119-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0119-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..b8f93f2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0119-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,274 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c + +Change-Id: I1906ac0f05e40695f34e8f65bec74d3a7b54dbe8 +--- + .../ethernet/mellanox/mlx5/core/en/xsk/pool.c | 152 +++++++++++++++++- + 1 file changed, 146 insertions(+), 6 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c +@@ -1,31 +1,90 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved. 
*/ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_XDP_SOCK_DRV_H + #include ++#else ++#include ++#endif + #include "pool.h" + #include "setup.h" + #include "en/params.h" + + static int mlx5e_xsk_map_pool(struct mlx5e_priv *priv, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *pool) ++#else ++ struct xdp_umem *umem) ++#endif + { + struct device *dev = mlx5_core_dma_dev(priv->mdev); + ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + return xsk_pool_dma_map(pool, dev, DMA_ATTR_SKIP_CPU_SYNC); ++#elif defined(HAVE_XSK_BUFF_ALLOC) ++ return xsk_buff_dma_map(umem, dev, 0); ++#else ++ u32 i; ++ ++ for (i = 0; i < umem->npgs; i++) { ++ dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE, ++ DMA_BIDIRECTIONAL); ++ ++ if (unlikely(dma_mapping_error(dev, dma))) ++ goto err_unmap; ++ umem->pages[i].dma = dma; ++ } ++ ++ return 0; ++ ++err_unmap: ++ while (i--) { ++ dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE, ++ DMA_BIDIRECTIONAL); ++ umem->pages[i].dma = 0; ++ } ++ ++ return -ENOMEM; ++#endif /* HAVE_NETDEV_BPF_XSK_BUFF_POOL */ + } + + static void mlx5e_xsk_unmap_pool(struct mlx5e_priv *priv, +- struct xsk_buff_pool *pool) ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool) ++#else ++ struct xdp_umem *umem) ++#endif + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + return xsk_pool_dma_unmap(pool, DMA_ATTR_SKIP_CPU_SYNC); ++#elif defined(HAVE_XSK_BUFF_ALLOC) ++ return xsk_buff_dma_unmap(umem, 0); ++#else ++ struct device *dev = priv->mdev->device; ++ u32 i; ++ ++ for (i = 0; i < umem->npgs; i++) { ++ dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE, ++ DMA_BIDIRECTIONAL); ++ umem->pages[i].dma = 0; ++ } ++#endif + } + + static int mlx5e_xsk_get_pools(struct mlx5e_xsk *xsk) + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + if (!xsk->pools) { + xsk->pools = kcalloc(MLX5E_MAX_NUM_CHANNELS, + sizeof(*xsk->pools), GFP_KERNEL); + if (unlikely(!xsk->pools)) ++#else ++ if (!xsk->umems) { ++ xsk->umems = kcalloc(MLX5E_MAX_NUM_CHANNELS, ++ sizeof(*xsk->umems), GFP_KERNEL); ++ if (unlikely(!xsk->umems)) ++#endif + return -ENOMEM; + } + +@@ -38,44 +97,93 @@ static int mlx5e_xsk_get_pools(struct ml + static void mlx5e_xsk_put_pools(struct mlx5e_xsk *xsk) + { + if (!--xsk->refcnt) { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + kfree(xsk->pools); + xsk->pools = NULL; ++#else ++ kfree(xsk->umems); ++ xsk->umems = NULL; ++#endif + } + } + +-static int mlx5e_xsk_add_pool(struct mlx5e_xsk *xsk, struct xsk_buff_pool *pool, u16 ix) ++static int mlx5e_xsk_add_pool(struct mlx5e_xsk *xsk, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *pool, ++#endif ++ u16 ix) + { + int err; + + err = mlx5e_xsk_get_pools(xsk); ++ + if (unlikely(err)) + return err; + ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + xsk->pools[ix] = pool; ++#else ++ xsk->umems[ix] = pool; ++#endif + return 0; + } + + static void mlx5e_xsk_remove_pool(struct mlx5e_xsk *xsk, u16 ix) + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + xsk->pools[ix] = NULL; ++#else ++ xsk->umems[ix] = NULL; ++#endif + + mlx5e_xsk_put_pools(xsk); + } + ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + static bool mlx5e_xsk_is_pool_sane(struct xsk_buff_pool *pool) ++#else ++static bool mlx5e_xsk_is_pool_sane(struct xdp_umem *umem) ++#endif + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + return xsk_pool_get_headroom(pool) <= 0xffff && + xsk_pool_get_chunk_size(pool) <= 0xffff; ++#elif defined(HAVE_XSK_BUFF_ALLOC) ++ return xsk_umem_get_headroom(umem) <= 0xffff && ++ xsk_umem_get_chunk_size(umem) <= 0xffff; ++#else ++ return 
umem->headroom <= 0xffff && ++ umem->chunk_size_nohr <= 0xffff; ++#endif + } + ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + void mlx5e_build_xsk_param(struct xsk_buff_pool *pool, struct mlx5e_xsk_param *xsk) ++#else ++void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk) ++#endif + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + xsk->headroom = xsk_pool_get_headroom(pool); + xsk->chunk_size = xsk_pool_get_chunk_size(pool); ++#elif defined(HAVE_XSK_BUFF_ALLOC) ++ xsk->headroom = xsk_umem_get_headroom(umem); ++ xsk->chunk_size = xsk_umem_get_chunk_size(umem); ++#else ++ xsk->headroom = umem->headroom; ++ xsk->chunk_size = umem->chunk_size_nohr + umem->headroom; ++#endif + } + + static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv, +- struct xsk_buff_pool *pool, u16 ix) ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *pool, ++#endif ++ u16 ix) + { + struct mlx5e_params *params = &priv->channels.params; + struct mlx5e_xsk_param xsk; +@@ -89,10 +197,12 @@ static int mlx5e_xsk_enable_locked(struc + return -EINVAL; + + err = mlx5e_xsk_map_pool(priv, pool); ++ + if (unlikely(err)) + return err; + + err = mlx5e_xsk_add_pool(&priv->xsk, pool, ix); ++ + if (unlikely(err)) + goto err_unmap_pool; + +@@ -156,7 +266,11 @@ validate_closed: + + static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix) + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *pool = mlx5e_xsk_get_pool(&priv->channels.params, ++#else ++ struct xdp_umem *pool = mlx5e_xsk_get_pool(&priv->channels.params, ++#endif + &priv->xsk, ix); + struct mlx5e_channel *c; + +@@ -183,8 +297,13 @@ remove_pool: + return 0; + } + +-static int mlx5e_xsk_enable_pool(struct mlx5e_priv *priv, struct xsk_buff_pool *pool, +- u16 ix) ++static int mlx5e_xsk_enable_pool(struct mlx5e_priv *priv, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *pool, ++#endif ++ u16 ix) + { + int err; + +@@ -206,7 +325,13 @@ static int mlx5e_xsk_disable_pool(struct + return err; + } + +-int mlx5e_xsk_setup_pool(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid) ++int mlx5e_xsk_setup_pool(struct net_device *dev, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *pool, ++#endif ++ u16 qid) + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_params *params = &priv->channels.params; +@@ -218,3 +343,18 @@ int mlx5e_xsk_setup_pool(struct net_devi + return pool ? 
mlx5e_xsk_enable_pool(priv, pool, ix) : + mlx5e_xsk_disable_pool(priv, ix); + } ++ ++#ifndef HAVE_XSK_BUFF_ALLOC ++int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries) ++{ ++ struct xdp_umem_fq_reuse *reuseq; ++ ++ reuseq = xsk_reuseq_prepare(nentries); ++ if (unlikely(!reuseq)) ++ return -ENOMEM; ++ xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); ++ ++ return 0; ++} ++#endif ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0120-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0120-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..1606515 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0120-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,58 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h + +Change-Id: If1599a925cc22640d3f224dd5cf6cf7b8f9d1b4f +--- + .../ethernet/mellanox/mlx5/core/en/xsk/pool.h | 27 ++++++++++++++++++- + 1 file changed, 26 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h +@@ -4,8 +4,10 @@ + #ifndef __MLX5_EN_XSK_POOL_H__ + #define __MLX5_EN_XSK_POOL_H__ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + #include "en.h" + ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + static inline struct xsk_buff_pool *mlx5e_xsk_get_pool(struct mlx5e_params *params, + struct mlx5e_xsk *xsk, u16 ix) + { +@@ -17,11 +19,34 @@ static inline struct xsk_buff_pool *mlx5 + + return xsk->pools[ix]; + } ++#else ++static inline struct xdp_umem *mlx5e_xsk_get_pool(struct mlx5e_params *params, ++ struct mlx5e_xsk *xsk, u16 ix) ++{ ++ if (!xsk || !xsk->umems) ++ return NULL; ++ ++ if (unlikely(ix >= params->num_channels)) ++ return NULL; ++ ++ return xsk->umems[ix]; ++} ++#endif + + struct mlx5e_xsk_param; +-void mlx5e_build_xsk_param(struct xsk_buff_pool *pool, struct mlx5e_xsk_param *xsk); + + /* .ndo_bpf callback. */ ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++void mlx5e_build_xsk_param(struct xsk_buff_pool *pool, struct mlx5e_xsk_param *xsk); + int mlx5e_xsk_setup_pool(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid); ++#else ++void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk); ++int mlx5e_xsk_setup_pool(struct net_device *dev, struct xdp_umem *pool, u16 qid); ++#endif ++ ++#ifndef HAVE_XSK_BUFF_ALLOC ++int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries); ++#endif + ++#endif + #endif /* __MLX5_EN_XSK_POOL_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0121-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0121-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..343e289 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0121-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,188 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c + +Change-Id: Ic8c617c581267560555c4e11979433c82b7586bb +--- + .../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 125 ++++++++++++++++++ + 1 file changed, 125 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c +@@ -1,12 +1,92 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2019 Mellanox Technologies. 
*/ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ + #include "rx.h" + #include "en/xdp.h" ++#ifdef HAVE_XDP_SOCK_DRV_H + #include ++#else ++#include ++#endif + + /* RX data path */ + ++#ifndef HAVE_XSK_BUFF_ALLOC ++bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count) ++{ ++ /* Check in advance that we have enough frames, instead of allocating ++ * one-by-one, failing and moving frames to the Reuse Ring. ++ */ ++ return xsk_umem_has_addrs_rq(rq->umem, count); ++} ++ ++int mlx5e_xsk_page_alloc_pool(struct mlx5e_rq *rq, ++ struct mlx5e_dma_info *dma_info) ++{ ++ struct xdp_umem *umem = rq->umem; ++ u64 handle; ++ ++ if (!xsk_umem_peek_addr_rq(umem, &handle)) ++ return -ENOMEM; ++ ++#ifdef HAVE_XSK_UMEM_ADJUST_OFFSET ++ dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle, ++ rq->buff.umem_headroom); ++#else ++ dma_info->xsk.handle = handle + rq->buff.umem_headroom; ++#endif ++ dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle); ++ ++ /* No need to add headroom to the DMA address. In striding RQ case, we ++ * just provide pages for UMR, and headroom is counted at the setup ++ * stage when creating a WQE. In non-striding RQ case, headroom is ++ * accounted in mlx5e_alloc_rx_wqe. ++ */ ++ dma_info->addr = xdp_umem_get_dma(umem, handle); ++ ++#ifdef HAVE_XSK_UMEM_RELEASE_ADDR_RQ ++ xsk_umem_release_addr_rq(umem); ++#else ++ xsk_umem_discard_addr_rq(umem); ++#endif ++ ++ dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE, ++ DMA_BIDIRECTIONAL); ++ ++ return 0; ++} ++ ++static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle) ++{ ++ xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask); ++} ++ ++/* XSKRQ uses pages from UMEM, they must not be released. They are returned to ++ * the userspace if possible, and if not, this function is called to reuse them ++ * in the driver. ++ */ ++void mlx5e_xsk_page_release(struct mlx5e_rq *rq, ++ struct mlx5e_dma_info *dma_info) ++{ ++ mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle); ++} ++ ++/* Return a frame back to the hardware to fill in again. It is used by XDP when ++ * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP. ++ */ ++void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle) ++{ ++ struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca); ++ ++ mlx5e_xsk_recycle_frame(rq, handle); ++} ++ ++void mlx5e_fill_xdp_buff_for_old_xsk(struct mlx5e_rq *rq, void *va, u16 headroom, ++ u32 len, struct xdp_buff *xdp, struct mlx5e_dma_info *di); ++#endif /* HAVE_XSK_BUFF_ALLOC */ ++ + static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data, + u32 cqe_bcnt) + { +@@ -29,7 +109,16 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_m + u32 head_offset, + u32 page_idx) + { ++#ifdef HAVE_XSK_BUFF_ALLOC + struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk; ++#else ++ struct xdp_buff xdp_old; ++ struct xdp_buff *xdp = &xdp_old; ++ struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; ++ u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom; ++ void *va, *data; ++ u32 frag_size; ++#endif + u32 cqe_bcnt32 = cqe_bcnt; + + /* Check packet size. 
Note LRO doesn't use linear SKB */ +@@ -45,9 +134,22 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_m + */ + WARN_ON_ONCE(head_offset); + ++#ifdef HAVE_XSK_BUFF_ALLOC + xdp->data_end = xdp->data + cqe_bcnt32; + xdp_set_data_meta_invalid(xdp); ++#ifdef HAVE_XSK_BUFF_DMA_SYNC_FOR_CPU_2_PARAMS + xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool); ++#else ++ xsk_buff_dma_sync_for_cpu(xdp); ++#endif ++#else ++ va = di->xsk.data; ++ data = va + rx_headroom; ++ frag_size = rq->buff.headroom + cqe_bcnt32; ++ ++ dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL); ++ mlx5e_fill_xdp_buff_for_old_xsk(rq, va, rx_headroom, cqe_bcnt32, xdp, di); ++#endif + net_prefetch(xdp->data); + + /* Possible flows: +@@ -82,7 +184,16 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_l + struct mlx5e_wqe_frag_info *wi, + u32 cqe_bcnt) + { ++#ifdef HAVE_XSK_BUFF_ALLOC + struct xdp_buff *xdp = wi->di->xsk; ++#else ++ struct xdp_buff xdp_old; ++ struct xdp_buff *xdp = &xdp_old; ++ struct mlx5e_dma_info *di = wi->di; ++ u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom; ++ void *va, *data; ++ u32 frag_size; ++#endif + + /* wi->offset is not used in this function, because xdp->data and the + * DMA address point directly to the necessary place. Furthermore, the +@@ -91,9 +202,22 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_l + */ + WARN_ON_ONCE(wi->offset); + ++#ifdef HAVE_XSK_BUFF_ALLOC + xdp->data_end = xdp->data + cqe_bcnt; + xdp_set_data_meta_invalid(xdp); ++#ifdef HAVE_XSK_BUFF_DMA_SYNC_FOR_CPU_2_PARAMS + xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool); ++#else ++ xsk_buff_dma_sync_for_cpu(xdp); ++#endif ++#else ++ va = di->xsk.data; ++ data = va + rx_headroom; ++ frag_size = rq->buff.headroom + cqe_bcnt; ++ ++ dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL); ++ mlx5e_fill_xdp_buff_for_old_xsk(rq, va, rx_headroom, cqe_bcnt, xdp, di); ++#endif + net_prefetch(xdp->data); + + if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) { +@@ -110,3 +234,4 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_l + */ + return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt); + } ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT*/ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0122-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0122-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..63fc9a0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0122-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,90 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h + +Change-Id: I2c30893cb783fc2d5a5b001afa702be9171770d2 +--- + .../ethernet/mellanox/mlx5/core/en/xsk/rx.h | 41 +++++++++++++++++-- + 1 file changed, 38 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h +@@ -4,11 +4,25 @@ + #ifndef __MLX5_EN_XSK_RX_H__ + #define __MLX5_EN_XSK_RX_H__ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ + #include "en.h" ++#ifdef HAVE_NDO_XSK_WAKEUP ++#ifdef HAVE_XDP_SOCK_DRV_H + #include ++#else ++#include ++#endif ++#endif + + /* RX data path */ + ++#ifndef HAVE_XSK_BUFF_ALLOC ++bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count); ++void mlx5e_xsk_page_release(struct mlx5e_rq *rq, ++ struct mlx5e_dma_info *dma_info); ++void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle); ++#endif + struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct 
mlx5e_rq *rq, + struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, +@@ -19,10 +33,16 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_l + struct mlx5e_wqe_frag_info *wi, + u32 cqe_bcnt); + ++#ifdef HAVE_XSK_BUFF_ALLOC + static inline int mlx5e_xsk_page_alloc_pool(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + dma_info->xsk = xsk_buff_alloc(rq->xsk_pool); ++#else ++ dma_info->xsk = xsk_buff_alloc(rq->umem); ++#endif ++ + if (!dma_info->xsk) + return -ENOMEM; + +@@ -35,18 +55,33 @@ static inline int mlx5e_xsk_page_alloc_p + + return 0; + } ++#else ++int mlx5e_xsk_page_alloc_pool(struct mlx5e_rq *rq, ++ struct mlx5e_dma_info *dma_info); ++#endif /* HAVE_XSK_BUFF_ALLOC */ + ++#ifdef HAVE_NDO_XSK_WAKEUP + static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err) + { +- if (!xsk_uses_need_wakeup(rq->xsk_pool)) ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *xsk_pool = rq->xsk_pool; ++ ++ if (!xsk_uses_need_wakeup(xsk_pool)) ++#else ++ struct xdp_umem *xsk_pool = rq->umem; ++ ++ if (!xsk_umem_uses_need_wakeup(xsk_pool)) ++#endif + return alloc_err; + + if (unlikely(alloc_err)) +- xsk_set_rx_need_wakeup(rq->xsk_pool); ++ xsk_set_rx_need_wakeup(xsk_pool); + else +- xsk_clear_rx_need_wakeup(rq->xsk_pool); ++ xsk_clear_rx_need_wakeup(xsk_pool); + + return false; + } ++#endif + ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ + #endif /* __MLX5_EN_XSK_RX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0123-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0123-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..5685f1b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0123-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,84 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c + +Change-Id: I42c8c6bc9d01f8c284486f1169bff96d7623a9ba +--- + .../mellanox/mlx5/core/en/xsk/setup.c | 28 +++++++++++++++++-- + 1 file changed, 26 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2019 Mellanox Technologies. 
*/ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + #include "setup.h" + #include "en/params.h" + #include "en/txrx.h" +@@ -48,7 +49,11 @@ static void mlx5e_build_xsk_cparam(struc + + static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, + struct mlx5e_params *params, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *umem, ++#endif + struct mlx5e_xsk_param *xsk, + struct mlx5e_rq *rq) + { +@@ -68,7 +73,11 @@ static int mlx5e_init_xsk_rq(struct mlx5 + rq->mdev = mdev; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->xdpsq = &c->rq_xdpsq; ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + rq->xsk_pool = pool; ++#else ++ rq->umem = umem; ++#endif + rq->stats = &c->priv->channel_stats[c->ix]->xskrq; + rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); + rq_xdp_ix = c->ix + params->num_channels * MLX5E_RQ_GROUP_XSK; +@@ -76,11 +85,20 @@ static int mlx5e_init_xsk_rq(struct mlx5 + if (err) + return err; + ++#ifdef HAVE_XDP_RXQ_INFO_REG_4_PARAMS + return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); ++#else ++ return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix); ++#endif + } + + static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *params, +- struct mlx5e_rq_param *rq_params, struct xsk_buff_pool *pool, ++ struct mlx5e_rq_param *rq_params, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *pool, ++#endif + struct mlx5e_create_cq_param *ccp, struct mlx5e_xsk_param *xsk) + { + int err; +@@ -93,7 +111,12 @@ static int mlx5e_open_xsk_rq(struct mlx5 + } + + int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params, +- struct mlx5e_xsk_param *xsk, struct xsk_buff_pool *pool, ++ struct mlx5e_xsk_param *xsk, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *pool, ++#endif + struct mlx5e_channel *c) + { + struct mlx5e_channel_param *cparam; +@@ -183,3 +206,4 @@ void mlx5e_deactivate_xsk(struct mlx5e_c + mlx5e_reporter_icosq_resume_recovery(c); + synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. 
*/ + } ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0124-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0124-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..3ccf69d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0124-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,37 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h + +Change-Id: I5c6b2e7b95848317cbaecd65b51378c0ae8b9e16 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h +@@ -4,6 +4,7 @@ + #ifndef __MLX5_EN_XSK_SETUP_H__ + #define __MLX5_EN_XSK_SETUP_H__ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + #include "en.h" + + struct mlx5e_xsk_param; +@@ -12,10 +13,16 @@ bool mlx5e_validate_xsk_param(struct mlx + struct mlx5e_xsk_param *xsk, + struct mlx5_core_dev *mdev); + int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params, +- struct mlx5e_xsk_param *xsk, struct xsk_buff_pool *pool, ++ struct mlx5e_xsk_param *xsk, ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *pool, ++#else ++ struct xdp_umem *umem, ++#endif + struct mlx5e_channel *c); + void mlx5e_close_xsk(struct mlx5e_channel *c); + void mlx5e_activate_xsk(struct mlx5e_channel *c); + void mlx5e_deactivate_xsk(struct mlx5e_channel *c); + ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ + #endif /* __MLX5_EN_XSK_SETUP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0125-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0125-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..b30790e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0125-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,97 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c + +Change-Id: I3219dc8abed31bfe1e34c25c5f6a2e3ce80bbb1a +--- + .../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 38 +++++++++++++++++-- + 1 file changed, 35 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +@@ -1,13 +1,22 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2019 Mellanox Technologies. 
*/ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + #include "tx.h" + #include "pool.h" + #include "en/xdp.h" + #include "en/params.h" ++#ifdef HAVE_XDP_SOCK_DRV_H + #include +- +-int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) ++#else ++#include ++#endif ++ ++int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid ++#ifdef HAVE_NDO_XSK_WAKEUP ++ , u32 flags ++#endif ++ ) + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_params *params = &priv->channels.params; +@@ -66,7 +75,11 @@ static void mlx5e_xsk_tx_post_err(struct + + bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *pool = sq->xsk_pool; ++#else ++ struct xdp_umem *pool = sq->umem; ++#endif + struct mlx5e_xmit_data xdptxd; + struct mlx5e_xdp_info xdpi; + bool work_done = true; +@@ -86,8 +99,11 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq + work_done = false; + break; + } +- ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + if (!xsk_tx_peek_desc(pool, &desc)) { ++#else ++ if (!xsk_umem_consume_tx(pool, &desc)) { ++#endif + /* TX will get stuck until something wakes it up by + * triggering NAPI. Currently it's expected that the + * application calls sendto() if there are consumed, but +@@ -96,11 +112,22 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq + break; + } + ++#ifdef HAVE_XSK_BUFF_ALLOC + xdptxd.dma_addr = xsk_buff_raw_get_dma(pool, desc.addr); + xdptxd.data = xsk_buff_raw_get_data(pool, desc.addr); ++#else ++ xdptxd.dma_addr = xdp_umem_get_dma(pool, desc.addr); ++ xdptxd.data = xdp_umem_get_data(pool, desc.addr); ++#endif + xdptxd.len = desc.len; + ++#ifdef HAVE_XSK_BUFF_ALLOC + xsk_buff_raw_dma_sync_for_device(pool, xdptxd.dma_addr, xdptxd.len); ++#else ++ dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr, ++ xdptxd.len, DMA_BIDIRECTIONAL); ++ ++#endif + + ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, + mlx5e_xmit_xdp_frame, sq, &xdptxd, &xdpi, check_result); +@@ -119,8 +146,13 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq + mlx5e_xdp_mpwqe_complete(sq); + mlx5e_xmit_xdp_doorbell(sq); + ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + xsk_tx_release(pool); ++#else ++ xsk_umem_consume_tx_done(pool); ++#endif + } + + return !(budget && work_done); + } ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0126-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0126-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..9c45bd2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0126-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,62 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h + +Change-Id: I4102004efe25554c48cc8012c0c400ff0ad03278 +--- + .../ethernet/mellanox/mlx5/core/en/xsk/tx.h | 28 ++++++++++++++++++- + 1 file changed, 27 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h +@@ -4,17 +4,31 @@ + #ifndef __MLX5_EN_XSK_TX_H__ + #define __MLX5_EN_XSK_TX_H__ + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ + #include "en.h" ++#ifdef HAVE_NDO_XSK_WAKEUP ++#ifdef HAVE_XDP_SOCK_DRV_H + #include ++#else ++#include ++#endif ++#endif + + /* TX data path */ + +-int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags); ++int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid ++#ifdef HAVE_NDO_XSK_WAKEUP ++ , u32 flags ++#endif ++ ); + + bool 
mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget); + ++#ifdef HAVE_NDO_XSK_WAKEUP + static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq) + { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + if (!xsk_uses_need_wakeup(sq->xsk_pool)) + return; + +@@ -22,6 +36,18 @@ static inline void mlx5e_xsk_update_tx_w + xsk_clear_tx_need_wakeup(sq->xsk_pool); + else + xsk_set_tx_need_wakeup(sq->xsk_pool); ++#else ++ if (!xsk_umem_uses_need_wakeup(sq->umem)) ++ return; ++ ++ if (sq->pc != sq->cc) ++ xsk_clear_tx_need_wakeup(sq->umem); ++ else ++ xsk_set_tx_need_wakeup(sq->umem); ++#endif ++ + } ++#endif + ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ + #endif /* __MLX5_EN_XSK_TX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0127-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0127-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..ea23aa8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0127-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,43 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h + +Change-Id: I5dbe19c92a9bedcab5999ffffe089b1f0945827b +--- + .../ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +@@ -120,8 +120,10 @@ static inline bool mlx5e_accel_tx_begin( + struct sk_buff *skb, + struct mlx5e_accel_tx_state *state) + { ++#ifdef HAVE_NETIF_F_GSO_UDP_L4 + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + mlx5e_udp_gso_handle_tx_skb(skb); ++#endif + + #ifdef CONFIG_MLX5_EN_TLS + /* May send SKBs and WQEs. 
*/ +@@ -174,11 +176,19 @@ static inline unsigned int mlx5e_accel_t + + static inline void mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, + struct sk_buff *skb, +- struct mlx5_wqe_eth_seg *eseg, u16 ihs) ++ struct mlx5_wqe_eth_seg *eseg, ++#if !defined(HAVE_XFRM_OFFLOAD_INNER_IPPROTO) && defined(CONFIG_MLX5_EN_IPSEC) ++ struct mlx5e_accel_tx_ipsec_state *ipsec_st, ++#endif ++ u16 ihs) + { + #ifdef CONFIG_MLX5_EN_IPSEC + if (xfrm_offload(skb)) ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO + mlx5e_ipsec_tx_build_eseg(priv, skb, eseg); ++#else ++ mlx5e_ipsec_tx_build_eseg(priv, skb, ipsec_st, eseg); ++#endif + #endif + + #ifdef CONFIG_MLX5_EN_MACSEC diff --git a/src/mlnx-ofa_kernel-5.8/backports/0128-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0128-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..60467b8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0128-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,212 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c + +Change-Id: I0b3d2b27c18b214baa2c63650b331cd82395dacd +--- + .../mellanox/mlx5/core/en_accel/ipsec.c | 65 ++++++++++++++++++- + 1 file changed, 64 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +@@ -48,9 +48,11 @@ + #include "en/ipsec_aso.h" + #include "../esw/ipsec.h" + ++#ifdef HAVE_XFRM_STATE_OFFLOAD + #ifndef XFRM_OFFLOAD_FULL + #define XFRM_OFFLOAD_FULL 4 + #endif ++#endif /* HAVE_XFRM_STATE_OFFLOAD */ + + struct mlx5e_ipsec_async_work { + struct delayed_work dwork; +@@ -316,10 +318,20 @@ initialize_lifetime_limit(struct mlx5e_i + if (MLX5_CAP_GEN(priv->mdev, fpga)) + return; + ++#ifndef HAVE_XFRM_STATE_EXPIRE ++ attrs->soft_packet_limit = IPSEC_NO_LIMIT; ++ attrs->hard_packet_limit = IPSEC_NO_LIMIT; ++ return; ++#endif ++ + hard_limit = x->lft.hard_packet_limit; + soft_limit = (x->lft.soft_packet_limit == IPSEC_NO_LIMIT) + ? 0 : x->lft.soft_packet_limit; ++#ifdef HAVE_XFRM_STATE_OFFLOAD + if (!(x->xso.flags & XFRM_OFFLOAD_FULL) || ++#else ++ if ( ++#endif + (hard_limit <= soft_limit) || + (hard_limit == IPSEC_NO_LIMIT)) { + attrs->soft_packet_limit = IPSEC_NO_LIMIT; +@@ -419,7 +431,11 @@ mlx5e_ipsec_build_accel_xfrm_attrs(struc + attrs->keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; + + /* action */ ++#ifndef HAVE_XFRM_STATE_OFFLOAD ++ attrs->action = (!(x->xso.dir == XFRM_DEV_OFFLOAD_IN)) ? ++#else + attrs->action = (!(x->xso.flags & XFRM_OFFLOAD_INBOUND)) ? 
++#endif + MLX5_ACCEL_ESP_ACTION_ENCRYPT : + MLX5_ACCEL_ESP_ACTION_DECRYPT; + /* flags */ +@@ -428,9 +444,11 @@ mlx5e_ipsec_build_accel_xfrm_attrs(struc + MLX5_ACCEL_ESP_FLAGS_TUNNEL; + + /* Valid till stack changes accepted */ ++#ifdef HAVE_XFRM_STATE_OFFLOAD + #define XFRM_OFFLOAD_FULL 4 + if (x->xso.flags & XFRM_OFFLOAD_FULL) + attrs->flags |= MLX5_ACCEL_ESP_FLAGS_FULL_OFFLOAD; ++#endif + + /* spi */ + attrs->spi = x->id.spi; +@@ -452,9 +470,15 @@ mlx5e_ipsec_build_accel_xfrm_attrs(struc + + static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x) + { ++#ifdef HAVE_NET_DEVICE_REAL_DEV + struct net_device *netdev = x->xso.real_dev; ++#else ++ struct net_device *netdev = x->xso.dev; ++#endif + struct mlx5_core_dev *mdev; ++#ifdef CONFIG_MLX5_ESWITCH + struct mlx5_eswitch *esw; ++#endif + struct mlx5e_priv *priv; + + priv = netdev_priv(netdev); +@@ -527,11 +551,23 @@ static inline int mlx5e_xfrm_validate_st + netdev_info(netdev, "IPv6 xfrm state offload is not supported by this device\n"); + return -EINVAL; + } ++ ++#ifdef HAVE_XFRM_STATE_OFFLOAD + if (x->xso.flags & XFRM_OFFLOAD_FULL) { ++#else ++/* For XFRM_OFFLOAD_INBOUND it can be replaced using “xso->dir == XFRM_DEV_OFFLOAD_IN“, ++ * And for XFRM_OFFLOAD_FULL, these can be replaced with if (0) (as if not supported) as this flag exists anyway only in BlueField ++ * kernel. ++ */ ++ if ( 0 ) { ++#endif + if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_FULL_OFFLOAD)) { + netdev_info(netdev, "IPsec full offload is not supported by this device.\n"); + return -EINVAL; + } ++#ifndef CONFIG_MLX5_ESWITCH ++ return -EINVAL; ++#else + esw = mdev->priv.eswitch; + if (!esw || esw->mode != MLX5_ESWITCH_OFFLOADS) { + netdev_info(netdev, "IPsec full offload allowed only in switchdev mode.\n"); +@@ -549,8 +585,10 @@ static inline int mlx5e_xfrm_validate_st + "IPsec crypto only offload is not allowed when devlink ipsec mode is full.\n"); + return -EINVAL; + } ++#endif + } + ++#ifdef HAVE_XFRM_STATE_OFFLOAD + if ((x->xso.flags & XFRM_OFFLOAD_FULL) && + ((x->lft.hard_byte_limit != XFRM_INF) || + (x->lft.soft_byte_limit != XFRM_INF))) { +@@ -561,6 +599,7 @@ static inline int mlx5e_xfrm_validate_st + x->lft.soft_byte_limit); + return -EINVAL; + } ++#endif + + return 0; + } +@@ -589,7 +628,11 @@ static void mlx5e_xfrm_fs_del_rule(struc + static int mlx5e_xfrm_add_state(struct xfrm_state *x) + { + struct mlx5e_ipsec_sa_entry *sa_entry = NULL; ++#ifdef HAVE_NET_DEVICE_REAL_DEV + struct net_device *netdev = x->xso.real_dev; ++#else ++ struct net_device *netdev = x->xso.dev; ++#endif + struct mlx5_accel_esp_xfrm_attrs attrs; + struct mlx5e_priv *priv; + unsigned int sa_handle; +@@ -641,7 +684,11 @@ static int mlx5e_xfrm_add_state(struct x + if (err) + goto err_hw_ctx; + ++#ifndef HAVE_XFRM_STATE_OFFLOAD ++ if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) { ++#else + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) { ++#endif + err = mlx5e_ipsec_sadb_rx_add(sa_entry, sa_handle); + if (err) + goto err_add_rule; +@@ -676,7 +723,11 @@ static void mlx5e_xfrm_del_state(struct + if (!sa_entry || sa_entry->is_removed) + return; + ++#ifndef HAVE_XFRM_STATE_OFFLOAD ++ if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) ++#else + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) ++#endif + mlx5e_ipsec_sadb_rx_del(sa_entry); + else + mlx5e_ipsec_sadb_tx_del(sa_entry); +@@ -812,6 +863,7 @@ struct mlx5e_ipsec_modify_state_work { + struct mlx5e_ipsec_sa_entry *sa_entry; + }; + ++#ifdef HAVE_XDO_DEV_STATE_ADVANCE_ESN + static void _update_xfrm_state(struct work_struct *work) + { + int ret; 
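
The ipsec.c hunks above bridge two generations of the kernel's XFRM offload API: older trees mark an inbound state with the XFRM_OFFLOAD_INBOUND bit in x->xso.flags, while newer trees carry an explicit direction in x->xso.dir (XFRM_DEV_OFFLOAD_IN), which appears to be why the patch keys each test on HAVE_XFRM_STATE_OFFLOAD from the OFED compat autoconf step. A minimal sketch of that compat pattern, not taken from the patch itself — the helper name compat_xfrm_is_inbound is hypothetical, only the fields and the HAVE_* macro come from the hunks above:

    /* Sketch only: one inline hides both historical ways of asking
     * "is this xfrm_state offloaded in the RX direction?", selected
     * at build time by the compat-detected HAVE_XFRM_STATE_OFFLOAD.
     */
    #include <net/xfrm.h>

    static inline bool compat_xfrm_is_inbound(const struct xfrm_state *x)
    {
    #ifdef HAVE_XFRM_STATE_OFFLOAD
    	/* older API: direction encoded as a flag bit */
    	return x->xso.flags & XFRM_OFFLOAD_INBOUND;
    #else
    	/* newer API: explicit direction field */
    	return x->xso.dir == XFRM_DEV_OFFLOAD_IN;
    #endif
    }

Callers such as the add/del state paths could then test compat_xfrm_is_inbound(x) once instead of repeating the #ifdef at every site; the patch above keeps the per-site #ifdef form, which is the more literal but noisier way to express the same mapping.
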
+@@ -851,13 +903,16 @@ static void mlx5e_xfrm_advance_esn_state + INIT_WORK(&modify_work->work, _update_xfrm_state); + WARN_ON(!queue_work(sa_entry->ipsec->wq, &modify_work->work)); + } ++#endif + + static const struct xfrmdev_ops mlx5e_ipsec_xfrmdev_ops = { + .xdo_dev_state_add = mlx5e_xfrm_add_state, + .xdo_dev_state_delete = mlx5e_xfrm_del_state, + .xdo_dev_state_free = mlx5e_xfrm_free_state, + .xdo_dev_offload_ok = mlx5e_ipsec_offload_ok, ++#ifdef HAVE_XDO_DEV_STATE_ADVANCE_ESN + .xdo_dev_state_advance_esn = mlx5e_xfrm_advance_esn_state, ++#endif + }; + + void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) +@@ -964,14 +1019,22 @@ static void _mlx5e_ipsec_async_event(str + goto out_xs_state; + + /* Life time event */ +- if (!hard_cnt) /* Notify hard lifetime to xfrm stack */ ++ if (!hard_cnt) { /* Notify hard lifetime to xfrm stack */ ++#ifdef HAVE_XFRM_STATE_EXPIRE ++ xfrm_state_expire(xs, 1); ++#endif + goto out_xs_state; ++ } + + /* 0: no more soft + * 1: notify soft + */ + if (lft->round_soft) { + lft->round_soft--; ++#ifdef HAVE_XFRM_STATE_EXPIRE ++ if (!lft->round_soft) ++ xfrm_state_expire(xs, 0); ++#endif + } + + if (!lft->is_simulated) /* hard_limit < IPSEC_HW_LIMIT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0129-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0129-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..195c60c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0129-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,40 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c + +Change-Id: I2f6fbd2dbb446ba03a6f615ac0688536884be4c1 +--- + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + ++#ifdef CONFIG_MLX5_ESWITCH + #include + #include "accel/ipsec_offload.h" + #include "../esw/ipsec.h" +@@ -617,7 +618,6 @@ static int tx_add_rule_full(struct mlx5e + int err = 0; + + esw = mdev->priv.eswitch; +- + if (esw->offloads.ipsec != DEVLINK_ESWITCH_IPSEC_MODE_FULL) + return -ENOTSUPP; + +@@ -686,7 +686,6 @@ out: + kvfree(spec); + return err; + } +- + static int tx_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, +@@ -937,3 +936,4 @@ int mlx5e_accel_ipsec_fs_init(struct mlx + + return err; + } ++#endif /* CONFIG_MLX5_ESWITCH */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0130-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0130-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..80913f9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0130-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,40 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h + +Change-Id: I3a04ae2dd1d9b94e4474874eae7d3fbdca1e5917 +--- + .../ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h +@@ -3,13 +3,13 @@ + + #ifndef __MLX5_IPSEC_STEERING_H__ + #define __MLX5_IPSEC_STEERING_H__ +- + #include "en.h" + #include "ipsec.h" + #include "accel/ipsec_offload.h" + #include "en/fs.h" + +-#ifdef CONFIG_MLX5_EN_IPSEC ++ ++#if defined(CONFIG_MLX5_EN_IPSEC) && defined(CONFIG_MLX5_ESWITCH) + void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_priv *priv); + int mlx5e_accel_ipsec_fs_init(struct mlx5e_priv *priv); + int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_priv *priv, +@@ -22,5 +22,12 @@ void mlx5e_accel_ipsec_fs_del_rule(struc + #else + static inline void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_priv *priv) {} + static inline int mlx5e_accel_ipsec_fs_init(struct mlx5e_priv *priv) { return 0; } ++static inline int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_priv *priv, ++ struct mlx5_accel_esp_xfrm_attrs *attrs, ++ u32 ipsec_obj_id, ++ struct mlx5e_ipsec_rule *ipsec_rule) { return 0; } ++static inline void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_priv *priv, ++ struct mlx5_accel_esp_xfrm_attrs *attrs, ++ struct mlx5e_ipsec_rule *ipsec_rule) {} + #endif + #endif /* __MLX5_IPSEC_STEERING_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0131-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0131-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..fe8e18f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0131-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,301 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c + +Change-Id: I600bc912c6dcf6299d714ba272d420ffc56c1ee3 +--- + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 143 +++++++++++++++++- + 1 file changed, 136 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +@@ -135,6 +135,9 @@ static int mlx5e_ipsec_remove_trailer(st + + static void mlx5e_ipsec_set_swp(struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg, u8 mode, ++#ifndef 
HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ struct mlx5e_accel_tx_ipsec_state *ipsec_st, ++#endif + struct xfrm_offload *xo) + { + /* Tunnel Mode: +@@ -149,7 +152,11 @@ static void mlx5e_ipsec_set_swp(struct s + * SWP: OutL3 InL3 InL4 + * Pkt: MAC IP ESP UDP VXLAN IP L4 + */ +- ++#ifndef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ struct ethhdr *eth; ++#endif ++ u8 inner_ipproto = 0; ++ struct xfrm_state *x; + /* Shared settings */ + eseg->swp_outer_l3_offset = skb_network_offset(skb) / 2; + if (skb->protocol == htons(ETH_P_IPV6)) +@@ -157,11 +164,36 @@ static void mlx5e_ipsec_set_swp(struct s + + /* Tunnel mode */ + if (mode == XFRM_MODE_TUNNEL) { ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ inner_ipproto = xo->inner_ipproto; ++#endif ++ /* Backport code to support kernels that don't have IPsec Tunnel mode Fix: ++ * 45a98ef4922d net/xfrm: IPsec tunnel mode fix inner_ipproto setting in sec_path ++ */ ++ if (!inner_ipproto) { ++ x = xfrm_input_state(skb); ++ switch (x->props.family) { ++ case AF_INET: ++ inner_ipproto = ((struct iphdr *)(skb->data + skb_inner_network_offset(skb)))->protocol; ++ break; ++ case AF_INET6: ++ inner_ipproto = ((struct ipv6hdr *)(skb->data + skb_inner_network_offset(skb)))->nexthdr; ++ break; ++ default: ++ break; ++ } ++ } ++ ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ xo->inner_ipproto = inner_ipproto; ++#else ++ ipsec_st->inner_ipproto = inner_ipproto; ++#endif + eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; + if (xo->proto == IPPROTO_IPV6) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + +- switch (xo->inner_ipproto) { ++ switch (inner_ipproto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; +@@ -179,7 +211,32 @@ static void mlx5e_ipsec_set_swp(struct s + if (mode != XFRM_MODE_TRANSPORT) + return; + +- if (!xo->inner_ipproto) { ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ inner_ipproto = xo->inner_ipproto; ++#else ++ if (skb->inner_protocol_type != ENCAP_TYPE_ETHER){ ++ return; ++ } ++ ++ if (skb->inner_protocol_type == ENCAP_TYPE_IPPROTO) { ++ inner_ipproto = skb->inner_ipproto; ++ } else { ++ eth = (struct ethhdr *)skb_inner_mac_header(skb); ++ switch (ntohs(eth->h_proto)) { ++ case ETH_P_IP: ++ inner_ipproto = ((struct iphdr *)(skb->data + skb_inner_network_offset(skb)))->protocol;; ++ break; ++ case ETH_P_IPV6: ++ inner_ipproto = ((struct ipv6hdr *)(skb->data + skb_inner_network_offset(skb)))->nexthdr; ++ break; ++ default: ++ break; ++ } ++ } ++ ipsec_st->inner_ipproto = inner_ipproto; ++#endif ++ ++ if (!inner_ipproto) { + switch (xo->proto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_UDP; +@@ -193,7 +250,7 @@ static void mlx5e_ipsec_set_swp(struct s + } + } else { + /* Tunnel(VXLAN TCP/UDP) over Transport Mode */ +- switch (xo->inner_ipproto) { ++ switch (inner_ipproto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; +@@ -279,6 +336,26 @@ static void mlx5e_ipsec_set_metadata(str + ntohs(mdata->content.tx.seq)); + } + ++/* Copy from upstream net/ipv4/esp4.c */ ++#ifndef HAVE_ESP_OUTPUT_FILL_TRAILER ++ static ++void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) ++{ ++ /* Fill padding... 
*/ ++ if (tfclen) { ++ memset(tail, 0, tfclen); ++ tail += tfclen; ++ } ++ do { ++ int i; ++ for (i = 0; i < plen - 2; i++) ++ tail[i] = i + 1; ++ } while (0); ++ tail[plen - 2] = plen - 2; ++ tail[plen - 1] = proto; ++} ++#endif ++ + void mlx5e_ipsec_handle_tx_wqe(struct mlx5e_tx_wqe *wqe, + struct mlx5e_accel_tx_ipsec_state *ipsec_st, + struct mlx5_wqe_inline_seg *inlseg) +@@ -314,17 +391,24 @@ static int mlx5e_ipsec_set_state(struct + } + + void mlx5e_ipsec_tx_build_eseg(struct mlx5e_priv *priv, struct sk_buff *skb, ++#ifndef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ struct mlx5e_accel_tx_ipsec_state *ipsec_st, ++#endif + struct mlx5_wqe_eth_seg *eseg) + { + struct xfrm_offload *xo = xfrm_offload(skb); + struct xfrm_encap_tmpl *encap; + struct xfrm_state *x; ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + struct sec_path *sp; ++#endif + u8 l3_proto; + ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + sp = skb_sec_path(skb); + if (unlikely(sp->len != 1)) + return; ++#endif + + x = xfrm_input_state(skb); + if (unlikely(!x)) +@@ -335,7 +419,11 @@ void mlx5e_ipsec_tx_build_eseg(struct ml + skb->protocol != htons(ETH_P_IPV6)))) + return; + ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO + mlx5e_ipsec_set_swp(skb, eseg, x->props.mode, xo); ++#else ++ mlx5e_ipsec_set_swp(skb, eseg, x->props.mode, ipsec_st, xo); ++#endif + + l3_proto = (x->props.family == AF_INET) ? + ((struct iphdr *)skb_network_header(skb))->protocol : +@@ -364,12 +452,18 @@ bool mlx5e_ipsec_handle_tx_skb(struct ne + struct mlx5e_priv *priv = netdev_priv(netdev); + struct xfrm_offload *xo = xfrm_offload(skb); + struct mlx5e_ipsec_sa_entry *sa_entry; +- struct mlx5e_ipsec_metadata *mdata; ++ struct mlx5e_ipsec_metadata *mdata = NULL; + struct xfrm_state *x; ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + struct sec_path *sp; ++#endif + ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + sp = skb_sec_path(skb); + if (unlikely(sp->len != 1)) { ++#else ++ if (unlikely(skb->sp->len != 1)) { ++#endif + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_bundle); + goto drop; + } +@@ -422,11 +516,18 @@ mlx5e_ipsec_build_sp(struct net_device * + struct mlx5e_priv *priv = netdev_priv(netdev); + struct xfrm_offload *xo; + struct xfrm_state *xs; ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + struct sec_path *sp; ++#endif + u32 sa_handle; + ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + sp = secpath_set(skb); + if (unlikely(!sp)) { ++#else ++ skb->sp = secpath_dup(skb->sp); ++ if (unlikely(!skb->sp)) { ++#endif + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc); + return NULL; + } +@@ -438,9 +539,14 @@ mlx5e_ipsec_build_sp(struct net_device * + return NULL; + } + ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + sp = skb_sec_path(skb); + sp->xvec[sp->len++] = xs; + sp->olen++; ++#else ++ skb->sp->xvec[skb->sp->len++] = xs; ++ skb->sp->olen++; ++#endif + + xo = xfrm_offload(skb); + xo->flags = CRYPTO_DONE; +@@ -500,7 +606,9 @@ handle_rx_skb_full(struct mlx5e_priv *pr + struct mlx5_cqe64 *cqe) + { + struct xfrm_state *xs; ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + struct sec_path *sp; ++#endif + struct iphdr *v4_hdr; + u8 ip_ver; + +@@ -514,11 +622,20 @@ handle_rx_skb_full(struct mlx5e_priv *pr + if (!xs) + return; + +- sp = secpath_set(skb); +- if (unlikely(!sp)) ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER ++ sp = secpath_set(skb); ++ if (unlikely(!sp)) ++#else ++ skb->sp = secpath_dup(skb->sp); ++ if (unlikely(!skb->sp)) ++#endif + return; + ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + sp->xvec[sp->len++] = xs; ++#else ++ skb->sp->xvec[skb->sp->len++] = xs; ++#endif + return; + } + +@@ 
-530,12 +647,19 @@ handle_rx_skb_inline(struct mlx5e_priv * + u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata); + struct xfrm_offload *xo; + struct xfrm_state *xs; ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + struct sec_path *sp; ++#endif + u32 sa_handle; + + sa_handle = MLX5_IPSEC_METADATA_HANDLE(ipsec_meta_data); ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + sp = secpath_set(skb); + if (unlikely(!sp)) { ++#else ++ skb->sp = secpath_dup(skb->sp); ++ if (unlikely(!skb->sp)) { ++#endif + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc); + return; + } +@@ -546,9 +670,14 @@ handle_rx_skb_inline(struct mlx5e_priv * + return; + } + ++#ifdef HAVE_SECPATH_SET_RETURN_POINTER + sp = skb_sec_path(skb); + sp->xvec[sp->len++] = xs; + sp->olen++; ++#else ++ skb->sp->xvec[skb->sp->len++] = xs; ++ skb->sp->olen++; ++#endif + + xo = xfrm_offload(skb); + xo->flags = CRYPTO_DONE; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0132-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0132-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..8f39733 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0132-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,95 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h + +Change-Id: Ic68e0290438704e17540ad7041bed443eca64227 +--- + .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h | 35 ++++++++++++++++--- + 1 file changed, 30 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h +@@ -49,6 +49,9 @@ struct mlx5e_accel_tx_ipsec_state { + struct xfrm_state *x; + u32 tailen; + u32 plen; ++#ifndef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ u8 inner_ipproto; ++#endif + }; + + #ifdef CONFIG_MLX5_EN_IPSEC +@@ -91,6 +94,9 @@ static inline bool mlx5e_ipsec_eseg_meta + } + + void mlx5e_ipsec_tx_build_eseg(struct mlx5e_priv *priv, struct sk_buff *skb, ++#ifndef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ struct mlx5e_accel_tx_ipsec_state *ipsec_st, ++#endif + struct mlx5_wqe_eth_seg *eseg); + + static inline netdev_features_t +@@ -98,6 +104,12 @@ mlx5e_ipsec_feature_check(struct sk_buff + { + struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp = skb_sec_path(skb); ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ u8 inner_ipproto = xo->inner_ipproto; ++#else ++ u8 inner_ipproto = 0; ++#endif ++ + + if (sp && sp->len && xo) { + struct xfrm_state *x = sp->xvec[0]; +@@ -105,7 +117,7 @@ mlx5e_ipsec_feature_check(struct sk_buff + if (!x || !x->xso.offload_handle) + goto out_disable; + +- if (xo->inner_ipproto) { ++ if (inner_ipproto) { + /* Cannot support tunnel packet over IPsec tunnel mode + * because we cannot offload three IP header csum + */ +@@ -113,8 +125,8 @@ mlx5e_ipsec_feature_check(struct sk_buff + goto out_disable; + + /* Only support UDP or TCP L4 checksum */ +- if (xo->inner_ipproto != IPPROTO_UDP && +- xo->inner_ipproto != IPPROTO_TCP) ++ if (inner_ipproto != IPPROTO_UDP && ++ inner_ipproto != IPPROTO_TCP) + goto out_disable; + } + +@@ -129,15 +141,28 @@ out_disable: + + static inline bool + mlx5e_ipsec_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, ++#ifndef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ struct mlx5e_accel_tx_ipsec_state *ipsec_st, ++#endif + struct mlx5_wqe_eth_seg *eseg) + { +- u8 inner_ipproto; ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ struct xfrm_offload *xo; ++ 
u32 inner_ipproto; ++#else ++ u8 inner_ipproto = ipsec_st->inner_ipproto; ++#endif + + if (!mlx5e_ipsec_eseg_meta(eseg)) + return false; + ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO ++ xo = xfrm_offload(skb); ++ inner_ipproto = xo->inner_ipproto; ++#else ++ inner_ipproto = ipsec_st->inner_ipproto; ++#endif + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; +- inner_ipproto = xfrm_offload(skb)->inner_ipproto; + if (inner_ipproto) { + eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (inner_ipproto == IPPROTO_TCP || inner_ipproto == IPPROTO_UDP) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0133-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0133-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..8f9ac13 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0133-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,93 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c + +Change-Id: Ib15b08b1fb3f4966a8b1f65f59a39b48aeaa6638 +--- + .../mellanox/mlx5/core/en_accel/ktls.c | 26 +++++++++++++++++-- + 1 file changed, 24 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + // Copyright (c) 2019 Mellanox Technologies. + ++#ifdef HAVE_KTLS_STRUCTS ++ + #include "en.h" + #include "en_accel/tls.h" + #include "en_accel/ktls.h" +@@ -22,7 +24,11 @@ static int mlx5e_ktls_add(struct net_dev + if (direction == TLS_OFFLOAD_CTX_DIR_TX) + err = mlx5e_ktls_add_tx(netdev, sk, crypto_info, start_offload_tcp_sn); + else ++#ifdef HAVE_KTLS_RX_SUPPORT + err = mlx5e_ktls_add_rx(netdev, sk, crypto_info, start_offload_tcp_sn); ++#else ++ err = -EOPNOTSUPP; ++#endif + + return err; + } +@@ -33,10 +39,13 @@ static void mlx5e_ktls_del(struct net_de + { + if (direction == TLS_OFFLOAD_CTX_DIR_TX) + mlx5e_ktls_del_tx(netdev, tls_ctx); ++#ifdef HAVE_KTLS_RX_SUPPORT + else + mlx5e_ktls_del_rx(netdev, tls_ctx); ++#endif + } + ++#ifdef HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC + static int mlx5e_ktls_resync(struct net_device *netdev, + struct sock *sk, u32 seq, u8 *rcd_sn, + enum tls_offload_ctx_dir direction) +@@ -44,14 +53,21 @@ static int mlx5e_ktls_resync(struct net_ + if (unlikely(direction != TLS_OFFLOAD_CTX_DIR_RX)) + return -EOPNOTSUPP; + ++#ifdef HAVE_KTLS_RX_SUPPORT + mlx5e_ktls_rx_resync(netdev, sk, seq, rcd_sn); + return 0; ++#else ++ return -EOPNOTSUPP; ++#endif + } ++#endif + + static const struct tlsdev_ops mlx5e_ktls_ops = { + .tls_dev_add = mlx5e_ktls_add, + .tls_dev_del = mlx5e_ktls_del, ++#ifdef HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC + .tls_dev_resync = mlx5e_ktls_resync, ++#endif + }; + + bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev) +@@ -85,12 +101,15 @@ void mlx5e_ktls_build_netdev(struct mlx5 + netdev->features |= NETIF_F_HW_TLS_TX; + } + +- if (mlx5e_accel_is_ktls_rx(mdev)) +- netdev->hw_features |= NETIF_F_HW_TLS_RX; ++#ifdef HAVE_KTLS_RX_SUPPORT ++ if (mlx5e_accel_is_ktls_rx(mdev)) ++ netdev->hw_features |= NETIF_F_HW_TLS_RX; ++#endif + + netdev->tlsdev_ops = &mlx5e_ktls_ops; + } + ++#ifdef HAVE_KTLS_RX_SUPPORT + int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -138,3 +157,6 @@ void mlx5e_ktls_cleanup_rx(struct mlx5e_ + + destroy_workqueue(priv->tls->rx_wq); + } ++#endif /* HAVE_KTLS_RX_SUPPORT */ 
++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0134-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0134-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..918f1c8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0134-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,86 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h + +Change-Id: I92bcf893a32c3833c68549923dcf36e97e790806 +--- + .../mellanox/mlx5/core/en_accel/ktls.h | 50 ++++++++++--------- + 1 file changed, 27 insertions(+), 23 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h +@@ -6,9 +6,17 @@ + + #include "en.h" + +-#ifdef CONFIG_MLX5_EN_TLS +- ++#if defined CONFIG_MLX5_EN_TLS && defined HAVE_KTLS_STRUCTS + void mlx5e_ktls_build_netdev(struct mlx5e_priv *priv); ++bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev); ++#else ++static inline void mlx5e_ktls_build_netdev(struct mlx5e_priv *priv) ++{ ++} ++static inline bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev) { return false; } ++#endif ++ ++#if defined(CONFIG_MLX5_EN_TLS) && defined(HAVE_KTLS_RX_SUPPORT) + int mlx5e_ktls_init_rx(struct mlx5e_priv *priv); + void mlx5e_ktls_cleanup_rx(struct mlx5e_priv *priv); + int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable); +@@ -16,26 +24,8 @@ struct mlx5e_ktls_resync_resp * + mlx5e_ktls_rx_resync_create_resp_list(void); + void mlx5e_ktls_rx_resync_destroy_resp_list(struct mlx5e_ktls_resync_resp *resp_list); + +-static inline bool mlx5e_accel_is_ktls_tx(struct mlx5_core_dev *mdev) +-{ +- return !is_kdump_kernel() && +- mlx5_accel_is_ktls_tx(mdev); +-} +- +-bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev); +- +-static inline bool mlx5e_accel_is_ktls_device(struct mlx5_core_dev *mdev) +-{ +- return !is_kdump_kernel() && +- mlx5_accel_is_ktls_device(mdev); +-} + + #else +- +-static inline void mlx5e_ktls_build_netdev(struct mlx5e_priv *priv) +-{ +-} +- + static inline int mlx5e_ktls_init_rx(struct mlx5e_priv *priv) + { + return 0; +@@ -60,10 +50,24 @@ mlx5e_ktls_rx_resync_create_resp_list(vo + static inline void + mlx5e_ktls_rx_resync_destroy_resp_list(struct mlx5e_ktls_resync_resp *resp_list) {} + +-static inline bool mlx5e_accel_is_ktls_tx(struct mlx5_core_dev *mdev) { return false; } +-static inline bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev) { return false; } +-static inline bool mlx5e_accel_is_ktls_device(struct mlx5_core_dev *mdev) { return false; } + + #endif + ++#if defined(CONFIG_MLX5_EN_TLS) ++static inline bool mlx5e_accel_is_ktls_tx(struct mlx5_core_dev *mdev) ++{ ++ return !is_kdump_kernel() && ++ mlx5_accel_is_ktls_tx(mdev); ++} ++ ++ ++static inline bool mlx5e_accel_is_ktls_device(struct mlx5_core_dev *mdev) ++{ ++ return !is_kdump_kernel() && ++ mlx5_accel_is_ktls_device(mdev); ++} ++#else ++static inline bool mlx5e_accel_is_ktls_tx(struct mlx5_core_dev *mdev) { return false; } ++static inline bool mlx5e_accel_is_ktls_device(struct mlx5_core_dev *mdev) { return false; } ++#endif + #endif /* __MLX5E_TLS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0135-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0135-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..ae28778 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0135-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,100 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c + +Change-Id: I32e25b9af4d84f8ef783db930f22dd155d87b7f7 +--- + .../mellanox/mlx5/core/en_accel/ktls_rx.c | 22 ++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + // Copyright (c) 2019 Mellanox Technologies. + ++#ifdef HAVE_KTLS_RX_SUPPORT ++ + #include + #include "en_accel/en_accel.h" + #include "en_accel/tls.h" +@@ -34,7 +36,9 @@ enum { + }; + + struct mlx5e_ktls_rx_resync_ctx { ++#ifdef HAVE_TLS_OFFLOAD_RESYNC_ASYNC_STRUCT + struct tls_offload_resync_async core; ++#endif + struct work_struct work; + struct mlx5e_priv *priv; + refcount_t refcnt; +@@ -405,7 +409,9 @@ void mlx5e_ktls_handle_get_psv_completio + struct mlx5e_ktls_rx_resync_ctx *resync; + u8 tracker_state, auth_state, *ctx; + struct device *dev; ++#ifdef HAVE_TLS_OFFLOAD_RX_RESYNC_ASYNC_REQUEST_START + u32 hw_seq; ++#endif + + priv_rx = buf->priv_rx; + resync = &priv_rx->resync; +@@ -425,9 +431,14 @@ void mlx5e_ktls_handle_get_psv_completio + goto out; + } + ++#ifdef HAVE_TLS_OFFLOAD_RX_RESYNC_ASYNC_REQUEST_START + hw_seq = MLX5_GET(tls_progress_params, ctx, hw_resync_tcp_sn); + tls_offload_rx_resync_async_request_end(priv_rx->sk, cpu_to_be32(hw_seq)); + priv_rx->rq_stats->tls_resync_req_end++; ++#else ++ tls_offload_rx_force_resync_request(priv_rx->sk); ++#endif ++ + out: + mlx5e_ktls_priv_rx_put(priv_rx); + dma_unmap_single(dev, buf->dma_addr, PROGRESS_PARAMS_PADDED_SIZE, DMA_FROM_DEVICE); +@@ -463,10 +474,12 @@ static void resync_update_sn(struct mlx5 + struct ethhdr *eth = (struct ethhdr *)(skb->data); + struct net_device *netdev = rq->netdev; + struct sock *sk = NULL; ++#ifdef HAVE_TLS_OFFLOAD_RX_RESYNC_ASYNC_REQUEST_START + unsigned int datalen; ++ __be32 seq; ++#endif + struct iphdr *iph; + struct tcphdr *th; +- __be32 seq; + int depth = 0; + + __vlan_get_protocol(skb, eth->h_proto, &depth); +@@ -504,10 +517,14 @@ static void resync_update_sn(struct mlx5 + if (unlikely(!resync_queue_get_psv(sk))) + goto unref; + ++#ifdef HAVE_TLS_OFFLOAD_RX_RESYNC_ASYNC_REQUEST_START + seq = th->seq; + datalen = skb->len - depth; + tls_offload_rx_resync_async_request_start(sk, seq, datalen); + rq->stats->tls_resync_req_start++; ++#else ++ tls_offload_rx_force_resync_request(sk); ++#endif + + unref: + sock_gen_put(sk); +@@ -624,8 +641,10 @@ int mlx5e_ktls_add_rx(struct net_device + accel_rule_init(&priv_rx->rule, priv); + resync = &priv_rx->resync; + resync_init(resync, priv); ++#ifdef HAVE_TLS_OFFLOAD_RESYNC_ASYNC_STRUCT + tls_offload_ctx_rx(tls_ctx)->resync_async = &resync->core; + tls_offload_rx_resync_set_type(sk, TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC); ++#endif + + err = post_rx_param_wqes(priv->channels.c[rxq], priv_rx, start_offload_tcp_sn); + if (err) +@@ -745,3 +764,4 @@ bool mlx5e_ktls_rx_handle_resync_list(st + + return i == budget; + } ++#endif /* HAVE_KTLS_RX_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0136-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0136-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..440d54e --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0136-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,34 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c + +Change-Id: I8dd3f3086a4a2b0d83777d16b25d0104ac9c64c3 +--- + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +@@ -1,6 +1,8 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + // Copyright (c) 2019 Mellanox Technologies. + ++#ifdef HAVE_KTLS_STRUCTS ++ + #include "en_accel/tls.h" + #include "en_accel/ktls_txrx.h" + #include "en_accel/ktls_utils.h" +@@ -475,6 +477,7 @@ bool mlx5e_ktls_handle_tx_skb(struct tls + goto out; + WARN_ON_ONCE(1); + fallthrough; ++ + case MLX5E_KTLS_SYNC_FAIL: + goto err_out; + } +@@ -494,3 +497,5 @@ err_out: + dev_kfree_skb_any(skb); + return false; + } ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0137-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0137-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..d56adac --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0137-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,110 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h + +Change-Id: I69bf80d23472f7ce8c693881d515c6f206a5dee1 +--- + .../mellanox/mlx5/core/en_accel/ktls_txrx.h | 70 +++++++++++++++++-- + 1 file changed, 65 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h +@@ -14,18 +14,75 @@ struct mlx5e_accel_tx_tls_state { + u32 tls_tisn; + }; + ++#ifdef HAVE_KTLS_STRUCTS + u16 mlx5e_ktls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params); + + bool mlx5e_ktls_handle_tx_skb(struct tls_context *tls_ctx, struct mlx5e_txqsq *sq, + struct sk_buff *skb, int datalen, + struct mlx5e_accel_tx_tls_state *state); ++#else ++static inline ++u16 mlx5e_ktls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params) ++{ ++ return 0; ++} ++ ++static inline ++bool mlx5e_ktls_handle_tx_skb(struct tls_context *tls_ctx, struct mlx5e_txqsq *sq, ++ struct sk_buff *skb, int datalen, ++ struct mlx5e_accel_tx_tls_state *state) ++{ ++ return false; ++} ++#endif ++ ++#ifdef HAVE_KTLS_RX_SUPPORT + void mlx5e_ktls_handle_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 *cqe_bcnt); + + void mlx5e_ktls_handle_ctx_completion(struct mlx5e_icosq_wqe_info *wi); + void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, + struct mlx5e_icosq *sq); ++bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget); ++ ++static inline bool ++mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) ++{ ++ return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &c->async_icosq.state); ++} ++ ++#else ++static inline bool ++mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) ++{ ++ return false; ++} ++ ++static inline bool ++mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) ++{ ++ return false; ++} ++ ++static inline ++void mlx5e_ktls_handle_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, ++ struct mlx5_cqe64 *cqe, u32 *cqe_bcnt) 
++{ ++} ++ ++static inline ++void mlx5e_ktls_handle_ctx_completion(struct mlx5e_icosq_wqe_info *wi) ++{ ++} ++ ++static inline ++void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, ++ struct mlx5e_icosq *sq) ++{ ++} ++#endif + ++#ifdef HAVE_KTLS_STRUCTS + void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc); +@@ -40,14 +97,17 @@ mlx5e_ktls_tx_try_handle_resync_dump_com + } + return false; + } +- +-bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget); +- ++#else + static inline bool +-mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) ++mlx5e_ktls_tx_try_handle_resync_dump_comp(struct mlx5e_txqsq *sq, ++ struct mlx5e_tx_wqe_info *wi, ++ u32 *dma_fifo_cc) + { +- return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &c->async_icosq.state); ++ return false; + } ++ ++#endif /* HAVE_KTLS_STRUCTS */ ++ + #else + static inline bool + mlx5e_ktls_tx_try_handle_resync_dump_comp(struct mlx5e_txqsq *sq, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0138-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0138-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..3ff362f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0138-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,73 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c + +Change-Id: I224c6c18af32e891487a476da47bca9fd6e855e3 +--- + .../mellanox/mlx5/core/en_accel/tls.c | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c +@@ -160,17 +160,25 @@ static void mlx5e_tls_del(struct net_dev + direction == TLS_OFFLOAD_CTX_DIR_TX); + } + ++#if defined(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC_RX) || defined(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC) ++#ifdef HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC + static int mlx5e_tls_resync(struct net_device *netdev, struct sock *sk, + u32 seq, u8 *rcd_sn_data, + enum tls_offload_ctx_dir direction) ++#elif defined(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC_RX) ++static void mlx5e_tls_resync_rx(struct net_device *netdev, struct sock *sk, ++ u32 seq, u64 rcd_sn) ++#endif + { + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_tls_offload_context_rx *rx_ctx; ++#ifdef HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC + __be64 rcd_sn = *(__be64 *)rcd_sn_data; + + if (WARN_ON_ONCE(direction != TLS_OFFLOAD_CTX_DIR_RX)) + return -EINVAL; ++#endif + rx_ctx = mlx5e_get_tls_rx_context(tls_ctx); + + netdev_info(netdev, "resyncing seq %d rcd %lld\n", seq, +@@ -178,13 +186,20 @@ static int mlx5e_tls_resync(struct net_d + mlx5_accel_tls_resync_rx(priv->mdev, rx_ctx->handle, seq, rcd_sn); + atomic64_inc(&priv->tls->sw_stats.rx_tls_resync_reply); + ++#ifdef HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC + return 0; ++#endif + } ++#endif + + static const struct tlsdev_ops mlx5e_tls_ops = { + .tls_dev_add = mlx5e_tls_add, + .tls_dev_del = mlx5e_tls_del, ++#ifdef HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC + .tls_dev_resync = mlx5e_tls_resync, ++#elif defined(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC_RX) ++ .tls_dev_resync_rx = mlx5e_tls_resync_rx, ++#endif + }; + + void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) +@@ -206,12 +221,12 @@ void mlx5e_tls_build_netdev(struct mlx5e + 
netdev->features |= NETIF_F_HW_TLS_TX; + netdev->hw_features |= NETIF_F_HW_TLS_TX; + } +- ++#ifdef HAVE_NETIF_F_HW_TLS_RX + if (caps & MLX5_ACCEL_TLS_RX) { + netdev->features |= NETIF_F_HW_TLS_RX; + netdev->hw_features |= NETIF_F_HW_TLS_RX; + } +- ++#endif + if (!(caps & MLX5_ACCEL_TLS_LRO)) { + netdev->features &= ~NETIF_F_LRO; + netdev->hw_features &= ~NETIF_F_LRO; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0139-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0139-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..6022770 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0139-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,83 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h + +Change-Id: Ic6d0ecbaadb1c6c9e60709a132aae3aaef1888a9 +--- + .../mellanox/mlx5/core/en_accel/tls.h | 32 +++++++++++++++++-- + 1 file changed, 30 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h +@@ -61,7 +61,11 @@ struct mlx5e_tls { + }; + + struct mlx5e_tls_offload_context_tx { ++#ifdef HAVE_TLS_OFFLOAD_CONTEXT_TX_STRUCT + struct tls_offload_context_tx base; ++#else ++ struct tls_offload_context base; ++#endif + u32 expected_seq; + __be32 swid; + }; +@@ -70,14 +74,27 @@ static inline struct mlx5e_tls_offload_c + mlx5e_get_tls_tx_context(struct tls_context *tls_ctx) + { + BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context_tx) > ++#ifdef HAVE_TLS_OFFLOAD_CONTEXT_TX_STRUCT + TLS_OFFLOAD_CONTEXT_SIZE_TX); +- return container_of(tls_offload_ctx_tx(tls_ctx), ++#else ++ TLS_OFFLOAD_CONTEXT_SIZE); ++#endif ++ return container_of( ++#ifdef HAVE_TLS_OFFLOAD_CONTEXT_TX_STRUCT ++ tls_offload_ctx_tx(tls_ctx), ++#else ++ tls_offload_ctx(tls_ctx), ++#endif + struct mlx5e_tls_offload_context_tx, + base); + } + + struct mlx5e_tls_offload_context_rx { ++#ifdef HAVE_TLS_OFFLOAD_CONTEXT_RX_STRUCT + struct tls_offload_context_rx base; ++#else ++ struct tls_offload_context base; ++#endif + __be32 handle; + }; + +@@ -85,8 +102,17 @@ static inline struct mlx5e_tls_offload_c + mlx5e_get_tls_rx_context(struct tls_context *tls_ctx) + { + BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context_rx) > ++#ifdef HAVE_TLS_OFFLOAD_CONTEXT_RX_STRUCT + TLS_OFFLOAD_CONTEXT_SIZE_RX); +- return container_of(tls_offload_ctx_rx(tls_ctx), ++#else ++ TLS_OFFLOAD_CONTEXT_SIZE); ++#endif ++ return container_of( ++#ifdef HAVE_TLS_OFFLOAD_CONTEXT_RX_STRUCT ++ tls_offload_ctx_rx(tls_ctx), ++#else ++ tls_offload_ctx(tls_ctx), ++#endif + struct mlx5e_tls_offload_context_rx, + base); + } +@@ -114,9 +140,11 @@ static inline bool mlx5e_accel_is_tls_de + + static inline void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) + { ++#ifdef HAVE_UAPI_LINUX_TLS_H + if (!is_kdump_kernel() && + mlx5_accel_is_ktls_device(priv->mdev)) + mlx5e_ktls_build_netdev(priv); ++#endif + } + + static inline bool mlx5e_is_tls_on(struct mlx5e_priv *priv) { return false; } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0140-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0140-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..bfbd9bd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0140-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,81 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: 
+ drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c + +Change-Id: I9855117c5fecde6f22fa0614e55c570a400537af +--- + .../mellanox/mlx5/core/en_accel/tls_rxtx.c | 20 +++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c +@@ -31,6 +31,8 @@ + * + */ + ++#ifdef HAVE_UAPI_LINUX_TLS_H ++ + #include "en_accel/tls.h" + #include "en_accel/tls_rxtx.h" + #include "accel/accel.h" +@@ -298,6 +300,7 @@ err_out: + return false; + } + ++#ifdef HAVE_TLS_OFFLOAD_RX_RESYNC_REQUEST + static int tls_update_resync_sn(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5e_tls_metadata *mdata) +@@ -321,11 +324,17 @@ static int tls_update_resync_sn(struct n + #if IS_ENABLED(CONFIG_IPV6) + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)iph; +- ++#ifdef HAVE___INET6_LOOKUP_ESTABLISHED_HAS_7_PARAMS + sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo, + &ipv6h->saddr, th->source, + &ipv6h->daddr, ntohs(th->dest), +- netdev->ifindex, 0); ++ netdev->ifindex); ++#else ++ sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo, ++ &ipv6h->saddr, th->source, ++ &ipv6h->daddr, ntohs(th->dest), ++ netdev->ifindex, 0); ++#endif + #endif + } + if (!sk || sk->sk_state == TCP_TIME_WAIT) { +@@ -343,6 +352,7 @@ static int tls_update_resync_sn(struct n + out: + return 0; + } ++#endif + + /* FPGA tls rx handler */ + void mlx5e_tls_handle_rx_skb_metadata(struct mlx5e_rq *rq, struct sk_buff *skb, +@@ -354,14 +364,18 @@ void mlx5e_tls_handle_rx_skb_metadata(st + /* Use the metadata */ + mdata = (struct mlx5e_tls_metadata *)(skb->data + ETH_HLEN); + switch (mdata->content.recv.syndrome) { ++#ifdef HAVE_SK_BUFF_DECRYPTED + case SYNDROM_DECRYPTED: + skb->decrypted = 1; + break; ++#endif ++#ifdef HAVE_TLS_OFFLOAD_RX_RESYNC_REQUEST + case SYNDROM_RESYNC_REQUEST: + tls_update_resync_sn(rq->netdev, skb, mdata); + priv = netdev_priv(rq->netdev); + atomic64_inc(&priv->tls->sw_stats.rx_tls_resync_request); + break; ++#endif + case SYNDROM_AUTH_FAILED: + /* Authentication failure will be observed and verified by kTLS */ + priv = netdev_priv(rq->netdev); +@@ -388,3 +402,5 @@ u16 mlx5e_tls_get_stop_room(struct mlx5_ + /* Resync SKB. 
*/ + return mlx5e_stop_room_for_max_wqe(mdev); + } ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0141-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0141-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..973241e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0141-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,28 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h + +Change-Id: I4156bb1bb885f2f506e212654ccb68453802859e +--- + drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h +@@ -37,7 +37,7 @@ + #include "accel/accel.h" + #include "en_accel/ktls_txrx.h" + +-#ifdef CONFIG_MLX5_EN_TLS ++#if defined(CONFIG_MLX5_EN_TLS) && defined(HAVE_UAPI_LINUX_TLS_H) + + #include + #include "en.h" +@@ -86,6 +86,6 @@ static inline u16 mlx5e_tls_get_stop_roo + return 0; + } + +-#endif /* CONFIG_MLX5_EN_TLS */ ++#endif /* CONFIG_MLX5_EN_TLS && HAVE_UAPI_LINUX_TLS_H */ + + #endif /* __MLX5E_TLS_RXTX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0142-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0142-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..20c6785 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0142-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,96 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c + +Change-Id: I8ca092e44ac03af49a2043ff7c0b1d9bcd52a2d3 +--- + .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 24 +++++++++++++------ + 1 file changed, 17 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +@@ -30,6 +30,8 @@ + * SOFTWARE. 
+ */ + ++#ifndef HAVE_NET_FLOW_KEYS_H ++ + #include + #include + #include +@@ -96,7 +98,7 @@ struct arfs_rule { + + #define mlx5e_for_each_hash_arfs_rule(hn, tmp, hash, j) \ + for (j = 0; j < ARFS_HASH_SIZE; j++) \ +- hlist_for_each_entry_safe(hn, tmp, &hash[j], hlist) ++ compat_hlist_for_each_entry_safe(hn, tmp, &hash[j], hlist) + + static enum mlx5_traffic_types arfs_get_tt(enum arfs_type type) + { +@@ -396,6 +398,8 @@ static void arfs_may_expire_flow(struct + int i; + int j; + ++ COMPAT_HL_NODE ++ + spin_lock_bh(&priv->fs.arfs->arfs_lock); + mlx5e_for_each_arfs_rule(arfs_rule, htmp, priv->fs.arfs->arfs_tables, i, j) { + if (!work_pending(&arfs_rule->arfs_work) && +@@ -409,7 +413,7 @@ static void arfs_may_expire_flow(struct + } + } + spin_unlock_bh(&priv->fs.arfs->arfs_lock); +- hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) { ++ compat_hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) { + if (arfs_rule->rule) + mlx5_del_flow_rules(arfs_rule->rule); + hlist_del(&arfs_rule->hlist); +@@ -425,6 +429,8 @@ static void arfs_del_rules(struct mlx5e_ + int i; + int j; + ++ COMPAT_HL_NODE ++ + spin_lock_bh(&priv->fs.arfs->arfs_lock); + mlx5e_for_each_arfs_rule(rule, htmp, priv->fs.arfs->arfs_tables, i, j) { + hlist_del_init(&rule->hlist); +@@ -432,7 +438,7 @@ static void arfs_del_rules(struct mlx5e_ + } + spin_unlock_bh(&priv->fs.arfs->arfs_lock); + +- hlist_for_each_entry_safe(rule, htmp, &del_list, hlist) { ++ compat_hlist_for_each_entry_safe(rule, htmp, &del_list, hlist) { + cancel_work_sync(&rule->arfs_work); + if (rule->rule) + mlx5_del_flow_rules(rule->rule); +@@ -677,9 +683,10 @@ static struct arfs_rule *arfs_find_rule( + { + struct arfs_rule *arfs_rule; + struct hlist_head *head; ++ COMPAT_HL_NODE + + head = arfs_hash_bucket(arfs_t, fk->ports.src, fk->ports.dst); +- hlist_for_each_entry(arfs_rule, head, hlist) { ++ compat_hlist_for_each_entry(arfs_rule, head, hlist) { + if (arfs_cmp(&arfs_rule->tuple, fk)) + return arfs_rule; + } +@@ -695,8 +702,11 @@ int mlx5e_rx_flow_steer(struct net_devic + struct arfs_table *arfs_t; + struct arfs_rule *arfs_rule; + struct flow_keys fk; +- +- if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) ++#ifdef HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_3_PARAMS ++ if (!skb_flow_dissect_flow_keys(skb, &fk , 0)) ++#else ++ if (!skb_flow_dissect_flow_keys(skb, &fk)) ++#endif + return -EPROTONOSUPPORT; + + if (fk.basic.n_proto != htons(ETH_P_IP) && +@@ -729,4 +739,4 @@ int mlx5e_rx_flow_steer(struct net_devic + spin_unlock_bh(&arfs->arfs_lock); + return arfs_rule->filter_id; + } +- ++#endif /* HAVE_NET_FLOW_KEYS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0143-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0143-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..607c9a8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0143-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,22 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_common.c + +Change-Id: I3712d9e16846f659e7a9469badaee7a7f6df072e +--- + drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +@@ -38,7 +38,11 @@ + + void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc) + { ++#ifdef HAVE_PCIE_RELAXED_ORDERING_ENABLED + bool ro_pci_enable = 
pcie_relaxed_ordering_enabled(mdev->pdev); ++#else ++ bool ro_pci_enable = true; ++#endif + bool ro_write = MLX5_CAP_GEN(mdev, relaxed_ordering_write); + bool ro_read = MLX5_CAP_GEN(mdev, relaxed_ordering_read); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0144-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0144-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..41f8f3e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0144-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,166 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c + +Change-Id: I5cbc252c9fbc1a4365c36cdc86ca31cb5722493f +--- + .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 63 +++++++++++++++++-- + 1 file changed, 57 insertions(+), 6 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +@@ -61,6 +61,7 @@ enum { + MLX5_CAP_QCAM_REG(mdev, qpts) && \ + MLX5_CAP_QCAM_REG(mdev, qpdpm)) + ++#ifdef CONFIG_MLX5_CORE_EN_DCB + static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state); + static int mlx5e_set_dscp2prio(struct mlx5e_priv *priv, u8 dscp, u8 prio); + +@@ -901,6 +902,7 @@ static void mlx5e_dcbnl_setpfcstate(stru + cee_cfg->pfc_enable = state; + } + ++#ifdef HAVE_DCBNL_GETBUFFER + static int mlx5e_dcbnl_getbuffer(struct net_device *dev, + struct dcbnl_buffer *dcb_buffer) + { +@@ -984,8 +986,13 @@ static int mlx5e_dcbnl_setbuffer(struct + buffer_size, prio2buffer); + return err; + } ++#endif + ++#ifdef CONFIG_COMPAT_IS_DCBNL_OPS_CONST + static const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = { ++#else ++struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = { ++#endif + .ieee_getets = mlx5e_dcbnl_ieee_getets, + .ieee_setets = mlx5e_dcbnl_ieee_setets, + .ieee_getmaxrate = mlx5e_dcbnl_ieee_getmaxrate, +@@ -996,8 +1003,10 @@ static const struct dcbnl_rtnl_ops mlx5e + .ieee_delapp = mlx5e_dcbnl_ieee_delapp, + .getdcbx = mlx5e_dcbnl_getdcbx, + .setdcbx = mlx5e_dcbnl_setdcbx, ++#ifdef HAVE_DCBNL_GETBUFFER + .dcbnl_getbuffer = mlx5e_dcbnl_getbuffer, + .dcbnl_setbuffer = mlx5e_dcbnl_setbuffer, ++#endif + + /* CEE interfaces */ + .setall = mlx5e_dcbnl_setall, +@@ -1149,18 +1158,29 @@ static int mlx5e_update_trust_state_hw(s + + static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state) + { ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + struct tc_mqprio_qopt_offload mqprio = {.qopt.num_tc = MLX5E_MAX_NUM_TC}; ++#else ++ struct tc_mqprio_qopt mqprio = {.num_tc = MLX5E_MAX_NUM_TC}; ++#endif ++#endif ++ + struct mlx5e_params new_params; + bool reset = true; + int err; + + mutex_lock(&priv->state_lock); ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + mqprio.mode = priv->channels.params.mqprio.mode; + if (mqprio.mode != TC_MQPRIO_MODE_DCB) { + netdev_err(priv->netdev, "Can't change trust state while in channel mode.\n"); + err = -EINVAL; + goto unlock; + } ++#endif ++#endif + + new_params = priv->channels.params; + mlx5e_params_calc_trust_tx_min_inline_mode(priv->mdev, &new_params, +@@ -1173,22 +1193,41 @@ static int mlx5e_set_trust_state(struct + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_update_trust_state_hw, + &trust_state, reset); ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || 
defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + unlock: ++#endif ++#endif + mutex_unlock(&priv->state_lock); + if (err) + return err; + + /* In DSCP trust state, we need 8 send queues per channel */ +- if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP) { +- mutex_lock(&priv->state_lock); +- mlx5e_setup_tc_mqprio(priv, &mqprio); +- mutex_unlock(&priv->state_lock); +- } else if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_PCP) { ++ if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP) ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++ { ++ mutex_lock(&priv->state_lock); ++ mlx5e_setup_tc_mqprio(priv, &mqprio); ++ mutex_unlock(&priv->state_lock); ++ } ++#else ++ mlx5e_setup_tc(priv->netdev, MLX5E_MAX_NUM_TC); ++#endif ++ else if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_PCP) ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++ { + mutex_lock(&priv->state_lock); ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + mqprio.qopt.num_tc = priv->pcp_tc_num; ++#else ++ mqprio.num_tc = priv->pcp_tc_num; ++#endif + mlx5e_setup_tc_mqprio(priv, &mqprio); + mutex_unlock(&priv->state_lock); + } ++#else ++ mlx5e_setup_tc(priv->netdev, priv->pcp_tc_num); ++#endif + + return 0; + } +@@ -1210,12 +1249,19 @@ static int mlx5e_trust_initialize(struct + struct mlx5_core_dev *mdev = priv->mdev; + int err; + u8 trust_state; ++ ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + struct tc_mqprio_qopt_offload mqprio = {.qopt.num_tc = MLX5E_MAX_NUM_TC}; ++#else ++ struct tc_mqprio_qopt mqprio = {.num_tc = MLX5E_MAX_NUM_TC}; ++#endif ++#endif + const bool take_rtnl = priv->netdev->reg_state == NETREG_REGISTERED; + + if (!MLX5_DSCP_SUPPORTED(mdev)) { + WRITE_ONCE(priv->dcbx_dp.trust_state, MLX5_QPTS_TRUST_PCP); +- return 0; ++ return 0; + } + + err = mlx5_query_trust_state(priv->mdev, &trust_state); +@@ -1238,7 +1284,11 @@ static int mlx5e_trust_initialize(struct + if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP) { + if (take_rtnl) + rtnl_lock(); ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) + mlx5e_setup_tc_mqprio(priv, &mqprio); ++#else ++ mlx5e_setup_tc(priv->netdev, MLX5E_MAX_NUM_TC); ++#endif + if (take_rtnl) + rtnl_unlock(); + } +@@ -1291,3 +1341,4 @@ void mlx5e_dcbnl_initialize(struct mlx5e + + mlx5e_ets_init(priv); + } ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0145-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0145-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..4a9cb39 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0145-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,1271 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c + +Change-Id: Ic2cb006c4461ba19d950d4b1965366927d0a5ea3 +--- + .../ethernet/mellanox/mlx5/core/en_ethtool.c | 746 +++++++++++++++++- + 1 file changed, 729 insertions(+), 17 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +@@ -35,6 +35,7 @@ + #include "en/params.h" + #include "en/xsk/pool.h" + #include "en/ptp.h" ++#include "en/rx_res.h" + #include "lib/clock.h" + + void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, +@@ -61,6 +62,7 @@ static void 
mlx5e_get_drvinfo(struct net + mlx5e_ethtool_get_drvinfo(priv, drvinfo); + } + ++#ifdef __ETHTOOL_DECLARE_LINK_MODE_MASK + struct ptys2ethtool_config { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); +@@ -216,6 +218,8 @@ void mlx5e_build_ptys2ethtool_map(void) + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT); + } + ++#endif ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + static void mlx5e_ethtool_get_speed_arr(struct mlx5_core_dev *mdev, + struct ptys2ethtool_config **arr, + u32 *size) +@@ -226,6 +230,475 @@ static void mlx5e_ethtool_get_speed_arr( + *size = ext ? ARRAY_SIZE(ptys2ext_ethtool_table) : + ARRAY_SIZE(ptys2legacy_ethtool_table); + } ++#endif ++ ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++struct deprecated_ptys2ethtool_config { ++ u32 supported; ++ u32 advertised; ++ u32 speed; ++}; ++ ++static struct deprecated_ptys2ethtool_config ++deprecated_ptys2legacy_ethtool_table[MLX5E_LINK_MODES_NUMBER] = { ++ [MLX5E_1000BASE_CX_SGMII] = { ++ .supported = SUPPORTED_1000baseKX_Full, ++ .advertised = ADVERTISED_1000baseKX_Full, ++ .speed = SPEED_1000, ++ }, ++ [MLX5E_1000BASE_KX] = { ++ .supported = SUPPORTED_1000baseKX_Full, ++ .advertised = ADVERTISED_1000baseKX_Full, ++ .speed = SPEED_1000, ++ }, ++ [MLX5E_10GBASE_CX4] = { ++ .supported = SUPPORTED_10000baseKX4_Full, ++ .advertised = ADVERTISED_10000baseKX4_Full, ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_10GBASE_KX4] = { ++ .supported = SUPPORTED_10000baseKX4_Full, ++ .advertised = ADVERTISED_10000baseKX4_Full, ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_10GBASE_KR] = { ++ .supported = SUPPORTED_10000baseKR_Full, ++ .advertised = ADVERTISED_10000baseKR_Full, ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_20GBASE_KR2] = { ++ .supported = SUPPORTED_20000baseKR2_Full, ++ .advertised = ADVERTISED_20000baseKR2_Full, ++ .speed = SPEED_20000, ++ }, ++ [MLX5E_40GBASE_CR4] = { ++ .supported = SUPPORTED_40000baseCR4_Full, ++ .advertised = ADVERTISED_40000baseCR4_Full, ++ .speed = SPEED_40000, ++ }, ++ [MLX5E_40GBASE_KR4] = { ++ .supported = SUPPORTED_40000baseKR4_Full, ++ .advertised = ADVERTISED_40000baseKR4_Full, ++ .speed = SPEED_40000, ++ }, ++ [MLX5E_56GBASE_R4] = { ++ .supported = SUPPORTED_56000baseKR4_Full, ++ .advertised = ADVERTISED_56000baseKR4_Full, ++ .speed = SPEED_56000, ++ }, ++ [MLX5E_10GBASE_CR] = { ++ .supported = SUPPORTED_10000baseKR_Full, ++ .advertised = ADVERTISED_10000baseKR_Full, ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_10GBASE_SR] = { ++ .supported = SUPPORTED_10000baseKR_Full, ++ .advertised = ADVERTISED_10000baseKR_Full, ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_10GBASE_ER] = { ++ .supported = SUPPORTED_10000baseKR_Full,/* TODO: verify */ ++ .advertised = ADVERTISED_10000baseKR_Full, ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_40GBASE_SR4] = { ++ .supported = SUPPORTED_40000baseSR4_Full, ++ .advertised = ADVERTISED_40000baseSR4_Full, ++ .speed = SPEED_40000, ++ }, ++ [MLX5E_40GBASE_LR4] = { ++ .supported = SUPPORTED_40000baseLR4_Full, ++ .advertised = ADVERTISED_40000baseLR4_Full, ++ .speed = SPEED_40000, ++ }, ++ [MLX5E_100GBASE_CR4] = { ++ .supported = /*SUPPORTED_100000baseCR4_Full*/ 0, ++ .advertised = /*ADVERTISED_100000baseCR4_Full*/ 0, ++ .speed = SPEED_100000, ++ }, ++ [MLX5E_100GBASE_SR4] = { ++ .supported = /*SUPPORTED_100000baseSR4_Full*/ 0, ++ .advertised = /*ADVERTISED_100000baseSR4_Full*/ 0, ++ .speed = SPEED_100000, ++ }, ++ [MLX5E_100GBASE_KR4] = { ++ .supported = /*SUPPORTED_100000baseKR4_Full*/ 0, ++ .advertised = /*ADVERTISED_100000baseKR4_Full*/ 0, ++ .speed = SPEED_100000, ++ }, ++ 
[MLX5E_100GBASE_LR4] = { ++ .supported = /*SUPPORTED_1000000baseLR4_Full*/ 0, ++ .advertised = /*ADVERTISED_1000000baseLR4_Full*/ 0, ++ .speed = SPEED_100000, ++ }, ++ [MLX5E_100BASE_TX] = { ++ .supported = SUPPORTED_100baseT_Full, ++ .advertised = ADVERTISED_100baseT_Full, ++ .speed = SPEED_100, ++ }, ++ [MLX5E_1000BASE_T] = { ++ .supported = SUPPORTED_1000baseT_Full, ++ .advertised = ADVERTISED_1000baseT_Full, ++ .speed = SPEED_1000, ++ }, ++ [MLX5E_10GBASE_T] = { ++ .supported = SUPPORTED_10000baseT_Full, ++ .advertised = ADVERTISED_10000baseT_Full, ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_25GBASE_CR] = { ++ .supported = /*SUPPORTED_25000baseCR_Full*/ 0, ++ .advertised = /*ADVERTISED_25000baseCR_Full*/ 0, ++ .speed = SPEED_25000, ++ }, ++ [MLX5E_25GBASE_KR] = { ++ .supported = /*SUPPORTED_25000baseKR_Full*/ 0, ++ .advertised = /*ADVERTISED_25000baseKR_Full*/ 0, ++ .speed = SPEED_25000, ++ }, ++ [MLX5E_25GBASE_SR] = { ++ .supported = /*SUPPORTED_25000baseSR_Full*/ 0, ++ .advertised = /*ADVERTISED_25000baseSR_Full*/ 0, ++ .speed = SPEED_25000, ++ }, ++ [MLX5E_50GBASE_CR2] = { ++ .supported = /*SUPPORTED_50000baseCR2_Full*/ 0, ++ .advertised = /*ADVERTISED_50000baseCR2_Full*/ 0, ++ .speed = SPEED_50000, ++ }, ++ [MLX5E_50GBASE_KR2] = { ++ .supported = /*SUPPORTED_50000baseKR2_Full*/ 0, ++ .advertised = /*ADVERTISED_50000baseKR2_Full*/ 0, ++ .speed = SPEED_50000, ++ }, ++}; ++ ++static struct deprecated_ptys2ethtool_config ++deprecated_ptys2ext_ethtool_table[MLX5E_EXT_LINK_MODES_NUMBER] = { ++ [MLX5E_SGMII_100M] = { ++ .speed = SPEED_100, ++ }, ++ [MLX5E_1000BASE_X_SGMII] = { ++ .speed = SPEED_1000, ++ }, ++ [MLX5E_5GBASE_R] = { ++ .speed = SPEED_5000, ++ }, ++ [MLX5E_10GBASE_XFI_XAUI_1] = { ++ .speed = SPEED_10000, ++ }, ++ [MLX5E_40GBASE_XLAUI_4_XLPPI_4] = { ++ .speed = SPEED_40000, ++ }, ++ [MLX5E_25GAUI_1_25GBASE_CR_KR] = { ++ .speed = SPEED_25000, ++ }, ++ [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2] = { ++ .speed = SPEED_50000, ++ }, ++ [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR] = { ++ .speed = SPEED_50000, ++ }, ++ [MLX5E_CAUI_4_100GBASE_CR4_KR4] = { ++ .speed = SPEED_100000, ++ }, ++ [MLX5E_100GAUI_2_100GBASE_CR2_KR2] = { ++ .speed = SPEED_100000, ++ }, ++ [MLX5E_200GAUI_4_200GBASE_CR4_KR4] = { ++ .speed = SPEED_200000, ++ }, ++}; ++ ++static void ++deprecated_ethtool_get_speed_arr(bool ext, u32 *size, ++ struct deprecated_ptys2ethtool_config **arr) ++{ ++ *arr = ext ? deprecated_ptys2ext_ethtool_table : ++ deprecated_ptys2legacy_ethtool_table; ++ *size = ext ? 
ARRAY_SIZE(deprecated_ptys2ext_ethtool_table) : ++ ARRAY_SIZE(deprecated_ptys2legacy_ethtool_table); ++} ++ ++static u32 deprecated_ptys2ethtool_supported_link(u32 eth_proto_cap) ++{ ++ int i; ++ u32 supoprted_modes = 0; ++ ++ for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { ++ if (eth_proto_cap & MLX5E_PROT_MASK(i)) ++ supoprted_modes |= deprecated_ptys2legacy_ethtool_table[i].supported; ++ } ++ return supoprted_modes; ++} ++ ++static u32 deprecated_ptys2ethtool_adver_link(u32 eth_proto_cap) ++{ ++ int i; ++ u32 advertising_modes = 0; ++ ++ for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { ++ if (eth_proto_cap & MLX5E_PROT_MASK(i)) ++ advertising_modes |= deprecated_ptys2legacy_ethtool_table[i].advertised; ++ } ++ return advertising_modes; ++} ++ ++static u32 deprecated_ptys2ethtool_supported_port(u32 eth_proto_cap) ++{ ++ /* ++ TODO: ++ MLX5E_40GBASE_LR4 = 16, ++ MLX5E_10GBASE_ER = 14, ++ MLX5E_10GBASE_CX4 = 2, ++ */ ++ ++ if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_10GBASE_CR) ++ | MLX5E_PROT_MASK(MLX5E_10GBASE_SR) ++ | MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) ++ | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4) ++ | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) ++ | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) { ++ return SUPPORTED_FIBRE; ++ } ++ ++ if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_100GBASE_KR4) ++ | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) ++ | MLX5E_PROT_MASK(MLX5E_10GBASE_KR) ++ | MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) ++ | MLX5E_PROT_MASK(MLX5E_1000BASE_KX))) { ++ return SUPPORTED_Backplane; ++ } ++ return 0; ++} ++ ++static void deprecated_get_speed_duplex(struct net_device *netdev, ++ struct mlx5_core_dev *mdev, ++ u32 eth_proto_oper, ++ struct ethtool_cmd *cmd) ++{ ++ struct deprecated_ptys2ethtool_config *table; ++ u32 max_size; ++ bool ext; ++ int i; ++ u32 speed = SPEED_UNKNOWN; ++ u8 duplex = DUPLEX_UNKNOWN; ++ ++ if (!netif_carrier_ok(netdev)) ++ goto out; ++ ++ ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); ++ deprecated_ethtool_get_speed_arr(ext, &max_size, &table); ++ for (i = 0; i < max_size; ++i) { ++ if (eth_proto_oper & MLX5E_PROT_MASK(i)) { ++ speed = table[i].speed; ++ duplex = DUPLEX_FULL; ++ break; ++ } ++ } ++out: ++ ethtool_cmd_speed_set(cmd, speed); ++ cmd->duplex = duplex; ++} ++ ++static void deprecated_get_supported(u32 eth_proto_cap, u32 *supported) ++{ ++ *supported |= deprecated_ptys2ethtool_supported_port(eth_proto_cap); ++ *supported |= deprecated_ptys2ethtool_supported_link(eth_proto_cap); ++ *supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause; ++} ++ ++static void deprecated_get_advertising(u32 eth_proto_cap, u8 tx_pause, ++ u8 rx_pause, u32 *advertising) ++{ ++ *advertising |= deprecated_ptys2ethtool_adver_link(eth_proto_cap); ++ *advertising |= tx_pause ? ADVERTISED_Pause : 0; ++ *advertising |= (tx_pause ^ rx_pause) ? 
ADVERTISED_Asym_Pause : 0; ++} ++ ++static void deprecated_get_lp_advertising(u32 eth_proto_lp, u32 *lp_advertising) ++{ ++ ++ *lp_advertising = deprecated_ptys2ethtool_adver_link(eth_proto_lp); ++} ++ ++static u32 deprecated_mlx5e_ethtool2ptys_speed_link(bool ext, u32 speed) ++ ++{ ++ struct deprecated_ptys2ethtool_config *table; ++ u32 i, speed_links = 0; ++ u32 max_size; ++ ++ deprecated_ethtool_get_speed_arr(ext, &max_size, &table); ++ for (i = 0; i < max_size; ++i) { ++ if (table[i].speed == speed) ++ speed_links |= MLX5E_PROT_MASK(i); ++ } ++ ++ return speed_links; ++} ++ ++static u8 get_connector_port(struct mlx5_core_dev *mdev, u32 eth_proto, u8 connector_type); ++ ++int mlx5e_get_settings(struct net_device *netdev, ++ struct ethtool_cmd *cmd) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ struct mlx5_core_dev *mdev = priv->mdev; ++ u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; ++ u32 eth_proto_cap; ++ u32 eth_proto_admin; ++ u32 eth_proto_lp; ++ u32 eth_proto_oper; ++ u8 an_disable_admin; ++ u8 connector_type; ++ u8 an_status; ++ bool ext; ++ int err; ++ ++ err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); ++ if (err) { ++ netdev_err(netdev, "%s: query port ptys failed: %d\n", ++ __func__, err); ++ goto err_query_ptys; ++ } ++ ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); ++ eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, ++ eth_proto_oper); ++ eth_proto_cap = MLX5_GET(ptys_reg, out, eth_proto_capability); ++ eth_proto_admin = MLX5_GET(ptys_reg, out, eth_proto_admin); ++ eth_proto_lp = MLX5_GET(ptys_reg, out, eth_proto_lp_advertise); ++ an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin); ++ an_status = MLX5_GET(ptys_reg, out, an_status); ++ connector_type = MLX5_GET(ptys_reg, out, connector_type); ++ ++ cmd->supported = 0; ++ cmd->advertising = 0; ++ ++ deprecated_get_supported(eth_proto_cap, &cmd->supported); ++ deprecated_get_advertising(eth_proto_admin, 0, 0, &cmd->advertising); ++ deprecated_get_speed_duplex(netdev, mdev, eth_proto_oper, cmd); ++ ++ eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap; ++ ++ connector_type = connector_type < MLX5E_CONNECTOR_TYPE_NUMBER ? ++ connector_type : MLX5E_PORT_UNKNOWN; ++ cmd->port = get_connector_port(mdev, eth_proto_oper, connector_type); ++ deprecated_get_lp_advertising(eth_proto_lp, &cmd->lp_advertising); ++ ++ cmd->lp_advertising |= an_status == MLX5_AN_COMPLETE ? ++ ADVERTISED_Autoneg : 0; ++ ++ cmd->transceiver = XCVR_INTERNAL; ++ cmd->autoneg = an_disable_admin ? AUTONEG_DISABLE : AUTONEG_ENABLE; ++ cmd->supported |= SUPPORTED_Autoneg; ++ cmd->advertising |= !an_disable_admin ? 
ADVERTISED_Autoneg : 0; ++ ++err_query_ptys: ++ return err; ++} ++ ++static u32 deprecated_mlx5e_ethtool2ptys_adver_link(u32 link_modes) ++{ ++ u32 i, ptys_modes = 0; ++ ++ for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { ++ if (deprecated_ptys2legacy_ethtool_table[i].advertised & link_modes) ++ ptys_modes |= MLX5E_PROT_MASK(i); ++ } ++ ++ return ptys_modes; ++} ++ ++int mlx5e_set_settings(struct net_device *netdev, ++ struct ethtool_cmd *cmd) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ struct mlx5_core_dev *mdev = priv->mdev; ++ struct mlx5e_port_eth_proto eproto; ++ bool an_changes = false; ++ u8 an_disable_admin; ++ u8 an_disable_cap; ++ bool an_disable; ++ u32 link_modes; ++ u8 an_status; ++ u32 speed; ++ bool ext; ++ int err; ++ ++ ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); ++ ++ speed = ethtool_cmd_speed(cmd); ++ ++ link_modes = cmd->autoneg == AUTONEG_ENABLE ? ++ deprecated_mlx5e_ethtool2ptys_adver_link(cmd->advertising) : ++ deprecated_mlx5e_ethtool2ptys_speed_link(ext, speed); ++ ++ err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); ++ if (err) { ++ netdev_err(netdev, "%s: query port eth proto failed: %d\n", ++ __func__, err); ++ goto out; ++ } ++ ++ /* Overwrite advertise bit for old kernel. When autoneg is enabled, ++ * driver will advertise all supported speed(eth_proto_cap) and bypass ++ * advertised speed settings from user. This is because only new ++ * ethtool(after v4.6) supports advertising speeds like 100G, 25G, etc. ++ */ ++ if (cmd->autoneg == AUTONEG_ENABLE) ++ link_modes = eproto.cap; ++ link_modes = link_modes & eproto.cap;; ++ if (!link_modes) { ++ netdev_err(netdev, "%s: Not supported link mode(s) requested", ++ __func__); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ mlx5_port_query_eth_autoneg(mdev, &an_status, &an_disable_cap, ++ &an_disable_admin); ++ ++ an_disable = cmd->autoneg == AUTONEG_DISABLE; ++ an_changes = ((!an_disable && an_disable_admin) || ++ (an_disable && !an_disable_admin)); ++ ++ if (!an_changes && link_modes == eproto.admin) ++ goto out; ++ ++ mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext); ++ mlx5_toggle_port_link(mdev); ++ ++out: ++ return err; ++} ++#endif /* HAVE_ETHTOOL_GET_SET_SETTINGS */ ++ ++#ifndef HAVE_GET_SET_LINK_KSETTINGS ++int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) ++{ ++ struct deprecated_ptys2ethtool_config *table; ++ struct mlx5e_port_eth_proto eproto; ++ u32 max_speed = 0; ++ u32 max_size; ++ bool ext; ++ int err; ++ int i; ++ ++ ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); ++ deprecated_ethtool_get_speed_arr(ext, &max_size, &table); ++ err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); ++ if (err) ++ return err; ++ ++ for (i = 0; i < max_size; ++i) ++ if (eproto.cap & MLX5E_PROT_MASK(i)) ++ max_speed = max(max_speed, table[i].speed); ++ ++ *speed = max_speed; ++ return 0; ++} ++#endif + + typedef int (*mlx5e_pflag_handler)(struct net_device *netdev, bool enable); + +@@ -321,9 +794,14 @@ void mlx5e_ethtool_get_ringparam(struct + } + + static void mlx5e_get_ringparam(struct net_device *dev, ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *param) ++#endif ++ + { + struct mlx5e_priv *priv = netdev_priv(dev); + +@@ -356,6 +834,12 @@ int mlx5e_ethtool_set_ringparam(struct m + 1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE); + return -EINVAL; + } ++ if (param->rx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE)) 
{ ++ netdev_info(priv->netdev, "%s: rx_pending (%d) > max (%d)\n", ++ __func__, param->rx_pending, ++ 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE); ++ return -EINVAL; ++ } + + if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) { + netdev_info(priv->netdev, "%s: tx_pending (%d) < min (%d)\n", +@@ -363,6 +847,12 @@ int mlx5e_ethtool_set_ringparam(struct m + 1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); + return -EINVAL; + } ++ if (param->tx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE)) { ++ netdev_info(priv->netdev, "%s: tx_pending (%d) > max (%d)\n", ++ __func__, param->tx_pending, ++ 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE); ++ return -EINVAL; ++ } + + log_rq_size = order_base_2(param->rx_pending); + log_sq_size = order_base_2(param->tx_pending); +@@ -390,9 +880,14 @@ unlock: + } + + static int mlx5e_set_ringparam(struct net_device *dev, +- struct ethtool_ringparam *param, +- struct kernel_ethtool_ringparam *kernel_param, +- struct netlink_ext_ack *extack) ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS ++ struct ethtool_ringparam *param, ++ struct kernel_ethtool_ringparam *kernel_param, ++ struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *param) ++#endif ++ + { + struct mlx5e_priv *priv = netdev_priv(dev); + +@@ -406,11 +901,13 @@ void mlx5e_ethtool_get_channels(struct m + + ch->max_combined = priv->max_nch; + ch->combined_count = priv->channels.params.num_channels; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (priv->xsk.refcnt) { + /* The upper half are XSK queues. */ + ch->max_combined *= 2; + ch->combined_count *= 2; + } ++#endif + + mutex_unlock(&priv->state_lock); + } +@@ -461,12 +958,14 @@ int mlx5e_ethtool_set_channels(struct ml + /* Don't allow changing the number of channels if there is an active + * XSK, because the numeration of the XSK and regular RQs will change. + */ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (priv->xsk.refcnt) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: AF_XDP is active, cannot change the number of channels\n", + __func__); + goto out; + } ++#endif + + /* Don't allow changing the number of channels if HTB offload is active, + * because the numeration of the QoS SQs will change, while per-queue +@@ -492,6 +991,7 @@ int mlx5e_ethtool_set_channels(struct ml + } + } + ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + /* Don't allow changing the number of channels if MQPRIO mode channel offload is active, + * because it defines a partition over the channels queues. 
+ */ +@@ -501,7 +1001,7 @@ int mlx5e_ethtool_set_channels(struct ml + __func__); + goto out; + } +- ++#endif + new_params = *cur_params; + new_params.num_channels = count; + +@@ -538,8 +1038,12 @@ static int mlx5e_set_channels(struct net + } + + int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS //forwardport + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct dim_cq_moder *rx_moder, *tx_moder; + +@@ -562,22 +1066,33 @@ int mlx5e_ethtool_get_coalesce(struct ml + coal->tx_max_coalesced_frames = tx_moder->pkts; + coal->use_adaptive_tx_coalesce = priv->channels.params.tx_dim_enabled; + ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + kernel_coal->use_cqe_mode_rx = + MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_BASED_MODER); + kernel_coal->use_cqe_mode_tx = + MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_CQE_BASED_MODER); ++#endif + + return 0; + } + + static int mlx5e_get_coalesce(struct net_device *netdev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(netdev); + +- return mlx5e_ethtool_get_coalesce(priv, coal, kernel_coal); ++ return mlx5e_ethtool_get_coalesce(priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS ++ coal , kernel_coal); ++#else ++ coal); ++#endif + } + + #define MLX5E_MAX_COAL_TIME MLX5_MAX_CQ_PERIOD +@@ -630,16 +1145,22 @@ static int cqe_mode_to_period_mode(bool + } + + int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct dim_cq_moder *rx_moder, *tx_moder; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; + bool reset_rx, reset_tx; + bool reset = true; ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + u8 cq_period_mode; ++#endif + int err = 0; + + if (!MLX5_CAP_GEN(mdev, cq_moderation)) +@@ -659,11 +1180,13 @@ int mlx5e_ethtool_set_coalesce(struct ml + return -ERANGE; + } + ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + if ((kernel_coal->use_cqe_mode_rx || kernel_coal->use_cqe_mode_tx) && + !MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) { + NL_SET_ERR_MSG_MOD(extack, "cqe_mode_rx/tx is not supported on this device"); + return -EOPNOTSUPP; + } ++#endif + + mutex_lock(&priv->state_lock); + new_params = priv->channels.params; +@@ -690,6 +1213,7 @@ int mlx5e_ethtool_set_coalesce(struct ml + reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; + reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled; + ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + cq_period_mode = cqe_mode_to_period_mode(kernel_coal->use_cqe_mode_rx); + if (cq_period_mode != rx_moder->cq_period_mode) { + mlx5e_set_rx_cq_mode_params(&new_params, cq_period_mode); +@@ -701,6 +1225,7 @@ int mlx5e_ethtool_set_coalesce(struct ml + mlx5e_set_tx_cq_mode_params(&new_params, cq_period_mode); + reset_tx = true; + } ++#endif + + if (reset_rx) { + u8 mode = MLX5E_GET_PFLAG(&new_params, +@@ -733,15 +1258,25 @@ out: + } + + static int mlx5e_set_coalesce(struct net_device *netdev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + 
struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(netdev); + +- return mlx5e_ethtool_set_coalesce(priv, coal, kernel_coal, extack); ++ return mlx5e_ethtool_set_coalesce(priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS ++ coal, kernel_coal, extack); ++#else ++ coal); ++#endif + } + ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + static void ptys2ethtool_supported_link(struct mlx5_core_dev *mdev, + unsigned long *supported_modes, + u32 eth_proto_cap) +@@ -775,7 +1310,8 @@ static void ptys2ethtool_adver_link(unsi + table[proto].advertised, + __ETHTOOL_LINK_MODE_MASK_NBITS); + } +- ++#endif ++#ifdef HAVE_GET_SET_FECPARAM + static const u32 pplm_fec_2_ethtool[] = { + [MLX5E_FEC_NOFEC] = ETHTOOL_FEC_OFF, + [MLX5E_FEC_FIRECODE] = ETHTOOL_FEC_BASER, +@@ -846,7 +1382,9 @@ static int get_fec_supported_advertised( + + return 0; + } ++#endif + ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + static void ptys2ethtool_supported_advertised_port(struct mlx5_core_dev *mdev, + struct ethtool_link_ksettings *link_ksettings, + u32 eth_proto_cap, u8 connector_type) +@@ -974,6 +1512,7 @@ static void get_advertising(u32 eth_prot + if (tx_pause ^ rx_pause) + ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Asym_Pause); + } ++#endif /* HAVE_GET_SET_LINK_KSETTINGS */ + + static int ptys2connector_type[MLX5E_CONNECTOR_TYPE_NUMBER] = { + [MLX5E_PORT_UNKNOWN] = PORT_OTHER, +@@ -1018,6 +1557,7 @@ static u8 get_connector_port(struct mlx5 + return PORT_OTHER; + } + ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + static void get_lp_advertising(struct mlx5_core_dev *mdev, u32 eth_proto_lp, + struct ethtool_link_ksettings *link_ksettings) + { +@@ -1106,13 +1646,14 @@ int mlx5e_ethtool_get_link_ksettings(str + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, + Autoneg); + ++#ifdef HAVE_GET_SET_FECPARAM + err = get_fec_supported_advertised(mdev, link_ksettings); + if (err) { + netdev_dbg(priv->netdev, "%s: FEC caps query failed: %d\n", + __func__, err); + err = 0; /* don't fail caps query because of FEC error */ + } +- ++#endif + if (!an_disable_admin) + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Autoneg); +@@ -1163,6 +1704,7 @@ static u32 mlx5e_ethtool2ptys_adver_link + + static u32 mlx5e_ethtool2ptys_ext_adver_link(const unsigned long *link_modes) + { ++#ifdef HAVE_ETHTOOL_50G_PER_LANE_LINK_MODES + u32 i, ptys_modes = 0; + unsigned long modes[2]; + +@@ -1179,6 +1721,9 @@ static u32 mlx5e_ethtool2ptys_ext_adver_ + ptys_modes |= MLX5E_PROT_MASK(i); + } + return ptys_modes; ++#else ++ return 0; ++#endif + } + + static bool ext_link_mode_requested(const unsigned long *adver) +@@ -1241,6 +1786,14 @@ int mlx5e_ethtool_set_link_ksettings(str + } + link_modes = autoneg == AUTONEG_ENABLE ? 
ethtool2ptys_adver_func(adver) : + mlx5e_port_speed2linkmodes(mdev, speed, !ext); ++#ifndef HAVE_ETHTOOL_25G_50G_100G_SPEEDS ++ if (link_ksettings->base.autoneg == AUTONEG_ENABLE) ++ link_modes = eproto.cap;; ++#endif ++#ifndef HAVE_ETHTOOL_50G_PER_LANE_LINK_MODES ++ if (link_ksettings->base.autoneg == AUTONEG_ENABLE && ext_supported) ++ link_modes = eproto.cap;; ++#endif + + err = mlx5e_speed_validate(priv->netdev, ext, link_modes, autoneg); + if (err) +@@ -1278,6 +1831,7 @@ int mlx5e_set_link_ksettings(struct net_ + + return mlx5e_ethtool_set_link_ksettings(priv, link_ksettings); + } ++#endif /* HAVE_GET_SET_LINK_KSETTINGS */ + + u32 mlx5e_ethtool_get_rxfh_key_size(struct mlx5e_priv *priv) + { +@@ -1315,6 +1869,7 @@ static int mlx5e_get_rxfh_context(struct + return err; + } + ++#ifdef HAVE_ETHTOOL_GET_RXFH_CONTEXT + static int mlx5e_set_rxfh_context(struct net_device *dev, const u32 *indir, + const u8 *key, const u8 hfunc, + u32 *rss_context, bool delete) +@@ -1343,6 +1898,7 @@ unlock: + mutex_unlock(&priv->state_lock); + return err; + } ++#endif + + int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +@@ -1371,6 +1927,7 @@ int mlx5e_set_rxfh(struct net_device *de + max_t(u16, MLX5E_PFC_PREVEN_TOUT_MIN_MSEC, \ + (critical_tout * MLX5E_PFC_PREVEN_MINOR_PRECENT) / 100) + ++#ifdef HAVE_GET_SET_TUNABLE + static int mlx5e_get_pfc_prevention_tout(struct net_device *netdev, + u16 *pfc_prevention_tout) + { +@@ -1413,7 +1970,9 @@ static int mlx5e_set_pfc_prevention_tout + return mlx5_set_port_stall_watermark(mdev, critical_tout, + minor); + } ++#endif + ++#ifdef HAVE_GET_SET_TUNABLE + static int mlx5e_get_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + void *data) +@@ -1453,7 +2012,8 @@ static int mlx5e_set_tunable(struct net_ + mutex_unlock(&priv->state_lock); + return err; + } +- ++#endif ++#ifdef HAVE_GET_PAUSE_STATS + static void mlx5e_get_pause_stats(struct net_device *netdev, + struct ethtool_pause_stats *pause_stats) + { +@@ -1461,7 +2021,7 @@ static void mlx5e_get_pause_stats(struct + + mlx5e_stats_pause_get(priv, pause_stats); + } +- ++#endif + void mlx5e_ethtool_get_pauseparam(struct mlx5e_priv *priv, + struct ethtool_pauseparam *pauseparam) + { +@@ -1546,7 +2106,6 @@ static int mlx5e_get_ts_info(struct net_ + + return mlx5e_ethtool_get_ts_info(priv, info); + } +- + static __u32 mlx5e_get_wol_supported(struct mlx5_core_dev *mdev) + { + __u32 ret = 0; +@@ -1669,7 +2228,7 @@ static int mlx5e_set_wol(struct net_devi + + return mlx5_set_port_wol(mdev, mlx5_wol_mode); + } +- ++#ifdef HAVE_NDO_GET_FEC_STATS + static void mlx5e_get_fec_stats(struct net_device *netdev, + struct ethtool_fec_stats *fec_stats) + { +@@ -1677,7 +2236,8 @@ static void mlx5e_get_fec_stats(struct n + + mlx5e_stats_fec_get(priv, fec_stats); + } +- ++#endif ++#ifdef HAVE_GET_SET_FECPARAM + static int mlx5e_get_fecparam(struct net_device *netdev, + struct ethtool_fecparam *fecparam) + { +@@ -1734,7 +2294,7 @@ static int mlx5e_set_fecparam(struct net + + return 0; + } +- ++#endif + static u32 mlx5e_get_msglevel(struct net_device *dev) + { + return ((struct mlx5e_priv *)netdev_priv(dev))->msglevel; +@@ -1847,6 +2407,7 @@ static int mlx5e_get_module_eeprom(struc + return 0; + } + ++#ifdef HAVE_GET_MODULE_EEPROM_BY_PAGE + static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) +@@ -1889,6 +2450,7 @@ static int mlx5e_get_module_eeprom_by_pa + + return i; + } ++#endif + + int 
mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, + struct ethtool_flash *flash) +@@ -1908,7 +2470,11 @@ int mlx5e_ethtool_flash_device(struct ml + dev_hold(dev); + rtnl_unlock(); + +- err = mlx5_firmware_flash(mdev, fw, NULL); ++ err = mlx5_firmware_flash(mdev, fw ++#ifdef HAVE_NETLINK_EXT_ACK ++ , NULL ++#endif ++ ); + release_firmware(fw); + + rtnl_lock(); +@@ -2005,6 +2571,37 @@ int mlx5e_modify_rx_cqe_compression_lock + return 0; + } + ++int mlx5e_modify_tx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val) ++{ ++ bool curr_val = MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_CQE_COMPRESS); ++ struct mlx5e_params new_params; ++ int err = 0; ++ ++ if (!MLX5_CAP_GEN(priv->mdev, cqe_compression)) ++ return new_val ? -EOPNOTSUPP : 0; ++ ++ if (curr_val == new_val) ++ return 0; ++ ++ new_params = priv->channels.params; ++ MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_CQE_COMPRESS, new_val); ++ ++ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { ++ priv->channels.params = new_params; ++ return 0; ++ } ++ ++ err = mlx5e_safe_switch_params(priv, &new_params, NULL ,NULL, true); ++ if (err) ++ return err; ++ ++ mlx5e_dbg(DRV, priv, "MLX5E: TxCqeCmprss was turned %s\n", ++ MLX5E_GET_PFLAG(&priv->channels.params, ++ MLX5E_PFLAG_TX_CQE_COMPRESS) ? "ON" : "OFF"); ++ ++ return 0; ++} ++ + static int set_pflag_rx_cqe_compress(struct net_device *netdev, + bool enable) + { +@@ -2026,6 +2623,19 @@ static int set_pflag_rx_cqe_compress(str + return 0; + } + ++static int set_pflag_tx_cqe_compress(struct net_device *netdev, bool enable) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ struct mlx5_core_dev *mdev = priv->mdev; ++ ++ if (!MLX5_CAP_GEN(mdev, cqe_compression)) ++ return -EOPNOTSUPP; ++ ++ mlx5e_modify_tx_cqe_compression_locked(priv, enable); ++ ++ return 0; ++} ++ + static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -2041,7 +2651,11 @@ static int set_pflag_rx_striding_rq(stru + + if (err) + return err; ++#ifndef CONFIG_COMPAT_LRO_ENABLED_IPOIB + } else if (priv->channels.params.packet_merge.type != MLX5E_PACKET_MERGE_NONE) { ++#else ++ } else if (IS_HW_LRO(&priv->channels.params)) { ++#endif + netdev_warn(netdev, "Can't set legacy RQ with HW-GRO/LRO, disable them first\n"); + return -EINVAL; + } +@@ -2054,6 +2668,39 @@ static int set_pflag_rx_striding_rq(stru + return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + } + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++static int set_pflag_hwlro(struct net_device *netdev, bool enable) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ struct mlx5e_params new_params; ++ int err = 0; ++ bool reset; ++ ++ ++ if (priv->channels.params.rq_wq_type != ++ MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { ++ netdev_warn(netdev, "Can't toggle HW LRO with legacy RQ\n"); ++ return -EINVAL; ++ } ++ ++ new_params = priv->channels.params; ++ MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_HWLRO, enable); ++ reset = test_bit(MLX5E_STATE_OPENED, &priv->state) && ++ (new_params.packet_merge.type != MLX5E_PACKET_MERGE_NONE); ++ ++ if (!reset) { ++ priv->channels.params = new_params; ++ err = mlx5e_modify_tirs_packet_merge(priv); ++ goto out; ++ } ++ ++ err = mlx5e_safe_switch_params(priv, &new_params, ++ mlx5e_modify_tirs_packet_merge_ctx, NULL, true); ++out: ++ return err; ++} ++#endif ++ + static int set_pflag_rx_no_csum_complete(struct net_device *netdev, bool enable) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -2062,7 +2709,11 @@ static int 
set_pflag_rx_no_csum_complete + int i; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || ++#ifdef HAVE_XDP_SUPPORT + priv->channels.params.xdp_prog) ++#else ++ false) ++#endif + return 0; + + for (i = 0; i < channels->num; i++) { +@@ -2099,10 +2750,12 @@ static int set_pflag_per_channel_stats(s + return 0; + } + ++#ifdef HAVE_XDP_SUPPORT + static int set_pflag_xdp_tx_mpwqe(struct net_device *netdev, bool enable) + { + return set_pflag_tx_mpwqe_common(netdev, MLX5E_PFLAG_XDP_TX_MPWQE, enable); + } ++#endif + + static int set_pflag_skb_tx_mpwqe(struct net_device *netdev, bool enable) + { +@@ -2130,17 +2783,20 @@ static int set_pflag_tx_port_ts(struct n + } + + new_params = priv->channels.params; ++ + /* Don't allow enabling TX-port-TS if MQPRIO mode channel offload is + * active, since it defines explicitly which TC accepts the packet. + * This conflicts with TX-port-TS hijacking the PTP traffic to a specific + * HW TX-queue. + */ ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + if (enable && new_params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + netdev_err(priv->netdev, + "%s: MQPRIO mode channel offload is active, cannot set the TX-port-TS\n", + __func__); + return -EINVAL; + } ++#endif + MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_PORT_TS, enable); + /* No need to verify SQ stop room as + * ptpsq.txqsq.stop_room <= generic_sq->stop_room, and both +@@ -2216,18 +2872,44 @@ static int set_pflag_tx_xdp_hw_checksum( + return err; + } + ++static int set_pflag_skb_xmit_more(struct net_device *netdev, bool enable) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ struct mlx5e_params new_params; ++ int err; ++ ++ new_params = priv->channels.params; ++ ++ MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_SKB_XMIT_MORE, enable); ++ ++ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { ++ priv->channels.params = new_params; ++ return 0; ++ } ++ ++ err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); ++ return err; ++} ++ + static const struct pflag_desc mlx5e_priv_flags[MLX5E_NUM_PFLAGS] = { + { "rx_cqe_moder", set_pflag_rx_cqe_based_moder }, + { "tx_cqe_moder", set_pflag_tx_cqe_based_moder }, + { "rx_cqe_compress", set_pflag_rx_cqe_compress }, ++ { "tx_cqe_compress", set_pflag_tx_cqe_compress }, + { "rx_striding_rq", set_pflag_rx_striding_rq }, + { "rx_no_csum_complete", set_pflag_rx_no_csum_complete }, ++#ifdef HAVE_XDP_SUPPORT + { "xdp_tx_mpwqe", set_pflag_xdp_tx_mpwqe }, ++#endif + { "skb_tx_mpwqe", set_pflag_skb_tx_mpwqe }, + { "tx_port_ts", set_pflag_tx_port_ts }, + { "dropless_rq", set_pflag_dropless_rq}, + { "per_channel_stats", set_pflag_per_channel_stats}, ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ { "hw_lro", set_pflag_hwlro }, ++#endif + { "tx_xdp_hw_checksum", set_pflag_tx_xdp_hw_checksum}, ++ { "skb_xmit_more", set_pflag_skb_xmit_more}, + }; + + static int mlx5e_handle_pflag(struct net_device *netdev, +@@ -2269,7 +2951,6 @@ static int mlx5e_set_priv_flags(struct n + + mutex_unlock(&priv->state_lock); + +- /* Need to fix some features.. 
*/ + netdev_update_features(netdev); + + return err; +@@ -2343,6 +3024,7 @@ int mlx5_query_port_status(struct mlx5_c + return 0; + } + ++#ifdef HAVE_GET_LINK_EXT_STATE + struct mlx5e_ethtool_link_ext_state_opcode_mapping { + u32 status_opcode; + enum ethtool_link_ext_state link_ext_state; +@@ -2487,7 +3169,8 @@ mlx5e_get_link_ext_state(struct net_devi + + return -ENODATA; + } +- ++#endif //HAVE_GET_LINK_EXT_STATE ++#ifdef HAVE_NDO_ETH_PHY_STATS + static void mlx5e_get_eth_phy_stats(struct net_device *netdev, + struct ethtool_eth_phy_stats *phy_stats) + { +@@ -2520,15 +3203,24 @@ static void mlx5e_get_rmon_stats(struct + + mlx5e_stats_rmon_get(priv, rmon_stats, ranges); + } ++#endif + + const struct ethtool_ops mlx5e_ethtool_ops = { ++#ifdef HAVE_SUPPORTED_COALESCE_PARAM + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS //forwardport + ETHTOOL_COALESCE_USE_ADAPTIVE | + ETHTOOL_COALESCE_USE_CQE, ++#else ++ ETHTOOL_COALESCE_USE_ADAPTIVE, ++#endif ++#endif + .get_drvinfo = mlx5e_get_drvinfo, + .get_link = ethtool_op_get_link, ++#ifdef HAVE_GET_LINK_EXT_STATE + .get_link_ext_state = mlx5e_get_link_ext_state, ++#endif + .get_strings = mlx5e_get_strings, + .get_sset_count = mlx5e_get_sset_count, + .get_ethtool_stats = mlx5e_get_ethtool_stats, +@@ -2538,19 +3230,31 @@ const struct ethtool_ops mlx5e_ethtool_o + .set_channels = mlx5e_set_channels, + .get_coalesce = mlx5e_get_coalesce, + .set_coalesce = mlx5e_set_coalesce, ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + .get_link_ksettings = mlx5e_get_link_ksettings, + .set_link_ksettings = mlx5e_set_link_ksettings, ++#endif ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++ .get_settings = mlx5e_get_settings, ++ .set_settings = mlx5e_set_settings, ++#endif + .get_rxfh_key_size = mlx5e_get_rxfh_key_size, + .get_rxfh_indir_size = mlx5e_get_rxfh_indir_size, + .get_rxfh = mlx5e_get_rxfh, + .set_rxfh = mlx5e_set_rxfh, ++#ifdef HAVE_ETHTOOL_GET_RXFH_CONTEXT + .get_rxfh_context = mlx5e_get_rxfh_context, + .set_rxfh_context = mlx5e_set_rxfh_context, ++#endif + .get_rxnfc = mlx5e_get_rxnfc, + .set_rxnfc = mlx5e_set_rxnfc, ++#ifdef HAVE_GET_SET_TUNABLE + .get_tunable = mlx5e_get_tunable, + .set_tunable = mlx5e_set_tunable, ++#endif ++#ifdef HAVE_GET_PAUSE_STATS + .get_pause_stats = mlx5e_get_pause_stats, ++#endif + .get_pauseparam = mlx5e_get_pauseparam, + .set_pauseparam = mlx5e_set_pauseparam, + .get_ts_info = mlx5e_get_ts_info, +@@ -2559,22 +3263,30 @@ const struct ethtool_ops mlx5e_ethtool_o + .set_wol = mlx5e_set_wol, + .get_module_info = mlx5e_get_module_info, + .get_module_eeprom = mlx5e_get_module_eeprom, ++#ifdef HAVE_GET_MODULE_EEPROM_BY_PAGE + .get_module_eeprom_by_page = mlx5e_get_module_eeprom_by_page, ++#endif + .flash_device = mlx5e_flash_device, + .get_priv_flags = mlx5e_get_priv_flags, + .set_priv_flags = mlx5e_set_priv_flags, + .self_test = mlx5e_self_test, + .get_msglevel = mlx5e_get_msglevel, + .set_msglevel = mlx5e_set_msglevel, ++#ifdef HAVE_NDO_GET_FEC_STATS + .get_fec_stats = mlx5e_get_fec_stats, ++#endif + .set_priv_flags = mlx5e_set_priv_flags, + .get_dump_flag = mlx5e_get_dump_flag, + .get_dump_data = mlx5e_get_dump_data, + .set_dump = mlx5e_set_dump, ++#ifdef HAVE_GET_SET_FECPARAM + .get_fecparam = mlx5e_get_fecparam, + .set_fecparam = mlx5e_set_fecparam, ++#endif ++#ifdef HAVE_NDO_ETH_PHY_STATS + .get_eth_phy_stats = mlx5e_get_eth_phy_stats, + .get_eth_mac_stats = mlx5e_get_eth_mac_stats, + .get_eth_ctrl_stats = mlx5e_get_eth_ctrl_stats, + .get_rmon_stats = 
mlx5e_get_rmon_stats, ++#endif + }; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0146-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0146-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..8a3e3ac --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0146-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,110 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_fs.c + +Change-Id: I473cce9631f4e0fffb8c36740056341dd9bd8c0c +--- + .../net/ethernet/mellanox/mlx5/core/en_fs.c | 21 +++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +@@ -81,8 +81,9 @@ static void mlx5e_add_l2_to_hash(struct + struct mlx5e_l2_hash_node *hn; + int ix = mlx5e_hash_l2(addr); + int found = 0; ++ COMPAT_HL_NODE + +- hlist_for_each_entry(hn, &hash[ix], hlist) ++ compat_hlist_for_each_entry(hn, &hash[ix], hlist) + if (ether_addr_equal_64bits(hn->ai.addr, addr)) { + found = 1; + break; +@@ -331,6 +332,8 @@ static int mlx5e_add_any_vid_rules(struc + return mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); + } + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT ++ + static struct mlx5_flow_handle * + mlx5e_add_trap_rule(struct mlx5_flow_table *ft, int trap_id, int tir_num) + { +@@ -404,6 +407,8 @@ void mlx5e_remove_mac_trap(struct mlx5e_ + } + } + ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ ++ + void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv) + { + if (!priv->fs.vlan->cvlan_filter_disabled) +@@ -457,7 +462,8 @@ static int mlx5e_vlan_rx_add_svid(struct + return err; + } + +-int mlx5e_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) ++int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, ++ u16 vid) + { + struct mlx5e_priv *priv = netdev_priv(dev); + +@@ -472,7 +478,8 @@ int mlx5e_vlan_rx_add_vid(struct net_dev + return -EOPNOTSUPP; + } + +-int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) ++int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto, ++ u16 vid) + { + struct mlx5e_priv *priv = netdev_priv(dev); + +@@ -523,7 +530,9 @@ static void mlx5e_del_vlan_rules(struct + + WARN_ON_ONCE(!(test_bit(MLX5E_STATE_DESTROYING, &priv->state))); + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + mlx5e_remove_vlan_trap(priv); ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + + /* must be called after DESTROY bit is set and + * set_rx_mode is called and flushed +@@ -534,7 +543,7 @@ static void mlx5e_del_vlan_rules(struct + + #define mlx5e_for_each_hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5E_L2_ADDR_HASH_SIZE; i++) \ +- hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist) ++ compat_hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist) + + static void mlx5e_execute_l2_action(struct mlx5e_priv *priv, + struct mlx5e_l2_hash_node *hn) +@@ -597,6 +606,7 @@ static void mlx5e_fill_addr_array(struct + struct hlist_node *tmp; + int i = 0; + int hi; ++ COMPAT_HL_NODE + + addr_list = is_uc ? priv->fs.l2.netdev_uc : priv->fs.l2.netdev_mc; + +@@ -626,6 +636,7 @@ static void mlx5e_vport_context_update_a + int size; + int err; + int hi; ++ COMPAT_HL_NODE + + size = is_uc ? 0 : (priv->fs.l2.broadcast_enabled ? 1 : 0); + max_size = is_uc ? 
+@@ -677,6 +688,7 @@ static void mlx5e_apply_netdev_addr(stru + struct mlx5e_l2_hash_node *hn; + struct hlist_node *tmp; + int i; ++ COMPAT_HL_NODE + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i) + mlx5e_execute_l2_action(priv, hn); +@@ -690,6 +702,7 @@ static void mlx5e_handle_netdev_addr(str + struct mlx5e_l2_hash_node *hn; + struct hlist_node *tmp; + int i; ++ COMPAT_HL_NODE + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i) + hn->action = MLX5E_ACTION_DEL; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0147-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0147-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..8af050a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0147-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,209 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c + +Change-Id: I973a4836f862a727a66b10fd51c06b88fadd6e11 +--- + .../mellanox/mlx5/core/en_fs_ethtool.c | 39 +++++++++++++++++-- + 1 file changed, 35 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c +@@ -40,7 +40,11 @@ static int flow_type_to_traffic_type(u32 + + static u32 flow_type_mask(u32 flow_type) + { ++#ifdef HAVE_FLOW_RSS + return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS); ++#else ++ return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT); ++#endif + } + + struct mlx5e_ethtool_rule { +@@ -82,14 +86,18 @@ static struct mlx5e_ethtool_table *get_f + case ESP_V4_FLOW: + case ESP_V6_FLOW: + #endif ++#ifdef HAVE_IPV6_USER_FLOW + case TCP_V6_FLOW: + case UDP_V6_FLOW: ++#endif + max_tuples = ETHTOOL_NUM_L3_L4_FTS; + prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); + eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; + break; + case IP_USER_FLOW: ++#ifdef HAVE_IPV6_USER_FLOW + case IPV6_USER_FLOW: ++#endif + max_tuples = ETHTOOL_NUM_L3_L4_FTS; + prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); + eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; +@@ -163,6 +171,7 @@ set_ip4(void *headers_c, void *headers_v + MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IP); + } + ++#ifdef HAVE_IPV6_USER_FLOW + static void + set_ip6(void *headers_c, void *headers_v, __be32 ip6src_m[4], + __be32 ip6src_v[4], __be32 ip6dst_m[4], __be32 ip6dst_v[4]) +@@ -185,6 +194,7 @@ set_ip6(void *headers_c, void *headers_v + MLX5E_FTE_SET(headers_c, ethertype, 0xffff); + MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IPV6); + } ++#endif + + static void + set_tcp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v, +@@ -262,6 +272,7 @@ parse_ip4(void *headers_c, void *headers + } + } + ++#ifdef HAVE_IPV6_USER_FLOW + static void + parse_ip6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) + { +@@ -302,6 +313,7 @@ parse_udp6(void *headers_c, void *header + set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); + } ++#endif + + static void + parse_ether(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +@@ -365,6 +377,7 @@ static int set_flow_attrs(u32 *match_c, + case IP_USER_FLOW: + parse_ip4(outer_headers_c, outer_headers_v, fs); + break; ++#ifdef HAVE_IPV6_USER_FLOW + case TCP_V6_FLOW: + parse_tcp6(outer_headers_c, outer_headers_v, fs); + break; +@@ -374,6 +387,7 @@ static int set_flow_attrs(u32 *match_c, + case IPV6_USER_FLOW: + parse_ip6(outer_headers_c, 
outer_headers_v, fs); + break; ++#endif + case ETHER_FLOW: + parse_ether(outer_headers_c, outer_headers_v, fs); + break; +@@ -426,6 +440,7 @@ static int flow_get_tirn(struct mlx5e_pr + struct ethtool_rx_flow_spec *fs, + u32 rss_context, u32 *tirn) + { ++#ifdef HAVE_FLOW_RSS + if (fs->flow_type & FLOW_RSS) { + struct mlx5e_packet_merge_param pkt_merge_param; + struct mlx5e_rss *rss; +@@ -448,7 +463,9 @@ static int flow_get_tirn(struct mlx5e_pr + return err; + eth_rule->rss = rss; + mlx5e_rss_refcnt_inc(eth_rule->rss); +- } else { ++ } else ++#endif ++ { + struct mlx5e_params *params = &priv->channels.params; + enum mlx5e_rq_group group; + u16 ix; +@@ -460,6 +477,8 @@ static int flow_get_tirn(struct mlx5e_pr + mlx5e_rx_res_get_tirn_direct(priv->rx_res, ix); + } + ++ ++ + return 0; + } + +@@ -628,6 +647,7 @@ static int validate_ip4(struct ethtool_r + return ++ntuples; + } + ++#ifdef HAVE_IPV6_USER_FLOW + static int validate_ip6(struct ethtool_rx_flow_spec *fs) + { + struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec; +@@ -667,7 +687,7 @@ static int validate_tcpudp6(struct ethto + /* Flow is TCP/UDP */ + return ++ntuples; + } +- ++#endif + static int validate_vlan(struct ethtool_rx_flow_spec *fs) + { + if (fs->m_ext.vlan_etype || +@@ -680,7 +700,6 @@ static int validate_vlan(struct ethtool_ + + return 1; + } +- + static int validate_flow(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs) + { +@@ -731,6 +750,7 @@ static int validate_flow(struct mlx5e_pr + return ret; + num_tuples += ret; + break; ++#ifdef HAVE_IPV6_USER_FLOW + case TCP_V6_FLOW: + case UDP_V6_FLOW: + ret = validate_tcpudp6(fs); +@@ -744,6 +764,7 @@ static int validate_flow(struct mlx5e_pr + return ret; + num_tuples += ret; + break; ++#endif + default: + return -ENOTSUPP; + } +@@ -840,19 +861,23 @@ mlx5e_ethtool_get_flow(struct mlx5e_priv + return -EINVAL; + + list_for_each_entry(eth_rule, &priv->fs.ethtool.rules, list) { ++#ifdef HAVE_FLOW_RSS + int index; ++#endif + + if (eth_rule->flow_spec.location != location) + continue; + if (!info) + return 0; + info->fs = eth_rule->flow_spec; ++#ifdef HAVE_FLOW_RSS + if (!eth_rule->rss) + return 0; + index = mlx5e_rx_res_rss_index(priv->rx_res, eth_rule->rss); + if (index < 0) + return index; + info->rss_context = index; ++#endif + return 0; + } + +@@ -985,13 +1010,19 @@ static int mlx5e_get_rss_hash_opt(struct + return 0; + } + ++#ifdef CONFIG_MLX5_EN_RXNFC + int mlx5e_ethtool_set_rxnfc(struct mlx5e_priv *priv, struct ethtool_rxnfc *cmd) + { + int err = 0; + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: ++#ifdef HAVE_FLOW_RSS + err = mlx5e_ethtool_flow_replace(priv, &cmd->fs, cmd->rss_context); ++#else ++ err = mlx5e_ethtool_flow_replace(priv, &cmd->fs, 0); ++#endif ++ + break; + case ETHTOOL_SRXCLSRLDEL: + err = mlx5e_ethtool_flow_remove(priv, cmd->fs.location); +@@ -1032,4 +1063,4 @@ int mlx5e_ethtool_get_rxnfc(struct mlx5e + + return err; + } +- ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0149-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0149-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..447a544 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0149-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,689 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_rep.c + +Change-Id: I77d23db69e3c71784702c030ee1073b7eeb7b6da +--- + .../net/ethernet/mellanox/mlx5/core/en_rep.c | 305 
+++++++++++++++++- + 1 file changed, 292 insertions(+), 13 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +@@ -36,6 +36,10 @@ + #include + #include + #include ++#if defined(HAVE_UDP_TUNNEL_RX_INFO) && defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) \ ++ && defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++#include ++#endif + + #include "eswitch.h" + #include "mlx5_esw_devm.h" +@@ -61,6 +65,7 @@ + #include "en/ptp.h" + #include "en/tc/int_port.h" + #include ++#include "compat.h" + + #define MLX5E_REP_PARAMS_DEF_NUM_CHANNELS 1 + +@@ -227,9 +232,13 @@ static int mlx5e_rep_get_sset_count(stru + + static void + mlx5e_rep_get_ringparam(struct net_device *dev, ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *param) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + +@@ -238,9 +247,13 @@ mlx5e_rep_get_ringparam(struct net_devic + + static int + mlx5e_rep_set_ringparam(struct net_device *dev, ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *param) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + +@@ -264,23 +277,41 @@ static int mlx5e_rep_set_channels(struct + } + + static int mlx5e_rep_get_coalesce(struct net_device *netdev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(netdev); + +- return mlx5e_ethtool_get_coalesce(priv, coal, kernel_coal); ++ return mlx5e_ethtool_get_coalesce(priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS ++ coal, kernel_coal); ++#else ++ coal); ++#endif + } + + static int mlx5e_rep_set_coalesce(struct net_device *netdev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(netdev); + +- return mlx5e_ethtool_set_coalesce(priv, coal, kernel_coal, extack); ++ return mlx5e_ethtool_set_coalesce(priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS ++ coal, kernel_coal, extack); ++#else ++ coal); ++#endif + } + + static u32 mlx5e_rep_get_rxfh_key_size(struct net_device *netdev) +@@ -298,16 +329,20 @@ static u32 mlx5e_rep_get_rxfh_indir_size + } + + static const struct ethtool_ops mlx5e_rep_ethtool_ops = { ++#ifdef HAVE_SUPPORTED_COALESCE_PARAM + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USE_ADAPTIVE, ++#endif + .get_drvinfo = mlx5e_rep_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = mlx5e_rep_get_strings, + .get_sset_count = mlx5e_rep_get_sset_count, + .get_ethtool_stats = mlx5e_rep_get_ethtool_stats, ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + .get_link_ksettings = mlx5e_get_link_ksettings, + .set_link_ksettings = mlx5e_set_link_ksettings, ++#endif + .get_ringparam = mlx5e_rep_get_ringparam, + .set_ringparam = mlx5e_rep_set_ringparam, + .get_channels = mlx5e_rep_get_channels, +@@ -408,7 +443,9 @@ mlx5e_add_sqs_fwd_rules(struct mlx5e_pri + { + int sqs_per_channel = mlx5e_get_dcb_num_tc(&priv->channels.params); + struct mlx5_eswitch 
*esw = priv->mdev->priv.eswitch; ++#ifdef HAVE_XDP_SUPPORT + bool is_uplink_rep = mlx5e_is_uplink_rep(priv); ++#endif + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5e_channel *c; +@@ -424,10 +461,11 @@ mlx5e_add_sqs_fwd_rules(struct mlx5e_pri + /* +2 for xdpsqs, they don't exist on the ptp channel but will not be + * counted for by num_sqs. + */ ++#ifdef HAVE_XDP_SUPPORT + if (is_uplink_rep) + sqs_per_channel += 2; +- +- sqs = kvcalloc(num_txqs * sqs_per_channel, sizeof(*sqs), GFP_KERNEL); ++#endif ++ sqs = kcalloc(num_txqs * sqs_per_channel, sizeof(*sqs), GFP_KERNEL); + if (!sqs) + goto out; + +@@ -436,12 +474,14 @@ mlx5e_add_sqs_fwd_rules(struct mlx5e_pri + for (tc = 0; tc < c->num_tc; tc++) + sqs[num_sqs++] = c->sq[tc].sqn; + ++#ifdef HAVE_XDP_SUPPORT + if (is_uplink_rep) { + if (c->xdp) + sqs[num_sqs++] = c->rq_xdpsq.sqn; + + sqs[num_sqs++] = c->xdpsq.sqn; + } ++#endif + } + + if (ptp_sq) { +@@ -553,6 +593,49 @@ static int mlx5e_rep_close(struct net_de + return ret; + } + ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++int mlx5e_rep_get_phys_port_name(struct net_device *dev, ++ char *buf, size_t len) ++{ ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ struct mlx5e_rep_priv *rpriv; ++ struct mlx5_eswitch_rep *rep; ++ struct mlx5_eswitch *esw; ++ int ret = -EOPNOTSUPP; ++ unsigned int fn; ++ ++ if (!netif_device_present(dev)) ++ return -EOPNOTSUPP; ++ ++ esw = priv->mdev->priv.eswitch; ++ if (!esw || (!mlx5_sriov_is_enabled(esw->dev) && !is_mdev_switchdev_mode(esw->dev))) ++ return -EOPNOTSUPP; ++ ++ fn = mlx5_get_dev_index(priv->mdev); ++ if (fn >= MLX5_MAX_PORTS) ++ return -EOPNOTSUPP; ++ ++ rpriv = priv->ppriv; ++ if (!rpriv) ++ return -EOPNOTSUPP; ++ rep = rpriv->rep; ++ ++ if (rep->vport == MLX5_VPORT_UPLINK) ++ ret = snprintf(buf, len, "p%d", fn); ++ else if (rep->vport == MLX5_VPORT_PF) ++ ret = snprintf(buf, len, "pf%d", fn); ++ else if (mlx5_eswitch_is_vf_vport(priv->mdev->priv.eswitch, rep->vport)) ++ ret = snprintf(buf, len, "pf%dvf%d", fn, rep->vport - 1); ++ else ++ return -EOPNOTSUPP; ++ ++ if (ret >= len) ++ return -EOPNOTSUPP; ++ ++ return 0; ++} ++#endif ++ + bool mlx5e_is_uplink_rep(const struct mlx5e_priv *priv) + { + struct mlx5e_rep_priv *rpriv = priv->ppriv; +@@ -568,6 +651,7 @@ bool mlx5e_is_uplink_rep(const struct ml + return (rep->vport == MLX5_VPORT_UPLINK); + } + ++#if defined(HAVE_NDO_HAS_OFFLOAD_STATS_GETS_NET_DEVICE) || defined(HAVE_NDO_HAS_OFFLOAD_STATS_EXTENDED) + bool mlx5e_rep_has_offload_stats(const struct net_device *dev, int attr_id) + { + switch (attr_id) { +@@ -577,27 +661,54 @@ bool mlx5e_rep_has_offload_stats(const s + + return false; + } ++#endif + +-static void mlx5e_rep_get_port_parent_id(struct net_device *dev, +- struct netdev_phys_item_id *ppid) ++#if defined(HAVE_NDO_GET_PORT_PARENT_ID) || defined(HAVE_SWITCHDEV_OPS) || defined(HAVE_SWITCHDEV_H_COMPAT) ++#ifdef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++void ++#else ++int ++#endif ++mlx5e_rep_get_port_parent_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid) + { ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++ struct mlx5_eswitch *esw; ++#endif + struct mlx5e_priv *priv; + u64 parent_id; + + priv = netdev_priv(dev); ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++ esw = priv->mdev->priv.eswitch; ++ ++ if (!esw || (!mlx5_sriov_is_enabled(esw->dev) && !is_mdev_switchdev_mode(esw->dev))) ++ return -EOPNOTSUPP; ++#endif + + parent_id = mlx5_query_nic_system_image_guid(priv->mdev); + ppid->id_len = sizeof(parent_id); + 
memcpy(ppid->id, &parent_id, sizeof(parent_id)); ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++ return 0; ++#endif + } ++#endif + ++#if !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) && \ ++ !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) && \ ++ defined(HAVE_NDO_GET_PORT_PARENT_ID) + static int mlx5e_rep_sf_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) + { + mlx5e_rep_get_port_parent_id(dev, ppid); + return 0; + } ++#endif + ++#if !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) && \ ++ !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) && \ ++ (defined(HAVE_NDO_GET_PHYS_PORT_NAME) || defined(HAVE_NDO_GET_PHYS_PORT_NAME_EXTENDED)) + static int mlx5e_rep_sf_get_phys_port_name(struct net_device *dev, + char *buf, size_t len) + { +@@ -626,7 +737,9 @@ static int mlx5e_rep_sf_get_phys_port_na + #endif + return 0; + } ++#endif + ++#if defined(HAVE_NDO_GET_OFFLOAD_STATS) || defined(HAVE_NDO_GET_OFFLOAD_STATS_EXTENDED) + static int + mlx5e_get_sw_stats64(const struct net_device *dev, + struct rtnl_link_stats64 *stats) +@@ -647,15 +760,28 @@ int mlx5e_rep_get_offload_stats(int attr + + return -EINVAL; + } ++#endif + +-static void +-mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) ++static ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID ++void mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) ++#elif defined(HAVE_NDO_GET_STATS64) ++struct rtnl_link_stats64 * mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) ++#else ++struct net_device_stats * mlx5e_rep_get_stats(struct net_device *dev) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); ++#if !defined(HAVE_NDO_GET_STATS64) && !defined(HAVE_NDO_GET_STATS64_RET_VOID) ++ struct net_device_stats *stats = &priv->netdev_stats; ++#endif + + /* update HW stats in background for next time */ + mlx5e_queue_update_stats(priv); + memcpy(stats, &priv->stats.vf_vport, sizeof(*stats)); ++#ifndef HAVE_NDO_GET_STATS64_RET_VOID ++ return stats; ++#endif + } + + static int mlx5e_rep_change_mtu(struct net_device *netdev, int new_mtu) +@@ -663,6 +789,7 @@ static int mlx5e_rep_change_mtu(struct n + return mlx5e_change_mtu(netdev, new_mtu, NULL); + } + ++#ifdef HAVE_NDO_GET_DEVLINK_PORT + static struct devlink_port *mlx5e_rep_get_devlink_port(struct net_device *netdev) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -671,6 +798,7 @@ static struct devlink_port *mlx5e_rep_ge + + return mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + } ++#endif + + static int mlx5e_rep_change_carrier(struct net_device *dev, bool new_carrier) + { +@@ -696,30 +824,108 @@ static int mlx5e_rep_change_carrier(stru + } + + static const struct net_device_ops mlx5e_netdev_ops_rep = { ++#ifdef HAVE_NET_DEVICE_OPS_EXTENDED ++ .ndo_size = sizeof(struct net_device_ops), ++#endif + .ndo_open = mlx5e_rep_open, + .ndo_stop = mlx5e_rep_close, + .ndo_start_xmit = mlx5e_xmit, +- .ndo_setup_tc = mlx5e_rep_setup_tc, ++#ifdef CONFIG_MLX5_CLS_ACT ++#if defined(HAVE_TC_FLOWER_OFFLOAD) || defined(HAVE_FLOW_CLS_OFFLOAD) ++#ifdef HAVE_NDO_SETUP_TC_RH_EXTENDED ++ .extended.ndo_setup_tc_rh = mlx5e_rep_setup_tc, ++#else ++ .ndo_setup_tc = mlx5e_rep_setup_tc, ++#endif ++#endif ++#endif /* CONFIG_MLX5_CLS_ACT */ ++#ifdef HAVE_NDO_GET_DEVLINK_PORT + .ndo_get_devlink_port = mlx5e_rep_get_devlink_port, ++#else ++#ifdef HAVE_NDO_GET_PHYS_PORT_NAME ++ .ndo_get_phys_port_name = mlx5e_rep_get_phys_port_name, ++#elif 
defined(HAVE_NDO_GET_PHYS_PORT_NAME_EXTENDED) ++ .extended.ndo_get_phys_port_name = mlx5e_rep_get_phys_port_name, ++#endif ++#ifdef HAVE_NDO_GET_PORT_PARENT_ID ++ .ndo_get_port_parent_id = mlx5e_rep_get_port_parent_id, ++#endif ++#endif ++#if defined(HAVE_NDO_GET_STATS64) || defined(HAVE_NDO_GET_STATS64_RET_VOID) + .ndo_get_stats64 = mlx5e_rep_get_stats, ++#else ++ .ndo_get_stats = mlx5e_rep_get_stats, ++#endif ++#ifdef HAVE_NDO_HAS_OFFLOAD_STATS_GETS_NET_DEVICE + .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, ++#elif defined(HAVE_NDO_HAS_OFFLOAD_STATS_EXTENDED) ++ .extended.ndo_has_offload_stats = mlx5e_rep_has_offload_stats, ++#endif ++#ifdef HAVE_NDO_GET_OFFLOAD_STATS + .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, +- .ndo_change_mtu = mlx5e_rep_change_mtu, ++#elif defined(HAVE_NDO_GET_OFFLOAD_STATS_EXTENDED) ++ .extended.ndo_get_offload_stats = mlx5e_rep_get_offload_stats, ++#endif ++#ifdef HAVE_NDO_CHANGE_MTU_EXTENDED ++ .extended.ndo_change_mtu = mlx5e_rep_change_mtu, ++#else ++ .ndo_change_mtu = mlx5e_rep_change_mtu, ++#endif + .ndo_change_carrier = mlx5e_rep_change_carrier, + }; + + static const struct net_device_ops mlx5e_netdev_ops_rep_sf = { ++#ifdef HAVE_NET_DEVICE_OPS_EXTENDED ++ .ndo_size = sizeof(struct net_device_ops), ++#endif + .ndo_open = mlx5e_rep_open, + .ndo_stop = mlx5e_rep_close, + .ndo_start_xmit = mlx5e_xmit, ++#ifdef CONFIG_MLX5_CLS_ACT ++#if defined(HAVE_TC_FLOWER_OFFLOAD) || defined(HAVE_FLOW_CLS_OFFLOAD) ++#ifdef HAVE_NDO_SETUP_TC_RH_EXTENDED ++ .extended.ndo_setup_tc_rh = mlx5e_rep_setup_tc, ++#else + .ndo_setup_tc = mlx5e_rep_setup_tc, ++#endif ++#endif ++#endif ++#ifdef HAVE_NDO_GET_DEVLINK_PORT + .ndo_get_devlink_port = mlx5e_rep_get_devlink_port, +- .ndo_get_port_parent_id = mlx5e_rep_sf_port_parent_id, ++#endif ++#if !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) && \ ++ !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) ++#ifdef HAVE_NDO_GET_PHYS_PORT_NAME + .ndo_get_phys_port_name = mlx5e_rep_sf_get_phys_port_name, ++#elif defined(HAVE_NDO_GET_PHYS_PORT_NAME_EXTENDED) ++ .extended.ndo_get_phys_port_name = mlx5e_rep_sf_get_phys_port_name, ++#endif ++#endif /* !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) && !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) */ ++#if !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) && \ ++ !defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) && \ ++ defined(HAVE_NDO_GET_PORT_PARENT_ID) ++ .ndo_get_port_parent_id = mlx5e_rep_sf_port_parent_id, ++#endif ++#if defined(HAVE_NDO_GET_STATS64) || defined(HAVE_NDO_GET_STATS64_RET_VOID) + .ndo_get_stats64 = mlx5e_rep_get_stats, +- .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, +- .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, ++#else ++ .ndo_get_stats = mlx5e_rep_get_stats, ++#endif ++#ifdef HAVE_NDO_HAS_OFFLOAD_STATS_GETS_NET_DEVICE ++ .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, ++#elif defined(HAVE_NDO_HAS_OFFLOAD_STATS_EXTENDED) ++ .extended.ndo_has_offload_stats = mlx5e_rep_has_offload_stats, ++#endif ++#ifdef HAVE_NDO_GET_OFFLOAD_STATS ++ .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, ++#elif defined(HAVE_NDO_GET_OFFLOAD_STATS_EXTENDED) ++ .extended.ndo_get_offload_stats = mlx5e_rep_get_offload_stats, ++#endif ++#ifdef HAVE_NDO_CHANGE_MTU_EXTENDED ++ .extended.ndo_change_mtu = mlx5e_rep_change_mtu, ++#else + .ndo_change_mtu = mlx5e_rep_change_mtu, ++#endif + }; + + bool mlx5e_eswitch_uplink_rep(const struct net_device *netdev) +@@ -784,6 +990,12 @@ static void 
mlx5e_build_rep_params(struc + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_PER_CH_STATS, true); + } + ++#ifdef HAVE_SWITCHDEV_OPS ++static const struct switchdev_ops mlx5e_rep_switchdev_ops = { ++ .switchdev_port_attr_get = mlx5e_attr_get, ++}; ++#endif ++ + static void mlx5e_build_rep_netdev(struct net_device *netdev, + struct mlx5_core_dev *mdev, + struct mlx5_eswitch_rep *rep) +@@ -798,6 +1010,10 @@ static void mlx5e_build_rep_netdev(struc + eth_hw_addr_random(netdev); + netdev->ethtool_ops = &mlx5e_rep_ethtool_ops; + ++#ifdef HAVE_SWITCHDEV_OPS ++ netdev->switchdev_ops = &mlx5e_rep_switchdev_ops; ++#endif ++ + netdev->watchdog_timeo = 15 * HZ; + + #if IS_ENABLED(CONFIG_MLX5_CLS_ACT) +@@ -838,7 +1054,9 @@ static int mlx5e_init_ul_rep(struct mlx5 + if (err) + mlx5_core_err(mdev, "Uplink rep IPsec initialization failed, %d\n", err); + ++#if defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) && defined(HAVE_UDP_TUNNEL_NIC_INFO) + mlx5e_vxlan_set_netdev_info(priv); ++#endif + mutex_init(&priv->aso_lock); + return mlx5e_init_rep(mdev, netdev); + } +@@ -1122,6 +1340,7 @@ static int mlx5e_init_uplink_rep_tx(stru + mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev); + + mlx5e_rep_bond_init(rpriv); ++#if defined( HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + err = mlx5e_rep_tc_netdevice_event_register(rpriv); + if (err) { + mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n", +@@ -1134,12 +1353,20 @@ static int mlx5e_init_uplink_rep_tx(stru + err_event_reg: + mlx5e_rep_bond_cleanup(rpriv); + mlx5e_rep_tc_cleanup(rpriv); ++#endif + return err; + } + + static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) + { ++#if defined( HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + mlx5e_rep_tc_netdevice_event_unregister(rpriv); ++#ifndef HAVE_FLOW_INDR_DEV_REGISTER ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) ++ mlx5e_rep_indr_clean_block_privs(rpriv); ++#endif ++#endif /* HAVE_FLOW_INDR_DEV_REGISTER */ ++#endif + mlx5e_rep_bond_cleanup(rpriv); + mlx5e_rep_tc_cleanup(rpriv); + } +@@ -1255,11 +1482,19 @@ static void mlx5e_uplink_rep_enable(stru + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; ++#if defined(HAVE_NET_DEVICE_MIN_MAX_MTU) || defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) + u16 max_mtu; ++#endif + ++#ifdef HAVE_NET_DEVICE_MIN_MAX_MTU + netdev->min_mtu = ETH_MIN_MTU; + mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1); + netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu); ++#elif defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ netdev->extended->min_mtu = ETH_MIN_MTU; ++ mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1); ++ netdev->extended->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu); ++#endif + mlx5e_set_dev_port_mtu(priv); + + mlx5e_rep_tc_enable(priv); +@@ -1279,7 +1514,13 @@ static void mlx5e_uplink_rep_enable(stru + rtnl_lock(); + if (netif_running(netdev)) + mlx5e_open(netdev); ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + udp_tunnel_nic_reset_ntf(priv->netdev); ++#elif defined(HAVE_UDP_TUNNEL_RX_INFO) && defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) \ ++ && defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++ if (mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ udp_tunnel_get_rx_info(priv->netdev); ++#endif + netif_device_attach(netdev); + rtnl_unlock(); + +@@ -1294,6 +1535,13 @@ static void mlx5e_uplink_rep_disable(str + rtnl_lock(); + if (netif_running(priv->netdev)) + mlx5e_close(priv->netdev); ++#ifndef HAVE_UDP_TUNNEL_NIC_INFO ++#if 
defined(HAVE_UDP_TUNNEL_RX_INFO) && defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) \ ++ && defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++ if (mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ udp_tunnel_drop_rx_info(priv->netdev); ++#endif ++#endif + netif_device_detach(priv->netdev); + rtnl_unlock(); + +@@ -1303,7 +1551,10 @@ static void mlx5e_uplink_rep_disable(str + mlx5_notifier_unregister(mdev, &priv->events_nb); + mlx5e_rep_tc_disable(priv); + mlx5_lag_remove_netdev(mdev, priv->netdev); ++#if defined(HAVE_UDP_TUNNEL_NIC_INFO) && defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) \ ++ && defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) + mlx5_vxlan_reset_to_default(mdev->vxlan); ++#endif + } + + static MLX5E_DEFINE_STATS_GRP(sw_rep, 0); +@@ -1381,8 +1632,12 @@ static const struct mlx5e_profile mlx5e_ + .update_carrier = mlx5e_update_carrier, + .rx_handlers = &mlx5e_rx_handlers_rep, + .max_tc = MLX5E_MAX_NUM_TC, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + /* XSK is needed so we can replace profile with NIC netdev */ + .rq_groups = MLX5E_NUM_RQ_GROUPS(XSK), ++#else ++ .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), ++#endif + .stats_grps = mlx5e_ul_rep_stats_grps, + .stats_grps_num = mlx5e_ul_rep_stats_grps_num, + }; +@@ -1432,7 +1687,9 @@ mlx5e_vport_uplink_rep_load(struct mlx5_ + { + struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + struct devlink_port *dl_port; ++#endif + int err; + + rpriv->netdev = priv->netdev; +@@ -1449,9 +1706,11 @@ mlx5e_vport_uplink_rep_load(struct mlx5_ + if (err) + goto err_metadata_insert; + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_eth_set(dl_port, rpriv->netdev); ++#endif + + mlx5_smartnic_sysfs_init(rpriv->netdev); + mlx5_rep_sysfs_init(rpriv); +@@ -1468,7 +1727,9 @@ static void + mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) + { + struct net_device *netdev = rpriv->netdev; ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + struct devlink_port *dl_port; ++#endif + struct mlx5_core_dev *dev; + struct mlx5e_priv *priv; + +@@ -1480,9 +1741,11 @@ mlx5e_vport_uplink_rep_unload(struct mlx + + mlx5e_ipsec_ul_cleanup(priv); + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_clear(dl_port); ++#endif + + mlx5_smartnic_sysfs_cleanup(netdev); + if (test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state)) { +@@ -1499,7 +1762,9 @@ mlx5e_vport_vf_rep_load(struct mlx5_core + { + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + const struct mlx5e_profile *profile; ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + struct devlink_port *dl_port; ++#endif + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; +@@ -1538,6 +1803,10 @@ mlx5e_vport_vf_rep_load(struct mlx5_core + goto err_cleanup_profile; + } + ++ err = mlx5e_vport_rep_load_compat(priv); ++ if (err) ++ goto err_compat_cleanup; ++ + err = register_netdev(netdev); + if (err) { + netdev_warn(netdev, +@@ -1548,12 +1817,17 @@ mlx5e_vport_vf_rep_load(struct mlx5_core + + mlx5_rep_sysfs_init(rpriv); + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_eth_set(dl_port, netdev); ++#endif + mlx5_devm_sf_port_type_eth_set(dev, rpriv->rep->vport, 
netdev); + return 0; + ++err_compat_cleanup: ++ mlx5e_vport_rep_unload_compat(priv); ++ + err_detach_netdev: + mlx5e_detach_netdev(netdev_priv(netdev)); + +@@ -1598,7 +1872,9 @@ mlx5e_vport_rep_unload(struct mlx5_eswit + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *dev = priv->mdev; ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + struct devlink_port *dl_port; ++#endif + void *ppriv = priv->ppriv; + + mlx5_rep_destroy_miss_meter(dev, rpriv); +@@ -1612,10 +1888,13 @@ mlx5e_vport_rep_unload(struct mlx5_eswit + } + + mlx5e_rep_metadata_remove(priv, rep); ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_clear(dl_port); ++#endif + unregister_netdev(netdev); ++ mlx5e_vport_rep_unload_compat(priv); + mlx5e_detach_netdev(priv); + priv->profile->cleanup(priv); + mlx5e_destroy_netdev(priv); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0150-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0150-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..481be7c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0150-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,52 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_rep.h + +Change-Id: Ia463bd7f9d2f879fed94184c47bbcaef750aaede +--- + .../net/ethernet/mellanox/mlx5/core/en_rep.h | 22 +++++++++++++++++++ + 1 file changed, 22 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +@@ -92,6 +92,10 @@ struct mlx5_rep_uplink_priv { + + /* support eswitch vports bonding */ + struct mlx5e_rep_bond *bond; ++#ifndef HAVE_FLOW_INDR_DEV_REGISTER ++ struct notifier_block netdevice_nb; ++ struct netdev_net_notifier netdevice_nn; ++#endif + + /* tc tunneling encapsulation private data */ + struct mlx5e_tc_tun_encap *encap; +@@ -278,12 +282,30 @@ static inline bool mlx5e_eswitch_rep(con + mlx5e_eswitch_uplink_rep(netdev); + } + ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++int mlx5e_rep_get_phys_port_name(struct net_device *dev, ++ char *buf, size_t len); ++#endif ++ ++#if defined(HAVE_NDO_GET_PORT_PARENT_ID) || defined(HAVE_SWITCHDEV_OPS) || defined(HAVE_SWITCHDEV_H_COMPAT) ++#ifdef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++void ++#else ++int ++#endif ++mlx5e_rep_get_port_parent_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid); ++#endif + #else /* CONFIG_MLX5_ESWITCH */ + static inline bool mlx5e_is_uplink_rep(const struct mlx5e_priv *priv) { return false; } + static inline void mlx5e_rep_activate_channels(struct mlx5e_priv *priv) {} + static inline void mlx5e_rep_deactivate_channels(struct mlx5e_priv *priv) {} + static inline int mlx5e_rep_init(void) { return 0; }; + static inline void mlx5e_rep_cleanup(void) {}; ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++static inline int mlx5e_rep_get_phys_port_name(struct net_device *dev, ++ char *buf, size_t len) { return 0; } ++#endif + static inline bool mlx5e_rep_has_offload_stats(const struct net_device *dev, + int attr_id) { return false; } + static inline int mlx5e_rep_get_offload_stats(int attr_id, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0151-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch 
b/src/mlnx-ofa_kernel-5.8/backports/0151-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..81b68ec --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0151-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,887 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_rx.c + +Change-Id: Ie77d4916542912398f476d0bfffc87ee80b3ff84 +--- + .../net/ethernet/mellanox/mlx5/core/en_rx.c | 341 ++++++++++++++++-- + 1 file changed, 316 insertions(+), 25 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +@@ -35,9 +35,14 @@ + #include + #include + #include ++#include ++#ifdef HAVE_NET_PAGE_POOL_H + #include ++#endif + #include ++#ifdef HAVE_NET_GRO_H + #include ++#endif + #include + #include + #include "en.h" +@@ -59,6 +64,15 @@ + #include "devlink.h" + #include "en/devlink.h" + #include "esw/ipsec.h" ++#include "en/txrx.h" ++ ++static inline void mlx5e_set_skb_driver_xmit_more(struct sk_buff *skb, ++ struct mlx5e_rq *rq, ++ bool xmit_more) ++{ ++ if (test_bit(MLX5E_RQ_STATE_SKB_XMIT_MORE, &rq->state) && xmit_more) ++ skb->cb[47] = MLX5_XMIT_MORE_SKB_CB; ++} + + static struct sk_buff * + mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, +@@ -66,14 +80,18 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct m + static struct sk_buff * + mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); +-static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); +-static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); +-static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); ++static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, bool xmit_more); ++static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, bool xmit_more); ++#ifdef HAVE_SHAMPO_SUPPORT ++static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, bool xmit_more); ++#endif + + const struct mlx5e_rx_handlers mlx5e_rx_handlers_nic = { + .handle_rx_cqe = mlx5e_handle_rx_cqe, + .handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq, ++#ifdef HAVE_SHAMPO_SUPPORT + .handle_rx_cqe_mpwqe_shampo = mlx5e_handle_rx_cqe_mpwrq_shampo, ++#endif + }; + + static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) +@@ -193,9 +211,14 @@ static inline u32 mlx5e_decompress_cqes_ + mlx5e_read_mini_arr_slot(wq, cqd, cqcc); + + mlx5e_decompress_cqe_no_hash(rq, wq, cqcc); ++#ifdef HAVE_SHAMPO_SUPPORT + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe_mpwrq_shampo, mlx5e_handle_rx_cqe, +- rq, &cqd->title); ++ rq, &cqd->title, i < cqe_count - 1); ++#else ++ INDIRECT_CALL_2(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, ++ mlx5e_handle_rx_cqe, rq, &cqd->title, i < cqe_count - 1); ++#endif + } + mlx5e_cqes_update_owner(wq, cqcc - wq->cc); + wq->cc = cqcc; +@@ -215,9 +238,14 @@ static inline u32 mlx5e_decompress_cqes_ + mlx5e_read_title_slot(rq, wq, cc); + mlx5e_read_mini_arr_slot(wq, cqd, cc + 1); + mlx5e_decompress_cqe(rq, wq, cc); ++#ifdef HAVE_SHAMPO_SUPPORT + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe_mpwrq_shampo, mlx5e_handle_rx_cqe, +- rq, &cqd->title); ++ rq, &cqd->title, true); ++#else ++ INDIRECT_CALL_2(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, ++ 
mlx5e_handle_rx_cqe, rq, &cqd->title, true); ++#endif + cqd->mini_arr_idx++; + + return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1; +@@ -338,6 +366,11 @@ static inline bool mlx5e_rx_cache_extend + return true; + } + ++static inline bool mlx5e_page_is_reserved(struct page *page) ++{ ++ return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id(); ++} ++ + static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) + { +@@ -350,8 +383,11 @@ static inline bool mlx5e_rx_cache_put(st + return false; + } + } +- ++#ifdef HAVE_DEV_PAGE_IS_REUSABLE + if (!dev_page_is_reusable(dma_info->page)) { ++#else ++ if (unlikely(mlx5e_page_is_reserved(dma_info->page))) { ++#endif + stats->cache_waive++; + return false; + } +@@ -416,18 +452,31 @@ static inline int mlx5e_page_alloc_pool( + if (mlx5e_rx_cache_get(rq, dma_info)) + return 0; + +- dma_info->page = page_pool_dev_alloc_pages(rq->page_pool); ++#ifdef HAVE_NET_PAGE_POOL_H ++ dma_info->page = page_pool_dev_alloc_pages(rq->page_pool); ++#else ++ dma_info->page = dev_alloc_page(); ++#endif + if (unlikely(!dma_info->page)) + return -ENOMEM; + + dma_info->refcnt_bias = 0; + page_ref_elev(dma_info); + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + dma_info->addr = dma_map_page_attrs(rq->pdev, dma_info->page, 0, PAGE_SIZE, + rq->buff.map_dir, DMA_ATTR_SKIP_CPU_SYNC); ++#else ++ dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0, ++ PAGE_SIZE, rq->buff.map_dir); ++#endif + if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) { ++#ifdef HAVE_NET_PAGE_POOL_H + page_pool_recycle_direct(rq->page_pool, dma_info->page); + page_ref_sub(dma_info->page, dma_info->refcnt_bias); ++#else ++ mlx5e_put_page(dma_info); ++#endif + dma_info->page = NULL; + return -ENOMEM; + } +@@ -438,22 +487,33 @@ static inline int mlx5e_page_alloc_pool( + static inline int mlx5e_page_alloc(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) + { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + if (rq->xsk_pool) ++#else ++ if (rq->umem) ++#endif + return mlx5e_xsk_page_alloc_pool(rq, dma_info); + else ++#endif + return mlx5e_page_alloc_pool(rq, dma_info); + } + + void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info) + { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + dma_unmap_page_attrs(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir, + DMA_ATTR_SKIP_CPU_SYNC); ++#else ++ dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir); ++#endif + } + + void mlx5e_page_release_dynamic(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info, + bool recycle) + { ++#ifdef HAVE_NET_PAGE_POOL_H + if (likely(recycle)) { + if (mlx5e_rx_cache_put(rq, dma_info)) + return; +@@ -463,22 +523,45 @@ void mlx5e_page_release_dynamic(struct m + page_pool_recycle_direct(rq->page_pool, dma_info->page); + } else { + mlx5e_page_dma_unmap(rq, dma_info); ++#ifdef HAVE_PAGE_POOL_RELEASE_PAGE ++ /* This call to page_pool_release_page should be part of ++ * the base code, not backport, in the next rebase. 
++ */ + page_pool_release_page(rq->page_pool, dma_info->page); ++#endif + mlx5e_put_page(dma_info); + } ++#else ++ if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info)) ++ return; ++ ++ mlx5e_page_dma_unmap(rq, dma_info); ++ mlx5e_put_page(dma_info); ++#endif + } + + static inline void mlx5e_page_release(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info, + bool recycle) + { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + if (rq->xsk_pool) ++#else ++ if (rq->umem) ++#endif ++ + /* The `recycle` parameter is ignored, and the page is always + * put into the Reuse Ring, because there is no way to return + * the page to the userspace when the interface goes down. + */ ++#ifdef HAVE_XSK_BUFF_ALLOC + xsk_buff_free(dma_info->xsk); ++#else ++ mlx5e_xsk_page_release(rq, dma_info); ++#endif + else ++#endif + mlx5e_page_release_dynamic(rq, dma_info, recycle); + } + +@@ -558,17 +641,28 @@ static int mlx5e_alloc_rx_wqes(struct ml + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + int err; + int i; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *xsk_pool = rq->xsk_pool; ++#else ++ struct xdp_umem *xsk_pool = rq->umem; ++#endif + +- if (rq->xsk_pool) { ++ if (xsk_pool) { + int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags; + + /* Check in advance that we have enough frames, instead of + * allocating one-by-one, failing and moving frames to the + * Reuse Ring. + */ +- if (unlikely(!xsk_buff_can_alloc(rq->xsk_pool, pages_desired))) ++#ifdef HAVE_XSK_BUFF_ALLOC ++ if (unlikely(!xsk_buff_can_alloc(xsk_pool, pages_desired))) ++#else ++ if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired))) ++#endif + return -ENOMEM; + } ++#endif + + for (i = 0; i < wqe_bulk; i++) { + struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i); +@@ -631,6 +725,7 @@ static void mlx5e_mpwqe_page_release(str + static void + mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle) + { ++#ifdef HAVE_XDP_SUPPORT + bool no_xdp_xmit; + struct mlx5e_dma_info *dma_info = wi->umr.dma_info; + int i; +@@ -645,6 +740,13 @@ mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, + for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) + if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap)) + mlx5e_mpwqe_page_release(rq, &dma_info[i], recycle); ++#else ++ struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0]; ++ int i; ++ ++ for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) ++ mlx5e_mpwqe_page_release(rq, dma_info, recycle); ++#endif + } + + static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq, u8 n) +@@ -658,7 +760,11 @@ static void mlx5e_post_rx_mpwqe(struct m + } while (--n); + + /* ensure wqes are visible to device before updating doorbell record */ ++#ifdef dma_wmb + dma_wmb(); ++#else ++ wmb(); ++#endif + + mlx5_wq_ll_update_db_record(wq); + +@@ -828,11 +934,22 @@ static int mlx5e_alloc_rx_mpwqe(struct m + /* Check in advance that we have enough frames, instead of allocating + * one-by-one, failing and moving frames to the Reuse Ring. 
+ */ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + if (rq->xsk_pool && + unlikely(!xsk_buff_can_alloc(rq->xsk_pool, MLX5_MPWRQ_PAGES_PER_WQE))) { ++#elif defined(HAVE_XSK_BUFF_ALLOC) ++ if (rq->umem && ++ unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) { ++ ++#else ++ if (rq->umem && ++ unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) { ++#endif + err = -ENOMEM; + goto err; + } ++#endif + + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { + err = mlx5e_alloc_rx_hd_mpwqe(rq); +@@ -851,7 +968,9 @@ static int mlx5e_alloc_rx_mpwqe(struct m + umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR); + } + ++#ifdef HAVE_XDP_SUPPORT + bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); ++#endif + wi->consumed_strides = 0; + + umr_wqe->ctrl.opmod_idx_opcode = +@@ -943,8 +1062,10 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_ + if (mlx5_wq_cyc_missing(wq) < wqe_bulk) + return false; + ++#ifdef HAVE_PAGE_POLL_NID_CHANGED + if (rq->page_pool) + page_pool_nid_changed(rq->page_pool, numa_mem_id()); ++#endif + + do { + u16 head = mlx5_wq_cyc_get_head(wq); +@@ -959,7 +1080,11 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_ + } while (mlx5_wq_cyc_missing(wq) >= wqe_bulk); + + /* ensure wqes are visible to device before updating doorbell record */ ++#ifdef dma_wmb + dma_wmb(); ++#else ++ wmb(); ++#endif + + mlx5_wq_cyc_update_db_record(wq); + +@@ -991,7 +1116,7 @@ void mlx5e_free_icosq_descs(struct mlx5e + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + wi = &sq->db.wqe_info[ci]; + sqcc += wi->num_wqebbs; +-#ifdef CONFIG_MLX5_EN_TLS ++#if defined(HAVE_KTLS_RX_SUPPORT) && defined (CONFIG_MLX5_EN_TLS) + switch (wi->wqe_type) { + case MLX5E_ICOSQ_WQE_SET_PSV_TLS: + mlx5e_ktls_handle_ctx_completion(wi); +@@ -1086,7 +1211,7 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *c + case MLX5E_ICOSQ_WQE_SHAMPO_HD_UMR: + mlx5e_handle_shampo_hd_umr(wi->shampo, sq); + break; +-#ifdef CONFIG_MLX5_EN_TLS ++#if defined(HAVE_KTLS_RX_SUPPORT) && defined (CONFIG_MLX5_EN_TLS) + case MLX5E_ICOSQ_WQE_UMR_TLS: + break; + case MLX5E_ICOSQ_WQE_SET_PSV_TLS: +@@ -1138,8 +1263,10 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_ + if (likely(missing < UMR_WQE_BULK)) + return false; + ++#ifdef HAVE_PAGE_POLL_NID_CHANGED + if (rq->page_pool) + page_pool_nid_changed(rq->page_pool, numa_mem_id()); ++#endif + + head = rq->mpwqe.actual_wq_head; + i = missing; +@@ -1166,8 +1293,14 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_ + * the driver when it refills the Fill Ring. + * 2. Otherwise, busy poll by rescheduling the NAPI poll. + */ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + if (unlikely(alloc_err == -ENOMEM && rq->xsk_pool)) ++#else ++ if (unlikely(alloc_err == -ENOMEM && rq->umem)) ++#endif + return true; ++#endif + + return false; + } +@@ -1242,6 +1375,7 @@ static void mlx5e_lro_update_hdr(struct + } + } + ++#ifdef HAVE_SHAMPO_SUPPORT + static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index) + { + struct mlx5e_dma_info *last_head = &rq->mpwqe.shampo->info[header_index]; +@@ -1377,6 +1511,7 @@ static void mlx5e_shampo_update_hdr(stru + mlx5e_shampo_update_ipv6_udp_hdr(rq, ipv6); + } + } ++#endif /* HAVE_SHAMPO_SUPPORT */ + + static inline void mlx5e_skb_set_hash(struct mlx5_cqe64 *cqe, + struct sk_buff *skb) +@@ -1415,7 +1550,11 @@ static inline void mlx5e_enable_ecn(stru + + ip = skb->data + network_depth; + rc = ((proto == htons(ETH_P_IP)) ? 
IP_ECN_set_ce((struct iphdr *)ip) : ++#ifdef HAVE_IP6_SET_CE_2_PARAMS + IP6_ECN_set_ce(skb, (struct ipv6hdr *)ip)); ++#else ++ IP6_ECN_set_ce((struct ipv6hdr *)ip)); ++#endif + + rq->stats->ecn_mark += !!rc; + } +@@ -1559,8 +1698,8 @@ csum_unnecessary: + (cqe->hds_ip_ext & CQE_L4_OK))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (cqe_is_tunneled(cqe)) { +- skb->csum_level = 1; +- skb->encapsulation = 1; ++ skb->csum_level = 1; ++ skb->encapsulation = 1; + stats->csum_unnecessary_inner++; + return; + } +@@ -1582,6 +1721,10 @@ static inline void mlx5e_build_rx_skb(st + u8 lro_num_seg = be32_to_cpu(cqe->srqn) >> 24; + struct mlx5e_rq_stats *stats = rq->stats; + struct net_device *netdev = rq->netdev; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ u8 l4_hdr_type; ++#endif + + skb->mac_len = ETH_HLEN; + +@@ -1602,6 +1745,12 @@ static inline void mlx5e_build_rx_skb(st + stats->packets += lro_num_seg - 1; + stats->lro_packets++; + stats->lro_bytes += cqe_bcnt; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) ++ /* Flush GRO to avoid OOO packets, since GSO bypasses the ++ * GRO queue. This was fixed in dev_gro_receive() in kernel 4.10 ++ */ ++ napi_gro_flush(rq->cq.napi, false); ++#endif + } + + if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) +@@ -1620,7 +1769,16 @@ static inline void mlx5e_build_rx_skb(st + + skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK; + ++#ifndef CONFIG_COMPAT_LRO_ENABLED_IPOIB + mlx5e_handle_csum(netdev, cqe, rq, skb, !!lro_num_seg); ++#else ++ l4_hdr_type = get_cqe_l4_hdr_type(cqe); ++ mlx5e_handle_csum(netdev, cqe, rq, skb, ++ !!lro_num_seg || ++ (IS_SW_LRO(&priv->channels.params) && ++ (l4_hdr_type != CQE_L4_HDR_TYPE_NONE) && ++ (l4_hdr_type != CQE_L4_HDR_TYPE_UDP))); ++#endif + /* checking CE bit in cqe - MSB in ml_path field */ + if (unlikely(cqe->ml_path & MLX5E_CE_BIT_MASK)) + mlx5e_enable_ecn(rq, skb); +@@ -1631,6 +1789,7 @@ static inline void mlx5e_build_rx_skb(st + stats->mcast_packets++; + } + ++#ifdef HAVE_SHAMPO_SUPPORT + static void mlx5e_shampo_complete_rx_cqe(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, +@@ -1651,6 +1810,7 @@ static void mlx5e_shampo_complete_rx_cqe + rq->hw_gro_data->skb = NULL; + } + } ++#endif + + static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, +@@ -1688,12 +1848,40 @@ struct sk_buff *mlx5e_build_linear_skb(s + return skb; + } + ++#ifdef HAVE_XDP_SUPPORT + static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, + u32 len, struct xdp_buff *xdp) + { ++#ifdef HAVE_XDP_INIT_BUFF + xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); + xdp_prepare_buff(xdp, va, headroom, len, false); ++#else ++ unsigned char *data = va + headroom; ++ ++#ifdef HAVE_XDP_RXQ_INFO ++ xdp->rxq = &rq->xdp_rxq; ++#endif ++#ifdef HAVE_XDP_BUFF_HAS_FRAME_SZ ++ xdp->frame_sz = rq->buff.frame0_sz; ++#endif ++ xdp->data_hard_start = va; ++ xdp->data = data; ++ xdp->data_end = data + len; ++#ifdef HAVE_XDP_SET_DATA_META_INVALID ++ xdp_set_data_meta_invalid(xdp); ++#endif ++#endif ++} ++ ++#if !defined(HAVE_XSK_BUFF_ALLOC) && defined(HAVE_XSK_ZERO_COPY_SUPPORT) ++void mlx5e_fill_xdp_buff_for_old_xsk(struct mlx5e_rq *rq, void *va, u16 headroom, ++ u32 len, struct xdp_buff *xdp, struct mlx5e_dma_info *di) ++{ ++ mlx5e_fill_xdp_buff(rq, va, headroom, len, xdp); ++ xdp->handle = di->xsk.handle; + } ++#endif ++#endif /* HAVE_XDP_SUPPORT */ + + static struct sk_buff * + mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct 
mlx5_cqe64 *cqe, +@@ -1701,7 +1889,9 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_r + { + struct mlx5e_dma_info *di = wi->di; + u16 rx_headroom = rq->buff.headroom; ++#ifdef HAVE_XDP_SUPPORT + struct xdp_buff xdp; ++#endif + struct sk_buff *skb; + void *va, *data; + u32 frag_size; +@@ -1715,11 +1905,13 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_r + net_prefetchw(va); /* xdp_frame data area */ + net_prefetch(data); + ++#ifdef HAVE_XDP_SUPPORT + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp); + if (mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp)) + return NULL; /* page/packet was consumed by XDP */ + + rx_headroom = xdp.data - xdp.data_hard_start; ++#endif + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); + skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); + if (unlikely(!skb)) +@@ -1794,8 +1986,12 @@ static void mlx5e_handle_rx_err_cqe(stru + rq->stats->wqe_err++; + } + +-static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ++ bool xmit_more) + { ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_priv *priv = netdev_priv(rq->netdev); ++#endif + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; +@@ -1828,12 +2024,19 @@ static void mlx5e_handle_rx_cqe(struct m + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + ++ mlx5e_set_skb_driver_xmit_more(skb, rq, xmit_more); ++ + if (mlx5e_cqe_regb_chain(cqe)) + if (!mlx5e_tc_update_skb(cqe, skb)) { + dev_kfree_skb_any(skb); + goto free_wqe; + } + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (IS_SW_LRO(&priv->channels.params)) ++ lro_receive_skb(&rq->sw_lro->lro_mgr, skb, NULL); ++ else ++#endif + napi_gro_receive(rq->cq.napi, skb); + + free_wqe: +@@ -1902,7 +2105,8 @@ static bool mlx5e_rep_lookup_and_update( + return true; + } + +-static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ++ bool xmit_more) + { + struct net_device *netdev = rq->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -1954,7 +2158,8 @@ wq_cyc_pop: + mlx5_wq_cyc_pop(wq); + } + +-static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ++ bool xmit_more) + { + u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); + u16 wqe_id = be16_to_cpu(cqe->wqe_id); +@@ -2081,7 +2286,9 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct m + struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; + u16 rx_headroom = rq->buff.headroom; + u32 cqe_bcnt32 = cqe_bcnt; ++#ifdef HAVE_XDP_SUPPORT + struct xdp_buff xdp; ++#endif + struct sk_buff *skb; + void *va, *data; + u32 frag_size; +@@ -2101,6 +2308,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct m + net_prefetchw(va); /* xdp_frame data area */ + net_prefetch(data); + ++#ifdef HAVE_XDP_SUPPORT + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp); + if (mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp)) { + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) +@@ -2109,6 +2317,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct m + } + + rx_headroom = xdp.data - xdp.data_hard_start; ++#endif + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); + skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); + if (unlikely(!skb)) +@@ -2120,6 +2329,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct m + return skb; + } + ++#ifdef HAVE_SHAMPO_SUPPORT + 
static struct sk_buff * + mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + struct mlx5_cqe64 *cqe, u16 header_index) +@@ -2217,7 +2427,7 @@ mlx5e_free_rx_shampo_hd_entry(struct mlx + bitmap_clear(shampo->bitmap, header_index, 1); + } + +-static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, bool xmit_more) + { + u16 data_bcnt = mpwrq_get_cqe_byte_cnt(cqe) - cqe->shampo.header_size; + u16 header_index = mlx5e_shampo_get_cqe_header_index(rq, cqe); +@@ -2301,9 +2511,14 @@ mpwrq_cqe_out: + mlx5e_free_rx_mpwqe(rq, wi, true); + mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index); + } ++#endif /* HAVE_SHAMPO_SUPPORT */ + +-static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ++ bool xmit_more) + { ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_priv *priv = netdev_priv(rq->netdev); ++#endif + u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); + u16 wqe_id = be16_to_cpu(cqe->wqe_id); + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[wqe_id]; +@@ -2342,12 +2557,19 @@ static void mlx5e_handle_rx_cqe_mpwrq(st + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + ++ mlx5e_set_skb_driver_xmit_more(skb, rq, xmit_more); ++ + if (mlx5e_cqe_regb_chain(cqe)) + if (!mlx5e_tc_update_skb(cqe, skb)) { + dev_kfree_skb_any(skb); + goto mpwrq_cqe_out; + } + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (IS_SW_LRO(&priv->channels.params)) ++ lro_receive_skb(&rq->sw_lro->lro_mgr, skb, NULL); ++ else ++#endif + napi_gro_receive(rq->cq.napi, skb); + + mpwrq_cqe_out: +@@ -2364,8 +2586,17 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq + { + struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq); + struct mlx5_cqwq *cqwq = &cq->wq; +- struct mlx5_cqe64 *cqe; ++ struct mlx5_cqe64 *cqe, *next_cqe; + int work_done = 0; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_priv *priv; ++#ifdef CONFIG_MLX5_CORE_IPOIB ++ if (MLX5_CAP_GEN(cq->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ++ priv = mlx5i_epriv(rq->netdev); ++ else ++#endif ++ priv = netdev_priv(rq->netdev); ++#endif + + if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state))) + return 0; +@@ -2388,27 +2619,46 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq + work_done += + mlx5e_decompress_cqes_start(rq, cqwq, + budget - work_done); ++ if (work_done == budget) ++ break; ++ cqe = mlx5_cqwq_get_cqe(&cq->wq); + continue; + } + + mlx5_cqwq_pop(cqwq); + ++ next_cqe = mlx5_cqwq_get_cqe(&cq->wq); ++#ifdef HAVE_SHAMPO_SUPPORT + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq_shampo, +- rq, cqe); +- } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); ++ rq, cqe, next_cqe && (work_done & 0xf)); ++#else ++ INDIRECT_CALL_2(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, ++ mlx5e_handle_rx_cqe, rq, cqe, ++ next_cqe && (work_done & 0xf)); ++#endif ++ cqe = next_cqe; ++ } while ((++work_done < budget) && cqe); + + out: ++#ifdef HAVE_SHAMPO_SUPPORT + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state) && rq->hw_gro_data->skb) + mlx5e_shampo_flush_skb(rq, NULL, false); ++#endif + ++#ifdef HAVE_XDP_SUPPORT + if (rcu_access_pointer(rq->xdp_prog)) + mlx5e_xdp_rx_poll_complete(rq); ++#endif + + mlx5_cqwq_update_db_record(cqwq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB 
++ if (IS_SW_LRO(&priv->channels.params)) ++ lro_flush_all(&rq->sw_lro->lro_mgr); ++#endif + + return work_done; + } +@@ -2434,6 +2684,9 @@ static inline void mlx5i_complete_rx_cqe + u32 qpn; + u8 *dgid; + u8 g; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_priv *parent_priv = mlx5i_epriv(rq->netdev); ++#endif + + qpn = be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff; + netdev = mlx5i_pkey_get_netdev(rq->netdev, qpn); +@@ -2476,6 +2729,12 @@ static inline void mlx5i_complete_rx_cqe + + skb->protocol = *((__be16 *)(skb->data)); + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (parent_priv->netdev->features & NETIF_F_LRO) { ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ } else ++#endif ++ + if ((netdev->features & NETIF_F_RXCSUM) && + (likely((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) { +@@ -2511,8 +2770,12 @@ static inline void mlx5i_complete_rx_cqe + } + } + +-static void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ++ bool xmit_more) + { ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_priv *priv = mlx5i_epriv(rq->netdev); ++#endif + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; +@@ -2540,6 +2803,12 @@ static void mlx5i_handle_rx_cqe(struct m + dev_kfree_skb_any(skb); + goto wq_free_wqe; + } ++ mlx5e_set_skb_driver_xmit_more(skb, rq, xmit_more); ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (priv->netdev->features & NETIF_F_LRO) ++ lro_receive_skb(&rq->sw_lro->lro_mgr, skb, NULL); ++ else ++#endif + napi_gro_receive(rq->cq.napi, skb); + + wq_free_wqe: +@@ -2555,8 +2824,12 @@ const struct mlx5e_rx_handlers mlx5i_rx_ + + #ifdef CONFIG_MLX5_EN_IPSEC + +-static void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ++ bool xmit_more) + { ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ struct mlx5e_priv *priv = mlx5i_epriv(rq->netdev); ++#endif + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; +@@ -2584,6 +2857,7 @@ static void mlx5e_ipsec_handle_rx_cqe(st + goto wq_free_wqe; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); ++ mlx5e_set_skb_driver_xmit_more(skb, rq, xmit_more); + napi_gro_receive(rq->cq.napi, skb); + + wq_free_wqe: +@@ -2606,11 +2880,19 @@ int mlx5e_rq_set_handlers(struct mlx5e_r + + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + rq->mpwqe.skb_from_cqe_mpwrq = xsk ? + mlx5e_xsk_skb_from_cqe_mpwrq_linear : + mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL) ? + mlx5e_skb_from_cqe_mpwrq_linear : + mlx5e_skb_from_cqe_mpwrq_nonlinear; ++#else ++ rq->mpwqe.skb_from_cqe_mpwrq = ++ mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL) ? ++ mlx5e_skb_from_cqe_mpwrq_linear : ++ mlx5e_skb_from_cqe_mpwrq_nonlinear; ++#endif ++ + rq->post_wqes = mlx5e_post_rx_mpwqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe; + +@@ -2634,11 +2916,17 @@ int mlx5e_rq_set_handlers(struct mlx5e_r + + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ +- rq->wqe.skb_from_cqe = xsk ? ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ rq->wqe.skb_from_cqe = xsk ? + mlx5e_xsk_skb_from_cqe_linear : + mlx5e_rx_is_linear_skb(params, NULL) ? + mlx5e_skb_from_cqe_linear : + mlx5e_skb_from_cqe_nonlinear; ++#else ++ rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(params, NULL) ? 
++ mlx5e_skb_from_cqe_linear : ++ mlx5e_skb_from_cqe_nonlinear; ++#endif + rq->post_wqes = mlx5e_post_rx_wqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + +@@ -2658,7 +2946,9 @@ int mlx5e_rq_set_handlers(struct mlx5e_r + return 0; + } + +-static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT ++static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ++ bool xmit_more) + { + struct mlx5e_priv *priv = netdev_priv(rq->netdev); + struct mlx5_wq_cyc *wq = &rq->wqe.wq; +@@ -2704,3 +2994,4 @@ void mlx5e_rq_set_trap_handlers(struct m + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + rq->handle_rx_cqe = mlx5e_trap_handle_rx_cqe; + } ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0152-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0152-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..c3a1b69 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0152-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,38 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c + +Change-Id: I142e7f028ca1442dbc494e8552a5f30ead272ced +--- + drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +@@ -95,14 +95,14 @@ static struct sk_buff *mlx5e_test_get_ud + skb_reserve(skb, NET_IP_ALIGN); + + /* Reserve for ethernet and IP header */ +- ethh = skb_push(skb, ETH_HLEN); ++ ethh = (struct ethhdr *)skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); +- iph = skb_put(skb, sizeof(struct iphdr)); ++ iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)); + + skb_set_transport_header(skb, skb->len); +- udph = skb_put(skb, sizeof(struct udphdr)); ++ udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + + /* Fill ETH header */ + ether_addr_copy(ethh->h_dest, priv->netdev->dev_addr); +@@ -131,7 +131,7 @@ static struct sk_buff *mlx5e_test_get_ud + ip_send_check(iph); + + /* Fill test header and data */ +- mlxh = skb_put(skb, sizeof(*mlxh)); ++ mlxh = (struct mlx5ehdr *)skb_put(skb, sizeof(*mlxh)); + mlxh->version = 0; + mlxh->magic = cpu_to_be64(MLX5E_TEST_MAGIC); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0153-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0153-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..0a50542 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0153-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,531 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_stats.c + +Change-Id: If27afd1e3f441463db9f87183b1778831840c142 +--- + .../ethernet/mellanox/mlx5/core/en_stats.c | 229 +++++++++++++----- + 1 file changed, 163 insertions(+), 66 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +@@ -115,7 +115,7 @@ static const struct counter_desc sw_stat + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_mpwqe_blks) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_mpwqe_pkts) }, + +-#ifdef CONFIG_MLX5_EN_TLS ++#if defined(CONFIG_MLX5_EN_TLS) && 
defined(HAVE_UAPI_LINUX_TLS_H) + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_encrypted_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_encrypted_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_ooo) }, +@@ -142,6 +142,7 @@ static const struct counter_desc sw_stat + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete_tail) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete_tail_slow) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) }, ++#ifdef HAVE_XDP_SUPPORT + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_redirect) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_xmit) }, +@@ -151,6 +152,7 @@ static const struct counter_desc sw_stat + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe) }, ++#endif + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_none) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) }, +@@ -161,6 +163,12 @@ static const struct counter_desc sw_stat + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) }, ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_sw_lro_aggregated) }, ++ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_sw_lro_flushed) }, ++ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_sw_lro_no_desc) }, ++#endif ++#ifdef HAVE_XDP_SUPPORT + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_xmit) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_mpwqe) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_inlnw) }, +@@ -168,6 +176,9 @@ static const struct counter_desc sw_stat + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_cqes) }, ++#endif ++ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_compress_blks) }, ++ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_compress_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler_cqes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler_strides) }, +@@ -249,6 +260,27 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_S + return idx; + } + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++static void mlx5e_update_sw_lro_stats(struct mlx5e_priv *priv) ++{ ++ int i; ++ struct mlx5e_sw_stats *s = &priv->stats.sw; ++ ++ s->rx_sw_lro_aggregated = 0; ++ s->rx_sw_lro_flushed = 0; ++ s->rx_sw_lro_no_desc = 0; ++ ++ for (i = 0; i < priv->channels.num; i++) { ++ struct mlx5e_sw_lro *sw_lro = &priv->sw_lro[i]; ++ ++ s->rx_sw_lro_aggregated += sw_lro->lro_mgr.stats.aggregated; ++ s->rx_sw_lro_flushed += sw_lro->lro_mgr.stats.flushed; ++ s->rx_sw_lro_no_desc += sw_lro->lro_mgr.stats.no_desc; ++ } ++} ++#endif ++ ++ + static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(sw) + { + int i; +@@ -258,6 +290,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_S + return idx; + } + ++#ifdef HAVE_XDP_SUPPORT + static void mlx5e_stats_grp_sw_update_stats_xdp_red(struct mlx5e_sw_stats *s, + struct mlx5e_xdpsq_stats *xdpsq_red_stats) + { +@@ -269,7 +302,9 @@ static void mlx5e_stats_grp_sw_update_st + 
s->tx_xdp_err += xdpsq_red_stats->err; + s->tx_xdp_cqes += xdpsq_red_stats->cqes; + } ++#endif + ++#ifdef HAVE_XDP_SUPPORT + static void mlx5e_stats_grp_sw_update_stats_xdpsq(struct mlx5e_sw_stats *s, + struct mlx5e_xdpsq_stats *xdpsq_stats) + { +@@ -281,7 +316,8 @@ static void mlx5e_stats_grp_sw_update_st + s->rx_xdp_tx_err += xdpsq_stats->err; + s->rx_xdp_tx_cqe += xdpsq_stats->cqes; + } +- ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + static void mlx5e_stats_grp_sw_update_stats_xsksq(struct mlx5e_sw_stats *s, + struct mlx5e_xdpsq_stats *xsksq_stats) + { +@@ -316,6 +352,7 @@ static void mlx5e_stats_grp_sw_update_st + s->rx_xsk_congst_umr += xskrq_stats->congst_umr; + s->rx_xsk_arfs_err += xskrq_stats->arfs_err; + } ++#endif + + static void mlx5e_stats_grp_sw_update_stats_rq_stats(struct mlx5e_sw_stats *s, + struct mlx5e_rq_stats *rq_stats) +@@ -337,8 +374,10 @@ static void mlx5e_stats_grp_sw_update_st + s->rx_csum_complete_tail_slow += rq_stats->csum_complete_tail_slow; + s->rx_csum_unnecessary += rq_stats->csum_unnecessary; + s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner; ++#ifdef HAVE_XDP_SUPPORT + s->rx_xdp_drop += rq_stats->xdp_drop; + s->rx_xdp_redirect += rq_stats->xdp_redirect; ++#endif + s->rx_wqe_err += rq_stats->wqe_err; + s->rx_mpwqe_filler_cqes += rq_stats->mpwqe_filler_cqes; + s->rx_mpwqe_filler_strides += rq_stats->mpwqe_filler_strides; +@@ -406,7 +445,7 @@ static void mlx5e_stats_grp_sw_update_st + s->tx_csum_partial_inner += sq_stats->csum_partial_inner; + s->tx_csum_none += sq_stats->csum_none; + s->tx_csum_partial += sq_stats->csum_partial; +-#ifdef CONFIG_MLX5_EN_TLS ++#if defined(CONFIG_MLX5_EN_TLS) && defined(HAVE_UAPI_LINUX_TLS_H) + s->tx_tls_encrypted_packets += sq_stats->tls_encrypted_packets; + s->tx_tls_encrypted_bytes += sq_stats->tls_encrypted_bytes; + s->tx_tls_ooo += sq_stats->tls_ooo; +@@ -418,6 +457,8 @@ static void mlx5e_stats_grp_sw_update_st + s->tx_tls_drop_bypass_req += sq_stats->tls_drop_bypass_req; + #endif + s->tx_cqes += sq_stats->cqes; ++ s->tx_cqe_compress_blks += sq_stats->cqe_compress_blks; ++ s->tx_cqe_compress_pkts += sq_stats->cqe_compress_pkts; + } + + static void mlx5e_stats_grp_sw_update_stats_ptp(struct mlx5e_priv *priv, +@@ -478,13 +519,17 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE + int j; + + mlx5e_stats_grp_sw_update_stats_rq_stats(s, &channel_stats->rq); ++#ifdef HAVE_XDP_SUPPORT + mlx5e_stats_grp_sw_update_stats_xdpsq(s, &channel_stats->rq_xdpsq); + mlx5e_stats_grp_sw_update_stats_ch_stats(s, &channel_stats->ch); + /* xdp redirect */ + mlx5e_stats_grp_sw_update_stats_xdp_red(s, &channel_stats->xdpsq); ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + /* AF_XDP zero-copy */ + mlx5e_stats_grp_sw_update_stats_xskrq(s, &channel_stats->xskrq); + mlx5e_stats_grp_sw_update_stats_xsksq(s, &channel_stats->xsksq); ++#endif + + for (j = 0; j < priv->max_opened_tc; j++) { + mlx5e_stats_grp_sw_update_stats_sq(s, &channel_stats->sq[j]); +@@ -495,6 +540,9 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE + } + mlx5e_stats_grp_sw_update_stats_ptp(priv, s); + mlx5e_stats_grp_sw_update_stats_qos(priv, s); ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ mlx5e_update_sw_lro_stats(priv); ++#endif + } + + static const struct counter_desc q_stats_desc[] = { +@@ -797,6 +845,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.set.c##_high))) + ++#if defined(HAVE_GET_PAUSE_STATS) || defined(HAVE_NDO_ETH_PHY_STATS) + static int mlx5e_stats_get_ieee(struct mlx5_core_dev *mdev, + u32 *ppcnt_ieee_802_3) + { +@@ 
-811,7 +860,9 @@ static int mlx5e_stats_get_ieee(struct m + return mlx5_core_access_reg(mdev, in, sz, ppcnt_ieee_802_3, + sz, MLX5_REG_PPCNT, 0, 0); + } ++#endif + ++#ifdef HAVE_GET_PAUSE_STATS + void mlx5e_stats_pause_get(struct mlx5e_priv *priv, + struct ethtool_pause_stats *pause_stats) + { +@@ -830,7 +881,9 @@ void mlx5e_stats_pause_get(struct mlx5e_ + eth_802_3_cntrs_grp_data_layout, + a_pause_mac_ctrl_frames_received); + } ++#endif + ++#ifdef HAVE_NDO_ETH_PHY_STATS + void mlx5e_stats_eth_phy_get(struct mlx5e_priv *priv, + struct ethtool_eth_phy_stats *phy_stats) + { +@@ -898,6 +951,61 @@ void mlx5e_stats_eth_ctrl_get(struct mlx + a_unsupported_opcodes_received); + } + ++static const struct ethtool_rmon_hist_range mlx5e_rmon_ranges[] = { ++ { 0, 64 }, ++ { 65, 127 }, ++ { 128, 255 }, ++ { 256, 511 }, ++ { 512, 1023 }, ++ { 1024, 1518 }, ++ { 1519, 2047 }, ++ { 2048, 4095 }, ++ { 4096, 8191 }, ++ { 8192, 10239 }, ++ {} ++}; ++ ++ ++void mlx5e_stats_rmon_get(struct mlx5e_priv *priv, ++ struct ethtool_rmon_stats *rmon, ++ const struct ethtool_rmon_hist_range **ranges) ++{ ++ u32 ppcnt_RFC_2819_counters[MLX5_ST_SZ_DW(ppcnt_reg)]; ++ struct mlx5_core_dev *mdev = priv->mdev; ++ u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; ++ int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); ++ ++ MLX5_SET(ppcnt_reg, in, local_port, 1); ++ MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); ++ if (mlx5_core_access_reg(mdev, in, sz, ppcnt_RFC_2819_counters, ++ sz, MLX5_REG_PPCNT, 0, 0)) ++ return; ++ ++#define RD(name) \ ++ MLX5E_READ_CTR64_BE_F(ppcnt_RFC_2819_counters, \ ++ eth_2819_cntrs_grp_data_layout, \ ++ name) ++ ++ rmon->undersize_pkts = RD(ether_stats_undersize_pkts); ++ rmon->fragments = RD(ether_stats_fragments); ++ rmon->jabbers = RD(ether_stats_jabbers); ++ ++ rmon->hist[0] = RD(ether_stats_pkts64octets); ++ rmon->hist[1] = RD(ether_stats_pkts65to127octets); ++ rmon->hist[2] = RD(ether_stats_pkts128to255octets); ++ rmon->hist[3] = RD(ether_stats_pkts256to511octets); ++ rmon->hist[4] = RD(ether_stats_pkts512to1023octets); ++ rmon->hist[5] = RD(ether_stats_pkts1024to1518octets); ++ rmon->hist[6] = RD(ether_stats_pkts1519to2047octets); ++ rmon->hist[7] = RD(ether_stats_pkts2048to4095octets); ++ rmon->hist[8] = RD(ether_stats_pkts4096to8191octets); ++ rmon->hist[9] = RD(ether_stats_pkts8192to10239octets); ++#undef RD ++ ++ *ranges = mlx5e_rmon_ranges; ++} ++#endif ++ + #define PPORT_2863_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_2863_cntrs_grp_data_layout.c##_high) +@@ -1009,59 +1117,6 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + } + +-static const struct ethtool_rmon_hist_range mlx5e_rmon_ranges[] = { +- { 0, 64 }, +- { 65, 127 }, +- { 128, 255 }, +- { 256, 511 }, +- { 512, 1023 }, +- { 1024, 1518 }, +- { 1519, 2047 }, +- { 2048, 4095 }, +- { 4096, 8191 }, +- { 8192, 10239 }, +- {} +-}; +- +-void mlx5e_stats_rmon_get(struct mlx5e_priv *priv, +- struct ethtool_rmon_stats *rmon, +- const struct ethtool_rmon_hist_range **ranges) +-{ +- u32 ppcnt_RFC_2819_counters[MLX5_ST_SZ_DW(ppcnt_reg)]; +- struct mlx5_core_dev *mdev = priv->mdev; +- u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; +- int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); +- +- MLX5_SET(ppcnt_reg, in, local_port, 1); +- MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); +- if (mlx5_core_access_reg(mdev, in, sz, ppcnt_RFC_2819_counters, +- sz, MLX5_REG_PPCNT, 0, 0)) +- return; +- +-#define RD(name) \ +- MLX5E_READ_CTR64_BE_F(ppcnt_RFC_2819_counters, \ +- 
eth_2819_cntrs_grp_data_layout, \ +- name) +- +- rmon->undersize_pkts = RD(ether_stats_undersize_pkts); +- rmon->fragments = RD(ether_stats_fragments); +- rmon->jabbers = RD(ether_stats_jabbers); +- +- rmon->hist[0] = RD(ether_stats_pkts64octets); +- rmon->hist[1] = RD(ether_stats_pkts65to127octets); +- rmon->hist[2] = RD(ether_stats_pkts128to255octets); +- rmon->hist[3] = RD(ether_stats_pkts256to511octets); +- rmon->hist[4] = RD(ether_stats_pkts512to1023octets); +- rmon->hist[5] = RD(ether_stats_pkts1024to1518octets); +- rmon->hist[6] = RD(ether_stats_pkts1519to2047octets); +- rmon->hist[7] = RD(ether_stats_pkts2048to4095octets); +- rmon->hist[8] = RD(ether_stats_pkts4096to8191octets); +- rmon->hist[9] = RD(ether_stats_pkts8192to10239octets); +-#undef RD +- +- *ranges = mlx5e_rmon_ranges; +-} +- + #define PPORT_PHY_STATISTICAL_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.phys_layer_statistical_cntrs.c##_high) +@@ -1169,6 +1224,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + } + ++#ifdef HAVE_NDO_GET_FEC_STATS + static int fec_num_lanes(struct mlx5_core_dev *dev) + { + u32 out[MLX5_ST_SZ_DW(pmlp_reg)] = {}; +@@ -1285,6 +1341,7 @@ void mlx5e_stats_fec_get(struct mlx5e_pr + fec_set_corrected_bits_total(priv, fec_stats); + fec_set_block_stats(priv, fec_stats); + } ++#endif + + #define PPORT_ETH_EXT_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ +@@ -1871,8 +1928,10 @@ static const struct counter_desc rq_stat + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) }, ++#ifdef HAVE_XDP_SUPPORT + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_redirect) }, ++#endif + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_bytes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_packets) }, +@@ -1946,10 +2005,13 @@ static const struct counter_desc sq_stat + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, xmit_more) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, recover) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqes) }, ++ { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_compress_blks) }, ++ { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_compress_pkts) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, + }; + ++#ifdef HAVE_XDP_SUPPORT + static const struct counter_desc rq_xdpsq_stats_desc[] = { + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, xmit) }, + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, mpwqe) }, +@@ -1969,7 +2031,9 @@ static const struct counter_desc xdpsq_s + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, err) }, + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) }, + }; ++#endif + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + static const struct counter_desc xskrq_stats_desc[] = { + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, packets) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, bytes) }, +@@ -2000,6 +2064,7 @@ static const struct counter_desc xsksq_s + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, err) }, + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, cqes) }, + }; ++#endif + + static const struct counter_desc ch_stats_desc[] = { + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, events) }, +@@ 
-2052,8 +2117,10 @@ static const struct counter_desc ptp_rq_ + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_none) }, ++#ifdef HAVE_XDP_SUPPORT + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, xdp_redirect) }, ++#endif + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, lro_packets) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, lro_bytes) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, ecn_mark) }, +@@ -2111,10 +2178,14 @@ static const struct counter_desc qos_sq_ + + #define NUM_RQ_STATS ARRAY_SIZE(rq_stats_desc) + #define NUM_SQ_STATS ARRAY_SIZE(sq_stats_desc) +-#define NUM_XDPSQ_STATS ARRAY_SIZE(xdpsq_stats_desc) +-#define NUM_RQ_XDPSQ_STATS ARRAY_SIZE(rq_xdpsq_stats_desc) ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + #define NUM_XSKRQ_STATS ARRAY_SIZE(xskrq_stats_desc) + #define NUM_XSKSQ_STATS ARRAY_SIZE(xsksq_stats_desc) ++#endif ++#ifdef HAVE_XDP_SUPPORT ++#define NUM_XDPSQ_STATS ARRAY_SIZE(xdpsq_stats_desc) ++#define NUM_RQ_XDPSQ_STATS ARRAY_SIZE(rq_xdpsq_stats_desc) ++#endif + #define NUM_CH_STATS ARRAY_SIZE(ch_stats_desc) + #define NUM_PTP_SQ_STATS ARRAY_SIZE(ptp_sq_stats_desc) + #define NUM_PTP_CH_STATS ARRAY_SIZE(ptp_ch_stats_desc) +@@ -2253,16 +2324,23 @@ static MLX5E_DECLARE_STATS_GRP_OP_NUM_ST + + return (NUM_RQ_STATS * max_nch) + + (NUM_CH_STATS * max_nch) + +- (NUM_SQ_STATS * max_nch * priv->max_opened_tc) + +- (NUM_RQ_XDPSQ_STATS * max_nch) + +- (NUM_XDPSQ_STATS * max_nch) + +- (NUM_XSKRQ_STATS * max_nch * priv->xsk.ever_used) + +- (NUM_XSKSQ_STATS * max_nch * priv->xsk.ever_used); ++ (NUM_SQ_STATS * max_nch * priv->max_opened_tc) ++#ifdef HAVE_XDP_SUPPORT ++ + (NUM_RQ_XDPSQ_STATS * max_nch) ++ + (NUM_XDPSQ_STATS * max_nch) ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ + (NUM_XSKRQ_STATS * max_nch * priv->xsk.ever_used) ++ + (NUM_XSKSQ_STATS * max_nch * priv->xsk.ever_used) ++#endif ++ ; + } + + static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels) + { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + bool is_xsk = priv->xsk.ever_used; ++#endif + int max_nch = priv->stats_nch; + int i, j, tc; + +@@ -2275,12 +2353,17 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_S + for (j = 0; j < NUM_RQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + rq_stats_desc[j].format, i); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + for (j = 0; j < NUM_XSKRQ_STATS * is_xsk; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + xskrq_stats_desc[j].format, i); +- for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++) +- sprintf(data + (idx++) * ETH_GSTRING_LEN, +- rq_xdpsq_stats_desc[j].format, i); ++#endif ++ ++#ifdef HAVE_XDP_SUPPORT ++ for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++) ++ sprintf(data + (idx++) * ETH_GSTRING_LEN, ++ rq_xdpsq_stats_desc[j].format, i); ++#endif + } + + for (tc = 0; tc < priv->max_opened_tc; tc++) +@@ -2291,12 +2374,16 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_S + i + tc * max_nch); + + for (i = 0; i < max_nch; i++) { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + for (j = 0; j < NUM_XSKSQ_STATS * is_xsk; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + xsksq_stats_desc[j].format, i); ++#endif ++#ifdef HAVE_XDP_SUPPORT + for (j = 0; j < NUM_XDPSQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + xdpsq_stats_desc[j].format, i); ++#endif + } + + return idx; +@@ -2304,7 +2391,9 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_S + + static 
MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(channels) + { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + bool is_xsk = priv->xsk.ever_used; ++#endif + int max_nch = priv->stats_nch; + int i, j, tc; + +@@ -2319,14 +2408,18 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_S + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->rq, + rq_stats_desc, j); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + for (j = 0; j < NUM_XSKRQ_STATS * is_xsk; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->xskrq, + xskrq_stats_desc, j); ++#endif ++#ifdef HAVE_XDP_SUPPORT + for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->rq_xdpsq, + rq_xdpsq_stats_desc, j); ++#endif + } + + for (tc = 0; tc < priv->max_opened_tc; tc++) +@@ -2337,14 +2430,18 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_S + sq_stats_desc, j); + + for (i = 0; i < max_nch; i++) { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + for (j = 0; j < NUM_XSKSQ_STATS * is_xsk; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->xsksq, + xsksq_stats_desc, j); ++#endif ++#ifdef HAVE_XDP_SUPPORT + for (j = 0; j < NUM_XDPSQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->xdpsq, + xdpsq_stats_desc, j); ++#endif + } + + return idx; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0154-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0154-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..80c1413 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0154-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,147 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_stats.h + +Change-Id: If0495f20991a179dda86b4f43a941a5c370b9d78 +--- + .../ethernet/mellanox/mlx5/core/en_stats.h | 30 +++++++++++++++++-- + 1 file changed, 28 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +@@ -33,6 +33,8 @@ + #ifndef __MLX5_EN_STATS_H__ + #define __MLX5_EN_STATS_H__ + ++#include ++ + #define MLX5E_READ_CTR64_CPU(ptr, dsc, i) \ + (*(u64 *)((char *)ptr + dsc[i].offset)) + #define MLX5E_READ_CTR64_BE(ptr, dsc, i) \ +@@ -114,9 +116,11 @@ void mlx5e_stats_update_ndo_stats(struct + + void mlx5e_stats_pause_get(struct mlx5e_priv *priv, + struct ethtool_pause_stats *pause_stats); ++#ifdef HAVE_NDO_GET_FEC_STATS + void mlx5e_stats_fec_get(struct mlx5e_priv *priv, + struct ethtool_fec_stats *fec_stats); +- ++#endif ++#ifdef HAVE_NDO_ETH_PHY_STATS + void mlx5e_stats_eth_phy_get(struct mlx5e_priv *priv, + struct ethtool_eth_phy_stats *phy_stats); + void mlx5e_stats_eth_mac_get(struct mlx5e_priv *priv, +@@ -126,6 +130,7 @@ void mlx5e_stats_eth_ctrl_get(struct mlx + void mlx5e_stats_rmon_get(struct mlx5e_priv *priv, + struct ethtool_rmon_stats *rmon, + const struct ethtool_rmon_hist_range **ranges); ++#endif + + /* Concrete NIC Stats */ + +@@ -158,15 +163,17 @@ struct mlx5e_sw_stats { + u64 rx_csum_complete_tail; + u64 rx_csum_complete_tail_slow; + u64 rx_csum_unnecessary_inner; ++#ifdef HAVE_XDP_SUPPORT + u64 rx_xdp_drop; + u64 rx_xdp_redirect; + u64 rx_xdp_tx_xmit; ++ u64 rx_xdp_tx_nops; + u64 rx_xdp_tx_mpwqe; + u64 rx_xdp_tx_inlnw; +- u64 rx_xdp_tx_nops; + u64 rx_xdp_tx_full; + u64 rx_xdp_tx_err; + u64 rx_xdp_tx_cqe; ++#endif + u64 tx_csum_none; + u64 tx_csum_partial; + u64 tx_csum_partial_inner; +@@ -177,6 +184,12 @@ struct mlx5e_sw_stats { + u64 tx_cqes; + u64 
tx_queue_wake; + u64 tx_cqe_err; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ u64 rx_sw_lro_aggregated; ++ u64 rx_sw_lro_flushed; ++ u64 rx_sw_lro_no_desc; ++#endif ++#ifdef HAVE_XDP_SUPPORT + u64 tx_xdp_xmit; + u64 tx_xdp_mpwqe; + u64 tx_xdp_inlnw; +@@ -184,6 +197,9 @@ struct mlx5e_sw_stats { + u64 tx_xdp_full; + u64 tx_xdp_err; + u64 tx_xdp_cqes; ++#endif ++ u64 tx_cqe_compress_blks; ++ u64 tx_cqe_compress_pkts; + u64 rx_wqe_err; + u64 rx_mpwqe_filler_cqes; + u64 rx_mpwqe_filler_strides; +@@ -212,6 +228,7 @@ struct mlx5e_sw_stats { + u64 ch_eq_rearm; + + #ifdef CONFIG_MLX5_EN_TLS ++#ifdef HAVE_UAPI_LINUX_TLS_H + u64 tx_tls_encrypted_packets; + u64 tx_tls_encrypted_bytes; + u64 tx_tls_ooo; +@@ -232,6 +249,7 @@ struct mlx5e_sw_stats { + u64 rx_tls_resync_res_retry; + u64 rx_tls_resync_res_skip; + u64 rx_tls_err; ++#endif /* HAVE_UAPI_LINUX_TLS_H */ + #endif + + u64 rx_xsk_packets; +@@ -340,8 +358,10 @@ struct mlx5e_rq_stats { + u64 mcast_packets; + u64 ecn_mark; + u64 removed_vlan_packets; ++#ifdef HAVE_XDP_SUPPORT + u64 xdp_drop; + u64 xdp_redirect; ++#endif + u64 wqe_err; + u64 mpwqe_filler_cqes; + u64 mpwqe_filler_strides; +@@ -363,6 +383,7 @@ struct mlx5e_rq_stats { + u64 pet_hdr_lookup_drop; + u64 pet_mdata_lookup_drop; + #ifdef CONFIG_MLX5_EN_TLS ++#ifdef HAVE_UAPI_LINUX_TLS_H + u64 tls_decrypted_packets; + u64 tls_decrypted_bytes; + u64 tls_resync_req_pkt; +@@ -373,6 +394,7 @@ struct mlx5e_rq_stats { + u64 tls_resync_res_retry; + u64 tls_resync_res_skip; + u64 tls_err; ++#endif /* HAVE_UAPI_LINUX_TLS_H */ + #endif + }; + +@@ -409,10 +431,13 @@ struct mlx5e_sq_stats { + u64 recover; + /* dirtied @completion */ + u64 cqes ____cacheline_aligned_in_smp; ++ u64 cqe_compress_blks; ++ u64 cqe_compress_pkts; + u64 wake; + u64 cqe_err; + }; + ++#ifdef HAVE_XDP_SUPPORT + struct mlx5e_xdpsq_stats { + u64 xmit; + u64 mpwqe; +@@ -423,6 +448,7 @@ struct mlx5e_xdpsq_stats { + /* dirtied @completion */ + u64 cqes ____cacheline_aligned_in_smp; + }; ++#endif + + struct mlx5e_ch_stats { + u64 events; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0155-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0155-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..fda3015 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0155-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,92 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_sysfs.c + +Change-Id: Ib86105a2b804f1b91889f451e1108514e9bdd4d0 +--- + .../ethernet/mellanox/mlx5/core/en_sysfs.c | 36 ++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_sysfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_sysfs.c +@@ -32,6 +32,7 @@ + + #include + #include ++#include + #include "en.h" + #include "en/port.h" + #include "en_tc.h" +@@ -64,7 +65,13 @@ static ssize_t mlx5e_store_tc_num(struct + { + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct net_device *netdev = priv->netdev; +- struct tc_mqprio_qopt_offload mqprio = { 0 }; ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD ++ struct tc_mqprio_qopt_offload mqprio = {{ 0 }}; ++#else ++ struct tc_mqprio_qopt mqprio = { 0 }; ++#endif ++#endif + int tc_num; + int err = 0; + +@@ -77,9 +84,17 @@ static ssize_t mlx5e_store_tc_num(struct + return -EINVAL; + + rtnl_lock(); ++#if 
defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) + netdev_set_num_tc(netdev, tc_num); ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + mqprio.qopt.num_tc = tc_num; ++#else ++ mqprio.num_tc = tc_num; ++#endif + mlx5e_setup_tc_mqprio(priv, &mqprio); ++#else ++ mlx5e_setup_tc(netdev, tc_num); ++#endif + rtnl_unlock(); + return count; + } +@@ -954,10 +969,22 @@ static ssize_t mlx5e_store_force_local_l + struct mlx5_core_dev *mdev = priv->mdev; + bool disable; + int err; ++#ifndef HAVE_KSTRTOBOOL ++ int udata; + ++ err = sscanf(buf, "%d", &udata); ++ if (err != 1) ++ return -EINVAL; ++ ++ if (udata >> 1) ++ return -EINVAL; ++ ++ disable = udata ? true : false; ++#else + err = kstrtobool(buf, &disable); + if (err) + return -EINVAL; ++#endif + + if (mdev->local_lb.user_force_disable != disable) { + mdev->local_lb.user_force_disable = disable; +@@ -1237,10 +1264,17 @@ static struct attribute *prio_hp_attrs[] + &prio_hp_attr_rate.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(prio_hp); ++#endif + + static struct kobj_type prio_hp_sysfs = { + .sysfs_ops = &prio_hp_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = prio_hp_groups ++#else + .default_attrs = prio_hp_attrs ++#endif + }; + + int create_prio_hp_sysfs(struct mlx5e_priv *priv, int prio) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0157-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0157-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..21891f6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0157-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,76 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_tc.h + +Change-Id: Ie79f0c0b5e6d7a6a72223de2002200130c5e9264 +--- + .../net/ethernet/mellanox/mlx5/core/en_tc.h | 45 +++++++++++++++++++ + 1 file changed, 45 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +@@ -286,6 +286,17 @@ void mlx5e_tc_set_ethertype(struct mlx5_ + struct flow_match_basic *match, bool outer, + void *headers_c, void *headers_v); + ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#if !defined(HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE) && defined(HAVE_TC_BLOCK_OFFLOAD) ++int mlx5e_setup_tc_block(struct net_device *dev, struct tc_block_offload *f); ++#endif ++#if !defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD) ++int mlx5e_setup_tc_cls_flower(struct net_device *dev, ++ struct flow_cls_offload *cls_flower, ++ unsigned long flags); ++#endif /* !HAVE_TC_BLOCK_OFFLOAD && !HAVE_FLOW_BLOCK_OFFLOAD */ ++#endif /* HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE || HAVE_NDO_SETUP_TC_RH_EXTENDED */ ++ + int mlx5e_tc_nic_init(struct mlx5e_priv *priv); + void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv); + +@@ -327,6 +338,23 @@ static inline void mlx5e_tc_ht_cleanup(s + static inline int + mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) + { return -EOPNOTSUPP; } ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#if !defined(HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE) && defined(HAVE_TC_BLOCK_OFFLOAD) ++static inline int mlx5e_setup_tc_block(struct net_device *dev, ++ struct tc_block_offload *f) ++{ ++ return -EOPNOTSUPP; ++} ++#endif ++#if !defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD) 
++static inline int mlx5e_setup_tc_cls_flower(struct net_device *dev, ++ struct flow_cls_offload *cls_flower, ++ unsigned long flags) ++{ ++ return -EOPNOTSUPP; ++} ++#endif /* !HAVE_TC_BLOCK_OFFLOAD && !HAVE_FLOW_BLOCK_OFFLOAD */ ++#endif /* HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE || HAVE_NDO_SETUP_TC_RH_EXTENDED */ + + #endif /* CONFIG_MLX5_CLS_ACT */ + +@@ -352,6 +380,23 @@ static inline int mlx5e_tc_num_filters( + static inline int + mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) + { return -EOPNOTSUPP; } ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#if !defined(HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE) && defined(HAVE_TC_BLOCK_OFFLOAD) ++static inline int mlx5e_setup_tc_block(struct net_device *dev, ++ struct tc_block_offload *f) ++{ ++ return -EOPNOTSUPP; ++} ++#endif ++#if !defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD) ++static inline int mlx5e_setup_tc_cls_flower(struct net_device *dev, ++ struct flow_cls_offload *cls_flower, ++ unsigned long flags) ++{ ++ return -EOPNOTSUPP; ++} ++#endif /* !HAVE_TC_BLOCK_OFFLOAD && !HAVE_FLOW_BLOCK_OFFLOAD */ ++#endif /* HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE || HAVE_NDO_SETUP_TC_RH_EXTENDED */ + #endif + + #if IS_ENABLED(CONFIG_MLX5_CLS_ACT) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0159-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0159-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..b92a058 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0159-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,177 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c + +Change-Id: I41ec889c63f3d34ab5e16ec8c1c5482a81d2c95f +--- + .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 53 +++++++++++++++++-- + 1 file changed, 50 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +@@ -34,7 +34,9 @@ + #include "en.h" + #include "en/txrx.h" + #include "en/xdp.h" ++#ifdef HAVE_NDO_XSK_WAKEUP + #include "en/xsk/rx.h" ++#endif + #include "en/xsk/tx.h" + #include "en_accel/ktls_txrx.h" + #include "en/txrx.h" +@@ -90,8 +92,10 @@ void mlx5e_trigger_irq(struct mlx5e_icos + mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl); + } + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq) + { ++#ifdef HAVE_NDO_XSK_WAKEUP + bool busy_xsk = false, xsk_rx_alloc_err; + + /* Handle the race between the application querying need_wakeup and the +@@ -112,34 +116,52 @@ static bool mlx5e_napi_xsk_post(struct m + mlx5e_post_rx_wqes, + xskrq); + busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err); ++#else ++ bool busy_xsk = false; ++ ++ busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET); ++ busy_xsk |= xskrq->post_wqes(xskrq); ++#endif + + return busy_xsk; + } ++#endif + + int mlx5e_napi_poll(struct napi_struct *napi, int budget) + { + struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel, + napi); + struct mlx5e_ch_stats *ch_stats = c->stats; +- struct mlx5e_xdpsq *xsksq = &c->xsksq; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ struct mlx5e_xdpsq *xsksq = &c->xsksq; ++ struct mlx5e_rq *xskrq = &c->xskrq; ++#endif + struct mlx5e_txqsq __rcu **qos_sqs; +- struct mlx5e_rq *xskrq = &c->xskrq; + struct mlx5e_rq *rq = &c->rq; ++#ifdef 
HAVE_XSK_ZERO_COPY_SUPPORT + bool aff_change = false; + bool busy_xsk = false; ++#endif + bool busy = false; + int work_done = 0; +- u16 qos_sqs_size; ++ u16 qos_sqs_size = 0; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + bool xsk_open; ++#endif + int i; + + rcu_read_lock(); + + qos_sqs = rcu_dereference(c->qos_sqs); + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + xsk_open = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state); ++#endif + + ch_stats->poll++; ++#ifndef HAVE_NAPI_STATE_MISSED ++ clear_bit(MLX5E_CHANNEL_NAPI_SCHED, &c->flags); ++#endif + + for (i = 0; i < c->num_tc; i++) + busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget); +@@ -156,14 +178,18 @@ int mlx5e_napi_poll(struct napi_struct * + } + } + ++#ifdef HAVE_XDP_SUPPORT + busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq); + + if (c->xdp) + busy |= mlx5e_poll_xdpsq_cq(&c->rq_xdpsq.cq); ++#endif + + if (likely(budget)) { /* budget=0 means: don't poll rx rings */ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xsk_open) + work_done = mlx5e_poll_rx_cq(&xskrq->cq, budget); ++#endif + + if (likely(budget - work_done)) + work_done += mlx5e_poll_rx_cq(&rq->cq, budget - work_done); +@@ -186,12 +212,14 @@ int mlx5e_napi_poll(struct napi_struct * + mlx5e_post_rx_mpwqes, + mlx5e_post_rx_wqes, + rq); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xsk_open) { + busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq); + busy_xsk |= mlx5e_napi_xsk_post(xsksq, xskrq); + } + + busy |= busy_xsk; ++#endif + + if (busy) { + if (likely(mlx5e_channel_no_affinity_change(c))) { +@@ -199,13 +227,25 @@ int mlx5e_napi_poll(struct napi_struct * + goto out; + } + ch_stats->aff_change++; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + aff_change = true; ++#endif + if (budget && work_done == budget) + work_done--; + } + ++#ifdef HAVE_NAPI_STATE_MISSED + if (unlikely(!napi_complete_done(napi, work_done))) + goto out; ++#else ++ napi_complete_done(napi, work_done); ++ ++ /* avoid losing completion event during/after polling cqs */ ++ if (test_bit(MLX5E_CHANNEL_NAPI_SCHED, &c->flags)) { ++ napi_schedule(napi); ++ goto out; ++ } ++#endif + + ch_stats->arm++; + +@@ -227,8 +267,11 @@ int mlx5e_napi_poll(struct napi_struct * + mlx5e_rx_dim_cq_rearm(c->priv, rq); + mlx5e_cq_arm(&c->icosq.cq); + mlx5e_cq_arm(&c->async_icosq.cq); ++#ifdef HAVE_XDP_SUPPORT + mlx5e_cq_arm(&c->xdpsq.cq); ++#endif + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xsk_open) { + mlx5e_rx_dim_cq_rearm(c->priv, xskrq); + mlx5e_cq_arm(&xsksq->cq); +@@ -238,6 +281,7 @@ int mlx5e_napi_poll(struct napi_struct * + mlx5e_trigger_irq(&c->icosq); + ch_stats->force_irq++; + } ++#endif + + out: + rcu_read_unlock(); +@@ -249,6 +293,9 @@ void mlx5e_completion_event(struct mlx5_ + { + struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq); + ++#ifndef HAVE_NAPI_STATE_MISSED ++ set_bit(MLX5E_CHANNEL_NAPI_SCHED, cq->ch_flags); ++#endif + napi_schedule(cq->napi); + cq->event_ctr++; + cq->ch_stats->events++; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0160-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-eq..patch b/src/mlnx-ofa_kernel-5.8/backports/0160-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-eq..patch new file mode 100644 index 0000000..92b6205 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0160-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-eq..patch @@ -0,0 +1,176 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/eq.c + +Change-Id: I69be9043b58bbb7388e2c593e7a0c25beebc769d +--- + drivers/net/ethernet/mellanox/mlx5/core/eq.c | 52 +++++++++++++++++++- + 1 file changed, 50 insertions(+), 2 deletions(-) + +--- 
a/drivers/net/ethernet/mellanox/mlx5/core/eq.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c +@@ -44,7 +44,9 @@ enum { + MLX5_EQ_POLLING_BUDGET = 128, + }; + ++#ifdef HAVE_STATIC_ASSERT + static_assert(MLX5_EQ_POLLING_BUDGET <= MLX5_NUM_SPARE_EQE); ++#endif + + struct mlx5_eq_table { + struct list_head comp_eqs_list; +@@ -125,7 +127,11 @@ static int mlx5_eq_comp_int(struct notif + /* Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ ++#ifdef dma_rmb + dma_rmb(); ++#else ++ rmb(); ++#endif + /* Assume (eqe->type) is always MLX5_EVENT_TYPE_COMP */ + cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff; + +@@ -204,7 +210,7 @@ static int mlx5_eq_async_int(struct noti + struct mlx5_eq_table *eqt; + struct mlx5_core_dev *dev; + struct mlx5_eqe *eqe; +- unsigned long flags; ++ unsigned long flags = 0; + int num_eqes = 0; + bool recovery; + +@@ -223,7 +229,11 @@ static int mlx5_eq_async_int(struct noti + * Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ ++#ifdef dma_rmb + dma_rmb(); ++#else ++ rmb(); ++#endif + + atomic_notifier_call_chain(&eqt->nh[eqe->type], eqe->type, eqe); + atomic_notifier_call_chain(&eqt->nh[MLX5_EVENT_TYPE_NOTIFY_ANY], eqe->type, eqe); +@@ -338,7 +348,11 @@ create_map_eq(struct mlx5_core_dev *dev, + + eq->vecidx = vecidx; + eq->eqn = MLX5_GET(create_eq_out, out, eq_number); +- eq->irqn = pci_irq_vector(dev->pdev, vecidx); ++#ifdef HAVE_PCI_IRQ_API ++ eq->irqn = pci_irq_vector(dev->pdev, vecidx); ++#else ++ eq->irqn = mlx5_get_msix_vec(dev, vecidx); ++#endif + eq->dev = dev; + eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; + +@@ -683,6 +697,7 @@ static void cleanup_async_eq(struct mlx5 + name, err); + } + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE + static u16 async_eq_depth_devlink_param_get(struct mlx5_core_dev *dev) + { + struct devlink *devlink = priv_to_devlink(dev); +@@ -697,6 +712,8 @@ static u16 async_eq_depth_devlink_param_ + mlx5_core_dbg(dev, "Failed to get param. using default. err = %d\n", err); + return MLX5_NUM_ASYNC_EQE; + } ++#endif ++ + static int create_async_eqs(struct mlx5_core_dev *dev) + { + struct mlx5_eq_table *table = dev->priv.eq_table; +@@ -728,7 +745,11 @@ static int create_async_eqs(struct mlx5_ + + param = (struct mlx5_eq_param) { + .irq = table->ctrl_irq, ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE + .nent = async_eq_depth_devlink_param_get(dev), ++#else ++ .nent = MLX5_NUM_ASYNC_EQE, ++#endif + }; + + if (mlx5_core_is_sf(dev) && dev->async_eq_depth) +@@ -848,7 +869,11 @@ struct mlx5_eqe *mlx5_eq_get_eqe(struct + * checked the ownership bit. + */ + if (eqe) ++#ifdef dma_rmb + dma_rmb(); ++#else ++ rmb(); ++#endif + + return eqe; + } +@@ -957,6 +982,7 @@ static void destroy_comp_eqs(struct mlx5 + comp_irqs_release(dev); + } + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE + static u16 comp_eq_depth_devlink_param_get(struct mlx5_core_dev *dev) + { + struct devlink *devlink = priv_to_devlink(dev); +@@ -971,6 +997,7 @@ static u16 comp_eq_depth_devlink_param_g + mlx5_core_dbg(dev, "Failed to get param. using default. 
err = %d\n", err); + return MLX5_COMP_EQ_SIZE; + } ++#endif + + static int create_comp_eqs(struct mlx5_core_dev *dev) + { +@@ -985,7 +1012,11 @@ static int create_comp_eqs(struct mlx5_c + if (ncomp_eqs < 0) + return ncomp_eqs; + INIT_LIST_HEAD(&table->comp_eqs_list); ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE + nent = comp_eq_depth_devlink_param_get(dev); ++#else ++ nent = MLX5_COMP_EQ_SIZE; ++#endif + + /* if user specified completion eq depth, honor that */ + if (mlx5_core_is_sf(dev) && dev->cmpl_eq_depth) +@@ -1003,7 +1034,12 @@ static int create_comp_eqs(struct mlx5_c + INIT_LIST_HEAD(&eq->tasklet_ctx.list); + INIT_LIST_HEAD(&eq->tasklet_ctx.process_list); + spin_lock_init(&eq->tasklet_ctx.lock); ++#ifdef HAVE_TASKLET_SETUP + tasklet_setup(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb); ++#else ++ tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb, ++ (unsigned long)&eq->tasklet_ctx); ++#endif + + eq->irq_nb.notifier_call = mlx5_eq_comp_int; + param = (struct mlx5_eq_param) { +@@ -1138,8 +1174,13 @@ static int set_rmap(struct mlx5_core_dev + } + + for (vecidx = 0; vecidx < eq_table->num_comp_eqs; vecidx++) { ++#ifdef HAVE_PCI_IRQ_API + err = irq_cpu_rmap_add(eq_table->rmap, + pci_irq_vector(mdev->pdev, vecidx)); ++#else ++ err = irq_cpu_rmap_add(eq_table->rmap, ++ mdev->priv.msix_arr[vecidx].vector); ++#endif + if (err) { + mlx5_core_err(mdev, "irq_cpu_rmap_add failed. err %d", + err); +@@ -1155,6 +1196,13 @@ err_out: + return err; + } + ++#ifndef HAVE_PCI_IRQ_API ++u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx) ++{ ++ return dev->priv.msix_arr[vecidx].vector; ++} ++#endif ++ + /* This function should only be called after mlx5_cmd_force_teardown_hca */ + void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev) + { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0161-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0161-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..433619b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0161-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,23 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c + +Change-Id: I1f4e597f174680c0b1daa712b39ef302f3fada94 +--- + drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +@@ -80,7 +80,11 @@ mlx5_esw_bridge_fdb_offload_notify(struc + send_info.addr = addr; + send_info.vid = vid; + send_info.offloaded = true; ++#ifdef HAVE_CALL_SWITCHDEV_NOTIFIERS_4_PARAMS + call_switchdev_notifiers(val, dev, &send_info.info, NULL); ++#else ++ call_switchdev_notifiers(val, dev, &send_info.info); ++#endif + } + + static void diff --git a/src/mlnx-ofa_kernel-5.8/backports/0163-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0163-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..0c83873 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0163-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,23 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c + +Change-Id: Iaa449638ca709082a9f9e895c1e8bdad1ba274ea +--- + drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- 
a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + // Copyright (c) 2020 Mellanox Technologies. + ++#ifdef CONFIG_MLX5_ESWITCH + #include + #include + #include +@@ -788,3 +789,4 @@ void mlx5_esw_ipsec_full_offload_get_sta + mlx5_fc_query(esw->dev, esw_ipsec_tx_chk_drop_counter(esw), + &stats->ipsec_full_tx_pkts_drop, &stats->ipsec_full_tx_bytes_drop); + } ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0164-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0164-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..b48892b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0164-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,230 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/eswitch.c + +Change-Id: I5c5eb70c94659b115ea0c99514fa343c43676712 +--- + .../net/ethernet/mellanox/mlx5/core/eswitch.c | 85 ++++++++++++++++++- + 1 file changed, 82 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +@@ -981,7 +981,7 @@ int mlx5_esw_vport_enable(struct mlx5_es + enum mlx5_eswitch_vport_event enabled_events) + { + struct mlx5_vport *vport; +- int ret; ++ int ret = 0; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) +@@ -1509,17 +1509,29 @@ abort: + */ + int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) + { ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ struct devlink *devlink; ++#endif + bool toggle_lag; + int ret; + + if (!mlx5_esw_allowed(esw)) + return 0; ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_assert_locked(priv_to_devlink(esw->dev)); ++#endif + + toggle_lag = !mlx5_sriov_is_enabled(esw->dev) && !is_mdev_switchdev_mode(esw->dev); + + if (toggle_lag) + mlx5_lag_disable_change(esw->dev); + ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ devlink = priv_to_devlink(esw->dev); ++ devl_lock(devlink); ++#endif + down_write(&esw->mode_lock); + if (!mlx5_esw_is_fdb_created(esw)) { + ret = mlx5_eswitch_enable_locked(esw, num_vfs); +@@ -1533,6 +1545,10 @@ int mlx5_eswitch_enable(struct mlx5_eswi + esw->esw_funcs.num_vfs = num_vfs; + } + up_write(&esw->mode_lock); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ devl_unlock(devlink); ++#endif + + if (toggle_lag) + mlx5_lag_enable_change(esw->dev); +@@ -1543,9 +1559,19 @@ int mlx5_eswitch_enable(struct mlx5_eswi + /* When disabling sriov, free driver level resources. */ + void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf) + { ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ struct devlink *devlink; ++#endif + if (!mlx5_esw_allowed(esw)) + return; + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER ++ devl_assert_locked(priv_to_devlink(esw->dev)); ++#elif defined(HAVE_DEVL_PORT_REGISTER) //forward port ++ devlink = priv_to_devlink(esw->dev); ++ devl_lock(devlink); ++#endif + down_write(&esw->mode_lock); + /* If driver is unloaded, this function is called twice by remove_one() + * and mlx5_unload(). Prevent the second call. 
+@@ -1564,9 +1590,17 @@ void mlx5_eswitch_disable_sriov(struct m + * because it depends on num_vfs. + */ + if (esw->mode == MLX5_ESWITCH_OFFLOADS) { ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + struct devlink *devlink = priv_to_devlink(esw->dev); ++#endif + +- devlink_rate_nodes_destroy(devlink); ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_rate_nodes_destroy(devlink); ++#else ++ devlink_rate_nodes_destroy(devlink); ++#endif ++#endif + #if IS_ENABLED(CONFIG_MLXDEVM) + mlx5_devm_rate_nodes_destroy(esw->dev); + #endif +@@ -1577,6 +1611,10 @@ void mlx5_eswitch_disable_sriov(struct m + + unlock: + up_write(&esw->mode_lock); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ devl_unlock(devlink); ++#endif + } + + /* Free resources for corresponding eswitch mode. It is called by devlink +@@ -1584,9 +1622,15 @@ unlock: + */ + void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw) + { ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + struct devlink *devlink = priv_to_devlink(esw->dev); ++#endif + ++#ifdef HAVE_LOCKUP_ASSERT_HELD_WRITE + lockdep_assert_held_write(&esw->mode_lock); ++#else ++ lockdep_assert_held(&esw->mode_lock); ++#endif + + /* Notify eswitch users that it is exiting from current mode. + * So that it can do necessary cleanup before the eswitch is disabled. +@@ -1610,7 +1654,13 @@ void mlx5_eswitch_disable_locked(struct + } + + if (esw->mode == MLX5_ESWITCH_OFFLOADS) { +- devlink_rate_nodes_destroy(devlink); ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_rate_nodes_destroy(devlink); ++#else ++ devlink_rate_nodes_destroy(devlink); ++#endif ++#endif + #if IS_ENABLED(CONFIG_MLXDEVM) + mlx5_devm_rate_nodes_destroy(esw->dev); + #endif +@@ -1619,13 +1669,30 @@ void mlx5_eswitch_disable_locked(struct + + void mlx5_eswitch_disable(struct mlx5_eswitch *esw) + { ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ struct devlink *devlink; ++#endif ++ + if (!mlx5_esw_allowed(esw)) + return; + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_assert_locked(priv_to_devlink(esw->dev)); ++#endif + mlx5_lag_disable_change(esw->dev); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ devlink = priv_to_devlink(esw->dev); ++ devl_lock(devlink); ++#endif + down_write(&esw->mode_lock); + mlx5_eswitch_disable_locked(esw); + up_write(&esw->mode_lock); ++#if defined(HAVE_DEVL_PORT_REGISTER) && \ ++ !defined(HAVE_DEVL_TRAP_GROUPS_REGISTER)//forward port ++ devl_unlock(devlink); ++#endif + mlx5_lag_enable_change(esw->dev); + } + +@@ -1927,7 +1994,9 @@ int mlx5_eswitch_init(struct mlx5_core_d + ida_init(&esw->offloads.vport_metadata_ida); + xa_init_flags(&esw->offloads.vhca_map, XA_FLAGS_ALLOC); + mutex_init(&esw->state_lock); ++#ifdef HAVE_LOCKDEP_UNREGISTER_KEY + lockdep_register_key(&esw->mode_lock_key); ++#endif + init_rwsem(&esw->mode_lock); + lockdep_set_class(&esw->mode_lock, &esw->mode_lock_key); + refcount_set(&esw->qos.refcnt, 0); +@@ -2010,7 +2079,9 @@ void mlx5_eswitch_cleanup(struct mlx5_es + esw->dev->priv.eswitch = NULL; + destroy_workqueue(esw->work_queue); + WARN_ON(refcount_read(&esw->qos.refcnt)); ++#ifdef HAVE_LOCKDEP_UNREGISTER_KEY + lockdep_unregister_key(&esw->mode_lock_key); ++#endif + mutex_destroy(&esw->state_lock); + WARN_ON(!xa_empty(&esw->offloads.vhca_map)); + xa_destroy(&esw->offloads.vhca_map); +@@ -2388,9 
+2459,13 @@ int mlx5_eswitch_get_vport_config(struct + ivi->linkstate = evport->info.link_state; + ivi->vlan = evport->info.vlan; + ivi->qos = evport->info.qos; ++#ifdef HAVE_VF_VLAN_PROTO + ivi->vlan_proto = evport->info.vlan_proto; ++#endif + ivi->spoofchk = evport->info.spoofchk; ++#ifdef HAVE_VF_INFO_TRUST + ivi->trusted = evport->info.trusted; ++#endif + if (evport->qos.enabled) { + ivi->min_tx_rate = evport->qos.min_rate; + ivi->max_tx_rate = evport->qos.max_rate; +@@ -2452,7 +2527,9 @@ int mlx5_eswitch_get_vport_stats(struct + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {}; ++#ifdef HAVE_STRUCT_IFLA_VF_STATS_RX_TX_DROPPED + struct mlx5_vport_drop_stats stats = {}; ++#endif + int err = 0; + u32 *out; + +@@ -2512,11 +2589,13 @@ int mlx5_eswitch_get_vport_stats(struct + vf_stats->broadcast = + MLX5_GET_CTR(out, received_eth_broadcast.packets); + ++#ifdef HAVE_STRUCT_IFLA_VF_STATS_RX_TX_DROPPED + err = mlx5_esw_query_vport_drop_stats(esw->dev, vport, &stats); + if (err) + goto free_out; + vf_stats->rx_dropped = stats.rx_dropped; + vf_stats->tx_dropped = stats.tx_dropped; ++#endif + + free_out: + kvfree(out); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0165-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0165-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..65da91d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0165-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,148 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/eswitch.h + +Change-Id: Ia4166db3a0b572aaefb59d1d4a2e418a028d7092 +--- + .../net/ethernet/mellanox/mlx5/core/eswitch.h | 84 ++++++++++++++----- + 1 file changed, 62 insertions(+), 22 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +@@ -71,10 +71,11 @@ struct mlx5_mapped_obj { + }; + }; + +-#ifdef CONFIG_MLX5_ESWITCH +- + #define ESW_OFFLOADS_DEFAULT_NUM_GROUPS 15 + ++#ifdef CONFIG_MLX5_ESWITCH ++#include "en/tc_ct.h" ++ + #define MLX5_MAX_UC_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_current_uc_list)) + +@@ -616,11 +617,17 @@ struct mlx5_esw_flow_attr { + struct mlx5_pkt_reformat *decap_pkt_reformat; + }; + +-int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, +- struct netlink_ext_ack *extack); ++int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ); + int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode); +-int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, +- struct netlink_ext_ack *extack); ++int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ); + int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode); + + int mlx5_eswitch_vport_modify_other_hca_cap_roce(struct mlx5_eswitch *esw, +@@ -629,21 +636,45 @@ int mlx5_eswitch_vport_get_other_hca_cap + struct mlx5_vport *vport, bool *value); + + int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, +- enum devlink_eswitch_encap_mode encap, +- struct netlink_ext_ack *extack); ++#ifdef HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET_GET_WITH_ENUM 
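The prototype changes above (and the matching definitions later in eswitch_offloads.c) make the extack argument conditional on HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK. A minimal, compile-only sketch of that technique; the function name and return values are hypothetical, only the macro name comes from the patch:

#include <stddef.h>

struct netlink_ext_ack;	/* opaque here; only passed through in this sketch */

#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK
static int example_mode_set(int mode, struct netlink_ext_ack *extack)
#else
static int example_mode_set(int mode)
#endif
{
#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK
	struct netlink_ext_ack *extack = NULL;	/* older kernels: no extack */
#endif
	(void)extack;			/* body is shared by both variants */
	return mode >= 0 ? 0 : -1;
}

int main(void)
{
#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK
	return example_mode_set(1, NULL);
#else
	return example_mode_set(1);
#endif
}

The same trick covers the enum devlink_eswitch_encap_mode versus u8 split: one body, two guarded signatures, so the newer prototype is kept wherever the target kernel provides it.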
++ enum devlink_eswitch_encap_mode encap ++#else ++ u8 encap ++#endif ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ); + int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, ++#ifdef HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET_GET_WITH_ENUM + enum devlink_eswitch_encap_mode *encap); ++#else ++ u8 *encap); ++#endif + int mlx5_devlink_eswitch_ipsec_mode_set(struct devlink *devlink, +- enum devlink_eswitch_ipsec_mode ipsec, +- struct netlink_ext_ack *extack); ++ enum devlink_eswitch_ipsec_mode ipsec ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ); + int mlx5_devlink_eswitch_ipsec_mode_get(struct devlink *devlink, + enum devlink_eswitch_ipsec_mode *ipsec); +-int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port, +- u8 *hw_addr, int *hw_addr_len, +- struct netlink_ext_ack *extack); +-int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port, +- const u8 *hw_addr, int hw_addr_len, +- struct netlink_ext_ack *extack); ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM ++int mlx5_devlink_port_function_hw_addr_get( ++#else ++int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink, ++#endif ++ struct devlink_port *port, ++ u8 *hw_addr, int *hw_addr_len, ++ struct netlink_ext_ack *extack); ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM ++int mlx5_devlink_port_function_hw_addr_set( ++#else ++int mlx5_devlink_port_function_hw_addr_set(struct devlink *devlink, ++#endif ++ struct devlink_port *port, ++ const u8 *hw_addr, int hw_addr_len, ++ struct netlink_ext_ack *extack); + int mlx5_devlink_rate_node_tx_max_set(struct devlink *devlink, + const char *group, u64 tx_max, + struct netlink_ext_ack *extack); +@@ -909,6 +940,10 @@ struct mlx5_esw_event_info { + + int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *n); + void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *n); ++int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, ++ u16 vport, const u8 *mac); ++int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, ++ u16 vport, int link_state); + bool mlx5e_esw_offloads_pet_enabled(const struct mlx5_eswitch *esw); + int mlx5e_esw_offloads_pet_setup(struct mlx5_eswitch *esw, struct mlx5_flow_table *ft); + void mlx5e_esw_offloads_pet_cleanup(struct mlx5_eswitch *esw); +@@ -969,12 +1004,6 @@ static inline const u32 *mlx5_esw_query_ + static inline void mlx5_esw_unlock(struct mlx5_eswitch *esw) { return; } + static inline void mlx5_esw_lock(struct mlx5_eswitch *esw) { return; } + +-static inline struct mlx5_flow_handle * +-esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) +-{ +- return ERR_PTR(-EOPNOTSUPP); +-} +- + static inline bool + mlx5_esw_is_manager_vport(const struct mlx5_eswitch *esw, u16 vport_num) + { +@@ -1005,10 +1034,21 @@ mlx5_eswitch_reload_reps(struct mlx5_esw + return 0; + } + ++static inline bool mlx5e_esw_offloads_pet_enabled(const struct mlx5_eswitch *esw) ++{ ++ return false; ++}; ++ + static inline bool mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev) + { + return true; + } ++ ++static inline int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, ++ u16 vport, u8 mac[ETH_ALEN]){ return 0; } ++static inline int mlx5_eswitch_get_vport_mac(struct mlx5_eswitch *esw, ++ u16 vport, u8 *mac){ return 0; } ++ + #endif /* CONFIG_MLX5_ESWITCH */ + + int mlx5_eswitch_compat_sysfs_init(struct net_device *netdev); diff --git 
a/src/mlnx-ofa_kernel-5.8/backports/0166-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0166-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..f0a2e59 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0166-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,302 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c + +Change-Id: I96cf902e74965fec6d6bc01e2571a1ef6f4df37c +--- + .../mellanox/mlx5/core/eswitch_offloads.c | 124 +++++++++++++++--- + 1 file changed, 109 insertions(+), 15 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +@@ -2675,11 +2675,13 @@ int esw_offloads_load_rep(struct mlx5_es + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return 0; + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + if (vport_num != MLX5_VPORT_UPLINK) { + err = mlx5_esw_offloads_devlink_port_register(esw, vport_num); + if (err) + return err; + } ++#endif + + err = mlx5_esw_offloads_rep_load(esw, vport_num); + if (err) +@@ -2687,8 +2689,10 @@ int esw_offloads_load_rep(struct mlx5_es + return err; + + load_err: ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + if (vport_num != MLX5_VPORT_UPLINK) + mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); ++#endif + return err; + } + +@@ -2699,8 +2703,10 @@ void esw_offloads_unload_rep(struct mlx5 + + mlx5_esw_offloads_rep_unload(esw, vport_num); + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + if (vport_num != MLX5_VPORT_UPLINK) + mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); ++#endif + } + + static int esw_set_slave_root_fdb(struct mlx5_core_dev *master, +@@ -3137,7 +3143,11 @@ u32 mlx5_esw_match_metadata_alloc(struct + + /* Metadata is 4 bits of PFNUM and 12 bits of unique id */ + /* Use only non-zero vport_id (2-4095) for all PF's */ ++#ifdef HAVE_IDA_ALLOC_RANGE + id = ida_alloc_range(&esw->offloads.vport_metadata_ida, ++#else ++ id = ida_simple_get(&esw->offloads.vport_metadata_ida, ++#endif + MLX5_ESW_METADATA_RSVD_UPLINK + 1, + vport_end_ida, GFP_KERNEL); + if (id < 0) +@@ -3151,7 +3161,11 @@ void mlx5_esw_match_metadata_free(struct + u32 vport_bit_mask = (1 << ESW_VPORT_BITS) - 1; + + /* Metadata contains only 12 bits of actual ida id */ +- ida_free(&esw->offloads.vport_metadata_ida, metadata & vport_bit_mask); ++#ifdef HAVE_IDA_FREE ++ ida_free(&esw->offloads.vport_metadata_ida, metadata & vport_bit_mask); ++#else ++ ida_simple_remove(&esw->offloads.vport_metadata_ida, metadata & vport_bit_mask); ++#endif + } + + static int esw_offloads_vport_metadata_setup(struct mlx5_eswitch *esw, +@@ -3414,6 +3428,9 @@ static void esw_offloads_steering_cleanu + static void + esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, const u32 *out) + { ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ struct devlink *devlink; ++#endif + bool host_pf_disabled; + u16 new_num_vfs; + +@@ -3425,6 +3442,10 @@ esw_vfs_changed_event_handler(struct mlx + if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled) + return; + ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devlink = priv_to_devlink(esw->dev); ++ devl_lock(devlink); ++#endif + /* Number of VFs can only change from "0 to x" or "x to 0". 
*/ + if (esw->esw_funcs.num_vfs > 0) { + mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs); +@@ -3433,10 +3454,17 @@ esw_vfs_changed_event_handler(struct mlx + + err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs, + MLX5_VPORT_UC_ADDR_CHANGE); +- if (err) ++ if (err) { ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + return; ++ } + } + esw->esw_funcs.num_vfs = new_num_vfs; ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + } + + static void esw_functions_changed_event_handler(struct work_struct *work) +@@ -3686,14 +3714,24 @@ static int esw_inline_mode_to_devlink(u8 + return 0; + } + ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK + int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) ++#else ++int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode) ++#endif + { ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ struct netlink_ext_ack *extack; ++#endif + u16 cur_mlx5_mode, mlx5_mode = 0; + struct mlx5_eswitch *esw; +- + int err = 0; + ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ extack = NULL; ++#endif ++ + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); +@@ -3729,12 +3767,14 @@ int mlx5_devlink_eswitch_mode_set(struct + + mlx5_eswitch_disable_locked(esw); + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + if (mlx5_devlink_trap_get_num_active(esw->dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change mode while devlink traps are active"); + err = -EOPNOTSUPP; + goto unlock; + } ++#endif + err = esw_offloads_start(esw, extack); + } else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) { + err = esw_offloads_stop(esw, extack); +@@ -3800,14 +3840,23 @@ revert_inline_mode: + return err; + } + +-int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, +- struct netlink_ext_ack *extack) ++int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ struct netlink_ext_ack *extack; ++#endif + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw; + u8 mlx5_mode; + int err; + ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ extack = NULL; ++#endif + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); +@@ -3872,13 +3921,26 @@ int mlx5_devlink_eswitch_inline_mode_get + } + + int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, +- enum devlink_eswitch_encap_mode encap, +- struct netlink_ext_ack *extack) ++#ifdef HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET_GET_WITH_ENUM ++ enum devlink_eswitch_encap_mode encap ++#else ++ u8 encap ++#endif ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ struct netlink_ext_ack *extack; ++#endif + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw; + int err = 0; + ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ extack = NULL; ++#endif + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); +@@ -3931,7 +3993,11 @@ unlock: + } + + int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, ++#ifdef HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET_GET_WITH_ENUM + enum devlink_eswitch_encap_mode *encap) ++#else ++ u8 *encap) ++#endif + { + struct mlx5_eswitch *esw; + +@@ -3964,14 
+4030,24 @@ mlx5_eswitch_vport_has_rep(const struct + } + + int mlx5_devlink_eswitch_ipsec_mode_set(struct devlink *devlink, +- enum devlink_eswitch_ipsec_mode ipsec, +- struct netlink_ext_ack *extack) ++ enum devlink_eswitch_ipsec_mode ipsec ++#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++) + { ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ struct netlink_ext_ack *extack; ++#endif + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + int err = 0; + ++#ifndef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK ++ extack = NULL; ++#else + memset(extack, 0, sizeof(*extack)); ++#endif + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) +@@ -4419,15 +4495,24 @@ is_port_function_supported(struct mlx5_e + mlx5_esw_is_sf_vport(esw, vport_num); + } + +-int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port, +- u8 *hw_addr, int *hw_addr_len, +- struct netlink_ext_ack *extack) ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM ++int mlx5_devlink_port_function_hw_addr_get( ++#else ++int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink, ++#endif ++ struct devlink_port *port, ++ u8 *hw_addr, int *hw_addr_len, ++ struct netlink_ext_ack *extack) + { + struct mlx5_eswitch *esw; + struct mlx5_vport *vport; + u16 vport_num; + ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM + esw = mlx5_devlink_eswitch_get(port->devlink); ++#else ++ esw = mlx5_devlink_eswitch_get(devlink); ++#endif + if (IS_ERR(esw)) + return PTR_ERR(esw); + +@@ -4448,14 +4533,23 @@ int mlx5_devlink_port_function_hw_addr_g + return 0; + } + +-int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port, +- const u8 *hw_addr, int hw_addr_len, +- struct netlink_ext_ack *extack) ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM ++int mlx5_devlink_port_function_hw_addr_set( ++#else ++int mlx5_devlink_port_function_hw_addr_set(struct devlink *devlink, ++#endif ++ struct devlink_port *port, ++ const u8 *hw_addr, int hw_addr_len, ++ struct netlink_ext_ack *extack) + { + struct mlx5_eswitch *esw; + u16 vport_num; + ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM + esw = mlx5_devlink_eswitch_get(port->devlink); ++#else ++ esw = mlx5_devlink_eswitch_get(devlink); ++#endif + if (IS_ERR(esw)) { + NL_SET_ERR_MSG_MOD(extack, "Eswitch doesn't support set hw_addr"); + return PTR_ERR(esw); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0167-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch b/src/mlnx-ofa_kernel-5.8/backports/0167-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch new file mode 100644 index 0000000..3c6629d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0167-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch @@ -0,0 +1,20 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h + +Change-Id: I88357d11fbf3fe40c53c5044c7c0ace5019bb942 +--- + drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h +@@ -59,7 +59,9 @@ struct mlx5_fpga_device { + } conn_res; + + struct mlx5_fpga_ipsec *ipsec; ++#ifdef HAVE_UAPI_LINUX_TLS_H + struct mlx5_fpga_tls *tls; ++#endif + }; + + #define mlx5_fpga_dbg(__adev, format, ...) 
\ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0168-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch b/src/mlnx-ofa_kernel-5.8/backports/0168-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch new file mode 100644 index 0000000..19b2f19 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0168-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch @@ -0,0 +1,33 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c + +Change-Id: If05c0d194e70f3681451a3d71915bcc6473be205 +--- + .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +@@ -88,10 +88,17 @@ static const struct rhashtable_params rh + * value is not constant during the lifetime + * of the key object. + */ +- .key_len = sizeof_field(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) - +- sizeof_field(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), +- .key_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) + +- sizeof_field(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), ++#ifndef FIELD_SIZEOF ++ .key_len = sizeof_field(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) - ++ sizeof_field(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), ++ .key_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) + ++ sizeof_field(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), ++#else ++ .key_len = FIELD_SIZEOF(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) - ++ FIELD_SIZEOF(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), ++ .key_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) + ++ FIELD_SIZEOF(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), ++#endif + .head_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hash), + .automatic_shrinking = true, + .min_size = 1, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0169-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch b/src/mlnx-ofa_kernel-5.8/backports/0169-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch new file mode 100644 index 0000000..d299068 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0169-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch @@ -0,0 +1,76 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c + +Change-Id: I2f33c8672b8dbe3171059c0aafecd2b28e0afc99 +--- + .../ethernet/mellanox/mlx5/core/fpga/tls.c | 24 +++++++++++++++++++ + 1 file changed, 24 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c +@@ -31,6 +31,8 @@ + * + */ + ++#ifdef HAVE_UAPI_LINUX_TLS_H ++ + #include + #include "fpga/tls.h" + #include "fpga/cmd.h" +@@ -148,16 +150,28 @@ static int mlx5_fpga_tls_alloc_swid(stru + return ret; + } + ++#ifdef HAVE_IDR_REMOVE_RETURN_VALUE + static void *mlx5_fpga_tls_release_swid(struct idr *idr, ++#else ++static void mlx5_fpga_tls_release_swid(struct idr *idr, ++#endif + spinlock_t *idr_spinlock, u32 swid) + { + unsigned long flags; ++#ifdef HAVE_IDR_REMOVE_RETURN_VALUE + void *ptr; ++#endif + + spin_lock_irqsave(idr_spinlock, flags); ++#ifdef HAVE_IDR_REMOVE_RETURN_VALUE + ptr = idr_remove(idr, swid); ++#else ++ idr_remove(idr, swid); ++#endif + spin_unlock_irqrestore(idr_spinlock, flags); ++#ifdef HAVE_IDR_REMOVE_RETURN_VALUE + return ptr; ++#endif + } + + static void mlx_tls_kfree_complete(struct mlx5_fpga_conn *conn, +@@ -269,13 +283,21 @@ void mlx5_fpga_tls_del_flow(struct mlx5_ + void *flow; + + if (direction_sx) ++#ifdef 
HAVE_IDR_REMOVE_RETURN_VALUE + flow = mlx5_fpga_tls_release_swid(&tls->tx_idr, + &tls->tx_idr_spinlock, + swid); ++#else ++ flow = idr_find(&tls->tx_idr, swid); ++#endif + else ++#ifdef HAVE_IDR_REMOVE_RETURN_VALUE + flow = mlx5_fpga_tls_release_swid(&tls->rx_idr, + &tls->rx_idr_spinlock, + swid); ++#else ++ flow = idr_find(&tls->rx_idr, swid); ++#endif + + if (!flow) { + mlx5_fpga_err(mdev->fpga, "No flow information for swid %u\n", +@@ -620,3 +642,5 @@ free_swid: + + return ret; + } ++ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0170-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch b/src/mlnx-ofa_kernel-5.8/backports/0170-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch new file mode 100644 index 0000000..ae6987c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0170-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fpg.patch @@ -0,0 +1,36 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h + +Change-Id: I31a93a963a15bdbe6264888601c0ba19891b718b +--- + drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h +@@ -31,12 +31,19 @@ + * + */ + ++#ifdef HAVE_UAPI_LINUX_TLS_H ++ + #ifndef __MLX5_FPGA_TLS_H__ + #define __MLX5_FPGA_TLS_H__ + + #include + ++#ifdef HAVE_NET_TLS_H + #include ++#else ++#include ++#endif ++ + #include "fpga/core.h" + + struct mlx5_fpga_tls { +@@ -72,3 +79,5 @@ int mlx5_fpga_tls_resync_rx(struct mlx5_ + u32 seq, __be64 rcd_sn); + + #endif /* __MLX5_FPGA_TLS_H__ */ ++ ++#endif /* HAVE_UAPI_LINUX_TLS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0171-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch b/src/mlnx-ofa_kernel-5.8/backports/0171-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch new file mode 100644 index 0000000..e5f608a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0171-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch @@ -0,0 +1,262 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/fs_core.c + +Change-Id: I0fa7e6ba70eca6894561bed963aee56e3ae9ef89 +--- + .../net/ethernet/mellanox/mlx5/core/fs_core.c | 87 +++++++++++++++++-- + 1 file changed, 79 insertions(+), 8 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +@@ -321,15 +321,27 @@ enum fs_i_lock_class { + }; + + static const struct rhashtable_params rhash_fte = { +- .key_len = sizeof_field(struct fs_fte, val), ++#ifndef FIELD_SIZEOF ++ .key_len = sizeof_field(struct fs_fte, val), ++#else ++ .key_len = FIELD_SIZEOF(struct fs_fte, val), ++#endif + .key_offset = offsetof(struct fs_fte, val), + .head_offset = offsetof(struct fs_fte, hash), + .automatic_shrinking = true, + .min_size = 1, + }; + ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++static const struct bp_rhashtable_params rhash_fg = { ++#else + static const struct rhashtable_params rhash_fg = { +- .key_len = sizeof_field(struct mlx5_flow_group, mask), ++#endif ++#ifndef FIELD_SIZEOF ++ .key_len = sizeof_field(struct mlx5_flow_group, mask), ++#else ++ .key_len = FIELD_SIZEOF(struct mlx5_flow_group, mask), ++#endif + .key_offset = offsetof(struct mlx5_flow_group, mask), + .head_offset = offsetof(struct mlx5_flow_group, hash), + .automatic_shrinking = true, +@@ -543,7 +555,9 @@ static void del_hw_flow_table(struct fs_ + 
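Several fs_core.c hunks above switch between sizeof_field() and the older FIELD_SIZEOF() when filling the rhashtable key parameters. A standalone sketch of that fallback (the struct below is an invented stand-in for struct fs_fte; only the macro fallback mirrors the patch):

#include <stdio.h>
#include <stddef.h>

#ifndef sizeof_field
#define sizeof_field(TYPE, MEMBER) sizeof(((TYPE *)0)->MEMBER)
#endif

struct fte_like {			/* stand-in for struct fs_fte */
	unsigned int val[16];		/* match value used as the hash key */
	unsigned long hash;
};

int main(void)
{
	/* key_len / key_offset as the rhashtable params would compute them */
	printf("key_len=%zu key_offset=%zu\n",
	       sizeof_field(struct fte_like, val),
	       offsetof(struct fte_like, val));
	return 0;
}

On kernels that only ship FIELD_SIZEOF(), the patch uses it directly; both spellings expand to the same sizeof expression, so the computed key layout is unchanged either way.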
fs_get_obj(ft, node); + dev = get_dev(&ft->node); + root = find_root(&ft->node); ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_del_ft(ft); ++#endif + + if (node->active) { + err = root->cmds->destroy_flow_table(root, ft); +@@ -559,7 +573,11 @@ static void del_sw_flow_table(struct fs_ + + fs_get_obj(ft, node); + ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ bp_rhltable_destroy(&ft->fgs_hash); ++#else + rhltable_destroy(&ft->fgs_hash); ++#endif + if (ft->node.parent) { + fs_get_obj(prio, ft->node.parent); + prio->num_ft--; +@@ -595,7 +613,9 @@ static void del_sw_hw_rule(struct fs_nod + + fs_get_obj(rule, node); + fs_get_obj(fte, rule->node.parent); ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_del_rule(rule); ++#endif + if (is_fwd_next_action(rule->sw_action)) { + mutex_lock(&rule->dest_attr.ft->lock); + list_del(&rule->next_ft); +@@ -640,7 +660,9 @@ static void del_hw_fte(struct fs_node *n + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_del_fte(fte); ++#endif + dev = get_dev(&ft->node); + root = find_root(&ft->node); + if (node->active) { +@@ -667,7 +689,11 @@ static void del_sw_fte(struct fs_node *n + &fte->hash, + rhash_fte); + WARN_ON(err); ++#ifdef HAVE_IDA_FREE + ida_free(&fg->fte_allocator, fte->index - fg->start_index); ++#else ++ ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index); ++#endif + kmem_cache_free(steering->ftes_cache, fte); + } + +@@ -681,7 +707,9 @@ static void del_hw_flow_group(struct fs_ + fs_get_obj(fg, node); + fs_get_obj(ft, fg->node.parent); + dev = get_dev(&ft->node); ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_del_fg(fg); ++#endif + + root = find_root(&ft->node); + if (fg->node.active && root->cmds->destroy_flow_group(root, ft, fg)) +@@ -705,7 +733,11 @@ static void del_sw_flow_group(struct fs_ + fg->max_ftes == ft->autogroup.group_size && + fg->start_index < ft->autogroup.max_fte) + ft->autogroup.num_groups--; ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ err = bp_rhltable_remove(&ft->fgs_hash, ++#else + err = rhltable_remove(&ft->fgs_hash, ++#endif + &fg->hash, + rhash_fg); + WARN_ON(err); +@@ -716,8 +748,11 @@ static int insert_fte(struct mlx5_flow_g + { + int index; + int ret; +- ++#ifdef HAVE_IDA_ALLOC_MAX + index = ida_alloc_max(&fg->fte_allocator, fg->max_ftes - 1, GFP_KERNEL); ++#else ++ index = ida_simple_get(&fg->fte_allocator, 0, fg->max_ftes, GFP_KERNEL); ++#endif + if (index < 0) + return index; + +@@ -733,7 +768,11 @@ static int insert_fte(struct mlx5_flow_g + return 0; + + err_ida_remove: ++#ifdef HAVE_IDA_FREE + ida_free(&fg->fte_allocator, index); ++#else ++ ida_simple_remove(&fg->fte_allocator, index); ++#endif + return ret; + } + +@@ -812,7 +851,11 @@ static struct mlx5_flow_group *alloc_ins + return fg; + + /* initialize refcnt, add to parent list */ ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ ret = bp_rhltable_insert(&ft->fgs_hash, ++#else + ret = rhltable_insert(&ft->fgs_hash, ++#endif + &fg->hash, + rhash_fg); + if (ret) { +@@ -841,7 +884,11 @@ static struct mlx5_flow_table *alloc_flo + if (!ft) + return ERR_PTR(-ENOMEM); + ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ ret = bp_rhltable_init(&ft->fgs_hash, &rhash_fg); ++#else + ret = rhltable_init(&ft->fgs_hash, &rhash_fg); ++#endif + if (ret) { + kfree(ft); + return ERR_PTR(ret); +@@ -1217,12 +1264,18 @@ static struct mlx5_flow_table *__mlx5_cr + fs_prio->num_ft++; + 
up_write_ref_node(&fs_prio->node, false); + mutex_unlock(&root->chain_lock); ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_add_ft(ft); ++#endif + return ft; + destroy_ft: + root->cmds->destroy_flow_table(root, ft); + free_ft: ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ bp_rhltable_destroy(&ft->fgs_hash); ++#else + rhltable_destroy(&ft->fgs_hash); ++#endif + kfree(ft); + unlock_root: + mutex_unlock(&root->chain_lock); +@@ -1357,7 +1410,9 @@ struct mlx5_flow_group *mlx5_create_flow + tree_put_node(&fg->node, false); + return ERR_PTR(err); + } +- trace_mlx5_fs_add_fg(fg); ++#ifndef MLX_DISABLE_TRACEPOINTS ++ trace_mlx5_fs_add_fg(fg); ++#endif + fg->node.active = true; + + return fg; +@@ -1594,7 +1649,9 @@ static int create_auto_flow_group(struct + err = root->cmds->create_flow_group(root, ft, in, fg); + if (!err) { + fg->node.active = true; ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_add_fg(fg); ++#endif + } + + kvfree(in); +@@ -1737,12 +1794,16 @@ static struct mlx5_flow_handle *add_rule + fte->action.action = old_action; + return handle; + } ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_set_fte(fte, false); ++#endif + + for (i = 0; i < handle->num_rules; i++) { + if (refcount_read(&handle->rule[i]->node.refcount) == 1) { + tree_add_node(&handle->rule[i]->node, &fte->node); ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5_fs_add_rule(handle->rule[i]); ++#endif + } + } + return handle; +@@ -1808,16 +1869,26 @@ static int build_match_list(struct match + struct mlx5_flow_group *fg, + bool ft_locked) + { ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ struct bp_rhlist_head *tmp, *list; ++#else + struct rhlist_head *tmp, *list; ++#endif + struct mlx5_flow_group *g; + int err = 0; + + rcu_read_lock(); + INIT_LIST_HEAD(&match_head->list); + /* Collect all fgs which has a matching match_criteria */ +- list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg); ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ list = bp_rhltable_lookup(&ft->fgs_hash, spec, rhash_fg); + /* RCU is atomic, we can't execute FW commands here */ +- rhl_for_each_entry_rcu(g, tmp, list, hash) { ++ bp_rhl_for_each_entry_rcu(g, tmp, list, hash) { ++#else ++ list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg); ++ /* RCU is atomic, we can't execute FW commands here */ ++ rhl_for_each_entry_rcu(g, tmp, list, hash) { ++#endif + struct match_list *curr_match; + + if (fg && fg != g) +@@ -3232,8 +3303,8 @@ cleanup: + int mlx5_init_fs(struct mlx5_core_dev *dev) + { + struct mlx5_flow_steering *steering; +- char *ftes_cache_name; +- char *fgs_cache_name; ++ char *ftes_cache_name = NULL; ++ char *fgs_cache_name = NULL; + int err = 0; + + err = mlx5_init_fc_stats(dev); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0172-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch b/src/mlnx-ofa_kernel-5.8/backports/0172-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch new file mode 100644 index 0000000..e7d398c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0172-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch @@ -0,0 +1,43 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/fs_core.h + +Change-Id: Idde35652fafb9a1b8cc30c14d741271cddb46cfc +--- + drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +@@ -37,7 
+37,7 @@ + #include + #include + #include +-#include ++#include "steering/fs_dr.h" + + #define FDB_TC_MAX_CHAIN 3 + #define FDB_FT_CHAIN (FDB_TC_MAX_CHAIN + 1) +@@ -195,7 +195,11 @@ struct mlx5_flow_table { + /* FWD rules that point on this flow table */ + struct list_head fwd_rules; + u32 flags; ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ struct bp_rhltable fgs_hash; ++#else + struct rhltable fgs_hash; ++#endif + enum mlx5_flow_table_miss_action def_miss_action; + struct mlx5_flow_namespace *ns; + }; +@@ -263,7 +267,11 @@ struct mlx5_flow_group { + struct ida fte_allocator; + u32 id; + struct rhashtable ftes_hash; ++#if !defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE) ++ struct bp_rhlist_head hash; ++#else + struct rhlist_head hash; ++#endif + }; + + struct mlx5_flow_root_namespace { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0173-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch b/src/mlnx-ofa_kernel-5.8/backports/0173-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch new file mode 100644 index 0000000..9ae57ae --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0173-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fs_.patch @@ -0,0 +1,187 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c + +Change-Id: I0f0f738f512f8a59b5edc8731c87ebbd549d8e5e +--- + .../ethernet/mellanox/mlx5/core/fs_counters.c | 83 +++++++++++++++++-- + 1 file changed, 76 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +@@ -106,14 +106,46 @@ static void mlx5_fc_pool_release_counter + * elapsed, the thread will actually query the hardware. + */ + ++#if defined(HAVE_IDR_RT) ++#define USE_IDR 1 ++#else ++/* for now, we want to use this if it's original kernel function and ++ * we don't define idr_* funcs ourselves, so it will be fast. */ ++void *idr_get_next_ul(struct idr *idr, unsigned long *nextid) ++{ ++ int next = (int) *nextid; ++ void *ret; ++ ++ ret = idr_get_next(idr, &next); ++ *nextid = (unsigned long) next; ++ ++ return ret; ++} ++int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, ++ unsigned long max, gfp_t gfp) ++{ ++ int err = idr_alloc(idr, ptr, *nextid, max + 1, gfp); ++ ++ if (err < 0) ++ return err; ++ ++ *nextid = err; ++ ++ return 0; ++} ++#define USE_IDR 1 ++#endif ++ + static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev, + u32 id) + { + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; +- unsigned long next_id = (unsigned long)id + 1; +- struct mlx5_fc *counter; +- unsigned long tmp; +- ++#ifdef USE_IDR ++ unsigned long next_id = (unsigned long)id + 1; ++#endif ++ struct mlx5_fc *counter; ++#ifdef idr_for_each_entry_continue_ul ++ unsigned long tmp; + rcu_read_lock(); + /* skip counters that are in idr, but not yet in counters list */ + idr_for_each_entry_continue_ul(&fc_stats->counters_idr, +@@ -123,7 +155,24 @@ static struct list_head *mlx5_fc_counter + } + rcu_read_unlock(); + +- return counter ? 
&counter->list : &fc_stats->counters; ++#elif defined(USE_IDR) ++ rcu_read_lock(); ++ /* skip counters that are in idr, but not yet in counters list */ ++ while ((counter = idr_get_next_ul(&fc_stats->counters_idr, ++ &next_id)) != NULL && ++ list_empty(&counter->list)) ++ next_id++; ++ rcu_read_unlock(); ++#else ++ list_for_each_entry(counter, &fc_stats->counters, list) ++ if (counter->id > id) ++ return &counter->list; ++#endif ++#ifdef USE_IDR ++ return counter ? &counter->list : &fc_stats->counters; ++#else ++ return &fc_stats->counters; ++#endif + } + + static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev, +@@ -137,13 +186,21 @@ static void mlx5_fc_stats_insert(struct + static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev, + struct mlx5_fc *counter) + { ++#ifdef USE_IDR + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; ++#endif + + list_del(&counter->list); + ++#ifdef USE_IDR + spin_lock(&fc_stats->counters_idr_lock); ++#ifdef HAVE_IDR_REMOVE_RETURN_VALUE + WARN_ON(!idr_remove(&fc_stats->counters_idr, counter->id)); ++#else ++ idr_remove(&fc_stats->counters_idr, counter->id); ++#endif + spin_unlock(&fc_stats->counters_idr_lock); ++#endif/*USE_IDR*/ + } + + static int get_init_bulk_query_len(struct mlx5_core_dev *dev) +@@ -350,21 +407,27 @@ struct mlx5_fc *mlx5_fc_create_ex(struct + { + struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging); + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; ++#ifdef USE_IDR + int err; ++#endif + + if (IS_ERR(counter)) + return counter; + ++#ifdef USE_IDR + INIT_LIST_HEAD(&counter->list); ++#endif + counter->aging = aging; + + if (aging) { ++#ifdef USE_IDR + u32 id = counter->id; +- ++#endif + counter->cache.lastuse = jiffies; + counter->lastbytes = counter->cache.bytes; + counter->lastpackets = counter->cache.packets; + ++#ifdef USE_IDR + idr_preload(GFP_KERNEL); + spin_lock(&fc_stats->counters_idr_lock); + +@@ -375,15 +438,18 @@ struct mlx5_fc *mlx5_fc_create_ex(struct + idr_preload_end(); + if (err) + goto err_out_alloc; ++#endif + + llist_add(&counter->addlist, &fc_stats->addlist); + } + + return counter; + ++#ifdef USE_IDR + err_out_alloc: + mlx5_fc_release(dev, counter); + return ERR_PTR(err); ++#endif + } + + struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) +@@ -462,8 +528,10 @@ int mlx5_init_fc_stats(struct mlx5_core_ + if (dev->disable_fc) + return 0; + ++#ifdef USE_IDR + spin_lock_init(&fc_stats->counters_idr_lock); + idr_init(&fc_stats->counters_idr); ++#endif + INIT_LIST_HEAD(&fc_stats->counters); + init_llist_head(&fc_stats->addlist); + init_llist_head(&fc_stats->dellist); +@@ -512,7 +580,9 @@ void mlx5_cleanup_fc_stats(struct mlx5_c + mlx5_fc_release(dev, counter); + + mlx5_fc_pool_cleanup(&fc_stats->fc_pool); ++#ifdef USE_IDR + idr_destroy(&fc_stats->counters_idr); ++#endif + kfree(fc_stats->bulk_query_out); + } + +@@ -566,7 +636,6 @@ void mlx5_fc_update_sampling_interval(st + fc_stats->sampling_interval = min_t(unsigned long, interval, + fc_stats->sampling_interval); + } +- + /* Flow counter bluks */ + + struct mlx5_fc_bulk { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0174-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw..patch b/src/mlnx-ofa_kernel-5.8/backports/0174-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw..patch new file mode 100644 index 0000000..e0fd5e9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0174-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw..patch @@ -0,0 +1,60 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: 
drivers/net/ethernet/mellanox/mlx5/core/fw.c + +Change-Id: I2842ac9a3e88d1c27ded1a1f52a5f13511d8058d +--- + drivers/net/ethernet/mellanox/mlx5/core/fw.c | 17 ++++++++++++++--- + 1 file changed, 14 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c +@@ -262,11 +262,13 @@ int mlx5_query_hca_caps(struct mlx5_core + return err; + } + ++#ifdef HAVE_UAPI_LINUX_TLS_H + if (mlx5_accel_is_ktls_tx(dev) || mlx5_accel_is_ktls_rx(dev)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_TLS); + if (err) + return err; + } ++#endif + + if (MLX5_CAP_GEN_64(dev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) { +@@ -706,15 +708,20 @@ static const struct mlxfw_dev_ops mlx5_m + }; + + int mlx5_firmware_flash(struct mlx5_core_dev *dev, +- const struct firmware *firmware, +- struct netlink_ext_ack *extack) ++ const struct firmware *firmware ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5_mlxfw_dev mlx5_mlxfw_dev = { + .mlxfw_dev = { + .ops = &mlx5_mlxfw_dev_ops, + .psid = dev->board_id, + .psid_size = strlen(dev->board_id), ++#ifdef HAVE_DEVLINK_H + .devlink = priv_to_devlink(dev), ++#endif + }, + .mlx5_core_dev = dev + }; +@@ -728,7 +735,11 @@ int mlx5_firmware_flash(struct mlx5_core + } + + return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev, +- firmware, extack); ++ firmware ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + } + + static int mlx5_reg_mcqi_version_query(struct mlx5_core_dev *dev, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0175-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch b/src/mlnx-ofa_kernel-5.8/backports/0175-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch new file mode 100644 index 0000000..59d4889 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0175-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch @@ -0,0 +1,209 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c + +Change-Id: I9a6db7203a4d745dd194a002ff04e2819aaf29cb +--- + .../ethernet/mellanox/mlx5/core/fw_reset.c | 72 ++++++++++++++++++- + 1 file changed, 71 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +@@ -8,7 +8,9 @@ + enum { + MLX5_FW_RESET_FLAGS_RESET_REQUESTED, + MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + MLX5_FW_RESET_FLAGS_PENDING_COMP, ++#endif + MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS + }; + +@@ -23,8 +25,10 @@ struct mlx5_fw_reset { + struct work_struct reset_abort_work; + unsigned long reset_flags; + struct timer_list timer; ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + struct completion done; + int ret; ++#endif + }; + + void mlx5_fw_reset_enable_remote_dev_reset_set(struct mlx5_core_dev *dev, bool enable) +@@ -84,6 +88,7 @@ int mlx5_fw_reset_query(struct mlx5_core + return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL); + } + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev, + struct netlink_ext_ack *extack) + { +@@ -135,12 +140,14 @@ int mlx5_fw_reset_set_reset_sync(struct + NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed"); + return mlx5_cmd_check(dev, err, in, out); + } ++#endif /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION */ + + int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) + { + return 
mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false); + } + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev) + { + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; +@@ -149,12 +156,18 @@ static void mlx5_fw_reset_complete_reloa + if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) { + complete(&fw_reset->done); + } else { ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ mlx5_unload_one(dev); ++ if (mlx5_health_wait_pci_up(dev)) ++ mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); ++#endif + mlx5_load_one(dev, false); + devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0, + BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); + } + } ++#endif + + static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev) + { +@@ -183,22 +196,45 @@ static void mlx5_sync_reset_reload_work( + struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, + reset_reload_work); + struct mlx5_core_dev *dev = fw_reset->dev; ++#ifndef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port + int err; ++#endif + + mlx5_sync_reset_clear_reset_requested(dev, false); + mlx5_enter_error_state(dev, true); ++#ifndef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port + mlx5_unload_one(dev); + err = mlx5_health_wait_pci_up(dev); +- if (err) ++ if (err) { + mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); ++#ifndef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION ++ return; ++#endif ++ } ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER */ ++ ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION ++#ifndef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port + fw_reset->ret = err; ++#endif + mlx5_fw_reset_complete_reload(dev); ++#else /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION */ ++ mlx5_load_one(dev, false); ++#endif /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION */ + } + + #define MLX5_RESET_POLL_INTERVAL (HZ / 10) ++#ifdef HAVE_TIMER_SETUP + static void poll_sync_reset(struct timer_list *t) ++#else ++static void poll_sync_reset(unsigned long data) ++#endif + { ++#ifdef HAVE_TIMER_SETUP + struct mlx5_fw_reset *fw_reset = from_timer(fw_reset, t, timer); ++#else ++ struct mlx5_fw_reset *fw_reset = (struct mlx5_fw_reset *)data; ++#endif + struct mlx5_core_dev *dev = fw_reset->dev; + u32 fatal_error; + +@@ -224,7 +260,13 @@ static void mlx5_start_sync_reset_poll(s + { + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + ++#ifdef HAVE_TIMER_SETUP + timer_setup(&fw_reset->timer, poll_sync_reset, 0); ++#else ++ init_timer(&fw_reset->timer); ++ fw_reset->timer.data = (unsigned long)fw_reset; ++ fw_reset->timer.function = poll_sync_reset; ++#endif + fw_reset->timer.expires = round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL); + add_timer(&fw_reset->timer); + } +@@ -288,6 +330,10 @@ static void mlx5_sync_reset_request_even + mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. 
Device reset is expected.\n"); + } + ++#ifndef PCI_EXP_RTCAP_CRSVIS ++#define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */ ++#endif ++ + static int mlx5_pci_config_hw_control(struct pci_dev *root_port, + bool new_val, bool *prev_val) + { +@@ -440,6 +486,20 @@ static int mlx5_reset_pci_topology(struc + return 0; + } + ++#ifndef HAVE_PCIE_FIND_ROOT_PORT ++static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev) ++{ ++ while (dev) { ++ if (pci_is_pcie(dev) && ++ pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) ++ return dev; ++ dev = pci_upstream_bridge(dev); ++ } ++ ++ return NULL; ++} ++#endif ++ + static int mlx5_pci_link_toggle_ecpf(struct mlx5_core_dev *dev) + { + struct pci_dev *root_port; +@@ -638,10 +698,16 @@ static void mlx5_sync_reset_now_event(st + } + + mlx5_enter_error_state(dev, true); ++#ifndef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port + mlx5_unload_one(dev); ++#endif + done: ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + fw_reset->ret = err; + mlx5_fw_reset_complete_reload(dev); ++#else ++ mlx5_load_one(dev, false); ++#endif + } + + static void mlx5_sync_reset_abort_event(struct work_struct *work) +@@ -697,6 +763,7 @@ static int fw_reset_event_notifier(struc + return NOTIFY_OK; + } + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev) + { + unsigned long pci_sync_update_timeout = mlx5_tout_ms(dev, PCI_SYNC_UPDATE); +@@ -715,6 +782,7 @@ out: + clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); + return err; + } ++#endif /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION */ + + void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev) + { +@@ -762,7 +830,9 @@ int mlx5_fw_reset_init(struct mlx5_core_ + INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event); + INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event); + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + init_completion(&fw_reset->done); ++#endif + return 0; + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0176-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch b/src/mlnx-ofa_kernel-5.8/backports/0176-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch new file mode 100644 index 0000000..5d46c39 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0176-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-fw_.patch @@ -0,0 +1,20 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h + +Change-Id: I6ed68a3bbdf1f195b5ce24235473310f5308921e +--- + drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h +@@ -13,7 +13,9 @@ int mlx5_fw_reset_set_reset_sync(struct + struct netlink_ext_ack *extack); + int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev); + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev); ++#endif + void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev); + void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev); + void mlx5_drain_fw_reset(struct mlx5_core_dev *dev); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0177-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-hea.patch b/src/mlnx-ofa_kernel-5.8/backports/0177-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-hea.patch new file mode 100644 index 0000000..cd93b38 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0177-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-hea.patch @@ -0,0 +1,344 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/health.c + +Change-Id: I2a5e79b8debeeb4a659fb0d47837349105487e27 +--- + .../net/ethernet/mellanox/mlx5/core/health.c | 157 ++++++++++++++++-- + 1 file changed, 147 insertions(+), 10 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c +@@ -80,6 +80,31 @@ enum { + MLX5_SEVERITY_VALID_MASK = 0x8, + }; + ++/* BACKPORT for RHEL 7.4 PPC - can't be inline with unknown num of args */ ++static void mlx5_printk(struct mlx5_core_dev *dev, int level, const char *format, ...) ++{ ++ struct device *device = dev->device; ++ struct va_format vaf; ++ va_list args; ++ ++ if (WARN_ONCE(level < LOGLEVEL_EMERG || level > LOGLEVEL_DEBUG, ++ "Level %d is out of range, set to default level\n", level)) ++ level = LOGLEVEL_DEFAULT; ++ ++ va_start(args, format); ++ vaf.fmt = format; ++ vaf.va = &args; ++ ++ dev_printk_emit(level, device, "%s %s: %pV", dev_driver_string(device), dev_name(device), ++ &vaf); ++ va_end(args); ++} ++ ++#define mlx5_log(__dev, level, format, ...) \ ++ mlx5_printk(__dev, level, "%s:%d:(pid %d): " format, \ ++ __func__, __LINE__, current->pid, \ ++ ##__VA_ARGS__) ++ + u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) + { + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; +@@ -368,6 +393,23 @@ err_eio: + return -EIO; + } + ++#ifndef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT ++static void health_recover_work(struct work_struct *work) ++{ ++ struct mlx5_core_health *health; ++ struct mlx5_core_dev *dev; ++ struct mlx5_priv *priv; ++ ++ health = container_of(work, struct mlx5_core_health, fatal_report_work); ++ priv = container_of(health, struct mlx5_priv, health); ++ dev = container_of(priv, struct mlx5_core_dev, priv); ++ ++ enter_error_state(dev, false); ++ if (mlx5_health_try_recover(dev)) ++ mlx5_core_err(dev, "Health recovery failed\n"); ++} ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ ++ + static const char *hsynd_str(u8 synd) + { + switch (synd) { +@@ -468,10 +510,14 @@ static void print_health_info(struct mlx + mlx5_log(dev, severity, "raw fw_ver 0x%08x\n", ioread32be(&h->fw_ver)); + } + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + static int + mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, +- struct devlink_fmsg *fmsg, +- struct netlink_ext_ack *extack) ++ struct devlink_fmsg *fmsg ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + struct mlx5_core_health *health = &dev->priv.health; +@@ -583,8 +629,11 @@ mlx5_fw_reporter_heath_buffer_data_put(s + + static int + mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, +- struct devlink_fmsg *fmsg, void *priv_ctx, +- struct netlink_ext_ack *extack) ++ struct devlink_fmsg *fmsg, void *priv_ctx ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + int err; +@@ -638,22 +687,36 @@ static const struct devlink_health_repor + + static int + mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, +- void *priv_ctx, +- struct netlink_ext_ack *extack) ++ void *priv_ctx ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + 
struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + + return mlx5_health_try_recover(dev); + } + ++#ifndef HAVE_DEVLINK_FMSG_BINARY_PAIR_PUT_ARG_U32 ++#define MLX5_CR_DUMP_CHUNK_SIZE 256 ++#endif ++ + static int + mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, +- struct devlink_fmsg *fmsg, void *priv_ctx, +- struct netlink_ext_ack *extack) ++ struct devlink_fmsg *fmsg, void *priv_ctx ++#ifdef HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + u32 crdump_size = dev->priv.health.crdump_size; + u32 *cr_data; ++#ifndef HAVE_DEVLINK_FMSG_BINARY_PAIR_PUT_ARG_U32 ++ u32 data_size; ++ u32 offset; ++#endif + int err; + + if (!mlx5_core_is_pf(dev)) +@@ -677,7 +740,24 @@ mlx5_fw_fatal_reporter_dump(struct devli + goto free_data; + } + ++#ifdef HAVE_DEVLINK_FMSG_BINARY_PAIR_PUT_ARG_U32 + err = devlink_fmsg_binary_pair_put(fmsg, "crdump_data", cr_data, crdump_size); ++#else ++ err = devlink_fmsg_arr_pair_nest_start(fmsg, "crdump_data"); ++ if (err) ++ goto free_data; ++ for (offset = 0; offset < crdump_size; offset += data_size) { ++ if (crdump_size - offset < MLX5_CR_DUMP_CHUNK_SIZE) ++ data_size = crdump_size - offset; ++ else ++ data_size = MLX5_CR_DUMP_CHUNK_SIZE; ++ err = devlink_fmsg_binary_put(fmsg, (char *)cr_data + offset, ++ data_size); ++ if (err) ++ goto free_data; ++ } ++ err = devlink_fmsg_arr_pair_nest_end(fmsg); ++#endif /* HAVE_DEVLINK_FMSG_BINARY_PAIR_PUT_ARG_U32 */ + + free_data: + kvfree(cr_data); +@@ -690,24 +770,38 @@ static void mlx5_fw_fatal_reporter_err_w + struct mlx5_fw_reporter_ctx fw_reporter_ctx; + struct mlx5_core_health *health; + struct mlx5_core_dev *dev; ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ struct devlink *devlink; ++#endif + struct mlx5_priv *priv; + + health = container_of(work, struct mlx5_core_health, fatal_report_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); +- ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devlink = priv_to_devlink(dev); ++#endif + enter_error_state(dev, false); + if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_lock(devlink); ++#endif + if (mlx5_health_try_recover(dev)) + mlx5_core_err(dev, "health recovery failed\n"); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + return; + } + fw_reporter_ctx.err_synd = health->synd; + fw_reporter_ctx.miss_counter = health->miss_counter; ++#ifdef HAVE_DEVLINK_HEALTH_REPORTER_STATE_UPDATE + if (health->failed_in_seq && + health->failed_in_seq < MLX5_MAX_FAILED_RECOVERIES_IN_SEQUENCE) + devlink_health_reporter_state_update(health->fw_fatal_reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); ++#endif ++ + if (devlink_health_report(health->fw_fatal_reporter, + "FW fatal error reported", &fw_reporter_ctx) == -ECANCELED) { + /* If recovery wasn't performed, due to grace period, +@@ -748,7 +842,11 @@ static void mlx5_fw_reporters_create(str + + health->fw_reporter = + devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops, +- 0, dev); ++#ifndef HAVE_DEVLINK_HEALTH_REPORTER_CREATE_5_ARGS ++ 0, dev); ++#else ++ 0, false, dev); ++#endif + if (IS_ERR(health->fw_reporter)) + mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", + PTR_ERR(health->fw_reporter)); +@@ -756,7 +854,11 @@ static void mlx5_fw_reporters_create(str + 
health->fw_fatal_reporter = + devlink_health_reporter_create(devlink, + &mlx5_fw_fatal_reporter_ops, ++#ifndef HAVE_DEVLINK_HEALTH_REPORTER_CREATE_5_ARGS + grace_period, dev); ++#else ++ grace_period, true, dev); ++#endif + if (IS_ERR(health->fw_fatal_reporter)) + mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", + PTR_ERR(health->fw_fatal_reporter)); +@@ -772,6 +874,7 @@ static void mlx5_fw_reporters_destroy(st + if (!IS_ERR_OR_NULL(health->fw_fatal_reporter)) + devlink_health_reporter_destroy(health->fw_fatal_reporter); + } ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + static unsigned long get_next_poll_jiffies(struct mlx5_core_dev *dev) + { +@@ -822,13 +925,25 @@ static void mlx5_health_log_ts_update(st + msecs_to_jiffies(MLX5_MSEC_PER_HOUR)); + } + ++#ifdef HAVE_TIMER_SETUP + static void poll_health(struct timer_list *t) ++#else ++static void poll_health(unsigned long data) ++#endif + { ++#ifdef HAVE_TIMER_SETUP + struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); ++#else ++ struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; ++#endif + struct mlx5_core_health *health = &dev->priv.health; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + struct health_buffer __iomem *h = health->health; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + u32 fatal_error; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + u8 prev_synd; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + u32 count; + + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) +@@ -855,13 +970,17 @@ static void poll_health(struct timer_lis + if (health->miss_counter == MAX_MISSES) { + mlx5_core_err(dev, "device's health compromised - reached miss count\n"); + print_health_info(dev); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + queue_work(health->wq, &health->report_work); ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + } + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + prev_synd = health->synd; + health->synd = ioread8(&h->synd); + if (health->synd && health->synd != prev_synd) + queue_work(health->wq, &health->report_work); ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + out: + mod_timer(&health->timer, get_next_poll_jiffies(dev)); +@@ -872,7 +991,13 @@ void mlx5_start_health_poll(struct mlx5_ + u64 poll_interval_ms = mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL); + struct mlx5_core_health *health = &dev->priv.health; + ++#ifdef HAVE_TIMER_SETUP + timer_setup(&health->timer, poll_health, 0); ++#else ++ init_timer(&health->timer); ++ health->timer.data = (unsigned long)dev; ++ health->timer.function = poll_health; ++#endif + health->fatal_error = MLX5_SENSOR_NO_ERR; + clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + health->health = &dev->iseg->health; +@@ -913,7 +1038,9 @@ void mlx5_drain_health_wq(struct mlx5_co + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + spin_unlock_irqrestore(&health->wq_lock, flags); + cancel_delayed_work_sync(&health->update_fw_log_ts_work); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + cancel_work_sync(&health->report_work); ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + cancel_work_sync(&health->fatal_report_work); + } + +@@ -930,7 +1057,9 @@ void mlx5_health_cleanup(struct mlx5_cor + + cancel_delayed_work_sync(&health->update_fw_log_ts_work); + destroy_workqueue(health->wq); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + mlx5_fw_reporters_destroy(dev); ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + } + + int mlx5_health_init(struct mlx5_core_dev *dev) +@@ -938,7 +1067,9 @@ int mlx5_health_init(struct mlx5_core_de + struct 
mlx5_core_health *health; + char *name; + ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + mlx5_fw_reporters_create(dev); ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + + health = &dev->priv.health; + name = kmalloc(64, GFP_KERNEL); +@@ -952,13 +1083,19 @@ int mlx5_health_init(struct mlx5_core_de + if (!health->wq) + goto out_err; + spin_lock_init(&health->wq_lock); ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work); + INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); ++#else ++ INIT_WORK(&health->fatal_report_work, health_recover_work); ++#endif + INIT_DELAYED_WORK(&health->update_fw_log_ts_work, mlx5_health_log_ts_update); + + return 0; + + out_err: ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + mlx5_fw_reporters_destroy(dev); ++#endif + return -ENOMEM; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0178-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch b/src/mlnx-ofa_kernel-5.8/backports/0178-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch new file mode 100644 index 0000000..7786418 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0178-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch @@ -0,0 +1,193 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c + +Change-Id: Ic39fbdeb861fd185fd473a6d935efaea29f20841 +--- + .../mellanox/mlx5/core/ipoib/ethtool.c | 98 ++++++++++++++++++- + 1 file changed, 96 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +@@ -67,9 +67,13 @@ static void mlx5i_get_ethtool_stats(stru + } + + static int mlx5i_set_ringparam(struct net_device *dev, ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *param) ++#endif + { + struct mlx5e_priv *priv = mlx5i_epriv(dev); + +@@ -77,9 +81,13 @@ static int mlx5i_set_ringparam(struct ne + } + + static void mlx5i_get_ringparam(struct net_device *dev, ++#ifdef HAVE_GET_RINGPARAM_GET_4_PARAMS + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_ringparam *param) ++#endif + { + struct mlx5e_priv *priv = mlx5i_epriv(dev); + +@@ -103,23 +111,41 @@ static void mlx5i_get_channels(struct ne + } + + static int mlx5i_set_coalesce(struct net_device *netdev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + +- return mlx5e_ethtool_set_coalesce(priv, coal, kernel_coal, extack); ++ return mlx5e_ethtool_set_coalesce(priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS ++ coal, kernel_coal, extack); ++#else ++ coal); ++#endif + } + + static int mlx5i_get_coalesce(struct net_device *netdev, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) ++#else ++ struct ethtool_coalesce *coal) ++#endif + { + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + +- return mlx5e_ethtool_get_coalesce(priv, coal, kernel_coal); ++ return mlx5e_ethtool_get_coalesce(priv, ++#ifdef HAVE_NDO_GET_COALESCE_GET_4_PARAMS ++ coal, kernel_coal); ++#else ++ 
coal); ++#endif + } + + static int mlx5i_get_ts_info(struct net_device *netdev, +@@ -190,6 +216,7 @@ static int mlx5i_get_speed_settings(u16 + return rate * width; + } + ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + static int mlx5i_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) + { +@@ -220,11 +247,64 @@ static int mlx5i_get_link_ksettings(stru + + return 0; + } ++#endif ++ ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++static int mlx5i_get_port_settings(struct net_device *netdev, ++ u16 *ib_link_width_oper, u16 *ib_proto_oper) ++{ ++ struct mlx5e_priv *priv = mlx5i_epriv(netdev); ++ struct mlx5_core_dev *mdev = priv->mdev; ++ u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; ++ int ret; ++ ++ ret = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_IB, 1); ++ if (ret) ++ return ret; ++ ++ *ib_link_width_oper = MLX5_GET(ptys_reg, out, ib_link_width_oper); ++ *ib_proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper); ++ ++ return 0; ++} ++ ++static int mlx5i_get_settings(struct net_device *netdev, ++ struct ethtool_cmd *ecmd) ++{ ++ u16 ib_link_width_oper; ++ u16 ib_proto_oper; ++ int speed, ret; ++ ++ ret = mlx5i_get_port_settings(netdev, ++ &ib_link_width_oper, ++ &ib_proto_oper); ++ if (ret) ++ return ret; ++ ++ speed = mlx5i_get_speed_settings(ib_link_width_oper, ib_proto_oper); ++ if (speed < 0) ++ return -EINVAL; ++ ++ ecmd->duplex = DUPLEX_FULL; ++ ecmd->port = PORT_OTHER;// FIXME: till define IB port type ++ ecmd->phy_address = 255; ++ ecmd->autoneg = AUTONEG_DISABLE; ++ ++ ethtool_cmd_speed_set(ecmd, speed); ++ ++ return 0; ++} ++#endif + + #ifdef CONFIG_MLX5_EN_RXNFC + static u32 mlx5i_flow_type_mask(u32 flow_type) + { ++#ifdef HAVE_FLOW_RSS + return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS); ++#else ++ return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT); ++#endif ++ + } + + static int mlx5i_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +@@ -248,9 +328,11 @@ static int mlx5i_get_rxnfc(struct net_de + #endif + + const struct ethtool_ops mlx5i_ethtool_ops = { ++#ifdef HAVE_SUPPORTED_COALESCE_PARAM + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USE_ADAPTIVE, ++#endif + .get_drvinfo = mlx5i_get_drvinfo, + .get_strings = mlx5i_get_strings, + .get_sset_count = mlx5i_get_sset_count, +@@ -267,7 +349,12 @@ const struct ethtool_ops mlx5i_ethtool_o + .get_rxnfc = mlx5i_get_rxnfc, + .set_rxnfc = mlx5i_set_rxnfc, + #endif ++#ifdef HAVE_GET_SET_LINK_KSETTINGS + .get_link_ksettings = mlx5i_get_link_ksettings, ++#endif ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++ .get_settings = mlx5i_get_settings, ++#endif + .get_link = ethtool_op_get_link, + }; + +@@ -275,4 +362,11 @@ const struct ethtool_ops mlx5i_pkey_etht + .get_drvinfo = mlx5i_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = mlx5i_get_ts_info, ++#ifdef HAVE_GET_SET_LINK_KSETTINGS ++ .get_link_ksettings = mlx5i_get_link_ksettings, ++#endif ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++ .get_settings = mlx5i_get_settings, ++#endif + }; ++ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0179-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch b/src/mlnx-ofa_kernel-5.8/backports/0179-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch new file mode 100644 index 0000000..8caef74 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0179-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch @@ -0,0 +1,205 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c + 
+Change-Id: I2bf3ef8d7e901edd91c7880d91ebc800ac671803 +--- + .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 95 ++++++++++++++++++- + 1 file changed, 90 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +@@ -46,11 +46,28 @@ static int mlx5i_change_mtu(struct net_d + static const struct net_device_ops mlx5i_netdev_ops = { + .ndo_open = mlx5i_open, + .ndo_stop = mlx5i_close, +- .ndo_get_stats64 = mlx5i_get_stats, +- .ndo_init = mlx5i_dev_init, +- .ndo_uninit = mlx5i_dev_cleanup, +- .ndo_change_mtu = mlx5i_change_mtu, ++#if defined(HAVE_NDO_GET_STATS64) || defined(HAVE_NDO_GET_STATS64_RET_VOID) ++ .ndo_get_stats64 = mlx5i_get_stats, ++#else ++ .ndo_get_stats = mlx5i_get_stats, ++#endif ++ .ndo_init = mlx5i_dev_init, ++ .ndo_uninit = mlx5i_dev_cleanup, ++#ifndef HAVE_NDO_CHANGE_MTU_EXTENDED ++ .ndo_change_mtu = mlx5i_change_mtu, ++#else ++ .extended.ndo_change_mtu = mlx5i_change_mtu, ++#endif ++ ++#ifdef HAVE_NDO_ETH_IOCTL + .ndo_eth_ioctl = mlx5i_ioctl, ++#else ++ .ndo_do_ioctl = mlx5i_ioctl, ++#endif ++ ++#ifdef HAVE_NET_DEVICE_OPS_EXTENDED ++ .ndo_size = sizeof(struct net_device_ops), ++#endif + }; + + /* IPoIB mlx5 netdev profile */ +@@ -68,20 +85,50 @@ static void mlx5i_build_nic_params(struc + MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE; + + params->packet_merge.type = MLX5E_PACKET_MERGE_NONE; ++ + params->hard_mtu = MLX5_IB_GRH_BYTES + MLX5_IPOIB_HARD_LEN; + params->tunneled_offload_en = false; + } + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++void set_lro_features_bit(struct mlx5e_priv *priv) ++{ ++ struct mlx5e_params *params = &priv->channels.params; ++ u64 hw_support_lro = 0; ++ ++ hw_support_lro = priv->netdev->hw_features & NETIF_F_RXCSUM; ++ if (hw_support_lro) { ++ priv->netdev->hw_features |= NETIF_F_LRO; ++ priv->netdev->features |= NETIF_F_LRO; ++ priv->netdev->wanted_features |= NETIF_F_LRO; ++ } ++ params->lro_en = hw_support_lro; ++} ++#endif ++ + /* Called directly after IPoIB netdevice was created to initialize SW structs */ + int mlx5i_init(struct mlx5_core_dev *mdev, struct net_device *netdev) + { + struct mlx5e_priv *priv = mlx5i_epriv(netdev); ++#ifndef HAVE_NET_DEVICE_MIN_MAX_MTU ++ u16 max_mtu; ++#endif + + netif_carrier_off(netdev); + mlx5e_set_netdev_mtu_boundaries(priv); ++#ifdef HAVE_NET_DEVICE_MIN_MAX_MTU + netdev->mtu = netdev->max_mtu; + +- mlx5e_build_nic_params(priv, NULL, netdev->mtu); ++#else ++ mlx5_query_port_max_mtu(mdev, &max_mtu, 1); ++ netdev->mtu = max_mtu; ++#endif ++ ++ mlx5e_build_nic_params(priv, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ NULL, ++#endif ++ netdev->mtu); + mlx5i_build_nic_params(mdev, &priv->channels.params); + + mlx5e_timestamp_init(priv); +@@ -95,6 +142,9 @@ int mlx5i_init(struct mlx5_core_dev *mde + netdev->hw_features |= NETIF_F_TSO6; + netdev->hw_features |= NETIF_F_RXCSUM; + netdev->hw_features |= NETIF_F_RXHASH; ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ set_lro_features_bit(priv); ++#endif + + netdev->netdev_ops = &mlx5i_netdev_ops; + netdev->ethtool_ops = &mlx5i_ethtool_ops; +@@ -141,10 +191,20 @@ static void mlx5i_grp_sw_update_stats(st + priv->stats.sw.tx_queue_dropped = s.tx_dropped; + } + ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID + void mlx5i_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) ++#elif defined(HAVE_NDO_GET_STATS64) ++struct rtnl_link_stats64 * mlx5i_get_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *stats) ++#else ++struct net_device_stats * mlx5i_get_stats(struct net_device *dev) 
++#endif + { + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5e_sw_stats *sstats = &priv->stats.sw; ++#if !defined(HAVE_NDO_GET_STATS64) && !defined(HAVE_NDO_GET_STATS64_RET_VOID) ++ struct net_device_stats *stats = &priv->netdev_stats; ++#endif + + mlx5i_grp_sw_update_stats(priv); + +@@ -153,6 +213,10 @@ void mlx5i_get_stats(struct net_device * + stats->tx_packets = sstats->tx_packets; + stats->tx_bytes = sstats->tx_bytes; + stats->tx_dropped = sstats->tx_queue_dropped; ++ ++#ifndef HAVE_NDO_GET_STATS64_RET_VOID ++ return stats; ++#endif + } + + int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) +@@ -332,7 +396,9 @@ static int mlx5i_create_flow_steering(st + if (err) { + netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n", + err); ++#ifdef CONFIG_RFS_ACCEL + priv->netdev->hw_features &= ~NETIF_F_NTUPLE; ++#endif + } + + err = mlx5e_create_ttc_table(priv); +@@ -479,13 +545,21 @@ int mlx5i_dev_init(struct net_device *de + { + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5i_priv *ipriv = priv->ppriv; ++#ifdef HAVE_DEV_ADDR_MOD + u8 addr_mod[3]; ++#endif + + /* Set dev address using underlay QP */ ++#ifdef HAVE_DEV_ADDR_MOD + addr_mod[0] = (ipriv->qpn >> 16) & 0xff; + addr_mod[1] = (ipriv->qpn >> 8) & 0xff; + addr_mod[2] = (ipriv->qpn) & 0xff; + dev_addr_mod(dev, 1, addr_mod, sizeof(addr_mod)); ++#else ++ dev->dev_addr[1] = (ipriv->qpn >> 16) & 0xff; ++ dev->dev_addr[2] = (ipriv->qpn >> 8) & 0xff; ++ dev->dev_addr[3] = (ipriv->qpn) & 0xff; ++#endif + + /* Add QPN to net-device mapping to HT */ + mlx5i_pkey_add_qpn(dev, ipriv->qpn); +@@ -640,7 +714,13 @@ static int mlx5i_xmit(struct net_device + struct mlx5_ib_ah *mah = to_mah(address); + struct mlx5i_priv *ipriv = epriv->ppriv; + ++#ifdef HAVE_NETDEV_XMIT_MORE + mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey, netdev_xmit_more()); ++#elif defined(HAVE_SK_BUFF_XMIT_MORE) ++ mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey, skb->xmit_more); ++#else ++ mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey); ++#endif + + return NETDEV_TX_OK; + } +@@ -742,8 +822,10 @@ static int mlx5_rdma_setup_rn(struct ib_ + rn->detach_mcast = mlx5i_detach_mcast; + rn->set_id = mlx5i_set_pkey_index; + ++#ifdef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV + netdev->priv_destructor = mlx5_rdma_netdev_free; + netdev->needs_free_netdev = 1; ++#endif + + return 0; + +@@ -778,6 +860,9 @@ int mlx5_rdma_rn_get_params(struct mlx5_ + .rxqs = nch, + .param = mdev, + .initialize_rdma_netdev = mlx5_rdma_setup_rn, ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++ .uninitialize_rdma_netdev = mlx5_rdma_netdev_free, ++#endif + }; + + return 0; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0180-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch b/src/mlnx-ofa_kernel-5.8/backports/0180-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch new file mode 100644 index 0000000..28053ab --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0180-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch @@ -0,0 +1,33 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h + +Change-Id: Ic8ef4ce10a479d0772d212a8337a4fcc620caeed +--- + .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h +@@ -114,8 +114,20 @@ struct mlx5i_tx_wqe { + ((struct mlx5i_tx_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct 
mlx5i_tx_wqe))) + + void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, +- struct mlx5_av *av, u32 dqpn, u32 dqkey, bool xmit_more); ++ struct mlx5_av *av, u32 dqpn, u32 dqkey ++#if defined(HAVE_SK_BUFF_XMIT_MORE) || defined(HAVE_NETDEV_XMIT_MORE) ++ , bool xmit_more); ++#else ++ ); ++#endif ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID + void mlx5i_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); ++#elif defined(HAVE_NDO_GET_STATS64) ++struct rtnl_link_stats64 * mlx5i_get_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *stats); ++#else ++struct net_device_stats * mlx5i_get_stats(struct net_device *dev); ++#endif + + #endif /* CONFIG_MLX5_CORE_IPOIB */ + #endif /* __MLX5E_IPOB_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0181-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch b/src/mlnx-ofa_kernel-5.8/backports/0181-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch new file mode 100644 index 0000000..74cf2f6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0181-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ipo.patch @@ -0,0 +1,73 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c + +Change-Id: I86093b198c9da847f1b59fafff6104b5f21e330e +--- + .../mellanox/mlx5/core/ipoib/ipoib_vlan.c | 29 ++++++++++++++++++- + 1 file changed, 28 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c +@@ -73,8 +73,9 @@ static struct qpn_to_netdev *mlx5i_find_ + { + struct hlist_head *h = &buckets[hash_32(qpn, MLX5I_MAX_LOG_PKEY_SUP)]; + struct qpn_to_netdev *node; ++ COMPAT_HL_NODE + +- hlist_for_each_entry(node, h, hlist) { ++ compat_hlist_for_each_entry(node, h, hlist) { + if (node->underlay_qpn == qpn) + return node; + } +@@ -146,10 +147,28 @@ static const struct net_device_ops mlx5i + .ndo_open = mlx5i_pkey_open, + .ndo_stop = mlx5i_pkey_close, + .ndo_init = mlx5i_pkey_dev_init, ++#if defined(HAVE_NDO_GET_STATS64) || defined(HAVE_NDO_GET_STATS64_RET_VOID) + .ndo_get_stats64 = mlx5i_get_stats, ++#else ++ .ndo_get_stats = mlx5i_get_stats, ++#endif + .ndo_uninit = mlx5i_pkey_dev_cleanup, ++#ifndef HAVE_NDO_CHANGE_MTU_RH74 + .ndo_change_mtu = mlx5i_pkey_change_mtu, ++ .ndo_do_ioctl = mlx5i_pkey_ioctl, ++#else ++ .ndo_change_mtu_rh74 = mlx5i_pkey_change_mtu, ++#endif ++ ++#ifdef HAVE_NDO_ETH_IOCTL + .ndo_eth_ioctl = mlx5i_pkey_ioctl, ++#else ++ .ndo_do_ioctl = mlx5i_pkey_ioctl, ++#endif ++ ++#ifdef HAVE_NET_DEVICE_OPS_EXTENDED ++ .ndo_size = sizeof(struct net_device_ops), ++#endif + }; + + /* Child NDOs */ +@@ -165,6 +184,7 @@ static int mlx5i_pkey_dev_init(struct ne + /* Get QPN to netdevice hash table from parent */ + parent_ifindex = dev->netdev_ops->ndo_get_iflink(dev); + parent_dev = dev_get_by_index(dev_net(dev), parent_ifindex); ++ + if (!parent_dev) { + mlx5_core_warn(priv->mdev, "failed to get parent device\n"); + return -EINVAL; +@@ -286,6 +306,13 @@ static int mlx5i_pkey_init(struct mlx5_c + if (err) + return err; + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ netdev->features &= ~NETIF_F_LRO; ++ priv->netdev->hw_features &= ~NETIF_F_LRO; ++ priv->netdev->wanted_features &= ~NETIF_F_LRO; ++ priv->channels.params.lro_en = false; ++#endif ++ + /* Override parent ndo */ + netdev->netdev_ops = &mlx5i_pkey_netdev_ops; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0182-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch 
b/src/mlnx-ofa_kernel-5.8/backports/0182-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch new file mode 100644 index 0000000..5d4c202 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0182-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch @@ -0,0 +1,34 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c + +Change-Id: I1be01efd6f1321f6a35919073c062237433caad2 +--- + .../ethernet/mellanox/mlx5/core/lag/debugfs.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c +@@ -147,6 +147,22 @@ static int members_show(struct seq_file + return 0; + } + ++#ifndef DEFINE_SHOW_ATTRIBUTE ++#define DEFINE_SHOW_ATTRIBUTE(__name) \ ++static int __name ## _open(struct inode *inode, struct file *file) \ ++{ \ ++ return single_open(file, __name ## _show, inode->i_private); \ ++} \ ++ \ ++static const struct file_operations __name ## _fops = { \ ++ .owner = THIS_MODULE, \ ++ .open = __name ## _open, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++#endif ++ + DEFINE_SHOW_ATTRIBUTE(type); + DEFINE_SHOW_ATTRIBUTE(port_sel_mode); + DEFINE_SHOW_ATTRIBUTE(state); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0183-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch b/src/mlnx-ofa_kernel-5.8/backports/0183-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch new file mode 100644 index 0000000..767064b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0183-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch @@ -0,0 +1,559 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c + +Change-Id: I55bc9a084b59711b27a8d0ba4d0f556011e7291f +--- + .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 210 +++++++++++++++++- + 1 file changed, 202 insertions(+), 8 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +@@ -34,13 +34,26 @@ + #include + #include + #include +-#include + #include "lib/devcom.h" + #include "mlx5_core.h" + #include "eswitch.h" +-#include "lag.h" +-#include "mp.h" + #include "esw/acl/ofld.h" ++#ifdef MLX_USE_LAG_COMPAT ++#define MLX_IMPL_LAG_EVENTS ++#include ++#include ++#include ++#include "en.h" ++#endif ++ ++#include ++ ++#if defined(MLX_USE_LAG_COMPAT) || defined(HAVE_LAG_TX_TYPE) ++#define MLX_LAG_SUPPORTED ++#endif ++ ++#ifdef MLX_LAG_SUPPORTED ++#include "lag.h" + + enum { + MLX5_LAG_EGRESS_PORT_1 = 1, +@@ -51,7 +64,94 @@ enum { + * under it). 
+ */ + static DEFINE_SPINLOCK(lag_lock); ++#endif ++ ++#ifdef MLX_USE_LAG_COMPAT ++#undef register_netdevice_notifier ++#undef unregister_netdevice_notifier ++#define register_netdevice_notifier mlx5_lag_compat_register_netdev_notifier ++#define unregister_netdevice_notifier mlx5_lag_compat_unregister_netdev_notifier ++#undef register_netdevice_notifier_rh ++#undef unregister_netdevice_notifier_rh ++#define register_netdevice_notifier_rh mlx5_lag_compat_register_netdev_notifier ++#define unregister_netdevice_notifier_rh mlx5_lag_compat_unregister_netdev_notifier ++ ++#undef netdev_notifier_info_to_dev ++#define netdev_notifier_info_to_dev netdev_notifier_info_to_dev_v2 ++ ++#define MLX5_LAG_COMPAT_MAX_LAGDEVS 0x8 ++ ++static int mlx5_lag_netdev_event(struct notifier_block *this, ++ unsigned long event, void *ptr); ++ ++static struct mlx5_lag *mlx5_lag_compat_ldevs[MLX5_LAG_COMPAT_MAX_LAGDEVS] = {}; ++static int mlx5_lag_compat_reg_ldevs = 0; ++ ++static void mlx5_lag_compat_netdev_event(unsigned long event, void *ptr) ++{ ++ struct mlx5_lag *ldev; ++ int i; ++ ++ for (i = 0; i < MLX5_LAG_COMPAT_MAX_LAGDEVS; ++i) { ++ ldev = mlx5_lag_compat_ldevs[i]; ++ if (!ldev) ++ continue; ++ mlx5_lag_netdev_event(&ldev->nb, event, ptr); ++ } ++} ++ ++static int mlx5_lag_compat_register_netdev_notifier(struct notifier_block *nb) ++{ ++ struct mlx5_lag *ldev = container_of(nb, struct mlx5_lag, nb); ++ int err = 0, i; ++ ++ if (!mlx5_lag_compat_reg_ldevs) ++ mlx_lag_compat_events_open(mlx5_lag_compat_netdev_event); ++ ++ rtnl_lock(); ++ for (i = 0; i < MLX5_LAG_COMPAT_MAX_LAGDEVS; ++i) { ++ if (mlx5_lag_compat_ldevs[i]) ++ continue; ++ ++ mlx5_lag_compat_ldevs[i] = ldev; ++ break; ++ } ++ ++ if (i == MLX5_LAG_COMPAT_MAX_LAGDEVS) { ++ err = -EINVAL; ++ goto unlock; ++ } ++ ++ ++mlx5_lag_compat_reg_ldevs; ++ ++unlock: ++ rtnl_unlock(); ++ return err; ++} ++ ++static void mlx5_lag_compat_unregister_netdev_notifier(struct notifier_block *nb) ++{ ++ struct mlx5_lag *ldev = container_of(nb, struct mlx5_lag, nb); ++ int i; ++ ++ rtnl_lock(); ++ for (i = 0; i < MLX5_LAG_COMPAT_MAX_LAGDEVS; ++i) { ++ if (mlx5_lag_compat_ldevs[i] != ldev) ++ continue; ++ ++ mlx5_lag_compat_ldevs[i] = NULL; ++ break; ++ } ++ ++ --mlx5_lag_compat_reg_ldevs; ++ rtnl_unlock(); ++ ++ if (!mlx5_lag_compat_reg_ldevs) ++ mlx_lag_compat_events_close(); ++} ++#endif + ++#ifdef MLX_LAG_SUPPORTED + static u8 lag_active_port_bits(struct mlx5_lag *ldev) + { + u8 enabled_ports[MLX5_MAX_PORTS] = {}; +@@ -108,24 +208,33 @@ static int mlx5_cmd_modify_lag(struct ml + + return mlx5_cmd_exec_in(dev, modify_lag, in); + } ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + + int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return -EOPNOTSUPP; ++#else + u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {}; + + MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG); + + return mlx5_cmd_exec_in(dev, create_vport_lag, in); ++#endif /* #ifndef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_cmd_create_vport_lag); + + int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return -EOPNOTSUPP; ++#else + u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {}; + + MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG); + + return mlx5_cmd_exec_in(dev, destroy_vport_lag, in); ++#endif /* #ifndef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); + +@@ -207,7 +316,11 @@ static void mlx5_ldev_free(struct kref * + struct mlx5_lag *ldev = container_of(ref, 
struct mlx5_lag, ref); + + if (ldev->nb.notifier_call) ++#ifdef HAVE_UNREGISTER_NETDEVICE_NOTIFIER_NET + unregister_netdevice_notifier_net(&init_net, &ldev->nb); ++#else ++ unregister_netdevice_notifier(&ldev->nb); ++#endif + mlx5_lag_mp_cleanup(ldev); + cancel_delayed_work_sync(&ldev->bond_work); + destroy_workqueue(ldev->wq); +@@ -245,7 +358,11 @@ static struct mlx5_lag *mlx5_lag_dev_all + INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); + + ldev->nb.notifier_call = mlx5_lag_netdev_event; ++#ifdef HAVE_UNREGISTER_NETDEVICE_NOTIFIER_NET + if (register_netdevice_notifier_net(&init_net, &ldev->nb)) { ++#else ++ if (register_netdevice_notifier(&ldev->nb)) { ++#endif + ldev->nb.notifier_call = NULL; + mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); + } +@@ -264,11 +381,11 @@ int mlx5_lag_dev_get_netdev_idx(struct m + struct net_device *ndev) + { + int i; +- ++#ifdef MLX_LAG_SUPPORTED + for (i = 0; i < ldev->ports; i++) + if (ldev->pf[i].netdev == ndev) + return i; +- ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + return -ENOENT; + } + +@@ -859,6 +976,7 @@ static void mlx5_do_bond(struct mlx5_lag + dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; + mlx5_rescan_drivers_locked(dev0); + ++#ifdef CONFIG_MLX5_ESWITCH + err = mlx5_eswitch_reload_reps(dev0->priv.eswitch); + if (!err) + err = mlx5_eswitch_reload_reps(dev1->priv.eswitch); +@@ -873,6 +991,7 @@ static void mlx5_do_bond(struct mlx5_lag + mlx5_core_err(dev0, "Failed to enable lag\n"); + return; + } ++#endif + } + } else if (do_bond && __mlx5_lag_is_active(ldev)) { + mlx5_modify_lag(ldev, &tracker); +@@ -933,8 +1052,12 @@ static bool mlx5_lag_eval_bonding_conds( + struct slave *slave; + bool changed = false; + +- rcu_read_lock(); +- for_each_netdev_in_bond_rcu(upper, ndev_tmp) { ++#ifdef for_each_netdev_in_bond_rcu ++ rcu_read_lock(); ++ for_each_netdev_in_bond_rcu(upper, ndev_tmp) { ++#else ++ for_each_netdev_in_bond(upper, ndev_tmp) { ++#endif + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); + if (idx >= 0) { + slave = bond_slave_get_rcu(ndev_tmp); +@@ -945,8 +1068,9 @@ static bool mlx5_lag_eval_bonding_conds( + + num_slaves++; + } ++#ifdef for_each_netdev_in_bond_rcu + rcu_read_unlock(); +- ++#endif + /* None of this lagdev's netdevs are slaves of this master. 
*/ + if (!(bond_status & GENMASK(ldev->ports - 1, 0))) + return false; +@@ -975,13 +1099,23 @@ static bool mlx5_lag_eval_bonding_conds( + + if (!mlx5_lag_is_ready(ldev)) { + if (info) ++#ifdef HAVE_NETDEV_NOTIFIER_INFO_EXTACK + NL_SET_ERR_MSG_MOD(info->info.extack, + "Can't activate LAG offload, PF is configured with more than 64 VFs"); ++#else ++ netdev_warn(upper, ++ "Can't activate LAG offload, PF is configured with more than 64 VFs\n"); ++#endif + } + else if (!mode_supported) { + if (info) ++#ifdef HAVE_NETDEV_NOTIFIER_INFO_EXTACK + NL_SET_ERR_MSG_MOD(info->info.extack, + "Can't activate LAG offload, TX type isn't supported"); ++#else ++ netdev_warn(upper, ++ "Can't activate LAG offload, TX type isn't supported\n"); ++#endif + } + + return changed; +@@ -1004,7 +1138,9 @@ static bool mlx5_handle_changeupper_even + + if (lag_upper_info) { + tx_type = lag_upper_info->tx_type; ++#ifdef HAVE_INFO_HASH_TYPE + tracker->hash_type = lag_upper_info->hash_type; ++#endif + } + } + +@@ -1271,6 +1407,7 @@ static int __mlx5_lag_dev_add_mdev(struc + + void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev) + { ++#ifdef MLX_LAG_SUPPORTED + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); +@@ -1291,10 +1428,12 @@ recheck: + mlx5_ldev_remove_mdev(ldev, dev); + mutex_unlock(&ldev->lock); + mlx5_ldev_put(ldev); ++#endif /* #ifndef MLX_LAG_SUPPORTED */ + } + + void mlx5_lag_add_mdev(struct mlx5_core_dev *dev) + { ++#ifdef MLX_LAG_SUPPORTED + int err; + + if (!mlx5_lag_is_supported(dev)) +@@ -1310,11 +1449,13 @@ recheck: + goto recheck; + } + mlx5_ldev_add_debugfs(dev); ++#endif /* #ifndef MLX_LAG_SUPPORTED */ + } + + void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, + struct net_device *netdev) + { ++#ifdef MLX_LAG_SUPPORTED + struct mlx5_lag *ldev; + bool lag_is_active; + +@@ -1334,12 +1475,14 @@ void mlx5_lag_remove_netdev(struct mlx5_ + + if (lag_is_active) + mlx5_queue_bond_work(ldev, 0); ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + + /* Must be called with intf_mutex held */ + void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, + struct net_device *netdev) + { ++#ifdef MLX_LAG_SUPPORTED + struct mlx5_lag *ldev; + int i; + +@@ -1359,10 +1502,14 @@ void mlx5_lag_add_netdev(struct mlx5_cor + + mutex_unlock(&ldev->lock); + mlx5_lag_update_trackers(ldev); ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + + bool mlx5_lag_is_roce(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return false; ++#else + struct mlx5_lag *ldev; + unsigned long flags; + bool res; +@@ -1373,11 +1520,15 @@ bool mlx5_lag_is_roce(struct mlx5_core_d + spin_unlock_irqrestore(&lag_lock, flags); + + return res; ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_lag_is_roce); + + bool mlx5_lag_is_active(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return false; ++#else + struct mlx5_lag *ldev; + unsigned long flags; + bool res; +@@ -1388,6 +1539,7 @@ bool mlx5_lag_is_active(struct mlx5_core + spin_unlock_irqrestore(&lag_lock, flags); + + return res; ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_lag_is_active); + +@@ -1409,6 +1561,9 @@ EXPORT_SYMBOL(mlx5_lag_mode_is_hash); + + bool mlx5_lag_is_master(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return false; ++#else + struct mlx5_lag *ldev; + unsigned long flags; + bool res; +@@ -1420,11 +1575,15 @@ bool mlx5_lag_is_master(struct mlx5_core + spin_unlock_irqrestore(&lag_lock, flags); + + return res; ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_lag_is_master); + + bool 
mlx5_lag_is_sriov(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return false; ++#else + struct mlx5_lag *ldev; + unsigned long flags; + bool res; +@@ -1435,11 +1594,15 @@ bool mlx5_lag_is_sriov(struct mlx5_core_ + spin_unlock_irqrestore(&lag_lock, flags); + + return res; ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_lag_is_sriov); + + bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return false; ++#else + struct mlx5_lag *ldev; + unsigned long flags; + bool res; +@@ -1449,6 +1612,7 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_ + res = ldev && ldev->flags & MLX5_LAG_FLAG_MULTI_PORT_ESW; + spin_unlock_irqrestore(&lag_lock, flags); + ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + return res; + } + EXPORT_SYMBOL(mlx5_lag_is_mpesw); +@@ -1470,6 +1634,7 @@ EXPORT_SYMBOL(mlx5_lag_is_shared_fdb); + + void mlx5_lag_disable_change(struct mlx5_core_dev *dev) + { ++#ifdef MLX_LAG_SUPPORTED + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); +@@ -1485,10 +1650,12 @@ void mlx5_lag_disable_change(struct mlx5 + + mutex_unlock(&ldev->lock); + mlx5_dev_list_unlock(); ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + + void mlx5_lag_enable_change(struct mlx5_core_dev *dev) + { ++#ifdef MLX_LAG_SUPPORTED + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); +@@ -1499,10 +1666,14 @@ void mlx5_lag_enable_change(struct mlx5_ + ldev->mode_changes_in_progress--; + mutex_unlock(&ldev->lock); + mlx5_queue_bond_work(ldev, 0); ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + + struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return NULL; ++#else + struct net_device *ndev = NULL; + struct mlx5_lag *ldev; + unsigned long flags; +@@ -1530,12 +1701,16 @@ unlock: + spin_unlock_irqrestore(&lag_lock, flags); + + return ndev; ++#endif /* #ifdef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); + + u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, + struct net_device *slave) + { ++#ifndef MLX_LAG_SUPPORTED ++ return 0; ++#else + struct mlx5_lag *ldev; + unsigned long flags; + u8 port = 0; +@@ -1558,6 +1733,7 @@ u8 mlx5_lag_get_slave_port(struct mlx5_c + unlock: + spin_unlock_irqrestore(&lag_lock, flags); + return port; ++#endif /* #ifndef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_lag_get_slave_port); + +@@ -1575,6 +1751,9 @@ EXPORT_SYMBOL(mlx5_lag_get_num_ports); + + struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev) + { ++#ifndef MLX_LAG_SUPPORTED ++ return NULL; ++#else + struct mlx5_core_dev *peer_dev = NULL; + struct mlx5_lag *ldev; + unsigned long flags; +@@ -1591,6 +1770,7 @@ struct mlx5_core_dev *mlx5_lag_get_peer_ + unlock: + spin_unlock_irqrestore(&lag_lock, flags); + return peer_dev; ++#endif /* #ifndef MLX_LAG_SUPPORTED */ + } + EXPORT_SYMBOL(mlx5_lag_get_peer_mdev); + +@@ -1601,7 +1781,9 @@ int mlx5_lag_query_cong_counters(struct + { + int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); + struct mlx5_core_dev **mdev; ++#ifdef MLX_LAG_SUPPORTED + struct mlx5_lag *ldev; ++#endif + unsigned long flags; + int num_ports; + int ret = 0; +@@ -1620,6 +1802,7 @@ int mlx5_lag_query_cong_counters(struct + + memset(values, 0, sizeof(*values) * num_counters); + ++#ifdef MLX_LAG_SUPPORTED + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (ldev && __mlx5_lag_is_active(ldev)) { +@@ -1631,6 +1814,10 @@ int mlx5_lag_query_cong_counters(struct + mdev[MLX5_LAG_P1] = dev; + } + spin_unlock_irqrestore(&lag_lock, flags); ++#else ++ num_ports = 1; 
++ mdev[0] = dev; ++#endif + + for (i = 0; i < num_ports; ++i) { + u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {}; +@@ -1666,7 +1853,9 @@ int mlx5_lag_modify_cong_params(struct m + void *in, int in_size) + { + struct mlx5_core_dev **mdev; ++#ifdef MLX_LAG_SUPPORTED + struct mlx5_lag *ldev; ++#endif + unsigned long flags; + int num_ports; + int ret = 0; +@@ -1676,6 +1865,7 @@ int mlx5_lag_modify_cong_params(struct m + if (!mdev) + return -ENOMEM; + ++#ifdef MLX_LAG_SUPPORTED + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (ldev && __mlx5_lag_is_active(ldev)) { +@@ -1687,6 +1877,10 @@ int mlx5_lag_modify_cong_params(struct m + mdev[0] = dev; + } + spin_unlock_irqrestore(&lag_lock, flags); ++#else ++ num_ports = 1; ++ mdev[0] = dev; ++#endif + + for (i = 0; i < num_ports; i++) { + ret = mlx5_cmd_modify_cong_params(mdev[i], in, in_size); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0184-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0184-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..a1ac72f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0184-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,596 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c + +Change-Id: Ieb6b5cefc2ccec8366902aba3bf9bd8c3510355f +--- + .../ethernet/mellanox/mlx5/core/lib/clock.c | 263 +++++++++++++++++- + 1 file changed, 249 insertions(+), 14 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +@@ -38,6 +38,10 @@ + #include "en.h" + #include "clock.h" + ++#ifndef smp_store_mb ++#define smp_store_mb set_mb ++#endif ++ + enum { + MLX5_CYCLES_SHIFT = 23 + }; +@@ -98,25 +102,37 @@ static int mlx5_set_mtutc(struct mlx5_co + } + + static u64 mlx5_read_time(struct mlx5_core_dev *dev, ++#ifdef HAVE_GETTIMEX64 + struct ptp_system_timestamp *sts, ++#else ++ void *sts, ++#endif + bool real_time) + { + u32 timer_h, timer_h1, timer_l; + + timer_h = ioread32be(real_time ? &dev->iseg->real_time_h : + &dev->iseg->internal_timer_h); ++#ifdef HAVE_GETTIMEX64 + ptp_read_system_prets(sts); ++#endif + timer_l = ioread32be(real_time ? &dev->iseg->real_time_l : + &dev->iseg->internal_timer_l); ++#ifdef HAVE_GETTIMEX64 + ptp_read_system_postts(sts); ++#endif + timer_h1 = ioread32be(real_time ? &dev->iseg->real_time_h : + &dev->iseg->internal_timer_h); + if (timer_h != timer_h1) { + /* wrap around */ ++#ifdef HAVE_GETTIMEX64 + ptp_read_system_prets(sts); ++#endif + timer_l = ioread32be(real_time ? &dev->iseg->real_time_l : + &dev->iseg->internal_timer_l); ++#ifdef HAVE_GETTIMEX64 + ptp_read_system_postts(sts); ++#endif + } + + return real_time ? 
REAL_TIME_TO_NS(timer_h1, timer_l) : +@@ -139,14 +155,12 @@ static void mlx5_update_clock_info_page( + struct mlx5_clock *clock = &mdev->clock; + struct mlx5_timer *timer; + u32 sign; +- + if (!clock_info) + return; + + sign = smp_load_acquire(&clock_info->sign); + smp_store_mb(clock_info->sign, + sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING); +- + timer = &clock->timer; + clock_info->cycles = timer->tc.cycle_last; + clock_info->mult = timer->cycles.mult; +@@ -157,6 +171,7 @@ static void mlx5_update_clock_info_page( + sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2); + } + ++#if defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE) + static void mlx5_pps_out(struct work_struct *work) + { + struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, +@@ -185,6 +200,7 @@ static void mlx5_pps_out(struct work_str + mlx5_set_mtpps(mdev, in, sizeof(in)); + } + } ++#endif + + static void mlx5_timestamp_overflow(struct work_struct *work) + { +@@ -205,8 +221,13 @@ static void mlx5_timestamp_overflow(stru + schedule_delayed_work(&timer->overflow_work, timer->overflow_period); + } + ++#if (defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE)) + static int mlx5_ptp_settime_real_time(struct mlx5_core_dev *mdev, ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + const struct timespec64 *ts) ++#else ++ const struct timespec *ts) ++#endif + { + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + +@@ -224,7 +245,12 @@ static int mlx5_ptp_settime_real_time(st + return mlx5_set_mtutc(mdev, in, sizeof(in)); + } + +-static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) ++static int mlx5_ptp_settime(struct ptp_clock_info *ptp, ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++ const struct timespec64 *ts) ++#else ++ const struct timespec *ts) ++#endif + { + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; +@@ -238,27 +264,53 @@ static int mlx5_ptp_settime(struct ptp_c + return err; + + write_seqlock_irqsave(&clock->lock, flags); ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + timecounter_init(&timer->tc, &timer->cycles, timespec64_to_ns(ts)); ++#else ++ timecounter_init(&timer->tc, &timer->cycles, timespec_to_ns(ts)); ++#endif + mlx5_update_clock_info_page(mdev); + write_sequnlock_irqrestore(&clock->lock, flags); + + return 0; + } +- ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + static + struct timespec64 mlx5_ptp_gettimex_real_time(struct mlx5_core_dev *mdev, ++#else ++static ++struct timespec mlx5_ptp_gettimex_real_time(struct mlx5_core_dev *mdev, ++#endif ++#ifdef HAVE_GETTIMEX64 + struct ptp_system_timestamp *sts) ++#else ++ void *sts) ++#endif + { ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + struct timespec64 ts; ++#else ++ struct timespec ts; ++#endif + u64 time; + + time = mlx5_read_time(mdev, sts, true); ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + ts = ns_to_timespec64(time); ++#else ++ ts = ns_to_timespec(time); ++#endif + return ts; + } + +-static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, +- struct ptp_system_timestamp *sts) ++#ifdef HAVE_GETTIMEX64 ++static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++ struct timespec64 *ts, ++#else ++ struct timespec *ts, ++#endif ++ struct ptp_system_timestamp *sts) + { + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; +@@ -276,11 +328,49 @@ static int mlx5_ptp_gettimex(struct ptp_ + cycles = 
mlx5_read_time(mdev, sts, false); + ns = timecounter_cyc2time(&timer->tc, cycles); + write_sequnlock_irqrestore(&clock->lock, flags); ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + *ts = ns_to_timespec64(ns); ++#else ++ *ts = ns_to_timespec(ns); ++#endif + out: + return 0; + } ++#else/*HAVE_GETTIMEX64*/ ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++static int mlx5_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) ++#else ++static int mlx5_ptp_gettime(struct ptp_clock_info *ptp, struct timespec *ts) ++#endif ++{ ++ struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ++ ptp_info); ++ struct mlx5_timer *timer = &clock->timer; ++ struct mlx5_core_dev *mdev; ++ unsigned long flags; ++ u64 cycles, ns; ++ ++ mdev = container_of(clock, struct mlx5_core_dev, clock); ++ if (mlx5_real_time_mode(mdev)) { ++ *ts = mlx5_ptp_gettimex_real_time(mdev, NULL); ++ goto out; ++ } ++ ++ write_seqlock_irqsave(&clock->lock, flags); ++ cycles = mlx5_read_time(mdev, NULL, false); ++ ns = timecounter_read(&timer->tc); ++ write_sequnlock_irqrestore(&clock->lock, flags); ++ ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++ *ts = ns_to_timespec64(ns); ++#else ++ *ts = ns_to_timespec(ns); ++#endif + ++out: ++ return 0; ++} ++#endif/*HAVE_GETTIMEX64*/ + static int mlx5_ptp_adjtime_real_time(struct mlx5_core_dev *mdev, s64 delta) + { + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; +@@ -290,12 +380,21 @@ static int mlx5_ptp_adjtime_real_time(st + + /* HW time adjustment range is s16. If out of range, settime instead */ + if (delta < S16_MIN || delta > S16_MAX) { ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + struct timespec64 ts; ++#else ++ struct timespec ts; ++#endif + s64 ns; + + ts = mlx5_ptp_gettimex_real_time(mdev, NULL); ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + ns = timespec64_to_ns(&ts) + delta; + ts = ns_to_timespec64(ns); ++#else ++ ns = timespec_to_ns(&ts) + delta; ++ ts = ns_to_timespec(ns); ++#endif + return mlx5_ptp_settime_real_time(mdev, &ts); + } + +@@ -374,6 +473,32 @@ static int mlx5_ptp_adjfreq(struct ptp_c + return 0; + } + ++#ifndef PTP_STRICT_FLAGS ++#define PTP_STRICT_FLAGS (1<<3) ++#endif ++#ifndef PTP_EXTTS_EDGES ++#define PTP_EXTTS_EDGES (PTP_RISING_EDGE | PTP_FALLING_EDGE) ++#endif ++ ++#ifndef HAVE_PTP_FIND_PIN_UNLOCK ++static int mlx5_ptp_find_pin(struct mlx5_clock *clock, ++ enum ptp_pin_function func, ++ unsigned int chan, int on) ++{ ++ int i; ++ ++ if (on) ++ return ptp_find_pin(clock->ptp, func, chan); ++ ++ for (i = 0; i < clock->ptp_info.n_pins; i++) { ++ if (clock->ptp_info.pin_config[i].func == func && ++ clock->ptp_info.pin_config[i].chan == chan) ++ return i; ++ } ++ return -1; ++} ++#endif ++ + static int mlx5_extts_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +@@ -408,7 +533,12 @@ static int mlx5_extts_configure(struct p + if (rq->extts.index >= clock->ptp_info.n_pins) + return -EINVAL; + ++#ifdef HAVE_PTP_FIND_PIN_UNLOCK + pin = ptp_find_pin(clock->ptp, PTP_PF_EXTTS, rq->extts.index); ++#else ++ pin = mlx5_ptp_find_pin(clock, PTP_PF_EXTTS, rq->extts.index, on); ++#endif ++ + if (pin < 0) + return -EBUSY; + +@@ -459,11 +589,19 @@ static u64 find_target_cycles(struct mlx + + static u64 perout_conf_internal_timer(struct mlx5_core_dev *mdev, s64 sec) + { ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + struct timespec64 ts = {}; ++#else ++ struct timespec ts = {}; ++#endif + s64 target_ns; + + ts.tv_sec = sec; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + target_ns = timespec64_to_ns(&ts); ++#else ++ target_ns = timespec_to_ns(&ts); 
++#endif + + return find_target_cycles(mdev, target_ns); + } +@@ -476,13 +614,20 @@ static u64 perout_conf_real_time(s64 sec + static int perout_conf_1pps(struct mlx5_core_dev *mdev, struct ptp_clock_request *rq, + u64 *time_stamp, bool real_time) + { +- struct timespec64 ts; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++ struct timespec64 ts; ++#else ++ struct timespec ts; ++#endif + s64 ns; + + ts.tv_nsec = rq->perout.period.nsec; + ts.tv_sec = rq->perout.period.sec; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + ns = timespec64_to_ns(&ts); +- ++#else ++ ns = timespec_to_ns(&ts); ++#endif + if ((ns >> 1) != 500000000LL) + return -EINVAL; + +@@ -499,20 +644,36 @@ static int mlx5_perout_conf_out_pulse_du + { + struct mlx5_pps *pps_info = &mdev->clock.pps_info; + u32 out_pulse_duration; +- struct timespec64 ts; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++ struct timespec64 ts; ++#else ++ struct timespec ts; ++#endif + ++#ifdef HAVE_PTP_PEROUT_DUTY_CYCLE + if (rq->perout.flags & PTP_PEROUT_DUTY_CYCLE) { + ts.tv_sec = rq->perout.on.sec; + ts.tv_nsec = rq->perout.on.nsec; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + out_pulse_duration = (u32)timespec64_to_ns(&ts); ++#else ++ out_pulse_duration = (u32)timespec_to_ns(&ts); ++#endif + } else { ++#endif + /* out_pulse_duration_ns should be up to 50% of the + * pulse period as default + */ + ts.tv_sec = rq->perout.period.sec; + ts.tv_nsec = rq->perout.period.nsec; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + out_pulse_duration = (u32)timespec64_to_ns(&ts) >> 1; ++#else ++ out_pulse_duration = (u32)timespec_to_ns(&ts) >> 1; ++#endif ++#ifdef HAVE_PTP_PEROUT_DUTY_CYCLE + } ++#endif + + if (out_pulse_duration < pps_info->min_out_pulse_duration_ns || + out_pulse_duration > MLX5_MAX_PULSE_DURATION) { +@@ -532,11 +693,19 @@ static int perout_conf_npps_real_time(st + { + struct mlx5_pps *pps_info = &mdev->clock.pps_info; + struct ptp_clock_time *time = &rq->perout.start; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + struct timespec64 ts; ++#else ++ struct timespec ts; ++#endif + + ts.tv_sec = rq->perout.period.sec; + ts.tv_nsec = rq->perout.period.nsec; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + if (timespec64_to_ns(&ts) < pps_info->min_npps_period) { ++#else ++ if (timespec_to_ns(&ts) < pps_info->min_npps_period) { ++#endif + mlx5_core_err(mdev, "NPPS period is lower than minimal npps period %llu\n", + pps_info->min_npps_period); + return -EINVAL; +@@ -556,7 +725,11 @@ static int perout_conf_npps_real_time(st + static bool mlx5_perout_verify_flags(struct mlx5_core_dev *mdev, unsigned int flags) + { + return ((!mlx5_npps_real_time_supported(mdev) && flags) || ++#ifdef HAVE_PTP_PEROUT_DUTY_CYCLE + (mlx5_npps_real_time_supported(mdev) && flags & ~PTP_PEROUT_DUTY_CYCLE)); ++#else ++ (mlx5_npps_real_time_supported(mdev) && flags)); ++#endif + } + + static int mlx5_perout_configure(struct ptp_clock_info *ptp, +@@ -589,7 +762,12 @@ static int mlx5_perout_configure(struct + return -EINVAL; + + field_select = MLX5_MTPPS_FS_ENABLE; ++#ifdef HAVE_PTP_FIND_PIN_UNLOCK + pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index); ++#else ++ pin = mlx5_ptp_find_pin(clock, PTP_PF_PEROUT, rq->perout.index, on); ++#endif ++ + if (pin < 0) + return -EBUSY; + +@@ -699,8 +877,18 @@ static const struct ptp_clock_info mlx5_ + .pps = 0, + .adjfreq = mlx5_ptp_adjfreq, + .adjtime = mlx5_ptp_adjtime, ++#ifdef HAVE_GETTIMEX64 + .gettimex64 = mlx5_ptp_gettimex, + .settime64 = mlx5_ptp_settime, ++#else /*HAVE_GETTIMEX64*/ ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++ 
.gettime64 = mlx5_ptp_gettime, ++ .settime64 = mlx5_ptp_settime, ++#else ++ .gettime = mlx5_ptp_gettime, ++ .settime = mlx5_ptp_settime, ++#endif ++#endif /*HAVE_GETTIMEX64*/ + .enable = NULL, + .verify = NULL, + }; +@@ -796,7 +984,11 @@ static void mlx5_get_pps_caps(struct mlx + clock->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); + } + ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + static void ts_next_sec(struct timespec64 *ts) ++#else ++static void ts_next_sec(struct timespec *ts) ++#endif + { + ts->tv_sec += 1; + ts->tv_nsec = 0; +@@ -805,12 +997,24 @@ static void ts_next_sec(struct timespec6 + static u64 perout_conf_next_event_timer(struct mlx5_core_dev *mdev, + struct mlx5_clock *clock) + { +- struct timespec64 ts; ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT ++ struct timespec64 ts; ++#else ++ struct timespec ts; ++#endif + s64 target_ns; + ++#ifdef HAVE_GETTIMEX64 + mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); ++#else ++ mlx5_ptp_gettime(&clock->ptp_info, &ts); ++#endif + ts_next_sec(&ts); ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + target_ns = timespec64_to_ns(&ts); ++#else ++ target_ns = timespec_to_ns(&ts); ++#endif + + return find_target_cycles(mdev, target_ns); + } +@@ -825,21 +1029,36 @@ static int mlx5_pps_event(struct notifie + struct mlx5_core_dev *mdev; + unsigned long flags; + u64 ns; +- ++#ifdef HAVE_KTIME_UNION_TV64 ++ ktime_t ktime; ++#endif + mdev = container_of(clock, struct mlx5_core_dev, clock); + + switch (clock->ptp_info.pin_config[pin].func) { + case PTP_PF_EXTTS: + ptp_event.index = pin; ++#ifdef HAVE_KTIME_UNION_TV64 ++ ktime = mlx5_real_time_mode(mdev) ? ++ mlx5_real_time_cyc2time(clock, ++ be64_to_cpu(eqe->data.pps.time_stamp)) : ++ mlx5_timecounter_cyc2time(clock, ++ be64_to_cpu(eqe->data.pps.time_stamp)); ++ ptp_event.timestamp = ktime.tv64; ++#else + ptp_event.timestamp = mlx5_real_time_mode(mdev) ? 
+ mlx5_real_time_cyc2time(clock, + be64_to_cpu(eqe->data.pps.time_stamp)) : + mlx5_timecounter_cyc2time(clock, + be64_to_cpu(eqe->data.pps.time_stamp)); ++#endif + if (clock->pps_info.enabled) { + ptp_event.type = PTP_CLOCK_PPSUSR; + ptp_event.pps_times.ts_real = ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + ns_to_timespec64(ptp_event.timestamp); ++#else ++ ns_to_timespec(ptp_event.timestamp); ++#endif + } else { + ptp_event.type = PTP_CLOCK_EXTTS; + } +@@ -860,6 +1079,7 @@ static int mlx5_pps_event(struct notifie + + return NOTIFY_OK; + } ++#endif /* HAVE_PTP_CLOCK_INFO && (CONFIG_PTP_1588_CLOCK || CONFIG_PTP_1588_CLOCK_MODULE) */ + + static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) + { +@@ -897,7 +1117,6 @@ static void mlx5_init_overflow_period(st + */ + overflow_cycles = div64_u64(~0ULL >> 1, timer->cycles.mult); + overflow_cycles = min(overflow_cycles, div_u64(timer->cycles.mask, 3)); +- + ns = cyclecounter_cyc2ns(&timer->cycles, overflow_cycles, + frac, &frac); + do_div(ns, NSEC_PER_SEC / HZ); +@@ -947,9 +1166,15 @@ static void mlx5_init_timer_clock(struct + clock->ptp_info = mlx5_ptp_clock_info; + + if (mlx5_real_time_mode(mdev)) { ++#ifndef HAVE_PTP_CLOCK_INFO_GETTIME_32BIT + struct timespec64 ts; + + ktime_get_real_ts64(&ts); ++#else ++ struct timespec ts; ++ ++ ktime_get_real_ts(&ts); ++#endif + mlx5_ptp_settime(&clock->ptp_info, &ts); + } + } +@@ -976,8 +1201,11 @@ void mlx5_init_clock(struct mlx5_core_de + + seqlock_init(&clock->lock); + mlx5_init_timer_clock(mdev); ++#if defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE) + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); ++#endif + ++#if (defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE)) + /* Configure the PHC */ + clock->ptp_info = mlx5_ptp_clock_info; + +@@ -991,9 +1219,11 @@ void mlx5_init_clock(struct mlx5_core_de + PTR_ERR(clock->ptp)); + clock->ptp = NULL; + } +- ++#endif ++#if defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE) + MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); + mlx5_eq_notifier_register(mdev, &clock->pps_nb); ++#endif + } + + void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) +@@ -1004,12 +1234,15 @@ void mlx5_cleanup_clock(struct mlx5_core + return; + + mlx5_eq_notifier_unregister(mdev, &clock->pps_nb); ++#if (defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE)) + if (clock->ptp) { + ptp_clock_unregister(clock->ptp); + clock->ptp = NULL; + } +- ++#endif ++#if defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE) + cancel_work_sync(&clock->pps_info.out_work); ++#endif + cancel_delayed_work_sync(&clock->timer.overflow_work); + + if (mdev->clock_info) { +@@ -1017,5 +1250,7 @@ void mlx5_cleanup_clock(struct mlx5_core + mdev->clock_info = NULL; + } + ++#if defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE) + kfree(clock->ptp_info.pin_config); ++#endif + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0185-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0185-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..17fddf9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0185-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,40 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h + +Change-Id: Ib9550d92a934f5f230e0f622ecaba561eab58b6e +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h | 
11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h +@@ -68,12 +68,14 @@ static inline ktime_t mlx5_timecounter_c + struct mlx5_timer *timer = &clock->timer; + unsigned int seq; + u64 nsec; +- ++#if (defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE)) + do { + seq = read_seqbegin(&clock->lock); + nsec = timecounter_cyc2time(&timer->tc, timestamp); + } while (read_seqretry(&clock->lock, seq)); +- ++#else ++ nsec = 0; ++#endif + return ns_to_ktime(nsec); + } + +@@ -82,8 +84,11 @@ static inline ktime_t mlx5_timecounter_c + static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, + u64 timestamp) + { ++#if (defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE)) + u64 time = REAL_TIME_TO_NS(timestamp >> 32, timestamp & 0xFFFFFFFF); +- ++#else ++ u64 time = 0; ++#endif + return ns_to_ktime(time); + } + #else diff --git a/src/mlnx-ofa_kernel-5.8/backports/0186-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0186-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..6721dfd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0186-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,32 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h + +Change-Id: I6878e687fb210384244f0c9816673ca03c503c42 +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h +@@ -84,7 +84,11 @@ int mlx5_eq_add_cq(struct mlx5_eq *eq, s + void mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq); + struct mlx5_eq_comp *mlx5_eqn2comp_eq(struct mlx5_core_dev *dev, int eqn); + struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev); ++#ifdef HAVE_TASKLET_SETUP + void mlx5_cq_tasklet_cb(struct tasklet_struct *t); ++#else ++void mlx5_cq_tasklet_cb(unsigned long data); ++#endif + struct cpumask *mlx5_eq_comp_cpumask(struct mlx5_core_dev *dev, int ix); + + u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq); +@@ -98,6 +102,9 @@ void mlx5_debug_eq_remove(struct mlx5_co + void mlx5_eq_debugfs_init(struct mlx5_core_dev *dev); + void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); + ++#ifndef HAVE_PCI_IRQ_API ++u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); ++#endif + /* This function should only be called after mlx5_cmd_force_teardown_hca */ + void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev); + #ifdef CONFIG_RFS_ACCEL diff --git a/src/mlnx-ofa_kernel-5.8/backports/0187-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0187-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..23efbf4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0187-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,20 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h + +Change-Id: Iae3799a85e502166d45aff0e3d16c9c9c2a8c2db +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h +@@ 
-18,7 +18,7 @@ int mlx5_geneve_tlv_option_add(struct ml + void mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve); + + #else /* CONFIG_MLX5_ESWITCH */ +- ++struct geneve_opt; + static inline struct mlx5_geneve + *mlx5_geneve_create(struct mlx5_core_dev *mdev) { return NULL; } + static inline void diff --git a/src/mlnx-ofa_kernel-5.8/backports/0188-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0188-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..5629953 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0188-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,71 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c + +Change-Id: I37abfa7b94db542b09fe81c1c2836eedc5af4013 +--- + .../net/ethernet/mellanox/mlx5/core/lib/gid.c | 30 +++++++++++++++++++ + 1 file changed, 30 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c +@@ -45,9 +45,25 @@ void mlx5_init_reserved_gids(struct mlx5 + dev->roce.reserved_gids.count = 0; + } + ++#if !defined(HAVE_IDA_IS_EMPTY) && !defined(HAVE_IDR_IS_EMPTY) ++static int idr_has_entry(int id, void *p, void *data) ++{ ++ return 1; ++} ++ ++bool idr_is_empty(struct idr *idp) ++{ ++ return !idr_for_each(idp, idr_has_entry, NULL); ++} ++#endif ++ + void mlx5_cleanup_reserved_gids(struct mlx5_core_dev *dev) + { ++#ifdef HAVE_IDA_IS_EMPTY + WARN_ON(!ida_is_empty(&dev->roce.reserved_gids.ida)); ++#else ++ WARN_ON(!idr_is_empty(&dev->roce.reserved_gids.ida.idr)); ++#endif + dev->roce.reserved_gids.start = 0; + dev->roce.reserved_gids.count = 0; + ida_destroy(&dev->roce.reserved_gids.ida); +@@ -88,12 +104,22 @@ void mlx5_core_unreserve_gids(struct mlx + int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index) + { + int end = dev->roce.reserved_gids.start + ++#ifdef HAVE_IDA_ALLOC_RANGE + dev->roce.reserved_gids.count - 1; ++#else ++ dev->roce.reserved_gids.count; ++#endif + int index = 0; + ++#ifdef HAVE_IDA_ALLOC_RANGE + index = ida_alloc_range(&dev->roce.reserved_gids.ida, + dev->roce.reserved_gids.start, end, + GFP_KERNEL); ++#else ++ index = ida_simple_get(&dev->roce.reserved_gids.ida, ++ dev->roce.reserved_gids.start, end, ++ GFP_KERNEL); ++#endif + if (index < 0) + return index; + +@@ -105,7 +131,11 @@ int mlx5_core_reserved_gid_alloc(struct + void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index) + { + mlx5_core_dbg(dev, "Freeing reserved GID %u\n", gid_index); ++#ifdef HAVE_IDA_FREE + ida_free(&dev->roce.reserved_gids.ida, gid_index); ++#else ++ ida_simple_remove(&dev->roce.reserved_gids.ida, gid_index); ++#endif + } + + unsigned int mlx5_core_reserved_gids_count(struct mlx5_core_dev *dev) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0189-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0189-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..5af836d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0189-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,26 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c + +Change-Id: I90167df82f82e34936f5a7402429cc60650daa49 +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- 
a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c +@@ -136,6 +136,7 @@ int mlx5_mpfs_add_mac(struct mlx5_core_d + struct l2table_node *l2addr; + int err = 0; + u32 index; ++ COMPAT_HL_NODE + + if (!mpfs) + return 0; +@@ -184,6 +185,7 @@ int mlx5_mpfs_del_mac(struct mlx5_core_d + struct l2table_node *l2addr; + int err = 0; + u32 index; ++ COMPAT_HL_NODE + + if (!mpfs) + return 0; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0190-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0190-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..b394fd8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0190-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,27 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h + +Change-Id: I23341370a334700bf10253dbaf526d57871c6d1a +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h +@@ -47,14 +47,14 @@ struct l2addr_node { + + #define for_each_l2hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ +- hlist_for_each_entry_safe(hn, tmp, &(hash)[i], hlist) ++ compat_hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist) + + #define l2addr_hash_find(hash, mac, type) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + bool found = false; \ + type *ptr = NULL; \ + \ +- hlist_for_each_entry(ptr, &(hash)[ix], node.hlist) \ ++ compat_hlist_for_each_entry(ptr, &hash[ix], node.hlist) \ + if (ether_addr_equal(ptr->node.addr, mac)) {\ + found = true; \ + break; \ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0191-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0191-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..81c7f8f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0191-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,33 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c + +Change-Id: Ie87f3226801a2b1bd3e55e1ca8e1248167befde4 +--- + .../net/ethernet/mellanox/mlx5/core/lib/tout.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c +@@ -51,6 +51,22 @@ void mlx5_tout_cleanup(struct mlx5_core_ + kfree(dev->timeouts); + } + ++#ifndef HAVE_INT_POW ++static u64 int_pow(u64 base, unsigned int exp) ++{ ++ u64 result = 1; ++ ++ while (exp) { ++ if (exp & 1) ++ result *= base; ++ exp >>= 1; ++ base *= base; ++ } ++ ++ return result; ++} ++#endif ++ + /* Time register consists of two fields to_multiplier(time out multiplier) + * and to_value(time out value). to_value is the quantity of the time units and + * to_multiplier is the type and should be one off these four values. 
diff --git a/src/mlnx-ofa_kernel-5.8/backports/0192-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch b/src/mlnx-ofa_kernel-5.8/backports/0192-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch new file mode 100644 index 0000000..f779352 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0192-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lib.patch @@ -0,0 +1,199 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c + +Change-Id: I93396aa797408f332eaa024b7d792893740a58ae +--- + .../ethernet/mellanox/mlx5/core/lib/vxlan.c | 98 +++++++++++++++++-- + 1 file changed, 91 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c +@@ -42,11 +42,17 @@ struct mlx5_vxlan { + struct mlx5_core_dev *mdev; + /* max_num_ports is usually 4, 16 buckets is more than enough */ + DECLARE_HASHTABLE(htable, 4); ++#ifndef HAVE_UDP_TUNNEL_NIC_INFO ++ int num_ports; ++#endif + struct mutex sync_lock; /* sync add/del port HW operations */ + }; + + struct mlx5_vxlan_port { + struct hlist_node hlist; ++#ifndef HAVE_UDP_TUNNEL_NIC_INFO ++ refcount_t refcount; ++#endif + u16 udp_port; + }; + +@@ -74,12 +80,13 @@ bool mlx5_vxlan_lookup_port(struct mlx5_ + { + struct mlx5_vxlan_port *vxlanp; + bool found = false; ++ COMPAT_HL_NODE + + if (!mlx5_vxlan_allowed(vxlan)) + return NULL; + + rcu_read_lock(); +- hash_for_each_possible_rcu(vxlan->htable, vxlanp, hlist, port) ++ compat_hash_for_each_possible_rcu(vxlan->htable, vxlanp, hlist, port) + if (vxlanp->udp_port == port) { + found = true; + break; +@@ -92,8 +99,9 @@ bool mlx5_vxlan_lookup_port(struct mlx5_ + static struct mlx5_vxlan_port *vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) + { + struct mlx5_vxlan_port *vxlanp; ++ COMPAT_HL_NODE + +- hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) ++ compat_hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) + if (vxlanp->udp_port == port) + return vxlanp; + return NULL; +@@ -102,24 +110,62 @@ static struct mlx5_vxlan_port *vxlan_loo + int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) + { + struct mlx5_vxlan_port *vxlanp; +- int ret; +- ++ int ret = 0; ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL); + if (!vxlanp) + return -ENOMEM; + vxlanp->udp_port = port; +- ++#else ++ mutex_lock(&vxlan->sync_lock); ++ vxlanp = vxlan_lookup_port(vxlan, port); ++ if (vxlanp) { ++ refcount_inc(&vxlanp->refcount); ++ goto unlock; ++ } ++ if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) { ++ mlx5_core_info(vxlan->mdev, ++ "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n", ++ port, mlx5_vxlan_max_udp_ports(vxlan->mdev)); ++ ret = -ENOSPC; ++ goto unlock; ++ } ++#endif + ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port); ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + if (ret) { + kfree(vxlanp); + return ret; + } + + mutex_lock(&vxlan->sync_lock); ++#else ++ if (ret) ++ goto unlock; ++ ++ vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL); ++ if (!vxlanp) { ++ ret = -ENOMEM; ++ goto err_delete_port; ++ } ++ vxlanp->udp_port = port; ++ refcount_set(&vxlanp->refcount, 1); ++#endif + hash_add_rcu(vxlan->htable, &vxlanp->hlist, port); ++#ifndef HAVE_UDP_TUNNEL_NIC_INFO ++ vxlan->num_ports++; ++#endif + mutex_unlock(&vxlan->sync_lock); +- + return 0; ++ ++#ifndef HAVE_UDP_TUNNEL_NIC_INFO ++err_delete_port: ++ mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); ++ ++unlock: ++ 
mutex_unlock(&vxlan->sync_lock); ++ return ret; ++#endif + } + + int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) +@@ -130,15 +176,29 @@ int mlx5_vxlan_del_port(struct mlx5_vxla + mutex_lock(&vxlan->sync_lock); + + vxlanp = vxlan_lookup_port(vxlan, port); ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + if (WARN_ON(!vxlanp)) { ++#else ++ if (!vxlanp) { ++#endif + ret = -ENOENT; + goto out_unlock; + } + ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + hash_del_rcu(&vxlanp->hlist); + synchronize_rcu(); + mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); + kfree(vxlanp); ++#else ++ if (refcount_dec_and_test(&vxlanp->refcount)) { ++ hash_del_rcu(&vxlanp->hlist); ++ synchronize_rcu(); ++ mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); ++ kfree(vxlanp); ++ vxlan->num_ports--; ++ } ++#endif + + out_unlock: + mutex_unlock(&vxlan->sync_lock); +@@ -166,6 +226,7 @@ struct mlx5_vxlan *mlx5_vxlan_create(str + return vxlan; + } + ++#ifdef HAVE_DEVLINK_HAS_RELOAD_UP_DOWN + void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) + { + if (!mlx5_vxlan_allowed(vxlan)) +@@ -182,11 +243,12 @@ void mlx5_vxlan_reset_to_default(struct + struct mlx5_vxlan_port *vxlanp; + struct hlist_node *tmp; + int bkt; ++ COMPAT_HL_NODE + + if (!mlx5_vxlan_allowed(vxlan)) + return; + +- hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { ++ compat_hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { + /* Don't delete default UDP port added by the HW. + * Remove only user configured ports + */ +@@ -195,3 +257,25 @@ void mlx5_vxlan_reset_to_default(struct + mlx5_vxlan_del_port(vxlan, vxlanp->udp_port); + } + } ++ ++#else ++void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) ++{ ++ struct mlx5_vxlan_port *vxlanp; ++ struct hlist_node *tmp; ++ int bkt; ++ COMPAT_HL_NODE ++ ++ if (!mlx5_vxlan_allowed(vxlan)) ++ return; ++ ++ /* Lockless since we are the only hash table consumers*/ ++ compat_hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { ++ hash_del(&vxlanp->hlist); ++ mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port); ++ kfree(vxlanp); ++ } ++ ++ kfree(vxlan); ++} ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0193-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mai.patch b/src/mlnx-ofa_kernel-5.8/backports/0193-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mai.patch new file mode 100644 index 0000000..e859947 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0193-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mai.patch @@ -0,0 +1,539 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/main.c + +Change-Id: I8b6650ce5a2a5ce78b326d66ae0217c356d6943d +--- + .../net/ethernet/mellanox/mlx5/core/main.c | 187 ++++++++++++++++-- + 1 file changed, 170 insertions(+), 17 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c +@@ -95,6 +95,10 @@ static unsigned int prof_sel = MLX5_DEFA + module_param_named(prof_sel, prof_sel, uint, 0444); + MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 3"); + ++static bool probe_vf = 1; ++module_param_named(probe_vf, probe_vf, bool, 0644); ++MODULE_PARM_DESC(probe_vf, "probe VFs or not, 0 = not probe, 1 = probe. 
Default = 1"); ++ + MODULE_ALIAS("auxiliary:mlx5_core.eth"); + MODULE_ALIAS("auxiliary:mlx5_core.eth-rep"); + +@@ -618,6 +622,7 @@ static int handle_hca_cap_odp(struct mlx + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ODP); + } + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH + static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev) + { + struct devlink *devlink = priv_to_devlink(dev); +@@ -632,6 +637,7 @@ static int max_uc_list_get_devlink_param + mlx5_core_dbg(dev, "Failed to get param. err = %d\n", err); + return err; + } ++#endif + + int mlx5_core_other_function_set_caps(struct mlx5_core_dev *dev, + const void *hca_cap_on_behalf, +@@ -658,6 +664,7 @@ int mlx5_core_other_function_set_caps(st + + bool mlx5_is_roce_on(struct mlx5_core_dev *dev) + { ++#if defined(HAVE_DEVLINK_PARAM) && defined(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE) + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + int err; +@@ -671,6 +678,9 @@ bool mlx5_is_roce_on(struct mlx5_core_de + + mlx5_core_dbg(dev, "Failed to get param. err = %d\n", err); + return MLX5_CAP_GEN(dev, roce); ++#else ++ return MLX5_CAP_GEN(dev, roce) && dev->roce.enabled; ++#endif + } + EXPORT_SYMBOL(mlx5_is_roce_on); + +@@ -678,7 +688,9 @@ static int handle_hca_cap(struct mlx5_co + { + struct mlx5_profile *prof = &dev->profile; + void *set_hca_cap; ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH + int max_uc_list; ++#endif + int err; + + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); +@@ -750,17 +762,23 @@ static int handle_hca_cap(struct mlx5_co + + mlx5_vhca_state_cap_handle(dev, set_hca_cap); + ++#ifdef HAVE_SRIOV_GET_SET_MSIX_VEC_COUNT + if (MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix)) + MLX5_SET(cmd_hca_cap, set_hca_cap, num_total_dynamic_vf_msix, + MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix)); ++#endif + ++#if defined(HAVE_DEVLINK_PARAM) && defined(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE) + if (MLX5_CAP_GEN(dev, roce_rw_supported)) + MLX5_SET(cmd_hca_cap, set_hca_cap, roce, mlx5_is_roce_on(dev)); ++#endif + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH + max_uc_list = max_uc_list_get_devlink_param(dev); + if (max_uc_list > 0) + MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_current_uc_list, + ilog2(max_uc_list)); ++#endif + + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); + } +@@ -781,8 +799,12 @@ static int handle_hca_cap(struct mlx5_co + */ + static bool is_roce_fw_disabled(struct mlx5_core_dev *dev) + { ++#if defined(HAVE_DEVLINK_PARAM) && defined(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE) + return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_on(dev)) || + (!MLX5_CAP_GEN(dev, roce_rw_supported) && !MLX5_CAP_GEN(dev, roce)); ++#else ++ return !MLX5_CAP_GEN(dev, roce); ++#endif + } + + static int handle_hca_cap_roce(struct mlx5_core_dev *dev, void *set_ctx) +@@ -1015,6 +1037,9 @@ static ssize_t mlx5_roce_enable_show_ena + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int ret; + ++#if defined(HAVE_DEVLINK_PARAM) && defined(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE) ++ return -EOPNOTSUPP; ++#endif + mutex_lock(&dev->roce.state_lock); + ret = dev->roce.enabled; + mutex_unlock(&dev->roce.state_lock); +@@ -1028,11 +1053,15 @@ static ssize_t mlx5_roce_enable_set_enab + { + struct pci_dev *pdev = container_of(device, struct pci_dev, dev); + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); +- struct devlink *devlink = priv_to_devlink(dev); +- union devlink_param_value value; ++#if !defined(HAVE_DEVLINK_HAS_RELOAD) && 
!defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++ bool change; ++#endif + int ret; + bool val; + ++#if defined(HAVE_DEVLINK_PARAM) && defined(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE) ++ return -EOPNOTSUPP; ++#endif + ret = kstrtobool(buf, &val); + if (ret) + return -EINVAL; +@@ -1040,15 +1069,26 @@ static ssize_t mlx5_roce_enable_set_enab + if (val && !MLX5_CAP_GEN(dev, roce)) + return -EOPNOTSUPP; + ++ if (mlx5_core_is_mp_slave(dev) || mlx5_lag_is_active(dev)) ++ return -EOPNOTSUPP; ++ + mutex_lock(&dev->roce.state_lock); ++#if !defined(HAVE_DEVLINK_HAS_RELOAD) && !defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++ change = dev->roce.enabled != val; ++#endif + dev->roce.enabled = val; +- value.vbool = val; +- devlink_param_driverinit_value_set(devlink, +- DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, +- value); + mutex_unlock(&dev->roce.state_lock); ++#if !defined(HAVE_DEVLINK_HAS_RELOAD) && !defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++ if (mlx5_sf_dev_allocated(dev)) ++ return -EOPNOTSUPP; ++ ++ if (!change) ++ return count; + +- return count; ++ mlx5_unload_one(dev); ++ ret = mlx5_load_one(dev, false); ++#endif ++ return (ret != 0 ? ret : count); + } + + static DEVICE_ATTR(roce_enable, 0644, mlx5_roce_enable_show_enabled, +@@ -1515,12 +1555,13 @@ static int mlx5_load(struct mlx5_core_de + + mlx5_accel_ipsec_init(dev); + ++#ifdef HAVE_UAPI_LINUX_TLS_H + err = mlx5_accel_tls_init(dev); + if (err) { + mlx5_core_err(dev, "TLS device start failed %d\n", err); + goto err_tls_start; + } +- ++#endif + err = mlx5_init_fs(dev); + if (err) { + mlx5_core_err(dev, "Failed to init flow steering\n"); +@@ -1575,8 +1616,10 @@ err_vhca: + err_set_hca: + mlx5_cleanup_fs(dev); + err_fs: ++#ifdef HAVE_UAPI_LINUX_TLS_H + mlx5_accel_tls_cleanup(dev); + err_tls_start: ++#endif + mlx5_accel_ipsec_cleanup(dev); + mlx5_fpga_device_stop(dev); + err_fpga_start: +@@ -1607,7 +1650,9 @@ static void mlx5_unload(struct mlx5_core + mlx5_vhca_event_stop(dev); + mlx5_cleanup_fs(dev); + mlx5_accel_ipsec_cleanup(dev); ++#ifdef HAVE_UAPI_LINUX_TLS_H + mlx5_accel_tls_cleanup(dev); ++#endif + mlx5_fpga_device_stop(dev); + mlx5_rsc_dump_cleanup(dev); + mlx5_hv_vhca_cleanup(dev->hv_vhca); +@@ -1622,8 +1667,14 @@ static void mlx5_unload(struct mlx5_core + + int mlx5_init_one(struct mlx5_core_dev *dev) + { ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ struct devlink *devlink = priv_to_devlink(dev); ++#endif + int err = 0; + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_lock(devlink); ++#endif + mutex_lock(&dev->intf_state_mutex); + dev->state = MLX5_DEVICE_STATE_UP; + +@@ -1648,7 +1699,11 @@ int mlx5_init_one(struct mlx5_core_dev * + mlx5_devm_params_publish(dev); + set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + err = mlx5_devlink_register(priv_to_devlink(dev)); ++#else ++ err = mlx5_devlink_register(priv_to_devlink(dev), dev->device); ++#endif + if (err) + goto err_devlink_reg; + +@@ -1657,6 +1712,9 @@ int mlx5_init_one(struct mlx5_core_dev * + goto err_register; + + mutex_unlock(&dev->intf_state_mutex); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + return 0; + + err_register: +@@ -1671,11 +1729,19 @@ function_teardown: + err_function: + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + mutex_unlock(&dev->intf_state_mutex); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + return err; + } + + void mlx5_uninit_one(struct mlx5_core_dev *dev) + { ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER 
//forward port ++ struct devlink *devlink = priv_to_devlink(dev); ++ ++ devl_lock(devlink); ++#endif + mutex_lock(&dev->intf_state_mutex); + + mlx5_unregister_device(dev); +@@ -1694,12 +1760,22 @@ void mlx5_uninit_one(struct mlx5_core_de + mlx5_function_teardown(dev, true); + out: + mutex_unlock(&dev->intf_state_mutex); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + } + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery) ++#else + int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery) ++#endif + { + int err = 0; + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_assert_locked(priv_to_devlink(dev)); ++#endif + mutex_lock(&dev->intf_state_mutex); + if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { + mlx5_core_warn(dev, "interface is up, NOP\n"); +@@ -1743,6 +1819,19 @@ out: + return err; + } + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery) ++{ ++ struct devlink *devlink = priv_to_devlink(dev); ++ int ret; ++ ++ devl_lock(devlink); ++ ret = mlx5_load_one_devl_locked(dev, recovery); ++ devl_unlock(devlink); ++ return ret; ++} ++#endif ++ + static int mlx5_try_fast_unload(struct mlx5_core_dev *dev) + { + bool fast_teardown = false, force_teardown = false; +@@ -1797,8 +1886,15 @@ succeed: + return 0; + } + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++void mlx5_unload_one_devl_locked(struct mlx5_core_dev *dev) ++#else + void mlx5_unload_one(struct mlx5_core_dev *dev) ++#endif + { ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_assert_locked(priv_to_devlink(dev)); ++#endif + mutex_lock(&dev->intf_state_mutex); + + mlx5_detach_device(dev); +@@ -1816,6 +1912,17 @@ out: + mutex_unlock(&dev->intf_state_mutex); + } + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++void mlx5_unload_one(struct mlx5_core_dev *dev) ++{ ++ struct devlink *devlink = priv_to_devlink(dev); ++ ++ devl_lock(devlink); ++ mlx5_unload_one_devl_locked(dev); ++ devl_unlock(devlink); ++} ++#endif ++ + static const int types[] = { + MLX5_CAP_GENERAL, + MLX5_CAP_GENERAL_2, +@@ -1882,9 +1989,13 @@ int mlx5_mdev_init(struct mlx5_core_dev + memcpy(&dev->profile, &profile[profile_idx], sizeof(dev->profile)); + INIT_LIST_HEAD(&priv->ctx_list); + spin_lock_init(&priv->ctx_lock); ++#ifdef HAVE_LOCKDEP_UNREGISTER_KEY + lockdep_register_key(&dev->lock_key); ++#endif + mutex_init(&dev->intf_state_mutex); ++#ifdef HAVE_LOCKDEP_UNREGISTER_KEY + lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key); ++#endif + + mutex_init(&priv->bfregs.reg_head.lock); + mutex_init(&priv->bfregs.wc_head.lock); +@@ -1939,7 +2050,9 @@ err_timeout_init: + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); ++#ifdef HAVE_LOCKDEP_UNREGISTER_KEY + lockdep_unregister_key(&dev->lock_key); ++#endif + return err; + } + +@@ -1958,13 +2071,16 @@ void mlx5_mdev_uninit(struct mlx5_core_d + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); ++#ifdef HAVE_LOCKDEP_UNREGISTER_KEY + lockdep_unregister_key(&dev->lock_key); ++#endif + } + + static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) + { +- struct mlx5_core_dev *dev; ++ struct mlx5_core_dev *dev = NULL; + struct devlink *devlink; ++ struct mlx5_priv *priv; + int err; + + devlink = 
mlx5_devlink_alloc(&pdev->dev); +@@ -1978,14 +2094,21 @@ static int probe_one(struct pci_dev *pde + goto remove_roce_file; + + dev = devlink_priv(devlink); ++ priv = &dev->priv; + dev->device = &pdev->dev; + dev->pdev = pdev; ++ priv->sriov.probe_vf = probe_vf; + + dev->coredev_type = id->driver_data & MLX5_PCI_DEV_IS_VF ? + MLX5_COREDEV_VF : MLX5_COREDEV_PF; + + pci_set_drvdata(dev->pdev, dev); + ++ if (pdev->is_virtfn && !probe_vf) { ++ dev_info(&pdev->dev, "Avoid probing VFs\n"); ++ return 0; ++ } ++ + dev->priv.adev_idx = mlx5_adev_idx_alloc(); + if (dev->priv.adev_idx < 0) { + err = dev->priv.adev_idx; +@@ -2022,7 +2145,13 @@ static int probe_one(struct pci_dev *pde + dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); + + pci_save_state(pdev); ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + devlink_register(devlink); ++#endif ++#if defined(HAVE_DEVLINK_RELOAD_ENABLE) && !defined(HAVE_DEVLINK_SET_FEATURES) ++ if (!mlx5_core_is_mp_slave(dev)) ++ devlink_reload_enable(devlink); ++#endif + return 0; + + err_init_one: +@@ -2043,8 +2172,16 @@ remove_roce_file: + + static void remove_one(struct pci_dev *pdev) + { +- struct mlx5_core_dev *dev = pci_get_drvdata(pdev); +- struct devlink *devlink = priv_to_devlink(dev); ++ struct mlx5_core_dev *dev; ++ struct devlink *devlink; ++ struct mlx5_priv *priv; ++ ++ dev = pci_get_drvdata(pdev); ++ devlink = priv_to_devlink(dev); ++ priv = &dev->priv; ++ ++ if (pdev->is_virtfn && !priv->sriov.probe_vf) ++ goto out; + + /* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain + * fw_reset before unregistering the devlink. +@@ -2054,7 +2191,12 @@ static void remove_one(struct pci_dev *p + if (mlx5_try_fast_unload(dev)) + dev_dbg(&dev->pdev->dev, "mlx5_try_fast_unload failed\n"); + ++#if defined(HAVE_DEVLINK_RELOAD_DISABLE) && !defined(HAVE_DEVLINK_SET_FEATURES) ++ devlink_reload_disable(devlink); ++#endif ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + devlink_unregister(devlink); ++#endif + mlx5_crdump_disable(dev); + mlx5_drain_health_wq(dev); + mlx5_uninit_one(dev); +@@ -2062,6 +2204,7 @@ static void remove_one(struct pci_dev *p + mlx5_pci_close(dev); + mlx5_mdev_uninit(dev); + mlx5_adev_idx_free(dev->priv.adev_idx); ++out: + device_remove_file(&pdev->dev, mlx5_roce_enable_dev_attrs); + mlx5_devlink_free(devlink); + } +@@ -2090,7 +2233,7 @@ static int suspend(struct device *device + + dev_info(&pdev->dev, "suspend was called\n"); + +- if (pdev->is_virtfn) ++ if (pdev->is_virtfn && !dev->priv.sriov.probe_vf) + return 0; + + mlx5_unload_one(dev); +@@ -2125,7 +2268,7 @@ static int resume(struct device *device) + + dev_info(&pdev->dev, "resume was called\n"); + +- if (pdev->is_virtfn) ++ if (pdev->is_virtfn && !dev->priv.sriov.probe_vf) + return 0; + + err = pci_set_power_state(pdev, PCI_D0); +@@ -2170,7 +2313,7 @@ static pci_ers_result_t mlx5_pci_err_det + + mlx5_pci_trace(dev, "Enter, pci channel state = %d\n", state); + +- if (pdev->is_virtfn) ++ if (pdev->is_virtfn && !dev->priv.sriov.probe_vf) + return PCI_ERS_RESULT_CAN_RECOVER; + + mlx5_enter_error_state(dev, false); +@@ -2223,7 +2366,7 @@ static pci_ers_result_t mlx5_pci_slot_re + + mlx5_core_info(dev, "%s was called\n", __func__); + +- if (pdev->is_virtfn) ++ if (pdev->is_virtfn && !dev->priv.sriov.probe_vf) + return PCI_ERS_RESULT_NEED_RESET; + + err = mlx5_pci_enable_device(dev); +@@ -2257,7 +2400,7 @@ static void mlx5_pci_resume(struct pci_d + + mlx5_pci_trace(dev, "Enter, loading driver..\n"); + +- if (pdev->is_virtfn) ++ if (pdev->is_virtfn && 
!dev->priv.sriov.probe_vf) + return; + + dev->priv.sw_reset_lag = dev->priv.lag_enabled; +@@ -2280,7 +2423,7 @@ static void shutdown(struct pci_dev *pde + + mlx5_core_info(dev, "Shutdown was called\n"); + +- if (pdev->is_virtfn) ++ if (pdev->is_virtfn && !dev->priv.sriov.probe_vf) + return; + + set_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state); +@@ -2344,7 +2487,11 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_t + void mlx5_disable_device(struct mlx5_core_dev *dev) + { + mlx5_error_sw_reset(dev); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ mlx5_unload_one_devl_locked(dev); ++#else + mlx5_unload_one(dev); ++#endif + } + + int mlx5_recover_device(struct mlx5_core_dev *dev) +@@ -2355,7 +2502,11 @@ int mlx5_recover_device(struct mlx5_core + return -EIO; + } + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ return mlx5_load_one_devl_locked(dev, true); ++#else + return mlx5_load_one(dev, true); ++#endif + } + + static struct pci_driver mlx5_core_driver = { +@@ -2373,8 +2524,10 @@ static struct pci_driver mlx5_core_drive + .shutdown = shutdown, + .err_handler = &mlx5_err_handler, + .sriov_configure = mlx5_core_sriov_configure, ++#ifdef HAVE_SRIOV_GET_SET_MSIX_VEC_COUNT + .sriov_get_vf_total_msix = mlx5_sriov_get_vf_total_msix, + .sriov_set_msix_vec_count = mlx5_core_sriov_set_msix_vec_count, ++#endif + }; + + static void mlx5_core_verify_params(void) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0195-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch b/src/mlnx-ofa_kernel-5.8/backports/0195-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch new file mode 100644 index 0000000..c98b636 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0195-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch @@ -0,0 +1,70 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.c + +Change-Id: I78abe0eb11fea9634d41ad7d28fcaa04505d0a53 +--- + .../ethernet/mellanox/mlx5/core/mlx5_devm.c | 23 +++++++++++++++---- + 1 file changed, 19 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.c +@@ -1,6 +1,5 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2021 Mellanox Technologies Ltd. 
*/ +- + #include + #include "mlx5_core.h" + #include "fs_core.h" +@@ -140,8 +139,11 @@ int mlx5_devm_sf_port_fn_state_get(struc + memset(&devport, 0, sizeof(devport)); + devport.devlink = devlink; + devport.index = port->index; +- ++#ifdef HAVE_PORT_FUNCTION_STATE_GET_4_PARAM + ret = mlx5_devlink_sf_port_fn_state_get(&devport, &dl_state, &dl_opstate, extack); ++#else ++ ret = mlx5_devlink_sf_port_fn_state_get(devlink, &devport, &dl_state, &dl_opstate, extack); ++#endif + if (!ret) { + *state = devlink_to_mlxdevm_state(dl_state); + *opstate = devlink_to_mlxdevm_opstate(dl_opstate); +@@ -162,7 +164,12 @@ int mlx5_devm_sf_port_fn_state_set(struc + devport.devlink = devlink; + devport.index = port->index; + dl_state = mlxdevm_to_devlink_state(state); ++#ifdef HAVE_PORT_FUNCTION_STATE_GET_4_PARAM + return mlx5_devlink_sf_port_fn_state_set(&devport, dl_state, extack); ++#else ++ return mlx5_devlink_sf_port_fn_state_set(devlink, &devport, dl_state, ++ extack); ++#endif + } + + int mlx5_devm_sf_port_fn_hw_addr_get(struct mlxdevm_port *port, +@@ -177,8 +184,12 @@ int mlx5_devm_sf_port_fn_hw_addr_get(str + devport.devlink = devlink; + devport.index = port->index; + +- return mlx5_devlink_port_function_hw_addr_get(&devport, hw_addr, +- hw_addr_len, extack); ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM ++ return mlx5_devlink_port_function_hw_addr_get(&devport, ++#else ++ return mlx5_devlink_port_function_hw_addr_get(devlink, &devport, ++#endif ++ hw_addr, hw_addr_len, extack); + } + + int mlx5_devm_sf_port_function_trust_get(struct mlxdevm_port *port, +@@ -206,7 +217,11 @@ int mlx5_devm_sf_port_fn_hw_addr_set(str + memset(&devport, 0, sizeof(devport)); + devport.devlink = devlink; + devport.index = port->index; ++#ifdef HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM + return mlx5_devlink_port_function_hw_addr_set(&devport, hw_addr, ++#else ++ return mlx5_devlink_port_function_hw_addr_set(devlink, &devport, hw_addr, ++#endif + hw_addr_len, extack); + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0197-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-por.patch b/src/mlnx-ofa_kernel-5.8/backports/0197-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-por.patch new file mode 100644 index 0000000..9414a67 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0197-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-por.patch @@ -0,0 +1,26 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/port.c + +Change-Id: I96593219c6a7afbae6d87b4bd2cce7437475c981 +--- + drivers/net/ethernet/mellanox/mlx5/core/port.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c +@@ -442,6 +442,7 @@ int mlx5_query_module_eeprom(struct mlx5 + } + EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom); + ++#ifdef HAVE_GET_MODULE_EEPROM_BY_PAGE + int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, + struct mlx5_module_eeprom_query_params *params, + u8 *data) +@@ -484,6 +485,7 @@ int mlx5_query_module_eeprom_by_page(str + return mlx5_query_mcia(dev, params, data); + } + EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom_by_page); ++#endif + + static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc, + int pvlc_size, u8 local_port) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0198-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch b/src/mlnx-ofa_kernel-5.8/backports/0198-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch new file mode 100644 index 0000000..c4a3826 --- 
/dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0198-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch @@ -0,0 +1,125 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/sriov.c + +Change-Id: I9cb7f2773cb0d44ab041fe1661c05d7e72a6bd85 +--- + .../net/ethernet/mellanox/mlx5/core/sriov.c | 32 +++++++++++++++++-- + 1 file changed, 30 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +@@ -72,7 +72,10 @@ static int sriov_restore_guids(struct ml + static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) + { + struct mlx5_core_sriov *sriov = &dev->priv.sriov; +- int err, vf, num_msix_count; ++#ifdef HAVE_SRIOV_GET_SET_MSIX_VEC_COUNT ++ int num_msix_count; ++#endif ++ int err, vf; + + if (!MLX5_ESWITCH_MANAGER(dev)) + goto enable_vfs_hca; +@@ -94,8 +97,9 @@ enable_vfs_hca: + #endif + return err; + } +- ++#ifdef HAVE_SRIOV_GET_SET_MSIX_VEC_COUNT + num_msix_count = mlx5_get_default_msix_vec_count(dev, num_vfs); ++#endif + for (vf = 0; vf < num_vfs; vf++) { + err = mlx5_core_enable_hca(dev, vf + 1); + if (err) { +@@ -103,6 +107,7 @@ enable_vfs_hca: + continue; + } + ++#ifdef HAVE_SRIOV_GET_SET_MSIX_VEC_COUNT + err = mlx5_set_msix_vec_count(dev, vf + 1, num_msix_count); + if (err) { + mlx5_core_warn(dev, +@@ -110,6 +115,7 @@ enable_vfs_hca: + vf, err); + continue; + } ++#endif + + sriov->vfs_ctx[vf].enabled = 1; + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) { +@@ -156,6 +162,9 @@ mlx5_device_disable_sriov(struct mlx5_co + static int mlx5_sriov_enable(struct pci_dev *pdev, int num_vfs) + { + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ struct devlink *devlink = priv_to_devlink(dev); ++#endif + int err; + + if (num_vfs && pci_num_vf(dev->pdev)) { +@@ -168,7 +177,13 @@ static int mlx5_sriov_enable(struct pci_ + return -EBUSY; + } + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_lock(devlink); ++#endif + err = mlx5_device_enable_sriov(dev, num_vfs); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + if (err) { + mlx5_core_warn(dev, "mlx5_device_enable_sriov failed : %d\n", err); + return err; +@@ -185,10 +200,19 @@ static int mlx5_sriov_enable(struct pci_ + static void mlx5_sriov_disable(struct pci_dev *pdev) + { + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ struct devlink *devlink = priv_to_devlink(dev); ++#endif + int num_vfs = pci_num_vf(dev->pdev); + + pci_disable_sriov(pdev); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_lock(devlink); ++#endif + mlx5_device_disable_sriov(dev, num_vfs, true); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_unlock(devlink); ++#endif + } + + int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) +@@ -209,6 +233,7 @@ int mlx5_core_sriov_configure(struct pci + return err ? 
err : num_vfs; + } + ++#ifdef HAVE_SRIOV_GET_SET_MSIX_VEC_COUNT + int mlx5_core_sriov_set_msix_vec_count(struct pci_dev *vf, int msix_vec_count) + { + struct pci_dev *pf = pci_physfn(vf); +@@ -236,6 +261,7 @@ int mlx5_core_sriov_set_msix_vec_count(s + + if (vf->devfn == pci_iov_virtfn_devfn(pf, id)) + break; ++ + } + + if (id == pci_num_vf(pf) || !sriov->vfs_ctx[id].enabled) +@@ -243,6 +269,7 @@ int mlx5_core_sriov_set_msix_vec_count(s + + return mlx5_set_msix_vec_count(dev, id + 1, msix_vec_count); + } ++#endif + + int mlx5_sriov_attach(struct mlx5_core_dev *dev) + { +@@ -296,6 +323,7 @@ int mlx5_sriov_init(struct mlx5_core_dev + return 0; + + total_vfs = pci_sriov_get_totalvfs(pdev); ++ + sriov->max_vfs = mlx5_get_max_vfs(dev); + sriov->num_vfs = pci_num_vf(pdev); + sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0199-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch b/src/mlnx-ofa_kernel-5.8/backports/0199-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch new file mode 100644 index 0000000..8683585 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0199-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch @@ -0,0 +1,35 @@ +From: Yevgeny Kliteynik +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c + +Change-Id: Iff8ba4ae191f1c44a80953dad5bb8b9c78bcf701 +--- + .../mellanox/mlx5/core/steering/dr_icm_pool.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c +@@ -3,6 +3,23 @@ + + #include "dr_types.h" + ++/* Define local implementation of kvfree to replace compat ++ * layer implementation, so that memtrack will see the calling ++ * function directly - otherwise it is hidden by compat's ++ * "backport_kvfree" function in the stack. ++ * Unfortunately, compat's backport_kvfree is defined even in ++ * some kernels that do have kvfree. 
++ */ ++#ifdef kvfree ++#undef kvfree ++#endif ++#define kvfree(p) { if (is_vmalloc_addr(p)) vfree(p); else kfree(p); } ++ ++#ifdef backport_kvfree ++#undef backport_kvfree ++#endif ++#define backport_kvfree kvfree ++ + #define DR_ICM_MODIFY_HDR_ALIGN_BASE 64 + #define DR_ICM_MODIFY_HDR_GRANULARITY_4K 12 + #define DR_ICM_POOL_HOT_MEMORY_FRACTION 4 diff --git a/src/mlnx-ofa_kernel-5.8/backports/0200-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch b/src/mlnx-ofa_kernel-5.8/backports/0200-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch new file mode 100644 index 0000000..d4da971 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0200-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ste.patch @@ -0,0 +1,23 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c + +Change-Id: Ibb7dc6d4c07efb98ddbbc7206bd2c325c0bcdfe4 +--- + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +@@ -360,7 +360,11 @@ static void dr_destroy_qp(struct mlx5_co + + static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl) + { ++#ifdef dma_wmb + dma_wmb(); ++#else ++ wmb(); ++#endif + *dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff); + + /* After wmb() the hw aware of new work */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0201-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-uar.patch b/src/mlnx-ofa_kernel-5.8/backports/0201-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-uar.patch new file mode 100644 index 0000000..8879731 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0201-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-uar.patch @@ -0,0 +1,24 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/uar.c + +Change-Id: Ib7d7c27094e7ab37a45ceef4a1660629f4038420 +--- + drivers/net/ethernet/mellanox/mlx5/core/uar.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c +@@ -358,9 +358,13 @@ static int mlx5_get_pcie_dev_link_caps(s + *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> + PCI_EXP_LNKCAP_MLW_SHIFT; + if (*speed == PCI_SPEED_UNKNOWN) { /* pre-r3.0 */ ++#ifdef PCI_EXP_LNKCAP_SLS_8_0GB + if (lnkcap1 & PCI_EXP_LNKCAP_SLS_8_0GB) + *speed = PCIE_SPEED_8_0GT; + else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB) ++#else ++ if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB) ++#endif + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0202-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-vpo.patch b/src/mlnx-ofa_kernel-5.8/backports/0202-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-vpo.patch new file mode 100644 index 0000000..e06a9d1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0202-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-vpo.patch @@ -0,0 +1,18 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/vport.c + +Change-Id: Ia9c049ce31fe7e19cfdab9acf84c03b43550aa80 +--- + drivers/net/ethernet/mellanox/mlx5/core/vport.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c +@@ -113,6 +113,7 @@ void mlx5_query_min_inline(struct mlx5_c + if 
(!mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode)) + break; + fallthrough; ++ + case MLX5_CAP_INLINE_MODE_L2: + *min_inline_mode = MLX5_INLINE_MODE_L2; + break; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0203-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-wq..patch b/src/mlnx-ofa_kernel-5.8/backports/0203-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-wq..patch new file mode 100644 index 0000000..e8cff64 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0203-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-wq..patch @@ -0,0 +1,22 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/wq.h + +Change-Id: I50f3e5639722d17186cf8a676760a165ce567b8c +--- + drivers/net/ethernet/mellanox/mlx5/core/wq.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h +@@ -238,7 +238,11 @@ static inline struct mlx5_cqe64 *mlx5_cq + return NULL; + + /* ensure cqe content is read after cqe ownership bit */ ++#ifdef dma_rmb + dma_rmb(); ++#else ++ rmb(); ++#endif + + return cqe; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0204-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0204-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw.h.patch new file mode 100644 index 0000000..158d542 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0204-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw.h.patch @@ -0,0 +1,82 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlxfw/mlxfw.h + +Change-Id: I51aea1feb481a5fa1deb1fed6ce90ec98354b10b +--- + drivers/net/ethernet/mellanox/mlxfw/mlxfw.h | 33 +++++++++++++++++---- + 1 file changed, 28 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h ++++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h +@@ -13,23 +13,33 @@ struct mlxfw_dev { + const struct mlxfw_dev_ops *ops; + const char *psid; + u16 psid_size; ++#ifdef HAVE_DEVLINK_H + struct devlink *devlink; ++#endif + }; + ++#ifdef HAVE_DEVLINK_H + static inline + struct device *mlxfw_dev_dev(struct mlxfw_dev *mlxfw_dev) + { ++#ifdef HAVE_DEVLINK_TO_DEV + return devlink_to_dev(mlxfw_dev->devlink); ++#else ++ return mlxfw_dev->devlink->dev; ++#endif + } ++#endif + + #define MLXFW_PRFX "mlxfw: " + ++#ifdef HAVE_DEVLINK_H + #define mlxfw_info(mlxfw_dev, fmt, ...) \ + dev_info(mlxfw_dev_dev(mlxfw_dev), MLXFW_PRFX fmt, ## __VA_ARGS__) + #define mlxfw_err(mlxfw_dev, fmt, ...) \ + dev_err(mlxfw_dev_dev(mlxfw_dev), MLXFW_PRFX fmt, ## __VA_ARGS__) + #define mlxfw_dbg(mlxfw_dev, fmt, ...) 
\ + dev_dbg(mlxfw_dev_dev(mlxfw_dev), MLXFW_PRFX fmt, ## __VA_ARGS__) ++#endif + + enum mlxfw_fsm_state { + MLXFW_FSM_STATE_IDLE, +@@ -96,19 +106,32 @@ struct mlxfw_dev_ops { + void (*fsm_cancel)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); + + void (*fsm_release)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); ++ ++#ifndef HAVE_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY ++ void (*status_notify)(struct mlxfw_dev *mlxfw_dev, ++ const char *msg, const char *comp_name, ++ u32 done_bytes, u32 total_bytes); ++#endif ++ + }; + + #if IS_REACHABLE(CONFIG_MLXFW) + int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, +- const struct firmware *firmware, +- struct netlink_ext_ack *extack); ++ const struct firmware *firmware ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack ++#endif ++ ); + #else + static inline + int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, +- const struct firmware *firmware, +- struct netlink_ext_ack *extack) ++ const struct firmware *firmware ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { +- return -EOPNOTSUPP; ++ return -EOPNOTSUPP; + } + #endif + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0205-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_f.patch b/src/mlnx-ofa_kernel-5.8/backports/0205-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_f.patch new file mode 100644 index 0000000..5e62d63 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0205-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_f.patch @@ -0,0 +1,621 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c + +Change-Id: I758da2d1c64ef054cecc519a51b700150fae02b9 +--- + .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 284 ++++++++++++++++-- + 1 file changed, 265 insertions(+), 19 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c ++++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c +@@ -39,7 +39,9 @@ static const int mlxfw_fsm_state_errno[] + } while (0) + + static int mlxfw_fsm_state_err(struct mlxfw_dev *mlxfw_dev, ++#ifdef HAVE_NETLINK_EXT_ACK + struct netlink_ext_ack *extack, ++#endif + enum mlxfw_fsm_state_err err) + { + enum mlxfw_fsm_state_err fsm_state_err; +@@ -49,35 +51,75 @@ static int mlxfw_fsm_state_err(struct ml + + switch (fsm_state_err) { + case MLXFW_FSM_STATE_ERR_ERROR: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "general error", err); ++#else ++ pr_err("%s: general error, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "component hash mismatch", err); ++#else ++ pr_err("%s: component hash mismatch, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "component not applicable", err); ++#else ++ pr_err("%s: component not applicable, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "unknown key", err); ++#else ++ pr_err("%s: unknown key, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "authentication failed", err); ++#else ++ pr_err("%s: authentication failed, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case 
MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "component was not signed", err); ++#else ++ pr_err("%s: component was not signed, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "key not applicable", err); ++#else ++ pr_err("%s: key not applicable, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "bad format", err); ++#else ++ pr_err("%s: bad format, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "pending reset", err); ++#else ++ pr_err("%s: pending reset, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + case MLXFW_FSM_STATE_ERR_OK: + case MLXFW_FSM_STATE_ERR_MAX: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "unknown error", err); ++#else ++ pr_err("%s: unknown error, err (%d)\n", MLXFW_ERR_PRFX , fsm_state_err); ++#endif + break; + } + +@@ -85,8 +127,11 @@ static int mlxfw_fsm_state_err(struct ml + }; + + static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, +- enum mlxfw_fsm_state fsm_state, +- struct netlink_ext_ack *extack) ++ enum mlxfw_fsm_state fsm_state ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + enum mlxfw_fsm_state_err fsm_state_err; + enum mlxfw_fsm_state curr_fsm_state; +@@ -98,17 +143,29 @@ retry: + err = mlxfw_dev->ops->fsm_query_state(mlxfw_dev, fwhandle, + &curr_fsm_state, &fsm_state_err); + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "FSM state query failed", err); ++#else ++ pr_err("%s: FSM state query failed, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + return err; + } + + if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) +- return mlxfw_fsm_state_err(mlxfw_dev, extack, fsm_state_err); ++ return mlxfw_fsm_state_err(mlxfw_dev, ++#ifdef HAVE_NETLINK_EXT_ACK ++ extack, ++#endif ++ fsm_state_err); + + if (curr_fsm_state != fsm_state) { + if (--times == 0) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Timeout reached on FSM state change", -ETIMEDOUT); ++#else ++ pr_err("%s: Timeout reached on FSM state change, err (%d)\n", MLXFW_ERR_PRFX , -ETIMEDOUT); ++#endif + return -ETIMEDOUT; + } + msleep(MLXFW_FSM_STATE_WAIT_CYCLE_MS); +@@ -119,7 +176,10 @@ retry: + + static int + mlxfw_fsm_reactivate_err(struct mlxfw_dev *mlxfw_dev, +- struct netlink_ext_ack *extack, u8 err) ++#ifdef HAVE_NETLINK_EXT_ACK ++ struct netlink_ext_ack *extack, ++#endif ++ u8 err) + { + enum mlxfw_fsm_reactivate_status status; + +@@ -132,42 +192,84 @@ mlxfw_fsm_reactivate_err(struct mlxfw_de + + switch (status) { + case MLXFW_FSM_REACTIVATE_STATUS_BUSY: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("busy", err); ++#else ++ pr_err("%s: busy, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_PROHIBITED_FW_VER_ERR: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("prohibited fw ver", err); ++#else ++ pr_err("%s: prohibited fw ver, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_COPY_FAILED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("first page copy failed", err); ++#else ++ pr_err("%s: first page copy failed, err 
(%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_ERASE_FAILED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("first page erase failed", err); ++#else ++ pr_err("%s: first page erase failed, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_RESTORE_FAILED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("first page restore failed", err); ++#else ++ pr_err("%s: first page restore failed, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_CANDIDATE_FW_DEACTIVATION_FAILED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("candidate fw deactivation failed", err); ++#else ++ pr_err("%s: candidate fw deactivation failed, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_ERR_DEVICE_RESET_REQUIRED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("device reset required", err); ++#else ++ pr_err("%s: device reset required, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_ERR_FW_PROGRAMMING_NEEDED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("fw programming needed", err); ++#else ++ pr_err("%s: fw progamming needed, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_FW_ALREADY_ACTIVATED: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("fw already activated", err); ++#else ++ pr_err("%s: fw already activated, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + case MLXFW_FSM_REACTIVATE_STATUS_OK: + case MLXFW_FSM_REACTIVATE_STATUS_MAX: ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_REACT_ERR("unexpected error", err); ++#else ++ pr_err("%s: unexpected error, err (%d)\n", MXFW_REACT_PRFX , err); ++#endif + break; + } + return -EREMOTEIO; + }; + + static int mlxfw_fsm_reactivate(struct mlxfw_dev *mlxfw_dev, ++#ifdef HAVE_NETLINK_EXT_ACK + struct netlink_ext_ack *extack, ++#endif + bool *supported) + { + u8 status; +@@ -183,8 +285,12 @@ static int mlxfw_fsm_reactivate(struct m + } + + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not reactivate firmware flash", err); ++#else ++ pr_err("%s: Could not reactivate firmware flash, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + return err; + } + +@@ -192,15 +298,26 @@ static int mlxfw_fsm_reactivate(struct m + status == MLXFW_FSM_REACTIVATE_STATUS_FW_ALREADY_ACTIVATED) + return 0; + +- return mlxfw_fsm_reactivate_err(mlxfw_dev, extack, status); ++ return mlxfw_fsm_reactivate_err(mlxfw_dev, ++#ifdef HAVE_NETLINK_EXT_ACK ++ extack, ++#endif ++ status); + } + + static void mlxfw_status_notify(struct mlxfw_dev *mlxfw_dev, + const char *msg, const char *comp_name, + u32 done_bytes, u32 total_bytes) + { ++#ifdef HAVE_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY + devlink_flash_update_status_notify(mlxfw_dev->devlink, msg, comp_name, + done_bytes, total_bytes); ++#else ++ if (!mlxfw_dev->ops->status_notify) ++ return; ++ mlxfw_dev->ops->status_notify(mlxfw_dev, msg, comp_name, ++ done_bytes, total_bytes); ++#endif + } + + #define MLXFW_ALIGN_DOWN(x, align_bits) ((x) & ~((1 << (align_bits)) - 1)) +@@ -210,8 +327,11 @@ static void mlxfw_status_notify(struct m + static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, + u32 fwhandle, + struct mlxfw_mfa2_component *comp, +- bool reactivate_supp, +- struct netlink_ext_ack *extack) ++ bool reactivate_supp ++#ifdef HAVE_NETLINK_EXT_ACK ++ ,struct netlink_ext_ack *extack ++#endif ++ ) + { + u16 comp_max_write_size; + u8 comp_align_bits; 
+@@ -228,42 +348,70 @@ static int mlxfw_flash_component(struct + &comp_max_size, &comp_align_bits, + &comp_max_write_size); + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, "FSM component query failed", err); ++#else ++ pr_err("%s: FSM component query failed, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + return err; + } + + comp_max_size = min_t(u32, comp_max_size, MLXFW_FSM_MAX_COMPONENT_SIZE); + if (comp->data_size > comp_max_size) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Component size is bigger than limit", -EINVAL); ++#else ++ pr_err("%s: Component size is bigger than limit, err (%d)\n", MLXFW_ERR_PRFX , -EINVAL); ++#endif + return -EINVAL; + } + + comp_max_write_size = MLXFW_ALIGN_DOWN(comp_max_write_size, + comp_align_bits); + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_dbg(mlxfw_dev, "Component update\n"); ++#else ++ pr_debug("Component update\n"); ++#endif + mlxfw_status_notify(mlxfw_dev, "Updating component", comp_name, 0, 0); + err = mlxfw_dev->ops->fsm_component_update(mlxfw_dev, fwhandle, + comp->index, + comp->data_size); + if (err) { + if (!reactivate_supp) ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "FSM component update failed, FW reactivate is not supported", + err); ++#else ++ pr_err("%s: FSM component update failed, FW reactivate is not supported, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + else ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "FSM component update failed", err); ++#else ++ pr_err("%s: FSM component update failed, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + return err; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, +- MLXFW_FSM_STATE_DOWNLOAD, extack); ++ MLXFW_FSM_STATE_DOWNLOAD ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + if (err) + goto err_out; + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_dbg(mlxfw_dev, "Component download\n"); ++#else ++ pr_debug("Component download\n"); ++#endif + mlxfw_status_notify(mlxfw_dev, "Downloading component", + comp_name, 0, comp->data_size); + for (offset = 0; +@@ -276,8 +424,12 @@ static int mlxfw_flash_component(struct + block_ptr, block_size, + offset); + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Component download failed", err); ++#else ++ pr_err("%s: Component download failed, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + goto err_out; + } + mlxfw_status_notify(mlxfw_dev, "Downloading component", +@@ -285,18 +437,30 @@ static int mlxfw_flash_component(struct + comp->data_size); + } + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_dbg(mlxfw_dev, "Component verify\n"); ++#else ++ pr_debug("Component verify\n"); ++#endif + mlxfw_status_notify(mlxfw_dev, "Verifying component", comp_name, 0, 0); + err = mlxfw_dev->ops->fsm_component_verify(mlxfw_dev, fwhandle, + comp->index); + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "FSM component verify failed", err); ++#else ++ pr_err("%s: FSM component verify failed, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + goto err_out; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, +- MLXFW_FSM_STATE_LOCKED, extack); ++ MLXFW_FSM_STATE_LOCKED ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + if (err) + goto err_out; + return 0; +@@ -308,8 +472,11 @@ err_out: + + static int mlxfw_flash_components(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + struct mlxfw_mfa2_file *mfa2_file, +- bool reactivate_supp, +- struct netlink_ext_ack *extack) ++ bool reactivate_supp ++#ifdef HAVE_NETLINK_EXT_ACK ++ , 
struct netlink_ext_ack *extack ++#endif ++ ) + { + u32 component_count; + int err; +@@ -319,8 +486,12 @@ static int mlxfw_flash_components(struct + mlxfw_dev->psid_size, + &component_count); + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not find device PSID in MFA2 file", err); ++#else ++ pr_err("%s: Could not find device PSID in MFA2 file, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + return err; + } + +@@ -331,15 +502,28 @@ static int mlxfw_flash_components(struct + mlxfw_dev->psid_size, i); + if (IS_ERR(comp)) { + err = PTR_ERR(comp); ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Failed to get MFA2 component", err); ++#else ++ pr_err("%s: Failed to get MFA2 component, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + return err; + } + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_info(mlxfw_dev, "Flashing component type %d\n", + comp->index); ++#else ++ pr_info("Flashing component type %d\n", comp->index); ++#endif ++ + err = mlxfw_flash_component(mlxfw_dev, fwhandle, comp, +- reactivate_supp, extack); ++ reactivate_supp ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + mlxfw_mfa2_file_component_put(comp); + if (err) + return err; +@@ -348,8 +532,11 @@ static int mlxfw_flash_components(struct + } + + int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, +- const struct firmware *firmware, +- struct netlink_ext_ack *extack) ++ const struct firmware *firmware ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack ++#endif ++ ) + { + struct mlxfw_mfa2_file *mfa2_file; + bool reactivate_supp = true; +@@ -357,68 +544,124 @@ int mlxfw_firmware_flash(struct mlxfw_de + int err; + + if (!mlxfw_mfa2_check(firmware)) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Firmware file is not MFA2", -EINVAL); ++#else ++ pr_err("%s: Firmware file is not MFA2, err (%d)\n", MLXFW_ERR_PRFX , -EINVAL); ++#endif + return -EINVAL; + } + + mfa2_file = mlxfw_mfa2_file_init(firmware); + if (IS_ERR(mfa2_file)) { + err = PTR_ERR(mfa2_file); ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Failed to initialize MFA2 firmware file", err); ++#else ++ pr_err("%s: Failed to initialize MFA2 firmware file, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + return err; + } + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_info(mlxfw_dev, "Initialize firmware flash process\n"); ++#else ++ pr_info("Initialize firmware flash process\n"); ++#endif + mlxfw_status_notify(mlxfw_dev, "Initializing firmware flash process", + NULL, 0, 0); + err = mlxfw_dev->ops->fsm_lock(mlxfw_dev, &fwhandle); + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not lock the firmware FSM", err); ++#else ++ pr_err("%s: Could not lock the firmware FSM, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + goto err_fsm_lock; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, +- MLXFW_FSM_STATE_LOCKED, extack); ++ MLXFW_FSM_STATE_LOCKED ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + if (err) + goto err_state_wait_idle_to_locked; + +- err = mlxfw_fsm_reactivate(mlxfw_dev, extack, &reactivate_supp); ++ err = mlxfw_fsm_reactivate(mlxfw_dev, ++#ifdef HAVE_NETLINK_EXT_ACK ++ extack, ++#endif ++ &reactivate_supp); + if (err) + goto err_fsm_reactivate; + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, +- MLXFW_FSM_STATE_LOCKED, extack); ++ MLXFW_FSM_STATE_LOCKED ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + if (err) + goto err_state_wait_reactivate_to_locked; + + err = mlxfw_flash_components(mlxfw_dev, 
fwhandle, mfa2_file, +- reactivate_supp, extack); ++ reactivate_supp ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + if (err) + goto err_flash_components; + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_dbg(mlxfw_dev, "Activate image\n"); ++#else ++ pr_debug("Activate image\n"); ++#endif + mlxfw_status_notify(mlxfw_dev, "Activating image", NULL, 0, 0); + err = mlxfw_dev->ops->fsm_activate(mlxfw_dev, fwhandle); + if (err) { ++#ifdef HAVE_NETLINK_EXT_ACK + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not activate the downloaded image", err); ++#else ++ pr_err("%s: Could not activate the downloaded image, err (%d)\n", MLXFW_ERR_PRFX , err); ++#endif + goto err_fsm_activate; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, +- MLXFW_FSM_STATE_LOCKED, extack); ++ MLXFW_FSM_STATE_LOCKED ++#ifdef HAVE_NETLINK_EXT_ACK ++ , extack ++#endif ++ ); + if (err) + goto err_state_wait_activate_to_locked; + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_dbg(mlxfw_dev, "Handle release\n"); ++#else ++ pr_debug("Handle release\n"); ++#endif + mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle); + ++#ifdef HAVE_NETLINK_EXT_ACK + mlxfw_info(mlxfw_dev, "Firmware flash done\n"); ++#else ++ pr_info("Firmware flash done\n"); ++#endif + mlxfw_status_notify(mlxfw_dev, "Firmware flash done", NULL, 0, 0); + mlxfw_mfa2_file_fini(mfa2_file); ++ ++#ifdef HAVE_DEVLINK_FLASH_UPDATE_END_NOTIFY ++ devlink_flash_update_end_notify(mlxfw_dev->devlink); ++#endif + return 0; + + err_state_wait_activate_to_locked: +@@ -430,6 +673,9 @@ err_state_wait_idle_to_locked: + mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle); + err_fsm_lock: + mlxfw_mfa2_file_fini(mfa2_file); ++#ifdef HAVE_DEVLINK_FLASH_UPDATE_END_NOTIFY ++ devlink_flash_update_end_notify(mlxfw_dev->devlink); ++#endif + return err; + } + EXPORT_SYMBOL(mlxfw_firmware_flash); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0206-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_m.patch b/src/mlnx-ofa_kernel-5.8/backports/0206-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_m.patch new file mode 100644 index 0000000..23dafa5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0206-BACKPORT-drivers-net-ethernet-mellanox-mlxfw-mlxfw_m.patch @@ -0,0 +1,18 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c + +Change-Id: I9e8a466a3b051b4f56e331fc326735a8a50d9a88 +--- + drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c ++++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c +@@ -7,6 +7,7 @@ + #define pr_fmt(fmt) "mlxfw_mfa2: " fmt + + #include ++#include + #include + #include + #include diff --git a/src/mlnx-ofa_kernel-5.8/backports/0210-BACKPORT-drivers-nvme-host-ioctl.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0210-BACKPORT-drivers-nvme-host-ioctl.c.patch new file mode 100644 index 0000000..b7b37f4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0210-BACKPORT-drivers-nvme-host-ioctl.c.patch @@ -0,0 +1,150 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/host/ioctl.c + +Change-Id: Iec372d7c3a75a0c7c90896d43ef4a5caf927c42f +--- + drivers/nvme/host/ioctl.c | 53 +++++++++++++++++++++++++++++++++++++-- + 1 file changed, 51 insertions(+), 2 deletions(-) + +--- a/drivers/nvme/host/ioctl.c ++++ b/drivers/nvme/host/ioctl.c +@@ -14,8 +14,10 @@ + */ + static void __user *nvme_to_user_ptr(uintptr_t ptrval) + { ++#if defined(HAVE_IN_COMPAT_SYSCALL) && defined(HAVE_COMPAT_UPTR_T) + if (in_compat_syscall()) + ptrval = 
(compat_uptr_t)ptrval; ++#endif + return (void __user *)ptrval; + } + +@@ -39,9 +41,10 @@ static void *nvme_add_user_metadata(stru + ret = PTR_ERR(bip); + goto out_free_meta; + } +- ++#ifdef HAVE_BIO_INTEGRITY_PYLD_BIP_ITER + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; ++#endif + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) +@@ -60,13 +63,21 @@ static int nvme_submit_user_cmd(struct r + { + bool write = nvme_is_write(cmd); + struct nvme_ns *ns = q->queuedata; ++#ifdef HAVE_ENUM_BIO_REMAPPED + struct block_device *bdev = ns ? ns->disk->part0 : NULL; ++#else ++ struct gendisk *disk = ns ? ns->disk : NULL; ++#endif + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + req = nvme_alloc_request(q, cmd, 0); ++#else ++ req = nvme_alloc_request(q, cmd, GFP_KERNEL, false); ++#endif + if (IS_ERR(req)) + return PTR_ERR(req); + +@@ -80,30 +91,62 @@ static int nvme_submit_user_cmd(struct r + if (ret) + goto out; + bio = req->bio; ++#ifdef HAVE_BIO_BI_DISK ++ bio->bi_disk = disk; ++ if (disk && meta_buffer && meta_len) { ++#elif defined HAVE_ENUM_BIO_REMAPPED + if (bdev) + bio_set_dev(bio, bdev); + if (bdev && meta_buffer && meta_len) { ++#else ++ if (disk) { ++ bio->bi_bdev = bdget_disk(disk, 0); ++ if (!bio->bi_bdev) { ++ ret = -ENODEV; ++ goto out_unmap; ++ } ++ } ++ if (disk && meta_buffer && meta_len) { ++#endif + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); + goto out_unmap; + } ++#ifdef HAVE_BLK_TYPES_REQ_INTEGRITY + req->cmd_flags |= REQ_INTEGRITY; ++#endif + } + } + ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) + ret = nvme_execute_passthru_rq(req); + if (result) + *result = le64_to_cpu(nvme_req(req)->result.u64); ++#else ++ nvme_execute_passthru_rq(req); ++ if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ++ ret = -EINTR; ++ else ++ ret = nvme_req(req)->status; ++#endif + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + kfree(meta); + out_unmap: ++#if defined HAVE_BIO_BI_DISK || defined HAVE_ENUM_BIO_REMAPPED + if (bio) + blk_rq_unmap_user(bio); ++#else ++ if (bio) { ++ if (disk && bio->bi_bdev) ++ bdput(bio->bi_bdev); ++ blk_rq_unmap_user(bio); ++ } ++#endif + out: + blk_mq_free_request(req); + return ret; +@@ -193,7 +236,7 @@ static int nvme_user_cmd(struct nvme_ctr + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; +- u64 result; ++ u64 result = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) +@@ -284,8 +327,10 @@ static bool is_ctrl_ioctl(unsigned int c + { + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; ++#ifdef HAVE_LINUX_SED_OPAL_H + if (is_sed_ioctl(cmd)) + return true; ++#endif + return false; + } + +@@ -298,7 +343,11 @@ static int nvme_ctrl_ioctl(struct nvme_c + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + default: ++#ifdef HAVE_LINUX_SED_OPAL_H + return sed_ioctl(ctrl->opal_dev, cmd, argp); ++#else ++ return 0; ++#endif + } + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0212-BACKPORT-drivers-nvme-host-nvfs-dma.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0212-BACKPORT-drivers-nvme-host-nvfs-dma.h.patch new file mode 100644 index 0000000..4f8c0fd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0212-BACKPORT-drivers-nvme-host-nvfs-dma.h.patch @@ -0,0 +1,183 @@ 
+From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/host/nvfs-dma.h + +Change-Id: Id9e8f89b4daf48524898f4243582bbf0ee1d7157 +--- + drivers/nvme/host/nvfs-dma.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/nvme/host/nvfs-dma.h ++++ b/drivers/nvme/host/nvfs-dma.h +@@ -10,30 +10,47 @@ static blk_status_t nvme_pci_setup_prps( + struct request *req, struct nvme_rw_command *cmnd); + + static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ struct request *req, struct nvme_rw_command *cmd); ++#else + struct request *req, struct nvme_rw_command *cmd, int entries); ++#endif + + static bool nvme_nvfs_unmap_data(struct nvme_dev *dev, struct request *req) + { +- struct nvme_iod *iod = blk_mq_rq_to_pdu(req); +- enum dma_data_direction dma_dir = rq_dma_dir(req); ++ struct nvme_iod *iod = blk_mq_rq_to_pdu(req); ++ enum dma_data_direction dma_dir = rq_dma_dir(req); + ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ if (!iod || !iod->sgt.nents) ++ return false; ++ ++ if (iod->sgt.sgl && !is_pci_p2pdma_page(sg_page(iod->sgt.sgl)) && ++#else + if (!iod || !iod->nents) + return false; +- if (iod->sg && !is_pci_p2pdma_page(sg_page(iod->sg)) && +- !blk_integrity_rq(req) && +- !iod->dma_len && +- nvfs_ops != NULL) { +- int count; +- count = nvfs_ops->nvfs_dma_unmap_sg(dev->dev, iod->sg, iod->nents, +- dma_dir); +- +- if (!count) +- return false; +- +- nvfs_put_ops(); +- return true; +- } +- return false; ++ ++ if (iod->sg && !is_pci_p2pdma_page(sg_page(iod->sg)) && ++#endif ++ !blk_integrity_rq(req) && ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && defined(HAVE_BLKDEV_REQ_BVEC) ++ !iod->dma_len && ++#endif ++ nvfs_ops != NULL) { ++ int count; ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ count = nvfs_ops->nvfs_dma_unmap_sg(dev->dev, iod->sgt.sgl, iod->sgt.nents, dma_dir); ++#else ++ count = nvfs_ops->nvfs_dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); ++#endif ++ if (!count) ++ return false; ++ ++ nvfs_put_ops(); ++ return true; ++ } ++ ++ return false; + } + + static blk_status_t nvme_nvfs_map_data(struct nvme_dev *dev, struct request *req, +@@ -49,52 +66,95 @@ static blk_status_t nvme_nvfs_map_data(s + *is_nvfs_io = false; + + if (!blk_integrity_rq(req) && nvfs_get_ops()) { ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && defined(HAVE_BLKDEV_REQ_BVEC) + iod->dma_len = 0; ++#endif ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); ++ if (!iod->sgt.sgl) { ++#else + iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); + if (!iod->sg) { ++#endif + nvfs_put_ops(); + return BLK_STS_RESOURCE; + } + ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); ++ // associates bio pages to scatterlist ++ iod->sgt.orig_nents = nvfs_ops->nvfs_blk_rq_map_sg(q, req, iod->sgt.sgl); ++ if (!iod->sgt.orig_nents) { ++ mempool_free(iod->sgt.sgl, dev->iod_mempool); ++#else + sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); + // associates bio pages to scatterlist + iod->nents = nvfs_ops->nvfs_blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) { + mempool_free(iod->sg, dev->iod_mempool); ++#endif + nvfs_put_ops(); + return BLK_STS_IOERR; // reset to original ret + } + *is_nvfs_io = true; + ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ if (unlikely((iod->sgt.orig_nents == NVFS_IO_ERR))) { ++ pr_err("%s: failed to map sg_nents=:%d\n", __func__, iod->sgt.nents); ++ mempool_free(iod->sgt.sgl, dev->iod_mempool); ++#else + if (unlikely((iod->nents == NVFS_IO_ERR))) { + pr_err("%s: failed to map sg_nents=:%d\n", 
__func__, iod->nents); + mempool_free(iod->sg, dev->iod_mempool); ++#endif + nvfs_put_ops(); + return BLK_STS_IOERR; + } + + nr_mapped = nvfs_ops->nvfs_dma_map_sg_attrs(dev->dev, ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ iod->sgt.sgl, ++ iod->sgt.orig_nents, ++ dma_dir, ++ DMA_ATTR_NO_WARN); ++ iod->sgt.nents = nr_mapped; ++#else + iod->sg, + iod->nents, + dma_dir, + DMA_ATTR_NO_WARN); ++#endif + + if (unlikely((nr_mapped == NVFS_IO_ERR))) { ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ mempool_free(iod->sgt.sgl, dev->iod_mempool); ++ nvfs_put_ops(); ++ pr_err("%s: failed to dma map sglist=:%d\n", __func__, iod->sgt.nents); ++#else + mempool_free(iod->sg, dev->iod_mempool); + nvfs_put_ops(); + pr_err("%s: failed to dma map sglist=:%d\n", __func__, iod->nents); ++#endif + return BLK_STS_IOERR; + } + + if (unlikely(nr_mapped == NVFS_CPU_REQ)) { ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ mempool_free(iod->sgt.sgl, dev->iod_mempool); ++#else + mempool_free(iod->sg, dev->iod_mempool); ++#endif + nvfs_put_ops(); + BUG(); + } + + iod->use_sgl = nvme_pci_use_sgls(dev, req); + if (iod->use_sgl) { // TBD: not tested on SGL mode supporting drive ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); ++#else + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); ++#endif + } else { + // push dma address to hw registers + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); +@@ -102,7 +162,11 @@ static blk_status_t nvme_nvfs_map_data(s + + if (ret != BLK_STS_OK) { + nvme_nvfs_unmap_data(dev, req); ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ mempool_free(iod->sgt.sgl, dev->iod_mempool); ++#else + mempool_free(iod->sg, dev->iod_mempool); ++#endif + } + return ret; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0213-BACKPORT-drivers-nvme-host-nvfs-rdma.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0213-BACKPORT-drivers-nvme-host-nvfs-rdma.h.patch new file mode 100644 index 0000000..ed351fe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0213-BACKPORT-drivers-nvme-host-nvfs-rdma.h.patch @@ -0,0 +1,47 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/host/nvfs-rdma.h + +Change-Id: Ie1ed15fc872e0738e354e8d283c82c1ee7aaa6b2 +--- + drivers/nvme/host/nvfs-rdma.h | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/drivers/nvme/host/nvfs-rdma.h ++++ b/drivers/nvme/host/nvfs-rdma.h +@@ -34,7 +34,12 @@ static bool nvme_rdma_nvfs_unmap_data(st + dma_dir); + if (count) { + nvfs_put_ops(); ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); ++#else ++ sg_free_table_chained(&req->data_sgl.sg_table, true); ++#endif ++ + return true; + } + } +@@ -89,7 +94,11 @@ static int nvme_rdma_nvfs_map_data(struc + if (count <= dev->num_inline_segments) { + if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + queue->ctrl->use_inline_data && ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES + blk_rq_payload_bytes(rq) <= ++#else ++ nvme_map_len(rq) <= ++#endif + nvme_rdma_inline_data_size(queue)) { + ret = nvme_rdma_map_sg_inline(queue, req, cmnd, count); + goto out; +@@ -114,7 +123,11 @@ out: + } + + out_free_table: ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); ++#else ++ sg_free_table_chained(&req->data_sgl.sg_table, true); ++#endif + return ret; + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0217-BACKPORT-drivers-nvme-host-tcp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0217-BACKPORT-drivers-nvme-host-tcp.c.patch new file 
mode 100644 index 0000000..a199d54 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0217-BACKPORT-drivers-nvme-host-tcp.c.patch @@ -0,0 +1,391 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/host/tcp.c + +Change-Id: I947d2fac11ccc3e3b2fcfafaa47fe7e4c10c8a7d +--- + drivers/nvme/host/tcp.c | 105 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 105 insertions(+) + +--- a/drivers/nvme/host/tcp.c ++++ b/drivers/nvme/host/tcp.c +@@ -3,6 +3,9 @@ + * NVMe over Fabrics TCP host. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. + */ ++#ifdef pr_fmt ++#undef pr_fmt ++#endif + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include + #include +@@ -131,7 +134,9 @@ struct nvme_tcp_ctrl { + struct work_struct err_work; + struct delayed_work connect_work; + struct nvme_tcp_request async_req; ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + u32 io_queues[HCTX_MAX_TYPES]; ++#endif + }; + + static LIST_HEAD(nvme_tcp_ctrl_list); +@@ -237,19 +242,29 @@ static void nvme_tcp_init_iter(struct nv + offset = 0; + } else { + struct bio *bio = req->curr_bio; ++#ifdef HAVE_BIO_FOR_EACH_BVEC + struct bvec_iter bi; + struct bio_vec bv; ++#endif + + vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); ++#ifdef HAVE_BIO_FOR_EACH_BVEC + nr_bvec = 0; + bio_for_each_bvec(bv, bio, bi) { + nr_bvec++; + } ++#else ++ nr_bvec = bio_segments(bio); ++#endif + size = bio->bi_iter.bi_size; + offset = bio->bi_iter.bi_bvec_done; + } + ++#ifdef HAVE_IOV_ITER_IS_BVEC_SET + iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size); ++#else ++ iov_iter_bvec(&req->iter, ITER_BVEC | dir, vec, nr_bvec, size); ++#endif + req->iter.iov_offset = offset; + } + +@@ -1275,6 +1290,10 @@ static void nvme_tcp_free_queue(struct n + mutex_destroy(&queue->queue_lock); + } + ++#ifndef NVME_TCP_MIN_MAXH2CDATA ++#define NVME_TCP_MIN_MAXH2CDATA 4096 ++#endif ++ + static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) + { + struct nvme_tcp_icreq_pdu *icreq; +@@ -1382,6 +1401,7 @@ free_icreq: + return ret; + } + ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) + { + return nvme_tcp_queue_id(queue) == 0; +@@ -1435,6 +1455,7 @@ static void nvme_tcp_set_queue_io_cpu(st + ctrl->io_queues[HCTX_TYPE_READ] - 1; + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + } ++#endif /* HAVE_BLK_MQ_HCTX_TYPE */ + + static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, + int qid, size_t queue_size) +@@ -1442,6 +1463,12 @@ static int nvme_tcp_alloc_queue(struct n + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + int ret, rcv_pdu_size; ++#ifndef HAVE_BLK_MQ_HCTX_TYPE ++ int n; ++#endif ++#ifndef HAVE_IP_SOCK_SET_TOS ++ int opt; ++#endif + + mutex_init(&queue->queue_lock); + queue->ctrl = ctrl; +@@ -1466,10 +1493,32 @@ static int nvme_tcp_alloc_queue(struct n + } + + /* Single syn retry */ ++#ifdef HAVE_TCP_SOCK_SET_SYNCNT + tcp_sock_set_syncnt(queue->sock->sk, 1); ++#else ++ opt = 1; ++ ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, ++ (char *)&opt, sizeof(opt)); ++ if (ret) { ++ dev_err(nctrl->device, ++ "failed to set TCP_SYNCNT sock opt %d\n", ret); ++ goto err_sock; ++ } ++#endif + + /* Set TCP no delay */ ++#ifdef HAVE_TCP_SOCK_SET_NODELAY + tcp_sock_set_nodelay(queue->sock->sk); ++#else ++ opt = 1; ++ ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, ++ TCP_NODELAY, (char *)&opt, sizeof(opt)); ++ if (ret) { ++ dev_err(nctrl->device, ++ "failed to set TCP_NODELAY sock opt %d\n", ret); ++ goto err_sock; 
++ } ++#endif + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket +@@ -1482,14 +1531,34 @@ static int nvme_tcp_alloc_queue(struct n + sock_set_priority(queue->sock->sk, so_priority); + + /* Set socket type of service */ ++#ifdef HAVE_IP_SOCK_SET_TOS + if (nctrl->opts->tos >= 0) + ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); ++#else ++ if (nctrl->opts->tos >= 0) { ++ opt = nctrl->opts->tos; ++ ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, ++ (char *)&opt, sizeof(opt)); ++ if (ret) { ++ dev_err(nctrl->device, ++ "failed to set IP_TOS sock opt %d\n", ret); ++ } ++ } ++#endif + + /* Set 10 seconds timeout for icresp recvmsg */ + queue->sock->sk->sk_rcvtimeo = 10 * HZ; + + queue->sock->sk->sk_allocation = GFP_ATOMIC; ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + nvme_tcp_set_queue_io_cpu(queue); ++#else ++ if (!qid) ++ n = 0; ++ else ++ n = (qid - 1) % num_online_cpus(); ++ queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); ++#endif + queue->request = NULL; + queue->data_remaining = 0; + queue->ddgst_remaining = 0; +@@ -1508,6 +1577,7 @@ static int nvme_tcp_alloc_queue(struct n + } + } + ++#ifdef HAVE_SOCK_SETOPTVAL_SOCKPTR_T + if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) { + char *iface = nctrl->opts->host_iface; + sockptr_t optval = KERNEL_SOCKPTR(iface); +@@ -1521,6 +1591,7 @@ static int nvme_tcp_alloc_queue(struct n + goto err_sock; + } + } ++#endif + + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; +@@ -1673,7 +1744,9 @@ static struct blk_mq_tag_set *nvme_tcp_a + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; ++#endif + } + + ret = blk_mq_alloc_tag_set(set); +@@ -1781,6 +1854,7 @@ static unsigned int nvme_tcp_nr_io_queue + static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, + unsigned int nr_io_queues) + { ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvmf_ctrl_options *opts = nctrl->opts; + +@@ -1811,6 +1885,7 @@ static void nvme_tcp_set_io_queues(struc + ctrl->io_queues[HCTX_TYPE_POLL] = + min(opts->nr_poll_queues, nr_io_queues); + } ++#endif + } + + static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) +@@ -1842,7 +1917,11 @@ static void nvme_tcp_destroy_io_queues(s + { + nvme_tcp_stop_io_queues(ctrl); + if (remove) { ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->connect_q); ++#else + blk_cleanup_queue(ctrl->connect_q); ++#endif + blk_mq_free_tag_set(ctrl->tagset); + } + nvme_tcp_free_io_queues(ctrl); +@@ -1899,7 +1978,11 @@ out_wait_freeze_timed_out: + out_cleanup_connect_q: + nvme_cancel_tagset(ctrl); + if (new) ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->connect_q); ++#else + blk_cleanup_queue(ctrl->connect_q); ++#endif + out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->tagset); +@@ -1912,8 +1995,13 @@ static void nvme_tcp_destroy_admin_queue + { + nvme_tcp_stop_queue(ctrl, 0); + if (remove) { ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->admin_q); ++ blk_mq_destroy_queue(ctrl->fabrics_q); ++#else + blk_cleanup_queue(ctrl->admin_q); + blk_cleanup_queue(ctrl->fabrics_q); ++#endif + blk_mq_free_tag_set(ctrl->admin_tagset); + } + nvme_tcp_free_admin_queue(ctrl); +@@ -1969,12 +2057,21 @@ out_quiesce_queue: + out_stop_queue: + nvme_tcp_stop_queue(ctrl, 0); + nvme_cancel_admin_tagset(ctrl); ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE 
++out_cleanup_queue: ++ if (new) ++ blk_mq_destroy_queue(ctrl->admin_q); ++out_cleanup_fabrics_q: ++ if (new) ++ blk_mq_destroy_queue(ctrl->fabrics_q); ++#else + out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->admin_q); + out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->fabrics_q); ++#endif + out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->admin_tagset); +@@ -2287,7 +2384,11 @@ static void nvme_tcp_complete_timed_out( + } + + static enum blk_eh_timer_return ++#ifdef HAVE_BLK_MQ_OPS_TIMEOUT_1_PARAM ++nvme_tcp_timeout(struct request *rq) ++#else + nvme_tcp_timeout(struct request *rq, bool reserved) ++#endif + { + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; +@@ -2397,6 +2498,7 @@ static blk_status_t nvme_tcp_setup_cmd_p + return 0; + } + ++#ifdef HAVE_BLK_MQ_OPS_COMMIT_RQS + static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) + { + struct nvme_tcp_queue *queue = hctx->driver_data; +@@ -2404,6 +2506,7 @@ static void nvme_tcp_commit_rqs(struct b + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } ++#endif + + static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +@@ -2431,6 +2534,7 @@ static blk_status_t nvme_tcp_queue_rq(st + + static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) + { ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + +@@ -2470,11 +2574,23 @@ static int nvme_tcp_map_queues(struct bl + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); ++#else ++ blk_mq_map_queues(set); ++#endif + + return 0; + } + ++#ifdef HAVE_BLK_MQ_OPS_POLL ++#ifdef HAVE_BLK_MQ_OPS_POLL_1_ARG ++static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) ++#else ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) ++#else ++static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) ++#endif ++#endif + { + struct nvme_tcp_queue *queue = hctx->driver_data; + struct sock *sk = queue->sock->sk; +@@ -2483,23 +2599,32 @@ static int nvme_tcp_poll(struct blk_mq_h + return 0; + + set_bit(NVME_TCP_Q_POLLING, &queue->flags); ++#ifdef HAVE_SKB_QUEUE_EMPTY_LOCKLESS + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) ++#else ++ if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue)) ++#endif + sk_busy_loop(sk, true); + nvme_tcp_try_recv(queue); + clear_bit(NVME_TCP_Q_POLLING, &queue->flags); + return queue->nr_cqe; + } ++#endif + + static const struct blk_mq_ops nvme_tcp_mq_ops = { + .queue_rq = nvme_tcp_queue_rq, ++#ifdef HAVE_BLK_MQ_OPS_COMMIT_RQS + .commit_rqs = nvme_tcp_commit_rqs, ++#endif + .complete = nvme_complete_rq, + .init_request = nvme_tcp_init_request, + .exit_request = nvme_tcp_exit_request, + .init_hctx = nvme_tcp_init_hctx, + .timeout = nvme_tcp_timeout, + .map_queues = nvme_tcp_map_queues, ++#ifdef HAVE_BLK_MQ_OPS_POLL + .poll = nvme_tcp_poll, ++#endif + }; + + static const struct blk_mq_ops nvme_tcp_admin_mq_ops = { +@@ -2592,6 +2717,7 @@ static struct nvme_ctrl *nvme_tcp_create + } + } + ++#ifdef HAVE_SOCK_SETOPTVAL_SOCKPTR_T + if (opts->mask & NVMF_OPT_HOST_IFACE) { + if (!__dev_get_by_name(&init_net, opts->host_iface)) { + pr_err("invalid interface passed: %s\n", +@@ -2600,6 +2726,7 @@ static struct nvme_ctrl *nvme_tcp_create + goto out_free_ctrl; + } + } 
++#endif + + if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { + ret = -EALREADY; +@@ -2657,7 +2784,11 @@ static struct nvmf_transport_ops nvme_tc + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | ++#ifdef HAVE_SOCK_SETOPTVAL_SOCKPTR_T + NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE, ++#else ++ NVMF_OPT_TOS, ++#endif + .create_ctrl = nvme_tcp_create_ctrl, + }; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0218-BACKPORT-drivers-nvme-host-zns.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0218-BACKPORT-drivers-nvme-host-zns.c.patch new file mode 100644 index 0000000..b84a076 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0218-BACKPORT-drivers-nvme-host-zns.c.patch @@ -0,0 +1,24 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/host/zns.c + +Change-Id: Ia6b7efa54174639c1fcec915c6064f1809c0816a +--- + drivers/nvme/host/zns.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/nvme/host/zns.c ++++ b/drivers/nvme/host/zns.c +@@ -3,6 +3,8 @@ + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + ++#ifdef HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES ++ + #include + #include + #include "nvme.h" +@@ -248,3 +250,4 @@ blk_status_t nvme_setup_zone_mgmt_send(s + + return BLK_STS_OK; + } ++#endif /* HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0219-BACKPORT-drivers-nvme-target-admin-cmd.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0219-BACKPORT-drivers-nvme-target-admin-cmd.c.patch new file mode 100644 index 0000000..f86a722 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0219-BACKPORT-drivers-nvme-target-admin-cmd.c.patch @@ -0,0 +1,159 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/admin-cmd.c + +Change-Id: Ibe53a661b98013094406ad6e7b60809621049c58 +--- + drivers/nvme/target/admin-cmd.c | 40 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 40 insertions(+) + +--- a/drivers/nvme/target/admin-cmd.c ++++ b/drivers/nvme/target/admin-cmd.c +@@ -9,7 +9,9 @@ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include + #include ++#ifdef HAVE_PART_STAT_H + #include ++#endif + + #include + #include +@@ -88,12 +90,21 @@ static u16 nvmet_get_smart_log_nsid(stru + if (!req->ns->bdev) + return NVME_SC_SUCCESS; + ++#ifdef HAVE_REQUEST_BDEV + host_reads = part_stat_read(req->ns->bdev, ios[READ]); + data_units_read = + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000); + host_writes = part_stat_read(req->ns->bdev, ios[WRITE]); + data_units_written = + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000); ++#else ++ host_reads = part_stat_read(req->ns->bdev->bd_part, ios[READ]); ++ data_units_read = DIV_ROUND_UP(part_stat_read(req->ns->bdev->bd_part, ++ sectors[READ]), 1000); ++ host_writes = part_stat_read(req->ns->bdev->bd_part, ios[WRITE]); ++ data_units_written = DIV_ROUND_UP(part_stat_read(req->ns->bdev->bd_part, ++ sectors[WRITE]), 1000); ++#endif + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); +@@ -117,12 +128,21 @@ static u16 nvmet_get_smart_log_all(struc + /* we don't have the right data for file backed ns */ + if (!ns->bdev) + continue; ++#ifdef HAVE_REQUEST_BDEV + host_reads += part_stat_read(ns->bdev, ios[READ]); + data_units_read += DIV_ROUND_UP( + part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes += part_stat_read(ns->bdev, ios[WRITE]); + data_units_written += 
DIV_ROUND_UP( + part_stat_read(ns->bdev, sectors[WRITE]), 1000); ++#else ++ host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); ++ data_units_read += DIV_ROUND_UP( ++ part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000); ++ host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); ++ data_units_written += DIV_ROUND_UP( ++ part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); ++#endif + } + + put_unaligned_le64(host_reads, &slog->host_reads[0]); +@@ -267,7 +287,11 @@ static u32 nvmet_format_ana_group(struct + desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt); + desc->state = req->port->ana_state[grpid]; + memset(desc->rsvd17, 0, sizeof(desc->rsvd17)); ++#ifdef struct_size + return struct_size(desc, nsids, count); ++#else ++ return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32); ++#endif + } + + static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) +@@ -345,8 +369,10 @@ static void nvmet_execute_get_log_page(s + nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); + } + ++#ifdef HAVE_BLKDEV_ISSUE_ZEROOUT + static bool nvmet_is_write_zeroes(struct nvmet_ctrl *ctrl) + { ++#ifdef HAVE_BDEV_WRITE_ZEROES_SECTORS + struct nvmet_ns *ns; + unsigned long idx; + +@@ -354,7 +380,11 @@ static bool nvmet_is_write_zeroes(struct + if (!bdev_write_zeroes_sectors(ns->bdev)) + return false; + return true; ++#else ++ return false; ++#endif + } ++#endif + + static void nvmet_execute_identify_ctrl(struct nvmet_req *req) + { +@@ -453,11 +483,15 @@ static void nvmet_execute_identify_ctrl( + + id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); + id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); ++#ifdef HAVE_BLKDEV_ISSUE_ZEROOUT + if (!req->port->offload || nvmet_is_write_zeroes(ctrl)) + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | + NVME_CTRL_ONCS_WRITE_ZEROES); + else + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM); ++#else ++ id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM); ++#endif + + /* XXX: don't report vwc if the underlying device is write through */ + id->vwc = NVME_CTRL_VWC_PRESENT; +@@ -720,6 +754,7 @@ static void nvmet_execute_identify(struc + } + break; + case NVME_ID_CNS_CS_NS: ++#ifdef HAVE_BIO_ADD_ZONE_APPEND_PAGE + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: +@@ -728,6 +763,7 @@ static void nvmet_execute_identify(struc + break; + } + } ++#endif + break; + case NVME_ID_CNS_CTRL: + switch (req->cmd->identify.csi) { +@@ -736,6 +772,7 @@ static void nvmet_execute_identify(struc + } + break; + case NVME_ID_CNS_CS_CTRL: ++#ifdef HAVE_BIO_ADD_ZONE_APPEND_PAGE + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: +@@ -744,6 +781,7 @@ static void nvmet_execute_identify(struc + break; + } + } ++#endif + break; + case NVME_ID_CNS_NS_ACTIVE_LIST: + switch (req->cmd->identify.csi) { +@@ -781,9 +819,11 @@ static u16 nvmet_write_protect_flush_syn + { + u16 status; + ++#ifdef HAVE_FS_HAS_KIOCB + if (req->ns->file) + status = nvmet_file_flush(req); + else ++#endif + status = nvmet_bdev_flush(req); + + if (status) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0220-BACKPORT-drivers-nvme-target-configfs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0220-BACKPORT-drivers-nvme-target-configfs.c.patch new file mode 100644 index 0000000..63337b2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0220-BACKPORT-drivers-nvme-target-configfs.c.patch @@ -0,0 +1,387 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/configfs.c + +Change-Id: 
I29301dc54b2dae1eb408390ee655686625fdcabd +--- + drivers/nvme/target/configfs.c | 95 +++++++++++++++++++++++++++------- + 1 file changed, 75 insertions(+), 20 deletions(-) + +--- a/drivers/nvme/target/configfs.c ++++ b/drivers/nvme/target/configfs.c +@@ -18,8 +18,13 @@ + + #include "nvmet.h" + +-static const struct config_item_type nvmet_host_type; +-static const struct config_item_type nvmet_subsys_type; ++#ifdef MLX_CONFIG_ITEM_TYPE_CONST ++#undef MLX_CONFIG_ITEM_TYPE_CONST ++#endif ++#define MLX_CONFIG_ITEM_TYPE_CONST ++ ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_host_type; ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_subsys_type; + + static LIST_HEAD(nvmet_ports_list); + struct list_head *nvmet_ports = &nvmet_ports_list; +@@ -253,6 +258,7 @@ static ssize_t nvmet_param_inline_data_s + CONFIGFS_ATTR(nvmet_, param_inline_data_size); + + #ifdef CONFIG_BLK_DEV_INTEGRITY ++#ifdef HAVE_BLKDEV_BIO_INTEGRITY_BYTES + static ssize_t nvmet_param_pi_enable_show(struct config_item *item, + char *page) + { +@@ -279,6 +285,7 @@ static ssize_t nvmet_param_pi_enable_sto + + CONFIGFS_ATTR(nvmet_, param_pi_enable); + #endif ++#endif + + static ssize_t nvmet_addr_trtype_show(struct config_item *item, + char *page) +@@ -495,7 +502,12 @@ static ssize_t nvmet_ns_device_path_stor + + kfree(ns->device_path); + ret = -ENOMEM; ++#ifdef HAVE_KMEMDUP_NUL + ns->device_path = kmemdup_nul(page, len, GFP_KERNEL); ++#else ++ ns->device_path = kstrndup(page, len, GFP_KERNEL); ++#endif ++ + if (!ns->device_path) + goto out_unlock; + +@@ -905,12 +917,13 @@ static struct configfs_item_operations n + .release = nvmet_ns_release, + }; + +-static const struct config_item_type nvmet_ns_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_ns_type = { + .ct_item_ops = &nvmet_ns_item_ops, + .ct_attrs = nvmet_ns_attrs, + .ct_owner = THIS_MODULE, + }; + ++#ifdef HAVE_CONFIGFS_REGISTER_GROUP + static ssize_t nvmet_offload_ctx_traddr_show(struct config_item *item, char *page) + { + struct nvmet_offload_ctx *ctx = to_nvmet_offload_ctx(item); +@@ -968,14 +981,15 @@ static struct configfs_attribute *nvmet_ + NULL, + }; + +-static const struct config_item_type nvmet_offload_ctx_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_offload_ctx_type = { + .ct_attrs = nvmet_offload_ctx_attrs, + .ct_owner = THIS_MODULE, + }; + +-static const struct config_item_type nvmet_offload_ctxs_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_offload_ctxs_type = { + .ct_owner = THIS_MODULE, + }; ++#endif + + static struct config_group *nvmet_ns_make(struct config_group *group, + const char *name) +@@ -1001,10 +1015,12 @@ static struct config_group *nvmet_ns_mak + goto out; + config_group_init_type_name(&ns->group, name, &nvmet_ns_type); + ++#ifdef HAVE_CONFIGFS_REGISTER_GROUP + config_group_init_type_name(&ns->offload_ctxs_group, + "offload_ctxs", &nvmet_offload_ctxs_type); + configfs_add_default_group(&ns->offload_ctxs_group, + &ns->group); ++#endif + + pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn); + +@@ -1017,7 +1033,7 @@ static struct configfs_group_operations + .make_group = nvmet_ns_make, + }; + +-static const struct config_item_type nvmet_namespaces_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_namespaces_type = { + .ct_group_ops = &nvmet_namespaces_group_ops, + .ct_owner = THIS_MODULE, + }; +@@ -1158,7 +1174,7 @@ static struct configfs_attribute *nvmet_ + NULL, + }; + +-static const 
struct config_item_type nvmet_passthru_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_passthru_type = { + .ct_attrs = nvmet_passthru_attrs, + .ct_owner = THIS_MODULE, + }; +@@ -1238,8 +1254,13 @@ out_free_link: + return ret; + } + ++#ifndef HAVE_CONFIGFS_DROP_LINK_RETURNS_INT + static void nvmet_port_subsys_drop_link(struct config_item *parent, + struct config_item *target) ++#else ++static int nvmet_port_subsys_drop_link(struct config_item *parent, ++ struct config_item *target) ++#endif + { + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys = to_subsys(target); +@@ -1251,7 +1272,11 @@ static void nvmet_port_subsys_drop_link( + goto found; + } + up_write(&nvmet_config_sem); ++#ifndef HAVE_CONFIGFS_DROP_LINK_RETURNS_INT + return; ++#else ++ return -EINVAL; ++#endif + + found: + list_del(&p->entry); +@@ -1266,6 +1291,9 @@ found: + nvmet_uninit_offload_subsystem_port_attrs(subsys); + up_write(&nvmet_config_sem); + kfree(p); ++#ifdef HAVE_CONFIGFS_DROP_LINK_RETURNS_INT ++ return 0; ++#endif + } + + static struct configfs_item_operations nvmet_port_subsys_item_ops = { +@@ -1273,7 +1301,7 @@ static struct configfs_item_operations n + .drop_link = nvmet_port_subsys_drop_link, + }; + +-static const struct config_item_type nvmet_port_subsys_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_port_subsys_type = { + .ct_item_ops = &nvmet_port_subsys_item_ops, + .ct_owner = THIS_MODULE, + }; +@@ -1320,8 +1348,14 @@ out_free_link: + return ret; + } + ++#ifndef HAVE_CONFIGFS_DROP_LINK_RETURNS_INT + static void nvmet_allowed_hosts_drop_link(struct config_item *parent, + struct config_item *target) ++#else ++static int nvmet_allowed_hosts_drop_link(struct config_item *parent, ++ struct config_item *target) ++ ++#endif + { + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host = to_host(target); +@@ -1333,7 +1367,11 @@ static void nvmet_allowed_hosts_drop_lin + goto found; + } + up_write(&nvmet_config_sem); ++#ifndef HAVE_CONFIGFS_DROP_LINK_RETURNS_INT + return; ++#else ++ return -EINVAL; ++#endif + + found: + list_del(&p->entry); +@@ -1341,6 +1379,9 @@ found: + + up_write(&nvmet_config_sem); + kfree(p); ++#ifdef HAVE_CONFIGFS_DROP_LINK_RETURNS_INT ++ return 0; ++#endif + } + + static struct configfs_item_operations nvmet_allowed_hosts_item_ops = { +@@ -1348,7 +1389,7 @@ static struct configfs_item_operations n + .drop_link = nvmet_allowed_hosts_drop_link, + }; + +-static const struct config_item_type nvmet_allowed_hosts_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_allowed_hosts_type = { + .ct_item_ops = &nvmet_allowed_hosts_item_ops, + .ct_owner = THIS_MODULE, + }; +@@ -1609,7 +1650,11 @@ static ssize_t nvmet_subsys_attr_model_s + return -EINVAL; + } + ++#ifdef HAVE_KMEMDUP_NUL + subsys->model_number = kmemdup_nul(page, len, GFP_KERNEL); ++#else ++ subsys->model_number = kstrndup(page, len, GFP_KERNEL); ++#endif + if (!subsys->model_number) + return -ENOMEM; + return count; +@@ -1632,6 +1677,7 @@ static ssize_t nvmet_subsys_attr_model_s + CONFIGFS_ATTR(nvmet_subsys_, attr_model); + + #ifdef CONFIG_BLK_DEV_INTEGRITY ++#ifdef HAVE_BLKDEV_BIO_INTEGRITY_BYTES + static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, + char *page) + { +@@ -1652,6 +1698,7 @@ static ssize_t nvmet_subsys_attr_pi_enab + } + CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable); + #endif ++#endif + + static ssize_t + 
nvmet_subsys_attr_offload_subsys_unknown_ns_cmds_show(struct config_item *item, +@@ -1750,9 +1797,11 @@ static struct configfs_attribute *nvmet_ + &nvmet_subsys_attr_attr_cntlid_min, + &nvmet_subsys_attr_attr_cntlid_max, + &nvmet_subsys_attr_attr_model, ++#ifdef HAVE_BLKDEV_BIO_INTEGRITY_BYTES + #ifdef CONFIG_BLK_DEV_INTEGRITY + &nvmet_subsys_attr_attr_pi_enable, + #endif ++#endif + &nvmet_subsys_attr_attr_offload, + &nvmet_subsys_attr_attr_offload_subsys_unknown_ns_cmds, + NULL, +@@ -1773,7 +1822,7 @@ static struct configfs_item_operations n + .release = nvmet_subsys_release, + }; + +-static const struct config_item_type nvmet_subsys_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_subsys_type = { + .ct_item_ops = &nvmet_subsys_item_ops, + .ct_attrs = nvmet_subsys_attrs, + .ct_owner = THIS_MODULE, +@@ -1813,7 +1862,7 @@ static struct configfs_group_operations + .make_group = nvmet_subsys_make, + }; + +-static const struct config_item_type nvmet_subsystems_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_subsystems_type = { + .ct_group_ops = &nvmet_subsystems_group_ops, + .ct_owner = THIS_MODULE, + }; +@@ -1881,7 +1930,7 @@ static struct configfs_item_operations n + .release = nvmet_referral_release, + }; + +-static const struct config_item_type nvmet_referral_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_referral_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = nvmet_referral_attrs, + .ct_item_ops = &nvmet_referral_item_ops, +@@ -1907,7 +1956,7 @@ static struct configfs_group_operations + .disconnect_notify = nvmet_referral_notify, + }; + +-static const struct config_item_type nvmet_referrals_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_referrals_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &nvmet_referral_group_ops, + }; +@@ -1986,7 +2035,7 @@ static struct configfs_item_operations n + .release = nvmet_ana_group_release, + }; + +-static const struct config_item_type nvmet_ana_group_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_ana_group_type = { + .ct_item_ops = &nvmet_ana_group_item_ops, + .ct_attrs = nvmet_ana_group_attrs, + .ct_owner = THIS_MODULE, +@@ -2031,7 +2080,7 @@ static struct configfs_group_operations + .make_group = nvmet_ana_groups_make_group, + }; + +-static const struct config_item_type nvmet_ana_groups_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_ana_groups_type = { + .ct_group_ops = &nvmet_ana_groups_group_ops, + .ct_owner = THIS_MODULE, + }; +@@ -2064,8 +2113,10 @@ static struct configfs_attribute *nvmet_ + &nvmet_attr_param_offload_queue_size, + &nvmet_attr_param_offload_passthrough_sqe_rw, + #ifdef CONFIG_BLK_DEV_INTEGRITY ++#ifdef HAVE_BLKDEV_BIO_INTEGRITY_BYTES + &nvmet_attr_param_pi_enable, + #endif ++#endif + NULL, + }; + +@@ -2073,7 +2124,7 @@ static struct configfs_item_operations n + .release = nvmet_port_release, + }; + +-static const struct config_item_type nvmet_port_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_port_type = { + .ct_attrs = nvmet_port_attrs, + .ct_item_ops = &nvmet_port_item_ops, + .ct_owner = THIS_MODULE, +@@ -2149,7 +2200,7 @@ static struct configfs_group_operations + .make_group = nvmet_ports_make, + }; + +-static const struct config_item_type nvmet_ports_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_ports_type = { + .ct_group_ops = &nvmet_ports_group_ops, + .ct_owner = THIS_MODULE, + }; +@@ -2168,7 
+2219,7 @@ static struct configfs_item_operations n + .release = nvmet_host_release, + }; + +-static const struct config_item_type nvmet_host_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_host_type = { + .ct_item_ops = &nvmet_host_item_ops, + .ct_owner = THIS_MODULE, + }; +@@ -2191,14 +2242,14 @@ static struct configfs_group_operations + .make_group = nvmet_hosts_make_group, + }; + +-static const struct config_item_type nvmet_hosts_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_hosts_type = { + .ct_group_ops = &nvmet_hosts_group_ops, + .ct_owner = THIS_MODULE, + }; + + static struct config_group nvmet_hosts_group; + +-static const struct config_item_type nvmet_root_type = { ++static MLX_CONFIG_ITEM_TYPE_CONST struct config_item_type nvmet_root_type = { + .ct_owner = THIS_MODULE, + }; + +@@ -2213,16 +2264,19 @@ static struct configfs_subsystem nvmet_c + + void nvmet_offload_ctx_configfs_del(struct nvmet_offload_ctx *ctx) + { ++#ifdef HAVE_CONFIGFS_REGISTER_GROUP + if (d_inode(ctx->group.cg_item.ci_dentry)) { + pr_info("Removing offload ctx %d from configfs\n", ctx->id); + configfs_unregister_group(&ctx->group); + } ++#endif + } + EXPORT_SYMBOL_GPL(nvmet_offload_ctx_configfs_del); + + int nvmet_offload_ctx_configfs_create(struct nvmet_offload_ctx *ctx) + { + int res = 0; ++#ifdef HAVE_CONFIGFS_REGISTER_GROUP + char name[CONFIGFS_ITEM_NAME_LEN]; + + sprintf(name, "%d", ctx->id); +@@ -2235,6 +2289,7 @@ int nvmet_offload_ctx_configfs_create(st + if (res) + pr_err("failed to register configfs group for offload ctx %s\n", + name); ++#endif + + return res; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0221-BACKPORT-drivers-nvme-target-core.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0221-BACKPORT-drivers-nvme-target-core.c.patch new file mode 100644 index 0000000..2857cfd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0221-BACKPORT-drivers-nvme-target-core.c.patch @@ -0,0 +1,198 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/core.c + +Change-Id: Ife1f49222e3b289ae37e31859fad2be58455f4e9 +--- + drivers/nvme/target/core.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 97 insertions(+) + +--- a/drivers/nvme/target/core.c ++++ b/drivers/nvme/target/core.c +@@ -89,8 +89,13 @@ u16 nvmet_report_invalid_opcode(struct n + static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn); + ++#ifdef CONFIG_COMPAT_SCATTERLIST_SG_PCOPY_TO_BUFFER + u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len) ++#else ++u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, void *buf, ++ size_t len) ++#endif + { + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); +@@ -564,7 +569,9 @@ void nvmet_put_namespace(struct nvmet_ns + static void nvmet_ns_dev_disable(struct nvmet_ns *ns) + { + nvmet_bdev_ns_disable(ns); ++#ifdef HAVE_FS_HAS_KIOCB + nvmet_file_ns_disable(ns); ++#endif + } + + static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns) +@@ -656,8 +663,10 @@ void nvmet_ns_revalidate(struct nvmet_ns + + if (ns->bdev) + nvmet_bdev_ns_revalidate(ns); ++#ifdef HAVE_FS_HAS_KIOCB + else + nvmet_file_ns_revalidate(ns); ++#endif + + if (oldsize != ns->size) + nvmet_ns_changed(ns->subsys, ns->nsid); +@@ -715,8 +724,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns) + goto out_unlock; + + ret = nvmet_bdev_ns_enable(ns); ++#ifdef HAVE_FS_HAS_KIOCB + if (ret == 
-ENOTBLK) + ret = nvmet_file_ns_enable(ns); ++#endif + if (ret) + goto out_unlock; + +@@ -1082,12 +1093,16 @@ static u16 nvmet_parse_io_cmd(struct nvm + + switch (req->ns->csi) { + case NVME_CSI_NVM: ++#ifdef HAVE_FS_HAS_KIOCB + if (req->ns->file) + return nvmet_file_parse_io_cmd(req); ++#endif + return nvmet_bdev_parse_io_cmd(req); + case NVME_CSI_ZNS: ++#ifdef HAVE_BIO_ADD_ZONE_APPEND_PAGE + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + return nvmet_bdev_zns_parse_io_cmd(req); ++#endif + return NVME_SC_INVALID_IO_CMD_SET; + default: + return NVME_SC_INVALID_IO_CMD_SET; +@@ -1230,6 +1245,63 @@ static struct pci_dev *nvmet_req_find_p2 + return radix_tree_lookup(&req->sq->ctrl->p2p_ns_map, req->ns->nsid); + } + ++#ifndef HAVE_SGL_FREE ++static void nvmet_free_sgl(struct scatterlist *sgl, unsigned int nents) ++{ ++ struct scatterlist *sg; ++ int count; ++ ++ if (!sgl || !nents) ++ return; ++ ++ for_each_sg(sgl, sg, nents, count) ++ __free_page(sg_page(sg)); ++ kfree(sgl); ++} ++#endif ++ ++#ifndef HAVE_SGL_ALLOC ++static int nvmet_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, ++ u32 length) ++{ ++ struct scatterlist *sg; ++ struct page *page; ++ unsigned int nent; ++ int i = 0; ++ ++ nent = DIV_ROUND_UP(length, PAGE_SIZE); ++ sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); ++ if (!sg) ++ goto out; ++ ++ sg_init_table(sg, nent); ++ ++ while (length) { ++ u32 page_len = min_t(u32, length, PAGE_SIZE); ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ goto out_free_pages; ++ ++ sg_set_page(&sg[i], page, page_len, 0); ++ length -= page_len; ++ i++; ++ } ++ *sgl = sg; ++ *nents = nent; ++ return 0; ++ ++out_free_pages: ++ while (i > 0) { ++ i--; ++ __free_page(sg_page(&sg[i])); ++ } ++ kfree(sg); ++out: ++ return NVME_SC_INTERNAL; ++} ++#endif ++ + int nvmet_req_alloc_sgls(struct nvmet_req *req) + { + struct pci_dev *p2p_dev = nvmet_req_find_p2p_dev(req); +@@ -1237,21 +1309,37 @@ int nvmet_req_alloc_sgls(struct nvmet_re + if (p2p_dev && !nvmet_req_alloc_p2pmem_sgls(p2p_dev, req)) + return 0; + ++#ifdef HAVE_SGL_ALLOC + req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL, + &req->sg_cnt); + if (unlikely(!req->sg)) + goto out; ++#else ++ if (nvmet_alloc_sgl(&req->sg, &req->sg_cnt, ++ nvmet_data_transfer_len(req))) ++ goto out; ++#endif + + if (req->metadata_len) { ++#ifdef HAVE_SGL_ALLOC + req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL, + &req->metadata_sg_cnt); + if (unlikely(!req->metadata_sg)) + goto out_free; ++#else ++ if (nvmet_alloc_sgl(&req->metadata_sg, &req->metadata_sg_cnt, ++ req->metadata_len)) ++ goto out_free; ++#endif + } + + return 0; + out_free: ++#ifdef HAVE_SGL_FREE + sgl_free(req->sg); ++#else ++ nvmet_free_sgl(req->sg, req->sg_cnt); ++#endif + out: + return -ENOMEM; + } +@@ -1265,9 +1353,15 @@ void nvmet_req_free_sgls(struct nvmet_re + pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg); + req->p2p_dev = NULL; + } else { ++#ifdef HAVE_SGL_FREE + sgl_free(req->sg); + if (req->metadata_sg) + sgl_free(req->metadata_sg); ++#else ++ nvmet_free_sgl(req->sg, req->sg_cnt); ++ if (req->metadata_sg) ++ nvmet_free_sgl(req->metadata_sg, req->metadata_sg_cnt); ++#endif + } + + req->sg = NULL; +@@ -1861,3 +1955,6 @@ module_init(nvmet_init); + module_exit(nvmet_exit); + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0222-BACKPORT-drivers-nvme-target-fc.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0222-BACKPORT-drivers-nvme-target-fc.c.patch new file mode 
100644 index 0000000..ecb5454 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0222-BACKPORT-drivers-nvme-target-fc.c.patch @@ -0,0 +1,109 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/fc.c + +Change-Id: Ia7bcaf75f873ab451c95bf960896b6123323a198 +--- + drivers/nvme/target/fc.c | 55 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 55 insertions(+) + +--- a/drivers/nvme/target/fc.c ++++ b/drivers/nvme/target/fc.c +@@ -2,6 +2,8 @@ + /* + * Copyright (c) 2016 Avago Technologies. All rights reserved. + */ ++#ifdef HAVE_LINUX_NVME_FC_DRIVER_H ++ + #ifdef pr_fmt + #undef pr_fmt + #endif +@@ -2081,11 +2083,37 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_f + { + struct scatterlist *sg; + unsigned int nent; ++#ifndef HAVE_SGL_ALLOC ++ struct page *page; ++ u32 page_len, length; ++ int i = 0; ++ ++ length = fod->req.transfer_len; ++ nent = DIV_ROUND_UP(length, PAGE_SIZE); ++ sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); ++#else + + sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent); ++#endif + if (!sg) + goto out; + ++#ifndef HAVE_SGL_ALLOC ++ sg_init_table(sg, nent); ++ ++ while (length) { ++ page_len = min_t(u32, length, PAGE_SIZE); ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ goto out_free_pages; ++ ++ sg_set_page(&sg[i], page, page_len, 0); ++ length -= page_len; ++ i++; ++ } ++ ++#endif + fod->data_sg = sg; + fod->data_sg_cnt = nent; + fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, +@@ -2096,6 +2124,17 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_f + + return 0; + ++#ifndef HAVE_SGL_ALLOC ++out_free_pages: ++ while (i > 0) { ++ i--; ++ __free_page(sg_page(&sg[i])); ++ } ++ ++ kfree(sg); ++ fod->data_sg = NULL; ++ fod->data_sg_cnt = 0; ++#endif + out: + return NVME_SC_INTERNAL; + } +@@ -2103,13 +2142,24 @@ out: + static void + nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) + { ++#ifndef HAVE_SGL_FREE ++ struct scatterlist *sg; ++ int count; ++ ++#endif + if (!fod->data_sg || !fod->data_sg_cnt) + return; + + fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, + ((fod->io_dir == NVMET_FCP_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE)); ++#ifdef HAVE_SGL_FREE + sgl_free(fod->data_sg); ++#else ++ for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count) ++ __free_page(sg_page(sg)); ++ kfree(fod->data_sg); ++#endif + fod->data_sg = NULL; + fod->data_sg_cnt = 0; + } +@@ -2946,3 +2996,8 @@ module_init(nvmet_fc_init_module); + module_exit(nvmet_fc_exit_module); + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif ++ ++#endif /* HAVE_LINUX_NVME_FC_DRIVER_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0223-BACKPORT-drivers-nvme-target-fcloop.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0223-BACKPORT-drivers-nvme-target-fcloop.c.patch new file mode 100644 index 0000000..2254487 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0223-BACKPORT-drivers-nvme-target-fcloop.c.patch @@ -0,0 +1,28 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/fcloop.c + +Change-Id: I37e9f085d666a79bd35744ec32fe1a623e283480 +--- + drivers/nvme/target/fcloop.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/nvme/target/fcloop.c ++++ b/drivers/nvme/target/fcloop.c +@@ -2,6 +2,8 @@ + /* + * Copyright (c) 2016 Avago Technologies. All rights reserved. 
+ */ ++#ifdef HAVE_LINUX_NVME_FC_DRIVER_H ++ + #ifdef pr_fmt + #undef pr_fmt + #endif +@@ -1652,3 +1654,8 @@ module_init(fcloop_init); + module_exit(fcloop_exit); + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif ++ ++#endif /* HAVE_LINUX_NVME_FC_DRIVER_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0224-BACKPORT-drivers-nvme-target-io-cmd-bdev.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0224-BACKPORT-drivers-nvme-target-io-cmd-bdev.c.patch new file mode 100644 index 0000000..f0487df --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0224-BACKPORT-drivers-nvme-target-io-cmd-bdev.c.patch @@ -0,0 +1,436 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/io-cmd-bdev.c + +Change-Id: I8520008cd8438ea5d04a2831716037b2f19fcb6e +--- + drivers/nvme/target/io-cmd-bdev.c | 209 +++++++++++++++++++++++++++++- + 1 file changed, 206 insertions(+), 3 deletions(-) + +--- a/drivers/nvme/target/io-cmd-bdev.c ++++ b/drivers/nvme/target/io-cmd-bdev.c +@@ -8,8 +8,12 @@ + #endif + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include ++#ifdef HAVE_BLK_INTEGRITY_H + #include ++#endif ++#ifdef HAVE_NET_MEMREMAP_H + #include ++#endif + #include + #include "nvmet.h" + +@@ -62,6 +66,8 @@ void nvmet_bdev_ns_disable(struct nvmet_ + + static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) + { ++#if defined(CONFIG_BLK_DEV_INTEGRITY) && \ ++ defined(HAVE_BLKDEV_BIO_INTEGRITY_BYTES) + struct blk_integrity *bi = bdev_get_integrity(ns->bdev); + + if (bi) { +@@ -74,6 +80,7 @@ static void nvmet_bdev_ns_enable_integri + /* Unsupported metadata type */ + ns->metadata_size = 0; + } ++#endif + } + + int nvmet_bdev_ns_enable(struct nvmet_ns *ns) +@@ -91,14 +98,20 @@ int nvmet_bdev_ns_enable(struct nvmet_ns + ns->bdev = NULL; + return ret; + } ++#ifdef HAVE_BDEV_NR_BYTES + ns->size = bdev_nr_bytes(ns->bdev); ++#else ++ ns->size = i_size_read(ns->bdev->bd_inode); ++#endif + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + ns->pi_type = 0; + ns->metadata_size = 0; +- if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10)) ++ if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + nvmet_bdev_ns_enable_integrity(ns); + ++#ifdef CONFIG_BLK_DEV_ZONED ++#ifdef HAVE_BIO_ADD_ZONE_APPEND_PAGE + if (bdev_is_zoned(ns->bdev)) { + if (!nvmet_bdev_zns_enable(ns)) { + nvmet_bdev_ns_disable(ns); +@@ -106,15 +119,22 @@ int nvmet_bdev_ns_enable(struct nvmet_ns + } + ns->csi = NVME_CSI_ZNS; + } ++#endif ++#endif + + return 0; + } + + void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns) + { ++#ifdef HAVE_BDEV_NR_BYTES + ns->size = bdev_nr_bytes(ns->bdev); ++#else ++ ns->size = i_size_read(ns->bdev->bd_inode); ++#endif + } + ++#ifdef HAVE_BLK_STATUS_T + u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) + { + u16 status = NVME_SC_SUCCESS; +@@ -171,16 +191,28 @@ u16 blk_to_nvme_status(struct nvmet_req + } + return status; + } ++#endif + ++#ifdef HAVE_BIO_ENDIO_1_PARAM + static void nvmet_bio_done(struct bio *bio) ++#else ++static void nvmet_bio_done(struct bio *bio, int error) ++#endif + { + struct nvmet_req *req = bio->bi_private; + ++#ifdef HAVE_BLK_STATUS_T + nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status)); ++#elif defined(HAVE_STRUCT_BIO_BI_ERROR) ++ nvmet_req_complete(req, bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); ++#else ++ nvmet_req_complete(req, error ? 
NVME_SC_INTERNAL | NVME_SC_DNR : 0); ++#endif + nvmet_req_bio_put(req, bio); + } + +-#ifdef CONFIG_BLK_DEV_INTEGRITY ++#if defined(CONFIG_BLK_DEV_INTEGRITY) && \ ++ defined(HAVE_BLKDEV_BIO_INTEGRITY_BYTES) + static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, + struct sg_mapping_iter *miter) + { +@@ -194,9 +226,14 @@ static int nvmet_bdev_alloc_bip(struct n + pr_err("Unable to locate bio_integrity\n"); + return -ENODEV; + } +- ++#ifdef HAVE_BIO_MAX_SEGS + bip = bio_integrity_alloc(bio, GFP_NOIO, + bio_max_segs(req->metadata_sg_cnt)); ++#else ++ bip = bio_integrity_alloc(bio, GFP_NOIO, ++ min_t(unsigned int, req->metadata_sg_cnt, BIO_MAX_PAGES)); ++#endif ++ + if (IS_ERR(bip)) { + pr_err("Unable to allocate bio_integrity_payload\n"); + return PTR_ERR(bip); +@@ -236,12 +273,19 @@ static int nvmet_bdev_alloc_bip(struct n + + static void nvmet_bdev_execute_rw(struct nvmet_req *req) + { ++#ifdef HAVE_BIO_MAX_SEGS + unsigned int sg_cnt = req->sg_cnt; ++#else ++ int sg_cnt = req->sg_cnt; ++#endif + struct bio *bio; + struct scatterlist *sg; + struct blk_plug plug; + sector_t sector; + int op, i, rc; ++#ifndef HAVE_BLK_TYPE_OP_IS_SYNC ++ int op_flags = 0; ++#endif + struct sg_mapping_iter prot_miter; + unsigned int iter_flags; + unsigned int total_len = nvmet_rw_data_len(req) + req->metadata_len; +@@ -255,9 +299,22 @@ static void nvmet_bdev_execute_rw(struct + } + + if (req->cmd->rw.opcode == nvme_cmd_write) { ++#ifdef HAVE_BLK_TYPE_OP_IS_SYNC ++#ifdef HAVE_REQ_IDLE + op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; ++#else ++ op = REQ_OP_WRITE | WRITE_ODIRECT; ++#endif ++#else ++ op = REQ_OP_WRITE; ++ op_flags = REQ_SYNC; ++#endif /* HAVE_BLK_TYPE_OP_IS_SYNC */ + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) ++#ifdef HAVE_BLK_TYPE_OP_IS_SYNC + op |= REQ_FUA; ++#else ++ op_flags |= REQ_FUA; ++#endif + iter_flags = SG_MITER_TO_SG; + } else { + op = REQ_OP_READ; +@@ -265,21 +322,65 @@ static void nvmet_bdev_execute_rw(struct + } + + if (is_pci_p2pdma_page(sg_page(req->sg))) ++#ifdef HAVE_BLK_TYPE_OP_IS_SYNC + op |= REQ_NOMERGE; ++#else ++ op_flags |= REQ_NOMERGE; ++#endif + + sector = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); + + if (nvmet_use_inline_bvec(req)) { + bio = &req->b.inline_bio; ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio_init(bio, req->ns->bdev, req->inline_bvec, ++ ARRAY_SIZE(req->inline_bvec), op); ++#else ++#ifdef HAVE_BIO_INIT_3_PARAMS + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); ++#else ++ bio_init(bio); ++ bio->bi_io_vec = req->inline_bvec; ++ bio->bi_max_vecs = ARRAY_SIZE(req->inline_bvec); ++#endif ++#endif + } else { ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), op, ++ GFP_KERNEL); ++#else ++#ifdef HAVE_BIO_MAX_SEGS + bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); ++#else ++ bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); ++#endif ++#endif + } ++#if defined HAVE_BIO_BI_DISK || defined HAVE_ENUM_BIO_REMAPPED ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio_set_dev(bio, req->ns->bdev); ++#endif ++#else ++ bio->bi_bdev = req->ns->bdev; ++#endif ++#ifdef HAVE_STRUCT_BIO_BI_ITER + bio->bi_iter.bi_sector = sector; ++#else ++ bio->bi_sector = sector; ++#endif + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; ++#ifdef HAVE_BLK_TYPE_OP_IS_SYNC ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio->bi_opf = op; ++#endif ++#else ++ bio_set_op_attrs(bio, op, op_flags); ++#endif ++ ++#ifdef HAVE_RH7_STRUCT_BIO_AUX ++ bio_init_aux(bio, &req->bio_aux); ++#endif + + blk_start_plug(&plug); + if (req->metadata_len) 
+@@ -300,13 +401,42 @@ static void nvmet_bdev_execute_rw(struct + } + } + ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), ++ op, GFP_KERNEL); ++#else ++#ifdef HAVE_BIO_MAX_SEGS + bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); ++#else ++ bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); ++#endif ++#endif ++#if defined HAVE_BIO_BI_DISK || defined HAVE_ENUM_BIO_REMAPPED ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio_set_dev(bio, req->ns->bdev); ++#endif ++#else ++ bio->bi_bdev = req->ns->bdev; ++#endif ++#ifdef HAVE_STRUCT_BIO_BI_ITER + bio->bi_iter.bi_sector = sector; ++#else ++ bio->bi_sector = sector; ++#endif ++#ifdef HAVE_BLK_TYPE_OP_IS_SYNC ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio->bi_opf = op; ++#endif ++#else ++ bio_set_op_attrs(bio, op, op_flags); ++#endif + + bio_chain(bio, prev); ++#ifdef HAVE_SUBMIT_BIO_1_PARAM + submit_bio(prev); ++#else ++ submit_bio(bio_data_dir(prev), prev); ++#endif + } + + sector += sg->length >> 9; +@@ -321,7 +451,11 @@ static void nvmet_bdev_execute_rw(struct + } + } + ++#ifdef HAVE_SUBMIT_BIO_1_PARAM + submit_bio(bio); ++#else ++ submit_bio(bio_data_dir(bio), bio); ++#endif + blk_finish_plug(&plug); + } + +@@ -332,18 +466,53 @@ static void nvmet_bdev_execute_flush(str + if (!nvmet_check_transfer_len(req, 0)) + return; + ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio_init(bio, req->ns->bdev, req->inline_bvec, ++ ARRAY_SIZE(req->inline_bvec), REQ_OP_WRITE | REQ_PREFLUSH); ++#else ++#ifdef HAVE_BIO_INIT_3_PARAMS + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); ++#else ++ bio_init(bio); ++ bio->bi_io_vec = req->inline_bvec; ++ bio->bi_max_vecs = ARRAY_SIZE(req->inline_bvec); ++#endif ++#endif ++#if defined HAVE_BIO_BI_DISK || defined HAVE_ENUM_BIO_REMAPPED ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio_set_dev(bio, req->ns->bdev); ++#endif ++#else ++ bio->bi_bdev = req->ns->bdev; ++#endif + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; ++#ifdef HAVE_BLK_TYPE_OP_IS_SYNC ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ++#endif ++#else ++ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); ++#endif + ++#ifdef HAVE_SUBMIT_BIO_1_PARAM + submit_bio(bio); ++#else ++ submit_bio(bio_data_dir(bio), bio); ++#endif + } + + u16 nvmet_bdev_flush(struct nvmet_req *req) + { ++#ifdef HAVE_BLKDEV_ISSUE_FLUSH_1_PARAM + if (blkdev_issue_flush(req->ns->bdev)) ++#else ++#ifdef HAVE_BLKDEV_ISSUE_FLUSH_2_PARAM ++ if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL)) ++#else ++ if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL, NULL)) ++#endif ++#endif + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; + } +@@ -354,10 +523,24 @@ static u16 nvmet_bdev_discard_range(stru + struct nvmet_ns *ns = req->ns; + int ret; + ++#ifdef HAVE___BLKDEV_ISSUE_DISCARD_5_PARAM ++ ret = __blkdev_issue_discard(ns->bdev, ++ nvmet_lba_to_sect(ns, range->slba), ++ le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), ++ GFP_KERNEL, bio); ++#else ++#ifdef HAVE___BLKDEV_ISSUE_DISCARD + ret = __blkdev_issue_discard(ns->bdev, + nvmet_lba_to_sect(ns, range->slba), + le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), + GFP_KERNEL, 0, bio); ++#else ++ ret = blkdev_issue_discard(ns->bdev, ++ nvmet_lba_to_sect(ns, range->slba), ++ le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), ++ GFP_KERNEL, 0); ++#endif ++#endif + if (ret && ret != -EOPNOTSUPP) { + req->error_slba = le64_to_cpu(range->slba); + return errno_to_nvme_status(req, ret); +@@ -389,7 +572,11 @@ static void nvmet_bdev_execute_discard(s + if (status) + bio_io_error(bio); + 
else ++#ifdef HAVE_SUBMIT_BIO_1_PARAM + submit_bio(bio); ++#else ++ submit_bio(bio_data_dir(bio), bio); ++#endif + } else { + nvmet_req_complete(req, status); + } +@@ -413,6 +600,7 @@ static void nvmet_bdev_execute_dsm(struc + } + } + ++#ifdef HAVE_BLKDEV_ISSUE_ZEROOUT + static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) + { + struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes; +@@ -428,16 +616,29 @@ static void nvmet_bdev_execute_write_zer + nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << + (req->ns->blksize_shift - 9)); + ++#ifdef CONFIG_COMPAT_IS_BLKDEV_ISSUE_ZEROOUT_HAS_FLAGS + ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, + GFP_KERNEL, &bio, 0); ++#else ++ if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, ++ GFP_KERNEL, &bio, true)) ++ ret = -EIO; ++ else ++ ret = 0; ++#endif + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; ++#ifdef HAVE_SUBMIT_BIO_1_PARAM + submit_bio(bio); ++#else ++ submit_bio(bio_data_dir(bio), bio); ++#endif + } else { + nvmet_req_complete(req, errno_to_nvme_status(req, ret)); + } + } ++#endif + + u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) + { +@@ -454,9 +655,11 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet + case nvme_cmd_dsm: + req->execute = nvmet_bdev_execute_dsm; + return 0; ++#ifdef HAVE_BLKDEV_ISSUE_ZEROOUT + case nvme_cmd_write_zeroes: + req->execute = nvmet_bdev_execute_write_zeroes; + return 0; ++#endif + default: + return nvmet_report_invalid_opcode(req); + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0225-BACKPORT-drivers-nvme-target-io-cmd-file.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0225-BACKPORT-drivers-nvme-target-io-cmd-file.c.patch new file mode 100644 index 0000000..75ceca4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0225-BACKPORT-drivers-nvme-target-io-cmd-file.c.patch @@ -0,0 +1,109 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/io-cmd-file.c + +Change-Id: I37d9f4cc3591c88ef8883dac99c10a9dcd5d3c49 +--- + drivers/nvme/target/io-cmd-file.c | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +--- a/drivers/nvme/target/io-cmd-file.c ++++ b/drivers/nvme/target/io-cmd-file.c +@@ -4,6 +4,7 @@ + * Copyright (c) 2017-2018 Western Digital Corporation or its + * affiliates. 
+ */ ++#ifdef HAVE_FS_HAS_KIOCB + #ifdef pr_fmt + #undef pr_fmt + #endif +@@ -22,8 +23,12 @@ int nvmet_file_ns_revalidate(struct nvme + struct kstat stat; + int ret; + ++#ifdef HAVE_VFS_GETATTR_HAS_4_PARAMS + ret = vfs_getattr(&ns->file->f_path, &stat, STATX_SIZE, + AT_STATX_FORCE_SYNC); ++#else ++ ret = vfs_getattr(&ns->file->f_path, &stat); ++#endif + if (!ret) + ns->size = stat.size; + return ret; +@@ -120,7 +125,11 @@ static ssize_t nvmet_file_submit_bvec(st + rw = READ; + } + ++#ifdef HAVE_IOV_ITER_IS_BVEC_SET + iov_iter_bvec(&iter, rw, req->f.bvec, nr_segs, count); ++#else ++ iov_iter_bvec(&iter, ITER_BVEC | rw, req->f.bvec, nr_segs, count); ++#endif + + iocb->ki_pos = pos; + iocb->ki_filp = req->ns->file; +@@ -129,7 +138,11 @@ static ssize_t nvmet_file_submit_bvec(st + return call_iter(iocb, &iter); + } + ++#ifdef HAVE_FS_KIOCB_KI_COMPLETE_2_ARG + static void nvmet_file_io_done(struct kiocb *iocb, long ret) ++#else ++static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) ++#endif + { + struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); + u16 status = NVME_SC_SUCCESS; +@@ -202,7 +215,9 @@ static bool nvmet_file_execute_io(struct + * A NULL ki_complete ask for synchronous execution, which we want + * for the IOCB_NOWAIT case. + */ ++#ifdef HAVE_IOCB_NOWAIT + if (!(ki_flags & IOCB_NOWAIT)) ++#endif + req->f.iocb.ki_complete = nvmet_file_io_done; + + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags); +@@ -210,6 +225,7 @@ static bool nvmet_file_execute_io(struct + switch (ret) { + case -EIOCBQUEUED: + return true; ++#ifdef HAVE_IOCB_NOWAIT + case -EAGAIN: + if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT))) + goto complete; +@@ -223,10 +239,15 @@ static bool nvmet_file_execute_io(struct + if ((ki_flags & IOCB_NOWAIT)) + return false; + break; ++#endif + } + + complete: ++#ifdef HAVE_FS_KIOCB_KI_COMPLETE_2_ARG + nvmet_file_io_done(&req->f.iocb, ret); ++#else ++ nvmet_file_io_done(&req->f.iocb, ret, 0); ++#endif + return true; + } + +@@ -269,10 +290,14 @@ static void nvmet_file_execute_rw(struct + req->f.mpool_alloc = false; + + if (req->ns->buffered_io) { ++#ifdef HAVE_IOCB_NOWAIT + if (likely(!req->f.mpool_alloc) && ++#ifdef HAVE_FMODE_NOWAIT + (req->ns->file->f_mode & FMODE_NOWAIT) && ++#endif + nvmet_file_execute_io(req, IOCB_NOWAIT)) + return; ++#endif + nvmet_file_submit_buffered_io(req); + } else + nvmet_file_execute_io(req, 0); +@@ -408,3 +433,4 @@ u16 nvmet_file_parse_io_cmd(struct nvmet + return nvmet_report_invalid_opcode(req); + } + } ++#endif /* HAVE_FS_HAS_KIOCB */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0227-BACKPORT-drivers-nvme-target-nvmet.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0227-BACKPORT-drivers-nvme-target-nvmet.h.patch new file mode 100644 index 0000000..2392b41 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0227-BACKPORT-drivers-nvme-target-nvmet.h.patch @@ -0,0 +1,100 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/nvmet.h + +Change-Id: Idc76180a8600aeb79485137a311462911cfbd50d +--- + drivers/nvme/target/nvmet.h | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +--- a/drivers/nvme/target/nvmet.h ++++ b/drivers/nvme/target/nvmet.h +@@ -22,6 +22,11 @@ + #include + #include + #include ++#include ++ ++#ifdef HAVE_BLK_INTEGRITY_H ++#define HAVE_BLKDEV_BIO_INTEGRITY_BYTES ++#endif + + #define NVMET_DEFAULT_VS NVME_VS(1, 3, 0) + +@@ -401,6 +406,9 @@ struct nvmet_req { + struct nvmet_ns *ns; + struct scatterlist *sg; + struct scatterlist *metadata_sg; 
++#ifdef HAVE_RH7_STRUCT_BIO_AUX ++ struct bio_aux bio_aux; ++#endif + struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + union { + struct { +@@ -408,7 +416,9 @@ struct nvmet_req { + } b; + struct { + bool mpool_alloc; ++#ifdef HAVE_FS_HAS_KIOCB + struct kiocb iocb; ++#endif + struct bio_vec *bvec; + struct work_struct work; + } f; +@@ -492,8 +502,12 @@ void nvmet_stop_keep_alive_timer(struct + u16 nvmet_parse_connect_cmd(struct nvmet_req *req); + void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); + u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); ++#ifdef HAVE_FS_HAS_KIOCB + u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); ++#endif ++#ifdef HAVE_BIO_ADD_ZONE_APPEND_PAGE + u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req); ++#endif + u16 nvmet_parse_admin_cmd(struct nvmet_req *req); + u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); + u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); +@@ -565,8 +579,13 @@ void nvmet_offload_ctx_configfs_del(stru + void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); + void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port); + ++#ifdef CONFIG_COMPAT_SCATTERLIST_SG_PCOPY_TO_BUFFER + u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len); ++#else ++u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, void *buf, ++ size_t len); ++#endif + u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, + size_t len); + u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len); +@@ -621,20 +640,30 @@ extern struct rw_semaphore nvmet_ana_sem + bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn); + + int nvmet_bdev_ns_enable(struct nvmet_ns *ns); ++#ifdef HAVE_FS_HAS_KIOCB + int nvmet_file_ns_enable(struct nvmet_ns *ns); ++#endif + void nvmet_bdev_ns_disable(struct nvmet_ns *ns); ++#ifdef HAVE_FS_HAS_KIOCB + void nvmet_file_ns_disable(struct nvmet_ns *ns); ++#endif + u16 nvmet_bdev_flush(struct nvmet_req *req); + u16 nvmet_file_flush(struct nvmet_req *req); + void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid); + void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns); ++#ifdef HAVE_FS_HAS_KIOCB + int nvmet_file_ns_revalidate(struct nvmet_ns *ns); ++#endif + void nvmet_ns_revalidate(struct nvmet_ns *ns); ++#ifdef HAVE_BLK_STATUS_T + u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts); ++#endif + ++#ifdef HAVE_BIO_ADD_ZONE_APPEND_PAGE + bool nvmet_bdev_zns_enable(struct nvmet_ns *ns); + void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req); + void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req); ++#endif + void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req); + void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req); + void nvmet_bdev_execute_zone_append(struct nvmet_req *req); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0228-BACKPORT-drivers-nvme-target-passthru.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0228-BACKPORT-drivers-nvme-target-passthru.c.patch new file mode 100644 index 0000000..affaec2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0228-BACKPORT-drivers-nvme-target-passthru.c.patch @@ -0,0 +1,137 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/passthru.c + +Change-Id: I2dc19fe64118e81d2e4d99860b72b80ce82ea1d4 +--- + drivers/nvme/target/passthru.c | 64 ++++++++++++++++++++++++++++++++++ + 1 file changed, 64 insertions(+) + +--- a/drivers/nvme/target/passthru.c ++++ b/drivers/nvme/target/passthru.c 
+@@ -112,8 +112,13 @@ static u16 nvmet_passthru_override_id_ct + * nvmet_passthru_map_sg is limitted to using a single bio so limit + * the mdts based on BIO_MAX_VECS as well + */ ++#ifdef HAVE_BIO_MAX_VECS + max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9), + max_hw_sectors); ++#else ++ max_hw_sectors = min_not_zero(((uint32_t)BIO_MAX_PAGES) << (PAGE_SHIFT - 9), ++ max_hw_sectors); ++#endif + + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + +@@ -218,9 +223,18 @@ static void nvmet_passthru_execute_cmd_w + { + struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); + struct request *rq = req->p.rq; ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) ++ u16 status; ++#else + int status; ++#endif + ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) + status = nvme_execute_passthru_rq(rq); ++#else ++ nvme_execute_passthru_rq(rq); ++ status = nvme_req(rq)->status; ++#endif + + if (status == NVME_SC_SUCCESS && + req->cmd->common.opcode == nvme_admin_identify) { +@@ -235,8 +249,12 @@ static void nvmet_passthru_execute_cmd_w + nvmet_passthru_override_id_descs(req); + break; + } ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) + } else if (status < 0) + status = NVME_SC_INTERNAL; ++#else ++ } ++#endif + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, status); +@@ -258,18 +276,41 @@ static int nvmet_passthru_map_sg(struct + struct scatterlist *sg; + struct bio *bio; + int i; ++#ifndef HAVE_BLK_RQ_BIO_PREP ++ int ret; ++#endif + ++#ifdef HAVE_BIO_MAX_VECS + if (req->sg_cnt > BIO_MAX_VECS) ++#else ++ if (req->sg_cnt > BIO_MAX_PAGES) ++#endif + return -EINVAL; + + if (nvmet_use_inline_bvec(req)) { + bio = &req->p.inline_bio; ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio_init(bio, NULL, req->inline_bvec, ++ ARRAY_SIZE(req->inline_bvec), req_op(rq)); ++#else + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); ++#endif + } else { ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio = bio_alloc(NULL, bio_max_segs(req->sg_cnt), req_op(rq), ++ GFP_KERNEL); ++#else ++#ifdef HAVE_BIO_MAX_SEGS + bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt)); ++#else ++ bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES)); ++#endif ++#endif + bio->bi_end_io = bio_put; + } ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio->bi_opf = req_op(rq); ++#endif + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, +@@ -279,7 +320,15 @@ static int nvmet_passthru_map_sg(struct + } + } + ++#ifdef HAVE_BLK_RQ_BIO_PREP + blk_rq_bio_prep(rq, bio, req->sg_cnt); ++#else ++ ret = blk_rq_append_bio(rq, &bio); ++ if (unlikely(ret)) { ++ bio_put(bio); ++ return ret; ++ } ++#endif + + return 0; + } +@@ -342,7 +391,22 @@ static void nvmet_passthru_execute_cmd(s + schedule_work(&req->p.work); + } else { + rq->end_io_data = req; ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_2_PARAM ++ rq->end_io = nvmet_passthru_req_done; ++ blk_execute_rq_nowait(rq, false); ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_5_PARAM ++ blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0, ++ nvmet_passthru_req_done); ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_3_PARAM + blk_execute_rq_nowait(rq, false, nvmet_passthru_req_done); ++#else ++ blk_execute_rq_nowait(ns ? 
ns->disk : NULL, rq, 0, ++ nvmet_passthru_req_done); ++#endif ++#endif ++#endif + } + + if (ns) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0229-BACKPORT-drivers-nvme-target-rdma.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0229-BACKPORT-drivers-nvme-target-rdma.c.patch new file mode 100644 index 0000000..789a9f5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0229-BACKPORT-drivers-nvme-target-rdma.c.patch @@ -0,0 +1,90 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/rdma.c + +Change-Id: Ibae525e055b3a9213a8d8a2bc19a7c46df42fa4a +--- + drivers/nvme/target/rdma.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +--- a/drivers/nvme/target/rdma.c ++++ b/drivers/nvme/target/rdma.c +@@ -8,7 +8,9 @@ + #endif + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include ++#ifdef HAVE_BLK_INTEGRITY_H + #include ++#endif + #include + #include + #include +@@ -166,8 +168,13 @@ static int nvmet_rdma_srq_size = 1024; + module_param_cb(srq_size, &srq_size_ops, &nvmet_rdma_srq_size, 0644); + MODULE_PARM_DESC(srq_size, "set Shared Receive Queue (SRQ) size, should >= 256 (default: 1024)"); + ++#ifdef HAVE_PARAM_OPS_ULLONG + static unsigned long long nvmet_rdma_offload_mem_start = 0; + module_param_named(offload_mem_start, nvmet_rdma_offload_mem_start, ullong, 0444); ++#else ++static unsigned long nvmet_rdma_offload_mem_start = 0; ++module_param_named(offload_mem_start, nvmet_rdma_offload_mem_start, ulong, 0444); ++#endif + MODULE_PARM_DESC(offload_mem_start, + "Start address of the memory dedicated for P2P data transfer. If not set, the driver will allocate 1MB staging buffer per offload context." + "Using bigger staging buffer will improve performance. Must be contiguous and aligned to" __stringify(PAGE_SIZE) "(default:0)"); +@@ -231,6 +238,13 @@ static int srq_size_set(const char *val, + return param_set_int(val, kp); + } + ++#if !defined HAVE_PUT_UNALIGNED_LE24 && !defined HAVE_PUT_UNALIGNED_LE24_ASM_GENERIC ++static inline u32 get_unaligned_le24(const u8 *p) ++{ ++ return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; ++} ++#endif ++ + static int num_pages(int len) + { + return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); +@@ -638,7 +652,13 @@ static void nvmet_rdma_set_sig_domain(st + { + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; ++#ifdef CONFIG_BLK_DEV_INTEGRITY ++#ifdef HAVE_BLK_INTEGRITY_SECTOR_SIZE ++ domain->sig.dif.pi_interval = 1 << bi->sector_size; ++#else + domain->sig.dif.pi_interval = 1 << bi->interval_exp; ++#endif ++#endif + domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); + if (control & NVME_RW_PRINFO_PRCHK_REF) + domain->sig.dif.ref_remap = true; +@@ -2074,6 +2094,7 @@ static void nvmet_rdma_remove_port(struc + static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, + struct nvmet_port *nport, char *traddr) + { ++#ifdef HAVE_INET_ADDR_IS_ANY + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; + +@@ -2087,6 +2108,9 @@ static void nvmet_rdma_disc_port_addr(st + } else { + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); + } ++#else ++ memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); ++#endif + } + + static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) +@@ -2274,6 +2298,9 @@ module_init(nvmet_rdma_init); + module_exit(nvmet_rdma_exit); + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ + + #include "rdma_offload.c" diff 
--git a/src/mlnx-ofa_kernel-5.8/backports/0230-BACKPORT-drivers-nvme-target-rdma_offload.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0230-BACKPORT-drivers-nvme-target-rdma_offload.c.patch new file mode 100644 index 0000000..847f9d5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0230-BACKPORT-drivers-nvme-target-rdma_offload.c.patch @@ -0,0 +1,61 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/rdma_offload.c + +Change-Id: I3a988682804c66c45312d5513ec860cd362bb47c +--- + drivers/nvme/target/rdma_offload.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +--- a/drivers/nvme/target/rdma_offload.c ++++ b/drivers/nvme/target/rdma_offload.c +@@ -16,6 +16,10 @@ + */ + + #include "rdma_offload.h" ++#if defined(CONFIG_PPC) && defined(HAVE_PNV_PCI_SET_P2P) ++#include ++#endif ++#include + + static unsigned int + __nvmet_rdma_peer_to_peer_sqe_inline_size(struct ib_nvmf_caps *nvmf_caps, +@@ -363,6 +367,11 @@ static u16 nvmet_rdma_install_offload_qu + + static void nvmet_rdma_free_be_ctrl(struct nvmet_rdma_backend_ctrl *be_ctrl) + { ++#if defined(CONFIG_PPC) && defined(HAVE_PNV_PCI_SET_P2P) ++ pnv_pci_set_p2p(container_of(be_ctrl->xrq->ndev->device->dev.parent, ++ struct pci_dev, dev), ++ be_ctrl->pdev, OPAL_PCI_P2P_STORE); ++#endif + lockdep_assert_held(&be_ctrl->xrq->be_mutex); + list_del_init(&be_ctrl->entry); + be_ctrl->xrq->nr_be_ctrls--; +@@ -536,6 +545,15 @@ nvmet_rdma_create_be_ctrl(struct nvmet_r + } + mutex_unlock(&xrq->be_mutex); + ++#if defined(CONFIG_PPC) && defined(HAVE_PNV_PCI_SET_P2P) ++ err = pnv_pci_set_p2p(container_of(xrq->ndev->device->dev.parent, ++ struct pci_dev, dev), ++ ns->pdev, ++ OPAL_PCI_P2P_STORE | OPAL_PCI_P2P_ENABLE); ++ if (err) ++ return ERR_PTR(err); ++#endif ++ + be_ctrl = kzalloc(sizeof(*be_ctrl), GFP_KERNEL); + if (!be_ctrl) { + err = -ENOMEM; +@@ -636,6 +654,11 @@ out_free_be_ctrl: + kref_put(&xrq->ref, nvmet_rdma_destroy_xrq); + kfree(be_ctrl); + out_err: ++#if defined(CONFIG_PPC) && defined(HAVE_PNV_PCI_SET_P2P) ++ pnv_pci_set_p2p(container_of(xrq->ndev->device->dev.parent, ++ struct pci_dev, dev), ++ ns->pdev, OPAL_PCI_P2P_STORE); ++#endif + return ERR_PTR(err); + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0231-BACKPORT-drivers-nvme-target-tcp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0231-BACKPORT-drivers-nvme-target-tcp.c.patch new file mode 100644 index 0000000..5fce207 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0231-BACKPORT-drivers-nvme-target-tcp.c.patch @@ -0,0 +1,96 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/tcp.c + +Change-Id: I9ecfbfc638e6e80645fdde805c1530fef6cb5a19 +--- + drivers/nvme/target/tcp.c | 40 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 40 insertions(+) + +--- a/drivers/nvme/target/tcp.c ++++ b/drivers/nvme/target/tcp.c +@@ -3,6 +3,9 @@ + * NVMe over Fabrics TCP target. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. 
+ */ ++#ifdef pr_fmt ++#undef pr_fmt ++#endif + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include + #include +@@ -1528,14 +1531,27 @@ static int nvmet_tcp_set_queue_sock(stru + struct socket *sock = queue->sock; + struct inet_sock *inet = inet_sk(sock->sk); + int ret; ++#ifndef HAVE_KERNEL_GETSOCKNAME_2_PARAMS ++ int len; ++#endif + ++#ifdef HAVE_KERNEL_GETSOCKNAME_2_PARAMS + ret = kernel_getsockname(sock, + (struct sockaddr *)&queue->sockaddr); ++#else ++ ret = kernel_getsockname(sock, ++ (struct sockaddr *)&queue->sockaddr, &len); ++#endif + if (ret < 0) + return ret; + ++#ifdef HAVE_KERNEL_GETSOCKNAME_2_PARAMS + ret = kernel_getpeername(sock, + (struct sockaddr *)&queue->sockaddr_peer); ++#else ++ ret = kernel_getpeername(sock, ++ (struct sockaddr *)&queue->sockaddr_peer, &len); ++#endif + if (ret < 0) + return ret; + +@@ -1550,8 +1566,19 @@ static int nvmet_tcp_set_queue_sock(stru + sock_set_priority(sock->sk, so_priority); + + /* Set socket type of service */ ++#ifdef HAVE_IP_SOCK_SET_TOS + if (inet->rcv_tos > 0) + ip_sock_set_tos(sock->sk, inet->rcv_tos); ++#else ++ if (inet->rcv_tos > 0) { ++ int tos = inet->rcv_tos; ++ ++ ret = kernel_setsockopt(sock, SOL_IP, IP_TOS, ++ (char *)&tos, sizeof(tos)); ++ if (ret) ++ return ret; ++ } ++#endif + + ret = 0; + write_lock_bh(&sock->sk->sk_callback_lock); +@@ -1680,6 +1707,9 @@ static int nvmet_tcp_add_port(struct nvm + struct nvmet_tcp_port *port; + __kernel_sa_family_t af; + int ret; ++#ifndef HAVE_TCP_SOCK_SET_NODELAY ++ int opt; ++#endif + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) +@@ -1723,7 +1753,17 @@ static int nvmet_tcp_add_port(struct nvm + port->data_ready = port->sock->sk->sk_data_ready; + port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + sock_set_reuseaddr(port->sock->sk); ++#ifdef HAVE_TCP_SOCK_SET_NODELAY + tcp_sock_set_nodelay(port->sock->sk); ++#else ++ opt = 1; ++ ret = kernel_setsockopt(port->sock, IPPROTO_TCP, ++ TCP_NODELAY, (char *)&opt, sizeof(opt)); ++ if (ret) { ++ pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); ++ goto err_sock; ++ } ++#endif + if (so_priority > 0) + sock_set_priority(port->sock->sk, so_priority); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0232-BACKPORT-drivers-nvme-target-trace.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0232-BACKPORT-drivers-nvme-target-trace.h.patch new file mode 100644 index 0000000..ac9b616 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0232-BACKPORT-drivers-nvme-target-trace.h.patch @@ -0,0 +1,21 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/nvme/target/trace.h + +Change-Id: I0d2c17701dad010487b8c45e5e17893737b9eefb +--- + drivers/nvme/target/trace.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/nvme/target/trace.h ++++ b/drivers/nvme/target/trace.h +@@ -53,8 +53,8 @@ static inline void __assign_req_name(cha + return; + } + +- strncpy(name, req->ns->device_path, +- min_t(size_t, DISK_NAME_LEN, strlen(req->ns->device_path))); ++ // XXX Fix RH 8 compilation warning ++ strncpy(name, req->ns->device_path, DISK_NAME_LEN); + } + #endif + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0233-BACKPORT-drivers-scsi-scsi_priv.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0233-BACKPORT-drivers-scsi-scsi_priv.h.patch new file mode 100644 index 0000000..8cfc03e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0233-BACKPORT-drivers-scsi-scsi_priv.h.patch @@ -0,0 +1,41 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/scsi/scsi_priv.h + +Change-Id: 
I6ed55ccec6559a8d38a9e263e6774b5cb1ae6a40 +--- + drivers/scsi/scsi_priv.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/scsi/scsi_priv.h ++++ b/drivers/scsi/scsi_priv.h +@@ -4,7 +4,9 @@ + + #include + #include ++#ifdef HAVE_SCSI_DEVICE_BUDGET_MAP + #include ++#endif + + struct bsg_device; + struct request_queue; +@@ -181,7 +183,9 @@ static inline void scsi_dh_release_devic + + struct bsg_device *scsi_bsg_register_queue(struct scsi_device *sdev); + ++#ifdef HAVE_SCSI_DEVICE_BUDGET_MAP + extern int scsi_device_max_queue_depth(struct scsi_device *sdev); ++#endif + + /* + * internal scsi timeout functions: for use by mid-layer and transport +@@ -189,5 +193,10 @@ extern int scsi_device_max_queue_depth(s + */ + + #define SCSI_DEVICE_BLOCK_MAX_TIMEOUT 600 /* units in seconds */ ++#ifndef HAVE_SCSI_DEVICE_SCSI_INTERNAL_DEVICE_BLOCK ++extern int scsi_internal_device_block(struct scsi_device *sdev); ++extern int scsi_internal_device_unblock(struct scsi_device *sdev, ++ enum scsi_device_state new_state); ++#endif + + #endif /* _SCSI_PRIV_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0234-BACKPORT-drivers-scsi-scsi_transport_srp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0234-BACKPORT-drivers-scsi-scsi_transport_srp.c.patch new file mode 100644 index 0000000..c7e1a58 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0234-BACKPORT-drivers-scsi-scsi_transport_srp.c.patch @@ -0,0 +1,54 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/scsi/scsi_transport_srp.c + +Change-Id: I93160ed19ed63a9dd5930d1c966049626554b96a +--- + drivers/scsi/scsi_transport_srp.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/drivers/scsi/scsi_transport_srp.c ++++ b/drivers/scsi/scsi_transport_srp.c +@@ -569,12 +569,21 @@ int srp_reconnect_rport(struct srp_rport + * invoking scsi_target_unblock() won't change the state of + * these devices into running so do that explicitly. + */ ++#ifdef HAVE_SCSI_DEVICE_STATE_MUTEX + shost_for_each_device(sdev, shost) { + mutex_lock(&sdev->state_mutex); ++#else ++ spin_lock_irq(shost->host_lock); ++ __shost_for_each_device(sdev, shost) ++#endif + if (sdev->sdev_state == SDEV_OFFLINE) + sdev->sdev_state = SDEV_RUNNING; ++#ifdef HAVE_SCSI_DEVICE_STATE_MUTEX + mutex_unlock(&sdev->state_mutex); + } ++#else ++ spin_unlock_irq(shost->host_lock); ++#endif + } else if (rport->state == SRP_RPORT_RUNNING) { + /* + * srp_reconnect_rport() has been invoked with fast_io_fail +@@ -616,7 +625,11 @@ enum blk_eh_timer_return srp_timed_out(s + return rport && rport->fast_io_fail_tmo < 0 && + rport->dev_loss_tmo < 0 && + i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? 
++#ifdef HAVE_BLK_EH_DONE + BLK_EH_RESET_TIMER : BLK_EH_DONE; ++#else ++ BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; ++#endif + } + EXPORT_SYMBOL(srp_timed_out); + +@@ -900,6 +913,9 @@ static void __exit srp_transport_exit(vo + MODULE_AUTHOR("FUJITA Tomonori"); + MODULE_DESCRIPTION("SRP Transport Attributes"); + MODULE_LICENSE("GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + module_init(srp_transport_init); + module_exit(srp_transport_exit); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0235-BACKPORT-include-linux-auxiliary_bus.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0235-BACKPORT-include-linux-auxiliary_bus.h.patch new file mode 100644 index 0000000..95ca446 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0235-BACKPORT-include-linux-auxiliary_bus.h.patch @@ -0,0 +1,42 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/auxiliary_bus.h + +Change-Id: I1285585cd9afe374d546ae29dc7253c910b07c18 +--- + include/linux/auxiliary_bus.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/include/linux/auxiliary_bus.h ++++ b/include/linux/auxiliary_bus.h +@@ -8,6 +8,8 @@ + #ifndef _AUXILIARY_BUS_H_ + #define _AUXILIARY_BUS_H_ + ++#include "../../compat/config.h" ++ + #include + #include + +@@ -244,8 +246,22 @@ void auxiliary_driver_unregister(struct + #define module_auxiliary_driver(__auxiliary_driver) \ + module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister) + ++#ifdef HAVE_LINUX_DEVICE_BUS_H + struct auxiliary_device *auxiliary_find_device(struct device *start, + const void *data, + int (*match)(struct device *dev, const void *data)); ++#else ++#ifdef HAVE_BUS_FIND_DEVICE_GET_CONST ++struct auxiliary_device * ++auxiliary_find_device(struct device *start, ++ const void *data, ++ int (*match)(struct device *dev, const void *data)); ++#else ++struct auxiliary_device * ++auxiliary_find_device(struct device *start, ++ void *data, ++ int (*match)(struct device *dev, void *data)); ++#endif /* HAVE_BUS_FIND_DEVICE_GET_CONST */ ++#endif /* HAVE_LINUX_DEVICE_BUS_H */ + + #endif /* _AUXILIARY_BUS_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0236-BACKPORT-include-linux-mlx5-driver.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0236-BACKPORT-include-linux-mlx5-driver.h.patch new file mode 100644 index 0000000..f2dcd63 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0236-BACKPORT-include-linux-mlx5-driver.h.patch @@ -0,0 +1,129 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: include/linux/mlx5/driver.h + +Change-Id: Idcc3f0c83f3d8c856b411f26e8d4c3a7b9793253 +--- + include/linux/mlx5/driver.h | 31 +++++++++++++++++++++++++++++-- + 1 file changed, 29 insertions(+), 2 deletions(-) + +--- a/include/linux/mlx5/driver.h ++++ b/include/linux/mlx5/driver.h +@@ -33,6 +33,8 @@ + #ifndef MLX5_DRIVER_H + #define MLX5_DRIVER_H + ++#include "../../../compat/config.h" ++ + #include + #include + #include +@@ -56,6 +58,9 @@ + #include + #include + #include ++#include ++#include ++ + + #define MLX5_ADEV_NAME "mlx5_core" + +@@ -457,9 +462,11 @@ struct mlx5_core_health { + unsigned long flags; + struct mlx5_fw_crdump *crdump; + struct work_struct fatal_report_work; ++#ifdef HAVE_DEVLINK_HEALTH_REPORT_SUPPORT + struct work_struct report_work; + struct devlink_health_reporter *fw_reporter; + struct devlink_health_reporter *fw_fatal_reporter; ++#endif /* HAVE_DEVLINK_HEALTH_REPORT_SUPPORT */ + struct delayed_work update_fw_log_ts_work; + /* failed recoveries in sequence*/ + u32 failed_in_seq; +@@ -529,6 +536,7 @@ struct 
mlx5_core_sriov { + struct kobject *groups_config; + struct kobject node_guid_kobj; + struct mlx5_sriov_vf *vfs; ++ bool probe_vf; + }; + + struct mlx5_fc_pool { +@@ -622,6 +630,12 @@ enum { + * creation/deletion on drivers rescan. Unset during device attach. + */ + MLX5_PRIV_FLAGS_DETACH = 1 << 2, ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ /* Distinguish between mlx5e_probe/remove called by module init/cleanup ++ * and called by other flows which can already hold devlink lock ++ */ ++ MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW = 1 << 3, ++#endif + }; + + struct mlx5_adev { +@@ -661,6 +675,9 @@ struct mlx5_priv { + /* IRQ table valid only for real pci devices PF or VF */ + struct mlx5_irq_table *irq_table; + struct mlx5_eq_table *eq_table; ++#ifndef HAVE_PCI_IRQ_API ++ struct msix_entry *msix_arr; ++#endif + + /* pages stuff */ + struct mlx5_nb pg_nb; +@@ -871,9 +888,11 @@ struct mlx5_clock { + struct mlx5_nb pps_nb; + seqlock_t lock; + struct hwtstamp_config hwtstamp_config; ++#if (defined (CONFIG_PTP_1588_CLOCK) || defined(CONFIG_PTP_1588_CLOCK_MODULE)) + struct ptp_clock *ptp; + struct ptp_clock_info ptp_info; + struct mlx5_pps pps_info; ++#endif + struct mlx5_timer timer; + }; + +@@ -965,7 +984,9 @@ struct mlx5_core_dev { + enum mlx5_device_state state; + /* sync interface state */ + struct mutex intf_state_mutex; ++#ifdef HAVE_LOCKDEP_UNREGISTER_KEY + struct lock_class_key lock_key; ++#endif + unsigned long intf_state; + struct mlx5_priv priv; + struct mlx5_profile profile; +@@ -1047,8 +1068,13 @@ struct mlx5_cmd_work_ent { + int page_queue; + u8 status; + u8 token; ++#ifdef HAVE_KTIME_GET_NS + u64 ts1; + u64 ts2; ++#else ++ struct timespec ts1; ++ struct timespec ts2; ++#endif + u16 op; + bool polling; + /* Track the max comp handlers */ +@@ -1183,7 +1209,7 @@ void mlx5_cmd_allowed_opcode(struct mlx5 + struct mlx5_async_ctx { + struct mlx5_core_dev *dev; + atomic_t num_inflight; +- struct wait_queue_head wait; ++ wait_queue_head_t wait; + }; + + struct mlx5_async_work; +@@ -1504,9 +1530,10 @@ bool mlx5_is_roce_on(struct mlx5_core_de + + static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) + { ++#if defined(HAVE_DEVLINK_PARAM) && defined(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE) + if (MLX5_CAP_GEN(dev, roce_rw_supported)) + return MLX5_CAP_GEN(dev, roce); +- ++#endif + /* If RoCE cap is read-only in FW, get RoCE state from devlink + * in order to support RoCE enable/disable feature + */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0237-BACKPORT-include-linux-mlx5-eswitch.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0237-BACKPORT-include-linux-mlx5-eswitch.h.patch new file mode 100644 index 0000000..85220ad --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0237-BACKPORT-include-linux-mlx5-eswitch.h.patch @@ -0,0 +1,22 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/mlx5/eswitch.h + +Change-Id: If57e18057ae3accbc6d768795fdd693383e1e947 +--- + include/linux/mlx5/eswitch.h | 5 ----- + 1 file changed, 5 deletions(-) + +--- a/include/linux/mlx5/eswitch.h ++++ b/include/linux/mlx5/eswitch.h +@@ -200,11 +200,6 @@ static inline struct mlx5_core_dev *mlx5 + return NULL; + } + +-static bool mlx5_eswitch_is_manager_vport(const struct mlx5_eswitch *esw, +- u16 vport_num) +-{ +- return false; +-} + #endif /* CONFIG_MLX5_ESWITCH */ + + static inline bool is_mdev_legacy_mode(struct mlx5_core_dev *dev) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0238-BACKPORT-include-linux-mlx5-fs.h.patch 
b/src/mlnx-ofa_kernel-5.8/backports/0238-BACKPORT-include-linux-mlx5-fs.h.patch new file mode 100644 index 0000000..a7addb3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0238-BACKPORT-include-linux-mlx5-fs.h.patch @@ -0,0 +1,18 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/mlx5/fs.h + +Change-Id: Iaa7d71aeb7d9bfd28db3c62b9c168aa2aadce726 +--- + include/linux/mlx5/fs.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/include/linux/mlx5/fs.h ++++ b/include/linux/mlx5/fs.h +@@ -35,6 +35,7 @@ + + #include + #include ++#include + + #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0239-BACKPORT-include-linux-mlx5-port.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0239-BACKPORT-include-linux-mlx5-port.h.patch new file mode 100644 index 0000000..9f8da21 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0239-BACKPORT-include-linux-mlx5-port.h.patch @@ -0,0 +1,39 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/mlx5/port.h + +Change-Id: Icc1f275d7484cb434072a75bdbef47871908b97b +--- + include/linux/mlx5/port.h | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/include/linux/mlx5/port.h ++++ b/include/linux/mlx5/port.h +@@ -33,6 +33,7 @@ + #ifndef __MLX5_PORT_H__ + #define __MLX5_PORT_H__ + ++#include "../../../compat/config.h" + #include + + enum mlx5_beacon_duration { +@@ -45,7 +46,9 @@ enum mlx5_module_id { + MLX5_MODULE_ID_QSFP = 0xC, + MLX5_MODULE_ID_QSFP_PLUS = 0xD, + MLX5_MODULE_ID_QSFP28 = 0x11, ++#ifdef HAVE_GET_MODULE_EEPROM_BY_PAGE + MLX5_MODULE_ID_DSFP = 0x1B, ++#endif + }; + + enum mlx5_an_status { +@@ -225,8 +228,10 @@ void mlx5_query_port_fcs(struct mlx5_cor + int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num); + int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, + u16 offset, u16 size, u8 *data); ++#ifdef HAVE_GET_MODULE_EEPROM_BY_PAGE + int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, + struct mlx5_module_eeprom_query_params *params, u8 *data); ++#endif + + int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); + int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0240-BACKPORT-include-linux-mod_devicetable.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0240-BACKPORT-include-linux-mod_devicetable.h.patch new file mode 100644 index 0000000..bafdf00 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0240-BACKPORT-include-linux-mod_devicetable.h.patch @@ -0,0 +1,29 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/mod_devicetable.h + +Change-Id: Ia771e29dc3aa903108943f2ff62c4a36951caf4b +--- + include/linux/mod_devicetable.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/include/linux/mod_devicetable.h ++++ b/include/linux/mod_devicetable.h +@@ -8,6 +8,8 @@ + #ifndef LINUX_MOD_DEVICETABLE_H + #define LINUX_MOD_DEVICETABLE_H + ++#include "../../compat/config.h" ++ + #ifdef __KERNEL__ + #include + #include +@@ -45,7 +47,9 @@ struct pci_device_id { + __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ + __u32 class, class_mask; /* (class,subclass,prog-if) triplet */ + kernel_ulong_t driver_data; /* Data private to the driver */ ++#ifdef HAVE_VFIO_PCI_CORE_H + __u32 override_only; ++#endif + }; + + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0241-BACKPORT-include-linux-nvme-fc-driver.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0241-BACKPORT-include-linux-nvme-fc-driver.h.patch new file mode 100644 index 0000000..bf423f6 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0241-BACKPORT-include-linux-nvme-fc-driver.h.patch @@ -0,0 +1,44 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/nvme-fc-driver.h + +Change-Id: Ie593e1e9be3a7da932df39cde7ccb05bc8669ee4 +--- + include/linux/nvme-fc-driver.h | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/include/linux/nvme-fc-driver.h ++++ b/include/linux/nvme-fc-driver.h +@@ -6,6 +6,8 @@ + #ifndef _NVME_FC_DRIVER_H + #define _NVME_FC_DRIVER_H 1 + ++#include "../../compat/config.h" ++ + #include + #include + +@@ -20,6 +22,13 @@ + * *********************************************************** + */ + ++#ifndef HAVE_SCSI_TRANSPORT_FC_FC_PORT_ROLE_NVME_TARGET ++/* FC Port role bitmask - can merge with FC Port Roles in fc transport */ ++#define FC_PORT_ROLE_NVME_INITIATOR 0x10 ++#define FC_PORT_ROLE_NVME_TARGET 0x20 ++#define FC_PORT_ROLE_NVME_DISCOVERY 0x40 ++#endif ++ + /** + * struct nvmefc_ls_req - Request structure passed from the transport + * to the LLDD to perform a NVME-FC LS request and obtain +@@ -498,8 +507,10 @@ struct nvme_fc_port_template { + int (*xmt_ls_rsp)(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *rport, + struct nvmefc_ls_rsp *ls_rsp); ++#if defined(HAVE_BLK_MQ_MAP_QUEUES) && defined(HAVE_BLK_MQ_TAG_SET_HAS_MAP) + void (*map_queues)(struct nvme_fc_local_port *localport, + struct blk_mq_queue_map *map); ++#endif + + u32 max_hw_queues; + u16 max_sgl_segments; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0242-BACKPORT-include-linux-nvme-rdma.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0242-BACKPORT-include-linux-nvme-rdma.h.patch new file mode 100644 index 0000000..e4b6b61 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0242-BACKPORT-include-linux-nvme-rdma.h.patch @@ -0,0 +1,27 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/nvme-rdma.h + +Change-Id: I108ce38255ef32bba891a3f6bd2340a47159a82b +--- + include/linux/nvme-rdma.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/include/linux/nvme-rdma.h ++++ b/include/linux/nvme-rdma.h +@@ -3,6 +3,11 @@ + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + */ + ++/* build vs. Non-MLNX_OFED .h */ ++#if 0 ++#include_next ++#else ++ + #ifndef _LINUX_NVME_RDMA_H + #define _LINUX_NVME_RDMA_H + +@@ -87,3 +92,4 @@ struct nvme_rdma_cm_rej { + }; + + #endif /* _LINUX_NVME_RDMA_H */ ++#endif /* build vs. Non-MLNX_OFED .h */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0243-BACKPORT-include-linux-nvme.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0243-BACKPORT-include-linux-nvme.h.patch new file mode 100644 index 0000000..f2f2537 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0243-BACKPORT-include-linux-nvme.h.patch @@ -0,0 +1,27 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/nvme.h + +Change-Id: I9b26bc5d7ae1d19962ecc7ea3a7272fbc9264954 +--- + include/linux/nvme.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/include/linux/nvme.h ++++ b/include/linux/nvme.h +@@ -4,6 +4,11 @@ + * Copyright (c) 2011-2014, Intel Corporation. + */ + ++/* build vs. Non-MLNX_OFED .h */ ++#if 0 ++#include_next ++#else ++ + #ifndef _LINUX_NVME_H + #define _LINUX_NVME_H + +@@ -1675,3 +1680,4 @@ struct nvme_completion { + #define NVME_TERTIARY(ver) ((ver) & 0xff) + + #endif /* _LINUX_NVME_H */ ++#endif /* build vs. 
Non-MLNX_OFED .h */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0244-BACKPORT-include-linux-sunrpc-svc_rdma.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0244-BACKPORT-include-linux-sunrpc-svc_rdma.h.patch new file mode 100644 index 0000000..efdb6e9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0244-BACKPORT-include-linux-sunrpc-svc_rdma.h.patch @@ -0,0 +1,189 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/linux/sunrpc/svc_rdma.h + +Change-Id: I8ba532b5975fb3dbbced299e22ec49f8b9a8df42 +--- + include/linux/sunrpc/svc_rdma.h | 79 ++++++++++++++++++++++++++++++++- + 1 file changed, 78 insertions(+), 1 deletion(-) + +--- a/include/linux/sunrpc/svc_rdma.h ++++ b/include/linux/sunrpc/svc_rdma.h +@@ -42,12 +42,19 @@ + + #ifndef SVC_RDMA_H + #define SVC_RDMA_H ++ ++#include "../../../compat/config.h" ++ ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + #include ++#endif + #include + #include + #include + #include ++#ifdef HAVE_SVC_RDMA_PCL + #include ++#endif + + #include + #include +@@ -94,8 +101,10 @@ struct svcxprt_rdma { + spinlock_t sc_rw_ctxt_lock; + struct llist_head sc_rw_ctxts; + ++#ifdef HAVE_SVCXPRT_RDMA_SC_PENDING_RECVS + u32 sc_pending_recvs; + u32 sc_recv_batch; ++#endif + struct list_head sc_rq_dto_q; + spinlock_t sc_rq_dto_lock; + struct ib_qp *sc_qp; +@@ -106,9 +115,17 @@ struct svcxprt_rdma { + + wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ + unsigned long sc_flags; ++#ifndef HAVE_SVC_RDMA_PCL ++ struct list_head sc_read_complete_q; ++#endif + struct work_struct sc_work; + ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + struct llist_head sc_recv_ctxts; ++#else ++ spinlock_t sc_recv_lock; ++ struct list_head sc_recv_ctxts; ++#endif + + atomic_t sc_completion_ids; + }; +@@ -127,26 +144,55 @@ enum { + #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD + + struct svc_rdma_recv_ctxt { ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + struct llist_node rc_node; ++#endif + struct list_head rc_list; + struct ib_recv_wr rc_recv_wr; + struct ib_cqe rc_cqe; + struct rpc_rdma_cid rc_cid; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + struct ib_sge rc_recv_sge; + void *rc_recv_buf; ++#endif ++#ifndef HAVE_SVC_RDMA_PCL ++ struct xdr_buf rc_arg; ++#endif + struct xdr_stream rc_stream; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + bool rc_temp; ++#endif + u32 rc_byte_len; + unsigned int rc_page_count; ++#ifndef HAVE_SVC_RDMA_PCL ++ unsigned int rc_hdr_count; ++#endif ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++ struct ib_sge rc_sges[1 + ++ RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; ++#endif ++#ifndef HAVE_SVC_RDMA_PCL ++ struct page *rc_pages[RPCSVC_MAXPAGES]; ++#endif + u32 rc_inv_rkey; ++#ifdef HAVE_SVC_RDMA_PCL + __be32 rc_msgtype; +- + struct svc_rdma_pcl rc_call_pcl; ++#endif ++ ++#ifndef HAVE_SVC_RDMA_PCL ++ __be32 *rc_write_list; ++ __be32 *rc_reply_chunk; ++ unsigned int rc_read_payload_offset; ++ unsigned int rc_read_payload_length; ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rdma_pcl rc_read_pcl; + struct svc_rdma_chunk *rc_cur_result_payload; + struct svc_rdma_pcl rc_write_pcl; + struct svc_rdma_pcl rc_reply_pcl; ++#endif + }; + + struct svc_rdma_send_ctxt { +@@ -171,25 +217,46 @@ extern void svc_rdma_handle_bc_reply(str + /* svc_rdma_recvfrom.c */ + extern void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma); + extern bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma); ++#ifdef HAVE_SVC_RDMA_PCL + extern struct svc_rdma_recv_ctxt * + svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma); ++#endif + extern void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt 
*ctxt); + extern void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma); ++#ifdef HAVE_SVC_RDMA_RELEASE_RQST + extern void svc_rdma_release_rqst(struct svc_rqst *rqstp); ++#endif + extern int svc_rdma_recvfrom(struct svc_rqst *); + + /* svc_rdma_rw.c */ + extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); ++#ifndef HAVE_SVC_RDMA_PCL ++extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, ++ struct svc_rqst *rqstp, ++ struct svc_rdma_recv_ctxt *head, __be32 *p); ++#endif + extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, ++#ifdef HAVE_SVC_RDMA_PCL + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr); ++#else ++ __be32 *wr_ch, struct xdr_buf *xdr, ++ unsigned int offset, ++ unsigned long length); ++#endif + extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, ++#ifdef HAVE_SVC_RDMA_PCL + const struct xdr_buf *xdr); ++#else ++ struct xdr_buf *xdr); ++#endif ++#ifdef HAVE_SVC_RDMA_PCL + extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head); ++#endif + + /* svc_rdma_sendto.c */ + extern void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma); +@@ -202,15 +269,25 @@ extern int svc_rdma_send(struct svcxprt_ + extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, ++#ifdef HAVE_SVC_RDMA_PCL + const struct xdr_buf *xdr); ++#else ++ struct xdr_buf *xdr); ++#endif + extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, + int status); + extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); + extern int svc_rdma_sendto(struct svc_rqst *); ++#ifdef HAVE_XPO_READ_PAYLOAD ++extern int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, ++ unsigned int length); ++#endif ++#ifdef HAVE_XPO_RESULT_PAYLOAD + extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length); ++#endif + + /* svc_rdma_transport.c */ + extern struct svc_xprt_class svc_rdma_class; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0245-BACKPORT-include-rdma-ib_addr.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0245-BACKPORT-include-rdma-ib_addr.h.patch new file mode 100644 index 0000000..0797c24 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0245-BACKPORT-include-rdma-ib_addr.h.patch @@ -0,0 +1,92 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/ib_addr.h + +Change-Id: I3a43623b1df937987556849c72efa2d69ffe1772 +--- + include/rdma/ib_addr.h | 33 +++++++++++++++++++++++++++++---- + 1 file changed, 29 insertions(+), 4 deletions(-) + +--- a/include/rdma/ib_addr.h ++++ b/include/rdma/ib_addr.h +@@ -7,6 +7,8 @@ + #ifndef IB_ADDR_H + #define IB_ADDR_H + ++#include "../../compat/config.h" ++ + #include + #include + #include +@@ -21,6 +23,7 @@ + #include + #include + #include ++#include + + /** + * struct rdma_dev_addr - Contains resolved RDMA hardware addresses +@@ -106,7 +109,11 @@ static inline int rdma_addr_gid_offset(s + return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; + } + ++#ifdef HAVE_IS_VLAN_DEV_CONST + static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) ++#else ++static inline u16 rdma_vlan_dev_vlan_id(struct net_device *dev) ++#endif + { + return is_vlan_dev(dev) ? 
vlan_dev_vlan_id(dev) : 0xffff; + } +@@ -196,22 +203,36 @@ static inline enum ib_mtu iboe_get_mtu(i + + static inline int iboe_get_rate(struct net_device *dev) + { ++#ifndef HAVE___ETHTOOL_GET_LINK_KSETTINGS ++ struct ethtool_cmd cmd; ++#else + struct ethtool_link_ksettings cmd; ++#endif ++ u32 speed; + int err; + + rtnl_lock(); ++#ifndef HAVE___ETHTOOL_GET_LINK_KSETTINGS ++ err = __ethtool_get_settings(dev, &cmd); ++#else + err = __ethtool_get_link_ksettings(dev, &cmd); ++#endif + rtnl_unlock(); + if (err) + return IB_RATE_PORT_CURRENT; + +- if (cmd.base.speed >= 40000) ++#ifndef HAVE___ETHTOOL_GET_LINK_KSETTINGS ++ speed = ethtool_cmd_speed(&cmd); ++#else ++ speed = cmd.base.speed; ++#endif ++ if (speed >= 40000) + return IB_RATE_40_GBPS; +- else if (cmd.base.speed >= 30000) ++ else if (speed >= 30000) + return IB_RATE_30_GBPS; +- else if (cmd.base.speed >= 20000) ++ else if (speed >= 20000) + return IB_RATE_20_GBPS; +- else if (cmd.base.speed >= 10000) ++ else if (speed >= 10000) + return IB_RATE_10_GBPS; + else + return IB_RATE_PORT_CURRENT; +@@ -262,7 +283,11 @@ static inline u16 rdma_get_vlan_id(union + return vid < 0x1000 ? vid : 0xffff; + } + ++#ifdef HAVE_IS_VLAN_DEV_CONST + static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) ++#else ++static inline struct net_device *rdma_vlan_dev_real_dev(struct net_device *dev) ++#endif + { + return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0246-BACKPORT-include-rdma-ib_umem.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0246-BACKPORT-include-rdma-ib_umem.h.patch new file mode 100644 index 0000000..29059b8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0246-BACKPORT-include-rdma-ib_umem.h.patch @@ -0,0 +1,126 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/ib_umem.h + +Change-Id: I696e534e2654b20074d3c7ee0cf76a40dbe054be +--- + include/rdma/ib_umem.h | 40 ++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 40 insertions(+) + +--- a/include/rdma/ib_umem.h ++++ b/include/rdma/ib_umem.h +@@ -7,6 +7,8 @@ + #ifndef IB_UMEM_H + #define IB_UMEM_H + ++#include "../../compat/config.h" ++ + #include + #include + #include +@@ -17,18 +19,31 @@ struct ib_umem_odp; + struct dma_buf_attach_ops; + + struct ib_umem { ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER + struct ib_device *ibdev; ++#else ++ struct ib_ucontext *context; ++#endif + struct mm_struct *owning_mm; + u64 iova; + size_t length; + unsigned long address; + u32 writable : 1; ++#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM) ++ u32 hugetlb : 1; ++#endif + u32 is_odp : 1; + u32 is_dmabuf : 1; + /* Placing at the end of the bitfield list is ABI preserving on LE */ + u32 is_peer : 1; + struct work_struct work; ++#ifdef HAVE_SG_APPEND_TABLE + struct sg_append_table sgt_append; ++#else ++ struct sg_table sg_head; ++ int nmap; ++ unsigned int sg_nents; ++#endif + }; + + struct ib_umem_dmabuf { +@@ -63,8 +78,13 @@ static inline int ib_umem_offset(struct + static inline unsigned long ib_umem_dma_offset(struct ib_umem *umem, + unsigned long pgsz) + { ++#ifdef HAVE_SG_APPEND_TABLE + return (sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem)) & + (pgsz - 1); ++#else ++ return (sg_dma_address(umem->sg_head.sgl) + ib_umem_offset(umem)) & ++ (pgsz - 1); ++#endif + } + + static inline size_t ib_umem_num_dma_blocks(struct ib_umem *umem, +@@ -84,8 +104,12 @@ static inline void __rdma_umem_block_ite + struct ib_umem *umem, + unsigned long pgsz) + { 
++#ifdef HAVE_SG_APPEND_TABLE + __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl, + umem->sgt_append.sgt.nents, pgsz); ++#else ++ __rdma_block_iter_start(biter, umem->sg_head.sgl, umem->nmap, pgsz); ++#endif + } + + /** +@@ -105,7 +129,11 @@ static inline void __rdma_umem_block_ite + + #ifdef CONFIG_INFINIBAND_USER_MEM + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, ++#else ++struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, ++#endif + size_t size, int access); + void ib_umem_release(struct ib_umem *umem); + int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, +@@ -136,7 +164,11 @@ static inline unsigned long ib_umem_find + unsigned long pgsz_bitmap, + u64 pgoff_bitmask) + { ++#ifdef HAVE_SG_APPEND_TABLE + struct scatterlist *sg = umem->sgt_append.sgt.sgl; ++#else ++ struct scatterlist *sg = umem->sg_head.sgl; ++#endif + dma_addr_t dma_addr; + + dma_addr = sg_dma_address(sg) + (umem->address & ~PAGE_MASK); +@@ -155,7 +187,11 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_ge + int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); + void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); + void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr, ++#else ++struct ib_umem *ib_umem_get_peer(struct ib_udata *udata, unsigned long addr, ++#endif + size_t size, int access, + unsigned long peer_mem_flags); + void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, +@@ -167,7 +203,11 @@ void ib_umem_stop_invalidation_notifier( + + #include + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + static inline struct ib_umem *ib_umem_get(struct ib_device *device, ++#else ++static inline struct ib_umem *ib_umem_get(struct ib_udata *udata, ++#endif + unsigned long addr, size_t size, + int access) + { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0247-BACKPORT-include-rdma-ib_umem_odp.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0247-BACKPORT-include-rdma-ib_umem_odp.h.patch new file mode 100644 index 0000000..93254f5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0247-BACKPORT-include-rdma-ib_umem_odp.h.patch @@ -0,0 +1,201 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/ib_umem_odp.h + +Change-Id: Ib041e811f4ab32f7682aaa728cfc65e5890913b6 +--- + include/rdma/ib_umem_odp.h | 109 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 109 insertions(+) + +--- a/include/rdma/ib_umem_odp.h ++++ b/include/rdma/ib_umem_odp.h +@@ -6,16 +6,30 @@ + #ifndef IB_UMEM_ODP_H + #define IB_UMEM_ODP_H + ++#include "../../compat/config.h" ++ ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + #include + #include ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++#include ++#endif + + struct ib_umem_odp { + struct ib_umem umem; ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct mmu_interval_notifier notifier; + struct pid *tgid; ++#else ++ struct ib_ucontext_per_mm *per_mm; ++#endif + ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + /* An array of the pfns included in the on-demand paging umem. */ + unsigned long *pfn_list; ++#else ++ struct page **page_list; ++#endif + + /* + * An array with DMA addresses mapped for pfns in pfn_list. +@@ -31,8 +45,15 @@ struct ib_umem_odp { + struct mutex umem_mutex; + void *private; /* for the HW driver to use. 
*/ + ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ int notifiers_seq; ++ int notifiers_count; ++#endif + int npages; + ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ struct interval_tree_node interval_tree; ++#endif + /* + * An implicit odp umem cannot be DMA mapped, has 0 length, and serves + * only as an anchor for the driver to hold onto the per_mm. FIXME: +@@ -41,6 +62,9 @@ struct ib_umem_odp { + */ + bool is_implicit_odp; + ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ struct completion notifier_completion; ++#endif + unsigned int page_shift; + }; + +@@ -52,13 +76,21 @@ static inline struct ib_umem_odp *to_ib_ + /* Returns the first page of an ODP umem. */ + static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) + { ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + return umem_odp->notifier.interval_tree.start; ++#else ++ return umem_odp->interval_tree.start; ++#endif + } + + /* Returns the address of the page after the last one of an ODP umem. */ + static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) + { ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + return umem_odp->notifier.interval_tree.last + 1; ++#else ++ return umem_odp->interval_tree.last + 1; ++#endif + } + + static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) +@@ -82,28 +114,104 @@ static inline size_t ib_umem_odp_num_pag + + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem_odp * + ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size, + int access, const struct mmu_interval_notifier_ops *ops); + struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, ++#else ++struct ib_ucontext_per_mm { ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ struct mmu_notifier mn; ++ struct pid *tgid; ++#else ++ struct ib_ucontext *context; ++ struct mm_struct *mm; ++ struct pid *tgid; ++ bool active; ++#endif ++ ++#ifndef HAVE_INTERVAL_TREE_TAKES_RB_ROOT ++ struct rb_root_cached umem_tree; ++#else ++ struct rb_root umem_tree; ++#endif ++ /* Protects umem_tree */ ++ struct rw_semaphore umem_rwsem; ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ struct mmu_notifier mn; ++ unsigned int odp_mrs_count; ++ ++ struct list_head ucontext_list; ++ struct rcu_head rcu; ++#endif ++}; ++ ++struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, ++ size_t size, int access); ++struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, ++#endif + int access); ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem_odp * + ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr, + size_t size, + const struct mmu_interval_notifier_ops *ops); ++#else ++struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, ++ unsigned long addr, size_t size); ++#endif + void ib_umem_odp_release(struct ib_umem_odp *umem_odp); + ++#if defined(HAVE_MMU_INTERVAL_NOTIFIER) && defined(HAVE_HMM_RANGE_FAULT_SUPPORT) + int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 start_offset, + u64 bcnt, u64 access_mask, bool fault); ++#else ++int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, ++ u64 bcnt, u64 access_mask, ++ unsigned long current_seq); ++#endif ++ + + void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, + u64 bound); + ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end, ++ void *cookie); ++#ifndef HAVE_INTERVAL_TREE_TAKES_RB_ROOT ++int rbt_ib_umem_for_each_in_range(struct 
rb_root_cached *root, ++#else ++int rbt_ib_umem_for_each_in_range(struct rb_root *root, ++#endif ++ u64 start, u64 end, ++ umem_call_back cb, ++#if defined(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE) || defined(HAVE_MMU_NOTIFIER_RANGE_STRUCT) ++ bool blockable, ++#endif ++ void *cookie); ++ ++static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, ++ unsigned long mmu_seq) ++{ ++ if (unlikely(umem_odp->notifiers_count)) ++ return 1; ++ if (umem_odp->notifiers_seq != mmu_seq) ++ return 1; ++ return 0; ++} ++#endif + #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + static inline struct ib_umem_odp * + ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size, + int access, const struct mmu_interval_notifier_ops *ops) ++#else ++static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, ++ unsigned long addr, ++ size_t size, int access) ++#endif + { + return ERR_PTR(-EINVAL); + } +@@ -111,5 +219,6 @@ ib_umem_odp_get(struct ib_device *device + static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} + + #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ ++#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + + #endif /* IB_UMEM_ODP_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0248-BACKPORT-include-rdma-ib_verbs.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0248-BACKPORT-include-rdma-ib_verbs.h.patch new file mode 100644 index 0000000..c8b51c1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0248-BACKPORT-include-rdma-ib_verbs.h.patch @@ -0,0 +1,431 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/ib_verbs.h + +Change-Id: I6276412d85dc392f795c2b92bbcc1a37972a9210 +--- + include/rdma/ib_verbs.h | 210 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 207 insertions(+), 3 deletions(-) + +--- a/include/rdma/ib_verbs.h ++++ b/include/rdma/ib_verbs.h +@@ -12,6 +12,8 @@ + #ifndef IB_VERBS_H + #define IB_VERBS_H + ++#include "../../compat/config.h" ++ + #include + #include + #include +@@ -20,7 +22,11 @@ + #include + #include + #include ++#if defined(HAVE_IRQ_POLL_H) + #include ++#else ++#include ++#endif + #include + #include + #include +@@ -84,10 +90,13 @@ void ibdev_notice(const struct ib_device + __printf(2, 3) __cold + void ibdev_info(const struct ib_device *ibdev, const char *format, ...); + +-#if defined(CONFIG_DYNAMIC_DEBUG) || \ ++#if defined(CONFIG_DYNAMIC_DEBUG) && defined(dynamic_ibdev_dbg) ++#define ibdev_dbg(__dev, format, args...) \ ++ dynamic_ibdev_dbg(__dev, format, ##args) ++#elif defined(DEBUG) + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) + #define ibdev_dbg(__dev, format, args...) 
\ +- dynamic_ibdev_dbg(__dev, format, ##args) ++ ibdev_printk(KERN_DEBUG, __dev, format, ##args) + #else + __printf(2, 3) __cold + static inline +@@ -1161,6 +1170,9 @@ enum ib_qp_create_flags { + IB_QP_CREATE_PCI_WRITE_END_PADDING = + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, + IB_QP_CREATE_SIGNATURE_PIPELINE = 1 << 12, ++#ifndef HAVE_MEMALLOC_NOIO_SAVE ++ IB_QP_CREATE_USE_GFP_NOIO = 1 << 13, ++#endif + /* reserve bits 26-31 for low level drivers' internal use */ + IB_QP_CREATE_RESERVED_START = 1 << 26, + IB_QP_CREATE_RESERVED_END = 1 << 31, +@@ -1508,17 +1520,26 @@ enum rdma_remove_reason { + RDMA_REMOVE_DRIVER_FAILURE, + }; + ++#ifdef HAVE_CGROUP_RDMA_H + struct ib_rdmacg_object { + #ifdef CONFIG_CGROUP_RDMA + struct rdma_cgroup *cg; /* owner rdma cgroup */ + #endif + }; ++#endif + + struct ib_ucontext { + struct ib_device *device; + struct ib_uverbs_file *ufile; + +- struct ib_rdmacg_object cg_obj; ++#ifndef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ struct mutex per_mm_list_lock; ++ struct list_head per_mm_list; ++#endif ++ ++#ifdef HAVE_CGROUP_RDMA_H ++ struct ib_rdmacg_object cg_obj; ++#endif + /* + * Implementation details of the RDMA core, don't use in drivers: + */ +@@ -1534,7 +1555,9 @@ struct ib_uobject { + struct ib_ucontext *context; /* associated user context */ + void *object; /* containing object */ + struct list_head list; /* link to context's list */ ++#ifdef HAVE_CGROUP_RDMA_H + struct ib_rdmacg_object cg_obj; /* rdmacg object */ ++#endif + int id; /* index into kernel idr */ + struct kref ref; + atomic_t usecnt; /* protects exclusive access */ +@@ -1606,7 +1629,13 @@ struct ib_cq { + struct ib_wc *wc; + struct list_head pool_entry; + union { ++#if defined(HAVE_IRQ_POLL_H) ++#if IS_ENABLED(CONFIG_IRQ_POLL) + struct irq_poll iop; ++#endif ++#else ++ struct blk_iopoll iop; ++#endif + struct work_struct work; + }; + struct workqueue_struct *comp_wq; +@@ -2193,6 +2222,63 @@ struct ib_port_cache { + enum ib_port_state port_state; + }; + ++#ifndef HAVE_DEVICE_DMA_OPS ++struct ib_dma_mapping_ops { ++ int (*mapping_error)(struct ib_device *dev, ++ u64 dma_addr); ++ u64 (*map_single)(struct ib_device *dev, ++ void *ptr, size_t size, ++ enum dma_data_direction direction); ++ void (*unmap_single)(struct ib_device *dev, ++ u64 addr, size_t size, ++ enum dma_data_direction direction); ++ u64 (*map_page)(struct ib_device *dev, ++ struct page *page, unsigned long offset, ++ size_t size, ++ enum dma_data_direction direction); ++ void (*unmap_page)(struct ib_device *dev, ++ u64 addr, size_t size, ++ enum dma_data_direction direction); ++ int (*map_sg)(struct ib_device *dev, ++ struct scatterlist *sg, int nents, ++ enum dma_data_direction direction); ++ void (*unmap_sg)(struct ib_device *dev, ++ struct scatterlist *sg, int nents, ++ enum dma_data_direction direction); ++ int (*map_sg_attrs)(struct ib_device *dev, ++ struct scatterlist *sg, int nents, ++ enum dma_data_direction direction, ++#ifdef HAVE_STRUCT_DMA_ATTRS ++ struct dma_attrs *attrs); ++#else ++ unsigned long attrs); ++#endif ++ void (*unmap_sg_attrs)(struct ib_device *dev, ++ struct scatterlist *sg, int nents, ++ enum dma_data_direction direction, ++#ifdef HAVE_STRUCT_DMA_ATTRS ++ struct dma_attrs *attrs); ++#else ++ unsigned long attrs); ++#endif ++ void (*sync_single_for_cpu)(struct ib_device *dev, ++ u64 dma_handle, ++ size_t size, ++ enum dma_data_direction dir); ++ void (*sync_single_for_device)(struct ib_device *dev, ++ u64 dma_handle, ++ size_t size, ++ enum dma_data_direction dir); ++ void *(*alloc_coherent)(struct 
ib_device *dev, ++ size_t size, ++ u64 *dma_handle, ++ gfp_t flag); ++ void (*free_coherent)(struct ib_device *dev, ++ size_t size, void *cpu_addr, ++ u64 dma_handle); ++}; ++#endif ++ + struct ib_port_immutable { + int pkey_tbl_len; + int gid_tbl_len; +@@ -2265,6 +2351,9 @@ struct rdma_netdev_alloc_params { + + int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num, + struct net_device *netdev, void *param); ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++ void (*uninitialize_rdma_netdev)(struct net_device *netdev); ++#endif + }; + + struct ib_odp_counters { +@@ -2526,6 +2615,10 @@ struct ib_device_ops { + struct ib_mr_status *mr_status); + int (*alloc_mw)(struct ib_mw *mw, struct ib_udata *udata); + int (*dealloc_mw)(struct ib_mw *mw); ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ void (*invalidate_range)(struct ib_umem_odp *umem_odp, ++ unsigned long start, unsigned long end); ++#endif + int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); + int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); + int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); +@@ -2578,6 +2671,9 @@ struct ib_device_ops { + int (*read_counters)(struct ib_counters *counters, + struct ib_counters_read_attr *counters_read_attr, + struct uverbs_attr_bundle *attrs); ++#ifndef HAVE_DEVICE_DMA_OPS ++ struct ib_dma_mapping_ops *dma_ops; ++#endif + int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, +@@ -2778,9 +2874,11 @@ struct ib_device { + struct ib_device_attr attrs; + struct hw_stats_device_data *hw_stats_data; + ++#ifdef HAVE_CGROUP_RDMA_H + #ifdef CONFIG_CGROUP_RDMA + struct rdmacg_device cg_device; + #endif ++#endif + + u32 index; + +@@ -4088,6 +4186,10 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd + */ + static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ return dev->ops.dma_ops->mapping_error(dev, dma_addr); ++#endif + if (ib_uses_virt_dma(dev)) + return 0; + return dma_mapping_error(dev->dma_device, dma_addr); +@@ -4104,6 +4206,10 @@ static inline u64 ib_dma_map_single(stru + void *cpu_addr, size_t size, + enum dma_data_direction direction) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ return dev->ops.dma_ops->map_single(dev, cpu_addr, size, direction); ++#endif + if (ib_uses_virt_dma(dev)) + return (uintptr_t)cpu_addr; + return dma_map_single(dev->dma_device, cpu_addr, size, direction); +@@ -4120,6 +4226,11 @@ static inline void ib_dma_unmap_single(s + u64 addr, size_t size, + enum dma_data_direction direction) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ dev->ops.dma_ops->unmap_single(dev, addr, size, direction); ++ else ++#endif + if (!ib_uses_virt_dma(dev)) + dma_unmap_single(dev->dma_device, addr, size, direction); + } +@@ -4138,6 +4249,10 @@ static inline u64 ib_dma_map_page(struct + size_t size, + enum dma_data_direction direction) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ return dev->ops.dma_ops->map_page(dev, page, offset, size, direction); ++#endif + if (ib_uses_virt_dma(dev)) + return (uintptr_t)(page_address(page) + offset); + return dma_map_page(dev->dma_device, page, offset, size, direction); +@@ -4154,6 +4269,11 @@ static inline void ib_dma_unmap_page(str + u64 addr, size_t size, + enum dma_data_direction direction) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ 
dev->ops.dma_ops->unmap_page(dev, addr, size, direction); ++ else ++#endif + if (!ib_uses_virt_dma(dev)) + dma_unmap_page(dev->dma_device, addr, size, direction); + } +@@ -4162,8 +4282,17 @@ int ib_dma_virt_map_sg(struct ib_device + static inline int ib_dma_map_sg_attrs(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, ++#ifdef HAVE_STRUCT_DMA_ATTRS ++ struct dma_attrs *dma_attrs) ++#else + unsigned long dma_attrs) ++#endif + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ return dev->ops.dma_ops->map_sg_attrs(dev, sg, nents, direction, ++ dma_attrs); ++#endif + if (ib_uses_virt_dma(dev)) + return ib_dma_virt_map_sg(dev, sg, nents); + return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, +@@ -4173,8 +4302,18 @@ static inline int ib_dma_map_sg_attrs(st + static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction, ++#ifdef HAVE_STRUCT_DMA_ATTRS ++ struct dma_attrs *dma_attrs) ++#else + unsigned long dma_attrs) ++#endif + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ return dev->ops.dma_ops->unmap_sg_attrs(dev, sg, nents, direction, ++ dma_attrs); ++ else ++#endif + if (!ib_uses_virt_dma(dev)) + dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, + dma_attrs); +@@ -4187,6 +4326,7 @@ static inline void ib_dma_unmap_sg_attrs + * @direction: The direction of the DMA + * @attrs: Optional DMA attributes for the map operation + */ ++#ifdef HAVE_DMA_MAP_SGTABLE + static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, + struct sg_table *sgt, + enum dma_data_direction direction, +@@ -4212,6 +4352,7 @@ static inline void ib_dma_unmap_sgtable_ + if (!ib_uses_virt_dma(dev)) + dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs); + } ++#endif + + /** + * ib_dma_map_sg - Map a scatter/gather list to DMA addresses +@@ -4224,6 +4365,10 @@ static inline int ib_dma_map_sg(struct i + struct scatterlist *sg, int nents, + enum dma_data_direction direction) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ return dev->ops.dma_ops->map_sg(dev, sg, nents, direction); ++#endif + return ib_dma_map_sg_attrs(dev, sg, nents, direction, 0); + } + +@@ -4238,6 +4383,11 @@ static inline void ib_dma_unmap_sg(struc + struct scatterlist *sg, int nents, + enum dma_data_direction direction) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ dev->ops.dma_ops->unmap_sg(dev, sg, nents, direction); ++ else ++#endif + ib_dma_unmap_sg_attrs(dev, sg, nents, direction, 0); + } + +@@ -4266,6 +4416,11 @@ static inline void ib_dma_sync_single_fo + size_t size, + enum dma_data_direction dir) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ dev->ops.dma_ops->sync_single_for_cpu(dev, addr, size, dir); ++ else ++#endif + if (!ib_uses_virt_dma(dev)) + dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); + } +@@ -4282,6 +4437,11 @@ static inline void ib_dma_sync_single_fo + size_t size, + enum dma_data_direction dir) + { ++#ifndef HAVE_DEVICE_DMA_OPS ++ if (dev->ops.dma_ops) ++ dev->ops.dma_ops->sync_single_for_device(dev, addr, size, dir); ++ else ++#endif + if (!ib_uses_virt_dma(dev)) + dma_sync_single_for_device(dev->dma_device, addr, size, dir); + } +@@ -4375,6 +4535,45 @@ struct ib_xrcd *ib_alloc_xrcd_user(struc + struct inode *inode, struct ib_udata *udata); + int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); + ++#ifdef HAVE_ETHTOOL_GET_SET_SETTINGS ++static inline int 
ib_active_speed_enum_to_rate(enum ib_port_speed active_speed, ++ int *rate, ++ char **speed) ++{ ++ switch (active_speed) { ++ case IB_SPEED_DDR: ++ *speed = " DDR"; ++ *rate = 50; ++ break; ++ case IB_SPEED_QDR: ++ *speed = " QDR"; ++ *rate = 100; ++ break; ++ case IB_SPEED_FDR10: ++ *speed = " FDR10"; ++ *rate = 100; ++ break; ++ case IB_SPEED_FDR: ++ *speed = " FDR"; ++ *rate = 140; ++ break; ++ case IB_SPEED_EDR: ++ *speed = " EDR"; ++ *rate = 250; ++ break; ++ case IB_SPEED_HDR: ++ *speed = " HDR"; ++ *rate = 500; ++ break; ++ case IB_SPEED_SDR: ++ default: /* default to SDR for invalid rates */ ++ *rate = 25; ++ break; ++ } ++ return 0; ++} ++#endif ++ + static inline int ib_check_mr_access(struct ib_device *ib_dev, + unsigned int flags) + { +@@ -4741,6 +4940,11 @@ int rdma_init_netdev(struct ib_device *d + struct net_device *netdev, + int force_fail); + ++#ifndef HAVE_NET_DEVICE_NEEDS_FREE_NETDEV ++int rdma_uninit_netdev(struct ib_device *device, struct net_device *netdev, ++ u8 port_num, enum rdma_netdev_t type, int force_fail); ++#endif ++ + /** + * rdma_device_to_ibdev - Get ib_device pointer from device pointer + * diff --git a/src/mlnx-ofa_kernel-5.8/backports/0249-BACKPORT-include-rdma-lag.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0249-BACKPORT-include-rdma-lag.h.patch new file mode 100644 index 0000000..41fba33 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0249-BACKPORT-include-rdma-lag.h.patch @@ -0,0 +1,22 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/lag.h + +Change-Id: Iac3248bcddd4b94ed542bd6d1579576264975a5a +--- + include/rdma/lag.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/include/rdma/lag.h ++++ b/include/rdma/lag.h +@@ -6,7 +6,11 @@ + #ifndef _RDMA_LAG_H_ + #define _RDMA_LAG_H_ + ++#include "../../compat/config.h" ++ ++#ifdef HAVE_NET_LAG_H + #include ++#endif + + struct ib_device; + struct rdma_ah_attr; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0250-BACKPORT-include-rdma-rdma_counter.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0250-BACKPORT-include-rdma-rdma_counter.h.patch new file mode 100644 index 0000000..faf7330 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0250-BACKPORT-include-rdma-rdma_counter.h.patch @@ -0,0 +1,37 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/rdma_counter.h + +Change-Id: I8591490261e2adfc60e3197404b711cb5e365f72 +--- + include/rdma/rdma_counter.h | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/include/rdma/rdma_counter.h ++++ b/include/rdma/rdma_counter.h +@@ -6,9 +6,10 @@ + #ifndef _RDMA_COUNTER_H_ + #define _RDMA_COUNTER_H_ + ++#include "../../compat/config.h" ++ + #include + #include +- + #include + #include + +@@ -46,8 +47,12 @@ struct rdma_counter { + void rdma_counter_init(struct ib_device *dev); + void rdma_counter_release(struct ib_device *dev); + int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, +- enum rdma_nl_counter_mask mask, +- struct netlink_ext_ack *extack); ++ enum rdma_nl_counter_mask mask ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack); ++#else ++ ); ++#endif + int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port); + int rdma_counter_unbind_qp(struct ib_qp *qp, bool force); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0251-BACKPORT-include-rdma-rdma_netlink.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0251-BACKPORT-include-rdma-rdma_netlink.h.patch new file mode 100644 index 0000000..5c90bb4 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/backports/0251-BACKPORT-include-rdma-rdma_netlink.h.patch @@ -0,0 +1,34 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/rdma_netlink.h + +Change-Id: If34a02e37016c569e3f3624c8992938a32c80764 +--- + include/rdma/rdma_netlink.h | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/include/rdma/rdma_netlink.h ++++ b/include/rdma/rdma_netlink.h +@@ -3,6 +3,8 @@ + #ifndef _RDMA_NETLINK_H + #define _RDMA_NETLINK_H + ++#include "../../compat/config.h" ++ + #include + #include + +@@ -13,8 +15,12 @@ enum { + }; + + struct rdma_nl_cbs { +- int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, +- struct netlink_ext_ack *extack); ++#ifdef HAVE_NETLINK_EXT_ACK ++ int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, ++ struct netlink_ext_ack *extack); ++#else ++ int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh); ++#endif + int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); + u8 flags; + }; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0252-BACKPORT-include-rdma-uverbs_ioctl.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0252-BACKPORT-include-rdma-uverbs_ioctl.h.patch new file mode 100644 index 0000000..515fe78 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0252-BACKPORT-include-rdma-uverbs_ioctl.h.patch @@ -0,0 +1,41 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/uverbs_ioctl.h + +Change-Id: I64c7f7b5fddf874bda30f13c2760d08cb0fe0449 +--- + include/rdma/uverbs_ioctl.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/include/rdma/uverbs_ioctl.h ++++ b/include/rdma/uverbs_ioctl.h +@@ -11,6 +11,10 @@ + #include + #include + #include ++#if defined(CONFIG_COMPAT_RHEL_7_2) ++#include ++#endif ++ + + /* + * ======================================= +@@ -123,6 +127,7 @@ struct uverbs_attr_spec { + * ie the WRITE path is treated as a special method type in the ioctl + * framework. + */ ++ + enum uapi_radix_data { + UVERBS_API_NS_FLAG = 1U << UVERBS_ID_NS_SHIFT, + +@@ -865,6 +870,11 @@ int uverbs_get_flags32(u32 *to, const st + size_t idx, u64 allowed_bits); + int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, size_t idx, + const void *from, size_t size); ++ ++#ifndef __malloc ++#define __malloc ++#endif ++ + __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, + gfp_t flags); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0253-BACKPORT-include-trace-events-ib_mad.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0253-BACKPORT-include-trace-events-ib_mad.h.patch new file mode 100644 index 0000000..0edeed2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0253-BACKPORT-include-trace-events-ib_mad.h.patch @@ -0,0 +1,54 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/trace/events/ib_mad.h + +Change-Id: I6e52fe5ac249ec3577ec85af5cefb6985cb79f97 +--- + include/trace/events/ib_mad.h | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/include/trace/events/ib_mad.h ++++ b/include/trace/events/ib_mad.h +@@ -4,20 +4,34 @@ + * Copyright (c) 2018 Intel Corporation. All rights reserved. 
+ */ + ++#ifndef MLX_DISABLE_TRACEPOINTS + #undef TRACE_SYSTEM + #define TRACE_SYSTEM ib_mad + + #if !defined(_TRACE_IB_MAD_H) || defined(TRACE_HEADER_MULTI_READ) + #define _TRACE_IB_MAD_H + ++#include "../../../compat/config.h" ++ + #include + #include + + #ifdef CONFIG_TRACEPOINTS ++ ++#ifdef HAVE_TRACE_EVENTS_H + struct trace_event_raw_ib_mad_send_template; ++#else ++struct ftrace_raw_ib_mad_send_template; ++#endif /* HAVE_TRACE_EVENTS_H */ ++ + static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_qp_info *qp_info, ++#ifdef HAVE_TRACE_EVENTS_H + struct trace_event_raw_ib_mad_send_template *entry); ++#else ++ struct ftrace_raw_ib_mad_send_template *entry); ++#endif ++ + #endif + + DECLARE_EVENT_CLASS(ib_mad_send_template, +@@ -385,6 +399,8 @@ DEFINE_EVENT(ib_mad_opa_ib_template, ib_ + TP_PROTO(struct ib_smp *smp), + TP_ARGS(smp)); + ++ + #endif /* _TRACE_IB_MAD_H */ + + #include ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0254-BACKPORT-include-trace-events-ib_umad.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0254-BACKPORT-include-trace-events-ib_umad.h.patch new file mode 100644 index 0000000..0122898 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0254-BACKPORT-include-trace-events-ib_umad.h.patch @@ -0,0 +1,23 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/trace/events/ib_umad.h + +Change-Id: If1084c7e984238a2725a4317474c56ea4728d41a +--- + include/trace/events/ib_umad.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/include/trace/events/ib_umad.h ++++ b/include/trace/events/ib_umad.h +@@ -4,6 +4,7 @@ + * Copyright (c) 2018 Intel Corporation. All rights reserved. + * + */ ++#ifndef MLX_DISABLE_TRACEPOINTS + + #undef TRACE_SYSTEM + #define TRACE_SYSTEM ib_umad +@@ -124,3 +125,4 @@ DEFINE_EVENT(ib_umad_template, ib_umad_r + #endif /* _TRACE_IB_UMAD_H */ + + #include ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0255-BACKPORT-include-trace-events-rpcrdma.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0255-BACKPORT-include-trace-events-rpcrdma.h.patch new file mode 100644 index 0000000..6f1540b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0255-BACKPORT-include-trace-events-rpcrdma.h.patch @@ -0,0 +1,35 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/trace/events/rpcrdma.h + +Change-Id: I42338bb33017512646c4cea493a67ac204ef45f3 +--- + include/trace/events/rpcrdma.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/include/trace/events/rpcrdma.h ++++ b/include/trace/events/rpcrdma.h +@@ -10,6 +10,8 @@ + #if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ) + #define _TRACE_RPCRDMA_H + ++#include "../../../compat/config.h" ++ + #include + #include + #include +@@ -1674,6 +1676,7 @@ TRACE_EVENT(svcrdma_encode_wseg, + ) + ); + ++#ifdef HAVE_SVC_RDMA_PCL + TRACE_EVENT(svcrdma_decode_rseg, + TP_PROTO( + const struct rpc_rdma_cid *cid, +@@ -1746,6 +1749,7 @@ TRACE_EVENT(svcrdma_decode_wseg, + (unsigned long long)__entry->offset, __entry->handle + ) + ); ++#endif + + DECLARE_EVENT_CLASS(svcrdma_error_event, + TP_PROTO( diff --git a/src/mlnx-ofa_kernel-5.8/backports/0256-BACKPORT-net-mlxdevm-mlxdevm.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0256-BACKPORT-net-mlxdevm-mlxdevm.c.patch new file mode 100644 index 0000000..ffa864e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0256-BACKPORT-net-mlxdevm-mlxdevm.c.patch @@ -0,0 +1,178 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/mlxdevm/mlxdevm.c + +Change-Id: I4f75139b210dc01aa08d199bb5042ddd322dac06 +--- + 
net/mlxdevm/mlxdevm.c | 61 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 61 insertions(+) + +--- a/net/mlxdevm/mlxdevm.c ++++ b/net/mlxdevm/mlxdevm.c +@@ -1438,7 +1438,11 @@ static int mlxdevm_nl_cmd_port_get_dumpi + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, ++#ifdef HAVE_NETLINK_CALLBACK_EXTACK + cb->extack); ++#else ++ NULL); ++#endif + if (err) { + up_read(&dev->port_list_rwsem); + goto out; +@@ -1738,7 +1742,11 @@ static int mlxdevm_nl_cmd_rate_get_dumpi + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, ++#ifdef HAVE_NETLINK_CALLBACK_EXTACK + cb->extack); ++#else ++ NULL); ++#endif + if (err) { + up_read(&dev->rate_group_rwsem); + goto out; +@@ -1760,7 +1768,11 @@ static int mlxdevm_nl_cmd_rate_get_dumpi + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, ++#ifdef HAVE_NETLINK_CALLBACK_EXTACK + cb->extack); ++#else ++ NULL); ++#endif + if (err) { + up_read(&dev->port_list_rwsem); + goto out; +@@ -2365,74 +2377,121 @@ static const struct nla_policy mlxdevm_n + static const struct genl_ops mlxdevm_nl_ops[] = { + { + .cmd = MLXDEVM_CMD_DEV_GET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_dev_get_doit, + .dumpit = mlxdevm_nl_cmd_dev_get_dumpit, + }, + { + .cmd = MLXDEVM_CMD_PORT_SET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_port_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MLXDEVM_CMD_PORT_GET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_port_get_doit, + .dumpit = mlxdevm_nl_cmd_port_get_dumpit, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = MLXDEVM_CMD_PORT_NEW, ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_port_new_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MLXDEVM_CMD_PORT_DEL, ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_port_del_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MLXDEVM_CMD_PARAM_GET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif + .doit = mlxdevm_nl_cmd_param_get_doit, + .dumpit = mlxdevm_nl_cmd_param_get_dumpit, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = MLXDEVM_CMD_PARAM_SET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif + .doit = mlxdevm_nl_cmd_param_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MLXDEVM_CMD_EXT_CAP_SET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif + .doit = mlxdevm_nl_cmd_port_fn_cap_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MLXDEVM_CMD_EXT_RATE_SET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_rate_set_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MLXDEVM_CMD_EXT_RATE_GET, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | 
GENL_DONT_VALIDATE_DUMP, ++#endif ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_rate_get_doit, + .dumpit = mlxdevm_nl_cmd_rate_get_dumpit, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = MLXDEVM_CMD_EXT_RATE_NEW, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_rate_new_doit, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MLXDEVM_CMD_EXT_RATE_DEL, ++#ifdef HAVE_GENL_OPS_VALIDATE + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ++#endif ++#ifndef HAVE_GENL_FAMILY_POLICY ++ .policy = mlxdevm_nl_policy, ++#endif + .doit = mlxdevm_nl_cmd_rate_del_doit, + .flags = GENL_ADMIN_PERM, + }, +@@ -2442,7 +2501,9 @@ static struct genl_family mlxdevm_nl_fam + .name = MLXDEVM_GENL_NAME, + .version = MLXDEVM_GENL_VERSION, + .maxattr = MLXDEVM_ATTR_MAX, ++#ifdef HAVE_GENL_FAMILY_POLICY + .policy = mlxdevm_nl_policy, ++#endif + .netnsok = false, + .module = THIS_MODULE, + .ops = mlxdevm_nl_ops, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0257-BACKPORT-net-sunrpc-xprtrdma-backchannel.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0257-BACKPORT-net-sunrpc-xprtrdma-backchannel.c.patch new file mode 100644 index 0000000..45ca135 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0257-BACKPORT-net-sunrpc-xprtrdma-backchannel.c.patch @@ -0,0 +1,249 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/backchannel.c + +Change-Id: I40d3c93c9302ce89b1ec2a1f830fe78b7131a5f5 +--- + net/sunrpc/xprtrdma/backchannel.c | 141 ++++++++++++++++++++++++++++++ + 1 file changed, 141 insertions(+) + +--- a/net/sunrpc/xprtrdma/backchannel.c ++++ b/net/sunrpc/xprtrdma/backchannel.c +@@ -11,10 +11,19 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + #undef RPCRDMA_BACKCHANNEL_DEBUG + ++#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) ++#ifndef RPCDBG_FACILITY ++#define RPCDBG_FACILITY RPCDBG_TRANS ++#endif ++#endif ++ ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + /** + * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources +@@ -27,9 +36,92 @@ int xprt_rdma_bc_setup(struct rpc_xprt * + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + r_xprt->rx_buf.rb_bc_srv_max_requests = RPCRDMA_BACKWARD_WRS >> 1; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_cb_setup(r_xprt, reqs); ++#endif ++ return 0; ++} ++#else ++static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, ++ struct rpc_rqst *rqst) ++{ ++ struct rpcrdma_buffer *buf = &r_xprt->rx_buf; ++ struct rpcrdma_req *req = rpcr_to_rdmar(rqst); ++ ++ spin_lock(&buf->rb_lock); ++ rpcrdma_req_destroy(req); ++ spin_unlock(&buf->rb_lock); ++ ++ kfree(rqst); ++} ++ ++static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, ++ struct rpc_rqst *rqst); ++ ++int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) ++{ ++ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); ++ struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; ++ struct rpc_rqst *rqst; ++ unsigned int i; ++ ++ if (reqs > RPCRDMA_BACKWARD_WRS >> 1) ++ goto out_err; ++ ++ for (i = 0; i < (reqs << 1); i++) { ++ rqst = kzalloc(sizeof(*rqst), GFP_KERNEL); ++ if (!rqst) ++ goto out_free; ++ ++ dprintk("RPC: %s: new rqst %p\n", __func__, rqst); ++ ++ rqst->rq_xprt = &r_xprt->rx_xprt; ++ INIT_LIST_HEAD(&rqst->rq_list); ++ 
INIT_LIST_HEAD(&rqst->rq_bc_list); ++ __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); ++ ++ if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) ++ goto out_free; ++ ++ spin_lock_bh(&xprt->bc_pa_lock); ++ list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); ++ spin_unlock_bh(&xprt->bc_pa_lock); ++ } ++ ++ buffer->rb_bc_srv_max_requests = reqs; ++ request_module("svcrdma"); + return 0; ++ ++out_free: ++ xprt_rdma_bc_destroy(xprt, reqs); ++ ++out_err: ++ pr_err("RPC: %s: setup backchannel transport failed\n", __func__); ++ return -ENOMEM; + } ++#endif ++ ++#if defined(CONFIG_SUNRPC_BACKCHANNEL) && defined(HAVE_RPC_XPRT_OPS_BC_UP) ++/** ++ * xprt_rdma_bc_up - Create transport endpoint for backchannel service ++ * @serv: server endpoint ++ * @net: network namespace ++ * ++ * The "xprt" is an implied argument: it supplies the name of the ++ * backchannel transport class. ++ * ++ * Returns zero on success, negative errno on failure ++ */ ++int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) ++{ ++ int ret; ++ ++ ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0); ++ if (ret < 0) ++ return ret; ++ return 0; ++} ++#endif + + /** + * xprt_rdma_bc_maxpayload - Return maximum backchannel message size +@@ -48,10 +140,12 @@ size_t xprt_rdma_bc_maxpayload(struct rp + return maxmsg - RPCRDMA_HDRLEN_MIN; + } + ++#ifdef HAVE_RPC_XPRT_OPS_BC_NUM_SLOTS + unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt) + { + return RPCRDMA_BACKWARD_WRS >> 1; + } ++#endif + + static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) + { +@@ -60,8 +154,13 @@ static int rpcrdma_bc_marshal_reply(stru + __be32 *p; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); ++#ifdef HAVE_XDR_INIT_ENCODE_RQST_ARG + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, + rdmab_data(req->rl_rdmabuf), rqst); ++#else ++ xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, ++ rdmab_data(req->rl_rdmabuf)); ++#endif + + p = xdr_reserve_space(&req->rl_stream, 28); + if (unlikely(!p)) +@@ -78,7 +177,9 @@ static int rpcrdma_bc_marshal_reply(stru + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) + return -EIO; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_cb_reply(r_xprt, rqst); ++#endif + return 0; + } + +@@ -104,8 +205,10 @@ int xprt_rdma_bc_send_reply(struct rpc_r + if (!xprt_connected(xprt)) + return -ENOTCONN; + ++#ifdef HAVE_XPRT_REQUEST_GET_CONG + if (!xprt_request_get_cong(xprt, rqst)) + return -EBADSLT; ++#endif + + rc = rpcrdma_bc_marshal_reply(rqst); + if (rc < 0) +@@ -137,7 +240,11 @@ void xprt_rdma_bc_destroy(struct rpc_xpr + list_del(&rqst->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); + ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + rpcrdma_req_destroy(rpcr_to_rdmar(rqst)); ++#else ++ rpcrdma_bc_free_rqst(rpcx_to_rdmax(xprt), rqst); ++#endif + + spin_lock(&xprt->bc_pa_lock); + } +@@ -164,6 +271,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_r + xprt_put(xprt); + } + ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt) + { + struct rpc_xprt *xprt = &r_xprt->rx_xprt; +@@ -204,6 +312,24 @@ create_req: + xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), size); + return rqst; + } ++#else ++static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, ++ struct rpc_rqst *rqst) ++{ ++ struct rpcrdma_req *req; ++ size_t size; ++ ++ size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); ++ req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); ++ if (!req) ++ return PTR_ERR(req); ++ ++ xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), ++ size); ++ 
rpcrdma_set_xprtdata(rqst, req); ++ return 0; ++} ++#endif + + /** + * rpcrdma_bc_receive_call - Handle a reverse-direction Call +@@ -236,9 +362,22 @@ void rpcrdma_bc_receive_call(struct rpcr + pr_info("RPC: %s: %*ph\n", __func__, size, p); + #endif + ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + rqst = rpcrdma_bc_rqst_get(r_xprt); + if (!rqst) + goto out_overflow; ++#else ++ /* Grab a free bc rqst */ ++ spin_lock(&xprt->bc_pa_lock); ++ if (list_empty(&xprt->bc_pa_list)) { ++ spin_unlock(&xprt->bc_pa_lock); ++ goto out_overflow; ++ } ++ rqst = list_first_entry(&xprt->bc_pa_list, ++ struct rpc_rqst, rq_bc_pa_list); ++ list_del(&rqst->rq_bc_pa_list); ++ spin_unlock(&xprt->bc_pa_lock); ++#endif + + rqst->rq_reply_bytes_recvd = 0; + rqst->rq_xid = *p; +@@ -258,7 +397,9 @@ void rpcrdma_bc_receive_call(struct rpcr + */ + req = rpcr_to_rdmar(rqst); + req->rl_reply = rep; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_cb_call(r_xprt, rqst); ++#endif + + /* Queue rqst for ULP's callback service */ + bc_serv = xprt->bc_serv; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0258-BACKPORT-net-sunrpc-xprtrdma-frwr_ops.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0258-BACKPORT-net-sunrpc-xprtrdma-frwr_ops.c.patch new file mode 100644 index 0000000..977a3f9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0258-BACKPORT-net-sunrpc-xprtrdma-frwr_ops.c.patch @@ -0,0 +1,237 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/frwr_ops.c + +Change-Id: Id8f23c352af2a7de609983030d8eb4096cd83875 +--- + net/sunrpc/xprtrdma/frwr_ops.c | 50 ++++++++++++++++++++++++++++++++-- + 1 file changed, 48 insertions(+), 2 deletions(-) + +--- a/net/sunrpc/xprtrdma/frwr_ops.c ++++ b/net/sunrpc/xprtrdma/frwr_ops.c +@@ -43,7 +43,9 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + #ifdef CONFIG_NVFS + #define NVFS_FRWR +@@ -51,6 +53,12 @@ + #include "nvfs_rpc_rdma.h" + #endif + ++#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) ++#ifndef RPCDBG_FACILITY ++#define RPCDBG_FACILITY RPCDBG_TRANS ++#endif ++#endif ++ + static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_mr *mr) + { +@@ -63,7 +71,9 @@ static void frwr_cid_init(struct rpcrdma + static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) + { + if (mr->mr_device) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mr_unmap(mr); ++#endif + #ifdef CONFIG_NVFS + if (rpcrdma_nvfs_unmap_data(mr->mr_device->dma_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir)) +@@ -90,8 +100,10 @@ void frwr_mr_release(struct rpcrdma_mr * + frwr_mr_unmap(mr->mr_xprt, mr); + + rc = ib_dereg_mr(mr->mr_ibmr); ++#ifdef HAVE_TRACE_RPCRDMA_H + if (rc) + trace_xprtrdma_frwr_dereg(mr, rc); ++#endif + kfree(mr->mr_sg); + kfree(mr); + } +@@ -161,7 +173,9 @@ int frwr_mr_init(struct rpcrdma_xprt *r_ + + out_mr_err: + rc = PTR_ERR(frmr); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_frwr_alloc(mr, rc); ++#endif + return rc; + + out_list_err: +@@ -368,16 +382,22 @@ struct rpcrdma_mr_seg *frwr_map(struct r + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mr_map(mr); ++#endif + + return seg; + + out_dmamap_err: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_frwr_sgerr(mr, i); ++#endif + return ERR_PTR(-EIO); + + out_mapmr_err: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_frwr_maperr(mr, n); ++#endif + return ERR_PTR(-EIO); + } + +@@ -390,11 +410,13 @@ out_mapmr_err: + */ + static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) + { 
++#ifdef HAVE_TRACE_RPCRDMA_H + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid); ++#endif + + rpcrdma_flush_disconnect(cq->cq_context, wc); + } +@@ -424,7 +446,9 @@ int frwr_send(struct rpcrdma_xprt *r_xpr + num_wrs = 1; + post_wr = send_wr; + list_for_each_entry(mr, &req->rl_registered, mr_list) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mr_fastreg(mr); ++#endif + + mr->mr_cqe.done = frwr_wc_fastreg; + mr->mr_regwr.wr.next = post_wr; +@@ -445,10 +469,14 @@ int frwr_send(struct rpcrdma_xprt *r_xpr + ep->re_send_count -= num_wrs; + } + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_post_send(req); ++#endif + ret = ib_post_send(ep->re_id->qp, post_wr, NULL); ++#ifdef HAVE_TRACE_RPCRDMA_H + if (ret) + trace_xprtrdma_post_send_err(r_xprt, req, ret); ++#endif + return ret; + } + +@@ -465,7 +493,9 @@ void frwr_reminv(struct rpcrdma_rep *rep + list_for_each_entry(mr, mrs, mr_list) + if (mr->mr_handle == rep->rr_inv_rkey) { + list_del_init(&mr->mr_list); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mr_reminv(mr); ++#endif + frwr_mr_put(mr); + break; /* only one invalidated MR per RPC */ + } +@@ -489,7 +519,9 @@ static void frwr_wc_localinv(struct ib_c + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_wc_li(wc, &mr->mr_cid); ++#endif + frwr_mr_done(wc, mr); + + rpcrdma_flush_disconnect(cq->cq_context, wc); +@@ -508,7 +540,9 @@ static void frwr_wc_localinv_wake(struct + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); ++#endif + frwr_mr_done(wc, mr); + complete(&mr->mr_linv_done); + +@@ -528,7 +562,7 @@ static void frwr_wc_localinv_wake(struct + */ + void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) + { +- struct ib_send_wr *first, **prev, *last; ++ struct ib_send_wr *first, **prev, *last = NULL; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + const struct ib_send_wr *bad_wr; + struct rpcrdma_mr *mr; +@@ -542,7 +576,9 @@ void frwr_unmap_sync(struct rpcrdma_xprt + prev = &first; + mr = rpcrdma_mr_pop(&req->rl_registered); + do { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mr_localinv(mr); ++#endif + r_xprt->rx_stats.local_inv_needed++; + + last = &mr->mr_invwr; +@@ -585,8 +621,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt + if (!rc) + return; + ++#ifdef HAVE_TRACE_RPCRDMA_H + /* On error, the MRs get destroyed once the QP has drained. */ + trace_xprtrdma_post_linv_err(req, rc); ++#endif + + /* Force a connection loss to ensure complete recovery. 
+ */ +@@ -605,8 +643,10 @@ static void frwr_wc_localinv_done(struct + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + struct rpcrdma_rep *rep; + ++#ifdef HAVE_TRACE_RPCRDMA_H + /* WARNING: Only wr_cqe and status are reliable at this point */ + trace_xprtrdma_wc_li_done(wc, &mr->mr_cid); ++#endif + + /* Ensure that @rep is generated before the MR is released */ + rep = mr->mr_req->rl_reply; +@@ -634,7 +674,7 @@ static void frwr_wc_localinv_done(struct + */ + void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) + { +- struct ib_send_wr *first, *last, **prev; ++ struct ib_send_wr *first, *last = NULL, **prev; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rpcrdma_mr *mr; + int rc; +@@ -645,7 +685,9 @@ void frwr_unmap_async(struct rpcrdma_xpr + prev = &first; + mr = rpcrdma_mr_pop(&req->rl_registered); + do { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mr_localinv(mr); ++#endif + r_xprt->rx_stats.local_inv_needed++; + + last = &mr->mr_invwr; +@@ -678,8 +720,10 @@ void frwr_unmap_async(struct rpcrdma_xpr + if (!rc) + return; + ++#ifdef HAVE_TRACE_RPCRDMA_H + /* On error, the MRs get destroyed once the QP has drained. */ + trace_xprtrdma_post_linv_err(req, rc); ++#endif + + /* The final LOCAL_INV WR in the chain is supposed to + * do the wake. If it was never posted, the wake does +@@ -716,7 +760,9 @@ int frwr_wp_create(struct rpcrdma_xprt * + seg.mr_offset = offset_in_page(ep->re_write_pad); + if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) + return -EIO; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mr_fastreg(mr); ++#endif + + mr->mr_cqe.done = frwr_wc_fastreg; + mr->mr_regwr.wr.next = NULL; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0259-BACKPORT-net-sunrpc-xprtrdma-module.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0259-BACKPORT-net-sunrpc-xprtrdma-module.c.patch new file mode 100644 index 0000000..d9b91e3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0259-BACKPORT-net-sunrpc-xprtrdma-module.c.patch @@ -0,0 +1,29 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/module.c + +Change-Id: I48b51e371cc9af9c37616be08a9436b6b21ad96b +--- + net/sunrpc/xprtrdma/module.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/net/sunrpc/xprtrdma/module.c ++++ b/net/sunrpc/xprtrdma/module.c +@@ -17,14 +17,18 @@ + #include "xprt_rdma.h" + + #define CREATE_TRACE_POINTS ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); + MODULE_DESCRIPTION("RPC/RDMA Transport"); + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_ALIAS("svcrdma"); + MODULE_ALIAS("xprtrdma"); ++#ifdef HAVE_XPRT_CLASS_NETID + MODULE_ALIAS("rpcrdma6"); ++#endif + + static void __exit rpc_rdma_cleanup(void) + { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0260-BACKPORT-net-sunrpc-xprtrdma-rpc_rdma.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0260-BACKPORT-net-sunrpc-xprtrdma-rpc_rdma.c.patch new file mode 100644 index 0000000..eff1b76 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0260-BACKPORT-net-sunrpc-xprtrdma-rpc_rdma.c.patch @@ -0,0 +1,460 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/rpc_rdma.c + +Change-Id: Ie041c79fd73b40f605d2afff7787484c5b4c64a0 +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 182 ++++++++++++++++++++++++++++++++- + 1 file changed, 180 insertions(+), 2 deletions(-) + +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -52,8 +52,15 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + 
#include ++#endif + ++#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) ++#ifndef RPCDBG_FACILITY ++#define RPCDBG_FACILITY RPCDBG_TRANS ++#endif ++#endif + /* Returns size of largest RPC-over-RDMA header in a Call message + * + * The largest Call header contains a full-size Read list and a +@@ -307,11 +314,19 @@ static struct rpcrdma_mr_seg *rpcrdma_mr + } + + rpcrdma_mr_push(*mr, &req->rl_registered); ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); ++#else ++ return frwr_map(r_xprt, seg, nsegs, writing, req->rl_xid, *mr); ++#endif + + out_getmr_err: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_nomrs_err(r_xprt, req); ++#endif ++#ifdef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG + xprt_wait_for_buffer_space(&r_xprt->rx_xprt); ++#endif + rpcrdma_mrs_refresh(r_xprt); + return ERR_PTR(-EAGAIN); + } +@@ -361,7 +376,9 @@ static int rpcrdma_encode_read_list(stru + if (encode_read_segment(xdr, mr, pos) < 0) + return -EMSGSIZE; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); ++#endif + r_xprt->rx_stats.read_chunk_count++; + nsegs -= mr->mr_nents; + } while (nsegs); +@@ -425,7 +442,9 @@ static int rpcrdma_encode_write_list(str + if (encode_rdma_segment(xdr, mr) < 0) + return -EMSGSIZE; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); ++#endif + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; +@@ -436,8 +455,10 @@ static int rpcrdma_encode_write_list(str + if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) + return -EMSGSIZE; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, + nsegs); ++#endif + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; +@@ -503,7 +524,9 @@ static int rpcrdma_encode_reply_chunk(st + if (encode_rdma_segment(xdr, mr) < 0) + return -EMSGSIZE; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); ++#endif + r_xprt->rx_stats.reply_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; +@@ -625,7 +648,9 @@ static bool rpcrdma_prepare_pagelist(str + return true; + + out_mapping_err: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_dma_maperr(sge->addr); ++#endif + return false; + } + +@@ -653,7 +678,9 @@ static bool rpcrdma_prepare_tail_iov(str + return true; + + out_mapping_err: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_dma_maperr(sge->addr); ++#endif + return false; + } + +@@ -833,7 +860,9 @@ inline int rpcrdma_prepare_send_sges(str + out_unmap: + rpcrdma_sendctx_unmap(req->rl_sendctx); + out_nosc: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_prepsend_failed(&req->rl_slot, ret); ++#endif + return ret; + } + +@@ -867,15 +896,23 @@ rpcrdma_marshal_req(struct rpcrdma_xprt + __be32 *p; + int ret; + ++#ifdef HAVE_XDRBUF_SPARSE_PAGES + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { ++#endif + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; ++#ifdef HAVE_XDRBUF_SPARSE_PAGES + } ++#endif + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); ++#ifdef HAVE_XDR_INIT_ENCODE_RQST_ARG + xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), + rqst); ++#else ++ xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf)); ++#endif + + /* Fixed header fields */ + ret = -EMSGSIZE; +@@ -891,7 +928,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt + * is not allowed. 
+ */ + ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH, +- &rqst->rq_cred->cr_auth->au_flags); ++ (const void *)&rqst->rq_cred->cr_auth->au_flags); + + /* + * Chunks needed for results? +@@ -937,6 +974,14 @@ rpcrdma_marshal_req(struct rpcrdma_xprt + rtype = rpcrdma_areadch; + } + ++#if !defined(HAVE_RPC_XPRT_OPS_FREE_SLOT) || !defined(HAVE_XPRT_PIN_RQST) ++ req->rl_xid = rqst->rq_xid; ++#endif ++ ++#ifndef HAVE_XPRT_PIN_RQST ++ rpcrdma_insert_req(&r_xprt->rx_buf, req); ++#endif ++ + /* This implementation supports the following combinations + * of chunk lists in one RPC-over-RDMA Call message: + * +@@ -974,11 +1019,20 @@ rpcrdma_marshal_req(struct rpcrdma_xprt + if (ret) + goto out_err; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_marshal(req, rtype, wtype); ++#endif + return 0; + + out_err: ++#ifndef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG ++ if (ret == -EAGAIN) ++ xprt_wait_for_buffer_space(rqst->rq_task, NULL); ++#endif ++ ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_marshal_failed(rqst, ret); ++#endif + r_xprt->rx_stats.failed_marshal_count++; + frwr_reset(req); + return ret; +@@ -1013,7 +1067,9 @@ void rpcrdma_reset_cwnd(struct rpcrdma_x + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock(&xprt->transport_lock); ++#ifdef HAVE_XPRT_REQUEST_GET_CONG + xprt->cong = 0; ++#endif + __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1); + spin_unlock(&xprt->transport_lock); + } +@@ -1107,8 +1163,10 @@ rpcrdma_inline_fixup(struct rpc_rqst *rq + rqst->rq_private_buf.tail[0].iov_base = srcp; + } + ++#ifdef HAVE_TRACE_RPCRDMA_H + if (fixup_copy_count) + trace_xprtrdma_fixup(rqst, fixup_copy_count); ++#endif + return fixup_copy_count; + } + +@@ -1171,7 +1229,9 @@ static int decode_rdma_segment(struct xd + return -EIO; + + xdr_decode_rdma_segment(p, &handle, length, &offset); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_decode_seg(handle, *length, offset); ++#endif + return 0; + } + +@@ -1324,13 +1384,19 @@ rpcrdma_decode_error(struct rpcrdma_xprt + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); + if (!p) + break; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_err_vers(rqst, p, p + 1); ++#endif + break; + case err_chunk: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_err_chunk(rqst); ++#endif + break; ++#ifdef HAVE_TRACE_RPCRDMA_H + default: + trace_xprtrdma_err_unrecognized(rqst, p); ++#endif + } + + return -EIO; +@@ -1346,16 +1412,28 @@ rpcrdma_decode_error(struct rpcrdma_xprt + */ + void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep) + { ++#ifdef HAVE_XPRT_PIN_RQST + struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt; ++#endif + struct rpc_rqst *rqst = rep->rr_rqst; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + req->rl_reply = NULL; + rep->rr_rqst = NULL; + ++#ifdef HAVE_XPRT_PIN_RQST ++#ifdef HAVE_XPRT_QUEUE_LOCK + spin_lock(&xprt->queue_lock); ++#else ++ spin_lock(&xprt->recv_lock); ++#endif + xprt_unpin_rqst(rqst); ++#ifdef HAVE_XPRT_QUEUE_LOCK + spin_unlock(&xprt->queue_lock); ++#else ++ spin_unlock(&xprt->recv_lock); ++#endif ++#endif + } + + /** +@@ -1369,7 +1447,9 @@ void rpcrdma_unpin_rqst(struct rpcrdma_r + void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) + { + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; ++#ifdef HAVE_XPRT_PIN_RQST + struct rpc_xprt *xprt = &r_xprt->rx_xprt; ++#endif + struct rpc_rqst *rqst = rep->rr_rqst; + int status; + +@@ -1390,20 +1470,39 @@ void rpcrdma_complete_rqst(struct rpcrdm + goto out_badheader; + + out: ++#ifdef HAVE_XPRT_PIN_RQST ++#ifdef HAVE_XPRT_QUEUE_LOCK + spin_lock(&xprt->queue_lock); ++#else ++ spin_lock(&xprt->recv_lock); 
++#endif ++#endif /* HAVE_XPRT_PIN_RQST */ ++ + xprt_complete_rqst(rqst->rq_task, status); ++ ++#ifdef HAVE_XPRT_PIN_RQST + xprt_unpin_rqst(rqst); ++ ++#ifdef HAVE_XPRT_QUEUE_LOCK + spin_unlock(&xprt->queue_lock); ++#else ++ spin_unlock(&xprt->recv_lock); ++#endif ++#endif /* HAVE_XPRT_PIN_RQST */ ++ + return; + + out_badheader: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_reply_hdr_err(rep); ++#endif + r_xprt->rx_stats.bad_reply_count++; + rqst->rq_task->tk_status = status; + status = 0; + goto out; + } + ++#ifdef HAVE_XPRT_PIN_RQST + static void rpcrdma_reply_done(struct kref *kref) + { + struct rpcrdma_req *req = +@@ -1411,6 +1510,7 @@ static void rpcrdma_reply_done(struct kr + + rpcrdma_complete_rqst(req->rl_reply); + } ++#endif + + /** + * rpcrdma_reply_handler - Process received RPC/RDMA messages +@@ -1436,8 +1536,13 @@ void rpcrdma_reply_handler(struct rpcrdm + xprt->reestablish_timeout = 0; + + /* Fixed transport header fields */ ++#ifdef HAVE_XDR_INIT_DECODE_RQST_ARG + xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, + rep->rr_hdrbuf.head[0].iov_base, NULL); ++#else ++ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, ++ rep->rr_hdrbuf.head[0].iov_base); ++#endif + p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); + if (unlikely(!p)) + goto out_shortreply; +@@ -1455,12 +1560,36 @@ void rpcrdma_reply_handler(struct rpcrdm + /* Match incoming rpcrdma_rep to an rpcrdma_req to + * get context for handling any incoming chunks. + */ ++#ifdef HAVE_XPRT_PIN_RQST ++#ifdef HAVE_XPRT_QUEUE_LOCK + spin_lock(&xprt->queue_lock); ++#else ++ spin_lock(&xprt->recv_lock); ++#endif + rqst = xprt_lookup_rqst(xprt, rep->rr_xid); + if (!rqst) + goto out_norqst; ++ + xprt_pin_rqst(rqst); ++#ifdef HAVE_XPRT_QUEUE_LOCK + spin_unlock(&xprt->queue_lock); ++#else ++ spin_unlock(&xprt->recv_lock); ++#endif ++ ++ req = rpcr_to_rdmar(rqst); ++#else /* HAVE_XPRT_PIN_RQST */ ++ spin_lock(&buf->rb_lock); ++ req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, rep->rr_xid); ++ if (!req) { ++ spin_unlock(&buf->rb_lock); ++ goto out; ++ } ++ ++ /* Avoid races with signals and duplicate replies ++ * by marking this req as matched. 
++ */ ++#endif /* HAVE_XPRT_PIN_RQST */ + + if (credits == 0) + credits = 1; /* don't deadlock */ +@@ -1471,14 +1600,25 @@ void rpcrdma_reply_handler(struct rpcrdm + if (buf->rb_credits != credits) + rpcrdma_update_cwnd(r_xprt, credits); + +- req = rpcr_to_rdmar(rqst); + if (unlikely(req->rl_reply)) ++#ifdef HAVE_XPRT_PIN_RQST + rpcrdma_rep_put(buf, req->rl_reply); ++#else ++ rpcrdma_recv_buffer_put_locked(req->rl_reply); ++#endif + req->rl_reply = rep; ++ ++#ifdef HAVE_XPRT_PIN_RQST + rep->rr_rqst = rqst; ++#else ++ spin_unlock(&buf->rb_lock); ++#endif + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_reply(rqst->rq_task, rep, credits); ++#endif + ++#ifdef HAVE_XPRT_PIN_RQST + if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) + frwr_reminv(rep, &req->rl_registered); + if (!list_empty(&req->rl_registered)) +@@ -1486,19 +1626,57 @@ void rpcrdma_reply_handler(struct rpcrdm + /* LocalInv completion will complete the RPC */ + else + kref_put(&req->rl_kref, rpcrdma_reply_done); ++#else ++#ifdef HAVE_RPC_XPRT_RECV_LOCK ++ spin_lock(&xprt->recv_lock); ++#else ++ spin_lock_bh(&xprt->transport_lock); ++#endif ++ ++ rqst = xprt_lookup_rqst(xprt, rep->rr_xid); ++ if (!rqst) { ++#ifdef HAVE_RPC_XPRT_RECV_LOCK ++ spin_unlock(&xprt->recv_lock); ++#else ++ spin_unlock_bh(&xprt->transport_lock); ++#endif ++ goto out; ++ } ++ ++ rep->rr_rqst = rqst; ++ rpcrdma_complete_rqst(rep); ++#ifdef HAVE_RPC_XPRT_RECV_LOCK ++ spin_unlock(&xprt->recv_lock); ++#else ++ spin_unlock_bh(&xprt->transport_lock); ++#endif ++#endif /* HAVE_XPRT_PIN_RQST */ ++ + return; + + out_badversion: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_reply_vers_err(rep); ++#endif + goto out; + ++#ifdef HAVE_XPRT_PIN_RQST + out_norqst: ++#ifdef HAVE_XPRT_QUEUE_LOCK + spin_unlock(&xprt->queue_lock); ++#else ++ spin_unlock(&xprt->recv_lock); ++#endif ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_reply_rqst_err(rep); ++#endif + goto out; ++#endif + + out_shortreply: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_reply_short_err(rep); ++#endif + + out: + rpcrdma_rep_put(buf, rep); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0261-BACKPORT-net-sunrpc-xprtrdma-svc_rdma.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0261-BACKPORT-net-sunrpc-xprtrdma-svc_rdma.c.patch new file mode 100644 index 0000000..4e91b69 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0261-BACKPORT-net-sunrpc-xprtrdma-svc_rdma.c.patch @@ -0,0 +1,55 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/svc_rdma.c + +Change-Id: Ie8b2886b865b398a8b6cc3a24595e6d4b90bb763 +--- + net/sunrpc/xprtrdma/svc_rdma.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +--- a/net/sunrpc/xprtrdma/svc_rdma.c ++++ b/net/sunrpc/xprtrdma/svc_rdma.c +@@ -75,7 +75,11 @@ enum { + }; + + static int svcrdma_counter_handler(struct ctl_table *table, int write, ++#ifdef HAVE_CGROUP_BPF_RUN_FILTER_SYSCTL_7_PARAMETERS + void *buffer, size_t *lenp, loff_t *ppos) ++#else ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++#endif + { + struct percpu_counter *stat = (struct percpu_counter *)table->data; + char tmp[SVCRDMA_COUNTER_BUFSIZ + 1]; +@@ -98,8 +102,13 @@ static int svcrdma_counter_handler(struc + len -= *ppos; + if (len > *lenp) + len = *lenp; ++#ifdef HAVE_CGROUP_BPF_RUN_FILTER_SYSCTL_7_PARAMETERS + if (len) + memcpy(buffer, tmp, len); ++#else ++ if (len && copy_to_user(buffer, tmp, len)) ++ return -EFAULT; ++#endif + *lenp = len; + *ppos += len; + +@@ -276,6 +285,9 @@ out_err: + void svc_rdma_cleanup(void) + { + dprintk("SVCRDMA Module Removed, deregister RPC RDMA 
transport\n");
++#if defined(CONFIG_SUNRPC_BACKCHANNEL) && defined(HAVE_RPC_XPRT_OPS_BC_UP)
++ svc_unreg_xprt_class(&svc_rdma_bc_class);
++#endif
+ svc_unreg_xprt_class(&svc_rdma_class);
+ svc_rdma_proc_cleanup();
+ }
+@@ -296,5 +308,8 @@ int svc_rdma_init(void)
+
+ /* Register RDMA with the SVC transport switch */
+ svc_reg_xprt_class(&svc_rdma_class);
++#if defined(CONFIG_SUNRPC_BACKCHANNEL) && defined(HAVE_RPC_XPRT_OPS_BC_UP)
++ svc_reg_xprt_class(&svc_rdma_bc_class);
++#endif
+ return 0;
+ }
diff --git a/src/mlnx-ofa_kernel-5.8/backports/0262-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_backchannel.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0262-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_backchannel.c.patch
new file mode 100644
index 0000000..6cd1b44
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/backports/0262-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_backchannel.c.patch
@@ -0,0 +1,172 @@
+From: Valentine Fatiev
+Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+
+Change-Id: I7bf2b55e064b74340fa027b9830ed01666ae8b27
+---
+ net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 63 +++++++++++++++++++++-
+ 1 file changed, 62 insertions(+), 1 deletion(-)
+
+--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+@@ -8,7 +8,9 @@
+ #include
+
+ #include "xprt_rdma.h"
++#ifdef HAVE_TRACE_RPCRDMA_H
+ #include
++#endif
+
+ /**
+ * svc_rdma_handle_bc_reply - Process incoming backchannel Reply
+@@ -24,11 +26,21 @@ void svc_rdma_handle_bc_reply(struct svc
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct xdr_buf *rcvbuf = &rqstp->rq_arg;
+ struct kvec *dst, *src = &rcvbuf->head[0];
++#ifdef HAVE_SVC_FILL_WRITE_VECTOR
+ __be32 *rdma_resp = rctxt->rc_recv_buf;
++#else
++ __be32 *rdma_resp = (__be32 *)src->iov_base;
++#endif
+ struct rpc_rqst *req;
+ u32 credits;
+
++#ifdef HAVE_XPRT_QUEUE_LOCK
+ spin_lock(&xprt->queue_lock);
++#elif defined HAVE_RPC_XPRT_RECV_LOCK
++ spin_lock(&xprt->recv_lock);
++#else /* HAVE_XPRT_PIN_RQST is undefined in this case */
++ spin_lock_bh(&xprt->transport_lock);
++#endif
+ req = xprt_lookup_rqst(xprt, *rdma_resp);
+ if (!req)
+ goto out_unlock;
+@@ -38,25 +50,52 @@ void svc_rdma_handle_bc_reply(struct svc
+ if (dst->iov_len < src->iov_len)
+ goto out_unlock;
+ memcpy(dst->iov_base, src->iov_base, src->iov_len);
++
++#ifdef HAVE_XPRT_PIN_RQST
+ xprt_pin_rqst(req);
++
++#ifdef HAVE_XPRT_QUEUE_LOCK
+ spin_unlock(&xprt->queue_lock);
++#else
++ spin_unlock(&xprt->recv_lock);
++#endif
++#endif /* HAVE_XPRT_PIN_RQST */
+
+ credits = be32_to_cpup(rdma_resp + 2);
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
+ credits = r_xprt->rx_buf.rb_bc_max_requests;
++#if defined(HAVE_RPC_XPRT_RECV_LOCK)|| defined(HAVE_XPRT_QUEUE_LOCK)
+ spin_lock(&xprt->transport_lock);
++#endif
+ xprt->cwnd = credits << RPC_CWNDSHIFT;
++#if defined(HAVE_RPC_XPRT_RECV_LOCK)|| defined(HAVE_XPRT_QUEUE_LOCK)
+ spin_unlock(&xprt->transport_lock);
++#endif
+
++#ifdef HAVE_XPRT_PIN_RQST
++#ifdef HAVE_XPRT_QUEUE_LOCK
+ spin_lock(&xprt->queue_lock);
++#else
++ spin_lock(&xprt->recv_lock);
++#endif
++#endif /* HAVE_XPRT_PIN_RQST */
+ xprt_complete_rqst(req->rq_task, rcvbuf->len);
++
++#ifdef HAVE_XPRT_PIN_RQST
+ xprt_unpin_rqst(req);
++#endif
+ rcvbuf->len = 0;
+
+ out_unlock:
++#ifdef HAVE_XPRT_QUEUE_LOCK
+ spin_unlock(&xprt->queue_lock);
++#elif defined HAVE_RPC_XPRT_RECV_LOCK
++ spin_unlock(&xprt->recv_lock);
++#else
++ spin_unlock_bh(&xprt->transport_lock);
++#endif
+ }
+
+ /* Send a reverse-direction RPC Call.
+@@ -76,8 +115,9 @@ static int svc_rdma_bc_sendto(struct svc
+ struct rpc_rqst *rqst,
+ struct svc_rdma_send_ctxt *sctxt)
+ {
+- struct svc_rdma_recv_ctxt *rctxt;
+ int ret;
++#ifdef HAVE_SVC_RDMA_PCL
++ struct svc_rdma_recv_ctxt *rctxt;
+
+ rctxt = svc_rdma_recv_ctxt_get(rdma);
+ if (!rctxt)
+@@ -85,6 +125,10 @@ static int svc_rdma_bc_sendto(struct svc
+
+ ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqst->rq_snd_buf);
+ svc_rdma_recv_ctxt_put(rdma, rctxt);
++#else
++
++ ret = svc_rdma_map_reply_msg(rdma, sctxt, NULL, &rqst->rq_snd_buf);
++#endif
+ if (ret < 0)
+ return -EIO;
+
+@@ -186,8 +230,14 @@ drop_connection:
+ * %0 if the message was sent successfully
+ * %ENOTCONN if the message was not sent
+ */
++#ifdef HAVE_XPRT_OPS_SEND_REQUEST_RQST_ARG
+ static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
+ {
++#else
++static int xprt_rdma_bc_send_request(struct rpc_task *task)
++{
++ struct rpc_rqst *rqst = task->tk_rqstp;
++#endif
+ struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+ struct svcxprt_rdma *rdma =
+ container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+@@ -198,7 +248,11 @@ static int xprt_rdma_bc_send_request(str
+
+ ret = rpcrdma_bc_send_request(rdma, rqst);
+ if (ret == -ENOTCONN)
++#ifdef HAVE_SVC_XPRT_CLOSE
++ svc_xprt_close(sxprt);
++#else
+ svc_close_xprt(sxprt);
++#endif
+ return ret;
+ }
+
+@@ -216,16 +270,27 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt)
+ xprt_free(xprt);
+ }
+
++#ifdef HAVE_RPC_XPRT_OPS_CONST
+ static const struct rpc_xprt_ops xprt_rdma_bc_procs = {
++#else
++static struct rpc_xprt_ops xprt_rdma_bc_procs = {
++#endif
+ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong,
+ .alloc_slot = xprt_alloc_slot,
++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT
+ .free_slot = xprt_free_slot,
++#endif
+ .release_request = xprt_release_rqst_cong,
+ .buf_alloc = xprt_rdma_bc_allocate,
+ .buf_free = xprt_rdma_bc_free,
+ .send_request = xprt_rdma_bc_send_request,
++#ifdef HAVE_RPC_XPRT_OPS_SET_RETRANS_TIMEOUT
++ .set_retrans_timeout = xprt_set_retrans_timeout_def,
++#endif
++#ifdef HAVE_RPC_XPRT_OPS_WAIT_FOR_REPLY_REQUEST
+ .wait_for_reply_request = xprt_wait_for_reply_request_def,
++#endif
+ .close = xprt_rdma_bc_close,
+ .destroy = xprt_rdma_bc_put,
+ .print_stats = xprt_rdma_print_stats
diff --git a/src/mlnx-ofa_kernel-5.8/backports/0263-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_pcl.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0263-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_pcl.c.patch
new file mode 100644
index 0000000..2ae3778
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/backports/0263-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_pcl.c.patch
@@ -0,0 +1,62 @@
+From: Valentine Fatiev
+Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/svc_rdma_pcl.c
+
+Change-Id: I38d5097850b7fd4a3024b6ee03917c46ac66950f
+---
+ net/sunrpc/xprtrdma/svc_rdma_pcl.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/net/sunrpc/xprtrdma/svc_rdma_pcl.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_pcl.c
+@@ -3,11 +3,14 @@
+ * Copyright (c) 2020 Oracle. All rights reserved.
+ */ + ++#ifdef HAVE_SVC_RDMA_PCL + #include + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + /** + * pcl_free - Release all memory associated with a parsed chunk list +@@ -76,7 +79,9 @@ static void pcl_set_read_segment(const s + segment->rs_length = length; + segment->rs_offset = offset; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_rseg(&rctxt->rc_cid, chunk, segment); ++#endif + + chunk->ch_length += length; + chunk->ch_segcount++; +@@ -220,7 +225,9 @@ bool pcl_alloc_write(struct svc_rdma_rec + p = xdr_decode_rdma_segment(p, &segment->rs_handle, + &segment->rs_length, + &segment->rs_offset); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_wseg(&rctxt->rc_cid, chunk, j); ++#endif + + chunk->ch_length += segment->rs_length; + chunk->ch_segcount++; +@@ -238,7 +245,11 @@ static int pcl_process_region(const stru + + if (!length) + return 0; ++#ifdef HAVE_XDR_BUF_SUBSEGMENT_CONST + if (xdr_buf_subsegment(xdr, &subbuf, offset, length)) ++#else ++ if (xdr_buf_subsegment((struct xdr_buf *)(void *)xdr, &subbuf, offset, length)) ++#endif + return -EMSGSIZE; + return actor(&subbuf, data); + } +@@ -304,3 +315,4 @@ int pcl_process_nonpayloads(const struct + + return 0; + } ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0264-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_recvfrom.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0264-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_recvfrom.c.patch new file mode 100644 index 0000000..80b58b5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0264-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_recvfrom.c.patch @@ -0,0 +1,1013 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/svc_rdma_recvfrom.c + +Change-Id: Ib3001f0bb9fb665176ad3ada55c7e9588f48b9fb +--- + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 607 +++++++++++++++++++++++- + 1 file changed, 606 insertions(+), 1 deletion(-) + +--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +@@ -92,7 +92,9 @@ + * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst. 
+ */ + ++#ifdef HAVE_SVC_RDMA_PCL + #include ++#endif + #include + #include + #include +@@ -104,7 +106,9 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc); + +@@ -115,6 +119,7 @@ svc_rdma_next_recv_ctxt(struct list_head + rc_list); + } + ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) + { +@@ -141,10 +146,12 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_ + goto fail2; + + svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); ++#ifdef HAVE_SVC_RDMA_PCL + pcl_init(&ctxt->rc_call_pcl); + pcl_init(&ctxt->rc_read_pcl); + pcl_init(&ctxt->rc_write_pcl); + pcl_init(&ctxt->rc_reply_pcl); ++#endif + + ctxt->rc_recv_wr.next = NULL; + ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; +@@ -174,6 +181,7 @@ static void svc_rdma_recv_ctxt_destroy(s + kfree(ctxt->rc_recv_buf); + kfree(ctxt); + } ++#endif + + /** + * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt +@@ -183,14 +191,21 @@ static void svc_rdma_recv_ctxt_destroy(s + void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) + { + struct svc_rdma_recv_ctxt *ctxt; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + struct llist_node *node; + + while ((node = llist_del_first(&rdma->sc_recv_ctxts))) { + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); + svc_rdma_recv_ctxt_destroy(rdma, ctxt); ++#else ++ while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { ++ list_del(&ctxt->rc_list); ++ kfree(ctxt); ++#endif + } + } + ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_recv_ctxt_get - Allocate a recv_ctxt + * @rdma: controlling svcxprt_rdma +@@ -198,26 +213,66 @@ void svc_rdma_recv_ctxts_destroy(struct + * Returns a recv_ctxt or (rarely) NULL if none are available. 
+ */ + struct svc_rdma_recv_ctxt *svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) ++#else ++static struct svc_rdma_recv_ctxt * ++svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) ++#endif + { + struct svc_rdma_recv_ctxt *ctxt; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + struct llist_node *node; + + node = llist_del_first(&rdma->sc_recv_ctxts); + if (!node) + goto out_empty; + ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node); +- ++#else ++ spin_lock(&rdma->sc_recv_lock); ++ ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts); ++ if (!ctxt) ++ goto out_empty; ++ list_del(&ctxt->rc_list); ++ spin_unlock(&rdma->sc_recv_lock); ++#endif ++ + out: ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++ ctxt->rc_recv_wr.num_sge = 0; ++#endif + ctxt->rc_page_count = 0; ++#ifndef HAVE_SVC_RDMA_PCL ++ ctxt->rc_read_payload_length = 0; ++#endif + return ctxt; + + out_empty: ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + ctxt = svc_rdma_recv_ctxt_alloc(rdma); ++#else ++ spin_unlock(&rdma->sc_recv_lock); ++ ++ ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); ++#endif + if (!ctxt) + return NULL; + goto out; + } + ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++static void svc_rdma_recv_ctxt_unmap(struct svcxprt_rdma *rdma, ++ struct svc_rdma_recv_ctxt *ctxt) ++{ ++ struct ib_device *device = rdma->sc_cm_id->device; ++ int i; ++ ++ for (i = 0; i < ctxt->rc_recv_wr.num_sge; i++) ++ ib_dma_unmap_page(device, ++ ctxt->rc_sges[i].addr, ++ ctxt->rc_sges[i].length, ++ DMA_FROM_DEVICE); ++} ++#endif ++ + /** + * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list + * @rdma: controlling svcxprt_rdma +@@ -227,17 +282,33 @@ out_empty: + void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) + { ++#ifndef HAVE_SVC_RDMA_PCL ++ unsigned int i; ++ ++ for (i = 0; i < ctxt->rc_page_count; i++) ++ put_page(ctxt->rc_pages[i]); ++#endif ++ ++#ifdef HAVE_SVC_RDMA_PCL + pcl_free(&ctxt->rc_call_pcl); + pcl_free(&ctxt->rc_read_pcl); + pcl_free(&ctxt->rc_write_pcl); + pcl_free(&ctxt->rc_reply_pcl); ++#endif + ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + if (!ctxt->rc_temp) + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); + else + svc_rdma_recv_ctxt_destroy(rdma, ctxt); ++#else ++ spin_lock(&rdma->sc_recv_lock); ++ list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); ++ spin_unlock(&rdma->sc_recv_lock); ++#endif + } + ++#ifdef HAVE_SVC_RDMA_RELEASE_RQST + /** + * svc_rdma_release_rqst - Release transport-specific per-rqst resources + * @rqstp: svc_rqst being released +@@ -257,7 +328,9 @@ void svc_rdma_release_rqst(struct svc_rq + if (ctxt) + svc_rdma_recv_ctxt_put(rdma, ctxt); + } ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, + unsigned int wanted, bool temp) + { +@@ -275,11 +348,15 @@ static bool svc_rdma_refresh_recvs(struc + if (!ctxt) + break; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_post_recv(ctxt); ++#endif + ctxt->rc_temp = temp; + ctxt->rc_recv_wr.next = recv_chain; + recv_chain = &ctxt->rc_recv_wr; ++#ifdef HAVE_SVCXPRT_RDMA_SC_PENDING_RECVS + rdma->sc_pending_recvs++; ++#endif + } + if (!recv_chain) + return false; +@@ -290,7 +367,9 @@ static bool svc_rdma_refresh_recvs(struc + return true; + + err_free: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_rq_post_err(rdma, ret); ++#endif + while (bad_wr) { + ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt, + rc_recv_wr); +@@ -301,7 +380,106 @@ err_free: + * sc_pending_recvs. 
*/ + return false; + } ++#else ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR ++static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, ++ struct svc_rdma_recv_ctxt *ctxt) ++{ ++ int ret; ++ ++#ifdef HAVE_TRACE_RPCRDMA_H ++ trace_svcrdma_post_recv(ctxt); ++#endif ++ ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); ++ if (ret) ++ goto err_post; ++ return 0; ++ ++err_post: ++#ifdef HAVE_TRACE_RPCRDMA_H ++ trace_svcrdma_rq_post_err(rdma, ret); ++#endif ++ svc_rdma_recv_ctxt_put(rdma, ctxt); ++ return ret; ++} ++ ++static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) ++{ ++ struct svc_rdma_recv_ctxt *ctxt; ++ ++ if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) ++ return 0; ++ ctxt = svc_rdma_recv_ctxt_get(rdma); ++ if (!ctxt) ++ return -ENOMEM; ++ return __svc_rdma_post_recv(rdma, ctxt); ++} ++#else ++static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) ++{ ++ struct ib_device *device = rdma->sc_cm_id->device; ++ struct svc_rdma_recv_ctxt *ctxt; ++ int sge_no, buflen, ret; ++ struct page *page; ++ dma_addr_t pa; ++ ++ ctxt = svc_rdma_recv_ctxt_get(rdma); ++ if (!ctxt) ++ return -ENOMEM; ++ ++ buflen = 0; ++ ctxt->rc_cqe.done = svc_rdma_wc_receive; ++ for (sge_no = 0; buflen < rdma->sc_max_req_size; sge_no++) { ++ if (sge_no >= rdma->sc_max_send_sges) { ++ pr_err("svcrdma: Too many sges (%d)\n", sge_no); ++ goto err_put_ctxt; ++ } + ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ goto err_put_ctxt; ++ ctxt->rc_pages[sge_no] = page; ++ ctxt->rc_page_count++; ++ ++ pa = ib_dma_map_page(device, ctxt->rc_pages[sge_no], ++ 0, PAGE_SIZE, DMA_FROM_DEVICE); ++ if (ib_dma_mapping_error(device, pa)) ++ goto err_put_ctxt; ++ ctxt->rc_sges[sge_no].addr = pa; ++ ctxt->rc_sges[sge_no].length = PAGE_SIZE; ++ ctxt->rc_sges[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ++ ctxt->rc_recv_wr.num_sge++; ++ ++ buflen += PAGE_SIZE; ++ } ++ ctxt->rc_recv_wr.next = NULL; ++ ctxt->rc_recv_wr.sg_list = &ctxt->rc_sges[0]; ++ ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; ++ ++#ifdef HAVE_TRACE_RPCRDMA_H ++ trace_svcrdma_post_recv(ctxt); ++#endif ++ ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); ++ if (ret) ++ goto err_post; ++ return 0; ++ ++err_put_ctxt: ++ svc_rdma_recv_ctxt_unmap(rdma, ctxt); ++ svc_rdma_recv_ctxt_put(rdma, ctxt); ++ return -ENOMEM; ++ ++err_post: ++ svc_rdma_recv_ctxt_unmap(rdma, ctxt); ++#ifdef HAVE_TRACE_RPCRDMA_H ++ trace_svcrdma_rq_post_err(rdma, ret); ++#endif ++ svc_rdma_recv_ctxt_put(rdma, ctxt); ++ return ret; ++} ++#endif /* end of HAVE_SVC_FILL_WRITE_VECTOR */ ++#endif /* end of HAVE_SVC_RDMA_PCL */ ++ + /** + * svc_rdma_post_recvs - Post initial set of Recv WRs + * @rdma: fresh svcxprt_rdma +@@ -310,7 +488,30 @@ err_free: + */ + bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) + { ++#ifdef HAVE_SVC_RDMA_PCL + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); ++#else ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR ++ struct svc_rdma_recv_ctxt *ctxt; ++#endif ++ unsigned int i; ++ int ret; ++ ++ for (i = 0; i < rdma->sc_max_requests; i++) { ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR ++ ctxt = svc_rdma_recv_ctxt_get(rdma); ++ if (!ctxt) ++ return false; ++ ctxt->rc_temp = true; ++ ret = __svc_rdma_post_recv(rdma, ctxt); ++#else ++ ret = svc_rdma_post_recv(rdma); ++#endif ++ if (ret) ++ return false; ++ } ++ return true; ++#endif + } + + /** +@@ -325,15 +526,23 @@ static void svc_rdma_wc_receive(struct i + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_recv_ctxt *ctxt; + ++#ifdef HAVE_SVCXPRT_RDMA_SC_PENDING_RECVS + rdma->sc_pending_recvs--; ++#endif + + /* WARNING: Only 
wc->wr_cqe and wc->status are reliable */ + ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++ svc_rdma_recv_ctxt_unmap(rdma, ctxt); ++#endif + + if (wc->status != IB_WC_SUCCESS) + goto flushed; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_wc_recv(wc, &ctxt->rc_cid); ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + /* If receive posting fails, the connection is about to be + * lost anyway. The server will not be able to send a reply + * for this RPC, and the client will retransmit this RPC +@@ -343,9 +552,18 @@ static void svc_rdma_wc_receive(struct i + * to reduce the likelihood of replayed requests once the + * client reconnects. + */ ++#ifdef HAVE_SVCXPRT_RDMA_SC_PENDING_RECVS + if (rdma->sc_pending_recvs < rdma->sc_max_requests) + if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false)) + goto dropped; ++#else ++ if (!svc_rdma_refresh_recvs(rdma, 1, false)) ++ goto dropped; ++#endif ++#else ++ if (svc_rdma_post_recv(rdma)) ++ goto dropped; ++#endif + + /* All wc fields are now known to be valid */ + ctxt->rc_byte_len = wc->byte_len; +@@ -360,13 +578,22 @@ static void svc_rdma_wc_receive(struct i + return; + + flushed: ++#ifdef HAVE_TRACE_RPCRDMA_H + if (wc->status == IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_recv_flush(wc, &ctxt->rc_cid); + else + trace_svcrdma_wc_recv_err(wc, &ctxt->rc_cid); ++#endif + dropped: + svc_rdma_recv_ctxt_put(rdma, ctxt); ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(&rdma->sc_xprt); ++#else ++ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); ++#endif ++#ifndef HAVE_SVC_RDMA_PCL ++ svc_xprt_enqueue(&rdma->sc_xprt); ++#endif + } + + /** +@@ -378,6 +605,13 @@ void svc_rdma_flush_recv_queues(struct s + { + struct svc_rdma_recv_ctxt *ctxt; + ++#ifndef HAVE_SVC_RDMA_PCL ++ while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { ++ list_del(&ctxt->rc_list); ++ svc_rdma_recv_ctxt_put(rdma, ctxt); ++ } ++#endif ++ + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); +@@ -387,6 +621,7 @@ void svc_rdma_flush_recv_queues(struct s + static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) + { ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + struct xdr_buf *arg = &rqstp->rq_arg; + + arg->head[0].iov_base = ctxt->rc_recv_buf; +@@ -397,8 +632,72 @@ static void svc_rdma_build_arg_xdr(struc + arg->page_base = 0; + arg->buflen = ctxt->rc_byte_len; + arg->len = ctxt->rc_byte_len; ++#else ++ struct page *page; ++ int sge_no; ++ u32 len; ++ ++ /* The reply path assumes the Call's transport header resides ++ * in rqstp->rq_pages[0]. 
++ */ ++ page = ctxt->rc_pages[0]; ++ put_page(rqstp->rq_pages[0]); ++ rqstp->rq_pages[0] = page; ++ ++ /* Set up the XDR head */ ++ rqstp->rq_arg.head[0].iov_base = page_address(page); ++ rqstp->rq_arg.head[0].iov_len = ++ min_t(size_t, ctxt->rc_byte_len, ctxt->rc_sges[0].length); ++ rqstp->rq_arg.len = ctxt->rc_byte_len; ++ rqstp->rq_arg.buflen = ctxt->rc_byte_len; ++ ++ /* Compute bytes past head in the SGL */ ++ len = ctxt->rc_byte_len - rqstp->rq_arg.head[0].iov_len; ++ ++ /* If data remains, store it in the pagelist */ ++ rqstp->rq_arg.page_len = len; ++ rqstp->rq_arg.page_base = 0; ++ ++ sge_no = 1; ++ while (len && sge_no < ctxt->rc_recv_wr.num_sge) { ++ page = ctxt->rc_pages[sge_no]; ++ put_page(rqstp->rq_pages[sge_no]); ++ rqstp->rq_pages[sge_no] = page; ++ len -= min_t(u32, len, ctxt->rc_sges[sge_no].length); ++ sge_no++; ++ } ++ ctxt->rc_hdr_count = sge_no; ++ rqstp->rq_respages = &rqstp->rq_pages[sge_no]; ++ rqstp->rq_next_page = rqstp->rq_respages + 1; ++ ++ /* If not all pages were used from the SGL, free the remaining ones */ ++ while (sge_no < ctxt->rc_recv_wr.num_sge) { ++ page = ctxt->rc_pages[sge_no++]; ++ put_page(page); ++ } ++ ++ /* @ctxt's pages have all been released or moved to @rqstp->rq_pages. ++ */ ++ ctxt->rc_page_count = 0; ++ ++ /* Set up tail */ ++ rqstp->rq_arg.tail[0].iov_base = NULL; ++ rqstp->rq_arg.tail[0].iov_len = 0; ++#endif + } + ++#ifndef HAVE_SVC_RDMA_PCL ++/* This accommodates the largest possible Write chunk. ++ */ ++#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) ++ ++/* This accommodates the largest possible Position-Zero ++ * Read chunk or Reply chunk. ++ */ ++#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) ++#endif ++ ++#ifdef HAVE_SVC_RDMA_PCL + /** + * xdr_count_read_segments - Count number of Read segments in Read list + * @rctxt: Ingress receive context +@@ -517,6 +816,92 @@ static bool xdr_count_write_chunks(struc + } + return true; + } ++#else ++/* Sanity check the Read list. ++ * ++ * Implementation limits: ++ * - This implementation supports only one Read chunk. ++ * ++ * Sanity checks: ++ * - Read list does not overflow Receive buffer. ++ * - Segment size limited by largest NFS data payload. ++ * ++ * The segment count is limited to how many segments can ++ * fit in the transport header without overflowing the ++ * buffer. That's about 40 Read segments for a 1KB inline ++ * threshold. ++ * ++ * Return values: ++ * %true: Read list is valid. @rctxt's xdr_stream is updated ++ * to point to the first byte past the Read list. ++ * %false: Read list is corrupt. @rctxt's xdr_stream is left ++ * in an unknown state. ++ */ ++static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) ++{ ++ u32 position, len; ++ bool first; ++ __be32 *p; ++ ++ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); ++ if (!p) ++ return false; ++ ++ len = 0; ++ first = true; ++ while (xdr_item_is_present(p)) { ++ p = xdr_inline_decode(&rctxt->rc_stream, ++ rpcrdma_readseg_maxsz * sizeof(*p)); ++ if (!p) ++ return false; ++ ++ if (first) { ++ position = be32_to_cpup(p); ++ first = false; ++ } else if (be32_to_cpup(p) != position) { ++ return false; ++ } ++ p += 2; ++ len += be32_to_cpup(p); ++ ++ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); ++ if (!p) ++ return false; ++ } ++ return len <= MAX_BYTES_SPECIAL_CHUNK; ++} ++ ++/* The segment count is limited to how many segments can ++ * fit in the transport header without overflowing the ++ * buffer. 
That's about 60 Write segments for a 1KB inline ++ * threshold. ++ */ ++static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) ++{ ++ u32 i, segcount, total; ++ __be32 *p; ++ ++ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); ++ if (!p) ++ return false; ++ segcount = be32_to_cpup(p); ++ ++ total = 0; ++ for (i = 0; i < segcount; i++) { ++ u32 handle, length; ++ u64 offset; ++ ++ p = xdr_inline_decode(&rctxt->rc_stream, ++ rpcrdma_segment_maxsz * sizeof(*p)); ++ if (!p) ++ return false; ++ ++ xdr_decode_rdma_segment(p, &handle, &length, &offset); ++ total += length; ++ } ++ return total <= maxlen; ++} ++#endif + + /* Sanity check the Write list. + * +@@ -535,11 +920,16 @@ static bool xdr_count_write_chunks(struc + */ + static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) + { ++#ifndef HAVE_SVC_RDMA_PCL ++ u32 chcount = 0; ++#endif + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; ++ ++#ifdef HAVE_SVC_RDMA_PCL + if (!xdr_count_write_chunks(rctxt, p)) + return false; + if (!pcl_alloc_write(rctxt, &rctxt->rc_write_pcl, p)) +@@ -547,6 +937,20 @@ static bool xdr_check_write_list(struct + + rctxt->rc_cur_result_payload = pcl_first_chunk(&rctxt->rc_write_pcl); + return true; ++#else ++ rctxt->rc_write_list = p; ++ while (xdr_item_is_present(p)) { ++ if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) ++ return false; ++ ++chcount; ++ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); ++ if (!p) ++ return false; ++ } ++ if (!chcount) ++ rctxt->rc_write_list = NULL; ++ return chcount < 2; ++#endif + } + + /* Sanity check the Reply chunk. +@@ -569,6 +973,7 @@ static bool xdr_check_reply_chunk(struct + if (!p) + return false; + ++#ifdef HAVE_SVC_RDMA_PCL + if (!xdr_item_is_present(p)) + return true; + if (!xdr_check_write_chunk(rctxt)) +@@ -576,6 +981,15 @@ static bool xdr_check_reply_chunk(struct + + rctxt->rc_reply_pcl.cl_count = 1; + return pcl_alloc_write(rctxt, &rctxt->rc_reply_pcl, p); ++#else ++ rctxt->rc_reply_chunk = NULL; ++ if (xdr_item_is_present(p)) { ++ if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) ++ return false; ++ rctxt->rc_reply_chunk = p; ++ } ++ return true; ++#endif + } + + /* RPC-over-RDMA Version One private extension: Remote Invalidation. 
+@@ -588,6 +1002,7 @@ static bool xdr_check_reply_chunk(struct + static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) + { ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rdma_segment *segment; + struct svc_rdma_chunk *chunk; + u32 inv_rkey; +@@ -631,6 +1046,59 @@ static void svc_rdma_get_inv_rkey(struct + } + } + ctxt->rc_inv_rkey = inv_rkey; ++#else ++ __be32 inv_rkey, *p; ++ u32 i, segcount; ++ ++ ctxt->rc_inv_rkey = 0; ++ ++ if (!rdma->sc_snd_w_inv) ++ return; ++ ++ inv_rkey = xdr_zero; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR ++ p = ctxt->rc_recv_buf; ++#else ++ p = page_address(ctxt->rc_pages[0]); ++#endif ++ p += rpcrdma_fixed_maxsz; ++ ++ /* Read list */ ++ while (xdr_item_is_present(p++)) { ++ p++; /* position */ ++ if (inv_rkey == xdr_zero) ++ inv_rkey = *p; ++ else if (inv_rkey != *p) ++ return; ++ p += 4; ++ } ++ ++ /* Write list */ ++ while (xdr_item_is_present(p++)) { ++ segcount = be32_to_cpup(p++); ++ for (i = 0; i < segcount; i++) { ++ if (inv_rkey == xdr_zero) ++ inv_rkey = *p; ++ else if (inv_rkey != *p) ++ return; ++ p += 4; ++ } ++ } ++ ++ /* Reply chunk */ ++ if (xdr_item_is_present(p++)) { ++ segcount = be32_to_cpup(p++); ++ for (i = 0; i < segcount; i++) { ++ if (inv_rkey == xdr_zero) ++ inv_rkey = *p; ++ else if (inv_rkey != *p) ++ return; ++ p += 4; ++ } ++ } ++ ++ ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); ++#endif + } + + /** +@@ -656,7 +1124,11 @@ static int svc_rdma_xdr_decode_req(struc + unsigned int hdr_len; + + rdma_argp = rq_arg->head[0].iov_base; ++#ifdef HAVE_XDR_INIT_DECODE_RQST_ARG + xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL); ++#else ++ xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp); ++#endif + + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); +@@ -666,8 +1138,12 @@ static int svc_rdma_xdr_decode_req(struc + if (*p != rpcrdma_version) + goto out_version; + p += 2; ++#ifdef HAVE_SVC_RDMA_PCL + rctxt->rc_msgtype = *p; + switch (rctxt->rc_msgtype) { ++#else ++ switch (*p) { ++#endif + case rdma_msg: + break; + case rdma_nomsg: +@@ -691,30 +1167,73 @@ static int svc_rdma_xdr_decode_req(struc + hdr_len = xdr_stream_pos(&rctxt->rc_stream); + rq_arg->head[0].iov_len -= hdr_len; + rq_arg->len -= hdr_len; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len); ++#endif + return hdr_len; + + out_short: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_short_err(rctxt, rq_arg->len); ++#endif + return -EINVAL; + + out_version: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_badvers_err(rctxt, rdma_argp); ++#endif + return -EPROTONOSUPPORT; + + out_drop: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_drop_err(rctxt, rdma_argp); ++#endif + return 0; + + out_proc: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_badproc_err(rctxt, rdma_argp); ++#endif + return -EINVAL; + + out_inval: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_decode_parse_err(rctxt, rdma_argp); ++#endif + return -EINVAL; + } + ++#ifndef HAVE_SVC_RDMA_PCL ++static void rdma_read_complete(struct svc_rqst *rqstp, ++ struct svc_rdma_recv_ctxt *head) ++{ ++ int page_no; ++ ++ /* Move Read chunk pages to rqstp so that they will be released ++ * when svc_process is done with them. 
++ */ ++ for (page_no = 0; page_no < head->rc_page_count; page_no++) { ++ put_page(rqstp->rq_pages[page_no]); ++ rqstp->rq_pages[page_no] = head->rc_pages[page_no]; ++ } ++ head->rc_page_count = 0; ++ ++ /* Point rq_arg.pages past header */ ++ rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count]; ++ rqstp->rq_arg.page_len = head->rc_arg.page_len; ++ ++ /* rq_respages starts after the last arg page */ ++ rqstp->rq_respages = &rqstp->rq_pages[page_no]; ++ rqstp->rq_next_page = rqstp->rq_respages + 1; ++ ++ /* Rebuild rq_arg head and tail. */ ++ rqstp->rq_arg.head[0] = head->rc_arg.head[0]; ++ rqstp->rq_arg.tail[0] = head->rc_arg.tail[0]; ++ rqstp->rq_arg.len = head->rc_arg.len; ++ rqstp->rq_arg.buflen = head->rc_arg.buflen; ++} ++#endif ++ + static void svc_rdma_send_error(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *rctxt, + int status) +@@ -732,10 +1251,15 @@ static void svc_rdma_send_error(struct s + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ ++#ifdef HAVE_SVC_RDMA_PCL + static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, + struct svc_rdma_recv_ctxt *rctxt) + { ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + __be32 *p = rctxt->rc_recv_buf; ++#else ++ __be32 *p = page_address(rctxt->rc_pages[0]); ++#endif + + if (!xprt->xpt_bc_xprt) + return false; +@@ -758,6 +1282,36 @@ static bool svc_rdma_is_reverse_directio + + return true; + } ++#else ++static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, ++ __be32 *rdma_resp) ++{ ++ __be32 *p; ++ ++ if (!xprt->xpt_bc_xprt) ++ return false; ++ ++ p = rdma_resp + 3; ++ if (*p++ != rdma_msg) ++ return false; ++ ++ if (*p++ != xdr_zero) ++ return false; ++ if (*p++ != xdr_zero) ++ return false; ++ if (*p++ != xdr_zero) ++ return false; ++ ++ /* XID sanity */ ++ if (*p++ != *rdma_resp) ++ return false; ++ /* call direction */ ++ if (*p == cpu_to_be32(RPC_CALL)) ++ return false; ++ ++ return true; ++} ++#endif + + /** + * svc_rdma_recvfrom - Receive an RPC call +@@ -795,12 +1349,26 @@ int svc_rdma_recvfrom(struct svc_rqst *r + struct svcxprt_rdma *rdma_xprt = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_recv_ctxt *ctxt; ++#ifndef HAVE_SVC_RDMA_PCL ++ __be32 *p; ++#endif + int ret; + ++#ifdef HAVE_SVC_RDMA_RELEASE_RQST + rqstp->rq_xprt_ctxt = NULL; ++#endif + + ctxt = NULL; + spin_lock(&rdma_xprt->sc_rq_dto_lock); ++#ifndef HAVE_SVC_RDMA_PCL ++ ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); ++ if (ctxt) { ++ list_del(&ctxt->rc_list); ++ spin_unlock(&rdma_xprt->sc_rq_dto_lock); ++ rdma_read_complete(rqstp, ctxt); ++ goto complete; ++ } ++#endif + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); + if (ctxt) + list_del(&ctxt->rc_list); +@@ -809,53 +1377,92 @@ int svc_rdma_recvfrom(struct svc_rqst *r + clear_bit(XPT_DATA, &xprt->xpt_flags); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + ++#ifdef HAVE_SVC_XPRT_RECEIVED + /* Unblock the transport for the next receive */ + svc_xprt_received(xprt); ++#endif ++ + if (!ctxt) + return 0; + + percpu_counter_inc(&svcrdma_stat_recv); ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device, + ctxt->rc_recv_sge.addr, ctxt->rc_byte_len, + DMA_FROM_DEVICE); ++#endif + svc_rdma_build_arg_xdr(rqstp, ctxt); + ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + /* Prevent svc_xprt_release from releasing pages in rq_pages + * if we return 0 or an error. 
+ */ + rqstp->rq_respages = rqstp->rq_pages; + rqstp->rq_next_page = rqstp->rq_respages; ++#endif ++ ++#ifndef HAVE_SVC_RDMA_PCL ++ p = (__be32 *)rqstp->rq_arg.head[0].iov_base; ++#endif + + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); + if (ret < 0) + goto out_err; + if (ret == 0) + goto out_drop; ++#ifdef HAVE_SVC_RQST_RQ_XPRT_HLEN + rqstp->rq_xprt_hlen = 0; ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) ++#else ++ if (svc_rdma_is_backchannel_reply(xprt, p)) ++#endif + goto out_backchannel; + + svc_rdma_get_inv_rkey(rdma_xprt, ctxt); + ++#ifdef HAVE_SVC_RDMA_PCL + if (!pcl_is_empty(&ctxt->rc_read_pcl) || + !pcl_is_empty(&ctxt->rc_call_pcl)) { + ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt); + if (ret < 0) + goto out_readfail; + } ++#else ++ p += rpcrdma_fixed_maxsz; ++ if (*p != xdr_zero) ++ goto out_readchunk; ++#endif ++ ++#ifndef HAVE_SVC_RDMA_PCL ++complete: ++#endif + + rqstp->rq_xprt_ctxt = ctxt; + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); + return rqstp->rq_arg.len; + ++#ifndef HAVE_SVC_RDMA_PCL ++out_readchunk: ++ ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); ++ if (ret < 0) ++ goto out_postfail; ++ return 0; ++#endif ++ + out_err: + svc_rdma_send_error(rdma_xprt, ctxt, ret); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + return 0; + ++#ifdef HAVE_SVC_RDMA_PCL + out_readfail: ++#else ++out_postfail: ++#endif + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, ctxt, ret); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0266-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_sendto.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0266-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_sendto.c.patch new file mode 100644 index 0000000..1de94b7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0266-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_sendto.c.patch @@ -0,0 +1,813 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/svc_rdma_sendto.c + +Change-Id: If2545499bef1576c97208b12bf909f15458876d6 +--- + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 483 +++++++++++++++++++++++++- + 1 file changed, 482 insertions(+), 1 deletion(-) + +--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +@@ -109,7 +109,9 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); + +@@ -209,8 +211,13 @@ struct svc_rdma_send_ctxt *svc_rdma_send + + out: + rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); ++#ifdef HAVE_XDR_INIT_ENCODE_RQST_ARG + xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, + ctxt->sc_xprt_buf, NULL); ++#else ++ xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ++ ctxt->sc_xprt_buf); ++#endif + + ctxt->sc_send_wr.num_sge = 0; + ctxt->sc_cur_sge_no = 0; +@@ -243,9 +250,11 @@ void svc_rdma_send_ctxt_put(struct svcxp + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length, + DMA_TO_DEVICE); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_dma_unmap_page(rdma, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length); ++#endif + } + + llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); +@@ -286,15 +295,24 @@ static void svc_rdma_wc_send(struct ib_c + if (unlikely(wc->status != IB_WC_SUCCESS)) + goto flushed; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_wc_send(wc, &ctxt->sc_cid); ++#endif + return; + + flushed: ++#ifdef HAVE_TRACE_RPCRDMA_H + if (wc->status != IB_WC_WR_FLUSH_ERR) + trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid); + else + 
trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid); ++#endif ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(&rdma->sc_xprt); ++#else ++ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); ++ svc_xprt_enqueue(&rdma->sc_xprt); ++#endif + } + + /** +@@ -322,26 +340,38 @@ int svc_rdma_send(struct svcxprt_rdma *r + while (1) { + if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + percpu_counter_inc(&svcrdma_stat_sq_starve); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_sq_full(rdma); ++#endif + atomic_inc(&rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > 1); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return -ENOTCONN; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_sq_retry(rdma); ++#endif + continue; + } + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_post_send(ctxt); ++#endif + ret = ib_post_send(rdma->sc_qp, wr, NULL); + if (ret) + break; + return 0; + } + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_sq_post_err(rdma, ret); ++#endif ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(&rdma->sc_xprt); + wake_up(&rdma->sc_send_wait); ++#else ++ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); ++#endif + return ret; + } + +@@ -360,6 +390,8 @@ static ssize_t svc_rdma_encode_read_list + return xdr_stream_encode_item_absent(&sctxt->sc_stream); + } + ++ ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_encode_write_segment - Encode one Write segment + * @sctxt: Send context for the RPC Reply +@@ -389,11 +421,43 @@ static ssize_t svc_rdma_encode_write_seg + *remaining -= length; + xdr_encode_rdma_segment(p, segment->rs_handle, length, + segment->rs_offset); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_encode_wseg(sctxt, segno, segment->rs_handle, length, + segment->rs_offset); ++#endif + return len; + } ++#else ++static ssize_t svc_rdma_encode_write_segment(__be32 *src, ++ struct svc_rdma_send_ctxt *sctxt, ++ unsigned int *remaining) ++{ ++ __be32 *p; ++ const size_t len = rpcrdma_segment_maxsz * sizeof(*p); ++ u32 handle, length; ++ u64 offset; + ++ p = xdr_reserve_space(&sctxt->sc_stream, len); ++ if (!p) ++ return -EMSGSIZE; ++ ++ xdr_decode_rdma_segment(src, &handle, &length, &offset); ++ ++ if (*remaining < length) { ++ /* segment only partly filled */ ++ length = *remaining; ++ *remaining = 0; ++ } else { ++ /* entire segment was consumed */ ++ *remaining -= length; ++ } ++ xdr_encode_rdma_segment(p, handle, length, offset); ++ ++ return len; ++} ++#endif ++ ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_encode_write_chunk - Encode one Write chunk + * @sctxt: Send context for the RPC Reply +@@ -435,6 +499,39 @@ static ssize_t svc_rdma_encode_write_chu + + return len; + } ++#else ++static ssize_t svc_rdma_encode_write_chunk(__be32 *src, ++ struct svc_rdma_send_ctxt *sctxt, ++ unsigned int remaining) ++{ ++ unsigned int i, nsegs; ++ ssize_t len, ret; ++ ++ len = 0; ++ ++ src++; ++ ret = xdr_stream_encode_item_present(&sctxt->sc_stream); ++ if (ret < 0) ++ return -EMSGSIZE; ++ len += ret; ++ ++ nsegs = be32_to_cpup(src++); ++ ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs); ++ if (ret < 0) ++ return -EMSGSIZE; ++ len += ret; ++ ++ for (i = nsegs; i; i--) { ++ ret = svc_rdma_encode_write_segment(src, sctxt, &remaining); ++ if (ret < 0) ++ return -EMSGSIZE; ++ src += rpcrdma_segment_maxsz; ++ len += ret; ++ } ++ ++ return len; ++} ++#endif + + /** + * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list +@@ -446,12 +543,23 @@ static ssize_t svc_rdma_encode_write_chu + * that was consumed by the Reply's Write list + * 
%-EMSGSIZE on XDR buffer overflow + */ ++#ifdef HAVE_SVC_RDMA_PCL + static ssize_t svc_rdma_encode_write_list(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt) ++ + { + struct svc_rdma_chunk *chunk; + ssize_t len, ret; ++#else ++static ssize_t ++svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt, ++ struct svc_rdma_send_ctxt *sctxt, ++ unsigned int length) ++{ ++ ssize_t len, ret; ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + len = 0; + pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { + ret = svc_rdma_encode_write_chunk(sctxt, chunk); +@@ -459,6 +567,12 @@ static ssize_t svc_rdma_encode_write_lis + return ret; + len += ret; + } ++#else ++ ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length); ++ if (ret < 0) ++ return ret; ++ len = ret; ++#endif + + /* Terminate the Write list */ + ret = xdr_stream_encode_item_absent(&sctxt->sc_stream); +@@ -480,6 +594,7 @@ static ssize_t svc_rdma_encode_write_lis + * %-EMSGSIZE on XDR buffer overflow + * %-E2BIG if the RPC message is larger than the Reply chunk + */ ++#ifdef HAVE_SVC_RDMA_PCL + static ssize_t + svc_rdma_encode_reply_chunk(struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, +@@ -497,7 +612,18 @@ svc_rdma_encode_reply_chunk(struct svc_r + chunk->ch_payload_length = length; + return svc_rdma_encode_write_chunk(sctxt, chunk); + } ++#else ++static ssize_t ++svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt, ++ struct svc_rdma_send_ctxt *sctxt, ++ unsigned int length) ++{ ++ return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt, ++ length); ++} ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rdma_map_data { + struct svcxprt_rdma *md_rdma; + struct svc_rdma_send_ctxt *md_ctxt; +@@ -520,26 +646,45 @@ static int svc_rdma_page_dma_map(void *d + struct svc_rdma_map_data *args = data; + struct svcxprt_rdma *rdma = args->md_rdma; + struct svc_rdma_send_ctxt *ctxt = args->md_ctxt; ++#else ++static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, ++ struct svc_rdma_send_ctxt *ctxt, ++ struct page *page, ++ unsigned long offset, ++ unsigned int len) ++{ ++#endif + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + ++#ifdef HAVE_SVC_RDMA_PCL + ++ctxt->sc_cur_sge_no; ++#endif + + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + goto out_maperr; + ++#ifdef HAVE_TRACE_RPCRDMA_H ++#ifdef HAVE_SVC_RDMA_PCL + trace_svcrdma_dma_map_page(rdma, dma_addr, len); ++#endif ++#endif + ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; + ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; + ctxt->sc_send_wr.num_sge++; + return 0; + + out_maperr: ++#ifdef HAVE_TRACE_RPCRDMA_H ++#ifdef HAVE_SVC_RDMA_PCL + trace_svcrdma_dma_map_err(rdma, dma_addr, len); ++#endif ++#endif + return -EIO; + } + ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_iov_dma_map - DMA map an iovec + * @data: pointer to arguments +@@ -604,7 +749,21 @@ static int svc_rdma_xb_dma_map(const str + + return xdr->len; + } ++#else ++/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() ++ * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. 
++ */ ++static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, ++ struct svc_rdma_send_ctxt *ctxt, ++ unsigned char *base, ++ unsigned int len) ++{ ++ return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), ++ offset_in_page(base), len); ++} ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rdma_pullup_data { + u8 *pd_dest; + unsigned int pd_length; +@@ -643,7 +802,9 @@ static int svc_rdma_xb_count_sges(const + args->pd_length += xdr->len; + return 0; + } ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_pull_up_needed - Determine whether to use pull-up + * @rdma: controlling transport +@@ -752,9 +913,128 @@ static int svc_rdma_pull_up_reply_msg(co + return ret; + + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len + args.pd_length; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_send_pullup(sctxt, args.pd_length); ++#endif + return 0; + } ++#else ++/** ++ * svc_rdma_pull_up_needed - Determine whether to use pull-up ++ * @rdma: controlling transport ++ * @sctxt: send_ctxt for the Send WR ++ * @rctxt: Write and Reply chunks provided by client ++ * @xdr: xdr_buf containing RPC message to transmit ++ * ++ * Returns: ++ * %true if pull-up must be used ++ * %false otherwise ++ */ ++ ++static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, ++ struct svc_rdma_send_ctxt *sctxt, ++ const struct svc_rdma_recv_ctxt *rctxt, ++ struct xdr_buf *xdr) ++{ ++ int elements; ++ ++ /* For small messages, copying bytes is cheaper than DMA mapping. ++ */ ++ if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH) ++ return true; ++ ++ /* Check whether the xdr_buf has more elements than can ++ * fit in a single RDMA Send. ++ */ ++ /* xdr->head */ ++ elements = 1; ++ ++ /* xdr->pages */ ++ if (!rctxt || !rctxt->rc_write_list) { ++ unsigned int remaining; ++ unsigned long pageoff; ++ ++ pageoff = xdr->page_base & ~PAGE_MASK; ++ remaining = xdr->page_len; ++ while (remaining) { ++ ++elements; ++ remaining -= min_t(u32, PAGE_SIZE - pageoff, ++ remaining); ++ pageoff = 0; ++ } ++ } ++ ++ /* xdr->tail */ ++ if (xdr->tail[0].iov_len) ++ ++elements; ++ ++ /* assume 1 SGE is needed for the transport header */ ++ return elements >= rdma->sc_max_send_sges; ++} ++ ++/** ++ * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer ++ * @rdma: controlling transport ++ * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared ++ * @rctxt: Write and Reply chunks provided by client ++ * @xdr: prepared xdr_buf containing RPC message ++ * ++ * The device is not capable of sending the reply directly. ++ * Assemble the elements of @xdr into the transport header buffer. ++ * ++ * Returns zero on success, or a negative errno on failure. 
++ */ ++static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, ++ struct svc_rdma_send_ctxt *sctxt, ++ const struct svc_rdma_recv_ctxt *rctxt, ++ const struct xdr_buf *xdr) ++{ ++ unsigned char *dst, *tailbase; ++ unsigned int taillen; ++ ++ dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len; ++ memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); ++ dst += xdr->head[0].iov_len; ++ ++ tailbase = xdr->tail[0].iov_base; ++ taillen = xdr->tail[0].iov_len; ++ if (rctxt && rctxt->rc_write_list) { ++ u32 xdrpad; ++ ++ xdrpad = xdr_pad_size(xdr->page_len); ++ if (taillen && xdrpad) { ++ tailbase += xdrpad; ++ taillen -= xdrpad; ++ } ++ } else { ++ unsigned int len, remaining; ++ unsigned long pageoff; ++ struct page **ppages; ++ ++ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); ++ pageoff = xdr->page_base & ~PAGE_MASK; ++ remaining = xdr->page_len; ++ while (remaining) { ++ len = min_t(u32, PAGE_SIZE - pageoff, remaining); ++ ++ memcpy(dst, page_address(*ppages) + pageoff, len); ++ remaining -= len; ++ dst += len; ++ pageoff = 0; ++ ppages++; ++ } ++ } ++ ++ if (taillen) ++ memcpy(dst, tailbase, taillen); ++ ++ sctxt->sc_sges[0].length += xdr->len; ++#ifdef HAVE_TRACE_RPCRDMA_H ++ trace_svcrdma_send_pullup(sctxt, sctxt->sc_sges[0].length); ++#endif ++ return 0; ++} ++#endif + + /* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message + * @rdma: controlling transport +@@ -772,12 +1052,25 @@ static int svc_rdma_pull_up_reply_msg(co + int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, ++#ifdef HAVE_SVC_RDMA_PCL + const struct xdr_buf *xdr) ++#else ++ struct xdr_buf *xdr) ++#endif + { ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rdma_map_data args = { + .md_rdma = rdma, + .md_ctxt = sctxt, + }; ++#else ++ unsigned int len, remaining; ++ unsigned long page_off; ++ struct page **ppages; ++ unsigned char *base; ++ u32 xdr_pad; ++ int ret; ++#endif + + /* Set up the (persistently-mapped) transport header SGE. */ + sctxt->sc_send_wr.num_sge = 1; +@@ -786,7 +1079,11 @@ int svc_rdma_map_reply_msg(struct svcxpr + /* If there is a Reply chunk, nothing follows the transport + * header, and we're done here. + */ ++#ifdef HAVE_SVC_RDMA_PCL + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) ++#else ++ if (rctxt && rctxt->rc_reply_chunk) ++#endif + return 0; + + /* For pull-up, svc_rdma_send() will sync the transport header. +@@ -795,8 +1092,63 @@ int svc_rdma_map_reply_msg(struct svcxpr + if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) + return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); + ++#ifdef HAVE_SVC_RDMA_PCL + return pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_dma_map, &args); ++#else ++ ++sctxt->sc_cur_sge_no; ++ ret = svc_rdma_dma_map_buf(rdma, sctxt, ++ xdr->head[0].iov_base, ++ xdr->head[0].iov_len); ++ if (ret < 0) ++ return ret; ++ ++ /* If a Write chunk is present, the xdr_buf's page list ++ * is not included inline. However the Upper Layer may ++ * have added XDR padding in the tail buffer, and that ++ * should not be included inline. 
++ */ ++ if (rctxt && rctxt->rc_write_list) { ++ base = xdr->tail[0].iov_base; ++ len = xdr->tail[0].iov_len; ++ xdr_pad = xdr_pad_size(xdr->page_len); ++ ++ if (len && xdr_pad) { ++ base += xdr_pad; ++ len -= xdr_pad; ++ } ++ ++ goto tail; ++ } ++ ++ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); ++ page_off = xdr->page_base & ~PAGE_MASK; ++ remaining = xdr->page_len; ++ while (remaining) { ++ len = min_t(u32, PAGE_SIZE - page_off, remaining); ++ ++ ++sctxt->sc_cur_sge_no; ++ ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++, ++ page_off, len); ++ if (ret < 0) ++ return ret; ++ ++ remaining -= len; ++ page_off = 0; ++ } ++ ++ base = xdr->tail[0].iov_base; ++ len = xdr->tail[0].iov_len; ++tail: ++ if (len) { ++ ++sctxt->sc_cur_sge_no; ++ ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len); ++ if (ret < 0) ++ return ret; ++ } ++ ++ return 0; ++#endif + } + + /* Prepare the portion of the RPC Reply that will be transmitted +@@ -864,12 +1216,23 @@ void svc_rdma_send_error_msg(struct svcx + struct svc_rdma_recv_ctxt *rctxt, + int status) + { ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + __be32 *rdma_argp = rctxt->rc_recv_buf; ++#else ++ struct svc_rqst *rqstp = ++ container_of((void *)rctxt, struct svc_rqst, rq_xprt_ctxt); ++ __be32 *rdma_argp = page_address(rqstp->rq_pages[0]); ++#endif + __be32 *p; + + rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0); ++#ifdef HAVE_XDR_INIT_ENCODE_RQST_ARG + xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf, + sctxt->sc_xprt_buf, NULL); ++#else ++ xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf, ++ sctxt->sc_xprt_buf); ++#endif + + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); +@@ -890,7 +1253,9 @@ void svc_rdma_send_error_msg(struct svcx + *p++ = err_vers; + *p++ = rpcrdma_version; + *p = rpcrdma_version; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_err_vers(*rdma_argp); ++#endif + break; + default: + p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p)); +@@ -898,7 +1263,9 @@ void svc_rdma_send_error_msg(struct svcx + goto put_ctxt; + + *p = err_chunk; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_err_chunk(*rdma_argp); ++#endif + } + + /* Remote Invalidation is skipped for simplicity. */ +@@ -932,15 +1299,26 @@ int svc_rdma_sendto(struct svc_rqst *rqs + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + __be32 *rdma_argp = rctxt->rc_recv_buf; ++#else ++ __be32 *rdma_argp = page_address(rqstp->rq_pages[0]); ++#endif ++#ifndef HAVE_SVC_RDMA_PCL ++ __be32 *wr_lst = rctxt->rc_write_list; ++ __be32 *rp_ch = rctxt->rc_reply_chunk; ++ struct xdr_buf *xdr = &rqstp->rq_res; ++#endif + struct svc_rdma_send_ctxt *sctxt; + unsigned int rc_size; + __be32 *p; + int ret; + ++#ifdef HAVE_SVC_XPRT_IS_DEAD + ret = -ENOTCONN; + if (svc_xprt_is_dead(xprt)) + goto drop_connection; ++#endif + + ret = -ENOMEM; + sctxt = svc_rdma_send_ctxt_get(rdma); +@@ -953,26 +1331,66 @@ int svc_rdma_sendto(struct svc_rqst *rqs + if (!p) + goto put_ctxt; + ++#ifdef HAVE_SVC_RDMA_PCL + ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + if (ret < 0) + goto reply_chunk; ++#endif + rc_size = ret; + + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = rdma->sc_fc_credits; ++#ifdef HAVE_SVC_RDMA_PCL + *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg; ++#else ++ *p = rp_ch ? 
rdma_nomsg : rdma_msg; ++#endif + + ret = svc_rdma_encode_read_list(sctxt); + if (ret < 0) + goto put_ctxt; ++#ifdef HAVE_SVC_RDMA_PCL + ret = svc_rdma_encode_write_list(rctxt, sctxt); + if (ret < 0) + goto put_ctxt; + ret = svc_rdma_encode_reply_chunk(rctxt, sctxt, rc_size); + if (ret < 0) + goto put_ctxt; +- ++#else ++ if (wr_lst) { ++ /* XXX: Presume the client sent only one Write chunk */ ++ unsigned long offset; ++ unsigned int length; ++ ++ if (rctxt->rc_read_payload_length) { ++ offset = rctxt->rc_read_payload_offset; ++ length = rctxt->rc_read_payload_length; ++ } else { ++ offset = xdr->head[0].iov_len; ++ length = xdr->page_len; ++ } ++ ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset, ++ length); ++ if (ret < 0) ++ goto reply_chunk; ++ if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0) ++ goto put_ctxt; ++ } else { ++ if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0) ++ goto put_ctxt; ++ } ++ if (rp_ch) { ++ ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); ++ if (ret < 0) ++ goto reply_chunk; ++ if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0) ++ goto put_ctxt; ++ } else { ++ if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0) ++ goto put_ctxt; ++ } ++#endif + ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); + if (ret < 0) + goto put_ctxt; +@@ -981,21 +1399,47 @@ int svc_rdma_sendto(struct svc_rqst *rqs + * rq_res.head[0].iov_base. It's no longer being accessed by + * the I/O device. */ + rqstp->rq_respages++; ++#ifdef HAVE_SVC_RDMA_RELEASE_RQST + return 0; ++#else ++ ret = 0; ++ ++out: ++ rqstp->rq_xprt_ctxt = NULL; ++ svc_rdma_recv_ctxt_put(rdma, rctxt); ++ ++ return ret; ++#endif + + reply_chunk: + if (ret != -E2BIG && ret != -EINVAL) + goto put_ctxt; + + svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); ++#ifdef HAVE_SVC_RDMA_RELEASE_RQST + return 0; ++#else ++ ret = 0; ++ goto out; ++#endif + + put_ctxt: + svc_rdma_send_ctxt_put(rdma, sctxt); + drop_connection: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_send_err(rqstp, ret); ++#endif ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(&rdma->sc_xprt); ++#else ++ set_bit(XPT_CLOSE, &xprt->xpt_flags); ++#endif ++#ifdef HAVE_SVC_RDMA_RELEASE_RQST + return -ENOTCONN; ++#else ++ ret = -ENOTCONN; ++ goto out; ++#endif + } + + /** +@@ -1013,6 +1457,25 @@ drop_connection: + * %-ENOTCONN if posting failed (connection is lost) + * %-EIO if rdma_rw initialization failed (DMA mapping, etc) + */ ++#ifdef HAVE_XPO_READ_PAYLOAD ++int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, ++ unsigned int length) ++{ ++ struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; ++ ++ /* XXX: Just one READ payload slot for now, since our ++ * transport implementation currently supports only one ++ * Write chunk. ++ */ ++ rctxt->rc_read_payload_offset = offset; ++ rctxt->rc_read_payload_length = length; ++ ++ return 0; ++} ++#endif ++ ++#ifdef HAVE_XPO_RESULT_PAYLOAD ++#ifdef HAVE_SVC_RDMA_PCL + int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) + { +@@ -1040,5 +1503,23 @@ int svc_rdma_result_payload(struct svc_r + ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); + if (ret < 0) + return ret; ++ ++ return 0; ++} ++#else ++int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, ++ unsigned int length) ++{ ++ struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; ++ ++ /* XXX: Just one READ payload slot for now, since our ++ * transport implementation currently supports only one ++ * Write chunk. 
++ */ ++ rctxt->rc_read_payload_offset = offset; ++ rctxt->rc_read_payload_length = length; ++ + return 0; + } ++#endif ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0267-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_transport.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0267-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_transport.c.patch new file mode 100644 index 0000000..d9397bb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0267-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_transport.c.patch @@ -0,0 +1,278 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/svc_rdma_transport.c + +Change-Id: I08ccee22a48bf218d62059ac4a6623a229323841 +--- + net/sunrpc/xprtrdma/svc_rdma_transport.c | 105 +++++++++++++++++++++++ + 1 file changed, 105 insertions(+) + +--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c +@@ -59,7 +59,9 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + #define RPCDBG_FACILITY RPCDBG_SVCXPRT + +@@ -70,20 +72,45 @@ static struct svc_xprt *svc_rdma_create( + struct sockaddr *sa, int salen, + int flags); + static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); ++#ifndef HAVE_SVC_RDMA_RELEASE_RQST ++static void svc_rdma_release_rqst(struct svc_rqst *); ++#endif + static void svc_rdma_detach(struct svc_xprt *xprt); + static void svc_rdma_free(struct svc_xprt *xprt); + static int svc_rdma_has_wspace(struct svc_xprt *xprt); ++#ifdef HAVE_XPO_SECURE_PORT_NO_RETURN + static void svc_rdma_secure_port(struct svc_rqst *); ++#else ++static int svc_rdma_secure_port(struct svc_rqst *); ++#endif + static void svc_rdma_kill_temp_xprt(struct svc_xprt *); + ++#ifdef HAVE_SVC_XPRT_XPO_PREP_REPLY_HDR ++static void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) ++{ ++} ++#endif ++ ++#ifdef HAVE_SVC_XPRT_CLASS_XCL_OPS_CONST + static const struct svc_xprt_ops svc_rdma_ops = { ++#else ++static struct svc_xprt_ops svc_rdma_ops = { ++#endif + .xpo_create = svc_rdma_create, + .xpo_recvfrom = svc_rdma_recvfrom, + .xpo_sendto = svc_rdma_sendto, ++#ifdef HAVE_XPO_READ_PAYLOAD ++ .xpo_read_payload = svc_rdma_read_payload, ++#endif ++#ifdef HAVE_XPO_RESULT_PAYLOAD + .xpo_result_payload = svc_rdma_result_payload, ++#endif + .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_detach = svc_rdma_detach, + .xpo_free = svc_rdma_free, ++#ifdef HAVE_SVC_XPRT_XPO_PREP_REPLY_HDR ++ .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, ++#endif + .xpo_has_wspace = svc_rdma_has_wspace, + .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, +@@ -98,12 +125,37 @@ struct svc_xprt_class svc_rdma_class = { + .xcl_ident = XPRT_TRANSPORT_RDMA, + }; + ++#if defined(CONFIG_SUNRPC_BACKCHANNEL) && defined(HAVE_RPC_XPRT_OPS_BC_UP) ++#ifdef HAVE_SVC_XPRT_CLASS_XCL_OPS_CONST ++static const struct svc_xprt_ops svc_rdma_bc_ops = { ++#else ++static struct svc_xprt_ops svc_rdma_bc_ops = { ++#endif ++ .xpo_create = svc_rdma_create, ++ .xpo_detach = svc_rdma_detach, ++ .xpo_free = svc_rdma_free, ++#ifdef HAVE_SVC_XPRT_XPO_PREP_REPLY_HDR ++ .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, ++#endif ++ .xpo_secure_port = svc_rdma_secure_port, ++}; ++ ++struct svc_xprt_class svc_rdma_bc_class = { ++ .xcl_name = "rdma-bc", ++ .xcl_owner = THIS_MODULE, ++ .xcl_ops = &svc_rdma_bc_ops, ++ .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN) ++}; ++#endif ++ + /* QP event handler */ + static void qp_event_handler(struct ib_event *event, void *context) + { + struct svc_xprt *xprt = context; + ++#ifdef HAVE_TRACE_RPCRDMA_H + 
trace_svcrdma_qp_error(event, (struct sockaddr *)&xprt->xpt_remote); ++#endif + switch (event->event) { + /* These are considered benign events */ + case IB_EVENT_PATH_MIG: +@@ -119,7 +171,12 @@ static void qp_event_handler(struct ib_e + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_DEVICE_FATAL: + default: ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(xprt); ++#else ++ set_bit(XPT_CLOSE, &xprt->xpt_flags); ++ svc_xprt_enqueue(xprt); ++#endif + break; + } + } +@@ -136,14 +193,24 @@ static struct svcxprt_rdma *svc_rdma_cre + svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); ++#ifndef HAVE_SVC_RDMA_PCL ++ INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); ++#endif + init_llist_head(&cma_xprt->sc_send_ctxts); ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + init_llist_head(&cma_xprt->sc_recv_ctxts); ++#else ++ INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); ++#endif + init_llist_head(&cma_xprt->sc_rw_ctxts); + init_waitqueue_head(&cma_xprt->sc_send_wait); + + spin_lock_init(&cma_xprt->sc_lock); + spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_send_lock); ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++ spin_lock_init(&cma_xprt->sc_recv_lock); ++#endif + spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); + + /* +@@ -211,8 +278,10 @@ static void handle_connect_req(struct rd + newxprt->sc_xprt.xpt_remotelen = svc_addr_len(sa); + memcpy(&newxprt->sc_xprt.xpt_remote, sa, + newxprt->sc_xprt.xpt_remotelen); ++#ifdef HAVE_SVC_XPRT_XPT_REMOTEBUF + snprintf(newxprt->sc_xprt.xpt_remotebuf, + sizeof(newxprt->sc_xprt.xpt_remotebuf) - 1, "%pISc", sa); ++#endif + + /* The remote port is arbitrary and not under the control of the + * client ULP. Set it to a fixed value so that the DRC continues +@@ -284,7 +353,12 @@ static int svc_rdma_cma_handler(struct r + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(xprt); ++#else ++ set_bit(XPT_CLOSE, &xprt->xpt_flags); ++ svc_xprt_enqueue(xprt); ++#endif + break; + default: + break; +@@ -310,7 +384,9 @@ static struct svc_xprt *svc_rdma_create( + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); ++#ifdef HAVE_SVC_XPRT_XPT_REMOTEBUF + strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); ++#endif + + listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt, + RDMA_PS_TCP, IB_QPT_RC); +@@ -404,14 +480,20 @@ static struct svc_xprt *svc_rdma_accept( + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; ++#ifdef HAVE_SVCXPRT_RDMA_SC_PENDING_RECVS + newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH; + rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + + newxprt->sc_recv_batch; ++#else ++ rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; ++#endif + if (rq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing receive depth to %d\n", + dev->attrs.max_qp_wr); + rq_depth = dev->attrs.max_qp_wr; ++#ifdef HAVE_SVCXPRT_RDMA_SC_PENDING_RECVS + newxprt->sc_recv_batch = 1; ++#endif + newxprt->sc_max_requests = rq_depth - 2; + newxprt->sc_max_bc_requests = 2; + } +@@ -428,7 +510,9 @@ static struct svc_xprt *svc_rdma_accept( + + newxprt->sc_pd = ib_alloc_pd(dev, 0); + if (IS_ERR(newxprt->sc_pd)) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd)); ++#endif + 
goto errout; + } + newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth, +@@ -462,7 +546,9 @@ static struct svc_xprt *svc_rdma_accept( + + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); + if (ret) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_qp_err(newxprt, ret); ++#endif + goto errout; + } + newxprt->sc_qp = newxprt->sc_cm_id->qp; +@@ -471,7 +557,9 @@ static struct svc_xprt *svc_rdma_accept( + newxprt->sc_snd_w_inv = false; + if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_port_num)) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_fabric_err(newxprt, -EINVAL); ++#endif + goto errout; + } + +@@ -493,7 +581,9 @@ static struct svc_xprt *svc_rdma_accept( + dev->attrs.max_qp_init_rd_atom); + if (!conn_param.initiator_depth) { + ret = -EINVAL; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_initdepth_err(newxprt, ret); ++#endif + goto errout; + } + conn_param.private_data = &pmsg; +@@ -503,7 +593,9 @@ static struct svc_xprt *svc_rdma_accept( + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); + rdma_unlock_handler(newxprt->sc_cm_id); + if (ret) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_accept_err(newxprt, ret); ++#endif + goto errout; + } + +@@ -533,6 +625,12 @@ static struct svc_xprt *svc_rdma_accept( + return NULL; + } + ++#ifndef HAVE_SVC_RDMA_RELEASE_RQST ++static void svc_rdma_release_rqst(struct svc_rqst *rqstp) ++{ ++} ++#endif ++ + static void svc_rdma_detach(struct svc_xprt *xprt) + { + struct svcxprt_rdma *rdma = +@@ -600,10 +698,17 @@ static int svc_rdma_has_wspace(struct sv + return 1; + } + ++#ifdef HAVE_XPO_SECURE_PORT_NO_RETURN + static void svc_rdma_secure_port(struct svc_rqst *rqstp) + { + set_bit(RQ_SECURE, &rqstp->rq_flags); + } ++#else ++static int svc_rdma_secure_port(struct svc_rqst *rqstp) ++{ ++ return 1; ++} ++#endif + + static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) + { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0268-BACKPORT-net-sunrpc-xprtrdma-transport.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0268-BACKPORT-net-sunrpc-xprtrdma-transport.c.patch new file mode 100644 index 0000000..80b1a5d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0268-BACKPORT-net-sunrpc-xprtrdma-transport.c.patch @@ -0,0 +1,407 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/transport.c + +Change-Id: Icdf3a3c3fc50c1802013fcd6917f8dbaaf4b29e2 +--- + net/sunrpc/xprtrdma/transport.c | 147 +++++++++++++++++++++++++++++++- + 1 file changed, 145 insertions(+), 2 deletions(-) + +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -58,8 +58,15 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + ++#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) ++#ifndef RPCDBG_FACILITY ++#define RPCDBG_FACILITY RPCDBG_TRANS ++#endif ++#endif + /* + * tunables + */ +@@ -69,7 +76,9 @@ unsigned int xprt_rdma_max_inline_read = + unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; + unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; + int xprt_rdma_pad_optimize; ++#ifdef HAVE_RPC_XPRT_XPRT_CLASS + static struct xprt_class xprt_rdma; ++#endif + + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + +@@ -81,6 +90,9 @@ static unsigned int max_padding = PAGE_S + static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; + static unsigned int max_memreg = RPCRDMA_LAST - 1; + static unsigned int dummy; ++#ifndef HAVE_SYSCTL_ZERO_ENABLED ++static unsigned int zero; ++#endif + + static struct ctl_table_header *sunrpc_table_header; + +@@ 
-118,7 +130,11 @@ static struct ctl_table xr_tunables_tabl + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, ++#ifdef HAVE_SYSCTL_ZERO_ENABLED + .extra1 = SYSCTL_ZERO, ++#else ++ .extra1 = &zero, ++#endif + .extra2 = &max_padding, + }, + { +@@ -151,7 +167,11 @@ static struct ctl_table sunrpc_table[] = + + #endif + ++#ifdef HAVE_RPC_XPRT_OPS_CONST + static const struct rpc_xprt_ops xprt_rdma_procs; ++#else ++static struct rpc_xprt_ops xprt_rdma_procs; ++#endif + + static void + xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +@@ -246,9 +266,18 @@ xprt_rdma_connect_worker(struct work_str + xprt->stat.connect_start; + xprt_set_connected(xprt); + rc = -EAGAIN; ++#ifdef HAVE_XPRT_LOCK_CONNECT + } else + rpcrdma_xprt_disconnect(r_xprt); + xprt_unlock_connect(xprt, r_xprt); ++#else ++ } else { ++ /* Force a call to xprt_rdma_close to clean up */ ++ spin_lock(&xprt->transport_lock); ++ set_bit(XPRT_CLOSE_WAIT, &xprt->state); ++ spin_unlock(&xprt->transport_lock); ++ } ++#endif + xprt_wake_pending_tasks(xprt, rc); + } + +@@ -266,7 +295,9 @@ xprt_rdma_inject_disconnect(struct rpc_x + { + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_op_inject_dsc(r_xprt); ++#endif + rdma_disconnect(r_xprt->rx_ep->re_id); + } + +@@ -318,16 +349,24 @@ xprt_setup_rdma(struct xprt_create *args + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-EIO); + ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, + xprt_rdma_slot_table_entries); ++#else ++ xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), ++ xprt_rdma_slot_table_entries, ++ xprt_rdma_slot_table_entries); ++#endif + if (!xprt) { + module_put(THIS_MODULE); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_default_timeout; ++#ifdef HAVE_XPRT_RECONNECT_DELAY + xprt->connect_timeout = xprt->timeout->to_initval; + xprt->max_reconnect_timeout = xprt->timeout->to_maxval; ++#endif + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; +@@ -343,7 +382,9 @@ xprt_setup_rdma(struct xprt_create *args + /* Ensure xprt->addr holds valid server TCP (not RDMA) + * address, for any side protocols which peek at it */ + xprt->prot = IPPROTO_TCP; ++#ifdef HAVE_RPC_XPRT_XPRT_CLASS + xprt->xprt_class = &xprt_rdma; ++#endif + xprt->addrlen = args->addrlen; + memcpy(&xprt->addr, sap, xprt->addrlen); + +@@ -431,6 +472,7 @@ xprt_rdma_timer(struct rpc_xprt *xprt, s + xprt_force_disconnect(xprt); + } + ++#ifdef HAVE_XPRT_RECONNECT_DELAY + /** + * xprt_rdma_set_connect_timeout - set timeouts for establishing a connection + * @xprt: controlling transport instance +@@ -444,7 +486,9 @@ static void xprt_rdma_set_connect_timeou + { + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout); ++#endif + + spin_lock(&xprt->transport_lock); + +@@ -468,6 +512,7 @@ static void xprt_rdma_set_connect_timeou + + spin_unlock(&xprt->transport_lock); + } ++#endif + + /** + * xprt_rdma_connect - schedule an attempt to reconnect +@@ -480,20 +525,44 @@ xprt_rdma_connect(struct rpc_xprt *xprt, + { + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = r_xprt->rx_ep; ++#ifdef HAVE_XPRT_RECONNECT_DELAY + unsigned long delay; + ++#ifdef HAVE_XPRT_LOCK_CONNECT + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt)); ++#endif + + 
delay = 0; + if (ep && ep->re_connect_status != 0) { + delay = xprt_reconnect_delay(xprt); + xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); + } +- trace_xprtrdma_op_connect(r_xprt, delay); ++ + queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, + delay); ++#else ++ if (ep && ep->re_connect_status != 0) { ++ /* Reconnect */ ++ schedule_delayed_work(&r_xprt->rx_connect_worker, ++ xprt->reestablish_timeout); ++ xprt->reestablish_timeout <<= 1; ++ if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) ++ xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; ++ else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) ++ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; ++ } else { ++ schedule_delayed_work(&r_xprt->rx_connect_worker, 0); ++ if (!RPC_IS_ASYNC(task)) ++ flush_delayed_work(&r_xprt->rx_connect_worker); ++ } ++#endif ++ ++#if defined(HAVE_TRACE_RPCRDMA_H) && defined(HAVE_XPRT_RECONNECT_DELAY) ++ trace_xprtrdma_op_connect(r_xprt, delay); ++#endif + } + ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + /** + * xprt_rdma_alloc_slot - allocate an rpc_rqst + * @xprt: controlling RPC transport +@@ -518,7 +587,12 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xp + + out_sleep: + task->tk_status = -EAGAIN; ++#ifdef HAVE_XPRT_ADD_BACKLOG + xprt_add_backlog(xprt, task); ++#else ++ set_bit(XPRT_CONGESTED, &xprt->state); ++ rpc_sleep_on(&xprt->backlog, task, NULL); ++#endif + } + + /** +@@ -533,12 +607,20 @@ xprt_rdma_free_slot(struct rpc_xprt *xpr + struct rpcrdma_xprt *r_xprt = + container_of(xprt, struct rpcrdma_xprt, rx_xprt); + ++#ifdef HAVE_XPRT_ADD_BACKLOG + rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); + if (!xprt_wake_up_backlog(xprt, rqst)) { + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); + } ++#else ++ memset(rqst, 0, sizeof(*rqst)); ++ rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); ++ if (unlikely(!rpc_wake_up_next(&xprt->backlog))) ++ clear_bit(XPRT_CONGESTED, &xprt->state); ++#endif + } ++#endif + + static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_regbuf *rb, size_t size, +@@ -566,9 +648,19 @@ xprt_rdma_allocate(struct rpc_task *task + { + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); ++#else ++ struct rpcrdma_req *req; ++#endif + gfp_t flags; + ++#ifndef HAVE_RPC_XPRT_OPS_FREE_SLOT ++ req = rpcrdma_buffer_get(&r_xprt->rx_buf); ++ if (req == NULL) ++ goto out_get; ++#endif ++ + flags = RPCRDMA_DEF_GFP; + if (RPC_IS_SWAPPER(task)) + flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; +@@ -580,11 +672,19 @@ xprt_rdma_allocate(struct rpc_task *task + flags)) + goto out_fail; + ++#ifndef HAVE_RPC_XPRT_OPS_FREE_SLOT ++ rpcrdma_set_xprtdata(rqst, req); ++#endif ++ + rqst->rq_buffer = rdmab_data(req->rl_sendbuf); + rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf); + return 0; + + out_fail: ++#ifndef HAVE_RPC_XPRT_OPS_FREE_SLOT ++ rpcrdma_buffer_put(&r_xprt->rx_buf, req); ++out_get: ++#endif + return -ENOMEM; + } + +@@ -598,13 +698,25 @@ static void + xprt_rdma_free(struct rpc_task *task) + { + struct rpc_rqst *rqst = task->tk_rqstp; ++#if !defined(HAVE_XPRT_PIN_RQST) || !defined(HAVE_RPC_XPRT_OPS_FREE_SLOT) ++ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); ++#endif + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + ++#ifndef HAVE_XPRT_PIN_RQST ++ rpcrdma_remove_req(&r_xprt->rx_buf, req); ++#endif ++ + if 
(unlikely(!list_empty(&req->rl_registered))) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_mrs_zap(task); ++#endif + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } + ++#ifndef HAVE_RPC_XPRT_OPS_FREE_SLOT ++ rpcrdma_buffer_put(&r_xprt->rx_buf, req); ++#endif + /* XXX: If the RPC is completing because of a signal and + * not because a reply was received, we ought to ensure + * that the Send completion has fired, so that memory +@@ -629,8 +741,14 @@ xprt_rdma_free(struct rpc_task *task) + * Do not try to send this message again. + */ + static int ++#ifdef HAVE_XPRT_OPS_SEND_REQUEST_RQST_ARG + xprt_rdma_send_request(struct rpc_rqst *rqst) + { ++#else ++xprt_rdma_send_request(struct rpc_task *task) ++{ ++ struct rpc_rqst *rqst = task->tk_rqstp; ++#endif + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); +@@ -644,8 +762,10 @@ xprt_rdma_send_request(struct rpc_rqst * + if (!xprt_connected(xprt)) + return -ENOTCONN; + ++#ifdef HAVE_XPRT_REQUEST_GET_CONG + if (!xprt_request_get_cong(xprt, rqst)) + return -EBADSLT; ++#endif + + rc = rpcrdma_marshal_req(r_xprt, rqst); + if (rc < 0) +@@ -661,11 +781,13 @@ xprt_rdma_send_request(struct rpc_rqst * + + rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + ++#ifdef HAVE_RPC_REPLY_EXPECTED + /* An RPC with no reply will throw off credit accounting, + * so drop the connection to reset the credit grant. + */ + if (!rpc_reply_expected(rqst->rq_task)) + goto drop_connection; ++#endif + return 0; + + failed_marshal: +@@ -731,14 +853,26 @@ xprt_rdma_disable_swap(struct rpc_xprt * + /* + * Plumbing for rpc transport switch and kernel module + */ +- ++#ifdef HAVE_RPC_XPRT_OPS_CONST + static const struct rpc_xprt_ops xprt_rdma_procs = { ++#else ++static struct rpc_xprt_ops xprt_rdma_procs = { ++#endif + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + .alloc_slot = xprt_rdma_alloc_slot, + .free_slot = xprt_rdma_free_slot, ++#else ++ .alloc_slot = xprt_alloc_slot, ++#endif + .release_request = xprt_release_rqst_cong, /* ditto */ ++#ifdef HAVE_RPC_XPRT_OPS_SET_RETRANS_TIMEOUT ++ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ ++#endif ++#ifdef HAVE_RPC_XPRT_OPS_WAIT_FOR_REPLY_REQUEST + .wait_for_reply_request = xprt_wait_for_reply_request_def, /* ditto */ ++#endif + .timer = xprt_rdma_timer, + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ + .set_port = xprt_rdma_set_port, +@@ -748,15 +882,22 @@ static const struct rpc_xprt_ops xprt_rd + .send_request = xprt_rdma_send_request, + .close = xprt_rdma_close, + .destroy = xprt_rdma_destroy, ++#ifdef HAVE_XPRT_RECONNECT_DELAY + .set_connect_timeout = xprt_rdma_set_connect_timeout, ++#endif + .print_stats = xprt_rdma_print_stats, + .enable_swap = xprt_rdma_enable_swap, + .disable_swap = xprt_rdma_disable_swap, + .inject_disconnect = xprt_rdma_inject_disconnect, + #if defined(CONFIG_SUNRPC_BACKCHANNEL) + .bc_setup = xprt_rdma_bc_setup, ++#ifdef HAVE_RPC_XPRT_OPS_BC_UP ++ .bc_up = xprt_rdma_bc_up, ++#endif + .bc_maxpayload = xprt_rdma_bc_maxpayload, ++#ifdef HAVE_RPC_XPRT_OPS_BC_NUM_SLOTS + .bc_num_slots = xprt_rdma_bc_max_slots, ++#endif + .bc_free_rqst = xprt_rdma_bc_free_rqst, + .bc_destroy = xprt_rdma_bc_destroy, + #endif +@@ -768,7 +909,9 @@ static struct xprt_class xprt_rdma = { + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_RDMA, + .setup = xprt_setup_rdma, ++#ifdef HAVE_XPRT_CLASS_NETID + .netid = { 
"rdma", "rdma6", "" }, ++#endif + }; + + void xprt_rdma_cleanup(void) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-drivers-infiniband-core-umem.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-drivers-infiniband-core-umem.c.patch new file mode 100644 index 0000000..c66b902 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-drivers-infiniband-core-umem.c.patch @@ -0,0 +1,683 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/umem.c + +Change-Id: I740d2558bcb559eebc2949b7fdc6bb36e478b091 +--- + drivers/infiniband/core/umem.c | 467 ++++++++++++++++++++++++++++++++- + 1 file changed, 463 insertions(+), 4 deletions(-) + +--- a/drivers/infiniband/core/umem.c ++++ b/drivers/infiniband/core/umem.c +@@ -39,9 +39,17 @@ + #include + #include + #include ++#include ++#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM) ++#include ++#endif + #include ++#ifdef HAVE_LINUX_COUNT_ZEROS_H + #include ++#endif ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + #include ++#endif + + #include "uverbs.h" + +@@ -49,21 +57,79 @@ + + static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) + { ++#ifdef HAVE_UNPIN_USER_PAGE_RANGE_DIRTY_LOCK_EXPORTED + bool make_dirty = umem->writable && dirty; + struct scatterlist *sg; + unsigned int i; ++#else ++ struct sg_page_iter sg_iter; ++ struct page *page; ++#endif + ++#ifdef HAVE_SG_APPEND_TABLE + if (dirty) + ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, + DMA_BIDIRECTIONAL, 0); ++#else ++ if (umem->nmap > 0) ++ ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents, ++ DMA_BIDIRECTIONAL); ++#endif + ++#ifdef HAVE_UNPIN_USER_PAGE_RANGE_DIRTY_LOCK_EXPORTED ++#ifdef HAVE_SG_APPEND_TABLE + for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) ++#else ++ for_each_sg(umem->sg_head.sgl, sg, umem->sg_nents, i) ++#endif + unpin_user_page_range_dirty_lock(sg_page(sg), +- DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); ++ DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); ++#else ++ for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { ++ page = sg_page_iter_page(&sg_iter); ++#ifdef HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED ++ unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty); ++#elif defined(HAVE_PUT_USER_PAGES_DIRTY_LOCK_3_PARAMS) ++ put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); ++#elif defined(HAVE_PUT_USER_PAGES_DIRTY_LOCK_2_PARAMS) ++ if (umem->writable && dirty) ++ put_user_pages_dirty_lock(&page, 1); ++ else ++ put_user_page(page); ++#else ++ if (!PageDirty(page) && umem->writable && dirty) ++ set_page_dirty_lock(page); ++ put_page(page); ++#endif /*HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED*/ ++ } ++#endif /*HAVE_UNPIN_USER_PAGE_RANGE_DIRTY_LOCK_EXPORTED*/ + ++#ifdef HAVE_SG_APPEND_TABLE + sg_free_append_table(&umem->sgt_append); ++#else ++ sg_free_table(&umem->sg_head); ++#endif /*HAVE_SG_APPEND_TABLE*/ + } + ++#ifndef HAVE_LINUX_COUNT_ZEROS_H ++static inline unsigned int rdma_find_pg_bit(unsigned long addr, ++ unsigned long pgsz_bitmap) ++{ ++ unsigned long align; ++ unsigned long pgsz; ++ ++ align = addr & -addr; ++ ++ /* Find page bit such that addr is aligned to the highest supported ++ * * HW page size ++ * */ ++ pgsz = pgsz_bitmap & ~(-align << 1); ++ if (!pgsz) ++ return __ffs(pgsz_bitmap); ++ ++ return __fls(pgsz); ++} ++#endif + /** + * ib_umem_find_best_pgsz - Find best HW page size to use for this MR + * +@@ -83,10 +149,14 @@ unsigned long ib_umem_find_best_pgsz(str + unsigned long virt) + { + struct 
scatterlist *sg; ++#ifndef HAVE_LINUX_COUNT_ZEROS_H ++ unsigned int best_pg_bit; ++#endif + unsigned long va, pgoff; + dma_addr_t mask; + int i; + ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem->is_odp) { + unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift); + +@@ -95,13 +165,17 @@ unsigned long ib_umem_find_best_pgsz(str + return 0; + return page_size; + } ++#endif + + /* rdma_for_each_block() has a bug if the page size is smaller than the + * page size used to build the umem. For now prevent smaller page sizes + * from being returned. + */ ++#ifndef CONFIG_COMPAT_GENMASK_32_BIT + pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT); +- ++#else ++ pgsz_bitmap &= GENMASK_ULL(BITS_PER_LONG - 1, PAGE_SHIFT); ++#endif + umem->iova = va = virt; + /* The best result is the smallest page size that results in the minimum + * number of required pages. Compute the largest page size that could +@@ -113,7 +187,12 @@ unsigned long ib_umem_find_best_pgsz(str + /* offset into first SGL */ + pgoff = umem->address & ~PAGE_MASK; + ++#ifdef HAVE_SG_APPEND_TABLE + for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { ++#else ++ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { ++#endif ++ + /* Walk SGL and reduce max page size if VA/PA bits differ + * for any address. + */ +@@ -123,7 +202,11 @@ unsigned long ib_umem_find_best_pgsz(str + * the maximum possible page size as the low bits of the iova + * must be zero when starting the next chunk. + */ ++#ifdef HAVE_SG_APPEND_TABLE + if (i != (umem->sgt_append.sgt.nents - 1)) ++#else ++ if (i != (umem->nmap - 1)) ++#endif + mask |= va; + pgoff = 0; + } +@@ -132,12 +215,76 @@ unsigned long ib_umem_find_best_pgsz(str + * address differ, thus the length of trailing 0 is the largest page + * size that can pass the VA through to the physical. + */ ++#ifdef HAVE_LINUX_COUNT_ZEROS_H + if (mask) + pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); + return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0; ++#else ++ best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap); ++ ++ return BIT_ULL(best_pg_bit); ++#endif + } + EXPORT_SYMBOL(ib_umem_find_best_pgsz); + ++#if !defined( HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS) && !defined(HAVE_SG_APPEND_TABLE) ++static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, ++ struct page **page_list, ++ unsigned long npages, ++ unsigned int max_seg_sz, ++ int *nents) ++{ ++ unsigned long first_pfn; ++ unsigned long i = 0; ++ bool update_cur_sg = false; ++ bool first = !sg_page(sg); ++ ++ /* Check if new page_list is contiguous with end of previous page_list. ++ * * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. 
++ * */ ++ if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == ++ page_to_pfn(page_list[0]))) ++ update_cur_sg = true; ++ ++ while (i != npages) { ++ unsigned long len; ++ struct page *first_page = page_list[i]; ++ ++ first_pfn = page_to_pfn(first_page); ++ ++ /* Compute the number of contiguous pages we have starting ++ * * at i ++ * */ ++ for (len = 0; i != npages && ++ first_pfn + len == page_to_pfn(page_list[i]) && ++ len < (max_seg_sz >> PAGE_SHIFT); ++ len++) ++ i++; ++ ++ /* Squash N contiguous pages from page_list into current sge */ ++ if (update_cur_sg) { ++ if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) { ++ sg_set_page(sg, sg_page(sg), ++ sg->length + (len << PAGE_SHIFT), ++ 0); ++ update_cur_sg = false; ++ continue; ++ } ++ update_cur_sg = false; ++ } ++ /* Squash N contiguous pages into next sge or first sge */ ++ if (!first) ++ sg = sg_next(sg); ++ ++ (*nents)++; ++ sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); ++ first = false; ++ } ++ ++ return sg; ++} ++#endif ++ + /** + * __ib_umem_get - Pin and DMA map userspace memory. + * +@@ -147,20 +294,68 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz); + * @access: IB_ACCESS_xxx flags for memory being pinned + * @peer_mem_flags: IB_PEER_MEM_xxx flags for memory being used + */ ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + static struct ib_umem *__ib_umem_get(struct ib_device *device, ++#else ++struct ib_umem *__ib_umem_get(struct ib_udata *udata, ++#endif + unsigned long addr, size_t size, int access, + unsigned long peer_mem_flags) + { ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ struct ib_ucontext *context; ++#endif + struct ib_umem *umem; + struct page **page_list; + unsigned long lock_limit; ++#if defined(HAVE_PINNED_VM) || defined(HAVE_ATOMIC_PINNED_VM) + unsigned long new_pinned; ++#endif + unsigned long cur_base; + unsigned long dma_attr = 0; + struct mm_struct *mm; + unsigned long npages; ++#ifdef HAVE_SG_APPEND_TABLE + int pinned, ret; ++#else ++ int ret; ++ struct scatterlist *sg = NULL; ++#endif /*HAVE_SG_APPEND_TABLE*/ ++#ifdef HAVE_GET_USER_PAGES_GUP_FLAGS + unsigned int gup_flags = FOLL_WRITE; ++#endif ++#if defined(HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS) && (!defined(HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED) && !defined(HAVE_PUT_USER_PAGES_DIRTY_LOCK_3_PARAMS)) ++ unsigned long index; ++#endif ++#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM) ++ struct vm_area_struct **vma_list; ++ int i; ++#endif ++#ifdef DMA_ATTR_WRITE_BARRIER ++#ifdef HAVE_STRUCT_DMA_ATTRS ++ DEFINE_DMA_ATTRS(attrs); ++#else ++ unsigned long dma_attrs = 0; ++#endif //HAVE_STRUCT_DMA_ATTRS ++#endif //DMA_ATTR_WRITE_BARRIER ++ ++#ifdef DMA_ATTR_WRITE_BARRIER ++#ifdef HAVE_STRUCT_DMA_ATTRS ++ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); ++#else ++ dma_attrs |= DMA_ATTR_WRITE_BARRIER; ++#endif //HAVE_STRUCT_DMA_ATTRS ++#endif //DMA_ATTR_WRITE_BARRIER ++ ++#ifndef HAVE_MMU_INTERVAL_NOTIFIER ++ if (!udata) ++ return ERR_PTR(-EIO); ++ ++ context = container_of(udata, struct uverbs_attr_bundle, driver_udata) ++ ->context; ++ if (!context) ++ return ERR_PTR(-EIO); ++#endif + + /* + * If the combination of the addr and size requested for this memory +@@ -183,7 +378,15 @@ static struct ib_umem *__ib_umem_get(str + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + umem->ibdev = device; ++#else ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER ++ umem->ibdev = context->device; ++#else ++ umem->context = context; ++#endif ++#endif + umem->length 
= size; + umem->address = addr; + /* +@@ -195,12 +398,24 @@ static struct ib_umem *__ib_umem_get(str + umem->owning_mm = mm = current->mm; + mmgrab(mm); + ++#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM) ++ /* We assume the memory is from hugetlb until proved otherwise */ ++ umem->hugetlb = 1; ++#endif + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) { + ret = -ENOMEM; + goto umem_kfree; + } +- ++#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM) ++ /* ++ * * if we can't alloc the vma_list, it's not so bad; ++ * * just assume the memory is not hugetlb memory ++ * */ ++ vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); ++ if (!vma_list) ++ umem->hugetlb = 0; ++#endif + npages = ib_umem_num_pages(umem); + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; +@@ -209,18 +424,54 @@ static struct ib_umem *__ib_umem_get(str + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + ++#ifdef HAVE_ATOMIC_PINNED_VM + new_pinned = atomic64_add_return(npages, &mm->pinned_vm); + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { ++#else ++ down_write(&mm->mmap_sem); ++#ifdef HAVE_PINNED_VM ++ if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || ++ (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { ++#else ++ current->mm->locked_vm += npages; ++ if ((current->mm->locked_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { ++#endif /* HAVE_PINNED_VM */ ++#endif /* HAVE_ATOMIC_PINNED_VM */ ++ ++#ifdef HAVE_ATOMIC_PINNED_VM + atomic64_sub(npages, &mm->pinned_vm); ++#else ++ up_write(&mm->mmap_sem); ++#ifndef HAVE_PINNED_VM ++ current->mm->locked_vm -= npages; ++#endif /* HAVE_PINNED_VM */ ++#endif /* HAVE_ATOMIC_PINNED_VM */ + ret = -ENOMEM; + goto out; + } ++#ifndef HAVE_ATOMIC_PINNED_VM ++#ifdef HAVE_PINNED_VM ++ mm->pinned_vm = new_pinned; ++#endif /* HAVE_PINNED_VM */ ++ up_write(&mm->mmap_sem); ++#endif /* HAVE_ATOMIC_PINNED_VM */ + + cur_base = addr & PAGE_MASK; + ++#if !defined( HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS) && !defined(HAVE_SG_APPEND_TABLE) ++ ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); ++ if (ret) ++ goto vma; ++#endif ++#ifdef HAVE_GET_USER_PAGES_GUP_FLAGS + if (!umem->writable) + gup_flags |= FOLL_FORCE; ++#endif ++#if !defined( HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS) && !defined(HAVE_SG_APPEND_TABLE) ++ sg = umem->sg_head.sgl; ++#endif + ++#ifdef HAVE_SG_APPEND_TABLE + while (npages) { + cond_resched(); + pinned = pin_user_pages_fast(cur_base, +@@ -261,8 +512,163 @@ static struct ib_umem *__ib_umem_get(str + } + goto out; + ++#else /*HAVE_SG_APPEND_TABLE*/ ++ while (npages) { ++ cond_resched(); ++#ifdef HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED ++ ret = pin_user_pages_fast(cur_base, ++ min_t(unsigned long, npages, ++ PAGE_SIZE / ++ sizeof(struct page *)), ++ gup_flags | FOLL_LONGTERM, page_list); ++ if (ret < 0) ++ goto umem_release; ++#else ++ down_read(&mm->mmap_sem); ++#ifdef HAVE_FOLL_LONGTERM ++ ret = get_user_pages(cur_base, ++ min_t(unsigned long, npages, ++ PAGE_SIZE / sizeof (struct page *)), ++ gup_flags | FOLL_LONGTERM, ++ page_list, NULL); ++#elif defined(HAVE_GET_USER_PAGES_LONGTERM) ++ ret = get_user_pages_longterm(cur_base, ++ min_t(unsigned long, npages, ++ PAGE_SIZE / sizeof (struct page *)), ++ gup_flags, page_list, NULL); ++#elif defined(HAVE_GET_USER_PAGES_8_PARAMS) ++ ret = get_user_pages(current, current->mm, cur_base, ++ min_t(unsigned long, npages, ++ PAGE_SIZE / sizeof (struct page *)), ++ 1, !umem->writable, page_list, 
vma_list); ++#else ++#ifdef HAVE_GET_USER_PAGES_7_PARAMS ++ ret = get_user_pages(current, current->mm, cur_base, ++#else ++ ret = get_user_pages(cur_base, ++#endif ++ min_t(unsigned long, npages, ++ PAGE_SIZE / sizeof (struct page *)), ++#ifdef HAVE_GET_USER_PAGES_GUP_FLAGS ++ gup_flags, page_list, vma_list); ++#else ++ 1, !umem->writable, page_list, vma_list); ++#endif ++#endif /*HAVE_FOLL_LONGTERM*/ ++ ++ if (ret < 0) { ++#ifdef HAVE_GET_USER_PAGES_GUP_FLAGS ++ pr_debug("%s: failed to get user pages, nr_pages=%lu, flags=%u\n", __func__, ++ min_t(unsigned long, npages, ++ PAGE_SIZE / sizeof(struct page *)), ++ gup_flags); ++#else ++ pr_debug("%s: failed to get user pages, nr_pages=%lu\n", __func__, ++ min_t(unsigned long, npages, ++ PAGE_SIZE / sizeof(struct page *))); ++#endif ++#ifndef HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED ++ up_read(&mm->mmap_sem); ++#endif ++ goto umem_release; ++ } ++#endif /*HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED*/ ++ ++ cur_base += ret * PAGE_SIZE; ++ npages -= ret; ++#ifdef HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS ++ sg = __sg_alloc_table_from_pages( ++ &umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT, ++#else ++ sg = ib_umem_add_sg_table(sg, page_list, ret, ++#endif ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ dma_get_max_seg_size(device->dma_device), ++#else ++ dma_get_max_seg_size(context->device->dma_device), ++#endif ++#ifdef HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS ++ sg, npages, ++ GFP_KERNEL); ++ umem->sg_nents = umem->sg_head.nents; ++ if (IS_ERR(sg)) { ++#ifdef HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED ++ unpin_user_pages_dirty_lock(page_list, ret, 0); ++#elif defined(HAVE_PUT_USER_PAGES_DIRTY_LOCK_3_PARAMS) ++ put_user_pages_dirty_lock(page_list, ret, 0); ++#elif defined(HAVE_PUT_USER_PAGES_DIRTY_LOCK_2_PARAMS) ++ for (index = 0; index < ret; index++) ++ put_user_page(page_list[index]); ++#else ++ for (index = 0; index < ret; index++) ++ put_page(page_list[index]); ++#endif /*HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED*/ ++ ret = PTR_ERR(sg); ++ goto umem_release; ++ } ++#else ++ &umem->sg_nents); ++#endif ++#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM) ++ /* Continue to hold the mmap_sem as vma_list access ++ * * needs to be protected. 
++ * */ ++ for (i = 0; i < ret && umem->hugetlb; i++) { ++ if (vma_list && !is_vm_hugetlb_page(vma_list[i])) ++ umem->hugetlb = 0; ++ } ++#endif ++#ifndef HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED ++ up_read(&mm->mmap_sem); ++#endif ++ } ++ ++#ifndef HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS ++ sg_mark_end(sg); ++#endif ++ if (access & IB_ACCESS_RELAXED_ORDERING) ++ dma_attr |= DMA_ATTR_WEAK_ORDERING; ++ ++#ifndef DMA_ATTR_WRITE_BARRIER ++ umem->nmap = ib_dma_map_sg( ++#else ++ umem->nmap = ib_dma_map_sg_attrs( ++#endif ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER ++ device, ++#else ++ context->device, ++#endif ++ umem->sg_head.sgl, ++ umem->sg_nents, ++ DMA_BIDIRECTIONAL ++#ifdef DMA_ATTR_WRITE_BARRIER ++#ifdef HAVE_STRUCT_DMA_ATTRS ++ , &attrs ++#else ++ , dma_attrs ++#endif //HAVE_STRUCT_DMA_ATTRS ++#endif //DMA_ATTR_WRITE_BARRIER ++ ); ++ ++ if (!umem->nmap) { ++ pr_err("%s: failed to map scatterlist, npages=%lu\n", __func__, ++ npages); ++ ret = -ENOMEM; ++ goto umem_release; ++ } ++ ++ ret = 0; ++ goto out; ++ ++#endif /*HAVE_SG_APPEND_TABLE*/ ++ + umem_release: ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + __ib_umem_release(device, umem, 0); ++#else ++ __ib_umem_release(context->device, umem, 0); ++#endif + + /* + * If the address belongs to peer memory client, then the first +@@ -283,8 +689,22 @@ umem_release: + goto out; + } + vma: ++#ifdef HAVE_ATOMIC_PINNED_VM + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); ++#else ++ down_write(&mm->mmap_sem); ++#ifdef HAVE_PINNED_VM ++ mm->pinned_vm -= ib_umem_num_pages(umem); ++#else ++ mm->locked_vm -= ib_umem_num_pages(umem); ++#endif /* HAVE_PINNED_VM */ ++ up_write(&mm->mmap_sem); ++#endif /* HAVE_ATOMIC_PINNED_VM */ + out: ++#if !defined(HAVE_FOLL_LONGTERM) && !defined(HAVE_GET_USER_PAGES_LONGTERM) ++ if (vma_list) ++ free_page((unsigned long) vma_list); ++#endif + free_page((unsigned long) page_list); + umem_kfree: + if (ret) { +@@ -294,19 +714,36 @@ umem_kfree: + return ret ? 
ERR_PTR(ret) : umem; + } + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, ++#else ++struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, ++#endif + size_t size, int access) + { ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + return __ib_umem_get(device, addr, size, access, 0); ++#else ++ return __ib_umem_get(udata, addr, size, access, 0); ++#endif + } + EXPORT_SYMBOL(ib_umem_get); + ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr, ++#else ++struct ib_umem *ib_umem_get_peer(struct ib_udata *udata, unsigned long addr, ++#endif + size_t size, int access, + unsigned long peer_mem_flags) + { ++#ifdef HAVE_MMU_INTERVAL_NOTIFIER + return __ib_umem_get(device, addr, size, access, + IB_PEER_MEM_ALLOW | peer_mem_flags); ++#else ++ return __ib_umem_get(udata, addr, size, access, ++ IB_PEER_MEM_ALLOW | peer_mem_flags); ++#endif + } + EXPORT_SYMBOL(ib_umem_get_peer); + +@@ -318,16 +755,34 @@ void ib_umem_release(struct ib_umem *ume + { + if (!umem) + return; ++#ifdef HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS + if (umem->is_dmabuf) + return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem)); ++#endif ++#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem->is_odp) + return ib_umem_odp_release(to_ib_umem_odp(umem)); ++#endif + + if (umem->is_peer) + return ib_peer_umem_release(umem); ++#ifdef HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER + __ib_umem_release(umem->ibdev, umem, 1); ++#else ++ __ib_umem_release(umem->context->device, umem, 1); ++#endif + ++#ifdef HAVE_ATOMIC_PINNED_VM + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); ++#else ++ down_write(&umem->owning_mm->mmap_sem); ++#ifdef HAVE_PINNED_VM ++ umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); ++#else ++ umem->owning_mm->locked_vm -= ib_umem_num_pages(umem); ++#endif /* HAVE_PINNED_VM */ ++ up_write(&umem->owning_mm->mmap_sem); ++#endif /*HAVE_ATOMIC_PINNED_VM*/ + mmdrop(umem->owning_mm); + kfree(umem); + } +@@ -355,8 +810,12 @@ int ib_umem_copy_from(void *dst, struct + return -EINVAL; + } + ++#ifdef HAVE_SG_APPEND_TABLE + ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl, + umem->sgt_append.sgt.orig_nents, dst, length, ++#else ++ ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length, ++#endif + offset + ib_umem_offset(umem)); + + if (ret < 0) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_rw.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_rw.c.patch new file mode 100644 index 0000000..b888e8c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-svc_rdma_rw.c.patch @@ -0,0 +1,939 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/svc_rdma_rw.c + +Change-Id: I2e08d1c1010bef81e0a295134ca3f82601617287 +--- + net/sunrpc/xprtrdma/svc_rdma_rw.c | 446 +++++++++++++++++++++++++++++- + 1 file changed, 440 insertions(+), 6 deletions(-) + +--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c +@@ -12,7 +12,9 @@ + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + + static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); + static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); +@@ -71,16 +73,26 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma + } + + ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + if 
(sg_alloc_table_chained(&ctxt->rw_sg_table, sges, + ctxt->rw_sg_table.sgl, + SG_CHUNK_SIZE)) ++#else ++ if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_GFP_MASK ++ GFP_ATOMIC, ++#endif ++ ctxt->rw_sg_table.sgl)) ++#endif + goto out_free; + return ctxt; + + out_free: + kfree(ctxt); + out_noctx: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_no_rwctx_err(rdma, sges); ++#endif + return NULL; + } + +@@ -88,7 +100,11 @@ static void __svc_rdma_put_rw_ctxt(struc + struct svc_rdma_rw_ctxt *ctxt, + struct llist_head *list) + { ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); ++#else ++ sg_free_table_chained(&ctxt->rw_sg_table, true); ++#endif + llist_add(&ctxt->rw_node, list); + } + +@@ -137,7 +153,9 @@ static int svc_rdma_rw_ctx_init(struct s + 0, offset, handle, direction); + if (unlikely(ret < 0)) { + svc_rdma_put_rw_ctxt(rdma, ctxt); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret); ++#endif + } + return ret; + } +@@ -157,8 +175,10 @@ struct svc_rdma_chunk_ctxt { + struct list_head cc_rwctxts; + ktime_t cc_posttime; + int cc_sqecount; ++#ifdef HAVE_SVC_RDMA_PCL + enum ib_wc_status cc_status; + struct completion cc_done; ++#endif + }; + + static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, +@@ -214,7 +234,12 @@ static void svc_rdma_cc_release(struct s + * - Stores arguments for the SGL constructor functions + */ + struct svc_rdma_write_info { ++#ifdef HAVE_SVC_RDMA_PCL + const struct svc_rdma_chunk *wi_chunk; ++#else ++ unsigned int wi_nsegs; ++ __be32 *wi_segs; ++#endif + + /* write state of this chunk */ + unsigned int wi_seg_off; +@@ -229,8 +254,12 @@ struct svc_rdma_write_info { + }; + + static struct svc_rdma_write_info * ++#ifdef HAVE_SVC_RDMA_PCL + svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk) ++#else ++svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) ++#endif + { + struct svc_rdma_write_info *info; + +@@ -238,7 +267,12 @@ svc_rdma_write_info_alloc(struct svcxprt + if (!info) + return info; + ++#ifdef HAVE_SVC_RDMA_PCL + info->wi_chunk = chunk; ++#else ++ info->wi_nsegs = be32_to_cpup(++chunk); ++ info->wi_segs = ++chunk; ++#endif + info->wi_seg_off = 0; + info->wi_seg_no = 0; + svc_rdma_cc_init(rdma, &info->wi_cc); +@@ -268,6 +302,7 @@ static void svc_rdma_write_done(struct i + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); + ++#ifdef HAVE_TRACE_RPCRDMA_H + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_write(wc, &cc->cc_cid); +@@ -278,11 +313,16 @@ static void svc_rdma_write_done(struct i + default: + trace_svcrdma_wc_write_err(wc, &cc->cc_cid); + } ++#endif + + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + + if (unlikely(wc->status != IB_WC_SUCCESS)) ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(&rdma->sc_xprt); ++#else ++ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); ++#endif + + svc_rdma_write_info_free(info); + } +@@ -290,11 +330,16 @@ static void svc_rdma_write_done(struct i + /* State for pulling a Read chunk. 
+ */ + struct svc_rdma_read_info { ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rqst *ri_rqst; ++ unsigned int ri_totalbytes; ++#else ++ unsigned int ri_position; ++ unsigned int ri_chunklen; ++#endif + struct svc_rdma_recv_ctxt *ri_readctxt; + unsigned int ri_pageno; + unsigned int ri_pageoff; +- unsigned int ri_totalbytes; + + struct svc_rdma_chunk_ctxt ri_cc; + }; +@@ -330,13 +375,25 @@ static void svc_rdma_wc_read_done(struct + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); +- struct svc_rdma_read_info *info; ++#ifndef HAVE_SVC_RDMA_PCL ++ struct svcxprt_rdma *rdma = cc->cc_rdma; ++#endif + ++#if !defined(HAVE_SVC_RDMA_PCL) || defined(HAVE_TRACE_RPCRDMA_H) ++ struct svc_rdma_read_info *info = ++ container_of(cc, struct svc_rdma_read_info, ri_cc); ++#endif ++ ++#ifdef HAVE_TRACE_RPCRDMA_H + switch (wc->status) { + case IB_WC_SUCCESS: +- info = container_of(cc, struct svc_rdma_read_info, ri_cc); ++#ifdef HAVE_SVC_RDMA_PCL + trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_totalbytes, + cc->cc_posttime); ++#else ++ trace_svcrdma_wc_read(wc, &cc->cc_cid, info->ri_chunklen, ++ cc->cc_posttime); ++#endif + break; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_read_flush(wc, &cc->cc_cid); +@@ -344,11 +401,30 @@ static void svc_rdma_wc_read_done(struct + default: + trace_svcrdma_wc_read_err(wc, &cc->cc_cid); + } ++#endif + + svc_rdma_wake_send_waiters(cc->cc_rdma, cc->cc_sqecount); ++#ifdef HAVE_SVC_RDMA_PCL + cc->cc_status = wc->status; + complete(&cc->cc_done); + return; ++#else ++ if (unlikely(wc->status != IB_WC_SUCCESS)) { ++ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); ++ svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt); ++ } else { ++ spin_lock(&rdma->sc_rq_dto_lock); ++ list_add_tail(&info->ri_readctxt->rc_list, ++ &rdma->sc_read_complete_q); ++ /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */ ++ set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); ++ spin_unlock(&rdma->sc_rq_dto_lock); ++ ++ svc_xprt_enqueue(&rdma->sc_xprt); ++ } ++ ++ svc_rdma_read_info_free(info); ++#endif + } + + /* This function sleeps when the transport's Send Queue is congested. +@@ -361,6 +437,9 @@ static void svc_rdma_wc_read_done(struct + static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) + { + struct svcxprt_rdma *rdma = cc->cc_rdma; ++#ifndef HAVE_SVC_XPRT_DEFERRED_CLOSE ++ struct svc_xprt *xprt = &rdma->sc_xprt; ++#endif + struct ib_send_wr *first_wr; + const struct ib_send_wr *bad_wr; + struct list_head *tmp; +@@ -392,15 +471,25 @@ static int svc_rdma_post_chunk_ctxt(stru + } + + percpu_counter_inc(&svcrdma_stat_sq_starve); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_sq_full(rdma); ++#endif + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_sq_retry(rdma); ++#endif + } while (1); + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_sq_post_err(rdma, ret); ++#endif ++#ifdef HAVE_SVC_XPRT_DEFERRED_CLOSE + svc_xprt_deferred_close(&rdma->sc_xprt); ++#else ++ set_bit(XPT_CLOSE, &xprt->xpt_flags); ++#endif + + /* If even one was posted, there will be a completion. 
*/ + if (bad_wr != first_wr) +@@ -470,10 +559,15 @@ svc_rdma_build_writes(struct svc_rdma_wr + { + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct svcxprt_rdma *rdma = cc->cc_rdma; ++#ifdef HAVE_SVC_RDMA_PCL + const struct svc_rdma_segment *seg; ++#else ++ __be32 *seg; ++#endif + struct svc_rdma_rw_ctxt *ctxt; + int ret; + ++#ifdef HAVE_SVC_RDMA_PCL + do { + unsigned int write_len; + u64 offset; +@@ -483,6 +577,21 @@ svc_rdma_build_writes(struct svc_rdma_wr + + seg = &info->wi_chunk->ch_segments[info->wi_seg_no]; + write_len = min(remaining, seg->rs_length - info->wi_seg_off); ++#else ++ seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; ++ do { ++ unsigned int write_len; ++ u32 handle, length; ++ u64 offset; ++ ++ if (info->wi_seg_no >= info->wi_nsegs) ++ goto out_overflow; ++ ++ xdr_decode_rdma_segment(seg, &handle, &length, &offset); ++ offset += info->wi_seg_off; ++ ++ write_len = min(remaining, length - info->wi_seg_off); ++#endif + if (!write_len) + goto out_overflow; + ctxt = svc_rdma_get_rw_ctxt(rdma, +@@ -491,8 +600,12 @@ svc_rdma_build_writes(struct svc_rdma_wr + return -ENOMEM; + + constructor(info, write_len, ctxt); ++#ifdef HAVE_SVC_RDMA_PCL + offset = seg->rs_offset + info->wi_seg_off; + ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, ++#else ++ ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle, ++#endif + DMA_TO_DEVICE); + if (ret < 0) + return -EIO; +@@ -500,7 +613,12 @@ svc_rdma_build_writes(struct svc_rdma_wr + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; ++#ifdef HAVE_SVC_RDMA_PCL + if (write_len == seg->rs_length - info->wi_seg_off) { ++#else ++ if (write_len == length - info->wi_seg_off) { ++ seg += 4; ++#endif + info->wi_seg_no++; + info->wi_seg_off = 0; + } else { +@@ -512,8 +630,14 @@ svc_rdma_build_writes(struct svc_rdma_wr + return 0; + + out_overflow: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no, ++#ifdef HAVE_SVC_RDMA_PCL + info->wi_chunk->ch_segcount); ++#else ++ info->wi_nsegs); ++#endif ++#endif + return -E2BIG; + } + +@@ -560,6 +684,7 @@ static int svc_rdma_pages_write(struct s + length); + } + ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_xb_write - Construct RDMA Writes to write an xdr_buf + * @xdr: xdr_buf to write +@@ -597,6 +722,7 @@ static int svc_rdma_xb_write(const struc + + return xdr->len; + } ++#endif + + /** + * svc_rdma_send_write_chunk - Write all segments in a Write chunk +@@ -611,28 +737,54 @@ static int svc_rdma_xb_write(const struc + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). 
+ */ ++#ifdef HAVE_SVC_RDMA_PCL + int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) ++#else ++int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, ++ struct xdr_buf *xdr, ++ unsigned int offset, unsigned long length) ++#endif + { + struct svc_rdma_write_info *info; + struct svc_rdma_chunk_ctxt *cc; + int ret; + ++#ifndef HAVE_SVC_RDMA_PCL ++ if (!length) ++ return 0; ++#endif ++ ++#ifdef HAVE_SVC_RDMA_PCL + info = svc_rdma_write_info_alloc(rdma, chunk); ++#else ++ info = svc_rdma_write_info_alloc(rdma, wr_ch); ++#endif + if (!info) + return -ENOMEM; + cc = &info->wi_cc; + ++#ifdef HAVE_SVC_RDMA_PCL + ret = svc_rdma_xb_write(xdr, info); + if (ret != xdr->len) ++#else ++ ret = svc_rdma_pages_write(info, xdr, offset, length); ++ if (ret < 0) ++#endif + goto out_err; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); ++#endif + ret = svc_rdma_post_chunk_ctxt(cc); + if (ret < 0) + goto out_err; ++#ifdef HAVE_SVC_RDMA_PCL + return xdr->len; ++#else ++ return length; ++#endif + + out_err: + svc_rdma_write_info_free(info); +@@ -654,33 +806,77 @@ out_err: + */ + int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, ++#ifdef HAVE_SVC_RDMA_PCL + const struct xdr_buf *xdr) ++#else ++ struct xdr_buf *xdr) ++#endif + { + struct svc_rdma_write_info *info; + struct svc_rdma_chunk_ctxt *cc; ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rdma_chunk *chunk; + int ret; ++#else ++ int consumed, ret; ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + if (pcl_is_empty(&rctxt->rc_reply_pcl)) + return 0; + + chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); + info = svc_rdma_write_info_alloc(rdma, chunk); ++#else ++ info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk); ++#endif + if (!info) + return -ENOMEM; + cc = &info->wi_cc; + ++#ifdef HAVE_SVC_RDMA_PCL + ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + svc_rdma_xb_write, info); ++#else ++ ret = svc_rdma_iov_write(info, &xdr->head[0]); ++#endif + if (ret < 0) + goto out_err; + ++#ifndef HAVE_SVC_RDMA_PCL ++ consumed = xdr->head[0].iov_len; ++ ++ /* Send the page list in the Reply chunk only if the ++ * client did not provide Write chunks. 
++ */ ++ if (!rctxt->rc_write_list && xdr->page_len) { ++ ret = svc_rdma_pages_write(info, xdr, xdr->head[0].iov_len, ++ xdr->page_len); ++ if (ret < 0) ++ goto out_err; ++ consumed += xdr->page_len; ++ } ++ ++ if (xdr->tail[0].iov_len) { ++ ret = svc_rdma_iov_write(info, &xdr->tail[0]); ++ if (ret < 0) ++ goto out_err; ++ consumed += xdr->tail[0].iov_len; ++ } ++#endif ++ ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); ++#endif + ret = svc_rdma_post_chunk_ctxt(cc); + if (ret < 0) + goto out_err; + ++#ifdef HAVE_SVC_RDMA_PCL + return xdr->len; ++#else ++ return consumed; ++#endif + + out_err: + svc_rdma_write_info_free(info); +@@ -699,17 +895,28 @@ out_err: + * %-EIO: a DMA mapping error occurred + */ + static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, ++#ifdef HAVE_SVC_RDMA_PCL + const struct svc_rdma_segment *segment) ++#else ++ struct svc_rqst *rqstp, ++ u32 rkey, u32 len, u64 offset) ++#endif + { + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; ++#ifdef HAVE_SVC_RDMA_PCL + struct svc_rqst *rqstp = info->ri_rqst; + unsigned int sge_no, seg_len, len; ++#else ++ unsigned int sge_no, seg_len; ++#endif + struct svc_rdma_rw_ctxt *ctxt; + struct scatterlist *sg; + int ret; + ++#ifdef HAVE_SVC_RDMA_PCL + len = segment->rs_length; ++#endif + sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + if (!ctxt) +@@ -721,6 +928,10 @@ static int svc_rdma_build_read_segment(s + seg_len = min_t(unsigned int, len, + PAGE_SIZE - info->ri_pageoff); + ++#ifndef HAVE_SVC_RDMA_PCL ++ head->rc_arg.pages[info->ri_pageno] = ++ rqstp->rq_pages[info->ri_pageno]; ++#endif + if (!info->ri_pageoff) + head->rc_page_count++; + +@@ -741,8 +952,13 @@ static int svc_rdma_build_read_segment(s + goto out_overrun; + } + ++#ifdef HAVE_SVC_RDMA_PCL + ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, segment->rs_offset, + segment->rs_handle, DMA_FROM_DEVICE); ++#else ++ ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey, ++ DMA_FROM_DEVICE); ++#endif + if (ret < 0) + return -EIO; + percpu_counter_inc(&svcrdma_stat_read); +@@ -752,10 +968,13 @@ static int svc_rdma_build_read_segment(s + return 0; + + out_overrun: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno); ++#endif + return -EINVAL; + } + ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_build_read_chunk - Build RDMA Read WQEs to pull one RDMA chunk + * @info: context for ongoing I/O +@@ -782,7 +1001,36 @@ static int svc_rdma_build_read_chunk(str + } + return ret; + } ++#else ++/* Walk the segments in the Read chunk starting at @p and construct ++ * RDMA Read operations to pull the chunk to the server. 
++ */ ++static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, ++ struct svc_rdma_read_info *info, ++ __be32 *p) ++{ ++ int ret; + ++ ret = -EINVAL; ++ info->ri_chunklen = 0; ++ while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { ++ u32 handle, length; ++ u64 offset; ++ ++ p = xdr_decode_rdma_segment(p, &handle, &length, &offset); ++ ret = svc_rdma_build_read_segment(info, rqstp, handle, length, ++ offset); ++ if (ret < 0) ++ break; ++ ++ info->ri_chunklen += length; ++ } ++ return ret; ++} ++#endif ++ ++ ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_copy_inline_range - Copy part of the inline content into pages + * @info: context for RDMA Reads +@@ -803,7 +1051,11 @@ static int svc_rdma_copy_inline_range(st + unsigned int remaining) + { + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + unsigned char *dst, *src = head->rc_recv_buf; ++#else ++ unsigned char *dst, *src = page_address(head->rc_pages[0]); ++#endif + struct svc_rqst *rqstp = info->ri_rqst; + unsigned int page_no, numpages; + +@@ -814,10 +1066,18 @@ static int svc_rdma_copy_inline_range(st + page_len = min_t(unsigned int, remaining, + PAGE_SIZE - info->ri_pageoff); + ++#ifndef HAVE_SVC_RDMA_PCL ++ head->rc_arg.pages[info->ri_pageno] = ++ rqstp->rq_pages[info->ri_pageno]; ++#endif + if (!info->ri_pageoff) + head->rc_page_count++; + ++#ifdef HAVE_SVC_RDMA_PCL + dst = page_address(rqstp->rq_pages[info->ri_pageno]); ++#else ++ dst = page_address(head->rc_arg.pages[info->ri_pageno]); ++#endif + memcpy(dst + info->ri_pageno, src + offset, page_len); + + info->ri_totalbytes += page_len; +@@ -851,7 +1111,11 @@ static noinline int svc_rdma_read_multip + { + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + const struct svc_rdma_pcl *pcl = &head->rc_read_pcl; ++#ifdef HAVE_SVC_RDMA_PCL + struct xdr_buf *buf = &info->ri_rqst->rq_arg; ++#else ++ struct xdr_buf *buf = &head->rc_arg; ++#endif + struct svc_rdma_chunk *chunk, *next; + unsigned int start, length; + int ret; +@@ -888,12 +1152,18 @@ static noinline int svc_rdma_read_multip + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; + ++#ifdef HAVE_SVC_RDMA_PCL + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); +- buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->pages = &info->ri_rqst->rq_pages[1]; ++#else ++ head->rc_hdr_count = 1; ++ buf->head[0].iov_base = page_address(head->rc_pages[0]); ++#endif ++ buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; + return 0; + } ++#endif + + /** + * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks +@@ -912,24 +1182,46 @@ static noinline int svc_rdma_read_multip + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
+ */ ++#ifdef HAVE_SVC_RDMA_PCL + static int svc_rdma_read_data_item(struct svc_rdma_read_info *info) ++#else ++static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, ++ struct svc_rdma_read_info *info, ++ __be32 *p) ++#endif + { + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; ++#ifdef HAVE_SVC_RDMA_PCL + struct xdr_buf *buf = &info->ri_rqst->rq_arg; + struct svc_rdma_chunk *chunk; + unsigned int length; ++#endif + int ret; + ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++ info->ri_pageno = head->rc_hdr_count; ++ info->ri_pageoff = 0; ++#endif ++ ++#ifdef HAVE_SVC_RDMA_PCL + chunk = pcl_first_chunk(&head->rc_read_pcl); + ret = svc_rdma_build_read_chunk(info, chunk); ++#else ++ ret = svc_rdma_build_read_chunk(rqstp, info, p); ++#endif + if (ret < 0) + goto out; + ++#if !defined(HAVE_SVC_RDMA_PCL) && defined(HAVE_SVC_FILL_WRITE_VECTOR) ++ head->rc_hdr_count = 0; ++#endif ++ + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. + */ ++#ifdef HAVE_SVC_RDMA_PCL + buf->tail[0].iov_base = buf->head[0].iov_base + chunk->ch_position; + buf->tail[0].iov_len = buf->head[0].iov_len - chunk->ch_position; + buf->head[0].iov_len = chunk->ch_position; +@@ -948,11 +1240,34 @@ static int svc_rdma_read_data_item(struc + buf->page_len = length; + buf->len += length; + buf->buflen += length; ++#else ++ head->rc_arg.tail[0].iov_base = ++ head->rc_arg.head[0].iov_base + info->ri_position; ++ head->rc_arg.tail[0].iov_len = ++ head->rc_arg.head[0].iov_len - info->ri_position; ++ head->rc_arg.head[0].iov_len = info->ri_position; ++ ++ /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). ++ * ++ * If the client already rounded up the chunk length, the ++ * length does not change. Otherwise, the length of the page ++ * list is increased to include XDR round-up. ++ * ++ * Currently these chunks always start at page offset 0, ++ * thus the rounded-up length never crosses a page boundary. ++ */ ++ info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; ++ ++ head->rc_arg.page_len = info->ri_chunklen; ++ head->rc_arg.len += info->ri_chunklen; ++ head->rc_arg.buflen += info->ri_chunklen; ++#endif + + out: + return ret; + } + ++#ifdef HAVE_SVC_RDMA_PCL + /** + * svc_rdma_read_chunk_range - Build RDMA Read WQEs for portion of a chunk + * @info: context for RDMA Reads +@@ -1050,6 +1365,7 @@ static int svc_rdma_read_call_chunk(stru + length = call_chunk->ch_length - start; + return svc_rdma_read_chunk_range(info, call_chunk, start, length); + } ++#endif + + /** + * svc_rdma_read_special - Build RDMA Read WQEs to pull a Long Message +@@ -1069,27 +1385,101 @@ static int svc_rdma_read_call_chunk(stru + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
+ */ ++#ifdef HAVE_SVC_RDMA_PCL + static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info) ++#else ++static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, ++ struct svc_rdma_read_info *info, ++ __be32 *p) ++#endif + { ++#ifdef HAVE_SVC_RDMA_PCL + struct xdr_buf *buf = &info->ri_rqst->rq_arg; ++#endif ++#ifndef HAVE_SVC_RDMA_PCL ++ struct svc_rdma_recv_ctxt *head = info->ri_readctxt; ++#endif + int ret; + ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++ info->ri_pageno = head->rc_hdr_count - 1; ++ info->ri_pageoff = offset_in_page(head->rc_byte_len); ++#endif ++ ++#ifdef HAVE_SVC_RDMA_PCL + ret = svc_rdma_read_call_chunk(info); ++#else ++ ret = svc_rdma_build_read_chunk(rqstp, info, p); ++#endif + if (ret < 0) + goto out; + ++#ifdef HAVE_SVC_RDMA_PCL + buf->len += info->ri_totalbytes; + buf->buflen += info->ri_totalbytes; ++#else ++ head->rc_arg.len += info->ri_chunklen; ++ head->rc_arg.buflen += info->ri_chunklen; ++#endif + ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR ++#ifdef HAVE_SVC_RDMA_PCL + buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]); + buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes); + buf->pages = &info->ri_rqst->rq_pages[1]; + buf->page_len = info->ri_totalbytes - buf->head[0].iov_len; +- ++#else ++ head->rc_hdr_count = 1; ++ head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); ++ head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, ++ info->ri_chunklen); ++ ++ head->rc_arg.page_len = info->ri_chunklen - ++ head->rc_arg.head[0].iov_len; ++#endif ++#else ++ if (head->rc_arg.buflen <= head->rc_sges[0].length) { ++ /* Transport header and RPC message fit entirely ++ * in page where head iovec resides. ++ */ ++ head->rc_arg.head[0].iov_len = info->ri_chunklen; ++ } else { ++ /* Transport header and part of RPC message reside ++ * in the head iovec's page. ++ */ ++ head->rc_arg.head[0].iov_len = ++ head->rc_sges[0].length - head->rc_byte_len; ++ head->rc_arg.page_len = ++ info->ri_chunklen - head->rc_arg.head[0].iov_len; ++ } ++#endif ++ + out: + return ret; + } + ++#ifndef HAVE_SVC_RDMA_PCL ++/* Pages under I/O have been copied to head->rc_pages. Ensure they ++ * are not released by svc_xprt_release() until the I/O is complete. ++ * ++ * This has to be done after all Read WRs are constructed to properly ++ * handle a page that is part of I/O on behalf of two different RDMA ++ * segments. ++ * ++ * Do this only if I/O has been posted. Otherwise, we do indeed want ++ * svc_xprt_release() to clean things up properly. ++ */ ++static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, ++ const unsigned int start, ++ const unsigned int num_pages) ++{ ++ unsigned int i; ++ ++ for (i = start; i < num_pages + start; i++) ++ rqstp->rq_pages[i] = NULL; ++} ++#endif ++ + /** + * svc_rdma_process_read_list - Pull list of Read chunks from the client + * @rdma: controlling RDMA transport +@@ -1113,24 +1503,52 @@ out: + * %-ENOTCONN: posting failed (connection is lost), + * %-EIO: rdma_rw initialization failed (DMA mapping, etc). 
+ */ ++#ifdef HAVE_SVC_RDMA_PCL + int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) ++#else ++int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, ++ struct svc_rdma_recv_ctxt *head, __be32 *p) ++#endif + { + struct svc_rdma_read_info *info; + struct svc_rdma_chunk_ctxt *cc; + int ret; + ++#ifndef HAVE_SVC_FILL_WRITE_VECTOR ++ head->rc_page_count = head->rc_hdr_count; ++#endif ++ ++#ifndef HAVE_SVC_RDMA_PCL ++ /* The request (with page list) is constructed in ++ * head->rc_arg. Pages involved with RDMA Read I/O are ++ * transferred there. ++ */ ++ head->rc_arg.head[0] = rqstp->rq_arg.head[0]; ++ head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; ++ head->rc_arg.pages = head->rc_pages; ++ head->rc_arg.page_base = 0; ++ head->rc_arg.page_len = 0; ++ head->rc_arg.len = rqstp->rq_arg.len; ++ head->rc_arg.buflen = rqstp->rq_arg.buflen; ++#endif ++ + info = svc_rdma_read_info_alloc(rdma); + if (!info) + return -ENOMEM; + cc = &info->ri_cc; ++#ifdef HAVE_SVC_RDMA_PCL + info->ri_rqst = rqstp; ++ info->ri_totalbytes = 0; ++#endif + info->ri_readctxt = head; ++#ifdef HAVE_SVC_FILL_WRITE_VECTOR + info->ri_pageno = 0; + info->ri_pageoff = 0; +- info->ri_totalbytes = 0; ++#endif + ++#ifdef HAVE_SVC_RDMA_PCL + if (pcl_is_empty(&head->rc_call_pcl)) { + if (head->rc_read_pcl.cl_count == 1) + ret = svc_rdma_read_data_item(info); +@@ -1138,15 +1556,27 @@ int svc_rdma_process_read_list(struct sv + ret = svc_rdma_read_multiple_chunks(info); + } else + ret = svc_rdma_read_special(info); ++#else ++ info->ri_position = be32_to_cpup(p + 1); ++ if (info->ri_position) ++ ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); ++ else ++ ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); ++#endif + if (ret < 0) + goto out_err; + ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount); ++#endif ++#ifdef HAVE_SVC_RDMA_PCL + init_completion(&cc->cc_done); ++#endif + ret = svc_rdma_post_chunk_ctxt(cc); + if (ret < 0) + goto out_err; + ++#ifdef HAVE_SVC_RDMA_PCL + ret = 1; + wait_for_completion(&cc->cc_done); + if (cc->cc_status != IB_WC_SUCCESS) +@@ -1158,6 +1588,10 @@ int svc_rdma_process_read_list(struct sv + + /* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */ + head->rc_page_count = 0; ++#else ++ svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count); ++ return 1; ++#endif + + out_err: + svc_rdma_read_info_free(info); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-verbs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-verbs.c.patch new file mode 100644 index 0000000..1551a74 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0269-BACKPORT-net-sunrpc-xprtrdma-verbs.c.patch @@ -0,0 +1,297 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/verbs.c + +Change-Id: I20e8c40ffa5c8db92396833be8ae554e601a5dd1 +--- + net/sunrpc/xprtrdma/verbs.c | 92 +++++++++++++++++++++++++++++++++++++ + 1 file changed, 92 insertions(+) + +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -55,14 +55,27 @@ + #include + #include + ++#include ++ ++#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0)) || \ ++ (defined(RHEL_MAJOR) && ((RHEL_MAJOR == 7 && RHEL_MINOR >= 6) || \ ++ RHEL_MAJOR >= 8))) + #include ++#endif + #include + + #include + + #include "xprt_rdma.h" ++#ifdef HAVE_TRACE_RPCRDMA_H + #include ++#endif + ++#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) ++#ifndef RPCDBG_FACILITY ++#define 
RPCDBG_FACILITY RPCDBG_TRANS ++#endif ++#endif + static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); + static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); + static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, +@@ -146,7 +159,9 @@ static void rpcrdma_wc_send(struct ib_cq + struct rpcrdma_xprt *r_xprt = cq->cq_context; + + /* WARNING: Only wr_cqe and status are reliable at this point */ ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_wc_send(wc, &sc->sc_cid); ++#endif + rpcrdma_sendctx_put_locked(r_xprt, sc); + rpcrdma_flush_disconnect(r_xprt, wc); + } +@@ -165,7 +180,9 @@ static void rpcrdma_wc_receive(struct ib + struct rpcrdma_xprt *r_xprt = cq->cq_context; + + /* WARNING: Only wr_cqe and status are reliable at this point */ ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); ++#endif + --r_xprt->rx_ep->re_receive_count; + if (wc->status != IB_WC_SUCCESS) + goto out_flushed; +@@ -253,7 +270,9 @@ rpcrdma_cm_event_handler(struct rdma_cm_ + rpcrdma_ep_get(ep); + ep->re_connect_status = 1; + rpcrdma_update_cm_private(ep, &event->param.conn); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_inline_thresh(ep); ++#endif + wake_up_all(&ep->re_connect_wait); + break; + case RDMA_CM_EVENT_CONNECT_ERROR: +@@ -532,7 +551,9 @@ int rpcrdma_xprt_connect(struct rpcrdma_ + frwr_wp_create(r_xprt); + + out: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_connect(r_xprt, rc); ++#endif + return rc; + } + +@@ -557,7 +578,9 @@ void rpcrdma_xprt_disconnect(struct rpcr + + id = ep->re_id; + rc = rdma_disconnect(id); ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_disconnect(r_xprt, rc); ++#endif + + rpcrdma_xprt_drain(r_xprt); + rpcrdma_reps_unmap(r_xprt); +@@ -642,6 +665,9 @@ static int rpcrdma_sendctxs_create(struc + buf->rb_sc_ctxs[i] = sc; + } + ++#ifndef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG ++ buf->rb_flags = 0; ++#endif + buf->rb_sc_head = 0; + buf->rb_sc_tail = 0; + return 0; +@@ -696,7 +722,11 @@ out_emptyq: + * completions recently. This is a sign the Send Queue is + * backing up. Cause the caller to pause and try again. + */ ++#ifdef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG + xprt_wait_for_buffer_space(&r_xprt->rx_xprt); ++#else ++ set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags); ++#endif + r_xprt->rx_stats.empty_sendctx_q++; + return NULL; + } +@@ -732,7 +762,14 @@ static void rpcrdma_sendctx_put_locked(s + /* Paired with READ_ONCE */ + smp_store_release(&buf->rb_sc_tail, next_tail); + ++#ifdef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG + xprt_write_space(&r_xprt->rx_xprt); ++#else ++ if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) { ++ smp_mb__after_atomic(); ++ xprt_write_space(&r_xprt->rx_xprt); ++ } ++#endif + } + + static void +@@ -763,7 +800,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt * + } + + r_xprt->rx_stats.mrs_allocated += count; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_createmrs(r_xprt, count); ++#endif ++#ifndef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG ++ xprt_write_space(&r_xprt->rx_xprt); ++#endif + } + + static void +@@ -775,7 +817,9 @@ rpcrdma_mr_refresh_worker(struct work_st + rx_buf); + + rpcrdma_mrs_create(r_xprt); ++#ifdef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG + xprt_write_space(&r_xprt->rx_xprt); ++#endif + } + + /** +@@ -796,7 +840,11 @@ void rpcrdma_mrs_refresh(struct rpcrdma_ + * workqueue in order to prevent MR allocation + * from recursing into NFS during direct reclaim. 
+ */ ++#ifdef HAVE_XPRT_RECONNECT_DELAY + queue_work(xprtiod_workqueue, &buf->rb_refresh_worker); ++#else ++ schedule_work(&buf->rb_refresh_worker); ++#endif + } + } + +@@ -896,8 +944,10 @@ static int rpcrdma_reqs_setup(struct rpc + + static void rpcrdma_req_reset(struct rpcrdma_req *req) + { ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + /* Credits are valid for only one connection */ + req->rl_slot.rq_cong = 0; ++#endif + + rpcrdma_regbuf_free(req->rl_rdmabuf); + req->rl_rdmabuf = NULL; +@@ -954,9 +1004,13 @@ struct rpcrdma_rep *rpcrdma_rep_create(s + rep->rr_recv_wr.num_sge = 1; + rep->rr_temp = temp; + ++#ifdef HAVE_XPRT_PIN_RQST + spin_lock(&buf->rb_lock); ++#endif + list_add(&rep->rr_all, &buf->rb_all_reps); ++#ifdef HAVE_XPRT_PIN_RQST + spin_unlock(&buf->rb_lock); ++#endif + return rep; + + out_free_regbuf: +@@ -967,6 +1021,7 @@ out: + return NULL; + } + ++#ifdef HAVE_XPRT_PIN_RQST + static void rpcrdma_rep_free(struct rpcrdma_rep *rep) + { + rpcrdma_regbuf_free(rep->rr_rdmabuf); +@@ -983,6 +1038,14 @@ static void rpcrdma_rep_destroy(struct r + + rpcrdma_rep_free(rep); + } ++#else ++static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) ++{ ++ list_del(&rep->rr_all); ++ rpcrdma_regbuf_free(rep->rr_rdmabuf); ++ kfree(rep); ++} ++#endif + + static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) + { +@@ -1025,6 +1088,7 @@ static void rpcrdma_reps_destroy(struct + { + struct rpcrdma_rep *rep; + ++#ifdef HAVE_XPRT_PIN_RQST + spin_lock(&buf->rb_lock); + while ((rep = list_first_entry_or_null(&buf->rb_all_reps, + struct rpcrdma_rep, +@@ -1037,6 +1101,10 @@ static void rpcrdma_reps_destroy(struct + spin_lock(&buf->rb_lock); + } + spin_unlock(&buf->rb_lock); ++#else ++ while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) ++ rpcrdma_rep_destroy(rep); ++#endif + } + + /** +@@ -1054,6 +1122,9 @@ int rpcrdma_buffer_create(struct rpcrdma + spin_lock_init(&buf->rb_lock); + INIT_LIST_HEAD(&buf->rb_mrs); + INIT_LIST_HEAD(&buf->rb_all_mrs); ++#ifndef HAVE_XPRT_PIN_RQST ++ INIT_LIST_HEAD(&buf->rb_pending); ++#endif + INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); + + INIT_LIST_HEAD(&buf->rb_send_bufs); +@@ -1079,6 +1150,19 @@ out: + return rc; + } + ++#ifndef HAVE_XPRT_PIN_RQST ++void rpcrdma_recv_buffer_put_locked(struct rpcrdma_rep *rep) ++{ ++ struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; ++ ++ if (!rep->rr_temp) { ++ llist_add(&rep->rr_node, &buffers->rb_free_reps); ++ } else { ++ rpcrdma_rep_destroy(rep); ++ } ++} ++#endif ++ + /** + * rpcrdma_req_destroy - Destroy an rpcrdma_req object + * @req: unused object to be destroyed +@@ -1298,7 +1382,9 @@ bool __rpcrdma_regbuf_dma_map(struct rpc + rb->rg_iov.addr = ib_dma_map_single(device, rdmab_data(rb), + rdmab_length(rb), rb->rg_direction); + if (ib_dma_mapping_error(device, rdmab_addr(rb))) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_dma_maperr(rdmab_addr(rb)); ++#endif + return false; + } + +@@ -1369,7 +1455,9 @@ void rpcrdma_post_recvs(struct rpcrdma_x + break; + + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_post_recv(rep); ++#endif + rep->rr_recv_wr.next = wr; + wr = &rep->rr_recv_wr; + --needed; +@@ -1381,7 +1469,9 @@ void rpcrdma_post_recvs(struct rpcrdma_x + rc = ib_post_recv(ep->re_id->qp, wr, + (const struct ib_recv_wr **)&bad_wr); + if (rc) { ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_post_recvs_err(r_xprt, rc); ++#endif + for (wr = bad_wr; wr;) { + struct rpcrdma_rep *rep; + +@@ -1395,7 +1485,9 @@ void rpcrdma_post_recvs(struct 
rpcrdma_x + complete(&ep->re_done); + + out: ++#ifdef HAVE_TRACE_RPCRDMA_H + trace_xprtrdma_post_recvs(r_xprt, count); ++#endif + ep->re_receive_count += count; + return; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..9f63908 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,3011 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_main.c + +Change-Id: I1d79f7746d92d2763217cc6fc748ff0a8a96b359 +--- + .../net/ethernet/mellanox/mlx5/core/en_main.c | 1354 ++++++++++++++++- + 1 file changed, 1281 insertions(+), 73 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +@@ -30,17 +30,30 @@ + * SOFTWARE. + */ + ++#ifdef CONFIG_MLX5_CLS_ACT + #include ++#endif + #include + #include ++#include ++#if defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) + #include ++#endif + #include + #include + #include + #include + #include ++#ifdef HAVE_NET_PAGE_POOL_H + #include ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_XDP_SOCK_DRV_H + #include ++#else ++#include ++#endif ++#endif + #include "eswitch.h" + #include "en.h" + #include "en/txrx.h" +@@ -52,7 +65,9 @@ + #include "en_accel/tls.h" + #include "accel/ipsec.h" + #include "accel/tls.h" ++#if defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) + #include "lib/vxlan.h" ++#endif + #include "lib/clock.h" + #include "en/port.h" + #include "en/xdp.h" +@@ -71,6 +86,7 @@ + #include "qos.h" + #include "en/trap.h" + #include "fpga/ipsec.h" ++#include "compat.h" + + bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) + { +@@ -95,16 +111,21 @@ void mlx5e_update_carrier(struct mlx5e_p + { + struct mlx5_core_dev *mdev = priv->mdev; + u8 port_state; ++#ifdef HAVE_NETIF_CARRIER_EVENT + bool up; ++#endif + + port_state = mlx5_query_vport_state(mdev, + MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, + 0); +- ++#ifdef HAVE_NETIF_CARRIER_EVENT + up = port_state == VPORT_STATE_UP; + if (up == netif_carrier_ok(priv->netdev)) + netif_carrier_event(priv->netdev); + if (up) { ++#else ++ if (port_state == VPORT_STATE_UP) { ++#endif + netdev_info(priv->netdev, "Link up\n"); + netif_carrier_on(priv->netdev); + } else { +@@ -270,6 +291,7 @@ static inline void mlx5e_build_umr_wqe(s + ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + } + ++#ifdef HAVE_SHAMPO_SUPPORT + static int mlx5e_rq_shampo_hd_alloc(struct mlx5e_rq *rq, int node) + { + rq->mpwqe.shampo = kvzalloc_node(sizeof(*rq->mpwqe.shampo), +@@ -288,8 +310,12 @@ static int mlx5e_rq_shampo_hd_info_alloc + { + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + ++#ifdef HAVE_BITMAP_ZALLOC_NODE + shampo->bitmap = bitmap_zalloc_node(shampo->hd_per_wq, GFP_KERNEL, + node); ++#else ++ shampo->bitmap = bitmap_zalloc(shampo->hd_per_wq, GFP_KERNEL); ++#endif + if (!shampo->bitmap) + return -ENOMEM; + +@@ -308,6 +334,7 @@ static void mlx5e_rq_shampo_hd_info_free + kvfree(rq->mpwqe.shampo->bitmap); + kvfree(rq->mpwqe.shampo->info); + } ++#endif + + static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node) + { +@@ -374,6 +401,7 @@ static int mlx5e_create_umr_mtt_mkey(str + return err; + } + ++#ifdef HAVE_SHAMPO_SUPPORT + static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev, + u64 nentries, + u32 *umr_mkey) +@@ 
-406,6 +434,7 @@ static int mlx5e_create_umr_klm_mkey(str + kvfree(in); + return err; + } ++#endif + + static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq) + { +@@ -415,6 +444,7 @@ static int mlx5e_create_rq_umr_mkey(stru + &rq->umr_mkey, rq->wqe_overflow.addr); + } + ++#ifdef HAVE_SHAMPO_SUPPORT + static int mlx5e_create_rq_hd_umr_mkey(struct mlx5_core_dev *mdev, + struct mlx5e_rq *rq) + { +@@ -428,6 +458,7 @@ static int mlx5e_create_rq_hd_umr_mkey(s + return mlx5e_create_umr_klm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq, + &rq->mpwqe.shampo->mkey); + } ++#endif + + static u64 mlx5e_get_mpwqe_offset(u16 wqe_ix) + { +@@ -531,7 +562,9 @@ static int mlx5e_init_rxq_rq(struct mlx5 + rq->channel = c; + rq->mdev = mdev; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); ++#ifdef HAVE_XDP_SUPPORT + rq->xdpsq = &c->rq_xdpsq; ++#endif + rq->stats = &c->priv->channel_stats[c->ix]->rq; + rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); + if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS && +@@ -543,9 +576,19 @@ static int mlx5e_init_rxq_rq(struct mlx5 + if (err) + return err; + +- return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, 0); ++#ifdef HAVE_XDP_SUPPORT ++#ifdef HAVE_XDP_RXQ_INFO ++#ifdef HAVE_XDP_RXQ_INFO_REG_4_PARAMS ++ err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, 0); ++#else ++ err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix); ++#endif ++#endif ++#endif /* HAVE_XDP_SUPPORT */ ++ return err; + } + ++#ifdef HAVE_SHAMPO_SUPPORT + static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rqp, +@@ -604,6 +647,7 @@ static void mlx5e_rq_free_shampo(struct + mlx5_core_destroy_mkey(rq->mdev, rq->mpwqe.shampo->mkey); + mlx5e_rq_shampo_hd_free(rq); + } ++#endif + + static void mlx5e_rx_cache_reduce_clean_pending(struct mlx5e_rq *rq) + { +@@ -697,11 +741,20 @@ static int mlx5e_alloc_rq(struct mlx5e_p + struct mlx5e_rq_param *rqp, + int node, struct mlx5e_rq *rq) + { ++#ifdef HAVE_NET_PAGE_POOL_H + struct page_pool_params pp_params = { 0 }; ++#endif + struct mlx5_core_dev *mdev = rq->mdev; + void *rqc = rqp->rqc; + void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifndef HAVE_XSK_BUFF_ALLOC ++ u32 num_xsk_frames = 0; ++#endif ++#endif ++#ifdef HAVE_NET_PAGE_POOL_H + u32 pool_size; ++#endif + u32 cache_init_sz; + int wq_sz; + int err; +@@ -709,14 +762,36 @@ static int mlx5e_alloc_rq(struct mlx5e_p + + rqp->wq.db_numa_node = node; + INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work); +- ++#ifdef HAVE_XDP_SUPPORT + if (params->xdp_prog) ++#ifndef HAVE_BPF_PROG_ADD_RET_STRUCT + bpf_prog_inc(params->xdp_prog); ++#else ++ { ++ struct bpf_prog *prog = bpf_prog_inc(params->xdp_prog); ++ if (IS_ERR(prog)) { ++ err = PTR_ERR(prog); ++ goto err_rq_xdp_prog; ++ } ++ } ++#endif /* HAVE_BPF_PROG_ADD_RET_STRUCT */ + RCU_INIT_POINTER(rq->xdp_prog, params->xdp_prog); + + rq->buff.map_dir = params->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; ++#else ++ rq->buff.map_dir = DMA_FROM_DEVICE; ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk); ++#ifndef HAVE_XSK_BUFF_ALLOC ++ rq->buff.umem_headroom = xsk ? 
xsk->headroom : 0; ++#endif ++#else ++ rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, NULL); ++#endif ++#ifdef HAVE_NET_PAGE_POOL_H + pool_size = 1 << params->log_rq_mtu_frames; ++#endif + + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: +@@ -733,10 +808,19 @@ static int mlx5e_alloc_rq(struct mlx5e_p + + wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq); + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifndef HAVE_XSK_BUFF_ALLOC ++ if (xsk) ++ num_xsk_frames = wq_sz << ++ mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk); ++#endif ++#endif + cache_init_sz = wq_sz * MLX5_MPWRQ_PAGES_PER_WQE; + ++#ifdef HAVE_NET_PAGE_POOL_H + pool_size = MLX5_MPWRQ_PAGES_PER_WQE << + mlx5e_mpwqe_get_log_rq_size(params, xsk); ++#endif + + rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk); + rq->mpwqe.num_strides = +@@ -753,9 +837,11 @@ static int mlx5e_alloc_rq(struct mlx5e_p + if (err) + goto err_rq_mkey; + ++#ifdef HAVE_SHAMPO_SUPPORT + err = mlx5_rq_shampo_alloc(mdev, params, rqp, rq, &pool_size, node); + if (err) + goto err_free_by_rq_type; ++#endif + + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ +@@ -768,6 +854,13 @@ static int mlx5e_alloc_rq(struct mlx5e_p + + wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq); + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifndef HAVE_XSK_BUFF_ALLOC ++ if (xsk) ++ num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags; ++#endif ++#endif ++ + cache_init_sz = wq_sz; + rq->wqe.info = rqp->frags_info; + rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; +@@ -788,16 +881,38 @@ static int mlx5e_alloc_rq(struct mlx5e_p + rq->mkey_be = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey); + } + ++ err = 0; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xsk) { ++#ifdef HAVE_XSK_BUFF_ALLOC + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, NULL); ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + xsk_pool_set_rxq_info(rq->xsk_pool, &rq->xdp_rxq); ++#else ++ xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq); ++#endif ++#else ++ err = mlx5e_xsk_resize_reuseq(rq->umem, num_xsk_frames); ++ if (unlikely(err)) { ++ mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n", ++ num_xsk_frames); ++ goto err_free_by_rq_type; ++ } ++ rq->zca.free = mlx5e_xsk_zca_free; ++ err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, ++ MEM_TYPE_ZERO_COPY, ++ &rq->zca); ++ ++#endif /* HAVE_XSK_BUFF_ALLOC */ + } else { ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ + err = mlx5e_rx_alloc_page_cache(rq, node, + ilog2(cache_init_sz)); + if (err) + goto err_free_by_rq_type; + ++#ifdef HAVE_NET_PAGE_POOL_H + /* Create a page_pool and register it with rxq */ + pp_params.order = 0; + pp_params.flags = 0; /* No-internal DMA mapping in page_pool */ +@@ -817,10 +932,21 @@ static int mlx5e_alloc_rq(struct mlx5e_p + rq->page_pool = NULL; + goto err_free_shampo; + } ++#endif /* HAVE_NET_PAGE_POOL_H */ ++#ifdef HAVE_XDP_SUPPORT ++#ifdef HAVE_XDP_RXQ_INFO_REG_MEM_MODEL + if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, ++#ifdef HAVE_NET_PAGE_POOL_H + MEM_TYPE_PAGE_POOL, rq->page_pool); ++#else ++ MEM_TYPE_PAGE_ORDER0, NULL); ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + } ++#endif ++#endif /* HAVE_XDP_RXQ_INFO_REG_MEM_MODEL */ ++#endif /* HAVE_XDP_SUPPORT */ + if (err) + goto err_free_shampo; + +@@ -872,7 +998,9 @@ static int mlx5e_alloc_rq(struct mlx5e_p + return 0; + + err_free_shampo: ++#ifdef HAVE_SHAMPO_SUPPORT + mlx5e_rq_free_shampo(rq); ++#endif + err_free_by_rq_type: + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: +@@ 
-890,22 +1018,29 @@ err_rq_frags: + err_rq_wq_destroy: + mlx5_wq_destroy(&rq->wq_ctrl); + err_rq_xdp_prog: ++#ifdef HAVE_XDP_SUPPORT + if (params->xdp_prog) + bpf_prog_put(params->xdp_prog); +- ++#endif + return err; + } + + static void mlx5e_free_rq(struct mlx5e_rq *rq) + { ++#ifdef HAVE_XDP_SUPPORT + struct bpf_prog *old_prog; + ++#ifdef HAVE_XDP_RXQ_INFO + if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) { ++#endif + old_prog = rcu_dereference_protected(rq->xdp_prog, + lockdep_is_held(&rq->priv->state_lock)); + if (old_prog) + bpf_prog_put(old_prog); ++#ifdef HAVE_XDP_RXQ_INFO + } ++#endif ++#endif /* HAVE_XDP_SUPPORT */ + + if (rq->page_cache.page_cache) + mlx5e_rx_free_page_cache(rq); +@@ -915,15 +1050,23 @@ static void mlx5e_free_rq(struct mlx5e_r + kvfree(rq->mpwqe.info); + mlx5_core_destroy_mkey(rq->mdev, rq->umr_mkey); + mlx5e_free_mpwqe_rq_drop_page(rq); ++#ifdef HAVE_SHAMPO_SUPPORT + mlx5e_rq_free_shampo(rq); ++#endif + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ + kvfree(rq->wqe.frags); + mlx5e_free_di_list(rq); + } +- ++#ifdef HAVE_XDP_SUPPORT ++#ifdef HAVE_XDP_RXQ_INFO + xdp_rxq_info_unreg(&rq->xdp_rxq); +- page_pool_destroy(rq->page_pool); ++#endif ++#endif ++#ifdef HAVE_NET_PAGE_POOL_H ++ if (rq->page_pool) ++ page_pool_destroy(rq->page_pool); ++#endif + mlx5_wq_destroy(&rq->wq_ctrl); + } + +@@ -983,11 +1126,13 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); + ++#ifdef HAVE_SHAMPO_SUPPORT + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { + MLX5_SET(wq, wq, log_headers_buffer_entry_num, + order_base_2(rq->mpwqe.shampo->hd_per_wq)); + MLX5_SET(wq, wq, headers_mkey, rq->mpwqe.shampo->mkey); + } ++#endif + + mlx5_fill_page_frag_array(&rq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(wq, wq, pas)); +@@ -1129,6 +1274,7 @@ void mlx5e_free_rx_in_progress_descs(str + head = mlx5_wq_ll_get_wqe_next_ix(wq, head); + } + ++#ifdef HAVE_SHAMPO_SUPPORT + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { + u16 len; + +@@ -1137,6 +1283,7 @@ void mlx5e_free_rx_in_progress_descs(str + mlx5e_shampo_dealloc_hd(rq, len, rq->mpwqe.shampo->ci, false); + rq->mpwqe.shampo->pi = rq->mpwqe.shampo->ci; + } ++#endif + + rq->mpwqe.actual_wq_head = wq->head; + rq->mpwqe.umr_in_progress = 0; +@@ -1164,9 +1311,11 @@ void mlx5e_free_rx_descs(struct mlx5e_rq + &wqe->next.next_wqe_index); + } + ++#ifdef HAVE_SHAMPO_SUPPORT + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) + mlx5e_shampo_dealloc_hd(rq, rq->mpwqe.shampo->hd_per_wq, + 0, true); ++#endif + } else { + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + +@@ -1179,6 +1328,59 @@ void mlx5e_free_rx_descs(struct mlx5e_rq + + } + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++static int get_skb_hdr(struct sk_buff *skb, void **iphdr, ++ void **tcph, u64 *hdr_flags, void *priv) ++{ ++ unsigned int ip_len; ++ struct iphdr *iph; ++ ++ if (unlikely(skb->protocol != htons(ETH_P_IP))) ++ return -1; ++ ++ /* ++ * In the future we may add an else clause that verifies the ++ * checksum and allows devices which do not calculate checksum ++ * to use LRO. 
++ */ ++ if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) ++ return -1; ++ ++ /* Check for non-TCP packet */ ++ skb_reset_network_header(skb); ++ iph = ip_hdr(skb); ++ if (iph->protocol != IPPROTO_TCP) ++ return -1; ++ ++ ip_len = ip_hdrlen(skb); ++ skb_set_transport_header(skb, ip_len); ++ *tcph = tcp_hdr(skb); ++ ++ /* check if IP header and TCP header are complete */ ++ if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) ++ return -1; ++ ++ *hdr_flags = LRO_IPV4 | LRO_TCP; ++ *iphdr = iph; ++ ++ return 0; ++} ++ ++static void mlx5e_rq_sw_lro_init(struct mlx5e_rq *rq) ++{ ++ rq->sw_lro = &rq->priv->sw_lro[rq->ix]; ++ rq->sw_lro->lro_mgr.max_aggr = 64; ++ rq->sw_lro->lro_mgr.max_desc = MLX5E_LRO_MAX_DESC; ++ rq->sw_lro->lro_mgr.lro_arr = rq->sw_lro->lro_desc; ++ rq->sw_lro->lro_mgr.get_skb_header = get_skb_hdr; ++ rq->sw_lro->lro_mgr.features = LRO_F_NAPI; ++ rq->sw_lro->lro_mgr.frag_align_pad = NET_IP_ALIGN; ++ rq->sw_lro->lro_mgr.dev = rq->netdev; ++ rq->sw_lro->lro_mgr.ip_summed = CHECKSUM_UNNECESSARY; ++ rq->sw_lro->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; ++} ++#endif ++ + int mlx5e_open_rq(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_rq_param *param, struct mlx5e_xsk_param *xsk, + struct mlx5e_create_cq_param *ccp, struct dim_cq_moder moder, +@@ -1215,12 +1417,19 @@ int mlx5e_open_rq(struct mlx5e_priv *pri + mlx5_core_warn(mdev, "Failed to enable delay drop err=%d\n", + err); + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ mlx5e_rq_sw_lro_init(rq); ++#endif ++ + err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) + goto err_destroy_rq; + ++#if defined(HAVE_UAPI_LINUX_TLS_H) && defined(CONFIG_MLX5_EN_TLS) && \ ++ defined(HAVE_KTLS_RX_SUPPORT) + if (mlx5e_is_tls_on(rq->priv) && !mlx5e_accel_is_ktls_device(mdev)) + __set_bit(MLX5E_RQ_STATE_FPGA_TLS, &rq->state); /* must be FPGA */ ++#endif + + if (MLX5_CAP_ETH(mdev, cqe_checksum_full)) + __set_bit(MLX5E_RQ_STATE_CSUM_FULL, &rq->state); +@@ -1232,7 +1441,11 @@ int mlx5e_open_rq(struct mlx5e_priv *pri + * XDP programs might manipulate packets which will render + * skb->checksum incorrect. 
+ */ ++#ifdef HAVE_XDP_SUPPORT + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE) || params->xdp_prog) ++#else ++ if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE)) ++#endif + __set_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &rq->state); + + /* For CQE compression on striding RQ, use stride index provided by +@@ -1242,6 +1455,9 @@ int mlx5e_open_rq(struct mlx5e_priv *pri + MLX5_CAP_GEN(mdev, mini_cqe_resp_stride_index)) + __set_bit(MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, &rq->state); + ++ if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_SKB_XMIT_MORE)) ++ __set_bit(MLX5E_RQ_STATE_SKB_XMIT_MORE, &rq->state); ++ + return 0; + + err_destroy_rq: +@@ -1277,6 +1493,7 @@ void mlx5e_close_rq(struct mlx5e_priv *p + memset(rq, 0, sizeof(*rq)); + } + ++#ifdef HAVE_XDP_SUPPORT + static void mlx5e_free_xdpsq_db(struct mlx5e_xdpsq *sq) + { + kvfree(sq->db.xdpi_fifo.xi); +@@ -1324,7 +1541,13 @@ static int mlx5e_alloc_xdpsq_db(struct m + + static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, + struct mlx5e_params *params, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *xsk_pool, ++#else ++ struct xdp_umem *xsk_pool, ++#endif ++#endif + struct mlx5e_sq_param *param, + struct mlx5e_xdpsq *sq, + bool is_redirect) +@@ -1340,11 +1563,19 @@ static int mlx5e_alloc_xdpsq(struct mlx5 + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->min_inline_mode = params->tx_min_inline_mode; + sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + sq->xsk_pool = xsk_pool; +- + sq->stats = sq->xsk_pool ? ++#else ++ sq->umem = xsk_pool; ++ sq->stats = sq->umem ? ++#endif + &c->priv->channel_stats[c->ix]->xsksq : + is_redirect ? ++#else ++ sq->stats = is_redirect ? ++#endif + &c->priv->channel_stats[c->ix]->xdpsq : + &c->priv->channel_stats[c->ix]->rq_xdpsq; + sq->stop_room = param->is_mpw ? 
mlx5e_stop_room_for_mpwqe(mdev) : +@@ -1374,6 +1605,7 @@ static void mlx5e_free_xdpsq(struct mlx5 + mlx5e_free_xdpsq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); + } ++#endif /* HAVE_XDP_SUPPORT */ + + static void mlx5e_free_icosq_db(struct mlx5e_icosq *sq) + { +@@ -1701,6 +1933,9 @@ int mlx5e_open_txqsq(struct mlx5e_channe + if (params->tx_dim_enabled) + sq->state |= BIT(MLX5E_SQ_STATE_AM); + ++ if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_SKB_XMIT_MORE)) ++ set_bit(MLX5E_SQ_STATE_SKB_XMIT_MORE, &sq->state); ++ + return 0; + + err_free_txqsq: +@@ -1798,6 +2033,7 @@ static int mlx5e_open_icosq(struct mlx5e + if (err) + goto err_free_icosq; + ++#if defined(CONFIG_MLX5_EN_TLS) && defined(HAVE_KTLS_RX_SUPPORT) + if (param->is_tls) { + sq->ktls_resync = mlx5e_ktls_rx_resync_create_resp_list(); + if (IS_ERR(sq->ktls_resync)) { +@@ -1805,10 +2041,13 @@ static int mlx5e_open_icosq(struct mlx5e + goto err_destroy_icosq; + } + } ++#endif + return 0; + ++#if defined(CONFIG_MLX5_EN_TLS) && defined(HAVE_KTLS_RX_SUPPORT) + err_destroy_icosq: + mlx5e_destroy_sq(c->mdev, sq->sqn); ++#endif + err_free_icosq: + mlx5e_free_icosq(sq); + +@@ -1836,14 +2075,26 @@ static void mlx5e_close_icosq(struct mlx + mlx5e_free_icosq(sq); + } + ++#ifdef HAVE_XDP_SUPPORT + int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, +- struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool, ++ struct mlx5e_sq_param *param, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL ++ struct xsk_buff_pool *xsk_pool, ++#else ++ struct xdp_umem *xsk_pool, ++#endif ++#endif + struct mlx5e_xdpsq *sq, bool is_redirect) + { + struct mlx5e_create_sq_param csp = {}; + int err; + +- err = mlx5e_alloc_xdpsq(c, params, xsk_pool, param, sq, is_redirect); ++ err = mlx5e_alloc_xdpsq(c, params, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ xsk_pool, ++#endif ++ param, sq, is_redirect); + if (err) + return err; + +@@ -1860,6 +2111,9 @@ int mlx5e_open_xdpsq(struct mlx5e_channe + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_TX_XDP_CSUM)) + set_bit(MLX5E_SQ_STATE_TX_XDP_CSUM, &sq->state); + ++ if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_SKB_XMIT_MORE)) ++ set_bit(MLX5E_SQ_STATE_SKB_XMIT_MORE, &sq->state); ++ + if (!param->is_mpw) { + unsigned int ds_cnt = MLX5E_XDP_TX_DS_COUNT; + unsigned int inline_hdr_sz = 0; +@@ -1916,6 +2170,7 @@ void mlx5e_close_xdpsq(struct mlx5e_xdps + mlx5e_free_xdpsq_descs(sq); + mlx5e_free_xdpsq(sq); + } ++#endif + + int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, +@@ -1968,6 +2223,9 @@ static int mlx5e_alloc_cq(struct mlx5e_p + + cq->napi = ccp->napi; + cq->ch_stats = ccp->ch_stats; ++#ifndef HAVE_NAPI_STATE_MISSED ++ cq->ch_flags = ccp->ch_flags; ++#endif + + return err; + } +@@ -2094,6 +2352,7 @@ static void mlx5e_close_tx_cqs(struct ml + mlx5e_close_cq(&c->sq[tc].cq); + } + ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + static int mlx5e_mqprio_txq_to_tc(struct netdev_tc_txq *tc_to_txq, unsigned int txq) + { + int tc; +@@ -2129,7 +2388,7 @@ static int mlx5e_txq_get_qos_node_hw_id( + *hw_id = params->mqprio.channel.hw_id[tc]; + return 0; + } +- ++#endif + static int mlx5e_open_sqs(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam) +@@ -2138,15 +2397,21 @@ static int mlx5e_open_sqs(struct mlx5e_c + + for (tc = 0; tc < mlx5e_get_dcb_num_tc(params); tc++) { + int txq_ix = c->ix + tc * params->num_channels; ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + u32 qos_queue_group_id; + + err = mlx5e_txq_get_qos_node_hw_id(params, txq_ix, 
&qos_queue_group_id); + if (err) + goto err_close_sqs; ++#endif + + err = mlx5e_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix, + params, &cparam->txq_sq, &c->sq[tc], tc, ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + qos_queue_group_id, ++#else ++ 0, ++#endif + &c->priv->channel_stats[c->ix]->sq[tc]); + if (err) + goto err_close_sqs; +@@ -2219,6 +2484,7 @@ static int mlx5e_set_sq_maxrate(struct n + return 0; + } + ++#if defined(HAVE_NDO_SET_TX_MAXRATE) || defined(HAVE_NDO_SET_TX_MAXRATE_EXTENDED) + static int mlx5e_set_tx_maxrate(struct net_device *dev, int index, u32 rate) + { + struct mlx5e_priv *priv = netdev_priv(dev); +@@ -2249,6 +2515,7 @@ static int mlx5e_set_tx_maxrate(struct n + + return err; + } ++#endif + + static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_create_cq_param *ccp, struct mlx5e_rq_param *rq_params) +@@ -2271,12 +2538,10 @@ static int mlx5e_open_queues(struct mlx5 + int err; + + mlx5e_build_create_cq_param(&ccp, c); +- + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, + &c->async_icosq.cq); + if (err) + return err; +- + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, + &c->icosq.cq); + if (err) +@@ -2286,6 +2551,7 @@ static int mlx5e_open_queues(struct mlx5 + if (err) + goto err_close_icosq_cq; + ++#ifdef HAVE_XDP_SUPPORT + err = mlx5e_open_cq(c->priv, params->tx_cq_moderation, &cparam->xdp_sq.cqp, &ccp, + &c->xdpsq.cq); + if (err) +@@ -2295,6 +2561,7 @@ static int mlx5e_open_queues(struct mlx5 + &ccp, &c->rq_xdpsq.cq) : 0; + if (err) + goto err_close_xdp_tx_cqs; ++#endif + + spin_lock_init(&c->async_icosq_lock); + +@@ -2318,25 +2585,36 @@ static int mlx5e_open_queues(struct mlx5 + if (err) + goto err_close_sqs; + ++#ifdef HAVE_XDP_SUPPORT + if (c->xdp) { +- err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL, ++ err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ NULL, ++#endif + &c->rq_xdpsq, false); + if (err) + goto err_close_rq; + } + +- err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL, &c->xdpsq, true); ++ err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ NULL, ++#endif ++ &c->xdpsq, true); + if (err) + goto err_close_xdp_sq; ++#endif + + return 0; + ++#ifdef HAVE_XDP_SUPPORT + err_close_xdp_sq: + if (c->xdp) + mlx5e_close_xdpsq(&c->rq_xdpsq); + + err_close_rq: + mlx5e_close_rq(c->priv, &c->rq); ++#endif + + err_close_sqs: + mlx5e_close_sqs(c); +@@ -2348,6 +2626,7 @@ err_close_async_icosq: + mlx5e_close_icosq(&c->async_icosq); + + err_close_xdpsq_cq: ++#ifdef HAVE_XDP_SUPPORT + if (c->xdp) + mlx5e_close_cq(&c->rq_xdpsq.cq); + +@@ -2355,6 +2634,7 @@ err_close_xdp_tx_cqs: + mlx5e_close_cq(&c->xdpsq.cq); + + err_close_tx_cqs: ++#endif + mlx5e_close_tx_cqs(c); + + err_close_icosq_cq: +@@ -2368,9 +2648,11 @@ err_close_async_icosq_cq: + + static void mlx5e_close_queues(struct mlx5e_channel *c) + { ++#ifdef HAVE_XDP_SUPPORT + mlx5e_close_xdpsq(&c->xdpsq); + if (c->xdp) + mlx5e_close_xdpsq(&c->rq_xdpsq); ++#endif + /* The same ICOSQ is used for UMRs for both RQ and XSKRQ. 
*/ + cancel_work_sync(&c->icosq.recover_work); + mlx5e_close_rq(c->priv, &c->rq); +@@ -2378,9 +2660,11 @@ static void mlx5e_close_queues(struct ml + mlx5e_close_icosq(&c->icosq); + mutex_destroy(&c->icosq_recovery_lock); + mlx5e_close_icosq(&c->async_icosq); ++#ifdef HAVE_XDP_SUPPORT + if (c->xdp) + mlx5e_close_cq(&c->rq_xdpsq.cq); + mlx5e_close_cq(&c->xdpsq.cq); ++#endif + mlx5e_close_tx_cqs(c); + mlx5e_close_cq(&c->icosq.cq); + mlx5e_close_cq(&c->async_icosq.cq); +@@ -2434,11 +2718,19 @@ void mlx5e_trigger_napi_sched(struct nap + static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *xsk_pool, ++#else ++ struct xdp_umem *xsk_pool, ++#endif ++#endif + struct mlx5e_channel **cp) + { + struct net_device *netdev = priv->netdev; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_xsk_param xsk; ++#endif + const struct cpumask *aff; + struct mlx5e_channel *c; + unsigned int irq; +@@ -2449,7 +2741,17 @@ static int mlx5e_open_channel(struct mlx + if (err) + return err; + ++#ifdef HAVE_IRQ_GET_EFFECTIVE_AFFINITY_MASK + aff = irq_get_effective_affinity_mask(irq); ++#elif defined(HAVE_IRQ_GET_AFFINITY_MASK) ++ aff = irq_get_affinity_mask(irq); ++#else ++#ifndef HAVE_IRQ_DATA_AFFINITY ++ aff = irq_data_get_affinity_mask(irq_desc_get_irq_data(irq_to_desc(irq))); ++#else ++ aff = irq_desc_get_irq_data(irq_to_desc(irq))->affinity; ++#endif ++#endif + cpu = cpumask_first(aff); + + err = mlx5e_channel_stats_alloc(priv, ix, cpu); +@@ -2469,9 +2771,11 @@ static int mlx5e_open_channel(struct mlx + c->netdev = priv->netdev; + c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey); + c->num_tc = mlx5e_get_dcb_num_tc(params); ++#ifdef HAVE_XDP_SUPPORT + c->xdp = !!params->xdp_prog; ++#endif + c->stats = &priv->channel_stats[ix]->ch; +- c->aff_mask = irq_get_effective_affinity_mask(irq); ++ c->aff_mask = aff; + c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix); + + netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); +@@ -2480,19 +2784,23 @@ static int mlx5e_open_channel(struct mlx + if (unlikely(err)) + goto err_napi_del; + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (xsk_pool) { + mlx5e_build_xsk_param(xsk_pool, &xsk); + err = mlx5e_open_xsk(priv, params, &xsk, xsk_pool, c); + if (unlikely(err)) + goto err_close_queues; + } ++#endif + + *cp = c; + + return 0; + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + err_close_queues: + mlx5e_close_queues(c); ++#endif + + err_napi_del: + netif_napi_del(&c->napi); +@@ -2507,13 +2815,19 @@ static void mlx5e_rq_channel_activate(st + if (c->priv->shared_rq) + return; + ++#ifdef HAVE_XDP_SUPPORT + if (c->xdp) + mlx5e_activate_xdpsq(&c->rq_xdpsq); ++#endif + mlx5e_activate_rq(&c->rq); ++#ifdef HAVE_XDP_SUPPORT + mlx5e_activate_xdpsq(&c->xdpsq); ++#endif + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + mlx5e_activate_xsk(c); ++#endif + + mlx5e_trigger_napi_icosq(c); + } +@@ -2545,13 +2859,19 @@ static void mlx5e_rq_channel_deactivate( + if (c->priv->shared_rq) + return; + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + mlx5e_deactivate_xsk(c); ++#endif + ++#ifdef HAVE_XDP_SUPPORT + mlx5e_deactivate_xdpsq(&c->xdpsq); ++#endif + mlx5e_deactivate_rq(&c->rq); ++#ifdef HAVE_XDP_SUPPORT + if (c->xdp) + mlx5e_deactivate_xdpsq(&c->rq_xdpsq); ++#endif + } + + static void mlx5e_disable_channel(struct mlx5e_channel *c) +@@ -2579,8 
+2899,10 @@ static void mlx5e_stop_channel(struct ml + + static void mlx5e_close_channel(struct mlx5e_channel *c) + { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + mlx5e_close_xsk(c); ++#endif + mlx5e_close_queues(c); + mlx5e_qos_close_queues(c); + netif_napi_del(&c->napi); +@@ -2607,12 +2929,20 @@ int mlx5e_open_channels(struct mlx5e_pri + goto err_free; + + for (i = 0; i < chs->num; i++) { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *xsk_pool = NULL; +- ++#else ++ struct xdp_umem *xsk_pool = NULL; ++#endif + if (chs->params.xdp_prog) + xsk_pool = mlx5e_xsk_get_pool(&chs->params, chs->params.xsk, i); +- +- err = mlx5e_open_channel(priv, i, &chs->params, cparam, xsk_pool, &chs->c[i]); ++#endif ++ err = mlx5e_open_channel(priv, i, &chs->params, cparam, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ xsk_pool, ++#endif ++ &chs->c[i]); + if (err) + goto err_close_channels; + } +@@ -2723,7 +3053,7 @@ int mlx5e_modify_tirs_packet_merge(struc + return mlx5e_rx_res_packet_merge_set_param(res, &priv->channels.params.packet_merge); + } + +-static MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_modify_tirs_packet_merge); ++MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_modify_tirs_packet_merge); + + static int mlx5e_set_mtu(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, u16 mtu) +@@ -2778,10 +3108,14 @@ MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx + + void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv) + { ++#if defined(HAVE_NET_DEVICE_MIN_MAX_MTU) || defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) + struct mlx5e_params *params = &priv->channels.params; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + u16 max_mtu; ++#endif ++ ++#ifdef HAVE_NET_DEVICE_MIN_MAX_MTU + + /* MTU range: 68 - hw-specific max */ + netdev->min_mtu = ETH_MIN_MTU; +@@ -2789,6 +3123,12 @@ void mlx5e_set_netdev_mtu_boundaries(str + mlx5_query_port_max_mtu(mdev, &max_mtu, 1); + netdev->max_mtu = min_t(unsigned int, MLX5E_HW2SW_MTU(params, max_mtu), + ETH_MAX_MTU); ++#elif defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ netdev->extended->min_mtu = ETH_MIN_MTU; ++ mlx5_query_port_max_mtu(mdev, &max_mtu, 1); ++ netdev->extended->max_mtu = min_t(unsigned int, MLX5E_HW2SW_MTU(params, max_mtu), ++ ETH_MAX_MTU); ++#endif + } + + static int mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc, +@@ -2821,6 +3161,10 @@ static int mlx5e_netdev_set_tcs(struct n + int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) + { + int qos_queues, nch, ntc, num_txqs, err; ++#ifndef HAVE_NET_SYNCHRONIZE_IN_SET_REAL_NUM_TX_QUEUES ++ struct net_device *netdev = priv->netdev; ++ bool disabling; ++#endif + + qos_queues = mlx5e_qos_cur_leaf_nodes(priv); + +@@ -2829,6 +3173,9 @@ int mlx5e_update_tx_netdev_queues(struct + num_txqs = nch * ntc + qos_queues; + if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) + num_txqs += ntc; ++#ifndef HAVE_NET_SYNCHRONIZE_IN_SET_REAL_NUM_TX_QUEUES ++ disabling = num_txqs < netdev->real_num_tx_queues; ++#endif + + mlx5e_dbg(DRV, priv, "Setting num_txqs %d\n", num_txqs); + err = netif_set_real_num_tx_queues(priv->netdev, num_txqs); +@@ -2836,6 +3183,11 @@ int mlx5e_update_tx_netdev_queues(struct + netdev_warn(priv->netdev, "netif_set_real_num_tx_queues failed (%d > %d), %d\n", + num_txqs, priv->netdev->num_tx_queues, err); + ++#ifndef HAVE_NET_SYNCHRONIZE_IN_SET_REAL_NUM_TX_QUEUES ++ if (disabling) ++ synchronize_net(); ++#endif ++ + return err; + } + +@@ 
-2887,8 +3239,11 @@ err_out: + return err; + } + ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + static MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_update_netdev_queues); +- ++#endif ++#endif + static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv, + struct mlx5e_params *params) + { +@@ -2922,7 +3277,11 @@ static int mlx5e_num_channels_changed(st + mlx5e_set_default_xps_cpumasks(priv, &priv->channels.params); + + /* This function may be called on attach, before priv->rx_res is created. */ ++#ifdef HAVE_NETIF_IS_RXFH_CONFIGURED + if (!netif_is_rxfh_configured(priv->netdev) && priv->rx_res) ++#else ++ if (priv->rx_res) ++#endif + mlx5e_rx_res_rss_set_indir_uniform(priv->rx_res, count); + + return 0; +@@ -3003,8 +3362,9 @@ void mlx5e_activate_priv_channels(struct + mlx5e_build_txq_maps(priv); + mlx5e_activate_channels(&priv->channels); + mlx5e_qos_activate_queues(priv); ++#ifdef HAVE_XDP_SUPPORT + mlx5e_xdp_tx_enable(priv); +- ++#endif + /* dev_watchdog() wants all TX queues to be started when the carrier is + * OK, including the ones in range real_num_tx_queues..num_tx_queues-1. + * Make it happy to avoid TX timeout false alarms. +@@ -3040,7 +3400,9 @@ void mlx5e_deactivate_priv_channels(stru + */ + netif_tx_disable(priv->netdev); + ++#ifdef HAVE_XDP_SUPPORT + mlx5e_xdp_tx_disable(priv); ++#endif + mlx5e_deactivate_channels(&priv->channels); + } + +@@ -3285,8 +3647,11 @@ static int mlx5e_alloc_drop_rq(struct ml + return err; + + /* Mark as unused given "Drop-RQ" packets never reach XDP */ ++#ifdef HAVE_XDP_SUPPORT ++#ifdef HAVE_XDP_RXQ_INFO + xdp_rxq_info_unused(&rq->xdp_rxq); +- ++#endif ++#endif + rq->mdev = mdev; + + return 0; +@@ -3450,7 +3815,10 @@ static int mlx5e_modify_channels_scatter + return 0; + } + +-static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) ++#ifndef LEGACY_ETHTOOL_OPS ++static ++#endif ++int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) + { + int err; + int i; +@@ -3484,6 +3852,23 @@ static void mlx5e_mqprio_build_default_t + } + } + ++static void mlx5e_params_mqprio_dcb_set(struct mlx5e_params *params, u8 num_tc) ++{ ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD ++ params->mqprio.mode = TC_MQPRIO_MODE_DCB; ++#endif ++ params->mqprio.num_tc = num_tc; ++ mlx5e_mqprio_build_default_tc_to_txq(params->mqprio.tc_to_txq, num_tc, ++ params->num_channels); ++} ++ ++static void mlx5e_params_mqprio_reset(struct mlx5e_params *params) ++{ ++ mlx5e_params_mqprio_dcb_set(params, 1); ++} ++ ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + static void mlx5e_mqprio_build_tc_to_txq(struct netdev_tc_txq *tc_to_txq, + struct tc_mqprio_qopt *qopt) + { +@@ -3497,14 +3882,6 @@ static void mlx5e_mqprio_build_tc_to_txq + } + } + +-static void mlx5e_params_mqprio_dcb_set(struct mlx5e_params *params, u8 num_tc) +-{ +- params->mqprio.mode = TC_MQPRIO_MODE_DCB; +- params->mqprio.num_tc = num_tc; +- mlx5e_mqprio_build_default_tc_to_txq(params->mqprio.tc_to_txq, num_tc, +- params->num_channels); +-} +- + static void mlx5e_mqprio_rl_update_params(struct mlx5e_params *params, + struct mlx5e_mqprio_rl *rl) + { +@@ -3534,11 +3911,7 @@ static void mlx5e_params_mqprio_channel_ + mlx5e_mqprio_rl_update_params(params, rl); + mlx5e_mqprio_build_tc_to_txq(params->mqprio.tc_to_txq, &mqprio->qopt); + } +- +-static void mlx5e_params_mqprio_reset(struct mlx5e_params *params) +-{ +- 
mlx5e_params_mqprio_dcb_set(params, 1); +-} ++#endif + + static int mlx5e_setup_tc_mqprio_dcb(struct mlx5e_priv *priv, + struct tc_mqprio_qopt *mqprio) +@@ -3577,6 +3950,7 @@ static int mlx5e_setup_tc_mqprio_dcb(str + return err; + } + ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, + struct tc_mqprio_qopt_offload *mqprio) + { +@@ -3707,16 +4081,24 @@ static int mlx5e_setup_tc_mqprio_channel + + return 0; + } ++#endif ++#endif + ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) + int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, +- struct tc_mqprio_qopt_offload *mqprio) ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD ++ struct tc_mqprio_qopt_offload *mqprio ++#else ++ struct tc_mqprio_qopt *mqprio ++#endif ++) + { + /* MQPRIO is another toplevel qdisc that can't be attached + * simultaneously with the offloaded HTB. + */ + if (WARN_ON(priv->htb.maj_id)) + return -EINVAL; +- ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + switch (mqprio->mode) { + case TC_MQPRIO_MODE_DCB: + return mlx5e_setup_tc_mqprio_dcb(priv, &mqprio->qopt); +@@ -3725,8 +4107,55 @@ int mlx5e_setup_tc_mqprio(struct mlx5e_p + default: + return -EOPNOTSUPP; + } ++#else ++ return mlx5e_setup_tc_mqprio_dcb(priv, mqprio); ++#endif ++ ++} ++#else ++int mlx5e_setup_tc(struct net_device *netdev, u8 tc) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ struct mlx5e_params new_params; ++ int err = 0; ++ ++ if (tc && tc != MLX5E_MAX_NUM_TC ++#ifdef CONFIG_MLX5_CORE_EN_DCB ++ && priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_PCP ++#endif ++ ) ++ return -EINVAL; ++ ++ mutex_lock(&priv->state_lock); ++ ++ /* MQPRIO is another toplevel qdisc that can't be attached ++ * * simultaneously with the offloaded HTB. ++ * */ ++ if (WARN_ON(priv->htb.maj_id)) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ new_params = priv->channels.params; ++ mlx5e_params_mqprio_dcb_set(&new_params, tc ? 
tc : 1); ++ ++#ifdef CONFIG_MLX5_CORE_EN_DCB ++ if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_PCP) ++ priv->pcp_tc_num = tc; ++#endif ++ ++ err = mlx5e_safe_switch_params(priv, &new_params, ++ mlx5e_num_channels_changed_ctx, NULL, true); ++ ++out: ++ priv->max_opened_tc = max_t(u8, priv->max_opened_tc, ++ mlx5e_get_dcb_num_tc(&priv->channels.params)); ++ mutex_unlock(&priv->state_lock); ++ return err; + } ++#endif + ++#ifdef HAVE_ENUM_TC_HTB_COMMAND + static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb) + { + int res; +@@ -3748,7 +4177,12 @@ static int mlx5e_setup_tc_htb(struct mlx + return mlx5e_htb_leaf_to_inner(priv, htb->parent_classid, htb->classid, + htb->rate, htb->ceil, htb->extack); + case TC_HTB_LEAF_DEL: ++#ifndef HAVE_TC_HTB_COMMAND_HAS_MOVED_QID /* will be base code next rebase */ + return mlx5e_htb_leaf_del(priv, &htb->classid, htb->extack); ++#else ++ return mlx5e_htb_leaf_del(priv, htb->classid, &htb->moved_qid, &htb->qid, ++ htb->extack); ++#endif + case TC_HTB_LEAF_DEL_LAST: + case TC_HTB_LEAF_DEL_LAST_FORCE: + return mlx5e_htb_leaf_del_last(priv, htb->classid, +@@ -3767,47 +4201,138 @@ static int mlx5e_setup_tc_htb(struct mlx + return -EOPNOTSUPP; + } + } ++#endif + ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef HAVE_FLOW_CLS_OFFLOAD + static LIST_HEAD(mlx5e_block_cb_list); ++#endif + ++#ifdef HAVE_TC_SETUP_CB_EGDEV_REGISTER ++int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, ++ void *type_data) ++#else + static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + bool tc_unbind = false; + int err; + ++#if defined(HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + if (type == TC_SETUP_BLOCK && + ((struct flow_block_offload *)type_data)->command == FLOW_BLOCK_UNBIND) + tc_unbind = true; ++#endif + + if (!netif_device_present(dev) && !tc_unbind) + return -ENODEV; + + switch (type) { ++#ifdef CONFIG_MLX5_ESWITCH ++#if defined(HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) ++#ifdef HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE + case TC_SETUP_BLOCK: { ++#ifdef HAVE_UNLOCKED_DRIVER_CB + struct flow_block_offload *f = type_data; + + f->unlocked_driver_cb = true; ++#endif + return flow_block_cb_setup_simple(type_data, + &mlx5e_block_cb_list, + mlx5e_setup_tc_block_cb, + priv, priv, true); + } ++#else /* HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE */ ++ case TC_SETUP_BLOCK: ++ return mlx5e_setup_tc_block(dev, type_data); ++#endif /* HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE */ ++#else ++ case TC_SETUP_CLSFLOWER: ++#ifdef CONFIG_MLX5_CLS_ACT ++ return mlx5e_setup_tc_cls_flower(dev, type_data, MLX5_TC_FLAG(INGRESS)); ++#endif ++#endif /* HAVE_TC_BLOCK_OFFLOAD || HAVE_FLOW_BLOCK_OFFLOAD */ ++#endif /* CONFIG_MLX5_ESWITCH */ + case TC_SETUP_QDISC_MQPRIO: + mutex_lock(&priv->state_lock); + err = mlx5e_setup_tc_mqprio(priv, type_data); + mutex_unlock(&priv->state_lock); + return err; ++#ifdef HAVE_ENUM_TC_HTB_COMMAND + case TC_SETUP_QDISC_HTB: + mutex_lock(&priv->state_lock); + err = mlx5e_setup_tc_htb(priv, type_data); + mutex_unlock(&priv->state_lock); + return err; ++#endif ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++#else /* HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE || HAVE_NDO_SETUP_TC_RH_EXTENDED */ ++#if defined(HAVE_NDO_SETUP_TC_4_PARAMS) || defined(HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX) ++static int mlx5e_ndo_setup_tc(struct net_device *dev, u32 handle, ++#ifdef 
HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ u32 chain_index, __be16 proto, ++#else ++ __be16 proto, ++#endif ++ struct tc_to_netdev *tc) ++{ ++#ifdef HAVE_TC_FLOWER_OFFLOAD ++#ifdef CONFIG_MLX5_CLS_ACT ++ struct mlx5e_priv *priv = netdev_priv(dev); ++#endif /*CONFIG_MLX5_CLS_ACT*/ ++ ++ if (!netif_device_present(dev)) ++ return -EOPNOTSUPP; ++ ++ if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS)) ++ goto mqprio; ++ ++#ifdef HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ if (chain_index) ++ return -EOPNOTSUPP; ++#endif ++ ++ switch (tc->type) { ++#ifdef CONFIG_MLX5_CLS_ACT ++ case TC_SETUP_CLSFLOWER: ++ switch (tc->cls_flower->command) { ++ case TC_CLSFLOWER_REPLACE: ++ return mlx5e_configure_flower(priv->netdev, priv, tc->cls_flower, ++ MLX5_TC_FLAG(INGRESS)); ++ case TC_CLSFLOWER_DESTROY: ++ return mlx5e_delete_flower(priv->netdev, priv, tc->cls_flower, ++ MLX5_TC_FLAG(INGRESS)); ++#ifdef HAVE_TC_CLSFLOWER_STATS ++ case TC_CLSFLOWER_STATS: ++ return mlx5e_stats_flower(priv->netdev, priv, tc->cls_flower, ++ MLX5_TC_FLAG(INGRESS)); ++#endif ++ } ++#endif /*CONFIG_MLX5_CLS_ACT*/ + default: + return -EOPNOTSUPP; + } ++ ++mqprio: ++#endif /* HAVE_TC_FLOWER_OFFLOAD */ ++ if (tc->type != TC_SETUP_MQPRIO) ++ return -EINVAL; ++ ++#ifdef HAVE_TC_TO_NETDEV_TC ++ return mlx5e_setup_tc(dev, tc->tc); ++#else ++ tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; ++ ++ return mlx5e_setup_tc(dev, tc->mqprio->num_tc); ++#endif /* HAVE_TC_TO_NETDEV_TC */ + } ++#endif /* HAVE_NDO_SETUP_TC_4_PARAMS || HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX */ ++#endif /* HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE || HAVE_NDO_SETUP_TC_RH_EXTENDED */ + + void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s) + { +@@ -3815,13 +4340,27 @@ void mlx5e_fold_sw_stats64(struct mlx5e_ + + for (i = 0; i < priv->stats_nch; i++) { + struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i]; ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq; ++#endif + struct mlx5e_rq_stats *rq_stats = &channel_stats->rq; + int j; + +- s->rx_packets += rq_stats->packets + xskrq_stats->packets; +- s->rx_bytes += rq_stats->bytes + xskrq_stats->bytes; +- s->multicast += rq_stats->mcast_packets + xskrq_stats->mcast_packets; ++ s->rx_packets += rq_stats->packets ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ + xskrq_stats->packets ++#endif ++ ; ++ s->rx_bytes += rq_stats->bytes ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ + xskrq_stats->bytes ++#endif ++ ; ++ s->multicast += rq_stats->mcast_packets ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ + xskrq_stats->mcast_packets ++#endif ++ ; + + for (j = 0; j < priv->max_opened_tc; j++) { + struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j]; +@@ -3849,14 +4388,26 @@ void mlx5e_fold_sw_stats64(struct mlx5e_ + } + } + +-void +-mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID ++void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) ++#elif defined(HAVE_NDO_GET_STATS64) ++struct rtnl_link_stats64 * mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) ++#else ++struct net_device_stats * mlx5e_get_stats(struct net_device *dev) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_pport_stats *pstats = &priv->stats.pport; ++#if !defined(HAVE_NDO_GET_STATS64) && !defined(HAVE_NDO_GET_STATS64_RET_VOID) ++ struct net_device_stats *stats = &priv->netdev_stats; ++#endif + + if (!netif_device_present(dev)) ++#ifdef HAVE_NDO_GET_STATS64_RET_VOID + return; ++#else ++ return 
stats; ++#endif + + /* In switchdev mode, monitor counters doesn't monitor + * rx/tx stats of 802_3. The update stats mechanism +@@ -3897,6 +4448,10 @@ mlx5e_get_stats(struct net_device *dev, + stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors + + stats->rx_frame_errors; + stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors; ++ ++#ifndef HAVE_NDO_GET_STATS64_RET_VOID ++ return stats; ++#endif + } + + static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv) +@@ -3923,7 +4478,11 @@ static int mlx5e_set_mac(struct net_devi + return -EADDRNOTAVAIL; + + netif_addr_lock_bh(netdev); ++#ifdef HAVE_DEV_ADDR_MOD + eth_hw_addr_set(netdev, saddr->sa_data); ++#else ++ ether_addr_copy(netdev->dev_addr, saddr->sa_data); ++#endif + netif_addr_unlock_bh(netdev); + + mlx5e_nic_set_rx_mode(priv); +@@ -3952,29 +4511,44 @@ static int set_feature_lro(struct net_de + + mutex_lock(&priv->state_lock); + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (enable && priv->xsk.refcnt) { + netdev_warn(netdev, "LRO is incompatible with AF_XDP (%u XSKs are active)\n", + priv->xsk.refcnt); + err = -EINVAL; + goto out; + } ++#endif + + cur_params = &priv->channels.params; +- if (enable && !MLX5E_GET_PFLAG(cur_params, MLX5E_PFLAG_RX_STRIDING_RQ)) { +- netdev_warn(netdev, "can't set LRO with legacy RQ\n"); +- err = -EINVAL; +- goto out; +- } + + new_params = *cur_params; + ++ + if (enable) ++#if defined(CONFIG_COMPAT_LRO_ENABLED_IPOIB) ++ { ++ new_params.lro_en = true; ++ if (IS_HW_LRO(&new_params)) ++ new_params.packet_merge.type = MLX5E_PACKET_MERGE_LRO; ++ } ++ else if (new_params.packet_merge.type == MLX5E_PACKET_MERGE_LRO) { ++ new_params.lro_en = false; ++ new_params.packet_merge.type = MLX5E_PACKET_MERGE_NONE; ++ ++ } else { ++ new_params.lro_en = false; ++ goto out; ++ } ++#else + new_params.packet_merge.type = MLX5E_PACKET_MERGE_LRO; + else if (new_params.packet_merge.type == MLX5E_PACKET_MERGE_LRO) + new_params.packet_merge.type = MLX5E_PACKET_MERGE_NONE; + else + goto out; + ++#endif ++ + if (!(cur_params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO && + new_params.packet_merge.type == MLX5E_PACKET_MERGE_LRO)) { + if (cur_params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { +@@ -3984,6 +4558,13 @@ static int set_feature_lro(struct net_de + } + } + ++ if ((new_params.packet_merge.type != MLX5E_PACKET_MERGE_NONE) && ++ !MLX5E_GET_PFLAG(cur_params, MLX5E_PFLAG_RX_STRIDING_RQ)) { ++ netdev_warn(netdev, "can't set HW LRO with legacy RQ\n"); ++ err = -EINVAL; ++ goto out; ++ } ++ + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_modify_tirs_packet_merge_ctx, NULL, reset); + out: +@@ -3991,6 +4572,7 @@ out: + return err; + } + ++#ifdef HAVE_NETIF_F_GRO_HW + static int set_feature_hw_gro(struct net_device *netdev, bool enable) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -4019,6 +4601,7 @@ out: + mutex_unlock(&priv->state_lock); + return err; + } ++#endif + + static int set_feature_cvlan_filter(struct net_device *netdev, bool enable) + { +@@ -4148,6 +4731,7 @@ unlock: + } + + #ifdef CONFIG_MLX5_EN_ARFS ++#ifndef HAVE_NET_FLOW_KEYS_H + static int set_feature_arfs(struct net_device *netdev, bool enable) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -4161,6 +4745,7 @@ static int set_feature_arfs(struct net_d + return err; + } + #endif ++#endif + + static int mlx5e_handle_feature(struct net_device *netdev, + netdev_features_t *features, +@@ -4194,7 +4779,9 @@ int mlx5e_set_features(struct net_device + mlx5e_handle_feature(netdev, &oper_features, feature, 
handler) + + err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro); ++#ifdef HAVE_NETIF_F_GRO_HW + err |= MLX5E_HANDLE_FEATURE(NETIF_F_GRO_HW, set_feature_hw_gro); ++#endif + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER, + set_feature_cvlan_filter); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc); +@@ -4202,9 +4789,13 @@ int mlx5e_set_features(struct net_device + err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); + #ifdef CONFIG_MLX5_EN_ARFS ++#ifndef HAVE_NET_FLOW_KEYS_H + err |= MLX5E_HANDLE_FEATURE(NETIF_F_NTUPLE, set_feature_arfs); + #endif ++#endif ++#ifdef HAVE_NETIF_F_HW_TLS_RX + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TLS_RX, mlx5e_ktls_set_feature_rx); ++#endif + + if (err) { + netdev->features = oper_features; +@@ -4217,6 +4808,7 @@ int mlx5e_set_features(struct net_device + static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev, + netdev_features_t features) + { ++#ifdef HAVE_NETIF_F_HW_TLS_RX + features &= ~NETIF_F_HW_TLS_RX; + if (netdev->features & NETIF_F_HW_TLS_RX) + netdev_warn(netdev, "Disabling hw_tls_rx, not supported in switchdev mode\n"); +@@ -4225,13 +4817,20 @@ static netdev_features_t mlx5e_fix_uplin + if (netdev->features & NETIF_F_HW_TLS_TX) + netdev_warn(netdev, "Disabling hw_tls_tx, not supported in switchdev mode\n"); + ++#endif ++#ifdef CONFIG_MLX5_EN_ARFS ++#ifndef HAVE_NET_FLOW_KEYS_H + features &= ~NETIF_F_NTUPLE; + if (netdev->features & NETIF_F_NTUPLE) + netdev_warn(netdev, "Disabling ntuple, not supported in switchdev mode\n"); ++#endif ++#endif + ++#ifdef HAVE_NETIF_F_GRO_HW + features &= ~NETIF_F_GRO_HW; + if (netdev->features & NETIF_F_GRO_HW) + netdev_warn(netdev, "Disabling HW_GRO, not supported in switchdev mode\n"); ++#endif + + return features; + } +@@ -4254,28 +4853,39 @@ static netdev_features_t mlx5e_fix_featu + netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n"); + } + +- if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) { ++ if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ && IS_HW_LRO(&priv->channels.params) ++#endif ++ ) { + if (features & NETIF_F_LRO) { + netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n"); + features &= ~NETIF_F_LRO; + } ++#ifdef HAVE_NETIF_F_GRO_HW + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "Disabling HW-GRO, not supported in legacy RQ\n"); + features &= ~NETIF_F_GRO_HW; + } ++#endif + } + ++#ifdef HAVE_XDP_SUPPORT + if (params->xdp_prog) { + if (features & NETIF_F_LRO) { + netdev_warn(netdev, "LRO is incompatible with XDP\n"); + features &= ~NETIF_F_LRO; + } ++#ifdef HAVE_NETIF_F_GRO_HW + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "HW GRO is incompatible with XDP\n"); + features &= ~NETIF_F_GRO_HW; + } ++#endif + } ++#endif + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (priv->xsk.refcnt) { + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "HW GRO is incompatible with AF_XDP (%u XSKs are active)\n", +@@ -4283,16 +4893,19 @@ static netdev_features_t mlx5e_fix_featu + features &= ~NETIF_F_GRO_HW; + } + } ++#endif + + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) { + features &= ~NETIF_F_RXHASH; + if (netdev->features & NETIF_F_RXHASH) + netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n"); + ++#ifdef HAVE_NETIF_F_GRO_HW + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "Disabling HW-GRO, not 
supported when CQE compress is active\n"); + features &= ~NETIF_F_GRO_HW; + } ++#endif + } + + /* LRO/HW-GRO features cannot be combined with RX-FCS */ +@@ -4301,16 +4914,20 @@ static netdev_features_t mlx5e_fix_featu + netdev_warn(netdev, "Dropping LRO feature since RX-FCS is requested\n"); + features &= ~NETIF_F_LRO; + } ++#ifdef HAVE_NETIF_F_GRO_HW + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "Dropping HW-GRO feature since RX-FCS is requested\n"); + features &= ~NETIF_F_GRO_HW; + } ++#endif + } + ++#ifdef HAVE_NETIF_F_HW_TLS_RX + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_warn(netdev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } ++#endif + + if (mlx5e_is_uplink_rep(priv)) + features = mlx5e_fix_uplink_rep_features(netdev, features); +@@ -4320,6 +4937,7 @@ static netdev_features_t mlx5e_fix_featu + return features; + } + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + static bool mlx5e_xsk_validate_mtu(struct net_device *netdev, + struct mlx5e_channels *chs, + struct mlx5e_params *new_params, +@@ -4328,8 +4946,13 @@ static bool mlx5e_xsk_validate_mtu(struc + u16 ix; + + for (ix = 0; ix < chs->params.num_channels; ix++) { ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + struct xsk_buff_pool *xsk_pool = ++#else ++ struct xdp_umem *xsk_pool = ++#endif + mlx5e_xsk_get_pool(&chs->params, chs->params.xsk, ix); ++ + struct mlx5e_xsk_param xsk; + + if (!xsk_pool) +@@ -4357,6 +4980,7 @@ static bool mlx5e_xsk_validate_mtu(struc + + return true; + } ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ + + int mlx5e_change_mtu(struct net_device *netdev, int new_mtu, + mlx5e_fp_preactivate preactivate) +@@ -4364,19 +4988,38 @@ int mlx5e_change_mtu(struct net_device * + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_params new_params; + struct mlx5e_params *params; ++#if !defined(HAVE_NET_DEVICE_MIN_MAX_MTU) && !defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ struct mlx5_core_dev *mdev = priv->mdev; ++ u16 max_mtu; ++ u16 min_mtu; ++#endif + bool reset = true; + int err = 0; + + mutex_lock(&priv->state_lock); + + params = &priv->channels.params; ++#if !defined(HAVE_NET_DEVICE_MIN_MAX_MTU) && !defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ mlx5_query_port_max_mtu(mdev, &max_mtu, 1); ++ max_mtu = min_t(unsigned int, MLX5E_HW2SW_MTU(params, max_mtu), ++ ETH_MAX_MTU); ++ min_mtu = ETH_MIN_MTU; ++ ++ if (new_mtu > max_mtu || new_mtu < min_mtu) { ++ netdev_err(netdev, ++ "%s: Bad MTU (%d), valid range is: [%d..%d]\n", ++ __func__, new_mtu, min_mtu, max_mtu); ++ mutex_unlock(&priv->state_lock); ++ return -EINVAL; ++ } ++#endif + + new_params = *params; + new_params.sw_mtu = new_mtu; + err = mlx5e_validate_params(priv->mdev, &new_params); + if (err) + goto out; +- ++#ifdef HAVE_XDP_SUPPORT + if (params->xdp_prog && + !mlx5e_rx_is_linear_skb(&new_params, NULL)) { + netdev_err(netdev, "MTU(%d) > %d is not allowed while XDP enabled\n", +@@ -4385,6 +5028,7 @@ int mlx5e_change_mtu(struct net_device * + goto out; + } + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (priv->xsk.refcnt && + !mlx5e_xsk_validate_mtu(netdev, &priv->channels, + &new_params, priv->mdev)) { +@@ -4392,7 +5036,13 @@ int mlx5e_change_mtu(struct net_device * + goto out; + } + ++#endif /* HAVE_XSK_ZERO_COPY_SUPPORT */ ++#endif /* HAVE_XDP_SUPPORT */ ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (IS_HW_LRO(&priv->channels.params)) ++#else + if (params->packet_merge.type == MLX5E_PACKET_MERGE_LRO) ++#endif + reset = false; + + if (params->rq_wq_type == 
MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { +@@ -4406,7 +5056,10 @@ int mlx5e_change_mtu(struct net_device * + * Check that the mode was non-linear and didn't change. + * If XSK is active, XSK RQs are linear. + */ +- if (!is_linear_old && !is_linear_new && !priv->xsk.refcnt && ++ if (!is_linear_old && !is_linear_new && ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ !priv->xsk.refcnt && ++#endif + ppw_old == ppw_new) + reset = false; + } +@@ -4533,7 +5186,6 @@ int mlx5e_hwstamp_set(struct mlx5e_priv + memcpy(&priv->tstamp, &config, sizeof(config)); + mutex_unlock(&priv->state_lock); + +- /* might need to fix some features */ + netdev_update_features(priv->netdev); + + return copy_to_user(ifr->ifr_data, &config, +@@ -4576,15 +5228,24 @@ int mlx5e_set_vf_mac(struct net_device * + return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac); + } + ++#if defined(HAVE_NDO_SET_VF_VLAN) || defined(HAVE_NDO_SET_VF_VLAN_EXTENDED) ++#ifdef HAVE_VF_VLAN_PROTO + static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) ++#else ++static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; ++#ifndef HAVE_VF_VLAN_PROTO ++ __be16 vlan_proto = htons(ETH_P_8021Q); ++#endif + + return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1, + vlan, qos, vlan_proto); + } ++#endif /* HAVE_NDO_SET_VF_VLAN */ + + #ifdef HAVE_NETDEV_OPS_NDO_SET_VF_TRUNK_RANGE + static int mlx5e_add_vf_vlan_trunk_range(struct net_device *dev, int vf, +@@ -4624,6 +5285,7 @@ static int mlx5e_set_vf_spoofchk(struct + return mlx5_eswitch_set_vport_spoofchk(mdev->priv.eswitch, vf + 1, setting); + } + ++#if defined(HAVE_NETDEV_OPS_NDO_SET_VF_TRUST) || defined(HAVE_NETDEV_OPS_NDO_SET_VF_TRUST_EXTENDED) + static int mlx5e_set_vf_trust(struct net_device *dev, int vf, bool setting) + { + struct mlx5e_priv *priv = netdev_priv(dev); +@@ -4631,14 +5293,28 @@ static int mlx5e_set_vf_trust(struct net + + return mlx5_eswitch_set_vport_trust(mdev->priv.eswitch, vf + 1, setting); + } ++#endif + + int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; ++ int vport = (vf == 0xffff) ? 0 : vf + 1; ++ ++ /* MLNX OFED only -?? ++ * Allow to set eswitch min rate for the PF. ++ * In order to avoid bottlenecks on the slow-path arising from ++ * VF->PF packet transitions consuming a high amount of HW BW, ++ * resulting in drops of packets destined from PF->WIRE. ++ * This essentially assigns PF->WIRE a higher priority than VF->PF ++ * packet processing. 
*/ ++ if (vport == 0) { ++ min_tx_rate = max_tx_rate; ++ max_tx_rate = 0; ++ } + +- return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vf + 1, ++ return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vport, + max_tx_rate, min_tx_rate); + } + +@@ -4694,6 +5370,7 @@ int mlx5e_get_vf_config(struct net_devic + return 0; + } + ++#ifdef HAVE_NDO_GET_VF_STATS + int mlx5e_get_vf_stats(struct net_device *dev, + int vf, struct ifla_vf_stats *vf_stats) + { +@@ -4703,13 +5380,19 @@ int mlx5e_get_vf_stats(struct net_device + return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1, + vf_stats); + } ++#endif + ++#if defined(HAVE_NDO_HAS_OFFLOAD_STATS_GETS_NET_DEVICE) || defined(HAVE_NDO_HAS_OFFLOAD_STATS_EXTENDED) + static bool + mlx5e_has_offload_stats(const struct net_device *dev, int attr_id) + { + struct mlx5e_priv *priv = netdev_priv(dev); + ++#ifdef HAVE_NETIF_DEVICE_PRESENT_GET_CONST + if (!netif_device_present(dev)) ++#else ++ if (!netif_device_present_const(dev)) ++#endif + return false; + + if (!mlx5e_is_uplink_rep(priv)) +@@ -4717,7 +5400,9 @@ mlx5e_has_offload_stats(const struct net + + return mlx5e_rep_has_offload_stats(dev, attr_id); + } ++#endif + ++#if defined(HAVE_NDO_GET_OFFLOAD_STATS) || defined(HAVE_NDO_GET_OFFLOAD_STATS_EXTENDED) + static int + mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp) +@@ -4730,6 +5415,7 @@ mlx5e_get_offload_stats(int attr_id, con + return mlx5e_rep_get_offload_stats(attr_id, dev, sp); + } + #endif ++#endif /*CONFIG_MLX5_ESWITCH*/ + + static bool mlx5e_tunnel_proto_supported_tx(struct mlx5_core_dev *mdev, u8 proto_type) + { +@@ -4745,6 +5431,112 @@ static bool mlx5e_tunnel_proto_supported + } + } + ++ ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON ++struct mlx5e_vxlan_work { ++ struct work_struct work; ++ struct mlx5e_priv *priv; ++ u16 port; ++}; ++ ++#if defined(HAVE_NDO_UDP_TUNNEL_ADD) || defined(HAVE_NDO_UDP_TUNNEL_ADD_EXTENDED) || defined(HAVE_NDO_ADD_VXLAN_PORT) ++static void mlx5e_vxlan_add_work(struct work_struct *work) ++{ ++ struct mlx5e_vxlan_work *vxlan_work = ++ container_of(work, struct mlx5e_vxlan_work, work); ++ struct mlx5e_priv *priv = vxlan_work->priv; ++ u16 port = vxlan_work->port; ++ ++ mutex_lock(&priv->state_lock); ++ mlx5_vxlan_add_port(priv->mdev->vxlan, port); ++ mutex_unlock(&priv->state_lock); ++ ++ kfree(vxlan_work); ++} ++ ++static void mlx5e_vxlan_del_work(struct work_struct *work) ++{ ++ struct mlx5e_vxlan_work *vxlan_work = ++ container_of(work, struct mlx5e_vxlan_work, work); ++ struct mlx5e_priv *priv = vxlan_work->priv; ++ u16 port = vxlan_work->port; ++ ++ mutex_lock(&priv->state_lock); ++ mlx5_vxlan_del_port(priv->mdev->vxlan, port); ++ mutex_unlock(&priv->state_lock); ++ kfree(vxlan_work); ++} ++ ++static void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add) ++{ ++ struct mlx5e_vxlan_work *vxlan_work; ++ ++ vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC); ++ if (!vxlan_work) ++ return; ++ ++ if (add) ++ INIT_WORK(&vxlan_work->work, mlx5e_vxlan_add_work); ++ else ++ INIT_WORK(&vxlan_work->work, mlx5e_vxlan_del_work); ++ ++ vxlan_work->priv = priv; ++ vxlan_work->port = port; ++ queue_work(priv->wq, &vxlan_work->work); ++} ++#endif ++ ++#if defined(HAVE_NDO_UDP_TUNNEL_ADD) || defined(HAVE_NDO_UDP_TUNNEL_ADD_EXTENDED) ++void mlx5e_add_vxlan_port(struct net_device *netdev, struct udp_tunnel_info *ti) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ ++ if (ti->type != UDP_TUNNEL_TYPE_VXLAN) ++ return; ++ ++ if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ 
return; ++ ++ mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1); ++} ++ ++void mlx5e_del_vxlan_port(struct net_device *netdev, struct udp_tunnel_info *ti) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ ++ if (ti->type != UDP_TUNNEL_TYPE_VXLAN) ++ return; ++ ++ if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ return; ++ ++ mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0); ++} ++#elif defined(HAVE_NDO_ADD_VXLAN_PORT) ++void mlx5e_add_vxlan_port(struct net_device *netdev, ++ sa_family_t sa_family, __be16 port) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ ++ if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ return; ++ ++ mlx5e_vxlan_queue_work(priv, be16_to_cpu(port), 1); ++} ++ ++void mlx5e_del_vxlan_port(struct net_device *netdev, ++ sa_family_t sa_family, __be16 port) ++{ ++ struct mlx5e_priv *priv = netdev_priv(netdev); ++ ++ if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ return; ++ ++ mlx5e_vxlan_queue_work(priv, be16_to_cpu(port), 0); ++} ++#endif ++#endif /* HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON */ ++ + static bool mlx5e_gre_tunnel_inner_proto_offload_supported(struct mlx5_core_dev *mdev, + struct sk_buff *skb) + { +@@ -4765,9 +5557,13 @@ static netdev_features_t mlx5e_tunnel_fe + netdev_features_t features) + { + unsigned int offset = 0; ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON + struct udphdr *udph; ++#endif + u8 proto; ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON + u16 port; ++#endif + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): +@@ -4790,6 +5586,7 @@ static netdev_features_t mlx5e_tunnel_fe + if (mlx5e_tunnel_proto_supported_tx(priv->mdev, IPPROTO_IPIP)) + return features; + break; ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON + case IPPROTO_UDP: + udph = udp_hdr(skb); + port = be16_to_cpu(udph->dest); +@@ -4797,6 +5594,7 @@ static netdev_features_t mlx5e_tunnel_fe + /* Verify if UDP port is being offloaded by HW */ + if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port)) + return features; ++#endif + + #if IS_ENABLED(CONFIG_GENEVE) + /* Support Geneve offload for default UDP port */ +@@ -4822,7 +5620,9 @@ netdev_features_t mlx5e_features_check(s + struct mlx5e_priv *priv = netdev_priv(netdev); + + features = vlan_features_check(skb, features); ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON + features = vxlan_features_check(skb, features); ++#endif /* HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON */ + + /* Validate if the tunneled packet is being offloaded by HW */ + if (skb->encapsulation && +@@ -4866,7 +5666,11 @@ unlock: + rtnl_unlock(); + } + ++#ifdef HAVE_NDO_TX_TIMEOUT_GET_2_PARAMS + static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue) ++#else ++static void mlx5e_tx_timeout(struct net_device *dev) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + +@@ -4874,12 +5678,17 @@ static void mlx5e_tx_timeout(struct net_ + queue_work(priv->wq, &priv->tx_timeout_work); + } + ++#ifdef HAVE_XDP_SUPPORT + static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) + { + struct net_device *netdev = priv->netdev; + struct mlx5e_params new_params; + ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ if (IS_HW_LRO(&priv->channels.params)) { ++#else + if (priv->channels.params.packet_merge.type != MLX5E_PACKET_MERGE_NONE) { ++#endif + netdev_warn(netdev, "can't set XDP while HW-GRO/LRO is on, disable them first\n"); + return -EINVAL; + } +@@ -4967,15 +5776,33 @@ static int mlx5e_xdp_set(struct net_devi + /* exchanging programs w/o reset, we update ref counts on behalf + * of the channels RQs here. 
+ */ ++#ifndef HAVE_BPF_PROG_ADD_RET_STRUCT + bpf_prog_add(prog, priv->channels.num); ++#else ++ prog = bpf_prog_add(prog, priv->channels.num); ++ if (IS_ERR(prog)) { ++ err = PTR_ERR(prog); ++ goto unlock; ++ } ++#endif + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_channel *c = priv->channels.c[i]; + + mlx5e_rq_replace_xdp_prog(&c->rq, prog); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) { ++#ifndef HAVE_BPF_PROG_ADD_RET_STRUCT + bpf_prog_inc(prog); ++#else ++ prog = bpf_prog_inc(prog); ++ if (IS_ERR(prog)) { ++ err = PTR_ERR(prog); ++ goto unlock; ++ } ++#endif + mlx5e_rq_replace_xdp_prog(&c->xskrq, prog); + } ++#endif + } + + unlock: +@@ -4988,23 +5815,81 @@ unlock: + return err; + } + ++#ifndef HAVE_DEV_XDP_PROG_ID ++static u32 mlx5e_xdp_query(struct net_device *dev) ++{ ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ const struct bpf_prog *xdp_prog; ++ u32 prog_id = 0; ++ ++ if (!netif_device_present(dev)) ++ goto out; ++ ++ mutex_lock(&priv->state_lock); ++ xdp_prog = priv->channels.params.xdp_prog; ++ if (xdp_prog) ++ prog_id = xdp_prog->aux->id; ++ mutex_unlock(&priv->state_lock); ++ ++out: ++ return prog_id; ++} ++#endif ++ + static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) + { + switch (xdp->command) { + case XDP_SETUP_PROG: + return mlx5e_xdp_set(dev, xdp->prog); ++#ifndef HAVE_DEV_XDP_PROG_ID ++ case XDP_QUERY_PROG: ++ xdp->prog_id = mlx5e_xdp_query(dev); ++ return 0; ++#endif ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NETDEV_BPF_XSK_BUFF_POOL + case XDP_SETUP_XSK_POOL: + return mlx5e_xsk_setup_pool(dev, xdp->xsk.pool, + xdp->xsk.queue_id); ++#else ++ case XDP_SETUP_XSK_UMEM: ++ return mlx5e_xsk_setup_pool(dev, xdp->xsk.umem, ++ xdp->xsk.queue_id); ++#endif ++#endif + default: + return -EINVAL; + } + } ++#endif ++ ++#ifndef HAVE_NETPOLL_POLL_DEV_EXPORTED ++#ifdef CONFIG_NET_POLL_CONTROLLER ++/* Fake "interrupt" called by netpoll (eg netconsole) to send skbs without ++ * reenabling interrupts. ++ */ ++static void mlx5e_netpoll(struct net_device *dev) ++{ ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ struct mlx5e_channels *chs = &priv->channels; ++ ++ int i; ++ ++ for (i = 0; i < chs->num; i++) ++ napi_schedule(&chs->c[i]->napi); ++} ++#endif ++#endif/*HAVE_NETPOLL_POLL_DEV__EXPORTED*/ + + #ifdef CONFIG_MLX5_ESWITCH ++#if defined(HAVE_NDO_BRIDGE_GETLINK) || defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) + static int mlx5e_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, +- struct net_device *dev, u32 filter_mask, +- int nlflags) ++ struct net_device *dev, u32 filter_mask ++#if defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) ++ , int nlflags) ++#elif defined(HAVE_NDO_BRIDGE_GETLINK) ++ ) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; +@@ -5015,13 +5900,35 @@ static int mlx5e_bridge_getlink(struct s + if (err) + return err; + mode = setting ? 
BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB; +- return ndo_dflt_bridge_getlink(skb, pid, seq, dev, +- mode, +- 0, 0, nlflags, filter_mask, NULL); ++ return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode ++#if defined(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK) ++ , 0, 0); ++#endif ++#if defined(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK_NFLAGS) && defined(HAVE_NDO_BRIDGE_GETLINK) ++ , 0, 0, 0); ++#endif ++#if defined(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK_NFLAGS) && defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) ++ , 0, 0, nlflags); ++#endif ++#if defined(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK_NFLAGS_FILTER) && defined(HAVE_NDO_BRIDGE_GETLINK) ++ , 0, 0, 0, filter_mask, NULL); ++#endif ++#if defined(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK_NFLAGS_FILTER) && defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) ++ , 0, 0, nlflags, filter_mask, NULL); ++#endif + } ++#endif + ++#if defined(HAVE_NDO_BRIDGE_SETLINK) || defined(HAVE_NDO_BRIDGE_SETLINK_EXTACK) ++#ifdef HAVE_NDO_BRIDGE_SETLINK_EXTACK + static int mlx5e_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags, struct netlink_ext_ack *extack) ++#endif ++#ifdef HAVE_NDO_BRIDGE_SETLINK ++static int mlx5e_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, ++ u16 flags) ++#endif ++ + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; +@@ -5056,37 +5963,184 @@ static int mlx5e_bridge_setlink(struct n + } + #endif + ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++#if defined(HAVE_NDO_GET_PHYS_PORT_NAME) || defined(HAVE_NDO_GET_PHYS_PORT_NAME_EXTENDED) ++int mlx5e_get_phys_port_name(struct net_device *dev, ++ char *buf, size_t len) ++{ ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ unsigned int fn; ++ int ret; ++ ++ if (!netif_device_present(dev)) ++ return -EOPNOTSUPP; ++ ++ if (mlx5e_is_uplink_rep(priv)) ++ return mlx5e_rep_get_phys_port_name(dev, buf, len); ++ ++ /* Only rename ecpf, don't rename non-smartnic PF/VF/SF */ ++ if (!mlx5_core_is_pf(priv->mdev) && ++ !mlx5_core_is_ecpf(priv->mdev)) ++ return -EOPNOTSUPP; ++ ++ fn = mlx5_get_dev_index(priv->mdev); ++ ret = snprintf(buf, len, "p%d", fn); ++ if (ret >= len) ++ return -EOPNOTSUPP; ++ ++ return 0; ++} ++#endif ++#endif ++ ++#if defined(HAVE_NDO_GET_PORT_PARENT_ID) ++#ifdef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++void ++#else ++int ++#endif ++mlx5e_get_port_parent_id(struct net_device *dev, ++ struct netdev_phys_item_id *ppid) ++{ ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ ++ if (!netif_device_present(dev)) ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++ return -EOPNOTSUPP; ++#else ++ return; ++#endif ++ ++ if (!mlx5e_is_uplink_rep(priv)) ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++ return -EOPNOTSUPP; ++#else ++ return; ++#endif ++ ++#ifndef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++ return mlx5e_rep_get_port_parent_id(dev, ppid); ++#else ++ mlx5e_rep_get_port_parent_id(dev, ppid); ++#endif ++} ++#endif ++#endif /* CONFIG_MLX5_ESWITCH */ ++ + const struct net_device_ops mlx5e_netdev_ops = { + .ndo_open = mlx5e_open, + .ndo_stop = mlx5e_close, + .ndo_start_xmit = mlx5e_xmit, ++#ifdef HAVE_NDO_SETUP_TC_RH_EXTENDED ++ .extended.ndo_setup_tc_rh = mlx5e_setup_tc, ++#else ++#ifdef HAVE_NDO_SETUP_TC ++#ifdef HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE + .ndo_setup_tc = mlx5e_setup_tc, ++#else ++#if defined(HAVE_NDO_SETUP_TC_4_PARAMS) || defined(HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX) ++ .ndo_setup_tc = mlx5e_ndo_setup_tc, ++#else ++ .ndo_setup_tc = mlx5e_setup_tc, ++#endif ++#endif ++#endif ++#endif + .ndo_select_queue = mlx5e_select_queue, ++#if 
defined(HAVE_NDO_GET_STATS64) || defined(HAVE_NDO_GET_STATS64_RET_VOID) + .ndo_get_stats64 = mlx5e_get_stats, ++#else ++ .ndo_get_stats = mlx5e_get_stats, ++#endif + .ndo_set_rx_mode = mlx5e_set_rx_mode, + .ndo_set_mac_address = mlx5e_set_mac, + .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx5e_vlan_rx_kill_vid, + .ndo_set_features = mlx5e_set_features, + .ndo_fix_features = mlx5e_fix_features, +- .ndo_change_mtu = mlx5e_change_nic_mtu, ++#ifdef HAVE_NDO_CHANGE_MTU_EXTENDED ++ .extended.ndo_change_mtu = mlx5e_change_nic_mtu, ++#else ++ .ndo_change_mtu = mlx5e_change_nic_mtu, ++#endif ++ ++#ifdef HAVE_NDO_ETH_IOCTL + .ndo_eth_ioctl = mlx5e_ioctl, ++#else ++ .ndo_do_ioctl = mlx5e_ioctl, ++#endif ++ ++#ifdef HAVE_NDO_SET_TX_MAXRATE + .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, ++#elif defined(HAVE_NDO_SET_TX_MAXRATE_EXTENDED) ++ .extended.ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, ++#endif ++ ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON ++#if defined(HAVE_UDP_TUNNEL_NIC_INFO) && defined(HAVE_NDO_UDP_TUNNEL_ADD) ++ .ndo_udp_tunnel_add = udp_tunnel_nic_add_port, ++ .ndo_udp_tunnel_del = udp_tunnel_nic_del_port, ++#elif defined(HAVE_NDO_UDP_TUNNEL_ADD) ++ .ndo_udp_tunnel_add = mlx5e_add_vxlan_port, ++ .ndo_udp_tunnel_del = mlx5e_del_vxlan_port, ++#elif defined(HAVE_NDO_UDP_TUNNEL_ADD_EXTENDED) ++ .extended.ndo_udp_tunnel_add = mlx5e_add_vxlan_port, ++ .extended.ndo_udp_tunnel_del = mlx5e_del_vxlan_port, ++#elif defined(HAVE_NDO_ADD_VXLAN_PORT) ++ .ndo_add_vxlan_port = mlx5e_add_vxlan_port, ++ .ndo_del_vxlan_port = mlx5e_del_vxlan_port, ++#endif /* HAVE_UDP_TUNNEL_NIC_INFO */ ++#endif + .ndo_features_check = mlx5e_features_check, + .ndo_tx_timeout = mlx5e_tx_timeout, +- .ndo_bpf = mlx5e_xdp, ++#ifdef HAVE_XDP_SUPPORT ++#ifdef HAVE_NDO_XDP_EXTENDED ++ .extended.ndo_xdp = mlx5e_xdp, ++#else ++ .ndo_bpf = mlx5e_xdp, ++#endif ++#ifdef HAVE_NDO_XDP_XMIT + .ndo_xdp_xmit = mlx5e_xdp_xmit, ++#endif ++#ifdef HAVE_NDO_XDP_FLUSH ++ .ndo_xdp_flush = mlx5e_xdp_flush, ++#endif ++#endif /* HAVE_XDP_SUPPORT */ ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++#ifdef HAVE_NDO_XSK_WAKEUP + .ndo_xsk_wakeup = mlx5e_xsk_wakeup, ++#else ++ .ndo_xsk_async_xmit = mlx5e_xsk_wakeup, ++#endif ++#endif + #ifdef CONFIG_MLX5_EN_ARFS ++#ifndef HAVE_NET_FLOW_KEYS_H + .ndo_rx_flow_steer = mlx5e_rx_flow_steer, + #endif ++#endif ++#ifndef HAVE_NETPOLL_POLL_DEV_EXPORTED ++#ifdef CONFIG_NET_POLL_CONTROLLER ++ .ndo_poll_controller = mlx5e_netpoll, ++#endif ++#endif ++#ifdef HAVE_NET_DEVICE_OPS_EXTENDED ++ .ndo_size = sizeof(struct net_device_ops), ++#endif + #ifdef CONFIG_MLX5_ESWITCH ++#if defined(HAVE_NDO_BRIDGE_SETLINK) || defined(HAVE_NDO_BRIDGE_SETLINK_EXTACK) + .ndo_bridge_setlink = mlx5e_bridge_setlink, ++#endif ++#if defined(HAVE_NDO_BRIDGE_GETLINK) || defined(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS) + .ndo_bridge_getlink = mlx5e_bridge_getlink, ++#endif + + /* SRIOV E-Switch NDOs */ + .ndo_set_vf_mac = mlx5e_set_vf_mac, ++#if defined(HAVE_NDO_SET_VF_VLAN) + .ndo_set_vf_vlan = mlx5e_set_vf_vlan, ++#elif defined(HAVE_NDO_SET_VF_VLAN_EXTENDED) ++ .extended.ndo_set_vf_vlan = mlx5e_set_vf_vlan, ++#endif + + /* these ndo's are not upstream yet */ + #ifdef HAVE_NETDEV_OPS_NDO_SET_VF_TRUNK_RANGE +@@ -5095,15 +6149,40 @@ const struct net_device_ops mlx5e_netdev + #endif + + .ndo_set_vf_spoofchk = mlx5e_set_vf_spoofchk, ++#ifdef HAVE_NETDEV_OPS_NDO_SET_VF_TRUST + .ndo_set_vf_trust = mlx5e_set_vf_trust, ++#elif defined(HAVE_NETDEV_OPS_NDO_SET_VF_TRUST_EXTENDED) ++ .extended.ndo_set_vf_trust = mlx5e_set_vf_trust, ++#endif + 
.ndo_set_vf_rate = mlx5e_set_vf_rate, ++ .ndo_set_vf_link_state = mlx5e_set_vf_link_state, + .ndo_get_vf_config = mlx5e_get_vf_config, +- .ndo_set_vf_link_state = mlx5e_set_vf_link_state, +- .ndo_get_vf_stats = mlx5e_get_vf_stats, ++#ifdef HAVE_NDO_GET_VF_STATS ++ .ndo_get_vf_stats = mlx5e_get_vf_stats, ++#endif ++#ifdef HAVE_NDO_GET_DEVLINK_PORT ++ .ndo_get_devlink_port = mlx5e_get_devlink_port, ++#else ++#ifdef HAVE_NDO_GET_PHYS_PORT_NAME ++ .ndo_get_phys_port_name = mlx5e_get_phys_port_name, ++#elif defined(HAVE_NDO_GET_PHYS_PORT_NAME_EXTENDED) ++ .extended.ndo_get_phys_port_name = mlx5e_get_phys_port_name, ++#endif ++#ifdef HAVE_NDO_GET_PORT_PARENT_ID ++ .ndo_get_port_parent_id = mlx5e_get_port_parent_id, ++#endif ++#endif ++#ifdef HAVE_NDO_HAS_OFFLOAD_STATS_GETS_NET_DEVICE + .ndo_has_offload_stats = mlx5e_has_offload_stats, ++#elif defined(HAVE_NDO_HAS_OFFLOAD_STATS_EXTENDED) ++ .extended.ndo_has_offload_stats = mlx5e_has_offload_stats, ++#endif ++#ifdef HAVE_NDO_GET_OFFLOAD_STATS + .ndo_get_offload_stats = mlx5e_get_offload_stats, ++#elif defined(HAVE_NDO_GET_OFFLOAD_STATS_EXTENDED) ++ .extended.ndo_get_offload_stats = mlx5e_get_offload_stats, + #endif +- .ndo_get_devlink_port = mlx5e_get_devlink_port, ++#endif /* CONFIG_MLX5_ESWITCH */ + }; + + u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) +@@ -5130,7 +6209,11 @@ static void mlx5e_init_delay_drop(struct + INIT_WORK(&priv->delay_drop.work, mlx5e_delay_drop_handler); + } + +-void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu) ++void mlx5e_build_nic_params(struct mlx5e_priv *priv, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ struct mlx5e_xsk *xsk, ++#endif ++ u16 mtu) + { + struct mlx5e_params *params = &priv->channels.params; + struct mlx5_core_dev *mdev = priv->mdev; +@@ -5150,7 +6233,9 @@ void mlx5e_build_nic_params(struct mlx5e + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE, mlx5e_tx_mpwqe_supported(mdev)); + + /* XDP SQ */ ++#ifdef HAVE_XDP_SUPPORT + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_XDP_TX_MPWQE, mlx5e_tx_mpwqe_supported(mdev)); ++#endif + + /* set CQE compression */ + params->rx_cqe_compress_def = false; +@@ -5159,6 +6244,7 @@ void mlx5e_build_nic_params(struct mlx5e + params->rx_cqe_compress_def = slow_pci_heuristic(mdev); + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def); ++ MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_COMPRESS, false); + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, false); + + /* RQ */ +@@ -5170,6 +6256,12 @@ void mlx5e_build_nic_params(struct mlx5e + rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 
+ MLX5_CQ_PERIOD_MODE_START_FROM_CQE : + MLX5_CQ_PERIOD_MODE_START_FROM_EQE; ++ ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ MLX5E_SET_PFLAG(params, MLX5E_PFLAG_HWLRO, MLX5_CAP_ETH(mdev, lro_cap) && ++ MLX5_CAP_GEN(mdev, striding_rq)); ++#endif ++ + params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); + params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); + mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode); +@@ -5181,11 +6273,17 @@ void mlx5e_build_nic_params(struct mlx5e + params->tunneled_offload_en = mlx5_tunnel_inner_ft_supported(mdev); + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_PER_CH_STATS, true); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + /* AF_XDP */ + params->xsk = xsk; ++#endif + + /* TX HW checksum offload for XDP is off by default */ + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_XDP_CSUM, 0); ++ ++ /* skb xmit_more in driver is off by default */ ++ MLX5E_SET_PFLAG(params, MLX5E_PFLAG_SKB_XMIT_MORE, 0); ++ + /* Do not update netdev->features directly in here + * on mlx5e_attach_netdev() we will call mlx5e_update_features() + * To update netdev->features please modify mlx5e_fix_features() +@@ -5195,19 +6293,28 @@ void mlx5e_build_nic_params(struct mlx5e + static void mlx5e_set_netdev_dev_addr(struct net_device *netdev) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +- u8 addr[ETH_ALEN]; ++#ifdef HAVE_DEV_ADDR_MOD ++ u8 addr[ETH_ALEN]; + +- mlx5_query_mac_address(priv->mdev, addr); +- if (is_zero_ether_addr(addr) && ++ mlx5_query_mac_address(priv->mdev, addr); ++ if (is_zero_ether_addr(addr) && ++#else ++ mlx5_query_mac_address(priv->mdev, netdev->dev_addr); ++ if (is_zero_ether_addr(netdev->dev_addr) && ++#endif + !MLX5_CAP_GEN(priv->mdev, vport_group_manager)) { + eth_hw_addr_random(netdev); + mlx5_core_info(priv->mdev, "Assigned random MAC address %pM\n", netdev->dev_addr); ++#ifdef HAVE_DEV_ADDR_MOD + return; ++#endif + } +- ++#ifdef HAVE_DEV_ADDR_MOD + eth_hw_addr_set(netdev, addr); ++#endif + } + ++#if defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) && defined(HAVE_UDP_TUNNEL_NIC_INFO) + static int mlx5e_vxlan_set_port(struct net_device *netdev, unsigned int table, + unsigned int entry, struct udp_tunnel_info *ti) + { +@@ -5240,6 +6347,13 @@ void mlx5e_vxlan_set_netdev_info(struct + + priv->netdev->udp_tunnel_nic_info = &priv->nic_info; + } ++#endif ++ ++#if defined(CONFIG_MLX5_ESWITCH) && defined(HAVE_SWITCHDEV_OPS) ++static const struct switchdev_ops mlx5e_switchdev_ops = { ++ .switchdev_port_attr_get = mlx5e_attr_get, ++}; ++#endif + + static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev) + { +@@ -5249,7 +6363,11 @@ static bool mlx5e_tunnel_any_tx_proto_su + if (mlx5e_tunnel_proto_supported_tx(mdev, mlx5_get_proto_by_tunnel_type(tt))) + return true; + } ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON + return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev)); ++#else ++ return false; ++#endif + } + + static void mlx5e_build_nic_netdev(struct net_device *netdev) +@@ -5265,6 +6383,9 @@ static void mlx5e_build_nic_netdev(struc + + mlx5e_dcbnl_build_netdev(netdev); + ++#if defined(CONFIG_MLX5_ESWITCH) && defined(HAVE_SWITCHDEV_OPS) ++ netdev->switchdev_ops = &mlx5e_switchdev_ops; ++#endif + netdev->watchdog_timeo = 15 * HZ; + + netdev->ethtool_ops = &mlx5e_ethtool_ops; +@@ -5290,12 +6411,16 @@ static void mlx5e_build_nic_netdev(struc + * for inner TIRs while having it enabled for outer TIRs. Due to this, + * block LRO altogether if the firmware declares tunneled LRO support. 
+ */ ++ /* If SW LRO is supported turn on LRO Primary flags*/ ++#ifdef CONFIG_COMPAT_LRO_ENABLED_IPOIB ++ netdev->vlan_features |= NETIF_F_LRO; ++#else + if (!!MLX5_CAP_ETH(mdev, lro_cap) && + !MLX5_CAP_ETH(mdev, tunnel_lro_vxlan) && + !MLX5_CAP_ETH(mdev, tunnel_lro_gre) && + mlx5e_check_fragmented_striding_rq_cap(mdev)) + netdev->vlan_features |= NETIF_F_LRO; +- ++#endif + netdev->hw_features = netdev->vlan_features; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; +@@ -5306,41 +6431,55 @@ static void mlx5e_build_nic_netdev(struc + netdev->hw_enc_features |= NETIF_F_HW_CSUM; + netdev->hw_enc_features |= NETIF_F_TSO; + netdev->hw_enc_features |= NETIF_F_TSO6; ++#ifdef HAVE_NETIF_F_GSO_PARTIAL + netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL; ++#endif + } + ++#ifdef HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON + if (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev)) { + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; ++#ifdef HAVE_NETIF_F_GSO_PARTIAL + netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; ++#endif + netdev->vlan_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + } ++#endif + + if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_GRE)) { + netdev->hw_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; ++#ifdef HAVE_NETIF_F_GSO_PARTIAL + netdev->gso_partial_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; ++#endif + } + + if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_IPIP)) { ++#ifdef HAVE_NETIF_F_GSO_IPXIP6 + netdev->hw_features |= NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_IPXIP6; + netdev->hw_enc_features |= NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_IPXIP6; + netdev->gso_partial_features |= NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_IPXIP6; ++#endif + } + ++#ifdef HAVE_NETIF_F_GSO_PARTIAL + netdev->hw_features |= NETIF_F_GSO_PARTIAL; ++#endif ++#ifdef HAVE_NETIF_F_GSO_UDP_L4 + netdev->gso_partial_features |= NETIF_F_GSO_UDP_L4; + netdev->hw_features |= NETIF_F_GSO_UDP_L4; + netdev->features |= NETIF_F_GSO_UDP_L4; ++#endif + + mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled); + +@@ -5350,8 +6489,12 @@ static void mlx5e_build_nic_netdev(struc + if (MLX5_CAP_ETH(mdev, scatter_fcs)) + netdev->hw_features |= NETIF_F_RXFCS; + ++#ifdef CONFIG_COMPAT_CLS_FLOWER_MOD ++#if !defined(CONFIG_NET_SCHED_NEW) && !defined(CONFIG_COMPAT_KERNEL_4_14) + if (mlx5_qos_is_supported(mdev)) + netdev->hw_features |= NETIF_F_HW_TC; ++#endif ++#endif + + netdev->features = netdev->hw_features; + +@@ -5359,7 +6502,9 @@ static void mlx5e_build_nic_netdev(struc + if (fcs_enabled) + netdev->features &= ~NETIF_F_RXALL; + netdev->features &= ~NETIF_F_LRO; ++#ifdef HAVE_NETIF_F_GRO_HW + netdev->features &= ~NETIF_F_GRO_HW; ++#endif + netdev->features &= ~NETIF_F_RXFCS; + + #define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f) +@@ -5371,8 +6516,10 @@ static void mlx5e_build_nic_netdev(struc + netdev->hw_features |= NETIF_F_HW_TC; + #endif + #ifdef CONFIG_MLX5_EN_ARFS ++#ifndef HAVE_NET_FLOW_KEYS_H + netdev->hw_features |= NETIF_F_NTUPLE; + #endif ++#endif + } + + netdev->features |= NETIF_F_HIGHDMA; +@@ -5430,8 +6577,14 @@ static int mlx5e_nic_init(struct mlx5_co + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + +- mlx5e_build_nic_params(priv, &priv->xsk, netdev->mtu); ++ mlx5e_build_nic_params(priv, ++#ifdef 
HAVE_XSK_ZERO_COPY_SUPPORT ++ &priv->xsk, ++#endif ++ netdev->mtu); ++#if defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) && defined(HAVE_UDP_TUNNEL_NIC_INFO) + mlx5e_vxlan_set_netdev_info(priv); ++#endif + mutex_init(&priv->aso_lock); + + mlx5e_init_delay_drop(priv, &priv->channels.params); +@@ -5485,7 +6638,11 @@ static int mlx5e_init_nic_rx(struct mlx5 + goto err_destroy_q_counters; + } + +- features = MLX5E_RX_RES_FEATURE_XSK | MLX5E_RX_RES_FEATURE_PTP; ++ features = MLX5E_RX_RES_FEATURE_PTP ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT ++ | MLX5E_RX_RES_FEATURE_XSK ++#endif ++ ; + if (priv->channels.params.tunneled_offload_en) + features |= MLX5E_RX_RES_FEATURE_INNER_FT; + err = mlx5e_rx_res_init(priv->rx_res, priv->mdev, features, +@@ -5542,6 +6699,7 @@ static void mlx5e_cleanup_nic_rx(struct + priv->rx_res = NULL; + } + ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + static void mlx5e_set_mqprio_rl(struct mlx5e_priv *priv) + { + struct mlx5e_params *params; +@@ -5558,6 +6716,7 @@ static void mlx5e_set_mqprio_rl(struct m + priv->mqprio_rl = rl; + mlx5e_mqprio_rl_update_params(params, rl); + } ++#endif + + static int mlx5e_init_nic_tx(struct mlx5e_priv *priv) + { +@@ -5568,8 +6727,9 @@ static int mlx5e_init_nic_tx(struct mlx5 + mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err); + return err; + } +- ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + mlx5e_set_mqprio_rl(priv); ++#endif + mlx5e_dcbnl_initialize(priv); + return 0; + } +@@ -5579,6 +6739,9 @@ static void mlx5e_nic_enable(struct mlx5 + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + int err; ++#if defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ u16 max_mtu; ++#endif + + mlx5e_init_l2_addr(priv); + +@@ -5590,11 +6753,20 @@ static void mlx5e_nic_enable(struct mlx5 + if (!netif_running(netdev)) + mlx5e_modify_admin_state(mdev, MLX5_PORT_DOWN); + ++#ifdef HAVE_NET_DEVICE_MIN_MAX_MTU + mlx5e_set_netdev_mtu_boundaries(priv); ++#elif defined(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED) ++ netdev->extended->min_mtu = ETH_MIN_MTU; ++ mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1); ++ netdev->extended->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu); ++#endif + mlx5e_set_dev_port_mtu(priv); + + mlx5_lag_add_netdev(mdev, netdev); + ++ if (!is_valid_ether_addr(netdev->perm_addr)) ++ memcpy(netdev->perm_addr, netdev->dev_addr, netdev->addr_len); ++ + mlx5e_enable_async_events(priv); + mlx5e_enable_blocking_events(priv); + if (mlx5e_monitor_counter_supported(priv)) +@@ -5610,7 +6782,13 @@ static void mlx5e_nic_enable(struct mlx5 + rtnl_lock(); + if (netif_running(netdev)) + mlx5e_open(netdev); ++#ifdef HAVE_UDP_TUNNEL_NIC_INFO + udp_tunnel_nic_reset_ntf(priv->netdev); ++#elif defined(HAVE_UDP_TUNNEL_RX_INFO) && defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) \ ++ && defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++ if (mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ udp_tunnel_get_rx_info(priv->netdev); ++#endif + netif_device_attach(netdev); + rtnl_unlock(); + } +@@ -5625,6 +6803,14 @@ static void mlx5e_nic_disable(struct mlx + rtnl_lock(); + if (netif_running(priv->netdev)) + mlx5e_close(priv->netdev); ++#ifndef HAVE_UDP_TUNNEL_NIC_INFO ++#if defined(HAVE_UDP_TUNNEL_RX_INFO) && defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) \ ++ && defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) ++ if (mlx5_vxlan_allowed(priv->mdev->vxlan)) ++ udp_tunnel_drop_rx_info(priv->netdev); ++ ++#endif ++#endif + netif_device_detach(priv->netdev); + rtnl_unlock(); + +@@ -5642,7 +6828,9 @@ static void mlx5e_nic_disable(struct mlx + } + mlx5e_disable_async_events(priv); 
+ mlx5_lag_remove_netdev(mdev, priv->netdev); ++#if defined(HAVE_KERNEL_WITH_VXLAN_SUPPORT_ON) && defined(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN) + mlx5_vxlan_reset_to_default(mdev->vxlan); ++#endif + mlx5e_macsec_cleanup(priv); + } + +@@ -5665,7 +6853,11 @@ static const struct mlx5e_profile mlx5e_ + .update_carrier = mlx5e_update_carrier, + .rx_handlers = &mlx5e_rx_handlers_nic, + .max_tc = MLX5E_MAX_NUM_TC, ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + .rq_groups = MLX5E_NUM_RQ_GROUPS(XSK), ++#else ++ .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), ++#endif + .stats_grps = mlx5e_nic_stats_grps, + .stats_grps_num = mlx5e_nic_stats_grps_num, + .features = BIT(MLX5E_PROFILE_FEATURE_PTP_RX) | +@@ -5914,12 +7106,16 @@ int mlx5e_attach_netdev(struct mlx5e_pri + /* Reducing the number of channels - RXFH has to be reset, and + * mlx5e_num_channels_changed below will build the RQT. + */ ++#ifdef HAVE_NETDEV_IFF_RXFH_CONFIGURED + priv->netdev->priv_flags &= ~IFF_RXFH_CONFIGURED; ++#endif + priv->channels.params.num_channels = max_nch; ++#ifdef HAVE_TC_MQPRIO_QOPT_OFFLOAD + if (priv->channels.params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + mlx5_core_warn(priv->mdev, "MLX5E: Disabling MQPRIO channel mode\n"); + mlx5e_params_mqprio_reset(&priv->channels.params); + } ++#endif + } + if (max_nch != priv->max_nch) { + mlx5_core_warn(priv->mdev, +@@ -6152,11 +7348,13 @@ static int mlx5e_probe(struct auxiliary_ + priv->profile = profile; + priv->ppriv = NULL; + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + err = mlx5e_devlink_port_register(priv); + if (err) { + mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err); + goto err_destroy_netdev; + } ++#endif + + err = profile->init(mdev, netdev); + if (err) { +@@ -6170,13 +7368,17 @@ static int mlx5e_probe(struct auxiliary_ + goto err_profile_cleanup; + } + ++ mlx5e_rep_set_sysfs_attr(netdev); ++ + err = register_netdev(netdev); + if (err) { + mlx5_core_err(mdev, "register_netdev failed, %d\n", err); + goto err_resume; + } + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + mlx5e_devlink_port_type_eth_set(priv); ++#endif + + err = mlx5e_sysfs_create(netdev); + if (err) +@@ -6193,8 +7395,10 @@ err_resume: + err_profile_cleanup: + profile->cleanup(priv); + err_devlink_cleanup: ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + mlx5e_devlink_port_unregister(priv); + err_destroy_netdev: ++#endif + mlx5e_destroy_netdev(priv); + return err; + } +@@ -6209,7 +7413,9 @@ static void mlx5e_remove(struct auxiliar + unregister_netdev(priv->netdev); + mlx5e_suspend(adev, state); + priv->profile->cleanup(priv); ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + mlx5e_devlink_port_unregister(priv); ++#endif + mlx5e_destroy_netdev(priv); + } + +@@ -6234,7 +7440,9 @@ int mlx5e_init(void) + int ret; + + mlx5e_ipsec_build_inverse_table(); ++#ifdef __ETHTOOL_DECLARE_LINK_MODE_MASK + mlx5e_build_ptys2ethtool_map(); ++#endif + ret = auxiliary_driver_register(&mlx5e_driver); + if (ret) + return ret; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-net-sunrpc-xprtrdma-xprt_rdma.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-net-sunrpc-xprtrdma-xprt_rdma.h.patch new file mode 100644 index 0000000..884dd86 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0270-BACKPORT-net-sunrpc-xprtrdma-xprt_rdma.h.patch @@ -0,0 +1,172 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: net/sunrpc/xprtrdma/xprt_rdma.h + +Change-Id: I6aa2f3174e853844afaffa9321b3aa73319c34b5 +--- + net/sunrpc/xprtrdma/xprt_rdma.h | 86 +++++++++++++++++++++++++++++++++ + 1 file changed, 
86 insertions(+) + +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -310,7 +310,9 @@ enum { + struct rpcrdma_buffer; + struct rpcrdma_req { + struct list_head rl_list; ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + struct rpc_rqst rl_slot; ++#endif + struct rpcrdma_rep *rl_reply; + struct xdr_stream rl_stream; + struct xdr_buf rl_hdrbuf; +@@ -320,6 +322,10 @@ struct rpcrdma_req { + struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ + struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ + ++#if !defined(HAVE_RPC_XPRT_OPS_FREE_SLOT) || !defined(HAVE_XPRT_PIN_RQST) ++ __be32 rl_xid; ++#endif ++ + struct list_head rl_all; + struct kref rl_kref; + +@@ -328,10 +334,22 @@ struct rpcrdma_req { + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + }; + ++#ifndef HAVE_RPC_XPRT_OPS_FREE_SLOT ++static inline void ++rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) ++{ ++ rqst->rq_xprtdata = req; ++} ++#endif ++ + static inline struct rpcrdma_req * + rpcr_to_rdmar(const struct rpc_rqst *rqst) + { ++#ifdef HAVE_RPC_XPRT_OPS_FREE_SLOT + return container_of(rqst, struct rpcrdma_req, rl_slot); ++#else ++ return rqst->rq_xprtdata; ++#endif + } + + static inline void +@@ -362,11 +380,18 @@ struct rpcrdma_buffer { + struct list_head rb_send_bufs; + struct list_head rb_mrs; + ++#ifndef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG ++ unsigned long rb_flags; ++#endif + unsigned long rb_sc_head; + unsigned long rb_sc_tail; + unsigned long rb_sc_last; + struct rpcrdma_sendctx **rb_sc_ctxs; + ++#ifndef HAVE_XPRT_PIN_RQST ++ struct list_head rb_pending; ++#endif ++ + struct list_head rb_allreqs; + struct list_head rb_all_mrs; + struct list_head rb_all_reps; +@@ -382,6 +407,38 @@ struct rpcrdma_buffer { + struct work_struct rb_refresh_worker; + }; + ++#ifndef HAVE_XPRT_PIN_RQST ++static inline void ++rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) ++{ ++ spin_lock(&buffers->rb_lock); ++ if (list_empty(&req->rl_list)) ++ list_add_tail(&req->rl_list, &buffers->rb_pending); ++ spin_unlock(&buffers->rb_lock); ++} ++ ++static inline struct rpcrdma_req * ++rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid) ++{ ++ struct rpcrdma_req *pos; ++ ++ list_for_each_entry(pos, &buffers->rb_pending, rl_list) ++ if (pos->rl_xid == xid) ++ return pos; ++ return NULL; ++} ++ ++static inline void ++rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) ++{ ++ spin_lock(&buffers->rb_lock); ++ list_del(&req->rl_list); ++ spin_unlock(&buffers->rb_lock); ++} ++ ++void rpcrdma_recv_buffer_put_locked(struct rpcrdma_rep *rep); ++#endif ++ + /* + * Statistics for RPCRDMA + */ +@@ -426,7 +483,9 @@ struct rpcrdma_xprt { + struct rpcrdma_ep *rx_ep; + struct rpcrdma_buffer rx_buf; + struct delayed_work rx_connect_worker; ++#ifdef HAVE_XPRT_RECONNECT_DELAY + struct rpc_timeout rx_timeout; ++#endif + struct rpcrdma_stats rx_stats; + }; + +@@ -454,6 +513,13 @@ extern int xprt_rdma_pad_optimize; + */ + extern unsigned int xprt_rdma_memreg_strategy; + ++#ifndef HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG ++/* rb_flags */ ++enum { ++ RPCRDMA_BUF_F_EMPTY_SCQ = 0, ++}; ++#endif ++ + /* + * Endpoint calls - xprtrdma/verbs.c + */ +@@ -572,6 +638,21 @@ static inline void rpcrdma_set_xdrlen(st + xdr->len = len; + } + ++#ifndef HAVE_XDR_STREAM_REMAINING ++/** ++ * xdr_stream_remaining - Return the number of bytes remaining in the stream ++ * @xdr: pointer to struct xdr_stream ++ * ++ * Return value: ++ * Number of bytes remaining in @xdr before xdr->end 
++ */ ++static inline size_t ++xdr_stream_remaining(const struct xdr_stream *xdr) ++{ ++ return xdr->nwords << 2; ++} ++#endif ++ + /* RPC/RDMA module init - xprtrdma/transport.c + */ + extern unsigned int xprt_rdma_max_inline_read; +@@ -587,8 +668,13 @@ void xprt_rdma_cleanup(void); + */ + #if defined(CONFIG_SUNRPC_BACKCHANNEL) + int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); ++#ifdef HAVE_RPC_XPRT_OPS_BC_UP ++int xprt_rdma_bc_up(struct svc_serv *, struct net *); ++#endif + size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); ++#ifdef HAVE_RPC_XPRT_OPS_BC_NUM_SLOTS + unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *); ++#endif + int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); + void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); + int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-infiniband-core-sysfs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-infiniband-core-sysfs.c.patch new file mode 100644 index 0000000..11634b2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-infiniband-core-sysfs.c.patch @@ -0,0 +1,79 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/sysfs.c + +Change-Id: Iba19aef565bedc0c9532e52c2c9c2ae76a4514d0 +--- + drivers/infiniband/core/sysfs.c | 29 +++++++++++++++-------------- + 1 file changed, 15 insertions(+), 14 deletions(-) + +--- a/drivers/infiniband/core/sysfs.c ++++ b/drivers/infiniband/core/sysfs.c +@@ -115,19 +115,6 @@ static ssize_t port_attr_show(struct kob + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); + } + +-static ssize_t port_attr_store(struct kobject *kobj, +- struct attribute *attr, +- const char *buf, size_t count) +-{ +- struct ib_port_attribute *port_attr = +- container_of(attr, struct ib_port_attribute, attr); +- struct ib_port *p = container_of(kobj, struct ib_port, kobj); +- +- if (!port_attr->store) +- return -EIO; +- return port_attr->store(p->ibdev, p->port_num, port_attr, buf, count); +-} +- + struct ib_device *ib_port_sysfs_get_ibdev_kobj(struct kobject *kobj, + u32 *port_num) + { +@@ -138,9 +125,12 @@ struct ib_device *ib_port_sysfs_get_ibde + } + EXPORT_SYMBOL(ib_port_sysfs_get_ibdev_kobj); + ++#ifdef CONFIG_COMPAT_IS_CONST_KOBJECT_SYSFS_OPS + static const struct sysfs_ops port_sysfs_ops = { ++#else ++static struct sysfs_ops port_sysfs_ops = { ++#endif + .show = port_attr_show, +- .store = port_attr_store + }; + + static ssize_t hw_stat_device_show(struct device *dev, +@@ -206,7 +196,11 @@ static ssize_t gid_attr_show(struct kobj + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); + } + ++#ifdef CONFIG_COMPAT_IS_CONST_KOBJECT_SYSFS_OPS + static const struct sysfs_ops gid_attr_sysfs_ops = { ++#else ++static struct sysfs_ops gid_attr_sysfs_ops = { ++#endif + .show = gid_attr_show + }; + +@@ -451,7 +445,10 @@ static struct attribute *port_default_at + &ib_port_attr_link_layer.attr, + NULL + }; ++ ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS + ATTRIBUTE_GROUPS(port_default); ++#endif + + static ssize_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) + { +@@ -801,7 +798,11 @@ static void ib_port_gid_attr_release(str + static struct kobj_type port_type = { + .release = ib_port_release, + .sysfs_ops = &port_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS + .default_groups = port_default_groups, ++#else ++ .default_attrs = port_default_attrs ++#endif + }; + + static struct kobj_type gid_attr_type = { diff --git 
a/src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-nvme-host-fault_inject.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-nvme-host-fault_inject.c.patch new file mode 100644 index 0000000..ea1b350 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0271-BACKPORT-drivers-nvme-host-fault_inject.c.patch @@ -0,0 +1,22 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: drivers/nvme/host/fault_inject.c + +Change-Id: I342924da6f47877e5d956edd28dbe6aa45edb82b +--- + drivers/nvme/host/fault_inject.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/nvme/host/fault_inject.c ++++ b/drivers/nvme/host/fault_inject.c +@@ -56,7 +56,11 @@ void nvme_fault_inject_fini(struct nvme_ + + void nvme_should_fail(struct request *req) + { ++#ifdef HAVE_REQ_RQ_DISK ++ struct gendisk *disk = req->rq_disk; ++#else + struct gendisk *disk = req->q->disk; ++#endif + struct nvme_fault_inject *fault_inject = NULL; + u16 status; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch b/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch new file mode 100644 index 0000000..6eb7bf0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch @@ -0,0 +1,33 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/devlink.h + +Change-Id: Ie5812c36cd8b3c1c6c8196696ace1c11c7c753fa +--- + drivers/net/ethernet/mellanox/mlx5/core/devlink.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h +@@ -29,16 +29,22 @@ struct mlx5_devlink_trap { + struct list_head list; + }; + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + struct mlx5_core_dev; + void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port); + int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev); + int mlx5_devlink_traps_get_action(struct mlx5_core_dev *dev, int trap_id, + enum devlink_trap_action *action); ++#endif + + struct devlink *mlx5_devlink_alloc(struct device *dev); + void mlx5_devlink_free(struct devlink *devlink); ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + int mlx5_devlink_register(struct devlink *devlink); ++#else ++int mlx5_devlink_register(struct devlink *devlink, struct device *pdev); ++#endif + void mlx5_devlink_unregister(struct devlink *devlink); + + int diff --git a/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..9742fac --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,1411 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_tc.c + +Change-Id: I77671164640664e1f5efd0d4089304646e82b23e +--- + .../net/ethernet/mellanox/mlx5/core/en_tc.c | 481 +++++++++++++++++- + 1 file changed, 469 insertions(+), 12 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1421,8 +1422,10 @@ static int mlx5e_hairpin_get_prio(struct + + #ifdef CONFIG_MLX5_CORE_EN_DCB + if (priv->dcbx_dp.trust_state != 
MLX5_QPTS_TRUST_PCP) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "only PCP trust state supported for hairpin"); ++#endif + return -EOPNOTSUPP; + } + #endif +@@ -1438,8 +1441,10 @@ static int mlx5e_hairpin_get_prio(struct + if (!vlan_present || !prio_mask) { + prio_val = 0; + } else if (prio_mask != 0x7) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "masked priority match not supported for hairpin"); ++#endif + return -EOPNOTSUPP; + } + +@@ -1932,12 +1937,16 @@ static int mlx5e_hairpin_flow_add(struct + + peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex); + if (IS_ERR(peer_mdev)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "invalid ifindex of mirred device"); ++#endif + return PTR_ERR(peer_mdev); + } + + if (!MLX5_CAP_GEN(priv->mdev, hairpin) || !MLX5_CAP_GEN(peer_mdev, hairpin)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "hairpin is not supported"); ++#endif + return -EOPNOTSUPP; + } + +@@ -2526,7 +2535,9 @@ set_encap_dests(struct mlx5e_priv *priv, + mirred_ifindex = parse_attr->mirred_ifindex[out_index]; + out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex); + if (!out_dev) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found"); ++#endif + err = -ENODEV; + goto out; + } +@@ -2548,7 +2559,9 @@ set_encap_dests(struct mlx5e_priv *priv, + } + + if (*vf_tun && esw_attr->out_count > 1) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported"); ++#endif + err = -EOPNOTSUPP; + goto out; + } +@@ -2609,16 +2622,20 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv + */ + max_chain = mlx5_chains_get_chain_range(esw_chains(esw)); + if (!mlx5e_is_ft_flow(flow) && attr->chain > max_chain) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Requested chain is out of supported range"); ++#endif + err = -EOPNOTSUPP; + goto err_out; + } + + max_prio = mlx5_chains_get_prio_range(esw_chains(esw)); + if (attr->prio > max_prio) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Requested priority is out of supported range"); ++#endif + err = -EOPNOTSUPP; + goto err_out; + } +@@ -2660,15 +2677,19 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv + struct mlx5e_tc_int_port *int_port; + + if (attr->chain) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Internal port rule is only supported on chain 0"); ++#endif + err = -EOPNOTSUPP; + goto err_out; + } + + if (attr->dest_chain) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Internal port rule offload doesn't support goto action"); ++#endif + err = -EOPNOTSUPP; + goto err_out; + } +@@ -2916,8 +2937,10 @@ enc_opts_is_dont_care_or_full_match(stru + + if (opt->opt_class != htons(U16_MAX) || + opt->type != U8_MAX) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Partial match of tunnel options in chain > 0 isn't supported"); ++#endif + netdev_warn(priv->netdev, + "Partial match of tunnel options in chain > 0 isn't supported"); + return -EOPNOTSUPP; +@@ -2948,7 +2971,11 @@ static int mlx5e_get_flow_tunnel_id(stru + struct net_device *filter_dev) + { + struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#else ++ struct netlink_ext_ack *extack = NULL; ++#endif + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts; + struct flow_match_enc_opts enc_opts_match; + 
struct tunnel_match_enc_opts tun_enc_opts; +@@ -2967,6 +2994,7 @@ static int mlx5e_get_flow_tunnel_id(stru + uplink_priv = &uplink_rpriv->uplink_priv; + + memset(&tunnel_key, 0, sizeof(tunnel_key)); ++#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_CONTROL + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, + &tunnel_key.enc_control); + if (tunnel_key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) +@@ -2980,6 +3008,7 @@ static int mlx5e_get_flow_tunnel_id(stru + &tunnel_key.enc_tp); + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, + &tunnel_key.enc_key_id); ++#endif + tunnel_key.filter_ifindex = filter_dev->ifindex; + + err = mapping_add(uplink_priv->tunnel_mapping, &tunnel_key, &tun_id); +@@ -3135,7 +3164,9 @@ static int mlx5e_tc_verify_tunnel_ecn(st + { + u8 outer_ecn_mask = 0, outer_ecn_key = 0, inner_ecn_mask = 0, inner_ecn_key = 0; + struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct flow_match_ip match; + + *match_inner_ecn = true; +@@ -3153,7 +3184,9 @@ static int mlx5e_tc_verify_tunnel_ecn(st + } + + if (outer_ecn_mask != 0 && outer_ecn_mask != INET_ECN_MASK) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Partial match on enc_tos ecn bits isn't supported"); ++#endif + netdev_warn(priv->netdev, "Partial match on enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } +@@ -3162,16 +3195,20 @@ static int mlx5e_tc_verify_tunnel_ecn(st + if (!inner_ecn_mask) + return 0; + ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on tos ecn bits without also matching enc_tos ecn bits isn't supported"); ++#endif + netdev_warn(priv->netdev, + "Matching on tos ecn bits without also matching enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } + + if (inner_ecn_mask && inner_ecn_mask != INET_ECN_MASK) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Partial match on tos ecn bits with match on enc_tos ecn bits isn't supported"); ++#endif + netdev_warn(priv->netdev, + "Partial match on tos ecn bits with match on enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; +@@ -3185,7 +3222,9 @@ static int mlx5e_tc_verify_tunnel_ecn(st + if (outer_ecn_key == INET_ECN_ECT_1) { + /* inner ecn might change by DECAP action */ + ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Match on enc_tos ecn = ECT(1) isn't supported"); ++#endif + netdev_warn(priv->netdev, "Match on enc_tos ecn = ECT(1) isn't supported"); + return -EOPNOTSUPP; + } +@@ -3195,8 +3234,10 @@ static int mlx5e_tc_verify_tunnel_ecn(st + + if (inner_ecn_key != INET_ECN_CE) { + /* Can't happen in software, as packet ecn will be changed to CE after decap */ ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Match on tos enc_tos ecn = CE while match on tos ecn != CE isn't supported"); ++#endif + netdev_warn(priv->netdev, + "Match on tos enc_tos ecn = CE while match on tos ecn != CE isn't supported"); + return -EOPNOTSUPP; +@@ -3210,6 +3251,7 @@ static int mlx5e_tc_verify_tunnel_ecn(st + return 0; + } + ++#ifdef HAVE_TCF_TUNNEL_INFO + static int parse_tunnel_attr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec, +@@ -3220,12 +3262,16 @@ static int parse_tunnel_attr(struct mlx5 + { + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(filter_dev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; 
++#endif + bool needs_mapping, sets_mapping; + int err; + + if (!mlx5e_is_eswitch_flow(flow)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Match on tunnel is not supported"); ++#endif + return -EOPNOTSUPP; + } + +@@ -3235,8 +3281,10 @@ static int parse_tunnel_attr(struct mlx5 + + if ((needs_mapping || sets_mapping) && + !mlx5_eswitch_reg_c1_loopback_enabled(esw)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Chains on tunnel devices isn't supported without register loopback support"); ++#endif + netdev_warn(priv->netdev, + "Chains on tunnel devices isn't supported without register loopback support"); + return -EOPNOTSUPP; +@@ -3246,8 +3294,10 @@ static int parse_tunnel_attr(struct mlx5 + err = mlx5e_tc_tun_parse(filter_dev, priv, spec, f, + match_level); + if (err) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Failed to parse tunnel attributes"); ++#endif + netdev_warn(priv->netdev, + "Failed to parse tunnel attributes"); + return err; +@@ -3256,7 +3306,9 @@ static int parse_tunnel_attr(struct mlx5 + /* With mpls over udp we decapsulate using packet reformat + * object + */ ++#ifdef HAVE_NET_BAREUDP_H + if (!netif_is_bareudp(filter_dev)) ++#endif + flow->attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + err = mlx5e_tc_set_attr_rx_tun(flow, spec); + if (err) +@@ -3266,7 +3318,9 @@ static int parse_tunnel_attr(struct mlx5 + + tmp_spec = kvzalloc(sizeof(*tmp_spec), GFP_KERNEL); + if (!tmp_spec) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Failed to allocate memory for vxlan tmp spec"); ++#endif + netdev_warn(priv->netdev, "Failed to allocate memory for vxlan tmp spec"); + return -ENOMEM; + } +@@ -3275,7 +3329,9 @@ static int parse_tunnel_attr(struct mlx5 + err = mlx5e_tc_tun_parse(filter_dev, priv, tmp_spec, f, match_level); + if (err) { + kvfree(tmp_spec); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Failed to parse tunnel attributes"); ++#endif + netdev_warn(priv->netdev, "Failed to parse tunnel attributes"); + return err; + } +@@ -3290,6 +3346,7 @@ static int parse_tunnel_attr(struct mlx5 + + return mlx5e_get_flow_tunnel_id(priv, flow, f, filter_dev); + } ++#endif /* HAVE_TCF_TUNNEL_INFO */ + + static void *get_match_inner_headers_criteria(struct mlx5_flow_spec *spec) + { +@@ -3333,9 +3390,16 @@ static int mlx5e_flower_parse_meta(struc + struct flow_cls_offload *f) + { + struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#else ++ struct netlink_ext_ack *extack; ++#endif + struct net_device *ingress_dev; + struct flow_match_meta match; ++#ifndef HAVE_TC_CLS_OFFLOAD_EXTACK ++ extack = NULL; ++#endif + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_META)) + return 0; +@@ -3345,21 +3409,27 @@ static int mlx5e_flower_parse_meta(struc + return 0; + + if (match.mask->ingress_ifindex != 0xFFFFFFFF) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Unsupported ingress ifindex mask"); ++#endif + return -EOPNOTSUPP; + } + + ingress_dev = __dev_get_by_index(dev_net(filter_dev), + match.key->ingress_ifindex); + if (!ingress_dev) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Can't find the ingress port to match on"); ++#endif + return -ENOENT; + } + + if (ingress_dev != filter_dev) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Can't match on the ingress filter port"); ++#endif + return -EOPNOTSUPP; + } + +@@ -3374,9 +3444,10 
@@ static bool skip_key_basic(struct net_de + * label fields. However, the actual ethertype is IP so we want to + * avoid matching on this, otherwise we'll fail the match. + */ ++#ifdef HAVE_NET_BAREUDP_H + if (netif_is_bareudp(filter_dev) && f->common.chain_index == 0) + return true; +- ++#endif + return false; + } + +@@ -3388,7 +3459,11 @@ static int __parse_cls_flower(struct mlx + u8 *inner_match_level, u8 *outer_match_level, + bool *is_tunnel_flow) + { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#else ++ struct netlink_ext_ack *extack; ++#endif + void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, +@@ -3409,6 +3484,9 @@ static int __parse_cls_flower(struct mlx + u8 ip_proto = 0; + u8 *match_level; + int err; ++#ifndef HAVE_TC_CLS_OFFLOAD_EXTACK ++ extack = NULL; ++#endif + + fs_type = mlx5e_is_eswitch_flow(flow) ? FS_FT_FDB : FS_FT_NIC_RX; + match_level = outer_match_level; +@@ -3423,25 +3501,44 @@ static int __parse_cls_flower(struct mlx + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS) | ++#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_CONTROL + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | ++#endif + BIT(FLOW_DISSECTOR_KEY_TCP) | + BIT(FLOW_DISSECTOR_KEY_IP) | + BIT(FLOW_DISSECTOR_KEY_CT) | + BIT(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | ++#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_CONTROL + BIT(FLOW_DISSECTOR_KEY_ICMP) | ++#endif + BIT(FLOW_DISSECTOR_KEY_MPLS))) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Unsupported key"); ++#endif + netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n", + dissector->used_keys); + return -EOPNOTSUPP; + } + +- if (mlx5e_get_tc_tun(filter_dev)) { ++#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_CONTROL ++#if !defined(HAVE_TC_INDR_API) && !defined(CONFIG_COMPAT_KERNEL_4_14) ++ /* for old kernels we dont have real filter_dev, ++ * and mlx5e_get_tc_tun always return vxlan ++ */ ++ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) || ++ flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) || ++ flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID) || ++ flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS) || ++ flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_OPTS)) ++#else ++ if (mlx5e_get_tc_tun(filter_dev)) ++#endif ++ { + bool match_inner = false; + + err = parse_tunnel_attr(priv, flow, spec, f, filter_dev, +@@ -3464,6 +3561,7 @@ static int __parse_cls_flower(struct mlx + if (err) + return err; + } ++#endif + + err = mlx5e_flower_parse_meta(filter_dev, f); + if (err) +@@ -3480,6 +3578,15 @@ static int __parse_cls_flower(struct mlx + + if (match.mask->n_proto) + *match_level = MLX5_MATCH_L2; ++ ++#ifndef HAVE_FLOW_DISSECTOR_KEY_CVLAN ++ if (match.key->n_proto == htons(ETH_P_8021AD)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK ++ NL_SET_ERR_MSG_MOD(extack, "Matching on CVLAN is not supported"); ++#endif ++ return -EOPNOTSUPP; ++ } ++#endif + } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN) || + is_vlan_dev(filter_dev)) { +@@ -3543,8 +3650,10 @@ static int __parse_cls_flower(struct mlx + match.mask->vlan_tpid) { + if (!MLX5_CAP_FLOWTABLE_TYPE(priv->mdev, ft_field_support.outer_second_vid, + fs_type)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + 
NL_SET_ERR_MSG_MOD(extack, + "Matching on CVLAN is not supported"); ++#endif + return -EOPNOTSUPP; + } + +@@ -3605,7 +3714,9 @@ static int __parse_cls_flower(struct mlx + + /* the HW doesn't support frag first/later */ + if (match.mask->flags & FLOW_DIS_FIRST_FRAG) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Match on frag first/later is not supported"); ++#endif + return -EOPNOTSUPP; + } + +@@ -3706,8 +3817,10 @@ static int __parse_cls_flower(struct mlx + if (match.mask->ttl && + !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_ipv4_ttl)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on TTL is not supported"); ++#endif + return -EOPNOTSUPP; + } + +@@ -3746,8 +3859,10 @@ static int __parse_cls_flower(struct mlx + udp_dport, ntohs(match.key->dst)); + break; + default: ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Only UDP and TCP transports are supported for L4 matching"); ++#endif + netdev_err(priv->netdev, + "Only UDP and TCP transport are supported\n"); + return -EINVAL; +@@ -3769,6 +3884,7 @@ static int __parse_cls_flower(struct mlx + if (match.mask->flags) + *match_level = MLX5_MATCH_L4; + } ++#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_CONTROL + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP)) { + struct flow_match_icmp match; + +@@ -3777,8 +3893,10 @@ static int __parse_cls_flower(struct mlx + case IPPROTO_ICMP: + if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_ICMP)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Match on Flex protocols for ICMP is not supported"); ++#endif + return -EOPNOTSUPP; + } + MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_type, +@@ -3793,8 +3911,10 @@ static int __parse_cls_flower(struct mlx + case IPPROTO_ICMPV6: + if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_ICMPV6)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Match on Flex protocols for ICMPV6 is not supported"); ++#endif + return -EOPNOTSUPP; + } + MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_type, +@@ -3807,8 +3927,10 @@ static int __parse_cls_flower(struct mlx + match.key->code); + break; + default: ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Code and type matching only with ICMP and ICMPv6"); ++#endif + netdev_err(priv->netdev, + "Code and type matching only with ICMP and ICMPv6\n"); + return -EINVAL; +@@ -3818,15 +3940,20 @@ static int __parse_cls_flower(struct mlx + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3; + } + } ++#endif ++#ifdef HAVE_NET_BAREUDP_H + /* Currently supported only for MPLS over UDP */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) && + !netif_is_bareudp(filter_dev)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Matching on MPLS is supported only for MPLS over UDP"); ++#endif + netdev_err(priv->netdev, + "Matching on MPLS is supported only for MPLS over UDP\n"); + return -EOPNOTSUPP; + } ++#endif + + return 0; + } +@@ -3838,7 +3965,9 @@ static int parse_cls_flower(struct mlx5e + struct net_device *filter_dev) + { + u8 inner_match_level, outer_match_level, non_tunnel_match_level; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct mlx5_core_dev *dev = priv->mdev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; +@@ -3863,8 +3992,10 @@ static int parse_cls_flower(struct mlx5e + if (rep->vport != MLX5_VPORT_UPLINK && + 
(esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE && + esw->offloads.inline_mode < non_tunnel_match_level)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Flow is not offloaded due to min inline setting"); ++#endif + netdev_warn(priv->netdev, + "Flow is not offloaded due to min inline setting, required %d actual %d\n", + non_tunnel_match_level, esw->offloads.inline_mode); +@@ -4025,8 +4156,10 @@ static int offload_pedit_fields(struct m + continue; + + if (s_mask && a_mask) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "can't set and add to the same HW field"); ++#endif + netdev_warn(priv->netdev, + "mlx5: can't set and add to the same HW field (%x)\n", + f->field); +@@ -4066,8 +4199,10 @@ static int offload_pedit_fields(struct m + next_z = find_next_zero_bit(&mask, f->field_bsize, first); + last = find_last_bit(&mask, f->field_bsize); + if (first < next_z && next_z < last) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "rewrite of few sub-fields isn't supported"); ++#endif + netdev_warn(priv->netdev, + "mlx5: rewrite of few sub-fields (mask %lx) isn't offloaded\n", + mask); +@@ -4076,8 +4211,10 @@ static int offload_pedit_fields(struct m + + action = mlx5e_mod_hdr_alloc(priv->mdev, namespace, mod_acts); + if (IS_ERR(action)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "too many pedit actions, can't offload"); ++#endif + mlx5_core_warn(priv->mdev, + "mlx5: parsed %d pedit actions, can't do more\n", + mod_acts->num_actions); +@@ -4125,7 +4262,9 @@ static int verify_offload_pedit_fields(s + for (cmd = 0; cmd < __PEDIT_CMD_MAX; cmd++) { + cmd_masks = &parse_attr->hdrs[cmd].masks; + if (memcmp(cmd_masks, &zero_masks, sizeof(zero_masks))) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "attempt to offload an unsupported field"); ++#endif + netdev_warn(priv->netdev, "attempt to offload an unsupported field (cmd %d)\n", cmd); + print_hex_dump(KERN_WARNING, "mask: ", DUMP_PREFIX_ADDRESS, + 16, 1, cmd_masks, sizeof(zero_masks), true); +@@ -4143,6 +4282,7 @@ static int alloc_tc_pedit_action(struct + { + int err; + ++#ifdef HAVE_TCF_PEDIT_TCFP_KEYS_EX + err = offload_pedit_fields(priv, namespace, parse_attr, action_flags, extack); + if (err) + goto out_dealloc_parsed_actions; +@@ -4151,6 +4291,9 @@ static int alloc_tc_pedit_action(struct + if (err) + goto out_dealloc_parsed_actions; + ++#else /* HAVE_TCF_PEDIT_TCFP_KEYS_EX */ ++ return -EOPNOTSUPP; ++#endif /* HAVE_TCF_PEDIT_TCFP_KEYS_EX */ + return 0; + + out_dealloc_parsed_actions: +@@ -4199,8 +4342,10 @@ is_action_keys_supported(const struct fl + *modify_tuple = true; + + if (ct_flow && *modify_tuple) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "can't offload re-write of ipv4 address with action ct"); ++#endif + return false; + } + } else if (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP6) { +@@ -4217,16 +4362,20 @@ is_action_keys_supported(const struct fl + *modify_tuple = true; + + if (ct_flow && *modify_tuple) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "can't offload re-write of ipv6 address with action ct"); ++#endif + return false; + } + } else if (htype == FLOW_ACT_MANGLE_HDR_TYPE_TCP || + htype == FLOW_ACT_MANGLE_HDR_TYPE_UDP) { + *modify_tuple = true; + if (ct_flow) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "can't offload re-write of transport header ports with action ct"); ++#endif + return false; + } + } +@@ -4243,8 +4392,10 @@ static bool 
modify_tuple_supported(bool + return true; + + if (ct_flow) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "can't offload tuple modification with non-clear ct()"); ++#endif + netdev_info(priv->netdev, + "can't offload tuple modification with non-clear ct()"); + return false; +@@ -4255,8 +4406,10 @@ static bool modify_tuple_supported(bool + * we can't restore ct state + */ + if (mlx5_tc_ct_add_no_trk_match(spec)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "can't offload tuple modification with ct matches and no ct(clear) action"); ++#endif + netdev_info(priv->netdev, + "can't offload tuple modification with ct matches and no ct(clear) action"); + return false; +@@ -4309,8 +4462,10 @@ static bool modify_header_match_supporte + ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol); + if (modify_ip_header && ip_proto != IPPROTO_TCP && + ip_proto != IPPROTO_UDP && ip_proto != IPPROTO_ICMP) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "can't offload re-write of non TCP/UDP"); ++#endif + netdev_info(priv->netdev, "can't offload re-write of ip proto %d\n", + ip_proto); + return false; +@@ -4337,13 +4492,17 @@ actions_match_supported_fdb(struct mlx5e + /* All registers used by ct are cleared when using + * split rules. + */ ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Can't offload mirroring with action ct"); ++#endif + return false; + } + + if (esw_attr->split_count > 0 && !mlx5_esw_has_fwd_fdb(priv->mdev)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "current firmware doesn't support split rule for port mirroring"); ++#endif + netdev_warn_once(priv->netdev, + "current firmware doesn't support split rule for port mirroring\n"); + return false; +@@ -4367,31 +4526,41 @@ actions_match_supported(struct mlx5e_pri + + if (!(actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Rule must have at least one forward/drop action"); ++#endif + return false; + } + + if (!(~actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action"); ++#endif + return false; + } + + if (!(~actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action"); ++#endif + return false; + } + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); ++#endif + return false; + } + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); ++#endif + return false; + } + +@@ -4700,7 +4869,9 @@ parse_tc_actions(struct mlx5e_tc_act_par + act = *_act; + tc_act = mlx5e_tc_act_get(act->id, ns_type); + if (!tc_act) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Not implemented offload action"); ++#endif + err = -EOPNOTSUPP; + goto out_free; + } +@@ -4759,15 +4930,21 @@ flow_action_supported(struct flow_action + struct netlink_ext_ack *extack) + { + if (!flow_action_has_entries(flow_action)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + 
NL_SET_ERR_MSG_MOD(extack, "Flow action doesn't have any entries"); ++#endif + return -EINVAL; + } + ++#ifdef HAVE_FLOW_ACTION_HW_STATS_CHECK + if (!flow_action_hw_stats_check(flow_action, extack, + FLOW_ACTION_HW_STATS_DELAYED_BIT)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Flow action HW stats type is not supported"); ++#endif + return -EOPNOTSUPP; + } ++#endif + + return 0; + } +@@ -4939,16 +5116,20 @@ parse_tc_fdb_actions(struct mlx5e_priv * + /* Forward to/from internal port can only have 1 dest */ + if ((netif_is_ovs_master(filter_dev) || esw_attr->dest_int_port) && + esw_attr->out_count > 1) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Rules with internal port can have only one destination"); ++#endif + return -EOPNOTSUPP; + } + + /* Forward from tunnel/internal port to internal port is not supported */ + if ((mlx5e_get_tc_tun(filter_dev) || netif_is_ovs_master(filter_dev)) && + esw_attr->dest_int_port) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Forwarding from tunnel/internal port to internal port is not supported"); ++#endif + return -EOPNOTSUPP; + } + +@@ -4989,13 +5170,47 @@ static const struct rhashtable_params tc + .automatic_shrinking = true, + }; + ++#ifdef CONFIG_COMPAT_CLS_FLOWER_MOD ++static void get_new_flags(struct mlx5e_priv *priv, unsigned long *flags) ++{ ++ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++ ++ if (mlx5e_eswitch_rep(priv->netdev) && ++ MLX5_VPORT_MANAGER(priv->mdev) && esw->mode == MLX5_ESWITCH_OFFLOADS) ++ *flags |= MLX5_TC_FLAG(ESW_OFFLOAD); ++} ++#elif !defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD) ++static void get_new_flags(struct mlx5e_priv *priv, unsigned long *flags) ++{ ++ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; ++ ++ if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS) ++ *flags |= MLX5_TC_FLAG(ESW_OFFLOAD); ++} ++#endif ++ + static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv, + unsigned long flags) + { + struct mlx5e_rep_priv *rpriv; + ++#ifdef CONFIG_COMPAT_CLS_FLOWER_MOD ++ if (mlx5e_eswitch_rep(priv->netdev) && ++ MLX5_VPORT_MANAGER(priv->mdev) && ++ priv->mdev->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS) { ++#elif !defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD) ++ if ((flags & MLX5_TC_FLAG(ESW_OFFLOAD)) || ++ (priv->mdev->priv.eswitch && ++ priv->mdev->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS)) { ++#else + if (flags & MLX5_TC_FLAG(ESW_OFFLOAD)) { ++#endif + rpriv = priv->ppriv; ++#if !defined(CONFIG_COMPAT_CLS_FLOWER_MOD) && \ ++ !defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD) ++ if (!rpriv || !rpriv->tc_ht.tbl) ++ return &priv->fs.tc.ht; ++#endif + return &rpriv->tc_ht; + } else /* NIC offload */ + return &priv->fs.tc.ht; +@@ -5004,23 +5219,33 @@ static struct rhashtable *get_tc_ht(stru + static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow) + { + struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr; ++#ifdef HAVE_QDISC_SUPPORTS_BLOCK_SHARING + struct mlx5_flow_attr *attr = flow->attr; + bool is_rep_ingress = esw_attr->in_rep->vport != MLX5_VPORT_UPLINK && + flow_flag_test(flow, INGRESS); + bool act_is_encap = !!(attr->action & + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT); ++#endif + bool esw_paired = mlx5_devcom_is_paired(esw_attr->in_mdev->priv.devcom, + MLX5_DEVCOM_ESW_OFFLOADS); + + if (!esw_paired) + return false; + ++#ifdef HAVE_QDISC_SUPPORTS_BLOCK_SHARING + if ((mlx5_lag_is_sriov(esw_attr->in_mdev) || + mlx5_lag_is_multipath(esw_attr->in_mdev)) && +- 
(is_rep_ingress || act_is_encap)) ++ (is_rep_ingress || act_is_encap ++#ifdef HAVE_TC_SETUP_CB_EGDEV_REGISTER ++ || (flow->flags & MLX5_TC_FLAG(EGRESS)) ++#endif ++ )) + return true; + + return false; ++#else ++ return (mlx5_lag_is_sriov(esw_attr->in_mdev) || mlx5_lag_is_multipath(esw_attr->in_mdev)); ++#endif + } + + struct mlx5_flow_attr * +@@ -5092,8 +5317,16 @@ mlx5e_flow_attr_init(struct mlx5_flow_at + struct flow_cls_offload *f) + { + attr->parse_attr = parse_attr; ++#ifdef CONFIG_COMPAT_PRIO_CHAIN_SUPPORT + attr->chain = f->common.chain_index; ++#ifdef CONFIG_COMPAT_TC_PRIO_IS_MAJOR + attr->prio = f->common.prio; ++#else ++ attr->prio = TC_H_MAJ(f->common.prio) >> 16; ++#endif ++#else ++ attr->prio = 1; ++#endif + } + + static void +@@ -5127,12 +5360,24 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv * + struct mlx5_eswitch_rep *in_rep, + struct mlx5_core_dev *in_mdev) + { +- struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++ struct flow_rule *rule; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#else ++ struct netlink_ext_ack *extack = NULL; ++#endif + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_tc_flow *flow; + int attr_size, err; + ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ rule = alloc_flow_rule(&f); ++ if (IS_ERR(rule)) ++ return ERR_PTR(PTR_ERR(rule)); ++#else ++ rule = flow_cls_offload_flow_rule(f); ++#endif ++ + flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_ESWITCH); + attr_size = sizeof(struct mlx5_esw_flow_attr); + err = mlx5e_alloc_flow(priv, attr_size, f, flow_flags, +@@ -5171,11 +5416,18 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv * + add_unready_flow(flow); + } + ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ free_flow_rule(rule); ++#endif ++ + return flow; + + err_free: + mlx5e_flow_put(priv, flow); + out: ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ free_flow_rule(rule); ++#endif + return ERR_PTR(err); + } + +@@ -5275,18 +5527,32 @@ mlx5e_add_nic_flow(struct mlx5e_priv *pr + struct net_device *filter_dev, + struct mlx5e_tc_flow **__flow) + { +- struct flow_rule *rule = flow_cls_offload_flow_rule(f); ++ struct flow_rule *rule; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + struct netlink_ext_ack *extack = f->common.extack; ++#else ++ struct netlink_ext_ack *extack = NULL; ++#endif + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_tc_flow *flow; +- int attr_size, err; ++ int attr_size, err = -EOPNOTSUPP; + ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ rule = alloc_flow_rule(&f); ++ if (IS_ERR(rule)) ++ return PTR_ERR(rule); ++#else ++ rule = flow_cls_offload_flow_rule(f); ++#endif ++ ++#if defined(HAVE_TC_CLS_OFFLOAD_EXTACK) && defined(CONFIG_COMPAT_PRIO_CHAIN_SUPPORT) + if (!MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ignore_flow_level)) { + if (!tc_cls_can_offload_and_chain0(priv->netdev, &f->common)) +- return -EOPNOTSUPP; ++ goto out; + } else if (!tc_can_offload_extack(priv->netdev, f->common.extack)) { +- return -EOPNOTSUPP; ++ goto out; + } ++#endif + + flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_NIC); + attr_size = sizeof(struct mlx5_nic_flow_attr); +@@ -5317,6 +5583,9 @@ mlx5e_add_nic_flow(struct mlx5e_priv *pr + goto err_free; + + flow_flag_set(flow, OFFLOADED); ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ free_flow_rule(rule); ++#endif + *__flow = flow; + + return 0; +@@ -5326,6 +5595,9 @@ err_free: + mlx5e_mod_hdr_dealloc(&parse_attr->mod_hdr_acts); + mlx5e_flow_put(priv, flow); + out: ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ free_flow_rule(rule); ++#endif + return err; + } + +@@ -5342,8 +5614,10 @@ mlx5e_tc_add_flow(struct mlx5e_priv *pri + + 
get_flags(flags, &flow_flags); + ++#if defined(HAVE_TC_CLS_OFFLOAD_EXTACK) && defined(HAVE_TC_CLS_FLOWER_OFFLOAD_COMMON) + if (!tc_can_offload_extack(priv->netdev, f->common.extack)) + return -EOPNOTSUPP; ++#endif + + if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS) + err = mlx5e_add_fdb_flow(priv, f, flow_flags, +@@ -5368,12 +5642,19 @@ static bool is_flow_rule_duplicate_allow + int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags) + { +- struct netlink_ext_ack *extack = f->common.extack; ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK ++ struct netlink_ext_ack *extack = f->common.extack; ++#endif + struct rhashtable *tc_ht = get_tc_ht(priv, flags); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_tc_flow *flow; + int err = 0; + ++#if defined(CONFIG_COMPAT_CLS_FLOWER_MOD) || \ ++ (!defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD)) ++ get_new_flags(priv, &flags); ++#endif ++ + if (!mlx5_esw_hold(priv->mdev)) + return -EBUSY; + +@@ -5388,8 +5669,15 @@ int mlx5e_configure_flower(struct net_de + if (is_flow_rule_duplicate_allowed(dev, rpriv) && flow->orig_dev != dev) + goto rcu_unlock; + ++#if !defined(HAVE_TC_INDR_API) ++ if(flow->orig_dev != dev) ++ goto out; ++#endif ++ ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "flow cookie already exists, ignoring"); ++#endif + netdev_warn_once(priv->netdev, + "flow cookie %lx already exists, ignoring\n", + f->cookie); +@@ -5401,7 +5689,9 @@ rcu_unlock: + if (flow) + goto out; + ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5e_configure_flower(f); ++#endif + err = mlx5e_tc_add_flow(priv, f, flags, dev, &flow); + if (err) + goto out; +@@ -5427,6 +5717,10 @@ out: + return err; + } + ++#ifdef CONFIG_COMPAT_CLS_FLOWER_MOD ++EXPORT_SYMBOL(mlx5e_configure_flower); ++#endif ++ + static bool same_flow_direction(struct mlx5e_tc_flow *flow, int flags) + { + bool dir_ingress = !!(flags & MLX5_TC_FLAG(INGRESS)); +@@ -5460,7 +5754,9 @@ int mlx5e_delete_flower(struct net_devic + rhashtable_remove_fast(tc_ht, &flow->node, tc_ht_params); + rcu_read_unlock(); + +- trace_mlx5e_delete_flower(f); ++#ifndef MLX_DISABLE_TRACEPOINTS ++ trace_mlx5e_delete_flower(f); ++#endif + mlx5e_flow_put(priv, flow); + + mlx5_esw_put(priv->mdev); +@@ -5470,6 +5766,9 @@ errout: + rcu_read_unlock(); + return err; + } ++#ifdef CONFIG_COMPAT_CLS_FLOWER_MOD ++EXPORT_SYMBOL(mlx5e_delete_flower); ++#endif + + int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags) +@@ -5479,11 +5778,21 @@ int mlx5e_stats_flower(struct net_device + struct mlx5_eswitch *peer_esw; + struct mlx5e_tc_flow *flow; + struct mlx5_fc *counter; ++#if !defined(HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD) && \ ++ !defined(HAVE_TCF_EXTS_STATS_UPDATE) ++ struct tc_action *a; ++ LIST_HEAD(actions); ++#endif + u64 lastuse = 0; + u64 packets = 0; + u64 bytes = 0; + int err = 0; + ++#if defined(CONFIG_COMPAT_CLS_FLOWER_MOD) || \ ++ (!defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD)) ++ get_new_flags(priv, &flags); ++#endif ++ + rcu_read_lock(); + flow = mlx5e_flow_get(rhashtable_lookup(tc_ht, &f->cookie, + tc_ht_params)); +@@ -5530,14 +5839,52 @@ int mlx5e_stats_flower(struct net_device + no_peer_counter: + mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + out: ++#ifdef HAVE_FLOW_STATS_UPDATE_6_PARAMS + flow_stats_update(&f->stats, bytes, packets, 0, lastuse, + FLOW_ACTION_HW_STATS_DELAYED); ++#elif 
defined(HAVE_FLOW_STATS_UPDATE_5_PARAMS) ++ flow_stats_update(&f->stats, bytes, packets, lastuse, ++ FLOW_ACTION_HW_STATS_DELAYED); ++#elif defined(HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD) ++ flow_stats_update(&f->stats, bytes, packets, lastuse); ++#elif defined(HAVE_TCF_EXTS_STATS_UPDATE) ++ tcf_exts_stats_update(f->exts, bytes, packets, lastuse); ++#else ++ preempt_disable(); ++ ++#ifdef HAVE_TCF_EXTS_TO_LIST ++ tcf_exts_to_list(f->exts, &actions); ++ list_for_each_entry(a, &actions, list) ++#else ++ tc_for_each_action(a, f->exts) ++#endif ++#ifdef HAVE_TCF_ACTION_STATS_UPDATE ++ tcf_action_stats_update(a, bytes, packets, lastuse); ++#else ++ { ++ struct tcf_act_hdr *h = a->priv; ++ ++ spin_lock(&h->tcf_lock); ++ h->tcf_tm.lastuse = max_t(u64, h->tcf_tm.lastuse, lastuse); ++ h->tcf_bstats.bytes += bytes; ++ h->tcf_bstats.packets += packets; ++ spin_unlock(&h->tcf_lock); ++ } ++#endif ++ preempt_enable(); ++#endif /* HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD */ ++#ifndef MLX_DISABLE_TRACEPOINTS + trace_mlx5e_stats_flower(f); ++#endif + errout: + mlx5e_flow_put(priv, flow); + return err; + } ++#ifdef CONFIG_COMPAT_CLS_FLOWER_MOD ++EXPORT_SYMBOL(mlx5e_stats_flower); ++#endif + ++#ifdef HAVE_TC_CLSMATCHALL_STATS + static int apply_police_params(struct mlx5e_priv *priv, u64 rate, + struct netlink_ext_ack *extack) + { +@@ -5549,8 +5896,10 @@ static int apply_police_params(struct ml + + vport_num = rpriv->rep->vport; + if (vport_num >= MLX5_VPORT_ECPF) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, + "Ingress rate limit is supported only for Eswitch ports connected to VFs"); ++#endif + return -EOPNOTSUPP; + } + +@@ -5568,8 +5917,10 @@ static int apply_police_params(struct ml + } + + err = mlx5_esw_qos_modify_vport_rate(esw, vport_num, rate_mbps); ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + if (err) + NL_SET_ERR_MSG_MOD(extack, "failed applying action to hardware"); ++#endif + + return err; + } +@@ -5584,27 +5935,41 @@ static int scan_tc_matchall_fdb_actions( + int i; + + if (!flow_action_has_entries(flow_action)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "matchall called with no action"); ++#endif + return -EINVAL; + } + ++#ifdef HAVE_FLOW_OFFLOAD_HAS_ONE_ACTION + if (!flow_offload_has_one_action(flow_action)) { ++#else ++ if (flow_action->num_entries != 1) { ++#endif ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "matchall policing support only a single action"); ++#endif + return -EOPNOTSUPP; + } + ++#ifdef HAVE_FLOW_ACTION_HW_STATS_CHECK + if (!flow_action_basic_hw_stats_check(flow_action, extack)) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "Flow action HW stats type is not supported"); ++#endif + return -EOPNOTSUPP; + } ++#endif + + flow_action_for_each(i, act, flow_action) { + switch (act->id) { + case FLOW_ACTION_POLICE: ++#ifdef HAVE_FLOW_ACTION_POLICE_RATE_PKT_PS + if (act->police.rate_pkt_ps) { + NL_SET_ERR_MSG_MOD(extack, "QoS offload not support packets per second"); + return -EOPNOTSUPP; + } ++#endif + err = apply_police_params(priv, act->police.rate_bytes_ps, extack); + if (err) + return err; +@@ -5612,7 +5977,9 @@ static int scan_tc_matchall_fdb_actions( + rpriv->prev_vf_vport_stats = priv->stats.vf_vport; + break; + default: ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "mlx5 supports only police action for matchall"); ++#endif + return -EOPNOTSUPP; + } + } +@@ -5624,13 +5991,35 @@ int mlx5e_tc_configure_matchall(struct m + struct tc_cls_matchall_offload *ma) + { + struct 
netlink_ext_ack *extack = ma->common.extack; ++ int prio = ma->common.prio; ++ struct flow_rule *rule; ++ int err; + +- if (ma->common.prio != 1) { ++#ifndef CONFIG_COMPAT_TC_PRIO_IS_MAJOR ++ prio = TC_H_MAJ(prio) >> 16; ++#endif ++ ++ if (prio != 1) { ++#ifdef HAVE_TC_CLS_OFFLOAD_EXTACK + NL_SET_ERR_MSG_MOD(extack, "only priority 1 is supported"); ++#endif + return -EINVAL; + } + +- return scan_tc_matchall_fdb_actions(priv, &ma->rule->action, extack); ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ rule = __alloc_flow_rule(ma->exts, NULL, 0); ++ if (IS_ERR(rule)) ++ return PTR_ERR(rule); ++#else ++ rule = ma->rule; ++#endif ++ ++ err = scan_tc_matchall_fdb_actions(priv, &rule->action, extack); ++#ifndef HAVE_TC_SETUP_FLOW_ACTION ++ free_flow_rule(rule); ++#endif ++ ++ return err; + } + + int mlx5e_tc_delete_matchall(struct mlx5e_priv *priv, +@@ -5653,9 +6042,19 @@ void mlx5e_tc_stats_matchall(struct mlx5 + dpkts = cur_stats.rx_packets - rpriv->prev_vf_vport_stats.rx_packets; + dbytes = cur_stats.rx_bytes - rpriv->prev_vf_vport_stats.rx_bytes; + rpriv->prev_vf_vport_stats = cur_stats; ++#ifdef HAVE_FLOW_STATS_UPDATE_6_PARAMS + flow_stats_update(&ma->stats, dbytes, dpkts, 0, jiffies, + FLOW_ACTION_HW_STATS_DELAYED); ++#elif defined(HAVE_FLOW_STATS_UPDATE_5_PARAMS) ++ flow_stats_update(&ma->stats, dbytes, dpkts, jiffies, ++ FLOW_ACTION_HW_STATS_DELAYED); ++#elif defined(HAVE_TC_SETUP_FLOW_ACTION) ++ flow_stats_update(&ma->stats, dbytes, dpkts, jiffies); ++#else ++ tcf_exts_stats_update(ma->exts, dbytes, dpkts, jiffies); ++#endif + } ++#endif /* HAVE_TC_CLSMATCHALL_STATS */ + + static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv, + struct mlx5e_priv *peer_priv) +@@ -5897,6 +6296,10 @@ int mlx5e_tc_ht_init(struct rhashtable * + void mlx5e_tc_ht_cleanup(struct rhashtable *tc_ht) + { + rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL); ++#if !defined (CONFIG_COMPAT_CLS_FLOWER_MOD) && \ ++ !defined(HAVE_TC_BLOCK_OFFLOAD) && !defined(HAVE_FLOW_BLOCK_OFFLOAD) ++ tc_ht->tbl = NULL; ++#endif + } + + int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) +@@ -6024,10 +6427,28 @@ void mlx5e_tc_reoffload_flows_work(struc + mutex_unlock(&rpriv->unready_flows_lock); + } + ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) ++#ifdef CONFIG_MLX5_ESWITCH ++#if defined(HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv, ++#else ++int mlx5e_setup_tc_cls_flower(struct net_device *dev, ++#endif + struct flow_cls_offload *cls_flower, + unsigned long flags) + { ++#ifndef HAVE_TC_CLS_CAN_OFFLOAD_AND_CHAIN0 ++#ifdef HAVE_TC_BLOCK_OFFLOAD ++ if (cls_flower->common.chain_index) ++#else ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ ++ if (!is_classid_clsact_ingress(cls_flower->common.classid) || ++ cls_flower->common.chain_index) ++#endif ++ return -EOPNOTSUPP; ++#endif ++ + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return mlx5e_configure_flower(priv->netdev, priv, cls_flower, +@@ -6043,6 +6464,7 @@ static int mlx5e_setup_tc_cls_flower(str + } + } + ++#if defined(HAVE_TC_BLOCK_OFFLOAD) || defined(HAVE_FLOW_BLOCK_OFFLOAD) + int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) + { +@@ -6052,6 +6474,11 @@ int mlx5e_setup_tc_block_cb(enum tc_setu + if (!priv->netdev || !netif_device_present(priv->netdev)) + return -EOPNOTSUPP; + ++#if defined(HAVE_TC_CLS_OFFLOAD_EXTACK) && !defined(CONFIG_COMPAT_PRIO_CHAIN_SUPPORT) ++ if 
(!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) ++ return -EOPNOTSUPP; ++#endif ++ + if (mlx5e_is_uplink_rep(priv)) + flags |= MLX5_TC_FLAG(ESW_OFFLOAD); + else +@@ -6110,3 +6537,33 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe + + return true; + } ++ ++#ifndef HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE ++int mlx5e_setup_tc_block(struct net_device *dev, ++ struct tc_block_offload *f) ++{ ++ struct mlx5e_priv *priv = netdev_priv(dev); ++ ++ if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) ++ return -EOPNOTSUPP; ++ ++ switch (f->command) { ++ case TC_BLOCK_BIND: ++ return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb, ++ priv, priv ++#ifdef HAVE_TC_BLOCK_OFFLOAD_EXTACK ++ , f->extack ++#endif ++ ); ++ case TC_BLOCK_UNBIND: ++ tcf_block_cb_unregister(f->block, mlx5e_setup_tc_block_cb, ++ priv); ++ return 0; ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++#endif /* HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE */ ++#endif /* HAVE_TC_BLOCK_OFFLOAD || HAVE_FLOW_BLOCK_OFFLOAD */ ++#endif /*ESWITCH */ ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-nvme-host-trace.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-nvme-host-trace.h.patch new file mode 100644 index 0000000..71c5771 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0272-BACKPORT-drivers-nvme-host-trace.h.patch @@ -0,0 +1,46 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: drivers/nvme/host/trace.h + +Change-Id: I1182eb5b528b083c383128a5ac2a0968a722d3e6 +--- + drivers/nvme/host/trace.h | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/drivers/nvme/host/trace.h ++++ b/drivers/nvme/host/trace.h +@@ -68,7 +68,11 @@ TRACE_EVENT(nvme_setup_cmd, + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = !!blk_integrity_rq(req); + __entry->fctype = cmd->fabrics.fctype; ++#ifdef HAVE_REQ_RQ_DISK ++ __assign_disk_name(__entry->disk, req->rq_disk); ++#else + __assign_disk_name(__entry->disk, req->q->disk); ++#endif + memcpy(__entry->cdw10, &cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), +@@ -103,7 +107,11 @@ TRACE_EVENT(nvme_complete_rq, + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; ++#ifdef HAVE_REQ_RQ_DISK ++ __assign_disk_name(__entry->disk, req->rq_disk); ++#else + __assign_disk_name(__entry->disk, req->q->disk); ++#endif + ), + TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x", + __entry->ctrl_id, __print_disk_name(__entry->disk), +@@ -153,7 +161,11 @@ TRACE_EVENT(nvme_sq, + ), + TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; ++#ifdef HAVE_REQ_RQ_DISK ++ __assign_disk_name(__entry->disk, req->rq_disk); ++#else + __assign_disk_name(__entry->disk, req->q->disk); ++#endif + __entry->qid = nvme_req_qid(req); + __entry->sq_head = le16_to_cpu(sq_head); + __entry->sq_tail = sq_tail; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch new file mode 100644 index 0000000..84ad985 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-dev.patch @@ -0,0 +1,797 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/devlink.c + +Change-Id: Iaa6fe411197e75608d3debd9585b45e1a5bfb944 +--- + .../net/ethernet/mellanox/mlx5/core/devlink.c | 370 ++++++++++++++++-- + 1 file changed, 334 
insertions(+), 36 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +@@ -4,7 +4,9 @@ + #include + + #include "mlx5_core.h" ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + #include "fw_reset.h" ++#endif + #include "fs_core.h" + #include "eswitch.h" + #include "mlx5_devm.h" +@@ -13,15 +15,59 @@ + #include "sf/sf.h" + #include "en/tc_ct.h" + ++#ifdef HAVE_DEVLINK_DRIVERINIT_VAL ++static unsigned int esw_offloads_num_big_groups = ESW_OFFLOADS_DEFAULT_NUM_GROUPS; ++#else ++unsigned int esw_offloads_num_big_groups = ESW_OFFLOADS_DEFAULT_NUM_GROUPS; ++#endif ++module_param_named(num_of_groups, esw_offloads_num_big_groups, ++ uint, 0644); ++MODULE_PARM_DESC(num_of_groups, ++ "Eswitch offloads number of big groups in FDB table. Valid range 1 - 1024. Default 15"); ++ ++#ifdef HAVE_DEVLINK_HAS_FLASH_UPDATE + static int mlx5_devlink_flash_update(struct devlink *devlink, ++#ifdef HAVE_FLASH_UPDATE_GET_3_PARAMS + struct devlink_flash_update_params *params, ++#else ++ const char *file_name, ++ const char *component, ++#endif + struct netlink_ext_ack *extack) + { + struct mlx5_core_dev *dev = devlink_priv(devlink); + ++#ifdef HAVE_DEVLINK_FLASH_UPDATE_PARAMS_HAS_STRUCT_FW + return mlx5_firmware_flash(dev, params->fw, extack); ++#else ++ const struct firmware *fw; ++ int err; ++#ifdef HAVE_FLASH_UPDATE_GET_3_PARAMS ++ if (params->component) ++#else ++ if (component) ++#endif ++ return -EOPNOTSUPP; ++ ++ err = request_firmware_direct(&fw, ++#ifdef HAVE_FLASH_UPDATE_GET_3_PARAMS ++ params->file_name, ++#else ++ file_name, ++#endif ++ &dev->pdev->dev); ++ if (err) ++ return err; ++ ++ err = mlx5_firmware_flash(dev, fw, extack); ++ release_firmware(fw); ++ ++ return err; ++#endif /* HAVE_DEVLINK_FLASH_UPDATE_PARAMS_HAS_STRUCT_FW */ + } ++#endif /* HAVE_DEVLINK_HAS_FLASH_UPDATE */ + ++#if defined(HAVE_DEVLINK_HAS_INFO_GET) && defined(HAVE_DEVLINK_INFO_VERSION_FIXED_PUT) + static u8 mlx5_fw_ver_major(u32 version) + { + return (version >> 24) & 0xff; +@@ -38,7 +84,6 @@ static u16 mlx5_fw_ver_subminor(u32 vers + } + + #define DEVLINK_FW_STRING_LEN 32 +- + static int + mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, + struct netlink_ext_ack *extack) +@@ -86,7 +131,9 @@ mlx5_devlink_info_get(struct devlink *de + DEVLINK_INFO_VERSION_GENERIC_FW, + version_str); + } ++#endif + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netlink_ext_ack *extack) + { + struct mlx5_core_dev *dev = devlink_priv(devlink); +@@ -106,7 +153,17 @@ static int mlx5_devlink_reload_fw_activa + if (err) + return err; + +- return mlx5_fw_reset_wait_reset_done(dev); ++ err = mlx5_fw_reset_wait_reset_done(dev); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ if (err) ++ return err; ++ ++ mlx5_unload_one_devl_locked(dev); ++ err = mlx5_health_wait_pci_up(dev); ++ if (err) ++ NL_SET_ERR_MSG_MOD(extack, "FW activate aborted, PCI reads fail after reset"); ++#endif ++ return err; + } + + static int mlx5_devlink_trigger_fw_live_patch(struct devlink *devlink, +@@ -127,15 +184,23 @@ static int mlx5_devlink_trigger_fw_live_ + + return mlx5_fw_reset_set_live_patch(dev); + } ++#endif + +-static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, ++#ifdef HAVE_DEVLINK_HAS_RELOAD_UP_DOWN ++static int mlx5_devlink_reload_down(struct devlink *devlink, ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION ++ bool netns_change, + 
enum devlink_reload_action action, + enum devlink_reload_limit limit, ++#elif defined(HAVE_DEVLINK_RELOAD_DOWN_HAS_3_PARAMS) ++ bool netns_change, ++#endif + struct netlink_ext_ack *extack) + { + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct pci_dev *pdev = dev->pdev; + bool sf_dev_allocated; ++ int ret = 0; + #ifdef CONFIG_MLX5_ESWITCH + u16 mode = 0; + +@@ -166,46 +231,99 @@ static int mlx5_devlink_reload_down(stru + NL_SET_ERR_MSG_MOD(extack, "reload while VFs are present is unfavorable"); + } + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + switch (action) { + case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ mlx5_unload_one_devl_locked(dev); ++#else + mlx5_unload_one(dev); +- return 0; ++#endif ++ break; + case DEVLINK_RELOAD_ACTION_FW_ACTIVATE: + if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET) +- return mlx5_devlink_trigger_fw_live_patch(devlink, extack); +- return mlx5_devlink_reload_fw_activate(devlink, extack); ++ ret = mlx5_devlink_trigger_fw_live_patch(devlink, extack); ++ else ++ ret = mlx5_devlink_reload_fw_activate(devlink, extack); ++ break; + default: + /* Unsupported action should not get to this function */ + WARN_ON(1); +- return -EOPNOTSUPP; ++ ret = -EOPNOTSUPP; + } ++#else /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION */ ++ mlx5_unload_one(dev); ++#endif /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION */ ++ return ret; + } + +-static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, ++static int mlx5_devlink_reload_up(struct devlink *devlink, ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION ++ enum devlink_reload_action action, + enum devlink_reload_limit limit, u32 *actions_performed, ++#endif + struct netlink_ext_ack *extack) + { + struct mlx5_core_dev *dev = devlink_priv(devlink); ++ int ret = 0; + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + *actions_performed = BIT(action); + switch (action) { + case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: +- return mlx5_load_one(dev, false); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ ret = mlx5_load_one_devl_locked(dev, false); ++#else ++ ret = mlx5_load_one(dev, false); ++#endif ++ break; + case DEVLINK_RELOAD_ACTION_FW_ACTIVATE: + if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET) + break; + /* On fw_activate action, also driver is reloaded and reinit performed */ + *actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); +- return mlx5_load_one(dev, false); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ ret = mlx5_load_one_devl_locked(dev, false); ++#else ++ ret = mlx5_load_one(dev, false); ++#endif ++ break;// ROY NEED BASE + default: + /* Unsupported action should not get to this function */ + WARN_ON(1); +- return -EOPNOTSUPP; ++ ret = -EOPNOTSUPP; + } + +- return 0; ++#else /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION*/ ++ ret = mlx5_load_one(dev, false); ++#endif /* HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION */ ++ return ret; + } ++#endif /* HAVE_DEVLINK_HAS_RELOAD_UP_DOWN */ + ++#if defined(HAVE_DEVLINK_HAS_RELOAD) ++static int load_one_and_check(struct mlx5_core_dev *dev, ++ struct netlink_ext_ack *extack) ++{ ++ int err; ++ ++ err = mlx5_load_one(dev, false); ++ if (err == -EUSERS) ++ NL_SET_ERR_MSG_MOD(extack, "IRQs for requested CPU affinity are not available"); ++ return err; ++} ++ ++static int mlx5_devlink_reload(struct devlink *devlink, ++ struct netlink_ext_ack *extack) ++{ ++ struct mlx5_core_dev *dev = devlink_priv(devlink); ++ ++ 
mlx5_unload_one(dev); ++ return load_one_and_check(dev, extack); ++} ++#endif ++ ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + static struct mlx5_devlink_trap *mlx5_find_trap_by_id(struct mlx5_core_dev *dev, int trap_id) + { + struct mlx5_devlink_trap *dl_trap; +@@ -258,8 +376,12 @@ static void mlx5_devlink_trap_fini(struc + + static int mlx5_devlink_trap_action_set(struct devlink *devlink, + const struct devlink_trap *trap, ++#ifdef HAVE_DEVLINK_TRAP_ACTION_SET_4_ARGS + enum devlink_trap_action action, + struct netlink_ext_ack *extack) ++#else ++ enum devlink_trap_action action) ++#endif + { + struct mlx5_core_dev *dev = devlink_priv(devlink); + enum devlink_trap_action action_orig; +@@ -267,7 +389,9 @@ static int mlx5_devlink_trap_action_set( + int err = 0; + + if (is_mdev_switchdev_mode(dev)) { ++#ifdef HAVE_DEVLINK_TRAP_ACTION_SET_4_ARGS + NL_SET_ERR_MSG_MOD(extack, "Devlink traps can't be set in switchdev mode"); ++#endif + return -EOPNOTSUPP; + } + +@@ -295,17 +419,26 @@ static int mlx5_devlink_trap_action_set( + out: + return err; + } ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + + static const struct devlink_ops mlx5_devlink_ops = { + #ifdef CONFIG_MLX5_ESWITCH ++#ifdef HAVE_DEVLINK_HAS_ESWITCH_MODE_GET_SET + .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, + .eswitch_mode_get = mlx5_devlink_eswitch_mode_get, ++#endif /* HAVE_DEVLINK_HAS_ESWITCH_MODE_GET_SET */ ++#ifdef HAVE_DEVLINK_HAS_ESWITCH_INLINE_MODE_GET_SET + .eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set, + .eswitch_inline_mode_get = mlx5_devlink_eswitch_inline_mode_get, ++#endif /* HAVE_DEVLINK_HAS_ESWITCH_INLINE_MODE_GET_SET */ ++#ifdef HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET + .eswitch_encap_mode_set = mlx5_devlink_eswitch_encap_mode_set, + .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, ++#endif /* HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET */ ++#ifdef HAVE_DEVLINK_HAS_PORT_FUNCTION_HW_ADDR_GET + .port_function_hw_addr_get = mlx5_devlink_port_function_hw_addr_get, + .port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set, ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + .rate_leaf_tx_share_set = mlx5_esw_devlink_rate_leaf_tx_share_set, + .rate_leaf_tx_max_set = mlx5_esw_devlink_rate_leaf_tx_max_set, + .rate_node_tx_share_set = mlx5_esw_devlink_rate_node_tx_share_set, +@@ -314,6 +447,9 @@ static const struct devlink_ops mlx5_dev + .rate_node_del = mlx5_esw_devlink_rate_node_del, + .rate_leaf_parent_set = mlx5_esw_devlink_rate_parent_set, + #endif ++#endif ++#endif /* CONFIG_MLX5_ESWITCH */ ++ + #if defined(CONFIG_MLX5_SF_MANAGER) && \ + (defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS)) + .port_new = mlx5_devlink_sf_port_new, +@@ -324,18 +460,32 @@ static const struct devlink_ops mlx5_dev + .port_fn_state_get = mlx5_devlink_sf_port_fn_state_get, + .port_fn_state_set = mlx5_devlink_sf_port_fn_state_set, + #endif ++#ifdef HAVE_DEVLINK_HAS_FLASH_UPDATE + .flash_update = mlx5_devlink_flash_update, ++#endif /* HAVE_DEVLINK_HAS_FLASH_UPDATE */ ++#if defined(HAVE_DEVLINK_HAS_INFO_GET) && defined(HAVE_DEVLINK_INFO_VERSION_FIXED_PUT) + .info_get = mlx5_devlink_info_get, ++#endif /* HAVE_DEVLINK_HAS_INFO_GET && HAVE_DEVLINK_INFO_VERSION_FIXED_PUT */ ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE), + .reload_limits = BIT(DEVLINK_RELOAD_LIMIT_NO_RESET), ++#endif ++#ifdef HAVE_DEVLINK_HAS_RELOAD_UP_DOWN + .reload_down = 
mlx5_devlink_reload_down, + .reload_up = mlx5_devlink_reload_up, ++#endif /* HAVE_DEVLINK_HAS_RELOAD_UP_DOWN */ ++#ifdef HAVE_DEVLINK_HAS_RELOAD ++ .reload = mlx5_devlink_reload, ++#endif ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + .trap_init = mlx5_devlink_trap_init, + .trap_fini = mlx5_devlink_trap_fini, + .trap_action_set = mlx5_devlink_trap_action_set, ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + }; + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port) + { +@@ -353,7 +503,11 @@ void mlx5_devlink_trap_report(struct mlx + dl_trap->trap.action); + return; + } ++#ifdef HAVE_DEVLINK_TRAP_REPORT_5_ARGS + devlink_trap_report(devlink, skb, dl_trap->item, dl_port, NULL); ++#else ++ devlink_trap_report(devlink, skb, dl_trap->item, dl_port); ++#endif + } + + int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev) +@@ -383,11 +537,16 @@ int mlx5_devlink_traps_get_action(struct + *action = dl_trap->trap.action; + return 0; + } ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + + struct devlink *mlx5_devlink_alloc(struct device *dev) + { ++#ifdef HAVE_DEVLINK_ALLOC_GET_3_PARAMS + return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev), + dev); ++#else ++ return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev)); ++#endif + } + + void mlx5_devlink_free(struct devlink *devlink) +@@ -395,6 +554,7 @@ void mlx5_devlink_free(struct devlink *d + devlink_free(devlink); + } + ++#if defined(HAVE_DEVLINK_PARAM) && (defined(HAVE_DEVLINK_PARAMS_PUBLISHED) || defined(HAVE_DEVLINK_REGISTER_GET_1_PARAMS)) + static int mlx5_devlink_fs_mode_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +@@ -459,6 +619,7 @@ static int mlx5_devlink_fs_mode_get(stru + return 0; + } + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE + static int mlx5_devlink_enable_roce_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +@@ -478,6 +639,7 @@ static int mlx5_devlink_enable_roce_vali + + return 0; + } ++#endif + + #ifdef CONFIG_MLX5_ESWITCH + static int mlx5_devlink_large_group_num_validate(struct devlink *devlink, u32 id, +@@ -517,25 +679,6 @@ static int mlx5_devlink_esw_port_metadat + ctx->val.vbool = mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch); + return 0; + } +-#endif /* CONFIG_MLX5_ESWITCH */ +- +-static int mlx5_devlink_ct_max_offloaded_conns_set(struct devlink *devlink, u32 id, +- struct devlink_param_gset_ctx *ctx) +-{ +- struct mlx5_core_dev *dev = devlink_priv(devlink); +- +- mlx5_tc_ct_max_offloaded_conns_set(dev, ctx->val.vu32); +- return 0; +-} +- +-static int mlx5_devlink_ct_max_offloaded_conns_get(struct devlink *devlink, u32 id, +- struct devlink_param_gset_ctx *ctx) +-{ +- struct mlx5_core_dev *dev = devlink_priv(devlink); +- +- ctx->val.vu32 = mlx5_tc_ct_max_offloaded_conns_get(dev); +- return 0; +-} + + static int mlx5_devlink_esw_port_metadata_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, +@@ -607,7 +750,27 @@ static int mlx5_devlink_esw_pet_insert_v + + return 0; + } ++#endif /* CONFIG_MLX5_ESWITCH */ ++ ++static int mlx5_devlink_ct_max_offloaded_conns_set(struct devlink *devlink, u32 id, ++ struct devlink_param_gset_ctx *ctx) ++{ ++ struct mlx5_core_dev *dev = devlink_priv(devlink); ++ ++ mlx5_tc_ct_max_offloaded_conns_set(dev, ctx->val.vu32); ++ return 0; ++} ++ ++static int mlx5_devlink_ct_max_offloaded_conns_get(struct devlink 
*devlink, u32 id, ++ struct devlink_param_gset_ctx *ctx) ++{ ++ struct mlx5_core_dev *dev = devlink_priv(devlink); ++ ++ ctx->val.vu32 = mlx5_tc_ct_max_offloaded_conns_get(dev); ++ return 0; ++} + ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION + static int mlx5_devlink_enable_remote_dev_reset_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) + { +@@ -625,13 +788,16 @@ static int mlx5_devlink_enable_remote_de + ctx->val.vbool = mlx5_fw_reset_enable_remote_dev_reset_get(dev); + return 0; + } ++#endif + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE + static int mlx5_devlink_eq_depth_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) + { + return (val.vu16 >= 64 && val.vu16 <= 4096) ? 0 : -EINVAL; + } ++#endif + + static const struct devlink_param mlx5_devlink_params[] = { + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_CT_ACTION_ON_NAT_CONNS, +@@ -645,8 +811,15 @@ static const struct devlink_param mlx5_d + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlx5_devlink_fs_mode_get, mlx5_devlink_fs_mode_set, + mlx5_devlink_fs_mode_validate), ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE + DEVLINK_PARAM_GENERIC(ENABLE_ROCE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_enable_roce_validate), ++#endif ++#ifdef HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION ++ DEVLINK_PARAM_GENERIC(ENABLE_REMOTE_DEV_RESET, BIT(DEVLINK_PARAM_CMODE_RUNTIME), ++ mlx5_devlink_enable_remote_dev_reset_get, ++ mlx5_devlink_enable_remote_dev_reset_set, NULL), ++#endif + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_CT_MAX_OFFLOADED_CONNS, + "ct_max_offloaded_conns", DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), +@@ -672,13 +845,12 @@ static const struct devlink_param mlx5_d + mlx5_devlink_esw_pet_insert_set, + mlx5_devlink_esw_pet_insert_validate), + #endif +- DEVLINK_PARAM_GENERIC(ENABLE_REMOTE_DEV_RESET, BIT(DEVLINK_PARAM_CMODE_RUNTIME), +- mlx5_devlink_enable_remote_dev_reset_get, +- mlx5_devlink_enable_remote_dev_reset_set, NULL), ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE + DEVLINK_PARAM_GENERIC(IO_EQ_SIZE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_eq_depth_validate), + DEVLINK_PARAM_GENERIC(EVENT_EQ_SIZE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_eq_depth_validate), ++#endif + }; + + static void mlx5_devlink_set_params_init_values(struct devlink *devlink) +@@ -694,13 +866,15 @@ static void mlx5_devlink_set_params_init + MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE, + value); + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE + value.vbool = MLX5_CAP_GEN(dev, roce); + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + value); ++#endif + + #ifdef CONFIG_MLX5_ESWITCH +- value.vu32 = ESW_OFFLOADS_DEFAULT_NUM_GROUPS; ++ value.vu32 = esw_offloads_num_big_groups; + devlink_param_driverinit_value_set(devlink, + MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM, + value); +@@ -725,6 +899,7 @@ static void mlx5_devlink_set_params_init + } + #endif + ++#ifdef HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE + value.vu32 = MLX5_COMP_EQ_SIZE; + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, +@@ -734,8 +909,11 @@ static void mlx5_devlink_set_params_init + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + value); ++#endif + } ++#endif /* defined(HAVE_DEVLINK_PARAM) && (defined(HAVE_DEVLINK_PARAMS_PUBLISHED) || defined(HAVE_DEVLINK_REGISTER_GET_1_PARAMS) */ + ++#ifdef 
HAVE_DEVLINK_PARAM_REGISTER + static const struct devlink_param enable_eth_param = + DEVLINK_PARAM_GENERIC(ENABLE_ETH, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, NULL); +@@ -757,6 +935,9 @@ static int mlx5_devlink_eth_param_regist + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + value); ++#ifdef HAVE_DEVLINK_PARAM_PUBLISH ++ devlink_param_publish(devlink, &enable_eth_param); ++#endif + return 0; + } + +@@ -767,6 +948,9 @@ static void mlx5_devlink_eth_param_unreg + if (!mlx5_eth_supported(dev)) + return; + ++#ifdef HAVE_DEVLINK_PARAM_PUBLISH ++ devlink_param_unpublish(devlink, &enable_eth_param); ++#endif + devlink_param_unregister(devlink, &enable_eth_param); + } + +@@ -802,6 +986,9 @@ static int mlx5_devlink_rdma_param_regis + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + value); ++#ifdef HAVE_DEVLINK_PARAM_PUBLISH ++ devlink_param_publish(devlink, &enable_rdma_param); ++#endif + return 0; + } + +@@ -810,6 +997,9 @@ static void mlx5_devlink_rdma_param_unre + if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) + return; + ++#ifdef HAVE_DEVLINK_PARAM_PUBLISH ++ devlink_param_unpublish(devlink, &enable_rdma_param); ++#endif + devlink_param_unregister(devlink, &enable_rdma_param); + } + +@@ -834,6 +1024,9 @@ static int mlx5_devlink_vnet_param_regis + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + value); ++#ifdef HAVE_DEVLINK_PARAM_PUBLISH ++ devlink_param_publish(devlink, &enable_vnet_param); ++#endif + return 0; + } + +@@ -844,6 +1037,9 @@ static void mlx5_devlink_vnet_param_unre + if (!mlx5_vnet_supported(dev)) + return; + ++#ifdef HAVE_DEVLINK_PARAM_PUBLISH ++ devlink_param_unpublish(devlink, &enable_vnet_param); ++#endif + devlink_param_unregister(devlink, &enable_vnet_param); + } + +@@ -937,19 +1133,34 @@ mlx5_devlink_max_uc_list_param_unregiste + + devlink_param_unregister(devlink, &max_uc_list_param); + } ++#endif /* HAVE_DEVLINK_PARAM_PUBLISH */ + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT ++#ifdef HAVE_DEVLINK_TRAP_GROUPS_REGISTER + #define MLX5_TRAP_DROP(_id, _group_id) \ + DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ + DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ + DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) ++#else ++#define MLX5_TRAP_DROP(_id, group) \ ++ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ ++ DEVLINK_TRAP_GROUP_GENERIC(group), \ ++ DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) ++#endif + + static const struct devlink_trap mlx5_traps_arr[] = { + MLX5_TRAP_DROP(INGRESS_VLAN_FILTER, L2_DROPS), ++#ifdef HAVE_DEVLINK_TRAP_DMAC_FILTER + MLX5_TRAP_DROP(DMAC_FILTER, L2_DROPS), ++#endif + }; + + static const struct devlink_trap_group mlx5_trap_groups_arr[] = { ++#ifdef HAVE_DEVLINK_TRAP_GROUP_GENERIC_2_ARGS + DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), ++#else ++ DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS), ++#endif + }; + + static int mlx5_devlink_traps_register(struct devlink *devlink) +@@ -957,42 +1168,94 @@ static int mlx5_devlink_traps_register(s + struct mlx5_core_dev *core_dev = devlink_priv(devlink); + int err; + ++#ifdef HAVE_DEVLINK_TRAP_GROUPS_REGISTER ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ err = devl_trap_groups_register(devlink, mlx5_trap_groups_arr, ++ ARRAY_SIZE(mlx5_trap_groups_arr)); ++#else + err = devlink_trap_groups_register(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER */ + if (err) + return err; ++#endif /* HAVE_DEVLINK_TRAP_GROUPS_REGISTER */ + ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ 
err = devl_traps_register(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr), ++ &core_dev->priv); ++#else + err = devlink_traps_register(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr), + &core_dev->priv); ++#endif ++#ifdef HAVE_DEVLINK_TRAP_GROUPS_REGISTER + if (err) + goto err_trap_group; + return 0; + + err_trap_group: ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_trap_groups_unregister(devlink, mlx5_trap_groups_arr, ++ ARRAY_SIZE(mlx5_trap_groups_arr)); ++#else + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER */ ++#endif /* HAVE_DEVLINK_TRAP_GROUPS_REGISTER */ + return err; + } + + static void mlx5_devlink_traps_unregister(struct devlink *devlink) + { ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_traps_unregister(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr)); ++#else + devlink_traps_unregister(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr)); ++#endif ++#ifdef HAVE_DEVLINK_TRAP_GROUPS_REGISTER ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++ devl_trap_groups_unregister(devlink, mlx5_trap_groups_arr, ++ ARRAY_SIZE(mlx5_trap_groups_arr)); ++#else + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); ++#endif /* HAVE_DEVL_TRAP_GROUPS_REGISTER */ ++#endif /* HAVE_DEVLINK_TRAP_GROUPS_REGISTER */ + } ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + int mlx5_devlink_register(struct devlink *devlink) ++#else ++int mlx5_devlink_register(struct devlink *devlink, struct device *pdev) ++#endif + { ++#ifdef HAVE_DEVLINK_SET_FEATURES + struct mlx5_core_dev *dev = devlink_priv(devlink); ++#endif ++#if (!defined(HAVE_DEVLINK_PARAM) || (!defined(HAVE_DEVLINK_PARAMS_PUBLISHED) && !defined(HAVE_DEVLINK_REGISTER_GET_1_PARAMS))) && defined(CONFIG_MLX5_ESWITCH) ++ struct mlx5_core_dev *priv_dev; ++ struct mlx5_eswitch *eswitch; ++#endif + int err; + ++#ifndef HAVE_DEVLINK_REGISTER_GET_1_PARAMS ++ err = devlink_register(devlink, pdev); ++ if (err) ++ return err; ++#endif ++ ++#if defined(HAVE_DEVLINK_PARAM) && (defined(HAVE_DEVLINK_PARAMS_PUBLISHED) || defined(HAVE_DEVLINK_REGISTER_GET_1_PARAMS)) + err = devlink_params_register(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); + if (err) ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + return err; ++#else ++ goto params_reg_err; ++#endif + + mlx5_devlink_set_params_init_values(devlink); +- ++#ifdef HAVE_DEVLINK_PARAM_REGISTER + err = mlx5_devlink_auxdev_params_register(devlink); + if (err) + goto auxdev_reg_err; +@@ -1000,33 +1263,68 @@ int mlx5_devlink_register(struct devlink + err = mlx5_devlink_max_uc_list_param_register(devlink); + if (err) + goto max_uc_list_err; ++#endif /* HAVE_DEVLINK_PARAM_REGISTER */ + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + err = mlx5_devlink_traps_register(devlink); + if (err) + goto traps_reg_err; ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ ++ ++#ifdef HAVE_DEVLINK_PARAMS_PUBLISHED ++ devlink_params_publish(devlink); ++#endif /* defined(HAVE_DEVLINK_PARAMS_PUBLISHED) && !defined(HAVE_DEVLINK_REGISTER_GET_1_PARAMS) */ + ++#ifdef HAVE_DEVLINK_SET_FEATURES + if (!mlx5_core_is_mp_slave(dev)) + devlink_set_features(devlink, DEVLINK_F_RELOAD); ++#endif + + return 0; + ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + traps_reg_err: ++#ifdef HAVE_DEVLINK_PARAM_REGISTER + mlx5_devlink_max_uc_list_param_unregister(devlink); + max_uc_list_err: + mlx5_devlink_auxdev_params_unregister(devlink); + 
auxdev_reg_err: ++#endif /* HAVE_DEVLINK_PARAM_REGISTER */ ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ + devlink_params_unregister(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); ++#ifndef HAVE_DEVLINK_REGISTER_GET_1_PARAMS ++params_reg_err: ++ devlink_unregister(devlink); ++#endif ++#elif defined(CONFIG_MLX5_ESWITCH) ++ priv_dev = devlink_priv(devlink); ++ eswitch = priv_dev->priv.eswitch; ++ if (eswitch && mlx5_esw_vport_match_metadata_supported(eswitch)) ++ eswitch->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; ++#endif /* HAVE_DEVLINK_PARAM && (HAVE_DEVLINK_PARAMS_PUBLISHED || HAVE_DEVLINK_REGISTER_GET_1_PARAMS) */ + return err; + } + + void mlx5_devlink_unregister(struct devlink *devlink) + { ++#if defined(HAVE_DEVLINK_PARAM) && (defined(HAVE_DEVLINK_PARAMS_PUBLISHED) || defined(HAVE_DEVLINK_REGISTER_GET_1_PARAMS)) ++#ifndef HAVE_DEVLINK_REGISTER_GET_1_PARAMS ++ devlink_params_unpublish(devlink); ++#endif ++#ifdef HAVE_DEVLINK_TRAP_SUPPORT + mlx5_devlink_traps_unregister(devlink); ++#endif /* HAVE_DEVLINK_TRAP_SUPPORT */ ++#ifdef HAVE_DEVLINK_PARAM_REGISTER + mlx5_devlink_max_uc_list_param_unregister(devlink); + mlx5_devlink_auxdev_params_unregister(devlink); ++#endif + devlink_params_unregister(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); ++#endif /* HAVE_DEVLINK_PARAM && (HAVE_DEVLINK_PARAMS_PUBLISHED || HAVE_DEVLINK_REGISTER_GET_1_PARAMS) */ ++#ifndef HAVE_DEVLINK_REGISTER_GET_1_PARAMS ++ devlink_unregister(devlink); ++#endif + } + + int diff --git a/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..e65245a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,24 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/qos.h + +Change-Id: Iaafa2ae48160b02f580b8c1767a5b3751820d653 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/qos.h | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h +@@ -35,8 +35,13 @@ int mlx5e_htb_leaf_alloc_queue(struct ml + struct netlink_ext_ack *extack); + int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid, + u64 rate, u64 ceil, struct netlink_ext_ack *extack); ++#ifndef HAVE_TC_HTB_COMMAND_HAS_MOVED_QID + int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 *classid, + struct netlink_ext_ack *extack); ++#else ++int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 classid, u16 *old_qid, ++ u16 *new_qid, struct netlink_ext_ack *extack); ++#endif + int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force, + struct netlink_ext_ack *extack); + int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..eb26122 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,322 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c + +Change-Id: I073a7860c9d305d42fd248fd494457d82ac1ff99 +--- + 
.../mellanox/mlx5/core/esw/devlink_port.c | 142 ++++++++++++++++-- + 1 file changed, 132 insertions(+), 10 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +@@ -2,9 +2,11 @@ + /* Copyright (c) 2020 Mellanox Technologies Ltd. */ + + #include ++#include + #include "eswitch.h" + #include "mlx5_esw_devm.h" + ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + static void + mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) + { +@@ -14,7 +16,9 @@ mlx5_esw_get_port_parent_id(struct mlx5_ + ppid->id_len = sizeof(parent_id); + memcpy(ppid->id, &parent_id, sizeof(parent_id)); + } ++#endif + ++#ifdef HAVE_DEVLINK_PORT_TYPE_ETH_SET + static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num) + { + return vport_num == MLX5_VPORT_UPLINK || +@@ -24,12 +28,17 @@ static bool mlx5_esw_devlink_port_suppor + + static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16 vport_num) + { ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT + struct mlx5_core_dev *dev = esw->dev; ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_2_PARAMS + struct devlink_port_attrs attrs = {}; ++#endif + struct netdev_phys_item_id ppid = {}; + struct devlink_port *dl_port; ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_CONTROLLER_NUM) + u32 controller_num = 0; + bool external; ++#endif + u16 pfnum; + + dl_port = kzalloc(sizeof(*dl_port), GFP_KERNEL); +@@ -38,36 +47,82 @@ static struct devlink_port *mlx5_esw_dl_ + + mlx5_esw_get_port_parent_id(dev, &ppid); + pfnum = mlx5_get_dev_index(dev); ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_CONTROLLER_NUM) + external = mlx5_core_is_ecpf_esw_manager(dev); + if (external) + controller_num = dev->priv.eswitch->offloads.host_number + 1; ++#endif + + if (vport_num == MLX5_VPORT_UPLINK) { ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_2_PARAMS + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + attrs.phys.port_number = pfnum; + memcpy(attrs.switch_id.id, ppid.id, ppid.id_len); + attrs.switch_id.id_len = ppid.id_len; + devlink_port_attrs_set(dl_port, &attrs); +- } else if (vport_num == MLX5_VPORT_PF) { ++#else ++ devlink_port_attrs_set(dl_port, ++ DEVLINK_PORT_FLAVOUR_PHYSICAL, ++ mlx5_get_dev_index(dev), ++ false, 0 ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_7_PARAMS ++ ,NULL, 0 ++#endif ++ ); ++#endif ++ } ++#ifdef HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET ++ else if (vport_num == MLX5_VPORT_PF) { + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); + dl_port->attrs.switch_id.id_len = ppid.id_len; ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_CONTROLLER_NUM) + devlink_port_attrs_pci_pf_set(dl_port, controller_num, pfnum, external); ++#elif defined(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_GET_2_PARAMS) ++ devlink_port_attrs_pci_pf_set(dl_port, pfnum); ++#else ++ devlink_port_attrs_pci_pf_set(dl_port, ++ &ppid.id[0], ppid.id_len, ++ pfnum); ++#endif + } else if (mlx5_eswitch_is_vf_vport(esw, vport_num)) { + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); + dl_port->attrs.switch_id.id_len = ppid.id_len; ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_VF_SET_GET_CONTROLLER_NUM) + devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum, + vport_num - 1, external); ++#elif defined(HAVE_DEVLINK_PORT_ATTRS_PCI_VF_SET_GET_3_PARAMS) ++ devlink_port_attrs_pci_vf_set(dl_port, pfnum, vport_num - 1); ++#elif defined(HAVE_DEVLINK_PORT_ATTRS_PCI_VF_SET_GET_5_PARAMS) ++ devlink_port_attrs_pci_vf_set(dl_port, ++ &ppid.id[0], ppid.id_len, 
++ pfnum, vport_num - 1); ++#endif + } ++#else ++ else ++ devlink_port_attrs_set(dl_port, ++ DEVLINK_PORT_FLAVOUR_VIRTUAL, ++ 0, false , 0 ++#ifdef HAVE_DEVLINK_PORT_ATRRS_SET_GET_7_PARAMS ++ ,NULL, 0 ++#endif ++ ); ++#endif /* HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET */ + return dl_port; ++#else ++ return NULL; ++#endif /* HAVE_DEVLINK_PORT_ATRRS_SET_GET_SUPPORT */ + } + + static void mlx5_esw_dl_port_free(struct devlink_port *dl_port) + { + kfree(dl_port); + } ++#endif + + int mlx5_esw_offloads_devlink_port_register(struct mlx5_eswitch *esw, u16 vport_num) + { ++#ifdef HAVE_DEVLINK_PORT_TYPE_ETH_SET + struct mlx5_core_dev *dev = esw->dev; + struct devlink_port *dl_port; + unsigned int dl_port_index; +@@ -88,26 +143,46 @@ int mlx5_esw_offloads_devlink_port_regis + + devlink = priv_to_devlink(dev); + dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ err = devl_port_register(devlink, dl_port, dl_port_index); ++#else + err = devlink_port_register(devlink, dl_port, dl_port_index); ++#endif + if (err) + goto reg_err; + ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ err = devl_rate_leaf_create(dl_port, vport); ++#else + err = devlink_rate_leaf_create(dl_port, vport); ++#endif + if (err) + goto rate_err; ++#endif + + vport->dl_port = dl_port; + return 0; + ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + rate_err: ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_port_unregister(dl_port); ++#else + devlink_port_unregister(dl_port); ++#endif ++#endif + reg_err: + mlx5_esw_dl_port_free(dl_port); + return err; ++#else ++ return 0; ++#endif + } + + void mlx5_esw_offloads_devlink_port_unregister(struct mlx5_eswitch *esw, u16 vport_num) + { ++#ifdef HAVE_DEVLINK_PORT_TYPE_ETH_SET + struct mlx5_vport *vport; + + if (!mlx5_esw_devlink_port_supported(esw, vport_num)) +@@ -117,15 +192,26 @@ void mlx5_esw_offloads_devlink_port_unre + if (IS_ERR(vport)) + return; + ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + if (vport->dl_port->devlink_rate) { + if (!test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &esw->dev->intf_state)) + mlx5_esw_qos_vport_update_group(esw, vport, NULL, NULL); +- devlink_rate_leaf_destroy(vport->dl_port); ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_rate_leaf_destroy(vport->dl_port); ++#else ++ devlink_rate_leaf_destroy(vport->dl_port); ++#endif + } ++#endif + +- devlink_port_unregister(vport->dl_port); +- mlx5_esw_dl_port_free(vport->dl_port); +- vport->dl_port = NULL; ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_port_unregister(vport->dl_port); ++#else ++ devlink_port_unregister(vport->dl_port); ++#endif ++ mlx5_esw_dl_port_free(vport->dl_port); ++ vport->dl_port = NULL; ++#endif + } + + struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u16 vport_num) +@@ -139,6 +225,7 @@ struct devlink_port *mlx5_esw_offloads_d + int _mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_port *dl_port, + u16 vport_num, u32 controller, u32 sfnum) + { ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) + struct mlx5_core_dev *dev = esw->dev; + struct netdev_phys_item_id ppid = {}; + unsigned int dl_port_index; +@@ -155,23 +242,44 @@ int _mlx5_esw_devlink_sf_port_register(s + mlx5_esw_get_port_parent_id(dev, &ppid); + memcpy(dl_port->attrs.switch_id.id, &ppid.id[0], ppid.id_len); + dl_port->attrs.switch_id.id_len = ppid.id_len; ++#if 
defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) + devlink_port_attrs_pci_sf_set(dl_port, controller, pfnum, sfnum, !!controller); ++#elif defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) ++ devlink_port_attrs_pci_sf_set(dl_port, controller, pfnum, sfnum); ++#endif + devlink = priv_to_devlink(dev); + dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ err = devl_port_register(devlink, dl_port, dl_port_index); ++#else + err = devlink_port_register(devlink, dl_port, dl_port_index); ++#endif + if (err) + return err; +- ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ err = devl_rate_leaf_create(dl_port, vport); ++#else + err = devlink_rate_leaf_create(dl_port, vport); ++#endif + if (err) + goto rate_err; +- ++#endif + vport->dl_port = dl_port; + return 0; +- ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + rate_err: ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_port_unregister(dl_port); ++#else + devlink_port_unregister(dl_port); ++#endif + return err; ++#endif ++#else /* defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) */ ++ ++ return -EOPNOTSUPP; ++#endif /* defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) */ + } + + int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, +@@ -181,10 +289,12 @@ int mlx5_esw_devlink_sf_port_register(st + { + int err = 0; + ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) + err = _mlx5_esw_devlink_sf_port_register(esw, dl_port, vport_num, + controller, sfnum); + if (err) + return err; ++#endif + + #if IS_ENABLED(CONFIG_MLXDEVM) + err = mlx5_devm_sf_port_register(esw->dev, vport_num, controller, sfnum, dl_port); +@@ -194,25 +304,37 @@ int mlx5_esw_devlink_sf_port_register(st + + void _mlx5_esw_devlink_sf_port_unregister(struct mlx5_eswitch *esw, u16 vport_num) + { ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return; +- ++#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + if (vport->dl_port->devlink_rate) { + mlx5_esw_qos_vport_update_group(esw, vport, NULL, NULL); ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_rate_leaf_destroy(vport->dl_port); ++#else + devlink_rate_leaf_destroy(vport->dl_port); ++#endif + } +- ++#endif ++#ifdef HAVE_DEVL_PORT_REGISTER //forward port ++ devl_port_unregister(vport->dl_port); ++#else + devlink_port_unregister(vport->dl_port); ++#endif + vport->dl_port = NULL; ++#endif /* defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) */ + } + + void mlx5_esw_devlink_sf_port_unregister(struct mlx5_eswitch *esw, + u16 vport_num) + { ++#if defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS) + _mlx5_esw_devlink_sf_port_unregister(esw, vport_num); ++#endif + #if IS_ENABLED(CONFIG_MLXDEVM) + mlx5_devm_sf_port_unregister(esw->dev, vport_num); + #endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch new file mode 100644 index 
0000000..2f74f1c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0273-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch @@ -0,0 +1,22 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h + +Change-Id: I59263cab8ba56a48ae3804fee417ba0bc46c4823 +--- + drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +@@ -46,7 +46,11 @@ struct lag_tracker { + enum netdev_lag_tx_type tx_type; + struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS]; + unsigned int is_bonded:1; ++#ifdef HAVE_INFO_HASH_TYPE + enum netdev_lag_hash hash_type; ++#else ++ int hash_type; ++#endif + unsigned int has_inactive:1; + }; + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..0f65dfc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,111 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c + +Change-Id: I280d0e9b1881708285a3e56d5688daf6a752cdb9 +--- + .../ethernet/mellanox/mlx5/core/en/tc/meter.c | 24 +++++++++++++++++++ + 1 file changed, 24 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c +@@ -4,8 +4,14 @@ + #include + #include "en/aso.h" + #include "meter.h" ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + #include "en/tc_priv.h" ++#elif defined(CONFIG_MLX5_ESWITCH) ++#include "en_tc.h" ++#endif ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + #include "en/tc/post_act.h" ++#endif + + #define START_COLOR_SHIFT 28 + #define METER_MODE_SHIFT 24 +@@ -13,6 +19,10 @@ + #define CBS_MAN_SHIFT 16 + #define CIR_EXP_SHIFT 8 + ++#ifndef BITS_TO_BYTES ++#define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE) ++#endif ++ + /* cir = 8*(10^9)*cir_mantissa/(2^cir_exponent)) bits/s */ + #define CONST_CIR 8000000000ULL + #define CALC_CIR(m, e) ((CONST_CIR * (m)) >> (e)) +@@ -373,8 +383,10 @@ mlx5e_tc_meter_get(struct mlx5_core_dev + struct mlx5e_flow_meters *flow_meters; + struct mlx5e_flow_meter_handle *meter; + ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + flow_meters = mlx5e_get_flow_meters(mdev); + if (!flow_meters) ++#endif + return ERR_PTR(-EOPNOTSUPP); + + meter = __mlx5e_tc_meter_get(flow_meters, params->index); +@@ -445,12 +457,16 @@ int + mlx5e_tc_meter_update(struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *params) + { ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + struct mlx5_core_dev *mdev = meter->flow_meters->mdev; ++#endif + struct mlx5e_flow_meters *flow_meters; + int err; + ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + flow_meters = mlx5e_get_flow_meters(mdev); + if (!flow_meters) ++#endif + return -EOPNOTSUPP; + + mutex_lock(&flow_meters->sync_lock); +@@ -466,8 +482,10 @@ mlx5e_tc_meter_replace(struct mlx5_core_ + struct mlx5e_flow_meter_handle *meter; + int err; + ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + flow_meters = mlx5e_get_flow_meters(mdev); + if (!flow_meters) ++#endif + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&flow_meters->sync_lock); +@@ -515,11 +533,13 @@ mlx5e_flow_meters_init(struct mlx5e_priv + if (!flow_meters) + return NULL; + ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + if 
(IS_ERR_OR_NULL(post_act)) { + netdev_dbg(priv->netdev, + "flow meter offload is not supported, post action is missing\n"); + goto errout; + } ++#endif + + flow_meters->aso = mlx5e_aso_get(priv); + if (!flow_meters->aso) { +@@ -549,7 +569,9 @@ mlx5e_flow_meters_cleanup(struct mlx5e_f + if (!flow_meters) + return; + ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + mlx5e_aso_put(flow_meters->aso->priv); ++#endif + kfree(flow_meters); + } + +@@ -792,8 +814,10 @@ mlx5e_alloc_flow_meter(struct mlx5_core_ + struct mlx5e_flow_meters *flow_meters; + struct mlx5e_flow_meter_handle *meter; + ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + flow_meters = mlx5e_get_flow_meters(dev); + if (!flow_meters) ++#endif + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&flow_meters->sync_lock); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..846000a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,385 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en_tx.c + +Change-Id: I040cdd07dd338653de3f2f9102fc73992bf46c5c +--- + .../net/ethernet/mellanox/mlx5/core/en_tx.c | 179 +++++++++++++++++- + 1 file changed, 176 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +@@ -41,6 +41,67 @@ + #include "en_accel/ipsec_rxtx.h" + #include "en_accel/macsec.h" + #include "en/ptp.h" ++#include ++ ++static inline void mlx5e_read_cqe_slot(struct mlx5_cqwq *wq, ++ u32 cqcc, void *data) ++{ ++ u32 ci = mlx5_cqwq_ctr2ix(wq, cqcc); ++ ++ memcpy(data, mlx5_cqwq_get_wqe(wq, ci), sizeof(struct mlx5_cqe64)); ++} ++ ++static inline void mlx5e_read_title_slot(struct mlx5e_txqsq *sq, ++ struct mlx5_cqwq *wq, ++ u32 cqcc) ++{ ++ struct mlx5e_cq_decomp *cqd = &sq->cqd; ++ struct mlx5_cqe64 *title = &cqd->title; ++ ++ mlx5e_read_cqe_slot(wq, cqcc, title); ++ cqd->left = be32_to_cpu(title->byte_cnt); ++ sq->stats->cqe_compress_blks++; ++ sq->stats->cqe_compress_pkts += cqd->left; ++} ++ ++static inline void mlx5e_decompress_cqes(struct mlx5e_txqsq *sq, ++ struct mlx5_cqwq *wq) ++{ ++ struct mlx5e_cq_decomp *cqd = &sq->cqd; ++ struct mlx5_cqe64 *title = &cqd->title; ++ struct mlx5_mini_cqe8 *mini_cqe; ++ int iteration_sz; ++ u32 cc = wq->cc; ++ ++ mlx5e_read_title_slot(sq, wq, cc); ++ mlx5e_read_cqe_slot(wq, cc + 1, cqd->mini_arr); ++ cqd->mini_arr_idx = 0; ++ do { ++ // Read 8 mini CQEs ++ iteration_sz = min_t(u16, cqd->left, 8); ++ // For each CQE update WQ ++ do { ++ struct mlx5_cqe64 cqe_tmp = *title; ++ struct mlx5_cqe64 *cqe; ++ ++ mini_cqe = &cqd->mini_arr[cqd->mini_arr_idx++]; ++ cqe_tmp.byte_cnt = mini_cqe->byte_cnt; ++ cqe_tmp.op_own &= 0xf0; ++ cqe_tmp.op_own |= 0x01 & (cc >> wq->fbc.log_sz); ++ cqe_tmp.wqe_counter = mini_cqe->s_wqe_info.wqe_counter; ++ ++ cqe = mlx5_cqwq_get_wqe(wq, mlx5_cqwq_ctr2ix(wq, cc++)); ++ *cqe = cqe_tmp; ++ ++ } while (cqd->mini_arr_idx < iteration_sz); ++ ++ cqd->left -= iteration_sz; ++ if (!cqd->left) ++ break; ++ mlx5e_read_cqe_slot(wq, cc, cqd->mini_arr); ++ cqd->mini_arr_idx = 0; ++ } while (1); ++} + + static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) + { +@@ -106,8 +167,24 @@ static int mlx5e_select_htb_queue(struct + return mlx5e_get_txq_by_classid(priv, classid); + } + ++#ifdef 
HAVE_NDO_SELECT_QUEUE_HAS_3_PARMS_NO_FALLBACK + u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) ++#elif defined(HAVE_SELECT_QUEUE_FALLBACK_T) ++u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, ++#ifdef HAVE_SELECT_QUEUE_FALLBACK_T ++#ifdef HAVE_SELECT_QUEUE_NET_DEVICE ++ struct net_device *sb_dev, ++#else ++ void *accel_priv, ++#endif /* HAVE_SELECT_QUEUE_NET_DEVICE */ ++ select_queue_fallback_t fallback) ++#else ++ void *accel_priv) ++#endif ++#else /* HAVE_SELECT_QUEUE_FALLBACK_T */ ++u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb) ++#endif + { + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_select_queue_params *selq; +@@ -131,7 +208,13 @@ u16 mlx5e_select_queue(struct net_device + if (unlikely(mlx5e_use_ptpsq(skb))) + return mlx5e_select_ptpsq(dev, skb, selq); + ++#ifdef HAVE_NDO_SELECT_QUEUE_HAS_3_PARMS_NO_FALLBACK + txq_ix = netdev_pick_tx(dev, skb, NULL); ++#elif defined (HAVE_SELECT_QUEUE_FALLBACK_T_3_PARAMS) ++ txq_ix = fallback(dev, skb, NULL); ++#else ++ txq_ix = fallback(dev, skb); ++#endif + /* Fix netdev_pick_tx() not to choose ptp_channel and HTB txqs. + * If they are selected, switch to regular queues. + * Driver to select these queues only at mlx5e_select_ptpsq() +@@ -140,7 +223,13 @@ u16 mlx5e_select_queue(struct net_device + if (unlikely(txq_ix >= selq->num_regular_queues)) + txq_ix %= selq->num_regular_queues; + } else { ++#ifdef HAVE_NDO_SELECT_QUEUE_HAS_3_PARMS_NO_FALLBACK + txq_ix = netdev_pick_tx(dev, skb, NULL); ++#elif defined (HAVE_SELECT_QUEUE_FALLBACK_T_3_PARAMS) ++ txq_ix = fallback(dev, skb, NULL); ++#else ++ txq_ix = fallback(dev, skb); ++#endif + } + + if (selq->num_tcs <= 1) +@@ -159,7 +248,14 @@ static inline int mlx5e_skb_l2_header_of + { + #define MLX5E_MIN_INLINE (ETH_HLEN + VLAN_HLEN) + +- return max(skb_network_offset(skb), MLX5E_MIN_INLINE); ++ struct ethhdr *eth = (struct ethhdr *)(skb->data); ++ int max_hlen, l2_hlen = 0; ++ ++ max_hlen = max_t(int, skb_network_offset(skb), MLX5E_MIN_INLINE); ++ if (unlikely(!__vlan_get_protocol(skb, eth->h_proto, &l2_hlen))) ++ return max_hlen; ++ ++ return max_t(int, max_hlen, l2_hlen); + } + + static inline int mlx5e_skb_l3_header_offset(struct sk_buff *skb) +@@ -179,12 +275,23 @@ static inline u16 mlx5e_calc_min_inline( + case MLX5_INLINE_MODE_NONE: + return 0; + case MLX5_INLINE_MODE_TCP_UDP: ++#ifdef HAVE_ETH_GET_HEADLEN_3_PARAMS + hlen = eth_get_headlen(skb->dev, skb->data, skb_headlen(skb)); ++#elif defined(HAVE_ETH_GET_HEADLEN_2_PARAMS) ++ hlen = eth_get_headlen(skb->data, skb_headlen(skb)); ++#else ++ hlen = mlx5e_skb_l3_header_offset(skb) + sizeof(struct udphdr); ++ if (unlikely(hlen < ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr))) ++ hlen = MLX5E_MIN_INLINE + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); ++#endif ++ + if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb)) + hlen += VLAN_HLEN; + break; + case MLX5_INLINE_MODE_IP: + hlen = mlx5e_skb_l3_header_offset(skb); ++ if (unlikely(hlen < ETH_HLEN + sizeof(struct iphdr))) ++ hlen = MLX5E_MIN_INLINE + sizeof(struct ipv6hdr); + break; + case MLX5_INLINE_MODE_L2: + default: +@@ -199,7 +306,11 @@ static inline void mlx5e_insert_vlan(voi + int cpy1_sz = 2 * ETH_ALEN; + int cpy2_sz = ihs - cpy1_sz; + ++#ifdef HAVE_VLAN_ETHHDR_HAS_ADDRS + memcpy(&vhdr->addrs, skb->data, cpy1_sz); ++#else ++ memcpy(vhdr, skb->data, cpy1_sz); ++#endif + vhdr->h_vlan_proto = skb->vlan_proto; + vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb)); + 
memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); +@@ -210,8 +321,14 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e + struct mlx5e_accel_tx_state *accel, + struct mlx5_wqe_eth_seg *eseg) + { ++#ifdef CONFIG_MLX5_EN_IPSEC ++#ifdef HAVE_XFRM_OFFLOAD_INNER_IPPROTO + if (unlikely(mlx5e_ipsec_txwqe_build_eseg_csum(sq, skb, eseg))) ++#else ++ if (unlikely(mlx5e_ipsec_txwqe_build_eseg_csum(sq, skb, &accel->ipsec ,eseg))) ++#endif + return; ++#endif + + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; +@@ -239,13 +356,19 @@ mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq + u16 ihs; + + if (skb->encapsulation) { ++#ifdef HAVE_SKB_INNER_TRANSPORT_OFFSET + ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); ++#else ++ ihs = skb_inner_transport_header(skb) - skb->data + inner_tcp_hdrlen(skb); ++#endif + stats->tso_inner_packets++; + stats->tso_inner_bytes += skb->len - ihs; + } else { ++#ifdef HAVE_NETIF_F_GSO_UDP_L4 + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + ihs = skb_transport_offset(skb) + sizeof(struct udphdr); + else ++#endif + ihs = skb_transport_offset(skb) + tcp_hdrlen(skb); + stats->tso_packets++; + stats->tso_bytes += skb->len - ihs; +@@ -420,6 +543,14 @@ static void mlx5e_tx_check_stop(struct m + } + } + ++static inline bool mlx5e_is_skb_driver_xmit_more(struct sk_buff *skb, ++ struct mlx5e_txqsq *sq) ++{ ++ if (test_bit(MLX5E_SQ_STATE_SKB_XMIT_MORE, &sq->state)) ++ return skb->cb[47] & MLX5_XMIT_MORE_SKB_CB; ++ return false; ++} ++ + static inline void + mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, + const struct mlx5e_tx_attr *attr, +@@ -438,6 +569,8 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq + .num_fifo_pkts = 0, + }; + ++ xmit_more = xmit_more || mlx5e_is_skb_driver_xmit_more(skb, sq); ++ + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | attr->opcode); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | wqe_attr->ds_cnt); + +@@ -459,6 +592,7 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq + } + + send_doorbell = __netdev_tx_sent_queue(sq->txq, attr->num_bytes, xmit_more); ++ + if (send_doorbell) + mlx5e_notify_hw(wq, sq->pc, sq->uar_map, cseg); + } +@@ -476,7 +610,9 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq + struct mlx5e_sq_stats *stats = sq->stats; + int num_dma; + ++#if defined(HAVE_SK_BUFF_XMIT_MORE) || defined(HAVE_NETDEV_XMIT_MORE) + stats->xmit_more += xmit_more; ++#endif + + /* fill wqe */ + wi = &sq->db.wqe_info[pi]; +@@ -628,7 +764,9 @@ mlx5e_sq_xmit_mpwqe(struct mlx5e_txqsq * + mlx5e_tx_mpwqe_session_start(sq, eseg); + } + ++#if defined(HAVE_SK_BUFF_XMIT_MORE) || defined(HAVE_NETDEV_XMIT_MORE) + sq->stats->xmit_more += xmit_more; ++#endif + + txd.data = skb->data; + txd.len = skb->len; +@@ -644,6 +782,8 @@ mlx5e_sq_xmit_mpwqe(struct mlx5e_txqsq * + + mlx5e_tx_skb_update_hwts_flags(skb); + ++ xmit_more = xmit_more || mlx5e_is_skb_driver_xmit_more(skb, sq); ++ + if (unlikely(mlx5e_tx_mpwqe_is_full(&sq->mpwqe, sq->max_sq_mpw_wqebbs))) { + /* Might stop the queue and affect the retval of __netdev_tx_sent_queue. 
*/ + cseg = mlx5e_tx_mpwqe_session_complete(sq); +@@ -685,7 +825,12 @@ static void mlx5e_txwqe_build_eseg(struc + struct sk_buff *skb, struct mlx5e_accel_tx_state *accel, + struct mlx5_wqe_eth_seg *eseg, u16 ihs) + { +- mlx5e_accel_tx_eseg(priv, skb, eseg, ihs); ++#if !defined(HAVE_XFRM_OFFLOAD_INNER_IPPROTO) && defined(CONFIG_MLX5_EN_IPSEC) ++ mlx5e_accel_tx_eseg(priv, skb, eseg, &accel->ipsec, ihs); ++#else ++ mlx5e_accel_tx_eseg(priv, skb, eseg, ihs); ++#endif ++ + mlx5e_txwqe_build_eseg_csum(sq, skb, accel, eseg); + if (unlikely(sq->ptpsq)) + mlx5e_cqe_ts_id_eseg(sq, skb, eseg); +@@ -725,7 +870,13 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *s + struct mlx5_wqe_eth_seg eseg = {}; + + mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &eseg, attr.ihs); ++#ifdef HAVE_NETDEV_XMIT_MORE + mlx5e_sq_xmit_mpwqe(sq, skb, &eseg, netdev_xmit_more()); ++#elif defined(HAVE_SK_BUFF_XMIT_MORE) ++ mlx5e_sq_xmit_mpwqe(sq, skb, &eseg, skb->xmit_more); ++#else ++ mlx5e_sq_xmit_mpwqe(sq, skb, &eseg, false); ++#endif + return NETDEV_TX_OK; + } + +@@ -740,7 +891,13 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *s + mlx5e_accel_tx_finish(sq, wqe, &accel, + (struct mlx5_wqe_inline_seg *)(wqe->data + wqe_attr.ds_cnt_inl)); + mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &wqe->eth, attr.ihs); ++#ifdef HAVE_NETDEV_XMIT_MORE + mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, netdev_xmit_more()); ++#elif defined(HAVE_SK_BUFF_XMIT_MORE) ++ mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, skb->xmit_more); ++#else ++ mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, false); ++#endif + + return NETDEV_TX_OK; + } +@@ -787,7 +944,11 @@ static void mlx5e_consume_skb(struct mlx + skb_tstamp_tx(skb, &hwts); + } + ++#ifdef HAVE_NAPI_CONSUME_SKB + napi_consume_skb(skb, napi_budget); ++#else ++ dev_kfree_skb(skb); ++#endif + } + + static void mlx5e_tx_wi_consume_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi, +@@ -842,6 +1003,9 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *c + bool last_wqe; + u16 ci; + ++ if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) ++ mlx5e_decompress_cqes(sq, &cq->wq); ++ + mlx5_cqwq_pop(&cq->wq); + + wqe_counter = be16_to_cpu(cqe->wqe_counter); +@@ -862,7 +1026,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *c + nbytes += wi->num_bytes; + continue; + } +- + if (unlikely(mlx5e_ktls_tx_try_handle_resync_dump_comp(sq, wi, + &dma_fifo_cc))) + continue; +@@ -996,7 +1159,11 @@ static void mlx5i_sq_calc_wqe_attr(struc + } + + void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, ++#if defined(HAVE_SK_BUFF_XMIT_MORE) || defined(HAVE_NETDEV_XMIT_MORE) + struct mlx5_av *av, u32 dqpn, u32 dqkey, bool xmit_more) ++#else ++ struct mlx5_av *av, u32 dqpn, u32 dqkey) ++#endif + { + struct mlx5e_tx_wqe_attr wqe_attr; + struct mlx5e_tx_attr attr; +@@ -1018,7 +1185,9 @@ void mlx5i_sq_xmit(struct mlx5e_txqsq *s + pi = mlx5e_txqsq_get_next_pi(sq, wqe_attr.num_wqebbs); + wqe = MLX5I_SQ_FETCH_WQE(sq, pi); + ++#if defined(HAVE_SK_BUFF_XMIT_MORE) || defined(HAVE_NETDEV_XMIT_MORE) + stats->xmit_more += xmit_more; ++#endif + + /* fill wqe */ + wi = &sq->db.wqe_info[pi]; +@@ -1044,7 +1213,11 @@ void mlx5i_sq_xmit(struct mlx5e_txqsq *s + if (unlikely(num_dma < 0)) + goto err_drop; + ++#if defined(HAVE_SK_BUFF_XMIT_MORE) || defined(HAVE_NETDEV_XMIT_MORE) + mlx5e_txwqe_complete(sq, skb, &attr, &wqe_attr, num_dma, wi, cseg, xmit_more); ++#else ++ mlx5e_txwqe_complete(sq, skb, &attr, &wqe_attr, num_dma, wi, cseg, false); ++#endif + + sq->dim_obj.sample.pkt_ctr = sq->stats->packets; + sq->dim_obj.sample.byte_ctr = 
sq->stats->bytes; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch b/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch new file mode 100644 index 0000000..f809e4a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0274-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch @@ -0,0 +1,37 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c + +Change-Id: I73edb5d1431307e4a705b0db40e5283f51aa8246 +--- + .../net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c +@@ -55,7 +55,12 @@ static int mlx5_sf_dev_probe(struct auxi + mlx5_core_warn(mdev, "mlx5_init_one err=%d\n", err); + goto init_one_err; + } ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + devlink_register(devlink); ++#endif ++#if defined(HAVE_DEVLINK_RELOAD_ENABLE) && !defined(HAVE_DEVLINK_SET_FEATURES) ++ devlink_reload_enable(devlink); ++#endif + return 0; + + init_one_err: +@@ -73,7 +78,12 @@ static void mlx5_sf_dev_remove(struct au + struct devlink *devlink = priv_to_devlink(sf_dev->mdev); + + set_bit(MLX5_INTERFACE_STATE_TEARDOWN, &sf_dev->mdev->intf_state); ++#if defined(HAVE_DEVLINK_RELOAD_DISABLE) && !defined(HAVE_DEVLINK_SET_FEATURES) ++ devlink_reload_disable(devlink); ++#endif ++#ifdef HAVE_DEVLINK_REGISTER_GET_1_PARAMS + devlink_unregister(devlink); ++#endif + mlx5_uninit_one(sf_dev->mdev); + + /* health work might still be active, and it needs pci bar in diff --git a/src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..d25232f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,33 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c + +Change-Id: Idfcb728a5a2940144a03b8bf9b1ae3b81d6f552b +--- + .../net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c +@@ -1,12 +1,21 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + // Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + #include "en/tc_priv.h" ++#elif defined(CONFIG_MLX5_ESWITCH) ++#include "en_tc.h" ++#endif + #include "post_meter.h" ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + #include "en/tc/post_act.h" ++#endif + ++ ++#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + #define MLX5_PACKET_COLOR_BITS MLX5_REG_MAPPING_MBITS(PACKET_COLOR_TO_REG) + #define MLX5_PACKET_COLOR_MASK MLX5_REG_MAPPING_MASK(PACKET_COLOR_TO_REG) ++#endif + + struct mlx5e_post_meter_priv { + struct mlx5_flow_table *ft; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..5d6fdd1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0275-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,25 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h + +Change-Id: I4fa99c39e973f9d98e30be1c9ef68e7ffda33fe8 +--- + drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h +@@ -27,10 +27,14 @@ void mlx5_esw_ipsec_release(struct mlx5_ + void mlx5_esw_ipsec_full_offload_get_stats(struct mlx5_eswitch *esw, void *ipsec_stats); + static inline int mlx5_is_ipsec_full_offload(struct mlx5e_priv *priv) + { ++#ifdef CONFIG_MLX5_ESWITCH + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + return esw && (mlx5_eswitch_mode(priv->mdev) == MLX5_ESWITCH_OFFLOADS) && + (esw->offloads.ipsec == DEVLINK_ESWITCH_IPSEC_MODE_FULL); ++#else ++ return 0; ++#endif + } + + #else /* CONFIG_MLX5_EN_IPSEC */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..3b94d35 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,19 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h + +Change-Id: I4ebbcbe018f9d8498b762f352b3381041f085fae +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h +@@ -4,6 +4,7 @@ + #ifndef __MLX5_EN_POST_METER_H__ + #define __MLX5_EN_POST_METER_H__ + ++ + #define packet_color_to_reg { \ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_5, \ + .moffset = 0, \ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch b/src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch new file mode 100644 index 0000000..413829a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0276-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch @@ -0,0 +1,199 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c + +Change-Id: I16d1870660bdd48baa6c1e7242c23a00f07a3f9d +--- + .../net/ethernet/mellanox/mlx5/core/lag/mp.c | 74 ++++++++++++++++++- + 1 file changed, 71 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c ++++ 
b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c +@@ -2,13 +2,21 @@ + /* Copyright (c) 2019 Mellanox Technologies. */ + + #include ++#ifdef HAVE_FIB_INFO_NH + #include +-#include "lag/lag.h" +-#include "lag/mp.h" ++#endif + #include "mlx5_core.h" + #include "eswitch.h" + #include "lib/mlx5.h" + ++#if defined(MLX_USE_LAG_COMPAT) || defined(HAVE_LAG_TX_TYPE) ++#define MLX_LAG_SUPPORTED ++#endif ++ ++#ifdef MLX_LAG_SUPPORTED ++#include "lag.h" ++ ++#ifdef HAVE_FIB_NH_NOTIFIER_INFO + static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev) + { + return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH); +@@ -36,6 +44,7 @@ bool mlx5_lag_is_multipath(struct mlx5_c + + return res; + } ++#endif + + /** + * mlx5_lag_set_port_affinity +@@ -47,6 +56,7 @@ bool mlx5_lag_is_multipath(struct mlx5_c + * 2 - set affinity to port 2. + * + **/ ++#ifdef HAVE_FIB_NH_NOTIFIER_INFO + static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, + enum mlx5_lag_port_affinity port) + { +@@ -141,11 +151,23 @@ static void mlx5_lag_fib_route_event(str + return; + + /* Handle add/replace event */ ++#ifdef HAVE_FIB_INFO_NH + nhs = fib_info_num_path(fi); ++#else ++ nhs = fi->fib_nhs; ++#endif + if (nhs == 1) { + if (__mlx5_lag_is_active(ldev)) { ++#ifdef HAVE_FIB_INFO_NH + struct fib_nh *nh = fib_info_nh(fi, 0); + struct net_device *nh_dev = nh->fib_nh_dev; ++#else ++#ifdef HAVE_FIB_NH_DEV ++ struct net_device *nh_dev = fi->fib_nh[0].fib_nh_dev; ++#else ++ struct net_device *nh_dev = fi->fib_nh[0].nh_dev; ++#endif ++#endif + int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev); + + if (i < 0) +@@ -163,12 +185,28 @@ static void mlx5_lag_fib_route_event(str + return; + + /* Verify next hops are ports of the same hca */ ++#ifdef HAVE_FIB_INFO_NH + fib_nh0 = fib_info_nh(fi, 0); + fib_nh1 = fib_info_nh(fi, 1); + if (!(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev && + fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev) && + !(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev && + fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev)) { ++#else ++ fib_nh0 = &fi->fib_nh[0]; ++ fib_nh1 = &fi->fib_nh[1]; ++#ifdef HAVE_FIB_NH_DEV ++ if (!(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev && ++ fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev) && ++ !(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev && ++ fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev)) { ++#else ++ if (!(fib_nh0->nh_dev == ldev->pf[MLX5_LAG_P1].netdev && ++ fib_nh1->nh_dev == ldev->pf[MLX5_LAG_P2].netdev) && ++ !(fib_nh0->nh_dev == ldev->pf[MLX5_LAG_P2].netdev && ++ fib_nh1->nh_dev == ldev->pf[MLX5_LAG_P1].netdev)) { ++#endif ++#endif + mlx5_core_warn(ldev->pf[MLX5_LAG_P1].dev, + "Multipath offload require two ports of the same HCA\n"); + return; +@@ -202,14 +240,22 @@ static void mlx5_lag_fib_nexthop_event(s + + /* nh added/removed */ + if (event == FIB_EVENT_NH_DEL) { ++#ifdef HAVE_FIB_NH_DEV + int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->fib_nh_dev); ++#else ++ int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->nh_dev); ++#endif + + if (i >= 0) { + i = (i + 1) % 2 + 1; /* peer port */ + mlx5_lag_set_port_affinity(ldev, i); + } + } else if (event == FIB_EVENT_NH_ADD && ++#ifdef HAVE_FIB_INFO_NH + fib_info_num_path(fi) == 2) { ++#else ++ fi->fib_nhs == 2) { ++#endif + mlx5_lag_set_port_affinity(ldev, MLX5_LAG_NORMAL_AFFINITY); + } + } +@@ -225,6 +271,8 @@ static void mlx5_lag_fib_update(struct w + rtnl_lock(); + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: ++ case FIB_EVENT_ENTRY_APPEND: /* fall through */ ++ case 
FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + mlx5_lag_fib_route_event(ldev, fib_work->event, + &fib_work->fen_info); +@@ -271,8 +319,10 @@ static int mlx5_lag_fib_event(struct not + struct mlx5_fib_event_work *fib_work; + struct fib_entry_notifier_info *fen_info; + struct fib_nh_notifier_info *fnh_info; +- struct net_device *fib_dev; + struct fib_info *fi; ++#ifdef HAVE_FIB_INFO_NH ++ struct net_device *fib_dev; ++#endif + + if (info->family != AF_INET) + return NOTIFY_DONE; +@@ -282,15 +332,22 @@ static int mlx5_lag_fib_event(struct not + + switch (event) { + case FIB_EVENT_ENTRY_REPLACE: ++ case FIB_EVENT_ENTRY_APPEND: /* fall through */ ++ case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + fen_info = container_of(info, struct fib_entry_notifier_info, + info); + fi = fen_info->fi; ++#ifdef HAVE_FIB_INFO_NH + if (fi->nh) + return NOTIFY_DONE; + fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev; + if (fib_dev != ldev->pf[MLX5_LAG_P1].netdev && + fib_dev != ldev->pf[MLX5_LAG_P2].netdev) { ++#else ++ if (fi->fib_dev != ldev->pf[MLX5_LAG_P1].netdev && ++ fi->fib_dev != ldev->pf[MLX5_LAG_P2].netdev) { ++#endif + return NOTIFY_DONE; + } + fib_work = mlx5_lag_init_fib_work(ldev, event); +@@ -347,8 +404,13 @@ int mlx5_lag_mp_init(struct mlx5_lag *ld + return -ENOMEM; + + mp->fib_nb.notifier_call = mlx5_lag_fib_event; ++#ifdef HAVE_REGISTER_FIB_NOTIFIER_HAS_4_PARAMS + err = register_fib_notifier(&init_net, &mp->fib_nb, + mlx5_lag_fib_event_flush, NULL); ++#else ++ err = register_fib_notifier(&mp->fib_nb, ++ mlx5_lag_fib_event_flush); ++#endif + if (err) { + destroy_workqueue(mp->wq); + mp->fib_nb.notifier_call = NULL; +@@ -364,8 +426,14 @@ void mlx5_lag_mp_cleanup(struct mlx5_lag + if (!mp->fib_nb.notifier_call) + return; + ++#ifdef HAVE_REGISTER_FIB_NOTIFIER_HAS_4_PARAMS + unregister_fib_notifier(&init_net, &mp->fib_nb); ++#else ++ unregister_fib_notifier(&mp->fib_nb); ++#endif + destroy_workqueue(mp->wq); + mp->fib_nb.notifier_call = NULL; + mp->fib.mfi = NULL; + } ++#endif /* HAVE_FIB_NH_NOTIFIER_INFO */ ++#endif /* MLX_LAG_SUPPORTED */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..65992fc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,50 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c + +Change-Id: If47db432fd762dcbe3e7b7690136b13d8e980741 +--- + .../net/ethernet/mellanox/mlx5/core/en/tc/int_port.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c +@@ -30,8 +30,12 @@ struct mlx5e_tc_int_port_priv { + + bool mlx5e_tc_int_port_supported(const struct mlx5_eswitch *esw) + { ++#ifdef HAVE_FLOW_BLOCK_OFFLOAD + return mlx5_eswitch_vport_match_metadata_enabled(esw) && + MLX5_CAP_GEN(esw->dev, reg_c_preserve); ++#else ++ return false; ++#endif + } + + u32 mlx5e_tc_int_port_get_metadata(struct mlx5e_tc_int_port *int_port) +@@ -291,7 +295,7 @@ mlx5e_int_port_remove(struct mlx5e_tc_in + mlx5_del_flow_rules(int_port->rx_rule); + mapping_remove(ctx, int_port->mapping); + mlx5e_int_port_metadata_free(priv, int_port->match_metadata); +- kfree_rcu(int_port); ++ kfree_rcu(int_port, 
rcu_head); + priv->num_ports--; + } + +@@ -493,12 +497,16 @@ mlx5e_tc_int_port_dev_fwd(struct mlx5e_t + + if (fwd_type == MLX5E_TC_INT_PORT_INGRESS) { + skb->pkt_type = PACKET_HOST; ++#ifdef HAVE_SKB_SET_REDIRECTED + skb_set_redirected(skb, true); ++#endif + *forward_tx = false; + } else { + skb_reset_network_header(skb); + skb_push_rcsum(skb, skb->mac_len); ++#ifdef HAVE_SKB_SET_REDIRECTED + skb_set_redirected(skb, false); ++#endif + *forward_tx = true; + } + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..5781b5d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,19 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c + +Change-Id: Ib3175f39254faa004772929db4998fc63d21b05a +--- + drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +@@ -1113,7 +1113,7 @@ int mlx5_esw_devlink_rate_parent_set(str + return mlx5_esw_qos_vport_update_group(vport->dev->priv.eswitch, vport, group, extack); + } + +-#endif ++#endif /* HAVE_DEVLINK_HAS_RATE_FUNCTIONS */ + + int mlx5_esw_qos_vport_update_group(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, diff --git a/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch new file mode 100644 index 0000000..fd59251 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch @@ -0,0 +1,20 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h + +Change-Id: Ib0707276f568cd7d5e1bfc929cd440962aaccb49 +--- + drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h +@@ -24,8 +24,7 @@ struct lag_mp { + struct workqueue_struct *wq; + }; + +-#ifdef CONFIG_MLX5_ESWITCH +- ++#if defined(CONFIG_MLX5_ESWITCH) && defined(HAVE_FIB_NH_NOTIFIER_INFO) + void mlx5_lag_mp_reset(struct mlx5_lag *ldev); + int mlx5_lag_mp_init(struct mlx5_lag *ldev); + void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-pci.patch b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-pci.patch new file mode 100644 index 0000000..40fdf0e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0277-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-pci.patch @@ -0,0 +1,156 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c + +Change-Id: I520b477de3abbafeac059e5a5d63857b672ee60a +--- + .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 59 +++++++++++++++++-- + 1 file changed, 55 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +@@ -135,7 +135,11 @@ static void irq_release(struct mlx5_irq + * before calling it. 
This is why there is asymmetry with set_rmap + * which should be called after alloc_irq but before request_irq. + */ ++#ifdef HAVE_IRQ_UPDATE_AFFINITY_HINT + irq_update_affinity_hint(irq->irqn, NULL); ++#else ++ irq_set_affinity_hint(irq->irqn, NULL); ++#endif + free_cpumask_var(irq->mask); + free_irq(irq->irqn, &irq->nh); + kfree(irq); +@@ -211,6 +215,10 @@ static void irq_set_name(struct mlx5_irq + struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, + const struct cpumask *affinity) + { ++#ifndef HAVE_PCI_IRQ_API ++ struct mlx5_priv *priv = &pool->dev->priv; ++ struct msix_entry *msix; ++#endif + struct mlx5_core_dev *dev = pool->dev; + char name[MLX5_MAX_IRQ_NAME]; + struct mlx5_irq *irq; +@@ -219,7 +227,12 @@ struct mlx5_irq *mlx5_irq_alloc(struct m + irq = kzalloc(sizeof(*irq), GFP_KERNEL); + if (!irq) + return ERR_PTR(-ENOMEM); ++#ifdef HAVE_PCI_IRQ_API + irq->irqn = pci_irq_vector(dev->pdev, i); ++#else ++ msix = priv->msix_arr; ++ irq->irqn = msix[i].vector; ++#endif + if (!mlx5_irq_pool_is_sf_pool(pool)) + irq_set_name(pool, name, i); + else +@@ -240,7 +253,11 @@ struct mlx5_irq *mlx5_irq_alloc(struct m + } + if (affinity) { + cpumask_copy(irq->mask, affinity); ++#ifdef HAVE_IRQ_UPDATE_AFFINITY_HINT + irq_set_affinity_and_hint(irq->irqn, irq->mask); ++#else ++ irq_set_affinity_hint(irq->irqn, irq->mask); ++#endif + } + irq->pool = pool; + irq->refcount = 1; +@@ -253,7 +270,11 @@ struct mlx5_irq *mlx5_irq_alloc(struct m + } + return irq; + err_xa: ++#ifdef HAVE_IRQ_UPDATE_AFFINITY_HINT + irq_update_affinity_hint(irq->irqn, NULL); ++#else ++ irq_set_affinity_hint(irq->irqn, irq->mask); ++#endif + free_cpumask_var(irq->mask); + err_cpumask: + free_irq(irq->irqn, &irq->nh); +@@ -470,7 +491,7 @@ int mlx5_irqs_request_vectors(struct mlx + struct mlx5_irq **irqs) + { + cpumask_var_t req_mask; +- struct mlx5_irq *irq; ++ struct mlx5_irq *irq = NULL; + int i; + + if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) +@@ -556,10 +577,12 @@ int mlx5_irqs_request_mask(struct mlx5_c + return i; + } + irqs[i] = irq; ++#ifdef HAVE_PCI_IRQ_API + mlx5_core_dbg(dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", + pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), + cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), + mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); ++#endif + } + return i; + } +@@ -718,6 +741,10 @@ int mlx5_irq_table_create(struct mlx5_co + int total_vec; + int pf_vec; + int err; ++#ifndef HAVE_PCI_IRQ_API ++ struct mlx5_priv* priv = &dev->priv; ++ int i; ++#endif + + if (mlx5_core_is_sf(dev)) + return 0; +@@ -738,16 +765,34 @@ int mlx5_irq_table_create(struct mlx5_co + if (mlx5_sf_max_functions(dev)) + total_vec += MLX5_IRQ_CTRL_SF_MAX + + MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev); +- ++#ifndef HAVE_PCI_IRQ_API ++ priv->msix_arr = kcalloc(total_vec, sizeof(*priv->msix_arr), GFP_KERNEL); ++ if (!priv->msix_arr) ++ return -ENOMEM; ++ ++ for (i = 0; i < total_vec; i++) ++ priv->msix_arr[i].entry = i; ++#endif ++#ifdef HAVE_PCI_IRQ_API + total_vec = pci_alloc_irq_vectors(dev->pdev, 1, total_vec, PCI_IRQ_MSIX); ++#else /* HAVE_PCI_IRQ_API */ ++ total_vec = pci_enable_msix_range(dev->pdev, priv->msix_arr, ++ MLX5_PF_IRQ_CTRL_NUM + 1, total_vec); ++#endif /* HAVE_PCI_IRQ_API */ + if (total_vec < 0) + return total_vec; ++ + pf_vec = min(pf_vec, total_vec); + + err = irq_pools_init(dev, total_vec - pf_vec, pf_vec); +- if (err) ++ if (err) { ++#ifdef HAVE_PCI_IRQ_API + pci_free_irq_vectors(dev->pdev); +- ++#else ++ pci_disable_msix(dev->pdev); ++ 
kfree(priv->msix_arr); ++#endif ++ } + return err; + } + +@@ -762,7 +807,13 @@ void mlx5_irq_table_destroy(struct mlx5_ + * to here. Hence, making sure all the irqs are released. + */ + irq_pools_destroy(table); +- pci_free_irq_vectors(dev->pdev); ++#ifdef HAVE_PCI_IRQ_API ++ pci_free_irq_vectors(dev->pdev); ++#else ++ pci_disable_msix(dev->pdev); ++ kfree(dev->priv.msix_arr); ++#endif ++ + } + + bool mlx5_irq_table_have_dedicated_sfs_irqs(struct mlx5_irq_table *table) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch b/src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch new file mode 100644 index 0000000..b510d89 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-esw.patch @@ -0,0 +1,18 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h + +Change-Id: I68012885237022db11582b6049878c6b47fb2141 +--- + drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h +@@ -79,6 +79,6 @@ int mlx5_esw_devlink_rate_parent_set(str + + #endif /* HAVE_DEVLINK_HAS_RATE_FUNCTIONS */ + +-#endif ++#endif /* CONFIG_MLX5_ESWITCH */ + + #endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch b/src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch new file mode 100644 index 0000000..03be45a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0278-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch @@ -0,0 +1,39 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c + +Change-Id: Ibcc893ccff6a13435e24c761e1c4c426ec118095 +--- + drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + /* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
*/ + ++#ifdef HAVE_INFO_HASH_TYPE + #include + #include "lag.h" + +@@ -268,12 +269,14 @@ static int mlx5_lag_set_definer(u32 *mat + MLX5_SET_TO_ONES(match_definer_format_0, match_definer_mask, + outer_smac_15_0); + ++#ifdef HAVE_NETDEV_LAG_HASH_VLAN_SRCMAC + if (hash == NETDEV_LAG_HASH_VLAN_SRCMAC) { + MLX5_SET_TO_ONES(match_definer_format_0, + match_definer_mask, + outer_first_vlan_vid); + break; + } ++#endif + + MLX5_SET_TO_ONES(match_definer_format_0, match_definer_mask, + outer_ethertype); +@@ -639,3 +642,4 @@ void mlx5_lag_port_sel_destroy(struct ml + mlx5_lag_destroy_definers(ldev); + memset(port_sel, 0, sizeof(*port_sel)); + } ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-infiniband-core-rw.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-infiniband-core-rw.c.patch new file mode 100644 index 0000000..6881c11 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-infiniband-core-rw.c.patch @@ -0,0 +1,249 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/core/rw.c + +Change-Id: Ifffbd0165bed7e78da8adfe74e422e659284f81a +--- + drivers/infiniband/core/rw.c | 106 ++++++++++++++++++++++++++++++++++- + 1 file changed, 105 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/core/rw.c ++++ b/drivers/infiniband/core/rw.c +@@ -2,7 +2,9 @@ + /* + * Copyright (c) 2016 HGST, a Western Digital Company. + */ ++#ifdef HAVE_NET_MEMREMAP_H + #include ++#endif + #include + #include + #include +@@ -275,6 +277,7 @@ static int rdma_rw_init_single_wr(struct + return 1; + } + ++#ifndef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED + static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg, + u32 sg_cnt, enum dma_data_direction dir) + { +@@ -284,23 +287,42 @@ static void rdma_rw_unmap_sg(struct ib_d + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + } + ++#ifdef HAVE_DMA_MAP_SGTABLE + static int rdma_rw_map_sgtable(struct ib_device *dev, struct sg_table *sgt, + enum dma_data_direction dir) ++#else ++static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg, ++ u32 sg_cnt, enum dma_data_direction dir) ++#endif + { ++#ifdef HAVE_DMA_MAP_SGTABLE + int nents; + + if (is_pci_p2pdma_page(sg_page(sgt->sgl))) { ++#else ++ if (is_pci_p2pdma_page(sg_page(sg))) { ++#endif + if (WARN_ON_ONCE(ib_uses_virt_dma(dev))) + return 0; ++#ifdef HAVE_DMA_MAP_SGTABLE + nents = pci_p2pdma_map_sg(dev->dma_device, sgt->sgl, + sgt->orig_nents, dir); + if (!nents) + return -EIO; + sgt->nents = nents; + return 0; ++#else ++ return pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir); ++#endif ++ + } ++#ifdef HAVE_DMA_MAP_SGTABLE + return ib_dma_map_sgtable_attrs(dev, sgt, dir, 0); ++#else ++ return ib_dma_map_sg(dev, sg, sg_cnt, dir); ++#endif + } ++#endif/* HAVE_DMA_F_PCI_P2PDMA_SUPPORTED */ + + /** + * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context +@@ -322,16 +344,29 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx + u64 remote_addr, u32 rkey, enum dma_data_direction dir) + { + struct ib_device *dev = qp->pd->device; ++#ifdef HAVE_DMA_MAP_SGTABLE + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, + }; ++#endif + int ret; + ++#ifdef HAVE_DMA_MAP_SGTABLE ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0); ++#else + ret = rdma_rw_map_sgtable(dev, &sgt, dir); ++#endif + if (ret) + return ret; + sg_cnt = sgt.nents; ++#else ++ ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir); ++ if (!ret) ++ return -ENOMEM; ++ sg_cnt = ret; ++#endif + + /* + * Skip to the S/G entry 
that sg_offset falls into: +@@ -367,7 +402,15 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx + return ret; + + out_unmap_sg: ++#ifdef HAVE_DMA_MAP_SGTABLE ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0); ++#else + rdma_rw_unmap_sg(dev, sgt.sgl, sgt.orig_nents, dir); ++#endif ++#else ++ rdma_rw_unmap_sg(dev, sg, sg_cnt, dir); ++#endif + return ret; + } + EXPORT_SYMBOL(rdma_rw_ctx_init); +@@ -398,6 +441,7 @@ int rdma_rw_ctx_signature_init(struct rd + struct ib_device *dev = qp->pd->device; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); ++#ifdef HAVE_DMA_MAP_SGTABLE + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, +@@ -406,6 +450,7 @@ int rdma_rw_ctx_signature_init(struct rd + .sgl = prot_sg, + .orig_nents = prot_sg_cnt, + }; ++#endif + struct ib_rdma_wr *rdma_wr; + int count = 0, ret; + +@@ -415,14 +460,39 @@ int rdma_rw_ctx_signature_init(struct rd + return -EINVAL; + } + ++#ifdef HAVE_DMA_MAP_SGTABLE ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0); ++#else + ret = rdma_rw_map_sgtable(dev, &sgt, dir); ++#endif + if (ret) + return ret; ++#else ++ ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir); ++ if (!ret) ++ return -ENOMEM; ++ sg_cnt = ret; ++#endif + + if (prot_sg_cnt) { ++#ifdef HAVE_DMA_MAP_SGTABLE ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0); ++#else + ret = rdma_rw_map_sgtable(dev, &prot_sgt, dir); ++#endif + if (ret) ++#else ++ ret = rdma_rw_map_sg(dev, prot_sg, prot_sg_cnt, dir); ++ if (!ret) { ++ ret = -ENOMEM; ++#endif + goto out_unmap_sg; ++#ifndef HAVE_DMA_MAP_SGTABLE ++ } ++ prot_sg_cnt = ret; ++#endif + } + + ctx->type = RDMA_RW_SIG_MR; +@@ -443,11 +513,20 @@ int rdma_rw_ctx_signature_init(struct rd + + memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs)); + ++#ifdef HAVE_DMA_MAP_SGTABLE + ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg, + prot_sgt.nents, NULL, SZ_4K); ++#else ++ ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg, ++ prot_sg_cnt, NULL, SZ_4K); ++#endif + if (unlikely(ret)) { ++#ifdef HAVE_DMA_MAP_SGTABLE + pr_err("failed to map PI sg (%u)\n", + sgt.nents + prot_sgt.nents); ++#else ++ pr_err("failed to map PI sg (%u)\n", sg_cnt + prot_sg_cnt); ++#endif + goto out_destroy_sig_mr; + } + +@@ -486,10 +565,27 @@ out_destroy_sig_mr: + out_free_ctx: + kfree(ctx->reg); + out_unmap_prot_sg: ++#ifdef HAVE_DMA_MAP_SGTABLE + if (prot_sgt.nents) ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0); ++#else + rdma_rw_unmap_sg(dev, prot_sgt.sgl, prot_sgt.orig_nents, dir); ++#endif ++#else ++ if (prot_sg_cnt) ++ rdma_rw_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); ++#endif + out_unmap_sg: ++#ifdef HAVE_DMA_MAP_SGTABLE ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0); ++#else + rdma_rw_unmap_sg(dev, sgt.sgl, sgt.orig_nents, dir); ++#endif ++#else ++ rdma_rw_unmap_sg(dev, sg, sg_cnt, dir); ++#endif + return ret; + } + EXPORT_SYMBOL(rdma_rw_ctx_signature_init); +@@ -621,8 +717,11 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ + BUG(); + break; + } +- ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); ++#else + rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir); ++#endif + } + EXPORT_SYMBOL(rdma_rw_ctx_destroy); + +@@ -650,8 +749,13 
@@ void rdma_rw_ctx_destroy_signature(struc + kfree(ctx->reg); + + if (prot_sg_cnt) ++#ifdef HAVE_DMA_F_PCI_P2PDMA_SUPPORTED /* forwardport */ ++ ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); ++ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); ++#else + rdma_rw_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); + rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir); ++#endif + } + EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch b/src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch new file mode 100644 index 0000000..3393661 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0279-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-lag.patch @@ -0,0 +1,33 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h + +Change-Id: I102e9442db466b5ac1d5a808303ebca1d2053601 +--- + drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h +@@ -25,8 +25,7 @@ struct mlx5_lag_port_sel { + struct mlx5_lag_ttc inner; + }; + +-#ifdef CONFIG_MLX5_ESWITCH +- ++#if defined(CONFIG_MLX5_ESWITCH) && defined(HAVE_INFO_HASH_TYPE) + int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 *ports); + void mlx5_lag_port_sel_destroy(struct mlx5_lag *ldev); + int mlx5_lag_port_sel_create(struct mlx5_lag *ldev, +@@ -34,7 +33,11 @@ int mlx5_lag_port_sel_create(struct mlx5 + + #else /* CONFIG_MLX5_ESWITCH */ + static inline int mlx5_lag_port_sel_create(struct mlx5_lag *ldev, ++#ifdef HAVE_INFO_HASH_TYPE + enum netdev_lag_hash hash_type, ++#else ++ int hash_type, ++#endif + u8 *ports) + { + return 0; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0280-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0280-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..be8ed48 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0280-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,30 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/channels.c + +Change-Id: Icc09bb0cfd65339a464c01360290902a25b3bda4 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/channels.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c +@@ -22,6 +22,7 @@ void mlx5e_channels_get_regular_rqn(stru + + bool mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn) + { ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + struct mlx5e_channel *c; + + WARN_ON(ix >= mlx5e_channels_get_num(chs)); +@@ -32,6 +33,10 @@ bool mlx5e_channels_get_xsk_rqn(struct m + + *rqn = c->xskrq.rqn; + return true; ++#else ++ return false; ++#endif ++ + } + + bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0281-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-irq.patch b/src/mlnx-ofa_kernel-5.8/backports/0281-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-irq.patch new file mode 100644 index 0000000..10538ee --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0281-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-irq.patch @@ -0,0 +1,75 
@@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c + +Change-Id: I2811c0a3010eca033558e7c4fcfbdbcc09934327 +--- + .../mellanox/mlx5/core/irq_affinity.c | 22 ++++++++++++++++++- + 1 file changed, 21 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +@@ -152,10 +152,15 @@ mlx5_irq_affinity_request(struct mlx5_ir + out: + mlx5_irq_get_locked(least_loaded_irq); + if (mlx5_irq_read_locked(least_loaded_irq) > pool->max_threshold) ++#ifdef HAVE_PCI_IRQ_API + mlx5_core_dbg(pool->dev, "IRQ %u overloaded, pool_name: %s, %u EQs on this irq\n", + pci_irq_vector(pool->dev->pdev, + mlx5_irq_get_index(least_loaded_irq)), pool->name, + mlx5_irq_read_locked(least_loaded_irq) / MLX5_EQ_REFS_PER_IRQ); ++#else ++ mlx5_core_dbg(pool->dev, "IRQ overloaded, pool_name: %s, %u EQs on this irq\n", ++ pool->name, mlx5_irq_read_locked(least_loaded_irq) / MLX5_EQ_REFS_PER_IRQ); ++#endif + unlock: + mutex_unlock(&pool->lock); + return least_loaded_irq; +@@ -166,12 +171,21 @@ void mlx5_irq_affinity_irqs_release(stru + { + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + int i; ++#ifndef HAVE_PCI_IRQ_API ++ struct mlx5_priv* priv = &dev->priv; ++#endif ++ + + for (i = 0; i < num_irqs; i++) { + int cpu = cpumask_first(mlx5_irq_get_affinity_mask(irqs[i])); + ++#ifdef HAVE_PCI_IRQ_API + synchronize_irq(pci_irq_vector(pool->dev->pdev, + mlx5_irq_get_index(irqs[i]))); ++#else ++ int index = mlx5_irq_get_index(irqs[i]); ++ synchronize_irq(priv->msix_arr[index].vector); ++#endif + if (mlx5_irq_put(irqs[i])) + if (pool->irqs_per_cpu) + cpu_put(pool, cpu); +@@ -198,7 +212,7 @@ int mlx5_irq_affinity_irqs_request_auto( + { + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + cpumask_var_t req_mask; +- struct mlx5_irq *irq; ++ struct mlx5_irq *irq = NULL; + int i = 0; + + if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) +@@ -217,10 +231,16 @@ int mlx5_irq_affinity_irqs_request_auto( + break; + irqs[i] = irq; + cpumask_clear_cpu(cpumask_first(mlx5_irq_get_affinity_mask(irq)), req_mask); ++#ifdef HAVE_PCI_IRQ_API + mlx5_core_dbg(dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", + pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), + cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), + mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); ++#else ++ mlx5_core_dbg(dev, "IRQ mapped to cpu %*pbl, %u EQs on this irq\n", ++ cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), ++ mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); ++#endif + } + free_cpumask_var(req_mask); + if (!i) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0282-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0282-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..e5a67f7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0282-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,34 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c + +Change-Id: I8c0c40f256e4af2c39779c049c31e5655ff97199 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c +@@ -548,7 +548,6 @@ void mlx5e_rx_res_channels_activate(stru + + if (!(res->features & MLX5E_RX_RES_FEATURE_XSK)) + 
continue; +- + if (!mlx5e_channels_get_xsk_rqn(chs, ix, &rqn)) + rqn = res->drop_rqn; + err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, rqn); +@@ -621,6 +620,7 @@ void mlx5e_rx_res_channels_deactivate(st + } + } + ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + int mlx5e_rx_res_xsk_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs, + unsigned int ix) + { +@@ -649,6 +649,7 @@ int mlx5e_rx_res_xsk_deactivate(struct m + res->drop_rqn, ix, err); + return err; + } ++#endif + + int mlx5e_rx_res_packet_merge_set_param(struct mlx5e_rx_res *res, + struct mlx5e_packet_merge_param *pkt_merge_param) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0283-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0283-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..39296fd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0283-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,22 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/rss.c + +Change-Id: I46ac9d85e57188b9062a65261f637aa22e2a9250 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/rss.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c +@@ -327,7 +327,11 @@ err_out: + + int mlx5e_rss_cleanup(struct mlx5e_rss *rss) + { ++#ifdef HAVE_REFCOUNT_DEC_IF_ONE + if (!refcount_dec_if_one(&rss->refcnt)) ++#else ++ if (atomic_cmpxchg(&rss->refcnt, 1, 0) != 1) ++#endif + return -EBUSY; + + mlx5e_rss_destroy_tirs(rss, false); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0284-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch b/src/mlnx-ofa_kernel-5.8/backports/0284-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch new file mode 100644 index 0000000..4ea2981 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0284-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-mlx.patch @@ -0,0 +1,66 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h + +Change-Id: I2c999637681fee9cfbab15dabff97b008acf57d7 +--- + .../ethernet/mellanox/mlx5/core/mlx5_core.h | 35 +++++-------------- + 1 file changed, 9 insertions(+), 26 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +@@ -99,30 +99,6 @@ do { \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +-static inline void mlx5_printk(struct mlx5_core_dev *dev, int level, const char *format, ...) +-{ +- struct device *device = dev->device; +- struct va_format vaf; +- va_list args; +- +- if (WARN_ONCE(level < LOGLEVEL_EMERG || level > LOGLEVEL_DEBUG, +- "Level %d is out of range, set to default level\n", level)) +- level = LOGLEVEL_DEFAULT; +- +- va_start(args, format); +- vaf.fmt = format; +- vaf.va = &args; +- +- dev_printk_emit(level, device, "%s %s: %pV", dev_driver_string(device), dev_name(device), +- &vaf); +- va_end(args); +-} +- +-#define mlx5_log(__dev, level, format, ...) 
\ +- mlx5_printk(__dev, level, "%s:%d:(pid %d): " format, \ +- __func__, __LINE__, current->pid, \ +- ##__VA_ARGS__) +- + static inline struct device *mlx5_core_dma_dev(struct mlx5_core_dev *dev) + { + return &dev->pdev->dev; +@@ -342,8 +318,11 @@ void mlx5_dm_cleanup(struct mlx5_core_de + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_enh_out_per_adj)) + +-int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw, +- struct netlink_ext_ack *extack); ++int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw ++#ifdef HAVE_NETLINK_EXT_ACK ++ , struct netlink_ext_ack *extack ++#endif ++ ); + int mlx5_fw_version_query(struct mlx5_core_dev *dev, + u32 *running_ver, u32 *stored_ver); + +@@ -462,6 +441,10 @@ void mlx5_uninit_one(struct mlx5_core_de + void mlx5_pcie_print_link_status(struct mlx5_core_dev *dev); + void mlx5_unload_one(struct mlx5_core_dev *dev); + int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery); ++#ifdef HAVE_DEVL_TRAP_GROUPS_REGISTER //forward port ++void mlx5_unload_one_devl_locked(struct mlx5_core_dev *dev); ++int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery); ++#endif + + int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0285-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch b/src/mlnx-ofa_kernel-5.8/backports/0285-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch new file mode 100644 index 0000000..926481e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0285-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch @@ -0,0 +1,27 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h + +Change-Id: I15a767cb5b4d1b58c299df416f76746a2bde13db +--- + .../net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h +@@ -11,6 +11,7 @@ + #include + #include "sf/vhca_event.h" + ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + TRACE_EVENT(mlx5_sf_vhca_event, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_vhca_state_event *event), +@@ -29,6 +30,7 @@ TRACE_EVENT(mlx5_sf_vhca_event, + __get_str(devname), __entry->hw_fn_id, + __entry->sfnum, __entry->vhca_state) + ); ++#endif /* defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) */ + + #endif /* _MLX5_SF_VHCA_TP_ */ + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0286-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch b/src/mlnx-ofa_kernel-5.8/backports/0286-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch new file mode 100644 index 0000000..4396bbf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0286-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch @@ -0,0 +1,21 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c + +Change-Id: I92cc488ad8638484666502bd8a6d876be30ac347 +--- + drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c +@@ -84,7 +84,9 @@ mlx5_vhca_event_notify(struct mlx5_core_ + vhca_state_context.vhca_state); + + mlx5_vhca_event_arm(dev, 
event->function_id); ++#if defined(HAVE_TRACE_EVENTS_H) && !defined(MLX_DISABLE_TRACEPOINTS) + trace_mlx5_sf_vhca_event(dev, event); ++#endif + + blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event); + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0287-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch b/src/mlnx-ofa_kernel-5.8/backports/0287-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch new file mode 100644 index 0000000..6eb2d81 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0287-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en_.patch @@ -0,0 +1,119 @@ +From: Roy Novich +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en_debugfs.c + +Change-Id: Ib267b896da7d29a1bb21025fb461b073957cb322 +--- + .../ethernet/mellanox/mlx5/core/en_debugfs.c | 53 +++++++++++++++++++ + 1 file changed, 53 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_debugfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_debugfs.c +@@ -35,6 +35,7 @@ + #include "en.h" + #include "en/rx_res.h" + #include "en/rss.h" ++#include "linux/namei.h" + + /* For non-default namespaces, add suffix in format "@" */ + /* PCI id format: "%04x:%02x:%02x.%d" pci_domain bus_num pci_slot pci_func */ +@@ -79,6 +80,37 @@ static void mlx5e_create_channel_debugfs + &channel->rq.cq.mcq.cqn); + } + ++#ifndef HAVE_DEBUGFS_LOOKUP ++struct dentry *mlx5e_debugfs_lookup(const char *name, struct dentry *parent) ++{ ++ struct dentry *dentry; ++ ++ if (IS_ERR(parent)) ++ return NULL; ++ ++#ifdef HAVE_INODE_LOCK ++ inode_lock(d_inode(parent)); ++#else ++ mutex_lock(&(parent->d_inode)->i_mutex); ++#endif ++ dentry = lookup_one_len(name, parent, strlen(name)); ++#ifdef HAVE_INODE_LOCK ++ inode_unlock(d_inode(parent)); ++#else ++ mutex_unlock(&(parent->d_inode)->i_mutex); ++#endif ++ ++ if (IS_ERR(dentry)) ++ return NULL; ++ if (dentry->d_inode == NULL) { ++ dput(dentry); ++ return NULL; ++ } ++ return dentry; ++} ++#endif ++ ++ + struct rx_res_debugfs { + struct mlx5e_rx_res *rx_res; + int i; +@@ -100,8 +132,13 @@ static int get_tir_indir(void *data, u64 + return 0; + } + ++#ifdef HAVE_DEBUGFS_CREATE_FILE_UNSAFE + DEFINE_DEBUGFS_ATTRIBUTE(fops_dir, get_tir_dir, NULL, "%llu\n"); + DEFINE_DEBUGFS_ATTRIBUTE(fops_indir, get_tir_indir, NULL, "%llu\n"); ++#else ++DEFINE_SIMPLE_ATTRIBUTE(fops_dir, get_tir_dir, NULL, "%llu\n"); ++DEFINE_SIMPLE_ATTRIBUTE(fops_indir, get_tir_indir, NULL, "%llu\n"); ++#endif + + void mlx5e_create_debugfs(struct mlx5e_priv *priv) + { +@@ -147,7 +184,11 @@ void mlx5e_create_debugfs(struct mlx5e_p + + rx_res_dbg->i = i; + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "indir-tirn-%d", i); ++#ifdef HAVE_DEBUGFS_CREATE_FILE_UNSAFE + debugfs_create_file_unsafe(name, 0400, priv->dfs_root, rx_res_dbg, &fops_indir); ++#else ++ debugfs_create_file(name, 0400, priv->dfs_root, rx_res_dbg, &fops_indir); ++#endif + } + + for (i = 0; i < priv->max_nch; i++) { +@@ -155,7 +196,11 @@ void mlx5e_create_debugfs(struct mlx5e_p + + rx_res_dbg->i = i; + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "dir-tirn-%d", i); ++#ifdef HAVE_DEBUGFS_CREATE_FILE_UNSAFE + debugfs_create_file_unsafe(name, 0400, priv->dfs_root, rx_res_dbg, &fops_dir); ++#else ++ debugfs_create_file(name, 0400, priv->dfs_root, rx_res_dbg, &fops_dir); ++#endif + } + + for (i = 0; i < priv->channels.num; i++) +@@ -171,7 +216,11 @@ void mlx5e_debugs_free_recursive_private + for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) { + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "indir-tirn-%d", i); + ++#ifdef 
HAVE_DEBUGFS_LOOKUP + dent = debugfs_lookup(name, priv->dfs_root); ++#else ++ dent = mlx5e_debugfs_lookup(name, priv->dfs_root); ++#endif + if (dent && dent->d_inode && dent->d_inode->i_private) + kvfree(dent->d_inode->i_private); + } +@@ -179,7 +228,11 @@ void mlx5e_debugs_free_recursive_private + for (i = 0; i < priv->max_nch; i++) { + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "dir-tirn-%d", i); + ++#ifdef HAVE_DEBUGFS_LOOKUP + dent = debugfs_lookup(name, priv->dfs_root); ++#else ++ dent = mlx5e_debugfs_lookup(name, priv->dfs_root); ++#endif + if (dent && dent->d_inode && dent->d_inode->i_private) + kvfree(dent->d_inode->i_private); + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-hw-mlx5-main_ext.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-hw-mlx5-main_ext.c.patch new file mode 100644 index 0000000..e2e85a8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-hw-mlx5-main_ext.c.patch @@ -0,0 +1,81 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/infiniband/hw/mlx5/main_ext.c + +Change-Id: I92aec53ed425a70f21f7dbe5035d7cc08653dd88 +--- + drivers/infiniband/hw/mlx5/main_ext.c | 29 ++++++++++++++++++++++++--- + 1 file changed, 26 insertions(+), 3 deletions(-) + +--- a/drivers/infiniband/hw/mlx5/main_ext.c ++++ b/drivers/infiniband/hw/mlx5/main_ext.c +@@ -73,6 +73,9 @@ static struct attribute *ttl_attrs[] = { + &ttl_attr_ttl.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(ttl); ++#endif + + static ssize_t ttl_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +@@ -99,7 +102,11 @@ static const struct sysfs_ops ttl_sysfs_ + + static struct kobj_type ttl_type = { + .sysfs_ops = &ttl_sysfs_ops, +- .default_attrs = ttl_attrs ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = ttl_groups ++#else ++ .default_attrs = ttl_attrs ++#endif + }; + + int init_ttl_sysfs(struct mlx5_ib_dev *dev) +@@ -679,6 +686,10 @@ static struct attribute *tc_attrs[] = { + NULL + }; + ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(tc); ++#endif ++ + static ssize_t tc_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) + { +@@ -710,7 +721,11 @@ static const struct sysfs_ops tc_sysfs_o + + static struct kobj_type tc_type = { + .sysfs_ops = &tc_sysfs_ops, +- .default_attrs = tc_attrs ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = tc_groups ++#else ++ .default_attrs = tc_attrs ++#endif + }; + + int init_tc_sysfs(struct mlx5_ib_dev *dev) +@@ -1379,6 +1394,10 @@ static struct attribute *dc_attrs[] = { + NULL + }; + ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(dc); ++#endif ++ + static ssize_t dc_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) + { +@@ -1410,7 +1429,11 @@ static const struct sysfs_ops dc_sysfs_o + + static struct kobj_type dc_type = { + .sysfs_ops = &dc_sysfs_ops, +- .default_attrs = dc_attrs ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = dc_groups ++#else ++ .default_attrs = dc_attrs ++#endif + }; + + static int init_sysfs(struct mlx5_ib_dev *dev) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-ulp-iser-iser_verbs.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-ulp-iser-iser_verbs.c.patch new file mode 100644 index 0000000..7dee148 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-infiniband-ulp-iser-iser_verbs.c.patch @@ -0,0 +1,22 @@ +From: Sergey Gorenko +Subject: 
[PATCH] BACKPORT: drivers/infiniband/ulp/iser/iser_verbs.c + +Change-Id: Ifbee0d6dce95279eaafb05522c88f3f4e294b5ef +--- + drivers/infiniband/ulp/iser/iser_verbs.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/infiniband/ulp/iser/iser_verbs.c ++++ b/drivers/infiniband/ulp/iser/iser_verbs.c +@@ -928,7 +928,11 @@ u8 iser_check_task_pi_status(struct iscs + sector_t sector_off = mr_status.sig_err.sig_err_offset; + + sector_div(sector_off, sector_size + 8); ++#ifdef HAVE_SCSI_GET_SECTOR + *sector = scsi_get_sector(iser_task->sc) + sector_off; ++#else ++ *sector = scsi_get_lba(iser_task->sc) + sector_off; ++#endif + + iser_err("PI error found type %d at sector %llx " + "expected %x vs actual %x\n", diff --git a/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..e4b4b3b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0288-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,67 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c + +Change-Id: Ibdb8fde20dd1828c60a8b1ca8db1a52648b68754 +--- + .../mellanox/mlx5/core/en/tc/act/police.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c +@@ -17,21 +17,30 @@ static int + fill_meter_params_from_act(const struct flow_action_entry *act, + struct mlx5e_flow_meter_params *params) + { ++#if defined(HAVE_FLOW_ACTION_POLICE_INDEX) || defined(HAVE_FLOW_ACTION_HW_INDEX) ++#ifdef HAVE_FLOW_ACTION_HW_INDEX + params->index = act->hw_index; ++#else ++ params->index = act->police.index; ++#endif + if (act->police.rate_bytes_ps) { + params->mode = MLX5_RATE_LIMIT_BPS; + /* change rate to bits per second */ + params->rate = act->police.rate_bytes_ps << 3; + params->burst = act->police.burst; ++#ifdef HAVE_FLOW_ACTION_POLICE_RATE_PKT_PS + } else if (act->police.rate_pkt_ps) { + params->mode = MLX5_RATE_LIMIT_PPS; + params->rate = act->police.rate_pkt_ps; + params->burst = act->police.burst_pkt; ++#endif + } else { + return -EOPNOTSUPP; + } +- + return 0; ++#else ++ return -EOPNOTSUPP; ++#endif + } + + static int +@@ -60,6 +69,7 @@ tc_act_is_multi_table_act_police(struct + return true; + } + ++#ifdef HAVE_FLOW_OFFLOAD_ACTION + static int + tc_act_police_offload(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act, +@@ -132,12 +142,15 @@ tc_act_police_stats(struct mlx5e_priv *p + mlx5e_tc_meter_put(meter); + return 0; + } ++#endif /* HAVE_FLOW_OFFLOAD_ACTION */ + + struct mlx5e_tc_act mlx5e_tc_act_police = { + .can_offload = tc_act_can_offload_police, + .parse_action = tc_act_parse_police, + .is_multi_table_act = tc_act_is_multi_table_act_police, ++#ifdef HAVE_FLOW_OFFLOAD_ACTION + .offload_action = tc_act_police_offload, + .destroy_action = tc_act_police_destroy, + .stats_action = tc_act_police_stats, ++#endif + }; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.c.patch new file mode 100644 index 0000000..ac81a3b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-infiniband-ulp-srp-ib_srp.c.patch @@ -0,0 +1,968 @@ +From: Sergey Gorenko +Subject: [PATCH] BACKPORT: 
drivers/infiniband/ulp/srp/ib_srp.c + +Change-Id: I2a1872c43a93aea0a0ffa9abd319d67dc829e1c5 +--- + drivers/infiniband/ulp/srp/ib_srp.c | 512 +++++++++++++++++++++++++++- + 1 file changed, 511 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/ulp/srp/ib_srp.c ++++ b/drivers/infiniband/ulp/srp/ib_srp.c +@@ -64,10 +64,30 @@ + MODULE_AUTHOR("Roland Dreier"); + MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); + MODULE_LICENSE("Dual BSD/GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + #if !defined(CONFIG_DYNAMIC_DEBUG) + #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) + #define DYNAMIC_DEBUG_BRANCH(descriptor) false ++#else ++#if defined(DEFINE_DYNAMIC_DEBUG_METADATA) && !defined(DYNAMIC_DEBUG_BRANCH) ++#ifdef DEBUG ++#define DYNAMIC_DEBUG_BRANCH(descriptor) \ ++ likely(descriptor.flags & _DPRINTK_FLAGS_PRINT) ++#else ++#define DYNAMIC_DEBUG_BRANCH(descriptor) \ ++ unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) ++#endif ++#endif ++#endif ++ ++#ifndef DEFINE_DYNAMIC_DEBUG_METADATA ++#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) ++#endif ++#ifndef DYNAMIC_DEBUG_BRANCH ++#define DYNAMIC_DEBUG_BRANCH(descriptor) false + #endif + + static unsigned int srp_sg_tablesize; +@@ -86,8 +106,13 @@ MODULE_PARM_DESC(cmd_sg_entries, + "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); + + module_param(indirect_sg_entries, uint, 0444); ++#ifdef HAVE_SG_MAX_SEGMENTS + MODULE_PARM_DESC(indirect_sg_entries, + "Default max number of gather/scatter entries (default is 12, max is " __stringify(SG_MAX_SEGMENTS) ")"); ++#else ++MODULE_PARM_DESC(indirect_sg_entries, ++ "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")"); ++#endif + + module_param(allow_ext_sg, bool, 0444); + MODULE_PARM_DESC(allow_ext_sg, +@@ -965,6 +990,7 @@ static void srp_disconnect_target(struct + } + } + ++#ifdef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV + static int srp_exit_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd) + { + struct srp_target_port *target = host_to_target(shost); +@@ -1016,6 +1042,81 @@ static int srp_init_cmd_priv(struct Scsi + out: + return ret; + } ++#else /* HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV */ ++static void srp_free_req_data(struct srp_target_port *target, ++ struct srp_rdma_ch *ch) ++{ ++ struct srp_device *dev = target->srp_host->srp_dev; ++ struct ib_device *ibdev = dev->dev; ++ struct srp_request *req; ++ int i; ++ ++ if (!ch->req_ring) ++ return; ++ ++ for (i = 0; i < target->req_ring_size; ++i) { ++ req = &ch->req_ring[i]; ++ if (dev->use_fast_reg) ++ kfree(req->fr_list); ++ if (req->indirect_dma_addr) { ++ ib_dma_unmap_single(ibdev, req->indirect_dma_addr, ++ target->indirect_size, ++ DMA_TO_DEVICE); ++ } ++ kfree(req->indirect_desc); ++ } ++ ++ kfree(ch->req_ring); ++ ch->req_ring = NULL; ++} ++ ++static int srp_alloc_req_data(struct srp_rdma_ch *ch) ++{ ++ struct srp_target_port *target = ch->target; ++ struct srp_device *srp_dev = target->srp_host->srp_dev; ++ struct ib_device *ibdev = srp_dev->dev; ++ struct srp_request *req; ++ dma_addr_t dma_addr; ++ int i, ret = -ENOMEM; ++ ++#ifndef HAVE_BLK_TAGS ++ INIT_LIST_HEAD(&ch->free_reqs); ++#endif ++ ch->req_ring = kcalloc(target->req_ring_size, sizeof(*ch->req_ring), ++ GFP_KERNEL); ++ if (!ch->req_ring) ++ goto out; ++ ++ for (i = 0; i < target->req_ring_size; ++i) { ++ req = &ch->req_ring[i]; ++ if (srp_dev->use_fast_reg) { ++ req->fr_list = kmalloc_array(target->mr_per_cmd, ++ sizeof(void *), 
GFP_KERNEL); ++ if (!req->fr_list) ++ goto out; ++ } ++ req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); ++ if (!req->indirect_desc) ++ goto out; ++ ++ dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, ++ target->indirect_size, ++ DMA_TO_DEVICE); ++ if (ib_dma_mapping_error(ibdev, dma_addr)) ++ goto out; ++ ++ req->indirect_dma_addr = dma_addr; ++#ifndef HAVE_BLK_TAGS ++ req->tag = build_srp_tag(ch - target->ch, i); ++ list_add_tail(&req->list, &ch->free_reqs); ++#endif ++ } ++ ret = 0; ++ ++out: ++ return ret; ++} ++#endif /* HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV */ + + /** + * srp_del_scsi_host_attr() - Remove attributes defined in the host template. +@@ -1026,6 +1127,7 @@ out: + */ + static void srp_del_scsi_host_attr(struct Scsi_Host *shost) + { ++#ifdef HAVE_SCSI_HOST_TEMPLATE_SHOST_GROUPS + const struct attribute_group **g; + struct attribute **attr; + +@@ -1037,6 +1139,12 @@ static void srp_del_scsi_host_attr(struc + device_remove_file(&shost->shost_dev, dev_attr); + } + } ++#else ++ struct device_attribute **attr; ++ ++ for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr) ++ device_remove_file(&shost->shost_dev, *attr); ++#endif + } + + static void srp_remove_target(struct srp_target_port *target) +@@ -1052,13 +1160,25 @@ static void srp_remove_target(struct srp + scsi_remove_host(target->scsi_host); + srp_stop_rport_timers(target->rport); + srp_disconnect_target(target); ++#ifdef HAVE_KOBJ_NS_GRAB_CURRENT_EXPORTED + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); ++#endif + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); + } + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ for (i = 0; i < target->ch_count; i++) { ++ ch = &target->ch[i]; ++ srp_free_req_data(target, ch); ++ } ++#endif ++#ifndef HAVE_BLK_TAGS ++ kfree(target->mq_map); ++ target->mq_map = NULL; ++#endif + kfree(target->ch); + target->ch = NULL; + +@@ -1262,6 +1382,9 @@ static void srp_free_req(struct srp_rdma + + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += req_lim_delta; ++#ifndef HAVE_BLK_TAGS ++ list_add_tail(&req->list, &ch->free_reqs); ++#endif + spin_unlock_irqrestore(&ch->lock, flags); + } + +@@ -1273,21 +1396,34 @@ static void srp_finish_req(struct srp_rd + if (scmnd) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = result; ++#ifdef HAVE_SCSI_DONE + scsi_done(scmnd); ++#else ++ scmnd->scsi_done(scmnd); ++#endif + } + } + ++#ifdef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV + struct srp_terminate_context { + struct srp_target_port *srp_target; + int scsi_result; + }; + ++#ifdef HAVE_SCSI_HOST_BUSY_ITER_FN_2_ARGS ++static bool srp_terminate_cmd(struct scsi_cmnd *scmnd, void *context_ptr) ++#else + static bool srp_terminate_cmd(struct scsi_cmnd *scmnd, void *context_ptr, + bool reserved) ++#endif + { + struct srp_terminate_context *context = context_ptr; + struct srp_target_port *target = context->srp_target; ++#ifdef HAVE_SCSI_CMD_TO_RQ + u32 tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd)); ++#else ++ u32 tag = blk_mq_unique_tag(scmnd->request); ++#endif + struct srp_rdma_ch *ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)]; + struct srp_request *req = scsi_cmd_priv(scmnd); + +@@ -1304,6 +1440,25 @@ static void srp_terminate_io(struct srp_ + + scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd, &context); + } ++#else ++static void srp_terminate_io(struct srp_rport *rport) ++{ ++ struct srp_target_port *target = rport->lld_data; ++ struct srp_rdma_ch 
*ch; ++ int i, j; ++ ++ for (i = 0; i < target->ch_count; i++) { ++ ch = &target->ch[i]; ++ ++ for (j = 0; j < target->req_ring_size; ++j) { ++ struct srp_request *req = &ch->req_ring[j]; ++ ++ srp_finish_req(ch, req, NULL, ++ DID_TRANSPORT_FAILFAST << 16); ++ } ++ } ++} ++#endif /* HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV */ + + /* Calculate maximum initiator to target information unit length. */ + static uint32_t srp_max_it_iu_len(int cmd_sg_cnt, bool use_imm_data, +@@ -1358,6 +1513,7 @@ static int srp_rport_reconnect(struct sr + ch = &target->ch[i]; + ret += srp_new_cm_id(ch); + } ++#ifdef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV + { + struct srp_terminate_context context = { + .srp_target = target, .scsi_result = DID_RESET << 16}; +@@ -1365,6 +1521,16 @@ static int srp_rport_reconnect(struct sr + scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd, + &context); + } ++#else ++ for (i = 0; i < target->ch_count; i++) { ++ ch = &target->ch[i]; ++ for (j = 0; j < target->req_ring_size; ++j) { ++ struct srp_request *req = &ch->req_ring[j]; ++ ++ srp_finish_req(ch, req, NULL, DID_RESET << 16); ++ } ++ } ++#endif + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + /* +@@ -1942,6 +2108,9 @@ static void srp_process_rsp(struct srp_r + struct srp_request *req; + struct scsi_cmnd *scmnd; + unsigned long flags; ++#ifndef HAVE_BLK_TAGS ++ unsigned i; ++#endif + + if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { + spin_lock_irqsave(&ch->lock, flags); +@@ -1958,11 +2127,31 @@ static void srp_process_rsp(struct srp_r + } + spin_unlock_irqrestore(&ch->lock, flags); + } else { ++#ifdef HAVE_BLK_TAGS + scmnd = scsi_host_find_tag(target->scsi_host, rsp->tag); ++#ifdef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV + if (scmnd) { + req = scsi_cmd_priv(scmnd); ++#else ++ if (scmnd && scmnd->host_scribble) { ++ req = (void *)scmnd->host_scribble; ++#endif + scmnd = srp_claim_req(ch, req, NULL, scmnd); ++#else ++ if (srp_tag_ch(rsp->tag) != ch - target->ch) ++ pr_err("Channel idx mismatch: tag %#llx <> ch %#lx\n", ++ rsp->tag, ch - target->ch); ++ i = srp_tag_idx(rsp->tag); ++ if (i < target->req_ring_size) { ++ req = &ch->req_ring[i]; ++ scmnd = srp_claim_req(ch, req, NULL, NULL); ++#endif + } else { ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ scmnd = NULL; ++ } ++ if (!scmnd) { ++#endif + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %#016llx received on ch %td / QP %#x\n", + rsp->tag, ch - target->ch, ch->qp->qp_num); +@@ -1994,7 +2183,14 @@ static void srp_process_rsp(struct srp_r + srp_free_req(ch, req, scmnd, + be32_to_cpu(rsp->req_lim_delta)); + ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ scmnd->host_scribble = NULL; ++#endif ++#ifdef HAVE_SCSI_DONE + scsi_done(scmnd); ++#else ++ scmnd->scsi_done(scmnd); ++#endif + } + } + +@@ -2056,9 +2252,10 @@ static void srp_process_aer_req(struct s + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); ++ uint64_t lun = scsilun_to_int(&req->lun); + + shost_printk(KERN_ERR, target->scsi_host, PFX +- "ignoring AER for LUN %llu\n", scsilun_to_int(&req->lun)); ++ "ignoring AER for LUN %llu\n", lun); + + if (srp_response_common(ch, delta, &rsp, sizeof(rsp))) + shost_printk(KERN_ERR, target->scsi_host, PFX +@@ -2156,39 +2353,84 @@ static void srp_handle_qp_err(struct ib_ + } + target->qp_in_error = true; + } ++#ifndef HAVE_BLK_TAGS ++static struct srp_rdma_ch *srp_map_cpu_to_ch(struct srp_target_port *target) ++{ ++ return &target->ch[target->mq_map[raw_smp_processor_id()]]; ++} ++#endif + + static int 
srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) + { ++#ifdef HAVE_BLK_TAGS ++#ifdef HAVE_SCSI_CMD_TO_RQ + struct request *rq = scsi_cmd_to_rq(scmnd); ++#else ++ struct request *rq = scmnd->request; ++#endif ++#endif + struct srp_target_port *target = host_to_target(shost); + struct srp_rdma_ch *ch; ++#ifdef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV + struct srp_request *req = scsi_cmd_priv(scmnd); ++#else ++ struct srp_request *req; ++#endif + struct srp_iu *iu; + struct srp_cmd *cmd; + struct ib_device *dev; + unsigned long flags; + u32 tag; ++#if defined(HAVE_BLK_TAGS) && !defined(HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV) ++ u16 idx; ++#endif + int len, ret; + + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; + ++#ifdef HAVE_BLK_TAGS + WARN_ON_ONCE(rq->tag < 0); + tag = blk_mq_unique_tag(rq); + ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)]; ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ idx = blk_mq_unique_tag_to_tag(tag); ++ WARN_ONCE(idx >= target->req_ring_size, "%s: tag %#x: idx %d >= %d\n", ++ dev_name(&shost->shost_gendev), tag, idx, ++ target->req_ring_size); ++#endif ++#else ++ ch = srp_map_cpu_to_ch(target); ++#endif + + spin_lock_irqsave(&ch->lock, flags); + iu = __srp_get_tx_iu(ch, SRP_IU_CMD); ++#ifdef HAVE_BLK_TAGS + spin_unlock_irqrestore(&ch->lock, flags); + + if (!iu) + goto err; + ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ req = &ch->req_ring[idx]; ++#endif ++#else ++ if (!iu) ++ goto err_unlock; ++ ++ req = list_first_entry(&ch->free_reqs, struct srp_request, list); ++ list_del(&req->list); ++ tag = req->tag; ++ spin_unlock_irqrestore(&ch->lock, flags); ++#endif + dev = target->srp_host->srp_dev->dev; + ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len, + DMA_TO_DEVICE); + ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ scmnd->host_scribble = (void *) req; ++#endif + cmd = iu->buf; + memset(cmd, 0, sizeof *cmd); + +@@ -2217,7 +2459,11 @@ static int srp_queuecommand(struct Scsi_ + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? ++#ifdef HAVE_SCSI_QUEUE_FULL ++ DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16; ++#else + DID_OK << 16 | SAM_STAT_TASK_SET_FULL : DID_ERROR << 16; ++#endif + goto err_iu; + } + +@@ -2244,9 +2490,20 @@ err_iu: + */ + req->scmnd = NULL; + ++#ifndef HAVE_BLK_TAGS ++ spin_lock_irqsave(&ch->lock, flags); ++ list_add(&req->list, &ch->free_reqs); ++ ++err_unlock: ++ spin_unlock_irqrestore(&ch->lock, flags); ++#endif + err: + if (scmnd->result) { ++#ifdef HAVE_SCSI_DONE + scsi_done(scmnd); ++#else ++ scmnd->scsi_done(scmnd); ++#endif + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; +@@ -2705,6 +2962,30 @@ static int srp_rdma_cm_handler(struct rd + return 0; + } + ++#ifdef HAVE_SCSI_HOST_TEMPLATE_CHANGE_QUEUE_TYPE ++/** ++ * srp_change_queue_type - changing device queue tag type ++ * @sdev: scsi device struct ++ * @tag_type: requested tag type ++ * ++ * Returns queue tag type. ++ */ ++static int ++srp_change_queue_type(struct scsi_device *sdev, int tag_type) ++{ ++ if (sdev->tagged_supported) { ++ scsi_set_tag_type(sdev, tag_type); ++ if (tag_type) ++ scsi_activate_tcq(sdev, sdev->queue_depth); ++ else ++ scsi_deactivate_tcq(sdev, sdev->queue_depth); ++ } else ++ tag_type = 0; ++ ++ return tag_type; ++} ++#endif ++ + /** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct +@@ -2712,13 +2993,40 @@ static int srp_rdma_cm_handler(struct rd + * + * Returns queue depth. 
+ */ ++#ifdef HAVE_SCSI_HOST_TEMPLATE_TRACK_QUEUE_DEPTH + static int + srp_change_queue_depth(struct scsi_device *sdev, int qdepth) + { + if (!sdev->tagged_supported) + qdepth = 1; ++#ifdef HAVE_SCSI_CHANGE_QUEUE_DEPTH + return scsi_change_queue_depth(sdev, qdepth); ++#else ++ scsi_adjust_queue_depth(sdev, qdepth); ++ return sdev->queue_depth; ++#endif //HAVE_SCSI_CHANGE_QUEUE_DEPTH ++} ++#else ++static int ++srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) ++{ ++ struct Scsi_Host *shost = sdev->host; ++ int max_depth; ++ if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { ++ max_depth = shost->can_queue; ++ if (!sdev->tagged_supported) ++ max_depth = 1; ++ if (qdepth > max_depth) ++ qdepth = max_depth; ++ scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); ++ } else if (reason == SCSI_QDEPTH_QFULL) ++ scsi_track_queue_full(sdev, qdepth); ++ else ++ return -EOPNOTSUPP; ++ ++ return sdev->queue_depth; + } ++#endif //HAVE_SCSI_HOST_TEMPLATE_TRACK_QUEUE_DEPTH + + static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun, + u8 func, u8 *status) +@@ -2799,8 +3107,17 @@ static int srp_abort(struct scsi_cmnd *s + + if (!req) + return SUCCESS; ++#ifdef HAVE_BLK_TAGS ++#ifdef HAVE_SCSI_CMD_TO_RQ + tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd)); ++#else ++ tag = blk_mq_unique_tag(scmnd->request); ++#endif + ch_idx = blk_mq_unique_tag_to_hwq(tag); ++#else ++ tag = req->tag; ++ ch_idx = srp_tag_ch(tag); ++#endif + if (WARN_ON_ONCE(ch_idx >= target->ch_count)) + return SUCCESS; + ch = &target->ch[ch_idx]; +@@ -2818,7 +3135,11 @@ static int srp_abort(struct scsi_cmnd *s + if (ret == SUCCESS) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; ++#ifdef HAVE_SCSI_DONE + scsi_done(scmnd); ++#else ++ scmnd->scsi_done(scmnd); ++#endif + } + + return ret; +@@ -2861,6 +3182,20 @@ static int srp_target_alloc(struct scsi_ + return 0; + } + ++#ifdef USE_SLAVE_ALLOC_HANDLER ++static int srp_slave_alloc(struct scsi_device *sdev) ++{ ++ struct Scsi_Host *shost = sdev->host; ++ struct srp_target_port *target = host_to_target(shost); ++ struct srp_device *srp_dev = target->srp_host->srp_dev; ++ ++ blk_queue_virt_boundary(sdev->request_queue, ++ ~srp_dev->mr_page_mask); ++ ++ return 0; ++} ++#endif ++ + static int srp_slave_configure(struct scsi_device *sdev) + { + struct Scsi_Host *shost = sdev->host; +@@ -3057,6 +3392,7 @@ static ssize_t allow_ext_sg_show(struct + + static DEVICE_ATTR_RO(allow_ext_sg); + ++#ifdef HAVE_SCSI_HOST_TEMPLATE_SHOST_GROUPS + static struct attribute *srp_host_attrs[] = { + &dev_attr_id_ext.attr, + &dev_attr_ioc_guid.attr, +@@ -3078,18 +3414,47 @@ static struct attribute *srp_host_attrs[ + }; + + ATTRIBUTE_GROUPS(srp_host); ++#else ++static struct device_attribute *srp_host_attrs[] = { ++ &dev_attr_id_ext, ++ &dev_attr_ioc_guid, ++ &dev_attr_service_id, ++ &dev_attr_pkey, ++ &dev_attr_sgid, ++ &dev_attr_dgid, ++ &dev_attr_orig_dgid, ++ &dev_attr_req_lim, ++ &dev_attr_zero_req_lim, ++ &dev_attr_local_ib_port, ++ &dev_attr_local_ib_device, ++ &dev_attr_ch_count, ++ &dev_attr_comp_vector, ++ &dev_attr_tl_retry_count, ++ &dev_attr_cmd_sg_entries, ++ &dev_attr_allow_ext_sg, ++ NULL ++}; ++#endif /* HAVE_SCSI_HOST_TEMPLATE_SHOST_GROUPS */ + + static struct scsi_host_template srp_template = { + .module = THIS_MODULE, + .name = "InfiniBand SRP initiator", + .proc_name = DRV_NAME, + .target_alloc = srp_target_alloc, ++#ifdef USE_SLAVE_ALLOC_HANDLER ++ .slave_alloc = srp_slave_alloc, ++#endif + .slave_configure = 
srp_slave_configure, + .info = srp_target_info, ++#ifdef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV + .init_cmd_priv = srp_init_cmd_priv, + .exit_cmd_priv = srp_exit_cmd_priv, ++#endif + .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, ++#ifdef HAVE_SCSI_HOST_TEMPLATE_CHANGE_QUEUE_TYPE ++ .change_queue_type = srp_change_queue_type, ++#endif + .eh_timed_out = srp_timed_out, + .eh_abort_handler = srp_abort, + .eh_device_reset_handler = srp_reset_device, +@@ -3099,9 +3464,26 @@ static struct scsi_host_template srp_tem + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, + .this_id = -1, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, ++#ifdef ENABLE_CLUSTERING ++ .use_clustering = ENABLE_CLUSTERING, ++#endif ++#ifdef HAVE_SCSI_HOST_TEMPLATE_SHOST_GROUPS + .shost_groups = srp_host_groups, ++#else ++ .shost_attrs = srp_host_attrs, ++#endif ++#ifdef HAVE_SCSI_HOST_TEMPLATE_USE_HOST_WIDE_TAGS ++ .use_host_wide_tags = 1, ++#endif ++#ifdef HAVE_SCSI_HOST_TEMPLATE_USE_BLK_TAGS ++ .use_blk_tags = 1, ++#endif ++#ifdef HAVE_SCSI_HOST_TEMPLATE_TRACK_QUEUE_DEPTH + .track_queue_depth = 1, ++#endif ++#ifdef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV + .cmd_size = sizeof(struct srp_request), ++#endif + }; + + static int srp_sdev_count(struct Scsi_Host *host) +@@ -3289,6 +3671,7 @@ static const match_table_t srp_opt_token + { SRP_OPT_ERR, NULL } + }; + ++#ifdef HAVE_INET_PTON_WITH_SCOPE + /** + * srp_parse_in - parse an IP address and port number combination + * @net: [in] Network namespace. +@@ -3329,6 +3712,28 @@ static int srp_parse_in(struct net *net, + pr_debug("%s -> %pISpfsc\n", addr_port_str, sa); + return ret; + } ++#else ++static int srp_parse_in(struct sockaddr_in *ip4, const char *p, bool *has_port) ++{ ++ const char *dst_port_str = NULL; ++ u16 dst_port; ++ ++ if (!in4_pton(p, -1, (u8 *)&ip4->sin_addr, ':', &dst_port_str)) ++ return -1; ++ ++ if (has_port) { ++ if (sscanf(dst_port_str, ":%hu", &dst_port) < 1) { ++ *has_port = false; ++ } else { ++ *has_port = true; ++ ip4->sin_port = htons(dst_port); ++ } ++ } ++ ip4->sin_family = AF_INET; ++ ++ return 0; ++} ++#endif /* HAVE_INET_PTON_WITH_SCOPE */ + + static int srp_parse_options(struct net *net, const char *buf, + struct srp_target_port *target) +@@ -3436,8 +3841,12 @@ static int srp_parse_options(struct net + ret = -ENOMEM; + goto out; + } ++#ifdef HAVE_INET_PTON_WITH_SCOPE + ret = srp_parse_in(net, &target->rdma_cm.src.ss, p, + NULL); ++#else ++ ret = srp_parse_in(&target->rdma_cm.src.ip4, p, NULL); ++#endif + if (ret < 0) { + pr_warn("bad source parameter '%s'\n", p); + kfree(p); +@@ -3453,8 +3862,13 @@ static int srp_parse_options(struct net + ret = -ENOMEM; + goto out; + } ++#ifdef HAVE_INET_PTON_WITH_SCOPE + ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p, + &has_port); ++#else ++ ret = srp_parse_in(&target->rdma_cm.dst.ip4, p, ++ &has_port); ++#endif + if (!has_port) + ret = -EINVAL; + if (ret < 0) { +@@ -3553,12 +3967,21 @@ static int srp_parse_options(struct net + break; + + case SRP_OPT_SG_TABLESIZE: ++#ifdef HAVE_SG_MAX_SEGMENTS + if (match_int(args, &token) || token < 1 || + token > SG_MAX_SEGMENTS) { + pr_warn("bad max sg_tablesize parameter '%s'\n", + p); + goto out; + } ++#else ++ if (match_int(args, &token) || token < 1 || ++ token > SCSI_MAX_SG_CHAIN_SEGMENTS) { ++ pr_warn("bad max sg_tablesize parameter '%s'\n", ++ p); ++ goto out; ++ } ++#endif + target->sg_tablesize = token; + break; + +@@ -3634,7 +4057,14 @@ static ssize_t add_target_store(struct d + struct srp_device *srp_dev = host->srp_dev; + struct ib_device 
*ibdev = srp_dev->dev; + int ret, i, ch_idx; ++#ifdef HAVE_VIRT_BOUNDARY + unsigned int max_sectors_per_mr, mr_per_cmd = 0; ++#else ++ unsigned int mr_per_cmd = 0; ++#endif ++#ifndef HAVE_BLK_TAGS ++ int cpu; ++#endif + bool multich = false; + uint32_t max_iu_len; + +@@ -3648,19 +4078,44 @@ static ssize_t add_target_store(struct d + target_host->max_id = 1; + target_host->max_lun = -1LL; + target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; ++#ifdef HAVE_SCSI_HOST_MAX_SEGMENT_SIZE + target_host->max_segment_size = ib_dma_max_seg_size(ibdev); ++#endif ++#ifdef HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK + target_host->virt_boundary_mask = ~srp_dev->mr_page_mask; ++#endif + + target = host_to_target(target_host); + ++#ifdef HAVE_KOBJ_NS_GRAB_CURRENT_EXPORTED + target->net = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); ++#else ++ target->net = &init_net; ++#endif + target->io_class = SRP_REV16A_IB_IO_CLASS; + target->scsi_host = target_host; + target->srp_host = host; + target->lkey = host->srp_dev->pd->local_dma_lkey; + target->global_rkey = host->srp_dev->global_rkey; + target->cmd_sg_cnt = cmd_sg_entries; ++#ifndef HAVE_VIRT_BOUNDARY ++ if (never_register) { ++ target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; ++ } else { ++ if (target->cmd_sg_cnt > 12) { ++ target->cmd_sg_cnt = 12; ++ pr_warn("Clamping cmd_sg_entries and " ++ "indirect_sg_entries to 12. Because %s is " ++ "not supported MR with gaps. And values more " ++ "than 12 can cause allocation errors of the " ++ "MR pool.\n", ++ dev_name(&ibdev->dev)); ++ } ++ target->sg_tablesize = target->cmd_sg_cnt; ++ } ++#else + target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; ++#endif + target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; +@@ -3679,6 +4134,14 @@ static ssize_t add_target_store(struct d + if (ret) + goto out; + ++#ifdef HAVE_SCSI_HOST_TEMPLATE_USE_BLK_TAGS ++ ret = scsi_init_shared_tag_map(target_host, target_host->can_queue); ++ if (ret) ++ goto out; ++#endif ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; ++#endif + if (!srp_conn_unique(target->srp_host, target)) { + if (target->using_rdma_cm) { + shost_printk(KERN_INFO, target->scsi_host, +@@ -3704,6 +4167,7 @@ static ssize_t add_target_store(struct d + } + + if (srp_dev->use_fast_reg) { ++#ifdef HAVE_VIRT_BOUNDARY + max_sectors_per_mr = srp_dev->max_pages_per_mr << + (ilog2(srp_dev->mr_page_size) - 9); + +@@ -3727,6 +4191,13 @@ static ssize_t add_target_store(struct d + pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", + target->scsi_host->max_sectors, srp_dev->max_pages_per_mr, srp_dev->mr_page_size, + max_sectors_per_mr, mr_per_cmd); ++#else ++ mr_per_cmd = target->cmd_sg_cnt + register_always; ++ ++ pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; mr_per_cmd = %u\n", ++ target->scsi_host->max_sectors, srp_dev->max_pages_per_mr, srp_dev->mr_page_size, ++ mr_per_cmd); ++#endif + } + + target_host->sg_tablesize = target->sg_tablesize; +@@ -3759,6 +4230,12 @@ static ssize_t add_target_store(struct d + if (!target->ch) + goto out; + ++#ifndef HAVE_BLK_TAGS ++ target->mq_map = kcalloc(nr_cpu_ids, sizeof(*target->mq_map), ++ GFP_KERNEL); ++ if (!target->mq_map) ++ goto err_free_ch; ++#endif + for (ch_idx = 0; ch_idx < target->ch_count; ++ch_idx) { + ch = &target->ch[ch_idx]; + ch->target = target; +@@ -3773,6 +4250,11 @@ static 
ssize_t add_target_store(struct d + if (ret) + goto err_disconnect; + ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ ret = srp_alloc_req_data(ch); ++ if (ret) ++ goto err_disconnect; ++#endif + ret = srp_connect_ch(ch, max_iu_len, multich); + if (ret) { + char dst[64]; +@@ -3791,15 +4273,25 @@ static ssize_t add_target_store(struct d + goto free_ch; + } else { + srp_free_ch_ib(target, ch); ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ srp_free_req_data(target, ch); ++#endif + target->ch_count = ch - target->ch; + goto connected; + } + } + multich = true; + } ++#ifndef HAVE_BLK_TAGS ++ for_each_online_cpu(cpu) { ++ target->mq_map[cpu] = cpu % target->ch_count; ++ } ++#endif + + connected: ++#ifdef HAVE_SCSI_HOST_NR_HW_QUEUES + target->scsi_host->nr_hw_queues = target->ch_count; ++#endif + + ret = srp_add_target(host, target); + if (ret) +@@ -3832,6 +4324,7 @@ out: + put: + scsi_host_put(target->scsi_host); + if (ret < 0) { ++#ifdef HAVE_KOBJ_NS_GRAB_CURRENT_EXPORTED + /* + * If a call to srp_remove_target() has not been scheduled, + * drop the network namespace reference now that was obtained +@@ -3839,6 +4332,7 @@ put: + */ + if (target->state != SRP_TARGET_REMOVED) + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); ++#endif + scsi_host_put(target->scsi_host); + } + +@@ -3851,8 +4345,16 @@ free_ch: + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); ++#ifndef HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV ++ srp_free_req_data(target, ch); ++#endif + } + ++#ifndef HAVE_BLK_TAGS ++ kfree(target->mq_map); ++ ++err_free_ch: ++#endif + kfree(target->ch); + goto out; + } +@@ -4097,11 +4599,19 @@ static int __init srp_init_module(void) + indirect_sg_entries = cmd_sg_entries; + } + ++#ifdef HAVE_SG_MAX_SEGMENTS + if (indirect_sg_entries > SG_MAX_SEGMENTS) { + pr_warn("Clamping indirect_sg_entries to %u\n", + SG_MAX_SEGMENTS); + indirect_sg_entries = SG_MAX_SEGMENTS; + } ++#else ++ if (indirect_sg_entries > SCSI_MAX_SG_CHAIN_SEGMENTS) { ++ pr_warn("Clamping indirect_sg_entries to %u\n", ++ SCSI_MAX_SG_CHAIN_SEGMENTS); ++ indirect_sg_entries = SCSI_MAX_SG_CHAIN_SEGMENTS; ++ } ++#endif + + srp_remove_wq = create_workqueue("srp_remove"); + if (!srp_remove_wq) { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ecp.patch b/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ecp.patch new file mode 100644 index 0000000..744a794 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-ecp.patch @@ -0,0 +1,54 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/ecpf.c + +Change-Id: I3337e98de3ea94a2e6096cc494713a63a018c110 +--- + drivers/net/ethernet/mellanox/mlx5/core/ecpf.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c +@@ -508,6 +508,9 @@ static struct attribute *smart_nic_attrs + &attr_regex_en.attr, + NULL, + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(smart_nic); ++#endif + + static const struct sysfs_ops smart_nic_sysfs_ops = { + .show = smart_nic_attr_show, +@@ -516,7 +519,11 @@ static const struct sysfs_ops smart_nic_ + + static struct kobj_type smart_nic_type = { + .sysfs_ops = &smart_nic_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = smart_nic_groups ++#else + .default_attrs = smart_nic_attrs ++#endif + }; + + void 
mlx5_smartnic_sysfs_init(struct net_device *dev) +@@ -642,6 +649,9 @@ static struct attribute *regex_attrs[] = + &attr_regex.attr, + NULL, + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(regex); ++#endif + + static ssize_t regex_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +@@ -663,7 +673,11 @@ static const struct sysfs_ops regex_sysf + + static struct kobj_type regex_type = { + .sysfs_ops = ®ex_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = regex_groups ++#else + .default_attrs = regex_attrs ++#endif + }; + + int mlx5_regex_sysfs_init(struct mlx5_core_dev *dev) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-include-net-psample.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-include-net-psample.h.patch new file mode 100644 index 0000000..b233f13 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0289-BACKPORT-include-net-psample.h.patch @@ -0,0 +1,19 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: include/net/psample.h + +Change-Id: I8b608ac8e55d1c20379f029d419714f70ecf0582 +--- + include/net/psample.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/include/net/psample.h ++++ b/include/net/psample.h +@@ -4,6 +4,8 @@ + #include "../../compat/config.h" + + #ifdef HAVE_NET_PSAMPLE_H ++struct sk_buff; ++ + #include_next + #else + struct psample_group { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0290-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0290-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..f84beea --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0290-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,51 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c + +Change-Id: I6ff49cb68993aae0de8fa24213d3c6dc96280746 +--- + .../net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c +@@ -241,6 +241,9 @@ static struct attribute *rep_attrs[] = { + &attr_miss_rl_stats_clr.attr, + NULL, + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(rep); ++#endif + + static const struct sysfs_ops rep_sysfs_ops = { + .show = rep_attr_show, +@@ -249,7 +252,11 @@ static const struct sysfs_ops rep_sysfs_ + + static struct kobj_type rep_type = { + .sysfs_ops = &rep_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = rep_groups ++#else + .default_attrs = rep_attrs ++#endif + }; + + static struct attribute *rep_paging_attrs[] = { +@@ -257,10 +264,17 @@ static struct attribute *rep_paging_attr + &attr_num_pages.attr, + NULL, + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(rep_paging); ++#endif + + static struct kobj_type rep_paging = { + .sysfs_ops = &rep_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = rep_paging_groups ++#else + .default_attrs = rep_paging_attrs ++#endif + }; + + void mlx5_rep_sysfs_init(struct mlx5e_rep_priv *rpriv) diff --git a/src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..eb69dc7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 
+1,40 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h + +Change-Id: I549aef5f514fc892c0c6f781700421efca33da41 +--- + .../net/ethernet/mellanox/mlx5/core/en/rep/tc.h | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h +@@ -33,12 +33,29 @@ int mlx5e_rep_encap_entry_attach(struct + void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); + ++#if defined(HAVE_TC_FLOWER_OFFLOAD) || defined(HAVE_FLOW_CLS_OFFLOAD) ++#if defined(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE) || defined(HAVE_NDO_SETUP_TC_RH_EXTENDED) + int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data); ++#else ++int mlx5e_rep_setup_tc(struct net_device *dev, u32 handle, ++#ifdef HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX ++ u32 chain_index, __be16 proto, ++#else ++ __be16 proto, ++#endif ++ struct tc_to_netdev *tc); ++#endif ++#endif + + void mlx5e_rep_tc_receive(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, + struct sk_buff *skb); + ++void mlx5e_rep_indr_clean_block_privs(struct mlx5e_rep_priv *rpriv); ++ ++int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data, ++ void *cb_priv); ++ + #else /* CONFIG_MLX5_CLS_ACT */ + + struct mlx5e_rep_priv; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch b/src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch new file mode 100644 index 0000000..a865342 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0291-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sri.patch @@ -0,0 +1,120 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/sriov_sysfs.c + +Change-Id: I07904c6d8990c67268169da0222f8bd1f0268de3 +--- + .../ethernet/mellanox/mlx5/core/sriov_sysfs.c | 42 +++++++++++++++++++ + 1 file changed, 42 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov_sysfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov_sysfs.c +@@ -1160,6 +1160,9 @@ static struct attribute *vf_eth_attrs[] + &vf_attr_group.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(vf_eth); ++#endif + + static struct attribute *vf_group_attrs[] = { + &vf_group_attr_max_tx_rate.attr, +@@ -1167,26 +1170,44 @@ static struct attribute *vf_group_attrs[ + &vf_group_attr_config.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(vf_group); ++#endif + + static struct attribute *vf_paging_attrs[] = { + &vf_attr_page_limit.attr, + &vf_attr_num_pages.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(vf_paging); ++#endif + + static struct kobj_type vf_type_eth = { + .sysfs_ops = &vf_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = vf_eth_groups ++#else + .default_attrs = vf_eth_attrs ++#endif + }; + + static struct kobj_type vf_paging = { + .sysfs_ops = &vf_paging_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = vf_paging_groups ++#else + .default_attrs = vf_paging_attrs ++#endif + }; + + static struct kobj_type vf_group = { + .sysfs_ops = &vf_group_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = vf_group_groups ++#else + .default_attrs = vf_group_attrs ++#endif + }; + + static struct vf_attributes pf_attr_min_pf_tx_rate = \ +@@ -1196,10 +1217,17 @@ static struct attribute *pf_eth_attrs[] + 
&pf_attr_min_pf_tx_rate.attr, + NULL, + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(pf_eth); ++#endif + + static struct kobj_type pf_type_eth = { + .sysfs_ops = &vf_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = pf_eth_groups ++#else + .default_attrs = pf_eth_attrs ++#endif + }; + + VF_ATTR(rate); +@@ -1214,10 +1242,17 @@ static struct attribute *vf_meters_eth_a + &vf_attr_packets_dropped.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(vf_meters_eth); ++#endif + + static struct kobj_type vf_meters_type_eth = { + .sysfs_ops = &vf_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = vf_meters_eth_groups ++#else + .default_attrs = vf_meters_eth_attrs ++#endif + }; + #endif /* CONFIG_MLX5_ESWITCH */ + +@@ -1227,10 +1262,17 @@ static struct attribute *vf_ib_attrs[] = + &vf_attr_policy.attr, + NULL + }; ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ATTRIBUTE_GROUPS(vf_ib); ++#endif + + static struct kobj_type vf_type_ib = { + .sysfs_ops = &vf_sysfs_ops, ++#ifdef HAVE_KOBJ_TYPE_DEFAULT_GROUPS ++ .default_groups = vf_ib_groups ++#else + .default_attrs = vf_ib_attrs ++#endif + }; + + static struct device_attribute *mlx5_class_attributes[] = { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..55d2e8e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,24 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c + +Change-Id: I2219dc3f634ffa1d1e0c7ff643bc155fbc55b555 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c +@@ -120,10 +120,12 @@ tc_act_can_offload_mirred(struct mlx5e_t + return false; + } + ++#ifdef HAVE_NETIF_IS_BAREDUDP + if (parse_state->mpls_push && !netif_is_bareudp(out_dev)) { + NL_SET_ERR_MSG_MOD(extack, "mpls is supported only through a bareudp device"); + return false; + } ++#endif + + if (mlx5e_is_ft_flow(flow) && out_dev == priv->netdev) { + /* Ignore forward to self rules generated diff --git a/src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-include-rdma-ib.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-include-rdma-ib.h.patch new file mode 100644 index 0000000..c5fae30 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0292-BACKPORT-include-rdma-ib.h.patch @@ -0,0 +1,20 @@ +From: Valentine Fatiev +Subject: [PATCH] BACKPORT: include/rdma/ib.h + +Change-Id: I616ca2ce21066e92d8473b8827824b21b2464897 +--- + include/rdma/ib.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/include/rdma/ib.h ++++ b/include/rdma/ib.h +@@ -75,7 +75,8 @@ struct sockaddr_ib { + */ + static inline bool ib_safe_file_access(struct file *filp) + { +- return filp->f_cred == current_cred() && !uaccess_kernel(); ++ /* BACKPORT as a result of upstram commit 967747bbc084b93b54e66f9047d342232314cd25 */ ++ return filp->f_cred == current_cred(); + } + + #endif /* _RDMA_IB_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0293-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0293-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 
100644 index 0000000..7eaffff --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0293-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,64 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c + +Change-Id: If80666e1d5e2386678bea7b563f1026c7ab69c81 +--- + .../net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c +@@ -11,6 +11,7 @@ tc_act_can_offload_mpls_push(struct mlx5 + int act_index, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_MPLS + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_priv *priv = parse_state->flow->priv; + +@@ -21,16 +22,21 @@ tc_act_can_offload_mpls_push(struct mlx5 + } + + return true; ++#else ++ return false; ++#endif + } + + static void + copy_mpls_info(struct mlx5e_mpls_info *mpls_info, + const struct flow_action_entry *act) + { ++#ifdef HAVE_FLOW_ACTION_MPLS + mpls_info->label = act->mpls_push.label; + mpls_info->tc = act->mpls_push.tc; + mpls_info->bos = act->mpls_push.bos; + mpls_info->ttl = act->mpls_push.ttl; ++#endif + } + + static int +@@ -66,10 +72,12 @@ tc_act_can_offload_mpls_pop(struct mlx5e + return false; + } + ++#ifdef HAVE_NETIF_IS_BAREDUDP + if (!netif_is_bareudp(filter_dev)) { + NL_SET_ERR_MSG_MOD(extack, "mpls pop supported only on bareudp devices"); + return false; + } ++#endif + + return true; + } +@@ -80,7 +88,9 @@ tc_act_parse_mpls_pop(struct mlx5e_tc_ac + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_MPLS + attr->parse_attr->eth.h_proto = act->mpls_pop.proto; ++#endif + attr->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_flag_set(parse_state->flow, L3_TO_L2_DECAP); + diff --git a/src/mlnx-ofa_kernel-5.8/backports/0294-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0294-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..553dc3d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0294-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,40 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c + +Change-Id: I48aba352f02bddfb49d8d9cf4858dbff1873fba6 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c +@@ -12,10 +12,12 @@ tc_act_can_offload_prio(struct mlx5e_tc_ + int act_index, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_PRIORITY + if (act->priority > parse_state->flow->priv->fs.tc.num_prio_hp) { + NL_SET_ERR_MSG_MOD(parse_state->extack, "Skb priority value is out of range"); + return false; + } ++#endif + + return true; + } +@@ -26,6 +28,7 @@ tc_act_parse_prio(struct mlx5e_tc_act_pa + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_PRIORITY + int err; + + attr->nic_attr->user_prio = act->priority; +@@ -36,6 +39,7 @@ tc_act_parse_prio(struct mlx5e_tc_act_pa + return err; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; ++#endif + + return 0; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0295-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch 
b/src/mlnx-ofa_kernel-5.8/backports/0295-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..afa9ea9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0295-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,98 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c + +Change-Id: Ibfbff98b93739dd1e467b78d0247fd74a5188a76 +--- + .../mellanox/mlx5/core/en/tc/act/act.c | 31 +++++++++++++++++++ + 1 file changed, 31 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c +@@ -14,8 +14,10 @@ static struct mlx5e_tc_act *tc_acts_fdb[ + &mlx5e_tc_act_goto, + &mlx5e_tc_act_mirred, + &mlx5e_tc_act_mirred, ++#ifdef HAVE_FLOW_ACTION_REDIRECT_INGRESS + &mlx5e_tc_act_redirect_ingress, + NULL, /* FLOW_ACTION_MIRRED_INGRESS, */ ++#endif + &mlx5e_tc_act_vlan, + &mlx5e_tc_act_vlan, + &mlx5e_tc_act_vlan_mangle, +@@ -25,16 +27,26 @@ static struct mlx5e_tc_act *tc_acts_fdb[ + &mlx5e_tc_act_pedit, + &mlx5e_tc_act_csum, + NULL, /* FLOW_ACTION_MARK, */ ++#ifdef HAVE_FLOW_ACTION_PTYPE + &mlx5e_tc_act_ptype, ++#endif ++#ifdef HAVE_FLOW_ACTION_PRIORITY + NULL, /* FLOW_ACTION_PRIORITY, */ ++#endif + NULL, /* FLOW_ACTION_WAKE, */ + NULL, /* FLOW_ACTION_QUEUE, */ + &mlx5e_tc_act_sample, + &mlx5e_tc_act_police, ++#ifdef HAVE_FLOW_ACTION_CT + &mlx5e_tc_act_ct, ++#endif ++#ifdef HAVE_FLOW_ACTION_CT_METADATA + NULL, /* FLOW_ACTION_CT_METADATA, */ ++#endif ++#ifdef HAVE_FLOW_ACTION_MPLS + &mlx5e_tc_act_mpls_push, + &mlx5e_tc_act_mpls_pop, ++#endif + }; + + /* Must be aligned with enum flow_action_id. */ +@@ -44,8 +56,10 @@ static struct mlx5e_tc_act *tc_acts_nic[ + NULL, /* FLOW_ACTION_TRAP, */ + &mlx5e_tc_act_goto, + &mlx5e_tc_act_mirred_nic, ++#ifdef HAVE_FLOW_ACTION_REDIRECT_INGRESS + NULL, /* FLOW_ACTION_MIRRED, */ + NULL, /* FLOW_ACTION_REDIRECT_INGRESS, */ ++#endif + NULL, /* FLOW_ACTION_MIRRED_INGRESS, */ + NULL, /* FLOW_ACTION_VLAN_PUSH, */ + NULL, /* FLOW_ACTION_VLAN_POP, */ +@@ -56,13 +70,19 @@ static struct mlx5e_tc_act *tc_acts_nic[ + &mlx5e_tc_act_pedit, + &mlx5e_tc_act_csum, + &mlx5e_tc_act_mark, ++#ifdef HAVE_FLOW_ACTION_PTYPE + NULL, /* FLOW_ACTION_PTYPE, */ ++#endif ++#ifdef HAVE_FLOW_ACTION_PRIORITY + &mlx5e_tc_act_prio, ++#endif + NULL, /* FLOW_ACTION_WAKE, */ + NULL, /* FLOW_ACTION_QUEUE, */ + NULL, /* FLOW_ACTION_SAMPLE, */ + NULL, /* FLOW_ACTION_POLICE, */ ++#ifdef HAVE_FLOW_ACTION_CT + &mlx5e_tc_act_ct, ++#endif + }; + + /** +@@ -76,6 +96,17 @@ mlx5e_tc_act_get(enum flow_action_id act + { + struct mlx5e_tc_act **tc_acts; + ++#ifdef HAVE_FLOW_ACTION_PRIORITY ++ /* WA for BF kernel 5.4 where flow_action_id order is wrong, FLOW_ACTION_PRIORITY is ++ * defined after FLOW_ACTION_MPLS_MANGLE ++ */ ++ if (FLOW_ACTION_PRIORITY > FLOW_ACTION_WAKE) { ++ if (act_id >= FLOW_ACTION_WAKE && act_id != FLOW_ACTION_PRIORITY) ++ act_id++; ++ else if (act_id == FLOW_ACTION_PRIORITY) ++ act_id = FLOW_ACTION_WAKE; ++ } ++#endif + tc_acts = ns_type == MLX5_FLOW_NAMESPACE_FDB ? 
tc_acts_fdb : tc_acts_nic; + + return tc_acts[act_id]; diff --git a/src/mlnx-ofa_kernel-5.8/backports/0296-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0296-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..6b3445b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0296-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,29 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c + +Change-Id: I8e388cead9d6c7f44df7687d910777cc46799571 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c +@@ -19,6 +19,7 @@ tc_act_parse_ptype(struct mlx5e_tc_act_p + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_PTYPE + struct netlink_ext_ack *extack = parse_state->extack; + + if (act->ptype != PACKET_HOST) { +@@ -28,6 +29,9 @@ tc_act_parse_ptype(struct mlx5e_tc_act_p + + parse_state->ptype_host = true; + return 0; ++#else ++ return -EOPNOTSUPP; ++#endif + } + + struct mlx5e_tc_act mlx5e_tc_act_ptype = { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0297-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0297-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..1258cb0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0297-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,25 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c + +Change-Id: I016e1d421dc93bbb23bc917502c8472a49aa27b4 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c +@@ -133,10 +133,12 @@ mlx5e_tc_act_vlan_add_pop_action(struct + struct flow_action_entry vlan_act = { + .id = FLOW_ACTION_VLAN_POP, + }; +- int nest_level, err = 0; ++ int nest_level = 1, err = 0; + ++#ifdef HAVE_NET_DEVICE_LOWER_LEVEL + nest_level = attr->parse_attr->filter_dev->lower_level - + priv->netdev->lower_level; ++#endif + while (nest_level--) { + err = parse_tc_vlan_action(priv, &vlan_act, attr->esw_attr, &attr->action, + extack); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0298-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0298-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..be4aa4d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0298-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,27 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c + +Change-Id: Id914c6cae530e6b475b09ad1efbe56548f396b08 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c +@@ -12,6 +12,7 @@ tc_act_can_offload_sample(struct mlx5e_t + int act_index, + struct mlx5_flow_attr *attr) + { ++#ifdef CONFIG_COMPAT_KERNEL_CT + struct netlink_ext_ack *extack = parse_state->extack; + bool ct_nat; + +@@ 
-21,6 +22,7 @@ tc_act_can_offload_sample(struct mlx5e_t + NL_SET_ERR_MSG_MOD(extack, "Sample action with CT NAT is not supported"); + return false; + } ++#endif + + return true; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0306-BACKPORT-drivers-nvme-target-loop.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0306-BACKPORT-drivers-nvme-target-loop.c.patch new file mode 100644 index 0000000..56722ca --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0306-BACKPORT-drivers-nvme-target-loop.c.patch @@ -0,0 +1,214 @@ +From: Alaa Hleihel +Subject: [PATCH] BACKPORT: drivers/nvme/target/loop.c + +Change-Id: If8b0c6a0985942e15579e581ffa4691bcac9b46f +--- + drivers/nvme/target/loop.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 70 insertions(+) + +--- a/drivers/nvme/target/loop.c ++++ b/drivers/nvme/target/loop.c +@@ -78,7 +78,11 @@ static void nvme_loop_complete_rq(struct + { + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&iod->sg_table, NVME_INLINE_SG_CNT); ++#else ++ sg_free_table_chained(&iod->sg_table, true); ++#endif + nvme_complete_rq(req); + } + +@@ -157,16 +161,29 @@ static blk_status_t nvme_loop_queue_rq(s + + if (blk_rq_nr_phys_segments(req)) { + iod->sg_table.sgl = iod->first_sgl; ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + if (sg_alloc_table_chained(&iod->sg_table, + blk_rq_nr_phys_segments(req), + iod->sg_table.sgl, NVME_INLINE_SG_CNT)) { ++#else ++ if (sg_alloc_table_chained(&iod->sg_table, ++ blk_rq_nr_phys_segments(req), ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_4_PARAMS ++ GFP_ATOMIC, ++#endif ++ iod->sg_table.sgl)) { ++#endif + nvme_cleanup_cmd(req); + return BLK_STS_RESOURCE; + } + + iod->req.sg = iod->sg_table.sgl; + iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES + iod->req.transfer_len = blk_rq_payload_bytes(req); ++#else ++ iod->req.transfer_len = nvme_map_len(req); ++#endif + } + + schedule_work(&iod->work); +@@ -203,6 +220,7 @@ static int nvme_loop_init_iod(struct nvm + return 0; + } + ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + static int nvme_loop_init_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) +@@ -215,8 +233,35 @@ static int nvme_loop_init_request(struct + return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), + (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0); + } ++#else ++static int nvme_loop_init_request(void *data, struct request *req, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ struct nvme_loop_ctrl *ctrl = data; ++ struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); ++ ++ nvme_req(req)->ctrl = &ctrl->ctrl; ++ nvme_req(req)->cmd = &iod->cmd; ++ return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1); ++} + ++static int nvme_loop_init_admin_request(void *data, struct request *req, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ struct nvme_loop_ctrl *ctrl = data; ++ struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); ++ ++ nvme_req(req)->ctrl = &ctrl->ctrl; ++ nvme_req(req)->cmd = &iod->cmd; ++ return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0); ++} ++#endif ++ ++#ifdef HAVE_BLK_MQ_HCTX_SET_FQ_LOCK_CLASS + static struct lock_class_key loop_hctx_fq_lock_key; ++#endif + + static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +@@ -232,7 +277,9 @@ static int nvme_loop_init_hctx(struct bl + * then we can remove the dynamically allocated lock class for each + * flush queue, that way may cause horrible boot delay. + */ ++#ifdef HAVE_BLK_MQ_HCTX_SET_FQ_LOCK_CLASS + blk_mq_hctx_set_fq_lock_class(hctx, &loop_hctx_fq_lock_key); ++#endif + + hctx->driver_data = queue; + return 0; +@@ -250,17 +297,35 @@ static int nvme_loop_init_admin_hctx(str + return 0; + } + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_loop_mq_ops = { ++#else ++static struct blk_mq_ops nvme_loop_mq_ops = { ++#endif + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, ++#ifdef HAVE_BLK_MQ_OPS_MAP_QUEUE ++ .map_queue = blk_mq_map_queue, ++#endif + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_hctx, + }; + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_loop_admin_mq_ops = { ++#else ++static struct blk_mq_ops nvme_loop_admin_mq_ops = { ++#endif + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, ++#ifdef HAVE_BLK_MQ_OPS_MAP_QUEUE ++ .map_queue = blk_mq_map_queue, ++#endif ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + .init_request = nvme_loop_init_request, ++#else ++ .init_request = nvme_loop_init_admin_request, ++#endif + .init_hctx = nvme_loop_init_admin_hctx, + }; + +@@ -269,8 +334,13 @@ static void nvme_loop_destroy_admin_queu + if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) + return; + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); ++#else + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++#endif + blk_mq_free_tag_set(&ctrl->admin_tag_set); + } + +@@ -286,7 +356,11 @@ static void nvme_loop_free_ctrl(struct n + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); ++#else + blk_cleanup_queue(ctrl->ctrl.connect_q); ++#endif + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); +@@ -363,7 +437,9 @@ static int nvme_loop_configure_admin_que + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ++#ifdef HAVE_BLK_MQ_F_NO_SCHED + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ++#endif + + ctrl->queues[0].ctrl = ctrl; + error = 
nvmet_sq_init(&ctrl->queues[0].nvme_sq); +@@ -413,9 +489,15 @@ static int nvme_loop_configure_admin_que + + out_cleanup_queue: + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++out_cleanup_fabrics_q: ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); ++#else + blk_cleanup_queue(ctrl->ctrl.admin_q); + out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++#endif + out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); + out_free_sq: +@@ -560,7 +642,11 @@ static int nvme_loop_create_io_queues(st + return 0; + + out_cleanup_connect_q: ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); ++#else + blk_cleanup_queue(ctrl->ctrl.connect_q); ++#endif + out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); + out_destroy_queues: +@@ -741,4 +827,7 @@ module_init(nvme_loop_init_module); + module_exit(nvme_loop_cleanup_module); + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0307-BACKPORT-drivers-nvme-host-nvme.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0307-BACKPORT-drivers-nvme-host-nvme.h.patch new file mode 100644 index 0000000..7b27761 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0307-BACKPORT-drivers-nvme-host-nvme.h.patch @@ -0,0 +1,358 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/host/nvme.h + +Change-Id: I7d4d628aac4be6550135ff71e002a32265e13aa9 +--- + drivers/nvme/host/nvme.h | 135 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 135 insertions(+) + +--- a/drivers/nvme/host/nvme.h ++++ b/drivers/nvme/host/nvme.h +@@ -6,12 +6,30 @@ + #ifndef _NVME_H + #define _NVME_H + ++#ifndef HAVE_BLK_TYPES_REQ_DRV ++#undef CONFIG_NVME_MULTIPATH ++#endif ++ ++#ifndef HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES ++#undef CONFIG_BLK_DEV_ZONED ++#endif ++ ++#ifndef HAVE_PCIE_FIND_ROOT_PORT ++#undef CONFIG_ACPI ++#endif ++ ++#ifdef HAVE_BLK_INTEGRITY_H ++#define HAVE_BLK_INTEGRITY_DEVICE_CAPABLE ++#endif ++ + #include + #include + #include + #include + #include ++#ifdef HAVE_LINUX_SED_OPAL_H + #include ++#endif + #include + #include + #include +@@ -19,6 +37,8 @@ + + #include + ++#include ++ + extern unsigned int nvme_io_timeout; + #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) + +@@ -27,6 +47,7 @@ extern unsigned int admin_timeout; + + #define NVME_DEFAULT_KATO 5 + ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + #ifdef CONFIG_ARCH_NO_SG_CHAIN + #define NVME_INLINE_SG_CNT 0 + #define NVME_INLINE_METADATA_SG_CNT 0 +@@ -34,6 +55,15 @@ extern unsigned int admin_timeout; + #define NVME_INLINE_SG_CNT 2 + #define NVME_INLINE_METADATA_SG_CNT 1 + #endif ++#else /* HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM */ ++#ifdef HAVE_SCSI_MAX_SG_SEGMENTS ++#define NVME_INLINE_SG_CNT SCSI_MAX_SG_SEGMENTS ++#define NVME_INLINE_METADATA_SG_CNT SCSI_MAX_SG_SEGMENTS ++#else ++#define NVME_INLINE_SG_CNT SG_CHUNK_SIZE ++#define NVME_INLINE_METADATA_SG_CNT SG_CHUNK_SIZE ++#endif ++#endif + + /* + * Default to a 4K page size, with the intention to update this +@@ -64,11 +94,19 @@ enum nvme_quirks { + */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), + ++#ifdef HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS + /* + * The controller deterministically returns O's on reads to + * logical blocks that deallocate was called on. 
+ */ + NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2), ++#else ++ /* ++ * The controller deterministically returns O's on reads to discarded ++ * logical blocks. ++ */ ++ NVME_QUIRK_DISCARD_ZEROES = (1 << 2), ++#endif + + /* + * The controller needs a delay before starts checking the device +@@ -185,7 +223,11 @@ static inline u16 nvme_req_qid(struct re + if (!req->q->queuedata) + return 0; + ++#ifdef HAVE_REQUEST_MQ_HCTX + return req->mq_hctx->queue_num + 1; ++#else ++ return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; ++#endif + } + + /* The below value is the specific amount of delay needed before checking +@@ -263,7 +305,9 @@ struct nvme_ctrl { + struct nvme_subsystem *subsys; + struct list_head subsys_entry; + ++#ifdef HAVE_LINUX_SED_OPAL_H + struct opal_dev *opal_dev; ++#endif + + char name[12]; + u16 cntlid; +@@ -285,8 +329,10 @@ struct nvme_ctrl { + u16 crdt[3]; + u16 oncs; + u16 oacs; ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + u16 nssa; + u16 nr_streams; ++#endif + u16 sqsize; + u32 max_namespaces; + atomic_t abort_limit; +@@ -314,6 +360,9 @@ struct nvme_ctrl { + struct delayed_work failfast_work; + struct nvme_command ka_cmd; + struct work_struct fw_act_work; ++#ifndef HAVE_BLK_QUEUE_VIRT_BOUNDARY ++ bool sg_gaps_support; ++#endif + unsigned long events; + + #ifdef CONFIG_NVME_MULTIPATH +@@ -457,8 +506,10 @@ struct nvme_ns { + + int lba_shift; + u16 ms; ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + u16 sgs; + u32 sws; ++#endif + u8 pi_type; + #ifdef CONFIG_BLK_DEV_ZONED + u64 zsze; +@@ -610,6 +661,20 @@ static inline bool nvme_is_path_error(u1 + return (status & 0x700) == 0x300; + } + ++#ifndef HAVE_BLK_RQ_NR_PAYLOAD_BYTES ++static inline unsigned nvme_map_len(struct request *rq) ++{ ++#ifdef HAVE_BLK_TYPES_REQ_OP_DISCARD ++ if (req_op(rq) == REQ_OP_DISCARD) ++#else ++ if (rq->cmd_flags & REQ_DISCARD) ++#endif ++ return sizeof(struct nvme_dsm_range); ++ else ++ return blk_rq_bytes(rq); ++} ++#endif ++ + /* + * Fill in the status and result information from the CQE, and then figure out + * if blk-mq will need to use IPI magic to complete the request, and if yes do +@@ -629,9 +694,20 @@ static inline bool nvme_try_complete_req + rq->result = result; + /* inject error when permitted by fault injection framework */ + nvme_should_fail(req); ++#ifdef HAVE_BLK_SHOULD_FAKE_TIMEOUT + if (unlikely(blk_should_fake_timeout(req->q))) + return true; ++#endif ++#ifdef HAVE_BLK_MQ_COMPLETE_REQUEST_REMOTE + return blk_mq_complete_request_remote(req); ++#else ++#ifdef HAVE_BLK_MQ_COMPLETE_REQUEST_HAS_2_PARAMS ++ blk_mq_complete_request(req, 0); ++#else ++ blk_mq_complete_request(req); ++#endif ++ return true; ++#endif + } + + static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) +@@ -653,6 +729,7 @@ static inline bool nvme_is_aen_req(u16 q + void nvme_complete_rq(struct request *req); + void nvme_complete_batch_req(struct request *req); + ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + static __always_inline void nvme_complete_batch(struct io_comp_batch *iob, + void (*fn)(struct request *rq)) + { +@@ -664,9 +741,16 @@ static __always_inline void nvme_complet + } + blk_mq_end_request_batch(iob); + } ++#endif + + blk_status_t nvme_host_path_error(struct request *req); ++#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_3_PARAMS + bool nvme_cancel_request(struct request *req, void *data, bool reserved); ++#elif defined HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_2_PARAMS ++bool nvme_cancel_request(struct request *req, void *data); ++#else ++void nvme_cancel_request(struct request *req, void *data, bool reserved); ++#endif + void 
nvme_cancel_tagset(struct nvme_ctrl *ctrl); + void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); + bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, +@@ -684,8 +768,10 @@ int nvme_init_ctrl_finish(struct nvme_ct + + void nvme_remove_namespaces(struct nvme_ctrl *ctrl); + ++#ifdef HAVE_LINUX_SED_OPAL_H + int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send); ++#endif + + void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, + volatile union nvme_result *res); +@@ -703,8 +789,13 @@ int nvme_wait_freeze_timeout(struct nvme + void nvme_start_freeze(struct nvme_ctrl *ctrl); + + #define NVME_QID_ANY -1 ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags); ++#else ++struct request *nvme_alloc_request(struct request_queue *q, ++ struct nvme_command *cmd, gfp_t gfp, bool reserved); ++#endif + void nvme_cleanup_cmd(struct request *req); + blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req); + blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, +@@ -724,10 +815,17 @@ static inline bool nvme_check_ready(stru + } + int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buf, unsigned bufflen); ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + union nvme_result *result, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); ++#else ++int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, ++ union nvme_result *result, void *buffer, unsigned bufflen, ++ unsigned timeout, int qid, int at_head, gfp_t gfp, bool reserved, ++ bool poll); ++#endif + int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result); +@@ -759,7 +857,11 @@ long nvme_dev_ioctl(struct file *file, u + unsigned long arg); + int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); + ++#ifdef HAVE_DEVICE_ADD_DISK_3_ARGS + extern const struct attribute_group *nvme_ns_id_attr_groups[]; ++#else ++extern const struct attribute_group nvme_ns_id_attr_group; ++#endif + extern const struct pr_ops nvme_pr_ops; + extern const struct block_device_operations nvme_ns_head_ops; + +@@ -789,12 +891,23 @@ void nvme_mpath_revalidate_paths(struct + void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); + void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); + ++#ifdef HAVE_TRACE_BLOCK_BIO_COMPLETE_2_PARAM + static inline void nvme_trace_bio_complete(struct request *req) ++#else ++static inline void nvme_trace_bio_complete(struct request *req, ++ blk_status_t status) ++#endif + { + struct nvme_ns *ns = req->q->queuedata; + ++ + if (req->cmd_flags & REQ_NVME_MPATH) ++#ifdef HAVE_TRACE_BLOCK_BIO_COMPLETE_2_PARAM + trace_block_bio_complete(ns->head->disk->queue, req->bio); ++#else ++ trace_block_bio_complete(ns->head->disk->queue, req->bio, ++ blk_status_to_errno(status)); ++#endif + } + + extern struct device_attribute dev_attr_ana_grpid; +@@ -842,7 +955,12 @@ static inline void nvme_mpath_clear_ctrl + static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) + { + } ++#ifdef HAVE_TRACE_BLOCK_BIO_COMPLETE_2_PARAM + static inline void nvme_trace_bio_complete(struct request *req) ++#else ++static inline void nvme_trace_bio_complete(struct request *req, ++ blk_status_t status) ++#endif + { + } + 
static inline void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) +@@ -851,9 +969,11 @@ static inline void nvme_mpath_init_ctrl( + static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) + { ++#ifdef HAVE_BLK_TYPES_REQ_DRV + if (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) + dev_warn(ctrl->device, + "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); ++#endif + return 0; + } + static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +@@ -877,8 +997,10 @@ static inline void nvme_mpath_default_io + #endif /* CONFIG_NVME_MULTIPATH */ + + int nvme_revalidate_zones(struct nvme_ns *ns); ++#ifdef CONFIG_BLK_DEV_ZONED + int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); ++#endif + #ifdef CONFIG_BLK_DEV_ZONED + int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); + blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, +@@ -927,7 +1049,11 @@ static inline bool nvme_ctrl_sgl_support + struct nvme_ns *disk_to_nvme_ns(struct gendisk *disk); + u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode); ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) + int nvme_execute_passthru_rq(struct request *rq); ++#else ++void nvme_execute_passthru_rq(struct request *rq); ++#endif + struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); + struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); + void nvme_put_ns(struct nvme_ns *ns); +@@ -937,4 +1063,15 @@ static inline bool nvme_multi_css(struct + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; + } + ++#ifndef HAVE_BLK_RQ_NR_PHYS_SEGMENTS ++static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) ++{ ++#ifdef HAVE_REQUEST_RQ_FLAGS ++ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) ++ return 1; ++#endif ++ return rq->nr_phys_segments; ++} ++#endif ++ + #endif /* _NVME_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0308-BACKPORT-drivers-nvme-host-core.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0308-BACKPORT-drivers-nvme-host-core.c.patch new file mode 100644 index 0000000..14adfb9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0308-BACKPORT-drivers-nvme-host-core.c.patch @@ -0,0 +1,1869 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/host/core.c + +Change-Id: Ibfcc74945c146fd22236672047f42ed7d3154fce +--- + drivers/nvme/host/core.c | 912 ++++++++++++++++++++++++++++++++++++++- + 1 file changed, 905 insertions(+), 7 deletions(-) + +--- a/drivers/nvme/host/core.c ++++ b/drivers/nvme/host/core.c +@@ -6,7 +6,9 @@ + + #include + #include ++#ifdef HAVE_BLK_INTEGRITY_H + #include ++#endif + #include + #include + #include +@@ -16,10 +18,14 @@ + #include + #include + #include ++#ifdef HAVE_PR_H + #include ++#endif + #include + #include ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + #include ++#endif + #include + + #include "nvme.h" +@@ -50,8 +56,13 @@ MODULE_PARM_DESC(max_retries, "max numbe + + static unsigned long default_ps_max_latency_us = 100000; + module_param(default_ps_max_latency_us, ulong, 0644); ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + MODULE_PARM_DESC(default_ps_max_latency_us, + "max power saving latency for new devices; use PM QOS to change per device"); ++#else ++MODULE_PARM_DESC(default_ps_max_latency_us, ++ "max power saving latency for new devices [deprecated]"); ++#endif + + static bool force_apst; + module_param(force_apst, bool, 0644); +@@ 
-79,7 +90,11 @@ MODULE_PARM_DESC(apst_secondary_latency_ + + static bool streams; + module_param(streams, bool, 0644); ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); ++#else ++MODULE_PARM_DESC(streams, "turn on support for Streams write directives [deprecated]"); ++#endif + + /* + * nvme_wq - hosts nvme related works that are not reset or delete +@@ -230,6 +245,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *c + } + EXPORT_SYMBOL_GPL(nvme_delete_ctrl); + ++#ifdef HAVE_DEVICE_REMOVE_FILE_SELF + static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) + { + /* +@@ -241,6 +257,7 @@ static void nvme_delete_ctrl_sync(struct + nvme_do_delete_ctrl(ctrl); + nvme_put_ctrl(ctrl); + } ++#endif + + static blk_status_t nvme_error_status(u16 status) + { +@@ -275,12 +292,18 @@ static blk_status_t nvme_error_status(u1 + return BLK_STS_NEXUS; + case NVME_SC_HOST_PATH_ERROR: + return BLK_STS_TRANSPORT; ++#ifdef HAVE_BLK_MQ_BLK_STS_ZONE_ACTIVE_RESOURCE + case NVME_SC_ZONE_TOO_MANY_ACTIVE: + return BLK_STS_ZONE_ACTIVE_RESOURCE; + case NVME_SC_ZONE_TOO_MANY_OPEN: + return BLK_STS_ZONE_OPEN_RESOURCE; ++#endif + default: ++#ifdef HAVE_BLK_MQ_END_REQUEST_TAKES_BLK_STATUS_T + return BLK_STS_IOERR; ++#else ++ return -EIO; ++#endif + } + } + +@@ -295,8 +318,16 @@ static void nvme_retry_req(struct reques + delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100; + + nvme_req(req)->retries++; ++#ifdef HAVE_BLK_MQ_REQUEUE_REQUEST_2_PARAMS + blk_mq_requeue_request(req, false); ++#else ++ blk_mq_requeue_request(req); ++#endif ++#ifdef HAVE_BLK_MQ_DELAY_KICK_REQUEUE_LIST + blk_mq_delay_kick_requeue_list(req->q, delay); ++#else ++ blk_mq_kick_requeue_list(req->q); ++#endif + } + + enum nvme_disposition { +@@ -315,6 +346,7 @@ static inline enum nvme_disposition nvme + nvme_req(req)->retries >= nvme_max_retries) + return COMPLETE; + ++#ifdef CONFIG_NVME_MULTIPATH + if (req->cmd_flags & REQ_NVME_MPATH) { + if (nvme_is_path_error(nvme_req(req)->status) || + blk_queue_dying(req->q)) +@@ -323,16 +355,22 @@ static inline enum nvme_disposition nvme + if (blk_queue_dying(req->q)) + return COMPLETE; + } ++#else ++ if (blk_queue_dying(req->q)) ++ return COMPLETE; ++#endif + + return RETRY; + } + + static inline void nvme_end_req_zoned(struct request *req) + { ++#ifdef HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); ++#endif + } + + static inline void nvme_end_req(struct request *req) +@@ -340,7 +378,12 @@ static inline void nvme_end_req(struct r + blk_status_t status = nvme_error_status(nvme_req(req)->status); + + nvme_end_req_zoned(req); ++#ifdef HAVE_TRACE_BLOCK_BIO_COMPLETE_2_PARAM + nvme_trace_bio_complete(req); ++#else ++ nvme_trace_bio_complete(req, status); ++ ++#endif + blk_mq_end_request(req, status); + } + +@@ -383,25 +426,54 @@ EXPORT_SYMBOL_GPL(nvme_complete_batch_re + blk_status_t nvme_host_path_error(struct request *req) + { + nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; ++#ifdef HAVE_MQ_RQ_STATE + blk_mq_set_request_complete(req); ++#endif + nvme_complete_rq(req); + return BLK_STS_OK; + } + EXPORT_SYMBOL_GPL(nvme_host_path_error); + ++#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_3_PARAMS + bool nvme_cancel_request(struct request *req, void *data, bool reserved) ++#elif defined HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_2_PARAMS ++bool nvme_cancel_request(struct request *req, void *data) ++#else ++void 
nvme_cancel_request(struct request *req, void *data, bool reserved) ++#endif + { ++#ifndef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL ++ if (!blk_mq_request_started(req)) ++ return; ++#endif ++ + dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, + "Cancelling I/O %d", req->tag); + ++#ifdef HAVE_MQ_RQ_STATE + /* don't abort one completed request */ + if (blk_mq_request_completed(req)) ++#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL + return true; ++#else ++ return; ++#endif ++#endif + + nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; + nvme_req(req)->flags |= NVME_REQ_CANCELLED; ++#ifdef HAVE_BLK_MQ_COMPLETE_REQUEST_HAS_2_PARAMS ++ blk_mq_complete_request(req, 0); ++#else ++#ifdef HAVE_BLK_MQ_COMPLETE_REQUEST_SYNC ++ blk_mq_complete_request_sync(req); ++#else + blk_mq_complete_request(req); ++#endif ++#endif ++#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL + return true; ++#endif + } + EXPORT_SYMBOL_GPL(nvme_cancel_request); + +@@ -425,6 +497,18 @@ void nvme_cancel_admin_tagset(struct nvm + } + EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset); + ++#ifndef HAVE_BLKDEV_QUEUE_FLAG_QUIESCED ++void nvme_ns_kick_requeue_lists(struct nvme_ctrl *ctrl) ++{ ++ struct nvme_ns *ns; ++ ++ down_read(&ctrl->namespaces_rwsem); ++ list_for_each_entry(ns, &ctrl->namespaces, list) ++ blk_mq_kick_requeue_list(ns->queue); ++ up_read(&ctrl->namespaces_rwsem); ++} ++#endif ++ + bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state) + { +@@ -513,6 +597,9 @@ bool nvme_change_ctrl_state(struct nvme_ + if (ctrl->state == NVME_CTRL_LIVE) { + if (old_state == NVME_CTRL_CONNECTING) + nvme_stop_failfast_work(ctrl); ++#ifndef HAVE_BLKDEV_QUEUE_FLAG_QUIESCED ++ nvme_ns_kick_requeue_lists(ctrl); ++#endif + nvme_kick_requeue_lists(ctrl); + } else if (ctrl->state == NVME_CTRL_CONNECTING && + old_state == NVME_CTRL_RESETTING) { +@@ -604,13 +691,19 @@ static inline void nvme_clear_nvme_reque + nvme_req(req)->status = 0; + nvme_req(req)->retries = 0; + nvme_req(req)->flags = 0; ++#ifdef HAVE_REQUEST_RQ_FLAGS + req->rq_flags |= RQF_DONTPREP; ++#else ++ req->cmd_flags |= REQ_DONTPREP; ++#endif + } + ++#ifdef HAVE_BLK_TYPES_REQ_OP_DRV_OUT + static inline unsigned int nvme_req_op(struct nvme_command *cmd) + { + return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; + } ++#endif + + static inline void nvme_init_request(struct request *req, + struct nvme_command *cmd) +@@ -624,35 +717,81 @@ static inline void nvme_init_request(str + cmd->common.flags &= ~NVME_CMD_SGL_ALL; + + req->cmd_flags |= REQ_FAILFAST_DRIVER; ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + if (req->mq_hctx->type == HCTX_TYPE_POLL) ++#ifdef HAVE_BLK_TYPES_REQ_HIPRI ++ req->cmd_flags |= REQ_HIPRI; ++#else + req->cmd_flags |= REQ_POLLED; ++#endif ++#endif + nvme_clear_nvme_request(req); + memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); + } + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags) ++#else ++struct request *nvme_alloc_request(struct request_queue *q, ++ struct nvme_command *cmd, gfp_t gfp, bool reserved) ++#endif + { + struct request *req; + ++#ifdef HAVE_BLK_TYPES_REQ_OP_DRV_OUT + req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); +- if (!IS_ERR(req)) ++#else ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS ++ req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags); ++#else ++ // XXX RH 7.2 doesn't use qid. ++ // XXX We should call blk_mq_alloc_request_hctx() here. 
++ req = blk_mq_alloc_request(q, nvme_is_write(cmd), gfp, reserved); ++#endif /* !HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS */ ++#endif ++ if (!IS_ERR(req)) { ++#ifndef HAVE_BLK_TYPES_REQ_OP_DRV_OUT ++#ifdef HAVE_BLKDEV_REQ_TYPE_DRV_PRIV ++ req->cmd_type = REQ_TYPE_DRV_PRIV; ++#else ++ req->cmd_type = REQ_TYPE_SPECIAL; ++#endif ++#endif + nvme_init_request(req, cmd); ++ } ++ + return req; + } + EXPORT_SYMBOL_GPL(nvme_alloc_request); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + static struct request *nvme_alloc_request_qid(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) + { + struct request *req; + ++#ifdef HAVE_BLK_TYPES_REQ_OP_DRV_OUT + req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, + qid ? qid - 1 : 0); +- if (!IS_ERR(req)) ++#else ++ req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags, ++ qid ? qid - 1 : 0); ++#endif ++ if (!IS_ERR(req)) { ++#ifndef HAVE_BLK_TYPES_REQ_OP_DRV_OUT ++#ifdef HAVE_BLKDEV_REQ_TYPE_DRV_PRIV ++ req->cmd_type = REQ_TYPE_DRV_PRIV; ++#else ++ req->cmd_type = REQ_TYPE_SPECIAL; ++#endif ++#endif + nvme_init_request(req, cmd); ++ } ++ + return req; + } ++#endif + + /* + * For something we're not in a state to send to the device the default action +@@ -666,12 +805,25 @@ static struct request *nvme_alloc_reques + blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *rq) + { ++#ifdef CONFIG_NVME_MULTIPATH + if (ctrl->state != NVME_CTRL_DELETING_NOIO && + ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DEAD && + !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && + !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) ++#else ++ if (ctrl->state != NVME_CTRL_DELETING_NOIO && ++ ctrl->state != NVME_CTRL_DELETING && ++ ctrl->state != NVME_CTRL_DEAD && ++ !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && ++ !blk_noretry_request(rq)) ++#endif + return BLK_STS_RESOURCE; ++ ++#ifndef HAVE_MQ_RQ_STATE ++ blk_mq_start_request(rq); ++#endif ++ + return nvme_host_path_error(rq); + } + EXPORT_SYMBOL_GPL(nvme_fail_nonready_command); +@@ -715,6 +867,7 @@ bool __nvme_check_ready(struct nvme_ctrl + } + EXPORT_SYMBOL_GPL(__nvme_check_ready); + ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) + { + struct nvme_command c = { }; +@@ -813,6 +966,7 @@ static void nvme_assign_write_stream(str + if (streamid < ARRAY_SIZE(req->q->write_hints)) + req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; + } ++#endif /* HAVE_BLK_MAX_WRITE_HINTS */ + + static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +@@ -825,16 +979,32 @@ static inline void nvme_setup_flush(stru + static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) + { ++#ifdef HAVE_BLK_RQ_NR_DISCARD_SEGMENTS + unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; ++#else ++ unsigned short segments = 1; ++#endif + struct nvme_dsm_range *range; ++#ifdef HAVE_BLK_RQ_NR_DISCARD_SEGMENTS + struct bio *bio; ++#else ++ unsigned int nr_bytes = blk_rq_bytes(req); ++#endif ++#ifndef HAVE_REQUEST_RQ_FLAGS ++ struct page *page; ++ int offset; ++#endif + + /* + * Some devices do not consider the DSM 'Number of Ranges' field when + * determining how much data to DMA. Always allocate memory for maximum + * number of segments to prevent device reading beyond end of buffer. 
+ */ ++#ifdef HAVE_BLK_RQ_NR_DISCARD_SEGMENTS + static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES; ++#else ++ static const size_t alloc_size = sizeof(*range); ++#endif + + range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN); + if (!range) { +@@ -849,6 +1019,7 @@ static blk_status_t nvme_setup_discard(s + range = page_address(ns->ctrl->discard_page); + } + ++#ifdef HAVE_BLK_RQ_NR_DISCARD_SEGMENTS + __rq_for_each_bio(bio, req) { + u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); + u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; +@@ -868,6 +1039,11 @@ static blk_status_t nvme_setup_discard(s + kfree(range); + return BLK_STS_IOERR; + } ++#else ++ range->cattr = cpu_to_le32(0); ++ range->nlb = cpu_to_le32(nr_bytes >> ns->lba_shift); ++ range->slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); ++#endif + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; +@@ -875,10 +1051,29 @@ static blk_status_t nvme_setup_discard(s + cmnd->dsm.nr = cpu_to_le32(segments - 1); + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + ++#ifdef HAVE_REQUEST_RQ_FLAGS + req->special_vec.bv_page = virt_to_page(range); + req->special_vec.bv_offset = offset_in_page(range); + req->special_vec.bv_len = alloc_size; + req->rq_flags |= RQF_SPECIAL_PAYLOAD; ++#else ++ req->completion_data = range; ++ page = virt_to_page(range); ++ offset = offset_in_page(range); ++#ifdef HAVE_BLK_ADD_REQUEST_PAYLOAD_HAS_4_PARAMS ++ blk_add_request_payload(req, page, offset, sizeof(*range)); ++#else ++ blk_add_request_payload(req, page, sizeof(*range)); ++ req->bio->bi_io_vec->bv_offset = offset; ++#endif ++ ++ /* ++ * we set __data_len back to the size of the area to be discarded ++ * on disk. This allows us to report completion on the full amount ++ * of blocks described by the request. 
++ */ ++ req->__data_len = nr_bytes; ++#endif /* HAVE_REQUEST_RQ_FLAGS */ + + return BLK_STS_OK; + } +@@ -888,8 +1083,10 @@ static inline blk_status_t nvme_setup_wr + { + memset(cmnd, 0, sizeof(*cmnd)); + ++#ifdef HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + return nvme_setup_discard(ns, req, cmnd); ++#endif + + cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; + cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); +@@ -917,7 +1114,9 @@ static inline blk_status_t nvme_setup_rw + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) + { ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + struct nvme_ctrl *ctrl = ns->ctrl; ++#endif + u16 control = 0; + u32 dsmgmt = 0; + +@@ -940,8 +1139,10 @@ static inline blk_status_t nvme_setup_rw + cmnd->rw.apptag = 0; + cmnd->rw.appmask = 0; + ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) + nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); ++#endif + + if (ns->ms) { + /* +@@ -954,6 +1155,14 @@ static inline blk_status_t nvme_setup_rw + if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) + return BLK_STS_NOTSUPP; + control |= NVME_RW_PRINFO_PRACT; ++#if defined(HAVE_T10_PI_PREPARE) || !defined(HAVE_T10_PI_H) ++#ifdef HAVE_REQ_OP ++ } else if (req_op(req) == REQ_OP_WRITE) { ++#else ++ } else if (rq_data_dir(req) == WRITE) { ++#endif ++ t10_pi_prepare(req, ns->pi_type); ++#endif + } + + switch (ns->pi_type) { +@@ -978,14 +1187,46 @@ static inline blk_status_t nvme_setup_rw + + void nvme_cleanup_cmd(struct request *req) + { ++#if defined(HAVE_T10_PI_PREPARE) || !defined(HAVE_T10_PI_H) ++#ifdef HAVE_REQ_OP ++ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && ++ nvme_req(req)->status == 0) { ++#else ++ if (blk_integrity_rq(req) && rq_data_dir(req) == READ && ++ nvme_req(req)->status == 0) { ++#endif ++ struct nvme_ns *ns = req->rq_disk->private_data; ++ ++ t10_pi_complete(req, ns->pi_type, ++ blk_rq_bytes(req) >> ns->lba_shift); ++ } ++#endif ++#ifdef HAVE_REQUEST_RQ_FLAGS + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; ++#ifdef HAVE_BVEC_VIRT + + if (req->special_vec.bv_page == ctrl->discard_page) + clear_bit_unlock(0, &ctrl->discard_page_busy); + else + kfree(bvec_virt(&req->special_vec)); ++#else ++ struct page *page = req->special_vec.bv_page; ++ ++ if (page == ctrl->discard_page) ++ clear_bit_unlock(0, &ctrl->discard_page_busy); ++ else ++ kfree(page_address(page) + req->special_vec.bv_offset); ++#endif + } ++#else ++#ifdef HAVE_BLK_TYPES_REQ_OP_DISCARD ++ if (req_op(req) == REQ_OP_DISCARD) ++#else ++ if (req->cmd_flags & REQ_DISCARD) ++#endif ++ kfree(req->completion_data); ++#endif + } + EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); + +@@ -994,9 +1235,14 @@ blk_status_t nvme_setup_cmd(struct nvme_ + struct nvme_command *cmd = nvme_req(req)->cmd; + blk_status_t ret = BLK_STS_OK; + ++#ifdef HAVE_REQUEST_RQ_FLAGS + if (!(req->rq_flags & RQF_DONTPREP)) ++#else ++ if (!(req->cmd_flags & REQ_DONTPREP)) ++#endif + nvme_clear_nvme_request(req); + ++#ifdef HAVE_BLK_TYPES_REQ_OP_DRV_OUT + switch (req_op(req)) { + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: +@@ -1005,6 +1251,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ + case REQ_OP_FLUSH: + nvme_setup_flush(ns, cmd); + break; ++#ifdef HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); +@@ -1018,9 +1265,12 @@ blk_status_t nvme_setup_cmd(struct nvme_ + case REQ_OP_ZONE_FINISH: + 
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; ++#endif ++#ifdef HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS + case REQ_OP_WRITE_ZEROES: + ret = nvme_setup_write_zeroes(ns, req, cmd); + break; ++#endif + case REQ_OP_DISCARD: + ret = nvme_setup_discard(ns, req, cmd); + break; +@@ -1030,13 +1280,43 @@ blk_status_t nvme_setup_cmd(struct nvme_ + case REQ_OP_WRITE: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; ++#ifdef HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); + break; ++#endif + default: + WARN_ON_ONCE(1); + return BLK_STS_IOERR; + } ++#else ++#ifdef HAVE_BLKDEV_REQ_TYPE_DRV_PRIV ++ if (req->cmd_type == REQ_TYPE_DRV_PRIV) ++#else ++ if (req->cmd_type == REQ_TYPE_SPECIAL) ++#endif ++ memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); ++#ifdef HAVE_BLK_TYPES_REQ_OP_FLUSH ++ else if (req_op(req) == REQ_OP_FLUSH) ++#else ++ else if (req->cmd_flags & REQ_FLUSH) ++#endif ++ nvme_setup_flush(ns, cmd); ++#ifdef HAVE_BLK_TYPES_REQ_OP_DISCARD ++ else if (req_op(req) == REQ_OP_DISCARD) ++#else ++ else if (req->cmd_flags & REQ_DISCARD) ++#endif ++ ret = nvme_setup_discard(ns, req, cmd); ++#ifdef HAVE_REQ_OP ++ else if (req_op(req) == REQ_OP_READ) ++#else ++ else if (rq_data_dir(req) == READ) ++#endif ++ nvme_setup_rw(ns, req, cmd, nvme_cmd_read); ++ else ++ nvme_setup_rw(ns, req, cmd, nvme_cmd_write); ++#endif + + cmd->common.command_id = nvme_cid(req); + trace_nvme_setup_cmd(req, cmd); +@@ -1050,35 +1330,52 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); + * >0: nvme controller's cqe status response + * <0: kernel error in lieu of controller response + */ ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) + static int nvme_execute_rq(struct gendisk *disk, struct request *rq, + bool at_head) + { + blk_status_t status; + ++#ifdef HAVE_BLK_EXECUTE_RQ_2_PARAM + status = blk_execute_rq(rq, at_head); ++#else ++ status = blk_execute_rq(disk, rq, at_head); ++#endif + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + return -EINTR; + if (nvme_req(rq)->status) + return nvme_req(rq)->status; + return blk_status_to_errno(status); + } ++#endif + + /* + * Returns 0 on success. 
If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + union nvme_result *result, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) ++#else ++int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, ++ union nvme_result *result, void *buffer, unsigned bufflen, ++ unsigned timeout, int qid, int at_head, gfp_t gfp, bool reserved, ++ bool poll) ++#endif + { + struct request *req; + int ret; + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + if (qid == NVME_QID_ANY) + req = nvme_alloc_request(q, cmd, flags); + else + req = nvme_alloc_request_qid(q, cmd, flags, qid); ++#else ++ req = nvme_alloc_request(q, cmd, gfp, reserved); ++#endif + if (IS_ERR(req)) + return PTR_ERR(req); + +@@ -1091,9 +1388,23 @@ int __nvme_submit_sync_cmd(struct reques + goto out; + } + ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) + ret = nvme_execute_rq(NULL, req, at_head); + if (result && ret >= 0) + *result = nvme_req(req)->result; ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_4_PARAM ++ blk_execute_rq(req->q, NULL, req, at_head); ++#else ++ blk_execute_rq(NULL, req, at_head); ++#endif ++ if (result) ++ *result = nvme_req(req)->result; ++ if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ++ ret = -EINTR; ++ else ++ ret = nvme_req(req)->status; ++#endif + out: + blk_mq_free_request(req); + return ret; +@@ -1103,8 +1414,13 @@ EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd + int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) + { ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, + NVME_QID_ANY, 0, 0); ++#else ++ return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, ++ NVME_QID_ANY, 0, GFP_KERNEL, false, false); ++#endif + } + EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); + +@@ -1202,6 +1518,7 @@ static void nvme_passthru_end(struct nvm + } + } + ++#if defined(HAVE_BLK_EXECUTE_RQ_2_PARAM) || defined(HAVE_BLK_EXECUTE_RQ_3_PARAM) + int nvme_execute_passthru_rq(struct request *rq) + { + struct nvme_command *cmd = nvme_req(rq)->cmd; +@@ -1218,6 +1535,25 @@ int nvme_execute_passthru_rq(struct requ + + return ret; + } ++#else ++void nvme_execute_passthru_rq(struct request *rq) ++{ ++ struct nvme_command *cmd = nvme_req(rq)->cmd; ++ struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; ++ struct nvme_ns *ns = rq->q->queuedata; ++ struct gendisk *disk = ns ? 
ns->disk : NULL; ++ u32 effects; ++ ++ effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); ++#ifdef HAVE_BLK_EXECUTE_RQ_4_PARAM ++ blk_execute_rq(rq->q, disk, rq, 0); ++#else ++ blk_execute_rq(disk, rq, 0); ++#endif ++ if (effects) /* nothing to be done for zero cmd effects */ ++ nvme_passthru_end(ctrl, effects, cmd, 0); ++} ++#endif + EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); + + /* +@@ -1270,9 +1606,12 @@ static void nvme_keep_alive_work(struct + nvme_queue_keep_alive_work(ctrl); + return; + } +- ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); ++#else ++ rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, GFP_KERNEL, true); ++#endif + if (IS_ERR(rq)) { + /* allocation failure, reset the controller */ + dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq)); +@@ -1282,7 +1621,20 @@ static void nvme_keep_alive_work(struct + + rq->timeout = ctrl->kato * HZ; + rq->end_io_data = ctrl; ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_2_PARAM ++ rq->end_io = nvme_keep_alive_end_io; ++ blk_execute_rq_nowait(rq, false); ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_5_PARAM ++ blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_3_PARAM + blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io); ++#else ++ blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); ++#endif ++#endif ++#endif + } + + static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) +@@ -1512,8 +1864,13 @@ static int nvme_features(struct nvme_ctr + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, + buffer, buflen, 0, NVME_QID_ANY, 0, 0); ++#else ++ ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, ++ buffer, buflen, 0, NVME_QID_ANY, 0, GFP_KERNEL, false, false); ++#endif + if (ret >= 0 && result) + *result = le32_to_cpu(res.u32); + return ret; +@@ -1632,6 +1989,7 @@ int nvme_getgeo(struct block_device *bde + } + + #ifdef CONFIG_BLK_DEV_INTEGRITY ++#ifdef HAVE_BLK_INTEGRITY_DEVICE_CAPABLE + static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, + u32 max_integrity_segments) + { +@@ -1658,6 +2016,47 @@ static void nvme_init_integrity(struct g + blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); + } + #else ++#ifdef HAVE_REQUEST_QUEUE_INTEGRITY ++static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, ++ u32 max_integrity_segments) ++{ ++ struct blk_integrity integrity; ++ ++ memset(&integrity, 0, sizeof(integrity)); ++ integrity.tag_size = pi_type ? sizeof(u16) + sizeof(u32) ++ : sizeof(u16); ++ integrity.tuple_size = ms; ++ blk_integrity_register(disk, &integrity); ++ blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); ++} ++#else ++static int nvme_noop_verify(struct blk_integrity_exchg *exg) ++{ ++ return 0; ++} ++ ++static void nvme_noop_generate(struct blk_integrity_exchg *exg) ++{ ++} ++ ++struct blk_integrity nvme_meta_noop = { ++ .name = "NVME_META_NOOP", ++ .generate_fn = nvme_noop_generate, ++ .verify_fn = nvme_noop_verify, ++}; ++ ++static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, ++ u32 max_integrity_segments) ++{ ++ nvme_meta_noop.tag_size = pi_type ? 
sizeof(u16) + sizeof(u32) ++ : sizeof(u16); ++ nvme_meta_noop.tuple_size = ms; ++ blk_integrity_register(disk, &nvme_meta_noop); ++ blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); ++} ++#endif /* HAVE_REQUEST_QUEUE_INTEGRITY */ ++#endif /* HAVE_BLK_INTEGRITY_DEVICE_CAPABLE */ ++#else + static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, + u32 max_integrity_segments) + { +@@ -1671,28 +2070,51 @@ static void nvme_config_discard(struct g + u32 size = queue_logical_block_size(queue); + + if (ctrl->max_discard_sectors == 0) { ++#ifdef HAVE_QUEUE_FLAG_DISCARD + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); ++#else ++ blk_queue_max_discard_sectors(queue, 0); ++#endif + return; + } + ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + if (ctrl->nr_streams && ns->sws && ns->sgs) + size *= ns->sws * ns->sgs; ++#endif + ++#ifdef HAVE_BLK_RQ_NR_DISCARD_SEGMENTS + BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < + NVME_DSM_MAX_RANGES); ++#endif ++ ++#ifndef HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS ++ if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES) ++ queue->limits.discard_zeroes_data = 1; ++ else ++ queue->limits.discard_zeroes_data = 0; ++#endif + + queue->limits.discard_alignment = 0; + queue->limits.discard_granularity = size; + + /* If discard is already enabled, don't reset queue limits */ ++#ifdef HAVE_QUEUE_FLAG_DISCARD + if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) ++#else ++ if (queue->limits.max_discard_sectors) ++#endif + return; + + blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); ++#ifdef HAVE_BLK_RQ_NR_DISCARD_SEGMENTS + blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); ++#endif + ++#ifdef HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS + if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); ++#endif + } + + static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) +@@ -1703,6 +2125,7 @@ static bool nvme_ns_ids_equal(struct nvm + a->csi == b->csi; + } + ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u32 *phys_bs, u32 *io_opt) + { +@@ -1727,7 +2150,9 @@ static int nvme_setup_streams_ns(struct + + return 0; + } ++#endif + ++#if !defined HAVE_BD_SET_NR_SECTORS && !defined HAVE_BD_SET_SIZE && !defined HAVE_REVALIDATE_DISK_SIZE + static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) + { + struct nvme_ctrl *ctrl = ns->ctrl; +@@ -1781,6 +2206,7 @@ static void nvme_configure_metadata(stru + ns->features |= NVME_NS_METADATA_SUPPORTED; + } + } ++#endif + + static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +@@ -1795,7 +2221,12 @@ static void nvme_set_queue_limits(struct + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } ++#ifdef HAVE_BLK_QUEUE_VIRT_BOUNDARY + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); ++#else ++ if (!ctrl->sg_gaps_support) ++ queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, q); ++#endif + blk_queue_dma_alignment(q, 3); + blk_queue_write_cache(q, vwc, vwc); + } +@@ -1819,7 +2250,9 @@ static void nvme_update_disk_info(struct + blk_integrity_unregister(disk); + + atomic_bs = phys_bs = bs; ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt); ++#endif + if (id->nabo == 0) { + /* + * Bit 1 indicates whether NAWUPF is defined for this namespace +@@ -1864,17 +2297,31 @@ static void 
nvme_update_disk_info(struct + capacity = 0; + } + ++#if defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE ++#ifdef HAVE_SET_CAPACITY_REVALIDATE_AND_NOTIFY ++ set_capacity_revalidate_and_notify(disk, capacity, false); ++#else ++ set_capacity(disk, capacity); ++#endif ++#else + set_capacity_and_notify(disk, capacity); ++#endif + + nvme_config_discard(disk, ns); ++#ifdef HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS + blk_queue_max_write_zeroes_sectors(disk->queue, + ns->ctrl->max_zeroes_sectors); ++#endif + } + + static inline bool nvme_first_scan(struct gendisk *disk) + { + /* nvme_alloc_ns() scans the disk prior to adding it */ ++#ifdef HAVE_GENHD_FL_UP ++ return !(disk->flags & GENHD_FL_UP); ++#else + return !disk_live(disk); ++#endif + } + + static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) +@@ -1898,16 +2345,188 @@ static void nvme_set_chunk_sectors(struc + return; + } + ++#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(ns->disk->queue)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring zoned namespace IO boundary\n", + ns->disk->disk_name); + return; + } ++#endif + + blk_queue_chunk_sectors(ns->queue, iob); + } + ++#if defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE || defined HAVE_REVALIDATE_DISK_SIZE ++static void nvme_update_bdev_size(struct gendisk *disk); ++ ++static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, ++ struct nvme_id_ns *id, struct nvme_ns_ids *ids) ++{ ++ memset(ids, 0, sizeof(*ids)); ++ ++ if (ctrl->vs >= NVME_VS(1, 1, 0)) ++ memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); ++ if (ctrl->vs >= NVME_VS(1, 2, 0)) ++ memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); ++ if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) ++ return nvme_identify_ns_descs(ctrl, nsid, ids); ++ return 0; ++} ++ ++ ++static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ++{ ++ unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; ++ struct nvme_ns *ns = disk->private_data; ++ struct nvme_ctrl *ctrl = ns->ctrl; ++ int ret; ++ ++ /* ++ * If identify namespace failed, use default 512 byte block size so ++ * block layer can use before failing read/write for 0 capacity. ++ */ ++ ns->lba_shift = id->lbaf[lbaf].ds; ++ if (ns->lba_shift == 0) ++ ns->lba_shift = 9; ++ ++ switch (ns->head->ids.csi) { ++ case NVME_CSI_NVM: ++ break; ++ case NVME_CSI_ZNS: ++ ret = nvme_update_zone_info(ns, lbaf); ++ if (ret) { ++ dev_warn(ctrl->device, ++ "failed to add zoned namespace:%u ret:%d\n", ++ ns->head->ns_id, ret); ++ return ret; ++ } ++ break; ++ default: ++ dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", ++ ns->head->ids.csi, ns->head->ns_id); ++ return -ENODEV; ++ } ++ ++ ns->features = 0; ++ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); ++ /* the PI implementation requires metadata equal t10 pi tuple size */ ++ if (ns->ms == sizeof(struct t10_pi_tuple)) ++ ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; ++ else ++ ns->pi_type = 0; ++ ++ if (ns->ms) { ++ /* ++ * For PCIe only the separate metadata pointer is supported, ++ * as the block layer supplies metadata in a separate bio_vec ++ * chain. For Fabrics, only metadata as part of extended data ++ * LBA is supported on the wire per the Fabrics specification, ++ * but the HBA/HCA will do the remapping from the separate ++ * metadata buffers for us. 
++ */ ++ if (id->flbas & NVME_NS_FLBAS_META_EXT) { ++ ns->features |= NVME_NS_EXT_LBAS; ++ if ((ctrl->ops->flags & NVME_F_FABRICS) && ++ (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) && ++ ctrl->max_integrity_segments) ++ ns->features |= NVME_NS_METADATA_SUPPORTED; ++ } else { ++ if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS)) ++ return -EINVAL; ++ if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) ++ ns->features |= NVME_NS_METADATA_SUPPORTED; ++ } ++ } ++ ++ nvme_set_chunk_sectors(ns, id); ++ nvme_update_disk_info(disk, ns, id); ++ set_bit(NVME_NS_READY, &ns->flags); ++#ifdef CONFIG_NVME_MULTIPATH ++ if (ns->head->disk) { ++ nvme_update_disk_info(ns->head->disk, ns, id); ++ blk_stack_limits(&ns->head->disk->queue->limits, ++ &ns->queue->limits, 0); ++#if defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE ++ nvme_update_bdev_size(ns->head->disk); ++#endif ++ } ++#endif ++ return 0; ++} ++ ++static int _nvme_revalidate_disk(struct gendisk *disk) ++{ ++ struct nvme_ns *ns = disk->private_data; ++ struct nvme_ctrl *ctrl = ns->ctrl; ++ struct nvme_id_ns *id; ++ struct nvme_ns_ids ids; ++ int ret = 0; ++ ++ if (test_bit(NVME_NS_DEAD, &ns->flags)) { ++ set_capacity(disk, 0); ++ return -ENODEV; ++ } ++ ++ ret = nvme_identify_ns(ctrl, ns->head->ns_id, &ids, &id); ++ if (ret) ++ goto out; ++ ++ if (id->ncap == 0) { ++ ret = -ENODEV; ++ goto free_id; ++ } ++ ++ ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); ++ if (ret) ++ goto free_id; ++ ++ if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { ++ dev_err(ctrl->device, ++ "identifiers changed for nsid %d\n", ns->head->ns_id); ++ ret = -ENODEV; ++ goto free_id; ++ } ++ ++ ret = __nvme_revalidate_disk(disk, id); ++free_id: ++ kfree(id); ++out: ++ /* ++ * Only fail the function if we got a fatal error back from the ++ * device, otherwise ignore the error and just move on. 
++ */ ++ if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) ++ ret = 0; ++ else if (ret > 0) ++ ret = blk_status_to_errno(nvme_error_status(ret)); ++ return ret; ++} ++ ++ ++static int nvme_revalidate_disk(struct gendisk *disk) ++{ ++ int ret; ++ ++ ret = _nvme_revalidate_disk(disk); ++ if (ret) ++ return ret; ++ ++#ifdef CONFIG_BLK_DEV_ZONED ++ if (blk_queue_is_zoned(disk->queue)) { ++ struct nvme_ns *ns = disk->private_data; ++ struct nvme_ctrl *ctrl = ns->ctrl; ++ ++ ret = blk_revalidate_disk_zones(disk, NULL); ++ if (!ret) ++ blk_queue_max_zone_append_sectors(disk->queue, ++ ctrl->max_zone_append); ++ } ++#endif ++ return ret; ++} ++#else //defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE || defined HAVE_REVALIDATE_DISK_SIZE ++ + static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) + { + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; +@@ -1934,11 +2553,13 @@ static int nvme_update_ns_info(struct nv + set_bit(NVME_NS_READY, &ns->flags); + blk_mq_unfreeze_queue(ns->disk->queue); + ++#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(ns->queue)) { + ret = nvme_revalidate_zones(ns); + if (ret && !nvme_first_scan(ns->disk)) + goto out; + } ++#endif + + if (nvme_ns_head_multipath(ns->head)) { + blk_mq_freeze_queue(ns->head->disk->queue); +@@ -1949,7 +2570,11 @@ static int nvme_update_ns_info(struct nv + nvme_mpath_revalidate_paths(ns); + blk_stack_limits(&ns->head->disk->queue->limits, + &ns->queue->limits, 0); ++#ifdef HAVE_DISK_UPDATE_READAHEAD + disk_update_readahead(ns->head->disk); ++#else ++ blk_queue_update_readahead(ns->head->disk->queue); ++#endif + blk_mq_unfreeze_queue(ns->head->disk->queue); + } + +@@ -1966,7 +2591,9 @@ out: + } + return ret; + } ++#endif //defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE || defined HAVE_REVALIDATE_DISK_SIZE + ++#ifdef HAVE_PR_H + static char nvme_pr_type(enum pr_type type) + { + switch (type) { +@@ -2084,7 +2711,9 @@ const struct pr_ops nvme_pr_ops = { + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, + }; ++#endif + ++#ifdef HAVE_LINUX_SED_OPAL_H + #ifdef CONFIG_BLK_SED_OPAL + int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send) +@@ -2100,11 +2729,18 @@ int nvme_sec_submit(void *data, u16 spsp + cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw11 = cpu_to_le32(len); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, + NVME_QID_ANY, 1, 0); ++#else ++ return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, ++ ADMIN_TIMEOUT, NVME_QID_ANY, 1, ++ GFP_KERNEL, false, false); ++#endif + } + EXPORT_SYMBOL_GPL(nvme_sec_submit); + #endif /* CONFIG_BLK_SED_OPAL */ ++#endif /* HAVE_LINUX_SED_OPAL_H */ + + #ifdef CONFIG_BLK_DEV_ZONED + static int nvme_report_zones(struct gendisk *disk, sector_t sector, +@@ -2123,8 +2759,15 @@ static const struct block_device_operati + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, ++#ifdef HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES + .report_zones = nvme_report_zones, ++#endif ++#if !defined(HAVE_REVALIDATE_DISK_SIZE) && !defined(HAVE_BDEV_NR_SECTORS) ++ .revalidate_disk= nvme_revalidate_disk, ++#endif ++#ifdef HAVE_PR_H + .pr_ops = &nvme_pr_ops, ++#endif + }; + + static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +@@ -2301,6 +2944,7 @@ static int nvme_configure_acre(struct nv + * timeout value is returned and the matching tolerance index (1 or 2) is + * reported. 
+ */ ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + static bool nvme_apst_get_transition_time(u64 total_latency, + u64 *transition_time, unsigned *last_index) + { +@@ -2321,6 +2965,7 @@ static bool nvme_apst_get_transition_tim + } + return false; + } ++#endif + + /* + * APST (Autonomous Power State Transition) lets us program a table of power +@@ -2347,6 +2992,7 @@ static bool nvme_apst_get_transition_tim + * + * Users can set ps_max_latency_us to zero to turn off APST. + */ ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + static int nvme_configure_apst(struct nvme_ctrl *ctrl) + { + struct nvme_feat_auto_pst *table; +@@ -2473,6 +3119,7 @@ static void nvme_set_latency_tolerance(s + nvme_configure_apst(ctrl); + } + } ++#endif + + struct nvme_core_quirk_entry { + /* +@@ -2940,7 +3587,9 @@ static int nvme_init_identify(struct nvm + { + struct nvme_id_ctrl *id; + u32 max_hw_sectors; ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + bool prev_apst_enabled; ++#endif + int ret; + + ret = nvme_identify_ctrl(ctrl, &id); +@@ -3024,6 +3673,7 @@ static int nvme_init_identify(struct nvm + } else + ctrl->shutdown_timeout = shutdown_timeout; + ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + ctrl->npss = id->npss; + ctrl->apsta = id->apsta; + prev_apst_enabled = ctrl->apst_enabled; +@@ -3038,6 +3688,7 @@ static int nvme_init_identify(struct nvm + ctrl->apst_enabled = id->apsta; + } + memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); ++#endif + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->icdoff = le16_to_cpu(id->icdoff); +@@ -3075,10 +3726,12 @@ static int nvme_init_identify(struct nvm + if (ret < 0) + goto out_free; + ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + if (ctrl->apst_enabled && !prev_apst_enabled) + dev_pm_qos_expose_latency_tolerance(ctrl->device); + else if (!ctrl->apst_enabled && prev_apst_enabled) + dev_pm_qos_hide_latency_tolerance(ctrl->device); ++#endif + + out_free: + kfree(id); +@@ -3113,17 +3766,21 @@ int nvme_init_ctrl_finish(struct nvme_ct + if (ret < 0) + return ret; + ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + ret = nvme_configure_apst(ctrl); + if (ret < 0) + return ret; ++#endif + + ret = nvme_configure_timestamp(ctrl); + if (ret < 0) + return ret; + ++#ifdef HAVE_BLK_MAX_WRITE_HINTS + ret = nvme_configure_directives(ctrl); + if (ret < 0) + return ret; ++#endif + + ret = nvme_configure_acre(ctrl); + if (ret < 0) +@@ -3225,8 +3882,10 @@ static ssize_t wwid_show(struct device * + int serial_len = sizeof(subsys->serial); + int model_len = sizeof(subsys->model); + ++#ifdef HAVE_UUID_IS_NULL + if (!uuid_is_null(&ids->uuid)) + return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); ++#endif + + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); +@@ -3247,12 +3906,14 @@ static ssize_t wwid_show(struct device * + } + static DEVICE_ATTR_RO(wwid); + ++#ifdef HAVE_UUID_IS_NULL + static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, + char *buf) + { + return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); + } + static DEVICE_ATTR_RO(nguid); ++#endif + + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +@@ -3262,11 +3923,13 @@ static ssize_t uuid_show(struct device * + /* For backward compatibility expose the NGUID to userspace if + * we have no UUID set + */ ++#ifdef HAVE_UUID_IS_NULL + if (uuid_is_null(&ids->uuid)) { + dev_warn_ratelimited(dev, + "No UUID available providing old NGUID\n"); + return sysfs_emit(buf, "%pU\n", ids->nguid); + } ++#endif + 
return sysfs_emit(buf, "%pU\n", &ids->uuid); + } + static DEVICE_ATTR_RO(uuid); +@@ -3288,7 +3951,9 @@ static DEVICE_ATTR_RO(nsid); + static struct attribute *nvme_ns_id_attrs[] = { + &dev_attr_wwid.attr, + &dev_attr_uuid.attr, ++#ifdef HAVE_UUID_IS_NULL + &dev_attr_nguid.attr, ++#endif + &dev_attr_eui.attr, + &dev_attr_nsid.attr, + #ifdef CONFIG_NVME_MULTIPATH +@@ -3305,11 +3970,13 @@ static umode_t nvme_ns_id_attrs_are_visi + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; + + if (a == &dev_attr_uuid.attr) { ++#ifdef HAVE_UUID_IS_NULL + if (uuid_is_null(&ids->uuid) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } + if (a == &dev_attr_nguid.attr) { ++#endif + if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } +@@ -3328,7 +3995,11 @@ static umode_t nvme_ns_id_attrs_are_visi + return a->mode; + } + ++#ifdef HAVE_DEVICE_ADD_DISK_3_ARGS + static const struct attribute_group nvme_ns_id_attr_group = { ++#else ++const struct attribute_group nvme_ns_id_attr_group = { ++#endif + .attrs = nvme_ns_id_attrs, + .is_visible = nvme_ns_id_attrs_are_visible, + }; +@@ -3367,6 +4038,7 @@ nvme_show_int_function(queue_count); + nvme_show_int_function(sqsize); + nvme_show_int_function(kato); + ++#ifdef HAVE_DEVICE_REMOVE_FILE_SELF + static ssize_t nvme_sysfs_delete(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +@@ -3377,6 +4049,49 @@ static ssize_t nvme_sysfs_delete(struct + nvme_delete_ctrl_sync(ctrl); + return count; + } ++#else ++static int __nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) ++{ ++ int ret = 0; ++ ++ /* ++ * Keep a reference until the work is flushed since ->delete_ctrl ++ * can free the controller. ++ */ ++ nvme_get_ctrl(ctrl); ++ ret = nvme_delete_ctrl(ctrl); ++ if (!ret) ++ flush_work(&ctrl->delete_work); ++ nvme_put_ctrl(ctrl); ++ return ret; ++} ++ ++static void nvme_delete_callback(struct device *dev) ++{ ++ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); ++ ++ __nvme_delete_ctrl_sync(ctrl); ++} ++ ++static ssize_t nvme_sysfs_delete(struct device *dev, ++ struct device_attribute *attr, const char *buf, ++ size_t count) ++{ ++ int ret; ++ ++ /* An attribute cannot be unregistered by one of its own methods, ++ * so we have to use this roundabout approach. 
++ */ ++ ret = device_schedule_callback(dev, nvme_delete_callback); ++ if (ret) ++ count = ret; ++ else ++ /* Wait for nvme_delete_callback() to finish */ ++ msleep(500); ++ ++ return count; ++} ++#endif + static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); + + static ssize_t nvme_sysfs_show_transport(struct device *dev, +@@ -3621,7 +4336,11 @@ static struct nvme_ns_head *nvme_find_ns + static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys, + struct nvme_ns_ids *ids) + { ++#ifdef HAVE_UUID_IS_NULL + bool has_uuid = !uuid_is_null(&ids->uuid); ++#else ++ bool has_uuid = false; ++#endif + bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid)); + bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); + struct nvme_ns_head *h; +@@ -3863,6 +4582,7 @@ static void nvme_alloc_ns(struct nvme_ct + if (!ns) + goto out_free_id; + ++#ifdef HAVE_BLK_MQ_ALLOC_DISK + disk = blk_mq_alloc_disk(ctrl->tagset, ns); + if (IS_ERR(disk)) + goto out_free_ns; +@@ -3871,19 +4591,61 @@ static void nvme_alloc_ns(struct nvme_ct + + ns->disk = disk; + ns->queue = disk->queue; ++#else ++ ns->queue = blk_mq_init_queue(ctrl->tagset); ++ if (IS_ERR(ns->queue)) ++ goto out_free_ns; ++#endif + ++#ifdef HAVE_REQUEST_QUEUE_BACKING_DEV_INFO + if (ctrl->opts && ctrl->opts->data_digest) ++#ifdef HAVE_QUEUE_FLAG_STABLE_WRITES + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); ++#else ++ ns->queue->backing_dev_info->capabilities ++ |= BDI_CAP_STABLE_WRITES; ++#endif ++#endif + + blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); ++#ifdef HAVE_QUEUE_FLAG_PCI_P2PDMA + if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) + blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); ++#endif + ++#ifndef HAVE_BLK_MQ_ALLOC_DISK ++ ns->queue->queuedata = ns; ++#endif + ns->ctrl = ctrl; + kref_init(&ns->kref); + ++#if defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE || defined HAVE_REVALIDATE_DISK_SIZE ++ ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ ++ ++ blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); ++ nvme_set_queue_limits(ctrl, ns->queue); ++#endif ++ + if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) ++#ifdef HAVE_BLK_MQ_ALLOC_DISK + goto out_cleanup_disk; ++#else ++ goto out_free_queue; ++ ++ disk = alloc_disk_node(0, node); ++ if (!disk) ++ goto out_unlink_ns; ++ ++ disk->fops = &nvme_bdev_ops; ++ disk->private_data = ns; ++ disk->queue = ns->queue; ++#if !defined(HAVE_DEVICE_ADD_DISK) && !defined(HAVE_DEVICE_ADD_DISK_3_ARGS) ++ disk->driverfs_dev = ctrl->device; ++#endif ++#ifdef HAVE_GENHD_FL_EXT_DEVT ++ disk->flags = GENHD_FL_EXT_DEVT; ++#endif ++#endif /* HAVE_BLK_MQ_ALLOC_DISK */ + + /* + * Without the multipath code enabled, multiple controller per +@@ -3893,17 +4655,43 @@ static void nvme_alloc_ns(struct nvme_ct + if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, + ns->head->instance); +- ++#ifndef HAVE_BLK_MQ_ALLOC_DISK ++ ns->disk = disk; ++#endif ++#if defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE || defined HAVE_REVALIDATE_DISK_SIZE ++ if (__nvme_revalidate_disk(disk, id)) ++#else + if (nvme_update_ns_info(ns, id)) ++#endif ++#ifdef HAVE_BLK_MQ_ALLOC_DISK + goto out_unlink_ns; ++#else ++ goto out_put_disk; ++#endif + + down_write(&ctrl->namespaces_rwsem); + nvme_ns_add_to_ctrl_list(ns); + up_write(&ctrl->namespaces_rwsem); + nvme_get_ctrl(ctrl); + ++#ifdef HAVE_DEVICE_ADD_DISK_3_ARGS ++#ifdef 
HAVE_DEVICE_ADD_DISK_RETURN + if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups)) + goto out_cleanup_ns_from_list; ++#else ++ device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); ++#endif ++#else ++#ifdef HAVE_DEVICE_ADD_DISK ++ device_add_disk(ctrl->device, ns->disk); ++#else ++ add_disk(ns->disk); ++#endif ++ if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, ++ &nvme_ns_id_attr_group)) ++ pr_warn("%s: failed to create sysfs group for identification\n", ++ ns->disk->disk_name); ++#endif /* HAVE_DEVICE_ADD_DISK_3_ARGS */ + + if (!nvme_ns_head_multipath(ns->head)) + nvme_add_ns_cdev(ns); +@@ -3913,12 +4701,19 @@ static void nvme_alloc_ns(struct nvme_ct + kfree(id); + + return; +- ++#ifdef HAVE_DEVICE_ADD_DISK_RETURN + out_cleanup_ns_from_list: + nvme_put_ctrl(ctrl); + down_write(&ctrl->namespaces_rwsem); + list_del_init(&ns->list); + up_write(&ctrl->namespaces_rwsem); ++#endif ++#ifndef HAVE_BLK_MQ_ALLOC_DISK ++ out_put_disk: ++ /* prevent double queue cleanup */ ++ ns->disk->queue = NULL; ++ put_disk(ns->disk); ++#endif + out_unlink_ns: + mutex_lock(&ctrl->subsys->lock); + list_del_rcu(&ns->siblings); +@@ -3926,8 +4721,17 @@ static void nvme_alloc_ns(struct nvme_ct + list_del_init(&ns->head->entry); + mutex_unlock(&ctrl->subsys->lock); + nvme_put_ns_head(ns->head); ++#ifdef HAVE_BLK_MQ_ALLOC_DISK + out_cleanup_disk: ++#ifdef HAVE_BLK_CLEANUP_DISK + blk_cleanup_disk(disk); ++#else ++ put_disk(disk); ++#endif ++#else ++ out_free_queue: ++ blk_cleanup_queue(ns->queue); ++#endif + out_free_ns: + kfree(ns); + out_free_id: +@@ -3969,7 +4773,9 @@ static void nvme_ns_remove(struct nvme_n + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); + del_gendisk(ns->disk); ++#ifndef HAVE_BLK_MQ_DESTROY_QUEUE + blk_cleanup_queue(ns->queue); ++#endif + + down_write(&ns->ctrl->namespaces_rwsem); + list_del_init(&ns->list); +@@ -4009,7 +4815,18 @@ static void nvme_validate_ns(struct nvme + goto out_free_id; + } + ++#ifdef HAVE_REVALIDATE_DISK_SIZE ++ ret = nvme_revalidate_disk(ns->disk); ++ revalidate_disk_size(ns->disk, ret == 0); ++#elif defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE ++#ifdef HAVE_BDEV_NR_SECTORS ++ ret = nvme_revalidate_disk(ns->disk); ++#else ++ ret = revalidate_disk(ns->disk); ++#endif ++#else + ret = nvme_update_ns_info(ns, id); ++#endif + + out_free_id: + kfree(id); +@@ -4443,7 +5260,9 @@ void nvme_uninit_ctrl(struct nvme_ctrl * + { + nvme_hwmon_exit(ctrl); + nvme_fault_inject_fini(&ctrl->fault_inject); ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + dev_pm_qos_hide_latency_tolerance(ctrl->device); ++#endif + cdev_device_del(&ctrl->cdev, ctrl->device); + nvme_put_ctrl(ctrl); + } +@@ -4557,9 +5376,11 @@ int nvme_init_ctrl(struct nvme_ctrl *ctr + * Initialize latency tolerance controls. The sysfs files won't + * be visible to userspace unless the device actually supports APST. 
+ */ ++#ifdef HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE + ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; + dev_pm_qos_update_user_latency_tolerance(ctrl->device, + min(default_ps_max_latency_us, (unsigned long)S32_MAX)); ++#endif + + nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); + nvme_mpath_init_ctrl(ctrl); +@@ -4580,17 +5401,76 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl); + static void nvme_start_ns_queue(struct nvme_ns *ns) + { + if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags)) ++#ifdef HAVE_BLKDEV_QUEUE_FLAG_QUIESCED ++ blk_mq_unquiesce_queue(ns->queue); ++#else ++#ifdef HAVE_BLK_MQ_QUIESCE_QUEUE ++ queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); ++#endif ++#ifdef HAVE_BLK_MQ_UNQUIESCE_QUEUE + blk_mq_unquiesce_queue(ns->queue); ++#else ++ blk_mq_start_stopped_hw_queues(ns->queue, true); ++#endif ++#endif /* HAVE_BLKDEV_QUEUE_FLAG_QUIESCED */ + } + + static void nvme_stop_ns_queue(struct nvme_ns *ns) + { +- if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) ++ if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) { ++#ifdef HAVE_BLK_MQ_QUIESCE_QUEUE ++#ifdef HAVE_BLKDEV_QUEUE_FLAG_QUIESCED + blk_mq_quiesce_queue(ns->queue); +- else ++#else ++ spin_lock_irq(ns->queue->queue_lock); ++ queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); ++ spin_unlock_irq(ns->queue->queue_lock); ++ blk_mq_quiesce_queue(ns->queue); ++#endif ++#else ++ blk_mq_cancel_requeue_work(ns->queue); ++ blk_mq_stop_hw_queues(ns->queue); ++#endif ++#ifdef HAVE_BLK_MQ_WAIT_QUIESCE_DONE ++ } else { + blk_mq_wait_quiesce_done(ns->queue); ++ } ++#else ++ } ++#endif + } + ++#if defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE ++static void nvme_update_bdev_size(struct gendisk *disk) ++{ ++ struct block_device *bdev = bdget_disk(disk, 0); ++ ++ if (bdev) { ++#ifdef HAVE_BD_SET_NR_SECTORS ++ bd_set_nr_sectors(bdev, get_capacity(disk)); ++#else ++ if (bdev->bd_disk) { ++ bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT); ++ } else { ++#ifdef HAVE_INODE_LOCK ++ inode_lock(bdev->bd_inode); ++#else ++ mutex_lock(&bdev->bd_inode->i_mutex); ++#endif ++ i_size_write(bdev->bd_inode, ++ get_capacity(disk) << SECTOR_SHIFT); ++#ifdef HAVE_INODE_LOCK ++ inode_unlock(bdev->bd_inode); ++#else ++ mutex_unlock(&bdev->bd_inode->i_mutex); ++#endif ++ } ++#endif ++ bdput(bdev); ++ } ++} ++#endif ++ + /* + * Prepare a queue for teardown. 
+ * +@@ -4604,10 +5484,19 @@ static void nvme_set_queue_dying(struct + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + return; + ++#ifdef HAVE_BLK_MARK_DISK_DEAD + blk_mark_disk_dead(ns->disk); ++#else ++ blk_set_queue_dying(ns->queue); ++#endif + nvme_start_ns_queue(ns); + ++#if defined HAVE_BD_SET_NR_SECTORS || defined HAVE_BD_SET_SIZE ++ set_capacity(ns->disk, 0); ++ nvme_update_bdev_size(ns->disk); ++#else + set_capacity_and_notify(ns->disk, 0); ++#endif + } + + /** +@@ -4677,7 +5566,11 @@ void nvme_start_freeze(struct nvme_ctrl + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) ++#ifdef HAVE_BLK_FREEZE_QUEUE_START + blk_freeze_queue_start(ns->queue); ++#else ++ blk_mq_freeze_queue_start(ns->queue); ++#endif + up_read(&ctrl->namespaces_rwsem); + } + EXPORT_SYMBOL_GPL(nvme_start_freeze); +@@ -4707,16 +5600,26 @@ EXPORT_SYMBOL_GPL(nvme_start_queues); + void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) + { + if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) ++#ifdef HAVE_BLK_MQ_UNQUIESCE_QUEUE + blk_mq_quiesce_queue(ctrl->admin_q); ++#else ++ blk_mq_stop_hw_queues(ctrl->admin_q); ++#endif ++#ifdef HAVE_BLK_MQ_WAIT_QUIESCE_DONE + else + blk_mq_wait_quiesce_done(ctrl->admin_q); ++#endif + } + EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); + + void nvme_start_admin_queue(struct nvme_ctrl *ctrl) + { + if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) ++#ifdef HAVE_BLK_MQ_UNQUIESCE_QUEUE + blk_mq_unquiesce_queue(ctrl->admin_q); ++#else ++ blk_mq_start_stopped_hw_queues(ctrl->admin_q, true); ++#endif + } + EXPORT_SYMBOL_GPL(nvme_start_admin_queue); + +@@ -4886,6 +5789,9 @@ static void __exit nvme_core_exit(void) + } + + MODULE_LICENSE("GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + MODULE_VERSION("1.0"); + module_init(nvme_core_init); + module_exit(nvme_core_exit); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0309-BACKPORT-drivers-nvme-host-fc.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0309-BACKPORT-drivers-nvme-host-fc.c.patch new file mode 100644 index 0000000..b3415ef --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0309-BACKPORT-drivers-nvme-host-fc.c.patch @@ -0,0 +1,340 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/host/fc.c + +Change-Id: Id5a3d60331a15a46ea4e47b0da00b14a03e2135c +--- + drivers/nvme/host/fc.c | 113 ++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 112 insertions(+), 1 deletion(-) + +--- a/drivers/nvme/host/fc.c ++++ b/drivers/nvme/host/fc.c +@@ -2,6 +2,8 @@ + /* + * Copyright (c) 2016 Avago Technologies. All rights reserved. 
+ */ ++#ifdef HAVE_LINUX_NVME_FC_DRIVER_H ++ + #ifdef pr_fmt + #undef pr_fmt + #endif +@@ -12,13 +14,18 @@ + #include + #include + #include ++#include ++#ifdef HAVE_FC_APPID_LEN + #include ++#endif + #include "nvme.h" + #include "fabrics.h" + #include + #include + #include "fc.h" ++#ifdef HAVE_SCSI_TRANSPORT_FC_FC_PORT_ROLE_NVME_TARGET + #include ++#endif + #include + + /* *************************** Data Structures/Defines ****************** */ +@@ -1826,6 +1833,7 @@ __nvme_fc_exit_request(struct nvme_fc_ct + atomic_set(&op->state, FCPOP_STATE_UNINIT); + } + ++#ifdef HAVE_BLK_MQ_OPS_EXIT_REQUEST_HAS_3_PARAMS + static void + nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx) +@@ -1834,6 +1842,16 @@ nvme_fc_exit_request(struct blk_mq_tag_s + + return __nvme_fc_exit_request(set->driver_data, op); + } ++#else ++static void ++nvme_fc_exit_request(void *data, struct request *rq, ++ unsigned int hctx_idx, unsigned int rq_idx) ++{ ++ struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); ++ ++ __nvme_fc_exit_request(data, op); ++} ++#endif + + static int + __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) +@@ -2116,6 +2134,7 @@ out_on_error: + return ret; + } + ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + static int + nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) +@@ -2135,6 +2154,49 @@ nvme_fc_init_request(struct blk_mq_tag_s + nvme_req(rq)->cmd = &op->op.cmd_iu.sqe; + return res; + } ++#else ++static int ++nvme_fc_init_request(void *data, struct request *rq, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ struct nvme_fc_ctrl *ctrl = data; ++ struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq); ++ struct nvme_fc_queue *queue = &ctrl->queues[hctx_idx+1]; ++ int res; ++ ++ res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++); ++ if (res) ++ return res; ++ op->op.fcp_req.first_sgl = &op->sgl[0]; ++ op->op.fcp_req.private = &op->priv[0]; ++ nvme_req(rq)->ctrl = &ctrl->ctrl; ++ nvme_req(rq)->cmd = &op->op.cmd_iu.sqe; ++ ++ return res; ++} ++ ++static int ++nvme_fc_init_admin_request(void *data, struct request *rq, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ struct nvme_fc_ctrl *ctrl = data; ++ struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq); ++ struct nvme_fc_queue *queue = &ctrl->queues[0]; ++ int res; ++ ++ res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++); ++ if (res) ++ return res; ++ op->op.fcp_req.first_sgl = &op->sgl[0]; ++ op->op.fcp_req.private = &op->priv[0]; ++ nvme_req(rq)->ctrl = &ctrl->ctrl; ++ nvme_req(rq)->cmd = &op->op.cmd_iu.sqe; ++ return res; ++} ++#endif ++ + + static int + nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) +@@ -2377,7 +2439,11 @@ nvme_fc_ctrl_free(struct kref *ref) + unsigned long flags; + + if (ctrl->ctrl.tagset) { ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); ++#else + blk_cleanup_queue(ctrl->ctrl.connect_q); ++#endif + blk_mq_free_tag_set(&ctrl->tag_set); + } + +@@ -2387,8 +2453,13 @@ nvme_fc_ctrl_free(struct kref *ref) + spin_unlock_irqrestore(&ctrl->rport->lock, flags); + + nvme_start_admin_queue(&ctrl->ctrl); ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); ++#else + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++#endif + 
blk_mq_free_tag_set(&ctrl->admin_tag_set); + + kfree(ctrl->queues); +@@ -2441,8 +2512,13 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl + * status. The done path will return the io request back to the block + * layer with an error status. + */ +-static bool +-nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) ++#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_3_PARAMS ++static bool nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) ++#elif defined HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_2_PARAMS ++static bool nvme_fc_terminate_exchange(struct request *req, void *data) ++#else ++static void nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) ++#endif + { + struct nvme_ctrl *nctrl = data; + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); +@@ -2450,7 +2526,9 @@ nvme_fc_terminate_exchange(struct reques + + op->nreq.flags |= NVME_REQ_CANCELLED; + __nvme_fc_abort_op(ctrl, op); ++#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL + return true; ++#endif + } + + /* +@@ -2552,7 +2630,11 @@ nvme_fc_error_recovery(struct nvme_fc_ct + } + + static enum blk_eh_timer_return ++#ifdef HAVE_BLK_MQ_OPS_TIMEOUT_1_PARAM ++nvme_fc_timeout(struct request *rq) ++#else + nvme_fc_timeout(struct request *rq, bool reserved) ++#endif + { + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + struct nvme_fc_ctrl *ctrl = op->ctrl; +@@ -2592,9 +2674,18 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ct + return 0; + + freq->sg_table.sgl = freq->first_sgl; ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + ret = sg_alloc_table_chained(&freq->sg_table, + blk_rq_nr_phys_segments(rq), freq->sg_table.sgl, + NVME_INLINE_SG_CNT); ++#else ++ ret = sg_alloc_table_chained(&freq->sg_table, ++ blk_rq_nr_phys_segments(rq), ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_4_PARAMS ++ GFP_ATOMIC, ++#endif ++ freq->sg_table.sgl); ++#endif + if (ret) + return -ENOMEM; + +@@ -2603,7 +2694,11 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ct + freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, + op->nents, rq_dma_dir(rq)); + if (unlikely(freq->sg_cnt <= 0)) { ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&freq->sg_table, NVME_INLINE_SG_CNT); ++#else ++ sg_free_table_chained(&freq->sg_table, true); ++#endif + freq->sg_cnt = 0; + return -EFAULT; + } +@@ -2626,7 +2721,11 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl * + fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, + rq_dma_dir(rq)); + ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&freq->sg_table, NVME_INLINE_SG_CNT); ++#else ++ sg_free_table_chained(&freq->sg_table, true); ++#endif + + freq->sg_cnt = 0; + } +@@ -2801,7 +2900,11 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *h + * physical segments, there is no payload. + */ + if (blk_rq_nr_phys_segments(rq)) { ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES + data_len = blk_rq_payload_bytes(rq); ++#else ++ data_len = nvme_map_len(rq); ++#endif + io_dir = ((rq_data_dir(rq) == WRITE) ? 
+ NVMEFC_FCP_WRITE : NVMEFC_FCP_READ); + } else { +@@ -2846,6 +2949,7 @@ nvme_fc_complete_rq(struct request *rq) + nvme_fc_ctrl_put(ctrl); + } + ++#if defined(HAVE_BLK_MQ_MAP_QUEUES) && defined(HAVE_BLK_MQ_TAG_SET_HAS_MAP) + static int nvme_fc_map_queues(struct blk_mq_tag_set *set) + { + struct nvme_fc_ctrl *ctrl = set->driver_data; +@@ -2868,15 +2972,22 @@ static int nvme_fc_map_queues(struct blk + } + return 0; + } ++#endif + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_fc_mq_ops = { ++#else ++static struct blk_mq_ops nvme_fc_mq_ops = { ++#endif + .queue_rq = nvme_fc_queue_rq, + .complete = nvme_fc_complete_rq, + .init_request = nvme_fc_init_request, + .exit_request = nvme_fc_exit_request, + .init_hctx = nvme_fc_init_hctx, + .timeout = nvme_fc_timeout, ++#if defined(HAVE_BLK_MQ_MAP_QUEUES) && defined(HAVE_BLK_MQ_TAG_SET_HAS_MAP) + .map_queues = nvme_fc_map_queues, ++#endif + }; + + static int +@@ -2941,7 +3052,11 @@ nvme_fc_create_io_queues(struct nvme_fc_ + out_delete_hw_queues: + nvme_fc_delete_hw_io_queues(ctrl); + out_cleanup_blk_queue: ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); ++#else + blk_cleanup_queue(ctrl->ctrl.connect_q); ++#endif + out_free_tag_set: + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_fc_free_io_queues(ctrl); +@@ -3414,11 +3529,18 @@ nvme_fc_connect_ctrl_work(struct work_st + ctrl->cnum); + } + +- ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_fc_admin_mq_ops = { ++#else ++static struct blk_mq_ops nvme_fc_admin_mq_ops = { ++#endif + .queue_rq = nvme_fc_queue_rq, + .complete = nvme_fc_complete_rq, ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + .init_request = nvme_fc_init_request, ++#else ++ .init_request = nvme_fc_init_admin_request, ++#endif + .exit_request = nvme_fc_exit_request, + .init_hctx = nvme_fc_init_admin_hctx, + .timeout = nvme_fc_timeout, +@@ -3548,7 +3670,9 @@ nvme_fc_init_ctrl(struct device *dev, st + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ++#ifdef HAVE_BLK_MQ_F_NO_SCHED + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ++#endif + + ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (ret) +@@ -3631,10 +3755,17 @@ fail_ctrl: + + return ERR_PTR(-EIO); + ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++out_cleanup_admin_q: ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++out_cleanup_fabrics_q: ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); ++#else + out_cleanup_admin_q: + blk_cleanup_queue(ctrl->ctrl.admin_q); + out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++#endif + out_free_admin_tag_set: + blk_mq_free_tag_set(&ctrl->admin_tag_set); + out_free_queues: +@@ -4050,3 +4181,8 @@ module_init(nvme_fc_init_module); + module_exit(nvme_fc_exit_module); + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif ++ ++#endif /* HAVE_LINUX_NVME_FC_DRIVER_H */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..9cb25aa --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,46 @@ +From: Chris Mi +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c + +Change-Id: I5b9613b9cacf967d5848afa618f1625181436b4e +--- + 
drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c +@@ -11,7 +11,11 @@ tc_act_can_offload_ct(struct mlx5e_tc_ac + int act_index, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_CT + bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR; ++#else ++ bool clear_action = false; ++#endif + struct netlink_ext_ack *extack = parse_state->extack; + + if (flow_flag_test(parse_state->flow, SAMPLE)) { +@@ -34,7 +38,11 @@ tc_act_parse_ct(struct mlx5e_tc_act_pars + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_CT + bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR; ++#else ++ bool clear_action = false; ++#endif + int err; + + /* It's redundant to do ct clear more than once. */ +@@ -93,8 +101,10 @@ tc_act_is_multi_table_act_ct(struct mlx5 + const struct flow_action_entry *act, + struct mlx5_flow_attr *attr) + { ++#ifdef HAVE_FLOW_ACTION_CT + if (act->ct.action & TCA_CT_ACT_CLEAR) + return false; ++#endif + + return true; + } diff --git a/src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-nvme-host-multipath.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-nvme-host-multipath.c.patch new file mode 100644 index 0000000..1110281 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0310-BACKPORT-drivers-nvme-host-multipath.c.patch @@ -0,0 +1,422 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/host/multipath.c + +Change-Id: Ie81b70d5e2a7ef05e97ca7f1bb43378ab0b20bca +--- + drivers/nvme/host/multipath.c | 185 ++++++++++++++++++++++++++++++++++ + 1 file changed, 185 insertions(+) + +--- a/drivers/nvme/host/multipath.c ++++ b/drivers/nvme/host/multipath.c +@@ -3,6 +3,7 @@ + * Copyright (c) 2017-2018 Christoph Hellwig. 
+ */ + ++#ifdef HAVE_BLK_TYPES_REQ_DRV + #include + #include + #include +@@ -122,11 +123,17 @@ void nvme_failover_req(struct request *r + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + for (bio = req->bio; bio; bio = bio->bi_next) { ++#ifdef HAVE_BIO_BI_DISK ++ bio->bi_disk = ns->head->disk; ++#else + bio_set_dev(bio, ns->head->disk->part0); ++#endif ++#ifdef HAVE_BIO_BI_COOKIE + if (bio->bi_opf & REQ_POLLED) { + bio->bi_opf &= ~REQ_POLLED; + bio->bi_cookie = BLK_QC_T_NONE; + } ++#endif + } + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); +@@ -141,11 +148,16 @@ void nvme_kick_requeue_lists(struct nvme + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { ++#ifdef HAVE_DISK_UEVENT + if (!ns->head->disk) + continue; + kblockd_schedule_work(&ns->head->requeue_work); + if (ctrl->state == NVME_CTRL_LIVE) + disk_uevent(ns->head->disk, KOBJ_CHANGE); ++#else ++ if (ns->head->disk) ++ kblockd_schedule_work(&ns->head->requeue_work); ++#endif + } + up_read(&ctrl->namespaces_rwsem); + } +@@ -355,11 +367,27 @@ static bool nvme_available_path(struct n + return false; + } + ++#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_SUBMIT_BIO ++#ifdef HAVE_BIO_BI_COOKIE + static void nvme_ns_head_submit_bio(struct bio *bio) ++#else ++static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) ++#endif ++#else ++static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, ++ struct bio *bio) ++#endif + { ++#ifdef HAVE_BIO_BI_DISK ++ struct nvme_ns_head *head = bio->bi_disk->private_data; ++#else + struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; ++#endif + struct device *dev = disk_to_dev(head->disk); + struct nvme_ns *ns; ++#ifndef HAVE_BIO_BI_COOKIE ++ blk_qc_t ret = BLK_QC_T_NONE; ++#endif + int srcu_idx; + + /* +@@ -367,16 +395,42 @@ static void nvme_ns_head_submit_bio(stru + * different queue via blk_steal_bios(), so we need to use the bio_split + * pool from the original queue to allocate the bvecs from. 
+ */ ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ bio = bio_split_to_limits(bio); ++#else ++#ifdef HAVE_BLK_QUEUE_SPLIT_1_PARAM + blk_queue_split(&bio); ++#else ++ blk_queue_split(q, &bio); ++#endif ++#endif + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (likely(ns)) { ++#ifdef HAVE_BIO_BI_DISK ++ bio->bi_disk = ns->disk; ++#else + bio_set_dev(bio, ns->disk->part0); ++#endif + bio->bi_opf |= REQ_NVME_MPATH; ++#ifdef HAVE_TRACE_BLOCK_BIO_REMAP_4_PARAM ++ trace_block_bio_remap(bio->bi_disk->queue, bio, ++ disk_devt(ns->head->disk), ++ bio->bi_iter.bi_sector); ++#else + trace_block_bio_remap(bio, disk_devt(ns->head->disk), + bio->bi_iter.bi_sector); ++#endif ++#ifdef HAVE_SUBMIT_BIO_NOACCT ++#ifdef HAVE_BIO_BI_COOKIE + submit_bio_noacct(bio); ++#else ++ ret = submit_bio_noacct(bio); ++#endif ++#else ++ ret = direct_make_request(bio); ++#endif + } else if (nvme_available_path(head)) { + dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); + +@@ -391,6 +445,9 @@ static void nvme_ns_head_submit_bio(stru + } + + srcu_read_unlock(&head->srcu, srcu_idx); ++#ifndef HAVE_BIO_BI_COOKIE ++ return ret; ++#endif + } + + static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +@@ -426,12 +483,16 @@ static int nvme_ns_head_report_zones(str + + const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, ++#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_SUBMIT_BIO + .submit_bio = nvme_ns_head_submit_bio, ++#endif + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ns_head_ioctl, + .getgeo = nvme_getgeo, ++#ifdef HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES + .report_zones = nvme_ns_head_report_zones, ++#endif + .pr_ops = &nvme_pr_ops, + }; + +@@ -461,6 +522,7 @@ static const struct file_operations nvme + .compat_ioctl = compat_ptr_ioctl, + }; + ++#ifdef HAVE_DEVICE_ADD_DISK_3_ARGS + static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) + { + int ret; +@@ -474,6 +536,7 @@ static int nvme_add_ns_head_cdev(struct + &nvme_ns_head_chr_fops, THIS_MODULE); + return ret; + } ++#endif + + static void nvme_requeue_work(struct work_struct *work) + { +@@ -489,12 +552,19 @@ static void nvme_requeue_work(struct wor + next = bio->bi_next; + bio->bi_next = NULL; + ++#ifdef HAVE_SUBMIT_BIO_NOACCT + submit_bio_noacct(bio); ++#else ++ generic_make_request(bio); ++#endif + } + } + + int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) + { ++#ifndef HAVE_BLK_ALLOC_DISK ++ struct request_queue *q; ++#endif + bool vwc = false; + + mutex_init(&head->lock); +@@ -510,14 +580,61 @@ int nvme_mpath_alloc_disk(struct nvme_ct + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) + return 0; + ++#ifdef HAVE_BLK_ALLOC_DISK + head->disk = blk_alloc_disk(ctrl->numa_node); ++#else ++#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_SUBMIT_BIO ++ q = blk_alloc_queue(ctrl->numa_node); ++#else ++#ifdef HAVE_BLK_QUEUE_MAKE_REQUEST ++#ifdef HAVE_BLK_ALLOC_QUEUE_NODE_3_ARGS ++ q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); ++#else ++#ifdef HAVE_BLK_ALLOC_QUEUE_RH ++ q = blk_alloc_queue_rh(nvme_ns_head_make_request, ctrl->numa_node); ++#else ++ q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node); ++#endif ++#endif ++#else ++ q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node); ++#endif ++#endif /* HAVE_BLOCK_DEVICE_OPERATIONS_SUBMIT_BIO */ ++ if (!q) ++ goto out; ++#if defined(HAVE_BLK_QUEUE_MAKE_REQUEST) && !defined(HAVE_BLK_ALLOC_QUEUE_RH) ++ blk_queue_make_request(q, nvme_ns_head_make_request); ++#endif ++ 
blk_queue_flag_set(QUEUE_FLAG_NONROT, q); ++ /* set to a default value for 512 until disk is validated */ ++ blk_queue_logical_block_size(q, 512); ++ blk_set_stacking_limits(&q->limits); ++ ++ /* we need to propagate up the VMC settings */ ++ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) ++ vwc = true; ++ blk_queue_write_cache(q, vwc, vwc); ++ ++ head->disk = alloc_disk(0); ++#endif /* HAVE_BLK_ALLOC_DISK */ + if (!head->disk) ++#ifdef HAVE_BLK_ALLOC_DISK + return -ENOMEM; ++#else ++ goto out_cleanup_queue; ++#endif + head->disk->fops = &nvme_ns_head_ops; + head->disk->private_data = head; ++#ifndef HAVE_BLK_ALLOC_DISK ++ head->disk->queue = q; ++#endif ++#ifdef HAVE_GENHD_FL_EXT_DEVT ++ head->disk->flags = GENHD_FL_EXT_DEVT; ++#endif + sprintf(head->disk->disk_name, "nvme%dn%d", + ctrl->subsys->instance, head->instance); + ++#ifdef HAVE_BLK_ALLOC_DISK + blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + /* +@@ -539,12 +656,22 @@ int nvme_mpath_alloc_disk(struct nvme_ct + vwc = true; + blk_queue_write_cache(head->disk->queue, vwc, vwc); + return 0; ++#else ++ return 0; ++ ++ out_cleanup_queue: ++ blk_cleanup_queue(q); ++ out: ++ return -ENOMEM; ++#endif + } + + static void nvme_mpath_set_live(struct nvme_ns *ns) + { + struct nvme_ns_head *head = ns->head; ++#ifdef HAVE_DEVICE_ADD_DISK_RETURN + int rc; ++#endif + + if (!head->disk) + return; +@@ -554,15 +681,30 @@ static void nvme_mpath_set_live(struct n + * paths simultaneously calling device_add_disk() on the same namespace + * head. + */ ++#ifdef HAVE_DEVICE_ADD_DISK_3_ARGS + if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { ++#ifdef HAVE_DEVICE_ADD_DISK_RETURN + rc = device_add_disk(&head->subsys->dev, head->disk, + nvme_ns_id_attr_groups); + if (rc) { + clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags); + return; + } ++#else ++ device_add_disk(&head->subsys->dev, head->disk, ++ nvme_ns_id_attr_groups); ++#endif + nvme_add_ns_head_cdev(head); + } ++#else ++ if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { ++ device_add_disk(&head->subsys->dev, head->disk); ++ if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, ++ &nvme_ns_id_attr_group)) ++ dev_warn(&head->subsys->dev, ++ "failed to create id group.\n"); ++ } ++#endif + + mutex_lock(&head->lock); + if (nvme_path_is_optimized(ns)) { +@@ -598,7 +740,11 @@ static int nvme_parse_ana_log(struct nvm + return -EINVAL; + + nr_nsids = le32_to_cpu(desc->nnsids); ++#ifdef flex_array_size + nsid_buf_size = flex_array_size(desc, nsids, nr_nsids); ++#else ++ nsid_buf_size = nr_nsids * sizeof(__le32); ++#endif + + if (WARN_ON_ONCE(desc->grpid == 0)) + return -EINVAL; +@@ -722,9 +868,15 @@ static void nvme_ana_work(struct work_st + nvme_read_ana_log(ctrl); + } + ++#ifdef HAVE_TIMER_SETUP + static void nvme_anatt_timeout(struct timer_list *t) + { + struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); ++#else ++static void nvme_anatt_timeout(unsigned long data) ++{ ++ struct nvme_ctrl *ctrl = (struct nvme_ctrl *)data; ++#endif + + dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); + nvme_reset_ctrl(ctrl); +@@ -824,13 +976,28 @@ void nvme_mpath_add_disk(struct nvme_ns + nvme_mpath_set_live(ns); + } + ++#ifdef HAVE_QUEUE_FLAG_STABLE_WRITES + if (blk_queue_stable_writes(ns->queue) && ns->head->disk) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, + ns->head->disk->queue); ++#else ++ if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { ++ struct gendisk *disk = ns->head->disk; ++ ++ if (disk) 
++ disk->queue->backing_dev_info->capabilities |= ++ BDI_CAP_STABLE_WRITES; ++ } ++#endif ++ + #ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(ns->queue) && ns->head->disk) ++#ifdef HAVE_GENDISK_CONV_ZONES_BITMAP ++ ns->head->disk->nr_zones = ns->disk->nr_zones; ++#else + ns->head->disk->queue->nr_zones = ns->queue->nr_zones; + #endif ++#endif + } + + void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +@@ -838,27 +1005,56 @@ void nvme_mpath_shutdown_disk(struct nvm + if (!head->disk) + return; + kblockd_schedule_work(&head->requeue_work); ++#ifdef HAVE_DEVICE_ADD_DISK_3_ARGS + if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + nvme_cdev_del(&head->cdev, &head->cdev_device); + del_gendisk(head->disk); + } ++#else ++ if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { ++ sysfs_remove_group(&disk_to_dev(head->disk)->kobj, ++ &nvme_ns_id_attr_group); ++ del_gendisk(head->disk); ++ } ++#endif + } + + void nvme_mpath_remove_disk(struct nvme_ns_head *head) + { + if (!head->disk) + return; ++#ifdef HAVE_BLK_MARK_DISK_DEAD + blk_mark_disk_dead(head->disk); ++#else ++ blk_set_queue_dying(head->disk->queue); ++#endif + /* make sure all pending bios are cleaned up */ + kblockd_schedule_work(&head->requeue_work); + flush_work(&head->requeue_work); ++#ifdef HAVE_BLK_ALLOC_DISK ++#ifdef HAVE_BLK_CLEANUP_DISK + blk_cleanup_disk(head->disk); ++#else ++ put_disk(head->disk); ++#endif ++#else ++ blk_cleanup_queue(head->disk->queue); ++ if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) ++ head->disk->queue = NULL; ++ put_disk(head->disk); ++#endif + } + + void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) + { + mutex_init(&ctrl->ana_lock); ++#ifdef HAVE_TIMER_SETUP + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); ++#else ++ init_timer(&ctrl->anatt_timer); ++ ctrl->anatt_timer.data = (unsigned long)ctrl; ++ ctrl->anatt_timer.function = nvme_anatt_timeout; ++#endif + INIT_WORK(&ctrl->ana_work, nvme_ana_work); + } + +@@ -919,3 +1115,4 @@ void nvme_mpath_uninit(struct nvme_ctrl + ctrl->ana_log_buf = NULL; + ctrl->ana_log_size = 0; + } ++#endif /* HAVE_BLK_TYPES_REQ_DRV */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0311-BACKPORT-drivers-nvme-host-rdma.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0311-BACKPORT-drivers-nvme-host-rdma.c.patch new file mode 100644 index 0000000..6f86a10 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0311-BACKPORT-drivers-nvme-host-rdma.c.patch @@ -0,0 +1,682 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/host/rdma.c + +Change-Id: I2d26aedcdc44d668f4dd931ab7d7511c4bad6e7d +--- + drivers/nvme/host/rdma.c | 254 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 254 insertions(+) + +--- a/drivers/nvme/host/rdma.c ++++ b/drivers/nvme/host/rdma.c +@@ -16,14 +16,22 @@ + #include + #include + #include ++#if defined(HAVE_BLK_MQ_MAP_QUEUES) && defined(HAVE_BLK_MQ_TAG_SET_HAS_MAP) + #include ++#endif ++#ifdef HAVE_BLK_INTEGRITY_H + #include ++#endif + #include + #include + #include + #include + #include + #include ++#ifdef HAVE_SCSI_MAX_SG_SEGMENTS ++#include ++#endif ++#include + + #include + #include +@@ -132,7 +140,9 @@ struct nvme_rdma_ctrl { + + struct nvme_ctrl ctrl; + bool use_inline_data; ++#if defined(HAVE_BLK_MQ_HCTX_TYPE) && defined(HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP) + u32 io_queues[HCTX_MAX_TYPES]; ++#endif + }; + + static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) +@@ -161,8 +171,22 @@ static int nvme_rdma_cm_handler(struct r + static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc 
*wc); + static void nvme_rdma_complete_rq(struct request *rq); + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_rdma_mq_ops; + static const struct blk_mq_ops nvme_rdma_admin_mq_ops; ++#else ++static struct blk_mq_ops nvme_rdma_mq_ops; ++static struct blk_mq_ops nvme_rdma_admin_mq_ops; ++#endif ++ ++#if !defined HAVE_PUT_UNALIGNED_LE24 && !defined HAVE_PUT_UNALIGNED_LE24_ASM_GENERIC ++static inline void put_unaligned_le24(u32 val, u8 *p) ++{ ++ *p++ = val; ++ *p++ = val >> 8; ++ *p++ = val >> 16; ++} ++#endif + + static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) + { +@@ -171,9 +195,13 @@ static inline int nvme_rdma_queue_idx(st + + static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue) + { ++#if defined(HAVE_BLK_MQ_HCTX_TYPE) && defined(HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP) + return nvme_rdma_queue_idx(queue) > + queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] + + queue->ctrl->io_queues[HCTX_TYPE_READ]; ++#else ++ return false; ++#endif + } + + static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) +@@ -290,21 +318,49 @@ static int nvme_rdma_create_qp(struct nv + return ret; + } + ++#ifdef HAVE_BLK_MQ_OPS_EXIT_REQUEST_HAS_3_PARAMS + static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) ++#else ++static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, ++ struct request *rq, unsigned int queue_idx) ++#endif + { + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + kfree(req->sqe.data); + } + ++#ifndef HAVE_BLK_MQ_OPS_EXIT_REQUEST_HAS_3_PARAMS ++static void nvme_rdma_exit_request(void *data, struct request *rq, ++ unsigned int hctx_idx, unsigned int rq_idx) ++{ ++ __nvme_rdma_exit_request(data, rq, hctx_idx + 1); ++} ++ ++static void nvme_rdma_exit_admin_request(void *data, struct request *rq, ++ unsigned int hctx_idx, unsigned int rq_idx) ++{ ++ __nvme_rdma_exit_request(data, rq, 0); ++} ++#endif ++ ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + static int nvme_rdma_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) ++#else ++static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, ++ struct request *rq, unsigned int queue_idx) ++#endif + { ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + struct nvme_rdma_ctrl *ctrl = set->driver_data; ++#endif + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; ++#endif + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + + nvme_req(rq)->ctrl = &ctrl->ctrl; +@@ -323,6 +379,21 @@ static int nvme_rdma_init_request(struct + + return 0; + } ++#ifndef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS ++static int nvme_rdma_init_request(void *data, struct request *rq, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ return __nvme_rdma_init_request(data, rq, hctx_idx + 1); ++} ++ ++static int nvme_rdma_init_admin_request(void *data, struct request *rq, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ return __nvme_rdma_init_request(data, rq, 0); ++} ++#endif + + static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +@@ -453,6 +524,9 @@ static void nvme_rdma_destroy_queue_ib(s + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + + nvme_rdma_dev_put(dev); ++#ifndef HAVE_REQUEST_QUEUE_TIMEOUT_WORK ++ queue->device = NULL; ++#endif + } + + static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) +@@ -470,14 +544,22 @@ static int nvme_rdma_get_max_fr_pages(st + static int nvme_rdma_create_cq(struct ib_device *ibdev, + struct nvme_rdma_queue *queue) + { ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HCTX + int ret, comp_vector, idx = nvme_rdma_queue_idx(queue); ++#else ++ int ret, comp_vector; ++#endif + enum ib_poll_context poll_ctx; + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HCTX + /* + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. + */ + comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; ++#else ++ comp_vector = queue->ctrl->ctrl.instance % ibdev->num_comp_vectors; ++#endif + + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) { +@@ -504,6 +586,9 @@ static int nvme_rdma_create_queue_ib(str + const int send_wr_factor = 3; /* MR, SEND, INV */ + const int cq_factor = send_wr_factor + 1; /* + RECV */ + int ret, pages_per_mr; ++#ifndef HAVE_BLK_QUEUE_VIRT_BOUNDARY ++ enum ib_mr_type mr_type; ++#endif + + queue->device = nvme_rdma_find_get_device(queue->cm_id); + if (!queue->device) { +@@ -531,15 +616,29 @@ static int nvme_rdma_create_queue_ib(str + goto out_destroy_qp; + } + ++#ifndef HAVE_BLK_QUEUE_VIRT_BOUNDARY ++ if (ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) ++ mr_type = IB_MR_TYPE_SG_GAPS; ++ else ++ mr_type = IB_MR_TYPE_MEM_REG; ++#endif + /* + * Currently we don't use SG_GAPS MR's so if the first entry is + * misaligned we'll end up using two entries for a single data page, + * so one additional entry is required. 
+ */ + pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1; ++#ifndef HAVE_BLK_QUEUE_VIRT_BOUNDARY ++ if (mr_type == IB_MR_TYPE_SG_GAPS) ++ pages_per_mr--; ++#endif + ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, + queue->queue_size, ++#ifdef HAVE_BLK_QUEUE_VIRT_BOUNDARY + IB_MR_TYPE_MEM_REG, ++#else ++ mr_type, ++#endif + pages_per_mr, 0); + if (ret) { + dev_err(queue->ctrl->ctrl.device, +@@ -750,6 +849,7 @@ static int nvme_rdma_alloc_io_queues(str + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", nr_io_queues); + ++#if defined(HAVE_BLK_MQ_HCTX_TYPE) && defined(HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP) + if (opts->nr_write_queues && nr_read_queues < nr_io_queues) { + /* + * separate read/write queues +@@ -777,6 +877,7 @@ static int nvme_rdma_alloc_io_queues(str + ctrl->io_queues[HCTX_TYPE_POLL] = + min(nr_poll_queues, nr_io_queues); + } ++#endif + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvme_rdma_alloc_queue(ctrl, i, +@@ -813,7 +914,9 @@ static struct blk_mq_tag_set *nvme_rdma_ + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = NVME_ADMIN_TIMEOUT; ++#ifdef HAVE_BLK_MQ_F_NO_SCHED + set->flags = BLK_MQ_F_NO_SCHED; ++#endif + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); +@@ -830,7 +933,9 @@ static struct blk_mq_tag_set *nvme_rdma_ + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; ++#if defined(HAVE_BLK_MQ_HCTX_TYPE) && defined(HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP) + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; ++#endif + } + + ret = blk_mq_alloc_tag_set(set); +@@ -844,8 +949,13 @@ static void nvme_rdma_destroy_admin_queu + bool remove) + { + if (remove) { ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); ++#else + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++#endif + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); + } + if (ctrl->async_event_sqe.data) { +@@ -870,10 +980,12 @@ static int nvme_rdma_configure_admin_que + ctrl->device = ctrl->queues[0].device; + ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev); + ++#ifdef HAVE_BLK_INTEGRITY_DEVICE_CAPABLE + /* T10-PI support */ + if (ctrl->device->dev->attrs.device_cap_flags & + IB_DEVICE_INTEGRITY_HANDOVER) + pi_capable = true; ++#endif + + ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev, + pi_capable); +@@ -912,6 +1024,10 @@ static int nvme_rdma_configure_admin_que + if (error) + goto out_cleanup_queue; + ++#ifndef HAVE_BLK_QUEUE_VIRT_BOUNDARY ++ if (ctrl->device->dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) ++ ctrl->ctrl.sg_gaps_support = true; ++#endif + error = nvme_enable_ctrl(&ctrl->ctrl); + if (error) + goto out_stop_queue; +@@ -942,12 +1058,21 @@ out_quiesce_queue: + out_stop_queue: + nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_cancel_admin_tagset(&ctrl->ctrl); ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++out_cleanup_queue: ++ if (new) ++ blk_mq_destroy_queue(ctrl->ctrl.admin_q); ++out_cleanup_fabrics_q: ++ if (new) ++ blk_mq_destroy_queue(ctrl->ctrl.fabrics_q); ++#else + out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->ctrl.admin_q); + out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.fabrics_q); ++#endif + out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); +@@ -966,7 +1091,11 @@ static void nvme_rdma_destroy_io_queues( + bool remove) + { + if (remove) { ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ 
blk_mq_destroy_queue(ctrl->ctrl.connect_q); ++#else + blk_cleanup_queue(ctrl->ctrl.connect_q); ++#endif + blk_mq_free_tag_set(ctrl->ctrl.tagset); + } + nvme_rdma_free_io_queues(ctrl); +@@ -1009,8 +1138,10 @@ static int nvme_rdma_configure_io_queues + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } ++#ifdef HAVE_BLK_MQ_UPDATE_NR_HW_QUEUES + blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, + ctrl->ctrl.queue_count - 1); ++#endif + nvme_unfreeze(&ctrl->ctrl); + } + +@@ -1023,7 +1154,11 @@ out_wait_freeze_timed_out: + out_cleanup_connect_q: + nvme_cancel_tagset(&ctrl->ctrl); + if (new) ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(ctrl->ctrl.connect_q); ++#else + blk_cleanup_queue(ctrl->ctrl.connect_q); ++#endif + out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->ctrl.tagset); +@@ -1343,8 +1478,12 @@ static void nvme_rdma_unmap_data(struct + if (blk_integrity_rq(rq)) { + ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, rq_dma_dir(rq)); ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); ++#else ++ sg_free_table_chained(&req->metadata_sgl->sg_table, true); ++#endif + } + + if (req->use_sig_mr) +@@ -1362,7 +1501,11 @@ static void nvme_rdma_unmap_data(struct + + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); ++#else ++ sg_free_table_chained(&req->data_sgl.sg_table, true); ++#endif + } + + static int nvme_rdma_set_sg_null(struct nvme_command *c) +@@ -1467,7 +1610,13 @@ static void nvme_rdma_set_sig_domain(str + { + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; ++#ifdef CONFIG_BLK_DEV_INTEGRITY ++#ifdef HAVE_BLK_INTEGRITY_SECTOR_SIZE ++ domain->sig.dif.pi_interval = 1 << bi->sector_size; ++#else + domain->sig.dif.pi_interval = 1 << bi->interval_exp; ++#endif ++#endif + domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); + if (control & NVME_RW_PRINFO_PRCHK_REF) + domain->sig.dif.ref_remap = true; +@@ -1529,7 +1678,9 @@ static int nvme_rdma_map_sg_pi(struct nv + struct ib_reg_wr *wr = &req->reg_wr; + struct request *rq = blk_mq_rq_from_pdu(req); + struct nvme_ns *ns = rq->q->queuedata; ++#if defined HAVE_BIO_BI_DISK || defined HAVE_BIO_BI_BDEV + struct bio *bio = rq->bio; ++#endif + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + +@@ -1543,8 +1694,16 @@ static int nvme_rdma_map_sg_pi(struct nv + if (unlikely(nr)) + goto mr_put; + ++#ifdef HAVE_BIO_BI_BDEV + nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c, + req->mr->sig_attrs, ns->pi_type); ++#elif defined(HAVE_BIO_BI_DISK) ++ nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c, ++ req->mr->sig_attrs, ns->pi_type); ++#else ++ nvme_rdma_set_sig_attrs(blk_get_integrity(rq->rq_disk), c, ++ req->mr->sig_attrs, ns->pi_type); ++#endif + nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask); + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); +@@ -1594,9 +1753,18 @@ static int nvme_rdma_map_data(struct nvm + return nvme_rdma_set_sg_null(c); + + req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1); ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + ret = sg_alloc_table_chained(&req->data_sgl.sg_table, + blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl, + NVME_INLINE_SG_CNT); ++#else ++ ret = 
sg_alloc_table_chained(&req->data_sgl.sg_table, ++ blk_rq_nr_phys_segments(rq), ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_4_PARAMS ++ GFP_ATOMIC, ++#endif ++ req->data_sgl.sg_table.sgl); ++#endif + if (ret) + return -ENOMEM; + +@@ -1622,10 +1790,19 @@ static int nvme_rdma_map_data(struct nvm + if (blk_integrity_rq(rq)) { + req->metadata_sgl->sg_table.sgl = + (struct scatterlist *)(req->metadata_sgl + 1); ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table, + blk_rq_count_integrity_sg(rq->q, rq->bio), + req->metadata_sgl->sg_table.sgl, + NVME_INLINE_METADATA_SG_CNT); ++#else ++ ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table, ++ blk_rq_count_integrity_sg(rq->q, rq->bio), ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_4_PARAMS ++ GFP_ATOMIC, ++#endif ++ req->metadata_sgl->sg_table.sgl); ++#endif + if (unlikely(ret)) { + ret = -ENOMEM; + goto out_unmap_sg; +@@ -1651,7 +1828,11 @@ static int nvme_rdma_map_data(struct nvm + if (count <= dev->num_inline_segments) { + if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + queue->ctrl->use_inline_data && ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES + blk_rq_payload_bytes(rq) <= ++#else ++ nvme_map_len(rq) <= ++#endif + nvme_rdma_inline_data_size(queue)) { + ret = nvme_rdma_map_sg_inline(queue, req, c, count); + goto out; +@@ -1676,13 +1857,21 @@ out_unmap_pi_sg: + req->metadata_sgl->nents, rq_dma_dir(rq)); + out_free_pi_table: + if (blk_integrity_rq(rq)) ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); ++#else ++ sg_free_table_chained(&req->metadata_sgl->sg_table, true); ++#endif + out_unmap_sg: + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); + out_free_table: ++#ifdef HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); ++#else ++ sg_free_table_chained(&req->data_sgl.sg_table, true); ++#endif + return ret; + } + +@@ -2068,6 +2257,7 @@ static int nvme_rdma_cm_handler(struct r + return 0; + } + ++#ifdef HAVE_BLK_EH_DONE + static void nvme_rdma_complete_timed_out(struct request *rq) + { + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); +@@ -2079,9 +2269,14 @@ static void nvme_rdma_complete_timed_out + blk_mq_complete_request(rq); + } + } ++#endif + + static enum blk_eh_timer_return ++#ifdef HAVE_BLK_MQ_OPS_TIMEOUT_1_PARAM ++nvme_rdma_timeout(struct request *rq) ++#else + nvme_rdma_timeout(struct request *rq, bool reserved) ++#endif + { + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; +@@ -2090,6 +2285,16 @@ nvme_rdma_timeout(struct request *rq, bo + dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", + rq->tag, nvme_rdma_queue_idx(queue)); + ++#ifndef HAVE_BLK_EH_DONE ++ /* ++ * Restart the timer if a controller reset is already scheduled. Any ++ * timed out commands would be handled before entering the connecting ++ * state. ++ */ ++ if (ctrl->ctrl.state == NVME_CTRL_RESETTING) ++ return BLK_EH_RESET_TIMER; ++#endif ++ + if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + /* + * If we are resetting, connecting or deleting we should +@@ -2104,8 +2309,22 @@ nvme_rdma_timeout(struct request *rq, bo + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. 
+ */ ++#ifdef HAVE_BLK_EH_DONE + nvme_rdma_complete_timed_out(rq); + return BLK_EH_DONE; ++#else ++ /* ++ * Completing the request directly from EH timer is not possible ++ * since the block layer marked the request before calling us ++ * (calling blk_mq_complete_request() from the driver is doing ++ * nothing). The only way to complete the request on timeout is ++ * by returning BLK_EH_HANDLED which complete the request later ++ * on at blk_mq_rq_timed_out(). ++ */ ++ nvme_req(rq)->status = NVME_SC_ABORT_REQ; ++ return BLK_EH_HANDLED; ++ ++#endif + } + + /* +@@ -2197,12 +2416,26 @@ unmap_qe: + return ret; + } + ++#ifdef HAVE_BLK_MQ_OPS_POLL ++#ifdef HAVE_BLK_MQ_OPS_POLL_1_ARG ++static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) ++#else ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) ++#else ++static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) ++#endif ++#endif + { + struct nvme_rdma_queue *queue = hctx->driver_data; + ++#if defined(HAVE_BLK_MQ_OPS_POLL_1_ARG) || defined(HAVE_BLK_MQ_OPS_POLL_2_ARG) + return ib_process_cq_direct(queue->ib_cq, -1); ++#else ++ return ib_process_cq_direct(queue->ib_cq, tag); ++#endif + } ++#endif + + static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req) + { +@@ -2239,20 +2472,33 @@ static void nvme_rdma_complete_rq(struct + { + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; ++#ifdef HAVE_REQUEST_QUEUE_TIMEOUT_WORK + struct ib_device *ibdev = queue->device->dev; ++#endif + + if (req->use_sig_mr) + nvme_rdma_check_pi_status(req); + ++#ifdef HAVE_REQUEST_QUEUE_TIMEOUT_WORK + nvme_rdma_unmap_data(queue, rq); + ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); ++#else ++ // WA for use after free device ++ if (likely(queue->device)) { ++ nvme_rdma_unmap_data(queue, rq); ++ ib_dma_unmap_single(queue->device->dev, req->sqe.dma, ++ sizeof(struct nvme_command), DMA_TO_DEVICE); ++ } ++#endif + nvme_complete_rq(rq); + } + ++#if defined(HAVE_BLK_MQ_MAP_QUEUES) && defined(HAVE_BLK_MQ_TAG_SET_HAS_MAP) + static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) + { + struct nvme_rdma_ctrl *ctrl = set->driver_data; ++#if defined(HAVE_BLK_MQ_HCTX_TYPE) && defined(HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP) + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { +@@ -2295,24 +2541,55 @@ static int nvme_rdma_map_queues(struct b + ctrl->io_queues[HCTX_TYPE_POLL]); + + return 0; ++#else ++ return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); ++#endif ++ + } ++#endif + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_rdma_mq_ops = { ++#else ++static struct blk_mq_ops nvme_rdma_mq_ops = { ++#endif + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, ++#ifdef HAVE_BLK_MQ_OPS_MAP_QUEUE ++ .map_queue = blk_mq_map_queue, ++#endif + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .init_hctx = nvme_rdma_init_hctx, + .timeout = nvme_rdma_timeout, ++#if defined(HAVE_BLK_MQ_MAP_QUEUES) && defined(HAVE_BLK_MQ_TAG_SET_HAS_MAP) + .map_queues = nvme_rdma_map_queues, ++#endif ++#ifdef HAVE_BLK_MQ_OPS_POLL + .poll = nvme_rdma_poll, ++#endif + }; + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { ++#else ++static struct blk_mq_ops nvme_rdma_admin_mq_ops = { ++#endif + .queue_rq = nvme_rdma_queue_rq, + 
.complete = nvme_rdma_complete_rq, ++#ifdef HAVE_BLK_MQ_OPS_MAP_QUEUE ++ .map_queue = blk_mq_map_queue, ++#endif ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + .init_request = nvme_rdma_init_request, ++#else ++ .init_request = nvme_rdma_init_admin_request, ++#endif ++#ifdef HAVE_BLK_MQ_OPS_EXIT_REQUEST_HAS_3_PARAMS + .exit_request = nvme_rdma_exit_request, ++#else ++ .exit_request = nvme_rdma_exit_admin_request, ++#endif + .init_hctx = nvme_rdma_init_admin_hctx, + .timeout = nvme_rdma_timeout, + }; +@@ -2579,3 +2856,6 @@ module_init(nvme_rdma_init_module); + module_exit(nvme_rdma_cleanup_module); + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif diff --git a/src/mlnx-ofa_kernel-5.8/backports/0312-BACKPORT-drivers-nvme-host-fabrics.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0312-BACKPORT-drivers-nvme-host-fabrics.c.patch new file mode 100644 index 0000000..4c6f5a0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0312-BACKPORT-drivers-nvme-host-fabrics.c.patch @@ -0,0 +1,106 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/host/fabrics.c + +Change-Id: Idf6aae63bdd1a58911a7be8c7694f755df3b806b +--- + drivers/nvme/host/fabrics.c | 34 ++++++++++++++++++++++++++++++++++ + 1 file changed, 34 insertions(+) + +--- a/drivers/nvme/host/fabrics.c ++++ b/drivers/nvme/host/fabrics.c +@@ -156,8 +156,13 @@ int nvmf_reg_read32(struct nvme_ctrl *ct + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.offset = cpu_to_le32(off); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, + NVME_QID_ANY, 0, 0); ++#else ++ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, ++ NVME_QID_ANY, 0, GFP_KERNEL, false, false); ++#endif + + if (ret >= 0) + *val = le64_to_cpu(res.u64); +@@ -202,8 +207,13 @@ int nvmf_reg_read64(struct nvme_ctrl *ct + cmd.prop_get.attrib = 1; + cmd.prop_get.offset = cpu_to_le32(off); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, + NVME_QID_ANY, 0, 0); ++#else ++ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, ++ NVME_QID_ANY, 0, GFP_KERNEL, false, false); ++#endif + + if (ret >= 0) + *val = le64_to_cpu(res.u64); +@@ -247,8 +257,13 @@ int nvmf_reg_write32(struct nvme_ctrl *c + cmd.prop_set.offset = cpu_to_le32(off); + cmd.prop_set.value = cpu_to_le64(val); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, + NVME_QID_ANY, 0, 0); ++#else ++ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, ++ NVME_QID_ANY, 0, GFP_KERNEL, false, false); ++#endif + if (unlikely(ret)) + dev_err(ctrl->device, + "Property Set error: %d, offset %#x\n", +@@ -392,9 +407,15 @@ int nvmf_connect_admin_queue(struct nvme + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, + data, sizeof(*data), 0, NVME_QID_ANY, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); ++#else ++ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, ++ data, sizeof(*data), 0, NVME_QID_ANY, 1, ++ GFP_ATOMIC, true, false); ++#endif + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), + &cmd, data); +@@ -453,9 +474,15 @@ int nvmf_connect_io_queue(struct nvme_ct + strncpy(data->subsysnqn, 
ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, + data, sizeof(*data), 0, qid, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); ++#else ++ ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, ++ data, sizeof(*data), 0, qid, 1, ++ GFP_ATOMIC, true, false); ++#endif + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), + &cmd, data); +@@ -567,7 +594,11 @@ static int nvmf_parse_options(struct nvm + + /* Set defaults */ + opts->queue_size = NVMF_DEF_QUEUE_SIZE; ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HCTX + opts->nr_io_queues = num_online_cpus(); ++#else ++ opts->nr_io_queues = 1; ++#endif + opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + opts->kato = 0; + opts->duplicate_connect = false; +@@ -1200,6 +1231,9 @@ static void __exit nvmf_exit(void) + } + + MODULE_LICENSE("GPL v2"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + + module_init(nvmf_init); + module_exit(nvmf_exit); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0313-BACKPORT-drivers-nvme-host-pci.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0313-BACKPORT-drivers-nvme-host-pci.c.patch new file mode 100644 index 0000000..8706494 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0313-BACKPORT-drivers-nvme-host-pci.c.patch @@ -0,0 +1,1791 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/host/pci.c + +Change-Id: I126cff9b8793bd2de2ee0cce424f41fcebe7fb78 +--- + drivers/nvme/host/pci.c | 720 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 715 insertions(+), 5 deletions(-) + +--- a/drivers/nvme/host/pci.c ++++ b/drivers/nvme/host/pci.c +@@ -10,24 +10,41 @@ + #include + #include + #include ++#ifdef HAVE_BLK_INTEGRITY_H + #include ++#endif + #include + #include + #include + #include ++#ifdef HAVE_NET_MEMREMAP_H + #include ++#endif + #include + #include + #include ++#ifdef HAVE_ONCE_H + #include ++#endif + #include + #include + #include + #include + #include ++#ifdef HAVE_IO_64_NONATOMIC_LO_HI_H + #include ++#else ++#include ++#endif ++#ifdef HAVE_IO_64_NONATOMIC_HI_LO_H + #include ++#else ++#include ++#endif ++#ifdef HAVE_LINUX_SED_OPAL_H + #include ++#endif ++#include + #include + + #include "trace.h" +@@ -98,6 +115,7 @@ static const struct kernel_param_ops io_ + .get = param_get_uint, + }; + ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + static unsigned int write_queues; + module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644); + MODULE_PARM_DESC(write_queues, +@@ -107,6 +125,14 @@ MODULE_PARM_DESC(write_queues, + static unsigned int poll_queues; + module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); + MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); ++#else ++static int write_queues = 0; ++MODULE_PARM_DESC(write_queues, ++ "Number of queues to use for writes [deprecated]"); ++ ++static int poll_queues = 0; ++MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO [deprecated]"); ++#endif + + static bool noacpi; + module_param(noacpi, bool, 0444); +@@ -146,11 +172,18 @@ struct nvme_dev { + struct dma_pool *prp_small_pool; + unsigned online_queues; + unsigned max_qid; ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + unsigned io_queues[HCTX_MAX_TYPES]; ++#endif ++#if defined(HAVE_PCI_IRQ_API) && defined(HAVE_IRQ_CALC_AFFINITY_VECTORS_3_ARGS) + unsigned int num_vecs; ++#endif + u32 q_depth; + int io_sqes; + u32 db_stride; ++#ifndef HAVE_PCI_IRQ_API ++ struct msix_entry 
*entry; ++#endif + void __iomem *bar; + unsigned long bar_mapped_size; + struct work_struct remove_work; +@@ -188,8 +221,19 @@ struct nvme_dev { + + static int io_queue_depth_set(const char *val, const struct kernel_param *kp) + { ++#ifdef HAVE_PARAM_SET_UINT_MINMAX + return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE, + NVME_PCI_MAX_QUEUE_SIZE); ++#else ++ int ret; ++ u32 n; ++ ++ ret = kstrtou32(val, 10, &n); ++ if (ret != 0 || n < 2) ++ return -EINVAL; ++ ++ return param_set_uint(val, kp); ++#endif + } + + static int num_p2p_queues_set(const char *val, const struct kernel_param *kp) +@@ -225,6 +269,9 @@ static inline struct nvme_dev *to_nvme_d + */ + struct nvme_queue { + struct nvme_dev *dev; ++#ifndef HAVE_PCI_FREE_IRQ ++ char irqname[24]; /* nvme4294967295-65535\0 */ ++#endif + spinlock_t sq_lock; + void *sq_cmds; + /* only used for poll queues: */ +@@ -270,11 +317,21 @@ struct nvme_iod { + bool use_sgl; + int aborted; + int npages; /* In the PRP list. 0 means small pool in use */ ++#ifndef HAVE_BIO_SPLIT_TO_LIMITS + int nents; /* Used in scatterlist */ ++#endif + dma_addr_t first_dma; ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && defined(HAVE_BLKDEV_REQ_BVEC) + unsigned int dma_len; /* length of single DMA segment mapping */ + dma_addr_t meta_dma; ++#else ++ struct scatterlist meta_sg; ++#endif ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ struct sg_table sgt; ++#else + struct scatterlist *sg; ++#endif + }; + + static int nvme_peer_init_resource(struct nvme_queue *nvmeq, +@@ -288,11 +345,19 @@ static int nvme_peer_init_resource(struc + + if (mask & NVME_PEER_SQT_DBR) + /* Calculation from NVMe 1.2.1 SPEC */ ++#ifndef CONFIG_PPC + nvmeq->resource.sqt_dbr_addr = pci_bus_address(pdev, 0) + (0x1000 + ((2 * (qid)) * (4 << NVME_CAP_STRIDE(dev->ctrl.cap)))); ++#else ++ nvmeq->resource.sqt_dbr_addr = 0x800000000000000 | (pci_resource_start(pdev, 0) + (0x1000 + ((2 * (qid)) * (4 << NVME_CAP_STRIDE(dev->ctrl.cap))))); ++#endif + + if (mask & NVME_PEER_CQH_DBR) + /* Calculation from NVMe 1.2.1 SPEC */ ++#ifndef CONFIG_PPC + nvmeq->resource.cqh_dbr_addr = pci_bus_address(pdev, 0) + (0x1000 + ((2 * (qid) + 1) * (4 << NVME_CAP_STRIDE(dev->ctrl.cap)))); ++#else ++ nvmeq->resource.cqh_dbr_addr = 0x800000000000000 | (pci_resource_start(pdev, 0) + (0x1000 + ((2 * (qid) + 1) * (4 << NVME_CAP_STRIDE(dev->ctrl.cap))))); ++#endif + + if (mask & NVME_PEER_SQ_PAS) + nvmeq->resource.sq_dma_addr = nvmeq->sq_dma_addr; +@@ -566,6 +631,17 @@ static size_t nvme_pci_iod_alloc_size(vo + sizeof(struct scatterlist) * NVME_MAX_SEGS; + } + ++#ifndef HAVE_PCI_FREE_IRQ ++static int nvmeq_irq(struct nvme_queue *nvmeq) ++{ ++#ifdef HAVE_PCI_IRQ_API ++ return pci_irq_vector(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector); ++#else ++ return nvmeq->dev->entry[nvmeq->cq_vector].vector; ++#endif ++} ++#endif ++ + static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) + { +@@ -590,6 +666,7 @@ static int nvme_init_hctx(struct blk_mq_ + return 0; + } + ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) + { +@@ -605,16 +682,54 @@ static int nvme_init_request(struct blk_ + nvme_req(req)->cmd = &iod->cmd; + return 0; + } ++#else ++static int nvme_init_request(void *data, struct request *req, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ struct nvme_dev *dev = data; ++ struct nvme_iod *iod = blk_mq_rq_to_pdu(req); ++ struct 
nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; ++ ++ BUG_ON(!nvmeq); ++ iod->nvmeq = nvmeq; ++ ++ nvme_req(req)->ctrl = &dev->ctrl; ++ nvme_req(req)->cmd = &iod->cmd; ++ return 0; ++} ++ ++static int nvme_admin_init_request(void *data, struct request *req, ++ unsigned int hctx_idx, unsigned int rq_idx, ++ unsigned int numa_node) ++{ ++ struct nvme_dev *dev = data; ++ struct nvme_iod *iod = blk_mq_rq_to_pdu(req); ++ struct nvme_queue *nvmeq = &dev->queues[0]; ++ ++ BUG_ON(!nvmeq); ++ iod->nvmeq = nvmeq; + ++ nvme_req(req)->ctrl = &dev->ctrl; ++ nvme_req(req)->cmd = &iod->cmd; ++ return 0; ++} ++#endif ++#if defined(HAVE_BLK_MQ_OPS_MAP_QUEUES) && \ ++ (defined(HAVE_PCI_IRQ_GET_AFFINITY) || \ ++ defined(HAVE_BLK_MQ_PCI_MAP_QUEUES_3_ARGS)) + static int queue_irq_offset(struct nvme_dev *dev) + { ++#if defined(HAVE_PCI_IRQ_API) && defined(HAVE_IRQ_CALC_AFFINITY_VECTORS_3_ARGS) + /* if we have more than 1 vec, admin queue offsets us by 1 */ + if (dev->num_vecs > 1) + return 1; ++#endif + + return 0; + } + ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + static int nvme_pci_map_queues(struct blk_mq_tag_set *set) + { + struct nvme_dev *dev = set->driver_data; +@@ -645,6 +760,24 @@ static int nvme_pci_map_queues(struct bl + + return 0; + } ++#else ++static int nvme_pci_map_queues(struct blk_mq_tag_set *set) ++{ ++ struct nvme_dev *dev = set->driver_data; ++ int offset = queue_irq_offset(dev); ++ ++#ifdef HAVE_BLK_MQ_PCI_MAP_QUEUES_3_ARGS ++#ifdef HAVE_BLK_MQ_QUEUE_MAP ++ return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev), offset); ++#else ++ return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), offset); ++#endif ++#else ++ return __blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), offset); ++#endif /* HAVE_BLK_MQ_PCI_MAP_QUEUES_3_ARGS */ ++} ++#endif /* HAVE_BLK_MQ_TAG_SET_NR_MAPS */ ++#endif + + /* + * Write sq tail if we are asked to, or if the next command would wrap. 
+@@ -666,6 +799,9 @@ static inline void nvme_write_sq_db(stru + nvmeq->last_sq_tail = nvmeq->sq_tail; + } + ++#ifndef absolute_pointer ++#define absolute_pointer(val) RELOC_HIDE((void *)(val), 0) ++#endif + static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd) + { +@@ -675,6 +811,7 @@ static inline void nvme_sq_copy_cmd(stru + nvmeq->sq_tail = 0; + } + ++#ifdef HAVE_BLK_MQ_OPS_COMMIT_RQS + static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) + { + struct nvme_queue *nvmeq = hctx->driver_data; +@@ -684,11 +821,16 @@ static void nvme_commit_rqs(struct blk_m + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); + } ++#endif + + static void **nvme_pci_iod_list(struct request *req) + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req)); ++#else + return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); ++#endif + } + + static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +@@ -697,7 +839,11 @@ static inline bool nvme_pci_use_sgls(str + int nseg = blk_rq_nr_phys_segments(req); + unsigned int avg_seg_size; + ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); ++#else ++ avg_seg_size = DIV_ROUND_UP(nvme_map_len(req), nseg); ++#endif + + if (!nvme_ctrl_sgl_supported(&dev->ctrl)) + return false; +@@ -744,6 +890,7 @@ static void nvme_free_sgls(struct nvme_d + #include "nvfs-dma.h" + #endif + ++#ifndef HAVE_BIO_SPLIT_TO_LIMITS + static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req) + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); +@@ -752,26 +899,41 @@ static void nvme_unmap_sg(struct nvme_de + if (nvme_nvfs_unmap_data(dev, req)) + return; + #endif ++ + if (is_pci_p2pdma_page(sg_page(iod->sg))) + pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, + rq_dma_dir(req)); + else + dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); + } ++#endif + + static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && defined(HAVE_BLKDEV_REQ_BVEC) + if (iod->dma_len) { + dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, + rq_dma_dir(req)); + return; + } ++#endif ++ ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ WARN_ON_ONCE(!iod->sgt.nents); + ++#ifdef CONFIG_NVFS ++ if (nvme_nvfs_unmap_data(dev, req)) ++ return; ++#endif ++ ++ dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); ++#else + WARN_ON_ONCE(!iod->nents); + + nvme_unmap_sg(dev, req); ++#endif + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + iod->first_dma); +@@ -779,9 +941,14 @@ static void nvme_unmap_data(struct nvme_ + nvme_free_sgls(dev, req); + else + nvme_free_prps(dev, req); ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ mempool_free(iod->sgt.sgl, dev->iod_mempool); ++#else + mempool_free(iod->sg, dev->iod_mempool); ++#endif + } + ++#ifdef HAVE_ONCE_H + static void nvme_print_sgl(struct scatterlist *sgl, int nents) + { + int i; +@@ -795,14 +962,23 @@ static void nvme_print_sgl(struct scatte + sg_dma_len(sg)); + } + } ++#endif + + static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd) + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct dma_pool *pool; ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES + int length = blk_rq_payload_bytes(req); ++#else ++ int length = nvme_map_len(req); ++#endif ++#ifdef 
HAVE_BIO_SPLIT_TO_LIMITS ++ struct scatterlist *sg = iod->sgt.sgl; ++#else + struct scatterlist *sg = iod->sg; ++#endif + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); +@@ -875,16 +1051,50 @@ static blk_status_t nvme_pci_setup_prps( + dma_len = sg_dma_len(sg); + } + done: ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl)); ++#else + cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); ++#endif + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); + return BLK_STS_OK; + free_prps: + nvme_free_prps(dev, req); + return BLK_STS_RESOURCE; + bad_sgl: ++#ifdef HAVE_ONCE_H ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), ++#else + WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), ++#endif + "Invalid SGL for payload:%d nents:%d\n", ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ blk_rq_payload_bytes(req), iod->sgt.nents); ++#else + blk_rq_payload_bytes(req), iod->nents); ++#endif ++#else ++ nvme_map_len(req), iod->nents); ++#endif ++#else ++ if (WARN_ONCE(1, "Invalid SGL for payload:%d nents:%d\n", ++#ifdef HAVE_BLK_RQ_NR_PAYLOAD_BYTES ++ blk_rq_payload_bytes(req), iod->nents)) { ++#else ++ nvme_map_len(req), iod->nents)) { ++#endif ++ for_each_sg(iod->sg, sg, iod->nents, i) { ++ dma_addr_t phys = sg_phys(sg); ++ pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " ++ "dma_address:%pad dma_length:%d\n", i, &phys, ++ sg->offset, sg->length, ++ &sg_dma_address(sg), ++ sg_dma_len(sg)); ++ } ++ } ++#endif + return BLK_STS_IOERR; + } + +@@ -910,12 +1120,21 @@ static void nvme_pci_sgl_set_seg(struct + } + + static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ struct request *req, struct nvme_rw_command *cmd) ++#else + struct request *req, struct nvme_rw_command *cmd, int entries) ++#endif + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct dma_pool *pool; + struct nvme_sgl_desc *sg_list; ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ struct scatterlist *sg = iod->sgt.sgl; ++ unsigned int entries = iod->sgt.nents; ++#else + struct scatterlist *sg = iod->sg; ++#endif + dma_addr_t sgl_dma; + int i = 0; + +@@ -971,6 +1190,7 @@ free_sgls: + return BLK_STS_RESOURCE; + } + ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && defined(HAVE_BLKDEV_REQ_BVEC) + static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd, + struct bio_vec *bv) +@@ -1007,13 +1227,18 @@ static blk_status_t nvme_setup_sgl_simpl + cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; + return BLK_STS_OK; + } ++#endif + + static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd) + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + blk_status_t ret = BLK_STS_RESOURCE; ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ int rc; ++#else + int nr_mapped; ++#endif + + #ifdef CONFIG_NVFS + bool is_nvfs_io = false; +@@ -1022,6 +1247,7 @@ static blk_status_t nvme_map_data(struct + return ret; + #endif + ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && defined(HAVE_BLKDEV_REQ_BVEC) + if (blk_rq_nr_phys_segments(req) == 1) { + struct bio_vec bv = req_bvec(req); + +@@ -1038,6 +1264,24 @@ static blk_status_t nvme_map_data(struct + } + + iod->dma_len = 0; ++#endif ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); ++ if (!iod->sgt.sgl) ++ return BLK_STS_RESOURCE; ++ sg_init_table(iod->sgt.sgl, 
blk_rq_nr_phys_segments(req)); ++ iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); ++ if (!iod->sgt.orig_nents) ++ goto out_free_sg; ++ ++ rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), ++ DMA_ATTR_NO_WARN); ++ if (rc) { ++ if (rc == -EREMOTEIO) ++ ret = BLK_STS_TARGET; ++ goto out_free_sg; ++ } ++#else + iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); + if (!iod->sg) + return BLK_STS_RESOURCE; +@@ -1047,27 +1291,50 @@ static blk_status_t nvme_map_data(struct + goto out_free_sg; + + if (is_pci_p2pdma_page(sg_page(iod->sg))) ++#ifdef HAVE_PCI_P2PDMA_MAP_SG_ATTRS + nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, + iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); ++#else ++ nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents, ++ rq_dma_dir(req)); ++#endif + else ++#if defined(HAVE_DMA_ATTR_NO_WARN) && \ ++ defined(HAVE_DMA_SET_ATTR_TAKES_UNSIGNED_LONG_ATTRS) + nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, + rq_dma_dir(req), DMA_ATTR_NO_WARN); ++#else ++ nr_mapped = dma_map_sg(dev->dev, iod->sg, iod->nents, ++ rq_dma_dir(req)); ++#endif + if (!nr_mapped) + goto out_free_sg; ++#endif + + iod->use_sgl = nvme_pci_use_sgls(dev, req); + if (iod->use_sgl) ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); ++#else + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); ++#endif + else + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + if (ret != BLK_STS_OK) + goto out_unmap_sg; + return BLK_STS_OK; + ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++out_unmap_sg: ++ dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); ++out_free_sg: ++ mempool_free(iod->sgt.sgl, dev->iod_mempool); ++#else + out_unmap_sg: + nvme_unmap_sg(dev, req); + out_free_sg: + mempool_free(iod->sg, dev->iod_mempool); ++#endif + return ret; + } + +@@ -1076,11 +1343,24 @@ static blk_status_t nvme_map_metadata(st + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && defined(HAVE_BLKDEV_REQ_BVEC) + iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req), + rq_dma_dir(req), 0); + if (dma_mapping_error(dev->dev, iod->meta_dma)) + return BLK_STS_IOERR; + cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); ++#else ++ if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) ++ return BLK_STS_IOERR; ++ ++ sg_init_table(&iod->meta_sg, 1); ++ if (blk_rq_map_integrity_sg(req->q, req->bio, &iod->meta_sg) != 1) ++ return BLK_STS_IOERR; ++ ++ if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, rq_dma_dir(req))) ++ return BLK_STS_IOERR; ++ cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); ++#endif + return BLK_STS_OK; + } + +@@ -1091,7 +1371,11 @@ static blk_status_t nvme_prep_rq(struct + + iod->aborted = 0; + iod->npages = -1; ++#ifdef HAVE_BIO_SPLIT_TO_LIMITS ++ iod->sgt.nents = 0; ++#else + iod->nents = 0; ++#endif + + ret = nvme_setup_cmd(req->q->queuedata, req); + if (ret) +@@ -1150,6 +1434,7 @@ static blk_status_t nvme_queue_rq(struct + return BLK_STS_OK; + } + ++#ifdef HAVE_BLK_MQ_OPS_QUEUE_RQS + static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) + { + spin_lock(&nvmeq->sq_lock); +@@ -1207,6 +1492,7 @@ static void nvme_queue_rqs(struct reques + + *rqlist = requeue_list; + } ++#endif /* HAVE_BLK_MQ_OPS_QUEUE_RQS */ + + static __always_inline void nvme_pci_unmap_rq(struct request *req) + { +@@ -1214,8 +1500,12 @@ static __always_inline void nvme_pci_unm + struct nvme_dev *dev = iod->nvmeq->dev; + + if (blk_integrity_rq(req)) ++#if defined(HAVE_BLKDEV_DMA_MAP_BVEC) && 
defined(HAVE_BLKDEV_REQ_BVEC) + dma_unmap_page(dev->dev, iod->meta_dma, + rq_integrity_vec(req)->bv_len, rq_data_dir(req)); ++#else ++ dma_unmap_sg(dev->dev, &iod->meta_sg, 1, rq_data_dir(req)); ++#endif + if (blk_rq_nr_phys_segments(req)) + nvme_unmap_data(dev, req); + } +@@ -1226,10 +1516,12 @@ static void nvme_pci_complete_rq(struct + nvme_complete_rq(req); + } + ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + static void nvme_pci_complete_batch(struct io_comp_batch *iob) + { + nvme_complete_batch(iob, nvme_pci_unmap_rq); + } ++#endif + + /* We read the CQE phase first to check if the rest of the entry is valid */ + static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) +@@ -1255,8 +1547,12 @@ static inline struct blk_mq_tags *nvme_q + return nvmeq->dev->tagset.tags[nvmeq->qid - 1]; + } + ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, + struct io_comp_batch *iob, u16 idx) ++#else ++static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) ++#endif + { + struct nvme_completion *cqe = &nvmeq->cqes[idx]; + __u16 command_id = READ_ONCE(cqe->command_id); +@@ -1283,9 +1579,13 @@ static inline void nvme_handle_cqe(struc + } + + trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + if (!nvme_try_complete_req(req, cqe->status, cqe->result) && + !blk_mq_add_to_batch(req, iob, nvme_req(req)->status, + nvme_pci_complete_batch)) ++#else ++ if (!nvme_try_complete_req(req, cqe->status, cqe->result)) ++#endif + nvme_pci_complete_rq(req); + } + +@@ -1301,19 +1601,36 @@ static inline void nvme_update_cq_head(s + } + } + ++#ifdef HAVE_MQ_RQ_STATE ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + static inline int nvme_poll_cq(struct nvme_queue *nvmeq, + struct io_comp_batch *iob) ++#else ++static inline int nvme_poll_cq(struct nvme_queue *nvmeq) ++#endif ++#else ++static inline int nvme_poll_cq(struct nvme_queue *nvmeq, unsigned int tag) ++#endif + { + int found = 0; + + while (nvme_cqe_pending(nvmeq)) { ++#ifdef HAVE_MQ_RQ_STATE + found++; ++#else ++ if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag) ++ found++; ++#endif + /* + * load-load control dependency between phase and the rest of + * the cqe requires a full read memory barrier + */ + dma_rmb(); ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head); ++#else ++ nvme_handle_cqe(nvmeq, nvmeq->cq_head); ++#endif + nvme_update_cq_head(nvmeq); + } + +@@ -1325,11 +1642,23 @@ static inline int nvme_poll_cq(struct nv + static irqreturn_t nvme_irq(int irq, void *data) + { + struct nvme_queue *nvmeq = data; ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + DEFINE_IO_COMP_BATCH(iob); ++#endif + ++#ifdef HAVE_MQ_RQ_STATE ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + if (nvme_poll_cq(nvmeq, &iob)) { ++#else ++ if (nvme_poll_cq(nvmeq)) { ++#endif ++#else ++ if (nvme_poll_cq(nvmeq, -1)) { ++#endif ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + if (!rq_list_empty(iob.req_list)) + nvme_pci_complete_batch(&iob); ++#endif + return IRQ_HANDLED; + } + return IRQ_NONE; +@@ -1348,21 +1677,87 @@ static irqreturn_t nvme_irq_check(int ir + * Poll for completions for any interrupt driven queue + * Can be called from any context. 
+ */ ++#ifdef HAVE_MQ_RQ_STATE + static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) ++#else ++static void nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) ++#endif + { ++#ifdef HAVE_PCI_IRQ_API + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); ++#endif + + if (nvmeq->p2p) + return; + + WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); + ++#ifdef HAVE_PCI_IRQ_API + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); ++#else ++ disable_irq(nvmeq->dev->entry[nvmeq->cq_vector].vector); ++#endif ++#ifdef HAVE_MQ_RQ_STATE ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + nvme_poll_cq(nvmeq, NULL); ++#else ++ nvme_poll_cq(nvmeq); ++#endif ++#else ++ nvme_poll_cq(nvmeq, tag); ++#endif ++#ifdef HAVE_PCI_IRQ_API + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); ++#else ++ enable_irq(nvmeq->dev->entry[nvmeq->cq_vector].vector); ++#endif + } + ++#ifndef HAVE_MQ_RQ_STATE ++static int __nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) ++{ ++#ifdef HAVE_PCI_IRQ_API ++ struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); ++#endif ++ int found; ++ ++ if (nvmeq->p2p) ++ return 0; ++ ++ if (!nvme_cqe_pending(nvmeq)) ++ return 0; ++ ++ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { ++ spin_lock(&nvmeq->cq_poll_lock); ++ found = nvme_poll_cq(nvmeq, tag); ++ spin_unlock(&nvmeq->cq_poll_lock); ++ } else { ++#ifdef HAVE_PCI_IRQ_API ++ disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); ++#else ++ disable_irq(nvmeq->dev->entry[nvmeq->cq_vector].vector); ++#endif ++ found = nvme_poll_cq(nvmeq, tag); ++#ifdef HAVE_PCI_IRQ_API ++ enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); ++#else ++ enable_irq(nvmeq->dev->entry[nvmeq->cq_vector].vector); ++#endif ++ } ++ ++ return found; ++} ++#endif ++#ifdef HAVE_BLK_MQ_OPS_POLL ++#ifdef HAVE_BLK_MQ_OPS_POLL_1_ARG ++static int nvme_poll(struct blk_mq_hw_ctx *hctx) ++#else ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) ++#else ++static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) ++#endif ++#endif + { + struct nvme_queue *nvmeq = hctx->driver_data; + bool found; +@@ -1370,12 +1765,49 @@ static int nvme_poll(struct blk_mq_hw_ct + if (!nvme_cqe_pending(nvmeq)) + return 0; + +- spin_lock(&nvmeq->cq_poll_lock); +- found = nvme_poll_cq(nvmeq, iob); +- spin_unlock(&nvmeq->cq_poll_lock); ++ /* ++ * For a poll queue we need to protect against the polling thread ++ * using the CQ lock. For normal interrupt driven threads we have ++ * to disable the interrupt to avoid racing with it. ++ * Note: The polling of non-polled queue is not allowed in new kernels. 
++ */ ++ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { ++ spin_lock(&nvmeq->cq_poll_lock); ++#ifdef HAVE_MQ_RQ_STATE ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG ++ found = nvme_poll_cq(nvmeq, iob); ++#else ++ found = nvme_poll_cq(nvmeq); ++#endif ++#else ++ found = nvme_poll_cq(nvmeq, -1); ++#endif ++ spin_unlock(&nvmeq->cq_poll_lock); ++ } else { ++#ifdef HAVE_PCI_IRQ_API ++ disable_irq(pci_irq_vector(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector)); ++#else ++ disable_irq(nvmeq->dev->entry[nvmeq->cq_vector].vector); ++#endif ++#ifdef HAVE_MQ_RQ_STATE ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG ++ found = nvme_poll_cq(nvmeq, iob); ++#else ++ found = nvme_poll_cq(nvmeq); ++#endif ++#else ++ found = nvme_poll_cq(nvmeq, -1); ++#endif ++#ifdef HAVE_PCI_IRQ_API ++ enable_irq(pci_irq_vector(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector)); ++#else ++ enable_irq(nvmeq->dev->entry[nvmeq->cq_vector].vector); ++#endif ++ } + + return found; + } ++#endif /* HAVE_BLK_MQ_OPS_POLL */ + + static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) + { +@@ -1526,7 +1958,11 @@ static void nvme_warn_reset(struct nvme_ + "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n"); + } + ++#ifdef HAVE_BLK_MQ_OPS_TIMEOUT_1_PARAM ++static enum blk_eh_timer_return nvme_timeout(struct request *req) ++#else + static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) ++#endif + { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; +@@ -1549,22 +1985,52 @@ static enum blk_eh_timer_return nvme_tim + nvme_warn_reset(dev, csts); + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); ++#ifdef HAVE_BLK_EH_DONE + return BLK_EH_DONE; ++#else ++ return BLK_EH_HANDLED; ++#endif + } + + /* + * Did we miss an interrupt? 
+ */ ++#ifdef HAVE_MQ_RQ_STATE ++#ifdef HAVE_REQUEST_MQ_HCTX + if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) ++#ifdef HAVE_BLK_MQ_OPS_POLL_1_ARG ++ nvme_poll(req->mq_hctx); ++#else ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + nvme_poll(req->mq_hctx, NULL); ++#else ++ nvme_poll(req->mq_hctx, req->tag); ++#endif ++#endif /* HAVE_BLK_MQ_OPS_POLL_1_ARG */ ++#else ++ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { ++ if (nvme_cqe_pending(nvmeq)) { ++ spin_lock(&nvmeq->cq_poll_lock); ++ nvme_poll_cq(nvmeq); ++ spin_unlock(&nvmeq->cq_poll_lock); ++ } ++ } ++#endif + else + nvme_poll_irqdisable(nvmeq); + + if (blk_mq_request_completed(req)) { ++#else ++ if (__nvme_poll_irqdisable(nvmeq, req->tag)) { ++#endif /* HAVE_MQ_RQ_STATE */ + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, completion polled\n", + req->tag, nvmeq->qid); ++#ifdef HAVE_BLK_EH_DONE + return BLK_EH_DONE; ++#else ++ return BLK_EH_HANDLED; ++#endif + } + + /* +@@ -1583,7 +2049,11 @@ static enum blk_eh_timer_return nvme_tim + req->tag, nvmeq->qid); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; + nvme_dev_disable(dev, true); ++#ifdef HAVE_BLK_EH_DONE + return BLK_EH_DONE; ++#else ++ return BLK_EH_HANDLED; ++#endif + case NVME_CTRL_RESETTING: + return BLK_EH_RESET_TIMER; + default: +@@ -1603,7 +2073,11 @@ static enum blk_eh_timer_return nvme_tim + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + ++#ifdef HAVE_BLK_EH_DONE + return BLK_EH_DONE; ++#else ++ return BLK_EH_HANDLED; ++#endif + } + + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { +@@ -1620,15 +2094,33 @@ static enum blk_eh_timer_return nvme_tim + "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, + BLK_MQ_REQ_NOWAIT); ++#else ++ abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, ++ GFP_KERNEL, reserved); ++#endif + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + abort_req->end_io_data = NULL; ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_2_PARAM ++ abort_req->end_io = abort_endio; ++ blk_execute_rq_nowait(abort_req, false); ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_5_PARAM ++ blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_3_PARAM + blk_execute_rq_nowait(abort_req, false, abort_endio); ++#else ++ blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio); ++#endif ++#endif ++#endif + + /* + * The aborted req will be completed on receiving the abort req. 
+@@ -1689,7 +2181,11 @@ static int nvme_suspend_queue(struct nvm + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + nvme_stop_admin_queue(&nvmeq->dev->ctrl); + if (!nvmeq->p2p && !test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) ++#ifdef HAVE_PCI_FREE_IRQ + pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); ++#else ++ free_irq(nvmeq_irq(nvmeq), nvmeq); ++#endif + return 0; + } + +@@ -1710,7 +2206,11 @@ static void nvme_disable_admin_queue(str + else + nvme_disable_ctrl(&dev->ctrl); + ++#ifdef HAVE_MQ_RQ_STATE + nvme_poll_irqdisable(nvmeq); ++#else ++ nvme_poll_irqdisable(nvmeq, -1); ++#endif + } + + /* +@@ -1727,7 +2227,15 @@ static void nvme_reap_pending_cqes(struc + if (dev->queues[i].p2p) + continue; + spin_lock(&dev->queues[i].cq_poll_lock); ++#ifdef HAVE_MQ_RQ_STATE ++#ifdef HAVE_BLK_MQ_OPS_POLL_2_ARG + nvme_poll_cq(&dev->queues[i], NULL); ++#else ++ nvme_poll_cq(&dev->queues[i]); ++#endif ++#else ++ nvme_poll_cq(&dev->queues[i], -1); ++#endif + spin_unlock(&dev->queues[i].cq_poll_lock); + } + } +@@ -1801,6 +2309,10 @@ static int nvme_alloc_queue(struct nvme_ + goto free_cqdma; + + nvmeq->dev = dev; ++#ifndef HAVE_PCI_FREE_IRQ ++ snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", ++ dev->ctrl.instance, qid); ++#endif + spin_lock_init(&nvmeq->sq_lock); + spin_lock_init(&nvmeq->cq_poll_lock); + nvmeq->cq_head = 0; +@@ -1823,6 +2335,7 @@ static int nvme_alloc_queue(struct nvme_ + + static int queue_request_irq(struct nvme_queue *nvmeq) + { ++#ifdef HAVE_PCI_FREE_IRQ + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); + int nr = nvmeq->dev->ctrl.instance; + +@@ -1833,6 +2346,14 @@ static int queue_request_irq(struct nvme + return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq, + NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid); + } ++#else ++ if (use_threaded_interrupts) ++ return request_threaded_irq(nvmeq_irq(nvmeq), nvme_irq_check, ++ nvme_irq, IRQF_SHARED, nvmeq->irqname, nvmeq); ++ else ++ return request_irq(nvmeq_irq(nvmeq), nvme_irq, IRQF_SHARED, ++ nvmeq->irqname, nvmeq); ++#endif + } + + static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +@@ -1885,7 +2406,11 @@ static int nvme_create_queue(struct nvme + * has only one vector available. + */ + if (!polled && !nvmeq->p2p) ++#if defined(HAVE_PCI_IRQ_API) && defined(HAVE_IRQ_CALC_AFFINITY_VECTORS_3_ARGS) + vector = dev->num_vecs == 1 ? 
0 : qid; ++#else ++ vector = qid - 1; ++#endif + else if (polled) + set_bit(NVMEQ_POLLED, &nvmeq->flags); + +@@ -1929,24 +2454,52 @@ release_cq: + return result; + } + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_mq_admin_ops = { ++#else ++static struct blk_mq_ops nvme_mq_admin_ops = { ++#endif + .queue_rq = nvme_queue_rq, + .complete = nvme_pci_complete_rq, ++#ifdef HAVE_BLK_MQ_OPS_MAP_QUEUE ++ .map_queue = blk_mq_map_queue, ++#endif + .init_hctx = nvme_admin_init_hctx, ++#ifdef HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS + .init_request = nvme_init_request, ++#else ++ .init_request = nvme_admin_init_request, ++#endif + .timeout = nvme_timeout, + }; + ++#ifdef HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS + static const struct blk_mq_ops nvme_mq_ops = { ++#else ++static struct blk_mq_ops nvme_mq_ops = { ++#endif + .queue_rq = nvme_queue_rq, ++#ifdef HAVE_BLK_MQ_OPS_QUEUE_RQS + .queue_rqs = nvme_queue_rqs, ++#endif + .complete = nvme_pci_complete_rq, ++#ifdef HAVE_BLK_MQ_OPS_MAP_QUEUE ++ .map_queue = blk_mq_map_queue, ++#endif ++#ifdef HAVE_BLK_MQ_OPS_COMMIT_RQS + .commit_rqs = nvme_commit_rqs, ++#endif + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, ++#if defined(HAVE_BLK_MQ_OPS_MAP_QUEUES) && \ ++ (defined(HAVE_PCI_IRQ_GET_AFFINITY) || \ ++ defined(HAVE_BLK_MQ_PCI_MAP_QUEUES_3_ARGS)) + .map_queues = nvme_pci_map_queues, ++#endif + .timeout = nvme_timeout, ++#ifdef HAVE_BLK_MQ_OPS_POLL + .poll = nvme_poll, ++#endif + }; + + static void nvme_dev_remove_admin(struct nvme_dev *dev) +@@ -1958,7 +2511,11 @@ static void nvme_dev_remove_admin(struct + * queue to flush these to completion. + */ + nvme_start_admin_queue(&dev->ctrl); ++#ifdef HAVE_BLK_MQ_DESTROY_QUEUE ++ blk_mq_destroy_queue(dev->ctrl.admin_q); ++#else + blk_cleanup_queue(dev->ctrl.admin_q); ++#endif + blk_mq_free_tag_set(&dev->admin_tagset); + } + } +@@ -1973,7 +2530,9 @@ static int nvme_alloc_admin_tags(struct + dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev->ctrl.numa_node; + dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); ++#ifdef HAVE_BLK_MQ_F_NO_SCHED + dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; ++#endif + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) +@@ -2086,12 +2645,16 @@ static int nvme_create_io_queues(struct + } + + max = min(dev->max_qid, dev->ctrl.queue_count - 1); ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { + rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + + dev->io_queues[HCTX_TYPE_READ]; + } else { + rw_queues = max; + } ++#else ++ rw_queues = max; ++#endif + + for (i = dev->online_queues; i <= max; i++) { + bool polled = i > rw_queues && !dev->queues[i].p2p; +@@ -2226,9 +2789,20 @@ static void nvme_free_host_mem(struct nv + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; + size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; + ++#ifdef HAVE_DMA_ATTRS ++ DEFINE_DMA_ATTRS(attrs); ++ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs); ++ dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], ++ le64_to_cpu(desc->addr), &attrs); ++#else + dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], + le64_to_cpu(desc->addr), ++#ifdef HAVE_DMA_ATTR_NO_WARN + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); ++#else ++ DMA_ATTR_NO_KERNEL_MAPPING); ++#endif ++#endif + } + + kfree(dev->host_mem_desc_bufs); +@@ -2268,10 +2842,26 @@ static int __nvme_alloc_host_mem(struct + + for (size = 0; size < preferred && i < max_entries; size += 
len) { + dma_addr_t dma_addr; ++#ifndef HAVE_DMA_SET_ATTR_TAKES_UNSIGNED_LONG_ATTRS ++ DEFINE_DMA_ATTRS(attrs); ++#ifdef HAVE_DMA_ATTR_NO_WARN ++ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN, &attrs); ++#else ++ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs); ++#endif ++#endif + + len = min_t(u64, chunk_size, preferred - size); + bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, ++#ifdef HAVE_DMA_SET_ATTR_TAKES_UNSIGNED_LONG_ATTRS ++#ifdef HAVE_DMA_ATTR_NO_WARN + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); ++#else ++ DMA_ATTR_NO_KERNEL_MAPPING); ++#endif ++#else ++ &attrs); ++#endif + if (!bufs[i]) + break; + +@@ -2294,9 +2884,20 @@ out_free_bufs: + while (--i >= 0) { + size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; + ++#ifdef HAVE_DMA_ATTRS ++ DEFINE_DMA_ATTRS(attrs); ++ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs); ++ dma_free_attrs(dev->dev, size, bufs[i], ++ le64_to_cpu(descs[i].addr), &attrs); ++#else + dma_free_attrs(dev->dev, size, bufs[i], + le64_to_cpu(descs[i].addr), ++#ifdef HAVE_DMA_ATTR_NO_WARN + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); ++#else ++ DMA_ATTR_NO_KERNEL_MAPPING); ++#endif ++#endif + } + + kfree(bufs); +@@ -2467,6 +3068,7 @@ static const struct attribute_group nvme + .is_visible = nvme_pci_attrs_are_visible, + }; + ++#ifdef HAVE_IRQ_AFFINITY_PRIV + /* + * nirqs is the number of interrupts available for write and read + * queues. The core already reserved an interrupt for the admin queue. +@@ -2504,14 +3106,18 @@ static void nvme_calc_irq_sets(struct ir + affd->set_size[HCTX_TYPE_READ] = nr_read_queues; + affd->nr_sets = nr_read_queues ? 2 : 1; + } ++#endif + ++#if defined(HAVE_PCI_IRQ_API) && defined(HAVE_IRQ_CALC_AFFINITY_VECTORS_3_ARGS) + static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) + { + struct pci_dev *pdev = to_pci_dev(dev->dev); + struct irq_affinity affd = { + .pre_vectors = 1, ++#ifdef HAVE_IRQ_AFFINITY_PRIV + .calc_sets = nvme_calc_irq_sets, + .priv = dev, ++#endif + }; + unsigned int irq_queues, poll_queues; + +@@ -2520,6 +3126,7 @@ static int nvme_setup_irqs(struct nvme_d + * left over for non-polled I/O. 
+ */ + poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1); ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + dev->io_queues[HCTX_TYPE_POLL] = poll_queues; + + /* +@@ -2528,6 +3135,7 @@ static int nvme_setup_irqs(struct nvme_d + */ + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; + dev->io_queues[HCTX_TYPE_READ] = 0; ++#endif + + /* + * We need interrupts for the admin queue and each non-polled I/O queue, +@@ -2540,6 +3148,7 @@ static int nvme_setup_irqs(struct nvme_d + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); + } ++#endif + + static void nvme_disable_io_queues(struct nvme_dev *dev) + { +@@ -2565,6 +3174,9 @@ static int nvme_setup_io_queues(struct n + unsigned int nr_io_queues; + unsigned long size; + int result; ++#ifndef HAVE_PCI_IRQ_API ++ int i, vecs; ++#endif + + /* + * Sample the module parameters once at reset time so that we have +@@ -2593,7 +3205,13 @@ static int nvme_setup_io_queues(struct n + if (result) + return result; + if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) ++#ifdef HAVE_PCI_FREE_IRQ + pci_free_irq(pdev, 0, adminq); ++#elif defined(HAVE_PCI_IRQ_API) ++ free_irq(pci_irq_vector(pdev, 0), adminq); ++#else ++ free_irq(dev->entry[0].vector, adminq); ++#endif + + if (dev->cmb_use_sqes) { + result = nvme_cmb_qdepth(dev, nr_io_queues, +@@ -2630,14 +3248,22 @@ static int nvme_setup_io_queues(struct n + retry: + /* Deregister the admin queue's interrupt */ + if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) ++#ifdef HAVE_PCI_FREE_IRQ + pci_free_irq(pdev, 0, adminq); ++#elif defined(HAVE_PCI_IRQ_API) ++ free_irq(pci_irq_vector(pdev, 0), adminq); ++#else ++ free_irq(dev->entry[0].vector, adminq); ++#endif + + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. 
+ */ ++#ifdef HAVE_PCI_IRQ_API + pci_free_irq_vectors(pdev); + ++#ifdef HAVE_IRQ_CALC_AFFINITY_VECTORS_3_ARGS + result = nvme_setup_irqs(dev, nr_io_queues); + if (result <= 0) { + result = -EIO; +@@ -2646,7 +3272,39 @@ static int nvme_setup_io_queues(struct n + + dev->num_vecs = result; + result = max(result - 1 + dev->num_p2p_queues, 1u); ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; ++#else ++ dev->max_qid = result; ++#endif ++#else ++ nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues - dev->num_p2p_queues, ++ PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY); ++ if (nr_io_queues <= 0) ++ return -EIO; ++ dev->max_qid = nr_io_queues + dev->num_p2p_queues; ++#endif ++#else ++ if (pdev->msi_enabled) ++ pci_disable_msi(pdev); ++ else if (pdev->msix_enabled) ++ pci_disable_msix(pdev); ++ ++ for (i = 0; i < nr_io_queues - dev->num_p2p_queues; i++) ++ dev->entry[i].entry = i; ++ vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues - dev->num_p2p_queues); ++ if (vecs < 0) { ++ vecs = pci_enable_msi_range(pdev, 1, min((nr_io_queues - dev->num_p2p_queues), 32u)); ++ if (vecs < 0) { ++ vecs = 1; ++ } else { ++ for (i = 0; i < vecs; i++) ++ dev->entry[i].vector = i + pdev->irq; ++ } ++ } ++ nr_io_queues = vecs; ++ dev->max_qid = nr_io_queues + dev->num_p2p_queues; ++#endif /* HAVE_PCI_IRQ_API */ + + /* + * Should investigate if there's a performance win from allocating +@@ -2673,10 +3331,12 @@ static int nvme_setup_io_queues(struct n + nvme_suspend_io_queues(dev); + goto retry; + } ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev->io_queues[HCTX_TYPE_DEFAULT], + dev->io_queues[HCTX_TYPE_READ], + dev->io_queues[HCTX_TYPE_POLL]); ++#endif + return 0; + out_unlock: + mutex_unlock(&dev->shutdown_lock); +@@ -2710,15 +3370,36 @@ static int nvme_delete_queue(struct nvme + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); + ++#ifdef HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); ++#else ++ req = nvme_alloc_request(q, &cmd, GFP_KERNEL, false); ++#endif + if (IS_ERR(req)) + return PTR_ERR(req); + + req->end_io_data = nvmeq; + + init_completion(&nvmeq->delete_done); +- blk_execute_rq_nowait(req, false, opcode == nvme_admin_delete_cq ? +- nvme_del_cq_end : nvme_del_queue_end); ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_2_PARAM ++ if (opcode == nvme_admin_delete_cq) ++ req->end_io = nvme_del_cq_end; ++ else ++ req->end_io = nvme_del_queue_end; ++ blk_execute_rq_nowait(req, false); ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_5_PARAM ++ blk_execute_rq_nowait(q, NULL, req, false, ++#else ++#ifdef HAVE_BLK_EXECUTE_RQ_NOWAIT_3_PARAM ++ blk_execute_rq_nowait(req, false, ++#else ++ blk_execute_rq_nowait(NULL, req, false, ++#endif ++#endif ++ opcode == nvme_admin_delete_cq ? 
++ nvme_del_cq_end : nvme_del_queue_end); ++#endif + return 0; + } + +@@ -2758,9 +3439,11 @@ static void nvme_dev_add(struct nvme_dev + if (!dev->ctrl.tagset) { + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = nr_hw_queues; ++#ifdef HAVE_BLK_MQ_HCTX_TYPE + dev->tagset.nr_maps = 2; /* default + read */ + if (dev->io_queues[HCTX_TYPE_POLL]) + dev->tagset.nr_maps++; ++#endif + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev->ctrl.numa_node; + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, +@@ -2785,7 +3468,9 @@ static void nvme_dev_add(struct nvme_dev + } + dev->ctrl.tagset = &dev->tagset; + } else { ++#ifdef HAVE_BLK_MQ_UPDATE_NR_HW_QUEUES + blk_mq_update_nr_hw_queues(&dev->tagset, nr_hw_queues); ++#endif + + /* Free previously allocated queues that are no longer usable */ + nvme_free_queues(dev, dev->online_queues); +@@ -2820,9 +3505,21 @@ static int nvme_pci_enable(struct nvme_d + * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll + * adjust this later. + */ ++#ifdef HAVE_PCI_IRQ_API + result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (result < 0) + return result; ++#else ++ if (pci_enable_msix(pdev, dev->entry, 1)) { ++ pci_enable_msi(pdev); ++ dev->entry[0].vector = pdev->irq; ++ } ++ ++ if (!dev->entry[0].vector) { ++ result = -ENODEV; ++ goto disable; ++ } ++#endif + + dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + +@@ -2893,7 +3590,14 @@ static void nvme_pci_disable(struct nvme + { + struct pci_dev *pdev = to_pci_dev(dev->dev); + ++#ifdef HAVE_PCI_IRQ_API + pci_free_irq_vectors(pdev); ++#else ++ if (pdev->msi_enabled) ++ pci_disable_msi(pdev); ++ else if (pdev->msix_enabled) ++ pci_disable_msix(pdev); ++#endif + + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); +@@ -3002,7 +3706,12 @@ static void nvme_pci_free_ctrl(struct nv + nvme_free_tagset(dev); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); ++#ifdef HAVE_LINUX_SED_OPAL_H + free_opal_dev(dev->ctrl.opal_dev); ++#endif ++#ifndef HAVE_PCI_IRQ_API ++ kfree(dev->entry); ++#endif + mempool_destroy(dev->iod_mempool); + put_device(dev->dev); + kfree(dev->queues); +@@ -3027,7 +3736,9 @@ static void nvme_reset_work(struct work_ + { + struct nvme_dev *dev = + container_of(work, struct nvme_dev, ctrl.reset_work); ++#ifdef HAVE_LINUX_SED_OPAL_H + bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); ++#endif + int result; + + if (dev->ctrl.state != NVME_CTRL_RESETTING) { +@@ -3062,15 +3773,21 @@ static void nvme_reset_work(struct work_ + * Limit the max command size to prevent iod->sg allocations going + * over a single page. + */ ++#ifdef HAVE_DMA_MAX_MAPPING_SIZE + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); ++#else ++ dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; ++#endif + dev->ctrl.max_segments = NVME_MAX_SEGS; + + /* + * Don't limit the IOMMU merged segment size. 
+ */ + dma_set_max_seg_size(dev->dev, 0xffffffff); ++#ifdef HAVE_DMA_SET_MIN_ALIGN_MASK + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); ++#endif + + mutex_unlock(&dev->shutdown_lock); + +@@ -3095,6 +3812,7 @@ static void nvme_reset_work(struct work_ + if (result) + goto out; + ++#ifdef HAVE_LINUX_SED_OPAL_H + if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!dev->ctrl.opal_dev) + dev->ctrl.opal_dev = +@@ -3105,6 +3823,7 @@ static void nvme_reset_work(struct work_ + free_opal_dev(dev->ctrl.opal_dev); + dev->ctrl.opal_dev = NULL; + } ++#endif + + if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { + result = nvme_dbbuf_dma_alloc(dev); +@@ -3304,6 +4023,13 @@ static void nvme_async_probe(void *data, + nvme_put_ctrl(&dev->ctrl); + } + ++#ifndef HAVE_ACPI_STORAGE_D3 ++static inline bool acpi_storage_d3(struct device *dev) ++{ ++ return false; ++} ++#endif ++ + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) + { + int node, result = -ENOMEM; +@@ -3319,6 +4045,13 @@ static int nvme_probe(struct pci_dev *pd + if (!dev) + return -ENOMEM; + ++#ifndef HAVE_PCI_IRQ_API ++ dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), ++ GFP_KERNEL, node); ++ if (!dev->entry) ++ goto free; ++#endif ++ + dev->nr_write_queues = write_queues; + dev->nr_poll_queues = poll_queues; + dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1; +@@ -3404,10 +4137,24 @@ static int nvme_probe(struct pci_dev *pd + put_device(dev->dev); + free: + kfree(dev->queues); ++#ifndef HAVE_PCI_IRQ_API ++ kfree(dev->entry); ++#endif + kfree(dev); + return result; + } + ++#ifdef HAVE_PCI_ERROR_HANDLERS_RESET_NOTIFY ++static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) ++{ ++ struct nvme_dev *dev = pci_get_drvdata(pdev); ++ ++ if (prepare) ++ nvme_dev_disable(dev, false); ++ else ++ nvme_reset_ctrl(&dev->ctrl); ++} ++#elif defined(HAVE_PCI_ERROR_HANDLERS_RESET_PREPARE) && defined(HAVE_PCI_ERROR_HANDLERS_RESET_DONE) + static void nvme_reset_prepare(struct pci_dev *pdev) + { + struct nvme_dev *dev = pci_get_drvdata(pdev); +@@ -3428,6 +4175,7 @@ static void nvme_reset_done(struct pci_d + if (!nvme_try_sched_reset(&dev->ctrl)) + flush_work(&dev->ctrl.reset_work); + } ++#endif + + static void nvme_shutdown(struct pci_dev *pdev) + { +@@ -3473,6 +4221,7 @@ static void nvme_remove(struct pci_dev * + nvme_uninit_ctrl(&dev->ctrl); + } + ++#ifdef HAVE_PM_SUSPEND_VIA_FIRMWARE + #ifdef CONFIG_PM_SLEEP + static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps) + { +@@ -3600,6 +4349,7 @@ static const struct dev_pm_ops nvme_dev_ + .restore = nvme_simple_resume, + }; + #endif /* CONFIG_PM_SLEEP */ ++#endif /* HAVE_PM_SUSPEND_VIA_FIRMWARE */ + + static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +@@ -3648,11 +4398,20 @@ static const struct pci_error_handlers n + .error_detected = nvme_error_detected, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, ++#ifdef HAVE_PCI_ERROR_HANDLERS_RESET_NOTIFY ++ .reset_notify = nvme_reset_notify, ++#elif defined(HAVE_PCI_ERROR_HANDLERS_RESET_PREPARE) && defined(HAVE_PCI_ERROR_HANDLERS_RESET_DONE) + .reset_prepare = nvme_reset_prepare, + .reset_done = nvme_reset_done, ++#endif /* HAVE_PCI_ERROR_HANDLERS_RESET_NOTIFY */ + }; + ++#ifndef HAVE_PCI_CLASS_STORAGE_EXPRESS ++#define PCI_CLASS_STORAGE_EXPRESS 0x010802 ++#endif ++ + static const struct pci_device_id nvme_id_table[] = { ++#ifdef HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS + { PCI_VDEVICE(INTEL, 0x0953), /* Intel 
750/P3500/P3600/P3700 */ + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, +@@ -3666,6 +4425,20 @@ static const struct pci_device_id nvme_i + { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, ++#else ++ { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ ++ .driver_data = NVME_QUIRK_STRIPE_SIZE | ++ NVME_QUIRK_DISCARD_ZEROES, }, ++ { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ ++ .driver_data = NVME_QUIRK_STRIPE_SIZE | ++ NVME_QUIRK_DISCARD_ZEROES, }, ++ { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ ++ .driver_data = NVME_QUIRK_STRIPE_SIZE | ++ NVME_QUIRK_DISCARD_ZEROES, }, ++ { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ ++ .driver_data = NVME_QUIRK_STRIPE_SIZE | ++ NVME_QUIRK_DISCARD_ZEROES, }, ++#endif + { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_MEDIUM_PRIO_SQ | +@@ -3677,8 +4450,10 @@ static const struct pci_device_id nvme_i + .driver_data = NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_DISABLE_WRITE_ZEROES | + NVME_QUIRK_BOGUS_NID, }, ++#ifdef HAVE_PCI_VENDOR_ID_REDHAT + { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, ++#endif + { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ + .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | + NVME_QUIRK_BOGUS_NID, }, +@@ -3740,6 +4515,7 @@ static const struct pci_device_id nvme_i + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, ++#ifdef HAVE_PCI_IDS_PCI_VENDOR_ID_AMAZON + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), +@@ -3752,6 +4528,7 @@ static const struct pci_device_id nvme_i + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, ++#endif + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), + .driver_data = NVME_QUIRK_SINGLE_VECTOR }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, +@@ -3766,18 +4543,44 @@ static const struct pci_device_id nvme_i + }; + MODULE_DEVICE_TABLE(pci, nvme_id_table); + ++#ifndef PCI_SRIOV_CONFIGURE_SIMPLE ++static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs) ++{ ++ int ret = 0; ++ ++ if (numvfs == 0) { ++ if (pci_vfs_assigned(pdev)) { ++ dev_warn(&pdev->dev, ++ "Cannot disable SR-IOV VFs while assigned\n"); ++ return -EPERM; ++ } ++ pci_disable_sriov(pdev); ++ return 0; ++ } ++ ++ ret = pci_enable_sriov(pdev, numvfs); ++ return ret ? 
ret : numvfs; ++} ++#endif ++ + static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = nvme_remove, + .shutdown = nvme_shutdown, ++#ifdef HAVE_PM_SUSPEND_VIA_FIRMWARE + #ifdef CONFIG_PM_SLEEP + .driver = { + .pm = &nvme_dev_pm_ops, + }, + #endif ++#endif ++#ifdef PCI_SRIOV_CONFIGURE_SIMPLE + .sriov_configure = pci_sriov_configure_simple, ++#else ++ .sriov_configure = nvme_pci_sriov_configure, ++#endif + .err_handler = &nvme_err_handler, + }; + +@@ -3811,7 +4614,9 @@ static int __init nvme_init(void) + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); ++#ifdef HAVE_IRQ_AFFINITY_PRIV + BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); ++#endif + + return pci_register_driver(&nvme_driver); + } +@@ -3824,6 +4629,9 @@ static void __exit nvme_exit(void) + + MODULE_AUTHOR("Matthew Wilcox "); + MODULE_LICENSE("GPL"); ++#ifdef RETPOLINE_MLNX ++MODULE_INFO(retpoline, "Y"); ++#endif + MODULE_VERSION("1.0"); + module_init(nvme_init); + module_exit(nvme_exit); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0314-BACKPORT-drivers-nvme-target-zns.c.patch b/src/mlnx-ofa_kernel-5.8/backports/0314-BACKPORT-drivers-nvme-target-zns.c.patch new file mode 100644 index 0000000..6fa34a9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0314-BACKPORT-drivers-nvme-target-zns.c.patch @@ -0,0 +1,192 @@ +From: Israel Rukshin +Subject: [PATCH] BACKPORT: drivers/nvme/target/zns.c + +Change-Id: I558493b5af6b4ceb8c3e33405d9dbe7c73ecedf9 +--- + drivers/nvme/target/zns.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 45 insertions(+) + +--- a/drivers/nvme/target/zns.c ++++ b/drivers/nvme/target/zns.c +@@ -3,11 +3,15 @@ + * NVMe ZNS-ZBD command implementation. + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ ++#ifdef pr_fmt ++#undef pr_fmt ++#endif + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include + #include + #include "nvmet.h" + ++#ifdef HAVE_BIO_ADD_ZONE_APPEND_PAGE + /* + * We set the Memory Page Size Minimum (MPSMIN) for target controller to 0 + * which gets added by 12 in the nvme_enable_ctrl() which results in 2^12 = 4k +@@ -58,10 +62,18 @@ bool nvmet_bdev_zns_enable(struct nvmet_ + * zones, reject the device. Otherwise, use report zones to detect if + * the device has conventional zones. 
+ */ ++#ifdef HAVE_GENDISK_CONV_ZONES_BITMAP ++ if (ns->bdev->bd_disk->conv_zones_bitmap) ++#else + if (ns->bdev->bd_disk->queue->conv_zones_bitmap) ++#endif + return false; + ++#ifdef HAVE_BDEV_NR_ZONES ++ ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev), ++#else + ret = blkdev_report_zones(ns->bdev, 0, blkdev_nr_zones(bd_disk), ++#endif + validate_conv_zones_cb, NULL); + if (ret < 0) + return false; +@@ -251,7 +263,11 @@ static unsigned long nvmet_req_nr_zones_ + { + unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); + ++#ifdef HAVE_BDEV_NR_ZONES ++ return bdev_nr_zones(req->ns->bdev) - ++#else + return blkdev_nr_zones(req->ns->bdev->bd_disk) - ++#endif + (sect >> ilog2(bdev_zone_sectors(req->ns->bdev))); + } + +@@ -318,7 +334,11 @@ void nvmet_bdev_execute_zone_mgmt_recv(s + queue_work(zbd_wq, &req->z.zmgmt_work); + } + ++#ifdef HAVE_BLK_TYPES_REQ_OPF + static inline enum req_opf zsa_req_op(u8 zsa) ++#else ++static inline enum req_op zsa_req_op(u8 zsa) ++#endif + { + switch (zsa) { + case NVME_ZONE_OPEN: +@@ -393,10 +413,31 @@ static int zmgmt_send_scan_cb(struct blk + return 0; + } + ++#ifndef HAVE_BLK_NEXT_BIO_3_PARAMS ++#ifndef HAVE_BIO_INIT_5_PARAMS ++static struct bio *blk_next_bio(struct bio *bio, ++ unsigned int nr_pages, gfp_t gfp) ++{ ++ struct bio *new = bio_alloc(gfp, nr_pages); ++ ++ if (bio) { ++ bio_chain(bio, new); ++ submit_bio(bio); ++ } ++ ++ return new; ++} ++#endif ++#endif ++ + static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req) + { + struct block_device *bdev = req->ns->bdev; ++#ifdef HAVE_BDEV_NR_ZONES ++ unsigned int nr_zones = bdev_nr_zones(bdev); ++#else + unsigned int nr_zones = blkdev_nr_zones(bdev->bd_disk); ++#endif + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = NULL; + sector_t sector = 0; +@@ -423,16 +464,35 @@ static u16 nvmet_bdev_zone_mgmt_emulate_ + ret = 0; + } + ++#ifdef HAVE_BLK_QUEUE_ZONE_SECTORS + while (sector < get_capacity(bdev->bd_disk)) { ++#else ++ while (sector < bdev_nr_sectors(bdev)) { ++#endif ++#ifdef HAVE_GENDISK_CONV_ZONES_BITMAP ++ if (test_bit(disk_zone_no(bdev->bd_disk, sector), d.zbitmap)) { ++#else + if (test_bit(blk_queue_zone_no(q, sector), d.zbitmap)) { ++#endif ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio = blk_next_bio(bio, bdev, 0, ++ zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC, ++ GFP_KERNEL); ++ bio->bi_iter.bi_sector = sector; ++#else + bio = blk_next_bio(bio, 0, GFP_KERNEL); + bio->bi_opf = zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC; + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); ++#endif + /* This may take a while, so be nice to others */ + cond_resched(); + } ++#ifdef HAVE_BLK_QUEUE_ZONE_SECTORS + sector += blk_queue_zone_sectors(q); ++#else ++ sector += bdev_zone_sectors(bdev); ++#endif + } + + if (bio) { +@@ -475,7 +535,11 @@ static void nvmet_bdev_zmgmt_send_work(s + { + struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work); + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba); ++#ifdef HAVE_BLK_TYPES_REQ_OPF + enum req_opf op = zsa_req_op(req->cmd->zms.zsa); ++#else ++ enum req_op op = zsa_req_op(req->cmd->zms.zsa); ++#endif + struct block_device *bdev = req->ns->bdev; + sector_t zone_sectors = bdev_zone_sectors(bdev); + u16 status = NVME_SC_SUCCESS; +@@ -535,6 +599,9 @@ static void nvmet_bdev_zone_append_bio_d + void nvmet_bdev_execute_zone_append(struct nvmet_req *req) + { + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ const unsigned int op = 
REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; ++#endif + u16 status = NVME_SC_SUCCESS; + unsigned int total_len = 0; + struct scatterlist *sg; +@@ -564,14 +631,27 @@ void nvmet_bdev_execute_zone_append(stru + + if (nvmet_use_inline_bvec(req)) { + bio = &req->z.inline_bio; ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio_init(bio, req->ns->bdev, req->inline_bvec, ++ ARRAY_SIZE(req->inline_bvec), op); ++#else + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); ++#endif + } else { ++#ifdef HAVE_BIO_INIT_5_PARAMS ++ bio = bio_alloc(req->ns->bdev, req->sg_cnt, op, GFP_KERNEL); ++#else + bio = bio_alloc(GFP_KERNEL, req->sg_cnt); ++#endif + } + ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; ++#endif + bio->bi_end_io = nvmet_bdev_zone_append_bio_done; ++#ifndef HAVE_BIO_INIT_5_PARAMS + bio_set_dev(bio, req->ns->bdev); ++#endif + bio->bi_iter.bi_sector = sect; + bio->bi_private = req; + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) +@@ -623,3 +703,4 @@ u16 nvmet_bdev_zns_parse_io_cmd(struct n + return nvmet_bdev_parse_io_cmd(req); + } + } ++#endif /* HAVE_BIO_ADD_ZONE_APPEND_PAGE */ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.h.patch b/src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.h.patch new file mode 100644 index 0000000..24a306e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-infiniband-ulp-isert-ib_isert.h.patch @@ -0,0 +1,34 @@ +From: Sergey Gorenko +Subject: [PATCH] BACKPORT: drivers/infiniband/ulp/isert/ib_isert.h + +Change-Id: If5432fe227487897e738aa6f16d704d4d7aa4b2b +--- + drivers/infiniband/ulp/isert/ib_isert.h | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/drivers/infiniband/ulp/isert/ib_isert.h ++++ b/drivers/infiniband/ulp/isert/ib_isert.h +@@ -11,6 +11,23 @@ + #define DRV_NAME "isert" + #define PFX DRV_NAME ": " + ++#ifdef HAVE_ISCSIT_CONN ++#define iscsi_conn iscsit_conn ++#endif ++ ++#ifdef HAVE_ISCSIT_CMD ++#define iscsi_cmd iscsit_cmd ++#endif ++ ++#ifdef HAVE_ISCSIT_CONN_LOGIN_SOCKADDR ++#define HAVE_ISCSI_CONN_LOGIN_SOCKADDR 1 ++#endif ++ ++#ifdef HAVE_ISCSIT_CONN_LOCAL_SOCKADDR ++#define HAVE_ISCSI_CONN_LOCAL_SOCKADDR 1 ++#endif ++ ++ + #define isert_dbg(fmt, arg...) 
\ + do { \ + if (unlikely(isert_debug_level > 2)) \ diff --git a/src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..78ba284 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0315-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,27 @@ +From: Roi Dayan +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h + +Change-Id: I0bfa3760a3c0bcc6b3666df612851b17540f2d53 +--- + drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h +@@ -49,6 +49,7 @@ struct mlx5e_tc_act { + const struct flow_action_entry *act, + struct mlx5_flow_attr *attr); + ++#ifdef HAVE_FLOW_OFFLOAD_ACTION + int (*offload_action)(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act, + struct flow_action_entry *act); +@@ -58,6 +59,7 @@ struct mlx5e_tc_act { + + int (*stats_action)(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act); ++#endif + }; + + struct mlx5e_tc_flow_action { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0316-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch b/src/mlnx-ofa_kernel-5.8/backports/0316-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch new file mode 100644 index 0000000..237da5b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0316-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-en-.patch @@ -0,0 +1,21 @@ +From: Lama Kayal +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/en/params.h + +Change-Id: I3743469f528b65e78f8378b7a95a3a15460ab0fd +--- + drivers/net/ethernet/mellanox/mlx5/core/en/params.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +@@ -93,8 +93,10 @@ void mlx5e_set_rx_cq_mode_params(struct + + bool slow_pci_heuristic(struct mlx5_core_dev *mdev); + int mlx5e_mpwrq_validate_regular(struct mlx5_core_dev *mdev, struct mlx5e_params *params); ++#ifdef HAVE_XSK_ZERO_COPY_SUPPORT + int mlx5e_mpwrq_validate_xsk(struct mlx5_core_dev *mdev, struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); ++#endif + void mlx5e_build_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); + void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params); + void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); diff --git a/src/mlnx-ofa_kernel-5.8/backports/0318-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch b/src/mlnx-ofa_kernel-5.8/backports/0318-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch new file mode 100644 index 0000000..0f7d49a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0318-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch @@ -0,0 +1,40 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: + drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c + +Change-Id: I923d56ef3bd76c7fddb4cabb6046ad93127f3c16 +--- + .../net/ethernet/mellanox/mlx5/core/sf/devlink.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c +@@ -166,8 +166,12 @@ static bool mlx5_sf_is_active(const stru + { + return 
sf->hw_state == MLX5_VHCA_STATE_ACTIVE || sf->hw_state == MLX5_VHCA_STATE_IN_USE; + } +- +-int mlx5_devlink_sf_port_fn_state_get(struct devlink_port *dl_port, ++#ifdef HAVE_PORT_FUNCTION_STATE_GET_4_PARAM ++int mlx5_devlink_sf_port_fn_state_get( ++#else ++int mlx5_devlink_sf_port_fn_state_get(struct devlink *devlink, ++#endif ++ struct devlink_port *dl_port, + enum devlink_port_fn_state *state, + enum devlink_port_fn_opstate *opstate, + struct netlink_ext_ack *extack) +@@ -253,7 +257,12 @@ out: + return err; + } + +-int mlx5_devlink_sf_port_fn_state_set(struct devlink_port *dl_port, ++#ifdef HAVE_PORT_FUNCTION_STATE_GET_4_PARAM ++int mlx5_devlink_sf_port_fn_state_set( ++#else ++int mlx5_devlink_sf_port_fn_state_set(struct devlink *devlink, ++#endif ++ struct devlink_port *dl_port, + enum devlink_port_fn_state state, + struct netlink_ext_ack *extack) + { diff --git a/src/mlnx-ofa_kernel-5.8/backports/0319-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch b/src/mlnx-ofa_kernel-5.8/backports/0319-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch new file mode 100644 index 0000000..48016b8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/backports/0319-BACKPORT-drivers-net-ethernet-mellanox-mlx5-core-sf-.patch @@ -0,0 +1,34 @@ +From: Maher Sanalla +Subject: [PATCH] BACKPORT: drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h + +Change-Id: I4c99ddeeb0c7171db284df2e7c62ee1b29f23a1d +--- + drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h +@@ -23,11 +23,21 @@ int mlx5_devlink_sf_port_new(struct devl + unsigned int *new_port_index); + int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack); +-int mlx5_devlink_sf_port_fn_state_get(struct devlink_port *dl_port, ++#ifdef HAVE_PORT_FUNCTION_STATE_GET_4_PARAM ++int mlx5_devlink_sf_port_fn_state_get( ++#else ++int mlx5_devlink_sf_port_fn_state_get(struct devlink *devlink, ++#endif ++ struct devlink_port *dl_port, + enum devlink_port_fn_state *state, + enum devlink_port_fn_opstate *opstate, + struct netlink_ext_ack *extack); +-int mlx5_devlink_sf_port_fn_state_set(struct devlink_port *dl_port, ++#ifdef HAVE_PORT_FUNCTION_STATE_GET_4_PARAM ++int mlx5_devlink_sf_port_fn_state_set( ++#else ++int mlx5_devlink_sf_port_fn_state_set(struct devlink *devlink, ++#endif ++ struct devlink_port *dl_port, + enum devlink_port_fn_state state, + struct netlink_ext_ack *extack); + #if IS_ENABLED(CONFIG_MLXDEVM) diff --git a/src/mlnx-ofa_kernel-5.8/block/blk-mq-rdma.c b/src/mlnx-ofa_kernel-5.8/block/blk-mq-rdma.c new file mode 100644 index 0000000..14f968e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/block/blk-mq-rdma.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2017 Sagi Grimberg. + */ +#include +#include +#include + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @map: CPU to hardware queue map. + * @dev: rdma device to provide a mapping for. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vetors as @set has queues. It will then query it's affinity mask + * and built queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. 
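[Editor's note -- illustrative sketch, not part of this patch series.] The kernel-doc above describes the mapping policy only; the patch does not show a caller. A ULP's ->map_queues callback would typically hand one of its tag-set maps plus the ib_device to this helper. The controller type, its "ibdev" field and the single-map layout below are assumptions for illustration (real consumers such as nvme-rdma also handle read/poll maps and error fallbacks):

        #include <linux/blk-mq.h>
        #include <linux/blk-mq-rdma.h>
        #include <rdma/ib_verbs.h>

        struct foo_ctrl {                       /* hypothetical ULP state */
                struct ib_device *ibdev;
        };

        static int foo_map_queues(struct blk_mq_tag_set *set)
        {
                struct foo_ctrl *ctrl = set->driver_data;

                /*
                 * first_vec = 0: hctx 0 follows the affinity of the device's
                 * first completion vector; offset it if vector 0 is reserved.
                 */
                return blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
                                              ctrl->ibdev, 0);
        }
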
+ * + * In case either the driver passed a @dev with less vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fallback to the naive mapping. + */ +int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < map->nr_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + map->mq_map[cpu] = map->queue_offset + queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(map); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/src/mlnx-ofa_kernel-5.8/code-metrics.txt b/src/mlnx-ofa_kernel-5.8/code-metrics.txt new file mode 100644 index 0000000..2a002de --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/code-metrics.txt @@ -0,0 +1,9 @@ + +compat-rdma code metrics + + 376034 - Total upstream lines of code being pulled + +Base tree: linux.git +Base tree version: rc4 +compat.git: compat-mlnx_ofed +compat-rdma release: 1f3020dd8d17dirty diff --git a/src/mlnx-ofa_kernel-5.8/compat/Makefile.am b/src/mlnx-ofa_kernel-5.8/compat/Makefile.am new file mode 100644 index 0000000..e69de29 diff --git a/src/mlnx-ofa_kernel-5.8/compat/Makefile.real b/src/mlnx-ofa_kernel-5.8/compat/Makefile.real new file mode 100644 index 0000000..8d14771 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/Makefile.real @@ -0,0 +1,64 @@ +obj-m += mlx_compat.o +#compat-objs := + +obj-$(CONFIG_COMPAT_FIRMWARE_CLASS) += compat_firmware_class.o +obj-$(CONFIG_COMPAT_NET_SCH_CODEL) += sch_codel.o +obj-$(CONFIG_COMPAT_NET_SCH_FQ_CODEL) += sch_fq_codel.o + +mlx_compat-y += main.o + +mlx_compat-$(CONFIG_COMPAT_KFIFO) += kfifo.o + +mlx_compat-$(CONFIG_COMPAT_KERNEL_3_10) += compat-3.10.o +mlx_compat-$(CONFIG_COMPAT_KERNEL_3_11) += compat-3.11.o +mlx_compat-$(CONFIG_COMPAT_KERNEL_3_12) += compat-3.12.o +mlx_compat-$(CONFIG_COMPAT_KERNEL_3_13) += compat-3.13.o +mlx_compat-$(CONFIG_COMPAT_KERNEL_3_15) += compat-3.15.o +mlx_compat-$(CONFIG_COMPAT_KERNEL_3_16) += compat-3.16.o +mlx_compat-$(CONFIG_COMPAT_KERNEL_3_18) += compat-3.18.o + +mlx_compat-$(CONFIG_COMPAT_KERNEL_4_1) += compat-4.1.o + +mlx_compat-$(CONFIG_COMPAT_CORDIC) += cordic.o +mlx_compat-$(CONFIG_COMPAT_CRC8) += crc8.o +mlx_compat-$(CONFIG_COMPAT_FLOW_DISSECTOR) += flow_dissector.o +mlx_compat-y += string.o +mlx_compat-y += output_core.o +mlx_compat-y += idr.o +mlx_compat-y += interval_tree.o + +ifndef CONFIG_64BIT +ifndef CONFIG_GENERIC_ATOMIC64 + mlx_compat-y += compat_atomic.o +endif +endif + +mlx_compat-y += xz_crc32.o +mlx_compat-y += xz_dec_lzma2.o +mlx_compat-y += xz_dec_stream.o +mlx_compat-y += xz_dec_syms.o +mlx_compat-$(CONFIG_XZ_DEC_BCJ) += xz_dec_bcj.o + +mlx_compat-y += mm_util.o +mlx_compat-y += uuid.o +mlx_compat-y += rhashtable.o +mlx_compat-y += ../block/blk-mq-rdma.o +mlx_compat-y += exec.o +mlx_compat-y += pci.o +mlx_compat-y += syscall.o +mlx_compat-y += mmu_notifier.o +mlx_compat-y += xarray.o +mlx_compat-y += rdma_dim.o +mlx_compat-y += dim.o +mlx_compat-y += net_dim.o +mlx_compat-y += file.o +ifdef CONFIG_MLX5_CLS_ACT +mlx_compat-y += flow_offload.o +mlx_compat-y += cls_api.o +mlx_compat-$(CONFIG_COMPAT_CLS_FLOWER_4_18_MOD) += nf_flow_table_core.o nf_flow_table_offload.o +endif + +mlx_compat-y += bitmap.o +ifdef CONFIG_MLX5_EN_MACSEC +mlx_compat-y += macsec.o +endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/autogen.sh b/src/mlnx-ofa_kernel-5.8/compat/autogen.sh new file mode 100755 index 
0000000..b119ea2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/autogen.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +run_cmd() +{ + cmd="$@" + echo -n "Running $cmd" + eval $cmd + res=$? + if [ $res -ne 0 ]; then + echo " failed: $res" + echo "Aborting" + exit 1 + fi + echo +} + +run_cmd "aclocal -I $PWD/config $ACLOCAL_FLAGS" +run_cmd "autoheader" +run_cmd "automake -a -c --force-missing" +run_cmd autoconf diff --git a/src/mlnx-ofa_kernel-5.8/compat/bitmap.c b/src/mlnx-ofa_kernel-5.8/compat/bitmap.c new file mode 100644 index 0000000..a6cc838 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/bitmap.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef HAVE_BITMAP_KZALLOC +unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags) +{ + return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), + flags); +} +EXPORT_SYMBOL(bitmap_alloc); + +unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags) +{ + return bitmap_alloc(nbits, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(bitmap_zalloc); +#endif + +#ifndef HAVE_BITMAP_FREE +void bitmap_free(const unsigned long *bitmap) +{ + kfree(bitmap); +} +EXPORT_SYMBOL(bitmap_free); +#endif + +#ifndef HAVE_BITMAP_FROM_ARR32 +#if BITS_PER_LONG == 64 +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, + unsigned int nbits) +{ + unsigned int i, halfwords; + + halfwords = DIV_ROUND_UP(nbits, 32); + for (i = 0; i < halfwords; i++) { + bitmap[i/2] = (unsigned long) buf[i]; + if (++i < halfwords) + bitmap[i/2] |= ((unsigned long) buf[i]) << 32; + } + + /* Clear tail bits in last word beyond nbits. */ + if (nbits % BITS_PER_LONG) + bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); +} +EXPORT_SYMBOL(bitmap_from_arr32); +#endif +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/build/Makefile b/src/mlnx-ofa_kernel-5.8/compat/build/Makefile new file mode 100644 index 0000000..656dd95 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/build/Makefile @@ -0,0 +1,48 @@ +# +# There are four ways this Makefile can be called: +# +# +# 1. As a subdirectory from the toplevel, for automake +# +# 2. A wrapper around the kernel's makefile when building modules, to +# possibly override the .config file +# +# 3. At configure time, as the toplevel module dir for building +# kernel tests +# +# 4. At configure time, to determine the kernel's idea of $(ARCH) +# + +ifeq ($(PATCHLEVEL),) + +ifeq ($(MLNX_LINUX_CONFIG),) + +# case #1 +include autoMakefile + +else + +# case #2 +# Note that this comes from make -C $LINUX -f $MLNX/build/Makefile +# so "include Makefile" below includes $LINUX/Makefile, not this file +include $(MLNX_LINUX_CONFIG) +include Makefile + +endif # MLNX_LINUX_CONFIG + +else # PATCHLEVEL + +# case 3 + +ifneq ($(MLNX_KERNEL_TEST),) +extra-y = $(MLNX_KERNEL_TEST) +endif + +obj-m := conftest.o + +endif # PATCHLEVEL + +# case 4 + +echoarch: + echo $(ARCH) >$(ARCHFILE) diff --git a/src/mlnx-ofa_kernel-5.8/compat/cls_api.c b/src/mlnx-ofa_kernel-5.8/compat/cls_api.c new file mode 100644 index 0000000..11e2ae1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/cls_api.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/sched/cls_api.c Packet classifier API. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * + * Eduardo J. 
Blanco :990222: kmod support + */ + +#ifndef HAVE_TC_SETUP_FLOW_ACTION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_NET_PSAMPLE_H +#include +#endif +#ifdef HAVE_IS_TCF_POLICE +#include +#endif +#include + + +#if !defined(HAVE_IS_TCF_TUNNEL) && defined(HAVE_TCF_TUNNEL_INFO) + +struct tcf_tunnel_key { + struct tcf_common common; + int tcft_action; + struct ip_tunnel_info ti; + }; + +#define to_tunnel_key(pc) \ + container_of(pc, struct tcf_tunnel_key, common) + +static void parse_tunnel(const struct tc_action *act, + struct flow_action_entry *entry) +{ + struct tcf_tunnel_key *t; + + t = to_tunnel_key(act->priv); + + entry->tunnel = &t->ti; +} +#endif + +int tc_setup_flow_action(struct flow_action *flow_action, + const struct tcf_exts *exts) +{ + const struct tc_action *act; + int i, j; + + if (!exts) + return 0; + + j = 0; + tcf_exts_for_each_action(i, act, exts) { + struct flow_action_entry *entry; + + entry = &flow_action->entries[j]; + if (is_tcf_gact_ok(act)) { + entry->id = FLOW_ACTION_ACCEPT; +#ifdef HAVE_IS_TCF_GACT_SHOT + } else if (is_tcf_gact_shot(act)) { + entry->id = FLOW_ACTION_DROP; +#endif +#ifdef HAVE_IS_TCF_GACT_GOTO_CHAIN + } else if (is_tcf_gact_trap(act)) { + entry->id = FLOW_ACTION_TRAP; + } else if (is_tcf_gact_goto_chain(act)) { + entry->id = FLOW_ACTION_GOTO; + entry->chain_index = tcf_gact_goto_chain_index(act); +#endif + } else if (is_tcf_mirred_egress_redirect(act)) { + entry->id = FLOW_ACTION_REDIRECT; + entry->dev = tcf_mirred_dev(act); + } else if (is_tcf_mirred_egress_mirror(act)) { + entry->id = FLOW_ACTION_MIRRED; + entry->dev = tcf_mirred_dev(act); +#ifdef HAVE_IS_TCF_VLAN + } else if (is_tcf_vlan(act)) { + switch (tcf_vlan_action(act)) { + case TCA_VLAN_ACT_PUSH: + entry->id = FLOW_ACTION_VLAN_PUSH; + entry->vlan.vid = tcf_vlan_push_vid(act); + entry->vlan.proto = tcf_vlan_push_proto(act); + entry->vlan.prio = tcf_vlan_push_prio(act); + break; + case TCA_VLAN_ACT_POP: + entry->id = FLOW_ACTION_VLAN_POP; + break; +#ifdef HAVE_TCA_VLAN_ACT_MODIFY + case TCA_VLAN_ACT_MODIFY: + entry->id = FLOW_ACTION_VLAN_MANGLE; + entry->vlan.vid = tcf_vlan_push_vid(act); + entry->vlan.proto = tcf_vlan_push_proto(act); + entry->vlan.prio = tcf_vlan_push_prio(act); + break; +#endif /* HAVE_TCA_VLAN_ACT_MODIFY */ + default: + goto err_out; + } +#endif /* HAVE_IS_TCF_VLAN */ +#ifdef HAVE_IS_TCF_TUNNEL + } else if (is_tcf_tunnel_set(act)) { + entry->id = FLOW_ACTION_TUNNEL_ENCAP; + entry->tunnel = tcf_tunnel_info(act); + } else if (is_tcf_tunnel_release(act)) { + entry->id = FLOW_ACTION_TUNNEL_DECAP; +#elif defined(HAVE_TCF_TUNNEL_INFO) + } else if (is_tcf_tunnel_set(act)) { + entry->id = FLOW_ACTION_TUNNEL_ENCAP; + parse_tunnel(act, entry); + } else if (is_tcf_tunnel_release(act)) { + entry->id = FLOW_ACTION_TUNNEL_DECAP; +#endif +#ifdef HAVE_TCF_PEDIT_TCFP_KEYS_EX + } else if (is_tcf_pedit(act)) { + int k; + for (k = 0; k < tcf_pedit_nkeys(act); k++) { + switch (tcf_pedit_cmd(act, k)) { + case TCA_PEDIT_KEY_EX_CMD_SET: + entry->id = FLOW_ACTION_MANGLE; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + entry->id = FLOW_ACTION_ADD; + break; + default: + goto err_out; + } + entry->mangle.htype = tcf_pedit_htype(act, k); + entry->mangle.mask = tcf_pedit_mask(act, k); + entry->mangle.val = tcf_pedit_val(act, k); + entry->mangle.offset = tcf_pedit_offset(act, k); + entry = &flow_action->entries[++j]; + } +#endif + } else if (is_tcf_csum(act)) { + entry->id = FLOW_ACTION_CSUM; + entry->csum_flags = 
tcf_csum_update_flags(act); +#ifdef HAVE_IS_TCF_SKBEDIT_MARK + } else if (is_tcf_skbedit_mark(act)) { + entry->id = FLOW_ACTION_MARK; + entry->mark = tcf_skbedit_mark(act); +#endif +#ifdef HAVE_IS_TCF_POLICE + } else if (is_tcf_police(act)) { + entry->id = FLOW_ACTION_POLICE; + entry->police.burst = tcf_police_tcfp_burst(act); + entry->police.rate_bytes_ps = + tcf_police_rate_bytes_ps(act); +#endif +#ifdef CONFIG_COMPAT_ACT_CT + } else if (is_tcf_ct(act)) { + entry->id = FLOW_ACTION_CT; + entry->ct.action = tcf_ct_action(act); + entry->ct.zone = tcf_ct_zone(act); + entry->ct.flow_table = tcf_ct_ft(act); +#endif +#ifdef HAVE_NET_PSAMPLE_H + } else if (is_tcf_sample(act)) { + entry->id = FLOW_ACTION_SAMPLE; + entry->sample.trunc_size = tcf_sample_trunc_size(act); + entry->sample.truncate = tcf_sample_truncate(act); + entry->sample.rate = tcf_sample_rate(act); + entry->sample.psample_group = + tcf_sample_psample_group(act); +#endif + } else { + goto err_out; + } + +#ifdef HAVE_TCF_PEDIT_TCFP_KEYS_EX + if (!is_tcf_pedit(act)) + j++; +#endif + } + return 0; +err_out: + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(tc_setup_flow_action); + +#endif + + +#ifndef HAVE_TCF_EXTS_NUM_ACTIONS +#include +#include +unsigned int tcf_exts_num_actions(struct tcf_exts *exts) +{ + unsigned int num_acts = 0; +#ifdef HAVE_TCF_PEDIT_TCFP_KEYS_EX + struct tc_action *act; + int i; + + tcf_exts_for_each_action(i, act, exts) { + if (is_tcf_pedit(act)) + num_acts += tcf_pedit_nkeys(act); + else + num_acts++; + } +#endif + return num_acts; +} +EXPORT_SYMBOL(tcf_exts_num_actions); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat-3.11.c b/src/mlnx-ofa_kernel-5.8/compat/compat-3.11.c new file mode 100644 index 0000000..5bd79f3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat-3.11.c @@ -0,0 +1,369 @@ +#include +#include +#include + +#define sg_mapping_iter LINUX_BACKPORT(sg_mapping_iter) +#define sg_miter_start LINUX_BACKPORT(sg_miter_start) +#define sg_miter_skip LINUX_BACKPORT(sg_miter_skip) +#define sg_miter_next LINUX_BACKPORT(sg_miter_next) +#define sg_miter_stop LINUX_BACKPORT(sg_miter_stop) + +/* + * Mapping sg iterator + * + * Iterates over sg entries mapping page-by-page. On each successful + * iteration, @miter->page points to the mapped page and + * @miter->length bytes of data can be accessed at @miter->addr. As + * long as an interation is enclosed between start and stop, the user + * is free to choose control structure and when to stop. + * + * @miter->consumed is set to @miter->length on each iteration. It + * can be adjusted if the user can't consume all the bytes in one go. + * Also, a stopped iteration can be resumed by calling next on it. + * This is useful when iteration needs to release all resources and + * continue later (e.g. at the next interrupt). 
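+ *
+ * Illustrative sketch only: a typical consumer brackets the iteration
+ * with start/stop as below; "sgl", "nents" and consume_chunk() are
+ * hypothetical, and with SG_MITER_ATOMIC the loop body must not sleep.
+ *
+ *	struct sg_mapping_iter miter;
+ *
+ *	sg_miter_start(&miter, sgl, nents,
+ *		       SG_MITER_ATOMIC | SG_MITER_FROM_SG);
+ *	while (sg_miter_next(&miter))
+ *		consume_chunk(miter.addr, miter.length);
+ *	sg_miter_stop(&miter);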
+ */ + +#define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ +#define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap*/ +#define SG_MITER_FROM_SG (1 << 2) /* nop */ + +struct sg_mapping_iter { + /* the following three fields can be accessed directly */ + struct page *page; /* currently mapped page */ + void *addr; /* pointer to the mapped area */ + size_t length; /* length of the mapped area */ + size_t consumed; /* number of consumed bytes */ + struct sg_page_iter piter; /* page iterator */ + + /* these are internal states, keep away */ + unsigned int __offset; /* offset within page */ + unsigned int __remaining; /* remaining bytes on page */ + unsigned int __flags; +}; + +void sg_miter_stop(struct sg_mapping_iter *miter); + +/** + * sg_miter_start - start mapping iteration over a sg list + * @miter: sg mapping iter to be started + * @sgl: sg list to iterate over + * @nents: number of sg entries + * + * Description: + * Starts mapping iterator @miter. + * + * Context: + * Don't care. + */ +void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, + unsigned int nents, unsigned int flags) +{ + memset(miter, 0, sizeof(struct sg_mapping_iter)); + + __sg_page_iter_start(&miter->piter, sgl, nents, 0); + WARN_ON(!(flags & (SG_MITER_TO_SG | SG_MITER_FROM_SG))); + miter->__flags = flags; +} +/* EXPORT_SYMBOL(sg_miter_start); */ + +static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) +{ + if (!miter->__remaining) { + struct scatterlist *sg; + unsigned long pgoffset; + + if (!__sg_page_iter_next(&miter->piter)) + return false; + + sg = miter->piter.sg; + pgoffset = miter->piter.sg_pgoffset; + + miter->__offset = pgoffset ? 0 : sg->offset; + miter->__remaining = sg->offset + sg->length - + (pgoffset << PAGE_SHIFT) - miter->__offset; + miter->__remaining = min_t(unsigned long, miter->__remaining, + PAGE_SIZE - miter->__offset); + } + + return true; +} + +/** + * sg_miter_skip - reposition mapping iterator + * @miter: sg mapping iter to be skipped + * @offset: number of bytes to plus the current location + * + * Description: + * Sets the offset of @miter to its current location plus @offset bytes. + * If mapping iterator @miter has been proceeded by sg_miter_next(), this + * stops @miter. + * + * Context: + * Don't care if @miter is stopped, or not proceeded yet. + * Otherwise, preemption disabled if the SG_MITER_ATOMIC is set. + * + * Returns: + * true if @miter contains the valid mapping. false if end of sg + * list is reached. + */ +bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset) +{ + sg_miter_stop(miter); + + while (offset) { + off_t consumed; + + if (!sg_miter_get_next_page(miter)) + return false; + + consumed = min_t(off_t, offset, miter->__remaining); + miter->__offset += consumed; + miter->__remaining -= consumed; + offset -= consumed; + } + + return true; +} +/* EXPORT_SYMBOL(sg_miter_skip); */ + +/** + * sg_miter_next - proceed mapping iterator to the next mapping + * @miter: sg mapping iter to proceed + * + * Description: + * Proceeds @miter to the next mapping. @miter should have been started + * using sg_miter_start(). On successful return, @miter->page, + * @miter->addr and @miter->length point to the current mapping. + * + * Context: + * Preemption disabled if SG_MITER_ATOMIC. Preemption must stay disabled + * till @miter is stopped. May sleep if !SG_MITER_ATOMIC. + * + * Returns: + * true if @miter contains the next mapping. false if end of sg + * list is reached. 
+ */ +bool sg_miter_next(struct sg_mapping_iter *miter) +{ + sg_miter_stop(miter); + + /* + * Get to the next page if necessary. + * __remaining, __offset is adjusted by sg_miter_stop + */ + if (!sg_miter_get_next_page(miter)) + return false; + + miter->page = sg_page_iter_page(&miter->piter); + miter->consumed = miter->length = miter->__remaining; + + if (miter->__flags & SG_MITER_ATOMIC) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 4, 0)) + miter->addr = kmap_atomic(miter->page, 0) + miter->__offset; +#else + miter->addr = kmap_atomic(miter->page) + miter->__offset; +#endif + else + miter->addr = kmap(miter->page) + miter->__offset; + + return true; +} +/* EXPORT_SYMBOL(sg_miter_next); */ + +/** + * sg_miter_stop - stop mapping iteration + * @miter: sg mapping iter to be stopped + * + * Description: + * Stops mapping iterator @miter. @miter should have been started + * started using sg_miter_start(). A stopped iteration can be + * resumed by calling sg_miter_next() on it. This is useful when + * resources (kmap) need to be released during iteration. + * + * Context: + * Preemption disabled if the SG_MITER_ATOMIC is set. Don't care + * otherwise. + */ +void sg_miter_stop(struct sg_mapping_iter *miter) +{ + WARN_ON(miter->consumed > miter->length); + + /* drop resources from the last iteration */ + if (miter->addr) { + miter->__offset += miter->consumed; + miter->__remaining -= miter->consumed; + + if ((miter->__flags & SG_MITER_TO_SG) && + !PageSlab(miter->page)) + flush_kernel_dcache_page(miter->page); + + if (miter->__flags & SG_MITER_ATOMIC) { + WARN_ON_ONCE(preemptible()); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 4, 0)) + kunmap_atomic(miter->addr, 0); +#else + kunmap_atomic(miter->addr); +#endif + } else + kunmap(miter->page); + + miter->page = NULL; + miter->addr = NULL; + miter->length = 0; + miter->consumed = 0; + } +} +/* EXPORT_SYMBOL(sg_miter_stop); */ + +/** + * sg_copy_buffer - Copy data between a linear buffer and an SG list + * @sgl: The SG list + * @nents: Number of SG entries + * @buf: Where to copy from + * @buflen: The number of bytes to copy + * @skip: Number of bytes to skip before copying + * @to_buffer: transfer direction (true == from an sg list to a + * buffer, false == from a buffer to an sg list + * + * Returns the number of copied bytes. + * + **/ +#define sg_copy_buffer LINUX_BACKPORT(sg_copy_buffer) +static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, + void *buf, size_t buflen, off_t skip, + bool to_buffer) +{ + unsigned int offset = 0; + struct sg_mapping_iter miter; + unsigned long flags; + unsigned int sg_flags = SG_MITER_ATOMIC; + + if (to_buffer) + sg_flags |= SG_MITER_FROM_SG; + else + sg_flags |= SG_MITER_TO_SG; + + sg_miter_start(&miter, sgl, nents, sg_flags); + + if (!sg_miter_skip(&miter, skip)) + return false; + + local_irq_save(flags); + + while (sg_miter_next(&miter) && offset < buflen) { + unsigned int len; + + len = min(miter.length, buflen - offset); + + if (to_buffer) + memcpy(buf + offset, miter.addr, len); + else + memcpy(miter.addr, buf + offset, len); + + offset += len; + } + + sg_miter_stop(&miter); + + local_irq_restore(flags); + return offset; +} + +/** + * sg_copy_from_buffer - Copy from a linear buffer to an SG list + * @sgl: The SG list + * @nents: Number of SG entries + * @buf: Where to copy from + * @buflen: The number of bytes to copy + * + * Returns the number of copied bytes. 
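+ *
+ * Illustrative sketch only: "sgl" and "nents" are assumed to describe an
+ * already initialised SG list being filled from a small on-stack header.
+ *
+ *	u8 hdr[16];
+ *
+ *	if (sg_copy_from_buffer(sgl, nents, hdr, sizeof(hdr)) != sizeof(hdr))
+ *		return -EIO;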
+ * + **/ +#define sg_copy_from_buffer LINUX_BACKPORT(sg_copy_from_buffer) +size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, + void *buf, size_t buflen) +{ + return sg_copy_buffer(sgl, nents, buf, buflen, 0, false); +} +EXPORT_SYMBOL(sg_copy_from_buffer); + +/** + * sg_copy_to_buffer - Copy from an SG list to a linear buffer + * @sgl: The SG list + * @nents: Number of SG entries + * @buf: Where to copy to + * @buflen: The number of bytes to copy + * + * Returns the number of copied bytes. + * + **/ +#define sg_copy_to_buffer LINUX_BACKPORT(sg_copy_to_buffer) +size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, + void *buf, size_t buflen) +{ + return sg_copy_buffer(sgl, nents, buf, buflen, 0, true); +} +EXPORT_SYMBOL(sg_copy_to_buffer); + +/** + * sg_pcopy_from_buffer - Copy from a linear buffer to an SG list + * @sgl: The SG list + * @nents: Number of SG entries + * @buf: Where to copy from + * @skip: Number of bytes to skip before copying + * @buflen: The number of bytes to copy + * + * Returns the number of copied bytes. + * + **/ +#define sg_pcopy_from_buffer LINUX_BACKPORT(sg_pcopy_from_buffer) +size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, + void *buf, size_t buflen, off_t skip) +{ + return sg_copy_buffer(sgl, nents, buf, buflen, skip, false); +} +EXPORT_SYMBOL(sg_pcopy_from_buffer); + +/** + * sg_pcopy_to_buffer - Copy from an SG list to a linear buffer + * @sgl: The SG list + * @nents: Number of SG entries + * @buf: Where to copy to + * @skip: Number of bytes to skip before copying + * @buflen: The number of bytes to copy + * + * Returns the number of copied bytes. + * + **/ +#define sg_pcopy_to_buffer LINUX_BACKPORT(sg_pcopy_to_buffer) +size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, + void *buf, size_t buflen, off_t skip) +{ + return sg_copy_buffer(sgl, nents, buf, buflen, skip, true); +} +EXPORT_SYMBOL(sg_pcopy_to_buffer); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) +/** + * fixed_size_llseek - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @size: size of the file + * + **/ +#define fixed_size_llseek LINUX_BACKPORT(fixed_size_llseek) +loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, + loff_t size) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + size, size); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(fixed_size_llseek); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat-3.12.c b/src/mlnx-ofa_kernel-5.8/compat/compat-3.12.c new file mode 100644 index 0000000..0400a9d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat-3.12.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include + +#ifndef HAVE_UDP4_HWCSUM +/** + * udp4_hwcsum - handle outgoing HW checksumming + * @skb: sk_buff containing the filled-in UDP header + * (checksum field must be zeroed out) + * @src: source IP address + * @dst: destination IP address + */ +#define udp4_hwcsum LINUX_BACKPORT(udp4_hwcsum) +void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) +{ + struct udphdr *uh = udp_hdr(skb); + int offset = skb_transport_offset(skb); + int len = skb->len - offset; + int hlen = len; + __wsum csum = 0; + + if (!skb_has_frag_list(skb)) { + /* + * Only one fragment on the socket. 
+ */ + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~csum_tcpudp_magic(src, dst, len, + IPPROTO_UDP, 0); + } else { + struct sk_buff *frags; + + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together + */ + skb_walk_frags(skb, frags) { + csum = csum_add(csum, frags->csum); + hlen -= frags->len; + } + + csum = skb_checksum(skb, offset, hlen, csum); + skb->ip_summed = CHECKSUM_NONE; + + uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} +EXPORT_SYMBOL_GPL(udp4_hwcsum); +#endif + +#define debugfs_create_atomic_t LINUX_BACKPORT(debugfs_create_atomic_t) + +static int debugfs_atomic_t_set(void *data, u64 val) +{ + atomic_set((atomic_t *)data, val); + return 0; +} +static int debugfs_atomic_t_get(void *data, u64 *val) +{ + *val = atomic_read((atomic_t *)data); + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, + debugfs_atomic_t_set, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); + +/** + * debugfs_create_atomic_t - create a debugfs file that is used to read and + * write an atomic_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. 
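+ *
+ * Illustrative sketch only (the counter and the debugfs directory are
+ * hypothetical); with no write bits in @mode the read-only fops are used.
+ *
+ *	static atomic_t my_drops = ATOMIC_INIT(0);
+ *
+ *	debugfs_create_atomic_t("drops", 0444, my_debugfs_dir, &my_drops);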
+ */ +struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, + struct dentry *parent, atomic_t *value) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, + &fops_atomic_t_wo); + + return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat-3.13.c b/src/mlnx-ofa_kernel-5.8/compat/compat-3.13.c new file mode 100644 index 0000000..517badd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat-3.13.c @@ -0,0 +1,45 @@ +#include +#include + +/** + * pcie_get_mps - get PCI Express maximum payload size + * @dev: PCI device to query + * + * Returns maximum payload size in bytes + */ +#define pcie_get_mps LINUX_BACKPORT(pcie_get_mps) +int pcie_get_mps(struct pci_dev *dev) +{ + u16 ctl; + + pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &ctl); + + return 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5); +} +EXPORT_SYMBOL(pcie_get_mps); + +/** + * pcie_set_mps - set PCI Express maximum payload size + * @dev: PCI device to query + * @mps: maximum payload size in bytes + * valid values are 128, 256, 512, 1024, 2048, 4096 + * + * If possible sets maximum payload size + */ +#define pcie_set_mps LINUX_BACKPORT(pcie_set_mps) +int pcie_set_mps(struct pci_dev *dev, int mps) +{ + u16 v; + + if (mps < 128 || mps > 4096 || !is_power_of_2(mps)) + return -EINVAL; + + v = ffs(mps) - 8; + if (v > dev->pcie_mpss) + return -EINVAL; + v <<= 5; + + return pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_PAYLOAD, v); +} +EXPORT_SYMBOL(pcie_set_mps); diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat-3.15.c b/src/mlnx-ofa_kernel-5.8/compat/compat-3.15.c new file mode 100644 index 0000000..eea2733 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat-3.15.c @@ -0,0 +1,13 @@ +#include +#include +#include + +#define kvfree LINUX_BACKPORT(kvfree) +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat-3.16.c b/src/mlnx-ofa_kernel-5.8/compat/compat-3.16.c new file mode 100644 index 0000000..b57c041 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat-3.16.c @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include +#include + +/** + * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first + * + * @i: index number + * @numa_node: local numa_node + * @dstp: cpumask with the relevant cpu bit set according to the policy + * + * This function sets the cpumask according to a numa aware policy. + * cpumask could be used as an affinity hint for the IRQ related to a + * queue. When the policy is to spread queues across cores - local cores + * first. + * + * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set + * the cpu bit and need to re-call the function. 
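+ *
+ * Illustrative sketch only (the per-queue ring[] array and its fields are
+ * hypothetical): each queue keeps a persistent affinity_mask, since
+ * irq_set_affinity_hint() retains the pointer it is given.
+ *
+ *	for (i = 0; i < num_queues; i++) {
+ *		cpumask_t *mask = &ring[i].affinity_mask;
+ *
+ *		cpumask_clear(mask);
+ *		if (!cpumask_set_cpu_local_first(i, numa_node, mask))
+ *			irq_set_affinity_hint(ring[i].irq, mask);
+ *	}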
+ */ +#define cpumask_set_cpu_local_first LINUX_BACKPORT(cpumask_set_cpu_local_first) +int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +{ + cpumask_var_t mask; + int cpu; + int ret = 0; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + i %= num_online_cpus(); + + if (numa_node == -1 || !cpumask_of_node(numa_node)) { + /* Use all online cpu's for non numa aware system */ + cpumask_copy(mask, cpu_online_mask); + } else { + int n; + + cpumask_and(mask, + cpumask_of_node(numa_node), cpu_online_mask); + + n = cpumask_weight(mask); + if (i >= n) { + i -= n; + + /* If index > number of local cpu's, mask out local + * cpu's + */ + cpumask_andnot(mask, cpu_online_mask, mask); + } + } + + for_each_cpu(cpu, mask) { + if (--i < 0) + goto out; + } + + ret = -EAGAIN; + +out: + free_cpumask_var(mask); + + if (!ret) + cpumask_set_cpu(cpu, dstp); + + return ret; +} +EXPORT_SYMBOL(cpumask_set_cpu_local_first); diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat-3.18.c b/src/mlnx-ofa_kernel-5.8/compat/compat-3.18.c new file mode 100644 index 0000000..105c299 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat-3.18.c @@ -0,0 +1,13 @@ +#include +#include +#include + +#ifndef HAVE_ELFCOREHDR_ADDR_EXPORTED +#ifndef ELFCORE_ADDR_MAX +#define ELFCORE_ADDR_MAX (-1ULL) +#endif + +#define elfcorehdr_addr LINUX_BACKPORT(elfcorehdr_addr) +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +EXPORT_SYMBOL_GPL(elfcorehdr_addr); +#endif /* HAVE_ELFCOREHDR_ADDR_EXPORTED */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat-4.1.c b/src/mlnx-ofa_kernel-5.8/compat/compat-4.1.c new file mode 100644 index 0000000..936cb0c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat-4.1.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include +#include + +/** + * cpumask_local_spread - select the i'th cpu with local numa cpu's first + * @i: index number + * @node: local numa_node + * + * This function selects an online CPU according to a numa aware policy; + * local cpus are returned first, followed by non-local ones, then it + * wraps around. + * + * It's not very efficient, but useful for setup. + */ +#define cpumask_local_spread LINUX_BACKPORT(cpumask_local_spread) +unsigned int cpumask_local_spread(unsigned int i, int node) +{ + int cpu; + + /* Wrap: we always want a cpu. */ + i %= num_online_cpus(); + + if (node == -1) { + for_each_cpu(cpu, cpu_online_mask) + if (i-- == 0) + return cpu; + } else { + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask) + if (i-- == 0) + return cpu; + + for_each_cpu(cpu, cpu_online_mask) { + /* Skip NUMA nodes, done above. 
*/ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; + } + } + BUG(); +} +EXPORT_SYMBOL(cpumask_local_spread); diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat_atomic.c b/src/mlnx-ofa_kernel-5.8/compat/compat_atomic.c new file mode 100644 index 0000000..7859147 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat_atomic.c @@ -0,0 +1,35 @@ +#include +#include + +#if !((LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31)) && (defined(CONFIG_UML) || defined(CONFIG_X86))) && !((LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33)) && (defined(CONFIG_ARM) || defined(CONFIG_ARM64)) && !defined(CONFIG_GENERIC_ATOMIC64)) + +static DEFINE_SPINLOCK(lock); + +#define atomic64_read LINUX_BACKPORT(atomic64_read) +long long atomic64_read(const atomic64_t *v) +{ + unsigned long flags; + long long val; + + spin_lock_irqsave(&lock, flags); + val = v->counter; + spin_unlock_irqrestore(&lock, flags); + return val; +} +EXPORT_SYMBOL_GPL(atomic64_read); + +#define atomic64_add_return LINUX_BACKPORT(atomic64_add_return) +long long atomic64_add_return(long long a, atomic64_t *v) +{ + unsigned long flags; + long long val; + + spin_lock_irqsave(&lock, flags); + val = v->counter += a; + spin_unlock_irqrestore(&lock, flags); + return val; +} +EXPORT_SYMBOL_GPL(atomic64_add_return); + +#endif + diff --git a/src/mlnx-ofa_kernel-5.8/compat/compat_firmware_class.c b/src/mlnx-ofa_kernel-5.8/compat/compat_firmware_class.c new file mode 100644 index 0000000..32cd96b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/compat_firmware_class.c @@ -0,0 +1,764 @@ +/* + * firmware_class.c - Multi purpose firmware loading support + * + * Copyright (c) 2003 Manuel Estrada Sainz + * + * Please see Documentation/firmware_class/ for more information. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define backport_firmware_to_dev(obj) container_of(obj, struct device, kobj) + +MODULE_AUTHOR("Manuel Estrada Sainz"); +MODULE_DESCRIPTION("Multi purpose firmware loading support"); +MODULE_LICENSE("GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif + +/* Builtin firmware support */ + +//#ifdef CONFIG_FW_LOADER +#if 0 + +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; + +static bool fw_get_builtin_firmware(struct firmware *fw, const char *name) +{ + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { + if (strcmp(name, b_fw->name) == 0) { + fw->size = b_fw->size; + fw->data = b_fw->data; + return true; + } + } + + return false; +} + +static bool fw_is_builtin_firmware(const struct firmware *fw) +{ + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) + if (fw->data == b_fw->data) + return true; + + return false; +} + +#else /* Module case - no builtin firmware support */ + +static inline bool fw_get_builtin_firmware(struct firmware *fw, const char *name) +{ + return false; +} + +static inline bool fw_is_builtin_firmware(const struct firmware *fw) +{ + return false; +} +#endif + +enum { + FW_STATUS_LOADING, + FW_STATUS_DONE, + FW_STATUS_ABORT, +}; + +static int loading_timeout = 60; /* In seconds */ + +/* fw_lock could be moved to 'struct firmware_priv' but since it is just + * guarding for corner cases a global lock should be OK */ +static DEFINE_MUTEX(fw_lock); + +struct firmware_priv { + struct completion completion; + struct firmware *fw; + unsigned long 
status; + struct page **pages; + int nr_pages; + int page_array_size; + struct timer_list timeout; + struct device dev; + bool nowait; + char fw_id[]; +}; + +static struct firmware_priv *to_firmware_priv(struct device *dev) +{ + return container_of(dev, struct firmware_priv, dev); +} + +static void fw_load_abort(struct firmware_priv *fw_priv) +{ + set_bit(FW_STATUS_ABORT, &fw_priv->status); + wmb(); + complete(&fw_priv->completion); +} + +static ssize_t firmware_timeout_show(struct class *class, + char *buf) +{ + return sprintf(buf, "%d\n", loading_timeout); +} + +/** + * firmware_timeout_store - set number of seconds to wait for firmware + * @class: device class pointer + * @buf: buffer to scan for timeout value + * @count: number of bytes in @buf + * + * Sets the number of seconds to wait for the firmware. Once + * this expires an error will be returned to the driver and no + * firmware will be provided. + * + * Note: zero means 'wait forever'. + **/ +static ssize_t firmware_timeout_store(struct class *class, + const char *buf, size_t count) +{ + loading_timeout = simple_strtol(buf, NULL, 10); + if (loading_timeout < 0) + loading_timeout = 0; + + return count; +} + +static struct class_attribute firmware_class_attrs[] = { + __ATTR(timeout, S_IWUSR | S_IRUGO, + firmware_timeout_show, firmware_timeout_store), + __ATTR_NULL +}; + +static void fw_dev_release(struct device *dev) +{ + struct firmware_priv *fw_priv = to_firmware_priv(dev); + int i; + + for (i = 0; i < fw_priv->nr_pages; i++) + __free_page(fw_priv->pages[i]); + kfree(fw_priv->pages); + kfree(fw_priv); + + module_put(THIS_MODULE); +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)) +static int firmware_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct firmware_priv *fw_priv = to_firmware_priv(dev); + + if (add_uevent_var(env, "FIRMWARE=%s", fw_priv->fw_id)) + return -ENOMEM; + if (add_uevent_var(env, "TIMEOUT=%i", loading_timeout)) + return -ENOMEM; + if (add_uevent_var(env, "ASYNC=%d", fw_priv->nowait)) + return -ENOMEM; + + return 0; +} +#else +static int firmware_uevent(struct device *dev, char **envp, + int num_envp, char *buf, int size) +{ + struct firmware_priv *fw_priv = to_firmware_priv(dev); + int error, len = 0, i = 0; + + error = add_uevent_var(envp, num_envp, &i, + buf, size, &len, + "FIRMWARE=%s", fw_priv->fw_id); + if (error) + goto exit; + + error = add_uevent_var(envp, num_envp, &i, + buf, size, &len, + "TIMEOUT=%i", loading_timeout); + if (error) + goto exit; + error = add_uevent_var(envp, num_envp, &i, + buf, size, &len, + "ASYNC=%i", fw_priv->nowait); + if (error) + goto exit; + + return 0; +exit: + envp[i] = NULL; + return error; +} +#endif + +static struct class firmware_class = { + .name = "compat_firmware", + .class_attrs = firmware_class_attrs, + .dev_uevent = firmware_uevent, + .dev_release = fw_dev_release, +}; + +static ssize_t firmware_loading_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct firmware_priv *fw_priv = to_firmware_priv(dev); + int loading = test_bit(FW_STATUS_LOADING, &fw_priv->status); + + return sprintf(buf, "%d\n", loading); +} + +static void firmware_free_data(const struct firmware *fw) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)) + int i; + vunmap(fw->data); + if (fw->pages) { + for (i = 0; i < PFN_UP(fw->size); i++) + __free_page(fw->pages[i]); + kfree(fw->pages); + } +#else + vunmap(fw->data); +#endif +} + +/* Some architectures don't have PAGE_KERNEL_RO */ +#ifndef PAGE_KERNEL_RO +#define PAGE_KERNEL_RO PAGE_KERNEL 
+#endif +/** + * firmware_loading_store - set value in the 'loading' control file + * @dev: device pointer + * @buf: buffer to scan for loading control value + * @count: number of bytes in @buf + * + * The relevant values are: + * + * 1: Start a load, discarding any previous partial load. + * 0: Conclude the load and hand the data to the driver code. + * -1: Conclude the load with an error and discard any written data. + **/ +static ssize_t firmware_loading_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct firmware_priv *fw_priv = to_firmware_priv(dev); + int loading = simple_strtol(buf, NULL, 10); + int i; + + switch (loading) { + case 1: + mutex_lock(&fw_lock); + if (!fw_priv->fw) { + mutex_unlock(&fw_lock); + break; + } + firmware_free_data(fw_priv->fw); + memset(fw_priv->fw, 0, sizeof(struct firmware)); + /* If the pages are not owned by 'struct firmware' */ + for (i = 0; i < fw_priv->nr_pages; i++) + __free_page(fw_priv->pages[i]); + kfree(fw_priv->pages); + fw_priv->pages = NULL; + fw_priv->page_array_size = 0; + fw_priv->nr_pages = 0; + set_bit(FW_STATUS_LOADING, &fw_priv->status); + mutex_unlock(&fw_lock); + break; + case 0: + if (test_bit(FW_STATUS_LOADING, &fw_priv->status)) { + vunmap(fw_priv->fw->data); + fw_priv->fw->data = vmap(fw_priv->pages, + fw_priv->nr_pages, + 0, PAGE_KERNEL_RO); + if (!fw_priv->fw->data) { + dev_err(dev, "%s: vmap() failed\n", __func__); + goto err; + } + /* Pages are now owned by 'struct firmware' */ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)) + fw_priv->fw->pages = fw_priv->pages; + fw_priv->pages = NULL; +#endif + + fw_priv->page_array_size = 0; + fw_priv->nr_pages = 0; + complete(&fw_priv->completion); + clear_bit(FW_STATUS_LOADING, &fw_priv->status); + break; + } + /* fallthrough */ + default: + dev_err(dev, "%s: unexpected value (%d)\n", __func__, loading); + /* fallthrough */ + case -1: + err: + fw_load_abort(fw_priv); + break; + } + + return count; +} + +static DEVICE_ATTR(loading, 0644, firmware_loading_show, firmware_loading_store); + +#if defined(CONFIG_COMPAT_FIRMWARE_DATA_RW_NEEDS_FILP) +static ssize_t firmware_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t offset, size_t count) +#else +static ssize_t firmware_data_read(struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t offset, size_t count) +#endif +{ + struct device *dev = backport_firmware_to_dev(kobj); + struct firmware_priv *fw_priv = to_firmware_priv(dev); + struct firmware *fw; + ssize_t ret_count; + + mutex_lock(&fw_lock); + fw = fw_priv->fw; + if (!fw || test_bit(FW_STATUS_DONE, &fw_priv->status)) { + ret_count = -ENODEV; + goto out; + } + if (offset > fw->size) { + ret_count = 0; + goto out; + } + if (count > fw->size - offset) + count = fw->size - offset; + + ret_count = count; + + while (count) { + void *page_data; + int page_nr = offset >> PAGE_SHIFT; + int page_ofs = offset & (PAGE_SIZE-1); + int page_cnt = min_t(size_t, PAGE_SIZE - page_ofs, count); + + page_data = kmap(fw_priv->pages[page_nr]); + + memcpy(buffer, page_data + page_ofs, page_cnt); + + kunmap(fw_priv->pages[page_nr]); + buffer += page_cnt; + offset += page_cnt; + count -= page_cnt; + } +out: + mutex_unlock(&fw_lock); + return ret_count; +} + +static int fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size) +{ + int pages_needed = ALIGN(min_size, PAGE_SIZE) >> PAGE_SHIFT; + + /* If the array of pages is too small, grow it... 
*/ + if (fw_priv->page_array_size < pages_needed) { + int new_array_size = max(pages_needed, + fw_priv->page_array_size * 2); + struct page **new_pages; + + new_pages = kmalloc(new_array_size * sizeof(void *), + GFP_KERNEL); + if (!new_pages) { + fw_load_abort(fw_priv); + return -ENOMEM; + } + memcpy(new_pages, fw_priv->pages, + fw_priv->page_array_size * sizeof(void *)); + memset(&new_pages[fw_priv->page_array_size], 0, sizeof(void *) * + (new_array_size - fw_priv->page_array_size)); + kfree(fw_priv->pages); + fw_priv->pages = new_pages; + fw_priv->page_array_size = new_array_size; + } + + while (fw_priv->nr_pages < pages_needed) { + fw_priv->pages[fw_priv->nr_pages] = + alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + + if (!fw_priv->pages[fw_priv->nr_pages]) { + fw_load_abort(fw_priv); + return -ENOMEM; + } + fw_priv->nr_pages++; + } + return 0; +} + +/** + * firmware_data_write - write method for firmware + * @kobj: kobject for the device + * @bin_attr: bin_attr structure + * @buffer: buffer being written + * @offset: buffer offset for write in total data store area + * @count: buffer size + * + * Data written to the 'data' attribute will be later handed to + * the driver as a firmware image. + **/ +#if defined(CONFIG_COMPAT_FIRMWARE_DATA_RW_NEEDS_FILP) +static ssize_t firmware_data_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t offset, size_t count) +#else +static ssize_t firmware_data_write(struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t offset, size_t count) +#endif +{ + struct device *dev = backport_firmware_to_dev(kobj); + struct firmware_priv *fw_priv = to_firmware_priv(dev); + struct firmware *fw; + ssize_t retval; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + mutex_lock(&fw_lock); + fw = fw_priv->fw; + if (!fw || test_bit(FW_STATUS_DONE, &fw_priv->status)) { + retval = -ENODEV; + goto out; + } + retval = fw_realloc_buffer(fw_priv, offset + count); + if (retval) + goto out; + + retval = count; + + while (count) { + void *page_data; + int page_nr = offset >> PAGE_SHIFT; + int page_ofs = offset & (PAGE_SIZE - 1); + int page_cnt = min_t(size_t, PAGE_SIZE - page_ofs, count); + + page_data = kmap(fw_priv->pages[page_nr]); + + memcpy(page_data + page_ofs, buffer, page_cnt); + + kunmap(fw_priv->pages[page_nr]); + buffer += page_cnt; + offset += page_cnt; + count -= page_cnt; + } + + fw->size = max_t(size_t, offset, fw->size); +out: + mutex_unlock(&fw_lock); + return retval; +} + +static struct bin_attribute firmware_attr_data = { + .attr = { .name = "data", .mode = 0644 }, + .size = 0, + .read = firmware_data_read, + .write = firmware_data_write, +}; + +static void firmware_class_timeout(u_long data) +{ + struct firmware_priv *fw_priv = (struct firmware_priv *) data; + + fw_load_abort(fw_priv); +} + +static struct firmware_priv * +fw_create_instance(struct firmware *firmware, const char *fw_name, + struct device *device, bool uevent, bool nowait) +{ + struct firmware_priv *fw_priv; + struct device *f_dev; + int error; + + fw_priv = kzalloc(sizeof(*fw_priv) + strlen(fw_name) + 1 , GFP_KERNEL); + if (!fw_priv) { + dev_err(device, "%s: kmalloc failed\n", __func__); + error = -ENOMEM; + goto err_out; + } + + fw_priv->fw = firmware; + fw_priv->nowait = nowait; + strcpy(fw_priv->fw_id, fw_name); + init_completion(&fw_priv->completion); + setup_timer(&fw_priv->timeout, + firmware_class_timeout, (u_long) fw_priv); + + f_dev = &fw_priv->dev; + + device_initialize(f_dev); + dev_set_name(f_dev, "%s", 
dev_name(device)); + f_dev->parent = device; + f_dev->class = &firmware_class; + + dev_set_uevent_suppress(f_dev, true); + + /* Need to pin this module until class device is destroyed */ + __module_get(THIS_MODULE); + + error = device_add(f_dev); + if (error) { + dev_err(device, "%s: device_register failed\n", __func__); + goto err_put_dev; + } + + error = device_create_bin_file(f_dev, &firmware_attr_data); + if (error) { + dev_err(device, "%s: sysfs_create_bin_file failed\n", __func__); + goto err_del_dev; + } + + error = device_create_file(f_dev, &dev_attr_loading); + if (error) { + dev_err(device, "%s: device_create_file failed\n", __func__); + goto err_del_bin_attr; + } + + if (uevent) + dev_set_uevent_suppress(f_dev, false); + + return fw_priv; + +err_del_bin_attr: + device_remove_bin_file(f_dev, &firmware_attr_data); +err_del_dev: + device_del(f_dev); +err_put_dev: + put_device(f_dev); +err_out: + return ERR_PTR(error); +} + +static void fw_destroy_instance(struct firmware_priv *fw_priv) +{ + struct device *f_dev = &fw_priv->dev; + + device_remove_file(f_dev, &dev_attr_loading); + device_remove_bin_file(f_dev, &firmware_attr_data); + device_unregister(f_dev); +} + +static int _request_firmware(const struct firmware **firmware_p, + const char *name, struct device *device, + bool uevent, bool nowait) +{ + struct firmware_priv *fw_priv; + struct firmware *firmware; + int retval = 0; + + if (!firmware_p) + return -EINVAL; + + *firmware_p = firmware = kzalloc(sizeof(*firmware), GFP_KERNEL); + if (!firmware) { + dev_err(device, "%s: kmalloc(struct firmware) failed\n", + __func__); + retval = -ENOMEM; + goto out; + } + + if (fw_get_builtin_firmware(firmware, name)) { + dev_dbg(device, "firmware: using built-in firmware %s\n", name); + return 0; + } + + if (uevent) + dev_dbg(device, "firmware: requesting %s\n", name); + + fw_priv = fw_create_instance(firmware, name, device, uevent, nowait); + if (IS_ERR(fw_priv)) { + retval = PTR_ERR(fw_priv); + goto out; + } + + if (uevent) { + if (loading_timeout > 0) + mod_timer(&fw_priv->timeout, + round_jiffies_up(jiffies + + loading_timeout * HZ)); + + kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD); + } + + wait_for_completion(&fw_priv->completion); + + set_bit(FW_STATUS_DONE, &fw_priv->status); + del_timer_sync(&fw_priv->timeout); + + mutex_lock(&fw_lock); + if (!fw_priv->fw->size || test_bit(FW_STATUS_ABORT, &fw_priv->status)) + retval = -ENOENT; + fw_priv->fw = NULL; + mutex_unlock(&fw_lock); + + fw_destroy_instance(fw_priv); + +out: + if (retval) { + release_firmware(firmware); + *firmware_p = NULL; + } + + return retval; +} + +/** + * request_firmware: - send firmware request and wait for it + * @firmware_p: pointer to firmware image + * @name: name of firmware file + * @device: device for which firmware is being loaded + * + * @firmware_p will be used to return a firmware image by the name + * of @name for device @device. + * + * Should be called from user context where sleeping is allowed. + * + * @name will be used as $FIRMWARE in the uevent environment and + * should be distinctive enough not to be confused with any other + * firmware image for this or any other device. 
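+ *
+ * Illustrative sketch only (the firmware name, pdev, priv and mydev_load()
+ * are hypothetical):
+ *
+ *	const struct firmware *fw;
+ *	int err;
+ *
+ *	err = request_firmware(&fw, "mydev/image.bin", &pdev->dev);
+ *	if (err)
+ *		return err;
+ *	err = mydev_load(priv, fw->data, fw->size);
+ *	release_firmware(fw);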
+ **/ +int +#define request_firmware LINUX_BACKPORT(request_firmware) +request_firmware(const struct firmware **firmware_p, const char *name, + struct device *device) +{ + int uevent = 1; + return _request_firmware(firmware_p, name, device, uevent, false); +} + +/** + * release_firmware: - release the resource associated with a firmware image + * @fw: firmware resource to release + **/ +#define release_firmware LINUX_BACKPORT(release_firmware) +void release_firmware(const struct firmware *fw) +{ + if (fw) { + if (!fw_is_builtin_firmware(fw)) + firmware_free_data(fw); + kfree(fw); + } +} + +/* Async support */ +struct firmware_work { + struct work_struct work; + struct module *module; + const char *name; + struct device *device; + void *context; + void (*cont)(const struct firmware *fw, void *context); + int uevent; +}; + +static int request_firmware_work_func(void *arg) +{ + struct firmware_work *fw_work = arg; + const struct firmware *fw; + int ret; + + if (!arg) { + WARN_ON(1); + return 0; + } + + ret = _request_firmware(&fw, fw_work->name, fw_work->device, + fw_work->uevent, true); + fw_work->cont(fw, fw_work->context); + + module_put(fw_work->module); + kfree(fw_work); + + return ret; +} + +/** + * request_firmware_nowait - asynchronous version of request_firmware + * @module: module requesting the firmware + * @uevent: sends uevent to copy the firmware image if this flag + * is non-zero else the firmware copy must be done manually. + * @name: name of firmware file + * @device: device for which firmware is being loaded + * @gfp: allocation flags + * @context: will be passed over to @cont, and + * @fw may be %NULL if firmware request fails. + * @cont: function will be called asynchronously when the firmware + * request is over. + * + * Asynchronous variant of request_firmware() for user contexts where + * it is not possible to sleep for long time. It can't be called + * in atomic contexts. 
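+ *
+ * Illustrative sketch only (my_fw_cont(), mydev_load(), priv and pdev are
+ * hypothetical; request_firmware_nowait() is assumed to be mapped onto this
+ * backport by the compat headers):
+ *
+ *	static void my_fw_cont(const struct firmware *fw, void *context)
+ *	{
+ *		if (fw)
+ *			mydev_load(context, fw->data, fw->size);
+ *		release_firmware(fw);
+ *	}
+ *
+ *	request_firmware_nowait(THIS_MODULE, 1, "mydev/image.bin",
+ *				&pdev->dev, GFP_KERNEL, priv, my_fw_cont);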
+ **/ +int +backport_request_firmware_nowait( + struct module *module, int uevent, + const char *name, struct device *device, gfp_t gfp, void *context, + void (*cont)(const struct firmware *fw, void *context)) +{ + struct task_struct *task; + struct firmware_work *fw_work; + + fw_work = kzalloc(sizeof (struct firmware_work), gfp); + if (!fw_work) + return -ENOMEM; + + fw_work->module = module; + fw_work->name = name; + fw_work->device = device; + fw_work->context = context; + fw_work->cont = cont; + fw_work->uevent = uevent; + + if (!try_module_get(module)) { + kfree(fw_work); + return -EFAULT; + } + + task = kthread_run(request_firmware_work_func, fw_work, + "firmware/%s", name); + if (IS_ERR(task)) { + fw_work->cont(NULL, fw_work->context); + module_put(fw_work->module); + kfree(fw_work); + return PTR_ERR(task); + } + + return 0; +} + +static int __init firmware_class_init(void) +{ + return class_register(&firmware_class); +} + +static void __exit firmware_class_exit(void) +{ + class_unregister(&firmware_class); +} + +fs_initcall(firmware_class_init); +module_exit(firmware_class_exit); + +EXPORT_SYMBOL_GPL(release_firmware); +EXPORT_SYMBOL_GPL(request_firmware); +EXPORT_SYMBOL_GPL(backport_request_firmware_nowait); diff --git a/src/mlnx-ofa_kernel-5.8/compat/config/.gitignore b/src/mlnx-ofa_kernel-5.8/compat/config/.gitignore new file mode 100644 index 0000000..4d4c7b1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/config/.gitignore @@ -0,0 +1,8 @@ +mkinstalldirs +depcomp +compile +missing +config.guess +config.sub +ltmain.sh +install-sh diff --git a/src/mlnx-ofa_kernel-5.8/compat/config/build-linux.m4 b/src/mlnx-ofa_kernel-5.8/compat/config/build-linux.m4 new file mode 100644 index 0000000..1a9d1d4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/config/build-linux.m4 @@ -0,0 +1,735 @@ +# LB_CHECK_FILE +# +# Check for file existance even when cross compiling +# +AC_DEFUN([LB_CHECK_FILE], +[AS_VAR_PUSHDEF([lb_File], [lb_cv_file_$1])dnl +AC_CACHE_CHECK([for $1], lb_File, +[if test -r "$1"; then + AS_VAR_SET(lb_File, yes) +else + AS_VAR_SET(lb_File, no) +fi]) +AS_IF([test AS_VAR_GET(lb_File) = yes], [$2], [$3])[]dnl +AS_VAR_POPDEF([lb_File])dnl +])# LB_CHECK_FILE + + +# +# Support XEN +# +AC_DEFUN([SET_XEN_INCLUDES], +[ +XEN_INCLUDES= +LB_LINUX_CONFIG([XEN],[XEN_INCLUDES="-I$LINUX/arch/x86/include/mach-xen"],[]) +LB_LINUX_CONFIG_VALUE([XEN_INTERFACE_VERSION],[XEN_INCLUDES="$XEN_INCLUDES -D__XEN_INTERFACE_VERSION__=$res"],[XEN_INCLUDES="$XEN_INCLUDES -D__XEN_INTERFACE_VERSION__=$res"]) +]) + +# +# LB_LINUX_VERSION +# +# Set things accordingly for a linux kernel +# +AC_DEFUN([LB_LINUX_VERSION],[ +KMODEXT=".ko" +AC_SUBST(KMODEXT) +] +) + + +# +# LB_LINUX_RELEASE +# +# get the release version of linux +# +AC_DEFUN([LB_LINUX_RELEASE], +[ +LINUXRELEASE=$(LB_LINUX_MAKE_OUTPUT([kernelrelease])) +if test x$LINUXRELEASE = x ; then + AC_MSG_RESULT([unknown]) + AC_MSG_ERROR([Could not determine Linux release version from linux/version.h.]) +fi +AC_MSG_RESULT([$LINUXRELEASE]) +AC_SUBST(LINUXRELEASE) + +moduledir='/lib/modules/'$LINUXRELEASE/updates/kernel +AC_SUBST(moduledir) + +modulefsdir='$(moduledir)/fs/$(PACKAGE)' +AC_SUBST(modulefsdir) + +modulenetdir='$(moduledir)/net/$(PACKAGE)' +AC_SUBST(modulenetdir) + +# ------------ RELEASE -------------------------------- +AC_MSG_CHECKING([for MLNX release]) +AC_ARG_WITH([release], + AC_HELP_STRING([--with-release=string], + [set the release string (default=$kvers_YYYYMMDDhhmm)]), + [RELEASE=$withval], + RELEASE="" + if test -n "$DOWNSTREAM_RELEASE"; 
then + RELEASE="${DOWNSTREAM_RELEASE}_" + fi + RELEASE="$RELEASE`echo ${LINUXRELEASE} | tr '-' '_'`_$BUILDID") +AC_MSG_RESULT($RELEASE) +AC_SUBST(RELEASE) + +# check is redhat/suse kernels +AC_MSG_CHECKING([that RedHat kernel]) +LB_LINUX_TRY_COMPILE([ + #include + ],[ + #ifndef RHEL_RELEASE_CODE + #error "not redhat kernel" + #endif + ],[ + RHEL_KERNEL="yes" + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) +]) + +LB_LINUX_CONFIG([SUSE_KERNEL],[SUSE_KERNEL="yes"],[]) + +]) + +# LB_ARG_REPLACE_PATH(PACKAGE, PATH) +AC_DEFUN([LB_ARG_REPLACE_PATH],[ + new_configure_args= + eval "set x $ac_configure_args" + shift + for arg; do + case $arg in + --with-[$1]=*) + arg=--with-[$1]=[$2] + ;; + *\'*) + arg=$(printf %s\n ["$arg"] | \ + sed "s/'/'\\\\\\\\''/g") + ;; + esac + dnl AS_VAR_APPEND([new_configure_args], [" '$arg'"]) + new_configure_args="$new_configure_args \"$arg\"" + done + ac_configure_args=$new_configure_args +]) + +# this is the work-horse of the next function +AC_DEFUN([__LB_ARG_CANON_PATH], [ + [$3]=$(readlink -f $with_$2) + LB_ARG_REPLACE_PATH([$1], $[$3]) +]) + +# a front-end for the above function that transforms - and . in the +# PACKAGE portion of --with-PACKAGE into _ suitable for variable names +AC_DEFUN([LB_ARG_CANON_PATH], [ + __LB_ARG_CANON_PATH([$1], m4_translit([$1], [-.], [__]), [$2]) +]) + +# +# +# LB_LINUX_PATH +# +# Find paths for linux, handling kernel-source rpms +# +AC_DEFUN([LB_LINUX_PATH], +[# prep some default values +for DEFAULT_LINUX in /lib/modules/$(uname -r)/{source,build} /usr/src/linux; do + if readlink -q -e $DEFAULT_LINUX; then + break + fi +done +if test "$DEFAULT_LINUX" = "/lib/modules/$(uname -r)/source"; then + PATHS="/lib/modules/$(uname -r)/build" +fi +PATHS="$PATHS $DEFAULT_LINUX" +for DEFAULT_LINUX_OBJ in $PATHS; do + if readlink -q -e $DEFAULT_LINUX_OBJ; then + break + fi +done +AC_MSG_CHECKING([for Linux sources]) +AC_ARG_WITH([linux], + AC_HELP_STRING([--with-linux=path], + [set path to Linux source (default=/lib/modules/$(uname -r)/{source,build},/usr/src/linux)]), + [LB_ARG_CANON_PATH([linux], [LINUX]) + DEFAULT_LINUX_OBJ=$LINUX], + [LINUX=$DEFAULT_LINUX]) +AC_MSG_RESULT([$LINUX]) +AC_SUBST(LINUX) + +# -------- check for linux -------- +LB_CHECK_FILE([$LINUX],[], + [AC_MSG_ERROR([Kernel source $LINUX could not be found.])]) + +# -------- linux objects (for 2.6) -- +AC_MSG_CHECKING([for Linux objects dir]) +AC_ARG_WITH([linux-obj], + AC_HELP_STRING([--with-linux-obj=path], + [set path to Linux objects dir (default=/lib/modules/$(uname -r)/build,/usr/src/linux)]), + [LB_ARG_CANON_PATH([linux-obj], [LINUX_OBJ])], + [LINUX_OBJ=$DEFAULT_LINUX_OBJ]) + +AC_MSG_RESULT([$LINUX_OBJ]) +AC_SUBST(LINUX_OBJ) + +# -------- check for .config -------- +AC_ARG_WITH([linux-config], + [AC_HELP_STRING([--with-linux-config=path], + [set path to Linux .conf (default=$LINUX_OBJ/.config)])], + [LB_ARG_CANON_PATH([linux-config], [LINUX_CONFIG])], + [LINUX_CONFIG=$LINUX_OBJ/.config]) +AC_SUBST(LINUX_CONFIG) + +LB_CHECK_FILE([/boot/kernel.h], + [KERNEL_SOURCE_HEADER='/boot/kernel.h'], + [LB_CHECK_FILE([/var/adm/running-kernel.h], + [KERNEL_SOURCE_HEADER='/var/adm/running-kernel.h'])]) + +AC_ARG_WITH([kernel-source-header], + AC_HELP_STRING([--with-kernel-source-header=path], + [Use a different kernel version header. Consult build/README.kernel-source for details.]), + [LB_ARG_CANON_PATH([kernel-source-header], [KERNEL_SOURCE_HEADER])]) + +# ------------ .config exists ---------------- +LB_CHECK_FILE([$LINUX_CONFIG],[], + [AC_MSG_ERROR([Kernel config could not be found. 
If you are building from a kernel-source rpm consult build/README.kernel-source])]) + +# ----------- kconfig.h exists --------------- +# kernel 3.1, $LINUX/include/linux/kconfig.h is added +# see kernel commit 2a11c8ea20bf850b3a2c60db8c2e7497d28aba99 +LB_CHECK_FILE([$LINUX/include/linux/kconfig.h], + [CONFIG_INCLUDE=$LINUX/include/linux/kconfig.h], + [CONFIG_INCLUDE=$LINUX/include/generated/kconfig.h]) + AC_SUBST(CONFIG_INCLUDE) + +if test -e $CONFIG_INCLUDE; then + CONFIG_INCLUDE_FLAG="-include $CONFIG_INCLUDE" +fi + +# ------------ rhconfig.h includes runtime-generated bits -- +# red hat kernel-source checks + +# we know this exists after the check above. if the user +# tarred up the tree and ran make dep etc. in it, then +# version.h gets overwritten with a standard linux one. +# +if (grep -q rhconfig $LINUX_OBJ/include/linux/version.h 2>/dev/null) || + (grep -q rhconfig $LINUX_OBJ/include/generated/uapi/linux/version.h 2>/dev/null) ; then + # This is a clean kernel-source tree, we need to + # enable extensive workarounds to get this to build + # modules + LB_CHECK_FILE([$KERNEL_SOURCE_HEADER], + [if test $KERNEL_SOURCE_HEADER = '/boot/kernel.h' ; then + AC_MSG_WARN([Using /boot/kernel.h from RUNNING kernel.]) + AC_MSG_WARN([If this is not what you want, use --with-kernel-source-header.]) + AC_MSG_WARN([Consult build/README.kernel-source for details.]) + fi], + [AC_MSG_ERROR([$KERNEL_SOURCE_HEADER not found. Consult build/README.kernel-source for details.])]) + EXTRA_KCFLAGS="-include $KERNEL_SOURCE_HEADER $EXTRA_KCFLAGS" +fi + +# this is needed before we can build modules +SET_BUILD_ARCH +LB_LINUX_CROSS +LB_LINUX_VERSION +SET_XEN_INCLUDES + +# --- check that we can build modules at all +AC_MSG_CHECKING([that modules can be built at all]) +LB_LINUX_TRY_COMPILE([],[],[ + AC_MSG_RESULT([yes]) +],[ + AC_MSG_RESULT([no]) + AC_MSG_WARN([Consult config.log for details.]) + AC_MSG_WARN([If you are trying to build with a kernel-source rpm, consult build/README.kernel-source]) + AC_MSG_ERROR([Kernel modules cannot be built.]) +]) + +LB_LINUX_RELEASE +]) # end of LB_LINUX_PATH + +# LB_LINUX_SYMVERFILE +# SLES 9 uses a different name for this file - unsure about vanilla kernels +# around this version, but it matters for servers only. +AC_DEFUN([LB_LINUX_SYMVERFILE], + [AC_MSG_CHECKING([name of module symbol version file]) + if grep -q Modules.symvers $LINUX/scripts/Makefile.modpost ; then + SYMVERFILE=Modules.symvers + else + SYMVERFILE=Module.symvers + fi + AC_MSG_RESULT($SYMVERFILE) + AC_SUBST(SYMVERFILE) +]) + +# +# LB_LINUX_CROSS +# +# check for cross compilation +# +AC_DEFUN([LB_LINUX_CROSS], + [AC_MSG_CHECKING([for cross compilation]) +CROSS_VARS= +case $target_vendor in + # The K1OM architecture is an extension of the x86 architecture. + # So, the $target_arch is x86_64. 
+ k1om) + AC_MSG_RESULT([Intel(R) Xeon Phi(TM)]) + CC_TARGET_ARCH=`$CC -v 2>&1 | grep Target: | sed -e 's/Target: //'` + if test $CC_TARGET_ARCH != x86_64-$target_vendor-linux ; then + AC_MSG_ERROR([Cross compiler not found in PATH.]) + fi + CROSS_VARS="ARCH=$target_vendor CROSS_COMPILE=x86_64-$target_vendor-linux-" + CCAS=$CC + if test x$enable_server = xyes ; then + AC_MSG_WARN([Disabling server (not supported for x86_64-$target_vendor-linux).]) + enable_server='no' + fi + ;; + *) + CROSS_VARS="CROSS_COMPILE=$CROSS_COMPILE" + AC_MSG_RESULT([no]) + ;; +esac +AC_SUBST(CROSS_VARS) +]) + +# these are like AC_TRY_COMPILE, but try to build modules against the +# kernel, inside the build directory + +# LB_LANG_PROGRAM(C)([PROLOGUE], [BODY]) +# -------------------------------------- +m4_define([LB_LANG_PROGRAM], +[ +#include +$1 +int +main (void) +{ +dnl Do *not* indent the following line: there may be CPP directives. +dnl Don't move the `;' right after for the same reason. +$2 + ; + return 0; +}]) + + +# +# LB_LINUX_MAKE_OUTPUT +# +# Runs a make target ($1, potentially with extra flags) +# output goes to standard output. +# +AC_DEFUN([LB_LINUX_MAKE_OUTPUT], +[ +MAKE=${MAKE:-make} +$MAKE -s M=$PWD -C $LINUX_OBJ $1 +]) + +# +# LB_LINUX_COMPILE_IFELSE +# +# like AC_COMPILE_IFELSE +# +AC_DEFUN([LB_LINUX_COMPILE_IFELSE], +[m4_ifvaln([$1], [AC_LANG_CONFTEST([$1])])dnl +MAKE=${MAKE:-make} +rm -f build/conftest.o build/conftest.mod.c build/conftest.ko build/output.log +AS_IF([AC_TRY_COMMAND(cp conftest.c build && env $CROSS_VARS $MAKE -d [$2] ${LD:+"LD=$CROSS_COMPILE$LD"} CC="$CROSS_COMPILE$CC" -f $PWD/build/Makefile MLNX_LINUX_CONFIG=$LINUX_CONFIG LINUXINCLUDE="-include generated/autoconf.h $XEN_INCLUDES $EXTRA_MLNX_INCLUDE -I$LINUX/arch/$SRCARCH/include -Iarch/$SRCARCH/include/generated -Iinclude -I$LINUX/arch/$SRCARCH/include/uapi -Iarch/$SRCARCH/include/generated/uapi -I$LINUX/include -I$LINUX/include/uapi -Iinclude/generated/uapi -I$LINUX/arch/$SRCARCH/include -Iarch/$SRCARCH/include/generated -I$LINUX/arch/$SRCARCH/include -I$LINUX/arch/$SRCARCH/include/generated -I$LINUX_OBJ/include -I$LINUX/include -I$LINUX_OBJ/include2 $CONFIG_INCLUDE_FLAG" -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration -Wno-unused-variable -Wno-uninitialized $EXTRA_KCFLAGS" $CROSS_VARS M=$PWD/build >/dev/null 2>build/output.log; [[[ $? 
-ne 0 ]]] && cat build/output.log 1>&2 && false || config/warning_filter.sh build/output.log) >/dev/null && AC_TRY_COMMAND([$3])], + [$4], + [_AC_MSG_LOG_CONFTEST +m4_ifvaln([$5],[$5])dnl]) +rm -f build/conftest.o build/conftest.mod.c build/conftest.mod.o build/conftest.ko m4_ifval([$1], [build/conftest.c conftest.c])[]dnl +]) + +# +# LB_LINUX_ARCH +# +# Determine the kernel's idea of the current architecture +# +AC_DEFUN([LB_LINUX_ARCH], + [AC_MSG_CHECKING([Linux kernel architecture]) + AS_IF([rm -f $PWD/build/arch + make -s --no-print-directory echoarch -f $PWD/build/Makefile \ + MLNX_LINUX_CONFIG=$LINUX_CONFIG -C $LINUX $CROSS_VARS \ + ARCHFILE=$PWD/build/arch && LINUX_ARCH=`cat $PWD/build/arch`], + [AC_MSG_RESULT([$LINUX_ARCH])], + [AC_MSG_ERROR([Could not determine the kernel architecture.])]) + rm -f build/arch]) + +# +# LB_LINUX_TRY_COMPILE +# +# like AC_TRY_COMPILE +# +AC_DEFUN([LB_LINUX_TRY_COMPILE], +[LB_LINUX_COMPILE_IFELSE( + [AC_LANG_SOURCE([LB_LANG_PROGRAM([[$1]], [[$2]])])], + [modules], + [test -s build/conftest.o], + [$3], [$4])]) + +# +# LB_LINUX_CONFIG +# +# check if a given config option is defined +# +AC_DEFUN([LB_LINUX_CONFIG],[ + AC_MSG_CHECKING([if Linux was built with CONFIG_$1]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + #ifndef CONFIG_$1 + #error CONFIG_$1 not #defined + #endif + ],[ + AC_MSG_RESULT([yes]) + $2 + ],[ + AC_MSG_RESULT([no]) + $3 + ]) +]) + +# +# LB_LINUX_CONFIG_IM +# +# check if a given config option is builtin or as module +# +AC_DEFUN([LB_LINUX_CONFIG_IM],[ + AC_MSG_CHECKING([if Linux was built with CONFIG_$1 in or as module]) + LB_LINUX_TRY_COMPILE([ + #include + ],[ + #if !(defined(CONFIG_$1) || defined(CONFIG_$1_MODULE)) + #error CONFIG_$1 and CONFIG_$1_MODULE not #defined + #endif + ],[ + AC_MSG_RESULT([yes]) + $2 + ],[ + AC_MSG_RESULT([no]) + $3 + ]) +]) + +# +# LB_LINUX_TRY_MAKE +# +# like LB_LINUX_TRY_COMPILE, but with different arguments +# +AC_DEFUN([LB_LINUX_TRY_MAKE], + [LB_LINUX_COMPILE_IFELSE( + [AC_LANG_SOURCE([LB_LANG_PROGRAM([[$1]], [[$2]])])], + [$3], [$4], [$5], [$6] + )] +) + +# +# LB_CONFIG_COMPAT_RDMA +# +AC_DEFUN([LB_CONFIG_COMPAT_RDMA], +[AC_MSG_CHECKING([whether to use Compat RDMA]) +# set default +AC_ARG_WITH([o2ib], + AC_HELP_STRING([--with-o2ib=path], + [build o2iblnd against path]), + [ + case $with_o2ib in + yes) O2IBPATHS="$LINUX $LINUX/drivers/infiniband" + ENABLEO2IB=2 + ;; + no) ENABLEO2IB=0 + ;; + *) O2IBPATHS=$with_o2ib + ENABLEO2IB=3 + ;; + esac + ],[ + O2IBPATHS="$LINUX $LINUX/drivers/infiniband" + ENABLEO2IB=1 + ]) +if test $ENABLEO2IB -eq 0; then + AC_MSG_RESULT([no]) +else + o2ib_found=false + for O2IBPATH in $O2IBPATHS; do + if test \( -f ${O2IBPATH}/include/rdma/rdma_cm.h -a \ + -f ${O2IBPATH}/include/rdma/ib_cm.h -a \ + -f ${O2IBPATH}/include/rdma/ib_verbs.h -a \ + -f ${O2IBPATH}/include/rdma/ib_fmr_pool.h \); then + o2ib_found=true + break + fi + done + compatrdma_found=false + if $o2ib_found; then + if test \( -f ${O2IBPATH}/include/linux/compat-2.6.h \); then + compatrdma_found=true + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_COMPAT_RDMA, 1, [compat rdma found]) + else + AC_MSG_RESULT([no]) + fi + fi +fi +]) + +# +# LB_CONFIG_OFED_BACKPORTS +# +# include any OFED backport headers in all compile commands +# NOTE: this does only include the backport paths, not the OFED headers +# adding the OFED headers is done in the lnet portion +AC_DEFUN([LB_CONFIG_OFED_BACKPORTS], +[AC_MSG_CHECKING([whether to use any OFED backport headers]) +if test $ENABLEO2IB -eq 0; then + AC_MSG_RESULT([no]) +else + if ! 
$o2ib_found; then + AC_MSG_RESULT([no]) + case $ENABLEO2IB in + 1) ;; + 2) AC_MSG_ERROR([kernel OpenIB gen2 headers not present]);; + 3) AC_MSG_ERROR([bad --with-o2ib path]);; + *) AC_MSG_ERROR([internal error]);; + esac + else + if ! $compatrdma_found; then + if test -f $O2IBPATH/config.mk; then + . $O2IBPATH/config.mk + elif test -f $O2IBPATH/ofed_patch.mk; then + . $O2IBPATH/ofed_patch.mk + fi + fi + if test -n "$BACKPORT_INCLUDES"; then + OFED_BACKPORT_PATH="$O2IBPATH/${BACKPORT_INCLUDES/*\/kernel_addons/kernel_addons}/" + EXTRA_LNET_INCLUDE="-I$OFED_BACKPORT_PATH $EXTRA_LNET_INCLUDE" + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + fi +fi +]) + +# LC_MODULE_LOADING +# after 2.6.28 CONFIG_KMOD is removed, and only CONFIG_MODULES remains +# so we test if request_module is implemented or not +AC_DEFUN([LC_MODULE_LOADING], +[AC_MSG_CHECKING([if kernel module loading is possible]) +LB_LINUX_TRY_MAKE([ + #include +],[ + int myretval=ENOSYS ; + return myretval; +],[ + MLNX_KERNEL_TEST=conftest.i +],[dnl + grep request_module build/conftest.i |dnl + grep -v `grep "int myretval=" build/conftest.i |dnl + cut -d= -f2 | cut -d" " -f1`dnl + >/dev/null dnl +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MODULE_LOADING_SUPPORT, 1, + [kernel module loading is possible]) +],[ + AC_MSG_RESULT(no) + AC_MSG_WARN([]) + AC_MSG_WARN([Kernel module loading support is highly recommended.]) + AC_MSG_WARN([]) +]) +]) + +# +# LB_PROG_LINUX +# +# linux tests +# +AC_DEFUN([LB_PROG_LINUX], +[LB_LINUX_PATH +LB_LINUX_ARCH +LB_LINUX_SYMVERFILE + + +LB_LINUX_CONFIG([MODULES],[],[ + AC_MSG_ERROR([module support is required to build MLNX kernel modules.]) +]) + +LB_LINUX_CONFIG([MODVERSIONS]) + +LB_LINUX_CONFIG([KALLSYMS],[],[ + AC_MSG_ERROR([compat_mlnx requires that CONFIG_KALLSYMS is enabled in your kernel.]) +]) + +# 2.6.28 +LC_MODULE_LOADING + +LB_CONFIG_COMPAT_RDMA + +# it's ugly to be doing anything with OFED outside of the lnet module, but +# this has to be done here so that the backports path is set before all of +# the LN_PROG_LINUX checks are done +LB_CONFIG_OFED_BACKPORTS +]) + +# +# LB_CHECK_SYMBOL_EXPORT +# check symbol exported or not +# $1 - symbol +# $2 - file(s) for find. +# $3 - do 'yes' +# $4 - do 'no' +# +# 2.6 based kernels - put modversion info into $LINUX_OBJ/Module.modvers +# or check +AC_DEFUN([LB_CHECK_SYMBOL_EXPORT], +[AC_MSG_CHECKING([if Linux was built with symbol $1 exported]) +grep -q -E '[[[:space:]]]$1[[[:space:]]]' $LINUX_OBJ/$SYMVERFILE 2>/dev/null +rc=$? +if test $rc -ne 0; then + export=0 + for file in $2; do + grep -q -E "EXPORT_SYMBOL.*\($1\)" "$LINUX/$file" 2>/dev/null + rc=$? 
+ if test $rc -eq 0; then + export=1 + break; + fi + done + if test $export -eq 0; then + AC_MSG_RESULT([no]) + $4 + else + AC_MSG_RESULT([yes]) + $3 + fi +else + AC_MSG_RESULT([yes]) + $3 +fi +]) + +# +# Like AC_CHECK_HEADER but checks for a kernel-space header +# +m4_define([LB_CHECK_LINUX_HEADER], +[AS_VAR_PUSHDEF([ac_Header], [ac_cv_header_$1])dnl +AC_CACHE_CHECK([for $1], ac_Header, + [LB_LINUX_COMPILE_IFELSE([LB_LANG_PROGRAM([@%:@include <$1>])], + [modules], + [test -s build/conftest.o], + [AS_VAR_SET(ac_Header, [yes])], + [AS_VAR_SET(ac_Header, [no])])]) +AS_IF([test AS_VAR_GET(ac_Header) = yes], [$2], [$3])[]dnl +AS_VAR_POPDEF([ac_Header])dnl +]) + +# +# LB_USES_DPKG +# +# Determine if the target is a dpkg system or rpm +# +AC_DEFUN([LB_USES_DPKG], +[ +AC_MSG_CHECKING([if this distro uses dpkg]) +case `lsb_release -i -s 2>/dev/null` in + Ubuntu | Debian) + AC_MSG_RESULT([yes]) + uses_dpkg=yes + ;; + *) + AC_MSG_RESULT([no]) + uses_dpkg=no + ;; +esac +]) + +# +# LB_PROG_CC +# +# checks on the C compiler +# +AC_DEFUN([LB_PROG_CC], +[AC_PROG_RANLIB +AC_CHECK_TOOL(LD, ld, [no]) +AC_CHECK_TOOL(OBJDUMP, objdump, [no]) +AC_CHECK_TOOL(STRIP, strip, [no]) + +# --------- unsigned long long sane? ------- +AC_CHECK_SIZEOF(unsigned long long, 0) +echo "---> size SIZEOF $SIZEOF_unsigned_long_long" +echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long" +if test $ac_cv_sizeof_unsigned_long_long != 8 ; then + AC_MSG_ERROR([** we assume that sizeof(long long) == 8.]) +fi + +if test $target_cpu == "powerpc64"; then + AC_MSG_WARN([set compiler with -m64]) + CFLAGS="$CFLAGS -m64" + CC="$CC -m64" +fi +]) + +# LB_CONTITIONALS +# +AC_DEFUN([LB_CONDITIONALS], +[ +AM_CONDITIONAL(ARCH_x86, test x$target_cpu = "xx86_64" -o x$target_cpu = "xi686") + +AC_OUTPUT + +cat <<_ACEOF + +CC: $CC +LD: $LD +CFLAGS: $CFLAGS +EXTRA_KCFLAGS: $EXTRA_KCFLAGS + +Type 'make' to build kernel modules. +_ACEOF +]) + +# +# SET_BUILD_ARCH +# +AC_DEFUN([SET_BUILD_ARCH], +[ +AC_MSG_CHECKING([for build ARCH]) +SRCARCH=${ARCH:-$(uname -m)} +SRCARCH=$(echo $SRCARCH | sed -e s/i.86/x86/ \ + -e s/x86_64/x86/ \ + -e s/ppc.*/powerpc/ \ + -e 's/powerpc64/powerpc/' \ + -e s/aarch64.*/arm64/ \ + -e s/sparc32.*/sparc/ \ + -e s/sparc64.*/sparc/ \ + -e s/s390x/s390/) + +# very old kernels had different strucure under arch dir +if [[ "X$SRCARCH" == "Xx86" ]] && ! [[ -d "$LINUX/arch/x86" ]]; then + SRCARCH=x86_64 +fi + +AC_MSG_RESULT([ARCH=$ARCH, SRCARCH=$SRCARCH]) +]) + +# +# LB_LINUX_CONFIG_VALUE +# +# get a given config's option value +# +AC_DEFUN([LB_LINUX_CONFIG_VALUE],[ + AC_MSG_CHECKING([get value of CONFIG_$1]) + if (grep -q "^#define CONFIG_$1 " $LINUX_OBJ/include/generated/autoconf.h 2>/dev/null); then + res=$(grep "^#define CONFIG_$1 " $LINUX_OBJ/include/generated/autoconf.h 2>/dev/null | cut -d' ' -f'3') + AC_MSG_RESULT([$1 value is '$res']) + $2 + else + AC_MSG_RESULT([$1 in not defined in autoconf.h]) + $3 + fi +]) diff --git a/src/mlnx-ofa_kernel-5.8/compat/config/parallel-build.m4 b/src/mlnx-ofa_kernel-5.8/compat/config/parallel-build.m4 new file mode 100644 index 0000000..396140d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/config/parallel-build.m4 @@ -0,0 +1,155 @@ +# +# This file defines macros used to manage and support running +# build tests in parallel. 
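All of these probes reduce to the same trick: generate a tiny throw-away C file, ask kbuild to compile it against the target kernel tree, and treat "build/conftest.o exists and is non-empty" as a yes. As a minimal illustration (not part of the patch itself), the translation unit produced by LB_LINUX_CONFIG([MODULES]) looks roughly like the sketch below; the exact wrapper comes from LB_LANG_PROGRAM, which is defined elsewhere in build.m4, and the CONFIG_* macros are visible because the compile rule force-includes generated/autoconf.h.

/*
 * Illustrative conftest.c for LB_LINUX_CONFIG([MODULES]) -- a sketch,
 * not the literal macro expansion.  The probe succeeds iff this file
 * compiles to build/conftest.o, i.e. iff CONFIG_MODULES is set in the
 * kernel's autoconf.h (force-included on the command line).
 */
int main(void)
{
#ifndef CONFIG_MODULES
#error CONFIG_MODULES not #defined
#endif
	return 0;
}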
+# + +# +# Prepare stuff for parralel build jobs process +# +AC_DEFUN([MLNX_PARALLEL_INIT_ONCE], +[ +if [[ "X${RAN_MLNX_PARALLEL_INIT_ONCE}" != "X1" ]]; then + MAX_JOBS=${NJOBS:-1} + RAN_MLNX_PARALLEL_INIT_ONCE=1 + /bin/rm -rf CONFDEFS_H_DIR + /bin/mkdir -p CONFDEFS_H_DIR + declare -i CONFDEFS_H_INDEX=0 + declare -i RUNNING_JOBS=0 +fi +]) + +# +# MLNX_AC_DEFINE(VARIABLE, [VALUE], [DESCRIPTION]) +# ------------------------------------------- +# Set VARIABLE to VALUE, verbatim, or 1. Remember the value +# and if VARIABLE is affected the same VALUE, do nothing, else +# die. The third argument is used by autoheader. +m4_define([MLNX_AC_DEFINE], [_MLNX_AC_DEFINE_Q([\], $@)]) + + +# _MLNX_AC_DEFINE_Q(QUOTE, VARIABLE, [VALUE], [DESCRIPTION]) +# ----------------------------------------------------- +# Internal function that performs common elements of AC_DEFINE{,_UNQUOTED}. +# +# m4_index is roughly 5 to 8 times faster than m4_bpatsubst, so only +# use the regex when necessary. AC_name is defined with over-quotation, +# so that we can avoid m4_defn. +m4_define([_MLNX_AC_DEFINE_Q], +[m4_pushdef([AC_name], m4_if(m4_index([$2], [(]), [-1], [[[$2]]], + [m4_bpatsubst([[[$2]]], [(.*)])]))dnl +AC_DEFINE_TRACE(AC_name)dnl +m4_cond([m4_index([$3], [ +])], [-1], [], + [AS_LITERAL_IF([$3], [m4_bregexp([[$3]], [[^\\] +], [-])])], [], [], + [m4_warn([syntax], [AC_DEFINE]m4_ifval([$1], [], [[_UNQUOTED]])dnl +[: `$3' is not a valid preprocessor define value])])dnl +m4_ifval([$4], [AH_TEMPLATE(AC_name, [$4])])dnl +cat >>CONFDEFS_H_DIR/confdefs.h.${CONFDEFS_H_INDEX} <<$1_ACEOF +[@%:@define] $2 m4_if([$#], 2, 1, [$3], [], [/**/], [$3]) +_ACEOF +]) + +# MLNX_AC_LANG_SOURCE(C)(BODY) +# ----------------------- +# We can't use '#line $LINENO "configure"' here, since +# Sun c89 (Sun WorkShop 6 update 2 C 5.3 Patch 111679-08 2002/05/09) +# rejects $LINENO greater than 32767, and some configure scripts +# are longer than 32767 lines. +m4_define([MLNX_AC_LANG_SOURCE(C)], +[/* confdefs.h. */ +_ACEOF +cat confdefs.h >>$tmpbuild/conftest.$ac_ext +cat >>$tmpbuild/conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +$1]) + +# MLNX_AC_LANG_SOURCE(BODY) +# -------------------- +# Produce a valid source for the current language, which includes the +# BODY, and as much as possible `confdefs.h'. +AC_DEFUN([MLNX_AC_LANG_SOURCE], +[_AC_LANG_DISPATCH([$0], _AC_LANG, $@)]) + + +# MLNX_AC_LANG_CONFTEST(BODY) +# ---------------------- +# Save the BODY in `conftest.$ac_ext'. Add a trailing new line. +AC_DEFUN([MLNX_AC_LANG_CONFTEST], +[cat >$tmpbuild/conftest.$ac_ext <<_ACEOF +$1 +_ACEOF]) + +# _MLNX_AC_MSG_LOG_CONFTEST +# -------------------- +m4_define([_MLNX_AC_MSG_LOG_CONFTEST], +[AS_ECHO(["$as_me: failed program was:"]) >&AS_MESSAGE_LOG_FD +sed 's/^/| /' $tmpbuild/conftest.$ac_ext >&AS_MESSAGE_LOG_FD +]) + + +# +# MLNX_LB_LINUX_COMPILE_IFELSE +# +# like AC_COMPILE_IFELSE. 
+# runs in a temp dir +# +AC_DEFUN([MLNX_LB_LINUX_COMPILE_IFELSE], +[ +{ +MAKE=${MAKE:-make} +tmpbuild=$(/bin/mktemp -d $PWD/build/build_XXXXX) +/bin/cp build/Makefile $tmpbuild/ +m4_ifvaln([$1], [MLNX_AC_LANG_CONFTEST([$1])])dnl +AS_IF([AC_TRY_COMMAND(env $CROSS_VARS $MAKE -d [$2] ${LD:+"LD=$CROSS_COMPILE$LD"} CC="$CROSS_COMPILE$CC" -f $tmpbuild/Makefile MLNX_LINUX_CONFIG=$LINUX_CONFIG LINUXINCLUDE="-include generated/autoconf.h $XEN_INCLUDES $EXTRA_MLNX_INCLUDE -I$LINUX/arch/$SRCARCH/include -Iarch/$SRCARCH/include/generated -Iinclude -I$LINUX/arch/$SRCARCH/include/uapi -Iarch/$SRCARCH/include/generated/uapi -I$LINUX/include -I$LINUX/include/uapi -Iinclude/generated/uapi -I$LINUX/arch/$SRCARCH/include -Iarch/$SRCARCH/include/generated -I$LINUX/arch/$SRCARCH/include -I$LINUX/arch/$SRCARCH/include/generated -I$LINUX_OBJ/include -I$LINUX/include -I$LINUX_OBJ/include2 $CONFIG_INCLUDE_FLAG" -o tmp_include_depends -o scripts -o include/config/MARKER -C $LINUX_OBJ EXTRA_CFLAGS="-Werror-implicit-function-declaration -Wno-unused-variable -Wno-uninitialized $EXTRA_KCFLAGS" $CROSS_VARS M=$tmpbuild >/dev/null 2>$tmpbuild/output.log; [[[ $? -ne 0 ]]] && cat $tmpbuild/output.log 1>&2 && false || config/warning_filter.sh $tmpbuild/output.log) >/dev/null && AC_TRY_COMMAND([$3])], + [$4], + [_MLNX_AC_MSG_LOG_CONFTEST +m4_ifvaln([$5],[$5])dnl]) +/bin/rm -rf $tmpbuild +} +]) + +# +# MLNX_LB_LINUX_TRY_COMPILE +# +# like AC_TRY_COMPILE +# +AC_DEFUN([MLNX_LB_LINUX_TRY_COMPILE], +[MLNX_LB_LINUX_COMPILE_IFELSE( + [MLNX_AC_LANG_SOURCE([LB_LANG_PROGRAM([[$1]], [[$2]])])], + [modules], + [test -s $tmpbuild/conftest.o], + [$3], [$4])]) + +# MLNX_BG_LB_LINUX_COMPILE_IFELSE +# +# Do fork and call LB_LINUX_COMPILE_IFELSE +# to run the build test in background +# +AC_DEFUN([MLNX_BG_LB_LINUX_TRY_COMPILE], +[ +# init stuff +MLNX_PARALLEL_INIT_ONCE + +# wait if there are MAX_JOBS tests running +if [[ $RUNNING_JOBS -eq $MAX_JOBS ]]; then + wait + RUNNING_JOBS=0 +else + let RUNNING_JOBS++ +fi + +# inc header index +let CONFDEFS_H_INDEX++ + +# run test in background if MAX_JOBS > 1 +if [[ $MAX_JOBS -eq 1 ]]; then +MLNX_LB_LINUX_TRY_COMPILE([$1], [$2], [$3], [$4]) +else +{ +MLNX_LB_LINUX_TRY_COMPILE([$1], [$2], [$3], [$4]) +}& +fi +]) + +##################################### diff --git a/src/mlnx-ofa_kernel-5.8/compat/config/rdma.m4 b/src/mlnx-ofa_kernel-5.8/compat/config/rdma.m4 new file mode 100644 index 0000000..67fcb2c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/config/rdma.m4 @@ -0,0 +1,15838 @@ +/nl Examine kernel functionality + +# DO NOT insert new defines in this section!!! 
+# Add your defines ONLY in LINUX_CONFIG_COMPAT section +AC_DEFUN([BP_CHECK_RHTABLE], +[ + AC_MSG_CHECKING([if file include/linux/rhashtable-types.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rhltable x; + x = x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RHASHTABLE_TYPES, 1, + [file rhashtable-types exists]) + ],[ + AC_MSG_RESULT(no) + AC_MSG_CHECKING([if rhltable defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rhltable x; + x = x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RHLTABLE, 1, + [struct rhltable is defined]) + AC_MSG_CHECKING([if struct rhashtable_params contains insecure_elasticity]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rhashtable_params x; + unsigned int y; + y = (unsigned int)x.insecure_elasticity; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RHASHTABLE_INSECURE_ELASTICITY, 1, + [struct rhashtable_params has insecure_elasticity]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([if struct rhashtable_params contains insecure_max_entries]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rhashtable_params x; + unsigned int y; + y = (unsigned int)x.insecure_max_entries; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RHASHTABLE_INSECURE_MAX_ENTRIES, 1, + [struct rhashtable_params has insecure_max_entries]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([if struct rhashtable contains max_elems]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rhashtable x; + unsigned int y; + y = (unsigned int)x.max_elems; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RHASHTABLE_MAX_ELEMS, 1, + [struct rhashtable has max_elems]) + ],[ + AC_MSG_RESULT(no) + ]) + ],[ + AC_MSG_RESULT(no) + AC_MSG_CHECKING([if struct netns_frags contains rhashtable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct netns_frags x; + struct rhashtable rh; + rh = x.rhashtable; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETNS_FRAGS_RHASHTABLE, 1, + [struct netns_frags has rhashtable]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) + ]) +]) + + +AC_DEFUN([LINUX_CONFIG_COMPAT], +[ + AC_MSG_CHECKING([if have hmm_pfn_to_map_order]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + unsigned int i = hmm_pfn_to_map_order(0UL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_HMM_PFN_TO_MAP_ORDER, 1, + [have hmm_pfn_to_map_order]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if hmm_range has hmm_pfns]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct hmm_range h; + h.hmm_pfns = NULL; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_HMM_RANGE_HAS_HMM_PFNS, 1, + [hmm_range has hmm_pfns]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if hmm_range_fault has one param]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int l; + l = hmm_range_fault(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_HMM_RANGE_FAULT_HAS_ONE_PARAM, 1, + [hmm_range_fault has one param]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if debugfs.h debugfs_lookup defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + debugfs_lookup(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEBUGFS_LOOKUP, 1, + [debugfs.h debugfs_lookup defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if has is_tcf_police]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return is_tcf_police(NULL) ? 
1 : 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_POLICE, 1, + [is_tcf_police is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if has is_tcf_tunnel_set && is_tcf_tunnel_release]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return is_tcf_tunnel_set(NULL) && is_tcf_tunnel_release(NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_TUNNEL, 1, + [is_tcf_tunnel_set and is_tcf_tunnel_release are defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if has netdev_notifier_info_to_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return netdev_notifier_info_to_dev(NULL) ? 1 : 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_NOTIFIER_INFO_TO_DEV, 1, + [netdev_notifier_info_to_dev is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if udp_tunnel.h has struct udp_tunnel_nic_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct udp_tunnel_nic_info x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UDP_TUNNEL_NIC_INFO, 1, + [udp_tunnel.h has struct udp_tunnel_nic_info is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm has register_netdevice_notifier_rh]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return register_netdevice_notifier_rh(NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REGISTER_NETDEVICE_NOTIFIER_RH, 1, + [register_netdevice_notifier_rh is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/netdevice.h has unregister_netdevice_notifier_net]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + unregister_netdevice_notifier_net(NULL,NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UNREGISTER_NETDEVICE_NOTIFIER_NET, 1, + [unregister_netdevice_notifier_net is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/netdevice.h has dev_xdp_prog_id]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dev_xdp_prog_id(NULL,0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEV_XDP_PROG_ID, 1, + [dev_xdp_prog_id is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct netdev_net_notifier exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_net_notifier notifier; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_NET_NOTIFIER, 1, + [struct netdev_net_notifier is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/netdevice.h has net_prefetch]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + net_prefetch(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_PREFETCH, 1, + [net_prefetch is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/pagemap.h has release_pages ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + release_pages(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RELEASE_PAGES, 1, + [release_pages is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/mm.h has get_user_pages_longterm]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages_longterm(0, 0, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_LONGTERM, 1, + [get_user_pages_longterm is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if get_user_pages has 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages(0, 0, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_5_PARAMS, 1, + [get_user_pages has 5 params]) + 
],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if get_user_pages has 7 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages(NULL, NULL, 0, 0, 0, NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_7_PARAMS, 1, + [get_user_pages has 7 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if string.h has memcpy_and_pad]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ], + [ + memcpy_and_pad(NULL, 0, NULL, 0, ' '); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MEMCPY_AND_PAD, 1, + [memcpy_and_pad is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if map_lock has mmap_read_lock]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmap_read_lock(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMAP_READ_LOCK, 1, + [map_lock has mmap_read_lock]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm has get_user_pages_remote with 7 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages_remote(NULL, NULL, 0, 0, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_REMOTE_7_PARAMS, 1, + [get_user_pages_remote is defined with 7 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm has get_user_pages_remote with 7 parameters and parameter 2 is integer]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages_remote(NULL, 0, 0, 0, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_REMOTE_7_PARAMS_AND_SECOND_INT, 1, + [get_user_pages_remote is defined with 7 parameters and parameter 2 is integer]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm has get_user_pages_remote with 8 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages_remote(NULL, NULL, 0, 0, 0, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_REMOTE_8_PARAMS, 1, + [get_user_pages_remote is defined with 8 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm has get_user_pages_remote with 8 parameters with locked]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages_remote(NULL, NULL, 0, 0, 0, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_REMOTE_8_PARAMS_W_LOCKED, 1, + [get_user_pages_remote is defined with 8 parameters with locked]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if kernel has ktime_get_ns]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + unsigned long long ns; + + ns = ktime_get_ns(); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KTIME_GET_NS, 1, + [ktime_get_ns defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if page_ref.h has page_ref_count/add/sub/inc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + page_ref_count(NULL); + page_ref_add(NULL, 0); + page_ref_sub(NULL, 0); + page_ref_inc(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PAGE_REF_COUNT_ADD_SUB_INC, 1, + [page_ref_count/add/sub/inc defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if kernel.h has int_pow]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return int_pow(2, 3); + + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INT_POW, 1, + [int_pow defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_get_devlink_port]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops ndops = { + 
.ndo_get_devlink_port = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_DEVLINK_PORT, 1, + [ndo_get_devlink_port is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_get_phys_port_name]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops ndops = { + .ndo_get_phys_port_name = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_PHYS_PORT_NAME, 1, + [ndo_get_phys_port_name is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h has devlink_set_features]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_set_features(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_SET_FEATURES, 1, + [devlink.h has devlink_set_features]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h has devlink_to_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_to_dev(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_TO_DEV, 1, + [devlink.h has devlink_to_dev]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h devl_port_register defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devl_port_register(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVL_PORT_REGISTER, 1, + [devlink.h devl_port_register defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h devl_trap_groups_register defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devl_trap_groups_register(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVL_TRAP_GROUPS_REGISTER, 1, + [devlink.h devl_trap_groups_register defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h devlink_param_register defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_param_register(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PARAM_REGISTER, 1, + [devlink.h devlink_param_register defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h has devlink_register get 1 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_register(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_REGISTER_GET_1_PARAMS, 1, + [devlink.h has devlink_register get 1 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h has devlink_alloc get 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_alloc(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_ALLOC_GET_3_PARAMS, 1, + [devlink.h has devlink_alloc get 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h has devlink_port_attrs_pci_sf_set get 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_sf_set(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS, 1, + [devlink.h has devlink_port_attrs_pci_sf_set get 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h has devlink_port_attrs_pci_sf_set get 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_sf_set(NULL, 0, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS, 1, + [devlink.h has devlink_port_attrs_pci_sf_set get 5 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h 
devlink_port_attrs_pci_vf_set get 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_vf_set(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_VF_SET_GET_3_PARAMS, 1, + [devlink.h devlink_port_attrs_pci_vf_set get 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_port_attrs_pci_vf_set has 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_vf_set(NULL, NULL, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_VF_SET_GET_5_PARAMS, 1, + [devlink_port_attrs_pci_vf_set has 5 params]) + ],[ + AC_MSG_RESULT(no) + AC_MSG_CHECKING([if devlink has devlink_port_attrs_pci_vf_set has 5 params and controller num]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_vf_set(NULL, 0, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_VF_SET_GET_CONTROLLER_NUM, 1, + [devlink_port_attrs_pci_vf_set has 5 params and controller num]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) + + AC_MSG_CHECKING([if devlink.h devlink_port_attrs_pci_pf_set get 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_pf_set(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_GET_2_PARAMS, 1, + [devlink.h devlink_port_attrs_pci_pf_set get 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink.h has devlink_fmsg_binary_pair_nest_start]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_fmsg_binary_pair_nest_start(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_FMSG_BINARY_PAIR_NEST_START, 1, + [devlink.h has devlink_fmsg_binary_pair_nest_start is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_flash_update_status_notify]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_flash_update_status_notify(NULL, NULL, NULL, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_FLASH_UPDATE_STATUS_NOTIFY, 1, + [devlink_flash_update_status_notify]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_flash_update_end_notify]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_flash_update_end_notify(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_FLASH_UPDATE_END_NOTIFY, 1, + [devlink_flash_update_end_notify]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_info_version_fixed_put]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_info_version_fixed_put(NULL, NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_INFO_VERSION_FIXED_PUT, 1, + [devlink_info_version_fixed_put exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_port_type_eth_set]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_type_eth_set(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_TYPE_ETH_SET, 1, + [devlink_port_type_eth_set exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_health_reporter_state_update]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_health_reporter_state_update(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HEALTH_REPORTER_STATE_UPDATE, 1, + [devlink_health_reporter_state_update exist]) + ],[ + AC_MSG_RESULT(no) 
+ ]) + + AC_MSG_CHECKING([if devlink_health_reporter_ops.recover has extack parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + static int reporter_recover(struct devlink_health_reporter *reporter, + void *context, + struct netlink_ext_ack *extack) + { + return 0; + } + ],[ + struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { + .recover = reporter_recover + } + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_HEALTH_REPORTER_RECOVER_HAS_EXTACK, 1, + [devlink_health_reporter_ops.recover has extack]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_param_driverinit_value_get]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_param_driverinit_value_get(NULL, 0, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_DRIVERINIT_VAL, 1, + [devlink_param_driverinit_value_get exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum has DEVLINK_PARAM_GENERIC_ID_MAX]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = DEVLINK_PARAM_GENERIC_ID_MAX; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PARAM_GENERIC_ID_MAX, 1, + [devlink enum has HAVE_DEVLINK_PARAM_GENERIC_ID_MAX]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum has DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, 1, + [devlink enum has DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum has HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, 1, + [devlink enum has HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink struct devlink_port exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_port i; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_STRUCT, 1, + [devlink struct devlink_port exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink struct devlink_port_new_attrs exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_port_new_attrs i; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_NEW_ATTRS_STRUCT, 1, + [devlink struct devlink_port_new_attrs exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_port_attrs_set has 7 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_set(NULL, 0, 0, 0, 0, NULL ,0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATRRS_SET_GET_7_PARAMS, 1, + [devlink_port_attrs_set has 7 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_port_attrs_set has 5 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_set(NULL, 0, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATRRS_SET_GET_5_PARAMS, 1, + [devlink_port_attrs_set has 5 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_port_attrs_set has 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_set(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATRRS_SET_GET_2_PARAMS, 1, + [devlink_port_attrs_set 
has 2 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum has DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, 1, + [struct devlink_param exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum devlink_port_flavour exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum devlink_port_flavour flavour; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_FLAVOUR, 1, + [enum devlink_port_flavour exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum devlink_port_fn_state exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum devlink_port_fn_state fn_state; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_FN_STATE, 1, + [enum devlink_port_fn_state exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum devlink_port_fn_opstate exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum devlink_port_fn_opstate fn_opstate; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_FN_OPSTATE, 1, + [enum devlink_port_fn_opstate exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum has DEVLINK_PORT_FLAVOUR_VIRTUAL]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = DEVLINK_PORT_FLAVOUR_VIRTUAL; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_FLAVOUR_VIRTUAL, 1, + [enum DEVLINK_PORT_FLAVOUR_VIRTUAL is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink enum has DEVLINK_PORT_FLAVOUR_PCI_SF]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = DEVLINK_PORT_FLAVOUR_PCI_SF; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_FLAVOUR_PCI_SF, 1, + [enum DEVLINK_PORT_FLAVOUR_PCI_SF is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_param exist in net/devlink.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_param soso; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PARAM, 1, + [struct devlink_param exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_reload_disable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_reload_disable(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_RELOAD_DISABLE, 1, + [devlink_reload_disable exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_reload_enable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_reload_enable(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_RELOAD_ENABLE, 1, + [devlink_reload_enable exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_net]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_net(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_NET, 1, + [devlink_net exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has reload has 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int devlink_reload(struct devlink *devlink, + struct netlink_ext_ack *extack) + { + return 0; + } + + ],[ + struct devlink_ops dlops = { + .reload = devlink_reload, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_RELOAD, 1, + [reload is defined]) + ],[ + 
AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has reload_up/down]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .reload_up = NULL, + .reload_down = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_RELOAD_UP_DOWN, 1, + [reload_up/down is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_ops.port_function_hw_addr_get has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int devlink_port_function_hw_addr_get(struct devlink_port *port, u8 *hw_addr, + int *hw_addr_len, + struct netlink_ext_ack *extack) + { + return 0; + } + + ],[ + struct devlink_ops dlops = { + .port_function_hw_addr_get = devlink_port_function_hw_addr_get, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PORT_FUNCTION_HW_ADDR_GET_GET_4_PARAM, 1, + [port_function_hw_addr_get has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_ops.port_function_state_get has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int mlx5_devlink_sf_port_fn_state_get(struct devlink_port *dl_port, + enum devlink_port_fn_state *state, + enum devlink_port_fn_opstate *opstate, + struct netlink_ext_ack *extack) + { + return 0; + } + + ],[ + struct devlink_ops dlops = { + .port_fn_state_get = mlx5_devlink_sf_port_fn_state_get, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PORT_FUNCTION_STATE_GET_4_PARAM, 1, + [port_function_state_get has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has port_function_state_get/set]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .port_fn_state_get = NULL, + .port_fn_state_set = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_PORT_FUNCTION_STATE_GET, 1, + [port_function_state_get/set is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_ops.reload_down has 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int devlink_reload_down(struct devlink *devlink, bool netns_change, + struct netlink_ext_ack *extack) + { + return 0; + } + + ],[ + struct devlink_ops dlops = { + .reload_down = devlink_reload_down, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_RELOAD_DOWN_HAS_3_PARAMS, 1, + [reload_down has 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_ops.reload_down has 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int devlink_reload_down(struct devlink *devlink, bool netns_change, + enum devlink_reload_action action, enum devlink_reload_limit limit, + struct netlink_ext_ack *extack) + { + return 0; + } + + ],[ + struct devlink_ops dlops = { + .reload_down = devlink_reload_down, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_RELOAD_DOWN_SUPPORT_RELOAD_ACTION, 1, + [reload_down has 5 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has info_get]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .info_get = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_INFO_GET, 1, + [info_get is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink struct devlink_trap exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_trap t; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_TRAP_SUPPORT, 1, + [devlink struct 
devlink_trap exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int n = DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_TRAP_DMAC_FILTER, 1, + [devlink has DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_ops.trap_action_set has 4 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int mlx5_devlink_trap_action_set(struct devlink *devlink, + const struct devlink_trap *trap, + enum devlink_trap_action action, + struct netlink_ext_ack *extack) + { + return 0; + } + ],[ + struct devlink_ops dlops = { + .trap_action_set = mlx5_devlink_trap_action_set, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_TRAP_ACTION_SET_4_ARGS, 1, + [devlink_ops.trap_action_set has 4 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_trap_report has 5 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_trap_report(NULL, NULL, NULL, NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_TRAP_REPORT_5_ARGS, 1, + [devlink_trap_report has 5 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has DEVLINK_TRAP_GROUP_GENERIC with 2 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + static const struct devlink_trap_group mlx5_trap_groups_arr[] = { + DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_TRAP_GROUP_GENERIC_2_ARGS, 1, + [devlink has DEVLINK_TRAP_GROUP_GENERIC with 2 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_trap_groups_register]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_trap_groups_register(NULL, NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_TRAP_GROUPS_REGISTER, 1, + [devlink has devlink_trap_groups_register]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_port_health_reporter_create]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_health_reporter *r; + + r = devlink_port_health_reporter_create(NULL, NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_HEALTH_REPORTER_CREATE, 1, + [devlink_health_reporter_create is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_health_reporter_create with 5 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_health_reporter *r; + + r = devlink_health_reporter_create(NULL, NULL, 0, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HEALTH_REPORTER_CREATE_5_ARGS, 1, + [devlink_health_reporter_create has 5 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_health_reporter_create with 4 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_health_reporter *r; + + r = devlink_health_reporter_create(NULL, NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HEALTH_REPORTER_CREATE_4_ARGS, 1, + [devlink_health_reporter_create has 4 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_health_reporter & devlink_fmsg]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + /* test for devlink_health_reporter and devlink_fmsg */ + struct devlink_health_reporter *r; + struct devlink_fmsg *fmsg; + int err; + + 
devlink_health_reporter_destroy(r); + devlink_health_reporter_priv(r); + + err = devlink_health_report(r, NULL, NULL); + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "name"); + err = devlink_fmsg_arr_pair_nest_end(fmsg); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HEALTH_REPORT_BASE_SUPPORT, 1, + [structs devlink_health_reporter & devlink_fmsg exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_fmsg_binary_put]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_fmsg *fmsg; + int err; + int value; + + err = devlink_fmsg_binary_put(fmsg, &value, 2); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_FMSG_BINARY_PUT, 1, + [devlink_fmsg_binary_put exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_fmsg_binary_pair_put]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + /* Only interested in function with arg u32 and not u16 */ + /* See upstream commit e2cde864a1d3e3626bfc8fa088fbc82b04ce66ed */ + int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, const void *value, u32 value_len); + ],[ + struct devlink_fmsg *fmsg; + int err; + int value; + + err = devlink_fmsg_binary_pair_put(fmsg, "name", &value, 2); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_FMSG_BINARY_PAIR_PUT_ARG_U32, 1, + [devlink_fmsg_binary_pair_put exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has eswitch_mode_get/set]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .eswitch_mode_get = NULL, + .eswitch_mode_set = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_ESWITCH_MODE_GET_SET, 1, + [eswitch_mode_get/set is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops.eswitch_mode_set has extack]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) { + return 0; + } + ],[ + static const struct devlink_ops dlops = { + .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, + }; + dlops.eswitch_mode_set(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK, 1, + [struct devlink_ops.eswitch_mode_set has extack]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has port_function_hw_addr_get/set]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .port_function_hw_addr_get = NULL, + .port_function_hw_addr_set = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_PORT_FUNCTION_HW_ADDR_GET, 1, + [port_function_hw_addr_get/set is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has rate functions]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .rate_leaf_tx_share_set = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_RATE_FUNCTIONS, 1, + [rate functions are defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has eswitch_encap_mode_set/get]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .eswitch_encap_mode_set = NULL, + .eswitch_encap_mode_get = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET, 1, + [eswitch_encap_mode_set/get is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
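The devlink_ops probes above only set HAVE_* preprocessor symbols; driver code is then expected to select whichever callback prototype the running kernel uses. A hedged sketch of that consumer pattern for the eswitch_mode_set extack variant is shown below (the function names and body are placeholders, not vendor code; only the HAVE_* symbols come from the checks above).

#include <net/devlink.h>	/* struct devlink_ops, struct netlink_ext_ack */

/* Sketch only: pick the eswitch_mode_set prototype matching the kernel,
 * based on the defines produced by the probes above. */
#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK
static int example_eswitch_mode_set(struct devlink *devlink, u16 mode,
				    struct netlink_ext_ack *extack)
#else
static int example_eswitch_mode_set(struct devlink *devlink, u16 mode)
#endif
{
	return 0;	/* real mode-switch logic would go here */
}

static const struct devlink_ops example_devlink_ops = {
#ifdef HAVE_DEVLINK_HAS_ESWITCH_MODE_GET_SET
	.eswitch_mode_set = example_eswitch_mode_set,
#endif
};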
AC_MSG_CHECKING([if struct devlink_ops defines eswitch_encap_mode_set/get with enum arg]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + int local_eswitch_encap_mode_get(struct devlink *devlink, + enum devlink_eswitch_encap_mode *p_encap_mode) { + return 0; + } + int local_eswitch_encap_mode_set(struct devlink *devlink, + enum devlink_eswitch_encap_mode encap_mode, + struct netlink_ext_ack *extack) { + return 0; + } + + struct devlink_ops dlops = { + .eswitch_encap_mode_set = local_eswitch_encap_mode_set, + .eswitch_encap_mode_get = local_eswitch_encap_mode_get, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET_GET_WITH_ENUM, 1, + [eswitch_encap_mode_set/get is defined with enum]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if struct devlink_ops has eswitch_inline_mode_get/set]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .eswitch_inline_mode_get = NULL, + .eswitch_inline_mode_set = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_ESWITCH_INLINE_MODE_GET_SET, 1, + [eswitch_inline_mode_get/set is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops has flash_update]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_ops dlops = { + .flash_update = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_HAS_FLASH_UPDATE, 1, + [flash_update is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct devlink_ops flash_update get 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + static int flash_update_func(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) + { + return 0; + } + ],[ + struct devlink_ops dlops = { + .flash_update = flash_update_func, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLASH_UPDATE_GET_3_PARAMS, 1, + [struct devlink_ops flash_update get 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink has devlink_port_attrs_pci_pf_set has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_pf_set(NULL, NULL, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_4_PARAMS, 1, + [devlink_port_attrs_pci_pf_set has 4 params]) + ],[ + AC_MSG_RESULT(no) + AC_MSG_CHECKING([if devlink has devlink_port_attrs_pci_pf_set has 4 params and controller num]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_pf_set(NULL, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_CONTROLLER_NUM, 1, + [devlink_port_attrs_pci_pf_set has 4 params and controller num]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) + + AC_MSG_CHECKING([if devlink has devlink_port_attrs_pci_pf_set has 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + devlink_port_attrs_pci_pf_set(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PORT_ATTRS_PCI_PF_SET_2_PARAMS, 1, + [devlink_port_attrs_pci_pf_set has 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if devlink_flash_update_params has struct firmware fw]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct devlink_flash_update_params *x; + x->fw = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_FLASH_UPDATE_PARAMS_HAS_STRUCT_FW, 1, + [devlink_flash_update_params has struct firmware fw]) + ],[ + 
AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ifla_vf_info has vlan_proto]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ifla_vf_info *ivf; + + ivf->vlan_proto = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VF_VLAN_PROTO, 1, + [vlan_proto is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if IP6_ECN_set_ce has 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + IP6_ECN_set_ce(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IP6_SET_CE_2_PARAMS, 1, + [IP6_ECN_set_ce has 2 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if exists netif_carrier_event]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netif_carrier_event(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_CARRIER_EVENT, 1, + [netif_carrier_event exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netif_device_present get const]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct net_device *dev; + netif_device_present(dev); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_DEVICE_PRESENT_GET_CONST, 1, + [netif_device_present get const]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_master_upper_dev_link gets 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_master_upper_dev_link(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_MASTER_UPPER_DEV_LINK_4_PARAMS, 1, + [netdev_master_upper_dev_link gets 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device has lower_level]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device dev; + + dev.lower_level = 1; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_LOWER_LEVEL, 1, + [struct net_device has lower_level]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_lag_hash has NETDEV_LAG_HASH_VLAN_SRCMAC]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = NETDEV_LAG_HASH_VLAN_SRCMAC; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_LAG_HASH_VLAN_SRCMAC, 1, + [netdev_lag_hash has NETDEV_LAG_HASH_VLAN_SRCMAC]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi ethtool.h has IPV6_USER_FLOW]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = IPV6_USER_FLOW; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IPV6_USER_FLOW, 1, + [uapi ethtool has IPV6_USER_FLOW]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi ethtool.h has FLOW_RSS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = FLOW_RSS; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_RSS, 1, + [uapi ethtool has FLOW_RSS]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ethtool_ops has supported_coalesce_params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .supported_coalesce_params = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SUPPORTED_COALESCE_PARAM, 1, + [supported_coalesce_params is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ethtool_ops has get/set_tunable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .get_tunable = NULL, + .set_tunable = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_SET_TUNABLE, 1, + [get/set_tunable is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
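As with the devlink checks, the two ethtool_ops results just above are consumed as compile-time switches in the driver's ethtool table. The sketch below is illustrative only (made-up table name, NULL placeholders instead of real callbacks) and assumes the ETHTOOL_COALESCE_* flags that appeared in the kernel together with the supported_coalesce_params field.

#include <linux/ethtool.h>

/* Sketch only: populate optional ethtool_ops fields when the probes say
 * the running kernel has them. */
static const struct ethtool_ops example_ethtool_ops = {
#ifdef HAVE_SUPPORTED_COALESCE_PARAM
	/* declare which coalescing knobs the driver honours */
	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
				     ETHTOOL_COALESCE_MAX_FRAMES,
#endif
#ifdef HAVE_GET_SET_TUNABLE
	.get_tunable = NULL,	/* would point at the driver's callbacks */
	.set_tunable = NULL,
#endif
};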
AC_MSG_CHECKING([if struct ethtool_ops has get_module_eeprom_by_page]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .get_module_eeprom_by_page = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_MODULE_EEPROM_BY_PAGE, 1, + [ethtool_ops has get_module_eeprom_by_page]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool.h has __ethtool_get_link_ksettings]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __ethtool_get_link_ksettings(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___ETHTOOL_GET_LINK_KSETTINGS, 1, + [__ethtool_get_link_ksettings is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device has min/max]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev = NULL; + + dev->min_mtu = 0; + dev->max_mtu = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_MIN_MAX_MTU, 1, + [net_device min/max is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device has needs_free_netdev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev = NULL; + + dev->needs_free_netdev = true; + dev->priv_destructor = NULL; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_NEEDS_FREE_NETDEV, 1, + [net_device needs_free_netdev is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device has close_list]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev = NULL; + struct list_head xlist; + + dev->close_list = xlist; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_HAS_CLOSE_LIST, 1, + [net_device close_list is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device has lower_level]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev = NULL; + + dev->lower_level = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_HAS_LOWER_LEVEL, 1, + [lower_level is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h has struct tls_offload_resync_async]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tls_offload_resync_async x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLS_OFFLOAD_RESYNC_ASYNC_STRUCT, 1, + [net/tls.h has struct tls_offload_resync_async is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ktls related structs exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct tlsdev_ops dev; + struct tls_offload_context_tx tx_ctx; + struct tls12_crypto_info_aes_gcm_128 crypto_info; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KTLS_STRUCTS, 1, + [ktls related structs exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tlsdev_ops has tls_dev_resync_rx]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tlsdev_ops dev; + + dev.tls_dev_resync_rx = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC_RX, 1, + [struct tlsdev_ops has tls_dev_resync_rx]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tlsdev_ops has tls_dev_resync]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tlsdev_ops dev; + + dev.tls_dev_resync = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC, 1, + [struct tlsdev_ops has tls_dev_resync]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
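The two tlsdev_ops probes just above exist because the kTLS RX resync hook changed name between kernel releases; offload code then keys off whichever HAVE_* symbol the probe set. A minimal, hypothetical sketch of that selection (placeholder ops table, no real callbacks):

#include <net/tls.h>

#ifdef HAVE_KTLS_STRUCTS
/* Sketch only: wire up whichever resync member this kernel's tlsdev_ops has. */
static const struct tlsdev_ops example_tlsdev_ops = {
#if defined(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC)
	.tls_dev_resync    = NULL,	/* newer kernels: unified resync hook */
#elif defined(HAVE_TLSDEV_OPS_HAS_TLS_DEV_RESYNC_RX)
	.tls_dev_resync_rx = NULL,	/* older kernels: RX-only hook */
#endif
};
#endif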
AC_MSG_CHECKING([if skb_frag_off_add exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + skb_frag_off_add(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_FRAG_OFF_ADD, 1, + [linux/skbuff.h skb_frag_off_add is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct netdev_xdp exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_xdp xdp; + xdp = xdp; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_XDP, 1, + [struct netdev_xdp is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_xdp]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops netdev_ops = { + .ndo_bpf = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_XDP, 1, + [net_device_ops has ndo_xdp is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if struct net_device_ops has ndo_xdp_xmit]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops netdev_ops = { + .ndo_xdp_xmit = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_XDP_XMIT, 1, + [net_device_ops has ndo_xdp_xmit is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_xdp_flush]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops netdev_ops = { + .ndo_xdp_flush = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_XDP_FLUSH, 1, + [ndo_xdp_flush is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_xsk_wakeup]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops netdev_ops = { + .ndo_xsk_wakeup = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_XSK_WAKEUP, 1, + [ndo_xsk_wakeup is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended has ndo_xdp]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops_extended netdev_ops_extended = { + .ndo_xdp = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_XDP_EXTENDED, 1, + [extended ndo_xdp is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if enum tc_htb_command exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum tc_htb_command x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ENUM_TC_HTB_COMMAND, 1, + [enum tc_htb_command is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_mqprio_qopt_offload exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_mqprio_qopt_offload x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_MQPRIO_QOPT_OFFLOAD, 1, + [tc_mqprio_qopt_offload is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_cls_flower_offload exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_cls_flower_offload x; + x = x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_FLOWER_OFFLOAD, 1, + [struct tc_cls_flower_offload is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_block_offload exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_block_offload x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_BLOCK_OFFLOAD, 1, + [struct tc_block_offload is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_block_offload exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + 
struct flow_block_offload x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_BLOCK_OFFLOAD, 1, + [struct flow_block_offload exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_block_offload hash unlocked_driver_cb]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_block_offload x; + x.unlocked_driver_cb = true; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UNLOCKED_DRIVER_CB, 1, + [struct flow_block_offload has unlocked_driver_cb]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct netdev_notifier_info has extack]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct netdev_notifier_info *x; + x->extack = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_NOTIFIER_INFO_EXTACK, 1, + [struct netdev_notifier_info has extack]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct netlink_ext_ack exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netlink_ext_ack extack; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETLINK_EXTACK, 1, + [struct netlink_ext_ack exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_cls_common_offload has extack]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_cls_common_offload x; + x.extack = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLS_OFFLOAD_EXTACK, 1, + [struct tc_cls_common_offload has extack]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_block_offload has extack]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_block_offload x; + x.extack = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_BLOCK_OFFLOAD_EXTACK, 1, + [struct tc_block_offload has extack]) + ],[ + AC_MSG_RESULT(no) + ]) + + BP_CHECK_RHTABLE + + AC_MSG_CHECKING([if struct ptp_clock_info has gettimex64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ptp_clock_info info = { + .gettimex64 = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GETTIMEX64, 1, + [gettimex64 is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ptp_clock_info has gettime]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ptp_clock_info info = { + .gettime = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PTP_CLOCK_INFO_GETTIME_32BIT, 1, + [gettime 32bit is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h pci_bus_addr_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_bus_addr_t x = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_BUS_ADDR_T, 1, + [pci_bus_addr_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has page_is_pfmemalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bool x = page_is_pfmemalloc(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PAGE_IS_PFMEMALLOC, 1, + [page_is_pfmemalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has want_init_on_alloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bool x = want_init_on_alloc(__GFP_ZERO); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_WANT_INIT_ON_ALLOC, 1, + [want_init_on_alloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct page has pfmemalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct page *page; + page->pfmemalloc = true; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_PAGE_PFMEMALLOC, 1, + [pfmemalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has select_queue_fallback_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + select_queue_fallback_t fallback; + + fallback = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SELECT_QUEUE_FALLBACK_T, 1, + [select_queue_fallback_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h has skb_frag_off]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + skb_frag_off(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_FRAG_OFF, 1, + [skb_frag_off is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h has dev_page_is_reusable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dev_page_is_reusable(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEV_PAGE_IS_REUSABLE, 1, + [dev_page_is_reusable is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if gfp.h has gfpflags_allow_blocking]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + gfpflags_allow_blocking(0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_HAS_GFPFLAGES_ALLOW_BLOCKING, 1, + [gfpflags_allow_blocking is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if gfp.h has __GFP_DIRECT_RECLAIM]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + gfp_t gfp_mask = __GFP_DIRECT_RECLAIM; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_HAS_GFP_DIRECT_RECLAIM, 1, + [__GFP_DIRECT_RECLAIM is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h skb_flow_dissect]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + skb_flow_dissect(NULL, NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_FLOW_DISSECT, 1, + [skb_flow_dissect is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/pkt_cls.h has tc_skb_ext_alloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct sk_buff skb; + + tc_skb_ext_alloc(&skb); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SKB_EXT_ALLOC, 1, + [tc_skb_ext_alloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h dev_change_flags has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dev_change_flags(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEV_CHANGE_FLAGS_HAS_3_PARAMS, 1, + [dev_change_flags has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uaccess.h access_ok has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + access_ok(0, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ACCESS_OK_HAS_3_PARAMS, 1, + [access_ok has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h put_user_pages_dirty_lock has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + put_user_pages_dirty_lock(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PUT_USER_PAGES_DIRTY_LOCK_3_PARAMS, 1, + [put_user_pages_dirty_lock has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h put_user_pages_dirty_lock has 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + put_user_pages_dirty_lock(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PUT_USER_PAGES_DIRTY_LOCK_2_PARAMS, 1, + [put_user_pages_dirty_lock has 2 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h 
skb_flow_dissect_flow_keys has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct sk_buff *skb; + struct flow_keys *flow; + + skb_flow_dissect_flow_keys(skb, flow, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_3_PARAMS, 1, + [skb_flow_dissect_flow_keys has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h skb_flow_dissect_flow_keys has 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + skb_flow_dissect_flow_keys(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_FLOW_DISSECT_FLOW_KEYS_HAS_2_PARAMS, 1, + [skb_flow_dissect_flow_keys has 2 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ptp_classify.h has ptp_classify_raw]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ptp_classify_raw(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PTP_CLASSIFY_RAW, 1, + [ptp_classify_raw is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has enum NAPI_STATE_MISSED]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int napi = NAPI_STATE_MISSED; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NAPI_STATE_MISSED, 1, + [NAPI_STATE_MISSED is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bitfield.h exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BITFIELD_H, 1, + [bitfield.h exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_H, 1, + [flow_dissector.h exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h has struct flow_dissector_mpls_lse]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_dissector_mpls_lse ls; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_MPLS_LSE, 1, + [flow_dissector.h has struct flow_dissector_mpls_lse]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if flow_dissector.h has FLOW_DISSECTOR_KEY_ENC_IP]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int n = FLOW_DISSECTOR_KEY_ENC_IP; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_ENC_IP, 1, + [flow_dissector.h has FLOW_DISSECTOR_KEY_ENC_IP]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h has FLOW_DISSECTOR_KEY_ENC_CONTROL]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int n = FLOW_DISSECTOR_KEY_ENC_CONTROL; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_ENC_CONTROL, 1, + [flow_dissector.h has FLOW_DISSECTOR_KEY_ENC_CONTROL]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h has FLOW_DISSECTOR_KEY_MPLS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int n = FLOW_DISSECTOR_KEY_MPLS; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_MPLS, 1, + [flow_dissector.h has FLOW_DISSECTOR_KEY_MPLS]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h has dissector_uses_key]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dissector_uses_key(NULL, 1); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_USES_KEY, 1, + [flow_dissector.h has dissector_uses_key]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h has FLOW_DISSECTOR_KEY_ENC_KEYID]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + 
int n = FLOW_DISSECTOR_KEY_ENC_KEYID; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_ENC_KEYID, 1, + [flow_dissector.h has FLOW_DISSECTOR_KEY_ENC_KEYID]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if call_switchdev_notifiers has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + call_switchdev_notifiers(0, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CALL_SWITCHDEV_NOTIFIERS_4_PARAMS, 1, + [call_switchdev_notifiers is defined with 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if enum switchdev_notifier_type has SWITCHDEV_PORT_ATTR_SET]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum switchdev_notifier_type xx = SWITCHDEV_PORT_ATTR_SET; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SWITCHDEV_PORT_ATTR_SET, 1, + [SWITCHDEV_PORT_ATTR_SET is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if switchdev.h has struct switchdev_ops]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct switchdev_ops x; + struct net_device *ndev; + + ndev->switchdev_ops = &x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SWITCHDEV_OPS, 1, + [HAVE_SWITCHDEV_OPS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct switchdev_obj_port_vlan has vid]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct switchdev_obj_port_vlan x; + x.vid = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_SWITCHDEV_OBJ_PORT_VLAN_VID, 1, + [struct switchdev_obj_port_vlan has vid]) + ],[ + AC_MSG_RESULT(no) + ]) + AC_MSG_CHECKING([if struct switchdev_brport_flags exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct switchdev_brport_flags x; + x.mask = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_SWITCHDEV_BRPORT_FLAGS, 1, + [struct switchdev_brport_flags exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if switchdev.h has switchdev_port_same_parent_id]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + switchdev_port_same_parent_id(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SWITCHDEV_PORT_SAME_PARENT_ID, 1, + [switchdev_port_same_parent_id is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct sk_buff has xmit_more]) + case $LINUXRELEASE in + 3\.1[[0-7]]*fbk*|2*fbk*) + AC_MSG_RESULT(Not checking xmit_more support for fbk kernel: $LINUXRELEASE) + ;; + *) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sk_buff *skb; + skb->xmit_more = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SK_BUFF_XMIT_MORE, 1, + [xmit_more is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + ;; + esac + + AC_MSG_CHECKING([if struct sk_buff has decrypted]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sk_buff *skb; + skb->decrypted = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SK_BUFF_DECRYPTED, 1, + [decrypted is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xfrm_state_offload has real_dev as member]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xfrm_state_offload x = { + .real_dev = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_REAL_DEV, 1, + [xfrm_state_offload has real_dev as member]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xfrm_state_offload exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xfrm_state_offload x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_XFRM_STATE_OFFLOAD, 1, + [xfrm_state_offload exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xfrm.h has xfrm_state_expire]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xfrm_state_expire(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XFRM_STATE_EXPIRE, 1, + [xfrm_state_expire is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if secpath_set returns struct sec_path *]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sec_path *temp = secpath_set(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SECPATH_SET_RETURN_POINTER, 1, + [if secpath_set returns struct sec_path *]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if eth_get_headlen has 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + eth_get_headlen(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETH_GET_HEADLEN_3_PARAMS, 1, + [eth_get_headlen is defined with 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if eth_get_headlen has 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + eth_get_headlen(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETH_GET_HEADLEN_2_PARAMS, 1, + [eth_get_headlen is defined with 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct skbuff.h has napi_consume_skb]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + napi_consume_skb(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NAPI_CONSUME_SKB, 1, + [napi_consume_skb is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct skbuff.h has skb_inner_transport_offset]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + skb_inner_transport_offset(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_INNER_TRANSPORT_OFFSET, 1, + [skb_inner_transport_offset is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if if_vlan.h has vlan_get_encap_level]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + vlan_get_encap_level(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VLAN_GET_ENCAP_LEVEL, 1, + [vlan_get_encap_level is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct vlan_ethhdr has addrs member]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct vlan_ethhdr vhdr = { + .addrs = {0}, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VLAN_ETHHDR_HAS_ADDRS, 1, + [struct vlan_ethhdr has addrs member]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_select_queue has accel_priv]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static u16 select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) + { + return 0; + } + ],[ + struct net_device_ops ndops = { + .ndo_select_queue = select_queue, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SELECT_QUEUE_HAS_3_PARMS_NO_FALLBACK, 1, + [ndo_select_queue has 3 params with no fallback]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_select_queue has a second net_device parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static u16 select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) + { + return 0; + } + ],[ + struct net_device_ops ndops = { + .ndo_select_queue = select_queue, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SELECT_QUEUE_NET_DEVICE, 1, + [ndo_select_queue has a second 
net_device parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/trace/trace_events.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #undef TRACE_INCLUDE_PATH + #undef TRACE_INCLUDE_FILE + #undef TRACE_INCLUDE + #define TRACE_INCLUDE(a) "/dev/null" + + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TRACE_EVENTS_H, 1, + [include/trace/trace_events.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/count_zeros.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LINUX_COUNT_ZEROS_H, 1, + [include/linux/count_zeros.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/bits.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BITS_H, 1, + [include/linux/bits.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/build_bug.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BUILD_BUG_H, 1, + [include/linux/build_bug.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/net/devlink.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_H, 1, + [include/net/devlink.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if enum devlink_param_cmode exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum devlink_param_cmode p; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVLINK_PARAM_CMODE, 1, + [enum devlink_param_cmode exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/net/switchdev.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SWITCHDEV_H, 1, + [include/net/switchdev.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_vlan.h has is_tcf_vlan]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_vlan(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_VLAN, 1, + [is_tcf_vlan is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_vlan.h has tcf_vlan_push_prio]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_vlan_push_prio(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_VLAN_PUSH_PRIO, 1, + [tcf_vlan_push_prio is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h enum flow_dissector_key_keyid has FLOW_DISSECTOR_KEY_VLAN]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_dissector_key_id keyid = FLOW_DISSECTOR_KEY_VLAN; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_VLAN, 1, + [FLOW_DISSECTOR_KEY_VLAN is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h enum flow_dissector_key_keyid has FLOW_DISSECTOR_KEY_CVLAN]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_dissector_key_id keyid = FLOW_DISSECTOR_KEY_CVLAN; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_CVLAN, 1, + [FLOW_DISSECTOR_KEY_CVLAN is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h enum flow_dissector_key_keyid has FLOW_DISSECTOR_KEY_IP]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_dissector_key_id keyid = FLOW_DISSECTOR_KEY_IP; + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_IP, 1, + [FLOW_DISSECTOR_KEY_IP is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h enum flow_dissector_key_keyid has FLOW_DISSECTOR_KEY_TCP]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_dissector_key_id keyid = FLOW_DISSECTOR_KEY_TCP; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_TCP, 1, + [FLOW_DISSECTOR_KEY_TCP is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if FLOW_ACTION_PRIORITY exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_action_id action = FLOW_ACTION_PRIORITY; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_PRIORITY, 1, + [FLOW_ACTION_PRIORITY exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if HAVE_FLOW_OFFLOAD_ACTION exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_offload_action act = {}; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_OFFLOAD_ACTION, 1, + [HAVE_FLOW_OFFLOAD_ACTION exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_offload_has_one_action exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_action action; + + flow_offload_has_one_action(&action); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_OFFLOAD_HAS_ONE_ACTION, 1, + [flow_offload_has_one_action exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h enum flow_dissector_key_keyid has FLOW_DISSECTOR_KEY_ENC_IP]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_dissector_key_id keyid = FLOW_DISSECTOR_KEY_ENC_IP; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_ENC_IP, 1, + [FLOW_DISSECTOR_KEY_ENC_IP is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_set_tx_maxrate]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops x = { + .ndo_set_tx_maxrate = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SET_TX_MAXRATE, 1, + [ndo_set_tx_maxrate is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended has *ndo_set_tx_maxrate]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops_extended x = { + .ndo_set_tx_maxrate = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SET_TX_MAXRATE_EXTENDED, 1, + [extended ndo_set_tx_maxrate is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended has *ndo_change_mtu_extended]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops_extended x = { + .ndo_change_mtu = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_CHANGE_MTU_EXTENDED, 1, + [extended ndo_change_mtu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_change_mtu_rh74]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops x = { + .ndo_change_mtu_rh74 = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_CHANGE_MTU_RH74, 1, + [ndo_change_mtu_rh74 is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_extended has min/max_mtu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_extended x = { + .min_mtu = 0, + .max_mtu = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_MIN_MAX_MTU_EXTENDED, 1, +
[extended min/max_mtu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_setup_tc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops x = { + .ndo_setup_tc = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SETUP_TC, 1, + [ndo_setup_tc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended has has *ndo_setup_tc_rh]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops_extended x = { + .ndo_setup_tc_rh = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SETUP_TC_RH_EXTENDED, 1, + [ndo_setup_tc_rh is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_setup_tc takes 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int mlx4_en_setup_tc(struct net_device *dev, u32 handle, + __be16 protocol, struct tc_to_netdev *tc) + { + return 0; + } + ],[ + struct net_device_ops x = { + .ndo_setup_tc = mlx4_en_setup_tc, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SETUP_TC_4_PARAMS, 1, + [ndo_setup_tc takes 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_setup_tc takes chain_index]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int mlx_en_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, + __be16 protocol, struct tc_to_netdev *tc) + { + return 0; + } + ],[ + struct net_device_ops x = { + .ndo_setup_tc = mlx_en_setup_tc, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SETUP_TC_TAKES_CHAIN_INDEX, 1, + [ndo_setup_tc takes chain_index]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_setup_tc takes tc_setup_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int mlx_en_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) + { + return 0; + } + ],[ + struct net_device_ops x = { + .ndo_setup_tc = mlx_en_setup_tc, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SETUP_TC_TAKES_TC_SETUP_TYPE, 1, + [ndo_setup_tc takes tc_setup_type]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tcf_exts_to_list]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_exts_to_list(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_EXTS_TO_LIST, 1, + [tcf_exts_to_list is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tc_setup_flow_action]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tc_setup_flow_action(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_FLOW_ACTION_FUNC, 1, + [tc_setup_flow_action is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tc_setup_offload_action]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tc_setup_offload_action(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_OFFLOAD_ACTION_FUNC, 1, + [tc_setup_offload_action is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tc_setup_offload_action get 3 param]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tc_setup_offload_action(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_OFFLOAD_ACTION_FUNC_HAS_3_PARAM, 1, + [tc_setup_offload_action is defined and get 3 param]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tc_setup_flow_action with rtnl_held]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + 
tc_setup_flow_action(NULL, NULL, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_FLOW_ACTION_WITH_RTNL_HELD, 1, + [tc_setup_flow_action has rtnl_held]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tcf_queue_work]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_queue_work(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_QUEUE_WORK, 1, + [tcf_queue_work is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tcf_exts_init]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_exts_init(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_EXTS_INIT, 1, + [tcf_exts_init is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has tcf_exts_get_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_exts_get_dev(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_EXTS_GET_DEV, 1, + [tcf_exts_get_dev is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has __tc_indr_block_cb_register]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __tc_indr_block_cb_register(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___TC_INDR_BLOCK_CB_REGISTER, 1, + [__tc_indr_block_cb_register is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h has TC_CLSMATCHALL_STATS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum tc_matchall_command x = TC_CLSMATCHALL_STATS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLSMATCHALL_STATS, 1, + [TC_CLSMATCHALL_STATS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have __flow_indr_block_cb_register]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __flow_indr_block_cb_register(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___FLOW_INDR_BLOCK_CB_REGISTER, 1, + [__flow_indr_block_cb_register is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have flow_cls_offload_flow_rule]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_cls_offload_flow_rule(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_CLS_OFFLOAD_FLOW_RULE, 1, + [flow_cls_offload_flow_rule is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have flow_block_cb_setup_simple]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_block_cb_setup_simple(NULL, NULL, NULL, NULL, NULL, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_BLOCK_CB_SETUP_SIMPLE, 1, + [flow_block_cb_setup_simple is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have flow_block_cb_alloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_block_cb_alloc(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_BLOCK_CB_ALLOC, 1, + [flow_block_cb_alloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have flow_setup_cb_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_setup_cb_t *cb = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_SETUP_CB_T, 1, + [flow_setup_cb_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have netif_is_gretap]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct net_device dev = {}; + + netif_is_gretap(&dev); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_GRETAP, 1, + 
[netif_is_gretap is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have netif_is_vxlan]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device dev = {}; + + netif_is_vxlan(&dev); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_VXLAN, 1, + [netif_is_vxlan is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_mirred.h has is_tcf_mirred_redirect]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_mirred_redirect(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_MIRRED_REDIRECT, 1, + [is_tcf_mirred_redirect is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_mirred.h has is_tcf_mirred_egress_redirect]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_mirred_egress_redirect(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_MIRRED_EGRESS_REDIRECT, 1, + [is_tcf_mirred_egress_redirect is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_mirred.h has is_tcf_mirred_mirror]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_mirred_mirror(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_MIRRED_MIRROR, 1, + [is_tcf_mirred_mirror is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_mirred.h has is_tcf_mirred_egress_mirror]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_mirred_egress_mirror(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_MIRRED_EGRESS_MIRROR, 1, + [is_tcf_mirred_egress_mirror is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_mirred.h has tcf_mirred_ifindex]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_mirred_ifindex(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_MIRRED_IFINDEX, 1, + [tcf_mirred_ifindex is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_mirred.h has tcf_mirred_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_mirred_dev(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_MIRRED_DEV, 1, + [tcf_mirred_dev is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/ipv6_stubs.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IPV6_STUBS_H, 1, + [net/ipv6_stubs.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_gact.h has is_tcf_gact_goto_chain]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_gact_goto_chain(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_GACT_GOTO_CHAIN, 1, + [is_tcf_gact_goto_chain is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_skbedit.h has is_tcf_skbedit_ptype]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_skbedit_ptype(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_SKBEDIT_PTYPE, 1, + [is_tcf_skbedit_ptype is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_mirred.h has is_tcf_mirred_ingress_redirect]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_mirred_ingress_redirect(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_MIRRED_INGRESS_REDIRECT, 1, + [is_tcf_mirred_ingress_redirect is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_gact.h has is_tcf_gact_shot]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_gact_shot(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_IS_TCF_GACT_SHOT, 1, + [is_tcf_gact_shot is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_gact.h has is_tcf_gact_ok]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_action a = {}; + is_tcf_gact_ok(&a); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_GACT_OK, 1, + [is_tcf_gact_ok is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_gact.h has __is_tcf_gact_act with 3 variables]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_action a = {}; + __is_tcf_gact_act(&a, 0, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_GACT_ACT, 1, + [__is_tcf_gact_act is defined with 3 variables]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_gact.h has __is_tcf_gact_act with 2 variables]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_action a = {}; + __is_tcf_gact_act(&a, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_GACT_ACT_OLD, 1, + [__is_tcf_gact_act is defined with 2 variables]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_skbedit.h has is_tcf_skbedit_mark]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_skbedit_mark(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_SKBEDIT_MARK, 1, + [is_tcf_skbedit_mark is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net_device_ops has *ndo_get_stats64 that returns void]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + void get_stats_64(struct net_device *dev, + struct rtnl_link_stats64 *storage) + { + return; + } + ],[ + struct net_device_ops netdev_ops; + + netdev_ops.ndo_get_stats64 = get_stats_64; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_STATS64_RET_VOID, 1, + [ndo_get_stats64 is defined and returns void]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_eth_ioctl]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops netdev_ops = { + .ndo_eth_ioctl = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_ETH_IOCTL, 1, + [net_device_ops has ndo_eth_ioctl is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_get_stats64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + struct rtnl_link_stats64* get_stats_64(struct net_device *dev, + struct rtnl_link_stats64 *storage) + { + struct rtnl_link_stats64 stats_64; + return &stats_64; + } + ],[ + struct net_device_ops netdev_ops; + + netdev_ops.ndo_get_stats64 = get_stats_64; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_STATS64, 1, + [ndo_get_stats64 is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_get_port_parent_id]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + + netdev_ops.ndo_get_port_parent_id = get_port_parent_id; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_PORT_PARENT_ID, 1, + [HAVE_NDO_GET_PORT_PARENT_ID is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net_device_ops_extended has ndo_get_phys_port_id]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int get_phys_port_name(struct net_device *dev, + char *name, size_t len) + { + return 0; + } + ],[ + struct net_device_ops_extended netdev_ops; + + netdev_ops.ndo_get_phys_port_name = 
get_phys_port_name; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_PHYS_PORT_NAME_EXTENDED, 1, + [ is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has struct netdev_nested_priv]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_nested_priv x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_NESTED_PRIV_STRUCT, 1, + [netdevice.h has struct netdev_nested_priv]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops_extended ops_extended; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_DEVICE_OPS_EXTENDED, 1, + [struct net_device_ops_extended is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net_device_ops has ndo_set_vf_trust]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int set_vf_trust(struct net_device *dev, int vf, bool setting) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + + netdev_ops.ndo_set_vf_trust = set_vf_trust; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_OPS_NDO_SET_VF_TRUST, 1, + [ndo_set_vf_trust is defined in net_device_ops]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net_device_ops_extended has ndo_set_vf_trust]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int set_vf_trust(struct net_device *dev, int vf, bool setting) + { + return 0; + } + ],[ + struct net_device_ops_extended netdev_ops; + + netdev_ops.ndo_set_vf_trust = set_vf_trust; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_OPS_NDO_SET_VF_TRUST_EXTENDED, 1, + [extended ndo_set_vf_trust is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has ndo_set_vf_vlan]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops netdev_ops = { + .ndo_set_vf_vlan = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SET_VF_VLAN, 1, + [ndo_set_vf_vlan is defined in net_device_ops]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended has ndo_set_vf_vlan]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device_ops_extended netdev_ops_extended = { + .ndo_set_vf_vlan = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SET_VF_VLAN_EXTENDED, 1, + [ndo_set_vf_vlan is defined in net_device_ops_extended]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has enum netdev_lag_tx_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum netdev_lag_tx_type x; + x = 0; + + return x; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LAG_TX_TYPE, 1, + [enum netdev_lag_tx_type is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dev_addr_mod exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dev_addr_mod(NULL, 0, NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEV_ADDR_MOD, 1, + [function dev_addr_mod exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_get_xmit_slave exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_get_xmit_slave(NULL, NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_GET_XMIT_SLAVE, 1, + [function netdev_get_xmit_slave exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/lag.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_LAG_H, 
1, + [net/lag.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/lag.h net_lag_port_dev_txable exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + net_lag_port_dev_txable(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_LAG_PORT_DEV_TXABLE, 1, + [net_lag_port_dev_txable is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_get_ringparam gets 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static void ipoib_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) + { + return; + } + ],[ + struct ethtool_ops ipoib_ethtool_ops = { + .get_ringparam = ipoib_get_ringparam, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_RINGPARAM_GET_4_PARAMS, 1, + [ndo_get_ringparam gets 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_get_coalesce gets 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int ipoib_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) + { + return 0; + } + ],[ + struct ethtool_ops ipoib_ethtool_ops = { + .get_coalesce = ipoib_get_coalesce, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_COALESCE_GET_4_PARAMS, 1, + [ndo_get_coalesce gets 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ethtool_ops has get_pause_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .get_pause_stats = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_PAUSE_STATS, 1, + [get_pause_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ethtool_ops has get/set_link_ksettings]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .get_link_ksettings = NULL, + .set_link_ksettings = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_SET_LINK_KSETTINGS, 1, + [get/set_link_ksettings is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ethtool_ops has get/set_rxfh_context]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .get_rxfh_context = NULL, + .set_rxfh_context = NULL, + + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETHTOOL_GET_RXFH_CONTEXT, 1, + [get/set_rxfh_context is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ethtool_ops has get_link_ext_state]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .get_link_ext_state = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_LINK_EXT_STATE, 1, + [.get_link_ext_state is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool supports 25G,50G,100G link speeds]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const enum ethtool_link_mode_bit_indices speeds[] = { + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETHTOOL_25G_50G_100G_SPEEDS, 1, + [ethtool supports 25G,50G,100G link speeds]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool supports 50G-per-lane link modes]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const enum
ethtool_link_mode_bit_indices speeds[] = { + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT, + ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETHTOOL_50G_PER_LANE_LINK_MODES, 1, + [ethtool supports 50G-per-lane link modes]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ethtool_ops has get/set_settings]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct ethtool_ops en_ethtool_ops = { + .get_settings = NULL, + .set_settings = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETHTOOL_GET_SET_SETTINGS, 1, + [get/set_settings is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if TCA_VLAN_ACT_MODIFY exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + u16 x = TCA_VLAN_ACT_MODIFY; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_VLAN_ACT_MODIFY, 1, + [TCA_VLAN_ACT_MODIFY exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ETH_MAX_MTU exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + u16 max_mtu = ETH_MAX_MTU; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETH_MAX_MTU, 1, + [ETH_MAX_MTU exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if ETH_MIN_MTU exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + u16 min_mtu = ETH_MIN_MTU; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETH_MIN_MTU, 1, + [ETH_MIN_MTU exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if vxlan.h has vxlan_vni_field]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + vxlan_vni_field(0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VXLAN_VNI_FIELD, 1, + [vxlan_vni_field is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has IFF_RXFH_CONFIGURED]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = IFF_RXFH_CONFIGURED; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_IFF_RXFH_CONFIGURED, 1, + [IFF_RXFH_CONFIGURED is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_for_each_lower_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *lag, *dev; + struct list_head *iter; + netdev_for_each_lower_dev(lag, dev, iter); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_FOR_EACH_LOWER_DEV, 1, + [netdev_for_each_lower_dev is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if irq.h irq_data has member affinity]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct irq_data y; + const struct cpumask *x = y.affinity; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IRQ_DATA_AFFINITY, 1, + [irq_data member affinity is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if irq.h has irq_get_effective_affinity_mask]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + irq_get_effective_affinity_mask(0); + + return 0; +
],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IRQ_GET_EFFECTIVE_AFFINITY_MASK, 1, + [irq_get_effective_affinity_mask is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if irq.h has irq_get_affinity_mask]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + irq_get_affinity_mask(0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IRQ_GET_AFFINITY_MASK, 1, + [irq_get_affinity_mask is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ifla_vf_info has trust]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ifla_vf_info *ivf; + + ivf->trusted = 0; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VF_INFO_TRUST, 1, + [trust is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if if_link.h has IFLA_VF_IB_NODE_PORT_GUID]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int type = IFLA_VF_IB_NODE_GUID; + + type = IFLA_VF_IB_PORT_GUID; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IFLA_VF_IB_NODE_PORT_GUID, 1, + [IFLA_VF_IB_NODE_GUID and IFLA_VF_IB_PORT_GUID are defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pkt_cls.h enum tc_fl_command has TC_CLSFLOWER_STATS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum tc_fl_command x = TC_CLSFLOWER_STATS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLSFLOWER_STATS, 1, + [HAVE_TC_CLSFLOWER_STATS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_cls_flower_offload has stats field]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_cls_flower_offload *f; + struct flow_stats stats; + + f->stats = stats; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD, 1, + [HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/inetdevice.h inet_confirm_addr has 5 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + inet_confirm_addr(NULL, NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INET_CONFIRM_ADDR_5_PARAMS, 1, + [inet_confirm_addr has 5 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/inetdevice.h has for_ifa define]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct in_device *in_dev; + + for_ifa(in_dev) { + } + + endfor_ifa(in_dev); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FOR_IFA, 1, + [for_ifa defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_port_same_parent_id]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_port_same_parent_id(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_PORT_SAME_PARENT_ID, 1, + [netdev_port_same_parent_id is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has struct netdev_phys_item_id]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_phys_item_id x; + x.id_len = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_PHYS_ITEM_ID, 1, + [netdev_phys_item_id is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_features.h has NETIF_F_HW_TLS_RX]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_features_t tls_rx = NETIF_F_HW_TLS_RX; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_F_HW_TLS_RX, 1, + [NETIF_F_HW_TLS_RX is defined in netdev_features.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_features.h has NETIF_F_GRO_HW]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ +
netdev_features_t value = NETIF_F_GRO_HW; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_F_GRO_HW, 1, + [NETIF_F_GRO_HW is defined in netdev_features.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has NETIF_IS_LAG_MASTER]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev; + netif_is_lag_master(dev); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_LAG_MASTER, 1, + [NETIF_IS_LAG_MASTER is defined in netdevice.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has NETIF_IS_LAG_PORT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev; + netif_is_lag_port(dev); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_LAG_PORT, 1, + [NETIF_IS_LAG_PORT is defined in netdevice.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ktime.h ktime is union and has tv64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ktime_t x; + x.tv64 = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KTIME_UNION_TV64, 1, + [ktime is union and has tv64]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if vxlan have ndo_add_vxlan_port]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + #if IS_ENABLED(CONFIG_VXLAN) + void add_vxlan_port(struct net_device *dev, sa_family_t sa_family, __be16 port) + { + return; + } + #endif + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_add_vxlan_port = add_vxlan_port; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_ADD_VXLAN_PORT, 1, + [ndo_add_vxlan_port is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if udp_tunnel.h has udp_tunnel_drop_rx_port]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + udp_tunnel_drop_rx_port(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UDP_TUNNEL_RX_INFO, 1, + [udp_tunnel.h has udp_tunnel_drop_rx_port is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_add_vxlan_port have udp_tunnel_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + #if IS_ENABLED(CONFIG_VXLAN) + void add_vxlan_port(struct net_device *dev, struct udp_tunnel_info *ti) + { + return; + } + #endif + + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_udp_tunnel_add = add_vxlan_port; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_UDP_TUNNEL_ADD, 1, + [ndo_add_vxlan_port is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended has ndo_udp_tunnel_add]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + #if IS_ENABLED(CONFIG_VXLAN) + void add_vxlan_port(struct net_device *dev, struct udp_tunnel_info *ti) + { + return; + } + #endif + + ],[ + struct net_device_ops_extended x = { + .ndo_udp_tunnel_add = add_vxlan_port, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_UDP_TUNNEL_ADD_EXTENDED, 1, + [extended ndo_add_vxlan_port is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dst.h has skb_dst_update_pmtu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sk_buff x; + skb_dst_update_pmtu(&x, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_DST_UPDATE_PMTU, 1, + [skb_dst_update_pmtu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ipv6_stub has ipv6_dst_lookup_flow]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + int x = ipv6_stub->ipv6_dst_lookup_flow(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_IPV6_DST_LOOKUP_FLOW, 1, + [if ipv6_stub has ipv6_dst_lookup_flow]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ipv6_stub has ipv6_dst_lookup_flow in addrconf.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = ipv6_stub->ipv6_dst_lookup_flow(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IPV6_DST_LOOKUP_FLOW_ADDR_CONF, 1, + [if ipv6_stub has ipv6_dst_lookup_flow in addrconf.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if nla_policy has validation_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct nla_policy x; + x.validation_type = NLA_VALIDATE_MIN; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLA_POLICY_HAS_VALIDATION_TYPE, 1, + [nla_policy has validation_type]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netlink.h has nla_strscpy]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + nla_strscpy(NULL, NULL ,0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLA_STRSCPY, 1, + [nla_strscpy exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netlink.h has nla_nest_start_noflag]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + nla_nest_start_noflag(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLA_NEST_START_NOFLAG, 1, + [nla_nest_start_noflag exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netlink.h has nlmsg_validate_deprecated ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + nlmsg_validate_deprecated(NULL, 0, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLMSG_VALIDATE_DEPRECATED, 1, + [nlmsg_validate_deprecated exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netlink.h has nlmsg_parse_deprecated ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + nlmsg_parse_deprecated(NULL, 0, NULL, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLMSG_PARSE_DEPRECATED, 1, + [nlmsg_parse_deprecated exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netlink.h has nla_parse_deprecated ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + nla_parse_deprecated(NULL, 0, NULL, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLA_PARSE_DEPRECATED, 1, + [nla_parse_deprecated exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netlink.h nla_parse takes 6 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + nla_parse(NULL, 0, NULL, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLA_PARSE_6_PARAMS, 1, + [nla_parse takes 6 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/netlink.h has nla_put_u64_64bit]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + nla_put_u64_64bit(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NLA_PUT_U64_64BIT, 1, + [nla_put_u64_64bit is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netlink.h has struct netlink_ext_ack]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netlink_ext_ack x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETLINK_EXT_ACK, 1, + [struct netlink_ext_ack is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct genl_ops has member validate]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct genl_ops x; + + x.validate = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GENL_OPS_VALIDATE, 1, + [struct genl_ops has member validate]) + ],[ 
+ AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct genl_family has member policy]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct genl_family x; + + x.policy = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GENL_FAMILY_POLICY, 1, + [struct genl_family has member policy]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct netlink_callback has member extack]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netlink_callback x; + + x.extack = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETLINK_CALLBACK_EXTACK, 1, + [struct netlink_callback has member extack]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if sysfs.h has sysfs_emit]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sysfs_emit(NULL, ""); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SYSFS_EMIT, 1, + [sysfs_emit is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool.h has struct ethtool_pause_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ethtool_pause_stats x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETHTOOL_PAUSE_STATS, 1, + [ethtool_pause_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool.h has struct ethtool_rmon_hist_range]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ethtool_rmon_hist_range x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ETHTOOL_RMON_HIST_RANGE, 1, + [ethtool_rmon_hist_range is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool.h has get/set_fecparam]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ethtool_ops x = { + .get_fecparam = NULL, + .set_fecparam = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_SET_FECPARAM, 1, + [get/set_fecparam is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool.h has ndo eth_phy_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ethtool_ops x = { + .get_eth_phy_stats = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_ETH_PHY_STATS, 1, + [eth_phy_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ethtool.h has ndo get_fec_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ethtool_ops x = { + .get_fec_stats = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_FEC_STATS, 1, + [get_fec_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h has skb_put_zero]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + skb_put_zero(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_PUT_ZERO, 1, + [skb_put_zero is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h has skb_set_redirected]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sk_buff x; + skb_set_redirected(&x, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_SET_REDIRECTED, 1, + [skb_set_redirected is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if skbuff.h struct sk_buff has member sw_hash]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sk_buff x = { + .sw_hash = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_SWHASH, 1, + [sk_buff has member sw_hash]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if addrconf.h has addrconf_ifid_eui48]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + u8 *a; + + int x = 
addrconf_ifid_eui48(a, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ADDRCONF_IFID_EUI48, 1, + [addrconf_ifid_eui48 is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if addrconf.h has addrconf_addr_eui48]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + addrconf_addr_eui48(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ADDRCONF_ADDR_EUI48, 1, + [addrconf_addr_eui48 is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if addrconf.h ipv6_dst_lookup takes net]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = ipv6_stub->ipv6_dst_lookup(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IPV6_DST_LOOKUP_TAKES_NET, 1, + [ipv6_dst_lookup takes net]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/net/dcbnl.h struct dcbnl_rtnl_ops has *ieee_getqcn]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct dcbnl_rtnl_ops x = { + .ieee_getqcn = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IEEE_GETQCN, 1, + [ieee_getqcn is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dcbnl.h has struct ieee_qcn]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct ieee_qcn x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_IEEE_QCN, 1, + [ieee_qcn is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_for_each_all_upper_dev_rcu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev; + struct net_device *upper; + struct list_head *list; + + netdev_for_each_all_upper_dev_rcu(dev, upper, list); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_FOR_EACH_ALL_UPPER_DEV_RCU, 1, + [netdev_master_upper_dev_get_rcu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_walk_all_upper_dev_rcu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + ],[ + netdev_walk_all_upper_dev_rcu(NULL, NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_WALK_ALL_UPPER_DEV_RCU, 1, + [netdev_walk_all_upper_dev_rcu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_walk_all_lower_dev_rcu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_walk_all_lower_dev_rcu(NULL, NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_WALK_ALL_LOWER_DEV_RCU, 1, + [netdev_walk_all_lower_dev_rcu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_has_upper_dev_all_rcu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct net_device *dev; + struct net_device *upper; + + netdev_has_upper_dev_all_rcu(dev, upper); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_HAS_UPPER_DEV_ALL_RCU, 1, + [netdev_has_upper_dev_all_rcu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_notifier_changeupper_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_notifier_changeupper_info info; + + info.master = 1; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_NOTIFIER_CHANGEUPPER_INFO, 1, + [netdev_notifier_changeupper_info is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if build_bug.h has static_assert]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #define A 5 + #define B 6 + ],[ + static_assert(A < B); + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_STATIC_ASSERT, 1, + [build_bug.h has static_assert]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ip_fib.h fib_nh_notifier_info exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct fib_nh_notifier_info fnh_info; + struct fib_notifier_info info; + + /* also checking family attr in fib_notifier_info */ + info.family = AF_INET; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB_NH_NOTIFIER_INFO, 1, + [fib_nh_notifier_info is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if register_fib_notifier has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + register_fib_notifier(NULL, NULL, NULL, NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REGISTER_FIB_NOTIFIER_HAS_4_PARAMS, 1, + [register_fib_notifier has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if function fib_info_nh exists in file net/nexthop.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + fib_info_nh(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB_INFO_NH, 1, + [function fib_info_nh exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if function fib6_info_nh_dev exists in file net/nexthop.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + fib6_info_nh_dev(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB6_INFO_NH_DEV, 1, + [function fib6_info_nh_dev exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct fib6_entry_notifier_info exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct fib6_entry_notifier_info info; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB6_ENTRY_NOTIFIER_INFO, 1, + [struct fib6_entry_notifier_info exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct fib6_entry_notifier_info has member struct fib6_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct fib6_entry_notifier_info info; + struct fib6_info rt; + + info.rt = &rt; + info.rt->fib6_dst.plen = 0; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB6_INFO_IN_FIB6_ENTRY_NOTIFIER_INFO, 1, + [struct fib6_entry_notifier_info has member struct fib6_info]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/fib_notifier.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct fib_notifier_info info; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB_NOTIFIER_HEADER_FILE, 1, + [has net/fib_notifier.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct fib_notifier_info has member family]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct fib_notifier_info info; + + info.family = 0; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB_NOTIFIER_INFO_HAS_FAMILY, 1, + [struct fib_notifier_info has member family]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/kobject.h kobj_type has default_groups member]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct kobj_type x = { + .default_groups = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KOBJ_TYPE_DEFAULT_GROUPS, 1, + [linux/kobject.h kobj_type has default_groups member]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/lockdep.h has lockdep_unregister_key]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + lockdep_unregister_key(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LOCKDEP_UNREGISTER_KEY, 1, + [linux/lockdep.h has lockdep_unregister_key]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
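[Editor's illustration, not part of the patch: the compile probes above only emit HAVE_* defines; a minimal sketch of how backport code typically consumes one of them is shown below. It assumes the old two-parameter and new four-parameter register_fib_notifier() signatures, and the wrapper name mlx_compat_register_fib_notifier() is hypothetical.]

/* Sketch only: consume HAVE_REGISTER_FIB_NOTIFIER_HAS_4_PARAMS produced
 * by the probe above.  Wrapper name is hypothetical. */
#include <net/ip_fib.h>

static int mlx_compat_register_fib_notifier(struct net *net,
					    struct notifier_block *nb,
					    void (*cb)(struct notifier_block *nb),
					    struct netlink_ext_ack *extack)
{
#ifdef HAVE_REGISTER_FIB_NOTIFIER_HAS_4_PARAMS
	/* Newer kernels: per-netns registration with an extack argument. */
	return register_fib_notifier(net, nb, cb, extack);
#else
	/* Older kernels: global registration, no net/extack parameters. */
	return register_fib_notifier(nb, cb);
#endif
}
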
AC_MSG_CHECKING([if linux/lockdep.h has lockdep_assert_held_exclusive]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + lockdep_assert_held_exclusive(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LOCKUP_ASSERT_HELD_EXCLUSIVE, 1, + [linux/lockdep.h has lockdep_assert_held_exclusive]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/lockdep.h has lockdep_assert_held_write]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + lockdep_assert_held_write(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LOCKUP_ASSERT_HELD_WRITE, 1, + [linux/lockdep.h has lockdep_assert_held_write]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ip_fib.h fib_lookup has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + fib_lookup(NULL, NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB_LOOKUP_4_PARAMS, 1, + [fib_lookup has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if fib_nh has fib_nh_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct fib_nh x = { + .fib_nh_dev = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FIB_NH_DEV, 1, + [fib_nh has fib_nh_dev]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if inet6_hashtables.h __inet6_lookup_established has 7 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __inet6_lookup_established(NULL,NULL,NULL,0,NULL,0,0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___INET6_LOOKUP_ESTABLISHED_HAS_7_PARAMS, 1, + [__inet6_lookup_established has 7 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if workqueue.h has __cancel_delayed_work]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __cancel_delayed_work(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___CANCEL_DELAYED_WORK, 1, + [__cancel_delayed_work is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if workqueue.h has WQ_NON_REENTRANT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct workqueue_struct *my_wq = alloc_workqueue("my_wq", WQ_NON_REENTRANT, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_WQ_NON_REENTRANT, 1, + [WQ_NON_REENTRANT is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if vm_fault_t exist in mm_types.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + vm_fault_t a; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VM_FAULT_T, 1, + [vm_fault_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct mm_struct has member atomic_pinned_vm]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct mm_struct x; + atomic64_t y; + x.pinned_vm = y; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ATOMIC_PINNED_VM, 1, + [atomic_pinned_vm is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct mm_struct has member pinned_vm]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct mm_struct x; + x.pinned_vm = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PINNED_VM, 1, + [pinned_vm is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if sock.h sk_wait_data has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sk_wait_data(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SK_WAIT_DATA_3_PARAMS, 1, + [sk_wait_data has 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if sock.h sk_data_ready has 2 parameters]) + 
MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + static struct socket *mlx_lag_compat_rtnl_sock; + mlx_lag_compat_rtnl_sock->sk->sk_data_ready(NULL , 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SK_DATA_READY_2_PARAMS, 1, + [sk_data_ready has 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if route.h struct rtable has member rt_gw_family]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rtable x = { + .rt_gw_family = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RT_GW_FAMILY, 1, + [rt_gw_family is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if route.h struct rtable has member rt_uses_gateway]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rtable x = { + .rt_uses_gateway = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RT_USES_GATEWAY, 1, + [rt_uses_gateway is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + LB_CHECK_SYMBOL_EXPORT([unpin_user_pages_dirty_lock], + [mm/gup.c], + [AC_DEFINE(HAVE_UNPIN_USER_PAGES_DIRTY_LOCK_EXPORTED, 1, + [unpin_user_pages_dirty_lock is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([unpin_user_page_range_dirty_lock], + [mm/gup.c], + [AC_DEFINE(HAVE_UNPIN_USER_PAGE_RANGE_DIRTY_LOCK_EXPORTED, 1, + [unpin_user_page_range_dirty_lock is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([compat_ptr_ioctl], + [fs/ioctl.c], + [AC_DEFINE(HAVE_COMPAT_PTR_IOCTL_EXPORTED, 1, + [compat_ptr_ioctl is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([flow_rule_match_cvlan], + [net/core/flow_offload.c], + [AC_DEFINE(HAVE_FLOW_RULE_MATCH_CVLAN, 1, + [flow_rule_match_cvlan is exported by the kernel])], + []) + LB_CHECK_SYMBOL_EXPORT([devlink_params_publish], + [net/core/devlink.c], + [AC_DEFINE(HAVE_DEVLINK_PARAMS_PUBLISHED, 1, + [devlink_params_publish is exported by the kernel])], + []) + LB_CHECK_SYMBOL_EXPORT([debugfs_create_file_unsafe], + [fs/debugfs/inode.c], + [AC_DEFINE(HAVE_DEBUGFS_CREATE_FILE_UNSAFE, 1, + [debugfs_create_file_unsafe is exported by the kernel])], + []) + LB_CHECK_SYMBOL_EXPORT([devlink_param_publish], + [net/core/devlink.c], + [AC_DEFINE(HAVE_DEVLINK_PARAM_PUBLISH, 1, + [devlink_param_publish is exported by the kernel])], + []) + LB_CHECK_SYMBOL_EXPORT([split_page], + [mm/page_alloc.c], + [AC_DEFINE(HAVE_SPLIT_PAGE_EXPORTED, 1, + [split_page is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([ip6_dst_hoplimit], + [net/ipv6/output_core.c], + [AC_DEFINE(HAVE_IP6_DST_HOPLIMIT, 1, + [ip6_dst_hoplimit is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([udp4_hwcsum], + [net/ipv4/udp.c], + [AC_DEFINE(HAVE_UDP4_HWCSUM, 1, + [udp4_hwcsum is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([__ip_dev_find], + [net/ipv4/devinet.c], + [AC_DEFINE(HAVE___IP_DEV_FIND, 1, + [HAVE___IP_DEV_FIND is exported by the kernel])], + []) + LB_CHECK_SYMBOL_EXPORT([inet_confirm_addr], + [net/ipv4/devinet.c], + [AC_DEFINE(HAVE_INET_CONFIRM_ADDR_EXPORTED, 1, + [inet_confirm_addr is exported by the kernel])], + []) + + AC_MSG_CHECKING([if ipv6.h has ip6_make_flowinfo]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ip6_make_flowinfo(0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IP6_MAKE_FLOWINFO, 1, + [ip6_make_flowinfo is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + LB_CHECK_SYMBOL_EXPORT([dev_pm_qos_update_user_latency_tolerance], + [drivers/base/power/qos.c], + [AC_DEFINE(HAVE_PM_QOS_UPDATE_USER_LATENCY_TOLERANCE_EXPORTED, 1, + 
[dev_pm_qos_update_user_latency_tolerance is exported by the kernel])], + []) + + AC_MSG_CHECKING([if pm_qos.h has DEV_PM_QOS_RESUME_LATENCY]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum dev_pm_qos_req_type type = DEV_PM_QOS_RESUME_LATENCY; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEV_PM_QOS_RESUME_LATENCY, 1, + [DEV_PM_QOS_RESUME_LATENCY is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if sock.h has skwq_has_sleeper]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct socket_wq wq; + skwq_has_sleeper(&wq); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKWQ_HAS_SLEEPER, 1, + [skwq_has_sleeper is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net.h sock_create_kern has 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + sock_create_kern(NULL, 0, 0, 0, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SOCK_CREATE_KERN_5_PARAMS, 1, + [sock_create_kern has 5 params is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pci_pool_zalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_pool_zalloc(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_POOL_ZALLOC, 1, + [pci_pool_zalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pcie_relaxed_ordering_enabled]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pcie_relaxed_ordering_enabled(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCIE_RELAXED_ORDERING_ENABLED, 1, + [pcie_relaxed_ordering_enabled is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_features.h has NETIF_F_GSO_IPXIP6]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = NETIF_F_GSO_IPXIP6; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_F_GSO_IPXIP6, 1, + [NETIF_F_GSO_IPXIP6 is defined in netdev_features.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_features.h has ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = NETIF_F_GSO_UDP_L4; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_F_GSO_UDP_L4, 1, + [HAVE_NETIF_F_GSO_UDP_L4 is defined in netdev_features.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct netdev_features.h has NETIF_F_GSO_PARTIAL]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = NETIF_F_GSO_PARTIAL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_F_GSO_PARTIAL, 1, + [NETIF_F_GSO_PARTIAL is defined in netdev_features.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + # this checker will test if the function exist AND gets const + # otherwise it will fail. 
+ AC_MSG_CHECKING([if if_vlan.h has is_vlan_dev get const]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + const struct net_device *dev; + is_vlan_dev(dev); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_VLAN_DEV_CONST, 1, + [is_vlan_dev get const]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_bridge_setlink]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_bridge_setlink = bridge_setlink; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_BRIDGE_SETLINK, 1, + [ndo_bridge_setlink is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_bridge_setlink]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags, struct netlink_ext_ack *extack) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_bridge_setlink = bridge_setlink; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_BRIDGE_SETLINK_EXTACK, 1, + [ndo_bridge_setlink is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_bridge_getlink]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, + int nlflags) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_bridge_getlink = bridge_getlink; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_BRIDGE_GETLINK_NLFLAGS, 1, + [ndo_bridge_getlink is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_bridge_getlink]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_bridge_getlink = bridge_getlink; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_BRIDGE_GETLINK, 1, + [ndo_bridge_getlink is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/rtnetlink.h] has ndo_dflt_bridge_getlink) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ndo_dflt_bridge_getlink(NULL, 0, 0, NULL, 0, 0, 0, + 0, 0, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK_NFLAGS_FILTER, 1, + [ndo_dflt_bridge_getlink is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/rtnetlink.h] has ndo_dflt_bridge_getlink) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ndo_dflt_bridge_getlink(NULL, 0, 0, NULL, 0, 0, 0, + 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK_NFLAGS, 1, + [ndo_dflt_bridge_getlink is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/rtnetlink.h] has ndo_dflt_bridge_getlink) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ndo_dflt_bridge_getlink(NULL, 0, 0, NULL, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_DFLT_BRIDGE_GETLINK_FLAG_MASK, 1, + [ndo_dflt_bridge_getlink is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_get_vf_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int get_vf_stats(struct net_device *dev, int vf, struct 
ifla_vf_stats *vf_stats) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_get_vf_stats = get_vf_stats; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_VF_STATS, 1, + [ndo_get_vf_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_set_vf_guid]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int set_vf_guid(struct net_device *dev, int vf, u64 guid, int guid_type) + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_set_vf_guid = set_vf_guid; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_SET_VF_GUID, 1, + [ndo_set_vf_guid is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops has *ndo_get_vf_guid]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + int get_vf_guid(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, + struct ifla_vf_guid *port_guid) + + { + return 0; + } + ],[ + struct net_device_ops netdev_ops; + netdev_ops.ndo_get_vf_guid = get_vf_guid; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_VF_GUID, 1, + [ndo_get_vf_guid is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if if_link.h struct has struct ifla_vf_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + ],[ + struct ifla_vf_stats x; + x = x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IFLA_VF_STATS, 1, + [struct ifla_vf_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if if_link.h struct has struct ifla_vf_guid]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + ],[ + struct ifla_vf_guid x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IFLA_VF_GUID, 1, + [struct ifla_vf_guid is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pci_irq_get_node]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_irq_get_node(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_IRQ_GET_NODE, 1, + [pci_irq_get_node is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pci_irq_get_affinity]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_irq_get_affinity(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_IRQ_GET_AFFINITY, 1, + [pci_irq_get_affinity is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + LB_CHECK_SYMBOL_EXPORT([elfcorehdr_addr], + [kernel/crash_dump.c], + [AC_DEFINE(HAVE_ELFCOREHDR_ADDR_EXPORTED, 1, + [elfcorehdr_addr is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([fib_lookup], + [net/ipv4/fib_rules.c], + [AC_DEFINE(HAVE_FIB_LOOKUP_EXPORTED, 1, + [fib_lookup is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([idr_get_next_ul], + [lib/idr.c], + [AC_DEFINE(HAVE_IDR_GET_NEXT_UL_EXPORTED, 1, + [idr_get_next_ul is exported by the kernel])], + []) + + AC_MSG_CHECKING([if idr.h has ida_free]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ida_free(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDA_FREE, 1, + [idr.h has ida_free]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if idr.h has ida_alloc_range]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ida_alloc_range(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDA_ALLOC_RANGE, 1, + [idr.h has ida_alloc_range]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if idr struct has idr_rt]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct idr tmp_idr; + struct 
radix_tree_root tmp_radix; + + tmp_idr.idr_rt = tmp_radix; + tmp_idr.idr_base = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDR_RT, 1, + [struct idr has idr_rt]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if idr_remove return value exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + void *ret; + + ret = idr_remove(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDR_REMOVE_RETURN_VALUE, 1, + [idr_remove return value exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if idr.h has ida_is_empty]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ida ida; + ida_is_empty(&ida); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDA_IS_EMPTY, 1, + [ida_is_empty is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if idr.h has idr_is_empty]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ida ida; + idr_is_empty(&ida.idr); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDR_IS_EMPTY, 1, + [idr_is_empty is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xarray is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xa_limit x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XARRAY, 1, + [xa_array is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xa_for_each_range is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + #ifdef xa_for_each_range + return 0; + #else + #return 1; + #endif + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XA_FOR_EACH_RANGE, 1, + [xa_for_each_range is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if DEFINE_SHOW_ATTRIBUTE is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + #ifdef DEFINE_SHOW_ATTRIBUTE + return 0; + #else + #return 1; + #endif + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEFINE_SHOW_ATTRIBUTE, 1, + [DEFINE_SHOW_ATTRIBUTE is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if nospec.h has array_index_nospec]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + array_index_nospec(0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ARRAY_INDEX_NOSPEC, 1, + [array_index_nospec is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if idr.h has ida_alloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ida_alloc(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDA_ALLOC, 1, + [ida_alloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if idr.h has ida_alloc_max]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ida_alloc_max(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IDA_ALLOC_MAX, 1, + [ida_alloc_max is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_transfer_length is defind]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + scsi_transfer_length(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_TRANSFER_LENGTH, 1, + [scsi_transfer_length is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_cmd_to_rq is defind]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + scsi_cmd_to_rq(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_CMD_TO_RQ, 1, + [scsi_cmd_to_rq is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_done is defind]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + scsi_done(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_DONE, 1, + [scsi_done is 
defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_get_sector is defind]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + scsi_get_sector(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_GET_SECTOR, 1, + [scsi_get_sector is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if string.h has strnicmp]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + char a[10] = "aaa"; + char b[10] = "bbb"; + strnicmp(a, b, sizeof(a)); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRNICMP, 1, + [strnicmp is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if string.h has kfree_const]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const char *x; + kfree_const(x); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KFREE_CONST, 1, + [kfree_const is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if string.h has strscpy_pad]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + strscpy_pad(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRSCPY_PAD, 1, + [strscpy_pad is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct dcbnl_rtnl_ops has dcbnl_get/set buffer]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + const struct dcbnl_rtnl_ops en_dcbnl_ops = { + .dcbnl_getbuffer = NULL, + .dcbnl_setbuffer = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DCBNL_GETBUFFER, 1, + [struct dcbnl_rtnl_ops has dcbnl_get/set buffer]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if device.h struct class has class_groups]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + ],[ + struct class cm_class = { + .class_groups = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CLASS_GROUPS, 1, + [struct class has class_groups]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h struct vm_operations_struct has .fault]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) { + vm_fault_t a; + return a; + } + + ],[ + struct vm_operations_struct rdma_umap_ops = { + .fault = rdma_umap_fault, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VM_OPERATIONS_STRUCT_HAS_FAULT, 1, + [vm_operations_struct has .fault]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bus_find_device get const]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const void *data; + bus_find_device(NULL, NULL, data, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BUS_FIND_DEVICE_GET_CONST, 1, + [bus_find_device get const]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if device.h struct device has dma_ops]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct device devx = { + .dma_ops = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVICE_DMA_OPS, 1, + [struct device has dma_ops]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dst_ops.h update_pmtu has 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + static void mtu_up (struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) + { + return; + } + ],[ + struct dst_ops x = { + .update_pmtu = mtu_up, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UPDATE_PMTU_4_PARAMS, 1, + [update_pmtu has 4 paramters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if rtnetlink.h rtnl_link_ops newlink has 4 paramters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + 
static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) + { + return 0; + } + ],[ + struct rtnl_link_ops x = { + .newlink = ipoib_new_child_link, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RTNL_LINK_OPS_NEWLINK_4_PARAMS, 1, + [newlink has 4 paramters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h if struct rtnl_link_ops has netns_refund]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + ],[ + struct rtnl_link_ops x = { + .netns_refund = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_LINK_OPS_IPOIB_LINK_OPS_HAS_NETNS_REFUND, 1, + [struct rtnl_link_ops has netns_refund]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if rtnetlink.h rtnl_link_ops newlink has 5 paramters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) + { + return 0; + } + ],[ + struct rtnl_link_ops x = { + .newlink = ipoib_new_child_link, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RTNL_LINK_OPS_NEWLINK_5_PARAMS, 1, + [newlink has 5 paramters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/ipv6.h has ipv6_mod_enabled]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + + ipv6_mod_enabled(); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IPV6_MOD_ENABLED, 1, + [ipv6_mod_enabled is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/flow_keys.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_FLOW_KEYS_H, 1, + [net/flow_keys.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pm_domain.h has dev_pm_domain_attach]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dev_pm_domain_attach(NULL, true); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEV_PM_DOMAIN_ATTACH, 1, + [pm_domain.h has dev_pm_domain_attach]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netif_trans_update]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netif_trans_update(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_TRANS_UPDATE, 1, + [netif_trans_update is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/inet_lro.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INET_LRO_H, 1, + [include/linux/inet_lro.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_xmit_more]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_xmit_more(); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_XMIT_MORE, 1, + [netdev_xmit_more is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h alloc_netdev_mqs has 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + alloc_netdev_mqs(0, NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ALLOC_NETDEV_MQS_5_PARAMS, 1, + [alloc_netdev_mqs has 5 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h alloc_netdev_mq has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + alloc_netdev_mq(0, NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ALLOC_NETDEV_MQ_4_PARAMS, 1, + [alloc_netdev_mq has 4 
params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h get_user_pages has 8 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + get_user_pages(NULL, NULL, 0, 0, 0, 0, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GET_USER_PAGES_8_PARAMS, 1, + [get_user_pages has 8 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has FOLL_LONGTERM]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = FOLL_LONGTERM; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FOLL_LONGTERM, 1, + [FOLL_LONGTERM is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has kvzalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + kvzalloc(0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KVZALLOC, 1, + [kvzalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has mmget]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmget(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMGET, 1, + [mmget is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has mmget_not_zero]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmget_not_zero(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCHED_MM_MMGET_NOT_ZERO, 1, + [mmget_not_zero is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if sched.h has mmget_not_zero]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmget_not_zero(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCHED_MMGET_NOT_ZERO, 1, + [sched_mmget_not_zero is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has mmgrab]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmgrab(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMGRAB, 1, + [mmgrab is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has kvmalloc_array]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + kvmalloc_array(0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KVMALLOC_ARRAY, 1, + [kvmalloc_array is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has kvmalloc_node]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + kvmalloc_node(0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KVMALLOC_NODE, 1, + [kvmalloc_node is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has kvmalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + kvmalloc(0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KVMALLOC, 1, + [kvmalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has kvzalloc_node]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + kvzalloc_node(0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KVZALLOC_NODE, 1, + [kvzalloc_node is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has kvcalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + kvcalloc(0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KVCALLOC, 1, + [kvcalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm_types.h struct page has _count]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct page p; + p._count.counter = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MM_PAGE__COUNT, 1, + [struct page has _count]) + ],[ + 
AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if configfs.h default_groups is list_head]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct config_group x = { + .group_entry = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CONFIGFS_DEFAULT_GROUPS_LIST, 1, + [default_groups is list_head]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/irq_poll.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IRQ_POLL_H, 1, + [include/linux/irq_poll.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/dma-mapping.h has struct dma_attrs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct dma_attrs *attrs; + int ret; + + ret = dma_get_attr(DMA_ATTR_WRITE_BARRIER, attrs); + + return ret; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_DMA_ATTRS, 1, + [struct dma_attrs is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/proc_fs.h has pde_data]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pde_data(NULL); + return 0; + + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PDE_DATA, 1, + [linux/proc_fs.h has pde_data]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/proc_fs.h has struct proc_ops]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct proc_ops x = { + .proc_open = NULL, + .proc_read = NULL, + .proc_lseek = NULL, + .proc_release = NULL, + }; + + return 0; + + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PROC_OPS_STRUCT, 1, + [struct proc_ops is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_mark_disk_dead exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mark_disk_dead(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MARK_DISK_DEAD, 1, + [blk_mark_disk_dead exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_mq_ops has map_queue]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_mq_ops ops = { + .map_queue = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_MAP_QUEUE, 1, + [struct blk_mq_ops has map_queue]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_freeze_queue_wait_timeout]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_freeze_queue_wait_timeout(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_FREEZE_QUEUE_WAIT_TIMEOUT, 1, + [blk_mq_freeze_queue_wait_timeout is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_freeze_queue_wait]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_freeze_queue_wait(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_FREEZE_QUEUE_WAIT, 1, + [blk_mq_freeze_queue_wait is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_mq_ops has map_queues]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_mq_ops ops = { + .map_queues = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_MAP_QUEUES, 1, + [struct blk_mq_ops has map_queues]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/blk-mq-pci.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_PCI_H, 1, + [include/linux/blk-mq-pci.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma-mapping.h has DMA_ATTR_NO_WARN]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include 
+ ],[ + int x = DMA_ATTR_NO_WARN; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_ATTR_NO_WARN, 1, + [DMA_ATTR_NO_WARN is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma-mapping.h has dma_zalloc_coherent function]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_zalloc_coherent(NULL, 0, NULL, GFP_KERNEL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_ZALLOC_COHERENT, 1, + [dma-mapping.h has dma_zalloc_coherent function]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma-mapping.h has dma_alloc_attrs takes unsigned long attrs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_alloc_attrs(NULL, 0, NULL, GFP_KERNEL, DMA_ATTR_NO_KERNEL_MAPPING); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_SET_ATTR_TAKES_UNSIGNED_LONG_ATTRS, 1, + [dma_alloc_attrs takes unsigned long attrs]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if filter.h struct xdp_buff has data_hard_start]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xdp_buff d = { + .data_hard_start = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_BUFF_DATA_HARD_START, 1, + [xdp_buff data_hard_start is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has struct xdp_frame]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + ],[ + struct xdp_frame f = {}; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_FRAME_IN_NET_XDP, 1, + [struct xdp_frame is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has struct xdp_frame workaround for 5.4.17-2011.1.2.el8uek.x86_64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + ],[ + struct xdp_frame f = {}; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_FRAME_IN_UEK_KABI, 1, + [struct xdp_frame is defined in 5.4.17-2011.1.2.el8uek.x86_64]) + ],[ + AC_MSG_RESULT(no) + + ]) + + AC_MSG_CHECKING([if net/xdp_sock_drv.h has xsk_buff_alloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xsk_buff_alloc(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XSK_BUFF_ALLOC, 1, + [xsk_buff_alloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp_sock.h has xsk_umem_release_addr_rq]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xsk_umem_release_addr_rq(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XSK_UMEM_RELEASE_ADDR_RQ, 1, + [xsk_umem_release_addr_rq is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp_sock.h has xsk_umem_adjust_offset]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xsk_umem_adjust_offset(NULL, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XSK_UMEM_ADJUST_OFFSET, 1, + [xsk_umem_adjust_offset is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp_soc_drv.h has xsk_umem_consume_tx get 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xsk_umem_consume_tx(NULL,NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XSK_UMEM_CONSUME_TX_GET_2_PARAMS_IN_SOCK_DRV, 1, + [net/xdp_soc_drv.h has xsk_umem_consume_tx get 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp_sock.h has xsk_umem_consume_tx get 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xsk_umem_consume_tx(NULL,NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XSK_UMEM_CONSUME_TX_GET_2_PARAMS_IN_SOCK, 1, + [net/xdp_sock.h has xsk_umem_consume_tx get 2 params]) + ],[ + 
AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if filter.h has bpf_warn_invalid_xdp_action get 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bpf_warn_invalid_xdp_action(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BPF_WARN_IVALID_XDP_ACTION_GET_3_PARAMS, 1, + [filter.h has bpf_warn_invalid_xdp_action get 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if filter.h has xdp_set_data_meta_invalid]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xdp_buff d; + xdp_set_data_meta_invalid(&d); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_SET_DATA_META_INVALID, 1, + [xdp_set_data_meta_invalid is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi.h has SG_MAX_SEGMENTS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = SG_MAX_SEGMENTS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_MAX_SEGMENTS, 1, + [SG_MAX_SEGMENTS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi.h has QUEUE_FULL]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = QUEUE_FULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_QUEUE_FULL, 1, + [QUEUE_FULL is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_device.h has enum scsi_scan_mode]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum scsi_scan_mode xx = SCSI_SCAN_INITIAL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ENUM_SCSI_SCAN_MODE, 1, + [enum scsi_scan_mode is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_device.h has blist_flags_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blist_flags_t x = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLIST_FLAGS_T, 1, + [blist_flags_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if iscsi_transport.h struct iscsit_transport has member rdma_shutdown]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct iscsit_transport it = { + .rdma_shutdown = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSIT_TRANSPORT_RDMA_SHUTDOWN, 1, + [rdma_shutdown is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if iscsi_transport.h struct iscsit_transport has member iscsit_get_rx_pdu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct iscsit_transport it = { + .iscsit_get_rx_pdu = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSIT_TRANSPORT_ISCSIT_GET_RX_PDU, 1, + [iscsit_get_rx_pdu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if iscsi_target_core.h has struct iscsit_conn]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct iscsit_conn c; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSIT_CONN, 1, + [iscsi_target_core.h has struct iscsit_conn]) + + AC_MSG_CHECKING([if iscsi_target_core.h struct iscsit_conn has member login_sockaddr]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sockaddr_storage s; + struct iscsit_conn c = { + .login_sockaddr = s, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSIT_CONN_LOGIN_SOCKADDR, 1, + [iscsit_conn has member login_sockaddr]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if iscsi_target_core.h struct iscsit_conn has member local_sockaddr]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sockaddr_storage s; + struct iscsit_conn c = { + .local_sockaddr = s, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_ISCSIT_CONN_LOCAL_SOCKADDR, 1, + [iscsit_conn has members local_sockaddr]) + ],[ + AC_MSG_RESULT(no) + ]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([if iscsi_target_core.h struct iscsi_conn has member login_sockaddr]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sockaddr_storage s; + struct iscsi_conn c = { + .login_sockaddr = s, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSI_CONN_LOGIN_SOCKADDR, 1, + [iscsi_conn has member login_sockaddr]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if iscsi_target_core.h struct iscsi_conn has member local_sockaddr]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sockaddr_storage s; + struct iscsi_conn c = { + .local_sockaddr = s, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSI_CONN_LOCAL_SOCKADDR, 1, + [iscsi_conn has members local_sockaddr]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) + + AC_MSG_CHECKING([if iscsi_target_core.h has struct iscsit_cmd]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct iscsit_cmd c; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSIT_CMD, 1, + [iscsi_target_core.h has struct iscsit_cmd]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_queue_virt_boundary exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_virt_boundary(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_VIRT_BOUNDARY, 1, + [blk_queue_virt_boundary exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h/linux/blk-mq.h has blk_rq_is_passthrough]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + blk_rq_is_passthrough(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_RQ_IS_PASSTHROUGH, 1, + [blk_rq_is_passthrough is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if target_put_sess_cmd has 1 parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + target_put_sess_cmd(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TARGET_PUT_SESS_CMD_HAS_1_PARAM, 1, + [target_put_sess_cmd in target_core_fabric.h has 1 parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if target/target_core_fabric.h has target_stop_session]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + target_stop_session(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TARGET_STOP_SESSION, 1, + [target_stop_session is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_device.h has scsi_change_queue_depth]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + scsi_change_queue_depth(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_CHANGE_QUEUE_DEPTH, 1, + [scsi_change_queue_depth exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct scsi_host_template has member track_queue_depth]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_host_template sh = { + .track_queue_depth = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_TEMPLATE_TRACK_QUEUE_DEPTH, 1, + [scsi_host_template has members track_queue_depth]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct scsi_host_template has member shost_groups]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_host_template sh = { + .shost_groups = NULL, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_TEMPLATE_SHOST_GROUPS, 1, + [scsi_host_template 
has members shost_groups]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct scsi_host_template has member init_cmd_priv]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_host_template sh = { + .init_cmd_priv = NULL, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_TEMPLATE_INIT_CMD_PRIV, 1, + [scsi_host_template has member init_cmd_priv]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct Scsi_Host has member nr_hw_queues]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct Scsi_Host sh = { + .nr_hw_queues = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_NR_HW_QUEUES, 1, + [Scsi_Host has members nr_hw_queues]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct Scsi_Host has member max_segment_size]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct Scsi_Host sh = { + .max_segment_size = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_MAX_SEGMENT_SIZE, 1, + [Scsi_Host has members max_segment_size]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct Scsi_Host has member virt_boundary_mask]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct Scsi_Host sh = { + .virt_boundary_mask = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK, 1, + [Scsi_Host has members virt_boundary_mask]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h scsi_host_busy_iter fn has 2 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + bool fn(struct scsi_cmnd *scmnd, void *ctx) + { + return false; + } + ],[ + scsi_host_busy_iter(NULL, fn, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_BUSY_ITER_FN_2_ARGS, 1, + [scsi_host.h scsi_host_busy_iter fn has 2 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_cmnd.h struct scsi_cmnd has member prot_flags]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_cmnd sc = { + .prot_flags = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_CMND_PROT_FLAGS, 1, + [scsi_cmnd has members prot_flags]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if target_core_base.h struct se_cmd has member sense_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + ],[ + struct se_cmd se = { + .sense_info = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SE_CMD_HAS_SENSE_INFO, 1, + [struct se_cmd has member sense_info]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if types.h has cycle_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + cycle_t x = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TYPE_CYCLE_T, 1, + [type cycle_t is defined in linux/types.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/clocksource.h has cycle_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + cycle_t x = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CLOCKSOURCE_CYCLE_T, 1, + [cycle_t is defined in linux/clocksource.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_device.h struct scsi_device has member state_mutex]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct scsi_device *sdev; + mutex_init(&sdev->state_mutex); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_DEVICE_STATE_MUTEX, 1, + [scsi_device.h struct scsi_device has member state_mutex]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
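[Editor's illustration, not part of the patch: the struct-member probes above feed the same kind of compat conditionals. A minimal sketch follows, assuming the Scsi_Host virt_boundary_mask field and the per-queue blk_queue_virt_boundary() fallback checked elsewhere in this script; the helper name mlx_set_virt_boundary() is hypothetical.]

/* Sketch only: pick the newer Scsi_Host field when the probe found it,
 * otherwise fall back to the per-queue API.  Helper name is hypothetical. */
#include <scsi/scsi_host.h>
#include <linux/blkdev.h>

static void mlx_set_virt_boundary(struct Scsi_Host *shost,
				  struct request_queue *q,
				  unsigned long mask)
{
#ifdef HAVE_SCSI_HOST_VIRT_BOUNDARY_MASK
	/* Newer kernels: the SCSI midlayer propagates this to every queue. */
	shost->virt_boundary_mask = mask;
#elif defined(HAVE_BLK_QUEUE_VIRT_BOUNDARY)
	/* Older kernels: set the boundary on each request queue directly. */
	blk_queue_virt_boundary(q, mask);
#endif
}
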
AC_MSG_CHECKING([if scsi_device.h struct scsi_device has member budget_map]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_device sdev; + sbitmap_init_node(&sdev.budget_map, 0, 0, 0, 0, false, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_DEVICE_BUDGET_MAP, 1, + [scsi_device.h struct scsi_device has member budget_map]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct scsi_host_template has member use_blk_tags]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_host_template sh = { + .use_blk_tags = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_TEMPLATE_USE_BLK_TAGS, 1, + [scsi_host_template has members use_blk_tags]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct scsi_host_template has member change_queue_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_host_template sh = { + .change_queue_type = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_TEMPLATE_CHANGE_QUEUE_TYPE, 1, + [scsi_host_template has members change_queue_type]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_host.h struct scsi_host_template has member use_host_wide_tags]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct scsi_host_template sh = { + .use_host_wide_tags = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_HOST_TEMPLATE_USE_HOST_WIDE_TAGS, 1, + [scsi_host_template has members use_host_wide_tags]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if target_core_base.h se_cmd transport_complete_callback has three params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + sense_reason_t transport_complete_callback(struct se_cmd *se, bool b, int *i) { + return 0; + } + ],[ + struct se_cmd se = { + .transport_complete_callback = transport_complete_callback, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SE_CMD_TRANSPORT_COMPLETE_CALLBACK_HAS_THREE_PARAM, 1, + [target_core_base.h se_cmd transport_complete_callback has three params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/lightnvm.h has struct nvm_user_vio]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct nvm_user_vio vio; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NVM_USER_VIO, 1, + [struct nvm_user_vio is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h struct request has rq_flags]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct request rq = { .rq_flags = 0 }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_RQ_FLAGS, 1, + [blkdev.h struct request has rq_flags]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-mq.h blk_mq_requeue_request has 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_requeue_request(NULL, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_REQUEUE_REQUEST_2_PARAMS, 1, + [blk-mq.h blk_mq_requeue_request has 2 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_mq_quiesce_queue]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + blk_mq_quiesce_queue(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_QUIESCE_QUEUE, 1, + [blk_mq_quiesce_queue exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-mq.h has BLK_MQ_F_NO_SCHED]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = BLK_MQ_F_NO_SCHED; + + 
return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_F_NO_SCHED, 1, + [BLK_MQ_F_NO_SCHED is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_rq_nr_phys_segments]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + blk_rq_nr_phys_segments(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_RQ_NR_PHYS_SEGMENTS, 1, + [blk_rq_nr_phys_segments exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_rq_payload_bytes]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + blk_rq_payload_bytes(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_RQ_NR_PAYLOAD_BYTES, 1, + [blk_rq_payload_bytes exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has req_op]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct request *req; + req_op(req); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQ_OP, 1, + [req_op exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_rq_nr_discard_segments]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + blk_rq_nr_discard_segments(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_RQ_NR_DISCARD_SEGMENTS, 1, + [blk_rq_nr_discard_segments is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci_ids.h has PCI_CLASS_STORAGE_EXPRESS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = PCI_CLASS_STORAGE_EXPRESS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_CLASS_STORAGE_EXPRESS, 1, + [PCI_CLASS_STORAGE_EXPRESS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if enum req_opf has REQ_OP_DRV_OUT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum req_opf xx = REQ_OP_DRV_OUT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQ_OPF_REQ_OP_DRV_OUT, 1, + [enum req_opf has REQ_OP_DRV_OUT]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if enum req_op has REQ_OP_DRV_OUT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum req_op xx = REQ_OP_DRV_OUT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQ_OP_REQ_OP_DRV_OUT, 1, + [enum req_op has REQ_OP_DRV_OUT]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has enum req_opf]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum req_opf xx = REQ_OP_DRV_OUT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_TYPES_REQ_OPF, 1, + [enum req_opf is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has blk_mq_req_flags_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_req_flags_t x = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_REQ_FLAGS_T, 1, + [blk_mq_req_flags_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/cgroup_rdma.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CGROUP_RDMA_H, 1, + [linux/cgroup_rdma exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if __cgroup_bpf_run_filter_sysctl have 7 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return __cgroup_bpf_run_filter_sysctl(NULL, NULL, 0, NULL, NULL, NULL, 0); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CGROUP_BPF_RUN_FILTER_SYSCTL_7_PARAMETERS, 1, + [__cgroup_bpf_run_filter_sysctl have 7 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/pci-p2pdma.h exists]) + 
MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_P2PDMA_H, 1, + [linux/pci-p2pdma.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if trace/events/rdma_core.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TRACE_EVENTS_RDMA_CORE_HEADER, 1, + [trace/events/rdma_core.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci-p2pdma.h has pci_p2pdma_unmap_sg]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_p2pdma_unmap_sg(NULL, NULL, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_P2PDMA_UNMAP_SG, 1, + [pci_p2pdma_unmap_sg defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/sched/signal.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCHED_SIGNAL_H, 1, + [linux/sched/signal.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/sched/mm.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCHED_MM_H, 1, + [linux/sched/mm.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if memalloc_noio_save defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + unsigned int noio_flag = memalloc_noio_save(); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MEMALLOC_NOIO_SAVE, 1, + [memalloc_noio_save is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/sched/task.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCHED_TASK_H, 1, + [linux/sched/task.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/ip_tunnels.h has struct ip_tunnel_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct ip_tunnel_info ip_tunnel_info_test; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IP_TUNNEL_INFO, 1, + [struct ip_tunnel_info is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ip_tunnel_info_opts_set has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include /* for kernel linux-3.10.0-1149 */ + #include + ],[ + ip_tunnel_info_opts_set(NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IP_TUNNEL_INFO_OPTS_SET_4_PARAMS, 1, + [ip_tunnel_info_opts_set has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if __ip_tun_set_dst has 7 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __ip_tun_set_dst(0, 0, 0, 0, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___IP_TUN_SET_DST_7_PARAMS, 1, + [__ip_tun_set_dst has 7 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bpf_trace exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LINUX_BPF_TRACE_H, 1, + [linux/bpf_trace exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bpf_trace has trace_xdp_exception]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + trace_xdp_exception(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TRACE_XDP_EXCEPTION, 1, + [trace_xdp_exception is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bpf.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LINUX_BPF_H, 1, + 
[uapi/bpf.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + LB_CHECK_SYMBOL_EXPORT([tcf_exts_num_actions], + [net/sched/cls_api.c], + [AC_DEFINE(HAVE_TCF_EXTS_NUM_ACTIONS, 1, + [tcf_exts_num_actions is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([netpoll_poll_dev], + [net/core/netpoll.c], + [AC_DEFINE(HAVE_NETPOLL_POLL_DEV_EXPORTED, 1, + [netpoll_poll_dev is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([bpf_prog_inc], + [kernel/bpf/syscall.c], + [AC_DEFINE(HAVE_BPF_PROG_INC_EXPORTED, 1, + [bpf_prog_inc is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([__put_task_struct], + [kernel/fork.c], + [AC_DEFINE(HAVE_PUT_TASK_STRUCT_EXPORTED, 1, + [__put_task_struct is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([mmput_async], + [kernel/fork.c], + [AC_DEFINE(HAVE_MMPUT_ASYNC_EXPORTED, 1, + [mmput_async is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([get_pid_task], + [kernel/pid.c], + [AC_DEFINE(HAVE_GET_PID_TASK_EXPORTED, 1, + [get_pid_task is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([get_task_pid], + [kernel/pid.c], + [AC_DEFINE(HAVE_GET_TASK_PID_EXPORTED, 1, + [get_task_pid is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([get_task_comm], + [fs/exec.c], + [AC_DEFINE(HAVE_GET_TASK_COMM_EXPORTED, 1, + [get_task_comm is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([__get_task_comm], + [fs/exec.c], + [AC_DEFINE(HAVE___GET_TASK_COMM_EXPORTED, 1, + [__get_task_comm is exported by the kernel])], + []) + + AC_MSG_CHECKING([if linux/bpf.h has bpf_prog_sub]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bpf_prog_sub(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BPF_PROG_SUB, 1, + [bpf_prog_sub is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bpf_prog_add\bfs_prog_inc functions return struct]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bpf_prog *prog; + + prog = bpf_prog_add(prog, 0); + prog = bpf_prog_inc(prog); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BPF_PROG_ADD_RET_STRUCT, 1, + [bpf_prog_add\bfs_prog_inc functions return struct]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bpf.h has XDP_REDIRECT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum xdp_action x = XDP_REDIRECT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_REDIRECT, 1, + [XDP_REDIRECT is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_cls_flower_offload has common]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_cls_flower_offload x = { + .common = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLS_FLOWER_OFFLOAD_COMMON, 1, + [struct tc_cls_flower_offload has common]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_to_netdev has egress_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_to_netdev x = { + .egress_dev = false, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_TO_NETDEV_EGRESS_DEV, 1, + [struct tc_to_netdev has egress_dev]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_cls_flower_offload has egress_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_cls_flower_offload x = { + .egress_dev = false, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLS_FLOWER_OFFLOAD_EGRESS_DEV, 1, + [struct tc_cls_flower_offload has egress_dev]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_cls_offload 
exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_cls_offload x = { + .classid = 3, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_CLS_OFFLOAD, 1, + [struct flow_cls_offload exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_action_entry has ct_metadata.orig_dir]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_action_entry x = { + .ct_metadata.orig_dir = true, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_CT_METADATA_ORIG_DIR, 1, + [struct flow_action_entry has ct_metadata.orig_dir]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_action_entry has ptype]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_action_entry x = { + .ptype = 1, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_PTYPE, 1, + [struct flow_action_entry has ptype]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_action_entry has mpls]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_action_entry x = { + .mpls_push.label = 1, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_MPLS, 1, + [struct flow_action_entry has mpls]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_action_entry has police.index]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_action_entry x = { + .police.index = 1, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_POLICE_INDEX, 1, + [struct flow_action_entry has police.index]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_action_entry has hw_index]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_action_entry x = { + .hw_index = 1, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_HW_INDEX, 1, + [struct flow_action_entry has hw_index]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_action_entry has police.rate_pkt_ps]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_action_entry x = { + .police.rate_pkt_ps = 1, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_POLICE_RATE_PKT_PS, 1, + [struct flow_action_entry has police.rate_pkt_ps]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_rule_match_meta exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_rule_match_meta(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_RULE_MATCH_META, 1, + [flow_rule_match_meta exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_action_hw_stats_check exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_action_hw_stats_check(NULL, NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_HW_STATS_CHECK, 1, + [flow_action_hw_stats_check exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if FLOW_ACTION_POLICE exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_action_id action = FLOW_ACTION_POLICE; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_POLICE, 1, + [FLOW_ACTION_POLICE exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if FLOW_ACTION_CT exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_action_id action = FLOW_ACTION_CT; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_CT, 1, + [FLOW_ACTION_CT exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if 
FLOW_ACTION_REDIRECT_INGRESS exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_action_id action = FLOW_ACTION_REDIRECT_INGRESS; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_REDIRECT_INGRESS, 1, + [FLOW_ACTION_REDIRECT_INGRESS exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if FLOW_ACTION_CT_METADATA exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_action_id action = FLOW_ACTION_CT_METADATA; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_ACTION_CT_METADATA, 1, + [FLOW_ACTION_CT_METADATA exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if NUM_FLOW_ACTIONS exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_action_id action = NUM_FLOW_ACTIONS; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NUM_FLOW_ACTIONS, 1, + [NUM_FLOW_ACTIONS exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if enum flow_block_binder_type exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_block_binder_type binder_type; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ENUM_FLOW_BLOCK_BINDER_TYPE, 1, + [enum flow_block_binder_type exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_indr_block_bind_cb_t has 7 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + static + int mlx5e_rep_indr_setup_cb(struct net_device *netdev, struct Qdisc *sch, void *cb_priv, + enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) + { + return 0; + } + + ],[ + flow_indr_dev_register(mlx5e_rep_indr_setup_cb, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_INDR_BLOCK_BIND_CB_T_7_PARAMS, 1, + [flow_indr_block_bind_cb_t has 7 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_indr_block_bind_cb_t has 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + static + int mlx5e_rep_indr_setup_cb(struct net_device *netdev, void *cb_priv, + enum tc_setup_type type, void *type_data) + { + return 0; + } + + ],[ + flow_indr_dev_register(mlx5e_rep_indr_setup_cb, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_INDR_BLOCK_BIND_CB_T_4_PARAMS, 1, + [flow_indr_block_bind_cb_t has 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_indr_dev_unregister receive flow_setup_cb_t parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type, + void *type_data, void *indr_priv) + { + return 0; + } + + ],[ + flow_indr_dev_unregister(NULL,NULL, mlx5e_rep_indr_setup_tc_cb); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_INDR_DEV_UNREGISTER_FLOW_SETUP_CB_T, 1, + [flow_indr_dev_unregister receive flow_setup_cb_t parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if flow_indr_dev_register exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + flow_indr_dev_register(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_INDR_DEV_REGISTER, 1, + [flow_indr_dev_register exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_stats_update has 5 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_stats_update(NULL, 0, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_STATS_UPDATE_5_PARAMS, 1, + [flow_stats_update has 5 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if 
flow_stats_update has 6 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_stats_update(NULL, 0, 0, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_STATS_UPDATE_6_PARAMS, 1, + [flow_stats_update has 6 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_to_netdev has tc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_to_netdev x; + x.tc = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_TO_NETDEV_TC, 1, + [struct tc_to_netdev has tc]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_lag_upper_info has hash_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_lag_upper_info info; + info.hash_type = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INFO_HASH_TYPE, 1, + [netdev_lag_upper_info has hash_type]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_has_offload_stats gets net_device]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id) + { + return true; + } + ],[ + struct net_device_ops ndops = { + .ndo_has_offload_stats = mlx5e_has_offload_stats, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_HAS_OFFLOAD_STATS_GETS_NET_DEVICE, 1, + [ndo_has_offload_stats gets net_device]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net_device_ops_extended has ndo_has_offload_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id) + { + return true; + } + ],[ + struct net_device_ops_extended ndops = { + .ndo_has_offload_stats = mlx5e_has_offload_stats, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_HAS_OFFLOAD_STATS_EXTENDED, 1, + [ndo_has_offload_stats gets net_device]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ndo_get_offload_stats defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp) + { + return 0; + } + ],[ + struct net_device_ops ndops = { + .ndo_get_offload_stats = mlx5e_get_offload_stats, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_OFFLOAD_STATS, 1, + [ndo_get_offload_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct net_device_ops_extended has ndo_get_offload_stats]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp) + { + return 0; + } + ],[ + struct net_device_ops_extended ndops = { + .ndo_get_offload_stats = mlx5e_get_offload_stats, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_GET_OFFLOAD_STATS_EXTENDED, 1, + [extended ndo_get_offload_stats is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct mlx5e_netdev_ops has ndo_tx_timeout get 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue) + { + return; + } + ],[ + struct net_device_ops mlx5e_netdev_ops = { + .ndo_tx_timeout = mlx5e_tx_timeout, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NDO_TX_TIMEOUT_GET_2_PARAMS, 1, + [ndo_tx_timeout get 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ib_umem_notifier_invalidate_range_start has parameter blockable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + static int notifier(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long 
start, + unsigned long end, + bool blockable) { + return 0; + } + ],[ + static const struct mmu_notifier_ops notifiers = { + .invalidate_range_start = notifier + }; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UMEM_NOTIFIER_PARAM_BLOCKABLE, 1, + [ib_umem_notifier_invalidate_range_start has parameter blockable]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has struct netdev_notifier_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_notifier_info x = { + .dev = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_NOTIFIER_INFO, 1, + [struct netdev_notifier_info is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has field upper_info in struct netdev_notifier_changeupper_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct netdev_notifier_changeupper_info x = { + .upper_info = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_NOTIFIER_CHANGEUPPER_INFO_UPPER_INFO, 1, + [struct netdev_notifier_changeupper_info has upper_info]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tc_act/tc_mpls.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_TC_ACT_TC_MPLS_H, 1, + [net/tc_act/tc_mpls.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tc_act/tc_tunnel_key.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_TC_ACT_TC_TUNNEL_KEY_H, 1, + [net/tc_act/tc_tunnel_key.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tc_act/tc_tunnel_key.h has tcf_tunnel_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct tc_action xx; + tcf_tunnel_info(&xx); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_TUNNEL_INFO, 1, + [tcf_tunnel_info is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tc_act/tc_pedit.h has tcf_pedit_nkeys]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct tc_action xx; + tcf_pedit_nkeys(&xx); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_PEDIT_NKEYS, 1, + [tcf_pedit_nkeys is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tc_act/tc_pedit.h struct tcf_pedit has member tcfp_keys_ex]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tcf_pedit x = { + .tcfp_keys_ex = NULL, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_PEDIT_TCFP_KEYS_EX, 1, + [struct tcf_pedit has member tcfp_keys_ex]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_device.h has function scsi_internal_device_block]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + scsi_internal_device_block(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_DEVICE_SCSI_INTERNAL_DEVICE_BLOCK, 1, + [scsi_device.h has function scsi_internal_device_block]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if libiscsi.h has iscsi_eh_cmd_timed_out]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + iscsi_eh_cmd_timed_out(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSI_EH_CMD_TIMED_OUT, 1, + [iscsi_eh_cmd_timed_out is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if libiscsi.h has iscsi_conn_unbind]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + iscsi_conn_unbind(NULL, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSI_CONN_UNBIND, 1, + 
[iscsi_conn_unbind is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if libiscsi.h iscsi_host_remove has 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + iscsi_host_remove(NULL, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSI_HOST_REMOVE_2_PARAMS, 1, + [libiscsi.h iscsi_host_remove has 2 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi_transport_iscsi.h has iscsi_put_endpoint]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + iscsi_put_endpoint(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSI_PUT_ENDPOINT, 1, + [iscsi_put_endpoint is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/sed-opal.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LINUX_SED_OPAL_H, 1, + [linux/sed-opal.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h bio_init has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bio_init(NULL, NULL, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_INIT_3_PARAMS, 1, + [bio.h bio_init has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_types.h has REQ_IDLE]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int flags = REQ_IDLE; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQ_IDLE, 1, + [blk_types.h has REQ_IDLE]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has __blkdev_issue_zeroout]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __blkdev_issue_zeroout(NULL, 0, 0, 0, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_ISSUE_ZEROOUT, 1, + [__blkdev_issue_zeroout exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if compiler.h has const __read_once_size]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const unsigned long tmp; + __read_once_size(&tmp, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CONST_READ_ONCE_SIZE, 1, + [const __read_once_size exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if configfs_item_operations drop_link returns int]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int my_drop_link(struct config_item *parent, struct config_item *target) + + { + return 0; + } + + ],[ + static struct configfs_item_operations item_ops = { + .drop_link = my_drop_link, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CONFIGFS_DROP_LINK_RETURNS_INT, 1, + [if configfs_item_operations drop_link returns int]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/nvme-fc-driver.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LINUX_NVME_FC_DRIVER_H, 1, + [linux/nvme-fc-driver.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_freeze_queue_start]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_freeze_queue_start(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_FREEZE_QUEUE_START, 1, + [blk_freeze_queue_start is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h blk_mq_complete_request has 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_complete_request(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_COMPLETE_REQUEST_HAS_2_PARAMS, 1, + [linux/blk-mq.h blk_mq_complete_request has 2 
parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h blk_mq_ops init_request has 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int init_request(struct blk_mq_tag_set *set, struct request * req, + unsigned int i, unsigned int k) { + return 0; + } + ],[ + struct blk_mq_ops ops = { + .init_request = init_request, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_INIT_REQUEST_HAS_4_PARAMS, 1, + [linux/blk-mq.h blk_mq_ops init_request has 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h blk_mq_ops exit_request has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + void exit_request(struct blk_mq_tag_set *set, struct request * req, + unsigned int i) { + return; + } + ],[ + struct blk_mq_ops ops = { + .exit_request = exit_request, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_EXIT_REQUEST_HAS_3_PARAMS, 1, + [linux/blk-mq.h blk_mq_ops exit_request has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h blk_mq_tag_set has member map]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_mq_tag_set x = { + .map = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_TAG_SET_HAS_MAP, 1, + [blk_mq_tag_set has member map]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h blk_mq_tag_set has member ops is const]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + static const struct blk_mq_ops xmq = {0}; + + ],[ + struct blk_mq_tag_set x = { + .ops = &xmq, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_TAG_SET_HAS_CONST_OPS, 1, + [ blk_mq_tag_set member ops is const]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h has blk_queue_max_write_zeroes_sectors]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_max_write_zeroes_sectors(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_MAX_WRITE_ZEROES_SECTORS, 1, + [blk_queue_max_write_zeroes_sectors is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/pci.h has pci_free_irq]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_free_irq(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_FREE_IRQ, 1, + [linux/pci.h has pci_free_irq]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/security.h has register_lsm_notifier]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + register_lsm_notifier(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REGISTER_LSM_NOTIFIER, 1, + [linux/security.h has register_lsm_notifier]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/security.h has register_blocking_lsm_notifier]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + register_blocking_lsm_notifier(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REGISTER_BLOCKING_LSM_NOTIFIER, 1, + [linux/security.h has register_blocking_lsm_notifier]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/dma-map-ops.h has DMA_F_PCI_P2PDMA_SUPPORTED]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct dma_map_ops * a; + a->flags = DMA_F_PCI_P2PDMA_SUPPORTED; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_F_PCI_P2PDMA_SUPPORTED, 1, + [linux/dma-map-ops.h has DMA_F_PCI_P2PDMA_SUPPORTED]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/cdev.h has cdev_set_parent]) + 
MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + cdev_set_parent(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CDEV_SET_PARENT, 1, + [linux/cdev.h has cdev_set_parent]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/atomic.h has __atomic_add_unless]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + atomic_t x; + __atomic_add_unless(&x, 1, 1); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___ATOMIC_ADD_UNLESS, 1, + [__atomic_add_unless is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/atomic.h has atomic_fetch_add_unless]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + atomic_t x; + atomic_fetch_add_unless(&x, 1, 1); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ATOMIC_FETCH_ADD_UNLESS, 1, + [atomic_fetch_add_unless is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/net_tstamp.h has HWTSTAMP_FILTER_NTP_ALL]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = HWTSTAMP_FILTER_NTP_ALL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_HWTSTAMP_FILTER_NTP_ALL, 1, + [HWTSTAMP_FILTER_NTP_ALL is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/pkt_cls.h has tcf_exts_stats_update]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_exts_stats_update(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_EXTS_STATS_UPDATE, 1, + [tcf_exts_stats_update is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tcf_exts has actions as array]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tcf_exts x; + x.actions = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_EXTS_HAS_ARRAY_ACTIONS, 1, + [struct tcf_exts has actions as array]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/pkt_cls.h has tc_cls_can_offload_and_chain0]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tc_cls_can_offload_and_chain0(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLS_CAN_OFFLOAD_AND_CHAIN0, 1, + [tc_cls_can_offload_and_chain0 is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tc_act/tc_sum.h has is_tcf_csum]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_tcf_csum(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_TCF_CSUM, 1, + [is_tcf_csum is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tc_action_ops has id]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_action_ops x = { .id = 0, }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_ACTION_OPS_HAS_ID, 1, + [struct tc_action_ops has id]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct tcf_common exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tcf_common pc; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_COMMON, 1, + [struct tcf_common is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tcf_hash helper functions have tcf_hashinfo parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_hash_check(0, NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_HASH_WITH_HASHINFO, 1, + [tcf_hash helper functions have tcf_hashinfo parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/nvme_ioctl.h has NVME_IOCTL_RESCAN]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + unsigned int x = NVME_IOCTL_RESCAN; + return 0; + ],[ + 
AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UAPI_LINUX_NVME_IOCTL_RESCAN, 1, + [uapi/linux/nvme_ioctl.h has NVME_IOCTL_RESCAN]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if refcount.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REFCOUNT, 1, + [refcount.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if refcount.h has refcount_dec_if_one]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bool i = refcount_dec_if_one(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REFCOUNT_DEC_IF_ONE, 1, + [refcount.h has refcount_dec_if_one]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if firmware.h has request_firmware_direct]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + request_firmware_direct(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_FIRMWARE_DIRECT, 1, + [request_firmware_direct is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/pr.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PR_H, 1, + [linux/pr.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/device/bus.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LINUX_DEVICE_BUS_H, 1, + [linux/device/bus.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bus_type remove function return void]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static void auxiliary_bus_remove(struct device *dev) + { + } + ],[ + struct bus_type btype = { + .remove = auxiliary_bus_remove, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BUS_TYPE_REMOVE_RETURN_VOID, 1, + [bus_type remove function return void]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/kern_levels.h has LOGLEVEL_DEFAULT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = LOGLEVEL_DEFAULT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LOGLEVEL_DEFAULT, 1, + [linux/kern_levels.h has LOGLEVEL_DEFAULT]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if linux/t10-pi.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_T10_PI_H, 1, + [linux/t10-pi.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pm.h struct dev_pm_info has member set_latency_tolerance]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + #include + + static void nvme_set_latency_tolerance(struct device *dev, s32 val) + { + return; + } + ],[ + struct dev_pm_info dpinfo = { + .set_latency_tolerance = nvme_set_latency_tolerance, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEV_PM_INFO_SET_LATENCY_TOLERANCE, 1, + [set_latency_tolerance is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h blk_mq_alloc_request has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_alloc_request(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_ALLOC_REQUEST_HAS_3_PARAMS, 1, + [linux/blk-mq.h blk_mq_alloc_request has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has REQ_TYPE_DRV_PRIV]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum rq_cmd_type_bits rctb = REQ_TYPE_DRV_PRIV; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_REQ_TYPE_DRV_PRIV, 1, + 
[REQ_TYPE_DRV_PRIV is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h blk_add_request_payload has 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_add_request_payload(NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_ADD_REQUEST_PAYLOAD_HAS_4_PARAMS, 1, + [blkdev.h blk_add_request_payload has 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has REQ_OP_FLUSH]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = REQ_OP_FLUSH; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_TYPES_REQ_OP_FLUSH, 1, + [REQ_OP_FLUSH is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has REQ_OP_DISCARD]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = REQ_OP_DISCARD; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_TYPES_REQ_OP_DISCARD, 1, + [REQ_OP_DISCARD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has blk_status_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_status_t xx; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_STATUS_T, 1, + [blk_status_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h struct bio_integrity_payload has member bip_iter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct bvec_iter bip_it = {0}; + struct bio_integrity_payload bip = { + .bip_iter = bip_it, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_INTEGRITY_PYLD_BIP_ITER, 1, + [bio_integrity_payload has members bip_iter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/pci_ids.h has PCI_VENDOR_ID_AMAZON]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = PCI_VENDOR_ID_AMAZON; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_IDS_PCI_VENDOR_ID_AMAZON, 1, + [PCI_VENDOR_ID_AMAZON is defined in pci_ids]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has BLK_INTEGRITY_DEVICE_CAPABLE]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum blk_integrity_flags bif = BLK_INTEGRITY_DEVICE_CAPABLE; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_INTEGRITY_DEVICE_CAPABLE, 1, + [BLK_INTEGRITY_DEVICE_CAPABLE is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has BLK_MAX_WRITE_HINTS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = BLK_MAX_WRITE_HINTS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MAX_WRITE_HINTS, 1, + [BLK_MAX_WRITE_HINTS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_rq_append_bio]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio **bio; + + blk_rq_append_bio(NULL, bio); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_RQ_APPEND_BIO, 1, + [blk_rq_append_bio is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_init_request_from_bio]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_init_request_from_bio(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_INIT_REQUEST_FROM_BIO, 1, + [blk_init_request_from_bio is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if device.h has device_remove_file_self]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + device_remove_file_self(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVICE_REMOVE_FILE_SELF, 1, + [device.h has 
device_remove_file_self]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has device_add_disk]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + device_add_disk(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVICE_ADD_DISK, 1, + [genhd.h has device_add_disk]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has device_add_disk 3 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + device_add_disk(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVICE_ADD_DISK_3_ARGS, 1, + [genhd.h has device_add_disk]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_unquiesce_queue]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_unquiesce_queue(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_UNQUIESCE_QUEUE, 1, + [blk_mq_unquiesce_queue is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_alloc_request_hctx]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_alloc_request_hctx(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_ALLOC_REQUEST_HCTX, 1, + [linux/blk-mq.h has blk_mq_alloc_request_hctx]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/lightnvm.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LIGHTNVM_H, 1, + [linux/lightnvm.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h struct pci_error_handlers has reset_notify]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + void reset(struct pci_dev *dev, bool prepare) { + return; + } + ],[ + struct pci_error_handlers x = { + .reset_notify = reset, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_ERROR_HANDLERS_RESET_NOTIFY, 1, + [pci.h struct pci_error_handlers has reset_notify]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi.h has SCSI_MAX_SG_SEGMENTS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = SCSI_MAX_SG_SEGMENTS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_MAX_SG_SEGMENTS, 1, + [SCSI_MAX_SG_SEGMENTS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h sg_alloc_table_chained has 4 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + gfp_t gfp_mask; + sg_alloc_table_chained(NULL, 0, gfp_mask, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_ALLOC_TABLE_CHAINED_4_PARAMS, 1, + [sg_alloc_table_chained has 4 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h _sg_alloc_table_from_pages has 9 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include ; + ],[ + struct scatterlist *sg; + + sg = __sg_alloc_table_from_pages(NULL, NULL, 0, 0, + 0, 0, NULL, 0, GFP_KERNEL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_ALLOC_TABLE_FROM_PAGES_GET_9_PARAMS, 1, + [__sg_alloc_table_from_pages has 9 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h has sgl_free]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sgl_free(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SGL_FREE, 1, + [sgl_free is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h has sgl_alloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sgl_alloc(0, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SGL_ALLOC, 1, + [sgl_alloc 
is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h has sg_zero_buffer]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sg_zero_buffer(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_ZERO_BUFFER, 1, + [sg_zero_buffer is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h has sg_append_table]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct sg_append_table sgt_append; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_APPEND_TABLE, 1, + [linux/scatterlist.h has sg_append_table]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/uuid.h has uuid_gen]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + uuid_t id; + uuid_gen(&id); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UUID_GEN, 1, + [uuid_gen is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/uuid.h has uuid_is_null]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + uuid_t uuid; + uuid_is_null(&uuid); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UUID_IS_NULL, 1, + [uuid_is_null is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/uuid.h has uuid_be_to_bin]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + uuid_be_to_bin(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UUID_BE_TO_BIN, 1, + [uuid_be_to_bin is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/uuid.h has uuid_equal]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + uuid_equal(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UUID_EQUAL, 1, + [uuid_equal is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/inet.h inet_pton_with_scope]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + inet_pton_with_scope(NULL, 0, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INET_PTON_WITH_SCOPE, 1, + [inet_pton_with_scope is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/dma-resv.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_RESV_H, 1, + [linux/dma-resv.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/dma-resv.h has dma_resv_wait_timeout]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_resv_wait_timeout(NULL, 0, 0, 0); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_RESV_WAIT_TIMEOUT, 1, + [linux/dma-resv.h has dma_resv_wait_timeout]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/dma-resv.h has dma_resv_excl_fence]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_resv_excl_fence(NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_RESV_EXCL_FENCE, 1, + [linux/dma-resv.h has dma_resv_excl_fence]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/nvme_ioctl.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UAPI_LINUX_NVME_IOCTL_H, 1, + [uapi/linux/nvme_ioctl.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has QUEUE_FLAG_WC_FUA]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = QUEUE_FLAG_WC; + int y = QUEUE_FLAG_FUA; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_QUEUE_FLAG_WC_FUA, 1, + [QUEUE_FLAG_WC_FUA is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h sg_alloc_table_chained has 3 
parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sg_alloc_table_chained(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_ALLOC_TABLE_CHAINED_3_PARAMS, 1, + [sg_alloc_table_chained has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_tagset_busy_iter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static void + nvme_cancel_request(struct request *req, void *data, bool reserved) { + return; + } + ],[ + blk_mq_tagset_busy_iter(NULL, nvme_cancel_request, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_TAGSET_BUSY_ITER, 1, + [blk_mq_tagset_busy_iter is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma_buf_dynamic_attach get 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_buf_dynamic_attach(NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_BUF_DYNAMIC_ATTACH_GET_4_PARAMS, 1, + [dma_buf_dynamic_attach get 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct dma_buf_attach_ops has allow_peer2peer]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct dma_buf_attach_ops x = { + .allow_peer2peer = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_BUF_ATTACH_OPS_ALLOW_PEER2PEER, 1, + [struct dma_buf_attach_ops has allow_peer2peer]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct request_queue has q_usage_counter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct percpu_ref counter = {0}; + struct request_queue rq = { + .q_usage_counter = counter, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_QUEUE_Q_USAGE_COUNTER, 1, + [struct request_queue has q_usage_counter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if string.h has memdup_user_nul]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + memdup_user_nul(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MEMDUP_USER_NUL, 1, + [memdup_user_nul is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if radix-tree.h hasradix_tree_is_internal_node]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + radix_tree_is_internal_node(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RADIX_TREE_IS_INTERNAL, 1, + [radix_tree_is_internal_node is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if radix-tree.h has radix_tree_iter_delete]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + radix_tree_iter_delete(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RADIX_TREE_ITER_DELETE, 1, + [radix_tree_iter_delete is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if radix-tree.h has radix_tree_iter_lookup]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + radix_tree_iter_lookup(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RADIX_TREE_ITER_LOOKUP, 1, + [radix_tree_iter_lookup is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_queue_write_cache]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_write_cache(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE, 1, + [blkdev.h has blk_queue_write_cache]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_all_tag_busy_iter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static void + nvme_cancel_request(struct request *req, void *data, bool 
reserved) { + return; + } + ],[ + blk_mq_all_tag_busy_iter(NULL, nvme_cancel_request, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_ALL_TAG_BUSY_ITER, 1, + [blk_mq_all_tag_busy_iter is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_update_nr_hw_queues]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_update_nr_hw_queues(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_UPDATE_NR_HW_QUEUES, 1, + [blk_mq_update_nr_hw_queues is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_map_queues]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_map_queues(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_MAP_QUEUES, 1, + [blk_mq_map_queues is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netif_is_rxfh_configured]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netif_is_rxfh_configured(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_RXFH_CONFIGURED, 1, + [netif_is_rxfh_configured is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bareudp.h has netif_is_bareudp]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netif_is_bareudp(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_BAREDUDP, 1, + [netif_is_bareudp is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has enum tc_setup_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum tc_setup_type x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_TYPE, 1, + [TC_SETUP_TYPE is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has TC_SETUP_QDISC_MQPRIO]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum tc_setup_type x = TC_SETUP_QDISC_MQPRIO; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_QDISC_MQPRIO, 1, + [TC_SETUP_QDISC_MQPRIO is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has TC_SETUP_FT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum tc_setup_type x = TC_SETUP_FT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_FT, 1, + [TC_TC_SETUP_FT is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if iscsit_set_unsolicited_dataout is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + iscsit_set_unsolicited_dataout(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ISCSIT_SET_UNSOLICITED_DATAOUT, 1, + [iscsit_set_unsolicited_dataout is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mmu_notifier.h has mmu_notifier_call_srcu]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmu_notifier_call_srcu(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMU_NOTIFIER_CALL_SRCU, 1, + [mmu_notifier_call_srcu defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mmu_notifier.h has mmu_notifier_synchronize]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmu_notifier_synchronize(); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMU_NOTIFIER_SYNCHRONIZE, 1, + [mmu_notifier_synchronize defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if mmu_notifier.h has mmu_notifier_range_blockable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct mmu_notifier_range *range; + + mmu_notifier_range_blockable(range); + return 0; + ],[ + AC_MSG_RESULT(yes) 
+ MLNX_AC_DEFINE(HAVE_MMU_NOTIFIER_RANGE_BLOCKABLE, 1, + [mmu_notifier_range_blockable defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct mmu_notifier_ops has free_notifier ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + static struct mmu_notifier_ops notifiers = { + .free_notifier = NULL, + }; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMU_NOTIFIER_OPS_HAS_FREE_NOTIFIER, 1, + [ struct mmu_notifier_ops has alloc/free_notifier ]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ib_umem_notifier_invalidate_range_start get struct mmu_notifier_range ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + static int notifier(struct mmu_notifier *mn, + const struct mmu_notifier_range *range) + { + return 0; + } + ],[ + static const struct mmu_notifier_ops notifiers = { + .invalidate_range_start = notifier + }; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMU_NOTIFIER_RANGE_STRUCT, 1, + [ ib_umem_notifier_invalidate_range_start get struct mmu_notifier_range ]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mmu_notifier.h has mmu_notifier_unregister_no_release]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + mmu_notifier_unregister_no_release(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMU_NOTIFIER_UNREGISTER_NO_RELEASE, 1, + [mmu_notifier_unregister_no_release defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have mmu interval notifier]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + static struct mmu_interval_notifier_ops int_notifier_ops_xx= { + .invalidate = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MMU_INTERVAL_NOTIFIER, 1, + [mmu interval notifier defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct mmu_notifier_ops has invalidate_page]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + static struct mmu_notifier_ops mmu_notifier_ops_xx= { + .invalidate_page = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INVALIDATE_PAGE, 1, + [invalidate_page defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_mq_end_request accepts blk_status_t as second parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + blk_status_t error = BLK_STS_OK; + + blk_mq_end_request(NULL, error); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_END_REQUEST_TAKES_BLK_STATUS_T, 1, + [blk_mq_end_request accepts blk_status_t as second parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if linux/blk_types.h has REQ_INTEGRITY]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = REQ_INTEGRITY; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_TYPES_REQ_INTEGRITY, 1, + [REQ_INTEGRITY is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bio.h bio_endio has 1 parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bio_endio(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_ENDIO_1_PARAM, 1, + [linux/bio.h bio_endio has 1 parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has __blkdev_issue_discard]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __blkdev_issue_discard(NULL, 0, 0, 0, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___BLKDEV_ISSUE_DISCARD, 1, + [__blkdev_issue_discard is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if __blkdev_issue_discard has 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + 
__blkdev_issue_discard(NULL, 0, 0, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___BLKDEV_ISSUE_DISCARD_5_PARAM, 1, + [__blkdev_issue_discard has 5 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bio.h submit_bio has 1 parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + submit_bio(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SUBMIT_BIO_1_PARAM, 1, + [linux/bio.h submit_bio has 1 parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct bio has member bi_iter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio b = { + .bi_iter = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_BIO_BI_ITER, 1, + [struct bio has member bi_iter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct bio has member bi_disk]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio b = { + .bi_disk = NULL, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_BI_DISK, 1, + [struct bio has member bi_disk]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct bio has member bi_error]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio b = { + .bi_error = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_BIO_BI_ERROR, 1, + [struct bio has member bi_error]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct ifla_vf_stats has rx_dropped and tx_dropped]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct ifla_vf_stats x = { + .rx_dropped = 0, + .tx_dropped = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_IFLA_VF_STATS_RX_TX_DROPPED, 1, + [struct ifla_vf_stats has memebers rx_dropped and tx_dropped]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/moduleparam.h has member param_ops_ullong]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + param_get_ullong(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PARAM_OPS_ULLONG, 1, + [param_ops_ullong is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if fs.h has stream_open]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + stream_open(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STREAM_OPEN, 1, + [fs.h has stream_open]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([if vfs_getattr has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + vfs_getattr(NULL, NULL, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VFS_GETATTR_HAS_4_PARAMS, 1, + [vfs_getattr has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/fs.h has struct kiocb definition]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct kiocb x = { + .ki_flags = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FS_HAS_KIOCB, 1, + [struct kiocb is defined in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has struct bio_aux]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct bio_aux x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RH7_STRUCT_BIO_AUX, 1, + [struct bio_aux is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/pci.h has pci_irq_vector, pci_free_irq_vectors, pci_alloc_irq_vectors]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_irq_vector(NULL, 0); + pci_free_irq_vectors(NULL); + pci_alloc_irq_vectors(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_PCI_IRQ_API, 1, + [linux/pci.h has pci_irq_vector, pci_free_irq_vectors, pci_alloc_irq_vectors]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h struct pci_error_handlers has reset_prepare]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + void reset_prepare(struct pci_dev *dev) { + return; + } + ],[ + struct pci_error_handlers x = { + .reset_prepare = reset_prepare, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_ERROR_HANDLERS_RESET_PREPARE, 1, + [pci.h struct pci_error_handlers has reset_prepare]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h struct pci_error_handlers has reset_done]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + void reset_done(struct pci_dev *dev) { + return; + } + ],[ + struct pci_error_handlers x = { + .reset_done = reset_done, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_ERROR_HANDLERS_RESET_DONE, 1, + [pci.h struct pci_error_handlers has reset_done]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/io-64-nonatomic-lo-hi.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IO_64_NONATOMIC_LO_HI_H, 1, + [linux/io-64-nonatomic-lo-hi.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pci_request_mem_regions]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_request_mem_regions(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_REQUEST_MEM_REGIONS, 1, + [pci_request_mem_regions is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pci_release_mem_regions]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_release_mem_regions(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_RELEASE_MEM_REGIONS, 1, + [pci_release_mem_regions is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h pcie_get_minimum_link]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pcie_get_minimum_link(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCIE_GET_MINIMUM_LINK, 1, + [pcie_get_minimum_link is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h pcie_print_link_status]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pcie_print_link_status(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCIE_PRINT_LINK_STATUS, 1, + [pcie_print_link_status is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pnv-pci.h has pnv_pci_set_p2p]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pnv_pci_set_p2p(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PNV_PCI_SET_P2P, 1, + [pnv-pci.h has pnv_pci_set_p2p]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct timerqueue_head has struct rb_root_cached]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct timerqueue_head *head; + struct rb_node *leftmost = rb_first_cached(&head->rb_root); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TIMERQUEUE_HEAD_RB_ROOT_CACHED, 1, + [struct timerqueue_head has struct rb_root_cached]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if rbtree.h has struct rb_root_cached]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rb_root_cached rb_root_cached_test; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RB_ROOT_CACHED, 1, + [struct rb_root_cached is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
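[Editorial aside, not part of the original patch.] Each HAVE_* macro that these checks emit is normally consumed in the driver's backport compat layer through a plain #ifdef split. A minimal sketch for the HAVE_PCI_IRQ_API result defined just above, assuming a hypothetical helper mlx_setup_irqs() and a caller-provided msix_entry table for the legacy path:

    /* Sketch only: the function name and surrounding driver structure are assumptions. */
    #include <linux/pci.h>

    static int mlx_setup_irqs(struct pci_dev *pdev, struct msix_entry *entries,
                              int max_vecs)
    {
            int nvec;

    #ifdef HAVE_PCI_IRQ_API
            /* Modern API: the PCI core owns the vector table. */
            nvec = pci_alloc_irq_vectors(pdev, 1, max_vecs, PCI_IRQ_MSIX);
            if (nvec < 0)
                    return nvec;
            return pci_irq_vector(pdev, 0);   /* Linux IRQ number of vector 0 */
    #else
            /* Older kernels: fall back to pci_enable_msix_range(). */
            nvec = pci_enable_msix_range(pdev, entries, 1, max_vecs);
            if (nvec < 0)
                    return nvec;
            return entries[0].vector;
    #endif
    }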
LB_CHECK_SYMBOL_EXPORT([interval_tree_insert], + [lib/interval_tree.c], + [AC_DEFINE(HAVE_INTERVAL_TREE_EXPORTED, 1, + [interval_tree functions exported by the kernel])], + []) + + AC_MSG_CHECKING([if INTERVAL_TREE takes rb_root]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + struct x_node { + u64 __subtree_last; + struct rb_node rb; + }; + static u64 node_last(struct x_node *n) + { + return 0; + } + static u64 node_start(struct x_node *n) + { + return 0; + } + INTERVAL_TREE_DEFINE(struct x_node, rb, u64, __subtree_last, + node_start, node_last, static, rbt_x) + ],[ + struct x_node x_interval_tree; + struct rb_root x_tree; + rbt_x_insert(&x_interval_tree, &x_tree); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INTERVAL_TREE_TAKES_RB_ROOT, 1, + [INTERVAL_TREE takes rb_root]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if timer.h has timer_setup]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static void activate_timeout_handler_task(struct timer_list *t) + { + return; + } + ],[ + struct timer_list tmr; + timer_setup(&tmr, activate_timeout_handler_task, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TIMER_SETUP, 1, + [timer_setup is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dmapool.h has dma_pool_zalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_pool_zalloc(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_POOL_ZALLOC, 1, + [dma_pool_zalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if act_apt.h tc_setup_cb_egdev_register]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tc_setup_cb_egdev_register(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_SETUP_CB_EGDEV_REGISTER, 1, + [tc_setup_cb_egdev_register is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if act_api.h has tcf_action_stats_update]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcf_action_stats_update(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCF_ACTION_STATS_UPDATE, 1, + [tc_action_stats_update is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/once.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ONCE_H, 1, + [include/linux/once.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has blk_path_error]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + blk_path_error(0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_PATH_ERROR, 1, + [blk_path_error is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if slab.h has kcalloc_node]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + kcalloc_node(0, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KCALLOC_NODE, 1, + [kcalloc_node is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if slab.h has kmalloc_array_node]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + kmalloc_array_node(0, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KMALLOC_ARRAY_NODE, 1, + [kmalloc_array_node is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if kref.h has kref_read]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + kref_read(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KREF_READ, 1, + [kref_read is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
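[Editorial aside, not part of the original patch.] The HAVE_TIMER_SETUP result above typically selects between the two timer callback conventions. A minimal sketch, assuming a hypothetical struct mlx_priv that embeds a struct timer_list named watchdog:

    /* Sketch only: mlx_priv and the function names are assumptions. */
    #include <linux/timer.h>

    struct mlx_priv {
            struct timer_list watchdog;
    };

    #ifdef HAVE_TIMER_SETUP
    /* New convention: the callback receives the timer itself. */
    static void mlx_watchdog(struct timer_list *t)
    {
            struct mlx_priv *priv = from_timer(priv, t, watchdog);
            /* ... handle the timeout for priv ... */
    }

    static void mlx_init_watchdog(struct mlx_priv *priv)
    {
            timer_setup(&priv->watchdog, mlx_watchdog, 0);
    }
    #else
    /* Old convention: setup_timer() passes an unsigned long cookie. */
    static void mlx_watchdog(unsigned long data)
    {
            struct mlx_priv *priv = (struct mlx_priv *)data;
            /* ... handle the timeout for priv ... */
    }

    static void mlx_init_watchdog(struct mlx_priv *priv)
    {
            setup_timer(&priv->watchdog, mlx_watchdog, (unsigned long)priv);
    }
    #endif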
AC_MSG_CHECKING([if linux/inet.h has inet_addr_is_any]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + inet_addr_is_any(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INET_ADDR_IS_ANY, 1, + [inet_addr_is_any is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h has bdev_write_zeroes_sectors]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bdev_write_zeroes_sectors(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BDEV_WRITE_ZEROES_SECTORS, 1, + [bdev_write_zeroes_sectors is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h has blk_queue_flag_set]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_flag_set(0, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_FLAG_SET, 1, + [blk_queue_flag_set is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/uio.h has iov_iter_is_bvec]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct iov_iter i; + + iov_iter_is_bvec(&i); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IOV_ITER_IS_BVEC_SET, 1, + [iov_iter_is_bvec is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/blk-mq-pci.h has blk_mq_pci_map_queues]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_pci_map_queues(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_PCI_MAP_QUEUES_3_ARGS, 1, + [blk_mq_pci_map_queues is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_master_upper_dev_link gets 5 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_master_upper_dev_link(NULL, NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_MASTER_UPPER_DEV_LINK_5_PARAMS, 1, + [netdev_master_upper_dev_link gets 5 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if BLK_EH_DONE exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + int x = BLK_EH_DONE; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_EH_DONE, 1, + [BLK_EH_DONE is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has netdev_reg_state]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_reg_state(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_REG_STATE, 1, + [netdev_reg_state is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct xfrmdev_ops has member xdo_dev_state_advance_esn]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xfrmdev_ops x = { + .xdo_dev_state_advance_esn = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDO_DEV_STATE_ADVANCE_ESN, 1, + [struct xfrmdev_ops has member xdo_dev_state_advance_esn]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if interrupt.h has irq_calc_affinity_vectors with 3 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = irq_calc_affinity_vectors(0, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IRQ_CALC_AFFINITY_VECTORS_3_ARGS, 1, + [irq_calc_affinity_vectors is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if interrupt.h has irq_set_affinity_and_hint]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = irq_set_affinity_and_hint(0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IRQ_UPDATE_AFFINITY_HINT, 1, + [irq_set_affinity_and_hint is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if 
linux/overflow.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_LINUX_OVERFLOW_H, 1, + [linux/overflow.h is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/rtnetlink.h has net_rwsem]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + down_read(&net_rwsem); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RTNETLINK_NET_RWSEM, 1, + [linux/rtnetlink.h has net_rwsem]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/net/ip6_route.h rt6_lookup takes 6 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + rt6_lookup(NULL, NULL, NULL, 0, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RT6_LOOKUP_TAKES_6_PARAMS, 1, + [rt6_lookup takes 6 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if type __poll_t is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __poll_t x = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TYPE___POLL_T, 1, + [type __poll_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if type rcu_callback_t is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + rcu_callback_t x = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TYPE_RCU_CALLBACK_T, 1, + [type rcu_callback_t is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if function kvfree_call_rcu is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + kvfree_call_rcu(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KVFREE_CALL_RCU, 1, + [function kvfree_call_rcu is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has xdp_init_buff]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xdp_init_buff(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_INIT_BUFF, 1, + [net/xdp.h has xdp_init_buff]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has struct xdp_rxq_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xdp_rxq_info *rxq; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_RXQ_INFO_IN_NET_XDP, 1, + [net/xdp.h has struct xdp_rxq_info]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has struct xdp_rxq_info WA for 5.4.17-2011.1.2.el8uek.x86_64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + ],[ + struct xdp_rxq_info *rxq; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_RXQ_INFO_IN_UEK_KABI, 1, + [net/xdp.h has struct xdp_rxq_info WA for 5.4.17-2011.1.2.el8uek.x86_64]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has xdp_rxq_info_reg get 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xdp_rxq_info_reg(NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_RXQ_INFO_REG_4_PARAMS, 1, + [net/xdp.h has xdp_rxq_info_reg get 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h struct xdp_frame_bulk exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xdp_frame_bulk x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_FRAME_BULK, 1, + [net/xdp.h struct xdp_frame_bulk exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdp_buff has frame_sz ass member]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xdp_buff x; + x.frame_sz = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_BUFF_HAS_FRAME_SZ, 1, + [xdp_buff has frame_sz ass member]) + 
],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has xdp_convert_buff_to_frame]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xdp_convert_buff_to_frame(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_CONVERT_BUFF_TO_FRAME, 1, + [net/xdp.h has xdp_convert_buff_to_frame]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_XDP_HEADER, 1, + [net/xdp.h is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h exists workaround for 5.4.17-2011.1.2.el8uek.x86_64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_XDP_HEADER_UEK_KABI, 1, + [net/xdp.h is defined workaround for 5.4.17-2011.1.2.el8uek.x86_64]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has convert_to_xdp_frame]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + convert_to_xdp_frame(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_CONVERT_TO_XDP_FRAME_IN_NET_XDP, 1, + [net/xdp.h has convert_to_xdp_frame]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has convert_to_xdp_frame workaround for 5.4.17-2011.1.2.el8uek.x86_64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + convert_to_xdp_frame(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_CONVERT_TO_XDP_FRAME_IN_UEK_KABI, 1, + [net/xdp.h has convert_to_xdp_frame workaround for 5.4.17-2011.1.2.el8uek.x86_64]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has xdp_rxq_info_reg_mem_model]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xdp_rxq_info_reg_mem_model(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_RXQ_INFO_REG_MEM_MODEL_IN_NET_XDP, 1, + [net/xdp.h has xdp_rxq_info_reg_mem_model]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp.h has xdp_rxq_info_reg_mem_model workaround for 5.4.17-2011.1.2.el8uek.x86_64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + xdp_rxq_info_reg_mem_model(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_RXQ_INFO_REG_MEM_MODEL_IN_UEK_KABI, 1, + [net/xdp.h has xdp_rxq_info_reg_mem_model workaround for 5.4.17-2011.1.2.el8uek.x86_64]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/vfio_pci_core.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VFIO_PCI_CORE_H, 1, + [linux/vfio_pci_core.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/gro.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_GRO_H, 1, + [net/gro.h is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/page_pool.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_PAGE_POOL_H, 1, + [net/page_pool.h is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/page_pool.h has page_pool_release_page]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + page_pool_release_page(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PAGE_POOL_RELEASE_PAGE, 1, + [net/page_pool.h has page_pool_release_page]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/page_pool.h has page_pool_nid_changed]) 
+ MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + page_pool_nid_changed(NULL,0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PAGE_POLL_NID_CHANGED, 1, + [net/page_pool.h has page_pool_nid_changed]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_TLS_H, 1, + [net/tls.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/tls.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UAPI_LINUX_TLS_H, 1, + [uapi/linux/tls.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h has tls_offload_context_tx]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tls_offload_context_tx tmp; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLS_OFFLOAD_CONTEXT_TX_STRUCT, 1, + [net/tls.h has tls_offload_context_tx]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h has tls_offload_context_rx]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tls_offload_context_rx tmp; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLS_OFFLOAD_CONTEXT_RX_STRUCT, 1, + [net/tls.h has tls_offload_context_rx]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h has tls_offload_rx_resync_request]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tls_offload_rx_resync_request(NULL,0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLS_OFFLOAD_RX_RESYNC_REQUEST, 1, + [net/tls.h has tls_offload_rx_resync_request]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h has tls_driver_ctx]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tls_driver_ctx(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLS_DRIVER_CTX, 1, + [net/tls.h has tls_driver_ctx]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h has tls_offload_rx_force_resync_request]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tls_offload_rx_force_resync_request(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLS_OFFLOAD_RX_FORCE_RESYNC_REQUEST, 1, + [net/tls.h has tls_offload_rx_force_resync_request]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/tls.h has tls_offload_rx_resync_async_request_start]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tls_offload_rx_resync_async_request_start(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TLS_OFFLOAD_RX_RESYNC_ASYNC_REQUEST_START, 1, + [net/tls.h has tls_offload_rx_resync_async_request_start]) + ],[ + AC_MSG_RESULT(no) + ]) + + LB_CHECK_SYMBOL_EXPORT([idr_preload], + [lib/radix-tree.c], + [AC_DEFINE(HAVE_IDR_PRELOAD_EXPORTED, 1, + [idr_preload is exported by the kernel])], + []) + + LB_CHECK_SYMBOL_EXPORT([radix_tree_iter_delete], + [lib/radix-tree.c], + [AC_DEFINE(HAVE_RADIX_TREE_ITER_DELETE_EXPORTED, 1, + [radix_tree_iter_delete is exported by the kernel])], + []) + LB_CHECK_SYMBOL_EXPORT([kobj_ns_grab_current], + [lib/kobject.c], + [AC_DEFINE(HAVE_KOBJ_NS_GRAB_CURRENT_EXPORTED, 1, + [kobj_ns_grab_current is exported by the kernel])], + []) + + AC_MSG_CHECKING([if linux/blk_types.h has REQ_DRV]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = REQ_DRV; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_TYPES_REQ_DRV, 1, + [REQ_DRV is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_alloc_queue_node has 
3 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_alloc_queue_node(0, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_NODE_3_ARGS, 1, + [blk_alloc_queue_node has 3 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have blk_queue_make_request]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_make_request(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_MAKE_REQUEST, 1, + [blk_queue_make_request existing]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have put_unaligned_le24]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + put_unaligned_le24(0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PUT_UNALIGNED_LE24, 1, + [put_unaligned_le24 existing]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for include/linux/part_stat.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PART_STAT_H, 1, [part_stat.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pci_enable_atomic_ops_to_root]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_enable_atomic_ops_to_root(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_ENABLE_ATOMIC_OPS_TO_ROOT, 1, + [pci_enable_atomic_ops_to_root is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if string.h has kstrtobool]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + char s[] = "test"; + bool res; + int rc; + + rc = kstrtobool(s, &res); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KSTRTOBOOL, 1, + [kstrtobool is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_mq_ops has poll]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_mq_ops ops = { + .poll = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_POLL, 1, + [struct blk_mq_ops has poll]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdev_bpf struct has pool member]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct xsk_buff_pool *x; + struct netdev_bpf *xdp; + + xdp->xsk.pool = x; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETDEV_BPF_XSK_BUFF_POOL, 1, + [netdev_bpf struct has pool member]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if select_queue_fallback_t has third parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + extern select_queue_fallback_t fallback; + fallback(NULL, NULL, NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SELECT_QUEUE_FALLBACK_T_3_PARAMS, 1, + [select_queue_fallback_t has third parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if t10_pi_ref_tag() exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + t10_pi_ref_tag(NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_T10_PI_REF_TAG, 1, + [t10_pi_ref_tag() exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has QUEUE_FLAG_PCI_P2PDMA]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = QUEUE_FLAG_PCI_P2PDMA; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_QUEUE_FLAG_PCI_P2PDMA, 1, + [QUEUE_FLAG_PCI_P2PDMA is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if memremap.h has is_pci_p2pdma_page]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_pci_p2pdma_page(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_PCI_P2PDMA_PAGE_IN_MEMREMAP_H, 1, + [is_pci_p2pdma_page is defined]) + ],[ + 
AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if mm.h has is_pci_p2pdma_page]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + is_pci_p2pdma_page(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IS_PCI_P2PDMA_PAGE_IN_MM_H, 1, + [is_pci_p2pdma_page is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if t10-pi.h has t10_pi_prepare]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + t10_pi_prepare(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_T10_PI_PREPARE, 1, + [t10_pi_prepare is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct request_queue has integrity]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct request_queue rq = { + .integrity = {0}, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_QUEUE_INTEGRITY, 1, + [struct request_queue has integrity]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bio.h has bip_get_seed]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bip_get_seed(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_BIP_GET_SEED, 1, + [linux/bio.h has bip_get_seed]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if t10-pi.h has enum t10_dif_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum t10_dif_type x = T10_PI_TYPE0_PROTECTION; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_T10_DIF_TYPE, 1, + [enum t10_dif_type is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_integrity has sector_size]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_integrity bi = { + .sector_size = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_INTEGRITY_SECTOR_SIZE, 1, + [struct blk_integrity has sector_size]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if need expose current_link_speed/width in sysfs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct kobject kobj = {}; + struct device *dev = kobj_to_dev(&kobj); + /* https://patchwork.kernel.org/patch/9759133/ + * patch exposing link stats also introduce this const */ + #ifdef PCI_EXP_LNKCAP_SLS_8_0GB + #error no need + #endif + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NO_LINKSTA_SYSFS, 1, + [current_link_speed/width not exposed]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/pkt_cls.h has TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, 1, + [TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/pkt_cls.h has TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, 1, + [TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/pkt_cls.h has TCA_FLOWER_KEY_SCTP_SRC_MASK]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = TCA_FLOWER_KEY_SCTP_SRC_MASK; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_FLOWER_KEY_SCTP_SRC_MASK, 1, + [TCA_FLOWER_KEY_SCTP_SRC_MASK is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/pkt_cls.h has TCA_FLOWER_KEY_MPLS_TTL]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int 
x = TCA_FLOWER_KEY_MPLS_TTL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_FLOWER_KEY_MPLS_TTL, 1, + [TCA_FLOWER_KEY_MPLS_TTL is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/pkt_cls.h has TCA_FLOWER_KEY_CVLAN_ID]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = TCA_FLOWER_KEY_CVLAN_ID; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_FLOWER_KEY_CVLAN_ID, 1, + [TCA_FLOWER_KEY_CVLAN_ID is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if TCA_TUNNEL_KEY_ENC_TOS exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = TCA_TUNNEL_KEY_ENC_TOS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_TUNNEL_KEY_ENC_TOS, 1, + [TCA_TUNNEL_KEY_ENC_TOS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if TCA_TUNNEL_KEY_ENC_DST_PORT exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = TCA_TUNNEL_KEY_ENC_DST_PORT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT, 1, + [TCA_TUNNEL_KEY_ENC_DST_PORT is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has struct blk_mq_queue_map]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_mq_queue_map x = {}; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_QUEUE_MAP, 1, + [linux/blk-mq.h has struct blk_mq_queue_map]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has busy_tag_iter_fn return bool with 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static bool + nvme_cancel_request(struct request *req, void *data) { + return true; + } + ],[ + busy_tag_iter_fn *fn = nvme_cancel_request; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_2_PARAMS, 1, + [linux/blk-mq.h has busy_tag_iter_fn return bool]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has busy_tag_iter_fn return bool with 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static bool + nvme_cancel_request(struct request *req, void *data, bool reserved) { + return true; + } + ],[ + busy_tag_iter_fn *fn = nvme_cancel_request; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_3_PARAMS, 1, + [linux/blk-mq.h has busy_tag_iter_fn return bool]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_mq_ops has poll 1 arg]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int nvme_poll(struct blk_mq_hw_ctx *hctx) { + return 0; + } + ],[ + struct blk_mq_ops ops = { + .poll = nvme_poll, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_POLL_1_ARG, 1, + [struct blk_mq_ops has poll 1 arg]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct Qdisc_ops has ingress_block_set net/sch_generic.h ]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct Qdisc_ops ops = { + .ingress_block_set = 0, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_QDISC_SUPPORTS_BLOCK_SHARING, 1, + [struct Qdisc_ops has ingress_block_set]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uuid.h has guid_parse]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + char *str; + guid_t uuid; + int ret; + + ret = guid_parse(str, &uuid); + + return ret; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GUID_PARSE, 1, + [guid_parse is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bitmap.h bitmap_zalloc_node]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + 
#include + ],[ + unsigned long *bmap; + + bmap = bitmap_zalloc_node(1, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BITMAP_ZALLOC_NODE, 1, + [bitmap_zalloc_node is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bitmap.h has bitmap_kzalloc]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + unsigned long *bmap; + + bmap = bitmap_zalloc(1, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BITMAP_KZALLOC, 1, + [bitmap_kzalloc is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bitmap.h has bitmap_free]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + unsigned long *bmap; + + bmap = kcalloc(BITS_TO_LONGS(1), sizeof(unsigned long), 0); + bitmap_free(bmap); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BITMAP_FREE, 1, + [bitmap_free is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bitmap.h has bitmap_from_arr32]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + unsigned long *bmap; + u32 *word; + + bmap = kcalloc(BITS_TO_LONGS(1), sizeof(unsigned long), 0); + bitmap_from_arr32(bmap, word, 1); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BITMAP_FROM_ARR32, 1, + [bitmap_from_arr32 is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma-mapping.h has dma_map_sgtable]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int i = dma_map_sgtable(NULL, NULL, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_MAP_SGTABLE, 1, + [dma-mapping.h has dma_map_sgtable]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_cls_common_offload has handle]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_cls_common_offload x; + x.handle = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_CLS_OFFLOAD_HANDLE, 1, + [struct tc_cls_common_offload has handle]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if tc_htb_command has moved_qid]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct tc_htb_qopt_offload *x; + x->moved_qid = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TC_HTB_COMMAND_HAS_MOVED_QID, 1, + [struct tc_htb_command has moved_qid]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if built in flower supports multi mask per prio]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rcu_work *rwork; + work_func_t func; + + tcf_queue_work(rwork, func); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOWER_MULTI_MASK, 1, + [tcf_queue_work has 2 params per prio]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-mq.h has enum hctx_type]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum hctx_type type = HCTX_TYPE_DEFAULT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_HCTX_TYPE, 1, + [blk-mq.h has enum hctx_type]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-mq.h has blk_mq_complete_request_sync]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_complete_request_sync(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_COMPLETE_REQUEST_SYNC, 1, + [blk-mq.h has blk_mq_complete_request_sync]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if scsi/scsi_transport_fc.h has FC_PORT_ROLE_NVME_TARGET]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = FC_PORT_ROLE_NVME_TARGET; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SCSI_TRANSPORT_FC_FC_PORT_ROLE_NVME_TARGET, 1, + [scsi/scsi_transport_fc.h has 
FC_PORT_ROLE_NVME_TARGET]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has REQ_HIPRI]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = REQ_HIPRI; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_TYPES_REQ_HIPRI, 1, + [REQ_HIPRI is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_mq_ops has commit_rqs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_mq_ops ops = { + .commit_rqs = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_COMMIT_RQS, 1, + [struct blk_mq_ops has commit_rqs]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct irq_affinity has priv]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct irq_affinity affd = { + .priv = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IRQ_AFFINITY_PRIV, 1, + [struct irq_affinity has priv]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if interrupt.h has tasklet_setup]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tasklet_setup(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TASKLET_SETUP, 1, + [interrupt.h has tasklet_setup]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if fs.h has IOCB_NOWAIT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = IOCB_NOWAIT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IOCB_NOWAIT, 1, + [fs.h has IOCB_NOWAIT]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if fs.h has FMODE_NOWAIT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = FMODE_NOWAIT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FMODE_NOWAIT, 1, + [fs.h has FMODE_NOWAIT]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma-attrs.h has struct dma_attrs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct dma_attrs attr = {}; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_ATTRS, 1, + [dma-attrs.h has struct dma_attrs]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_delay_kick_requeue_list]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_delay_kick_requeue_list(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_DELAY_KICK_REQUEUE_LIST, 1, + [blk_mq_delay_kick_requeue_list is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has op_is_write]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + op_is_write(0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_OP_IS_WRITE, 1, + [op_is_write is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma_map_bvec exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct bio_vec bv = {}; + + dma_map_bvec(NULL, &bv, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_DMA_MAP_BVEC, 1, + [dma_map_bvec exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_indr_block_cb_alloc exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + flow_indr_block_cb_alloc(NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_INDR_BLOCK_CB_ALLOC, 1, + [flow_indr_block_cb_alloc exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct flow_block_cb exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct flow_block_cb a; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_BLOCK_CB, 1, + [struct flow_block_cb 
exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/scatterlist.h sg_alloc_table_chained has nents_first_chunk parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sg_alloc_table_chained(NULL, 0, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_ALLOC_TABLE_CHAINED_NENTS_FIRST_CHUNK_PARAM, 1, + [sg_alloc_table_chained has nents_first_chunk parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq-rdma.h has blk_mq_rdma_map_queues with map]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct blk_mq_queue_map *map = NULL; + + blk_mq_rdma_map_queues(map, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP, 1, + [linux/blk-mq-rdma.h has blk_mq_rdma_map_queues with map]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has request_to_qc_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + request_to_qc_t(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_TO_QC_T, 1, + [linux/blk-mq.h has request_to_qc_t]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_request_completed]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_request_completed(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_REQUEST_COMPLETED, 1, + [linux/blk-mq.h has blk_mq_request_completed]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has enum mq_rq_state]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + enum mq_rq_state state = MQ_RQ_COMPLETE; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_MQ_RQ_STATE, 1, + [mq_rq_state is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_tagset_wait_completed_request]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_tagset_wait_completed_request(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_TAGSET_WAIT_COMPLETED_REQUEST, 1, + [linux/blk-mq.h has blk_mq_tagset_wait_completed_request]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/net.h has kernel_getsockname 2 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + kernel_getsockname(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KERNEL_GETSOCKNAME_2_PARAMS, 1, + [linux/net.h has kernel_getsockname 2 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if *xpo_secure_port returns void]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + void secure_port(struct svc_rqst *rqstp) + { + return; + } + ],[ + struct svc_xprt_ops check_rdma_ops; + + check_rdma_ops.xpo_secure_port = secure_port; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPO_SECURE_PORT_NO_RETURN, 1, + [xpo_secure_port is defined and returns void]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if svc_fill_write_vector getting 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return svc_fill_write_vector(NULL, NULL, NULL, 0); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_FILL_WRITE_VECTOR_4_PARAMS, 1, + [svc_fill_write_vector getting 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if svc_fill_write_vector getting 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return svc_fill_write_vector(NULL, NULL, 0); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_FILL_WRITE_VECTOR_3_PARAMS, 1, + [svc_fill_write_vector getting 3 params]) + ],[ + AC_MSG_RESULT(no) + 
]) + + AC_MSG_CHECKING([if svc_fill_write_vector getting 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return svc_fill_write_vector(NULL, NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_FILL_WRITE_VECTOR_2_PARAMS, 1, + [svc_fill_write_vector getting 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct svc_rqst has rq_xprt_hlen]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct svc_rqst rqst; + + rqst.rq_xprt_hlen = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_RQST_RQ_XPRT_HLEN, 1, + [struct svc_rqst has rq_xprt_hlen]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if *send_request has 'struct rpc_rqst *req' as a param]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + int send_request(struct rpc_rqst *req) + { + return 0; + } + ],[ + struct rpc_xprt_ops ops; + + ops.send_request = send_request; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPRT_OPS_SEND_REQUEST_RQST_ARG, 1, + [*send_request has 'struct rpc_rqst *req' as a param]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for rpc_reply_expected]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return rpc_reply_expected(NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_REPLY_EXPECTED, 1, [rpc reply expected]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for xprt_request_get_cong]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return xprt_request_get_cong(NULL, NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPRT_REQUEST_GET_CONG, 1, [get cong request]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "xpt_remotebuf" inside "struct svc_xprt"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct svc_xprt dummy_xprt; + + dummy_xprt.xpt_remotebuf[0] = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_XPRT_XPT_REMOTEBUF, 1, + [struct svc_xprt has 'xpt_remotebuf' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "xpo_prep_reply_hdr" inside "struct svc_xprt_ops"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct svc_xprt_ops dummy_svc_ops; + + dummy_svc_ops.xpo_prep_reply_hdr = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_XPRT_XPO_PREP_REPLY_HDR, 1, + [struct svc_xprt_ops 'xpo_prep_reply_hdr' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "xpo_read_payload" inside "struct svc_xprt_ops"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct svc_xprt_ops dummy_svc_ops; + + dummy_svc_ops.xpo_read_payload = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPO_READ_PAYLOAD, 1, + [struct svc_xprt_ops has 'xpo_read_payload' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "xpo_result_payload" inside "struct svc_xprt_ops"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct svc_xprt_ops dummy_svc_ops; + + dummy_svc_ops.xpo_result_payload = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPO_RESULT_PAYLOAD, 1, + [struct svc_xprt_ops has 'xpo_result_payload' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "free_slot" inside "struct rpc_xprt_ops"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_xprt_ops dummy_ops; + + dummy_ops.free_slot = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_XPRT_OPS_FREE_SLOT, 1, + [struct rpc_xprt_ops has 'free_slot' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "set_retrans_timeout" inside "struct rpc_xprt_ops"]) + 
MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_xprt_ops dummy_ops; + + dummy_ops.set_retrans_timeout = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_XPRT_OPS_SET_RETRANS_TIMEOUT, 1, + [struct rpc_xprt_ops has 'set_retrans_timeout' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "wait_for_reply_request" inside "struct rpc_xprt_ops"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_xprt_ops dummy_ops; + + dummy_ops.wait_for_reply_request = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_XPRT_OPS_WAIT_FOR_REPLY_REQUEST, 1, + [struct rpc_xprt_ops has 'wait_for_reply_request' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "queue_lock" inside "struct rpc_xprt"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + spinlock_t *dummy_lock; + struct rpc_xprt dummy_xprt; + + dummy_lock = &dummy_xprt.queue_lock; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPRT_QUEUE_LOCK, 1, + [struct rpc_xprt has 'queue_lock' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if 'struct rpc_xprt_ops *ops' field is const inside 'struct rpc_xprt']) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct rpc_xprt_ops ops = {0}; + struct rpc_xprt xprt; + const struct rpc_xprt_ops *ptr = &ops; + + xprt.ops = ptr; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_XPRT_OPS_CONST, 1, + [struct rpc_xprt_ops *ops' field is const inside 'struct rpc_xprt']) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if 'struct svc_xprt_ops *xcl_ops' field is const inside 'struct svc_xprt_class']) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct svc_xprt_ops xcl_ops = {0}; + struct svc_xprt_class xprt; + const struct svc_xprt_ops *ptr = &xcl_ops; + + xprt.xcl_ops = ptr; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_XPRT_CLASS_XCL_OPS_CONST, 1, + ['struct svc_xprt_ops *xcl_ops' field is const inside 'struct svc_xprt_class']) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xprt_wait_for_buffer_space has xprt as a parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_xprt xprt = {0}; + + xprt_wait_for_buffer_space(&xprt); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPRT_WAIT_FOR_BUFFER_SPACE_RQST_ARG, 1, + [xprt_wait_for_buffer_space has xprt as a parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "recv_lock" inside "struct rpc_xprt"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + spinlock_t *dummy_lock; + struct rpc_xprt dummy_xprt; + + dummy_lock = &dummy_xprt.recv_lock; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_XPRT_RECV_LOCK, 1, [struct rpc_xprt has 'recv_lock' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + + AC_MSG_CHECKING([for "xprt_class" inside "struct rpc_xprt"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_xprt dummy_xprt; + + dummy_xprt.xprt_class = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_XPRT_XPRT_CLASS, 1, [struct rpc_xprt has 'xprt_class' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + LB_CHECK_SYMBOL_EXPORT([xprt_reconnect_delay], + [net/sunrpc/xprt.c], + [AC_DEFINE(HAVE_XPRT_RECONNECT_DELAY, 1, + [xprt_reconnect_delay is exported by the kernel])], + []) + + AC_MSG_CHECKING([for "bc_num_slots" inside "struct rpc_xprt_ops"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_xprt_ops dummy_ops; + + dummy_ops.bc_num_slots = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_RPC_XPRT_OPS_BC_NUM_SLOTS, 1, + [struct rpc_xprt_ops has 'bc_num_slots' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "bc_up" inside "struct rpc_xprt_ops"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_xprt_ops dummy_ops; + + dummy_ops.bc_up = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_RPC_XPRT_OPS_BC_UP, 1, + [struct rpc_xprt_ops has 'bc_up' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "netid" inside "struct xprt_class"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xprt_class xc; + + xc.netid; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XPRT_CLASS_NETID, 1, + [struct xprt_class has 'netid' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/sysctl.h has SYSCTL_ZERO]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + void *dummy; + + dummy = SYSCTL_ZERO; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SYSCTL_ZERO_ENABLED, 1, + [linux/sysctl.h has SYSCTL_ZERO defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if defined XDRBUF_SPARSE_PAGES]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int dummy; + + dummy = XDRBUF_SPARSE_PAGES; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDRBUF_SPARSE_PAGES, 1, + [XDRBUF_SPARSE_PAGES has defined in linux/sunrpc/xdr.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_init_encode has rqst as a parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_rqst *rqst = NULL; + + xdr_init_encode(NULL, NULL, NULL, rqst); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_INIT_ENCODE_RQST_ARG, 1, + [xdr_init_encode has rqst as a parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_init_decode has rqst as a parameter]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct rpc_rqst *rqst = NULL; + + xdr_init_decode(NULL, NULL, NULL, rqst); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_INIT_DECODE_RQST_ARG, 1, + [xdr_init_decode has rqst as a parameter]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_stream_remaining as defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xdr_stream_remaining(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_STREAM_REMAINING, 1, + [xdr_stream_remaining as defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "rc_stream" inside "struct svc_rdma_recv_ctxt"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct xdr_stream dummy_stream; + struct svc_rdma_recv_ctxt dummy_rctxt; + + dummy_rctxt.rc_stream = dummy_stream; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_RDMA_RECV_CTXT_RC_STREAM, 1, + [struct svc_rdma_recv_ctxt has 'rc_stream' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([for "sc_pending_recvs" inside "struct svcxprt_rdma"]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct svcxprt_rdma dummy_rdma; + + dummy_rdma.sc_pending_recvs = 0; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVCXPRT_RDMA_SC_PENDING_RECVS, 1, + [struct svcxprt_rdma has 'sc_pending_recvs' field]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_encode_rdma_segment has defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + xdr_encode_rdma_segment(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_ENCODE_RDMA_SEGMENT, 1, + [xdr_encode_rdma_segment has defined]) + ],[ + 
AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_decode_rdma_segment has defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + xdr_decode_rdma_segment(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_DECODE_RDMA_SEGMENT, 1, + [xdr_decode_rdma_segment has defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_stream_encode_item_absent has defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xdr_stream_encode_item_absent(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_STREAM_ENCODE_ITEM_ABSENT, 1, + [xdr_stream_encode_item_absent has defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_item_is_absent has defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xdr_item_is_absent(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_ITEM_IS_ABSENT, 1, + [xdr_item_is_absent has defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xdr_buf_subsegment get const]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + const struct xdr_buf *dummy; + xdr_buf_subsegment(dummy, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDR_BUF_SUBSEGMENT_CONST, 1, + [xdr_buf_subsegment get const]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if svc_xprt_is_dead has defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + svc_xprt_is_dead(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_XPRT_IS_DEAD, 1, + [svc_xprt_is_dead has defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if svc_rdma_release_rqst has externed]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + svc_rdma_release_rqst(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_RDMA_RELEASE_RQST, 1, + [svc_rdma_release_rqst has externed]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if sg_alloc_table_chained has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return sg_alloc_table_chained(NULL, 0, GFP_ATOMIC, NULL); + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SG_ALLOC_TABLE_CHAINED_GFP_MASK, 1, + [sg_alloc_table_chained has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + LB_CHECK_SYMBOL_EXPORT([xprt_pin_rqst], + [net/sunrpc/xprt.c], + [AC_DEFINE(HAVE_XPRT_PIN_RQST, 1, + [xprt_pin_rqst is exported by the sunrpc core])], + []) + + LB_CHECK_SYMBOL_EXPORT([xprt_add_backlog], + [net/sunrpc/xprt.c], + [AC_DEFINE(HAVE_XPRT_ADD_BACKLOG, 1, + [xprt_add_backlog is exported by the sunrpc core])], + []) + + LB_CHECK_SYMBOL_EXPORT([xprt_lock_connect], + [net/sunrpc/xprt.c], + [AC_DEFINE(HAVE_XPRT_LOCK_CONNECT, 1, + [xprt_lock_connect is exported by the sunrpc core])], + []) + + LB_CHECK_SYMBOL_EXPORT([svc_xprt_deferred_close], + [net/sunrpc/svc_xprt.c], + [AC_DEFINE(HAVE_SVC_XPRT_DEFERRED_CLOSE, 1, + [svc_xprt_deferred_close is exported by the sunrpc core])], + []) + + LB_CHECK_SYMBOL_EXPORT([svc_xprt_received], + [net/sunrpc/svc_xprt.c], + [AC_DEFINE(HAVE_SVC_XPRT_RECEIVED, 1, + [svc_xprt_received is exported by the sunrpc core])], + []) + + LB_CHECK_SYMBOL_EXPORT([svc_xprt_close], + [net/sunrpc/svc_xprt.c], + [AC_DEFINE(HAVE_SVC_XPRT_CLOSE, 1, + [svc_xprt_close is exported by the sunrpc core])], + []) + + AC_MSG_CHECKING([for trace/events/rpcrdma.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include "../../net/sunrpc/xprtrdma/xprt_rdma.h" + + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TRACE_RPCRDMA_H, 1, [rpcrdma.h exists]) + ],[ + AC_MSG_RESULT(no) + 
]) + + AC_MSG_CHECKING([for struct svc_rdma_pcl]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct svc_rdma_pcl *pcl; + + pcl = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SVC_RDMA_PCL, 1, [struct svc_rdma_pcl exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h struct request_queue has timeout_work]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct request_queue q = { .timeout_work = {} }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_QUEUE_TIMEOUT_WORK, 1, + [blkdev.h struct request_queue has timeout_work]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netdevice.h has __netdev_tx_sent_queue]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __netdev_tx_sent_queue(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE___NETDEV_TX_SENT_QUEUE, 1, + [netdevice.h has __netdev_tx_sent_queue]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if synchronize_net done when updating netdev queues]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + /* + * We can't have a real test for upstream commit ac5b70198adc + * This test is good for us. All kernels 4.16+ include the fix. + * And if the older kernels include this synchronize_net fix, + * it is still harmless for us to add it again in our backport. + */ + + #if LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0) + #error No synchronize_net fix in kernel + #endif + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_SYNCHRONIZE_IN_SET_REAL_NUM_TX_QUEUES, 1, + [kernel does synchronize_net for us]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h enum flow_dissector_key_keyid has FLOW_DISSECTOR_KEY_ENC_OPTS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_dissector_key_id keyid = FLOW_DISSECTOR_KEY_ENC_OPTS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_ENC_OPTS, 1, + [FLOW_DISSECTOR_KEY_ENC_OPTS is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h enum flow_dissector_key_keyid has FLOW_DISSECTOR_KEY_META]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + enum flow_dissector_key_id keyid = FLOW_DISSECTOR_KEY_META; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_KEY_META, 1, + [FLOW_DISSECTOR_KEY_META is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netif_is_geneve exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + netif_is_geneve(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_GENEVE, 1, + [netif_is_geneve is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/memremap.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_MEMREMAP_H, 1, + [net/bareudp.h is exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/bareudp.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_BAREUDP_H, 1, + [net/bareudp.h is exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/psample.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NET_PSAMPLE_H, 1, + [net/psample.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/psample.h has struct psample_metadata]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct psample_metadata *x; + 
x->trunc_size = 0; + + return 0 + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_PSAMPLE_METADATA, 1, + [net/psample.h has struct psample_metadata]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if netif_is_bareudp exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netif_is_bareudp(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_NETIF_IS_BAREUDP, 1, + [netif_is_bareudp is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h has req_bvec]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + req_bvec(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_REQ_BVEC, 1, + [linux/blkdev.h has req_bvec]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h has QUEUE_FLAG_QUIESCED]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = QUEUE_FLAG_QUIESCED; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_QUEUE_FLAG_QUIESCED, 1, + [linux/blkdev.h has QUEUE_FLAG_QUIESCED]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci-p2pdma.h has pci_p2pdma_map_sg_attrs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pci_p2pdma_map_sg_attrs(NULL, NULL, 0, 0, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_P2PDMA_MAP_SG_ATTRS, 1, + [pci_p2pdma_map_sg_attrs defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/nvme_ioctl.h has struct nvme_passthru_cmd64]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + struct nvme_passthru_cmd64 cmd = {}; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UAPI_LINUX_NVME_PASSTHRU_CMD64, 1, + [uapi/linux/nvme_ioctl.h has struct nvme_passthru_cmd64]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk_types.h has op_is_sync]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + op_is_sync(0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_TYPE_OP_IS_SYNC, 1, + [linux/blk_types.h has op_is_sync]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/suspend.h has pm_suspend_via_firmware]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pm_suspend_via_firmware(); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PM_SUSPEND_VIA_FIRMWARE, 1, + [linux/suspend.h has pm_suspend_via_firmware]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/dma-mapping.h has dma_max_mapping_size]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_max_mapping_size(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_MAX_MAPPING_SIZE, 1, + [linux/dma-mapping.h has dma_max_mapping_size]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct request_queue has backing_dev_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct backing_dev_info *bdi = NULL; + struct request_queue rq = { + .backing_dev_info = bdi, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_QUEUE_BACKING_DEV_INFO, 1, + [struct request_queue has backing_dev_info]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/skbuff.h has skb_queue_empty_lockless]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + skb_queue_empty_lockless(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SKB_QUEUE_EMPTY_LOCKLESS, 1, + [linux/skbuff.h has skb_queue_empty_lockless]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/pci.h has pcie_aspm_enabled]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pcie_aspm_enabled(NULL); + + return 0; + ],[ + 
AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCIE_ASPM_ENABLED, 1, + [linux/pci.h has pcie_aspm_enabled]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/xdp_sock_drv.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XDP_SOCK_DRV_H, 1, + [net/xdp_sock_drv.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if xsk_buff_dma_sync_for_cpu get 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + xsk_buff_dma_sync_for_cpu(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XSK_BUFF_DMA_SYNC_FOR_CPU_2_PARAMS, 1, + [xsk_buff_dma_sync_for_cpu get 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/linux/units.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_UNITS_H, 1, + [include/linux/units.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h struct request has mq_hctx]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct request rq = { .mq_hctx = NULL }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_MQ_HCTX, 1, + [blkdev.h struct request has mq_hctx]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h has bio_integrity_bytes]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bio_integrity_bytes(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_BIO_INTEGRITY_BYTES, 1, + [linux/blkdev.h has bio_integrity_bytes]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/compat.h has in_compat_syscall]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + in_compat_syscall(); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IN_COMPAT_SYSCALL, 1, + [linux/compat.h has in_compat_syscall]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if include/net/esp.h has esp_output_fill_trailer]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + esp_output_fill_trailer(NULL, 0, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ESP_OUTPUT_FILL_TRAILER, 1, + [esp_output_fill_trailer is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/compat.h has compat_uptr_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + compat_uptr_t x; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_COMPAT_UPTR_T, 1, + [linux/compat.h has compat_uptr_t]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_queue_max_active_zones exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_max_active_zones(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_MAX_ACTIVE_ZONES, 1, + [blk_queue_max_active_zones exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has set_capacity_revalidate_and_notify]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + set_capacity_revalidate_and_notify(NULL, 0, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SET_CAPACITY_REVALIDATE_AND_NOTIFY, 1, + [genhd.h has set_capacity_revalidate_and_notify]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct block_device_operations has submit_bio]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct block_device_operations ops = { + .submit_bio = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_SUBMIT_BIO, 1, + [struct block_device_operations has submit_bio]) + ],[ + AC_MSG_RESULT(no) + ]) + + 
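As a rough sketch of how the HAVE_* results of these probes end up being consumed (they are written into config.h, which COMPAT_CONFIG_HEADERS below force-includes through EXTRA_KCFLAGS): the blkdev_issue_flush checks further down distinguish the one-, two- and three-argument kernel variants, and a backported caller would typically branch on them as shown here. example_issue_flush() is a hypothetical helper for illustration only, not code from this patch.

#include <linux/blkdev.h>

/* Hypothetical helper, illustration only; not part of mlnx-ofa_kernel. */
static int example_issue_flush(struct block_device *bdev)
{
#if defined(HAVE_BLKDEV_ISSUE_FLUSH_1_PARAM)
	/* Newer kernels: the gfp mask and error-sector arguments were dropped. */
	return blkdev_issue_flush(bdev);
#elif defined(HAVE_BLKDEV_ISSUE_FLUSH_2_PARAM)
	/* Intermediate kernels: block device plus a gfp mask. */
	return blkdev_issue_flush(bdev, GFP_KERNEL);
#else
	/* Older kernels: the original three-argument form with an error sector. */
	return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
#endif
}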
AC_MSG_CHECKING([if blk_queue_split has 1 param]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_split(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_SPLIT_1_PARAM, 1, + [blk_queue_split has 1 param]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has bio_split_to_limits]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bio_split_to_limits(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_SPLIT_TO_LIMITS, 1, + [blkdev.h has bio_split_to_limits]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if submit_bio_noacct exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + submit_bio_noacct(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SUBMIT_BIO_NOACCT, 1, + [submit_bio_noacct exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci.h has pcie_find_root_port]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + pcie_find_root_port(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCIE_FIND_ROOT_PORT, 1, + [pci.h has pcie_find_root_port]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_should_fake_timeout]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_should_fake_timeout(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_SHOULD_FAKE_TIMEOUT, 1, + [linux/blk-mq.h has blk_should_fake_timeout]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_complete_request_remote]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_complete_request_remote(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_COMPLETE_REQUEST_REMOTE, 1, + [linux/blk-mq.h has blk_mq_complete_request_remote]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if trace_block_bio_complete has 2 param]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + trace_block_bio_complete(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TRACE_BLOCK_BIO_COMPLETE_2_PARAM, 1, + [trace_block_bio_complete has 2 param]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/ip.h has ip_sock_set_tos]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ip_sock_set_tos(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IP_SOCK_SET_TOS, 1, + [net/ip.h has ip_sock_set_tos]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/tcp.h has tcp_sock_set_syncnt]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcp_sock_set_syncnt(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCP_SOCK_SET_SYNCNT, 1, + [linux/tcp.h has tcp_sock_set_syncnt]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/tcp.h has tcp_sock_set_nodelay]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + tcp_sock_set_nodelay(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TCP_SOCK_SET_NODELAY, 1, + [linux/tcp.h has tcp_sock_set_nodelay]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if string.h has kmemdup_nul]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + kmemdup_nul(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_KMEMDUP_NUL, 1, + [string.h has kmemdup_nul]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev_issue_flush has 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blkdev_issue_flush(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_ISSUE_FLUSH_2_PARAM, 1, + [blkdev_issue_flush has 2 params]) + 
],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/sock.h has sock_no_linger]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sock_no_linger(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SOCK_NO_LINGER, 1, + [net/sock.h has sock_no_linger]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/sock.h has sock_set_priority]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sock_set_priority(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SOCK_SET_PRIORITY, 1, + [net/sock.h has sock_set_priority]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/sock.h has sock_set_reuseaddr]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sock_set_reuseaddr(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SOCK_SET_REUSEADDR, 1, + [net/sock.h has sock_set_reuseaddr]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/net.h has sendpage_ok]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sendpage_ok(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SENDPAGE_OK, 1, + [linux/net.h has sendpage_ok]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/page_ref.h has page_count]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + page_count(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PAGE_COUNT, 1, + [linux/page_ref.h has page_count]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if ptp_find_pin_unlocked is defined]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + ptp_find_pin_unlocked(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PTP_FIND_PIN_UNLOCK, 1, + [ptp_find_pin_unlocked is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/xfrm.h has XFRM_OFFLOAD_FULL]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int a = XFRM_OFFLOAD_FULL; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XFRM_OFFLOAD_FULL, 1, + [XFRM_OFFLOAD_FULL is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct xfrm_offload has inner_ipproto]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct xfrm_offload xo = { + .inner_ipproto = 4, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_XFRM_OFFLOAD_INNER_IPPROTO, 1, + [struct xfrm_offload has inner_ipproto]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has bd_set_nr_sectors]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bd_set_nr_sectors(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BD_SET_NR_SECTORS, 1, + [genhd.h has bd_set_nr_sectors]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has QUEUE_FLAG_STABLE_WRITES]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = QUEUE_FLAG_STABLE_WRITES; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_QUEUE_FLAG_STABLE_WRITES, 1, + [QUEUE_FLAG_STABLE_WRITES is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has revalidate_disk_size]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + revalidate_disk_size(NULL, false); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REVALIDATE_DISK_SIZE, 1, + [genhd.h has revalidate_disk_size]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if fs.h has inode_lock]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + inode_lock(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_INODE_LOCK, 1, + [fs.h has inode_lock]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if 
linux/blk-mq.h has blk_mq_set_request_complete]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_set_request_complete(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_SET_REQUEST_COMPLETE, 1, + [linux/blk-mq.h has blk_mq_set_request_complete]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blkdev.h has blk_alloc_queue_rh]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_alloc_queue_rh(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_RH, 1, + [linux/blkdev.h has blk_alloc_queue_rh]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h struct request has block_device]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct block_device *bdev = NULL; + struct request rq = { .part = bdev }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQUEST_BDEV, 1, + [blkdev.h struct request has block_device]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev_issue_flush has 1 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blkdev_issue_flush(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLKDEV_ISSUE_FLUSH_1_PARAM, 1, + [blkdev_issue_flush has 1 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h has bio_max_segs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bio_max_segs(0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_MAX_SEGS, 1, + [if bio.h has bio_max_segs]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if trace_block_bio_remap has 4 param]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + trace_block_bio_remap(NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_TRACE_BLOCK_BIO_REMAP_4_PARAM, 1, + [trace_block_bio_remap has 4 param]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has bd_set_size]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bd_set_size(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BD_SET_SIZE, 1, + [genhd.h has bd_set_size]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_execute_rq_nowait has 5 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_execute_rq_nowait(NULL, NULL, NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_EXECUTE_RQ_NOWAIT_5_PARAM, 1, + [blk_execute_rq_nowait has 5 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_execute_rq_nowait has 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_execute_rq_nowait(NULL, 0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_EXECUTE_RQ_NOWAIT_3_PARAM, 1, + [blk_execute_rq_nowait has 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_execute_rq_nowait has 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_execute_rq_nowait(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_EXECUTE_RQ_NOWAIT_2_PARAM, 1, + [blk_execute_rq_nowait has 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_execute_rq has 4 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_execute_rq(NULL, NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_EXECUTE_RQ_4_PARAM, 1, + [blk_execute_rq has 4 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct enum has member BIO_REMAPPED]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int tmp = BIO_REMAPPED; + + return 0; + ],[ + 
AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ENUM_BIO_REMAPPED, 1, + [struct enum has member BIO_REMAPPED]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct pci_driver has member sriov_get_vf_total_msix/sriov_set_msix_vec_count]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct pci_driver core_driver = { + .sriov_get_vf_total_msix = NULL, + .sriov_set_msix_vec_count = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SRIOV_GET_SET_MSIX_VEC_COUNT, 1, + [struct pci_driver has member sriov_get_vf_total_msix/sriov_set_msix_vec_count]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if configfs.h has configfs_register_group]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + configfs_register_group(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_CONFIGFS_REGISTER_GROUP, 1, + [configfs.h has configfs_register_group]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct bio has member bi_bdev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio b = { + .bi_bdev = NULL, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_BI_BDEV, 1, + [struct bio has member bi_bdev]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_rq_append_bio]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + struct bio *bio; + + blk_rq_append_bio(NULL, bio); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_RQ_APPEND_BIO_POINTER, 1, + [blk_rq_append_bio is defined with struct *bio]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has bdev_nr_sectors]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bdev_nr_sectors(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BDEV_NR_SECTORS, 1, + [genhd.h has bdev_nr_sectors]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if BLK_STS_ZONE_ACTIVE_RESOURCE is defined in blk_types]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_status_t error = BLK_STS_ZONE_ACTIVE_RESOURCE; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_BLK_STS_ZONE_ACTIVE_RESOURCE, 1, + [blk_types.h has BLK_STS_ZONE_ACTIVE_RESOURCE]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_queue_update_readahead]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_update_readahead(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_UPDATE_READAHEAD, 1, + [blk_queue_update_readahead is defined in blkdev.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if dma-mapping.h has dma_set_min_align_mask]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + dma_set_min_align_mask(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DMA_SET_MIN_ALIGN_MASK, 1, + [dma_set_min_align_mask is defined in dma-mapping]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h has bio_for_each_bvec]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio *bio; + struct bvec_iter bi; + struct bio_vec bv; + + bio_for_each_bvec(bv, bio, bi); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_FOR_EACH_BVEC, 1, + [bio_for_each_bvec is defined in bio.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if have linux/io-64-nonatomic-hi-lo.h]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_IO_64_NONATOMIC_HI_LO_H, 1, + [can include linux/io-64-nonatomic-hi-lo.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-mq.h has 
blk_mq_hctx_set_fq_lock_class]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_hctx_set_fq_lock_class(NULL, NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_HCTX_SET_FQ_LOCK_CLASS, 1, + [blk-mq.h has blk_mq_hctx_set_fq_lock_class]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h has BIO_MAX_VECS]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = BIO_MAX_VECS; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_MAX_VECS, 1, + [if bio.h has BIO_MAX_VECS]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-mq.h has blk_rq_bio_prep]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_rq_bio_prep(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_RQ_BIO_PREP, 1, + [if blk-mq.h has blk_rq_bio_prep]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has blk_alloc_disk]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_alloc_disk(0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_ALLOC_DISK, 1, + [genhd.h has blk_alloc_disk]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if asm-generic/unaligned.h has put_unaligned_le24]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + put_unaligned_le24(0, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PUT_UNALIGNED_LE24_ASM_GENERIC, 1, + [put_unaligned_le24 existing in asm-generic/unaligned.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has GENHD_FL_UP]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = GENHD_FL_UP; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GENHD_FL_UP, 1, + [genhd.h has GENHD_FL_UP]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_alloc_disk]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_alloc_disk(NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_ALLOC_DISK, 1, + [blk_mq_alloc_disk is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_mq_ops has poll 2 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static int nvme_poll(struct blk_mq_hw_ctx *hctx, + struct io_comp_batch *iob) { + return 0; + } + ],[ + struct blk_mq_ops ops = { + .poll = nvme_poll, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_POLL_2_ARG, 1, + [struct blk_mq_ops has poll 2 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-integrity.h exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_INTEGRITY_H, 1, + [linux/blk-integrity.h exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct bio has member bi_cookie]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct bio b = { + .bi_cookie = 0, + }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_BI_COOKIE, 1, + [struct bio has member bi_cookie]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has device_add_disk retrun]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int ret = device_add_disk(NULL, NULL, NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DEVICE_ADD_DISK_RETURN, 1, + [genhd.h has device_add_disk retrun]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/fs.h has struct kiocb ki_complete 2 args]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + + static void func(struct kiocb *iocb, long ret) { + return; + } + ],[ + struct kiocb x = { + .ki_complete = func, + }; 
+ + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FS_KIOCB_KI_COMPLETE_2_ARG, 1, + [linux/fs.h has struct kiocb ki_complete 2 args]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_execute_rq has 2 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_execute_rq(NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_EXECUTE_RQ_2_PARAM, 1, + [blk_execute_rq has 2 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if genhd.h has GENHD_FL_EXT_DEVT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = GENHD_FL_EXT_DEVT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GENHD_FL_EXT_DEVT, 1, + [genhd.h has GENHD_FL_EXT_DEVT]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-mq.h struct request has rq_disk]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct request rq = { .rq_disk = NULL }; + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_REQ_RQ_DISK, 1, + [blkdev.h struct request has rq_disk]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct blk_mq_ops has queue_rqs]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct blk_mq_ops ops = { + .queue_rqs = NULL, + }; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_QUEUE_RQS, 1, + [struct blk_mq_ops has queue_rqs]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bdev_nr_bytes exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bdev_nr_bytes(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BDEV_NR_BYTES, 1, + [bdev_nr_bytes exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if pci_ids.h has PCI_VENDOR_ID_REDHAT]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = PCI_VENDOR_ID_REDHAT; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PCI_VENDOR_ID_REDHAT, 1, + [PCI_VENDOR_ID_REDHAT is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if acpi_storage_d3 exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + acpi_storage_d3(NULL); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_ACPI_STORAGE_D3, 1, + [acpi_storage_d3 exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/moduleparam.h has param_set_uint_minmax]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + param_set_uint_minmax(NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PARAM_SET_UINT_MINMAX, 1, + [linux/moduleparam.h has param_set_uint_minmax]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_wait_quiesce_done]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_wait_quiesce_done(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_WAIT_QUIESCE_DONE, 1, + [blk_mq_wait_quiesce_done is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if timeout from struct blk_mq_ops has 1 param]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + + static enum blk_eh_timer_return + timeout_dummy(struct request *req) { + return 0; + } + ],[ + struct blk_mq_ops ops_dummy; + + ops_dummy.timeout = timeout_dummy; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_OPS_TIMEOUT_1_PARAM, 1, + [timeout from struct blk_mq_ops has 1 param]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/blk-mq.h has blk_mq_destroy_queue]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_mq_destroy_queue(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_MQ_DESTROY_QUEUE, 1, + 
[blk_mq_destroy_queue is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk_execute_rq has 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + blk_status_t x = blk_execute_rq(NULL, NULL, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_EXECUTE_RQ_3_PARAM, 1, + [blk_execute_rq has 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if disk_uevent exist]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + disk_uevent(NULL, 0); + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DISK_UEVENT, 1, + [disk_uevent exist]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blk-cgroup.h has FC_APPID_LEN]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = FC_APPID_LEN; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FC_APPID_LEN, 1, + [FC_APPID_LEN is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/bvec.h has bvec_virt]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + #include + ],[ + bvec_virt(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BVEC_VIRT, 1, + [linux/bvec.h has bvec_virt]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/sock.h has sock_setsockopt sockptr_t]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + sockptr_t optval = {}; + + sock_setsockopt(NULL, 0, 0, optval, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_SOCK_SETOPTVAL_SOCKPTR_T, 1, + [net/sock.h has sock_setsockopt sockptr_t]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h blk_next_bio has 3 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_next_bio(NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_NEXT_BIO_3_PARAMS, 1, + [bio.h blk_next_bio has 3 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if disk_update_readahead exists]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + disk_update_readahead(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_DISK_UPDATE_READAHEAD, 1, + [disk_update_readahead exists]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if linux/vmalloc.h has __vmalloc 3 params]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + __vmalloc(0, 0, PAGE_KERNEL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_VMALLOC_3_PARAM, 1, + [linux/vmalloc.h has __vmalloc 3 params]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h bio_init has 5 parameters]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bio_init(NULL, NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_INIT_5_PARAMS, 1, + [bio.h bio_init has 5 parameters]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if bio.h has bio_add_zone_append_page]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bio_add_zone_append_page(NULL, NULL, 0, 0); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BIO_ADD_ZONE_APPEND_PAGE, 1, + [bio.h has bio_add_zone_append_page]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_cleanup_disk()]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct gendisk *disk; + + blk_cleanup_disk(disk); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_CLEANUP_DISK, 1, + [blk_cleanup_disk() is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has QUEUE_FLAG_DISCARD]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = QUEUE_FLAG_DISCARD; + + return 0; + ],[ + AC_MSG_RESULT(yes) + 
MLNX_AC_DEFINE(HAVE_QUEUE_FLAG_DISCARD, 1, + [QUEUE_FLAG_DISCARD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if struct gendisk has conv_zones_bitmap]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct gendisk disk; + + disk.conv_zones_bitmap = NULL; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_GENDISK_CONV_ZONES_BITMAP, 1, + [struct gendisk has conv_zones_bitmap]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has bdev_nr_zones]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + bdev_nr_zones(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BDEV_NR_ZONES, 1, + [blkdev.h has bdev_nr_zones]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if blkdev.h has blk_queue_zone_sectors]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + blk_queue_zone_sectors(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_BLK_QUEUE_ZONE_SECTORS, 1, + [blkdev.h has blk_queue_zone_sectors]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if uapi/linux/ptp_clock.h has PTP_PEROUT_DUTY_CYCLE]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = PTP_PEROUT_DUTY_CYCLE; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_PTP_PEROUT_DUTY_CYCLE, 1, + [PTP_PEROUT_DUTY_CYCLE is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/dst_metadata.h has struct macsec_info]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + struct macsec_info info = {}; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_STRUCT_MACSEC_INFO_METADATA, 1, + [net/dst_metadata.h has struct macsec_info]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/macsec.c has function macsec_get_real_dev]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + macsec_get_real_dev(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FUNC_MACSEC_GET_REAL_DEV, 1, + [net/macsec.c has function macsec_get_real_dev]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if flow_dissector.h has FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + int x = FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP; + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP, 1, + [FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP is defined]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([if net/macsec.c has function netdev_macsec_is_offloaded]) + MLNX_BG_LB_LINUX_TRY_COMPILE([ + #include + ],[ + netdev_macsec_is_offloaded(NULL); + + return 0; + ],[ + AC_MSG_RESULT(yes) + MLNX_AC_DEFINE(HAVE_FUNC_NETDEV_MACSEC_IS_OFFLOADED, 1, + [net/macsec.c has function netdev_macsec_is_offloaded]) + ],[ + AC_MSG_RESULT(no) + ]) +]) +# +# COMPAT_CONFIG_HEADERS +# +# add -include config.h +# +AC_DEFUN([COMPAT_CONFIG_HEADERS],[ +# +# Wait for remaining build tests running in background +# + wait +# +# Append confdefs.h files from CONFDEFS_H_DIR to the main confdefs.h file +# + /bin/cat CONFDEFS_H_DIR/confdefs.h.* >> confdefs.h + /bin/rm -rf CONFDEFS_H_DIR +# +# Generate the config.h header file +# + AC_CONFIG_HEADERS([config.h]) + EXTRA_KCFLAGS="-include $PWD/config.h $EXTRA_KCFLAGS" + AC_SUBST(EXTRA_KCFLAGS) +]) + +AC_DEFUN([MLNX_PROG_LINUX], +[ + +LB_LINUX_PATH +LB_LINUX_SYMVERFILE +LB_LINUX_CONFIG([MODULES],[],[ + AC_MSG_ERROR([module support is required to build mlnx kernel modules.]) +]) +LB_LINUX_CONFIG([MODVERSIONS]) +LB_LINUX_CONFIG([KALLSYMS],[],[ + AC_MSG_ERROR([compat_mlnx requires that CONFIG_KALLSYMS is enabled in your kernel.]) +]) + +LINUX_CONFIG_COMPAT 
+COMPAT_CONFIG_HEADERS + +]) diff --git a/src/mlnx-ofa_kernel-5.8/compat/config/warning_filter.sh b/src/mlnx-ofa_kernel-5.8/compat/config/warning_filter.sh new file mode 100755 index 0000000..1f83a32 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/config/warning_filter.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +log_file=$1 + +echo_err() +{ + echo -e "$@" 1>&2 +} + +regEx="(from incompatible pointer type|declared inside parameter list|is deprecated|expects argument of type|discards .const. qualifier|but argument is of type|discards qualifiers from pointer target type|discards ‘const’ qualifier|makes pointer from integer without a cast|not in enumerated type ‘const enum flow_action_id’)" + +cat $log_file 1>&2 + +if (grep -qE "$regEx" $log_file 2>/dev/null); then + echo_err "warning_filter.sh: treating warnings as errors!" + grep -E "$regEx" $log_file 1>&2 + exit 1 +fi + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/compat/configure.ac b/src/mlnx-ofa_kernel-5.8/compat/configure.ac new file mode 100644 index 0000000..26183b1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/configure.ac @@ -0,0 +1,56 @@ +AC_PREREQ([2.57]) +AC_INIT([compat_mlnx], 2.3, [http://support.mellanox.com/SupportWeb/service_center/SelfService], [compat_mlnx]) + +AC_CONFIG_AUX_DIR([config]) +AC_CONFIG_MACRO_DIR([config]) + +AC_CANONICAL_SYSTEM + +AM_INIT_AUTOMAKE([foreign]) +AC_CONFIG_FILES([Makefile]) + +AC_PROG_CC + +AM_PROG_AS + +AC_CHECK_TOOLS(AR, ar) + +LB_PROG_CC + +AC_ARG_WITH(njobs, + AS_HELP_STRING([--with-njobs=N],[Allow N jobs at once; jobs as number of CPUs with no arg.]), + [ + NJOBS="$withval" + case "X${NJOBS}" in + X | X[A-Za-z]*) + NJOBS=$(MLXNUMC=$(grep ^processor /proc/cpuinfo | wc -l) && echo $(($MLXNUMC<16?$MLXNUMC:16))) + ;; + esac + ], + NJOBS=1 +) + +MLNX_PROG_LINUX +LB_CONDITIONALS + +# +# cleanup auto-generated defines +# +sed -i '/\/d' $PWD/config.h +sed -i '/\/d' $PWD/config.h +sed -i '/\/d' $PWD/config.h +sed -i '/\/d' $PWD/config.h +sed -i '/\/d' $PWD/config.h +sed -i '/\/d' $PWD/config.h +sed -i '/\/d' $PWD/config.h +sed -i '/\/d' $PWD/config.h + +# +cat << 'END_CONFIG' >> $PWD/config.h + +/* Make sure LINUX_BACKPORT macro is defined for all external users */ +#ifndef LINUX_BACKPORT +#define LINUX_BACKPORT(__sym) backport_ ##__sym +#endif + +END_CONFIG diff --git a/src/mlnx-ofa_kernel-5.8/compat/cordic.c b/src/mlnx-ofa_kernel-5.8/compat/cordic.c new file mode 100644 index 0000000..faef265 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/cordic.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2011 Broadcom Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include + +#define CORDIC_ANGLE_GEN 39797 +#define CORDIC_PRECISION_SHIFT 16 +#define CORDIC_NUM_ITER (CORDIC_PRECISION_SHIFT + 2) + +#define FIXED(X) ((s32)((X) << CORDIC_PRECISION_SHIFT)) +#define FLOAT(X) (((X) >= 0) \ + ? 
((((X) >> (CORDIC_PRECISION_SHIFT - 1)) + 1) >> 1) \ + : -((((-(X)) >> (CORDIC_PRECISION_SHIFT - 1)) + 1) >> 1)) + +static const s32 arctan_table[] = { + 2949120, + 1740967, + 919879, + 466945, + 234379, + 117304, + 58666, + 29335, + 14668, + 7334, + 3667, + 1833, + 917, + 458, + 229, + 115, + 57, + 29 +}; + +/* + * cordic_calc_iq() - calculates the i/q coordinate for given angle + * + * theta: angle in degrees for which i/q coordinate is to be calculated + * coord: function output parameter holding the i/q coordinate + */ +#define cordic_calc_iq LINUX_BACKPORT(cordic_calc_iq) +struct cordic_iq cordic_calc_iq(s32 theta) +{ + struct cordic_iq coord; + s32 angle, valtmp; + unsigned iter; + int signx = 1; + int signtheta; + + coord.i = CORDIC_ANGLE_GEN; + coord.q = 0; + angle = 0; + + theta = FIXED(theta); + signtheta = (theta < 0) ? -1 : 1; + theta = ((theta + FIXED(180) * signtheta) % FIXED(360)) - + FIXED(180) * signtheta; + + if (FLOAT(theta) > 90) { + theta -= FIXED(180); + signx = -1; + } else if (FLOAT(theta) < -90) { + theta += FIXED(180); + signx = -1; + } + + for (iter = 0; iter < CORDIC_NUM_ITER; iter++) { + if (theta > angle) { + valtmp = coord.i - (coord.q >> iter); + coord.q += (coord.i >> iter); + angle += arctan_table[iter]; + } else { + valtmp = coord.i + (coord.q >> iter); + coord.q -= (coord.i >> iter); + angle -= arctan_table[iter]; + } + coord.i = valtmp; + } + + coord.i *= signx; + coord.q *= signx; + return coord; +} +EXPORT_SYMBOL_GPL(cordic_calc_iq); + +MODULE_DESCRIPTION("Cordic functions"); +MODULE_AUTHOR("Broadcom Corporation"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/crc8.c b/src/mlnx-ofa_kernel-5.8/compat/crc8.c new file mode 100644 index 0000000..6a26d17 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/crc8.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2011 Broadcom Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +/* + * crc8_populate_msb - fill crc table for given polynomial in reverse bit order. + * + * table: table to be filled. + * polynomial: polynomial for which table is to be filled. + */ +#define crc8_populate_msb LINUX_BACKPORT(crc8_populate_msb) +void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) +{ + int i, j; + const u8 msbit = 0x80; + u8 t = msbit; + + table[0] = 0; + + for (i = 1; i < CRC8_TABLE_SIZE; i *= 2) { + t = (t << 1) ^ (t & msbit ? polynomial : 0); + for (j = 0; j < i; j++) + table[i+j] = table[j] ^ t; + } +} +EXPORT_SYMBOL_GPL(crc8_populate_msb); + +/* + * crc8_populate_lsb - fill crc table for given polynomial in regular bit order. + * + * table: table to be filled. + * polynomial: polynomial for which table is to be filled. 
+ */ +#define crc8_populate_lsb LINUX_BACKPORT(crc8_populate_lsb) +void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial) +{ + int i, j; + u8 t = 1; + + table[0] = 0; + + for (i = (CRC8_TABLE_SIZE >> 1); i; i >>= 1) { + t = (t >> 1) ^ (t & 1 ? polynomial : 0); + for (j = 0; j < CRC8_TABLE_SIZE; j += 2*i) + table[i+j] = table[j] ^ t; + } +} +EXPORT_SYMBOL_GPL(crc8_populate_lsb); + +/* + * crc8 - calculate a crc8 over the given input data. + * + * table: crc table used for calculation. + * pdata: pointer to data buffer. + * nbytes: number of bytes in data buffer. + * crc: previous returned crc8 value. + */ +#define crc8 LINUX_BACKPORT(crc8) +u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc) +{ + /* loop over the buffer data */ + while (nbytes-- > 0) + crc = table[(crc ^ *pdata++) & 0xff]; + + return crc; +} +EXPORT_SYMBOL_GPL(crc8); + +MODULE_DESCRIPTION("CRC8 (by Williams, Ross N.) function"); +MODULE_AUTHOR("Broadcom Corporation"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/dim.c b/src/mlnx-ofa_kernel-5.8/compat/dim.c new file mode 100644 index 0000000..38045d6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/dim.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. + */ + +#include + +bool dim_on_top(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + return true; + case DIM_GOING_RIGHT: + return (dim->steps_left > 1) && (dim->steps_right == 1); + default: /* DIM_GOING_LEFT */ + return (dim->steps_right > 1) && (dim->steps_left == 1); + } +} +EXPORT_SYMBOL(dim_on_top); + +void dim_turn(struct dim *dim) +{ + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + dim->tune_state = DIM_GOING_LEFT; + dim->steps_left = 0; + break; + case DIM_GOING_LEFT: + dim->tune_state = DIM_GOING_RIGHT; + dim->steps_right = 0; + break; + } +} +EXPORT_SYMBOL(dim_turn); + +void dim_park_on_top(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tired = 0; + dim->tune_state = DIM_PARKING_ON_TOP; +} +EXPORT_SYMBOL(dim_park_on_top); + +void dim_park_tired(struct dim *dim) +{ + dim->steps_right = 0; + dim->steps_left = 0; + dim->tune_state = DIM_PARKING_TIRED; +} +EXPORT_SYMBOL(dim_park_tired); + +void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, + struct dim_stats *curr_stats) +{ + /* u32 holds up to 71 minutes, should be enough */ + u32 delta_us = ktime_us_delta(end->time, start->time); + u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr); + u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr, + start->byte_ctr); + u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr, + start->comp_ctr); + + if (!delta_us) + return; + + curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); + curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); + curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC, + delta_us); + curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us); + if (curr_stats->epms != 0) + curr_stats->cpe_ratio = DIV_ROUND_DOWN_ULL( + curr_stats->cpms * 100, curr_stats->epms); + else + curr_stats->cpe_ratio = 0; + +} +EXPORT_SYMBOL(dim_calc_stats); diff --git a/src/mlnx-ofa_kernel-5.8/compat/exec.c b/src/mlnx-ofa_kernel-5.8/compat/exec.c new file mode 100644 index 
0000000..803db38 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/exec.c @@ -0,0 +1,15 @@ +#if !defined(HAVE___GET_TASK_COMM_EXPORTED) && !defined(HAVE_GET_TASK_COMM_EXPORTED) + +#include +#include + +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) +{ + task_lock(tsk); + strncpy(buf, tsk->comm, buf_size); + task_unlock(tsk); + return buf; +} +EXPORT_SYMBOL_GPL(__get_task_comm); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/file.c b/src/mlnx-ofa_kernel-5.8/compat/file.c new file mode 100644 index 0000000..28c7dbe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/file.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/sysfs/file.c - sysfs regular (text) file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007 Tejun Heo + * + * Please see Documentation/filesystems/sysfs.rst for more information. + */ + +#include +#include +#include +#include +#include +#include +#include + +/** + * sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer. + * @buf: start of PAGE_SIZE buffer. + * @fmt: format + * @...: optional arguments to @format + * + * + * Returns number of characters written to @buf. + */ +#ifndef HAVE_SYSFS_EMIT +#ifdef CONFIG_SYSFS +int sysfs_emit(char *buf, const char *fmt, ...) +{ + va_list args; + int len; + + if (WARN(!buf || offset_in_page(buf), + "invalid sysfs_emit: buf:%p\n", buf)) + return 0; + + va_start(args, fmt); + len = vscnprintf(buf, PAGE_SIZE, fmt, args); + va_end(args); + + return len; +} +EXPORT_SYMBOL_GPL(sysfs_emit); +#endif +#endif + diff --git a/src/mlnx-ofa_kernel-5.8/compat/flow_dissector.c b/src/mlnx-ofa_kernel-5.8/compat/flow_dissector.c new file mode 100644 index 0000000..cc10c8d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/flow_dissector.c @@ -0,0 +1,1483 @@ +#ifdef CONFIG_COMPAT_FLOW_DISSECTOR + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __GRE_SUPPORT__ +#include +#endif +#ifdef __PPTP_SUPPORT__ +#include +#endif +#ifdef __TIPC_SUPPORT__ +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef IP6_OFFSET +#define IP6_OFFSET 0xFFF8 +#endif + +#ifndef MPLS_LABEL_ENTROPY +#define MPLS_LABEL_ENTROPY 7 /* RFC6790 */ +#endif + +#ifndef IPPROTO_MPLS +#define IPPROTO_MPLS 137 +#endif + +#ifdef HAVE_NET_FLOW_KEYS_H +#define flow_keys LINUX_BACKPORT(flow_keys) +#define flow_hash_from_keys LINUX_BACKPORT(flow_hash_from_keys) +#endif + +static inline bool eth_type_vlan_bp(__be16 ethertype) +{ + switch (ethertype) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + return true; + default: + return false; + } +} +#ifndef CONFIG_NET_SCHED_NEW +bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct flow_keys *flow, + unsigned int flags) +{ + memset(flow, 0, sizeof(*flow)); + return __skb_flow_dissect(skb, &flow_keys_dissector, flow, + NULL, 0, 0, 0, flags); +} +EXPORT_SYMBOL(skb_flow_dissect_flow_keys); +#endif + +#ifndef HAVE_SKB_FLOW_DISSECT +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + flow_dissector->used_keys |= (1 << key_id); +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count) +{ + unsigned int i; + + memset(flow_dissector, 0, sizeof(*flow_dissector)); + + for (i = 0; i < key_count; i++, 
key++) { + /* User should make sure that every key target offset is withing + * boundaries of unsigned short. + */ + BUG_ON(key->offset > USHRT_MAX); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); + + dissector_set_key(flow_dissector, key->key_id); + flow_dissector->offset[key->key_id] = key->offset; + } + + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. + */ + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); +} +EXPORT_SYMBOL(skb_flow_dissector_init); + +/** + * skb_flow_get_be16 - extract be16 entity + * @skb: sk_buff to extract from + * @poff: offset to extract at + * @data: raw buffer pointer to the packet + * @hlen: packet header length + * + * The function will try to retrieve a be32 entity at + * offset poff + */ +static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, + void *data, int hlen) +{ + __be16 *u, _u; + + u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); + if (u) + return *u; + + return 0; +} + +/** + * __skb_flow_get_ports - extract the upper layer ports and return them + * @skb: sk_buff to extract the ports from + * @thoff: transport header offset + * @ip_proto: protocol for which to get port offset + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve the ports at offset thoff + poff where poff + * is the protocol port offset returned from proto_ports_offset + */ +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen) +{ + int poff = proto_ports_offset(ip_proto); + + if (!data) { + data = skb->data; + hlen = skb_headlen(skb); + } + + if (poff >= 0) { + __be32 *ports, _ports; + + ports = __skb_header_pointer(skb, thoff + poff, + sizeof(_ports), data, hlen, &_ports); + if (ports) + return *ports; + } + + return 0; +} +EXPORT_SYMBOL(__skb_flow_get_ports); + +#ifdef HAVE_IP_TUNNEL_INFO +static void +skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *ctrl; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) + return; + + ctrl = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + target_container); + ctrl->addr_type = type; +} +static void +__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct ip_tunnel_info *info; + struct ip_tunnel_key *key; + + /* A quick check to see if there might be something to do. 
*/ + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP)) + return; + + info = skb_tunnel_info((struct sk_buff *)skb); + if (!info) + return; + + key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *ipv4; + + ipv4 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target_container); + ipv4->src = key->u.ipv4.src; + ipv4->dst = key->u.ipv4.dst; + } + break; + case AF_INET6: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *ipv6; + + ipv6 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + target_container); + ipv6->src = key->u.ipv6.src; + ipv6->dst = key->u.ipv6.dst; + } + break; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *keyid; + + keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target_container); + keyid->keyid = tunnel_id_to_key32(key->tun_id); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *tp; + + tp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + target_container); + tp->src = key->tp_src; + tp->dst = key->tp_dst; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_dissector_key_ip *ip; + + ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP, + target_container); + ip->tos = key->tos; + ip->ttl = key->ttl; + } + +} +#endif /* HAVE_IP_TUNNEL_INFO */ + +static enum flow_dissect_ret +__skb_flow_dissect_mpls(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, int hlen) +{ + struct flow_dissector_key_keyid *key_keyid; + struct mpls_label *hdr, _hdr[2]; + u32 entry, label; + + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && + !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + entry = ntohl(hdr[0].entry); + label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { + struct flow_dissector_key_mpls *key_mpls; + + key_mpls = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS, + target_container); + key_mpls->mpls_label = label; + key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) + >> MPLS_LS_TTL_SHIFT; + key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) + >> MPLS_LS_TC_SHIFT; + key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) + >> MPLS_LS_S_SHIFT; + } + + if (label == MPLS_LABEL_ENTROPY) { + key_keyid = 
skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + } + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_arp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, int hlen) +{ + struct flow_dissector_key_arp *key_arp; + struct { + unsigned char ar_sha[ETH_ALEN]; + unsigned char ar_sip[4]; + unsigned char ar_tha[ETH_ALEN]; + unsigned char ar_tip[4]; + } *arp_eth, _arp_eth; + const struct arphdr *arp; + struct arphdr _arp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) + return FLOW_DISSECT_RET_OUT_GOOD; + + arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, + hlen, &_arp); + if (!arp) + return FLOW_DISSECT_RET_OUT_BAD; + + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_hln != ETH_ALEN || + arp->ar_pln != 4 || + (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST))) + return FLOW_DISSECT_RET_OUT_BAD; + + arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), + sizeof(_arp_eth), data, + hlen, &_arp_eth); + if (!arp_eth) + return FLOW_DISSECT_RET_OUT_BAD; + + key_arp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ARP, + target_container); + + memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); + memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); + + /* Only store the lower byte of the opcode; + * this covers ARPOP_REPLY and ARPOP_REQUEST. + */ + key_arp->op = ntohs(arp->ar_op) & 0xff; + + ether_addr_copy(key_arp->sha, arp_eth->ar_sha); + ether_addr_copy(key_arp->tha, arp_eth->ar_tha); + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +#ifdef __GRE_SUPPORT__ +static enum flow_dissect_ret +__skb_flow_dissect_gre(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + struct flow_dissector *flow_dissector, + void *target_container, void *data, + __be16 *p_proto, int *p_nhoff, int *p_hlen, + unsigned int flags) +{ + struct flow_dissector_key_keyid *key_keyid; + struct gre_base_hdr *hdr, _hdr; + int offset = 0; + u16 gre_ver; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), + data, *p_hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) + return FLOW_DISSECT_RET_OUT_GOOD; + + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + return FLOW_DISSECT_RET_OUT_GOOD; + + *p_proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + return FLOW_DISSECT_RET_OUT_GOOD; + } + + offset += sizeof(struct gre_base_hdr); + + if (hdr->flags & GRE_CSUM) + offset += sizeof(((struct gre_full_hdr *) 0)->csum) + + sizeof(((struct gre_full_hdr *) 0)->reserved1); + + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_keyid), + data, *p_hlen, &_keyid); + if (!keyid) + return FLOW_DISSECT_RET_OUT_BAD; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; + } + offset += sizeof(((struct gre_full_hdr *) 
0)->key); + } + + if (hdr->flags & GRE_SEQ) + offset += sizeof(((struct pptp_gre_header *) 0)->seq); + + if (gre_ver == 0) { + if (*p_proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_eth), + data, *p_hlen, &_eth); + if (!eth) + return FLOW_DISSECT_RET_OUT_BAD; + *p_proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + *p_hlen = *p_nhoff + offset; + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof(((struct pptp_gre_header *) 0)->ack); + + ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_ppp_hdr), + data, *p_hlen, _ppp_hdr); + if (!ppp_hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + *p_proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + *p_proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; + } + + *p_nhoff += offset; + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} +#endif + +static void +__skb_flow_dissect_tcp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int thoff, int hlen) +{ + struct flow_dissector_key_tcp *key_tcp; + struct tcphdr *th, _th; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) + return; + + th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); + if (!th) + return; + + if (unlikely(th->doff * 4 < sizeof(_th))) + return; + + key_tcp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TCP, + target_container); + key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); +} + +static void +__skb_flow_dissect_ipv4(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, const struct iphdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = iph->tos; + key_ip->ttl = iph->ttl; +} + +static void +__skb_flow_dissect_ipv6(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, const struct ipv6hdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = ipv6_get_dsfield(iph); + key_ip->ttl = iph->hop_limit; +} + +/* Maximum number of protocol headers that can be parsed in + * __skb_flow_dissect + */ +#define MAX_FLOW_DISSECT_HDRS 15 + +static bool skb_flow_dissect_allowed(int *num_hdrs) +{ + ++*num_hdrs; + + return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); +} + +/** + * __skb_flow_dissect - extract the flow_keys struct and return it + * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into + * @data: raw buffer pointer to the packet, if NULL use skb->data + * 
@proto: protocol for which to get the flow, if @data is NULL use skb->protocol + * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) + * @hlen: packet header length, if @data is NULL use skb_headlen(skb) + * + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. + */ +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags) +{ + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_icmp *key_icmp; + struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_vlan *key_vlan; + enum flow_dissect_ret fdret; + bool skip_vlan = false; + int num_hdrs = 0; + u8 ip_proto = 0; + bool ret; + + if (!data) { + data = skb->data; + proto = skb_vlan_tag_present(skb) ? + skb->vlan_proto : skb->protocol; + nhoff = skb_network_offset(skb); + hlen = skb_headlen(skb); +#if IS_ENABLED(CONFIG_NET_DSA) + if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) { + const struct dsa_device_ops *ops; + int offset; + + ops = skb->dev->dsa_ptr->tag_ops; + if (ops->flow_dissect && + !ops->flow_dissect(skb, &proto, &offset)) { + hlen -= offset; + nhoff += offset; + } + } +#endif + } + + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. 
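+	 * (skb_flow_dissector_init() rejects dissectors that lack the CONTROL
+	 * or BASIC key, so no NULL check is needed on these two targets.)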
+ */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + +#ifdef HAVE_IP_TUNNEL_INFO + __skb_flow_dissect_tunnel_info(skb, flow_dissector, + target_container); +#endif + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); + struct flow_dissector_key_eth_addrs *key_eth_addrs; + + key_eth_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target_container); + memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); + } + +proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + + switch (proto) { + case htons(ETH_P_IP): { + const struct iphdr *iph; + struct iphdr _iph; + + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); + if (!iph || iph->ihl < 5) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += iph->ihl * 4; + + ip_proto = iph->protocol; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } + + if (ip_is_fragment(iph)) { + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + if (iph->frag_off & htons(IP_OFFSET)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } else { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (!(flags & + FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + } + } + + __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + break; + } + case htons(ETH_P_IPV6): { + const struct ipv6hdr *iph; + struct ipv6hdr _iph; + + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); + if (!iph) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + ip_proto = iph->nexthdr; + nhoff += sizeof(struct ipv6hdr); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + + memcpy(&key_addrs->v6addrs, &iph->saddr, + sizeof(key_addrs->v6addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_label); + } + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + } + + __skb_flow_dissect_ipv6(skb, flow_dissector, + target_container, data, iph); + + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + fdret = FLOW_DISSECT_RET_OUT_GOOD; + + break; + } + case htons(ETH_P_8021AD): + case htons(ETH_P_8021Q): { + const struct vlan_hdr *vlan; + struct vlan_hdr _vlan; + bool vlan_tag_present = skb && skb_vlan_tag_present(skb); + + if (vlan_tag_present) + proto = skb->protocol; + + if (!vlan_tag_present || eth_type_vlan_bp(skb->protocol)) { + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), + data, hlen, &_vlan); + if (!vlan) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + 
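/* A VLAN header is present in the packet data itself: take the
 * encapsulated protocol from it and step past the tag.
 */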
proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + if (skip_vlan) { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + } + + skip_vlan = true; + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLAN)) { + key_vlan = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLAN, + target_container); + + if (vlan_tag_present) { + key_vlan->vlan_id = skb_vlan_tag_get_id(skb); + key_vlan->vlan_priority = + (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + } else { + key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & + VLAN_VID_MASK; + key_vlan->vlan_priority = + (ntohs(vlan->h_vlan_TCI) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + case htons(ETH_P_PPP_SES): { + struct { + struct pppoe_hdr hdr; + __be16 proto; + } *hdr, _hdr; + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = hdr->proto; + nhoff += PPPOE_SES_HLEN; + switch (proto) { + case htons(PPP_IP): + proto = htons(ETH_P_IP); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + case htons(PPP_IPV6): + proto = htons(ETH_P_IPV6); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + default: + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + break; + } +#ifdef __TIPC_SUPPORT__ + case htons(ETH_P_TIPC): { + struct tipc_basic_hdr *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), + data, hlen, &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC, + target_container); + key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; + } + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } +#endif + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): + fdret = __skb_flow_dissect_mpls(skb, flow_dissector, + target_container, data, + nhoff, hlen); + break; + case htons(ETH_P_FCOE): + if ((hlen - nhoff) < FCOE_HEADER_LEN) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + nhoff += FCOE_HEADER_LEN; + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + fdret = __skb_flow_dissect_arp(skb, flow_dissector, + target_container, data, + nhoff, hlen); + break; + + default: + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + /* Process result of proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_PROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto proto_again; + goto out_good; + case FLOW_DISSECT_RET_CONTINUE: + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + break; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; + } + +ip_proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + + switch (ip_proto) { +#ifdef __GRE_SUPPORT__ + case IPPROTO_GRE: + fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, + target_container, data, + &proto, &nhoff, &hlen, flags); + break; +#endif + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 _opthdr[2], *opthdr; + + if (proto != htons(ETH_P_IPV6)) + break; + + opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), + data, hlen, &_opthdr); + if (!opthdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + ip_proto = opthdr[0]; + nhoff += (opthdr[1] + 1) << 3; + + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; + } + case NEXTHDR_FRAGMENT: { + struct frag_hdr 
_fh, *fh; + + if (proto != htons(ETH_P_IPV6)) + break; + + fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), + data, hlen, &_fh); + + if (!fh) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + + nhoff += sizeof(_fh); + ip_proto = fh->nexthdr; + + if (!(fh->frag_off & htons(IP6_OFFSET))) { + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; + } + } + + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + case IPPROTO_IPIP: + proto = htons(ETH_P_IP); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + case IPPROTO_IPV6: + proto = htons(ETH_P_IPV6); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + + case IPPROTO_MPLS: + proto = htons(ETH_P_MPLS_UC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + + case IPPROTO_TCP: + __skb_flow_dissect_tcp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + + default: + break; + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP)) { + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); + } + + /* Process result of IP proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_PROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto proto_again; + break; + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + if (skb_flow_dissect_allowed(&num_hdrs)) + goto ip_proto_again; + break; + case FLOW_DISSECT_RET_OUT_GOOD: + case FLOW_DISSECT_RET_CONTINUE: + break; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; + } + +out_good: + ret = true; + +out: + key_control->thoff = min_t(u16, nhoff, skb ? 
skb->len : hlen); + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + + return ret; + +out_bad: + ret = false; + goto out; +} +EXPORT_SYMBOL(__skb_flow_dissect); +#endif /* HAVE_SKB_FLOW_DISSECT */ + +static u32 hashrnd __read_mostly; +static __always_inline void __flow_hash_secret_init(void) +{ + net_get_random_once(&hashrnd, sizeof(hashrnd)); +} + +static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, + u32 keyval) +{ + return jhash2(words, length, keyval); +} + +static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) +{ + const void *p = flow; + + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); + return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); +} + +static inline size_t flow_keys_hash_length(const struct flow_keys *flow) +{ + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != + sizeof(*flow) - sizeof(flow->addrs)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; +#ifdef __TIPC_SUPPORT__ + case FLOW_DISSECTOR_KEY_TIPC: + diff -= sizeof(flow->addrs.tipckey); + break; +#endif + } + return (sizeof(*flow) - diff) / sizeof(u32); +} + +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); +#ifdef __TIPC_SUPPORT__ + case FLOW_DISSECTOR_KEY_TIPC: + return flow->addrs.tipckey.key; +#endif + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); + +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_dst); + +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + addr_diff = (__force u32)keys->addrs.v4addrs.dst - + (__force u32)keys->addrs.v4addrs.src; + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + addr_diff = memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + sizeof(keys->addrs.v6addrs.dst)); + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + swap(keys->ports.src, keys->ports.dst); + } + break; + } +} + +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) +{ + u32 hash; + + __flow_hash_consistentify(keys); + + hash = __flow_hash_words(flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); + if (!hash) + hash = 1; + + return hash; +} + +u32 flow_hash_from_keys(struct flow_keys *keys) +{ + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, hashrnd); +} +EXPORT_SYMBOL(flow_hash_from_keys); + 
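+/*
+ * Illustrative sketch of how the helpers above are normally combined:
+ * dissect the skb into a struct flow_keys and hash the result, e.g.
+ *
+ *	struct flow_keys keys;
+ *	u32 hash = 0;
+ *
+ *	if (skb_flow_dissect_flow_keys(skb, &keys, 0))
+ *		hash = flow_hash_from_keys(&keys);
+ *
+ * ___skb_get_hash() below follows the same pattern, but stops at the
+ * IPv6 flow label and takes the hash seed as an argument.
+ */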
+static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, u32 keyval) +{ + skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + return __flow_hash_from_keys(keys, keyval); +} + +struct _flow_keys_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 padding; + __be32 ports; + __be32 src; + __be32 dst; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow) +{ + struct _flow_keys_digest_data *data = + (struct _flow_keys_digest_data *)digest; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + memset(digest, 0, sizeof(*digest)); + + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; +} +EXPORT_SYMBOL(make_flow_keys_digest); + +static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; + +#ifndef CONFIG_NET_SCHED_NEW +u32 __skb_get_hash_symmetric(const struct sk_buff *skb) +{ + struct flow_keys keys; + + __flow_hash_secret_init(); + + memset(&keys, 0, sizeof(keys)); + __skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, + NULL, 0, 0, 0, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + return __flow_hash_from_keys(&keys, hashrnd); +} +EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric); + +/** + * __skb_get_hash: calculate a flow hash + * @skb: sk_buff to calculate flow hash from + * + * This function calculates a flow hash based on src/dst addresses + * and src/dst port numbers. Sets hash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_hash in skb + * if hash is a canonical 4-tuple hash over transport ports. + */ +void __skb_get_hash(struct sk_buff *skb) +{ + struct flow_keys keys; + u32 hash; + + __flow_hash_secret_init(); + + hash = ___skb_get_hash(skb, &keys, hashrnd); + +#ifdef HAVE_SKB_SWHASH + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); +#else + skb->l4_hash = flow_keys_have_l4(&keys); + skb->hash = hash; +#endif +} +EXPORT_SYMBOL(__skb_get_hash); + +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); + +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + keys->addrs.v4addrs.src = fl4->saddr; + keys->addrs.v4addrs.dst = fl4->daddr; + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys->ports.src = fl4->fl4_sport; + keys->ports.dst = fl4->fl4_dport; + keys->keyid.keyid = fl4->fl4_gre_key; + keys->basic.ip_proto = fl4->flowi4_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi4); + +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) +{ + struct flow_keys keys; + + return ___skb_get_hash(skb, &keys, perturb); +} +EXPORT_SYMBOL(skb_get_hash_perturb); +#endif + +#ifndef HAVE_SKB_FLOW_DISSECT +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int 
hlen) +{ + u32 poff = keys->control.thoff; + + /* skip L4 headers for fragments after the first */ + if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && + !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) + return poff; + + switch (keys->basic.ip_proto) { + case IPPROTO_TCP: { + /* access doff as u8 to avoid unaligned access */ + const u8 *doff; + u8 _doff; + + doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), + data, hlen, &_doff); + if (!doff) + return poff; + + poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + poff += sizeof(struct udphdr); + break; + /* For the rest, we do not really care about header + * extensions at this point for now. + */ + case IPPROTO_ICMP: + poff += sizeof(struct icmphdr); + break; + case IPPROTO_ICMPV6: + poff += sizeof(struct icmp6hdr); + break; + case IPPROTO_IGMP: + poff += sizeof(struct igmphdr); + break; + case IPPROTO_DCCP: + poff += sizeof(struct dccp_hdr); + break; + case IPPROTO_SCTP: + poff += sizeof(struct sctphdr); + break; + } + + return poff; +} + +/** + * skb_get_poff - get the offset to the payload + * @skb: sk_buff to get the payload offset from + * + * The function will get the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + + if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) + return 0; + + return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); +} +#endif + +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, +#ifdef __TIPC_SUPPORT__ + { + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), + }, +#endif + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, + { + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), + }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, +#ifdef __GRE_SUPPORT__ + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, +#endif +}; + +static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, +}; + +static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, 
basic), + }, +}; + +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); + +struct flow_dissector flow_keys_buf_dissector __read_mostly; + +int init_default_flow_dissectors(void) +{ + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_dissector_symmetric, + flow_keys_dissector_symmetric_keys, + ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); + skb_flow_dissector_init(&flow_keys_buf_dissector, + flow_keys_buf_dissector_keys, + ARRAY_SIZE(flow_keys_buf_dissector_keys)); + return 0; +} + +#endif /* CONFIG_COMPAT_FLOW_DISSECTOR */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/flow_offload.c b/src/mlnx-ofa_kernel-5.8/compat/flow_offload.c new file mode 100644 index 0000000..d9c8535 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/flow_offload.c @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ + const struct flow_match *__m = &(__rule)->match; \ + struct flow_dissector *__d = (__m)->dissector; \ + \ + (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ + (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ + +#ifndef HAVE_FLOW_RULE_MATCH_CVLAN +void flow_rule_match_basic(const struct flow_rule *rule, + struct flow_match_basic *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); +} +EXPORT_SYMBOL(flow_rule_match_basic); + +void flow_rule_match_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_control); + +void flow_rule_match_eth_addrs(const struct flow_rule *rule, + struct flow_match_eth_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_eth_addrs); + +#ifdef HAVE_FLOW_DISSECTOR_KEY_VLAN +void flow_rule_match_vlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_vlan); +#endif + +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv4_addrs); + +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv6_addrs); + +void flow_rule_match_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ +#ifdef HAVE_FLOW_DISSECTOR_KEY_IP + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out); +#endif +} +EXPORT_SYMBOL(flow_rule_match_ip); + +void flow_rule_match_cvlan(const struct flow_rule *rule, + struct flow_match_vlan *out) +{ +#ifdef HAVE_FLOW_DISSECTOR_KEY_CVLAN + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CVLAN, out); +#endif +} +EXPORT_SYMBOL(flow_rule_match_cvlan); + +void flow_rule_match_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_ports); + +void flow_rule_match_tcp(const struct flow_rule *rule, + struct flow_match_tcp *out) +{ +#ifdef HAVE_FLOW_DISSECTOR_KEY_TCP + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out); +#endif +} +EXPORT_SYMBOL(flow_rule_match_tcp); + +#ifdef 
HAVE_FLOW_DISSECTOR_KEY_ENC_KEYID +void flow_rule_match_icmp(const struct flow_rule *rule, + struct flow_match_icmp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out); +} +EXPORT_SYMBOL(flow_rule_match_icmp); +#endif + +void flow_rule_match_mpls(const struct flow_rule *rule, + struct flow_match_mpls *out) +{ +#ifdef HAVE_FLOW_DISSECTOR_KEY_MPLS + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out); +#endif +} +EXPORT_SYMBOL(flow_rule_match_mpls); + +#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_KEYID +void flow_rule_match_enc_control(const struct flow_rule *rule, + struct flow_match_control *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_control); + +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, + struct flow_match_ipv4_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs); + +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, + struct flow_match_ipv6_addrs *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs); +#endif /* HAVE_FLOW_DISSECTOR_KEY_ENC_KEYID */ + +void flow_rule_match_enc_ip(const struct flow_rule *rule, + struct flow_match_ip *out) +{ +#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_IP + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out); +#endif +} +EXPORT_SYMBOL(flow_rule_match_enc_ip); + +#ifdef HAVE_FLOW_DISSECTOR_KEY_ENC_KEYID +void flow_rule_match_enc_ports(const struct flow_rule *rule, + struct flow_match_ports *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ports); + +void flow_rule_match_enc_keyid(const struct flow_rule *rule, + struct flow_match_enc_keyid *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_keyid); +#endif /* HAVE_FLOW_DISSECTOR_KEY_ENC_KEYID */ + +void flow_rule_match_enc_opts(const struct flow_rule *rule, + struct flow_match_enc_opts *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_opts); + +void flow_rule_match_ct(const struct flow_rule *rule, + struct flow_match_ct *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out); +} +EXPORT_SYMBOL(flow_rule_match_ct); + +#endif /* HAVE_FLOW_RULE_MATCH_CVLAN */ + +#ifndef HAVE_FLOW_RULE_MATCH_META +void flow_rule_match_meta(const struct flow_rule *rule, + struct flow_match_meta *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out); +} +EXPORT_SYMBOL(flow_rule_match_meta); +#endif /* HAVE_FLOW_RULE_MATCH_META */ + +#ifndef HAVE_TC_SETUP_FLOW_ACTION +static void build_rule_match(struct tc_cls_flower_offload *f, + struct flow_match *match) +{ + match->dissector = f->dissector; + match->mask = f->mask; + match->key = f->key; +} + +static int build_rule_action(struct tcf_exts *exts, + struct flow_rule *rule) +{ + return tc_setup_flow_action(&rule->action, exts); +} + +struct flow_rule *__alloc_flow_rule(struct tcf_exts *exts, + void *priv, int size) +{ + struct flow_rule *rule; + int num_ent; + void *ret; + int err; + + if (!exts) { + pr_err_once("mlx5_core: %s: no exts\n", __func__); + return ERR_PTR(-EINVAL); + } + + num_ent = tcf_exts_num_actions(exts); + ret = kzalloc(size + sizeof(*rule) + + num_ent * sizeof(rule->action.entries[0]), + GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + + rule = (struct flow_rule *)((uintptr_t)ret + size); 
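+	/* Everything lives in one allocation: the first @size bytes are
+	 * reserved for a copy of the caller's private struct (the flower
+	 * offload passed to alloc_flow_rule()), followed by the flow_rule
+	 * and its num_ent action entries.
+	 */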
+ rule->action.num_entries = num_ent; + err = build_rule_action(exts, rule); + if (err) + goto out; + + if (priv) + memcpy(ret, priv, size); + rule->buff = ret; + rule->priv = priv; + rule->priv_size = size; + + return rule; + +out: + kfree(ret); + return ERR_PTR(err); +} +EXPORT_SYMBOL(__alloc_flow_rule); + +struct flow_rule *alloc_flow_rule(struct tc_cls_flower_offload **f) +{ + struct flow_rule *rule; + + rule = __alloc_flow_rule((*f)->exts, *f, sizeof(**f)); + if (IS_ERR(rule)) + return rule; + + build_rule_match(*f, &rule->match); + + *f = (struct tc_cls_flower_offload *)rule->buff; + + return rule; +} +EXPORT_SYMBOL(alloc_flow_rule); + +void free_flow_rule(struct flow_rule *rule) +{ + if (rule->priv) + memcpy(rule->priv, rule->buff, rule->priv_size); + + kfree(rule->buff); +} +EXPORT_SYMBOL(free_flow_rule); +#endif /* HAVE_TC_SETUP_FLOW_ACTION */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/idr.c b/src/mlnx-ofa_kernel-5.8/compat/idr.c new file mode 100644 index 0000000..6edfe2e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/idr.c @@ -0,0 +1,57 @@ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_IDR_GET_NEXT_UL_EXPORTED + +#ifdef HAVE_IDR_RT +void *idr_get_next_ul(struct idr *idr, unsigned long *nextid) +{ + struct radix_tree_iter iter; + void __rcu **slot; + unsigned long base = idr->idr_base; + unsigned long id = *nextid; + + id = (id < base) ? 0 : id - base; + slot = radix_tree_iter_find(&idr->idr_rt, &iter, id); + if (!slot) + return NULL; + + *nextid = iter.index + base; + return rcu_dereference_raw(*slot); +} +EXPORT_SYMBOL(idr_get_next_ul); +int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, + unsigned long max, gfp_t gfp) +{ + struct radix_tree_iter iter; + void __rcu **slot; + unsigned int base = idr->idr_base; + unsigned int id = *nextid; + + if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) + idr->idr_rt.xa_flags |= IDR_RT_MARKER; + + id = (id < base) ? 
0 : id - base; + radix_tree_iter_init(&iter, id); + slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base); + if (IS_ERR(slot)) + return PTR_ERR(slot); + + *nextid = iter.index + base; + /*there is a memory barrier inside radix_tree_iter_replace() */ + radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr); + radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE); + + return 0; +} +EXPORT_SYMBOL_GPL(idr_alloc_u32); +#endif /* HAVE_IDR_RT */ + +#endif /* HAVE_IDR_GET_NEXT_UL_EXPORTED */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/interval_tree.c b/src/mlnx-ofa_kernel-5.8/compat/interval_tree.c new file mode 100644 index 0000000..b2432ab --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/interval_tree.c @@ -0,0 +1,18 @@ +#if (defined(HAVE_INTERVAL_TREE_EXPORTED) && !defined(CONFIG_INTERVAL_TREE)) || !defined(HAVE_INTERVAL_TREE_EXPORTED) +#include +#include +#include +#include + +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) + +INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, + unsigned long, __subtree_last, + START, LAST,, backport_interval_tree) + +EXPORT_SYMBOL_GPL(interval_tree_insert); +EXPORT_SYMBOL_GPL(interval_tree_remove); +EXPORT_SYMBOL_GPL(interval_tree_iter_first); +EXPORT_SYMBOL_GPL(interval_tree_iter_next); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/kfifo.c b/src/mlnx-ofa_kernel-5.8/compat/kfifo.c new file mode 100644 index 0000000..652f4e1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/kfifo.c @@ -0,0 +1,629 @@ +/* + * A generic kernel FIFO implementation + * + * Copyright (C) 2009/2010 Stefani Seibold + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * internal helper to calculate the unused elements in a fifo + */ +static inline unsigned int kfifo_unused(struct __kfifo *fifo) +{ + return (fifo->mask + 1) - (fifo->in - fifo->out); +} + +#define __kfifo_alloc LINUX_BACKPORT(__kfifo_alloc) +int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) +{ + /* + * round down to the next power of 2, since our 'let the indices + * wrap' technique works only in this case. 
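+	 * For example, a requested size of 100 is rounded down to 64, and
+	 * the resulting index mask is 63.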
+ */ + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + + if (size < 2) { + fifo->data = NULL; + fifo->mask = 0; + return -EINVAL; + } + + fifo->data = kmalloc(size * esize, gfp_mask); + + if (!fifo->data) { + fifo->mask = 0; + return -ENOMEM; + } + fifo->mask = size - 1; + + return 0; +} +EXPORT_SYMBOL_GPL(__kfifo_alloc); + +#define __kfifo_free LINUX_BACKPORT(__kfifo_free) +void __kfifo_free(struct __kfifo *fifo) +{ + kfree(fifo->data); + fifo->in = 0; + fifo->out = 0; + fifo->esize = 0; + fifo->data = NULL; + fifo->mask = 0; +} +EXPORT_SYMBOL_GPL(__kfifo_free); + +#define __kfifo_init LINUX_BACKPORT(__kfifo_init) +int __kfifo_init(struct __kfifo *fifo, void *buffer, + unsigned int size, size_t esize) +{ + size /= esize; + + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + fifo->data = buffer; + + if (size < 2) { + fifo->mask = 0; + return -EINVAL; + } + fifo->mask = size - 1; + + return 0; +} +EXPORT_SYMBOL_GPL(__kfifo_init); + +static void kfifo_copy_in(struct __kfifo *fifo, const void *src, + unsigned int len, unsigned int off) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(fifo->data + off, src, l); + memcpy(fifo->data, src + l, len - l); + /* + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter + */ + smp_wmb(); +} + +#define __kfifo_in LINUX_BACKPORT(__kfifo_in) +unsigned int __kfifo_in(struct __kfifo *fifo, + const void *buf, unsigned int len) +{ + unsigned int l; + + l = kfifo_unused(fifo); + if (len > l) + len = l; + + kfifo_copy_in(fifo, buf, len, fifo->in); + fifo->in += len; + return len; +} +EXPORT_SYMBOL_GPL(__kfifo_in); + +static void kfifo_copy_out(struct __kfifo *fifo, void *dst, + unsigned int len, unsigned int off) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(dst, fifo->data + off, l); + memcpy(dst + l, fifo->data, len - l); + /* + * make sure that the data is copied before + * incrementing the fifo->out index counter + */ + smp_wmb(); +} + +#define __kfifo_out_peek LINUX_BACKPORT(__kfifo_out_peek) +unsigned int __kfifo_out_peek(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + unsigned int l; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + + kfifo_copy_out(fifo, buf, len, fifo->out); + return len; +} +EXPORT_SYMBOL_GPL(__kfifo_out_peek); + +#define __kfifo_out LINUX_BACKPORT(__kfifo_out) +unsigned int __kfifo_out(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + len = __kfifo_out_peek(fifo, buf, len); + fifo->out += len; + return len; +} +EXPORT_SYMBOL_GPL(__kfifo_out); + +static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned int *copied) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + unsigned long ret; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_from_user(fifo->data + off, from, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + 
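/* The first chunk copied without faulting; now copy the part that
 * wraps around to the start of the buffer.
 */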
ret = copy_from_user(fifo->data, from + l, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } + /* + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter + */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} + +#define __kfifo_from_user LINUX_BACKPORT(__kfifo_from_user) +int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; + + if (esize != 1) + len /= esize; + + l = kfifo_unused(fifo); + if (len > l) + len = l; + + ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); + if (unlikely(ret)) { + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->in += len; + return err; +} +EXPORT_SYMBOL_GPL(__kfifo_from_user); + +static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, + unsigned int len, unsigned int off, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_to_user(to, fifo->data + off, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_to_user(to + l, fifo->data, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } + /* + * make sure that the data is copied before + * incrementing the fifo->out index counter + */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} + +#define __kfifo_to_user LINUX_BACKPORT(__kfifo_to_user) +int __kfifo_to_user(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; + + if (esize != 1) + len /= esize; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); + if (unlikely(ret)) { + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->out += len; + return err; +} +EXPORT_SYMBOL_GPL(__kfifo_to_user); + +static int setup_sgl_buf(struct scatterlist *sgl, void *buf, + int nents, unsigned int len) +{ + int n; + unsigned int l; + unsigned int off; + struct page *page; + + if (!nents) + return 0; + + if (!len) + return 0; + + n = 0; + page = virt_to_page(buf); + off = offset_in_page(buf); + l = 0; + + while (len >= l + PAGE_SIZE - off) { + struct page *npage; + + l += PAGE_SIZE; + buf += PAGE_SIZE; + npage = virt_to_page(buf); + if (page_to_phys(page) != page_to_phys(npage) - l) { + sg_set_page(sgl, page, l - off, off); + sgl = sg_next(sgl); + if (++n == nents || sgl == NULL) + return n; + page = npage; + len -= l - off; + l = off = 0; + } + } + sg_set_page(sgl, page, len, off); + return n + 1; +} + +static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, + int nents, unsigned int len, unsigned int off) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + unsigned int n; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + n = setup_sgl_buf(sgl, fifo->data + off, nents, l); + n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); + + return n; +} + +#define __kfifo_dma_in_prepare 
LINUX_BACKPORT(__kfifo_dma_in_prepare) +unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) +{ + unsigned int l; + + l = kfifo_unused(fifo); + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->in); +} +EXPORT_SYMBOL_GPL(__kfifo_dma_in_prepare); + +#define __kfifo_dma_out_prepare LINUX_BACKPORT(__kfifo_dma_out_prepare) +unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) +{ + unsigned int l; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->out); +} +EXPORT_SYMBOL_GPL(__kfifo_dma_out_prepare); + +unsigned int __kfifo_max_r(unsigned int len, size_t recsize) +{ + unsigned int max = (1 << (recsize << 3)) - 1; + + if (len > max) + return max; + return len; +} + +#define __KFIFO_PEEK(data, out, mask) \ + ((data)[(out) & (mask)]) +/* + * __kfifo_peek_n internal helper function for determinate the length of + * the next record in the fifo + */ +static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) +{ + unsigned int l; + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; + + l = __KFIFO_PEEK(data, fifo->out, mask); + + if (--recsize) + l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; + + return l; +} + +#define __KFIFO_POKE(data, in, mask, val) \ + ( \ + (data)[(in) & (mask)] = (unsigned char)(val) \ + ) + +/* + * __kfifo_poke_n internal helper function for storeing the length of + * the record into the fifo + */ +static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) +{ + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; + + __KFIFO_POKE(data, fifo->in, mask, n); + + if (recsize > 1) + __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); +} + +#define __kfifo_len_r LINUX_BACKPORT(__kfifo_len_r) +unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) +{ + return __kfifo_peek_n(fifo, recsize); +} +EXPORT_SYMBOL_GPL(__kfifo_len_r); + +#define __kfifo_in_r LINUX_BACKPORT(__kfifo_in_r) +unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, + unsigned int len, size_t recsize) +{ + if (len + recsize > kfifo_unused(fifo)) + return 0; + + __kfifo_poke_n(fifo, len, recsize); + + kfifo_copy_in(fifo, buf, len, fifo->in + recsize); + fifo->in += len + recsize; + return len; +} +EXPORT_SYMBOL_GPL(__kfifo_in_r); + +static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, + void *buf, unsigned int len, size_t recsize, unsigned int *n) +{ + *n = __kfifo_peek_n(fifo, recsize); + + if (len > *n) + len = *n; + + kfifo_copy_out(fifo, buf, len, fifo->out + recsize); + return len; +} + +#define __kfifo_out_peek_r LINUX_BACKPORT(__kfifo_out_peek_r) +unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) +{ + unsigned int n; + + if (fifo->in == fifo->out) + return 0; + + return kfifo_out_copy_r(fifo, buf, len, recsize, &n); +} +EXPORT_SYMBOL_GPL(__kfifo_out_peek_r); + +#define __kfifo_out_r LINUX_BACKPORT(__kfifo_out_r) +unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) +{ + unsigned int n; + + if (fifo->in == fifo->out) + return 0; + + len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); + fifo->out += n + recsize; + return len; +} +EXPORT_SYMBOL_GPL(__kfifo_out_r); + +#define __kfifo_skip_r LINUX_BACKPORT(__kfifo_skip_r) +void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int n; + + n = __kfifo_peek_n(fifo, 
recsize); + fifo->out += n + recsize; +} +EXPORT_SYMBOL_GPL(__kfifo_skip_r); + +#define __kfifo_from_user_r LINUX_BACKPORT(__kfifo_from_user_r) +int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied, size_t recsize) +{ + unsigned long ret; + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) { + *copied = 0; + return 0; + } + + __kfifo_poke_n(fifo, len, recsize); + + ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->in += len + recsize; + return 0; +} +EXPORT_SYMBOL_GPL(__kfifo_from_user_r); + +#define __kfifo_to_user_r LINUX_BACKPORT(__kfifo_to_user_r) +int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied, size_t recsize) +{ + unsigned long ret; + unsigned int n; + + if (fifo->in == fifo->out) { + *copied = 0; + return 0; + } + + n = __kfifo_peek_n(fifo, recsize); + if (len > n) + len = n; + + ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->out += n + recsize; + return 0; +} +EXPORT_SYMBOL_GPL(__kfifo_to_user_r); + +#define __kfifo_dma_in_prepare_r LINUX_BACKPORT(__kfifo_dma_in_prepare_r) +unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) +{ + if (!nents) + return -EINVAL; + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); +} +EXPORT_SYMBOL_GPL(__kfifo_dma_in_prepare_r); + +#define __kfifo_dma_in_finish_r LINUX_BACKPORT(__kfifo_dma_in_finish_r) +void __kfifo_dma_in_finish_r(struct __kfifo *fifo, + unsigned int len, size_t recsize) +{ + len = __kfifo_max_r(len, recsize); + __kfifo_poke_n(fifo, len, recsize); + fifo->in += len + recsize; +} +EXPORT_SYMBOL_GPL(__kfifo_dma_in_finish_r); + +#define __kfifo_dma_out_prepare_r LINUX_BACKPORT(__kfifo_dma_out_prepare_r) +unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) +{ + if (!nents) + return -EINVAL; + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > fifo->in - fifo->out) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); +} +EXPORT_SYMBOL_GPL(__kfifo_dma_out_prepare_r); + +#define __kfifo_dma_out_finish_r LINUX_BACKPORT(__kfifo_dma_out_finish_r) +void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int len; + + len = __kfifo_peek_n(fifo, recsize); + fifo->out += len + recsize; +} +EXPORT_SYMBOL_GPL(__kfifo_dma_out_finish_r); diff --git a/src/mlnx-ofa_kernel-5.8/compat/kstrtox.c b/src/mlnx-ofa_kernel-5.8/compat/kstrtox.c new file mode 100644 index 0000000..ed3c46e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/kstrtox.c @@ -0,0 +1,249 @@ +/* + * Convert integer string representation to an integer. + * If an integer doesn't fit into specified type, -E is returned. + * + * Integer starts with optional sign. + * kstrtou*() functions do not accept sign "-". + * + * Radix 0 means autodetection: leading "0x" implies radix 16, + * leading "0" implies radix 8, otherwise radix is 10. + * Autodetection hints work after optional sign, but not before. + * + * If -E is returned, result is not touched. 
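+ *
+ * Example: kstrtouint("26", 0, &v) and kstrtouint("0x1a", 0, &v) both
+ * store 26 in v, while kstrtouint("-26", 0, &v) returns -EINVAL because
+ * the unsigned variants reject a leading "-".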
+ */ +#include + +#ifndef CONFIG_COMPAT_IS_KSTRTOX +/* + * kstrto* was included in kernel 2.6.38.4 and causes conflicts with the + * version included in compat-wireless. We use strict_strtol to check if + * kstrto* is already available. + */ +#ifndef strict_strtoll + +#include +#include +#include +#include +#include +#include + +static inline char _tolower(const char c) +{ + return c | 0x20; +} + +static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) +{ + unsigned long long acc; + int ok; + + if (base == 0) { + if (s[0] == '0') { + if (_tolower(s[1]) == 'x' && isxdigit(s[2])) + base = 16; + else + base = 8; + } else + base = 10; + } + if (base == 16 && s[0] == '0' && _tolower(s[1]) == 'x') + s += 2; + + acc = 0; + ok = 0; + while (*s) { + unsigned int val; + + if ('0' <= *s && *s <= '9') + val = *s - '0'; + else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f') + val = _tolower(*s) - 'a' + 10; + else if (*s == '\n') { + if (*(s + 1) == '\0') + break; + else + return -EINVAL; + } else + return -EINVAL; + + if (val >= base) + return -EINVAL; + if (acc > div_u64(ULLONG_MAX - val, base)) + return -ERANGE; + acc = acc * base + val; + ok = 1; + + s++; + } + if (!ok) + return -EINVAL; + *res = acc; + return 0; +} + +#define kstrtoull LINUX_BACKPORT(kstrtoull) +int kstrtoull(const char *s, unsigned int base, unsigned long long *res) +{ + if (s[0] == '+') + s++; + return _kstrtoull(s, base, res); +} +EXPORT_SYMBOL_GPL(kstrtoull); + +#define kstrtoll LINUX_BACKPORT(kstrtoll) +int kstrtoll(const char *s, unsigned int base, long long *res) +{ + unsigned long long tmp; + int rv; + + if (s[0] == '-') { + rv = _kstrtoull(s + 1, base, &tmp); + if (rv < 0) + return rv; + if ((long long)(-tmp) >= 0) + return -ERANGE; + *res = -tmp; + } else { + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if ((long long)tmp < 0) + return -ERANGE; + *res = tmp; + } + return 0; +} +EXPORT_SYMBOL_GPL(kstrtoll); + +/* Internal, do not use. */ +#define _kstrtoul LINUX_BACKPORT(_kstrtoul) +int _kstrtoul(const char *s, unsigned int base, unsigned long *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(unsigned long)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(_kstrtoul); + +/* Internal, do not use. 
*/ +#define _kstrtol LINUX_BACKPORT(_kstrtol) +int _kstrtol(const char *s, unsigned int base, long *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(long)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(_kstrtol); + +#define kstrtouint LINUX_BACKPORT(kstrtouint) +int kstrtouint(const char *s, unsigned int base, unsigned int *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(unsigned int)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(kstrtouint); + +#define kstrtoint LINUX_BACKPORT(kstrtoint) +int kstrtoint(const char *s, unsigned int base, int *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(int)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(kstrtoint); + +#define kstrtou16 LINUX_BACKPORT(kstrtou16) +int kstrtou16(const char *s, unsigned int base, u16 *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(u16)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(kstrtou16); + +#define kstrtos16 LINUX_BACKPORT(kstrtos16) +int kstrtos16(const char *s, unsigned int base, s16 *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(s16)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(kstrtos16); + +#define kstrtou8 LINUX_BACKPORT(kstrtou8) +int kstrtou8(const char *s, unsigned int base, u8 *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(u8)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(kstrtou8); + +#define kstrtos8 LINUX_BACKPORT(kstrtos8) +int kstrtos8(const char *s, unsigned int base, s8 *res) +{ + long long tmp; + int rv; + + rv = kstrtoll(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (long long)(s8)tmp) + return -ERANGE; + *res = tmp; + return 0; +} +EXPORT_SYMBOL_GPL(kstrtos8); +#endif /* #ifndef strict_strtol */ +#endif /* #ifndef CONFIG_COMPAT_IS_KSTRTOX */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/macsec.c b/src/mlnx-ofa_kernel-5.8/compat/macsec.c new file mode 100644 index 0000000..6605a0a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/macsec.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include + +#ifndef HAVE_FUNC_MACSEC_GET_REAL_DEV +#include +#include + +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +#include +typedef struct ref_tracker *netdevice_tracker_compat; +#else +typedef struct {} netdevice_tracker_compat; +#endif + +struct macsec_dev_compat { + struct macsec_secy secy; + struct net_device *real_dev; + netdevice_tracker_compat dev_tracker; + struct pcpu_secy_stats __percpu *stats; + struct list_head secys; + struct gro_cells gro_cells; + enum macsec_offload offload; +}; + +struct net_device *macsec_get_real_dev(const struct net_device *dev) +{ + return (struct macsec_dev_compat *)netdev_priv(dev)->real_dev; +} +EXPORT_SYMBOL_GPL(macsec_get_real_dev); +#endif /* HAVE_FUNC_MACSEC_GET_REAL_DEV_ */ + +#ifndef HAVE_FUNC_NETDEV_MACSEC_IS_OFFLOADED +#include +#include + +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +#include +typedef struct ref_tracker *netdevice_tracker_compat; +#else +typedef struct {} netdevice_tracker_compat; +#endif + +struct 
macsec_dev_compat { + struct macsec_secy secy; + struct net_device *real_dev; + netdevice_tracker_compat dev_tracker; + struct pcpu_secy_stats __percpu *stats; + struct list_head secys; + struct gro_cells gro_cells; + enum macsec_offload offload; +}; + +#define MACSEC_OFFLOAD_PHY_COMPAT 1 +#define MACSEC_OFFLOAD_MAC_COMPAT 2 + +bool netdev_macsec_is_offloaded(struct net_device *dev) +{ + struct macsec_dev_compat *macsec_dev; + + if (!dev) + return false; + + macsec_dev = (struct macsec_dev_compat *)netdev_priv(dev); + + if (macsec_dev->offload == MACSEC_OFFLOAD_MAC_COMPAT || + macsec_dev->offload == MACSEC_OFFLOAD_PHY_COMPAT) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(netdev_macsec_is_offloaded); +#endif /* HAVE_FUNC_NETDEV_MACSEC_IS_OFFLOADED */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/main.c b/src/mlnx-ofa_kernel-5.8/compat/main.c new file mode 100644 index 0000000..0c7aac0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/main.c @@ -0,0 +1,104 @@ +#include + +#include "config.h" + +#ifdef CONFIG_COMPAT_FLOW_DISSECTOR +#include +#endif + +#ifndef HAVE_XARRAY +#include +#endif + +#ifdef CONFIG_COMPAT_CLS_FLOWER_4_18_MOD +#include +#endif +MODULE_AUTHOR("Luis R. Rodriguez"); +MODULE_DESCRIPTION("Kernel backport module"); +MODULE_LICENSE("GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif + +#ifndef COMPAT_BASE +#error "You need a COMPAT_BASE" +#endif + +#ifndef COMPAT_BASE_TREE +#error "You need a COMPAT_BASE_TREE" +#endif + +#ifndef COMPAT_BASE_TREE_VERSION +#error "You need a COMPAT_BASE_TREE_VERSION" +#endif + +#ifndef COMPAT_VERSION +#error "You need a COMPAT_VERSION" +#endif + +static char *compat_base = COMPAT_BASE; +static char *compat_base_tree = COMPAT_BASE_TREE; +static char *compat_base_tree_version = COMPAT_BASE_TREE_VERSION; +static char *compat_version = COMPAT_VERSION; + +module_param(compat_base, charp, 0400); +MODULE_PARM_DESC(compat_base_tree, + "The upstream verion of compat.git used"); + +module_param(compat_base_tree, charp, 0400); +MODULE_PARM_DESC(compat_base_tree, + "The upstream tree used as base for this backport"); + +module_param(compat_base_tree_version, charp, 0400); +MODULE_PARM_DESC(compat_base_tree_version, + "The git-describe of the upstream base tree"); + +module_param(compat_version, charp, 0400); +MODULE_PARM_DESC(compat_version, + "Version of the kernel compat backport work"); + +void backport_dependency_symbol(void) +{ +} +EXPORT_SYMBOL_GPL(backport_dependency_symbol); + + +static int __init backport_init(void) +{ +#ifdef CONFIG_COMPAT_FLOW_DISSECTOR + init_default_flow_dissectors(); +#endif + + printk(KERN_INFO + COMPAT_PROJECT " backport release: " + COMPAT_VERSION + "\n"); + printk(KERN_INFO "Backport based on " + COMPAT_BASE_TREE " " COMPAT_BASE_TREE_VERSION + "\n"); + printk(KERN_INFO "compat.git: " + COMPAT_BASE_TREE "\n"); + +#ifndef HAVE_XARRAY + compat_radix_tree_init(); +#endif +#ifdef CONFIG_COMPAT_CLS_FLOWER_4_18_MOD + nf_flow_table_offload_init(); +#endif + return 0; +} +module_init(backport_init); + +static void __exit backport_exit(void) +{ +#ifndef HAVE_XARRAY + compat_radix_tree_clean(); +#endif +#ifdef CONFIG_COMPAT_CLS_FLOWER_4_18_MOD + nf_flow_table_offload_exit(); +#endif + + return; +} +module_exit(backport_exit); + diff --git a/src/mlnx-ofa_kernel-5.8/compat/mm_util.c b/src/mlnx-ofa_kernel-5.8/compat/mm_util.c new file mode 100644 index 0000000..a13dbd3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/mm_util.c @@ -0,0 +1,21 @@ +#if defined(RHEL_MAJOR) && RHEL_MAJOR -0 == 7 && RHEL_MINOR -0 >= 
2 +#ifndef HAVE_MEMDUP_USER_NUL +void *memdup_user_nul(const void __user *src, size_t len) +{ + char *p; + + p = kmalloc(len + 1, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + p[len] = '\0'; + + return p; +} +EXPORT_SYMBOL(memdup_user_nul); +#endif +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/mmu_notifier.c b/src/mlnx-ofa_kernel-5.8/compat/mmu_notifier.c new file mode 100644 index 0000000..78e964c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/mmu_notifier.c @@ -0,0 +1,21 @@ +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +#ifdef CONFIG_MMU_NOTIFIER +#ifndef HAVE_MMU_NOTIFIER_CALL_SRCU +#include +DEFINE_STATIC_SRCU(srcu); +void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)) +{ + call_srcu(&srcu, rcu, func); +} +EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); +#ifndef HAVE_MMU_NOTIFIER_SYNCHRONIZE +void mmu_notifier_synchronize(void) +{ + synchronize_srcu(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); +#endif +#endif +#endif +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/net_dim.c b/src/mlnx-ofa_kernel-5.8/compat/net_dim.c new file mode 100644 index 0000000..06811d8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/net_dim.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include + +/* + * Net DIM profiles: + * There are different set of profiles for each CQ period mode. + * There are different set of profiles for RX/TX CQs. + * Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES + */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + +#define NET_DIM_RX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \ +} + +#define NET_DIM_RX_CQE_PROFILES { \ + {2, 256}, \ + {8, 128}, \ + {16, 64}, \ + {32, 64}, \ + {64, 64} \ +} + +#define NET_DIM_TX_EQE_PROFILES { \ + {1, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {8, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {32, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {64, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}, \ + {128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE} \ +} + +#define NET_DIM_TX_CQE_PROFILES { \ + {5, 128}, \ + {8, 64}, \ + {16, 32}, \ + {32, 32}, \ + {64, 32} \ +} + +static const struct dim_cq_moder +rx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_RX_EQE_PROFILES, + NET_DIM_RX_CQE_PROFILES, +}; + +static const struct dim_cq_moder +tx_profile[DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = { + NET_DIM_TX_EQE_PROFILES, + NET_DIM_TX_CQE_PROFILES, +}; + +struct dim_cq_moder +net_dim_get_rx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} +EXPORT_SYMBOL(net_dim_get_rx_moderation); + +struct dim_cq_moder +net_dim_get_def_rx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? 
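+	/* both the CQE and EQE defaults select profile index 1 */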
+ NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_rx_moderation(cq_period_mode, profile_ix); +} +EXPORT_SYMBOL(net_dim_get_def_rx_moderation); + +struct dim_cq_moder +net_dim_get_tx_moderation(u8 cq_period_mode, int ix) +{ + struct dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix]; + + cq_moder.cq_period_mode = cq_period_mode; + return cq_moder; +} +EXPORT_SYMBOL(net_dim_get_tx_moderation); + +struct dim_cq_moder +net_dim_get_def_tx_moderation(u8 cq_period_mode) +{ + u8 profile_ix = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ? + NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE; + + return net_dim_get_tx_moderation(cq_period_mode, profile_ix); +} +EXPORT_SYMBOL(net_dim_get_def_tx_moderation); + +static int net_dim_step(struct dim *dim) +{ + if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) + return DIM_TOO_TIRED; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + case DIM_PARKING_TIRED: + break; + case DIM_GOING_RIGHT: + if (dim->profile_ix == (NET_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + break; + case DIM_GOING_LEFT: + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + break; + } + + dim->tired++; + return DIM_STEPPED; +} + +static void net_dim_exit_parking(struct dim *dim) +{ + dim->tune_state = dim->profile_ix ? DIM_GOING_LEFT : DIM_GOING_RIGHT; + net_dim_step(dim); +} + +static int net_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + if (!prev->bpms) + return curr->bpms ? DIM_STATS_BETTER : DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms)) + return (curr->bpms > prev->bpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->ppms) + return curr->ppms ? DIM_STATS_BETTER : + DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms)) + return (curr->ppms > prev->ppms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (!prev->epms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms)) + return (curr->epms < prev->epms) ? 
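+	/* for epms, lower is better: the same traffic handled with fewer completion events */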
DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_state = dim->tune_state; + int prev_ix = dim->profile_ix; + int stats_res; + int step_res; + + switch (dim->tune_state) { + case DIM_PARKING_ON_TOP: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_SAME) + net_dim_exit_parking(dim); + break; + + case DIM_PARKING_TIRED: + dim->tired--; + if (!dim->tired) + net_dim_exit_parking(dim); + break; + + case DIM_GOING_RIGHT: + case DIM_GOING_LEFT: + stats_res = net_dim_stats_compare(curr_stats, + &dim->prev_stats); + if (stats_res != DIM_STATS_BETTER) + dim_turn(dim); + + if (dim_on_top(dim)) { + dim_park_on_top(dim); + break; + } + + step_res = net_dim_step(dim); + switch (step_res) { + case DIM_ON_EDGE: + dim_park_on_top(dim); + break; + case DIM_TOO_TIRED: + dim_park_tired(dim); + break; + } + + break; + } + + if (prev_state != DIM_PARKING_ON_TOP || + dim->tune_state != DIM_PARKING_ON_TOP) + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + struct dim_stats curr_stats; + u16 nevents; + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = BIT_GAP(BITS_PER_TYPE(u16), + end_sample.event_ctr, + dim->start_sample.event_ctr); + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (net_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + fallthrough; + case DIM_START_MEASURE: + dim_update_sample(end_sample.event_ctr, end_sample.pkt_ctr, + end_sample.byte_ctr, &dim->start_sample); + dim->state = DIM_MEASURE_IN_PROGRESS; + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} +EXPORT_SYMBOL(net_dim); diff --git a/src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_core.c b/src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_core.c new file mode 100644 index 0000000..0a6d945 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_core.c @@ -0,0 +1,674 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(flowtable_lock); +static LIST_HEAD(flowtables); + +struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, + flow_setup_cb_t *cb, void *cb_ident) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, &block->cb_list, list) { + if (block_cb->cb == cb && + block_cb->cb_ident == cb_ident) + return block_cb; + } + + return NULL; +} +EXPORT_SYMBOL(flow_block_cb_lookup); + +static inline void flow_block_init(struct flow_block *flow_block) +{ + INIT_LIST_HEAD(&flow_block->cb_list); +} + +struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, + void *cb_ident, void *cb_priv, + void (*release)(void *cb_priv)) +{ + struct flow_block_cb *block_cb; + + block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); + if (!block_cb) + return ERR_PTR(-ENOMEM); + + block_cb->cb = cb; + block_cb->cb_ident = cb_ident; + block_cb->cb_priv = cb_priv; + block_cb->release = release; + + return block_cb; +} +EXPORT_SYMBOL(flow_block_cb_alloc); + +void flow_block_cb_free(struct flow_block_cb *block_cb) +{ + if (block_cb->release) + block_cb->release(block_cb->cb_priv); + + kfree(block_cb); +} +EXPORT_SYMBOL(flow_block_cb_free); + +static void +flow_offload_fill_dir(struct flow_offload 
*flow, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; + struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; + + ft->dir = dir; + + switch (ctt->src.l3num) { + case NFPROTO_IPV4: + ft->src_v4 = ctt->src.u3.in; + ft->dst_v4 = ctt->dst.u3.in; + break; + case NFPROTO_IPV6: + ft->src_v6 = ctt->src.u3.in6; + ft->dst_v6 = ctt->dst.u3.in6; + break; + } + + ft->l3proto = ctt->src.l3num; + ft->l4proto = ctt->dst.protonum; + ft->src_port = ctt->src.u.tcp.port; + ft->dst_port = ctt->dst.u.tcp.port; +} + +struct flow_offload *flow_offload_alloc(struct nf_conn *ct) +{ + struct flow_offload *flow; + + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) + return NULL; + + flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + if (!flow) + goto err_ct_refcnt; + + flow->ct = ct; + + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); + + if (ct->status & IPS_SRC_NAT) + __set_bit(NF_FLOW_SNAT, &flow->flags); + if (ct->status & IPS_DST_NAT) + __set_bit(NF_FLOW_DNAT, &flow->flags); + + return flow; + +err_ct_refcnt: + nf_ct_put(ct); + + return NULL; +} +EXPORT_SYMBOL_GPL(flow_offload_alloc); + +static int flow_offload_fill_route(struct flow_offload *flow, + const struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; + struct dst_entry *other_dst = route->tuple[!dir].dst; + struct dst_entry *dst = route->tuple[dir].dst; + + if (!dst_hold_safe(route->tuple[dir].dst)) + return -1; + + switch (flow_tuple->l3proto) { + case NFPROTO_IPV4: + flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); + break; + case NFPROTO_IPV6: + flow_tuple->mtu = ip6_dst_mtu_forward(dst); + break; + } + + flow_tuple->iifidx = other_dst->dev->ifindex; + flow_tuple->dst_cache = dst; + + return 0; +} + +int flow_offload_route_init(struct flow_offload *flow, + const struct nf_flow_route *route) +{ + int err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); + if (err < 0) + return err; + + err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); + if (err < 0) + goto err_route_reply; + + flow->type = NF_FLOW_OFFLOAD_ROUTE; + + return 0; + +err_route_reply: + dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); + + return err; +} +EXPORT_SYMBOL_GPL(flow_offload_route_init); + +static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +{ + tcp->state = TCP_CONNTRACK_ESTABLISHED; + tcp->seen[0].td_maxwin = 0; + tcp->seen[1].td_maxwin = 0; +} + +#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) +#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) + +static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) +{ + int l4num = nf_ct_protonum(ct); + unsigned int timeout; + + if (l4num == IPPROTO_TCP) + timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; + else if (l4num == IPPROTO_UDP) + timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; + else + return; + + if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) + ct->timeout = nfct_time_stamp + timeout; +} + +static void flow_offload_fixup_ct_state(struct nf_conn *ct) +{ + if (nf_ct_protonum(ct) == IPPROTO_TCP) + flow_offload_fixup_tcp(&ct->proto.tcp); +} + +static void flow_offload_fixup_ct(struct nf_conn *ct) +{ + flow_offload_fixup_ct_state(ct); + flow_offload_fixup_ct_timeout(ct); +} + +static void flow_offload_route_release(struct flow_offload *flow) +{ + dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); + 
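+	/* both directions hold a dst reference taken in flow_offload_fill_route() */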
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); +} + +void flow_offload_free(struct flow_offload *flow) +{ + switch (flow->type) { + case NF_FLOW_OFFLOAD_ROUTE: + flow_offload_route_release(flow); + break; + default: + break; + } + nf_ct_put(flow->ct); + kfree_rcu(flow, rcu_head); +} +EXPORT_SYMBOL_GPL(flow_offload_free); + +static u32 flow_offload_hash(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple *tuple = data; + + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple_rhash *tuplehash = data; + + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct flow_offload_tuple *tuple = arg->key; + const struct flow_offload_tuple_rhash *x = ptr; + + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + return 1; + + return 0; +} + +static const struct rhashtable_params nf_flow_offload_rhash_params = { + .head_offset = offsetof(struct flow_offload_tuple_rhash, node), + .hashfn = flow_offload_hash, + .obj_hashfn = flow_offload_hash_obj, + .obj_cmpfn = flow_offload_hash_cmp, + .automatic_shrinking = true, +}; + +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) +{ + int err; + + flow->timeout = nf_flowtable_time_stamp + + nf_flow_offload_timeout(flow_table); + + err = rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + nf_flow_offload_rhash_params); + if (err < 0) + return err; + + err = rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[1].node, + nf_flow_offload_rhash_params); + if (err < 0) { + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + nf_flow_offload_rhash_params); + return err; + } + + if (nf_flowtable_hw_offload(flow_table)) { + __set_bit(NF_FLOW_HW, &flow->flags); + nf_flow_offload_add(flow_table, flow); + } + + return 0; +} +EXPORT_SYMBOL_GPL(flow_offload_add); + +void flow_offload_refresh(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + flow->timeout = nf_flowtable_time_stamp + + nf_flow_offload_timeout(flow_table); + + if (likely(!nf_flowtable_hw_offload(flow_table) || + !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags))) + return; + + nf_flow_offload_add(flow_table, flow); +} +EXPORT_SYMBOL_GPL(flow_offload_refresh); + +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return nf_flow_timeout_delta(flow->timeout) <= 0; +} + +static void flow_offload_del(struct nf_flowtable *flow_table, + struct flow_offload *flow) +{ + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, + nf_flow_offload_rhash_params); + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, + nf_flow_offload_rhash_params); + + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); + + if (nf_flow_has_expired(flow)) + flow_offload_fixup_ct(flow->ct); + else + flow_offload_fixup_ct_timeout(flow->ct); + + flow_offload_free(flow); +} + +void flow_offload_teardown(struct flow_offload *flow) +{ + set_bit(NF_FLOW_TEARDOWN, &flow->flags); + + flow_offload_fixup_ct_state(flow->ct); +} +EXPORT_SYMBOL_GPL(flow_offload_teardown); + +struct flow_offload_tuple_rhash * +flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple) +{ + struct 
flow_offload_tuple_rhash *tuplehash; + struct flow_offload *flow; + int dir; + + tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, + nf_flow_offload_rhash_params); + if (!tuplehash) + return NULL; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) + return NULL; + + if (unlikely(nf_ct_is_dying(flow->ct))) + return NULL; + + return tuplehash; +} +EXPORT_SYMBOL_GPL(flow_offload_lookup); + +static int +nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct rhashtable_iter hti; + struct flow_offload *flow; + int err = 0; + + rhashtable_walk_enter(&flow_table->rhashtable, &hti); + rhashtable_walk_start(&hti); + + while ((tuplehash = rhashtable_walk_next(&hti))) { + if (IS_ERR(tuplehash)) { + if (PTR_ERR(tuplehash) != -EAGAIN) { + err = PTR_ERR(tuplehash); + break; + } + continue; + } + if (tuplehash->tuple.dir) + continue; + + flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); + + iter(flow, data); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + return err; +} + +static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) +{ + struct nf_flowtable *flow_table = data; + + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct)) + set_bit(NF_FLOW_TEARDOWN, &flow->flags); + + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (test_bit(NF_FLOW_HW, &flow->flags)) { + if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) + nf_flow_offload_del(flow_table, flow); + else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) + flow_offload_del(flow_table, flow); + } else { + flow_offload_del(flow_table, flow); + } + } else if (test_bit(NF_FLOW_HW, &flow->flags)) { + nf_flow_offload_stats(flow_table, flow); + } +} + +static void nf_flow_offload_work_gc(struct work_struct *work) +{ + struct nf_flowtable *flow_table; + + flow_table = container_of(work, struct nf_flowtable, gc_work.work); + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); + queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); +} + +int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, + flow_setup_cb_t *cb, void *cb_priv) +{ + struct flow_block *block = &flow_table->flow_block; + struct flow_block_cb *block_cb; + int err = 0; + + down_write(&flow_table->flow_block_lock); + block_cb = flow_block_cb_lookup(block, cb, cb_priv); + if (block_cb) { + err = -EEXIST; + goto unlock; + } + + block_cb = flow_block_cb_alloc(cb, cb_priv, cb_priv, NULL); + if (IS_ERR(block_cb)) { + err = PTR_ERR(block_cb); + goto unlock; + } + + list_add_tail(&block_cb->list, &block->cb_list); + +unlock: + up_write(&flow_table->flow_block_lock); + return err; +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb); + +void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, + flow_setup_cb_t *cb, void *cb_priv) +{ + struct flow_block *block = &flow_table->flow_block; + struct flow_block_cb *block_cb; + + down_write(&flow_table->flow_block_lock); + block_cb = flow_block_cb_lookup(block, cb, cb_priv); + if (block_cb) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } else { + WARN_ON(true); + } + up_write(&flow_table->flow_block_lock); +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb); + +static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) +{ + struct tcphdr *tcph; + + if 
(!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true); + + return 0; +} + +static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace2(&udph->check, skb, port, + new_port, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, + u8 protocol, __be16 port, __be16 new_port) +{ + switch (protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) + return NF_DROP; + break; + } + + return 0; +} + +int nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) +{ + struct flow_ports *hdr; + __be16 port, new_port; + + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || + skb_try_make_writable(skb, thoff + sizeof(*hdr))) + return -1; + + hdr = (void *)(skb_network_header(skb) + thoff); + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = hdr->source; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; + hdr->source = new_port; + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = hdr->dest; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; + hdr->dest = new_port; + break; + default: + return -1; + } + + return nf_flow_nat_port(skb, thoff, protocol, port, new_port); +} +EXPORT_SYMBOL_GPL(nf_flow_snat_port); + +int nf_flow_dnat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) +{ + struct flow_ports *hdr; + __be16 port, new_port; + + if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) || + skb_try_make_writable(skb, thoff + sizeof(*hdr))) + return -1; + + hdr = (void *)(skb_network_header(skb) + thoff); + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = hdr->dest; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; + hdr->dest = new_port; + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = hdr->source; + new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; + hdr->source = new_port; + break; + default: + return -1; + } + + return nf_flow_nat_port(skb, thoff, protocol, port, new_port); +} +EXPORT_SYMBOL_GPL(nf_flow_dnat_port); + +int nf_flow_table_init(struct nf_flowtable *flowtable) +{ + int err; + + flowtable->flow_timeout = flowtable->flow_timeout ? 
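+	/* a non-zero flow_timeout is given in seconds and scaled to jiffies via HZ */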
+ flowtable->flow_timeout * HZ : + NF_DEFAULT_FLOW_TIMEOUT; + + INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + flow_block_init(&flowtable->flow_block); + init_rwsem(&flowtable->flow_block_lock); + + err = rhashtable_init(&flowtable->rhashtable, + &nf_flow_offload_rhash_params); + if (err < 0) + return err; + + queue_delayed_work(system_power_efficient_wq, + &flowtable->gc_work, HZ); + + mutex_lock(&flowtable_lock); + list_add(&flowtable->list, &flowtables); + mutex_unlock(&flowtable_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_table_init); + +static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) +{ + struct net_device *dev = data; + + if (!dev) { + flow_offload_teardown(flow); + return; + } + + if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && + (flow->tuplehash[0].tuple.iifidx == dev->ifindex || + flow->tuplehash[1].tuple.iifidx == dev->ifindex)) + flow_offload_teardown(flow); +} + +static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev) +{ + nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); + flush_delayed_work(&flowtable->gc_work); + nf_flow_table_offload_flush(flowtable); +} + +void nf_flow_table_cleanup(struct net_device *dev) +{ + struct nf_flowtable *flowtable; + + mutex_lock(&flowtable_lock); + list_for_each_entry(flowtable, &flowtables, list) + nf_flow_table_iterate_cleanup(flowtable, dev); + mutex_unlock(&flowtable_lock); +} +EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); + +void nf_flow_table_free(struct nf_flowtable *flow_table) +{ + mutex_lock(&flowtable_lock); + list_del(&flow_table->list); + mutex_unlock(&flowtable_lock); + + cancel_delayed_work_sync(&flow_table->gc_work); + nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); + nf_flow_table_offload_flush(flow_table); + if (nf_flowtable_hw_offload(flow_table)) + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, + flow_table); + rhashtable_destroy(&flow_table->rhashtable); +} +EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_offload.c b/src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_offload.c new file mode 100644 index 0000000..16e4652 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/nf_flow_table_offload.c @@ -0,0 +1,924 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPS_HW_OFFLOAD_BIT 15 + +static struct workqueue_struct *nf_flow_offload_wq; + +struct flow_offload_work { + struct list_head list; + enum flow_cls_command cmd; + int priority; + struct nf_flowtable *flowtable; + struct flow_offload *flow; + struct work_struct work; +}; + +#define NF_FLOW_DISSECTOR(__match, __type, __field) \ + (__match)->dissector.offset[__type] = \ + offsetof(struct nf_flow_key, __field) + +static void nf_flow_rule_lwt_match(struct nf_flow_match *match, + struct ip_tunnel_info *tun_info) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + unsigned int enc_keys; + + if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) + return; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); + key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); + mask->enc_key_id.keyid = 0xffffffff; + enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + + 
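+	/* the outer (encap) source/destination come swapped from the TX-side tunnel info */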
if (ip_tunnel_info_af(tun_info) == AF_INET) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + enc_ipv4); + key->enc_ipv4.src = tun_info->key.u.ipv4.dst; + key->enc_ipv4.dst = tun_info->key.u.ipv4.src; + if (key->enc_ipv4.src) + mask->enc_ipv4.src = 0xffffffff; + if (key->enc_ipv4.dst) + mask->enc_ipv4.dst = 0xffffffff; + enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else { + memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, + sizeof(struct in6_addr)); + memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src, + sizeof(struct in6_addr)); + if (memcmp(&key->enc_ipv6.src, &in6addr_any, + sizeof(struct in6_addr))) + memset(&key->enc_ipv6.src, 0xff, + sizeof(struct in6_addr)); + if (memcmp(&key->enc_ipv6.dst, &in6addr_any, + sizeof(struct in6_addr))) + memset(&key->enc_ipv6.dst, 0xff, + sizeof(struct in6_addr)); + enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + match->dissector.used_keys |= enc_keys; +} + +static int nf_flow_rule_match(struct nf_flow_match *match, + const struct flow_offload_tuple *tuple, + struct dst_entry *other_dst) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + struct ip_tunnel_info *tun_info; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + + if (other_dst && other_dst->lwtstate) { + tun_info = lwt_tun_info(other_dst->lwtstate); + nf_flow_rule_lwt_match(match, tun_info); + } + + key->meta.ingress_ifindex = tuple->iifidx; + mask->meta.ingress_ifindex = 0xffffffff; + + switch (tuple->l3proto) { + case AF_INET: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + key->basic.n_proto = htons(ETH_P_IP); + key->ipv4.src = tuple->src_v4.s_addr; + mask->ipv4.src = 0xffffffff; + key->ipv4.dst = tuple->dst_v4.s_addr; + mask->ipv4.dst = 0xffffffff; + break; + case AF_INET6: + key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + key->basic.n_proto = htons(ETH_P_IPV6); + key->ipv6.src = tuple->src_v6; + memset(&mask->ipv6.src, 0xff, sizeof(mask->ipv6.src)); + key->ipv6.dst = tuple->dst_v6; + memset(&mask->ipv6.dst, 0xff, sizeof(mask->ipv6.dst)); + break; + default: + return -EOPNOTSUPP; + } + mask->control.addr_type = 0xffff; + match->dissector.used_keys |= BIT(key->control.addr_type); + mask->basic.n_proto = 0xffff; + + switch (tuple->l4proto) { + case IPPROTO_TCP: + key->tcp.flags = 0; + mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16); + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP); + break; + case IPPROTO_UDP: + break; + default: + return -EOPNOTSUPP; + } + + key->basic.ip_proto = tuple->l4proto; + mask->basic.ip_proto = 0xff; + + key->tp.src = tuple->src_port; + mask->tp.src = 0xffff; + key->tp.dst = tuple->dst_port; + mask->tp.dst = 0xffff; + + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) | + BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_PORTS); + return 0; +} + +static void flow_offload_mangle(struct flow_action_entry *entry, + enum flow_action_mangle_base htype, u32 offset, + const 
__be32 *value, const __be32 *mask) +{ + entry->id = FLOW_ACTION_MANGLE; + entry->mangle.htype = htype; + entry->mangle.offset = offset; + memcpy(&entry->mangle.mask, mask, sizeof(u32)); + memcpy(&entry->mangle.val, value, sizeof(u32)); +} + +static inline struct flow_action_entry * +flow_action_entry_next(struct nf_flow_rule *flow_rule) +{ + int i = flow_rule->rule->action.num_entries++; + + return &flow_rule->rule->action.entries[i]; +} + +static int flow_offload_eth_src(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + struct net_device *dev; + u32 mask, val; + u16 val16; + + dev = dev_get_by_index(net, tuple->iifidx); + if (!dev) + return -ENOENT; + + mask = ~0xffff0000; + memcpy(&val16, dev->dev_addr, 2); + val = val16 << 16; + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + &val, &mask); + + mask = ~0xffffffff; + memcpy(&val, dev->dev_addr + 2, 4); + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, + &val, &mask); + dev_put(dev); + + return 0; +} + +static int flow_offload_eth_dst(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); + const void *daddr = &flow->tuplehash[!dir].tuple.src_v4; + const struct dst_entry *dst_cache; + unsigned char ha[ETH_ALEN]; + struct neighbour *n; + u32 mask, val; + u8 nud_state; + u16 val16; + + dst_cache = flow->tuplehash[dir].tuple.dst_cache; + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -ENOENT; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + + if (!(nud_state & NUD_VALID)) { + neigh_release(n); + return -ENOENT; + } + + mask = ~0xffffffff; + memcpy(&val, ha, 4); + flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0, + &val, &mask); + + mask = ~0x0000ffff; + memcpy(&val16, ha + 4, 2); + val = val16; + flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, + &val, &mask); + neigh_release(n); + + return 0; +} + +static void flow_offload_ipv4_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + &addr, &mask); +} + +static void flow_offload_ipv4_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask = ~htonl(0xffffffff); + __be32 addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = 
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + offset = offsetof(struct iphdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + offset = offsetof(struct iphdr, saddr); + break; + default: + return; + } + + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset, + &addr, &mask); +} + +static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule, + unsigned int offset, + const __be32 *addr, const __be32 *mask) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) { + entry = flow_action_entry_next(flow_rule); + flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6, + offset + i, &addr[i], mask); + } +} + +static void flow_offload_ipv6_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const __be32 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, saddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, daddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); +} + +static void flow_offload_ipv6_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + u32 mask = ~htonl(0xffffffff); + const __be32 *addr; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, daddr); + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr32; + offset = offsetof(struct ipv6hdr, saddr); + break; + default: + return; + } + + flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask); +} + +static int flow_offload_l4proto(const struct flow_offload *flow) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + u8 type = 0; + + switch (protonum) { + case IPPROTO_TCP: + type = FLOW_ACT_MANGLE_HDR_TYPE_TCP; + break; + case IPPROTO_UDP: + type = FLOW_ACT_MANGLE_HDR_TYPE_UDP; + break; + default: + break; + } + + return type; +} + +static void flow_offload_port_snat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask, port; + u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port); + offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port); + offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); + break; + default: + return; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + &port, &mask); +} + +static void flow_offload_port_dnat(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + u32 mask, port; 
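+	/* ports are rewritten as a 32-bit mangle at offset 0; the shift and
+	 * mask select the source or destination half of the first L4 word
+	 */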
+ u32 offset; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port); + offset = 0; /* offsetof(struct tcphdr, dest); */ + port = htonl(port); + mask = ~htonl(0xffff); + break; + case FLOW_OFFLOAD_DIR_REPLY: + port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port); + offset = 0; /* offsetof(struct tcphdr, source); */ + port = htonl(port << 16); + mask = ~htonl(0xffff0000); + break; + default: + return; + } + + flow_offload_mangle(entry, flow_offload_l4proto(flow), offset, + &port, &mask); +} + +static void flow_offload_ipv4_checksum(struct net *net, + const struct flow_offload *flow, + struct nf_flow_rule *flow_rule) +{ + u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto; + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + + entry->id = FLOW_ACTION_CSUM; + entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR; + + switch (protonum) { + case IPPROTO_TCP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP; + break; + case IPPROTO_UDP: + entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP; + break; + } +} + +static void flow_offload_redirect(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + struct rtable *rt; + + rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + entry->id = FLOW_ACTION_REDIRECT; + entry->dev = rt->dst.dev; + dev_hold(rt->dst.dev); +} + +static void flow_offload_encap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + struct dst_entry *dst; + + dst = flow->tuplehash[dir].tuple.dst_cache; + if (dst && dst->lwtstate) { + struct ip_tunnel_info *tun_info; + + tun_info = lwt_tun_info(dst->lwtstate); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_TUNNEL_ENCAP; + entry->tunnel = tun_info; + } + } +} + +static void flow_offload_decap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + struct dst_entry *dst; + + dst = flow->tuplehash[!dir].tuple.dst_cache; + if (dst && dst->lwtstate) { + struct ip_tunnel_info *tun_info; + + tun_info = lwt_tun_info(dst->lwtstate); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_TUNNEL_DECAP; + } + } +} + +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + flow_offload_decap_tunnel(flow, dir, flow_rule); + flow_offload_encap_tunnel(flow, dir, flow_rule); + + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + flow_offload_ipv4_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + flow_offload_ipv4_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_SNAT, &flow->flags) || + test_bit(NF_FLOW_DNAT, &flow->flags)) + flow_offload_ipv4_checksum(net, flow, flow_rule); + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); + +int 
nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + flow_offload_decap_tunnel(flow, dir, flow_rule); + flow_offload_encap_tunnel(flow, dir, flow_rule); + + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || + flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + return -1; + + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + flow_offload_ipv6_snat(net, flow, dir, flow_rule); + flow_offload_port_snat(net, flow, dir, flow_rule); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + flow_offload_ipv6_dnat(net, flow, dir, flow_rule); + flow_offload_port_dnat(net, flow, dir, flow_rule); + } + + flow_offload_redirect(flow, dir, flow_rule); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6); + +#define NF_FLOW_RULE_ACTION_MAX 16 + +static inline struct flow_rule *flow_rule_alloc(unsigned int num_actions) +{ + struct flow_rule *rule; + + rule = kzalloc(struct_size(rule, action.entries, num_actions), + GFP_KERNEL); + if (!rule) + return NULL; + + rule->action.num_entries = num_actions; + + return rule; +} + +static struct nf_flow_rule * +nf_flow_offload_rule_alloc(struct net *net, + const struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + const struct nf_flowtable *flowtable = offload->flowtable; + const struct flow_offload *flow = offload->flow; + const struct flow_offload_tuple *tuple; + struct nf_flow_rule *flow_rule; + struct dst_entry *other_dst; + int err = -ENOMEM; + + flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); + if (!flow_rule) + goto err_flow; + + flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX); + if (!flow_rule->rule) + goto err_flow_rule; + + flow_rule->rule->match.dissector = &flow_rule->match.dissector; + flow_rule->rule->match.mask = &flow_rule->match.mask; + flow_rule->rule->match.key = &flow_rule->match.key; + + tuple = &flow->tuplehash[dir].tuple; + other_dst = flow->tuplehash[!dir].tuple.dst_cache; + err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst); + if (err < 0) + goto err_flow_match; + + flow_rule->rule->action.num_entries = 0; + if (flowtable->type->action(net, flow, dir, flow_rule) < 0) + goto err_flow_match; + + return flow_rule; + +err_flow_match: + kfree(flow_rule->rule); +err_flow_rule: + kfree(flow_rule); +err_flow: + return NULL; +} + +static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + int i; + + for (i = 0; i < flow_rule->rule->action.num_entries; i++) { + entry = &flow_rule->rule->action.entries[i]; + if (entry->id != FLOW_ACTION_REDIRECT) + continue; + + dev_put(entry->dev); + } + kfree(flow_rule->rule); + kfree(flow_rule); +} + +static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[]) +{ + int i; + + for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++) + __nf_flow_offload_destroy(flow_rule[i]); +} + +static int nf_flow_offload_alloc(const struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + struct net *net = read_pnet(&offload->flowtable->net); + + flow_rule[0] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_ORIGINAL); + if (!flow_rule[0]) + return -ENOMEM; + + flow_rule[1] = nf_flow_offload_rule_alloc(net, offload, + FLOW_OFFLOAD_DIR_REPLY); + if (!flow_rule[1]) { + __nf_flow_offload_destroy(flow_rule[0]); + return -ENOMEM; + } + + return 0; +} + +static void nf_flow_offload_init(struct flow_cls_offload1 *cls_flow, + __be16 proto, int priority, + enum flow_cls_command cmd, + const struct 
flow_offload_tuple *tuple, + struct netlink_ext_ack *extack) +{ + cls_flow->common.protocol = proto; + cls_flow->common.prio = priority; + cls_flow->common.extack = extack; + cls_flow->command = cmd; + cls_flow->cookie = (unsigned long)tuple; +} + +static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, + struct flow_offload *flow, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir, + int priority, int cmd, + struct flow_stats *stats, + struct list_head *block_cb_list) +{ + struct flow_cls_offload1 cls_flow = {}; + struct flow_block_cb *block_cb; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + int err, i = 0; + + nf_flow_offload_init(&cls_flow, proto, priority, cmd, + &flow->tuplehash[dir].tuple, &extack); + if (cmd == FLOW_CLS_REPLACE) + cls_flow.rule = flow_rule->rule; + + down_read(&flowtable->flow_block_lock); + list_for_each_entry(block_cb, block_cb_list, list) { + err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, + block_cb->cb_priv); + if (err < 0) + continue; + + i++; + } + up_read(&flowtable->flow_block_lock); + + if (cmd == FLOW_CLS_STATS) + memcpy(stats, &cls_flow.stats, sizeof(*stats)); + + return i; +} + +static int flow_offload_tuple_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule, + enum flow_offload_tuple_dir dir) +{ + return nf_flow_offload_tuple(offload->flowtable, offload->flow, + flow_rule, dir, offload->priority, + FLOW_CLS_REPLACE, NULL, + &offload->flowtable->flow_block.cb_list); +} + +static void flow_offload_tuple_del(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir) +{ + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_DESTROY, NULL, + &offload->flowtable->flow_block.cb_list); +} + +static int flow_offload_rule_add(struct flow_offload_work *offload, + struct nf_flow_rule *flow_rule[]) +{ + int ok_count = 0; + + ok_count += flow_offload_tuple_add(offload, flow_rule[0], + FLOW_OFFLOAD_DIR_ORIGINAL); + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); + if (ok_count == 0) + return -ENOENT; + + return 0; +} + +static void flow_offload_work_add(struct flow_offload_work *offload) +{ + struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX]; + int err; + + err = nf_flow_offload_alloc(offload, flow_rule); + if (err < 0) + return; + + err = flow_offload_rule_add(offload, flow_rule); + if (err < 0) + set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + else + set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); + + nf_flow_offload_destroy(flow_rule); +} + +static void flow_offload_work_del(struct flow_offload_work *offload) +{ + clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); +} + +static void flow_offload_tuple_stats(struct flow_offload_work *offload, + enum flow_offload_tuple_dir dir, + struct flow_stats *stats) +{ + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_STATS, stats, + &offload->flowtable->flow_block.cb_list); +} + +static void flow_offload_work_stats(struct flow_offload_work *offload) +{ + u64 flow_timeout = nf_flow_offload_timeout(offload->flowtable); + struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {}; + u64 lastused; + + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, 
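+	/* reply-direction counters fill the second stats slot */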
&stats[1]); + + lastused = max_t(u64, stats[0].lastused, stats[1].lastused); + offload->flow->timeout = max_t(u64, offload->flow->timeout, + lastused + flow_timeout); +} + +static void flow_offload_work_handler(struct work_struct *work) +{ + struct flow_offload_work *offload; + + offload = container_of(work, struct flow_offload_work, work); + switch (offload->cmd) { + case FLOW_CLS_REPLACE: + flow_offload_work_add(offload); + break; + case FLOW_CLS_DESTROY: + flow_offload_work_del(offload); + break; + case FLOW_CLS_STATS: + flow_offload_work_stats(offload); + break; + default: + WARN_ON_ONCE(1); + } + + clear_bit(NF_FLOW_HW_PENDING, &offload->flow->flags); + kfree(offload); +} + +static void flow_offload_queue_work(struct flow_offload_work *offload) +{ + queue_work(nf_flow_offload_wq, &offload->work); +} + +static struct flow_offload_work * +nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, + struct flow_offload *flow, unsigned int cmd) +{ + struct flow_offload_work *offload; + + if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags)) + return NULL; + + offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) { + clear_bit(NF_FLOW_HW_PENDING, &flow->flags); + return NULL; + } + + offload->cmd = cmd; + offload->flow = flow; + offload->priority = flowtable->priority; + offload->flowtable = flowtable; + INIT_WORK(&offload->work, flow_offload_work_handler); + + return offload; +} + + +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE); + if (!offload) + return; + + flow_offload_queue_work(offload); +} + +void nf_flow_offload_del(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY); + if (!offload) + return; + + set_bit(NF_FLOW_HW_DYING, &flow->flags); + flow_offload_queue_work(offload); +} + +void nf_flow_offload_stats(struct nf_flowtable *flowtable, + struct flow_offload *flow) +{ + struct flow_offload_work *offload; + __s32 delta; + + delta = nf_flow_timeout_delta(flow->timeout); + if ((delta >= (9 * nf_flow_offload_timeout(flowtable)) / 10)) + return; + + offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS); + if (!offload) + return; + + flow_offload_queue_work(offload); +} + +void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) +{ + if (nf_flowtable_hw_offload(flowtable)) + flush_workqueue(nf_flow_offload_wq); +} + +int nf_flow_table_offload_init(void) +{ + nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", + WQ_UNBOUND, 0); + if (!nf_flow_offload_wq) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_init); + +void nf_flow_table_offload_exit(void) +{ + destroy_workqueue(nf_flow_offload_wq); +} +EXPORT_SYMBOL_GPL(nf_flow_table_offload_exit); diff --git a/src/mlnx-ofa_kernel-5.8/compat/output_core.c b/src/mlnx-ofa_kernel-5.8/compat/output_core.c new file mode 100644 index 0000000..f987a0a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/output_core.c @@ -0,0 +1,27 @@ +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_IP6_DST_HOPLIMIT +#if IS_ENABLED(CONFIG_IPV6) +#define ip6_dst_hoplimit LINUX_BACKPORT(ip6_dst_hoplimit) +int ip6_dst_hoplimit(struct dst_entry *dst) +{ + int hoplimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hoplimit < 0) { + struct net_device *dev = dst->dev; + struct inet6_dev *idev = 
in6_dev_get(dev); + if (idev) { + hoplimit = idev->cnf.hop_limit; + in6_dev_put(idev); + } else + hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + } + return hoplimit; +} +EXPORT_SYMBOL(ip6_dst_hoplimit); +#endif +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/pci.c b/src/mlnx-ofa_kernel-5.8/compat/pci.c new file mode 100644 index 0000000..5e35bda --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/pci.c @@ -0,0 +1,393 @@ + +#include + +const unsigned char pcie_link_speed[] = { + PCI_SPEED_UNKNOWN, /* 0 */ + PCIE_SPEED_2_5GT, /* 1 */ + PCIE_SPEED_5_0GT, /* 2 */ + PCIE_SPEED_8_0GT, /* 3 */ + PCIE_SPEED_16_0GT, /* 4 */ + PCI_SPEED_UNKNOWN, /* 5 */ + PCI_SPEED_UNKNOWN, /* 6 */ + PCI_SPEED_UNKNOWN, /* 7 */ + PCI_SPEED_UNKNOWN, /* 8 */ + PCI_SPEED_UNKNOWN, /* 9 */ + PCI_SPEED_UNKNOWN, /* A */ + PCI_SPEED_UNKNOWN, /* B */ + PCI_SPEED_UNKNOWN, /* C */ + PCI_SPEED_UNKNOWN, /* D */ + PCI_SPEED_UNKNOWN, /* E */ + PCI_SPEED_UNKNOWN /* F */ +}; + +#ifndef HAVE_PCIE_GET_MINIMUM_LINK +/** + * pcie_get_minimum_link - determine minimum link settings of a PCI device + * @dev: PCI device to query + * @speed: storage for minimum speed + * @width: storage for minimum width + * + * This function will walk up the PCI device chain and determine the minimum + * link width and speed of the device. + */ +int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + int ret; + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + while (dev) { + u16 lnksta; + enum pci_bus_speed next_speed; + enum pcie_link_width next_width; + + ret = pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + if (ret) + return ret; + + next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS]; + next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + if (next_speed < *speed) + *speed = next_speed; + + if (next_width < *width) + *width = next_width; + + dev = dev->bus->self; + } + + return 0; +} +EXPORT_SYMBOL(pcie_get_minimum_link); +#endif + +#ifndef HAVE_PCIE_PRINT_LINK_STATUS +/** + * pcie_bandwidth_available - determine minimum link settings of a PCIe + * device and its bandwidth limitation + * @dev: PCI device to query + * @limiting_dev: storage for device causing the bandwidth limitation + * @speed: storage for speed of limiting device + * @width: storage for width of limiting device + * + * Walk up the PCI device chain and find the point where the minimum + * bandwidth is available. Return the bandwidth available there and (if + * limiting_dev, speed, and width pointers are supplied) information about + * that point. The bandwidth returned is in Mb/s, i.e., megabits/second of + * raw bandwidth. 
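As a usage sketch, a driver probe path would typically call the backported pcie_get_minimum_link() above like this; the helper name report_min_link() is illustrative and not part of this patch, and <linux/pci.h> is assumed to be in scope as it is for the rest of this file:

        static void report_min_link(struct pci_dev *pdev)
        {
                enum pci_bus_speed speed;
                enum pcie_link_width width;

                if (pcie_get_minimum_link(pdev, &speed, &width))
                        return;         /* config space read failed somewhere on the chain */

                if (speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
                        return;         /* nothing PCIe-capable was found */

                pci_info(pdev, "minimum link: speed enum %d, width x%d\n",
                         speed, width);
        }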
+ */ +u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + u16 lnksta; + enum pci_bus_speed next_speed; + enum pcie_link_width next_width; + u32 bw, next_bw; + + if (speed) + *speed = PCI_SPEED_UNKNOWN; + if (width) + *width = PCIE_LNK_WIDTH_UNKNOWN; + + bw = 0; + + while (dev) { + pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); + + next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS]; + next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed); + + /* Check if current device limits the total bandwidth */ + if (!bw || next_bw <= bw) { + bw = next_bw; + + if (limiting_dev) + *limiting_dev = dev; + if (speed) + *speed = next_speed; + if (width) + *width = next_width; + } + + dev = pci_upstream_bridge(dev); + } + + return bw; +} +EXPORT_SYMBOL(pcie_bandwidth_available); + +/** + * pcie_get_speed_cap - query for the PCI device's link speed capability + * @dev: PCI device to query + * + * Query the PCI device speed capability. Return the maximum link speed + * supported by the device. + */ +#define pcie_get_speed_cap LINUX_BACKPORT(pcie_get_speed_cap) +enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev) +{ + u32 lnkcap2, lnkcap; + + /* + * PCIe r4.0 sec 7.5.3.18 recommends using the Supported Link + * Speeds Vector in Link Capabilities 2 when supported, falling + * back to Max Link Speed in Link Capabilities otherwise. + */ + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); + if (lnkcap2) { /* PCIe r3.0-compliant */ + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) + return PCIE_SPEED_16_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) + return PCIE_SPEED_8_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB) + return PCIE_SPEED_5_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB) + return PCIE_SPEED_2_5GT; + return PCI_SPEED_UNKNOWN; + } + + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + if (lnkcap) { + if (lnkcap & PCI_EXP_LNKCAP_SLS_16_0GB) + return PCIE_SPEED_16_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_8_0GB) + return PCIE_SPEED_8_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_5_0GB) + return PCIE_SPEED_5_0GT; + else if (lnkcap & PCI_EXP_LNKCAP_SLS_2_5GB) + return PCIE_SPEED_2_5GT; + } + + return PCI_SPEED_UNKNOWN; +} + +/** + * pcie_get_width_cap - query for the PCI device's link width capability + * @dev: PCI device to query + * + * Query the PCI device width capability. Return the maximum link width + * supported by the device. + */ +#define pcie_get_width_cap LINUX_BACKPORT(pcie_get_width_cap) +enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev) +{ + u32 lnkcap; + + pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap); + if (lnkcap) + return (lnkcap & PCI_EXP_LNKCAP_MLW) >> 4; + + return PCIE_LNK_WIDTH_UNKNOWN; +} + +/** + * pcie_bandwidth_capable - calculate a PCI device's link bandwidth capability + * @dev: PCI device + * @speed: storage for link speed + * @width: storage for link width + * + * Calculate a PCI device's link bandwidth by querying for its link speed + * and width, multiplying them, and applying encoding overhead. The result + * is in Mb/s, i.e., megabits/second of raw bandwidth. 
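The per-lane figures multiplied in here come from PCIE_SPEED2MBS_ENC(), which folds the line encoding into the raw signalling rate; a short worked example with the mainline values, shown only for orientation:

        static u32 example_gen3_x8_bandwidth(void)
        {
                /* 8 GT/s uses 128b/130b encoding: 8000 * 128 / 130 = 7876 Mb/s per lane */
                u32 per_lane = 8000 * 128 / 130;   /* PCIE_SPEED2MBS_ENC(PCIE_SPEED_8_0GT) */

                /* An x8 link therefore yields 63008 Mb/s, which
                 * pcie_print_link_status() reports as "63.008 Gb/s".
                 */
                return 8 * per_lane;
        }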
+ */ +#define pcie_bandwidth_capable LINUX_BACKPORT(pcie_bandwidth_capable) +u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + *speed = pcie_get_speed_cap(dev); + *width = pcie_get_width_cap(dev); + + if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) + return 0; + + return *width * PCIE_SPEED2MBS_ENC(*speed); +} + +/** + * pcie_print_link_status - Report the PCI device's link speed and width + * @dev: PCI device to query + * + * Report the available bandwidth at the device. If this is less than the + * device is capable of, report the device's maximum possible bandwidth and + * the upstream link that limits its performance to less than that. + */ +void pcie_print_link_status(struct pci_dev *dev) +{ + enum pcie_link_width width, width_cap; + enum pci_bus_speed speed, speed_cap; + struct pci_dev *limiting_dev = NULL; + u32 bw_avail, bw_cap; + + bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap); + bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width); + + if (bw_avail >= bw_cap) + pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth (%s x%d link)\n", + bw_cap / 1000, bw_cap % 1000, + PCIE_SPEED2STR(speed_cap), width_cap); + else + pci_info(dev, "%u.%03u Gb/s available PCIe bandwidth, limited by %s x%d link at %s (capable of %u.%03u Gb/s with %s x%d link)\n", + bw_avail / 1000, bw_avail % 1000, + PCIE_SPEED2STR(speed), width, + limiting_dev ? pci_name(limiting_dev) : "", + bw_cap / 1000, bw_cap % 1000, + PCIE_SPEED2STR(speed_cap), width_cap); +} +EXPORT_SYMBOL(pcie_print_link_status); +#endif /* HAVE_PCIE_PRINT_LINK_STATUS */ + +#ifndef HAVE_PCI_ENABLE_ATOMIC_OPS_TO_ROOT +/** + * pci_enable_atomic_ops_to_root - enable AtomicOp requests to root port + * @dev: the PCI device + * @cap_mask: mask of desired AtomicOp sizes, including one or more of: + * PCI_EXP_DEVCAP2_ATOMIC_COMP32 + * PCI_EXP_DEVCAP2_ATOMIC_COMP64 + * PCI_EXP_DEVCAP2_ATOMIC_COMP128 + * + * Return 0 if all upstream bridges support AtomicOp routing, egress + * blocking is disabled on all upstream ports, and the root port supports + * the requested completion capabilities (32-bit, 64-bit and/or 128-bit + * AtomicOp completion), or negative otherwise. 
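Callers normally treat this as best effort at probe time; a minimal sketch, assuming the common case of requesting 64-bit AtomicOp completion support (the wrapper name try_enable_atomic_ops() is illustrative only):

        static void try_enable_atomic_ops(struct pci_dev *pdev)
        {
                if (pci_enable_atomic_ops_to_root(pdev,
                                                  PCI_EXP_DEVCAP2_ATOMIC_COMP64))
                        pci_info(pdev, "PCIe AtomicOps to root port not available\n");
                /* In this sketch the device simply keeps working without AtomicOps. */
        }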
+ */ +int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask) +{ + struct pci_dev *bridge; + u32 ctl2, cap2; + u16 flags; + int rc = 0; + + bridge = dev->bus->self; + if (!bridge) + return -EINVAL; + + /* Check atomic routing support all the way to root complex */ + while (bridge->bus->parent) { + rc = pcie_capability_read_word(bridge, PCI_EXP_FLAGS, &flags); + if (rc || ((flags & PCI_EXP_FLAGS_VERS) < 2)) + return -EINVAL; + + rc = pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap2); + if (rc) + return -EINVAL; + rc = pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl2); + if (rc) + return -EINVAL; + + if (!(cap2 & PCI_EXP_DEVCAP2_ATOMIC_ROUTE) || + (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK)) + return -EINVAL; + bridge = bridge->bus->parent->self; + } + + rc = pcie_capability_read_word(bridge, PCI_EXP_FLAGS, &flags); + if (rc || ((flags & PCI_EXP_FLAGS_VERS) < 2)) + return -EINVAL; + + rc = pcie_capability_read_dword(bridge, PCI_EXP_DEVCAP2, &cap2); + if (rc || !(cap2 & cap_mask)) + return -EINVAL; + + /* Set atomic operations */ + pcie_capability_set_word(dev, PCI_EXP_DEVCTL2, + PCI_EXP_DEVCTL2_ATOMIC_REQ); + return 0; +} +EXPORT_SYMBOL(pci_enable_atomic_ops_to_root); +#endif + +#ifdef HAVE_NO_LINKSTA_SYSFS +/* based on upstream drivers/pci/pci-sysfs.c*/ +static ssize_t current_link_speed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u16 linkstat; + int err; + const char *speed; + + err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, &linkstat); + if (err) + return -EINVAL; + + switch (linkstat & PCI_EXP_LNKSTA_CLS) { + case PCI_EXP_LNKSTA_CLS_16_0GB: + speed = "16 GT/s"; + break; + case PCI_EXP_LNKSTA_CLS_8_0GB: + speed = "8 GT/s"; + break; + case PCI_EXP_LNKSTA_CLS_5_0GB: + speed = "5 GT/s"; + break; + case PCI_EXP_LNKSTA_CLS_2_5GB: + speed = "2.5 GT/s"; + break; + default: + speed = "Unknown speed"; + } + + return sprintf(buf, "%s\n", speed); +} +static DEVICE_ATTR_RO(current_link_speed); + +static ssize_t current_link_width_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pci_dev = to_pci_dev(dev); + u16 linkstat; + int err; + + err = pcie_capability_read_word(pci_dev, PCI_EXP_LNKSTA, &linkstat); + if (err) + return -EINVAL; + + return sprintf(buf, "%u\n", + (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT); +} +static DEVICE_ATTR_RO(current_link_width); + +static struct attribute *pcie_dev_attrs[] = { + &dev_attr_current_link_speed.attr, + &dev_attr_current_link_width.attr, + NULL, +}; + +static umode_t pcie_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (pci_is_pcie(pdev)) + return a->mode; + + return 0; +} + +static const struct attribute_group pcie_dev_attr_group = { + .attrs = pcie_dev_attrs, + .is_visible = pcie_dev_attrs_are_visible, +}; + +void register_pcie_dev_attr_group(struct pci_dev *pdev) +{ + if (sysfs_create_group(&pdev->dev.kobj, &pcie_dev_attr_group)) + pci_info(pdev, "failed to register sysfs PCIe device group"); +} +EXPORT_SYMBOL(register_pcie_dev_attr_group); + +void unregister_pcie_dev_attr_group(struct pci_dev *pdev) +{ + sysfs_remove_group(&pdev->dev.kobj, &pcie_dev_attr_group); +} +EXPORT_SYMBOL(unregister_pcie_dev_attr_group); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/rdma_dim.c b/src/mlnx-ofa_kernel-5.8/compat/rdma_dim.c new file mode 100644 index 0000000..15462d5 --- 
/dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/rdma_dim.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. + */ + +#include + +static int rdma_dim_step(struct dim *dim) +{ + if (dim->tune_state == DIM_GOING_RIGHT) { + if (dim->profile_ix == (RDMA_DIM_PARAMS_NUM_PROFILES - 1)) + return DIM_ON_EDGE; + dim->profile_ix++; + dim->steps_right++; + } + if (dim->tune_state == DIM_GOING_LEFT) { + if (dim->profile_ix == 0) + return DIM_ON_EDGE; + dim->profile_ix--; + dim->steps_left++; + } + + return DIM_STEPPED; +} + +static int rdma_dim_stats_compare(struct dim_stats *curr, + struct dim_stats *prev) +{ + /* first stat */ + if (!prev->cpms) + return DIM_STATS_SAME; + + if (IS_SIGNIFICANT_DIFF(curr->cpms, prev->cpms)) + return (curr->cpms > prev->cpms) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + if (IS_SIGNIFICANT_DIFF(curr->cpe_ratio, prev->cpe_ratio)) + return (curr->cpe_ratio > prev->cpe_ratio) ? DIM_STATS_BETTER : + DIM_STATS_WORSE; + + return DIM_STATS_SAME; +} + +static bool rdma_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +{ + int prev_ix = dim->profile_ix; + u8 state = dim->tune_state; + int stats_res; + int step_res; + + if (state != DIM_PARKING_ON_TOP && state != DIM_PARKING_TIRED) { + stats_res = rdma_dim_stats_compare(curr_stats, + &dim->prev_stats); + + switch (stats_res) { + case DIM_STATS_SAME: + if (curr_stats->cpe_ratio <= 50 * prev_ix) + dim->profile_ix = 0; + break; + case DIM_STATS_WORSE: + dim_turn(dim); + fallthrough; + case DIM_STATS_BETTER: + step_res = rdma_dim_step(dim); + if (step_res == DIM_ON_EDGE) + dim_turn(dim); + break; + } + } + + dim->prev_stats = *curr_stats; + + return dim->profile_ix != prev_ix; +} + +void rdma_dim(struct dim *dim, u64 completions) +{ + struct dim_sample *curr_sample = &dim->measuring_sample; + struct dim_stats curr_stats; + u32 nevents; + + dim_update_sample_with_comps(curr_sample->event_ctr + 1, 0, 0, + curr_sample->comp_ctr + completions, + &dim->measuring_sample); + + switch (dim->state) { + case DIM_MEASURE_IN_PROGRESS: + nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; + if (nevents < DIM_NEVENTS) + break; + dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); + if (rdma_dim_decision(&curr_stats, dim)) { + dim->state = DIM_APPLY_NEW_PROFILE; + schedule_work(&dim->work); + break; + } + fallthrough; + case DIM_START_MEASURE: + dim->state = DIM_MEASURE_IN_PROGRESS; + dim_update_sample_with_comps(curr_sample->event_ctr, 0, 0, + curr_sample->comp_ctr, + &dim->start_sample); + break; + case DIM_APPLY_NEW_PROFILE: + break; + } +} +EXPORT_SYMBOL(rdma_dim); diff --git a/src/mlnx-ofa_kernel-5.8/compat/rhashtable.c b/src/mlnx-ofa_kernel-5.8/compat/rhashtable.c new file mode 100644 index 0000000..46978e6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/rhashtable.c @@ -0,0 +1,2344 @@ +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Copyright (c) 2015 Herbert Xu + * Copyright (c) 2014-2015 Thomas Graf + * Copyright (c) 2008-2014 Patrick McHardy + * + * Code partially derived from nft_hash + * Rewritten with rehash code from br_multicast plus single list + * pointer as suggested by Josh Triplett + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
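rdma_dim() above only selects a profile index and schedules dim->work; applying the new moderation values is left to the caller. A rough sketch of such a consumer, loosely modelled on the in-tree ib_core CQ usage; struct my_cq and my_hw_modify_cq() are hypothetical placeholders, and rdma_dim_prof is the RDMA profile table from linux/dim.h:

        struct my_cq {
                struct dim dim;     /* INIT_WORK(&dim.work, my_dim_work) at CQ setup */
                /* ... hardware CQ handle ... */
        };

        static void my_dim_work(struct work_struct *w)
        {
                struct dim *dim = container_of(w, struct dim, work);
                struct dim_cq_moder m = rdma_dim_prof[dim->profile_ix];
                struct my_cq *cq = container_of(dim, struct my_cq, dim);

                dim->state = DIM_START_MEASURE;
                my_hw_modify_cq(cq, m.comps, m.usec);   /* driver-specific moderation update */
        }

        /* Completion path, after polling n CQEs: rdma_dim(&cq->dim, n); */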
+ */ + +#include + +#if (!defined(HAVE_RHLTABLE) && defined(HAVE_NETNS_FRAGS_RHASHTABLE)) + +/* THIS PART OF THE BACKPORT IS FOR PRE-RHLTABLE KERNELS + * which added (via cherry-picking) struct rhashtable + * to struct netns_frags. This has been done in some stable + * branches, such as upstream kernel 4.4.181, and + * Ubuntu-4.4.0-145. + * + * This addition of rhashtable to that struct causes + * struct net (from file net_namespace.h) to include struct rhashtable + * indirectly in several places. Since a struct rhashtable which + * supports rhltable has a different size than struct rhashtable without + * rhltable support, this causes a size-mismatch between the inbox + * struct rhashtable and the backported rhashtable. + * + * Due to net notifiers, struct net used by the inbox kernel MUST + * have the same size as struct net used by the mlnx_ofed driver + * (See, for example, cma_init_net->cma_pernet->net_generic(), which + * uses offsets in struct net). + * Therefore, we cannot have the backport simply replace the inbox's + * rhashtable. + * + * Instead, we do a private implementation of the rhltable mechanism + * for use by those modules which need rhltable (such as mlx5_core, + * in file fs_core.c) -- and we use the inbox rhashtable implementation + * everywhere else. + * + * The private rhltable implementation also requires a private rhashtable + * implementation as well. The private implementations are achieved + * by adding prefix "bp_" to functions, structs and macros where required, + * in order not to have naming conflicts with the inbox rhashtable + * implementation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BP_HASH_DEFAULT_SIZE 64UL +#define BP_HASH_MIN_SIZE 4U +#define BP_BUCKET_LOCKS_PER_CPU 32UL + +int bp_alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, + size_t max_size, unsigned int cpu_mult, gfp_t gfp) +{ + spinlock_t *tlocks = NULL; + unsigned int i, size; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + + if (cpu_mult) { + nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); + size = min_t(unsigned int, nr_pcpus * cpu_mult, max_size); + } else { + size = max_size; + } + + if (sizeof(spinlock_t) != 0) { + if (gfpflags_allow_blocking(gfp)) + tlocks = kvmalloc(size * sizeof(spinlock_t), gfp); + else + tlocks = kmalloc_array(size, sizeof(spinlock_t), gfp); + if (!tlocks) + return -ENOMEM; + for (i = 0; i < size; i++) + spin_lock_init(&tlocks[i]); + } + + *locks = tlocks; + *locks_mask = size - 1; + + return 0; +} + +void bp_free_bucket_spinlocks(spinlock_t *locks) +{ + kvfree(locks); +} + +union bp_nested_table { + union bp_nested_table __rcu *table; + struct bp_rhash_head __rcu *bucket; +}; + +static u32 bp_head_hashfn(struct bp_rhashtable *ht, + const struct bp_bucket_table *tbl, + const struct bp_rhash_head *he) +{ + return bp_rht_head_hashfn(ht, tbl, he, ht->p); +} + +#ifdef CONFIG_PROVE_LOCKING +#define BP_ASSERT_RHT_MUTEX(HT) BUG_ON(!bp_lockdep_rht_mutex_is_held(HT)) + +int bp_lockdep_rht_mutex_is_held(struct bp_rhashtable *ht) +{ + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; +} +EXPORT_SYMBOL_GPL(bp_lockdep_rht_mutex_is_held); + +int bp_lockdep_rht_bucket_is_held(const struct bp_bucket_table *tbl, u32 hash) +{ + spinlock_t *lock = bp_rht_bucket_lock(tbl, hash); + + return (debug_locks) ? 
lockdep_is_held(lock) : 1; +} +EXPORT_SYMBOL_GPL(bp_lockdep_rht_bucket_is_held); +#else +#define BP_ASSERT_RHT_MUTEX(HT) +#endif + +static void bp_nested_table_free(union bp_nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_raw(ntbl->table); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + bp_nested_table_free(ntbl + i, size); + } + + kfree(ntbl); +} + +static void bp_nested_bucket_table_free(const struct bp_bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union bp_nested_table *ntbl; + unsigned int i; + + ntbl = (union bp_nested_table *)rcu_dereference_raw(tbl->buckets[0]); + + for (i = 0; i < len; i++) + bp_nested_table_free(ntbl + i, size); + + kfree(ntbl); +} + +static void bp_bucket_table_free(const struct bp_bucket_table *tbl) +{ + if (tbl->nest) + bp_nested_bucket_table_free(tbl); + + bp_free_bucket_spinlocks(tbl->locks); + kvfree(tbl); +} + +static void bp_bucket_table_free_rcu(struct rcu_head *head) +{ + bp_bucket_table_free(container_of(head, struct bp_bucket_table, rcu)); +} + +static union bp_nested_table *bp_nested_table_alloc(struct bp_rhashtable *ht, + union bp_nested_table __rcu **prev, + unsigned int shifted, + unsigned int nhash) +{ + union bp_nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && shifted) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) + BP_INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, + (i << shifted) | nhash); + } + + rcu_assign_pointer(*prev, ntbl); + + return ntbl; +} + +static struct bp_bucket_table *bp_nested_bucket_table_alloc(struct bp_rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bp_bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!bp_nested_table_alloc(ht, (union bp_nested_table __rcu **)tbl->buckets, + 0, 0)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + +static struct bp_bucket_table *bp_bucket_table_alloc(struct bp_rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + struct bp_bucket_table *tbl = NULL; + size_t size, max_locks; + int i; + + size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); + if (gfp != GFP_KERNEL) + tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); + else + tbl = kvzalloc(size, gfp); + + size = nbuckets; + + if (tbl == NULL && gfp != GFP_KERNEL) { + tbl = bp_nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; + } + if (tbl == NULL) + return NULL; + + tbl->size = size; + + max_locks = size >> 1; + if (tbl->nest) + max_locks = min_t(size_t, max_locks, 1U << tbl->nest); + + if (bp_alloc_bucket_spinlocks(&tbl->locks, &tbl->locks_mask, max_locks, + ht->p.locks_mul, gfp) < 0) { + bp_bucket_table_free(tbl); + return NULL; + } + + INIT_LIST_HEAD(&tbl->walkers); + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + + for (i = 0; i < nbuckets; i++) + BP_INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + + return tbl; +} + +static struct bp_bucket_table *bp_rhashtable_last_table(struct bp_rhashtable *ht, + struct bp_bucket_table *tbl) +{ + struct bp_bucket_table *new_tbl; + + do 
{ + new_tbl = tbl; + tbl = bp_rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); + + return new_tbl; +} + +static int bp_rhashtable_rehash_one(struct bp_rhashtable *ht, unsigned int old_hash) +{ + struct bp_bucket_table *old_tbl = bp_rht_dereference(ht->tbl, ht); + struct bp_bucket_table *new_tbl = bp_rhashtable_last_table(ht, + bp_rht_dereference_rcu(old_tbl->future_tbl, ht)); + struct bp_rhash_head __rcu **pprev = bp_rht_bucket_var(old_tbl, old_hash); + int err = -EAGAIN; + struct bp_rhash_head *head, *next, *entry; + spinlock_t *new_bucket_lock; + unsigned int new_hash; + + if (new_tbl->nest) + goto out; + + err = -ENOENT; + + bp_rht_for_each(entry, old_tbl, old_hash) { + err = 0; + next = bp_rht_dereference_bucket(entry->next, old_tbl, old_hash); + + if (bp_rht_is_a_nulls(next)) + break; + + pprev = &entry->next; + } + + if (err) + goto out; + + new_hash = bp_head_hashfn(ht, new_tbl, entry); + + new_bucket_lock = bp_rht_bucket_lock(new_tbl, new_hash); + + spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); + head = bp_rht_dereference_bucket(new_tbl->buckets[new_hash], + new_tbl, new_hash); + + RCU_INIT_POINTER(entry->next, head); + + rcu_assign_pointer(new_tbl->buckets[new_hash], entry); + spin_unlock(new_bucket_lock); + + rcu_assign_pointer(*pprev, next); + +out: + return err; +} + +static int bp_rhashtable_rehash_chain(struct bp_rhashtable *ht, + unsigned int old_hash) +{ + struct bp_bucket_table *old_tbl = bp_rht_dereference(ht->tbl, ht); + spinlock_t *old_bucket_lock; + int err; + + old_bucket_lock = bp_rht_bucket_lock(old_tbl, old_hash); + + spin_lock_bh(old_bucket_lock); + while (!(err = bp_rhashtable_rehash_one(ht, old_hash))) + ; + + if (err == -ENOENT) { + old_tbl->rehash++; + err = 0; + } + spin_unlock_bh(old_bucket_lock); + + return err; +} + +static int bp_rhashtable_rehash_attach(struct bp_rhashtable *ht, + struct bp_bucket_table *old_tbl, + struct bp_bucket_table *new_tbl) +{ + /* Protect future_tbl using the first bucket lock. */ + spin_lock_bh(old_tbl->locks); + + /* Did somebody beat us to it? */ + if (rcu_access_pointer(old_tbl->future_tbl)) { + spin_unlock_bh(old_tbl->locks); + return -EEXIST; + } + + /* Make insertions go into the new, empty table right away. Deletions + * and lookups will be attempted in both tables until we synchronize. + */ + rcu_assign_pointer(old_tbl->future_tbl, new_tbl); + + spin_unlock_bh(old_tbl->locks); + + return 0; +} + +static int bp_rhashtable_rehash_table(struct bp_rhashtable *ht) +{ + struct bp_bucket_table *old_tbl = bp_rht_dereference(ht->tbl, ht); + struct bp_bucket_table *new_tbl; + struct bp_rhashtable_walker *walker; + unsigned int old_hash; + int err; + + new_tbl = bp_rht_dereference(old_tbl->future_tbl, ht); + if (!new_tbl) + return 0; + + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = bp_rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + cond_resched(); + } + + /* Publish the new table pointer. */ + rcu_assign_pointer(ht->tbl, new_tbl); + + spin_lock(&ht->lock); + list_for_each_entry(walker, &old_tbl->walkers, list) + walker->tbl = NULL; + spin_unlock(&ht->lock); + + /* Wait for readers. All new readers will see the new + * table, and thus no references to the old table will + * remain. + */ + call_rcu(&old_tbl->rcu, bp_bucket_table_free_rcu); + + return bp_rht_dereference(new_tbl->future_tbl, ht) ? 
-EAGAIN : 0; +} + +static int bp_rhashtable_rehash_alloc(struct bp_rhashtable *ht, + struct bp_bucket_table *old_tbl, + unsigned int size) +{ + struct bp_bucket_table *new_tbl; + int err; + + BP_ASSERT_RHT_MUTEX(ht); + + new_tbl = bp_bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; + + err = bp_rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bp_bucket_table_free(new_tbl); + + return err; +} + +/** + * bp_rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. + * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ +static int bp_rhashtable_shrink(struct bp_rhashtable *ht) +{ + struct bp_bucket_table *old_tbl = bp_rht_dereference(ht->tbl, ht); + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; + + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; + + if (old_tbl->size <= size) + return 0; + + if (bp_rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; + + return bp_rhashtable_rehash_alloc(ht, old_tbl, size); +} + +static void bp_rht_deferred_worker(struct work_struct *work) +{ + struct bp_rhashtable *ht; + struct bp_bucket_table *tbl; + int err = 0; + + ht = container_of(work, struct bp_rhashtable, run_work); + mutex_lock(&ht->mutex); + + tbl = bp_rht_dereference(ht->tbl, ht); + tbl = bp_rhashtable_last_table(ht, tbl); + + if (bp_rht_grow_above_75(ht, tbl)) + err = bp_rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); + else if (ht->p.automatic_shrinking && bp_rht_shrink_below_30(ht, tbl)) + err = bp_rhashtable_shrink(ht); + else if (tbl->nest) + err = bp_rhashtable_rehash_alloc(ht, tbl, tbl->size); + + if (!err || err == -EEXIST) { + int nerr; + + nerr = bp_rhashtable_rehash_table(ht); + err = err ?: nerr; + } + + mutex_unlock(&ht->mutex); + + if (err) + schedule_work(&ht->run_work); +} + +static int bp_rhashtable_insert_rehash(struct bp_rhashtable *ht, + struct bp_bucket_table *tbl) +{ + struct bp_bucket_table *old_tbl; + struct bp_bucket_table *new_tbl; + unsigned int size; + int err; + + old_tbl = bp_rht_dereference_rcu(ht->tbl, ht); + + size = tbl->size; + + err = -EBUSY; + + if (bp_rht_grow_above_75(ht, tbl)) + size *= 2; + /* Do not schedule more than one rehash */ + else if (old_tbl != tbl) + goto fail; + + err = -ENOMEM; + + new_tbl = bp_bucket_table_alloc(ht, size, GFP_ATOMIC); + if (new_tbl == NULL) + goto fail; + + err = bp_rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bp_bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); + + return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_dereference_raw(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. 
*/ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; +} + +static void *bp_rhashtable_lookup_one(struct bp_rhashtable *ht, + struct bp_bucket_table *tbl, unsigned int hash, + const void *key, struct bp_rhash_head *obj) +{ + struct bp_rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct bp_rhash_head __rcu **pprev; + struct bp_rhash_head *head; + int elasticity; + + elasticity = BP_RHT_ELASTICITY; + pprev = bp_rht_bucket_var(tbl, hash); + bp_rht_for_each_continue(head, *pprev, tbl, hash) { + struct bp_rhlist_head *list; + struct bp_rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, bp_rht_obj(ht, head)) : + bp_rhashtable_compare(&arg, bp_rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } + + if (!ht->rhlist) + return bp_rht_obj(ht, head); + + list = container_of(obj, struct bp_rhlist_head, rhead); + plist = container_of(head, struct bp_rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = bp_rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bp_bucket_table *bp_rhashtable_insert_one(struct bp_rhashtable *ht, + struct bp_bucket_table *tbl, + unsigned int hash, + struct bp_rhash_head *obj, + void *data) +{ + struct bp_rhash_head __rcu **pprev; + struct bp_bucket_table *new_tbl; + struct bp_rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(bp_rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(bp_rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); + + pprev = bp_rht_bucket_insert(ht, tbl, hash); + if (!pprev) + return ERR_PTR(-ENOMEM); + + head = bp_rht_dereference_bucket(*pprev, tbl, hash); + + RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct bp_rhlist_head *list; + + list = container_of(obj, struct bp_rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + rcu_assign_pointer(*pprev, obj); + + atomic_inc(&ht->nelems); + if (bp_rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + + return NULL; +} + +static void *bp_rhashtable_try_insert(struct bp_rhashtable *ht, const void *key, + struct bp_rhash_head *obj) +{ + struct bp_bucket_table *new_tbl; + struct bp_bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. 
+ */ + for (;;) { + hash = bp_rht_head_hashfn(ht, tbl, obj, ht->p); + lock = bp_rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = bp_rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = bp_rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = bp_rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(bp_rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = bp_rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = bp_rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(bp_rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(bp_rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *bp_rhashtable_insert_slow(struct bp_rhashtable *ht, const void *key, + struct bp_rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = bp_rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; +} +EXPORT_SYMBOL_GPL(bp_rhashtable_insert_slow); + +/** + * bp_rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after bp_rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between bp_rhashtable_walk_stop and the next + * call to bp_rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call bp_rhashtable_walk_exit after this function returns. + */ +void bp_rhashtable_walk_enter(struct bp_rhashtable *ht, struct bp_rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->end_of_table = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(bp_rhashtable_walk_enter); + +/** + * bp_rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by bp_rhashtable_walk_init. + */ +void bp_rhashtable_walk_exit(struct bp_rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(bp_rhashtable_walk_exit); + +/** + * bp_rhashtable_walk_start_check - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call bp_rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occured. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling bp_rhashtable_walk_next. + * + * bp_rhashtable_walk_start is defined as an inline variant that returns + * void. 
This is preferred in cases where the caller would ignore + * resize events and always continue. + */ +int bp_rhashtable_walk_start_check(struct bp_rhashtable_iter *iter) + __acquires(RCU) +{ + struct bp_rhashtable *ht = iter->ht; + bool rhlist = ht->rhlist; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (iter->end_of_table) + return 0; + if (!iter->walker.tbl) { + iter->walker.tbl = bp_rht_dereference_rcu(ht->tbl, ht); + iter->slot = 0; + iter->skip = 0; + return -EAGAIN; + } + + if (iter->p && !rhlist) { + /* + * We need to validate that 'p' is still in the table, and + * if so, update 'skip' + */ + struct bp_rhash_head *p; + int skip = 0; + bp_rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + skip++; + if (p == iter->p) { + iter->skip = skip; + goto found; + } + } + iter->p = NULL; + } else if (iter->p && rhlist) { + /* Need to validate that 'list' is still in the table, and + * if so, update 'skip' and 'p'. + */ + struct bp_rhash_head *p; + struct bp_rhlist_head *list; + int skip = 0; + bp_rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + for (list = container_of(p, struct bp_rhlist_head, rhead); + list; + list = rcu_dereference(list->next)) { + skip++; + if (list == iter->list) { + iter->p = p; + skip = skip; + goto found; + } + } + } + iter->p = NULL; + } +found: + return 0; +} +EXPORT_SYMBOL_GPL(bp_rhashtable_walk_start_check); + +/** + * __bp_rhashtable_walk_find_next - Find the next element in a table (or the first + * one in case of a new walk). + * + * @iter: Hash table iterator + * + * Returns the found object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. + */ +static void *__bp_rhashtable_walk_find_next(struct bp_rhashtable_iter *iter) +{ + struct bp_bucket_table *tbl = iter->walker.tbl; + struct bp_rhlist_head *list = iter->list; + struct bp_rhashtable *ht = iter->ht; + struct bp_rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (!tbl) + return NULL; + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + bp_rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct bp_rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!bp_rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return bp_rht_obj(ht, rhlist ? &list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = bp_rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } else { + iter->end_of_table = true; + } + + return NULL; +} + +/** + * bp_rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call bp_rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. 
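These walker entry points follow the same enter/start/next/stop/exit pattern as the in-tree API. A hedged sketch over a hypothetical table ht holding struct test_obj entries (the layout used in the bp_rhashtable_init() example further below):

        struct bp_rhashtable_iter iter;
        struct test_obj *obj;

        bp_rhashtable_walk_enter(&ht, &iter);
        bp_rhashtable_walk_start(&iter);    /* void wrapper around _start_check() */

        while ((obj = bp_rhashtable_walk_next(&iter)) != NULL) {
                if (IS_ERR(obj)) {
                        if (PTR_ERR(obj) == -EAGAIN)    /* resize: walk rewound, keep going */
                                continue;
                        break;
                }
                /* use obj */
        }

        bp_rhashtable_walk_stop(&iter);
        bp_rhashtable_walk_exit(&iter);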
+ */ +void *bp_rhashtable_walk_next(struct bp_rhashtable_iter *iter) +{ + struct bp_rhlist_head *list = iter->list; + struct bp_rhashtable *ht = iter->ht; + struct bp_rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct bp_rhlist_head, rhead); + } + if (!bp_rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return bp_rht_obj(ht, rhlist ? &list->rhead : p); + } + + /* At the end of this slot, switch to next one and then find + * next entry from that point. + */ + iter->skip = 0; + iter->slot++; + } + + return __bp_rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(bp_rhashtable_walk_next); + +/** + * bp_rhashtable_walk_peek - Return the next object but don't advance the iterator + * @iter: Hash table iterator + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *bp_rhashtable_walk_peek(struct bp_rhashtable_iter *iter) +{ + struct bp_rhlist_head *list = iter->list; + struct bp_rhashtable *ht = iter->ht; + struct bp_rhash_head *p = iter->p; + + if (p) + return bp_rht_obj(ht, ht->rhlist ? &list->rhead : p); + + /* No object found in current iter, find next one in the table. */ + + if (iter->skip) { + /* A nonzero skip value points to the next entry in the table + * beyond that last one that was found. Decrement skip so + * we find the current value. __bp_rhashtable_walk_find_next + * will restore the original value of skip assuming that + * the table hasn't changed. + */ + iter->skip--; + } + + return __bp_rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(bp_rhashtable_walk_peek); + +/** + * bp_rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. + */ +void bp_rhashtable_walk_stop(struct bp_rhashtable_iter *iter) + __releases(RCU) +{ + struct bp_rhashtable *ht; + struct bp_bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (tbl->rehash < tbl->size) + list_add(&iter->walker.list, &tbl->walkers); + else + iter->walker.tbl = NULL; + spin_unlock(&ht->lock); + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bp_rhashtable_walk_stop); + +static size_t bp_rounded_hashtable_size(const struct bp_rhashtable_params *params) +{ + return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); +} + +static u32 bp_rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); +} + +/** + * bp_rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. 
A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct bp_rhash_head node; + * }; + * + * struct bp_rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * .nulls_base = (1U << RHT_BASE_SHIFT), + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct bp_rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct bp_rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ +int bp_rhashtable_init(struct bp_rhashtable *ht, + const struct bp_rhashtable_params *params) +{ + struct bp_bucket_table *tbl; + size_t size; + + size = BP_HASH_DEFAULT_SIZE; + + if ((!params->key_len && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) + return -EINVAL; + + if (params->nulls_base && params->nulls_base < (1U << BP_RHT_BASE_SHIFT)) + return -EINVAL; + + memset(ht, 0, sizeof(*ht)); + mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); + memcpy(&ht->p, params, sizeof(*params)); + + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; + + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } + + ht->p.min_size = max_t(u16, ht->p.min_size, BP_HASH_MIN_SIZE); + + if (params->nelem_hint) + size = bp_rounded_hashtable_size(&ht->p); + + if (params->locks_mul) + ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); + else + ht->p.locks_mul = BP_BUCKET_LOCKS_PER_CPU; + + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = bp_rhashtable_jhash2; + } + } + + tbl = bp_bucket_table_alloc(ht, size, GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + atomic_set(&ht->nelems, 0); + + RCU_INIT_POINTER(ht->tbl, tbl); + + INIT_WORK(&ht->run_work, bp_rht_deferred_worker); + + return 0; +} +EXPORT_SYMBOL_GPL(bp_rhashtable_init); + +/** + * bp_rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for bp_rhashtable_init. + */ +int bp_rhltable_init(struct bp_rhltable *hlt, const struct bp_rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. 
*/ + if (params->nulls_base) + return -EINVAL; + + err = bp_rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(bp_rhltable_init); + +static void bp_rhashtable_free_one(struct bp_rhashtable *ht, struct bp_rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct bp_rhlist_head *list; + + if (!ht->rhlist) { + free_fn(bp_rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct bp_rhlist_head, rhead); + do { + obj = &list->rhead; + list = bp_rht_dereference(list->next, ht); + free_fn(bp_rht_obj(ht, obj), arg); + } while (list); +} + +/** + * bp_rhashtable_free_and_destroy - free elements and destroy hash table + * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. + * + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. + */ +void bp_rhashtable_free_and_destroy(struct bp_rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct bp_bucket_table *tbl; + unsigned int i; + + cancel_work_sync(&ht->run_work); + + mutex_lock(&ht->mutex); + tbl = bp_rht_dereference(ht->tbl, ht); + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct bp_rhash_head *pos, *next; + + cond_resched(); + for (pos = bp_rht_dereference(*bp_rht_bucket(tbl, i), ht), + next = !bp_rht_is_a_nulls(pos) ? + bp_rht_dereference(pos->next, ht) : NULL; + !bp_rht_is_a_nulls(pos); + pos = next, + next = !bp_rht_is_a_nulls(pos) ? 
+ bp_rht_dereference(pos->next, ht) : NULL) + bp_rhashtable_free_one(ht, pos, free_fn, arg); + } + } + + bp_bucket_table_free(tbl); + mutex_unlock(&ht->mutex); +} +EXPORT_SYMBOL_GPL(bp_rhashtable_free_and_destroy); + +void bp_rhashtable_destroy(struct bp_rhashtable *ht) +{ + return bp_rhashtable_free_and_destroy(ht, NULL, NULL); +} +EXPORT_SYMBOL_GPL(bp_rhashtable_destroy); + +struct bp_rhash_head __rcu **bp_rht_bucket_nested(const struct bp_bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + static struct bp_rhash_head __rcu *rhnull = + (struct bp_rhash_head __rcu *)NULLS_MARKER(0); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union bp_nested_table *ntbl; + + ntbl = (union bp_nested_table *)rcu_dereference_raw(tbl->buckets[0]); + ntbl = bp_rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = bp_rht_dereference_bucket_rcu(ntbl[index].table, + tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return &rhnull; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(bp_rht_bucket_nested); + +struct bp_rhash_head __rcu **bp_rht_bucket_nested_insert(struct bp_rhashtable *ht, + struct bp_bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union bp_nested_table *ntbl; + unsigned int shifted; + unsigned int nhash; + + ntbl = (union bp_nested_table *)rcu_dereference_raw(tbl->buckets[0]); + hash >>= tbl->nest; + nhash = index; + shifted = tbl->nest; + ntbl = bp_nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? shifted : 0, nhash); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + nhash |= index << shifted; + shifted += shift; + ntbl = bp_nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? shifted : 0, + nhash); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(bp_rht_bucket_nested_insert); +#elif !(defined(HAVE_RHASHTABLE_TYPES) || (defined(HAVE_RHLTABLE) && defined(CONFIG_COMPAT_RHASHTABLE_FIXED))) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HASH_DEFAULT_SIZE 64UL +#define HASH_MIN_SIZE 4U +#define BUCKET_LOCKS_PER_CPU 32UL + +static u32 head_hashfn(struct rhashtable *ht, + const struct bucket_table *tbl, + const struct rhash_head *he) +{ + return rht_head_hashfn(ht, tbl, he, ht->p); +} + +#ifdef CONFIG_PROVE_LOCKING +#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) + +int lockdep_rht_mutex_is_held(struct rhashtable *ht) +{ + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); + +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + spinlock_t *lock = rht_bucket_lock(tbl, hash); + + return (debug_locks) ? 
lockdep_is_held(lock) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); +#else +#define ASSERT_RHT_MUTEX(HT) +#endif + + +static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, + gfp_t gfp) +{ + unsigned int i, size; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + + nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); + size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); + + /* Never allocate more than 0.5 locks per bucket */ + size = min_t(unsigned int, size, tbl->size >> 1); + + if (sizeof(spinlock_t) != 0) { + tbl->locks = NULL; +#ifdef CONFIG_NUMA + if (size * sizeof(spinlock_t) > PAGE_SIZE && + gfp == GFP_KERNEL) + tbl->locks = vmalloc(size * sizeof(spinlock_t)); +#endif + if (gfp != GFP_KERNEL) + gfp |= __GFP_NOWARN | __GFP_NORETRY; + + if (!tbl->locks) + tbl->locks = kmalloc_array(size, sizeof(spinlock_t), + gfp); + if (!tbl->locks) + return -ENOMEM; + for (i = 0; i < size; i++) + spin_lock_init(&tbl->locks[i]); + } + tbl->locks_mask = size - 1; + + return 0; +} + +static void bucket_table_free(const struct bucket_table *tbl) +{ + if (tbl) + kvfree(tbl->locks); + + kvfree(tbl); +} + +static void bucket_table_free_rcu(struct rcu_head *head) +{ + bucket_table_free(container_of(head, struct bucket_table, rcu)); +} + +static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + struct bucket_table *tbl = NULL; + size_t size; + int i; + + size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || + gfp != GFP_KERNEL) + tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); + if (tbl == NULL && gfp == GFP_KERNEL) + tbl = vzalloc(size); + if (tbl == NULL) + return NULL; + + tbl->size = nbuckets; + + if (alloc_bucket_locks(ht, tbl, gfp) < 0) { + bucket_table_free(tbl); + return NULL; + } + + INIT_LIST_HEAD(&tbl->walkers); + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + + for (i = 0; i < nbuckets; i++) + INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + + return tbl; +} + +static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *new_tbl; + + do { + new_tbl = tbl; + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + } while (tbl); + + return new_tbl; +} + +static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl = rhashtable_last_table(ht, + rht_dereference_rcu(old_tbl->future_tbl, ht)); + struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; + int err = -ENOENT; + struct rhash_head *head, *next, *entry; + spinlock_t *new_bucket_lock; + unsigned int new_hash; + + rht_for_each(entry, old_tbl, old_hash) { + err = 0; + next = rht_dereference_bucket(entry->next, old_tbl, old_hash); + + if (rht_is_a_nulls(next)) + break; + + pprev = &entry->next; + } + + if (err) + goto out; + + new_hash = head_hashfn(ht, new_tbl, entry); + + new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); + + spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); + head = rht_dereference_bucket(new_tbl->buckets[new_hash], + new_tbl, new_hash); + + RCU_INIT_POINTER(entry->next, head); + + rcu_assign_pointer(new_tbl->buckets[new_hash], entry); + spin_unlock(new_bucket_lock); + + rcu_assign_pointer(*pprev, next); + +out: + return err; +} + +static void rhashtable_rehash_chain(struct rhashtable *ht, + unsigned int 
old_hash) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + spinlock_t *old_bucket_lock; + + old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); + + spin_lock_bh(old_bucket_lock); + while (!rhashtable_rehash_one(ht, old_hash)) + ; + old_tbl->rehash++; + spin_unlock_bh(old_bucket_lock); +} + +static int rhashtable_rehash_attach(struct rhashtable *ht, + struct bucket_table *old_tbl, + struct bucket_table *new_tbl) +{ + /* Protect future_tbl using the first bucket lock. */ + spin_lock_bh(old_tbl->locks); + + /* Did somebody beat us to it? */ + if (rcu_access_pointer(old_tbl->future_tbl)) { + spin_unlock_bh(old_tbl->locks); + return -EEXIST; + } + + /* Make insertions go into the new, empty table right away. Deletions + * and lookups will be attempted in both tables until we synchronize. + */ + rcu_assign_pointer(old_tbl->future_tbl, new_tbl); + + spin_unlock_bh(old_tbl->locks); + + return 0; +} + +static int rhashtable_rehash_table(struct rhashtable *ht) +{ + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; + struct rhashtable_walker *walker; + unsigned int old_hash; + + new_tbl = rht_dereference(old_tbl->future_tbl, ht); + if (!new_tbl) + return 0; + + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) + rhashtable_rehash_chain(ht, old_hash); + + /* Publish the new table pointer. */ + rcu_assign_pointer(ht->tbl, new_tbl); + + spin_lock(&ht->lock); + list_for_each_entry(walker, &old_tbl->walkers, list) + walker->tbl = NULL; + spin_unlock(&ht->lock); + + /* Wait for readers. All new readers will see the new + * table, and thus no references to the old table will + * remain. + */ + call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + + return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; +} + +/** + * rhashtable_expand - Expand hash table while allowing concurrent lookups + * @ht: the hash table to expand + * + * A secondary bucket array is allocated and the hash entries are migrated. + * + * This function may only be called in a context where it is safe to call + * synchronize_rcu(), e.g. not within a rcu_read_lock() section. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ +static int rhashtable_expand(struct rhashtable *ht) +{ + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + int err; + + ASSERT_RHT_MUTEX(ht); + + old_tbl = rhashtable_last_table(ht, old_tbl); + + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; + + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; +} + +/** + * rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. + * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. 
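As a quick worked example of the resize policy applied by rht_deferred_worker() below (the same 75%/30% thresholds used by the private bp_ copy earlier), for a 64-bucket table:

        /* grow:   rht_grow_above_75()  fires once nelems > 48, doubling to 128 buckets
         * shrink: rht_shrink_below_30() fires once nelems < 19 (only with
         *         automatic_shrinking set); rhashtable_shrink() then targets
         *         roundup_pow_of_two(nelems * 3 / 2), but never below p.min_size.
         */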
+ */ +static int rhashtable_shrink(struct rhashtable *ht) +{ + struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + unsigned int nelems = atomic_read(&ht->nelems); + unsigned int size = 0; + int err; + + ASSERT_RHT_MUTEX(ht); + + if (nelems) + size = roundup_pow_of_two(nelems * 3 / 2); + if (size < ht->p.min_size) + size = ht->p.min_size; + + if (old_tbl->size <= size) + return 0; + + if (rht_dereference(old_tbl->future_tbl, ht)) + return -EEXIST; + + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (new_tbl == NULL) + return -ENOMEM; + + err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); + if (err) + bucket_table_free(new_tbl); + + return err; +} + +static void rht_deferred_worker(struct work_struct *work) +{ + struct rhashtable *ht; + struct bucket_table *tbl; + int err = 0; + + ht = container_of(work, struct rhashtable, run_work); + mutex_lock(&ht->mutex); + + tbl = rht_dereference(ht->tbl, ht); + tbl = rhashtable_last_table(ht, tbl); + + if (rht_grow_above_75(ht, tbl)) + rhashtable_expand(ht); + else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) + rhashtable_shrink(ht); + + err = rhashtable_rehash_table(ht); + + mutex_unlock(&ht->mutex); + + if (err) + schedule_work(&ht->run_work); +} + +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) +{ + struct bucket_table *old_tbl; + struct bucket_table *new_tbl; + unsigned int size; + int err; + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + + size = tbl->size; + + err = -EBUSY; + + if (rht_grow_above_75(ht, tbl)) + size *= 2; + /* Do not schedule more than one rehash */ + else if (old_tbl != tbl) + goto fail; + + err = -ENOMEM; + + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + if (new_tbl == NULL) + goto fail; + + err = rhashtable_rehash_attach(ht, tbl, new_tbl); + if (err) { + bucket_table_free(new_tbl); + if (err == -EEXIST) + err = 0; + } else + schedule_work(&ht->run_work); + + return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_dereference_raw(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. */ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; +} + +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev; + struct rhash_head *head; + int elasticity; + +#if defined(HAVE_RHASHTABLE_INSECURE_ELASTICITY) || !defined(HAVE_RHLTABLE) + elasticity = ht->elasticity; +#else + elasticity = RHT_ELASTICITY; +#endif + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? 
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } + + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); + + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); + + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + + RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + rcu_assign_pointer(tbl->buckets[hash], obj); + + atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + + return NULL; +} + +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. + */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); + +/** + * rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. 
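+ *
+ * A minimal calling sketch, for illustration only (iterator and object
+ * names are the caller's; an ERR_PTR(-EAGAIN) from rhashtable_walk_next()
+ * just means the table was resized and the walk rewound to the beginning):
+ *
+ *	struct rhashtable_iter iter;
+ *	void *obj;
+ *
+ *	rhashtable_walk_enter(&ht, &iter);
+ *	rhashtable_walk_start(&iter);
+ *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
+ *		if (IS_ERR(obj))
+ *			continue;
+ *		[... use obj ...]
+ *	}
+ *	rhashtable_walk_stop(&iter);
+ *	rhashtable_walk_exit(&iter);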
+ * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may sleep so you must not call it from interrupt + * context or with spin locks held. + * + * You must call rhashtable_walk_exit after this function returns. + */ +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_init. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk. Note that we take the RCU lock in all + * cases including when we return an error. So you must always call + * rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occured. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + */ +int rhashtable_walk_start(struct rhashtable_iter *iter) + __acquires(RCU) +{ + struct rhashtable *ht = iter->ht; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); + return -EAGAIN; + } + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start); + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occured. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } + goto next; + } + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? 
&list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } + + return NULL; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) +{ + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (tbl->rehash < tbl->size) + list_add(&iter->walker.list, &tbl->walkers); + else + iter->walker.tbl = NULL; + spin_unlock(&ht->lock); + + iter->p = NULL; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); + +static size_t rounded_hashtable_size(const struct rhashtable_params *params) +{ + return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); +} + +static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) +{ + return jhash2(key, length, seed); +} + +/** + * rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct rhash_head node; + * }; + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * .nulls_base = (1U << RHT_BASE_SHIFT), + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params) +{ + struct bucket_table *tbl; + size_t size; + + size = HASH_DEFAULT_SIZE; + + if ((!params->key_len && !params->obj_hashfn) || + (params->obj_hashfn && !params->obj_cmpfn)) + return -EINVAL; + + if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) + return -EINVAL; + + memset(ht, 0, sizeof(*ht)); + mutex_init(&ht->mutex); + spin_lock_init(&ht->lock); + memcpy(&ht->p, params, sizeof(*params)); + + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + + if (params->max_size) + ht->p.max_size = rounddown_pow_of_two(params->max_size); + +#if defined(HAVE_RHASHTABLE_MAX_ELEMS) + /* Cap total entries at 2^31 to avoid nelems overflow. 
*/ + ht->max_elems = 1u << 31; + if (ht->p.max_size) { + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } +#endif + +#if defined(HAVE_RHASHTABLE_INSECURE_MAX_ENTRIES) || !defined(HAVE_RHLTABLE) + if (params->insecure_max_entries) + ht->p.insecure_max_entries = + rounddown_pow_of_two(params->insecure_max_entries); + else + ht->p.insecure_max_entries = ht->p.max_size * 2; +#endif + +#if defined(CONFIG_COMPAT_RHASHTABLE_PARAM_COMPACTED) + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); +#else + ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); +#endif + + if (params->nelem_hint) + size = rounded_hashtable_size(&ht->p); + + /* The maximum (not average) chain length grows with the + * size of the hash table, at a rate of (log N)/(log log N). + * The value of 16 is selected so that even if the hash + * table grew to 2^32 you would not expect the maximum + * chain length to exceed it unless we are under attack + * (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need + * to set it to a lower value as you'd need the chain + * length to vastly exceed 16 to have any real effect + * on the system. + */ +#if defined(HAVE_RHASHTABLE_INSECURE_ELASTICITY) || !defined(HAVE_RHLTABLE) + if (!params->insecure_elasticity) + ht->elasticity = 16; +#endif + + if (params->locks_mul) + ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); + else + ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + + ht->key_len = ht->p.key_len; + if (!params->hashfn) { + ht->p.hashfn = jhash; + + if (!(ht->key_len & (sizeof(u32) - 1))) { + ht->key_len /= sizeof(u32); + ht->p.hashfn = rhashtable_jhash2; + } + } + + tbl = bucket_table_alloc(ht, size, GFP_KERNEL); + if (tbl == NULL) + return -ENOMEM; + + atomic_set(&ht->nelems, 0); + + RCU_INIT_POINTER(ht->tbl, tbl); + + INIT_WORK(&ht->run_work, rht_deferred_worker); + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_init); + +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. */ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** + * rhashtable_free_and_destroy - free elements and destroy hash table + * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. + * + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. 
+ */ +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + const struct bucket_table *tbl; + unsigned int i; + + cancel_work_sync(&ht->run_work); + + mutex_lock(&ht->mutex); + tbl = rht_dereference(ht->tbl, ht); + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + for (pos = rht_dereference(tbl->buckets[i], ht), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL) + rhashtable_free_one(ht, pos, free_fn, arg); + } + } + + bucket_table_free(tbl); + mutex_unlock(&ht->mutex); +} +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} +EXPORT_SYMBOL_GPL(rhashtable_destroy); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/sch_codel.c b/src/mlnx-ofa_kernel-5.8/compat/sch_codel.c new file mode 100644 index 0000000..02ed379 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/sch_codel.c @@ -0,0 +1,309 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm + * + * Copyright (C) 2011-2012 Kathleen Nichols + * Copyright (C) 2011-2012 Van Jacobson + * + * Implemented on linux by : + * Copyright (C) 2012 Michael D. Taht + * Copyright (C) 2012 Eric Dumazet + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define DEFAULT_CODEL_LIMIT 1000 + +struct codel_sched_data { + struct codel_params params; + struct codel_vars vars; + struct codel_stats stats; + u32 drop_overlimit; +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + u32 limit; +#endif +}; + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct sk_buff *skb = __skb_dequeue(&sch->q); + + prefetch(&skb->end); /* we'll need skb_shinfo() */ + return skb; +} + +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); + + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->stats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + q->stats.drop_count = 0; + } + if (skb) + qdisc_bstats_update(sch, skb); + return skb; +} + +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct codel_sched_data *q; + + q = qdisc_priv(sch); + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + if (likely(qdisc_qlen(sch) < q->limit)) { +#else + if (likely(qdisc_qlen(sch) < sch->limit)) { +#endif + codel_set_enqueue_time(skb); + return qdisc_enqueue_tail(skb, sch); + } + q->drop_overlimit++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { + [TCA_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_CODEL_ECN] = { .type = NLA_U32 }, +}; + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CODEL_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + if (tb[TCA_CODEL_TARGET]) { + u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); + + q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_INTERVAL]) { + u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); + + q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_LIMIT]) +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + q->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); +#else + sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); +#endif + + if (tb[TCA_CODEL_ECN]) + q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + + qlen = sch->q.qlen; +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + while (sch->q.qlen > q->limit) { +#else + while (sch->q.qlen > sch->limit) { +#endif + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + q->limit = DEFAULT_CODEL_LIMIT; +#else + sch->limit = DEFAULT_CODEL_LIMIT; +#endif + + codel_params_init(&q->params); + codel_vars_init(&q->vars); + 
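+	/*
+	 * codel_params_init() above seeds the CoDel defaults (a 5 ms target
+	 * and 100 ms interval with ECN off, in the upstream implementation);
+	 * codel_change() below only overrides whatever TCA_CODEL_* attributes
+	 * userspace actually supplied.
+	 */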
codel_stats_init(&q->stats); + + if (opt) { + int err = codel_change(sch, opt); + + if (err) + return err; + } + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + if (q->limit >= 1) +#else + if (sch->limit >= 1) +#endif + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + + return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CODEL_TARGET, + codel_time_to_us(q->params.target)) || + nla_put_u32(skb, TCA_CODEL_LIMIT, +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + q->limit) || +#else + sch->limit) || +#endif + nla_put_u32(skb, TCA_CODEL_INTERVAL, + codel_time_to_us(q->params.interval)) || + nla_put_u32(skb, TCA_CODEL_ECN, + q->params.ecn)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + const struct codel_sched_data *q = qdisc_priv(sch); + struct tc_codel_xstats st = { + .maxpacket = q->stats.maxpacket, + .count = q->vars.count, + .lastcount = q->vars.lastcount, + .drop_overlimit = q->drop_overlimit, + .ldelay = codel_time_to_us(q->vars.ldelay), + .dropping = q->vars.dropping, + .ecn_mark = q->stats.ecn_mark, + }; + + if (q->vars.dropping) { + codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); + + if (delta >= 0) + st.drop_next = codel_time_to_us(delta); + else + st.drop_next = -codel_time_to_us(-delta); + } + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void codel_reset(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + codel_vars_init(&q->vars); +} + +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { + .id = "codel", + .priv_size = sizeof(struct codel_sched_data), + + .enqueue = codel_qdisc_enqueue, + .dequeue = codel_qdisc_dequeue, +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28)) + .peek = qdisc_peek_dequeued, +#endif + .init = codel_init, + .reset = codel_reset, + .change = codel_change, + .dump = codel_dump, + .dump_stats = codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init codel_module_init(void) +{ + return register_qdisc(&codel_qdisc_ops); +} + +static void __exit codel_module_exit(void) +{ + unregister_qdisc(&codel_qdisc_ops); +} + +module_init(codel_module_init) +module_exit(codel_module_exit) + +MODULE_DESCRIPTION("Controlled Delay queue discipline"); +MODULE_AUTHOR("Dave Taht"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/sch_fq_codel.c b/src/mlnx-ofa_kernel-5.8/compat/sch_fq_codel.c new file mode 100644 index 0000000..b68d93b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/sch_fq_codel.c @@ -0,0 +1,662 @@ +/* + * Fair Queue CoDel discipline + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Copyright (C) 2012 Eric Dumazet + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Fair Queue CoDel. 
+ * + * Principles : + * Packets are classified (internal classifier or external) on flows. + * This is a Stochastic model (as we use a hash, several flows + * might be hashed on same slot) + * Each flow has a CoDel managed queue. + * Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * + * For a given flow, packets are not reordered (CoDel uses a FIFO) + * head drops only. + * ECN capability is on by default. + * Low memory footprint (64 bytes per flow) + */ + +struct fq_codel_flow { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + int deficit; + u32 dropped; /* number of drops (or ECN marks) on this flow */ + struct codel_vars cvars; +}; /* please try to keep this structure <= 64 bytes */ + +struct fq_codel_sched_data { + struct tcf_proto *filter_list; /* optional external classifier */ + struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ + u32 *backlogs; /* backlog table [flows_cnt] */ + u32 flows_cnt; /* number of flows */ + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + struct codel_params cparams; + struct codel_stats cstats; + u32 drop_overlimit; + u32 new_flow_count; + + struct list_head new_flows; /* list of new flows */ + struct list_head old_flows; /* list of old flows */ +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + u32 limit; +#endif +}; + +static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return ((u64)hash * q->flows_cnt) >> 32; +} + +static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) + return fq_codel_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_codel_flow *flow, + struct sk_buff *skb) +{ + if (flow->head == NULL) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static unsigned int fq_codel_drop(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned int maxbacklog = 0, idx = 0, i, len; + struct fq_codel_flow *flow; + + /* Queue is full! Find the fat flow and drop packet from it. 
+ * This might sound expensive, but with 1024 flows, we scan + * 4KB of memory, and we dont need to handle a complex tree + * in fast path (packet queue/enqueue) with many cache misses. + */ + for (i = 0; i < q->flows_cnt; i++) { + if (q->backlogs[i] > maxbacklog) { + maxbacklog = q->backlogs[i]; + idx = i; + } + } + flow = &q->flows[idx]; + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + q->backlogs[idx] -= len; + kfree_skb(skb); + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= len; + flow->dropped++; + return idx; +} + +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int idx; + struct fq_codel_flow *flow; + int uninitialized_var(ret); + + idx = fq_codel_classify(skb, sch, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + idx--; + + codel_set_enqueue_time(skb); + flow = &q->flows[idx]; + flow_queue_add(flow, skb); + q->backlogs[idx] += qdisc_pkt_len(skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&flow->flowchain)) { + list_add_tail(&flow->flowchain, &q->new_flows); + codel_vars_init(&flow->cvars); + q->new_flow_count++; + flow->deficit = q->quantum; + flow->dropped = 0; + } +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + if (++sch->q.qlen < q->limit) +#else + if (++sch->q.qlen < sch->limit) +#endif + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet + * from this flow. + */ + if (fq_codel_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. 
+ */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct fq_codel_flow *flow; + struct sk_buff *skb = NULL; + + flow = container_of(vars, struct fq_codel_flow, cvars); + if (flow->head) { + skb = dequeue_head(flow); + q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + struct fq_codel_flow *flow; + struct list_head *head; + u32 prev_drop_count, prev_ecn_mark; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + flow = list_first_entry(head, struct fq_codel_flow, flowchain); + + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + prev_drop_count = q->cstats.drop_count; + prev_ecn_mark = q->cstats.ecn_mark; + + skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, + dequeue); + + flow->dropped += q->cstats.drop_count - prev_drop_count; + flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + flow->deficit -= qdisc_pkt_len(skb); + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->cstats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + } + return skb; +} + +static void fq_codel_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_codel_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { + [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, +}; + +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); + if (err < 0) + return err; + if (tb[TCA_FQ_CODEL_FLOWS]) { + if (q->flows) + return -EINVAL; + q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); + if (!q->flows_cnt || + q->flows_cnt > 65536) + return -EINVAL; + } + sch_tree_lock(sch); + + if (tb[TCA_FQ_CODEL_TARGET]) { + u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); + + q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_INTERVAL]) { + u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); + + q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_LIMIT]) +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + q->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); +#else + sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); +#endif + + if (tb[TCA_FQ_CODEL_ECN]) + q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + + if (tb[TCA_FQ_CODEL_QUANTUM]) + q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + 
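+	/*
+	 * If the limit was lowered above, drop packets until the queue fits
+	 * again and report the drops to the parent qdisc via
+	 * qdisc_tree_decrease_qlen().
+	 */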
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + while (sch->q.qlen > q->limit) { +#else + while (sch->q.qlen > sch->limit) { +#endif + struct sk_buff *skb = fq_codel_dequeue(sch); + + kfree_skb(skb); + q->cstats.drop_count++; + } + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + + sch_tree_unlock(sch); + return 0; +} + +static void *fq_codel_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + return ptr; +} + +static void fq_codel_free(void *addr) +{ + if (addr) { + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); + } +} + +static void fq_codel_destroy(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,25)) + tcf_destroy_chain(&q->filter_list); +#else + tcf_destroy_chain(q->filter_list); +#endif + fq_codel_free(q->backlogs); + fq_codel_free(q->flows); +} + +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + q->limit = 10*1024; +#else + sch->limit = 10*1024; +#endif + q->flows_cnt = 1024; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = net_random(); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + codel_params_init(&q->cparams); + codel_stats_init(&q->cstats); + q->cparams.ecn = true; + + if (opt) { + int err = fq_codel_change(sch, opt); + if (err) + return err; + } + + if (!q->flows) { + q->flows = fq_codel_zalloc(q->flows_cnt * + sizeof(struct fq_codel_flow)); + if (!q->flows) + return -ENOMEM; + q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + if (!q->backlogs) { + fq_codel_free(q->flows); + return -ENOMEM; + } + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + INIT_LIST_HEAD(&flow->flowchain); + } + } +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + if (q->limit >= 1) +#else + if (sch->limit >= 1) +#endif + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, + codel_time_to_us(q->cparams.target)) || + nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,39)) + q->limit) || +#else + sch->limit) || +#endif + nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, + codel_time_to_us(q->cparams.interval)) || + nla_put_u32(skb, TCA_FQ_CODEL_ECN, + q->cparams.ecn) || + nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, + q->quantum) || + nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, + q->flows_cnt)) + goto nla_put_failure; + + nla_nest_end(skb, opts); + return skb->len; + +nla_put_failure: + return -1; +} + +static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tc_fq_codel_xstats st = { + .type = TCA_FQ_CODEL_XSTATS_QDISC, + }; + struct list_head *pos; + + st.qdisc_stats.maxpacket = q->cstats.maxpacket; + st.qdisc_stats.drop_overlimit = q->drop_overlimit; + st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; + st.qdisc_stats.new_flow_count = q->new_flow_count; + + list_for_each(pos, &q->new_flows) + st.qdisc_stats.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.qdisc_stats.old_flows_len++; + + return 
gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void fq_codel_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + u32 idx = cl - 1; + struct gnet_stats_queue qs = { 0 }; + struct tc_fq_codel_xstats xstats; + + if (idx < q->flows_cnt) { + const struct fq_codel_flow *flow = &q->flows[idx]; + const struct sk_buff *skb = flow->head; + + memset(&xstats, 0, sizeof(xstats)); + xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; + xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.ldelay = + codel_time_to_us(flow->cvars.ldelay); + xstats.class_stats.count = flow->cvars.count; + xstats.class_stats.lastcount = flow->cvars.lastcount; + xstats.class_stats.dropping = flow->cvars.dropping; + if (flow->cvars.dropping) { + codel_tdiff_t delta = flow->cvars.drop_next - + codel_get_time(); + + xstats.class_stats.drop_next = (delta >= 0) ? + codel_time_to_us(delta) : + -codel_time_to_us(-delta); + } + while (skb) { + qs.qlen++; + skb = skb->next; + } + qs.backlog = q->backlogs[idx]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + if (idx < q->flows_cnt) + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); + return 0; +} + +static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->flows_cnt; i++) { + if (list_empty(&q->flows[i].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops fq_codel_class_ops = { + .leaf = fq_codel_leaf, + .get = fq_codel_get, + .put = fq_codel_put, + .tcf_chain = fq_codel_find_tcf, + .bind_tcf = fq_codel_bind, + .unbind_tcf = fq_codel_put, + .dump = fq_codel_dump_class, + .dump_stats = fq_codel_dump_class_stats, + .walk = fq_codel_walk, +}; + +static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { + .cl_ops = &fq_codel_class_ops, + .id = "fq_codel", + .priv_size = sizeof(struct fq_codel_sched_data), + .enqueue = fq_codel_enqueue, + .dequeue = fq_codel_dequeue, +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,28)) + .peek = qdisc_peek_dequeued, +#endif + .drop = fq_codel_drop, + .init = fq_codel_init, + .reset = fq_codel_reset, + .destroy = fq_codel_destroy, + .change = fq_codel_change, + .dump = fq_codel_dump, + .dump_stats = fq_codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_codel_module_init(void) +{ + return register_qdisc(&fq_codel_qdisc_ops); +} + +static void __exit fq_codel_module_exit(void) +{ + unregister_qdisc(&fq_codel_qdisc_ops); +} + 
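+
+/*
+ * Illustrative use once this compat qdisc is registered, via the standard
+ * iproute2 front-end (device name and parameter values are examples only):
+ *
+ *	tc qdisc add dev eth0 root fq_codel limit 10240 flows 1024 ecn
+ */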
+module_init(fq_codel_module_init) +module_exit(fq_codel_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/scripts/compat_firmware_install b/src/mlnx-ofa_kernel-5.8/compat/scripts/compat_firmware_install new file mode 100755 index 0000000..33e4fde --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/scripts/compat_firmware_install @@ -0,0 +1,21 @@ +#!/bin/sh + +if [ -f /usr/bin/lsb_release ]; then + LSB_RED_ID=$(/usr/bin/lsb_release -i -s) +else + LSB_RED_ID="Unknown" +fi + +case $LSB_RED_ID in +"Ubuntu") + mkdir -p /lib/udev/ /lib/udev/rules.d/ + cp udev/ubuntu/compat_firmware.sh /lib/udev/ + cp udev/ubuntu/50-compat_firmware.rules /lib/udev/rules.d/ + ;; +*) + mkdir -p /lib/udev/ /lib/udev/rules.d/ + cp udev/compat_firmware.sh /lib/udev/ + cp udev/50-compat_firmware.rules /lib/udev/rules.d/ + ;; +esac + diff --git a/src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-autoconf.sh b/src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-autoconf.sh new file mode 100755 index 0000000..8c1a13d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-autoconf.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# +# Copyright 2012 Luis R. Rodriguez +# Copyright 2011 Hauke Mehrtens +# Copyright 2011 John W. Linville +# +# Use this to parse a small .config equivalent looking file to generate +# our own autoconf.h. This file has defines for each config option +# just like the kernels include/generated/autoconf.h +# +# XXX: consider using scripts/kconfig/confdata.c instead. +# On the downside this would require the user to have libc though. + +# This indicates which is the oldest kernel we support +# Update this if you are adding support for older kernels. +OLDEST_KERNEL_SUPPORTED="2.6.24" + +if [ $# -ne 1 ]; then + echo "Usage $0 config-file" + exit +fi + +COMPAT_CONFIG="$1" + +if [ ! 
-f $COMPAT_CONFIG ]; then + echo "File $1 is not a file" + exit +fi + +# Defines a CONFIG_ option if not defined yet, this helps respect +# generated/autoconf.h +function define_config { + VAR=$1 + VALUE=$2 + case $VALUE in + n) # Try to undefine it + echo "#undef $VAR" + ;; + y) + echo "#ifndef $VAR" + echo "#define $VAR 1" + echo "#endif /* $VAR */" + ;; + m) + echo "#ifndef $VAR" + echo "#define $VAR 1" + echo "#endif /* $VAR */" + ;; + *) # Assume string + # XXX: add better checks to make sure what was on + # the right was indeed a string + echo "#ifndef $VAR" + echo "#define $VAR \"$VALUE\"" + echo "#endif /* $VAR */" + ;; + esac +} + +function kernel_version_req { + VERSION=$(echo $1 | sed -e 's/\./,/g') + echo "#if (LINUX_VERSION_CODE < KERNEL_VERSION($VERSION))" + echo "#error compat requirement: Linux >= $VERSION" + echo "#endif /* (LINUX_VERSION_CODE < KERNEL_VERSION($VERSION) */" +} + +cat <= \3)/' -e 's/\(#ifdef \)\(CONFIG_[^:space:]*\)/#if defined(\2) || defined(\2_MODULE)/' + continue + ;; + 'ifndef+CONFIG_'* ) + echo "#$i" | sed -e 's/+/ /' -e 's/\(ifndef CONFIG_COMPAT_KERNEL_3_\)\([0-9]*\)/if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,\2,0))/' -e 's/\(ifndef CONFIG_COMPAT_KERNEL_2_6_\)\([0-9]*\)/if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,\2))/' -e 's/\(ifndef CONFIG_COMPAT_RHEL_\)\([0-9]*\)_\([0-9]*\)/if (!defined(RHEL_MAJOR) || RHEL_MAJOR != \2 || RHEL_MINOR < \3)/' -e 's/\(#ifndef \)\(CONFIG_[^:space:]*\)/#if !defined(\2) \&\& !defined(\2_MODULE)/' + continue + ;; + 'else+#CONFIG_'* | 'endif+#CONFIG_'* ) + echo "#$i */" |sed -e 's/+#/ \/* /g' + continue + ;; + CONFIG_* ) + # Get the element on the left of the "=" + VAR=$(echo $i | cut -d"=" -f 1) + # Get the element on the right of the "=" + VALUE=$(echo $i | cut -d"=" -f 2) + + # Any other module which can *definitely* be built as a module goes here + define_config $VAR $VALUE + continue + ;; + esac +done + +echo "#endif /* COMPAT_AUTOCONF_INCLUDED */" diff --git a/src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-config.sh b/src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-config.sh new file mode 100755 index 0000000..3eba34e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/scripts/gen-compat-config.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright 2012 Luis R. Rodriguez +# Copyright 2012 Hauke Mehrtens +# +# This generates a bunch of CONFIG_COMPAT_KERNEL_2_6_22 +# CONFIG_COMPAT_KERNEL_3_0 .. etc for each kernel release you need an object +# for. +# +# Note: this is part of the compat.git project, not compat-wireless.git, +# send patches against compat.git. + +if [[ ! -f ${KLIB_BUILD}/Makefile ]]; then + exit +fi + +# Actual kernel version +KERNEL_VERSION=$(${MAKE} -C ${KLIB_BUILD} kernelversion | sed -n 's/^\([0-9]\)\..*/\1/p') + +# 3.0 kernel stuff +COMPAT_LATEST_VERSION="5" +KERNEL_SUBLEVEL="-1" + +# Note that this script will export all variables explicitly, +# trying to export all with a blanket "export" statement at +# the top of the generated file causes the build to slow down +# by an order of magnitude. 
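+#
+# For example (illustrative only): when KLIB_BUILD points at a 3.2 kernel,
+# the loops below emit
+#   export CONFIG_COMPAT_KERNEL_3_3=y
+#   export CONFIG_COMPAT_KERNEL_3_4=y
+#   export CONFIG_COMPAT_KERNEL_3_5=y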
+ +if [[ ${KERNEL_VERSION} -eq "3" ]]; then + KERNEL_SUBLEVEL=$(${MAKE} -C ${KLIB_BUILD} kernelversion | sed -n 's/^3\.\([0-9]\+\).*/\1/p') +else + COMPAT_26LATEST_VERSION="39" + KERNEL_26SUBLEVEL=$(${MAKE} -C ${KLIB_BUILD} kernelversion | sed -n 's/^2\.6\.\([0-9]\+\).*/\1/p') + let KERNEL_26SUBLEVEL=${KERNEL_26SUBLEVEL}+1 + + for i in $(seq ${KERNEL_26SUBLEVEL} ${COMPAT_26LATEST_VERSION}); do + eval CONFIG_COMPAT_KERNEL_2_6_${i}=y + echo "export CONFIG_COMPAT_KERNEL_2_6_${i}=y" + done +fi + +let KERNEL_SUBLEVEL=${KERNEL_SUBLEVEL}+1 +for i in $(seq ${KERNEL_SUBLEVEL} ${COMPAT_LATEST_VERSION}); do + eval CONFIG_COMPAT_KERNEL_3_${i}=y + echo "export CONFIG_COMPAT_KERNEL_3_${i}=y" +done + +# The purpose of these seem to be the inverse of the above other varibales. +# The RHEL checks seem to annotate the existance of RHEL minor versions. +RHEL_MAJOR=$(grep ^RHEL_MAJOR ${KLIB_BUILD}/Makefile | sed -n 's/.*= *\(.*\)/\1/p') +if [[ ! -z ${RHEL_MAJOR} ]]; then + RHEL_MINOR=$(grep ^RHEL_MINOR $(KLIB_BUILD)/Makefile | sed -n 's/.*= *\(.*\)/\1/p') + for i in $(seq 0 ${RHEL_MINOR}); do + eval CONFIG_COMPAT_${RHEL_MAJOR}_${i}=y + echo "export CONFIG_COMPAT_${RHEL_MAJOR}_${i}=y" + done +fi + +if [[ ${CONFIG_COMPAT_KERNEL_2_6_33} = "y" ]]; then + echo "export CONFIG_COMPAT_FIRMWARE_CLASS=m" +fi + +if [[ ${CONFIG_COMPAT_KERNEL_2_6_36} = "y" ]]; then + echo "export CONFIG_COMPAT_KFIFO=y" +fi + +if [[ ${CONFIG_COMPAT_KERNEL_3_5} = "y" ]]; then + # We don't have 2.6.24 backport support yet for Codel / FQ CoDel + # For those who want to try this is what is required that I can tell + # so far: + # * struct Qdisc_ops + # - init and change callback ops use a different argument dataype + # - you need to parse data received from userspace differently + if [[ ${CONFIG_COMPAT_KERNEL_2_6_25} != "y" ]]; then + echo "export CONFIG_COMPAT_NET_SCH_CODEL=m" + echo "export CONFIG_COMPAT_NET_SCH_FQ_CODEL=m" + fi +fi diff --git a/src/mlnx-ofa_kernel-5.8/compat/scripts/skip-colors b/src/mlnx-ofa_kernel-5.8/compat/scripts/skip-colors new file mode 100755 index 0000000..121626f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/scripts/skip-colors @@ -0,0 +1,2 @@ +#!/bin/bash +perl -pe 's|(\e)\[(\d+)(;*)(\d*)(\w)||g' diff --git a/src/mlnx-ofa_kernel-5.8/compat/string.c b/src/mlnx-ofa_kernel-5.8/compat/string.c new file mode 100644 index 0000000..7dbfda0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/string.c @@ -0,0 +1,39 @@ +#ifndef HAVE_KSTRTOBOOL +int kstrtobool(const char *s, bool *res) +{ + if (!s) + return -EINVAL; + + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + return 0; + case 'n': + case 'N': + case '0': + *res = false; + return 0; + case 'o': + case 'O': + switch (s[1]) { + case 'n': + case 'N': + *res = true; + return 0; + case 'f': + case 'F': + *res = false; + return 0; + default: + break; + } + default: + break; + } + + return -EINVAL; +} +EXPORT_SYMBOL(kstrtobool); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/syscall.c b/src/mlnx-ofa_kernel-5.8/compat/syscall.c new file mode 100644 index 0000000..26362fb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/syscall.c @@ -0,0 +1,18 @@ +#ifdef HAVE_XDP_SUPPORT + +#ifndef HAVE_BPF_PROG_SUB +#include +#include +void bpf_prog_sub(struct bpf_prog *prog, int i) +{ + /* Only to be used for undoing previous bpf_prog_add() in some + * error path. We still know that another entity in our call + * path holds a reference to the program, thus atomic_sub() can + * be safely used in such cases! 
+ */ + WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +} +EXPORT_SYMBOL_GPL(bpf_prog_sub); +#endif /* HAVE_BPF_PROG_SUB */ + +#endif /* HAVE_XDP_SUPPORT */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/uuid.c b/src/mlnx-ofa_kernel-5.8/compat/uuid.c new file mode 100644 index 0000000..ea08aa5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/uuid.c @@ -0,0 +1,89 @@ +#ifndef HAVE_UUID_GEN + +#include +#include +#include +#include +#include +#include + +#ifndef HAVE_UUID_BE_TO_BIN +const u8 uuid_le_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_le_index); +const u8 uuid_be_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +EXPORT_SYMBOL(uuid_be_index); + + +/** + * uuid_is_valid - checks if UUID string valid + * @uuid: UUID string to check + * + * Description: + * It checks if the UUID string is following the format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + * where x is a hex digit. + * + * Return: true if input is valid UUID string. + */ +bool uuid_is_valid(const char *uuid) +{ + unsigned int i; + + for (i = 0; i < UUID_STRING_LEN; i++) { + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (uuid[i] != '-') + return false; + } else if (!isxdigit(uuid[i])) { + return false; + } + } + + return true; +} +EXPORT_SYMBOL(uuid_is_valid); +#endif + +static int __uuid_to_bin(const char *uuid, __u8 b[16], const u8 ei[16]) +{ + static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; + unsigned int i; + + if (!uuid_is_valid(uuid)) + return -EINVAL; + + for (i = 0; i < 16; i++) { + int hi = hex_to_bin(uuid[si[i] + 0]); + int lo = hex_to_bin(uuid[si[i] + 1]); + + b[ei[i]] = (hi << 4) | lo; + } + + return 0; +} + +#ifndef HAVE_UUID_BE_TO_BIN +#define uuid_le_to_bin LINUX_BACKPORT(uuid_le_to_bin) +int uuid_le_to_bin(const char *uuid, uuid_le *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_le_index); +} +EXPORT_SYMBOL(uuid_le_to_bin); + +#define uuid_be_to_bin LINUX_BACKPORT(uuid_be_to_bin) +int uuid_be_to_bin(const char *uuid, uuid_be *u) +{ + return __uuid_to_bin(uuid, u->b, uuid_be_index); +} +EXPORT_SYMBOL(uuid_be_to_bin); +#endif + +#ifndef HAVE_GUID_PARSE +const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; + +int guid_parse(const char *uuid, guid_t *u) +{ + return __uuid_to_bin(uuid, u->b, guid_index); +} +EXPORT_SYMBOL(guid_parse); +#endif +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/xarray.c b/src/mlnx-ofa_kernel-5.8/compat/xarray.c new file mode 100644 index 0000000..30302e6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xarray.c @@ -0,0 +1,2195 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * XArray implementation + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + */ + + +/* For RHEL 8.1/8.2 and kernels below 5.4 that has xarray + * but suffering of lack of fixes for xarray */ +#if defined(HAVE_XARRAY) && !defined(HAVE_XA_FOR_EACH_RANGE) +static bool xas_sibling(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + unsigned long mask; + + if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node) + return false; + mask = (XA_CHUNK_SIZE << node->shift) - 1; + return (xas->xa_index & mask) > + ((unsigned long)xas->xa_offset << node->shift); +} + +static void *set_bounds(struct xa_state *xas) +{ + xas->xa_node = XAS_BOUNDS; + return NULL; +} + +void *xa_find_after(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp + 1); + void *entry; + + if (xas.xa_index == 0) + return NULL; + + rcu_read_lock(); + for (;;) { + if ((__force unsigned 
int)filter < XA_MAX_MARKS) { + if (xas.xa_index > max) { + xas.xa_node = XAS_RESTART; + entry = NULL; + } + else + entry = xas_find_marked(&xas, max, filter); + } + else { + if (xas.xa_index > max) + entry = set_bounds(&xas); + else + entry = xas_find(&xas, max); + } + + if (xas_invalid(&xas)) + break; + if (xas_sibling(&xas)) + continue; + if (!xas_retry(&xas, entry)) + break; + } + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find_after); +#endif + +#ifndef HAVE_XARRAY +#include +#include +#include +#include +#include +#include + +/* + * Coding conventions in this file: + * + * @xa is used to refer to the entire xarray. + * @xas is the 'xarray operation state'. It may be either a pointer to + * an xa_state, or an xa_state stored on the stack. This is an unfortunate + * ambiguity. + * @index is the index of the entry being operated on + * @mark is an xa_mark_t; a small number indicating one of the mark bits. + * @node refers to an xa_node; usually the primary one being operated on by + * this function. + * @offset is the index into the slots array inside an xa_node. + * @parent refers to the @xa_node closer to the head than @node. + * @entry refers to something stored in a slot in the xarray + */ +#define radix_tree_node xa_node + +static inline unsigned int xa_lock_type(const struct xarray *xa) +{ + return (__force unsigned int)xa->xa_flags & 3; +} + +static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_lock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_lock_bh(xas); + else + xas_lock(xas); +} + +static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_unlock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_unlock_bh(xas); + else + xas_unlock(xas); +} + +static inline bool xa_track_free(const struct xarray *xa) +{ + return xa->xa_flags & XA_FLAGS_TRACK_FREE; +} + +static inline bool xa_zero_busy(const struct xarray *xa) +{ + return xa->xa_flags & XA_FLAGS_ZERO_BUSY; +} + +static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) +{ + if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) + xa->xa_flags |= XA_FLAGS_MARK(mark); +} + +static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) +{ + if (xa->xa_flags & XA_FLAGS_MARK(mark)) + xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); +} + +static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) +{ + return node->marks[(__force unsigned)mark]; +} + +static inline bool node_get_mark(struct xa_node *node, + unsigned int offset, xa_mark_t mark) +{ + return test_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_set_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_set_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_clear_bit(offset, node_marks(node, mark)); +} + +static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) +{ + return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); +} + +static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) +{ + bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); +} + +#define mark_inc(mark) do { \ + mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ +} while (0) + +/* + * xas_squash_marks() - Merge all marks to the 
first entry + * @xas: Array operation state. + * + * Set a mark on the first entry if any entry has it set. Clear marks on + * all sibling entries. + */ +static void xas_squash_marks(const struct xa_state *xas) +{ + unsigned int mark = 0; + unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; + + if (!xas->xa_sibs) + return; + + do { + unsigned long *marks = xas->xa_node->marks[mark]; + if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) + continue; + __set_bit(xas->xa_offset, marks); + bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } while (mark++ != (__force unsigned)XA_MARK_MAX); +} + +/* extracts the offset within this node from the index */ +static unsigned int get_offset(unsigned long index, struct xa_node *node) +{ + return (index >> node->shift) & XA_CHUNK_MASK; +} + +static void xas_set_offset(struct xa_state *xas) +{ + xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); +} + +/* move the index either forwards (find) or backwards (sibling slot) */ +static void xas_move_index(struct xa_state *xas, unsigned long offset) +{ + unsigned int shift = xas->xa_node->shift; + xas->xa_index &= ~XA_CHUNK_MASK << shift; + xas->xa_index += offset << shift; +} + +static void xas_advance(struct xa_state *xas) +{ + xas->xa_offset++; + xas_move_index(xas, xas->xa_offset); +} + +static void *set_bounds(struct xa_state *xas) +{ + xas->xa_node = XAS_BOUNDS; + return NULL; +} + +/* + * Starts a walk. If the @xas is already valid, we assume that it's on + * the right path and just return where we've got to. If we're in an + * error state, return NULL. If the index is outside the current scope + * of the xarray, return NULL without changing @xas->xa_node. Otherwise + * set @xas->xa_node to NULL and return the current head of the array. + */ +static void *xas_start(struct xa_state *xas) +{ + void *entry; + + if (xas_valid(xas)) + return xas_reload(xas); + if (xas_error(xas)) + return NULL; + + entry = xa_head(xas->xa); + if (!xa_is_node(entry)) { + if (xas->xa_index) + return set_bounds(xas); + } else { + if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) + return set_bounds(xas); + } + + xas->xa_node = NULL; + return entry; +} + +static void *xas_descend(struct xa_state *xas, struct xa_node *node) +{ + unsigned int offset = get_offset(xas->xa_index, node); + void *entry = xa_entry(xas->xa, node, offset); + + xas->xa_node = node; + if (xa_is_sibling(entry)) { + offset = xa_to_sibling(entry); + entry = xa_entry(xas->xa, node, offset); + } + + xas->xa_offset = offset; + return entry; +} + +/** + * xas_load() - Load an entry from the XArray (advanced). + * @xas: XArray operation state. + * + * Usually walks the @xas to the appropriate state to load the entry + * stored at xa_index. However, it will do nothing and return %NULL if + * @xas is in an error state. xas_load() will never expand the tree. + * + * If the xa_state is set up to operate on a multi-index entry, xas_load() + * may return %NULL or an internal entry, even if there are entries + * present within the range specified by @xas. + * + * Context: Any context. The caller should hold the xa_lock or the RCU lock. + * Return: Usually an entry in the XArray, but see description for exceptions. 
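+ *
+ * A minimal sketch of a typical caller, mirroring what xa_load() below
+ * does (my_xa and index are placeholder names, not identifiers from this
+ * file):
+ *
+ *	XA_STATE(xas, &my_xa, index);
+ *	void *entry;
+ *
+ *	rcu_read_lock();
+ *	do {
+ *		entry = xas_load(&xas);
+ *	} while (xas_retry(&xas, entry));
+ *	rcu_read_unlock();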
+ */ +void *xas_load(struct xa_state *xas) +{ + void *entry = xas_start(xas); + + while (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + + if (xas->xa_shift > node->shift) + break; + entry = xas_descend(xas, node); + if (node->shift == 0) + break; + } + return entry; +} +EXPORT_SYMBOL_GPL(xas_load); + +/* Move the radix tree node cache here */ +struct kmem_cache *compat_radix_tree_node_cachep; +void radix_tree_node_rcu_free(struct rcu_head *head) +{ + struct radix_tree_node *node = + container_of(head, struct radix_tree_node, rcu_head); + /* + * Must only free zeroed nodes into the slab. We can be left with + * non-NULL entries by radix_tree_free_nodes, so clear the entries + * and tags here. + */ + + memset(node->slots, 0, sizeof(node->slots)); + memset(node->tags, 0, sizeof(node->tags)); + INIT_LIST_HEAD(&node->private_list); + + kmem_cache_free(compat_radix_tree_node_cachep, node); + +} + +#define XA_RCU_FREE ((struct xarray *)1) + +static void xa_node_free(struct xa_node *node) +{ + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->array = XA_RCU_FREE; + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); +} + +/* + * xas_destroy() - Free any resources allocated during the XArray operation. + * @xas: XArray operation state. + * + * This function is now internal-only. + */ +static void xas_destroy(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_alloc; + + if (!node) + return; + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + kmem_cache_free(compat_radix_tree_node_cachep, node); + xas->xa_alloc = NULL; +} + +/** + * xas_nomem() - Allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * If we need to add new nodes to the XArray, we try to allocate memory + * with GFP_NOWAIT while holding the lock, which will usually succeed. + * If it fails, @xas is flagged as needing memory to continue. The caller + * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, + * the caller should retry the operation. + * + * Forward progress is guaranteed as one node is allocated here and + * stored in the xa_state where it will be found by xas_alloc(). More + * nodes will likely be found in the slab allocator, but we do not tie + * them up here. + * + * Return: true if memory was needed, and was successfully allocated. + */ +bool xas_nomem(struct xa_state *xas, gfp_t gfp) +{ + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; + xas->xa_alloc = kmem_cache_alloc(compat_radix_tree_node_cachep, gfp); + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} +EXPORT_SYMBOL_GPL(xas_nomem); + +/* + * __xas_nomem() - Drop locks and allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * Internal variant of xas_nomem(). + * + * Return: true if memory was needed, and was successfully allocated. 
+ */ +static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) +#ifdef __must_hold + __must_hold(xas->xa->xa_lock) +#endif +{ + unsigned int lock_type = xa_lock_type(xas->xa); + + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; + if (gfpflags_allow_blocking(gfp)) { + xas_unlock_type(xas, lock_type); + xas->xa_alloc = kmem_cache_alloc(compat_radix_tree_node_cachep, gfp); + xas_lock_type(xas, lock_type); + } else { + xas->xa_alloc = kmem_cache_alloc(compat_radix_tree_node_cachep, gfp); + } + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} + +static void xas_update(struct xa_state *xas, struct xa_node *node) +{ + if (xas->xa_update) + xas->xa_update(node); + else + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); +} + +static void *xas_alloc(struct xa_state *xas, unsigned int shift) +{ + struct xa_node *parent = xas->xa_node; + struct xa_node *node = xas->xa_alloc; + + if (xas_invalid(xas)) + return NULL; + + if (node) { + xas->xa_alloc = NULL; + } else { + gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; + + if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) + gfp |= __GFP_ACCOUNT; + + node = kmem_cache_alloc(compat_radix_tree_node_cachep, gfp); + if (!node) { + xas_set_err(xas, -ENOMEM); + return NULL; + } + } + + if (parent) { + node->offset = xas->xa_offset; + parent->count++; + XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); + xas_update(xas, parent); + } + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->shift = shift; + node->count = 0; + node->nr_values = 0; + RCU_INIT_POINTER(node->parent, xas->xa_node); + node->array = xas->xa; + + return node; +} + +#ifdef CONFIG_XARRAY_MULTI +/* Returns the number of indices covered by a given xa_state */ +static unsigned long xas_size(const struct xa_state *xas) +{ + return (xas->xa_sibs + 1UL) << xas->xa_shift; +} +#endif + +/* + * Use this to calculate the maximum index that will need to be created + * in order to add the entry described by @xas. Because we cannot store a + * multiple-index entry at index 0, the calculation is a little more complex + * than you might expect. 
+ */ +static unsigned long xas_max(struct xa_state *xas) +{ + unsigned long max = xas->xa_index; + +#ifdef CONFIG_XARRAY_MULTI + if (xas->xa_shift || xas->xa_sibs) { + unsigned long mask = xas_size(xas) - 1; + max |= mask; + if (mask == max) + max++; + } +#endif + + return max; +} + +/* The maximum index that can be contained in the array without expanding it */ +static unsigned long max_index(void *entry) +{ + if (!xa_is_node(entry)) + return 0; + return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; +} + +static void xas_shrink(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = xas->xa_node; + + for (;;) { + void *entry; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count != 1) + break; + entry = xa_entry_locked(xa, node, 0); + if (!entry) + break; + if (!xa_is_node(entry) && node->shift) + break; + if (xa_is_zero(entry) && xa_zero_busy(xa)) + entry = NULL; + xas->xa_node = XAS_BOUNDS; + + RCU_INIT_POINTER(xa->xa_head, entry); + if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) + xa_mark_clear(xa, XA_FREE_MARK); + + node->count = 0; + node->nr_values = 0; + if (!xa_is_node(entry)) + RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); + xas_update(xas, node); + xa_node_free(node); + if (!xa_is_node(entry)) + break; + node = xa_to_node(entry); + node->parent = NULL; + } +} + +/* + * xas_delete_node() - Attempt to delete an xa_node + * @xas: Array operation state. + * + * Attempts to delete the @xas->xa_node. This will fail if xa->node has + * a non-zero reference count. + */ +static void xas_delete_node(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + for (;;) { + struct xa_node *parent; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count) + break; + + parent = xa_parent_locked(xas->xa, node); + xas->xa_node = parent; + xas->xa_offset = node->offset; + xa_node_free(node); + + if (!parent) { + xas->xa->xa_head = NULL; + xas->xa_node = XAS_BOUNDS; + return; + } + + parent->slots[xas->xa_offset] = NULL; + parent->count--; + XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); + node = parent; + xas_update(xas, node); + } + + if (!node->parent) + xas_shrink(xas); +} + +/** + * xas_free_nodes() - Free this node and all nodes that it references + * @xas: Array operation state. + * @top: Node to free + * + * This node has been removed from the tree. We must now free it and all + * of its subnodes. There may be RCU walkers with references into the tree, + * so we must replace all entries with retry markers. 
+ */ +static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) +{ + unsigned int offset = 0; + struct xa_node *node = top; + + for (;;) { + void *entry = xa_entry_locked(xas->xa, node, offset); + + if (node->shift && xa_is_node(entry)) { + node = xa_to_node(entry); + offset = 0; + continue; + } + if (entry) + RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); + offset++; + while (offset == XA_CHUNK_SIZE) { + struct xa_node *parent; + + parent = xa_parent_locked(xas->xa, node); + offset = node->offset + 1; + node->count = 0; + node->nr_values = 0; + xas_update(xas, node); + xa_node_free(node); + if (node == top) + return; + node = parent; + } + } +} + +/* + * xas_expand adds nodes to the head of the tree until it has reached + * sufficient height to be able to contain @xas->xa_index + */ +static int xas_expand(struct xa_state *xas, void *head) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = NULL; + unsigned int shift = 0; + unsigned long max = xas_max(xas); + + if (!head) { + if (max == 0) + return 0; + while ((max >> shift) >= XA_CHUNK_SIZE) + shift += XA_CHUNK_SHIFT; + return shift + XA_CHUNK_SHIFT; + } else if (xa_is_node(head)) { + node = xa_to_node(head); + shift = node->shift + XA_CHUNK_SHIFT; + } + xas->xa_node = NULL; + + while (max > max_index(head)) { + xa_mark_t mark = 0; + + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + node = xas_alloc(xas, shift); + if (!node) + return -ENOMEM; + + node->count = 1; + if (xa_is_value(head)) + node->nr_values = 1; + RCU_INIT_POINTER(node->slots[0], head); + + /* Propagate the aggregated mark info to the new child */ + for (;;) { + if (xa_track_free(xa) && mark == XA_FREE_MARK) { + node_mark_all(node, XA_FREE_MARK); + if (!xa_marked(xa, XA_FREE_MARK)) { + node_clear_mark(node, 0, XA_FREE_MARK); + xa_mark_set(xa, XA_FREE_MARK); + } + } else if (xa_marked(xa, mark)) { + node_set_mark(node, 0, mark); + } + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + /* + * Now that the new node is fully initialised, we can add + * it to the tree + */ + if (xa_is_node(head)) { + xa_to_node(head)->offset = 0; + rcu_assign_pointer(xa_to_node(head)->parent, node); + } + head = xa_mk_node(node); + rcu_assign_pointer(xa->xa_head, head); + xas_update(xas, node); + + shift += XA_CHUNK_SHIFT; + } + + xas->xa_node = node; + return shift; +} + +/* + * xas_create() - Create a slot to store an entry in. + * @xas: XArray operation state. + * @allow_root: %true if we can store the entry in the root directly + * + * Most users will not need to call this function directly, as it is called + * by xas_store(). It is useful for doing conditional store operations + * (see the xa_cmpxchg() implementation for an example). + * + * Return: If the slot already existed, returns the contents of this slot. + * If the slot was newly created, returns %NULL. If it failed to create the + * slot, returns %NULL and indicates the error in @xas. 
+ */ +static void *xas_create(struct xa_state *xas, bool allow_root) +{ + struct xarray *xa = xas->xa; + void *entry; + void __rcu **slot; + struct xa_node *node = xas->xa_node; + int shift; + unsigned int order = xas->xa_shift; + + if (xas_top(node)) { + entry = xa_head_locked(xa); + xas->xa_node = NULL; + if (!entry && xa_zero_busy(xa)) + entry = XA_ZERO_ENTRY; + shift = xas_expand(xas, entry); + if (shift < 0) + return NULL; + if (!shift && !allow_root) + shift = XA_CHUNK_SHIFT; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } else if (xas_error(xas)) { + return NULL; + } else if (node) { + unsigned int offset = xas->xa_offset; + + shift = node->shift; + entry = xa_entry_locked(xa, node, offset); + slot = &node->slots[offset]; + } else { + shift = 0; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } + + while (shift > order) { + shift -= XA_CHUNK_SHIFT; + if (!entry) { + node = xas_alloc(xas, shift); + if (!node) + break; + if (xa_track_free(xa)) + node_mark_all(node, XA_FREE_MARK); + rcu_assign_pointer(*slot, xa_mk_node(node)); + } else if (xa_is_node(entry)) { + node = xa_to_node(entry); + } else { + break; + } + entry = xas_descend(xas, node); + slot = &node->slots[xas->xa_offset]; + } + + return entry; +} + +/** + * xas_create_range() - Ensure that stores to this range will succeed + * @xas: XArray operation state. + * + * Creates all of the slots in the range covered by @xas. Sets @xas to + * create single-index entries and positions it at the beginning of the + * range. This is for the benefit of users which have not yet been + * converted to use multi-index entries. + */ +void xas_create_range(struct xa_state *xas) +{ + unsigned long index = xas->xa_index; + unsigned char shift = xas->xa_shift; + unsigned char sibs = xas->xa_sibs; + + xas->xa_index |= ((sibs + 1UL) << shift) - 1; + if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) + xas->xa_offset |= sibs; + xas->xa_shift = 0; + xas->xa_sibs = 0; + + for (;;) { + xas_create(xas, true); + if (xas_error(xas)) + goto restore; + if (xas->xa_index <= (index | XA_CHUNK_MASK)) + goto success; + xas->xa_index -= XA_CHUNK_SIZE; + + for (;;) { + struct xa_node *node = xas->xa_node; + xas->xa_node = xa_parent_locked(xas->xa, node); + xas->xa_offset = node->offset - 1; + if (node->offset != 0) + break; + } + } + +restore: + xas->xa_shift = shift; + xas->xa_sibs = sibs; + xas->xa_index = index; + return; +success: + xas->xa_index = index; + if (xas->xa_node) + xas_set_offset(xas); +} +EXPORT_SYMBOL_GPL(xas_create_range); + +static void update_node(struct xa_state *xas, struct xa_node *node, + int count, int values) +{ + if (!node || (!count && !values)) + return; + + node->count += count; + node->nr_values += values; + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); + xas_update(xas, node); + if (count < 0) + xas_delete_node(xas); +} + +/** + * xas_store() - Store this entry in the XArray. + * @xas: XArray operation state. + * @entry: New entry. + * + * If @xas is operating on a multi-index entry, the entry returned by this + * function is essentially meaningless (it may be an internal entry or it + * may be %NULL, even if there are non-NULL entries at some of the indices + * covered by the range). This is not a problem for any current users, + * and can be changed if needed. + * + * Return: The old entry at this index. 
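+ *
+ * The usual calling pattern pairs xas_store() with xas_nomem(), as in
+ * __xa_store() below (sketch only; my_xa, index and ptr are placeholders):
+ *
+ *	XA_STATE(xas, &my_xa, index);
+ *
+ *	do {
+ *		xas_lock(&xas);
+ *		xas_store(&xas, ptr);
+ *		xas_unlock(&xas);
+ *	} while (xas_nomem(&xas, GFP_KERNEL));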
+ */ +void *xas_store(struct xa_state *xas, void *entry) +{ + struct xa_node *node; + void __rcu **slot = &xas->xa->xa_head; + unsigned int offset, max; + int count = 0; + int values = 0; + void *first, *next; + bool value = xa_is_value(entry); + + if (entry) { + bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); + first = xas_create(xas, allow_root); + } else { + first = xas_load(xas); + } + + if (xas_invalid(xas)) + return first; + node = xas->xa_node; + if (node && (xas->xa_shift < node->shift)) + xas->xa_sibs = 0; + if ((first == entry) && !xas->xa_sibs) + return first; + + next = first; + offset = xas->xa_offset; + max = xas->xa_offset + xas->xa_sibs; + if (node) { + slot = &node->slots[offset]; + if (xas->xa_sibs) + xas_squash_marks(xas); + } + if (!entry) + xas_init_marks(xas); + + for (;;) { + /* + * Must clear the marks before setting the entry to NULL, + * otherwise xas_for_each_marked may find a NULL entry and + * stop early. rcu_assign_pointer contains a release barrier + * so the mark clearing will appear to happen before the + * entry is set to NULL. + */ + rcu_assign_pointer(*slot, entry); + if (xa_is_node(next) && (!node || node->shift)) + xas_free_nodes(xas, xa_to_node(next)); + if (!node) + break; + count += !next - !entry; + values += !xa_is_value(first) - !value; + if (entry) { + if (offset == max) + break; + if (!xa_is_sibling(entry)) + entry = xa_mk_sibling(xas->xa_offset); + } else { + if (offset == XA_CHUNK_MASK) + break; + } + next = xa_entry_locked(xas->xa, node, ++offset); + if (!xa_is_sibling(next)) { + if (!entry && (offset > max)) + break; + first = next; + } + slot++; + } + + update_node(xas, node, count, values); + return first; +} +EXPORT_SYMBOL_GPL(xas_store); + +/** + * xas_get_mark() - Returns the state of this mark. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Return: true if the mark is set, false if the mark is clear or @xas + * is in an error state. + */ +bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) +{ + if (xas_invalid(xas)) + return false; + if (!xas->xa_node) + return xa_marked(xas->xa, mark); + return node_get_mark(xas->xa_node, xas->xa_offset, mark); +} +EXPORT_SYMBOL_GPL(xas_get_mark); + +/** + * xas_set_mark() - Sets the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Sets the specified mark on this entry, and walks up the tree setting it + * on all the ancestor entries. Does nothing if @xas has not been walked to + * an entry, or is in an error state. + */ +void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (node_set_mark(node, offset, mark)) + return; + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (!xa_marked(xas->xa, mark)) + xa_mark_set(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_set_mark); + +/** + * xas_clear_mark() - Clears the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Clears the specified mark on this entry, and walks back to the head + * attempting to clear it on all the ancestor entries. Does nothing if + * @xas has not been walked to an entry, or is in an error state. 
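+ *
+ * Marks are normally changed right after walking to the entry, e.g.
+ * (sketch; placeholder names):
+ *
+ *	xas_lock(&xas);
+ *	entry = xas_load(&xas);
+ *	if (entry)
+ *		xas_clear_mark(&xas, XA_MARK_0);
+ *	xas_unlock(&xas);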
+ */ +void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (!node_clear_mark(node, offset, mark)) + return; + if (node_any_mark(node, mark)) + return; + + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (xa_marked(xas->xa, mark)) + xa_mark_clear(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_clear_mark); + +/** + * xas_init_marks() - Initialise all marks for the entry + * @xas: Array operations state. + * + * Initialise all marks for the entry specified by @xas. If we're tracking + * free entries with a mark, we need to set it on all entries. All other + * marks are cleared. + * + * This implementation is not as efficient as it could be; we may walk + * up the tree multiple times. + */ +void xas_init_marks(const struct xa_state *xas) +{ + xa_mark_t mark = 0; + + for (;;) { + if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) + xas_set_mark(xas, mark); + else + xas_clear_mark(xas, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} +EXPORT_SYMBOL_GPL(xas_init_marks); + +/** + * xas_pause() - Pause a walk to drop a lock. + * @xas: XArray operation state. + * + * Some users need to pause a walk and drop the lock they're holding in + * order to yield to a higher priority thread or carry out an operation + * on an entry. Those users should call this function before they drop + * the lock. It resets the @xas to be suitable for the next iteration + * of the loop after the user has reacquired the lock. If most entries + * found during a walk require you to call xas_pause(), the xa_for_each() + * iterator may be more appropriate. + * + * Note that xas_pause() only works for forward iteration. If a user needs + * to pause a reverse iteration, we will need a xas_pause_rev(). + */ +void xas_pause(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (xas_invalid(xas)) + return; + + xas->xa_node = XAS_RESTART; + if (node) { + unsigned long offset = xas->xa_offset; + while (++offset < XA_CHUNK_SIZE) { + if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) + break; + } + xas->xa_index += (offset - xas->xa_offset) << node->shift; + if (xas->xa_index == 0) + xas->xa_node = XAS_BOUNDS; + } else { + xas->xa_index++; + } +} +EXPORT_SYMBOL_GPL(xas_pause); + +/* + * __xas_prev() - Find the previous entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_prev() which handles all the complex cases + * out of line. + */ +void *__xas_prev(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index--; + if (!xas->xa_node) + return set_bounds(xas); + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset--; + + while (xas->xa_offset == 255) { + xas->xa_offset = xas->xa_node->offset - 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_prev); + +/* + * __xas_next() - Find the next entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_next() which handles all the complex cases + * out of line. 
+ */ +void *__xas_next(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index++; + if (!xas->xa_node) + return set_bounds(xas); + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset++; + + while (xas->xa_offset == XA_CHUNK_SIZE) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_next); + +/** + * xas_find() - Find the next present entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * If the @xas has not yet been walked to an entry, return the entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we move to the + * next entry. + * + * If no entry is found and the array is smaller than @max, the iterator + * is set to the smallest index not yet in the array. This allows @xas + * to be immediately passed to xas_store(). + * + * Return: The entry, if found, otherwise %NULL. + */ +void *xas_find(struct xa_state *xas, unsigned long max) +{ + void *entry; + + if (xas_error(xas) || xas->xa_node == XAS_BOUNDS) + return NULL; + if (xas->xa_index > max) + return set_bounds(xas); + + if (!xas->xa_node) { + xas->xa_index = 1; + return set_bounds(xas); + } else if (xas->xa_node == XAS_RESTART) { + entry = xas_load(xas); + if (entry || xas_not_node(xas->xa_node)) + return entry; + } else if (!xas->xa_node->shift && + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { + xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; + } + + xas_advance(xas); + + while (xas->xa_node && (xas->xa_index <= max)) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_node(entry)) { + xas->xa_node = xa_to_node(entry); + xas->xa_offset = 0; + continue; + } + if (entry && !xa_is_sibling(entry)) + return entry; + + xas_advance(xas); + } + + if (!xas->xa_node) + xas->xa_node = XAS_BOUNDS; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find); + +/** + * xas_find_marked() - Find the next marked entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark number to search for. + * + * If the @xas has not yet been walked to an entry, return the marked entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we return the + * first marked entry with an index > xas.xa_index. + * + * If no marked entry is found and the array is smaller than @max, @xas is + * set to the bounds state and xas->xa_index is set to the smallest index + * not yet in the array. This allows @xas to be immediately passed to + * xas_store(). + * + * If no entry is found before @max is reached, @xas is set to the restart + * state. + * + * Return: The entry, if found, otherwise %NULL. 
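+ *
+ * For example, finding the first entry marked XA_MARK_0 at or above
+ * @start (sketch; my_xa and start are placeholders):
+ *
+ *	XA_STATE(xas, &my_xa, start);
+ *	void *entry;
+ *
+ *	rcu_read_lock();
+ *	do {
+ *		entry = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0);
+ *	} while (xas_retry(&xas, entry));
+ *	rcu_read_unlock();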
+ */ +void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) +{ + bool advance = true; + unsigned int offset; + void *entry; + + if (xas_error(xas)) + return NULL; + if (xas->xa_index > max) + goto max; + + if (!xas->xa_node) { + xas->xa_index = 1; + goto out; + } else if (xas_top(xas->xa_node)) { + advance = false; + entry = xa_head(xas->xa); + xas->xa_node = NULL; + if (xas->xa_index > max_index(entry)) + goto out; + if (!xa_is_node(entry)) { + if (xa_marked(xas->xa, mark)) + return entry; + xas->xa_index = 1; + goto out; + } + xas->xa_node = xa_to_node(entry); + xas->xa_offset = xas->xa_index >> xas->xa_node->shift; + } + + while (xas->xa_index <= max) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + advance = false; + continue; + } + + if (!advance) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_sibling(entry)) { + xas->xa_offset = xa_to_sibling(entry); + xas_move_index(xas, xas->xa_offset); + } + } + + offset = xas_find_chunk(xas, advance, mark); + if (offset > xas->xa_offset) { + advance = false; + xas_move_index(xas, offset); + /* Mind the wrap */ + if ((xas->xa_index - 1) >= max) + goto max; + xas->xa_offset = offset; + if (offset == XA_CHUNK_SIZE) + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK)) + continue; + if (!xa_is_node(entry)) + return entry; + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } + +out: + if (xas->xa_index > max) + goto max; + return set_bounds(xas); +max: + xas->xa_node = XAS_RESTART; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_marked); + +/** + * xas_find_conflict() - Find the next present entry in a range. + * @xas: XArray operation state. + * + * The @xas describes both a range and a position within that range. + * + * Context: Any context. Expects xa_lock to be held. + * Return: The next entry in the range covered by @xas or %NULL. + */ +void *xas_find_conflict(struct xa_state *xas) +{ + void *curr; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) + return NULL; + + if (xas_top(xas->xa_node)) { + curr = xas_start(xas); + if (!curr) + return NULL; + while (xa_is_node(curr)) { + struct xa_node *node = xa_to_node(curr); + curr = xas_descend(xas, node); + } + if (curr) + return curr; + } + + if (xas->xa_node->shift > xas->xa_shift) + return NULL; + + for (;;) { + if (xas->xa_node->shift == xas->xa_shift) { + if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) + break; + } else if (xas->xa_offset == XA_CHUNK_MASK) { + xas->xa_offset = xas->xa_node->offset; + xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + continue; + } + curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); + if (xa_is_sibling(curr)) + continue; + while (xa_is_node(curr)) { + xas->xa_node = xa_to_node(curr); + xas->xa_offset = 0; + curr = xa_entry_locked(xas->xa, xas->xa_node, 0); + } + if (curr) + return curr; + } + xas->xa_offset -= xas->xa_sibs; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_conflict); + +/** + * xa_load() - Load an entry from an XArray. + * @xa: XArray. + * @index: index into array. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry at @index in @xa. 
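+ *
+ * For example (sketch; my_xa is a placeholder array):
+ *
+ *	entry = xa_load(&my_xa, index);	/* NULL if nothing is stored there */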
+ */ +void *xa_load(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (xa_is_zero(entry)) + entry = NULL; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + return entry; +} +EXPORT_SYMBOL(xa_load); + +static void *xas_result(struct xa_state *xas, void *curr) +{ + if (xa_is_zero(curr)) + return NULL; + if (xas_error(xas)) + curr = xas->xa_node; + return curr; +} + +/** + * __xa_erase() - Erase this entry from the XArray while locked. + * @xa: XArray. + * @index: Index into array. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Expects xa_lock to be held on entry. + * Return: The entry which used to be at this index. + */ +void *__xa_erase(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + return xas_result(&xas, xas_store(&xas, NULL)); +} +EXPORT_SYMBOL(__xa_erase); + +/** + * xa_erase() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * After this function returns, loading from @index will return %NULL. + * If the index is part of a multi-index entry, all indices will be erased + * and none of the entries will be part of a multi-index entry. + * + * Context: Any context. Takes and releases the xa_lock. + * Return: The entry which used to be at this index. + */ +void *xa_erase(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock(xa); + entry = __xa_erase(xa, index); + xa_unlock(xa); + + return entry; +} +EXPORT_SYMBOL(xa_erase); + +/** + * __xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return XA_ERROR(-EINVAL); + if (xa_track_free(xa) && !entry) + entry = XA_ZERO_ENTRY; + + do { + curr = xas_store(&xas, entry); + if (xa_track_free(xa)) + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_store); + +/** + * xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from this index will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Any context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. + * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry + * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation + * failed. 
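+ *
+ * A minimal sketch of a caller (my_xa, index and obj are placeholders):
+ *
+ *	old = xa_store(&my_xa, index, obj, GFP_KERNEL);
+ *	if (xa_is_err(old))
+ *		return xa_err(old);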
+ */ +void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + void *curr; + + xa_lock(xa); + curr = __xa_store(xa, index, entry, gfp); + xa_unlock(xa); + + return curr; +} +EXPORT_SYMBOL(xa_store); + +/** + * __xa_cmpxchg() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_load(&xas); + if (curr == old) { + xas_store(&xas, entry); + if (xa_track_free(xa) && entry && !curr) + xas_clear_mark(&xas, XA_FREE_MARK); + } + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_cmpxchg); + +/** + * __xa_insert() - Store this entry in the XArray if no entry is present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Inserting a NULL entry will store a reserved entry (like xa_reserve()) + * if no entry is present. Inserting will fail if a reserved entry is + * present, even though loading from this index will return NULL. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 if the store succeeded. -EBUSY if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + curr = xas_load(&xas); + if (!curr) { + xas_store(&xas, entry); + if (xa_track_free(xa)) + xas_clear_mark(&xas, XA_FREE_MARK); + } else { + xas_set_err(&xas, -EBUSY); + } + } while (__xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(__xa_insert); + +#ifdef CONFIG_XARRAY_MULTI +static void xas_set_range(struct xa_state *xas, unsigned long first, + unsigned long last) +{ + unsigned int shift = 0; + unsigned long sibs = last - first; + unsigned int offset = XA_CHUNK_MASK; + + xas_set(xas, first); + + while ((first & XA_CHUNK_MASK) == 0) { + if (sibs < XA_CHUNK_MASK) + break; + if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) + break; + shift += XA_CHUNK_SHIFT; + if (offset == XA_CHUNK_MASK) + offset = sibs & XA_CHUNK_MASK; + sibs >>= XA_CHUNK_SHIFT; + first >>= XA_CHUNK_SHIFT; + } + + offset = first & XA_CHUNK_MASK; + if (offset + sibs > XA_CHUNK_MASK) + sibs = XA_CHUNK_MASK - offset; + if ((((first + sibs + 1) << shift) - 1) > last) + sibs -= 1; + + xas->xa_shift = shift; + xas->xa_sibs = sibs; +} + +/** + * xa_store_range() - Store this entry at a range of indices in the XArray. + * @xa: XArray. + * @first: First index to affect. + * @last: Last index to affect. + * @entry: New entry. + * @gfp: Memory allocation flags. 
+ * + * After this function returns, loads from any index between @first and @last, + * inclusive will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in + * an XArray, or xa_err(-ENOMEM) if memory allocation failed. + */ +void *xa_store_range(struct xarray *xa, unsigned long first, + unsigned long last, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + if (last < first) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + if (entry) { + unsigned int order = BITS_PER_LONG; + if (last + 1) + order = __ffs(last + 1); + xas_set_order(&xas, last, order); + xas_create(&xas, true); + if (xas_error(&xas)) + goto unlock; + } + do { + xas_set_range(&xas, first, last); + xas_store(&xas, entry); + if (xas_error(&xas)) + goto unlock; + first += xas_size(&xas); + } while (first <= last); +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, NULL); +} +EXPORT_SYMBOL(xa_store_range); +#endif /* CONFIG_XARRAY_MULTI */ + +/** + * __xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @limit: Range for allocated ID. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory could not be allocated or + * -EBUSY if there are no free entries in @limit. + */ +int __xa_alloc(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + if (WARN_ON_ONCE(!xa_track_free(xa))) + return -EINVAL; + + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + xas.xa_index = limit.min; + xas_find_marked(&xas, limit.max, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -EBUSY); + else + *id = xas.xa_index; + xas_store(&xas, entry); + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(__xa_alloc); + +/** + * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @entry: New entry. + * @limit: Range of allocated ID. + * @next: Pointer to next ID to allocate. + * @gfp: Memory allocation flags. + * + * Finds an empty entry in @xa between @limit.min and @limit.max, + * stores the index into the @id pointer, then stores the entry at + * that index. A concurrent lookup will not see an uninitialised @id. + * The search for an empty entry will start at @next and will wrap + * around if necessary. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 if the allocation succeeded without wrapping. 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated or -EBUSY if there are no free entries in @limit. 
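+ *
+ * A rough sketch of a caller (placeholder names; the array must have been
+ * initialised with XA_FLAGS_ALLOC, and @next normally lives in the
+ * caller's long-lived state):
+ *
+ *	u32 id;
+ *
+ *	xa_lock(&my_xa);
+ *	err = __xa_alloc_cyclic(&my_xa, &id, obj, xa_limit_32b, &next,
+ *				GFP_KERNEL);
+ *	xa_unlock(&my_xa);
+ *	if (err < 0)
+ *		return err;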
+ */ +int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, + struct xa_limit limit, u32 *next, gfp_t gfp) +{ + u32 min = limit.min; + int ret; + + limit.min = max(min, *next); + ret = __xa_alloc(xa, id, entry, limit, gfp); + if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { + xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; + ret = 1; + } + + if (ret < 0 && limit.min > min) { + limit.min = min; + ret = __xa_alloc(xa, id, entry, limit, gfp); + if (ret == 0) + ret = 1; + } + + if (ret >= 0) { + *next = *id + 1; + if (*next == 0) + xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; + } + return ret; +} +EXPORT_SYMBOL(__xa_alloc_cyclic); + +/** + * __xa_set_mark() - Set this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a %NULL entry does not succeed. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_set_mark(&xas, mark); +} +EXPORT_SYMBOL(__xa_set_mark); + +/** + * __xa_clear_mark() - Clear this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_clear_mark(&xas, mark); +} +EXPORT_SYMBOL(__xa_clear_mark); + +/** + * xa_get_mark() - Inquire whether this mark is set on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * This function uses the RCU read lock, so the result may be out of date + * by the time it returns. If you need the result to be stable, use a lock. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: True if the entry at @index has this mark set, false if it doesn't. + */ +bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + entry = xas_start(&xas); + while (xas_get_mark(&xas, mark)) { + if (!xa_is_node(entry)) + goto found; + entry = xas_descend(&xas, xa_to_node(entry)); + } + rcu_read_unlock(); + return false; + found: + rcu_read_unlock(); + return true; +} +EXPORT_SYMBOL(xa_get_mark); + +/** + * xa_set_mark() - Set this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a %NULL entry does not succeed. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_set_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_set_mark); + +/** + * xa_clear_mark() - Clear this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Clearing a mark always succeeds. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_clear_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_clear_mark); + +/** + * xa_find() - Search the XArray for an entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. 
+ * + * Finds the entry in @xa which matches the @filter, and has the lowest + * index that is at least @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may not find + * entries which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry, if found, otherwise %NULL. + */ +void *xa_find(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find); + +static bool xas_sibling(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + unsigned long mask; + + if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node) + return false; + mask = (XA_CHUNK_SIZE << node->shift) - 1; + return (xas->xa_index & mask) > + ((unsigned long)xas->xa_offset << node->shift); +} +/** + * xa_find_after() - Search the XArray for a present entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. + * + * Finds the entry in @xa which matches the @filter and has the lowest + * index that is above @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may miss entries + * which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The pointer, if found, otherwise %NULL. + */ +void *xa_find_after(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp + 1); + void *entry; + + if (xas.xa_index == 0) + return NULL; + + rcu_read_lock(); + for (;;) { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + + if (xas_invalid(&xas)) + break; + if (xas_sibling(&xas)) + continue; + if (!xas_retry(&xas, entry)) + break; + } + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find_after); + +static unsigned int xas_extract_present(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each(xas, entry, max) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n, xa_mark_t mark) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each_marked(xas, entry, max, mark) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * xa_extract() - Copy selected entries from the XArray into a normal array. + * @xa: The source XArray to copy from. + * @dst: The buffer to copy entries into. 
+ * @start: The first index in the XArray eligible to be selected. + * @max: The last index in the XArray eligible to be selected. + * @n: The maximum number of entries to copy. + * @filter: Selection criterion. + * + * Copies up to @n entries that match @filter from the XArray. The + * copied entries will have indices between @start and @max, inclusive. + * + * The @filter may be an XArray mark value, in which case entries which are + * marked with that mark will be copied. It may also be %XA_PRESENT, in + * which case all entries which are not %NULL will be copied. + * + * The entries returned may not represent a snapshot of the XArray at a + * moment in time. For example, if another thread stores to index 5, then + * index 10, calling xa_extract() may return the old contents of index 5 + * and the new contents of index 10. Indices not modified while this + * function is running will not be skipped. + * + * If you need stronger guarantees, holding the xa_lock across calls to this + * function will prevent concurrent modification. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The number of entries copied. + */ +unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t filter) +{ + XA_STATE(xas, xa, start); + + if (!n) + return 0; + + if ((__force unsigned int)filter < XA_MAX_MARKS) + return xas_extract_marked(&xas, dst, max, n, filter); + return xas_extract_present(&xas, dst, max, n); +} +EXPORT_SYMBOL(xa_extract); + +/** + * xa_destroy() - Free all internal data structures. + * @xa: XArray. + * + * After calling this function, the XArray is empty and has freed all memory + * allocated for its internal data structures. You are responsible for + * freeing the objects referenced by the XArray. + * + * Context: Any context. Takes and releases the xa_lock, interrupt-safe. + */ +void xa_destroy(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + unsigned long flags; + void *entry; + + xas.xa_node = NULL; + xas_lock_irqsave(&xas, flags); + entry = xa_head_locked(xa); + RCU_INIT_POINTER(xa->xa_head, NULL); + xas_init_marks(&xas); + if (xa_zero_busy(xa)) + xa_mark_clear(xa, XA_FREE_MARK); + /* lockdep checks we're still holding the lock in xas_free_nodes() */ + if (xa_is_node(entry)) + xas_free_nodes(&xas, xa_to_node(entry)); + xas_unlock_irqrestore(&xas, flags); +} +EXPORT_SYMBOL(xa_destroy); + +#ifdef XA_DEBUG +void xa_dump_node(const struct xa_node *node) +{ + unsigned i, j; + + if (!node) + return; + if ((unsigned long)node & 3) { + pr_cont("node %px\n", node); + return; + } + + pr_cont("node %px %s %d parent %px shift %d count %d values %d " + "array %px list %px %px marks", + node, node->parent ? 
"offset" : "max", node->offset, + node->parent, node->shift, node->count, node->nr_values, + node->array, node->private_list.prev, node->private_list.next); + for (i = 0; i < XA_MAX_MARKS; i++) + for (j = 0; j < XA_MARK_LONGS; j++) + pr_cont(" %lx", node->marks[i][j]); + pr_cont("\n"); +} + +void xa_dump_index(unsigned long index, unsigned int shift) +{ + if (!shift) + pr_info("%lu: ", index); + else if (shift >= BITS_PER_LONG) + pr_info("0-%lu: ", ~0UL); + else + pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); +} + +void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) +{ + if (!entry) + return; + + xa_dump_index(index, shift); + + if (xa_is_node(entry)) { + if (shift == 0) { + pr_cont("%px\n", entry); + } else { + unsigned long i; + struct xa_node *node = xa_to_node(entry); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + xa_dump_entry(node->slots[i], + index + (i << node->shift), node->shift); + } + } else if (xa_is_value(entry)) + pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (!xa_is_internal(entry)) + pr_cont("%px\n", entry); + else if (xa_is_retry(entry)) + pr_cont("retry (%ld)\n", xa_to_internal(entry)); + else if (xa_is_sibling(entry)) + pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); + else if (xa_is_zero(entry)) + pr_cont("zero (%ld)\n", xa_to_internal(entry)); + else + pr_cont("UNKNOWN ENTRY (%px)\n", entry); +} + +void xa_dump(const struct xarray *xa) +{ + void *entry = xa->xa_head; + unsigned int shift = 0; + + pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, + xa->xa_flags, xa_marked(xa, XA_MARK_0), + xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); + if (xa_is_node(entry)) + shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; + xa_dump_entry(entry, 0, shift); +} +#endif + +static void +compat_radix_tree_node_ctor(void *arg) +{ + struct radix_tree_node *node = arg; + + memset(node, 0, sizeof(*node)); + INIT_LIST_HEAD(&node->private_list); +} + +void compat_radix_tree_init(void) +{ + BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); + BUILD_BUG_ON(XA_CHUNK_SIZE > 255); + compat_radix_tree_node_cachep = kmem_cache_create("mlx_compat_radix_tree_node", + sizeof(struct radix_tree_node), 0, + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, + compat_radix_tree_node_ctor); +} +EXPORT_SYMBOL_GPL(compat_radix_tree_init); + +void compat_radix_tree_clean(void) +{ + kmem_cache_destroy(compat_radix_tree_node_cachep); +} +EXPORT_SYMBOL_GPL(compat_radix_tree_clean); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_crc32.c b/src/mlnx-ofa_kernel-5.8/compat/xz_crc32.c new file mode 100644 index 0000000..8f0107a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_crc32.c @@ -0,0 +1,66 @@ +#if !(IS_ENABLED(CONFIG_XZ_DEC)) + +/* + * CRC32 using the polynomial from IEEE-802.3 + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +/* + * This is not the fastest implementation, but it is pretty compact. + * The fastest versions of xz_crc32() on modern CPUs without hardware + * accelerated CRC instruction are 3-5 times as fast as this version, + * but they are bigger and use more memory for the lookup table. + */ + +#include "xz_private.h" + +/* + * STATIC_RW_DATA is used in the pre-boot environment on some architectures. + * See for details. 
+ */ +#ifndef STATIC_RW_DATA +# define STATIC_RW_DATA static +#endif + +#define xz_crc32_table LINUX_BACKPORT(xz_crc32_table) +STATIC_RW_DATA uint32_t xz_crc32_table[256]; + +#define xz_crc32_init LINUX_BACKPORT(xz_crc32_init) +XZ_EXTERN void xz_crc32_init(void) +{ + const uint32_t poly = 0xEDB88320; + + uint32_t i; + uint32_t j; + uint32_t r; + + for (i = 0; i < 256; ++i) { + r = i; + for (j = 0; j < 8; ++j) + r = (r >> 1) ^ (poly & ~((r & 1) - 1)); + + xz_crc32_table[i] = r; + } + + return; +} + +#define xz_crc32 LINUX_BACKPORT(xz_crc32) +XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + + while (size != 0) { + crc = xz_crc32_table[*buf++ ^ (crc & 0xFF)] ^ (crc >> 8); + --size; + } + + return ~crc; +} + +#endif /* !(IS_ENABLED(CONFIG_XZ_DEC)) */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_dec_bcj.c b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_bcj.c new file mode 100644 index 0000000..8146f37 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_bcj.c @@ -0,0 +1,578 @@ +#if !(IS_ENABLED(CONFIG_XZ_DEC)) + +/* + * Branch/Call/Jump (BCJ) filter decoders + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" + +/* + * The rest of the file is inside this ifdef. It makes things a little more + * convenient when building without support for any BCJ filters. + */ +#ifdef XZ_DEC_BCJ + +struct xz_dec_bcj { + /* Type of the BCJ filter being used */ + enum { + BCJ_X86 = 4, /* x86 or x86-64 */ + BCJ_POWERPC = 5, /* Big endian only */ + BCJ_IA64 = 6, /* Big or little endian */ + BCJ_ARM = 7, /* Little endian only */ + BCJ_ARMTHUMB = 8, /* Little endian only */ + BCJ_SPARC = 9 /* Big or little endian */ + } type; + + /* + * Return value of the next filter in the chain. We need to preserve + * this information across calls, because we must not call the next + * filter anymore once it has returned XZ_STREAM_END. + */ + enum xz_ret ret; + + /* True if we are operating in single-call mode. */ + bool single_call; + + /* + * Absolute position relative to the beginning of the uncompressed + * data (in a single .xz Block). We care only about the lowest 32 + * bits so this doesn't need to be uint64_t even with big files. + */ + uint32_t pos; + + /* x86 filter state */ + uint32_t x86_prev_mask; + + /* Temporary space to hold the variables from struct xz_buf */ + uint8_t *out; + size_t out_pos; + size_t out_size; + + struct { + /* Amount of already filtered data in the beginning of buf */ + size_t filtered; + + /* Total amount of data currently stored in buf */ + size_t size; + + /* + * Buffer to hold a mix of filtered and unfiltered data. This + * needs to be big enough to hold Alignment + 2 * Look-ahead: + * + * Type Alignment Look-ahead + * x86 1 4 + * PowerPC 4 0 + * IA-64 16 0 + * ARM 4 0 + * ARM-Thumb 2 2 + * SPARC 4 0 + */ + uint8_t buf[16]; + } temp; +}; + +#ifdef XZ_DEC_X86 +/* + * This is used to test the most significant byte of a memory address + * in an x86 instruction. 
+ */ +static inline int bcj_x86_test_msbyte(uint8_t b) +{ + return b == 0x00 || b == 0xFF; +} + +static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static const bool mask_to_allowed_status[8] + = { true, true, true, false, true, false, false, false }; + + static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; + + size_t i; + size_t prev_pos = (size_t)-1; + uint32_t prev_mask = s->x86_prev_mask; + uint32_t src; + uint32_t dest; + uint32_t j; + uint8_t b; + + if (size <= 4) + return 0; + + size -= 4; + for (i = 0; i < size; ++i) { + if ((buf[i] & 0xFE) != 0xE8) + continue; + + prev_pos = i - prev_pos; + if (prev_pos > 3) { + prev_mask = 0; + } else { + prev_mask = (prev_mask << (prev_pos - 1)) & 7; + if (prev_mask != 0) { + b = buf[i + 4 - mask_to_bit_num[prev_mask]]; + if (!mask_to_allowed_status[prev_mask] + || bcj_x86_test_msbyte(b)) { + prev_pos = i; + prev_mask = (prev_mask << 1) | 1; + continue; + } + } + } + + prev_pos = i; + + if (bcj_x86_test_msbyte(buf[i + 4])) { + src = get_unaligned_le32(buf + i + 1); + while (true) { + dest = src - (s->pos + (uint32_t)i + 5); + if (prev_mask == 0) + break; + + j = mask_to_bit_num[prev_mask] * 8; + b = (uint8_t)(dest >> (24 - j)); + if (!bcj_x86_test_msbyte(b)) + break; + + src = dest ^ (((uint32_t)1 << (32 - j)) - 1); + } + + dest &= 0x01FFFFFF; + dest |= (uint32_t)0 - (dest & 0x01000000); + put_unaligned_le32(dest, buf + i + 1); + i += 4; + } else { + prev_mask = (prev_mask << 1) | 1; + } + } + + prev_pos = i - prev_pos; + s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); + return i; +} +#endif + +#ifdef XZ_DEC_POWERPC +static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr & 0xFC000003) == 0x48000001) { + instr &= 0x03FFFFFC; + instr -= s->pos + (uint32_t)i; + instr &= 0x03FFFFFC; + instr |= 0x48000001; + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_IA64 +static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + static const uint8_t branch_table[32] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 6, 6, 0, 0, 7, 7, + 4, 4, 0, 0, 4, 4, 0, 0 + }; + + /* + * The local variables take a little bit stack space, but it's less + * than what LZMA2 decoder takes, so it doesn't make sense to reduce + * stack usage here without doing that for the LZMA2 decoder too. 
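+	 *
+	 * For reference, each 16-byte IA-64 bundle starts with a 5-bit
+	 * template (buf[i] & 0x1F) followed by three 41-bit instruction
+	 * slots; branch_table[] above maps the template to a bit mask of
+	 * the slots that can hold a branch whose address may need fixing.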
+ */ + + /* Loop counters */ + size_t i; + size_t j; + + /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ + uint32_t slot; + + /* Bitwise offset of the instruction indicated by slot */ + uint32_t bit_pos; + + /* bit_pos split into byte and bit parts */ + uint32_t byte_pos; + uint32_t bit_res; + + /* Address part of an instruction */ + uint32_t addr; + + /* Mask used to detect which instructions to convert */ + uint32_t mask; + + /* 41-bit instruction stored somewhere in the lowest 48 bits */ + uint64_t instr; + + /* Instruction normalized with bit_res for easier manipulation */ + uint64_t norm; + + for (i = 0; i + 16 <= size; i += 16) { + mask = branch_table[buf[i] & 0x1F]; + for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { + if (((mask >> slot) & 1) == 0) + continue; + + byte_pos = bit_pos >> 3; + bit_res = bit_pos & 7; + instr = 0; + for (j = 0; j < 6; ++j) + instr |= (uint64_t)(buf[i + j + byte_pos]) + << (8 * j); + + norm = instr >> bit_res; + + if (((norm >> 37) & 0x0F) == 0x05 + && ((norm >> 9) & 0x07) == 0) { + addr = (norm >> 13) & 0x0FFFFF; + addr |= ((uint32_t)(norm >> 36) & 1) << 20; + addr <<= 4; + addr -= s->pos + (uint32_t)i; + addr >>= 4; + + norm &= ~((uint64_t)0x8FFFFF << 13); + norm |= (uint64_t)(addr & 0x0FFFFF) << 13; + norm |= (uint64_t)(addr & 0x100000) + << (36 - 20); + + instr &= (1 << bit_res) - 1; + instr |= norm << bit_res; + + for (j = 0; j < 6; j++) + buf[i + j + byte_pos] + = (uint8_t)(instr >> (8 * j)); + } + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARM +static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 4) { + if (buf[i + 3] == 0xEB) { + addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) + | ((uint32_t)buf[i + 2] << 16); + addr <<= 2; + addr -= s->pos + (uint32_t)i + 8; + addr >>= 2; + buf[i] = (uint8_t)addr; + buf[i + 1] = (uint8_t)(addr >> 8); + buf[i + 2] = (uint8_t)(addr >> 16); + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_ARMTHUMB +static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t addr; + + for (i = 0; i + 4 <= size; i += 2) { + if ((buf[i + 1] & 0xF8) == 0xF0 + && (buf[i + 3] & 0xF8) == 0xF8) { + addr = (((uint32_t)buf[i + 1] & 0x07) << 19) + | ((uint32_t)buf[i] << 11) + | (((uint32_t)buf[i + 3] & 0x07) << 8) + | (uint32_t)buf[i + 2]; + addr <<= 1; + addr -= s->pos + (uint32_t)i + 4; + addr >>= 1; + buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); + buf[i] = (uint8_t)(addr >> 11); + buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); + buf[i + 2] = (uint8_t)addr; + i += 2; + } + } + + return i; +} +#endif + +#ifdef XZ_DEC_SPARC +static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + + for (i = 0; i + 4 <= size; i += 4) { + instr = get_unaligned_be32(buf + i); + if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { + instr <<= 2; + instr -= s->pos + (uint32_t)i; + instr >>= 2; + instr = ((uint32_t)0x40000000 - (instr & 0x400000)) + | 0x40000000 | (instr & 0x3FFFFF); + put_unaligned_be32(instr, buf + i); + } + } + + return i; +} +#endif + +/* + * Apply the selected BCJ filter. Update *pos and s->pos to match the amount + * of data that got filtered. + * + * NOTE: This is implemented as a switch statement to avoid using function + * pointers, which could be problematic in the kernel boot code, which must + * avoid pointers to static data (at least on x86). 
+ */ +static void bcj_apply(struct xz_dec_bcj *s, + uint8_t *buf, size_t *pos, size_t size) +{ + size_t filtered; + + buf += *pos; + size -= *pos; + + switch (s->type) { +#ifdef XZ_DEC_X86 + case BCJ_X86: + filtered = bcj_x86(s, buf, size); + break; +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: + filtered = bcj_powerpc(s, buf, size); + break; +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: + filtered = bcj_ia64(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: + filtered = bcj_arm(s, buf, size); + break; +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: + filtered = bcj_armthumb(s, buf, size); + break; +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: + filtered = bcj_sparc(s, buf, size); + break; +#endif + default: + /* Never reached but silence compiler warnings. */ + filtered = 0; + break; + } + + *pos += filtered; + s->pos += filtered; +} + +/* + * Flush pending filtered data from temp to the output buffer. + * Move the remaining mixture of possibly filtered and unfiltered + * data to the beginning of temp. + */ +static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) +{ + size_t copy_size; + + copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); + memcpy(b->out + b->out_pos, s->temp.buf, copy_size); + b->out_pos += copy_size; + + s->temp.filtered -= copy_size; + s->temp.size -= copy_size; + memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); +} + +/* + * The BCJ filter functions are primitive in sense that they process the + * data in chunks of 1-16 bytes. To hide this issue, this function does + * some buffering. + */ +XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b) +{ + size_t out_start; + + /* + * Flush pending already filtered data to the output buffer. Return + * immediatelly if we couldn't flush everything, or if the next + * filter in the chain had already returned XZ_STREAM_END. + */ + if (s->temp.filtered > 0) { + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + } + + /* + * If we have more output space than what is currently pending in + * temp, copy the unfiltered data from temp to the output buffer + * and try to fill the output buffer by decoding more data from the + * next filter in the chain. Apply the BCJ filter on the new data + * in the output buffer. If everything cannot be filtered, copy it + * to temp and rewind the output buffer position accordingly. + * + * This needs to be always run when temp.size == 0 to handle a special + * case where the output buffer is full and the next filter has no + * more output coming but hasn't returned XZ_STREAM_END yet. + */ + if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) { + out_start = b->out_pos; + memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); + b->out_pos += s->temp.size; + + s->ret = xz_dec_lzma2_run(lzma2, b); + if (s->ret != XZ_STREAM_END + && (s->ret != XZ_OK || s->single_call)) + return s->ret; + + bcj_apply(s, b->out, &out_start, b->out_pos); + + /* + * As an exception, if the next filter returned XZ_STREAM_END, + * we can do that too, since the last few bytes that remain + * unfiltered are meant to remain unfiltered. 
+ */ + if (s->ret == XZ_STREAM_END) + return XZ_STREAM_END; + + s->temp.size = b->out_pos - out_start; + b->out_pos -= s->temp.size; + memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); + + /* + * If there wasn't enough input to the next filter to fill + * the output buffer with unfiltered data, there's no point + * to try decoding more data to temp. + */ + if (b->out_pos + s->temp.size < b->out_size) + return XZ_OK; + } + + /* + * We have unfiltered data in temp. If the output buffer isn't full + * yet, try to fill the temp buffer by decoding more data from the + * next filter. Apply the BCJ filter on temp. Then we hopefully can + * fill the actual output buffer by copying filtered data from temp. + * A mix of filtered and unfiltered data may be left in temp; it will + * be taken care on the next call to this function. + */ + if (b->out_pos < b->out_size) { + /* Make b->out{,_pos,_size} temporarily point to s->temp. */ + s->out = b->out; + s->out_pos = b->out_pos; + s->out_size = b->out_size; + b->out = s->temp.buf; + b->out_pos = s->temp.size; + b->out_size = sizeof(s->temp.buf); + + s->ret = xz_dec_lzma2_run(lzma2, b); + + s->temp.size = b->out_pos; + b->out = s->out; + b->out_pos = s->out_pos; + b->out_size = s->out_size; + + if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) + return s->ret; + + bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); + + /* + * If the next filter returned XZ_STREAM_END, we mark that + * everything is filtered, since the last unfiltered bytes + * of the stream are meant to be left as is. + */ + if (s->ret == XZ_STREAM_END) + s->temp.filtered = s->temp.size; + + bcj_flush(s, b); + if (s->temp.filtered > 0) + return XZ_OK; + } + + return s->ret; +} + +XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) +{ + struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s != NULL) + s->single_call = single_call; + + return s; +} + +XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) +{ + switch (id) { +#ifdef XZ_DEC_X86 + case BCJ_X86: +#endif +#ifdef XZ_DEC_POWERPC + case BCJ_POWERPC: +#endif +#ifdef XZ_DEC_IA64 + case BCJ_IA64: +#endif +#ifdef XZ_DEC_ARM + case BCJ_ARM: +#endif +#ifdef XZ_DEC_ARMTHUMB + case BCJ_ARMTHUMB: +#endif +#ifdef XZ_DEC_SPARC + case BCJ_SPARC: +#endif + break; + + default: + /* Unsupported Filter ID */ + return XZ_OPTIONS_ERROR; + } + + s->type = id; + s->ret = XZ_OK; + s->pos = 0; + s->x86_prev_mask = 0; + s->temp.filtered = 0; + s->temp.size = 0; + + return XZ_OK; +} + +#endif + +#endif /* !(IS_ENABLED(CONFIG_XZ_DEC)) */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_dec_lzma2.c b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_lzma2.c new file mode 100644 index 0000000..0e6819c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_lzma2.c @@ -0,0 +1,1179 @@ +#if !(IS_ENABLED(CONFIG_XZ_DEC)) + +/* + * LZMA2 decoder + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#include "xz_private.h" +#include "xz_lzma2.h" + +/* + * Range decoder initialization eats the first five bytes of each LZMA chunk. + */ +#define RC_INIT_BYTES 5 + +/* + * Minimum number of usable input buffer to safely decode one LZMA symbol. + * The worst case is that we decode 22 bits using probabilities and 26 + * direct bits. This may decode at maximum of 20 bytes of input. However, + * lzma_main() does an extra normalization before returning, thus we + * need to put 21 here. 
+ */ +#define LZMA_IN_REQUIRED 21 + +/* + * Dictionary (history buffer) + * + * These are always true: + * start <= pos <= full <= end + * pos <= limit <= end + * + * In multi-call mode, also these are true: + * end == size + * size <= size_max + * allocated <= size + * + * Most of these variables are size_t to support single-call mode, + * in which the dictionary variables address the actual output + * buffer directly. + */ +struct dictionary { + /* Beginning of the history buffer */ + uint8_t *buf; + + /* Old position in buf (before decoding more data) */ + size_t start; + + /* Position in buf */ + size_t pos; + + /* + * How full dictionary is. This is used to detect corrupt input that + * would read beyond the beginning of the uncompressed stream. + */ + size_t full; + + /* Write limit; we don't write to buf[limit] or later bytes. */ + size_t limit; + + /* + * End of the dictionary buffer. In multi-call mode, this is + * the same as the dictionary size. In single-call mode, this + * indicates the size of the output buffer. + */ + size_t end; + + /* + * Size of the dictionary as specified in Block Header. This is used + * together with "full" to detect corrupt input that would make us + * read beyond the beginning of the uncompressed stream. + */ + uint32_t size; + + /* + * Maximum allowed dictionary size in multi-call mode. + * This is ignored in single-call mode. + */ + uint32_t size_max; + + /* + * Amount of memory currently allocated for the dictionary. + * This is used only with XZ_DYNALLOC. (With XZ_PREALLOC, + * size_max is always the same as the allocated size.) + */ + uint32_t allocated; + + /* Operation mode */ + enum xz_mode mode; +}; + +/* Range decoder */ +struct rc_dec { + uint32_t range; + uint32_t code; + + /* + * Number of initializing bytes remaining to be read + * by rc_read_init(). + */ + uint32_t init_bytes_left; + + /* + * Buffer from which we read our input. It can be either + * temp.buf or the caller-provided input buffer. + */ + const uint8_t *in; + size_t in_pos; + size_t in_limit; +}; + +/* Probabilities for a length decoder. */ +struct lzma_len_dec { + /* Probability of match length being at least 10 */ + uint16_t choice; + + /* Probability of match length being at least 18 */ + uint16_t choice2; + + /* Probabilities for match lengths 2-9 */ + uint16_t low[POS_STATES_MAX][LEN_LOW_SYMBOLS]; + + /* Probabilities for match lengths 10-17 */ + uint16_t mid[POS_STATES_MAX][LEN_MID_SYMBOLS]; + + /* Probabilities for match lengths 18-273 */ + uint16_t high[LEN_HIGH_SYMBOLS]; +}; + +struct lzma_dec { + /* Distances of latest four matches */ + uint32_t rep0; + uint32_t rep1; + uint32_t rep2; + uint32_t rep3; + + /* Types of the most recently seen LZMA symbols */ + enum lzma_state state; + + /* + * Length of a match. This is updated so that dict_repeat can + * be called again to finish repeating the whole match. + */ + uint32_t len; + + /* + * LZMA properties or related bit masks (number of literal + * context bits, a mask dervied from the number of literal + * position bits, and a mask dervied from the number + * position bits) + */ + uint32_t lc; + uint32_t literal_pos_mask; /* (1 << lp) - 1 */ + uint32_t pos_mask; /* (1 << pb) - 1 */ + + /* If 1, it's a match. Otherwise it's a single 8-bit literal. */ + uint16_t is_match[STATES][POS_STATES_MAX]; + + /* If 1, it's a repeated match. The distance is one of rep0 .. rep3. */ + uint16_t is_rep[STATES]; + + /* + * If 0, distance of a repeated match is rep0. + * Otherwise check is_rep1. 
+ */ + uint16_t is_rep0[STATES]; + + /* + * If 0, distance of a repeated match is rep1. + * Otherwise check is_rep2. + */ + uint16_t is_rep1[STATES]; + + /* If 0, distance of a repeated match is rep2. Otherwise it is rep3. */ + uint16_t is_rep2[STATES]; + + /* + * If 1, the repeated match has length of one byte. Otherwise + * the length is decoded from rep_len_decoder. + */ + uint16_t is_rep0_long[STATES][POS_STATES_MAX]; + + /* + * Probability tree for the highest two bits of the match + * distance. There is a separate probability tree for match + * lengths of 2 (i.e. MATCH_LEN_MIN), 3, 4, and [5, 273]. + */ + uint16_t dist_slot[DIST_STATES][DIST_SLOTS]; + + /* + * Probility trees for additional bits for match distance + * when the distance is in the range [4, 127]. + */ + uint16_t dist_special[FULL_DISTANCES - DIST_MODEL_END]; + + /* + * Probability tree for the lowest four bits of a match + * distance that is equal to or greater than 128. + */ + uint16_t dist_align[ALIGN_SIZE]; + + /* Length of a normal match */ + struct lzma_len_dec match_len_dec; + + /* Length of a repeated match */ + struct lzma_len_dec rep_len_dec; + + /* Probabilities of literals */ + uint16_t literal[LITERAL_CODERS_MAX][LITERAL_CODER_SIZE]; +}; + +struct lzma2_dec { + /* Position in xz_dec_lzma2_run(). */ + enum lzma2_seq { + SEQ_CONTROL, + SEQ_UNCOMPRESSED_1, + SEQ_UNCOMPRESSED_2, + SEQ_COMPRESSED_0, + SEQ_COMPRESSED_1, + SEQ_PROPERTIES, + SEQ_LZMA_PREPARE, + SEQ_LZMA_RUN, + SEQ_COPY + } sequence; + + /* Next position after decoding the compressed size of the chunk. */ + enum lzma2_seq next_sequence; + + /* Uncompressed size of LZMA chunk (2 MiB at maximum) */ + uint32_t uncompressed; + + /* + * Compressed size of LZMA chunk or compressed/uncompressed + * size of uncompressed chunk (64 KiB at maximum) + */ + uint32_t compressed; + + /* + * True if dictionary reset is needed. This is false before + * the first chunk (LZMA or uncompressed). + */ + bool need_dict_reset; + + /* + * True if new LZMA properties are needed. This is false + * before the first LZMA chunk. + */ + bool need_props; +}; + +struct xz_dec_lzma2 { + /* + * The order below is important on x86 to reduce code size and + * it shouldn't hurt on other platforms. Everything up to and + * including lzma.pos_mask are in the first 128 bytes on x86-32, + * which allows using smaller instructions to access those + * variables. On x86-64, fewer variables fit into the first 128 + * bytes, but this is still the best order without sacrificing + * the readability by splitting the structures. + */ + struct rc_dec rc; + struct dictionary dict; + struct lzma2_dec lzma2; + struct lzma_dec lzma; + + /* + * Temporary buffer which holds small number of input bytes between + * decoder calls. See lzma2_lzma() for details. + */ + struct { + uint32_t size; + uint8_t buf[3 * LZMA_IN_REQUIRED]; + } temp; +}; + +/************** + * Dictionary * + **************/ + +/* + * Reset the dictionary state. When in single-call mode, set up the beginning + * of the dictionary to point to the actual output buffer. 
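+ *
+ * In other words: with DEC_IS_SINGLE() the "dictionary" is just an alias
+ * for the caller's output buffer, so decoded bytes land in place and
+ * dict_flush() has nothing to copy. In multi-call mode dict->buf is a
+ * separately allocated history buffer and dict_flush() copies each newly
+ * decoded span out to b->out.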
+ */ +static void dict_reset(struct dictionary *dict, struct xz_buf *b) +{ + if (DEC_IS_SINGLE(dict->mode)) { + dict->buf = b->out + b->out_pos; + dict->end = b->out_size - b->out_pos; + } + + dict->start = 0; + dict->pos = 0; + dict->limit = 0; + dict->full = 0; +} + +/* Set dictionary write limit */ +static void dict_limit(struct dictionary *dict, size_t out_max) +{ + if (dict->end - dict->pos <= out_max) + dict->limit = dict->end; + else + dict->limit = dict->pos + out_max; +} + +/* Return true if at least one byte can be written into the dictionary. */ +static inline bool dict_has_space(const struct dictionary *dict) +{ + return dict->pos < dict->limit; +} + +/* + * Get a byte from the dictionary at the given distance. The distance is + * assumed to valid, or as a special case, zero when the dictionary is + * still empty. This special case is needed for single-call decoding to + * avoid writing a '\0' to the end of the destination buffer. + */ +static inline uint32_t dict_get(const struct dictionary *dict, uint32_t dist) +{ + size_t offset = dict->pos - dist - 1; + + if (dist >= dict->pos) + offset += dict->end; + + return dict->full > 0 ? dict->buf[offset] : 0; +} + +/* + * Put one byte into the dictionary. It is assumed that there is space for it. + */ +static inline void dict_put(struct dictionary *dict, uint8_t byte) +{ + dict->buf[dict->pos++] = byte; + + if (dict->full < dict->pos) + dict->full = dict->pos; +} + +/* + * Repeat given number of bytes from the given distance. If the distance is + * invalid, false is returned. On success, true is returned and *len is + * updated to indicate how many bytes were left to be repeated. + */ +static bool dict_repeat(struct dictionary *dict, uint32_t *len, uint32_t dist) +{ + size_t back; + uint32_t left; + + if (dist >= dict->full || dist >= dict->size) + return false; + + left = min_t(size_t, dict->limit - dict->pos, *len); + *len -= left; + + back = dict->pos - dist - 1; + if (dist >= dict->pos) + back += dict->end; + + do { + dict->buf[dict->pos++] = dict->buf[back++]; + if (back == dict->end) + back = 0; + } while (--left > 0); + + if (dict->full < dict->pos) + dict->full = dict->pos; + + return true; +} + +/* Copy uncompressed data as is from input to dictionary and output buffers. */ +static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, + uint32_t *left) +{ + size_t copy_size; + + while (*left > 0 && b->in_pos < b->in_size + && b->out_pos < b->out_size) { + copy_size = min(b->in_size - b->in_pos, + b->out_size - b->out_pos); + if (copy_size > dict->end - dict->pos) + copy_size = dict->end - dict->pos; + if (copy_size > *left) + copy_size = *left; + + *left -= copy_size; + + memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); + dict->pos += copy_size; + + if (dict->full < dict->pos) + dict->full = dict->pos; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, b->in + b->in_pos, + copy_size); + } + + dict->start = dict->pos; + + b->out_pos += copy_size; + b->in_pos += copy_size; + } +} + +/* + * Flush pending data from dictionary to b->out. It is assumed that there is + * enough space in b->out. This is guaranteed because caller uses dict_limit() + * before decoding data into the dictionary. 
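+ *
+ * Note that in single-call mode the memcpy() below is skipped entirely;
+ * dict->buf already is the output buffer, so only the positions are
+ * advanced and the number of newly produced bytes is returned.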
+ */ +static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) +{ + size_t copy_size = dict->pos - dict->start; + + if (DEC_IS_MULTI(dict->mode)) { + if (dict->pos == dict->end) + dict->pos = 0; + + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); + } + + dict->start = dict->pos; + b->out_pos += copy_size; + return copy_size; +} + +/***************** + * Range decoder * + *****************/ + +/* Reset the range decoder. */ +static void rc_reset(struct rc_dec *rc) +{ + rc->range = (uint32_t)-1; + rc->code = 0; + rc->init_bytes_left = RC_INIT_BYTES; +} + +/* + * Read the first five initial bytes into rc->code if they haven't been + * read already. (Yes, the first byte gets completely ignored.) + */ +static bool rc_read_init(struct rc_dec *rc, struct xz_buf *b) +{ + while (rc->init_bytes_left > 0) { + if (b->in_pos == b->in_size) + return false; + + rc->code = (rc->code << 8) + b->in[b->in_pos++]; + --rc->init_bytes_left; + } + + return true; +} + +/* Return true if there may not be enough input for the next decoding loop. */ +static inline bool rc_limit_exceeded(const struct rc_dec *rc) +{ + return rc->in_pos > rc->in_limit; +} + +/* + * Return true if it is possible (from point of view of range decoder) that + * we have reached the end of the LZMA chunk. + */ +static inline bool rc_is_finished(const struct rc_dec *rc) +{ + return rc->code == 0; +} + +/* Read the next input byte if needed. */ +static __always_inline void rc_normalize(struct rc_dec *rc) +{ + if (rc->range < RC_TOP_VALUE) { + rc->range <<= RC_SHIFT_BITS; + rc->code = (rc->code << RC_SHIFT_BITS) + rc->in[rc->in_pos++]; + } +} + +/* + * Decode one bit. In some versions, this function has been splitted in three + * functions so that the compiler is supposed to be able to more easily avoid + * an extra branch. In this particular version of the LZMA decoder, this + * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 + * on x86). Using a non-splitted version results in nicer looking code too. + * + * NOTE: This must return an int. Do not make it return a bool or the speed + * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, + * and it generates 10-20 % faster code than GCC 3.x from this file anyway.) + */ +static __always_inline int rc_bit(struct rc_dec *rc, uint16_t *prob) +{ + uint32_t bound; + int bit; + + rc_normalize(rc); + bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * *prob; + if (rc->code < bound) { + rc->range = bound; + *prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS; + bit = 0; + } else { + rc->range -= bound; + rc->code -= bound; + *prob -= *prob >> RC_MOVE_BITS; + bit = 1; + } + + return bit; +} + +/* Decode a bittree starting from the most significant bit. */ +static __always_inline uint32_t rc_bittree(struct rc_dec *rc, + uint16_t *probs, uint32_t limit) +{ + uint32_t symbol = 1; + + do { + if (rc_bit(rc, &probs[symbol])) + symbol = (symbol << 1) + 1; + else + symbol <<= 1; + } while (symbol < limit); + + return symbol; +} + +/* Decode a bittree starting from the least significant bit. 
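+ *
+ * Both bittree helpers walk the probability array by using the bits
+ * decoded so far as the index: "symbol" starts at 1 and each new bit is
+ * shifted in, so probs[symbol] is always the node for the current path.
+ * The MSB-first variant above returns a value in [limit, 2 * limit) and
+ * its callers subtract limit (or, for literals, simply truncate to eight
+ * bits); this LSB-first variant instead adds each decoded bit into *dest
+ * at increasing bit positions, as needed for the low bits of match
+ * distances and for the align bits.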
*/ +static __always_inline void rc_bittree_reverse(struct rc_dec *rc, + uint16_t *probs, + uint32_t *dest, uint32_t limit) +{ + uint32_t symbol = 1; + uint32_t i = 0; + + do { + if (rc_bit(rc, &probs[symbol])) { + symbol = (symbol << 1) + 1; + *dest += 1 << i; + } else { + symbol <<= 1; + } + } while (++i < limit); +} + +/* Decode direct bits (fixed fifty-fifty probability) */ +static inline void rc_direct(struct rc_dec *rc, uint32_t *dest, uint32_t limit) +{ + uint32_t mask; + + do { + rc_normalize(rc); + rc->range >>= 1; + rc->code -= rc->range; + mask = (uint32_t)0 - (rc->code >> 31); + rc->code += rc->range & mask; + *dest = (*dest << 1) + (mask + 1); + } while (--limit > 0); +} + +/******** + * LZMA * + ********/ + +/* Get pointer to literal coder probability array. */ +static uint16_t *lzma_literal_probs(struct xz_dec_lzma2 *s) +{ + uint32_t prev_byte = dict_get(&s->dict, 0); + uint32_t low = prev_byte >> (8 - s->lzma.lc); + uint32_t high = (s->dict.pos & s->lzma.literal_pos_mask) << s->lzma.lc; + return s->lzma.literal[low + high]; +} + +/* Decode a literal (one 8-bit byte) */ +static void lzma_literal(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + uint32_t symbol; + uint32_t match_byte; + uint32_t match_bit; + uint32_t offset; + uint32_t i; + + probs = lzma_literal_probs(s); + + if (lzma_state_is_literal(s->lzma.state)) { + symbol = rc_bittree(&s->rc, probs, 0x100); + } else { + symbol = 1; + match_byte = dict_get(&s->dict, s->lzma.rep0) << 1; + offset = 0x100; + + do { + match_bit = match_byte & offset; + match_byte <<= 1; + i = offset + match_bit + symbol; + + if (rc_bit(&s->rc, &probs[i])) { + symbol = (symbol << 1) + 1; + offset &= match_bit; + } else { + symbol <<= 1; + offset &= ~match_bit; + } + } while (symbol < 0x100); + } + + dict_put(&s->dict, (uint8_t)symbol); + lzma_state_literal(&s->lzma.state); +} + +/* Decode the length of the match into s->lzma.len. */ +static void lzma_len(struct xz_dec_lzma2 *s, struct lzma_len_dec *l, + uint32_t pos_state) +{ + uint16_t *probs; + uint32_t limit; + + if (!rc_bit(&s->rc, &l->choice)) { + probs = l->low[pos_state]; + limit = LEN_LOW_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN; + } else { + if (!rc_bit(&s->rc, &l->choice2)) { + probs = l->mid[pos_state]; + limit = LEN_MID_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS; + } else { + probs = l->high; + limit = LEN_HIGH_SYMBOLS; + s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS + + LEN_MID_SYMBOLS; + } + } + + s->lzma.len += rc_bittree(&s->rc, probs, limit) - limit; +} + +/* Decode a match. The distance will be stored in s->lzma.rep0. 
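+ *
+ * The distance is coded as a six-bit "distance slot" plus extra bits:
+ * slots 0-3 are the distance itself, while for larger slots the distance
+ * starts as (2 | (slot & 1)) << ((slot >> 1) - 1) and the missing low
+ * bits come from dist_special[] (reverse bittree) for slots below
+ * DIST_MODEL_END, or from direct bits plus the four dist_align[] bits
+ * for the big distances. For example, slot 7 covers distances 12-15:
+ * base (2 | 1) << 2 = 12 plus two low bits.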
*/ +static void lzma_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint16_t *probs; + uint32_t dist_slot; + uint32_t limit; + + lzma_state_match(&s->lzma.state); + + s->lzma.rep3 = s->lzma.rep2; + s->lzma.rep2 = s->lzma.rep1; + s->lzma.rep1 = s->lzma.rep0; + + lzma_len(s, &s->lzma.match_len_dec, pos_state); + + probs = s->lzma.dist_slot[lzma_get_dist_state(s->lzma.len)]; + dist_slot = rc_bittree(&s->rc, probs, DIST_SLOTS) - DIST_SLOTS; + + if (dist_slot < DIST_MODEL_START) { + s->lzma.rep0 = dist_slot; + } else { + limit = (dist_slot >> 1) - 1; + s->lzma.rep0 = 2 + (dist_slot & 1); + + if (dist_slot < DIST_MODEL_END) { + s->lzma.rep0 <<= limit; + probs = s->lzma.dist_special + s->lzma.rep0 + - dist_slot - 1; + rc_bittree_reverse(&s->rc, probs, + &s->lzma.rep0, limit); + } else { + rc_direct(&s->rc, &s->lzma.rep0, limit - ALIGN_BITS); + s->lzma.rep0 <<= ALIGN_BITS; + rc_bittree_reverse(&s->rc, s->lzma.dist_align, + &s->lzma.rep0, ALIGN_BITS); + } + } +} + +/* + * Decode a repeated match. The distance is one of the four most recently + * seen matches. The distance will be stored in s->lzma.rep0. + */ +static void lzma_rep_match(struct xz_dec_lzma2 *s, uint32_t pos_state) +{ + uint32_t tmp; + + if (!rc_bit(&s->rc, &s->lzma.is_rep0[s->lzma.state])) { + if (!rc_bit(&s->rc, &s->lzma.is_rep0_long[ + s->lzma.state][pos_state])) { + lzma_state_short_rep(&s->lzma.state); + s->lzma.len = 1; + return; + } + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep1[s->lzma.state])) { + tmp = s->lzma.rep1; + } else { + if (!rc_bit(&s->rc, &s->lzma.is_rep2[s->lzma.state])) { + tmp = s->lzma.rep2; + } else { + tmp = s->lzma.rep3; + s->lzma.rep3 = s->lzma.rep2; + } + + s->lzma.rep2 = s->lzma.rep1; + } + + s->lzma.rep1 = s->lzma.rep0; + s->lzma.rep0 = tmp; + } + + lzma_state_long_rep(&s->lzma.state); + lzma_len(s, &s->lzma.rep_len_dec, pos_state); +} + +/* LZMA decoder core */ +static bool lzma_main(struct xz_dec_lzma2 *s) +{ + uint32_t pos_state; + + /* + * If the dictionary was reached during the previous call, try to + * finish the possibly pending repeat in the dictionary. + */ + if (dict_has_space(&s->dict) && s->lzma.len > 0) + dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0); + + /* + * Decode more LZMA symbols. One iteration may consume up to + * LZMA_IN_REQUIRED - 1 bytes. + */ + while (dict_has_space(&s->dict) && !rc_limit_exceeded(&s->rc)) { + pos_state = s->dict.pos & s->lzma.pos_mask; + + if (!rc_bit(&s->rc, &s->lzma.is_match[ + s->lzma.state][pos_state])) { + lzma_literal(s); + } else { + if (rc_bit(&s->rc, &s->lzma.is_rep[s->lzma.state])) + lzma_rep_match(s, pos_state); + else + lzma_match(s, pos_state); + + if (!dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0)) + return false; + } + } + + /* + * Having the range decoder always normalized when we are outside + * this function makes it easier to correctly handle end of the chunk. + */ + rc_normalize(&s->rc); + + return true; +} + +/* + * Reset the LZMA decoder and range decoder state. Dictionary is nore reset + * here, because LZMA state may be reset without resetting the dictionary. + */ +static void lzma_reset(struct xz_dec_lzma2 *s) +{ + uint16_t *probs; + size_t i; + + s->lzma.state = STATE_LIT_LIT; + s->lzma.rep0 = 0; + s->lzma.rep1 = 0; + s->lzma.rep2 = 0; + s->lzma.rep3 = 0; + + /* + * All probabilities are initialized to the same value. This hack + * makes the code smaller by avoiding a separate loop for each + * probability array. 
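+	 * Each probability starts at RC_BIT_MODEL_TOTAL / 2 (1024 out
+	 * of 2048), i.e. an even 50/50 guess for every bit.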
+ * + * This could be optimized so that only that part of literal + * probabilities that are actually required. In the common case + * we would write 12 KiB less. + */ + probs = s->lzma.is_match[0]; + for (i = 0; i < PROBS_TOTAL; ++i) + probs[i] = RC_BIT_MODEL_TOTAL / 2; + + rc_reset(&s->rc); +} + +/* + * Decode and validate LZMA properties (lc/lp/pb) and calculate the bit masks + * from the decoded lp and pb values. On success, the LZMA decoder state is + * reset and true is returned. + */ +static bool lzma_props(struct xz_dec_lzma2 *s, uint8_t props) +{ + if (props > (4 * 5 + 4) * 9 + 8) + return false; + + s->lzma.pos_mask = 0; + while (props >= 9 * 5) { + props -= 9 * 5; + ++s->lzma.pos_mask; + } + + s->lzma.pos_mask = (1 << s->lzma.pos_mask) - 1; + + s->lzma.literal_pos_mask = 0; + while (props >= 9) { + props -= 9; + ++s->lzma.literal_pos_mask; + } + + s->lzma.lc = props; + + if (s->lzma.lc + s->lzma.literal_pos_mask > 4) + return false; + + s->lzma.literal_pos_mask = (1 << s->lzma.literal_pos_mask) - 1; + + lzma_reset(s); + + return true; +} + +/********* + * LZMA2 * + *********/ + +/* + * The LZMA decoder assumes that if the input limit (s->rc.in_limit) hasn't + * been exceeded, it is safe to read up to LZMA_IN_REQUIRED bytes. This + * wrapper function takes care of making the LZMA decoder's assumption safe. + * + * As long as there is plenty of input left to be decoded in the current LZMA + * chunk, we decode directly from the caller-supplied input buffer until + * there's LZMA_IN_REQUIRED bytes left. Those remaining bytes are copied into + * s->temp.buf, which (hopefully) gets filled on the next call to this + * function. We decode a few bytes from the temporary buffer so that we can + * continue decoding from the caller-supplied input buffer again. 
+ */ +static bool lzma2_lzma(struct xz_dec_lzma2 *s, struct xz_buf *b) +{ + size_t in_avail; + uint32_t tmp; + + in_avail = b->in_size - b->in_pos; + if (s->temp.size > 0 || s->lzma2.compressed == 0) { + tmp = 2 * LZMA_IN_REQUIRED - s->temp.size; + if (tmp > s->lzma2.compressed - s->temp.size) + tmp = s->lzma2.compressed - s->temp.size; + if (tmp > in_avail) + tmp = in_avail; + + memcpy(s->temp.buf + s->temp.size, b->in + b->in_pos, tmp); + + if (s->temp.size + tmp == s->lzma2.compressed) { + memzero(s->temp.buf + s->temp.size + tmp, + sizeof(s->temp.buf) + - s->temp.size - tmp); + s->rc.in_limit = s->temp.size + tmp; + } else if (s->temp.size + tmp < LZMA_IN_REQUIRED) { + s->temp.size += tmp; + b->in_pos += tmp; + return true; + } else { + s->rc.in_limit = s->temp.size + tmp - LZMA_IN_REQUIRED; + } + + s->rc.in = s->temp.buf; + s->rc.in_pos = 0; + + if (!lzma_main(s) || s->rc.in_pos > s->temp.size + tmp) + return false; + + s->lzma2.compressed -= s->rc.in_pos; + + if (s->rc.in_pos < s->temp.size) { + s->temp.size -= s->rc.in_pos; + memmove(s->temp.buf, s->temp.buf + s->rc.in_pos, + s->temp.size); + return true; + } + + b->in_pos += s->rc.in_pos - s->temp.size; + s->temp.size = 0; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail >= LZMA_IN_REQUIRED) { + s->rc.in = b->in; + s->rc.in_pos = b->in_pos; + + if (in_avail >= s->lzma2.compressed + LZMA_IN_REQUIRED) + s->rc.in_limit = b->in_pos + s->lzma2.compressed; + else + s->rc.in_limit = b->in_size - LZMA_IN_REQUIRED; + + if (!lzma_main(s)) + return false; + + in_avail = s->rc.in_pos - b->in_pos; + if (in_avail > s->lzma2.compressed) + return false; + + s->lzma2.compressed -= in_avail; + b->in_pos = s->rc.in_pos; + } + + in_avail = b->in_size - b->in_pos; + if (in_avail < LZMA_IN_REQUIRED) { + if (in_avail > s->lzma2.compressed) + in_avail = s->lzma2.compressed; + + memcpy(s->temp.buf, b->in + b->in_pos, in_avail); + s->temp.size = in_avail; + b->in_pos += in_avail; + } + + return true; +} + +/* + * Take care of the LZMA2 control layer, and forward the job of actual LZMA + * decoding or copying of uncompressed chunks to other functions. + */ +XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b) +{ + uint32_t tmp; + + while (b->in_pos < b->in_size || s->lzma2.sequence == SEQ_LZMA_RUN) { + switch (s->lzma2.sequence) { + case SEQ_CONTROL: + /* + * LZMA2 control byte + * + * Exact values: + * 0x00 End marker + * 0x01 Dictionary reset followed by + * an uncompressed chunk + * 0x02 Uncompressed chunk (no dictionary reset) + * + * Highest three bits (s->control & 0xE0): + * 0xE0 Dictionary reset, new properties and state + * reset, followed by LZMA compressed chunk + * 0xC0 New properties and state reset, followed + * by LZMA compressed chunk (no dictionary + * reset) + * 0xA0 State reset using old properties, + * followed by LZMA compressed chunk (no + * dictionary reset) + * 0x80 LZMA chunk (no dictionary or state reset) + * + * For LZMA compressed chunks, the lowest five bits + * (s->control & 1F) are the highest bits of the + * uncompressed size (bits 16-20). + * + * A new LZMA2 stream must begin with a dictionary + * reset. The first LZMA chunk must set new + * properties and reset the LZMA state. + * + * Values that don't match anything described above + * are invalid and we return XZ_DATA_ERROR. 
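+			 *
+			 * Worked example (field parsing only, not a
+			 * complete stream): the chunk header
+			 * e0 00 00 00 04 5d means "dictionary reset, new
+			 * properties, state reset", uncompressed size
+			 * ((0xe0 & 0x1f) << 16) + (0x00 << 8) + 0x00 + 1
+			 * = 1, compressed size (0x00 << 8) + 0x04 + 1 = 5,
+			 * and LZMA properties byte 0x5d (lc=3, lp=0, pb=2).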
+ */ + tmp = b->in[b->in_pos++]; + + if (tmp == 0x00) + return XZ_STREAM_END; + + if (tmp >= 0xE0 || tmp == 0x01) { + s->lzma2.need_props = true; + s->lzma2.need_dict_reset = false; + dict_reset(&s->dict, b); + } else if (s->lzma2.need_dict_reset) { + return XZ_DATA_ERROR; + } + + if (tmp >= 0x80) { + s->lzma2.uncompressed = (tmp & 0x1F) << 16; + s->lzma2.sequence = SEQ_UNCOMPRESSED_1; + + if (tmp >= 0xC0) { + /* + * When there are new properties, + * state reset is done at + * SEQ_PROPERTIES. + */ + s->lzma2.need_props = false; + s->lzma2.next_sequence + = SEQ_PROPERTIES; + + } else if (s->lzma2.need_props) { + return XZ_DATA_ERROR; + + } else { + s->lzma2.next_sequence + = SEQ_LZMA_PREPARE; + if (tmp >= 0xA0) + lzma_reset(s); + } + } else { + if (tmp > 0x02) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_COMPRESSED_0; + s->lzma2.next_sequence = SEQ_COPY; + } + + break; + + case SEQ_UNCOMPRESSED_1: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_UNCOMPRESSED_2; + break; + + case SEQ_UNCOMPRESSED_2: + s->lzma2.uncompressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = SEQ_COMPRESSED_0; + break; + + case SEQ_COMPRESSED_0: + s->lzma2.compressed + = (uint32_t)b->in[b->in_pos++] << 8; + s->lzma2.sequence = SEQ_COMPRESSED_1; + break; + + case SEQ_COMPRESSED_1: + s->lzma2.compressed + += (uint32_t)b->in[b->in_pos++] + 1; + s->lzma2.sequence = s->lzma2.next_sequence; + break; + + case SEQ_PROPERTIES: + if (!lzma_props(s, b->in[b->in_pos++])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + + /* Fall through */ + + case SEQ_LZMA_PREPARE: + if (s->lzma2.compressed < RC_INIT_BYTES) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + /* Fall through */ + + case SEQ_LZMA_RUN: + /* + * Set dictionary limit to indicate how much we want + * to be encoded at maximum. Decode new data into the + * dictionary. Flush the new data from dictionary to + * b->out. Check if we finished decoding this chunk. + * In case the dictionary got full but we didn't fill + * the output buffer yet, we may run this loop + * multiple times without changing s->lzma2.sequence. 
+ */ + dict_limit(&s->dict, min_t(size_t, + b->out_size - b->out_pos, + s->lzma2.uncompressed)); + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + + rc_reset(&s->rc); + s->lzma2.sequence = SEQ_CONTROL; + + } else if (b->out_pos == b->out_size + || (b->in_pos == b->in_size + && s->temp.size + < s->lzma2.compressed)) { + return XZ_OK; + } + + break; + + case SEQ_COPY: + dict_uncompressed(&s->dict, b, &s->lzma2.compressed); + if (s->lzma2.compressed > 0) + return XZ_OK; + + s->lzma2.sequence = SEQ_CONTROL; + break; + } + } + + return XZ_OK; +} + +XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max) +{ + struct xz_dec_lzma2 *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->dict.mode = mode; + s->dict.size_max = dict_max; + + if (DEC_IS_PREALLOC(mode)) { + s->dict.buf = vmalloc(dict_max); + if (s->dict.buf == NULL) { + kfree(s); + return NULL; + } + } else if (DEC_IS_DYNALLOC(mode)) { + s->dict.buf = NULL; + s->dict.allocated = 0; + } + + return s; +} + +XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) +{ + /* This limits dictionary size to 3 GiB to keep parsing simpler. */ + if (props > 39) + return XZ_OPTIONS_ERROR; + + s->dict.size = 2 + (props & 1); + s->dict.size <<= (props >> 1) + 11; + + if (DEC_IS_MULTI(s->dict.mode)) { + if (s->dict.size > s->dict.size_max) + return XZ_MEMLIMIT_ERROR; + + s->dict.end = s->dict.size; + + if (DEC_IS_DYNALLOC(s->dict.mode)) { + if (s->dict.allocated < s->dict.size) { + vfree(s->dict.buf); + s->dict.buf = vmalloc(s->dict.size); + if (s->dict.buf == NULL) { + s->dict.allocated = 0; + return XZ_MEM_ERROR; + } + } + } + } + + s->lzma.len = 0; + + s->lzma2.sequence = SEQ_CONTROL; + s->lzma2.need_dict_reset = true; + + s->temp.size = 0; + + return XZ_OK; +} + +XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) +{ + if (DEC_IS_MULTI(s->dict.mode)) + vfree(s->dict.buf); + + kfree(s); +} + +#endif /* !(IS_ENABLED(CONFIG_XZ_DEC)) */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_dec_stream.c b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_stream.c new file mode 100644 index 0000000..a0f9d33 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_stream.c @@ -0,0 +1,825 @@ +#if !(IS_ENABLED(CONFIG_XZ_DEC)) + +/* + * .xz Stream decoder + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. 
+ */ + +#include "xz_private.h" +#include "xz_stream.h" + +/* Hash used to validate the Index field */ +struct xz_dec_hash { + vli_type unpadded; + vli_type uncompressed; + uint32_t crc32; +}; + +struct xz_dec { + /* Position in dec_main() */ + enum { + SEQ_STREAM_HEADER, + SEQ_BLOCK_START, + SEQ_BLOCK_HEADER, + SEQ_BLOCK_UNCOMPRESS, + SEQ_BLOCK_PADDING, + SEQ_BLOCK_CHECK, + SEQ_INDEX, + SEQ_INDEX_PADDING, + SEQ_INDEX_CRC32, + SEQ_STREAM_FOOTER + } sequence; + + /* Position in variable-length integers and Check fields */ + uint32_t pos; + + /* Variable-length integer decoded by dec_vli() */ + vli_type vli; + + /* Saved in_pos and out_pos */ + size_t in_start; + size_t out_start; + + /* CRC32 value in Block or Index */ + uint32_t crc32; + + /* Type of the integrity check calculated from uncompressed data */ + enum xz_check check_type; + + /* Operation mode */ + enum xz_mode mode; + + /* + * True if the next call to xz_dec_run() is allowed to return + * XZ_BUF_ERROR. + */ + bool allow_buf_error; + + /* Information stored in Block Header */ + struct { + /* + * Value stored in the Compressed Size field, or + * VLI_UNKNOWN if Compressed Size is not present. + */ + vli_type compressed; + + /* + * Value stored in the Uncompressed Size field, or + * VLI_UNKNOWN if Uncompressed Size is not present. + */ + vli_type uncompressed; + + /* Size of the Block Header field */ + uint32_t size; + } block_header; + + /* Information collected when decoding Blocks */ + struct { + /* Observed compressed size of the current Block */ + vli_type compressed; + + /* Observed uncompressed size of the current Block */ + vli_type uncompressed; + + /* Number of Blocks decoded so far */ + vli_type count; + + /* + * Hash calculated from the Block sizes. This is used to + * validate the Index field. + */ + struct xz_dec_hash hash; + } block; + + /* Variables needed when verifying the Index field */ + struct { + /* Position in dec_index() */ + enum { + SEQ_INDEX_COUNT, + SEQ_INDEX_UNPADDED, + SEQ_INDEX_UNCOMPRESSED + } sequence; + + /* Size of the Index in bytes */ + vli_type size; + + /* Number of Records (matches block.count in valid files) */ + vli_type count; + + /* + * Hash calculated from the Records (matches block.hash in + * valid files). + */ + struct xz_dec_hash hash; + } index; + + /* + * Temporary buffer needed to hold Stream Header, Block Header, + * and Stream Footer. The Block Header is the biggest (1 KiB) + * so we reserve space according to that. buf[] has to be aligned + * to a multiple of four bytes; the size_t variables before it + * should guarantee this. + */ + struct { + size_t pos; + size_t size; + uint8_t buf[1024]; + } temp; + + struct xz_dec_lzma2 *lzma2; + +#ifdef XZ_DEC_BCJ + struct xz_dec_bcj *bcj; + bool bcj_active; +#endif +}; + +#ifdef XZ_DEC_ANY_CHECK +/* Sizes of the Check field with different Check IDs */ +static const uint8_t check_sizes[16] = { + 0, + 4, 4, 4, + 8, 8, 8, + 16, 16, 16, + 32, 32, 32, + 64, 64, 64 +}; +#endif + +/* + * Fill s->temp by copying data starting from b->in[b->in_pos]. Caller + * must have set s->temp.pos to indicate how much data we are supposed + * to copy into s->temp.buf. Return true once s->temp.pos has reached + * s->temp.size. 
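+ *
+ * E.g. dec_main() sets temp.size to STREAM_HEADER_SIZE (12 bytes) before
+ * the Stream Header and to block_header.size (at most 1024 bytes) before
+ * a Block Header, then parses the header from temp.buf once this returns
+ * true. This keeps the parsers simple even if input arrives one byte at
+ * a time.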
+ */ +static bool fill_temp(struct xz_dec *s, struct xz_buf *b) +{ + size_t copy_size = min_t(size_t, + b->in_size - b->in_pos, s->temp.size - s->temp.pos); + + memcpy(s->temp.buf + s->temp.pos, b->in + b->in_pos, copy_size); + b->in_pos += copy_size; + s->temp.pos += copy_size; + + if (s->temp.pos == s->temp.size) { + s->temp.pos = 0; + return true; + } + + return false; +} + +/* Decode a variable-length integer (little-endian base-128 encoding) */ +static enum xz_ret dec_vli(struct xz_dec *s, const uint8_t *in, + size_t *in_pos, size_t in_size) +{ + uint8_t byte; + + if (s->pos == 0) + s->vli = 0; + + while (*in_pos < in_size) { + byte = in[*in_pos]; + ++*in_pos; + + s->vli |= (vli_type)(byte & 0x7F) << s->pos; + + if ((byte & 0x80) == 0) { + /* Don't allow non-minimal encodings. */ + if (byte == 0 && s->pos != 0) + return XZ_DATA_ERROR; + + s->pos = 0; + return XZ_STREAM_END; + } + + s->pos += 7; + if (s->pos == 7 * VLI_BYTES_MAX) + return XZ_DATA_ERROR; + } + + return XZ_OK; +} + +/* + * Decode the Compressed Data field from a Block. Update and validate + * the observed compressed and uncompressed sizes of the Block so that + * they don't exceed the values possibly stored in the Block Header + * (validation assumes that no integer overflow occurs, since vli_type + * is normally uint64_t). Update the CRC32 if presence of the CRC32 + * field was indicated in Stream Header. + * + * Once the decoding is finished, validate that the observed sizes match + * the sizes possibly stored in the Block Header. Update the hash and + * Block count, which are later used to validate the Index field. + */ +static enum xz_ret dec_block(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + s->in_start = b->in_pos; + s->out_start = b->out_pos; + +#ifdef XZ_DEC_BCJ + if (s->bcj_active) + ret = xz_dec_bcj_run(s->bcj, s->lzma2, b); + else +#endif + ret = xz_dec_lzma2_run(s->lzma2, b); + + s->block.compressed += b->in_pos - s->in_start; + s->block.uncompressed += b->out_pos - s->out_start; + + /* + * There is no need to separately check for VLI_UNKNOWN, since + * the observed sizes are always smaller than VLI_UNKNOWN. + */ + if (s->block.compressed > s->block_header.compressed + || s->block.uncompressed + > s->block_header.uncompressed) + return XZ_DATA_ERROR; + + if (s->check_type == XZ_CHECK_CRC32) + s->crc32 = xz_crc32(b->out + s->out_start, + b->out_pos - s->out_start, s->crc32); + + if (ret == XZ_STREAM_END) { + if (s->block_header.compressed != VLI_UNKNOWN + && s->block_header.compressed + != s->block.compressed) + return XZ_DATA_ERROR; + + if (s->block_header.uncompressed != VLI_UNKNOWN + && s->block_header.uncompressed + != s->block.uncompressed) + return XZ_DATA_ERROR; + + s->block.hash.unpadded += s->block_header.size + + s->block.compressed; + +#ifdef XZ_DEC_ANY_CHECK + s->block.hash.unpadded += check_sizes[s->check_type]; +#else + if (s->check_type == XZ_CHECK_CRC32) + s->block.hash.unpadded += 4; +#endif + + s->block.hash.uncompressed += s->block.uncompressed; + s->block.hash.crc32 = xz_crc32( + (const uint8_t *)&s->block.hash, + sizeof(s->block.hash), s->block.hash.crc32); + + ++s->block.count; + } + + return ret; +} + +/* Update the Index size and the CRC32 value. */ +static void index_update(struct xz_dec *s, const struct xz_buf *b) +{ + size_t in_used = b->in_pos - s->in_start; + s->index.size += in_used; + s->crc32 = xz_crc32(b->in + s->in_start, in_used, s->crc32); +} + +/* + * Decode the Number of Records, Unpadded Size, and Uncompressed Size + * fields from the Index field. 
That is, Index Padding and CRC32 are not + * decoded by this function. + * + * This can return XZ_OK (more input needed), XZ_STREAM_END (everything + * successfully decoded), or XZ_DATA_ERROR (input is corrupt). + */ +static enum xz_ret dec_index(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + do { + ret = dec_vli(s, b->in, &b->in_pos, b->in_size); + if (ret != XZ_STREAM_END) { + index_update(s, b); + return ret; + } + + switch (s->index.sequence) { + case SEQ_INDEX_COUNT: + s->index.count = s->vli; + + /* + * Validate that the Number of Records field + * indicates the same number of Records as + * there were Blocks in the Stream. + */ + if (s->index.count != s->block.count) + return XZ_DATA_ERROR; + + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + + case SEQ_INDEX_UNPADDED: + s->index.hash.unpadded += s->vli; + s->index.sequence = SEQ_INDEX_UNCOMPRESSED; + break; + + case SEQ_INDEX_UNCOMPRESSED: + s->index.hash.uncompressed += s->vli; + s->index.hash.crc32 = xz_crc32( + (const uint8_t *)&s->index.hash, + sizeof(s->index.hash), + s->index.hash.crc32); + --s->index.count; + s->index.sequence = SEQ_INDEX_UNPADDED; + break; + } + } while (s->index.count > 0); + + return XZ_STREAM_END; +} + +/* + * Validate that the next four input bytes match the value of s->crc32. + * s->pos must be zero when starting to validate the first byte. + */ +static enum xz_ret crc32_validate(struct xz_dec *s, struct xz_buf *b) +{ + do { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (((s->crc32 >> s->pos) & 0xFF) != b->in[b->in_pos++]) + return XZ_DATA_ERROR; + + s->pos += 8; + + } while (s->pos < 32); + + s->crc32 = 0; + s->pos = 0; + + return XZ_STREAM_END; +} + +#ifdef XZ_DEC_ANY_CHECK +/* + * Skip over the Check field when the Check ID is not supported. + * Returns true once the whole Check field has been skipped over. + */ +static bool check_skip(struct xz_dec *s, struct xz_buf *b) +{ + while (s->pos < check_sizes[s->check_type]) { + if (b->in_pos == b->in_size) + return false; + + ++b->in_pos; + ++s->pos; + } + + s->pos = 0; + + return true; +} +#endif + +/* Decode the Stream Header field (the first 12 bytes of the .xz Stream). */ +static enum xz_ret dec_stream_header(struct xz_dec *s) +{ + if (!memeq(s->temp.buf, HEADER_MAGIC, HEADER_MAGIC_SIZE)) + return XZ_FORMAT_ERROR; + + if (xz_crc32(s->temp.buf + HEADER_MAGIC_SIZE, 2, 0) + != get_le32(s->temp.buf + HEADER_MAGIC_SIZE + 2)) + return XZ_DATA_ERROR; + + if (s->temp.buf[HEADER_MAGIC_SIZE] != 0) + return XZ_OPTIONS_ERROR; + + /* + * Of integrity checks, we support only none (Check ID = 0) and + * CRC32 (Check ID = 1). However, if XZ_DEC_ANY_CHECK is defined, + * we will accept other check types too, but then the check won't + * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. + */ + s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; + +#ifdef XZ_DEC_ANY_CHECK + if (s->check_type > XZ_CHECK_MAX) + return XZ_OPTIONS_ERROR; + + if (s->check_type > XZ_CHECK_CRC32) + return XZ_UNSUPPORTED_CHECK; +#else + if (s->check_type > XZ_CHECK_CRC32) + return XZ_OPTIONS_ERROR; +#endif + + return XZ_OK; +} + +/* Decode the Stream Footer field (the last 12 bytes of the .xz Stream) */ +static enum xz_ret dec_stream_footer(struct xz_dec *s) +{ + if (!memeq(s->temp.buf + 10, FOOTER_MAGIC, FOOTER_MAGIC_SIZE)) + return XZ_DATA_ERROR; + + if (xz_crc32(s->temp.buf + 4, 6, 0) != get_le32(s->temp.buf)) + return XZ_DATA_ERROR; + + /* + * Validate Backward Size. 
Note that we never added the size of the + * Index CRC32 field to s->index.size, thus we use s->index.size / 4 + * instead of s->index.size / 4 - 1. + */ + if ((s->index.size >> 2) != get_le32(s->temp.buf + 4)) + return XZ_DATA_ERROR; + + if (s->temp.buf[8] != 0 || s->temp.buf[9] != s->check_type) + return XZ_DATA_ERROR; + + /* + * Use XZ_STREAM_END instead of XZ_OK to be more convenient + * for the caller. + */ + return XZ_STREAM_END; +} + +/* Decode the Block Header and initialize the filter chain. */ +static enum xz_ret dec_block_header(struct xz_dec *s) +{ + enum xz_ret ret; + + /* + * Validate the CRC32. We know that the temp buffer is at least + * eight bytes so this is safe. + */ + s->temp.size -= 4; + if (xz_crc32(s->temp.buf, s->temp.size, 0) + != get_le32(s->temp.buf + s->temp.size)) + return XZ_DATA_ERROR; + + s->temp.pos = 2; + + /* + * Catch unsupported Block Flags. We support only one or two filters + * in the chain, so we catch that with the same test. + */ +#ifdef XZ_DEC_BCJ + if (s->temp.buf[1] & 0x3E) +#else + if (s->temp.buf[1] & 0x3F) +#endif + return XZ_OPTIONS_ERROR; + + /* Compressed Size */ + if (s->temp.buf[1] & 0x40) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.compressed = s->vli; + } else { + s->block_header.compressed = VLI_UNKNOWN; + } + + /* Uncompressed Size */ + if (s->temp.buf[1] & 0x80) { + if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) + != XZ_STREAM_END) + return XZ_DATA_ERROR; + + s->block_header.uncompressed = s->vli; + } else { + s->block_header.uncompressed = VLI_UNKNOWN; + } + +#ifdef XZ_DEC_BCJ + /* If there are two filters, the first one must be a BCJ filter. */ + s->bcj_active = s->temp.buf[1] & 0x01; + if (s->bcj_active) { + if (s->temp.size - s->temp.pos < 2) + return XZ_OPTIONS_ERROR; + + ret = xz_dec_bcj_reset(s->bcj, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* + * We don't support custom start offset, + * so Size of Properties must be zero. + */ + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + } +#endif + + /* Valid Filter Flags always take at least two bytes. */ + if (s->temp.size - s->temp.pos < 2) + return XZ_DATA_ERROR; + + /* Filter ID = LZMA2 */ + if (s->temp.buf[s->temp.pos++] != 0x21) + return XZ_OPTIONS_ERROR; + + /* Size of Properties = 1-byte Filter Properties */ + if (s->temp.buf[s->temp.pos++] != 0x01) + return XZ_OPTIONS_ERROR; + + /* Filter Properties contains LZMA2 dictionary size. */ + if (s->temp.size - s->temp.pos < 1) + return XZ_DATA_ERROR; + + ret = xz_dec_lzma2_reset(s->lzma2, s->temp.buf[s->temp.pos++]); + if (ret != XZ_OK) + return ret; + + /* The rest must be Header Padding. */ + while (s->temp.pos < s->temp.size) + if (s->temp.buf[s->temp.pos++] != 0x00) + return XZ_OPTIONS_ERROR; + + s->temp.pos = 0; + s->block.compressed = 0; + s->block.uncompressed = 0; + + return XZ_OK; +} + +static enum xz_ret dec_main(struct xz_dec *s, struct xz_buf *b) +{ + enum xz_ret ret; + + /* + * Store the start position for the case when we are in the middle + * of the Index field. + */ + s->in_start = b->in_pos; + + while (true) { + switch (s->sequence) { + case SEQ_STREAM_HEADER: + /* + * Stream Header is copied to s->temp, and then + * decoded from there. This way if the caller + * gives us only little input at a time, we can + * still keep the Stream Header decoding code + * simple. Similar approach is used in many places + * in this file. 
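+			 *
+			 * For reference, those 12 bytes are the 6-byte
+			 * magic fd 37 7a 58 5a 00 ("\xFD" "7zXZ" "\0"),
+			 * a null byte, the Check ID (0x00 none, 0x01
+			 * CRC32), and a little-endian CRC32 of the two
+			 * flag bytes, which is what dec_stream_header()
+			 * verifies.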
+ */ + if (!fill_temp(s, b)) + return XZ_OK; + + /* + * If dec_stream_header() returns + * XZ_UNSUPPORTED_CHECK, it is still possible + * to continue decoding if working in multi-call + * mode. Thus, update s->sequence before calling + * dec_stream_header(). + */ + s->sequence = SEQ_BLOCK_START; + + ret = dec_stream_header(s); + if (ret != XZ_OK) + return ret; + + case SEQ_BLOCK_START: + /* We need one byte of input to continue. */ + if (b->in_pos == b->in_size) + return XZ_OK; + + /* See if this is the beginning of the Index field. */ + if (b->in[b->in_pos] == 0) { + s->in_start = b->in_pos++; + s->sequence = SEQ_INDEX; + break; + } + + /* + * Calculate the size of the Block Header and + * prepare to decode it. + */ + s->block_header.size + = ((uint32_t)b->in[b->in_pos] + 1) * 4; + + s->temp.size = s->block_header.size; + s->temp.pos = 0; + s->sequence = SEQ_BLOCK_HEADER; + + case SEQ_BLOCK_HEADER: + if (!fill_temp(s, b)) + return XZ_OK; + + ret = dec_block_header(s); + if (ret != XZ_OK) + return ret; + + s->sequence = SEQ_BLOCK_UNCOMPRESS; + + case SEQ_BLOCK_UNCOMPRESS: + ret = dec_block(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_BLOCK_PADDING; + + case SEQ_BLOCK_PADDING: + /* + * Size of Compressed Data + Block Padding + * must be a multiple of four. We don't need + * s->block.compressed for anything else + * anymore, so we use it here to test the size + * of the Block Padding field. + */ + while (s->block.compressed & 3) { + if (b->in_pos == b->in_size) + return XZ_OK; + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + + ++s->block.compressed; + } + + s->sequence = SEQ_BLOCK_CHECK; + + case SEQ_BLOCK_CHECK: + if (s->check_type == XZ_CHECK_CRC32) { + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + } +#ifdef XZ_DEC_ANY_CHECK + else if (!check_skip(s, b)) { + return XZ_OK; + } +#endif + + s->sequence = SEQ_BLOCK_START; + break; + + case SEQ_INDEX: + ret = dec_index(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->sequence = SEQ_INDEX_PADDING; + + case SEQ_INDEX_PADDING: + while ((s->index.size + (b->in_pos - s->in_start)) + & 3) { + if (b->in_pos == b->in_size) { + index_update(s, b); + return XZ_OK; + } + + if (b->in[b->in_pos++] != 0) + return XZ_DATA_ERROR; + } + + /* Finish the CRC32 value and Index size. */ + index_update(s, b); + + /* Compare the hashes to validate the Index field. */ + if (!memeq(&s->block.hash, &s->index.hash, + sizeof(s->block.hash))) + return XZ_DATA_ERROR; + + s->sequence = SEQ_INDEX_CRC32; + + case SEQ_INDEX_CRC32: + ret = crc32_validate(s, b); + if (ret != XZ_STREAM_END) + return ret; + + s->temp.size = STREAM_HEADER_SIZE; + s->sequence = SEQ_STREAM_FOOTER; + + case SEQ_STREAM_FOOTER: + if (!fill_temp(s, b)) + return XZ_OK; + + return dec_stream_footer(s); + } + } + + /* Never reached */ +} + +/* + * xz_dec_run() is a wrapper for dec_main() to handle some special cases in + * multi-call and single-call decoding. + * + * In multi-call mode, we must return XZ_BUF_ERROR when it seems clear that we + * are not going to make any progress anymore. This is to prevent the caller + * from calling us infinitely when the input file is truncated or otherwise + * corrupt. Since zlib-style API allows that the caller fills the input buffer + * only when the decoder doesn't produce any new output, we have to be careful + * to avoid returning XZ_BUF_ERROR too easily: XZ_BUF_ERROR is returned only + * after the second consecutive call to xz_dec_run() that makes no progress. 
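+ *
+ * A typical multi-call caller therefore loops roughly like this (sketch
+ * only; refill() and consume() stand in for the caller's own I/O and all
+ * error handling is omitted):
+ *
+ *	struct xz_dec *s = xz_dec_init(XZ_DYNALLOC, 1 << 26);
+ *	struct xz_buf b = {
+ *		.in = inbuf, .in_pos = 0, .in_size = 0,
+ *		.out = outbuf, .out_pos = 0, .out_size = sizeof(outbuf)
+ *	};
+ *	enum xz_ret ret;
+ *
+ *	do {
+ *		if (b.in_pos == b.in_size) {
+ *			b.in_size = refill(inbuf, sizeof(inbuf));
+ *			b.in_pos = 0;
+ *		}
+ *		ret = xz_dec_run(s, &b);
+ *		consume(outbuf, b.out_pos);
+ *		b.out_pos = 0;
+ *	} while (ret == XZ_OK);
+ *	xz_dec_end(s);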
+ * + * In single-call mode, if we couldn't decode everything and no error + * occurred, either the input is truncated or the output buffer is too small. + * Since we know that the last input byte never produces any output, we know + * that if all the input was consumed and decoding wasn't finished, the file + * must be corrupt. Otherwise the output buffer has to be too small or the + * file is corrupt in a way that decoding it produces too big output. + * + * If single-call decoding fails, we reset b->in_pos and b->out_pos back to + * their original values. This is because with some filter chains there won't + * be any valid uncompressed data in the output buffer unless the decoding + * actually succeeds (that's the price to pay of using the output buffer as + * the workspace). + */ +XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) +{ + size_t in_start; + size_t out_start; + enum xz_ret ret; + + if (DEC_IS_SINGLE(s->mode)) + xz_dec_reset(s); + + in_start = b->in_pos; + out_start = b->out_pos; + ret = dec_main(s, b); + + if (DEC_IS_SINGLE(s->mode)) { + if (ret == XZ_OK) + ret = b->in_pos == b->in_size + ? XZ_DATA_ERROR : XZ_BUF_ERROR; + + if (ret != XZ_STREAM_END) { + b->in_pos = in_start; + b->out_pos = out_start; + } + + } else if (ret == XZ_OK && in_start == b->in_pos + && out_start == b->out_pos) { + if (s->allow_buf_error) + ret = XZ_BUF_ERROR; + + s->allow_buf_error = true; + } else { + s->allow_buf_error = false; + } + + return ret; +} + +XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max) +{ + struct xz_dec *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->mode = mode; + +#ifdef XZ_DEC_BCJ + s->bcj = xz_dec_bcj_create(DEC_IS_SINGLE(mode)); + if (s->bcj == NULL) + goto error_bcj; +#endif + + s->lzma2 = xz_dec_lzma2_create(mode, dict_max); + if (s->lzma2 == NULL) + goto error_lzma2; + + xz_dec_reset(s); + return s; + +error_lzma2: +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +error_bcj: +#endif + kfree(s); + return NULL; +} + +XZ_EXTERN void xz_dec_reset(struct xz_dec *s) +{ + s->sequence = SEQ_STREAM_HEADER; + s->allow_buf_error = false; + s->pos = 0; + s->crc32 = 0; + memzero(&s->block, sizeof(s->block)); + memzero(&s->index, sizeof(s->index)); + s->temp.pos = 0; + s->temp.size = STREAM_HEADER_SIZE; +} + +XZ_EXTERN void xz_dec_end(struct xz_dec *s) +{ + if (s != NULL) { + xz_dec_lzma2_end(s->lzma2); +#ifdef XZ_DEC_BCJ + xz_dec_bcj_end(s->bcj); +#endif + kfree(s); + } +} + +#endif /* !(IS_ENABLED(CONFIG_XZ_DEC)) */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_dec_syms.c b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_syms.c new file mode 100644 index 0000000..0493061 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_dec_syms.c @@ -0,0 +1,39 @@ +#if !(IS_ENABLED(CONFIG_XZ_DEC)) + +/* + * XZ decoder module information + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. 
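+ *
+ * The LINUX_BACKPORT() wrappers below are assumed to follow the usual
+ * compat-layer convention of prefixing the symbol name, roughly:
+ *
+ *	#define LINUX_BACKPORT(sym) backport_ ## sym
+ *
+ * so that the exported xz_dec_* symbols do not clash with an in-kernel
+ * CONFIG_XZ_DEC build. This is an assumption based on typical compat
+ * layers, not something taken from this tree.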
+ */ + +#include +#include +#include + +#define xz_dec_init LINUX_BACKPORT(xz_dec_init) +EXPORT_SYMBOL(xz_dec_init); +#define xz_dec_reset LINUX_BACKPORT(xz_dec_reset) +EXPORT_SYMBOL(xz_dec_reset); +#define xz_dec_run LINUX_BACKPORT(xz_dec_run) +EXPORT_SYMBOL(xz_dec_run); +#define xz_dec_end LINUX_BACKPORT(xz_dec_end) +EXPORT_SYMBOL(xz_dec_end); + +/* +MODULE_DESCRIPTION("XZ decompressor"); +MODULE_VERSION("1.0"); +MODULE_AUTHOR("Lasse Collin and Igor Pavlov"); +*/ +/* + * This code is in the public domain, but in Linux it's simplest to just + * say it's GPL and consider the authors as the copyright holders. + */ +/*MODULE_LICENSE("GPL");*/ +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif + +#endif /* !(IS_ENABLED(CONFIG_XZ_DEC)) */ diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_lzma2.h b/src/mlnx-ofa_kernel-5.8/compat/xz_lzma2.h new file mode 100644 index 0000000..071d67b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_lzma2.h @@ -0,0 +1,204 @@ +/* + * LZMA2 definitions + * + * Authors: Lasse Collin + * Igor Pavlov + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_LZMA2_H +#define XZ_LZMA2_H + +/* Range coder constants */ +#define RC_SHIFT_BITS 8 +#define RC_TOP_BITS 24 +#define RC_TOP_VALUE (1 << RC_TOP_BITS) +#define RC_BIT_MODEL_TOTAL_BITS 11 +#define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS) +#define RC_MOVE_BITS 5 + +/* + * Maximum number of position states. A position state is the lowest pb + * number of bits of the current uncompressed offset. In some places there + * are different sets of probabilities for different position states. + */ +#define POS_STATES_MAX (1 << 4) + +/* + * This enum is used to track which LZMA symbols have occurred most recently + * and in which order. This information is used to predict the next symbol. + * + * Symbols: + * - Literal: One 8-bit byte + * - Match: Repeat a chunk of data at some distance + * - Long repeat: Multi-byte match at a recently seen distance + * - Short repeat: One-byte repeat at a recently seen distance + * + * The symbol names are in from STATE_oldest_older_previous. REP means + * either short or long repeated match, and NONLIT means any non-literal. + */ +enum lzma_state { + STATE_LIT_LIT, + STATE_MATCH_LIT_LIT, + STATE_REP_LIT_LIT, + STATE_SHORTREP_LIT_LIT, + STATE_MATCH_LIT, + STATE_REP_LIT, + STATE_SHORTREP_LIT, + STATE_LIT_MATCH, + STATE_LIT_LONGREP, + STATE_LIT_SHORTREP, + STATE_NONLIT_MATCH, + STATE_NONLIT_REP +}; + +/* Total number of states */ +#define STATES 12 + +/* The lowest 7 states indicate that the previous state was a literal. */ +#define LIT_STATES 7 + +/* Indicate that the latest symbol was a literal. */ +static inline void lzma_state_literal(enum lzma_state *state) +{ + if (*state <= STATE_SHORTREP_LIT_LIT) + *state = STATE_LIT_LIT; + else if (*state <= STATE_LIT_SHORTREP) + *state -= 3; + else + *state -= 6; +} + +/* Indicate that the latest symbol was a match. */ +static inline void lzma_state_match(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH; +} + +/* Indicate that the latest state was a long repeated match. */ +static inline void lzma_state_long_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP; +} + +/* Indicate that the latest symbol was a short match. */ +static inline void lzma_state_short_rep(enum lzma_state *state) +{ + *state = *state < LIT_STATES ? 
STATE_LIT_SHORTREP : STATE_NONLIT_REP; +} + +/* Test if the previous symbol was a literal. */ +static inline bool lzma_state_is_literal(enum lzma_state state) +{ + return state < LIT_STATES; +} + +/* Each literal coder is divided in three sections: + * - 0x001-0x0FF: Without match byte + * - 0x101-0x1FF: With match byte; match bit is 0 + * - 0x201-0x2FF: With match byte; match bit is 1 + * + * Match byte is used when the previous LZMA symbol was something else than + * a literal (that is, it was some kind of match). + */ +#define LITERAL_CODER_SIZE 0x300 + +/* Maximum number of literal coders */ +#define LITERAL_CODERS_MAX (1 << 4) + +/* Minimum length of a match is two bytes. */ +#define MATCH_LEN_MIN 2 + +/* Match length is encoded with 4, 5, or 10 bits. + * + * Length Bits + * 2-9 4 = Choice=0 + 3 bits + * 10-17 5 = Choice=1 + Choice2=0 + 3 bits + * 18-273 10 = Choice=1 + Choice2=1 + 8 bits + */ +#define LEN_LOW_BITS 3 +#define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS) +#define LEN_MID_BITS 3 +#define LEN_MID_SYMBOLS (1 << LEN_MID_BITS) +#define LEN_HIGH_BITS 8 +#define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS) +#define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS) + +/* + * Maximum length of a match is 273 which is a result of the encoding + * described above. + */ +#define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1) + +/* + * Different sets of probabilities are used for match distances that have + * very short match length: Lengths of 2, 3, and 4 bytes have a separate + * set of probabilities for each length. The matches with longer length + * use a shared set of probabilities. + */ +#define DIST_STATES 4 + +/* + * Get the index of the appropriate probability array for decoding + * the distance slot. + */ +static inline uint32_t lzma_get_dist_state(uint32_t len) +{ + return len < DIST_STATES + MATCH_LEN_MIN + ? len - MATCH_LEN_MIN : DIST_STATES - 1; +} + +/* + * The highest two bits of a 32-bit match distance are encoded using six bits. + * This six-bit value is called a distance slot. This way encoding a 32-bit + * value takes 6-36 bits, larger values taking more bits. + */ +#define DIST_SLOT_BITS 6 +#define DIST_SLOTS (1 << DIST_SLOT_BITS) + +/* Match distances up to 127 are fully encoded using probabilities. Since + * the highest two bits (distance slot) are always encoded using six bits, + * the distances 0-3 don't need any additional bits to encode, since the + * distance slot itself is the same as the actual distance. DIST_MODEL_START + * indicates the first distance slot where at least one additional bit is + * needed. + */ +#define DIST_MODEL_START 4 + +/* + * Match distances greater than 127 are encoded in three pieces: + * - distance slot: the highest two bits + * - direct bits: 2-26 bits below the highest two bits + * - alignment bits: four lowest bits + * + * Direct bits don't use any probabilities. + * + * The distance slot value of 14 is for distances 128-191. + */ +#define DIST_MODEL_END 14 + +/* Distance slots that indicate a distance <= 127. */ +#define FULL_DISTANCES_BITS (DIST_MODEL_END / 2) +#define FULL_DISTANCES (1 << FULL_DISTANCES_BITS) + +/* + * For match distances greater than 127, only the highest two bits and the + * lowest four bits (alignment) is encoded using probabilities. 
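+ *
+ * A rough worked example (illustrative only): the distance 1000 is
+ * 1111101000 in binary. Its distance slot (19) encodes the two highest
+ * bits "11" and their position; of the remaining eight low bits, the
+ * upper four "1110" are read as direct bits without probabilities and
+ * the lowest four "1000" are decoded with the ALIGN probabilities.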
+ */ +#define ALIGN_BITS 4 +#define ALIGN_SIZE (1 << ALIGN_BITS) +#define ALIGN_MASK (ALIGN_SIZE - 1) + +/* Total number of all probability variables */ +#define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE) + +/* + * LZMA remembers the four most recent match distances. Reusing these + * distances tends to take less space than re-encoding the actual + * distance value. + */ +#define REPS 4 + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_private.h b/src/mlnx-ofa_kernel-5.8/compat/xz_private.h new file mode 100644 index 0000000..97e848b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_private.h @@ -0,0 +1,163 @@ +/* + * Private includes and definitions + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_PRIVATE_H +#define XZ_PRIVATE_H + +#ifdef __KERNEL__ +# include +# include +# include + /* XZ_PREBOOT may be defined only via decompress_unxz.c. */ +# ifndef XZ_PREBOOT +# include +# include +# include +# ifdef CONFIG_XZ_DEC_X86 +# define XZ_DEC_X86 +# endif +# ifdef CONFIG_XZ_DEC_POWERPC +# define XZ_DEC_POWERPC +# endif +# ifdef CONFIG_XZ_DEC_IA64 +# define XZ_DEC_IA64 +# endif +# ifdef CONFIG_XZ_DEC_ARM +# define XZ_DEC_ARM +# endif +# ifdef CONFIG_XZ_DEC_ARMTHUMB +# define XZ_DEC_ARMTHUMB +# endif +# ifdef CONFIG_XZ_DEC_SPARC +# define XZ_DEC_SPARC +# endif +# define memeq(a, b, size) (memcmp(a, b, size) == 0) +# define memzero(buf, size) memset(buf, 0, size) +# endif +# define get_le32(p) le32_to_cpup((const uint32_t *)(p)) +#else + /* + * For userspace builds, use a separate header to define the required + * macros and functions. This makes it easier to adapt the code into + * different environments and avoids clutter in the Linux kernel tree. + */ +# include "xz_config.h" +#endif + +/* If no specific decoding mode is requested, enable support for all modes. */ +#if !defined(XZ_DEC_SINGLE) && !defined(XZ_DEC_PREALLOC) \ + && !defined(XZ_DEC_DYNALLOC) +# define XZ_DEC_SINGLE +# define XZ_DEC_PREALLOC +# define XZ_DEC_DYNALLOC +#endif + +/* + * The DEC_IS_foo(mode) macros are used in "if" statements. If only some + * of the supported modes are enabled, these macros will evaluate to true or + * false at compile time and thus allow the compiler to omit unneeded code. + */ +#ifdef XZ_DEC_SINGLE +# define DEC_IS_SINGLE(mode) ((mode) == XZ_SINGLE) +#else +# define DEC_IS_SINGLE(mode) (false) +#endif + +#ifdef XZ_DEC_PREALLOC +# define DEC_IS_PREALLOC(mode) ((mode) == XZ_PREALLOC) +#else +# define DEC_IS_PREALLOC(mode) (false) +#endif + +#ifdef XZ_DEC_DYNALLOC +# define DEC_IS_DYNALLOC(mode) ((mode) == XZ_DYNALLOC) +#else +# define DEC_IS_DYNALLOC(mode) (false) +#endif + +#if !defined(XZ_DEC_SINGLE) +# define DEC_IS_MULTI(mode) (true) +#elif defined(XZ_DEC_PREALLOC) || defined(XZ_DEC_DYNALLOC) +# define DEC_IS_MULTI(mode) ((mode) != XZ_SINGLE) +#else +# define DEC_IS_MULTI(mode) (false) +#endif + +/* + * If any of the BCJ filter decoders are wanted, define XZ_DEC_BCJ. + * XZ_DEC_BCJ is used to enable generic support for BCJ decoders. + */ +#ifndef XZ_DEC_BCJ +# if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ + || defined(XZ_DEC_IA64) || defined(XZ_DEC_ARM) \ + || defined(XZ_DEC_ARM) || defined(XZ_DEC_ARMTHUMB) \ + || defined(XZ_DEC_SPARC) +# define XZ_DEC_BCJ +# endif +#endif + +/* + * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used + * before calling xz_dec_lzma2_run(). 
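+ *
+ * Illustrative call order (a sketch, not taken verbatim from this
+ * source), mirroring how the stream decoder above drives the LZMA2
+ * layer; props_byte stands for the Filter Properties byte parsed in
+ * dec_block_header():
+ *
+ *	s->lzma2 = xz_dec_lzma2_create(mode, dict_max);
+ *	...
+ *	ret = xz_dec_lzma2_reset(s->lzma2, props_byte);
+ *	...
+ *	ret = xz_dec_lzma2_run(s->lzma2, b);
+ *	...
+ *	xz_dec_lzma2_end(s->lzma2);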
+ */ +#define xz_dec_lzma2_create LINUX_BACKPORT(xz_dec_lzma2_create) +XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, + uint32_t dict_max); + +/* + * Decode the LZMA2 properties (one byte) and reset the decoder. Return + * XZ_OK on success, XZ_MEMLIMIT_ERROR if the preallocated dictionary is not + * big enough, and XZ_OPTIONS_ERROR if props indicates something that this + * decoder doesn't support. + */ +#define xz_dec_lzma2_reset LINUX_BACKPORT(xz_dec_lzma2_reset) +XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, + uint8_t props); + +/* Decode raw LZMA2 stream from b->in to b->out. */ +#define xz_dec_lzma2_run LINUX_BACKPORT(xz_dec_lzma2_run) +XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, + struct xz_buf *b); + +/* Free the memory allocated for the LZMA2 decoder. */ +#define xz_dec_lzma2_end LINUX_BACKPORT(xz_dec_lzma2_end) +XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s); + +#ifdef XZ_DEC_BCJ +/* + * Allocate memory for BCJ decoders. xz_dec_bcj_reset() must be used before + * calling xz_dec_bcj_run(). + */ +#define xz_dec_bcj_create LINUX_BACKPORT(xz_dec_bcj_create) +XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call); + +/* + * Decode the Filter ID of a BCJ filter. This implementation doesn't + * support custom start offsets, so no decoding of Filter Properties + * is needed. Returns XZ_OK if the given Filter ID is supported. + * Otherwise XZ_OPTIONS_ERROR is returned. + */ +#define xz_dec_bcj_reset LINUX_BACKPORT(xz_dec_bcj_reset) +XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id); + +/* + * Decode raw BCJ + LZMA2 stream. This must be used only if there actually is + * a BCJ filter in the chain. If the chain has only LZMA2, xz_dec_lzma2_run() + * must be called directly. + */ +#define xz_dec_bcj_run LINUX_BACKPORT(xz_dec_bcj_run) +XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, + struct xz_dec_lzma2 *lzma2, + struct xz_buf *b); + +/* Free the memory allocated for the BCJ filters. */ +#define xz_dec_bcj_end(s) kfree(s) +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat/xz_stream.h b/src/mlnx-ofa_kernel-5.8/compat/xz_stream.h new file mode 100644 index 0000000..66cb5a7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat/xz_stream.h @@ -0,0 +1,62 @@ +/* + * Definitions for handling the .xz file format + * + * Author: Lasse Collin + * + * This file has been put into the public domain. + * You can do whatever you want with this file. + */ + +#ifndef XZ_STREAM_H +#define XZ_STREAM_H + +#if defined(__KERNEL__) && !XZ_INTERNAL_CRC32 +# include +# undef crc32 +# define xz_crc32(buf, size, crc) \ + (~crc32_le(~(uint32_t)(crc), buf, size)) +#endif + +/* + * See the .xz file format specification at + * http://tukaani.org/xz/xz-file-format.txt + * to understand the container format. + */ + +#define STREAM_HEADER_SIZE 12 + +#define HEADER_MAGIC "\3757zXZ" +#define HEADER_MAGIC_SIZE 6 + +#define FOOTER_MAGIC "YZ" +#define FOOTER_MAGIC_SIZE 2 + +/* + * Variable-length integer can hold a 63-bit unsigned integer or a special + * value indicating that the value is unknown. + * + * Experimental: vli_type can be defined to uint32_t to save a few bytes + * in code size (no effect on speed). Doing so limits the uncompressed and + * compressed size of the file to less than 256 MiB and may also weaken + * error detection slightly. 
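+ *
+ * For reference, .xz VLIs are stored seven bits per byte with the high
+ * bit of each byte acting as a continuation flag, which is where
+ * VLI_BYTES_MAX below comes from. A small worked example (illustrative,
+ * not from the original source): the value 300 (0x12C) is stored as the
+ * two bytes 0xAC 0x02 -- the low seven bits 0x2C with the continuation
+ * bit set, then the remaining bits 0x02.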
+ */ +typedef uint64_t vli_type; + +#define VLI_MAX ((vli_type)-1 / 2) +#define VLI_UNKNOWN ((vli_type)-1) + +/* Maximum encoded size of a VLI */ +#define VLI_BYTES_MAX (sizeof(vli_type) * 8 / 7) + +/* Integrity Check types */ +enum xz_check { + XZ_CHECK_NONE = 0, + XZ_CHECK_CRC32 = 1, + XZ_CHECK_CRC64 = 4, + XZ_CHECK_SHA256 = 10 +}; + +/* Maximum possible Check ID */ +#define XZ_CHECK_MAX 15 + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/compat_base b/src/mlnx-ofa_kernel-5.8/compat_base new file mode 100644 index 0000000..d07335a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat_base @@ -0,0 +1 @@ +mlnx-ofa_kernel-compat-20221127-1152-9886a54 \ No newline at end of file diff --git a/src/mlnx-ofa_kernel-5.8/compat_base_tree b/src/mlnx-ofa_kernel-5.8/compat_base_tree new file mode 100644 index 0000000..ca995bc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat_base_tree @@ -0,0 +1 @@ +mlnx_ofed/mlnx-ofa_kernel-4.0.git \ No newline at end of file diff --git a/src/mlnx-ofa_kernel-5.8/compat_base_tree_version b/src/mlnx-ofa_kernel-5.8/compat_base_tree_version new file mode 100644 index 0000000..e3aaf74 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat_base_tree_version @@ -0,0 +1 @@ +9886a54 \ No newline at end of file diff --git a/src/mlnx-ofa_kernel-5.8/compat_version b/src/mlnx-ofa_kernel-5.8/compat_version new file mode 100644 index 0000000..e3aaf74 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/compat_version @@ -0,0 +1 @@ +9886a54 \ No newline at end of file diff --git a/src/mlnx-ofa_kernel-5.8/configure b/src/mlnx-ofa_kernel-5.8/configure new file mode 120000 index 0000000..d64b740 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/configure @@ -0,0 +1 @@ +ofed_scripts/configure \ No newline at end of file diff --git a/src/mlnx-ofa_kernel-5.8/debian/changelog b/src/mlnx-ofa_kernel-5.8/debian/changelog new file mode 100644 index 0000000..4b55b64 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/changelog @@ -0,0 +1,5 @@ +mlnx-ofed-kernel (5.8-OFED.5.8.1.1.2.1) unstable; urgency=low + + * Initial release (Closes: #125306) + + -- Vladimir Sokolovsky Mon, 31 Dec 2012 15:38:53 +0200 diff --git a/src/mlnx-ofa_kernel-5.8/debian/compat b/src/mlnx-ofa_kernel-5.8/debian/compat new file mode 100644 index 0000000..45a4fb7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/compat @@ -0,0 +1 @@ +8 diff --git a/src/mlnx-ofa_kernel-5.8/debian/control b/src/mlnx-ofa_kernel-5.8/debian/control new file mode 100644 index 0000000..2b00d80 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/control @@ -0,0 +1,30 @@ +Source: mlnx-ofed-kernel +Section: net +Priority: optional +Maintainer: Vladimir Sokolovsky +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, dkms, +Standards-Version: 3.9.2 +Homepage: http://www.mellanox.com/page/products_dyn?product_family=26&mtag=linux_sw_drivers + +Package: mlnx-ofed-kernel-utils +Architecture: any +Depends: ${misc:Depends}, coreutils, pciutils, grep, procps, module-init-tools | kmod, lsof, + mlnx-tools (>= 5.2.0), +Conflicts: mlnx-en-utils +Description: Userspace tools to restart and tune mlnx-ofed kernel modules + +Package: mlnx-ofed-kernel-dkms +Section: kernel +Architecture: all +Depends: dkms, ${misc:Depends}, mlnx-ofed-kernel-utils, libc6-dev, gcc, +Recommends: linux-headers-arm64 | linux-headers-powerpc | linux-headers-ppc64 | linux-headers-ppc64le | linux-headers-amd64 | linux-headers-generic | linux-headers +Provides: mlnx-rdma-rxe-dkms +Conflicts: mlnx-rdma-rxe-dkms +Replaces: mlnx-rdma-rxe-dkms +Description: DKMS support for mlnx-ofed kernel modules + This 
package provides integration with the DKMS infrastructure for + automatically building out of tree kernel modules. + . + This package provides the source code for the mlnx-ofed kernel modules. + . + This package contains the source to be built with dkms. diff --git a/src/mlnx-ofa_kernel-5.8/debian/control.no_dkms b/src/mlnx-ofa_kernel-5.8/debian/control.no_dkms new file mode 100644 index 0000000..16b941d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/control.no_dkms @@ -0,0 +1,23 @@ +Source: mlnx-ofed-kernel +Section: net +Priority: optional +Maintainer: Vladimir Sokolovsky +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, make +Standards-Version: 3.9.2 +Homepage: http://www.mellanox.com/page/products_dyn?product_family=26&mtag=linux_sw_drivers + +Package: mlnx-ofed-kernel-utils +Architecture: any +Depends: ${misc:Depends}, coreutils, pciutils, grep, perl, procps, module-init-tools | kmod, lsof, + mlnx-tools (>= 5.2.0), +Description: Userspace tools to restart and tune mlnx-ofed kernel modules + +Package: mlnx-ofed-kernel-modules +Section: kernel +Architecture: any +Depends: ${misc:Depends}, mlnx-ofed-kernel-utils +Provides: mlnx-rdma-rxe-modules +Conflicts: mlnx-rdma-rxe-modules +Replaces: mlnx-rdma-rxe-modules +Description: mlnx-ofed kernel modules + This package provides the binary and source code for the mlnx-ofed kernel modules. diff --git a/src/mlnx-ofa_kernel-5.8/debian/copyright b/src/mlnx-ofa_kernel-5.8/debian/copyright new file mode 100644 index 0000000..53aa878 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/copyright @@ -0,0 +1,19 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ + +Files: * +Copyright: Copyright 2017 Mellanox Technologies +License: GPL-2 + Mellanox OFED (MLNX_OFED) Software distributed under the terms of the GNU General Public License ("GPL") version 2 as published by the Free Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.postinst b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.postinst new file mode 100755 index 0000000..965fe8d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.postinst @@ -0,0 +1,43 @@ +#!/bin/sh +set -e + +# Get the package version +NAME=mlnx-ofed-kernel +PACKAGE_NAME=$NAME-dkms +CVERSION=`dpkg-query -W -f='${Version}' $PACKAGE_NAME | awk -F "-" '{print $1}' | cut -d\: -f2` +ARCH=`uname -m` + +dkms_configure () { + POSTINST="/usr/src/$NAME-$CVERSION/ofed_scripts/common.postinst" + if [ -f "$POSTINST" ]; then + "$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2" + return $? + fi + echo "WARNING: $POSTINST does not exist." 
>&2 + echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2 + echo "built with legacy DKMS support." >&2 + echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2 + echo "support or upgrade DKMS to a more current version." >&2 + return 1 +} + +case "$1" in + configure) + dkms_configure + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.prerm b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.prerm new file mode 100755 index 0000000..b642c13 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-dkms.prerm @@ -0,0 +1,43 @@ +#!/bin/sh +set -e + +# Get the package version +NAME=mlnx-ofed-kernel +PACKAGE_NAME=$NAME-dkms +VERSION=`dpkg-query -W -f='${Version}' $PACKAGE_NAME | awk -F "-" '{print $1}' | cut -d\: -f2` + +case "$1" in + remove|upgrade|deconfigure) + if [ "`dkms status -m $NAME`" ]; then + dkms remove -m $NAME -v $VERSION --all || true + fi + ;; + + failed-upgrade) + ;; + + *) + echo "prerm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# We remove files on prerm and not on postrm +# to be sure that on an upgrade we don't touch files +# generated by the new package +get_headers_dirs() { + # Filter list by priority 17 that should list only ones + # generated by our dkms_ofed_post_build.sh, so please don't use + # it yourself manually: + update-alternatives --query ofa_kernel_headers 2>/dev/null \ + | awk 'BEGIN {RS="\n\n"}; /Priority: 17/{print}' +} +header_dirs=`get_headers_dirs` +for h_dir in $headers_dir; do + update-alternatives --remove ofa_kernel_headers $h_dir + rm -rf $h_dir +done + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.postinst b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.postinst new file mode 100755 index 0000000..a67c0c4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.postinst @@ -0,0 +1,17 @@ +#!/bin/bash + +symlink="/usr/src/ofa_kernel/default" +if [ "$1" = "configure" ]; then + if [ -L "$symlink" ] && \ + ! 
update-alternatives --list ofa_kernel_headers >/dev/null 2>&1; + then + rm -f "$symlink" + fi + mkdir -p /usr/src/ofa_kernel + update-alternatives --install "$symlink" ofa_kernel_headers \ + /usr/src/ofa_kernel/@ARCH@/@KVER@ 30 +fi + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.prerm b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.prerm new file mode 100755 index 0000000..6f1b1df --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-modules.prerm @@ -0,0 +1,10 @@ +#!/bin/sh + +set -e + +if [ "$1" = remove ]; then + update-alternatives --remove ofa_kernel_headers \ + /usr/src/ofa_kernel/@ARCH@/@KVER@ +fi + +#DEBHELPER# diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.examples b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.examples new file mode 100644 index 0000000..58460fa --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.examples @@ -0,0 +1,2 @@ +ofed_scripts/82-net-setup-link.rules +ofed_scripts/vf-net-link-name.sh diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postinst b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postinst new file mode 100755 index 0000000..279820d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postinst @@ -0,0 +1,47 @@ +#!/bin/bash + +dist=`lsb_release -s -i | tr '[:upper:]' '[:lower:]'` +dist_ver=`lsb_release -s -r` + +if [ "X$dist" != "Xubuntu" ]; then + /usr/sbin/update-rc.d openibd defaults > /dev/null 2>&1 +fi + +if (systemctl 2>/dev/null | grep -qw "\\-\.mount" 2>/dev/null); then + systemctl daemon-reload >/dev/null 2>&1 || true + systemctl enable openibd >/dev/null 2>&1 || true + cat /proc/sys/kernel/random/boot_id 2>/dev/null | sed -e 's/-//g' > /var/run/openibd.bootid || true + test -s /var/run/openibd.bootid || echo manual > /var/run/openibd.bootid || true +fi + +# set specific module parameter for Ubuntu +echo "options ib_ipoib send_queue_size=128 recv_queue_size=128" >> /etc/modprobe.d/ib_ipoib.conf + +# Update limits.conf (but not for Containers) +if [ ! -e "/.dockerenv" ] && ! (grep -q docker /proc/self/cgroup 2>/dev/null); then + if [ -e /etc/security/limits.conf ]; then + LIMITS_UPDATED=0 + if ! (grep -qE "soft.*memlock" /etc/security/limits.conf 2>/dev/null); then + echo "* soft memlock unlimited" >> /etc/security/limits.conf + LIMITS_UPDATED=1 + fi + if ! 
(grep -qE "hard.*memlock" /etc/security/limits.conf 2>/dev/null); then + echo "* hard memlock unlimited" >> /etc/security/limits.conf + LIMITS_UPDATED=1 + fi + if [ $LIMITS_UPDATED -eq 1 ]; then + echo "Configured /etc/security/limits.conf" + fi + fi +fi + +for old_udev in \ + /etc/udev/rules.d/82-net-setup-link.rules \ + /etc/udev/rules.d/83-net-setup-link.rules \ + /etc/udev/rules.d/90-ib \ +; do + dpkg-maintscript-helper rm_conffile $old_udev 5.4-OFED.5.4.0.4.9 -- "$@" +done +dpkg-maintscript-helper rm_conffile /etc/infiniband/vf-net-link-name.sh 5.6-OFED.5.6.0.5.6 -- "$@" + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postrm b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postrm new file mode 100755 index 0000000..8b8068f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.postrm @@ -0,0 +1,14 @@ +#!/bin/sh + +set -e + +for old_udev in \ + /etc/udev/rules.d/82-net-setup-link.rules \ + /etc/udev/rules.d/83-net-setup-link.rules \ + /etc/udev/rules.d/90-ib \ +; do + dpkg-maintscript-helper rm_conffile $old_udev 5.4-OFED.5.4.0.4.9 -- "$@" +done +dpkg-maintscript-helper rm_conffile /etc/infiniband/vf-net-link-name.sh 5.6-OFED.5.6.0.5.6 -- "$@" + +#DEBHELPER# diff --git a/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.prerm b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.prerm new file mode 100755 index 0000000..79e96b5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/mlnx-ofed-kernel-utils.prerm @@ -0,0 +1,23 @@ +#!/bin/bash + +dist=`lsb_release -s -i | tr '[:upper:]' '[:lower:]'` +dist_ver=`lsb_release -s -r` + +if [ "X$dist" != "Xubuntu" ]; then + /usr/sbin/update-rc.d -f openibd remove > /dev/null 2>&1 +fi + +if (systemctl 2>/dev/null | grep -qw "\\-\.mount" 2>/dev/null); then + systemctl disable openibd >/dev/null 2>&1 || true +fi + +for old_udev in \ + /etc/udev/rules.d/82-net-setup-link.rules \ + /etc/udev/rules.d/83-net-setup-link.rules \ + /etc/udev/rules.d/90-ib \ +; do + dpkg-maintscript-helper rm_conffile $old_udev 5.4-OFED.5.4.0.4.9 -- "$@" +done +dpkg-maintscript-helper rm_conffile /etc/infiniband/vf-net-link-name.sh 5.6-OFED.5.6.0.5.6 -- "$@" + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/debian/rules b/src/mlnx-ofa_kernel-5.8/debian/rules new file mode 100755 index 0000000..7e32e3c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/rules @@ -0,0 +1,218 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. +# +# This version is for a hypothetical package that can build a kernel modules +# architecture-dependant package via make-kpkg, as well as an +# architecture-independent module source package, and other packages +# either dep/indep for things like common files or userspace components +# needed for the kernel modules. + +# Uncomment this to turn on verbose mode. 
+#export DH_VERBOSE=1 + +-include /usr/share/dpkg/architecture.mk + +WITH_DKMS ?= 1 +WITH_MOD_SIGN ?= 0 +MLXNUMC = $(shell grep ^processor /proc/cpuinfo | wc -l) +NJOBS ?= $(shell if [ $(MLXNUMC) -lt 16 ]; then echo $(MLXNUMC); else echo 16; fi) + +pname:=mlnx-ofed-kernel +psource:=$(pname)-source +ifeq ($(WITH_DKMS),1) +pdkms:=$(pname)-dkms +else +pdkms:=$(pname)-modules +endif +putils:=$(pname)-utils + +pversion := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-.\+/\1/p') +prel := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-\(.\+\)/\2/p') + +export INSTALL_MOD_DIR:=updates +export INSTALL_MOD_PATH:=$(CURDIR)/debian/$(pdkms) + +DIST_NAME := $(shell lsb_release -si) +DIST_RELEASE := $(DIST_NAME)/$(shell lsb_release -sc) + +kernelver ?= $(shell uname -r) +kernelver1 = $(shell echo $(kernelver) | sed -e 's/_/-/g') +kernel_source_dir ?= "/lib/modules/$(kernelver)/build" +deb_arch = $(shell uname -m) + +%: +ifeq ($(WITH_DKMS),1) + dh $@ --with dkms +else + dh $@ +endif + +override_dh_auto_clean: + +override_dh_auto_configure: + # backup clean sources + /bin/rm -rf source || true + mkdir -p source + ls -1 | grep -v source | xargs -iELEM cp ELEM -r source +ifneq ($(WITH_DKMS),1) + @echo Building for $(kernelver) + find compat -type f -exec touch -t 200012201010 '{}' \; || true + $(CURDIR)/configure --kernel-version=$(kernelver) --kernel-sources=$(kernel_source_dir) $(shell $(CURDIR)/ofed_scripts/dkms_ofed $(kernelver) $(kernel_source_dir) get-config) --with-njobs=$(NJOBS) +endif + +mod_dev_dir = /usr/src/ofa_kernel/$(deb_arch)/$(kernelver) +override_dh_auto_build: +ifneq ($(WITH_DKMS),1) + make distclean || true + make -j$(NJOBS) +endif +override_dh_auto_test: + +override_dh_auto_install: +ifneq ($(WITH_DKMS),1) + make install_modules INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) KERNELRELEASE=$(kernelver) + find $(INSTALL_MOD_PATH) \( -type f -a -name "modules.*" \) -delete +ifeq ($(WITH_MOD_SIGN),1) + ofed_scripts/tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(kernel_source_dir) +endif +endif + + +ifeq ($(WITH_DKMS),1) + # For dkms + dh_installdirs -p$(pdkms) usr/src/$(pname)-$(pversion) + cp -a source/compat* debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a source/include debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a source/drivers debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a source/fs debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a source/net debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a source/block debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a source/backports debian/$(pdkms)/usr/src/$(pname)-$(pversion) || true + cp source/backports_applied debian/$(pdkms)/usr/src/$(pname)-$(pversion) || true + cp -a source/ofed_scripts debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a source/COPYING debian/$(pdkms)/usr/src/$(pname)-$(pversion) + $(CURDIR)/ofed_scripts/generate_dkms_conf.sh > debian/$(pdkms)/usr/src/$(pname)-$(pversion)/dkms.conf + cd debian/$(pdkms)/usr/src/$(pname)-$(pversion); ln -s ofed_scripts/configure + cd debian/$(pdkms)/usr/src/$(pname)-$(pversion); ln -s ofed_scripts/makefile + cd debian/$(pdkms)/usr/src/$(pname)-$(pversion); ln -s ofed_scripts/Makefile + cd debian/$(pdkms)/usr/src; ln -snf $(pname)-$(pversion) ofa_kernel-$(pversion) +endif + # + # Sources for building stuff over MLNX_OFED + # + # With DKMS, copy the below files and folders during post_build since the + # new DKMS versions started to remove the whole build directory. 
+ # however, some old versions of DKMS are buggy and ignores POST_BUILD +ifeq ($(WITH_DKMS),1) + dh_installdirs -p$(pdkms) usr/src/ofa_kernel/ + # copy the ofa-kernel build headers using ofed_scripts/pre_build.sh in the DKMS MAKE step (called from ofed_scripts/pre_build.sh). +else + dh_installdirs -p$(pdkms) $(mod_dev_dir) + cp -a include/ debian/$(pdkms)$(mod_dev_dir) + cp -a config* debian/$(pdkms)$(mod_dev_dir) + cp -a compat* debian/$(pdkms)$(mod_dev_dir) + cp -a ofed_scripts debian/$(pdkms)$(mod_dev_dir) + cp -a Module*.symvers debian/$(pdkms)$(mod_dev_dir) +endif + + # Force DKMS to install our modules. + # This is mostly needed for modules that do not have a version number info, as DKMS + # will compare their srcversion field, which does not really say which module is newer. +ifeq ($(WITH_DKMS),1) + dh_installdirs -p$(pdkms) usr/share/dkms/modules_to_force_install/ + echo "$(pname)" > debian/$(pdkms)/usr/share/dkms/modules_to_force_install/$(pname).force +endif + + # For utils + dh_installdirs -p$(putils) etc/infiniband + dh_installdirs -p$(putils) etc/modprobe.d + dh_installdirs -p$(putils) sbin + dh_installdirs -p$(putils) bin + dh_installdirs -p$(putils) usr/bin + dh_installdirs -p$(putils) usr/sbin + dh_installdirs -p$(putils) lib/udev + dh_installdirs -p$(putils) lib/systemd/system + dh_installdirs -p$(putils) lib/udev/rules.d + dh_installdirs -p$(putils) etc/init.d + dh_installdirs -p$(putils) etc/init + dh_installdirs -p$(putils) etc/systemd/system + dh_installdirs -p$(putils) usr/share/mlnx_ofed + + cp source/ofed_scripts/openib.conf debian/$(putils)/etc/infiniband + cp source/ofed_scripts/mlx5.conf debian/$(putils)/etc/infiniband +ifneq ($(WITH_DKMS),1) + $(CURDIR)/ofed_scripts/install_helper $(CURDIR)/debian/$(putils) +endif + + install -m 0644 source/ofed_scripts/90-ib.rules debian/$(putils)/lib/udev/rules.d + install -m 0644 source/ofed_scripts/83-mlnx-sf-name.rules debian/$(putils)/lib/udev/rules.d + install -m 0644 ofed_scripts/openibd.service debian/$(putils)/lib/systemd/system + install -m 0644 ofed_scripts/mlnx_interface_mgr\@.service debian/$(putils)/etc/systemd/system + + # Prepare /etc/infiniband/info + echo '#!/bin/bash' > debian/$(putils)/etc/infiniband/info + echo >> debian/$(putils)/etc/infiniband/info + echo 'echo prefix=/usr' >> debian/$(putils)/etc/infiniband/info + echo 'echo Kernel=`uname -r`' >> debian/$(putils)/etc/infiniband/info + echo 'echo' >> debian/$(putils)/etc/infiniband/info + echo 'echo "Configure options: `/usr/src/$(pname)-$(pversion)/ofed_scripts/dkms_ofed $(kernelver) $(kernel_source_dir) get-config`"' >> debian/$(putils)/etc/infiniband/info + echo 'echo' >> debian/$(putils)/etc/infiniband/info + chmod 755 debian/$(putils)/etc/infiniband/info + + install -m 0755 source/ofed_scripts/sf-rep-netdev-rename debian/$(putils)/lib/udev + install -m 0755 source/ofed_scripts/auxdev-sf-netdev-rename debian/$(putils)/lib/udev + install -m 0755 source/ofed_scripts/net-interfaces debian/$(putils)/usr/sbin + install -m 0755 source/ofed_scripts/ibdev2netdev debian/$(putils)/usr/sbin + install -m 0644 source/ofed_scripts/mlnx-ofed-kernel-utils.openibd.upstart debian/$(putils)/etc/init/openibd.conf + install -m 0755 source/ofed_scripts/mlnx-ofed-kernel-utils.openibd.init debian/$(putils)/etc/init.d/openibd + install -m 0644 source/ofed_scripts/mlnx.conf debian/$(putils)/etc/modprobe.d + install -m 0644 source/ofed_scripts/mlnx-bf.conf debian/$(putils)/etc/modprobe.d + install -m 0644 source/ofed_scripts/ib_ipoib.conf debian/$(putils)/etc/modprobe.d + 
install -m 0755 source/ofed_scripts/mlnx_interface_mgr_deb.sh debian/$(putils)/bin/mlnx_interface_mgr.sh + install -m 0755 source/ofed_scripts/mlnx_conf_mgr.sh debian/$(putils)/bin/ + install -m 0755 source/ofed_scripts/setup_mr_cache.sh debian/$(putils)/usr/sbin + install -m 0755 source/ofed_scripts/odp_stat.sh debian/$(putils)/usr/sbin + install -m 0755 ofed_scripts/mlnx_bf_assign_ct_cores.sh debian/$(putils)/usr/share/mlnx_ofed + +override_dh_installinit: + + +ifneq ($(WITH_DKMS),1) +override_dh_gencontrol: + dh_gencontrol -- -v$(pversion)-$(prel).kver.$(kernelver1) +endif + +override_dh_installdeb: + dh_installdeb + # override conf files list + echo "/etc/infiniband/openib.conf" > debian/$(putils)/DEBIAN/conffiles + echo "/etc/infiniband/mlx5.conf" >> debian/$(putils)/DEBIAN/conffiles + echo "/etc/modprobe.d/mlnx.conf" >> debian/$(putils)/DEBIAN/conffiles + echo "/etc/modprobe.d/mlnx-bf.conf" >> debian/$(putils)/DEBIAN/conffiles + echo "/etc/modprobe.d/ib_ipoib.conf" >> debian/$(putils)/DEBIAN/conffiles +ifneq ($(WITH_DKMS),1) + sed -i \ + -e 's/@KVER@/$(kernelver)/' \ + -e 's/@ARCH@/$(deb_arch)/' \ + debian/$(pdkms)/DEBIAN/postinst \ + debian/$(pdkms)/DEBIAN/prerm +endif + +ifneq ($(MLNX_KO_NO_STRIP),1) +ifneq ($(WITH_DKMS),1) +override_dh_strip: + dh_strip + find debian -name '*.ko' | xargs strip -g +ifeq ($(WITH_MOD_SIGN),1) + ofed_scripts/tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(kernel_source_dir) +endif +endif +endif + +override_dh_compress: + dh_compress -X vf-net-link-name.sh diff --git a/src/mlnx-ofa_kernel-5.8/debian/source/format b/src/mlnx-ofa_kernel-5.8/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/src/mlnx-ofa_kernel-5.8/devtools/add_metadata.sh b/src/mlnx-ofa_kernel-5.8/devtools/add_metadata.sh new file mode 100755 index 0000000..1aecd12 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/devtools/add_metadata.sh @@ -0,0 +1,475 @@ +#!/bin/bash +# +# Copyright (c) 2016 Mellanox Technologies. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. 
+# +# +# Author: Alaa Hleihel - alaa@mellanox.com +# +######################################################################### + +WDIR=$(cd `dirname "${BASH_SOURCE[0]}"` && pwd | sed -e 's/devtools//') + +base= +num= +dry_run=0 +no_edit=0 +no_verify=0 +ref_db= +changeid_map= +def_feature= +def_ustatus= + +usage() +{ + cat < Add metadata for new commits after given base (commit ID) + -n, --num Add metadata for the last N commits in the current branch + + -f, --feature Feature name to assign to new commits. + Must exist in: 'metadata/features_metadata_db.csv' + -s, --upstream-status Upstream status to assign to new commits. + Valid values: [NA, ignore, in_progress, sent, accepted, rejected] + -g, --general add current upsream delta tag to general(f.e v5.6-rc2). + + --dry-run Just print, don't really change anything. + +Description for upstream status: + "NA" -----------> Patch is not applicable for upstream (scripts, Exp. API, etc..). + "ignore" -------> Patch that should be automatically dropped at next rebase (scripts changes). + "in_progress" --> Being prepared for Upstream submission. + "sent" ---------> Sent upstream, but not accepted yet. + "accepted" -----> Accepted upstream, should be automatically dropped at next rebase. + "rejected" -----> Sent upstream and got rejected, will be taken again to OFED at next rebase. +EOF +} + +while [ ! -z "$1" ] +do + case "$1" in + -a | --after) + base="$2" + shift + ;; + -n | --num) + num="$2" + shift + ;; + --dry-run) + dry_run=1 + ;; + --no-edit) + no_edit=1 + ;; + --no-verify) + no_verify=1 + ;; + -r | --ref-db) + ref_db="$2" + shift + ;; + -m | --change-id-map) + changeid_map="$2" + shift + ;; + -f | --feature) + def_feature="$2" + shift + ;; + -g | --general) + general="tag: $2" + shift + ;; + -s | --upstream-status) + def_ustatus="$2" + shift + ;; + -h | *help | *usage) + echo "This script will add metadata entries for given commits." 
+ usage + exit 0 + ;; + *) + echo "-E- Unsupported option: $1" >&2 + exit 1 + ;; + esac + shift +done + + +is_backports_change_only() +{ + local cid=$1; shift + + tgt=0 + other=0 + for ff in $(git log -1 --name-only --pretty=format: $cid 2>/dev/null) + do + if [ -z "$ff" ]; then + continue + fi + case $ff in + backports* | *compat*) + tgt=1 + ;; + *) + other=1 + ;; + esac + done + + if [ $tgt -eq 1 -a $other -eq 0 ]; then + return 0 + else + return 1 + fi +} + +is_scripts_change_only() +{ + local cid=$1; shift + + tgt=0 + other=0 + for ff in $(git log -1 --name-only --pretty=format: $cid 2>/dev/null) + do + if [ -z "$ff" ]; then + continue + fi + case $ff in + *ofed_scripts* | *debian* | *devtools* | *metadata* | *scripts*) + tgt=1 + ;; + *) + other=1 + ;; + esac + done + + if [ $tgt -eq 1 -a $other -eq 0 ]; then + return 0 + else + return 1 + fi +} + +# get value of given tag if available in the commit message +get_by_tag() +{ + local cid=$1; shift + local tag=$1; shift + + echo $(git log -1 $cid | grep -iE -- "${tag}\s*:" | head -1 | cut -d":" -f"2" | sed -r -e 's/^\s//g') +} + +get_subject() +{ + local cid=$1; shift + + echo $(git log -1 --format="%s" $cid) +} + +get_feature_from_csv() +{ + local line=$1; shift + + echo $(echo "$line" | sed -r -e 's/.*;\s*feature=\s*//' -e 's/;\s*upstream_status.*//') +} + +get_upstream_from_csv() +{ + local line=$1; shift + + echo $(echo "$line" | sed -r -e 's/.*;\s*upstream_status=\s*//' -e 's/;\s*general.*//') +} + +get_general_from_csv() +{ + local line=$1; shift + + echo $(echo "$line" | sed -r -e 's/.*;\s*general=\s*//' -e 's/;.*//') +} + +get_line_from_ref() +{ + local uniqID=$1; shift + local ref_db=$1; shift + local subject=$1; shift + local line="" + + if [ "X$changeid_map" != "X" ]; then + uniqID=$(map_id_new_to_old $uniqID $changeid_map "$subject") + line=$(grep --no-filename -wr -- "$uniqID" ${ref_db}/*csv 2>/dev/null) + else + line=$(grep --no-filename -wr -- "subject=$subject;" ${ref_db}/*csv 2>/dev/null | tail -1) + fi + if [ "X$line" == "X" ]; then + return + fi + echo "$line" +} + +map_id_new_to_old() +{ + local newid=$1; shift + local changeid_map=$1; shift + local subject=$1; shift + + newid=$(echo -e "$newid" | sed -r -e 's/.*=\s*//g') + local line=$(grep --no-filename -wr -- "$newid" $changeid_map 2>/dev/null) + local oldid=$(echo "$line" | cut -d':' -f'1') + if [ "X$oldid" != "X" ]; then + echo "$oldid" + else + local line=$(grep --no-filename -wr -- "$subject" ${ref_db}/*csv 2>/dev/null | tail -1) + local oldid=$(echo "$line" | cut -d':' -f'1') + if [ "X$oldid" != "X" ]; then + echo "$oldid" + else + echo "$newid" + fi + fi +} + +get_feature_from_ref() +{ + local uniqID=$1; shift + local ref_db=$1; shift + local subject=$1; shift + + local line=$(get_line_from_ref "$uniqID" "$ref_db" "$subject") + if [ "X$line" == "X" ]; then + echo "" + return + fi + get_feature_from_csv "$line" +} + +get_upstream_status_from_ref() +{ + local uniqID=$1; shift + local ref_db=$1; shift + local subject=$1; shift + + local line=$(get_line_from_ref "$uniqID" "$ref_db" "$subject") + if [ "X$line" == "X" ]; then + echo "" + return + fi + local status=$(get_upstream_from_csv "$line") + if [ "X$status" == "X-1" ]; then + status=NA + fi + echo $status +} + +get_general_from_ref() +{ + local uniqID=$1; shift + local ref_db=$1; shift + local subject=$1; shift + + local line=$(get_line_from_ref "$uniqID" "$ref_db" "$subject") + if [ "X$line" == "X" ]; then + echo "" + return + fi + local tag=$(get_general_from_csv "$line") + if [ "X$tag" == "X-1" ]; then + 
echo "" + return + fi + echo $tag +} + +################################################################## +# +# main +# + +filter= +if [ "X$base" != "X" ]; then + filter="${base}.." +fi +if [ "X$num" != "X" ]; then + filter="-${num}" +fi +if [ "X$filter" == "X" ]; then + echo "-E- Missing arguments!" >&2 + echo + usage + exit 1 +fi + +if [ "X$ref_db" != "X" ] && ! test -d "$ref_db"; then + echo "-E- Giving --ref-db does not exist: '$ref_db' !" >&2 + exit 1 +fi + +commitIDs=$(git log --no-merges --format="%h" $filter | tac) +if [ -z "$commitIDs" ]; then + echo "-E- Failed to get list of commit IDs." >&2 + exit 1 +fi +if [ ! -z "$def_ustatus" ]; then + case $def_ustatus in + NA|rejected|accepted|in_progress|ignore) + ;; # Valid status + *) echo "-E- Valid status is one of the follow options: 'NA'|'rejected'|'accepted'|'in_progress'|'ignore'" + exit 1 + ;; + esac +fi +if [ "X$def_ustatus" = "Xaccepted" ];then + if [ "X$general" = "X" ]; then + echo "-E- -g|--general must be used in case of status accepted" + exit 1 + fi +else + if [ ! -z "$general" ]; then + echo "-E- -g|--general can be used only in case of status accepted" + exit 1 + fi +fi + +echo "Getting info about commits..." +echo ---------------------------------------------------- +csvfiles= +for cid in $commitIDs +do + if [ "X$cid" == "X" ]; then + continue + fi + author=$(git log --format="%aN" $cid| head -1 | sed -e 's/ /_/g') + changeID= + subject= + feature= + upstream= + + uniqID= + changeID=$(get_by_tag $cid "change-id") + if [ -z "$changeID" ]; then + # for merged commits w/o change ID take the commit ID + if (git branch -a --contains $cid 2>/dev/null | grep -qEi -- "remote|origin"); then + uniqID="commit-Id=${cid}" + else + echo "-E- Failed to get Change-Id for commit ID: $cid" >&2 + echo "Please add Change-Id and re-run the script." >&2 + exit 1 + fi + else + uniqID="Change-Id=${changeID}" + fi + if [ -z "$uniqID" ]; then + echo "-E- Failed to get unique Id for commit ID: $cid" >&2 + exit 1 + fi + subject=$(get_subject $cid) + feature=$(get_by_tag $cid "feature") + upstream=$(get_by_tag $cid "upstream(.*status)") + if [ -z "$general" ] + then + general=$(get_by_tag $cid "general") + fi + # auto-detect commits that changes only backports, ofed-scripts + if is_backports_change_only $cid ;then + feature="backports" + upstream="ignore" + fi + if is_scripts_change_only $cid ;then + feature="ofed_scripts" + upstream="ignore" + fi + if [ "X$ref_db" != "X" ]; then + if [ "X$feature" == "X" ]; then + feature=$(get_feature_from_ref "$uniqID" "$ref_db" "$subject") + fi + if [ "X$upstream" == "X" ]; then + upstream=$(get_upstream_status_from_ref "$uniqID" "$ref_db" "$subject") + fi + general=$(get_general_from_ref "$uniqID" "$ref_db" "$subject") + fi + + if [ "X$feature" == "X" ]; then + feature=$def_feature + fi + if [ "X$upstream" == "X" ]; then + upstream=$def_ustatus + fi + entry="$uniqID; subject=${subject}; feature=${feature}; upstream_status=${upstream}; general=${general};" + if [ "X$ref_db" != "X" ]; then + general="" #remove for each iteration + fi + echo "'$entry' to metadata/${author}.csv" + csvfile="${WDIR}/metadata/${author}.csv" + if [ $dry_run -eq 0 ]; then + mkdir -p $WDIR/metadata + if [ ! -e $csvfile ]; then + echo "sep=;" > $csvfile + fi + if (grep -q -- "$uniqID" $csvfile); then + echo "-W- $cid '${subject}' already exists in ${author}.csv , skipping..." >&2 + echo >&2 + else + echo "$entry" >> $csvfile + if ! 
(echo $csvfiles | grep -q -- "$csvfile"); then + csvfiles="$csvfiles $csvfile" + fi + fi + fi +done + +if [ $dry_run -eq 0 ]; then + if [ ! -z "$csvfiles" ]; then + if [ $no_edit -eq 0 ]; then + vim -o $csvfiles + fi + echo ---------------------------------------------------- + echo "Done, please amend these files to your last commit:" + echo "$csvfiles" + echo ---------------------------------------------------- + echo + if [ $no_verify -eq 0 ]; then + echo "Going to verify content of metadata files..." + sleep 3 + for ff in $csvfiles + do + cmd="$WDIR/devtools/verify_metadata.sh -p $ff" + echo "Going to run '$cmd'" + sleep 2 + $cmd + done + fi + else + echo "-E- no csv files were updated!" + exit 3 + fi +fi + diff --git a/src/mlnx-ofa_kernel-5.8/devtools/verify_metadata.sh b/src/mlnx-ofa_kernel-5.8/devtools/verify_metadata.sh new file mode 100755 index 0000000..ed6e1d6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/devtools/verify_metadata.sh @@ -0,0 +1,241 @@ +#!/bin/bash +# +# Copyright (c) 2016 Mellanox Technologies. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. +# +# +# Author: Alaa Hleihel - alaa@mellanox.com +# +######################################################################### + +WDIR=$(cd `dirname "${BASH_SOURCE[0]}"` && pwd | sed -e 's/devtools//') +ORIG_ARGS=$@ +path= + +FEATURES_DB="metadata/features_metadata_db.csv" +STATUS_DB="NA \ + ignore \ + in_progress \ + sent + accepted \ + rejected \ +" + +usage() +{ + cat < Path to the metadata file to test +EOF +} + +while [ ! -z "$1" ] +do + case "$1" in + -p | --path) + path="$2" + shift + ;; + -h | *help | *usage) + echo "This script will verify the content of a metadata file." 
+ usage + exit 0 + ;; + *) + echo "-E- Unsupported option: $1" >&2 + exit 1 + ;; + esac + shift +done + + +get_subject() +{ + local cid=$1; shift + + echo $(git log -1 --format="%s" $cid) +} + +get_id_from_csv() +{ + local line=$1; shift + + echo $(echo "$line" | sed -r -e 's/.*Change-Id=\s*//' -e 's/;\s*subject=.*//') +} + +get_commitID() +{ + local uid=$1; shift + + if (git log --format="%h" -1 $uid >/dev/null 2>&1); then + echo "$uid" + else + echo $(git log --format="%h" -1 --grep="$uid" 2>/dev/null) + fi +} + +get_subject_from_csv() +{ + local line=$1; shift + + echo $(echo "$line" | sed -r -e 's/.*;\s*subject=\s*//' -e 's/;\s*feature.*//') +} + +get_feature_from_csv() +{ + local line=$1; shift + + echo $(echo "$line" | sed -r -e 's/.*;\s*feature=\s*//' -e 's/;\s*upstream_status.*//') +} + +get_upstream_from_csv() +{ + local line=$1; shift + + echo $(echo "$line" | sed -r -e 's/.*;\s*upstream_status=\s*//' -e 's/;\s*general.*//') +} + +get_tag_from_csv() +{ + local line=$1; shift + + echo "$line" | sed -e 's/.*tag://g' -e 's/;//g' +} + +################################################################## +# +# main +# +if [ ! -e "$path" ]; then + echo "-E- File doesn't exist '$path' !" >&2 + echo + usage + exit 1 +fi + +RC=0 +echo "Scanning file..." +while read -r line +do + case "$line" in + *sep*) + continue + ;; + esac + cerrs= + + uid=$(get_id_from_csv "$line") + if [ "X$uid" == "X" ]; then + cerrs="$cerrs\n-E- Missing unique ID!" + RC=$(( $RC + 1)) + echo -n "At line --> '$line'" + echo -e "$cerrs" + continue + fi + if [ $(grep -wq -- "$uid" $path | wc -l) -gt 1 ]; then + cerrs="$cerrs\n-E- unique ID '$uid' apprease twice in given csv file!" + RC=$(( $RC + 1)) + echo -n "At line --> '$line'" + echo -e "$cerrs" + continue + + fi + cid=$(get_commitID $uid) + if [ -z "$cid" ]; then + cerrs="$cerrs\n-E- Failed to get commit ID!" + RC=$(( $RC + 1)) + echo -n "At line --> '$line'" + echo -e "$cerrs" + continue + fi + commit_subject=$(get_subject $cid) + line_subject=$(get_subject_from_csv "$line") + if [ "X$commit_subject" != "X$line_subject" ]; then + cerrs="$cerrs\n-E- commit $cid subject is wrong (in csv:'$line_subject' vs. in commit:'$commit_subject') !" + RC=$(( $RC + 1)) + fi + + feature=$(get_feature_from_csv "$line") + if [ -z "$feature" ]; then + cerrs="$cerrs\n-E- missing feature field!" + RC=$(( $RC + 1)) + elif ! (grep -Ewq -- "name=\s*$feature" $WDIR/$FEATURES_DB); then + cerrs="$cerrs\n-E- feature '$feature' does not exist in '$FEATURES_DB' !" + RC=$(( $RC + 1)) + fi + + upstream=$(get_upstream_from_csv "$line") + if [ -z "$upstream" ]; then + cerrs="$cerrs\n-E- missing upstream_status field!" + RC=$(( $RC + 1)) + elif ! (echo -e "$STATUS_DB" | grep -wq -- "$upstream"); then + cerrs="$cerrs\n-E- invalid upstream_status '$upstream' !" + RC=$(( $RC + 1)) + fi + + upstream=$(get_upstream_from_csv "$line") + if (echo -e "accepted" | grep -wq -- "$upstream"); then + tag=$(get_tag_from_csv "$line") + if [ -z "$tag" ] ; then + cerrs="$cerrs\n-E- missing tag for the accepted commit!" + RC=$(( $RC + 1)) + elif ! echo $tag | grep -Eq '^v?(2\.6|[3-9])\.[0-9]+(-rc[1-9]+(-s)?)?$' ; then + cerrs="$cerrs\n-E- tag: $tag has wrong format! 
Expected format like: v5.3-rc1 or v5.3" + RC=$(( $RC + 1)) + fi + fi + + if (echo $feature | grep -Eq "_bugs$"); then + if (echo -e "in_progress NA" | grep -wq -- "$upstream"); then + commit_msg=$(git log -1 --format="%b" $cid) + if !(echo "$commit_msg" | grep -Eq "^[F|f]ixes: [0-9a-f]{12,40}" ); then + cerrs="$cerrs\n-E- Missing or wrong format of 'Fixes' line in commit message! Excpected format like: 'Fixes: <12+ chars of sha1>'" + RC=$(( $RC + 1)) + fi + fi + fi + + if [ ! -z "$cerrs" ]; then + echo -n "At line --> '$line'" + echo -e "$cerrs" + echo + fi + +done < <(cat $path) + + +echo "Found $RC issues." +if [ $RC -ne 0 ]; then + echo "Please fix the above issues by manaully editing '$path'." + echo "Then run the follwoing command to verify that all is OK:" + echo "# $0 $ORIG_ARGS" +else + echo "All passed." +fi +exit $RC diff --git a/src/mlnx-ofa_kernel-5.8/drivers/base/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/base/Makefile new file mode 100644 index 0000000..523e256 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/base/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_AUXILIARY_BUS) += auxiliary.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/base/auxiliary.c b/src/mlnx-ofa_kernel-5.8/drivers/base/auxiliary.c new file mode 100644 index 0000000..100abb7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/base/auxiliary.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2019-2020 Intel Corporation + * + * Please see Documentation/driver-api/auxiliary_bus.rst for more information. + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "base.h" + +/** + * DOC: PURPOSE + * + * In some subsystems, the functionality of the core device (PCI/ACPI/other) is + * too complex for a single device to be managed by a monolithic driver (e.g. + * Sound Open Firmware), multiple devices might implement a common intersection + * of functionality (e.g. NICs + RDMA), or a driver may want to export an + * interface for another subsystem to drive (e.g. SIOV Physical Function export + * Virtual Function management). A split of the functionality into child- + * devices representing sub-domains of functionality makes it possible to + * compartmentalize, layer, and distribute domain-specific concerns via a Linux + * device-driver model. + * + * An example for this kind of requirement is the audio subsystem where a + * single IP is handling multiple entities such as HDMI, Soundwire, local + * devices such as mics/speakers etc. The split for the core's functionality + * can be arbitrary or be defined by the DSP firmware topology and include + * hooks for test/debug. This allows for the audio core device to be minimal + * and focused on hardware-specific control and communication. + * + * Each auxiliary_device represents a part of its parent functionality. The + * generic behavior can be extended and specialized as needed by encapsulating + * an auxiliary_device within other domain-specific structures and the use of + * .ops callbacks. Devices on the auxiliary bus do not share any structures and + * the use of a communication channel with the parent is domain-specific. + * + * Note that ops are intended as a way to augment instance behavior within a + * class of auxiliary devices, it is not the mechanism for exporting common + * infrastructure from the parent. 
+ * infrastructure from the parent module to the auxiliary module(s).
+ */
+
+/**
+ * DOC: USAGE
+ *
+ * The auxiliary bus is to be used when a driver and one or more kernel
+ * modules, who share a common header file with the driver, need a mechanism to
+ * connect and provide access to a shared object allocated by the
+ * auxiliary_device's registering driver. The registering driver for the
+ * auxiliary_device(s) and the kernel module(s) registering auxiliary_drivers
+ * can be from the same subsystem, or from multiple subsystems.
+ *
+ * The emphasis here is on a common generic interface that keeps subsystem
+ * customization out of the bus infrastructure.
+ *
+ * One example is a PCI network device that is RDMA-capable and exports a child
+ * device to be driven by an auxiliary_driver in the RDMA subsystem. The PCI
+ * driver allocates and registers an auxiliary_device for each physical
+ * function on the NIC. The RDMA driver registers an auxiliary_driver that
+ * claims each of these auxiliary_devices. This conveys data/ops published by
+ * the parent PCI device/driver to the RDMA auxiliary_driver.
+ *
+ * Another use case is for the PCI device to be split out into multiple sub
+ * functions. For each sub function an auxiliary_device is created. A PCI sub
+ * function driver binds to such devices that creates its own one or more class
+ * devices. A PCI sub function auxiliary device is likely to be contained in a
+ * struct with additional attributes such as user defined sub function number
+ * and optional attributes such as resources and a link to the parent device.
+ * These attributes could be used by systemd/udev; and hence should be
+ * initialized before a driver binds to an auxiliary_device.
+ *
+ * A key requirement for utilizing the auxiliary bus is that there is no
+ * dependency on a physical bus, device, register accesses or regmap support.
+ * These individual devices split from the core cannot live on the platform bus
+ * as they are not physical devices that are controlled by DT/ACPI. The same
+ * argument applies for not using MFD in this scenario as MFD relies on
+ * individual function devices being physical devices.
+ */
+
+/**
+ * DOC: EXAMPLE
+ *
+ * Auxiliary devices are created and registered by a subsystem-level core
+ * device that needs to break up its functionality into smaller fragments. One
+ * way to extend the scope of an auxiliary_device is to encapsulate it within a
+ * domain-specific structure defined by the parent device. This structure
+ * contains the auxiliary_device and any associated shared data/callbacks
+ * needed to establish the connection with the parent.
+ *
+ * An example is:
+ *
+ * .. code-block:: c
+ *
+ *      struct foo {
+ *              struct auxiliary_device auxdev;
+ *              void (*connect)(struct auxiliary_device *auxdev);
+ *              void (*disconnect)(struct auxiliary_device *auxdev);
+ *              void *data;
+ *      };
+ *
+ * The parent device then registers the auxiliary_device by calling
+ * auxiliary_device_init(), and then auxiliary_device_add(), with the pointer
+ * to the auxdev member of the above structure. The parent provides a name for
+ * the auxiliary_device that, combined with the parent's KBUILD_MODNAME,
+ * creates a match_name that is used for matching and binding with a driver.
+ *
+ * Whenever an auxiliary_driver is registered, based on the match_name, the
+ * auxiliary_driver's probe() is invoked for the matching devices.
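+ * (Illustrative note based on the hypothetical "foo" example above: a parent
+ * module named "foo_mod" registering an auxiliary_device named "foo_dev" with
+ * id 0 gets the device name "foo_mod.foo_dev.0", and the match_name used for
+ * driver binding is "foo_mod.foo_dev", as in the id_table further below.)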
The + * auxiliary_driver can also be encapsulated inside custom drivers that make + * the core device's functionality extensible by adding additional + * domain-specific ops as follows: + * + * .. code-block:: c + * + * struct my_ops { + * void (*send)(struct auxiliary_device *auxdev); + * void (*receive)(struct auxiliary_device *auxdev); + * }; + * + * + * struct my_driver { + * struct auxiliary_driver auxiliary_drv; + * const struct my_ops ops; + * }; + * + * An example of this type of usage is: + * + * .. code-block:: c + * + * const struct auxiliary_device_id my_auxiliary_id_table[] = { + * { .name = "foo_mod.foo_dev" }, + * { }, + * }; + * + * const struct my_ops my_custom_ops = { + * .send = my_tx, + * .receive = my_rx, + * }; + * + * const struct my_driver my_drv = { + * .auxiliary_drv = { + * .name = "myauxiliarydrv", + * .id_table = my_auxiliary_id_table, + * .probe = my_probe, + * .remove = my_remove, + * .shutdown = my_shutdown, + * }, + * .ops = my_custom_ops, + * }; + */ + +static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, + const struct auxiliary_device *auxdev) +{ + for (; id->name[0]; id++) { + const char *p = strrchr(dev_name(&auxdev->dev), '.'); + int match_size; + + if (!p) + continue; + match_size = p - dev_name(&auxdev->dev); + + /* use dev_name(&auxdev->dev) prefix before last '.' char to match to */ + if (strlen(id->name) == match_size && + !strncmp(dev_name(&auxdev->dev), id->name, match_size)) + return id; + } + return NULL; +} + +static int auxiliary_match(struct device *dev, struct device_driver *drv) +{ + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + struct auxiliary_driver *auxdrv = to_auxiliary_drv(drv); + + return !!auxiliary_match_id(auxdrv->id_table, auxdev); +} + +static int auxiliary_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + const char *name, *p; + + name = dev_name(dev); + p = strrchr(name, '.'); + + return add_uevent_var(env, "MODALIAS=%s%.*s", AUXILIARY_MODULE_PREFIX, + (int)(p - name), name); +} + +static const struct dev_pm_ops auxiliary_dev_pm_ops = { + SET_RUNTIME_PM_OPS(pm_generic_runtime_suspend, pm_generic_runtime_resume, NULL) + SET_SYSTEM_SLEEP_PM_OPS(pm_generic_suspend, pm_generic_resume) +}; + +static int auxiliary_bus_probe(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + int ret; + + ret = dev_pm_domain_attach(dev, true); + if (ret) { + dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); + return ret; + } + + ret = auxdrv->probe(auxdev, auxiliary_match_id(auxdrv->id_table, auxdev)); + if (ret) + dev_pm_domain_detach(dev, true); + + return ret; +} + +static void auxiliary_bus_remove(struct device *dev) +{ + struct auxiliary_driver *auxdrv = to_auxiliary_drv(dev->driver); + struct auxiliary_device *auxdev = to_auxiliary_dev(dev); + + if (auxdrv->remove) + auxdrv->remove(auxdev); + dev_pm_domain_detach(dev, true); +} + +static void auxiliary_bus_shutdown(struct device *dev) +{ + struct auxiliary_driver *auxdrv = NULL; + struct auxiliary_device *auxdev; + + if (dev->driver) { + auxdrv = to_auxiliary_drv(dev->driver); + auxdev = to_auxiliary_dev(dev); + } + + if (auxdrv && auxdrv->shutdown) + auxdrv->shutdown(auxdev); +} + +static struct bus_type auxiliary_bus_type = { + .name = "auxiliary", + .probe = auxiliary_bus_probe, + .remove = auxiliary_bus_remove, + .shutdown = auxiliary_bus_shutdown, + .match = auxiliary_match, + .uevent = auxiliary_uevent, + .pm = 
&auxiliary_dev_pm_ops, +}; + +/** + * auxiliary_device_init - check auxiliary_device and initialize + * @auxdev: auxiliary device struct + * + * This is the second step in the three-step process to register an + * auxiliary_device. + * + * When this function returns an error code, then the device_initialize will + * *not* have been performed, and the caller will be responsible to free any + * memory allocated for the auxiliary_device in the error path directly. + * + * It returns 0 on success. On success, the device_initialize has been + * performed. After this point any error unwinding will need to include a call + * to auxiliary_device_uninit(). In this post-initialize error scenario, a call + * to the device's .release callback will be triggered, and all memory clean-up + * is expected to be handled there. + */ +int auxiliary_device_init(struct auxiliary_device *auxdev) +{ + struct device *dev = &auxdev->dev; + + if (!dev->parent) { + pr_err("auxiliary_device has a NULL dev->parent\n"); + return -EINVAL; + } + + if (!auxdev->name) { + pr_err("auxiliary_device has a NULL name\n"); + return -EINVAL; + } + + dev->bus = &auxiliary_bus_type; + device_initialize(&auxdev->dev); + return 0; +} +EXPORT_SYMBOL_GPL(auxiliary_device_init); + +/** + * __auxiliary_device_add - add an auxiliary bus device + * @auxdev: auxiliary bus device to add to the bus + * @modname: name of the parent device's driver module + * + * This is the third step in the three-step process to register an + * auxiliary_device. + * + * This function must be called after a successful call to + * auxiliary_device_init(), which will perform the device_initialize. This + * means that if this returns an error code, then a call to + * auxiliary_device_uninit() must be performed so that the .release callback + * will be triggered to free the memory associated with the auxiliary_device. + * + * The expectation is that users will call the "auxiliary_device_add" macro so + * that the caller's KBUILD_MODNAME is automatically inserted for the modname + * parameter. Only if a user requires a custom name would this version be + * called directly. + */ +int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname) +{ + struct device *dev = &auxdev->dev; + int ret; + + if (!modname) { + dev_err(dev, "auxiliary device modname is NULL\n"); + return -EINVAL; + } + + ret = dev_set_name(dev, "%s.%s.%d", modname, auxdev->name, auxdev->id); + if (ret) { + dev_err(dev, "auxiliary device dev_set_name failed: %d\n", ret); + return ret; + } + + ret = device_add(dev); + if (ret) + dev_err(dev, "adding auxiliary device failed!: %d\n", ret); + + return ret; +} +EXPORT_SYMBOL_GPL(__auxiliary_device_add); + +/** + * auxiliary_find_device - auxiliary device iterator for locating a particular device. + * @start: Device to begin with + * @data: Data to pass to match function + * @match: Callback function to check device + * + * This function returns a reference to a device that is 'found' + * for later use, as determined by the @match callback. + * + * The reference returned should be released with put_device(). + * + * The callback should return 0 if the device doesn't match and non-zero + * if it does. If the callback returns non-zero, this function will + * return to the caller and not iterate over any more devices. 
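+ *
+ * A minimal usage sketch (illustrative only; the match callback and the
+ * device name being searched for are hypothetical):
+ *
+ *      static int match_name(struct device *dev, const void *data)
+ *      {
+ *              /* non-zero return means "this is the device we want" */
+ *              return !strcmp(dev_name(dev), data);
+ *      }
+ *
+ *      struct auxiliary_device *auxdev;
+ *
+ *      auxdev = auxiliary_find_device(NULL, "foo_mod.foo_dev.0", match_name);
+ *      if (auxdev)
+ *              put_device(&auxdev->dev);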
+ */ +struct auxiliary_device *auxiliary_find_device(struct device *start, + const void *data, + int (*match)(struct device *dev, const void *data)) +{ + struct device *dev; + + dev = bus_find_device(&auxiliary_bus_type, start, data, match); + if (!dev) + return NULL; + + return to_auxiliary_dev(dev); +} +EXPORT_SYMBOL_GPL(auxiliary_find_device); + +/** + * __auxiliary_driver_register - register a driver for auxiliary bus devices + * @auxdrv: auxiliary_driver structure + * @owner: owning module/driver + * @modname: KBUILD_MODNAME for parent driver + * + * The expectation is that users will call the "auxiliary_driver_register" + * macro so that the caller's KBUILD_MODNAME is automatically inserted for the + * modname parameter. Only if a user requires a custom name would this version + * be called directly. + */ +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, + struct module *owner, const char *modname) +{ + int ret; + + if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table)) + return -EINVAL; + + if (auxdrv->name) + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s.%s", modname, + auxdrv->name); + else + auxdrv->driver.name = kasprintf(GFP_KERNEL, "%s", modname); + if (!auxdrv->driver.name) + return -ENOMEM; + + auxdrv->driver.owner = owner; + auxdrv->driver.bus = &auxiliary_bus_type; + auxdrv->driver.mod_name = modname; + + ret = driver_register(&auxdrv->driver); + if (ret) + kfree(auxdrv->driver.name); + + return ret; +} +EXPORT_SYMBOL_GPL(__auxiliary_driver_register); + +/** + * auxiliary_driver_unregister - unregister a driver + * @auxdrv: auxiliary_driver structure + */ +void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv) +{ + driver_unregister(&auxdrv->driver); + kfree(auxdrv->driver.name); +} +EXPORT_SYMBOL_GPL(auxiliary_driver_unregister); + +void __init auxiliary_bus_init(void) +{ + WARN_ON(bus_register(&auxiliary_bus_type)); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/Kconfig new file mode 100644 index 0000000..33d3ce9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/Kconfig @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig INFINIBAND + tristate "InfiniBand support" + depends on HAS_IOMEM && HAS_DMA + depends on NET + depends on INET + depends on m || IPV6 != m + depends on !ALPHA + select IRQ_POLL + select DIMLIB + help + Core support for InfiniBand (IB). Make sure to also select + any protocols you wish to use as well as drivers for your + InfiniBand hardware. + +if INFINIBAND + +config INFINIBAND_USER_MAD + tristate "InfiniBand userspace MAD support" + depends on INFINIBAND + help + Userspace InfiniBand Management Datagram (MAD) support. This + is the kernel side of the userspace MAD support, which allows + userspace processes to send and receive MADs. You will also + need libibumad from rdma-core + . + +config INFINIBAND_USER_ACCESS + tristate "InfiniBand userspace access (verbs and CM)" + depends on MMU + help + Userspace InfiniBand access support. This enables the + kernel side of userspace verbs and the userspace + communication manager (CM). This allows userspace processes + to set up connections and directly access InfiniBand + hardware for fast-path operations. You will also need + libibverbs, libibcm and a hardware driver library from + rdma-core . 
+ +config INFINIBAND_USER_MEM + bool + depends on INFINIBAND_USER_ACCESS != n + depends on MMU + select DMA_SHARED_BUFFER + default y + +config INFINIBAND_ON_DEMAND_PAGING + bool "InfiniBand on-demand paging support" + depends on INFINIBAND_USER_MEM + select MMU_NOTIFIER + select INTERVAL_TREE + select HMM_MIRROR + default y + help + On demand paging support for the InfiniBand subsystem. + Together with driver support this allows registration of + memory regions without pinning their pages, fetching the + pages on demand instead. + +config INFINIBAND_ADDR_TRANS + bool "RDMA/CM" + depends on INFINIBAND + default y + help + Support for RDMA communication manager (CM). + This allows for a generic connection abstraction over RDMA. + +config INFINIBAND_ADDR_TRANS_CONFIGFS + bool + depends on INFINIBAND_ADDR_TRANS && CONFIGFS_FS && !(INFINIBAND=y && CONFIGFS_FS=m) + default y + help + ConfigFS support for RDMA communication manager (CM). + This allows the user to config the default GID type that the CM + uses for each device, when initiaing new connections. + +config INFINIBAND_VIRT_DMA + def_bool !HIGHMEM + +if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS +source "drivers/infiniband/hw/mthca/Kconfig" +source "drivers/infiniband/hw/qib/Kconfig" +source "drivers/infiniband/hw/cxgb4/Kconfig" +source "drivers/infiniband/hw/efa/Kconfig" +source "drivers/infiniband/hw/irdma/Kconfig" +source "drivers/infiniband/hw/mlx4/Kconfig" +source "drivers/infiniband/hw/mlx5/Kconfig" +source "drivers/infiniband/hw/ocrdma/Kconfig" +source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" +source "drivers/infiniband/hw/usnic/Kconfig" +source "drivers/infiniband/hw/hns/Kconfig" +source "drivers/infiniband/hw/bnxt_re/Kconfig" +source "drivers/infiniband/hw/hfi1/Kconfig" +source "drivers/infiniband/hw/qedr/Kconfig" +source "drivers/infiniband/sw/rdmavt/Kconfig" +source "drivers/infiniband/sw/rxe/Kconfig" +source "drivers/infiniband/sw/siw/Kconfig" +endif + +source "drivers/infiniband/ulp/ipoib/Kconfig" + +source "drivers/infiniband/ulp/srp/Kconfig" +source "drivers/infiniband/ulp/srpt/Kconfig" + +source "drivers/infiniband/ulp/iser/Kconfig" +source "drivers/infiniband/ulp/isert/Kconfig" +source "drivers/infiniband/ulp/rtrs/Kconfig" + +source "drivers/infiniband/ulp/opa_vnic/Kconfig" + +endif # INFINIBAND diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/Makefile new file mode 100644 index 0000000..8603cdf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_INFINIBAND) += core/ +obj-$(CONFIG_INFINIBAND) += hw/ +obj-$(CONFIG_INFINIBAND) += ulp/ +obj-$(CONFIG_INFINIBAND) += sw/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/Makefile new file mode 100644 index 0000000..c054838 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/Makefile @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 +infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o +user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o + +obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ + $(infiniband-y) +obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o +obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y) +obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o + +obj-$(CONFIG_INFINIBAND_CORE_DUMMY) += ib_sa.o ib_mad.o ib_addr.o + +ib_sa-y := ib_sa_dummy.o +ib_mad-y := ib_mad_dummy.o +ib_addr-y := 
ib_addr_dummy.o + +ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o verbs_nvmf.o \ + device.o cache.o netlink.o \ + roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ + multicast.o mad.o smi.o agent.o mad_rmpp.o \ + nldev.o restrack.o counters.o ib_core_uverbs.o \ + trace.o lag.o + +ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o +ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o + +ib_cm-y := cm.o cm_trace.o + +iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o + +CFLAGS_cma_trace.o += -I$(src) +rdma_cm-y := cma.o cma_trace.o + +rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o + +rdma_ucm-y := ucma.o + +ib_umad-y := user_mad.o + +ib_ucm-y := ib_ucm_dummy.o + +ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ + rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ + uverbs_std_types_cq.o \ + uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ + uverbs_std_types_mr.o uverbs_std_types_counters.o \ + uverbs_uapi.o uverbs_std_types_device.o \ + uverbs_std_types_async_fd.o \ + uverbs_std_types_srq.o \ + uverbs_std_types_wq.o \ + uverbs_std_types_qp.o + +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o peer_mem.o +ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/addr.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/addr.c new file mode 100644 index 0000000..82a1eeb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/addr.c @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" + +struct addr_req { + struct list_head list; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; + struct rdma_dev_addr *addr; + void *context; + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context); + unsigned long timeout; + struct delayed_work work; + bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */ + int status; + u32 seq; +}; + +static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0); + +static DEFINE_SPINLOCK(lock); +static LIST_HEAD(req_list); +static struct workqueue_struct *addr_wq; + +static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid), + .validation_type = NLA_VALIDATE_MIN, + .min = sizeof(struct rdma_nla_ls_gid)}, +}; + +static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX] = {}; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return false; + + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_addr_policy, NULL); + if (ret) + return false; + + return true; +} + +static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) +{ + const struct nlattr *head, *curr; + union ib_gid gid; + struct addr_req *req; + int len, rem; + int found = 0; + + head = (const struct nlattr *)nlmsg_data(nlh); + len = nlmsg_len(nlh); + + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type == LS_NLA_TYPE_DGID) + memcpy(&gid, nla_data(curr), nla_len(curr)); + } + + spin_lock_bh(&lock); + list_for_each_entry(req, &req_list, list) { + if (nlh->nlmsg_seq != req->seq) + continue; + /* We set the DGID part, the rest was set earlier */ + rdma_addr_set_dgid(req->addr, &gid); + req->status = 0; + found = 1; + break; + } + spin_unlock_bh(&lock); + + if (!found) + pr_info("Couldn't find request waiting for DGID: %pI6\n", + &gid); +} + +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk)) + return -EPERM; + + if (ib_nl_is_good_ip_resp(nlh)) + ib_nl_process_good_ip_rsep(nlh); + + return 0; +} + +static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, + const void *daddr, + u32 seq, u16 family) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + struct rdma_ls_ip_resolve_header *header; + void *data; + size_t size; + int attrtype; + int len; + + if (family == AF_INET) { + size = sizeof(struct in_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4; + } else { + size = sizeof(struct in6_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; + } + + len = nla_total_size(sizeof(size)); + len += NLMSG_ALIGN(sizeof(*header)); + + skb = nlmsg_new(len, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST); + if (!data) { + nlmsg_free(skb); + return -ENODATA; + } + + /* Construct the family header first */ + header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header->ifindex = dev_addr->bound_dev_if; + nla_put(skb, attrtype, size, daddr); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_ATOMIC); + + /* Make 
the request retry, so when we get the response from userspace + * we will have something. + */ + return -ENODATA; +} + +int rdma_addr_size(const struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct sockaddr_ib); + default: + return 0; + } +} +EXPORT_SYMBOL(rdma_addr_size); + +int rdma_addr_size_in6(struct sockaddr_in6 *addr) +{ + int ret = rdma_addr_size((struct sockaddr *) addr); + + return ret <= sizeof(*addr) ? ret : 0; +} +EXPORT_SYMBOL(rdma_addr_size_in6); + +int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) +{ + int ret = rdma_addr_size((struct sockaddr *) addr); + + return ret <= sizeof(*addr) ? ret : 0; +} +EXPORT_SYMBOL(rdma_addr_size_kss); + +/** + * rdma_copy_src_l2_addr - Copy netdevice source addresses + * @dev_addr: Destination address pointer where to copy the addresses + * @dev: Netdevice whose source addresses to copy + * + * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice. + * This includes unicast address, broadcast address, device type and + * interface index. + */ +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev) +{ + dev_addr->dev_type = dev->type; + memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); + dev_addr->bound_dev_if = dev->ifindex; +} +EXPORT_SYMBOL(rdma_copy_src_l2_addr); + +static struct net_device * +rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) +{ + struct net_device *dev = NULL; + int ret = -EADDRNOTAVAIL; + + switch (src_in->sa_family) { + case AF_INET: + dev = __ip_dev_find(net, + ((const struct sockaddr_in *)src_in)->sin_addr.s_addr, + false); + if (dev) + ret = 0; + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + for_each_netdev_rcu(net, dev) { + if (ipv6_chk_addr(net, + &((const struct sockaddr_in6 *)src_in)->sin6_addr, + dev, 1)) { + ret = 0; + break; + } + } + break; +#endif + } + return ret ? 
ERR_PTR(ret) : dev; +} + +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr) +{ + struct net_device *dev; + + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + rdma_copy_src_l2_addr(dev_addr, dev); + dev_put(dev); + return 0; + } + + rcu_read_lock(); + dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); + if (!IS_ERR(dev)) + rdma_copy_src_l2_addr(dev_addr, dev); + rcu_read_unlock(); + return PTR_ERR_OR_ZERO(dev); +} +EXPORT_SYMBOL(rdma_translate_ip); + +static void set_timeout(struct addr_req *req, unsigned long time) +{ + unsigned long delay; + + delay = time - jiffies; + if ((long)delay < 0) + delay = 0; + + mod_delayed_work(addr_wq, &req->work, delay); +} + +static void queue_req(struct addr_req *req) +{ + spin_lock_bh(&lock); + list_add_tail(&req->list, &req_list); + set_timeout(req, req->timeout); + spin_unlock_bh(&lock); +} + +static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, + const void *daddr, u32 seq, u16 family) +{ + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) + return -EADDRNOTAVAIL; + + return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); +} + +static int dst_fetch_ha(const struct dst_entry *dst, + struct rdma_dev_addr *dev_addr, + const void *daddr) +{ + struct neighbour *n; + int ret = 0; + + n = dst_neigh_lookup(dst, daddr); + if (!n) + return -ENODATA; + + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); + ret = -ENODATA; + } else { + neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev); + } + + neigh_release(n); + + return ret; +} + +static bool has_gateway(const struct dst_entry *dst, sa_family_t family) +{ + struct rtable *rt; + struct rt6_info *rt6; + + if (family == AF_INET) { + rt = container_of(dst, struct rtable, dst); + return rt->rt_uses_gateway; + } + + rt6 = container_of(dst, struct rt6_info, dst); + return rt6->rt6i_flags & RTF_GATEWAY; +} + +static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, u32 seq) +{ + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + const void *daddr = (dst_in->sa_family == AF_INET) ? 
+ (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr; + sa_family_t family = dst_in->sa_family; + + might_sleep(); + + /* If we have a gateway in IB mode then it must be an IB network */ + if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) + return ib_nl_fetch_ha(dev_addr, daddr, seq, family); + else + return dst_fetch_ha(dst, dev_addr, daddr); +} + +static int addr4_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, + struct rdma_dev_addr *addr, + struct rtable **prt) +{ + struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock; + const struct sockaddr_in *dst_in = + (const struct sockaddr_in *)dst_sock; + + __be32 src_ip = src_in->sin_addr.s_addr; + __be32 dst_ip = dst_in->sin_addr.s_addr; + struct rtable *rt; + struct flowi4 fl4; + int ret; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dst_ip; + fl4.saddr = src_ip; + fl4.flowi4_oif = addr->bound_dev_if; + rt = ip_route_output_key(addr->net, &fl4); + ret = PTR_ERR_OR_ZERO(rt); + if (ret) + return ret; + + src_in->sin_addr.s_addr = fl4.saddr; + + addr->hoplimit = ip4_dst_hoplimit(&rt->dst); + + *prt = rt; + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) +{ + struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock; + const struct sockaddr_in6 *dst_in = + (const struct sockaddr_in6 *)dst_sock; + struct flowi6 fl6; + struct dst_entry *dst; + + memset(&fl6, 0, sizeof fl6); + fl6.daddr = dst_in->sin6_addr; + fl6.saddr = src_in->sin6_addr; + fl6.flowi6_oif = addr->bound_dev_if; + + dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + if (ipv6_addr_any(&src_in->sin6_addr)) + src_in->sin6_addr = fl6.saddr; + + addr->hoplimit = ip6_dst_hoplimit(dst); + + *pdst = dst; + return 0; +} +#else +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) +{ + return -EADDRNOTAVAIL; +} +#endif + +static int addr_resolve_neigh(const struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + unsigned int ndev_flags, + u32 seq) +{ + int ret = 0; + + if (ndev_flags & IFF_LOOPBACK) { + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + } else { + if (!(ndev_flags & IFF_NOARP)) { + /* If the device doesn't do ARP internally */ + ret = fetch_ha(dst, addr, dst_in, seq); + } + } + return ret; +} + +static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, + const struct dst_entry *dst, + const struct net_device *ndev) +{ + int ret = 0; + + if (dst->dev->flags & IFF_LOOPBACK) + ret = rdma_translate_ip(dst_in, dev_addr); + else + rdma_copy_src_l2_addr(dev_addr, dst->dev); + + /* + * If there's a gateway and type of device not ARPHRD_INFINIBAND, + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ + if (has_gateway(dst, dst_in->sa_family) && + ndev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? 
+ RDMA_NETWORK_IPV4 : + RDMA_NETWORK_IPV6; + else + dev_addr->network = RDMA_NETWORK_IB; + + return ret; +} + +static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, + unsigned int *ndev_flags, + const struct sockaddr *dst_in, + const struct dst_entry *dst) +{ + struct net_device *ndev = READ_ONCE(dst->dev); + + *ndev_flags = ndev->flags; + /* A physical device must be the RDMA device to use */ + if (ndev->flags & IFF_LOOPBACK) { + /* + * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or + * loopback IP address. So if route is resolved to loopback + * interface, translate that to a real ndev based on non + * loopback IP address. + */ + ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); + if (IS_ERR(ndev)) + return -ENODEV; + } + + return copy_src_l2_addr(dev_addr, dst_in, dst, ndev); +} + +static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) +{ + struct net_device *ndev; + + ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr); + if (IS_ERR(ndev)) + return PTR_ERR(ndev); + + /* + * Since we are holding the rcu, reading net and ifindex + * are safe without any additional reference; because + * change_net_namespace() in net/core/dev.c does rcu sync + * after it changes the state to IFF_DOWN and before + * updating netdev fields {net, ifindex}. + */ + addr->net = dev_net(ndev); + addr->bound_dev_if = ndev->ifindex; + return 0; +} + +static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr) +{ + addr->net = &init_net; + addr->bound_dev_if = 0; +} + +static int addr_resolve(struct sockaddr *src_in, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh, + bool resolve_by_gid_attr, + u32 seq) +{ + struct dst_entry *dst = NULL; + unsigned int ndev_flags = 0; + struct rtable *rt = NULL; + int ret; + + if (!addr->net) { + pr_warn_ratelimited("%s: missing namespace\n", __func__); + return -EINVAL; + } + + rcu_read_lock(); + if (resolve_by_gid_attr) { + if (!addr->sgid_attr) { + rcu_read_unlock(); + pr_warn_ratelimited("%s: missing gid_attr\n", __func__); + return -EINVAL; + } + /* + * If the request is for a specific gid attribute of the + * rdma_dev_addr, derive net from the netdevice of the + * GID attribute. + */ + ret = set_addr_netns_by_gid_rcu(addr); + if (ret) { + rcu_read_unlock(); + return ret; + } + } + if (src_in->sa_family == AF_INET) { + ret = addr4_resolve(src_in, dst_in, addr, &rt); + dst = &rt->dst; + } else { + ret = addr6_resolve(src_in, dst_in, addr, &dst); + } + if (ret) { + rcu_read_unlock(); + goto done; + } + ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); + rcu_read_unlock(); + + /* + * Resolve neighbor destination address if requested and + * only if src addr translation didn't fail. + */ + if (!ret && resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); + + if (src_in->sa_family == AF_INET) + ip_rt_put(rt); + else + dst_release(dst); +done: + /* + * Clear the addr net to go back to its original state, only if it was + * derived from GID attribute in this context. 
+ */ + if (resolve_by_gid_attr) + rdma_addr_set_net_defaults(addr); + return ret; +} + +static void process_one_req(struct work_struct *_work) +{ + struct addr_req *req; + struct sockaddr *src_in, *dst_in; + + req = container_of(_work, struct addr_req, work.work); + + if (req->status == -ENODATA) { + src_in = (struct sockaddr *)&req->src_addr; + dst_in = (struct sockaddr *)&req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true, req->resolve_by_gid_attr, + req->seq); + if (req->status && time_after_eq(jiffies, req->timeout)) { + req->status = -ETIMEDOUT; + } else if (req->status == -ENODATA) { + /* requeue the work for retrying again */ + spin_lock_bh(&lock); + if (!list_empty(&req->list)) + set_timeout(req, req->timeout); + spin_unlock_bh(&lock); + return; + } + } + + req->callback(req->status, (struct sockaddr *)&req->src_addr, + req->addr, req->context); + req->callback = NULL; + + spin_lock_bh(&lock); + /* + * Although the work will normally have been canceled by the workqueue, + * it can still be requeued as long as it is on the req_list. + */ + cancel_delayed_work(&req->work); + if (!list_empty(&req->list)) { + list_del_init(&req->list); + kfree(req); + } + spin_unlock_bh(&lock); +} + +int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, unsigned long timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + bool resolve_by_gid_attr, void *context) +{ + struct sockaddr *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + src_in = (struct sockaddr *) &req->src_addr; + dst_in = (struct sockaddr *) &req->dst_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) { + ret = -EINVAL; + goto err; + } + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + req->resolve_by_gid_attr = resolve_by_gid_attr; + INIT_DELAYED_WORK(&req->work, process_one_req); + req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); + + req->status = addr_resolve(src_in, dst_in, addr, true, + req->resolve_by_gid_attr, req->seq); + switch (req->status) { + case 0: + req->timeout = jiffies; + queue_req(req); + break; + case -ENODATA: + req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; + queue_req(req); + break; + default: + ret = req->status; + goto err; + } + return ret; +err: + kfree(req); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ip); + +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr) +{ + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid, dgid; + struct rdma_dev_addr dev_addr = {}; + int ret; + + might_sleep(); + + if (rec->roce.route_resolved) + return 0; + + rdma_gid2ip((struct sockaddr *)&sgid, &rec->sgid); + rdma_gid2ip((struct sockaddr *)&dgid, &rec->dgid); + + if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family) + return -EINVAL; + + if (!attr || !attr->ndev) + return -EINVAL; + + dev_addr.net = &init_net; + dev_addr.sgid_attr = attr; + + ret = addr_resolve((struct sockaddr *)&sgid, (struct sockaddr *)&dgid, + &dev_addr, false, true, 0); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + 
rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) + return -EINVAL; + + rec->roce.route_resolved = true; + return 0; +} + +/** + * rdma_addr_cancel - Cancel resolve ip request + * @addr: Pointer to address structure given previously + * during rdma_resolve_ip(). + * rdma_addr_cancel() is synchronous function which cancels any pending + * request if there is any. + */ +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *temp_req; + struct addr_req *found = NULL; + + spin_lock_bh(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->addr == addr) { + /* + * Removing from the list means we take ownership of + * the req + */ + list_del_init(&req->list); + found = req; + break; + } + } + spin_unlock_bh(&lock); + + if (!found) + return; + + /* + * sync canceling the work after removing it from the req_list + * guarentees no work is running and none will be started. + */ + cancel_delayed_work_sync(&found->work); + kfree(found); +} +EXPORT_SYMBOL(rdma_addr_cancel); + +struct resolve_cb_context { + struct completion comp; + int status; +}; + +static void resolve_cb(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context) +{ + ((struct resolve_cb_context *)context)->status = status; + complete(&((struct resolve_cb_context *)context)->comp); +} + +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, const struct ib_gid_attr *sgid_attr, + int *hoplimit) +{ + struct rdma_dev_addr dev_addr; + struct resolve_cb_context ctx; + union { + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + int ret; + + rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid); + rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid); + + memset(&dev_addr, 0, sizeof(dev_addr)); + dev_addr.net = &init_net; + dev_addr.sgid_attr = sgid_attr; + + init_completion(&ctx.comp); + ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr, + (struct sockaddr *)&dgid_addr, &dev_addr, 1000, + resolve_cb, true, &ctx); + if (ret) + return ret; + + wait_for_completion(&ctx.comp); + + ret = ctx.status; + if (ret) + return ret; + + memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); + *hoplimit = dev_addr.hoplimit; + return 0; +} + +static int netevent_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + struct addr_req *req; + + if (event == NETEVENT_NEIGH_UPDATE) { + struct neighbour *neigh = ctx; + + if (neigh->nud_state & NUD_VALID) { + spin_lock_bh(&lock); + list_for_each_entry(req, &req_list, list) + set_timeout(req, jiffies); + spin_unlock_bh(&lock); + } + } + return 0; +} + +static struct notifier_block nb = { + .notifier_call = netevent_callback +}; + +int addr_init(void) +{ + addr_wq = alloc_ordered_workqueue("ib_addr", 0); + if (!addr_wq) + return -ENOMEM; + + register_netevent_notifier(&nb); + + return 0; +} + +void addr_cleanup(void) +{ + unregister_netevent_notifier(&nb); + destroy_workqueue(addr_wq); + WARN_ON(!list_empty(&req_list)); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.c new file mode 100644 index 0000000..f82b426 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. 
All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "agent.h" +#include "smi.h" +#include "mad_priv.h" + +#define SPFX "ib_agent: " + +struct ib_agent_port_private { + struct list_head port_list; + struct ib_mad_agent *agent[2]; +}; + +static DEFINE_SPINLOCK(ib_agent_port_list_lock); +static LIST_HEAD(ib_agent_port_list); + +static struct ib_agent_port_private * +__ib_get_agent_port(const struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *entry; + + list_for_each_entry(entry, &ib_agent_port_list, port_list) { + if (entry->agent[1]->device == device && + entry->agent[1]->port_num == port_num) + return entry; + } + return NULL; +} + +static struct ib_agent_port_private * +ib_get_agent_port(const struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *entry; + unsigned long flags; + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + entry = __ib_get_agent_port(device, port_num); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + return entry; +} + +void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa) +{ + struct ib_agent_port_private *port_priv; + struct ib_mad_agent *agent; + struct ib_mad_send_buf *send_buf; + struct ib_ah *ah; + struct ib_mad_send_wr_private *mad_send_wr; + + if (rdma_cap_ib_switch(device)) + port_priv = ib_get_agent_port(device, 0); + else + port_priv = ib_get_agent_port(device, port_num); + + if (!port_priv) { + dev_err(&device->dev, "Unable to find port agent\n"); + return; + } + + agent = port_priv->agent[qpn]; + ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); + if (IS_ERR(ah)) { + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", + PTR_ERR(ah)); + return; + } + + if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION) + resp_mad_len = IB_MGMT_MAD_SIZE; + + send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, + IB_MGMT_MAD_HDR, + resp_mad_len - IB_MGMT_MAD_HDR, + GFP_KERNEL, + 
mad_hdr->base_version); + if (IS_ERR(send_buf)) { + dev_err(&device->dev, "ib_create_send_mad error\n"); + goto err1; + } + + memcpy(send_buf->mad, mad_hdr, resp_mad_len); + send_buf->ah = ah; + + if (rdma_cap_ib_switch(device)) { + mad_send_wr = container_of(send_buf, + struct ib_mad_send_wr_private, + send_buf); + mad_send_wr->send_wr.port_num = port_num; + } + + if (ib_post_send_mad(send_buf, NULL)) { + dev_err(&device->dev, "ib_post_send_mad error\n"); + goto err2; + } + return; +err2: + ib_free_send_mad(send_buf); +err1: + rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); +} + +static void agent_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc) +{ + rdma_destroy_ah(mad_send_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE); + ib_free_send_mad(mad_send_wc->send_buf); +} + +int ib_agent_port_open(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *port_priv; + unsigned long flags; + int ret; + + /* Create new device info */ + port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); + if (!port_priv) { + ret = -ENOMEM; + goto error1; + } + + if (rdma_cap_ib_smi(device, port_num)) { + /* Obtain send only MAD agent for SMI QP */ + port_priv->agent[0] = ib_register_mad_agent(device, port_num, + IB_QPT_SMI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[0])) { + ret = PTR_ERR(port_priv->agent[0]); + goto error2; + } + } + + /* Obtain send only MAD agent for GSI QP */ + port_priv->agent[1] = ib_register_mad_agent(device, port_num, + IB_QPT_GSI, NULL, 0, + &agent_send_handler, + NULL, NULL, 0); + if (IS_ERR(port_priv->agent[1])) { + ret = PTR_ERR(port_priv->agent[1]); + goto error3; + } + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + list_add_tail(&port_priv->port_list, &ib_agent_port_list); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + + return 0; + +error3: + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); +error2: + kfree(port_priv); +error1: + return ret; +} + +int ib_agent_port_close(struct ib_device *device, int port_num) +{ + struct ib_agent_port_private *port_priv; + unsigned long flags; + + spin_lock_irqsave(&ib_agent_port_list_lock, flags); + port_priv = __ib_get_agent_port(device, port_num); + if (port_priv == NULL) { + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + dev_err(&device->dev, "Port %d not found\n", port_num); + return -ENODEV; + } + list_del(&port_priv->port_list); + spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); + + ib_unregister_mad_agent(port_priv->agent[1]); + if (port_priv->agent[0]) + ib_unregister_mad_agent(port_priv->agent[0]); + + kfree(port_priv); + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.h new file mode 100644 index 0000000..65f92be --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/agent.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __AGENT_H_ +#define __AGENT_H_ + +#include +#include + +extern int ib_agent_port_open(struct ib_device *device, int port_num); + +extern int ib_agent_port_close(struct ib_device *device, int port_num); + +extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh, + const struct ib_wc *wc, const struct ib_device *device, + int port_num, int qpn, size_t resp_mad_len, bool opa); + +#endif /* __AGENT_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cache.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cache.c new file mode 100644 index 0000000..066bb19 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cache.c @@ -0,0 +1,1729 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "core_priv.h" + +struct ib_pkey_cache { + int table_len; + u16 table[]; +}; + +struct ib_update_work { + struct work_struct work; + struct ib_event event; + bool enforce_security; +}; + +union ib_gid zgid; +EXPORT_SYMBOL(zgid); + +enum gid_attr_find_mask { + GID_ATTR_FIND_MASK_GID = 1UL << 0, + GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, + GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, +}; + +enum gid_table_entry_state { + GID_TABLE_ENTRY_INVALID = 1, + GID_TABLE_ENTRY_VALID = 2, + /* + * Indicates that entry is pending to be removed, there may + * be active users of this GID entry. + * When last user of the GID entry releases reference to it, + * GID entry is detached from the table. + */ + GID_TABLE_ENTRY_PENDING_DEL = 3, +}; + +struct roce_gid_ndev_storage { + struct rcu_head rcu_head; + struct net_device *ndev; +}; + +struct ib_gid_table_entry { + struct kref kref; + struct work_struct del_work; + struct ib_gid_attr attr; + void *context; + /* Store the ndev pointer to release reference later on in + * call_rcu context because by that time gid_table_entry + * and attr might be already freed. So keep a copy of it. + * ndev_storage is freed by rcu callback. + */ + struct roce_gid_ndev_storage *ndev_storage; + enum gid_table_entry_state state; +}; + +struct ib_gid_table { + int sz; + /* In RoCE, adding a GID to the table requires: + * (a) Find if this GID is already exists. + * (b) Find a free space. + * (c) Write the new GID + * + * Delete requires different set of operations: + * (a) Find the GID + * (b) Delete it. + * + **/ + /* Any writer to data_vec must hold this lock and the write side of + * rwlock. Readers must hold only rwlock. All writers must be in a + * sleepable context. + */ + struct mutex lock; + /* rwlock protects data_vec[ix]->state and entry pointer. + */ + rwlock_t rwlock; + struct ib_gid_table_entry **data_vec; + /* bit field, each bit indicates the index of default GID */ + u32 default_gid_indices; +}; + +static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port) +{ + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event_clients(&event); +} + +static const char * const gid_type_str[] = { + /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for + * user space compatibility reasons. + */ + [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + +/** rdma_is_zero_gid - Check if given GID is zero or not. + * @gid: GID to check + * Returns true if given GID is zero, returns false otherwise. + */ +bool rdma_is_zero_gid(const union ib_gid *gid) +{ + return !memcmp(gid, &zgid, sizeof(*gid)); +} +EXPORT_SYMBOL(rdma_is_zero_gid); + +/** is_gid_index_default - Check if a given index belongs to + * reserved default GIDs or not. 
+ * @table: GID table pointer + * @index: Index to check in GID table + * Returns true if index is one of the reserved default GID index otherwise + * returns false. + */ +static bool is_gid_index_default(const struct ib_gid_table *table, + unsigned int index) +{ + return index < 32 && (BIT(index) & table->default_gid_indices); +} + +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + +static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port) +{ + return device->port_data[port].cache.gid; +} + +static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) +{ + return !entry; +} + +static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry) +{ + return entry && entry->state == GID_TABLE_ENTRY_VALID; +} + +static void schedule_free_gid(struct kref *kref) +{ + struct ib_gid_table_entry *entry = + container_of(kref, struct ib_gid_table_entry, kref); + + queue_work(ib_wq, &entry->del_work); +} + +static void put_gid_ndev(struct rcu_head *head) +{ + struct roce_gid_ndev_storage *storage = + container_of(head, struct roce_gid_ndev_storage, rcu_head); + + WARN_ON(!storage->ndev); + /* At this point its safe to release netdev reference, + * as all callers working on gid_attr->ndev are done + * using this netdev. + */ + dev_put(storage->ndev); + kfree(storage); +} + +static void free_gid_entry_locked(struct ib_gid_table_entry *entry) +{ + struct ib_device *device = entry->attr.device; + u32 port_num = entry->attr.port_num; + struct ib_gid_table *table = rdma_gid_table(device, port_num); + + dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__, + port_num, entry->attr.index, entry->attr.gid.raw); + + write_lock_irq(&table->rwlock); + + /* + * The only way to avoid overwriting NULL in table is + * by comparing if it is same entry in table or not! + * If new entry in table is added by the time we free here, + * don't overwrite the table entry. + */ + if (entry == table->data_vec[entry->attr.index]) + table->data_vec[entry->attr.index] = NULL; + /* Now this index is ready to be allocated */ + write_unlock_irq(&table->rwlock); + + if (entry->ndev_storage) + call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev); + kfree(entry); +} + +static void free_gid_entry(struct kref *kref) +{ + struct ib_gid_table_entry *entry = + container_of(kref, struct ib_gid_table_entry, kref); + + free_gid_entry_locked(entry); +} + +/** + * free_gid_work - Release reference to the GID entry + * @work: Work structure to refer to GID entry which needs to be + * deleted. + * + * free_gid_work() frees the entry from the HCA's hardware table + * if provider supports it. It releases reference to netdevice. 
+ */ +static void free_gid_work(struct work_struct *work) +{ + struct ib_gid_table_entry *entry = + container_of(work, struct ib_gid_table_entry, del_work); + struct ib_device *device = entry->attr.device; + u32 port_num = entry->attr.port_num; + struct ib_gid_table *table = rdma_gid_table(device, port_num); + + mutex_lock(&table->lock); + free_gid_entry_locked(entry); + mutex_unlock(&table->lock); +} + +static struct ib_gid_table_entry * +alloc_gid_entry(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry; + struct net_device *ndev; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + ndev = rcu_dereference_protected(attr->ndev, 1); + if (ndev) { + entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), + GFP_KERNEL); + if (!entry->ndev_storage) { + kfree(entry); + return NULL; + } + dev_hold(ndev); + entry->ndev_storage->ndev = ndev; + } + kref_init(&entry->kref); + memcpy(&entry->attr, attr, sizeof(*attr)); + INIT_WORK(&entry->del_work, free_gid_work); + entry->state = GID_TABLE_ENTRY_INVALID; + return entry; +} + +static void store_gid_entry(struct ib_gid_table *table, + struct ib_gid_table_entry *entry) +{ + entry->state = GID_TABLE_ENTRY_VALID; + + dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n", + __func__, entry->attr.port_num, entry->attr.index, + entry->attr.gid.raw); + + lockdep_assert_held(&table->lock); + write_lock_irq(&table->rwlock); + table->data_vec[entry->attr.index] = entry; + write_unlock_irq(&table->rwlock); +} + +static void get_gid_entry(struct ib_gid_table_entry *entry) +{ + kref_get(&entry->kref); +} + +static void put_gid_entry(struct ib_gid_table_entry *entry) +{ + kref_put(&entry->kref, schedule_free_gid); +} + +static void put_gid_entry_locked(struct ib_gid_table_entry *entry) +{ + kref_put(&entry->kref, free_gid_entry); +} + +static int add_roce_gid(struct ib_gid_table_entry *entry) +{ + const struct ib_gid_attr *attr = &entry->attr; + int ret; + + if (!attr->ndev) { + dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n", + __func__, attr->port_num, attr->index); + return -EINVAL; + } + if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { + ret = attr->device->ops.add_gid(attr, &entry->context); + if (ret) { + dev_err(&attr->device->dev, + "%s GID add failed port=%u index=%u\n", + __func__, attr->port_num, attr->index); + return ret; + } + } + return 0; +} + +/** + * del_gid - Delete GID table entry + * + * @ib_dev: IB device whose GID entry to be deleted + * @port: Port number of the IB device + * @table: GID table of the IB device for a port + * @ix: GID entry index to delete + * + */ +static void del_gid(struct ib_device *ib_dev, u32 port, + struct ib_gid_table *table, int ix) +{ + struct roce_gid_ndev_storage *ndev_storage; + struct ib_gid_table_entry *entry; + + lockdep_assert_held(&table->lock); + + dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port, + ix, table->data_vec[ix]->attr.gid.raw); + + write_lock_irq(&table->rwlock); + entry = table->data_vec[ix]; + entry->state = GID_TABLE_ENTRY_PENDING_DEL; + /* + * For non RoCE protocol, GID entry slot is ready to use. 
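+	 * For RoCE the slot is left populated in GID_TABLE_ENTRY_PENDING_DEL
+	 * state; it only becomes reusable after the last reference is dropped
+	 * and free_gid_entry_locked() clears it.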
+ */ + if (!rdma_protocol_roce(ib_dev, port)) + table->data_vec[ix] = NULL; + write_unlock_irq(&table->rwlock); + + ndev_storage = entry->ndev_storage; + if (ndev_storage) { + entry->ndev_storage = NULL; + rcu_assign_pointer(entry->attr.ndev, NULL); + call_rcu(&ndev_storage->rcu_head, put_gid_ndev); + } + + if (rdma_cap_roce_gid_table(ib_dev, port)) + ib_dev->ops.del_gid(&entry->attr, &entry->context); + + put_gid_entry_locked(entry); +} + +/** + * add_modify_gid - Add or modify GID table entry + * + * @table: GID table in which GID to be added or modified + * @attr: Attributes of the GID + * + * Returns 0 on success or appropriate error code. It accepts zero + * GID addition for non RoCE ports for HCA's who report them as valid + * GID. However such zero GIDs are not added to the cache. + */ +static int add_modify_gid(struct ib_gid_table *table, + const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry; + int ret = 0; + + /* + * Invalidate any old entry in the table to make it safe to write to + * this index. + */ + if (is_gid_entry_valid(table->data_vec[attr->index])) + del_gid(attr->device, attr->port_num, table, attr->index); + + /* + * Some HCA's report multiple GID entries with only one valid GID, and + * leave other unused entries as the zero GID. Convert zero GIDs to + * empty table entries instead of storing them. + */ + if (rdma_is_zero_gid(&attr->gid)) + return 0; + + entry = alloc_gid_entry(attr); + if (!entry) + return -ENOMEM; + + if (rdma_protocol_roce(attr->device, attr->port_num)) { + ret = add_roce_gid(entry); + if (ret) + goto done; + } + + store_gid_entry(table, entry); + return 0; + +done: + put_gid_entry(entry); + return ret; +} + +/* rwlock should be read locked, or lock should be held */ +static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, + const struct ib_gid_attr *val, bool default_gid, + unsigned long mask, int *pempty) +{ + int i = 0; + int found = -1; + int empty = pempty ? -1 : 0; + + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = table->data_vec[i]; + struct ib_gid_attr *attr; + int curr_index = i; + + i++; + + /* find_gid() is used during GID addition where it is expected + * to return a free entry slot which is not duplicate. + * Free entry slot is requested and returned if pempty is set, + * so lookup free slot only if requested. + */ + if (pempty && empty < 0) { + if (is_gid_entry_free(data) && + default_gid == + is_gid_index_default(table, curr_index)) { + /* + * Found an invalid (free) entry; allocate it. + * If default GID is requested, then our + * found slot must be one of the DEFAULT + * reserved slots or we fail. + * This ensures that only DEFAULT reserved + * slots are used for default property GIDs. + */ + empty = curr_index; + } + } + + /* + * Additionally find_gid() is used to find valid entry during + * lookup operation; so ignore the entries which are marked as + * pending for removal and the entries which are marked as + * invalid. 
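+		 * Which attributes must match is chosen by the caller's mask;
+		 * for example ib_cache_gid_add() passes GID_ATTR_FIND_MASK_GID |
+		 * GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV, so an
+		 * otherwise identical entry on a different netdev is not
+		 * reported as a duplicate.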
+ */ + if (!is_gid_entry_valid(data)) + continue; + + if (found >= 0) + continue; + + attr = &data->attr; + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID && + memcmp(gid, &data->attr.gid, sizeof(*gid))) + continue; + + if (mask & GID_ATTR_FIND_MASK_NETDEV && + attr->ndev != val->ndev) + continue; + + if (mask & GID_ATTR_FIND_MASK_DEFAULT && + is_gid_index_default(table, curr_index) != default_gid) + continue; + + found = curr_index; + } + + if (pempty) + *pempty = empty; + + return found; +} + +static void make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + addrconf_ifid_eui48(&gid->raw[8], dev); +} + +static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, + union ib_gid *gid, struct ib_gid_attr *attr, + unsigned long mask, bool default_gid) +{ + struct ib_gid_table *table; + int ret = 0; + int empty; + int ix; + + /* Do not allow adding zero GID in support of + * IB spec version 1.3 section 4.1.1 point (6) and + * section 12.7.10 and section 12.7.20 + */ + if (rdma_is_zero_gid(gid)) + return -EINVAL; + + table = rdma_gid_table(ib_dev, port); + + mutex_lock(&table->lock); + + ix = find_gid(table, gid, attr, default_gid, mask, &empty); + if (ix >= 0) + goto out_unlock; + + if (empty < 0) { + ret = -ENOSPC; + goto out_unlock; + } + attr->device = ib_dev; + attr->index = empty; + attr->port_num = port; + attr->gid = *gid; + ret = add_modify_gid(table, attr); + if (!ret) + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + mutex_unlock(&table->lock); + if (ret) + pr_warn("%s: unable to add gid %pI6 error=%d\n", + __func__, gid->raw, ret); + return ret; +} + +int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV; + + return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); +} + +static int +_ib_cache_gid_del(struct ib_device *ib_dev, u32 port, + union ib_gid *gid, struct ib_gid_attr *attr, + unsigned long mask, bool default_gid) +{ + struct ib_gid_table *table; + int ret = 0; + int ix; + + table = rdma_gid_table(ib_dev, port); + + mutex_lock(&table->lock); + + ix = find_gid(table, gid, attr, default_gid, mask, NULL); + if (ix < 0) { + ret = -EINVAL; + goto out_unlock; + } + + del_gid(ib_dev, port, table, ix); + dispatch_gid_change_event(ib_dev, port); + +out_unlock: + mutex_unlock(&table->lock); + if (ret) + pr_debug("%s: can't delete gid %pI6 error=%d\n", + __func__, gid->raw, ret); + return ret; +} + +int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT | + GID_ATTR_FIND_MASK_NETDEV; + + return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); +} + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, + struct net_device *ndev) +{ + struct ib_gid_table *table; + int ix; + bool deleted = false; + + table = rdma_gid_table(ib_dev, port); + + mutex_lock(&table->lock); + + for (ix = 0; ix < table->sz; ix++) { + if (is_gid_entry_valid(table->data_vec[ix]) && + table->data_vec[ix]->attr.ndev == ndev) { + del_gid(ib_dev, port, table, ix); + deleted = true; + } + } + + mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + + return 0; +} + 
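+/*
+ * Editor's illustrative sketch; not part of the imported OFED source.  It
+ * shows how the helpers above are expected to be driven by a RoCE GID
+ * management caller (e.g. roce_gid_mgmt.c): the ib_dev, port, ndev and gid
+ * arguments here are hypothetical and would normally be derived from
+ * netdevice/inet address events.
+ */
+static __maybe_unused void example_roce_gid_lifecycle(struct ib_device *ib_dev,
+						       u32 port,
+						       struct net_device *ndev,
+						       union ib_gid *gid)
+{
+	struct ib_gid_attr attr = {
+		/* "RoCE v2" entry, see gid_type_str[] above */
+		.gid_type = IB_GID_TYPE_ROCE_UDP_ENCAP,
+		.ndev = ndev,
+	};
+
+	/*
+	 * Adding the same (gid, gid_type, ndev) tuple twice is harmless:
+	 * find_gid() detects the duplicate and __ib_cache_gid_add() returns 0
+	 * without consuming another slot.
+	 */
+	if (ib_cache_gid_add(ib_dev, port, gid, &attr))
+		return;
+
+	/* On netdev unregister every GID referencing it is dropped again. */
+	ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev);
+}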
+/** + * rdma_find_gid_by_port - Returns the GID entry attributes when it finds + * a valid GID entry for given search parameters. It searches for the specified + * GID value in the local software cache. + * @ib_dev: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @port: The port number of the device where the GID value should be searched. + * @ndev: In RoCE, the net device of the device. NULL means ignore. + * + * Returns sgid attributes if the GID is found with valid reference or + * returns ERR_PTR for the error. + * The caller must invoke rdma_put_gid_attr() to release the reference. + */ +const struct ib_gid_attr * +rdma_find_gid_by_port(struct ib_device *ib_dev, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u32 port, struct net_device *ndev) +{ + int local_index; + struct ib_gid_table *table; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + const struct ib_gid_attr *attr; + unsigned long flags; + + if (!rdma_is_port_valid(ib_dev, port)) + return ERR_PTR(-ENOENT); + + table = rdma_gid_table(ib_dev, port); + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, &val, false, mask, NULL); + if (local_index >= 0) { + get_gid_entry(table->data_vec[local_index]); + attr = &table->data_vec[local_index]->attr; + read_unlock_irqrestore(&table->rwlock, flags); + return attr; + } + + read_unlock_irqrestore(&table->rwlock, flags); + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(rdma_find_gid_by_port); + +/** + * rdma_find_gid_by_filter - Returns the GID table attribute where a + * specified GID value occurs + * @ib_dev: The device to query. + * @gid: The GID value to search for. + * @port: The port number of the device where the GID value could be + * searched. + * @filter: The filter function is executed on any matching GID in the table. + * If the filter function returns true, the corresponding index is returned, + * otherwise, we continue searching the GID table. It's guaranteed that + * while filter is executed, ndev field is valid and the structure won't + * change. filter is executed in an atomic context. filter must not be NULL. + * @context: Private data to pass into the call-back. + * + * rdma_find_gid_by_filter() searches for the specified GID value + * of which the filter function returns true in the port's GID table. 
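+ *
+ * For example (illustrative only, not code from this file), a caller that
+ * only accepts RoCE v2 entries could use:
+ *
+ *	static bool only_roce_v2(const union ib_gid *gid,
+ *				 const struct ib_gid_attr *attr, void *ctx)
+ *	{
+ *		return attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
+ *	}
+ *
+ *	attr = rdma_find_gid_by_filter(ib_dev, gid, port, only_roce_v2, NULL);
+ *	if (!IS_ERR(attr)) {
+ *		... use attr, then drop the reference ...
+ *		rdma_put_gid_attr(attr);
+ *	}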
+ * + */ +const struct ib_gid_attr *rdma_find_gid_by_filter( + struct ib_device *ib_dev, const union ib_gid *gid, u32 port, + bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, + void *), + void *context) +{ + const struct ib_gid_attr *res = ERR_PTR(-ENOENT); + struct ib_gid_table *table; + unsigned long flags; + unsigned int i; + + if (!rdma_is_port_valid(ib_dev, port)) + return ERR_PTR(-EINVAL); + + table = rdma_gid_table(ib_dev, port); + + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + struct ib_gid_table_entry *entry = table->data_vec[i]; + + if (!is_gid_entry_valid(entry)) + continue; + + if (memcmp(gid, &entry->attr.gid, sizeof(*gid))) + continue; + + if (filter(gid, &entry->attr, context)) { + get_gid_entry(entry); + res = &entry->attr; + break; + } + } + read_unlock_irqrestore(&table->rwlock, flags); + return res; +} + +static struct ib_gid_table *alloc_gid_table(int sz) +{ + struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL); + + if (!table) + return NULL; + + table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); + if (!table->data_vec) + goto err_free_table; + + mutex_init(&table->lock); + + table->sz = sz; + rwlock_init(&table->rwlock); + return table; + +err_free_table: + kfree(table); + return NULL; +} + +static void release_gid_table(struct ib_device *device, + struct ib_gid_table *table) +{ + bool leak = false; + int i; + + if (!table) + return; + + for (i = 0; i < table->sz; i++) { + if (is_gid_entry_free(table->data_vec[i])) + continue; + if (kref_read(&table->data_vec[i]->kref) > 1) { + dev_err(&device->dev, + "GID entry ref leak for index %d ref=%u\n", i, + kref_read(&table->data_vec[i]->kref)); + leak = true; + } + } + if (leak) + return; + + mutex_destroy(&table->lock); + kfree(table->data_vec); + kfree(table); +} + +static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port, + struct ib_gid_table *table) +{ + int i; + + if (!table) + return; + + mutex_lock(&table->lock); + for (i = 0; i < table->sz; ++i) { + if (is_gid_entry_valid(table->data_vec[i])) + del_gid(ib_dev, port, table, i); + } + mutex_unlock(&table->lock); +} + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode) +{ + union ib_gid gid = { }; + struct ib_gid_attr gid_attr; + unsigned int gid_type; + unsigned long mask; + + mask = GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT | + GID_ATTR_FIND_MASK_NETDEV; + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + make_default_gid(ndev, &gid); + __ib_cache_gid_add(ib_dev, port, &gid, + &gid_attr, mask, true); + } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { + _ib_cache_gid_del(ib_dev, port, &gid, + &gid_attr, mask, true); + } + } +} + +static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port, + struct ib_gid_table *table) +{ + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + /* Reserve starting indices for default GIDs */ + for (i = 0; i < num_default_gids && i < table->sz; i++) + table->default_gid_indices |= BIT(i); +} + + +static void 
gid_table_release_one(struct ib_device *ib_dev) +{ + u32 p; + + rdma_for_each_port (ib_dev, p) { + release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); + ib_dev->port_data[p].cache.gid = NULL; + } +} + +static int _gid_table_setup_one(struct ib_device *ib_dev) +{ + struct ib_gid_table *table; + u32 rdma_port; + + rdma_for_each_port (ib_dev, rdma_port) { + table = alloc_gid_table( + ib_dev->port_data[rdma_port].immutable.gid_tbl_len); + if (!table) + goto rollback_table_setup; + + gid_table_reserve_default(ib_dev, rdma_port, table); + ib_dev->port_data[rdma_port].cache.gid = table; + } + return 0; + +rollback_table_setup: + gid_table_release_one(ib_dev); + return -ENOMEM; +} + +static void gid_table_cleanup_one(struct ib_device *ib_dev) +{ + u32 p; + + rdma_for_each_port (ib_dev, p) + cleanup_gid_table_port(ib_dev, p, + ib_dev->port_data[p].cache.gid); +} + +static int gid_table_setup_one(struct ib_device *ib_dev) +{ + int err; + + err = _gid_table_setup_one(ib_dev); + + if (err) + return err; + + rdma_roce_rescan_device(ib_dev); + + return err; +} + +/** + * rdma_query_gid - Read the GID content from the GID software cache + * @device: Device to query the GID + * @port_num: Port number of the device + * @index: Index of the GID table entry to read + * @gid: Pointer to GID where to store the entry's GID + * + * rdma_query_gid() only reads the GID entry content for requested device, + * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't + * hold any reference to the GID table entry in the HCA or software cache. + * + * Returns 0 on success or appropriate error code. + * + */ +int rdma_query_gid(struct ib_device *device, u32 port_num, + int index, union ib_gid *gid) +{ + struct ib_gid_table *table; + unsigned long flags; + int res; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + table = rdma_gid_table(device, port_num); + read_lock_irqsave(&table->rwlock, flags); + + if (index < 0 || index >= table->sz) { + res = -EINVAL; + goto done; + } + + if (!is_gid_entry_valid(table->data_vec[index])) { + res = -ENOENT; + goto done; + } + + memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); + res = 0; + +done: + read_unlock_irqrestore(&table->rwlock, flags); + return res; +} +EXPORT_SYMBOL(rdma_query_gid); + +/** + * rdma_read_gid_hw_context - Read the HW GID context from GID attribute + * @attr: Potinter to the GID attribute + * + * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding + * to the SGID attr. Callers are required to already be holding the reference + * to an existing GID entry. + * + * Returns the HW GID context + * + */ +void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr) +{ + return container_of(attr, struct ib_gid_table_entry, attr)->context; +} +EXPORT_SYMBOL(rdma_read_gid_hw_context); + +/** + * rdma_find_gid - Returns SGID attributes if the matching GID is found. + * @device: The device to query. + * @gid: The GID value to search for. + * @gid_type: The GID type to search for. + * @ndev: In RoCE, the net device of the device. NULL means ignore. + * + * rdma_find_gid() searches for the specified GID value in the software cache. + * + * Returns GID attributes if a valid GID is found or returns ERR_PTR for the + * error. The caller must invoke rdma_put_gid_attr() to release the reference. 
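+ *
+ * A minimal lookup sketch (illustrative only):
+ *
+ *	attr = rdma_find_gid(device, &gid, IB_GID_TYPE_ROCE_UDP_ENCAP, NULL);
+ *	if (IS_ERR(attr))
+ *		return PTR_ERR(attr);
+ *	... use attr->port_num / attr->index ...
+ *	rdma_put_gid_attr(attr);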
+ * + */ +const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; + u32 p; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + rdma_for_each_port(device, p) { + struct ib_gid_table *table; + unsigned long flags; + int index; + + table = device->port_data[p].cache.gid; + read_lock_irqsave(&table->rwlock, flags); + index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); + if (index >= 0) { + const struct ib_gid_attr *attr; + + get_gid_entry(table->data_vec[index]); + attr = &table->data_vec[index]->attr; + read_unlock_irqrestore(&table->rwlock, flags); + return attr; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(rdma_find_gid); + +int ib_get_cached_pkey(struct ib_device *device, + u32 port_num, + int index, + u16 *pkey) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int ret = 0; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache_lock, flags); + + cache = device->port_data[port_num].cache.pkey; + + if (!cache || index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *pkey = cache->table[index]; + + read_unlock_irqrestore(&device->cache_lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_pkey); + +void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, + u64 *sn_pfx) +{ + unsigned long flags; + + read_lock_irqsave(&device->cache_lock, flags); + *sn_pfx = device->port_data[port_num].cache.subnet_prefix; + read_unlock_irqrestore(&device->cache_lock, flags); +} +EXPORT_SYMBOL(ib_get_cached_subnet_prefix); + +int ib_find_cached_pkey(struct ib_device *device, u32 port_num, + u16 pkey, u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + int partial_ix = -1; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache_lock, flags); + + cache = device->port_data[port_num].cache.pkey; + if (!cache) { + ret = -EINVAL; + goto err; + } + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + if (cache->table[i] & 0x8000) { + *index = i; + ret = 0; + break; + } else { + partial_ix = i; + } + } + + if (ret && partial_ix >= 0) { + *index = partial_ix; + ret = 0; + } + +err: + read_unlock_irqrestore(&device->cache_lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_cached_pkey); + +int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, + u16 pkey, u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache_lock, flags); + + cache = device->port_data[port_num].cache.pkey; + if (!cache) { + ret = -EINVAL; + goto err; + } + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if (cache->table[i] == pkey) { + *index = i; + ret = 0; + break; + } + +err: + read_unlock_irqrestore(&device->cache_lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_exact_cached_pkey); + +int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) +{ + unsigned long flags; + int ret = 0; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache_lock, 
flags); + *lmc = device->port_data[port_num].cache.lmc; + read_unlock_irqrestore(&device->cache_lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_lmc); + +int ib_get_cached_port_state(struct ib_device *device, u32 port_num, + enum ib_port_state *port_state) +{ + unsigned long flags; + int ret = 0; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + read_lock_irqsave(&device->cache_lock, flags); + *port_state = device->port_data[port_num].cache.port_state; + read_unlock_irqrestore(&device->cache_lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_port_state); + +/** + * rdma_get_gid_attr - Returns GID attributes for a port of a device + * at a requested gid_index, if a valid GID entry exists. + * @device: The device to query. + * @port_num: The port number on the device where the GID value + * is to be queried. + * @index: Index of the GID table entry whose attributes are to + * be queried. + * + * rdma_get_gid_attr() acquires reference count of gid attributes from the + * cached GID table. Caller must invoke rdma_put_gid_attr() to release + * reference to gid attribute regardless of link layer. + * + * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error + * code. + */ +const struct ib_gid_attr * +rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index) +{ + const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); + struct ib_gid_table *table; + unsigned long flags; + + if (!rdma_is_port_valid(device, port_num)) + return ERR_PTR(-EINVAL); + + table = rdma_gid_table(device, port_num); + if (index < 0 || index >= table->sz) + return ERR_PTR(-EINVAL); + + read_lock_irqsave(&table->rwlock, flags); + if (!is_gid_entry_valid(table->data_vec[index])) + goto done; + + get_gid_entry(table->data_vec[index]); + attr = &table->data_vec[index]->attr; +done: + read_unlock_irqrestore(&table->rwlock, flags); + return attr; +} +EXPORT_SYMBOL(rdma_get_gid_attr); + +/** + * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. + * @device: The device to query. + * @entries: Entries where GID entries are returned. + * @max_entries: Maximum number of entries that can be returned. + * Entries array must be allocated to hold max_entries number of entries. + * + * Returns number of entries on success or appropriate error code. 
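+ *
+ * A possible calling pattern (illustrative only; sizing the array is the
+ * caller's responsibility):
+ *
+ *	entries = kcalloc(max_entries, sizeof(*entries), GFP_KERNEL);
+ *	if (!entries)
+ *		return -ENOMEM;
+ *	num = rdma_query_gid_table(device, entries, max_entries);
+ *	if (num < 0)
+ *		... -EINVAL means the array was too small for all valid GIDs ...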
+ */ +ssize_t rdma_query_gid_table(struct ib_device *device, + struct ib_uverbs_gid_entry *entries, + size_t max_entries) +{ + const struct ib_gid_attr *gid_attr; + ssize_t num_entries = 0, ret; + struct ib_gid_table *table; + u32 port_num, i; + struct net_device *ndev; + unsigned long flags; + + rdma_for_each_port(device, port_num) { + table = rdma_gid_table(device, port_num); + read_lock_irqsave(&table->rwlock, flags); + for (i = 0; i < table->sz; i++) { + if (!is_gid_entry_valid(table->data_vec[i])) + continue; + if (num_entries >= max_entries) { + ret = -EINVAL; + goto err; + } + + gid_attr = &table->data_vec[i]->attr; + + memcpy(&entries->gid, &gid_attr->gid, + sizeof(gid_attr->gid)); + entries->gid_index = gid_attr->index; + entries->port_num = gid_attr->port_num; + entries->gid_type = gid_attr->gid_type; + ndev = rcu_dereference_protected( + gid_attr->ndev, + lockdep_is_held(&table->rwlock)); + if (ndev) + entries->netdev_ifindex = ndev->ifindex; + + num_entries++; + entries++; + } + read_unlock_irqrestore(&table->rwlock, flags); + } + + return num_entries; +err: + read_unlock_irqrestore(&table->rwlock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_query_gid_table); + +/** + * rdma_put_gid_attr - Release reference to the GID attribute + * @attr: Pointer to the GID attribute whose reference + * needs to be released. + * + * rdma_put_gid_attr() must be used to release reference whose + * reference is acquired using rdma_get_gid_attr() or any APIs + * which returns a pointer to the ib_gid_attr regardless of link layer + * of IB or RoCE. + * + */ +void rdma_put_gid_attr(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + + put_gid_entry(entry); +} +EXPORT_SYMBOL(rdma_put_gid_attr); + +/** + * rdma_hold_gid_attr - Get reference to existing GID attribute + * + * @attr: Pointer to the GID attribute whose reference + * needs to be taken. + * + * Increase the reference count to a GID attribute to keep it from being + * freed. Callers are required to already be holding a reference to attribute. + * + */ +void rdma_hold_gid_attr(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + + get_gid_entry(entry); +} +EXPORT_SYMBOL(rdma_hold_gid_attr); + +/** + * rdma_is_gid_attr_valid - Check if referenced GID attribute is valid or not + * + * @attr: Pointer to the GID attribute + * + * Returns true if the GID attribute is valid, or false otherwise. + * + */ +bool rdma_is_gid_attr_valid(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + struct ib_device *device = entry->attr.device; + u32 port_num = entry->attr.port_num; + struct ib_gid_table *table; + unsigned long flags; + bool valid; + + table = rdma_gid_table(device, port_num); + + read_lock_irqsave(&table->rwlock, flags); + valid = is_gid_entry_valid(table->data_vec[attr->index]); + read_unlock_irqrestore(&table->rwlock, flags); + return valid; +} + +/** + * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice + * which must be in UP state. + * + * @attr:Pointer to the GID attribute + * + * Returns pointer to netdevice if the netdevice was attached to GID and + * netdevice is in UP state. Caller must hold RCU lock as this API + * reads the netdev flags which can change while netdevice migrates to + * different net namespace. Returns ERR_PTR with error code otherwise. 
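+ *
+ * Typical usage (illustrative only):
+ *
+ *	rcu_read_lock();
+ *	ndev = rdma_read_gid_attr_ndev_rcu(attr);
+ *	if (!IS_ERR(ndev))
+ *		... use ndev under RCU only ...
+ *	rcu_read_unlock();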
+ * + */ +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + struct ib_device *device = entry->attr.device; + struct net_device *ndev = ERR_PTR(-EINVAL); + u32 port_num = entry->attr.port_num; + struct ib_gid_table *table; + unsigned long flags; + bool valid; + + table = rdma_gid_table(device, port_num); + + read_lock_irqsave(&table->rwlock, flags); + valid = is_gid_entry_valid(table->data_vec[attr->index]); + if (valid) { + ndev = rcu_dereference(attr->ndev); + if (!ndev) + ndev = ERR_PTR(-ENODEV); + } + read_unlock_irqrestore(&table->rwlock, flags); + return ndev; +} +EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); + +static int get_lower_dev_vlan(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + u16 *vlan_id = (u16 *)priv->data; + + if (is_vlan_dev(lower_dev)) + *vlan_id = vlan_dev_vlan_id(lower_dev); + + /* We are interested only in first level vlan device, so + * always return 1 to stop iterating over next level devices. + */ + return 1; +} + +/** + * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address + * of a GID entry. + * + * @attr: GID attribute pointer whose L2 fields to be read + * @vlan_id: Pointer to vlan id to fill up if the GID entry has + * vlan id. It is optional. + * @smac: Pointer to smac to fill up for a GID entry. It is optional. + * + * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id + * (if gid entry has vlan) and source MAC, or returns error. + */ +int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, + u16 *vlan_id, u8 *smac) +{ + struct netdev_nested_priv priv = { + .data = (void *)vlan_id, + }; + struct net_device *ndev; + + rcu_read_lock(); + ndev = rcu_dereference(attr->ndev); + if (!ndev) { + rcu_read_unlock(); + return -ENODEV; + } + if (smac) + ether_addr_copy(smac, ndev->dev_addr); + if (vlan_id) { + *vlan_id = 0xffff; + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + } else { + /* If the netdev is upper device and if it's lower + * device is vlan device, consider vlan id of the + * the lower vlan device for this gid entry. 
+ */ + netdev_walk_all_lower_dev_rcu(attr->ndev, + get_lower_dev_vlan, &priv); + } + } + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(rdma_read_gid_l2_fields); + +static int config_non_roce_gid_cache(struct ib_device *device, + u32 port, struct ib_port_attr *tprops) +{ + struct ib_gid_attr gid_attr = {}; + struct ib_gid_table *table; + int ret = 0; + int i; + + gid_attr.device = device; + gid_attr.port_num = port; + table = rdma_gid_table(device, port); + + mutex_lock(&table->lock); + for (i = 0; i < tprops->gid_tbl_len; ++i) { + if (!device->ops.query_gid) + continue; + ret = device->ops.query_gid(device, port, i, &gid_attr.gid); + if (ret) { + dev_warn(&device->dev, + "query_gid failed (%d) for index %d\n", ret, + i); + goto err; + } + gid_attr.index = i; + tprops->subnet_prefix = + be64_to_cpu(gid_attr.gid.global.subnet_prefix); + add_modify_gid(table, &gid_attr); + } +err: + mutex_unlock(&table->lock); + return ret; +} + +static int +ib_cache_update(struct ib_device *device, u32 port, bool update_gids, + bool update_pkeys, bool enforce_security) +{ + struct ib_port_attr *tprops = NULL; + struct ib_pkey_cache *pkey_cache = NULL; + struct ib_pkey_cache *old_pkey_cache = NULL; + int i; + int ret; + + if (!rdma_is_port_valid(device, port)) + return -EINVAL; + + tprops = kmalloc(sizeof *tprops, GFP_KERNEL); + if (!tprops) + return -ENOMEM; + + ret = ib_query_port(device, port, tprops); + if (ret) { + dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); + goto err; + } + + if (!rdma_protocol_roce(device, port) && update_gids) { + ret = config_non_roce_gid_cache(device, port, + tprops); + if (ret) + goto err; + } + + update_pkeys &= !!tprops->pkey_tbl_len; + + if (update_pkeys) { + pkey_cache = kmalloc(struct_size(pkey_cache, table, + tprops->pkey_tbl_len), + GFP_KERNEL); + if (!pkey_cache) { + ret = -ENOMEM; + goto err; + } + + pkey_cache->table_len = tprops->pkey_tbl_len; + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, + pkey_cache->table + i); + if (ret) { + dev_warn(&device->dev, + "ib_query_pkey failed (%d) for index %d\n", + ret, i); + goto err; + } + } + } + + write_lock_irq(&device->cache_lock); + + if (update_pkeys) { + old_pkey_cache = device->port_data[port].cache.pkey; + device->port_data[port].cache.pkey = pkey_cache; + } + device->port_data[port].cache.lmc = tprops->lmc; + device->port_data[port].cache.port_state = tprops->state; + + device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; + write_unlock_irq(&device->cache_lock); + + if (enforce_security) + ib_security_cache_change(device, + port, + tprops->subnet_prefix); + + kfree(old_pkey_cache); + kfree(tprops); + return 0; + +err: + kfree(pkey_cache); + kfree(tprops); + return ret; +} + +static void ib_cache_event_task(struct work_struct *_work) +{ + struct ib_update_work *work = + container_of(_work, struct ib_update_work, work); + int ret; + + /* Before distributing the cache update event, first sync + * the cache. + */ + ret = ib_cache_update(work->event.device, work->event.element.port_num, + work->event.event == IB_EVENT_GID_CHANGE, + work->event.event == IB_EVENT_PKEY_CHANGE, + work->enforce_security); + + /* GID event is notified already for individual GID entries by + * dispatch_gid_change_event(). Hence, notifiy for rest of the + * events. 
+ */ + if (!ret && work->event.event != IB_EVENT_GID_CHANGE) + ib_dispatch_event_clients(&work->event); + + kfree(work); +} + +static void ib_generic_event_task(struct work_struct *_work) +{ + struct ib_update_work *work = + container_of(_work, struct ib_update_work, work); + + ib_dispatch_event_clients(&work->event); + kfree(work); +} + +static bool is_cache_update_event(const struct ib_event *event) +{ + return (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_GID_CHANGE); +} + +/** + * ib_dispatch_event - Dispatch an asynchronous event + * @event:Event to dispatch + * + * Low-level drivers must call ib_dispatch_event() to dispatch the + * event to all registered event handlers when an asynchronous event + * occurs. + */ +void ib_dispatch_event(const struct ib_event *event) +{ + struct ib_update_work *work; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + if (is_cache_update_event(event)) + INIT_WORK(&work->work, ib_cache_event_task); + else + INIT_WORK(&work->work, ib_generic_event_task); + + work->event = *event; + if (event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_GID_CHANGE) + work->enforce_security = true; + + queue_work(ib_wq, &work->work); +} +EXPORT_SYMBOL(ib_dispatch_event); + +int ib_cache_setup_one(struct ib_device *device) +{ + u32 p; + int err; + + err = gid_table_setup_one(device); + if (err) + return err; + + rdma_for_each_port (device, p) { + err = ib_cache_update(device, p, true, true, true); + if (err) + return err; + } + + return 0; +} + +void ib_cache_release_one(struct ib_device *device) +{ + u32 p; + + /* + * The release function frees all the cache elements. + * This function should be called as part of freeing + * all the device's resources when the cache could no + * longer be accessed. + */ + rdma_for_each_port (device, p) + kfree(device->port_data[p].cache.pkey); + + gid_table_release_one(device); +} + +void ib_cache_cleanup_one(struct ib_device *device) +{ + /* The cleanup function waits for all in-progress workqueue + * elements and cleans up the GID cache. This function should be + * called after the device was removed from the devices list and + * all clients were removed, so the cache exists but is + * non-functional and shouldn't be updated anymore. + */ + flush_workqueue(ib_wq); + gid_table_cleanup_one(device); + + /* + * Flush the wq second time for any pending GID delete work. + */ + flush_workqueue(ib_wq); +} +/** + * rdma_check_gid_user_access - Check if user process can access + * this GID entry or not. + * @attr: Pointer to GID entry attribute + * + * rdma_check_gid_user_access() returns true if user process can access + * this GID attribute otherwise returns false. This API should be called + * from the userspace process context. + */ +bool rdma_check_gid_user_access(const struct ib_gid_attr *attr) +{ + bool allow; + /* + * For IB and iWarp, there is no netdevice associate with GID entry, + * For RoCE consider the netdevice's net ns to validate against the + * calling process. 
+ */ + rcu_read_lock(); + if (!attr->ndev || + (attr->ndev && + net_eq(dev_net(attr->ndev), current->nsproxy->net_ns))) + allow = true; + else + allow = false; + rcu_read_unlock(); + return allow; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cgroup.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cgroup.c new file mode 100644 index 0000000..1f037fe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cgroup.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Parav Pandit + */ + +#include "core_priv.h" + +/** + * ib_device_register_rdmacg - register with rdma cgroup. + * @device: device to register to participate in resource + * accounting by rdma cgroup. + * + * Register with the rdma cgroup. Should be called before + * exposing rdma device to user space applications to avoid + * resource accounting leak. + */ +void ib_device_register_rdmacg(struct ib_device *device) +{ + device->cg_device.name = device->name; + rdmacg_register_device(&device->cg_device); +} + +/** + * ib_device_unregister_rdmacg - unregister with rdma cgroup. + * @device: device to unregister. + * + * Unregister with the rdma cgroup. Should be called after + * all the resources are deallocated, and after a stage when any + * other resource allocation by user application cannot be done + * for this device to avoid any leak in accounting. + */ +void ib_device_unregister_rdmacg(struct ib_device *device) +{ + rdmacg_unregister_device(&device->cg_device); +} + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + return rdmacg_try_charge(&cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_try_charge); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + rdmacg_uncharge(cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_uncharge); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm.c new file mode 100644 index 0000000..1acc09b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm.c @@ -0,0 +1,4548 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2004-2007 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "cm_msgs.h" +#include "core_priv.h" +#include "cm_trace.h" + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("InfiniBand CM"); +MODULE_LICENSE("Dual BSD/GPL"); + +static const char * const ibcm_rej_reason_strs[] = { + [IB_CM_REJ_NO_QP] = "no QP", + [IB_CM_REJ_NO_EEC] = "no EEC", + [IB_CM_REJ_NO_RESOURCES] = "no resources", + [IB_CM_REJ_TIMEOUT] = "timeout", + [IB_CM_REJ_UNSUPPORTED] = "unsupported", + [IB_CM_REJ_INVALID_COMM_ID] = "invalid comm ID", + [IB_CM_REJ_INVALID_COMM_INSTANCE] = "invalid comm instance", + [IB_CM_REJ_INVALID_SERVICE_ID] = "invalid service ID", + [IB_CM_REJ_INVALID_TRANSPORT_TYPE] = "invalid transport type", + [IB_CM_REJ_STALE_CONN] = "stale conn", + [IB_CM_REJ_RDC_NOT_EXIST] = "RDC not exist", + [IB_CM_REJ_INVALID_GID] = "invalid GID", + [IB_CM_REJ_INVALID_LID] = "invalid LID", + [IB_CM_REJ_INVALID_SL] = "invalid SL", + [IB_CM_REJ_INVALID_TRAFFIC_CLASS] = "invalid traffic class", + [IB_CM_REJ_INVALID_HOP_LIMIT] = "invalid hop limit", + [IB_CM_REJ_INVALID_PACKET_RATE] = "invalid packet rate", + [IB_CM_REJ_INVALID_ALT_GID] = "invalid alt GID", + [IB_CM_REJ_INVALID_ALT_LID] = "invalid alt LID", + [IB_CM_REJ_INVALID_ALT_SL] = "invalid alt SL", + [IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS] = "invalid alt traffic class", + [IB_CM_REJ_INVALID_ALT_HOP_LIMIT] = "invalid alt hop limit", + [IB_CM_REJ_INVALID_ALT_PACKET_RATE] = "invalid alt packet rate", + [IB_CM_REJ_PORT_CM_REDIRECT] = "port CM redirect", + [IB_CM_REJ_PORT_REDIRECT] = "port redirect", + [IB_CM_REJ_INVALID_MTU] = "invalid MTU", + [IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES] = "insufficient resp resources", + [IB_CM_REJ_CONSUMER_DEFINED] = "consumer defined", + [IB_CM_REJ_INVALID_RNR_RETRY] = "invalid RNR retry", + [IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID] = "duplicate local comm ID", + [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version", + [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label", + [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label", + [IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED] = + "vendor option is not supported", +}; + +const char *__attribute_const__ ibcm_reject_msg(int reason) +{ + size_t index = reason; + + if (index < ARRAY_SIZE(ibcm_rej_reason_strs) && + ibcm_rej_reason_strs[index]) + return ibcm_rej_reason_strs[index]; + else + return "unrecognized reason"; +} +EXPORT_SYMBOL(ibcm_reject_msg); + +struct cm_id_private; +struct cm_work; +static int cm_add_one(struct ib_device *device); +static void cm_remove_one(struct ib_device *device, void *client_data); +static void cm_process_work(struct cm_id_private *cm_id_priv, + struct cm_work *work); +static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param); +static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, + const void *private_data, u8 private_data_len); +static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len); +static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len); + +static struct ib_client cm_client = { + .name = "cm", + .add = cm_add_one, + .remove = cm_remove_one +}; + +static struct ib_cm { + spinlock_t lock; + struct list_head device_list; + rwlock_t device_lock; + struct rb_root listen_service_table; + u64 
listen_service_id; + /* struct rb_root peer_service_table; todo: fix peer to peer */ + struct rb_root remote_qp_table; + struct rb_root remote_id_table; + struct rb_root remote_sidr_table; + struct xarray local_id_table; + u32 local_id_next; + __be32 random_id_operand; + struct list_head timewait_list; + struct workqueue_struct *wq; +} cm; + +/* Counter indexes ordered by attribute ID */ +enum { + CM_REQ_COUNTER, + CM_MRA_COUNTER, + CM_REJ_COUNTER, + CM_REP_COUNTER, + CM_RTU_COUNTER, + CM_DREQ_COUNTER, + CM_DREP_COUNTER, + CM_SIDR_REQ_COUNTER, + CM_SIDR_REP_COUNTER, + CM_LAP_COUNTER, + CM_APR_COUNTER, + CM_ATTR_COUNT, + CM_ATTR_ID_OFFSET = 0x0010, +}; + +enum { + CM_XMIT, + CM_XMIT_RETRIES, + CM_RECV, + CM_RECV_DUPLICATES, + CM_COUNTER_GROUPS +}; + +struct cm_counter_attribute { + struct ib_port_attribute attr; + unsigned short group; + unsigned short index; +}; + +struct cm_port { + struct cm_device *cm_dev; + struct ib_mad_agent *mad_agent; + u32 port_num; + atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT]; +}; + +struct cm_device { + struct kref kref; + struct list_head list; + spinlock_t mad_agent_lock; + struct ib_device *ib_device; + u8 ack_delay; + int going_down; + struct cm_port *port[]; +}; + +struct cm_av { + struct cm_port *port; + struct rdma_ah_attr ah_attr; + u16 dlid_datapath; + u16 pkey_index; + u8 timeout; +}; + +struct cm_work { + struct delayed_work work; + struct list_head list; + struct cm_port *port; + struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */ + __be32 local_id; /* Established / timewait */ + __be32 remote_id; + struct ib_cm_event cm_event; + struct sa_path_rec path[]; +}; + +struct cm_timewait_info { + struct cm_work work; + struct list_head list; + struct rb_node remote_qp_node; + struct rb_node remote_id_node; + __be64 remote_ca_guid; + __be32 remote_qpn; + u8 inserted_remote_qp; + u8 inserted_remote_id; +}; + +struct cm_id_private { + struct ib_cm_id id; + + struct rb_node service_node; + struct rb_node sidr_id_node; + u32 sidr_slid; + spinlock_t lock; /* Do not acquire inside cm.lock */ + struct completion comp; + refcount_t refcount; + /* Number of clients sharing this ib_cm_id. Only valid for listeners. + * Protected by the cm.lock spinlock. 
+ */ + int listen_sharecount; + struct rcu_head rcu; + + struct ib_mad_send_buf *msg; + struct cm_timewait_info *timewait_info; + /* todo: use alternate port on send failure */ + struct cm_av av; + struct cm_av alt_av; + + void *private_data; + __be64 tid; + __be32 local_qpn; + __be32 remote_qpn; + enum ib_qp_type qp_type; + __be32 sq_psn; + __be32 rq_psn; + int timeout_ms; + enum ib_mtu path_mtu; + __be16 pkey; + u8 private_data_len; + u8 max_cm_retries; + u8 responder_resources; + u8 initiator_depth; + u8 retry_count; + u8 rnr_retry_count; + u8 service_timeout; + u8 target_ack_delay; + + struct list_head work_list; + atomic_t work_count; + + struct rdma_ucm_ece ece; +}; + +static void cm_dev_release(struct kref *kref) +{ + struct cm_device *cm_dev = container_of(kref, struct cm_device, kref); + u32 i; + + rdma_for_each_port(cm_dev->ib_device, i) + kfree(cm_dev->port[i - 1]); + + kfree(cm_dev); +} + +static void cm_device_put(struct cm_device *cm_dev) +{ + kref_put(&cm_dev->kref, cm_dev_release); +} + +static void cm_work_handler(struct work_struct *work); + +static inline void cm_deref_id(struct cm_id_private *cm_id_priv) +{ + if (refcount_dec_and_test(&cm_id_priv->refcount)) + complete(&cm_id_priv->comp); +} + +static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv) +{ + struct ib_mad_agent *mad_agent; + struct ib_mad_send_buf *m; + struct ib_ah *ah; + + lockdep_assert_held(&cm_id_priv->lock); + + if (!cm_id_priv->av.port) + return ERR_PTR(-EINVAL); + + spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + mad_agent = cm_id_priv->av.port->mad_agent; + if (!mad_agent) { + m = ERR_PTR(-EINVAL); + goto out; + } + + ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0); + if (IS_ERR(ah)) { + m = ERR_CAST(ah); + goto out; + } + + m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, + cm_id_priv->av.pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); + if (IS_ERR(m)) { + rdma_destroy_ah(ah, 0); + goto out; + } + + /* Timeout set by caller if response is expected. 
*/ + m->ah = ah; + m->retries = cm_id_priv->max_cm_retries; + + refcount_inc(&cm_id_priv->refcount); + m->context[0] = cm_id_priv; + +out: + spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + return m; +} + +static void cm_free_msg(struct ib_mad_send_buf *msg) +{ + struct cm_id_private *cm_id_priv = msg->context[0]; + + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + cm_deref_id(cm_id_priv); + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf * +cm_alloc_priv_msg(struct cm_id_private *cm_id_priv) +{ + struct ib_mad_send_buf *msg; + + lockdep_assert_held(&cm_id_priv->lock); + + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return msg; + cm_id_priv->msg = msg; + return msg; +} + +static void cm_free_priv_msg(struct ib_mad_send_buf *msg) +{ + struct cm_id_private *cm_id_priv = msg->context[0]; + + lockdep_assert_held(&cm_id_priv->lock); + + if (!WARN_ON(cm_id_priv->msg != msg)) + cm_id_priv->msg = NULL; + + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + cm_deref_id(cm_id_priv); + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index, + 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, + IB_MGMT_BASE_VERSION); +} + +static int cm_create_response_msg_ah(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf *msg) +{ + struct ib_ah *ah; + + ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, port->port_num); + if (IS_ERR(ah)) + return PTR_ERR(ah); + + msg->ah = ah; + return 0; +} + +static int cm_alloc_response_msg(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + struct ib_mad_send_buf **msg) +{ + struct ib_mad_send_buf *m; + int ret; + + m = cm_alloc_response_msg_no_ah(port, mad_recv_wc); + if (IS_ERR(m)) + return PTR_ERR(m); + + ret = cm_create_response_msg_ah(port, mad_recv_wc, m); + if (ret) { + ib_free_send_mad(m); + return ret; + } + + *msg = m; + return 0; +} + +static void cm_free_response_msg(struct ib_mad_send_buf *msg) +{ + if (msg->ah) + rdma_destroy_ah(msg->ah, 0); + ib_free_send_mad(msg); +} + +static void *cm_copy_private_data(const void *private_data, u8 private_data_len) +{ + void *data; + + if (!private_data || !private_data_len) + return NULL; + + data = kmemdup(private_data, private_data_len, GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + return data; +} + +static void cm_set_private_data(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len) +{ + if (cm_id_priv->private_data && cm_id_priv->private_data_len) + kfree(cm_id_priv->private_data); + + cm_id_priv->private_data = private_data; + cm_id_priv->private_data_len = private_data_len; +} + +static void cm_set_av_port(struct cm_av *av, struct cm_port *port) +{ + struct cm_port *old_port = av->port; + + if (old_port == port) + return; + + av->port = port; + if (old_port) + cm_device_put(old_port->cm_dev); + if (port) + kref_get(&port->cm_dev->kref); +} + +static void cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, + struct rdma_ah_attr *ah_attr, struct cm_av *av) +{ + cm_set_av_port(av, port); + av->pkey_index = wc->pkey_index; + rdma_move_ah_attr(&av->ah_attr, ah_attr); +} + +static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) +{ + cm_set_av_port(av, port); + av->pkey_index = wc->pkey_index; + return 
ib_init_ah_attr_from_wc(port->cm_dev->ib_device, + port->port_num, wc, + grh, &av->ah_attr); +} + +static struct cm_port * +get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) +{ + struct cm_device *cm_dev; + struct cm_port *port = NULL; + unsigned long flags; + + if (attr) { + read_lock_irqsave(&cm.device_lock, flags); + list_for_each_entry(cm_dev, &cm.device_list, list) { + if (cm_dev->ib_device == attr->device) { + port = cm_dev->port[attr->port_num - 1]; + break; + } + } + read_unlock_irqrestore(&cm.device_lock, flags); + } else { + /* SGID attribute can be NULL in following + * conditions. + * (a) Alternative path + * (b) IB link layer without GRH + * (c) LAP send messages + */ + read_lock_irqsave(&cm.device_lock, flags); + list_for_each_entry(cm_dev, &cm.device_list, list) { + attr = rdma_find_gid(cm_dev->ib_device, + &path->sgid, + sa_conv_pathrec_to_gid_type(path), + NULL); + if (!IS_ERR(attr)) { + port = cm_dev->port[attr->port_num - 1]; + break; + } + } + read_unlock_irqrestore(&cm.device_lock, flags); + if (port) + rdma_put_gid_attr(attr); + } + return port; +} + +static int cm_init_av_by_path(struct sa_path_rec *path, + const struct ib_gid_attr *sgid_attr, + struct cm_av *av) +{ + struct rdma_ah_attr new_ah_attr; + struct cm_device *cm_dev; + struct cm_port *port; + int ret; + + port = get_cm_port_from_path(path, sgid_attr); + if (!port) + return -EINVAL; + cm_dev = port->cm_dev; + + ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num, + be16_to_cpu(path->pkey), &av->pkey_index); + if (ret) + return ret; + + cm_set_av_port(av, port); + + /* + * av->ah_attr might be initialized based on wc or during + * request processing time which might have reference to sgid_attr. + * So initialize a new ah_attr on stack. + * If initialization fails, old ah_attr is used for sending any + * responses. If initialization is successful, than new ah_attr + * is used by overwriting the old one. So that right ah_attr + * can be used to return an error response. + */ + ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, + &new_ah_attr, sgid_attr); + if (ret) + return ret; + + av->timeout = path->packet_life_time + 1; + rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); + return 0; +} + +/* Move av created by cm_init_av_by_path(), so av.dgid is not moved */ +static void cm_move_av_from_path(struct cm_av *dest, struct cm_av *src) +{ + cm_set_av_port(dest, src->port); + cm_set_av_port(src, NULL); + dest->pkey_index = src->pkey_index; + rdma_move_ah_attr(&dest->ah_attr, &src->ah_attr); + dest->timeout = src->timeout; +} + +static void cm_destroy_av(struct cm_av *av) +{ + rdma_destroy_ah_attr(&av->ah_attr); + cm_set_av_port(av, NULL); +} + +static u32 cm_local_id(__be32 local_id) +{ + return (__force u32) (local_id ^ cm.random_id_operand); +} + +static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) +{ + struct cm_id_private *cm_id_priv; + + rcu_read_lock(); + cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id)); + if (!cm_id_priv || cm_id_priv->id.remote_id != remote_id || + !refcount_inc_not_zero(&cm_id_priv->refcount)) + cm_id_priv = NULL; + rcu_read_unlock(); + + return cm_id_priv; +} + +/* + * Trivial helpers to strip endian annotation and compare; the + * endianness doesn't actually matter since we just need a stable + * order for the RB tree. 
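+ *
+ * For example, on a little-endian host be32_lt(cpu_to_be32(0x100),
+ * cpu_to_be32(0x2)) is true even though 0x100 > 0x2 numerically; the
+ * ordering is still a consistent total order, which is all the rb-tree
+ * insert and lookup code below needs.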
+ */ +static int be32_lt(__be32 a, __be32 b) +{ + return (__force u32) a < (__force u32) b; +} + +static int be32_gt(__be32 a, __be32 b) +{ + return (__force u32) a > (__force u32) b; +} + +static int be64_lt(__be64 a, __be64 b) +{ + return (__force u64) a < (__force u64) b; +} + +static int be64_gt(__be64 a, __be64 b) +{ + return (__force u64) a > (__force u64) b; +} + +/* + * Inserts a new cm_id_priv into the listen_service_table. Returns cm_id_priv + * if the new ID was inserted, NULL if it could not be inserted due to a + * collision, or the existing cm_id_priv ready for shared usage. + */ +static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv, + ib_cm_handler shared_handler) +{ + struct rb_node **link = &cm.listen_service_table.rb_node; + struct rb_node *parent = NULL; + struct cm_id_private *cur_cm_id_priv; + __be64 service_id = cm_id_priv->id.service_id; + __be64 service_mask = cm_id_priv->id.service_mask; + unsigned long flags; + + spin_lock_irqsave(&cm.lock, flags); + while (*link) { + parent = *link; + cur_cm_id_priv = rb_entry(parent, struct cm_id_private, + service_node); + if ((cur_cm_id_priv->id.service_mask & service_id) == + (service_mask & cur_cm_id_priv->id.service_id) && + (cm_id_priv->id.device == cur_cm_id_priv->id.device)) { + /* + * Sharing an ib_cm_id with different handlers is not + * supported + */ + if (cur_cm_id_priv->id.cm_handler != shared_handler || + cur_cm_id_priv->id.context || + WARN_ON(!cur_cm_id_priv->id.cm_handler)) { + spin_unlock_irqrestore(&cm.lock, flags); + return NULL; + } + refcount_inc(&cur_cm_id_priv->refcount); + cur_cm_id_priv->listen_sharecount++; + spin_unlock_irqrestore(&cm.lock, flags); + return cur_cm_id_priv; + } + + if (cm_id_priv->id.device < cur_cm_id_priv->id.device) + link = &(*link)->rb_left; + else if (cm_id_priv->id.device > cur_cm_id_priv->id.device) + link = &(*link)->rb_right; + else if (be64_lt(service_id, cur_cm_id_priv->id.service_id)) + link = &(*link)->rb_left; + else if (be64_gt(service_id, cur_cm_id_priv->id.service_id)) + link = &(*link)->rb_right; + else + link = &(*link)->rb_right; + } + cm_id_priv->listen_sharecount++; + rb_link_node(&cm_id_priv->service_node, parent, link); + rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table); + spin_unlock_irqrestore(&cm.lock, flags); + return cm_id_priv; +} + +static struct cm_id_private *cm_find_listen(struct ib_device *device, + __be64 service_id) +{ + struct rb_node *node = cm.listen_service_table.rb_node; + struct cm_id_private *cm_id_priv; + + while (node) { + cm_id_priv = rb_entry(node, struct cm_id_private, service_node); + if ((cm_id_priv->id.service_mask & service_id) == + cm_id_priv->id.service_id && + (cm_id_priv->id.device == device)) { + refcount_inc(&cm_id_priv->refcount); + return cm_id_priv; + } + if (device < cm_id_priv->id.device) + node = node->rb_left; + else if (device > cm_id_priv->id.device) + node = node->rb_right; + else if (be64_lt(service_id, cm_id_priv->id.service_id)) + node = node->rb_left; + else if (be64_gt(service_id, cm_id_priv->id.service_id)) + node = node->rb_right; + else + node = node->rb_right; + } + return NULL; +} + +static struct cm_timewait_info * +cm_insert_remote_id(struct cm_timewait_info *timewait_info) +{ + struct rb_node **link = &cm.remote_id_table.rb_node; + struct rb_node *parent = NULL; + struct cm_timewait_info *cur_timewait_info; + __be64 remote_ca_guid = timewait_info->remote_ca_guid; + __be32 remote_id = timewait_info->work.remote_id; + + while (*link) { + parent = *link; + 
cur_timewait_info = rb_entry(parent, struct cm_timewait_info, + remote_id_node); + if (be32_lt(remote_id, cur_timewait_info->work.remote_id)) + link = &(*link)->rb_left; + else if (be32_gt(remote_id, cur_timewait_info->work.remote_id)) + link = &(*link)->rb_right; + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_left; + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_right; + else + return cur_timewait_info; + } + timewait_info->inserted_remote_id = 1; + rb_link_node(&timewait_info->remote_id_node, parent, link); + rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table); + return NULL; +} + +static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid, + __be32 remote_id) +{ + struct rb_node *node = cm.remote_id_table.rb_node; + struct cm_timewait_info *timewait_info; + struct cm_id_private *res = NULL; + + spin_lock_irq(&cm.lock); + while (node) { + timewait_info = rb_entry(node, struct cm_timewait_info, + remote_id_node); + if (be32_lt(remote_id, timewait_info->work.remote_id)) + node = node->rb_left; + else if (be32_gt(remote_id, timewait_info->work.remote_id)) + node = node->rb_right; + else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid)) + node = node->rb_left; + else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) + node = node->rb_right; + else { + res = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + break; + } + } + spin_unlock_irq(&cm.lock); + return res; +} + +static struct cm_timewait_info * +cm_insert_remote_qpn(struct cm_timewait_info *timewait_info) +{ + struct rb_node **link = &cm.remote_qp_table.rb_node; + struct rb_node *parent = NULL; + struct cm_timewait_info *cur_timewait_info; + __be64 remote_ca_guid = timewait_info->remote_ca_guid; + __be32 remote_qpn = timewait_info->remote_qpn; + + while (*link) { + parent = *link; + cur_timewait_info = rb_entry(parent, struct cm_timewait_info, + remote_qp_node); + if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn)) + link = &(*link)->rb_left; + else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn)) + link = &(*link)->rb_right; + else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_left; + else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid)) + link = &(*link)->rb_right; + else + return cur_timewait_info; + } + timewait_info->inserted_remote_qp = 1; + rb_link_node(&timewait_info->remote_qp_node, parent, link); + rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table); + return NULL; +} + +static struct cm_id_private * +cm_insert_remote_sidr(struct cm_id_private *cm_id_priv) +{ + struct rb_node **link = &cm.remote_sidr_table.rb_node; + struct rb_node *parent = NULL; + struct cm_id_private *cur_cm_id_priv; + __be32 remote_id = cm_id_priv->id.remote_id; + + while (*link) { + parent = *link; + cur_cm_id_priv = rb_entry(parent, struct cm_id_private, + sidr_id_node); + if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id)) + link = &(*link)->rb_left; + else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id)) + link = &(*link)->rb_right; + else { + if (cur_cm_id_priv->sidr_slid < cm_id_priv->sidr_slid) + link = &(*link)->rb_left; + else if (cur_cm_id_priv->sidr_slid > cm_id_priv->sidr_slid) + link = &(*link)->rb_right; + else + return cur_cm_id_priv; + } + } + rb_link_node(&cm_id_priv->sidr_id_node, parent, link); + rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + return NULL; +} + 
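+/*
+ * Allocate a cm_id_private and reserve a local ID for it. The xarray slot is
+ * allocated with a NULL entry, so cm_acquire_id() cannot find the new ID
+ * until cm_finalize_id() publishes the pointer.
+ */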
+static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
+					       ib_cm_handler cm_handler,
+					       void *context)
+{
+	struct cm_id_private *cm_id_priv;
+	u32 id;
+	int ret;
+
+	cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+	if (!cm_id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+	cm_id_priv->id.device = device;
+	cm_id_priv->id.cm_handler = cm_handler;
+	cm_id_priv->id.context = context;
+	cm_id_priv->id.remote_cm_qpn = 1;
+
+	RB_CLEAR_NODE(&cm_id_priv->service_node);
+	RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+	spin_lock_init(&cm_id_priv->lock);
+	init_completion(&cm_id_priv->comp);
+	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	atomic_set(&cm_id_priv->work_count, -1);
+	refcount_set(&cm_id_priv->refcount, 1);
+
+	ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b,
+			      &cm.local_id_next, GFP_KERNEL);
+	if (ret < 0)
+		goto error;
+	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+
+	return cm_id_priv;
+
+error:
+	kfree(cm_id_priv);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Make the ID visible to the MAD handlers and other threads that use the
+ * xarray.
+ */
+static void cm_finalize_id(struct cm_id_private *cm_id_priv)
+{
+	xa_store(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id),
+		 cm_id_priv, GFP_ATOMIC);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context)
+{
+	struct cm_id_private *cm_id_priv;
+
+	cm_id_priv = cm_alloc_id_priv(device, cm_handler, context);
+	if (IS_ERR(cm_id_priv))
+		return ERR_CAST(cm_id_priv);
+
+	cm_finalize_id(cm_id_priv);
+	return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+static struct cm_work *cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+	struct cm_work *work;
+
+	if (list_empty(&cm_id_priv->work_list))
+		return NULL;
+
+	work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
+	list_del(&work->list);
+	return work;
+}
+
+static void cm_free_work(struct cm_work *work)
+{
+	if (work->mad_recv_wc)
+		ib_free_recv_mad(work->mad_recv_wc);
+	kfree(work);
+}
+
+static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv,
+				 struct cm_work *work)
+	__releases(&cm_id_priv->lock)
+{
+	bool immediate;
+
+	/*
+	 * To deliver the event to the user callback we have to drop the
+	 * spinlock, however, we need to ensure that the user callback is single
+	 * threaded and receives events in temporal order. If there are
+	 * already events being processed then thread new events onto a list,
+	 * the thread currently processing will pick them up.
+	 */
+	immediate = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!immediate) {
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+		/*
+		 * This routine always consumes the incoming reference. Once queued
+		 * to the work_list then a reference is held by the thread
+		 * currently running cm_process_work() and this reference is not
+		 * needed.
+		 */
+		cm_deref_id(cm_id_priv);
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (immediate)
+		cm_process_work(cm_id_priv, work);
+}
+
+static inline int cm_convert_to_ms(int iba_time)
+{
+	/* approximate conversion to ms from 4.096us x 2^iba_time */
+	return 1 << max(iba_time - 8, 0);
+}
+
+/*
+ * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select the max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
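+ * For example, ack_delay = 14 and life_time = 14 gives 16 (2^14 is
+ * within 50% of 2^15, so round up), while ack_delay = 12 and
+ * life_time = 14 gives 15.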
+ */ +static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) +{ + int ack_timeout = packet_life_time + 1; + + if (ack_timeout >= ca_ack_delay) + ack_timeout += (ca_ack_delay >= (ack_timeout - 1)); + else + ack_timeout = ca_ack_delay + + (ack_timeout >= (ca_ack_delay - 1)); + + return min(31, ack_timeout); +} + +static void cm_remove_remote(struct cm_id_private *cm_id_priv) +{ + struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info; + + if (timewait_info->inserted_remote_id) { + rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); + timewait_info->inserted_remote_id = 0; + } + + if (timewait_info->inserted_remote_qp) { + rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table); + timewait_info->inserted_remote_qp = 0; + } +} + +static struct cm_timewait_info *cm_create_timewait_info(__be32 local_id) +{ + struct cm_timewait_info *timewait_info; + + timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); + if (!timewait_info) + return ERR_PTR(-ENOMEM); + + timewait_info->work.local_id = local_id; + INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler); + timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT; + return timewait_info; +} + +static void cm_enter_timewait(struct cm_id_private *cm_id_priv) +{ + int wait_time; + unsigned long flags; + struct cm_device *cm_dev; + + lockdep_assert_held(&cm_id_priv->lock); + + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); + if (!cm_dev) + return; + + spin_lock_irqsave(&cm.lock, flags); + cm_remove_remote(cm_id_priv); + list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); + spin_unlock_irqrestore(&cm.lock, flags); + + /* + * The cm_id could be destroyed by the user before we exit timewait. + * To protect against this, we search for the cm_id after exiting + * timewait before notifying the user that we've exited timewait. + */ + cm_id_priv->id.state = IB_CM_TIMEWAIT; + wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + spin_unlock_irqrestore(&cm.lock, flags); + + /* + * The timewait_info is converted into a work and gets freed during + * cm_free_work() in cm_timewait_handler(). + */ + BUILD_BUG_ON(offsetof(struct cm_timewait_info, work) != 0); + cm_id_priv->timewait_info = NULL; +} + +static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) +{ + unsigned long flags; + + lockdep_assert_held(&cm_id_priv->lock); + + cm_id_priv->id.state = IB_CM_IDLE; + if (cm_id_priv->timewait_info) { + spin_lock_irqsave(&cm.lock, flags); + cm_remove_remote(cm_id_priv); + spin_unlock_irqrestore(&cm.lock, flags); + kfree(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + } +} + +static void cm_destroy_id(struct ib_cm_id *cm_id, int err) +{ + struct cm_id_private *cm_id_priv; + struct cm_work *work; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irq(&cm_id_priv->lock); +retest: + switch (cm_id->state) { + case IB_CM_LISTEN: + spin_lock(&cm.lock); + if (--cm_id_priv->listen_sharecount > 0) { + /* The id is still shared. 
*/ + WARN_ON(refcount_read(&cm_id_priv->refcount) == 1); + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + cm_deref_id(cm_id_priv); + return; + } + cm_id->state = IB_CM_IDLE; + rb_erase(&cm_id_priv->service_node, &cm.listen_service_table); + RB_CLEAR_NODE(&cm_id_priv->service_node); + spin_unlock(&cm.lock); + break; + case IB_CM_SIDR_REQ_SENT: + cm_id->state = IB_CM_IDLE; + ib_cancel_mad(cm_id_priv->msg); + break; + case IB_CM_SIDR_REQ_RCVD: + cm_send_sidr_rep_locked(cm_id_priv, + &(struct ib_cm_sidr_rep_param){ + .status = IB_SIDR_REJECT }); + /* cm_send_sidr_rep_locked will not move to IDLE if it fails */ + cm_id->state = IB_CM_IDLE; + break; + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + ib_cancel_mad(cm_id_priv->msg); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT, + &cm_id_priv->id.device->node_guid, + sizeof(cm_id_priv->id.device->node_guid), + NULL, 0); + break; + case IB_CM_REQ_RCVD: + if (err == -ENOMEM) { + /* Do not reject to allow future retries. */ + cm_reset_to_idle(cm_id_priv); + } else { + cm_send_rej_locked(cm_id_priv, + IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + NULL, 0); + } + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->msg); + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, NULL, 0); + goto retest; + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, NULL, 0); + break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) { + cm_id->state = IB_CM_IDLE; + break; + } + cm_send_dreq_locked(cm_id_priv, NULL, 0); + goto retest; + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->msg); + cm_enter_timewait(cm_id_priv); + goto retest; + case IB_CM_DREQ_RCVD: + cm_send_drep_locked(cm_id_priv, NULL, 0); + WARN_ON(cm_id->state != IB_CM_TIMEWAIT); + goto retest; + case IB_CM_TIMEWAIT: + /* + * The cm_acquire_id in cm_timewait_handler will stop working + * once we do xa_erase below, so just move to idle here for + * consistency. + */ + cm_id->state = IB_CM_IDLE; + break; + case IB_CM_IDLE: + break; + } + WARN_ON(cm_id->state != IB_CM_IDLE); + + spin_lock(&cm.lock); + /* Required for cleanup paths related cm_req_handler() */ + if (cm_id_priv->timewait_info) { + cm_remove_remote(cm_id_priv); + kfree(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + } + + WARN_ON(cm_id_priv->listen_sharecount); + WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node)); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + + xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id)); + cm_deref_id(cm_id_priv); + wait_for_completion(&cm_id_priv->comp); + while ((work = cm_dequeue_work(cm_id_priv)) != NULL) + cm_free_work(work); + + cm_destroy_av(&cm_id_priv->av); + cm_destroy_av(&cm_id_priv->alt_av); + kfree(cm_id_priv->private_data); + kfree_rcu(cm_id_priv, rcu); +} + +void ib_destroy_cm_id(struct ib_cm_id *cm_id) +{ + cm_destroy_id(cm_id, 0); +} +EXPORT_SYMBOL(ib_destroy_cm_id); + +static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id, + __be64 service_mask) +{ + service_mask = service_mask ? 
service_mask : ~cpu_to_be64(0);
+	service_id &= service_mask;
+	if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+	    (service_id != IB_CM_ASSIGN_SERVICE_ID))
+		return -EINVAL;
+
+	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+		cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++);
+		cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+	} else {
+		cm_id_priv->id.service_id = service_id;
+		cm_id_priv->id.service_mask = service_mask;
+	}
+	return 0;
+}
+
+/**
+ * ib_cm_listen - Initiates listening on the specified service ID for
+ *   connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests. The service ID should be specified
+ *   in network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ *   range of service IDs. If set to 0, the service ID is matched
+ *   exactly. This parameter is ignored if %service_id is set to
+ *   IB_CM_ASSIGN_SERVICE_ID.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->id.state != IB_CM_IDLE) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_init_listen(cm_id_priv, service_id, service_mask);
+	if (ret)
+		goto out;
+
+	if (!cm_insert_listen(cm_id_priv, NULL)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	cm_id_priv->id.state = IB_CM_LISTEN;
+	ret = 0;
+
+out:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+/**
+ * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on
+ *   the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ *   be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests. The service ID should be specified
+ *   in network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
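+ *
+ * Return: a valid listening ib_cm_id, or an ERR_PTR on failure.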
+ */ +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, + ib_cm_handler cm_handler, + __be64 service_id) +{ + struct cm_id_private *listen_id_priv; + struct cm_id_private *cm_id_priv; + int err = 0; + + /* Create an ID in advance, since the creation may sleep */ + cm_id_priv = cm_alloc_id_priv(device, cm_handler, NULL); + if (IS_ERR(cm_id_priv)) + return ERR_CAST(cm_id_priv); + + err = cm_init_listen(cm_id_priv, service_id, 0); + if (err) { + ib_destroy_cm_id(&cm_id_priv->id); + return ERR_PTR(err); + } + + spin_lock_irq(&cm_id_priv->lock); + listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler); + if (listen_id_priv != cm_id_priv) { + spin_unlock_irq(&cm_id_priv->lock); + ib_destroy_cm_id(&cm_id_priv->id); + if (!listen_id_priv) + return ERR_PTR(-EINVAL); + return &listen_id_priv->id; + } + cm_id_priv->id.state = IB_CM_LISTEN; + spin_unlock_irq(&cm_id_priv->lock); + + /* + * A listen ID does not need to be in the xarray since it does not + * receive mads, is not placed in the remote_id or remote_qpn rbtree, + * and does not enter timewait. + */ + + return &cm_id_priv->id; +} +EXPORT_SYMBOL(ib_cm_insert_listen); + +static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) +{ + u64 hi_tid = 0, low_tid; + + lockdep_assert_held(&cm_id_priv->lock); + + low_tid = (u64)cm_id_priv->id.local_id; + if (!cm_id_priv->av.port) + return cpu_to_be64(low_tid); + + spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + if (cm_id_priv->av.port->mad_agent) + hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32; + spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock); + return cpu_to_be64(hi_tid | low_tid); +} + +static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, + __be16 attr_id, __be64 tid) +{ + hdr->base_version = IB_MGMT_BASE_VERSION; + hdr->mgmt_class = IB_MGMT_CLASS_CM; + hdr->class_version = IB_CM_CLASS_VERSION; + hdr->method = IB_MGMT_METHOD_SEND; + hdr->attr_id = attr_id; + hdr->tid = tid; +} + +static void cm_format_mad_ece_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, + __be64 tid, u32 attr_mod) +{ + cm_format_mad_hdr(hdr, attr_id, tid); + hdr->attr_mod = cpu_to_be32(attr_mod); +} + +static void cm_format_req(struct cm_req_msg *req_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_req_param *param) +{ + struct sa_path_rec *pri_path = param->primary_path; + struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + __be16 lid; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); + + cm_format_mad_ece_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, + cm_form_tid(cm_id_priv), param->ece.attr_mod); + + IBA_SET(CM_REQ_LOCAL_COMM_ID, req_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REQ_SERVICE_ID, req_msg, be64_to_cpu(param->service_id)); + IBA_SET(CM_REQ_LOCAL_CA_GUID, req_msg, + be64_to_cpu(cm_id_priv->id.device->node_guid)); + IBA_SET(CM_REQ_LOCAL_QPN, req_msg, param->qp_num); + IBA_SET(CM_REQ_INITIATOR_DEPTH, req_msg, param->initiator_depth); + IBA_SET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg, + param->remote_cm_response_timeout); + cm_req_set_qp_type(req_msg, param->qp_type); + IBA_SET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg, param->flow_control); + IBA_SET(CM_REQ_STARTING_PSN, req_msg, param->starting_psn); + IBA_SET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg, + param->local_cm_response_timeout); + IBA_SET(CM_REQ_PARTITION_KEY, req_msg, + be16_to_cpu(param->primary_path->pkey)); + IBA_SET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg, + param->primary_path->mtu); + 
IBA_SET(CM_REQ_MAX_CM_RETRIES, req_msg, param->max_cm_retries); + + if (param->qp_type != IB_QPT_XRC_INI) { + IBA_SET(CM_REQ_RESPONDER_RESOURCES, req_msg, + param->responder_resources); + IBA_SET(CM_REQ_RETRY_COUNT, req_msg, param->retry_count); + IBA_SET(CM_REQ_RNR_RETRY_COUNT, req_msg, + param->rnr_retry_count); + IBA_SET(CM_REQ_SRQ, req_msg, param->srq); + } + + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) = + pri_path->sgid; + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) = + pri_path->dgid; + if (pri_ext) { + IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } + if (pri_path->hop_limit <= 1) { + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(pri_ext ? 0 : + htons(ntohl(sa_path_get_slid( + pri_path))))); + IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, + be16_to_cpu(pri_ext ? 0 : + htons(ntohl(sa_path_get_dlid( + pri_path))))); + } else { + + if (param->primary_path_inbound) { + lid = param->primary_path_inbound->ib.dlid; + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(lid)); + } else + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); + + /* Work-around until there's a way to obtain remote LID info */ + IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); + } + IBA_SET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg, + be32_to_cpu(pri_path->flow_label)); + IBA_SET(CM_REQ_PRIMARY_PACKET_RATE, req_msg, pri_path->rate); + IBA_SET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg, pri_path->traffic_class); + IBA_SET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg, pri_path->hop_limit); + IBA_SET(CM_REQ_PRIMARY_SL, req_msg, pri_path->sl); + IBA_SET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg, + (pri_path->hop_limit <= 1)); + IBA_SET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + pri_path->packet_life_time)); + + if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg) = + alt_path->sgid; + *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg) = + alt_path->dgid; + if (alt_ext) { + IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, + req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, + req_msg) + ->global.interface_id = + OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } + if (alt_path->hop_limit <= 1) { + IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, + be16_to_cpu( + alt_ext ? 0 : + htons(ntohl(sa_path_get_slid( + alt_path))))); + IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, + be16_to_cpu( + alt_ext ? 
0 : + htons(ntohl(sa_path_get_dlid( + alt_path))))); + } else { + IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); + IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, + be16_to_cpu(IB_LID_PERMISSIVE)); + } + IBA_SET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg, + be32_to_cpu(alt_path->flow_label)); + IBA_SET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg, alt_path->rate); + IBA_SET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg, + alt_path->traffic_class); + IBA_SET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg, + alt_path->hop_limit); + IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, alt_path->sl); + IBA_SET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg, + (alt_path->hop_limit <= 1)); + IBA_SET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg, + cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, + alt_path->packet_life_time)); + } + IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id); + + if (param->private_data && param->private_data_len) + IBA_SET_MEM(CM_REQ_PRIVATE_DATA, req_msg, param->private_data, + param->private_data_len); +} + +static int cm_validate_req_param(struct ib_cm_req_param *param) +{ + if (!param->primary_path) + return -EINVAL; + + if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC && + param->qp_type != IB_QPT_XRC_INI) + return -EINVAL; + + if (param->private_data && + param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE) + return -EINVAL; + + if (param->alternate_path && + (param->alternate_path->pkey != param->primary_path->pkey || + param->alternate_path->mtu != param->primary_path->mtu)) + return -EINVAL; + + return 0; +} + +int ib_send_cm_req(struct ib_cm_id *cm_id, + struct ib_cm_req_param *param) +{ + struct cm_av av = {}, alt_av = {}; + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + struct cm_req_msg *req_msg; + unsigned long flags; + int ret; + + ret = cm_validate_req_param(param); + if (ret) + return ret; + + /* Verify that we're not in timewait. 
*/ + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return -EINVAL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> + id.local_id); + if (IS_ERR(cm_id_priv->timewait_info)) { + ret = PTR_ERR(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + return ret; + } + + ret = cm_init_av_by_path(param->primary_path, + param->ppath_sgid_attr, &av); + if (ret) + return ret; + if (param->alternate_path) { + ret = cm_init_av_by_path(param->alternate_path, NULL, + &alt_av); + if (ret) { + cm_destroy_av(&av); + return ret; + } + } + cm_id->service_id = param->service_id; + cm_id->service_mask = ~cpu_to_be64(0); + cm_id_priv->timeout_ms = cm_convert_to_ms( + param->primary_path->packet_life_time) * 2 + + cm_convert_to_ms( + param->remote_cm_response_timeout); + cm_id_priv->max_cm_retries = param->max_cm_retries; + cm_id_priv->initiator_depth = param->initiator_depth; + cm_id_priv->responder_resources = param->responder_resources; + cm_id_priv->retry_count = param->retry_count; + cm_id_priv->path_mtu = param->primary_path->mtu; + cm_id_priv->pkey = param->primary_path->pkey; + cm_id_priv->qp_type = param->qp_type; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + + cm_move_av_from_path(&cm_id_priv->av, &av); + if (param->primary_path_outbound) + cm_id_priv->av.dlid_datapath = + be16_to_cpu(param->primary_path_outbound->ib.dlid); + + if (param->alternate_path) + cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); + + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto out_unlock; + } + + req_msg = (struct cm_req_msg *)msg->mad; + cm_format_req(req_msg, cm_id_priv, param); + cm_id_priv->tid = req_msg->hdr.tid; + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT; + + cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); + cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); + + trace_icm_send_req(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; + BUG_ON(cm_id->state != IB_CM_IDLE); + cm_id->state = IB_CM_REQ_SENT; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; +out_free: + cm_free_priv_msg(msg); +out_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_req); + +static int cm_issue_rej(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + enum ib_cm_rej_reason reason, + enum cm_msg_response msg_rejected, + void *ari, u8 ari_length) +{ + struct ib_mad_send_buf *msg = NULL; + struct cm_rej_msg *rej_msg, *rcv_msg; + int ret; + + ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + if (ret) + return ret; + + /* We just need common CM header information. Cast to any message. 
*/ + rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad; + rej_msg = (struct cm_rej_msg *) msg->mad; + + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid); + IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, + IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg)); + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, msg_rejected); + IBA_SET(CM_REJ_REASON, rej_msg, reason); + + if (ari && ari_length) { + IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length); + IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); + } + + trace_icm_issue_rej( + IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg), + IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg)); + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_response_msg(msg); + + return ret; +} + +static bool cm_req_has_alt_path(struct cm_req_msg *req_msg) +{ + return ((cpu_to_be16( + IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg))) || + (ib_is_opa_gid(IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, + req_msg)))); +} + +static void cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num, + struct sa_path_rec *path, union ib_gid *gid) +{ + if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num)) + path->rec_type = SA_PATH_REC_TYPE_OPA; + else + path->rec_type = SA_PATH_REC_TYPE_IB; +} + +static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path, + struct ib_wc *wc) +{ + u32 lid; + + if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(primary_path, wc->slid); + sa_path_set_slid(primary_path, + IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID, + req_msg)); + } else { + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg)); + sa_path_set_dlid(primary_path, lid); + + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg)); + sa_path_set_slid(primary_path, lid); + } + + if (!cm_req_has_alt_path(req_msg)) + return; + + if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(alt_path, + IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, + req_msg)); + sa_path_set_slid(alt_path, + IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, + req_msg)); + } else { + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg)); + sa_path_set_dlid(alt_path, lid); + + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg)); + sa_path_set_slid(alt_path, lid); + } +} + +static void cm_format_paths_from_req(struct cm_req_msg *req_msg, + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path, + struct ib_wc *wc) +{ + primary_path->dgid = + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg); + primary_path->sgid = + *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg); + primary_path->flow_label = + cpu_to_be32(IBA_GET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg)); + primary_path->hop_limit = IBA_GET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg); + primary_path->traffic_class = + IBA_GET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg); + primary_path->reversible = 1; + primary_path->pkey = + cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); + primary_path->sl = IBA_GET(CM_REQ_PRIMARY_SL, req_msg); + primary_path->mtu_selector = IB_SA_EQ; + primary_path->mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); + primary_path->rate_selector = IB_SA_EQ; + primary_path->rate = IBA_GET(CM_REQ_PRIMARY_PACKET_RATE, req_msg); + primary_path->packet_life_time_selector = IB_SA_EQ; + 
primary_path->packet_life_time = + IBA_GET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg); + primary_path->packet_life_time -= (primary_path->packet_life_time > 0); + primary_path->service_id = + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); + if (sa_path_is_roce(primary_path)) + primary_path->roce.route_resolved = false; + + if (cm_req_has_alt_path(req_msg)) { + alt_path->dgid = *IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg); + alt_path->sgid = *IBA_GET_MEM_PTR( + CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg); + alt_path->flow_label = cpu_to_be32( + IBA_GET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg)); + alt_path->hop_limit = + IBA_GET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg); + alt_path->traffic_class = + IBA_GET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg); + alt_path->reversible = 1; + alt_path->pkey = + cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); + alt_path->sl = IBA_GET(CM_REQ_ALTERNATE_SL, req_msg); + alt_path->mtu_selector = IB_SA_EQ; + alt_path->mtu = + IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); + alt_path->rate_selector = IB_SA_EQ; + alt_path->rate = IBA_GET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg); + alt_path->packet_life_time_selector = IB_SA_EQ; + alt_path->packet_life_time = + IBA_GET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg); + alt_path->packet_life_time -= (alt_path->packet_life_time > 0); + alt_path->service_id = + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); + + if (sa_path_is_roce(alt_path)) + alt_path->roce.route_resolved = false; + } + cm_format_path_lid_from_req(req_msg, primary_path, alt_path, wc); +} + +static u16 cm_get_bth_pkey(struct cm_work *work) +{ + struct ib_device *ib_dev = work->port->cm_dev->ib_device; + u32 port_num = work->port->port_num; + u16 pkey_index = work->mad_recv_wc->wc->pkey_index; + u16 pkey; + int ret; + + ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); + if (ret) { + dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %u, pkey index %u). %d\n", + port_num, pkey_index, ret); + return 0; + } + + return pkey; +} + +/** + * cm_opa_to_ib_sgid - Convert OPA SGID to IB SGID + * ULPs (such as IPoIB) do not understand OPA GIDs and will + * reject them as the local_gid will not match the sgid. Therefore, + * change the pathrec's SGID to an IB SGID. 
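+ * The replacement SGID is the port GID at index 0.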
+ * + * @work: Work completion + * @path: Path record + */ +static void cm_opa_to_ib_sgid(struct cm_work *work, + struct sa_path_rec *path) +{ + struct ib_device *dev = work->port->cm_dev->ib_device; + u32 port_num = work->port->port_num; + + if (rdma_cap_opa_ah(dev, port_num) && + (ib_is_opa_gid(&path->sgid))) { + union ib_gid sgid; + + if (rdma_query_gid(dev, port_num, 0, &sgid)) { + dev_warn(&dev->dev, + "Error updating sgid in CM request\n"); + return; + } + + path->sgid = sgid; + } +} + +static void cm_format_req_event(struct cm_work *work, + struct cm_id_private *cm_id_priv, + struct ib_cm_id *listen_id) +{ + struct cm_req_msg *req_msg; + struct ib_cm_req_event_param *param; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.req_rcvd; + param->listen_id = listen_id; + param->bth_pkey = cm_get_bth_pkey(work); + param->port = cm_id_priv->av.port->port_num; + param->primary_path = &work->path[0]; + cm_opa_to_ib_sgid(work, param->primary_path); + if (cm_req_has_alt_path(req_msg)) { + param->alternate_path = &work->path[1]; + cm_opa_to_ib_sgid(work, param->alternate_path); + } else { + param->alternate_path = NULL; + } + param->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg)); + param->remote_qkey = IBA_GET(CM_REQ_LOCAL_Q_KEY, req_msg); + param->remote_qpn = IBA_GET(CM_REQ_LOCAL_QPN, req_msg); + param->qp_type = cm_req_get_qp_type(req_msg); + param->starting_psn = IBA_GET(CM_REQ_STARTING_PSN, req_msg); + param->responder_resources = IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); + param->initiator_depth = IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); + param->local_cm_response_timeout = + IBA_GET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg); + param->flow_control = IBA_GET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg); + param->remote_cm_response_timeout = + IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg); + param->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); + param->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); + param->srq = IBA_GET(CM_REQ_SRQ, req_msg); + param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; + param->ece.vendor_id = IBA_GET(CM_REQ_VENDOR_ID, req_msg); + param->ece.attr_mod = be32_to_cpu(req_msg->hdr.attr_mod); + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_REQ_PRIVATE_DATA, req_msg); +} + +static void cm_process_work(struct cm_id_private *cm_id_priv, + struct cm_work *work) +{ + int ret; + + /* We will typically only have the current event to report. 
*/ + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); + cm_free_work(work); + + while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) { + spin_lock_irq(&cm_id_priv->lock); + work = cm_dequeue_work(cm_id_priv); + spin_unlock_irq(&cm_id_priv->lock); + if (!work) + return; + + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, + &work->cm_event); + cm_free_work(work); + } + cm_deref_id(cm_id_priv); + if (ret) + cm_destroy_id(&cm_id_priv->id, ret); +} + +static void cm_format_mra(struct cm_mra_msg *mra_msg, + struct cm_id_private *cm_id_priv, + enum cm_msg_response msg_mraed, u8 service_timeout, + const void *private_data, u8 private_data_len) +{ + cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid); + IBA_SET(CM_MRA_MESSAGE_MRAED, mra_msg, msg_mraed); + IBA_SET(CM_MRA_LOCAL_COMM_ID, mra_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout); + + if (private_data && private_data_len) + IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data, + private_data_len); +} + +static void cm_format_rej(struct cm_rej_msg *rej_msg, + struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len, enum ib_cm_state state) +{ + lockdep_assert_held(&cm_id_priv->lock); + + cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid); + IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + + switch (state) { + case IB_CM_REQ_RCVD: + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(0)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ); + break; + case IB_CM_MRA_REQ_SENT: + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ); + break; + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REP); + break; + default: + IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, + CM_MSG_RESPONSE_OTHER); + break; + } + + IBA_SET(CM_REJ_REASON, rej_msg, reason); + if (ari && ari_length) { + IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length); + IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length); + } + + if (private_data && private_data_len) + IBA_SET_MEM(CM_REJ_PRIVATE_DATA, rej_msg, private_data, + private_data_len); +} + +static void cm_dup_req_handler(struct cm_work *work, + struct cm_id_private *cm_id_priv) +{ + struct ib_mad_send_buf *msg = NULL; + int ret; + + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES][CM_REQ_COUNTER]); + + /* Quick state check to discard duplicate REQs. 
*/ + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state == IB_CM_REQ_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); + return; + } + spin_unlock_irq(&cm_id_priv->lock); + + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + if (ret) + return; + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_MRA_REQ_SENT: + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + break; + case IB_CM_TIMEWAIT: + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, + IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0, + IB_CM_TIMEWAIT); + break; + default: + goto unlock; + } + spin_unlock_irq(&cm_id_priv->lock); + + trace_icm_send_dup_req(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; + return; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +free: cm_free_response_msg(msg); +} + +static struct cm_id_private *cm_match_req(struct cm_work *work, + struct cm_id_private *cm_id_priv) +{ + struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; + struct cm_timewait_info *timewait_info; + struct cm_req_msg *req_msg; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + + /* Check for possible duplicate REQ. */ + spin_lock_irq(&cm.lock); + timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info); + if (timewait_info) { + cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + spin_unlock_irq(&cm.lock); + if (cur_cm_id_priv) { + cm_dup_req_handler(work, cur_cm_id_priv); + cm_deref_id(cur_cm_id_priv); + } + return NULL; + } + + /* Check for stale connections. */ + timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); + if (timewait_info) { + cm_remove_remote(cm_id_priv); + cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + + spin_unlock_irq(&cm.lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, + NULL, 0); + if (cur_cm_id_priv) { + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); + cm_deref_id(cur_cm_id_priv); + } + return NULL; + } + + /* Find matching listen request. */ + listen_cm_id_priv = cm_find_listen( + cm_id_priv->id.device, + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg))); + if (!listen_cm_id_priv) { + cm_remove_remote(cm_id_priv); + spin_unlock_irq(&cm.lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, + NULL, 0); + return NULL; + } + spin_unlock_irq(&cm.lock); + return listen_cm_id_priv; +} + +/* + * Work-around for inter-subnet connections. If the LIDs are permissive, + * we need to override the LID/SL data in the REQ with the LID information + * in the work completion. 
+ */ +static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) +{ + if (!IBA_GET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg)) { + if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) { + IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg, + be16_to_cpu(ib_lid_be16(wc->slid))); + IBA_SET(CM_REQ_PRIMARY_SL, req_msg, wc->sl); + } + + if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) + IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg, + wc->dlid_path_bits); + } + + if (!IBA_GET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg)) { + if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) { + IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg, + be16_to_cpu(ib_lid_be16(wc->slid))); + IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, wc->sl); + } + + if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, + req_msg)) == IB_LID_PERMISSIVE) + IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg, + wc->dlid_path_bits); + } +} + +static int cm_req_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv, *listen_cm_id_priv; + struct cm_req_msg *req_msg; + const struct ib_global_route *grh; + const struct ib_gid_attr *gid_attr; + int ret; + + req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; + + cm_id_priv = + cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id_priv)) + return PTR_ERR(cm_id_priv); + + cm_id_priv->id.remote_id = + cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg)); + cm_id_priv->id.service_id = + cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)); + cm_id_priv->id.service_mask = ~cpu_to_be64(0); + cm_id_priv->tid = req_msg->hdr.tid; + cm_id_priv->timeout_ms = cm_convert_to_ms( + IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg)); + cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg); + cm_id_priv->remote_qpn = + cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg)); + cm_id_priv->initiator_depth = + IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg); + cm_id_priv->responder_resources = + IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg); + cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg); + cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg)); + cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg)); + cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg); + cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); + cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); + + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto destroy; + cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> + id.local_id); + if (IS_ERR(cm_id_priv->timewait_info)) { + ret = PTR_ERR(cm_id_priv->timewait_info); + cm_id_priv->timewait_info = NULL; + goto destroy; + } + cm_id_priv->timewait_info->work.remote_id = cm_id_priv->id.remote_id; + cm_id_priv->timewait_info->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg)); + cm_id_priv->timewait_info->remote_qpn = cm_id_priv->remote_qpn; + + /* + * Note that the ID pointer is not in the xarray at this point, + * so this set is only visible to the local thread. 
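+	 * cm_finalize_id() publishes the ID further down, once a listener
+	 * has been matched and the address handles are initialized.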
+ */ + cm_id_priv->id.state = IB_CM_REQ_RCVD; + + listen_cm_id_priv = cm_match_req(work, cm_id_priv); + if (!listen_cm_id_priv) { + trace_icm_no_listener_err(&cm_id_priv->id); + cm_id_priv->id.state = IB_CM_IDLE; + ret = -EINVAL; + goto destroy; + } + + memset(&work->path[0], 0, sizeof(work->path[0])); + if (cm_req_has_alt_path(req_msg)) + memset(&work->path[1], 0, sizeof(work->path[1])); + grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); + gid_attr = grh->sgid_attr; + + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) { + work->path[0].rec_type = + sa_conv_gid_to_pathrec_type(gid_attr->gid_type); + } else { + cm_process_routed_req(req_msg, work->mad_recv_wc->wc); + cm_path_set_rec_type( + work->port->cm_dev->ib_device, work->port->port_num, + &work->path[0], + IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, + req_msg)); + } + if (cm_req_has_alt_path(req_msg)) + work->path[1].rec_type = work->path[0].rec_type; + cm_format_paths_from_req(req_msg, &work->path[0], + &work->path[1], work->mad_recv_wc->wc); + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + sa_path_set_dmac(&work->path[0], + cm_id_priv->av.ah_attr.roce.dmac); + work->path[0].hop_limit = grh->hop_limit; + + /* This destroy call is needed to pair with cm_init_av_for_response */ + cm_destroy_av(&cm_id_priv->av); + ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av); + if (ret) { + int err; + + err = rdma_query_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid); + if (err) + ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, + NULL, 0, NULL, 0); + else + ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID, + &work->path[0].sgid, + sizeof(work->path[0].sgid), + NULL, 0); + goto rejected; + } + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_IB) + cm_id_priv->av.dlid_datapath = + IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg); + + if (cm_req_has_alt_path(req_msg)) { + ret = cm_init_av_by_path(&work->path[1], NULL, + &cm_id_priv->alt_av); + if (ret) { + ib_send_cm_rej(&cm_id_priv->id, + IB_CM_REJ_INVALID_ALT_GID, + &work->path[0].sgid, + sizeof(work->path[0].sgid), NULL, 0); + goto rejected; + } + } + + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; + cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); + + /* Now MAD handlers can see the new ID */ + spin_lock_irq(&cm_id_priv->lock); + cm_finalize_id(cm_id_priv); + + /* Refcount belongs to the event, pairs with cm_process_work() */ + refcount_inc(&cm_id_priv->refcount); + cm_queue_work_unlock(cm_id_priv, work); + /* + * Since this ID was just created and was not made visible to other MAD + * handlers until the cm_finalize_id() above we know that the + * cm_process_work() will deliver the event and the listen_cm_id + * embedded in the event can be derefed here. 
+ */ + cm_deref_id(listen_cm_id_priv); + return 0; + +rejected: + cm_deref_id(listen_cm_id_priv); +destroy: + ib_destroy_cm_id(&cm_id_priv->id); + return ret; +} + +static void cm_format_rep(struct cm_rep_msg *rep_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_rep_param *param) +{ + cm_format_mad_ece_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid, + param->ece.attr_mod); + IBA_SET(CM_REP_LOCAL_COMM_ID, rep_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_REP_REMOTE_COMM_ID, rep_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn); + IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg, + param->responder_resources); + IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, + cm_id_priv->av.port->cm_dev->ack_delay); + IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted); + IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count); + IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg, + be64_to_cpu(cm_id_priv->id.device->node_guid)); + + if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) { + IBA_SET(CM_REP_INITIATOR_DEPTH, rep_msg, + param->initiator_depth); + IBA_SET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg, + param->flow_control); + IBA_SET(CM_REP_SRQ, rep_msg, param->srq); + IBA_SET(CM_REP_LOCAL_QPN, rep_msg, param->qp_num); + } else { + IBA_SET(CM_REP_SRQ, rep_msg, 1); + IBA_SET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, rep_msg, param->qp_num); + } + + IBA_SET(CM_REP_VENDOR_ID_L, rep_msg, param->ece.vendor_id); + IBA_SET(CM_REP_VENDOR_ID_M, rep_msg, param->ece.vendor_id >> 8); + IBA_SET(CM_REP_VENDOR_ID_H, rep_msg, param->ece.vendor_id >> 16); + + if (param->private_data && param->private_data_len) + IBA_SET_MEM(CM_REP_PRIVATE_DATA, rep_msg, param->private_data, + param->private_data_len); +} + +int ib_send_cm_rep(struct ib_cm_id *cm_id, + struct ib_cm_rep_param *param) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + struct cm_rep_msg *rep_msg; + unsigned long flags; + int ret; + + if (param->private_data && + param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REQ_RCVD && + cm_id->state != IB_CM_MRA_REQ_SENT) { + trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state); + ret = -EINVAL; + goto out; + } + + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto out; + } + + rep_msg = (struct cm_rep_msg *) msg->mad; + cm_format_rep(rep_msg, cm_id_priv, param); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT; + + trace_icm_send_rep(cm_id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; + + cm_id->state = IB_CM_REP_SENT; + cm_id_priv->initiator_depth = param->initiator_depth; + cm_id_priv->responder_resources = param->responder_resources; + cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg)); + WARN_ONCE(param->qp_num & 0xFF000000, + "IBTA declares QPN to be 24 bits, but it is 0x%X\n", + param->qp_num); + cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +out_free: + cm_free_priv_msg(msg); +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rep); + +static void cm_format_rtu(struct cm_rtu_msg *rtu_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + 
cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid); + IBA_SET(CM_RTU_LOCAL_COMM_ID, rtu_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_RTU_REMOTE_COMM_ID, rtu_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + + if (private_data && private_data_len) + IBA_SET_MEM(CM_RTU_PRIVATE_DATA, rtu_msg, private_data, + private_data_len); +} + +int ib_send_cm_rtu(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + void *data; + int ret; + + if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_REP_RCVD && + cm_id->state != IB_CM_MRA_REP_SENT) { + trace_icm_send_cm_rtu_err(cm_id); + ret = -EINVAL; + goto error; + } + + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto error; + } + + cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + + trace_icm_send_rtu(cm_id); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + kfree(data); + return ret; + } + + cm_id->state = IB_CM_ESTABLISHED; + cm_set_private_data(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rtu); + +static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) +{ + struct cm_rep_msg *rep_msg; + struct ib_cm_rep_event_param *param; + + rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.rep_rcvd; + param->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg)); + param->remote_qkey = IBA_GET(CM_REP_LOCAL_Q_KEY, rep_msg); + param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type)); + param->starting_psn = IBA_GET(CM_REP_STARTING_PSN, rep_msg); + param->responder_resources = IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg); + param->initiator_depth = IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg); + param->target_ack_delay = IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg); + param->failover_accepted = IBA_GET(CM_REP_FAILOVER_ACCEPTED, rep_msg); + param->flow_control = IBA_GET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg); + param->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); + param->srq = IBA_GET(CM_REP_SRQ, rep_msg); + param->ece.vendor_id = IBA_GET(CM_REP_VENDOR_ID_H, rep_msg) << 16; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_M, rep_msg) << 8; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_L, rep_msg); + param->ece.attr_mod = be32_to_cpu(rep_msg->hdr.attr_mod); + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_REP_PRIVATE_DATA, rep_msg); +} + +static void cm_dup_rep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rep_msg *rep_msg; + struct ib_mad_send_buf *msg = NULL; + int ret; + + rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), + cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg))); + if (!cm_id_priv) + return; + + atomic_long_inc( + 
&work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]); + ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg); + if (ret) + goto deref; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state == IB_CM_ESTABLISHED) + cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT) + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + else + goto unlock; + spin_unlock_irq(&cm_id_priv->lock); + + trace_icm_send_dup_rep(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto free; + goto deref; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +free: cm_free_response_msg(msg); +deref: cm_deref_id(cm_id_priv); +} + +static int cm_rep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rep_msg *rep_msg; + int ret; + struct cm_id_private *cur_cm_id_priv; + struct cm_timewait_info *timewait_info; + + rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); + if (!cm_id_priv) { + cm_dup_rep_handler(work); + trace_icm_remote_no_priv_err( + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); + return -EINVAL; + } + + cm_format_rep_event(work, cm_id_priv->qp_type); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + break; + default: + ret = -EINVAL; + trace_icm_rep_unknown_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg), + cm_id_priv->id.state); + spin_unlock_irq(&cm_id_priv->lock); + goto error; + } + + cm_id_priv->timewait_info->work.remote_id = + cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)); + cm_id_priv->timewait_info->remote_ca_guid = + cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg)); + cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); + + spin_lock(&cm.lock); + /* Check for duplicate REP. */ + if (cm_insert_remote_id(cm_id_priv->timewait_info)) { + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + ret = -EINVAL; + trace_icm_insert_failed_err( + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); + goto error; + } + /* Check for a stale connection. 
*/ + timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); + if (timewait_info) { + cm_remove_remote(cm_id_priv); + cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + + spin_unlock(&cm.lock); + spin_unlock_irq(&cm_id_priv->lock); + cm_issue_rej(work->port, work->mad_recv_wc, + IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, + NULL, 0); + ret = -EINVAL; + trace_icm_staleconn_err( + IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg), + IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); + + if (cur_cm_id_priv) { + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); + cm_deref_id(cur_cm_id_priv); + } + + goto error; + } + spin_unlock(&cm.lock); + + cm_id_priv->id.state = IB_CM_REP_RCVD; + cm_id_priv->id.remote_id = + cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)); + cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type); + cm_id_priv->initiator_depth = + IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg); + cm_id_priv->responder_resources = + IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg); + cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg)); + cm_id_priv->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); + cm_id_priv->target_ack_delay = + IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg); + cm_id_priv->av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->av.timeout - 1); + cm_id_priv->alt_av.timeout = + cm_ack_timeout(cm_id_priv->target_ack_delay, + cm_id_priv->alt_av.timeout - 1); + + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); + return 0; + +error: + cm_deref_id(cm_id_priv); + return ret; +} + +static int cm_establish_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + + /* See comment in cm_establish about lookup. */ + cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); + if (!cm_id_priv) + return -EINVAL; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_rtu_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_rtu_msg *rtu_msg; + + rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_RTU_REMOTE_COMM_ID, rtu_msg)), + cpu_to_be32(IBA_GET(CM_RTU_LOCAL_COMM_ID, rtu_msg))); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_RTU_PRIVATE_DATA, rtu_msg); + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_REP_SENT && + cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_RTU_COUNTER]); + goto out; + } + cm_id_priv->id.state = IB_CM_ESTABLISHED; + + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, + cm_form_tid(cm_id_priv)); + IBA_SET(CM_DREQ_LOCAL_COMM_ID, dreq_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_DREQ_REMOTE_COMM_ID, dreq_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg, + be32_to_cpu(cm_id_priv->remote_qpn)); + 
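+ /*
+ * Descriptive note: carrying the remote QPN/EECN in the DREQ lets the
+ * receiving side confirm the message refers to the live connection;
+ * cm_dreq_handler() below compares this field against its local QPN
+ * and silently drops mismatches.
+ */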
+ if (private_data && private_data_len) + IBA_SET_MEM(CM_DREQ_PRIVATE_DATA, dreq_msg, private_data, + private_data_len); +} + +static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, + const void *private_data, u8 private_data_len) +{ + struct ib_mad_send_buf *msg; + int ret; + + lockdep_assert_held(&cm_id_priv->lock); + + if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE) + return -EINVAL; + + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) { + trace_icm_dreq_skipped(&cm_id_priv->id); + return -EINVAL; + } + + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || + cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->msg); + + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { + cm_enter_timewait(cm_id_priv); + return PTR_ERR(msg); + } + + cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT; + + trace_icm_send_dreq(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_enter_timewait(cm_id_priv); + cm_free_priv_msg(msg); + return ret; + } + + cm_id_priv->id.state = IB_CM_DREQ_SENT; + return 0; +} + +int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_dreq); + +static void cm_format_drep(struct cm_drep_msg *drep_msg, + struct cm_id_private *cm_id_priv, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid); + IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + + if (private_data && private_data_len) + IBA_SET_MEM(CM_DREP_PRIVATE_DATA, drep_msg, private_data, + private_data_len); +} + +static int cm_send_drep_locked(struct cm_id_private *cm_id_priv, + void *private_data, u8 private_data_len) +{ + struct ib_mad_send_buf *msg; + int ret; + + lockdep_assert_held(&cm_id_priv->lock); + + if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE) + return -EINVAL; + + if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) { + trace_icm_send_drep_err(&cm_id_priv->id); + kfree(private_data); + return -EINVAL; + } + + cm_set_private_data(cm_id_priv, private_data, private_data_len); + cm_enter_timewait(cm_id_priv); + + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, + private_data, private_data_len); + + trace_icm_send_drep(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_free_msg(msg); + return ret; + } + return 0; +} + +int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + void *data; + int ret; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_drep_locked(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); 
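+ /*
+ * Descriptive note: the copied private data is handed to
+ * cm_send_drep_locked() rather than freed here; on success it is
+ * attached to the cm_id via cm_set_private_data().
+ */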
+ return ret; +} +EXPORT_SYMBOL(ib_send_cm_drep); + +static int cm_issue_drep(struct cm_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_buf *msg = NULL; + struct cm_dreq_msg *dreq_msg; + struct cm_drep_msg *drep_msg; + int ret; + + ret = cm_alloc_response_msg(port, mad_recv_wc, &msg); + if (ret) + return ret; + + dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad; + drep_msg = (struct cm_drep_msg *) msg->mad; + + cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid); + IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg, + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg)); + IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg, + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); + + trace_icm_issue_drep( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); + ret = ib_post_send_mad(msg, NULL); + if (ret) + cm_free_response_msg(msg); + + return ret; +} + +static int cm_dreq_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_dreq_msg *dreq_msg; + struct ib_mad_send_buf *msg = NULL; + + dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)), + cpu_to_be32(IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg))); + if (!cm_id_priv) { + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); + cm_issue_drep(work->port, work->mad_recv_wc); + trace_icm_no_priv_err( + IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg), + IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)); + return -EINVAL; + } + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_DREQ_PRIVATE_DATA, dreq_msg); + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->local_qpn != + cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg))) + goto unlock; + + switch (cm_id_priv->id.state) { + case IB_CM_REP_SENT: + case IB_CM_DREQ_SENT: + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->msg); + break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT || + cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + ib_cancel_mad(cm_id_priv->msg); + break; + case IB_CM_TIMEWAIT: + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) + goto unlock; + + cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + spin_unlock_irq(&cm_id_priv->lock); + + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) + cm_free_response_msg(msg); + goto deref; + case IB_CM_DREQ_RCVD: + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_DREQ_COUNTER]); + goto unlock; + default: + trace_icm_dreq_unknown_err(&cm_id_priv->id); + goto unlock; + } + cm_id_priv->id.state = IB_CM_DREQ_RCVD; + cm_id_priv->tid = dreq_msg->hdr.tid; + cm_queue_work_unlock(cm_id_priv, work); + return 0; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +deref: cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_drep_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_drep_msg *drep_msg; + + drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_DREP_REMOTE_COMM_ID, drep_msg)), + cpu_to_be32(IBA_GET(CM_DREP_LOCAL_COMM_ID, drep_msg))); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_DREP_PRIVATE_DATA, drep_msg); + + 
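+ /*
+ * Descriptive note: a DREP is only acted on while a DREQ is outstanding
+ * (sent or received); in that case the connection moves into timewait
+ * and the pending DREQ MAD is cancelled.
+ */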
spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_DREQ_SENT && + cm_id_priv->id.state != IB_CM_DREQ_RCVD) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_enter_timewait(cm_id_priv); + + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_send_rej_locked(struct cm_id_private *cm_id_priv, + enum ib_cm_rej_reason reason, void *ari, + u8 ari_length, const void *private_data, + u8 private_data_len) +{ + enum ib_cm_state state = cm_id_priv->id.state; + struct ib_mad_send_buf *msg; + int ret; + + lockdep_assert_held(&cm_id_priv->lock); + + if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) || + (ari && ari_length > IB_CM_REJ_ARI_LENGTH)) + return -EINVAL; + + switch (state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + cm_reset_to_idle(cm_id_priv); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, + ari, ari_length, private_data, private_data_len, + state); + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + cm_enter_timewait(cm_id_priv); + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason, + ari, ari_length, private_data, private_data_len, + state); + break; + default: + trace_icm_send_unknown_rej_err(&cm_id_priv->id); + return -EINVAL; + } + + trace_icm_send_rej(&cm_id_priv->id, reason); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_free_msg(msg); + return ret; + } + + return 0; +} + +int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason, + void *ari, u8 ari_length, const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_rej_locked(cm_id_priv, reason, ari, ari_length, + private_data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_rej); + +static void cm_format_rej_event(struct cm_work *work) +{ + struct cm_rej_msg *rej_msg; + struct ib_cm_rej_event_param *param; + + rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.rej_rcvd; + param->ari = IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg); + param->ari_length = IBA_GET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg); + param->reason = IBA_GET(CM_REJ_REASON, rej_msg); + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_REJ_PRIVATE_DATA, rej_msg); +} + +static struct cm_id_private *cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) +{ + struct cm_id_private *cm_id_priv; + __be32 remote_id; + + remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg)); + + if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) { + cm_id_priv = cm_find_remote_id( + *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)), + remote_id); + } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) == + CM_MSG_RESPONSE_REQ) + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), + 0); + else + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), + remote_id); + + return cm_id_priv; +} + +static int cm_rej_handler(struct cm_work *work) +{ + struct cm_id_private 
*cm_id_priv; + struct cm_rej_msg *rej_msg; + + rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_rejected_id(rej_msg); + if (!cm_id_priv) + return -EINVAL; + + cm_format_rej_event(work); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + ib_cancel_mad(cm_id_priv->msg); + fallthrough; + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_STALE_CONN) + cm_enter_timewait(cm_id_priv); + else + cm_reset_to_idle(cm_id_priv); + break; + case IB_CM_DREQ_SENT: + ib_cancel_mad(cm_id_priv->msg); + fallthrough; + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + cm_enter_timewait(cm_id_priv); + break; + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT || + cm_id_priv->id.lap_state == IB_CM_LAP_SENT) { + if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) + ib_cancel_mad(cm_id_priv->msg); + cm_enter_timewait(cm_id_priv); + break; + } + fallthrough; + default: + trace_icm_rej_unknown_err(&cm_id_priv->id); + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + + cm_queue_work_unlock(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +int ib_send_cm_mra(struct ib_cm_id *cm_id, + u8 service_timeout, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + enum ib_cm_state cm_state; + enum ib_cm_lap_state lap_state; + enum cm_msg_response msg_response; + void *data; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE) + return -EINVAL; + + data = cm_copy_private_data(private_data, private_data_len); + if (IS_ERR(data)) + return PTR_ERR(data); + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + cm_state = IB_CM_MRA_REQ_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REQ; + break; + case IB_CM_REP_RCVD: + cm_state = IB_CM_MRA_REP_SENT; + lap_state = cm_id->lap_state; + msg_response = CM_MSG_RESPONSE_REP; + break; + case IB_CM_ESTABLISHED: + if (cm_id->lap_state == IB_CM_LAP_RCVD) { + cm_state = cm_id->state; + lap_state = IB_CM_MRA_LAP_SENT; + msg_response = CM_MSG_RESPONSE_OTHER; + break; + } + fallthrough; + default: + trace_icm_send_mra_unknown_err(&cm_id_priv->id); + ret = -EINVAL; + goto error_unlock; + } + + if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) { + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto error_unlock; + } + + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + msg_response, service_timeout, + private_data, private_data_len); + trace_icm_send_mra(cm_id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto error_free_msg; + } + + cm_id->state = cm_state; + cm_id->lap_state = lap_state; + cm_id_priv->service_timeout = service_timeout; + cm_set_private_data(cm_id_priv, data, private_data_len); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; + +error_free_msg: + cm_free_msg(msg); +error_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + kfree(data); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_mra); + +static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) +{ + switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) { + case CM_MSG_RESPONSE_REQ: + return cm_acquire_id( + 
cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), + 0); + case CM_MSG_RESPONSE_REP: + case CM_MSG_RESPONSE_OTHER: + return cm_acquire_id( + cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), + cpu_to_be32(IBA_GET(CM_MRA_LOCAL_COMM_ID, mra_msg))); + default: + return NULL; + } +} + +static int cm_mra_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_mra_msg *mra_msg; + int timeout; + + mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_mraed_id(mra_msg); + if (!cm_id_priv) + return -EINVAL; + + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_MRA_PRIVATE_DATA, mra_msg); + work->cm_event.param.mra_rcvd.service_timeout = + IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg); + timeout = cm_convert_to_ms(IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg)) + + cm_convert_to_ms(cm_id_priv->av.timeout); + + spin_lock_irq(&cm_id_priv->lock); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_SENT: + if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != + CM_MSG_RESPONSE_REQ || + ib_modify_mad(cm_id_priv->msg, timeout)) + goto out; + cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; + break; + case IB_CM_REP_SENT: + if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != + CM_MSG_RESPONSE_REP || + ib_modify_mad(cm_id_priv->msg, timeout)) + goto out; + cm_id_priv->id.state = IB_CM_MRA_REP_RCVD; + break; + case IB_CM_ESTABLISHED: + if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != + CM_MSG_RESPONSE_OTHER || + cm_id_priv->id.lap_state != IB_CM_LAP_SENT || + ib_modify_mad(cm_id_priv->msg, timeout)) { + if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD) + atomic_long_inc( + &work->port->counters[CM_RECV_DUPLICATES] + [CM_MRA_COUNTER]); + goto out; + } + cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD; + break; + case IB_CM_MRA_REQ_RCVD: + case IB_CM_MRA_REP_RCVD: + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_MRA_COUNTER]); + fallthrough; + default: + trace_icm_mra_unknown_err(&cm_id_priv->id); + goto out; + } + + cm_id_priv->msg->context[1] = (void *) (unsigned long) + cm_id_priv->id.state; + cm_queue_work_unlock(cm_id_priv, work); + return 0; +out: + spin_unlock_irq(&cm_id_priv->lock); + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg, + struct sa_path_rec *path) +{ + u32 lid; + + if (path->rec_type != SA_PATH_REC_TYPE_OPA) { + sa_path_set_dlid(path, IBA_GET(CM_LAP_ALTERNATE_LOCAL_PORT_LID, + lap_msg)); + sa_path_set_slid(path, IBA_GET(CM_LAP_ALTERNATE_REMOTE_PORT_LID, + lap_msg)); + } else { + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg)); + sa_path_set_dlid(path, lid); + + lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR( + CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg)); + sa_path_set_slid(path, lid); + } +} + +static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, + struct sa_path_rec *path, + struct cm_lap_msg *lap_msg) +{ + path->dgid = *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg); + path->sgid = + *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg); + path->flow_label = + cpu_to_be32(IBA_GET(CM_LAP_ALTERNATE_FLOW_LABEL, lap_msg)); + path->hop_limit = IBA_GET(CM_LAP_ALTERNATE_HOP_LIMIT, lap_msg); + path->traffic_class = IBA_GET(CM_LAP_ALTERNATE_TRAFFIC_CLASS, lap_msg); + path->reversible = 1; + path->pkey = cm_id_priv->pkey; + path->sl = IBA_GET(CM_LAP_ALTERNATE_SL, lap_msg); + path->mtu_selector = IB_SA_EQ; + path->mtu = cm_id_priv->path_mtu; + path->rate_selector = IB_SA_EQ; + path->rate = 
IBA_GET(CM_LAP_ALTERNATE_PACKET_RATE, lap_msg); + path->packet_life_time_selector = IB_SA_EQ; + path->packet_life_time = + IBA_GET(CM_LAP_ALTERNATE_LOCAL_ACK_TIMEOUT, lap_msg); + path->packet_life_time -= (path->packet_life_time > 0); + cm_format_path_lid_from_lap(lap_msg, path); +} + +static int cm_lap_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_lap_msg *lap_msg; + struct ib_cm_lap_event_param *param; + struct ib_mad_send_buf *msg = NULL; + struct rdma_ah_attr ah_attr; + struct cm_av alt_av = {}; + int ret; + + /* Currently Alternate path messages are not supported for + * RoCE link layer. + */ + if (rdma_protocol_roce(work->port->cm_dev->ib_device, + work->port->port_num)) + return -EINVAL; + + /* todo: verify LAP request and send reject APR if invalid. */ + lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_LAP_REMOTE_COMM_ID, lap_msg)), + cpu_to_be32(IBA_GET(CM_LAP_LOCAL_COMM_ID, lap_msg))); + if (!cm_id_priv) + return -EINVAL; + + param = &work->cm_event.param.lap_rcvd; + memset(&work->path[0], 0, sizeof(work->path[1])); + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, &work->path[0], + IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, + lap_msg)); + param->alternate_path = &work->path[0]; + cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg); + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_LAP_PRIVATE_DATA, lap_msg); + + ret = ib_init_ah_attr_from_wc(work->port->cm_dev->ib_device, + work->port->port_num, + work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto deref; + + ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av); + if (ret) { + rdma_destroy_ah_attr(&ah_attr); + goto deref; + } + + spin_lock_irq(&cm_id_priv->lock); + cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, + &ah_attr, &cm_id_priv->av); + cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av); + + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) + goto unlock; + + switch (cm_id_priv->id.lap_state) { + case IB_CM_LAP_UNINIT: + case IB_CM_LAP_IDLE: + break; + case IB_CM_MRA_LAP_SENT: + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_LAP_COUNTER]); + msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc); + if (IS_ERR(msg)) + goto unlock; + + cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv, + CM_MSG_RESPONSE_OTHER, + cm_id_priv->service_timeout, + cm_id_priv->private_data, + cm_id_priv->private_data_len); + spin_unlock_irq(&cm_id_priv->lock); + + if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) || + ib_post_send_mad(msg, NULL)) + cm_free_response_msg(msg); + goto deref; + case IB_CM_LAP_RCVD: + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_LAP_COUNTER]); + goto unlock; + default: + goto unlock; + } + + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; + cm_id_priv->tid = lap_msg->hdr.tid; + cm_queue_work_unlock(cm_id_priv, work); + return 0; + +unlock: spin_unlock_irq(&cm_id_priv->lock); +deref: cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_apr_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_apr_msg *apr_msg; + + /* Currently Alternate path messages are not supported for + * RoCE link layer. 
+ */ + if (rdma_protocol_roce(work->port->cm_dev->ib_device, + work->port->port_num)) + return -EINVAL; + + apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_APR_REMOTE_COMM_ID, apr_msg)), + cpu_to_be32(IBA_GET(CM_APR_LOCAL_COMM_ID, apr_msg))); + if (!cm_id_priv) + return -EINVAL; /* Unmatched reply. */ + + work->cm_event.param.apr_rcvd.ap_status = + IBA_GET(CM_APR_AR_STATUS, apr_msg); + work->cm_event.param.apr_rcvd.apr_info = + IBA_GET_MEM_PTR(CM_APR_ADDITIONAL_INFORMATION, apr_msg); + work->cm_event.param.apr_rcvd.info_len = + IBA_GET(CM_APR_ADDITIONAL_INFORMATION_LENGTH, apr_msg); + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_APR_PRIVATE_DATA, apr_msg); + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED || + (cm_id_priv->id.lap_state != IB_CM_LAP_SENT && + cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; + ib_cancel_mad(cm_id_priv->msg); + cm_queue_work_unlock(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_timewait_handler(struct cm_work *work) +{ + struct cm_timewait_info *timewait_info; + struct cm_id_private *cm_id_priv; + + timewait_info = container_of(work, struct cm_timewait_info, work); + spin_lock_irq(&cm.lock); + list_del(&timewait_info->list); + spin_unlock_irq(&cm.lock); + + cm_id_priv = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + if (!cm_id_priv) + return -EINVAL; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_TIMEWAIT || + cm_id_priv->remote_qpn != timewait_info->remote_qpn) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.state = IB_CM_IDLE; + cm_queue_work_unlock(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_req_param *param) +{ + cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, + cm_form_tid(cm_id_priv)); + IBA_SET(CM_SIDR_REQ_REQUESTID, sidr_req_msg, + be32_to_cpu(cm_id_priv->id.local_id)); + IBA_SET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg, + be16_to_cpu(param->path->pkey)); + IBA_SET(CM_SIDR_REQ_SERVICEID, sidr_req_msg, + be64_to_cpu(param->service_id)); + + if (param->private_data && param->private_data_len) + IBA_SET_MEM(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg, + param->private_data, param->private_data_len); +} + +int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, + struct ib_cm_sidr_req_param *param) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + struct cm_av av = {}; + unsigned long flags; + int ret; + + if (!param->path || (param->private_data && + param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)) + return -EINVAL; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + ret = cm_init_av_by_path(param->path, param->sgid_attr, &av); + if (ret) + return ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + cm_move_av_from_path(&cm_id_priv->av, &av); + cm_id->service_id = param->service_id; + cm_id->service_mask = ~cpu_to_be64(0); + cm_id_priv->timeout_ms = param->timeout_ms; + cm_id_priv->max_cm_retries = param->max_cm_retries; + if (cm_id->state != IB_CM_IDLE) { + ret = -EINVAL; + goto out_unlock; + } + + msg = cm_alloc_priv_msg(cm_id_priv); + if (IS_ERR(msg)) { + ret = PTR_ERR(msg); + goto 
out_unlock; + } + + cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv, + param); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT; + + trace_icm_send_sidr_req(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) + goto out_free; + cm_id->state = IB_CM_SIDR_REQ_SENT; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return 0; +out_free: + cm_free_priv_msg(msg); +out_unlock: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_sidr_req); + +static void cm_format_sidr_req_event(struct cm_work *work, + const struct cm_id_private *rx_cm_id, + struct ib_cm_id *listen_id) +{ + struct cm_sidr_req_msg *sidr_req_msg; + struct ib_cm_sidr_req_event_param *param; + + sidr_req_msg = (struct cm_sidr_req_msg *) + work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.sidr_req_rcvd; + param->pkey = IBA_GET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg); + param->listen_id = listen_id; + param->service_id = + cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); + param->bth_pkey = cm_get_bth_pkey(work); + param->port = work->port->port_num; + param->sgid_attr = rx_cm_id->av.ah_attr.grh.sgid_attr; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg); +} + +static int cm_sidr_req_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv, *listen_cm_id_priv; + struct cm_sidr_req_msg *sidr_req_msg; + struct ib_wc *wc; + int ret; + + cm_id_priv = + cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL); + if (IS_ERR(cm_id_priv)) + return PTR_ERR(cm_id_priv); + + /* Record SGID/SLID and request ID for lookup. */ + sidr_req_msg = (struct cm_sidr_req_msg *) + work->mad_recv_wc->recv_buf.mad; + + cm_id_priv->id.remote_id = + cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg)); + cm_id_priv->id.service_id = + cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)); + cm_id_priv->id.service_mask = ~cpu_to_be64(0); + cm_id_priv->tid = sidr_req_msg->hdr.tid; + + wc = work->mad_recv_wc->wc; + cm_id_priv->sidr_slid = wc->slid; + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto out; + + spin_lock_irq(&cm.lock); + listen_cm_id_priv = cm_insert_remote_sidr(cm_id_priv); + if (listen_cm_id_priv) { + spin_unlock_irq(&cm.lock); + atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES] + [CM_SIDR_REQ_COUNTER]); + goto out; /* Duplicate message. */ + } + cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; + listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, + cm_id_priv->id.service_id); + if (!listen_cm_id_priv) { + spin_unlock_irq(&cm.lock); + ib_send_cm_sidr_rep(&cm_id_priv->id, + &(struct ib_cm_sidr_rep_param){ + .status = IB_SIDR_UNSUPPORTED }); + goto out; /* No match. */ + } + spin_unlock_irq(&cm.lock); + + cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; + cm_id_priv->id.context = listen_cm_id_priv->id.context; + + /* + * A SIDR ID does not need to be in the xarray since it does not receive + * mads, is not placed in the remote_id or remote_qpn rbtree, and does + * not enter timewait. + */ + + cm_format_sidr_req_event(work, cm_id_priv, &listen_cm_id_priv->id); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); + cm_free_work(work); + /* + * A pointer to the listen_cm_id is held in the event, so this deref + * must be after the event is delivered above. 
+ */ + cm_deref_id(listen_cm_id_priv); + if (ret) + cm_destroy_id(&cm_id_priv->id, ret); + return 0; +out: + ib_destroy_cm_id(&cm_id_priv->id); + return -EINVAL; +} + +static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, + struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param) +{ + cm_format_mad_ece_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, + cm_id_priv->tid, param->ece.attr_mod); + IBA_SET(CM_SIDR_REP_REQUESTID, sidr_rep_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + IBA_SET(CM_SIDR_REP_STATUS, sidr_rep_msg, param->status); + IBA_SET(CM_SIDR_REP_QPN, sidr_rep_msg, param->qp_num); + IBA_SET(CM_SIDR_REP_SERVICEID, sidr_rep_msg, + be64_to_cpu(cm_id_priv->id.service_id)); + IBA_SET(CM_SIDR_REP_Q_KEY, sidr_rep_msg, param->qkey); + IBA_SET(CM_SIDR_REP_VENDOR_ID_L, sidr_rep_msg, + param->ece.vendor_id & 0xFF); + IBA_SET(CM_SIDR_REP_VENDOR_ID_H, sidr_rep_msg, + (param->ece.vendor_id >> 8) & 0xFF); + + if (param->info && param->info_length) + IBA_SET_MEM(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg, + param->info, param->info_length); + + if (param->private_data && param->private_data_len) + IBA_SET_MEM(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg, + param->private_data, param->private_data_len); +} + +static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, + struct ib_cm_sidr_rep_param *param) +{ + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + lockdep_assert_held(&cm_id_priv->lock); + + if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) || + (param->private_data && + param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)) + return -EINVAL; + + if (cm_id_priv->id.state != IB_CM_SIDR_REQ_RCVD) + return -EINVAL; + + msg = cm_alloc_msg(cm_id_priv); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv, + param); + trace_icm_send_sidr_rep(&cm_id_priv->id); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + cm_free_msg(msg); + return ret; + } + cm_id_priv->id.state = IB_CM_IDLE; + spin_lock_irqsave(&cm.lock, flags); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); + } + spin_unlock_irqrestore(&cm.lock, flags); + return 0; +} + +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param) +{ + struct cm_id_private *cm_id_priv = + container_of(cm_id, struct cm_id_private, id); + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_send_sidr_rep_locked(cm_id_priv, param); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_sidr_rep); + +static void cm_format_sidr_rep_event(struct cm_work *work, + const struct cm_id_private *cm_id_priv) +{ + struct cm_sidr_rep_msg *sidr_rep_msg; + struct ib_cm_sidr_rep_event_param *param; + + sidr_rep_msg = (struct cm_sidr_rep_msg *) + work->mad_recv_wc->recv_buf.mad; + param = &work->cm_event.param.sidr_rep_rcvd; + param->status = IBA_GET(CM_SIDR_REP_STATUS, sidr_rep_msg); + param->qkey = IBA_GET(CM_SIDR_REP_Q_KEY, sidr_rep_msg); + param->qpn = IBA_GET(CM_SIDR_REP_QPN, sidr_rep_msg); + param->info = IBA_GET_MEM_PTR(CM_SIDR_REP_ADDITIONAL_INFORMATION, + sidr_rep_msg); + param->info_len = IBA_GET(CM_SIDR_REP_ADDITIONAL_INFORMATION_LENGTH, + sidr_rep_msg); + param->sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; + work->cm_event.private_data = + IBA_GET_MEM_PTR(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg); +} + 
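+/*
+ * Editor's note (not part of the original patch): a minimal sketch of how a
+ * ULP's listen handler might answer an IB_CM_SIDR_REQ_RECEIVED event with
+ * ib_send_cm_sidr_rep().  The handler name, my_ud_qp and MY_QKEY are
+ * illustrative assumptions; IB_SIDR_SUCCESS is assumed to be the usual
+ * success status defined in <rdma/ib_cm.h>.
+ *
+ *	static int my_sidr_listen_handler(struct ib_cm_id *cm_id,
+ *					  const struct ib_cm_event *event)
+ *	{
+ *		struct ib_cm_sidr_rep_param rep = {
+ *			.qp_num = my_ud_qp->qp_num,
+ *			.qkey   = MY_QKEY,
+ *			.status = IB_SIDR_SUCCESS,
+ *		};
+ *
+ *		if (event->event != IB_CM_SIDR_REQ_RECEIVED)
+ *			return 0;
+ *		return ib_send_cm_sidr_rep(cm_id, &rep);
+ *	}
+ */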
+static int cm_sidr_rep_handler(struct cm_work *work) +{ + struct cm_sidr_rep_msg *sidr_rep_msg; + struct cm_id_private *cm_id_priv; + + sidr_rep_msg = (struct cm_sidr_rep_msg *) + work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id( + cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg)), 0); + if (!cm_id_priv) + return -EINVAL; /* Unmatched reply. */ + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.state = IB_CM_IDLE; + ib_cancel_mad(cm_id_priv->msg); + spin_unlock_irq(&cm_id_priv->lock); + + cm_format_sidr_rep_event(work, cm_id_priv); + cm_process_work(cm_id_priv, work); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static void cm_process_send_error(struct cm_id_private *cm_id_priv, + struct ib_mad_send_buf *msg, + enum ib_cm_state state, + enum ib_wc_status wc_status) +{ + struct ib_cm_event cm_event = {}; + int ret; + + /* Discard old sends or ones without a response. */ + spin_lock_irq(&cm_id_priv->lock); + if (msg != cm_id_priv->msg) { + spin_unlock_irq(&cm_id_priv->lock); + cm_free_msg(msg); + return; + } + cm_free_priv_msg(msg); + + if (state != cm_id_priv->id.state || wc_status == IB_WC_SUCCESS || + wc_status == IB_WC_WR_FLUSH_ERR) + goto out_unlock; + + trace_icm_mad_send_err(state, wc_status); + switch (state) { + case IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + cm_reset_to_idle(cm_id_priv); + cm_event.event = IB_CM_REQ_ERROR; + break; + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + cm_reset_to_idle(cm_id_priv); + cm_event.event = IB_CM_REP_ERROR; + break; + case IB_CM_DREQ_SENT: + cm_enter_timewait(cm_id_priv); + cm_event.event = IB_CM_DREQ_ERROR; + break; + case IB_CM_SIDR_REQ_SENT: + cm_id_priv->id.state = IB_CM_IDLE; + cm_event.event = IB_CM_SIDR_REQ_ERROR; + break; + default: + goto out_unlock; + } + spin_unlock_irq(&cm_id_priv->lock); + cm_event.param.send_status = wc_status; + + /* No other events can occur on the cm_id at this point. */ + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event); + if (ret) + ib_destroy_cm_id(&cm_id_priv->id); + return; +out_unlock: + spin_unlock_irq(&cm_id_priv->lock); +} + +static void cm_send_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_mad_send_buf *msg = mad_send_wc->send_buf; + struct cm_id_private *cm_id_priv = msg->context[0]; + enum ib_cm_state state = + (enum ib_cm_state)(unsigned long)msg->context[1]; + struct cm_port *port; + u16 attr_index; + + port = mad_agent->context; + attr_index = be16_to_cpu(((struct ib_mad_hdr *) + msg->mad)->attr_id) - CM_ATTR_ID_OFFSET; + + /* + * If the send was in response to a received message (context[0] is not + * set to a cm_id), and is not a REJ, then it is a send that was + * manually retried. 
+ */ + if (!cm_id_priv && (attr_index != CM_REJ_COUNTER)) + msg->retries = 1; + + atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]); + if (msg->retries) + atomic_long_add(msg->retries, + &port->counters[CM_XMIT_RETRIES][attr_index]); + + if (cm_id_priv) + cm_process_send_error(cm_id_priv, msg, state, + mad_send_wc->status); + else + cm_free_response_msg(msg); +} + +static void cm_work_handler(struct work_struct *_work) +{ + struct cm_work *work = container_of(_work, struct cm_work, work.work); + int ret; + + switch (work->cm_event.event) { + case IB_CM_REQ_RECEIVED: + ret = cm_req_handler(work); + break; + case IB_CM_MRA_RECEIVED: + ret = cm_mra_handler(work); + break; + case IB_CM_REJ_RECEIVED: + ret = cm_rej_handler(work); + break; + case IB_CM_REP_RECEIVED: + ret = cm_rep_handler(work); + break; + case IB_CM_RTU_RECEIVED: + ret = cm_rtu_handler(work); + break; + case IB_CM_USER_ESTABLISHED: + ret = cm_establish_handler(work); + break; + case IB_CM_DREQ_RECEIVED: + ret = cm_dreq_handler(work); + break; + case IB_CM_DREP_RECEIVED: + ret = cm_drep_handler(work); + break; + case IB_CM_SIDR_REQ_RECEIVED: + ret = cm_sidr_req_handler(work); + break; + case IB_CM_SIDR_REP_RECEIVED: + ret = cm_sidr_rep_handler(work); + break; + case IB_CM_LAP_RECEIVED: + ret = cm_lap_handler(work); + break; + case IB_CM_APR_RECEIVED: + ret = cm_apr_handler(work); + break; + case IB_CM_TIMEWAIT_EXIT: + ret = cm_timewait_handler(work); + break; + default: + trace_icm_handler_err(work->cm_event.event); + ret = -EINVAL; + break; + } + if (ret) + cm_free_work(work); +} + +static int cm_establish(struct ib_cm_id *cm_id) +{ + struct cm_id_private *cm_id_priv; + struct cm_work *work; + unsigned long flags; + int ret = 0; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id->device, &cm_client); + if (!cm_dev) + return -ENODEV; + + work = kmalloc(sizeof *work, GFP_ATOMIC); + if (!work) + return -ENOMEM; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id->state) { + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + cm_id->state = IB_CM_ESTABLISHED; + break; + case IB_CM_ESTABLISHED: + ret = -EISCONN; + break; + default: + trace_icm_establish_err(cm_id); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (ret) { + kfree(work); + goto out; + } + + /* + * The CM worker thread may try to destroy the cm_id before it + * can execute this work item. To prevent potential deadlock, + * we need to find the cm_id once we're in the context of the + * worker thread, rather than holding a reference on it. 
+ */ + INIT_DELAYED_WORK(&work->work, cm_work_handler); + work->local_id = cm_id->local_id; + work->remote_id = cm_id->remote_id; + work->mad_recv_wc = NULL; + work->cm_event.event = IB_CM_USER_ESTABLISHED; + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) { + queue_delayed_work(cm.wq, &work->work, 0); + } else { + kfree(work); + ret = -ENODEV; + } + spin_unlock_irqrestore(&cm.lock, flags); + +out: + return ret; +} + +static int cm_migrate(struct ib_cm_id *cm_id) +{ + struct cm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state == IB_CM_ESTABLISHED && + (cm_id->lap_state == IB_CM_LAP_UNINIT || + cm_id->lap_state == IB_CM_LAP_IDLE)) { + cm_id->lap_state = IB_CM_LAP_IDLE; + cm_id_priv->av = cm_id_priv->alt_av; + } else + ret = -EINVAL; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + return ret; +} + +int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) +{ + int ret; + + switch (event) { + case IB_EVENT_COMM_EST: + ret = cm_establish(cm_id); + break; + case IB_EVENT_PATH_MIG: + ret = cm_migrate(cm_id); + break; + default: + ret = -EINVAL; + } + return ret; +} +EXPORT_SYMBOL(ib_cm_notify); + +static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct cm_port *port = mad_agent->context; + struct cm_work *work; + enum ib_cm_event_type event; + bool alt_path = false; + u16 attr_id; + int paths = 0; + int going_down = 0; + + switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { + case CM_REQ_ATTR_ID: + alt_path = cm_req_has_alt_path((struct cm_req_msg *) + mad_recv_wc->recv_buf.mad); + paths = 1 + (alt_path != 0); + event = IB_CM_REQ_RECEIVED; + break; + case CM_MRA_ATTR_ID: + event = IB_CM_MRA_RECEIVED; + break; + case CM_REJ_ATTR_ID: + event = IB_CM_REJ_RECEIVED; + break; + case CM_REP_ATTR_ID: + event = IB_CM_REP_RECEIVED; + break; + case CM_RTU_ATTR_ID: + event = IB_CM_RTU_RECEIVED; + break; + case CM_DREQ_ATTR_ID: + event = IB_CM_DREQ_RECEIVED; + break; + case CM_DREP_ATTR_ID: + event = IB_CM_DREP_RECEIVED; + break; + case CM_SIDR_REQ_ATTR_ID: + event = IB_CM_SIDR_REQ_RECEIVED; + break; + case CM_SIDR_REP_ATTR_ID: + event = IB_CM_SIDR_REP_RECEIVED; + break; + case CM_LAP_ATTR_ID: + paths = 1; + event = IB_CM_LAP_RECEIVED; + break; + case CM_APR_ATTR_ID: + event = IB_CM_APR_RECEIVED; + break; + default: + ib_free_recv_mad(mad_recv_wc); + return; + } + + attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id); + atomic_long_inc(&port->counters[CM_RECV][attr_id - CM_ATTR_ID_OFFSET]); + + work = kmalloc(struct_size(work, path, paths), GFP_KERNEL); + if (!work) { + ib_free_recv_mad(mad_recv_wc); + return; + } + + INIT_DELAYED_WORK(&work->work, cm_work_handler); + work->cm_event.event = event; + work->mad_recv_wc = mad_recv_wc; + work->port = port; + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!port->cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irq(&cm.lock); + + if (going_down) { + kfree(work); + ib_free_recv_mad(mad_recv_wc); + } +} + +static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + case 
IB_CM_REQ_SENT: + case IB_CM_MRA_REQ_RCVD: + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | IB_QP_PORT; + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; + if (cm_id_priv->responder_resources) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_ATOMIC; + qp_attr->pkey_index = cm_id_priv->av.pkey_index; + if (cm_id_priv->av.port) + qp_attr->port_num = cm_id_priv->av.port->port_num; + ret = 0; + break; + default: + trace_icm_qp_init_err(&cm_id_priv->id); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | + IB_QP_DEST_QPN | IB_QP_RQ_PSN; + qp_attr->ah_attr = cm_id_priv->av.ah_attr; + if ((qp_attr->ah_attr.type == RDMA_AH_ATTR_TYPE_IB) && + cm_id_priv->av.dlid_datapath && + (cm_id_priv->av.dlid_datapath != 0xffff)) + qp_attr->ah_attr.ib.dlid = cm_id_priv->av.dlid_datapath; + qp_attr->path_mtu = cm_id_priv->path_mtu; + qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); + qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); + if (cm_id_priv->qp_type == IB_QPT_RC || + cm_id_priv->qp_type == IB_QPT_XRC_TGT) { + *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER; + qp_attr->max_dest_rd_atomic = + cm_id_priv->responder_resources; + qp_attr->min_rnr_timer = 0; + } + if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr) && + cm_id_priv->alt_av.port) { + *qp_attr_mask |= IB_QP_ALT_PATH; + qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; + qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; + qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; + qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; + } + ret = 0; + break; + default: + trace_icm_qp_rtr_err(&cm_id_priv->id); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->id.state) { + /* Allow transition to RTS before sending REP */ + case IB_CM_REQ_RCVD: + case IB_CM_MRA_REQ_SENT: + + case IB_CM_REP_RCVD: + case IB_CM_MRA_REP_SENT: + case IB_CM_REP_SENT: + case IB_CM_MRA_REP_RCVD: + case IB_CM_ESTABLISHED: + if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) { + *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN; + qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn); + switch (cm_id_priv->qp_type) { + case IB_QPT_RC: + case IB_QPT_XRC_INI: + *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC; + qp_attr->retry_cnt = cm_id_priv->retry_count; + qp_attr->rnr_retry = cm_id_priv->rnr_retry_count; + qp_attr->max_rd_atomic = cm_id_priv->initiator_depth; + fallthrough; + case IB_QPT_XRC_TGT: + *qp_attr_mask |= IB_QP_TIMEOUT; + qp_attr->timeout = cm_id_priv->av.timeout; + break; + default: + break; + } + if 
(rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { + *qp_attr_mask |= IB_QP_PATH_MIG_STATE; + qp_attr->path_mig_state = IB_MIG_REARM; + } + } else { + *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE; + if (cm_id_priv->alt_av.port) + qp_attr->alt_port_num = + cm_id_priv->alt_av.port->port_num; + qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; + qp_attr->alt_timeout = cm_id_priv->alt_av.timeout; + qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; + qp_attr->path_mig_state = IB_MIG_REARM; + } + ret = 0; + break; + default: + trace_icm_qp_rts_err(&cm_id_priv->id); + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct cm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + switch (qp_attr->qp_state) { + case IB_QPS_INIT: + ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + case IB_QPS_RTR: + ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + case IB_QPS_RTS: + ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} +EXPORT_SYMBOL(ib_cm_init_qp_attr); + +static ssize_t cm_show_counter(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct cm_counter_attribute *cm_attr = + container_of(attr, struct cm_counter_attribute, attr); + struct cm_device *cm_dev = ib_get_client_data(ibdev, &cm_client); + + if (WARN_ON(!cm_dev)) + return -EINVAL; + + return sysfs_emit( + buf, "%ld\n", + atomic_long_read( + &cm_dev->port[port_num - 1] + ->counters[cm_attr->group][cm_attr->index])); +} + +#define CM_COUNTER_ATTR(_name, _group, _index) \ + { \ + .attr = __ATTR(_name, 0444, cm_show_counter, NULL), \ + .group = _group, .index = _index \ + } + +#define CM_COUNTER_GROUP(_group, _name) \ + static struct cm_counter_attribute cm_counter_attr_##_group[] = { \ + CM_COUNTER_ATTR(req, _group, CM_REQ_COUNTER), \ + CM_COUNTER_ATTR(mra, _group, CM_MRA_COUNTER), \ + CM_COUNTER_ATTR(rej, _group, CM_REJ_COUNTER), \ + CM_COUNTER_ATTR(rep, _group, CM_REP_COUNTER), \ + CM_COUNTER_ATTR(rtu, _group, CM_RTU_COUNTER), \ + CM_COUNTER_ATTR(dreq, _group, CM_DREQ_COUNTER), \ + CM_COUNTER_ATTR(drep, _group, CM_DREP_COUNTER), \ + CM_COUNTER_ATTR(sidr_req, _group, CM_SIDR_REQ_COUNTER), \ + CM_COUNTER_ATTR(sidr_rep, _group, CM_SIDR_REP_COUNTER), \ + CM_COUNTER_ATTR(lap, _group, CM_LAP_COUNTER), \ + CM_COUNTER_ATTR(apr, _group, CM_APR_COUNTER), \ + }; \ + static struct attribute *cm_counter_attrs_##_group[] = { \ + &cm_counter_attr_##_group[0].attr.attr, \ + &cm_counter_attr_##_group[1].attr.attr, \ + &cm_counter_attr_##_group[2].attr.attr, \ + &cm_counter_attr_##_group[3].attr.attr, \ + &cm_counter_attr_##_group[4].attr.attr, \ + &cm_counter_attr_##_group[5].attr.attr, \ + &cm_counter_attr_##_group[6].attr.attr, \ + &cm_counter_attr_##_group[7].attr.attr, \ + &cm_counter_attr_##_group[8].attr.attr, \ + &cm_counter_attr_##_group[9].attr.attr, \ + &cm_counter_attr_##_group[10].attr.attr, \ + NULL, \ + }; \ + static const struct attribute_group cm_counter_group_##_group = { \ + .name = _name, \ + .attrs = cm_counter_attrs_##_group, \ + }; + +CM_COUNTER_GROUP(CM_XMIT, "cm_tx_msgs") +CM_COUNTER_GROUP(CM_XMIT_RETRIES, "cm_tx_retries") +CM_COUNTER_GROUP(CM_RECV, "cm_rx_msgs") +CM_COUNTER_GROUP(CM_RECV_DUPLICATES, "cm_rx_duplicates") + +static const struct attribute_group 
*cm_counter_groups[] = { + &cm_counter_group_CM_XMIT, + &cm_counter_group_CM_XMIT_RETRIES, + &cm_counter_group_CM_RECV, + &cm_counter_group_CM_RECV_DUPLICATES, + NULL, +}; + +static int cm_add_one(struct ib_device *ib_device) +{ + struct cm_device *cm_dev; + struct cm_port *port; + struct ib_mad_reg_req reg_req = { + .mgmt_class = IB_MGMT_CLASS_CM, + .mgmt_class_version = IB_CM_CLASS_VERSION, + }; + struct ib_port_modify port_modify = { + .set_port_cap_mask = IB_PORT_CM_SUP + }; + unsigned long flags; + int ret; + int count = 0; + u32 i; + + cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt), + GFP_KERNEL); + if (!cm_dev) + return -ENOMEM; + + kref_init(&cm_dev->kref); + spin_lock_init(&cm_dev->mad_agent_lock); + cm_dev->ib_device = ib_device; + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; + cm_dev->going_down = 0; + + ib_set_client_data(ib_device, &cm_client, cm_dev); + + set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); + rdma_for_each_port (ib_device, i) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + + port = kzalloc(sizeof *port, GFP_KERNEL); + if (!port) { + ret = -ENOMEM; + goto error1; + } + + cm_dev->port[i-1] = port; + port->cm_dev = cm_dev; + port->port_num = i; + + ret = ib_port_register_client_groups(ib_device, i, + cm_counter_groups); + if (ret) + goto error1; + + port->mad_agent = ib_register_mad_agent(ib_device, i, + IB_QPT_GSI, + ®_req, + 0, + cm_send_handler, + cm_recv_handler, + port, + 0); + if (IS_ERR(port->mad_agent)) { + ret = PTR_ERR(port->mad_agent); + goto error2; + } + + ret = ib_modify_port(ib_device, i, 0, &port_modify); + if (ret) + goto error3; + + count++; + } + + if (!count) { + ret = -EOPNOTSUPP; + goto free; + } + + write_lock_irqsave(&cm.device_lock, flags); + list_add_tail(&cm_dev->list, &cm.device_list); + write_unlock_irqrestore(&cm.device_lock, flags); + return 0; + +error3: + ib_unregister_mad_agent(port->mad_agent); +error2: + ib_port_unregister_client_groups(ib_device, i, cm_counter_groups); +error1: + port_modify.set_port_cap_mask = 0; + port_modify.clr_port_cap_mask = IB_PORT_CM_SUP; + while (--i) { + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + + port = cm_dev->port[i-1]; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); + ib_unregister_mad_agent(port->mad_agent); + ib_port_unregister_client_groups(ib_device, i, + cm_counter_groups); + } +free: + cm_device_put(cm_dev); + return ret; +} + +static void cm_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct cm_device *cm_dev = client_data; + struct cm_port *port; + struct ib_port_modify port_modify = { + .clr_port_cap_mask = IB_PORT_CM_SUP + }; + unsigned long flags; + u32 i; + + write_lock_irqsave(&cm.device_lock, flags); + list_del(&cm_dev->list); + write_unlock_irqrestore(&cm.device_lock, flags); + + spin_lock_irq(&cm.lock); + cm_dev->going_down = 1; + spin_unlock_irq(&cm.lock); + + rdma_for_each_port (ib_device, i) { + struct ib_mad_agent *mad_agent; + + if (!rdma_cap_ib_cm(ib_device, i)) + continue; + + port = cm_dev->port[i-1]; + mad_agent = port->mad_agent; + ib_modify_port(ib_device, port->port_num, 0, &port_modify); + /* + * We flush the queue here after the going_down set, this + * verify that no new works will be queued in the recv handler, + * after that we can call the unregister_mad_agent + */ + flush_workqueue(cm.wq); + /* + * The above ensures no call paths from the work are running, + * the remaining paths all take the mad_agent_lock. 
+ */ + spin_lock(&cm_dev->mad_agent_lock); + port->mad_agent = NULL; + spin_unlock(&cm_dev->mad_agent_lock); + ib_unregister_mad_agent(mad_agent); + ib_port_unregister_client_groups(ib_device, i, + cm_counter_groups); + } + + cm_device_put(cm_dev); +} + +static int __init ib_cm_init(void) +{ + int ret; + + INIT_LIST_HEAD(&cm.device_list); + rwlock_init(&cm.device_lock); + spin_lock_init(&cm.lock); + cm.listen_service_table = RB_ROOT; + cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); + cm.remote_id_table = RB_ROOT; + cm.remote_qp_table = RB_ROOT; + cm.remote_sidr_table = RB_ROOT; + xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC); + get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); + INIT_LIST_HEAD(&cm.timewait_list); + + cm.wq = alloc_workqueue("ib_cm", 0, 1); + if (!cm.wq) { + ret = -ENOMEM; + goto error2; + } + + ret = ib_register_client(&cm_client); + if (ret) + goto error3; + + return 0; +error3: + destroy_workqueue(cm.wq); +error2: + return ret; +} + +static void __exit ib_cm_cleanup(void) +{ + struct cm_timewait_info *timewait_info, *tmp; + + spin_lock_irq(&cm.lock); + list_for_each_entry(timewait_info, &cm.timewait_list, list) + cancel_delayed_work(&timewait_info->work.work); + spin_unlock_irq(&cm.lock); + + ib_unregister_client(&cm_client); + destroy_workqueue(cm.wq); + + list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) { + list_del(&timewait_info->list); + kfree(timewait_info); + } + + WARN_ON(!xa_empty(&cm.local_id_table)); +} + +module_init(ib_cm_init); +module_exit(ib_cm_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_msgs.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_msgs.h new file mode 100644 index 0000000..8462de7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_msgs.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. + */ +#ifndef CM_MSGS_H +#define CM_MSGS_H + +#include +#include +#include + +/* + * Parameters to routines below should be in network-byte order, and values + * are returned in network-byte order. + */ + +#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ + +static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg) +{ + u8 transport_type = IBA_GET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg); + switch (transport_type) { + case 0: return IB_QPT_RC; + case 1: return IB_QPT_UC; + case 3: + switch (IBA_GET(CM_REQ_EXTENDED_TRANSPORT_TYPE, req_msg)) { + case 1: return IB_QPT_XRC_TGT; + default: return 0; + } + default: return 0; + } +} + +static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg, + enum ib_qp_type qp_type) +{ + switch (qp_type) { + case IB_QPT_UC: + IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 1); + break; + case IB_QPT_XRC_INI: + IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 3); + IBA_SET(CM_REQ_EXTENDED_TRANSPORT_TYPE, req_msg, 1); + break; + default: + IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 0); + } +} + +/* Message REJected or MRAed */ +enum cm_msg_response { + CM_MSG_RESPONSE_REQ = 0x0, + CM_MSG_RESPONSE_REP = 0x1, + CM_MSG_RESPONSE_OTHER = 0x2 +}; + +static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type) +{ + return (qp_type == IB_QPT_XRC_INI) ? 
+ cpu_to_be32(IBA_GET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, + rep_msg)) : + cpu_to_be32(IBA_GET(CM_REP_LOCAL_QPN, rep_msg)); +} + +#endif /* CM_MSGS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.c new file mode 100644 index 0000000..8f3482f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for the IB Connection Manager. + * + * Author: Chuck Lever + * + * Copyright (c) 2020, Oracle and/or its affiliates. + */ + +#include +#include "cma_priv.h" + +#define CREATE_TRACE_POINTS + +#include "cm_trace.h" diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.h new file mode 100644 index 0000000..e9d2826 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cm_trace.h @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trace point definitions for the RDMA Connect Manager. + * + * Author: Chuck Lever + * + * Copyright (c) 2020 Oracle and/or its affiliates. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ib_cma + +#if !defined(_TRACE_IB_CMA_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_IB_CMA_H + +#include +#include +#include + +/* + * enum ib_cm_state, from include/rdma/ib_cm.h + */ +#define IB_CM_STATE_LIST \ + ib_cm_state(IDLE) \ + ib_cm_state(LISTEN) \ + ib_cm_state(REQ_SENT) \ + ib_cm_state(REQ_RCVD) \ + ib_cm_state(MRA_REQ_SENT) \ + ib_cm_state(MRA_REQ_RCVD) \ + ib_cm_state(REP_SENT) \ + ib_cm_state(REP_RCVD) \ + ib_cm_state(MRA_REP_SENT) \ + ib_cm_state(MRA_REP_RCVD) \ + ib_cm_state(ESTABLISHED) \ + ib_cm_state(DREQ_SENT) \ + ib_cm_state(DREQ_RCVD) \ + ib_cm_state(TIMEWAIT) \ + ib_cm_state(SIDR_REQ_SENT) \ + ib_cm_state_end(SIDR_REQ_RCVD) + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_STATE_LIST + +#undef ib_cm_state +#undef ib_cm_state_end +#define ib_cm_state(x) { IB_CM_##x, #x }, +#define ib_cm_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_state(x) \ + __print_symbolic(x, IB_CM_STATE_LIST) + +/* + * enum ib_cm_lap_state, from include/rdma/ib_cm.h + */ +#define IB_CM_LAP_STATE_LIST \ + ib_cm_lap_state(LAP_UNINIT) \ + ib_cm_lap_state(LAP_IDLE) \ + ib_cm_lap_state(LAP_SENT) \ + ib_cm_lap_state(LAP_RCVD) \ + ib_cm_lap_state(MRA_LAP_SENT) \ + ib_cm_lap_state_end(MRA_LAP_RCVD) + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_lap_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_LAP_STATE_LIST + +#undef ib_cm_lap_state +#undef ib_cm_lap_state_end +#define ib_cm_lap_state(x) { IB_CM_##x, #x }, +#define ib_cm_lap_state_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_lap_state(x) \ + __print_symbolic(x, IB_CM_LAP_STATE_LIST) + +/* + * enum ib_cm_rej_reason, from include/rdma/ib_cm.h + */ +#define IB_CM_REJ_REASON_LIST \ + ib_cm_rej_reason(REJ_NO_QP) \ + ib_cm_rej_reason(REJ_NO_EEC) \ + ib_cm_rej_reason(REJ_NO_RESOURCES) \ + ib_cm_rej_reason(REJ_TIMEOUT) \ + ib_cm_rej_reason(REJ_UNSUPPORTED) \ + ib_cm_rej_reason(REJ_INVALID_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_COMM_INSTANCE) \ + ib_cm_rej_reason(REJ_INVALID_SERVICE_ID) \ + ib_cm_rej_reason(REJ_INVALID_TRANSPORT_TYPE) \ + ib_cm_rej_reason(REJ_STALE_CONN) \ + ib_cm_rej_reason(REJ_RDC_NOT_EXIST) \ + ib_cm_rej_reason(REJ_INVALID_GID) \ + 
ib_cm_rej_reason(REJ_INVALID_LID) \ + ib_cm_rej_reason(REJ_INVALID_SL) \ + ib_cm_rej_reason(REJ_INVALID_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_PACKET_RATE) \ + ib_cm_rej_reason(REJ_INVALID_ALT_GID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_LID) \ + ib_cm_rej_reason(REJ_INVALID_ALT_SL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_TRAFFIC_CLASS) \ + ib_cm_rej_reason(REJ_INVALID_ALT_HOP_LIMIT) \ + ib_cm_rej_reason(REJ_INVALID_ALT_PACKET_RATE) \ + ib_cm_rej_reason(REJ_PORT_CM_REDIRECT) \ + ib_cm_rej_reason(REJ_PORT_REDIRECT) \ + ib_cm_rej_reason(REJ_INVALID_MTU) \ + ib_cm_rej_reason(REJ_INSUFFICIENT_RESP_RESOURCES) \ + ib_cm_rej_reason(REJ_CONSUMER_DEFINED) \ + ib_cm_rej_reason(REJ_INVALID_RNR_RETRY) \ + ib_cm_rej_reason(REJ_DUPLICATE_LOCAL_COMM_ID) \ + ib_cm_rej_reason(REJ_INVALID_CLASS_VERSION) \ + ib_cm_rej_reason(REJ_INVALID_FLOW_LABEL) \ + ib_cm_rej_reason(REJ_INVALID_ALT_FLOW_LABEL) \ + ib_cm_rej_reason_end(REJ_VENDOR_OPTION_NOT_SUPPORTED) + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) TRACE_DEFINE_ENUM(IB_CM_##x); +#define ib_cm_rej_reason_end(x) TRACE_DEFINE_ENUM(IB_CM_##x); + +IB_CM_REJ_REASON_LIST + +#undef ib_cm_rej_reason +#undef ib_cm_rej_reason_end +#define ib_cm_rej_reason(x) { IB_CM_##x, #x }, +#define ib_cm_rej_reason_end(x) { IB_CM_##x, #x } + +#define show_ib_cm_rej_reason(x) \ + __print_symbolic(x, IB_CM_REJ_REASON_LIST) + +DECLARE_EVENT_CLASS(icm_id_class, + TP_PROTO( + const struct ib_cm_id *cm_id + ), + + TP_ARGS(cm_id), + + TP_STRUCT__entry( + __field(const void *, cm_id) /* for eBPF scripts */ + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + __field(unsigned long, lap_state) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->lap_state = cm_id->lap_state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s lap_state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_lap_state(__entry->lap_state) + ) +); + +#define DEFINE_CM_SEND_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + icm_send_##name, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + +DEFINE_CM_SEND_EVENT(req); +DEFINE_CM_SEND_EVENT(rep); +DEFINE_CM_SEND_EVENT(dup_req); +DEFINE_CM_SEND_EVENT(dup_rep); +DEFINE_CM_SEND_EVENT(rtu); +DEFINE_CM_SEND_EVENT(mra); +DEFINE_CM_SEND_EVENT(sidr_req); +DEFINE_CM_SEND_EVENT(sidr_rep); +DEFINE_CM_SEND_EVENT(dreq); +DEFINE_CM_SEND_EVENT(drep); + +TRACE_EVENT(icm_send_rej, + TP_PROTO( + const struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason + ), + + TP_ARGS(cm_id, reason), + + TP_STRUCT__entry( + __field(const void *, cm_id) + __field(u32, local_id) + __field(u32, remote_id) + __field(unsigned long, state) + __field(unsigned long, reason) + ), + + TP_fast_assign( + __entry->cm_id = cm_id; + __entry->local_id = be32_to_cpu(cm_id->local_id); + __entry->remote_id = be32_to_cpu(cm_id->remote_id); + __entry->state = cm_id->state; + __entry->reason = reason; + ), + + TP_printk("local_id=%u remote_id=%u state=%s reason=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state), + show_ib_cm_rej_reason(__entry->reason) + ) +); + +#define DEFINE_CM_ERR_EVENT(name) \ + DEFINE_EVENT(icm_id_class, \ + icm_##name##_err, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id)) + 
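+/* Error-path events: each records the cm_id IDs and states via the shared icm_id_class above. */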
+DEFINE_CM_ERR_EVENT(send_cm_rtu); +DEFINE_CM_ERR_EVENT(establish); +DEFINE_CM_ERR_EVENT(no_listener); +DEFINE_CM_ERR_EVENT(send_drep); +DEFINE_CM_ERR_EVENT(dreq_unknown); +DEFINE_CM_ERR_EVENT(send_unknown_rej); +DEFINE_CM_ERR_EVENT(rej_unknown); +DEFINE_CM_ERR_EVENT(send_mra_unknown); +DEFINE_CM_ERR_EVENT(mra_unknown); +DEFINE_CM_ERR_EVENT(qp_init); +DEFINE_CM_ERR_EVENT(qp_rtr); +DEFINE_CM_ERR_EVENT(qp_rts); + +DEFINE_EVENT(icm_id_class, \ + icm_dreq_skipped, \ + TP_PROTO( \ + const struct ib_cm_id *cm_id \ + ), \ + TP_ARGS(cm_id) \ +); + +DECLARE_EVENT_CLASS(icm_local_class, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id + ), + + TP_ARGS(local_id, remote_id), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + ), + + TP_printk("local_id=%u remote_id=%u", + __entry->local_id, __entry->remote_id + ) +); + +#define DEFINE_CM_LOCAL_EVENT(name) \ + DEFINE_EVENT(icm_local_class, \ + icm_##name, \ + TP_PROTO( \ + unsigned int local_id, \ + unsigned int remote_id \ + ), \ + TP_ARGS(local_id, remote_id)) + +DEFINE_CM_LOCAL_EVENT(issue_rej); +DEFINE_CM_LOCAL_EVENT(issue_drep); +DEFINE_CM_LOCAL_EVENT(staleconn_err); +DEFINE_CM_LOCAL_EVENT(no_priv_err); + +DECLARE_EVENT_CLASS(icm_remote_class, + TP_PROTO( + u32 remote_id + ), + + TP_ARGS(remote_id), + + TP_STRUCT__entry( + __field(u32, remote_id) + ), + + TP_fast_assign( + __entry->remote_id = remote_id; + ), + + TP_printk("remote_id=%u", + __entry->remote_id + ) +); + +#define DEFINE_CM_REMOTE_EVENT(name) \ + DEFINE_EVENT(icm_remote_class, \ + icm_##name, \ + TP_PROTO( \ + u32 remote_id \ + ), \ + TP_ARGS(remote_id)) + +DEFINE_CM_REMOTE_EVENT(remote_no_priv_err); +DEFINE_CM_REMOTE_EVENT(insert_failed_err); + +TRACE_EVENT(icm_send_rep_err, + TP_PROTO( + __be32 local_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = be32_to_cpu(local_id); + __entry->state = state; + ), + + TP_printk("local_id=%u state=%s", + __entry->local_id, show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_rep_unknown_err, + TP_PROTO( + unsigned int local_id, + unsigned int remote_id, + enum ib_cm_state state + ), + + TP_ARGS(local_id, remote_id, state), + + TP_STRUCT__entry( + __field(unsigned int, local_id) + __field(unsigned int, remote_id) + __field(unsigned long, state) + ), + + TP_fast_assign( + __entry->local_id = local_id; + __entry->remote_id = remote_id; + __entry->state = state; + ), + + TP_printk("local_id=%u remote_id=%u state=%s", + __entry->local_id, __entry->remote_id, + show_ib_cm_state(__entry->state) + ) +); + +TRACE_EVENT(icm_handler_err, + TP_PROTO( + enum ib_cm_event_type event + ), + + TP_ARGS(event), + + TP_STRUCT__entry( + __field(unsigned long, event) + ), + + TP_fast_assign( + __entry->event = event; + ), + + TP_printk("unhandled event=%s", + rdma_show_ib_cm_event(__entry->event) + ) +); + +TRACE_EVENT(icm_mad_send_err, + TP_PROTO( + enum ib_cm_state state, + enum ib_wc_status wc_status + ), + + TP_ARGS(state, wc_status), + + TP_STRUCT__entry( + __field(unsigned long, state) + __field(unsigned long, wc_status) + ), + + TP_fast_assign( + __entry->state = state; + __entry->wc_status = wc_status; + ), + + TP_printk("state=%s completion status=%s", + show_ib_cm_state(__entry->state), + rdma_show_wc_status(__entry->wc_status) + ) +); + +#endif /* 
_TRACE_IB_CMA_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../drivers/infiniband/core +#define TRACE_INCLUDE_FILE cm_trace + +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma.c new file mode 100644 index 0000000..727de73 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma.c @@ -0,0 +1,5475 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" +#include "cma_priv.h" +#include "cma_trace.h" + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("Generic RDMA CM Agent"); +MODULE_LICENSE("Dual BSD/GPL"); + +#define CMA_CM_RESPONSE_TIMEOUT 22 +#define CMA_MAX_CM_RETRIES 15 +#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP + +static const char * const cma_events[] = { + [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", + [RDMA_CM_EVENT_ADDR_ERROR] = "address error", + [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", + [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", + [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", + [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", + [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", + [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", + [RDMA_CM_EVENT_REJECTED] = "rejected", + [RDMA_CM_EVENT_ESTABLISHED] = "established", + [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", + [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", + [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", + [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", + [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", + [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", +}; + +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type); + +const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) +{ + size_t index = event; + + return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? + cma_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(rdma_event_msg); + +const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, + int reason) +{ + if (rdma_ib_or_roce(id->device, id->port_num)) + return ibcm_reject_msg(reason); + + if (rdma_protocol_iwarp(id->device, id->port_num)) + return iwcm_reject_msg(reason); + + WARN_ON_ONCE(1); + return "unrecognized transport"; +} +EXPORT_SYMBOL(rdma_reject_msg); + +/** + * rdma_is_consumer_reject - return true if the consumer rejected the connect + * request. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. 
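+ * Return: true when the reject originated from the remote consumer
+ * (IB_CM_REJ_CONSUMER_DEFINED for IB/RoCE, -ECONNREFUSED for iWARP),
+ * false otherwise.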
+ */ +static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) +{ + if (rdma_ib_or_roce(id->device, id->port_num)) + return reason == IB_CM_REJ_CONSUMER_DEFINED; + + if (rdma_protocol_iwarp(id->device, id->port_num)) + return reason == -ECONNREFUSED; + + WARN_ON_ONCE(1); + return false; +} + +const void *rdma_consumer_reject_data(struct rdma_cm_id *id, + struct rdma_cm_event *ev, u8 *data_len) +{ + const void *p; + + if (rdma_is_consumer_reject(id, ev->status)) { + *data_len = ev->param.conn.private_data_len; + p = ev->param.conn.private_data; + } else { + *data_len = 0; + p = NULL; + } + return p; +} +EXPORT_SYMBOL(rdma_consumer_reject_data); + +/** + * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. + * @id: Communication Identifier + */ +struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device->node_type == RDMA_NODE_RNIC) + return id_priv->cm_id.iw; + return NULL; +} +EXPORT_SYMBOL(rdma_iw_cm_id); + +/** + * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. + * @res: rdma resource tracking entry pointer + */ +struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) +{ + struct rdma_id_private *id_priv = + container_of(res, struct rdma_id_private, res); + + return &id_priv->id; +} +EXPORT_SYMBOL(rdma_res_to_id); + +static int cma_add_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device, void *client_data); + +static struct ib_client cma_client = { + .name = "cma", + .add = cma_add_one, + .remove = cma_remove_one +}; + +static struct ib_sa_client sa_client; +static LIST_HEAD(dev_list); +static LIST_HEAD(listen_any_list); +static DEFINE_MUTEX(lock); +static struct rb_root id_table = RB_ROOT; +/* Serialize operations of id_table tree */ +static DEFINE_SPINLOCK(id_table_lock); +static struct workqueue_struct *cma_wq; +static struct workqueue_struct *cma_netevent_wq; +static unsigned int cma_pernet_id; + +struct cma_pernet { + struct xarray tcp_ps; + struct xarray udp_ps; + struct xarray ipoib_ps; + struct xarray ib_ps; +}; + +static struct cma_pernet *cma_pernet(struct net *net) +{ + return net_generic(net, cma_pernet_id); +} + +static +struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) +{ + struct cma_pernet *pernet = cma_pernet(net); + + switch (ps) { + case RDMA_PS_TCP: + return &pernet->tcp_ps; + case RDMA_PS_UDP: + return &pernet->udp_ps; + case RDMA_PS_IPOIB: + return &pernet->ipoib_ps; + case RDMA_PS_IB: + return &pernet->ib_ps; + default: + return NULL; + } +} + +struct id_table_entry { + struct list_head id_list; + struct rb_node rb_node; +}; + +struct cma_device { + struct list_head list; + struct ib_device *device; + struct completion comp; + refcount_t refcount; + struct list_head id_list; + enum ib_gid_type *default_gid_type; + u8 *default_roce_tos; +}; + +struct rdma_bind_list { + enum rdma_ucm_port_space ps; + struct hlist_head owners; + unsigned short port; +}; + +static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, + struct rdma_bind_list *bind_list, int snum) +{ + struct xarray *xa = cma_pernet_xa(net, ps); + + return xa_insert(xa, snum, bind_list, GFP_KERNEL); +} + +static struct rdma_bind_list *cma_ps_find(struct net *net, + enum rdma_ucm_port_space ps, int snum) +{ + struct xarray *xa = cma_pernet_xa(net, ps); + + return xa_load(xa, snum); +} + +static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, + int snum) +{ + 
struct xarray *xa = cma_pernet_xa(net, ps); + + xa_erase(xa, snum); +} + +enum { + CMA_OPTION_AFONLY, +}; + +void cma_dev_get(struct cma_device *cma_dev) +{ + refcount_inc(&cma_dev->refcount); +} + +void cma_dev_put(struct cma_device *cma_dev) +{ + if (refcount_dec_and_test(&cma_dev->refcount)) + complete(&cma_dev->comp); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_dev_get(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + u32 port) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + u32 port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + if (default_gid_type == IB_GID_TYPE_IB && + rdma_protocol_roce_eth_encap(cma_dev->device, port)) + default_gid_type = IB_GID_TYPE_ROCE; + + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, + u8 default_roce_tos) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = + default_roce_tos; + + return 0; +} +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + +/* + * Device removal can occur at anytime, so we need extra handling to + * serialize notifying the user of device removal with other callbacks. + * We do this by disabling removal notification while a callback is in process, + * and reporting it after the callback completes. 
+ */ + +struct cma_multicast { + struct rdma_id_private *id_priv; + union { + struct ib_sa_multicast *sa_mc; + struct { + struct work_struct work; + struct rdma_cm_event event; + } iboe_join; + }; + struct list_head list; + void *context; + struct sockaddr_storage addr; + u8 join_state; +}; + +struct cma_work { + struct work_struct work; + struct rdma_id_private *id; + enum rdma_cm_state old_state; + enum rdma_cm_state new_state; + struct rdma_cm_event event; +}; + +struct cma_netevent_work { + struct work_struct work; + struct rdma_id_private *id_priv; +}; + +union cma_ip_addr { + struct in6_addr ip6; + struct { + __be32 pad[3]; + __be32 addr; + } ip4; +}; + +struct cma_hdr { + u8 cma_version; + u8 ip_version; /* IP version: 7:4 */ + __be16 port; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; +}; + +#define CMA_VERSION 0x00 + +struct cma_req_info { + struct sockaddr_storage listen_addr_storage; + struct sockaddr_storage src_addr_storage; + struct ib_device *device; + union ib_gid local_gid; + __be64 service_id; + int port; + bool has_gid; + u16 pkey; +}; + +static int cma_comp_exch(struct rdma_id_private *id_priv, + enum rdma_cm_state comp, enum rdma_cm_state exch) +{ + unsigned long flags; + int ret; + + /* + * The FSM uses a funny double locking where state is protected by both + * the handler_mutex and the spinlock. State is not allowed to change + * to/from a handler_mutex protected value without also holding + * handler_mutex. + */ + if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) + lockdep_assert_held(&id_priv->handler_mutex); + + spin_lock_irqsave(&id_priv->lock, flags); + if ((ret = (id_priv->state == comp))) + id_priv->state = exch; + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) +{ + return hdr->ip_version >> 4; +} + +static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); +} + +static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *)&id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; +} + +static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) +{ + struct in_device *in_dev = NULL; + + if (ndev) { + rtnl_lock(); + in_dev = __in_dev_get_rtnl(ndev); + if (in_dev) { + if (join) + ip_mc_inc_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + else + ip_mc_dec_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + } + rtnl_unlock(); + } + return (in_dev) ? 
0 : -ENODEV; +} + +static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, + int ifindex_b, struct sockaddr *sb) +{ + if (ifindex_a != ifindex_b) + return (ifindex_a - ifindex_b); + + if (sa->sa_family != sb->sa_family) + return (sa->sa_family - sb->sa_family); + + if (sa->sa_family == AF_INET) + return (int)__be32_to_cpu( + ((struct sockaddr_in *)sa)->sin_addr.s_addr) - + (int)__be32_to_cpu( + ((struct sockaddr_in *)sb)->sin_addr.s_addr); + + return memcmp((char *)&((struct sockaddr_in6 *)sa)->sin6_addr, + (char *)&((struct sockaddr_in6 *)sb)->sin6_addr, + sizeof(((struct sockaddr_in6 *)sa)->sin6_addr)); +} + +static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) +{ + struct rb_node **new = &id_table.rb_node, *parent = NULL; + struct id_table_entry *this, *node; + struct rdma_id_private *id_priv; + unsigned long flags; + int result; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + spin_lock_irqsave(&id_table_lock, flags); + while (*new) { + this = container_of(*new, struct id_table_entry, rb_node); + id_priv = list_first_entry( + &this->id_list, struct rdma_id_private, id_list_entry); + result = compare_netdev_and_ip( + node_id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(node_id_priv), + id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(id_priv)); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else { + list_add_tail(&node_id_priv->id_list_entry, + &this->id_list); + kfree(node); + goto unlock; + } + } + + INIT_LIST_HEAD(&node->id_list); + list_add_tail(&node_id_priv->id_list_entry, &node->id_list); + + rb_link_node(&node->rb_node, parent, new); + rb_insert_color(&node->rb_node, &id_table); + +unlock: + spin_unlock_irqrestore(&id_table_lock, flags); + return 0; +} + +static struct id_table_entry * +node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) +{ + + struct rb_node *node = root->rb_node; + struct rdma_id_private *node_id_priv; + struct id_table_entry *data; + int result; + + + while (node) { + data = container_of(node, struct id_table_entry, rb_node); + node_id_priv = list_first_entry( + &data->id_list, struct rdma_id_private, id_list_entry); + result = compare_netdev_and_ip( + ifindex, sa, + node_id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(node_id_priv)); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + + return NULL; +} + +static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) +{ + struct id_table_entry *data; + unsigned long flags; + + spin_lock_irqsave(&id_table_lock, flags); + if (list_empty(&id_priv->id_list_entry)) + goto out; + + data = node_from_ndev_ip(&id_table, + id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(id_priv)); + if (!data) + goto out; + + list_del_init(&id_priv->id_list_entry); + if (list_empty(&data->id_list)) { + rb_erase(&data->rb_node, &id_table); + kfree(data); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); +} + +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + cma_dev_get(cma_dev); + id_priv->cma_dev = cma_dev; + id_priv->id.device = cma_dev->device; + id_priv->id.route.addr.dev_addr.transport = + rdma_node_get_transport(cma_dev->device->node_type); + list_add_tail(&id_priv->device_item, &cma_dev->id_list); + + trace_cm_id_attach(id_priv, cma_dev->device); +} + +static void cma_attach_to_dev(struct 
rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + +static void cma_release_dev(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + list_del_init(&id_priv->device_item); + cma_dev_put(id_priv->cma_dev); + id_priv->cma_dev = NULL; + id_priv->id.device = NULL; + if (id_priv->id.route.addr.dev_addr.sgid_attr) { + rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); + id_priv->id.route.addr.dev_addr.sgid_attr = NULL; + } + mutex_unlock(&lock); +} + +static inline unsigned short cma_family(struct rdma_id_private *id_priv) +{ + return id_priv->id.route.addr.src_addr.ss_family; +} + +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) +{ + struct ib_sa_mcmember_rec rec; + int ret = 0; + + if (id_priv->qkey) { + if (qkey && id_priv->qkey != qkey) + return -EINVAL; + return 0; + } + + if (qkey) { + id_priv->qkey = qkey; + return 0; + } + + switch (id_priv->id.ps) { + case RDMA_PS_UDP: + case RDMA_PS_IB: + id_priv->qkey = RDMA_UDP_QKEY; + break; + case RDMA_PS_IPOIB: + ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, + id_priv->id.port_num, &rec.mgid, + &rec); + if (!ret) + id_priv->qkey = be32_to_cpu(rec.qkey); + break; + default: + break; + } + return ret; +} + +static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) +{ + dev_addr->dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); + ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); +} + +static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + int ret; + + if (addr->sa_family != AF_IB) { + ret = rdma_translate_ip(addr, dev_addr); + } else { + cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); + ret = 0; + } + + return ret; +} + +static const struct ib_gid_attr * +cma_validate_port(struct ib_device *device, u32 port, + enum ib_gid_type gid_type, + union ib_gid *gid, + struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int bound_if_index = dev_addr->bound_dev_if; + const struct ib_gid_attr *sgid_attr; + int dev_type = dev_addr->dev_type; + struct net_device *ndev = NULL; + + if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) + return ERR_PTR(-ENODEV); + + if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) + return ERR_PTR(-ENODEV); + + if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) + return ERR_PTR(-ENODEV); + + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + return ERR_PTR(-ENODEV); + } else { + gid_type = IB_GID_TYPE_IB; + } + + sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); + if (ndev) + dev_put(ndev); + return sgid_attr; +} + +static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, + const struct ib_gid_attr *sgid_attr) +{ + WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); + id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; +} + +/** + * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute + * based on source ip address. + * @id_priv: cm_id which should be bound to cma device + * + * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute + * based on source IP address. 
It returns 0 on success or error code otherwise. + * It is applicable to active and passive side cm_id. + */ +static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + union ib_gid gid, iboe_gid, *gidp; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; + int ret = -ENODEV; + u32 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) { + rdma_for_each_port (cma_dev->device, port) { + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + cma_attach_to_dev(id_priv, cma_dev); + ret = 0; + goto out; + } + } + } +out: + mutex_unlock(&lock); + return ret; +} + +/** + * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute + * @id_priv: cm id to bind to cma device + * @listen_id_priv: listener cm id to match against + * @req: Pointer to req structure containaining incoming + * request information + * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when + * rdma device matches for listen_id and incoming request. It also verifies + * that a GID table entry is present for the source address. + * Returns 0 on success, or returns error code otherwise. + */ +static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv, + struct cma_req_info *req) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + enum ib_gid_type gid_type; + union ib_gid gid; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + if (rdma_protocol_roce(req->device, req->port)) + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &gid); + else + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; + sgid_attr = cma_validate_port(req->device, req->port, + gid_type, &gid, id_priv); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + id_priv->id.port_num = req->port; + cma_bind_sgid_attr(id_priv, sgid_attr); + /* Need to acquire lock to protect against reader + * of cma_dev->id_list such as cma_netdev_callback() and + * cma_process_remove(). 
+ */ + mutex_lock(&lock); + cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); + mutex_unlock(&lock); + rdma_restrack_add(&id_priv->res); + return 0; +} + +static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; + int ret = -ENODEV; + union ib_gid gid; + u32 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + gid_type = listen_id_priv->gid_type; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, &gid, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; + goto out; + } + + list_for_each_entry(cma_dev, &dev_list, list) { + rdma_for_each_port (cma_dev->device, port) { + if (listen_id_priv->cma_dev == cma_dev && + listen_id_priv->id.port_num == port) + continue; + + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, &gid, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; + goto out; + } + } + } + +out: + if (!ret) { + cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); + } + + mutex_unlock(&lock); + return ret; +} + +/* + * Select the source IB device and address to reach the destination IB address. + */ +static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + struct sockaddr_ib *addr; + union ib_gid gid, sgid, *dgid; + unsigned int p; + u16 pkey, index; + enum ib_port_state port_state; + int ret; + int i; + + cma_dev = NULL; + addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); + dgid = (union ib_gid *) &addr->sib_addr; + pkey = ntohs(addr->sib_pkey); + + mutex_lock(&lock); + list_for_each_entry(cur_dev, &dev_list, list) { + rdma_for_each_port (cur_dev->device, p) { + if (!rdma_cap_af_ib(cur_dev->device, p)) + continue; + + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) + continue; + + if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) + continue; + + for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; + ++i) { + ret = rdma_query_gid(cur_dev->device, p, i, + &gid); + if (ret) + continue; + + if (!memcmp(&gid, dgid, sizeof(gid))) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + + if (!cma_dev && (gid.global.subnet_prefix == + dgid->global.subnet_prefix) && + port_state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + } + } + } + mutex_unlock(&lock); + return -ENODEV; + +found: + cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); + mutex_unlock(&lock); + addr = (struct sockaddr_ib *)cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); + cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); + return 0; +} + +static void cma_id_get(struct rdma_id_private *id_priv) +{ + refcount_inc(&id_priv->refcount); +} + +static void cma_id_put(struct rdma_id_private *id_priv) +{ + if (refcount_dec_and_test(&id_priv->refcount)) + complete(&id_priv->comp); +} + +static struct 
rdma_id_private * +__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const struct rdma_id_private *parent) +{ + struct rdma_id_private *id_priv; + + id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); + if (!id_priv) + return ERR_PTR(-ENOMEM); + + id_priv->state = RDMA_CM_IDLE; + id_priv->id.context = context; + id_priv->id.event_handler = event_handler; + id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; + id_priv->tos_set = false; + id_priv->timeout_set = false; + id_priv->min_rnr_timer_set = false; + id_priv->gid_type = IB_GID_TYPE_IB; + spin_lock_init(&id_priv->lock); + mutex_init(&id_priv->qp_mutex); + init_completion(&id_priv->comp); + refcount_set(&id_priv->refcount, 1); + mutex_init(&id_priv->handler_mutex); + INIT_LIST_HEAD(&id_priv->device_item); + INIT_LIST_HEAD(&id_priv->id_list_entry); + INIT_LIST_HEAD(&id_priv->listen_list); + INIT_LIST_HEAD(&id_priv->mc_list); + get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); + id_priv->id.route.addr.dev_addr.net = get_net(net); + id_priv->seq_num &= 0x00ffffff; + + rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); + if (parent) + rdma_restrack_parent_name(&id_priv->res, &parent->res); + + return id_priv; +} + +struct rdma_cm_id * +__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, + void *context, enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type, const char *caller) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, caller); + return &ret->id; +} +EXPORT_SYMBOL(__rdma_create_kernel_id); + +struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, + void *context, + enum rdma_ucm_port_space ps, + enum ib_qp_type qp_type) +{ + struct rdma_id_private *ret; + + ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, + ps, qp_type, NULL); + if (IS_ERR(ret)) + return ERR_CAST(ret); + + rdma_restrack_set_name(&ret->res, NULL); + return &ret->id; +} +EXPORT_SYMBOL(rdma_create_user_id); + +static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); + + return ret; +} + +static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(qp, &qp_attr, qp_attr_mask); +} + +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct rdma_id_private *id_priv; + struct ib_qp *qp; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device != pd->device) { + ret = -EINVAL; + goto out_err; + } + + qp_init_attr->port_num = id->port_num; + qp = ib_create_qp(pd, qp_init_attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto out_err; + } + + if 
(id->qp_type == IB_QPT_UD) + ret = cma_init_ud_qp(id_priv, qp); + else + ret = cma_init_conn_qp(id_priv, qp); + if (ret) + goto out_destroy; + + id->qp = qp; + id_priv->qp_num = qp->qp_num; + id_priv->srq = (qp->srq != NULL); + trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); + return 0; +out_destroy: + ib_destroy_qp(qp); +out_err: + trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); + return ret; +} +EXPORT_SYMBOL(rdma_create_qp); + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + trace_cm_qp_destroy(id_priv); + mutex_lock(&id_priv->qp_mutex); + ib_destroy_qp(id_priv->id.qp); + id_priv->id.qp = NULL; + mutex_unlock(&id_priv->qp_mutex); +} +EXPORT_SYMBOL(rdma_destroy_qp); + +static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + /* Need to update QP attributes from default values. */ + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); + if (ret) + goto out; + + qp_attr.qp_state = IB_QPS_RTR; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + BUG_ON(id_priv->cma_dev->device != id_priv->id.device); + + if (conn_param) { + qp_attr.max_dest_rd_atomic = conn_param->responder_resources; + qp_attr.min_rnr_timer = conn_param->min_rnr_timer; + } + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_modify_qp_rts(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + goto out; + + if (conn_param) + qp_attr.max_rd_atomic = conn_param->initiator_depth; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_modify_qp_err(struct rdma_id_private *id_priv) +{ + struct ib_qp_attr qp_attr; + int ret; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } + + qp_attr.qp_state = IB_QPS_ERR; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int ret; + u16 pkey; + + if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) + pkey = 0xffff; + else + pkey = ib_addr_get_pkey(dev_addr); + + ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, + pkey, &qp_attr->pkey_index); + if (ret) + return ret; + + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; + + if (id_priv->id.qp_type == IB_QPT_UD) { + ret = cma_set_qkey(id_priv, 0); + if (ret) + return ret; + + qp_attr->qkey = id_priv->qkey; + *qp_attr_mask |= IB_QP_QKEY; + } else { + qp_attr->qp_access_flags = 0; + *qp_attr_mask |= IB_QP_ACCESS_FLAGS; + } + return 0; +} + +int rdma_init_qp_attr(struct rdma_cm_id 
*id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct rdma_id_private *id_priv; + int ret = 0; + + id_priv = container_of(id, struct rdma_id_private, id); + if (rdma_cap_ib_cm(id->device, id->port_num)) { + if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) + ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); + else + ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, + qp_attr_mask); + + if (qp_attr->qp_state == IB_QPS_RTR) + qp_attr->rq_psn = id_priv->seq_num; + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + if (!id_priv->cm_id.iw) { + qp_attr->qp_access_flags = 0; + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + } else + ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, + qp_attr_mask); + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask |= IB_QP_PORT; + } else { + ret = -ENOSYS; + } + + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + + if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) + qp_attr->min_rnr_timer = id_priv->min_rnr_timer; + + return ret; +} +EXPORT_SYMBOL(rdma_init_qp_attr); + +static inline bool cma_zero_addr(const struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); + case AF_IB: + return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); + default: + return false; + } +} + +static inline bool cma_loopback_addr(const struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_loopback( + ((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_loopback( + &((struct sockaddr_in6 *)addr)->sin6_addr); + case AF_IB: + return ib_addr_loopback( + &((struct sockaddr_ib *)addr)->sib_addr); + default: + return false; + } +} + +static inline bool cma_any_addr(const struct sockaddr *addr) +{ + return cma_zero_addr(addr) || cma_loopback_addr(addr); +} + +static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) +{ + if (src->sa_family != dst->sa_family) + return -1; + + switch (src->sa_family) { + case AF_INET: + return ((struct sockaddr_in *)src)->sin_addr.s_addr != + ((struct sockaddr_in *)dst)->sin_addr.s_addr; + case AF_INET6: { + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; + bool link_local; + + if (ipv6_addr_cmp(&src_addr6->sin6_addr, + &dst_addr6->sin6_addr)) + return 1; + link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & + IPV6_ADDR_LINKLOCAL; + /* Link local must match their scope_ids */ + return link_local ? 
(src_addr6->sin6_scope_id != + dst_addr6->sin6_scope_id) : + 0; + } + + default: + return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, + &((struct sockaddr_ib *) dst)->sib_addr); + } +} + +static __be16 cma_port(const struct sockaddr *addr) +{ + struct sockaddr_ib *sib; + + switch (addr->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) addr)->sin_port; + case AF_INET6: + return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + return htons((u16) (be64_to_cpu(sib->sib_sid) & + be64_to_cpu(sib->sib_sid_mask))); + default: + return 0; + } +} + +static inline int cma_any_port(const struct sockaddr *addr) +{ + return !cma_port(addr); +} + +static void cma_save_ib_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + const struct rdma_cm_id *listen_id, + const struct sa_path_rec *path) +{ + struct sockaddr_ib *listen_ib, *ib; + + listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; + if (src_addr) { + ib = (struct sockaddr_ib *)src_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + ib->sib_sid = path->service_id; + ib->sib_scope_id = 0; + } else { + ib->sib_pkey = listen_ib->sib_pkey; + ib->sib_flowinfo = listen_ib->sib_flowinfo; + ib->sib_addr = listen_ib->sib_addr; + ib->sib_sid = listen_ib->sib_sid; + ib->sib_scope_id = listen_ib->sib_scope_id; + } + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + } + if (dst_addr) { + ib = (struct sockaddr_ib *)dst_addr; + ib->sib_family = AF_IB; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); + } + } +} + +static void cma_save_ip4_info(struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->dst_addr.ip4.addr, + .sin_port = local_port, + }; + } + + if (dst_addr) { + *dst_addr = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_addr.s_addr = hdr->src_addr.ip4.addr, + .sin_port = hdr->port, + }; + } +} + +static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, + struct sockaddr_in6 *dst_addr, + struct cma_hdr *hdr, + __be16 local_port) +{ + if (src_addr) { + *src_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->dst_addr.ip6, + .sin6_port = local_port, + }; + } + + if (dst_addr) { + *dst_addr = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_addr = hdr->src_addr.ip6, + .sin6_port = hdr->port, + }; + } +} + +static u16 cma_port_from_service_id(__be64 service_id) +{ + return (u16)be64_to_cpu(service_id); +} + +static int cma_save_ip_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + const struct ib_cm_event *ib_event, + __be64 service_id) +{ + struct cma_hdr *hdr; + __be16 port; + + hdr = ib_event->private_data; + if (hdr->cma_version != CMA_VERSION) + return -EINVAL; + + port = htons(cma_port_from_service_id(service_id)); + + switch (cma_get_ip_ver(hdr)) { + case 4: + cma_save_ip4_info((struct sockaddr_in *)src_addr, + (struct sockaddr_in *)dst_addr, hdr, port); + break; + case 6: + cma_save_ip6_info((struct sockaddr_in6 *)src_addr, + (struct sockaddr_in6 *)dst_addr, hdr, port); + break; + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +static int cma_save_net_info(struct sockaddr *src_addr, + struct sockaddr *dst_addr, + const struct rdma_cm_id *listen_id, + 
const struct ib_cm_event *ib_event, + sa_family_t sa_family, __be64 service_id) +{ + if (sa_family == AF_IB) { + if (ib_event->event == IB_CM_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, + ib_event->param.req_rcvd.primary_path); + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); + return 0; + } + + return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); +} + +static int cma_save_req_info(const struct ib_cm_event *ib_event, + struct cma_req_info *req) +{ + const struct ib_cm_req_event_param *req_param = + &ib_event->param.req_rcvd; + const struct ib_cm_sidr_req_event_param *sidr_param = + &ib_event->param.sidr_req_rcvd; + + switch (ib_event->event) { + case IB_CM_REQ_RECEIVED: + req->device = req_param->listen_id->device; + req->port = req_param->port; + memcpy(&req->local_gid, &req_param->primary_path->sgid, + sizeof(req->local_gid)); + req->has_gid = true; + req->service_id = req_param->primary_path->service_id; + req->pkey = be16_to_cpu(req_param->primary_path->pkey); + if (req->pkey != req_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + req_param->bth_pkey, req->pkey); + break; + case IB_CM_SIDR_REQ_RECEIVED: + req->device = sidr_param->listen_id->device; + req->port = sidr_param->port; + req->has_gid = false; + req->service_id = sidr_param->service_id; + req->pkey = sidr_param->pkey; + if (req->pkey != sidr_param->bth_pkey) + pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" + "RDMA CMA: in the future this may cause the request to be dropped\n", + sidr_param->bth_pkey, req->pkey); + break; + default: + return -EINVAL; + } + + return 0; +} + +static bool validate_ipv4_net_dev(struct net_device *net_dev, + const struct sockaddr_in *dst_addr, + const struct sockaddr_in *src_addr) +{ + __be32 daddr = dst_addr->sin_addr.s_addr, + saddr = src_addr->sin_addr.s_addr; + struct fib_result res; + struct flowi4 fl4; + int err; + bool ret; + + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || + ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || + ipv4_is_loopback(saddr)) + return false; + + memset(&fl4, 0, sizeof(fl4)); + fl4.flowi4_oif = net_dev->ifindex; + fl4.daddr = daddr; + fl4.saddr = saddr; + + rcu_read_lock(); + err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); + ret = err == 0 && FIB_RES_DEV(res) == net_dev; + rcu_read_unlock(); + + return ret; +} + +static bool validate_ipv6_net_dev(struct net_device *net_dev, + const struct sockaddr_in6 *dst_addr, + const struct sockaddr_in6 *src_addr) +{ +#if IS_ENABLED(CONFIG_IPV6) + const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & + IPV6_ADDR_LINKLOCAL; + struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, + &src_addr->sin6_addr, net_dev->ifindex, + NULL, strict); + bool ret; + + if (!rt) + return false; + + ret = rt->rt6i_idev->dev == net_dev; + ip6_rt_put(rt); + + return ret; +#else + return false; +#endif +} + +static bool validate_net_dev(struct net_device *net_dev, + const struct sockaddr *daddr, + const struct sockaddr *saddr) +{ + const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; + const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + const struct sockaddr_in6 
*saddr6 = (const struct sockaddr_in6 *)saddr; + + switch (daddr->sa_family) { + case AF_INET: + return saddr->sa_family == AF_INET && + validate_ipv4_net_dev(net_dev, daddr4, saddr4); + + case AF_INET6: + return saddr->sa_family == AF_INET6 && + validate_ipv6_net_dev(net_dev, daddr6, saddr6); + + default: + return false; + } +} + +static struct net_device * +roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) +{ + const struct ib_gid_attr *sgid_attr = NULL; + struct net_device *ndev; + + if (ib_event->event == IB_CM_REQ_RECEIVED) + sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; + + if (!sgid_attr) + return NULL; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); + if (IS_ERR(ndev)) + ndev = NULL; + else + dev_hold(ndev); + rcu_read_unlock(); + return ndev; +} + +static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, + struct cma_req_info *req) +{ + struct sockaddr *listen_addr = + (struct sockaddr *)&req->listen_addr_storage; + struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; + struct net_device *net_dev; + const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; + int err; + + err = cma_save_ip_info(listen_addr, src_addr, ib_event, + req->service_id); + if (err) + return ERR_PTR(err); + + if (rdma_protocol_roce(req->device, req->port)) + net_dev = roce_get_net_dev_by_cm_event(ib_event); + else + net_dev = ib_get_net_dev_by_params(req->device, req->port, + req->pkey, + gid, listen_addr); + if (!net_dev) + return ERR_PTR(-ENODEV); + + return net_dev; +} + +static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) +{ + return (be64_to_cpu(service_id) >> 16) & 0xffff; +} + +static bool cma_match_private_data(struct rdma_id_private *id_priv, + const struct cma_hdr *hdr) +{ + struct sockaddr *addr = cma_src_addr(id_priv); + __be32 ip4_addr; + struct in6_addr ip6_addr; + + if (cma_any_addr(addr) && !id_priv->afonly) + return true; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; + if (cma_get_ip_ver(hdr) != 4) + return false; + if (!cma_any_addr(addr) && + hdr->dst_addr.ip4.addr != ip4_addr) + return false; + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; + if (cma_get_ip_ver(hdr) != 6) + return false; + if (!cma_any_addr(addr) && + memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) + return false; + break; + case AF_IB: + return true; + default: + return false; + } + + return true; +} + +static bool cma_protocol_roce(const struct rdma_cm_id *id) +{ + struct ib_device *device = id->device; + const u32 port_num = id->port_num ?: rdma_start_port(device); + + return rdma_protocol_roce(device, port_num); +} + +static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) +{ + const struct sockaddr *daddr = + (const struct sockaddr *)&req->listen_addr_storage; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + + /* Returns true if the req is for IPv6 link local */ + return (daddr->sa_family == AF_INET6 && + (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); +} + +static bool cma_match_net_dev(const struct rdma_cm_id *id, + const struct net_device *net_dev, + const struct cma_req_info *req) +{ + const struct rdma_addr *addr = &id->route.addr; + + if (!net_dev) + /* This request is an AF_IB request */ + return (!id->port_num || id->port_num == req->port) && 
+ (addr->src_addr.ss_family == AF_IB); + + /* + * If the request is not for IPv6 link local, allow matching + * request to any netdevice of the one or multiport rdma device. + */ + if (!cma_is_req_ipv6_ll(req)) + return true; + /* + * Net namespaces must match, and if the listner is listening + * on a specific netdevice than netdevice must match as well. + */ + if (net_eq(dev_net(net_dev), addr->dev_addr.net) && + (!!addr->dev_addr.bound_dev_if == + (addr->dev_addr.bound_dev_if == net_dev->ifindex))) + return true; + else + return false; +} + +static struct rdma_id_private *cma_find_listener( + const struct rdma_bind_list *bind_list, + const struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + const struct cma_req_info *req, + const struct net_device *net_dev) +{ + struct rdma_id_private *id_priv, *id_priv_dev; + + lockdep_assert_held(&lock); + + if (!bind_list) + return ERR_PTR(-EINVAL); + + hlist_for_each_entry(id_priv, &bind_list->owners, node) { + if (cma_match_private_data(id_priv, ib_event->private_data)) { + if (id_priv->id.device == cm_id->device && + cma_match_net_dev(&id_priv->id, net_dev, req)) + return id_priv; + list_for_each_entry(id_priv_dev, + &id_priv->listen_list, + listen_item) { + if (id_priv_dev->id.device == cm_id->device && + cma_match_net_dev(&id_priv_dev->id, + net_dev, req)) + return id_priv_dev; + } + } + } + + return ERR_PTR(-EINVAL); +} + +static struct rdma_id_private * +cma_ib_id_from_event(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + struct cma_req_info *req, + struct net_device **net_dev) +{ + struct rdma_bind_list *bind_list; + struct rdma_id_private *id_priv; + int err; + + err = cma_save_req_info(ib_event, req); + if (err) + return ERR_PTR(err); + + *net_dev = cma_get_net_dev(ib_event, req); + if (IS_ERR(*net_dev)) { + if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { + /* Assuming the protocol is AF_IB */ + *net_dev = NULL; + } else { + return ERR_CAST(*net_dev); + } + } + + mutex_lock(&lock); + /* + * Net namespace might be getting deleted while route lookup, + * cm_id lookup is in progress. Therefore, perform netdevice + * validation, cm_id lookup under rcu lock. + * RCU lock along with netdevice state check, synchronizes with + * netdevice migrating to different net namespace and also avoids + * case where net namespace doesn't get deleted while lookup is in + * progress. + * If the device state is not IFF_UP, its properties such as ifindex + * and nd_net cannot be trusted to remain valid without rcu lock. + * net/core/dev.c change_net_namespace() ensures to synchronize with + * ongoing operations on net device after device is closed using + * synchronize_net(). + */ + rcu_read_lock(); + if (*net_dev) { + /* + * If netdevice is down, it is likely that it is administratively + * down or it might be migrating to different namespace. + * In that case avoid further processing, as the net namespace + * or ifindex may change. + */ + if (((*net_dev)->flags & IFF_UP) == 0) { + id_priv = ERR_PTR(-EHOSTUNREACH); + goto err; + } + + if (!validate_net_dev(*net_dev, + (struct sockaddr *)&req->src_addr_storage, + (struct sockaddr *)&req->listen_addr_storage)) { + id_priv = ERR_PTR(-EHOSTUNREACH); + goto err; + } + } + + bind_list = cma_ps_find(*net_dev ? 
dev_net(*net_dev) : &init_net, + rdma_ps_from_service_id(req->service_id), + cma_port_from_service_id(req->service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); +err: + rcu_read_unlock(); + mutex_unlock(&lock); + if (IS_ERR(id_priv) && *net_dev) { + dev_put(*net_dev); + *net_dev = NULL; + } + return id_priv; +} + +static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) +{ + return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); +} + +static void cma_cancel_route(struct rdma_id_private *id_priv) +{ + if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { + if (id_priv->query) + ib_sa_cancel_query(id_priv->query_id, id_priv->query); + } +} + +static void _cma_cancel_listens(struct rdma_id_private *id_priv) +{ + struct rdma_id_private *dev_id_priv; + + lockdep_assert_held(&lock); + + /* + * Remove from listen_any_list to prevent added devices from spawning + * additional listen requests. + */ + list_del_init(&id_priv->listen_any_item); + + while (!list_empty(&id_priv->listen_list)) { + dev_id_priv = + list_first_entry(&id_priv->listen_list, + struct rdma_id_private, listen_item); + /* sync with device removal to avoid duplicate destruction */ + list_del_init(&dev_id_priv->device_item); + list_del_init(&dev_id_priv->listen_item); + mutex_unlock(&lock); + + rdma_destroy_id(&dev_id_priv->id); + mutex_lock(&lock); + } +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + mutex_lock(&lock); + _cma_cancel_listens(id_priv); + mutex_unlock(&lock); +} + +static void cma_cancel_operation(struct rdma_id_private *id_priv, + enum rdma_cm_state state) +{ + switch (state) { + case RDMA_CM_ADDR_QUERY: + /* + * We can avoid doing the rdma_addr_cancel() based on state, + * only RDMA_CM_ADDR_QUERY has a work that could still execute. + * Notice that the addr_handler work could still be exiting + * outside this state, however due to the interaction with the + * handler_mutex the work is guaranteed not to touch id_priv + * during exit. 
+ */ + rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); + break; + case RDMA_CM_ROUTE_QUERY: + cma_cancel_route(id_priv); + break; + case RDMA_CM_LISTEN: + if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) + cma_cancel_listens(id_priv); + break; + default: + break; + } +} + +static void cma_release_port(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + struct net *net = id_priv->id.route.addr.dev_addr.net; + + if (!bind_list) + return; + + mutex_lock(&lock); + hlist_del(&id_priv->node); + if (hlist_empty(&bind_list->owners)) { + cma_ps_remove(net, bind_list->ps, bind_list->port); + kfree(bind_list); + } + mutex_unlock(&lock); +} + +static void destroy_mc(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); + + if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) + ib_sa_free_multicast(mc->sa_mc); + + if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, + dev_addr->bound_dev_if); + if (ndev && !send_only) { + enum ib_gid_type gid_type; + union ib_gid mgid; + + gid_type = id_priv->cma_dev->default_gid_type + [id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, + gid_type); + cma_igmp_send(ndev, &mgid, false); + } + dev_put(ndev); + + cancel_work_sync(&mc->iboe_join.work); + } + kfree(mc); +} + +static void cma_leave_mc_groups(struct rdma_id_private *id_priv) +{ + struct cma_multicast *mc; + + while (!list_empty(&id_priv->mc_list)) { + mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, + list); + list_del(&mc->list); + destroy_mc(id_priv, mc); + } +} + +static void _destroy_id(struct rdma_id_private *id_priv, + enum rdma_cm_state state) +{ + cma_cancel_operation(id_priv, state); + + rdma_restrack_del(&id_priv->res); + cma_remove_id_from_tree(id_priv); + if (id_priv->cma_dev) { + if (rdma_cap_ib_cm(id_priv->id.device, 1)) { + if (id_priv->cm_id.ib) + ib_destroy_cm_id(id_priv->cm_id.ib); + } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { + if (id_priv->cm_id.iw) + iw_destroy_cm_id(id_priv->cm_id.iw); + } + cma_leave_mc_groups(id_priv); + cma_release_dev(id_priv); + } + + cma_release_port(id_priv); + cma_id_put(id_priv); + wait_for_completion(&id_priv->comp); + + if (id_priv->internal_id) + cma_id_put(id_priv->id.context); + + kfree(id_priv->id.route.path_rec); + kfree(id_priv->id.route.path_rec_inbound); + kfree(id_priv->id.route.path_rec_outbound); + + put_net(id_priv->id.route.addr.dev_addr.net); + kfree(id_priv); +} + +/* + * destroy an ID from within the handler_mutex. This ensures that no other + * handlers can start running concurrently. + */ +static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) + __releases(&idprv->handler_mutex) +{ + enum rdma_cm_state state; + unsigned long flags; + + trace_cm_id_destroy(id_priv); + + /* + * Setting the state to destroyed under the handler mutex provides a + * fence against calling handler callbacks. If this is invoked due to + * the failure of a handler callback then it guarentees that no future + * handlers will be called. 
+ */ + lockdep_assert_held(&id_priv->handler_mutex); + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + id_priv->state = RDMA_CM_DESTROYING; + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + _destroy_id(id_priv, state); +} + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); + destroy_id_handler_unlock(id_priv); +} +EXPORT_SYMBOL(rdma_destroy_id); + +static int cma_rep_recv(struct rdma_id_private *id_priv) +{ + int ret; + + ret = cma_modify_qp_rtr(id_priv, NULL); + if (ret) + goto reject; + + ret = cma_modify_qp_rts(id_priv, NULL); + if (ret) + goto reject; + + trace_cm_send_rtu(id_priv); + ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); + if (ret) + goto reject; + + return 0; +reject: + pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); + cma_modify_qp_err(id_priv); + trace_cm_send_rej(id_priv); + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +} + +static void cma_set_rep_event_data(struct rdma_cm_event *event, + const struct ib_cm_rep_event_param *rep_data, + void *private_data) +{ + event->param.conn.private_data = private_data; + event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + event->param.conn.responder_resources = rep_data->responder_resources; + event->param.conn.initiator_depth = rep_data->initiator_depth; + event->param.conn.flow_control = rep_data->flow_control; + event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; + event->param.conn.srq = rep_data->srq; + event->param.conn.qp_num = rep_data->remote_qpn; + + event->ece.vendor_id = rep_data->ece.vendor_id; + event->ece.attr_mod = rep_data->ece.attr_mod; +} + +static int cma_cm_event_handler(struct rdma_id_private *id_priv, + struct rdma_cm_event *event) +{ + int ret; + + lockdep_assert_held(&id_priv->handler_mutex); + + trace_cm_event_handler(id_priv, event); + ret = id_priv->id.event_handler(&id_priv->id, event); + trace_cm_event_done(id_priv, event, ret); + return ret; +} + +static int cma_ib_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event = {}; + enum rdma_cm_state state; + int ret; + + mutex_lock(&id_priv->handler_mutex); + state = READ_ONCE(id_priv->state); + if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && + state != RDMA_CM_CONNECT) || + (ib_event->event == IB_CM_TIMEWAIT_EXIT && + state != RDMA_CM_DISCONNECT)) + goto out; + + switch (ib_event->event) { + case IB_CM_REQ_ERROR: + case IB_CM_REP_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + break; + case IB_CM_REP_RECEIVED: + if (state == RDMA_CM_CONNECT && + (id_priv->id.qp_type != IB_QPT_UD)) { + trace_cm_send_mra(id_priv); + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + } + if (id_priv->id.qp) { + event.status = cma_rep_recv(id_priv); + event.event = event.status ? 
RDMA_CM_EVENT_CONNECT_ERROR : + RDMA_CM_EVENT_ESTABLISHED; + } else { + event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + } + cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, + ib_event->private_data); + break; + case IB_CM_RTU_RECEIVED: + case IB_CM_USER_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + break; + case IB_CM_DREQ_ERROR: + event.status = -ETIMEDOUT; + fallthrough; + case IB_CM_DREQ_RECEIVED: + case IB_CM_DREP_RECEIVED: + if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT)) + goto out; + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IB_CM_TIMEWAIT_EXIT: + event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; + break; + case IB_CM_MRA_RECEIVED: + /* ignore event */ + goto out; + case IB_CM_REJ_RECEIVED: + pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, + ib_event->param.rej_rcvd.reason)); + cma_modify_qp_err(id_priv); + event.status = ib_event->param.rej_rcvd.reason; + event.event = RDMA_CM_EVENT_REJECTED; + event.param.conn.private_data = ib_event->private_data; + event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; + break; + default: + pr_err("RDMA CMA: unexpected IB CM event: %d\n", + ib_event->event); + goto out; + } + + ret = cma_cm_event_handler(id_priv, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.ib = NULL; + destroy_id_handler_unlock(id_priv); + return ret; + } +out: + mutex_unlock(&id_priv->handler_mutex); + return 0; +} + +static struct rdma_id_private * +cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, + struct net_device *net_dev) +{ + struct rdma_id_private *listen_id_priv; + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + struct rdma_route *rt; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; + const __be64 service_id = + ib_event->param.req_rcvd.primary_path->service_id; + int ret; + + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, + listen_id->event_handler, listen_id->context, + listen_id->ps, + ib_event->param.req_rcvd.qp_type, + listen_id_priv); + if (IS_ERR(id_priv)) + return NULL; + + id = &id_priv->id; + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, service_id)) + goto err; + + rt = &id->route; + rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 
2 : 1; + rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, + sizeof(*rt->path_rec), GFP_KERNEL); + if (!rt->path_rec) + goto err; + + rt->path_rec[0] = *path; + if (rt->num_pri_alt_paths == 2) + rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; + + if (net_dev) { + rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); + } else { + if (!cma_protocol_roce(listen_id) && + cma_any_addr(cma_src_addr(id_priv))) { + rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + } else if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); + if (ret) + goto err; + } + } + rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); + + id_priv->state = RDMA_CM_CONNECT; + return id_priv; + +err: + rdma_destroy_id(id); + return NULL; +} + +static struct rdma_id_private * +cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, + struct net_device *net_dev) +{ + const struct rdma_id_private *listen_id_priv; + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct net *net = listen_id->route.addr.dev_addr.net; + int ret; + + listen_id_priv = container_of(listen_id, struct rdma_id_private, id); + id_priv = __rdma_create_id(net, listen_id->event_handler, + listen_id->context, listen_id->ps, IB_QPT_UD, + listen_id_priv); + if (IS_ERR(id_priv)) + return NULL; + + id = &id_priv->id; + if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, + (struct sockaddr *)&id->route.addr.dst_addr, + listen_id, ib_event, ss_family, + ib_event->param.sidr_req_rcvd.service_id)) + goto err; + + if (net_dev) { + rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); + } else { + if (!cma_any_addr(cma_src_addr(id_priv))) { + ret = cma_translate_addr(cma_src_addr(id_priv), + &id->route.addr.dev_addr); + if (ret) + goto err; + } + } + + id_priv->state = RDMA_CM_CONNECT; + return id_priv; +err: + rdma_destroy_id(id); + return NULL; +} + +static void cma_set_req_event_data(struct rdma_cm_event *event, + const struct ib_cm_req_event_param *req_data, + void *private_data, int offset) +{ + event->param.conn.private_data = private_data + offset; + event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; + event->param.conn.responder_resources = req_data->responder_resources; + event->param.conn.initiator_depth = req_data->initiator_depth; + event->param.conn.flow_control = req_data->flow_control; + event->param.conn.retry_count = req_data->retry_count; + event->param.conn.rnr_retry_count = req_data->rnr_retry_count; + event->param.conn.srq = req_data->srq; + event->param.conn.qp_num = req_data->remote_qpn; + + event->ece.vendor_id = req_data->ece.vendor_id; + event->ece.attr_mod = req_data->ece.attr_mod; +} + +static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, + const struct ib_cm_event *ib_event) +{ + return (((ib_event->event == IB_CM_REQ_RECEIVED) && + (ib_event->param.req_rcvd.qp_type == id->qp_type)) || + ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && + (id->qp_type == IB_QPT_UD)) || + (!id->qp_type)); +} + +static int cma_ib_req_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event) +{ + struct rdma_id_private *listen_id, *conn_id = NULL; + struct rdma_cm_event event = {}; + struct cma_req_info req = {}; + struct net_device *net_dev; + u8 offset; + 
int ret; + + listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); + if (IS_ERR(listen_id)) + return PTR_ERR(listen_id); + + trace_cm_req_handler(listen_id, ib_event->event); + if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { + ret = -EINVAL; + goto net_dev_put; + } + + mutex_lock(&listen_id->handler_mutex); + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { + ret = -ECONNABORTED; + goto err_unlock; + } + + offset = cma_user_data_offset(listen_id); + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { + conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); + event.param.ud.private_data = ib_event->private_data + offset; + event.param.ud.private_data_len = + IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; + } else { + conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); + cma_set_req_event_data(&event, &ib_event->param.req_rcvd, + ib_event->private_data, offset); + } + if (!conn_id) { + ret = -ENOMEM; + goto err_unlock; + } + + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); + ret = cma_ib_acquire_dev(conn_id, listen_id, &req); + if (ret) { + destroy_id_handler_unlock(conn_id); + goto err_unlock; + } + + conn_id->cm_id.ib = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_ib_handler; + + ret = cma_cm_event_handler(conn_id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + conn_id->cm_id.ib = NULL; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + goto net_dev_put; + } + + if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && + conn_id->id.qp_type != IB_QPT_UD) { + trace_cm_send_mra(cm_id->context); + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + } + mutex_unlock(&conn_id->handler_mutex); + +err_unlock: + mutex_unlock(&listen_id->handler_mutex); + +net_dev_put: + if (net_dev) + dev_put(net_dev); + + return ret; +} + +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) +{ + if (addr->sa_family == AF_IB) + return ((struct sockaddr_ib *) addr)->sib_sid; + + return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); +} +EXPORT_SYMBOL(rdma_get_service_id); + +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid) +{ + struct rdma_addr *addr = &cm_id->route.addr; + + if (!cm_id->device) { + if (sgid) + memset(sgid, 0, sizeof(*sgid)); + if (dgid) + memset(dgid, 0, sizeof(*dgid)); + return; + } + + if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { + if (sgid) + rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); + if (dgid) + rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); + } else { + if (sgid) + rdma_addr_get_sgid(&addr->dev_addr, sgid); + if (dgid) + rdma_addr_get_dgid(&addr->dev_addr, dgid); + } +} +EXPORT_SYMBOL(rdma_read_gids); + +static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) +{ + struct rdma_id_private *id_priv = iw_id->context; + struct rdma_cm_event event = {}; + int ret = 0; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + + mutex_lock(&id_priv->handler_mutex); + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) + goto out; + + switch (iw_event->event) { + case IW_CM_EVENT_CLOSE: + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IW_CM_EVENT_CONNECT_REPLY: + memcpy(cma_src_addr(id_priv), laddr, + rdma_addr_size(laddr)); + memcpy(cma_dst_addr(id_priv), raddr, + rdma_addr_size(raddr)); + 
switch (iw_event->status) { + case 0: + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + break; + case -ECONNRESET: + case -ECONNREFUSED: + event.event = RDMA_CM_EVENT_REJECTED; + break; + case -ETIMEDOUT: + event.event = RDMA_CM_EVENT_UNREACHABLE; + break; + default: + event.event = RDMA_CM_EVENT_CONNECT_ERROR; + break; + } + break; + case IW_CM_EVENT_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + break; + default: + goto out; + } + + event.status = iw_event->status; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + ret = cma_cm_event_handler(id_priv, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.iw = NULL; + destroy_id_handler_unlock(id_priv); + return ret; + } + +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + +static int iw_conn_req_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct rdma_id_private *listen_id, *conn_id; + struct rdma_cm_event event = {}; + int ret = -ECONNABORTED; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + + listen_id = cm_id->context; + + mutex_lock(&listen_id->handler_mutex); + if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) + goto out; + + /* Create a new RDMA id for the new IW CM ID */ + conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, + listen_id->id.event_handler, + listen_id->id.context, RDMA_PS_TCP, + IB_QPT_RC, listen_id); + if (IS_ERR(conn_id)) { + ret = -ENOMEM; + goto out; + } + mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); + conn_id->state = RDMA_CM_CONNECT; + + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); + if (ret) { + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; + } + + ret = cma_iw_acquire_dev(conn_id, listen_id); + if (ret) { + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; + } + + conn_id->cm_id.iw = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_iw_handler; + + memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); + memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); + + ret = cma_cm_event_handler(conn_id, &event); + if (ret) { + /* User wants to destroy the CM ID */ + conn_id->cm_id.iw = NULL; + mutex_unlock(&listen_id->handler_mutex); + destroy_id_handler_unlock(conn_id); + return ret; + } + + mutex_unlock(&conn_id->handler_mutex); + +out: + mutex_unlock(&listen_id->handler_mutex); + return ret; +} + +static int cma_ib_listen(struct rdma_id_private *id_priv) +{ + struct sockaddr *addr; + struct ib_cm_id *id; + __be64 svc_id; + + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); + id = ib_cm_insert_listen(id_priv->id.device, + cma_ib_req_handler, svc_id); + if (IS_ERR(id)) + return PTR_ERR(id); + id_priv->cm_id.ib = id; + + return 0; +} + +static int cma_iw_listen(struct 
rdma_id_private *id_priv, int backlog) +{ + int ret; + struct iw_cm_id *id; + + id = iw_create_cm_id(id_priv->id.device, + iw_conn_req_handler, + id_priv); + if (IS_ERR(id)) + return PTR_ERR(id); + + mutex_lock(&id_priv->qp_mutex); + id->tos = id_priv->tos; + id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + id->afonly = id_priv->afonly; + id_priv->cm_id.iw = id; + + memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + + ret = iw_cm_listen(id_priv->cm_id.iw, backlog); + + if (ret) { + iw_destroy_cm_id(id_priv->cm_id.iw); + id_priv->cm_id.iw = NULL; + } + + return ret; +} + +static int cma_listen_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct rdma_id_private *id_priv = id->context; + + /* Listening IDs are always destroyed on removal */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + return -1; + + id->context = id_priv->id.context; + id->event_handler = id_priv->id.event_handler; + trace_cm_event_handler(id_priv, event); + return id_priv->id.event_handler(id, event); +} + +static int cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev, + struct rdma_id_private **to_destroy) +{ + struct rdma_id_private *dev_id_priv; + struct net *net = id_priv->id.route.addr.dev_addr.net; + int ret; + + lockdep_assert_held(&lock); + + *to_destroy = NULL; + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) + return 0; + + dev_id_priv = + __rdma_create_id(net, cma_listen_handler, id_priv, + id_priv->id.ps, id_priv->id.qp_type, id_priv); + if (IS_ERR(dev_id_priv)) + return PTR_ERR(dev_id_priv); + + dev_id_priv->state = RDMA_CM_ADDR_BOUND; + memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + + _cma_attach_to_dev(dev_id_priv, cma_dev); + rdma_restrack_add(&dev_id_priv->res); + cma_id_get(id_priv); + dev_id_priv->internal_id = 1; + dev_id_priv->afonly = id_priv->afonly; + mutex_lock(&id_priv->qp_mutex); + dev_id_priv->tos_set = id_priv->tos_set; + dev_id_priv->tos = id_priv->tos; + mutex_unlock(&id_priv->qp_mutex); + + ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); + if (ret) + goto err_listen; + list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); + return 0; +err_listen: + /* Caller must destroy this after releasing lock */ + *to_destroy = dev_id_priv; + dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); + return ret; +} + +static int cma_listen_on_all(struct rdma_id_private *id_priv) +{ + struct rdma_id_private *to_destroy; + struct cma_device *cma_dev; + int ret; + + mutex_lock(&lock); + list_add_tail(&id_priv->listen_any_item, &listen_any_list); + list_for_each_entry(cma_dev, &dev_list, list) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) { + /* Prevent racing with cma_process_remove() */ + if (to_destroy) + list_del_init(&to_destroy->device_item); + goto err_listen; + } + } + mutex_unlock(&lock); + return 0; + +err_listen: + _cma_cancel_listens(id_priv); + mutex_unlock(&lock); + if (to_destroy) + rdma_destroy_id(&to_destroy->id); + return ret; +} + +void rdma_set_service_type(struct rdma_cm_id *id, int tos) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + id_priv->tos = (u8) tos; + id_priv->tos_set = true; + mutex_unlock(&id_priv->qp_mutex); +} +EXPORT_SYMBOL(rdma_set_service_type); + +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP 
associated
+ * with a connection identifier.
+ * @id: Communication identifier to be associated with the service type.
+ * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec.
+ *
+ * This function should be called before rdma_connect() on active side,
+ * and on passive side before rdma_accept(). It is applicable to primary
+ * path only. The timeout will affect the local side of the QP, it is not
+ * negotiated with remote side and zero disables the timer. In case it is
+ * set before rdma_resolve_route, the value will also be used to determine
+ * PacketLifeTime for RoCE.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
+{
+	struct rdma_id_private *id_priv;
+
+	if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI)
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	mutex_lock(&id_priv->qp_mutex);
+	id_priv->timeout = timeout;
+	id_priv->timeout_set = true;
+	mutex_unlock(&id_priv->qp_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_set_ack_timeout);
+
+/**
+ * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the
+ * QP associated with a connection identifier.
+ * @id: Communication identifier to be associated with the service type.
+ * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK
+ * Timer Field" in the IBTA specification.
+ *
+ * This function should be called before rdma_connect() on active
+ * side, and on passive side before rdma_accept(). The timer value
+ * will be associated with the local QP. When it receives a send that it is
+ * not ready to handle, typically because the receive queue is empty, an RNR
+ * Retry NAK is returned to the requester with the min_rnr_timer
+ * encoded. The requester will then wait at least the time specified
+ * in the NAK before retrying. The default is zero, which translates
+ * to a minimum RNR Timer value of 655 ms.
+ * + * Return: 0 for success + */ +int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) +{ + struct rdma_id_private *id_priv; + + /* It is a five-bit value */ + if (min_rnr_timer & 0xe0) + return -EINVAL; + + if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + id_priv->min_rnr_timer = min_rnr_timer; + id_priv->min_rnr_timer_set = true; + mutex_unlock(&id_priv->qp_mutex); + + return 0; +} +EXPORT_SYMBOL(rdma_set_min_rnr_timer); + +static void route_set_path_rec_inbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_inbound) { + route->path_rec_inbound = + kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); + if (!route->path_rec_inbound) + return; + } + + *route->path_rec_inbound = *path_rec; +} + +static void route_set_path_rec_outbound(struct cma_work *work, + struct sa_path_rec *path_rec) +{ + struct rdma_route *route = &work->id->id.route; + + if (!route->path_rec_outbound) { + route->path_rec_outbound = + kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); + if (!route->path_rec_outbound) + return; + } + + *route->path_rec_outbound = *path_rec; +} + +static void cma_query_handler(int status, struct sa_path_rec *path_rec, + int num_prs, void *context) +{ + struct cma_work *work = context; + struct rdma_route *route; + int i; + + route = &work->id->id.route; + + if (status) + goto fail; + + for (i = 0; i < num_prs; i++) { + if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) + *route->path_rec = path_rec[i]; + else if (path_rec[i].flags & IB_PATH_INBOUND) + route_set_path_rec_inbound(work, &path_rec[i]); + else if (path_rec[i].flags & IB_PATH_OUTBOUND) + route_set_path_rec_outbound(work, &path_rec[i]); + } + if (!route->path_rec) { + status = -EINVAL; + goto fail; + } + + route->num_pri_alt_paths = 1; + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + work->event.status = status; + pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. 
status %d\n", + status); + queue_work(cma_wq, &work->work); +} + +static int cma_query_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms, struct cma_work *work) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct sa_path_rec path_rec; + ib_sa_comp_mask comp_mask; + struct sockaddr_in6 *sin6; + struct sockaddr_ib *sib; + + memset(&path_rec, 0, sizeof path_rec); + + if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) + path_rec.rec_type = SA_PATH_REC_TYPE_OPA; + else + path_rec.rec_type = SA_PATH_REC_TYPE_IB; + rdma_addr_get_sgid(dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(dev_addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + path_rec.numb_path = 1; + path_rec.reversible = 1; + path_rec.service_id = rdma_get_service_id(&id_priv->id, + cma_dst_addr(id_priv)); + + comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; + + switch (cma_family(id_priv)) { + case AF_INET: + path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); + comp_mask |= IB_SA_PATH_REC_QOS_CLASS; + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + case AF_IB: + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + } + + id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, + id_priv->id.port_num, &path_rec, + comp_mask, timeout_ms, 0, + GFP_KERNEL, cma_query_handler, + work, &id_priv->query); + + return (id_priv->query_id < 0) ? 
id_priv->query_id : 0; +} + +static void cma_iboe_join_work_handler(struct work_struct *work) +{ + struct cma_multicast *mc = + container_of(work, struct cma_multicast, iboe_join.work); + struct rdma_cm_event *event = &mc->iboe_join.event; + struct rdma_id_private *id_priv = mc->id_priv; + int ret; + + mutex_lock(&id_priv->handler_mutex); + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + + ret = cma_cm_event_handler(id_priv, event); + WARN_ON(ret); + +out_unlock: + mutex_unlock(&id_priv->handler_mutex); + if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&event->param.ud.ah_attr); +} + +static void cma_work_handler(struct work_struct *_work) +{ + struct cma_work *work = container_of(_work, struct cma_work, work); + struct rdma_id_private *id_priv = work->id; + + mutex_lock(&id_priv->handler_mutex); + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + if (work->old_state != 0 || work->new_state != 0) { + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out_unlock; + } + + if (cma_cm_event_handler(id_priv, &work->event)) { + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + goto out_free; + } + +out_unlock: + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); +out_free: + if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) + rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); + kfree(work); +} + +static void cma_init_resolve_route_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +} + +static void enqueue_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + /* Balances with cma_id_put() in cma_work_handler */ + cma_id_get(id_priv); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + + queue_work(cma_wq, &work->work); +} + +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms) +{ + struct rdma_route *route = &id_priv->id.route; + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_init_resolve_route_work(work, id_priv); + + if (!route->path_rec) + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + ret = cma_query_ib_route(id_priv, timeout_ms, work); + if (ret) + goto err2; + + return 0; +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + +/* + * cma_iboe_set_path_rec_l2_fields() is helper function which sets + * path record type based on GID type. + * It also sets up other L2 fields which includes destination mac address + * netdev ifindex, of the path record. 
+ * It returns the netdev of the bound interface for this path record entry. + */ +static struct net_device * +cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; + struct rdma_addr *addr = &route->addr; + unsigned long supported_gids; + struct net_device *ndev; + + if (!addr->dev_addr.bound_dev_if) + return NULL; + + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); + if (!ndev) + return NULL; + + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + /* Use the hint from IP Stack to select GID Type */ + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + + route->path_rec->roce.route_resolved = true; + sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); + return ndev; +} + +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec) +{ + struct rdma_id_private *id_priv; + struct net_device *ndev; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_RESOLVED)) + return -EINVAL; + + id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), + GFP_KERNEL); + if (!id->route.path_rec) { + ret = -ENOMEM; + goto err; + } + + if (rdma_protocol_roce(id->device, id->port_num)) { + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err_free; + } + dev_put(ndev); + } + + id->route.num_pri_alt_paths = 1; + return 0; + +err_free: + kfree(id->route.path_rec); + id->route.path_rec = NULL; +err: + cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); + return ret; +} +EXPORT_SYMBOL(rdma_set_ib_path); + +static int cma_resolve_iw_route(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_init_resolve_route_work(work, id_priv); + queue_work(cma_wq, &work->work); + return 0; +} + +static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) +{ + struct net_device *dev; + + dev = vlan_dev_real_dev(vlan_ndev); + if (dev->num_tc) + return netdev_get_prio_tc_map(dev, prio); + + return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +} + +struct iboe_prio_tc_map { + int input_prio; + int output_tc; + bool found; +}; + +static int get_lower_vlan_dev_tc(struct net_device *dev, + struct netdev_nested_priv *priv) +{ + struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; + + if (is_vlan_dev(dev)) + map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); + else if (dev->num_tc) + map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); + else + map->output_tc = 0; + /* We are interested only in first level VLAN device, so always + * return 1 to stop iterating over next level devices. 
+ */ + map->found = true; + return 1; +} + +static int iboe_tos_to_sl(struct net_device *ndev, int tos) +{ + struct iboe_prio_tc_map prio_tc_map = {}; + int prio = rt_tos2priority(tos); + struct netdev_nested_priv priv; + + /* If VLAN device, get it directly from the VLAN netdev */ + if (is_vlan_dev(ndev)) + return get_vlan_ndev_tc(ndev, prio); + + prio_tc_map.input_prio = prio; + priv.data = (void *)&prio_tc_map; + rcu_read_lock(); + netdev_walk_all_lower_dev_rcu(ndev, + get_lower_vlan_dev_tc, + &priv); + rcu_read_unlock(); + /* If map is found from lower device, use it; Otherwise + * continue with the current netdevice to get priority to tc map. + */ + if (prio_tc_map.found) + return prio_tc_map.output_tc; + else if (ndev->num_tc) + return netdev_get_prio_tc_map(ndev, prio); + else + return 0; +} + +static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) +{ + struct sockaddr_in6 *addr6; + u16 dport, sport; + u32 hash, fl; + + addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); + fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; + if ((cma_family(id_priv) != AF_INET6) || !fl) { + dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); + sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); + hash = (u32)sport * 31 + dport; + fl = hash & IB_GRH_FLOWLABEL_MASK; + } + + return cpu_to_be32(fl); +} + +static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + struct rdma_addr *addr = &route->addr; + struct cma_work *work; + int ret; + struct net_device *ndev; + + u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + u8 tos; + + mutex_lock(&id_priv->qp_mutex); + tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; + mutex_unlock(&id_priv->qp_mutex); + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + route->num_pri_alt_paths = 1; + + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &route->path_rec->sgid); + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, + &route->path_rec->dgid); + + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) + /* TODO: get the hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else + route->path_rec->hop_limit = 1; + route->path_rec->reversible = 1; + route->path_rec->pkey = cpu_to_be16(0xffff); + route->path_rec->mtu_selector = IB_SA_EQ; + route->path_rec->sl = iboe_tos_to_sl(ndev, tos); + route->path_rec->traffic_class = tos; + route->path_rec->mtu = iboe_get_mtu(ndev->mtu); + route->path_rec->rate_selector = IB_SA_EQ; + route->path_rec->rate = iboe_get_rate(ndev); + dev_put(ndev); + route->path_rec->packet_life_time_selector = IB_SA_EQ; + /* In case ACK timeout is set, use this value to calculate + * PacketLifeTime. As per IBTA 12.7.34, + * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). + * Assuming a negligible local ACK delay, we can use + * PacketLifeTime = local ACK timeout/2 + * as a reasonable approximation for RoCE networks. 
+ */ + mutex_lock(&id_priv->qp_mutex); + if (id_priv->timeout_set && id_priv->timeout) + route->path_rec->packet_life_time = id_priv->timeout - 1; + else + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + mutex_unlock(&id_priv->qp_mutex); + + if (!route->path_rec->mtu) { + ret = -EINVAL; + goto err2; + } + + if (rdma_protocol_roce_udp_encap(id_priv->id.device, + id_priv->id.port_num)) + route->path_rec->flow_label = + cma_get_roce_udp_flow_label(id_priv); + + cma_init_resolve_route_work(work, id_priv); + queue_work(cma_wq, &work->work); + + return 0; + +err2: + kfree(route->path_rec); + route->path_rec = NULL; + route->num_pri_alt_paths = 0; +err1: + kfree(work); + return ret; +} + +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + if (!timeout_ms) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + return -EINVAL; + + cma_id_get(id_priv); + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_route(id_priv, timeout_ms); + else if (rdma_protocol_roce(id->device, id->port_num)) { + ret = cma_resolve_iboe_route(id_priv); + if (!ret) + cma_add_id_to_tree(id_priv); + } + else if (rdma_protocol_iwarp(id->device, id->port_num)) + ret = cma_resolve_iw_route(id_priv); + else + ret = -ENOSYS; + + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); + cma_id_put(id_priv); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_route); + +static void cma_set_loopback(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, + 0, 0, 0, htonl(1)); + break; + default: + ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, + 0, 0, 0, htonl(1)); + break; + } +} + +static int cma_bind_loopback(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + union ib_gid gid; + enum ib_port_state port_state; + unsigned int p; + u16 pkey; + int ret; + + cma_dev = NULL; + mutex_lock(&lock); + list_for_each_entry(cur_dev, &dev_list, list) { + if (cma_family(id_priv) == AF_IB && + !rdma_cap_ib_cm(cur_dev->device, 1)) + continue; + + if (!cma_dev) + cma_dev = cur_dev; + + rdma_for_each_port (cur_dev->device, p) { + if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && + port_state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + goto port_found; + } + } + } + + if (!cma_dev) { + ret = -ENODEV; + goto out; + } + + p = 1; + +port_found: + ret = rdma_query_gid(cma_dev->device, p, 0, &gid); + if (ret) + goto out; + + ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); + if (ret) + goto out; + + id_priv->id.route.addr.dev_addr.dev_type = + (rdma_protocol_ib(cma_dev->device, p)) ? 
+ ARPHRD_INFINIBAND : ARPHRD_ETHER; + + rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); + id_priv->id.port_num = p; + cma_attach_to_dev(id_priv, cma_dev); + rdma_restrack_add(&id_priv->res); + cma_set_loopback(cma_src_addr(id_priv)); +out: + mutex_unlock(&lock); + return ret; +} + +static void addr_handler(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *dev_addr, void *context) +{ + struct rdma_id_private *id_priv = context; + struct rdma_cm_event event = {}; + struct sockaddr *addr; + struct sockaddr_storage old_addr; + + mutex_lock(&id_priv->handler_mutex); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED)) + goto out; + + /* + * Store the previous src address, so that if we fail to acquire + * matching rdma device, old address can be restored back, which helps + * to cancel the cma listen operation correctly. + */ + addr = cma_src_addr(id_priv); + memcpy(&old_addr, addr, rdma_addr_size(addr)); + memcpy(addr, src_addr, rdma_addr_size(src_addr)); + if (!status && !id_priv->cma_dev) { + status = cma_acquire_dev_by_src_ip(id_priv); + if (status) + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", + status); + rdma_restrack_add(&id_priv->res); + } else if (status) { + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); + } + + if (status) { + memcpy(addr, &old_addr, + rdma_addr_size((struct sockaddr *)&old_addr)); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ADDR_BOUND)) + goto out; + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = status; + } else + event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + + if (cma_cm_event_handler(id_priv, &event)) { + destroy_id_handler_unlock(id_priv); + return; + } +out: + mutex_unlock(&id_priv->handler_mutex); +} + +static int cma_resolve_loopback(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + union ib_gid gid; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_bind_loopback(id_priv); + if (ret) + goto err; + } + + rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); + + enqueue_resolve_addr_work(work, id_priv); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_resolve_ib_dev(id_priv); + if (ret) + goto err; + } + + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) + &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); + + enqueue_resolve_addr_work(work, id_priv); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + struct sockaddr_storage zero_sock = {}; + + if (src_addr && src_addr->sa_family) + return rdma_bind_addr(id, src_addr); + + /* + * When the src_addr is not specified, automatically supply an any addr + */ + zero_sock.ss_family = dst_addr->sa_family; + if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&zero_sock; + struct sockaddr_in6 *dst_addr6 = + (struct sockaddr_in6 *)dst_addr; + + src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; + if 
(ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + id->route.addr.dev_addr.bound_dev_if = + dst_addr6->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *)&zero_sock)->sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; + } + return rdma_bind_addr(id, (struct sockaddr *)&zero_sock); +} + +/* + * If required, resolve the source address for bind and leave the id_priv in + * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior + * calls made by ULP, a previously bound ID will not be re-bound and src_addr is + * ignored. + */ +static int resolve_prepare_src(struct rdma_id_private *id_priv, + struct sockaddr *src_addr, + const struct sockaddr *dst_addr) +{ + int ret; + + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); + if (ret) + goto err_dst; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_ADDR_QUERY))) { + ret = -EINVAL; + goto err_dst; + } + } + + if (cma_family(id_priv) != dst_addr->sa_family) { + ret = -EINVAL; + goto err_state; + } + return 0; + +err_state: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); +err_dst: + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + return ret; +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + const struct sockaddr *dst_addr, unsigned long timeout_ms) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + ret = resolve_prepare_src(id_priv, src_addr, dst_addr); + if (ret) + return ret; + + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + /* + * The FSM can return back to RDMA_CM_ADDR_BOUND after + * rdma_resolve_ip() is called, eg through the error + * path in addr_handler(). If this happens the existing + * request must be canceled before issuing a new one. + * Since canceling a request is a bit slow and this + * oddball path is rare, keep track once a request has + * been issued. The track turns out to be a permanent + * state since this is the only cancel as it is + * immediately before rdma_resolve_ip(). 
+ */ + if (id_priv->used_resolve_ip) + rdma_addr_cancel(&id->route.addr.dev_addr); + else + id_priv->used_resolve_ip = 1; + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); + } + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if ((reuse && id_priv->state != RDMA_CM_LISTEN) || + id_priv->state == RDMA_CM_IDLE) { + id_priv->reuseaddr = reuse; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_reuseaddr); + +int rdma_set_afonly(struct rdma_cm_id *id, int afonly) +{ + struct rdma_id_private *id_priv; + unsigned long flags; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irqsave(&id_priv->lock, flags); + if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { + id_priv->options |= (1 << CMA_OPTION_AFONLY); + id_priv->afonly = afonly; + ret = 0; + } else { + ret = -EINVAL; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(rdma_set_afonly); + +static void cma_bind_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct sockaddr *addr; + struct sockaddr_ib *sib; + u64 sid, mask; + __be16 port; + + lockdep_assert_held(&lock); + + addr = cma_src_addr(id_priv); + port = htons(bind_list->port); + + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_port = port; + break; + case AF_INET6: + ((struct sockaddr_in6 *) addr)->sin6_port = port; + break; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + sid = be64_to_cpu(sib->sib_sid); + mask = be64_to_cpu(sib->sib_sid_mask); + sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); + sib->sib_sid_mask = cpu_to_be64(~0ULL); + break; + } + id_priv->bind_list = bind_list; + hlist_add_head(&id_priv->node, &bind_list->owners); +} + +static int cma_alloc_port(enum rdma_ucm_port_space ps, + struct rdma_id_private *id_priv, unsigned short snum) +{ + struct rdma_bind_list *bind_list; + int ret; + + lockdep_assert_held(&lock); + + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); + if (!bind_list) + return -ENOMEM; + + ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, + snum); + if (ret < 0) + goto err; + + bind_list->ps = ps; + bind_list->port = snum; + cma_bind_port(bind_list, id_priv); + return 0; +err: + kfree(bind_list); + return ret == -ENOSPC ? 
-EADDRNOTAVAIL : ret; +} + +static int cma_port_is_unique(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct rdma_id_private *cur_id; + struct sockaddr *daddr = cma_dst_addr(id_priv); + struct sockaddr *saddr = cma_src_addr(id_priv); + __be16 dport = cma_port(daddr); + + lockdep_assert_held(&lock); + + hlist_for_each_entry(cur_id, &bind_list->owners, node) { + struct sockaddr *cur_daddr = cma_dst_addr(cur_id); + struct sockaddr *cur_saddr = cma_src_addr(cur_id); + __be16 cur_dport = cma_port(cur_daddr); + + if (id_priv == cur_id) + continue; + + /* different dest port -> unique */ + if (!cma_any_port(daddr) && + !cma_any_port(cur_daddr) && + (dport != cur_dport)) + continue; + + /* different src address -> unique */ + if (!cma_any_addr(saddr) && + !cma_any_addr(cur_saddr) && + cma_addr_cmp(saddr, cur_saddr)) + continue; + + /* different dst address -> unique */ + if (!cma_any_addr(daddr) && + !cma_any_addr(cur_daddr) && + cma_addr_cmp(daddr, cur_daddr)) + continue; + + return -EADDRNOTAVAIL; + } + return 0; +} + +static int cma_alloc_any_port(enum rdma_ucm_port_space ps, + struct rdma_id_private *id_priv) +{ + static unsigned int last_used_port; + int low, high, remaining; + unsigned int rover; + struct net *net = id_priv->id.route.addr.dev_addr.net; + + lockdep_assert_held(&lock); + + inet_get_local_port_range(net, &low, &high); + remaining = (high - low) + 1; + rover = prandom_u32() % remaining + low; +retry: + if (last_used_port != rover) { + struct rdma_bind_list *bind_list; + int ret; + + bind_list = cma_ps_find(net, ps, (unsigned short)rover); + + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, rover); + } else { + ret = cma_port_is_unique(bind_list, id_priv); + if (!ret) + cma_bind_port(bind_list, id_priv); + } + /* + * Remember previously used port number in order to avoid + * re-using same port immediately after it is closed. + */ + if (!ret) + last_used_port = rover; + if (ret != -EADDRNOTAVAIL) + return ret; + } + if (--remaining) { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + goto retry; + } + return -EADDRNOTAVAIL; +} + +/* + * Check that the requested port is available. This is called when trying to + * bind to a specific port, or when trying to listen on a bound port. In + * the latter case, the provided id_priv may already be on the bind_list, but + * we still need to check that it's okay to start listening. 
+ */ +static int cma_check_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv, uint8_t reuseaddr) +{ + struct rdma_id_private *cur_id; + struct sockaddr *addr, *cur_addr; + + lockdep_assert_held(&lock); + + addr = cma_src_addr(id_priv); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { + if (id_priv == cur_id) + continue; + + if (reuseaddr && cur_id->reuseaddr) + continue; + + cur_addr = cma_src_addr(cur_id); + if (id_priv->afonly && cur_id->afonly && + (addr->sa_family != cur_addr->sa_family)) + continue; + + if (cma_any_addr(addr) || cma_any_addr(cur_addr)) + return -EADDRNOTAVAIL; + + if (!cma_addr_cmp(addr, cur_addr)) + return -EADDRINUSE; + } + return 0; +} + +static int cma_use_port(enum rdma_ucm_port_space ps, + struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list; + unsigned short snum; + int ret; + + lockdep_assert_held(&lock); + + snum = ntohs(cma_port(cma_src_addr(id_priv))); + if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, snum); + } else { + ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); + if (!ret) + cma_bind_port(bind_list, id_priv); + } + return ret; +} + +static enum rdma_ucm_port_space +cma_select_inet_ps(struct rdma_id_private *id_priv) +{ + switch (id_priv->id.ps) { + case RDMA_PS_TCP: + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + case RDMA_PS_IB: + return id_priv->id.ps; + default: + + return 0; + } +} + +static enum rdma_ucm_port_space +cma_select_ib_ps(struct rdma_id_private *id_priv) +{ + enum rdma_ucm_port_space ps = 0; + struct sockaddr_ib *sib; + u64 sid_ps, mask, sid; + + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; + sid = be64_to_cpu(sib->sib_sid) & mask; + + if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { + sid_ps = RDMA_IB_IP_PS_IB; + ps = RDMA_PS_IB; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && + (sid == (RDMA_IB_IP_PS_TCP & mask))) { + sid_ps = RDMA_IB_IP_PS_TCP; + ps = RDMA_PS_TCP; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && + (sid == (RDMA_IB_IP_PS_UDP & mask))) { + sid_ps = RDMA_IB_IP_PS_UDP; + ps = RDMA_PS_UDP; + } + + if (ps) { + sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); + sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | + be64_to_cpu(sib->sib_sid_mask)); + } + return ps; +} + +static int cma_get_port(struct rdma_id_private *id_priv) +{ + enum rdma_ucm_port_space ps; + int ret; + + if (cma_family(id_priv) != AF_IB) + ps = cma_select_inet_ps(id_priv); + else + ps = cma_select_ib_ps(id_priv); + if (!ps) + return -EPROTONOSUPPORT; + + mutex_lock(&lock); + if (cma_any_port(cma_src_addr(id_priv))) + ret = cma_alloc_any_port(ps, id_priv); + else + ret = cma_use_port(ps, id_priv); + mutex_unlock(&lock); + + return ret; +} + +static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, + struct sockaddr *addr) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *sin6; + + if (addr->sa_family != AF_INET6) + return 0; + + sin6 = (struct sockaddr_in6 *) addr; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; + + if (!sin6->sin6_scope_id) + return -EINVAL; + + dev_addr->bound_dev_if = sin6->sin6_scope_id; +#endif + return 0; +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct 
rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { + struct sockaddr_in any_in = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + + /* For a well behaved ULP state will be RDMA_CM_IDLE */ + ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); + if (ret) + return ret; + if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN))) + return -EINVAL; + } + + /* + * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable + * any more, and has to be unique in the bind list. + */ + if (id_priv->reuseaddr) { + mutex_lock(&lock); + ret = cma_check_port(id_priv->bind_list, id_priv, 0); + if (!ret) + id_priv->reuseaddr = 0; + mutex_unlock(&lock); + if (ret) + goto err; + } + + id_priv->backlog = backlog; + if (id_priv->cma_dev) { + if (rdma_cap_ib_cm(id->device, 1)) { + ret = cma_ib_listen(id_priv); + if (ret) + goto err; + } else if (rdma_cap_iw_cm(id->device, 1)) { + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; + } else { + ret = -ENOSYS; + goto err; + } + } else { + ret = cma_listen_on_all(id_priv); + if (ret) + goto err; + } + + return 0; +err: + id_priv->backlog = 0; + /* + * All the failure paths that lead here will not allow the req_handler's + * to have run. + */ + cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_listen); + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + int ret; + struct sockaddr *daddr; + + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && + addr->sa_family != AF_IB) + return -EAFNOSUPPORT; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) + return -EINVAL; + + ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); + if (ret) + goto err1; + + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); + if (!cma_any_addr(addr)) { + ret = cma_translate_addr(addr, &id->route.addr.dev_addr); + if (ret) + goto err1; + + ret = cma_acquire_dev_by_src_ip(id_priv); + if (ret) + goto err1; + } + + if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { + if (addr->sa_family == AF_INET) + id_priv->afonly = 1; +#if IS_ENABLED(CONFIG_IPV6) + else if (addr->sa_family == AF_INET6) { + struct net *net = id_priv->id.route.addr.dev_addr.net; + + id_priv->afonly = net->ipv6.sysctl.bindv6only; + } +#endif + } + daddr = cma_dst_addr(id_priv); + daddr->sa_family = addr->sa_family; + + ret = cma_get_port(id_priv); + if (ret) + goto err2; + + if (!cma_any_addr(addr)) + rdma_restrack_add(&id_priv->res); + return 0; +err2: + if (id_priv->cma_dev) + cma_release_dev(id_priv); +err1: + cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); + return ret; +} +EXPORT_SYMBOL(rdma_bind_addr); + +static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) +{ + struct cma_hdr *cma_hdr; + + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + if (cma_family(id_priv) == AF_INET) { + struct sockaddr_in *src4, *dst4; + + src4 = (struct sockaddr_in *) cma_src_addr(id_priv); + dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + } else if (cma_family(id_priv) == AF_INET6) { + struct sockaddr_in6 *src6, *dst6; + + src6 = (struct sockaddr_in6 *) 
cma_src_addr(id_priv); + dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); + + cma_set_ip_ver(cma_hdr, 6); + cma_hdr->src_addr.ip6 = src6->sin6_addr; + cma_hdr->dst_addr.ip6 = dst6->sin6_addr; + cma_hdr->port = src6->sin6_port; + } + return 0; +} + +static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event = {}; + const struct ib_cm_sidr_rep_event_param *rep = + &ib_event->param.sidr_rep_rcvd; + int ret; + + mutex_lock(&id_priv->handler_mutex); + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) + goto out; + + switch (ib_event->event) { + case IB_CM_SIDR_REQ_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + break; + case IB_CM_SIDR_REP_RECEIVED: + event.param.ud.private_data = ib_event->private_data; + event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; + if (rep->status != IB_SIDR_SUCCESS) { + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = ib_event->param.sidr_rep_rcvd.status; + pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", + event.status); + break; + } + ret = cma_set_qkey(id_priv, rep->qkey); + if (ret) { + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = ret; + break; + } + ib_init_ah_attr_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr, + rep->sgid_attr); + event.param.ud.qp_num = rep->qpn; + event.param.ud.qkey = rep->qkey; + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.status = 0; + break; + default: + pr_err("RDMA CMA: unexpected IB CM event: %d\n", + ib_event->event); + goto out; + } + + ret = cma_cm_event_handler(id_priv, &event); + + rdma_destroy_ah_attr(&event.param.ud.ah_attr); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. 
*/ + id_priv->cm_id.ib = NULL; + destroy_id_handler_unlock(id_priv); + return ret; + } +out: + mutex_unlock(&id_priv->handler_mutex); + return 0; +} + +static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_sidr_req_param req; + struct ib_cm_id *id; + void *private_data; + u8 offset; + int ret; + + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) + return -EINVAL; + + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } + + id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, + id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); + goto out; + } + id_priv->cm_id.ib = id; + + req.path = id_priv->id.route.path_rec; + req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); + req.max_cm_retries = CMA_MAX_CM_RETRIES; + + trace_cm_send_sidr_req(id_priv); + ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } +out: + kfree(private_data); + return ret; +} + +static int cma_connect_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_req_param req; + struct rdma_route *route; + void *private_data; + struct ib_cm_id *id; + u8 offset; + int ret; + + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) + return -EINVAL; + + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); + goto out; + } + id_priv->cm_id.ib = id; + + route = &id_priv->id.route; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } + + req.primary_path = &route->path_rec[0]; + req.primary_path_inbound = route->path_rec_inbound; + req.primary_path_outbound = route->path_rec_outbound; + if (route->num_pri_alt_paths == 2) + req.alternate_path = &route->path_rec[1]; + + req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; + /* Alternate path SGID attribute currently unsupported */ + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + req.qp_num = id_priv->qp_num; + req.qp_type = id_priv->id.qp_type; + req.starting_psn = id_priv->seq_num; + req.responder_resources = conn_param->responder_resources; + req.initiator_depth = conn_param->initiator_depth; + req.flow_control = conn_param->flow_control; + req.retry_count = min_t(u8, 7, conn_param->retry_count); + req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); + 
req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.max_cm_retries = CMA_MAX_CM_RETRIES; + req.srq = id_priv->srq ? 1 : 0; + req.ece.vendor_id = id_priv->ece.vendor_id; + req.ece.attr_mod = id_priv->ece.attr_mod; + + trace_cm_send_req(id_priv); + ret = ib_send_cm_req(id_priv->cm_id.ib, &req); +out: + if (ret && !IS_ERR(id)) { + ib_destroy_cm_id(id); + id_priv->cm_id.ib = NULL; + } + + kfree(private_data); + return ret; +} + +static int cma_connect_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_id *cm_id; + int ret; + struct iw_cm_conn_param iw_param; + + cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); + if (IS_ERR(cm_id)) + return PTR_ERR(cm_id); + + mutex_lock(&id_priv->qp_mutex); + cm_id->tos = id_priv->tos; + cm_id->tos_set = id_priv->tos_set; + mutex_unlock(&id_priv->qp_mutex); + + id_priv->cm_id.iw = cm_id; + + memcpy(&cm_id->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), + rdma_addr_size(cma_dst_addr(id_priv))); + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + goto out; + + if (conn_param) { + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; + } else { + memset(&iw_param, 0, sizeof iw_param); + iw_param.qpn = id_priv->qp_num; + } + ret = iw_cm_connect(cm_id, &iw_param); +out: + if (ret) { + iw_destroy_cm_id(cm_id); + id_priv->cm_id.iw = NULL; + } + return ret; +} + +/** + * rdma_connect_locked - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Same as rdma_connect() but can only be called from the + * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. + */ +int rdma_connect_locked(struct rdma_cm_id *id, + struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) + return -EINVAL; + + if (!id->qp) { + id_priv->qp_num = conn_param->qp_num; + id_priv->srq = conn_param->srq; + } + + if (rdma_cap_ib_cm(id->device, id->port_num)) { + if (id->qp_type == IB_QPT_UD) + ret = cma_resolve_ib_udp(id_priv, conn_param); + else + ret = cma_connect_ib(id_priv, conn_param); + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + ret = cma_connect_iw(id_priv, conn_param); + } else { + ret = -ENOSYS; + } + if (ret) + goto err_state; + return 0; +err_state: + cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); + return ret; +} +EXPORT_SYMBOL(rdma_connect_locked); + +/** + * rdma_connect - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Users must have resolved a route for the rdma_cm_id to connect with by having + * called rdma_resolve_route before calling this routine. + * + * This call will either connect to a remote QP or obtain remote QP information + * for unconnected rdma_cm_id's. The actual operation is based on the + * rdma_cm_id's port space. 
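+ *
+ * Illustrative caller sequence (a sketch only, not part of the original
+ * source; dst_addr, conn_param and the 2000 ms timeouts are placeholders):
+ *
+ *	rdma_resolve_addr(id, NULL, dst_addr, 2000);
+ *	... wait for RDMA_CM_EVENT_ADDR_RESOLVED ...
+ *	rdma_resolve_route(id, 2000);
+ *	... wait for RDMA_CM_EVENT_ROUTE_RESOLVED ...
+ *	rdma_connect(id, &conn_param);
+ *	... wait for RDMA_CM_EVENT_ESTABLISHED ...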
+ */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + mutex_lock(&id_priv->handler_mutex); + ret = rdma_connect_locked(id, conn_param); + mutex_unlock(&id_priv->handler_mutex); + return ret; +} +EXPORT_SYMBOL(rdma_connect); + +/** + * rdma_connect_ece - Initiate an active connection request with ECE data. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * @ece: ECE parameters + * + * See rdma_connect() explanation. + */ +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_connect(id, conn_param); +} +EXPORT_SYMBOL(rdma_connect_ece); + +static int cma_accept_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_rep_param rep; + int ret; + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + goto out; + + ret = cma_modify_qp_rts(id_priv, conn_param); + if (ret) + goto out; + + memset(&rep, 0, sizeof rep); + rep.qp_num = id_priv->qp_num; + rep.starting_psn = id_priv->seq_num; + rep.private_data = conn_param->private_data; + rep.private_data_len = conn_param->private_data_len; + rep.responder_resources = conn_param->responder_resources; + rep.initiator_depth = conn_param->initiator_depth; + rep.failover_accepted = 0; + rep.flow_control = conn_param->flow_control; + rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); + rep.srq = id_priv->srq ? 1 : 0; + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; + + trace_cm_send_rep(id_priv); + ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); +out: + return ret; +} + +static int cma_accept_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_conn_param iw_param; + int ret; + + if (!conn_param) + return -EINVAL; + + ret = cma_modify_qp_rtr(id_priv, conn_param); + if (ret) + return ret; + + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + if (id_priv->id.qp) + iw_param.qpn = id_priv->qp_num; + else + iw_param.qpn = conn_param->qp_num; + + return iw_cm_accept(id_priv->cm_id.iw, &iw_param); +} + +static int cma_send_sidr_rep(struct rdma_id_private *id_priv, + enum ib_cm_sidr_status status, u32 qkey, + const void *private_data, int private_data_len) +{ + struct ib_cm_sidr_rep_param rep; + int ret; + + memset(&rep, 0, sizeof rep); + rep.status = status; + if (status == IB_SIDR_SUCCESS) { + ret = cma_set_qkey(id_priv, qkey); + if (ret) + return ret; + rep.qp_num = id_priv->qp_num; + rep.qkey = id_priv->qkey; + + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; + } + + rep.private_data = private_data; + rep.private_data_len = private_data_len; + + trace_cm_send_sidr_rep(id_priv); + return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); +} + +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. 
If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + * + * This function is for use by kernel ULPs and must be called from under the + * handler callback. + */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + int ret; + + lockdep_assert_held(&id_priv->handler_mutex); + + if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) + return -EINVAL; + + if (!id->qp && conn_param) { + id_priv->qp_num = conn_param->qp_num; + id_priv->srq = conn_param->srq; + } + + if (rdma_cap_ib_cm(id->device, id->port_num)) { + if (id->qp_type == IB_QPT_UD) { + if (conn_param) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->qkey, + conn_param->private_data, + conn_param->private_data_len); + else + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + 0, NULL, 0); + } else { + if (conn_param) + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); + } + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + ret = cma_accept_iw(id_priv, conn_param); + } else { + ret = -ENOSYS; + } + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(id_priv); + rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); + return ret; +} +EXPORT_SYMBOL(rdma_accept); + +int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_accept(id, conn_param); +} +EXPORT_SYMBOL(rdma_accept_ece); + +void rdma_lock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_lock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_lock_handler); + +void rdma_unlock_handler(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + mutex_unlock(&id_priv->handler_mutex); +} +EXPORT_SYMBOL(rdma_unlock_handler); + +int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + switch (id->device->node_type) { + case RDMA_NODE_IB_CA: + ret = ib_cm_notify(id_priv->cm_id.ib, event); + break; + default: + ret = 0; + break; + } + return ret; +} +EXPORT_SYMBOL(rdma_notify); + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len, u8 reason) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + if (rdma_cap_ib_cm(id->device, id->port_num)) { + if (id->qp_type == IB_QPT_UD) { + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, + private_data, private_data_len); + } else { + trace_cm_send_rej(id_priv); + ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, + private_data, private_data_len); + } + } else if 
(rdma_cap_iw_cm(id->device, id->port_num)) { + ret = iw_cm_reject(id_priv->cm_id.iw, + private_data, private_data_len); + } else { + ret = -ENOSYS; + } + + return ret; +} +EXPORT_SYMBOL(rdma_reject); + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cm_id.ib) + return -EINVAL; + + if (rdma_cap_ib_cm(id->device, id->port_num)) { + ret = cma_modify_qp_err(id_priv); + if (ret) + goto out; + /* Initiate or respond to a disconnect. */ + trace_cm_disconnect(id_priv); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { + if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) + trace_cm_sent_drep(id_priv); + } else { + trace_cm_sent_dreq(id_priv); + } + } else if (rdma_cap_iw_cm(id->device, id->port_num)) { + ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); + } else + ret = -EINVAL; + +out: + return ret; +} +EXPORT_SYMBOL(rdma_disconnect); + +static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, + struct ib_sa_multicast *multicast, + struct rdma_cm_event *event, + struct cma_multicast *mc) +{ + struct rdma_dev_addr *dev_addr; + enum ib_gid_type gid_type; + struct net_device *ndev; + + if (!status) + status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + else + pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", + status); + + event->status = status; + event->param.ud.private_data = mc->context; + if (status) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + return; + } + + dev_addr = &id_priv->id.route.addr.dev_addr; + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + gid_type = + id_priv->cma_dev + ->default_gid_type[id_priv->id.port_num - + rdma_start_port( + id_priv->cma_dev->device)]; + + event->event = RDMA_CM_EVENT_MULTICAST_JOIN; + if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, + &multicast->rec, ndev, gid_type, + &event->param.ud.ah_attr)) { + event->event = RDMA_CM_EVENT_MULTICAST_ERROR; + goto out; + } + + event->param.ud.qp_num = 0xFFFFFF; + event->param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + +out: + if (ndev) + dev_put(ndev); +} + +static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +{ + struct cma_multicast *mc = multicast->context; + struct rdma_id_private *id_priv = mc->id_priv; + struct rdma_cm_event event = {}; + int ret = 0; + + mutex_lock(&id_priv->handler_mutex); + if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || + READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) + goto out; + + cma_make_mc_event(status, id_priv, multicast, &event, mc); + ret = cma_cm_event_handler(id_priv, &event); + rdma_destroy_ah_attr(&event.param.ud.ah_attr); + WARN_ON(ret); + +out: + mutex_unlock(&id_priv->handler_mutex); + return 0; +} + +static void cma_set_mgid(struct rdma_id_private *id_priv, + struct sockaddr *addr, union ib_gid *mgid) +{ + unsigned char mc_map[MAX_ADDR_LEN]; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct sockaddr_in *sin = (struct sockaddr_in *) addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6) && + ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == + 0xFF10A01B)) { + /* IPv6 address is an SA assigned MGID. 
*/ + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if (addr->sa_family == AF_IB) { + memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); + } else if (addr->sa_family == AF_INET6) { + ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); + } else { + ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + *mgid = *(union ib_gid *) (mc_map + 4); + } +} + +static int cma_join_ib_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct ib_sa_mcmember_rec rec; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + ib_sa_comp_mask comp_mask; + int ret; + + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, + &rec.mgid, &rec); + if (ret) + return ret; + + ret = cma_set_qkey(id_priv, 0); + if (ret) + return ret; + + cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); + rec.qkey = cpu_to_be32(id_priv->qkey); + rdma_addr_get_sgid(dev_addr, &rec.port_gid); + rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + rec.join_state = mc->join_state; + + comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | + IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; + + if (id_priv->id.ps == RDMA_PS_IPOIB) + comp_mask |= IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, comp_mask, + GFP_KERNEL, cma_ib_mc_handler, mc); + return PTR_ERR_OR_ZERO(mc->sa_mc); +} + +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if (addr->sa_family == AF_INET6) { + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else { + mgid->raw[0] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; + mgid->raw[1] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 
0 : 0x0e; + mgid->raw[2] = 0; + mgid->raw[3] = 0; + mgid->raw[4] = 0; + mgid->raw[5] = 0; + mgid->raw[6] = 0; + mgid->raw[7] = 0; + mgid->raw[8] = 0; + mgid->raw[9] = 0; + mgid->raw[10] = 0xff; + mgid->raw[11] = 0xff; + *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; + } +} + +static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int err = 0; + struct sockaddr *addr = (struct sockaddr *)&mc->addr; + struct net_device *ndev = NULL; + struct ib_sa_multicast ib; + enum ib_gid_type gid_type; + bool send_only; + + send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); + + if (cma_zero_addr(addr)) + return -EINVAL; + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); + + ib.rec.pkey = cpu_to_be16(0xffff); + if (id_priv->id.ps == RDMA_PS_UDP) + ib.rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!ndev) + return -ENODEV; + + ib.rec.rate = iboe_get_rate(ndev); + ib.rec.hop_limit = 1; + ib.rec.mtu = iboe_get_mtu(ndev->mtu); + + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + if (!send_only) { + err = cma_igmp_send(ndev, &ib.rec.mgid, + true); + } + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } + dev_put(ndev); + if (err || !ib.rec.mtu) + return err ?: -EINVAL; + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &ib.rec.port_gid); + INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); + cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); + queue_work(cma_wq, &mc->iboe_join.work); + return 0; +} + +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + u8 join_state, void *context) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + struct cma_multicast *mc; + int ret; + + /* Not supported for kernel QPs */ + if (WARN_ON(id->qp)) + return -EINVAL; + + /* ULP is calling this wrong. 
*/ + if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && + READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) + return -EINVAL; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL); + if (!mc) + return -ENOMEM; + + memcpy(&mc->addr, addr, rdma_addr_size(addr)); + mc->context = context; + mc->id_priv = id_priv; + mc->join_state = join_state; + + if (rdma_protocol_roce(id->device, id->port_num)) { + ret = cma_iboe_join_multicast(id_priv, mc); + if (ret) + goto out_err; + } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { + ret = cma_join_ib_multicast(id_priv, mc); + if (ret) + goto out_err; + } else { + ret = -ENOSYS; + goto out_err; + } + + spin_lock(&id_priv->lock); + list_add(&mc->list, &id_priv->mc_list); + spin_unlock(&id_priv->lock); + + return 0; +out_err: + kfree(mc); + return ret; +} +EXPORT_SYMBOL(rdma_join_multicast); + +void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc; + + id_priv = container_of(id, struct rdma_id_private, id); + spin_lock_irq(&id_priv->lock); + list_for_each_entry(mc, &id_priv->mc_list, list) { + if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) + continue; + list_del(&mc->list); + spin_unlock_irq(&id_priv->lock); + + WARN_ON(id_priv->cma_dev->device != id->device); + destroy_mc(id_priv, mc); + return; + } + spin_unlock_irq(&id_priv->lock); +} +EXPORT_SYMBOL(rdma_leave_multicast); + +static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr; + struct cma_work *work; + + dev_addr = &id_priv->id.route.addr.dev_addr; + + if ((dev_addr->bound_dev_if == ndev->ifindex) && + (net_eq(dev_net(ndev), dev_addr->net)) && + memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { + pr_info("RDMA CM addr change for ndev %s used by id %p\n", + ndev->name, &id_priv->id); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + INIT_WORK(&work->work, cma_work_handler); + work->id = id_priv; + work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; + cma_id_get(id_priv); + queue_work(cma_wq, &work->work); + } + + return 0; +} + +static int cma_netdev_callback(struct notifier_block *self, unsigned long event, + void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + int ret = NOTIFY_DONE; + + if (event != NETDEV_BONDING_FAILOVER) + return NOTIFY_DONE; + + if (!netif_is_bond_master(ndev)) + return NOTIFY_DONE; + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) + list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { + ret = cma_netdev_change(ndev, id_priv); + if (ret) + goto out; + } + +out: + mutex_unlock(&lock); + return ret; +} + +static void cma_netevent_work_handler(struct work_struct *_work) +{ + struct cma_netevent_work *network = + container_of(_work, struct cma_netevent_work, work); + struct rdma_cm_event event = {}; + + mutex_lock(&network->id_priv->handler_mutex); + + if (READ_ONCE(network->id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(network->id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + + if (cma_cm_event_handler(network->id_priv, &event)) { + __acquire(&network->id_priv->handler_mutex); + network->id_priv->cm_id.ib = NULL; + cma_id_put(network->id_priv); + destroy_id_handler_unlock(network->id_priv); + kfree(network); + return; + } + +out_unlock: + 
mutex_unlock(&network->id_priv->handler_mutex); + cma_id_put(network->id_priv); + kfree(network); +} + +static int cma_netevent_callback(struct notifier_block *self, + unsigned long event, void *ctx) +{ + struct id_table_entry *ips_node = NULL; + struct rdma_id_private *current_id; + struct cma_netevent_work *network; + struct neighbour *neigh = ctx; + unsigned long flags; + + if (event != NETEVENT_NEIGH_UPDATE) + return NOTIFY_DONE; + + spin_lock_irqsave(&id_table_lock, flags); + if (neigh->tbl->family == AF_INET6) { + struct sockaddr_in6 neigh_sock_6; + + neigh_sock_6.sin6_family = AF_INET6; + neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_6); + } else if (neigh->tbl->family == AF_INET) { + struct sockaddr_in neigh_sock_4; + + neigh_sock_4.sin_family = AF_INET; + neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_4); + } else + goto out; + + if (!ips_node) + goto out; + + list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { + if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, + neigh->ha, ETH_ALEN)) + continue; + network = kzalloc(sizeof(*network), GFP_ATOMIC); + if (!network) + goto out; + + INIT_WORK(&network->work, cma_netevent_work_handler); + network->id_priv = current_id; + cma_id_get(current_id); + queue_work(cma_netevent_wq, &network->work); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); + return NOTIFY_DONE; +} + +static struct notifier_block cma_nb = { + .notifier_call = cma_netdev_callback +}; + +static struct notifier_block cma_netevent_cb = { + .notifier_call = cma_netevent_callback +}; + +static void cma_send_device_removal_put(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; + enum rdma_cm_state state; + unsigned long flags; + + mutex_lock(&id_priv->handler_mutex); + /* Record that we want to remove the device */ + spin_lock_irqsave(&id_priv->lock, flags); + state = id_priv->state; + if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { + spin_unlock_irqrestore(&id_priv->lock, flags); + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); + return; + } + id_priv->state = RDMA_CM_DEVICE_REMOVAL; + spin_unlock_irqrestore(&id_priv->lock, flags); + + if (cma_cm_event_handler(id_priv, &event)) { + /* + * At this point the ULP promises it won't call + * rdma_destroy_id() concurrently + */ + cma_id_put(id_priv); + mutex_unlock(&id_priv->handler_mutex); + trace_cm_id_destroy(id_priv); + _destroy_id(id_priv, state); + return; + } + mutex_unlock(&id_priv->handler_mutex); + + /* + * If this races with destroy then the thread that first assigns state + * to a destroying does the cancel. 
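+ *
+ * The cma_id_put() below drops the reference that cma_process_remove()
+ * took before calling into this function.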
+ */ + cma_cancel_operation(id_priv, state); + cma_id_put(id_priv); +} + +static void cma_process_remove(struct cma_device *cma_dev) +{ + mutex_lock(&lock); + while (!list_empty(&cma_dev->id_list)) { + struct rdma_id_private *id_priv = list_first_entry( + &cma_dev->id_list, struct rdma_id_private, device_item); + + list_del_init(&id_priv->listen_item); + list_del_init(&id_priv->device_item); + cma_id_get(id_priv); + mutex_unlock(&lock); + + cma_send_device_removal_put(id_priv); + + mutex_lock(&lock); + } + mutex_unlock(&lock); + + cma_dev_put(cma_dev); + wait_for_completion(&cma_dev->comp); +} + +static bool cma_supported(struct ib_device *device) +{ + u32 i; + + rdma_for_each_port(device, i) { + if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) + return true; + } + return false; +} + +static int cma_add_one(struct ib_device *device) +{ + struct rdma_id_private *to_destroy; + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + unsigned long supported_gids = 0; + int ret; + u32 i; + + if (!cma_supported(device)) + return -EOPNOTSUPP; + + cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); + if (!cma_dev) + return -ENOMEM; + + cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) { + ret = -ENOMEM; + goto free_cma_dev; + } + + cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_roce_tos), + GFP_KERNEL); + if (!cma_dev->default_roce_tos) { + ret = -ENOMEM; + goto free_gid_type; + } + + rdma_for_each_port (device, i) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) + cma_dev->default_gid_type[i - rdma_start_port(device)] = + CMA_PREFERRED_ROCE_GID_TYPE; + else + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; + } + + init_completion(&cma_dev->comp); + refcount_set(&cma_dev->refcount, 1); + INIT_LIST_HEAD(&cma_dev->id_list); + ib_set_client_data(device, &cma_client, cma_dev); + + mutex_lock(&lock); + list_add_tail(&cma_dev->list, &dev_list); + list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { + ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); + if (ret) + goto free_listen; + } + mutex_unlock(&lock); + + trace_cm_add_one(device); + return 0; + +free_listen: + list_del(&cma_dev->list); + mutex_unlock(&lock); + + /* cma_process_remove() will delete to_destroy */ + cma_process_remove(cma_dev); + kfree(cma_dev->default_roce_tos); +free_gid_type: + kfree(cma_dev->default_gid_type); + +free_cma_dev: + kfree(cma_dev); + return ret; +} + +static void cma_remove_one(struct ib_device *device, void *client_data) +{ + struct cma_device *cma_dev = client_data; + + trace_cm_remove_one(device); + + mutex_lock(&lock); + list_del(&cma_dev->list); + mutex_unlock(&lock); + + cma_process_remove(cma_dev); + kfree(cma_dev->default_roce_tos); + kfree(cma_dev->default_gid_type); + kfree(cma_dev); +} + +static int cma_init_net(struct net *net) +{ + struct cma_pernet *pernet = cma_pernet(net); + + xa_init(&pernet->tcp_ps); + xa_init(&pernet->udp_ps); + xa_init(&pernet->ipoib_ps); + xa_init(&pernet->ib_ps); + + return 0; +} + +static void cma_exit_net(struct net *net) +{ + struct cma_pernet *pernet = cma_pernet(net); + + WARN_ON(!xa_empty(&pernet->tcp_ps)); + WARN_ON(!xa_empty(&pernet->udp_ps)); + 
WARN_ON(!xa_empty(&pernet->ipoib_ps)); + WARN_ON(!xa_empty(&pernet->ib_ps)); +} + +static struct pernet_operations cma_pernet_operations = { + .init = cma_init_net, + .exit = cma_exit_net, + .id = &cma_pernet_id, + .size = sizeof(struct cma_pernet), +}; + +static int __init cma_init(void) +{ + int ret; + + /* + * There is a rare lock ordering dependency in cma_netdev_callback() + * that only happens when bonding is enabled. Teach lockdep that rtnl + * must never be nested under lock so it can find these without having + * to test with bonding. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + rtnl_lock(); + mutex_lock(&lock); + mutex_unlock(&lock); + rtnl_unlock(); + } + + cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); + if (!cma_wq) + return -ENOMEM; + + cma_netevent_wq = alloc_ordered_workqueue("rdma_cm_netevent", 0); + if (!cma_netevent_wq) { + ret = -ENOMEM; + goto err_netevent_wq; + } + + ret = register_pernet_subsys(&cma_pernet_operations); + if (ret) + goto err_wq; + + ib_sa_register_client(&sa_client); + register_netdevice_notifier(&cma_nb); + register_netevent_notifier(&cma_netevent_cb); + + ret = ib_register_client(&cma_client); + if (ret) + goto err; + + ret = cma_configfs_init(); + if (ret) + goto err_ib; + + return 0; + +err_ib: + ib_unregister_client(&cma_client); +err: + unregister_netevent_notifier(&cma_netevent_cb); + unregister_netdevice_notifier(&cma_nb); + ib_sa_unregister_client(&sa_client); + unregister_pernet_subsys(&cma_pernet_operations); +err_wq: + destroy_workqueue(cma_netevent_wq); +err_netevent_wq: + destroy_workqueue(cma_wq); + return ret; +} + +static void __exit cma_cleanup(void) +{ + cma_configfs_exit(); + ib_unregister_client(&cma_client); + unregister_netevent_notifier(&cma_netevent_cb); + unregister_netdevice_notifier(&cma_nb); + ib_sa_unregister_client(&sa_client); + unregister_pernet_subsys(&cma_pernet_operations); + destroy_workqueue(cma_netevent_wq); + destroy_workqueue(cma_wq); +} + +module_init(cma_init); +module_exit(cma_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_configfs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_configfs.c new file mode 100644 index 0000000..9ac16e0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_configfs.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "core_priv.h" +#include "cma_priv.h" + +struct cma_device; + +struct cma_dev_group; + +struct cma_dev_port_group { + u32 port_num; + struct cma_dev_group *cma_dev_group; + struct config_group group; +}; + +struct cma_dev_group { + char name[IB_DEVICE_NAME_MAX]; + struct config_group device_group; + struct config_group ports_group; + struct cma_dev_port_group *ports; +}; + +static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) +{ + struct config_group *group; + + if (!item) + return NULL; + + group = container_of(item, struct config_group, cg_item); + return container_of(group, struct cma_dev_port_group, group); +} + +static bool filter_by_name(struct ib_device *ib_dev, void *cookie) +{ + return !strcmp(dev_name(&ib_dev->dev), cookie); +} + +static int cma_configfs_params_get(struct config_item *item, + struct cma_device **pcma_dev, + struct cma_dev_port_group **pgroup) +{ + struct cma_dev_port_group *group = to_dev_port_group(item); + struct cma_device *cma_dev; + + if (!group) + return -ENODEV; + + cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + group->cma_dev_group->name); + if (!cma_dev) + return -ENODEV; + + *pcma_dev = cma_dev; + *pgroup = group; + + return 0; +} + +static void cma_configfs_params_put(struct cma_device *cma_dev) +{ + cma_dev_put(cma_dev); +} + +static ssize_t default_roce_mode_show(struct config_item *item, + char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type; + ssize_t ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + gid_type = cma_get_default_gid_type(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + if (gid_type < 0) + return gid_type; + + return sysfs_emit(buf, "%s\n", ib_cache_gid_type_str(gid_type)); +} + +static ssize_t default_roce_mode_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type; + ssize_t ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + gid_type = ib_cache_gid_parse_type_str(buf); + if (gid_type < 0) { + cma_configfs_params_put(cma_dev); + return -EINVAL; + } + + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); + + cma_configfs_params_put(cma_dev); + + return !ret ? 
strnlen(buf, count) : ret; +} + +CONFIGFS_ATTR(, default_roce_mode); + +static ssize_t default_roce_tos_show(struct config_item *item, char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + ssize_t ret; + u8 tos; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + tos = cma_get_default_roce_tos(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + return sysfs_emit(buf, "%u\n", tos); +} + +static ssize_t default_roce_tos_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + ssize_t ret; + u8 tos; + + ret = kstrtou8(buf, 0, &tos); + if (ret) + return ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_roce_tos(cma_dev, group->port_num, tos); + cma_configfs_params_put(cma_dev); + + return ret ? ret : strnlen(buf, count); +} + +CONFIGFS_ATTR(, default_roce_tos); + +static struct configfs_attribute *cma_configfs_attributes[] = { + &attr_default_roce_mode, + &attr_default_roce_tos, + NULL, +}; + +static const struct config_item_type cma_port_group_type = { + .ct_attrs = cma_configfs_attributes, + .ct_owner = THIS_MODULE +}; + +static int make_cma_ports(struct cma_dev_group *cma_dev_group, + struct cma_device *cma_dev) +{ + struct cma_dev_port_group *ports; + struct ib_device *ibdev; + u32 ports_num; + u32 i; + + ibdev = cma_get_ib_dev(cma_dev); + + if (!ibdev) + return -ENODEV; + + ports_num = ibdev->phys_port_cnt; + ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), + GFP_KERNEL); + + if (!ports) + return -ENOMEM; + + for (i = 0; i < ports_num; i++) { + char port_str[10]; + + ports[i].port_num = i + 1; + snprintf(port_str, sizeof(port_str), "%u", i + 1); + ports[i].cma_dev_group = cma_dev_group; + config_group_init_type_name(&ports[i].group, + port_str, + &cma_port_group_type); + configfs_add_default_group(&ports[i].group, + &cma_dev_group->ports_group); + + } + cma_dev_group->ports = ports; + return 0; +} + +static void release_cma_dev(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + device_group); + + kfree(cma_dev_group); +}; + +static void release_cma_ports_group(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + ports_group); + + kfree(cma_dev_group->ports); + cma_dev_group->ports = NULL; +}; + +static struct configfs_item_operations cma_ports_item_ops = { + .release = release_cma_ports_group +}; + +static const struct config_item_type cma_ports_group_type = { + .ct_item_ops = &cma_ports_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct configfs_item_operations cma_device_item_ops = { + .release = release_cma_dev +}; + +static const struct config_item_type cma_device_group_type = { + .ct_item_ops = &cma_device_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct config_group *make_cma_dev(struct config_group *group, + const char *name) +{ + int err = -ENODEV; + struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + (void *)name); + struct cma_dev_group *cma_dev_group = NULL; + + if (!cma_dev) + goto fail; + + cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL); + + if (!cma_dev_group) { + err = -ENOMEM; + goto fail; + } + + 
strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + + config_group_init_type_name(&cma_dev_group->ports_group, "ports", + &cma_ports_group_type); + + err = make_cma_ports(cma_dev_group, cma_dev); + if (err) + goto fail; + + config_group_init_type_name(&cma_dev_group->device_group, name, + &cma_device_group_type); + configfs_add_default_group(&cma_dev_group->ports_group, + &cma_dev_group->device_group); + + cma_dev_put(cma_dev); + return &cma_dev_group->device_group; + +fail: + if (cma_dev) + cma_dev_put(cma_dev); + kfree(cma_dev_group); + return ERR_PTR(err); +} + +static void drop_cma_dev(struct config_group *cgroup, struct config_item *item) +{ + struct config_group *group = + container_of(item, struct config_group, cg_item); + struct cma_dev_group *cma_dev_group = + container_of(group, struct cma_dev_group, device_group); + + configfs_remove_default_groups(&cma_dev_group->ports_group); + configfs_remove_default_groups(&cma_dev_group->device_group); + config_item_put(item); +} + +static struct configfs_group_operations cma_subsys_group_ops = { + .make_group = make_cma_dev, + .drop_item = drop_cma_dev, +}; + +static const struct config_item_type cma_subsys_type = { + .ct_group_ops = &cma_subsys_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem cma_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "rdma_cm", + .ci_type = &cma_subsys_type, + }, + }, +}; + +int __init cma_configfs_init(void) +{ + int ret; + + config_group_init(&cma_subsys.su_group); + mutex_init(&cma_subsys.su_mutex); + ret = configfs_register_subsystem(&cma_subsys); + if (ret) + mutex_destroy(&cma_subsys.su_mutex); + return ret; +} + +void __exit cma_configfs_exit(void) +{ + configfs_unregister_subsystem(&cma_subsys); + mutex_destroy(&cma_subsys.su_mutex); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_priv.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_priv.h new file mode 100644 index 0000000..b7354c9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_priv.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _CMA_PRIV_H +#define _CMA_PRIV_H + +enum rdma_cm_state { + RDMA_CM_IDLE, + RDMA_CM_ADDR_QUERY, + RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY, + RDMA_CM_ROUTE_RESOLVED, + RDMA_CM_CONNECT, + RDMA_CM_DISCONNECT, + RDMA_CM_ADDR_BOUND, + RDMA_CM_LISTEN, + RDMA_CM_DEVICE_REMOVAL, + RDMA_CM_DESTROYING +}; + +struct rdma_id_private { + struct rdma_cm_id id; + + struct rdma_bind_list *bind_list; + struct hlist_node node; + union { + struct list_head device_item; /* On cma_device->id_list */ + struct list_head listen_any_item; /* On listen_any_list */ + }; + union { + /* On rdma_id_private->listen_list */ + struct list_head listen_item; + struct list_head listen_list; + }; + struct list_head id_list_entry; + struct cma_device *cma_dev; + struct list_head mc_list; + + int internal_id; + enum rdma_cm_state state; + spinlock_t lock; + struct mutex qp_mutex; + + struct completion comp; + refcount_t refcount; + struct mutex handler_mutex; + + int backlog; + int timeout_ms; + struct ib_sa_query *query; + int query_id; + union { + struct ib_cm_id *ib; + struct iw_cm_id *iw; + } cm_id; + + u32 seq_num; + u32 qkey; + u32 qp_num; + u32 options; + u8 srq; + u8 tos; + u8 tos_set:1; + u8 timeout_set:1; + u8 min_rnr_timer_set:1; + u8 reuseaddr; + u8 afonly; + u8 timeout; + u8 min_rnr_timer; + u8 used_resolve_ip; + enum ib_gid_type gid_type; + + /* + * Internal to RDMA/core, don't use in the drivers + */ + struct rdma_restrack_entry res; + struct rdma_ucm_ece ece; +}; + +#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif + +void cma_dev_get(struct cma_device *dev); +void cma_dev_put(struct cma_device *dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *dev, u32 port); +int cma_set_default_gid_type(struct cma_device *dev, u32 port, + enum ib_gid_type default_gid_type); +int cma_get_default_roce_tos(struct cma_device *dev, u32 port); +int cma_set_default_roce_tos(struct cma_device *dev, u32 port, + u8 default_roce_tos); +struct ib_device *cma_get_ib_dev(struct cma_device *dev); + +#endif /* _CMA_PRIV_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.c new file mode 100644 index 0000000..b314a28 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for the RDMA Connection Manager. + * + * Author: Chuck Lever + * + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
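+ *
+ * The trace events declared in cma_trace.h are emitted here via
+ * CREATE_TRACE_POINTS and show up under the "rdma_cma" subsystem in
+ * tracefs (typically /sys/kernel/tracing/events/rdma_cma/).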
+ */ + +#define CREATE_TRACE_POINTS + +#include +#include +#include "cma_priv.h" + +#include "cma_trace.h" diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.h new file mode 100644 index 0000000..e452642 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cma_trace.h @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Trace point definitions for the RDMA Connect Manager. + * + * Author: Chuck Lever + * + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rdma_cma + +#if !defined(_TRACE_RDMA_CMA_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_RDMA_CMA_H + +#include +#include + + +DECLARE_EVENT_CLASS(cma_fsm_class, + TP_PROTO( + const struct rdma_id_private *id_priv + ), + + TP_ARGS(id_priv), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos + ) +); + +#define DEFINE_CMA_FSM_EVENT(name) \ + DEFINE_EVENT(cma_fsm_class, cm_##name, \ + TP_PROTO( \ + const struct rdma_id_private *id_priv \ + ), \ + TP_ARGS(id_priv)) + +DEFINE_CMA_FSM_EVENT(send_rtu); +DEFINE_CMA_FSM_EVENT(send_rej); +DEFINE_CMA_FSM_EVENT(send_mra); +DEFINE_CMA_FSM_EVENT(send_sidr_req); +DEFINE_CMA_FSM_EVENT(send_sidr_rep); +DEFINE_CMA_FSM_EVENT(disconnect); +DEFINE_CMA_FSM_EVENT(sent_drep); +DEFINE_CMA_FSM_EVENT(sent_dreq); +DEFINE_CMA_FSM_EVENT(id_destroy); + +TRACE_EVENT(cm_id_attach, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct ib_device *device + ), + + TP_ARGS(id_priv, device), + + TP_STRUCT__entry( + __field(u32, cm_id) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + __string(devname, device->name) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + __assign_str(devname, device->name); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, + __get_str(devname) + ) +); + +DECLARE_EVENT_CLASS(cma_qp_class, + TP_PROTO( + const struct rdma_id_private *id_priv + ), + + TP_ARGS(id_priv), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(u32, qp_num) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + __entry->qp_num = id_priv->qp_num; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u qp_num=%u", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + 
__entry->qp_num + ) +); + +#define DEFINE_CMA_QP_EVENT(name) \ + DEFINE_EVENT(cma_qp_class, cm_##name, \ + TP_PROTO( \ + const struct rdma_id_private *id_priv \ + ), \ + TP_ARGS(id_priv)) + +DEFINE_CMA_QP_EVENT(send_req); +DEFINE_CMA_QP_EVENT(send_rep); +DEFINE_CMA_QP_EVENT(qp_destroy); + +/* + * enum ib_wp_type, from include/rdma/ib_verbs.h + */ +#define IB_QP_TYPE_LIST \ + ib_qp_type(SMI) \ + ib_qp_type(GSI) \ + ib_qp_type(RC) \ + ib_qp_type(UC) \ + ib_qp_type(UD) \ + ib_qp_type(RAW_IPV6) \ + ib_qp_type(RAW_ETHERTYPE) \ + ib_qp_type(RAW_PACKET) \ + ib_qp_type(XRC_INI) \ + ib_qp_type_end(XRC_TGT) + +#undef ib_qp_type +#undef ib_qp_type_end + +#define ib_qp_type(x) TRACE_DEFINE_ENUM(IB_QPT_##x); +#define ib_qp_type_end(x) TRACE_DEFINE_ENUM(IB_QPT_##x); + +IB_QP_TYPE_LIST + +#undef ib_qp_type +#undef ib_qp_type_end + +#define ib_qp_type(x) { IB_QPT_##x, #x }, +#define ib_qp_type_end(x) { IB_QPT_##x, #x } + +#define rdma_show_qp_type(x) \ + __print_symbolic(x, IB_QP_TYPE_LIST) + + +TRACE_EVENT(cm_qp_create, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct ib_pd *pd, + const struct ib_qp_init_attr *qp_init_attr, + int rc + ), + + TP_ARGS(id_priv, pd, qp_init_attr, rc), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, pd_id) + __field(u32, tos) + __field(u32, qp_num) + __field(u32, send_wr) + __field(u32, recv_wr) + __field(int, rc) + __field(unsigned long, qp_type) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->pd_id = pd->res.id; + __entry->tos = id_priv->tos; + __entry->send_wr = qp_init_attr->cap.max_send_wr; + __entry->recv_wr = qp_init_attr->cap.max_recv_wr; + __entry->rc = rc; + if (!rc) { + __entry->qp_num = id_priv->qp_num; + __entry->qp_type = id_priv->id.qp_type; + } else { + __entry->qp_num = 0; + __entry->qp_type = 0; + } + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u pd.id=%u qp_type=%s" + " send_wr=%u recv_wr=%u qp_num=%u rc=%d", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, + __entry->tos, __entry->pd_id, + rdma_show_qp_type(__entry->qp_type), __entry->send_wr, + __entry->recv_wr, __entry->qp_num, __entry->rc + ) +); + +TRACE_EVENT(cm_req_handler, + TP_PROTO( + const struct rdma_id_private *id_priv, + int event + ), + + TP_ARGS(id_priv, event), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(unsigned long, event) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + __entry->event = event; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu)", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + rdma_show_ib_cm_event(__entry->event), __entry->event + ) +); + +TRACE_EVENT(cm_event_handler, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct rdma_cm_event *event + ), + + TP_ARGS(id_priv, event), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(unsigned long, 
event) + __field(int, status) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + __entry->event = event->event; + __entry->status = event->status; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu/%d)", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + rdma_show_cm_event(__entry->event), __entry->event, + __entry->status + ) +); + +TRACE_EVENT(cm_event_done, + TP_PROTO( + const struct rdma_id_private *id_priv, + const struct rdma_cm_event *event, + int result + ), + + TP_ARGS(id_priv, event, result), + + TP_STRUCT__entry( + __field(u32, cm_id) + __field(u32, tos) + __field(unsigned long, event) + __field(int, result) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + ), + + TP_fast_assign( + __entry->cm_id = id_priv->res.id; + __entry->tos = id_priv->tos; + __entry->event = event->event; + __entry->result = result; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + ), + + TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s consumer returns %d", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, + rdma_show_cm_event(__entry->event), __entry->result + ) +); + +DECLARE_EVENT_CLASS(cma_client_class, + TP_PROTO( + const struct ib_device *device + ), + + TP_ARGS(device), + + TP_STRUCT__entry( + __string(name, device->name) + ), + + TP_fast_assign( + __assign_str(name, device->name); + ), + + TP_printk("device name=%s", + __get_str(name) + ) +); + +#define DEFINE_CMA_CLIENT_EVENT(name) \ + DEFINE_EVENT(cma_client_class, cm_##name, \ + TP_PROTO( \ + const struct ib_device *device \ + ), \ + TP_ARGS(device)) + +DEFINE_CMA_CLIENT_EVENT(add_one); +DEFINE_CMA_CLIENT_EVENT(remove_one); + +#endif /* _TRACE_RDMA_CMA_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE cma_trace + +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/core_priv.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/core_priv.h new file mode 100644 index 0000000..7a52ec5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/core_priv.h @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
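Each DEFINE_CMA_FSM_EVENT()/DEFINE_CMA_QP_EVENT()/DEFINE_CMA_CLIENT_EVENT() line in cma_trace.h above generates a trace_cm_<name>() helper; the real call sites live in cma.c and are not shown in this hunk. A minimal sketch of the firing pattern, with the surrounding function purely hypothetical:

/* Illustrative only: trace_cm_send_req() is generated by
 * DEFINE_CMA_QP_EVENT(send_req) above; demo_issue_req() is a placeholder.
 */
#include "cma_trace.h"

static int demo_issue_req(struct rdma_id_private *id_priv)
{
	trace_cm_send_req(id_priv);
	/* ... build and send the IB CM REQ ... */
	return 0;
}

At runtime these events normally appear under the tracefs directory events/rdma_cma/, matching the TRACE_SYSTEM name above.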
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _CORE_PRIV_H +#define _CORE_PRIV_H + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "mad_priv.h" +#include "restrack.h" + +/* Total number of ports combined across all struct ib_devices's */ +#define RDMA_MAX_PORTS 8192 + +struct pkey_index_qp_list { + struct list_head pkey_index_list; + u16 pkey_index; + /* Lock to hold while iterating the qp_list. */ + spinlock_t qp_list_lock; + struct list_head qp_list; +}; + +/** + * struct rdma_dev_net - rdma net namespace metadata for a net + * @nl_sock: Pointer to netlink socket + * @net: Pointer to owner net namespace + * @id: xarray id to identify the net namespace. + */ +struct rdma_dev_net { + struct sock *nl_sock; + possible_net_t net; + u32 id; +}; + +extern const struct attribute_group ib_dev_attr_group; +extern bool ib_devices_shared_netns; +extern unsigned int rdma_dev_net_id; + +static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net) +{ + return net_generic(net, rdma_dev_net_id); +} + +int ib_device_rename(struct ib_device *ibdev, const char *name); +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim); + +typedef void (*roce_netdev_callback)(struct ib_device *device, u32 port, + struct net_device *idev, void *cookie); + +typedef bool (*roce_netdev_filter)(struct ib_device *device, u32 port, + struct net_device *idev, void *cookie); + +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + u32 port); + +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie, unsigned long ndev_event); +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie, unsigned long ndev_event); + +typedef int (*nldev_callback)(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx); + +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb); + +struct ib_client_nl_info { + struct sk_buff *nl_msg; + struct device *cdev; + u32 port; + u64 abi; +}; +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res); + +enum ib_cache_gid_default_mode { + IB_CACHE_GID_DEFAULT_MODE_SET, + IB_CACHE_GID_DEFAULT_MODE_DELETE +}; + +int ib_cache_gid_parse_type_str(const char *buf); + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum ib_cache_gid_default_mode mode); + +int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, + struct net_device *ndev); + +int roce_gid_mgmt_init(void); +void roce_gid_mgmt_cleanup(void); + +unsigned 
long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port); + +int ib_cache_setup_one(struct ib_device *device); +void ib_cache_cleanup_one(struct ib_device *device); +void ib_cache_release_one(struct ib_device *device); +void ib_dispatch_event_clients(struct ib_event *event); + +#ifdef CONFIG_CGROUP_RDMA +void ib_device_register_rdmacg(struct ib_device *device); +void ib_device_unregister_rdmacg(struct ib_device *device); + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); +#else +static inline void ib_device_register_rdmacg(struct ib_device *device) +{ +} + +static inline void ib_device_unregister_rdmacg(struct ib_device *device) +{ +} + +static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + return 0; +} + +static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ +} +#endif + +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + return netdev_has_upper_dev_all_rcu(dev, upper); +} + +int addr_init(void); +void addr_cleanup(void); + +int ib_mad_init(void); +void ib_mad_cleanup(void); + +int ib_sa_init(void); +void ib_sa_cleanup(void); + +void rdma_nl_init(void); +void rdma_nl_exit(void); + +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); + +void ib_get_cached_subnet_prefix(struct ib_device *device, + u32 port_num, + u64 *sn_pfx); + +#ifdef CONFIG_SECURITY_INFINIBAND +void ib_security_release_port_pkey_list(struct ib_device *device); + +void ib_security_cache_change(struct ib_device *device, + u32 port_num, + u64 subnet_prefix); + +int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata); + +int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev); +void ib_destroy_qp_security_begin(struct ib_qp_security *sec); +void ib_destroy_qp_security_abort(struct ib_qp_security *sec); +void ib_destroy_qp_security_end(struct ib_qp_security *sec); +int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev); +void ib_close_shared_qp_security(struct ib_qp_security *sec); +int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type); +void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); +int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +void ib_mad_agent_security_change(void); +#else +static inline void ib_security_release_port_pkey_list(struct ib_device *device) +{ +} + +static inline void ib_security_cache_change(struct ib_device *device, + u32 port_num, + u64 subnet_prefix) +{ +} + +static inline int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata) +{ + return qp->device->ops.modify_qp(qp->real_qp, + qp_attr, + qp_attr_mask, + udata); +} + +static inline int ib_create_qp_security(struct ib_qp *qp, + struct ib_device *dev) +{ + return 0; +} + +static inline void 
ib_destroy_qp_security_begin(struct ib_qp_security *sec) +{ +} + +static inline void ib_destroy_qp_security_abort(struct ib_qp_security *sec) +{ +} + +static inline void ib_destroy_qp_security_end(struct ib_qp_security *sec) +{ +} + +static inline int ib_open_shared_qp_security(struct ib_qp *qp, + struct ib_device *dev) +{ + return 0; +} + +static inline void ib_close_shared_qp_security(struct ib_qp_security *sec) +{ +} + +static inline int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type) +{ + return 0; +} + +static inline void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) +{ +} + +static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, + u16 pkey_index) +{ + return 0; +} + +static inline void ib_mad_agent_security_change(void) +{ +} +#endif + +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index); + +/* RDMA device netlink */ +void nldev_init(void); +void nldev_exit(void); + +struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller); + +void ib_qp_usecnt_inc(struct ib_qp *qp); +void ib_qp_usecnt_dec(struct ib_qp *qp); + +struct rdma_dev_addr; +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr); + +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, const struct ib_gid_attr *sgid_attr, + int *hoplimit); +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev); + +struct sa_path_rec; +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr); + +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); + +void ib_free_port_attrs(struct ib_core_device *coredev); +int ib_setup_port_attrs(struct ib_core_device *coredev); +struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, u32 port_num); +void ib_device_release_hw_stats(struct hw_stats_device_data *data); +int ib_setup_device_attrs(struct ib_device *ibdev); + +int rdma_compatdev_set(u8 enable); + +int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups); +void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups); + +int ib_device_set_netns_put(struct sk_buff *skb, + struct ib_device *dev, u32 ns_fd); + +int rdma_nl_net_init(struct rdma_dev_net *rnet); +void rdma_nl_net_exit(struct rdma_dev_net *rnet); + +struct rdma_umap_priv { + struct vm_area_struct *vma; + struct list_head list; + struct rdma_user_mmap_entry *entry; +}; + +void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma, + struct rdma_user_mmap_entry *entry); + +void ib_cq_pool_cleanup(struct ib_device *dev); +bool rdma_check_gid_user_access(const struct ib_gid_attr *attr); + +#endif /* _CORE_PRIV_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/counters.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/counters.c new file mode 100644 index 0000000..af59486 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/counters.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. 
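core_priv.h above pairs every CONFIG_CGROUP_RDMA and CONFIG_SECURITY_INFINIBAND entry point with a static inline no-op, so callers never carry #ifdefs. A hedged sketch of that calling pattern follows; the demo_* names are placeholders, and the resource constant is assumed from the standard rdma cgroup enum:

/* Sketch only: charge/uncharge are called unconditionally; with
 * CONFIG_CGROUP_RDMA disabled the stubs above compile them away.
 */
static int demo_hw_alloc(struct ib_uobject *uobj)
{
	return 0;	/* placeholder for the real HW object creation */
}

static int demo_alloc_hca_object(struct ib_uobject *uobj, struct ib_device *dev)
{
	int ret;

	ret = ib_rdmacg_try_charge(&uobj->cg_obj, dev, RDMACG_RESOURCE_HCA_OBJECT);
	if (ret)
		return ret;

	ret = demo_hw_alloc(uobj);
	if (ret)
		ib_rdmacg_uncharge(&uobj->cg_obj, dev, RDMACG_RESOURCE_HCA_OBJECT);
	return ret;
}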
+ */ +#include +#include + +#include "core_priv.h" +#include "restrack.h" + +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) + +static int __counter_set_mode(struct rdma_port_counter *port_counter, + enum rdma_nl_counter_mode new_mode, + enum rdma_nl_counter_mask new_mask) +{ + if (new_mode == RDMA_COUNTER_MODE_AUTO) { + if (new_mask & (~ALL_AUTO_MODE_MASKS)) + return -EINVAL; + if (port_counter->num_counters) + return -EBUSY; + } + + port_counter->mode.mode = new_mode; + port_counter->mode.mask = new_mask; + return 0; +} + +/* + * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode + * + * @dev: Device to operate + * @port: Port to use + * @mask: Mask to configure + * @extack: Message to the user + * + * Return 0 on success. If counter mode wasn't changed then it is considered + * as success as well. + * Return -EBUSY when changing to auto mode while there are bounded counters. + * + */ +int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, + enum rdma_nl_counter_mask mask, + struct netlink_ext_ack *extack) +{ + struct rdma_port_counter *port_counter; + enum rdma_nl_counter_mode mode; + int ret; + + port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return -EOPNOTSUPP; + + mutex_lock(&port_counter->lock); + if (mask) + mode = RDMA_COUNTER_MODE_AUTO; + else + mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : + RDMA_COUNTER_MODE_NONE; + + if (port_counter->mode.mode == mode && + port_counter->mode.mask == mask) { + ret = 0; + goto out; + } + + ret = __counter_set_mode(port_counter, mode, mask); + +out: + mutex_unlock(&port_counter->lock); + if (ret == -EBUSY) + NL_SET_ERR_MSG( + extack, + "Modifying auto mode is not allowed when there is a bound QP"); + return ret; +} + +static void auto_mode_init_counter(struct rdma_counter *counter, + const struct ib_qp *qp, + enum rdma_nl_counter_mask new_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + + counter->mode.mode = RDMA_COUNTER_MODE_AUTO; + counter->mode.mask = new_mask; + + if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) + param->qp_type = qp->qp_type; +} + +static int __rdma_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) +{ + int ret; + + if (qp->counter) + return -EINVAL; + + if (!qp->device->ops.counter_bind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_bind_qp(counter, qp); + mutex_unlock(&counter->lock); + + return ret; +} + +int rdma_counter_modify(struct ib_device *dev, u32 port, + unsigned int index, bool enable) +{ + struct rdma_hw_stats *stats; + int ret = 0; + + if (!dev->ops.modify_hw_stat) + return -EOPNOTSUPP; + + stats = ib_get_hw_stats_port(dev, port); + if (!stats || index >= stats->num_counters || + !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) + return -EINVAL; + + mutex_lock(&stats->lock); + + if (enable != test_bit(index, stats->is_disabled)) + goto out; + + ret = dev->ops.modify_hw_stat(dev, port, index, enable); + if (ret) + goto out; + + if (enable) + clear_bit(index, stats->is_disabled); + else + set_bit(index, stats->is_disabled); +out: + mutex_unlock(&stats->lock); + return ret; +} + +static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, + struct ib_qp *qp, + enum rdma_nl_counter_mode mode) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter; + int ret; + + if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) + return NULL; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if 
(!counter) + return NULL; + + counter->device = dev; + counter->port = port; + + rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); + counter->stats = dev->ops.counter_alloc_stats(counter); + if (!counter->stats) + goto err_stats; + + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + switch (mode) { + case RDMA_COUNTER_MODE_MANUAL: + ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, + 0); + if (ret) { + mutex_unlock(&port_counter->lock); + goto err_mode; + } + break; + case RDMA_COUNTER_MODE_AUTO: + auto_mode_init_counter(counter, qp, port_counter->mode.mask); + break; + default: + ret = -EOPNOTSUPP; + mutex_unlock(&port_counter->lock); + goto err_mode; + } + + port_counter->num_counters++; + mutex_unlock(&port_counter->lock); + + counter->mode.mode = mode; + kref_init(&counter->kref); + mutex_init(&counter->lock); + + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) + goto err_mode; + + rdma_restrack_parent_name(&counter->res, &qp->res); + rdma_restrack_add(&counter->res); + return counter; + +err_mode: + rdma_free_hw_stats_struct(counter->stats); +err_stats: + rdma_restrack_put(&counter->res); + kfree(counter); + return NULL; +} + +static void rdma_counter_free(struct rdma_counter *counter) +{ + struct rdma_port_counter *port_counter; + + port_counter = &counter->device->port_data[counter->port].port_counter; + mutex_lock(&port_counter->lock); + port_counter->num_counters--; + if (!port_counter->num_counters && + (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) + __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0); + + mutex_unlock(&port_counter->lock); + + rdma_restrack_del(&counter->res); + rdma_free_hw_stats_struct(counter->stats); + kfree(counter); +} + +static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, + enum rdma_nl_counter_mask auto_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + bool match = true; + + if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) + match &= (param->qp_type == qp->qp_type); + + if (auto_mask & RDMA_COUNTER_MASK_PID) + match &= (task_pid_nr(counter->res.task) == + task_pid_nr(qp->res.task)); + + return match; +} + +static int __rdma_counter_unbind_qp(struct ib_qp *qp) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!qp->device->ops.counter_unbind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_unbind_qp(qp); + mutex_unlock(&counter->lock); + + return ret; +} + +static void counter_history_stat_update(struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + struct rdma_port_counter *port_counter; + int i; + + port_counter = &dev->port_data[counter->port].port_counter; + if (!port_counter->hstats) + return; + + rdma_counter_query_stats(counter); + + for (i = 0; i < counter->stats->num_counters; i++) + port_counter->hstats->value[i] += counter->stats->value[i]; +} + +/* + * rdma_get_counter_auto_mode - Find the counter that @qp should be bound + * with in auto mode + * + * Return: The counter (with ref-count increased) if found + */ +static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, + u32 port) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter = NULL; + struct ib_device *dev = qp->device; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + unsigned long id = 0; + + port_counter = &dev->port_data[port].port_counter; + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, 
res) { + counter = container_of(res, struct rdma_counter, res); + if ((counter->device != qp->device) || (counter->port != port)) + goto next; + + if (auto_mode_match(qp, counter, port_counter->mode.mask)) + break; +next: + counter = NULL; + } + + if (counter && !kref_get_unless_zero(&counter->kref)) + counter = NULL; + + xa_unlock(&rt->xa); + return counter; +} + +static void counter_release(struct kref *kref) +{ + struct rdma_counter *counter; + + counter = container_of(kref, struct rdma_counter, kref); + counter_history_stat_update(counter); + counter->device->ops.counter_dealloc(counter); + rdma_counter_free(counter); +} + +/* + * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on + * the auto-mode rule + */ +int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) +{ + struct rdma_port_counter *port_counter; + struct ib_device *dev = qp->device; + struct rdma_counter *counter; + int ret; + + if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) + return 0; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + port_counter = &dev->port_data[port].port_counter; + if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) + return 0; + + counter = rdma_get_counter_auto_mode(qp, port); + if (counter) { + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) { + kref_put(&counter->kref, counter_release); + return ret; + } + } else { + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO); + if (!counter) + return -ENOMEM; + } + + return 0; +} + +/* + * rdma_counter_unbind_qp - Unbind a qp from a counter + * @force: + * true - Decrease the counter ref-count anyway (e.g., qp destroy) + */ +int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!counter) + return -EINVAL; + + ret = __rdma_counter_unbind_qp(qp); + if (ret && !force) + return ret; + + kref_put(&counter->kref, counter_release); + return 0; +} + +int rdma_counter_query_stats(struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + int ret; + + if (!dev->ops.counter_update_stats) + return -EINVAL; + + mutex_lock(&counter->lock); + ret = dev->ops.counter_update_stats(counter); + mutex_unlock(&counter->lock); + + return ret; +} + +static u64 get_running_counters_hwstat_sum(struct ib_device *dev, + u32 port, u32 index) +{ + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct rdma_counter *counter; + unsigned long id = 0; + u64 sum = 0; + + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + xa_unlock(&rt->xa); + + counter = container_of(res, struct rdma_counter, res); + if ((counter->device != dev) || (counter->port != port) || + rdma_counter_query_stats(counter)) + goto next; + + sum += counter->stats->value[index]; + +next: + xa_lock(&rt->xa); + rdma_restrack_put(res); + } + + xa_unlock(&rt->xa); + return sum; +} + +/* + * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a + * specific port, including the running ones and history data + */ +u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) +{ + struct rdma_port_counter *port_counter; + u64 sum; + + port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return 0; + + sum = get_running_counters_hwstat_sum(dev, port, index); + sum += port_counter->hstats->value[index]; + + return sum; +} + +static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 
qp_num) +{ + struct rdma_restrack_entry *res = NULL; + struct ib_qp *qp = NULL; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); + if (IS_ERR(res)) + return NULL; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + goto err; + + return qp; + +err: + rdma_restrack_put(res); + return NULL; +} + +static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, + u32 counter_id) +{ + struct rdma_restrack_entry *res; + struct rdma_counter *counter; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); + if (IS_ERR(res)) + return NULL; + + counter = container_of(res, struct rdma_counter, res); + kref_get(&counter->kref); + rdma_restrack_put(res); + + return counter; +} + +/* + * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id + */ +int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + port_counter = &dev->port_data[port].port_counter; + if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + counter = rdma_get_counter_by_id(dev, counter_id); + if (!counter) { + ret = -ENOENT; + goto err; + } + + if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { + ret = -EINVAL; + goto err_task; + } + + if ((counter->device != qp->device) || (counter->port != qp->port)) { + ret = -EINVAL; + goto err_task; + } + + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) + goto err_task; + + rdma_restrack_put(&qp->res); + return 0; + +err_task: + kref_put(&counter->kref, counter_release); +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/* + * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it + * The id of new counter is returned in @counter_id + */ +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, + u32 qp_num, u32 *counter_id) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + port_counter = &dev->port_data[port].port_counter; + if (!port_counter->hstats) + return -EOPNOTSUPP; + + if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto err; + } + + counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL); + if (!counter) { + ret = -ENOMEM; + goto err; + } + + if (counter_id) + *counter_id = counter->id; + + rdma_restrack_put(&qp->res); + return 0; + +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/* + * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter + */ +int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_port_counter *port_counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto out; + } + + port_counter = &dev->port_data[port].port_counter; + if (!qp->counter || qp->counter->id != counter_id || + port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { + ret = -EINVAL; + goto out; + } + + ret = 
rdma_counter_unbind_qp(qp, false); + +out: + rdma_restrack_put(&qp->res); + return ret; +} + +int rdma_counter_get_mode(struct ib_device *dev, u32 port, + enum rdma_nl_counter_mode *mode, + enum rdma_nl_counter_mask *mask) +{ + struct rdma_port_counter *port_counter; + + port_counter = &dev->port_data[port].port_counter; + *mode = port_counter->mode.mode; + *mask = port_counter->mode.mask; + + return 0; +} + +void rdma_counter_init(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port, i; + + if (!dev->port_data) + return; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; + mutex_init(&port_counter->lock); + + if (!dev->ops.alloc_hw_port_stats) + continue; + + port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); + if (!port_counter->hstats) + goto fail; + } + + return; + +fail: + for (i = port; i >= rdma_start_port(dev); i--) { + port_counter = &dev->port_data[port].port_counter; + rdma_free_hw_stats_struct(port_counter->hstats); + port_counter->hstats = NULL; + mutex_destroy(&port_counter->lock); + } +} + +void rdma_counter_release(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + rdma_free_hw_stats_struct(port_counter->hstats); + mutex_destroy(&port_counter->lock); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cq.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cq.c new file mode 100644 index 0000000..e0ceb76 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/cq.c @@ -0,0 +1,512 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2015 HGST, a Western Digital Company. 
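counters.c above only takes effect when the provider implements the optional callbacks it dereferences (counter_bind_qp, counter_unbind_qp, counter_dealloc, counter_alloc_stats, counter_update_stats, alloc_hw_port_stats). A hedged sketch of the driver-side wiring; every drv_* symbol is a placeholder, and only the member names and prototypes are taken from the call sites above:

/* Provider hooks assumed by counters.c; implemented elsewhere by the driver. */
int drv_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp);
int drv_counter_unbind_qp(struct ib_qp *qp);
int drv_counter_dealloc(struct rdma_counter *counter);
struct rdma_hw_stats *drv_counter_alloc_stats(struct rdma_counter *counter);
int drv_counter_update_stats(struct rdma_counter *counter);
struct rdma_hw_stats *drv_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num);

static const struct ib_device_ops drv_counter_ops = {
	.counter_bind_qp      = drv_counter_bind_qp,
	.counter_unbind_qp    = drv_counter_unbind_qp,
	.counter_dealloc      = drv_counter_dealloc,
	.counter_alloc_stats  = drv_counter_alloc_stats,
	.counter_update_stats = drv_counter_update_stats,
	.alloc_hw_port_stats  = drv_alloc_hw_port_stats,
};

/* A driver would typically install these at probe time with
 * ib_set_device_ops(ibdev, &drv_counter_ops);
 */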
+ */ +#include +#include +#include +#include + +#include "core_priv.h" + +#include +/* Max size for shared CQ, may require tuning */ +#define IB_MAX_SHARED_CQ_SZ 4096U + +/* # of WCs to poll for with a single call to ib_poll_cq */ +#define IB_POLL_BATCH 16 +#define IB_POLL_BATCH_DIRECT 8 + +/* # of WCs to iterate over before yielding */ +#define IB_POLL_BUDGET_IRQ 256 +#define IB_POLL_BUDGET_WORKQUEUE 65536 + +#define IB_POLL_FLAGS \ + (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) + +static const struct dim_cq_moder +rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 1, 0}, + {1, 0, 4, 0}, + {2, 0, 4, 0}, + {2, 0, 8, 0}, + {4, 0, 8, 0}, + {16, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 16, 0}, + {32, 0, 32, 0}, +}; + +static void ib_cq_rdma_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct ib_cq *cq = dim->priv; + + u16 usec = rdma_dim_prof[dim->profile_ix].usec; + u16 comps = rdma_dim_prof[dim->profile_ix].comps; + + dim->state = DIM_START_MEASURE; + + trace_cq_modify(cq, comps, usec); + cq->device->ops.modify_cq(cq, comps, usec); +} + +static void rdma_dim_init(struct ib_cq *cq) +{ + struct dim *dim; + + if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim || + cq->poll_ctx == IB_POLL_DIRECT) + return; + + dim = kzalloc(sizeof(struct dim), GFP_KERNEL); + if (!dim) + return; + + dim->state = DIM_START_MEASURE; + dim->tune_state = DIM_GOING_RIGHT; + dim->profile_ix = RDMA_DIM_START_PROFILE; + dim->priv = cq; + cq->dim = dim; + + INIT_WORK(&dim->work, ib_cq_rdma_dim_work); +} + +static void rdma_dim_destroy(struct ib_cq *cq) +{ + if (!cq->dim) + return; + + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); +} + +static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) +{ + int rc; + + rc = ib_poll_cq(cq, num_entries, wc); + trace_cq_poll(cq, num_entries, rc); + return rc; +} + +static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, + int batch) +{ + int i, n, completed = 0; + + trace_cq_process(cq); + + /* + * budget might be (-1) if the caller does not + * want to bound this call, thus we need unsigned + * minimum here. + */ + while ((n = __poll_cq(cq, min_t(u32, batch, + budget - completed), wcs)) > 0) { + for (i = 0; i < n; i++) { + struct ib_wc *wc = &wcs[i]; + + if (wc->wr_cqe) + wc->wr_cqe->done(cq, wc); + else + WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); + } + + completed += n; + + if (n != batch || (budget != -1 && completed >= budget)) + break; + } + + return completed; +} + +/** + * ib_process_cq_direct - process a CQ in caller context + * @cq: CQ to process + * @budget: number of CQEs to poll for + * + * This function is used to process all outstanding CQ entries. + * It does not offload CQ processing to a different context and does + * not ask for completion interrupts from the HCA. + * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger + * concurrent processing. + * + * Note: do not pass -1 as %budget unless it is guaranteed that the number + * of completions that will be processed is small. 
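A brief usage sketch of the interface documented above (illustrative only; the per-pass budget of 8 is arbitrary):

/* Drain an IB_POLL_DIRECT CQ from the caller's context; completions are
 * dispatched through wc->wr_cqe->done() by __ib_process_cq().
 */
static void demo_drain_direct_cq(struct ib_cq *cq)
{
	while (ib_process_cq_direct(cq, 8) > 0)
		;
}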
+ */ +int ib_process_cq_direct(struct ib_cq *cq, int budget) +{ + struct ib_wc wcs[IB_POLL_BATCH_DIRECT]; + + return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT); +} +EXPORT_SYMBOL(ib_process_cq_direct); + +static void ib_cq_completion_direct(struct ib_cq *cq, void *private) +{ + WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); +} + +static int ib_poll_handler(struct irq_poll *iop, int budget) +{ + struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + struct dim *dim = cq->dim; + int completed; + + completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); + if (completed < budget) { + irq_poll_complete(&cq->iop); + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { + trace_cq_reschedule(cq); + irq_poll_sched(&cq->iop); + } + } + + if (dim) + rdma_dim(dim, completed); + + return completed; +} + +static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) +{ + trace_cq_schedule(cq); + irq_poll_sched(&cq->iop); +} + +static void ib_cq_poll_work(struct work_struct *work) +{ + struct ib_cq *cq = container_of(work, struct ib_cq, work); + int completed; + + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc, + IB_POLL_BATCH); + if (completed >= IB_POLL_BUDGET_WORKQUEUE || + ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + queue_work(cq->comp_wq, &cq->work); + else if (cq->dim) + rdma_dim(cq->dim, completed); +} + +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) +{ + trace_cq_schedule(cq); + queue_work(cq->comp_wq, &cq->work); +} + +/** + * __ib_alloc_cq - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @comp_vector: HCA completion vectors for this CQ + * @poll_ctx: context to poll the CQ from. + * @caller: module owner name. + * @skip_tracking: avoid resource tracking + * + * This is the proper interface to allocate a CQ for in-kernel users. A + * CQ allocated with this interface will automatically be polled from the + * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id + * to use this CQ abstraction. + */ +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, + int comp_vector, enum ib_poll_context poll_ctx, + const char *caller, bool skip_tracking) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + int ret = -ENOMEM; + + cq = rdma_zalloc_drv_obj(dev, ib_cq); + if (!cq) + return ERR_PTR(ret); + + cq->device = dev; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + cq->comp_vector = comp_vector; + + cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); + if (!cq->wc) + goto out_free_cq; + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); + + ret = dev->ops.create_cq(cq, &cq_attr, NULL); + if (ret) + goto out_free_wc; + + rdma_dim_init(cq); + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = ib_cq_completion_direct; + break; + case IB_POLL_SOFTIRQ: + cq->comp_handler = ib_cq_completion_softirq; + + irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? 
+ ib_comp_wq : ib_comp_unbound_wq; + break; + default: + ret = -EINVAL; + goto out_destroy_cq; + } + + if (skip_tracking) + rdma_restrack_dontrack(&cq->res); + else + rdma_restrack_add(&cq->res); + trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); + return cq; + +out_destroy_cq: + rdma_dim_destroy(cq); + cq->device->ops.destroy_cq(cq, NULL); +out_free_wc: + rdma_restrack_put(&cq->res); + kfree(cq->wc); +out_free_cq: + kfree(cq); + trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(__ib_alloc_cq); + +/** + * __ib_alloc_cq_any - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @poll_ctx: context to poll the CQ from + * @caller: module owner name + * + * Attempt to spread ULP Completion Queues over each device's interrupt + * vectors. A simple best-effort mechanism is used. + */ +struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, + int nr_cqe, enum ib_poll_context poll_ctx, + const char *caller) +{ + static atomic_t counter; + int comp_vector = 0; + + if (dev->num_comp_vectors > 1) + comp_vector = + atomic_inc_return(&counter) % + min_t(int, dev->num_comp_vectors, num_online_cpus()); + + return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, + caller, false); +} +EXPORT_SYMBOL(__ib_alloc_cq_any); + +/** + * ib_free_cq - free a completion queue + * @cq: completion queue to free. + */ +void ib_free_cq(struct ib_cq *cq) +{ + int ret; + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) + return; + if (WARN_ON_ONCE(cq->cqe_used)) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: + cancel_work_sync(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + + rdma_dim_destroy(cq); + trace_cq_free(cq); + ret = cq->device->ops.destroy_cq(cq, NULL); + WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); + rdma_restrack_del(&cq->res); + kfree(cq->wc); + kfree(cq); +} +EXPORT_SYMBOL(ib_free_cq); + +void ib_cq_pool_cleanup(struct ib_device *dev) +{ + struct ib_cq *cq, *n; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { + list_for_each_entry_safe(cq, n, &dev->cq_pools[i], + pool_entry) { + WARN_ON(cq->cqe_used); + list_del(&cq->pool_entry); + cq->shared = false; + ib_free_cq(cq); + } + } +} + +static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, + enum ib_poll_context poll_ctx) +{ + LIST_HEAD(tmp_list); + unsigned int nr_cqs, i; + struct ib_cq *cq, *n; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return -EINVAL; + } + + /* + * Allocate at least as many CQEs as requested, and otherwise + * a reasonable batch size so that we can share CQs between + * multiple users instead of allocating a larger number of CQs. 
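The allocation helpers above are what in-kernel ULPs use instead of calling create_cq directly; completions are routed through struct ib_cqe. A hedged sketch follows (the request structure, size and names are placeholders):

/* Sketch only: a ULP embeds struct ib_cqe in its request and points
 * wr->wr_cqe at it when posting, so demo_done() runs on completion.
 */
struct demo_request {
	struct ib_cqe cqe;
	/* ... ULP-private state ... */
};

static void demo_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct demo_request *req =
		container_of(wc->wr_cqe, struct demo_request, cqe);

	/* wc->status says whether the work request succeeded */
	(void)req;
}

static struct ib_cq *demo_setup_cq(struct ib_device *dev)
{
	/* 128 CQEs, completion vector spread by the core, polled from a workqueue */
	return ib_alloc_cq_any(dev, NULL, 128, IB_POLL_WORKQUEUE);
}

/* Posting sets req->cqe.done = demo_done and wr->wr_cqe = &req->cqe;
 * teardown uses ib_free_cq().
 */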
+ */ + nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, + max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); + nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + for (i = 0; i < nr_cqs; i++) { + cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto out_free_cqs; + } + cq->shared = true; + list_add_tail(&cq->pool_entry, &tmp_list); + } + + spin_lock_irq(&dev->cq_pools_lock); + list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); + spin_unlock_irq(&dev->cq_pools_lock); + + return 0; + +out_free_cqs: + list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) { + cq->shared = false; + ib_free_cq(cq); + } + return ret; +} + +/** + * ib_cq_pool_get() - Find the least used completion queue that matches + * a given cpu hint (or least used for wild card affinity) and fits + * nr_cqe. + * @dev: rdma device + * @nr_cqe: number of needed cqe entries + * @comp_vector_hint: completion vector hint (-1) for the driver to assign + * a comp vector based on internal counter + * @poll_ctx: cq polling context + * + * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and + * claim entries in it for us. In case there is no available cq, allocate + * a new cq with the requirements and add it to the device pool. + * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value + * for @poll_ctx. + */ +struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, + int comp_vector_hint, + enum ib_poll_context poll_ctx) +{ + static unsigned int default_comp_vector; + unsigned int vector, num_comp_vectors; + struct ib_cq *cq, *found = NULL; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return ERR_PTR(-EINVAL); + } + + num_comp_vectors = + min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + /* Project the affinty to the device completion vector range */ + if (comp_vector_hint < 0) { + comp_vector_hint = + (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; + WRITE_ONCE(default_comp_vector, comp_vector_hint); + } + vector = comp_vector_hint % num_comp_vectors; + + /* + * Find the least used CQ with correct affinity and + * enough free CQ entries + */ + while (!found) { + spin_lock_irq(&dev->cq_pools_lock); + list_for_each_entry(cq, &dev->cq_pools[poll_ctx], + pool_entry) { + /* + * Check to see if we have found a CQ with the + * correct completion vector + */ + if (vector != cq->comp_vector) + continue; + if (cq->cqe_used + nr_cqe > cq->cqe) + continue; + found = cq; + break; + } + + if (found) { + found->cqe_used += nr_cqe; + spin_unlock_irq(&dev->cq_pools_lock); + + return found; + } + spin_unlock_irq(&dev->cq_pools_lock); + + /* + * Didn't find a match or ran out of CQs in the device + * pool, allocate a new array of CQs. + */ + ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); + if (ret) + return ERR_PTR(ret); + } + + return found; +} +EXPORT_SYMBOL(ib_cq_pool_get); + +/** + * ib_cq_pool_put - Return a CQ taken from a shared pool. + * @cq: The CQ to return. + * @nr_cqe: The max number of cqes that the user had requested. 
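A short usage sketch of the pool interface defined above (the CQE count and polling context are placeholders); the nr_cqe passed to ib_cq_pool_put() must match what was claimed from ib_cq_pool_get():

static struct ib_cq *demo_get_shared_cq(struct ib_device *dev)
{
	/* -1 lets the core rotate through the completion vectors */
	return ib_cq_pool_get(dev, 256, -1, IB_POLL_WORKQUEUE);
}

static void demo_put_shared_cq(struct ib_cq *cq)
{
	ib_cq_pool_put(cq, 256);
}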
+ */ +void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) +{ + if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) + return; + + spin_lock_irq(&cq->device->cq_pools_lock); + cq->cqe_used -= nr_cqe; + spin_unlock_irq(&cq->device->cq_pools_lock); +} +EXPORT_SYMBOL(ib_cq_pool_put); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/device.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/device.c new file mode 100644 index 0000000..9babab2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/device.c @@ -0,0 +1,2901 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" +#include "restrack.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("core kernel InfiniBand API"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct workqueue_struct *ib_comp_wq; +struct workqueue_struct *ib_comp_unbound_wq; +struct workqueue_struct *ib_wq; +EXPORT_SYMBOL_GPL(ib_wq); +static struct workqueue_struct *ib_unreg_wq; + +/* + * Each of the three rwsem locks (devices, clients, client_data) protects the + * xarray of the same name. Specifically it allows the caller to assert that + * the MARK will/will not be changing under the lock, and for devices and + * clients, that the value in the xarray is still a valid pointer. Change of + * the MARK is linked to the object state, so holding the lock and testing the + * MARK also asserts that the contained object is in a certain state. + * + * This is used to build a two stage register/unregister flow where objects + * can continue to be in the xarray even though they are still in progress to + * register/unregister. + * + * The xarray itself provides additional locking, and restartable iteration, + * which is also relied on. + * + * Locks should not be nested, with the exception of client_data, which is + * allowed to nest under the read side of the other two locks. 
+ * + * The devices_rwsem also protects the device name list, any change or + * assignment of device name must also hold the write side to guarantee unique + * names. + */ + +/* + * devices contains devices that have had their names assigned. The + * devices may not be registered. Users that care about the registration + * status need to call ib_device_try_get() on the device to ensure it is + * registered, and keep it registered, for the required duration. + * + */ +static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(devices_rwsem); +#define DEVICE_REGISTERED XA_MARK_1 + +static u32 highest_client_id; +#define CLIENT_REGISTERED XA_MARK_1 +static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(clients_rwsem); + +static void ib_client_put(struct ib_client *client) +{ + if (refcount_dec_and_test(&client->uses)) + complete(&client->uses_zero); +} + +/* + * If client_data is registered then the corresponding client must also still + * be registered. + */ +#define CLIENT_DATA_REGISTERED XA_MARK_1 + +unsigned int rdma_dev_net_id; + +/* + * A list of net namespaces is maintained in an xarray. This is necessary + * because we can't get the locking right using the existing net ns list. We + * would require a init_net callback after the list is updated. + */ +static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); +/* + * rwsem to protect accessing the rdma_nets xarray entries. + */ +static DECLARE_RWSEM(rdma_nets_rwsem); + +bool ib_devices_shared_netns = true; +module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); +MODULE_PARM_DESC(netns_mode, + "Share device among net namespaces; default=1 (shared)"); +/** + * rdma_dev_access_netns() - Return whether an rdma device can be accessed + * from a specified net namespace or not. + * @dev: Pointer to rdma device which needs to be checked + * @net: Pointer to net namesapce for which access to be checked + * + * When the rdma device is in shared mode, it ignores the net namespace. + * When the rdma device is exclusive to a net namespace, rdma device net + * namespace is checked against the specified one. + */ +bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) +{ + return (ib_devices_shared_netns || + net_eq(read_pnet(&dev->coredev.rdma_net), net)); +} +EXPORT_SYMBOL(rdma_dev_access_netns); + +/* + * xarray has this behavior where it won't iterate over NULL values stored in + * allocated arrays. So we need our own iterator to see all values stored in + * the array. This does the same thing as xa_for_each except that it also + * returns NULL valued entries if the array is allocating. Simplified to only + * work on simple xarrays. 
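Because plain xa_for_each() skips NULL values, device.c defines the xan_* iterator below so that allocated-but-unset slots are still visited. A small sketch of the resulting usage pattern (the walker function is hypothetical; real callers such as ib_device_rename() hold client_data_rwsem for read):

static void demo_walk_client_data(struct ib_device *ibdev)
{
	void *client_data;
	unsigned long index;

	/* caller is assumed to hold down_read(&ibdev->client_data_rwsem) */
	xan_for_each_marked(&ibdev->client_data, index, client_data,
			    CLIENT_DATA_REGISTERED) {
		if (!client_data)
			continue;	/* slot allocated, data not stored yet */
		/* ... use client_data ... */
	}
}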
+ */ +static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, + xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, filter); + if (xa_is_zero(entry)) + break; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) { + *indexp = xas.xa_index; + if (xa_is_zero(entry)) + return NULL; + return entry; + } + return XA_ERROR(-ENOENT); +} +#define xan_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ + !xa_is_err(entry); \ + (index)++, entry = xan_find_marked(xa, &(index), filter)) + +/* RCU hash table mapping netdevice pointers to struct ib_port_data */ +static DEFINE_SPINLOCK(ndev_hash_lock); +static DECLARE_HASHTABLE(ndev_hash, 5); + +static void free_netdevs(struct ib_device *ib_dev); +static void ib_unregister_work(struct work_struct *work); +static void __ib_unregister_device(struct ib_device *device); +static int ib_security_change(struct notifier_block *nb, unsigned long event, + void *lsm_data); +static void ib_policy_change_task(struct work_struct *work); +static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); + +static void __ibdev_printk(const char *level, const struct ib_device *ibdev, + struct va_format *vaf) +{ + if (ibdev && ibdev->dev.parent) + dev_printk_emit(level[1] - '0', + ibdev->dev.parent, + "%s %s %s: %pV", + dev_driver_string(ibdev->dev.parent), + dev_name(ibdev->dev.parent), + dev_name(&ibdev->dev), + vaf); + else if (ibdev) + printk("%s%s: %pV", + level, dev_name(&ibdev->dev), vaf); + else + printk("%s(NULL ib_device): %pV", level, vaf); +} + +void ibdev_printk(const char *level, const struct ib_device *ibdev, + const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + __ibdev_printk(level, ibdev, &vaf); + + va_end(args); +} +EXPORT_SYMBOL(ibdev_printk); + +#define define_ibdev_printk_level(func, level) \ +void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __ibdev_printk(level, ibdev, &vaf); \ + \ + va_end(args); \ +} \ +EXPORT_SYMBOL(func); + +define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); +define_ibdev_printk_level(ibdev_alert, KERN_ALERT); +define_ibdev_printk_level(ibdev_crit, KERN_CRIT); +define_ibdev_printk_level(ibdev_err, KERN_ERR); +define_ibdev_printk_level(ibdev_warn, KERN_WARNING); +define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); +define_ibdev_printk_level(ibdev_info, KERN_INFO); + +static struct notifier_block ibdev_lsm_nb = { + .notifier_call = ib_security_change, +}; + +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net); + +/* Pointer to the RCU head at the start of the ib_port_data array */ +struct ib_port_data_rcu { + struct rcu_head rcu_head; + struct ib_port_data pdata[]; +}; + +static void ib_device_check_mandatory(struct ib_device *device) +{ +#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } + static const struct { + size_t offset; + char *name; + } mandatory_table[] = { + IB_MANDATORY_FUNC(query_device), + IB_MANDATORY_FUNC(query_port), + IB_MANDATORY_FUNC(alloc_pd), + IB_MANDATORY_FUNC(dealloc_pd), + IB_MANDATORY_FUNC(create_qp), + IB_MANDATORY_FUNC(modify_qp), + IB_MANDATORY_FUNC(destroy_qp), + IB_MANDATORY_FUNC(post_send), + IB_MANDATORY_FUNC(post_recv), + IB_MANDATORY_FUNC(create_cq), + IB_MANDATORY_FUNC(destroy_cq), + IB_MANDATORY_FUNC(poll_cq), + IB_MANDATORY_FUNC(req_notify_cq), + IB_MANDATORY_FUNC(get_dma_mr), + IB_MANDATORY_FUNC(reg_user_mr), + IB_MANDATORY_FUNC(dereg_mr), + IB_MANDATORY_FUNC(get_port_immutable) + }; + int i; + + device->kverbs_provider = true; + for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { + if (!*(void **) ((void *) &device->ops + + mandatory_table[i].offset)) { + device->kverbs_provider = false; + break; + } + } +} + +/* + * Caller must perform ib_device_put() to return the device reference count + * when ib_device_get_by_index() returns valid device pointer. + */ +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) +{ + struct ib_device *device; + + down_read(&devices_rwsem); + device = xa_load(&devices, index); + if (device) { + if (!rdma_dev_access_netns(device, net)) { + device = NULL; + goto out; + } + + if (!ib_device_try_get(device)) + device = NULL; + } +out: + up_read(&devices_rwsem); + return device; +} + +/** + * ib_device_put - Release IB device reference + * @device: device whose reference to be released + * + * ib_device_put() releases reference to the IB device to allow it to be + * unregistered and eventually free. + */ +void ib_device_put(struct ib_device *device) +{ + if (refcount_dec_and_test(&device->refcount)) + complete(&device->unreg_completion); +} +EXPORT_SYMBOL(ib_device_put); + +static struct ib_device *__ib_device_get_by_name(const char *name) +{ + struct ib_device *device; + unsigned long index; + + xa_for_each (&devices, index, device) + if (!strcmp(name, dev_name(&device->dev))) + return device; + + return NULL; +} + +/** + * ib_device_get_by_name - Find an IB device by name + * @name: The name to look for + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device by its name. The caller must call + * ib_device_put() on the returned pointer. 
+ */ +struct ib_device *ib_device_get_by_name(const char *name, + enum rdma_driver_id driver_id) +{ + struct ib_device *device; + + down_read(&devices_rwsem); + device = __ib_device_get_by_name(name); + if (device && driver_id != RDMA_DRIVER_UNKNOWN && + device->ops.driver_id != driver_id) + device = NULL; + + if (device) { + if (!ib_device_try_get(device)) + device = NULL; + } + up_read(&devices_rwsem); + return device; +} +EXPORT_SYMBOL(ib_device_get_by_name); + +static int rename_compat_devs(struct ib_device *device) +{ + struct ib_core_device *cdev; + unsigned long index; + int ret = 0; + + mutex_lock(&device->compat_devs_mutex); + xa_for_each (&device->compat_devs, index, cdev) { + ret = device_rename(&cdev->dev, dev_name(&device->dev)); + if (ret) { + dev_warn(&cdev->dev, + "Fail to rename compatdev to new name %s\n", + dev_name(&device->dev)); + break; + } + } + mutex_unlock(&device->compat_devs_mutex); + return ret; +} + +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + unsigned long index; + void *client_data; + int ret; + + down_write(&devices_rwsem); + if (!strcmp(name, dev_name(&ibdev->dev))) { + up_write(&devices_rwsem); + return 0; + } + + if (__ib_device_get_by_name(name)) { + up_write(&devices_rwsem); + return -EEXIST; + } + + ret = device_rename(&ibdev->dev, name); + if (ret) { + up_write(&devices_rwsem); + return ret; + } + + strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); + ret = rename_compat_devs(ibdev); + + downgrade_write(&devices_rwsem); + down_read(&ibdev->client_data_rwsem); + xan_for_each_marked(&ibdev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || !client->rename) + continue; + + client->rename(ibdev, client_data); + } + up_read(&ibdev->client_data_rwsem); + up_read(&devices_rwsem); + return 0; +} + +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) +{ + if (use_dim > 1) + return -EINVAL; + ibdev->use_cq_dim = use_dim; + + return 0; +} + +static int alloc_name(struct ib_device *ibdev, const char *name) +{ + struct ib_device *device; + unsigned long index; + struct ida inuse; + int rc; + int i; + + lockdep_assert_held_write(&devices_rwsem); + ida_init(&inuse); + xa_for_each (&devices, index, device) { + char buf[IB_DEVICE_NAME_MAX]; + + if (sscanf(dev_name(&device->dev), name, &i) != 1) + continue; + if (i < 0 || i >= INT_MAX) + continue; + snprintf(buf, sizeof buf, name, i); + if (strcmp(buf, dev_name(&device->dev)) != 0) + continue; + + rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); + if (rc < 0) + goto out; + } + + rc = ida_alloc(&inuse, GFP_KERNEL); + if (rc < 0) + goto out; + + rc = dev_set_name(&ibdev->dev, name, rc); +out: + ida_destroy(&inuse); + return rc; +} + +static void ib_device_release(struct device *device) +{ + struct ib_device *dev = container_of(device, struct ib_device, dev); + + free_netdevs(dev); + WARN_ON(refcount_read(&dev->refcount)); + if (dev->hw_stats_data) + ib_device_release_hw_stats(dev->hw_stats_data); + if (dev->port_data) { + ib_cache_release_one(dev); + ib_security_release_port_pkey_list(dev); + rdma_counter_release(dev); + kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, + pdata[0]), + rcu_head); + } + + mutex_destroy(&dev->unregistration_lock); + mutex_destroy(&dev->compat_devs_mutex); + + xa_destroy(&dev->compat_devs); + xa_destroy(&dev->client_data); + kfree_rcu(dev, rcu_head); +} + +static int ib_device_uevent(struct device *device, + struct kobj_uevent_env *env) +{ + if (add_uevent_var(env, 
"NAME=%s", dev_name(device))) + return -ENOMEM; + + /* + * It would be nice to pass the node GUID with the event... + */ + + return 0; +} + +static const void *net_namespace(struct device *d) +{ + struct ib_core_device *coredev = + container_of(d, struct ib_core_device, dev); + + return read_pnet(&coredev->rdma_net); +} + +static struct class ib_class = { + .name = "infiniband", + .dev_release = ib_device_release, + .dev_uevent = ib_device_uevent, + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, +}; + +static void rdma_init_coredev(struct ib_core_device *coredev, + struct ib_device *dev, struct net *net) +{ + /* This BUILD_BUG_ON is intended to catch layout change + * of union of ib_core_device and device. + * dev must be the first element as ib_core and providers + * driver uses it. Adding anything in ib_core_device before + * device will break this assumption. + */ + BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != + offsetof(struct ib_device, dev)); + + coredev->dev.class = &ib_class; + coredev->dev.groups = dev->groups; + device_initialize(&coredev->dev); + coredev->owner = dev; + INIT_LIST_HEAD(&coredev->port_list); + write_pnet(&coredev->rdma_net, net); +} + +/** + * _ib_alloc_device - allocate an IB device struct + * @size:size of structure to allocate + * + * Low-level drivers should use ib_alloc_device() to allocate &struct + * ib_device. @size is the size of the structure to be allocated, + * including any private data used by the low-level driver. + * ib_dealloc_device() must be used to free structures allocated with + * ib_alloc_device(). + */ +struct ib_device *_ib_alloc_device(size_t size) +{ + struct ib_device *device; + unsigned int i; + + if (WARN_ON(size < sizeof(struct ib_device))) + return NULL; + + device = kzalloc(size, GFP_KERNEL); + if (!device) + return NULL; + + if (rdma_restrack_init(device)) { + kfree(device); + return NULL; + } + + rdma_init_coredev(&device->coredev, device, &init_net); + + INIT_LIST_HEAD(&device->event_handler_list); + spin_lock_init(&device->qp_open_list_lock); + init_rwsem(&device->event_handler_rwsem); + mutex_init(&device->unregistration_lock); + /* + * client_data needs to be alloc because we don't want our mark to be + * destroyed if the user stores NULL in the client data. 
+ */ + xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); + init_rwsem(&device->client_data_rwsem); + xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); + mutex_init(&device->compat_devs_mutex); + init_completion(&device->unreg_completion); + INIT_WORK(&device->unregistration_work, ib_unregister_work); + + spin_lock_init(&device->cq_pools_lock); + for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) + INIT_LIST_HEAD(&device->cq_pools[i]); + + rwlock_init(&device->cache_lock); + + device->uverbs_cmd_mask = + BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | + BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | + BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | + BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | + BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | + BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | + BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | + BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | + BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | + BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | + BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | + BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | + BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); + return device; +} +EXPORT_SYMBOL(_ib_alloc_device); + +/** + * ib_dealloc_device - free an IB device struct + * @device:structure to free + * + * Free a structure allocated with ib_alloc_device(). + */ +void ib_dealloc_device(struct ib_device *device) +{ + if (device->ops.dealloc_driver) + device->ops.dealloc_driver(device); + + /* + * ib_unregister_driver() requires all devices to remain in the xarray + * while their ops are callable. The last op we call is dealloc_driver + * above. This is needed to create a fence on op callbacks prior to + * allowing the driver module to unload. + */ + down_write(&devices_rwsem); + if (xa_load(&devices, device->index) == device) + xa_erase(&devices, device->index); + up_write(&devices_rwsem); + + /* Expedite releasing netdev references */ + free_netdevs(device); + + WARN_ON(!xa_empty(&device->compat_devs)); + WARN_ON(!xa_empty(&device->client_data)); + WARN_ON(refcount_read(&device->refcount)); + rdma_restrack_clean(device); + /* Balances with device_initialize */ + put_device(&device->dev); +} +EXPORT_SYMBOL(ib_dealloc_device); + +/* + * add_client_context() and remove_client_context() must be safe against + * parallel calls on the same device - registration/unregistration of both the + * device and client can be occurring in parallel. + * + * The routines need to be a fence, any caller must not return until the add + * or remove is fully completed. + */ +static int add_client_context(struct ib_device *device, + struct ib_client *client) +{ + int ret = 0; + + if (!device->kverbs_provider && !client->no_kverbs_req) + return 0; + + down_write(&device->client_data_rwsem); + /* + * So long as the client is registered hold both the client and device + * unregistration locks. 
+ */ + if (!refcount_inc_not_zero(&client->uses)) + goto out_unlock; + refcount_inc(&device->refcount); + + /* + * Another caller to add_client_context got here first and has already + * completely initialized context. + */ + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED)) + goto out; + + ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL)); + if (ret) + goto out; + downgrade_write(&device->client_data_rwsem); + if (client->add) { + if (client->add(device)) { + /* + * If a client fails to add then the error code is + * ignored, but we won't call any more ops on this + * client. + */ + xa_erase(&device->client_data, client->client_id); + up_read(&device->client_data_rwsem); + ib_device_put(device); + ib_client_put(client); + return 0; + } + } + + /* Readers shall not see a client until add has been completed */ + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); + up_read(&device->client_data_rwsem); + return 0; + +out: + ib_device_put(device); + ib_client_put(client); +out_unlock: + up_write(&device->client_data_rwsem); + return ret; +} + +static void remove_client_context(struct ib_device *device, + unsigned int client_id) +{ + struct ib_client *client; + void *client_data; + + down_write(&device->client_data_rwsem); + if (!xa_get_mark(&device->client_data, client_id, + CLIENT_DATA_REGISTERED)) { + up_write(&device->client_data_rwsem); + return; + } + client_data = xa_load(&device->client_data, client_id); + xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); + client = xa_load(&clients, client_id); + up_write(&device->client_data_rwsem); + + /* + * Notice we cannot be holding any exclusive locks when calling the + * remove callback as the remove callback can recurse back into any + * public functions in this module and thus try for any locks those + * functions take. + * + * For this reason clients and drivers should not call the + * unregistration functions will holdling any locks. + */ + if (client->remove) + client->remove(device, client_data); + + xa_erase(&device->client_data, client_id); + ib_device_put(device); + ib_client_put(client); +} + +static int alloc_port_data(struct ib_device *device) +{ + struct ib_port_data_rcu *pdata_rcu; + u32 port; + + if (device->port_data) + return 0; + + /* This can only be called once the physical port range is defined */ + if (WARN_ON(!device->phys_port_cnt)) + return -EINVAL; + + /* Reserve U32_MAX so the logic to go over all the ports is sane */ + if (WARN_ON(device->phys_port_cnt == U32_MAX)) + return -EINVAL; + + /* + * device->port_data is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_data is declared as a 1 based array with potential + * empty slots at the beginning. + */ + pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, + rdma_end_port(device) + 1), + GFP_KERNEL); + if (!pdata_rcu) + return -ENOMEM; + /* + * The rcu_head is put in front of the port data array and the stored + * pointer is adjusted since we never need to see that member until + * kfree_rcu. 
+ */ + device->port_data = pdata_rcu->pdata; + + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + + pdata->ib_dev = device; + spin_lock_init(&pdata->pkey_list_lock); + INIT_LIST_HEAD(&pdata->pkey_list); + spin_lock_init(&pdata->netdev_lock); + INIT_HLIST_NODE(&pdata->ndev_hash_link); + } + return 0; +} + +static int verify_immutable(const struct ib_device *dev, u32 port) +{ + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} + +static int setup_port_data(struct ib_device *device) +{ + u32 port; + int ret; + + ret = alloc_port_data(device); + if (ret) + return ret; + + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + + ret = device->ops.get_port_immutable(device, port, + &pdata->immutable); + if (ret) + return ret; + + if (verify_immutable(device, port)) + return -EINVAL; + } + return 0; +} + +/** + * ib_port_immutable_read() - Read rdma port's immutable data + * @dev: IB device + * @port: port number whose immutable data to read. It starts with index 1 and + * valid upto including rdma_end_port(). + */ +const struct ib_port_immutable* +ib_port_immutable_read(struct ib_device *dev, unsigned int port) +{ + WARN_ON(!rdma_is_port_valid(dev, port)); + return &dev->port_data[port].immutable; +} +EXPORT_SYMBOL(ib_port_immutable_read); + +void ib_get_device_fw_str(struct ib_device *dev, char *str) +{ + if (dev->ops.get_dev_fw_str) + dev->ops.get_dev_fw_str(dev, str); + else + str[0] = '\0'; +} +EXPORT_SYMBOL(ib_get_device_fw_str); + +static void ib_policy_change_task(struct work_struct *work) +{ + struct ib_device *dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + unsigned int i; + + rdma_for_each_port (dev, i) { + u64 sp; + ib_get_cached_subnet_prefix(dev, i, &sp); + ib_security_cache_change(dev, i, sp); + } + } + up_read(&devices_rwsem); +} + +static int ib_security_change(struct notifier_block *nb, unsigned long event, + void *lsm_data) +{ + if (event != LSM_POLICY_CHANGE) + return NOTIFY_DONE; + + schedule_work(&ib_policy_change_work); + ib_mad_agent_security_change(); + + return NOTIFY_OK; +} + +static void compatdev_release(struct device *dev) +{ + struct ib_core_device *cdev = + container_of(dev, struct ib_core_device, dev); + + kfree(cdev); +} + +static int add_one_compat_dev(struct ib_device *device, + struct rdma_dev_net *rnet) +{ + struct ib_core_device *cdev; + int ret; + + lockdep_assert_held(&rdma_nets_rwsem); + if (!ib_devices_shared_netns) + return 0; + + /* + * Create and add compat device in all namespaces other than where it + * is currently bound to. + */ + if (net_eq(read_pnet(&rnet->net), + read_pnet(&device->coredev.rdma_net))) + return 0; + + /* + * The first of init_net() or ib_register_device() to take the + * compat_devs_mutex wins and gets to add the device. Others will wait + * for completion here. 
+ */ + mutex_lock(&device->compat_devs_mutex); + cdev = xa_load(&device->compat_devs, rnet->id); + if (cdev) { + ret = 0; + goto done; + } + ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); + if (ret) + goto done; + + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) { + ret = -ENOMEM; + goto cdev_err; + } + + cdev->dev.parent = device->dev.parent; + rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); + cdev->dev.release = compatdev_release; + ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); + if (ret) + goto add_err; + + ret = device_add(&cdev->dev); + if (ret) + goto add_err; + ret = ib_setup_port_attrs(cdev); + if (ret) + goto port_err; + + ret = xa_err(xa_store(&device->compat_devs, rnet->id, + cdev, GFP_KERNEL)); + if (ret) + goto insert_err; + + mutex_unlock(&device->compat_devs_mutex); + return 0; + +insert_err: + ib_free_port_attrs(cdev); +port_err: + device_del(&cdev->dev); +add_err: + put_device(&cdev->dev); +cdev_err: + xa_release(&device->compat_devs, rnet->id); +done: + mutex_unlock(&device->compat_devs_mutex); + return ret; +} + +static void remove_one_compat_dev(struct ib_device *device, u32 id) +{ + struct ib_core_device *cdev; + + mutex_lock(&device->compat_devs_mutex); + cdev = xa_erase(&device->compat_devs, id); + mutex_unlock(&device->compat_devs_mutex); + if (cdev) { + ib_free_port_attrs(cdev); + device_del(&cdev->dev); + put_device(&cdev->dev); + } +} + +static void remove_compat_devs(struct ib_device *device) +{ + struct ib_core_device *cdev; + unsigned long index; + + xa_for_each (&device->compat_devs, index, cdev) + remove_one_compat_dev(device, index); +} + +static int add_compat_devs(struct ib_device *device) +{ + struct rdma_dev_net *rnet; + unsigned long index; + int ret = 0; + + lockdep_assert_held(&devices_rwsem); + + down_read(&rdma_nets_rwsem); + xa_for_each (&rdma_nets, index, rnet) { + ret = add_one_compat_dev(device, rnet); + if (ret) + break; + } + up_read(&rdma_nets_rwsem); + return ret; +} + +static void remove_all_compat_devs(void) +{ + struct ib_compat_device *cdev; + struct ib_device *dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each (&devices, index, dev) { + unsigned long c_index = 0; + + /* Hold nets_rwsem so that any other thread modifying this + * system param can sync with this thread. + */ + down_read(&rdma_nets_rwsem); + xa_for_each (&dev->compat_devs, c_index, cdev) + remove_one_compat_dev(dev, c_index); + up_read(&rdma_nets_rwsem); + } + up_read(&devices_rwsem); +} + +static int add_all_compat_devs(void) +{ + struct rdma_dev_net *rnet; + struct ib_device *dev; + unsigned long index; + int ret = 0; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + unsigned long net_index = 0; + + /* Hold nets_rwsem so that any other thread modifying this + * system param can sync with this thread. + */ + down_read(&rdma_nets_rwsem); + xa_for_each (&rdma_nets, net_index, rnet) { + ret = add_one_compat_dev(dev, rnet); + if (ret) + break; + } + up_read(&rdma_nets_rwsem); + } + up_read(&devices_rwsem); + if (ret) + remove_all_compat_devs(); + return ret; +} + +int rdma_compatdev_set(u8 enable) +{ + struct rdma_dev_net *rnet; + unsigned long index; + int ret = 0; + + down_write(&rdma_nets_rwsem); + if (ib_devices_shared_netns == enable) { + up_write(&rdma_nets_rwsem); + return 0; + } + + /* enable/disable of compat devices is not supported + * when more than default init_net exists. 
+ */ + xa_for_each (&rdma_nets, index, rnet) { + ret++; + break; + } + if (!ret) + ib_devices_shared_netns = enable; + up_write(&rdma_nets_rwsem); + if (ret) + return -EBUSY; + + if (enable) + ret = add_all_compat_devs(); + else + remove_all_compat_devs(); + return ret; +} + +static void rdma_dev_exit_net(struct net *net) +{ + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + struct ib_device *dev; + unsigned long index; + int ret; + + down_write(&rdma_nets_rwsem); + /* + * Prevent the ID from being re-used and hide the id from xa_for_each. + */ + ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); + WARN_ON(ret); + up_write(&rdma_nets_rwsem); + + down_read(&devices_rwsem); + xa_for_each (&devices, index, dev) { + get_device(&dev->dev); + /* + * Release the devices_rwsem so that pontentially blocking + * device_del, doesn't hold the devices_rwsem for too long. + */ + up_read(&devices_rwsem); + + remove_one_compat_dev(dev, rnet->id); + + /* + * If the real device is in the NS then move it back to init. + */ + rdma_dev_change_netns(dev, net, &init_net); + + put_device(&dev->dev); + down_read(&devices_rwsem); + } + up_read(&devices_rwsem); + + rdma_nl_net_exit(rnet); + xa_erase(&rdma_nets, rnet->id); +} + +static __net_init int rdma_dev_init_net(struct net *net) +{ + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + unsigned long index; + struct ib_device *dev; + int ret; + + write_pnet(&rnet->net, net); + + ret = rdma_nl_net_init(rnet); + if (ret) + return ret; + + /* No need to create any compat devices in default init_net. */ + if (net_eq(net, &init_net)) + return 0; + + ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); + if (ret) { + rdma_nl_net_exit(rnet); + return ret; + } + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + /* Hold nets_rwsem so that netlink command cannot change + * system configuration for device sharing mode. + */ + down_read(&rdma_nets_rwsem); + ret = add_one_compat_dev(dev, rnet); + up_read(&rdma_nets_rwsem); + if (ret) + break; + } + up_read(&devices_rwsem); + + if (ret) + rdma_dev_exit_net(net); + + return ret; +} + +/* + * Assign the unique string device name and the unique device index. This is + * undone by ib_dealloc_device. + */ +static int assign_name(struct ib_device *device, const char *name) +{ + static u32 last_id; + int ret; + + down_write(&devices_rwsem); + /* Assign a unique name to the device */ + if (strchr(name, '%')) + ret = alloc_name(device, name); + else + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; + + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + + ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret > 0) + ret = 0; + +out: + up_write(&devices_rwsem); + return ret; +} + +/* + * setup_device() allocates memory and sets up data that requires calling the + * device ops, this is the only reason these actions are not done during + * ib_alloc_device. It is undone by ib_dealloc_device(). 
+ */ +static int setup_device(struct ib_device *device) +{ + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + int ret; + + ib_device_check_mandatory(device); + + ret = setup_port_data(device); + if (ret) { + dev_warn(&device->dev, "Couldn't create per-port data\n"); + return ret; + } + + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->ops.query_device(device, &device->attrs, &uhw); + if (ret) { + dev_warn(&device->dev, + "Couldn't query the device attributes\n"); + return ret; + } + + return 0; +} + +static void disable_device(struct ib_device *device) +{ + u32 cid; + + WARN_ON(!refcount_read(&device->refcount)); + + down_write(&devices_rwsem); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + /* + * Remove clients in LIFO order, see assign_client_id. This could be + * more efficient if xarray learns to reverse iterate. Since no new + * clients can be added to this ib_device past this point we only need + * the maximum possible client_id value here. + */ + down_read(&clients_rwsem); + cid = highest_client_id; + up_read(&clients_rwsem); + while (cid) { + cid--; + remove_client_context(device, cid); + } + + ib_cq_pool_cleanup(device); + + /* Pairs with refcount_set in enable_device */ + ib_device_put(device); + wait_for_completion(&device->unreg_completion); + + /* + * compat devices must be removed after device refcount drops to zero. + * Otherwise init_net() may add more compatdevs after removing compat + * devices and before device is disabled. + */ + remove_compat_devs(device); +} + +/* + * An enabled device is visible to all clients and to all the public facing + * APIs that return a device pointer. This always returns with a new get, even + * if it fails. + */ +static int enable_device_and_get(struct ib_device *device) +{ + struct ib_client *client; + unsigned long index; + int ret = 0; + + /* + * One ref belongs to the xa and the other belongs to this + * thread. This is needed to guard against parallel unregistration. + */ + refcount_set(&device->refcount, 2); + down_write(&devices_rwsem); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); + + /* + * By using downgrade_write() we ensure that no other thread can clear + * DEVICE_REGISTERED while we are completing the client setup. + */ + downgrade_write(&devices_rwsem); + + if (device->ops.enable_driver) { + ret = device->ops.enable_driver(device); + if (ret) + goto out; + } + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + ret = add_client_context(device, client); + if (ret) + break; + } + up_read(&clients_rwsem); + if (!ret) + ret = add_compat_devs(device); +out: + up_read(&devices_rwsem); + return ret; +} + +static void prevent_dealloc_device(struct ib_device *ib_dev) +{ +} + +/** + * ib_register_device - Register an IB device with IB core + * @device: Device to register + * @name: unique string device name. This may include a '%' which will + * cause a unique index to be added to the passed device name. + * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB + * device will be used. In this case the caller should fully + * setup the ibdev for DMA. This usually means using dma_virt_ops. + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device(). 
+ * + * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() + * asynchronously then the device pointer may become freed as soon as this + * function returns. + */ +int ib_register_device(struct ib_device *device, const char *name, + struct device *dma_device) +{ + int ret; + + ret = assign_name(device, name); + if (ret) + return ret; + + /* + * If the caller does not provide a DMA capable device then the IB core + * will set up ib_sge and scatterlist structures that stash the kernel + * virtual address into the address field. + */ + WARN_ON(dma_device && !dma_device->dma_parms); + device->dma_device = dma_device; + + ret = setup_device(device); + if (ret) + return ret; + + ret = ib_cache_setup_one(device); + if (ret) { + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); + return ret; + } + + device->groups[0] = &ib_dev_attr_group; + device->groups[1] = device->ops.device_group; + ret = ib_setup_device_attrs(device); + if (ret) + goto cache_cleanup; + + ib_device_register_rdmacg(device); + + rdma_counter_init(device); + + /* + * Ensure that ADD uevent is not fired because it + * is too early amd device is not initialized yet. + */ + dev_set_uevent_suppress(&device->dev, true); + ret = device_add(&device->dev); + if (ret) + goto cg_cleanup; + + ret = ib_setup_port_attrs(&device->coredev); + if (ret) { + dev_warn(&device->dev, + "Couldn't register device with driver model\n"); + goto dev_cleanup; + } + + ret = enable_device_and_get(device); + if (ret) { + void (*dealloc_fn)(struct ib_device *); + + /* + * If we hit this error flow then we don't want to + * automatically dealloc the device since the caller is + * expected to call ib_dealloc_device() after + * ib_register_device() fails. This is tricky due to the + * possibility for a parallel unregistration along with this + * error flow. Since we have a refcount here we know any + * parallel flow is stopped in disable_device and will see the + * special dealloc_driver pointer, causing the responsibility to + * ib_dealloc_device() to revert back to this thread. + */ + dealloc_fn = device->ops.dealloc_driver; + device->ops.dealloc_driver = prevent_dealloc_device; + ib_device_put(device); + __ib_unregister_device(device); + device->ops.dealloc_driver = dealloc_fn; + dev_set_uevent_suppress(&device->dev, false); + return ret; + } + dev_set_uevent_suppress(&device->dev, false); + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + ib_device_put(device); + + /* + * Workaround (fix) for an issue where, ip event notifier, + * missed out to add GID entries, because ibdev is not yet added + * to the list; and first call to roce_rescan_device() during GID + * table setup also didn't add the GID, because IP was not present. + * So trigger the scane one more time after ibdev is part of the + * core's list. + * TODO: Do a proper fix to scan only IP based GIDs without + * net_rwsem and rtnl lock and still synchronize with ifa_list! + */ + rdma_roce_rescan_device(device); + + return 0; + +dev_cleanup: + device_del(&device->dev); +cg_cleanup: + dev_set_uevent_suppress(&device->dev, false); + ib_device_unregister_rdmacg(device); +cache_cleanup: + ib_cache_cleanup_one(device); + return ret; +} +EXPORT_SYMBOL(ib_register_device); + +/* Callers must hold a get on the device. 
*/ +static void __ib_unregister_device(struct ib_device *ib_dev) +{ + /* + * We have a registration lock so that all the calls to unregister are + * fully fenced, once any unregister returns the device is truely + * unregistered even if multiple callers are unregistering it at the + * same time. This also interacts with the registration flow and + * provides sane semantics if register and unregister are racing. + */ + mutex_lock(&ib_dev->unregistration_lock); + if (!refcount_read(&ib_dev->refcount)) + goto out; + + disable_device(ib_dev); + + /* Expedite removing unregistered pointers from the hash table */ + free_netdevs(ib_dev); + + ib_free_port_attrs(&ib_dev->coredev); + device_del(&ib_dev->dev); + ib_device_unregister_rdmacg(ib_dev); + ib_cache_cleanup_one(ib_dev); + + /* + * Drivers using the new flow may not call ib_dealloc_device except + * in error unwind prior to registration success. + */ + if (ib_dev->ops.dealloc_driver && + ib_dev->ops.dealloc_driver != prevent_dealloc_device) { + WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); + ib_dealloc_device(ib_dev); + } +out: + mutex_unlock(&ib_dev->unregistration_lock); +} + +/** + * ib_unregister_device - Unregister an IB device + * @ib_dev: The device to unregister + * + * Unregister an IB device. All clients will receive a remove callback. + * + * Callers should call this routine only once, and protect against races with + * registration. Typically it should only be called as part of a remove + * callback in an implementation of driver core's struct device_driver and + * related. + * + * If ops.dealloc_driver is used then ib_dev will be freed upon return from + * this function. + */ +void ib_unregister_device(struct ib_device *ib_dev) +{ + get_device(&ib_dev->dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device); + +/** + * ib_unregister_device_and_put - Unregister a device while holding a 'get' + * @ib_dev: The device to unregister + * + * This is the same as ib_unregister_device(), except it includes an internal + * ib_device_put() that should match a 'get' obtained by the caller. + * + * It is safe to call this routine concurrently from multiple threads while + * holding the 'get'. When the function returns the device is fully + * unregistered. + * + * Drivers using this flow MUST use the driver_unregister callback to clean up + * their resources associated with the device and dealloc it. + */ +void ib_unregister_device_and_put(struct ib_device *ib_dev) +{ + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + ib_device_put(ib_dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_and_put); + +/** + * ib_unregister_driver - Unregister all IB devices for a driver + * @driver_id: The driver to unregister + * + * This implements a fence for device unregistration. It only returns once all + * devices associated with the driver_id have fully completed their + * unregistration and returned from ib_unregister_device*(). + * + * If device's are not yet unregistered it goes ahead and starts unregistering + * them. + * + * This does not block creation of new devices with the given driver_id, that + * is the responsibility of the caller. 
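+ *
+ * A typical call site is the driver's module exit path; the sketch below
+ * is illustrative only (the exit function name is hypothetical, the
+ * driver id is one example value):
+ *
+ *	static void __exit my_driver_exit(void)
+ *	{
+ *		ib_unregister_driver(RDMA_DRIVER_MLX5);
+ *	}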
+ */ +void ib_unregister_driver(enum rdma_driver_id driver_id) +{ + struct ib_device *ib_dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each (&devices, index, ib_dev) { + if (ib_dev->ops.driver_id != driver_id) + continue; + + get_device(&ib_dev->dev); + up_read(&devices_rwsem); + + WARN_ON(!ib_dev->ops.dealloc_driver); + __ib_unregister_device(ib_dev); + + put_device(&ib_dev->dev); + down_read(&devices_rwsem); + } + up_read(&devices_rwsem); +} +EXPORT_SYMBOL(ib_unregister_driver); + +static void ib_unregister_work(struct work_struct *work) +{ + struct ib_device *ib_dev = + container_of(work, struct ib_device, unregistration_work); + + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} + +/** + * ib_unregister_device_queued - Unregister a device using a work queue + * @ib_dev: The device to unregister + * + * This schedules an asynchronous unregistration using a WQ for the device. A + * driver should use this to avoid holding locks while doing unregistration, + * such as holding the RTNL lock. + * + * Drivers using this API must use ib_unregister_driver before module unload + * to ensure that all scheduled unregistrations have completed. + */ +void ib_unregister_device_queued(struct ib_device *ib_dev) +{ + WARN_ON(!refcount_read(&ib_dev->refcount)); + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_queued); + +/* + * The caller must pass in a device that has the kref held and the refcount + * released. If the device is in cur_net and still registered then it is moved + * into net. + */ +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net) +{ + int ret2 = -EINVAL; + int ret; + + mutex_lock(&device->unregistration_lock); + + /* + * If a device not under ib_device_get() or if the unregistration_lock + * is not held, the namespace can be changed, or it can be unregistered. + * Check again under the lock. + */ + if (refcount_read(&device->refcount) == 0 || + !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { + ret = -ENODEV; + goto out; + } + + kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); + disable_device(device); + + /* + * At this point no one can be using the device, so it is safe to + * change the namespace. + */ + write_pnet(&device->coredev.rdma_net, net); + + down_read(&devices_rwsem); + /* + * Currently rdma devices are system wide unique. So the device name + * is guaranteed free in the new namespace. Publish the new namespace + * at the sysfs level. + */ + ret = device_rename(&device->dev, dev_name(&device->dev)); + up_read(&devices_rwsem); + if (ret) { + dev_warn(&device->dev, + "%s: Couldn't rename device after namespace change\n", + __func__); + /* Try and put things back and re-enable the device */ + write_pnet(&device->coredev.rdma_net, cur_net); + } + + ret2 = enable_device_and_get(device); + if (ret2) { + /* + * This shouldn't really happen, but if it does, let the user + * retry at later point. So don't disable the device. 
+ */ + dev_warn(&device->dev, + "%s: Couldn't re-enable device after namespace change\n", + __func__); + } + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + + ib_device_put(device); +out: + mutex_unlock(&device->unregistration_lock); + if (ret) + return ret; + return ret2; +} + +int ib_device_set_netns_put(struct sk_buff *skb, + struct ib_device *dev, u32 ns_fd) +{ + struct net *net; + int ret; + + net = get_net_ns_by_fd(ns_fd); + if (IS_ERR(net)) { + ret = PTR_ERR(net); + goto net_err; + } + + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto ns_err; + } + + /* + * All the ib_clients, including uverbs, are reset when the namespace is + * changed and this cannot be blocked waiting for userspace to do + * something, so disassociation is mandatory. + */ + if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { + ret = -EOPNOTSUPP; + goto ns_err; + } + + get_device(&dev->dev); + ib_device_put(dev); + ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); + put_device(&dev->dev); + + put_net(net); + return ret; + +ns_err: + put_net(net); +net_err: + ib_device_put(dev); + return ret; +} + +static struct pernet_operations rdma_dev_net_ops = { + .init = rdma_dev_init_net, + .exit = rdma_dev_exit_net, + .id = &rdma_dev_net_id, + .size = sizeof(struct rdma_dev_net), +}; + +static int assign_client_id(struct ib_client *client) +{ + int ret; + + down_write(&clients_rwsem); + /* + * The add/remove callbacks must be called in FIFO/LIFO order. To + * achieve this we assign client_ids so they are sorted in + * registration order. + */ + client->client_id = highest_client_id; + ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); + if (ret) + goto out; + + highest_client_id++; + xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); + +out: + up_write(&clients_rwsem); + return ret; +} + +static void remove_client_id(struct ib_client *client) +{ + down_write(&clients_rwsem); + xa_erase(&clients, client->client_id); + for (; highest_client_id; highest_client_id--) + if (xa_load(&clients, highest_client_id - 1)) + break; + up_write(&clients_rwsem); +} + +/** + * ib_register_client - Register an IB client + * @client:Client to register + * + * Upper level users of the IB drivers can use ib_register_client() to + * register callbacks for IB device addition and removal. When an IB + * device is added, each registered client's add method will be called + * (in the order the clients were registered), and when a device is + * removed, each client's remove method will be called (in the reverse + * order that clients were registered). In addition, when + * ib_register_client() is called, the client will receive an add + * callback for all devices already registered. + */ +int ib_register_client(struct ib_client *client) +{ + struct ib_device *device; + unsigned long index; + int ret; + + refcount_set(&client->uses, 1); + init_completion(&client->uses_zero); + ret = assign_client_id(client); + if (ret) + return ret; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { + ret = add_client_context(device, client); + if (ret) { + up_read(&devices_rwsem); + ib_unregister_client(client); + return ret; + } + } + up_read(&devices_rwsem); + return 0; +} +EXPORT_SYMBOL(ib_register_client); + +/** + * ib_unregister_client - Unregister an IB client + * @client:Client to unregister + * + * Upper level users use ib_unregister_client() to remove their client + * registration. 
When ib_unregister_client() is called, the client + * will receive a remove callback for each IB device still registered. + * + * This is a full fence, once it returns no client callbacks will be called, + * or are running in another thread. + */ +void ib_unregister_client(struct ib_client *client) +{ + struct ib_device *device; + unsigned long index; + + down_write(&clients_rwsem); + ib_client_put(client); + xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); + up_write(&clients_rwsem); + + /* We do not want to have locks while calling client->remove() */ + rcu_read_lock(); + xa_for_each (&devices, index, device) { + if (!ib_device_try_get(device)) + continue; + rcu_read_unlock(); + + remove_client_context(device, client->client_id); + + ib_device_put(device); + rcu_read_lock(); + } + rcu_read_unlock(); + + /* + * remove_client_context() is not a fence, it can return even though a + * removal is ongoing. Wait until all removals are completed. + */ + wait_for_completion(&client->uses_zero); + remove_client_id(client); +} +EXPORT_SYMBOL(ib_unregister_client); + +static int __ib_get_global_client_nl_info(const char *client_name, + struct ib_client_nl_info *res) +{ + struct ib_client *client; + unsigned long index; + int ret = -ENOENT; + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + if (strcmp(client->name, client_name) != 0) + continue; + if (!client->get_global_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_global_nl_info(res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&clients_rwsem); + return ret; +} + +static int __ib_get_client_nl_info(struct ib_device *ibdev, + const char *client_name, + struct ib_client_nl_info *res) +{ + unsigned long index; + void *client_data; + int ret = -ENOENT; + + down_read(&ibdev->client_data_rwsem); + xan_for_each_marked (&ibdev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || strcmp(client->name, client_name) != 0) + continue; + if (!client->get_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_nl_info(ibdev, client_data, res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + + /* + * The cdev is guaranteed valid as long as we are inside the + * client_data_rwsem as remove_one can't be called. Keep it + * valid for the caller. 
+ */ + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&ibdev->client_data_rwsem); + + return ret; +} + +/** + * ib_get_client_nl_info - Fetch the nl_info from a client + * @ibdev: IB device + * @client_name: Name of the client + * @res: Result of the query + */ +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res) +{ + int ret; + + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); +#ifdef CONFIG_MODULES + if (ret == -ENOENT) { + request_module("rdma-client-%s", client_name); + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); + } +#endif + if (ret) { + if (ret == -ENOENT) + return -EOPNOTSUPP; + return ret; + } + + if (WARN_ON(!res->cdev)) + return -EINVAL; + return 0; +} + +/** + * ib_set_client_data - Set IB client context + * @device:Device to set context for + * @client:Client to set context for + * @data:Context to set + * + * ib_set_client_data() sets client context data that can be retrieved with + * ib_get_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. + */ +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data) +{ + void *rc; + + if (WARN_ON(IS_ERR(data))) + data = NULL; + + rc = xa_store(&device->client_data, client->client_id, data, + GFP_KERNEL); + WARN_ON(xa_is_err(rc)); +} +EXPORT_SYMBOL(ib_set_client_data); + +/** + * ib_register_event_handler - Register an IB event handler + * @event_handler:Handler to register + * + * ib_register_event_handler() registers an event handler that will be + * called back when asynchronous IB events occur (as defined in + * chapter 11 of the InfiniBand Architecture Specification). This + * callback occurs in workqueue context. + */ +void ib_register_event_handler(struct ib_event_handler *event_handler) +{ + down_write(&event_handler->device->event_handler_rwsem); + list_add_tail(&event_handler->list, + &event_handler->device->event_handler_list); + up_write(&event_handler->device->event_handler_rwsem); +} +EXPORT_SYMBOL(ib_register_event_handler); + +/** + * ib_unregister_event_handler - Unregister an event handler + * @event_handler:Handler to unregister + * + * Unregister an event handler registered with + * ib_register_event_handler(). 
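+ *
+ * A minimal register/unregister pairing sketch; the handler and callback
+ * names are illustrative only:
+ *
+ *	INIT_IB_EVENT_HANDLER(&my_handler, ibdev, my_event_cb);
+ *	ib_register_event_handler(&my_handler);
+ *	...
+ *	ib_unregister_event_handler(&my_handler);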
+ */ +void ib_unregister_event_handler(struct ib_event_handler *event_handler) +{ + down_write(&event_handler->device->event_handler_rwsem); + list_del(&event_handler->list); + up_write(&event_handler->device->event_handler_rwsem); +} +EXPORT_SYMBOL(ib_unregister_event_handler); + +void ib_dispatch_event_clients(struct ib_event *event) +{ + struct ib_event_handler *handler; + + down_read(&event->device->event_handler_rwsem); + + list_for_each_entry(handler, &event->device->event_handler_list, list) + handler->handler(handler, event); + + up_read(&event->device->event_handler_rwsem); +} + +static int iw_query_port(struct ib_device *device, + u32 port_num, + struct ib_port_attr *port_attr) +{ + struct in_device *inetdev; + struct net_device *netdev; + + memset(port_attr, 0, sizeof(*port_attr)); + + netdev = ib_device_get_netdev(device, port_num); + if (!netdev) + return -ENODEV; + + port_attr->max_mtu = IB_MTU_4096; + port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); + + if (!netif_carrier_ok(netdev)) { + port_attr->state = IB_PORT_DOWN; + port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } else { + rcu_read_lock(); + inetdev = __in_dev_get_rcu(netdev); + + if (inetdev && inetdev->ifa_list) { + port_attr->state = IB_PORT_ACTIVE; + port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } else { + port_attr->state = IB_PORT_INIT; + port_attr->phys_state = + IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; + } + + rcu_read_unlock(); + } + + dev_put(netdev); + return device->ops.query_port(device, port_num, port_attr); +} + +static int __ib_query_port(struct ib_device *device, + u32 port_num, + struct ib_port_attr *port_attr) +{ + int err; + + memset(port_attr, 0, sizeof(*port_attr)); + + err = device->ops.query_port(device, port_num, port_attr); + if (err || port_attr->subnet_prefix) + return err; + + if (rdma_port_get_link_layer(device, port_num) != + IB_LINK_LAYER_INFINIBAND) + return 0; + + ib_get_cached_subnet_prefix(device, port_num, + &port_attr->subnet_prefix); + return 0; +} + +/** + * ib_query_port - Query IB port attributes + * @device:Device to query + * @port_num:Port number to query + * @port_attr:Port attributes + * + * ib_query_port() returns the attributes of a port through the + * @port_attr pointer. 
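+ *
+ * A minimal usage sketch; port number 1 is assumed purely for
+ * illustration:
+ *
+ *	struct ib_port_attr attr;
+ *
+ *	if (!ib_query_port(ibdev, 1, &attr))
+ *		pr_info("state %d, active MTU %d\n",
+ *			attr.state, attr.active_mtu);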
+ */ +int ib_query_port(struct ib_device *device, + u32 port_num, + struct ib_port_attr *port_attr) +{ + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + if (rdma_protocol_iwarp(device, port_num)) + return iw_query_port(device, port_num, port_attr); + else + return __ib_query_port(device, port_num, port_attr); +} +EXPORT_SYMBOL(ib_query_port); + +static void add_ndev_hash(struct ib_port_data *pdata) +{ + unsigned long flags; + + might_sleep(); + + spin_lock_irqsave(&ndev_hash_lock, flags); + if (hash_hashed(&pdata->ndev_hash_link)) { + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock_irqrestore(&ndev_hash_lock, flags); + /* + * We cannot do hash_add_rcu after a hash_del_rcu until the + * grace period + */ + synchronize_rcu(); + spin_lock_irqsave(&ndev_hash_lock, flags); + } + if (pdata->netdev) + hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, + (uintptr_t)pdata->netdev); + spin_unlock_irqrestore(&ndev_hash_lock, flags); +} + +/** + * ib_device_set_netdev - Associate the ib_dev with an underlying net_device + * @ib_dev: Device to modify + * @ndev: net_device to affiliate, may be NULL + * @port: IB port the net_device is connected to + * + * Drivers should use this to link the ib_device to a netdev so the netdev + * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be + * affiliated with any port. + * + * The caller must ensure that the given ndev is not unregistered or + * unregistering, and that either the ib_device is unregistered or + * ib_device_set_netdev() is called with NULL when the ndev sends a + * NETDEV_UNREGISTER event. + */ +int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + u32 port) +{ + struct net_device *old_ndev; + struct ib_port_data *pdata; + unsigned long flags; + int ret; + + /* + * Drivers wish to call this before ib_register_driver, so we have to + * setup the port data early. 
+ */ + ret = alloc_port_data(ib_dev); + if (ret) + return ret; + + if (!rdma_is_port_valid(ib_dev, port)) + return -EINVAL; + + pdata = &ib_dev->port_data[port]; + spin_lock_irqsave(&pdata->netdev_lock, flags); + old_ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (old_ndev == ndev) { + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + return 0; + } + + if (ndev) + dev_hold(ndev); + rcu_assign_pointer(pdata->netdev, ndev); + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + + add_ndev_hash(pdata); + if (old_ndev) + dev_put(old_ndev); + + return 0; +} +EXPORT_SYMBOL(ib_device_set_netdev); + +static void free_netdevs(struct ib_device *ib_dev) +{ + unsigned long flags; + u32 port; + + if (!ib_dev->port_data) + return; + + rdma_for_each_port (ib_dev, port) { + struct ib_port_data *pdata = &ib_dev->port_data[port]; + struct net_device *ndev; + + spin_lock_irqsave(&pdata->netdev_lock, flags); + ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (ndev) { + spin_lock(&ndev_hash_lock); + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock(&ndev_hash_lock); + + /* + * If this is the last dev_put there is still a + * synchronize_rcu before the netdev is kfreed, so we + * can continue to rely on unlocked pointer + * comparisons after the put + */ + rcu_assign_pointer(pdata->netdev, NULL); + dev_put(ndev); + } + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + } +} + +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + u32 port) +{ + struct ib_port_data *pdata; + struct net_device *res; + + if (!rdma_is_port_valid(ib_dev, port)) + return NULL; + + pdata = &ib_dev->port_data[port]; + + /* + * New drivers should use ib_device_set_netdev() not the legacy + * get_netdev(). + */ + if (ib_dev->ops.get_netdev) + res = ib_dev->ops.get_netdev(ib_dev, port); + else { + spin_lock(&pdata->netdev_lock); + res = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (res) + dev_hold(res); + spin_unlock(&pdata->netdev_lock); + } + + /* + * If we are starting to unregister expedite things by preventing + * propagation of an unregistering netdev. + */ + if (res && res->reg_state != NETREG_REGISTERED) { + dev_put(res); + return NULL; + } + + return res; +} + +/** + * ib_device_get_by_netdev - Find an IB device associated with a netdev + * @ndev: netdev to locate + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device that is associated with a netdev via + * ib_device_set_netdev(). The caller must call ib_device_put() on the + * returned pointer. + */ +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id) +{ + struct ib_device *res = NULL; + struct ib_port_data *cur; + + rcu_read_lock(); + hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, + (uintptr_t)ndev) { + if (rcu_access_pointer(cur->netdev) == ndev && + (driver_id == RDMA_DRIVER_UNKNOWN || + cur->ib_dev->ops.driver_id == driver_id) && + ib_device_try_get(cur->ib_dev)) { + res = cur->ib_dev; + break; + } + } + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(ib_device_get_by_netdev); + +/** + * ib_enum_roce_netdev - enumerate all RoCE ports + * @ib_dev : IB device we want to query + * @filter: Should we call the callback? 
+ * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * @ndev_event: Netdev event notified using ndev notifier chain + * + * Enumerates all of the physical RoCE ports of ib_dev + * which are related to netdevice and calls callback() on each + * device for which filter() function returns non zero. + */ +void ib_enum_roce_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie, unsigned long ndev_event) +{ + u32 port; + + /* + * rdma bond device doesn't need to do any GID handling during + * netdevice failover event. + */ + if (ib_dev->dev_immutable.bond_device && + ndev_event == NETDEV_BONDING_FAILOVER) + return; + + rdma_for_each_port (ib_dev, port) + if (rdma_protocol_roce(ib_dev, port)) { + struct net_device *idev = + ib_device_get_netdev(ib_dev, port); + + if (filter(ib_dev, port, idev, filter_cookie)) + cb(ib_dev, port, idev, cookie); + + if (idev) + dev_put(idev); + } +} + +/** + * ib_enum_all_roce_netdevs - enumerate all RoCE devices + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * @ndev_event: Netdev event occurred through netdev notifier chain + * + * Enumerates all RoCE devices' physical ports which are related + * to netdevices and calls callback() on each device for which + * filter() function returns non zero. + */ +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie, unsigned long ndev_event) +{ + struct ib_device *dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) + ib_enum_roce_netdev(dev, filter, filter_cookie, cb, + cookie, ndev_event); + up_read(&devices_rwsem); +} + +/* + * ib_enum_all_devs - enumerate all ib_devices + * @cb: Callback to call for each found ib_device + * + * Enumerates all ib_devices and calls callback() on each device. + */ +int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + unsigned long index; + struct ib_device *dev; + unsigned int idx = 0; + int ret = 0; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) + continue; + + ret = nldev_cb(dev, skb, cb, idx); + if (ret) + break; + idx++; + } + up_read(&devices_rwsem); + return ret; +} + +/** + * ib_query_pkey - Get P_Key table entry + * @device:Device to query + * @port_num:Port number to query + * @index:P_Key table index to query + * @pkey:Returned P_Key + * + * ib_query_pkey() fetches the specified P_Key table entry. + */ +int ib_query_pkey(struct ib_device *device, + u32 port_num, u16 index, u16 *pkey) +{ + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + if (!device->ops.query_pkey) + return -EOPNOTSUPP; + + return device->ops.query_pkey(device, port_num, index, pkey); +} +EXPORT_SYMBOL(ib_query_pkey); + +/** + * ib_modify_device - Change IB device attributes + * @device:Device to modify + * @device_modify_mask:Mask of attributes to change + * @device_modify:New attribute values + * + * ib_modify_device() changes a device's attributes as specified by + * the @device_modify_mask and @device_modify structure. 
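+ *
+ * For example, a caller could update the node description as in the
+ * sketch below; the description string is illustrative only:
+ *
+ *	struct ib_device_modify mod = {};
+ *
+ *	strlcpy(mod.node_desc, "my node", sizeof(mod.node_desc));
+ *	ib_modify_device(ibdev, IB_DEVICE_MODIFY_NODE_DESC, &mod);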
+ */ +int ib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + if (!device->ops.modify_device) + return -EOPNOTSUPP; + + return device->ops.modify_device(device, device_modify_mask, + device_modify); +} +EXPORT_SYMBOL(ib_modify_device); + +/** + * ib_modify_port - Modifies the attributes for the specified port. + * @device: The device to modify. + * @port_num: The number of the port to modify. + * @port_modify_mask: Mask used to specify which attributes of the port + * to change. + * @port_modify: New attribute values for the port. + * + * ib_modify_port() changes a port's attributes as specified by the + * @port_modify_mask and @port_modify structure. + */ +int ib_modify_port(struct ib_device *device, + u32 port_num, int port_modify_mask, + struct ib_port_modify *port_modify) +{ + int rc; + + if (!rdma_is_port_valid(device, port_num)) + return -EINVAL; + + if (device->ops.modify_port) + rc = device->ops.modify_port(device, port_num, + port_modify_mask, + port_modify); + else if (rdma_protocol_roce(device, port_num) && + ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || + (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) + rc = 0; + else + rc = -EOPNOTSUPP; + return rc; +} +EXPORT_SYMBOL(ib_modify_port); + +/** + * ib_find_gid - Returns the port number and GID table index where + * a specified GID value occurs. Its searches only for IB link layer. + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the GID table where the GID was found. This + * parameter may be NULL. + */ +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u32 *port_num, u16 *index) +{ + union ib_gid tmp_gid; + u32 port; + int ret, i; + + rdma_for_each_port (device, port) { + if (!rdma_protocol_ib(device, port)) + continue; + + for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; + ++i) { + ret = rdma_query_gid(device, port, i, &tmp_gid); + if (ret) + continue; + + if (!memcmp(&tmp_gid, gid, sizeof *gid)) { + *port_num = port; + if (index) + *index = i; + return 0; + } + } + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_gid); + +/** + * ib_find_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the PKey table where the PKey was found. + */ +int ib_find_pkey(struct ib_device *device, + u32 port_num, u16 pkey, u16 *index) +{ + int ret, i; + u16 tmp_pkey; + int partial_ix = -1; + + for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; + ++i) { + ret = ib_query_pkey(device, port_num, i, &tmp_pkey); + if (ret) + return ret; + if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { + /* if there is full-member pkey take it.*/ + if (tmp_pkey & 0x8000) { + *index = i; + return 0; + } + if (partial_ix < 0) + partial_ix = i; + } + } + + /*no full-member, if exists take the limited*/ + if (partial_ix >= 0) { + *index = partial_ix; + return 0; + } + return -ENOENT; +} +EXPORT_SYMBOL(ib_find_pkey); + +/** + * ib_get_net_dev_by_params() - Return the appropriate net_dev + * for a received CM request + * @dev: An RDMA device on which the request has been received. + * @port: Port number on the RDMA device. + * @pkey: The Pkey the request came on. + * @gid: A GID that the net_dev uses to communicate. 
+ * @addr: Contains the IP address that the request specified as its + * destination. + * + */ +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, + u32 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr) +{ + struct net_device *net_dev = NULL; + unsigned long index; + void *client_data; + + if (!rdma_protocol_ib(dev, port)) + return NULL; + + /* + * Holding the read side guarantees that the client will not become + * unregistered while we are calling get_net_dev_by_params() + */ + down_read(&dev->client_data_rwsem); + xan_for_each_marked (&dev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || !client->get_net_dev_by_params) + continue; + + net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, + addr, client_data); + if (net_dev) + break; + } + up_read(&dev->client_data_rwsem); + + return net_dev; +} +EXPORT_SYMBOL(ib_get_net_dev_by_params); + +void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) +{ + struct ib_device_ops *dev_ops = &dev->ops; +#define SET_DEVICE_OP(ptr, name) \ + do { \ + if (ops->name) \ + if (!((ptr)->name)) \ + (ptr)->name = ops->name; \ + } while (0) + +#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + + if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { + WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && + dev_ops->driver_id != ops->driver_id); + dev_ops->driver_id = ops->driver_id; + } + if (ops->owner) { + WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); + dev_ops->owner = ops->owner; + } + if (ops->uverbs_abi_ver) + dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; + + dev_ops->uverbs_no_driver_id_binding |= + ops->uverbs_no_driver_id_binding; + + SET_DEVICE_OP(dev_ops, add_gid); + SET_DEVICE_OP(dev_ops, advise_mr); + SET_DEVICE_OP(dev_ops, alloc_dm); + SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); + SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); + SET_DEVICE_OP(dev_ops, alloc_mr); + SET_DEVICE_OP(dev_ops, alloc_mr_integrity); + SET_DEVICE_OP(dev_ops, alloc_mw); + SET_DEVICE_OP(dev_ops, alloc_pd); + SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); + SET_DEVICE_OP(dev_ops, alloc_ucontext); + SET_DEVICE_OP(dev_ops, alloc_xrcd); + SET_DEVICE_OP(dev_ops, attach_mcast); + SET_DEVICE_OP(dev_ops, check_mr_status); + SET_DEVICE_OP(dev_ops, counter_alloc_stats); + SET_DEVICE_OP(dev_ops, counter_bind_qp); + SET_DEVICE_OP(dev_ops, counter_dealloc); + SET_DEVICE_OP(dev_ops, counter_unbind_qp); + SET_DEVICE_OP(dev_ops, counter_update_stats); + SET_DEVICE_OP(dev_ops, create_ah); + SET_DEVICE_OP(dev_ops, create_counters); + SET_DEVICE_OP(dev_ops, create_cq); + SET_DEVICE_OP(dev_ops, create_flow); + SET_DEVICE_OP(dev_ops, create_flow_action_esp); + SET_DEVICE_OP(dev_ops, create_qp); + SET_DEVICE_OP(dev_ops, create_rwq_ind_table); + SET_DEVICE_OP(dev_ops, create_srq); + SET_DEVICE_OP(dev_ops, create_user_ah); + SET_DEVICE_OP(dev_ops, create_wq); + SET_DEVICE_OP(dev_ops, dealloc_dm); + SET_DEVICE_OP(dev_ops, dealloc_driver); + SET_DEVICE_OP(dev_ops, dealloc_mw); + SET_DEVICE_OP(dev_ops, dealloc_pd); + SET_DEVICE_OP(dev_ops, dealloc_ucontext); + SET_DEVICE_OP(dev_ops, dealloc_xrcd); + SET_DEVICE_OP(dev_ops, del_gid); + SET_DEVICE_OP(dev_ops, dereg_mr); + SET_DEVICE_OP(dev_ops, destroy_ah); + SET_DEVICE_OP(dev_ops, destroy_counters); + SET_DEVICE_OP(dev_ops, destroy_cq); + SET_DEVICE_OP(dev_ops, destroy_flow); + SET_DEVICE_OP(dev_ops, destroy_flow_action); + SET_DEVICE_OP(dev_ops, destroy_qp); + 
SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); + SET_DEVICE_OP(dev_ops, destroy_srq); + SET_DEVICE_OP(dev_ops, destroy_wq); + SET_DEVICE_OP(dev_ops, device_group); + SET_DEVICE_OP(dev_ops, detach_mcast); + SET_DEVICE_OP(dev_ops, disassociate_ucontext); + SET_DEVICE_OP(dev_ops, drain_rq); + SET_DEVICE_OP(dev_ops, drain_sq); + SET_DEVICE_OP(dev_ops, enable_driver); + SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry); + SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry); + SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry); + SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); + SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); + SET_DEVICE_OP(dev_ops, get_dev_fw_str); + SET_DEVICE_OP(dev_ops, get_dma_mr); + SET_DEVICE_OP(dev_ops, get_hw_stats); + SET_DEVICE_OP(dev_ops, get_link_layer); + SET_DEVICE_OP(dev_ops, get_netdev); + SET_DEVICE_OP(dev_ops, get_numa_node); + SET_DEVICE_OP(dev_ops, get_port_immutable); + SET_DEVICE_OP(dev_ops, get_vector_affinity); + SET_DEVICE_OP(dev_ops, get_vf_config); + SET_DEVICE_OP(dev_ops, get_vf_guid); + SET_DEVICE_OP(dev_ops, get_vf_stats); + SET_DEVICE_OP(dev_ops, iw_accept); + SET_DEVICE_OP(dev_ops, iw_add_ref); + SET_DEVICE_OP(dev_ops, iw_connect); + SET_DEVICE_OP(dev_ops, iw_create_listen); + SET_DEVICE_OP(dev_ops, iw_destroy_listen); + SET_DEVICE_OP(dev_ops, iw_get_qp); + SET_DEVICE_OP(dev_ops, iw_reject); + SET_DEVICE_OP(dev_ops, iw_rem_ref); + SET_DEVICE_OP(dev_ops, map_mr_sg); + SET_DEVICE_OP(dev_ops, map_mr_sg_pi); + SET_DEVICE_OP(dev_ops, mmap); + SET_DEVICE_OP(dev_ops, mmap_free); + SET_DEVICE_OP(dev_ops, modify_ah); + SET_DEVICE_OP(dev_ops, modify_cq); + SET_DEVICE_OP(dev_ops, modify_device); + SET_DEVICE_OP(dev_ops, modify_flow_action_esp); + SET_DEVICE_OP(dev_ops, modify_hw_stat); + SET_DEVICE_OP(dev_ops, modify_port); + SET_DEVICE_OP(dev_ops, modify_qp); + SET_DEVICE_OP(dev_ops, modify_srq); + SET_DEVICE_OP(dev_ops, modify_wq); + SET_DEVICE_OP(dev_ops, peek_cq); + SET_DEVICE_OP(dev_ops, poll_cq); + SET_DEVICE_OP(dev_ops, port_groups); + SET_DEVICE_OP(dev_ops, post_recv); + SET_DEVICE_OP(dev_ops, post_send); + SET_DEVICE_OP(dev_ops, post_srq_recv); + SET_DEVICE_OP(dev_ops, process_mad); + SET_DEVICE_OP(dev_ops, query_ah); + SET_DEVICE_OP(dev_ops, query_device); + SET_DEVICE_OP(dev_ops, query_gid); + SET_DEVICE_OP(dev_ops, query_pkey); + SET_DEVICE_OP(dev_ops, query_port); + SET_DEVICE_OP(dev_ops, query_qp); + SET_DEVICE_OP(dev_ops, query_srq); + SET_DEVICE_OP(dev_ops, query_ucontext); + SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); + SET_DEVICE_OP(dev_ops, read_counters); + SET_DEVICE_OP(dev_ops, reg_dm_mr); + SET_DEVICE_OP(dev_ops, reg_user_mr); + SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); + SET_DEVICE_OP(dev_ops, req_notify_cq); + SET_DEVICE_OP(dev_ops, rereg_user_mr); + SET_DEVICE_OP(dev_ops, resize_cq); + SET_DEVICE_OP(dev_ops, set_vf_guid); + SET_DEVICE_OP(dev_ops, set_vf_link_state); + + SET_DEVICE_OP(dev_ops, create_nvmf_backend_ctrl); + SET_DEVICE_OP(dev_ops, destroy_nvmf_backend_ctrl); + SET_DEVICE_OP(dev_ops, attach_nvmf_ns); + SET_DEVICE_OP(dev_ops, detach_nvmf_ns); + SET_DEVICE_OP(dev_ops, query_nvmf_ns); + + SET_OBJ_SIZE(dev_ops, ib_ah); + SET_OBJ_SIZE(dev_ops, ib_counters); + SET_OBJ_SIZE(dev_ops, ib_cq); + SET_OBJ_SIZE(dev_ops, ib_mw); + SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_qp); + SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); + SET_OBJ_SIZE(dev_ops, ib_srq); + SET_OBJ_SIZE(dev_ops, ib_ucontext); + 
SET_OBJ_SIZE(dev_ops, ib_xrcd); +} +EXPORT_SYMBOL(ib_set_device_ops); + +#ifdef CONFIG_INFINIBAND_VIRT_DMA +int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + sg_dma_address(s) = (uintptr_t)sg_virt(s); + sg_dma_len(s) = s->length; + } + return nents; +} +EXPORT_SYMBOL(ib_dma_virt_map_sg); +#endif /* CONFIG_INFINIBAND_VIRT_DMA */ + +static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { + [RDMA_NL_LS_OP_RESOLVE] = { + .doit = ib_nl_handle_resolve_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NL_LS_OP_SET_TIMEOUT] = { + .doit = ib_nl_handle_set_timeout, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NL_LS_OP_IP_RESOLVE] = { + .doit = ib_nl_handle_ip_res_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, +}; + +static int __init ib_core_init(void) +{ + int ret = -ENOMEM; + + ib_wq = alloc_workqueue("infiniband", 0, 0); + if (!ib_wq) + return -ENOMEM; + + ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_unreg_wq) + goto err; + + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!ib_comp_wq) + goto err_unbound; + + ib_comp_unbound_wq = + alloc_workqueue("ib-comp-unb-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_unbound_wq) + goto err_comp; + + ret = class_register(&ib_class); + if (ret) { + pr_warn("Couldn't create InfiniBand device class\n"); + goto err_comp_unbound; + } + + rdma_nl_init(); + + ret = addr_init(); + if (ret) { + pr_warn("Couldn't init IB address resolution\n"); + goto err_ibnl; + } + + ret = ib_mad_init(); + if (ret) { + pr_warn("Couldn't init IB MAD\n"); + goto err_addr; + } + + ret = ib_sa_init(); + if (ret) { + pr_warn("Couldn't init SA\n"); + goto err_mad; + } + + ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); + if (ret) { + pr_warn("Couldn't register LSM notifier. ret %d\n", ret); + goto err_sa; + } + + ret = register_pernet_device(&rdma_dev_net_ops); + if (ret) { + pr_warn("Couldn't init compat dev. ret %d\n", ret); + goto err_compat; + } + + nldev_init(); + rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); + roce_gid_mgmt_init(); + + return 0; + +err_compat: + unregister_blocking_lsm_notifier(&ibdev_lsm_nb); +err_sa: + ib_sa_cleanup(); +err_mad: + ib_mad_cleanup(); +err_addr: + addr_cleanup(); +err_ibnl: + class_unregister(&ib_class); +err_comp_unbound: + destroy_workqueue(ib_comp_unbound_wq); +err_comp: + destroy_workqueue(ib_comp_wq); +err_unbound: + destroy_workqueue(ib_unreg_wq); +err: + destroy_workqueue(ib_wq); + return ret; +} + +static void __exit ib_core_cleanup(void) +{ + roce_gid_mgmt_cleanup(); + nldev_exit(); + rdma_nl_unregister(RDMA_NL_LS); + unregister_pernet_device(&rdma_dev_net_ops); + unregister_blocking_lsm_notifier(&ibdev_lsm_nb); + ib_sa_cleanup(); + ib_mad_cleanup(); + addr_cleanup(); + rdma_nl_exit(); + class_unregister(&ib_class); + destroy_workqueue(ib_comp_unbound_wq); + destroy_workqueue(ib_comp_wq); + /* Make sure that any pending umem accounting work is done. */ + destroy_workqueue(ib_wq); + destroy_workqueue(ib_unreg_wq); + WARN_ON(!xa_empty(&clients)); + WARN_ON(!xa_empty(&devices)); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); + +/* ib core relies on netdev stack to first register net_ns_type_operations + * ns kobject type before ib_core initialization. 
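+ *
+ * (For built-in builds, fs_initcall() runs after the subsys_initcall level
+ * used by the netdev core and before the device_initcall level that HCA
+ * drivers typically use, which is what provides that ordering.)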
+ */ +fs_initcall(ib_core_init); +module_exit(ib_core_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_addr_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_addr_dummy.c new file mode 100644 index 0000000..16ef728 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_addr_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "ib_addr" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 15, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_addr dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_addr_init(void) +{ + return 0; +} + +static void __exit ib_addr_cleanup(void) +{ +} + +module_init(ib_addr_init); +module_exit(ib_addr_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_core_uverbs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_core_uverbs.c new file mode 100644 index 0000000..8bb0ae4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_core_uverbs.c @@ -0,0 +1,372 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019 Marvell. All rights reserved. + */ +#include +#include "uverbs.h" +#include "core_priv.h" + +#ifndef pgprot_decrypted +#define pgprot_decrypted(prot) (prot) +#endif + +/** + * rdma_umap_priv_init() - Initialize the private data of a vma + * + * @priv: The already allocated private data + * @vma: The vm area struct that needs private data + * @entry: entry into the mmap_xa that needs to be linked with + * this vma + * + * Each time we map IO memory into user space this keeps track of the + * mapping. When the device is hot-unplugged we 'zap' the mmaps in user space + * to point to the zero page and allow the hot unplug to proceed. 
+ * + * This is necessary for cases like PCI physical hot unplug as the actual BAR + * memory may vanish after this and access to it from userspace could MCE. + * + * RDMA drivers supporting disassociation must have their user space designed + * to cope in some way with their IO pages going to the zero page. + * + */ +void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma, + struct rdma_user_mmap_entry *entry) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + + priv->vma = vma; + if (entry) { + kref_get(&entry->ref); + priv->entry = entry; + } + vma->vm_private_data = priv; + /* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */ + + mutex_lock(&ufile->umap_lock); + list_add(&priv->list, &ufile->umaps); + mutex_unlock(&ufile->umap_lock); +} +EXPORT_SYMBOL(rdma_umap_priv_init); + +/** + * rdma_user_mmap_io() - Map IO memory into a process + * + * @ucontext: associated user context + * @vma: the vma related to the current mmap call + * @pfn: pfn to map + * @size: size to map + * @prot: pgprot to use in remap call + * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL + * if mmap_entry is not used by the driver + * + * This is to be called by drivers as part of their mmap() functions if they + * wish to send something like PCI-E BAR memory to userspace. + * + * Return -EINVAL on wrong flags or size, -EAGAIN on failure to map. 0 on + * success. + */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot, + struct rdma_user_mmap_entry *entry) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + struct rdma_umap_priv *priv; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + if (vma->vm_end - vma->vm_start != size) + return -EINVAL; + + /* Driver is using this wrong, must be called by ib_uverbs_mmap */ + if (WARN_ON(!vma->vm_file || + vma->vm_file->private_data != ufile)) + return -EINVAL; + lockdep_assert_held(&ufile->device->disassociate_srcu); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + prot = pgprot_decrypted(prot); + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma, entry); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_io); + +/** + * rdma_user_mmap_entry_get_pgoff() - Get an entry from the mmap_xa + * + * @ucontext: associated user context + * @pgoff: The mmap offset >> PAGE_SHIFT + * + * This function is called when a user tries to mmap with an offset (returned + * by rdma_user_mmap_get_offset()) it initially received from the driver. The + * rdma_user_mmap_entry was created by the function + * rdma_user_mmap_entry_insert(). This function increases the refcnt of the + * entry so that it won't be deleted from the xarray in the meantime. + * + * Return an reference to an entry if exists or NULL if there is no + * match. rdma_user_mmap_entry_put() must be called to put the reference. + */ +struct rdma_user_mmap_entry * +rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext, + unsigned long pgoff) +{ + struct rdma_user_mmap_entry *entry; + + if (pgoff > U32_MAX) + return NULL; + + xa_lock(&ucontext->mmap_xa); + + entry = xa_load(&ucontext->mmap_xa, pgoff); + + /* + * If refcount is zero, entry is already being deleted, driver_removed + * indicates that the no further mmaps are possible and we waiting for + * the active VMAs to be closed. 
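+ *
+ * A failed kref_get_unless_zero() therefore means rdma_user_mmap_entry_free()
+ * is already running (or about to run), so the lookup backs off and reports
+ * no match instead of handing out a dying entry.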
+ */ + if (!entry || entry->start_pgoff != pgoff || entry->driver_removed || + !kref_get_unless_zero(&entry->ref)) + goto err; + + xa_unlock(&ucontext->mmap_xa); + + ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n", + pgoff, entry->npages); + + return entry; + +err: + xa_unlock(&ucontext->mmap_xa); + return NULL; +} +EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff); + +/** + * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa + * + * @ucontext: associated user context + * @vma: the vma being mmap'd into + * + * This function is like rdma_user_mmap_entry_get_pgoff() except that it also + * checks that the VMA is correct. + */ +struct rdma_user_mmap_entry * +rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, + struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *entry; + + if (!(vma->vm_flags & VM_SHARED)) + return NULL; + entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff); + if (!entry) + return NULL; + if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) { + rdma_user_mmap_entry_put(entry); + return NULL; + } + return entry; +} +EXPORT_SYMBOL(rdma_user_mmap_entry_get); + +static void rdma_user_mmap_entry_free(struct kref *kref) +{ + struct rdma_user_mmap_entry *entry = + container_of(kref, struct rdma_user_mmap_entry, ref); + struct ib_ucontext *ucontext = entry->ucontext; + unsigned long i; + + /* + * Erase all entries occupied by this single entry, this is deferred + * until all VMA are closed so that the mmap offsets remain unique. + */ + xa_lock(&ucontext->mmap_xa); + for (i = 0; i < entry->npages; i++) + __xa_erase(&ucontext->mmap_xa, entry->start_pgoff + i); + xa_unlock(&ucontext->mmap_xa); + + ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n", + entry->start_pgoff, entry->npages); + + if (ucontext->device->ops.mmap_free) + ucontext->device->ops.mmap_free(entry); +} + +/** + * rdma_user_mmap_entry_put() - Drop reference to the mmap entry + * + * @entry: an entry in the mmap_xa + * + * This function is called when the mapping is closed if it was + * an io mapping or when the driver is done with the entry for + * some other reason. + * Should be called after rdma_user_mmap_entry_get was called + * and entry is no longer needed. This function will erase the + * entry and free it if its refcnt reaches zero. + */ +void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry) +{ + kref_put(&entry->ref, rdma_user_mmap_entry_free); +} +EXPORT_SYMBOL(rdma_user_mmap_entry_put); + +/** + * rdma_user_mmap_entry_remove() - Drop reference to entry and + * mark it as unmmapable + * + * @entry: the entry to insert into the mmap_xa + * + * Drivers can call this to prevent userspace from creating more mappings for + * entry, however existing mmaps continue to exist and ops->mmap_free() will + * not be called until all user mmaps are destroyed. + */ +void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) +{ + if (!entry) + return; + + xa_lock(&entry->ucontext->mmap_xa); + entry->driver_removed = true; + xa_unlock(&entry->ucontext->mmap_xa); + kref_put(&entry->ref, rdma_user_mmap_entry_free); +} +EXPORT_SYMBOL(rdma_user_mmap_entry_remove); + +/** + * rdma_user_mmap_entry_insert_range() - Insert an entry to the mmap_xa + * in a given range. + * + * @ucontext: associated user context. 
+ * @entry: the entry to insert into the mmap_xa + * @length: length of the address that will be mmapped + * @min_pgoff: minimum pgoff to be returned + * @max_pgoff: maximum pgoff to be returned + * + * This function should be called by drivers that use the rdma_user_mmap + * interface for implementing their mmap syscall A database of mmap offsets is + * handled in the core and helper functions are provided to insert entries + * into the database and extract entries when the user calls mmap with the + * given offset. The function allocates a unique page offset in a given range + * that should be provided to user, the user will use the offset to retrieve + * information such as address to be mapped and how. + * + * Return: 0 on success and -ENOMEM on failure + */ +int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, + struct rdma_user_mmap_entry *entry, + size_t length, u32 min_pgoff, + u32 max_pgoff) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + XA_STATE(xas, &ucontext->mmap_xa, min_pgoff); + u32 xa_first, xa_last, npages; + int err; + u32 i; + + if (!entry) + return -EINVAL; + + kref_init(&entry->ref); + entry->ucontext = ucontext; + + /* + * We want the whole allocation to be done without interruption from a + * different thread. The allocation requires finding a free range and + * storing. During the xa_insert the lock could be released, possibly + * allowing another thread to choose the same range. + */ + mutex_lock(&ufile->umap_lock); + + xa_lock(&ucontext->mmap_xa); + + /* We want to find an empty range */ + npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE); + entry->npages = npages; + while (true) { + /* First find an empty index */ + xas_find_marked(&xas, max_pgoff, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + goto err_unlock; + + xa_first = xas.xa_index; + + /* Is there enough room to have the range? */ + if (check_add_overflow(xa_first, npages, &xa_last)) + goto err_unlock; + + /* + * Now look for the next present entry. If an entry doesn't + * exist, we found an empty range and can proceed. + */ + xas_next_entry(&xas, xa_last - 1); + if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last) + break; + } + + for (i = xa_first; i < xa_last; i++) { + err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL); + if (err) + goto err_undo; + } + + /* + * Internally the kernel uses a page offset, in libc this is a byte + * offset. Drivers should not return pgoff to userspace. + */ + entry->start_pgoff = xa_first; + xa_unlock(&ucontext->mmap_xa); + mutex_unlock(&ufile->umap_lock); + + ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n", + entry->start_pgoff, npages); + + return 0; + +err_undo: + for (; i > xa_first; i--) + __xa_erase(&ucontext->mmap_xa, i - 1); + +err_unlock: + xa_unlock(&ucontext->mmap_xa); + mutex_unlock(&ufile->umap_lock); + return -ENOMEM; +} +EXPORT_SYMBOL(rdma_user_mmap_entry_insert_range); + +/** + * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa. + * + * @ucontext: associated user context. + * @entry: the entry to insert into the mmap_xa + * @length: length of the address that will be mmapped + * + * This function should be called by drivers that use the rdma_user_mmap + * interface for handling user mmapped addresses. The database is handled in + * the core and helper functions are provided to insert entries into the + * database and extract entries when the user calls mmap with the given offset. 
+ * The function allocates a unique page offset that should be provided to user, + * the user will use the offset to retrieve information such as address to + * be mapped and how. + * + * Return: 0 on success and -ENOMEM on failure + */ +int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, + struct rdma_user_mmap_entry *entry, + size_t length) +{ + return rdma_user_mmap_entry_insert_range(ucontext, entry, length, 0, + U32_MAX); +} +EXPORT_SYMBOL(rdma_user_mmap_entry_insert); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_mad_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_mad_dummy.c new file mode 100644 index 0000000..2be21ac --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_mad_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "ib_mad" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 15, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_mad dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_mad_init(void) +{ + return 0; +} + +static void __exit ib_mad_cleanup(void) +{ +} + +module_init(ib_mad_init); +module_exit(ib_mad_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_peer_mem.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_peer_mem.h new file mode 100644 index 0000000..248530a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_peer_mem.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved. 
+ */ +#ifndef RDMA_IB_PEER_MEM_H +#define RDMA_IB_PEER_MEM_H + +#include +#include +#include +#include + +struct ib_peer_memory_statistics { + atomic64_t num_alloc_mrs; + atomic64_t num_dealloc_mrs; + atomic64_t num_reg_pages; + atomic64_t num_dereg_pages; + atomic64_t num_reg_bytes; + atomic64_t num_dereg_bytes; + unsigned long num_free_callbacks; +}; + +struct ib_peer_memory_client { + struct kobject kobj; + refcount_t usecnt; + struct completion usecnt_zero; + const struct peer_memory_client *peer_mem; + struct list_head core_peer_list; + struct ib_peer_memory_statistics stats; + struct xarray umem_xa; + u32 xa_cyclic_next; + bool invalidation_required; +}; + +enum ib_umem_mapped_state { + UMEM_PEER_UNMAPPED, + UMEM_PEER_MAPPED, + UMEM_PEER_INVALIDATED, +}; + +struct ib_umem_peer { + struct ib_umem umem; + struct kref kref; + /* peer memory that manages this umem */ + struct ib_peer_memory_client *ib_peer_client; + void *peer_client_context; + umem_invalidate_func_t invalidation_func; + void *invalidation_private; + struct mutex mapping_lock; + enum ib_umem_mapped_state mapped_state; + u32 xa_id; + struct scatterlist *first_sg; + dma_addr_t first_dma_address; + unsigned int first_dma_length; + unsigned int first_length; + struct scatterlist *last_sg; + unsigned int last_dma_length; + unsigned int last_length; +}; + +struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret, + unsigned long peer_mem_flags); +void ib_peer_umem_release(struct ib_umem *umem); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_sa_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_sa_dummy.c new file mode 100644 index 0000000..840c893 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_sa_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ib_sa" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 15, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_sa dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_sa_init(void) +{ + return 0; +} + +static void __exit ib_sa_cleanup(void) +{ +} + +module_init(ib_sa_init); +module_exit(ib_sa_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_ucm_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_ucm_dummy.c new file mode 100644 index 0000000..c963f79 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ib_ucm_dummy.c @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "ib_ucm" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 15, 2016" + +MODULE_AUTHOR("Valentine Fatiev"); +MODULE_DESCRIPTION("ib_ucm dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_ucm_init(void) +{ + return 0; +} + +static void __exit ib_ucm_cleanup(void) +{ +} + +module_init(ib_ucm_init); +module_exit(ib_ucm_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.c new file mode 100644 index 0000000..2b47073 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.c @@ -0,0 +1,1223 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "iwcm.h" + +MODULE_AUTHOR("Tom Tucker"); +MODULE_DESCRIPTION("iWARP CM"); +MODULE_LICENSE("Dual BSD/GPL"); + +static const char * const iwcm_rej_reason_strs[] = { + [ECONNRESET] = "reset by remote host", + [ECONNREFUSED] = "refused by remote application", + [ETIMEDOUT] = "setup timeout", +}; + +const char *__attribute_const__ iwcm_reject_msg(int reason) +{ + size_t index; + + /* iWARP uses negative errnos */ + index = -reason; + + if (index < ARRAY_SIZE(iwcm_rej_reason_strs) && + iwcm_rej_reason_strs[index]) + return iwcm_rej_reason_strs[index]; + else + return "unrecognized reason"; +} +EXPORT_SYMBOL(iwcm_reject_msg); + +static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { + [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, + [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, + [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, + [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, + [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, + [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, + [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} +}; + +static struct workqueue_struct *iwcm_wq; +struct iwcm_work { + struct work_struct work; + struct iwcm_id_private *cm_id; + struct list_head list; + struct iw_cm_event event; + struct list_head free_list; +}; + +static unsigned int default_backlog = 256; + +static struct ctl_table_header *iwcm_ctl_table_hdr; +static struct ctl_table iwcm_ctl_table[] = { + { + .procname = "default_backlog", + .data = &default_backlog, + .maxlen = sizeof(default_backlog), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +/* + * The following services provide a mechanism for pre-allocating iwcm_work + * elements. The design pre-allocates them based on the cm_id type: + * LISTENING IDS: Get enough elements preallocated to handle the + * listen backlog. 
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE + * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE + * + * Allocating them in connect and listen avoids having to deal + * with allocation failures on the event upcall from the provider (which + * is called in the interrupt context). + * + * One exception is when creating the cm_id for incoming connection requests. + * There are two cases: + * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If + * the backlog is exceeded, then no more connection request events will + * be processed. cm_event_handler() returns -ENOMEM in this case. Its up + * to the provider to reject the connection request. + * 2) in the connection request workqueue handler, cm_conn_req_handler(). + * If work elements cannot be allocated for the new connect request cm_id, + * then IWCM will call the provider reject method. This is ok since + * cm_conn_req_handler() runs in the workqueue thread context. + */ + +static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) +{ + struct iwcm_work *work; + + if (list_empty(&cm_id_priv->work_free_list)) + return NULL; + work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work, + free_list); + list_del_init(&work->free_list); + return work; +} + +static void put_work(struct iwcm_work *work) +{ + list_add(&work->free_list, &work->cm_id->work_free_list); +} + +static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) +{ + struct list_head *e, *tmp; + + list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) { + list_del(e); + kfree(list_entry(e, struct iwcm_work, free_list)); + } +} + +static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) +{ + struct iwcm_work *work; + + BUG_ON(!list_empty(&cm_id_priv->work_free_list)); + while (count--) { + work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); + if (!work) { + dealloc_work_entries(cm_id_priv); + return -ENOMEM; + } + work->cm_id = cm_id_priv; + INIT_LIST_HEAD(&work->list); + put_work(work); + } + return 0; +} + +/* + * Save private data from incoming connection requests to + * iw_cm_event, so the low level driver doesn't have to. Adjust + * the event ptr to point to the local copy. + */ +static int copy_private_data(struct iw_cm_event *event) +{ + void *p; + + p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); + if (!p) + return -ENOMEM; + event->private_data = p; + return 0; +} + +static void free_cm_id(struct iwcm_id_private *cm_id_priv) +{ + dealloc_work_entries(cm_id_priv); + kfree(cm_id_priv); +} + +/* + * Release a reference on cm_id. If the last reference is being + * released, free the cm_id and return 1. 
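+ *
+ * Callers that may be dropping the last reference (cm_work_handler() for
+ * example) use the non-zero return to know that cm_id_priv has been freed
+ * and must not be touched afterwards.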
+ */ +static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) +{ + if (refcount_dec_and_test(&cm_id_priv->refcount)) { + BUG_ON(!list_empty(&cm_id_priv->work_list)); + free_cm_id(cm_id_priv); + return 1; + } + + return 0; +} + +static void add_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + refcount_inc(&cm_id_priv->refcount); +} + +static void rem_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + (void)iwcm_deref_id(cm_id_priv); +} + +static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); + +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, + iw_cm_handler cm_handler, + void *context) +{ + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); + if (!cm_id_priv) + return ERR_PTR(-ENOMEM); + + cm_id_priv->state = IW_CM_STATE_IDLE; + cm_id_priv->id.device = device; + cm_id_priv->id.cm_handler = cm_handler; + cm_id_priv->id.context = context; + cm_id_priv->id.event_handler = cm_event_handler; + cm_id_priv->id.add_ref = add_ref; + cm_id_priv->id.rem_ref = rem_ref; + spin_lock_init(&cm_id_priv->lock); + refcount_set(&cm_id_priv->refcount, 1); + init_waitqueue_head(&cm_id_priv->connect_wait); + init_completion(&cm_id_priv->destroy_comp); + INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->work_free_list); + + return &cm_id_priv->id; +} +EXPORT_SYMBOL(iw_create_cm_id); + + +static int iwcm_modify_qp_err(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + if (!qp) + return -EINVAL; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * This is really the RDMAC CLOSING state. It is most similar to the + * IB SQD QP state. + */ +static int iwcm_modify_qp_sqd(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + BUG_ON(qp == NULL); + qp_attr.qp_state = IB_QPS_SQD; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * CM_ID <-- CLOSING + * + * Block if a passive or active connection is currently being processed. Then + * process the event as follows: + * - If we are ESTABLISHED, move to CLOSING and modify the QP state + * based on the abrupt flag + * - If the connection is already in the CLOSING or IDLE state, the peer is + * disconnecting concurrently with us and we've already seen the + * DISCONNECT event -- ignore the request and return 0 + * - Disconnect on a listening endpoint returns -EINVAL + */ +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + struct ib_qp *qp = NULL; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* Wait if we're currently in a connect or accept downcall */ + wait_event(cm_id_priv->connect_wait, + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_CLOSING; + + /* QP could be for user-mode client */ + if (cm_id_priv->qp) + qp = cm_id_priv->qp; + else + ret = -EINVAL; + break; + case IW_CM_STATE_LISTEN: + ret = -EINVAL; + break; + case IW_CM_STATE_CLOSING: + /* remote peer closed first */ + case IW_CM_STATE_IDLE: + /* accept or connect returned !0 */ + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called disconnect before/without calling accept after + * connect_request event delivered. 
+ */ + break; + case IW_CM_STATE_CONN_SENT: + /* Can only get here if wait above fails */ + default: + BUG(); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (qp) { + if (abrupt) + ret = iwcm_modify_qp_err(qp); + else + ret = iwcm_modify_qp_sqd(qp); + + /* + * If both sides are disconnecting the QP could + * already be in ERR or SQD states + */ + ret = 0; + } + + return ret; +} +EXPORT_SYMBOL(iw_cm_disconnect); + +/* + * CM_ID <-- DESTROYING + * + * Clean up all resources associated with the connection and release + * the initial reference taken by iw_create_cm_id. + */ +static void destroy_cm_id(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + struct ib_qp *qp; + unsigned long flags; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* + * Wait if we're currently in a connect or accept downcall. A + * listening endpoint should never block here. + */ + wait_event(cm_id_priv->connect_wait, + !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags)); + + /* + * Since we're deleting the cm_id, drop any events that + * might arrive before the last dereference. + */ + set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; + + switch (cm_id_priv->state) { + case IW_CM_STATE_LISTEN: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + /* destroy the listening endpoint */ + cm_id->device->ops.iw_destroy_listen(cm_id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + /* Abrupt close of the connection */ + (void)iwcm_modify_qp_err(qp); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CLOSING: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called destroy before/without calling accept after + * receiving connection request event notification or + * returned non zero from the event callback function. + * In either case, must tell the provider to reject. + */ + cm_id_priv->state = IW_CM_STATE_DESTROYING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_id->device->ops.iw_reject(cm_id, NULL, 0); + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_DESTROYING: + default: + BUG(); + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id_priv->id.device->ops.iw_rem_ref(qp); + + if (cm_id->mapped) { + iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr); + iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM); + } + + (void)iwcm_deref_id(cm_id_priv); +} + +/* + * This function is only called by the application thread and cannot + * be called by the event thread. The function will wait for all + * references to be released on the cm_id and then kfree the cm_id + * object. + */ +void iw_destroy_cm_id(struct iw_cm_id *cm_id) +{ + destroy_cm_id(cm_id); +} +EXPORT_SYMBOL(iw_destroy_cm_id); + +/** + * iw_cm_check_wildcard - If IP address is 0 then use original + * @pm_addr: sockaddr containing the ip to check for wildcard + * @cm_addr: sockaddr containing the actual IP address + * @cm_outaddr: sockaddr to set IP addr which leaving port + * + * Checks the pm_addr for wildcard and then sets cm_outaddr's + * IP to the actual (cm_addr). 
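+ *
+ * Worked example (illustrative addresses): with pm_addr 0.0.0.0:5000 and
+ * cm_addr 10.1.1.7:5000, cm_outaddr keeps its own port but has its IP
+ * rewritten to 10.1.1.7; a non-wildcard pm_addr leaves cm_outaddr untouched.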
+ */ +static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, + struct sockaddr_storage *cm_addr, + struct sockaddr_storage *cm_outaddr) +{ + if (pm_addr->ss_family == AF_INET) { + struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; + + if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) { + struct sockaddr_in *cm4_addr = + (struct sockaddr_in *)cm_addr; + struct sockaddr_in *cm4_outaddr = + (struct sockaddr_in *)cm_outaddr; + + cm4_outaddr->sin_addr = cm4_addr->sin_addr; + } + } else { + struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr; + + if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) { + struct sockaddr_in6 *cm6_addr = + (struct sockaddr_in6 *)cm_addr; + struct sockaddr_in6 *cm6_outaddr = + (struct sockaddr_in6 *)cm_outaddr; + + cm6_outaddr->sin6_addr = cm6_addr->sin6_addr; + } + } +} + +/** + * iw_cm_map - Use portmapper to map the ports + * @cm_id: connection manager pointer + * @active: Indicates the active side when true + * returns nonzero for error only if iwpm_create_mapinfo() fails + * + * Tries to add a mapping for a port using the Portmapper. If + * successful in mapping the IP/Port it will check the remote + * mapped IP address for a wildcard IP address and replace the + * zero IP address with the remote_addr. + */ +static int iw_cm_map(struct iw_cm_id *cm_id, bool active) +{ + const char *devname = dev_name(&cm_id->device->dev); + const char *ifname = cm_id->device->iw_ifname; + struct iwpm_dev_data pm_reg_msg = {}; + struct iwpm_sa_data pm_msg; + int status; + + if (strlen(devname) >= sizeof(pm_reg_msg.dev_name) || + strlen(ifname) >= sizeof(pm_reg_msg.if_name)) + return -EINVAL; + + cm_id->m_local_addr = cm_id->local_addr; + cm_id->m_remote_addr = cm_id->remote_addr; + + strcpy(pm_reg_msg.dev_name, devname); + strcpy(pm_reg_msg.if_name, ifname); + + if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) || + !iwpm_valid_pid()) + return 0; + + cm_id->mapped = true; + pm_msg.loc_addr = cm_id->local_addr; + pm_msg.rem_addr = cm_id->remote_addr; + pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ? + IWPM_FLAGS_NO_PORT_MAP : 0; + if (active) + status = iwpm_add_and_query_mapping(&pm_msg, + RDMA_NL_IWCM); + else + status = iwpm_add_mapping(&pm_msg, RDMA_NL_IWCM); + + if (!status) { + cm_id->m_local_addr = pm_msg.mapped_loc_addr; + if (active) { + cm_id->m_remote_addr = pm_msg.mapped_rem_addr; + iw_cm_check_wildcard(&pm_msg.mapped_rem_addr, + &cm_id->remote_addr, + &cm_id->m_remote_addr); + } + } + + return iwpm_create_mapinfo(&cm_id->local_addr, + &cm_id->m_local_addr, + RDMA_NL_IWCM, pm_msg.flags); +} + +/* + * CM_ID <-- LISTEN + * + * Start listening for connect requests. Generates one CONNECT_REQUEST + * event for each inbound connect request. 
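+ *
+ * Minimal caller sketch (hypothetical ULP code; error handling and address
+ * setup abbreviated):
+ *
+ *	cm_id = iw_create_cm_id(ib_dev, my_event_handler, my_ctx);
+ *	cm_id->local_addr = bind_addr;	(sockaddr_storage to listen on)
+ *	err = iw_cm_listen(cm_id, 0);	(backlog 0 selects default_backlog)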
+ */ +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + if (!backlog) + backlog = default_backlog; + + ret = alloc_work_entries(cm_id_priv, backlog); + if (ret) + return ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + cm_id_priv->state = IW_CM_STATE_LISTEN; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = iw_cm_map(cm_id, false); + if (!ret) + ret = cm_id->device->ops.iw_create_listen(cm_id, + backlog); + if (ret) + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); + break; + default: + ret = -EINVAL; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + return ret; +} +EXPORT_SYMBOL(iw_cm_listen); + +/* + * CM_ID <-- IDLE + * + * Rejects an inbound connection request. No events are generated. + */ +int iw_cm_reject(struct iw_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = cm_id->device->ops.iw_reject(cm_id, private_data, + private_data_len); + + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + + return ret; +} +EXPORT_SYMBOL(iw_cm_reject); + +/* + * CM_ID <-- ESTABLISHED + * + * Accepts an inbound connection request and generates an ESTABLISHED + * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block + * until the ESTABLISHED event is received from the provider. 
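+ *
+ * Typical use from a CONNECT_REQUEST handler (illustrative sketch; the QP
+ * and the negotiated ord/ird values use hypothetical caller-side names):
+ *
+ *	struct iw_cm_conn_param param = {
+ *		.qpn = my_qp->qp_num,
+ *		.ird = accepted_ird,
+ *		.ord = accepted_ord,
+ *	};
+ *
+ *	err = iw_cm_accept(cm_id, &param);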
+ */ +int iw_cm_accept(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + struct ib_qp *qp; + unsigned long flags; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + /* Get the ib_qp given the QPN */ + qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return -EINVAL; + } + cm_id->device->ops.iw_add_ref(qp); + cm_id_priv->qp = qp; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = cm_id->device->ops.iw_accept(cm_id, iw_param); + if (ret) { + /* An error on accept precludes provider events */ + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); + cm_id_priv->state = IW_CM_STATE_IDLE; + spin_lock_irqsave(&cm_id_priv->lock, flags); + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id->device->ops.iw_rem_ref(qp); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + } + + return ret; +} +EXPORT_SYMBOL(iw_cm_accept); + +/* + * Active Side: CM_ID <-- CONN_SENT + * + * If successful, results in the generation of a CONNECT_REPLY + * event. iw_cm_disconnect and iw_cm_destroy will block until the + * CONNECT_REPLY event is received from the provider. + */ +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + unsigned long flags; + struct ib_qp *qp = NULL; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + ret = alloc_work_entries(cm_id_priv, 4); + if (ret) + return ret; + + set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + spin_lock_irqsave(&cm_id_priv->lock, flags); + + if (cm_id_priv->state != IW_CM_STATE_IDLE) { + ret = -EINVAL; + goto err; + } + + /* Get the ib_qp given the QPN */ + qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + ret = -EINVAL; + goto err; + } + cm_id->device->ops.iw_add_ref(qp); + cm_id_priv->qp = qp; + cm_id_priv->state = IW_CM_STATE_CONN_SENT; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + ret = iw_cm_map(cm_id, true); + if (!ret) + ret = cm_id->device->ops.iw_connect(cm_id, iw_param); + if (!ret) + return 0; /* success */ + + spin_lock_irqsave(&cm_id_priv->lock, flags); + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; + cm_id_priv->state = IW_CM_STATE_IDLE; +err: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id->device->ops.iw_rem_ref(qp); + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + wake_up_all(&cm_id_priv->connect_wait); + return ret; +} +EXPORT_SYMBOL(iw_cm_connect); + +/* + * Passive Side: new CM_ID <-- CONN_RECV + * + * Handles an inbound connect request. The function creates a new + * iw_cm_id to represent the new connection and inherits the client + * callback function and other attributes from the listening parent. + * + * The work item contains a pointer to the listen_cm_id and the event. The + * listen_cm_id contains the client cm_handler, context and + * device. These are copied when the device is cloned. 
The event + * contains the new four tuple. + * + * An error on the child should not affect the parent, so this + * function does not return a value. + */ +static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + struct iw_cm_id *cm_id; + struct iwcm_id_private *cm_id_priv; + int ret; + + /* + * The provider should never generate a connection request + * event with a bad status. + */ + BUG_ON(iw_event->status); + + cm_id = iw_create_cm_id(listen_id_priv->id.device, + listen_id_priv->id.cm_handler, + listen_id_priv->id.context); + /* If the cm_id could not be created, ignore the request */ + if (IS_ERR(cm_id)) + goto out; + + cm_id->provider_data = iw_event->provider_data; + cm_id->m_local_addr = iw_event->local_addr; + cm_id->m_remote_addr = iw_event->remote_addr; + cm_id->local_addr = listen_id_priv->id.local_addr; + + ret = iwpm_get_remote_info(&listen_id_priv->id.m_local_addr, + &iw_event->remote_addr, + &cm_id->remote_addr, + RDMA_NL_IWCM); + if (ret) { + cm_id->remote_addr = iw_event->remote_addr; + } else { + iw_cm_check_wildcard(&listen_id_priv->id.m_local_addr, + &iw_event->local_addr, + &cm_id->local_addr); + iw_event->local_addr = cm_id->local_addr; + iw_event->remote_addr = cm_id->remote_addr; + } + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + cm_id_priv->state = IW_CM_STATE_CONN_RECV; + + /* + * We could be destroying the listening id. If so, ignore this + * upcall. + */ + spin_lock_irqsave(&listen_id_priv->lock, flags); + if (listen_id_priv->state != IW_CM_STATE_LISTEN) { + spin_unlock_irqrestore(&listen_id_priv->lock, flags); + iw_cm_reject(cm_id, NULL, 0); + iw_destroy_cm_id(cm_id); + goto out; + } + spin_unlock_irqrestore(&listen_id_priv->lock, flags); + + ret = alloc_work_entries(cm_id_priv, 3); + if (ret) { + iw_cm_reject(cm_id, NULL, 0); + iw_destroy_cm_id(cm_id); + goto out; + } + + /* Call the client CM handler */ + ret = cm_id->cm_handler(cm_id, iw_event); + if (ret) { + iw_cm_reject(cm_id, NULL, 0); + iw_destroy_cm_id(cm_id); + } + +out: + if (iw_event->private_data_len) + kfree(iw_event->private_data); +} + +/* + * Passive Side: CM_ID <-- ESTABLISHED + * + * The provider generated an ESTABLISHED event which means that + * the MPA negotion has completed successfully and we are now in MPA + * FPDU mode. + * + * This event can only be received in the CONN_RECV state. If the + * remote peer closed, the ESTABLISHED event would be received followed + * by the CLOSE event. If the app closes, it will block until we wake + * it up after processing this event. + */ +static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + + /* + * We clear the CONNECT_WAIT bit here to allow the callback + * function to call iw_cm_disconnect. Calling iw_destroy_cm_id + * from a callback handler is not allowed. + */ + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); + cm_id_priv->state = IW_CM_STATE_ESTABLISHED; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + wake_up_all(&cm_id_priv->connect_wait); + + return ret; +} + +/* + * Active Side: CM_ID <-- ESTABLISHED + * + * The app has called connect and is waiting for the established event to + * post it's requests to the server. 
This event will wake up anyone + * blocked in iw_cm_disconnect or iw_destroy_id. + */ +static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + struct ib_qp *qp = NULL; + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + /* + * Clear the connect wait bit so a callback function calling + * iw_cm_disconnect will not wait and deadlock this thread + */ + clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); + BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); + if (iw_event->status == 0) { + cm_id_priv->id.m_local_addr = iw_event->local_addr; + cm_id_priv->id.m_remote_addr = iw_event->remote_addr; + iw_event->local_addr = cm_id_priv->id.local_addr; + iw_event->remote_addr = cm_id_priv->id.remote_addr; + cm_id_priv->state = IW_CM_STATE_ESTABLISHED; + } else { + /* REJECTED or RESET */ + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; + cm_id_priv->state = IW_CM_STATE_IDLE; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + if (qp) + cm_id_priv->id.device->ops.iw_rem_ref(qp); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + + if (iw_event->private_data_len) + kfree(iw_event->private_data); + + /* Wake up waiters on connect complete */ + wake_up_all(&cm_id_priv->connect_wait); + + return ret; +} + +/* + * CM_ID <-- CLOSING + * + * If in the ESTABLISHED state, move to CLOSING. + */ +static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + unsigned long flags; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) + cm_id_priv->state = IW_CM_STATE_CLOSING; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +} + +/* + * CM_ID <-- IDLE + * + * If in the ESTABLISHED or CLOSING states, the QP will have been + * moved by the provider to the ERR state. Disassociate the CM_ID from + * the QP, move to IDLE, and remove the 'connected' reference. + * + * If in some other state, the cm_id was destroyed asynchronously. + * This is the last reference that will result in waking up + * the app thread blocked in iw_destroy_cm_id. + */ +static int cm_close_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + struct ib_qp *qp; + unsigned long flags; + int ret = 0, notify_event = 0; + spin_lock_irqsave(&cm_id_priv->lock, flags); + qp = cm_id_priv->qp; + cm_id_priv->qp = NULL; + + switch (cm_id_priv->state) { + case IW_CM_STATE_ESTABLISHED: + case IW_CM_STATE_CLOSING: + cm_id_priv->state = IW_CM_STATE_IDLE; + notify_event = 1; + break; + case IW_CM_STATE_DESTROYING: + break; + default: + BUG(); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (qp) + cm_id_priv->id.device->ops.iw_rem_ref(qp); + if (notify_event) + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + return ret; +} + +static int process_event(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + int ret = 0; + + switch (iw_event->event) { + case IW_CM_EVENT_CONNECT_REQUEST: + cm_conn_req_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_CONNECT_REPLY: + ret = cm_conn_rep_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_ESTABLISHED: + ret = cm_conn_est_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_DISCONNECT: + cm_disconnect_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_CLOSE: + ret = cm_close_handler(cm_id_priv, iw_event); + break; + default: + BUG(); + } + + return ret; +} + +/* + * Process events on the work_list for the cm_id. 
If the callback + * function requests that the cm_id be deleted, a flag is set in the + * cm_id flags to indicate that when the last reference is + * removed, the cm_id is to be destroyed. This is necessary to + * distinguish between an object that will be destroyed by the app + * thread asleep on the destroy_comp list vs. an object destroyed + * here synchronously when the last reference is removed. + */ +static void cm_work_handler(struct work_struct *_work) +{ + struct iwcm_work *work = container_of(_work, struct iwcm_work, work); + struct iw_cm_event levent; + struct iwcm_id_private *cm_id_priv = work->cm_id; + unsigned long flags; + int empty; + int ret = 0; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + empty = list_empty(&cm_id_priv->work_list); + while (!empty) { + work = list_entry(cm_id_priv->work_list.next, + struct iwcm_work, list); + list_del_init(&work->list); + empty = list_empty(&cm_id_priv->work_list); + levent = work->event; + put_work(work); + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { + ret = process_event(cm_id_priv, &levent); + if (ret) + destroy_cm_id(&cm_id_priv->id); + } else + pr_debug("dropping event %d\n", levent.event); + if (iwcm_deref_id(cm_id_priv)) + return; + if (empty) + return; + spin_lock_irqsave(&cm_id_priv->lock, flags); + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); +} + +/* + * This function is called on interrupt context. Schedule events on + * the iwcm_wq thread to allow callback functions to downcall into + * the CM and/or block. Events are queued to a per-CM_ID + * work_list. If this is the first event on the work_list, the work + * element is also queued on the iwcm_wq thread. + * + * Each event holds a reference on the cm_id. Until the last posted + * event has been delivered and processed, the cm_id cannot be + * deleted. + * + * Returns: + * 0 - the event was handled. + * -ENOMEM - the event was not handled due to lack of resources. 
+ */ +static int cm_event_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct iwcm_work *work; + struct iwcm_id_private *cm_id_priv; + unsigned long flags; + int ret = 0; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + work = get_work(cm_id_priv); + if (!work) { + ret = -ENOMEM; + goto out; + } + + INIT_WORK(&work->work, cm_work_handler); + work->cm_id = cm_id_priv; + work->event = *iw_event; + + if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || + work->event.event == IW_CM_EVENT_CONNECT_REPLY) && + work->event.private_data_len) { + ret = copy_private_data(&work->event); + if (ret) { + put_work(work); + goto out; + } + } + + refcount_inc(&cm_id_priv->refcount); + if (list_empty(&cm_id_priv->work_list)) { + list_add_tail(&work->list, &cm_id_priv->work_list); + queue_work(iwcm_wq, &work->work); + } else + list_add_tail(&work->list, &cm_id_priv->work_list); +out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_READ; + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cm_id_priv->lock, flags); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = 0; + ret = 0; + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} + +int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + switch (qp_attr->qp_state) { + case IB_QPS_INIT: + case IB_QPS_RTR: + ret = iwcm_init_qp_init_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + case IB_QPS_RTS: + ret = iwcm_init_qp_rts_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} +EXPORT_SYMBOL(iw_cm_init_qp_attr); + +static int __init iw_cm_init(void) +{ + int ret; + + ret = iwpm_init(RDMA_NL_IWCM); + if (ret) + return ret; + + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); + if (!iwcm_wq) + goto err_alloc; + + iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", + iwcm_ctl_table); + if (!iwcm_ctl_table_hdr) { + pr_err("iw_cm: couldn't register sysctl paths\n"); + goto err_sysctl; + } + + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); + return 0; + +err_sysctl: + destroy_workqueue(iwcm_wq); +err_alloc: + iwpm_exit(RDMA_NL_IWCM); + return -ENOMEM; +} + +static void __exit iw_cm_cleanup(void) +{ + rdma_nl_unregister(RDMA_NL_IWCM); + unregister_net_sysctl_table(iwcm_ctl_table_hdr); + destroy_workqueue(iwcm_wq); + iwpm_exit(RDMA_NL_IWCM); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); + 
+module_init(iw_cm_init); +module_exit(iw_cm_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.h new file mode 100644 index 0000000..bf74639 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwcm.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef IWCM_H +#define IWCM_H + +enum iw_cm_state { + IW_CM_STATE_IDLE, /* unbound, inactive */ + IW_CM_STATE_LISTEN, /* listen waiting for connect */ + IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */ + IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */ + IW_CM_STATE_ESTABLISHED, /* established */ + IW_CM_STATE_CLOSING, /* disconnect */ + IW_CM_STATE_DESTROYING /* object being deleted */ +}; + +struct iwcm_id_private { + struct iw_cm_id id; + enum iw_cm_state state; + unsigned long flags; + struct ib_qp *qp; + struct completion destroy_comp; + wait_queue_head_t connect_wait; + struct list_head work_list; + spinlock_t lock; + refcount_t refcount; + struct list_head work_free_list; +}; + +#define IWCM_F_DROP_EVENTS 1 +#define IWCM_F_CONNECT_WAIT 2 + +#endif /* IWCM_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_msg.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_msg.c new file mode 100644 index 0000000..3c9a986 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_msg.c @@ -0,0 +1,846 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; +u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; +static int iwpm_user_pid = IWPM_PID_UNDEFINED; +static atomic_t echo_nlmsg_seq; + +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ +int iwpm_valid_pid(void) +{ + return iwpm_user_pid > 0; +} + +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client + * + * nlmsg attributes: + * [IWPM_NLA_REG_PID_SEQ] + * [IWPM_NLA_REG_IF_NAME] + * [IWPM_NLA_REG_IBDEV_NAME] + * [IWPM_NLA_REG_ULIB_NAME] + */ +int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || + iwpm_user_pid == IWPM_PID_UNAVAILABLE) + return 0; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto pid_query_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto pid_query_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the pid request message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IFNAMSIZ, + pm_msg->if_name, IWPM_NLA_REG_IF_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, + pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME); + if (ret) + goto pid_query_error; + ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE, + (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME); + if (ret) + goto pid_query_error; + + nlmsg_end(skb, nlh); + + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", + __func__, 
pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); + + ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNAVAILABLE; + err_str = "Unable to send a nlmsg"; + goto pid_query_error; + } + nlmsg_request->req_buffer = pm_msg; + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +pid_query_error: + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} + +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + * [IWPM_NLA_MANAGE_FLAGS] + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) + */ +int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { + err_str = "Unregistered port mapper client"; + goto add_mapping_error; + } + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto add_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto add_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + /* fill in the add mapping message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto add_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto add_mapping_error; + + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto add_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_MANAGE_FLAGS); + if (ret) + goto add_mapping_error; + } + + nlmsg_end(skb, nlh); + nlmsg_request->req_buffer = pm_msg; + + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto add_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +add_mapping_error: + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); +add_mapping_error_nowarn: + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} + +/** + * iwpm_add_and_query_mapping - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * nlmsg attributes: + * [IWPM_NLA_QUERY_MAPPING_SEQ] + * [IWPM_NLA_QUERY_LOCAL_ADDR] + * 
[IWPM_NLA_QUERY_REMOTE_ADDR] + * [IWPM_NLA_QUERY_FLAGS] + */ +int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_pid()) + return 0; + if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { + err_str = "Unregistered port mapper client"; + goto query_mapping_error; + } + ret = -ENOMEM; + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto query_mapping_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, + nl_client, GFP_KERNEL); + if (!nlmsg_request) { + err_str = "Unable to allocate netlink request"; + goto query_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + + /* fill in the query message */ + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_QUERY_MAPPING_SEQ); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR); + if (ret) + goto query_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), + &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); + if (ret) + goto query_mapping_error; + + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto query_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_QUERY_FLAGS); + if (ret) + goto query_mapping_error; + } + + nlmsg_end(skb, nlh); + nlmsg_request->req_buffer = pm_msg; + + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + err_str = "Unable to send a nlmsg"; + goto query_mapping_error; + } + ret = iwpm_wait_complete_req(nlmsg_request); + return ret; +query_mapping_error: + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); +query_mapping_error_nowarn: + dev_kfree_skb(skb); + if (nlmsg_request) + iwpm_free_nlmsg_request(&nlmsg_request->kref); + return ret; +} + +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + * + * nlmsg attributes: + * [IWPM_NLA_MANAGE_MAPPING_SEQ] + * [IWPM_NLA_MANAGE_ADDR] + */ +int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + if (!iwpm_valid_pid()) + return 0; + if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) { + err_str = "Unregistered port mapper client"; + goto remove_mapping_error; + } + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to create a nlmsg"; + goto remove_mapping_error; + } + msg_seq = atomic_read(&echo_nlmsg_seq); + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, + IWPM_NLA_MANAGE_MAPPING_SEQ); + if (ret) + goto remove_mapping_error; + ret = ibnl_put_attr(skb, nlh, sizeof(struct 
sockaddr_storage), + local_addr, IWPM_NLA_MANAGE_ADDR); + if (ret) + goto remove_mapping_error; + + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); + if (ret) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + iwpm_user_pid = IWPM_PID_UNDEFINED; + err_str = "Unable to send a nlmsg"; + goto remove_mapping_error; + } + iwpm_print_sockaddr(local_addr, + "remove_mapping: Local sockaddr:"); + return 0; +remove_mapping_error: + pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); + if (skb) + dev_kfree_skb_any(skb); + return ret; +} + +/* netlink attribute policy for the received response to register pid request */ +static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { + [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING, + .len = IWPM_DEVNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 }, + [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } +}; + +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which is used in future communication with the port mapper + */ +int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX]; + struct iwpm_dev_data *pm_msg; + char *dev_name, *iwpm_name; + u32 msg_seq; + u8 nl_client; + u16 iwpm_version; + const char *msg_type = "Register Pid response"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX, + resp_reg_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + nl_client = nlmsg_request->nl_client; + dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]); + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]); + + /* check device name, ulib name and version */ + if (strcmp(pm_msg->dev_name, dev_name) || + strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version < IWPM_UABI_VERSION_MIN) { + + pr_info("%s: Incorrect info (dev = %s name = %s version = %u)\n", + __func__, dev_name, iwpm_name, iwpm_version); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto register_pid_response_exit; + } + iwpm_user_pid = cb->nlh->nlmsg_pid; + iwpm_ulib_version = iwpm_version; + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %d. 
Continuing...", + __func__, iwpm_user_pid); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + iwpm_set_registration(nl_client, IWPM_REG_VALID); +register_pid_response_exit: + nlmsg_request->request_done = 1; + /* always for found nlmsg_request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} + +/* netlink attribute policy for the received response to add mapping request */ +static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { + [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RMANAGE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr; + struct sockaddr_storage *mapped_sockaddr; + const char *msg_type; + u32 msg_seq; + + msg_type = "Add Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX, + resp_add_policy, nltb, msg_type)) + return -EINVAL; + + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]); + mapped_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]); + + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto add_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr, + sizeof(*mapped_sockaddr)); + iwpm_print_sockaddr(&pm_msg->loc_addr, + "add_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "add_mapping: Mapped local sockaddr:"); + +add_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} + +/* netlink attribute policy for the response to add and query mapping request + * and response with remote address info + */ +static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { + [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RQUERY_LOCAL_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_REMOTE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + 
[IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } +}; + +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct iwpm_sa_data *pm_msg; + struct iwpm_nlmsg_request *nlmsg_request = NULL; + struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + const char *msg_type; + u32 msg_seq; + u16 err_code; + + msg_type = "Query Mapping response"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return -EINVAL; + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]); + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + pr_info("%s: Could not find a matching request (seq = %u)\n", + __func__, msg_seq); + return -EINVAL; + } + pm_msg = nlmsg_request->req_buffer; + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]); + if (err_code == IWPM_REMOTE_QUERY_REJECT) { + pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n", + __func__, cb->nlh->nlmsg_pid, msg_seq); + nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT; + } + if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) || + iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) { + pr_info("%s: Incorrect local sockaddr\n", __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; + goto query_mapping_response_exit; + } + memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr, + sizeof(*mapped_loc_sockaddr)); + memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr, + sizeof(*mapped_rem_sockaddr)); + + iwpm_print_sockaddr(&pm_msg->loc_addr, + "query_mapping: Local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, + "query_mapping: Mapped local sockaddr:"); + iwpm_print_sockaddr(&pm_msg->rem_addr, + "query_mapping: Remote sockaddr:"); + iwpm_print_sockaddr(&pm_msg->mapped_rem_addr, + "query_mapping: Mapped remote sockaddr:"); +query_mapping_response_exit: + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} + +/** + * iwpm_remote_info_cb - Process remote connecting peer address info, which + * the port mapper has received from the connecting peer + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * Stores the IPv4/IPv6 address info in a hash table + */ +int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr 
*nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; + struct sockaddr_storage *local_sockaddr, *remote_sockaddr; + struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; + struct iwpm_remote_info *rem_info; + const char *msg_type; + u8 nl_client; + int ret = -EINVAL; + + msg_type = "Remote Mapping info"; + if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, + resp_query_policy, nltb, msg_type)) + return ret; + + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + + local_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); + remote_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); + mapped_loc_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); + mapped_rem_sockaddr = (struct sockaddr_storage *) + nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); + + if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || + mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { + pr_info("%s: Sockaddr family doesn't match the requested one\n", + __func__); + return ret; + } + rem_info = kzalloc(sizeof(struct iwpm_remote_info), GFP_ATOMIC); + if (!rem_info) { + ret = -ENOMEM; + return ret; + } + memcpy(&rem_info->mapped_loc_sockaddr, mapped_loc_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&rem_info->remote_sockaddr, remote_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&rem_info->mapped_rem_sockaddr, mapped_rem_sockaddr, + sizeof(struct sockaddr_storage)); + rem_info->nl_client = nl_client; + + iwpm_add_remote_info(rem_info); + + iwpm_print_sockaddr(local_sockaddr, + "remote_info: Local sockaddr:"); + iwpm_print_sockaddr(mapped_loc_sockaddr, + "remote_info: Mapped local sockaddr:"); + iwpm_print_sockaddr(remote_sockaddr, + "remote_info: Remote sockaddr:"); + iwpm_print_sockaddr(mapped_rem_sockaddr, + "remote_info: Mapped remote sockaddr:"); + return ret; +} + +/* netlink attribute policy for the received request for mapping info */ +static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { + [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING, + .len = IWPM_ULIBNAME_SIZE - 1 }, + [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } +}; + +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper + */ +int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; + const char *msg_type = "Mapping Info response"; + u8 nl_client; + char *iwpm_name; + u16 iwpm_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX, + resp_mapinfo_policy, nltb, msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); + iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); + if (strcmp(iwpm_ulib_name, iwpm_name) || + iwpm_version < IWPM_UABI_VERSION_MIN) { + pr_info("%s: Invalid port mapper name = %s version = %u\n", + __func__, iwpm_name, iwpm_version); + return ret; + } + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_user_pid = cb->nlh->nlmsg_pid; + + 
if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...", + __func__, iwpm_user_pid); + + if (!iwpm_mapinfo_available()) + return 0; + pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", + __func__, iwpm_user_pid); + ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); + return ret; +} + +/* netlink attribute policy for the received mapping info ack */ +static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { + [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }, + [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } +}; + +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX]; + u32 mapinfo_send, mapinfo_ack; + const char *msg_type = "Mapping Info Ack"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX, + ack_mapinfo_policy, nltb, msg_type)) + return -EINVAL; + mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]); + mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]); + if (mapinfo_ack != mapinfo_send) + pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n", + __func__, mapinfo_send, mapinfo_ack); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + return 0; +} + +/* netlink attribute policy for the received port mapper error message */ +static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { + [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, +}; + +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + */ +int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + struct nlattr *nltb[IWPM_NLA_ERR_MAX]; + u32 msg_seq; + u16 err_code; + const char *msg_type = "Mapping Error Msg"; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX, + map_error_policy, nltb, msg_type)) + return -EINVAL; + + msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]); + err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]); + pr_info("%s: Received msg seq = %u err code = %u client = %d\n", + __func__, msg_seq, err_code, nl_client); + /* look for nlmsg_request */ + nlmsg_request = iwpm_find_nlmsg_request(msg_seq); + if (!nlmsg_request) { + /* not all errors have associated requests */ + pr_debug("Could not find matching req (seq = %u)\n", msg_seq); + return 0; + } + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + nlmsg_request->err_code = err_code; + nlmsg_request->request_done = 1; + /* always for found request */ + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + barrier(); + up(&nlmsg_request->sem); + return 0; +} + +/* netlink attribute policy for the received hello request */ +static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { + [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } +}; + +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: The socket buffer + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. 
+ */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_HELLO_MAX]; + const char *msg_type = "Hello request"; + u8 nl_client; + u16 abi_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb, + msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); + pr_debug("Using ABI version %u\n", iwpm_ulib_version); + iwpm_user_pid = cb->nlh->nlmsg_pid; + ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version); + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.c new file mode 100644 index 0000000..358a2db --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.c @@ -0,0 +1,793 @@ +/* + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "iwpm_util.h" + +#define IWPM_MAPINFO_HASH_SIZE 512 +#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) +#define IWPM_REMINFO_HASH_SIZE 64 +#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) +#define IWPM_MSG_SIZE 512 + +static LIST_HEAD(iwpm_nlmsg_req_list); +static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); + +static struct hlist_head *iwpm_hash_bucket; +static DEFINE_SPINLOCK(iwpm_mapinfo_lock); + +static struct hlist_head *iwpm_reminfo_bucket; +static DEFINE_SPINLOCK(iwpm_reminfo_lock); + +static struct iwpm_admin_data iwpm_admin; + +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes up. 
+ */ +int iwpm_init(u8 nl_client) +{ + iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_hash_bucket) + return -ENOMEM; + + iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, + sizeof(struct hlist_head), GFP_KERNEL); + if (!iwpm_reminfo_bucket) { + kfree(iwpm_hash_bucket); + return -ENOMEM; + } + + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); + pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); + return 0; +} + +static void free_hash_bucket(void); +static void free_reminfo_bucket(void); + +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes down. + */ +int iwpm_exit(u8 nl_client) +{ + free_hash_bucket(); + free_reminfo_bucket(); + pr_debug("%s: Resources are destroyed\n", __func__); + iwpm_set_registration(nl_client, IWPM_REG_UNDEF); + return 0; +} + +static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, + struct sockaddr_storage *); + +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_sockaddr: Local ip/tcp address + * @mapped_sockaddr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags + */ +int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_sockaddr, + u8 nl_client, u32 map_flags) +{ + struct hlist_head *hash_bucket_head = NULL; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int ret = -EINVAL; + + map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); + if (!map_info) + return -ENOMEM; + + memcpy(&map_info->local_sockaddr, local_sockaddr, + sizeof(struct sockaddr_storage)); + memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, + sizeof(struct sockaddr_storage)); + map_info->nl_client = nl_client; + map_info->map_flags = map_flags; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_mapinfo_hash_bucket( + &map_info->local_sockaddr, + &map_info->mapped_sockaddr); + if (hash_bucket_head) { + hlist_add_head(&map_info->hlist_node, hash_bucket_head); + ret = 0; + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + + if (!hash_bucket_head) + kfree(map_info); + return ret; +} + +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_sockaddr: Local ip/tcp address + * @mapped_local_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ +int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, + struct sockaddr_storage *mapped_local_addr) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_mapping_info *map_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + hash_bucket_head = get_mapinfo_hash_bucket( + local_sockaddr, + mapped_local_addr); + if (!hash_bucket_head) + goto remove_mapinfo_exit; + + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, + mapped_local_addr)) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + ret = 0; + break; + } + } + } +remove_mapinfo_exit: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return ret; +} + +static void 
free_hash_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_mapping_info *map_info; + unsigned long flags; + int i; + + /* remove all the mapinfo data from the list */ + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { + hlist_for_each_entry_safe(map_info, tmp_hlist_node, + &iwpm_hash_bucket[i], hlist_node) { + + hlist_del_init(&map_info->hlist_node); + kfree(map_info); + } + } + /* free the hash list */ + kfree(iwpm_hash_bucket); + iwpm_hash_bucket = NULL; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +} + +static void free_reminfo_bucket(void) +{ + struct hlist_node *tmp_hlist_node; + struct iwpm_remote_info *rem_info; + unsigned long flags; + int i; + + /* remove all the remote info from the list */ + spin_lock_irqsave(&iwpm_reminfo_lock, flags); + for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) { + hlist_for_each_entry_safe(rem_info, tmp_hlist_node, + &iwpm_reminfo_bucket[i], hlist_node) { + + hlist_del_init(&rem_info->hlist_node); + kfree(rem_info); + } + } + /* free the hash list */ + kfree(iwpm_reminfo_bucket); + iwpm_reminfo_bucket = NULL; + spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); +} + +static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *, + struct sockaddr_storage *); + +void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) +{ + struct hlist_head *hash_bucket_head; + unsigned long flags; + + spin_lock_irqsave(&iwpm_reminfo_lock, flags); + if (iwpm_reminfo_bucket) { + hash_bucket_head = get_reminfo_hash_bucket( + &rem_info->mapped_loc_sockaddr, + &rem_info->mapped_rem_sockaddr); + if (hash_bucket_head) + hlist_add_head(&rem_info->hlist_node, hash_bucket_head); + } + spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); +} + +/** + * iwpm_get_remote_info - Get the remote connecting peer address info + * + * @mapped_loc_addr: Mapped local address of the listening peer + * @mapped_rem_addr: Mapped remote address of the connecting peer + * @remote_addr: To store the remote address of the connecting peer + * @nl_client: The index of the netlink client + * + * The remote address info is retrieved and provided to the client in + * the remote_addr. 
After that it is removed from the hash table + */ +int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, + struct sockaddr_storage *mapped_rem_addr, + struct sockaddr_storage *remote_addr, + u8 nl_client) +{ + struct hlist_node *tmp_hlist_node; + struct hlist_head *hash_bucket_head; + struct iwpm_remote_info *rem_info = NULL; + unsigned long flags; + int ret = -EINVAL; + + spin_lock_irqsave(&iwpm_reminfo_lock, flags); + if (iwpm_reminfo_bucket) { + hash_bucket_head = get_reminfo_hash_bucket( + mapped_loc_addr, + mapped_rem_addr); + if (!hash_bucket_head) + goto get_remote_info_exit; + hlist_for_each_entry_safe(rem_info, tmp_hlist_node, + hash_bucket_head, hlist_node) { + + if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr, + mapped_loc_addr) && + !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr, + mapped_rem_addr)) { + + memcpy(remote_addr, &rem_info->remote_sockaddr, + sizeof(struct sockaddr_storage)); + iwpm_print_sockaddr(remote_addr, + "get_remote_info: Remote sockaddr:"); + + hlist_del_init(&rem_info->hlist_node); + kfree(rem_info); + ret = 0; + break; + } + } + } +get_remote_info_exit: + spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); + return ret; +} + +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp) +{ + struct iwpm_nlmsg_request *nlmsg_request = NULL; + unsigned long flags; + + nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); + if (!nlmsg_request) + return NULL; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + kref_init(&nlmsg_request->kref); + kref_get(&nlmsg_request->kref); + nlmsg_request->nlmsg_seq = nlmsg_seq; + nlmsg_request->nl_client = nl_client; + nlmsg_request->request_done = 0; + nlmsg_request->err_code = 0; + sema_init(&nlmsg_request->sem, 1); + down(&nlmsg_request->sem); + return nlmsg_request; +} + +void iwpm_free_nlmsg_request(struct kref *kref) +{ + struct iwpm_nlmsg_request *nlmsg_request; + unsigned long flags; + + nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_del_init(&nlmsg_request->inprocess_list); + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + + if (!nlmsg_request->request_done) + pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", + __func__, nlmsg_request->nlmsg_seq); + kfree(nlmsg_request); +} + +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) +{ + struct iwpm_nlmsg_request *nlmsg_request; + struct iwpm_nlmsg_request *found_request = NULL; + unsigned long flags; + + spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); + list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, + inprocess_list) { + if (nlmsg_request->nlmsg_seq == echo_seq) { + found_request = nlmsg_request; + kref_get(&nlmsg_request->kref); + break; + } + } + spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); + return found_request; +} + +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) +{ + int ret; + + ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT); + if (ret) { + ret = -EINVAL; + pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", + __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); + } else { + ret = nlmsg_request->err_code; + } + kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); + return ret; +} + +int iwpm_get_nlmsg_seq(void) +{ + return atomic_inc_return(&iwpm_admin.nlmsg_seq); +} + +/* valid 
client */ +u32 iwpm_get_registration(u8 nl_client) +{ + return iwpm_admin.reg_list[nl_client]; +} + +/* valid client */ +void iwpm_set_registration(u8 nl_client, u32 reg) +{ + iwpm_admin.reg_list[nl_client] = reg; +} + +/* valid client */ +u32 iwpm_check_registration(u8 nl_client, u32 reg) +{ + return (iwpm_get_registration(nl_client) & reg); +} + +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr) +{ + if (a_sockaddr->ss_family != b_sockaddr->ss_family) + return 1; + if (a_sockaddr->ss_family == AF_INET) { + struct sockaddr_in *a4_sockaddr = + (struct sockaddr_in *)a_sockaddr; + struct sockaddr_in *b4_sockaddr = + (struct sockaddr_in *)b_sockaddr; + if (!memcmp(&a4_sockaddr->sin_addr, + &b4_sockaddr->sin_addr, sizeof(struct in_addr)) + && a4_sockaddr->sin_port == b4_sockaddr->sin_port) + return 0; + + } else if (a_sockaddr->ss_family == AF_INET6) { + struct sockaddr_in6 *a6_sockaddr = + (struct sockaddr_in6 *)a_sockaddr; + struct sockaddr_in6 *b6_sockaddr = + (struct sockaddr_in6 *)b_sockaddr; + if (!memcmp(&a6_sockaddr->sin6_addr, + &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) + && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) + return 0; + + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + } + return 1; +} + +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client) +{ + struct sk_buff *skb = NULL; + + skb = dev_alloc_skb(IWPM_MSG_SIZE); + if (!skb) + goto create_nlmsg_exit; + + if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, + NLM_F_REQUEST))) { + pr_warn("%s: Unable to put the nlmsg header\n", __func__); + dev_kfree_skb(skb); + skb = NULL; + } +create_nlmsg_exit: + return skb; +} + +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type) +{ + int nlh_len = 0; + int ret; + const char *err_str = ""; + + ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, + nlmsg_policy, NULL); + if (ret) { + err_str = "Invalid attribute"; + goto parse_nlmsg_error; + } + ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, + nlmsg_policy, NULL); + if (ret) { + err_str = "Unable to parse the nlmsg"; + goto parse_nlmsg_error; + } + ret = iwpm_validate_nlmsg_attr(nltb, policy_max); + if (ret) { + err_str = "Invalid NULL attribute"; + goto parse_nlmsg_error; + } + return 0; +parse_nlmsg_error: + pr_warn("%s: %s (msg type %s ret = %d)\n", + __func__, err_str, msg_type, ret); + return ret; +} + +void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) +{ + struct sockaddr_in6 *sockaddr_v6; + struct sockaddr_in *sockaddr_v4; + + switch (sockaddr->ss_family) { + case AF_INET: + sockaddr_v4 = (struct sockaddr_in *)sockaddr; + pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", + msg, &sockaddr_v4->sin_addr, + ntohs(sockaddr_v4->sin_port), + ntohs(sockaddr_v4->sin_port)); + break; + case AF_INET6: + sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; + pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", + msg, &sockaddr_v6->sin6_addr, + ntohs(sockaddr_v6->sin6_port), + ntohs(sockaddr_v6->sin6_port)); + break; + default: + break; + } +} + +static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) +{ + u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); + u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); + return hash; +} + +static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) +{ + u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, 
sizeof(struct in_addr), 0); + u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); + return hash; +} + +static int get_hash_bucket(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr, u32 *hash) +{ + u32 a_hash, b_hash; + + if (a_sockaddr->ss_family == AF_INET) { + a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr); + b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr); + + } else if (a_sockaddr->ss_family == AF_INET6) { + a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr); + b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr); + } else { + pr_err("%s: Invalid sockaddr family\n", __func__); + return -EINVAL; + } + + if (a_hash == b_hash) /* if port mapper isn't available */ + *hash = a_hash; + else + *hash = jhash_2words(a_hash, b_hash, 0); + return 0; +} + +static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage + *local_sockaddr, struct sockaddr_storage + *mapped_sockaddr) +{ + u32 hash; + int ret; + + ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash); + if (ret) + return NULL; + return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK]; +} + +static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage + *mapped_loc_sockaddr, struct sockaddr_storage + *mapped_rem_sockaddr) +{ + u32 hash; + int ret; + + ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash); + if (ret) + return NULL; + return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK]; +} + +static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto mapinfo_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of mapinfo number nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); + if (ret) + goto mapinfo_num_error; + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); + if (ret) + goto mapinfo_num_error; + + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto mapinfo_num_error; + } + pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num); + return 0; +mapinfo_num_error: + pr_info("%s: %s\n", __func__, err_str); + dev_kfree_skb(skb); + return ret; +} + +static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) +{ + struct nlmsghdr *nlh = NULL; + int ret = 0; + + if (!skb) + return ret; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + dev_kfree_skb(skb); + return -ENOMEM; + } + nlh->nlmsg_type = NLMSG_DONE; + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); + if (ret) + pr_warn("%s Unable to send a nlmsg\n", __func__); + return ret; +} + +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) +{ + struct iwpm_mapping_info *map_info; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + int skb_num = 0, mapping_num = 0; + int i = 0, nlmsg_bytes = 0; + unsigned long flags; + const char *err_str = ""; + int ret; + + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + skb_num++; + 
spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + ret = -EINVAL; + for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { + hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], + hlist_node) { + if (map_info->nl_client != nl_client) + continue; + nlh = NULL; + if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, + RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { + ret = -ENOMEM; + err_str = "Unable to put the nlmsg header"; + goto send_mapping_info_unlock; + } + err_str = "Unable to put attribute of the nlmsg"; + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->local_sockaddr, + IWPM_NLA_MAPINFO_LOCAL_ADDR); + if (ret) + goto send_mapping_info_unlock; + + ret = ibnl_put_attr(skb, nlh, + sizeof(struct sockaddr_storage), + &map_info->mapped_sockaddr, + IWPM_NLA_MAPINFO_MAPPED_ADDR); + if (ret) + goto send_mapping_info_unlock; + + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &map_info->map_flags, + IWPM_NLA_MAPINFO_FLAGS); + if (ret) + goto send_mapping_info_unlock; + } + + nlmsg_end(skb, nlh); + + iwpm_print_sockaddr(&map_info->local_sockaddr, + "send_mapping_info: Local sockaddr:"); + iwpm_print_sockaddr(&map_info->mapped_sockaddr, + "send_mapping_info: Mapped local sockaddr:"); + mapping_num++; + nlmsg_bytes += nlh->nlmsg_len; + + /* check if all mappings can fit in one skb */ + if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { + /* and leave room for NLMSG_DONE */ + nlmsg_bytes = 0; + skb_num++; + spin_unlock_irqrestore(&iwpm_mapinfo_lock, + flags); + /* send the skb */ + ret = send_nlmsg_done(skb, nl_client, iwpm_pid); + skb = NULL; + if (ret) { + err_str = "Unable to send map info"; + goto send_mapping_info_exit; + } + if (skb_num == IWPM_MAPINFO_SKB_COUNT) { + ret = -ENOMEM; + err_str = "Insufficient skbs for map info"; + goto send_mapping_info_exit; + } + skb = dev_alloc_skb(NLMSG_GOODSIZE); + if (!skb) { + ret = -ENOMEM; + err_str = "Unable to allocate skb"; + goto send_mapping_info_exit; + } + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + } + } + } +send_mapping_info_unlock: + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); +send_mapping_info_exit: + if (ret) { + pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); + dev_kfree_skb(skb); + return ret; + } + send_nlmsg_done(skb, nl_client, iwpm_pid); + return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); +} + +int iwpm_mapinfo_available(void) +{ + unsigned long flags; + int full_bucket = 0, i = 0; + + spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + if (iwpm_hash_bucket) { + for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { + if (!hlist_empty(&iwpm_hash_bucket[i])) { + full_bucket = 1; + break; + } + } + } + spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); + return full_bucket; +} + +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + const char *err_str; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto hello_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of abi_version into nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, + IWPM_NLA_HELLO_ABI_VERSION); + if (ret) + goto hello_num_error; + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto hello_num_error; + } + pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); + return 
0; +hello_num_error: + pr_info("%s: %s\n", __func__, err_str); + dev_kfree_skb(skb); + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.h new file mode 100644 index 0000000..3a42ad4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/iwpm_util.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
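For context on the iwpm_util.c helpers above: iwpm_parse_nlmsg() expects a caller-supplied nla_policy and attribute table. A minimal sketch, modeled on the hello message sent by iwpm_send_hello() and assuming the IWPM_NLA_HELLO_* enum (including its _MAX terminator) from the rdma netlink uapi header; the callback name is hypothetical:

static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
        [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 },
};

/* Hypothetical netlink callback illustrating the parse helper */
static int example_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nlattr *nltb[IWPM_NLA_HELLO_MAX];

        if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb,
                             "hello example"))
                return -EINVAL;

        return nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
}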
+ */ +#ifndef _IWPM_UTIL_H +#define _IWPM_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define IWPM_NL_RETRANS 3 +#define IWPM_NL_TIMEOUT (10*HZ) +#define IWPM_MAPINFO_SKB_COUNT 20 + +#define IWPM_PID_UNDEFINED -1 +#define IWPM_PID_UNAVAILABLE -2 + +#define IWPM_REG_UNDEF 0x01 +#define IWPM_REG_VALID 0x02 +#define IWPM_REG_INCOMPL 0x04 + +struct iwpm_nlmsg_request { + struct list_head inprocess_list; + __u32 nlmsg_seq; + void *req_buffer; + u8 nl_client; + u8 request_done; + u16 err_code; + struct semaphore sem; + struct kref kref; +}; + +struct iwpm_mapping_info { + struct hlist_node hlist_node; + struct sockaddr_storage local_sockaddr; + struct sockaddr_storage mapped_sockaddr; + u8 nl_client; + u32 map_flags; +}; + +struct iwpm_remote_info { + struct hlist_node hlist_node; + struct sockaddr_storage remote_sockaddr; + struct sockaddr_storage mapped_loc_sockaddr; + struct sockaddr_storage mapped_rem_sockaddr; + u8 nl_client; +}; + +struct iwpm_admin_data { + atomic_t nlmsg_seq; + u32 reg_list[RDMA_NL_NUM_CLIENTS]; +}; + +/** + * iwpm_get_nlmsg_request - Allocate and initialize netlink message request + * @nlmsg_seq: Sequence number of the netlink message + * @nl_client: The index of the netlink client + * @gfp: Indicates how the memory for the request should be allocated + * + * Returns the newly allocated netlink request object if successful, + * otherwise returns NULL + */ +struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, + u8 nl_client, gfp_t gfp); + +/** + * iwpm_free_nlmsg_request - Deallocate netlink message request + * @kref: Holds reference of netlink message request + */ +void iwpm_free_nlmsg_request(struct kref *kref); + +/** + * iwpm_find_nlmsg_request - Find netlink message request in the request list + * @echo_seq: Sequence number of the netlink request to find + * + * Returns the found netlink message request, + * if not found, returns NULL + */ +struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq); + +/** + * iwpm_wait_complete_req - Block while servicing the netlink request + * @nlmsg_request: Netlink message request to service + * + * Wakes up, after the request is completed or expired + * Returns 0 if the request is complete without error + */ +int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request); + +/** + * iwpm_get_nlmsg_seq - Get the sequence number for a netlink + * message to send to the port mapper + * + * Returns the sequence number for the netlink message. 
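Taken together, the request helpers documented above implement a simple request/response rendezvous with the user-space port mapper. A hedged usage sketch, with a hypothetical function name, assuming (as in the upstream iwpm code) that iwpm_wait_complete_req() drops the request's final reference internally:

static int example_round_trip(u8 nl_client)
{
        struct iwpm_nlmsg_request *req;
        u32 msg_seq = iwpm_get_nlmsg_seq();

        req = iwpm_get_nlmsg_request(msg_seq, nl_client, GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        /* ... build a netlink message carrying msg_seq and unicast it to
         * the user-space port mapper ... */

        /* Blocks until the reply handler completes the request or the
         * IWPM_NL_TIMEOUT expires. */
        return iwpm_wait_complete_req(req);
}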
+ */ +int iwpm_get_nlmsg_seq(void); + +/** + * iwpm_add_remote_info - Add remote address info of the connecting peer + * to the remote info hash table + * @reminfo: The remote info to be added + */ +void iwpm_add_remote_info(struct iwpm_remote_info *reminfo); + +/** + * iwpm_check_registration - Check if the client registration + * matches the given one + * @nl_client: The index of the netlink client + * @reg: The given registration type to compare with + * + * Call iwpm_register_pid() to register a client + * Returns true if the client registration matches reg, + * otherwise returns false + */ +u32 iwpm_check_registration(u8 nl_client, u32 reg); + +/** + * iwpm_set_registration - Set the client registration + * @nl_client: The index of the netlink client + * @reg: Registration type to set + */ +void iwpm_set_registration(u8 nl_client, u32 reg); + +/** + * iwpm_get_registration - Get the client registration + * @nl_client: The index of the netlink client + * + * Returns the client registration type + */ +u32 iwpm_get_registration(u8 nl_client); + +/** + * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of + * a client to the user space port mapper + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * + * If successful, returns the number of sent mapping info records + */ +int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid); + +/** + * iwpm_mapinfo_available - Check if any mapping info records is available + * in the hash table + * + * Returns 1 if mapping information is available, otherwise returns 0 + */ +int iwpm_mapinfo_available(void); + +/** + * iwpm_compare_sockaddr - Compare two sockaddr storage structs + * @a_sockaddr: first sockaddr to compare + * @b_sockaddr: second sockaddr to compare + * + * Return: 0 if they are holding the same ip/tcp address info, + * otherwise returns 1 + */ +int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, + struct sockaddr_storage *b_sockaddr); + +/** + * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes + * @nltb: Holds address of each netlink message attributes + * @nla_count: Number of netlink message attributes + * + * Returns error if any of the nla_count attributes is NULL + */ +static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[], + int nla_count) +{ + int i; + for (i = 1; i < nla_count; i++) { + if (!nltb[i]) + return -EINVAL; + } + return 0; +} + +/** + * iwpm_create_nlmsg - Allocate skb and form a netlink message + * @nl_op: Netlink message opcode + * @nlh: Holds address of the netlink message header in skb + * @nl_client: The index of the netlink client + * + * Returns the newly allcated skb, or NULL if the tailroom of the skb + * is insufficient to store the message header and payload + */ +struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, + int nl_client); + +/** + * iwpm_parse_nlmsg - Validate and parse the received netlink message + * @cb: Netlink callback structure + * @policy_max: Maximum attribute type to be expected + * @nlmsg_policy: Validation policy + * @nltb: Array to store policy_max parsed elements + * @msg_type: Type of netlink message + * + * Returns 0 on success or a negative error code + */ +int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, + const struct nla_policy *nlmsg_policy, + struct nlattr *nltb[], const char *msg_type); + +/** + * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port + * @sockaddr: Socket address to print + * @msg: Message to print + */ +void 
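The registration helpers documented above keep one bitmask per netlink client, using the IWPM_REG_* flags defined earlier in this header. A small sketch of how they combine (hypothetical function name, illustration only):

static bool example_client_registered(u8 nl_client)
{
        /* Record that the client completed registration with iwpmd ... */
        iwpm_set_registration(nl_client, IWPM_REG_VALID);

        /* ... and test it later; a non-zero result means the stored
         * registration intersects the queried flags. */
        return iwpm_check_registration(nl_client, IWPM_REG_VALID) != 0;
}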
iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); + +/** + * iwpm_send_hello - Send hello response to iwpmd + * + * @nl_client: The index of the netlink client + * @iwpm_pid: The pid of the user space port mapper + * @abi_version: The kernel's abi_version + * + * Returns 0 on success or a negative error code + */ +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version); +extern u16 iwpm_ulib_version; +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/lag.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/lag.c new file mode 100644 index 0000000..7063e41 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/lag.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + */ + +#include +#include +#include + +static struct sk_buff *rdma_build_skb(struct ib_device *device, + struct net_device *netdev, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct ipv6hdr *ip6h; + struct sk_buff *skb; + struct ethhdr *eth; + struct iphdr *iph; + struct udphdr *uh; + u8 smac[ETH_ALEN]; + bool is_ipv4; + int hdr_len; + + is_ipv4 = ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw); + hdr_len = ETH_HLEN + sizeof(struct udphdr) + LL_RESERVED_SPACE(netdev); + hdr_len += is_ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr); + + skb = alloc_skb(hdr_len, flags); + if (!skb) + return NULL; + + skb->dev = netdev; + skb_reserve(skb, hdr_len); + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + uh->source = + htons(rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label)); + uh->dest = htons(ROCE_V2_UDP_DPORT); + uh->len = htons(sizeof(struct udphdr)); + + if (is_ipv4) { + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + iph->frag_off = 0; + iph->version = 4; + iph->protocol = IPPROTO_UDP; + iph->ihl = 0x5; + iph->tot_len = htons(sizeof(struct udphdr) + sizeof(struct + iphdr)); + memcpy(&iph->saddr, ah_attr->grh.sgid_attr->gid.raw + 12, + sizeof(struct in_addr)); + memcpy(&iph->daddr, ah_attr->grh.dgid.raw + 12, + sizeof(struct in_addr)); + } else { + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + memcpy(&ip6h->flow_lbl, &ah_attr->grh.flow_label, + sizeof(*ip6h->flow_lbl)); + memcpy(&ip6h->saddr, ah_attr->grh.sgid_attr->gid.raw, + sizeof(struct in6_addr)); + memcpy(&ip6h->daddr, ah_attr->grh.dgid.raw, + sizeof(struct in6_addr)); + } + + skb_push(skb, sizeof(struct ethhdr)); + skb_reset_mac_header(skb); + eth = eth_hdr(skb); + skb->protocol = eth->h_proto = htons(is_ipv4 ? 
ETH_P_IP : ETH_P_IPV6); + rdma_read_gid_l2_fields(ah_attr->grh.sgid_attr, NULL, smac); + memcpy(eth->h_source, smac, ETH_ALEN); + memcpy(eth->h_dest, ah_attr->roce.dmac, ETH_ALEN); + + return skb; +} + +static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device, + struct net_device *master, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave; + struct sk_buff *skb; + + skb = rdma_build_skb(device, master, ah_attr, flags); + if (!skb) + return ERR_PTR(-ENOMEM); + + rcu_read_lock(); + slave = netdev_get_xmit_slave(master, skb, + !!(device->lag_flags & + RDMA_LAG_FLAGS_HASH_ALL_SLAVES)); + if (slave) + dev_hold(slave); + rcu_read_unlock(); + kfree_skb(skb); + return slave; +} + +void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave) +{ + if (xmit_slave) + dev_put(xmit_slave); +} + +struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave = NULL; + struct net_device *master; + + if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE && + ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + ah_attr->grh.flow_label)) + return NULL; + + rcu_read_lock(); + master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr); + if (IS_ERR(master)) { + rcu_read_unlock(); + return master; + } + dev_hold(master); + rcu_read_unlock(); + + if (!netif_is_bond_master(master)) + goto put; + + slave = rdma_get_xmit_slave_udp(device, master, ah_attr, flags); +put: + dev_put(master); + return slave; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad.c new file mode 100644 index 0000000..64254ef --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad.c @@ -0,0 +1,4145 @@ +/* + * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * Copyright (c) 2014,2018 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "mad_priv.h" +#include "core_priv.h" +#include "mad_rmpp.h" +#include "smi.h" +#include "opa_smi.h" +#include "agent.h" + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_TRACEPOINTS +static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_qp_info *qp_info, + struct trace_event_raw_ib_mad_send_template *entry) +{ + u16 pkey; + struct ib_device *dev = qp_info->port_priv->device; + u32 pnum = qp_info->port_priv->port_num; + struct ib_ud_wr *wr = &mad_send_wr->send_wr; + struct rdma_ah_attr attr = {}; + + rdma_query_ah(wr->ah, &attr); + + /* These are common */ + entry->sl = attr.sl; + ib_query_pkey(dev, pnum, wr->pkey_index, &pkey); + entry->pkey = pkey; + entry->rqpn = wr->remote_qpn; + entry->rqkey = wr->remote_qkey; + entry->dlid = rdma_ah_get_dlid(&attr); +} +#endif + +static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; +static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; +static int mad_smp_window = IB_MAD_QP_SMP_WINDOW; + +module_param_named(send_queue_size, mad_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); +module_param_named(recv_queue_size, mad_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); +module_param_named(mad_smp_window, mad_smp_window, int, 0444); +MODULE_PARM_DESC(mad_smp_window, "Maximun number of outgoing SMP requests"); + +static DEFINE_XARRAY_ALLOC1(ib_mad_clients); +static u32 ib_mad_client_next; +static struct list_head ib_mad_port_list; + +/* + * Timeout FIFO (tf) param + */ +enum { + /* min time between 2 consecutive activations of tf workqueue */ + MIN_BETWEEN_ACTIVATIONS_MS = 5 +}; + +/* + * SA congestion control params + */ +enum { + SA_CC_DEFAULT_OUTSTANDING_SA_MADS = 16, + SA_CC_MIN_OUTSTANDING_SA_MADS = 1, + SA_CC_MAX_OUTSTANDING_SA_MADS = 1 << 20, + + SA_CC_DEFAULT_MAD_TIME_MS = 20, + SA_CC_MIN_MAD_TIME_MS = 1, + SA_CC_MAX_MAD_TIME_MS = 10000, + + SA_CC_DEFAULT_QUEUE_SIZE = 1 << 16, + SA_CC_MIN_QUEUE_SIZE = 16, + SA_CC_MAX_QUEUE_SIZE = 1 << 20, +}; + +/* Port list lock */ +static DEFINE_SPINLOCK(ib_mad_port_list_lock); + +/* Forward declarations */ +static int method_in_use(struct ib_mad_mgmt_method_table **method, + struct ib_mad_reg_req *mad_reg_req); +static void remove_mad_reg_req(struct ib_mad_agent_private *priv); +static struct ib_mad_agent_private *find_mad_agent( + struct ib_mad_port_private *port_priv, + const struct ib_mad_hdr *mad); +static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, + struct ib_mad_private *mad); +static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv); +static void timeout_sends(struct work_struct *work); +static void local_completions(struct work_struct *work); +static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv, + u8 mgmt_class); +static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv); +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); + +static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, + u32 timeout_ms, u32 retries_left); + +/* + * Timeout FIFO functions - implements FIFO with timeout mechanism + */ + +static void 
activate_timeout_handler_task(struct timer_list *t) +{ + struct to_fifo *tf; + + tf = from_timer(tf, t, timer); + del_timer(&tf->timer); + queue_work(tf->workq, &tf->work); +} + +static unsigned long adjusted_time(unsigned long last, unsigned long next) +{ + unsigned long min_next; + + min_next = last + msecs_to_jiffies(MIN_BETWEEN_ACTIVATIONS_MS); + if (time_after(min_next, next)) + return min_next; + + return next; +} + +static void notify_failure(struct ib_mad_send_wr_private *mad_send_wr, + enum ib_wc_status status) +{ + struct ib_mad_send_wc mad_send_wc; + struct ib_mad_agent_private *mad_agent_priv; + + mad_send_wc.status = status; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv = mad_send_wr->mad_agent_priv; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); +} + +static inline struct sa_cc_data * +get_cc_obj(struct ib_mad_send_wr_private *mad_send_wr) +{ + return &mad_send_wr->mad_agent_priv->qp_info->port_priv->sa_cc; +} + +static inline struct ib_mad_send_wr_private *tfe_to_mad(struct tf_entry *tfe) +{ + return container_of(tfe, struct ib_mad_send_wr_private, tf_list); +} + +static void timeout_handler_task(struct work_struct *work) +{ + struct tf_entry *tmp1, *tmp2; + struct list_head *list_item, exp_lst; + unsigned long flags, curr_time; + int lst_empty; + struct to_fifo *tf; + + tf = container_of(work, struct to_fifo, work); + do { + INIT_LIST_HEAD(&exp_lst); + + spin_lock_irqsave(&tf->lists_lock, flags); + curr_time = jiffies; + list_for_each(list_item, &tf->to_head) { + tmp1 = list_entry(list_item, struct tf_entry, to_list); + if (time_before(curr_time, tmp1->exp_time)) + break; + list_del(&tmp1->fifo_list); + tf->num_items--; + } + + /* cut list up to and including list_item->prev */ + list_cut_position(&exp_lst, &tf->to_head, list_item->prev); + spin_unlock_irqrestore(&tf->lists_lock, flags); + + lst_empty = list_empty(&exp_lst); + list_for_each_entry_safe(tmp1, tmp2, &exp_lst, to_list) { + list_del(&tmp1->to_list); + if (tmp1->canceled) { + tmp1->canceled = 0; + notify_failure(tfe_to_mad(tmp1), + IB_WC_WR_FLUSH_ERR); + } else { + notify_failure(tfe_to_mad(tmp1), + IB_WC_RESP_TIMEOUT_ERR); + } + } + } while (!lst_empty); + + spin_lock_irqsave(&tf->lists_lock, flags); + if (!list_empty(&tf->to_head)) { + tmp1 = list_entry(tf->to_head.next, struct tf_entry, to_list); + mod_timer(&tf->timer, adjusted_time(curr_time, tmp1->exp_time)); + } + spin_unlock_irqrestore(&tf->lists_lock, flags); +} + +/** + * tf_create - creates new timeout-fifo object + * + * Allocate and initialize new timeout-fifo object + */ +static struct to_fifo *tf_create(void) +{ + struct to_fifo *tf; + + tf = kzalloc(sizeof(*tf), GFP_KERNEL); + if (tf) { + tf->workq = create_singlethread_workqueue("to_fifo"); + if (!tf->workq) { + kfree(tf); + return NULL; + } + spin_lock_init(&tf->lists_lock); + INIT_LIST_HEAD(&tf->to_head); + INIT_LIST_HEAD(&tf->fifo_head); + timer_setup(&tf->timer, activate_timeout_handler_task, 0); + INIT_WORK(&tf->work, timeout_handler_task); + tf->timer.expires = jiffies; + tf->stop_enqueue = 0; + tf->num_items = 0; + } + + return tf; +} + +/** + * tf_enqueue - enqueue item to timeout-fifo object + * @tf:timeout-fifo object + * @item: item to enqueue. + * @timeout_ms: item expiration time in ms. + * + * Enqueue item to fifo and modify expiration timer when required. + * + * Returns 0 on success and negative on failure. 
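+ *
+ * (Design note: each tf_entry is linked twice: on @tf->fifo_head in
+ * arrival order, which tf_dequeue() services, and on @tf->to_head in
+ * ascending exp_time order, which the expiry timer walks.  The timer is
+ * re-armed only when the new entry becomes the earliest deadline.)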
+ */ +static int tf_enqueue(struct sa_cc_data *cc_obj, struct tf_entry *item, + u32 timeout_ms) +{ + struct to_fifo *tf = cc_obj->tf; + struct tf_entry *tmp; + struct list_head *list_item; + unsigned long flags; + + item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); + + spin_lock_irqsave(&tf->lists_lock, flags); + if (tf->num_items >= cc_obj->queue_size || tf->stop_enqueue) { + cc_obj->drops++; + spin_unlock_irqrestore(&tf->lists_lock, flags); + return -EBUSY; + } + + /* Insert item to timeout list */ + list_for_each_prev(list_item, &tf->to_head) { + tmp = list_entry(list_item, struct tf_entry, to_list); + if (time_after(item->exp_time, tmp->exp_time)) + break; + } + + list_add(&item->to_list, list_item); + + /* Insert item to fifo list */ + list_add_tail(&item->fifo_list, &tf->fifo_head); + + tf->num_items++; + + /* modify expiration timer if required */ + if (list_item == &tf->to_head) + mod_timer(&tf->timer, item->exp_time); + + spin_unlock_irqrestore(&tf->lists_lock, flags); + + return 0; +} + +/** + * tf_dequeue - dequeue item from timeout-fifo object + * @tf:timeout-fifo object + * @time_left_ms: returns the time left for expiration in ms. + * + * Dequeue item from fifo and modify expiration timer when required. + * + * Returns pointer to tf_entry on success and NULL on failure. + */ +static struct tf_entry *tf_dequeue(struct to_fifo *tf, u32 *time_left_ms) +{ + unsigned long flags; + unsigned long time_left; + struct tf_entry *tmp, *tmp1; + + spin_lock_irqsave(&tf->lists_lock, flags); + if (list_empty(&tf->fifo_head)) { + spin_unlock_irqrestore(&tf->lists_lock, flags); + return NULL; + } + + list_for_each_entry(tmp, &tf->fifo_head, fifo_list) { + if (!tmp->canceled) + break; + } + + if (tmp->canceled) { + spin_unlock_irqrestore(&tf->lists_lock, flags); + return NULL; + } + + /* modify timer in case enqueued item is the next to expire */ + if (tf->to_head.next == &tmp->to_list) { + if (list_is_last(&tmp->to_list, &tf->to_head)) { + del_timer(&tf->timer); + } else { + tmp1 = list_entry(tmp->to_list.next, + struct tf_entry, to_list); + mod_timer(&tf->timer, tmp1->exp_time); + } + } + list_del(&tmp->fifo_list); + list_del(&tmp->to_list); + tf->num_items--; + spin_unlock_irqrestore(&tf->lists_lock, flags); + + time_left = tmp->exp_time - jiffies; + if ((long)time_left <= 0) + time_left = 0; + *time_left_ms = jiffies_to_msecs(time_left); + + return tmp; +} + +static void tf_stop_enqueue(struct to_fifo *tf) +{ + unsigned long flags; + + spin_lock_irqsave(&tf->lists_lock, flags); + tf->stop_enqueue = 1; + spin_unlock_irqrestore(&tf->lists_lock, flags); +} + +/** + * tf_free - free empty timeout-fifo object + * @tf:timeout-fifo object + * + */ +static void tf_free(struct to_fifo *tf) +{ + del_timer_sync(&tf->timer); + flush_workqueue(tf->workq); + destroy_workqueue(tf->workq); + + kfree(tf); +} + +/** + * tf_free_agent - free MADs related to specific MAD agent from timeout-fifo + * @tf:timeout-fifo object + * @mad_agent_priv: MAD agent. 
+ * + */ +static void tf_free_agent(struct to_fifo *tf, + struct ib_mad_agent_private *mad_agent_priv) +{ + unsigned long flags; + struct tf_entry *tmp, *tmp1; + struct list_head tmp_head; + + INIT_LIST_HEAD(&tmp_head); + spin_lock_irqsave(&tf->lists_lock, flags); + list_for_each_entry_safe(tmp, tmp1, &tf->fifo_head, fifo_list) { + if (tfe_to_mad(tmp)->mad_agent_priv == mad_agent_priv) { + list_del(&tmp->to_list); + list_move(&tmp->fifo_list, &tmp_head); + tf->num_items--; + } + } + spin_unlock_irqrestore(&tf->lists_lock, flags); + + list_for_each_entry_safe(tmp, tmp1, &tmp_head, fifo_list) { + list_del(&tmp->fifo_list); + notify_failure(tfe_to_mad(tmp), IB_WC_WR_FLUSH_ERR); + } +} + +/** + * tf_modify_item - to modify expiration time for specific item + * @tf:timeout-fifo object + * @mad_agent_priv: MAD agent. + * @send_buf: the MAD to modify in queue + * @timeout_ms: new timeout to set. + * + * Returns 0 if item found on list and -ENXIO if not. + * + * Note: The send_buf may point on MAD that is already released. + * Therefore we can't use this struct before finding it in the list + */ +static int tf_modify_item(struct to_fifo *tf, + struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_buf *send_buf, u32 timeout_ms) +{ + struct tf_entry *tmp, *item; + struct list_head *list_item; + unsigned long flags; + int found = 0; + + spin_lock_irqsave(&tf->lists_lock, flags); + list_for_each_entry(item, &tf->fifo_head, fifo_list) { + if (tfe_to_mad(item)->mad_agent_priv == mad_agent_priv && + &tfe_to_mad(item)->send_buf == send_buf) { + found = 1; + break; + } + } + + if (!found) { + spin_unlock_irqrestore(&tf->lists_lock, flags); + return -ENXIO; + } + + item->exp_time = jiffies + msecs_to_jiffies(timeout_ms); + + if (timeout_ms) { + list_del(&item->to_list); + list_for_each_prev(list_item, &tf->to_head) { + tmp = list_entry(list_item, struct tf_entry, to_list); + if (time_after(item->exp_time, tmp->exp_time)) + break; + } + list_add(&item->to_list, list_item); + + /* modify expiration timer if required */ + if (list_item == &tf->to_head) + mod_timer(&tf->timer, item->exp_time); + } else { + /* + * when item canceled (timeout_ms == 0) move item to + * head of timeout list and to the tail of fifo list + */ + item->canceled = 1; + list_move(&item->to_list, &tf->to_head); + list_move_tail(&item->fifo_list, &tf->fifo_head); + mod_timer(&tf->timer, item->exp_time); + } + spin_unlock_irqrestore(&tf->lists_lock, flags); + + return 0; +} + +/* + * SA congestion control functions + */ + +/* + * Defines which MAD is under congestion control. + */ +static int is_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_hdr *mad; + + mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad; + + return (mad_send_wr->send_buf.timeout_ms) && + (mad->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) && + ((mad->method == IB_MGMT_METHOD_GET) || + (mad->method == IB_MGMT_METHOD_SET)); +} + +/* + * Notify that SA congestion controlled MAD is done. + * to allow dequeuing SA MAD from congestion control queue. 
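+ *
+ * For example (illustrative numbers only): a MAD dequeued with 580 ms of
+ * queue budget left and time_sa_mad = 20 ms gets time_left_ms = 600 ms;
+ * with a per-send timeout of 200 ms it is re-sent with timeout_ms = 200
+ * and retries = 600 / 200 - 1 = 2.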
+ */ +static void sa_cc_mad_done(struct sa_cc_data *cc_obj) +{ + unsigned long flags; + struct tf_entry *tfe; + struct ib_mad_send_wr_private *mad_send_wr; + u32 time_left_ms, timeout_ms, retries; + int ret; + + do { + spin_lock_irqsave(&cc_obj->lock, flags); + tfe = tf_dequeue(cc_obj->tf, &time_left_ms); + if (!tfe) { + if (cc_obj->outstanding > 0) + cc_obj->outstanding--; + spin_unlock_irqrestore(&cc_obj->lock, flags); + break; + } + spin_unlock_irqrestore(&cc_obj->lock, flags); + mad_send_wr = tfe_to_mad(tfe); + time_left_ms += cc_obj->time_sa_mad; + if (time_left_ms > mad_send_wr->send_buf.timeout_ms) { + retries = time_left_ms / + mad_send_wr->send_buf.timeout_ms - 1; + timeout_ms = mad_send_wr->send_buf.timeout_ms; + } else { + retries = 0; + timeout_ms = time_left_ms; + } + ret = send_sa_cc_mad(mad_send_wr, timeout_ms, retries); + if (ret) { + if (ret == -ENOMEM) + notify_failure(mad_send_wr, IB_WC_GENERAL_ERR); + else + notify_failure(mad_send_wr, + IB_WC_LOC_QP_OP_ERR); + } + } while (ret); +} + +/* + * Send SA MAD under congestion control. + */ +static int sa_cc_mad_send(struct ib_mad_send_wr_private *mad_send_wr) +{ + unsigned long flags; + int ret; + struct sa_cc_data *cc_obj; + + cc_obj = get_cc_obj(mad_send_wr); + spin_lock_irqsave(&cc_obj->lock, flags); + if (cc_obj->outstanding < cc_obj->max_outstanding) { + cc_obj->outstanding++; + spin_unlock_irqrestore(&cc_obj->lock, flags); + ret = send_sa_cc_mad(mad_send_wr, + mad_send_wr->send_buf.timeout_ms, + mad_send_wr->retries_left); + if (ret) + sa_cc_mad_done(cc_obj); + + } else { + int qtime = (mad_send_wr->send_buf.timeout_ms * + (mad_send_wr->retries_left + 1)) + - cc_obj->time_sa_mad; + + if (qtime < 0) + qtime = 0; + ret = tf_enqueue(cc_obj, &mad_send_wr->tf_list, (u32)qtime); + + spin_unlock_irqrestore(&cc_obj->lock, flags); + } + + return ret; +} + +static int init_sa_cc_sysfs(struct ib_device *device); +static void cleanup_sa_cc_sysfs(struct ib_device *device); +static int init_sa_cc_sysfs_ports(struct sa_cc_data *cc_obj); +static void cleanup_sa_cc_sysfs_ports(struct sa_cc_data *cc_obj); + +/* + * Initialize SA congestion control. + */ +static int sa_cc_init(struct sa_cc_data *cc_obj) +{ + int err; + + err = init_sa_cc_sysfs_ports(cc_obj); + if (err) + return err; + spin_lock_init(&cc_obj->lock); + cc_obj->queue_size = SA_CC_DEFAULT_QUEUE_SIZE; + cc_obj->time_sa_mad = SA_CC_DEFAULT_MAD_TIME_MS; + cc_obj->max_outstanding = SA_CC_DEFAULT_OUTSTANDING_SA_MADS; + cc_obj->outstanding = 0; + cc_obj->drops = 0; + cc_obj->tf = tf_create(); + if (!cc_obj->tf) { + err = -ENOMEM; + goto sysfs_cleanup; + } + return 0; + +sysfs_cleanup: + cleanup_sa_cc_sysfs_ports(cc_obj); + return err; +} + +/* + * Cancel SA MADs from congestion control queue. + */ +static void cancel_sa_cc_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + tf_free_agent(mad_agent_priv->qp_info->port_priv->sa_cc.tf, + mad_agent_priv); +} + +/* + * Modify timeout of SA MAD on congestion control queue. 
+ */ +static int modify_sa_cc_mad(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_buf *send_buf, u32 timeout_ms) +{ + struct sa_cc_data *cc_obj = &mad_agent_priv->qp_info->port_priv->sa_cc; + int ret; + int qtime = 0; + + if (timeout_ms > cc_obj->time_sa_mad) + qtime = timeout_ms - cc_obj->time_sa_mad; + + ret = tf_modify_item(cc_obj->tf, + mad_agent_priv, send_buf, (u32)qtime); + return ret; +} + +static void sa_cc_destroy(struct sa_cc_data *cc_obj) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct tf_entry *tfe; + struct ib_mad_send_wc mad_send_wc; + struct ib_mad_agent_private *mad_agent_priv; + u32 time_left_ms; + + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + mad_send_wc.vendor_err = 0; + + cleanup_sa_cc_sysfs_ports(cc_obj); + tf_stop_enqueue(cc_obj->tf); + tfe = tf_dequeue(cc_obj->tf, &time_left_ms); + while (tfe) { + mad_send_wr = tfe_to_mad(tfe); + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_agent_priv = mad_send_wr->mad_agent_priv; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + tfe = tf_dequeue(cc_obj->tf, &time_left_ms); + } + tf_free(cc_obj->tf); +} + +/* + * Returns a ib_mad_port_private structure or NULL for a device/port + * Assumes ib_mad_port_list_lock is being held + */ +static inline struct ib_mad_port_private * +__ib_get_mad_port(struct ib_device *device, u32 port_num) +{ + struct ib_mad_port_private *entry; + + list_for_each_entry(entry, &ib_mad_port_list, port_list) { + if (entry->device == device && entry->port_num == port_num) + return entry; + } + return NULL; +} + +/* + * Wrapper function to return a ib_mad_port_private structure or NULL + * for a device/port + */ +static inline struct ib_mad_port_private * +ib_get_mad_port(struct ib_device *device, u32 port_num) +{ + struct ib_mad_port_private *entry; + unsigned long flags; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + entry = __ib_get_mad_port(device, port_num); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + return entry; +} + +static inline u8 convert_mgmt_class(u8 mgmt_class) +{ + /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */ + return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ? 
+ 0 : mgmt_class; +} + +static int get_spl_qp_index(enum ib_qp_type qp_type) +{ + switch (qp_type) { + case IB_QPT_SMI: + return 0; + case IB_QPT_GSI: + return 1; + default: + return -1; + } +} + +static int vendor_class_index(u8 mgmt_class) +{ + return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START; +} + +static int is_vendor_class(u8 mgmt_class) +{ + if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) || + (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return 0; + return 1; +} + +static int is_vendor_oui(char *oui) +{ + if (oui[0] || oui[1] || oui[2]) + return 1; + return 0; +} + +static int is_vendor_method_in_use( + struct ib_mad_mgmt_vendor_class *vendor_class, + struct ib_mad_reg_req *mad_reg_req) +{ + struct ib_mad_mgmt_method_table *method; + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) { + if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) { + method = vendor_class->method_table[i]; + if (method) { + if (method_in_use(&method, mad_reg_req)) + return 1; + else + break; + } + } + } + return 0; +} + +int ib_response_mad(const struct ib_mad_hdr *hdr) +{ + return ((hdr->method & IB_MGMT_METHOD_RESP) || + (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) || + ((hdr->mgmt_class == IB_MGMT_CLASS_BM) && + (hdr->attr_mod & IB_BM_ATTR_MOD_RESP))); +} +EXPORT_SYMBOL(ib_response_mad); + +/* + * ib_register_mad_agent - Register to send/receive MADs + * + * Context: Process context. + */ +struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, + u32 port_num, + enum ib_qp_type qp_type, + struct ib_mad_reg_req *mad_reg_req, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context, + u32 registration_flags) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_agent *ret = ERR_PTR(-EINVAL); + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_reg_req *reg_req = NULL; + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + struct ib_mad_mgmt_method_table *method; + int ret2, qpn; + u8 mgmt_class, vclass; + + if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) || + (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num))) + return ERR_PTR(-EPROTONOSUPPORT); + + /* Validate parameters */ + qpn = get_spl_qp_index(qp_type); + if (qpn == -1) { + dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n", + __func__, qp_type); + goto error1; + } + + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { + dev_dbg_ratelimited(&device->dev, + "%s: invalid RMPP Version %u\n", + __func__, rmpp_version); + goto error1; + } + + /* Validate MAD registration request if supplied */ + if (mad_reg_req) { + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { + dev_dbg_ratelimited(&device->dev, + "%s: invalid Class Version %u\n", + __func__, + mad_reg_req->mgmt_class_version); + goto error1; + } + if (!recv_handler) { + dev_dbg_ratelimited(&device->dev, + "%s: no recv_handler\n", __func__); + goto error1; + } + if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { + /* + * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only + * one in this range currently allowed + */ + if (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0x%x\n", + __func__, mad_reg_req->mgmt_class); + goto error1; + } + } else if (mad_reg_req->mgmt_class == 0) { + /* + * Class 0 is reserved in IBA and is used for + * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + */ + 
dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0\n", + __func__); + goto error1; + } else if (is_vendor_class(mad_reg_req->mgmt_class)) { + /* + * If class is in "new" vendor range, + * ensure supplied OUI is not zero + */ + if (!is_vendor_oui(mad_reg_req->oui)) { + dev_dbg_ratelimited(&device->dev, + "%s: No OUI specified for class 0x%x\n", + __func__, + mad_reg_req->mgmt_class); + goto error1; + } + } + /* Make sure class supplied is consistent with RMPP */ + if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { + if (rmpp_version) { + dev_dbg_ratelimited(&device->dev, + "%s: RMPP version for non-RMPP class 0x%x\n", + __func__, mad_reg_req->mgmt_class); + goto error1; + } + } + + /* Make sure class supplied is consistent with QP type */ + if (qp_type == IB_QPT_SMI) { + if ((mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_LID_ROUTED) && + (mad_reg_req->mgmt_class != + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_dbg_ratelimited(&device->dev, + "%s: Invalid SM QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); + goto error1; + } + } else { + if ((mad_reg_req->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_reg_req->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_dbg_ratelimited(&device->dev, + "%s: Invalid GS QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); + goto error1; + } + } + } else { + /* No registration request supplied */ + if (!send_handler) + goto error1; + if (registration_flags & IB_MAD_USER_RMPP) + goto error1; + } + + /* Validate device and port */ + port_priv = ib_get_mad_port(device, port_num); + if (!port_priv) { + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %u\n", + __func__, port_num); + ret = ERR_PTR(-ENODEV); + goto error1; + } + + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0. 
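+ * (A RoCE port, for instance, never gets a QP0 at port-open time, so
+ * qp_info[0].qp stays NULL and an SMI registration on it is refused here.)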
+ */ + if (!port_priv->qp_info[qpn].qp) { + dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n", + __func__, qpn); + ret = ERR_PTR(-EPROTONOSUPPORT); + goto error1; + } + + /* Allocate structures */ + mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); + if (!mad_agent_priv) { + ret = ERR_PTR(-ENOMEM); + goto error1; + } + + if (mad_reg_req) { + reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); + if (!reg_req) { + ret = ERR_PTR(-ENOMEM); + goto error3; + } + } + + /* Now, fill in the various structures */ + mad_agent_priv->qp_info = &port_priv->qp_info[qpn]; + mad_agent_priv->reg_req = reg_req; + mad_agent_priv->agent.rmpp_version = rmpp_version; + mad_agent_priv->agent.device = device; + mad_agent_priv->agent.recv_handler = recv_handler; + mad_agent_priv->agent.send_handler = send_handler; + mad_agent_priv->agent.context = context; + mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; + mad_agent_priv->agent.port_num = port_num; + mad_agent_priv->agent.flags = registration_flags; + spin_lock_init(&mad_agent_priv->lock); + INIT_LIST_HEAD(&mad_agent_priv->send_list); + INIT_LIST_HEAD(&mad_agent_priv->wait_list); + INIT_LIST_HEAD(&mad_agent_priv->done_list); + INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); + INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); + INIT_LIST_HEAD(&mad_agent_priv->local_list); + INIT_WORK(&mad_agent_priv->local_work, local_completions); + refcount_set(&mad_agent_priv->refcount, 1); + init_completion(&mad_agent_priv->comp); + + ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type); + if (ret2) { + ret = ERR_PTR(ret2); + goto error4; + } + + /* + * The mlx4 driver uses the top byte to distinguish which virtual + * function generated the MAD, so we must avoid using it. + */ + ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid, + mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1), + &ib_mad_client_next, GFP_KERNEL); + if (ret2 < 0) { + ret = ERR_PTR(ret2); + goto error5; + } + + /* + * Make sure MAD registration (if supplied) + * is non overlapping with any existing ones + */ + spin_lock_irq(&port_priv->reg_lock); + if (mad_reg_req) { + mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class); + if (!is_vendor_class(mgmt_class)) { + class = port_priv->version[mad_reg_req-> + mgmt_class_version].class; + if (class) { + method = class->method_table[mgmt_class]; + if (method) { + if (method_in_use(&method, + mad_reg_req)) + goto error6; + } + } + ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv, + mgmt_class); + } else { + /* "New" vendor class range */ + vendor = port_priv->version[mad_reg_req-> + mgmt_class_version].vendor; + if (vendor) { + vclass = vendor_class_index(mgmt_class); + vendor_class = vendor->vendor_class[vclass]; + if (vendor_class) { + if (is_vendor_method_in_use( + vendor_class, + mad_reg_req)) + goto error6; + } + } + ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); + } + if (ret2) { + ret = ERR_PTR(ret2); + goto error6; + } + } + spin_unlock_irq(&port_priv->reg_lock); + + trace_ib_mad_create_agent(mad_agent_priv); + return &mad_agent_priv->agent; +error6: + spin_unlock_irq(&port_priv->reg_lock); + xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); +error5: + ib_mad_agent_security_cleanup(&mad_agent_priv->agent); +error4: + kfree(reg_req); +error3: + kfree(mad_agent_priv); +error1: + return ret; +} +EXPORT_SYMBOL(ib_register_mad_agent); + +static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) +{ + if 
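Since ib_register_mad_agent() is the main entry point exported above, a brief caller-side sketch may help: it registers a GSI agent for the performance-management class. The handler and function names are hypothetical placeholders, not part of this patch:

static void example_send_handler(struct ib_mad_agent *agent,
                                 struct ib_mad_send_wc *send_wc)
{
        ib_free_send_mad(send_wc->send_buf);
}

static void example_recv_handler(struct ib_mad_agent *agent,
                                 struct ib_mad_send_buf *send_buf,
                                 struct ib_mad_recv_wc *recv_wc)
{
        ib_free_recv_mad(recv_wc);
}

static struct ib_mad_agent *example_register(struct ib_device *dev, u32 port)
{
        struct ib_mad_reg_req req = {};

        req.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
        req.mgmt_class_version = 1;
        set_bit(IB_MGMT_METHOD_GET, req.method_mask);

        /* GSI agent, no RMPP, no special registration flags */
        return ib_register_mad_agent(dev, port, IB_QPT_GSI, &req, 0,
                                     example_send_handler,
                                     example_recv_handler, NULL, 0);
}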
(refcount_dec_and_test(&mad_agent_priv->refcount)) + complete(&mad_agent_priv->comp); +} + +static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_port_private *port_priv; + + /* Note that we could still be handling received MADs */ + trace_ib_mad_unregister_agent(mad_agent_priv); + + /* + * Canceling all sends results in dropping received response + * MADs, preventing us from queuing additional work + */ + cancel_mads(mad_agent_priv); + port_priv = mad_agent_priv->qp_info->port_priv; + cancel_delayed_work(&mad_agent_priv->timed_work); + + spin_lock_irq(&port_priv->reg_lock); + remove_mad_reg_req(mad_agent_priv); + spin_unlock_irq(&port_priv->reg_lock); + xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); + + flush_workqueue(port_priv->wq); + + deref_mad_agent(mad_agent_priv); + wait_for_completion(&mad_agent_priv->comp); + ib_cancel_rmpp_recvs(mad_agent_priv); + + ib_mad_agent_security_cleanup(&mad_agent_priv->agent); + + kfree(mad_agent_priv->reg_req); + kfree_rcu(mad_agent_priv, rcu); +} + +static inline struct smp_window * +get_smp_window_obj(struct ib_mad_send_wr_private *mad_send_wr) +{ + return &mad_send_wr->mad_agent_priv->qp_info->port_priv->smp_window; +} + +static void smp_mad_done(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_qp_info *qp_info; + struct smp_window *smp = get_smp_window_obj(mad_send_wr); + unsigned long flags; + struct ib_mad_list_head *mad_list; + struct ib_mad_send_wr_private *queued_send_wr; + struct ib_mad_agent *mad_agent; + struct list_head *list; + struct ib_mad_send_wc mad_send_wc = { }; + int err; + + qp_info = mad_send_wr->mad_agent_priv->qp_info; + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + + if (smp->outstanding) + smp->outstanding--; + +retry: + if (list_empty(&smp->overflow_list)) { + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); + return; + } + + mad_list = container_of(smp->overflow_list.next, + struct ib_mad_list_head, list); + queued_send_wr = container_of(mad_list, + struct ib_mad_send_wr_private, + mad_list); + list_del(&mad_list->list); + mad_agent = queued_send_wr->send_buf.mad_agent; + + if (qp_info->send_queue.count < qp_info->send_queue.max_active) { + err = ib_post_send(mad_agent->qp, &queued_send_wr->send_wr.wr, + NULL); + list = &qp_info->send_queue.list; + } else { + err = 0; + list = &qp_info->overflow_list; + } + + if (!err) { + /* + * We count SMP MADs on the QP's overflow list as MADs that + * were sent to the wire + */ + smp->outstanding++; + qp_info->send_queue.count++; + list_add_tail(&queued_send_wr->mad_list.list, list); + } + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); + if (err) { + dev_err(&qp_info->port_priv->device->dev, + "ib_post_send failed: %d\n", err); + /* Complete the failed mad */ + ib_dma_unmap_single(mad_agent->device, + queued_send_wr->header_mapping, + queued_send_wr->sg_list[0].length, + DMA_TO_DEVICE); + ib_dma_unmap_single(mad_agent->device, + queued_send_wr->payload_mapping, + queued_send_wr->sg_list[1].length, + DMA_TO_DEVICE); + mad_send_wc.send_buf = &queued_send_wr->send_buf; + mad_send_wc.status = IB_WC_LOC_QP_OP_ERR; + ib_mad_complete_send_wr(queued_send_wr, &mad_send_wc); + /* + * Try to send the next mad + * When working with limited SMP MAD window, we must send a new + * MAD to the wire for each MAD done + */ + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + goto retry; + } +} + +/* + * ib_unregister_mad_agent - Unregisters a client from using MAD services + * + * Context: Process 
context. + */ +void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) +{ + struct ib_mad_agent_private *mad_agent_priv; + + mad_agent_priv = container_of(mad_agent, + struct ib_mad_agent_private, + agent); + unregister_mad_agent(mad_agent_priv); +} +EXPORT_SYMBOL(ib_unregister_mad_agent); + +static void dequeue_mad(struct ib_mad_list_head *mad_list) +{ + struct ib_mad_queue *mad_queue; + unsigned long flags; + + mad_queue = mad_list->mad_queue; + spin_lock_irqsave(&mad_queue->lock, flags); + list_del(&mad_list->list); + mad_queue->count--; + spin_unlock_irqrestore(&mad_queue->lock, flags); +} + +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u32 port_num, struct ib_wc *wc) +{ + memset(wc, 0, sizeof *wc); + wc->wr_cqe = cqe; + wc->status = IB_WC_SUCCESS; + wc->opcode = IB_WC_RECV; + wc->pkey_index = pkey_index; + wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh); + wc->src_qp = IB_QP0; + wc->qp = qp; + wc->slid = slid; + wc->sl = 0; + wc->dlid_path_bits = 0; + wc->port_num = port_num; +} + +static size_t mad_priv_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_mad_private) + mp->mad_size; +} + +static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags) +{ + size_t size = sizeof(struct ib_mad_private) + mad_size; + struct ib_mad_private *ret = kzalloc(size, flags); + + if (ret) + ret->mad_size = mad_size; + + return ret; +} + +static size_t port_mad_size(const struct ib_mad_port_private *port_priv) +{ + return rdma_max_mad_size(port_priv->device, port_priv->port_num); +} + +static size_t mad_priv_dma_size(const struct ib_mad_private *mp) +{ + return sizeof(struct ib_grh) + mp->mad_size; +} + +static int is_smp_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_hdr *mad; + + mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad; + + return (mad_send_wr->send_buf.timeout_ms) && + ((mad->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) && + ((mad->method == IB_MGMT_METHOD_GET) || + (mad->method == IB_MGMT_METHOD_SET)); +} + +static void smp_window_init(struct smp_window *smp) +{ + smp->max_outstanding = mad_smp_window; + smp->outstanding = 0; + INIT_LIST_HEAD(&smp->overflow_list); +} + +/* + * Return 0 if SMP is to be sent + * Return 1 if SMP was consumed locally (whether or not solicited) + * Return < 0 if error + */ +static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_wr_private *mad_send_wr) +{ + int ret = 0; + struct ib_smp *smp = mad_send_wr->send_buf.mad; + struct opa_smp *opa_smp = (struct opa_smp *)smp; + unsigned long flags; + struct ib_mad_local_private *local; + struct ib_mad_private *mad_priv; + struct ib_mad_port_private *port_priv; + struct ib_mad_agent_private *recv_mad_agent = NULL; + struct ib_device *device = mad_agent_priv->agent.device; + u32 port_num; + struct ib_wc mad_wc; + struct ib_ud_wr *send_wr = &mad_send_wr->send_wr; + size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv); + u16 out_mad_pkey_index = 0; + u16 drslid; + bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + + if (rdma_cap_ib_switch(device) && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + port_num = send_wr->port_num; + else + port_num = mad_agent_priv->agent.port_num; + + /* + * Directed route handling starts if the initial LID routed part of + * a request or the ending LID routed part of a 
response is empty. + * If we are at the start of the LID routed part, don't update the + * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. + */ + if (opa && smp->class_version == OPA_SM_CLASS_VERSION) { + u32 opa_drslid; + + trace_ib_mad_handle_out_opa_smi(opa_smp); + + if ((opa_get_smp_direction(opa_smp) + ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == + OPA_LID_PERMISSIVE && + opa_smi_handle_dr_smp_send(opa_smp, + rdma_cap_ib_switch(device), + port_num) == IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid directed route\n"); + goto out; + } + opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid); + if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) && + opa_drslid & 0xffff0000) { + ret = -EINVAL; + dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n", + opa_drslid); + goto out; + } + drslid = (u16)(opa_drslid & 0x0000ffff); + + /* Check to post send on QP or process locally */ + if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD && + opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) + goto out; + } else { + trace_ib_mad_handle_out_ib_smi(smp); + + if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == + IB_LID_PERMISSIVE && + smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == + IB_SMI_DISCARD) { + ret = -EINVAL; + dev_err(&device->dev, "Invalid directed route\n"); + goto out; + } + drslid = be16_to_cpu(smp->dr_slid); + + /* Check to post send on QP or process locally */ + if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD && + smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD) + goto out; + } + + local = kmalloc(sizeof *local, GFP_ATOMIC); + if (!local) { + ret = -ENOMEM; + goto out; + } + local->mad_priv = NULL; + local->recv_mad_agent = NULL; + mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC); + if (!mad_priv) { + ret = -ENOMEM; + kfree(local); + goto out; + } + + build_smp_wc(mad_agent_priv->agent.qp, + send_wr->wr.wr_cqe, drslid, + send_wr->pkey_index, + send_wr->port_num, &mad_wc); + + if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) { + mad_wc.byte_len = mad_send_wr->send_buf.hdr_len + + mad_send_wr->send_buf.data_len + + sizeof(struct ib_grh); + } + + /* No GRH for DR SMP */ + ret = device->ops.process_mad(device, 0, port_num, &mad_wc, NULL, + (const struct ib_mad *)smp, + (struct ib_mad *)mad_priv->mad, &mad_size, + &out_mad_pkey_index); + switch (ret) { + case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY: + if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) && + mad_agent_priv->agent.recv_handler) { + local->mad_priv = mad_priv; + local->recv_mad_agent = mad_agent_priv; + /* + * Reference MAD agent until receive + * side of local completion handled + */ + refcount_inc(&mad_agent_priv->refcount); + } else + kfree(mad_priv); + break; + case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED: + kfree(mad_priv); + break; + case IB_MAD_RESULT_SUCCESS: + /* Treat like an incoming receive MAD */ + port_priv = ib_get_mad_port(mad_agent_priv->agent.device, + mad_agent_priv->agent.port_num); + if (port_priv) { + memcpy(mad_priv->mad, smp, mad_priv->mad_size); + recv_mad_agent = find_mad_agent(port_priv, + (const struct ib_mad_hdr *)mad_priv->mad); + } + if (!port_priv || !recv_mad_agent) { + /* + * No receiving agent so drop packet and + * generate send completion. 
+ */ + kfree(mad_priv); + break; + } + local->mad_priv = mad_priv; + local->recv_mad_agent = recv_mad_agent; + break; + default: + kfree(mad_priv); + kfree(local); + ret = -EINVAL; + goto out; + } + + local->mad_send_wr = mad_send_wr; + if (opa) { + local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index; + local->return_wc_byte_len = mad_size; + } + /* Reference MAD agent until send side of local completion handled */ + refcount_inc(&mad_agent_priv->refcount); + /* Queue local completion to local list */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&local->completion_list, &mad_agent_priv->local_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + queue_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->local_work); + ret = 1; +out: + return ret; +} + +static int get_pad_size(int hdr_len, int data_len, size_t mad_size) +{ + int seg_size, pad; + + seg_size = mad_size - hdr_len; + if (data_len && seg_size) { + pad = seg_size - data_len % seg_size; + return pad == seg_size ? 0 : pad; + } else + return seg_size; +} + +static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_segment *s, *t; + + list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) { + list_del(&s->list); + kfree(s); + } +} + +static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, + size_t mad_size, gfp_t gfp_mask) +{ + struct ib_mad_send_buf *send_buf = &send_wr->send_buf; + struct ib_rmpp_mad *rmpp_mad = send_buf->mad; + struct ib_rmpp_segment *seg = NULL; + int left, seg_size, pad; + + send_buf->seg_size = mad_size - send_buf->hdr_len; + send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR; + seg_size = send_buf->seg_size; + pad = send_wr->pad; + + /* Allocate data segments. 
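+ * For example, an SA MAD (hdr_len = IB_MGMT_SA_HDR = 56) inside a 256-byte
+ * ib_mad leaves 200-byte segments; 450 bytes of payload then need
+ * pad = 200 - (450 % 200) = 150, and the loop below allocates three
+ * segments.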
*/ + for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { + seg = kmalloc(sizeof(*seg) + seg_size, gfp_mask); + if (!seg) { + free_send_rmpp_list(send_wr); + return -ENOMEM; + } + seg->num = ++send_buf->seg_count; + list_add_tail(&seg->list, &send_wr->rmpp_list); + } + + /* Zero any padding */ + if (pad) + memset(seg->data + seg_size - pad, 0, pad); + + rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv-> + agent.rmpp_version; + rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + + send_wr->cur_seg = container_of(send_wr->rmpp_list.next, + struct ib_rmpp_segment, list); + send_wr->last_ack_seg = send_wr->cur_seg; + return 0; +} + +int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent) +{ + return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); +} +EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); + +struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, int hdr_len, + int data_len, gfp_t gfp_mask, + u8 base_version) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + int pad, message_size, ret, size; + void *buf; + size_t mad_size; + bool opa; + + mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, + agent); + + opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num); + + if (opa && base_version == OPA_MGMT_BASE_VERSION) + mad_size = sizeof(struct opa_mad); + else + mad_size = sizeof(struct ib_mad); + + pad = get_pad_size(hdr_len, data_len, mad_size); + message_size = hdr_len + data_len + pad; + + if (ib_mad_kernel_rmpp_agent(mad_agent)) { + if (!rmpp_active && message_size > mad_size) + return ERR_PTR(-EINVAL); + } else + if (rmpp_active || message_size > mad_size) + return ERR_PTR(-EINVAL); + + size = rmpp_active ? 
hdr_len : mad_size; + buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); + if (!buf) + return ERR_PTR(-ENOMEM); + + mad_send_wr = buf + size; + INIT_LIST_HEAD(&mad_send_wr->rmpp_list); + mad_send_wr->send_buf.mad = buf; + mad_send_wr->send_buf.hdr_len = hdr_len; + mad_send_wr->send_buf.data_len = data_len; + mad_send_wr->pad = pad; + + mad_send_wr->mad_agent_priv = mad_agent_priv; + mad_send_wr->sg_list[0].length = hdr_len; + mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey; + + /* OPA MADs don't have to be the full 2048 bytes */ + if (opa && base_version == OPA_MGMT_BASE_VERSION && + data_len < mad_size - hdr_len) + mad_send_wr->sg_list[1].length = data_len; + else + mad_send_wr->sg_list[1].length = mad_size - hdr_len; + + mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; + + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; + mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; + mad_send_wr->send_wr.wr.num_sge = 2; + mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; + mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED; + mad_send_wr->send_wr.remote_qpn = remote_qpn; + mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY; + mad_send_wr->send_wr.pkey_index = pkey_index; + + if (rmpp_active) { + ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask); + if (ret) { + kfree(buf); + return ERR_PTR(ret); + } + } + + mad_send_wr->send_buf.mad_agent = mad_agent; + refcount_inc(&mad_agent_priv->refcount); + return &mad_send_wr->send_buf; +} +EXPORT_SYMBOL(ib_create_send_mad); + +int ib_get_mad_data_offset(u8 mgmt_class) +{ + if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) + return IB_MGMT_SA_HDR; + else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || + (mgmt_class == IB_MGMT_CLASS_BIS)) + return IB_MGMT_DEVICE_HDR; + else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && + (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return IB_MGMT_VENDOR_HDR; + else + return IB_MGMT_MAD_HDR; +} +EXPORT_SYMBOL(ib_get_mad_data_offset); + +int ib_is_mad_class_rmpp(u8 mgmt_class) +{ + if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) || + (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) || + (mgmt_class == IB_MGMT_CLASS_BIS) || + ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && + (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))) + return 1; + return 0; +} +EXPORT_SYMBOL(ib_is_mad_class_rmpp); + +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct list_head *list; + + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + list = &mad_send_wr->cur_seg->list; + + if (mad_send_wr->cur_seg->num < seg_num) { + list_for_each_entry(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } else if (mad_send_wr->cur_seg->num > seg_num) { + list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } + return mad_send_wr->cur_seg->data; +} +EXPORT_SYMBOL(ib_get_rmpp_segment); + +static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr) +{ + if (mad_send_wr->send_buf.seg_count) + return ib_get_rmpp_segment(&mad_send_wr->send_buf, + mad_send_wr->seg_num); + else + return mad_send_wr->send_buf.mad + + mad_send_wr->send_buf.hdr_len; +} + +void ib_free_send_mad(struct ib_mad_send_buf *send_buf) +{ + struct ib_mad_agent_private 
*mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + + mad_agent_priv = container_of(send_buf->mad_agent, + struct ib_mad_agent_private, agent); + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + + free_send_rmpp_list(mad_send_wr); + kfree(send_buf->mad); + deref_mad_agent(mad_agent_priv); +} +EXPORT_SYMBOL(ib_free_send_mad); + +int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_qp_info *qp_info; + struct smp_window *smp = NULL; + struct list_head *list; + struct ib_mad_agent *mad_agent; + struct ib_sge *sge; + unsigned long flags; + int ret; + + /* Set WR ID to find mad_send_wr upon completion */ + qp_info = mad_send_wr->mad_agent_priv->qp_info; + mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; + + mad_agent = mad_send_wr->send_buf.mad_agent; + sge = mad_send_wr->sg_list; + sge[0].addr = ib_dma_map_single(mad_agent->device, + mad_send_wr->send_buf.mad, + sge[0].length, + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) + return -ENOMEM; + + mad_send_wr->header_mapping = sge[0].addr; + + sge[1].addr = ib_dma_map_single(mad_agent->device, + ib_get_payload(mad_send_wr), + sge[1].length, + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; + } + mad_send_wr->payload_mapping = sge[1].addr; + + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + if (is_smp_mad(mad_send_wr)) { + mad_send_wr->is_smp_mad = 1; + smp = get_smp_window_obj(mad_send_wr); + if (smp->outstanding >= smp->max_outstanding) { + list_add_tail(&mad_send_wr->mad_list.list, + &smp->overflow_list); + ret = 0; + goto unlock_and_exit; + } else { + smp->outstanding++; + } + } + if (qp_info->send_queue.count < qp_info->send_queue.max_active) { + trace_ib_mad_ib_send_mad(mad_send_wr, qp_info); + ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, + NULL); + list = &qp_info->send_queue.list; + } else { + ret = 0; + list = &qp_info->overflow_list; + } + + if (!ret) { + qp_info->send_queue.count++; + list_add_tail(&mad_send_wr->mad_list.list, list); + } +unlock_and_exit: + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); + if (ret) { + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->payload_mapping, + sge[1].length, DMA_TO_DEVICE); + } + return ret; +} + +/* + * Send SA MAD that passed congestion control + */ +static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, + u32 timeout_ms, u32 retries_left) +{ + int ret; + unsigned long flags; + struct ib_mad_agent_private *mad_agent_priv; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + mad_send_wr->retries_left = retries_left; + mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); + + /* Reference MAD agent until send completes */ + refcount_inc(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + if (mad_agent_priv->send_list_closed) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + deref_mad_agent(mad_agent_priv); + return -EIO; + } + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + 
ret = ib_send_mad(mad_send_wr); + if (ret < 0) { + /* Fail send request */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_del(&mad_send_wr->agent_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + deref_mad_agent(mad_agent_priv); + } + + return ret; +} + +/* + * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated + * with the registered client + */ +int ib_post_send_mad(struct ib_mad_send_buf *send_buf, + struct ib_mad_send_buf **bad_send_buf) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_buf *next_send_buf; + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long flags; + int ret = -EINVAL; + + /* Walk list of send WRs and post each on send list */ + for (; send_buf; send_buf = next_send_buf) { + mad_send_wr = container_of(send_buf, + struct ib_mad_send_wr_private, + send_buf); + mad_agent_priv = mad_send_wr->mad_agent_priv; + + ret = ib_mad_enforce_security(mad_agent_priv, + mad_send_wr->send_wr.pkey_index); + if (ret) + goto error; + + if (!send_buf->mad_agent->send_handler || + (send_buf->timeout_ms && + !send_buf->mad_agent->recv_handler)) { + ret = -EINVAL; + goto error; + } + + if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) { + if (mad_agent_priv->agent.rmpp_version) { + ret = -EINVAL; + goto error; + } + } + + /* + * Save pointer to next work request to post in case the + * current one completes, and the user modifies the work + * request associated with the completion + */ + next_send_buf = send_buf->next; + mad_send_wr->send_wr.ah = send_buf->ah; + + if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + ret = handle_outgoing_dr_smp(mad_agent_priv, + mad_send_wr); + if (ret < 0) /* error */ + goto error; + else if (ret == 1) /* locally consumed */ + continue; + } + + mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid; + /* Timeout will be updated after send completes */ + mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms); + mad_send_wr->max_retries = send_buf->retries; + mad_send_wr->retries_left = send_buf->retries; + send_buf->retries = 0; + /* Reference for work request to QP + response */ + mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0); + mad_send_wr->status = IB_WC_SUCCESS; + + if (is_sa_cc_mad(mad_send_wr)) { + mad_send_wr->is_sa_cc_mad = 1; + ret = sa_cc_mad_send(mad_send_wr); + if (ret < 0) + goto error; + } else { + /* Reference MAD agent until send completes */ + refcount_inc(&mad_agent_priv->refcount); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + ret = ib_send_rmpp_mad(mad_send_wr); + if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) + ret = ib_send_mad(mad_send_wr); + } else { + ret = ib_send_mad(mad_send_wr); + } + if (ret < 0) { + /* Fail send request */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_del(&mad_send_wr->agent_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, + flags); + deref_mad_agent(mad_agent_priv); + goto error; + } + } + } + return 0; +error: + if (bad_send_buf) + *bad_send_buf = send_buf; + return ret; +} +EXPORT_SYMBOL(ib_post_send_mad); + +/* + * ib_free_recv_mad - Returns data buffers used to receive + * a MAD to the access layer + */ +void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf; + struct 
ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *priv; + struct list_head free_list; + + INIT_LIST_HEAD(&free_list); + list_splice_init(&mad_recv_wc->rmpp_list, &free_list); + + list_for_each_entry_safe(mad_recv_buf, temp_recv_buf, + &free_list, list) { + mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc, + recv_buf); + mad_priv_hdr = container_of(mad_recv_wc, + struct ib_mad_private_header, + recv_wc); + priv = container_of(mad_priv_hdr, struct ib_mad_private, + header); + kfree(priv); + } +} +EXPORT_SYMBOL(ib_free_recv_mad); + +static int method_in_use(struct ib_mad_mgmt_method_table **method, + struct ib_mad_reg_req *mad_reg_req) +{ + int i; + + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { + if ((*method)->agent[i]) { + pr_err("Method %d already in use\n", i); + return -EINVAL; + } + } + return 0; +} + +static int allocate_method_table(struct ib_mad_mgmt_method_table **method) +{ + /* Allocate management method table */ + *method = kzalloc(sizeof **method, GFP_ATOMIC); + return (*method) ? 0 : (-ENOMEM); +} + +/* + * Check to see if there are any methods still in use + */ +static int check_method_table(struct ib_mad_mgmt_method_table *method) +{ + int i; + + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) + if (method->agent[i]) + return 1; + return 0; +} + +/* + * Check to see if there are any method tables for this class still in use + */ +static int check_class_table(struct ib_mad_mgmt_class_table *class) +{ + int i; + + for (i = 0; i < MAX_MGMT_CLASS; i++) + if (class->method_table[i]) + return 1; + return 0; +} + +static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class) +{ + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) + if (vendor_class->method_table[i]) + return 1; + return 0; +} + +static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class, + const char *oui) +{ + int i; + + for (i = 0; i < MAX_MGMT_OUI; i++) + /* Is there matching OUI for this vendor class ? 
*/ + if (!memcmp(vendor_class->oui[i], oui, 3)) + return i; + + return -1; +} + +static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor) +{ + int i; + + for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++) + if (vendor->vendor_class[i]) + return 1; + + return 0; +} + +static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method, + struct ib_mad_agent_private *agent) +{ + int i; + + /* Remove any methods for this mad agent */ + for (i = 0; i < IB_MGMT_MAX_METHODS; i++) + if (method->agent[i] == agent) + method->agent[i] = NULL; +} + +static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv, + u8 mgmt_class) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_class_table **class; + struct ib_mad_mgmt_method_table **method; + int i, ret; + + port_priv = agent_priv->qp_info->port_priv; + class = &port_priv->version[mad_reg_req->mgmt_class_version].class; + if (!*class) { + /* Allocate management class table for "new" class version */ + *class = kzalloc(sizeof **class, GFP_ATOMIC); + if (!*class) { + ret = -ENOMEM; + goto error1; + } + + /* Allocate method table for this management class */ + method = &(*class)->method_table[mgmt_class]; + if ((ret = allocate_method_table(method))) + goto error2; + } else { + method = &(*class)->method_table[mgmt_class]; + if (!*method) { + /* Allocate method table for this management class */ + if ((ret = allocate_method_table(method))) + goto error1; + } + } + + /* Now, make sure methods are not already in use */ + if (method_in_use(method, mad_reg_req)) + goto error3; + + /* Finally, add in methods being registered */ + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + (*method)->agent[i] = agent_priv; + + return 0; + +error3: + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(*method, agent_priv); + /* Now, check to see if there are any methods in use */ + if (!check_method_table(*method)) { + /* If not, release management method table */ + kfree(*method); + *method = NULL; + } + ret = -EINVAL; + goto error1; +error2: + kfree(*class); + *class = NULL; +error1: + return ret; +} + +static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, + struct ib_mad_agent_private *agent_priv) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_vendor_class_table **vendor_table; + struct ib_mad_mgmt_vendor_class_table *vendor = NULL; + struct ib_mad_mgmt_vendor_class *vendor_class = NULL; + struct ib_mad_mgmt_method_table **method; + int i, ret = -ENOMEM; + u8 vclass; + + /* "New" vendor (with OUI) class */ + vclass = vendor_class_index(mad_reg_req->mgmt_class); + port_priv = agent_priv->qp_info->port_priv; + vendor_table = &port_priv->version[ + mad_reg_req->mgmt_class_version].vendor; + if (!*vendor_table) { + /* Allocate mgmt vendor class table for "new" class version */ + vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); + if (!vendor) + goto error1; + + *vendor_table = vendor; + } + if (!(*vendor_table)->vendor_class[vclass]) { + /* Allocate table for this management vendor class */ + vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); + if (!vendor_class) + goto error2; + + (*vendor_table)->vendor_class[vclass] = vendor_class; + } + for (i = 0; i < MAX_MGMT_OUI; i++) { + /* Is there matching OUI for this vendor class ? 
*/ + if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i], + mad_reg_req->oui, 3)) { + method = &(*vendor_table)->vendor_class[ + vclass]->method_table[i]; + if (!*method) + goto error3; + goto check_in_use; + } + } + for (i = 0; i < MAX_MGMT_OUI; i++) { + /* OUI slot available ? */ + if (!is_vendor_oui((*vendor_table)->vendor_class[ + vclass]->oui[i])) { + method = &(*vendor_table)->vendor_class[ + vclass]->method_table[i]; + /* Allocate method table for this OUI */ + if (!*method) { + ret = allocate_method_table(method); + if (ret) + goto error3; + } + memcpy((*vendor_table)->vendor_class[vclass]->oui[i], + mad_reg_req->oui, 3); + goto check_in_use; + } + } + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); + goto error3; + +check_in_use: + /* Now, make sure methods are not already in use */ + if (method_in_use(method, mad_reg_req)) + goto error4; + + /* Finally, add in methods being registered */ + for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + (*method)->agent[i] = agent_priv; + + return 0; + +error4: + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(*method, agent_priv); + /* Now, check to see if there are any methods in use */ + if (!check_method_table(*method)) { + /* If not, release management method table */ + kfree(*method); + *method = NULL; + } + ret = -EINVAL; +error3: + if (vendor_class) { + (*vendor_table)->vendor_class[vclass] = NULL; + kfree(vendor_class); + } +error2: + if (vendor) { + *vendor_table = NULL; + kfree(vendor); + } +error1: + return ret; +} + +static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) +{ + struct ib_mad_port_private *port_priv; + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_method_table *method; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + int index; + u8 mgmt_class; + + /* + * Was MAD registration request supplied + * with original registration ? + */ + if (!agent_priv->reg_req) + goto out; + + port_priv = agent_priv->qp_info->port_priv; + mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class); + class = port_priv->version[ + agent_priv->reg_req->mgmt_class_version].class; + if (!class) + goto vendor_check; + + method = class->method_table[mgmt_class]; + if (method) { + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(method, agent_priv); + /* Now, check to see if there are any methods still in use */ + if (!check_method_table(method)) { + /* If not, release management method table */ + kfree(method); + class->method_table[mgmt_class] = NULL; + /* Any management classes left ? 
*/ + if (!check_class_table(class)) { + /* If not, release management class table */ + kfree(class); + port_priv->version[ + agent_priv->reg_req-> + mgmt_class_version].class = NULL; + } + } + } + +vendor_check: + if (!is_vendor_class(mgmt_class)) + goto out; + + /* normalize mgmt_class to vendor range 2 */ + mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class); + vendor = port_priv->version[ + agent_priv->reg_req->mgmt_class_version].vendor; + + if (!vendor) + goto out; + + vendor_class = vendor->vendor_class[mgmt_class]; + if (vendor_class) { + index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui); + if (index < 0) + goto out; + method = vendor_class->method_table[index]; + if (method) { + /* Remove any methods for this mad agent */ + remove_methods_mad_agent(method, agent_priv); + /* + * Now, check to see if there are + * any methods still in use + */ + if (!check_method_table(method)) { + /* If not, release management method table */ + kfree(method); + vendor_class->method_table[index] = NULL; + memset(vendor_class->oui[index], 0, 3); + /* Any OUIs left ? */ + if (!check_vendor_class(vendor_class)) { + /* If not, release vendor class table */ + kfree(vendor_class); + vendor->vendor_class[mgmt_class] = NULL; + /* Any other vendor classes left ? */ + if (!check_vendor_table(vendor)) { + kfree(vendor); + port_priv->version[ + agent_priv->reg_req-> + mgmt_class_version]. + vendor = NULL; + } + } + } + } + } + +out: + return; +} + +static struct ib_mad_agent_private * +find_mad_agent(struct ib_mad_port_private *port_priv, + const struct ib_mad_hdr *mad_hdr) +{ + struct ib_mad_agent_private *mad_agent = NULL; + unsigned long flags; + + if (ib_response_mad(mad_hdr)) { + u32 hi_tid; + + /* + * Routing is based on high 32 bits of transaction ID + * of MAD. 
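+ * The sender fills those bits with the agent's registration ID
+ * (agent->hi_tid), so the xarray lookup below resolves a response
+ * directly to the agent that issued the request.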
+ */ + hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; + rcu_read_lock(); + mad_agent = xa_load(&ib_mad_clients, hi_tid); + if (mad_agent && !refcount_inc_not_zero(&mad_agent->refcount)) + mad_agent = NULL; + rcu_read_unlock(); + } else { + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_method_table *method; + struct ib_mad_mgmt_vendor_class_table *vendor; + struct ib_mad_mgmt_vendor_class *vendor_class; + const struct ib_vendor_mad *vendor_mad; + int index; + + spin_lock_irqsave(&port_priv->reg_lock, flags); + /* + * Routing is based on version, class, and method + * For "newer" vendor MADs, also based on OUI + */ + if (mad_hdr->class_version >= MAX_MGMT_VERSION) + goto out; + if (!is_vendor_class(mad_hdr->mgmt_class)) { + class = port_priv->version[ + mad_hdr->class_version].class; + if (!class) + goto out; + if (convert_mgmt_class(mad_hdr->mgmt_class) >= + ARRAY_SIZE(class->method_table)) + goto out; + method = class->method_table[convert_mgmt_class( + mad_hdr->mgmt_class)]; + if (method) + mad_agent = method->agent[mad_hdr->method & + ~IB_MGMT_METHOD_RESP]; + } else { + vendor = port_priv->version[ + mad_hdr->class_version].vendor; + if (!vendor) + goto out; + vendor_class = vendor->vendor_class[vendor_class_index( + mad_hdr->mgmt_class)]; + if (!vendor_class) + goto out; + /* Find matching OUI */ + vendor_mad = (const struct ib_vendor_mad *)mad_hdr; + index = find_vendor_oui(vendor_class, vendor_mad->oui); + if (index == -1) + goto out; + method = vendor_class->method_table[index]; + if (method) { + mad_agent = method->agent[mad_hdr->method & + ~IB_MGMT_METHOD_RESP]; + } + } + if (mad_agent) + refcount_inc(&mad_agent->refcount); +out: + spin_unlock_irqrestore(&port_priv->reg_lock, flags); + } + + if (mad_agent && !mad_agent->agent.recv_handler) { + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %u\n", + &mad_agent->agent, port_priv->port_num); + deref_mad_agent(mad_agent); + mad_agent = NULL; + } + + return mad_agent; +} + +static int validate_mad(const struct ib_mad_hdr *mad_hdr, + const struct ib_mad_qp_info *qp_info, + bool opa) +{ + int valid = 0; + u32 qp_num = qp_info->qp->qp_num; + + /* Make sure MAD base version is understood */ + if (mad_hdr->base_version != IB_MGMT_BASE_VERSION && + (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) { + pr_err("MAD received with unsupported base version %u %s\n", + mad_hdr->base_version, opa ? 
"(opa)" : ""); + goto out; + } + + /* Filter SMI packets sent to other than QP0 */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || + (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + if (qp_num == 0) + valid = 1; + } else { + /* CM attributes other than ClassPortInfo only use Send method */ + if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) && + (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) && + (mad_hdr->method != IB_MGMT_METHOD_SEND)) + goto out; + /* Filter GSI packets sent to QP0 */ + if (qp_num != 0) + valid = 1; + } + +out: + return valid; +} + +static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_hdr *mad_hdr) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; + return !mad_agent_priv->agent.rmpp_version || + !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || + !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE) || + (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); +} + +static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) +{ + return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class == + rwc->recv_buf.mad->mad_hdr.mgmt_class; +} + +static inline int +rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_send_wr_private *wr, + const struct ib_mad_recv_wc *rwc) +{ + struct rdma_ah_attr attr; + u8 send_resp, rcv_resp; + union ib_gid sgid; + struct ib_device *device = mad_agent_priv->agent.device; + u32 port_num = mad_agent_priv->agent.port_num; + u8 lmc; + bool has_grh; + + send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad); + rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr); + + if (send_resp == rcv_resp) + /* both requests, or both responses. GIDs different */ + return 0; + + if (rdma_query_ah(wr->send_buf.ah, &attr)) + /* Assume not equal, to avoid false positives. */ + return 0; + + has_grh = !!(rdma_ah_get_ah_flags(&attr) & IB_AH_GRH); + if (has_grh != !!(rwc->wc->wc_flags & IB_WC_GRH)) + /* one has GID, other does not. Assume different */ + return 0; + + if (!send_resp && rcv_resp) { + /* is request/response. */ + if (!has_grh) { + if (ib_get_cached_lmc(device, port_num, &lmc)) + return 0; + return (!lmc || !((rdma_ah_get_path_bits(&attr) ^ + rwc->wc->dlid_path_bits) & + ((1 << lmc) - 1))); + } else { + const struct ib_global_route *grh = + rdma_ah_read_grh(&attr); + + if (rdma_query_gid(device, port_num, + grh->sgid_index, &sgid)) + return 0; + return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, + 16); + } + } + + if (!has_grh) + return rdma_ah_get_dlid(&attr) == rwc->wc->slid; + else + return !memcmp(rdma_ah_read_grh(&attr)->dgid.raw, + rwc->recv_buf.grh->sgid.raw, + 16); +} + +static inline int is_direct(u8 class) +{ + return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE); +} + +struct ib_mad_send_wr_private* +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *wc) +{ + struct ib_mad_send_wr_private *wr; + const struct ib_mad_hdr *mad_hdr; + + mad_hdr = &wc->recv_buf.mad->mad_hdr; + + list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { + if ((wr->tid == mad_hdr->tid) && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(mad_hdr->mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + return (wr->status == IB_WC_SUCCESS) ? 
wr : NULL; + } + + /* + * It's possible to receive the response before we've + * been notified that the send has completed + */ + list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { + if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) && + wr->tid == mad_hdr->tid && + wr->timeout && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(mad_hdr->mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + /* Verify request has not been canceled */ + return (wr->status == IB_WC_SUCCESS) ? wr : NULL; + } + return NULL; +} + +void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) +{ + mad_send_wr->timeout = 0; + if (mad_send_wr->refcount == 1) + list_move_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->done_list); +} + +static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags; + int ret; + + INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); + ret = ib_mad_enforce_security(mad_agent_priv, + mad_recv_wc->wc->pkey_index); + if (ret) { + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + + list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, + mad_recv_wc); + if (!mad_recv_wc) { + deref_mad_agent(mad_agent_priv); + return; + } + } + + /* Complete corresponding request */ + if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) { + spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); + if (!mad_send_wr) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) + && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) + & IB_MGMT_RMPP_FLAG_ACTIVE)) { + /* user rmpp is in effect + * and this is an active RMPP MAD + */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); + deref_mad_agent(mad_agent_priv); + } else { + /* not user rmpp, revert to normal behavior and + * drop the mad + */ + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + } else { + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + /* Defined behavior is to complete response before request */ + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); + deref_mad_agent(mad_agent_priv); + + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + } + } else { + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, + mad_recv_wc); + deref_mad_agent(mad_agent_priv); + } +} + +static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, + const struct ib_mad_qp_info *qp_info, + const struct ib_wc *wc, + u32 port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + enum smi_forward_action retsmi; + struct ib_smp *smp = (struct ib_smp *)recv->mad; + + trace_ib_mad_handle_ib_smi(smp); + + if (smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + 
port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response((const struct ib_mad_hdr *)response->mad, + &response->grh, wc, + port_priv->device, + smi_get_fwd_port(smp), + qp_info->qp->qp_num, + response->mad_size, + false); + + return IB_SMI_DISCARD; + } + return IB_SMI_HANDLE; +} + +static bool generate_unmatched_resp(const struct ib_mad_private *recv, + struct ib_mad_private *response, + size_t *resp_len, bool opa) +{ + const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad; + struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad; + + if (recv_hdr->method == IB_MGMT_METHOD_GET || + recv_hdr->method == IB_MGMT_METHOD_SET) { + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + resp_hdr->method = IB_MGMT_METHOD_GET_RESP; + resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + resp_hdr->status |= IB_SMP_DIRECTION; + + if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) { + if (recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_LID_ROUTED || + recv_hdr->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + *resp_len = opa_get_smp_header_size( + (struct opa_smp *)recv->mad); + else + *resp_len = sizeof(struct ib_mad_hdr); + } + + return true; + } else { + return false; + } +} + +static enum smi_action +handle_opa_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + u32 port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + enum smi_forward_action retsmi; + struct opa_smp *smp = (struct opa_smp *)recv->mad; + + trace_ib_mad_handle_opa_smi(smp); + + if (opa_smi_handle_dr_smp_recv(smp, + rdma_cap_ib_switch(port_priv->device), + port_num, + port_priv->device->phys_port_cnt) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + retsmi = opa_smi_check_forward_dr_smp(smp); + if (retsmi == IB_SMI_LOCAL) + return IB_SMI_HANDLE; + + if (retsmi == IB_SMI_SEND) { /* don't forward */ + if (opa_smi_handle_dr_smp_send(smp, + rdma_cap_ib_switch(port_priv->device), + port_num) == IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + if (opa_smi_check_local_smp(smp, port_priv->device) == + IB_SMI_DISCARD) + return IB_SMI_DISCARD; + + } else if (rdma_cap_ib_switch(port_priv->device)) { + /* forward case for switches */ + memcpy(response, recv, mad_priv_size(response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.opa_mad = + (struct opa_mad *)response->mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + + agent_send_response((const struct ib_mad_hdr 
*)response->mad, + &response->grh, wc, + port_priv->device, + opa_smi_get_fwd_port(smp), + qp_info->qp->qp_num, + recv->header.wc.byte_len, + true); + + return IB_SMI_DISCARD; + } + + return IB_SMI_HANDLE; +} + +static enum smi_action +handle_smi(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info, + struct ib_wc *wc, + u32 port_num, + struct ib_mad_private *recv, + struct ib_mad_private *response, + bool opa) +{ + struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad; + + if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION && + mad_hdr->class_version == OPA_SM_CLASS_VERSION) + return handle_opa_smi(port_priv, qp_info, wc, port_num, recv, + response); + + return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); +} + +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info; + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *recv, *response = NULL; + struct ib_mad_agent_private *mad_agent; + u32 port_num; + int ret = IB_MAD_RESULT_SUCCESS; + size_t mad_size; + u16 resp_mad_pkey_index = 0; + bool opa; + + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } + + qp_info = mad_list->mad_queue->qp_info; + dequeue_mad(mad_list); + + opa = rdma_cap_opa_mad(qp_info->port_priv->device, + qp_info->port_priv->port_num); + + mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header, + mad_list); + recv = container_of(mad_priv_hdr, struct ib_mad_private, header); + ib_dma_unmap_single(port_priv->device, + recv->header.mapping, + mad_priv_dma_size(recv), + DMA_FROM_DEVICE); + + /* Setup MAD receive work completion from "normal" work completion */ + recv->header.wc = *wc; + recv->header.recv_wc.wc = &recv->header.wc; + + if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) { + recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh); + recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + recv->header.recv_wc.mad_len = sizeof(struct ib_mad); + recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + + recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; + recv->header.recv_wc.recv_buf.grh = &recv->grh; + + /* Validate MAD */ + if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) + goto out; + + trace_ib_mad_recv_done_handler(qp_info, wc, + (struct ib_mad_hdr *)recv->mad); + + mad_size = recv->mad_size; + response = alloc_mad_private(mad_size, GFP_KERNEL); + if (!response) + goto out; + + if (rdma_cap_ib_switch(port_priv->device)) + port_num = wc->port_num; + else + port_num = port_priv->port_num; + + if (((struct ib_mad_hdr *)recv->mad)->mgmt_class == + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + if (handle_smi(port_priv, qp_info, wc, port_num, recv, + response, opa) + == IB_SMI_DISCARD) + goto out; + } + + /* Give driver "right of first refusal" on incoming MAD */ + if (port_priv->device->ops.process_mad) { + ret = port_priv->device->ops.process_mad( + port_priv->device, 0, port_priv->port_num, wc, + &recv->grh, (const struct ib_mad *)recv->mad, + (struct ib_mad *)response->mad, &mad_size, + &resp_mad_pkey_index); + + if (opa) + wc->pkey_index = 
resp_mad_pkey_index; + + if (ret & IB_MAD_RESULT_SUCCESS) { + if (ret & IB_MAD_RESULT_CONSUMED) + goto out; + if (ret & IB_MAD_RESULT_REPLY) { + agent_send_response((const struct ib_mad_hdr *)response->mad, + &recv->grh, wc, + port_priv->device, + port_num, + qp_info->qp->qp_num, + mad_size, opa); + goto out; + } + } + } + + mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); + if (mad_agent) { + trace_ib_mad_recv_done_agent(mad_agent); + ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); + /* + * recv is freed up in error cases in ib_mad_complete_recv + * or via recv_handler in ib_mad_complete_recv() + */ + recv = NULL; + } else if ((ret & IB_MAD_RESULT_SUCCESS) && + generate_unmatched_resp(recv, response, &mad_size, opa)) { + agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc, + port_priv->device, port_num, + qp_info->qp->qp_num, mad_size, opa); + } + +out: + /* Post another receive request for this QP */ + if (response) { + ib_mad_post_receive_mads(qp_info, response); + kfree(recv); + } else + ib_mad_post_receive_mads(qp_info, recv); +} + +static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) +{ + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long delay; + + if (list_empty(&mad_agent_priv->wait_list)) { + cancel_delayed_work(&mad_agent_priv->timed_work); + } else { + mad_send_wr = list_entry(mad_agent_priv->wait_list.next, + struct ib_mad_send_wr_private, + agent_list); + + if (time_after(mad_agent_priv->timeout, + mad_send_wr->timeout)) { + mad_agent_priv->timeout = mad_send_wr->timeout; + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); + } + } +} + +static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *temp_mad_send_wr; + struct list_head *list_item; + unsigned long delay; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + list_del(&mad_send_wr->agent_list); + + delay = mad_send_wr->timeout; + mad_send_wr->timeout += jiffies; + + if (delay) { + list_for_each_prev(list_item, &mad_agent_priv->wait_list) { + temp_mad_send_wr = list_entry(list_item, + struct ib_mad_send_wr_private, + agent_list); + if (time_after(mad_send_wr->timeout, + temp_mad_send_wr->timeout)) + break; + } + } else { + list_item = &mad_agent_priv->wait_list; + } + + list_add(&mad_send_wr->agent_list, list_item); + + /* Reschedule a work item if we have a shorter timeout */ + if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) + mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timed_work, delay); +} + +void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, + unsigned long timeout_ms) +{ + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + wait_for_response(mad_send_wr); +} + +/* + * Process a send work completion + */ +void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_mad_agent_private *mad_agent_priv; + unsigned long flags; + int ret; + + mad_agent_priv = mad_send_wr->mad_agent_priv; + spin_lock_irqsave(&mad_agent_priv->lock, flags); + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { + ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); + if (ret == IB_RMPP_RESULT_CONSUMED) + goto done; + } else + ret = IB_RMPP_RESULT_UNHANDLED; + + if (mad_send_wc->status != 
IB_WC_SUCCESS && + mad_send_wr->status == IB_WC_SUCCESS) { + mad_send_wr->status = mad_send_wc->status; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + + if (--mad_send_wr->refcount > 0) { + if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && + mad_send_wr->status == IB_WC_SUCCESS) { + wait_for_response(mad_send_wr); + } + goto done; + } + + /* Remove send from MAD agent and notify client of completion */ + list_del(&mad_send_wr->agent_list); + adjust_timeout(mad_agent_priv); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (mad_send_wr->status != IB_WC_SUCCESS) + mad_send_wc->status = mad_send_wr->status; + if (ret == IB_RMPP_RESULT_INTERNAL) + ib_rmpp_send_handler(mad_send_wc); + else { + if (mad_send_wr->is_smp_mad) + smp_mad_done(mad_send_wr); + else if (mad_send_wr->is_sa_cc_mad) + sa_cc_mad_done(get_cc_obj(mad_send_wr)); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + mad_send_wc); + } + + /* Release reference on agent taken when sending */ + deref_mad_agent(mad_agent_priv); + return; +done: + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; + struct ib_mad_qp_info *qp_info; + struct ib_mad_queue *send_queue; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags; + int ret; + + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, + mad_list); + send_queue = mad_list->mad_queue; + qp_info = send_queue->qp_info; + + trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv); + trace_ib_mad_send_done_handler(mad_send_wr, wc); + +retry: + ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, + mad_send_wr->header_mapping, + mad_send_wr->sg_list[0].length, DMA_TO_DEVICE); + ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, + mad_send_wr->payload_mapping, + mad_send_wr->sg_list[1].length, DMA_TO_DEVICE); + queued_send_wr = NULL; + spin_lock_irqsave(&send_queue->lock, flags); + list_del(&mad_list->list); + + /* Move queued send to the send queue */ + if (send_queue->count-- > send_queue->max_active) { + mad_list = container_of(qp_info->overflow_list.next, + struct ib_mad_list_head, list); + queued_send_wr = container_of(mad_list, + struct ib_mad_send_wr_private, + mad_list); + list_move_tail(&mad_list->list, &send_queue->list); + } + spin_unlock_irqrestore(&send_queue->lock, flags); + + mad_send_wc.send_buf = &mad_send_wr->send_buf; + mad_send_wc.status = wc->status; + mad_send_wc.vendor_err = wc->vendor_err; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + + if (queued_send_wr) { + trace_ib_mad_send_done_resend(queued_send_wr, qp_info); + ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, + NULL); + if (ret) { + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); + mad_send_wr = queued_send_wr; + wc->status = IB_WC_LOC_QP_OP_ERR; + goto retry; + } + } +} + +static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_list_head *mad_list; + unsigned long flags; + + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + list_for_each_entry(mad_list, 
&qp_info->send_queue.list, list) { + mad_send_wr = container_of(mad_list, + struct ib_mad_send_wr_private, + mad_list); + mad_send_wr->retry = 1; + } + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); +} + +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) +{ + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; + struct ib_mad_send_wr_private *mad_send_wr; + int ret; + + /* + * Send errors will transition the QP to SQE - move + * QP to RTS and repost flushed work requests + */ + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, + mad_list); + if (wc->status == IB_WC_WR_FLUSH_ERR) { + if (mad_send_wr->retry) { + /* Repost send */ + mad_send_wr->retry = 0; + trace_ib_mad_error_handler(mad_send_wr, qp_info); + ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, + NULL); + if (!ret) + return false; + } + } else { + struct ib_qp_attr *attr; + + /* Transition QP to RTS and fail offending send */ + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (attr) { + attr->qp_state = IB_QPS_RTS; + attr->cur_qp_state = IB_QPS_SQE; + ret = ib_modify_qp(qp_info->qp, attr, + IB_QP_STATE | IB_QP_CUR_STATE); + kfree(attr); + if (ret) + dev_err(&port_priv->device->dev, + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); + else + mark_sends_for_retry(qp_info); + } + } + + return true; +} + +static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) +{ + unsigned long flags; + struct ib_mad_qp_info *qp_info = mad_agent_priv->qp_info; + struct smp_window *smp = + &mad_agent_priv->qp_info->port_priv->smp_window; + struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + struct list_head cancel_list; + + INIT_LIST_HEAD(&cancel_list); + + cancel_sa_cc_mads(mad_agent_priv); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_agent_priv->send_list_closed = 1; + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &mad_agent_priv->send_list, agent_list) { + if (mad_send_wr->status == IB_WC_SUCCESS) { + mad_send_wr->status = IB_WC_WR_FLUSH_ERR; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + } + + /* Empty wait list to prevent receives from finding a request */ + list_splice_init(&mad_agent_priv->wait_list, &cancel_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + /* Report all cancelled requests */ + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + mad_send_wc.vendor_err = 0; + + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &cancel_list, agent_list) { + mad_send_wc.send_buf = &mad_send_wr->send_buf; + list_del(&mad_send_wr->agent_list); + if (mad_send_wr->is_smp_mad) + smp->outstanding--; + else if (mad_send_wr->is_sa_cc_mad) + sa_cc_mad_done(get_cc_obj(mad_send_wr)); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + deref_mad_agent(mad_agent_priv); + } + + spin_lock_irqsave(&qp_info->send_queue.lock, flags); + list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, + &smp->overflow_list, mad_list.list) { + if (mad_send_wr->mad_agent_priv != mad_agent_priv) + continue; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + list_del(&mad_send_wr->mad_list.list); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + deref_mad_agent(mad_agent_priv); + } + spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); +} + +static struct ib_mad_send_wr_private* +find_send_wr(struct ib_mad_agent_private 
*mad_agent_priv, + struct ib_mad_send_buf *send_buf) +{ + struct ib_mad_send_wr_private *mad_send_wr; + + list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list, + agent_list) { + if (&mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + + list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, + agent_list) { + if (is_rmpp_data_mad(mad_agent_priv, + mad_send_wr->send_buf.mad) && + &mad_send_wr->send_buf == send_buf) + return mad_send_wr; + } + return NULL; +} + +int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + unsigned long flags; + int active; + + if (!send_buf) + return -EINVAL; + + mad_agent_priv = container_of(send_buf->mad_agent, + struct ib_mad_agent_private, agent); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_send_wr = find_send_wr(mad_agent_priv, send_buf); + if (!mad_send_wr) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (modify_sa_cc_mad(mad_agent_priv, send_buf, timeout_ms)) + return -EINVAL; + return 0; + } + if (mad_send_wr->status != IB_WC_SUCCESS) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return -EINVAL; + } + + active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); + if (!timeout_ms) { + mad_send_wr->status = IB_WC_WR_FLUSH_ERR; + mad_send_wr->refcount -= (mad_send_wr->timeout > 0); + } + + mad_send_wr->send_buf.timeout_ms = timeout_ms; + if (active) + mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); + else + ib_reset_mad_timeout(mad_send_wr, timeout_ms); + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + return 0; +} +EXPORT_SYMBOL(ib_modify_mad); + +static void local_completions(struct work_struct *work) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_local_private *local; + struct ib_mad_agent_private *recv_mad_agent; + unsigned long flags; + int free_mad; + struct ib_wc wc; + struct ib_mad_send_wc mad_send_wc; + bool opa; + + mad_agent_priv = + container_of(work, struct ib_mad_agent_private, local_work); + + opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device, + mad_agent_priv->qp_info->port_priv->port_num); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->local_list)) { + local = list_entry(mad_agent_priv->local_list.next, + struct ib_mad_local_private, + completion_list); + list_del(&local->completion_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + free_mad = 0; + if (local->mad_priv) { + u8 base_version; + recv_mad_agent = local->recv_mad_agent; + if (!recv_mad_agent) { + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local completion\n"); + free_mad = 1; + goto local_send_completion; + } + + /* + * Defined behavior is to complete response + * before request + */ + build_smp_wc(recv_mad_agent->agent.qp, + local->mad_send_wr->send_wr.wr.wr_cqe, + be16_to_cpu(IB_LID_PERMISSIVE), + local->mad_send_wr->send_wr.pkey_index, + recv_mad_agent->agent.port_num, &wc); + + local->mad_priv->header.recv_wc.wc = &wc; + + base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version; + if (opa && base_version == OPA_MGMT_BASE_VERSION) { + local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len; + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad); + } else { + local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad); + local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad); + } + + 
INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list); + list_add(&local->mad_priv->header.recv_wc.recv_buf.list, + &local->mad_priv->header.recv_wc.rmpp_list); + local->mad_priv->header.recv_wc.recv_buf.grh = NULL; + local->mad_priv->header.recv_wc.recv_buf.mad = + (struct ib_mad *)local->mad_priv->mad; + recv_mad_agent->agent.recv_handler( + &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, + &local->mad_priv->header.recv_wc); + spin_lock_irqsave(&recv_mad_agent->lock, flags); + deref_mad_agent(recv_mad_agent); + spin_unlock_irqrestore(&recv_mad_agent->lock, flags); + } + +local_send_completion: + /* Complete send */ + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &local->mad_send_wr->send_buf; + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + deref_mad_agent(mad_agent_priv); + if (free_mad) + kfree(local->mad_priv); + kfree(local); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) +{ + int ret; + + if (!mad_send_wr->retries_left) + return -ETIMEDOUT; + + mad_send_wr->retries_left--; + mad_send_wr->send_buf.retries++; + + mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + + if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { + ret = ib_retry_rmpp(mad_send_wr); + switch (ret) { + case IB_RMPP_RESULT_UNHANDLED: + ret = ib_send_mad(mad_send_wr); + break; + case IB_RMPP_RESULT_CONSUMED: + ret = 0; + break; + default: + ret = -ECOMM; + break; + } + } else + ret = ib_send_mad(mad_send_wr); + + if (!ret) { + if (mad_send_wr->is_smp_mad) + smp_mad_done(mad_send_wr); + mad_send_wr->refcount++; + list_add_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->send_list); + } + return ret; +} + +static void timeout_sends(struct work_struct *work) +{ + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc mad_send_wc; + unsigned long flags, delay; + + mad_agent_priv = container_of(work, struct ib_mad_agent_private, + timed_work.work); + mad_send_wc.vendor_err = 0; + + spin_lock_irqsave(&mad_agent_priv->lock, flags); + while (!list_empty(&mad_agent_priv->wait_list)) { + mad_send_wr = list_entry(mad_agent_priv->wait_list.next, + struct ib_mad_send_wr_private, + agent_list); + + if (time_after(mad_send_wr->timeout, jiffies)) { + delay = mad_send_wr->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + queue_delayed_work(mad_agent_priv->qp_info-> + port_priv->wq, + &mad_agent_priv->timed_work, delay); + break; + } + + list_del(&mad_send_wr->agent_list); + if (mad_send_wr->status == IB_WC_SUCCESS && + !retry_send(mad_send_wr)) + continue; + + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + if (mad_send_wr->status == IB_WC_SUCCESS) + mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; + else + mad_send_wc.status = mad_send_wr->status; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + if (mad_send_wr->is_smp_mad) + smp_mad_done(mad_send_wr); + else if (mad_send_wr->is_sa_cc_mad) + sa_cc_mad_done(get_cc_obj(mad_send_wr)); + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, + &mad_send_wc); + + deref_mad_agent(mad_agent_priv); + spin_lock_irqsave(&mad_agent_priv->lock, flags); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); +} + +/* + * Allocate receive MADs and post receive WRs for them + */ +static int ib_mad_post_receive_mads(struct ib_mad_qp_info 
*qp_info, + struct ib_mad_private *mad) +{ + unsigned long flags; + int post, ret; + struct ib_mad_private *mad_priv; + struct ib_sge sg_list; + struct ib_recv_wr recv_wr; + struct ib_mad_queue *recv_queue = &qp_info->recv_queue; + + /* Initialize common scatter list fields */ + sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey; + + /* Initialize common receive WR fields */ + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + + do { + /* Allocate and map receive buffer */ + if (mad) { + mad_priv = mad; + mad = NULL; + } else { + mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv), + GFP_ATOMIC); + if (!mad_priv) { + ret = -ENOMEM; + break; + } + } + sg_list.length = mad_priv_dma_size(mad_priv); + sg_list.addr = ib_dma_map_single(qp_info->port_priv->device, + &mad_priv->grh, + mad_priv_dma_size(mad_priv), + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, + sg_list.addr))) { + kfree(mad_priv); + ret = -ENOMEM; + break; + } + mad_priv->header.mapping = sg_list.addr; + mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; + + /* Post receive WR */ + spin_lock_irqsave(&recv_queue->lock, flags); + post = (++recv_queue->count < recv_queue->max_active); + list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); + spin_unlock_irqrestore(&recv_queue->lock, flags); + ret = ib_post_recv(qp_info->qp, &recv_wr, NULL); + if (ret) { + spin_lock_irqsave(&recv_queue->lock, flags); + list_del(&mad_priv->header.mad_list.list); + recv_queue->count--; + spin_unlock_irqrestore(&recv_queue->lock, flags); + ib_dma_unmap_single(qp_info->port_priv->device, + mad_priv->header.mapping, + mad_priv_dma_size(mad_priv), + DMA_FROM_DEVICE); + kfree(mad_priv); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); + break; + } + } while (post); + + return ret; +} + +/* + * Return all the posted receive MADs + */ +static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info) +{ + struct ib_mad_private_header *mad_priv_hdr; + struct ib_mad_private *recv; + struct ib_mad_list_head *mad_list; + + if (!qp_info->qp) + return; + + while (!list_empty(&qp_info->recv_queue.list)) { + + mad_list = list_entry(qp_info->recv_queue.list.next, + struct ib_mad_list_head, list); + mad_priv_hdr = container_of(mad_list, + struct ib_mad_private_header, + mad_list); + recv = container_of(mad_priv_hdr, struct ib_mad_private, + header); + + /* Remove from posted receive MAD list */ + list_del(&mad_list->list); + + ib_dma_unmap_single(qp_info->port_priv->device, + recv->header.mapping, + mad_priv_dma_size(recv), + DMA_FROM_DEVICE); + kfree(recv); + } + + qp_info->recv_queue.count = 0; +} + +/* + * Start the port + */ +static int ib_mad_port_start(struct ib_mad_port_private *port_priv) +{ + int ret, i; + struct ib_qp_attr *attr; + struct ib_qp *qp; + u16 pkey_index; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = ib_find_pkey(port_priv->device, port_priv->port_num, + IB_DEFAULT_PKEY_FULL, &pkey_index); + if (ret) + pkey_index = 0; + + for (i = 0; i < IB_MAD_QPS_CORE; i++) { + qp = port_priv->qp_info[i].qp; + if (!qp) + continue; + + /* + * PKey index for QP1 is irrelevant but + * one is needed for the Reset to Init transition + */ + attr->qp_state = IB_QPS_INIT; + attr->pkey_index = pkey_index; + attr->qkey = (qp->qp_num == 0) ? 
0 : IB_QP1_QKEY; + ret = ib_modify_qp(qp, attr, IB_QP_STATE | + IB_QP_PKEY_INDEX | IB_QP_QKEY); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); + goto out; + } + + attr->qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, attr, IB_QP_STATE); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); + goto out; + } + + attr->qp_state = IB_QPS_RTS; + attr->sq_psn = IB_MAD_SEND_Q_PSN; + ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); + goto out; + } + } + + ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); + if (ret) { + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); + goto out; + } + + for (i = 0; i < IB_MAD_QPS_CORE; i++) { + if (!port_priv->qp_info[i].qp) + continue; + + ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); + if (ret) { + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); + goto out; + } + } +out: + kfree(attr); + return ret; +} + +static void qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct ib_mad_qp_info *qp_info = qp_context; + + /* It's worse than that! He's dead, Jim! */ + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%u)\n", + event->event, qp_info->qp->qp_num); +} + +static void init_mad_queue(struct ib_mad_qp_info *qp_info, + struct ib_mad_queue *mad_queue) +{ + mad_queue->qp_info = qp_info; + mad_queue->count = 0; + spin_lock_init(&mad_queue->lock); + INIT_LIST_HEAD(&mad_queue->list); +} + +static void init_mad_qp(struct ib_mad_port_private *port_priv, + struct ib_mad_qp_info *qp_info) +{ + qp_info->port_priv = port_priv; + init_mad_queue(qp_info, &qp_info->send_queue); + init_mad_queue(qp_info, &qp_info->recv_queue); + INIT_LIST_HEAD(&qp_info->overflow_list); +} + +static int create_mad_qp(struct ib_mad_qp_info *qp_info, + enum ib_qp_type qp_type) +{ + struct ib_qp_init_attr qp_init_attr; + int ret; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.send_cq = qp_info->port_priv->cq; + qp_init_attr.recv_cq = qp_info->port_priv->cq; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.cap.max_send_wr = mad_sendq_size; + qp_init_attr.cap.max_recv_wr = mad_recvq_size; + qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG; + qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG; + qp_init_attr.qp_type = qp_type; + qp_init_attr.port_num = qp_info->port_priv->port_num; + qp_init_attr.qp_context = qp_info; + qp_init_attr.event_handler = qp_event_handler; + qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); + if (IS_ERR(qp_info->qp)) { + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); + ret = PTR_ERR(qp_info->qp); + goto error; + } + /* Use minimum queue sizes unless the CQ is resized */ + qp_info->send_queue.max_active = mad_sendq_size; + qp_info->recv_queue.max_active = mad_recvq_size; + return 0; + +error: + return ret; +} + +static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) +{ + if (!qp_info->qp) + return; + + ib_destroy_qp(qp_info->qp); +} + +/* + * Open the port + * Create the QP, PD, MR, and CQ if needed + */ +static int ib_mad_port_open(struct ib_device *device, + u32 port_num) +{ + int ret, cq_size; + struct ib_mad_port_private *port_priv; + unsigned long flags; + char name[sizeof "ib_mad123"]; + int has_smi; + + if 
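+	/*
+	 * For reference, ib_mad_port_start() above brings each MAD QP up
+	 * with the minimal attribute masks the special QPs (QP0/QP1) need:
+	 *
+	 *	RESET -> INIT : IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY
+	 *	INIT  -> RTR  : IB_QP_STATE
+	 *	RTR   -> RTS  : IB_QP_STATE | IB_QP_SQ_PSN
+	 *
+	 * Unlike regular QPs, no path or address-vector attributes are
+	 * required for the RTR transition here.
+	 */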
(WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) + return -EFAULT; + + if (WARN_ON(rdma_cap_opa_mad(device, port_num) && + rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE)) + return -EFAULT; + + /* Create new device info */ + port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); + if (!port_priv) + return -ENOMEM; + + port_priv->device = device; + port_priv->port_num = port_num; + spin_lock_init(&port_priv->reg_lock); + init_mad_qp(port_priv, &port_priv->qp_info[0]); + init_mad_qp(port_priv, &port_priv->qp_info[1]); + + cq_size = mad_sendq_size + mad_recvq_size; + has_smi = rdma_cap_ib_smi(device, port_num); + if (has_smi) + cq_size *= 2; + + port_priv->pd = ib_alloc_pd(device, 0); + if (IS_ERR(port_priv->pd)) { + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); + ret = PTR_ERR(port_priv->pd); + goto error3; + } + + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_UNBOUND_WORKQUEUE); + if (IS_ERR(port_priv->cq)) { + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); + ret = PTR_ERR(port_priv->cq); + goto error4; + } + + if (has_smi) { + ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI); + if (ret) + goto error6; + } + ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI); + if (ret) + goto error7; + + snprintf(name, sizeof(name), "ib_mad%u", port_num); + port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + if (!port_priv->wq) { + ret = -ENOMEM; + goto error8; + } + + if (sa_cc_init(&port_priv->sa_cc)) + goto error9; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + list_add_tail(&port_priv->port_list, &ib_mad_port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + smp_window_init(&port_priv->smp_window); + + ret = ib_mad_port_start(port_priv); + if (ret) { + dev_err(&device->dev, "Couldn't start port\n"); + goto error10; + } + + return 0; + +error10: + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + list_del_init(&port_priv->port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + sa_cc_destroy(&port_priv->sa_cc); +error9: + destroy_workqueue(port_priv->wq); +error8: + destroy_mad_qp(&port_priv->qp_info[1]); +error7: + destroy_mad_qp(&port_priv->qp_info[0]); +error6: + ib_free_cq(port_priv->cq); + cleanup_recv_queue(&port_priv->qp_info[1]); + cleanup_recv_queue(&port_priv->qp_info[0]); +error4: + ib_dealloc_pd(port_priv->pd); +error3: + kfree(port_priv); + + return ret; +} + +/* + * Close the port + * If there are no classes using the port, free the port + * resources (CQ, MR, PD, QP) and remove the port's info structure + */ +static int ib_mad_port_close(struct ib_device *device, u32 port_num) +{ + struct ib_mad_port_private *port_priv; + unsigned long flags; + + spin_lock_irqsave(&ib_mad_port_list_lock, flags); + port_priv = __ib_get_mad_port(device, port_num); + if (port_priv == NULL) { + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + dev_err(&device->dev, "Port %u not found\n", port_num); + return -ENODEV; + } + list_del_init(&port_priv->port_list); + spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); + + destroy_workqueue(port_priv->wq); + sa_cc_destroy(&port_priv->sa_cc); + destroy_mad_qp(&port_priv->qp_info[1]); + destroy_mad_qp(&port_priv->qp_info[0]); + ib_free_cq(port_priv->cq); + ib_dealloc_pd(port_priv->pd); + cleanup_recv_queue(&port_priv->qp_info[1]); + cleanup_recv_queue(&port_priv->qp_info[0]); + /* XXX: Handle deallocation of MAD registration tables */ + + kfree(port_priv); + + return 0; +} + +static int ib_mad_init_device(struct 
ib_device *device) +{ + int start, i; + unsigned int count = 0; + int ret; + + ret = init_sa_cc_sysfs(device); + if (ret) { + dev_err(&device->dev, "Couldn't open mad congestion control sysfs\n"); + return ret; + } + + start = rdma_start_port(device); + + for (i = start; i <= rdma_end_port(device); i++) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + ret = ib_mad_port_open(device, i); + if (ret) { + dev_err(&device->dev, "Couldn't open port %d\n", i); + goto error; + } + ret = ib_agent_port_open(device, i); + if (ret) { + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); + goto error_agent; + } + count++; + } + if (!count) { + cleanup_sa_cc_sysfs(device); + return -EOPNOTSUPP; + } + + return 0; + +error_agent: + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); + +error: + while (--i >= start) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + if (ib_agent_port_close(device, i)) + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %d\n", i); + } + + cleanup_sa_cc_sysfs(device); + return ret; +} + +static void ib_mad_remove_device(struct ib_device *device, void *client_data) +{ + unsigned int i; + + rdma_for_each_port (device, i) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + if (ib_agent_port_close(device, i)) + dev_err(&device->dev, + "Couldn't close port %u for agents\n", i); + if (ib_mad_port_close(device, i)) + dev_err(&device->dev, "Couldn't close port %u\n", i); + } + + cleanup_sa_cc_sysfs(device); +} + +struct sa_cc_attribute { + struct attribute attr; + ssize_t (*show)(struct sa_cc_data *, char *buf); + ssize_t (*store)(struct sa_cc_data *, const char *buf, size_t count); +}; + +static ssize_t max_outstanding_store(struct sa_cc_data *cc_obj, + const char *buf, size_t count) +{ + unsigned long var; + + if (kstrtol(buf, 0, &var)) + return -EINVAL; + + if (var < SA_CC_MIN_OUTSTANDING_SA_MADS || + var > SA_CC_MAX_OUTSTANDING_SA_MADS) + return -EINVAL; + + cc_obj->max_outstanding = var; + + return count; +} + +static ssize_t max_outstanding_show(struct sa_cc_data *cc_obj, + char *buf) +{ + return sprintf(buf, "%lu\n", cc_obj->max_outstanding); +} + +static ssize_t drops_store(struct sa_cc_data *cc_obj, + const char *buf, size_t count) +{ + unsigned long var; + + if (kstrtol(buf, 0, &var) || (var != 0)) + return -EINVAL; + + cc_obj->drops = 0; + return count; +} + +static ssize_t drops_show(struct sa_cc_data *cc_obj, char *buf) +{ + return sprintf(buf, "%lu\n", cc_obj->drops); +} + +static ssize_t time_sa_mad_store(struct sa_cc_data *cc_obj, const char *buf, + size_t count) +{ + unsigned long var; + + if (kstrtol(buf, 0, &var)) + return -EINVAL; + + if (var < SA_CC_MIN_MAD_TIME_MS || + var > SA_CC_MAX_MAD_TIME_MS) + return -EINVAL; + + cc_obj->time_sa_mad = var; + + return count; +} + +static ssize_t time_sa_mad_show(struct sa_cc_data *cc_obj, + char *buf) +{ + return sprintf(buf, "%lu\n", cc_obj->time_sa_mad); +} + +static ssize_t queue_size_store(struct sa_cc_data *cc_obj, const char *buf, + size_t count) +{ + unsigned long var; + + if (kstrtol(buf, 0, &var)) + return -EINVAL; + + if (var < SA_CC_MIN_QUEUE_SIZE || + var > SA_CC_MAX_QUEUE_SIZE) + return -EINVAL; + + cc_obj->queue_size = var; + + return count; +} + +static ssize_t queue_size_show(struct sa_cc_data *cc_obj, + char *buf) +{ + return sprintf(buf, "%lu\n", cc_obj->queue_size); +} + +static ssize_t sa_cc_attr_store(struct kobject *kobj, struct attribute *attr, + 
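+	/*
+	 * The store/show handlers above are wired up through the
+	 * SA_CC_ATTR() macro further down; e.g. SA_CC_ATTR(queue_size)
+	 * expands to
+	 *
+	 *	struct sa_cc_attribute sa_cc_attr_queue_size =
+	 *		__ATTR(queue_size, 0644, queue_size_show,
+	 *		       queue_size_store);
+	 *
+	 * With the kobjects created in init_sa_cc_sysfs() and
+	 * init_sa_cc_sysfs_ports() below, these attributes would typically
+	 * appear as /sys/class/infiniband/<device>/mad_sa_cc/<port>/queue_size
+	 * and friends; the exact path depends on how the ib_device is
+	 * registered.
+	 */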
const char *buf, size_t size) +{ + struct sa_cc_attribute *sa = container_of(attr, + struct sa_cc_attribute, + attr); + struct sa_cc_data *cc_obj = container_of(kobj, struct sa_cc_data, + kobj); + + if (!sa->store) + return -EIO; + + return sa->store(cc_obj, buf, size); +} + +static ssize_t sa_cc_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct sa_cc_attribute *sa = container_of(attr, + struct sa_cc_attribute, + attr); + struct sa_cc_data *cc_obj = container_of(kobj, struct sa_cc_data, + kobj); + + if (!sa->show) + return -EIO; + + return sa->show(cc_obj, buf); +} + +static const struct sysfs_ops sa_cc_sysfs_ops = { + .show = sa_cc_attr_show, + .store = sa_cc_attr_store, +}; + +#define SA_CC_ATTR(_name) struct sa_cc_attribute sa_cc_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static SA_CC_ATTR(queue_size); +static SA_CC_ATTR(time_sa_mad); +static SA_CC_ATTR(max_outstanding); +static SA_CC_ATTR(drops); + +static struct attribute *sa_cc_default_attrs[] = { + &sa_cc_attr_queue_size.attr, + &sa_cc_attr_time_sa_mad.attr, + &sa_cc_attr_max_outstanding.attr, + &sa_cc_attr_drops.attr, + NULL +}; + +static struct kobj_type sa_cc_type = { + .sysfs_ops = &sa_cc_sysfs_ops, + .default_attrs = sa_cc_default_attrs +}; + +static void cleanup_sa_cc_sysfs_ports(struct sa_cc_data *cc_obj) +{ + kobject_put(&cc_obj->kobj); +} + +static int init_sa_cc_sysfs_ports(struct sa_cc_data *cc_obj) +{ + struct ib_mad_port_private *port_priv; + int err; + + port_priv = container_of(cc_obj, struct ib_mad_port_private, sa_cc); + err = kobject_init_and_add(&cc_obj->kobj, + &sa_cc_type, + port_priv->device->mad_sa_cc_kobj, + "%d", port_priv->port_num); + if (err) { + pr_err("failed to register mad_sa_cc sysfs object for port %d\n", + port_priv->port_num); + return -ENOMEM; + } + return 0; +} + +static void cleanup_sa_cc_sysfs(struct ib_device *device) +{ + if (device->mad_sa_cc_kobj) { + kobject_put(device->mad_sa_cc_kobj); + device->mad_sa_cc_kobj = NULL; + } +} + +static int init_sa_cc_sysfs(struct ib_device *device) +{ + struct device *dev = &device->dev; + + device->mad_sa_cc_kobj = kobject_create_and_add("mad_sa_cc", + &dev->kobj); + if (!device->mad_sa_cc_kobj) { + pr_err("failed to register mad_sa_cc sysfs object\n"); + return -ENOMEM; + } + + return 0; +} + +static struct ib_client mad_client = { + .name = "mad", + .add = ib_mad_init_device, + .remove = ib_mad_remove_device +}; + +int ib_mad_init(void) +{ + mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); + mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); + + mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); + mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); + + mad_smp_window = min(mad_smp_window, IB_MAD_QP_MAX_SIZE); + // Allow to used small values for latency measurement and benchmarking + mad_smp_window = max(mad_smp_window, 1); + + INIT_LIST_HEAD(&ib_mad_port_list); + + if (ib_register_client(&mad_client)) { + pr_err("Couldn't register ib_mad client\n"); + return -EINVAL; + } + + return 0; +} + +void ib_mad_cleanup(void) +{ + ib_unregister_client(&mad_client); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_priv.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_priv.h new file mode 100644 index 0000000..01631ef --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_priv.h @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. 
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2009 HNR Consulting. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __IB_MAD_PRIV_H__ +#define __IB_MAD_PRIV_H__ + +#include +#include +#include +#include +#include +#include + +#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ + +/* QP and CQ parameters */ +#define IB_MAD_QP_SEND_SIZE 128 +#define IB_MAD_QP_RECV_SIZE 512 +#define IB_MAD_QP_SMP_WINDOW 128 /* Use INT_MAX to disable the feature */ +#define IB_MAD_QP_MIN_SIZE 64 +#define IB_MAD_QP_MAX_SIZE 8192 +#define IB_MAD_SEND_REQ_MAX_SG 2 +#define IB_MAD_RECV_REQ_MAX_SG 1 + +#define IB_MAD_SEND_Q_PSN 0 + +/* Registration table sizes */ +#define MAX_MGMT_CLASS 80 +#define MAX_MGMT_VERSION 0x83 +#define MAX_MGMT_OUI 8 +#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \ + IB_MGMT_CLASS_VENDOR_RANGE2_START + 1) + +struct ib_mad_list_head { + struct list_head list; + struct ib_cqe cqe; + struct ib_mad_queue *mad_queue; +}; + +struct ib_mad_private_header { + struct ib_mad_list_head mad_list; + struct ib_mad_recv_wc recv_wc; + struct ib_wc wc; + u64 mapping; +} __packed; + +struct ib_mad_private { + struct ib_mad_private_header header; + size_t mad_size; + struct ib_grh grh; + u8 mad[]; +} __packed; + +struct ib_rmpp_segment { + struct list_head list; + u32 num; + u8 data[]; +}; + +struct ib_mad_agent_private { + struct ib_mad_agent agent; + struct ib_mad_reg_req *reg_req; + struct ib_mad_qp_info *qp_info; + + spinlock_t lock; + struct list_head send_list; + struct list_head wait_list; + struct list_head done_list; + struct delayed_work timed_work; + unsigned long timeout; + struct list_head local_list; + struct work_struct local_work; + struct list_head rmpp_list; + + refcount_t refcount; + int send_list_closed; + union { + struct completion comp; + struct rcu_head rcu; + }; +}; + +struct ib_mad_snoop_private { + struct ib_mad_agent agent; + struct ib_mad_qp_info *qp_info; + int snoop_index; + int mad_snoop_flags; + struct completion comp; +}; + +/* Structure for timeout-fifo entry */ +struct tf_entry { + unsigned long exp_time; /* entry expiration time */ + struct list_head fifo_list; /* to keep entries in fifo order */ + struct 
list_head to_list; /* to keep entries in timeout order */ + int canceled; /* indicates whether entry is canceled */ +}; + +struct ib_mad_send_wr_private { + struct ib_mad_list_head mad_list; + struct list_head agent_list; + struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_buf send_buf; + u64 header_mapping; + u64 payload_mapping; + struct ib_ud_wr send_wr; + struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; + __be64 tid; + unsigned long timeout; + int max_retries; + int retries_left; + int retry; + int refcount; + enum ib_wc_status status; + + /* RMPP control */ + struct list_head rmpp_list; + struct ib_rmpp_segment *last_ack_seg; + struct ib_rmpp_segment *cur_seg; + int last_ack; + int seg_num; + int newwin; + int pad; + + /* SMP window */ + int is_smp_mad; + + /* SA congestion controlled MAD */ + int is_sa_cc_mad; + struct tf_entry tf_list; +}; + +struct ib_mad_local_private { + struct list_head completion_list; + struct ib_mad_private *mad_priv; + struct ib_mad_agent_private *recv_mad_agent; + struct ib_mad_send_wr_private *mad_send_wr; + size_t return_wc_byte_len; +}; + +struct ib_mad_mgmt_method_table { + struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS]; +}; + +struct ib_mad_mgmt_class_table { + struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS]; +}; + +struct ib_mad_mgmt_vendor_class { + u8 oui[MAX_MGMT_OUI][3]; + struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI]; +}; + +struct ib_mad_mgmt_vendor_class_table { + struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2]; +}; + +struct ib_mad_mgmt_version_table { + struct ib_mad_mgmt_class_table *class; + struct ib_mad_mgmt_vendor_class_table *vendor; +}; + +struct ib_mad_queue { + spinlock_t lock; + struct list_head list; + int count; + int max_active; + struct ib_mad_qp_info *qp_info; +}; + +struct ib_mad_qp_info { + struct ib_mad_port_private *port_priv; + struct ib_qp *qp; + struct ib_mad_queue send_queue; + struct ib_mad_queue recv_queue; + struct list_head overflow_list; + spinlock_t snoop_lock; + struct ib_mad_snoop_private **snoop_table; + int snoop_table_size; + atomic_t snoop_count; +}; + +struct smp_window { + unsigned long outstanding; + unsigned long max_outstanding; + struct list_head overflow_list; +}; + +struct to_fifo { + struct list_head to_head; + struct list_head fifo_head; + spinlock_t lists_lock; + struct timer_list timer; + struct work_struct work; + u32 num_items; + int stop_enqueue; + struct workqueue_struct *workq; +}; + +/* SA congestion control data */ +struct sa_cc_data { + spinlock_t lock; + unsigned long outstanding; + unsigned long queue_size; + unsigned long time_sa_mad; + unsigned long max_outstanding; + unsigned long drops; + struct kobject kobj; + struct to_fifo *tf; +}; + +struct ib_mad_port_private { + struct list_head port_list; + struct ib_device *device; + int port_num; + struct ib_cq *cq; + struct ib_pd *pd; + + spinlock_t reg_lock; + struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; + struct workqueue_struct *wq; + struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; + + struct smp_window smp_window; + struct sa_cc_data sa_cc; +}; + +int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); + +struct ib_mad_send_wr_private * +ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, + const struct ib_mad_recv_wc *mad_recv_wc); + +void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc); + +void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); + +void 
ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, + unsigned long timeout_ms); + +#endif /* __IB_MAD_PRIV_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.c new file mode 100644 index 0000000..8af0619 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.c @@ -0,0 +1,960 @@ +/* + * Copyright (c) 2005 Intel Inc. All rights reserved. + * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "mad_priv.h" +#include "mad_rmpp.h" + +enum rmpp_state { + RMPP_STATE_ACTIVE, + RMPP_STATE_TIMEOUT, + RMPP_STATE_COMPLETE +}; + +struct mad_rmpp_recv { + struct ib_mad_agent_private *agent; + struct list_head list; + struct delayed_work timeout_work; + struct delayed_work cleanup_work; + struct completion comp; + enum rmpp_state state; + spinlock_t lock; + refcount_t refcount; + + struct ib_ah *ah; + struct ib_mad_recv_wc *rmpp_wc; + struct ib_mad_recv_buf *cur_seg_buf; + int last_ack; + int seg_num; + int newwin; + int repwin; + + __be64 tid; + u32 src_qp; + u32 slid; + u8 mgmt_class; + u8 class_version; + u8 method; + u8 base_version; +}; + +static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) +{ + if (refcount_dec_and_test(&rmpp_recv->refcount)) + complete(&rmpp_recv->comp); +} + +static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) +{ + deref_rmpp_recv(rmpp_recv); + wait_for_completion(&rmpp_recv->comp); + rdma_destroy_ah(rmpp_recv->ah, RDMA_DESTROY_AH_SLEEPABLE); + kfree(rmpp_recv); +} + +void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent) +{ + struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + cancel_delayed_work(&rmpp_recv->timeout_work); + cancel_delayed_work(&rmpp_recv->cleanup_work); + } + spin_unlock_irqrestore(&agent->lock, flags); + + flush_workqueue(agent->qp_info->port_priv->wq); + + list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv, + &agent->rmpp_list, list) { + list_del(&rmpp_recv->list); + if (rmpp_recv->state != RMPP_STATE_COMPLETE) + ib_free_recv_mad(rmpp_recv->rmpp_wc); + destroy_rmpp_recv(rmpp_recv); + } +} + +static void format_ack(struct ib_mad_send_buf *msg, + struct ib_rmpp_mad *data, + struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_rmpp_mad *ack = msg->mad; + unsigned long flags; + + memcpy(ack, &data->mad_hdr, msg->hdr_len); + + ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK; + ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + + spin_lock_irqsave(&rmpp_recv->lock, flags); + rmpp_recv->last_ack = rmpp_recv->seg_num; + ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num); + ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin); + spin_unlock_irqrestore(&rmpp_recv->lock, flags); +} + +static void ack_recv(struct mad_rmpp_recv *rmpp_recv, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + int ret, hdr_len; + + hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); + msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, + recv_wc->wc->pkey_index, 1, hdr_len, + 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); + if (IS_ERR(msg)) + return; + + format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv); + msg->ah = rmpp_recv->ah; + ret = ib_post_send_mad(msg, NULL); + if (ret) + ib_free_send_mad(msg); +} + +static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + struct ib_ah *ah; + int hdr_len; + + ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, + recv_wc->recv_buf.grh, agent->port_num); + if (IS_ERR(ah)) + return (void *) ah; + + hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); + msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, + recv_wc->wc->pkey_index, 1, + hdr_len, 0, GFP_KERNEL, + IB_MGMT_BASE_VERSION); + if 
(IS_ERR(msg)) + rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); + else { + msg->ah = ah; + msg->context[0] = ah; + } + + return msg; +} + +static void ack_ds_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *recv_wc) +{ + struct ib_mad_send_buf *msg; + struct ib_rmpp_mad *rmpp_mad; + int ret; + + msg = alloc_response_msg(&agent->agent, recv_wc); + if (IS_ERR(msg)) + return; + + rmpp_mad = msg->mad; + memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); + + rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.seg_num = 0; + rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1); + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE); + ib_free_send_mad(msg); + } +} + +void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc) +{ + if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah) + rdma_destroy_ah(mad_send_wc->send_buf->ah, + RDMA_DESTROY_AH_SLEEPABLE); + ib_free_send_mad(mad_send_wc->send_buf); +} + +static void nack_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *recv_wc, u8 rmpp_status) +{ + struct ib_mad_send_buf *msg; + struct ib_rmpp_mad *rmpp_mad; + int ret; + + msg = alloc_response_msg(&agent->agent, recv_wc); + if (IS_ERR(msg)) + return; + + rmpp_mad = msg->mad; + memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); + + rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; + rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION; + rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status; + rmpp_mad->rmpp_hdr.seg_num = 0; + rmpp_mad->rmpp_hdr.paylen_newwin = 0; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE); + ib_free_send_mad(msg); + } +} + +static void recv_timeout_handler(struct work_struct *work) +{ + struct mad_rmpp_recv *rmpp_recv = + container_of(work, struct mad_rmpp_recv, timeout_work.work); + struct ib_mad_recv_wc *rmpp_wc; + unsigned long flags; + + spin_lock_irqsave(&rmpp_recv->agent->lock, flags); + if (rmpp_recv->state != RMPP_STATE_ACTIVE) { + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + return; + } + rmpp_recv->state = RMPP_STATE_TIMEOUT; + list_del(&rmpp_recv->list); + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + + rmpp_wc = rmpp_recv->rmpp_wc; + nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L); + destroy_rmpp_recv(rmpp_recv); + ib_free_recv_mad(rmpp_wc); +} + +static void recv_cleanup_handler(struct work_struct *work) +{ + struct mad_rmpp_recv *rmpp_recv = + container_of(work, struct mad_rmpp_recv, cleanup_work.work); + unsigned long flags; + + spin_lock_irqsave(&rmpp_recv->agent->lock, flags); + list_del(&rmpp_recv->list); + spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags); + destroy_rmpp_recv(rmpp_recv); +} + +static struct mad_rmpp_recv * +create_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_hdr *mad_hdr; + + rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL); + if (!rmpp_recv) + return NULL; + + rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + agent->agent.port_num); + if (IS_ERR(rmpp_recv->ah)) + goto error; + + rmpp_recv->agent = agent; + init_completion(&rmpp_recv->comp); + 
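+	/*
+	 * The completion initialized just above and the refcount set just
+	 * below follow the same teardown idiom used by deref_rmpp_recv()
+	 * and destroy_rmpp_recv() earlier in this file: every user holds a
+	 * reference, and the destroyer drops the initial reference and then
+	 * sleeps until the last user signals the completion:
+	 *
+	 *	if (refcount_dec_and_test(&rmpp_recv->refcount))
+	 *		complete(&rmpp_recv->comp);
+	 *	...
+	 *	deref_rmpp_recv(rmpp_recv);
+	 *	wait_for_completion(&rmpp_recv->comp);
+	 */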
INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler); + INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler); + spin_lock_init(&rmpp_recv->lock); + rmpp_recv->state = RMPP_STATE_ACTIVE; + refcount_set(&rmpp_recv->refcount, 1); + + rmpp_recv->rmpp_wc = mad_recv_wc; + rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf; + rmpp_recv->newwin = 1; + rmpp_recv->seg_num = 1; + rmpp_recv->last_ack = 0; + rmpp_recv->repwin = 1; + + mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; + rmpp_recv->tid = mad_hdr->tid; + rmpp_recv->src_qp = mad_recv_wc->wc->src_qp; + rmpp_recv->slid = mad_recv_wc->wc->slid; + rmpp_recv->mgmt_class = mad_hdr->mgmt_class; + rmpp_recv->class_version = mad_hdr->class_version; + rmpp_recv->method = mad_hdr->method; + rmpp_recv->base_version = mad_hdr->base_version; + return rmpp_recv; + +error: kfree(rmpp_recv); + return NULL; +} + +static struct mad_rmpp_recv * +find_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr; + + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->tid == mad_hdr->tid && + rmpp_recv->src_qp == mad_recv_wc->wc->src_qp && + rmpp_recv->slid == mad_recv_wc->wc->slid && + rmpp_recv->mgmt_class == mad_hdr->mgmt_class && + rmpp_recv->class_version == mad_hdr->class_version && + rmpp_recv->method == mad_hdr->method) + return rmpp_recv; + } + return NULL; +} + +static struct mad_rmpp_recv * +acquire_rmpp_recv(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); + if (rmpp_recv) + refcount_inc(&rmpp_recv->refcount); + spin_unlock_irqrestore(&agent->lock, flags); + return rmpp_recv; +} + +static struct mad_rmpp_recv * +insert_rmpp_recv(struct ib_mad_agent_private *agent, + struct mad_rmpp_recv *rmpp_recv) +{ + struct mad_rmpp_recv *cur_rmpp_recv; + + cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc); + if (!cur_rmpp_recv) + list_add_tail(&rmpp_recv->list, &agent->rmpp_list); + + return cur_rmpp_recv; +} + +static inline int get_last_flag(struct ib_mad_recv_buf *seg) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *) seg->mad; + return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST; +} + +static inline int get_seg_num(struct ib_mad_recv_buf *seg) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *) seg->mad; + return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); +} + +static inline struct ib_mad_recv_buf *get_next_seg(struct list_head *rmpp_list, + struct ib_mad_recv_buf *seg) +{ + if (seg->list.next == rmpp_list) + return NULL; + + return container_of(seg->list.next, struct ib_mad_recv_buf, list); +} + +static inline int window_size(struct ib_mad_agent_private *agent) +{ + return max(agent->qp_info->recv_queue.max_active >> 3, 1); +} + +static struct ib_mad_recv_buf *find_seg_location(struct list_head *rmpp_list, + int seg_num) +{ + struct ib_mad_recv_buf *seg_buf; + int cur_seg_num; + + list_for_each_entry_reverse(seg_buf, rmpp_list, list) { + cur_seg_num = get_seg_num(seg_buf); + if (seg_num > cur_seg_num) + return seg_buf; + if (seg_num == cur_seg_num) + break; + } + return NULL; +} + +static void update_seg_num(struct mad_rmpp_recv *rmpp_recv, + struct ib_mad_recv_buf *new_buf) +{ + struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list; + + 
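+	/*
+	 * Window sizing note: window_size() above is
+	 * max(recv_queue.max_active >> 3, 1), so with the default receive
+	 * queue of IB_MAD_QP_RECV_SIZE (512) entries the receiver advances
+	 * newwin in steps of 512 >> 3 = 64 segments, sending an ACK each
+	 * time the window fills (see continue_rmpp() below).
+	 */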
while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) { + rmpp_recv->cur_seg_buf = new_buf; + rmpp_recv->seg_num++; + new_buf = get_next_seg(rmpp_list, new_buf); + } +} + +static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_rmpp_mad *rmpp_mad; + int hdr_size, data_size, pad; + bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device, + rmpp_recv->agent->qp_info->port_priv->port_num); + + rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad; + + hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) { + data_size = sizeof(struct opa_rmpp_mad) - hdr_size; + pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > OPA_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } else { + data_size = sizeof(struct ib_rmpp_mad) - hdr_size; + pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (pad > IB_MGMT_RMPP_DATA || pad < 0) + pad = 0; + } + + return hdr_size + rmpp_recv->seg_num * data_size - pad; +} + +static struct ib_mad_recv_wc *complete_rmpp(struct mad_rmpp_recv *rmpp_recv) +{ + struct ib_mad_recv_wc *rmpp_wc; + + ack_recv(rmpp_recv, rmpp_recv->rmpp_wc); + if (rmpp_recv->seg_num > 1) + cancel_delayed_work(&rmpp_recv->timeout_work); + + rmpp_wc = rmpp_recv->rmpp_wc; + rmpp_wc->mad_len = get_mad_len(rmpp_recv); + /* 10 seconds until we can find the packet lifetime */ + queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq, + &rmpp_recv->cleanup_work, msecs_to_jiffies(10000)); + return rmpp_wc; +} + +static struct ib_mad_recv_wc * +continue_rmpp(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + struct ib_mad_recv_buf *prev_buf; + struct ib_mad_recv_wc *done_wc; + int seg_num; + unsigned long flags; + + rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc); + if (!rmpp_recv) + goto drop1; + + seg_num = get_seg_num(&mad_recv_wc->recv_buf); + + spin_lock_irqsave(&rmpp_recv->lock, flags); + if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) || + (seg_num > rmpp_recv->newwin)) + goto drop3; + + if ((seg_num <= rmpp_recv->last_ack) || + (rmpp_recv->state == RMPP_STATE_COMPLETE)) { + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + ack_recv(rmpp_recv, mad_recv_wc); + goto drop2; + } + + prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num); + if (!prev_buf) + goto drop3; + + done_wc = NULL; + list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list); + if (rmpp_recv->cur_seg_buf == prev_buf) { + update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf); + if (get_last_flag(rmpp_recv->cur_seg_buf)) { + rmpp_recv->state = RMPP_STATE_COMPLETE; + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + done_wc = complete_rmpp(rmpp_recv); + goto out; + } else if (rmpp_recv->seg_num == rmpp_recv->newwin) { + rmpp_recv->newwin += window_size(agent); + spin_unlock_irqrestore(&rmpp_recv->lock, flags); + ack_recv(rmpp_recv, mad_recv_wc); + goto out; + } + } + spin_unlock_irqrestore(&rmpp_recv->lock, flags); +out: + deref_rmpp_recv(rmpp_recv); + return done_wc; + +drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags); +drop2: deref_rmpp_recv(rmpp_recv); +drop1: ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static struct ib_mad_recv_wc * +start_rmpp(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct mad_rmpp_recv *rmpp_recv; + unsigned long flags; + + rmpp_recv = create_rmpp_recv(agent, mad_recv_wc); + if (!rmpp_recv) { + 
ib_free_recv_mad(mad_recv_wc); + return NULL; + } + + spin_lock_irqsave(&agent->lock, flags); + if (insert_rmpp_recv(agent, rmpp_recv)) { + spin_unlock_irqrestore(&agent->lock, flags); + /* duplicate first MAD */ + destroy_rmpp_recv(rmpp_recv); + return continue_rmpp(agent, mad_recv_wc); + } + refcount_inc(&rmpp_recv->refcount); + + if (get_last_flag(&mad_recv_wc->recv_buf)) { + rmpp_recv->state = RMPP_STATE_COMPLETE; + spin_unlock_irqrestore(&agent->lock, flags); + complete_rmpp(rmpp_recv); + } else { + spin_unlock_irqrestore(&agent->lock, flags); + /* 40 seconds until we can find the packet lifetimes */ + queue_delayed_work(agent->qp_info->port_priv->wq, + &rmpp_recv->timeout_work, + msecs_to_jiffies(40000)); + rmpp_recv->newwin += window_size(agent); + ack_recv(rmpp_recv, mad_recv_wc); + mad_recv_wc = NULL; + } + deref_rmpp_recv(rmpp_recv); + return mad_recv_wc; +} + +static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int timeout; + u32 paylen = 0; + + rmpp_mad = mad_send_wr->send_buf.mad; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num); + + if (mad_send_wr->seg_num == 1) { + rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; + paylen = (mad_send_wr->send_buf.seg_count * + mad_send_wr->send_buf.seg_rmpp_size) - + mad_send_wr->pad; + } + + if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { + rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; + paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad; + } + rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); + + /* 2 seconds for an ACK until we can find the packet lifetime */ + timeout = mad_send_wr->send_buf.timeout_ms; + if (!timeout || timeout > 2000) + mad_send_wr->timeout = msecs_to_jiffies(2000); + + return ib_send_mad(mad_send_wr); +} + +static void abort_send(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wc wc; + unsigned long flags; + + spin_lock_irqsave(&agent->lock, flags); + mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); + if (!mad_send_wr) + goto out; /* Unmatched send */ + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || + (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + goto out; /* Send is already done */ + + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&agent->lock, flags); + + wc.status = IB_WC_REM_ABORT_ERR; + wc.vendor_err = rmpp_status; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; +out: + spin_unlock_irqrestore(&agent->lock, flags); +} + +static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr, + int seg_num) +{ + struct list_head *list; + + wr->last_ack = seg_num; + list = &wr->last_ack_seg->list; + list_for_each_entry(wr->last_ack_seg, list, list) + if (wr->last_ack_seg->num == seg_num) + break; +} + +static void process_ds_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc, int newwin) +{ + struct mad_rmpp_recv *rmpp_recv; + + rmpp_recv = find_rmpp_recv(agent, mad_recv_wc); + if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE) + rmpp_recv->repwin = newwin; +} + +static void process_rmpp_ack(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct ib_rmpp_mad *rmpp_mad; + unsigned long flags; + 
int seg_num, newwin, ret; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + if (rmpp_mad->rmpp_hdr.rmpp_status) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + return; + } + + seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num); + newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); + if (newwin < seg_num) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S); + return; + } + + spin_lock_irqsave(&agent->lock, flags); + mad_send_wr = ib_find_send_mad(agent, mad_recv_wc); + if (!mad_send_wr) { + if (!seg_num) + process_ds_ack(agent, mad_recv_wc, newwin); + goto out; /* Unmatched or DS RMPP ACK */ + } + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) && + (mad_send_wr->timeout)) { + spin_unlock_irqrestore(&agent->lock, flags); + ack_ds_ack(agent, mad_recv_wc); + return; /* Repeated ACK for DS RMPP transaction */ + } + + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || + (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) + goto out; /* Send is already done */ + + if (seg_num > mad_send_wr->send_buf.seg_count || + seg_num > mad_send_wr->newwin) { + spin_unlock_irqrestore(&agent->lock, flags); + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B); + return; + } + + if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack) + goto out; /* Old ACK */ + + if (seg_num > mad_send_wr->last_ack) { + adjust_last_ack(mad_send_wr, seg_num); + mad_send_wr->retries_left = mad_send_wr->max_retries; + } + mad_send_wr->newwin = newwin; + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { + /* If no response is expected, the ACK completes the send */ + if (!mad_send_wr->send_buf.timeout_ms) { + struct ib_mad_send_wc wc; + + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&agent->lock, flags); + + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &wc); + return; + } + if (mad_send_wr->refcount == 1) + ib_reset_mad_timeout(mad_send_wr, + mad_send_wr->send_buf.timeout_ms); + spin_unlock_irqrestore(&agent->lock, flags); + ack_ds_ack(agent, mad_recv_wc); + return; + } else if (mad_send_wr->refcount == 1 && + mad_send_wr->seg_num < mad_send_wr->newwin && + mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { + /* Send failure will just result in a timeout/retry */ + ret = send_next_seg(mad_send_wr); + if (ret) + goto out; + + mad_send_wr->refcount++; + list_move_tail(&mad_send_wr->agent_list, + &mad_send_wr->mad_agent_priv->send_list); + } +out: + spin_unlock_irqrestore(&agent->lock, flags); +} + +static struct ib_mad_recv_wc * +process_rmpp_data(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_hdr *rmpp_hdr; + u8 rmpp_status; + + rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr; + + if (rmpp_hdr->rmpp_status) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS; + goto bad; + } + + if (rmpp_hdr->seg_num == cpu_to_be32(1)) { + if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; + goto bad; + } + return start_rmpp(agent, mad_recv_wc); + } else { + if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) { + rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG; + goto bad; + } + return continue_rmpp(agent, mad_recv_wc); + } 
+bad: + nack_recv(agent, mad_recv_wc, rmpp_status); + ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static void process_rmpp_stop(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + + if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + } else + abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); +} + +static void process_rmpp_abort(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + + if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN || + rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS); + } else + abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status); +} + +struct ib_mad_recv_wc * +ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE)) + return mad_recv_wc; + + if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) { + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV); + goto out; + } + + switch (rmpp_mad->rmpp_hdr.rmpp_type) { + case IB_MGMT_RMPP_TYPE_DATA: + return process_rmpp_data(agent, mad_recv_wc); + case IB_MGMT_RMPP_TYPE_ACK: + process_rmpp_ack(agent, mad_recv_wc); + break; + case IB_MGMT_RMPP_TYPE_STOP: + process_rmpp_stop(agent, mad_recv_wc); + break; + case IB_MGMT_RMPP_TYPE_ABORT: + process_rmpp_abort(agent, mad_recv_wc); + break; + default: + abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); + nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT); + break; + } +out: + ib_free_recv_mad(mad_recv_wc); + return NULL; +} + +static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv; + struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad; + struct mad_rmpp_recv *rmpp_recv; + struct rdma_ah_attr ah_attr; + unsigned long flags; + int newwin = 1; + + if (!(mad_hdr->method & IB_MGMT_METHOD_RESP)) + goto out; + + spin_lock_irqsave(&agent->lock, flags); + list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) { + if (rmpp_recv->tid != mad_hdr->tid || + rmpp_recv->mgmt_class != mad_hdr->mgmt_class || + rmpp_recv->class_version != mad_hdr->class_version || + (rmpp_recv->method & IB_MGMT_METHOD_RESP)) + continue; + + if (rdma_query_ah(mad_send_wr->send_buf.ah, &ah_attr)) + continue; + + if (rmpp_recv->slid == rdma_ah_get_dlid(&ah_attr)) { + newwin = rmpp_recv->repwin; + break; + } + } + spin_unlock_irqrestore(&agent->lock, flags); +out: + return newwin; +} + +int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; + + if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) { + mad_send_wr->seg_num = 1; + return IB_RMPP_RESULT_INTERNAL; + 
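+	/*
+	 * Rough return-code contract, as consumed by callers such as
+	 * retry_send() earlier in mad.c:
+	 *
+	 *	IB_RMPP_RESULT_UNHANDLED -- RMPP not active; the caller sends
+	 *				    the MAD itself via ib_send_mad()
+	 *	IB_RMPP_RESULT_CONSUMED  -- the RMPP layer has taken over the
+	 *				    request (segments queued)
+	 *	IB_RMPP_RESULT_INTERNAL  -- ACK/STOP/ABORT handled inside the
+	 *				    RMPP layer
+	 *	IB_RMPP_RESULT_PROCESSED -- RMPP is done with this request;
+	 *				    the caller continues its normal
+	 *				    completion handling
+	 */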
} + + mad_send_wr->newwin = init_newwin(mad_send_wr); + + /* We need to wait for the final ACK even if there isn't a response */ + mad_send_wr->refcount += (mad_send_wr->timeout == 0); + ret = send_next_seg(mad_send_wr); + if (!ret) + return IB_RMPP_RESULT_CONSUMED; + return ret; +} + +int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ + + if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) + return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */ + + if (mad_send_wc->status != IB_WC_SUCCESS || + mad_send_wr->status != IB_WC_SUCCESS) + return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */ + + if (!mad_send_wr->timeout) + return IB_RMPP_RESULT_PROCESSED; /* Response received */ + + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { + mad_send_wr->timeout = + msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); + return IB_RMPP_RESULT_PROCESSED; /* Send done */ + } + + if (mad_send_wr->seg_num == mad_send_wr->newwin || + mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) + return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */ + + ret = send_next_seg(mad_send_wr); + if (ret) { + mad_send_wc->status = IB_WC_GENERAL_ERR; + return IB_RMPP_RESULT_PROCESSED; + } + return IB_RMPP_RESULT_CONSUMED; +} + +int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_mad *rmpp_mad; + int ret; + + rmpp_mad = mad_send_wr->send_buf.mad; + if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE)) + return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ + + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) + return IB_RMPP_RESULT_PROCESSED; + + mad_send_wr->seg_num = mad_send_wr->last_ack; + mad_send_wr->cur_seg = mad_send_wr->last_ack_seg; + + ret = send_next_seg(mad_send_wr); + if (ret) + return IB_RMPP_RESULT_PROCESSED; + + return IB_RMPP_RESULT_CONSUMED; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.h new file mode 100644 index 0000000..3d336bf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mad_rmpp.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2005 Intel Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MAD_RMPP_H__ +#define __MAD_RMPP_H__ + +enum { + IB_RMPP_RESULT_PROCESSED, + IB_RMPP_RESULT_CONSUMED, + IB_RMPP_RESULT_INTERNAL, + IB_RMPP_RESULT_UNHANDLED +}; + +int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr); + +struct ib_mad_recv_wc * +ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent, + struct ib_mad_recv_wc *mad_recv_wc); + +int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_send_wc *mad_send_wc); + +void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc); + +void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent); + +int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr); + +#endif /* __MAD_RMPP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mr_pool.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mr_pool.c new file mode 100644 index 0000000..c0e2df1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/mr_pool.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + */ +#include +#include + +struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + mr = list_first_entry_or_null(list, struct ib_mr, qp_entry); + if (mr) { + list_del(&mr->qp_entry); + qp->mrs_used++; + } + spin_unlock_irqrestore(&qp->mr_lock, flags); + + return mr; +} +EXPORT_SYMBOL(ib_mr_pool_get); + +void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add(&mr->qp_entry, list); + qp->mrs_used--; + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_put); + +int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, + enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg) +{ + struct ib_mr *mr; + unsigned long flags; + int ret, i; + + for (i = 0; i < nr; i++) { + if (type == IB_MR_TYPE_INTEGRITY) + mr = ib_alloc_mr_integrity(qp->pd, max_num_sg, + max_num_meta_sg); + else + mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto out; + } + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add_tail(&mr->qp_entry, list); + spin_unlock_irqrestore(&qp->mr_lock, flags); + } + + return 0; +out: + ib_mr_pool_destroy(qp, list); + return ret; +} +EXPORT_SYMBOL(ib_mr_pool_init); + +void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + while (!list_empty(list)) { + mr = list_first_entry(list, struct ib_mr, qp_entry); + list_del(&mr->qp_entry); + + spin_unlock_irqrestore(&qp->mr_lock, flags); + ib_dereg_mr(mr); + spin_lock_irqsave(&qp->mr_lock, flags); + } + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_destroy); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/multicast.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/multicast.c new file mode 100644 index 0000000..061c027 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/multicast.c @@ -0,0 +1,906 @@ +/* + * Copyright (c) 2006 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sa.h" + +static int mcast_add_one(struct ib_device *device); +static void mcast_remove_one(struct ib_device *device, void *client_data); + +static struct ib_client mcast_client = { + .name = "ib_multicast", + .add = mcast_add_one, + .remove = mcast_remove_one +}; + +static struct ib_sa_client sa_client; +static struct workqueue_struct *mcast_wq; +static union ib_gid mgid0; + +struct mcast_device; + +struct mcast_port { + struct mcast_device *dev; + spinlock_t lock; + struct rb_root table; + refcount_t refcount; + struct completion comp; + u32 port_num; +}; + +struct mcast_device { + struct ib_device *device; + struct ib_event_handler event_handler; + int start_port; + int end_port; + struct mcast_port port[]; +}; + +enum mcast_state { + MCAST_JOINING, + MCAST_MEMBER, + MCAST_ERROR, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_BUSY, + MCAST_GROUP_ERROR, + MCAST_PKEY_EVENT +}; + +enum { + MCAST_INVALID_PKEY_INDEX = 0xFFFF +}; + +struct mcast_member; + +struct mcast_group { + struct ib_sa_mcmember_rec rec; + struct rb_node node; + struct mcast_port *port; + spinlock_t lock; + struct work_struct work; + struct list_head pending_list; + struct list_head active_list; + struct mcast_member *last_join; + int members[NUM_JOIN_MEMBERSHIP_TYPES]; + atomic_t refcount; + enum mcast_group_state state; + struct ib_sa_query *query; + u16 pkey_index; + u8 leave_state; + int retries; +}; + +struct mcast_member { + struct ib_sa_multicast multicast; + struct ib_sa_client *client; + struct mcast_group *group; + struct list_head list; + enum mcast_state state; + refcount_t refcount; + struct completion comp; +}; + +static void join_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context); +static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context); + +static struct mcast_group *mcast_find(struct mcast_port *port, + union ib_gid *mgid) +{ + struct rb_node *node = port->table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof 
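+	/*
+	 * Usage sketch for the MR pool helpers added in mr_pool.c above --
+	 * 'qp' and the pool/SG sizes are hypothetical, error paths trimmed:
+	 *
+	 *	LIST_HEAD(free_mrs);
+	 *	struct ib_mr *mr;
+	 *	int ret;
+	 *
+	 *	ret = ib_mr_pool_init(qp, &free_mrs, 16, IB_MR_TYPE_MEM_REG,
+	 *			      32, 0);
+	 *	if (ret)
+	 *		return ret;
+	 *	mr = ib_mr_pool_get(qp, &free_mrs);	-- NULL when exhausted
+	 *	... build and post a registration WR using mr ...
+	 *	ib_mr_pool_put(qp, &free_mrs, mr);
+	 *	ib_mr_pool_destroy(qp, &free_mrs);
+	 */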
*mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mcast_port *port, + struct mcast_group *group, + int allow_duplicates) +{ + struct rb_node **link = &port->table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else if (allow_duplicates) + link = &(*link)->rb_left; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &port->table); + return NULL; +} + +static void deref_port(struct mcast_port *port) +{ + if (refcount_dec_and_test(&port->refcount)) + complete(&port->comp); +} + +static void release_group(struct mcast_group *group) +{ + struct mcast_port *port = group->port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + if (atomic_dec_and_test(&group->refcount)) { + rb_erase(&group->node, &port->table); + spin_unlock_irqrestore(&port->lock, flags); + kfree(group); + deref_port(port); + } else + spin_unlock_irqrestore(&port->lock, flags); +} + +static void deref_member(struct mcast_member *member) +{ + if (refcount_dec_and_test(&member->refcount)) + complete(&member->comp); +} + +static void queue_join(struct mcast_member *member) +{ + struct mcast_group *group = member->group; + unsigned long flags; + + spin_lock_irqsave(&group->lock, flags); + list_add_tail(&member->list, &group->pending_list); + if (group->state == MCAST_IDLE) { + group->state = MCAST_BUSY; + atomic_inc(&group->refcount); + queue_work(mcast_wq, &group->work); + } + spin_unlock_irqrestore(&group->lock, flags); +} + +/* + * A multicast group has four types of members: full member, non member, + * sendonly non member and sendonly full member. + * We need to keep track of the number of members of each + * type based on their join state. Adjust the number of members the belong to + * the specified join states. + */ +static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +/* + * If a multicast group has zero members left for a particular join state, but + * the group is still a member with the SA, we need to leave that join state. + * Determine which join states we still belong to, but that do not have any + * active members. 
+ */ +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) + if (!group->members[i]) + leave_state |= (0x1 << i); + + return leave_state & group->rec.join_state; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 selector, u8 src_value, u8 dst_value) +{ + int err; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static int cmp_rec(struct ib_sa_mcmember_rec *src, + struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) +{ + /* MGID must already match */ + + if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID && + memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return -EINVAL; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->traffic_class != dst->traffic_class) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return -EINVAL; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) + return -EINVAL; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + src->packet_life_time, dst->packet_life_time)) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + src->flow_label != dst->flow_label) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + src->hop_limit != dst->hop_limit) + return -EINVAL; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope) + return -EINVAL; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +static int send_join(struct mcast_group *group, struct mcast_member *member) +{ + struct mcast_port *port = group->port; + int ret; + + group->last_join = member; + ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, + port->port_num, IB_MGMT_METHOD_SET, + &member->multicast.rec, + member->multicast.comp_mask, + 1000, 3, GFP_KERNEL, join_handler, group, + &group->query); + return (ret > 0) ? 0 : ret; +} + +static int send_leave(struct mcast_group *group, u8 leave_state) +{ + struct mcast_port *port = group->port; + struct ib_sa_mcmember_rec rec; + int ret; + + rec = group->rec; + rec.join_state = leave_state; + group->leave_state = leave_state; + + ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device, + port->port_num, IB_SA_METHOD_DELETE, &rec, + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_JOIN_STATE, + 1000, 3, GFP_KERNEL, leave_handler, + group, &group->query); + return (ret > 0) ? 
0 : ret; +} + +static void join_group(struct mcast_group *group, struct mcast_member *member, + u8 join_state) +{ + member->state = MCAST_MEMBER; + adjust_membership(group, join_state, 1); + group->rec.join_state |= join_state; + member->multicast.rec = group->rec; + member->multicast.rec.join_state = join_state; + list_move(&member->list, &group->active_list); +} + +static int fail_join(struct mcast_group *group, struct mcast_member *member, + int status) +{ + spin_lock_irq(&group->lock); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + return member->multicast.callback(status, &member->multicast); +} + +static void process_group_error(struct mcast_group *group) +{ + struct mcast_member *member; + int ret = 0; + u16 pkey_index; + + if (group->state == MCAST_PKEY_EVENT) + ret = ib_find_pkey(group->port->dev->device, + group->port->port_num, + be16_to_cpu(group->rec.pkey), &pkey_index); + + spin_lock_irq(&group->lock); + if (group->state == MCAST_PKEY_EVENT && !ret && + group->pkey_index == pkey_index) + goto out; + + while (!list_empty(&group->active_list)) { + member = list_entry(group->active_list.next, + struct mcast_member, list); + refcount_inc(&member->refcount); + list_del_init(&member->list); + adjust_membership(group, member->multicast.rec.join_state, -1); + member->state = MCAST_ERROR; + spin_unlock_irq(&group->lock); + + ret = member->multicast.callback(-ENETRESET, + &member->multicast); + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + spin_lock_irq(&group->lock); + } + + group->rec.join_state = 0; +out: + group->state = MCAST_BUSY; + spin_unlock_irq(&group->lock); +} + +static void mcast_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_member *member; + struct ib_sa_multicast *multicast; + int status, ret; + u8 join_state; + + group = container_of(work, typeof(*group), work); +retest: + spin_lock_irq(&group->lock); + while (!list_empty(&group->pending_list) || + (group->state != MCAST_BUSY)) { + + if (group->state != MCAST_BUSY) { + spin_unlock_irq(&group->lock); + process_group_error(group); + goto retest; + } + + member = list_entry(group->pending_list.next, + struct mcast_member, list); + multicast = &member->multicast; + join_state = multicast->rec.join_state; + refcount_inc(&member->refcount); + + if (join_state == (group->rec.join_state & join_state)) { + status = cmp_rec(&group->rec, &multicast->rec, + multicast->comp_mask); + if (!status) + join_group(group, member, join_state); + else + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + ret = multicast->callback(status, multicast); + } else { + spin_unlock_irq(&group->lock); + status = send_join(group, member); + if (!status) { + deref_member(member); + return; + } + ret = fail_join(group, member, status); + } + + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + spin_lock_irq(&group->lock); + } + + join_state = get_leave_state(group); + if (join_state) { + group->rec.join_state &= ~join_state; + spin_unlock_irq(&group->lock); + if (send_leave(group, join_state)) + goto retest; + } else { + group->state = MCAST_IDLE; + spin_unlock_irq(&group->lock); + release_group(group); + } +} + +/* + * Fail a join request if it is still active - at the head of the pending queue. 
+ */ +static void process_join_error(struct mcast_group *group, int status) +{ + struct mcast_member *member; + int ret; + + spin_lock_irq(&group->lock); + member = list_entry(group->pending_list.next, + struct mcast_member, list); + if (group->last_join == member) { + refcount_inc(&member->refcount); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + ret = member->multicast.callback(status, &member->multicast); + deref_member(member); + if (ret) + ib_sa_free_multicast(&member->multicast); + } else + spin_unlock_irq(&group->lock); +} + +static void join_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context) +{ + struct mcast_group *group = context; + u16 pkey_index = MCAST_INVALID_PKEY_INDEX; + + if (status) + process_join_error(group, status); + else { + int mgids_changed, is_mgid0; + + if (ib_find_pkey(group->port->dev->device, + group->port->port_num, be16_to_cpu(rec->pkey), + &pkey_index)) + pkey_index = MCAST_INVALID_PKEY_INDEX; + + spin_lock_irq(&group->port->lock); + if (group->state == MCAST_BUSY && + group->pkey_index == MCAST_INVALID_PKEY_INDEX) + group->pkey_index = pkey_index; + mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, + sizeof(group->rec.mgid)); + group->rec = *rec; + if (mgids_changed) { + rb_erase(&group->node, &group->port->table); + is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, + sizeof(mgid0)); + mcast_insert(group->port, group, is_mgid0); + } + spin_unlock_irq(&group->port->lock); + } + mcast_work_handler(&group->work); +} + +static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, + void *context) +{ + struct mcast_group *group = context; + + if (status && group->retries > 0 && + !send_leave(group, group->leave_state)) + group->retries--; + else + mcast_work_handler(&group->work); +} + +static struct mcast_group *acquire_group(struct mcast_port *port, + union ib_gid *mgid, gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + unsigned long flags; + int is_mgid0; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + spin_lock_irqsave(&port->lock, flags); + group = mcast_find(port, mgid); + if (group) + goto found; + spin_unlock_irqrestore(&port->lock, flags); + } + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return NULL; + + group->retries = 3; + group->port = port; + group->rec.mgid = *mgid; + group->pkey_index = MCAST_INVALID_PKEY_INDEX; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->active_list); + INIT_WORK(&group->work, mcast_work_handler); + spin_lock_init(&group->lock); + + spin_lock_irqsave(&port->lock, flags); + cur_group = mcast_insert(port, group, is_mgid0); + if (cur_group) { + kfree(group); + group = cur_group; + } else + refcount_inc(&port->refcount); +found: + atomic_inc(&group->refcount); + spin_unlock_irqrestore(&port->lock, flags); + return group; +} + +/* + * We serialize all join requests to a single group to make our lives much + * easier. Otherwise, two users could try to join the same group + * simultaneously, with different configurations, one could leave while the + * join is in progress, etc., which makes locking around error recovery + * difficult. 
+ */ +struct ib_sa_multicast * +ib_sa_join_multicast(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_sa_multicast *multicast), + void *context) +{ + struct mcast_device *dev; + struct mcast_member *member; + struct ib_sa_multicast *multicast; + int ret; + + dev = ib_get_client_data(device, &mcast_client); + if (!dev) + return ERR_PTR(-ENODEV); + + member = kmalloc(sizeof *member, gfp_mask); + if (!member) + return ERR_PTR(-ENOMEM); + + ib_sa_client_get(client); + member->client = client; + member->multicast.rec = *rec; + member->multicast.comp_mask = comp_mask; + member->multicast.callback = callback; + member->multicast.context = context; + init_completion(&member->comp); + refcount_set(&member->refcount, 1); + member->state = MCAST_JOINING; + + member->group = acquire_group(&dev->port[port_num - dev->start_port], + &rec->mgid, gfp_mask); + if (!member->group) { + ret = -ENOMEM; + goto err; + } + + /* + * The user will get the multicast structure in their callback. They + * could then free the multicast structure before we can return from + * this routine. So we save the pointer to return before queuing + * any callback. + */ + multicast = &member->multicast; + queue_join(member); + return multicast; + +err: + ib_sa_client_put(client); + kfree(member); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_sa_join_multicast); + +void ib_sa_free_multicast(struct ib_sa_multicast *multicast) +{ + struct mcast_member *member; + struct mcast_group *group; + + member = container_of(multicast, struct mcast_member, multicast); + group = member->group; + + spin_lock_irq(&group->lock); + if (member->state == MCAST_MEMBER) + adjust_membership(group, multicast->rec.join_state, -1); + + list_del_init(&member->list); + + if (group->state == MCAST_IDLE) { + group->state = MCAST_BUSY; + spin_unlock_irq(&group->lock); + /* Continue to hold reference on group until callback */ + queue_work(mcast_wq, &group->work); + } else { + spin_unlock_irq(&group->lock); + release_group(group); + } + + deref_member(member); + wait_for_completion(&member->comp); + ib_sa_client_put(member->client); + kfree(member); +} +EXPORT_SYMBOL(ib_sa_free_multicast); + +int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num, + union ib_gid *mgid, struct ib_sa_mcmember_rec *rec) +{ + struct mcast_device *dev; + struct mcast_port *port; + struct mcast_group *group; + unsigned long flags; + int ret = 0; + + dev = ib_get_client_data(device, &mcast_client); + if (!dev) + return -ENODEV; + + port = &dev->port[port_num - dev->start_port]; + spin_lock_irqsave(&port->lock, flags); + group = mcast_find(port, mgid); + if (group) + *rec = group->rec; + else + ret = -EADDRNOTAVAIL; + spin_unlock_irqrestore(&port->lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_sa_get_mcmember_rec); + +/** + * ib_init_ah_from_mcmember - Initialize AH attribute from multicast + * member record and gid of the device. + * @device: RDMA device + * @port_num: Port of the rdma device to consider + * @rec: Multicast member record to use + * @ndev: Optional netdevice, applicable only for RoCE + * @gid_type: GID type to consider + * @ah_attr: AH attribute to fillup on successful completion + * + * ib_init_ah_from_mcmember() initializes AH attribute based on multicast + * member record and other device properties. On success the caller is + * responsible to call rdma_destroy_ah_attr on the ah_attr. 
Returns 0 on + * success or appropriate error code. + * + */ +int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num, + struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, + struct rdma_ah_attr *ah_attr) +{ + const struct ib_gid_attr *sgid_attr; + + /* GID table is not based on the netdevice for IB link layer, + * so ignore ndev during search. + */ + if (rdma_protocol_ib(device, port_num)) + ndev = NULL; + else if (!rdma_protocol_roce(device, port_num)) + return -EINVAL; + + sgid_attr = rdma_find_gid_by_port(device, &rec->port_gid, + gid_type, port_num, ndev); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = rdma_ah_find_type(device, port_num); + + rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid)); + rdma_ah_set_sl(ah_attr, rec->sl); + rdma_ah_set_port_num(ah_attr, port_num); + rdma_ah_set_static_rate(ah_attr, rec->rate); + rdma_move_grh_sgid_attr(ah_attr, &rec->mgid, + be32_to_cpu(rec->flow_label), + rec->hop_limit, rec->traffic_class, + sgid_attr); + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_mcmember); + +static void mcast_groups_event(struct mcast_port *port, + enum mcast_group_state state) +{ + struct mcast_group *group; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + for (node = rb_first(&port->table); node; node = rb_next(node)) { + group = rb_entry(node, struct mcast_group, node); + spin_lock(&group->lock); + if (group->state == MCAST_IDLE) { + atomic_inc(&group->refcount); + queue_work(mcast_wq, &group->work); + } + if (group->state != MCAST_GROUP_ERROR) + group->state = state; + spin_unlock(&group->lock); + } + spin_unlock_irqrestore(&port->lock, flags); +} + +static void mcast_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct mcast_device *dev; + int index; + + dev = container_of(handler, struct mcast_device, event_handler); + if (!rdma_cap_ib_mcast(dev->device, event->element.port_num)) + return; + + index = event->element.port_num - dev->start_port; + + switch (event->event) { + case IB_EVENT_PORT_ERR: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); + break; + case IB_EVENT_PKEY_CHANGE: + mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT); + break; + default: + break; + } +} + +static int mcast_add_one(struct ib_device *device) +{ + struct mcast_device *dev; + struct mcast_port *port; + int i; + int count = 0; + + dev = kmalloc(struct_size(dev, port, device->phys_port_cnt), + GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->start_port = rdma_start_port(device); + dev->end_port = rdma_end_port(device); + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + if (!rdma_cap_ib_mcast(device, dev->start_port + i)) + continue; + port = &dev->port[i]; + port->dev = dev; + port->port_num = dev->start_port + i; + spin_lock_init(&port->lock); + port->table = RB_ROOT; + init_completion(&port->comp); + refcount_set(&port->refcount, 1); + ++count; + } + + if (!count) { + kfree(dev); + return -EOPNOTSUPP; + } + + dev->device = device; + ib_set_client_data(device, &mcast_client, dev); + + INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); + ib_register_event_handler(&dev->event_handler); + return 0; +} + +static void mcast_remove_one(struct ib_device *device, void *client_data) +{ + struct mcast_device *dev = client_data; + struct mcast_port *port; + int i; + + 
ib_unregister_event_handler(&dev->event_handler); + flush_workqueue(mcast_wq); + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + if (rdma_cap_ib_mcast(device, dev->start_port + i)) { + port = &dev->port[i]; + deref_port(port); + wait_for_completion(&port->comp); + } + } + + kfree(dev); +} + +int mcast_init(void) +{ + int ret; + + mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM); + if (!mcast_wq) + return -ENOMEM; + + ib_sa_register_client(&sa_client); + + ret = ib_register_client(&mcast_client); + if (ret) + goto err; + return 0; + +err: + ib_sa_unregister_client(&sa_client); + destroy_workqueue(mcast_wq); + return ret; +} + +void mcast_cleanup(void) +{ + ib_unregister_client(&mcast_client); + ib_sa_unregister_client(&sa_client); + destroy_workqueue(mcast_wq); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/netlink.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/netlink.c new file mode 100644 index 0000000..53bfe23 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/netlink.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved. + * Copyright (c) 2010 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include "core_priv.h" + +static struct { + const struct rdma_nl_cbs *cb_table; + /* Synchronizes between ongoing netlink commands and netlink client + * unregistration. + */ + struct rw_semaphore sem; +} rdma_nl_types[RDMA_NL_NUM_CLIENTS]; + +bool rdma_nl_chk_listeners(unsigned int group) +{ + struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net); + + return netlink_has_listeners(rnet->nl_sock, group); +} +EXPORT_SYMBOL(rdma_nl_chk_listeners); + +static bool is_nl_msg_valid(unsigned int type, unsigned int op) +{ + static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { + [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, + [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, + [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, + }; + + /* + * This BUILD_BUG_ON is intended to catch addition of new + * RDMA netlink protocol without updating the array above. 
+ */ + BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); + + if (type >= RDMA_NL_NUM_CLIENTS) + return false; + + return (op < max_num_ops[type]) ? true : false; +} + +static const struct rdma_nl_cbs * +get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op) +{ + const struct rdma_nl_cbs *cb_table; + + /* + * Currently only NLDEV client is supporting netlink commands in + * non init_net net namespace. + */ + if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV) + return NULL; + + cb_table = READ_ONCE(rdma_nl_types[type].cb_table); + if (!cb_table) { + /* + * Didn't get valid reference of the table, attempt module + * load once. + */ + up_read(&rdma_nl_types[type].sem); + + request_module("rdma-netlink-subsys-%u", type); + + down_read(&rdma_nl_types[type].sem); + cb_table = READ_ONCE(rdma_nl_types[type].cb_table); + } + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) + return NULL; + return cb_table; +} + +void rdma_nl_register(unsigned int index, + const struct rdma_nl_cbs cb_table[]) +{ + if (WARN_ON(!is_nl_msg_valid(index, 0)) || + WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table))) + return; + + /* Pairs with the READ_ONCE in is_nl_valid() */ + smp_store_release(&rdma_nl_types[index].cb_table, cb_table); +} +EXPORT_SYMBOL(rdma_nl_register); + +void rdma_nl_unregister(unsigned int index) +{ + down_write(&rdma_nl_types[index].sem); + rdma_nl_types[index].cb_table = NULL; + up_write(&rdma_nl_types[index].sem); +} +EXPORT_SYMBOL(rdma_nl_unregister); + +void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, + int len, int client, int op, int flags) +{ + *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags); + if (!*nlh) + return NULL; + return nlmsg_data(*nlh); +} +EXPORT_SYMBOL(ibnl_put_msg); + +int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, + int len, void *data, int type) +{ + if (nla_put(skb, type, len, data)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + return 0; +} +EXPORT_SYMBOL(ibnl_put_attr); + +static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + int type = nlh->nlmsg_type; + unsigned int index = RDMA_NL_GET_CLIENT(type); + unsigned int op = RDMA_NL_GET_OP(type); + const struct rdma_nl_cbs *cb_table; + int err = -EINVAL; + + if (!is_nl_msg_valid(index, op)) + return -EINVAL; + + down_read(&rdma_nl_types[index].sem); + cb_table = get_cb_table(skb, index, op); + if (!cb_table) + goto done; + + if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) { + err = -EPERM; + goto done; + } + + /* + * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't + * mistakenly call the .dump() function. + */ + if (index == RDMA_NL_LS) { + if (cb_table[op].doit) + err = cb_table[op].doit(skb, nlh, extack); + goto done; + } + /* FIXME: Convert IWCM to properly handle doit callbacks */ + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { + struct netlink_dump_control c = { + .dump = cb_table[op].dump, + }; + if (c.dump) + err = netlink_dump_start(skb->sk, skb, nlh, &c); + goto done; + } + + if (cb_table[op].doit) + err = cb_table[op].doit(skb, nlh, extack); +done: + up_read(&rdma_nl_types[index].sem); + return err; +} + +/* + * This function is similar to netlink_rcv_skb with one exception: + * It calls to the callback for the netlink messages without NLM_F_REQUEST + * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed + * for that consumer only. 
+ */ +static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, + struct netlink_ext_ack *)) +{ + struct netlink_ext_ack extack = {}; + struct nlmsghdr *nlh; + int err; + + while (skb->len >= nlmsg_total_size(0)) { + int msglen; + + nlh = nlmsg_hdr(skb); + err = 0; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) + return 0; + + /* + * Generally speaking, the only requests are handled + * by the kernel, but RDMA_NL_LS is different, because it + * runs backward netlink scheme. Kernel initiates messages + * and waits for reply with data to keep pathrecord cache + * in sync. + */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) && + (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS)) + goto ack; + + /* Skip control messages */ + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ack; + + err = cb(skb, nlh, &extack); + if (err == -EINTR) + goto skip; + +ack: + if (nlh->nlmsg_flags & NLM_F_ACK || err) + netlink_ack(skb, nlh, err, &extack); + +skip: + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + skb_pull(skb, msglen); + } + + return 0; +} + +static void rdma_nl_rcv(struct sk_buff *skb) +{ + rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); +} + +int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid) +{ + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + int err; + + err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(rdma_nl_unicast); + +int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid) +{ + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + int err; + + err = netlink_unicast(rnet->nl_sock, skb, pid, 0); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(rdma_nl_unicast_wait); + +int rdma_nl_multicast(struct net *net, struct sk_buff *skb, + unsigned int group, gfp_t flags) +{ + struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); + + return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags); +} +EXPORT_SYMBOL(rdma_nl_multicast); + +void rdma_nl_init(void) +{ + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + init_rwsem(&rdma_nl_types[idx].sem); +} + +void rdma_nl_exit(void) +{ + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + WARN(rdma_nl_types[idx].cb_table, + "Netlink client %d wasn't released prior to unloading %s\n", + idx, KBUILD_MODNAME); +} + +int rdma_nl_net_init(struct rdma_dev_net *rnet) +{ + struct net *net = read_pnet(&rnet->net); + struct netlink_kernel_cfg cfg = { + .input = rdma_nl_rcv, + }; + struct sock *nls; + + nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); + if (!nls) + return -ENOMEM; + + nls->sk_sndtimeo = 10 * HZ; + rnet->nl_sock = nls; + return 0; +} + +void rdma_nl_net_exit(struct rdma_dev_net *rnet) +{ + netlink_kernel_release(rnet->nl_sock); +} + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/nldev.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/nldev.c new file mode 100644 index 0000000..d525e33 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/nldev.c @@ -0,0 +1,2545 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "core_priv.h" +#include "cma_priv.h" +#include "restrack.h" +#include "uverbs.h" + +typedef int (*res_fill_func_t)(struct sk_buff*, bool, + struct rdma_restrack_entry*, uint32_t); + +/* + * Sort array elements by the netlink attribute name + */ +static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { + [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, + [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, + [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, + [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + 
[RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CTX] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CTX_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY }, + [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SRQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_MIN_RANGE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_MAX_RANGE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, + 
[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, + [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, + [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, +}; + +static int put_driver_name_print_type(struct sk_buff *msg, const char *name, + enum rdma_nldev_print_type print_type) +{ + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, name)) + return -EMSGSIZE; + if (print_type != RDMA_NLDEV_PRINT_TYPE_UNSPEC && + nla_put_u8(msg, RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, print_type)) + return -EMSGSIZE; + + return 0; +} + +static int _rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, + enum rdma_nldev_print_type print_type, + u32 value) +{ + if (put_driver_name_print_type(msg, name, print_type)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DRIVER_U32, value)) + return -EMSGSIZE; + + return 0; +} + +static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, + enum rdma_nldev_print_type print_type, + u64 value) +{ + if (put_driver_name_print_type(msg, name, print_type)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_DRIVER_U64, value, + RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + + return 0; +} + +int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name, + const char *str) +{ + if (put_driver_name_print_type(msg, name, + RDMA_NLDEV_PRINT_TYPE_UNSPEC)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, str)) + return -EMSGSIZE; + + return 0; +} +EXPORT_SYMBOL(rdma_nl_put_driver_string); + +int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value) +{ + return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u32); + +int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, + u32 value) +{ + return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u32_hex); + +int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value) +{ + return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u64); + +int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value) +{ + return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); + +static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) +{ + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) + return -EMSGSIZE; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev))) + return -EMSGSIZE; + + return 0; +} + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + int ret = 0; + u32 port; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) + return -EMSGSIZE; + + BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + device->attrs.device_cap_flags, + RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + + ib_get_device_fw_str(device, fw); + /* Device without FW has strlen(fw) = 0 */ + if 
(strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, + be64_to_cpu(device->node_guid), + RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, + be64_to_cpu(device->attrs.sys_image_guid), + RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) + return -EMSGSIZE; + + /* + * Link type is determined on first port and mlx4 device + * which can potentially have two different link type for the same + * IB device is considered as better to be avoided in the future, + */ + port = rdma_start_port(device); + if (rdma_cap_opa_mad(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa"); + else if (rdma_protocol_ib(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib"); + else if (rdma_protocol_iwarp(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw"); + else if (rdma_protocol_roce(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce"); + else if (rdma_protocol_usnic(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, + "usnic"); + return ret; +} + +static int fill_port_info(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = NULL; + struct ib_port_attr attr; + int ret; + u64 cap_flags = 0; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + return -EMSGSIZE; + + ret = ib_query_port(device, port, &attr); + if (ret) + return ret; + + if (rdma_protocol_ib(device, port)) { + BUILD_BUG_ON((sizeof(attr.port_cap_flags) + + sizeof(attr.port_cap_flags2)) > sizeof(u64)); + cap_flags = attr.port_cap_flags | + ((u64)attr.port_cap_flags2 << 32); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + cap_flags, RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) + return -EMSGSIZE; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) + return -EMSGSIZE; + + netdev = ib_device_get_netdev(device, port); + if (netdev && net_eq(dev_net(netdev), net)) { + ret = nla_put_u32(msg, + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + ret = nla_put_string(msg, + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); + } + +out: + if (netdev) + dev_put(netdev); + return ret; +} + +static int fill_res_info_entry(struct sk_buff *msg, + const char *name, u64 curr) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start_noflag(msg, + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, + RDMA_NLDEV_ATTR_PAD)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_info(struct 
sk_buff *msg, struct ib_device *device) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "pd", + [RDMA_RESTRACK_CQ] = "cq", + [RDMA_RESTRACK_QP] = "qp", + [RDMA_RESTRACK_CM_ID] = "cm_id", + [RDMA_RESTRACK_MR] = "mr", + [RDMA_RESTRACK_CTX] = "ctx", + [RDMA_RESTRACK_SRQ] = "srq", + }; + + struct nlattr *table_attr; + int ret, i, curr; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) { + if (!names[i]) + continue; + curr = rdma_restrack_count(device, i); + ret = fill_res_info_entry(msg, names[i], curr); + if (ret) + goto err; + } + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return ret; +} + +static int fill_res_name_pid(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + int err = 0; + + /* + * For user resources, user is should read /proc/PID/comm to get the + * name of the task file. + */ + if (rdma_is_kernel_res(res)) { + err = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, + res->kern_name); + } else { + pid_t pid; + + pid = task_pid_vnr(res->task); + /* + * Task is dead and in zombie state. + * There is no need to print PID anymore. + */ + if (pid) + /* + * This part is racy, task can be killed and PID will + * be zero right here but it is ok, next query won't + * return PID. We don't promise real-time reflection + * of SW objects. + */ + err = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, pid); + } + + return err ? -EMSGSIZE : 0; +} + +static int fill_res_qp_entry_query(struct sk_buff *msg, + struct rdma_restrack_entry *res, + struct ib_device *dev, + struct ib_qp *qp) +{ + struct ib_qp_init_attr qp_init_attr; + struct ib_qp_attr qp_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); + if (ret) + return ret; + + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, + qp_attr.dest_qp_num)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, + qp_attr.rq_psn)) + goto err; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) + goto err; + + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) { + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, + qp_attr.path_mig_state)) + goto err; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) + goto err; + + if (dev->ops.fill_res_qp_entry) + return dev->ops.fill_res_qp_entry(msg, qp); + return 0; + +err: return -EMSGSIZE; +} + +static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + int ret; + + if (port && port != qp->port) + return -EAGAIN; + + /* In create_qp() port is not set yet */ + if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port)) + return -EINVAL; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num); + if (ret) + return -EMSGSIZE; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + return -EMSGSIZE; + + ret = fill_res_name_pid(msg, res); + if (ret) + return -EMSGSIZE; + + return fill_res_qp_entry_query(msg, res, dev, qp); +} + +static int 
fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct ib_device *dev = qp->device; + + if (port && port != qp->port) + return -EAGAIN; + if (!dev->ops.fill_res_qp_entry_raw) + return -EINVAL; + return dev->ops.fill_res_qp_entry_raw(msg, qp); +} + +static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct rdma_id_private *id_priv = + container_of(res, struct rdma_id_private, res); + struct ib_device *dev = id_priv->id.device; + struct rdma_cm_id *cm_id = &id_priv->id; + + if (port && port != cm_id->port_num) + return -EAGAIN; + + if (cm_id->port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) + goto err; + + if (id_priv->qp_num) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type)) + goto err; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps)) + goto err; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state)) + goto err; + + if (cm_id->route.addr.src_addr.ss_family && + nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR, + sizeof(cm_id->route.addr.src_addr), + &cm_id->route.addr.src_addr)) + goto err; + if (cm_id->route.addr.dst_addr.ss_family && + nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR, + sizeof(cm_id->route.addr.dst_addr), + &cm_id->route.addr.dst_addr)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) + goto err; + + if (fill_res_name_pid(msg, res)) + goto err; + + if (dev->ops.fill_res_cm_id_entry) + return dev->ops.fill_res_cm_id_entry(msg, cm_id); + return 0; + +err: return -EMSGSIZE; +} + +static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct ib_device *dev = cq->device; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, + atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + + /* Poll context is only valid for kernel CQs */ + if (rdma_is_kernel_res(res) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) + return -EMSGSIZE; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) + return -EMSGSIZE; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + cq->uobject->uevent.uobject.context->res.id)) + return -EMSGSIZE; + + if (fill_res_name_pid(msg, res)) + return -EMSGSIZE; + + return (dev->ops.fill_res_cq_entry) ? 
+ dev->ops.fill_res_cq_entry(msg, cq) : 0; +} + +static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct ib_device *dev = cq->device; + + if (!dev->ops.fill_res_cq_entry_raw) + return -EINVAL; + return dev->ops.fill_res_cq_entry_raw(msg, cq); +} + +static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; + + if (has_cap_net_admin) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) + return -EMSGSIZE; + } + + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, + RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + return -EMSGSIZE; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) + return -EMSGSIZE; + + if (fill_res_name_pid(msg, res)) + return -EMSGSIZE; + + return (dev->ops.fill_res_mr_entry) ? + dev->ops.fill_res_mr_entry(msg, mr) : + 0; +} + +static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; + + if (!dev->ops.fill_res_mr_entry_raw) + return -EINVAL; + return dev->ops.fill_res_mr_entry_raw(msg, mr); +} + +static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_pd *pd = container_of(res, struct ib_pd, res); + + if (has_cap_net_admin) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, + pd->local_dma_lkey)) + goto err; + if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, + pd->unsafe_global_rkey)) + goto err; + } + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, + atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) + goto err; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + pd->uobject->context->res.id)) + goto err; + + return fill_res_name_pid(msg, res); + +err: return -EMSGSIZE; +} + +static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res); + + if (rdma_is_kernel_res(res)) + return 0; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id)) + return -EMSGSIZE; + + return fill_res_name_pid(msg, res); +} + +static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range, + uint32_t max_range) +{ + struct nlattr *entry_attr; + + if (!min_range) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (min_range == max_range) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range)) + goto err; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range)) + goto err; + } + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_srq_qps(struct sk_buff *msg, struct 
ib_srq *srq) +{ + uint32_t min_range = 0, prev = 0; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = NULL; + unsigned long id = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) + return -EMSGSIZE; + + rt = &srq->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + qp = container_of(res, struct ib_qp, res); + if (!qp->srq || (qp->srq->res.id != srq->res.id)) { + rdma_restrack_put(res); + continue; + } + + if (qp->qp_num < prev) + /* qp_num should be ascending */ + goto err_loop; + + if (min_range == 0) { + min_range = qp->qp_num; + } else if (qp->qp_num > (prev + 1)) { + if (fill_res_range_qp_entry(msg, min_range, prev)) + goto err_loop; + + min_range = qp->qp_num; + } + prev = qp->qp_num; + rdma_restrack_put(res); + } + + xa_unlock(&rt->xa); + + if (fill_res_range_qp_entry(msg, min_range, prev)) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err_loop: + rdma_restrack_put(res); + xa_unlock(&rt->xa); +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_srq *srq = container_of(res, struct ib_srq, res); + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id)) + goto err; + + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id)) + goto err; + + if (ib_srq_has_cq(srq->srq_type)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, + srq->ext.cq->res.id)) + goto err; + } + + if (fill_res_srq_qps(msg, srq)) + goto err; + + return fill_res_name_pid(msg, res); + +err: + return -EMSGSIZE; +} + +static int fill_stat_counter_mode(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_counter_mode *m = &counter->mode; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) + return -EMSGSIZE; + + if (m->mode == RDMA_COUNTER_MODE_AUTO) { + if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) + return -EMSGSIZE; + + if ((m->mask & RDMA_COUNTER_MASK_PID) && + fill_res_name_pid(msg, &counter->res)) + return -EMSGSIZE; + } + + return 0; +} + +static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_qps(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = NULL; + unsigned long id = 0; + int ret = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + + rt = &counter->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + qp = container_of(res, struct ib_qp, res); + if (!qp->counter || (qp->counter->id != counter->id)) + continue; + + ret = fill_stat_counter_qp_entry(msg, qp->qp_num); + if (ret) + goto err; + } + + xa_unlock(&rt->xa); + nla_nest_end(msg, table_attr); + return 0; + +err: + xa_unlock(&rt->xa); + nla_nest_cancel(msg, table_attr); + return ret; +} + +int 
rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name, + u64 value) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + name)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, + value, RDMA_NLDEV_ATTR_PAD)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL(rdma_nl_stat_hwcounter_entry); + +static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, uint32_t port) +{ + struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct ib_device *dev = mr->pd->device; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + goto err; + + if (dev->ops.fill_stat_mr_entry) + return dev->ops.fill_stat_mr_entry(msg, mr); + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_stat_counter_hwcounters(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_hw_stats *st = counter->stats; + struct nlattr *table_attr; + int i; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) + return -EMSGSIZE; + + mutex_lock(&st->lock); + for (i = 0; i < st->num_counters; i++) { + if (test_bit(i, st->is_disabled)) + continue; + if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name, + st->value[i])) + goto err; + } + mutex_unlock(&st->lock); + + nla_nest_end(msg, table_attr); + return 0; + +err: + mutex_unlock(&st->lock); + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, + uint32_t port) +{ + struct rdma_counter *counter = + container_of(res, struct rdma_counter, res); + + if (port && port != counter->port) + return -EAGAIN; + + /* Dump it even query failed */ + rdma_counter_query_stats(counter); + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || + fill_stat_counter_mode(msg, counter) || + fill_stat_counter_qps(msg, counter) || + fill_stat_counter_hwcounters(msg, counter)) + return -EMSGSIZE; + + return 0; +} + +static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_dev_info(msg, device); + if (err) + goto err_free; + + nlmsg_end(msg, nlh); + + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + ib_device_put(device); + return err; +} + +static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct 
ib_device *device; + u32 index; + int err; + + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { + char name[IB_DEVICE_NAME_MAX] = {}; + + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + IB_DEVICE_NAME_MAX); + if (strlen(name) == 0) { + err = -EINVAL; + goto done; + } + err = ib_device_rename(device, name); + goto done; + } + + if (tb[RDMA_NLDEV_NET_NS_FD]) { + u32 ns_fd; + + ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]); + err = ib_device_set_netns_put(skb, device, ns_fd); + goto put_done; + } + + if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { + u8 use_dim; + + use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); + err = ib_device_set_dim(device, use_dim); + goto done; + } + +done: + ib_device_put(device); +put_done: + return err; +} + +static int _nldev_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, NLM_F_MULTI); + + if (fill_dev_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: cb->args[0] = idx; + return skb->len; +} + +static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + /* + * There is no need to take lock, because + * we are relying on ib_core's locking. + */ + return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); +} + +static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + u32 port; + int err; + + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + err = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET), + 0, 0); + + err = fill_port_info(msg, device, port, sock_net(skb->sk)); + if (err) + goto err_free; + + nlmsg_end(msg, nlh); + ib_device_put(device); + + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + ib_device_put(device); + return err; +} + +static int nldev_port_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + int start = cb->args[0]; + struct nlmsghdr *nlh; + u32 idx = 0; + u32 ifindex; + int err; + unsigned int p; + + err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = 
ib_device_get_by_index(sock_net(skb->sk), ifindex); + if (!device) + return -EINVAL; + + rdma_for_each_port (device, p) { + /* + * The dumpit function returns all information from specific + * index. This specific index is taken from the netlink + * messages request sent by user and it is available + * in cb->args[0]. + * + * Usually, the user doesn't fill this field and it causes + * to return everything. + * + */ + if (idx < start) { + idx++; + continue; + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_PORT_GET), + 0, NLM_F_MULTI); + + if (fill_port_info(skb, device, p, sock_net(skb->sk))) { + nlmsg_cancel(skb, nlh); + goto out; + } + idx++; + nlmsg_end(skb, nlh); + } + +out: + ib_device_put(device); + cb->args[0] = idx; + return skb->len; +} + +static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int ret; + + ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, 0); + + ret = fill_res_info(msg, device); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int _nldev_res_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, NLM_F_MULTI); + + if (fill_res_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + nlmsg_end(skb, nlh); + + idx++; + +out: + cb->args[0] = idx; + return skb->len; +} + +static int nldev_res_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); +} + +struct nldev_fill_res_entry { + enum rdma_nldev_attr nldev_attr; + u8 flags; + u32 entry; + u32 id; +}; + +enum nldev_res_flags { + NLDEV_PER_DEV = 1 << 0, +}; + +static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_QP] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, + .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_LQPN, + }, + [RDMA_RESTRACK_CM_ID] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, + .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CM_IDN, + }, + [RDMA_RESTRACK_CQ] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CQN, + }, + [RDMA_RESTRACK_MR] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_MRN, + }, + [RDMA_RESTRACK_PD] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, + .flags = NLDEV_PER_DEV, + .entry = 
RDMA_NLDEV_ATTR_RES_PD_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_PDN, + }, + [RDMA_RESTRACK_COUNTER] = { + .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, + .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, + .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, + }, + [RDMA_RESTRACK_CTX] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CTXN, + }, + [RDMA_RESTRACK_SRQ] = { + .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_SRQN, + }, + +}; + +static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + enum rdma_restrack_type res_type, + res_fill_func_t fill_func) +{ + const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + struct ib_device *device; + u32 index, id, port = 0; + bool has_cap_net_admin; + struct sk_buff *msg; + int ret; + + ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + } + + if ((port && fe->flags & NLDEV_PER_DEV) || + (!port && ~fe->flags & NLDEV_PER_DEV)) { + ret = -EINVAL; + goto err; + } + + id = nla_get_u32(tb[fe->id]); + res = rdma_restrack_get_byid(device, res_type, id); + if (IS_ERR(res)) { + ret = PTR_ERR(res); + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err_get; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(nlh->nlmsg_type)), + 0, 0); + + if (fill_nldev_handle(msg, device)) { + ret = -EMSGSIZE; + goto err_free; + } + + has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); + + ret = fill_func(msg, has_cap_net_admin, res, port); + if (ret) + goto err_free; + + rdma_restrack_put(res); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err_get: + rdma_restrack_put(res); +err: + ib_device_put(device); + return ret; +} + +static int res_get_common_dumpit(struct sk_buff *skb, + struct netlink_callback *cb, + enum rdma_restrack_type res_type, + res_fill_func_t fill_func) +{ + const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + int err, ret = 0, idx = 0; + struct nlattr *table_attr; + struct nlattr *entry_attr; + struct ib_device *device; + int start = cb->args[0]; + bool has_cap_net_admin; + struct nlmsghdr *nlh; + unsigned long id; + u32 index, port = 0; + bool filled = false; + + err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + /* + * Right now, we are expecting the device index to get res information, + * but it is possible to extend this code to return all devices in + * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. + * if it doesn't exist, we will iterate over all devices. + * + * But it is not needed for now. 
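+ *
+ * (Illustration only, not a requirement of this code: iproute2's
+ * "rdma resource show" family of commands is one caller that typically
+ * resolves a device name to an index and issues such a dump with
+ * RDMA_NLDEV_ATTR_DEV_INDEX set.)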
+ */ + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + /* + * If no PORT_INDEX is supplied, we will return all QPs from that device + */ + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_index; + } + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NL_GET_OP(cb->nlh->nlmsg_type)), + 0, NLM_F_MULTI); + + if (fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start_noflag(skb, fe->nldev_attr); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); + + rt = &device->res[res_type]; + xa_lock(&rt->xa); + /* + * FIXME: if the skip ahead is something common this loop should + * use xas_for_each & xas_pause to optimize, we can have a lot of + * objects. + */ + xa_for_each(&rt->xa, id, res) { + if (idx < start || !rdma_restrack_get(res)) + goto next; + + xa_unlock(&rt->xa); + + filled = true; + + entry_attr = nla_nest_start_noflag(skb, fe->entry); + if (!entry_attr) { + ret = -EMSGSIZE; + rdma_restrack_put(res); + goto msg_full; + } + + ret = fill_func(skb, has_cap_net_admin, res, port); + + rdma_restrack_put(res); + + if (ret) { + nla_nest_cancel(skb, entry_attr); + if (ret == -EMSGSIZE) + goto msg_full; + if (ret == -EAGAIN) + goto again; + goto res_err; + } + nla_nest_end(skb, entry_attr); +again: xa_lock(&rt->xa); +next: idx++; + } + xa_unlock(&rt->xa); + +msg_full: + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more entries to fill, cancel the message and + * return 0 to mark end of dumpit. 
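+ *
+ * (Background note: returning 0 from a netlink dump callback tells the
+ * netlink core that the dump is complete, so it can send NLMSG_DONE to
+ * user space.)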
+ */ + if (!filled) + goto err; + + ib_device_put(device); + return skb->len; + +res_err: + nla_nest_cancel(skb, table_attr); + +err: + nlmsg_cancel(skb, nlh); + +err_index: + ib_device_put(device); + return ret; +} + +#define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ + struct netlink_callback *cb) \ + { \ + return res_get_common_dumpit(skb, cb, type, \ + fill_res_##name##_entry); \ + } \ + static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ + struct nlmsghdr *nlh, \ + struct netlink_ext_ack *extack) \ + { \ + return res_get_common_doit(skb, nlh, extack, type, \ + fill_res_##name##_entry); \ + } + +RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); +RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP); +RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); +RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); +RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR); +RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); +RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX); +RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ); + +static LIST_HEAD(link_ops); +static DECLARE_RWSEM(link_ops_rwsem); + +static const struct rdma_link_ops *link_ops_get(const char *type) +{ + const struct rdma_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->type, type)) + goto out; + } + ops = NULL; +out: + return ops; +} + +void rdma_link_register(struct rdma_link_ops *ops) +{ + down_write(&link_ops_rwsem); + if (WARN_ON_ONCE(link_ops_get(ops->type))) + goto out; + list_add(&ops->list, &link_ops); +out: + up_write(&link_ops_rwsem); +} +EXPORT_SYMBOL(rdma_link_register); + +void rdma_link_unregister(struct rdma_link_ops *ops) +{ + down_write(&link_ops_rwsem); + list_del(&ops->list); + up_write(&link_ops_rwsem); +} +EXPORT_SYMBOL(rdma_link_unregister); + +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char ndev_name[IFNAMSIZ]; + struct net_device *ndev; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + + nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) + return -EINVAL; + + nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); + + ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); + if (!ndev) + return -ENODEV; + + down_read(&link_ops_rwsem); + ops = link_ops_get(type); +#ifdef CONFIG_MODULES + if (!ops) { + up_read(&link_ops_rwsem); + request_module("rdma-link-%s", type); + down_read(&link_ops_rwsem); + ops = link_ops_get(type); + } +#endif + err = ops ? 
ops->newlink(ibdev_name, ndev) : -EINVAL; + up_read(&link_ops_rwsem); + dev_put(ndev); + + return err; +} + +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) { + ib_device_put(device); + return -EINVAL; + } + + ib_unregister_device_and_put(device); + return 0; +} + +static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; + struct ib_client_nl_info data = {}; + struct ib_device *ibdev = NULL; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) + return -EINVAL; + + nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)); + + if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + ibdev = ib_device_get_by_index(sock_net(skb->sk), index); + if (!ibdev) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(ibdev, data.port)) { + err = -EINVAL; + goto out_put; + } + } else { + data.port = -1; + } + } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out_put; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_GET_CHARDEV), + 0, 0); + + data.nl_msg = msg; + err = ib_get_client_nl_info(ibdev, client_name, &data); + if (err) + goto out_nlmsg; + + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, + huge_encode_dev(data.cdev->devt), + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, + dev_name(data.cdev))) { + err = -EMSGSIZE; + goto out_data; + } + + nlmsg_end(msg, nlh); + put_device(data.cdev); + if (ibdev) + ib_device_put(ibdev); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +out_data: + put_device(data.cdev); +out_nlmsg: + nlmsg_free(msg); +out_put: + if (ibdev) + ib_device_put(ibdev); + return err; +} + +static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct sk_buff *msg; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_SYS_GET), + 0, 0); + + err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, + (u8)ib_devices_shared_netns); + if (err) { + nlmsg_free(msg); + return err; + } + + /* + * Copy-on-fork is 
supported. + * See commits: + * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") + * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm") + * for more details. Don't backport this without them. + * + * Return value ignored on purpose, assume copy-on-fork is not + * supported in case of failure. + */ + nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1); + + nlmsg_end(msg, nlh); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); +} + +static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + u8 enable; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]) + return -EINVAL; + + enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]); + /* Only 0 and 1 are supported */ + if (enable > 1) + return -EINVAL; + + err = rdma_compatdev_set(enable); + return err; +} + +static int nldev_stat_set_mode_doit(struct sk_buff *msg, + struct netlink_ext_ack *extack, + struct nlattr *tb[], + struct ib_device *device, u32 port) +{ + u32 mode, mask = 0, qpn, cntn = 0; + int ret; + + /* Currently only counter for QP is supported */ + if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || + nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); + if (mode == RDMA_COUNTER_MODE_AUTO) { + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32( + tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + return rdma_counter_set_auto_mode(device, port, mask, extack); + } + + if (!tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + ret = rdma_counter_bind_qpn(device, port, qpn, cntn); + if (ret) + return ret; + } else { + ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn); + if (ret) + return ret; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + + return 0; + +err_fill: + rdma_counter_unbind_qpn(device, port, qpn, cntn); + return ret; +} + +static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[], + struct ib_device *device, + u32 port) +{ + struct rdma_hw_stats *stats; + struct nlattr *entry_attr; + unsigned long *target; + int rem, i, ret = 0; + u32 index; + + stats = ib_get_hw_stats_port(device, port); + if (!stats) + return -EINVAL; + + target = kcalloc(BITS_TO_LONGS(stats->num_counters), + sizeof(*stats->is_disabled), GFP_KERNEL); + if (!target) + return -ENOMEM; + + nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS], + rem) { + index = nla_get_u32(entry_attr); + if ((index >= stats->num_counters) || + !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) { + ret = -EINVAL; + goto out; + } + + set_bit(index, target); + } + + for (i = 0; i < stats->num_counters; i++) { + if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL)) + continue; + + ret = rdma_counter_modify(device, port, i, test_bit(i, target)); + if (ret) + goto out; + } + +out: + kfree(target); + return ret; +} + +static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port; + int 
ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_put_device; + } + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] && + !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { + ret = -EINVAL; + goto err_put_device; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err_put_device; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { + ret = -EMSGSIZE; + goto err_free_msg; + } + + if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) { + ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port); + if (ret) + goto err_free_msg; + } + + if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) { + ret = nldev_stat_set_counter_dynamic_doit(tb, device, port); + if (ret) + goto err_free_msg; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_free_msg: + nlmsg_free(msg); +err_put_device: + ib_device_put(device); + return ret; +} + +static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port, qpn, cntn; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || + !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || + !tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + + ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); + if (ret) + goto err_fill; + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_fill: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int stat_get_doit_default_counter(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr *tb[]) +{ + struct rdma_hw_stats *stats; + struct nlattr *table_attr; + struct ib_device *device; + int 
ret, num_cnts, i; + struct sk_buff *msg; + u32 index, port; + u64 v; + + if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (!device->ops.alloc_hw_port_stats || !device->ops.get_hw_stats) { + ret = -EINVAL; + goto err; + } + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + stats = ib_get_hw_stats_port(device, port); + if (!stats) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { + ret = -EMSGSIZE; + goto err_msg; + } + + mutex_lock(&stats->lock); + + num_cnts = device->ops.get_hw_stats(device, stats, port, 0); + if (num_cnts < 0) { + ret = -EINVAL; + goto err_stats; + } + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) { + ret = -EMSGSIZE; + goto err_stats; + } + for (i = 0; i < num_cnts; i++) { + if (test_bit(i, stats->is_disabled)) + continue; + + v = stats->value[i] + + rdma_counter_get_hwstat_value(device, port, i); + if (rdma_nl_stat_hwcounter_entry(msg, + stats->descs[i].name, v)) { + ret = -EMSGSIZE; + goto err_table; + } + } + nla_nest_end(msg, table_attr); + + mutex_unlock(&stats->lock); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_table: + nla_nest_cancel(msg, table_attr); +err_stats: + mutex_unlock(&stats->lock); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, struct nlattr *tb[]) + +{ + static enum rdma_nl_counter_mode mode; + static enum rdma_nl_counter_mask mask; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port; + int ret; + + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) + return nldev_res_get_counter_doit(skb, nlh, extack); + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + + ret = rdma_counter_get_mode(device, port, &mode, &mask); + if (ret) + goto err_msg; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) { + ret = -EMSGSIZE; + goto err_msg; + } + + if ((mode == RDMA_COUNTER_MODE_AUTO) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) { + ret = -EMSGSIZE; + goto err_msg; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int 
nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret) + return -EINVAL; + + if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) + return stat_get_doit_default_counter(skb, nlh, extack, tb); + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = stat_get_doit_qp(skb, nlh, extack, tb); + break; + case RDMA_NLDEV_ATTR_RES_MR: + ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR, + fill_stat_mr_entry); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int nldev_stat_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + return -EINVAL; + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = nldev_res_get_counter_dumpit(skb, cb); + break; + case RDMA_NLDEV_ATTR_RES_MR: + ret = res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR, + fill_stat_mr_entry); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; + struct rdma_hw_stats *stats; + struct ib_device *device; + struct sk_buff *msg; + u32 devid, port; + int ret, i; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), devid); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + stats = ib_get_hw_stats_port(device, port); + if (!stats) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put( + msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS), + 0, 0); + + ret = -EMSGSIZE; + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + goto err_msg; + + table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table) + goto err_msg; + + mutex_lock(&stats->lock); + for (i = 0; i < stats->num_counters; i++) { + entry = nla_nest_start(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry) + goto err_msg_table; + + if (nla_put_string(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + stats->descs[i].name) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i)) + goto err_msg_entry; + + if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) && + (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, + !test_bit(i, stats->is_disabled)))) + goto err_msg_entry; + + nla_nest_end(msg, entry); + } + mutex_unlock(&stats->lock); + + nla_nest_end(msg, table); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_msg_entry: + nla_nest_cancel(msg, entry); +err_msg_table: + mutex_unlock(&stats->lock); + nla_nest_cancel(msg, table); +err_msg: + 
nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { + [RDMA_NLDEV_CMD_GET] = { + .doit = nldev_get_doit, + .dump = nldev_get_dumpit, + }, + [RDMA_NLDEV_CMD_GET_CHARDEV] = { + .doit = nldev_get_chardev, + }, + [RDMA_NLDEV_CMD_SET] = { + .doit = nldev_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_NEWLINK] = { + .doit = nldev_newlink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELLINK] = { + .doit = nldev_dellink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_PORT_GET] = { + .doit = nldev_port_get_doit, + .dump = nldev_port_get_dumpit, + }, + [RDMA_NLDEV_CMD_RES_GET] = { + .doit = nldev_res_get_doit, + .dump = nldev_res_get_dumpit, + }, + [RDMA_NLDEV_CMD_RES_QP_GET] = { + .doit = nldev_res_get_qp_doit, + .dump = nldev_res_get_qp_dumpit, + }, + [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { + .doit = nldev_res_get_cm_id_doit, + .dump = nldev_res_get_cm_id_dumpit, + }, + [RDMA_NLDEV_CMD_RES_CQ_GET] = { + .doit = nldev_res_get_cq_doit, + .dump = nldev_res_get_cq_dumpit, + }, + [RDMA_NLDEV_CMD_RES_MR_GET] = { + .doit = nldev_res_get_mr_doit, + .dump = nldev_res_get_mr_dumpit, + }, + [RDMA_NLDEV_CMD_RES_PD_GET] = { + .doit = nldev_res_get_pd_doit, + .dump = nldev_res_get_pd_dumpit, + }, + [RDMA_NLDEV_CMD_RES_CTX_GET] = { + .doit = nldev_res_get_ctx_doit, + .dump = nldev_res_get_ctx_dumpit, + }, + [RDMA_NLDEV_CMD_RES_SRQ_GET] = { + .doit = nldev_res_get_srq_doit, + .dump = nldev_res_get_srq_dumpit, + }, + [RDMA_NLDEV_CMD_SYS_GET] = { + .doit = nldev_sys_get_doit, + }, + [RDMA_NLDEV_CMD_SYS_SET] = { + .doit = nldev_set_sys_set_doit, + }, + [RDMA_NLDEV_CMD_STAT_SET] = { + .doit = nldev_stat_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_STAT_GET] = { + .doit = nldev_stat_get_doit, + .dump = nldev_stat_get_dumpit, + }, + [RDMA_NLDEV_CMD_STAT_DEL] = { + .doit = nldev_stat_del_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = { + .doit = nldev_res_get_qp_raw_doit, + .dump = nldev_res_get_qp_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = { + .doit = nldev_res_get_cq_raw_doit, + .dump = nldev_res_get_cq_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = { + .doit = nldev_res_get_mr_raw_doit, + .dump = nldev_res_get_mr_raw_dumpit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { + .doit = nldev_stat_get_counter_status_doit, + }, +}; + +void __init nldev_init(void) +{ + rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); +} + +void __exit nldev_exit(void) +{ + rdma_nl_unregister(RDMA_NL_NLDEV); +} + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/opa_smi.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/opa_smi.h new file mode 100644 index 0000000..64e2822 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/opa_smi.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __OPA_SMI_H_ +#define __OPA_SMI_H_ + +#include +#include + +#include "smi.h" + +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + u32 port_num, int phys_port_cnt); +int opa_smi_get_fwd_port(struct opa_smp *smp); +extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp); +extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, u32 port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return (device->ops.process_mad && + !opa_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? + IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return (device->ops.process_mad && + opa_get_smp_direction(smp) && + !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD; +} + +#endif /* __OPA_SMI_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/packer.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/packer.c new file mode 100644 index 0000000..19b1ee3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/packer.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include + +static u64 value_read(int offset, int size, void *structure) +{ + switch (size) { + case 1: return *(u8 *) (structure + offset); + case 2: return be16_to_cpup((__be16 *) (structure + offset)); + case 4: return be32_to_cpup((__be32 *) (structure + offset)); + case 8: return be64_to_cpup((__be64 *) (structure + offset)); + default: + pr_warn("Field size %d bits not handled\n", size * 8); + return 0; + } +} + +/** + * ib_pack - Pack a structure into a buffer + * @desc:Array of structure field descriptions + * @desc_len:Number of entries in @desc + * @structure:Structure to pack from + * @buf:Buffer to pack into + * + * ib_pack() packs a list of structure fields into a buffer, + * controlled by the array of fields in @desc. 
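+ *
+ * Illustrative sketch only (the structure and descriptor below are
+ * hypothetical, not taken from any header in this tree): a 16-bit field
+ * occupying bits 16..31 of the first 32-bit word, counting from the
+ * most significant bit, could be described as
+ *
+ *     struct example_hdr { __be16 lid; };
+ *
+ *     static const struct ib_field example_desc[] = {
+ *             { .struct_offset_bytes = offsetof(struct example_hdr, lid),
+ *               .struct_size_bytes   = 2,
+ *               .offset_words        = 0,
+ *               .offset_bits         = 16,
+ *               .size_bits           = 16,
+ *               .field_name          = "example_hdr:lid" },
+ *     };
+ *
+ * after which ib_pack(example_desc, ARRAY_SIZE(example_desc), &hdr, buf)
+ * (for some struct example_hdr hdr and a word-aligned buf) merges the
+ * value into those bits while leaving the rest of the word untouched.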
+ */ +void ib_pack(const struct ib_field *desc, + int desc_len, + void *structure, + void *buf) +{ + int i; + + for (i = 0; i < desc_len; ++i) { + if (desc[i].size_bits <= 32) { + int shift; + u32 val; + __be32 mask; + __be32 *addr; + + shift = 32 - desc[i].offset_bits - desc[i].size_bits; + if (desc[i].struct_size_bytes) + val = value_read(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + structure) << shift; + else + val = 0; + + mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift); + addr = (__be32 *) buf + desc[i].offset_words; + *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask); + } else if (desc[i].size_bits <= 64) { + int shift; + u64 val; + __be64 mask; + __be64 *addr; + + shift = 64 - desc[i].offset_bits - desc[i].size_bits; + if (desc[i].struct_size_bytes) + val = value_read(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + structure) << shift; + else + val = 0; + + mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift); + addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words); + *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask); + } else { + if (desc[i].offset_bits % 8 || + desc[i].size_bits % 8) { + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); + } + + if (desc[i].struct_size_bytes) + memcpy(buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + structure + desc[i].struct_offset_bytes, + desc[i].size_bits / 8); + else + memset(buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + 0, + desc[i].size_bits / 8); + } + } +} +EXPORT_SYMBOL(ib_pack); + +static void value_write(int offset, int size, u64 val, void *structure) +{ + switch (size * 8) { + case 8: *( u8 *) (structure + offset) = val; break; + case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break; + case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break; + case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break; + default: + pr_warn("Field size %d bits not handled\n", size * 8); + } +} + +/** + * ib_unpack - Unpack a buffer into a structure + * @desc:Array of structure field descriptions + * @desc_len:Number of entries in @desc + * @buf:Buffer to unpack from + * @structure:Structure to unpack into + * + * ib_pack() unpacks a list of structure fields from a buffer, + * controlled by the array of fields in @desc. 
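+ *
+ * Note that descriptor entries whose struct_size_bytes is zero
+ * (typically reserved fields) are skipped here, so only named fields
+ * are written back into @structure.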
+ */ +void ib_unpack(const struct ib_field *desc, + int desc_len, + void *buf, + void *structure) +{ + int i; + + for (i = 0; i < desc_len; ++i) { + if (!desc[i].struct_size_bytes) + continue; + + if (desc[i].size_bits <= 32) { + int shift; + u32 val; + u32 mask; + __be32 *addr; + + shift = 32 - desc[i].offset_bits - desc[i].size_bits; + mask = ((1ull << desc[i].size_bits) - 1) << shift; + addr = (__be32 *) buf + desc[i].offset_words; + val = (be32_to_cpup(addr) & mask) >> shift; + value_write(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + val, + structure); + } else if (desc[i].size_bits <= 64) { + int shift; + u64 val; + u64 mask; + __be64 *addr; + + shift = 64 - desc[i].offset_bits - desc[i].size_bits; + mask = (~0ull >> (64 - desc[i].size_bits)) << shift; + addr = (__be64 *) buf + desc[i].offset_words; + val = (be64_to_cpup(addr) & mask) >> shift; + value_write(desc[i].struct_offset_bytes, + desc[i].struct_size_bytes, + val, + structure); + } else { + if (desc[i].offset_bits % 8 || + desc[i].size_bits % 8) { + pr_warn("Structure field %s of size %d bits is not byte-aligned\n", + desc[i].field_name, desc[i].size_bits); + } + + memcpy(structure + desc[i].struct_offset_bytes, + buf + desc[i].offset_words * 4 + + desc[i].offset_bits / 8, + desc[i].size_bits / 8); + } + } +} +EXPORT_SYMBOL(ib_unpack); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/peer_mem.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/peer_mem.c new file mode 100644 index 0000000..95faf61 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/peer_mem.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2014-2020, Mellanox Technologies. All rights reserved. + */ + +#include +#include +#include +#include "ib_peer_mem.h" + +static DEFINE_MUTEX(peer_memory_mutex); +static LIST_HEAD(peer_memory_list); +static struct kobject *peers_kobj; +#define PEER_NO_INVALIDATION_ID U32_MAX + +static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context); + +struct peer_mem_attribute { + struct attribute attr; + ssize_t (*show)(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf); + ssize_t (*store)(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, const char *buf, + size_t count); +}; +#define PEER_ATTR_RO(_name) \ + struct peer_mem_attribute peer_attr_ ## _name = __ATTR_RO(_name) + +static ssize_t version_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", + ib_peer_client->peer_mem->version); +} +static PEER_ATTR_RO(version); + +static ssize_t num_alloc_mrs_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_alloc_mrs)); +} +static PEER_ATTR_RO(num_alloc_mrs); + +static ssize_t +num_dealloc_mrs_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) + +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_dealloc_mrs)); +} +static PEER_ATTR_RO(num_dealloc_mrs); + +static ssize_t num_reg_pages_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_reg_pages)); +} +static PEER_ATTR_RO(num_reg_pages); + +static ssize_t 
+num_dereg_pages_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_dereg_pages)); +} +static PEER_ATTR_RO(num_dereg_pages); + +static ssize_t num_reg_bytes_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_reg_bytes)); +} +static PEER_ATTR_RO(num_reg_bytes); + +static ssize_t +num_dereg_bytes_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf( + buf, PAGE_SIZE, "%llu\n", + (u64)atomic64_read(&ib_peer_client->stats.num_dereg_bytes)); +} +static PEER_ATTR_RO(num_dereg_bytes); + +static ssize_t +num_free_callbacks_show(struct ib_peer_memory_client *ib_peer_client, + struct peer_mem_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%lu\n", + ib_peer_client->stats.num_free_callbacks); +} +static PEER_ATTR_RO(num_free_callbacks); + +static struct attribute *peer_mem_attrs[] = { + &peer_attr_version.attr, + &peer_attr_num_alloc_mrs.attr, + &peer_attr_num_dealloc_mrs.attr, + &peer_attr_num_reg_pages.attr, + &peer_attr_num_dereg_pages.attr, + &peer_attr_num_reg_bytes.attr, + &peer_attr_num_dereg_bytes.attr, + &peer_attr_num_free_callbacks.attr, + NULL, +}; + +static const struct attribute_group peer_mem_attr_group = { + .attrs = peer_mem_attrs, +}; + +static ssize_t peer_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct peer_mem_attribute *peer_attr = + container_of(attr, struct peer_mem_attribute, attr); + + if (!peer_attr->show) + return -EIO; + return peer_attr->show(container_of(kobj, struct ib_peer_memory_client, + kobj), + peer_attr, buf); +} + +static const struct sysfs_ops peer_mem_sysfs_ops = { + .show = peer_attr_show, +}; + +static void ib_peer_memory_client_release(struct kobject *kobj) +{ + struct ib_peer_memory_client *ib_peer_client = + container_of(kobj, struct ib_peer_memory_client, kobj); + + kfree(ib_peer_client); +} + +static struct kobj_type peer_mem_type = { + .sysfs_ops = &peer_mem_sysfs_ops, + .release = ib_peer_memory_client_release, +}; + +static int ib_memory_peer_check_mandatory(const struct peer_memory_client + *peer_client) +{ +#define PEER_MEM_MANDATORY_FUNC(x) {offsetof(struct peer_memory_client, x), #x} + int i; + static const struct { + size_t offset; + char *name; + } mandatory_table[] = { + PEER_MEM_MANDATORY_FUNC(acquire), + PEER_MEM_MANDATORY_FUNC(get_pages), + PEER_MEM_MANDATORY_FUNC(put_pages), + PEER_MEM_MANDATORY_FUNC(dma_map), + PEER_MEM_MANDATORY_FUNC(dma_unmap), + }; + + for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { + if (!*(void **)((void *)peer_client + + mandatory_table[i].offset)) { + pr_err("Peer memory %s is missing mandatory function %s\n", + peer_client->name, mandatory_table[i].name); + return -EINVAL; + } + } + + return 0; +} + +void * +ib_register_peer_memory_client(const struct peer_memory_client *peer_client, + invalidate_peer_memory *invalidate_callback) +{ + struct ib_peer_memory_client *ib_peer_client; + int ret; + + if (ib_memory_peer_check_mandatory(peer_client)) + return NULL; + + ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL); + if (!ib_peer_client) + return NULL; + kobject_init(&ib_peer_client->kobj, &peer_mem_type); + refcount_set(&ib_peer_client->usecnt, 1); + init_completion(&ib_peer_client->usecnt_zero); + 
ib_peer_client->peer_mem = peer_client; + xa_init_flags(&ib_peer_client->umem_xa, XA_FLAGS_ALLOC); + + /* + * If the peer wants the invalidation_callback then all memory users + * linked to that peer must support invalidation. + */ + if (invalidate_callback) { + *invalidate_callback = ib_invalidate_peer_memory; + ib_peer_client->invalidation_required = true; + } + + mutex_lock(&peer_memory_mutex); + if (!peers_kobj) { + /* Created under /sys/kernel/mm */ + peers_kobj = kobject_create_and_add("memory_peers", mm_kobj); + if (!peers_kobj) + goto err_unlock; + } + + ret = kobject_add(&ib_peer_client->kobj, peers_kobj, peer_client->name); + if (ret) + goto err_parent; + + ret = sysfs_create_group(&ib_peer_client->kobj, + &peer_mem_attr_group); + if (ret) + goto err_parent; + list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list); + mutex_unlock(&peer_memory_mutex); + return ib_peer_client; + +err_parent: + if (list_empty(&peer_memory_list)) { + kobject_put(peers_kobj); + peers_kobj = NULL; + } +err_unlock: + mutex_unlock(&peer_memory_mutex); + kobject_put(&ib_peer_client->kobj); + return NULL; +} +EXPORT_SYMBOL(ib_register_peer_memory_client); + +void ib_unregister_peer_memory_client(void *reg_handle) +{ + struct ib_peer_memory_client *ib_peer_client = reg_handle; + + mutex_lock(&peer_memory_mutex); + list_del(&ib_peer_client->core_peer_list); + if (list_empty(&peer_memory_list)) { + kobject_put(peers_kobj); + peers_kobj = NULL; + } + mutex_unlock(&peer_memory_mutex); + + /* + * Wait for all umems to be destroyed before returning. Once + * ib_unregister_peer_memory_client() returns no umems will call any + * peer_mem ops. + */ + if (refcount_dec_and_test(&ib_peer_client->usecnt)) + complete(&ib_peer_client->usecnt_zero); + wait_for_completion(&ib_peer_client->usecnt_zero); + + kobject_put(&ib_peer_client->kobj); +} +EXPORT_SYMBOL(ib_unregister_peer_memory_client); + +static struct ib_peer_memory_client * +ib_get_peer_client(unsigned long addr, size_t size, + unsigned long peer_mem_flags, void **peer_client_context) +{ + struct ib_peer_memory_client *ib_peer_client; + int ret = 0; + + mutex_lock(&peer_memory_mutex); + list_for_each_entry(ib_peer_client, &peer_memory_list, + core_peer_list) { + if (ib_peer_client->invalidation_required && + (!(peer_mem_flags & IB_PEER_MEM_INVAL_SUPP))) + continue; + ret = ib_peer_client->peer_mem->acquire(addr, size, NULL, NULL, + peer_client_context); + if (ret > 0) { + refcount_inc(&ib_peer_client->usecnt); + mutex_unlock(&peer_memory_mutex); + return ib_peer_client; + } + } + mutex_unlock(&peer_memory_mutex); + return NULL; +} + +static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client, + void *peer_client_context) +{ + if (ib_peer_client->peer_mem->release) + ib_peer_client->peer_mem->release(peer_client_context); + if (refcount_dec_and_test(&ib_peer_client->usecnt)) + complete(&ib_peer_client->usecnt_zero); +} + +static void ib_peer_umem_kref_release(struct kref *kref) +{ + struct ib_umem_peer *umem_p = + container_of(kref, struct ib_umem_peer, kref); + + mutex_destroy(&umem_p->mapping_lock); + kfree(umem_p); +} + +static void ib_unmap_peer_client(struct ib_umem_peer *umem_p, + enum ib_umem_mapped_state cur_state, + enum ib_umem_mapped_state to_state) +{ + struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client; + const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem; + struct ib_umem *umem = &umem_p->umem; + + if (cur_state == UMEM_PEER_MAPPED && + (to_state == UMEM_PEER_UNMAPPED || + to_state == 
UMEM_PEER_INVALIDATED)) { + /* + * In the invalidated state we will never touch the sg again, + * but the client might, so fix it anyhow. + */ + if (umem_p->last_sg) { + umem_p->last_sg->length = umem_p->last_length; + sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length; + } + + if (umem_p->first_sg) { + umem_p->first_sg->dma_address = + umem_p->first_dma_address; + umem_p->first_sg->length = umem_p->first_length; + sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length; + } + + if (to_state == UMEM_PEER_UNMAPPED) { + peer_mem->dma_unmap(&umem_p->umem.sgt_append.sgt, + umem_p->peer_client_context, + umem_p->umem.ibdev->dma_device); + peer_mem->put_pages(&umem_p->umem.sgt_append.sgt, + umem_p->peer_client_context); + } + + memset(&umem->sgt_append.sgt, 0, sizeof(umem->sgt_append.sgt)); + atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs); + } + + if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) || + (cur_state == UMEM_PEER_INVALIDATED && + to_state == UMEM_PEER_UNMAPPED)) { + atomic64_add(umem->sgt_append.sgt.nents, + &ib_peer_client->stats.num_dereg_pages); + atomic64_add(umem->length, + &ib_peer_client->stats.num_dereg_bytes); + } + umem_p->mapped_state = to_state; +} + +/* + * True if the client should do unmap itself after the invalidate callback + * returns. Clients operating in this mode need to use this locking pattern: + * + * client_invalidate: + * mutex_lock(&client_lock) + * invalidate_callback(): + * mutex_lock(mapping_lock) + * mutex_unlock(mapping_lock) + * client_dma_unmap() + * client_put_pages() + * mutex_unlock(&client_lock) + * + * ib_umem_stop_invalidation_notifier(): + * mutex_lock(mapping_lock) + * mutex_unlock(mapping_lock) + * peer_mem->dma_unmap(): + * mutex_lock(&client_lock) + * client_dma_unmap() + * mutex_unlock(&client_lock) + * peer_mem->put_pages(): + * mutex_lock(&client_lock) + * client_put_pages() + * mutex_unlock(&client_lock) + * + * ib_peer_umem_release(): + * peer_mem->release(): + * mutex_lock(&client_lock) + * mutex_unlock(&client_lock) + * + * Noting that dma_unmap/put_pages can be called even though invalidate has + * already done the unmap, and release() can be called concurrently with + * invalidate. The client must protect itself against these races. + */ +static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p) +{ + const struct peer_memory_client *peer_mem = + umem_p->ib_peer_client->peer_mem; + const struct peer_memory_client_ex *peer_mem_ex; + + if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0) + return false; + peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex, + client); + if (peer_mem_ex->ex_size < + offsetofend(struct peer_memory_client_ex, flags)) + return false; + return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS; +} + +static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context) +{ + struct ib_peer_memory_client *ib_peer_client = reg_handle; + struct ib_umem_peer *umem_p; + + /* + * The client is not required to fence against invalidation during + * put_pages() as that would deadlock when we call put_pages() here. + * Thus the core_context cannot be a umem pointer as we have no control + * over the lifetime. Since we won't change the kABI for this to add a + * proper kref, an xarray is used. 
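+ *
+ * From the client side the invalidation is expected to look roughly like
+ * this (sketch only; my_client stands for the client's own struct
+ * peer_memory_client, reg is the handle returned by
+ * ib_register_peer_memory_client() and core_context is the value the core
+ * passed to get_pages()):
+ *
+ *	invalidate_peer_memory inval_cb;
+ *	void *reg = ib_register_peer_memory_client(&my_client, &inval_cb);
+ *	...
+ *	inval_cb(reg, core_context);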
+ */ + xa_lock(&ib_peer_client->umem_xa); + ib_peer_client->stats.num_free_callbacks += 1; + umem_p = xa_load(&ib_peer_client->umem_xa, core_context); + if (!umem_p) + goto out_unlock; + kref_get(&umem_p->kref); + xa_unlock(&ib_peer_client->umem_xa); + + mutex_lock(&umem_p->mapping_lock); + /* + * For flows that require invalidation the invalidation_func should not + * be NULL while the device can be doing DMA. The mapping_lock ensures + * that the device is ready to receive an invalidation before one is + * triggered here. + */ + if (umem_p->mapped_state == UMEM_PEER_MAPPED && + umem_p->invalidation_func) + umem_p->invalidation_func(&umem_p->umem, + umem_p->invalidation_private); + if (ib_peer_unmap_on_invalidate(umem_p)) + ib_unmap_peer_client(umem_p, umem_p->mapped_state, + UMEM_PEER_INVALIDATED); + else + ib_unmap_peer_client(umem_p, umem_p->mapped_state, + UMEM_PEER_UNMAPPED); + mutex_unlock(&umem_p->mapping_lock); + kref_put(&umem_p->kref, ib_peer_umem_kref_release); + return 0; + +out_unlock: + xa_unlock(&ib_peer_client->umem_xa); + return 0; +} + +void ib_umem_activate_invalidation_notifier(struct ib_umem *umem, + umem_invalidate_func_t func, + void *priv) +{ + struct ib_umem_peer *umem_p = + container_of(umem, struct ib_umem_peer, umem); + + if (WARN_ON(!umem->is_peer)) + return; + if (umem_p->xa_id == PEER_NO_INVALIDATION_ID) + return; + + umem_p->invalidation_func = func; + umem_p->invalidation_private = priv; + /* Pairs with the lock in ib_peer_umem_get() */ + mutex_unlock(&umem_p->mapping_lock); + + /* At this point func can be called asynchronously */ +} +EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); + +/* + * Caller has blocked DMA and will no longer be able to handle invalidate + * callbacks. Callers using invalidation must call this function before calling + * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional + * before doing this. 
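+ *
+ * A typical teardown therefore looks like (sketch):
+ *
+ *	...fence the HW so no further DMA can touch the umem...
+ *	ib_umem_stop_invalidation_notifier(umem);
+ *	ib_umem_release(umem);	expected to end in ib_peer_umem_release()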
+ */ +void ib_umem_stop_invalidation_notifier(struct ib_umem *umem) +{ + struct ib_umem_peer *umem_p = + container_of(umem, struct ib_umem_peer, umem); + bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p); + enum ib_umem_mapped_state cur_state; + + if (umem_p->invalidation_func) { + mutex_lock(&umem_p->mapping_lock); + umem_p->invalidation_func = NULL; + } else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) { + mutex_lock(&umem_p->mapping_lock); + } else { + /* + * Haven't called ib_umem_activate_invalidation_notifier() yet, + * still have the lock + */ + } + + if (!unmap_on_invalidate) { + ib_unmap_peer_client(umem_p, umem_p->mapped_state, + UMEM_PEER_UNMAPPED); + } else { + /* Block ib_invalidate_peer_memory() */ + cur_state = umem_p->mapped_state; + umem_p->mapped_state = UMEM_PEER_UNMAPPED; + } + mutex_unlock(&umem_p->mapping_lock); + + if (unmap_on_invalidate) + ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED); + +} +EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier); + +static void fix_peer_sgls(struct ib_umem_peer *umem_p, + unsigned long peer_page_size) +{ + struct ib_umem *umem = &umem_p->umem; + struct scatterlist *sg; + int i; + + for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { + if (i == 0) { + unsigned long offset; + + umem_p->first_sg = sg; + umem_p->first_dma_address = sg->dma_address; + umem_p->first_dma_length = sg_dma_len(sg); + umem_p->first_length = sg->length; + + offset = ALIGN_DOWN(umem->address, PAGE_SIZE) - + ALIGN_DOWN(umem->address, peer_page_size); + sg->dma_address += offset; + sg_dma_len(sg) -= offset; + sg->length -= offset; + } + + if (i == umem_p->umem.sgt_append.sgt.nents - 1) { + unsigned long trim; + + umem_p->last_sg = sg; + umem_p->last_dma_length = sg_dma_len(sg); + umem_p->last_length = sg->length; + + trim = ALIGN(umem->address + umem->length, + peer_page_size) - + ALIGN(umem->address + umem->length, PAGE_SIZE); + sg_dma_len(sg) -= trim; + sg->length -= trim; + } + } +} + +struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret, + unsigned long peer_mem_flags) +{ + struct ib_peer_memory_client *ib_peer_client; + unsigned long peer_page_size; + void *peer_client_context; + struct ib_umem_peer *umem_p; + int ret; + + ib_peer_client = + ib_get_peer_client(old_umem->address, old_umem->length, + peer_mem_flags, &peer_client_context); + if (!ib_peer_client) + return ERR_PTR(old_ret); + + umem_p = kzalloc(sizeof(*umem_p), GFP_KERNEL); + if (!umem_p) { + ret = -ENOMEM; + goto err_client; + } + + kref_init(&umem_p->kref); + umem_p->umem = *old_umem; + memset(&umem_p->umem.sgt_append.sgt, 0, sizeof(umem_p->umem.sgt_append.sgt)); + umem_p->umem.is_peer = 1; + umem_p->ib_peer_client = ib_peer_client; + umem_p->peer_client_context = peer_client_context; + mutex_init(&umem_p->mapping_lock); + umem_p->xa_id = PEER_NO_INVALIDATION_ID; + + mutex_lock(&umem_p->mapping_lock); + if (ib_peer_client->invalidation_required) { + ret = xa_alloc_cyclic(&ib_peer_client->umem_xa, &umem_p->xa_id, + umem_p, + XA_LIMIT(0, PEER_NO_INVALIDATION_ID - 1), + &ib_peer_client->xa_cyclic_next, + GFP_KERNEL); + if (ret < 0) + goto err_umem; + } + + /* + * We always request write permissions to the pages, to force breaking + * of any CoW during the registration of the MR. For read-only MRs we + * use the "force" flag to indicate that CoW breaking is required but + * the registration should not fail if referencing read-only areas. 
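+ *
+ * In terms of the get_pages() call below this means: write is always 1,
+ * while force is set only when the umem itself is not writable.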
+ */ + ret = ib_peer_client->peer_mem->get_pages(umem_p->umem.address, + umem_p->umem.length, 1, + !umem_p->umem.writable, NULL, + peer_client_context, + umem_p->xa_id); + if (ret) + goto err_xa; + + ret = ib_peer_client->peer_mem->dma_map(&umem_p->umem.sgt_append.sgt, + peer_client_context, + umem_p->umem.ibdev->dma_device, + 0, &umem_p->umem.sgt_append.sgt.nents); + if (ret) + goto err_pages; + + peer_page_size = + ib_peer_client->peer_mem->get_page_size(peer_client_context); + if (peer_page_size != PAGE_SIZE) + fix_peer_sgls(umem_p, peer_page_size); + + umem_p->mapped_state = UMEM_PEER_MAPPED; + atomic64_add(umem_p->umem.sgt_append.sgt.nents, &ib_peer_client->stats.num_reg_pages); + atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes); + atomic64_inc(&ib_peer_client->stats.num_alloc_mrs); + + /* + * If invalidation is allowed then the caller must call + * ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to + * unlock this mutex. This call should be done after the last read to + * sg_head, once the caller is ready for the invalidation function to be + * called. + */ + if (umem_p->xa_id == PEER_NO_INVALIDATION_ID) + mutex_unlock(&umem_p->mapping_lock); + + /* + * On success the old umem is replaced with the new, larger, allocation + */ + kfree(old_umem); + return &umem_p->umem; + +err_pages: + ib_peer_client->peer_mem->put_pages(&umem_p->umem.sgt_append.sgt, + umem_p->peer_client_context); +err_xa: + if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) + xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id); +err_umem: + mutex_unlock(&umem_p->mapping_lock); + kref_put(&umem_p->kref, ib_peer_umem_kref_release); +err_client: + ib_put_peer_client(ib_peer_client, peer_client_context); + return ERR_PTR(ret); +} + +void ib_peer_umem_release(struct ib_umem *umem) +{ + struct ib_umem_peer *umem_p = + container_of(umem, struct ib_umem_peer, umem); + + /* + * If ib_umem_activate_invalidation_notifier() is called then + * ib_umem_stop_invalidation_notifier() must be called before release. + */ + WARN_ON(umem_p->invalidation_func); + + /* For no invalidation cases, make sure it is unmapped */ + ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED); + + if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) + xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id); + ib_put_peer_client(umem_p->ib_peer_client, umem_p->peer_client_context); + umem_p->ib_peer_client = NULL; + + /* Must match ib_umem_release() */ + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); + mmdrop(umem->owning_mm); + + kref_put(&umem_p->kref, ib_peer_umem_kref_release); +} + +/* Use it like this: +struct peer_memory_client_ex peer_memory_test = { + .client = { + .version = "1.0", + .version[IB_PEER_MEMORY_VER_MAX-1] = 1, + }, + .ex_size = sizeof(struct peer_memory_client_ex), + .flags = PEER_MEM_INVALIDATE_UNMAPS, +}; +*/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.c new file mode 100644 index 0000000..94d83b6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.c @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2016, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "uverbs.h" +#include "core_priv.h" +#include "rdma_core.h" + +static void uverbs_uobject_free(struct kref *ref) +{ + kfree_rcu(container_of(ref, struct ib_uobject, ref), rcu); +} + +/* + * In order to indicate we no longer needs this uobject, uverbs_uobject_put + * is called. When the reference count is decreased, the uobject is freed. + * For example, this is used when attaching a completion channel to a CQ. + */ +void uverbs_uobject_put(struct ib_uobject *uobject) +{ + kref_put(&uobject->ref, uverbs_uobject_free); +} +EXPORT_SYMBOL(uverbs_uobject_put); + +static int uverbs_try_lock_object(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) +{ + /* + * When a shared access is required, we use a positive counter. Each + * shared access request checks that the value != -1 and increment it. + * Exclusive access is required for operations like write or destroy. + * In exclusive access mode, we check that the counter is zero (nobody + * claimed this object) and we set it to -1. Releasing a shared access + * lock is done simply by decreasing the counter. As for exclusive + * access locks, since only a single one of them is is allowed + * concurrently, setting the counter to zero is enough for releasing + * this lock. + */ + switch (mode) { + case UVERBS_LOOKUP_READ: + return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ? + -EBUSY : 0; + case UVERBS_LOOKUP_WRITE: + /* lock is exclusive */ + return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY; + case UVERBS_LOOKUP_DESTROY: + return 0; + } + return 0; +} + +static void assert_uverbs_usecnt(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) +{ +#ifdef CONFIG_LOCKDEP + switch (mode) { + case UVERBS_LOOKUP_READ: + WARN_ON(atomic_read(&uobj->usecnt) <= 0); + break; + case UVERBS_LOOKUP_WRITE: + WARN_ON(atomic_read(&uobj->usecnt) != -1); + break; + case UVERBS_LOOKUP_DESTROY: + break; + } +#endif +} + +/* + * This must be called with the hw_destroy_rwsem locked for read or write, + * also the uobject itself must be locked for write. + * + * Upon return the HW object is guaranteed to be destroyed. 
+ * + * For RDMA_REMOVE_ABORT, the hw_destroy_rwsem is not required to be held, + * however the type's allocat_commit function cannot have been called and the + * uobject cannot be on the uobjects_lists + * + * For RDMA_REMOVE_DESTROY the caller should be holding a kref (eg via + * rdma_lookup_get_uobject) and the object is left in a state where the caller + * needs to call rdma_lookup_put_uobject. + * + * For all other destroy modes this function internally unlocks the uobject + * and consumes the kref on the uobj. + */ +static int uverbs_destroy_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason reason, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + unsigned long flags; + int ret; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE); + + if (reason == RDMA_REMOVE_ABORT) { + WARN_ON(!list_empty(&uobj->list)); + WARN_ON(!uobj->context); + uobj->uapi_object->type_class->alloc_abort(uobj); + } else if (uobj->object) { + ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason, + attrs); + if (ret) + /* Nothing to be done, wait till ucontext will clean it */ + return ret; + + uobj->object = NULL; + } + + uobj->context = NULL; + + /* + * For DESTROY the usecnt is not changed, the caller is expected to + * manage it via uobj_put_destroy(). Only DESTROY can remove the IDR + * handle. + */ + if (reason != RDMA_REMOVE_DESTROY) + atomic_set(&uobj->usecnt, 0); + else + uobj->uapi_object->type_class->remove_handle(uobj); + + if (!list_empty(&uobj->list)) { + spin_lock_irqsave(&ufile->uobjects_lock, flags); + list_del_init(&uobj->list); + spin_unlock_irqrestore(&ufile->uobjects_lock, flags); + + /* + * Pairs with the get in rdma_alloc_commit_uobject(), could + * destroy uobj. + */ + uverbs_uobject_put(uobj); + } + + /* + * When aborting the stack kref remains owned by the core code, and is + * not transferred into the type. Pairs with the get in alloc_uobj + */ + if (reason == RDMA_REMOVE_ABORT) + uverbs_uobject_put(uobj); + + return 0; +} + +/* + * This calls uverbs_destroy_uobject() using the RDMA_REMOVE_DESTROY + * sequence. It should only be used from command callbacks. On success the + * caller must pair this with uobj_put_destroy(). This + * version requires the caller to have already obtained an + * LOOKUP_DESTROY uobject kref. + */ +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + int ret; + + down_read(&ufile->hw_destroy_rwsem); + + /* + * Once the uobject is destroyed by RDMA_REMOVE_DESTROY then it is left + * write locked as the callers put it back with UVERBS_LOOKUP_DESTROY. + * This is because any other concurrent thread can still see the object + * in the xarray due to RCU. Leaving it locked ensures nothing else will + * touch it. + */ + ret = uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE); + if (ret) + goto out_unlock; + + ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs); + if (ret) { + atomic_set(&uobj->usecnt, 0); + goto out_unlock; + } + +out_unlock: + up_read(&ufile->hw_destroy_rwsem); + return ret; +} + +/* + * uobj_get_destroy destroys the HW object and returns a handle to the uobj + * with a NULL object pointer. The caller must pair this with + * uobj_put_destroy(). 
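+ *
+ * Typical use (sketch):
+ *
+ *	uobj = __uobj_get_destroy(obj, id, attrs);
+ *	if (IS_ERR(uobj))
+ *		return PTR_ERR(uobj);
+ *	...copy out anything still needed from uobj...
+ *	uobj_put_destroy(uobj);
+ *
+ * __uobj_perform_destroy() below wraps this sequence for callers that do
+ * not need to touch the uobject at all.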
+ */ +struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, + u32 id, struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj; + int ret; + + uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_DESTROY, attrs); + if (IS_ERR(uobj)) + return uobj; + + ret = uobj_destroy(uobj, attrs); + if (ret) { + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY); + return ERR_PTR(ret); + } + + return uobj; +} + +/* + * Does both uobj_get_destroy() and uobj_put_destroy(). Returns 0 on success + * (negative errno on failure). For use by callers that do not need the uobj. + */ +int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj; + + uobj = __uobj_get_destroy(obj, id, attrs); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + uobj_put_destroy(uobj); + return 0; +} + +/* alloc_uobj must be undone by uverbs_destroy_uobject() */ +static struct ib_uobject *alloc_uobj(struct uverbs_attr_bundle *attrs, + const struct uverbs_api_object *obj) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_uobject *uobj; + + if (!attrs->context) { + struct ib_ucontext *ucontext = + ib_uverbs_get_ucontext_file(ufile); + + if (IS_ERR(ucontext)) + return ERR_CAST(ucontext); + attrs->context = ucontext; + } + + uobj = kzalloc(obj->type_attrs->obj_size, GFP_KERNEL); + if (!uobj) + return ERR_PTR(-ENOMEM); + /* + * user_handle should be filled by the handler, + * The object is added to the list in the commit stage. + */ + uobj->ufile = ufile; + uobj->context = attrs->context; + INIT_LIST_HEAD(&uobj->list); + uobj->uapi_object = obj; + /* + * Allocated objects start out as write locked to deny any other + * syscalls from accessing them until they are committed. See + * rdma_alloc_commit_uobject + */ + atomic_set(&uobj->usecnt, -1); + kref_init(&uobj->ref); + + return uobj; +} + +static int idr_add_uobj(struct ib_uobject *uobj) +{ + /* + * We start with allocating an idr pointing to NULL. This represents an + * object which isn't initialized yet. We'll replace it later on with + * the real object once we commit. + */ + return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b, + GFP_KERNEL); +} + +/* Returns the ib_uobject or an error. The caller should check for IS_ERR. */ +static struct ib_uobject * +lookup_get_idr_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile, s64 id, + enum rdma_lookup_mode mode) +{ + struct ib_uobject *uobj; + + if (id < 0 || id > ULONG_MAX) + return ERR_PTR(-EINVAL); + + rcu_read_lock(); + /* + * The idr_find is guaranteed to return a pointer to something that + * isn't freed yet, or NULL, as the free after idr_remove goes through + * kfree_rcu(). However the object may still have been released and + * kfree() could be called at any time. 
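+ *
+ * That is why the lookup below pairs xa_load() with
+ * kref_get_unless_zero(): an entry whose refcount has already dropped to
+ * zero is treated as not found.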
+ */ + uobj = xa_load(&ufile->idr, id); + if (!uobj || !kref_get_unless_zero(&uobj->ref)) + uobj = ERR_PTR(-ENOENT); + rcu_read_unlock(); + return uobj; +} + +static struct ib_uobject * +lookup_get_fd_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile, s64 id, + enum rdma_lookup_mode mode) +{ + const struct uverbs_obj_fd_type *fd_type; + struct file *f; + struct ib_uobject *uobject; + int fdno = id; + + if (fdno != id) + return ERR_PTR(-EINVAL); + + if (mode != UVERBS_LOOKUP_READ) + return ERR_PTR(-EOPNOTSUPP); + + if (!obj->type_attrs) + return ERR_PTR(-EIO); + fd_type = + container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); + + f = fget(fdno); + if (!f) + return ERR_PTR(-EBADF); + + uobject = f->private_data; + /* + * fget(id) ensures we are not currently running + * uverbs_uobject_fd_release(), and the caller is expected to ensure + * that release is never done while a call to lookup is possible. + */ + if (f->f_op != fd_type->fops || uobject->ufile != ufile) { + fput(f); + return ERR_PTR(-EBADF); + } + + uverbs_uobject_get(uobject); + return uobject; +} + +struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile, s64 id, + enum rdma_lookup_mode mode, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj; + int ret; + + if (obj == ERR_PTR(-ENOMSG)) { + /* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */ + uobj = lookup_get_idr_uobject(NULL, ufile, id, mode); + if (IS_ERR(uobj)) + return uobj; + } else { + if (IS_ERR(obj)) + return ERR_PTR(-EINVAL); + + uobj = obj->type_class->lookup_get(obj, ufile, id, mode); + if (IS_ERR(uobj)) + return uobj; + + if (uobj->uapi_object != obj) { + ret = -EINVAL; + goto free; + } + } + + /* + * If we have been disassociated block every command except for + * DESTROY based commands. 
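+ *
+ * A disassociated ufile is detected by srcu_dereference() of ib_dev
+ * returning NULL below; everything except UVERBS_LOOKUP_DESTROY then
+ * fails with -EIO so userspace can still tear its objects down.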
+ */ + if (mode != UVERBS_LOOKUP_DESTROY && + !srcu_dereference(ufile->device->ib_dev, + &ufile->device->disassociate_srcu)) { + ret = -EIO; + goto free; + } + + ret = uverbs_try_lock_object(uobj, mode); + if (ret) + goto free; + if (attrs) + attrs->context = uobj->context; + + return uobj; +free: + uobj->uapi_object->type_class->lookup_put(uobj, mode); + uverbs_uobject_put(uobj); + return ERR_PTR(ret); +} + +static struct ib_uobject * +alloc_begin_idr_uobject(const struct uverbs_api_object *obj, + struct uverbs_attr_bundle *attrs) +{ + int ret; + struct ib_uobject *uobj; + + uobj = alloc_uobj(attrs, obj); + if (IS_ERR(uobj)) + return uobj; + + ret = idr_add_uobj(uobj); + if (ret) + goto uobj_put; + + ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto remove; + + return uobj; + +remove: + xa_erase(&attrs->ufile->idr, uobj->id); +uobj_put: + uverbs_uobject_put(uobj); + return ERR_PTR(ret); +} + +static struct ib_uobject * +alloc_begin_fd_uobject(const struct uverbs_api_object *obj, + struct uverbs_attr_bundle *attrs) +{ + const struct uverbs_obj_fd_type *fd_type; + int new_fd; + struct ib_uobject *uobj, *ret; + struct file *filp; + + uobj = alloc_uobj(attrs, obj); + if (IS_ERR(uobj)) + return uobj; + + fd_type = + container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); + if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release && + fd_type->fops->release != &uverbs_async_event_release)) { + ret = ERR_PTR(-EINVAL); + goto err_fd; + } + + new_fd = get_unused_fd_flags(O_CLOEXEC); + if (new_fd < 0) { + ret = ERR_PTR(new_fd); + goto err_fd; + } + + /* Note that uverbs_uobject_fd_release() is called during abort */ + filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, + fd_type->flags); + if (IS_ERR(filp)) { + ret = ERR_CAST(filp); + goto err_getfile; + } + uobj->object = filp; + + uobj->id = new_fd; + return uobj; + +err_getfile: + put_unused_fd(new_fd); +err_fd: + uverbs_uobject_put(uobj); + return ret; +} + +struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_uobject *ret; + + if (IS_ERR(obj)) + return ERR_PTR(-EINVAL); + + /* + * The hw_destroy_rwsem is held across the entire object creation and + * released during rdma_alloc_commit_uobject or + * rdma_alloc_abort_uobject + */ + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + return ERR_PTR(-EIO); + + ret = obj->type_class->alloc_begin(obj, attrs); + if (IS_ERR(ret)) { + up_read(&ufile->hw_destroy_rwsem); + return ret; + } + return ret; +} + +static void alloc_abort_idr_uobject(struct ib_uobject *uobj) +{ + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + + xa_erase(&uobj->ufile->idr, uobj->id); +} + +static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + const struct uverbs_obj_idr_type *idr_type = + container_of(uobj->uapi_object->type_attrs, + struct uverbs_obj_idr_type, type); + int ret = idr_type->destroy_object(uobj, why, attrs); + + if (ret) + return ret; + + if (why == RDMA_REMOVE_ABORT) + return 0; + + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + + return 0; +} + +static void remove_handle_idr_uobject(struct ib_uobject *uobj) +{ + xa_erase(&uobj->ufile->idr, uobj->id); + /* Matches the kref in alloc_commit_idr_uobject */ + 
uverbs_uobject_put(uobj); +} + +static void alloc_abort_fd_uobject(struct ib_uobject *uobj) +{ + struct file *filp = uobj->object; + + fput(filp); + put_unused_fd(uobj->id); +} + +static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + const struct uverbs_obj_fd_type *fd_type = container_of( + uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); + + fd_type->destroy_object(uobj, why); + return 0; +} + +static void remove_handle_fd_uobject(struct ib_uobject *uobj) +{ +} + +static void alloc_commit_idr_uobject(struct ib_uobject *uobj) +{ + struct ib_uverbs_file *ufile = uobj->ufile; + void *old; + + /* + * We already allocated this IDR with a NULL object, so + * this shouldn't fail. + * + * NOTE: Storing the uobj transfers our kref on uobj to the XArray. + * It will be put by remove_commit_idr_uobject() + */ + old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL); + WARN_ON(old != NULL); +} + +static void swap_idr_uobjects(struct ib_uobject *obj_old, + struct ib_uobject *obj_new) +{ + struct ib_uverbs_file *ufile = obj_old->ufile; + void *old; + + /* + * New must be an object that been allocated but not yet committed, this + * moves the pre-committed state to obj_old, new still must be comitted. + */ + old = xa_cmpxchg(&ufile->idr, obj_old->id, obj_old, XA_ZERO_ENTRY, + GFP_KERNEL); + if (WARN_ON(old != obj_old)) + return; + + swap(obj_old->id, obj_new->id); + + old = xa_cmpxchg(&ufile->idr, obj_old->id, NULL, obj_old, GFP_KERNEL); + WARN_ON(old != NULL); +} + +static void alloc_commit_fd_uobject(struct ib_uobject *uobj) +{ + int fd = uobj->id; + struct file *filp = uobj->object; + + /* Matching put will be done in uverbs_uobject_fd_release() */ + kref_get(&uobj->ufile->ref); + + /* This shouldn't be used anymore. Use the file object instead */ + uobj->id = 0; + + /* + * NOTE: Once we install the file we loose ownership of our kref on + * uobj. It will be put by uverbs_uobject_fd_release() + */ + filp->private_data = uobj; + fd_install(fd, filp); +} + +/* + * In all cases rdma_alloc_commit_uobject() consumes the kref to uobj and the + * caller can no longer assume uobj is valid. If this function fails it + * destroys the uboject, including the attached HW object. + */ +void rdma_alloc_commit_uobject(struct ib_uobject *uobj, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + + /* kref is held so long as the uobj is on the uobj list. */ + uverbs_uobject_get(uobj); + spin_lock_irq(&ufile->uobjects_lock); + list_add(&uobj->list, &ufile->uobjects); + spin_unlock_irq(&ufile->uobjects_lock); + + /* matches atomic_set(-1) in alloc_uobj */ + atomic_set(&uobj->usecnt, 0); + + /* alloc_commit consumes the uobj kref */ + uobj->uapi_object->type_class->alloc_commit(uobj); + + /* Matches the down_read in rdma_alloc_begin_uobject */ + up_read(&ufile->hw_destroy_rwsem); +} + +/* + * new_uobj will be assigned to the handle currently used by to_uobj, and + * to_uobj will be destroyed. + * + * Upon return the caller must do: + * rdma_alloc_commit_uobject(new_uobj) + * uobj_put_destroy(to_uobj) + * + * to_uobj must have a write get but the put mode switches to destroy once + * this is called. 
+ */ +void rdma_assign_uobject(struct ib_uobject *to_uobj, struct ib_uobject *new_uobj, + struct uverbs_attr_bundle *attrs) +{ + assert_uverbs_usecnt(new_uobj, UVERBS_LOOKUP_WRITE); + + if (WARN_ON(to_uobj->uapi_object != new_uobj->uapi_object || + !to_uobj->uapi_object->type_class->swap_uobjects)) + return; + + to_uobj->uapi_object->type_class->swap_uobjects(to_uobj, new_uobj); + + /* + * If this fails then the uobject is still completely valid (though with + * a new ID) and we leak it until context close. + */ + uverbs_destroy_uobject(to_uobj, RDMA_REMOVE_DESTROY, attrs); +} + +/* + * This consumes the kref for uobj. It is up to the caller to unwind the HW + * object and anything else connected to uobj before calling this. + */ +void rdma_alloc_abort_uobject(struct ib_uobject *uobj, + struct uverbs_attr_bundle *attrs, + bool hw_obj_valid) +{ + struct ib_uverbs_file *ufile = uobj->ufile; + int ret; + + if (hw_obj_valid) { + ret = uobj->uapi_object->type_class->destroy_hw( + uobj, RDMA_REMOVE_ABORT, attrs); + /* + * If the driver couldn't destroy the object then go ahead and + * commit it. Leaking objects that can't be destroyed is only + * done during FD close after the driver has a few more tries to + * destroy it. + */ + if (WARN_ON(ret)) + return rdma_alloc_commit_uobject(uobj, attrs); + } + + uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs); + + /* Matches the down_read in rdma_alloc_begin_uobject */ + up_read(&ufile->hw_destroy_rwsem); +} + +static void lookup_put_idr_uobject(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) +{ +} + +static void lookup_put_fd_uobject(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) +{ + struct file *filp = uobj->object; + + WARN_ON(mode != UVERBS_LOOKUP_READ); + /* + * This indirectly calls uverbs_uobject_fd_release() and free the + * object + */ + fput(filp); +} + +void rdma_lookup_put_uobject(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) +{ + assert_uverbs_usecnt(uobj, mode); + /* + * In order to unlock an object, either decrease its usecnt for + * read access or zero it in case of exclusive access. See + * uverbs_try_lock_object for locking schema information. + */ + switch (mode) { + case UVERBS_LOOKUP_READ: + atomic_dec(&uobj->usecnt); + break; + case UVERBS_LOOKUP_WRITE: + atomic_set(&uobj->usecnt, 0); + break; + case UVERBS_LOOKUP_DESTROY: + break; + } + + uobj->uapi_object->type_class->lookup_put(uobj, mode); + /* Pairs with the kref obtained by type->lookup_get */ + uverbs_uobject_put(uobj); +} + +void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile) +{ + xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC); +} + +void release_ufile_idr_uobject(struct ib_uverbs_file *ufile) +{ + struct ib_uobject *entry; + unsigned long id; + + /* + * At this point uverbs_cleanup_ufile() is guaranteed to have run, and + * there are no HW objects left, however the xarray is still populated + * with anything that has not been cleaned up by userspace. Since the + * kref on ufile is 0, nothing is allowed to call lookup_get. 
+ * + * This is an optimized equivalent to remove_handle_idr_uobject + */ + xa_for_each(&ufile->idr, id, entry) { + WARN_ON(entry->object); + uverbs_uobject_put(entry); + } + + xa_destroy(&ufile->idr); +} + +const struct uverbs_obj_type_class uverbs_idr_class = { + .alloc_begin = alloc_begin_idr_uobject, + .lookup_get = lookup_get_idr_uobject, + .alloc_commit = alloc_commit_idr_uobject, + .alloc_abort = alloc_abort_idr_uobject, + .lookup_put = lookup_put_idr_uobject, + .destroy_hw = destroy_hw_idr_uobject, + .remove_handle = remove_handle_idr_uobject, + .swap_uobjects = swap_idr_uobjects, +}; +EXPORT_SYMBOL(uverbs_idr_class); + +/* + * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct + * file_operations release method. + */ +int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_file *ufile; + struct ib_uobject *uobj; + + /* + * This can only happen if the fput came from alloc_abort_fd_uobject() + */ + if (!filp->private_data) + return 0; + uobj = filp->private_data; + ufile = uobj->ufile; + + if (down_read_trylock(&ufile->hw_destroy_rwsem)) { + struct uverbs_attr_bundle attrs = { + .context = uobj->context, + .ufile = ufile, + }; + + /* + * lookup_get_fd_uobject holds the kref on the struct file any + * time a FD uobj is locked, which prevents this release + * method from being invoked. Meaning we can always get the + * write lock here, or we have a kernel bug. + */ + WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE)); + uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs); + up_read(&ufile->hw_destroy_rwsem); + } + + /* Matches the get in alloc_commit_fd_uobject() */ + kref_put(&ufile->ref, ib_uverbs_release_file); + + /* Pairs with filp->private_data in alloc_begin_fd_uobject */ + uverbs_uobject_put(uobj); + return 0; +} +EXPORT_SYMBOL(uverbs_uobject_fd_release); + +/* + * Drop the ucontext off the ufile and completely disconnect it from the + * ib_device + */ +static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason) +{ + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *ib_dev = ucontext->device; + + /* + * If we are closing the FD then the user mmap VMAs must have + * already been destroyed as they hold on to the filep, otherwise + * they need to be zap'd. + */ + if (reason == RDMA_REMOVE_DRIVER_REMOVE) { + uverbs_user_mmap_disassociate(ufile); + if (ib_dev->ops.disassociate_ucontext) + ib_dev->ops.disassociate_ucontext(ucontext); + } + + ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, + RDMACG_RESOURCE_HCA_HANDLE); + + rdma_restrack_del(&ucontext->res); + + ib_dev->ops.dealloc_ucontext(ucontext); + WARN_ON(!xa_empty(&ucontext->mmap_xa)); + kfree(ucontext); + + ufile->ucontext = NULL; +} + +static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason) +{ + struct ib_uobject *obj, *next_obj; + int ret = -EINVAL; + struct uverbs_attr_bundle attrs = { .ufile = ufile }; + + /* + * This shouldn't run while executing other commands on this + * context. Thus, the only thing we should take care of is + * releasing a FD while traversing this list. The FD could be + * closed and released from the _release fop of this FD. + * In order to mitigate this, we add a lock. + * We take and release the lock per traversal in order to let + * other threads (which might still use the FDs) chance to run. 
+ */ + list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) { + attrs.context = obj->context; + /* + * if we hit this WARN_ON, that means we are + * racing with a lookup_get. + */ + WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE)); + if (reason == RDMA_REMOVE_DRIVER_FAILURE) + obj->object = NULL; + if (!uverbs_destroy_uobject(obj, reason, &attrs)) + ret = 0; + else + atomic_set(&obj->usecnt, 0); + } + + if (reason == RDMA_REMOVE_DRIVER_FAILURE) { + WARN_ON(!list_empty(&ufile->uobjects)); + return 0; + } + return ret; +} + +/* + * Destroy the ucontext and every uobject associated with it. + * + * This is internally locked and can be called in parallel from multiple + * contexts. + */ +void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason) +{ + down_write(&ufile->hw_destroy_rwsem); + + /* + * If a ucontext was never created then we can't have any uobjects to + * cleanup, nothing to do. + */ + if (!ufile->ucontext) + goto done; + + while (!list_empty(&ufile->uobjects) && + !__uverbs_cleanup_ufile(ufile, reason)) { + } + + if (WARN_ON(!list_empty(&ufile->uobjects))) + __uverbs_cleanup_ufile(ufile, RDMA_REMOVE_DRIVER_FAILURE); + ufile_destroy_ucontext(ufile, reason); + +done: + up_write(&ufile->hw_destroy_rwsem); +} + +const struct uverbs_obj_type_class uverbs_fd_class = { + .alloc_begin = alloc_begin_fd_uobject, + .lookup_get = lookup_get_fd_uobject, + .alloc_commit = alloc_commit_fd_uobject, + .alloc_abort = alloc_abort_fd_uobject, + .lookup_put = lookup_put_fd_uobject, + .destroy_hw = destroy_hw_fd_uobject, + .remove_handle = remove_handle_fd_uobject, +}; +EXPORT_SYMBOL(uverbs_fd_class); + +struct ib_uobject * +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, + s64 id, struct uverbs_attr_bundle *attrs) +{ + const struct uverbs_api_object *obj = + uapi_get_object(attrs->ufile->device->uapi, object_id); + + switch (access) { + case UVERBS_ACCESS_READ: + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_READ, attrs); + case UVERBS_ACCESS_DESTROY: + /* Actual destruction is done inside uverbs_handle_method */ + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_DESTROY, attrs); + case UVERBS_ACCESS_WRITE: + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_WRITE, attrs); + case UVERBS_ACCESS_NEW: + return rdma_alloc_begin_uobject(obj, attrs); + default: + WARN_ON(true); + return ERR_PTR(-EOPNOTSUPP); + } +} + +void uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs) +{ + /* + * refcounts should be handled at the object level and not at the + * uobject level. Refcounts of the objects themselves are done in + * handlers. 
+ */ + + switch (access) { + case UVERBS_ACCESS_READ: + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ); + break; + case UVERBS_ACCESS_WRITE: + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE); + break; + case UVERBS_ACCESS_DESTROY: + if (uobj) + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY); + break; + case UVERBS_ACCESS_NEW: + if (commit) + rdma_alloc_commit_uobject(uobj, attrs); + else + rdma_alloc_abort_uobject(uobj, attrs, hw_obj_valid); + break; + default: + WARN_ON(true); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.h new file mode 100644 index 0000000..33706da --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rdma_core.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_CORE_H +#define RDMA_CORE_H + +#include +#include +#include +#include +#include + +struct ib_uverbs_device; + +void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason); + +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs); + +/* + * Get an ib_uobject that corresponds to the given id from ufile, assuming + * the object is from the given type. Lock it to the required access when + * applicable. + * This function could create (access == NEW), destroy (access == DESTROY) + * or unlock (access == READ || access == WRITE) objects if required. + * The action will be finalized only when uverbs_finalize_object or + * uverbs_finalize_objects are called. 
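+ *
+ * The access argument maps onto the lookup modes in rdma_core.c:
+ * READ/WRITE/DESTROY turn into UVERBS_LOOKUP_READ/WRITE/DESTROY lookups,
+ * and NEW allocates a fresh uobject via rdma_alloc_begin_uobject().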
+ */ +struct ib_uobject * +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, + s64 id, struct uverbs_attr_bundle *attrs); + +void uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs); + +int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); + +void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile); +void release_ufile_idr_uobject(struct ib_uverbs_file *ufile); + +struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs); + +/* + * This is the runtime description of the uverbs API, used by the syscall + * machinery to validate and dispatch calls. + */ + +/* + * Depending on ID the slot pointer in the radix tree points at one of these + * structs. + */ + +struct uverbs_api_ioctl_method { + int(__rcu *handler)(struct uverbs_attr_bundle *attrs); + DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN); + u16 bundle_size; + u8 use_stack:1; + u8 driver_method:1; + u8 disabled:1; + u8 has_udata:1; + u8 key_bitmap_len; + u8 destroy_bkey; +}; + +struct uverbs_api_write_method { + int (*handler)(struct uverbs_attr_bundle *attrs); + u8 disabled:1; + u8 is_ex:1; + u8 has_udata:1; + u8 has_resp:1; + u8 req_size; + u8 resp_size; +}; + +struct uverbs_api_attr { + struct uverbs_attr_spec spec; +}; + +struct uverbs_api { + /* radix tree contains struct uverbs_api_* pointers */ + struct radix_tree_root radix; + enum rdma_driver_id driver_id; + + unsigned int num_write; + unsigned int num_write_ex; + struct uverbs_api_write_method notsupp_method; + const struct uverbs_api_write_method **write_methods; + const struct uverbs_api_write_method **write_ex_methods; +}; + +/* + * Get an uverbs_api_object that corresponds to the given object_id. + * Note: + * -ENOMSG means that any object is allowed to match during lookup. 
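+ * (UVERBS_IDR_ANY_OBJECT is translated to ERR_PTR(-ENOMSG) here, and
+ * rdma_lookup_get_uobject() treats that value as "match any object".)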
+ */ +static inline const struct uverbs_api_object * +uapi_get_object(struct uverbs_api *uapi, u16 object_id) +{ + const struct uverbs_api_object *res; + + if (object_id == UVERBS_IDR_ANY_OBJECT) + return ERR_PTR(-ENOMSG); + + res = radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id)); + if (!res) + return ERR_PTR(-ENOENT); + + return res; +} + +char *uapi_key_format(char *S, unsigned int key); +struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev); +void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev); +void uverbs_disassociate_api(struct uverbs_api *uapi); +void uverbs_destroy_api(struct uverbs_api *uapi); +void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, + unsigned int num_attrs); +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); + +extern const struct uapi_definition uverbs_def_obj_async_fd[]; +extern const struct uapi_definition uverbs_def_obj_counters[]; +extern const struct uapi_definition uverbs_def_obj_cq[]; +extern const struct uapi_definition uverbs_def_obj_device[]; +extern const struct uapi_definition uverbs_def_obj_dm[]; +extern const struct uapi_definition uverbs_def_obj_flow_action[]; +extern const struct uapi_definition uverbs_def_obj_intf[]; +extern const struct uapi_definition uverbs_def_obj_mr[]; +extern const struct uapi_definition uverbs_def_obj_qp[]; +extern const struct uapi_definition uverbs_def_obj_srq[]; +extern const struct uapi_definition uverbs_def_obj_wq[]; +extern const struct uapi_definition uverbs_def_write_intf[]; + +static inline const struct uverbs_api_write_method * +uapi_get_method(const struct uverbs_api *uapi, u32 command) +{ + u32 cmd_idx = command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return ERR_PTR(-EINVAL); + + if (command & IB_USER_VERBS_CMD_FLAG_EXTENDED) { + if (cmd_idx >= uapi->num_write_ex) + return ERR_PTR(-EOPNOTSUPP); + return uapi->write_ex_methods[cmd_idx]; + } + + if (cmd_idx >= uapi->num_write) + return ERR_PTR(-EOPNOTSUPP); + return uapi->write_methods[cmd_idx]; +} + +void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, + struct ib_udata *udata, unsigned int attr_in, + unsigned int attr_out); + +#endif /* RDMA_CORE_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.c new file mode 100644 index 0000000..01a499a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cma_priv.h" +#include "restrack.h" + +/** + * rdma_restrack_init() - initialize and allocate resource tracking + * @dev: IB device + * + * Return: 0 on success + */ +int rdma_restrack_init(struct ib_device *dev) +{ + struct rdma_restrack_root *rt; + int i; + + dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL); + if (!dev->res) + return -ENOMEM; + + rt = dev->res; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) + xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC); + + return 0; +} + +static const char *type2str(enum rdma_restrack_type type) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "PD", + [RDMA_RESTRACK_CQ] = "CQ", + [RDMA_RESTRACK_QP] = "QP", + [RDMA_RESTRACK_CM_ID] = "CM_ID", + [RDMA_RESTRACK_MR] = "MR", + [RDMA_RESTRACK_CTX] = "CTX", + [RDMA_RESTRACK_COUNTER] = "COUNTER", + [RDMA_RESTRACK_SRQ] = "SRQ", + }; + + return names[type]; +}; + +/** + * rdma_restrack_clean() - clean resource tracking + * @dev: IB device + */ +void rdma_restrack_clean(struct ib_device *dev) +{ + struct rdma_restrack_root *rt = dev->res; + struct rdma_restrack_entry *e; + char buf[TASK_COMM_LEN]; + bool found = false; + const char *owner; + int i; + + for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { + struct xarray *xa = &dev->res[i].xa; + + if (!xa_empty(xa)) { + unsigned long index; + + if (!found) { + pr_err("restrack: %s", CUT_HERE); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); + } + xa_for_each(xa, index, e) { + if (rdma_is_kernel_res(e)) { + owner = e->kern_name; + } else { + /* + * There is no need to call get_task_struct here, + * because we can be here only if there are more + * get_task_struct() call than put_task_struct(). + */ + get_task_comm(buf, e->task); + owner = buf; + } + + pr_err("restrack: %s %s object allocated by %s is not freed\n", + rdma_is_kernel_res(e) ? 
"Kernel" : + "User", + type2str(e->type), owner); + } + found = true; + } + xa_destroy(xa); + } + if (found) + pr_err("restrack: %s", CUT_HERE); + + kfree(rt); +} + +/** + * rdma_restrack_count() - the current usage of specific object + * @dev: IB device + * @type: actual type of object to operate + */ +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type) +{ + struct rdma_restrack_root *rt = &dev->res[type]; + struct rdma_restrack_entry *e; + XA_STATE(xas, &rt->xa, 0); + u32 cnt = 0; + + xa_lock(&rt->xa); + xas_for_each(&xas, e, U32_MAX) + cnt++; + xa_unlock(&rt->xa); + return cnt; +} +EXPORT_SYMBOL(rdma_restrack_count); + +static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) +{ + switch (res->type) { + case RDMA_RESTRACK_PD: + return container_of(res, struct ib_pd, res)->device; + case RDMA_RESTRACK_CQ: + return container_of(res, struct ib_cq, res)->device; + case RDMA_RESTRACK_QP: + return container_of(res, struct ib_qp, res)->device; + case RDMA_RESTRACK_CM_ID: + return container_of(res, struct rdma_id_private, + res)->id.device; + case RDMA_RESTRACK_MR: + return container_of(res, struct ib_mr, res)->device; + case RDMA_RESTRACK_CTX: + return container_of(res, struct ib_ucontext, res)->device; + case RDMA_RESTRACK_COUNTER: + return container_of(res, struct rdma_counter, res)->device; + case RDMA_RESTRACK_SRQ: + return container_of(res, struct ib_srq, res)->device; + default: + WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); + return NULL; + } +} + +/** + * rdma_restrack_attach_task() - attach the task onto this resource, + * valid for user space restrack entries. + * @res: resource entry + * @task: the task to attach + */ +static void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (WARN_ON_ONCE(!task)) + return; + + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; + res->user = true; +} + +/** + * rdma_restrack_set_name() - set the task for this resource + * @res: resource entry + * @caller: kernel name, the current task will be used if the caller is NULL. + */ +void rdma_restrack_set_name(struct rdma_restrack_entry *res, const char *caller) +{ + if (caller) { + res->kern_name = caller; + return; + } + + rdma_restrack_attach_task(res, current); +} +EXPORT_SYMBOL(rdma_restrack_set_name); + +/** + * rdma_restrack_parent_name() - set the restrack name properties based + * on parent restrack + * @dst: destination resource entry + * @parent: parent resource entry + */ +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent) +{ + if (rdma_is_kernel_res(parent)) + dst->kern_name = parent->kern_name; + else + rdma_restrack_attach_task(dst, parent->task); +} +EXPORT_SYMBOL(rdma_restrack_parent_name); + +/** + * rdma_restrack_new() - Initializes new restrack entry to allow _put() interface + * to release memory in fully automatic way. 
+ * @res: Entry to initialize + * @type: REstrack type + */ +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type) +{ + kref_init(&res->kref); + init_completion(&res->comp); + res->type = type; +} +EXPORT_SYMBOL(rdma_restrack_new); + +/** + * rdma_restrack_add() - add object to the reource tracking database + * @res: resource entry + */ +void rdma_restrack_add(struct rdma_restrack_entry *res) +{ + struct ib_device *dev = res_to_dev(res); + struct rdma_restrack_root *rt; + int ret = 0; + + if (!dev) + return; + + if (res->no_track) + goto out; + + rt = &dev->res[res->type]; + + if (res->type == RDMA_RESTRACK_QP) { + /* Special case to ensure that LQPN points to right QP */ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + + WARN_ONCE(qp->qp_num >> 24 || qp->port >> 8, + "QP number 0x%0X and port 0x%0X", qp->qp_num, + qp->port); + res->id = qp->qp_num; + if (qp->qp_type == IB_QPT_SMI || qp->qp_type == IB_QPT_GSI) + res->id |= qp->port << 24; + ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL); + if (ret) + res->id = 0; + } else if (res->type == RDMA_RESTRACK_COUNTER) { + /* Special case to ensure that cntn points to right counter */ + struct rdma_counter *counter; + + counter = container_of(res, struct rdma_counter, res); + ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL); + res->id = ret ? 0 : counter->id; + } else { + ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, + &rt->next_id, GFP_KERNEL); + ret = (ret < 0) ? ret : 0; + } + +out: + if (!ret) + res->valid = true; +} +EXPORT_SYMBOL(rdma_restrack_add); + +int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) +{ + return kref_get_unless_zero(&res->kref); +} +EXPORT_SYMBOL(rdma_restrack_get); + +/** + * rdma_restrack_get_byid() - translate from ID to restrack object + * @dev: IB device + * @type: resource track type + * @id: ID to take a look + * + * Return: Pointer to restrack entry or -ENOENT in case of error. 
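+ *
+ * On success a reference is taken on the returned entry; the caller must
+ * drop it again with rdma_restrack_put().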
+ */ +struct rdma_restrack_entry * +rdma_restrack_get_byid(struct ib_device *dev, + enum rdma_restrack_type type, u32 id) +{ + struct rdma_restrack_root *rt = &dev->res[type]; + struct rdma_restrack_entry *res; + + xa_lock(&rt->xa); + res = xa_load(&rt->xa, id); + if (!res || !rdma_restrack_get(res)) + res = ERR_PTR(-ENOENT); + xa_unlock(&rt->xa); + + return res; +} +EXPORT_SYMBOL(rdma_restrack_get_byid); + +static void restrack_release(struct kref *kref) +{ + struct rdma_restrack_entry *res; + + res = container_of(kref, struct rdma_restrack_entry, kref); + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } + complete(&res->comp); +} + +int rdma_restrack_put(struct rdma_restrack_entry *res) +{ + return kref_put(&res->kref, restrack_release); +} +EXPORT_SYMBOL(rdma_restrack_put); + +/** + * rdma_restrack_del() - delete object from the reource tracking database + * @res: resource entry + */ +void rdma_restrack_del(struct rdma_restrack_entry *res) +{ + struct rdma_restrack_entry *old; + struct rdma_restrack_root *rt; + struct ib_device *dev; + + if (!res->valid) { + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } + return; + } + + if (res->no_track) + goto out; + + dev = res_to_dev(res); + if (WARN_ON(!dev)) + return; + + rt = &dev->res[res->type]; + + old = xa_erase(&rt->xa, res->id); + WARN_ON(old != res); + +out: + res->valid = false; + rdma_restrack_put(res); + wait_for_completion(&res->comp); +} +EXPORT_SYMBOL(rdma_restrack_del); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.h new file mode 100644 index 0000000..6a04fc4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/restrack.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_CORE_RESTRACK_H_ +#define _RDMA_CORE_RESTRACK_H_ + +#include + +/** + * struct rdma_restrack_root - main resource tracking management + * entity, per-device + */ +struct rdma_restrack_root { + /** + * @xa: Array of XArray structure to hold restrack entries. + */ + struct xarray xa; + /** + * @next_id: Next ID to support cyclic allocation + */ + u32 next_id; +}; + +int rdma_restrack_init(struct ib_device *dev); +void rdma_restrack_clean(struct ib_device *dev); +void rdma_restrack_add(struct rdma_restrack_entry *res); +void rdma_restrack_del(struct rdma_restrack_entry *res); +void rdma_restrack_new(struct rdma_restrack_entry *res, + enum rdma_restrack_type type); +void rdma_restrack_set_name(struct rdma_restrack_entry *res, + const char *caller); +void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, + const struct rdma_restrack_entry *parent); +#endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/roce_gid_mgmt.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/roce_gid_mgmt.c new file mode 100644 index 0000000..aee9ffb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/roce_gid_mgmt.c @@ -0,0 +1,938 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include +#include + +/* For in6_dev_get/in6_dev_put */ +#include +#include + +#include +#include + +static struct workqueue_struct *gid_cache_wq; + +enum gid_op_type { + GID_DEL = 0, + GID_ADD +}; + +struct update_gid_event_work { + struct work_struct work; + union ib_gid gid; + struct ib_gid_attr gid_attr; + enum gid_op_type gid_op; + unsigned long ndev_event; +}; + +#define ROCE_NETDEV_CALLBACK_SZ 3 +struct netdev_event_work_cmd { + roce_netdev_callback cb; + roce_netdev_filter filter; + struct net_device *ndev; + struct net_device *filter_ndev; +}; + +struct netdev_event_work { + struct work_struct work; + struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; + /* Indicates which netdev event occurred in notifier chain. 
*/ + unsigned long ndev_event; +}; + +static const struct { + bool (*is_supported)(const struct ib_device *device, u32 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) +{ + int i; + unsigned int ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); + +static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, + u32 port, union ib_gid *gid, + struct ib_gid_attr *gid_attr) +{ + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr->gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, gid_attr); + break; + } + } + } +} + +enum bonding_slave_state { + BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, + BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, + /* No primary slave or the device isn't a slave in bonding */ + BONDING_SLAVE_STATE_NA = 1UL << 2, +}; + +static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, + struct net_device *upper) +{ + if (upper && netif_is_bond_master(upper)) { + struct net_device *pdev = + bond_option_active_slave_get_rcu(netdev_priv(upper)); + + if (pdev) + return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : + BONDING_SLAVE_STATE_INACTIVE; + } + + return BONDING_SLAVE_STATE_NA; +} + +#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ + BONDING_SLAVE_STATE_NA) +static bool +is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *real_dev; + bool res; + + if (!rdma_ndev) + return false; + + rcu_read_lock(); + real_dev = rdma_vlan_dev_real_dev(cookie); + if (!real_dev) + real_dev = cookie; + + res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && + (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & + REQUIRED_BOND_STATES)) || + real_dev == rdma_ndev); + + rcu_read_unlock(); + return res; +} + +static bool +is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *master_dev; + bool res; + + if (!rdma_ndev) + return false; + + rcu_read_lock(); + master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); + res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == + BONDING_SLAVE_STATE_INACTIVE; + rcu_read_unlock(); + + return res; +} + +/** + * is_ndev_for_default_gid_filter - Check if a given netdevice + * can be considered for default GIDs or not. + * @ib_dev: IB device to check + * @port: Port to consider for adding default GID + * @rdma_ndev: rdma netdevice pointer + * @cookie: Netdevice to consider to form a default GID + * + * is_ndev_for_default_gid_filter() returns true if a given netdevice can be + * considered for deriving default RoCE GID, returns false otherwise. 
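The mask returned by roce_gid_type_mask_support() is an ordinary bitmap indexed by enum ib_gid_type, so consumers outside of update_gid() can walk it with the generic bitmap helpers as well. A hypothetical consumer (function name and log message are illustrative) might look like:

static void example_log_gid_types(struct ib_device *ib_dev, u32 port)
{
	unsigned long mask = roce_gid_type_mask_support(ib_dev, port);
	unsigned int gid_type;

	/* One bit per supported type: IB, RoCE v1 and/or RoCE v2. */
	for_each_set_bit(gid_type, &mask, IB_GID_TYPE_SIZE)
		dev_info(&ib_dev->dev, "port %u: gid type %u supported\n",
			 port, gid_type);
}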
+ */ +static bool +is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *cookie_ndev = cookie; + bool res; + + if (!rdma_ndev) + return false; + + rcu_read_lock(); + + /* + * When rdma netdevice is used in bonding, bonding master netdevice + * should be considered for default GIDs. Therefore, ignore slave rdma + * netdevices when bonding is considered. + * Additionally when event(cookie) netdevice is bond master device, + * make sure that it the upper netdevice of rdma netdevice. + */ + res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || + (netif_is_bond_master(cookie_ndev) && + rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); + + rcu_read_unlock(); + return res; +} + +static bool pass_all_filter(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + return true; +} + +static bool upper_device_filter(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + bool res; + + if (!rdma_ndev) + return false; + + if (rdma_ndev == cookie) + return true; + + rcu_read_lock(); + res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); + rcu_read_unlock(); + + return res; +} + +/** + * is_upper_ndev_bond_master_filter - Check if a given netdevice + * is bond master device of netdevice of the the RDMA device of port. + * @ib_dev: IB device to check + * @port: Port to consider for adding default GID + * @rdma_ndev: Pointer to rdma netdevice + * @cookie: Netdevice to consider to form a default GID + * + * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev + * is bond master device and rdma_ndev is its lower netdevice. It might + * not have been established as slave device yet. + */ +static bool +is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net_device *cookie_ndev = cookie; + bool match = false; + + if (!rdma_ndev) + return false; + + rcu_read_lock(); + if (netif_is_bond_master(cookie_ndev) && + rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) + match = true; + rcu_read_unlock(); + return match; +} + +static void update_gid_ip(enum gid_op_type gid_op, + struct ib_device *ib_dev, + u32 port, struct net_device *ndev, + struct sockaddr *addr) +{ + union ib_gid gid; + struct ib_gid_attr gid_attr; + + rdma_ip2gid(addr, &gid); + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + update_gid(gid_op, ib_dev, port, &gid, &gid_attr); +} + +static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, + u32 port, + struct net_device *rdma_ndev, + struct net_device *event_ndev) +{ + struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); + unsigned long gid_type_mask; + + if (!rdma_ndev) + return; + + if (!real_dev) + real_dev = event_ndev; + + rcu_read_lock(); + + if (((rdma_ndev != event_ndev && + !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || + is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) + == + BONDING_SLAVE_STATE_INACTIVE)) { + rcu_read_unlock(); + return; + } + + rcu_read_unlock(); + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_DELETE); +} + +static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, + u32 port, struct net_device *ndev) +{ + const struct in_ifaddr *ifa; + struct in_device *in_dev; + struct sin_list { + struct list_head list; + struct sockaddr_in ip; + }; + struct sin_list *sin_iter; 
+ struct sin_list *sin_temp; + + LIST_HEAD(sin_list); + if (ndev->reg_state >= NETREG_UNREGISTERING) + return; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(ndev); + if (!in_dev) { + rcu_read_unlock(); + return; + } + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + + if (!entry) + continue; + + entry->ip.sin_family = AF_INET; + entry->ip.sin_addr.s_addr = ifa->ifa_address; + list_add_tail(&entry->list, &sin_list); + } + + rcu_read_unlock(); + + list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { + update_gid_ip(GID_ADD, ib_dev, port, ndev, + (struct sockaddr *)&sin_iter->ip); + list_del(&sin_iter->list); + kfree(sin_iter); + } +} + +static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, + u32 port, struct net_device *ndev) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *in6_dev; + struct sin6_list { + struct list_head list; + struct sockaddr_in6 sin6; + }; + struct sin6_list *sin6_iter; + struct sin6_list *sin6_temp; + struct ib_gid_attr gid_attr = {.ndev = ndev}; + LIST_HEAD(sin6_list); + + if (ndev->reg_state >= NETREG_UNREGISTERING) + return; + + in6_dev = in6_dev_get(ndev); + if (!in6_dev) + return; + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + + if (!entry) + continue; + + entry->sin6.sin6_family = AF_INET6; + entry->sin6.sin6_addr = ifp->addr; + list_add_tail(&entry->list, &sin6_list); + } + read_unlock_bh(&in6_dev->lock); + + in6_dev_put(in6_dev); + + list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { + union ib_gid gid; + + rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); + update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); + list_del(&sin6_iter->list); + kfree(sin6_iter); + } +} + +static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, + struct net_device *ndev) +{ + enum_netdev_ipv4_ips(ib_dev, port, ndev); + if (IS_ENABLED(CONFIG_IPV6)) + enum_netdev_ipv6_ips(ib_dev, port, ndev); +} + +static void add_netdev_ips(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + _add_netdev_ips(ib_dev, port, cookie); +} + +static void del_netdev_ips(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); +} + +/** + * del_default_gids - Delete default GIDs of the event/cookie netdevice + * @ib_dev: RDMA device pointer + * @port: Port of the RDMA device whose GID table to consider + * @rdma_ndev: Unused rdma netdevice + * @cookie: Pointer to event netdevice + * + * del_default_gids() deletes the default GIDs of the event/cookie netdevice. 
+ */ +static void del_default_gids(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *cookie_ndev = cookie; + unsigned long gid_type_mask; + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_DELETE); +} + +static void add_default_gids(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *event_ndev = cookie; + unsigned long gid_type_mask; + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_SET); +} + +static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, + u32 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net *net; + struct net_device *ndev; + + /* Lock the rtnl to make sure the netdevs does not move under + * our feet + */ + rtnl_lock(); + down_read(&net_rwsem); + for_each_net(net) + for_each_netdev(net, ndev) { + /* + * Filter and add default GIDs of the primary netdevice + * when not in bonding mode, or add default GIDs + * of bond master device, when in bonding mode. + */ + if (is_ndev_for_default_gid_filter(ib_dev, port, + rdma_ndev, ndev)) + add_default_gids(ib_dev, port, rdma_ndev, ndev); + + if (is_eth_port_of_netdev_filter(ib_dev, port, + rdma_ndev, ndev)) + _add_netdev_ips(ib_dev, port, ndev); + } + up_read(&net_rwsem); + rtnl_unlock(); +} + +/** + * rdma_roce_rescan_device - Rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. + * + * @ib_dev: the rdma device + */ +void rdma_roce_rescan_device(struct ib_device *ib_dev) +{ + /* While rescanning the device, send NETDEV_UP event as intent is + * to add new GID entries for the netdev. 
+ */ + ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, + enum_all_gids_of_dev_cb, NULL, NETDEV_UP); +} +EXPORT_SYMBOL(rdma_roce_rescan_device); + +static void callback_for_addr_gid_device_scan(struct ib_device *device, + u32 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct update_gid_event_work *parsed = cookie; + + return update_gid(parsed->gid_op, device, + port, &parsed->gid, + &parsed->gid_attr); +} + +struct upper_list { + struct list_head list; + struct net_device *upper; +}; + +static int netdev_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) +{ + struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct list_head *upper_list = (struct list_head *)priv->data; + + if (!entry) + return 0; + + list_add_tail(&entry->list, upper_list); + dev_hold(upper); + entry->upper = upper; + + return 0; +} + +static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, + void *cookie, + void (*handle_netdev)(struct ib_device *ib_dev, + u32 port, + struct net_device *ndev)) +{ + struct net_device *ndev = cookie; + struct netdev_nested_priv priv; + struct upper_list *upper_iter; + struct upper_list *upper_temp; + LIST_HEAD(upper_list); + + priv.data = &upper_list; + rcu_read_lock(); + netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); + rcu_read_unlock(); + + handle_netdev(ib_dev, port, ndev); + list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, + list) { + handle_netdev(ib_dev, port, upper_iter->upper); + dev_put(upper_iter->upper); + list_del(&upper_iter->list); + kfree(upper_iter); + } +} + +static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, + struct net_device *event_ndev) +{ + ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); +} + +static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); +} + +static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, void *cookie) +{ + handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); +} + +static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net_device *master_ndev; + + rcu_read_lock(); + master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); + if (master_ndev) + dev_hold(master_ndev); + rcu_read_unlock(); + + if (master_ndev) { + bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, + master_ndev); + dev_put(master_ndev); + } +} + +/* The following functions operate on all IB devices. netdevice_event and + * addr_event execute ib_enum_all_roce_netdevs through a work. + * ib_enum_all_roce_netdevs iterates through all IB devices. 
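rdma_roce_rescan_device() is the exported entry point a RoCE provider typically calls once its netdev association for a port is in place, so the core rebuilds the GID tables without waiting for the next notifier event. A hedged sketch of such a call site (the surrounding driver function is hypothetical):

static void example_port_netdev_ready(struct ib_device *ibdev)
{
	/*
	 * Walk all netdevs again and repopulate this device's RoCE GIDs.
	 * Note: enum_all_gids_of_dev_cb() above takes the rtnl lock, so
	 * this must not be called with rtnl already held.
	 */
	rdma_roce_rescan_device(ibdev);
}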
+ */ + +static void netdevice_event_work_handler(struct work_struct *_work) +{ + struct netdev_event_work *work = + container_of(_work, struct netdev_event_work, work); + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { + ib_enum_all_roce_netdevs(work->cmds[i].filter, + work->cmds[i].filter_ndev, + work->cmds[i].cb, + work->cmds[i].ndev, work->ndev_event); + dev_put(work->cmds[i].ndev); + dev_put(work->cmds[i].filter_ndev); + } + + kfree(work); +} + +static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, + struct net_device *ndev, unsigned long event) +{ + unsigned int i; + struct netdev_event_work *ndev_work = + kmalloc(sizeof(*ndev_work), GFP_KERNEL); + + if (!ndev_work) + return NOTIFY_DONE; + + memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); + for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { + if (!ndev_work->cmds[i].ndev) + ndev_work->cmds[i].ndev = ndev; + if (!ndev_work->cmds[i].filter_ndev) + ndev_work->cmds[i].filter_ndev = ndev; + dev_hold(ndev_work->cmds[i].ndev); + dev_hold(ndev_work->cmds[i].filter_ndev); + } + ndev_work->ndev_event = event; + INIT_WORK(&ndev_work->work, netdevice_event_work_handler); + + queue_work(gid_cache_wq, &ndev_work->work); + + return NOTIFY_DONE; +} + +static const struct netdev_event_work_cmd add_cmd = { + .cb = add_netdev_ips, + .filter = is_eth_port_of_netdev_filter +}; + +static const struct netdev_event_work_cmd add_cmd_upper_ips = { + .cb = add_netdev_upper_ips, + .filter = is_eth_port_of_netdev_filter +}; + +static void +ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +{ + static const struct netdev_event_work_cmd + upper_ips_del_cmd = { + .cb = del_netdev_upper_ips, + .filter = upper_device_filter + }; + + cmds[0] = upper_ips_del_cmd; + cmds[0].ndev = changeupper_info->upper_dev; + cmds[1] = add_cmd; +} + +static const struct netdev_event_work_cmd bonding_default_add_cmd = { + .cb = add_default_gids, + .filter = is_upper_ndev_bond_master_filter +}; + +static void +ndev_event_link(struct net_device *event_ndev, + struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +{ + static const struct netdev_event_work_cmd + bonding_default_del_cmd = { + .cb = del_default_gids, + .filter = is_upper_ndev_bond_master_filter + }; + /* + * When a lower netdev is linked to its upper bonding + * netdev, delete lower slave netdev's default GIDs. 
+ */ + cmds[0] = bonding_default_del_cmd; + cmds[0].ndev = event_ndev; + cmds[0].filter_ndev = changeupper_info->upper_dev; + + /* Now add bonding upper device default GIDs */ + cmds[1] = bonding_default_add_cmd; + cmds[1].ndev = changeupper_info->upper_dev; + cmds[1].filter_ndev = changeupper_info->upper_dev; + + /* Now add bonding upper device IP based GIDs */ + cmds[2] = add_cmd_upper_ips; + cmds[2].ndev = changeupper_info->upper_dev; + cmds[2].filter_ndev = changeupper_info->upper_dev; +} + +static void netdevice_event_changeupper(struct net_device *event_ndev, + struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +{ + if (changeupper_info->linking) + ndev_event_link(event_ndev, changeupper_info, cmds); + else + ndev_event_unlink(changeupper_info, cmds); +} + +static const struct netdev_event_work_cmd add_default_gid_cmd = { + .cb = add_default_gids, + .filter = is_ndev_for_default_gid_filter, +}; + +static int netdevice_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + static const struct netdev_event_work_cmd del_cmd = { + .cb = del_netdev_ips, .filter = pass_all_filter}; + static const struct netdev_event_work_cmd + bonding_default_del_cmd_join = { + .cb = del_netdev_default_ips_join, + .filter = is_eth_port_inactive_slave_filter + }; + static const struct netdev_event_work_cmd + netdev_del_cmd = { + .cb = del_netdev_ips, + .filter = is_eth_port_of_netdev_filter + }; + static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { + .cb = del_netdev_upper_ips, .filter = upper_device_filter}; + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; + + if (ndev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UP: + cmds[0] = bonding_default_del_cmd_join; + cmds[1] = add_default_gid_cmd; + cmds[2] = add_cmd; + break; + + case NETDEV_UNREGISTER: + if (ndev->reg_state < NETREG_UNREGISTERED) + cmds[0] = del_cmd; + else + return NOTIFY_DONE; + break; + + case NETDEV_CHANGEADDR: + cmds[0] = netdev_del_cmd; + if (ndev->reg_state == NETREG_REGISTERED) { + cmds[1] = add_default_gid_cmd; + cmds[2] = add_cmd; + } + break; + + case NETDEV_CHANGEUPPER: + netdevice_event_changeupper(ndev, + container_of(ptr, struct netdev_notifier_changeupper_info, info), + cmds); + break; + + case NETDEV_BONDING_FAILOVER: + cmds[0] = bonding_event_ips_del_cmd; + /* Add default GIDs of the bond device */ + cmds[1] = bonding_default_add_cmd; + /* Add IP based GIDs of the bond device */ + cmds[2] = add_cmd_upper_ips; + break; + + default: + return NOTIFY_DONE; + } + + return netdevice_queue_work(cmds, ndev, event); +} + +static void update_gid_event_work_handler(struct work_struct *_work) +{ + struct update_gid_event_work *work = + container_of(_work, struct update_gid_event_work, work); + + ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, + work->gid_attr.ndev, + callback_for_addr_gid_device_scan, + work, work->ndev_event); + + dev_put(work->gid_attr.ndev); + kfree(work); +} + +static int addr_event(struct notifier_block *this, unsigned long event, + struct sockaddr *sa, struct net_device *ndev) +{ + struct update_gid_event_work *work; + enum gid_op_type gid_op; + + if (ndev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + gid_op = GID_ADD; + break; + + case NETDEV_DOWN: + gid_op = GID_DEL; + break; + + default: + return NOTIFY_DONE; + } + + work = 
kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + + INIT_WORK(&work->work, update_gid_event_work_handler); + + rdma_ip2gid(sa, &work->gid); + work->gid_op = gid_op; + work->ndev_event = event; + + memset(&work->gid_attr, 0, sizeof(work->gid_attr)); + dev_hold(ndev); + work->gid_attr.ndev = ndev; + + queue_work(gid_cache_wq, &work->work); + + return NOTIFY_DONE; +} + +static int inetaddr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct sockaddr_in in; + struct net_device *ndev; + struct in_ifaddr *ifa = ptr; + + in.sin_family = AF_INET; + in.sin_addr.s_addr = ifa->ifa_address; + ndev = ifa->ifa_dev->dev; + + return addr_event(this, event, (struct sockaddr *)&in, ndev); +} + +static int inet6addr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct sockaddr_in6 in6; + struct net_device *ndev; + struct inet6_ifaddr *ifa6 = ptr; + + in6.sin6_family = AF_INET6; + in6.sin6_addr = ifa6->addr; + ndev = ifa6->idev->dev; + + return addr_event(this, event, (struct sockaddr *)&in6, ndev); +} + +static struct notifier_block nb_netdevice = { + .notifier_call = netdevice_event +}; + +static struct notifier_block nb_inetaddr = { + .notifier_call = inetaddr_event +}; + +static struct notifier_block nb_inet6addr = { + .notifier_call = inet6addr_event +}; + +int __init roce_gid_mgmt_init(void) +{ + gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); + if (!gid_cache_wq) + return -ENOMEM; + + register_inetaddr_notifier(&nb_inetaddr); + if (IS_ENABLED(CONFIG_IPV6)) + register_inet6addr_notifier(&nb_inet6addr); + /* We relay on the netdevice notifier to enumerate all + * existing devices in the system. Register to this notifier + * last to make sure we will not miss any IP add/del + * callbacks. + */ + register_netdevice_notifier(&nb_netdevice); + + return 0; +} + +void __exit roce_gid_mgmt_cleanup(void) +{ + if (IS_ENABLED(CONFIG_IPV6)) + unregister_inet6addr_notifier(&nb_inet6addr); + unregister_inetaddr_notifier(&nb_inetaddr); + unregister_netdevice_notifier(&nb_netdevice); + /* Ensure all gid deletion tasks complete before we go down, + * to avoid any reference to free'd memory. By the time + * ib-core is removed, all physical devices have been removed, + * so no issue with remaining hardware contexts. + */ + destroy_workqueue(gid_cache_wq); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rw.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rw.c new file mode 100644 index 0000000..22bfa40 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/rw.c @@ -0,0 +1,762 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + */ +#include +#include +#include +#include +#include +#include +#include + +enum { + RDMA_RW_SINGLE_WR, + RDMA_RW_MULTI_WR, + RDMA_RW_MR, + RDMA_RW_SIG_MR, +}; + +static bool rdma_rw_force_mr; +module_param_named(force_mr, rdma_rw_force_mr, bool, 0); +MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); + +/* + * Report whether memory registration should be used. Memory registration must + * be used for iWarp devices because of iWARP-specific limitations. 
Memory + * registration is also enabled if registering memory might yield better + * performance than using multiple SGE entries, see rdma_rw_io_needs_mr() + */ +static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num) +{ + if (rdma_protocol_iwarp(dev, port_num)) + return true; + if (dev->attrs.max_sgl_rd) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +/* + * Check if the device will use memory registration for this RW operation. + * For RDMA READs we must use MRs on iWarp and can optionally use them as an + * optimization otherwise. Additionally we have a debug option to force usage + * of MRs to help testing this code path. + */ +static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num, + enum dma_data_direction dir, int dma_nents) +{ + if (dir == DMA_FROM_DEVICE) { + if (rdma_protocol_iwarp(dev, port_num)) + return true; + if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd) + return true; + } + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev, + bool pi_support) +{ + u32 max_pages; + + if (pi_support) + max_pages = dev->attrs.max_pi_fast_reg_page_list_len; + else + max_pages = dev->attrs.max_fast_reg_page_list_len; + + /* arbitrary limit to avoid allocating gigantic resources */ + return min_t(u32, max_pages, 256); +} + +static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg) +{ + int count = 0; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = ®->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + return count; +} + +/* Caller must have zero-initialized *reg. */ +static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num, + struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, + u32 sg_cnt, u32 offset) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); + u32 nents = min(sg_cnt, pages_per_mr); + int count = 0, ret; + + reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs); + if (!reg->mr) + return -EAGAIN; + + count += rdma_rw_inv_key(reg); + + ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); + if (ret < 0 || ret < nents) { + ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr); + return -EINVAL; + } + + reg->reg_wr.wr.opcode = IB_WR_REG_MR; + reg->reg_wr.mr = reg->mr; + reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + count++; + + reg->sge.addr = reg->mr->iova; + reg->sge.length = reg->mr->length; + return count; +} + +static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct rdma_rw_reg_ctx *prev = NULL; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); + int i, j, ret = 0, count = 0; + + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr); + ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < ctx->nr_ops; i++) { + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; + u32 nents = min(sg_cnt, pages_per_mr); + + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt, + offset); + if (ret < 0) + goto out_free; + count += ret; + + if (prev) { + if (reg->mr->need_inval) + prev->wr.wr.next = ®->inv_wr; + else + prev->wr.wr.next = 
+						&reg->reg_wr.wr;
+		}
+
+		reg->reg_wr.wr.next = &reg->wr.wr;
+
+		reg->wr.wr.sg_list = &reg->sge;
+		reg->wr.wr.num_sge = 1;
+		reg->wr.remote_addr = remote_addr;
+		reg->wr.rkey = rkey;
+		if (dir == DMA_TO_DEVICE) {
+			reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+		} else if (!rdma_cap_read_inv(qp->device, port_num)) {
+			reg->wr.wr.opcode = IB_WR_RDMA_READ;
+		} else {
+			reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+			reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+		}
+		count++;
+
+		remote_addr += reg->sge.length;
+		sg_cnt -= nents;
+		for (j = 0; j < nents; j++)
+			sg = sg_next(sg);
+		prev = reg;
+		offset = 0;
+	}
+
+	if (prev)
+		prev->wr.wr.next = NULL;
+
+	ctx->type = RDMA_RW_MR;
+	return count;
+
+out_free:
+	while (--i >= 0)
+		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+	kfree(ctx->reg);
+out:
+	return ret;
+}
+
+static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+		struct scatterlist *sg, u32 sg_cnt, u32 offset,
+		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+	u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+		      qp->max_read_sge;
+	struct ib_sge *sge;
+	u32 total_len = 0, i, j;
+
+	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
+
+	ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
+	if (!ctx->map.sges)
+		goto out;
+
+	ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
+	if (!ctx->map.wrs)
+		goto out_free_sges;
+
+	for (i = 0; i < ctx->nr_ops; i++) {
+		struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+		u32 nr_sge = min(sg_cnt, max_sge);
+
+		if (dir == DMA_TO_DEVICE)
+			rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+		else
+			rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+		rdma_wr->remote_addr = remote_addr + total_len;
+		rdma_wr->rkey = rkey;
+		rdma_wr->wr.num_sge = nr_sge;
+		rdma_wr->wr.sg_list = sge;
+
+		for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
+			sge->addr = sg_dma_address(sg) + offset;
+			sge->length = sg_dma_len(sg) - offset;
+			sge->lkey = qp->pd->local_dma_lkey;
+
+			total_len += sge->length;
+			sge++;
+			sg_cnt--;
+			offset = 0;
+		}
+
+		rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+ &ctx->map.wrs[i + 1].wr : NULL; + } + + ctx->type = RDMA_RW_MULTI_WR; + return ctx->nr_ops; + +out_free_sges: + kfree(ctx->map.sges); +out: + return -ENOMEM; +} + +static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; + + ctx->nr_ops = 1; + + ctx->single.sge.lkey = qp->pd->local_dma_lkey; + ctx->single.sge.addr = sg_dma_address(sg) + offset; + ctx->single.sge.length = sg_dma_len(sg) - offset; + + memset(rdma_wr, 0, sizeof(*rdma_wr)); + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->wr.sg_list = &ctx->single.sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + + ctx->type = RDMA_RW_SINGLE_WR; + return 1; +} + +static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg, + u32 sg_cnt, enum dma_data_direction dir) +{ + if (is_pci_p2pdma_page(sg_page(sg))) + pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir); + else + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); +} + +static int rdma_rw_map_sgtable(struct ib_device *dev, struct sg_table *sgt, + enum dma_data_direction dir) +{ + int nents; + + if (is_pci_p2pdma_page(sg_page(sgt->sgl))) { + if (WARN_ON_ONCE(ib_uses_virt_dma(dev))) + return 0; + nents = pci_p2pdma_map_sg(dev->dma_device, sgt->sgl, + sgt->orig_nents, dir); + if (!nents) + return -EIO; + sgt->nents = nents; + return 0; + } + return ib_dma_map_sgtable_attrs(dev, sgt, dir, 0); +} + +/** + * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @sg_offset: current byte offset into @sg + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. 
+ */ +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, + struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, + }; + int ret; + + ret = rdma_rw_map_sgtable(dev, &sgt, dir); + if (ret) + return ret; + sg_cnt = sgt.nents; + + /* + * Skip to the S/G entry that sg_offset falls into: + */ + for (;;) { + u32 len = sg_dma_len(sg); + + if (sg_offset < len) + break; + + sg = sg_next(sg); + sg_offset -= len; + sg_cnt--; + } + + ret = -EIO; + if (WARN_ON_ONCE(sg_cnt == 0)) + goto out_unmap_sg; + + if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) { + ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt, + sg_offset, remote_addr, rkey, dir); + } else if (sg_cnt > 1) { + ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset, + remote_addr, rkey, dir); + } else { + ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset, + remote_addr, rkey, dir); + } + + if (ret < 0) + goto out_unmap_sg; + return ret; + +out_unmap_sg: + rdma_rw_unmap_sg(dev, sgt.sgl, sgt.orig_nents, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_init); + +/** + * rdma_rw_ctx_signature_init - initialize a RW context with signature offload + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist to READ/WRITE protection information from/to + * @prot_sg_cnt: number of entries in @prot_sg + * @sig_attrs: signature offloading algorithms + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. 
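To make the intended calling convention concrete, here is a hedged sketch of how a ULP might drive the non-signature path end to end; struct example_req and its fields are illustrative, only the rdma_rw_* calls are the real API:

/* A request as a ULP might define it; only the fields used here are shown. */
struct example_req {
	struct rdma_rw_ctx	rw;	/* must live until the completion fires */
	struct ib_cqe		cqe;	/* cqe.done() runs from the CQ handler */
	struct scatterlist	*sgl;
	u32			sg_cnt;
};

static int example_issue_rdma_write(struct example_req *req, struct ib_qp *qp,
				    u32 port_num, u64 remote_addr, u32 rkey)
{
	int ret;

	/* Maps the S/G list and picks single-WR, multi-WR or MR mode. */
	ret = rdma_rw_ctx_init(&req->rw, qp, port_num, req->sgl, req->sg_cnt,
			       0, remote_addr, rkey, DMA_TO_DEVICE);
	if (ret < 0)
		return ret;

	/* Post the whole chain; req->cqe.done() is invoked on completion. */
	return rdma_rw_ctx_post(&req->rw, qp, port_num, &req->cqe, NULL);
}

/*
 * The completion handler (and any error path after a successful init) is
 * expected to undo the mapping with:
 *	rdma_rw_ctx_destroy(&req->rw, qp, port_num, req->sgl, req->sg_cnt,
 *			    DMA_TO_DEVICE);
 */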
+ */ +int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + struct ib_sig_attrs *sig_attrs, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); + struct sg_table sgt = { + .sgl = sg, + .orig_nents = sg_cnt, + }; + struct sg_table prot_sgt = { + .sgl = prot_sg, + .orig_nents = prot_sg_cnt, + }; + struct ib_rdma_wr *rdma_wr; + int count = 0, ret; + + if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { + pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n", + sg_cnt, prot_sg_cnt, pages_per_mr); + return -EINVAL; + } + + ret = rdma_rw_map_sgtable(dev, &sgt, dir); + if (ret) + return ret; + + if (prot_sg_cnt) { + ret = rdma_rw_map_sgtable(dev, &prot_sgt, dir); + if (ret) + goto out_unmap_sg; + } + + ctx->type = RDMA_RW_SIG_MR; + ctx->nr_ops = 1; + ctx->reg = kzalloc(sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { + ret = -ENOMEM; + goto out_unmap_prot_sg; + } + + ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->reg->mr) { + ret = -EAGAIN; + goto out_free_ctx; + } + + count += rdma_rw_inv_key(ctx->reg); + + memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs)); + + ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg, + prot_sgt.nents, NULL, SZ_4K); + if (unlikely(ret)) { + pr_err("failed to map PI sg (%u)\n", + sgt.nents + prot_sgt.nents); + goto out_destroy_sig_mr; + } + + ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY; + ctx->reg->reg_wr.wr.wr_cqe = NULL; + ctx->reg->reg_wr.wr.num_sge = 0; + ctx->reg->reg_wr.wr.send_flags = 0; + ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + ctx->reg->reg_wr.mr = ctx->reg->mr; + ctx->reg->reg_wr.key = ctx->reg->mr->lkey; + count++; + + ctx->reg->sge.addr = ctx->reg->mr->iova; + ctx->reg->sge.length = ctx->reg->mr->length; + if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE) + ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length; + + rdma_wr = &ctx->reg->wr; + rdma_wr->wr.sg_list = &ctx->reg->sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + ctx->reg->reg_wr.wr.next = &rdma_wr->wr; + count++; + + return count; + +out_destroy_sig_mr: + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); +out_free_ctx: + kfree(ctx->reg); +out_unmap_prot_sg: + if (prot_sgt.nents) + rdma_rw_unmap_sg(dev, prot_sgt.sgl, prot_sgt.orig_nents, dir); +out_unmap_sg: + rdma_rw_unmap_sg(dev, sgt.sgl, sgt.orig_nents, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_signature_init); + +/* + * Now that we are going to post the WRs we can update the lkey and need_inval + * state on the MRs. If we were doing this at init time, we would get double + * or missing invalidations if a context was initialized but not actually + * posted. 
+ */ +static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) +{ + reg->mr->need_inval = need_inval; + ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey)); + reg->reg_wr.key = reg->mr->lkey; + reg->sge.lkey = reg->mr->lkey; +} + +/** + * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Return the WR chain for the set of RDMA READ/WRITE operations described by + * @ctx, as well as any memory registration operations needed. If @chain_wr + * is non-NULL the WR it points to will be appended to the chain of WRs posted. + * If @chain_wr is not set @cqe must be set so that the caller gets a + * completion notification. + */ +struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *last_wr; + int i; + + switch (ctx->type) { + case RDMA_RW_SIG_MR: + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) { + rdma_rw_update_lkey(&ctx->reg[i], + ctx->reg[i].wr.wr.opcode != + IB_WR_RDMA_READ_WITH_INV); + } + + if (ctx->reg[0].inv_wr.next) + first_wr = &ctx->reg[0].inv_wr; + else + first_wr = &ctx->reg[0].reg_wr.wr; + last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; + break; + case RDMA_RW_MULTI_WR: + first_wr = &ctx->map.wrs[0].wr; + last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; + break; + case RDMA_RW_SINGLE_WR: + first_wr = &ctx->single.wr.wr; + last_wr = &ctx->single.wr.wr; + break; + default: + BUG(); + } + + if (chain_wr) { + last_wr->next = chain_wr; + } else { + last_wr->wr_cqe = cqe; + last_wr->send_flags |= IB_SEND_SIGNALED; + } + + return first_wr; +} +EXPORT_SYMBOL(rdma_rw_ctx_wrs); + +/** + * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Post the set of RDMA READ/WRITE operations described by @ctx, as well as + * any memory registration operations needed. If @chain_wr is non-NULL the + * WR it points to will be appended to the chain of WRs posted. If @chain_wr + * is not set @cqe must be set so that the caller gets a completion + * notification. 
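When a caller wants its own final WR, for example a response send, to carry the completion instead of the RDMA chain, it can use rdma_rw_ctx_wrs() directly and post the chain itself. A minimal sketch, assuming resp_wr was prepared by the caller and is either signaled or carries a wr_cqe:

static int example_write_then_respond(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
				      u32 port_num, struct ib_send_wr *resp_wr)
{
	struct ib_send_wr *first_wr;

	/* resp_wr is appended to the chain, so no cqe is needed here. */
	first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, NULL, resp_wr);
	return ib_post_send(qp, first_wr, NULL);
}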
+ */ +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr; + + first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr); + return ib_post_send(qp, first_wr, NULL); +} +EXPORT_SYMBOL(rdma_rw_ctx_post); + +/** + * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, + enum dma_data_direction dir) +{ + int i; + + switch (ctx->type) { + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); + break; + case RDMA_RW_MULTI_WR: + kfree(ctx->map.wrs); + kfree(ctx->map.sges); + break; + case RDMA_RW_SINGLE_WR: + break; + default: + BUG(); + break; + } + + rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy); + +/** + * rdma_rw_ctx_destroy_signature - release all resources allocated by + * rdma_rw_ctx_signature_init + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist that was used for the READ/WRITE of the PI + * @prot_sg_cnt: number of entries in @prot_sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u32 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + enum dma_data_direction dir) +{ + if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) + return; + + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); + kfree(ctx->reg); + + if (prot_sg_cnt) + rdma_rw_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); + rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); + +/** + * rdma_rw_mr_factor - return number of MRs required for a payload + * @device: device handling the connection + * @port_num: port num to which the connection is bound + * @maxpages: maximum payload pages per rdma_rw_ctx + * + * Returns the number of MRs the device requires to move @maxpayload + * bytes. The returned value is used during transport creation to + * compute max_rdma_ctxts and the size of the transport's Send and + * Send Completion Queues. + */ +unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, + unsigned int maxpages) +{ + unsigned int mr_pages; + + if (rdma_rw_can_use_mr(device, port_num)) + mr_pages = rdma_rw_fr_page_list_len(device, false); + else + mr_pages = device->attrs.max_sge_rd; + return DIV_ROUND_UP(maxpages, mr_pages); +} +EXPORT_SYMBOL(rdma_rw_mr_factor); + +void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) +{ + u32 factor; + + WARN_ON_ONCE(attr->port_num == 0); + + /* + * Each context needs at least one RDMA READ or WRITE WR. + * + * For some hardware we might need more, eventually we should ask the + * HCA driver for a multiplier here. 
+ */ + factor = 1; + + /* + * If the devices needs MRs to perform RDMA READ or WRITE operations, + * we'll need two additional MRs for the registrations and the + * invalidation. + */ + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || + rdma_rw_can_use_mr(dev, attr->port_num)) + factor += 2; /* inv + reg */ + + attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; + + /* + * But maybe we were just too high in the sky and the device doesn't + * even support all we need, and we'll have to live with what we get.. + */ + attr->cap.max_send_wr = + min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); +} + +int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) +{ + struct ib_device *dev = qp->pd->device; + u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0; + int ret = 0; + + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) { + nr_sig_mrs = attr->cap.max_rdma_ctxs; + nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, true); + } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { + nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, false); + } + + if (nr_mrs) { + ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, + IB_MR_TYPE_MEM_REG, + max_num_sg, 0); + if (ret) { + pr_err("%s: failed to allocated %u MRs\n", + __func__, nr_mrs); + return ret; + } + } + + if (nr_sig_mrs) { + ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, + IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg); + if (ret) { + pr_err("%s: failed to allocated %u SIG MRs\n", + __func__, nr_sig_mrs); + goto out_free_rdma_mrs; + } + } + + return 0; + +out_free_rdma_mrs: + ib_mr_pool_destroy(qp, &qp->rdma_mrs); + return ret; +} + +void rdma_rw_cleanup_mrs(struct ib_qp *qp) +{ + ib_mr_pool_destroy(qp, &qp->sig_mrs); + ib_mr_pool_destroy(qp, &qp->rdma_mrs); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa.h new file mode 100644 index 0000000..9375880 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
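Tying the helpers above together, a transport would normally size max_rdma_ctxs from rdma_rw_mr_factor() before creating the QP and let ib_create_qp() route through rdma_rw_init_qp()/rdma_rw_init_mrs(). A rough sketch, where nr_requests and pages_per_io are assumed transport parameters:

static void example_size_qp_caps(struct ib_device *dev, u32 port_num,
				 u32 nr_requests, u32 pages_per_io,
				 struct ib_qp_init_attr *attr)
{
	/* One rdma_rw context per in-flight request, scaled by the MR factor. */
	attr->port_num = port_num;
	attr->cap.max_rdma_ctxs =
		nr_requests * rdma_rw_mr_factor(dev, port_num, pages_per_io);
	/*
	 * ib_create_qp() then calls rdma_rw_init_qp(), which grows
	 * cap.max_send_wr accordingly and clamps it to the HCA limit.
	 */
}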
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SA_H +#define SA_H + +#include + +static inline void ib_sa_client_get(struct ib_sa_client *client) +{ + atomic_inc(&client->users); +} + +static inline void ib_sa_client_put(struct ib_sa_client *client) +{ + if (atomic_dec_and_test(&client->users)) + complete(&client->comp); +} + +int ib_sa_mcmember_rec_query(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, u8 method, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, int retries, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_mcmember_rec *resp, + void *context), + void *context, struct ib_sa_query **sa_query); + +int mcast_init(void); +void mcast_cleanup(void); + +#endif /* SA_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa_query.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa_query.c new file mode 100644 index 0000000..0011d53 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sa_query.c @@ -0,0 +1,2360 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
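The two inlines above implement the usual "refcount plus completion" teardown pattern: every outstanding query holds a reference taken with ib_sa_client_get() and drops it when the query is released, so unregistering a client can block until the last query finishes. A simplified sketch of the register/unregister side, mirroring ib_sa_register_client()/ib_sa_unregister_client() in sa_query.c with details omitted:

void example_sa_register_client(struct ib_sa_client *client)
{
	atomic_set(&client->users, 1);
	init_completion(&client->comp);
}

void example_sa_unregister_client(struct ib_sa_client *client)
{
	ib_sa_client_put(client);		/* drop the initial reference */
	wait_for_completion(&client->comp);	/* wait for in-flight queries */
}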
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sa.h" +#include "core_priv.h" + +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 +#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 +#define IB_SA_CPI_MAX_RETRY_CNT 3 +#define IB_SA_CPI_RETRY_WAIT 1000 /*msecs */ +static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; + +struct ib_sa_sm_ah { + struct ib_ah *ah; + struct kref ref; + u16 pkey_index; + u8 src_path_mask; +}; + +enum rdma_class_port_info_type { + RDMA_CLASS_PORT_INFO_IB, + RDMA_CLASS_PORT_INFO_OPA +}; + +struct rdma_class_port_info { + enum rdma_class_port_info_type type; + union { + struct ib_class_port_info ib; + struct opa_class_port_info opa; + }; +}; + +struct ib_sa_classport_cache { + bool valid; + int retry_cnt; + struct rdma_class_port_info data; +}; + +struct ib_sa_port { + struct ib_mad_agent *agent; + struct ib_sa_sm_ah *sm_ah; + struct work_struct update_task; + struct ib_sa_classport_cache classport_info; + struct delayed_work ib_cpi_work; + spinlock_t classport_lock; /* protects class port info set */ + spinlock_t ah_lock; + u32 port_num; +}; + +struct ib_sa_device { + int start_port, end_port; + struct ib_event_handler event_handler; + struct ib_sa_port port[]; +}; + +struct ib_sa_query { + void (*callback)(struct ib_sa_query *sa_query, int status, + int num_prs, struct ib_sa_mad *mad); + void (*release)(struct ib_sa_query *); + struct ib_sa_client *client; + struct ib_sa_port *port; + struct ib_mad_send_buf *mad_buf; + struct ib_sa_sm_ah *sm_ah; + int id; + u32 flags; + struct list_head list; /* Local svc request list */ + u32 seq; /* Local svc request sequence number */ + unsigned long timeout; /* Local svc timeout */ + u8 path_use; /* How will the pathrecord be used */ + + /* A separate buffer to save pathrecords of a response, as in cases + * like IB/netlink, mulptiple pathrecords are supported, so that + * mad->data is not large enough to hold them + */ + void *resp_pr_data; +}; + +#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 +#define IB_SA_CANCEL 0x00000002 +#define IB_SA_QUERY_OPA 0x00000004 + +struct ib_sa_path_query { + void (*callback)(int status, struct sa_path_rec *rec, + int num_paths, void *context); + void *context; + struct ib_sa_query sa_query; + struct sa_path_rec *conv_pr; +}; + +struct ib_sa_guidinfo_query { + void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_classport_info_query { + void (*callback)(void *); + void *context; + struct ib_sa_query sa_query; +}; + +struct ib_sa_mcmember_query { + void (*callback)(int, struct ib_sa_mcmember_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; + +static LIST_HEAD(ib_nl_request_list); +static DEFINE_SPINLOCK(ib_nl_request_lock); +static atomic_t ib_nl_sa_request_seq; +static struct workqueue_struct *ib_nl_wq; +static struct delayed_work ib_nl_timed_work; +static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY, + .len = sizeof(struct ib_path_rec_data)}, + [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32}, + [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64}, + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, + [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, + [LS_NLA_TYPE_TCLASS] = 
{.type = NLA_U8}, + [LS_NLA_TYPE_PKEY] = {.type = NLA_U16}, + [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16}, +}; + + +static int ib_sa_add_one(struct ib_device *device); +static void ib_sa_remove_one(struct ib_device *device, void *client_data); + +static struct ib_client sa_client = { + .name = "sa", + .add = ib_sa_add_one, + .remove = ib_sa_remove_one +}; + +static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); + +static DEFINE_SPINLOCK(tid_lock); +static u32 tid; + +#define PATH_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct sa_path_rec, field), \ + .struct_size_bytes = sizeof_field(struct sa_path_rec, field), \ + .field_name = "sa_path_rec:" #field + +static const struct ib_field path_rec_table[] = { + { PATH_REC_FIELD(service_id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { PATH_REC_FIELD(dgid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { PATH_REC_FIELD(sgid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 }, + { PATH_REC_FIELD(ib.dlid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 16 }, + { PATH_REC_FIELD(ib.slid), + .offset_words = 10, + .offset_bits = 16, + .size_bits = 16 }, + { PATH_REC_FIELD(ib.raw_traffic), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 11, + .offset_bits = 1, + .size_bits = 3 }, + { PATH_REC_FIELD(flow_label), + .offset_words = 11, + .offset_bits = 4, + .size_bits = 20 }, + { PATH_REC_FIELD(hop_limit), + .offset_words = 11, + .offset_bits = 24, + .size_bits = 8 }, + { PATH_REC_FIELD(traffic_class), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 8 }, + { PATH_REC_FIELD(reversible), + .offset_words = 12, + .offset_bits = 8, + .size_bits = 1 }, + { PATH_REC_FIELD(numb_path), + .offset_words = 12, + .offset_bits = 9, + .size_bits = 7 }, + { PATH_REC_FIELD(pkey), + .offset_words = 12, + .offset_bits = 16, + .size_bits = 16 }, + { PATH_REC_FIELD(qos_class), + .offset_words = 13, + .offset_bits = 0, + .size_bits = 12 }, + { PATH_REC_FIELD(sl), + .offset_words = 13, + .offset_bits = 12, + .size_bits = 4 }, + { PATH_REC_FIELD(mtu_selector), + .offset_words = 13, + .offset_bits = 16, + .size_bits = 2 }, + { PATH_REC_FIELD(mtu), + .offset_words = 13, + .offset_bits = 18, + .size_bits = 6 }, + { PATH_REC_FIELD(rate_selector), + .offset_words = 13, + .offset_bits = 24, + .size_bits = 2 }, + { PATH_REC_FIELD(rate), + .offset_words = 13, + .offset_bits = 26, + .size_bits = 6 }, + { PATH_REC_FIELD(packet_life_time_selector), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 2 }, + { PATH_REC_FIELD(packet_life_time), + .offset_words = 14, + .offset_bits = 2, + .size_bits = 6 }, + { PATH_REC_FIELD(preference), + .offset_words = 14, + .offset_bits = 8, + .size_bits = 8 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 16, + .size_bits = 48 }, +}; + +#define OPA_PATH_REC_FIELD(field) \ + .struct_offset_bytes = \ + offsetof(struct sa_path_rec, field), \ + .struct_size_bytes = \ + sizeof_field(struct sa_path_rec, field), \ + .field_name = "sa_path_rec:" #field + +static const struct ib_field opa_path_rec_table[] = { + { OPA_PATH_REC_FIELD(service_id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { OPA_PATH_REC_FIELD(dgid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_PATH_REC_FIELD(sgid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_PATH_REC_FIELD(opa.dlid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 32 }, + { 
OPA_PATH_REC_FIELD(opa.slid), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_PATH_REC_FIELD(opa.raw_traffic), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 12, + .offset_bits = 1, + .size_bits = 3 }, + { OPA_PATH_REC_FIELD(flow_label), + .offset_words = 12, + .offset_bits = 4, + .size_bits = 20 }, + { OPA_PATH_REC_FIELD(hop_limit), + .offset_words = 12, + .offset_bits = 24, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(traffic_class), + .offset_words = 13, + .offset_bits = 0, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(reversible), + .offset_words = 13, + .offset_bits = 8, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(numb_path), + .offset_words = 13, + .offset_bits = 9, + .size_bits = 7 }, + { OPA_PATH_REC_FIELD(pkey), + .offset_words = 13, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_PATH_REC_FIELD(opa.l2_8B), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_10B), + .offset_words = 14, + .offset_bits = 1, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_9B), + .offset_words = 14, + .offset_bits = 2, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_16B), + .offset_words = 14, + .offset_bits = 3, + .size_bits = 1 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 4, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(opa.qos_type), + .offset_words = 14, + .offset_bits = 6, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(opa.qos_priority), + .offset_words = 14, + .offset_bits = 8, + .size_bits = 8 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 16, + .size_bits = 3 }, + { OPA_PATH_REC_FIELD(sl), + .offset_words = 14, + .offset_bits = 19, + .size_bits = 5 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 24, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(mtu_selector), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(mtu), + .offset_words = 15, + .offset_bits = 2, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(rate_selector), + .offset_words = 15, + .offset_bits = 8, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(rate), + .offset_words = 15, + .offset_bits = 10, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(packet_life_time_selector), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(packet_life_time), + .offset_words = 15, + .offset_bits = 18, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(preference), + .offset_words = 15, + .offset_bits = 24, + .size_bits = 8 }, +}; + +#define MCMEMBER_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ + .struct_size_bytes = sizeof_field(struct ib_sa_mcmember_rec, field), \ + .field_name = "sa_mcmember_rec:" #field + +static const struct ib_field mcmember_rec_table[] = { + { MCMEMBER_REC_FIELD(mgid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 128 }, + { MCMEMBER_REC_FIELD(port_gid), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 128 }, + { MCMEMBER_REC_FIELD(qkey), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { MCMEMBER_REC_FIELD(mlid), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 16 }, + { MCMEMBER_REC_FIELD(mtu_selector), + .offset_words = 9, + .offset_bits = 16, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(mtu), + .offset_words = 9, + .offset_bits = 18, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(traffic_class), + .offset_words = 9, + .offset_bits = 24, + .size_bits = 8 }, + { MCMEMBER_REC_FIELD(pkey), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 16 }, + { 
MCMEMBER_REC_FIELD(rate_selector), + .offset_words = 10, + .offset_bits = 16, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(rate), + .offset_words = 10, + .offset_bits = 18, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(packet_life_time_selector), + .offset_words = 10, + .offset_bits = 24, + .size_bits = 2 }, + { MCMEMBER_REC_FIELD(packet_life_time), + .offset_words = 10, + .offset_bits = 26, + .size_bits = 6 }, + { MCMEMBER_REC_FIELD(sl), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(flow_label), + .offset_words = 11, + .offset_bits = 4, + .size_bits = 20 }, + { MCMEMBER_REC_FIELD(hop_limit), + .offset_words = 11, + .offset_bits = 24, + .size_bits = 8 }, + { MCMEMBER_REC_FIELD(scope), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(join_state), + .offset_words = 12, + .offset_bits = 4, + .size_bits = 4 }, + { MCMEMBER_REC_FIELD(proxy_join), + .offset_words = 12, + .offset_bits = 8, + .size_bits = 1 }, + { RESERVED, + .offset_words = 12, + .offset_bits = 9, + .size_bits = 23 }, +}; + +#define CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ + .struct_size_bytes = sizeof_field(struct ib_class_port_info, field), \ + .field_name = "ib_class_port_info:" #field + +static const struct ib_field ib_classport_info_rec_table[] = { + { CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(capability_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 7, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(redirect_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(trap_tcslfl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(trap_hlqp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, +}; + +#define OPA_CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes =\ + offsetof(struct opa_class_port_info, field), \ + .struct_size_bytes = \ + sizeof_field(struct opa_class_port_info, field), \ + .field_name = "opa_class_port_info:" #field + +static const struct ib_field opa_classport_info_rec_table[] = { + { OPA_CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { OPA_CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, 
+ .offset_bits = 8, + .size_bits = 8 }, + { OPA_CLASSPORTINFO_REC_FIELD(cap_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_tc_fl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_sl_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_tc_fl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_hl_qp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 18, + .offset_bits = 0, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 18, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_sl_rsvd), + .offset_words = 19, + .offset_bits = 0, + .size_bits = 8 }, + { RESERVED, + .offset_words = 19, + .offset_bits = 8, + .size_bits = 24 }, +}; + +#define GUIDINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ + .struct_size_bytes = sizeof_field(struct ib_sa_guidinfo_rec, field), \ + .field_name = "sa_guidinfo_rec:" #field + +static const struct ib_field guidinfo_rec_table[] = { + { GUIDINFO_REC_FIELD(lid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { GUIDINFO_REC_FIELD(block_num), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res1), + .offset_words = 0, + .offset_bits = 24, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res2), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { GUIDINFO_REC_FIELD(guid_info_list), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 512 }, +}; + +static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) +{ + query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; +} + +static inline int ib_sa_query_cancelled(struct ib_sa_query *query) +{ + return (query->flags & IB_SA_CANCEL); +} + +static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, + struct ib_sa_query *query) +{ + struct sa_path_rec *sa_rec = query->mad_buf->context[1]; + struct ib_sa_mad *mad = query->mad_buf->mad; + ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask; + u16 val16; + u64 val64; + struct rdma_ls_resolve_header *header; + + query->mad_buf->context[1] = NULL; + + /* Construct the family header first */ + header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + strscpy_pad(header->device_name, + dev_name(&query->port->agent->device->dev), + LS_DEVICE_NAME_MAX); + header->port_num = query->port->port_num; + + if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && + sa_rec->reversible != 0) + query->path_use = LS_RESOLVE_PATH_USE_ALL; + else + query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; + 
header->path_use = query->path_use; + + /* Now build the attributes */ + if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) { + val64 = be64_to_cpu(sa_rec->service_id); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID, + sizeof(val64), &val64); + } + if (comp_mask & IB_SA_PATH_REC_DGID) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID, + sizeof(sa_rec->dgid), &sa_rec->dgid); + if (comp_mask & IB_SA_PATH_REC_SGID) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID, + sizeof(sa_rec->sgid), &sa_rec->sgid); + if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS, + sizeof(sa_rec->traffic_class), &sa_rec->traffic_class); + + if (comp_mask & IB_SA_PATH_REC_PKEY) { + val16 = be16_to_cpu(sa_rec->pkey); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY, + sizeof(val16), &val16); + } + if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) { + val16 = be16_to_cpu(sa_rec->qos_class); + nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS, + sizeof(val16), &val16); + } +} + +static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) +{ + int len = 0; + + if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) + len += nla_total_size(sizeof(u64)); + if (comp_mask & IB_SA_PATH_REC_DGID) + len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); + if (comp_mask & IB_SA_PATH_REC_SGID) + len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); + if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) + len += nla_total_size(sizeof(u8)); + if (comp_mask & IB_SA_PATH_REC_PKEY) + len += nla_total_size(sizeof(u16)); + if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) + len += nla_total_size(sizeof(u16)); + + /* + * Make sure that at least some of the required comp_mask bits are + * set. + */ + if (WARN_ON(len == 0)) + return len; + + /* Add the family header */ + len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header)); + + return len; +} + +static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + void *data; + struct ib_sa_mad *mad; + int len; + unsigned long flags; + unsigned long delay; + gfp_t gfp_flag; + int ret; + + INIT_LIST_HEAD(&query->list); + query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); + + mad = query->mad_buf->mad; + len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask); + if (len <= 0) + return -EMSGSIZE; + + skb = nlmsg_new(len, gfp_mask); + if (!skb) + return -ENOMEM; + + /* Put nlmsg header only for now */ + data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); + if (!data) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + /* Add attributes */ + ib_nl_set_path_rec_attrs(skb, query); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + + gfp_flag = ((gfp_mask & GFP_ATOMIC) == GFP_ATOMIC) ? 
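+ /* The netlink multicast below is sent while ib_nl_request_lock (a
+  * spinlock) is held, so the allocation there must not sleep: keep
+  * GFP_ATOMIC if the caller requested it, otherwise fall back to
+  * GFP_NOWAIT.
+  */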
GFP_ATOMIC : + GFP_NOWAIT; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_flag); + + if (ret) + goto out; + + /* Put the request on the list.*/ + delay = msecs_to_jiffies(sa_local_svc_timeout_ms); + query->timeout = delay + jiffies; + list_add_tail(&query->list, &ib_nl_request_list); + /* Start the timeout if this is the only request */ + if (ib_nl_request_list.next == &query->list) + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); + +out: + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + + return ret; +} + +static int ib_nl_cancel_request(struct ib_sa_query *query) +{ + unsigned long flags; + struct ib_sa_query *wait_query; + int found = 0; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_for_each_entry(wait_query, &ib_nl_request_list, list) { + /* Let the timeout to take care of the callback */ + if (query == wait_query) { + query->flags |= IB_SA_CANCEL; + query->timeout = jiffies; + list_move(&query->list, &ib_nl_request_list); + found = 1; + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1); + break; + } + } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + + return found; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc); + +static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, + const struct nlmsghdr *nlh) +{ + struct ib_path_rec_data *srec, *drec; + struct ib_sa_path_query *path_query; + struct ib_mad_send_wc mad_send_wc; + const struct nlattr *head, *curr; + struct ib_sa_mad *mad = NULL; + int len, rem, num_prs = 0; + u32 mask = 0; + int status = -EIO; + + if (!query->callback) + goto out; + + path_query = container_of(query, struct ib_sa_path_query, sa_query); + mad = query->mad_buf->mad; + if (!path_query->conv_pr && + (be16_to_cpu(mad->mad_hdr.attr_id) == IB_SA_ATTR_PATH_REC)) { + /* Need a larger buffer for possible multiple PRs */ + query->resp_pr_data = kvcalloc(RDMA_PRIMARY_PATH_MAX_REC_NUM, + sizeof(*drec), GFP_KERNEL); + if (!query->resp_pr_data) { + query->callback(query, -ENOMEM, 0, NULL); + return; + } + } + + head = (const struct nlattr *) nlmsg_data(nlh); + len = nlmsg_len(nlh); + switch (query->path_use) { + case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: + mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; + break; + + case LS_RESOLVE_PATH_USE_ALL: + mask = IB_PATH_PRIMARY; + break; + + case LS_RESOLVE_PATH_USE_GMP: + default: + mask = IB_PATH_PRIMARY | IB_PATH_GMP | + IB_PATH_BIDIRECTIONAL; + break; + } + + drec = (struct ib_path_rec_data *)query->resp_pr_data; + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD) + continue; + + srec = nla_data(curr); + if ((srec->flags & mask) != mask) + continue; + + status = 0; + if (!drec) { + memcpy(mad->data, srec->path_rec, + sizeof(srec->path_rec)); + num_prs = 1; + break; + } + + memcpy(drec, srec, sizeof(*drec)); + drec++; + num_prs++; + if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM) + break; + } + + if (!status) + mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; + + query->callback(query, status, num_prs, mad); + kvfree(query->resp_pr_data); + query->resp_pr_data = NULL; + +out: + mad_send_wc.send_buf = query->mad_buf; + mad_send_wc.status = IB_WC_SUCCESS; + send_handler(query->mad_buf->mad_agent, &mad_send_wc); +} + +static void ib_nl_request_timeout(struct work_struct *work) +{ + unsigned long flags; + struct ib_sa_query *query; + unsigned long delay; + struct ib_mad_send_wc mad_send_wc; + int ret; + + spin_lock_irqsave(&ib_nl_request_lock, 
flags); + while (!list_empty(&ib_nl_request_list)) { + query = list_entry(ib_nl_request_list.next, + struct ib_sa_query, list); + + if (time_after(query->timeout, jiffies)) { + delay = query->timeout - jiffies; + if ((long)delay <= 0) + delay = 1; + queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); + break; + } + + list_del(&query->list); + ib_sa_disable_local_svc(query); + /* Hold the lock to protect against query cancellation */ + if (ib_sa_query_cancelled(query)) + ret = -1; + else + ret = ib_post_send_mad(query->mad_buf, NULL); + if (ret) { + mad_send_wc.send_buf = query->mad_buf; + mad_send_wc.status = IB_WC_WR_FLUSH_ERR; + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + send_handler(query->port->agent, &mad_send_wc); + spin_lock_irqsave(&ib_nl_request_lock, flags); + } + } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); +} + +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + int timeout, delta, abs_delta; + const struct nlattr *attr; + unsigned long flags; + struct ib_sa_query *query; + long delay = 0; + struct nlattr *tb[LS_NLA_TYPE_MAX]; + int ret; + + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk)) + return -EPERM; + + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy, NULL); + attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; + if (ret || !attr) + goto settimeout_out; + + timeout = *(int *) nla_data(attr); + if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN) + timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN; + if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) + timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; + + delta = timeout - sa_local_svc_timeout_ms; + if (delta < 0) + abs_delta = -delta; + else + abs_delta = delta; + + if (delta != 0) { + spin_lock_irqsave(&ib_nl_request_lock, flags); + sa_local_svc_timeout_ms = timeout; + list_for_each_entry(query, &ib_nl_request_list, list) { + if (delta < 0 && abs_delta > query->timeout) + query->timeout = 0; + else + query->timeout += delta; + + /* Get the new delay from the first entry */ + if (!delay) { + delay = query->timeout - jiffies; + if (delay <= 0) + delay = 1; + } + } + if (delay) + mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, + (unsigned long)delay); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + } + +settimeout_out: + return 0; +} + +static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX]; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return 0; + + ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_policy, NULL); + if (ret) + return 0; + + return 1; +} + +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + unsigned long flags; + struct ib_sa_query *query; + struct ib_mad_send_buf *send_buf; + struct ib_mad_send_wc mad_send_wc; + int found = 0; + int ret; + + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk)) + return -EPERM; + + spin_lock_irqsave(&ib_nl_request_lock, flags); + list_for_each_entry(query, &ib_nl_request_list, list) { + /* + * If the query is cancelled, let the timeout routine + * take care of it. 
+ */ + if (nlh->nlmsg_seq == query->seq) { + found = !ib_sa_query_cancelled(query); + if (found) + list_del(&query->list); + break; + } + } + + if (!found) { + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + goto resp_out; + } + + send_buf = query->mad_buf; + + if (!ib_nl_is_good_resolve_resp(nlh)) { + /* if the result is a failure, send out the packet via IB */ + ib_sa_disable_local_svc(query); + ret = ib_post_send_mad(query->mad_buf, NULL); + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + if (ret) { + mad_send_wc.send_buf = send_buf; + mad_send_wc.status = IB_WC_GENERAL_ERR; + send_handler(query->port->agent, &mad_send_wc); + } + } else { + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + ib_nl_process_good_resolve_rsp(query, nlh); + } + +resp_out: + return 0; +} + +static void free_sm_ah(struct kref *kref) +{ + struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); + + rdma_destroy_ah(sm_ah->ah, 0); + kfree(sm_ah); +} + +void ib_sa_register_client(struct ib_sa_client *client) +{ + atomic_set(&client->users, 1); + init_completion(&client->comp); +} +EXPORT_SYMBOL(ib_sa_register_client); + +void ib_sa_unregister_client(struct ib_sa_client *client) +{ + ib_sa_client_put(client); + wait_for_completion(&client->comp); +} +EXPORT_SYMBOL(ib_sa_unregister_client); + +/** + * ib_sa_cancel_query - try to cancel an SA query + * @id:ID of query to cancel + * @query:query pointer to cancel + * + * Try to cancel an SA query. If the id and query don't match up or + * the query has already completed, nothing is done. Otherwise the + * query is canceled and will complete with a status of -EINTR. + */ +void ib_sa_cancel_query(int id, struct ib_sa_query *query) +{ + unsigned long flags; + struct ib_mad_send_buf *mad_buf; + + xa_lock_irqsave(&queries, flags); + if (xa_load(&queries, id) != query) { + xa_unlock_irqrestore(&queries, flags); + return; + } + mad_buf = query->mad_buf; + xa_unlock_irqrestore(&queries, flags); + + /* + * If the query is still on the netlink request list, schedule + * it to be cancelled by the timeout routine. Otherwise, it has been + * sent to the MAD layer and has to be cancelled from there. + */ + if (!ib_nl_cancel_request(query)) + ib_cancel_mad(mad_buf); +} +EXPORT_SYMBOL(ib_sa_cancel_query); + +static u8 get_src_path_mask(struct ib_device *device, u32 port_num) +{ + struct ib_sa_device *sa_dev; + struct ib_sa_port *port; + unsigned long flags; + u8 src_path_mask; + + sa_dev = ib_get_client_data(device, &sa_client); + if (!sa_dev) + return 0x7f; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + spin_lock_irqsave(&port->ah_lock, flags); + src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f; + spin_unlock_irqrestore(&port->ah_lock, flags); + + return src_path_mask; +} + +static int init_ah_attr_grh_fields(struct ib_device *device, u32 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *gid_attr) +{ + enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec); + + if (!gid_attr) { + gid_attr = rdma_find_gid_by_port(device, &rec->sgid, type, + port_num, NULL); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + } else + rdma_hold_gid_attr(gid_attr); + + rdma_move_grh_sgid_attr(ah_attr, &rec->dgid, + be32_to_cpu(rec->flow_label), + rec->hop_limit, rec->traffic_class, + gid_attr); + return 0; +} + +/** + * ib_init_ah_attr_from_path - Initialize address handle attributes based on + * an SA path record. + * @device: Device associated ah attributes initialization. 
+ * @port_num: Port on the specified device. + * @rec: path record entry to use for ah attributes initialization. + * @ah_attr: address handle attributes to initialization from path record. + * @gid_attr: SGID attribute to consider during initialization. + * + * When ib_init_ah_attr_from_path() returns success, + * (a) for IB link layer it optionally contains a reference to SGID attribute + * when GRH is present for IB link layer. + * (b) for RoCE link layer it contains a reference to SGID attribute. + * User must invoke rdma_destroy_ah_attr() to release reference to SGID + * attributes which are initialized using ib_init_ah_attr_from_path(). + */ +int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *gid_attr) +{ + int ret = 0; + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = rdma_ah_find_type(device, port_num); + rdma_ah_set_sl(ah_attr, rec->sl); + rdma_ah_set_port_num(ah_attr, port_num); + rdma_ah_set_static_rate(ah_attr, rec->rate); + + if (sa_path_is_roce(rec)) { + ret = roce_resolve_route_from_path(rec, gid_attr); + if (ret) + return ret; + + memcpy(ah_attr->roce.dmac, sa_path_get_dmac(rec), ETH_ALEN); + } else { + rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + if (sa_path_is_opa(rec) && + rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) + rdma_ah_set_make_grd(ah_attr, true); + + rdma_ah_set_path_bits(ah_attr, + be32_to_cpu(sa_path_get_slid(rec)) & + get_src_path_mask(device, port_num)); + } + + if (rec->hop_limit > 0 || sa_path_is_roce(rec)) + ret = init_ah_attr_grh_fields(device, port_num, + rec, ah_attr, gid_attr); + return ret; +} +EXPORT_SYMBOL(ib_init_ah_attr_from_path); + +static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) +{ + struct rdma_ah_attr ah_attr; + unsigned long flags; + + spin_lock_irqsave(&query->port->ah_lock, flags); + if (!query->port->sm_ah) { + spin_unlock_irqrestore(&query->port->ah_lock, flags); + return -EAGAIN; + } + kref_get(&query->port->sm_ah->ref); + query->sm_ah = query->port->sm_ah; + spin_unlock_irqrestore(&query->port->ah_lock, flags); + + /* + * Always check if sm_ah has valid dlid assigned, + * before querying for class port info + */ + if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) || + !rdma_is_valid_unicast_lid(&ah_attr)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + return -EAGAIN; + } + query->mad_buf = ib_create_send_mad(query->port->agent, 1, + query->sm_ah->pkey_index, + 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, + gfp_mask, + ((query->flags & IB_SA_QUERY_OPA) ? 
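+ /* OPA-directed queries are built with the OPA management base version;
+  * all other queries use the standard IB base version.
+  */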
+ OPA_MGMT_BASE_VERSION : + IB_MGMT_BASE_VERSION)); + if (IS_ERR(query->mad_buf)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + return -ENOMEM; + } + + query->mad_buf->ah = query->sm_ah->ah; + + return 0; +} + +static void free_mad(struct ib_sa_query *query) +{ + ib_free_send_mad(query->mad_buf); + kref_put(&query->sm_ah->ref, free_sm_ah); +} + +static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) +{ + struct ib_sa_mad *mad = query->mad_buf->mad; + unsigned long flags; + + memset(mad, 0, sizeof *mad); + + if (query->flags & IB_SA_QUERY_OPA) { + mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION; + mad->mad_hdr.class_version = OPA_SA_CLASS_VERSION; + } else { + mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; + mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; + } + mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + spin_lock_irqsave(&tid_lock, flags); + mad->mad_hdr.tid = + cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++); + spin_unlock_irqrestore(&tid_lock, flags); +} + +static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, + int retries, gfp_t gfp_mask) +{ + unsigned long flags; + int ret, id; + const int nmbr_sa_query_retries = 10; + + xa_lock_irqsave(&queries, flags); + ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask); + xa_unlock_irqrestore(&queries, flags); + if (ret < 0) + return ret; + + query->mad_buf->timeout_ms = timeout_ms / nmbr_sa_query_retries; + query->mad_buf->retries = nmbr_sa_query_retries; + if (!query->mad_buf->timeout_ms) { + /* Special case, very small timeout_ms */ + query->mad_buf->timeout_ms = 1; + query->mad_buf->retries = timeout_ms; + } + query->mad_buf->context[0] = query; + query->id = id; + + if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && + (!(query->flags & IB_SA_QUERY_OPA))) { + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!ib_nl_make_request(query, gfp_mask)) + return id; + } + ib_sa_disable_local_svc(query); + } + + ret = ib_post_send_mad(query->mad_buf, NULL); + if (ret) { + xa_lock_irqsave(&queries, flags); + __xa_erase(&queries, id); + xa_unlock_irqrestore(&queries, flags); + } + + /* + * It's not safe to dereference query any more, because the + * send may already have completed and freed the query in + * another context. + */ + return ret ? ret : id; +} + +void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec) +{ + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); +} +EXPORT_SYMBOL(ib_sa_unpack_path); + +void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) +{ + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); +} +EXPORT_SYMBOL(ib_sa_pack_path); + +static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, + struct ib_sa_device *sa_dev, + u32 port_num) +{ + struct ib_sa_port *port; + unsigned long flags; + bool ret = false; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + spin_lock_irqsave(&port->classport_lock, flags); + if (!port->classport_info.valid) + goto ret; + + if (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_OPA) + ret = opa_get_cpi_capmask2(&port->classport_info.data.opa) & + OPA_CLASS_PORT_INFO_PR_SUPPORT; +ret: + spin_unlock_irqrestore(&port->classport_lock, flags); + return ret; +} + +enum opa_pr_supported { + PR_NOT_SUPPORTED, + PR_OPA_SUPPORTED, + PR_IB_SUPPORTED +}; + +/* + * opa_pr_query_possible - Check if current PR query can be an OPA query. 
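+ * The decision is driven by the port's cached ClassPortInfo (the OPA
+ * path record support capability) and by the port LID.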
+ * + * Retuns PR_NOT_SUPPORTED if a path record query is not + * possible, PR_OPA_SUPPORTED if an OPA path record query + * is possible and PR_IB_SUPPORTED if an IB path record + * query is possible. + */ +static int opa_pr_query_possible(struct ib_sa_client *client, + struct ib_sa_device *sa_dev, + struct ib_device *device, u32 port_num) +{ + struct ib_port_attr port_attr; + + if (ib_query_port(device, port_num, &port_attr)) + return PR_NOT_SUPPORTED; + + if (ib_sa_opa_pathrecord_support(client, sa_dev, port_num)) + return PR_OPA_SUPPORTED; + + if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + return PR_NOT_SUPPORTED; + else + return PR_IB_SUPPORTED; +} + +static void ib_sa_pr_callback_single(struct ib_sa_path_query *query, + int status, struct ib_sa_mad *mad) +{ + struct sa_path_rec rec = {}; + + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(&rec); + + if (query->conv_pr) { + struct sa_path_rec opa; + + memset(&opa, 0, sizeof(struct sa_path_rec)); + sa_convert_path_ib_to_opa(&opa, &rec); + query->callback(status, &opa, 1, query->context); + } else { + query->callback(status, &rec, 1, query->context); + } +} + +/** + * ib_sa_pr_callback_multiple() - Parse path records then do callback. + * + * In a multiple-PR case the PRs are saved in "query->resp_pr_data" + * (instead of"mad->data") and with "ib_path_rec_data" structure format, + * so that rec->flags can be set to indicate the type of PR. + * This is valid only in IB fabric. + */ +static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query, + int status, int num_prs, + struct ib_path_rec_data *rec_data) +{ + struct sa_path_rec *rec; + int i; + + rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL); + if (!rec) { + query->callback(-ENOMEM, NULL, 0, query->context); + return; + } + + for (i = 0; i < num_prs; i++) { + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec_data[i].path_rec, rec + i); + rec[i].rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_dmac_zero(rec + i); + rec[i].flags = rec_data[i].flags; + } + + query->callback(status, rec, num_prs, query->context); + kvfree(rec); +} + +static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, + int status, int num_prs, + struct ib_sa_mad *mad) +{ + struct ib_sa_path_query *query = + container_of(sa_query, struct ib_sa_path_query, sa_query); + struct sa_path_rec rec; + + if (!mad || !num_prs) { + query->callback(status, NULL, 0, query->context); + return; + } + + if (sa_query->flags & IB_SA_QUERY_OPA) { + if (num_prs != 1) { + query->callback(-EINVAL, NULL, 0, query->context); + return; + } + + ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_OPA; + query->callback(status, &rec, num_prs, query->context); + } else { + if (!sa_query->resp_pr_data) + ib_sa_pr_callback_single(query, status, mad); + else + ib_sa_pr_callback_multiple(query, status, num_prs, + sa_query->resp_pr_data); + } +} + +static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) +{ + struct ib_sa_path_query *query = + container_of(sa_query, struct ib_sa_path_query, sa_query); + + kfree(query->conv_pr); + kfree(query); +} + +/** + * ib_sa_path_rec_get - Start a Path get query + * @client:SA client + * @device:device to send query on + * @port_num: port number to send query on + * @rec:Path Record to send in query + * @comp_mask:component mask to send in query + * @timeout_ms:time to wait for response + * @retries:retries to send 
for response + * @gfp_mask:GFP mask to use for internal allocations + * @callback:function called when query completes, times out or is + * canceled + * @context:opaque user context passed to callback + * @sa_query:query context, used to cancel query + * + * Send a Path Record Get query to the SA to look up a path. The + * callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_path_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. + */ +int ib_sa_path_rec_get(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, int retries, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_path_rec *resp, + int num_paths, void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_path_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + enum opa_pr_supported status; + int ret; + + if (!sa_dev) + return -ENODEV; + + if ((rec->rec_type != SA_PATH_REC_TYPE_IB) && + (rec->rec_type != SA_PATH_REC_TYPE_OPA)) + return -EINVAL; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { + status = opa_pr_query_possible(client, sa_dev, device, port_num); + if (status == PR_NOT_SUPPORTED) { + ret = -EINVAL; + goto err1; + } else if (status == PR_OPA_SUPPORTED) { + query->sa_query.flags |= IB_SA_QUERY_OPA; + } else { + query->conv_pr = + kmalloc(sizeof(*query->conv_pr), gfp_mask); + if (!query->conv_pr) { + ret = -ENOMEM; + goto err1; + } + } + } + + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err2; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL; + query->sa_query.release = ib_sa_path_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); + mad->sa_hdr.comp_mask = comp_mask; + + if (query->sa_query.flags & IB_SA_QUERY_OPA) { + ib_pack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + rec, mad->data); + } else if (query->conv_pr) { + sa_convert_path_opa_to_ib(query->conv_pr, rec); + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), + query->conv_pr, mad->data); + } else { + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec, mad->data); + } + + *sa_query = &query->sa_query; + + query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; + query->sa_query.mad_buf->context[1] = (query->conv_pr) ? 
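+ /* context[1] keeps a pointer to the request path record so that the
+  * netlink local service path (ib_nl_set_path_rec_attrs()) can build
+  * the resolve attributes from it before the MAD is (possibly) sent.
+  */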
+ query->conv_pr : rec; + + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); + if (ret < 0) + goto err3; + + return ret; + +err3: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); +err2: + kfree(query->conv_pr); +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_path_rec_get); + +static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, + int status, int num_prs, + struct ib_sa_mad *mad) +{ + struct ib_sa_mcmember_query *query = + container_of(sa_query, struct ib_sa_mcmember_query, sa_query); + + if (mad) { + struct ib_sa_mcmember_rec rec; + + ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query)); +} + +int ib_sa_mcmember_rec_query(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + u8 method, + struct ib_sa_mcmember_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms,int retries, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_mcmember_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_mcmember_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = callback ? 
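+ /* With a NULL callback the response is never unpacked: send_handler()
+  * and recv_handler() skip the dispatch when query->callback is not set.
+  */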
ib_sa_mcmember_rec_callback : NULL; + query->sa_query.release = ib_sa_mcmember_rec_release; + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), + rec, mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} + +/* Support GuidInfoRecord */ +static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, + int status, int num_paths, + struct ib_sa_mad *mad) +{ + struct ib_sa_guidinfo_query *query = + container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); + + if (mad) { + struct ib_sa_guidinfo_rec rec; + + ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); +} + +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + unsigned long timeout_ms, int retries, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_guidinfo_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) { + return -EINVAL; + } + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = callback ? 
ib_sa_guidinfo_rec_callback : NULL; + query->sa_query.release = ib_sa_guidinfo_rec_release; + + mad->mad_hdr.method = method; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, + mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_guid_info_rec_query); + +struct ib_classport_info_context { + struct completion done; + struct ib_sa_query *sa_query; +}; + +static void ib_classportinfo_cb(void *context) +{ + struct ib_classport_info_context *cb_ctx = context; + + complete(&cb_ctx->done); +} + +static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, + int status, int num_prs, + struct ib_sa_mad *mad) +{ + unsigned long flags; + struct ib_sa_classport_info_query *query = + container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + struct ib_sa_classport_cache *info = &sa_query->port->classport_info; + + if (mad) { + if (sa_query->flags & IB_SA_QUERY_OPA) { + struct opa_class_port_info rec; + + ib_unpack(opa_classport_info_rec_table, + ARRAY_SIZE(opa_classport_info_rec_table), + mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, + flags); + if (!status && !info->valid) { + memcpy(&info->data.opa, &rec, + sizeof(info->data.opa)); + + info->valid = true; + info->data.type = RDMA_CLASS_PORT_INFO_OPA; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, + flags); + + } else { + struct ib_class_port_info rec; + + ib_unpack(ib_classport_info_rec_table, + ARRAY_SIZE(ib_classport_info_rec_table), + mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, + flags); + if (!status && !info->valid) { + memcpy(&info->data.ib, &rec, + sizeof(info->data.ib)); + + info->valid = true; + info->data.type = RDMA_CLASS_PORT_INFO_IB; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, + flags); + } + } + query->callback(query->context); +} + +static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_classport_info_query, + sa_query)); +} + +static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, + unsigned long timeout_ms, + int retries, + void (*callback)(void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_mad_agent *agent; + struct ib_sa_classport_info_query *query; + struct ib_sa_mad *mad; + gfp_t gfp_mask = GFP_KERNEL; + int ret; + + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + query->sa_query.flags |= rdma_cap_opa_ah(port->agent->device, + port->port_num) ? 
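+ /* Request the classport info in OPA format when the port supports OPA
+  * address handles, otherwise as a plain IB query.
+  */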
+ IB_SA_QUERY_OPA : 0; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err_free; + + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(&query->sa_query, agent); + + query->sa_query.callback = ib_sa_classport_info_rec_callback; + query->sa_query.release = ib_sa_classport_info_rec_release; + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); + mad->sa_hdr.comp_mask = 0; + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); + if (ret < 0) + goto err_free_mad; + + return ret; + +err_free_mad: + *sa_query = NULL; + free_mad(&query->sa_query); + +err_free: + kfree(query); + return ret; +} + +static void update_ib_cpi(struct work_struct *work) +{ + struct ib_sa_port *port = + container_of(work, struct ib_sa_port, ib_cpi_work.work); + struct ib_classport_info_context *cb_context; + unsigned long flags; + int ret; + + /* If the classport info is valid, nothing + * to do here. + */ + spin_lock_irqsave(&port->classport_lock, flags); + if (port->classport_info.valid) { + spin_unlock_irqrestore(&port->classport_lock, flags); + return; + } + spin_unlock_irqrestore(&port->classport_lock, flags); + + cb_context = kmalloc(sizeof(*cb_context), GFP_KERNEL); + if (!cb_context) + goto err_nomem; + + init_completion(&cb_context->done); + + ret = ib_sa_classport_info_rec_query(port, 3000, 0, + ib_classportinfo_cb, cb_context, + &cb_context->sa_query); + if (ret < 0) + goto free_cb_err; + wait_for_completion(&cb_context->done); +free_cb_err: + kfree(cb_context); + spin_lock_irqsave(&port->classport_lock, flags); + + /* If the classport info is still not valid, the query should have + * failed for some reason. Retry issuing the query + */ + if (!port->classport_info.valid) { + port->classport_info.retry_cnt++; + if (port->classport_info.retry_cnt <= + IB_SA_CPI_MAX_RETRY_CNT) { + unsigned long delay = + msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); + + queue_delayed_work(ib_wq, &port->ib_cpi_work, delay); + } + } + spin_unlock_irqrestore(&port->classport_lock, flags); + +err_nomem: + return; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; + unsigned long flags; + + if (query->callback) + switch (mad_send_wc->status) { + case IB_WC_SUCCESS: + /* No callback -- already got recv */ + break; + case IB_WC_RESP_TIMEOUT_ERR: + query->callback(query, -ETIMEDOUT, 0, NULL); + break; + case IB_WC_WR_FLUSH_ERR: + query->callback(query, -EINTR, 0, NULL); + break; + default: + query->callback(query, -EIO, 0, NULL); + break; + } + + xa_lock_irqsave(&queries, flags); + __xa_erase(&queries, query->id); + xa_unlock_irqrestore(&queries, flags); + + free_mad(query); + if (query->client) + ib_sa_client_put(query->client); + query->release(query); +} + +static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_sa_query *query; + + if (!send_buf) + return; + + query = send_buf->context[0]; + if (query->callback) { + if (mad_recv_wc->wc->status == IB_WC_SUCCESS) + query->callback(query, + mad_recv_wc->recv_buf.mad->mad_hdr.status ? 
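+ /* A non-zero status in the SA MAD header is reported to the consumer
+  * as -EINVAL; a clean response is delivered with status 0 and one
+  * record.
+  */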
+ -EINVAL : 0, 1, + (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); + else + query->callback(query, -EIO, 0, NULL); + } + + ib_free_recv_mad(mad_recv_wc); +} + +static void update_sm_ah(struct work_struct *work) +{ + struct ib_sa_port *port = + container_of(work, struct ib_sa_port, update_task); + struct ib_sa_sm_ah *new_ah; + struct ib_port_attr port_attr; + struct rdma_ah_attr ah_attr; + bool grh_required; + + if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { + pr_warn("Couldn't query port\n"); + return; + } + + new_ah = kmalloc(sizeof(*new_ah), GFP_KERNEL); + if (!new_ah) + return; + + kref_init(&new_ah->ref); + new_ah->src_path_mask = (1 << port_attr.lmc) - 1; + + new_ah->pkey_index = 0; + if (ib_find_pkey(port->agent->device, port->port_num, + IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) + pr_err("Couldn't find index for default PKey\n"); + + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.type = rdma_ah_find_type(port->agent->device, + port->port_num); + rdma_ah_set_dlid(&ah_attr, port_attr.sm_lid); + rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); + rdma_ah_set_port_num(&ah_attr, port->port_num); + + grh_required = rdma_is_grh_required(port->agent->device, + port->port_num); + + /* + * The OPA sm_lid of 0xFFFF needs special handling so that it can be + * differentiated from a permissive LID of 0xFFFF. We set the + * grh_required flag here so the SA can program the DGID in the + * address handle appropriately + */ + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA && + (grh_required || + port_attr.sm_lid == be16_to_cpu(IB_LID_PERMISSIVE))) + rdma_ah_set_make_grd(&ah_attr, true); + + if (ah_attr.type == RDMA_AH_ATTR_TYPE_IB && grh_required) { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } + + new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr, + RDMA_CREATE_AH_SLEEPABLE); + if (IS_ERR(new_ah->ah)) { + pr_warn("Couldn't create new SM AH\n"); + kfree(new_ah); + return; + } + + spin_lock_irq(&port->ah_lock); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = new_ah; + spin_unlock_irq(&port->ah_lock); +} + +static void ib_sa_event(struct ib_event_handler *handler, + struct ib_event *event) +{ + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER) { + unsigned long flags; + struct ib_sa_device *sa_dev = + container_of(handler, typeof(*sa_dev), event_handler); + u32 port_num = event->element.port_num - sa_dev->start_port; + struct ib_sa_port *port = &sa_dev->port[port_num]; + + if (!rdma_cap_ib_sa(handler->device, port->port_num)) + return; + + spin_lock_irqsave(&port->ah_lock, flags); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = NULL; + spin_unlock_irqrestore(&port->ah_lock, flags); + + if (event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PORT_ACTIVE) { + unsigned long delay = + msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); + + spin_lock_irqsave(&port->classport_lock, flags); + port->classport_info.valid = false; + port->classport_info.retry_cnt = 0; + spin_unlock_irqrestore(&port->classport_lock, flags); + queue_delayed_work(ib_wq, + &port->ib_cpi_work, delay); + } + 
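+ /* The cached SM address handle was dropped above; schedule
+  * update_sm_ah() to rebuild it for this port.
+  */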
queue_work(ib_wq, &sa_dev->port[port_num].update_task); + } +} + +static int ib_sa_add_one(struct ib_device *device) +{ + struct ib_sa_device *sa_dev; + int s, e, i; + int count = 0; + int ret; + + s = rdma_start_port(device); + e = rdma_end_port(device); + + sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL); + if (!sa_dev) + return -ENOMEM; + + sa_dev->start_port = s; + sa_dev->end_port = e; + + for (i = 0; i <= e - s; ++i) { + spin_lock_init(&sa_dev->port[i].ah_lock); + if (!rdma_cap_ib_sa(device, i + 1)) + continue; + + sa_dev->port[i].sm_ah = NULL; + sa_dev->port[i].port_num = i + s; + + spin_lock_init(&sa_dev->port[i].classport_lock); + sa_dev->port[i].classport_info.valid = false; + + sa_dev->port[i].agent = + ib_register_mad_agent(device, i + s, IB_QPT_GSI, + NULL, 0, send_handler, + recv_handler, sa_dev, 0); + if (IS_ERR(sa_dev->port[i].agent)) { + ret = PTR_ERR(sa_dev->port[i].agent); + goto err; + } + + INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); + INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work, + update_ib_cpi); + + count++; + } + + if (!count) { + ret = -EOPNOTSUPP; + goto free; + } + + ib_set_client_data(device, &sa_client, sa_dev); + + /* + * We register our event handler after everything is set up, + * and then update our cached info after the event handler is + * registered to avoid any problems if a port changes state + * during our initialization. + */ + + INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); + ib_register_event_handler(&sa_dev->event_handler); + + for (i = 0; i <= e - s; ++i) { + if (rdma_cap_ib_sa(device, i + 1)) + update_sm_ah(&sa_dev->port[i].update_task); + } + + return 0; + +err: + while (--i >= 0) { + if (rdma_cap_ib_sa(device, i + 1)) + ib_unregister_mad_agent(sa_dev->port[i].agent); + } +free: + kfree(sa_dev); + return ret; +} + +static void ib_sa_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_sa_device *sa_dev = client_data; + int i; + + ib_unregister_event_handler(&sa_dev->event_handler); + flush_workqueue(ib_wq); + + for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { + if (rdma_cap_ib_sa(device, i + 1)) { + cancel_delayed_work_sync(&sa_dev->port[i].ib_cpi_work); + ib_unregister_mad_agent(sa_dev->port[i].agent); + if (sa_dev->port[i].sm_ah) + kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); + } + + } + + kfree(sa_dev); +} + +int ib_sa_init(void) +{ + int ret; + + get_random_bytes(&tid, sizeof tid); + + atomic_set(&ib_nl_sa_request_seq, 0); + + ret = ib_register_client(&sa_client); + if (ret) { + pr_err("Couldn't register ib_sa client\n"); + goto err1; + } + + ret = mcast_init(); + if (ret) { + pr_err("Couldn't initialize multicast handling\n"); + goto err2; + } + + ib_nl_wq = alloc_ordered_workqueue("ib_nl_sa_wq", WQ_MEM_RECLAIM); + if (!ib_nl_wq) { + ret = -ENOMEM; + goto err3; + } + + INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); + + return 0; + +err3: + mcast_cleanup(); +err2: + ib_unregister_client(&sa_client); +err1: + return ret; +} + +void ib_sa_cleanup(void) +{ + cancel_delayed_work(&ib_nl_timed_work); + destroy_workqueue(ib_nl_wq); + mcast_cleanup(); + ib_unregister_client(&sa_client); + WARN_ON(!xa_empty(&queries)); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/security.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/security.c new file mode 100644 index 0000000..3512c2e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/security.c @@ -0,0 +1,750 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include "core_priv.h" +#include "mad_priv.h" + +static LIST_HEAD(mad_agent_list); +/* Lock to protect mad_agent_list */ +static DEFINE_SPINLOCK(mad_agent_list_lock); + +static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *pkey = NULL; + struct pkey_index_qp_list *tmp_pkey; + struct ib_device *dev = pp->sec->dev; + + spin_lock(&dev->port_data[pp->port_num].pkey_list_lock); + list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list, + pkey_index_list) { + if (tmp_pkey->pkey_index == pp->pkey_index) { + pkey = tmp_pkey; + break; + } + } + spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock); + return pkey; +} + +static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp, + u16 *pkey, + u64 *subnet_prefix) +{ + struct ib_device *dev = pp->sec->dev; + int ret; + + ret = ib_get_cached_pkey(dev, pp->port_num, pp->pkey_index, pkey); + if (ret) + return ret; + + ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); + + return ret; +} + +static int enforce_qp_pkey_security(u16 pkey, + u64 subnet_prefix, + struct ib_qp_security *qp_sec) +{ + struct ib_qp_security *shared_qp_sec; + int ret; + + ret = security_ib_pkey_access(qp_sec->security, subnet_prefix, pkey); + if (ret) + return ret; + + list_for_each_entry(shared_qp_sec, + &qp_sec->shared_qp_list, + shared_qp_list) { + ret = security_ib_pkey_access(shared_qp_sec->security, + subnet_prefix, + pkey); + if (ret) + return ret; + } + return 0; +} + +/* The caller of this function must hold the QP security + * mutex of the QP of the security structure in *pps. + * + * It takes separate ports_pkeys and security structure + * because in some cases the pps will be for a new settings + * or the pps will be for the real QP and security structure + * will be for a shared QP. 
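+ * Both the main and the alternate port/pkey settings are checked (when
+ * valid) against the security policy via security_ib_pkey_access().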
+ */ +static int check_qp_port_pkey_settings(struct ib_ports_pkeys *pps, + struct ib_qp_security *sec) +{ + u64 subnet_prefix; + u16 pkey; + int ret = 0; + + if (!pps) + return 0; + + if (pps->main.state != IB_PORT_PKEY_NOT_VALID) { + ret = get_pkey_and_subnet_prefix(&pps->main, + &pkey, + &subnet_prefix); + if (ret) + return ret; + + ret = enforce_qp_pkey_security(pkey, + subnet_prefix, + sec); + if (ret) + return ret; + } + + if (pps->alt.state != IB_PORT_PKEY_NOT_VALID) { + ret = get_pkey_and_subnet_prefix(&pps->alt, + &pkey, + &subnet_prefix); + if (ret) + return ret; + + ret = enforce_qp_pkey_security(pkey, + subnet_prefix, + sec); + } + + return ret; +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static void qp_to_error(struct ib_qp_security *sec) +{ + struct ib_qp_security *shared_qp_sec; + struct ib_qp_attr attr = { + .qp_state = IB_QPS_ERR + }; + struct ib_event event = { + .event = IB_EVENT_QP_FATAL + }; + + /* If the QP is in the process of being destroyed + * the qp pointer in the security structure is + * undefined. It cannot be modified now. + */ + if (sec->destroying) + return; + + ib_modify_qp(sec->qp, + &attr, + IB_QP_STATE); + + if (sec->qp->event_handler && sec->qp->qp_context) { + event.element.qp = sec->qp; + sec->qp->event_handler(&event, + sec->qp->qp_context); + } + + list_for_each_entry(shared_qp_sec, + &sec->shared_qp_list, + shared_qp_list) { + struct ib_qp *qp = shared_qp_sec->qp; + + if (qp->event_handler && qp->qp_context) { + event.element.qp = qp; + event.device = qp->device; + qp->event_handler(&event, + qp->qp_context); + } + } +} + +static inline void check_pkey_qps(struct pkey_index_qp_list *pkey, + struct ib_device *device, + u32 port_num, + u64 subnet_prefix) +{ + struct ib_port_pkey *pp, *tmp_pp; + bool comp; + LIST_HEAD(to_error_list); + u16 pkey_val; + + if (!ib_get_cached_pkey(device, + port_num, + pkey->pkey_index, + &pkey_val)) { + spin_lock(&pkey->qp_list_lock); + list_for_each_entry(pp, &pkey->qp_list, qp_list) { + if (atomic_read(&pp->sec->error_list_count)) + continue; + + if (enforce_qp_pkey_security(pkey_val, + subnet_prefix, + pp->sec)) { + atomic_inc(&pp->sec->error_list_count); + list_add(&pp->to_error_list, + &to_error_list); + } + } + spin_unlock(&pkey->qp_list_lock); + } + + list_for_each_entry_safe(pp, + tmp_pp, + &to_error_list, + to_error_list) { + mutex_lock(&pp->sec->mutex); + qp_to_error(pp->sec); + list_del(&pp->to_error_list); + atomic_dec(&pp->sec->error_list_count); + comp = pp->sec->destroying; + mutex_unlock(&pp->sec->mutex); + + if (comp) + complete(&pp->sec->error_complete); + } +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static int port_pkey_list_insert(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *tmp_pkey; + struct pkey_index_qp_list *pkey; + struct ib_device *dev; + u32 port_num = pp->port_num; + int ret = 0; + + if (pp->state != IB_PORT_PKEY_VALID) + return 0; + + dev = pp->sec->dev; + + pkey = get_pkey_idx_qp_list(pp); + + if (!pkey) { + bool found = false; + + pkey = kzalloc(sizeof(*pkey), GFP_KERNEL); + if (!pkey) + return -ENOMEM; + + spin_lock(&dev->port_data[port_num].pkey_list_lock); + /* Check for the PKey again. A racing process may + * have created it. 
+ */ + list_for_each_entry(tmp_pkey, + &dev->port_data[port_num].pkey_list, + pkey_index_list) { + if (tmp_pkey->pkey_index == pp->pkey_index) { + kfree(pkey); + pkey = tmp_pkey; + found = true; + break; + } + } + + if (!found) { + pkey->pkey_index = pp->pkey_index; + spin_lock_init(&pkey->qp_list_lock); + INIT_LIST_HEAD(&pkey->qp_list); + list_add(&pkey->pkey_index_list, + &dev->port_data[port_num].pkey_list); + } + spin_unlock(&dev->port_data[port_num].pkey_list_lock); + } + + spin_lock(&pkey->qp_list_lock); + list_add(&pp->qp_list, &pkey->qp_list); + spin_unlock(&pkey->qp_list_lock); + + pp->state = IB_PORT_PKEY_LISTED; + + return ret; +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static void port_pkey_list_remove(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *pkey; + + if (pp->state != IB_PORT_PKEY_LISTED) + return; + + pkey = get_pkey_idx_qp_list(pp); + + spin_lock(&pkey->qp_list_lock); + list_del(&pp->qp_list); + spin_unlock(&pkey->qp_list_lock); + + /* The setting may still be valid, i.e. after + * a destroy has failed for example. + */ + pp->state = IB_PORT_PKEY_VALID; +} + +static void destroy_qp_security(struct ib_qp_security *sec) +{ + security_ib_free_security(sec->security); + kfree(sec->ports_pkeys); + kfree(sec); +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp, + const struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + struct ib_ports_pkeys *new_pps; + struct ib_ports_pkeys *qp_pps = qp->qp_sec->ports_pkeys; + + new_pps = kzalloc(sizeof(*new_pps), GFP_KERNEL); + if (!new_pps) + return NULL; + + if (qp_attr_mask & IB_QP_PORT) + new_pps->main.port_num = qp_attr->port_num; + else if (qp_pps) + new_pps->main.port_num = qp_pps->main.port_num; + + if (qp_attr_mask & IB_QP_PKEY_INDEX) + new_pps->main.pkey_index = qp_attr->pkey_index; + else if (qp_pps) + new_pps->main.pkey_index = qp_pps->main.pkey_index; + + if (((qp_attr_mask & IB_QP_PKEY_INDEX) && + (qp_attr_mask & IB_QP_PORT)) || + (qp_pps && qp_pps->main.state != IB_PORT_PKEY_NOT_VALID)) + new_pps->main.state = IB_PORT_PKEY_VALID; + + if (qp_attr_mask & IB_QP_ALT_PATH) { + new_pps->alt.port_num = qp_attr->alt_port_num; + new_pps->alt.pkey_index = qp_attr->alt_pkey_index; + new_pps->alt.state = IB_PORT_PKEY_VALID; + } else if (qp_pps) { + new_pps->alt.port_num = qp_pps->alt.port_num; + new_pps->alt.pkey_index = qp_pps->alt.pkey_index; + if (qp_pps->alt.state != IB_PORT_PKEY_NOT_VALID) + new_pps->alt.state = IB_PORT_PKEY_VALID; + } + + new_pps->main.sec = qp->qp_sec; + new_pps->alt.sec = qp->qp_sec; + return new_pps; +} + +int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev) +{ + struct ib_qp *real_qp = qp->real_qp; + int ret; + + ret = ib_create_qp_security(qp, dev); + + if (ret) + return ret; + + if (!qp->qp_sec) + return 0; + + mutex_lock(&real_qp->qp_sec->mutex); + ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys, + qp->qp_sec); + + if (ret) + goto ret; + + if (qp != real_qp) + list_add(&qp->qp_sec->shared_qp_list, + &real_qp->qp_sec->shared_qp_list); +ret: + mutex_unlock(&real_qp->qp_sec->mutex); + if (ret) + destroy_qp_security(qp->qp_sec); + + return ret; +} + +void ib_close_shared_qp_security(struct ib_qp_security *sec) +{ + struct ib_qp *real_qp = sec->qp->real_qp; + + mutex_lock(&real_qp->qp_sec->mutex); + list_del(&sec->shared_qp_list); + mutex_unlock(&real_qp->qp_sec->mutex); + + destroy_qp_security(sec); +} + +int ib_create_qp_security(struct 
ib_qp *qp, struct ib_device *dev) +{ + unsigned int i; + bool is_ib = false; + int ret; + + rdma_for_each_port (dev, i) { + is_ib = rdma_protocol_ib(dev, i); + if (is_ib) + break; + } + + /* If this isn't an IB device don't create the security context */ + if (!is_ib) + return 0; + + qp->qp_sec = kzalloc(sizeof(*qp->qp_sec), GFP_KERNEL); + if (!qp->qp_sec) + return -ENOMEM; + + qp->qp_sec->qp = qp; + qp->qp_sec->dev = dev; + mutex_init(&qp->qp_sec->mutex); + INIT_LIST_HEAD(&qp->qp_sec->shared_qp_list); + atomic_set(&qp->qp_sec->error_list_count, 0); + init_completion(&qp->qp_sec->error_complete); + ret = security_ib_alloc_security(&qp->qp_sec->security); + if (ret) { + kfree(qp->qp_sec); + qp->qp_sec = NULL; + } + + return ret; +} +EXPORT_SYMBOL(ib_create_qp_security); + +void ib_destroy_qp_security_begin(struct ib_qp_security *sec) +{ + /* Return if not IB */ + if (!sec) + return; + + mutex_lock(&sec->mutex); + + /* Remove the QP from the lists so it won't get added to + * a to_error_list during the destroy process. + */ + if (sec->ports_pkeys) { + port_pkey_list_remove(&sec->ports_pkeys->main); + port_pkey_list_remove(&sec->ports_pkeys->alt); + } + + /* If the QP is already in one or more of those lists + * the destroying flag will ensure the to error flow + * doesn't operate on an undefined QP. + */ + sec->destroying = true; + + /* Record the error list count to know how many completions + * to wait for. + */ + sec->error_comps_pending = atomic_read(&sec->error_list_count); + + mutex_unlock(&sec->mutex); +} + +void ib_destroy_qp_security_abort(struct ib_qp_security *sec) +{ + int ret; + int i; + + /* Return if not IB */ + if (!sec) + return; + + /* If a concurrent cache update is in progress this + * QP security could be marked for an error state + * transition. Wait for this to complete. + */ + for (i = 0; i < sec->error_comps_pending; i++) + wait_for_completion(&sec->error_complete); + + mutex_lock(&sec->mutex); + sec->destroying = false; + + /* Restore the position in the lists and verify + * access is still allowed in case a cache update + * occurred while attempting to destroy. + * + * Because these setting were listed already + * and removed during ib_destroy_qp_security_begin + * we know the pkey_index_qp_list for the PKey + * already exists so port_pkey_list_insert won't fail. + */ + if (sec->ports_pkeys) { + port_pkey_list_insert(&sec->ports_pkeys->main); + port_pkey_list_insert(&sec->ports_pkeys->alt); + } + + ret = check_qp_port_pkey_settings(sec->ports_pkeys, sec); + if (ret) + qp_to_error(sec); + + mutex_unlock(&sec->mutex); +} + +void ib_destroy_qp_security_end(struct ib_qp_security *sec) +{ + int i; + + /* Return if not IB */ + if (!sec) + return; + + /* If a concurrent cache update is occurring we must + * wait until this QP security structure is processed + * in the QP to error flow before destroying it because + * the to_error_list is in use. 
+ */ + for (i = 0; i < sec->error_comps_pending; i++) + wait_for_completion(&sec->error_complete); + + destroy_qp_security(sec); +} + +void ib_security_cache_change(struct ib_device *device, + u32 port_num, + u64 subnet_prefix) +{ + struct pkey_index_qp_list *pkey; + + list_for_each_entry (pkey, &device->port_data[port_num].pkey_list, + pkey_index_list) { + check_pkey_qps(pkey, + device, + port_num, + subnet_prefix); + } +} + +void ib_security_release_port_pkey_list(struct ib_device *device) +{ + struct pkey_index_qp_list *pkey, *tmp_pkey; + unsigned int i; + + rdma_for_each_port (device, i) { + list_for_each_entry_safe(pkey, + tmp_pkey, + &device->port_data[i].pkey_list, + pkey_index_list) { + list_del(&pkey->pkey_index_list); + kfree(pkey); + } + } +} + +int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata) +{ + int ret = 0; + struct ib_ports_pkeys *tmp_pps; + struct ib_ports_pkeys *new_pps = NULL; + struct ib_qp *real_qp = qp->real_qp; + bool special_qp = (real_qp->qp_type == IB_QPT_SMI || + real_qp->qp_type == IB_QPT_GSI || + real_qp->qp_type >= IB_QPT_RESERVED1); + bool pps_change = ((qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) || + (qp_attr_mask & IB_QP_ALT_PATH)); + + WARN_ONCE((qp_attr_mask & IB_QP_PORT && + rdma_protocol_ib(real_qp->device, qp_attr->port_num) && + !real_qp->qp_sec), + "%s: QP security is not initialized for IB QP: %u\n", + __func__, real_qp->qp_num); + + /* The port/pkey settings are maintained only for the real QP. Open + * handles on the real QP will be in the shared_qp_list. When + * enforcing security on the real QP all the shared QPs will be + * checked as well. + */ + + if (pps_change && !special_qp && real_qp->qp_sec) { + mutex_lock(&real_qp->qp_sec->mutex); + new_pps = get_new_pps(real_qp, + qp_attr, + qp_attr_mask); + if (!new_pps) { + mutex_unlock(&real_qp->qp_sec->mutex); + return -ENOMEM; + } + /* Add this QP to the lists for the new port + * and pkey settings before checking for permission + * in case there is a concurrent cache update + * occurring. Walking the list for a cache change + * doesn't acquire the security mutex unless it's + * sending the QP to error. + */ + ret = port_pkey_list_insert(&new_pps->main); + + if (!ret) + ret = port_pkey_list_insert(&new_pps->alt); + + if (!ret) + ret = check_qp_port_pkey_settings(new_pps, + real_qp->qp_sec); + } + + if (!ret) + ret = real_qp->device->ops.modify_qp(real_qp, + qp_attr, + qp_attr_mask, + udata); + + if (new_pps) { + /* Clean up the lists and free the appropriate + * ports_pkeys structure. 
+ */ + if (ret) { + tmp_pps = new_pps; + } else { + tmp_pps = real_qp->qp_sec->ports_pkeys; + real_qp->qp_sec->ports_pkeys = new_pps; + } + + if (tmp_pps) { + port_pkey_list_remove(&tmp_pps->main); + port_pkey_list_remove(&tmp_pps->alt); + } + kfree(tmp_pps); + mutex_unlock(&real_qp->qp_sec->mutex); + } + return ret; +} + +static int ib_security_pkey_access(struct ib_device *dev, + u32 port_num, + u16 pkey_index, + void *sec) +{ + u64 subnet_prefix; + u16 pkey; + int ret; + + if (!rdma_protocol_ib(dev, port_num)) + return 0; + + ret = ib_get_cached_pkey(dev, port_num, pkey_index, &pkey); + if (ret) + return ret; + + ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix); + + return security_ib_pkey_access(sec, subnet_prefix, pkey); +} + +void ib_mad_agent_security_change(void) +{ + struct ib_mad_agent *ag; + + spin_lock(&mad_agent_list_lock); + list_for_each_entry(ag, + &mad_agent_list, + mad_agent_sec_list) + WRITE_ONCE(ag->smp_allowed, + !security_ib_endport_manage_subnet(ag->security, + dev_name(&ag->device->dev), ag->port_num)); + spin_unlock(&mad_agent_list_lock); +} + +int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type) +{ + int ret; + + if (!rdma_protocol_ib(agent->device, agent->port_num)) + return 0; + + INIT_LIST_HEAD(&agent->mad_agent_sec_list); + + ret = security_ib_alloc_security(&agent->security); + if (ret) + return ret; + + if (qp_type != IB_QPT_SMI) + return 0; + + spin_lock(&mad_agent_list_lock); + ret = security_ib_endport_manage_subnet(agent->security, + dev_name(&agent->device->dev), + agent->port_num); + if (ret) + goto free_security; + + WRITE_ONCE(agent->smp_allowed, true); + list_add(&agent->mad_agent_sec_list, &mad_agent_list); + spin_unlock(&mad_agent_list_lock); + return 0; + +free_security: + spin_unlock(&mad_agent_list_lock); + security_ib_free_security(agent->security); + return ret; +} + +void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) +{ + if (!rdma_protocol_ib(agent->device, agent->port_num)) + return; + + if (agent->qp->qp_type == IB_QPT_SMI) { + spin_lock(&mad_agent_list_lock); + list_del(&agent->mad_agent_sec_list); + spin_unlock(&mad_agent_list_lock); + } + + security_ib_free_security(agent->security); +} + +int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) +{ + if (!rdma_protocol_ib(map->agent.device, map->agent.port_num)) + return 0; + + if (map->agent.qp->qp_type == IB_QPT_SMI) { + if (!READ_ONCE(map->agent.smp_allowed)) + return -EACCES; + return 0; + } + + return ib_security_pkey_access(map->agent.device, + map->agent.port_num, + pkey_index, + map->agent.security); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.c new file mode 100644 index 0000000..45f09b7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2014 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include "smi.h" +#include "opa_smi.h" + +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, u32 port_num, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + const u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 */ + if (hop_cnt && *hop_ptr == 0) { + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:2 */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* return_path set when received */ + (*hop_ptr)++; + return (initial_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + /* return_path set when received */ + (*hop_ptr)++; + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- Fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- at the end of the DR segment of path */ + if (*hop_ptr == 1) { + (*hop_ptr)--; + /* C14-13:3 -- SMPs destined for SM shouldn't be here */ + return (is_switch || + dr_slid_is_permissive ? 
+ IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */ + if (*hop_ptr == 0) + return IB_SMI_HANDLE; + + /* C14-13:5 -- Check for unreasonable hop pointer */ + return IB_SMI_DISCARD; + } +} + +/* + * Fixup a directed route SMP for sending + * Return IB_SMI_DISCARD if the SMP should be discarded + */ +enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + bool is_switch, u32 port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, + bool is_switch, u32 port_num) +{ + return __smi_handle_dr_smp_send(is_switch, port_num, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, u32 port_num, + int phys_port_cnt, + u8 *hop_ptr, u8 hop_cnt, + const u8 *initial_path, + u8 *return_path, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + /* See section 14.2.2.2, Vol 1 IB spec */ + /* C14-6 -- valid hop_cnt values are from 0 to 63 */ + if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) + return IB_SMI_DISCARD; + + if (!direction) { + /* C14-9:1 -- sender should have incremented hop_ptr */ + if (hop_cnt && *hop_ptr == 0) + return IB_SMI_DISCARD; + + /* C14-9:2 -- intermediate hop */ + if (*hop_ptr && *hop_ptr < hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + return (initial_path[*hop_ptr+1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == hop_cnt) { + if (hop_cnt) + return_path[*hop_ptr] = port_num; + /* hop_ptr updated when sending */ + + return (is_switch || + dr_dlid_is_permissive ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + /* C14-9:5 -- fail unreasonable hop pointer */ + return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD); + + } else { + + /* C14-13:1 */ + if (hop_cnt && *hop_ptr == hop_cnt + 1) { + (*hop_ptr)--; + return (return_path[*hop_ptr] == + port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:2 */ + if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { + if (!is_switch) + return IB_SMI_DISCARD; + + /* hop_ptr updated when sending */ + return (return_path[*hop_ptr-1] <= phys_port_cnt ? + IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:3 -- We're at the end of the DR segment of path */ + if (*hop_ptr == 1) { + if (dr_slid_is_permissive) { + /* giving SMP to SM - update hop_ptr */ + (*hop_ptr)--; + return IB_SMI_HANDLE; + } + /* hop_ptr updated when sending */ + return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD); + } + + /* C14-13:4 -- hop_ptr = 0 -> give to SM */ + /* C14-13:5 -- Check for unreasonable hop pointer */ + return (*hop_ptr == 0 ? 
IB_SMI_HANDLE : IB_SMI_DISCARD); + } +} + +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, + u32 port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->initial_path, + smp->return_path, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +/* + * Adjust information for a received SMP + * Return IB_SMI_DISCARD if the SMP should be dropped + */ +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch, + u32 port_num, int phys_port_cnt) +{ + return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt, + &smp->hop_ptr, smp->hop_cnt, + smp->route.dr.initial_path, + smp->route.dr.return_path, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt, + u8 direction, + bool dr_dlid_is_permissive, + bool dr_slid_is_permissive) +{ + if (!direction) { + /* C14-9:2 -- intermediate hop */ + if (hop_ptr && hop_ptr < hop_cnt) + return IB_SMI_FORWARD; + + /* C14-9:3 -- at the end of the DR segment of path */ + if (hop_ptr == hop_cnt) + return (dr_dlid_is_permissive ? + IB_SMI_SEND : IB_SMI_LOCAL); + + /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */ + if (hop_ptr == hop_cnt + 1) + return IB_SMI_SEND; + } else { + /* C14-13:2 -- intermediate hop */ + if (2 <= hop_ptr && hop_ptr <= hop_cnt) + return IB_SMI_FORWARD; + + /* C14-13:3 -- at the end of the DR segment of path */ + if (hop_ptr == 1) + return (!dr_slid_is_permissive ? + IB_SMI_SEND : IB_SMI_LOCAL); + } + return IB_SMI_LOCAL; + +} + +enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + ib_get_smp_direction(smp), + smp->dr_dlid == IB_LID_PERMISSIVE, + smp->dr_slid == IB_LID_PERMISSIVE); +} + +enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp) +{ + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt, + opa_get_smp_direction(smp), + smp->route.dr.dr_dlid == + OPA_LID_PERMISSIVE, + smp->route.dr.dr_slid == + OPA_LID_PERMISSIVE); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int smi_get_fwd_port(struct ib_smp *smp) +{ + return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] : + smp->return_path[smp->hop_ptr-1]); +} + +/* + * Return the forwarding port number from initial_path for outgoing SMP and + * from return_path for returning SMP + */ +int opa_smi_get_fwd_port(struct opa_smp *smp) +{ + return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] : + smp->route.dr.return_path[smp->hop_ptr-1]; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.h new file mode 100644 index 0000000..e350ed6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/smi.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. 
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __SMI_H_ +#define __SMI_H_ + +#include + +enum smi_action { + IB_SMI_DISCARD, + IB_SMI_HANDLE +}; + +enum smi_forward_action { + IB_SMI_LOCAL, /* SMP should be completed up the stack */ + IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */ + IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */ +}; + +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch, + u32 port_num, int phys_port_cnt); +int smi_get_fwd_port(struct ib_smp *smp); +extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp); +extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, + bool is_switch, u32 port_num); + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action smi_check_local_smp(struct ib_smp *smp, + struct ib_device *device) +{ + /* C14-9:3 -- We're at the end of the DR segment of path */ + /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */ + return ((device->ops.process_mad && + !ib_get_smp_direction(smp) && + (smp->hop_ptr == smp->hop_cnt + 1)) ? + IB_SMI_HANDLE : IB_SMI_DISCARD); +} + +/* + * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM + * via process_mad + */ +static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp, + struct ib_device *device) +{ + /* C14-13:3 -- We're at the end of the DR segment of path */ + /* C14-13:4 -- Hop Pointer == 0 -> give to SM */ + return ((device->ops.process_mad && + ib_get_smp_direction(smp) && + !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD); +} + +#endif /* __SMI_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sysfs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sysfs.c new file mode 100644 index 0000000..19127c4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/sysfs.c @@ -0,0 +1,1506 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "core_priv.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct port_table_attribute { + struct ib_port_attribute attr; + char name[8]; + int index; + __be16 attr_id; +}; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group groups[2]; + const struct attribute_group *groups_list[3]; + struct port_table_attribute attrs_list[]; +}; + +struct ib_port { + struct kobject kobj; + struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; + struct hw_stats_port_data *hw_stats_data; + + struct attribute_group groups[3]; + const struct attribute_group *groups_list[5]; + u32 port_num; + struct port_table_attribute attrs_list[]; +}; + +struct hw_stats_device_attribute { + struct device_attribute attr; + ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf); + ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + const char *buf, size_t count); +}; + +struct hw_stats_port_attribute { + struct ib_port_attribute attr; + ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, char *buf); + ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + const char *buf, size_t count); +}; + +struct hw_stats_device_data { + struct attribute_group group; + struct rdma_hw_stats *stats; + struct hw_stats_device_attribute attrs[]; +}; + +struct hw_stats_port_data { + struct rdma_hw_stats *stats; + struct hw_stats_port_attribute attrs[]; +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct 
ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + + if (!port_attr->store) + return -EIO; + return port_attr->store(p->ibdev, p->port_num, port_attr, buf, count); +} + +struct ib_device *ib_port_sysfs_get_ibdev_kobj(struct kobject *kobj, + u32 *port_num) +{ + struct ib_port *port = container_of(kobj, struct ib_port, kobj); + + *port_num = port->port_num; + return port->ibdev; +} +EXPORT_SYMBOL(ib_port_sysfs_get_ibdev_kobj); + +static const struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store +}; + +static ssize_t hw_stat_device_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hw_stats_device_attribute *stat_attr = + container_of(attr, struct hw_stats_device_attribute, attr); + struct ib_device *ibdev = container_of(dev, struct ib_device, dev); + + return stat_attr->show(ibdev, ibdev->hw_stats_data->stats, + stat_attr - ibdev->hw_stats_data->attrs, 0, buf); +} + +static ssize_t hw_stat_device_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_device_attribute *stat_attr = + container_of(attr, struct hw_stats_device_attribute, attr); + struct ib_device *ibdev = container_of(dev, struct ib_device, dev); + + return stat_attr->store(ibdev, ibdev->hw_stats_data->stats, + stat_attr - ibdev->hw_stats_data->attrs, 0, buf, + count); +} + +static ssize_t hw_stat_port_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct hw_stats_port_attribute *stat_attr = + container_of(attr, struct hw_stats_port_attribute, attr); + struct ib_port *port = ibdev->port_data[port_num].sysfs; + + return stat_attr->show(ibdev, port->hw_stats_data->stats, + stat_attr - port->hw_stats_data->attrs, + port->port_num, buf); +} + +static ssize_t hw_stat_port_store(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_port_attribute *stat_attr = + container_of(attr, struct hw_stats_port_attribute, attr); + struct ib_port *port = ibdev->port_data[port_num].sysfs; + + return stat_attr->store(ibdev, port->hw_stats_data->stats, + stat_attr - port->hw_stats_data->attrs, + port->port_num, buf, count); +} + +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ib_port_attribute *port_attr = + container_of(attr, struct ib_port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p->ibdev, p->port_num, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + +static ssize_t state_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + static const char *state_name[] = { + [IB_PORT_NOP] = "NOP", + [IB_PORT_DOWN] = "DOWN", + [IB_PORT_INIT] = "INIT", + [IB_PORT_ARMED] = "ARMED", + [IB_PORT_ACTIVE] = "ACTIVE", + [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" + }; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sysfs_emit(buf, "%d: %s\n", attr.state, + attr.state >= 0 && + attr.state < ARRAY_SIZE(state_name) ? 
+ state_name[attr.state] : + "UNKNOWN"); +} + +static ssize_t lid_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sysfs_emit(buf, "0x%x\n", attr.lid); +} + +static ssize_t lid_mask_count_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sysfs_emit(buf, "%u\n", attr.lmc); +} + +static ssize_t sm_lid_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sysfs_emit(buf, "0x%x\n", attr.sm_lid); +} + +static ssize_t has_smi_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sprintf(buf, "%d\n", attr.has_smi); +} + +static ssize_t sm_sl_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sysfs_emit(buf, "%u\n", attr.sm_sl); +} + +static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sysfs_emit(buf, "0x%08x\n", attr.port_cap_flags); +} + +static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + char *speed = ""; + int rate; /* in deci-Gb/sec */ + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + if (!attr.active_speed && !attr.active_width) + return sprintf(buf, "0 GB/sec\n"); + + switch (attr.active_speed) { + case IB_SPEED_DDR: + speed = " DDR"; + rate = 50; + break; + case IB_SPEED_QDR: + speed = " QDR"; + rate = 100; + break; + case IB_SPEED_FDR10: + speed = " FDR10"; + rate = 100; + break; + case IB_SPEED_FDR: + speed = " FDR"; + rate = 140; + break; + case IB_SPEED_EDR: + speed = " EDR"; + rate = 250; + break; + case IB_SPEED_HDR: + speed = " HDR"; + rate = 500; + break; + case IB_SPEED_NDR: + speed = " NDR"; + rate = 1000; + break; + case IB_SPEED_SDR: + default: /* default to SDR for invalid rates */ + speed = " SDR"; + rate = 25; + break; + } + + rate *= ib_width_enum_to_int(attr.active_width); + if (rate < 0) + return -EINVAL; + + return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, + rate % 10 ? 
".5" : "", + ib_width_enum_to_int(attr.active_width), speed); +} + +static const char *phys_state_to_str(enum ib_port_phys_state phys_state) +{ + static const char *phys_state_str[] = { + "", + "Sleep", + "Polling", + "Disabled", + "PortConfigurationTraining", + "LinkUp", + "LinkErrorRecovery", + "Phy Test", + }; + + if (phys_state < ARRAY_SIZE(phys_state_str)) + return phys_state_str[phys_state]; + return ""; +} + +static ssize_t phys_state_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + struct ib_port_attr attr; + + ssize_t ret; + + ret = ib_query_port(ibdev, port_num, &attr); + if (ret) + return ret; + + return sysfs_emit(buf, "%u: %s\n", attr.phys_state, + phys_state_to_str(attr.phys_state)); +} + +static ssize_t link_layer_show(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *unused, char *buf) +{ + const char *output; + + switch (rdma_port_get_link_layer(ibdev, port_num)) { + case IB_LINK_LAYER_INFINIBAND: + output = "InfiniBand"; + break; + case IB_LINK_LAYER_ETHERNET: + output = "Ethernet"; + break; + default: + output = "Unknown"; + break; + } + + return sysfs_emit(buf, "%s\n", output); +} + +static IB_PORT_ATTR_RO(state); +static IB_PORT_ATTR_RO(lid); +static IB_PORT_ATTR_RO(lid_mask_count); +static IB_PORT_ATTR_RO(sm_lid); +static IB_PORT_ATTR_RO(sm_sl); +static IB_PORT_ATTR_RO(cap_mask); +static IB_PORT_ATTR_RO(rate); +static IB_PORT_ATTR_RO(phys_state); +static IB_PORT_ATTR_RO(link_layer); +static IB_PORT_ATTR_RO(has_smi); + +static struct attribute *port_default_attrs[] = { + &ib_port_attr_state.attr, + &ib_port_attr_lid.attr, + &ib_port_attr_lid_mask_count.attr, + &ib_port_attr_sm_lid.attr, + &ib_port_attr_has_smi.attr, + &ib_port_attr_sm_sl.attr, + &ib_port_attr_cap_mask.attr, + &ib_port_attr_rate.attr, + &ib_port_attr_phys_state.attr, + &ib_port_attr_link_layer.attr, + NULL +}; +ATTRIBUTE_GROUPS(port_default); + +static ssize_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) +{ + struct net_device *ndev; + int ret = -EINVAL; + + rcu_read_lock(); + ndev = rcu_dereference(gid_attr->ndev); + if (ndev) + ret = sysfs_emit(buf, "%s\n", ndev->name); + rcu_read_unlock(); + return ret; +} + +static ssize_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr( + struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, + char *buf, + ssize_t (*print)(const struct ib_gid_attr *gid_attr, char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + const struct ib_gid_attr *gid_attr; + ssize_t ret; + + gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); + if (IS_ERR(gid_attr)) + /* -EINVAL is returned for user space compatibility reasons. 
*/ + return -EINVAL; + + if (rdma_check_gid_user_access(gid_attr)) + ret = print(gid_attr, buf); + else + ret = -EINVAL; + + rdma_put_gid_attr(gid_attr); + return ret; +} + +static ssize_t show_port_gid(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + const struct ib_gid_attr *gid_attr; + int len; + + gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); + if (IS_ERR(gid_attr)) { + const union ib_gid zgid = {}; + + /* If reading GID fails, it is likely due to GID entry being + * empty (invalid) or reserved GID in the table. User space + * expects to read GID table entries as long as it given index + * is within GID table size. Administrative/debugging tool + * fails to query rest of the GID entries if it hits error + * while querying a GID of the given index. To avoid user + * space throwing such error on fail to read gid, return zero + * GID as before. This maintains backward compatibility. + */ + return sysfs_emit(buf, "%pI6\n", zgid.raw); + } + + if (rdma_check_gid_user_access(gid_attr)) + len = sysfs_emit(buf, "%pI6\n", gid_attr->gid.raw); + else + len = sysfs_emit(buf, "%pI6\n", zgid.raw); + + rdma_put_gid_attr(gid_attr); + return len; +} + +static ssize_t show_port_gid_attr_ndev(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(ibdev, port_num, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_device *ibdev, + u32 port_num, + struct ib_port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(ibdev, port_num, attr, buf, print_gid_type); +} + +static ssize_t show_port_pkey(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + u16 pkey; + int ret; + + ret = ib_query_pkey(ibdev, port_num, tab_attr->index, &pkey); + if (ret) + return ret; + + return sysfs_emit(buf, "0x%04x\n", pkey); +} + +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS, \ +} + +#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT, \ +} + +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved. 
+ */ +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, + void *data, int offset, size_t size) +{ + struct ib_mad *in_mad; + struct ib_mad *out_mad; + size_t mad_size = sizeof(*out_mad); + u16 out_mad_pkey_index = 0; + ssize_t ret; + + if (!dev->ops.process_mad) + return -ENOSYS; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kzalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) { + ret = -ENOMEM; + goto out; + } + + in_mad->mad_hdr.base_version = 1; + in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; + in_mad->mad_hdr.class_version = 1; + in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; + in_mad->mad_hdr.attr_id = attr; + + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ + + if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY, port_num, NULL, NULL, + in_mad, out_mad, &mad_size, + &out_mad_pkey_index) & + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { + ret = -EINVAL; + goto out; + } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *attr, char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + int ret; + u8 data[8]; + int len; + + ret = get_perf_mad(ibdev, port_num, tab_attr->attr_id, &data, + 40 + offset / 8, sizeof(data)); + if (ret < 0) + return ret; + + switch (width) { + case 4: + len = sysfs_emit(buf, "%d\n", + (*data >> (4 - (offset % 8))) & 0xf); + break; + case 8: + len = sysfs_emit(buf, "%u\n", *data); + break; + case 16: + len = sysfs_emit(buf, "%u\n", be16_to_cpup((__be16 *)data)); + break; + case 32: + len = sysfs_emit(buf, "%u\n", be32_to_cpup((__be32 *)data)); + break; + case 64: + len = sysfs_emit(buf, "%llu\n", be64_to_cpup((__be64 *)data)); + break; + default: + len = 0; + break; + } + + return len; +} + +static PORT_PMA_ATTR(symbol_error , 0, 16, 32); +static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); +static PORT_PMA_ATTR(link_downed , 2, 8, 56); +static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); +static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); +static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); +static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); +static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); +static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); +static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); +static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); +static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); +static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); +static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); +static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); +static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); + +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static 
PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + +static struct attribute *pma_attrs[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_port_xmit_data.attr.attr, + &port_pma_attr_port_rcv_data.attr.attr, + &port_pma_attr_port_xmit_packets.attr.attr, + &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, + NULL +}; + +static const struct attribute_group pma_group = { + .name = "counters", + .attrs = pma_attrs +}; + +static const struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static const struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + +static void ib_port_release(struct kobject *kobj) +{ + struct ib_port *port = container_of(kobj, struct ib_port, kobj); + int i; + + for (i = 0; i != ARRAY_SIZE(port->groups); i++) + kfree(port->groups[i].attrs); + if (port->hw_stats_data) + 
rdma_free_hw_stats_struct(port->hw_stats_data->stats); + kfree(port->hw_stats_data); + kvfree(port); +} + +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *gid_attr_group = + container_of(kobj, struct gid_attr_group, kobj); + int i; + + for (i = 0; i != ARRAY_SIZE(gid_attr_group->groups); i++) + kfree(gid_attr_group->groups[i].attrs); + kfree(gid_attr_group); +} + +static struct kobj_type port_type = { + .release = ib_port_release, + .sysfs_ops = &port_sysfs_ops, + .default_groups = port_default_groups, +}; + +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + +/* + * Figure out which counter table to use depending on + * the device capabilities. + */ +static const struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + +static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + int ret; + + if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) + return 0; + ret = dev->ops.get_hw_stats(dev, stats, port_num, index); + if (ret < 0) + return ret; + if (ret == stats->num_counters) + stats->timestamp = jiffies; + + return 0; +} + +static int print_hw_stat(struct ib_device *dev, int port_num, + struct rdma_hw_stats *stats, int index, char *buf) +{ + u64 v = rdma_counter_get_hwstat_value(dev, port_num, index); + + return sysfs_emit(buf, "%llu\n", stats->value[index] + v); +} + +static ssize_t show_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, unsigned int index, + unsigned int port_num, char *buf) +{ + int ret; + + mutex_lock(&stats->lock); + ret = update_hw_stats(ibdev, stats, port_num, index); + if (ret) + goto unlock; + ret = print_hw_stat(ibdev, port_num, stats, index, buf); +unlock: + mutex_unlock(&stats->lock); + + return ret; +} + +static ssize_t show_stats_lifespan(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + char *buf) +{ + int msecs; + + mutex_lock(&stats->lock); + msecs = jiffies_to_msecs(stats->lifespan); + mutex_unlock(&stats->lock); + + return sysfs_emit(buf, "%d\n", msecs); +} + +static ssize_t set_stats_lifespan(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + unsigned int index, unsigned int port_num, + const char *buf, size_t count) +{ + int msecs; + int jiffies; + int ret; + + ret = kstrtoint(buf, 10, &msecs); + if (ret) + return ret; + if (msecs < 0 || msecs > 10000) + return -EINVAL; + jiffies = msecs_to_jiffies(msecs); + + mutex_lock(&stats->lock); + stats->lifespan = jiffies; + mutex_unlock(&stats->lock); + + return count; +} + +static struct hw_stats_device_data * +alloc_hw_stats_device(struct ib_device *ibdev) +{ + struct hw_stats_device_data *data; + struct rdma_hw_stats *stats; + + if (!ibdev->ops.alloc_hw_device_stats) + return ERR_PTR(-EOPNOTSUPP); + stats = ibdev->ops.alloc_hw_device_stats(ibdev); + if (!stats) + return ERR_PTR(-ENOMEM); + if (!stats->descs || stats->num_counters <= 0) + goto err_free_stats; + + /* + * Two extra attribue elements here, one for 
the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + data = kzalloc(struct_size(data, attrs, stats->num_counters + 1), + GFP_KERNEL); + if (!data) + goto err_free_stats; + data->group.attrs = kcalloc(stats->num_counters + 2, + sizeof(*data->group.attrs), GFP_KERNEL); + if (!data->group.attrs) + goto err_free_data; + + data->group.name = "hw_counters"; + data->stats = stats; + return data; + +err_free_data: + kfree(data); +err_free_stats: + rdma_free_hw_stats_struct(stats); + return ERR_PTR(-ENOMEM); +} + +void ib_device_release_hw_stats(struct hw_stats_device_data *data) +{ + kfree(data->group.attrs); + rdma_free_hw_stats_struct(data->stats); + kfree(data); +} + +int ib_setup_device_attrs(struct ib_device *ibdev) +{ + struct hw_stats_device_attribute *attr; + struct hw_stats_device_data *data; + bool opstat_skipped = false; + int i, ret, pos = 0; + + data = alloc_hw_stats_device(ibdev); + if (IS_ERR(data)) { + if (PTR_ERR(data) == -EOPNOTSUPP) + return 0; + return PTR_ERR(data); + } + ibdev->hw_stats_data = data; + + ret = ibdev->ops.get_hw_stats(ibdev, data->stats, 0, + data->stats->num_counters); + if (ret != data->stats->num_counters) { + if (WARN_ON(ret >= 0)) + return -EINVAL; + return ret; + } + + data->stats->timestamp = jiffies; + + for (i = 0; i < data->stats->num_counters; i++) { + if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { + opstat_skipped = true; + continue; + } + + WARN_ON(opstat_skipped); + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = data->stats->descs[i].name; + attr->attr.attr.mode = 0444; + attr->attr.show = hw_stat_device_show; + attr->show = show_hw_stats; + data->group.attrs[pos] = &attr->attr.attr; + pos++; + } + + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = "lifespan"; + attr->attr.attr.mode = 0644; + attr->attr.show = hw_stat_device_show; + attr->show = show_stats_lifespan; + attr->attr.store = hw_stat_device_store; + attr->store = set_stats_lifespan; + data->group.attrs[pos] = &attr->attr.attr; + for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++) + if (!ibdev->groups[i]) { + ibdev->groups[i] = &data->group; + return 0; + } + WARN(true, "struct ib_device->groups is too small"); + return -EINVAL; +} + +static struct hw_stats_port_data * +alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group) +{ + struct ib_device *ibdev = port->ibdev; + struct hw_stats_port_data *data; + struct rdma_hw_stats *stats; + + if (!ibdev->ops.alloc_hw_port_stats) + return ERR_PTR(-EOPNOTSUPP); + stats = ibdev->ops.alloc_hw_port_stats(port->ibdev, port->port_num); + if (!stats) + return ERR_PTR(-ENOMEM); + if (!stats->descs || stats->num_counters <= 0) + goto err_free_stats; + + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + data = kzalloc(struct_size(data, attrs, stats->num_counters + 1), + GFP_KERNEL); + if (!data) + goto err_free_stats; + group->attrs = kcalloc(stats->num_counters + 2, + sizeof(*group->attrs), GFP_KERNEL); + if (!group->attrs) + goto err_free_data; + + group->name = "hw_counters"; + data->stats = stats; + return data; + +err_free_data: + kfree(data); +err_free_stats: + rdma_free_hw_stats_struct(stats); + return ERR_PTR(-ENOMEM); +} + +static int setup_hw_port_stats(struct ib_port *port, + struct attribute_group *group) +{ + struct hw_stats_port_attribute *attr; + struct hw_stats_port_data *data; + bool opstat_skipped = false; + 
int i, ret, pos = 0; + + data = alloc_hw_stats_port(port, group); + if (IS_ERR(data)) + return PTR_ERR(data); + + ret = port->ibdev->ops.get_hw_stats(port->ibdev, data->stats, + port->port_num, + data->stats->num_counters); + if (ret != data->stats->num_counters) { + if (WARN_ON(ret >= 0)) + return -EINVAL; + return ret; + } + + data->stats->timestamp = jiffies; + + for (i = 0; i < data->stats->num_counters; i++) { + if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { + opstat_skipped = true; + continue; + } + + WARN_ON(opstat_skipped); + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = data->stats->descs[i].name; + attr->attr.attr.mode = 0444; + attr->attr.show = hw_stat_port_show; + attr->show = show_hw_stats; + group->attrs[pos] = &attr->attr.attr; + pos++; + } + + attr = &data->attrs[pos]; + sysfs_attr_init(&attr->attr.attr); + attr->attr.attr.name = "lifespan"; + attr->attr.attr.mode = 0644; + attr->attr.show = hw_stat_port_show; + attr->show = show_stats_lifespan; + attr->attr.store = hw_stat_port_store; + attr->store = set_stats_lifespan; + group->attrs[pos] = &attr->attr.attr; + + port->hw_stats_data = data; + return 0; +} + +struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, + u32 port_num) +{ + if (!ibdev->port_data || !rdma_is_port_valid(ibdev, port_num) || + !ibdev->port_data[port_num].sysfs->hw_stats_data) + return NULL; + return ibdev->port_data[port_num].sysfs->hw_stats_data->stats; +} + +static int +alloc_port_table_group(const char *name, struct attribute_group *group, + struct port_table_attribute *attrs, size_t num, + ssize_t (*show)(struct ib_device *ibdev, u32 port_num, + struct ib_port_attribute *, char *buf)) +{ + struct attribute **attr_list; + int i; + + attr_list = kcalloc(num + 1, sizeof(*attr_list), GFP_KERNEL); + if (!attr_list) + return -ENOMEM; + + for (i = 0; i < num; i++) { + struct port_table_attribute *element = &attrs[i]; + + if (snprintf(element->name, sizeof(element->name), "%d", i) >= + sizeof(element->name)) + goto err; + + sysfs_attr_init(&element->attr.attr); + element->attr.attr.name = element->name; + element->attr.attr.mode = 0444; + element->attr.show = show; + element->index = i; + + attr_list[i] = &element->attr.attr; + } + group->name = name; + group->attrs = attr_list; + return 0; +err: + kfree(attr_list); + return -EINVAL; +} + +/* + * Create the sysfs: + * ibp0s9/ports/XX/gid_attrs/{ndevs,types}/YYY + * YYY is the gid table index in decimal + */ +static int setup_gid_attrs(struct ib_port *port, + const struct ib_port_attr *attr) +{ + struct gid_attr_group *gid_attr_group; + int ret; + + gid_attr_group = kzalloc(struct_size(gid_attr_group, attrs_list, + attr->gid_tbl_len * 2), + GFP_KERNEL); + if (!gid_attr_group) + return -ENOMEM; + gid_attr_group->port = port; + kobject_init(&gid_attr_group->kobj, &gid_attr_type); + + ret = alloc_port_table_group("ndevs", &gid_attr_group->groups[0], + gid_attr_group->attrs_list, + attr->gid_tbl_len, + show_port_gid_attr_ndev); + if (ret) + goto err_put; + gid_attr_group->groups_list[0] = &gid_attr_group->groups[0]; + + ret = alloc_port_table_group( + "types", &gid_attr_group->groups[1], + gid_attr_group->attrs_list + attr->gid_tbl_len, + attr->gid_tbl_len, show_port_gid_attr_gid_type); + if (ret) + goto err_put; + gid_attr_group->groups_list[1] = &gid_attr_group->groups[1]; + + ret = kobject_add(&gid_attr_group->kobj, &port->kobj, "gid_attrs"); + if (ret) + goto err_put; + ret = sysfs_create_groups(&gid_attr_group->kobj, + 
gid_attr_group->groups_list); + if (ret) + goto err_del; + port->gid_attr_group = gid_attr_group; + return 0; + +err_del: + kobject_del(&gid_attr_group->kobj); +err_put: + kobject_put(&gid_attr_group->kobj); + return ret; +} + +static void destroy_gid_attrs(struct ib_port *port) +{ + struct gid_attr_group *gid_attr_group = port->gid_attr_group; + + if (!gid_attr_group) + return; + sysfs_remove_groups(&gid_attr_group->kobj, gid_attr_group->groups_list); + kobject_del(&gid_attr_group->kobj); + kobject_put(&gid_attr_group->kobj); +} + +/* + * Create the sysfs: + * ibp0s9/ports/XX/{gids,pkeys,counters}/YYY + */ +static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, + const struct ib_port_attr *attr) +{ + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + bool is_full_dev = &device->coredev == coredev; + const struct attribute_group **cur_group; + struct ib_port *p; + int ret; + + p = kvzalloc(struct_size(p, attrs_list, + attr->gid_tbl_len + attr->pkey_tbl_len), + GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + p->ibdev = device; + p->port_num = port_num; + kobject_init(&p->kobj, &port_type); + + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = p; + + cur_group = p->groups_list; + ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list, + attr->gid_tbl_len, show_port_gid); + if (ret) + goto err_put; + *cur_group++ = &p->groups[0]; + + if (attr->pkey_tbl_len) { + ret = alloc_port_table_group("pkeys", &p->groups[1], + p->attrs_list + attr->gid_tbl_len, + attr->pkey_tbl_len, show_port_pkey); + if (ret) + goto err_put; + *cur_group++ = &p->groups[1]; + } + + /* + * If port == 0, it means hw_counters are per device and not per + * port, so holder should be device. Therefore skip per port + * counter initialization. 
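+ * (For illustration, assuming the usual sysfs layout: device-wide
+ * counters from alloc_hw_device_stats are expected to appear under
+ * /sys/class/infiniband/<dev>/hw_counters/, per-port counters under
+ * /sys/class/infiniband/<dev>/ports/<N>/hw_counters/, and each group
+ * carries a writable "lifespan" file that caches counter reads for the
+ * given number of milliseconds, capped at 10000 by set_stats_lifespan().)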
+ */ + if (port_num && is_full_dev) { + ret = setup_hw_port_stats(p, &p->groups[2]); + if (ret && ret != -EOPNOTSUPP) + goto err_put; + if (!ret) + *cur_group++ = &p->groups[2]; + } + + if (device->ops.process_mad && is_full_dev) + *cur_group++ = get_counter_table(device, port_num); + + ret = kobject_add(&p->kobj, coredev->ports_kobj, "%d", port_num); + if (ret) + goto err_put; + ret = sysfs_create_groups(&p->kobj, p->groups_list); + if (ret) + goto err_del; + if (is_full_dev) { + ret = sysfs_create_groups(&p->kobj, device->ops.port_groups); + if (ret) + goto err_groups; + } + + list_add_tail(&p->kobj.entry, &coredev->port_list); + return p; + +err_groups: + sysfs_remove_groups(&p->kobj, p->groups_list); +err_del: + kobject_del(&p->kobj); +err_put: + if (device->port_data && is_full_dev) + device->port_data[port_num].sysfs = NULL; + kobject_put(&p->kobj); + return ERR_PTR(ret); +} + +static void destroy_port(struct ib_core_device *coredev, struct ib_port *port) +{ + bool is_full_dev = &port->ibdev->coredev == coredev; + + list_del(&port->kobj.entry); + if (is_full_dev) + sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups); + + sysfs_remove_groups(&port->kobj, port->groups_list); + kobject_del(&port->kobj); + + if (port->ibdev->port_data && + port->ibdev->port_data[port->port_num].sysfs == port) + port->ibdev->port_data[port->port_num].sysfs = NULL; + + kobject_put(&port->kobj); +} + +static const char *node_type_string(int node_type) +{ + switch (node_type) { + case RDMA_NODE_IB_CA: + return "CA"; + case RDMA_NODE_IB_SWITCH: + return "switch"; + case RDMA_NODE_IB_ROUTER: + return "router"; + case RDMA_NODE_RNIC: + return "RNIC"; + case RDMA_NODE_USNIC: + return "usNIC"; + case RDMA_NODE_USNIC_UDP: + return "usNIC UDP"; + case RDMA_NODE_UNSPECIFIED: + return "unspecified"; + } + return ""; +} + +static ssize_t node_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = rdma_device_to_ibdev(device); + + return sysfs_emit(buf, "%u: %s\n", dev->node_type, + node_type_string(dev->node_type)); +} +static DEVICE_ATTR_RO(node_type); + +static ssize_t sys_image_guid_show(struct device *device, + struct device_attribute *dev_attr, char *buf) +{ + struct ib_device *dev = rdma_device_to_ibdev(device); + __be16 *guid = (__be16 *)&dev->attrs.sys_image_guid; + + return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(guid[0]), + be16_to_cpu(guid[1]), + be16_to_cpu(guid[2]), + be16_to_cpu(guid[3])); +} +static DEVICE_ATTR_RO(sys_image_guid); + +static ssize_t node_guid_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = rdma_device_to_ibdev(device); + __be16 *node_guid = (__be16 *)&dev->node_guid; + + return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", + be16_to_cpu(node_guid[0]), + be16_to_cpu(node_guid[1]), + be16_to_cpu(node_guid[2]), + be16_to_cpu(node_guid[3])); +} +static DEVICE_ATTR_RO(node_guid); + +static ssize_t node_desc_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_device *dev = rdma_device_to_ibdev(device); + + return sysfs_emit(buf, "%.64s\n", dev->node_desc); +} + +static ssize_t node_desc_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ib_device *dev = rdma_device_to_ibdev(device); + struct ib_device_modify desc = {}; + int ret; + + if (!dev->ops.modify_device) + return -EOPNOTSUPP; + + memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX)); + ret = 
ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); + if (ret) + return ret; + + return count; +} +static DEVICE_ATTR_RW(node_desc); + +static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ib_device *dev = rdma_device_to_ibdev(device); + char version[IB_FW_VERSION_NAME_MAX] = {}; + + ib_get_device_fw_str(dev, version); + + return sysfs_emit(buf, "%s\n", version); +} +static DEVICE_ATTR_RO(fw_ver); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_node_type.attr, + &dev_attr_node_guid.attr, + &dev_attr_sys_image_guid.attr, + &dev_attr_fw_ver.attr, + &dev_attr_node_desc.attr, + NULL, +}; + +const struct attribute_group ib_dev_attr_group = { + .attrs = ib_dev_attrs, +}; + +void ib_free_port_attrs(struct ib_core_device *coredev) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &coredev->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + + destroy_gid_attrs(port); + destroy_port(coredev, port); + } + + kobject_put(coredev->ports_kobj); +} + +int ib_setup_port_attrs(struct ib_core_device *coredev) +{ + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + u32 port_num; + int ret; + + coredev->ports_kobj = kobject_create_and_add("ports", + &coredev->dev.kobj); + if (!coredev->ports_kobj) + return -ENOMEM; + + rdma_for_each_port (device, port_num) { + struct ib_port_attr attr; + struct ib_port *port; + + ret = ib_query_port(device, port_num, &attr); + if (ret) + goto err_put; + + port = setup_port(coredev, port_num, &attr); + if (IS_ERR(port)) { + ret = PTR_ERR(port); + goto err_put; + } + + ret = setup_gid_attrs(port, &attr); + if (ret) + goto err_put; + } + return 0; + +err_put: + ib_free_port_attrs(coredev); + return ret; +} + +/** + * ib_port_register_client_groups - Add an ib_client's attributes to the port + * + * @ibdev: IB device to add counters + * @port_num: valid port number + * @groups: Group list of attributes + * + * Do not use. Only for legacy sysfs compatibility. + */ +int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups) +{ + return sysfs_create_groups(&ibdev->port_data[port_num].sysfs->kobj, + groups); +} +EXPORT_SYMBOL(ib_port_register_client_groups); + +void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, + const struct attribute_group **groups) +{ + return sysfs_remove_groups(&ibdev->port_data[port_num].sysfs->kobj, + groups); +} +EXPORT_SYMBOL(ib_port_unregister_client_groups); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/trace.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/trace.c new file mode 100644 index 0000000..31e7860 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/trace.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Trace points for core RDMA functions. + * + * Author: Chuck Lever + * + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + */ + +#define CREATE_TRACE_POINTS + +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ucma.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ucma.c new file mode 100644 index 0000000..bf42650 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ucma.c @@ -0,0 +1,1896 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "core_priv.h" + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); +MODULE_LICENSE("Dual BSD/GPL"); + +static unsigned int max_backlog = 1024; + +static struct ctl_table_header *ucma_ctl_table_hdr; +static struct ctl_table ucma_ctl_table[] = { + { + .procname = "max_backlog", + .data = &max_backlog, + .maxlen = sizeof max_backlog, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +struct ucma_file { + struct mutex mut; + struct file *filp; + struct list_head ctx_list; + struct list_head event_list; + wait_queue_head_t poll_wait; +}; + +struct ucma_context { + u32 id; + struct completion comp; + refcount_t ref; + int events_reported; + atomic_t backlog; + + struct ucma_file *file; + struct rdma_cm_id *cm_id; + struct mutex mutex; + u64 uid; + + struct list_head list; + struct list_head mc_list; + struct work_struct close_work; +}; + +struct ucma_multicast { + struct ucma_context *ctx; + u32 id; + int events_reported; + + u64 uid; + u8 join_state; + struct list_head list; + struct sockaddr_storage addr; +}; + +struct ucma_event { + struct ucma_context *ctx; + struct ucma_context *conn_req_ctx; + struct ucma_multicast *mc; + struct list_head list; + struct rdma_ucm_event_resp resp; +}; + +static DEFINE_XARRAY_ALLOC(ctx_table); +static DEFINE_XARRAY_ALLOC(multicast_table); + +static const struct file_operations ucma_fops; +static int ucma_destroy_private_ctx(struct ucma_context *ctx); + +static inline struct ucma_context *_ucma_find_context(int id, + struct ucma_file *file) +{ + struct ucma_context *ctx; + + ctx = xa_load(&ctx_table, id); + if (!ctx) + ctx = ERR_PTR(-ENOENT); + else if (ctx->file != file) + ctx = ERR_PTR(-EINVAL); + return ctx; +} + +static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) +{ + struct ucma_context *ctx; + + xa_lock(&ctx_table); + ctx = _ucma_find_context(id, file); + if (!IS_ERR(ctx)) + if (!refcount_inc_not_zero(&ctx->ref)) + ctx = ERR_PTR(-ENXIO); + xa_unlock(&ctx_table); + return ctx; +} + +static void ucma_put_ctx(struct 
ucma_context *ctx) +{ + if (refcount_dec_and_test(&ctx->ref)) + complete(&ctx->comp); +} + +/* + * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the + * CM_ID is bound. + */ +static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) +{ + struct ucma_context *ctx = ucma_get_ctx(file, id); + + if (IS_ERR(ctx)) + return ctx; + if (!ctx->cm_id->device) { + ucma_put_ctx(ctx); + return ERR_PTR(-EINVAL); + } + return ctx; +} + +static void ucma_close_id(struct work_struct *work) +{ + struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); + + /* once all inflight tasks are finished, we close all underlying + * resources. The context is still alive till its explicit destryoing + * by its creator. This puts back the xarray's reference. + */ + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + /* No new events will be generated after destroying the id. */ + rdma_destroy_id(ctx->cm_id); + + /* Reading the cm_id without holding a positive ref is not allowed */ + ctx->cm_id = NULL; +} + +static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) +{ + struct ucma_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + INIT_WORK(&ctx->close_work, ucma_close_id); + init_completion(&ctx->comp); + INIT_LIST_HEAD(&ctx->mc_list); + /* So list_del() will work if we don't do ucma_finish_ctx() */ + INIT_LIST_HEAD(&ctx->list); + ctx->file = file; + mutex_init(&ctx->mutex); + + if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } + return ctx; +} + +static void ucma_set_ctx_cm_id(struct ucma_context *ctx, + struct rdma_cm_id *cm_id) +{ + refcount_set(&ctx->ref, 1); + ctx->cm_id = cm_id; +} + +static void ucma_finish_ctx(struct ucma_context *ctx) +{ + lockdep_assert_held(&ctx->file->mut); + list_add_tail(&ctx->list, &ctx->file->ctx_list); + xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); +} + +static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, + struct rdma_conn_param *src) +{ + if (src->private_data_len) + memcpy(dst->private_data, src->private_data, + src->private_data_len); + dst->private_data_len = src->private_data_len; + dst->responder_resources = src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; +} + +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, + struct rdma_ud_param *src) +{ + if (src->private_data_len) + memcpy(dst->private_data, src->private_data, + src->private_data_len); + dst->private_data_len = src->private_data_len; + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + dst->qp_num = src->qp_num; + dst->qkey = src->qkey; +} + +static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, + struct rdma_cm_event *event) +{ + struct ucma_event *uevent; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) + return NULL; + + uevent->ctx = ctx; + switch (event->event) { + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + uevent->mc = (struct ucma_multicast *) + event->param.ud.private_data; + uevent->resp.uid = uevent->mc->uid; + uevent->resp.id = uevent->mc->id; + break; + default: + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + break; + } + uevent->resp.event = event->event; + uevent->resp.status = 
event->status; + if (ctx->cm_id->qp_type == IB_QPT_UD) + ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, + &event->param.ud); + else + ucma_copy_conn_event(&uevent->resp.param.conn, + &event->param.conn); + + uevent->resp.ece.vendor_id = event->ece.vendor_id; + uevent->resp.ece.attr_mod = event->ece.attr_mod; + return uevent; +} + +static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct ucma_context *listen_ctx = cm_id->context; + struct ucma_context *ctx; + struct ucma_event *uevent; + + if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) + return -ENOMEM; + ctx = ucma_alloc_ctx(listen_ctx->file); + if (!ctx) + goto err_backlog; + ucma_set_ctx_cm_id(ctx, cm_id); + + uevent = ucma_create_uevent(listen_ctx, event); + if (!uevent) + goto err_alloc; + uevent->conn_req_ctx = ctx; + uevent->resp.id = ctx->id; + + ctx->cm_id->context = ctx; + + mutex_lock(&ctx->file->mut); + ucma_finish_ctx(ctx); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + return 0; + +err_alloc: + ucma_destroy_private_ctx(ctx); +err_backlog: + atomic_inc(&listen_ctx->backlog); + /* Returning error causes the new ID to be destroyed */ + return -ENOMEM; +} + +static int ucma_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct ucma_event *uevent; + struct ucma_context *ctx = cm_id->context; + + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) + return ucma_connect_event_handler(cm_id, event); + + /* + * We ignore events for new connections until userspace has set their + * context. This can only happen if an error occurs on a new connection + * before the user accepts it. This is okay, since the accept will just + * fail later. However, we do need to release the underlying HW + * resources in case of a device removal event. + */ + if (ctx->uid) { + uevent = ucma_create_uevent(ctx, event); + if (!uevent) + return 0; + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + } + + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { + xa_lock(&ctx_table); + if (xa_load(&ctx_table, ctx->id) == ctx) + queue_work(system_unbound_wq, &ctx->close_work); + xa_unlock(&ctx_table); + } + return 0; +} + +static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_get_event cmd; + struct ucma_event *uevent; + + /* + * Old 32 bit user space does not send the 4 byte padding in the + * reserved field. We don't care, allow it to keep working. 
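+ * In practice the minimum out_len accepted below therefore excludes both
+ * the trailing reserved padding and the newer ECE fields, so userspace
+ * built before ECE support keeps working as well; copy_to_user() is
+ * likewise clamped to min(out_len, sizeof(uevent->resp)).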
+ */ + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - + sizeof(uevent->resp.ece)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + mutex_lock(&file->mut); + while (list_empty(&file->event_list)) { + mutex_unlock(&file->mut); + + if (file->filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->poll_wait, + !list_empty(&file->event_list))) + return -ERESTARTSYS; + + mutex_lock(&file->mut); + } + + uevent = list_first_entry(&file->event_list, struct ucma_event, list); + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &uevent->resp, + min_t(size_t, out_len, sizeof(uevent->resp)))) { + mutex_unlock(&file->mut); + return -EFAULT; + } + + list_del(&uevent->list); + uevent->ctx->events_reported++; + if (uevent->mc) + uevent->mc->events_reported++; + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) + atomic_inc(&uevent->ctx->backlog); + mutex_unlock(&file->mut); + + kfree(uevent); + return 0; +} + +static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) +{ + switch (cmd->ps) { + case RDMA_PS_TCP: + *qp_type = IB_QPT_RC; + return 0; + case RDMA_PS_UDP: + case RDMA_PS_IPOIB: + *qp_type = IB_QPT_UD; + return 0; + case RDMA_PS_IB: + *qp_type = cmd->qp_type; + return 0; + default: + return -EINVAL; + } +} + +static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_create_id cmd; + struct rdma_ucm_create_id_resp resp; + struct ucma_context *ctx; + struct rdma_cm_id *cm_id; + enum ib_qp_type qp_type; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ret = ucma_get_qp_type(&cmd, &qp_type); + if (ret) + return ret; + + ctx = ucma_alloc_ctx(file); + if (!ctx) + return -ENOMEM; + + ctx->uid = cmd.uid; + cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + goto err1; + } + ucma_set_ctx_cm_id(ctx, cm_id); + + resp.id = ctx->id; + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err1; + } + + mutex_lock(&file->mut); + ucma_finish_ctx(ctx); + mutex_unlock(&file->mut); + return 0; + +err1: + ucma_destroy_private_ctx(ctx); + return ret; +} + +static void ucma_cleanup_multicast(struct ucma_context *ctx) +{ + struct ucma_multicast *mc, *tmp; + + xa_lock(&multicast_table); + list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { + list_del(&mc->list); + /* + * At this point mc->ctx->ref is 0 so the mc cannot leave the + * lock on the reader and this is enough serialization + */ + __xa_erase(&multicast_table, mc->id); + kfree(mc); + } + xa_unlock(&multicast_table); +} + +static void ucma_cleanup_mc_events(struct ucma_multicast *mc) +{ + struct ucma_event *uevent, *tmp; + + rdma_lock_handler(mc->ctx->cm_id); + mutex_lock(&mc->ctx->file->mut); + list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { + if (uevent->mc != mc) + continue; + + list_del(&uevent->list); + kfree(uevent); + } + mutex_unlock(&mc->ctx->file->mut); + rdma_unlock_handler(mc->ctx->cm_id); +} + +static int ucma_cleanup_ctx_events(struct ucma_context *ctx) +{ + int events_reported; + struct ucma_event *uevent, *tmp; + LIST_HEAD(list); + + /* Cleanup events not yet reported to the user.*/ + mutex_lock(&ctx->file->mut); + list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { + if (uevent->ctx != ctx) + continue; + + if 
(uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && + xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, + uevent->conn_req_ctx, XA_ZERO_ENTRY, + GFP_KERNEL) == uevent->conn_req_ctx) { + list_move_tail(&uevent->list, &list); + continue; + } + list_del(&uevent->list); + kfree(uevent); + } + list_del(&ctx->list); + events_reported = ctx->events_reported; + mutex_unlock(&ctx->file->mut); + + /* + * If this was a listening ID then any connections spawned from it that + * have not been delivered to userspace are cleaned up too. Must be done + * outside any locks. + */ + list_for_each_entry_safe(uevent, tmp, &list, list) { + ucma_destroy_private_ctx(uevent->conn_req_ctx); + kfree(uevent); + } + return events_reported; +} + +/* + * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie + * the ctx is not public to the user). This either because: + * - ucma_finish_ctx() hasn't been called + * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) + */ +static int ucma_destroy_private_ctx(struct ucma_context *ctx) +{ + int events_reported; + + /* + * Destroy the underlying cm_id. New work queuing is prevented now by + * the removal from the xarray. Once the work is cancled ref will either + * be 0 because the work ran to completion and consumed the ref from the + * xarray, or it will be positive because we still have the ref from the + * xarray. This can also be 0 in cases where cm_id was never set + */ + cancel_work_sync(&ctx->close_work); + if (refcount_read(&ctx->ref)) + ucma_close_id(&ctx->close_work); + + events_reported = ucma_cleanup_ctx_events(ctx); + ucma_cleanup_multicast(ctx); + + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, + GFP_KERNEL) != NULL); + mutex_destroy(&ctx->mutex); + kfree(ctx); + return events_reported; +} + +static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_destroy_id cmd; + struct rdma_ucm_destroy_id_resp resp; + struct ucma_context *ctx; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + xa_lock(&ctx_table); + ctx = _ucma_find_context(cmd.id, file); + if (!IS_ERR(ctx)) { + if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx) + ctx = ERR_PTR(-ENOENT); + } + xa_unlock(&ctx_table); + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.events_reported = ucma_destroy_private_ctx(ctx); + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (!rdma_addr_size_in6(&cmd.addr)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); + + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (cmd.reserved || !cmd.addr_size || + cmd.addr_size != rdma_addr_size_kss(&cmd.addr)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if 
(IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ip(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) || + !rdma_addr_size_in6(&cmd.dst_addr)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_addr(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_addr cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (cmd.reserved || + (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_route(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_route cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { + case 0: + dev_addr = &route->addr.dev_addr; + rdma_addr_get_dgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + fallthrough; + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + +static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + + resp->num_paths = route->num_pri_alt_paths; + switch (route->num_pri_alt_paths) { + case 0: + rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, + (union ib_gid *)&resp->ib_route[0].dgid); + rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, + (union ib_gid *)&resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(0xffff); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + fallthrough; + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + +static void 
ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + dev_addr = &route->addr.dev_addr; + rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); + rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); +} + +static ssize_t ucma_query_route(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct rdma_ucm_query_route_resp resp; + struct ucma_context *ctx; + struct sockaddr *addr; + int ret = 0; + + if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + memset(&resp, 0, sizeof resp); + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + if (!ctx->cm_id->device) + goto out; + + resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.ibdev_index = ctx->cm_id->device->index; + resp.port_num = ctx->cm_id->port_num; + + if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) + ucma_copy_iw_route(&resp, &ctx->cm_id->route); + +out: + mutex_unlock(&ctx->mutex); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, + min_t(size_t, out_len, sizeof(resp)))) + ret = -EFAULT; + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_query_device_addr(struct rdma_cm_id *cm_id, + struct rdma_ucm_query_addr_resp *resp) +{ + if (!cm_id->device) + return; + + resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->ibdev_index = cm_id->device->index; + resp->port_num = cm_id->port_num; + resp->pkey = (__force __u16) cpu_to_be16( + ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); +} + +static ssize_t ucma_query_addr(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr *addr; + int ret = 0; + + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + resp.src_size = rdma_addr_size(addr); + memcpy(&resp.src_addr, addr, resp.src_size); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + resp.dst_size = rdma_addr_size(addr); + memcpy(&resp.dst_addr, addr, resp.dst_size); + + ucma_query_device_addr(ctx->cm_id, &resp); + + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query_path(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_path_resp *resp; + int i, ret = 0; + + if (out_len < sizeof(*resp)) + return -ENOSPC; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_paths = ctx->cm_id->route.num_pri_alt_paths; + for (i = 0, out_len -= sizeof(*resp); + i < resp->num_paths && 
out_len > sizeof(struct ib_path_rec_data); + i++, out_len -= sizeof(struct ib_path_rec_data)) { + struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i]; + + resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL; + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { + struct sa_path_rec ib; + + sa_convert_path_opa_to_ib(&ib, rec); + ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); + + } else { + ib_sa_pack_path(rec, &resp->path_data[i].path_rec); + } + } + + if (copy_to_user(response, resp, struct_size(resp, path_data, i))) + ret = -EFAULT; + + kfree(resp); + return ret; +} + +static ssize_t ucma_query_gid(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr_ib *addr; + int ret = 0; + + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + ucma_query_device_addr(ctx->cm_id, &resp); + + addr = (struct sockaddr_ib *) &resp.src_addr; + resp.src_size = sizeof(*addr); + if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, + NULL); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.src_addr); + } + + addr = (struct sockaddr_ib *) &resp.dst_addr; + resp.dst_size = sizeof(*addr); + if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_read_gids(ctx->cm_id, NULL, + (union ib_gid *)&addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.dst_addr); + } + + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct ucma_context *ctx; + void __user *response; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + response = u64_to_user_ptr(cmd.response); + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + switch (cmd.option) { + case RDMA_USER_CM_QUERY_ADDR: + ret = ucma_query_addr(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_PATH: + ret = ucma_query_path(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_GID: + ret = ucma_query_gid(ctx, response, out_len); + break; + default: + ret = -ENOSYS; + break; + } + mutex_unlock(&ctx->mutex); + + ucma_put_ctx(ctx); + return ret; +} + +static void ucma_copy_conn_param(struct rdma_cm_id *id, + struct rdma_conn_param *dst, + struct rdma_ucm_conn_param *src) +{ + dst->private_data = src->private_data; + dst->private_data_len = src->private_data_len; + dst->responder_resources = src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num & 0xFFFFFF; + dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? 
src->qkey : 0; +} + +static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; + struct rdma_ucm_connect cmd; + struct ucma_context *ctx; + size_t in_size; + int ret; + + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) + return -EFAULT; + + if (!cmd.conn_param.valid) + return -EINVAL; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + + mutex_lock(&ctx->mutex); + ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_listen cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (cmd.backlog <= 0 || cmd.backlog > max_backlog) + cmd.backlog = max_backlog; + atomic_set(&ctx->backlog, cmd.backlog); + + mutex_lock(&ctx->mutex); + ret = rdma_listen(ctx->cm_id, cmd.backlog); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_accept cmd; + struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; + struct ucma_context *ctx; + size_t in_size; + int ret; + + if (in_len < offsetofend(typeof(cmd), reserved)) + return -EINVAL; + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) + return -EFAULT; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + + if (cmd.conn_param.valid) { + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); + if (!ret) { + /* The uid must be set atomically with the handler */ + ctx->uid = cmd.uid; + } + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + rdma_lock_handler(ctx->cm_id); + ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); + rdma_unlock_handler(ctx->cm_id); + mutex_unlock(&ctx->mutex); + } + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_reject cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (!cmd.reason) + cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; + + switch (cmd.reason) { + case IB_CM_REJ_CONSUMER_DEFINED: + case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: + break; + default: + return -EINVAL; + } + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, + cmd.reason); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t 
ucma_disconnect(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_disconnect cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_disconnect(ctx->cm_id); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_init_qp_attr(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_init_qp_attr cmd; + struct ib_uverbs_qp_attr resp; + struct ucma_context *ctx; + struct ib_qp_attr qp_attr; + int ret; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (cmd.qp_state > IB_QPS_ERR) + return -EINVAL; + + ctx = ucma_get_ctx_dev(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + resp.qp_attr_mask = 0; + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_state = cmd.qp_state; + mutex_lock(&ctx->mutex); + ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); + mutex_unlock(&ctx->mutex); + if (ret) + goto out; + + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; + +out: + ucma_put_ctx(ctx); + return ret; +} + +static int ucma_set_option_id(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret = 0; + + switch (optname) { + case RDMA_OPTION_ID_TOS: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); + break; + case RDMA_OPTION_ID_REUSEADDR: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); + break; + case RDMA_OPTION_ID_AFONLY: + if (optlen != sizeof(int)) { + ret = -EINVAL; + break; + } + ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 
1 : 0); + break; + case RDMA_OPTION_ID_ACK_TIMEOUT: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static int ucma_set_ib_path(struct ucma_context *ctx, + struct ib_path_rec_data *path_data, size_t optlen) +{ + struct sa_path_rec sa_path; + struct rdma_cm_event event; + int ret; + + if (optlen % sizeof(*path_data)) + return -EINVAL; + + for (; optlen; optlen -= sizeof(*path_data), path_data++) { + if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL)) + break; + } + + if (!optlen) + return -EINVAL; + + if (!ctx->cm_id->device) + return -EINVAL; + + memset(&sa_path, 0, sizeof(sa_path)); + + sa_path.rec_type = SA_PATH_REC_TYPE_IB; + ib_sa_unpack_path(path_data->path_rec, &sa_path); + + if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) { + struct sa_path_rec opa; + + sa_convert_path_ib_to_opa(&opa, &sa_path); + mutex_lock(&ctx->mutex); + ret = rdma_set_ib_path(ctx->cm_id, &opa); + mutex_unlock(&ctx->mutex); + } else { + mutex_lock(&ctx->mutex); + ret = rdma_set_ib_path(ctx->cm_id, &sa_path); + mutex_unlock(&ctx->mutex); + } + if (ret) + return ret; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + return ucma_event_handler(ctx->cm_id, &event); +} + +static int ucma_set_option_ib(struct ucma_context *ctx, int optname, + void *optval, size_t optlen) +{ + int ret; + + switch (optname) { + case RDMA_OPTION_IB_PATH: + ret = ucma_set_ib_path(ctx, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static int ucma_set_option_level(struct ucma_context *ctx, int level, + int optname, void *optval, size_t optlen) +{ + int ret; + + switch (level) { + case RDMA_OPTION_ID: + mutex_lock(&ctx->mutex); + ret = ucma_set_option_id(ctx, optname, optval, optlen); + mutex_unlock(&ctx->mutex); + break; + case RDMA_OPTION_IB: + ret = ucma_set_option_ib(ctx, optname, optval, optlen); + break; + default: + ret = -ENOSYS; + } + + return ret; +} + +static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_set_option cmd; + struct ucma_context *ctx; + void *optval; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + optval = memdup_user(u64_to_user_ptr(cmd.optval), + cmd.optlen); + if (IS_ERR(optval)) { + ret = PTR_ERR(optval); + goto out; + } + + ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, + cmd.optlen); + kfree(optval); + +out: + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_notify cmd; + struct ucma_context *ctx; + int ret = -EINVAL; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + if (ctx->cm_id->device) + ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); + mutex_unlock(&ctx->mutex); + + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_process_join(struct ucma_file *file, + struct rdma_ucm_join_mcast *cmd, int out_len) +{ + struct rdma_ucm_create_id_resp resp; + struct ucma_context *ctx; + struct ucma_multicast *mc; + struct sockaddr *addr; + int ret; 
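+ /*
+ * Note: the multicast id is reserved in multicast_table with a NULL
+ * entry first and only published via xa_store() after the response has
+ * been copied to userspace, so a racing LEAVE_MCAST lookup cannot see a
+ * half-initialized entry.
+ */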
+ u8 join_state; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + addr = (struct sockaddr *) &cmd->addr; + if (cmd->addr_size != rdma_addr_size(addr)) + return -EINVAL; + + if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) + join_state = BIT(FULLMEMBER_JOIN); + else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) + join_state = BIT(SENDONLY_FULLMEMBER_JOIN); + else + return -EINVAL; + + ctx = ucma_get_ctx_dev(file, cmd->id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mc = kzalloc(sizeof(*mc), GFP_KERNEL); + if (!mc) { + ret = -ENOMEM; + goto err_put_ctx; + } + + mc->ctx = ctx; + mc->join_state = join_state; + mc->uid = cmd->uid; + memcpy(&mc->addr, addr, cmd->addr_size); + + xa_lock(&multicast_table); + if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, + GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_mc; + } + + list_add_tail(&mc->list, &ctx->mc_list); + xa_unlock(&multicast_table); + + mutex_lock(&ctx->mutex); + ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, + join_state, mc); + mutex_unlock(&ctx->mutex); + if (ret) + goto err_xa_erase; + + resp.id = mc->id; + if (copy_to_user(u64_to_user_ptr(cmd->response), + &resp, sizeof(resp))) { + ret = -EFAULT; + goto err_leave_multicast; + } + + xa_store(&multicast_table, mc->id, mc, 0); + + ucma_put_ctx(ctx); + return 0; + +err_leave_multicast: + mutex_lock(&ctx->mutex); + rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_unlock(&ctx->mutex); + ucma_cleanup_mc_events(mc); +err_xa_erase: + xa_lock(&multicast_table); + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); +err_free_mc: + xa_unlock(&multicast_table); + kfree(mc); +err_put_ctx: + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_join_ip_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_ip_mcast cmd; + struct rdma_ucm_join_mcast join_cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + join_cmd.response = cmd.response; + join_cmd.uid = cmd.uid; + join_cmd.id = cmd.id; + join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr); + if (!join_cmd.addr_size) + return -EINVAL; + + join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; + memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); + + return ucma_process_join(file, &join_cmd, out_len); +} + +static ssize_t ucma_join_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_mcast cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if (!rdma_addr_size_kss(&cmd.addr)) + return -EINVAL; + + return ucma_process_join(file, &cmd, out_len); +} + +static ssize_t ucma_leave_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_destroy_id cmd; + struct rdma_ucm_destroy_id_resp resp; + struct ucma_multicast *mc; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + xa_lock(&multicast_table); + mc = xa_load(&multicast_table, cmd.id); + if (!mc) + mc = ERR_PTR(-ENOENT); + else if (READ_ONCE(mc->ctx->file) != file) + mc = ERR_PTR(-EINVAL); + else if (!refcount_inc_not_zero(&mc->ctx->ref)) + mc = ERR_PTR(-ENXIO); + + if (IS_ERR(mc)) { + xa_unlock(&multicast_table); + ret = PTR_ERR(mc); + goto out; + } + + list_del(&mc->list); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); + + mutex_lock(&mc->ctx->mutex); + 
rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); + mutex_unlock(&mc->ctx->mutex); + + ucma_cleanup_mc_events(mc); + + ucma_put_ctx(mc->ctx); + resp.events_reported = mc->events_reported; + kfree(mc); + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; +out: + return ret; +} + +static ssize_t ucma_migrate_id(struct ucma_file *new_file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_migrate_id cmd; + struct rdma_ucm_migrate_resp resp; + struct ucma_event *uevent, *tmp; + struct ucma_context *ctx; + LIST_HEAD(event_list); + struct fd f; + struct ucma_file *cur_file; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + /* Get current fd to protect against it being closed */ + f = fdget(cmd.fd); + if (!f.file) + return -ENOENT; + if (f.file->f_op != &ucma_fops) { + ret = -EINVAL; + goto file_put; + } + cur_file = f.file->private_data; + + /* Validate current fd and prevent destruction of id. */ + ctx = ucma_get_ctx(cur_file, cmd.id); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); + goto file_put; + } + + rdma_lock_handler(ctx->cm_id); + /* + * ctx->file can only be changed under the handler & xa_lock. xa_load() + * must be checked again to ensure the ctx hasn't begun destruction + * since the ucma_get_ctx(). + */ + xa_lock(&ctx_table); + if (_ucma_find_context(cmd.id, cur_file) != ctx) { + xa_unlock(&ctx_table); + ret = -ENOENT; + goto err_unlock; + } + ctx->file = new_file; + xa_unlock(&ctx_table); + + mutex_lock(&cur_file->mut); + list_del(&ctx->list); + /* + * At this point lock_handler() prevents addition of new uevents for + * this ctx. + */ + list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) + if (uevent->ctx == ctx) + list_move_tail(&uevent->list, &event_list); + resp.events_reported = ctx->events_reported; + mutex_unlock(&cur_file->mut); + + mutex_lock(&new_file->mut); + list_add_tail(&ctx->list, &new_file->ctx_list); + list_splice_tail(&event_list, &new_file->event_list); + mutex_unlock(&new_file->mut); + + if (copy_to_user(u64_to_user_ptr(cmd.response), + &resp, sizeof(resp))) + ret = -EFAULT; + +err_unlock: + rdma_unlock_handler(ctx->cm_id); + ucma_put_ctx(ctx); +file_put: + fdput(f); + return ret; +} + +static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) = { + [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, + [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, + [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, + [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, + [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, + [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, + [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, + [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, + [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, + [RDMA_USER_CM_CMD_REJECT] = ucma_reject, + [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, + [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, + [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, + [RDMA_USER_CM_CMD_GET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, + [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, + [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, + [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, + [RDMA_USER_CM_CMD_QUERY] = ucma_query, + [RDMA_USER_CM_CMD_BIND] = ucma_bind, + [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast +}; + +static ssize_t 
ucma_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct ucma_file *file = filp->private_data; + struct rdma_ucm_cmd_hdr hdr; + ssize_t ret; + + if (!ib_safe_file_access(filp)) { + pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + __func__, task_tgid_vnr(current), current->comm); + return -EACCES; + } + + if (len < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) + return -EINVAL; + hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table)); + + if (hdr.in + sizeof(hdr) > len) + return -EINVAL; + + if (!ucma_cmd_table[hdr.cmd]) + return -ENOSYS; + + ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); + if (!ret) + ret = len; + + return ret; +} + +static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct ucma_file *file = filp->private_data; + __poll_t mask = 0; + + poll_wait(filp, &file->poll_wait, wait); + + if (!list_empty(&file->event_list)) + mask = EPOLLIN | EPOLLRDNORM; + + return mask; +} + +/* + * ucma_open() does not need the BKL: + * + * - no global state is referred to; + * - there is no ioctl method to race against; + * - no further module initialization is required for open to work + * after the device is registered. + */ +static int ucma_open(struct inode *inode, struct file *filp) +{ + struct ucma_file *file; + + file = kmalloc(sizeof *file, GFP_KERNEL); + if (!file) + return -ENOMEM; + + INIT_LIST_HEAD(&file->event_list); + INIT_LIST_HEAD(&file->ctx_list); + init_waitqueue_head(&file->poll_wait); + mutex_init(&file->mut); + + filp->private_data = file; + file->filp = filp; + + return stream_open(inode, filp); +} + +static int ucma_close(struct inode *inode, struct file *filp) +{ + struct ucma_file *file = filp->private_data; + + /* + * All paths that touch ctx_list or ctx_list starting from write() are + * prevented by this being a FD release function. The list_add_tail() in + * ucma_connect_event_handler() can run concurrently, however it only + * adds to the list *after* a listening ID. By only reading the first of + * the list, and relying on ucma_destroy_private_ctx() to block + * ucma_connect_event_handler(), no additional locking is needed. 
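+ * Each context is first hidden from the xarray below by swapping it for
+ * XA_ZERO_ENTRY (the same scheme ucma_destroy_id() uses) before
+ * ucma_destroy_private_ctx() tears it down.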
+ */ + while (!list_empty(&file->ctx_list)) { + struct ucma_context *ctx = list_first_entry( + &file->ctx_list, struct ucma_context, list); + + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx); + ucma_destroy_private_ctx(ctx); + } + kfree(file); + return 0; +} + +static const struct file_operations ucma_fops = { + .owner = THIS_MODULE, + .open = ucma_open, + .release = ucma_close, + .write = ucma_write, + .poll = ucma_poll, + .llseek = no_llseek, +}; + +static struct miscdevice ucma_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", + .nodename = "infiniband/rdma_cm", + .mode = 0666, + .fops = &ucma_fops, +}; + +static int ucma_get_global_nl_info(struct ib_client_nl_info *res) +{ + res->abi = RDMA_USER_CM_ABI_VERSION; + res->cdev = ucma_misc.this_device; + return 0; +} + +static struct ib_client rdma_cma_client = { + .name = "rdma_cm", + .get_global_nl_info = ucma_get_global_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); + +static ssize_t abi_version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); +} +static DEVICE_ATTR_RO(abi_version); + +static int __init ucma_init(void) +{ + int ret; + + ret = misc_register(&ucma_misc); + if (ret) + return ret; + + ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); + if (ret) { + pr_err("rdma_ucm: couldn't create abi_version attr\n"); + goto err1; + } + + ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); + if (!ucma_ctl_table_hdr) { + pr_err("rdma_ucm: couldn't register sysctl paths\n"); + ret = -ENOMEM; + goto err2; + } + + ret = ib_register_client(&rdma_cma_client); + if (ret) + goto err3; + + return 0; +err3: + unregister_net_sysctl_table(ucma_ctl_table_hdr); +err2: + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); +err1: + misc_deregister(&ucma_misc); + return ret; +} + +static void __exit ucma_cleanup(void) +{ + ib_unregister_client(&rdma_cma_client); + unregister_net_sysctl_table(ucma_ctl_table_hdr); + device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); + misc_deregister(&ucma_misc); +} + +module_init(ucma_init); +module_exit(ucma_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ud_header.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ud_header.c new file mode 100644 index 0000000..64d9c49 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/ud_header.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include + +#define STRUCT_FIELD(header, field) \ + .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ + .struct_size_bytes = sizeof_field(struct ib_unpacked_ ## header, field), \ + .field_name = #header ":" #field + +static const struct ib_field lrh_table[] = { + { STRUCT_FIELD(lrh, virtual_lane), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(lrh, link_version), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(lrh, service_level), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 4 }, + { RESERVED, + .offset_words = 0, + .offset_bits = 12, + .size_bits = 2 }, + { STRUCT_FIELD(lrh, link_next_header), + .offset_words = 0, + .offset_bits = 14, + .size_bits = 2 }, + { STRUCT_FIELD(lrh, destination_lid), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 5 }, + { STRUCT_FIELD(lrh, packet_length), + .offset_words = 1, + .offset_bits = 5, + .size_bits = 11 }, + { STRUCT_FIELD(lrh, source_lid), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field eth_table[] = { + { STRUCT_FIELD(eth, dmac_h), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, dmac_l), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_h), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(eth, smac_l), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(eth, type), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 16 } +}; + +static const struct ib_field vlan_table[] = { + { STRUCT_FIELD(vlan, tag), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(vlan, type), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = 
{ + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + +static const struct ib_field grh_table[] = { + { STRUCT_FIELD(grh, ip_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(grh, traffic_class), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 8 }, + { STRUCT_FIELD(grh, flow_label), + .offset_words = 0, + .offset_bits = 12, + .size_bits = 20 }, + { STRUCT_FIELD(grh, payload_length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(grh, next_header), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 8 }, + { STRUCT_FIELD(grh, hop_limit), + .offset_words = 1, + .offset_bits = 24, + .size_bits = 8 }, + { STRUCT_FIELD(grh, source_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { STRUCT_FIELD(grh, destination_gid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 } +}; + +static const struct ib_field bth_table[] = { + { STRUCT_FIELD(bth, opcode), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(bth, solicited_event), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 1 }, + { STRUCT_FIELD(bth, mig_req), + .offset_words = 0, + .offset_bits = 9, + .size_bits = 1 }, + { STRUCT_FIELD(bth, pad_count), + .offset_words = 0, + .offset_bits = 10, + .size_bits = 2 }, + { STRUCT_FIELD(bth, transport_header_version), + .offset_words = 0, + .offset_bits = 12, + .size_bits = 4 }, + { STRUCT_FIELD(bth, pkey), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(bth, destination_qpn), + .offset_words = 1, + .offset_bits = 8, + .size_bits = 24 }, + { STRUCT_FIELD(bth, ack_req), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 2, + .offset_bits = 1, + .size_bits = 7 }, + { STRUCT_FIELD(bth, psn), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 24 } +}; + +static const struct ib_field deth_table[] = { + { STRUCT_FIELD(deth, qkey), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 32 }, + { RESERVED, + .offset_words = 1, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(deth, source_qpn), + .offset_words = 1, + .offset_bits = 8, + .size_bits = 24 } +}; + +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header) +{ + struct iphdr iph; + + iph.ihl = 5; + iph.version = 4; + iph.tos = header->ip4.tos; + iph.tot_len = header->ip4.tot_len; + iph.id = header->ip4.id; + iph.frag_off = header->ip4.frag_off; + iph.ttl = header->ip4.ttl; + iph.protocol = header->ip4.protocol; + iph.check = 0; + iph.saddr = header->ip4.saddr; + iph.daddr = header->ip4.daddr; + + return ip_fast_csum((u8 *)&iph, iph.ihl); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + +/** + * ib_ud_header_init - Initialize UD header structure + * @payload_bytes:Length of packet payload + * @lrh_present: specify if LRH is present + * @eth_present: specify if Eth header is present + * @vlan_present: packet is tagged vlan + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present :if non-zero, UDP header will be included + * @immediate_present: specify if 
immediate data is present + * @header:Structure to initialize + */ +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) +{ + size_t udp_bytes = udp_present ? IB_UDP_BYTES : 0; + + grh_present = grh_present && !ip_version; + memset(header, 0, sizeof *header); + + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + + if (lrh_present) { + u16 packet_length; + + header->lrh.link_version = 0; + header->lrh.link_next_header = + grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; + packet_length = (IB_LRH_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + (grh_present ? IB_GRH_BYTES : 0) + + payload_bytes + + 4 + /* ICRC */ + 3) / 4; /* round up */ + header->lrh.packet_length = cpu_to_be16(packet_length); + } + + if (vlan_present) + header->eth.type = cpu_to_be16(ETH_P_8021Q); + + if (ip_version == 6 || grh_present) { + header->grh.ip_version = 6; + header->grh.payload_length = + cpu_to_be16((udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4 + /* ICRC */ + 3) & ~3); /* round up */ + header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b; + } + + if (ip_version == 4) { + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; + } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + + if (immediate_present) + header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + else + header->bth.opcode = IB_OPCODE_UD_SEND_ONLY; + header->bth.pad_count = (4 - payload_bytes) & 3; + header->bth.transport_header_version = 0; + + header->lrh_present = lrh_present; + header->eth_present = eth_present; + header->vlan_present = vlan_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; + header->immediate_present = immediate_present; + return 0; +} +EXPORT_SYMBOL(ib_ud_header_init); + +/** + * ib_ud_header_pack - Pack UD header struct into wire format + * @header:UD header struct + * @buf:Buffer to pack into + * + * ib_ud_header_pack() packs the UD header structure @header into wire + * format in the buffer @buf. 
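+ * + * A minimal usage sketch (hypothetical caller; payload_len, dlid and + * wqe_header_buf are illustrative names, not part of this file, and error + * checking is omitted). A driver would normally fill in the address fields + * (LIDs, QPN, Q_Key, ...) between init and pack: + * + *	struct ib_ud_header hdr; + *	int wire_len; + * + *	ib_ud_header_init(payload_len, 1, 0, 0, 1, 0, 0, 0, &hdr); + *	hdr.lrh.destination_lid = cpu_to_be16(dlid); + *	wire_len = ib_ud_header_pack(&hdr, wqe_header_buf);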
+ */ +int ib_ud_header_pack(struct ib_ud_header *header, + void *buf) +{ + int len = 0; + + if (header->lrh_present) { + ib_pack(lrh_table, ARRAY_SIZE(lrh_table), + &header->lrh, buf + len); + len += IB_LRH_BYTES; + } + if (header->eth_present) { + ib_pack(eth_table, ARRAY_SIZE(eth_table), + &header->eth, buf + len); + len += IB_ETH_BYTES; + } + if (header->vlan_present) { + ib_pack(vlan_table, ARRAY_SIZE(vlan_table), + &header->vlan, buf + len); + len += IB_VLAN_BYTES; + } + if (header->grh_present) { + ib_pack(grh_table, ARRAY_SIZE(grh_table), + &header->grh, buf + len); + len += IB_GRH_BYTES; + } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, buf + len); + len += IB_UDP_BYTES; + } + + ib_pack(bth_table, ARRAY_SIZE(bth_table), + &header->bth, buf + len); + len += IB_BTH_BYTES; + + ib_pack(deth_table, ARRAY_SIZE(deth_table), + &header->deth, buf + len); + len += IB_DETH_BYTES; + + if (header->immediate_present) { + memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data); + len += sizeof header->immediate_data; + } + + return len; +} +EXPORT_SYMBOL(ib_ud_header_pack); + +/** + * ib_ud_header_unpack - Unpack UD header struct from wire format + * @header:UD header struct + * @buf:Buffer to unpack from + * + * ib_ud_header_unpack() unpacks the UD header structure @header from wire + * format in the buffer @buf. + */ +int ib_ud_header_unpack(void *buf, + struct ib_ud_header *header) +{ + ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), + buf, &header->lrh); + buf += IB_LRH_BYTES; + + if (header->lrh.link_version != 0) { + pr_warn("Invalid LRH.link_version %u\n", + header->lrh.link_version); + return -EINVAL; + } + + switch (header->lrh.link_next_header) { + case IB_LNH_IBA_LOCAL: + header->grh_present = 0; + break; + + case IB_LNH_IBA_GLOBAL: + header->grh_present = 1; + ib_unpack(grh_table, ARRAY_SIZE(grh_table), + buf, &header->grh); + buf += IB_GRH_BYTES; + + if (header->grh.ip_version != 6) { + pr_warn("Invalid GRH.ip_version %u\n", + header->grh.ip_version); + return -EINVAL; + } + if (header->grh.next_header != 0x1b) { + pr_warn("Invalid GRH.next_header 0x%02x\n", + header->grh.next_header); + return -EINVAL; + } + break; + + default: + pr_warn("Invalid LRH.link_next_header %u\n", + header->lrh.link_next_header); + return -EINVAL; + } + + ib_unpack(bth_table, ARRAY_SIZE(bth_table), + buf, &header->bth); + buf += IB_BTH_BYTES; + + switch (header->bth.opcode) { + case IB_OPCODE_UD_SEND_ONLY: + header->immediate_present = 0; + break; + case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE: + header->immediate_present = 1; + break; + default: + pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); + return -EINVAL; + } + + if (header->bth.transport_header_version != 0) { + pr_warn("Invalid BTH.transport_header_version %u\n", + header->bth.transport_header_version); + return -EINVAL; + } + + ib_unpack(deth_table, ARRAY_SIZE(deth_table), + buf, &header->deth); + buf += IB_DETH_BYTES; + + if (header->immediate_present) + memcpy(&header->immediate_data, buf, sizeof header->immediate_data); + + return 0; +} +EXPORT_SYMBOL(ib_ud_header_unpack); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem.c new file mode 100644 index 0000000..ff9be18 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 
2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "uverbs.h" + +#include "ib_peer_mem.h" + +static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) +{ + bool make_dirty = umem->writable && dirty; + struct scatterlist *sg; + unsigned int i; + + if (dirty) + ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, + DMA_BIDIRECTIONAL, 0); + + for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) + unpin_user_page_range_dirty_lock(sg_page(sg), + DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty); + + sg_free_append_table(&umem->sgt_append); +} + +/** + * ib_umem_find_best_pgsz - Find best HW page size to use for this MR + * + * @umem: umem struct + * @pgsz_bitmap: bitmap of HW supported page sizes + * @virt: IOVA + * + * This helper is intended for HW that support multiple page + * sizes but can do only a single page size in an MR. + * + * Returns 0 if the umem requires page sizes not supported by + * the driver to be mapped. Drivers always supporting PAGE_SIZE + * or smaller will never see a 0 result. + */ +unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, + unsigned long pgsz_bitmap, + unsigned long virt) +{ + struct scatterlist *sg; + unsigned long va, pgoff; + dma_addr_t mask; + int i; + + if (umem->is_odp) { + unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift); + + /* ODP must always be self consistent. */ + if (!(pgsz_bitmap & page_size)) + return 0; + return page_size; + } + + /* rdma_for_each_block() has a bug if the page size is smaller than the + * page size used to build the umem. For now prevent smaller page sizes + * from being returned. + */ + pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT); + + umem->iova = va = virt; + /* The best result is the smallest page size that results in the minimum + * number of required pages. Compute the largest page size that could + * work based on VA address bits that don't change. 
+ */ + mask = pgsz_bitmap & + GENMASK(BITS_PER_LONG - 1, + bits_per((umem->length - 1 + virt) ^ virt)); + /* offset into first SGL */ + pgoff = umem->address & ~PAGE_MASK; + + for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { + /* Walk SGL and reduce max page size if VA/PA bits differ + * for any address. + */ + mask |= (sg_dma_address(sg) + pgoff) ^ va; + va += sg_dma_len(sg) - pgoff; + /* Except for the last entry, the ending iova alignment sets + * the maximum possible page size as the low bits of the iova + * must be zero when starting the next chunk. + */ + if (i != (umem->sgt_append.sgt.nents - 1)) + mask |= va; + pgoff = 0; + } + + /* The mask accumulates 1's in each position where the VA and physical + * address differ, thus the length of trailing 0 is the largest page + * size that can pass the VA through to the physical. + */ + if (mask) + pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); + return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0; +} +EXPORT_SYMBOL(ib_umem_find_best_pgsz); + +/** + * __ib_umem_get - Pin and DMA map userspace memory. + * + * @device: IB device to connect UMEM + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @peer_mem_flags: IB_PEER_MEM_xxx flags for memory being used + */ +static struct ib_umem *__ib_umem_get(struct ib_device *device, + unsigned long addr, size_t size, int access, + unsigned long peer_mem_flags) +{ + struct ib_umem *umem; + struct page **page_list; + unsigned long lock_limit; + unsigned long new_pinned; + unsigned long cur_base; + unsigned long dma_attr = 0; + struct mm_struct *mm; + unsigned long npages; + int pinned, ret; + unsigned int gup_flags = FOLL_WRITE; + + /* + * If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) { + pr_err("%s: integer overflow, size=%zu\n", __func__, size); + return ERR_PTR(-EINVAL); + } + + if (!can_do_mlock()) { + pr_err("%s: no mlock permission\n", __func__); + return ERR_PTR(-EPERM); + } + + if (access & IB_ACCESS_ON_DEMAND) + return ERR_PTR(-EOPNOTSUPP); + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->ibdev = device; + umem->length = size; + umem->address = addr; + /* + * Drivers should call ib_umem_find_best_pgsz() to set the iova + * correctly. 
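+ * + * A hypothetical driver-side call (the pgsz_bitmap shown is illustrative, + * not part of this file): + * + *	page_size = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_64K | SZ_2M, iova); + * + * A zero return means the HW cannot map this umem and the MR registration + * should fail.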
+ */ + umem->iova = addr; + umem->writable = ib_access_writable(access); + umem->owning_mm = mm = current->mm; + mmgrab(mm); + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) { + ret = -ENOMEM; + goto umem_kfree; + } + + npages = ib_umem_num_pages(umem); + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + new_pinned = atomic64_add_return(npages, &mm->pinned_vm); + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { + atomic64_sub(npages, &mm->pinned_vm); + ret = -ENOMEM; + goto out; + } + + cur_base = addr & PAGE_MASK; + + if (!umem->writable) + gup_flags |= FOLL_FORCE; + + while (npages) { + cond_resched(); + pinned = pin_user_pages_fast(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / + sizeof(struct page *)), + gup_flags | FOLL_LONGTERM, page_list); + if (pinned < 0) { + ret = pinned; + pr_debug("%s: failed to get user pages, nr_pages=%lu, flags=%u\n", __func__, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags); + goto umem_release; + } + + cur_base += pinned * PAGE_SIZE; + npages -= pinned; + ret = sg_alloc_append_table_from_pages( + &umem->sgt_append, page_list, pinned, 0, + pinned << PAGE_SHIFT, ib_dma_max_seg_size(device), + npages, GFP_KERNEL); + if (ret) { + unpin_user_pages_dirty_lock(page_list, pinned, 0); + goto umem_release; + } + } + + if (access & IB_ACCESS_RELAXED_ORDERING) + dma_attr |= DMA_ATTR_WEAK_ORDERING; + + ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt, + DMA_BIDIRECTIONAL, dma_attr); + if (ret) { + pr_err("%s: failed to map scatterlist, npages=%lu\n", __func__, + npages); + goto umem_release; + } + goto out; + +umem_release: + __ib_umem_release(device, umem, 0); + + /* + * If the address belongs to peer memory client, then the first + * call to get_user_pages will fail. In this case, try to get + * these pages from the peers. + */ + //FIXME: this placement is horrible + if (ret < 0 && peer_mem_flags & IB_PEER_MEM_ALLOW) { + struct ib_umem *new_umem; + + new_umem = ib_peer_umem_get(umem, ret, peer_mem_flags); + if (IS_ERR(new_umem)) { + ret = PTR_ERR(new_umem); + goto vma; + } + umem = new_umem; + ret = 0; + goto out; + } +vma: + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); +out: + free_page((unsigned long) page_list); +umem_kfree: + if (ret) { + mmdrop(umem->owning_mm); + kfree(umem); + } + return ret ? 
ERR_PTR(ret) : umem; +} + +struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, + size_t size, int access) +{ + return __ib_umem_get(device, addr, size, access, 0); +} +EXPORT_SYMBOL(ib_umem_get); + +struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr, + size_t size, int access, + unsigned long peer_mem_flags) +{ + return __ib_umem_get(device, addr, size, access, + IB_PEER_MEM_ALLOW | peer_mem_flags); +} +EXPORT_SYMBOL(ib_umem_get_peer); + +/** + * ib_umem_release - release memory pinned with ib_umem_get + * @umem: umem struct to release + */ +void ib_umem_release(struct ib_umem *umem) +{ + if (!umem) + return; + if (umem->is_dmabuf) + return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem)); + if (umem->is_odp) + return ib_umem_odp_release(to_ib_umem_odp(umem)); + + if (umem->is_peer) + return ib_peer_umem_release(umem); + __ib_umem_release(umem->ibdev, umem, 1); + + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); + mmdrop(umem->owning_mm); + kfree(umem); +} +EXPORT_SYMBOL(ib_umem_release); + +/* + * Copy from the given ib_umem's pages to the given buffer. + * + * umem - the umem to copy from + * offset - offset to start copying from + * dst - destination buffer + * length - buffer length + * + * Returns 0 on success, or an error code. + */ +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) +{ + size_t end = offset + length; + int ret; + + if (offset > umem->length || length > umem->length - offset) { + pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n", + __func__, offset, umem->length, end); + return -EINVAL; + } + + ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl, + umem->sgt_append.sgt.orig_nents, dst, length, + offset + ib_umem_offset(umem)); + + if (ret < 0) + return ret; + else if (ret != length) + return -EINVAL; + else + return 0; +} +EXPORT_SYMBOL(ib_umem_copy_from); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_dmabuf.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_dmabuf.c new file mode 100644 index 0000000..f076074 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_dmabuf.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. 
+ */ + +#include +#include +#include +#include + +#include "uverbs.h" + +MODULE_IMPORT_NS(DMA_BUF); + +int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct sg_table *sgt; + struct scatterlist *sg; + struct dma_fence *fence; + unsigned long start, end, cur = 0; + unsigned int nmap = 0; + int i; + + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + if (umem_dmabuf->sgt) + goto wait_fence; + + sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) + return PTR_ERR(sgt); + + /* modify the sg list in-place to match umem address and length */ + + start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE); + end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length, + PAGE_SIZE); + for_each_sgtable_dma_sg(sgt, sg, i) { + if (start < cur + sg_dma_len(sg) && cur < end) + nmap++; + if (cur <= start && start < cur + sg_dma_len(sg)) { + unsigned long offset = start - cur; + + umem_dmabuf->first_sg = sg; + umem_dmabuf->first_sg_offset = offset; + sg_dma_address(sg) += offset; + sg_dma_len(sg) -= offset; + cur += offset; + } + if (cur < end && end <= cur + sg_dma_len(sg)) { + unsigned long trim = cur + sg_dma_len(sg) - end; + + umem_dmabuf->last_sg = sg; + umem_dmabuf->last_sg_trim = trim; + sg_dma_len(sg) -= trim; + break; + } + cur += sg_dma_len(sg); + } + + umem_dmabuf->umem.sgt_append.sgt.sgl = umem_dmabuf->first_sg; + umem_dmabuf->umem.sgt_append.sgt.nents = nmap; + umem_dmabuf->sgt = sgt; + +wait_fence: + /* + * Although the sg list is valid now, the content of the pages + * may be not up-to-date. Wait for the exporter to finish + * the migration. + */ + fence = dma_resv_excl_fence(umem_dmabuf->attach->dmabuf->resv); + if (fence) + return dma_fence_wait(fence, false); + + return 0; +} +EXPORT_SYMBOL(ib_umem_dmabuf_map_pages); + +void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) +{ + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + if (!umem_dmabuf->sgt) + return; + + /* retore the original sg list */ + if (umem_dmabuf->first_sg) { + sg_dma_address(umem_dmabuf->first_sg) -= + umem_dmabuf->first_sg_offset; + sg_dma_len(umem_dmabuf->first_sg) += + umem_dmabuf->first_sg_offset; + umem_dmabuf->first_sg = NULL; + umem_dmabuf->first_sg_offset = 0; + } + if (umem_dmabuf->last_sg) { + sg_dma_len(umem_dmabuf->last_sg) += + umem_dmabuf->last_sg_trim; + umem_dmabuf->last_sg = NULL; + umem_dmabuf->last_sg_trim = 0; + } + + dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt, + DMA_BIDIRECTIONAL); + + umem_dmabuf->sgt = NULL; +} +EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages); + +struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + struct dma_buf *dmabuf; + struct ib_umem_dmabuf *umem_dmabuf; + struct ib_umem *umem; + unsigned long end; + struct ib_umem_dmabuf *ret = ERR_PTR(-EINVAL); + + if (check_add_overflow(offset, (unsigned long)size, &end)) + return ret; + + if (unlikely(!ops || !ops->move_notify)) + return ret; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + if (dmabuf->size < end) + goto out_release_dmabuf; + + umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL); + if (!umem_dmabuf) { + ret = ERR_PTR(-ENOMEM); + goto out_release_dmabuf; + } + + umem = &umem_dmabuf->umem; + umem->ibdev = device; + umem->length = size; + umem->address = offset; + umem->writable = ib_access_writable(access); + umem->is_dmabuf = 1; + + if 
(!ib_umem_num_pages(umem)) + goto out_free_umem; + + umem_dmabuf->attach = dma_buf_dynamic_attach( + dmabuf, + device->dma_device, + ops, + umem_dmabuf); + if (IS_ERR(umem_dmabuf->attach)) { + ret = ERR_CAST(umem_dmabuf->attach); + goto out_free_umem; + } + return umem_dmabuf; + +out_free_umem: + kfree(umem_dmabuf); + +out_release_dmabuf: + dma_buf_put(dmabuf); + return ret; +} +EXPORT_SYMBOL(ib_umem_dmabuf_get); + +static void +ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach) +{ + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; + + ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev, + "Invalidate callback should not be called when memory is pinned\n"); +} + +static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { + .allow_peer2peer = true, + .move_notify = ib_umem_dmabuf_unsupported_move_notify, +}; + +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + struct ib_umem_dmabuf *umem_dmabuf; + int err; + + umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); + if (IS_ERR(umem_dmabuf)) + return umem_dmabuf; + + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + err = dma_buf_pin(umem_dmabuf->attach); + if (err) + goto err_release; + umem_dmabuf->pinned = 1; + + err = ib_umem_dmabuf_map_pages(umem_dmabuf); + if (err) + goto err_unpin; + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + + return umem_dmabuf; + +err_unpin: + dma_buf_unpin(umem_dmabuf->attach); +err_release: + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + ib_umem_release(&umem_dmabuf->umem); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); + +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + dma_resv_lock(dmabuf->resv, NULL); + ib_umem_dmabuf_unmap_pages(umem_dmabuf); + if (umem_dmabuf->pinned) + dma_buf_unpin(umem_dmabuf->attach); + dma_resv_unlock(dmabuf->resv); + + dma_buf_detach(dmabuf, umem_dmabuf->attach); + dma_buf_put(dmabuf); + kfree(umem_dmabuf); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_odp.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_odp.c new file mode 100644 index 0000000..aead24c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/umem_odp.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "uverbs.h" + +static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, + const struct mmu_interval_notifier_ops *ops) +{ + int ret; + + umem_odp->umem.is_odp = 1; + mutex_init(&umem_odp->umem_mutex); + + if (!umem_odp->is_implicit_odp) { + size_t page_size = 1UL << umem_odp->page_shift; + unsigned long start; + unsigned long end; + size_t ndmas, npfns; + + start = ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, + &end)) + return -EOVERFLOW; + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) + return -EOVERFLOW; + + ndmas = (end - start) >> umem_odp->page_shift; + if (!ndmas) + return -EINVAL; + + npfns = (end - start) >> PAGE_SHIFT; + umem_odp->pfn_list = kvcalloc( + npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); + if (!umem_odp->pfn_list) + return -ENOMEM; + + umem_odp->dma_list = kvcalloc( + ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); + if (!umem_odp->dma_list) { + ret = -ENOMEM; + goto out_pfn_list; + } + + ret = mmu_interval_notifier_insert(&umem_odp->notifier, + umem_odp->umem.owning_mm, + start, end - start, ops); + if (ret) + goto out_dma_list; + } + + return 0; + +out_dma_list: + kvfree(umem_odp->dma_list); +out_pfn_list: + kvfree(umem_odp->pfn_list); + return ret; +} + +/** + * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem + * + * Implicit ODP umems do not have a VA range and do not have any page lists. + * They exist only to hold the per_mm reference to help the driver create + * children umems. + * + * @device: IB device to create UMEM + * @access: ib_reg_mr access flags + */ +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, + int access) +{ + struct ib_umem *umem; + struct ib_umem_odp *umem_odp; + int ret; + + if (access & IB_ACCESS_HUGETLB) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + umem = &umem_odp->umem; + umem->ibdev = device; + umem->writable = ib_access_writable(access); + umem->owning_mm = current->mm; + umem_odp->is_implicit_odp = 1; + umem_odp->page_shift = PAGE_SHIFT; + + umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + ret = ib_init_umem_odp(umem_odp, NULL); + if (ret) { + put_pid(umem_odp->tgid); + kfree(umem_odp); + return ERR_PTR(ret); + } + return umem_odp; +} +EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); + +/** + * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit + * parent ODP umem + * + * @root: The parent umem enclosing the child. This must be allocated using + * ib_alloc_implicit_odp_umem() + * @addr: The starting userspace VA + * @size: The length of the userspace VA + * @ops: MMU interval ops, currently only @invalidate + */ +struct ib_umem_odp * +ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr, + size_t size, + const struct mmu_interval_notifier_ops *ops) +{ + /* + * Caller must ensure that root cannot be freed during the call to + * ib_alloc_odp_umem. 
+ */ + struct ib_umem_odp *odp_data; + struct ib_umem *umem; + int ret; + + if (WARN_ON(!root->is_implicit_odp)) + return ERR_PTR(-EINVAL); + + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) + return ERR_PTR(-ENOMEM); + umem = &odp_data->umem; + umem->ibdev = root->umem.ibdev; + umem->length = size; + umem->address = addr; + umem->writable = root->umem.writable; + umem->owning_mm = root->umem.owning_mm; + odp_data->page_shift = PAGE_SHIFT; + odp_data->notifier.ops = ops; + + /* + * A mmget must be held when registering a notifier, the owning_mm only + * has a mm_grab at this point. + */ + if (!mmget_not_zero(umem->owning_mm)) { + ret = -EFAULT; + goto out_free; + } + + odp_data->tgid = get_pid(root->tgid); + ret = ib_init_umem_odp(odp_data, ops); + if (ret) + goto out_tgid; + mmput(umem->owning_mm); + return odp_data; + +out_tgid: + put_pid(odp_data->tgid); + mmput(umem->owning_mm); +out_free: + kfree(odp_data); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_umem_odp_alloc_child); + +/** + * ib_umem_odp_get - Create a umem_odp for a userspace va + * + * @device: IB device struct to get UMEM + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @ops: MMU interval ops, currently only @invalidate + * + * The driver should use this function when the access flags indicate ODP + * memory. It avoids pinning; instead, it stores the mm for future page fault + * handling in conjunction with MMU notifiers. + */ +struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, + unsigned long addr, size_t size, int access, + const struct mmu_interval_notifier_ops *ops) +{ + struct ib_umem_odp *umem_odp; + int ret; + + if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND))) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + + umem_odp->umem.ibdev = device; + umem_odp->umem.length = size; + umem_odp->umem.address = addr; + umem_odp->umem.writable = ib_access_writable(access); + umem_odp->umem.owning_mm = current->mm; + umem_odp->notifier.ops = ops; + + umem_odp->page_shift = PAGE_SHIFT; +#ifdef CONFIG_HUGETLB_PAGE + if (access & IB_ACCESS_HUGETLB) + umem_odp->page_shift = HPAGE_SHIFT; +#endif + + umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + ret = ib_init_umem_odp(umem_odp, ops); + if (ret) + goto err_put_pid; + return umem_odp; + +err_put_pid: + put_pid(umem_odp->tgid); + kfree(umem_odp); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_umem_odp_get); + +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) +{ + /* + * Ensure that no more pages are mapped in the umem. + * + * It is the driver's responsibility to ensure, before calling us, + * that the hardware will not attempt to access the MR any more. + */ + if (!umem_odp->is_implicit_odp) { + mutex_lock(&umem_odp->umem_mutex); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + mutex_unlock(&umem_odp->umem_mutex); + mmu_interval_notifier_remove(&umem_odp->notifier); + kvfree(umem_odp->dma_list); + kvfree(umem_odp->pfn_list); + } + put_pid(umem_odp->tgid); + kfree(umem_odp); +} +EXPORT_SYMBOL(ib_umem_odp_release); + +/* + * Map for DMA and insert a single page into the on-demand paging page tables. + * + * @umem: the umem to insert the page into. + * @dma_index: index in the umem to add the dma to. + * @page: the page struct to map and add. + * @access_mask: access permissions needed for this page. 
+ * + * The function returns -EFAULT if the DMA mapping operation fails. + * + */ +static int ib_umem_odp_map_dma_single_page( + struct ib_umem_odp *umem_odp, + unsigned int dma_index, + struct page *page, + u64 access_mask) +{ + struct ib_device *dev = umem_odp->umem.ibdev; + dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; + + if (*dma_addr) { + /* + * If the page is already dma mapped it means it went through + * a non-invalidating transition, like read-only to writable. + * Resync the flags. + */ + *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; + return 0; + } + + *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(dev, *dma_addr)) { + *dma_addr = 0; + return -EFAULT; + } + umem_odp->npages++; + *dma_addr |= access_mask; + return 0; +} + +/** + * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. + * + * Maps the range passed in the argument to DMA addresses. + * The DMA addresses of the mapped pages are updated in umem_odp->dma_list. + * Upon success the ODP MR will be locked to let the caller complete its device + * page table update. + * + * Returns the number of pages mapped on success, or a negative error code + * on failure. + * @umem_odp: the umem to map and pin + * @user_virt: the address from which we need to map. + * @bcnt: the minimal number of bytes to pin and map. The mapping might be + * bigger due to alignment, and may also be smaller in case of an error + * pinning or mapping a page. The actual number of pages mapped is returned + * in the return value. + * @access_mask: bit mask of the requested access permissions for the given + * range. + * @fault: is faulting required for the given range + */ +int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, bool fault) + __acquires(&umem_odp->umem_mutex) +{ + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = umem_odp->umem.owning_mm; + int pfn_index, dma_index, ret = 0, start_idx; + unsigned int page_shift, hmm_order, pfn_start_idx; + unsigned long num_pfns, current_seq; + struct hmm_range range = {}; + unsigned long timeout; + + if (access_mask == 0) + return -EINVAL; + + if (user_virt < ib_umem_start(umem_odp) || + user_virt + bcnt > ib_umem_end(umem_odp)) + return -EFAULT; + + page_shift = umem_odp->page_shift; + + /* + * owning_process is allowed to be NULL; this means the mm somehow + * exists beyond the lifetime of the originating process. Presumably + * mmget_not_zero will fail in this case. 
+ */ + owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID); + if (!owning_process || !mmget_not_zero(owning_mm)) { + ret = -EINVAL; + goto out_put_task; + } + + range.notifier = &umem_odp->notifier; + range.start = ALIGN_DOWN(user_virt, 1UL << page_shift); + range.end = ALIGN(user_virt + bcnt, 1UL << page_shift); + pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + num_pfns = (range.end - range.start) >> PAGE_SHIFT; + if (fault) { + range.default_flags = HMM_PFN_REQ_FAULT; + + if (access_mask & ODP_WRITE_ALLOWED_BIT) + range.default_flags |= HMM_PFN_REQ_WRITE; + } + + range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); + +retry: + current_seq = range.notifier_seq = + mmu_interval_read_begin(&umem_odp->notifier); + + mmap_read_lock(owning_mm); + ret = hmm_range_fault(&range); + mmap_read_unlock(owning_mm); + if (unlikely(ret)) { + if (ret == -EBUSY && !time_after(jiffies, timeout)) + goto retry; + goto out_put_mm; + } + + start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift; + dma_index = start_idx; + + mutex_lock(&umem_odp->umem_mutex); + if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) { + mutex_unlock(&umem_odp->umem_mutex); + goto retry; + } + + for (pfn_index = 0; pfn_index < num_pfns; + pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { + + if (fault) { + /* + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. + */ + WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + } else { + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { + WARN_ON(umem_odp->dma_list[dma_index]); + continue; + } + access_mask = ODP_READ_ALLOWED_BIT; + if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) + access_mask |= ODP_WRITE_ALLOWED_BIT; + } + + hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); + /* If a hugepage was detected and ODP wasn't set for, the umem + * page_shift will be used, the opposite case is an error. 
+ */ + if (hmm_order + PAGE_SHIFT < page_shift) { + ret = -EINVAL; + ibdev_dbg(umem_odp->umem.ibdev, + "%s: unexpected hmm_order %u, page_shift %u\n", + __func__, hmm_order, page_shift); + break; + } + + ret = ib_umem_odp_map_dma_single_page( + umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), + access_mask); + if (ret < 0) { + ibdev_dbg(umem_odp->umem.ibdev, + "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); + break; + } + } + /* Upon success the lock should remain held for the caller. */ + if (!ret) + ret = dma_index - start_idx; + else + mutex_unlock(&umem_odp->umem_mutex); + +out_put_mm: + mmput(owning_mm); +out_put_task: + if (owning_process) + put_task_struct(owning_process); + return ret; +} +EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, + u64 bound) +{ + dma_addr_t dma_addr; + dma_addr_t dma; + int idx; + u64 addr; + struct ib_device *dev = umem_odp->umem.ibdev; + + lockdep_assert_held(&umem_odp->umem_mutex); + + virt = max_t(u64, virt, ib_umem_start(umem_odp)); + bound = min_t(u64, bound, ib_umem_end(umem_odp)); + for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; + dma = umem_odp->dma_list[idx]; + + /* The access flags guarantee a valid DMA address; a NULL entry was never mapped */ + if (dma) { + unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); + + dma_addr = dma & ODP_DMA_ADDR_MASK; + ib_dma_unmap_page(dev, dma_addr, + BIT(umem_odp->page_shift), + DMA_BIDIRECTIONAL); + if (dma & ODP_WRITE_ALLOWED_BIT) { + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); + } + umem_odp->dma_list[idx] = 0; + umem_odp->npages--; + } + } +} +EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/user_mad.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/user_mad.c new file mode 100644 index 0000000..ee979da --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/user_mad.c @@ -0,0 +1,1497 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2008 Cisco. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "user_mad: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "core_priv.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); +MODULE_LICENSE("Dual BSD/GPL"); + +enum { + IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, + IB_UMAD_MAX_AGENTS = 32, + + IB_UMAD_MAJOR = 231, + IB_UMAD_MINOR_BASE = 0, + IB_UMAD_NUM_FIXED_MINOR = 64, + IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR, + IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR, +}; + +/* + * Our lifetime rules for these structs are the following: + * device special file is opened, we take a reference on the + * ib_umad_port's struct ib_umad_device. We drop these + * references in the corresponding close(). + * + * In addition to references coming from open character devices, there + * is one more reference to each ib_umad_device representing the + * module's reference taken when allocating the ib_umad_device in + * ib_umad_add_one(). + * + * When destroying an ib_umad_device, we drop the module's reference. + */ + +struct ib_umad_port { + struct cdev cdev; + struct device dev; + struct cdev sm_cdev; + struct device sm_dev; + struct semaphore sm_sem; + + struct mutex file_mutex; + struct list_head file_list; + + struct ib_device *ib_dev; + struct ib_umad_device *umad_dev; + int dev_num; + u32 port_num; +}; + +struct ib_umad_device { + struct kref kref; + struct ib_umad_port ports[]; +}; + +struct ib_umad_file { + struct mutex mutex; + struct ib_umad_port *port; + struct list_head recv_list; + struct list_head send_list; + struct list_head port_list; + spinlock_t send_lock; + wait_queue_head_t recv_wait; + struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; + int agents_dead; + u8 use_pkey_index; + u8 already_used; +}; + +struct ib_umad_packet { + struct ib_mad_send_buf *msg; + struct ib_mad_recv_wc *recv_wc; + struct list_head list; + int length; + struct ib_user_mad mad; +}; + +#define CREATE_TRACE_POINTS +#include + +static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + + IB_UMAD_NUM_FIXED_MINOR; +static dev_t dynamic_umad_dev; +static dev_t dynamic_issm_dev; + +static DEFINE_IDA(umad_ida); + +static int ib_umad_add_one(struct ib_device *device); +static void ib_umad_remove_one(struct ib_device *device, void *client_data); + +static void ib_umad_dev_free(struct kref *kref) +{ + struct ib_umad_device *dev = + container_of(kref, struct ib_umad_device, kref); + + kfree(dev); +} + +static void ib_umad_dev_get(struct ib_umad_device *dev) +{ + kref_get(&dev->kref); +} + +static void ib_umad_dev_put(struct ib_umad_device *dev) +{ + kref_put(&dev->kref, ib_umad_dev_free); +} + +static int hdr_size(struct ib_umad_file *file) +{ + return file->use_pkey_index ? 
sizeof(struct ib_user_mad_hdr) : + sizeof(struct ib_user_mad_hdr_old); +} + +/* caller must hold file->mutex */ +static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) +{ + return file->agents_dead ? NULL : file->agent[id]; +} + +static int queue_packet(struct ib_umad_file *file, + struct ib_mad_agent *agent, + struct ib_umad_packet *packet) +{ + int ret = 1; + + mutex_lock(&file->mutex); + + for (packet->mad.hdr.id = 0; + packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; + packet->mad.hdr.id++) + if (agent == __get_agent(file, packet->mad.hdr.id)) { + list_add_tail(&packet->list, &file->recv_list); + wake_up_interruptible(&file->recv_wait); + ret = 0; + break; + } + + mutex_unlock(&file->mutex); + + return ret; +} + +static void dequeue_send(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + spin_lock_irq(&file->send_lock); + list_del(&packet->list); + spin_unlock_irq(&file->send_lock); +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *send_wc) +{ + struct ib_umad_file *file = agent->context; + struct ib_umad_packet *packet = send_wc->send_buf->context[0]; + + dequeue_send(file, packet); + rdma_destroy_ah(packet->msg->ah, RDMA_DESTROY_AH_SLEEPABLE); + ib_free_send_mad(packet->msg); + + if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { + packet->length = IB_MGMT_MAD_HDR; + packet->mad.hdr.status = ETIMEDOUT; + if (!queue_packet(file, agent, packet)) + return; + } + kfree(packet); +} + +static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_umad_file *file = agent->context; + struct ib_umad_packet *packet; + + if (mad_recv_wc->wc->status != IB_WC_SUCCESS) + goto err1; + + packet = kzalloc(sizeof *packet, GFP_KERNEL); + if (!packet) + goto err1; + + packet->length = mad_recv_wc->mad_len; + packet->recv_wc = mad_recv_wc; + + packet->mad.hdr.status = 0; + packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; + packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); + /* + * On OPA devices it is okay to lose the upper 16 bits of LID as this + * information is obtained elsewhere. Mask off the upper 16 bits. 
+ */ + if (rdma_cap_opa_mad(agent->device, agent->port_num)) + packet->mad.hdr.lid = ib_lid_be16(0xFFFF & + mad_recv_wc->wc->slid); + else + packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.sl = mad_recv_wc->wc->sl; + packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; + packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; + packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH); + if (packet->mad.hdr.grh_present) { + struct rdma_ah_attr ah_attr; + const struct ib_global_route *grh; + int ret; + + ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto err2; + + grh = rdma_ah_read_grh(&ah_attr); + packet->mad.hdr.gid_index = grh->sgid_index; + packet->mad.hdr.hop_limit = grh->hop_limit; + packet->mad.hdr.traffic_class = grh->traffic_class; + memcpy(packet->mad.hdr.gid, &grh->dgid, 16); + packet->mad.hdr.flow_label = cpu_to_be32(grh->flow_label); + rdma_destroy_ah_attr(&ah_attr); + } + + if (queue_packet(file, agent, packet)) + goto err2; + return; + +err2: + kfree(packet); +err1: + ib_free_recv_mad(mad_recv_wc); +} + +static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) +{ + struct ib_mad_recv_buf *recv_buf; + int left, seg_payload, offset, max_seg_payload; + size_t seg_size; + + recv_buf = &packet->recv_wc->recv_buf; + seg_size = packet->recv_wc->mad_seg_size; + + /* We need enough room to copy the first (or only) MAD segment. */ + if ((packet->length <= seg_size && + count < hdr_size(file) + packet->length) || + (packet->length > seg_size && + count < hdr_size(file) + seg_size)) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, hdr_size(file))) + return -EFAULT; + + buf += hdr_size(file); + seg_payload = min_t(int, packet->length, seg_size); + if (copy_to_user(buf, recv_buf->mad, seg_payload)) + return -EFAULT; + + if (seg_payload < packet->length) { + /* + * Multipacket RMPP MAD message. Copy remainder of message. + * Note that last segment may have a shorter payload. + */ + if (count < hdr_size(file) + packet->length) { + /* + * The buffer is too small, return the first RMPP segment, + * which includes the RMPP message length. 
+ */ + return -ENOSPC; + } + offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class); + max_seg_payload = seg_size - offset; + + for (left = packet->length - seg_payload, buf += seg_payload; + left; left -= seg_payload, buf += seg_payload) { + recv_buf = container_of(recv_buf->list.next, + struct ib_mad_recv_buf, list); + seg_payload = min(left, max_seg_payload); + if (copy_to_user(buf, ((void *) recv_buf->mad) + offset, + seg_payload)) + return -EFAULT; + } + } + + trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr); + + return hdr_size(file) + packet->length; +} + +static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, + struct ib_umad_packet *packet, size_t count) +{ + ssize_t size = hdr_size(file) + packet->length; + + if (count < size) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, hdr_size(file))) + return -EFAULT; + + buf += hdr_size(file); + + if (copy_to_user(buf, packet->mad.data, packet->length)) + return -EFAULT; + + trace_ib_umad_read_send(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); + + return size; +} + +static ssize_t ib_umad_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_packet *packet; + ssize_t ret; + + if (count < hdr_size(file)) + return -EINVAL; + + mutex_lock(&file->mutex); + + if (file->agents_dead) { + mutex_unlock(&file->mutex); + return -EIO; + } + + while (list_empty(&file->recv_list)) { + mutex_unlock(&file->mutex); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(file->recv_wait, + !list_empty(&file->recv_list))) + return -ERESTARTSYS; + + mutex_lock(&file->mutex); + } + + if (file->agents_dead) { + mutex_unlock(&file->mutex); + return -EIO; + } + + packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); + list_del(&packet->list); + + mutex_unlock(&file->mutex); + + if (packet->recv_wc) + ret = copy_recv_mad(file, buf, packet, count); + else + ret = copy_send_mad(file, buf, packet, count); + + if (ret < 0) { + /* Requeue packet */ + mutex_lock(&file->mutex); + list_add(&packet->list, &file->recv_list); + mutex_unlock(&file->mutex); + } else { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); + kfree(packet); + } + return ret; +} + +static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf) +{ + int left, seg; + + /* Copy class specific header */ + if ((msg->hdr_len > IB_MGMT_RMPP_HDR) && + copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, + msg->hdr_len - IB_MGMT_RMPP_HDR)) + return -EFAULT; + + /* All headers are in place. Copy data segments. 
*/ + for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0; + seg++, left -= msg->seg_size, buf += msg->seg_size) { + if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf, + min(left, msg->seg_size))) + return -EFAULT; + } + return 0; +} + +static int same_destination(struct ib_user_mad_hdr *hdr1, + struct ib_user_mad_hdr *hdr2) +{ + if (!hdr1->grh_present && !hdr2->grh_present) + return (hdr1->lid == hdr2->lid); + + if (hdr1->grh_present && hdr2->grh_present) + return !memcmp(hdr1->gid, hdr2->gid, 16); + + return 0; +} + +static int is_duplicate(struct ib_umad_file *file, + struct ib_umad_packet *packet) +{ + struct ib_umad_packet *sent_packet; + struct ib_mad_hdr *sent_hdr, *hdr; + + hdr = (struct ib_mad_hdr *) packet->mad.data; + list_for_each_entry(sent_packet, &file->send_list, list) { + sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data; + + if ((hdr->tid != sent_hdr->tid) || + (hdr->mgmt_class != sent_hdr->mgmt_class)) + continue; + + /* + * No need to be overly clever here. If two new operations have + * the same TID, reject the second as a duplicate. This is more + * restrictive than required by the spec. + */ + if (!ib_response_mad(hdr)) { + if (!ib_response_mad(sent_hdr)) + return 1; + continue; + } else if (!ib_response_mad(sent_hdr)) + continue; + + if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) + return 1; + } + + return 0; +} + +static ssize_t ib_umad_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_packet *packet; + struct ib_mad_agent *agent; + struct rdma_ah_attr ah_attr; + struct ib_ah *ah; + struct ib_rmpp_mad *rmpp_mad; + __be64 *tid; + int ret, data_len, hdr_len, copy_offset, rmpp_active; + u8 base_version; + + if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) + return -EINVAL; + + packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); + if (!packet) + return -ENOMEM; + + if (copy_from_user(&packet->mad, buf, hdr_size(file))) { + ret = -EFAULT; + goto err; + } + + if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { + ret = -EINVAL; + goto err; + } + + buf += hdr_size(file); + + if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) { + ret = -EFAULT; + goto err; + } + + mutex_lock(&file->mutex); + + trace_ib_umad_write(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); + + agent = __get_agent(file, packet->mad.hdr.id); + if (!agent) { + ret = -EIO; + goto err_up; + } + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.type = rdma_ah_find_type(agent->device, + file->port->port_num); + rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid)); + rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl); + rdma_ah_set_path_bits(&ah_attr, packet->mad.hdr.path_bits); + rdma_ah_set_port_num(&ah_attr, file->port->port_num); + if (packet->mad.hdr.grh_present) { + rdma_ah_set_grh(&ah_attr, NULL, + be32_to_cpu(packet->mad.hdr.flow_label), + packet->mad.hdr.gid_index, + packet->mad.hdr.hop_limit, + packet->mad.hdr.traffic_class); + rdma_ah_set_dgid_raw(&ah_attr, packet->mad.hdr.gid); + } + + ah = rdma_create_user_ah(agent->qp->pd, &ah_attr, NULL); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto err_up; + } + + rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; + hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); + + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && ib_mad_kernel_rmpp_agent(agent)) { + copy_offset = IB_MGMT_RMPP_HDR; + rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + 
IB_MGMT_RMPP_FLAG_ACTIVE; + } else { + copy_offset = IB_MGMT_MAD_HDR; + rmpp_active = 0; + } + + base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; + data_len = count - hdr_size(file) - hdr_len; + packet->msg = ib_create_send_mad(agent, + be32_to_cpu(packet->mad.hdr.qpn), + packet->mad.hdr.pkey_index, rmpp_active, + hdr_len, data_len, GFP_KERNEL, + base_version); + if (IS_ERR(packet->msg)) { + ret = PTR_ERR(packet->msg); + goto err_ah; + } + + packet->msg->ah = ah; + packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; + packet->msg->retries = packet->mad.hdr.retries; + packet->msg->context[0] = packet; + + /* Copy MAD header. Any RMPP header is already in place. */ + memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); + + if (!rmpp_active) { + if (copy_from_user(packet->msg->mad + copy_offset, + buf + copy_offset, + hdr_len + data_len - copy_offset)) { + ret = -EFAULT; + goto err_msg; + } + } else { + ret = copy_rmpp_mad(packet->msg, buf); + if (ret) + goto err_msg; + } + + /* + * Set the high-order part of the transaction ID to make MADs from + * different agents unique, and allow routing responses back to the + * original requestor. + */ + if (!ib_response_mad(packet->msg->mad)) { + tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; + *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | + (be64_to_cpup(tid) & 0xffffffff)); + rmpp_mad->mad_hdr.tid = *tid; + } + + if (!ib_mad_kernel_rmpp_agent(agent) + && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + spin_lock_irq(&file->send_lock); + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + } else { + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; + } + } + + ret = ib_post_send_mad(packet->msg, NULL); + if (ret) + goto err_send; + + mutex_unlock(&file->mutex); + return count; + +err_send: + dequeue_send(file, packet); +err_msg: + ib_free_send_mad(packet->msg); +err_ah: + rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); +err_up: + mutex_unlock(&file->mutex); +err: + kfree(packet); + return ret; +} + +static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct ib_umad_file *file = filp->private_data; + + /* we will always be able to post a MAD send */ + __poll_t mask = EPOLLOUT | EPOLLWRNORM; + + mutex_lock(&file->mutex); + poll_wait(filp, &file->recv_wait, wait); + + if (!list_empty(&file->recv_list)) + mask |= EPOLLIN | EPOLLRDNORM; + if (file->agents_dead) + mask = EPOLLERR; + mutex_unlock(&file->mutex); + + return mask; +} + +static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, + int compat_method_mask) +{ + struct ib_user_mad_reg_req ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(&file->port->dev, "%s: invalid device\n", __func__); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof ureq)) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(&file->port->dev, + "%s: invalid QPN %u specified\n", __func__, + ureq.qpn); + ret = -EINVAL; + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto 
found; + + dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__, + IB_UMAD_MAX_AGENTS); + + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + memcpy(req.oui, ureq.oui, sizeof req.oui); + + if (compat_method_mask) { + u32 *umm = (u32 *) ureq.method_mask; + int i; + + for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i) + req.method_mask[i] = + umm[i * 2] | ((u64) umm[i * 2 + 1] << 32); + } else + memcpy(req.method_mask, ureq.method_mask, + sizeof req.method_mask); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? &req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + if (!file->use_pkey_index) { + dev_warn(&file->port->dev, + "process %s did not enable P_Key index support.\n", + current->comm); + dev_warn(&file->port->dev, + " Documentation/infiniband/user_mad.rst has info on the new ABI.\n"); + } + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + +static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) +{ + struct ib_user_mad_reg_req2 ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(&file->port->dev, "%s: invalid device\n", __func__); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(&file->port->dev, "%s: invalid QPN %u specified\n", + __func__, ureq.qpn); + ret = -EINVAL; + goto out; + } + + if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { + dev_notice(&file->port->dev, + "%s failed: invalid registration flags specified 0x%x; supported 0x%x\n", + __func__, ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + ret = -EINVAL; + + if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, + (u32 __user *) (arg + offsetof(struct + ib_user_mad_reg_req2, flags)))) + ret = -EFAULT; + + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__, + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + if (ureq.oui & 0xff000000) { + dev_notice(&file->port->dev, + "%s failed: oui invalid 0x%08x\n", __func__, + ureq.oui); + ret = -EINVAL; + goto out; + } + req.oui[2] = ureq.oui & 0x0000ff; + req.oui[1] = (ureq.oui & 0x00ff00) >> 8; + req.oui[0] = (ureq.oui & 0xff0000) >> 16; + memcpy(req.method_mask, ureq.method_mask, + sizeof(req.method_mask)); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? 
&req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, + ureq.flags); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *)(arg + + offsetof(struct ib_user_mad_reg_req2, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + file->use_pkey_index = 1; + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + + +static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) +{ + struct ib_mad_agent *agent = NULL; + u32 id; + int ret = 0; + + if (get_user(id, arg)) + return -EFAULT; + if (id >= IB_UMAD_MAX_AGENTS) + return -EINVAL; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + id = array_index_nospec(id, IB_UMAD_MAX_AGENTS); + if (!__get_agent(file, id)) { + ret = -EINVAL; + goto out; + } + + agent = file->agent[id]; + file->agent[id] = NULL; + +out: + mutex_unlock(&file->mutex); + + if (agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + +static long ib_umad_enable_pkey(struct ib_umad_file *file) +{ + int ret = 0; + + mutex_lock(&file->mutex); + if (file->already_used) + ret = -EINVAL; + else + file->use_pkey_index = 1; + mutex_unlock(&file->mutex); + + return ret; +} + +static long ib_umad_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case IB_USER_MAD_REGISTER_AGENT: + return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0); + case IB_USER_MAD_UNREGISTER_AGENT: + return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case IB_USER_MAD_REGISTER_AGENT: + return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1); + case IB_USER_MAD_UNREGISTER_AGENT: + return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); + case IB_USER_MAD_ENABLE_PKEY: + return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); + default: + return -ENOIOCTLCMD; + } +} +#endif + +/* + * ib_umad_open() does not need the BKL: + * + * - the ib_umad_port structures are properly reference counted, and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - the ioctl method does not affect any global state outside of the + * file structure being operated on; + */ +static int ib_umad_open(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port; + struct ib_umad_file *file; + int ret = 0; + + port = container_of(inode->i_cdev, struct ib_umad_port, cdev); + + mutex_lock(&port->file_mutex); + + if (!port->ib_dev) { + ret = -ENXIO; + goto out; + } + + if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto out; + } + + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) { + ret = -ENOMEM; + goto out; + } + + mutex_init(&file->mutex); + spin_lock_init(&file->send_lock); + INIT_LIST_HEAD(&file->recv_list); + 
INIT_LIST_HEAD(&file->send_list); + init_waitqueue_head(&file->recv_wait); + + file->port = port; + filp->private_data = file; + + list_add_tail(&file->port_list, &port->file_list); + + stream_open(inode, filp); +out: + mutex_unlock(&port->file_mutex); + return ret; +} + +static int ib_umad_close(struct inode *inode, struct file *filp) +{ + struct ib_umad_file *file = filp->private_data; + struct ib_umad_packet *packet, *tmp; + int already_dead; + int i; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + already_dead = file->agents_dead; + file->agents_dead = 1; + + list_for_each_entry_safe(packet, tmp, &file->recv_list, list) { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); + kfree(packet); + } + + list_del(&file->port_list); + + mutex_unlock(&file->mutex); + + if (!already_dead) + for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i) + if (file->agent[i]) + ib_unregister_mad_agent(file->agent[i]); + + mutex_unlock(&file->port->file_mutex); + mutex_destroy(&file->mutex); + kfree(file); + return 0; +} + +static const struct file_operations umad_fops = { + .owner = THIS_MODULE, + .read = ib_umad_read, + .write = ib_umad_write, + .poll = ib_umad_poll, + .unlocked_ioctl = ib_umad_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ib_umad_compat_ioctl, +#endif + .open = ib_umad_open, + .release = ib_umad_close, + .llseek = no_llseek, +}; + +static int ib_umad_sm_open(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port; + struct ib_port_modify props = { + .set_port_cap_mask = IB_PORT_SM + }; + int ret; + + port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); + + if (filp->f_flags & O_NONBLOCK) { + if (down_trylock(&port->sm_sem)) { + ret = -EAGAIN; + goto fail; + } + } else { + if (down_interruptible(&port->sm_sem)) { + ret = -ERESTARTSYS; + goto fail; + } + } + + if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto err_up_sem; + } + + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); + if (ret) + goto err_up_sem; + + filp->private_data = port; + + nonseekable_open(inode, filp); + return 0; + +err_up_sem: + up(&port->sm_sem); + +fail: + return ret; +} + +static int ib_umad_sm_close(struct inode *inode, struct file *filp) +{ + struct ib_umad_port *port = filp->private_data; + struct ib_port_modify props = { + .clr_port_cap_mask = IB_PORT_SM + }; + int ret = 0; + + mutex_lock(&port->file_mutex); + if (port->ib_dev) + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); + mutex_unlock(&port->file_mutex); + + up(&port->sm_sem); + + return ret; +} + +static const struct file_operations umad_sm_fops = { + .owner = THIS_MODULE, + .open = ib_umad_sm_open, + .release = ib_umad_sm_close, + .llseek = no_llseek, +}; + +static struct ib_umad_port *get_port(struct ib_device *ibdev, + struct ib_umad_device *umad_dev, + u32 port) +{ + if (!umad_dev) + return ERR_PTR(-EOPNOTSUPP); + if (!rdma_is_port_valid(ibdev, port)) + return ERR_PTR(-EINVAL); + if (!rdma_cap_ib_mad(ibdev, port)) + return ERR_PTR(-EOPNOTSUPP); + + return &umad_dev->ports[port - rdma_start_port(ibdev)]; +} + +static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_port *port = get_port(ibdev, client_data, res->port); + + if (IS_ERR(port)) + return PTR_ERR(port); + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &port->dev; + return 0; +} + +static struct ib_client umad_client = { + .name = "umad", + .add = ib_umad_add_one, + .remove = ib_umad_remove_one, + 
.get_nl_info = ib_umad_get_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("umad"); + +static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_port *port = get_port(ibdev, client_data, res->port); + + if (IS_ERR(port)) + return PTR_ERR(port); + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &port->sm_dev; + return 0; +} + +static struct ib_client issm_client = { + .name = "issm", + .get_nl_info = ib_issm_get_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("issm"); + +static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_umad_port *port = dev_get_drvdata(dev); + + if (!port) + return -ENODEV; + + return sysfs_emit(buf, "%s\n", dev_name(&port->ib_dev->dev)); +} +static DEVICE_ATTR_RO(ibdev); + +static ssize_t port_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ib_umad_port *port = dev_get_drvdata(dev); + + if (!port) + return -ENODEV; + + return sysfs_emit(buf, "%d\n", port->port_num); +} +static DEVICE_ATTR_RO(port); + +static struct attribute *umad_class_dev_attrs[] = { + &dev_attr_ibdev.attr, + &dev_attr_port.attr, + NULL, +}; +ATTRIBUTE_GROUPS(umad_class_dev); + +static char *umad_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static ssize_t abi_version_show(struct class *class, + struct class_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", IB_USER_MAD_ABI_VERSION); +} +static CLASS_ATTR_RO(abi_version); + +static struct attribute *umad_class_attrs[] = { + &class_attr_abi_version.attr, + NULL, +}; +ATTRIBUTE_GROUPS(umad_class); + +static struct class umad_class = { + .name = "infiniband_mad", + .devnode = umad_devnode, + .class_groups = umad_class_groups, + .dev_groups = umad_class_dev_groups, +}; + +static void ib_umad_release_port(struct device *device) +{ + struct ib_umad_port *port = dev_get_drvdata(device); + struct ib_umad_device *umad_dev = port->umad_dev; + + ib_umad_dev_put(umad_dev); +} + +static void ib_umad_init_port_dev(struct device *dev, + struct ib_umad_port *port, + const struct ib_device *device) +{ + device_initialize(dev); + ib_umad_dev_get(port->umad_dev); + dev->class = &umad_class; + dev->parent = device->dev.parent; + dev_set_drvdata(dev, port); + dev->release = ib_umad_release_port; +} + +static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, + struct ib_umad_port *port) +{ + int devnum; + dev_t base_umad; + dev_t base_issm; + int ret; + + devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL); + if (devnum < 0) + return -1; + port->dev_num = devnum; + if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { + base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + } else { + base_umad = devnum + base_umad_dev; + base_issm = devnum + base_issm_dev; + } + + port->ib_dev = device; + port->umad_dev = umad_dev; + port->port_num = port_num; + sema_init(&port->sm_sem, 1); + mutex_init(&port->file_mutex); + INIT_LIST_HEAD(&port->file_list); + + ib_umad_init_port_dev(&port->dev, port, device); + port->dev.devt = base_umad; + dev_set_name(&port->dev, "umad%d", port->dev_num); + cdev_init(&port->cdev, &umad_fops); + port->cdev.owner = THIS_MODULE; + + ret = cdev_device_add(&port->cdev, &port->dev); + if (ret) + goto err_cdev; + + ib_umad_init_port_dev(&port->sm_dev, port, device); + port->sm_dev.devt = base_issm; + 
dev_set_name(&port->sm_dev, "issm%d", port->dev_num); + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; + + ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); + if (ret) + goto err_dev; + + return 0; + +err_dev: + put_device(&port->sm_dev); + cdev_device_del(&port->cdev, &port->dev); +err_cdev: + put_device(&port->dev); + ida_free(&umad_ida, devnum); + return ret; +} + +static void ib_umad_kill_port(struct ib_umad_port *port) +{ + struct ib_umad_file *file; + int id; + + cdev_device_del(&port->cdev, &port->dev); + mutex_lock(&port->file_mutex); + + /* Mark ib_dev NULL and block ioctl or other file ops to progress + * further. + */ + port->ib_dev = NULL; + + list_for_each_entry(file, &port->file_list, port_list) { + mutex_lock(&file->mutex); + file->agents_dead = 1; + wake_up_interruptible(&file->recv_wait); + mutex_unlock(&file->mutex); + + for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) + if (file->agent[id]) + ib_unregister_mad_agent(file->agent[id]); + } + + mutex_unlock(&port->file_mutex); + + cdev_device_del(&port->sm_cdev, &port->sm_dev); + ida_free(&umad_ida, port->dev_num); + + /* balances device_initialize() */ + put_device(&port->sm_dev); + put_device(&port->dev); +} + +static int ib_umad_add_one(struct ib_device *device) +{ + struct ib_umad_device *umad_dev; + int s, e, i; + int count = 0; + int ret; + + s = rdma_start_port(device); + e = rdma_end_port(device); + + umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL); + if (!umad_dev) + return -ENOMEM; + + kref_init(&umad_dev->kref); + for (i = s; i <= e; ++i) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + ret = ib_umad_init_port(device, i, umad_dev, + &umad_dev->ports[i - s]); + if (ret) + goto err; + + count++; + } + + if (!count) { + ret = -EOPNOTSUPP; + goto free; + } + + ib_set_client_data(device, &umad_client, umad_dev); + + return 0; + +err: + while (--i >= s) { + if (!rdma_cap_ib_mad(device, i)) + continue; + + ib_umad_kill_port(&umad_dev->ports[i - s]); + } +free: + /* balances kref_init */ + ib_umad_dev_put(umad_dev); + return ret; +} + +static void ib_umad_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_umad_device *umad_dev = client_data; + unsigned int i; + + rdma_for_each_port (device, i) { + if (rdma_cap_ib_mad(device, i)) + ib_umad_kill_port( + &umad_dev->ports[i - rdma_start_port(device)]); + } + /* balances kref_init() */ + ib_umad_dev_put(umad_dev); +} + +static int __init ib_umad_init(void) +{ + int ret; + + ret = register_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2, + umad_class.name); + if (ret) { + pr_err("couldn't register device number\n"); + goto out; + } + + ret = alloc_chrdev_region(&dynamic_umad_dev, 0, + IB_UMAD_NUM_DYNAMIC_MINOR * 2, + umad_class.name); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR; + + ret = class_register(&umad_class); + if (ret) { + pr_err("couldn't create class infiniband_mad\n"); + goto out_chrdev; + } + + ret = ib_register_client(&umad_client); + if (ret) + goto out_class; + + ret = ib_register_client(&issm_client); + if (ret) + goto out_client; + + return 0; + +out_client: + ib_unregister_client(&umad_client); +out_class: + class_unregister(&umad_class); + +out_chrdev: + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); + +out_alloc: + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); + +out: + return ret; +} + +static void 
__exit ib_umad_cleanup(void) +{ + ib_unregister_client(&issm_client); + ib_unregister_client(&umad_client); + class_unregister(&umad_class); + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); +} + +module_init(ib_umad_init); +module_exit(ib_umad_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs.h new file mode 100644 index 0000000..821d93c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs.h @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef UVERBS_H +#define UVERBS_H + +#include <linux/kref.h> +#include <linux/idr.h> +#include <linux/mutex.h> +#include <linux/completion.h> +#include <linux/cdev.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/restrack.h> + +#define UVERBS_MODULE_NAME ib_uverbs +#include <rdma/uverbs_named_ioctl.h> + +static inline void +ib_uverbs_init_udata(struct ib_udata *udata, + const void __user *ibuf, + void __user *obuf, + size_t ilen, size_t olen) +{ + udata->inbuf = ibuf; + udata->outbuf = obuf; + udata->inlen = ilen; + udata->outlen = olen; +} + +static inline void +ib_uverbs_init_udata_buf_or_null(struct ib_udata *udata, + const void __user *ibuf, + void __user *obuf, + size_t ilen, size_t olen) +{ + ib_uverbs_init_udata(udata, + ilen ? ibuf : NULL, olen ? obuf : NULL, + ilen, olen); +} + +/* + * Our lifetime rules for these structs are the following: + * + * struct ib_uverbs_device: One reference is held by the module and + * released in ib_uverbs_remove_one(). Another reference is taken by + * ib_uverbs_open() each time the character special file is opened, + * and released in ib_uverbs_release_file() when the file is released. + * + * struct ib_uverbs_file: One reference is held by the VFS and + * released when the file is closed. Another reference is taken when + * an asynchronous event queue file is created and released when the + * event file is closed.
+ * + * struct ib_uverbs_event_queue: Base structure for + * struct ib_uverbs_async_event_file and struct ib_uverbs_completion_event_file. + * One reference is held by the VFS and released when the file is closed. + * For asynchronous event files, another reference is held by the corresponding + * main context file and released when that file is closed. For completion + * event files, a reference is taken when a CQ is created that uses the file, + * and released when the CQ is destroyed. + */ + +struct ib_uverbs_device { + refcount_t refcount; + u32 num_comp_vectors; + struct completion comp; + struct device dev; + /* First group for device attributes, NULL terminated array */ + const struct attribute_group *groups[2]; + struct ib_device __rcu *ib_dev; + int devnum; + struct cdev cdev; + struct rb_root xrcd_tree; + struct mutex xrcd_tree_mutex; + struct srcu_struct disassociate_srcu; + struct mutex lists_mutex; /* protect lists */ + struct list_head uverbs_file_list; + struct uverbs_api *uapi; +}; + +struct ib_uverbs_event_queue { + spinlock_t lock; + int is_closed; + wait_queue_head_t poll_wait; + struct fasync_struct *async_queue; + struct list_head event_list; +}; + +struct ib_uverbs_async_event_file { + struct ib_uobject uobj; + struct ib_uverbs_event_queue ev_queue; + struct ib_event_handler event_handler; +}; + +struct ib_uverbs_completion_event_file { + struct ib_uobject uobj; + struct ib_uverbs_event_queue ev_queue; +}; + +struct ib_uverbs_file { + struct kref ref; + struct ib_uverbs_device *device; + struct mutex ucontext_lock; + /* + * ucontext must be accessed via ib_uverbs_get_ucontext() or with + * ucontext_lock held + */ + struct ib_ucontext *ucontext; + struct ib_uverbs_async_event_file *default_async_file; + struct list_head list; + + /* + * To access the uobjects list hw_destroy_rwsem must be held for write + * OR hw_destroy_rwsem held for read AND uobjects_lock held. + * hw_destroy_rwsem should be called across any destruction of the HW + * object of an associated uobject. 
+ */ + struct rw_semaphore hw_destroy_rwsem; + spinlock_t uobjects_lock; + struct list_head uobjects; + + struct mutex umap_lock; + struct list_head umaps; + struct page *disassociate_page; + + struct xarray idr; +}; + +struct ib_uverbs_event { + union { + struct ib_uverbs_async_event_desc async; + struct ib_uverbs_comp_event_desc comp; + } desc; + struct list_head list; + struct list_head obj_list; + u32 *counter; +}; + +struct ib_uverbs_mcast_entry { + struct list_head list; + union ib_gid gid; + u16 lid; +}; + +struct ib_uevent_object { + struct ib_uobject uobject; + struct ib_uverbs_async_event_file *event_file; + /* List member for ib_uverbs_async_event_file list */ + struct list_head event_list; + u32 events_reported; +}; + +struct ib_uxrcd_object { + struct ib_uobject uobject; + atomic_t refcnt; +}; + +struct ib_usrq_object { + struct ib_uevent_object uevent; + struct ib_uxrcd_object *uxrcd; +}; + +struct ib_uqp_object { + struct ib_uevent_object uevent; + /* lock for mcast list */ + struct mutex mcast_lock; + struct list_head mcast_list; + struct ib_uxrcd_object *uxrcd; +}; + +struct ib_uwq_object { + struct ib_uevent_object uevent; +}; + +struct ib_ucq_object { + struct ib_uevent_object uevent; + struct list_head comp_list; + u32 comp_events_reported; +}; + +extern const struct file_operations uverbs_event_fops; +extern const struct file_operations uverbs_async_event_fops; +void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); +void ib_uverbs_init_async_event_file(struct ib_uverbs_async_event_file *ev_file); +void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); +int uverbs_async_event_release(struct inode *inode, struct file *filp); + +int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs); +int ib_init_ucontext(struct uverbs_attr_bundle *attrs); + +void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, + struct ib_ucq_object *uobj); +void ib_uverbs_release_uevent(struct ib_uevent_object *uobj); +void ib_uverbs_release_file(struct kref *ref); +void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, + __u64 element, __u64 event, + struct list_head *obj_list, u32 *counter); + +void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); +void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); +void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); +int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs); + +int uverbs_dealloc_mw(struct ib_mw *mw); +void ib_uverbs_detach_umcast(struct ib_qp *qp, + struct ib_uqp_object *uobj); + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); + +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_esp esp; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_ipv6 ipv6; + struct ib_uverbs_flow_spec_action_tag flow_tag; + struct ib_uverbs_flow_spec_action_drop drop; + struct ib_uverbs_flow_spec_action_handle action; + struct ib_uverbs_flow_spec_action_count 
flow_count; + }; +}; + +int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, + const void *kern_spec_mask, + const void *kern_spec_val, + size_t kern_filter_sz, + union ib_flow_spec *ib_spec); + +/* + * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the + * PortInfo CapabilityMask, but was extended with unique bits. + */ +static inline u32 make_port_cap_flags(const struct ib_port_attr *attr) +{ + u32 res; + + /* All IBA CapabilityMask bits are passed through here, except bit 26, + * which is overridden with IP_BASED_GIDS. This is due to a historical + * mistake in the implementation of IP_BASED_GIDS. Otherwise all other + * bits match the IBA definition across all kernel versions. + */ + res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS; + + if (attr->ip_gids) + res |= IB_UVERBS_PCF_IP_BASED_GIDS; + + return res; +} + +static inline struct ib_uverbs_async_event_file * +ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs, + u16 id) +{ + struct ib_uobject *async_ev_file_uobj; + struct ib_uverbs_async_event_file *async_ev_file; + + async_ev_file_uobj = uverbs_attr_get_uobject(attrs, id); + if (IS_ERR(async_ev_file_uobj)) + async_ev_file = READ_ONCE(attrs->ufile->default_async_file); + else + async_ev_file = container_of(async_ev_file_uobj, + struct ib_uverbs_async_event_file, + uobj); + if (async_ev_file) + uverbs_uobject_get(&async_ev_file->uobj); + return async_ev_file; +} + +void copy_port_attr_to_resp(struct ib_port_attr *attr, + struct ib_uverbs_query_port_resp *resp, + struct ib_device *ib_dev, u8 port_num); +#endif /* UVERBS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_cmd.c new file mode 100644 index 0000000..ae47a0b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_cmd.c @@ -0,0 +1,4044 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/sched.h> + +#include <linux/uaccess.h> + +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" + +#include "uverbs.h" +#include "core_priv.h" + +/* + * Copy a response to userspace. If the provided 'resp' is larger than the + * user buffer it is silently truncated. If the user provided a larger buffer + * then the trailing portion is zero filled. + * + * These semantics are intended to support future extension of the output + * structures. + */ +static int uverbs_response(struct uverbs_attr_bundle *attrs, const void *resp, + size_t resp_len) +{ + int ret; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT)) + return uverbs_copy_to_struct_or_zero( + attrs, UVERBS_ATTR_CORE_OUT, resp, resp_len); + + if (copy_to_user(attrs->ucore.outbuf, resp, + min(attrs->ucore.outlen, resp_len))) + return -EFAULT; + + if (resp_len < attrs->ucore.outlen) { + /* + * Zero fill any extra memory that user + * space might have provided. + */ + ret = clear_user(attrs->ucore.outbuf + resp_len, + attrs->ucore.outlen - resp_len); + if (ret) + return -EFAULT; + } + + return 0; +} + +/* + * Copy a request from userspace. If the provided 'req' is larger than the + * user buffer then the user buffer is zero extended into the 'req'. If 'req' + * is smaller than the user buffer then the uncopied bytes in the user buffer + * must be zero. + */ +static int uverbs_request(struct uverbs_attr_bundle *attrs, void *req, + size_t req_len) +{ + if (copy_from_user(req, attrs->ucore.inbuf, + min(attrs->ucore.inlen, req_len))) + return -EFAULT; + + if (attrs->ucore.inlen < req_len) { + memset(req + attrs->ucore.inlen, 0, + req_len - attrs->ucore.inlen); + } else if (attrs->ucore.inlen > req_len) { + if (!ib_is_buffer_cleared(attrs->ucore.inbuf + req_len, + attrs->ucore.inlen - req_len)) + return -EOPNOTSUPP; + } + return 0; +} + +/* + * Generate the value for the 'response_length' protocol used by write_ex. + * This is the number of bytes the kernel actually wrote. Userspace can use + * this to detect what structure members in the response the kernel + * understood. + */ +static u32 uverbs_response_length(struct uverbs_attr_bundle *attrs, + size_t resp_len) +{ + return min_t(size_t, attrs->ucore.outlen, resp_len); +} + +/* + * The iterator version of the request interface is for handlers that need to + * step over a flex array at the end of a command header.
+ */ +struct uverbs_req_iter { + const void __user *cur; + const void __user *end; +}; + +static int uverbs_request_start(struct uverbs_attr_bundle *attrs, + struct uverbs_req_iter *iter, + void *req, + size_t req_len) +{ + if (attrs->ucore.inlen < req_len) + return -ENOSPC; + + if (copy_from_user(req, attrs->ucore.inbuf, req_len)) + return -EFAULT; + + iter->cur = attrs->ucore.inbuf + req_len; + iter->end = attrs->ucore.inbuf + attrs->ucore.inlen; + return 0; +} + +static int uverbs_request_next(struct uverbs_req_iter *iter, void *val, + size_t len) +{ + if (iter->cur + len > iter->end) + return -ENOSPC; + + if (copy_from_user(val, iter->cur, len)) + return -EFAULT; + + iter->cur += len; + return 0; +} + +static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter, + size_t len) +{ + const void __user *res = iter->cur; + + if (iter->cur + len > iter->end) + return (void __force __user *)ERR_PTR(-ENOSPC); + iter->cur += len; + return res; +} + +static int uverbs_request_finish(struct uverbs_req_iter *iter) +{ + if (!ib_is_buffer_cleared(iter->cur, iter->end - iter->cur)) + return -EOPNOTSUPP; + return 0; +} + +/* + * When calling a destroy function during an error unwind we need to pass in + * the udata that is sanitized of all user arguments. Ie from the driver + * perspective it looks like no udata was passed. + */ +struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs) +{ + attrs->driver_udata = (struct ib_udata){}; + return &attrs->driver_udata; +} + +static struct ib_uverbs_completion_event_file * +_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL, + fd, attrs); + + if (IS_ERR(uobj)) + return (void *)uobj; + + uverbs_uobject_get(uobj); + uobj_put_read(uobj); + + return container_of(uobj, struct ib_uverbs_completion_event_file, + uobj); +} +#define ib_uverbs_lookup_comp_file(_fd, _ufile) \ + _ib_uverbs_lookup_comp_file((_fd)*typecheck(s32, _fd), _ufile) + +int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + + ib_dev = srcu_dereference(ufile->device->ib_dev, + &ufile->device->disassociate_srcu); + if (!ib_dev) + return -EIO; + + ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext); + if (!ucontext) + return -ENOMEM; + + ucontext->device = ib_dev; + ucontext->ufile = ufile; + xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC); + + rdma_restrack_new(&ucontext->res, RDMA_RESTRACK_CTX); + rdma_restrack_set_name(&ucontext->res, NULL); + attrs->context = ucontext; + return 0; +} + +int ib_init_ucontext(struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = attrs->context; + struct ib_uverbs_file *file = attrs->ufile; + int ret; + + if (!down_read_trylock(&file->hw_destroy_rwsem)) + return -EIO; + mutex_lock(&file->ucontext_lock); + if (file->ucontext) { + ret = -EINVAL; + goto err; + } + + ret = ib_rdmacg_try_charge(&ucontext->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_HANDLE); + if (ret) + goto err; + + ret = ucontext->device->ops.alloc_ucontext(ucontext, + &attrs->driver_udata); + if (ret) + goto err_uncharge; + + rdma_restrack_add(&ucontext->res); + + /* + * Make sure that ib_uverbs_get_ucontext() sees the pointer update + * only after all writes to setup the ucontext have completed + */ + smp_store_release(&file->ucontext, ucontext); + + mutex_unlock(&file->ucontext_lock); + up_read(&file->hw_destroy_rwsem); + return 0; + 
+err_uncharge: + ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_HANDLE); +err: + mutex_unlock(&file->ucontext_lock); + up_read(&file->hw_destroy_rwsem); + return ret; +} + +static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_get_context_resp resp; + struct ib_uverbs_get_context cmd; + struct ib_device *ib_dev; + struct ib_uobject *uobj; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + ret = ib_alloc_ucontext(attrs); + if (ret) + return ret; + + uobj = uobj_alloc(UVERBS_OBJECT_ASYNC_EVENT, attrs, &ib_dev); + if (IS_ERR(uobj)) { + ret = PTR_ERR(uobj); + goto err_ucontext; + } + + resp = (struct ib_uverbs_get_context_resp){ + .num_comp_vectors = attrs->ufile->device->num_comp_vectors, + .async_fd = uobj->id, + }; + ret = uverbs_response(attrs, &resp, sizeof(resp)); + if (ret) + goto err_uobj; + + ret = ib_init_ucontext(attrs); + if (ret) + goto err_uobj; + + ib_uverbs_init_async_event_file( + container_of(uobj, struct ib_uverbs_async_event_file, uobj)); + rdma_alloc_commit_uobject(uobj, attrs); + return 0; + +err_uobj: + rdma_alloc_abort_uobject(uobj, attrs, false); +err_ucontext: + rdma_restrack_put(&attrs->context->res); + kfree(attrs->context); + attrs->context = NULL; + return ret; +} + +static void copy_query_dev_fields(struct ib_ucontext *ucontext, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) +{ + struct ib_device *ib_dev = ucontext->device; + + resp->fw_ver = attr->fw_ver; + resp->node_guid = ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); + resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge); + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = min_t(u32, ib_dev->phys_port_cnt, U8_MAX); +} + +static int ib_uverbs_query_device(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_query_device cmd; + struct ib_uverbs_query_device_resp resp; + struct ib_ucontext *ucontext; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + memset(&resp, 0, 
sizeof resp); + copy_query_dev_fields(ucontext, &resp, &ucontext->device->attrs); + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_query_port cmd; + struct ib_uverbs_query_port_resp resp; + struct ib_port_attr attr; + int ret; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + ret = ib_query_port(ib_dev, cmd.port_num, &attr); + if (ret) + return ret; + + memset(&resp, 0, sizeof resp); + copy_port_attr_to_resp(&attr, &resp, ib_dev, cmd.port_num); + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_alloc_pd_resp resp = {}; + struct ib_uverbs_alloc_pd cmd; + struct ib_uobject *uobj; + struct ib_pd *pd; + int ret; + struct ib_device *ib_dev; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + uobj = uobj_alloc(UVERBS_OBJECT_PD, attrs, &ib_dev); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + pd = rdma_zalloc_drv_obj(ib_dev, ib_pd); + if (!pd) { + ret = -ENOMEM; + goto err; + } + + pd->device = ib_dev; + pd->uobject = uobj; + atomic_set(&pd->usecnt, 0); + + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, NULL); + + ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata); + if (ret) + goto err_alloc; + rdma_restrack_add(&pd->res); + + uobj->object = pd; + uobj_finalize_uobj_create(uobj, attrs); + + resp.pd_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_alloc: + rdma_restrack_put(&pd->res); + kfree(pd); +err: + uobj_alloc_abort(uobj, attrs); + return ret; +} + +static int ib_uverbs_dealloc_pd(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_dealloc_pd cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, attrs); +} + +struct xrcd_table_entry { + struct rb_node node; + struct ib_xrcd *xrcd; + struct inode *inode; +}; + +static int xrcd_table_insert(struct ib_uverbs_device *dev, + struct inode *inode, + struct ib_xrcd *xrcd) +{ + struct xrcd_table_entry *entry, *scan; + struct rb_node **p = &dev->xrcd_tree.rb_node; + struct rb_node *parent = NULL; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->xrcd = xrcd; + entry->inode = inode; + + while (*p) { + parent = *p; + scan = rb_entry(parent, struct xrcd_table_entry, node); + + if (inode < scan->inode) { + p = &(*p)->rb_left; + } else if (inode > scan->inode) { + p = &(*p)->rb_right; + } else { + kfree(entry); + return -EEXIST; + } + } + + rb_link_node(&entry->node, parent, p); + rb_insert_color(&entry->node, &dev->xrcd_tree); + igrab(inode); + return 0; +} + +static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + struct rb_node *p = dev->xrcd_tree.rb_node; + + while (p) { + entry = rb_entry(p, struct xrcd_table_entry, node); + + if (inode < entry->inode) + p = p->rb_left; + else if (inode > entry->inode) + p = p->rb_right; + else + return entry; + } + + return NULL; +} + +static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = 
xrcd_table_search(dev, inode); + if (!entry) + return NULL; + + return entry->xrcd; +} + +static void xrcd_table_delete(struct ib_uverbs_device *dev, + struct inode *inode) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, inode); + if (entry) { + iput(inode); + rb_erase(&entry->node, &dev->xrcd_tree); + kfree(entry); + } +} + +static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_device *ibudev = attrs->ufile->device; + struct ib_uverbs_open_xrcd_resp resp = {}; + struct ib_uverbs_open_xrcd cmd; + struct ib_uxrcd_object *obj; + struct ib_xrcd *xrcd = NULL; + struct inode *inode = NULL; + int new_xrcd = 0; + struct ib_device *ib_dev; + struct fd f = {}; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + mutex_lock(&ibudev->xrcd_tree_mutex); + + if (cmd.fd != -1) { + /* search for file descriptor */ + f = fdget(cmd.fd); + if (!f.file) { + ret = -EBADF; + goto err_tree_mutex_unlock; + } + + inode = file_inode(f.file); + xrcd = find_xrcd(ibudev, inode); + if (!xrcd && !(cmd.oflags & O_CREAT)) { + /* no file descriptor. Need CREATE flag */ + ret = -EAGAIN; + goto err_tree_mutex_unlock; + } + + if (xrcd && cmd.oflags & O_EXCL) { + ret = -EINVAL; + goto err_tree_mutex_unlock; + } + } + + obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, attrs, + &ib_dev); + if (IS_ERR(obj)) { + ret = PTR_ERR(obj); + goto err_tree_mutex_unlock; + } + + if (!xrcd) { + xrcd = ib_alloc_xrcd_user(ib_dev, inode, &attrs->driver_udata); + if (IS_ERR(xrcd)) { + ret = PTR_ERR(xrcd); + goto err; + } + new_xrcd = 1; + } + + atomic_set(&obj->refcnt, 0); + obj->uobject.object = xrcd; + + if (inode) { + if (new_xrcd) { + /* create new inode/xrcd table entry */ + ret = xrcd_table_insert(ibudev, inode, xrcd); + if (ret) + goto err_dealloc_xrcd; + } + atomic_inc(&xrcd->usecnt); + } + + if (f.file) + fdput(f); + + mutex_unlock(&ibudev->xrcd_tree_mutex); + uobj_finalize_uobj_create(&obj->uobject, attrs); + + resp.xrcd_handle = obj->uobject.id; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_dealloc_xrcd: + ib_dealloc_xrcd_user(xrcd, uverbs_get_cleared_udata(attrs)); + +err: + uobj_alloc_abort(&obj->uobject, attrs); + +err_tree_mutex_unlock: + if (f.file) + fdput(f); + + mutex_unlock(&ibudev->xrcd_tree_mutex); + + return ret; +} + +static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_close_xrcd cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs); +} + +int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct inode *inode; + int ret; + struct ib_uverbs_device *dev = attrs->ufile->device; + + inode = xrcd->inode; + if (inode && !atomic_dec_and_test(&xrcd->usecnt)) + return 0; + + ret = ib_dealloc_xrcd_user(xrcd, &attrs->driver_udata); + if (ret) { + atomic_inc(&xrcd->usecnt); + return ret; + } + + if (inode) + xrcd_table_delete(dev, inode); + + return 0; +} + +static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_reg_mr_resp resp = {}; + struct ib_uverbs_reg_mr cmd; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mr *mr; + int ret; + struct ib_device *ib_dev; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) + return -EINVAL; + + uobj = 
uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ret = ib_check_mr_access(ib_dev, cmd.access_flags); + if (ret) + goto err_free; + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, + cmd.access_flags, + &attrs->driver_udata); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto err_put; + } + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->dm = NULL; + mr->sig_attrs = NULL; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + mr->iova = cmd.hca_va; + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + + uobj->object = mr; + uobj_put_obj_read(pd); + uobj_finalize_uobj_create(uobj, attrs); + + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + resp.mr_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_put: + uobj_put_obj_read(pd); +err_free: + uobj_alloc_abort(uobj, attrs); + return ret; +} + +static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_mr *mr; + int ret; + struct ib_uobject *uobj; + struct ib_uobject *new_uobj; + struct ib_device *ib_dev; + struct ib_pd *orig_pd; + struct ib_pd *new_pd; + struct ib_mr *new_mr; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (!cmd.flags) + return -EINVAL; + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED) + return -EOPNOTSUPP; + + if ((cmd.flags & IB_MR_REREG_TRANS) && + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) + return -EINVAL; + + uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, attrs); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + mr = uobj->object; + + if (mr->dm) { + ret = -EINVAL; + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(mr->device, cmd.access_flags); + if (ret) + goto put_uobjs; + } + + orig_pd = mr->pd; + if (cmd.flags & IB_MR_REREG_PD) { + new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, + attrs); + if (!new_pd) { + ret = -EINVAL; + goto put_uobjs; + } + } else { + new_pd = mr->pd; + } + + /* + * The driver might create a new HW object as part of the rereg, we need + * to have a uobject ready to hold it. + */ + new_uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev); + if (IS_ERR(new_uobj)) { + ret = PTR_ERR(new_uobj); + goto put_uobj_pd; + } + + new_mr = ib_dev->ops.rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length, + cmd.hca_va, cmd.access_flags, new_pd, + &attrs->driver_udata); + if (IS_ERR(new_mr)) { + ret = PTR_ERR(new_mr); + goto put_new_uobj; + } + if (new_mr) { + new_mr->device = new_pd->device; + new_mr->pd = new_pd; + new_mr->type = IB_MR_TYPE_USER; + new_mr->uobject = uobj; + atomic_inc(&new_pd->usecnt); + new_uobj->object = new_mr; + + rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&new_mr->res, NULL); + rdma_restrack_add(&new_mr->res); + + /* + * The new uobj for the new HW object is put into the same spot + * in the IDR and the old uobj & HW object is deleted. 
+ */ + rdma_assign_uobject(uobj, new_uobj, attrs); + rdma_alloc_commit_uobject(new_uobj, attrs); + uobj_put_destroy(uobj); + new_uobj = NULL; + uobj = NULL; + mr = new_mr; + } else { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_dec(&orig_pd->usecnt); + mr->pd = new_pd; + atomic_inc(&new_pd->usecnt); + } + if (cmd.flags & IB_MR_REREG_TRANS) + mr->iova = cmd.hca_va; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + ret = uverbs_response(attrs, &resp, sizeof(resp)); + +put_new_uobj: + if (new_uobj) + uobj_alloc_abort(new_uobj, attrs); +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + uobj_put_obj_read(new_pd); + +put_uobjs: + if (uobj) + uobj_put_write(uobj); + + return ret; +} + +static int ib_uverbs_dereg_mr(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_dereg_mr cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, attrs); +} + +static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp = {}; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mw *mw; + int ret; + struct ib_device *ib_dev; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + uobj = uobj_alloc(UVERBS_OBJECT_MW, attrs, &ib_dev); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + + if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) { + ret = -EINVAL; + goto err_put; + } + + mw = rdma_zalloc_drv_obj(ib_dev, ib_mw); + if (!mw) { + ret = -ENOMEM; + goto err_put; + } + + mw->device = ib_dev; + mw->pd = pd; + mw->uobject = uobj; + mw->type = cmd.mw_type; + + ret = pd->device->ops.alloc_mw(mw, &attrs->driver_udata); + if (ret) + goto err_alloc; + + atomic_inc(&pd->usecnt); + + uobj->object = mw; + uobj_put_obj_read(pd); + uobj_finalize_uobj_create(uobj, attrs); + + resp.rkey = mw->rkey; + resp.mw_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_alloc: + kfree(mw); +err_put: + uobj_put_obj_read(pd); +err_free: + uobj_alloc_abort(uobj, attrs); + return ret; +} + +static int ib_uverbs_dealloc_mw(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_dealloc_mw cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, attrs); +} + +static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_comp_channel cmd; + struct ib_uverbs_create_comp_channel_resp resp; + struct ib_uobject *uobj; + struct ib_uverbs_completion_event_file *ev_file; + struct ib_device *ib_dev; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, attrs, &ib_dev); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + ev_file = container_of(uobj, struct ib_uverbs_completion_event_file, + uobj); + ib_uverbs_init_event_queue(&ev_file->ev_queue); + uobj_finalize_uobj_create(uobj, attrs); + + resp.fd = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int create_cq(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_ex_create_cq *cmd) +{ + struct ib_ucq_object *obj; + struct ib_uverbs_completion_event_file *ev_file = NULL; + struct ib_cq *cq; + int ret; + struct ib_uverbs_ex_create_cq_resp resp = {}; + 
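Illustrative aside, not taken from this patch: the completion-channel and CQ commands handled in this file are normally driven from user space through libibverbs. A minimal sketch, assuming a single RDMA device is present, with error handling trimmed:

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
        struct ibv_device **devs = ibv_get_device_list(NULL);
        struct ibv_context *ctx;
        struct ibv_comp_channel *ch;
        struct ibv_cq *cq;

        if (!devs || !devs[0])
                return 1;
        ctx = ibv_open_device(devs[0]);
        if (!ctx)
                return 1;
        /* served by ib_uverbs_create_comp_channel() */
        ch = ibv_create_comp_channel(ctx);
        if (!ch)
                return 1;
        /* served by create_cq(); comp_channel >= 0 selects the channel */
        cq = ibv_create_cq(ctx, 64, NULL, ch, 0);
        if (!cq)
                return 1;
        printf("cq of %d entries bound to completion channel fd %d\n",
               cq->cqe, ch->fd);
        ibv_destroy_cq(cq);             /* destroy path: ib_uverbs_destroy_cq() */
        ibv_destroy_comp_channel(ch);
        ibv_close_device(ctx);
        ibv_free_device_list(devs);
        return 0;
}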
struct ib_cq_init_attr attr = {}; + struct ib_device *ib_dev; + + if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors) + return -EINVAL; + + obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs, + &ib_dev); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + if (cmd->comp_channel >= 0) { + ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, attrs); + if (IS_ERR(ev_file)) { + ret = PTR_ERR(ev_file); + goto err; + } + } + + obj->uevent.uobject.user_handle = cmd->user_handle; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->uevent.event_list); + + attr.cqe = cmd->cqe; + attr.comp_vector = cmd->comp_vector; + attr.flags = cmd->flags; + + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; + goto err_file; + } + cq->device = ib_dev; + cq->uobject = obj; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; + atomic_set(&cq->usecnt, 0); + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); + + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + rdma_restrack_add(&cq->res); + + obj->uevent.uobject.object = cq; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); + + resp.base.cq_handle = obj->uevent.uobject.id; + resp.base.cqe = cq->cqe; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_free: + rdma_restrack_put(&cq->res); + kfree(cq); +err_file: + if (ev_file) + ib_uverbs_release_ucq(ev_file, obj); +err: + uobj_alloc_abort(&obj->uevent.uobject, attrs); + return ret; +} + +static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_cq cmd; + struct ib_uverbs_ex_create_cq cmd_ex; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.cqe = cmd.cqe; + cmd_ex.comp_vector = cmd.comp_vector; + cmd_ex.comp_channel = cmd.comp_channel; + + return create_cq(attrs, &cmd_ex); +} + +static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_create_cq cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + return create_cq(attrs, &cmd); +} + +static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_resize_cq cmd; + struct ib_uverbs_resize_cq_resp resp = {}; + struct ib_cq *cq; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); + if (!cq) + return -EINVAL; + + ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata); + if (ret) + goto out; + + resp.cqe = cq->cqe; + + ret = uverbs_response(attrs, &resp, sizeof(resp)); +out: + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + return ret; +} + +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) +{ + struct ib_uverbs_wc tmp; + + tmp.wr_id = wc->wr_id; + tmp.status = wc->status; + tmp.opcode = wc->opcode; + tmp.vendor_err = wc->vendor_err; + tmp.byte_len = wc->byte_len; + 
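Illustrative aside, not taken from this patch: copy_wc_to_user() here fills the ib_uverbs_wc layout that user space receives; depending on the provider, ibv_poll_cq() may complete entirely in user space or fall back to the poll_cq command below. A hedged sketch of a completion consumer, error handling trimmed:

#include <stdio.h>
#include <infiniband/verbs.h>

/* Drain up to 'budget' completions from 'cq' and print their key fields. */
static int drain_cq(struct ibv_cq *cq, int budget)
{
        struct ibv_wc wc[16];
        int total = 0;

        while (total < budget) {
                int n = ibv_poll_cq(cq, 16, wc);
                if (n < 0)
                        return n;       /* polling error */
                if (n == 0)
                        break;          /* CQ is empty */
                for (int i = 0; i < n; i++)
                        printf("wr_id=%llu status=%s opcode=%d byte_len=%u\n",
                               (unsigned long long)wc[i].wr_id,
                               ibv_wc_status_str(wc[i].status),
                               wc[i].opcode, wc[i].byte_len);
                total += n;
        }
        return total;
}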
tmp.ex.imm_data = wc->ex.imm_data; + tmp.qp_num = wc->qp->qp_num; + tmp.src_qp = wc->src_qp; + tmp.wc_flags = wc->wc_flags; + tmp.pkey_index = wc->pkey_index; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_lid_cpu16(wc->slid); + tmp.sl = wc->sl; + tmp.dlid_path_bits = wc->dlid_path_bits; + tmp.port_num = wc->port_num; + tmp.reserved = 0; + + if (copy_to_user(dest, &tmp, sizeof tmp)) + return -EFAULT; + + return 0; +} + +static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_poll_cq cmd; + struct ib_uverbs_poll_cq_resp resp; + u8 __user *header_ptr; + u8 __user *data_ptr; + struct ib_cq *cq; + struct ib_wc wc; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); + if (!cq) + return -EINVAL; + + /* we copy a struct ib_uverbs_poll_cq_resp to user space */ + header_ptr = attrs->ucore.outbuf; + data_ptr = header_ptr + sizeof resp; + + memset(&resp, 0, sizeof resp); + while (resp.count < cmd.ne) { + ret = ib_poll_cq(cq, 1, &wc); + if (ret < 0) + goto out_put; + if (!ret) + break; + + ret = copy_wc_to_user(cq->device, data_ptr, &wc); + if (ret) + goto out_put; + + data_ptr += sizeof(struct ib_uverbs_wc); + ++resp.count; + } + + if (copy_to_user(header_ptr, &resp, sizeof resp)) { + ret = -EFAULT; + goto out_put; + } + ret = 0; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT)) + ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT); + +out_put: + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + return ret; +} + +static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_req_notify_cq cmd; + struct ib_cq *cq; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); + if (!cq) + return -EINVAL; + + ib_req_notify_cq(cq, cmd.solicited_only ? 
+ IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); + + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + return 0; +} + +static int ib_uverbs_destroy_cq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq cmd; + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj; + struct ib_ucq_object *obj; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_ucq_object, uevent.uobject); + memset(&resp, 0, sizeof(resp)); + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->uevent.events_reported; + + uobj_put_destroy(uobj); + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int create_qp(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_ex_create_qp *cmd) +{ + struct ib_uqp_object *obj; + struct ib_device *device; + struct ib_pd *pd = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *xrcd_uobj = ERR_PTR(-ENOENT); + struct ib_cq *scq = NULL, *rcq = NULL; + struct ib_srq *srq = NULL; + struct ib_qp *qp; + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_ex_create_qp_resp resp = {}; + int ret; + struct ib_rwq_ind_table *ind_tbl = NULL; + bool has_sq = true; + struct ib_device *ib_dev; + + switch (cmd->qp_type) { + case IB_QPT_RAW_PACKET: + if (!capable(CAP_NET_RAW)) + return -EPERM; + break; + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + case IB_QPT_DRIVER: + break; + default: + return -EINVAL; + } + + obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs, + &ib_dev); + if (IS_ERR(obj)) + return PTR_ERR(obj); + obj->uxrcd = NULL; + obj->uevent.uobject.user_handle = cmd->user_handle; + mutex_init(&obj->mcast_lock); + + if (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE) { + ind_tbl = uobj_get_obj_read(rwq_ind_table, + UVERBS_OBJECT_RWQ_IND_TBL, + cmd->rwq_ind_tbl_handle, attrs); + if (!ind_tbl) { + ret = -EINVAL; + goto err_put; + } + + attr.rwq_ind_tbl = ind_tbl; + } + + if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) { + ret = -EINVAL; + goto err_put; + } + + if (ind_tbl && !cmd->max_send_wr) + has_sq = false; + + if (cmd->qp_type == IB_QPT_XRC_TGT) { + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle, + attrs); + + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err_put; + } + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) { + ret = -EINVAL; + goto err_put; + } + device = xrcd->device; + } else { + if (cmd->qp_type == IB_QPT_XRC_INI) { + cmd->max_recv_wr = 0; + cmd->max_recv_sge = 0; + } else { + if (cmd->is_srq) { + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, + cmd->srq_handle, attrs); + if (!srq || srq->srq_type == IB_SRQT_XRC) { + ret = -EINVAL; + goto err_put; + } + } + + if (!ind_tbl) { + if (cmd->recv_cq_handle != cmd->send_cq_handle) { + rcq = uobj_get_obj_read( + cq, UVERBS_OBJECT_CQ, + cmd->recv_cq_handle, attrs); + if (!rcq) { + ret = -EINVAL; + goto err_put; + } + } + } + } + + if (has_sq) + scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, + cmd->send_cq_handle, attrs); + if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI) + rcq = rcq ?: scq; + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, + attrs); + if (!pd || (!scq && has_sq)) { + ret = -EINVAL; + goto err_put; + } + + device = pd->device; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.send_cq = 
scq; + attr.recv_cq = rcq; + attr.srq = srq; + attr.xrcd = xrcd; + attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : + IB_SIGNAL_REQ_WR; + attr.qp_type = cmd->qp_type; + + attr.cap.max_send_wr = cmd->max_send_wr; + attr.cap.max_recv_wr = cmd->max_recv_wr; + attr.cap.max_send_sge = cmd->max_send_sge; + attr.cap.max_recv_sge = cmd->max_recv_sge; + attr.cap.max_inline_data = cmd->max_inline_data; + + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + attr.create_flags = cmd->create_flags; + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV | + IB_QP_CREATE_SCATTER_FCS | + IB_QP_CREATE_CVLAN_STRIPPING | + IB_QP_CREATE_SOURCE_QPN | + IB_QP_CREATE_PCI_WRITE_END_PADDING)) { + ret = -EINVAL; + goto err_put; + } + + if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + goto err_put; + } + + attr.source_qpn = cmd->source_qpn; + } + + qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj, + KBUILD_MODNAME); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + ib_qp_usecnt_inc(qp); + + obj->uevent.uobject.object = qp; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + + if (xrcd) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + uobj_put_read(xrcd_uobj); + } + + if (pd) + uobj_put_obj_read(pd); + if (scq) + rdma_lookup_put_uobject(&scq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (rcq && rcq != scq) + rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (srq) + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (ind_tbl) + uobj_put_obj_read(ind_tbl); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); + + resp.base.qpn = qp->qp_num; + resp.base.qp_handle = obj->uevent.uobject.id; + resp.base.max_recv_sge = attr.cap.max_recv_sge; + resp.base.max_send_sge = attr.cap.max_send_sge; + resp.base.max_recv_wr = attr.cap.max_recv_wr; + resp.base.max_send_wr = attr.cap.max_send_wr; + resp.base.max_inline_data = attr.cap.max_inline_data; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_put: + if (!IS_ERR(xrcd_uobj)) + uobj_put_read(xrcd_uobj); + if (pd) + uobj_put_obj_read(pd); + if (scq) + rdma_lookup_put_uobject(&scq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (rcq && rcq != scq) + rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (srq) + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (ind_tbl) + uobj_put_obj_read(ind_tbl); + + uobj_alloc_abort(&obj->uevent.uobject, attrs); + return ret; +} + +static int ib_uverbs_create_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_qp cmd; + struct ib_uverbs_ex_create_qp cmd_ex; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + memset(&cmd_ex, 0, sizeof(cmd_ex)); + cmd_ex.user_handle = cmd.user_handle; + cmd_ex.pd_handle = cmd.pd_handle; + cmd_ex.send_cq_handle = cmd.send_cq_handle; + cmd_ex.recv_cq_handle = cmd.recv_cq_handle; + cmd_ex.srq_handle = cmd.srq_handle; + cmd_ex.max_send_wr = cmd.max_send_wr; + cmd_ex.max_recv_wr = cmd.max_recv_wr; + cmd_ex.max_send_sge = cmd.max_send_sge; + cmd_ex.max_recv_sge = 
cmd.max_recv_sge; + cmd_ex.max_inline_data = cmd.max_inline_data; + cmd_ex.sq_sig_all = cmd.sq_sig_all; + cmd_ex.qp_type = cmd.qp_type; + cmd_ex.is_srq = cmd.is_srq; + + return create_qp(attrs, &cmd_ex); +} + +static int ib_uverbs_ex_create_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_create_qp cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + return create_qp(attrs, &cmd); +} + +static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_qp_resp resp = {}; + struct ib_uverbs_open_qp cmd; + struct ib_uqp_object *obj; + struct ib_xrcd *xrcd; + struct ib_qp *qp; + struct ib_qp_open_attr attr = {}; + int ret; + struct ib_uobject *xrcd_uobj; + struct ib_device *ib_dev; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs, + &ib_dev); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, attrs); + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err_put; + } + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) { + ret = -EINVAL; + goto err_xrcd; + } + + attr.event_handler = ib_uverbs_qp_event_handler; + attr.qp_num = cmd.qpn; + attr.qp_type = cmd.qp_type; + + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + + qp = ib_open_qp(xrcd, &attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_xrcd; + } + + obj->uevent.uobject.object = qp; + obj->uevent.uobject.user_handle = cmd.user_handle; + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + qp->uobject = obj; + uobj_put_read(xrcd_uobj); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); + + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_xrcd: + uobj_put_read(xrcd_uobj); +err_put: + uobj_alloc_abort(&obj->uevent.uobject, attrs); + return ret; +} + +static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr, + struct rdma_ah_attr *rdma_attr) +{ + const struct ib_global_route *grh; + + uverb_attr->dlid = rdma_ah_get_dlid(rdma_attr); + uverb_attr->sl = rdma_ah_get_sl(rdma_attr); + uverb_attr->src_path_bits = rdma_ah_get_path_bits(rdma_attr); + uverb_attr->static_rate = rdma_ah_get_static_rate(rdma_attr); + uverb_attr->is_global = !!(rdma_ah_get_ah_flags(rdma_attr) & + IB_AH_GRH); + if (uverb_attr->is_global) { + grh = rdma_ah_read_grh(rdma_attr); + memcpy(uverb_attr->dgid, grh->dgid.raw, 16); + uverb_attr->flow_label = grh->flow_label; + uverb_attr->sgid_index = grh->sgid_index; + uverb_attr->hop_limit = grh->hop_limit; + uverb_attr->traffic_class = grh->traffic_class; + } + uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr); +} + +static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_query_qp cmd; + struct ib_uverbs_query_qp_resp resp; + struct ib_qp *qp; + struct ib_qp_attr *attr; + struct ib_qp_init_attr *init_attr; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto out; + } + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (!qp) { + ret = -EINVAL; + 
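Illustrative aside, not taken from this patch: the QP creation and query commands above correspond to ibv_create_qp() and ibv_query_qp() in libibverbs. A minimal sketch that builds an RC QP on an existing PD and CQ and reads back its state; the queue sizes are placeholders and error handling is trimmed:

#include <stdio.h>
#include <infiniband/verbs.h>

/* Create an RC QP with a shared send/recv CQ and query its state and caps. */
static struct ibv_qp *make_rc_qp(struct ibv_pd *pd, struct ibv_cq *cq)
{
        struct ibv_qp_init_attr init = {
                .send_cq = cq,
                .recv_cq = cq,  /* same CQ: recv_cq_handle == send_cq_handle above */
                .cap = { .max_send_wr = 64, .max_recv_wr = 64,
                         .max_send_sge = 1, .max_recv_sge = 1 },
                .qp_type = IBV_QPT_RC,
        };
        struct ibv_qp_attr attr;
        struct ibv_qp_init_attr out;
        struct ibv_qp *qp;

        qp = ibv_create_qp(pd, &init);  /* typically lands in create_qp() */
        if (!qp)
                return NULL;
        /* served by ib_uverbs_query_qp() */
        if (!ibv_query_qp(qp, &attr, IBV_QP_STATE | IBV_QP_CAP, &out))
                printf("qpn=0x%x state=%d max_send_wr=%u\n",
                       (unsigned)qp->qp_num, attr.qp_state,
                       out.cap.max_send_wr);
        return qp;
}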
goto out; + } + + ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); + + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + if (ret) + goto out; + + memset(&resp, 0, sizeof resp); + + resp.qp_state = attr->qp_state; + resp.cur_qp_state = attr->cur_qp_state; + resp.path_mtu = attr->path_mtu; + resp.path_mig_state = attr->path_mig_state; + resp.qkey = attr->qkey; + resp.rq_psn = attr->rq_psn; + resp.sq_psn = attr->sq_psn; + resp.dest_qp_num = attr->dest_qp_num; + resp.qp_access_flags = attr->qp_access_flags; + resp.pkey_index = attr->pkey_index; + resp.alt_pkey_index = attr->alt_pkey_index; + resp.sq_draining = attr->sq_draining; + resp.max_rd_atomic = attr->max_rd_atomic; + resp.max_dest_rd_atomic = attr->max_dest_rd_atomic; + resp.min_rnr_timer = attr->min_rnr_timer; + resp.port_num = attr->port_num; + resp.timeout = attr->timeout; + resp.retry_cnt = attr->retry_cnt; + resp.rnr_retry = attr->rnr_retry; + resp.alt_port_num = attr->alt_port_num; + resp.alt_timeout = attr->alt_timeout; + + copy_ah_attr_to_uverbs(&resp.dest, &attr->ah_attr); + copy_ah_attr_to_uverbs(&resp.alt_dest, &attr->alt_ah_attr); + + resp.max_send_wr = init_attr->cap.max_send_wr; + resp.max_recv_wr = init_attr->cap.max_recv_wr; + resp.max_send_sge = init_attr->cap.max_send_sge; + resp.max_recv_sge = init_attr->cap.max_recv_sge; + resp.max_inline_data = init_attr->cap.max_inline_data; + resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; + + ret = uverbs_response(attrs, &resp, sizeof(resp)); + +out: + kfree(attr); + kfree(init_attr); + + return ret; +} + +/* Remove ignored fields set in the attribute mask */ +static int modify_qp_mask(enum ib_qp_type qp_type, int mask) +{ + switch (qp_type) { + case IB_QPT_XRC_INI: + return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER); + case IB_QPT_XRC_TGT: + return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY); + default: + return mask; + } +} + +static void copy_ah_attr_from_uverbs(struct ib_qp *qp, + struct rdma_ah_attr *rdma_attr, + struct ib_uverbs_qp_dest *uverb_attr) +{ + rdma_attr->type = rdma_ah_find_type(qp->device, qp->real_qp->port); + if (uverb_attr->is_global) { + rdma_ah_set_grh(rdma_attr, NULL, + uverb_attr->flow_label, + uverb_attr->sgid_index, + uverb_attr->hop_limit, + uverb_attr->traffic_class); + rdma_ah_set_dgid_raw(rdma_attr, uverb_attr->dgid); + } else { + rdma_ah_set_ah_flags(rdma_attr, 0); + } + rdma_ah_set_dlid(rdma_attr, uverb_attr->dlid); + rdma_ah_set_sl(rdma_attr, uverb_attr->sl); + rdma_ah_set_path_bits(rdma_attr, uverb_attr->src_path_bits); + rdma_ah_set_static_rate(rdma_attr, uverb_attr->static_rate); + rdma_ah_set_port_num(rdma_attr, uverb_attr->port_num); + rdma_ah_set_make_grd(rdma_attr, false); +} + +static int modify_qp(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_ex_modify_qp *cmd) +{ + struct ib_qp_attr *attr; + struct ib_qp *qp; + int ret; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, + attrs); + if (!qp) { + ret = -EINVAL; + goto out; + } + + if ((cmd->base.attr_mask & IB_QP_PORT) && + !rdma_is_port_valid(qp->device, cmd->base.port_num)) { + ret = -EINVAL; + goto release_qp; + } + + if ((cmd->base.attr_mask & IB_QP_AV)) { + if (!rdma_is_port_valid(qp->device, cmd->base.dest.port_num)) { + ret = -EINVAL; + goto release_qp; + } + + if (cmd->base.attr_mask & IB_QP_STATE && + cmd->base.qp_state == IB_QPS_RTR) { + /* We are in INIT->RTR TRANSITION (if we are not, 
+ * this transition will be rejected in subsequent checks). + * In the INIT->RTR transition, we cannot have IB_QP_PORT set, + * but the IB_QP_STATE flag is required. + * + * Since kernel 3.14 (commit dbf727de7440), the uverbs driver, + * when IB_QP_AV is set, has required inclusion of a valid + * port number in the primary AV. (AVs are created and handled + * differently for infiniband and ethernet (RoCE) ports). + * + * Check the port number included in the primary AV against + * the port number in the qp struct, which was set (and saved) + * in the RST->INIT transition. + */ + if (cmd->base.dest.port_num != qp->real_qp->port) { + ret = -EINVAL; + goto release_qp; + } + } else { + /* We are in SQD->SQD. (If we are not, this transition will + * be rejected later in the verbs layer checks). + * Check for both IB_QP_PORT and IB_QP_AV, these can be set + * together in the SQD->SQD transition. + * + * If only IP_QP_AV was set, add in IB_QP_PORT as well (the + * verbs layer driver does not track primary port changes + * resulting from path migration. Thus, in SQD, if the primary + * AV is modified, the primary port should also be modified). + * + * Note that in this transition, the IB_QP_STATE flag + * is not allowed. + */ + if (((cmd->base.attr_mask & (IB_QP_AV | IB_QP_PORT)) + == (IB_QP_AV | IB_QP_PORT)) && + cmd->base.port_num != cmd->base.dest.port_num) { + ret = -EINVAL; + goto release_qp; + } + if ((cmd->base.attr_mask & (IB_QP_AV | IB_QP_PORT)) + == IB_QP_AV) { + cmd->base.attr_mask |= IB_QP_PORT; + cmd->base.port_num = cmd->base.dest.port_num; + } + } + } + + if ((cmd->base.attr_mask & IB_QP_ALT_PATH) && + (!rdma_is_port_valid(qp->device, cmd->base.alt_port_num) || + !rdma_is_port_valid(qp->device, cmd->base.alt_dest.port_num) || + cmd->base.alt_port_num != cmd->base.alt_dest.port_num)) { + ret = -EINVAL; + goto release_qp; + } + + if ((cmd->base.attr_mask & IB_QP_CUR_STATE && + cmd->base.cur_qp_state > IB_QPS_ERR) || + (cmd->base.attr_mask & IB_QP_STATE && + cmd->base.qp_state > IB_QPS_ERR)) { + ret = -EINVAL; + goto release_qp; + } + + if (cmd->base.attr_mask & IB_QP_STATE) + attr->qp_state = cmd->base.qp_state; + if (cmd->base.attr_mask & IB_QP_CUR_STATE) + attr->cur_qp_state = cmd->base.cur_qp_state; + if (cmd->base.attr_mask & IB_QP_PATH_MTU) + attr->path_mtu = cmd->base.path_mtu; + if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE) + attr->path_mig_state = cmd->base.path_mig_state; + if (cmd->base.attr_mask & IB_QP_QKEY) + attr->qkey = cmd->base.qkey; + if (cmd->base.attr_mask & IB_QP_RQ_PSN) + attr->rq_psn = cmd->base.rq_psn; + if (cmd->base.attr_mask & IB_QP_SQ_PSN) + attr->sq_psn = cmd->base.sq_psn; + if (cmd->base.attr_mask & IB_QP_DEST_QPN) + attr->dest_qp_num = cmd->base.dest_qp_num; + if (cmd->base.attr_mask & IB_QP_ACCESS_FLAGS) + attr->qp_access_flags = cmd->base.qp_access_flags; + if (cmd->base.attr_mask & IB_QP_PKEY_INDEX) + attr->pkey_index = cmd->base.pkey_index; + if (cmd->base.attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify; + if (cmd->base.attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + attr->max_rd_atomic = cmd->base.max_rd_atomic; + if (cmd->base.attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + attr->max_dest_rd_atomic = cmd->base.max_dest_rd_atomic; + if (cmd->base.attr_mask & IB_QP_MIN_RNR_TIMER) + attr->min_rnr_timer = cmd->base.min_rnr_timer; + if (cmd->base.attr_mask & IB_QP_PORT) + attr->port_num = cmd->base.port_num; + if (cmd->base.attr_mask & IB_QP_TIMEOUT) + attr->timeout = cmd->base.timeout; + if (cmd->base.attr_mask & 
IB_QP_RETRY_CNT) + attr->retry_cnt = cmd->base.retry_cnt; + if (cmd->base.attr_mask & IB_QP_RNR_RETRY) + attr->rnr_retry = cmd->base.rnr_retry; + if (cmd->base.attr_mask & IB_QP_ALT_PATH) { + attr->alt_port_num = cmd->base.alt_port_num; + attr->alt_timeout = cmd->base.alt_timeout; + attr->alt_pkey_index = cmd->base.alt_pkey_index; + } + if (cmd->base.attr_mask & IB_QP_RATE_LIMIT) + attr->rate_limit = cmd->rate_limit; + + if (cmd->base.attr_mask & IB_QP_AV) + copy_ah_attr_from_uverbs(qp, &attr->ah_attr, + &cmd->base.dest); + + if (cmd->base.attr_mask & IB_QP_ALT_PATH) + copy_ah_attr_from_uverbs(qp, &attr->alt_ah_attr, + &cmd->base.alt_dest); + + ret = ib_modify_qp_with_udata(qp, attr, + modify_qp_mask(qp->qp_type, + cmd->base.attr_mask), + &attrs->driver_udata); + +release_qp: + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); +out: + kfree(attr); + + return ret; +} + +static int ib_uverbs_modify_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_modify_qp cmd; + int ret; + + ret = uverbs_request(attrs, &cmd.base, sizeof(cmd.base)); + if (ret) + return ret; + + if (cmd.base.attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; + + return modify_qp(attrs, &cmd); +} + +static int ib_uverbs_ex_modify_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_modify_qp cmd; + struct ib_uverbs_ex_modify_qp_resp resp = { + .response_length = uverbs_response_length(attrs, sizeof(resp)) + }; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + /* + * Last bit is reserved for extending the attr_mask by + * using another field. + */ + if (cmd.base.attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT)) + return -EOPNOTSUPP; + + ret = modify_qp(attrs, &cmd); + if (ret) + return ret; + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int ib_uverbs_destroy_qp(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_qp cmd; + struct ib_uverbs_destroy_qp_resp resp; + struct ib_uobject *uobj; + struct ib_uqp_object *obj; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); + memset(&resp, 0, sizeof(resp)); + resp.events_reported = obj->uevent.events_reported; + + uobj_put_destroy(uobj); + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static void *alloc_wr(size_t wr_size, __u32 num_sge) +{ + if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof(struct ib_sge))) / + sizeof(struct ib_sge)) + return NULL; + + return kmalloc(ALIGN(wr_size, sizeof(struct ib_sge)) + + num_sge * sizeof(struct ib_sge), + GFP_KERNEL); +} + +static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_post_send cmd; + struct ib_uverbs_post_send_resp resp; + struct ib_uverbs_send_wr *user_wr; + struct ib_send_wr *wr = NULL, *last, *next; + const struct ib_send_wr *bad_wr; + struct ib_qp *qp; + int i, sg_ind; + int is_ud; + int ret, ret2; + size_t next_size; + const struct ib_sge __user *sgls; + const void __user *wqes; + struct uverbs_req_iter iter; + + ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); + if (ret) + return ret; + wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count); + if (IS_ERR(wqes)) + return PTR_ERR(wqes); + sgls = uverbs_request_next_ptr( + &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge)); + if (IS_ERR(sgls)) + return 
PTR_ERR(sgls); + ret = uverbs_request_finish(&iter); + if (ret) + return ret; + + user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); + if (!user_wr) + return -ENOMEM; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (!qp) { + ret = -EINVAL; + goto out; + } + + is_ud = qp->qp_type == IB_QPT_UD; + sg_ind = 0; + last = NULL; + for (i = 0; i < cmd.wr_count; ++i) { + if (copy_from_user(user_wr, wqes + i * cmd.wqe_size, + cmd.wqe_size)) { + ret = -EFAULT; + goto out_put; + } + + if (user_wr->num_sge + sg_ind > cmd.sge_count) { + ret = -EINVAL; + goto out_put; + } + + if (is_ud) { + struct ib_ud_wr *ud; + + if (user_wr->opcode != IB_WR_SEND && + user_wr->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + + next_size = sizeof(*ud); + ud = alloc_wr(next_size, user_wr->num_sge); + if (!ud) { + ret = -ENOMEM; + goto out_put; + } + + ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, + user_wr->wr.ud.ah, attrs); + if (!ud->ah) { + kfree(ud); + ret = -EINVAL; + goto out_put; + } + ud->remote_qpn = user_wr->wr.ud.remote_qpn; + ud->remote_qkey = user_wr->wr.ud.remote_qkey; + + next = &ud->wr; + } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE || + user_wr->opcode == IB_WR_RDMA_READ) { + struct ib_rdma_wr *rdma; + + next_size = sizeof(*rdma); + rdma = alloc_wr(next_size, user_wr->num_sge); + if (!rdma) { + ret = -ENOMEM; + goto out_put; + } + + rdma->remote_addr = user_wr->wr.rdma.remote_addr; + rdma->rkey = user_wr->wr.rdma.rkey; + + next = &rdma->wr; + } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + struct ib_atomic_wr *atomic; + + next_size = sizeof(*atomic); + atomic = alloc_wr(next_size, user_wr->num_sge); + if (!atomic) { + ret = -ENOMEM; + goto out_put; + } + + atomic->remote_addr = user_wr->wr.atomic.remote_addr; + atomic->compare_add = user_wr->wr.atomic.compare_add; + atomic->swap = user_wr->wr.atomic.swap; + atomic->rkey = user_wr->wr.atomic.rkey; + + next = &atomic->wr; + } else if (user_wr->opcode == IB_WR_SEND || + user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_SEND_WITH_INV) { + next_size = sizeof(*next); + next = alloc_wr(next_size, user_wr->num_sge); + if (!next) { + ret = -ENOMEM; + goto out_put; + } + } else { + ret = -EINVAL; + goto out_put; + } + + if (user_wr->opcode == IB_WR_SEND_WITH_IMM || + user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; + } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) { + next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey; + } + + if (!last) + wr = next; + else + last->next = next; + last = next; + + next->next = NULL; + next->wr_id = user_wr->wr_id; + next->num_sge = user_wr->num_sge; + next->opcode = user_wr->opcode; + next->send_flags = user_wr->send_flags; + + if (next->num_sge) { + next->sg_list = (void *) next + + ALIGN(next_size, sizeof(struct ib_sge)); + if (copy_from_user(next->sg_list, sgls + sg_ind, + next->num_sge * + sizeof(struct ib_sge))) { + ret = -EFAULT; + goto out_put; + } + sg_ind += next->num_sge; + } else + next->sg_list = NULL; + } + + resp.bad_wr = 0; + ret = qp->device->ops.post_send(qp->real_qp, wr, &bad_wr); + if (ret) + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + + ret2 = uverbs_response(attrs, &resp, sizeof(resp)); + if (ret2) + ret = ret2; + +out_put: + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + while 
(wr) { + if (is_ud && ud_wr(wr)->ah) + uobj_put_obj_read(ud_wr(wr)->ah); + next = wr->next; + kfree(wr); + wr = next; + } + +out: + kfree(user_wr); + + return ret; +} + +static struct ib_recv_wr * +ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count, + u32 wqe_size, u32 sge_count) +{ + struct ib_uverbs_recv_wr *user_wr; + struct ib_recv_wr *wr = NULL, *last, *next; + int sg_ind; + int i; + int ret; + const struct ib_sge __user *sgls; + const void __user *wqes; + + if (wqe_size < sizeof(struct ib_uverbs_recv_wr)) + return ERR_PTR(-EINVAL); + + wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count); + if (IS_ERR(wqes)) + return ERR_CAST(wqes); + sgls = uverbs_request_next_ptr( + iter, sge_count * sizeof(struct ib_uverbs_sge)); + if (IS_ERR(sgls)) + return ERR_CAST(sgls); + ret = uverbs_request_finish(iter); + if (ret) + return ERR_PTR(ret); + + user_wr = kmalloc(wqe_size, GFP_KERNEL); + if (!user_wr) + return ERR_PTR(-ENOMEM); + + sg_ind = 0; + last = NULL; + for (i = 0; i < wr_count; ++i) { + if (copy_from_user(user_wr, wqes + i * wqe_size, + wqe_size)) { + ret = -EFAULT; + goto err; + } + + if (user_wr->num_sge + sg_ind > sge_count) { + ret = -EINVAL; + goto err; + } + + if (user_wr->num_sge >= + (U32_MAX - ALIGN(sizeof(*next), sizeof(struct ib_sge))) / + sizeof(struct ib_sge)) { + ret = -EINVAL; + goto err; + } + + next = kmalloc(ALIGN(sizeof(*next), sizeof(struct ib_sge)) + + user_wr->num_sge * sizeof(struct ib_sge), + GFP_KERNEL); + if (!next) { + ret = -ENOMEM; + goto err; + } + + if (!last) + wr = next; + else + last->next = next; + last = next; + + next->next = NULL; + next->wr_id = user_wr->wr_id; + next->num_sge = user_wr->num_sge; + + if (next->num_sge) { + next->sg_list = (void *)next + + ALIGN(sizeof(*next), sizeof(struct ib_sge)); + if (copy_from_user(next->sg_list, sgls + sg_ind, + next->num_sge * + sizeof(struct ib_sge))) { + ret = -EFAULT; + goto err; + } + sg_ind += next->num_sge; + } else + next->sg_list = NULL; + } + + kfree(user_wr); + return wr; + +err: + kfree(user_wr); + + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ERR_PTR(ret); +} + +static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_post_recv cmd; + struct ib_uverbs_post_recv_resp resp; + struct ib_recv_wr *wr, *next; + const struct ib_recv_wr *bad_wr; + struct ib_qp *qp; + int ret, ret2; + struct uverbs_req_iter iter; + + ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); + if (ret) + return ret; + + wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size, + cmd.sge_count); + if (IS_ERR(wr)) + return PTR_ERR(wr); + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (!qp) { + ret = -EINVAL; + goto out; + } + + resp.bad_wr = 0; + ret = qp->device->ops.post_recv(qp->real_qp, wr, &bad_wr); + + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + if (ret) { + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + } + + ret2 = uverbs_response(attrs, &resp, sizeof(resp)); + if (ret2) + ret = ret2; +out: + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ret; +} + +static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_post_srq_recv cmd; + struct ib_uverbs_post_srq_recv_resp resp; + struct ib_recv_wr *wr, *next; + const struct ib_recv_wr *bad_wr; + struct ib_srq *srq; + int ret, ret2; + struct uverbs_req_iter iter; + + ret = uverbs_request_start(attrs, &iter, &cmd, 
sizeof(cmd)); + if (ret) + return ret; + + wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size, + cmd.sge_count); + if (IS_ERR(wr)) + return PTR_ERR(wr); + + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); + if (!srq) { + ret = -EINVAL; + goto out; + } + + resp.bad_wr = 0; + ret = srq->device->ops.post_srq_recv(srq, wr, &bad_wr); + + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + if (ret) + for (next = wr; next; next = next->next) { + ++resp.bad_wr; + if (next == bad_wr) + break; + } + + ret2 = uverbs_response(attrs, &resp, sizeof(resp)); + if (ret2) + ret = ret2; + +out: + while (wr) { + next = wr->next; + kfree(wr); + wr = next; + } + + return ret; +} + +static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_ah cmd; + struct ib_uverbs_create_ah_resp resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_ah *ah; + struct rdma_ah_attr attr = {}; + int ret; + struct ib_device *ib_dev; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + uobj = uobj_alloc(UVERBS_OBJECT_AH, attrs, &ib_dev); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) { + ret = -EINVAL; + goto err; + } + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); + if (!pd) { + ret = -EINVAL; + goto err; + } + + attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_make_grd(&attr, false); + rdma_ah_set_dlid(&attr, cmd.attr.dlid); + rdma_ah_set_sl(&attr, cmd.attr.sl); + rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); + rdma_ah_set_static_rate(&attr, cmd.attr.static_rate); + rdma_ah_set_port_num(&attr, cmd.attr.port_num); + + if (cmd.attr.is_global) { + rdma_ah_set_grh(&attr, NULL, cmd.attr.grh.flow_label, + cmd.attr.grh.sgid_index, + cmd.attr.grh.hop_limit, + cmd.attr.grh.traffic_class); + rdma_ah_set_dgid_raw(&attr, cmd.attr.grh.dgid); + } else { + rdma_ah_set_ah_flags(&attr, 0); + } + + ah = rdma_create_user_ah(pd, &attr, &attrs->driver_udata); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + goto err_put; + } + + ah->uobject = uobj; + uobj->user_handle = cmd.user_handle; + uobj->object = ah; + uobj_put_obj_read(pd); + uobj_finalize_uobj_create(uobj, attrs); + + resp.ah_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_put: + uobj_put_obj_read(pd); +err: + uobj_alloc_abort(uobj, attrs); + return ret; +} + +static int ib_uverbs_destroy_ah(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_ah cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, attrs); +} + +static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_attach_mcast cmd; + struct ib_qp *qp; + struct ib_uqp_object *obj; + struct ib_uverbs_mcast_entry *mcast; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (!qp) + return -EINVAL; + + obj = qp->uobject; + + mutex_lock(&obj->mcast_lock); + list_for_each_entry(mcast, &obj->mcast_list, list) + if (cmd.mlid == mcast->lid && + !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { + ret = 0; + goto out_put; + } + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) { + ret = -ENOMEM; + goto out_put; + } + + mcast->lid = cmd.mlid; + memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw); + + 
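Illustrative aside, not taken from this patch: the multicast attach/detach handlers here back ibv_attach_mcast() and ibv_detach_mcast(). In a real application the MGID and MLID normally come from a subnet-administrator join (for example via librdmacm); this sketch simply takes them as arguments:

#include <infiniband/verbs.h>

/* Join a UD QP to a multicast group, then leave it again. */
static int join_and_leave_mcast(struct ibv_qp *qp, const union ibv_gid *mgid,
                                uint16_t mlid)
{
        int ret;

        ret = ibv_attach_mcast(qp, mgid, mlid);  /* ib_uverbs_attach_mcast() */
        if (ret)
                return ret;
        /* ... post receives and consume multicast traffic on the QP ... */
        return ibv_detach_mcast(qp, mgid, mlid); /* ib_uverbs_detach_mcast() */
}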
ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid); + if (!ret) + list_add_tail(&mcast->list, &obj->mcast_list); + else + kfree(mcast); + +out_put: + mutex_unlock(&obj->mcast_lock); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + return ret; +} + +static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_detach_mcast cmd; + struct ib_uqp_object *obj; + struct ib_qp *qp; + struct ib_uverbs_mcast_entry *mcast; + int ret; + bool found = false; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (!qp) + return -EINVAL; + + obj = qp->uobject; + mutex_lock(&obj->mcast_lock); + + list_for_each_entry(mcast, &obj->mcast_list, list) + if (cmd.mlid == mcast->lid && + !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { + list_del(&mcast->list); + kfree(mcast); + found = true; + break; + } + + if (!found) { + ret = -EINVAL; + goto out_put; + } + + ret = ib_detach_mcast(qp, (union ib_gid *)cmd.gid, cmd.mlid); + +out_put: + mutex_unlock(&obj->mcast_lock); + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + return ret; +} + +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +{ + struct ib_uflow_resources *resources; + + resources = kzalloc(sizeof(*resources), GFP_KERNEL); + + if (!resources) + return NULL; + + if (!num_specs) + goto out; + + resources->counters = + kcalloc(num_specs, sizeof(*resources->counters), GFP_KERNEL); + resources->collection = + kcalloc(num_specs, sizeof(*resources->collection), GFP_KERNEL); + + if (!resources->counters || !resources->collection) + goto err; + +out: + resources->max = num_specs; + return resources; + +err: + kfree(resources->counters); + kfree(resources); + + return NULL; +} +EXPORT_SYMBOL(flow_resources_alloc); + +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) +{ + unsigned int i; + + if (!uflow_res) + return; + + for (i = 0; i < uflow_res->collection_num; i++) + atomic_dec(&uflow_res->collection[i]->usecnt); + + for (i = 0; i < uflow_res->counters_num; i++) + atomic_dec(&uflow_res->counters[i]->usecnt); + + kfree(uflow_res->collection); + kfree(uflow_res->counters); + kfree(uflow_res); +} +EXPORT_SYMBOL(ib_uverbs_flow_resources_free); + +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj) +{ + WARN_ON(uflow_res->num >= uflow_res->max); + + switch (type) { + case IB_FLOW_SPEC_ACTION_HANDLE: + atomic_inc(&((struct ib_flow_action *)ibobj)->usecnt); + uflow_res->collection[uflow_res->collection_num++] = + (struct ib_flow_action *)ibobj; + break; + case IB_FLOW_SPEC_ACTION_COUNT: + atomic_inc(&((struct ib_counters *)ibobj)->usecnt); + uflow_res->counters[uflow_res->counters_num++] = + (struct ib_counters *)ibobj; + break; + default: + WARN_ON(1); + } + + uflow_res->num++; +} +EXPORT_SYMBOL(flow_resources_add); + +static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec, + struct ib_uflow_resources *uflow_res) +{ + ib_spec->type = kern_spec->type; + switch (ib_spec->type) { + case IB_FLOW_SPEC_ACTION_TAG: + if (kern_spec->flow_tag.size != + sizeof(struct ib_uverbs_flow_spec_action_tag)) + return -EINVAL; + + ib_spec->flow_tag.size = sizeof(struct ib_flow_spec_action_tag); + ib_spec->flow_tag.tag_id = kern_spec->flow_tag.tag_id; + break; + case IB_FLOW_SPEC_ACTION_DROP: + if 
(kern_spec->drop.size != + sizeof(struct ib_uverbs_flow_spec_action_drop)) + return -EINVAL; + + ib_spec->drop.size = sizeof(struct ib_flow_spec_action_drop); + break; + case IB_FLOW_SPEC_ACTION_HANDLE: + if (kern_spec->action.size != + sizeof(struct ib_uverbs_flow_spec_action_handle)) + return -EOPNOTSUPP; + ib_spec->action.act = uobj_get_obj_read(flow_action, + UVERBS_OBJECT_FLOW_ACTION, + kern_spec->action.handle, + attrs); + if (!ib_spec->action.act) + return -EINVAL; + ib_spec->action.size = + sizeof(struct ib_flow_spec_action_handle); + flow_resources_add(uflow_res, + IB_FLOW_SPEC_ACTION_HANDLE, + ib_spec->action.act); + uobj_put_obj_read(ib_spec->action.act); + break; + case IB_FLOW_SPEC_ACTION_COUNT: + if (kern_spec->flow_count.size != + sizeof(struct ib_uverbs_flow_spec_action_count)) + return -EINVAL; + ib_spec->flow_count.counters = + uobj_get_obj_read(counters, + UVERBS_OBJECT_COUNTERS, + kern_spec->flow_count.handle, + attrs); + if (!ib_spec->flow_count.counters) + return -EINVAL; + ib_spec->flow_count.size = + sizeof(struct ib_flow_spec_action_count); + flow_resources_add(uflow_res, + IB_FLOW_SPEC_ACTION_COUNT, + ib_spec->flow_count.counters); + uobj_put_obj_read(ib_spec->flow_count.counters); + break; + default: + return -EINVAL; + } + return 0; +} + +static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size, + u16 ib_real_filter_sz) +{ + /* + * User space filter structures must be 64 bit aligned, otherwise this + * may pass, but we won't handle additional new attributes. + */ + + if (kern_filter_size > ib_real_filter_sz) { + if (memchr_inv(kern_spec_filter + + ib_real_filter_sz, 0, + kern_filter_size - ib_real_filter_sz)) + return -EINVAL; + return ib_real_filter_sz; + } + return kern_filter_size; +} + +int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, + const void *kern_spec_mask, + const void *kern_spec_val, + size_t kern_filter_sz, + union ib_flow_spec *ib_spec) +{ + ssize_t actual_filter_sz; + ssize_t ib_filter_sz; + + /* User flow spec size must be aligned to 4 bytes */ + if (kern_filter_sz != ALIGN(kern_filter_sz, 4)) + return -EINVAL; + + ib_spec->type = type; + + if (ib_spec->type == (IB_FLOW_SPEC_INNER | IB_FLOW_SPEC_VXLAN_TUNNEL)) + return -EINVAL; + + switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { + case IB_FLOW_SPEC_ETH: + ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_eth); + memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV4: + ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv4); + memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_IPV6: + ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_ipv6); + memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz); + + if 
((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) || + (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20)) + return -EINVAL; + break; + case IB_FLOW_SPEC_TCP: + case IB_FLOW_SPEC_UDP: + ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp); + memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_VXLAN_TUNNEL: + ib_filter_sz = offsetof(struct ib_flow_tunnel_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->tunnel.size = sizeof(struct ib_flow_spec_tunnel); + memcpy(&ib_spec->tunnel.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->tunnel.mask, kern_spec_mask, actual_filter_sz); + + if ((ntohl(ib_spec->tunnel.mask.tunnel_id)) >= BIT(24) || + (ntohl(ib_spec->tunnel.val.tunnel_id)) >= BIT(24)) + return -EINVAL; + break; + case IB_FLOW_SPEC_ESP: + ib_filter_sz = offsetof(struct ib_flow_esp_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->esp.size = sizeof(struct ib_flow_spec_esp); + memcpy(&ib_spec->esp.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->esp.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_GRE: + ib_filter_sz = offsetof(struct ib_flow_gre_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->gre.size = sizeof(struct ib_flow_spec_gre); + memcpy(&ib_spec->gre.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->gre.mask, kern_spec_mask, actual_filter_sz); + break; + case IB_FLOW_SPEC_MPLS: + ib_filter_sz = offsetof(struct ib_flow_mpls_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->mpls.size = sizeof(struct ib_flow_spec_mpls); + memcpy(&ib_spec->mpls.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->mpls.mask, kern_spec_mask, actual_filter_sz); + break; + default: + return -EINVAL; + } + return 0; +} + +static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + size_t kern_filter_sz; + void *kern_spec_mask; + void *kern_spec_val; + + if (check_sub_overflow((size_t)kern_spec->hdr.size, + sizeof(struct ib_uverbs_flow_spec_hdr), + &kern_filter_sz)) + return -EINVAL; + + kern_filter_sz /= 2; + + kern_spec_val = (void *)kern_spec + + sizeof(struct ib_uverbs_flow_spec_hdr); + kern_spec_mask = kern_spec_val + kern_filter_sz; + + return ib_uverbs_kern_spec_to_ib_spec_filter(kern_spec->type, + kern_spec_mask, + kern_spec_val, + kern_filter_sz, ib_spec); +} + +static int kern_spec_to_ib_spec(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec, + struct ib_uflow_resources *uflow_res) +{ + if (kern_spec->reserved) + return -EINVAL; + + if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG) + return kern_spec_to_ib_spec_action(attrs, kern_spec, ib_spec, + uflow_res); + else + return kern_spec_to_ib_spec_filter(kern_spec, ib_spec); +} + +static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle 
*attrs) +{ + struct ib_uverbs_ex_create_wq cmd; + struct ib_uverbs_ex_create_wq_resp resp = {}; + struct ib_uwq_object *obj; + int err = 0; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_wq *wq; + struct ib_wq_init_attr wq_init_attr = {}; + struct ib_device *ib_dev; + + err = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, attrs, + &ib_dev); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); + if (!pd) { + err = -EINVAL; + goto err_uobj; + } + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); + if (!cq) { + err = -EINVAL; + goto err_put_pd; + } + + wq_init_attr.cq = cq; + wq_init_attr.max_sge = cmd.max_sge; + wq_init_attr.max_wr = cmd.max_wr; + wq_init_attr.wq_type = cmd.wq_type; + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + wq_init_attr.create_flags = cmd.create_flags; + INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd.user_handle; + + wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); + if (IS_ERR(wq)) { + err = PTR_ERR(wq); + goto err_put_cq; + } + + wq->uobject = obj; + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + + uobj_put_obj_read(pd); + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); + + resp.wq_handle = obj->uevent.uobject.id; + resp.max_sge = wq_init_attr.max_sge; + resp.max_wr = wq_init_attr.max_wr; + resp.wqn = wq->wq_num; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_put_cq: + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); +err_put_pd: + uobj_put_obj_read(pd); +err_uobj: + uobj_alloc_abort(&obj->uevent.uobject, attrs); + + return err; +} + +static int ib_uverbs_ex_destroy_wq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_destroy_wq cmd; + struct ib_uverbs_ex_destroy_wq_resp resp = {}; + struct ib_uobject *uobj; + struct ib_uwq_object *obj; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, attrs); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); + resp.events_reported = obj->uevent.events_reported; + + uobj_put_destroy(uobj); + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_modify_wq cmd; + struct ib_wq *wq; + struct ib_wq_attr wq_attr = {}; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (!cmd.attr_mask) + return -EINVAL; + + if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS)) + return -EINVAL; + + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs); + if (!wq) + return -EINVAL; + + if (cmd.attr_mask & IB_WQ_FLAGS) { + 
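Illustrative aside, not taken from this patch: the WQ commands in this area are reached through ibv_create_wq() and ibv_modify_wq(). A sketch, assuming the provider supports receive work queues, that creates an RQ-type WQ and moves it to the ready state, error handling trimmed:

#include <infiniband/verbs.h>

/* Create a receive work queue on an existing PD/CQ and transition it to RDY. */
static struct ibv_wq *make_ready_rq_wq(struct ibv_context *ctx,
                                       struct ibv_pd *pd, struct ibv_cq *cq)
{
        struct ibv_wq_init_attr init = {
                .wq_type = IBV_WQT_RQ,
                .max_wr  = 128,
                .max_sge = 1,
                .pd      = pd,
                .cq      = cq,
        };
        struct ibv_wq_attr attr = {
                .attr_mask = IBV_WQ_ATTR_STATE,
                .wq_state  = IBV_WQS_RDY,
        };
        struct ibv_wq *wq;

        wq = ibv_create_wq(ctx, &init);  /* ib_uverbs_ex_create_wq() */
        if (!wq)
                return NULL;
        if (ibv_modify_wq(wq, &attr)) {  /* ib_uverbs_ex_modify_wq() */
                ibv_destroy_wq(wq);
                return NULL;
        }
        return wq;
}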
wq_attr.flags = cmd.flags; + wq_attr.flags_mask = cmd.flags_mask; + } + + if (cmd.attr_mask & IB_WQ_CUR_STATE) { + if (cmd.curr_wq_state > IB_WQS_ERR) + return -EINVAL; + + wq_attr.curr_wq_state = cmd.curr_wq_state; + } else { + wq_attr.curr_wq_state = wq->state; + } + + if (cmd.attr_mask & IB_WQ_STATE) { + if (cmd.wq_state > IB_WQS_ERR) + return -EINVAL; + + wq_attr.wq_state = cmd.wq_state; + } else { + wq_attr.wq_state = wq_attr.curr_wq_state; + } + + ret = wq->device->ops.modify_wq(wq, &wq_attr, cmd.attr_mask, + &attrs->driver_udata); + rdma_lookup_put_uobject(&wq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + return ret; +} + +static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_create_rwq_ind_table cmd; + struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {}; + struct ib_uobject *uobj; + int err; + struct ib_rwq_ind_table_init_attr init_attr = {}; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_wq **wqs = NULL; + u32 *wqs_handles = NULL; + struct ib_wq *wq = NULL; + int i, num_read_wqs; + u32 num_wq_handles; + struct uverbs_req_iter iter; + struct ib_device *ib_dev; + + err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE) + return -EINVAL; + + num_wq_handles = 1 << cmd.log_ind_tbl_size; + wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), + GFP_KERNEL); + if (!wqs_handles) + return -ENOMEM; + + err = uverbs_request_next(&iter, wqs_handles, + num_wq_handles * sizeof(__u32)); + if (err) + goto err_free; + + err = uverbs_request_finish(&iter); + if (err) + goto err_free; + + wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); + if (!wqs) { + err = -ENOMEM; + goto err_free; + } + + for (num_read_wqs = 0; num_read_wqs < num_wq_handles; + num_read_wqs++) { + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, + wqs_handles[num_read_wqs], attrs); + if (!wq) { + err = -EINVAL; + goto put_wqs; + } + + wqs[num_read_wqs] = wq; + atomic_inc(&wqs[num_read_wqs]->usecnt); + } + + uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, attrs, &ib_dev); + if (IS_ERR(uobj)) { + err = PTR_ERR(uobj); + goto put_wqs; + } + + rwq_ind_tbl = rdma_zalloc_drv_obj(ib_dev, ib_rwq_ind_table); + if (!rwq_ind_tbl) { + err = -ENOMEM; + goto err_uobj; + } + + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + + rwq_ind_tbl->ind_tbl = wqs; + rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; + rwq_ind_tbl->uobject = uobj; + uobj->object = rwq_ind_tbl; + rwq_ind_tbl->device = ib_dev; + atomic_set(&rwq_ind_tbl->usecnt, 0); + + err = ib_dev->ops.create_rwq_ind_table(rwq_ind_tbl, &init_attr, + &attrs->driver_udata); + if (err) + goto err_create; + + for (i = 0; i < num_wq_handles; i++) + rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + kfree(wqs_handles); + uobj_finalize_uobj_create(uobj, attrs); + + resp.ind_tbl_handle = uobj->id; + resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_create: + kfree(rwq_ind_tbl); +err_uobj: + uobj_alloc_abort(uobj, attrs); +put_wqs: + for (i = 0; i < num_read_wqs; i++) { + rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + atomic_dec(&wqs[i]->usecnt); + } +err_free: + kfree(wqs_handles); + kfree(wqs); + return err; +} + +static int 
ib_uverbs_ex_destroy_rwq_ind_table(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_destroy_rwq_ind_table cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EOPNOTSUPP; + + return uobj_perform_destroy(UVERBS_OBJECT_RWQ_IND_TBL, + cmd.ind_tbl_handle, attrs); +} + +static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_flow cmd; + struct ib_uverbs_create_flow_resp resp = {}; + struct ib_uobject *uobj; + struct ib_flow *flow_id; + struct ib_uverbs_flow_attr *kern_flow_attr; + struct ib_flow_attr *flow_attr; + struct ib_qp *qp; + struct ib_uflow_resources *uflow_res; + struct ib_uverbs_flow_spec_hdr *kern_spec; + struct uverbs_req_iter iter; + int err; + void *ib_spec; + int i; + struct ib_device *ib_dev; + + err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) + return -EINVAL; + + if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && + ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) || + (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT))) + return -EINVAL; + + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + return -EINVAL; + + if (cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + return -EINVAL; + + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) + return -EINVAL; + + if (cmd.flow_attr.num_of_specs) { + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); + if (!kern_flow_attr) + return -ENOMEM; + + *kern_flow_attr = cmd.flow_attr; + err = uverbs_request_next(&iter, &kern_flow_attr->flow_specs, + cmd.flow_attr.size); + if (err) + goto err_free_attr; + } else { + kern_flow_attr = &cmd.flow_attr; + } + + err = uverbs_request_finish(&iter); + if (err) + goto err_free_attr; + + uobj = uobj_alloc(UVERBS_OBJECT_FLOW, attrs, &ib_dev); + if (IS_ERR(uobj)) { + err = PTR_ERR(uobj); + goto err_free_attr; + } + + if (!rdma_is_port_valid(uobj->context->device, cmd.flow_attr.port)) { + err = -EINVAL; + goto err_uobj; + } + + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); + if (!qp) { + err = -EINVAL; + goto err_uobj; + } + + if (qp->qp_type != IB_QPT_UD && qp->qp_type != IB_QPT_RAW_PACKET) { + err = -EINVAL; + goto err_put; + } + + flow_attr = kzalloc(struct_size(flow_attr, flows, + cmd.flow_attr.num_of_specs), GFP_KERNEL); + if (!flow_attr) { + err = -ENOMEM; + goto err_put; + } + uflow_res = flow_resources_alloc(cmd.flow_attr.num_of_specs); + if (!uflow_res) { + err = -ENOMEM; + goto err_free_flow_attr; + } + + flow_attr->type = kern_flow_attr->type; + flow_attr->priority = kern_flow_attr->priority; + flow_attr->num_of_specs = kern_flow_attr->num_of_specs; + flow_attr->port = kern_flow_attr->port; + flow_attr->flags = kern_flow_attr->flags; + flow_attr->size = sizeof(*flow_attr); + + kern_spec = kern_flow_attr->flow_specs; + ib_spec = flow_attr + 1; + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size >= sizeof(*kern_spec) && + cmd.flow_attr.size >= kern_spec->size; + i++) { + err = kern_spec_to_ib_spec( + attrs, (struct ib_uverbs_flow_spec *)kern_spec, + ib_spec, uflow_res); + if (err) + goto err_free; + + flow_attr->size += + ((union ib_flow_spec *) ib_spec)->size; + cmd.flow_attr.size -= kern_spec->size; + kern_spec = ((void *)kern_spec) + 
kern_spec->size; + ib_spec += ((union ib_flow_spec *) ib_spec)->size; + } + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %u bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + err = -EINVAL; + goto err_free; + } + + flow_id = qp->device->ops.create_flow(qp, flow_attr, + &attrs->driver_udata); + + if (IS_ERR(flow_id)) { + err = PTR_ERR(flow_id); + goto err_free; + } + + ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); + + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + kfree(flow_attr); + + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + uobj_finalize_uobj_create(uobj, attrs); + + resp.flow_handle = uobj->id; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_free: + ib_uverbs_flow_resources_free(uflow_res); +err_free_flow_attr: + kfree(flow_attr); +err_put: + rdma_lookup_put_uobject(&qp->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); +err_uobj: + uobj_alloc_abort(uobj, attrs); +err_free_attr: + if (cmd.flow_attr.num_of_specs) + kfree(kern_flow_attr); + return err; +} + +static int ib_uverbs_ex_destroy_flow(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_flow cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, attrs); +} + +static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_create_xsrq *cmd, + struct ib_udata *udata) +{ + struct ib_uverbs_create_srq_resp resp = {}; + struct ib_usrq_object *obj; + struct ib_pd *pd; + struct ib_srq *srq; + struct ib_srq_init_attr attr; + int ret; + struct ib_uobject *xrcd_uobj; + struct ib_device *ib_dev; + + obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs, + &ib_dev); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + if (cmd->srq_type == IB_SRQT_TM) + attr.ext.tag_matching.max_num_tags = cmd->max_num_tags; + + if (cmd->srq_type == IB_SRQT_XRC) { + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle, + attrs); + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err; + } + + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err_put_xrcd; + } + + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + atomic_inc(&obj->uxrcd->refcnt); + } + + if (ib_srq_has_cq(cmd->srq_type)) { + attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, + cmd->cq_handle, attrs); + if (!attr.ext.cq) { + ret = -EINVAL; + goto err_put_xrcd; + } + } + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); + if (!pd) { + ret = -EINVAL; + goto err_put_cq; + } + + attr.event_handler = ib_uverbs_srq_event_handler; + attr.srq_type = cmd->srq_type; + attr.attr.max_wr = cmd->max_wr; + attr.attr.max_sge = cmd->max_sge; + attr.attr.srq_limit = cmd->srq_limit; + + INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd->user_handle; + + srq = ib_create_srq_user(pd, &attr, obj, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put_pd; + } + + obj->uevent.uobject.object = srq; + obj->uevent.uobject.user_handle = cmd->user_handle; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + + if (cmd->srq_type == IB_SRQT_XRC) + resp.srqn = srq->ext.xrc.srq_num; + + if (cmd->srq_type == IB_SRQT_XRC) + uobj_put_read(xrcd_uobj); + + if 
(ib_srq_has_cq(cmd->srq_type)) + rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + uobj_put_obj_read(pd); + uobj_finalize_uobj_create(&obj->uevent.uobject, attrs); + + resp.srq_handle = obj->uevent.uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + return uverbs_response(attrs, &resp, sizeof(resp)); + +err_put_pd: + uobj_put_obj_read(pd); +err_put_cq: + if (ib_srq_has_cq(cmd->srq_type)) + rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + +err_put_xrcd: + if (cmd->srq_type == IB_SRQT_XRC) { + atomic_dec(&obj->uxrcd->refcnt); + uobj_put_read(xrcd_uobj); + } + +err: + uobj_alloc_abort(&obj->uevent.uobject, attrs); + return ret; +} + +static int ib_uverbs_create_srq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_srq cmd; + struct ib_uverbs_create_xsrq xcmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + memset(&xcmd, 0, sizeof(xcmd)); + xcmd.response = cmd.response; + xcmd.user_handle = cmd.user_handle; + xcmd.srq_type = IB_SRQT_BASIC; + xcmd.pd_handle = cmd.pd_handle; + xcmd.max_wr = cmd.max_wr; + xcmd.max_sge = cmd.max_sge; + xcmd.srq_limit = cmd.srq_limit; + + return __uverbs_create_xsrq(attrs, &xcmd, &attrs->driver_udata); +} + +static int ib_uverbs_create_xsrq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_create_xsrq cmd; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + return __uverbs_create_xsrq(attrs, &cmd, &attrs->driver_udata); +} + +static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_modify_srq cmd; + struct ib_srq *srq; + struct ib_srq_attr attr; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); + if (!srq) + return -EINVAL; + + attr.max_wr = cmd.max_wr; + attr.srq_limit = cmd.srq_limit; + + ret = srq->device->ops.modify_srq(srq, &attr, cmd.attr_mask, + &attrs->driver_udata); + + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + return ret; +} + +static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_query_srq cmd; + struct ib_uverbs_query_srq_resp resp; + struct ib_srq_attr attr; + struct ib_srq *srq; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); + if (!srq) + return -EINVAL; + + ret = ib_query_srq(srq, &attr); + + rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + + if (ret) + return ret; + + memset(&resp, 0, sizeof resp); + + resp.max_wr = attr.max_wr; + resp.max_sge = attr.max_sge; + resp.srq_limit = attr.srq_limit; + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int ib_uverbs_destroy_srq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_srq cmd; + struct ib_uverbs_destroy_srq_resp resp; + struct ib_uobject *uobj; + struct ib_uevent_object *obj; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_uevent_object, uobject); + memset(&resp, 0, sizeof(resp)); + resp.events_reported = obj->events_reported; + + uobj_put_destroy(uobj); + + return uverbs_response(attrs, &resp, sizeof(resp)); 
+} + +static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_query_device_resp resp = {}; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr = {0}; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + int err; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + err = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + err = ib_dev->ops.query_device(ib_dev, &attr, &attrs->driver_udata); + if (err) + return err; + + copy_query_dev_fields(ucontext, &resp.base, &attr); + + resp.odp_caps.general_caps = attr.odp_caps.general_caps; + resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; + resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps; + + resp.timestamp_mask = attr.timestamp_mask; + resp.hca_core_clock = attr.hca_core_clock; + resp.device_cap_flags_ex = attr.device_cap_flags; + resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts; + resp.rss_caps.max_rwq_indirection_tables = + attr.rss_caps.max_rwq_indirection_tables; + resp.rss_caps.max_rwq_indirection_table_size = + attr.rss_caps.max_rwq_indirection_table_size; + resp.max_wq_type_rq = attr.max_wq_type_rq; + resp.raw_packet_caps = attr.raw_packet_caps; + resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size; + resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags; + resp.tm_caps.max_ops = attr.tm_caps.max_ops; + resp.tm_caps.max_sge = attr.tm_caps.max_sge; + resp.tm_caps.flags = attr.tm_caps.flags; + resp.cq_moderation_caps.max_cq_moderation_count = + attr.cq_caps.max_cq_moderation_count; + resp.cq_moderation_caps.max_cq_moderation_period = + attr.cq_caps.max_cq_moderation_period; + resp.max_dm_size = attr.max_dm_size; + resp.response_length = uverbs_response_length(attrs, sizeof(resp)); + + return uverbs_response(attrs, &resp, sizeof(resp)); +} + +static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_ex_modify_cq cmd; + struct ib_cq *cq; + int ret; + + ret = uverbs_request(attrs, &cmd, sizeof(cmd)); + if (ret) + return ret; + + if (!cmd.attr_mask || cmd.reserved) + return -EINVAL; + + if (cmd.attr_mask > IB_CQ_MODERATE) + return -EOPNOTSUPP; + + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); + if (!cq) + return -EINVAL; + + ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period); + + rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, + UVERBS_LOOKUP_READ); + return ret; +} + +/* + * Describe the input structs for write(). Some write methods have an input + * only struct, most have an input and output. If the struct has an output then + * the 'response' u64 must be the first field in the request structure. + * + * If udata is present then both the request and response structs have a + * trailing driver_data flex array. In this case the size of the base struct + * cannot be changed. 
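+ *
+ * For illustration only - a hypothetical request layout (not part of the
+ * real uAPI) that satisfies the build-time checks in
+ * UAPI_DEF_WRITE_UDATA_IO() below:
+ *
+ *	struct example_req {
+ *		__u64 response;      - must be the first field and 8 bytes wide
+ *		__u32 some_handle;
+ *		__u32 reserved;
+ *		__u64 driver_data[]; - must start exactly at sizeof(struct example_req)
+ *	};
+ *
+ * The response struct obeys the same driver_data placement rule.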
+ */ +#define UAPI_DEF_WRITE_IO(req, resp) \ + .write.has_resp = 1 + \ + BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \ + BUILD_BUG_ON_ZERO(sizeof_field(req, response) != \ + sizeof(u64)), \ + .write.req_size = sizeof(req), .write.resp_size = sizeof(resp) + +#define UAPI_DEF_WRITE_I(req) .write.req_size = sizeof(req) + +#define UAPI_DEF_WRITE_UDATA_IO(req, resp) \ + UAPI_DEF_WRITE_IO(req, resp), \ + .write.has_udata = \ + 1 + \ + BUILD_BUG_ON_ZERO(offsetof(req, driver_data) != \ + sizeof(req)) + \ + BUILD_BUG_ON_ZERO(offsetof(resp, driver_data) != \ + sizeof(resp)) + +#define UAPI_DEF_WRITE_UDATA_I(req) \ + UAPI_DEF_WRITE_I(req), \ + .write.has_udata = \ + 1 + BUILD_BUG_ON_ZERO(offsetof(req, driver_data) != \ + sizeof(req)) + +/* + * The _EX versions are for use with WRITE_EX and allow the last struct member + * to be specified. Buffers that do not include that member will be rejected. + */ +#define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member) \ + .write.has_resp = 1, \ + .write.req_size = offsetofend(req, req_last_member), \ + .write.resp_size = offsetofend(resp, resp_last_member) + +#define UAPI_DEF_WRITE_I_EX(req, req_last_member) \ + .write.req_size = offsetofend(req, req_last_member) + +const struct uapi_definition uverbs_def_write_intf[] = { + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_AH, + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_AH, + ib_uverbs_create_ah, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_create_ah, + struct ib_uverbs_create_ah_resp)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_DESTROY_AH, + ib_uverbs_destroy_ah, + UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah)), + UAPI_DEF_OBJ_NEEDS_FN(create_user_ah), + UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_COMP_CHANNEL, + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, + ib_uverbs_create_comp_channel, + UAPI_DEF_WRITE_IO( + struct ib_uverbs_create_comp_channel, + struct ib_uverbs_create_comp_channel_resp))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_CQ, + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_CQ, + ib_uverbs_create_cq, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_create_cq, + struct ib_uverbs_create_cq_resp), + UAPI_DEF_METHOD_NEEDS_FN(create_cq)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_DESTROY_CQ, + ib_uverbs_destroy_cq, + UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_cq, + struct ib_uverbs_destroy_cq_resp), + UAPI_DEF_METHOD_NEEDS_FN(destroy_cq)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_POLL_CQ, + ib_uverbs_poll_cq, + UAPI_DEF_WRITE_IO(struct ib_uverbs_poll_cq, + struct ib_uverbs_poll_cq_resp), + UAPI_DEF_METHOD_NEEDS_FN(poll_cq)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, + ib_uverbs_req_notify_cq, + UAPI_DEF_WRITE_I(struct ib_uverbs_req_notify_cq), + UAPI_DEF_METHOD_NEEDS_FN(req_notify_cq)), + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_RESIZE_CQ, + ib_uverbs_resize_cq, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_resize_cq, + struct ib_uverbs_resize_cq_resp), + UAPI_DEF_METHOD_NEEDS_FN(resize_cq)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_CREATE_CQ, + ib_uverbs_ex_create_cq, + UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_cq, + reserved, + struct ib_uverbs_ex_create_cq_resp, + response_length), + UAPI_DEF_METHOD_NEEDS_FN(create_cq)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_MODIFY_CQ, + ib_uverbs_ex_modify_cq, + UAPI_DEF_WRITE_I(struct ib_uverbs_ex_modify_cq), + UAPI_DEF_METHOD_NEEDS_FN(modify_cq))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_DEVICE, + 
DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_GET_CONTEXT, + ib_uverbs_get_context, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_get_context, + struct ib_uverbs_get_context_resp)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_QUERY_DEVICE, + ib_uverbs_query_device, + UAPI_DEF_WRITE_IO(struct ib_uverbs_query_device, + struct ib_uverbs_query_device_resp)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_QUERY_PORT, + ib_uverbs_query_port, + UAPI_DEF_WRITE_IO(struct ib_uverbs_query_port, + struct ib_uverbs_query_port_resp), + UAPI_DEF_METHOD_NEEDS_FN(query_port)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_QUERY_DEVICE, + ib_uverbs_ex_query_device, + UAPI_DEF_WRITE_IO_EX( + struct ib_uverbs_ex_query_device, + reserved, + struct ib_uverbs_ex_query_device_resp, + response_length), + UAPI_DEF_METHOD_NEEDS_FN(query_device)), + UAPI_DEF_OBJ_NEEDS_FN(alloc_ucontext), + UAPI_DEF_OBJ_NEEDS_FN(dealloc_ucontext)), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_FLOW, + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_CREATE_FLOW, + ib_uverbs_ex_create_flow, + UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_create_flow, + flow_attr, + struct ib_uverbs_create_flow_resp, + flow_handle), + UAPI_DEF_METHOD_NEEDS_FN(create_flow)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, + ib_uverbs_ex_destroy_flow, + UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_flow), + UAPI_DEF_METHOD_NEEDS_FN(destroy_flow))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_MR, + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_DEREG_MR, + ib_uverbs_dereg_mr, + UAPI_DEF_WRITE_I(struct ib_uverbs_dereg_mr), + UAPI_DEF_METHOD_NEEDS_FN(dereg_mr)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_REG_MR, + ib_uverbs_reg_mr, + UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_reg_mr, + struct ib_uverbs_reg_mr_resp), + UAPI_DEF_METHOD_NEEDS_FN(reg_user_mr)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_REREG_MR, + ib_uverbs_rereg_mr, + UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_rereg_mr, + struct ib_uverbs_rereg_mr_resp), + UAPI_DEF_METHOD_NEEDS_FN(rereg_user_mr))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_MW, + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_ALLOC_MW, + ib_uverbs_alloc_mw, + UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_mw, + struct ib_uverbs_alloc_mw_resp), + UAPI_DEF_METHOD_NEEDS_FN(alloc_mw)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_DEALLOC_MW, + ib_uverbs_dealloc_mw, + UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_mw), + UAPI_DEF_METHOD_NEEDS_FN(dealloc_mw))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_PD, + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_ALLOC_PD, + ib_uverbs_alloc_pd, + UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_pd, + struct ib_uverbs_alloc_pd_resp), + UAPI_DEF_METHOD_NEEDS_FN(alloc_pd)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_DEALLOC_PD, + ib_uverbs_dealloc_pd, + UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_pd), + UAPI_DEF_METHOD_NEEDS_FN(dealloc_pd))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_QP, + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_ATTACH_MCAST, + ib_uverbs_attach_mcast, + UAPI_DEF_WRITE_I(struct ib_uverbs_attach_mcast), + UAPI_DEF_METHOD_NEEDS_FN(attach_mcast), + UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)), + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_QP, + ib_uverbs_create_qp, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_create_qp, + struct ib_uverbs_create_qp_resp), + UAPI_DEF_METHOD_NEEDS_FN(create_qp)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_DESTROY_QP, + ib_uverbs_destroy_qp, + UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_qp, + struct ib_uverbs_destroy_qp_resp), + UAPI_DEF_METHOD_NEEDS_FN(destroy_qp)), + 
DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_DETACH_MCAST, + ib_uverbs_detach_mcast, + UAPI_DEF_WRITE_I(struct ib_uverbs_detach_mcast), + UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_MODIFY_QP, + ib_uverbs_modify_qp, + UAPI_DEF_WRITE_I(struct ib_uverbs_modify_qp), + UAPI_DEF_METHOD_NEEDS_FN(modify_qp)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_POST_RECV, + ib_uverbs_post_recv, + UAPI_DEF_WRITE_IO(struct ib_uverbs_post_recv, + struct ib_uverbs_post_recv_resp), + UAPI_DEF_METHOD_NEEDS_FN(post_recv)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_POST_SEND, + ib_uverbs_post_send, + UAPI_DEF_WRITE_IO(struct ib_uverbs_post_send, + struct ib_uverbs_post_send_resp), + UAPI_DEF_METHOD_NEEDS_FN(post_send)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_QUERY_QP, + ib_uverbs_query_qp, + UAPI_DEF_WRITE_IO(struct ib_uverbs_query_qp, + struct ib_uverbs_query_qp_resp), + UAPI_DEF_METHOD_NEEDS_FN(query_qp)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_CREATE_QP, + ib_uverbs_ex_create_qp, + UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_qp, + comp_mask, + struct ib_uverbs_ex_create_qp_resp, + response_length), + UAPI_DEF_METHOD_NEEDS_FN(create_qp)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_MODIFY_QP, + ib_uverbs_ex_modify_qp, + UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_modify_qp, + base, + struct ib_uverbs_ex_modify_qp_resp, + response_length), + UAPI_DEF_METHOD_NEEDS_FN(modify_qp))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_RWQ_IND_TBL, + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, + ib_uverbs_ex_create_rwq_ind_table, + UAPI_DEF_WRITE_IO_EX( + struct ib_uverbs_ex_create_rwq_ind_table, + log_ind_tbl_size, + struct ib_uverbs_ex_create_rwq_ind_table_resp, + ind_tbl_num), + UAPI_DEF_METHOD_NEEDS_FN(create_rwq_ind_table)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL, + ib_uverbs_ex_destroy_rwq_ind_table, + UAPI_DEF_WRITE_I( + struct ib_uverbs_ex_destroy_rwq_ind_table), + UAPI_DEF_METHOD_NEEDS_FN(destroy_rwq_ind_table))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_WQ, + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_CREATE_WQ, + ib_uverbs_ex_create_wq, + UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_wq, + max_sge, + struct ib_uverbs_ex_create_wq_resp, + wqn), + UAPI_DEF_METHOD_NEEDS_FN(create_wq)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_DESTROY_WQ, + ib_uverbs_ex_destroy_wq, + UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_destroy_wq, + wq_handle, + struct ib_uverbs_ex_destroy_wq_resp, + reserved), + UAPI_DEF_METHOD_NEEDS_FN(destroy_wq)), + DECLARE_UVERBS_WRITE_EX( + IB_USER_VERBS_EX_CMD_MODIFY_WQ, + ib_uverbs_ex_modify_wq, + UAPI_DEF_WRITE_I_EX(struct ib_uverbs_ex_modify_wq, + curr_wq_state), + UAPI_DEF_METHOD_NEEDS_FN(modify_wq))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_SRQ, + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_SRQ, + ib_uverbs_create_srq, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_create_srq, + struct ib_uverbs_create_srq_resp), + UAPI_DEF_METHOD_NEEDS_FN(create_srq)), + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_XSRQ, + ib_uverbs_create_xsrq, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_create_xsrq, + struct ib_uverbs_create_srq_resp), + UAPI_DEF_METHOD_NEEDS_FN(create_srq)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_DESTROY_SRQ, + ib_uverbs_destroy_srq, + UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_srq, + struct ib_uverbs_destroy_srq_resp), + UAPI_DEF_METHOD_NEEDS_FN(destroy_srq)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_MODIFY_SRQ, + ib_uverbs_modify_srq, + 
UAPI_DEF_WRITE_UDATA_I(struct ib_uverbs_modify_srq), + UAPI_DEF_METHOD_NEEDS_FN(modify_srq)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_POST_SRQ_RECV, + ib_uverbs_post_srq_recv, + UAPI_DEF_WRITE_IO(struct ib_uverbs_post_srq_recv, + struct ib_uverbs_post_srq_recv_resp), + UAPI_DEF_METHOD_NEEDS_FN(post_srq_recv)), + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_QUERY_SRQ, + ib_uverbs_query_srq, + UAPI_DEF_WRITE_IO(struct ib_uverbs_query_srq, + struct ib_uverbs_query_srq_resp), + UAPI_DEF_METHOD_NEEDS_FN(query_srq))), + + DECLARE_UVERBS_OBJECT( + UVERBS_OBJECT_XRCD, + DECLARE_UVERBS_WRITE( + IB_USER_VERBS_CMD_CLOSE_XRCD, + ib_uverbs_close_xrcd, + UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd)), + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_QP, + ib_uverbs_open_qp, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_open_qp, + struct ib_uverbs_create_qp_resp)), + DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_XRCD, + ib_uverbs_open_xrcd, + UAPI_DEF_WRITE_UDATA_IO( + struct ib_uverbs_open_xrcd, + struct ib_uverbs_open_xrcd_resp)), + UAPI_DEF_OBJ_NEEDS_FN(alloc_xrcd), + UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)), + + {}, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_ioctl.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_ioctl.c new file mode 100644 index 0000000..0cbbb66 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_ioctl.c @@ -0,0 +1,836 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +struct bundle_alloc_head { + struct bundle_alloc_head *next; + u8 data[]; +}; + +struct bundle_priv { + /* Must be first */ + struct bundle_alloc_head alloc_head; + struct bundle_alloc_head *allocated_mem; + size_t internal_avail; + size_t internal_used; + + struct radix_tree_root *radix; + const struct uverbs_api_ioctl_method *method_elm; + void __rcu **radix_slots; + unsigned long radix_slots_len; + u32 method_key; + + struct ib_uverbs_attr __user *user_attrs; + struct ib_uverbs_attr *uattrs; + + DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(uobj_hw_obj_valid, UVERBS_API_ATTR_BKEY_LEN); + + /* + * Must be last. bundle ends in a flex array which overlaps + * internal_buffer. + */ + struct uverbs_attr_bundle bundle; + u64 internal_buffer[32]; +}; + +/* + * Each method has an absolute minimum amount of memory it needs to allocate, + * precompute that amount and determine if the onstack memory can be used or + * if allocation is need. + */ +void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, + unsigned int num_attrs) +{ + struct bundle_priv *pbundle; + size_t bundle_size = + offsetof(struct bundle_priv, internal_buffer) + + sizeof(*pbundle->bundle.attrs) * method_elm->key_bitmap_len + + sizeof(*pbundle->uattrs) * num_attrs; + + method_elm->use_stack = bundle_size <= sizeof(*pbundle); + method_elm->bundle_size = + ALIGN(bundle_size + 256, sizeof(*pbundle->internal_buffer)); + + /* Do not want order-2 allocations for this. */ + WARN_ON_ONCE(method_elm->bundle_size > PAGE_SIZE); +} + +/** + * _uverbs_alloc() - Quickly allocate memory for use with a bundle + * @bundle: The bundle + * @size: Number of bytes to allocate + * @flags: Allocator flags + * + * The bundle allocator is intended for allocations that are connected with + * processing the system call related to the bundle. The allocated memory is + * always freed once the system call completes, and cannot be freed any other + * way. + * + * This tries to use a small pool of pre-allocated memory for performance. 
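+ *
+ * A minimal usage sketch (hypothetical handler code, not part of this
+ * file); the memory lives until the system call completes, so the caller
+ * never frees it explicitly:
+ *
+ *	u32 *vals = uverbs_alloc(bundle, array_size(n, sizeof(*vals)));
+ *	if (IS_ERR(vals))
+ *		return PTR_ERR(vals);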
+ */ +__malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, + gfp_t flags) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + size_t new_used; + void *res; + + if (check_add_overflow(size, pbundle->internal_used, &new_used)) + return ERR_PTR(-EOVERFLOW); + + if (new_used > pbundle->internal_avail) { + struct bundle_alloc_head *buf; + + buf = kvmalloc(struct_size(buf, data, size), flags); + if (!buf) + return ERR_PTR(-ENOMEM); + buf->next = pbundle->allocated_mem; + pbundle->allocated_mem = buf; + return buf->data; + } + + res = (void *)pbundle->internal_buffer + pbundle->internal_used; + pbundle->internal_used = + ALIGN(new_used, sizeof(*pbundle->internal_buffer)); + if (want_init_on_alloc(flags)) + memset(res, 0, size); + return res; +} +EXPORT_SYMBOL(_uverbs_alloc); + +static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, + u16 len) +{ + if (uattr->len > sizeof_field(struct ib_uverbs_attr, data)) + return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len, + uattr->len - len); + + return !memchr_inv((const void *)&uattr->data + len, + 0, uattr->len - len); +} + +static int uverbs_set_output(const struct uverbs_attr_bundle *bundle, + const struct uverbs_attr *attr) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + u16 flags; + + flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags | + UVERBS_ATTR_F_VALID_OUTPUT; + if (put_user(flags, + &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags)) + return -EFAULT; + return 0; +} + +static int uverbs_process_idrs_array(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + struct ib_uverbs_attr *uattr, + u32 attr_bkey) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + size_t array_len; + u32 *idr_vals; + int ret = 0; + size_t i; + + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len % sizeof(u32)) + return -EINVAL; + + array_len = uattr->len / sizeof(u32); + if (array_len < spec->u2.objs_arr.min_len || + array_len > spec->u2.objs_arr.max_len) + return -EINVAL; + + attr->uobjects = + uverbs_alloc(&pbundle->bundle, + array_size(array_len, sizeof(*attr->uobjects))); + if (IS_ERR(attr->uobjects)) + return PTR_ERR(attr->uobjects); + + /* + * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects + * to store idrs array and avoid additional memory allocation. The + * idrs array is offset to the end of the uobjects array so we will be + * able to read idr and replace with a pointer. 
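+	 *
+	 * Layout sketch, assuming 8-byte pointers and array_len == 4 (for
+	 * illustration only): the u32 ids occupy the upper half of the
+	 * pointer array, and every id is read before the pointer store that
+	 * would overwrite it:
+	 *
+	 *	uobjects: [ptr0  ][ptr1  ][ptr2  ][ptr3  ]
+	 *	idr_vals:                 [i0][i1][i2][i3]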
+ */ + idr_vals = (u32 *)(attr->uobjects + array_len) - array_len; + + if (uattr->len > sizeof(uattr->data)) { + ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data), + uattr->len); + if (ret) + return -EFAULT; + } else { + memcpy(idr_vals, &uattr->data, uattr->len); + } + + for (i = 0; i != array_len; i++) { + attr->uobjects[i] = uverbs_get_uobject_from_file( + spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access, + idr_vals[i], &pbundle->bundle); + if (IS_ERR(attr->uobjects[i])) { + ret = PTR_ERR(attr->uobjects[i]); + break; + } + } + + attr->len = i; + __set_bit(attr_bkey, pbundle->spec_finalize); + return ret; +} + +static void uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + bool commit, + struct uverbs_attr_bundle *attrs) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + size_t i; + + for (i = 0; i != attr->len; i++) + uverbs_finalize_object(attr->uobjects[i], + spec->u2.objs_arr.access, false, commit, + attrs); +} + +static int uverbs_process_attr(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct ib_uverbs_attr *uattr, u32 attr_bkey) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey]; + const struct uverbs_attr_spec *val_spec = spec; + struct uverbs_obj_attr *o_attr; + + switch (spec->type) { + case UVERBS_ATTR_TYPE_ENUM_IN: + if (uattr->attr_data.enum_data.elem_id >= spec->u.enum_def.num_elems) + return -EOPNOTSUPP; + + if (uattr->attr_data.enum_data.reserved) + return -EINVAL; + + val_spec = &spec->u2.enum_def.ids[uattr->attr_data.enum_data.elem_id]; + + /* Currently we only support PTR_IN based enums */ + if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN) + return -EOPNOTSUPP; + + e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id; + fallthrough; + case UVERBS_ATTR_TYPE_PTR_IN: + /* Ensure that any data provided by userspace beyond the known + * struct is zero. Userspace that knows how to use some future + * longer struct will fail here if used with an old kernel and + * non-zero content, making ABI compat/discovery simpler. + */ + if (uattr->len > val_spec->u.ptr.len && + val_spec->zero_trailing && + !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len)) + return -EOPNOTSUPP; + + fallthrough; + case UVERBS_ATTR_TYPE_PTR_OUT: + if (uattr->len < val_spec->u.ptr.min_len || + (!val_spec->zero_trailing && + uattr->len > val_spec->u.ptr.len)) + return -EINVAL; + + if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN && + uattr->attr_data.reserved) + return -EINVAL; + + e->ptr_attr.uattr_idx = uattr - pbundle->uattrs; + e->ptr_attr.len = uattr->len; + + if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) { + void *p; + + p = uverbs_alloc(&pbundle->bundle, uattr->len); + if (IS_ERR(p)) + return PTR_ERR(p); + + e->ptr_attr.ptr = p; + + if (copy_from_user(p, u64_to_user_ptr(uattr->data), + uattr->len)) + return -EFAULT; + } else { + e->ptr_attr.data = uattr->data; + } + break; + + case UVERBS_ATTR_TYPE_IDR: + case UVERBS_ATTR_TYPE_FD: + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len != 0) + return -EINVAL; + + o_attr = &e->obj_attr; + o_attr->attr_elm = attr_uapi; + + /* + * The type of uattr->data is u64 for UVERBS_ATTR_TYPE_IDR and + * s64 for UVERBS_ATTR_TYPE_FD. 
We can cast the u64 to s64 + * here without caring about truncation as we know that the + * IDR implementation today rejects negative IDs + */ + o_attr->uobject = uverbs_get_uobject_from_file( + spec->u.obj.obj_type, spec->u.obj.access, + uattr->data_s64, &pbundle->bundle); + if (IS_ERR(o_attr->uobject)) + return PTR_ERR(o_attr->uobject); + __set_bit(attr_bkey, pbundle->uobj_finalize); + + if (spec->u.obj.access == UVERBS_ACCESS_NEW) { + unsigned int uattr_idx = uattr - pbundle->uattrs; + s64 id = o_attr->uobject->id; + + /* Copy the allocated id to the user-space */ + if (put_user(id, &pbundle->user_attrs[uattr_idx].data)) + return -EFAULT; + } + + break; + + case UVERBS_ATTR_TYPE_IDRS_ARRAY: + return uverbs_process_idrs_array(pbundle, attr_uapi, + &e->objs_arr_attr, uattr, + attr_bkey); + default: + return -EOPNOTSUPP; + } + + return 0; +} + +/* + * We search the radix tree with the method prefix and now we want to fast + * search the suffix bits to get a particular attribute pointer. It is not + * totally clear to me if this breaks the radix tree encasulation or not, but + * it uses the iter data to determine if the method iter points at the same + * chunk that will store the attribute, if so it just derefs it directly. By + * construction in most kernel configs the method and attrs will all fit in a + * single radix chunk, so in most cases this will have no search. Other cases + * this falls back to a full search. + */ +static void __rcu **uapi_get_attr_for_method(struct bundle_priv *pbundle, + u32 attr_key) +{ + void __rcu **slot; + + if (likely(attr_key < pbundle->radix_slots_len)) { + void *entry; + + slot = pbundle->radix_slots + attr_key; + entry = rcu_dereference_raw(*slot); + if (likely(!radix_tree_is_internal_node(entry) && entry)) + return slot; + } + + return radix_tree_lookup_slot(pbundle->radix, + pbundle->method_key | attr_key); +} + +static int uverbs_set_attr(struct bundle_priv *pbundle, + struct ib_uverbs_attr *uattr) +{ + u32 attr_key = uapi_key_attr(uattr->attr_id); + u32 attr_bkey = uapi_bkey_attr(attr_key); + const struct uverbs_api_attr *attr; + void __rcu **slot; + int ret; + + slot = uapi_get_attr_for_method(pbundle, attr_key); + if (!slot) { + /* + * Kernel does not support the attribute but user-space says it + * is mandatory + */ + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EPROTONOSUPPORT; + return 0; + } + attr = rcu_dereference_protected(*slot, true); + + /* Reject duplicate attributes from user-space */ + if (test_bit(attr_bkey, pbundle->bundle.attr_present)) + return -EINVAL; + + ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey); + if (ret) + return ret; + + __set_bit(attr_bkey, pbundle->bundle.attr_present); + + return 0; +} + +static int ib_uverbs_run_method(struct bundle_priv *pbundle, + unsigned int num_attrs) +{ + int (*handler)(struct uverbs_attr_bundle *attrs); + size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs); + unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey; + unsigned int i; + int ret; + + /* See uverbs_disassociate_api() */ + handler = srcu_dereference( + pbundle->method_elm->handler, + &pbundle->bundle.ufile->device->disassociate_srcu); + if (!handler) + return -EIO; + + pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size); + if (IS_ERR(pbundle->uattrs)) + return PTR_ERR(pbundle->uattrs); + if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size)) + return -EFAULT; + + for (i = 0; i != num_attrs; i++) { + ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]); + if 
(unlikely(ret)) + return ret; + } + + /* User space did not provide all the mandatory attributes */ + if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory, + pbundle->bundle.attr_present, + pbundle->method_elm->key_bitmap_len))) + return -EINVAL; + + if (pbundle->method_elm->has_udata) + uverbs_fill_udata(&pbundle->bundle, + &pbundle->bundle.driver_udata, + UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT); + else + pbundle->bundle.driver_udata = (struct ib_udata){}; + + if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) { + struct uverbs_obj_attr *destroy_attr = + &pbundle->bundle.attrs[destroy_bkey].obj_attr; + + ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle); + if (ret) + return ret; + __clear_bit(destroy_bkey, pbundle->uobj_finalize); + + ret = handler(&pbundle->bundle); + uobj_put_destroy(destroy_attr->uobject); + } else { + ret = handler(&pbundle->bundle); + } + + /* + * Until the drivers are revised to use the bundle directly we have to + * assume that the driver wrote to its UHW_OUT and flag userspace + * appropriately. + */ + if (!ret && pbundle->method_elm->has_udata) { + const struct uverbs_attr *attr = + uverbs_attr_get(&pbundle->bundle, UVERBS_ATTR_UHW_OUT); + + if (!IS_ERR(attr)) + ret = uverbs_set_output(&pbundle->bundle, attr); + } + + /* + * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can + * not invoke the method because the request is not supported. No + * other cases should return this code. + */ + if (WARN_ON_ONCE(ret == -EPROTONOSUPPORT)) + return -EINVAL; + + return ret; +} + +static void bundle_destroy(struct bundle_priv *pbundle, bool commit) +{ + unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len; + struct bundle_alloc_head *memblock; + unsigned int i; + + /* fast path for simple uobjects */ + i = -1; + while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + + uverbs_finalize_object( + attr->obj_attr.uobject, + attr->obj_attr.attr_elm->spec.u.obj.access, + test_bit(i, pbundle->uobj_hw_obj_valid), + commit, + &pbundle->bundle); + } + + i = -1; + while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + const struct uverbs_api_attr *attr_uapi; + void __rcu **slot; + + slot = uapi_get_attr_for_method( + pbundle, + pbundle->method_key | uapi_bkey_to_key_attr(i)); + if (WARN_ON(!slot)) + continue; + + attr_uapi = rcu_dereference_protected(*slot, true); + + if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + uverbs_free_idrs_array(attr_uapi, &attr->objs_arr_attr, + commit, &pbundle->bundle); + } + } + + for (memblock = pbundle->allocated_mem; memblock;) { + struct bundle_alloc_head *tmp = memblock; + + memblock = memblock->next; + kvfree(tmp); + } +} + +static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, + struct ib_uverbs_ioctl_hdr *hdr, + struct ib_uverbs_attr __user *user_attrs) +{ + const struct uverbs_api_ioctl_method *method_elm; + struct uverbs_api *uapi = ufile->device->uapi; + struct radix_tree_iter attrs_iter; + struct bundle_priv *pbundle; + struct bundle_priv onstack; + void __rcu **slot; + int ret; + + if (unlikely(hdr->driver_id != uapi->driver_id)) + return -EINVAL; + + slot = radix_tree_iter_lookup( + &uapi->radix, &attrs_iter, + uapi_key_obj(hdr->object_id) | + uapi_key_ioctl_method(hdr->method_id)); + if (unlikely(!slot)) + return -EPROTONOSUPPORT; + method_elm = rcu_dereference_protected(*slot, true); 
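+
+	/*
+	 * use_stack and bundle_size were precomputed in
+	 * uapi_compute_bundle_size(): small methods run entirely out of the
+	 * on-stack bundle_priv below, larger ones fall back to a kmalloc.
+	 */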
+ + if (!method_elm->use_stack) { + pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL); + if (!pbundle) + return -ENOMEM; + pbundle->internal_avail = + method_elm->bundle_size - + offsetof(struct bundle_priv, internal_buffer); + pbundle->alloc_head.next = NULL; + pbundle->allocated_mem = &pbundle->alloc_head; + } else { + pbundle = &onstack; + pbundle->internal_avail = sizeof(pbundle->internal_buffer); + pbundle->allocated_mem = NULL; + } + + /* Space for the pbundle->bundle.attrs flex array */ + pbundle->method_elm = method_elm; + pbundle->method_key = attrs_iter.index; + pbundle->bundle.ufile = ufile; + pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ + pbundle->radix = &uapi->radix; + pbundle->radix_slots = slot; + pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); + pbundle->user_attrs = user_attrs; + + pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len * + sizeof(*pbundle->bundle.attrs), + sizeof(*pbundle->internal_buffer)); + memset(pbundle->bundle.attr_present, 0, + sizeof(pbundle->bundle.attr_present)); + memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); + memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); + memset(pbundle->uobj_hw_obj_valid, 0, + sizeof(pbundle->uobj_hw_obj_valid)); + + ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); + bundle_destroy(pbundle, ret == 0); + return ret; +} + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ioctl_hdr __user *user_hdr = + (struct ib_uverbs_ioctl_hdr __user *)arg; + struct ib_uverbs_ioctl_hdr hdr; + int srcu_key; + int err; + + if (unlikely(cmd != RDMA_VERBS_IOCTL)) + return -ENOIOCTLCMD; + + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + if (err) + return -EFAULT; + + if (hdr.length > PAGE_SIZE || + hdr.length != struct_size(&hdr, attrs, hdr.num_attrs)) + return -EINVAL; + + if (hdr.reserved1 || hdr.reserved2) + return -EPROTONOSUPPORT; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return err; +} + +int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 allowed_bits) +{ + const struct uverbs_attr *attr; + u64 flags; + + attr = uverbs_attr_get(attrs_bundle, idx); + /* Missing attribute means 0 flags */ + if (IS_ERR(attr)) { + *to = 0; + return 0; + } + + /* + * New userspace code should use 8 bytes to pass flags, but we + * transparently support old userspaces that were using 4 bytes as + * well. 
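+	 *
+	 * Illustrative call from a method handler (the attribute id and the
+	 * allowed mask are hypothetical, shown only for the example):
+	 *
+	 *	u64 flags;
+	 *
+	 *	ret = uverbs_get_flags64(&flags, attrs, EXAMPLE_ATTR_FLAGS,
+	 *				 EXAMPLE_ALLOWED_BITS);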
+ */ + if (attr->ptr_attr.len == 8) + flags = attr->ptr_attr.data; + else if (attr->ptr_attr.len == 4) + flags = *(u32 *)&attr->ptr_attr.data; + else + return -EINVAL; + + if (flags & ~allowed_bits) + return -EINVAL; + + *to = flags; + return 0; +} +EXPORT_SYMBOL(uverbs_get_flags64); + +int uverbs_set_flags64(const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, const u64 flags) +{ + return uverbs_copy_to(attrs_bundle, idx, &flags, sizeof(u64)); +} +EXPORT_SYMBOL(uverbs_set_flags64); + +int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 allowed_bits) +{ + u64 flags; + int ret; + + ret = uverbs_get_flags64(&flags, attrs_bundle, idx, allowed_bits); + if (ret) + return ret; + + if (flags > U32_MAX) + return -EINVAL; + *to = flags; + + return 0; +} +EXPORT_SYMBOL(uverbs_get_flags32); + +/* + * Fill a ib_udata struct (core or uhw) using the given attribute IDs. + * This is primarily used to convert the UVERBS_ATTR_UHW() into the + * ib_udata format used by the drivers. + */ +void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, + struct ib_udata *udata, unsigned int attr_in, + unsigned int attr_out) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + const struct uverbs_attr *in = + uverbs_attr_get(&pbundle->bundle, attr_in); + const struct uverbs_attr *out = + uverbs_attr_get(&pbundle->bundle, attr_out); + + if (!IS_ERR(in)) { + udata->inlen = in->ptr_attr.len; + if (uverbs_attr_ptr_is_inline(in)) + udata->inbuf = + &pbundle->user_attrs[in->ptr_attr.uattr_idx] + .data; + else + udata->inbuf = u64_to_user_ptr(in->ptr_attr.data); + } else { + udata->inbuf = NULL; + udata->inlen = 0; + } + + if (!IS_ERR(out)) { + udata->outbuf = u64_to_user_ptr(out->ptr_attr.data); + udata->outlen = out->ptr_attr.len; + } else { + udata->outbuf = NULL; + udata->outlen = 0; + } +} + +int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, + const void *from, size_t size) +{ + const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); + size_t min_size; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + min_size = min_t(size_t, attr->ptr_attr.len, size); + if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size)) + return -EFAULT; + + return uverbs_set_output(bundle, attr); +} +EXPORT_SYMBOL(uverbs_copy_to); + + +/* + * This is only used if the caller has directly used copy_to_use to write the + * data. It signals to user space that the buffer is filled in. 
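+ * A handler that has already written the response buffer on its own (for
+ * example with a direct copy_to_user()) calls this so that the
+ * UVERBS_ATTR_F_VALID_OUTPUT flag is still reported for the attribute.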
+ */ +int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + return uverbs_set_output(bundle, attr); +} + +int _uverbs_get_const_signed(s64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(_uverbs_get_const_signed); + +int _uverbs_get_const_unsigned(u64 *to, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 upper_bound, u64 *def_val) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to > upper_bound) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(_uverbs_get_const_unsigned); + +int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, + size_t idx, const void *from, size_t size) +{ + const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + if (size < attr->ptr_attr.len) { + if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size, + attr->ptr_attr.len - size)) + return -EFAULT; + } + return uverbs_copy_to(bundle, idx, from, size); +} +EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero); + +/* Once called an abort will call through to the type's destroy_hw() */ +void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, + u16 idx) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + + __set_bit(uapi_bkey_attr(uapi_key_attr(idx)), + pbundle->uobj_hw_obj_valid); +} +EXPORT_SYMBOL(uverbs_finalize_uobj_create); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_main.c new file mode 100644 index 0000000..d544340 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_main.c @@ -0,0 +1,1317 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "uverbs.h" +#include "core_priv.h" +#include "rdma_core.h" + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand userspace verbs access"); +MODULE_LICENSE("Dual BSD/GPL"); + +enum { + IB_UVERBS_MAJOR = 231, + IB_UVERBS_BASE_MINOR = 192, + IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UVERBS_NUM_FIXED_MINOR = 32, + IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR, +}; + +#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) + +static dev_t dynamic_uverbs_dev; +static struct class *uverbs_class; + +static DEFINE_IDA(uverbs_ida); +static int ib_uverbs_add_one(struct ib_device *device); +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); + +/* + * Must be called with the ufile->device->disassociate_srcu held, and the lock + * must be held until use of the ucontext is finished. + */ +struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile) +{ + /* + * We do not hold the hw_destroy_rwsem lock for this flow, instead + * srcu is used. It does not matter if someone races this with + * get_context, we get NULL or valid ucontext. 
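+	 * The smp_load_acquire() below pairs with the smp_store_release()
+	 * that publishes the ucontext from the GET_CONTEXT path, so a reader
+	 * that observes the pointer also observes its initialized contents.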
+ */ + struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext); + + if (!srcu_dereference(ufile->device->ib_dev, + &ufile->device->disassociate_srcu)) + return ERR_PTR(-EIO); + + if (!ucontext) + return ERR_PTR(-EINVAL); + + return ucontext; +} +EXPORT_SYMBOL(ib_uverbs_get_ucontext_file); + +int uverbs_dealloc_mw(struct ib_mw *mw) +{ + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->ops.dealloc_mw(mw); + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + kfree(mw); + return ret; +} + +static void ib_uverbs_release_dev(struct device *device) +{ + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); + + uverbs_destroy_api(dev->uapi); + cleanup_srcu_struct(&dev->disassociate_srcu); + mutex_destroy(&dev->lists_mutex); + mutex_destroy(&dev->xrcd_tree_mutex); + kfree(dev); +} + +void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, + struct ib_ucq_object *uobj) +{ + struct ib_uverbs_event *evt, *tmp; + + if (ev_file) { + spin_lock_irq(&ev_file->ev_queue.lock); + list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&ev_file->ev_queue.lock); + + uverbs_uobject_put(&ev_file->uobj); + } + + ib_uverbs_release_uevent(&uobj->uevent); +} + +void ib_uverbs_release_uevent(struct ib_uevent_object *uobj) +{ + struct ib_uverbs_async_event_file *async_file = uobj->event_file; + struct ib_uverbs_event *evt, *tmp; + + if (!async_file) + return; + + spin_lock_irq(&async_file->ev_queue.lock); + list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) { + list_del(&evt->list); + kfree(evt); + } + spin_unlock_irq(&async_file->ev_queue.lock); + uverbs_uobject_put(&async_file->uobj); +} + +void ib_uverbs_detach_umcast(struct ib_qp *qp, + struct ib_uqp_object *uobj) +{ + struct ib_uverbs_mcast_entry *mcast, *tmp; + + list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) { + ib_detach_mcast(qp, &mcast->gid, mcast->lid); + list_del(&mcast->list); + kfree(mcast); + } +} + +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ + complete(&dev->comp); +} + +void ib_uverbs_release_file(struct kref *ref) +{ + struct ib_uverbs_file *file = + container_of(ref, struct ib_uverbs_file, ref); + struct ib_device *ib_dev; + int srcu_key; + + release_ufile_idr_uobject(file); + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (ib_dev && !ib_dev->ops.disassociate_ucontext) + module_put(ib_dev->ops.owner); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + if (refcount_dec_and_test(&file->device->refcount)) + ib_uverbs_comp_dev(file->device); + + if (file->default_async_file) + uverbs_uobject_put(&file->default_async_file->uobj); + put_device(&file->device->dev); + + if (file->disassociate_page) + __free_pages(file->disassociate_page, 0); + mutex_destroy(&file->umap_lock); + mutex_destroy(&file->ucontext_lock); + kfree(file); +} + +static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, + struct file *filp, char __user *buf, + size_t count, loff_t *pos, + size_t eventsz) +{ + struct ib_uverbs_event *event; + int ret = 0; + + spin_lock_irq(&ev_queue->lock); + + while (list_empty(&ev_queue->event_list)) { + spin_unlock_irq(&ev_queue->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(ev_queue->poll_wait, + (!list_empty(&ev_queue->event_list) || + ev_queue->is_closed))) + return 
-ERESTARTSYS; + + spin_lock_irq(&ev_queue->lock); + + /* If device was disassociated and no event exists set an error */ + if (list_empty(&ev_queue->event_list) && ev_queue->is_closed) { + spin_unlock_irq(&ev_queue->lock); + return -EIO; + } + } + + event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list); + + if (eventsz > count) { + ret = -EINVAL; + event = NULL; + } else { + list_del(ev_queue->event_list.next); + if (event->counter) { + ++(*event->counter); + list_del(&event->obj_list); + } + } + + spin_unlock_irq(&ev_queue->lock); + + if (event) { + if (copy_to_user(buf, event, eventsz)) + ret = -EFAULT; + else + ret = eventsz; + } + + kfree(event); + + return ret; +} + +static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_async_event_file *file = filp->private_data; + + return ib_uverbs_event_read(&file->ev_queue, filp, buf, count, pos, + sizeof(struct ib_uverbs_async_event_desc)); +} + +static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return ib_uverbs_event_read(&comp_ev_file->ev_queue, filp, buf, count, + pos, + sizeof(struct ib_uverbs_comp_event_desc)); +} + +static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, + struct file *filp, + struct poll_table_struct *wait) +{ + __poll_t pollflags = 0; + + poll_wait(filp, &ev_queue->poll_wait, wait); + + spin_lock_irq(&ev_queue->lock); + if (!list_empty(&ev_queue->event_list)) + pollflags = EPOLLIN | EPOLLRDNORM; + else if (ev_queue->is_closed) + pollflags = EPOLLERR; + spin_unlock_irq(&ev_queue->lock); + + return pollflags; +} + +static __poll_t ib_uverbs_async_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct ib_uverbs_async_event_file *file = filp->private_data; + + return ib_uverbs_event_poll(&file->ev_queue, filp, wait); +} + +static __poll_t ib_uverbs_comp_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return ib_uverbs_event_poll(&comp_ev_file->ev_queue, filp, wait); +} + +static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on) +{ + struct ib_uverbs_async_event_file *file = filp->private_data; + + return fasync_helper(fd, filp, on, &file->ev_queue.async_queue); +} + +static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue); +} + +const struct file_operations uverbs_event_fops = { + .owner = THIS_MODULE, + .read = ib_uverbs_comp_event_read, + .poll = ib_uverbs_comp_event_poll, + .release = uverbs_uobject_fd_release, + .fasync = ib_uverbs_comp_event_fasync, + .llseek = no_llseek, +}; + +const struct file_operations uverbs_async_event_fops = { + .owner = THIS_MODULE, + .read = ib_uverbs_async_event_read, + .poll = ib_uverbs_async_event_poll, + .release = uverbs_async_event_release, + .fasync = ib_uverbs_async_event_fasync, + .llseek = no_llseek, +}; + +void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct ib_uverbs_event_queue *ev_queue = cq_context; + struct ib_ucq_object *uobj; + struct ib_uverbs_event *entry; + unsigned long flags; + + if (!ev_queue) + return; + + spin_lock_irqsave(&ev_queue->lock, flags); + if (ev_queue->is_closed) { + 
spin_unlock_irqrestore(&ev_queue->lock, flags); + return; + } + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + spin_unlock_irqrestore(&ev_queue->lock, flags); + return; + } + + uobj = cq->uobject; + + entry->desc.comp.cq_handle = cq->uobject->uevent.uobject.user_handle; + entry->counter = &uobj->comp_events_reported; + + list_add_tail(&entry->list, &ev_queue->event_list); + list_add_tail(&entry->obj_list, &uobj->comp_list); + spin_unlock_irqrestore(&ev_queue->lock, flags); + + wake_up_interruptible(&ev_queue->poll_wait); + kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN); +} + +void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, + __u64 element, __u64 event, + struct list_head *obj_list, u32 *counter) +{ + struct ib_uverbs_event *entry; + unsigned long flags; + + if (!async_file) + return; + + spin_lock_irqsave(&async_file->ev_queue.lock, flags); + if (async_file->ev_queue.is_closed) { + spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); + return; + } + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); + return; + } + + entry->desc.async.element = element; + entry->desc.async.event_type = event; + entry->desc.async.reserved = 0; + entry->counter = counter; + + list_add_tail(&entry->list, &async_file->ev_queue.event_list); + if (obj_list) + list_add_tail(&entry->obj_list, obj_list); + spin_unlock_irqrestore(&async_file->ev_queue.lock, flags); + + wake_up_interruptible(&async_file->ev_queue.poll_wait); + kill_fasync(&async_file->ev_queue.async_queue, SIGIO, POLL_IN); +} + +static void uverbs_uobj_event(struct ib_uevent_object *eobj, + struct ib_event *event) +{ + ib_uverbs_async_handler(eobj->event_file, + eobj->uobject.user_handle, event->event, + &eobj->event_list, &eobj->events_reported); +} + +void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) +{ + uverbs_uobj_event(&event->element.cq->uobject->uevent, event); +} + +void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) +{ + /* for XRC target qp's, check that qp is live */ + if (!event->element.qp->uobject) + return; + + uverbs_uobj_event(&event->element.qp->uobject->uevent, event); +} + +void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr) +{ + uverbs_uobj_event(&event->element.wq->uobject->uevent, event); +} + +void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) +{ + uverbs_uobj_event(&event->element.srq->uobject->uevent, event); +} + +static void ib_uverbs_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + ib_uverbs_async_handler( + container_of(handler, struct ib_uverbs_async_event_file, + event_handler), + event->element.port_num, event->event, NULL, NULL); +} + +void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue) +{ + spin_lock_init(&ev_queue->lock); + INIT_LIST_HEAD(&ev_queue->event_list); + init_waitqueue_head(&ev_queue->poll_wait); + ev_queue->is_closed = 0; + ev_queue->async_queue = NULL; +} + +void ib_uverbs_init_async_event_file( + struct ib_uverbs_async_event_file *async_file) +{ + struct ib_uverbs_file *uverbs_file = async_file->uobj.ufile; + struct ib_device *ib_dev = async_file->uobj.context->device; + + ib_uverbs_init_event_queue(&async_file->ev_queue); + + /* The first async_event_file becomes the default one for the file. 
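+ * Async event files created later on the same ib_uverbs_file leave the default untouched.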
*/ + mutex_lock(&uverbs_file->ucontext_lock); + if (!uverbs_file->default_async_file) { + /* Pairs with the put in ib_uverbs_release_file */ + uverbs_uobject_get(&async_file->uobj); + smp_store_release(&uverbs_file->default_async_file, async_file); + } + mutex_unlock(&uverbs_file->ucontext_lock); + + INIT_IB_EVENT_HANDLER(&async_file->event_handler, ib_dev, + ib_uverbs_event_handler); + ib_register_event_handler(&async_file->event_handler); +} + +static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, + struct ib_uverbs_ex_cmd_hdr *ex_hdr, size_t count, + const struct uverbs_api_write_method *method_elm) +{ + if (method_elm->is_ex) { + count -= sizeof(*hdr) + sizeof(*ex_hdr); + + if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count) + return -EINVAL; + + if (hdr->in_words * 8 < method_elm->req_size) + return -ENOSPC; + + if (ex_hdr->cmd_hdr_reserved) + return -EINVAL; + + if (ex_hdr->response) { + if (!hdr->out_words && !ex_hdr->provider_out_words) + return -EINVAL; + + if (hdr->out_words * 8 < method_elm->resp_size) + return -ENOSPC; + + if (!access_ok(u64_to_user_ptr(ex_hdr->response), + (hdr->out_words + ex_hdr->provider_out_words) * 8)) + return -EFAULT; + } else { + if (hdr->out_words || ex_hdr->provider_out_words) + return -EINVAL; + } + + return 0; + } + + /* not extended command */ + if (hdr->in_words * 4 != count) + return -EINVAL; + + if (count < method_elm->req_size + sizeof(hdr)) { + /* + * rdma-core v18 and v19 have a bug where they send DESTROY_CQ + * with a 16 byte write instead of 24. Old kernels didn't + * check the size so they allowed this. Now that the size is + * checked provide a compatibility work around to not break + * those userspaces. + */ + if (hdr->command == IB_USER_VERBS_CMD_DESTROY_CQ && + count == 16) { + hdr->in_words = 6; + return 0; + } + return -ENOSPC; + } + if (hdr->out_words * 4 < method_elm->resp_size) + return -ENOSPC; + + return 0; +} + +static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_file *file = filp->private_data; + const struct uverbs_api_write_method *method_elm; + struct uverbs_api *uapi = file->device->uapi; + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_uverbs_cmd_hdr hdr; + struct uverbs_attr_bundle bundle; + int srcu_key; + ssize_t ret; + + if (!ib_safe_file_access(filp)) { + pr_err_once("uverbs_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", + task_tgid_vnr(current), current->comm); + return -EACCES; + } + + if (count < sizeof(hdr)) + return -EINVAL; + + if (copy_from_user(&hdr, buf, sizeof(hdr))) + return -EFAULT; + + method_elm = uapi_get_method(uapi, hdr.command); + if (IS_ERR(method_elm)) + return PTR_ERR(method_elm); + + if (method_elm->is_ex) { + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + } + + ret = verify_hdr(&hdr, &ex_hdr, count, method_elm); + if (ret) + return ret; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + + buf += sizeof(hdr); + + memset(bundle.attr_present, 0, sizeof(bundle.attr_present)); + bundle.ufile = file; + bundle.context = NULL; /* only valid if bundle has uobject */ + bundle.uobject = NULL; + if (!method_elm->is_ex) { + size_t in_len = hdr.in_words * 4 - sizeof(hdr); + size_t out_len = hdr.out_words * 4; + u64 response = 0; + + if (method_elm->has_udata) { + bundle.driver_udata.inlen = + in_len - method_elm->req_size; + in_len = 
method_elm->req_size; + if (bundle.driver_udata.inlen) + bundle.driver_udata.inbuf = buf + in_len; + else + bundle.driver_udata.inbuf = NULL; + } else { + memset(&bundle.driver_udata, 0, + sizeof(bundle.driver_udata)); + } + + if (method_elm->has_resp) { + /* + * The macros check that if has_resp is set + * then the command request structure starts + * with a '__aligned u64 response' member. + */ + ret = get_user(response, (const u64 __user *)buf); + if (ret) + goto out_unlock; + + if (method_elm->has_udata) { + bundle.driver_udata.outlen = + out_len - method_elm->resp_size; + out_len = method_elm->resp_size; + if (bundle.driver_udata.outlen) + bundle.driver_udata.outbuf = + u64_to_user_ptr(response + + out_len); + else + bundle.driver_udata.outbuf = NULL; + } + } else { + bundle.driver_udata.outlen = 0; + bundle.driver_udata.outbuf = NULL; + } + + ib_uverbs_init_udata_buf_or_null( + &bundle.ucore, buf, u64_to_user_ptr(response), + in_len, out_len); + } else { + buf += sizeof(ex_hdr); + + ib_uverbs_init_udata_buf_or_null(&bundle.ucore, buf, + u64_to_user_ptr(ex_hdr.response), + hdr.in_words * 8, hdr.out_words * 8); + + ib_uverbs_init_udata_buf_or_null( + &bundle.driver_udata, buf + bundle.ucore.inlen, + u64_to_user_ptr(ex_hdr.response) + bundle.ucore.outlen, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + } + + ret = method_elm->handler(&bundle); + if (bundle.uobject) + uverbs_finalize_object(bundle.uobject, UVERBS_ACCESS_NEW, true, + !ret, &bundle); +out_unlock: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return (ret) ? : count; +} + +static const struct vm_operations_struct rdma_umap_ops; + +static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_ucontext *ucontext; + int ret = 0; + int srcu_key; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ucontext = ib_uverbs_get_ucontext_file(file); + if (IS_ERR(ucontext)) { + ret = PTR_ERR(ucontext); + goto out; + } + vma->vm_ops = &rdma_umap_ops; + ret = ucontext->device->ops.mmap(ucontext, vma); +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; +} + +/* + * The VMA has been dup'd, initialize the vm_private_data with a new tracking + * struct + */ +static void rdma_umap_open(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *opriv = vma->vm_private_data; + struct rdma_umap_priv *priv; + + if (!opriv) + return; + + /* We are racing with disassociation */ + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + goto out_zap; + /* + * Disassociation already completed, the VMA should already be zapped. + */ + if (!ufile->ucontext) + goto out_unlock; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_unlock; + rdma_umap_priv_init(priv, vma, opriv->entry); + + up_read(&ufile->hw_destroy_rwsem); + return; + +out_unlock: + up_read(&ufile->hw_destroy_rwsem); +out_zap: + /* + * We can't allow the VMA to be created with the actual IO pages, that + * would break our API contract, and it can't be stopped at this + * point, so zap it. 
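+ * With vm_private_data cleared, rdma_umap_fault() below answers any further access with SIGBUS.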
+ */ + vma->vm_private_data = NULL; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void rdma_umap_close(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *priv = vma->vm_private_data; + + if (!priv) + return; + + /* + * The vma holds a reference on the struct file that created it, which + * in turn means that the ib_uverbs_file is guaranteed to exist at + * this point. + */ + mutex_lock(&ufile->umap_lock); + if (priv->entry) + rdma_user_mmap_entry_put(priv->entry); + + list_del(&priv->list); + mutex_unlock(&ufile->umap_lock); + kfree(priv); +} + +/* + * Once the zap_vma_ptes has been called touches to the VMA will come here and + * we return a dummy writable zero page for all the pfns. + */ +static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) +{ + struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data; + struct rdma_umap_priv *priv = vmf->vma->vm_private_data; + vm_fault_t ret = 0; + + if (!priv) + return VM_FAULT_SIGBUS; + + /* Read only pages can just use the system zero page. */ + if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { + vmf->page = ZERO_PAGE(vmf->address); + get_page(vmf->page); + return 0; + } + + mutex_lock(&ufile->umap_lock); + if (!ufile->disassociate_page) + ufile->disassociate_page = + alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0); + + if (ufile->disassociate_page) { + /* + * This VMA is forced to always be shared so this doesn't have + * to worry about COW. + */ + vmf->page = ufile->disassociate_page; + get_page(vmf->page); + } else { + ret = VM_FAULT_SIGBUS; + } + mutex_unlock(&ufile->umap_lock); + + return ret; +} + +static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, + .fault = rdma_umap_fault, +}; + +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) +{ + struct rdma_umap_priv *priv, *next_priv; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + + while (1) { + struct mm_struct *mm = NULL; + + /* Get an arbitrary mm pointer that hasn't been cleaned yet */ + mutex_lock(&ufile->umap_lock); + while (!list_empty(&ufile->umaps)) { + int ret; + + priv = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list); + mm = priv->vma->vm_mm; + ret = mmget_not_zero(mm); + if (!ret) { + list_del_init(&priv->list); + if (priv->entry) { + rdma_user_mmap_entry_put(priv->entry); + priv->entry = NULL; + } + mm = NULL; + continue; + } + break; + } + mutex_unlock(&ufile->umap_lock); + if (!mm) + return; + + /* + * The umap_lock is nested under mmap_lock since it used within + * the vma_ops callbacks, so we have to clean the list one mm + * at a time to get the lock ordering right. Typically there + * will only be one mm, so no big deal. 
+ */ + mmap_read_lock(mm); + mutex_lock(&ufile->umap_lock); + list_for_each_entry_safe (priv, next_priv, &ufile->umaps, + list) { + struct vm_area_struct *vma = priv->vma; + + if (vma->vm_mm != mm) + continue; + list_del_init(&priv->list); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + + if (priv->entry) { + rdma_user_mmap_entry_put(priv->entry); + priv->entry = NULL; + } + } + mutex_unlock(&ufile->umap_lock); + mmap_read_unlock(mm); + mmput(mm); + } +} + +/* + * ib_uverbs_open() does not need the BKL: + * + * - the ib_uverbs_device structures are properly reference counted and + * everything else is purely local to the file being created, so + * races against other open calls are not a problem; + * - there is no ioctl method to race against; + * - the open method will either immediately run -ENXIO, or all + * required initialization will be done. + */ +static int ib_uverbs_open(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_device *dev; + struct ib_uverbs_file *file; + struct ib_device *ib_dev; + int ret; + int module_dependent; + int srcu_key; + + dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); + if (!refcount_inc_not_zero(&dev->refcount)) + return -ENXIO; + + get_device(&dev->dev); + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + mutex_lock(&dev->lists_mutex); + ib_dev = srcu_dereference(dev->ib_dev, + &dev->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto err; + } + + if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto err; + } + + /* In case IB device supports disassociate ucontext, there is no hard + * dependency between uverbs device and its low level device. + */ + module_dependent = !(ib_dev->ops.disassociate_ucontext); + + if (module_dependent) { + if (!try_module_get(ib_dev->ops.owner)) { + ret = -ENODEV; + goto err; + } + } + + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) { + ret = -ENOMEM; + if (module_dependent) + goto err_module; + + goto err; + } + + file->device = dev; + kref_init(&file->ref); + mutex_init(&file->ucontext_lock); + + spin_lock_init(&file->uobjects_lock); + INIT_LIST_HEAD(&file->uobjects); + init_rwsem(&file->hw_destroy_rwsem); + mutex_init(&file->umap_lock); + INIT_LIST_HEAD(&file->umaps); + + filp->private_data = file; + list_add_tail(&file->list, &dev->uverbs_file_list); + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + setup_ufile_idr_uobject(file); + + return stream_open(inode, filp); + +err_module: + module_put(ib_dev->ops.owner); + +err: + mutex_unlock(&dev->lists_mutex); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + if (refcount_dec_and_test(&dev->refcount)) + ib_uverbs_comp_dev(dev); + + put_device(&dev->dev); + return ret; +} + +static int ib_uverbs_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_file *file = filp->private_data; + + uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE); + + mutex_lock(&file->device->lists_mutex); + list_del_init(&file->list); + mutex_unlock(&file->device->lists_mutex); + + kref_put(&file->ref, ib_uverbs_release_file); + + return 0; +} + +static const struct file_operations uverbs_fops = { + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .open = ib_uverbs_open, + .release = ib_uverbs_close, + .llseek = no_llseek, + .unlocked_ioctl = ib_uverbs_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static const struct file_operations uverbs_mmap_fops = { + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .mmap = ib_uverbs_mmap, + 
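+ /* Identical to uverbs_fops except for .mmap; selected in ib_uverbs_add_one() when the device implements ops.mmap. */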
.open = ib_uverbs_open, + .release = ib_uverbs_close, + .llseek = no_llseek, + .unlocked_ioctl = ib_uverbs_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int ret; + + if (res->port != -1) + return -EINVAL; + + res->abi = ibdev->ops.uverbs_abi_ver; + res->cdev = &uverbs_dev->dev; + + /* + * To support DRIVER_ID binding in userspace some of the driver need + * upgrading to expose their PCI dependent revision information + * through get_context instead of relying on modalias matching. When + * the drivers are fixed they can drop this flag. + */ + if (!ibdev->ops.uverbs_no_driver_id_binding) { + ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, + ibdev->ops.driver_id); + if (ret) + return ret; + } + return 0; +} + +static struct ib_client uverbs_client = { + .name = "uverbs", + .no_kverbs_req = true, + .add = ib_uverbs_add_one, + .remove = ib_uverbs_remove_one, + .get_nl_info = ib_uverbs_get_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("uverbs"); + +static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); + int ret = -ENODEV; + int srcu_key; + struct ib_device *ib_dev; + + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sysfs_emit(buf, "%s\n", dev_name(&ib_dev->dev)); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + return ret; +} +static DEVICE_ATTR_RO(ibdev); + +static ssize_t abi_version_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); + int ret = -ENODEV; + int srcu_key; + struct ib_device *ib_dev; + + srcu_key = srcu_read_lock(&dev->disassociate_srcu); + ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); + if (ib_dev) + ret = sysfs_emit(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); + srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + + return ret; +} +static DEVICE_ATTR_RO(abi_version); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_abi_version.attr, + &dev_attr_ibdev.attr, + NULL, +}; + +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, +}; + +static CLASS_ATTR_STRING(abi_version, S_IRUGO, + __stringify(IB_USER_VERBS_ABI_VERSION)); + +static int ib_uverbs_create_uapi(struct ib_device *device, + struct ib_uverbs_device *uverbs_dev) +{ + struct uverbs_api *uapi; + + uapi = uverbs_alloc_api(device); + if (IS_ERR(uapi)) + return PTR_ERR(uapi); + + uverbs_dev->uapi = uapi; + return 0; +} + +static int ib_uverbs_add_one(struct ib_device *device) +{ + int devnum; + dev_t base; + struct ib_uverbs_device *uverbs_dev; + int ret; + + if (!device->ops.alloc_ucontext) + return -EOPNOTSUPP; + + uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); + if (!uverbs_dev) + return -ENOMEM; + + ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); + if (ret) { + kfree(uverbs_dev); + return -ENOMEM; + } + + device_initialize(&uverbs_dev->dev); + uverbs_dev->dev.class = uverbs_class; + uverbs_dev->dev.parent = device->dev.parent; + uverbs_dev->dev.release = ib_uverbs_release_dev; + uverbs_dev->groups[0] = &dev_attr_group; + uverbs_dev->dev.groups = uverbs_dev->groups; + refcount_set(&uverbs_dev->refcount, 1); + 
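+ /* comp is completed when the last reference goes away (see the refcount_dec_and_test()/ib_uverbs_comp_dev() callers), so removal paths can wait for it. */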
init_completion(&uverbs_dev->comp); + uverbs_dev->xrcd_tree = RB_ROOT; + mutex_init(&uverbs_dev->xrcd_tree_mutex); + mutex_init(&uverbs_dev->lists_mutex); + INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); + rcu_assign_pointer(uverbs_dev->ib_dev, device); + uverbs_dev->num_comp_vectors = device->num_comp_vectors; + + devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, + GFP_KERNEL); + if (devnum < 0) { + ret = -ENOMEM; + goto err; + } + uverbs_dev->devnum = devnum; + if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) + base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; + else + base = IB_UVERBS_BASE_DEV + devnum; + + ret = ib_uverbs_create_uapi(device, uverbs_dev); + if (ret) + goto err_uapi; + + uverbs_dev->dev.devt = base; + dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); + + cdev_init(&uverbs_dev->cdev, + device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops); + uverbs_dev->cdev.owner = THIS_MODULE; + + ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev); + if (ret) + goto err_uapi; + + ib_set_client_data(device, &uverbs_client, uverbs_dev); + return 0; + +err_uapi: + ida_free(&uverbs_ida, devnum); +err: + if (refcount_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); + wait_for_completion(&uverbs_dev->comp); + put_device(&uverbs_dev->dev); + return ret; +} + +static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, + struct ib_device *ib_dev) +{ + struct ib_uverbs_file *file; + + /* Pending running commands to terminate */ + uverbs_disassociate_api_pre(uverbs_dev); + + mutex_lock(&uverbs_dev->lists_mutex); + while (!list_empty(&uverbs_dev->uverbs_file_list)) { + file = list_first_entry(&uverbs_dev->uverbs_file_list, + struct ib_uverbs_file, list); + list_del_init(&file->list); + kref_get(&file->ref); + + /* We must release the mutex before going ahead and calling + * uverbs_cleanup_ufile, as it might end up indirectly calling + * uverbs_close, for example due to freeing the resources (e.g + * mmput). + */ + mutex_unlock(&uverbs_dev->lists_mutex); + + uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE); + kref_put(&file->ref, ib_uverbs_release_file); + + mutex_lock(&uverbs_dev->lists_mutex); + } + mutex_unlock(&uverbs_dev->lists_mutex); + + uverbs_disassociate_api(uverbs_dev->uapi); +} + +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int wait_clients = 1; + + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); + ida_free(&uverbs_ida, uverbs_dev->devnum); + + if (device->ops.disassociate_ucontext) { + /* We disassociate HW resources and immediately return. + * Userspace will see a EIO errno for all future access. + * Upon returning, ib_device may be freed internally and is not + * valid any more. + * uverbs_device is still available until all clients close + * their files, then the uverbs device ref count will be zero + * and its resources will be freed. + * Note: At this point no more files can be opened since the + * cdev was deleted, however active clients can still issue + * commands and close their open files. 
+ */ + ib_uverbs_free_hw_resources(uverbs_dev, device); + wait_clients = 0; + } + + if (refcount_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); + if (wait_clients) + wait_for_completion(&uverbs_dev->comp); + + put_device(&uverbs_dev->dev); +} + +static char *uverbs_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); +} + +static int __init ib_uverbs_init(void) +{ + int ret; + + ret = register_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR, + "infiniband_verbs"); + if (ret) { + pr_err("user_verbs: couldn't register device number\n"); + goto out; + } + + ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0, + IB_UVERBS_NUM_DYNAMIC_MINOR, + "infiniband_verbs"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + + uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); + if (IS_ERR(uverbs_class)) { + ret = PTR_ERR(uverbs_class); + pr_err("user_verbs: couldn't create class infiniband_verbs\n"); + goto out_chrdev; + } + + uverbs_class->devnode = uverbs_devnode; + + ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); + if (ret) { + pr_err("user_verbs: couldn't create abi_version attribute\n"); + goto out_class; + } + + ret = ib_register_client(&uverbs_client); + if (ret) { + pr_err("user_verbs: couldn't register client\n"); + goto out_class; + } + + return 0; + +out_class: + class_destroy(uverbs_class); + +out_chrdev: + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); + +out_alloc: + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); + +out: + return ret; +} + +static void __exit ib_uverbs_cleanup(void) +{ + ib_unregister_client(&uverbs_client); + class_destroy(uverbs_class); + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); + mmu_notifier_synchronize(); +} + +module_init(ib_uverbs_init); +module_exit(ib_uverbs_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_marshall.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_marshall.c new file mode 100644 index 0000000..11a0806 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_marshall.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) +{ + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + + memset(&dst->grh, 0, sizeof(dst->grh)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) > be16_to_cpu(IB_LID_PERMISSIVE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + + dst->dlid = rdma_ah_get_dlid(src); + dst->sl = rdma_ah_get_sl(src); + dst->src_path_bits = rdma_ah_get_path_bits(src); + dst->static_rate = rdma_ah_get_static_rate(src); + dst->is_global = rdma_ah_get_ah_flags(src) & + IB_AH_GRH ? 1 : 0; + if (dst->is_global) { + const struct ib_global_route *grh = rdma_ah_read_grh(src); + + memcpy(dst->grh.dgid, grh->dgid.raw, sizeof(grh->dgid)); + dst->grh.flow_label = grh->flow_label; + dst->grh.sgid_index = grh->sgid_index; + dst->grh.hop_limit = grh->hop_limit; + dst->grh.traffic_class = grh->traffic_class; + } + dst->port_num = rdma_ah_get_port_num(src); + dst->reserved = 0; +} +EXPORT_SYMBOL(ib_copy_ah_attr_to_user); + +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src) +{ + dst->qp_state = src->qp_state; + dst->cur_qp_state = src->cur_qp_state; + dst->path_mtu = src->path_mtu; + dst->path_mig_state = src->path_mig_state; + dst->qkey = src->qkey; + dst->rq_psn = src->rq_psn; + dst->sq_psn = src->sq_psn; + dst->dest_qp_num = src->dest_qp_num; + dst->qp_access_flags = src->qp_access_flags; + + dst->max_send_wr = src->cap.max_send_wr; + dst->max_recv_wr = src->cap.max_recv_wr; + dst->max_send_sge = src->cap.max_send_sge; + dst->max_recv_sge = src->cap.max_recv_sge; + dst->max_inline_data = src->cap.max_inline_data; + + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); + + dst->pkey_index = src->pkey_index; + dst->alt_pkey_index = src->alt_pkey_index; + dst->en_sqd_async_notify = src->en_sqd_async_notify; + dst->sq_draining = src->sq_draining; + dst->max_rd_atomic = src->max_rd_atomic; + dst->max_dest_rd_atomic = src->max_dest_rd_atomic; + dst->min_rnr_timer = src->min_rnr_timer; + dst->port_num = src->port_num; + dst->timeout = src->timeout; + dst->retry_cnt = src->retry_cnt; + dst->rnr_retry = src->rnr_retry; + dst->alt_port_num = src->alt_port_num; + dst->alt_timeout = src->alt_timeout; + memset(dst->reserved, 0, sizeof(dst->reserved)); +} 
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user); + +static void __ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct sa_path_rec *src) +{ + memcpy(dst->dgid, src->dgid.raw, sizeof(src->dgid)); + memcpy(dst->sgid, src->sgid.raw, sizeof(src->sgid)); + + dst->dlid = htons(ntohl(sa_path_get_dlid(src))); + dst->slid = htons(ntohl(sa_path_get_slid(src))); + dst->raw_traffic = sa_path_get_raw_traffic(src); + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct sa_path_rec *src) +{ + struct sa_path_rec rec; + + if (src->rec_type == SA_PATH_REC_TYPE_OPA) { + sa_convert_path_opa_to_ib(&rec, src); + __ib_copy_path_rec_to_user(dst, &rec); + return; + } + __ib_copy_path_rec_to_user(dst, src); +} +EXPORT_SYMBOL(ib_copy_path_rec_to_user); + +void ib_copy_path_rec_from_user(struct sa_path_rec *dst, + struct ib_user_path_rec *src) +{ + u32 slid, dlid; + + memset(dst, 0, sizeof(*dst)); + if ((ib_is_opa_gid((union ib_gid *)src->sgid)) || + (ib_is_opa_gid((union ib_gid *)src->dgid))) { + dst->rec_type = SA_PATH_REC_TYPE_OPA; + slid = opa_get_lid_from_gid((union ib_gid *)src->sgid); + dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid); + } else { + dst->rec_type = SA_PATH_REC_TYPE_IB; + slid = ntohs(src->slid); + dlid = ntohs(src->dlid); + } + memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); + memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); + + sa_path_set_dlid(dst, dlid); + sa_path_set_slid(dst, slid); + sa_path_set_raw_traffic(dst, src->raw_traffic); + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; + + /* TODO: No need to set this */ + sa_path_set_dmac_zero(dst); +} +EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types.c new file mode 100644 index 0000000..13776a6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_ah(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + return rdma_destroy_ah_user((struct ib_ah *)uobject->object, + RDMA_DESTROY_AH_SLEEPABLE, + &attrs->driver_udata); +} + +static int uverbs_free_flow(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_flow *flow = (struct ib_flow *)uobject->object; + struct ib_uflow_object *uflow = + container_of(uobject, struct ib_uflow_object, uobject); + struct ib_qp *qp = flow->qp; + int ret; + + ret = flow->device->ops.destroy_flow(flow); + if (!ret) { + if (qp) + atomic_dec(&qp->usecnt); + ib_uverbs_flow_resources_free(uflow->resources); + } + + return ret; +} + +static int uverbs_free_mw(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + return uverbs_dealloc_mw((struct ib_mw *)uobject->object); +} + +static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; + u32 table_size = (1 << rwq_ind_tbl->log_ind_tbl_size); + int ret, i; + + if (atomic_read(&rwq_ind_tbl->usecnt)) + return -EBUSY; + + ret = rwq_ind_tbl->device->ops.destroy_rwq_ind_table(rwq_ind_tbl); + if (ret) + return ret; + + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); + + kfree(rwq_ind_tbl); + kfree(ind_tbl); + return 0; +} + +static int uverbs_free_xrcd(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_xrcd *xrcd = uobject->object; + struct ib_uxrcd_object *uxrcd = + container_of(uobject, struct ib_uxrcd_object, uobject); + int ret; + + if (atomic_read(&uxrcd->refcnt)) + return -EBUSY; + + mutex_lock(&attrs->ufile->device->xrcd_tree_mutex); + ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs); + mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex); + + return ret; +} + +static int uverbs_free_pd(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_pd *pd 
= uobject->object; + + if (atomic_read(&pd->usecnt)) + return -EBUSY; + + return ib_dealloc_pd_user(pd, &attrs->driver_udata); +} + +void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue) +{ + struct ib_uverbs_event *entry, *tmp; + + spin_lock_irq(&event_queue->lock); + /* + * The user must ensure that no new items are added to the event_list + * once is_closed is set. + */ + event_queue->is_closed = 1; + spin_unlock_irq(&event_queue->lock); + wake_up_interruptible(&event_queue->poll_wait); + kill_fasync(&event_queue->async_queue, SIGIO, POLL_IN); + + spin_lock_irq(&event_queue->lock); + list_for_each_entry_safe(entry, tmp, &event_queue->event_list, list) { + if (entry->counter) + list_del(&entry->obj_list); + list_del(&entry->list); + kfree(entry); + } + spin_unlock_irq(&event_queue->lock); +} + +static void +uverbs_completion_event_file_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct ib_uverbs_completion_event_file *file = + container_of(uobj, struct ib_uverbs_completion_event_file, + uobj); + + ib_uverbs_free_event_queue(&file->ev_queue); +} + +int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) +{ + return 0; +} +EXPORT_SYMBOL(uverbs_destroy_def_handler); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), + uverbs_completion_event_file_destroy_uobj, + &uverbs_event_fops, + "[infinibandevent]", + O_RDONLY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_MW_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE, + UVERBS_OBJECT_MW, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw), + &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_AH_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE, + UVERBS_OBJECT_AH, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah), + &UVERBS_METHOD(UVERBS_METHOD_AH_DESTROY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_FLOW_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_HANDLE, + UVERBS_OBJECT_FLOW, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_FLOW, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object), + uverbs_free_flow), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_RWQ_IND_TBL_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl), + &UVERBS_METHOD(UVERBS_METHOD_RWQ_IND_TBL_DESTROY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_XRCD_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_XRCD, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), + uverbs_free_xrcd), + &UVERBS_METHOD(UVERBS_METHOD_XRCD_DESTROY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_PD_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd), + &UVERBS_METHOD(UVERBS_METHOD_PD_DESTROY)); + +const struct 
uapi_definition uverbs_def_obj_intf[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_PD, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH, + UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW, + UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + UVERBS_OBJECT_RWQ_IND_TBL, + UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_XRCD, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_async_fd.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_async_fd.c new file mode 100644 index 0000000..cc24cfd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_async_fd.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int UVERBS_HANDLER(UVERBS_METHOD_ASYNC_EVENT_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_METHOD_ASYNC_EVENT_ALLOC); + + ib_uverbs_init_async_event_file( + container_of(uobj, struct ib_uverbs_async_event_file, uobj)); + return 0; +} + +static void uverbs_async_event_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct ib_uverbs_async_event_file *event_file = + container_of(uobj, struct ib_uverbs_async_event_file, uobj); + + ib_unregister_event_handler(&event_file->event_handler); + + if (why == RDMA_REMOVE_DRIVER_REMOVE) + ib_uverbs_async_handler(event_file, 0, IB_EVENT_DEVICE_FATAL, + NULL, NULL); +} + +int uverbs_async_event_release(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_async_event_file *event_file; + struct ib_uobject *uobj = filp->private_data; + int ret; + + if (!uobj) + return uverbs_uobject_fd_release(inode, filp); + + event_file = + container_of(uobj, struct ib_uverbs_async_event_file, uobj); + + /* + * The async event FD has to deliver IB_EVENT_DEVICE_FATAL even after + * disassociation, so cleaning the event list must only happen after + * release. The user knows it has reached the end of the event stream + * when it sees IB_EVENT_DEVICE_FATAL. 
+ */ + uverbs_uobject_get(uobj); + ret = uverbs_uobject_fd_release(inode, filp); + ib_uverbs_free_event_queue(&event_file->ev_queue); + uverbs_uobject_put(uobj); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_ASYNC_EVENT_ALLOC, + UVERBS_ATTR_FD(UVERBS_ATTR_ASYNC_EVENT_ALLOC_FD_HANDLE, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_NEW, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_async_event_file), + uverbs_async_event_destroy_uobj, + &uverbs_async_event_fops, + "[infinibandevent]", + O_RDONLY), + &UVERBS_METHOD(UVERBS_METHOD_ASYNC_EVENT_ALLOC)); + +const struct uapi_definition uverbs_def_obj_async_fd[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_ASYNC_EVENT), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_counters.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_counters.c new file mode 100644 index 0000000..999da9c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_counters.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include + +static int uverbs_free_counters(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_counters *counters = uobject->object; + int ret; + + if (atomic_read(&counters->usecnt)) + return -EBUSY; + + ret = counters->device->ops.destroy_counters(counters); + if (ret) + return ret; + kfree(counters); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); + struct ib_device *ib_dev = attrs->context->device; + struct ib_counters *counters; + int ret; + + /* + * This check should be removed once the infrastructure + * have the ability to remove methods from parse tree once + * such condition is met. 
+ */ + if (!ib_dev->ops.create_counters) + return -EOPNOTSUPP; + + counters = rdma_zalloc_drv_obj(ib_dev, ib_counters); + if (!counters) + return -ENOMEM; + + counters->device = ib_dev; + counters->uobject = uobj; + uobj->object = counters; + atomic_set(&counters->usecnt, 0); + + ret = ib_dev->ops.create_counters(counters, attrs); + if (ret) + kfree(counters); + + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_counters_read_attr read_attr = {}; + const struct uverbs_attr *uattr; + struct ib_counters *counters = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE); + int ret; + + if (!counters->device->ops.read_counters) + return -EOPNOTSUPP; + + if (!atomic_read(&counters->usecnt)) + return -EINVAL; + + ret = uverbs_get_flags32(&read_attr.flags, attrs, + UVERBS_ATTR_READ_COUNTERS_FLAGS, + IB_UVERBS_READ_COUNTERS_PREFER_CACHED); + if (ret) + return ret; + + uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF); + read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64); + read_attr.counters_buff = uverbs_zalloc( + attrs, array_size(read_attr.ncounters, sizeof(u64))); + if (IS_ERR(read_attr.counters_buff)) + return PTR_ERR(read_attr.counters_buff); + + ret = counters->device->ops.read_counters(counters, &read_attr, attrs); + if (ret) + return ret; + + return uverbs_copy_to(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF, + read_attr.counters_buff, + read_attr.ncounters * sizeof(u64)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_COUNTERS_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_NEW, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_COUNTERS_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_COUNTERS_READ, + UVERBS_ATTR_IDR(UVERBS_ATTR_READ_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_READ_COUNTERS_BUFF, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_READ_COUNTERS_FLAGS, + enum ib_uverbs_read_counters_flags)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_counters), + &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY), + &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ)); + +const struct uapi_definition uverbs_def_obj_counters[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COUNTERS, + UAPI_DEF_OBJ_NEEDS_FN(destroy_counters)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_cq.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_cq.c new file mode 100644 index 0000000..370ad7c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_cq.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" +#include "restrack.h" + +static int uverbs_free_cq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_cq *cq = uobject->object; + struct ib_uverbs_event_queue *ev_queue = cq->cq_context; + struct ib_ucq_object *ucq = + container_of(uobject, struct ib_ucq_object, uevent.uobject); + int ret; + + ret = ib_destroy_cq_user(cq, &attrs->driver_udata); + if (ret) + return ret; + + ib_uverbs_release_ucq( + ev_queue ? 
container_of(ev_queue, + struct ib_uverbs_completion_event_file, + ev_queue) : + NULL, + ucq); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_device *ib_dev = attrs->context->device; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + struct ib_uobject *ev_file_uobj; + + if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, + UVERBS_ATTR_CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, + UVERBS_ATTR_CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_CQ_USER_HANDLE); + if (ret) + return ret; + + ret = uverbs_get_flags32(&attr.flags, attrs, + UVERBS_ATTR_CREATE_CQ_FLAGS, + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION | + IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN); + if (ret) + return ret; + + ev_file_uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_uobj)) { + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj); + uverbs_uobject_get(ev_file_uobj); + } + + obj->uevent.event_file = ib_uverbs_get_async_event( + attrs, UVERBS_ATTR_CREATE_CQ_EVENT_FD); + + if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } + + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->uevent.event_list); + + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; + goto err_event_file; + } + + cq->device = ib_dev; + cq->uobject = obj; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; + atomic_set(&cq->usecnt, 0); + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, NULL); + + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + + obj->uevent.uobject.object = cq; + obj->uevent.uobject.user_handle = user_handle; + rdma_restrack_add(&cq->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, + sizeof(cq->cqe)); + return ret; + +err_free: + rdma_restrack_put(&cq->res); + kfree(cq); +err_event_file: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_CQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, + enum ib_uverbs_ex_create_cq_flags), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE); + struct ib_ucq_object *obj = + container_of(uobj, struct ib_ucq_object, uevent.uobject); + struct ib_uverbs_destroy_cq_resp resp = { + .comp_events_reported = obj->comp_events_reported, + .async_events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_CQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_CQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq), + &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_cq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_CQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_cq)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_device.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_device.c new file mode 100644 index 0000000..0496848 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_device.c @@ -0,0 +1,503 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include "rdma_core.h" +#include "uverbs.h" +#include +#include +#include + +/* + * This ioctl method allows calling any defined write or write_ex + * handler. 
This essentially replaces the hdr/ex_hdr system with the ioctl + * marshalling, and brings the non-ex path into the same marshalling as the ex + * path. + */ +static int UVERBS_HANDLER(UVERBS_METHOD_INVOKE_WRITE)( + struct uverbs_attr_bundle *attrs) +{ + struct uverbs_api *uapi = attrs->ufile->device->uapi; + const struct uverbs_api_write_method *method_elm; + u32 cmd; + int rc; + + rc = uverbs_get_const(&cmd, attrs, UVERBS_ATTR_WRITE_CMD); + if (rc) + return rc; + + method_elm = uapi_get_method(uapi, cmd); + if (IS_ERR(method_elm)) + return PTR_ERR(method_elm); + + uverbs_fill_udata(attrs, &attrs->ucore, UVERBS_ATTR_CORE_IN, + UVERBS_ATTR_CORE_OUT); + + if (attrs->ucore.inlen < method_elm->req_size || + attrs->ucore.outlen < method_elm->resp_size) + return -ENOSPC; + + attrs->uobject = NULL; + rc = method_elm->handler(attrs); + if (attrs->uobject) + uverbs_finalize_object(attrs->uobject, UVERBS_ACCESS_NEW, true, + !rc, attrs); + return rc; +} + +DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_WRITE_CMD, + enum ib_uverbs_write_cmds, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CORE_IN, + UVERBS_ATTR_MIN_SIZE(sizeof(u32)), + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CORE_OUT, + UVERBS_ATTR_MIN_SIZE(0), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static uint32_t * +gather_objects_handle(struct ib_uverbs_file *ufile, + const struct uverbs_api_object *uapi_object, + struct uverbs_attr_bundle *attrs, + ssize_t out_len, + u64 *total) +{ + u64 max_count = out_len / sizeof(u32); + struct ib_uobject *obj; + u64 count = 0; + u32 *handles; + + /* Allocated memory that cannot page out where we gather + * all object ids under a spin_lock. + */ + handles = uverbs_zalloc(attrs, out_len); + if (IS_ERR(handles)) + return handles; + + spin_lock_irq(&ufile->uobjects_lock); + list_for_each_entry(obj, &ufile->uobjects, list) { + u32 obj_id = obj->id; + + if (obj->uapi_object != uapi_object) + continue; + + if (count >= max_count) + break; + + handles[count] = obj_id; + count++; + } + spin_unlock_irq(&ufile->uobjects_lock); + + *total = count; + return handles; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_INFO_HANDLES)( + struct uverbs_attr_bundle *attrs) +{ + const struct uverbs_api_object *uapi_object; + ssize_t out_len; + u64 total = 0; + u16 object_id; + u32 *handles; + int ret; + + out_len = uverbs_attr_get_len(attrs, UVERBS_ATTR_INFO_HANDLES_LIST); + if (out_len <= 0 || (out_len % sizeof(u32) != 0)) + return -EINVAL; + + ret = uverbs_get_const(&object_id, attrs, UVERBS_ATTR_INFO_OBJECT_ID); + if (ret) + return ret; + + uapi_object = uapi_get_object(attrs->ufile->device->uapi, object_id); + if (IS_ERR(uapi_object)) + return PTR_ERR(uapi_object); + + handles = gather_objects_handle(attrs->ufile, uapi_object, attrs, + out_len, &total); + if (IS_ERR(handles)) + return PTR_ERR(handles); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_HANDLES_LIST, handles, + sizeof(u32) * total); + if (ret) + goto err; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_TOTAL_HANDLES, &total, + sizeof(total)); +err: + return ret; +} + +void copy_port_attr_to_resp(struct ib_port_attr *attr, + struct ib_uverbs_query_port_resp *resp, + struct ib_device *ib_dev, u8 port_num) +{ + resp->state = attr->state; + resp->max_mtu = attr->max_mtu; + resp->active_mtu = attr->active_mtu; + resp->gid_tbl_len = attr->gid_tbl_len; + resp->port_cap_flags = make_port_cap_flags(attr); + resp->max_msg_sz = attr->max_msg_sz; + resp->bad_pkey_cntr = attr->bad_pkey_cntr; + resp->qkey_viol_cntr = 
attr->qkey_viol_cntr; + resp->pkey_tbl_len = attr->pkey_tbl_len; + + if (rdma_is_grh_required(ib_dev, port_num)) + resp->flags |= IB_UVERBS_QPF_GRH_REQUIRED; + + if (rdma_cap_opa_ah(ib_dev, port_num)) { + resp->lid = OPA_TO_IB_UCAST_LID(attr->lid); + resp->sm_lid = OPA_TO_IB_UCAST_LID(attr->sm_lid); + } else { + resp->lid = ib_lid_cpu16(attr->lid); + resp->sm_lid = ib_lid_cpu16(attr->sm_lid); + } + + resp->lmc = attr->lmc; + resp->max_vl_num = attr->max_vl_num; + resp->sm_sl = attr->sm_sl; + resp->subnet_timeout = attr->subnet_timeout; + resp->init_type_reply = attr->init_type_reply; + resp->active_width = attr->active_width; + /* This ABI needs to be extended to provide any speed more than IB_SPEED_NDR */ + resp->active_speed = min_t(u16, attr->active_speed, IB_SPEED_NDR); + resp->phys_state = attr->phys_state; + resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_device *ib_dev; + struct ib_port_attr attr = {}; + struct ib_uverbs_query_port_resp_ex resp = {}; + struct ib_ucontext *ucontext; + int ret; + u8 port_num; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */ + if (!ib_dev->ops.query_port) + return -EOPNOTSUPP; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_PORT_PORT_NUM); + if (ret) + return ret; + + ret = ib_query_port(ib_dev, port_num, &attr); + if (ret) + return ret; + + copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num); + resp.port_cap_flags2 = attr.port_cap_flags2; + + return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP, + &resp, sizeof(resp)); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( + struct uverbs_attr_bundle *attrs) +{ + u32 num_comp = attrs->ufile->device->num_comp_vectors; + u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + int ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, + &num_comp, sizeof(num_comp)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, + &core_support, sizeof(core_support)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = ib_alloc_ucontext(attrs); + if (ret) + return ret; + ret = ib_init_ucontext(attrs); + if (ret) { + kfree(attrs->context); + attrs->context = NULL; + return ret; + } + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)( + struct uverbs_attr_bundle *attrs) +{ + u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + u32 num_comp; + int ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!ib_dev->ops.query_ucontext) + return -EOPNOTSUPP; + + num_comp = attrs->ufile->device->num_comp_vectors; + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + &num_comp, sizeof(num_comp)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + &core_support, sizeof(core_support)); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + return ucontext->device->ops.query_ucontext(ucontext, attrs); +} + +static int copy_gid_entries_to_user(struct uverbs_attr_bundle *attrs, + struct ib_uverbs_gid_entry *entries, + size_t num_entries, size_t user_entry_size) +{ + 
const struct uverbs_attr *attr; + void __user *user_entries; + size_t copy_len; + int ret; + int i; + + if (user_entry_size == sizeof(*entries)) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + entries, sizeof(*entries) * num_entries); + return ret; + } + + copy_len = min_t(size_t, user_entry_size, sizeof(*entries)); + attr = uverbs_attr_get(attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); + if (IS_ERR(attr)) + return PTR_ERR(attr); + + user_entries = u64_to_user_ptr(attr->ptr_attr.data); + for (i = 0; i < num_entries; i++) { + if (copy_to_user(user_entries, entries, copy_len)) + return -EFAULT; + + if (user_entry_size > sizeof(*entries)) { + if (clear_user(user_entries + sizeof(*entries), + user_entry_size - sizeof(*entries))) + return -EFAULT; + } + + entries++; + user_entries += user_entry_size; + } + + return uverbs_output_written(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_TABLE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry *entries; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + size_t user_entry_size; + ssize_t num_entries; + int max_entries; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, 0); + if (ret) + return ret; + + ret = uverbs_get_const(&user_entry_size, attrs, + UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE); + if (ret) + return ret; + + if (!user_entry_size) + return -EINVAL; + + max_entries = uverbs_attr_ptr_get_array_size( + attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + user_entry_size); + if (max_entries <= 0) + return max_entries ?: -EINVAL; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + entries = uverbs_kcalloc(attrs, max_entries, sizeof(*entries)); + if (IS_ERR(entries)) + return PTR_ERR(entries); + + num_entries = rdma_query_gid_table(ib_dev, entries, max_entries); + if (num_entries < 0) + return -EINVAL; + + ret = copy_gid_entries_to_user(attrs, entries, num_entries, + user_entry_size); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + &num_entries, sizeof(num_entries)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_ENTRY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_gid_entry entry = {}; + const struct ib_gid_attr *gid_attr; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + struct net_device *ndev; + u32 gid_index; + u32 port_num; + u32 flags; + int ret; + + ret = uverbs_get_flags32(&flags, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, 0); + if (ret) + return ret; + + ret = uverbs_get_const(&port_num, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_PORT); + if (ret) + return ret; + + ret = uverbs_get_const(&gid_index, attrs, + UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX); + if (ret) + return ret; + + ucontext = ib_uverbs_get_ucontext(attrs); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + + if (!rdma_is_port_valid(ib_dev, port_num)) + return -EINVAL; + + gid_attr = rdma_get_gid_attr(ib_dev, port_num, gid_index); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + + memcpy(&entry.gid, &gid_attr->gid, sizeof(gid_attr->gid)); + entry.gid_index = gid_attr->index; + entry.port_num = gid_attr->port_num; + entry.gid_type = gid_attr->gid_type; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(gid_attr); + if (IS_ERR(ndev)) { + if (PTR_ERR(ndev) != -ENODEV) { + ret = 
PTR_ERR(ndev); + rcu_read_unlock(); + goto out; + } + } else { + entry.netdev_ifindex = ndev->ifindex; + } + rcu_read_unlock(); + + ret = uverbs_copy_to_struct_or_zero( + attrs, UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, &entry, + sizeof(entry)); +out: + rdma_put_gid_attr(gid_attr); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_GET_CONTEXT, + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_TYPE(u64), UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + UVERBS_ATTR_TYPE(u64), UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_INFO_HANDLES, + /* Also includes any device specific object ids */ + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_INFO_OBJECT_ID, + enum uverbs_default_objects, UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_TOTAL_HANDLES, + UVERBS_ATTR_TYPE(u32), UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_HANDLES_LIST, + UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_PORT, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_PORT_NUM, u8, UA_MANDATORY), + UVERBS_ATTR_PTR_OUT( + UVERBS_ATTR_QUERY_PORT_RESP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex, + reserved), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_TABLE, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, u32, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + UVERBS_ATTR_MIN_SIZE(0), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_GID_ENTRY, + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_PORT, u32, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX, u32, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, u32, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, + UVERBS_ATTR_STRUCT(struct ib_uverbs_gid_entry, + netdev_ifindex), + UA_MANDATORY)); + +DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE, + &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT), + &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), + &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY)); + +const struct uapi_definition uverbs_def_obj_device[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE), + {}, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_dm.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_dm.c new file mode 100644 index 0000000..98c522c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_dm.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include + +static int uverbs_free_dm(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_dm *dm = uobject->object; + + if (atomic_read(&dm->usecnt)) + return -EBUSY; + + return dm->device->ops.dealloc_dm(dm, attrs); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_dm_alloc_attr attr = {}; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE) + ->obj_attr.uobject; + struct ib_device *ib_dev = attrs->context->device; + struct ib_dm *dm; + int ret; + + if (!ib_dev->ops.alloc_dm) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.length, attrs, + UVERBS_ATTR_ALLOC_DM_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.alignment, attrs, + UVERBS_ATTR_ALLOC_DM_ALIGNMENT); + if (ret) + return ret; + + dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs); + if (IS_ERR(dm)) + return PTR_ERR(dm); + + dm->device = ib_dev; + dm->length = attr.length; + dm->uobject = uobj; + atomic_set(&dm->usecnt, 0); + + uobj->object = dm; + + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DM_ALLOC, + UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_ALIGNMENT, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_DM_FREE, + UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_dm), + &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC), + &UVERBS_METHOD(UVERBS_METHOD_DM_FREE)); + +const struct uapi_definition uverbs_def_obj_dm[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DM, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_dm)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_flow_action.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_flow_action.c new file mode 100644 index 0000000..d42ed7f --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include + +static int uverbs_free_flow_action(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_flow_action *action = uobject->object; + + if (atomic_read(&action->usecnt)) + return -EBUSY; + + return action->device->ops.destroy_flow_action(action); +} + +static u64 esp_flags_uverbs_to_verbs(struct uverbs_attr_bundle *attrs, + u32 flags, bool is_modify) +{ + u64 verbs_flags = flags; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ESN)) + verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED; + + if (is_modify && uverbs_attr_is_valid(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) + verbs_flags |= IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS; + + return verbs_flags; +}; + +static int validate_flow_action_esp_keymat_aes_gcm(struct ib_flow_action_attrs_esp_keymats *keymat) +{ + struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm = + &keymat->keymat.aes_gcm; + + if (aes_gcm->iv_algo > IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) + return -EOPNOTSUPP; + + if (aes_gcm->key_len != 32 && + aes_gcm->key_len != 24 && + aes_gcm->key_len != 16) + return -EINVAL; + + if (aes_gcm->icv_len != 16 && + aes_gcm->icv_len != 8 && + aes_gcm->icv_len != 12) + return -EINVAL; + + return 0; +} + +static int (* const flow_action_esp_keymat_validate[])(struct ib_flow_action_attrs_esp_keymats *keymat) = { + [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = validate_flow_action_esp_keymat_aes_gcm, +}; + +static int flow_action_esp_replay_none(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) +{ + /* This is used in order to modify an esp flow action with an enabled + * replay protection to a disabled one. This is only supported via + * modify, as in create verb we can simply drop the REPLAY attribute and + * achieve the same thing. + */ + return is_modify ? 
0 : -EINVAL; +} + +static int flow_action_esp_replay_def_ok(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) +{ + /* Some replay protections could always be enabled without validating + * anything. + */ + return 0; +} + +static int (* const flow_action_esp_replay_validate[])(struct ib_flow_action_attrs_esp_replays *replay, + bool is_modify) = { + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = flow_action_esp_replay_none, + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = flow_action_esp_replay_def_ok, +}; + +static int parse_esp_ip(enum ib_flow_spec_type proto, + const void __user *val_ptr, + size_t len, union ib_flow_spec *out) +{ + int ret; + const struct ib_uverbs_flow_ipv4_filter ipv4 = { + .src_ip = cpu_to_be32(0xffffffffUL), + .dst_ip = cpu_to_be32(0xffffffffUL), + .proto = 0xff, + .tos = 0xff, + .ttl = 0xff, + .flags = 0xff, + }; + const struct ib_uverbs_flow_ipv6_filter ipv6 = { + .src_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + .dst_ip = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + .flow_label = cpu_to_be32(0xffffffffUL), + .next_hdr = 0xff, + .traffic_class = 0xff, + .hop_limit = 0xff, + }; + union { + struct ib_uverbs_flow_ipv4_filter ipv4; + struct ib_uverbs_flow_ipv6_filter ipv6; + } user_val = {}; + const void *user_pmask; + size_t val_len; + + /* If the flow IPv4/IPv6 flow specifications are extended, the mask + * should be changed as well. + */ + BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv4_filter, flags) + + sizeof(ipv4.flags) != sizeof(ipv4)); + BUILD_BUG_ON(offsetof(struct ib_uverbs_flow_ipv6_filter, reserved) + + sizeof(ipv6.reserved) != sizeof(ipv6)); + + switch (proto) { + case IB_FLOW_SPEC_IPV4: + if (len > sizeof(user_val.ipv4) && + !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv4), + len - sizeof(user_val.ipv4))) + return -EOPNOTSUPP; + + val_len = min_t(size_t, len, sizeof(user_val.ipv4)); + ret = copy_from_user(&user_val.ipv4, val_ptr, + val_len); + if (ret) + return -EFAULT; + + user_pmask = &ipv4; + break; + case IB_FLOW_SPEC_IPV6: + if (len > sizeof(user_val.ipv6) && + !ib_is_buffer_cleared(val_ptr + sizeof(user_val.ipv6), + len - sizeof(user_val.ipv6))) + return -EOPNOTSUPP; + + val_len = min_t(size_t, len, sizeof(user_val.ipv6)); + ret = copy_from_user(&user_val.ipv6, val_ptr, + val_len); + if (ret) + return -EFAULT; + + user_pmask = &ipv6; + break; + default: + return -EOPNOTSUPP; + } + + return ib_uverbs_kern_spec_to_ib_spec_filter(proto, user_pmask, + &user_val, + val_len, out); +} + +static int flow_action_esp_get_encap(struct ib_flow_spec_list *out, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_flow_action_esp_encap uverbs_encap; + int ret; + + ret = uverbs_copy_from(&uverbs_encap, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP); + if (ret) + return ret; + + /* We currently support only one encap */ + if (uverbs_encap.next_ptr) + return -EOPNOTSUPP; + + if (uverbs_encap.type != IB_FLOW_SPEC_IPV4 && + uverbs_encap.type != IB_FLOW_SPEC_IPV6) + return -EOPNOTSUPP; + + return parse_esp_ip(uverbs_encap.type, + u64_to_user_ptr(uverbs_encap.val_ptr), + uverbs_encap.len, + &out->spec); +} + +struct ib_flow_action_esp_attr { + struct ib_flow_action_attrs_esp hdr; + struct ib_flow_action_attrs_esp_keymats keymat; + struct ib_flow_action_attrs_esp_replays replay; + /* We currently support only one spec */ + struct ib_flow_spec_list encap; +}; + +#define ESP_LAST_SUPPORTED_FLAG IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW 
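/*
 * Illustrative aside (not from the original MLNX_OFED sources): the
 * parse_flow_action_esp() routine below rejects any userspace ESP flag the
 * core does not understand by masking against
 * ((ESP_LAST_SUPPORTED_FLAG << 1) - 1).  Because ESP_LAST_SUPPORTED_FLAG is
 * the highest supported bit, that expression yields a mask covering every
 * supported bit at once.  Worked example with a purely hypothetical value
 * ESP_LAST_SUPPORTED_FLAG == 0x8:
 *
 *	(0x8 << 1) - 1           == 0xf   mask of all supported bits
 *	flags == 0x05: 0x05 & ~0xf == 0   accepted
 *	flags == 0x10: 0x10 & ~0xf != 0   rejected with -EOPNOTSUPP
 */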
+static int parse_flow_action_esp(struct ib_device *ib_dev, + struct uverbs_attr_bundle *attrs, + struct ib_flow_action_esp_attr *esp_attr, + bool is_modify) +{ + struct ib_uverbs_flow_action_esp uverbs_esp = {}; + int ret; + + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + ret = uverbs_copy_from(&esp_attr->hdr.esn, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ESN); + if (IS_UVERBS_COPY_ERR(ret)) + return ret; + + /* This can be called from FLOW_ACTION_ESP_MODIFY where + * UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS is optional + */ + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS)) { + ret = uverbs_copy_from_or_zero(&uverbs_esp, attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS); + if (ret) + return ret; + + if (uverbs_esp.flags & ~((ESP_LAST_SUPPORTED_FLAG << 1) - 1)) + return -EOPNOTSUPP; + + esp_attr->hdr.spi = uverbs_esp.spi; + esp_attr->hdr.seq = uverbs_esp.seq; + esp_attr->hdr.tfc_pad = uverbs_esp.tfc_pad; + esp_attr->hdr.hard_limit_pkts = uverbs_esp.hard_limit_pkts; + } + esp_attr->hdr.flags = esp_flags_uverbs_to_verbs(attrs, uverbs_esp.flags, + is_modify); + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT)) { + esp_attr->keymat.protocol = + uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); + ret = uverbs_copy_from_or_zero(&esp_attr->keymat.keymat, + attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT); + if (ret) + return ret; + + ret = flow_action_esp_keymat_validate[esp_attr->keymat.protocol](&esp_attr->keymat); + if (ret) + return ret; + + esp_attr->hdr.keymat = &esp_attr->keymat; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY)) { + esp_attr->replay.protocol = + uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); + + ret = uverbs_copy_from_or_zero(&esp_attr->replay.replay, + attrs, + UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY); + if (ret) + return ret; + + ret = flow_action_esp_replay_validate[esp_attr->replay.protocol](&esp_attr->replay, + is_modify); + if (ret) + return ret; + + esp_attr->hdr.replay = &esp_attr->replay; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP)) { + ret = flow_action_esp_get_encap(&esp_attr->encap, attrs); + if (ret) + return ret; + + esp_attr->hdr.encap = &esp_attr->encap; + } + + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE); + struct ib_device *ib_dev = attrs->context->device; + int ret; + struct ib_flow_action *action; + struct ib_flow_action_esp_attr esp_attr = {}; + + if (!ib_dev->ops.create_flow_action_esp) + return -EOPNOTSUPP; + + ret = parse_flow_action_esp(ib_dev, attrs, &esp_attr, false); + if (ret) + return ret; + + /* No need to check as this attribute is marked as MANDATORY */ + action = ib_dev->ops.create_flow_action_esp(ib_dev, &esp_attr.hdr, + attrs); + if (IS_ERR(action)) + return PTR_ERR(action); + + uverbs_flow_action_fill_action(action, uobj, ib_dev, + IB_FLOW_ACTION_ESP); + + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE); + struct ib_flow_action *action = uobj->object; + int ret; + struct ib_flow_action_esp_attr esp_attr = {}; + + if (!action->device->ops.modify_flow_action_esp) + return -EOPNOTSUPP; + + ret = parse_flow_action_esp(action->device, attrs, &esp_attr, 
true); + if (ret) + return ret; + + if (action->type != IB_FLOW_ACTION_ESP) + return -EINVAL; + + return action->device->ops.modify_flow_action_esp(action, + &esp_attr.hdr, + attrs); +} + +static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { + [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_STRUCT( + struct ib_uverbs_flow_action_esp_keymat_aes_gcm, + aes_key), + }, +}; + +static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = { + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp, + size), + }, +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, + hard_limit_pkts), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + UVERBS_ATTR_TYPE(__u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat, + UA_MANDATORY), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN( + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, + UVERBS_ATTR_IDR(UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_WRITE, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, + hard_limit_pkts), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + UVERBS_ATTR_TYPE(__u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat, + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN( + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_FLOW_ACTION_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)); + +const struct uapi_definition uverbs_def_obj_flow_action[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + UVERBS_OBJECT_FLOW_ACTION, + UAPI_DEF_OBJ_NEEDS_FN(destroy_flow_action)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_mr.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_mr.c new file mode 100644 index 0000000..03e1db5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_mr.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include +#include "restrack.h" + +static int uverbs_free_mr(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + return ib_dereg_mr_user((struct ib_mr *)uobject->object, + &attrs->driver_udata); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_ADVISE_MR_PD_HANDLE); + enum ib_uverbs_advise_mr_advice advice; + struct ib_device *ib_dev = pd->device; + struct ib_sge *sg_list; + int num_sge; + u32 flags; + int ret; + + /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. 
*/ + if (!ib_dev->ops.advise_mr) + return -EOPNOTSUPP; + + ret = uverbs_get_const(&advice, attrs, UVERBS_ATTR_ADVISE_MR_ADVICE); + if (ret) + return ret; + + ret = uverbs_get_flags32(&flags, attrs, UVERBS_ATTR_ADVISE_MR_FLAGS, + IB_UVERBS_ADVISE_MR_FLAG_FLUSH); + if (ret) + return ret; + + num_sge = uverbs_attr_ptr_get_array_size( + attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge)); + if (num_sge <= 0) + return num_sge; + + sg_list = uverbs_attr_get_alloced_ptr(attrs, + UVERBS_ATTR_ADVISE_MR_SGE_LIST); + return ib_dev->ops.advise_mr(pd, advice, flags, sg_list, num_sge, + attrs); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_dm_mr_attr attr = {}; + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE); + struct ib_dm *dm = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_DM_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_PD_HANDLE); + struct ib_device *ib_dev = pd->device; + + struct ib_mr *mr; + int ret; + + if (!ib_dev->ops.reg_dm_mr) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.offset, attrs, UVERBS_ATTR_REG_DM_MR_OFFSET); + if (ret) + return ret; + + ret = uverbs_copy_from(&attr.length, attrs, + UVERBS_ATTR_REG_DM_MR_LENGTH); + if (ret) + return ret; + + ret = uverbs_get_flags32(&attr.access_flags, attrs, + UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + IB_ACCESS_SUPPORTED); + if (ret) + return ret; + + if (!(attr.access_flags & IB_ZERO_BASED)) + return -EINVAL; + + ret = ib_check_mr_access(ib_dev, attr.access_flags); + if (ret) + return ret; + + if (attr.offset > dm->length || attr.length > dm->length || + attr.length > dm->length - attr.offset) + return -EINVAL; + + mr = pd->device->ops.reg_dm_mr(pd, dm, &attr, attrs); + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_DM; + mr->dm = dm; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + atomic_inc(&dm->usecnt); + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; + + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_mr *mr = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_QUERY_MR_HANDLE); + int ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LKEY, &mr->lkey, + sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + &mr->length, sizeof(mr->length)); + + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_IOVA, + &mr->iova, sizeof(mr->iova)); + + return IS_UVERBS_COPY_ERR(ret) ? 
ret : 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE); + struct ib_device *ib_dev = pd->device; + + u64 offset, length, iova; + u32 fd, access_flags; + struct ib_mr *mr; + int ret; + + if (!ib_dev->ops.reg_user_mr_dmabuf) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&offset, attrs, + UVERBS_ATTR_REG_DMABUF_MR_OFFSET); + if (ret) + return ret; + + ret = uverbs_copy_from(&length, attrs, + UVERBS_ATTR_REG_DMABUF_MR_LENGTH); + if (ret) + return ret; + + ret = uverbs_copy_from(&iova, attrs, + UVERBS_ATTR_REG_DMABUF_MR_IOVA); + if (ret) + return ret; + + if ((offset & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + + ret = uverbs_copy_from(&fd, attrs, + UVERBS_ATTR_REG_DMABUF_MR_FD); + if (ret) + return ret; + + ret = uverbs_get_flags32(&access_flags, attrs, + UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_RELAXED_ORDERING); + if (ret) + return ret; + + ret = ib_check_mr_access(ib_dev, access_flags); + if (ret) + return ret; + + mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, + access_flags, + &attrs->driver_udata); + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; + + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY, + &mr->lkey, sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_ADVISE_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_ADVISE_MR_ADVICE, + enum ib_uverbs_advise_mr_advice, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_ADVISE_MR_FLAGS, + enum ib_uverbs_advise_mr_flag, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ADVISE_MR_SGE_LIST, + UVERBS_ATTR_MIN_SIZE(sizeof(struct ib_uverbs_sge)), + UA_MANDATORY, + UA_ALLOC_AND_COPY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QUERY_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DM_MR_REG, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + 
UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + enum ib_access_flags), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_FD, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum ib_access_flags), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_MR_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_MR, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr), + &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR), + &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), + &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), + &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR), + &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR)); + +const struct uapi_definition uverbs_def_obj_mr[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, + UAPI_DEF_OBJ_NEEDS_FN(dereg_mr)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_qp.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_qp.c new file mode 100644 index 0000000..dd10754 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_qp.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" +#include "core_priv.h" + +static int uverbs_free_qp(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_qp *qp = uobject->object; + struct ib_uqp_object *uqp = + container_of(uobject, struct ib_uqp_object, uevent.uobject); + int ret; + + /* + * If this is a user triggered destroy then do not allow destruction + * until the user cleans up all the mcast bindings. Unlike in other + * places we forcibly clean up the mcast attachments for !DESTROY + * because the mcast attaches are not ubojects and will not be + * destroyed by anything else during cleanup processing. 
+ */ + if (why == RDMA_REMOVE_DESTROY) { + if (!list_empty(&uqp->mcast_list)) + return -EBUSY; + } else if (qp == qp->real_qp) { + ib_uverbs_detach_umcast(qp, uqp); + } + + ret = ib_destroy_qp_user(qp, &attrs->driver_udata); + if (ret) + return ret; + + if (uqp->uxrcd) + atomic_dec(&uqp->uxrcd->refcnt); + + ib_uverbs_release_uevent(&uqp->uevent); + return 0; +} + +static int check_creation_flags(enum ib_qp_type qp_type, + u32 create_flags) +{ + create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + + if (!create_flags || qp_type == IB_QPT_DRIVER) + return 0; + + if (qp_type != IB_QPT_RAW_PACKET && qp_type != IB_QPT_UD) + return -EINVAL; + + if ((create_flags & IB_UVERBS_QP_CREATE_SCATTER_FCS || + create_flags & IB_UVERBS_QP_CREATE_CVLAN_STRIPPING) && + qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + return 0; +} + +static void set_caps(struct ib_qp_init_attr *attr, + struct ib_uverbs_qp_cap *cap, bool req) +{ + if (req) { + attr->cap.max_send_wr = cap->max_send_wr; + attr->cap.max_recv_wr = cap->max_recv_wr; + attr->cap.max_send_sge = cap->max_send_sge; + attr->cap.max_recv_sge = cap->max_recv_sge; + attr->cap.max_inline_data = cap->max_inline_data; + } else { + cap->max_send_wr = attr->cap.max_send_wr; + cap->max_recv_wr = attr->cap.max_recv_wr; + cap->max_send_sge = attr->cap.max_send_sge; + cap->max_recv_sge = attr->cap.max_recv_sge; + cap->max_inline_data = attr->cap.max_inline_data; + } +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uqp_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_QP_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_qp_cap cap = {}; + struct ib_rwq_ind_table *rwq_ind_tbl = NULL; + struct ib_qp *qp; + struct ib_pd *pd = NULL; + struct ib_srq *srq = NULL; + struct ib_cq *recv_cq = NULL; + struct ib_cq *send_cq = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *xrcd_uobj = NULL; + struct ib_device *device; + u64 user_handle; + int ret; + + ret = uverbs_copy_from_or_zero(&cap, attrs, + UVERBS_ATTR_CREATE_QP_CAP); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_QP_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.qp_type, attrs, + UVERBS_ATTR_CREATE_QP_TYPE); + if (ret) + return ret; + + switch (attr.qp_type) { + case IB_QPT_XRC_TGT: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE)) + return -EINVAL; + + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) + return -EINVAL; + device = xrcd->device; + break; + case IB_UVERBS_QPT_RAW_PACKET: + if (!capable(CAP_NET_RAW)) + return -EPERM; + fallthrough; + case IB_UVERBS_QPT_RC: + case IB_UVERBS_QPT_UC: + case IB_UVERBS_QPT_UD: + case IB_UVERBS_QPT_XRC_INI: + case IB_UVERBS_QPT_DRIVER: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE) || + (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE) && + attr.qp_type == IB_QPT_XRC_INI)) + return -EINVAL; + + pd = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + rwq_ind_tbl = uverbs_attr_get_obj(attrs, + 
UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE); + if (!IS_ERR(rwq_ind_tbl)) { + if (cap.max_recv_wr || cap.max_recv_sge || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE)) + return -EINVAL; + + /* send_cq is optinal */ + if (cap.max_send_wr) { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + } + attr.rwq_ind_tbl = rwq_ind_tbl; + } else { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + + if (attr.qp_type != IB_QPT_XRC_INI) { + recv_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE); + if (IS_ERR(recv_cq)) + return PTR_ERR(recv_cq); + } + } + + device = pd->device; + break; + default: + return -EINVAL; + } + + ret = uverbs_get_flags32(&attr.create_flags, attrs, + UVERBS_ATTR_CREATE_QP_FLAGS, + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_UVERBS_QP_CREATE_SCATTER_FCS | + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING | + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING | + IB_UVERBS_QP_CREATE_SQ_SIG_ALL); + if (ret) + return ret; + + ret = check_creation_flags(attr.qp_type, attr.create_flags); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN)) { + ret = uverbs_copy_from(&attr.source_qpn, attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN); + if (ret) + return ret; + attr.create_flags |= IB_QP_CREATE_SOURCE_QPN; + } + + srq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE); + if (!IS_ERR(srq)) { + if ((srq->srq_type == IB_SRQT_XRC && + attr.qp_type != IB_QPT_XRC_TGT) || + (srq->srq_type != IB_SRQT_XRC && + attr.qp_type == IB_QPT_XRC_TGT)) + return -EINVAL; + attr.srq = srq; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_QP_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + obj->uevent.uobject.user_handle = user_handle; + attr.event_handler = ib_uverbs_qp_event_handler; + attr.send_cq = send_cq; + attr.recv_cq = recv_cq; + attr.xrcd = xrcd; + if (attr.create_flags & IB_UVERBS_QP_CREATE_SQ_SIG_ALL) { + /* This creation bit is uverbs one, need to mask before + * calling drivers. It was added to prevent an extra user attr + * only for that when using ioctl. 
+ */ + attr.create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + attr.sq_sig_type = IB_SIGNAL_ALL_WR; + } else { + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + } + + set_caps(&attr, &cap, true); + mutex_init(&obj->mcast_lock); + + qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj, + KBUILD_MODNAME); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + ib_qp_usecnt_inc(qp); + + if (attr.qp_type == IB_QPT_XRC_TGT) { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + } + + obj->uevent.uobject.object = qp; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_QP_HANDLE); + + set_caps(&attr, &cap, false); + ret = uverbs_copy_to_struct_or_zero(attrs, + UVERBS_ATTR_CREATE_QP_RESP_CAP, &cap, + sizeof(cap)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + &qp->qp_num, + sizeof(qp->qp_num)); + + return ret; +err_put: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_QP_TYPE, + enum ib_uverbs_qp_type, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_QP_FLAGS, + enum ib_uverbs_qp_create_flags, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_QP_HANDLE); + struct ib_uqp_object *obj = + container_of(uobj, struct ib_uqp_object, uevent.uobject); + struct ib_uverbs_destroy_qp_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_QP_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_QP_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_qp_resp), + UA_MANDATORY)); 
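/*
 * Illustrative aside (not from the original MLNX_OFED sources): the destroy
 * methods added by this patch (QP here, and CQ/SRQ/WQ in the sibling files)
 * share the same shape.  The handler itself only copies the events_reported
 * counters back to userspace; the actual teardown is driven by the
 * UVERBS_ACCESS_DESTROY attribute, which, roughly, makes the rdma_core
 * uobject machinery invoke the uverbs_free_*() callback registered through
 * UVERBS_TYPE_ALLOC_IDR_SZ() once the handler succeeds:
 *
 *	ioctl(UVERBS_METHOD_QP_DESTROY)
 *	  -> UVERBS_HANDLER(UVERBS_METHOD_QP_DESTROY): copy resp counters
 *	  -> uobject cleanup on success
 *	  -> uverbs_free_qp(): ib_destroy_qp_user() + uevent release
 */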
+ +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_QP, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp), + &UVERBS_METHOD(UVERBS_METHOD_QP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_QP_DESTROY)); + +const struct uapi_definition uverbs_def_obj_qp[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, + UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_srq.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_srq.c new file mode 100644 index 0000000..e5513f8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_srq.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_srq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_srq *srq = uobject->object; + struct ib_uevent_object *uevent = + container_of(uobject, struct ib_uevent_object, uobject); + enum ib_srq_type srq_type = srq->srq_type; + int ret; + + ret = ib_destroy_srq_user(srq, &attrs->driver_udata); + if (ret) + return ret; + + if (srq_type == IB_SRQT_XRC) { + struct ib_usrq_object *us = + container_of(uobject, struct ib_usrq_object, + uevent.uobject); + + atomic_dec(&us->uxrcd->refcnt); + } + + ib_uverbs_release_uevent(uevent); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_usrq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE); + struct ib_srq_init_attr attr = {}; + struct ib_uobject *xrcd_uobj; + struct ib_srq *srq; + u64 user_handle; + int ret; + + ret = uverbs_copy_from(&attr.attr.max_sge, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&attr.attr.max_wr, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&attr.attr.srq_limit, attrs, + UVERBS_ATTR_CREATE_SRQ_LIMIT); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.srq_type, attrs, + UVERBS_ATTR_CREATE_SRQ_TYPE); + if (ret) + return ret; + + if (ib_srq_has_cq(attr.srq_type)) { + attr.ext.cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE); + if (IS_ERR(attr.ext.cq)) + return PTR_ERR(attr.ext.cq); + } + + switch (attr.srq_type) { + case IB_UVERBS_SRQT_XRC: + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) + return -EINVAL; + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + break; + case IB_UVERBS_SRQT_TM: + ret = uverbs_copy_from(&attr.ext.tag_matching.max_num_tags, + attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS); + if (ret) + return ret; + break; + case IB_UVERBS_SRQT_BASIC: + break; + default: + return -EINVAL; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + attr.event_handler = ib_uverbs_srq_event_handler; + obj->uevent.uobject.user_handle = user_handle; + + srq = ib_create_srq_user(pd, 
&attr, obj, &attrs->driver_udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err; + } + + obj->uevent.uobject.object = srq; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + &attr.attr.max_wr, + sizeof(attr.attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + &attr.attr.max_sge, + sizeof(attr.attr.max_sge)); + if (ret) + return ret; + + if (attr.srq_type == IB_SRQT_XRC) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + &srq->ext.xrc.srq_num, + sizeof(srq->ext.xrc.srq_num)); + if (ret) + return ret; + } + + return 0; +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + if (attr.srq_type == IB_SRQT_XRC) + atomic_dec(&obj->uxrcd->refcnt); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_SRQ_TYPE, + enum ib_uverbs_srq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_SRQ_HANDLE); + struct ib_usrq_object *obj = + container_of(uobj, struct ib_usrq_object, uevent.uobject); + struct ib_uverbs_destroy_srq_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_SRQ_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_SRQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_srq_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_SRQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), + uverbs_free_srq), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_srq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), + {} +}; diff 
--git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_wq.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_wq.c new file mode 100644 index 0000000..7ded833 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_std_types_wq.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_wq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_wq *wq = uobject->object; + struct ib_uwq_object *uwq = + container_of(uobject, struct ib_uwq_object, uevent.uobject); + int ret; + + ret = ib_destroy_wq_user(wq, &attrs->driver_udata); + if (ret) + return ret; + + ib_uverbs_release_uevent(&uwq->uevent); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uwq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_PD_HANDLE); + struct ib_cq *cq = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_CQ_HANDLE); + struct ib_wq_init_attr wq_init_attr = {}; + struct ib_wq *wq; + u64 user_handle; + int ret; + + ret = uverbs_get_flags32(&wq_init_attr.create_flags, attrs, + UVERBS_ATTR_CREATE_WQ_FLAGS, + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING | + IB_UVERBS_WQ_FLAGS_SCATTER_FCS | + IB_UVERBS_WQ_FLAGS_DELAY_DROP | + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_sge, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_wr, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&wq_init_attr.wq_type, attrs, + UVERBS_ATTR_CREATE_WQ_TYPE); + if (ret) + return ret; + + if (wq_init_attr.wq_type != IB_WQT_RQ) + return -EINVAL; + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_WQ_EVENT_FD); + obj->uevent.uobject.user_handle = user_handle; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + wq_init_attr.wq_context = attrs->ufile; + wq_init_attr.cq = cq; + + wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); + if (IS_ERR(wq)) { + ret = PTR_ERR(wq); + goto err; + } + + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = obj; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + &wq_init_attr.max_wr, + sizeof(wq_init_attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + &wq_init_attr.max_sge, + sizeof(wq_init_attr.max_sge)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + &wq->wq_num, + sizeof(wq->wq_num)); + return ret; + +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_CREATE, + 
UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_WQ_TYPE, + enum ib_wq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_WQ_FLAGS, + enum ib_uverbs_wq_flags, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_WQ_HANDLE); + struct ib_uwq_object *obj = + container_of(uobj, struct ib_uwq_object, uevent.uobject); + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_WQ_RESP, + &obj->uevent.events_reported, + sizeof(obj->uevent.events_reported)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_WQ_RESP, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_WQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq), + &UVERBS_METHOD(UVERBS_METHOD_WQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_WQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_wq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), + {} +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_uapi.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_uapi.c new file mode 100644 index 0000000..a02916a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/uverbs_uapi.c @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. 
+ */ +#include +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int ib_uverbs_notsupp(struct uverbs_attr_bundle *attrs) +{ + return -EOPNOTSUPP; +} + +static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size) +{ + void *elm; + int rc; + + if (key == UVERBS_API_KEY_ERR) + return ERR_PTR(-EOVERFLOW); + + elm = kzalloc(alloc_size, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); + rc = radix_tree_insert(&uapi->radix, key, elm); + if (rc) { + kfree(elm); + return ERR_PTR(rc); + } + + return elm; +} + +static void *uapi_add_get_elm(struct uverbs_api *uapi, u32 key, + size_t alloc_size, bool *exists) +{ + void *elm; + + elm = uapi_add_elm(uapi, key, alloc_size); + if (!IS_ERR(elm)) { + *exists = false; + return elm; + } + + if (elm != ERR_PTR(-EEXIST)) + return elm; + + elm = radix_tree_lookup(&uapi->radix, key); + if (WARN_ON(!elm)) + return ERR_PTR(-EINVAL); + *exists = true; + return elm; +} + +static int uapi_create_write(struct uverbs_api *uapi, + struct ib_device *ibdev, + const struct uapi_definition *def, + u32 obj_key, + u32 *cur_method_key) +{ + struct uverbs_api_write_method *method_elm; + u32 method_key = obj_key; + bool exists; + + if (def->write.is_ex) + method_key |= uapi_key_write_ex_method(def->write.command_num); + else + method_key |= uapi_key_write_method(def->write.command_num); + + method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), + &exists); + if (IS_ERR(method_elm)) + return PTR_ERR(method_elm); + + if (WARN_ON(exists && (def->write.is_ex != method_elm->is_ex))) + return -EINVAL; + + method_elm->is_ex = def->write.is_ex; + method_elm->handler = def->func_write; + if (!def->write.is_ex) + method_elm->disabled = !(ibdev->uverbs_cmd_mask & + BIT_ULL(def->write.command_num)); + + if (!def->write.is_ex && def->func_write) { + method_elm->has_udata = def->write.has_udata; + method_elm->has_resp = def->write.has_resp; + method_elm->req_size = def->write.req_size; + method_elm->resp_size = def->write.resp_size; + } + + *cur_method_key = method_key; + return 0; +} + +static int uapi_merge_method(struct uverbs_api *uapi, + struct uverbs_api_object *obj_elm, u32 obj_key, + const struct uverbs_method_def *method, + bool is_driver) +{ + u32 method_key = obj_key | uapi_key_ioctl_method(method->id); + struct uverbs_api_ioctl_method *method_elm; + unsigned int i; + bool exists; + + if (!method->attrs) + return 0; + + method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), + &exists); + if (IS_ERR(method_elm)) + return PTR_ERR(method_elm); + if (exists) { + /* + * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE + */ + if (WARN_ON(method->handler)) + return -EINVAL; + } else { + WARN_ON(!method->handler); + rcu_assign_pointer(method_elm->handler, method->handler); + if (method->handler != uverbs_destroy_def_handler) + method_elm->driver_method = is_driver; + } + + for (i = 0; i != method->num_attrs; i++) { + const struct uverbs_attr_def *attr = (*method->attrs)[i]; + struct uverbs_api_attr *attr_slot; + + if (!attr) + continue; + + /* + * ENUM_IN contains the 'ids' pointer to the driver's .rodata, + * so if it is specified by a driver then it always makes this + * into a driver method. 
+ */ + if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) + method_elm->driver_method |= is_driver; + + /* + * Like other uobject based things we only support a single + * uobject being NEW'd or DESTROY'd + */ + if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + u8 access = attr->attr.u2.objs_arr.access; + + if (WARN_ON(access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY)) + return -EINVAL; + } + + attr_slot = + uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), + sizeof(*attr_slot)); + /* Attributes are not allowed to be modified by drivers */ + if (IS_ERR(attr_slot)) + return PTR_ERR(attr_slot); + + attr_slot->spec = attr->attr; + } + + return 0; +} + +static int uapi_merge_obj_tree(struct uverbs_api *uapi, + const struct uverbs_object_def *obj, + bool is_driver) +{ + struct uverbs_api_object *obj_elm; + unsigned int i; + u32 obj_key; + bool exists; + int rc; + + obj_key = uapi_key_obj(obj->id); + obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists); + if (IS_ERR(obj_elm)) + return PTR_ERR(obj_elm); + + if (obj->type_attrs) { + if (WARN_ON(obj_elm->type_attrs)) + return -EINVAL; + + obj_elm->id = obj->id; + obj_elm->type_attrs = obj->type_attrs; + obj_elm->type_class = obj->type_attrs->type_class; + /* + * Today drivers are only permitted to use idr_class and + * fd_class types. We can revoke the IDR types during + * disassociation, and the FD types require the driver to use + * struct file_operations.owner to prevent the driver module + * code from unloading while the file is open. This provides + * enough safety that uverbs_uobject_fd_release() will + * continue to work. Drivers using FD are responsible to + * handle disassociation of the device on their own. + */ + if (WARN_ON(is_driver && + obj->type_attrs->type_class != &uverbs_idr_class && + obj->type_attrs->type_class != &uverbs_fd_class)) + return -EINVAL; + } + + if (!obj->methods) + return 0; + + for (i = 0; i != obj->num_methods; i++) { + const struct uverbs_method_def *method = (*obj->methods)[i]; + + if (!method) + continue; + + rc = uapi_merge_method(uapi, obj_elm, obj_key, method, + is_driver); + if (rc) + return rc; + } + + return 0; +} + +static int uapi_disable_elm(struct uverbs_api *uapi, + const struct uapi_definition *def, + u32 obj_key, + u32 method_key) +{ + bool exists; + + if (def->scope == UAPI_SCOPE_OBJECT) { + struct uverbs_api_object *obj_elm; + + obj_elm = uapi_add_get_elm( + uapi, obj_key, sizeof(*obj_elm), &exists); + if (IS_ERR(obj_elm)) + return PTR_ERR(obj_elm); + obj_elm->disabled = 1; + return 0; + } + + if (def->scope == UAPI_SCOPE_METHOD && + uapi_key_is_ioctl_method(method_key)) { + struct uverbs_api_ioctl_method *method_elm; + + method_elm = uapi_add_get_elm(uapi, method_key, + sizeof(*method_elm), &exists); + if (IS_ERR(method_elm)) + return PTR_ERR(method_elm); + method_elm->disabled = 1; + return 0; + } + + if (def->scope == UAPI_SCOPE_METHOD && + (uapi_key_is_write_method(method_key) || + uapi_key_is_write_ex_method(method_key))) { + struct uverbs_api_write_method *write_elm; + + write_elm = uapi_add_get_elm(uapi, method_key, + sizeof(*write_elm), &exists); + if (IS_ERR(write_elm)) + return PTR_ERR(write_elm); + write_elm->disabled = 1; + return 0; + } + + WARN_ON(true); + return -EINVAL; +} + +static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev, + const struct uapi_definition *def_list, + bool is_driver) +{ + const struct uapi_definition *def = def_list; + u32 cur_obj_key = UVERBS_API_KEY_ERR; + u32 cur_method_key = 
UVERBS_API_KEY_ERR; + bool exists; + int rc; + + if (!def_list) + return 0; + + for (;; def++) { + switch ((enum uapi_definition_kind)def->kind) { + case UAPI_DEF_CHAIN: + rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver); + if (rc) + return rc; + continue; + + case UAPI_DEF_CHAIN_OBJ_TREE: + if (WARN_ON(def->object_start.object_id != + def->chain_obj_tree->id)) + return -EINVAL; + + cur_obj_key = uapi_key_obj(def->object_start.object_id); + rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree, + is_driver); + if (rc) + return rc; + continue; + + case UAPI_DEF_END: + return 0; + + case UAPI_DEF_IS_SUPPORTED_DEV_FN: { + void **ibdev_fn = + (void *)(&ibdev->ops) + def->needs_fn_offset; + + if (*ibdev_fn) + continue; + rc = uapi_disable_elm( + uapi, def, cur_obj_key, cur_method_key); + if (rc) + return rc; + continue; + } + + case UAPI_DEF_IS_SUPPORTED_FUNC: + if (def->func_is_supported(ibdev)) + continue; + rc = uapi_disable_elm( + uapi, def, cur_obj_key, cur_method_key); + if (rc) + return rc; + continue; + + case UAPI_DEF_OBJECT_START: { + struct uverbs_api_object *obj_elm; + + cur_obj_key = uapi_key_obj(def->object_start.object_id); + obj_elm = uapi_add_get_elm(uapi, cur_obj_key, + sizeof(*obj_elm), &exists); + if (IS_ERR(obj_elm)) + return PTR_ERR(obj_elm); + continue; + } + + case UAPI_DEF_WRITE: + rc = uapi_create_write( + uapi, ibdev, def, cur_obj_key, &cur_method_key); + if (rc) + return rc; + continue; + } + WARN_ON(true); + return -EINVAL; + } +} + +static int +uapi_finalize_ioctl_method(struct uverbs_api *uapi, + struct uverbs_api_ioctl_method *method_elm, + u32 method_key) +{ + struct radix_tree_iter iter; + unsigned int num_attrs = 0; + unsigned int max_bkey = 0; + bool single_uobj = false; + void __rcu **slot; + + method_elm->destroy_bkey = UVERBS_API_ATTR_BKEY_LEN; + radix_tree_for_each_slot (slot, &uapi->radix, &iter, + uapi_key_attrs_start(method_key)) { + struct uverbs_api_attr *elm = + rcu_dereference_protected(*slot, true); + u32 attr_key = iter.index & UVERBS_API_ATTR_KEY_MASK; + u32 attr_bkey = uapi_bkey_attr(attr_key); + u8 type = elm->spec.type; + + if (uapi_key_attr_to_ioctl_method(iter.index) != + uapi_key_attr_to_ioctl_method(method_key)) + break; + + if (elm->spec.mandatory) + __set_bit(attr_bkey, method_elm->attr_mandatory); + + if (elm->spec.is_udata) + method_elm->has_udata = true; + + if (type == UVERBS_ATTR_TYPE_IDR || + type == UVERBS_ATTR_TYPE_FD) { + u8 access = elm->spec.u.obj.access; + + /* + * Verbs specs may only have one NEW/DESTROY, we don't + * have the infrastructure to abort multiple NEW's or + * cope with multiple DESTROY failure. 
+ */ + if (access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY) { + if (WARN_ON(single_uobj)) + return -EINVAL; + + single_uobj = true; + if (WARN_ON(!elm->spec.mandatory)) + return -EINVAL; + } + + if (access == UVERBS_ACCESS_DESTROY) + method_elm->destroy_bkey = attr_bkey; + } + + max_bkey = max(max_bkey, attr_bkey); + num_attrs++; + } + + method_elm->key_bitmap_len = max_bkey + 1; + WARN_ON(method_elm->key_bitmap_len > UVERBS_API_ATTR_BKEY_LEN); + + uapi_compute_bundle_size(method_elm, num_attrs); + return 0; +} + +static int uapi_finalize(struct uverbs_api *uapi) +{ + const struct uverbs_api_write_method **data; + unsigned long max_write_ex = 0; + unsigned long max_write = 0; + struct radix_tree_iter iter; + void __rcu **slot; + int rc; + int i; + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + struct uverbs_api_ioctl_method *method_elm = + rcu_dereference_protected(*slot, true); + + if (uapi_key_is_ioctl_method(iter.index)) { + rc = uapi_finalize_ioctl_method(uapi, method_elm, + iter.index); + if (rc) + return rc; + } + + if (uapi_key_is_write_method(iter.index)) + max_write = max(max_write, + iter.index & UVERBS_API_ATTR_KEY_MASK); + if (uapi_key_is_write_ex_method(iter.index)) + max_write_ex = + max(max_write_ex, + iter.index & UVERBS_API_ATTR_KEY_MASK); + } + + uapi->notsupp_method.handler = ib_uverbs_notsupp; + uapi->num_write = max_write + 1; + uapi->num_write_ex = max_write_ex + 1; + data = kmalloc_array(uapi->num_write + uapi->num_write_ex, + sizeof(*uapi->write_methods), GFP_KERNEL); + if (!data) + return -ENOMEM; + + for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++) + data[i] = &uapi->notsupp_method; + uapi->write_methods = data; + uapi->write_ex_methods = data + uapi->num_write; + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + if (uapi_key_is_write_method(iter.index)) + uapi->write_methods[iter.index & + UVERBS_API_ATTR_KEY_MASK] = + rcu_dereference_protected(*slot, true); + if (uapi_key_is_write_ex_method(iter.index)) + uapi->write_ex_methods[iter.index & + UVERBS_API_ATTR_KEY_MASK] = + rcu_dereference_protected(*slot, true); + } + + return 0; +} + +static void uapi_remove_range(struct uverbs_api *uapi, u32 start, u32 last) +{ + struct radix_tree_iter iter; + void __rcu **slot; + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, start) { + if (iter.index > last) + return; + kfree(rcu_dereference_protected(*slot, true)); + radix_tree_iter_delete(&uapi->radix, &iter, slot); + } +} + +static void uapi_remove_object(struct uverbs_api *uapi, u32 obj_key) +{ + uapi_remove_range(uapi, obj_key, + obj_key | UVERBS_API_METHOD_KEY_MASK | + UVERBS_API_ATTR_KEY_MASK); +} + +static void uapi_remove_method(struct uverbs_api *uapi, u32 method_key) +{ + uapi_remove_range(uapi, method_key, + method_key | UVERBS_API_ATTR_KEY_MASK); +} + + +static u32 uapi_get_obj_id(struct uverbs_attr_spec *spec) +{ + if (spec->type == UVERBS_ATTR_TYPE_IDR || + spec->type == UVERBS_ATTR_TYPE_FD) + return spec->u.obj.obj_type; + if (spec->type == UVERBS_ATTR_TYPE_IDRS_ARRAY) + return spec->u2.objs_arr.obj_type; + return UVERBS_API_KEY_ERR; +} + +static void uapi_key_okay(u32 key) +{ + unsigned int count = 0; + + if (uapi_key_is_object(key)) + count++; + if (uapi_key_is_ioctl_method(key)) + count++; + if (uapi_key_is_write_method(key)) + count++; + if (uapi_key_is_write_ex_method(key)) + count++; + if (uapi_key_is_attr(key)) + count++; + WARN(count != 1, "Bad count %u key=%x", count, key); +} + +static void uapi_finalize_disable(struct uverbs_api *uapi) 
+{ + struct radix_tree_iter iter; + u32 starting_key = 0; + bool scan_again = false; + void __rcu **slot; + +again: + radix_tree_for_each_slot (slot, &uapi->radix, &iter, starting_key) { + uapi_key_okay(iter.index); + + if (uapi_key_is_object(iter.index)) { + struct uverbs_api_object *obj_elm = + rcu_dereference_protected(*slot, true); + + if (obj_elm->disabled) { + /* Have to check all the attrs again */ + scan_again = true; + starting_key = iter.index; + uapi_remove_object(uapi, iter.index); + goto again; + } + continue; + } + + if (uapi_key_is_ioctl_method(iter.index)) { + struct uverbs_api_ioctl_method *method_elm = + rcu_dereference_protected(*slot, true); + + if (method_elm->disabled) { + starting_key = iter.index; + uapi_remove_method(uapi, iter.index); + goto again; + } + continue; + } + + if (uapi_key_is_write_method(iter.index) || + uapi_key_is_write_ex_method(iter.index)) { + struct uverbs_api_write_method *method_elm = + rcu_dereference_protected(*slot, true); + + if (method_elm->disabled) { + kfree(method_elm); + radix_tree_iter_delete(&uapi->radix, &iter, slot); + } + continue; + } + + if (uapi_key_is_attr(iter.index)) { + struct uverbs_api_attr *attr_elm = + rcu_dereference_protected(*slot, true); + const struct uverbs_api_object *tmp_obj; + u32 obj_key; + + /* + * If the method has a mandatory object handle + * attribute which relies on an object which is not + * present then the entire method is uncallable. + */ + if (!attr_elm->spec.mandatory) + continue; + obj_key = uapi_get_obj_id(&attr_elm->spec); + if (obj_key == UVERBS_API_KEY_ERR) + continue; + tmp_obj = uapi_get_object(uapi, obj_key); + if (IS_ERR(tmp_obj)) { + if (PTR_ERR(tmp_obj) == -ENOMSG) + continue; + } else { + if (!tmp_obj->disabled) + continue; + } + + starting_key = iter.index; + uapi_remove_method( + uapi, + iter.index & (UVERBS_API_OBJ_KEY_MASK | + UVERBS_API_METHOD_KEY_MASK)); + goto again; + } + + WARN_ON(false); + } + + if (!scan_again) + return; + scan_again = false; + starting_key = 0; + goto again; +} + +void uverbs_destroy_api(struct uverbs_api *uapi) +{ + if (!uapi) + return; + + uapi_remove_range(uapi, 0, U32_MAX); + kfree(uapi->write_methods); + kfree(uapi); +} + +static const struct uapi_definition uverbs_core_api[] = { + UAPI_DEF_CHAIN(uverbs_def_obj_async_fd), + UAPI_DEF_CHAIN(uverbs_def_obj_counters), + UAPI_DEF_CHAIN(uverbs_def_obj_cq), + UAPI_DEF_CHAIN(uverbs_def_obj_device), + UAPI_DEF_CHAIN(uverbs_def_obj_dm), + UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), + UAPI_DEF_CHAIN(uverbs_def_obj_intf), + UAPI_DEF_CHAIN(uverbs_def_obj_mr), + UAPI_DEF_CHAIN(uverbs_def_obj_qp), + UAPI_DEF_CHAIN(uverbs_def_obj_srq), + UAPI_DEF_CHAIN(uverbs_def_obj_wq), + UAPI_DEF_CHAIN(uverbs_def_write_intf), + {}, +}; + +struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev) +{ + struct uverbs_api *uapi; + int rc; + + uapi = kzalloc(sizeof(*uapi), GFP_KERNEL); + if (!uapi) + return ERR_PTR(-ENOMEM); + + INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); + uapi->driver_id = ibdev->ops.driver_id; + + rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false); + if (rc) + goto err; + rc = uapi_merge_def(uapi, ibdev, ibdev->driver_def, true); + if (rc) + goto err; + + uapi_finalize_disable(uapi); + rc = uapi_finalize(uapi); + if (rc) + goto err; + + return uapi; +err: + if (rc != -ENOMEM) + dev_err(&ibdev->dev, + "Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n", + rc); + + uverbs_destroy_api(uapi); + return ERR_PTR(rc); +} + +/* + * The pre version is done before destroying the HW 
objects, it only blocks + * off method access. All methods that require the ib_dev or the module data + * must test one of these assignments prior to continuing. + */ +void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev) +{ + struct uverbs_api *uapi = uverbs_dev->uapi; + struct radix_tree_iter iter; + void __rcu **slot; + + rcu_assign_pointer(uverbs_dev->ib_dev, NULL); + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + if (uapi_key_is_ioctl_method(iter.index)) { + struct uverbs_api_ioctl_method *method_elm = + rcu_dereference_protected(*slot, true); + + if (method_elm->driver_method) + rcu_assign_pointer(method_elm->handler, NULL); + } + } + + synchronize_srcu(&uverbs_dev->disassociate_srcu); +} + +/* + * Called when a driver disassociates from the ib_uverbs_device. The + * assumption is that the driver module will unload after. Replace everything + * related to the driver with NULL as a safety measure. + */ +void uverbs_disassociate_api(struct uverbs_api *uapi) +{ + struct radix_tree_iter iter; + void __rcu **slot; + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + if (uapi_key_is_object(iter.index)) { + struct uverbs_api_object *object_elm = + rcu_dereference_protected(*slot, true); + + /* + * Some type_attrs are in the driver module. We don't + * bother to keep track of which since there should be + * no use of this after disassociate. + */ + object_elm->type_attrs = NULL; + } else if (uapi_key_is_attr(iter.index)) { + struct uverbs_api_attr *elm = + rcu_dereference_protected(*slot, true); + + if (elm->spec.type == UVERBS_ATTR_TYPE_ENUM_IN) + elm->spec.u2.enum_def.ids = NULL; + } + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs.c new file mode 100644 index 0000000..553c74a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs.c @@ -0,0 +1,3055 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "core_priv.h" +#include + +static int ib_resolve_eth_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr); + +static const char * const ib_events[] = { + [IB_EVENT_CQ_ERR] = "CQ error", + [IB_EVENT_QP_FATAL] = "QP fatal error", + [IB_EVENT_QP_REQ_ERR] = "QP request error", + [IB_EVENT_QP_ACCESS_ERR] = "QP access error", + [IB_EVENT_COMM_EST] = "communication established", + [IB_EVENT_SQ_DRAINED] = "send queue drained", + [IB_EVENT_PATH_MIG] = "path migration successful", + [IB_EVENT_PATH_MIG_ERR] = "path migration error", + [IB_EVENT_DEVICE_FATAL] = "device fatal error", + [IB_EVENT_PORT_ACTIVE] = "port active", + [IB_EVENT_PORT_ERR] = "port error", + [IB_EVENT_LID_CHANGE] = "LID change", + [IB_EVENT_PKEY_CHANGE] = "P_key change", + [IB_EVENT_SM_CHANGE] = "SM change", + [IB_EVENT_SRQ_ERR] = "SRQ error", + [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", + [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", + [IB_EVENT_CLIENT_REREGISTER] = "client reregister", + [IB_EVENT_GID_CHANGE] = "GID changed", + [IB_EXP_EVENT_XRQ_QP_ERR] = "XRQ QP error", + [IB_EVENT_XRQ_NVMF_BACKEND_CTRL_PCI_ERR] = "XRQ NVMF backend ctrl PCI error", + [IB_EVENT_XRQ_NVMF_BACKEND_CTRL_TO_ERR] = "XRQ NVMF backend ctrl timeout error", +}; + +const char *__attribute_const__ ib_event_msg(enum ib_event_type event) +{ + size_t index = event; + + return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? + ib_events[index] : "unrecognized event"; +} +EXPORT_SYMBOL(ib_event_msg); + +static const char * const wc_statuses[] = { + [IB_WC_SUCCESS] = "success", + [IB_WC_LOC_LEN_ERR] = "local length error", + [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", + [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", + [IB_WC_LOC_PROT_ERR] = "local protection error", + [IB_WC_WR_FLUSH_ERR] = "WR flushed", + [IB_WC_MW_BIND_ERR] = "memory bind operation error", + [IB_WC_BAD_RESP_ERR] = "bad response error", + [IB_WC_LOC_ACCESS_ERR] = "local access error", + [IB_WC_REM_INV_REQ_ERR] = "remote invalid request error", + [IB_WC_REM_ACCESS_ERR] = "remote access error", + [IB_WC_REM_OP_ERR] = "remote operation error", + [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", + [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", + [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", + [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", + [IB_WC_REM_ABORT_ERR] = "operation aborted", + [IB_WC_INV_EECN_ERR] = "invalid EE context number", + [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IB_WC_FATAL_ERR] = "fatal error", + [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IB_WC_GENERAL_ERR] = "general error", + [IB_WC_SIG_PIPELINE_CANCELED] = "pipelined WR canceled", +}; + +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) +{ + size_t index = status; + + return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? 
+ wc_statuses[index] : "unrecognized status"; +} +EXPORT_SYMBOL(ib_wc_status_msg); + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + case IB_RATE_14_GBPS: return 6; + case IB_RATE_56_GBPS: return 22; + case IB_RATE_112_GBPS: return 45; + case IB_RATE_168_GBPS: return 67; + case IB_RATE_25_GBPS: return 10; + case IB_RATE_100_GBPS: return 40; + case IB_RATE_200_GBPS: return 80; + case IB_RATE_300_GBPS: return 120; + case IB_RATE_28_GBPS: return 11; + case IB_RATE_50_GBPS: return 20; + case IB_RATE_400_GBPS: return 160; + case IB_RATE_600_GBPS: return 240; + case IB_RATE_800_GBPS: return 320; + case IB_RATE_1200_GBPS: return 480; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mult); + +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) +{ + switch (mult) { + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + case 6: return IB_RATE_14_GBPS; + case 22: return IB_RATE_56_GBPS; + case 45: return IB_RATE_112_GBPS; + case 67: return IB_RATE_168_GBPS; + case 10: return IB_RATE_25_GBPS; + case 40: return IB_RATE_100_GBPS; + case 80: return IB_RATE_200_GBPS; + case 120: return IB_RATE_300_GBPS; + case 11: return IB_RATE_28_GBPS; + case 20: return IB_RATE_50_GBPS; + case 160: return IB_RATE_400_GBPS; + case 240: return IB_RATE_600_GBPS; + case 320: return IB_RATE_800_GBPS; + case 480: return IB_RATE_1200_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} +EXPORT_SYMBOL(mult_to_ib_rate); + +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 2500; + case IB_RATE_5_GBPS: return 5000; + case IB_RATE_10_GBPS: return 10000; + case IB_RATE_20_GBPS: return 20000; + case IB_RATE_30_GBPS: return 30000; + case IB_RATE_40_GBPS: return 40000; + case IB_RATE_60_GBPS: return 60000; + case IB_RATE_80_GBPS: return 80000; + case IB_RATE_120_GBPS: return 120000; + case IB_RATE_14_GBPS: return 14062; + case IB_RATE_56_GBPS: return 56250; + case IB_RATE_112_GBPS: return 112500; + case IB_RATE_168_GBPS: return 168750; + case IB_RATE_25_GBPS: return 25781; + case IB_RATE_100_GBPS: return 103125; + case IB_RATE_200_GBPS: return 206250; + case IB_RATE_300_GBPS: return 309375; + case IB_RATE_28_GBPS: return 28125; + case IB_RATE_50_GBPS: return 53125; + case IB_RATE_400_GBPS: return 425000; + case IB_RATE_600_GBPS: return 637500; + case IB_RATE_800_GBPS: return 850000; + case IB_RATE_1200_GBPS: return 1275000; + default: return -1; + } +} +EXPORT_SYMBOL(ib_rate_to_mbps); + +__attribute_const__ enum rdma_transport_type +rdma_node_get_transport(unsigned int node_type) +{ + + if (node_type == RDMA_NODE_USNIC) + return RDMA_TRANSPORT_USNIC; + if (node_type == RDMA_NODE_USNIC_UDP) + return RDMA_TRANSPORT_USNIC_UDP; + if (node_type == RDMA_NODE_RNIC) + return RDMA_TRANSPORT_IWARP; + if (node_type == RDMA_NODE_UNSPECIFIED) + return RDMA_TRANSPORT_UNSPECIFIED; + + return RDMA_TRANSPORT_IB; +} +EXPORT_SYMBOL(rdma_node_get_transport); + +enum rdma_link_layer 
rdma_port_get_link_layer(struct ib_device *device, + u32 port_num) +{ + enum rdma_transport_type lt; + if (device->ops.get_link_layer) + return device->ops.get_link_layer(device, port_num); + + lt = rdma_node_get_transport(device->node_type); + if (lt == RDMA_TRANSPORT_IB) + return IB_LINK_LAYER_INFINIBAND; + + return IB_LINK_LAYER_ETHERNET; +} +EXPORT_SYMBOL(rdma_port_get_link_layer); + +/* Protection domains */ + +/** + * __ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * @flags: protection domain flags + * @caller: caller's build-time module name + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + * + * Every PD has a local_dma_lkey which can be used as the lkey value for local + * memory operations. + */ +struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, + const char *caller, bool skip_tracking) +{ + struct ib_pd *pd; + int mr_access_flags = 0; + int ret; + + pd = rdma_zalloc_drv_obj(device, ib_pd); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->device = device; + pd->flags = flags; + + rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); + rdma_restrack_set_name(&pd->res, caller); + + ret = device->ops.alloc_pd(pd, NULL); + if (ret) { + rdma_restrack_put(&pd->res); + kfree(pd); + return ERR_PTR(ret); + } + + if (skip_tracking) + rdma_restrack_dontrack(&pd->res); + else + rdma_restrack_add(&pd->res); + + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + pd->local_dma_lkey = device->local_dma_lkey; + else + mr_access_flags |= IB_ACCESS_LOCAL_WRITE; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + pr_warn("%s: enabling unsafe global rkey\n", caller); + mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; + } + + if (mr_access_flags) { + struct ib_mr *mr; + + mr = pd->device->ops.get_dma_mr(pd, mr_access_flags); + if (IS_ERR(mr)) { + ib_dealloc_pd(pd); + return ERR_CAST(mr); + } + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_DMA; + mr->uobject = NULL; + mr->need_inval = false; + + pd->__internal_mr = mr; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + pd->local_dma_lkey = pd->__internal_mr->lkey; + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) + pd->unsafe_global_rkey = pd->__internal_mr->rkey; + } + + return pd; +} +EXPORT_SYMBOL(__ib_alloc_pd); + +/** + * ib_dealloc_pd_user - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + * @udata: Valid user data or NULL for kernel object + * + * It is an error to call this function while any resources in the pd still + * exist. The caller is responsible to synchronously destroy them and + * guarantee no new allocations will happen. + */ +int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) +{ + int ret; + + if (pd->__internal_mr) { + ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL); + WARN_ON(ret); + pd->__internal_mr = NULL; + } + + ret = pd->device->ops.dealloc_pd(pd, udata); + if (ret) + return ret; + + rdma_restrack_del(&pd->res); + kfree(pd); + return ret; +} +EXPORT_SYMBOL(ib_dealloc_pd_user); + +/* Address handles */ + +/** + * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination. + * @dest: Pointer to destination ah_attr. Contents of the destination + * pointer is assumed to be invalid and attribute are overwritten. + * @src: Pointer to source ah_attr. 
+ */ +void rdma_copy_ah_attr(struct rdma_ah_attr *dest, + const struct rdma_ah_attr *src) +{ + *dest = *src; + if (dest->grh.sgid_attr) + rdma_hold_gid_attr(dest->grh.sgid_attr); +} +EXPORT_SYMBOL(rdma_copy_ah_attr); + +/** + * rdma_replace_ah_attr - Replace valid ah_attr with a new one. + * @old: Pointer to existing ah_attr which needs to be replaced. + * old is assumed to be valid or zero'd + * @new: Pointer to the new ah_attr. + * + * rdma_replace_ah_attr() first releases any reference in the old ah_attr if + * the old ah_attr is valid; after that it copies the new attribute and holds + * the reference to the replaced ah_attr. + */ +void rdma_replace_ah_attr(struct rdma_ah_attr *old, + const struct rdma_ah_attr *new) +{ + rdma_destroy_ah_attr(old); + *old = *new; + if (old->grh.sgid_attr) + rdma_hold_gid_attr(old->grh.sgid_attr); +} +EXPORT_SYMBOL(rdma_replace_ah_attr); + +/** + * rdma_move_ah_attr - Move ah_attr pointed by source to destination. + * @dest: Pointer to destination ah_attr to copy to. + * dest is assumed to be valid or zero'd + * @src: Pointer to the new ah_attr. + * + * rdma_move_ah_attr() first releases any reference in the destination ah_attr + * if it is valid. This also transfers ownership of internal references from + * src to dest, making src invalid in the process. No new reference of the src + * ah_attr is taken. + */ +void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src) +{ + rdma_destroy_ah_attr(dest); + *dest = *src; + src->grh.sgid_attr = NULL; +} +EXPORT_SYMBOL(rdma_move_ah_attr); + +/* + * Validate that the rdma_ah_attr is valid for the device before passing it + * off to the driver. + */ +static int rdma_check_ah_attr(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + if (!rdma_is_port_valid(device, ah_attr->port_num)) + return -EINVAL; + + if ((rdma_is_grh_required(device, ah_attr->port_num) || + ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) && + !(ah_attr->ah_flags & IB_AH_GRH)) + return -EINVAL; + + if (ah_attr->grh.sgid_attr) { + /* + * Make sure the passed sgid_attr is consistent with the + * parameters + */ + if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index || + ah_attr->grh.sgid_attr->port_num != ah_attr->port_num) + return -EINVAL; + } + return 0; +} + +/* + * If the ah requires a GRH then ensure that sgid_attr pointer is filled in. + * On success the caller is responsible to call rdma_unfill_sgid_attr(). + */ +static int rdma_fill_sgid_attr(struct ib_device *device, + struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr **old_sgid_attr) +{ + const struct ib_gid_attr *sgid_attr; + struct ib_global_route *grh; + int ret; + + *old_sgid_attr = ah_attr->grh.sgid_attr; + + ret = rdma_check_ah_attr(device, ah_attr); + if (ret) + return ret; + + if (!(ah_attr->ah_flags & IB_AH_GRH)) + return 0; + + grh = rdma_ah_retrieve_grh(ah_attr); + if (grh->sgid_attr) + return 0; + + sgid_attr = + rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + /* Move ownership of the kref into the ah_attr */ + grh->sgid_attr = sgid_attr; + return 0; +} + +static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *old_sgid_attr) +{ + /* + * Fill didn't change anything, the caller retains ownership of + * whatever it passed + */ + if (ah_attr->grh.sgid_attr == old_sgid_attr) + return; + + /* + * Otherwise, we need to undo what rdma_fill_sgid_attr did so the caller + * doesn't see any change in the rdma_ah_attr.
If we get here + * old_sgid_attr is NULL. + */ + rdma_destroy_ah_attr(ah_attr); +} + +static const struct ib_gid_attr * +rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *old_attr) +{ + if (old_attr) + rdma_put_gid_attr(old_attr); + if (ah_attr->ah_flags & IB_AH_GRH) { + rdma_hold_gid_attr(ah_attr->grh.sgid_attr); + return ah_attr->grh.sgid_attr; + } + return NULL; +} + +static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + u32 flags, + struct ib_udata *udata, + struct net_device *xmit_slave) +{ + struct rdma_ah_init_attr init_attr = {}; + struct ib_device *device = pd->device; + struct ib_ah *ah; + int ret; + + might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE); + + if (!udata && !device->ops.create_ah) + return ERR_PTR(-EOPNOTSUPP); + + ah = rdma_zalloc_drv_obj_gfp( + device, ib_ah, + (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->device = device; + ah->pd = pd; + ah->type = ah_attr->type; + ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); + init_attr.ah_attr = ah_attr; + init_attr.flags = flags; + init_attr.xmit_slave = xmit_slave; + + if (udata) + ret = device->ops.create_user_ah(ah, &init_attr, udata); + else + ret = device->ops.create_ah(ah, &init_attr, NULL); + if (ret) { + kfree(ah); + return ERR_PTR(ret); + } + + atomic_inc(&pd->usecnt); + return ah; +} + +/** + * rdma_create_ah - Creates an address handle for the + * given address vector. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * @flags: Create address handle flags (see enum rdma_create_ah_flags). + * + * It returns 0 on success and returns appropriate error code on error. + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, + u32 flags) +{ + const struct ib_gid_attr *old_sgid_attr; + struct net_device *slave; + struct ib_ah *ah; + int ret; + + ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); + if (ret) + return ERR_PTR(ret); + slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr, + (flags & RDMA_CREATE_AH_SLEEPABLE) ? + GFP_KERNEL : GFP_ATOMIC); + if (IS_ERR(slave)) { + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return (void *)slave; + } + ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); + rdma_lag_put_ah_roce_slave(slave); + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return ah; +} +EXPORT_SYMBOL(rdma_create_ah); + +/** + * rdma_create_user_ah - Creates an address handle for the + * given address vector. + * It resolves destination mac address for ah attribute of RoCE type. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * @udata: pointer to user's input output buffer information need by + * provider driver. + * + * It returns 0 on success and returns appropriate error code on error. + * The address handle is used to reference a local or global destination + * in all UD QP post sends. 
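+ *
+ * A minimal caller-side sketch, assuming pd, ah_attr and udata were
+ * already prepared by the caller (variable names here are illustrative
+ * only, not taken from this file):
+ *
+ *	ah = rdma_create_user_ah(pd, &ah_attr, udata);
+ *	if (IS_ERR(ah))
+ *		return PTR_ERR(ah);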
+ */ +struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + struct ib_udata *udata) +{ + const struct ib_gid_attr *old_sgid_attr; + struct ib_ah *ah; + int err; + + err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); + if (err) + return ERR_PTR(err); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + err = ib_resolve_eth_dmac(pd->device, ah_attr); + if (err) { + ah = ERR_PTR(err); + goto out; + } + } + + ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, + udata, NULL); + +out: + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return ah; +} +EXPORT_SYMBOL(rdma_create_user_ah); + +int ib_get_rdma_header_version(const union rdma_network_hdr *hdr) +{ + const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh; + struct iphdr ip4h_checked; + const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh; + + /* If it's IPv6, the version must be 6, otherwise, the first + * 20 bytes (before the IPv4 header) are garbled. + */ + if (ip6h->version != 6) + return (ip4h->version == 4) ? 4 : 0; + /* version may be 6 or 4 because the first 20 bytes could be garbled */ + + /* RoCE v2 requires no options, thus header length + * must be 5 words + */ + if (ip4h->ihl != 5) + return 6; + + /* Verify checksum. + * We can't write on scattered buffers so we need to copy to + * temp buffer. + */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.check = 0; + ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->check == ip4h_checked.check) + return 4; + return 6; +} +EXPORT_SYMBOL(ib_get_rdma_header_version); + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u32 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_protocol_ib(device, port_num)) + return RDMA_NETWORK_IB; + + grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_ROCE_V1; +} + +struct find_gid_index_context { + u16 vlan_id; + enum ib_gid_type gid_type; +}; + +static bool find_gid_index(const union ib_gid *gid, + const struct ib_gid_attr *gid_attr, + void *context) +{ + struct find_gid_index_context *ctx = context; + u16 vlan_id = 0xffff; + int ret; + + if (ctx->gid_type != gid_attr->gid_type) + return false; + + ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL); + if (ret) + return false; + + return ctx->vlan_id == vlan_id; +} + +static const struct ib_gid_attr * +get_sgid_attr_from_eth(struct ib_device *device, u32 port_num, + u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type) +{ + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; + + return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index, + &context); +} + +int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + memcpy(&src_in.sin_addr.s_addr, + &hdr->roce4grh.saddr, 4); + memcpy(&dst_in.sin_addr.s_addr, + &hdr->roce4grh.daddr, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct 
in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) { + *dgid = hdr->ibgrh.dgid; + *sgid = hdr->ibgrh.sgid; + return 0; + } else { + return -EINVAL; + } +} +EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); + +/* Resolve destination mac address and hop limit for unicast destination + * GID entry, considering the source GID entry as well. + * ah_attribute must have have valid port_num, sgid_index. + */ +static int ib_resolve_unicast_gid_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr); + const struct ib_gid_attr *sgid_attr = grh->sgid_attr; + int hop_limit = 0xff; + int ret = 0; + + /* If destination is link local and source GID is RoCEv1, + * IP stack is not used. + */ + if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && + sgid_attr->gid_type == IB_GID_TYPE_ROCE) { + rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, + ah_attr->roce.dmac); + return ret; + } + + ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, + ah_attr->roce.dmac, + sgid_attr, &hop_limit); + + grh->hop_limit = hop_limit; + return ret; +} + +/* + * This function initializes address handle attributes from the incoming packet. + * Incoming packet has dgid of the receiver node on which this code is + * getting executed and, sgid contains the GID of the sender. + * + * When resolving mac address of destination, the arrived dgid is used + * as sgid and, sgid is used as dgid because sgid contains destinations + * GID whom to respond to. + * + * On success the caller is responsible to call rdma_destroy_ah_attr on the + * attr. + */ +int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct rdma_ah_attr *ah_attr) +{ + u32 flow_class; + int ret; + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + const struct ib_gid_attr *sgid_attr; + int hoplimit = 0xff; + union ib_gid dgid; + union ib_gid sgid; + + might_sleep(); + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->type = rdma_ah_find_type(device, port_num); + if (rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type); + } + ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, + &sgid, &dgid); + if (ret) + return ret; + + rdma_ah_set_sl(ah_attr, wc->sl); + rdma_ah_set_port_num(ah_attr, port_num); + + if (rdma_protocol_roce(device, port_num)) { + u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? 
+ wc->vlan_id : 0xffff; + bool ll_dest_addr = rdma_link_local_addr((struct in6_addr *)sgid.raw); + + if (!(wc->wc_flags & IB_WC_GRH)) + return -EPROTOTYPE; + + sgid_attr = get_sgid_attr_from_eth(device, port_num, + vlan_id, &dgid, + gid_type); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + flow_class = be32_to_cpu(grh->version_tclass_flow); + if (ll_dest_addr && wc->wc_flags & IB_WC_WITH_VLAN && + wc->wc_flags & IB_WC_WITH_SMAC) { + memcpy(ah_attr->roce.dmac, wc->smac, ETH_ALEN); + hoplimit = 1; + rdma_move_grh_sgid_attr(ah_attr, + &sgid, + flow_class & 0xFFFFF, + hoplimit, + (flow_class >> 20) & 0xFF, + sgid_attr); + return 0; + } else { + + rdma_move_grh_sgid_attr(ah_attr, + &sgid, + flow_class & 0xFFFFF, + hoplimit, + (flow_class >> 20) & 0xFF, + sgid_attr); + return ib_resolve_unicast_gid_dmac(device, ah_attr); + } + } else { + rdma_ah_set_dlid(ah_attr, wc->slid); + rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); + + if ((wc->wc_flags & IB_WC_GRH) == 0) + return 0; + + if (dgid.global.interface_id != + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { + sgid_attr = rdma_find_gid_by_port( + device, &dgid, IB_GID_TYPE_IB, port_num, NULL); + } else + sgid_attr = rdma_get_gid_attr(device, port_num, 0); + + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_move_grh_sgid_attr(ah_attr, + &sgid, + flow_class & 0xFFFFF, + hoplimit, + (flow_class >> 20) & 0xFF, + sgid_attr); + + return 0; + } +} +EXPORT_SYMBOL(ib_init_ah_attr_from_wc); + +/** + * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership + * of the reference + * + * @attr: Pointer to AH attribute structure + * @dgid: Destination GID + * @flow_label: Flow label + * @hop_limit: Hop limit + * @traffic_class: traffic class + * @sgid_attr: Pointer to SGID attribute + * + * This takes ownership of the sgid_attr reference. The caller must ensure + * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after + * calling this function. + */ +void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, + u32 flow_label, u8 hop_limit, u8 traffic_class, + const struct ib_gid_attr *sgid_attr) +{ + rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit, + traffic_class); + attr->grh.sgid_attr = sgid_attr; +} +EXPORT_SYMBOL(rdma_move_grh_sgid_attr); + +/** + * rdma_destroy_ah_attr - Release reference to SGID attribute of + * ah attribute. + * @ah_attr: Pointer to ah attribute + * + * Release reference to the SGID attribute of the ah attribute if it is + * non NULL. It is safe to call this multiple times, and safe to call it on + * a zero initialized ah_attr. 
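+ *
+ * A typical pairing, sketched under the assumption that attr was filled
+ * in by ib_init_ah_attr_from_wc() above (illustrative only):
+ *
+ *	ret = ib_init_ah_attr_from_wc(device, port_num, wc, grh, &attr);
+ *	if (ret)
+ *		return ret;
+ *	...use attr, e.g. to create an AH...
+ *	rdma_destroy_ah_attr(&attr);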
+ */ +void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr) +{ + if (ah_attr->grh.sgid_attr) { + rdma_put_gid_attr(ah_attr->grh.sgid_attr); + ah_attr->grh.sgid_attr = NULL; + } +} +EXPORT_SYMBOL(rdma_destroy_ah_attr); + +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u32 port_num) +{ + struct rdma_ah_attr ah_attr; + struct ib_ah *ah; + int ret; + + ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); + if (ret) + return ERR_PTR(ret); + + ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE); + + rdma_destroy_ah_attr(&ah_attr); + return ah; +} +EXPORT_SYMBOL(ib_create_ah_from_wc); + +int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) +{ + const struct ib_gid_attr *old_sgid_attr; + int ret; + + if (ah->type != ah_attr->type) + return -EINVAL; + + ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr); + if (ret) + return ret; + + ret = ah->device->ops.modify_ah ? + ah->device->ops.modify_ah(ah, ah_attr) : + -EOPNOTSUPP; + + ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr); + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return ret; +} +EXPORT_SYMBOL(rdma_modify_ah); + +int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) +{ + ah_attr->grh.sgid_attr = NULL; + + return ah->device->ops.query_ah ? + ah->device->ops.query_ah(ah, ah_attr) : + -EOPNOTSUPP; +} +EXPORT_SYMBOL(rdma_query_ah); + +int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata) +{ + const struct ib_gid_attr *sgid_attr = ah->sgid_attr; + struct ib_pd *pd; + int ret; + + might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE); + + pd = ah->pd; + + ret = ah->device->ops.destroy_ah(ah, flags); + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + if (sgid_attr) + rdma_put_gid_attr(sgid_attr); + + kfree(ah); + return ret; +} +EXPORT_SYMBOL(rdma_destroy_ah_user); + +/* Shared receive queues */ + +/** + * ib_create_srq_user - Creates a SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the + * SRQ. If SRQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created SRQ. + * @uobject: uobject pointer if this is not a kernel SRQ + * @udata: udata pointer if this is not a kernel SRQ + * + * srq_attr->max_wr and srq_attr->max_sge are read the determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ib_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. 
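+ *
+ * A minimal kernel-side sketch (illustrative values; a kernel caller
+ * passes NULL for @uobject and @udata):
+ *
+ *	struct ib_srq_init_attr init = {
+ *		.srq_type = IB_SRQT_BASIC,
+ *		.attr = { .max_wr = 128, .max_sge = 1 },
+ *	};
+ *
+ *	srq = ib_create_srq_user(pd, &init, NULL, NULL);
+ *	if (IS_ERR(srq))
+ *		return PTR_ERR(srq);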
+ */ +struct ib_srq *ib_create_srq_user(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_usrq_object *uobject, + struct ib_udata *udata) +{ + struct ib_srq *srq; + int ret; + + srq = rdma_zalloc_drv_obj(pd->device, ib_srq); + if (!srq) + return ERR_PTR(-ENOMEM); + + srq->device = pd->device; + srq->pd = pd; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; + srq->uobject = uobject; + + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + if (srq->ext.xrc.xrcd) + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + } + atomic_inc(&pd->usecnt); + + rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ); + rdma_restrack_parent_name(&srq->res, &pd->res); + + ret = pd->device->ops.create_srq(srq, srq_init_attr, udata); + if (ret) { + rdma_restrack_put(&srq->res); + atomic_dec(&srq->pd->usecnt); + if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) + atomic_dec(&srq->ext.xrc.xrcd->usecnt); + if (ib_srq_has_cq(srq->srq_type)) + atomic_dec(&srq->ext.cq->usecnt); + kfree(srq); + return ERR_PTR(ret); + } + + rdma_restrack_add(&srq->res); + + return srq; +} +EXPORT_SYMBOL(ib_create_srq_user); + +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask) +{ + return srq->device->ops.modify_srq ? + srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask, + NULL) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_modify_srq); + +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr) +{ + return srq->device->ops.query_srq ? + srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_query_srq); + +int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) +{ + int ret; + + if (atomic_read(&srq->usecnt)) + return -EBUSY; + + ret = srq->device->ops.destroy_srq(srq, udata); + if (ret) + return ret; + + atomic_dec(&srq->pd->usecnt); + if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) + atomic_dec(&srq->ext.xrc.xrcd->usecnt); + if (ib_srq_has_cq(srq->srq_type)) + atomic_dec(&srq->ext.cq->usecnt); + rdma_restrack_del(&srq->res); + kfree(srq); + + return ret; +} +EXPORT_SYMBOL(ib_destroy_srq_user); + +/* Queue pairs */ + +static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) +{ + struct ib_qp *qp = context; + unsigned long flags; + + spin_lock_irqsave(&qp->device->qp_open_list_lock, flags); + list_for_each_entry(event->element.qp, &qp->open_list, open_list) + if (event->element.qp->event_handler) + event->element.qp->event_handler(event, event->element.qp->qp_context); + spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags); +} + +static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, + void (*event_handler)(struct ib_event *, void *), + void *qp_context) +{ + struct ib_qp *qp; + unsigned long flags; + int err; + + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->real_qp = real_qp; + err = ib_open_shared_qp_security(qp, real_qp->device); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->real_qp = real_qp; + atomic_inc(&real_qp->usecnt); + qp->device = real_qp->device; + qp->event_handler = event_handler; + qp->qp_context = qp_context; + qp->qp_num = real_qp->qp_num; + qp->qp_type = real_qp->qp_type; + + spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); + list_add(&qp->open_list, 
&real_qp->open_list); + spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); + + return qp; +} + +struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, + struct ib_qp_open_attr *qp_open_attr) +{ + struct ib_qp *qp, *real_qp; + + if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) + return ERR_PTR(-EINVAL); + + down_read(&xrcd->tgt_qps_rwsem); + real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num); + if (!real_qp) { + up_read(&xrcd->tgt_qps_rwsem); + return ERR_PTR(-EINVAL); + } + qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, + qp_open_attr->qp_context); + up_read(&xrcd->tgt_qps_rwsem); + return qp; +} +EXPORT_SYMBOL(ib_open_qp); + +static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *real_qp = qp; + int err; + + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (IS_ERR(qp)) + return qp; + + err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num, + real_qp, GFP_KERNEL)); + if (err) { + ib_close_qp(qp); + return ERR_PTR(err); + } + return qp; +} + +static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller) +{ + struct ib_udata dummy = {}; + struct ib_qp *qp; + int ret; + + if (!dev->ops.create_qp) + return ERR_PTR(-EOPNOTSUPP); + + qp = rdma_zalloc_drv_obj_numa(dev, ib_qp); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->device = dev; + qp->pd = pd; + qp->uobject = uobj; + qp->real_qp = qp; + + qp->qp_type = attr->qp_type; + qp->rwq_ind_tbl = attr->rwq_ind_tbl; + qp->srq = attr->srq; + qp->event_handler = attr->event_handler; + qp->port = attr->port_num; + qp->qp_context = attr->qp_context; + + spin_lock_init(&qp->mr_lock); + INIT_LIST_HEAD(&qp->rdma_mrs); + INIT_LIST_HEAD(&qp->sig_mrs); + + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + + rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); + WARN_ONCE(!udata && !caller, "Missing kernel QP owner"); + rdma_restrack_set_name(&qp->res, udata ? NULL : caller); + ret = dev->ops.create_qp(qp, attr, udata); + if (ret) + goto err_create; + + /* + * TODO: The mlx4 internally overwrites send_cq and recv_cq. + * Unfortunately, it is not an easy task to fix that driver. + */ + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + + ret = ib_create_qp_security(qp, dev); + if (ret) + goto err_security; + + rdma_restrack_add(&qp->res); + return qp; + +err_security: + qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL); +err_create: + rdma_restrack_put(&qp->res); + kfree(qp); + return ERR_PTR(ret); + +} + +/** + * ib_create_qp_user - Creates a QP associated with the specified protection + * domain. + * @dev: IB device + * @pd: The protection domain associated with the QP. + * @attr: A list of initial attributes required to create the + * QP. If QP creation succeeds, then the attributes are updated to + * the actual capabilities of the created QP. 
+ * @udata: User data + * @uobj: uverbs obect + * @caller: caller's build-time module name + */ +struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata, + struct ib_uqp_object *uobj, const char *caller) +{ + struct ib_qp *qp, *xrc_qp; + + if (attr->qp_type == IB_QPT_XRC_TGT) + qp = create_qp(dev, pd, attr, NULL, NULL, caller); + else + qp = create_qp(dev, pd, attr, udata, uobj, NULL); + if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp)) + return qp; + + xrc_qp = create_xrc_qp_user(qp, attr); + if (IS_ERR(xrc_qp)) { + ib_destroy_qp(qp); + return xrc_qp; + } + + xrc_qp->uobject = uobj; + return xrc_qp; +} +EXPORT_SYMBOL(ib_create_qp_user); + +void ib_qp_usecnt_inc(struct ib_qp *qp) +{ + if (qp->pd) + atomic_inc(&qp->pd->usecnt); + if (qp->send_cq) + atomic_inc(&qp->send_cq->usecnt); + if (qp->recv_cq) + atomic_inc(&qp->recv_cq->usecnt); + if (qp->srq) + atomic_inc(&qp->srq->usecnt); + if (qp->rwq_ind_tbl) + atomic_inc(&qp->rwq_ind_tbl->usecnt); +} +EXPORT_SYMBOL(ib_qp_usecnt_inc); + +void ib_qp_usecnt_dec(struct ib_qp *qp) +{ + if (qp->rwq_ind_tbl) + atomic_dec(&qp->rwq_ind_tbl->usecnt); + if (qp->srq) + atomic_dec(&qp->srq->usecnt); + if (qp->recv_cq) + atomic_dec(&qp->recv_cq->usecnt); + if (qp->send_cq) + atomic_dec(&qp->send_cq->usecnt); + if (qp->pd) + atomic_dec(&qp->pd->usecnt); +} +EXPORT_SYMBOL(ib_qp_usecnt_dec); + +struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + const char *caller) +{ + struct ib_device *device = pd->device; + struct ib_qp *qp; + int ret; + + /* + * If the callers is using the RDMA API calculate the resources + * needed for the RDMA READ/WRITE operations. + * + * Note that these callers need to pass in a port number. + */ + if (qp_init_attr->cap.max_rdma_ctxs) + rdma_rw_init_qp(device, qp_init_attr); + + qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller); + if (IS_ERR(qp)) + return qp; + + ib_qp_usecnt_inc(qp); + + if (qp_init_attr->cap.max_rdma_ctxs) { + ret = rdma_rw_init_mrs(qp, qp_init_attr); + if (ret) + goto err; + } + + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd. 
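+ * Hence max_read_sge below is clamped to min(max_send_sge, max_sge_rd)
+ * rather than copied from cap.max_send_sge directly.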
+ */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) + qp->integrity_en = true; + + return qp; + +err: + ib_destroy_qp(qp); + return ERR_PTR(ret); + +} +EXPORT_SYMBOL(ib_create_qp_kernel); + +static const struct { + int valid; + enum ib_qp_attr_mask req_param[IB_QPT_MAX]; + enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_PORT, + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + [IB_QPS_RTR] = { + .valid = 1, + .req_param = { + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_RC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC_INI] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_XRC_TGT] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_RC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + }, + }, + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = IB_QP_SQ_PSN, + [IB_QPT_UC] = IB_QP_SQ_PSN, + [IB_QPT_RC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | + IB_QP_SQ_PSN), + [IB_QPT_SMI] = IB_QP_SQ_PSN, + [IB_QPT_GSI] = IB_QP_SQ_PSN, + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = 
(IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE | + IB_QP_OFFLOAD_TYPE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, + } + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER | + IB_QP_OFFLOAD_TYPE | + IB_QP_RMPN_XRQN), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ + [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY + } + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_INI] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC_TGT] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + } + 
}, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 } + } +}; + +bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return false; + + if (!qp_state_table[cur_state][next_state].valid) + return false; + + req_param = qp_state_table[cur_state][next_state].req_param[type]; + opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + + if ((mask & req_param) != req_param) + return false; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return false; + + return true; +} +EXPORT_SYMBOL(ib_modify_qp_is_ok); + +/** + * ib_resolve_eth_dmac - Resolve destination mac address + * @device: Device to consider + * @ah_attr: address handle attribute which describes the + * source and destination parameters + * ib_resolve_eth_dmac() resolves destination mac address and L3 hop limit It + * returns 0 on success or appropriate error code. It initializes the + * necessary ah_attr fields when call is successful. + */ +static int ib_resolve_eth_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + int ret = 0; + + if (!rdma_check_gid_user_access(ah_attr->grh.sgid_attr)) + return -ENODEV; + + if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { + if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { + __be32 addr = 0; + + memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); + ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac); + } else { + ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, + (char *)ah_attr->roce.dmac); + } + } else { + ret = ib_resolve_unicast_gid_dmac(device, ah_attr); + } + return ret; +} + +static bool is_qp_type_connected(const struct ib_qp *qp) +{ + return (qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_RC || + qp->qp_type == IB_QPT_XRC_INI || + qp->qp_type == IB_QPT_XRC_TGT); +} + +/* + * IB core internal function to perform QP attributes modification. + */ +static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + const struct ib_gid_attr *old_sgid_attr_av; + const struct ib_gid_attr *old_sgid_attr_alt_av; + int ret; + + attr->xmit_slave = NULL; + if (attr_mask & IB_QP_AV) { + ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr, + &old_sgid_attr_av); + if (ret) + return ret; + + if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { + struct net_device *slave; + + /* + * If the user provided the qp_attr then we have to + * resolve it. Kerne users have to provide already + * resolved rdma_ah_attr's. 
+ */ + if (udata) { + ret = ib_resolve_eth_dmac(qp->device, + &attr->ah_attr); + if (ret) + goto out_av; + } + slave = rdma_lag_get_ah_roce_slave(qp->device, + &attr->ah_attr, + GFP_KERNEL); + if (IS_ERR(slave)) { + ret = PTR_ERR(slave); + goto out_av; + } + attr->xmit_slave = slave; + } + } + if (attr_mask & IB_QP_ALT_PATH) { + /* + * FIXME: This does not track the migration state, so if the + * user loads a new alternate path after the HW has migrated + * from primary->alternate we will keep the wrong + * references. This is OK for IB because the reference + * counting does not serve any functional purpose. + */ + ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr, + &old_sgid_attr_alt_av); + if (ret) + goto out_av; + + /* + * Today the core code can only handle alternate paths and APM + * for IB. Ban them in roce mode. + */ + if (!(rdma_protocol_ib(qp->device, + attr->alt_ah_attr.port_num) && + rdma_protocol_ib(qp->device, port))) { + ret = -EINVAL; + goto out; + } + } + + if (rdma_ib_or_roce(qp->device, port)) { + if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { + dev_warn(&qp->device->dev, + "%s rq_psn overflow, masking to 24 bits\n", + __func__); + attr->rq_psn &= 0xffffff; + } + + if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { + dev_warn(&qp->device->dev, + " %s sq_psn overflow, masking to 24 bits\n", + __func__); + attr->sq_psn &= 0xffffff; + } + } + + /* + * Bind this qp to a counter automatically based on the rdma counter + * rules. This only set in RST2INIT with port specified + */ + if (!qp->counter && (attr_mask & IB_QP_PORT) && + ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT)) + rdma_counter_bind_qp_auto(qp, attr->port_num); + + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); + if (ret) + goto out; + + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_AV) + qp->av_sgid_attr = + rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr); + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_path_sgid_attr = rdma_update_sgid_attr( + &attr->alt_ah_attr, qp->alt_path_sgid_attr); + +out: + if (attr_mask & IB_QP_ALT_PATH) + rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av); +out_av: + if (attr_mask & IB_QP_AV) { + rdma_lag_put_ah_roce_slave(attr->xmit_slave); + rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av); + } + return ret; +} + +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @ib_qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error. 
+ */ +int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata); +} +EXPORT_SYMBOL(ib_modify_qp_with_udata); + +int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width) +{ + int rc; + u32 netdev_speed; + struct net_device *netdev; + struct ethtool_link_ksettings lksettings; + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + netdev = ib_device_get_netdev(dev, port_num); + if (!netdev) + return -ENODEV; + + rtnl_lock(); + rc = __ethtool_get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + + dev_put(netdev); + + if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) { + netdev_speed = lksettings.base.speed; + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %u\n", netdev->name, + netdev_speed); + } + + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } + + return 0; +} +EXPORT_SYMBOL(ib_get_eth_speed); + +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); +} +EXPORT_SYMBOL(ib_modify_qp); + +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + qp_attr->ah_attr.grh.sgid_attr = NULL; + qp_attr->alt_ah_attr.grh.sgid_attr = NULL; + + return qp->device->ops.query_qp ? 
+ qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask, + qp_init_attr) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_query_qp); + +int ib_close_qp(struct ib_qp *qp) +{ + struct ib_qp *real_qp; + unsigned long flags; + + real_qp = qp->real_qp; + if (real_qp == qp) + return -EINVAL; + + spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); + list_del(&qp->open_list); + spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); + + atomic_dec(&real_qp->usecnt); + if (qp->qp_sec) + ib_close_shared_qp_security(qp->qp_sec); + kfree(qp); + + return 0; +} +EXPORT_SYMBOL(ib_close_qp); + +static int __ib_destroy_shared_qp(struct ib_qp *qp) +{ + struct ib_xrcd *xrcd; + struct ib_qp *real_qp; + int ret; + + real_qp = qp->real_qp; + xrcd = real_qp->xrcd; + down_write(&xrcd->tgt_qps_rwsem); + ib_close_qp(qp); + if (atomic_read(&real_qp->usecnt) == 0) + xa_erase(&xrcd->tgt_qps, real_qp->qp_num); + else + real_qp = NULL; + up_write(&xrcd->tgt_qps_rwsem); + + if (real_qp) { + ret = ib_destroy_qp(real_qp); + if (!ret) + atomic_dec(&xrcd->usecnt); + } + + return 0; +} + +int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) +{ + const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr; + const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; + struct ib_qp_security *sec; + int ret; + + WARN_ON_ONCE(qp->mrs_used > 0); + + if (atomic_read(&qp->usecnt)) + return -EBUSY; + + if (qp->real_qp != qp) + return __ib_destroy_shared_qp(qp); + + sec = qp->qp_sec; + if (sec) + ib_destroy_qp_security_begin(sec); + + if (!qp->uobject) + rdma_rw_cleanup_mrs(qp); + + rdma_counter_unbind_qp(qp, true); + ret = qp->device->ops.destroy_qp(qp, udata); + if (ret) { + if (sec) + ib_destroy_qp_security_abort(sec); + return ret; + } + + if (alt_path_sgid_attr) + rdma_put_gid_attr(alt_path_sgid_attr); + if (av_sgid_attr) + rdma_put_gid_attr(av_sgid_attr); + + ib_qp_usecnt_dec(qp); + if (sec) + ib_destroy_qp_security_end(sec); + + rdma_restrack_del(&qp->res); + kfree(qp); + return ret; +} +EXPORT_SYMBOL(ib_destroy_qp_user); + +/* Completion queues */ + +struct ib_cq *__ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, + const struct ib_cq_init_attr *cq_attr, + const char *caller) +{ + struct ib_cq *cq; + int ret; + + cq = rdma_zalloc_drv_obj(device, ib_cq); + if (!cq) + return ERR_PTR(-ENOMEM); + + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + + rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); + rdma_restrack_set_name(&cq->res, caller); + + ret = device->ops.create_cq(cq, cq_attr, NULL); + if (ret) { + rdma_restrack_put(&cq->res); + kfree(cq); + return ERR_PTR(ret); + } + + rdma_restrack_add(&cq->res); + return cq; +} +EXPORT_SYMBOL(__ib_create_cq); + +int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + if (cq->shared) + return -EOPNOTSUPP; + + return cq->device->ops.modify_cq ? 
+ cq->device->ops.modify_cq(cq, cq_count, + cq_period) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(rdma_set_cq_moderation); + +int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) +{ + int ret; + + if (WARN_ON_ONCE(cq->shared)) + return -EOPNOTSUPP; + + if (atomic_read(&cq->usecnt)) + return -EBUSY; + + ret = cq->device->ops.destroy_cq(cq, udata); + if (ret) + return ret; + + rdma_restrack_del(&cq->res); + kfree(cq); + return ret; +} +EXPORT_SYMBOL(ib_destroy_cq_user); + +int ib_resize_cq(struct ib_cq *cq, int cqe) +{ + if (cq->shared) + return -EOPNOTSUPP; + + return cq->device->ops.resize_cq ? + cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; +} +EXPORT_SYMBOL(ib_resize_cq); + +/* Memory regions */ + +struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags) +{ + struct ib_mr *mr; + + if (access_flags & IB_ACCESS_ON_DEMAND) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + return ERR_PTR(-EINVAL); + } + } + + mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, + access_flags, NULL); + + if (IS_ERR(mr)) + return mr; + + mr->device = pd->device; + mr->type = IB_MR_TYPE_USER; + mr->pd = pd; + mr->dm = NULL; + atomic_inc(&pd->usecnt); + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); + + return mr; +} +EXPORT_SYMBOL(ib_reg_user_mr); + +int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge) +{ + if (!pd->device->ops.advise_mr) + return -EOPNOTSUPP; + + if (!num_sge) + return 0; + + return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge, + NULL); +} +EXPORT_SYMBOL(ib_advise_mr); + +int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) +{ + struct ib_pd *pd = mr->pd; + struct ib_dm *dm = mr->dm; + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; + int ret; + + trace_mr_dereg(mr); + rdma_restrack_del(&mr->res); + ret = mr->device->ops.dereg_mr(mr, udata); + if (!ret) { + atomic_dec(&pd->usecnt); + if (dm) + atomic_dec(&dm->usecnt); + kfree(sig_attrs); + } + + return ret; +} +EXPORT_SYMBOL(ib_dereg_mr_user); + +/** + * ib_alloc_mr() - Allocates a memory region + * @pd: protection domain associated with the region + * @mr_type: memory region type + * @max_num_sg: maximum sg entries available for registration. + * + * Notes: + * Memory registeration page/sg lists must not exceed max_num_sg. + * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed + * max_num_sg * used_page_size. 
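+ * For example, with a 4 KiB page size and max_num_sg = 256, a single
+ * IB_MR_TYPE_MEM_REG registration can cover at most 256 * 4 KiB = 1 MiB.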
+ * + */ +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct ib_mr *mr; + + if (!pd->device->ops.alloc_mr) { + mr = ERR_PTR(-EOPNOTSUPP); + goto out; + } + + if (mr_type == IB_MR_TYPE_INTEGRITY) { + WARN_ON_ONCE(1); + mr = ERR_PTR(-EINVAL); + goto out; + } + + mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); + if (IS_ERR(mr)) + goto out; + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->type = mr_type; + mr->sig_attrs = NULL; + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); +out: + trace_mr_alloc(pd, mr_type, max_num_sg, mr); + return mr; +} +EXPORT_SYMBOL(ib_alloc_mr); + +/** + * ib_alloc_mr_integrity() - Allocates an integrity memory region + * @pd: protection domain associated with the region + * @max_num_data_sg: maximum data sg entries available for registration + * @max_num_meta_sg: maximum metadata sg entries available for + * registration + * + * Notes: + * Memory registration page/sg lists must not exceed max_num_sg, + * also the integrity page/sg lists must not exceed max_num_meta_sg. + * + */ +struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_data_sg, + u32 max_num_meta_sg) +{ + struct ib_mr *mr; + struct ib_sig_attrs *sig_attrs; + + if (!pd->device->ops.alloc_mr_integrity || + !pd->device->ops.map_mr_sg_pi) { + mr = ERR_PTR(-EOPNOTSUPP); + goto out; + } + + if (!max_num_meta_sg) { + mr = ERR_PTR(-EINVAL); + goto out; + } + + sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL); + if (!sig_attrs) { + mr = ERR_PTR(-ENOMEM); + goto out; + } + + mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg, + max_num_meta_sg); + if (IS_ERR(mr)) { + kfree(sig_attrs); + goto out; + } + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->type = IB_MR_TYPE_INTEGRITY; + mr->sig_attrs = sig_attrs; + + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_parent_name(&mr->res, &pd->res); + rdma_restrack_add(&mr->res); +out: + trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr); + return mr; +} +EXPORT_SYMBOL(ib_alloc_mr_integrity); + +/* Multicast groups */ + +static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) +{ + struct ib_qp_init_attr init_attr = {}; + struct ib_qp_attr attr = {}; + int num_eth_ports = 0; + unsigned int port; + + /* If QP state >= init, it is assigned to a port and we can check this + * port only. + */ + if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { + if (attr.qp_state >= IB_QPS_INIT) { + if (rdma_port_get_link_layer(qp->device, attr.port_num) != + IB_LINK_LAYER_INFINIBAND) + return true; + goto lid_check; + } + } + + /* Can't get a quick answer, iterate over all ports */ + rdma_for_each_port(qp->device, port) + if (rdma_port_get_link_layer(qp->device, port) != + IB_LINK_LAYER_INFINIBAND) + num_eth_ports++; + + /* If we have at lease one Ethernet port, RoCE annex declares that + * multicast LID should be ignored. We can't tell at this step if the + * QP belongs to an IB or Ethernet port. + */ + if (num_eth_ports) + return true; + + /* If all the ports are IB, we can check according to IB spec. 
*/ +lid_check: + return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + lid == be16_to_cpu(IB_LID_PERMISSIVE)); +} + +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->ops.attach_mcast) + return -EOPNOTSUPP; + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) + return -EINVAL; + + ret = qp->device->ops.attach_mcast(qp, gid, lid); + if (!ret) + atomic_inc(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_attach_mcast); + +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) +{ + int ret; + + if (!qp->device->ops.detach_mcast) + return -EOPNOTSUPP; + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) + return -EINVAL; + + ret = qp->device->ops.detach_mcast(qp, gid, lid); + if (!ret) + atomic_dec(&qp->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_detach_mcast); + +/** + * ib_alloc_xrcd_user - Allocates an XRC domain. + * @device: The device on which to allocate the XRC domain. + * @inode: inode to connect XRCD + * @udata: Valid user data or NULL for kernel object + */ +struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, + struct inode *inode, struct ib_udata *udata) +{ + struct ib_xrcd *xrcd; + int ret; + + if (!device->ops.alloc_xrcd) + return ERR_PTR(-EOPNOTSUPP); + + xrcd = rdma_zalloc_drv_obj(device, ib_xrcd); + if (!xrcd) + return ERR_PTR(-ENOMEM); + + xrcd->device = device; + xrcd->inode = inode; + atomic_set(&xrcd->usecnt, 0); + init_rwsem(&xrcd->tgt_qps_rwsem); + xa_init(&xrcd->tgt_qps); + + ret = device->ops.alloc_xrcd(xrcd, udata); + if (ret) + goto err; + return xrcd; +err: + kfree(xrcd); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_alloc_xrcd_user); + +/** + * ib_dealloc_xrcd_user - Deallocates an XRC domain. + * @xrcd: The XRC domain to deallocate. + * @udata: Valid user data or NULL for kernel object + */ +int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata) +{ + int ret; + + if (atomic_read(&xrcd->usecnt)) + return -EBUSY; + + WARN_ON(!xa_empty(&xrcd->tgt_qps)); + ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata); + if (ret) + return ret; + kfree(xrcd); + return ret; +} +EXPORT_SYMBOL(ib_dealloc_xrcd_user); + +/** + * ib_create_wq - Creates a WQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the WQ. + * @wq_attr: A list of initial attributes required to create the + * WQ. If WQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created WQ. + * + * wq_attr->max_wr and wq_attr->max_sge determine + * the requested size of the WQ, and set to the actual values allocated + * on return. + * If ib_create_wq() succeeds, then max_wr and max_sge will always be + * at least as large as the requested values. + */ +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *wq_attr) +{ + struct ib_wq *wq; + + if (!pd->device->ops.create_wq) + return ERR_PTR(-EOPNOTSUPP); + + wq = pd->device->ops.create_wq(pd, wq_attr, NULL); + if (!IS_ERR(wq)) { + wq->event_handler = wq_attr->event_handler; + wq->wq_context = wq_attr->wq_context; + wq->wq_type = wq_attr->wq_type; + wq->cq = wq_attr->cq; + wq->device = pd->device; + wq->pd = pd; + wq->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&wq_attr->cq->usecnt); + atomic_set(&wq->usecnt, 0); + } + return wq; +} +EXPORT_SYMBOL(ib_create_wq); + +/** + * ib_destroy_wq_user - Destroys the specified user WQ. 
+ * @wq: The WQ to destroy. + * @udata: Valid user data + */ +int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata) +{ + struct ib_cq *cq = wq->cq; + struct ib_pd *pd = wq->pd; + int ret; + + if (atomic_read(&wq->usecnt)) + return -EBUSY; + + ret = wq->device->ops.destroy_wq(wq, udata); + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + return ret; +} +EXPORT_SYMBOL(ib_destroy_wq_user); + +int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + if (!mr->device->ops.check_mr_status) + return -EOPNOTSUPP; + + return mr->device->ops.check_mr_status(mr, check_mask, mr_status); +} +EXPORT_SYMBOL(ib_check_mr_status); + +int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, + int state) +{ + if (!device->ops.set_vf_link_state) + return -EOPNOTSUPP; + + return device->ops.set_vf_link_state(device, vf, port, state); +} +EXPORT_SYMBOL(ib_set_vf_link_state); + +int ib_get_vf_config(struct ib_device *device, int vf, u32 port, + struct ifla_vf_info *info) +{ + if (!device->ops.get_vf_config) + return -EOPNOTSUPP; + + return device->ops.get_vf_config(device, vf, port, info); +} +EXPORT_SYMBOL(ib_get_vf_config); + +int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, + struct ifla_vf_stats *stats) +{ + if (!device->ops.get_vf_stats) + return -EOPNOTSUPP; + + return device->ops.get_vf_stats(device, vf, port, stats); +} +EXPORT_SYMBOL(ib_get_vf_stats); + +int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, + int type) +{ + if (!device->ops.set_vf_guid) + return -EOPNOTSUPP; + + return device->ops.set_vf_guid(device, vf, port, guid, type); +} +EXPORT_SYMBOL(ib_set_vf_guid); + +int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, + struct ifla_vf_guid *node_guid, + struct ifla_vf_guid *port_guid) +{ + if (!device->ops.get_vf_guid) + return -EOPNOTSUPP; + + return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid); +} +EXPORT_SYMBOL(ib_get_vf_guid); +/** + * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection + * information) and set an appropriate memory region for registration. + * @mr: memory region + * @data_sg: dma mapped scatterlist for data + * @data_sg_nents: number of entries in data_sg + * @data_sg_offset: offset in bytes into data_sg + * @meta_sg: dma mapped scatterlist for metadata + * @meta_sg_nents: number of entries in meta_sg + * @meta_sg_offset: offset in bytes into meta_sg + * @page_size: page vector desired page size + * + * Constraints: + * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY. + * + * Return: 0 on success. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->ops.map_mr_sg_pi || + WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY))) + return -EOPNOTSUPP; + + mr->page_size = page_size; + + return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, + meta_sg_nents, meta_sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg_pi); + +/** + * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list + * and set it the memory region. 
+ * @mr: memory region + * @sg: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg + * @page_size: page vector desired page size + * + * Constraints: + * + * - The first sg element is allowed to have an offset. + * - Each sg element must either be aligned to page_size or virtually + * contiguous to the previous element. In case an sg element has a + * non-contiguous offset, the mapping prefix will not include it. + * - The last sg element is allowed to have length less than page_size. + * - If sg_nents total byte length exceeds the mr max_num_sge * page_size + * then only max_num_sg entries will be mapped. + * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these + * constraints holds and the page_size argument is ignored. + * + * Returns the number of sg elements that were mapped to the memory region. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->ops.map_mr_sg)) + return -EOPNOTSUPP; + + mr->page_size = page_size; + + return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg); + +/** + * ib_sg_to_pages() - Convert the largest prefix of a sg list + * to a page vector + * @mr: memory region + * @sgl: dma mapped scatterlist + * @sg_nents: number of entries in sg + * @sg_offset_p: ==== ======================================================= + * IN start offset in bytes into sg + * OUT offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. + * ==== ======================================================= + * @set_page: driver page assignment function pointer + * + * Core service helper for drivers to convert the largest + * prefix of given sg list to a page vector. The sg list + * prefix converted is the prefix that meet the requirements + * of ib_map_mr_sg. + * + * Returns the number of sg elements that were assigned to + * a page vector. + */ +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) +{ + struct scatterlist *sg; + u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + unsigned int last_page_off = 0; + u64 page_mask = ~((u64)mr->page_size - 1); + int i, ret; + + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; + mr->length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; + u64 end_dma_addr = dma_addr + dma_len; + u64 page_addr = dma_addr & page_mask; + + /* + * For the second and later elements, check whether either the + * end of element i-1 or the start of element i is not aligned + * on a page boundary. + */ + if (i && (last_page_off != 0 || page_addr != dma_addr)) { + /* Stop mapping if there is a gap. */ + if (last_end_dma_addr != dma_addr) + break; + + /* + * Coalesce this element with the last. If it is small + * enough just update mr->length. Otherwise start + * mapping from the next page. 
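+ * (Concretely: the goto skips set_page() for the page that the previous
+ * element already registered; if this element ends inside that page the
+ * do/while exits immediately and only mr->length grows, otherwise
+ * set_page() resumes at the following page.)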
+ */ + goto next_page; + } + + do { + ret = set_page(mr, page_addr); + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? i : ret; + } + prev_addr = page_addr; +next_page: + page_addr += mr->page_size; + } while (page_addr < end_dma_addr); + + mr->length += dma_len; + last_end_dma_addr = end_dma_addr; + last_page_off = end_dma_addr & ~page_mask; + + sg_offset = 0; + } + + if (sg_offset_p) + *sg_offset_p = 0; + return i; +} +EXPORT_SYMBOL(ib_sg_to_pages); + +struct ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* + * Post a WR and block until its completion is reaped for the SQ. + */ +static void __ib_drain_sq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->send_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe sdrain; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = ib_post_send(qp, &swr.wr, NULL); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + if (cq->poll_ctx == IB_POLL_DIRECT) + while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + else + wait_for_completion(&sdrain.done); +} + +/* + * Post a WR and block until its completion is reaped for the RQ. + */ +static void __ib_drain_rq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->recv_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}; + int ret; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = ib_post_recv(qp, &rwr, NULL); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + if (cq->poll_ctx == IB_POLL_DIRECT) + while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + else + wait_for_completion(&rdrain.done); +} + +/** + * ib_drain_sq() - Block until all SQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. Otherwise call the generic drain function + * __ib_drain_sq(). + * + * The caller must: + * + * ensure there is room in the CQ and SQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq(). + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_sq(struct ib_qp *qp) +{ + if (qp->device->ops.drain_sq) + qp->device->ops.drain_sq(qp); + else + __ib_drain_sq(qp); + trace_cq_drain_complete(qp->send_cq); +} +EXPORT_SYMBOL(ib_drain_sq); + +/** + * ib_drain_rq() - Block until all RQ CQEs have been consumed by the + * application. + * @qp: queue pair to drain + * + * If the device has a provider-specific drain function, then + * call that. 
Otherwise call the generic drain function + * __ib_drain_rq(). + * + * The caller must: + * + * ensure there is room in the CQ and RQ for the drain work request and + * completion. + * + * allocate the CQ using ib_alloc_cq(). + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_rq(struct ib_qp *qp) +{ + if (qp->device->ops.drain_rq) + qp->device->ops.drain_rq(qp); + else + __ib_drain_rq(qp); + trace_cq_drain_complete(qp->recv_cq); +} +EXPORT_SYMBOL(ib_drain_rq); + +/** + * ib_drain_qp() - Block until all CQEs have been consumed by the + * application on both the RQ and SQ. + * @qp: queue pair to drain + * + * The caller must: + * + * ensure there is room in the CQ(s), SQ, and RQ for drain work requests + * and completions. + * + * allocate the CQs using ib_alloc_cq(). + * + * ensure that there are no other contexts that are posting WRs concurrently. + * Otherwise the drain is not guaranteed. + */ +void ib_drain_qp(struct ib_qp *qp) +{ + ib_drain_sq(qp); + if (!qp->srq) + ib_drain_rq(qp); +} +EXPORT_SYMBOL(ib_drain_qp); + +struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, + enum rdma_netdev_t type, const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *), + int force_fail) +{ + struct rdma_netdev_alloc_params params; + struct net_device *netdev; + int rc; + + if (!device->ops.rdma_netdev_get_params || force_fail) + return ERR_PTR(-EOPNOTSUPP); + + rc = device->ops.rdma_netdev_get_params(device, port_num, type, + &params); + if (rc) + return ERR_PTR(rc); + + netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type, + setup, params.txqs, params.rxqs); + if (!netdev) + return ERR_PTR(-ENOMEM); + + return netdev; +} +EXPORT_SYMBOL(rdma_alloc_netdev); + +int rdma_init_netdev(struct ib_device *device, u32 port_num, + enum rdma_netdev_t type, const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *), + struct net_device *netdev, + int force_fail) +{ + struct rdma_netdev_alloc_params params; + int rc; + + if (!device->ops.rdma_netdev_get_params || force_fail) + return -EOPNOTSUPP; + + rc = device->ops.rdma_netdev_get_params(device, port_num, type, + &params); + if (rc) + return rc; + + return params.initialize_rdma_netdev(device, port_num, + netdev, params.param); +} +EXPORT_SYMBOL(rdma_init_netdev); + +void __rdma_block_iter_start(struct ib_block_iter *biter, + struct scatterlist *sglist, unsigned int nents, + unsigned long pgsz) +{ + memset(biter, 0, sizeof(struct ib_block_iter)); + biter->__sg = sglist; + biter->__sg_nents = nents; + + /* Driver provides best block size to use */ + biter->__pg_bit = __fls(pgsz); +} +EXPORT_SYMBOL(__rdma_block_iter_start); + +bool __rdma_block_iter_next(struct ib_block_iter *biter) +{ + unsigned int block_offset; + + if (!biter->__sg_nents || !biter->__sg) + return false; + + biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; + block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); + biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset; + + if (biter->__sg_advance >= sg_dma_len(biter->__sg)) { + biter->__sg_advance = 0; + biter->__sg = sg_next(biter->__sg); + biter->__sg_nents--; + } + + return true; +} +EXPORT_SYMBOL(__rdma_block_iter_next); + +/** + * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct + * for the drivers.
+ * @descs: array of static descriptors + * @num_counters: number of elements in array + * @lifespan: milliseconds between updates + */ +struct rdma_hw_stats *rdma_alloc_hw_stats_struct( + const struct rdma_stat_desc *descs, int num_counters, + unsigned long lifespan) +{ + struct rdma_hw_stats *stats; + + stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL); + if (!stats) + return NULL; + + stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters), + sizeof(*stats->is_disabled), GFP_KERNEL); + if (!stats->is_disabled) + goto err; + + stats->descs = descs; + stats->num_counters = num_counters; + stats->lifespan = msecs_to_jiffies(lifespan); + mutex_init(&stats->lock); + + return stats; + +err: + kfree(stats); + return NULL; +} +EXPORT_SYMBOL(rdma_alloc_hw_stats_struct); + +/** + * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats + * @stats: statistics to release + */ +void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats) +{ + if (!stats) + return; + + kfree(stats->is_disabled); + kfree(stats); +} +EXPORT_SYMBOL(rdma_free_hw_stats_struct); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs_nvmf.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs_nvmf.c new file mode 100644 index 0000000..7f0728a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/core/verbs_nvmf.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2015 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "core_priv.h" +/* NVMEoF target offload */ +int ib_query_nvmf_ns(struct ib_nvmf_ns *ns, struct ib_nvmf_ns_attr *ns_attr) +{ + return ns->ctrl->srq->device->ops.query_nvmf_ns ? 
+ ns->ctrl->srq->device->ops.query_nvmf_ns(ns, ns_attr) : -ENOSYS; +} +EXPORT_SYMBOL(ib_query_nvmf_ns); + +struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) +{ + struct ib_mr *mr; + int err; + + err = ib_check_mr_access(pd->device, mr_access_flags); + if (err) + return ERR_PTR(err); + + mr = pd->device->ops.get_dma_mr(pd, mr_access_flags); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + } + + return mr; +} +EXPORT_SYMBOL(ib_get_dma_mr); + +/* NVMEoF target offload */ +struct ib_nvmf_ctrl *ib_create_nvmf_backend_ctrl(struct ib_srq *srq, + struct ib_nvmf_backend_ctrl_init_attr *init_attr) +{ + struct ib_nvmf_ctrl *ctrl; + + if (!srq->device->ops.create_nvmf_backend_ctrl) + return ERR_PTR(-ENOSYS); + if (srq->srq_type != IB_EXP_SRQT_NVMF) + return ERR_PTR(-EINVAL); + + ctrl = srq->device->ops.create_nvmf_backend_ctrl(srq, init_attr); + if (!IS_ERR(ctrl)) { + atomic_set(&ctrl->usecnt, 0); + ctrl->srq = srq; + ctrl->event_handler = init_attr->event_handler; + ctrl->be_context = init_attr->be_context; + atomic_inc(&srq->usecnt); + } + + return ctrl; +} +EXPORT_SYMBOL_GPL(ib_create_nvmf_backend_ctrl); + +int ib_destroy_nvmf_backend_ctrl(struct ib_nvmf_ctrl *ctrl) +{ + struct ib_srq *srq = ctrl->srq; + int ret; + + if (atomic_read(&ctrl->usecnt)) + return -EBUSY; + + ret = srq->device->ops.destroy_nvmf_backend_ctrl(ctrl); + if (!ret) + atomic_dec(&srq->usecnt); + + return ret; +} +EXPORT_SYMBOL_GPL(ib_destroy_nvmf_backend_ctrl); + +struct ib_nvmf_ns *ib_attach_nvmf_ns(struct ib_nvmf_ctrl *ctrl, + struct ib_nvmf_ns_init_attr *init_attr) +{ + struct ib_srq *srq = ctrl->srq; + struct ib_nvmf_ns *ns; + + if (!srq->device->ops.attach_nvmf_ns) + return ERR_PTR(-ENOSYS); + if (srq->srq_type != IB_EXP_SRQT_NVMF) + return ERR_PTR(-EINVAL); + + ns = srq->device->ops.attach_nvmf_ns(ctrl, init_attr); + if (!IS_ERR(ns)) { + ns->ctrl = ctrl; + atomic_inc(&ctrl->usecnt); + } + + return ns; +} +EXPORT_SYMBOL_GPL(ib_attach_nvmf_ns); + +int ib_detach_nvmf_ns(struct ib_nvmf_ns *ns) +{ + struct ib_nvmf_ctrl *ctrl = ns->ctrl; + struct ib_srq *srq = ctrl->srq; + int ret; + + ret = srq->device->ops.detach_nvmf_ns(ns); + if (!ret) + atomic_dec(&ctrl->usecnt); + + return ret; +} +EXPORT_SYMBOL_GPL(ib_detach_nvmf_ns); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/Makefile new file mode 100644 index 0000000..e9d9f4b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/Makefile @@ -0,0 +1,3 @@ +EXTRA_CFLAGS := $(subst $(KERNEL_MEMTRACK_CFLAGS),,$(EXTRA_CFLAGS)) + +obj-m += memtrack.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.c new file mode 100644 index 0000000..8ece987 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.c @@ -0,0 +1,1428 @@ +/* + This software is available to you under a choice of one of two + licenses. You may choose to be licensed under the terms of the GNU + General Public License (GPL) Version 2, available at + , or the OpenIB.org BSD + license, available in the LICENSE.TXT file accompanying this + software. These details are also available at + . + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. +*/ + +#define C_MEMTRACK_C + +#ifdef kmalloc + #undef kmalloc +#endif +#ifdef kmalloc_array + #undef kmalloc_array +#endif +#ifdef kmemdup + #undef kmemdup +#endif +#ifdef kstrdup + #undef kstrdup +#endif +#ifdef kfree + #undef kfree +#endif +#ifdef vmalloc + #undef vmalloc +#endif +#ifdef vzalloc + #undef vzalloc +#endif +#ifdef vzalloc_node + #undef vzalloc_node +#endif +#ifdef __vmalloc + #undef __vmalloc +#endif +#ifdef vfree + #undef vfree +#endif +#ifdef kvfree + #undef kvfree +#endif +#ifdef memdup_user + #undef memdup_user +#endif +#ifdef memdup_user_nul + #undef memdup_user_nul +#endif +#ifdef kmem_cache_alloc + #undef kmem_cache_alloc +#endif +#ifdef kmem_cache_zalloc + #undef kmem_cache_zalloc +#endif +#ifdef kmem_cache_free + #undef kmem_cache_free +#endif +#ifdef kasprintf + #undef kasprintf +#endif +#ifdef ioremap + #undef ioremap +#endif +#ifdef ioremap_wc + #undef ioremap_wc +#endif +#ifdef io_mapping_create_wc + #undef io_mapping_create_wc +#endif +#ifdef io_mapping_free + #undef io_mapping_free +#endif +#ifdef ioremap_nocache + #undef ioremap_nocache +#endif +#ifdef iounmap + #undef iounmap +#endif +#ifdef alloc_pages + #undef alloc_pages +#endif +#ifdef dev_alloc_pages + #undef dev_alloc_pages +#endif +#ifdef free_pages + #undef free_pages +#endif +#ifdef split_page + #undef split_page +#endif +#ifdef get_page + #undef get_page +#endif +#ifdef put_page + #undef put_page +#endif +#ifdef create_workqueue + #undef create_workqueue +#endif +#ifdef create_rt_workqueue + #undef create_rt_workqueue +#endif +#ifdef create_freezeable_workqueue + #undef create_freezeable_workqueue +#endif +#ifdef create_singlethread_workqueue + #undef create_singlethread_workqueue +#endif +#ifdef destroy_workqueue + #undef destroy_workqueue +#endif +#ifdef kvzalloc + #undef kvzalloc +#endif +#ifdef kvmalloc + #undef kvmalloc +#endif +#ifdef kvmalloc_array + #undef kvmalloc_array +#endif +#ifdef kvmalloc_node + #undef kvmalloc_node +#endif +#ifdef kvzalloc_node + #undef kvzalloc_node +#endif +#ifdef kcalloc_node + #undef kcalloc_node +#endif +#ifdef kvcalloc + #undef kvcalloc +#endif +/* if kernel version < 2.6.37, it's defined in compat as singlethread_workqueue */ +#if defined(alloc_ordered_workqueue) && LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 37) + #undef alloc_ordered_workqueue +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "memtrack.h" + +#include + + +MODULE_AUTHOR("Mellanox Technologies LTD."); +MODULE_DESCRIPTION("Memory allocations tracking"); +MODULE_LICENSE("GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif + +#define MEMTRACK_HASH_SZ ((1<<15)-19) /* prime: http://www.utm.edu/research/primes/lists/2small/0bit.html */ +#define MAX_FILENAME_LEN 31 +#define MAX_FILE_PATH_LEN 256 +#define MAX_MODULE_NAME_LEN 20 +#define MAX_FUNC_NAME_LEN 20 + +#define memtrack_spin_lock(spl, flags) spin_lock_irqsave(spl, flags) +#define memtrack_spin_unlock(spl, flags) spin_unlock_irqrestore(spl, flags) + +/* if a bit is set then the corresponding allocation is tracked. + bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. 
*/ +static unsigned long track_mask = -1; /* effectively everything */ +module_param(track_mask, ulong, 0444); +MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked"); + +/* if a bit is set then the corresponding allocation is strictly tracked. + That is, before inserting the whole range is checked to not overlap any + of the allocations already in the database */ +static unsigned long strict_track_mask = 0; /* no strict tracking */ +module_param(strict_track_mask, ulong, 0444); +MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking"); + +/* Sets the frequency of allocations failures injections + if set to 0 all allocation should succeed */ +static unsigned int inject_freq = 0; +module_param(inject_freq, uint, 0644); +MODULE_PARM_DESC(inject_freq, "Error injection frequency, default is 0 (disabled)"); + +static int random_mem = 1; +module_param(random_mem, uint, 0644); +MODULE_PARM_DESC(random_mem, "When set, randomize allocated memory, default is 1 (enabled)"); + +/* + * Number of failures that can be for each allocation + */ +static int num_failures = 1; +module_param(num_failures, uint, 0644); +MODULE_PARM_DESC(num_failures, "Number of failures that can be for each allocation"); + + +struct memtrack_injected_info_t { + unsigned int num_failures; + unsigned long line_num; + char module_name[MAX_MODULE_NAME_LEN]; + char file_name[MAX_FILE_PATH_LEN]; + char func_name[MAX_FUNC_NAME_LEN]; + char caller_func_name[MAX_FUNC_NAME_LEN]; + struct list_head list; +}; + +struct memtrack_ignore_func_info_t { + char func_name[MAX_FUNC_NAME_LEN]; + struct list_head list; +}; + +struct memtrack_targeted_modules_info_t { + char module_name[MAX_MODULE_NAME_LEN]; + struct list_head list; +}; + +static LIST_HEAD(injected_info_list); +static LIST_HEAD(ignored_func_list); +static LIST_HEAD(targeted_modules_list); +static DEFINE_MUTEX(ignored_func_mutex); +static DEFINE_MUTEX(targeted_modules_mutex); + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16) +static struct kobject *memtrack_obj; +#endif + +struct memtrack_meminfo_t { + unsigned long addr; + unsigned long size; + unsigned long line_num; + unsigned long dev; + unsigned long addr2; + int direction; + struct memtrack_meminfo_t *next; + struct list_head list; /* used to link all items from a certain type together */ + char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. 
packing */ + char ext_info[32]; +}; + +static struct kmem_cache *meminfo_cache; + +struct tracked_obj_desc_t { + struct memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ]; + spinlock_t hash_lock; + unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */ + struct list_head tracked_objs_head; /* head of list of all objects */ + int strict_track; /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */ +}; + +static struct tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES]; + +static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = { + "kmalloc", + "vmalloc", + "kmem_cache_alloc", + "io_remap", + "create_workqueue", + "alloc_pages", + "ib_dma_map_single", + "ib_dma_map_page", + "ib_dma_map_sg" +}; + +static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = { + "kfree", + "vfree", + "kmem_cache_free", + "io_unmap", + "destory_workqueue", + "free_pages", + "ib_dma_unmap_single", + "ib_dma_unmap_page", + "ib_dma_unmap_sg" +}; + +static inline const char *memtype_alloc_str(enum memtrack_memtype_t memtype) +{ + switch (memtype) { + case MEMTRACK_KMALLOC: + case MEMTRACK_VMALLOC: + case MEMTRACK_KMEM_OBJ: + case MEMTRACK_IOREMAP: + case MEMTRACK_WORK_QUEUE: + case MEMTRACK_PAGE_ALLOC: + case MEMTRACK_DMA_MAP_SINGLE: + case MEMTRACK_DMA_MAP_PAGE: + case MEMTRACK_DMA_MAP_SG: + return rsc_names[memtype]; + default: + return "(Unknown allocation type)"; + } +} + +static inline const char *memtype_free_str(enum memtrack_memtype_t memtype) +{ + switch (memtype) { + case MEMTRACK_KMALLOC: + case MEMTRACK_VMALLOC: + case MEMTRACK_KMEM_OBJ: + case MEMTRACK_IOREMAP: + case MEMTRACK_WORK_QUEUE: + case MEMTRACK_PAGE_ALLOC: + case MEMTRACK_DMA_MAP_SINGLE: + case MEMTRACK_DMA_MAP_PAGE: + case MEMTRACK_DMA_MAP_SG: + return rsc_free_names[memtype]; + default: + return "(Unknown allocation type)"; + } +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16) +int find_ignore_func_info(const char *func_name) +{ + struct memtrack_ignore_func_info_t *tmp_ignore_func_info_p; + + list_for_each_entry(tmp_ignore_func_info_p, &ignored_func_list, list) { + if (!strcmp(tmp_ignore_func_info_p->func_name, func_name)) + return 1; + } + return 0; +} + +int find_targeted_module_info(char *module_name) +{ + struct memtrack_targeted_modules_info_t *tmp_targeted_module_info_p; + list_for_each_entry(tmp_targeted_module_info_p, + &targeted_modules_list, + list) { + if (!strcmp(tmp_targeted_module_info_p->module_name, + module_name)) + return 1; + } + return 0; +} + +static ssize_t show_targeted_modules(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t len = 0; + struct memtrack_targeted_modules_info_t *tmp_targeted_module_p; + mutex_lock(&targeted_modules_mutex); + if (list_empty(&targeted_modules_list)) { + len = sprintf(&buf[len], "all\n"); + } else { + list_for_each_entry(tmp_targeted_module_p, + &targeted_modules_list, + list) { + len += sprintf(&buf[len], "%s\n", + tmp_targeted_module_p->module_name); + } + } + mutex_unlock(&targeted_modules_mutex); + return len; +} + +static ssize_t store_targeted_module(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + char module_name[MAX_MODULE_NAME_LEN]; + struct memtrack_targeted_modules_info_t *new_targeted_module_info_p, + *tmp_targeted_module_info_p, + *targeted_module_info_p; + + if (sscanf(buf, "%s", module_name)) { + if (!strcmp("reset", module_name)) { + mutex_lock(&targeted_modules_mutex); + 
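/* "reset" input: remove and free every entry on the targeted-modules list */ +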
list_for_each_entry_safe(targeted_module_info_p, + tmp_targeted_module_info_p, + &targeted_modules_list, + list) { + list_del(&targeted_module_info_p->list); + kfree(targeted_module_info_p); + } + mutex_unlock(&targeted_modules_mutex); + return count; + } + } + + if (sscanf(buf, "+%s", module_name)) { + if (!find_targeted_module_info(module_name)) { + new_targeted_module_info_p = + kmalloc(sizeof(*new_targeted_module_info_p), + GFP_KERNEL); + if (!new_targeted_module_info_p) { + printk(KERN_ERR "memtrack::%s: failed to allocate new targeted module info\n", __func__); + mutex_unlock(&targeted_modules_mutex); + return -ENOMEM; + } + strcpy(new_targeted_module_info_p->module_name, + module_name); + list_add_tail(&(new_targeted_module_info_p->list), + &(targeted_modules_list)); + mutex_unlock(&targeted_modules_mutex); + return count; + } else { + return -EEXIST; + } + } + + if (sscanf(buf, "-%s", module_name)) { + if (find_targeted_module_info(module_name)) { + mutex_lock(&targeted_modules_mutex); + list_for_each_entry_safe(targeted_module_info_p, + tmp_targeted_module_info_p, + &targeted_modules_list, + list) { + if (!strcmp(targeted_module_info_p->module_name, + module_name)) { + list_del(&targeted_module_info_p->list); + kfree(targeted_module_info_p); + break; + } + } + mutex_unlock(&targeted_modules_mutex); + return count; + } else { + return -EINVAL; + } + } + + return -EPERM; +} + +static ssize_t show_ingore_functions(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + ssize_t len = 0; + struct memtrack_ignore_func_info_t *tmp_ignore_func_p; + mutex_lock(&ignored_func_mutex); + list_for_each_entry(tmp_ignore_func_p, &ignored_func_list, list) { + len += sprintf(&buf[len], "%s\n", tmp_ignore_func_p->func_name); + } + mutex_unlock(&ignored_func_mutex); + return len; +} + +static ssize_t store_ingore_function(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + char func_name[MAX_FUNC_NAME_LEN]; + struct memtrack_ignore_func_info_t *new_ignore_func_p, + *tmp_ignore_func_p, + *ignore_func_p; + + if (sscanf(buf, "%s", func_name)) { + if (!strcmp("reset", func_name)) { + mutex_lock(&ignored_func_mutex); + list_for_each_entry_safe(ignore_func_p, + tmp_ignore_func_p, + &ignored_func_list, + list) { + list_del(&ignore_func_p->list); + kfree(ignore_func_p); + } + mutex_unlock(&ignored_func_mutex); + return count; + } + } + + if (sscanf(buf, "+%s", func_name)) { + if (!find_ignore_func_info(func_name)) { + mutex_lock(&ignored_func_mutex); + new_ignore_func_p = + kmalloc(sizeof(*new_ignore_func_p), + GFP_KERNEL); + if (!new_ignore_func_p) { + printk(KERN_ERR "memtrack::%s: failed to allocate new ignored func info\n", __func__); + mutex_unlock(&ignored_func_mutex); + return -ENOMEM; + } + strcpy(new_ignore_func_p->func_name, func_name); + list_add_tail(&(new_ignore_func_p->list), + &(ignored_func_list)); + mutex_unlock(&ignored_func_mutex); + return count; + } else { + return -EEXIST; + } + } + + if (sscanf(buf, "-%s", func_name)) { + if (find_ignore_func_info(func_name)) { + mutex_lock(&ignored_func_mutex); + list_for_each_entry_safe(ignore_func_p, + tmp_ignore_func_p, + &ignored_func_list, + list) { + if (!strcmp(ignore_func_p->func_name, + func_name)) { + list_del(&ignore_func_p->list); + kfree(ignore_func_p); + break; + } + } + mutex_unlock(&ignored_func_mutex); + return count; + } else { + return -EINVAL; + } + } + + return -EPERM; +} + +static struct kobj_attribute ignore_func_attribute = + __ATTR(ignore_funcs, S_IRUGO, + 
show_ingore_functions, + store_ingore_function); + +static struct kobj_attribute targeted_modules_attribute = + __ATTR(targeted_modules, S_IRUGO, + show_targeted_modules, + store_targeted_module); + + +static struct attribute *attrs[] = { + &ignore_func_attribute.attr, + &targeted_modules_attribute.attr, + NULL, +}; + + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +#endif + +/* + * overlap_a_b + */ +static inline int overlap_a_b(unsigned long a_start, unsigned long a_end, + unsigned long b_start, unsigned long b_end) +{ + if ((b_start > a_end) || (a_start > b_end)) + return 0; + + return 1; +} + +/* + * check_overlap + */ +static void check_overlap(enum memtrack_memtype_t memtype, + struct memtrack_meminfo_t *mem_info_p, + struct tracked_obj_desc_t *obj_desc_p) +{ + struct list_head *pos, *next; + struct memtrack_meminfo_t *cur; + unsigned long start_a, end_a, start_b, end_b; + + start_a = mem_info_p->addr; + end_a = mem_info_p->addr + mem_info_p->size - 1; + + list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) { + cur = list_entry(pos, struct memtrack_meminfo_t, list); + + start_b = cur->addr; + end_b = cur->addr + cur->size - 1; + + if (overlap_a_b(start_a, end_a, start_b, end_b)) + printk(KERN_ERR "%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n", + memtype_alloc_str(memtype), mem_info_p->addr, + mem_info_p->addr + mem_info_p->size - 1, cur->addr, + cur->addr + cur->size - 1); + } +} + +/* Invoke on memory allocation */ +void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, + unsigned long addr, unsigned long size, unsigned long addr2, + int direction, const char *filename, + const unsigned long line_num, int alloc_flags) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + + if (memtype == MEMTRACK_KVMALLOC) + memtype = is_vmalloc_addr((const void *)addr) ? MEMTRACK_VMALLOC : MEMTRACK_KMALLOC; + + if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); + return; + } + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + new_mem_info_p = (struct memtrack_meminfo_t *)kmem_cache_alloc(meminfo_cache, alloc_flags); + if (new_mem_info_p == NULL) { + printk(KERN_ERR "%s: Failed allocating kmem_cache item for new mem_info. " + "Lost tracking on allocation at %s:%lu...\n", __func__, + filename, line_num); + return; + } + /* save allocation properties */ + new_mem_info_p->addr = addr; + new_mem_info_p->size = size; + new_mem_info_p->dev = dev; + new_mem_info_p->addr2 = addr2; + new_mem_info_p->direction = direction; + + new_mem_info_p->line_num = line_num; + *new_mem_info_p->ext_info = '\0'; + /* Make sure that we will print out the path tail if the given filename is longer + * than MAX_FILENAME_LEN. (otherwise, we will not see the name of the actual file + * in the printout -- only the path head! 
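+	 * Only the trailing MAX_FILENAME_LEN characters of the path are kept in that case.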
+ */ + if (strlen(filename) > MAX_FILENAME_LEN) + strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN); + else + strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN); + + new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */ + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* make sure given memory location is not already allocated */ + if ((memtype != MEMTRACK_DMA_MAP_SINGLE) && (memtype != MEMTRACK_DMA_MAP_PAGE) && + (memtype != MEMTRACK_DMA_MAP_SG)) { + + /* make sure given memory location is not already allocated */ + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { + /* Found given address in the database */ + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n", + __func__, filename, line_num, + memtype_alloc_str(memtype), addr, + cur_mem_info_p->filename, + cur_mem_info_p->line_num); + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + kmem_cache_free(meminfo_cache, new_mem_info_p); + return; + } + cur_mem_info_p = cur_mem_info_p->next; + } + } + /* not found - we can put in the hash bucket */ + /* link as first */ + new_mem_info_p->next = obj_desc_p->mem_hash[hash_val]; + obj_desc_p->mem_hash[hash_val] = new_mem_info_p; + if (obj_desc_p->strict_track) + check_overlap(memtype, new_mem_info_p, obj_desc_p); + obj_desc_p->count += size; + list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head); + + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return; +} +EXPORT_SYMBOL(memtrack_alloc); + +/* Invoke on memory free */ +void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, + unsigned long addr, unsigned long size, int direction, + const char *filename, const unsigned long line_num) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + + if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); + return; + } + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* find mem_info of given memory location */ + prev_mem_info_p = NULL; + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) { + /* Found given address in the database */ + if ((memtype == MEMTRACK_DMA_MAP_SINGLE) || (memtype == MEMTRACK_DMA_MAP_PAGE) || + (memtype == MEMTRACK_DMA_MAP_SG)) { + if (direction != cur_mem_info_p->direction) + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad direction for addr 0x%lX: alloc:0x%x, free:0x%x (allocated in %s::%lu)\n", + __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->direction, direction, + cur_mem_info_p->filename, cur_mem_info_p->line_num); + + if (size != cur_mem_info_p->size) + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad size for addr 0x%lX: size:%lu, free:%lu (allocated in %s::%lu)\n", + __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->size, size, + cur_mem_info_p->filename, cur_mem_info_p->line_num); + } + + /* Remove from the bucket/list */ + if (prev_mem_info_p == NULL) + 
obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */ + else + prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */ + + list_del(&cur_mem_info_p->list); + + obj_desc_p->count -= cur_mem_info_p->size; + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + kmem_cache_free(meminfo_cache, cur_mem_info_p); + return; + } + prev_mem_info_p = cur_mem_info_p; + cur_mem_info_p = cur_mem_info_p->next; + } + + /* not found */ + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX, device=0x%lX\n", + __func__, filename, line_num, memtype_free_str(memtype), addr, dev); + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return; +} +EXPORT_SYMBOL(memtrack_free); + +/* + * This function recognizes allocations which + * may be released by kernel (e.g. skb) and + * therefore not trackable by memtrack. + * The allocations are recognized by the name + * of their calling function. + */ +int is_non_trackable_alloc_func(const char *func_name) +{ + static const char * const str_str_arr[] = { + /* functions containing these strings consider non trackable */ + "skb", + }; + static const char * const str_str_excep_arr[] = { + /* functions which are exception to the str_str_arr table */ + "ipoib_cm_skb_too_long" + }; + static const char * const str_cmp_arr[] = { + /* functions that allocate SKBs */ + "mlx4_en_alloc_frags", + "mlx4_en_alloc_frag", + "mlx4_en_init_allocator", + "mlx4_en_free_frag", + "mlx4_en_free_rx_desc", + "mlx4_en_destroy_allocator", + "mlx4_en_complete_rx_desc", + "mlx4_alloc_pages", + "mlx4_alloc_page", + "mlx4_crdump_collect_crspace", + "mlx4_crdump_collect_fw_health", + "mlx5e_page_alloc_mapped", + "mlx5e_put_page", + /* vnic skb functions */ + "free_single_frag", + "vnic_alloc_rx_skb", + "vnic_rx_skb", + "vnic_alloc_frag", + "vnic_empty_rx_entry", + "vnic_init_allocator", + "vnic_destroy_allocator", + "sdp_post_recv", + "sdp_rx_ring_purge", + "sdp_post_srcavail", + "sk_stream_alloc_page", + "update_send_head", + "sdp_bcopy_get", + "sdp_destroy_resources", + "tcf_exts_init", + "tcf_hashinfo_init", + /* sw steering functions */ + "dr_icm_chunk_create", + /* release order0 pages, for old kernels only */ + "mlx5e_free_xdpsq_desc", + /* kTLS resync dump */ + "tx_sync_info_get", + "mlx5e_ktls_tx_handle_resync_dump_comp", + }; + size_t str_str_arr_size = sizeof(str_str_arr)/sizeof(char *); + size_t str_str_excep_size = sizeof(str_str_excep_arr)/sizeof(char *); + size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); + + int i, j; + + for (i = 0; i < str_str_arr_size; ++i) + if (strstr(func_name, str_str_arr[i])) { + for (j = 0; j < str_str_excep_size; ++j) + if (!strcmp(func_name, str_str_excep_arr[j])) + return 0; + return 1; + } + for (i = 0; i < str_cmp_arr_size; ++i) + if (!strcmp(func_name, str_cmp_arr[i])) + return 1; + return 0; +} +EXPORT_SYMBOL(is_non_trackable_alloc_func); + +/* + * In some cases we need to free a memory + * we defined as "non trackable" (see + * is_non_trackable_alloc_func). + * This function recognizes such releases + * by the name of their calling function. 
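+ * Unlike is_non_trackable_alloc_func(), function names here are matched
+ * only by exact strcmp(); no substring matching is done.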
+ */ +int is_non_trackable_free_func(const char *func_name) +{ + static const char * const str_cmp_arr[] = { + /* sw steering functions */ + "mlx5dr_icm_free_chunk", + /* external function in mdev module */ + "create_store", + /* functions in mlxdevm.c uses memory allocated by nla_strdup */ + "mlxdevm_nl_cmd_rate_new_doit", + "mlxdevm_nl_cmd_rate_del_doit", + "mlxdevm_rate_node_get_doit_locked", + "mlxdevm_cmd_rate_set_node", + "mlxdevm_cmd_rate_set_leaf", + }; + size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *); + int i; + + for (i = 0; i < str_cmp_arr_size; ++i) + if (!strcmp(func_name, str_cmp_arr[i])) + return 1; + + return 0; +} +EXPORT_SYMBOL(is_non_trackable_free_func); + +/* Check if put_page tracking should be skipped since it is called from + * untracked caller function (func_name). + * Return values: + * 1 - Should be skipped + * 0 - Shouldn't be skipped + */ +int is_umem_put_page(const char *func_name) +{ + const char func_str[18] = "__ib_umem_release"; + const char func_str1[12] = "ib_umem_get"; + const char func_str2[32] = "ib_umem_odp_map_dma_single_page"; + const char func_str3[26] = "ib_umem_odp_map_dma_pages"; + + return ((strstr(func_name, func_str) != NULL) || + (strstr(func_name, func_str1) != NULL) || + (strstr(func_name, func_str2) != NULL) || + (strstr(func_name, func_str3) != NULL)) ? 1 : 0; +} +EXPORT_SYMBOL(is_umem_put_page); + +/* Check page order size + When Freeing a page allocation it checks whether + we are trying to free the same size + we asked to allocate */ +int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, + unsigned long size, const char *filename, + const unsigned long line_num) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + int ret = 0; + + if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); + return 1; + } + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return 1; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* find mem_info of given memory location */ + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if (cur_mem_info_p->addr == addr) { + /* Found given address in the database - check size */ + if (cur_mem_info_p->size != size) { + printk(KERN_ERR "mtl size inconsistency: %s: %s::%lu: try to %s at address=0x%lX with size %lu while was created with size %lu\n", + __func__, filename, line_num, memtype_free_str(memtype), + addr, size, cur_mem_info_p->size); + snprintf(cur_mem_info_p->ext_info, sizeof(cur_mem_info_p->ext_info), + "invalid free size %lu\n", size); + ret = 1; + } + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return ret; + } + cur_mem_info_p = cur_mem_info_p->next; + } + + /* not found - This function will not give any indication + but will only check the correct size\order + For inconsistency the 'free' function will check that */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return 1; +} +EXPORT_SYMBOL(memtrack_check_size); + +/* Search for a specific addr whether it exist in the + current data-base. 
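+   (Used by the get_page()/put_page() wrappers in mtrack.h to decide
+   whether a page is already tracked.)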
+ It will print an error msg if we get an unexpected result, + Return value: 0 - if addr exist, else 1 */ +int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, + const char *filename, const unsigned long line_num) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + + if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { + printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype); + return 1; + } + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return 0; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* find mem_info of given memory location */ + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if (cur_mem_info_p->addr == addr) { + /* Found given address in the database - exiting */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return 0; + } + cur_mem_info_p = cur_mem_info_p->next; + } + + /* not found */ + if (expect_exist) + printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n", + __func__, filename, line_num, memtype_free_str(memtype), addr); + + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return 1; +} +EXPORT_SYMBOL(memtrack_is_new_addr); + +/* Return current page reference counter */ +int memtrack_get_page_ref_count(unsigned long addr) +{ + unsigned long hash_val; + struct memtrack_meminfo_t *cur_mem_info_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + /* This function is called only for page allocation */ + enum memtrack_memtype_t memtype = MEMTRACK_PAGE_ALLOC; + int ref_conut = 0; + + if (!tracked_objs_arr[memtype]) { + /* object is not tracked */ + return ref_conut; + } + obj_desc_p = tracked_objs_arr[memtype]; + + hash_val = addr % MEMTRACK_HASH_SZ; + + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); + /* find mem_info of given memory location */ + cur_mem_info_p = obj_desc_p->mem_hash[hash_val]; + while (cur_mem_info_p != NULL) { + if (cur_mem_info_p->addr == addr) { + /* Found given address in the database - check ref-count */ + struct page *page = (struct page *)(cur_mem_info_p->addr); +#ifdef HAVE_MM_PAGE__COUNT + ref_conut = atomic_read(&page->_count); +#else + ref_conut = atomic_read(&page->_refcount); +#endif + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return ref_conut; + } + cur_mem_info_p = cur_mem_info_p->next; + } + + /* not found */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + return ref_conut; +} +EXPORT_SYMBOL(memtrack_get_page_ref_count); + +/* Report current allocations status (for all memory types) */ +static void memtrack_report(void) +{ + enum memtrack_memtype_t memtype; + unsigned long cur_bucket; + struct memtrack_meminfo_t *cur_mem_info_p; + int serial = 1; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + unsigned long detected_leaks = 0; + + printk(KERN_INFO "%s: Currently known allocations:\n", __func__); + for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { + if (tracked_objs_arr[memtype]) { + printk(KERN_INFO "%d) %s:\n", serial, memtype_alloc_str(memtype)); + obj_desc_p = tracked_objs_arr[memtype]; + /* Scan all buckets to find existing allocations */ + /* TBD: this may be optimized by holding a linked list of all hash items */ + for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { + memtrack_spin_lock(&obj_desc_p->hash_lock, 
flags); /* protect per bucket/list */ + cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; + while (cur_mem_info_p != NULL) { /* scan bucket */ + printk_ratelimited(KERN_INFO "%s::%lu: %s(%lu)==%lX dev=%lX %s\n", + cur_mem_info_p->filename, + cur_mem_info_p->line_num, + memtype_alloc_str(memtype), + cur_mem_info_p->size, + cur_mem_info_p->addr, + cur_mem_info_p->dev, + cur_mem_info_p->ext_info); + cur_mem_info_p = cur_mem_info_p->next; + ++ detected_leaks; + } /* while cur_mem_info_p */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + } /* for cur_bucket */ + serial++; + } + } /* for memtype */ + printk(KERN_INFO "%s: Summary: %lu leak(s) detected\n", __func__, detected_leaks); +} + + + +static struct proc_dir_entry *memtrack_tree; + +static enum memtrack_memtype_t get_rsc_by_name(const char *name) +{ + enum memtrack_memtype_t i; + + for (i = 0; i < MEMTRACK_NUM_OF_MEMTYPES; ++i) { + if (strcmp(name, rsc_names[i]) == 0) + return i; + } + + return i; +} + + +static ssize_t memtrack_read(struct file *filp, + char __user *buf, + size_t size, + loff_t *offset) +{ + unsigned long cur, flags; + loff_t pos = *offset; + static char kbuf[20]; + static int file_len; + int _read, to_ret, left; + const char *fname; + enum memtrack_memtype_t memtype; + + if (pos < 0) + return -EINVAL; + + fname = filp->f_path.dentry->d_name.name; + + memtype = get_rsc_by_name(fname); + if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) { + printk(KERN_ERR "invalid file name\n"); + return -EINVAL; + } + + if (pos == 0) { + memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags); + cur = tracked_objs_arr[memtype]->count; + memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags); + _read = sprintf(kbuf, "%lu\n", cur); + if (_read < 0) + return _read; + else + file_len = _read; + } + + left = file_len - pos; + to_ret = (left < size) ? 
left : size; + if (copy_to_user(buf, kbuf+pos, to_ret)) + return -EFAULT; + else { + *offset = pos + to_ret; + return to_ret; + } +} + +static const struct proc_ops memtrack_proc_ops = { + .proc_read = memtrack_read, +}; + +static const char *memtrack_proc_entry_name = "mt_memtrack"; + +static int create_procfs_tree(void) +{ + struct proc_dir_entry *dir_ent; + struct proc_dir_entry *proc_ent; + int i, j; + unsigned long bit_mask; + + dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL); + if (!dir_ent) + return -1; + + memtrack_tree = dir_ent; + + for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { + if (bit_mask & track_mask) { + proc_ent = proc_create_data(rsc_names[i], S_IRUGO, memtrack_tree, &memtrack_proc_ops, NULL); + if (!proc_ent) { + printk(KERN_INFO "Warning: Cannot create /proc/%s/%s\n", + memtrack_proc_entry_name, rsc_names[i]); + goto undo_create_root; + } + } + } + + goto exit_ok; + +undo_create_root: + for (j = 0, bit_mask = 1; j < i; ++j, bit_mask <<= 1) { + if (bit_mask & track_mask) + remove_proc_entry(rsc_names[j], memtrack_tree); + } + remove_proc_entry(memtrack_proc_entry_name, NULL); + return -1; + +exit_ok: + return 0; +} + + +static void destroy_procfs_tree(void) +{ + int i; + unsigned long bit_mask; + + for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { + if (bit_mask & track_mask) + remove_proc_entry(rsc_names[i], memtrack_tree); + + } + remove_proc_entry(memtrack_proc_entry_name, NULL); +} + +int inject_error_record(char *module_name, char *file_name, char *func_name, + const char *caller_func_name, unsigned long line_num) +{ + struct memtrack_injected_info_t *tmp_injected_info_p, + *new_injected_info_p; + + list_for_each_entry(tmp_injected_info_p, &injected_info_list, list) { + if (!strcmp(module_name, tmp_injected_info_p->module_name) + && !strcmp(file_name, tmp_injected_info_p->file_name) + && line_num == tmp_injected_info_p->line_num) { + if (tmp_injected_info_p->num_failures >= num_failures) + return 0; + else + return ++tmp_injected_info_p->num_failures; + } + } + + new_injected_info_p = kmalloc(sizeof(*new_injected_info_p), GFP_KERNEL); + if (!new_injected_info_p) { + printk(KERN_ERR "memtrack::%s: failed to allocate new injected info\n", __func__); + return 0; + } + strcpy(new_injected_info_p->module_name, module_name); + strcpy(new_injected_info_p->func_name, func_name); + strcpy(new_injected_info_p->caller_func_name, caller_func_name); + strcpy(new_injected_info_p->file_name, file_name); + new_injected_info_p->line_num = line_num; + new_injected_info_p->num_failures = 1; + INIT_LIST_HEAD(&new_injected_info_p->list); + list_add_tail(&(new_injected_info_p->list), &(injected_info_list)); + return 1; +} + +int memtrack_inject_error(struct module *module_obj, + char *file_name, + char *func_name, + const char *caller_func_name, + unsigned long line_num) +{ + int val = 0; + + if (!strcmp(module_obj->name, "memtrack")) + return 0; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16) + if (!list_empty(&targeted_modules_list) && + !find_targeted_module_info(module_obj->name)) { + return 0; + } + + if (find_ignore_func_info(caller_func_name)) + return 0; +#endif + if (inject_freq) { + if (!(random32() % inject_freq)) { + val = inject_error_record(module_obj->name, + file_name, func_name, + caller_func_name, + line_num); + } + } + + return val; +} +EXPORT_SYMBOL(memtrack_inject_error); + +int memtrack_randomize_mem(void) +{ + return random_mem; +} +EXPORT_SYMBOL(memtrack_randomize_mem); + +/* module entry 
points */ + +int init_module(void) +{ + enum memtrack_memtype_t i; + int j; + unsigned long bit_mask; + + + /* create a cache for the memtrack_meminfo_t strcutures */ + meminfo_cache = kmem_cache_create("memtrack_meminfo_t", + sizeof(struct memtrack_meminfo_t), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!meminfo_cache) { + printk(KERN_ERR "memtrack::%s: failed to allocate meminfo cache\n", __func__); + return -1; + } + + /* initialize array of descriptors */ + memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr)); + + /* create a tracking object descriptor for all required objects */ + for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) { + if (bit_mask & track_mask) { + tracked_objs_arr[i] = vmalloc(sizeof(struct tracked_obj_desc_t)); + if (!tracked_objs_arr[i]) { + printk(KERN_ERR "memtrack: failed to allocate tracking object\n"); + goto undo_cache_create; + } + + memset(tracked_objs_arr[i], 0, sizeof(struct tracked_obj_desc_t)); + spin_lock_init(&tracked_objs_arr[i]->hash_lock); + INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head); + if (bit_mask & strict_track_mask) + tracked_objs_arr[i]->strict_track = 1; + else + tracked_objs_arr[i]->strict_track = 0; + } + } + + + if (create_procfs_tree()) { + printk(KERN_ERR "%s: create_procfs_tree() failed\n", __FILE__); + goto undo_cache_create; + } + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16) + memtrack_obj = kobject_create_and_add("memtrack", kernel_kobj); + if (!memtrack_obj) { + printk(KERN_ERR "memtrack::%s: failed to create memtrack kobject\n", __func__); + goto undo_procfs_tree; + } + + if (sysfs_create_group(memtrack_obj, &attr_group)) { + printk(KERN_ERR "memtrack::%s: failed to create memtrack sysfs\n", __func__); + goto undo_kobject; + } +#endif + printk(KERN_INFO "memtrack::%s done.\n", __func__); + + return 0; + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16) +undo_kobject: + kobject_put(memtrack_obj); + +undo_procfs_tree: + destroy_procfs_tree(); +#endif + +undo_cache_create: + for (j = 0; j < i; ++j) { + if (tracked_objs_arr[j]) + vfree(tracked_objs_arr[j]); + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) + if (kmem_cache_destroy(meminfo_cache) != 0) + printk(KERN_ERR "Failed on kmem_cache_destroy!\n"); +#else + kmem_cache_destroy(meminfo_cache); +#endif + return -1; +} + + +void cleanup_module(void) +{ + enum memtrack_memtype_t memtype; + unsigned long cur_bucket; + struct memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p; + struct memtrack_injected_info_t *injected_info_p, *tmp_injected_info_p; + struct memtrack_ignore_func_info_t *ignore_func_p, *tmp_ignore_func_p; + struct memtrack_targeted_modules_info_t *targeted_module_p, + *tmp_targeted_module_p; + struct tracked_obj_desc_t *obj_desc_p; + unsigned long flags; + + + memtrack_report(); + + + destroy_procfs_tree(); + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16) + kobject_put(memtrack_obj); +#endif + + /* clean up any hash table left-overs */ + for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) { + /* Scan all buckets to find existing allocations */ + /* TBD: this may be optimized by holding a linked list of all hash items */ + if (tracked_objs_arr[memtype]) { + obj_desc_p = tracked_objs_arr[memtype]; + for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) { + memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */ + cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket]; + while (cur_mem_info_p != NULL) { /* scan bucket */ + next_mem_info_p = cur_mem_info_p->next; /* save "next" 
pointer before the "free" */ + kmem_cache_free(meminfo_cache, cur_mem_info_p); + cur_mem_info_p = next_mem_info_p; + } /* while cur_mem_info_p */ + memtrack_spin_unlock(&obj_desc_p->hash_lock, flags); + } /* for cur_bucket */ + vfree(obj_desc_p); + } + } /* for memtype */ + + /* print report of injected memory failures */ + printk(KERN_INFO "memtrack::cleanup_module: Report of injected memroy failures:\n"); + list_for_each_entry_safe(injected_info_p, + tmp_injected_info_p, + &injected_info_list, + list) { + printk(KERN_INFO "Module=%s file=%s func=%s caller=%s line=%lu num_failures=%d", + injected_info_p->module_name, + injected_info_p->file_name + + strlen(injected_info_p->file_name)/2, + injected_info_p->func_name, + injected_info_p->caller_func_name, + injected_info_p->line_num, + injected_info_p->num_failures); + list_del(&injected_info_p->list); + kfree(injected_info_p); + } + + /* Free ignore function list*/ + list_for_each_entry_safe(ignore_func_p, + tmp_ignore_func_p, + &ignored_func_list, + list) { + list_del(&ignore_func_p->list); + kfree(ignore_func_p); + } + + /* Free targeted module list*/ + list_for_each_entry_safe(targeted_module_p, + tmp_targeted_module_p, + &targeted_modules_list, + list) { + list_del(&targeted_module_p->list); + kfree(targeted_module_p); + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19) + if (kmem_cache_destroy(meminfo_cache) != 0) + printk(KERN_ERR "memtrack::cleanup_module: Failed on kmem_cache_destroy!\n"); +#else + kmem_cache_destroy(meminfo_cache); +#endif + printk(KERN_INFO "memtrack::cleanup_module done.\n"); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.h new file mode 100644 index 0000000..afffb1e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/memtrack.h @@ -0,0 +1,110 @@ +/* + This software is available to you under a choice of one of two + licenses. You may choose to be licensed under the terms of the GNU + General Public License (GPL) Version 2, available at + , or the OpenIB.org BSD + license, available in the LICENSE.TXT file accompanying this + software. These details are also available at + . + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. +*/ + +#ifndef H_MEMTRACK_H +#define H_MEMTRACK_H + +enum memtrack_memtype_t { + MEMTRACK_KMALLOC, + MEMTRACK_VMALLOC, + MEMTRACK_KMEM_OBJ, + MEMTRACK_IOREMAP, /* IO-RE/UN-MAP */ + MEMTRACK_WORK_QUEUE, /* Handle work-queue create & destroy */ + MEMTRACK_PAGE_ALLOC, /* Handle page allocation and free */ + MEMTRACK_DMA_MAP_SINGLE,/* Handle ib_dma_single map and unmap */ + MEMTRACK_DMA_MAP_PAGE, /* Handle ib_dma_page map and unmap */ + MEMTRACK_DMA_MAP_SG, /* Handle ib_dma_sg map and unmap with and without attributes */ + MEMTRACK_NUM_OF_MEMTYPES, + + /* Pseudo memtypes that are converted to actual memtypes above. 
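+	   memtrack_alloc() converts MEMTRACK_KVMALLOC into MEMTRACK_KMALLOC or
+	   MEMTRACK_VMALLOC according to is_vmalloc_addr().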
*/ + MEMTRACK_KVMALLOC, /* Auto determine MEMTRACK_KMALLOC or MEMTRACK_VMALLOC */ +}; + +/* Invoke on memory allocation */ +void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev, + unsigned long addr, unsigned long size, unsigned long addr2, + int direction, const char *filename, + const unsigned long line_num, int alloc_flags); + +/* Invoke on memory free */ +void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev, + unsigned long addr, unsigned long size, int direction, + const char *filename, const unsigned long line_num); + +/* + * This function recognizes allocations which + * may be released by kernel (e.g. skb & vnic) and + * therefore not trackable by memtrack. + * The allocations are recognized by the name + * of their calling function. + */ +int is_non_trackable_alloc_func(const char *func_name); +/* + * In some cases we need to free a memory + * we defined as "non trackable" (see + * is_non_trackable_alloc_func). + * This function recognizes such releases + * by the name of their calling function. + */ +int is_non_trackable_free_func(const char *func_name); + +/* WA - In this function handles confirm + the the function name is + '__ib_umem_release' or 'ib_umem_get' + In this case we won't track the + memory there because the kernel + was the one who allocated it. + Return value: + 1 - if the function name is match, else 0 */ +int is_umem_put_page(const char *func_name); + +/* Check page order size + When Freeing a page allocation it checks whether + we are trying to free the same amount of pages + we ask to allocate (In log2(order)). + In case an error if found it will print + an error msg */ +int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr, + unsigned long size, const char *filename, + const unsigned long line_num); + +/* Search for a specific addr whether it exist in the + current data-base. 
+ If not it will print an error msg, + Return value: 0 - if addr exist, else 1 */ +int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist, + const char *filename, const unsigned long line_num); + +/* Return current page reference counter */ +int memtrack_get_page_ref_count(unsigned long addr); + +/* Report current allocations status (for all memory types) */ +/* we do not export this function since it is used by cleanup_module only */ +/* void memtrack_report(void); */ + +/* Allow support of error injections */ +int memtrack_inject_error(struct module *module_obj, char *file_name, char *func_name, + const char *caller_func_name, unsigned long line_num); + +/* randomize allocated memory */ +int memtrack_randomize_mem(void); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/mtrack.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/mtrack.h new file mode 100644 index 0000000..2f3ff14 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/debug/mtrack.h @@ -0,0 +1,1032 @@ +#ifndef __mtrack_h_ +#define __mtrack_h_ + +#include "memtrack.h" + +#include +#include +#include +#include +#include /* For ioremap_nocache, ioremap, iounmap */ +#include +# include /* For ioremap_nocache, ioremap, iounmap */ +#include /* For all page handling */ +#include /* For all work-queue handling */ +#include /* For using scatterlists */ +#include /* For skbufs handling */ +#include /* For copy from/to user */ +#include +#include + +#define MEMTRACK_ERROR_INJECTION_MESSAGE(module, file, line, call_func, func) ({ \ + printk(KERN_ERR "%s::%s::%s failure injected at %s:%d\n", module->name, call_func, func, file, line);\ + dump_stack(); \ +}) + +#ifdef ZERO_OR_NULL_PTR +#define IS_VALID_ADDR(addr) (!ZERO_OR_NULL_PTR(addr)) +#else +#define IS_VALID_ADDR(addr) (addr) +#endif + +#ifdef CONFIG_ARM64 +#ifndef CONFIG_GENERIC_IOREMAP +#undef ioremap +static inline void *ioremap(phys_addr_t phys_addr, size_t size) +{ + return __ioremap(phys_addr, size, __pgprot(PROT_DEVICE_nGnRE)); +} +#endif /* CONFIG_GENERIC_IOREMAP */ + +#undef ioremap_nocache +static inline void *ioremap_nocache(phys_addr_t phys_addr, size_t size) +{ +#ifndef CONFIG_GENERIC_IOREMAP + return __ioremap(phys_addr, size, __pgprot(PROT_DEVICE_nGnRE)); +#else + return ioremap_prot(phys_addr, size, PROT_DEVICE_nGnRE); +#endif +} + +#undef ioremap_wc +static inline void *ioremap_wc(phys_addr_t phys_addr, size_t size) +{ +#ifndef CONFIG_GENERIC_IOREMAP + return __ioremap(phys_addr, size, __pgprot(PROT_NORMAL_NC)); +#else + return ioremap_prot(phys_addr, size, PROT_NORMAL_NC); +#endif +} + +/* ARCH_HAS_IOREMAP_WC was defined for arm64 until 2014-07-24 */ +#ifndef ARCH_HAS_IOREMAP_WC +#define ARCH_HAS_IOREMAP_WC 1 +#endif + +#ifdef iounmap +#undef iounmap +static inline void iounmap(void *addr) +{ + __iounmap(addr); +} +#endif /* iounmap */ +#endif /* CONFIG_ARM64 */ + +#define kzalloc(size, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kzalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kzalloc");\ + else \ + __memtrack_addr = kzalloc(size, flags); \ + if (IS_VALID_ADDR(__memtrack_addr) && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kzalloc_node(size, flags, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if 
(memtrack_inject_error(THIS_MODULE, __FILE__, "kzalloc_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kzalloc_node"); \ + else \ + __memtrack_addr = kzalloc_node(size, flags, node); \ + if (IS_VALID_ADDR(__memtrack_addr) && (size) > 0 && \ + !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kvzalloc(size, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kvzalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kvzalloc"); \ + else \ + __memtrack_addr = kvzalloc(size, flags); \ + if (IS_VALID_ADDR(__memtrack_addr) && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KVMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kvmalloc_array(n, size, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kvmalloc_array", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kvmalloc_array"); \ + else \ + __memtrack_addr = kvmalloc_array(n, size, flags); \ + if (IS_VALID_ADDR(__memtrack_addr) && \ + !is_non_trackable_alloc_func(__func__) && (n) * (size) > 0) { \ + memtrack_alloc(MEMTRACK_KVMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*size, 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kvcalloc(n, size, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kvcalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kvcalloc"); \ + else \ + __memtrack_addr = kvcalloc(n, size, flags); \ + if (IS_VALID_ADDR(__memtrack_addr) && \ + !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KVMALLOC, 0UL, (unsigned long)(__memtrack_addr),(n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kcalloc_node(n, size, flags, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kcalloc_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kcalloc_node"); \ + else \ + __memtrack_addr = kcalloc_node(n, size, flags, node); \ + if (IS_VALID_ADDR(__memtrack_addr) && (size) > 0 && \ + !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr),(n) * (size), 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kcalloc(n, size, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kcalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kcalloc");\ + else \ + __memtrack_addr = kcalloc(n, size, flags); \ + if (IS_VALID_ADDR(__memtrack_addr) && (n) * (size) > 0 && \ + !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kmalloc(sz, flgs) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kmalloc", __func__, __LINE__)) \ + 
MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kmalloc");\ + else \ + __memtrack_addr = kmalloc(sz, flgs); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + if (memtrack_randomize_mem()) \ + memset(__memtrack_addr, 0x5A, sz); \ + } \ + __memtrack_addr; \ +}) + +#define kmalloc_node(sz, flgs, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kmalloc_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kmalloc_node"); \ + else \ + __memtrack_addr = kmalloc_node(sz, flgs, node); \ + if (__memtrack_addr) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \ + memset(__memtrack_addr, 0x5A, sz); \ + } \ + __memtrack_addr; \ +}) + +#define krealloc(p, new_size, flags) ({ \ + void *__memtrack_addr = NULL; \ + void *__old_addr = (void *)p; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "krealloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "krealloc");\ + else { \ + if (IS_VALID_ADDR(__old_addr) && \ + !is_non_trackable_alloc_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__old_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __memtrack_addr = krealloc(p, new_size, flags); \ + } \ + if (IS_VALID_ADDR(__memtrack_addr) && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), new_size, 0UL, 0, __FILE__, __LINE__, flags);\ + } \ + __memtrack_addr; \ +}) + +#define kvmalloc(sz, flgs) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kvmalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kvmalloc"); \ + else \ + __memtrack_addr = kvmalloc(sz, flgs); \ + if (IS_VALID_ADDR(__memtrack_addr) && !is_non_trackable_alloc_func(__func__)) {\ + memtrack_alloc(MEMTRACK_KVMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \ + memset(__memtrack_addr, 0x5A, sz); \ + } \ + __memtrack_addr; \ +}) + +#define kvmalloc_node(sz, flgs, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kvmalloc_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kvmalloc_node"); \ + else \ + __memtrack_addr = kvmalloc_node(sz, flgs, node); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KVMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \ + memset(__memtrack_addr, 0x5A, sz); \ + } \ + __memtrack_addr; \ +}) + +#define kvzalloc_node(sz, flgs, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kvzalloc_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kvzalloc_node"); \ + else \ + __memtrack_addr = kvzalloc_node(sz, flgs, node); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KVMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + } \ + 
__memtrack_addr; \ +}) + +#define kmalloc_array(n, size, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kmalloc_array", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kmalloc_array"); \ + else \ + __memtrack_addr = kmalloc_array(n, size, flags); \ + if (IS_VALID_ADDR(__memtrack_addr) && (n) * (size) > 0) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kmemdup(src, sz, flgs) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kmemdup", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kmemdup");\ + else \ + __memtrack_addr = kmemdup(src, sz, flgs); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + } \ + __memtrack_addr; \ +}) + +#ifndef kstrdup +#define kstrdup(src, flgs) ({ \ + void *__memtrack_addr = NULL; \ + size_t sz = strlen(src) + 1; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kstrdup", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kstrdup");\ + else \ + __memtrack_addr = kstrdup(src, flgs); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \ + } \ + __memtrack_addr; \ +}) +#endif + +#define kfree(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (IS_VALID_ADDR(__memtrack_addr) && \ + !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + kfree(__memtrack_addr); \ +}) + +#ifdef CONFIG_COMPAT_RCU +#ifdef kfree_rcu + #undef kfree_rcu +#endif +#if !defined(__kvfree_rcu) && !defined(__kfree_rcu) +/* + * Removed __kvfree_rcu macro upstream v5.12 + * commit 5ea5d1ed572c ("rcu: Eliminate the __kvfree_rcu() macro") + */ +#define __kvfree_rcu(head, offset) \ + do { \ + BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ + kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ + } while (0) +#endif /* !defined(__kvfree_rcu) && !defined(kfree_rcu) */ + +#ifdef __kvfree_rcu +#define kfree_rcu_2(addr, rcu_head) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (IS_VALID_ADDR(__memtrack_addr) && \ + !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __kvfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ +}) + +#define __kvfree_rcu_1(ptr) \ +do { \ + typeof(ptr) ___p = (ptr); \ + \ + if (___p) \ + kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ +} while (0) +#else +#define kfree_rcu_2(addr, rcu_head) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (IS_VALID_ADDR(__memtrack_addr) && \ + !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \ +}) + +#define __kvfree_rcu_1(ptr) \ +do { \ + typeof(ptr) ___p = (ptr); \ + \ + if (___p) \ + kfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ +} while (0) +#endif /* __kvfree_rcu */ + +/* commit 1835f475e351 ("rcu: Introduce single 
argument kvfree_rcu() interface") */ +#undef kvfree_rcu_arg_1 +#undef kvfree_rcu_arg_2 + +#define kvfree_rcu_arg_1(ptr) ({ \ + void *__memtrack_addr = (void *)ptr; \ + \ + if (IS_VALID_ADDR(__memtrack_addr) && \ + !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __kvfree_rcu_1(ptr); \ +}) + +#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf) +#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ + kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) + +#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME +#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu_2(ptr, rhf) +#endif /* CONFIG_COMPAT_RCU */ + +#define vmalloc(size) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "vmalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "vmalloc");\ + else \ + __memtrack_addr = vmalloc(size); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + if (memtrack_randomize_mem()) \ + memset(__memtrack_addr, 0x5A, size); \ + } \ + __memtrack_addr; \ +}) + +#ifndef vzalloc +#define vzalloc(size) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "vzalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "vzalloc");\ + else \ + __memtrack_addr = vzalloc(size); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) +#endif + +#ifndef vzalloc_node +#define vzalloc_node(size, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "vzalloc_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "vzalloc_node"); \ + else \ + __memtrack_addr = vzalloc_node(size, node); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) +#endif + +#ifndef __vmalloc +#ifdef HAVE_VMALLOC_3_PARAM +#define __vmalloc(size, mask, prot) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "__vmalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "__vmalloc"); \ + else \ + __memtrack_addr = __vmalloc(size, mask, prot); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + if (memtrack_randomize_mem()) \ + memset(__memtrack_addr, 0x5A, size); \ + } \ + __memtrack_addr; \ +}) +#else +#define __vmalloc(size, mask) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "__vmalloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "__vmalloc"); \ + else \ + __memtrack_addr = __vmalloc(size, mask); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + if (memtrack_randomize_mem()) \ + memset(__memtrack_addr, 0x5A, size); \ + } \ + __memtrack_addr; \ +}) 
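+
+/*
+ * Illustrative sketch (hypothetical caller, not code from this tree) of how
+ * the allocator wrappers in this header behave once a module includes
+ * mtrack.h: call sites stay ordinary kernel code, while the wrapper macros
+ * add error injection, registration/removal via memtrack_alloc() and
+ * memtrack_free(), and, for some wrappers, 0x5A poisoning when the
+ * random_mem module parameter is enabled.
+ *
+ *	void *buf = vmalloc(PAGE_SIZE);
+ *	if (!buf)
+ *		return -ENOMEM;	(the failure may also have been injected)
+ *	...
+ *	vfree(buf);		(entry is removed from the tracking database
+ *				 before the real vfree() runs)
+ */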
+#endif +#endif + +#define vmalloc_node(size, node) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "vmalloc_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "vmalloc_node"); \ + else \ + __memtrack_addr = vmalloc_node(size, node); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + if (memtrack_randomize_mem()) \ + memset(__memtrack_addr, 0x5A, size); \ + } \ + __memtrack_addr; \ +}) + +#define vfree(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + if (IS_VALID_ADDR(__memtrack_addr) && !is_non_trackable_free_func(__func__)) { \ + memtrack_free(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + vfree(__memtrack_addr); \ +}) + +#ifndef kvfree +#define kvfree(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + if (IS_VALID_ADDR(__memtrack_addr) && !is_non_trackable_free_func(__func__)) { \ + if (is_vmalloc_addr(__memtrack_addr)) { \ + memtrack_free(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } else { \ + memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + } \ + kvfree(__memtrack_addr); \ +}) +#endif + +#ifndef memdup_user +#define memdup_user(user_addr, size) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "memdup_user", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "memdup_user"); \ + else \ + __memtrack_addr = memdup_user(user_addr, size); \ + \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_KERNEL); \ + } \ + __memtrack_addr; \ +}) +#endif + +#ifndef memdup_user_nul +#define memdup_user_nul(user_addr, size) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "memdup_user_nul", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "memdup_user_nul"); \ + else \ + __memtrack_addr = memdup_user_nul(user_addr, size); \ + \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_KERNEL); \ + } \ + __memtrack_addr; \ +}) +#endif + +#define kmem_cache_alloc(cache, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kmem_cache_alloc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kmem_cache_alloc"); \ + else \ + __memtrack_addr = kmem_cache_alloc(cache, flags); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 1, 0UL, 0, __FILE__, __LINE__, flags); \ + } \ + __memtrack_addr; \ +}) + +#define kmem_cache_zalloc(cache, flags) ({ \ + void *__memtrack_addr = NULL; \ + \ + __memtrack_addr = kmem_cache_alloc(cache, flags | __GFP_ZERO); \ + __memtrack_addr; \ +}) + +#define kmem_cache_free(cache, addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_free(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + kmem_cache_free(cache, __memtrack_addr); \ +}) + +#ifndef kasprintf 
+#define kasprintf(gfp, fmt, ...) ({ \ + void *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "kasprintf", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "kasprintf"); \ + else \ + __memtrack_addr = kasprintf(gfp, fmt, __VA_ARGS__); \ + if (IS_VALID_ADDR(__memtrack_addr) && strncmp((char *)__memtrack_addr, "infiniband", 10)) { \ + memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), strlen((char *)__memtrack_addr), 0UL, 0, __FILE__, __LINE__, gfp); \ + } \ + __memtrack_addr; \ +}) +#endif + +/* All IO-MAP handling */ +#ifdef ioremap + #undef ioremap +#endif +#define ioremap(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "ioremap", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "ioremap");\ + else \ + __memtrack_addr = ioremap(phys_addr, size); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) + +#ifdef ioremap_wc + #undef ioremap_wc +#endif +#ifdef ARCH_HAS_IOREMAP_WC +#define ioremap_wc(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "ioremap_wc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "ioremap_wc");\ + else \ + __memtrack_addr = ioremap_wc(phys_addr, size); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) +#else +#define ioremap_wc(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "ioremap_wc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "ioremap_wc");\ + else \ + __memtrack_addr = ioremap_nocache(phys_addr, size); \ + __memtrack_addr; \ +}) +#endif + +#define io_mapping_create_wc(base, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "io_mapping_create_wc", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "io_mapping_create_wc"); \ + else \ + __memtrack_addr = io_mapping_create_wc(base, size); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) + +#define io_mapping_free(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + io_mapping_free(__memtrack_addr); \ +}) + +#ifdef ioremap_nocache + #undef ioremap_nocache +#endif +#ifdef CONFIG_PPC +#define ioremap_nocache(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "ioremap_nocache", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "ioremap_nocache"); \ + else \ + __memtrack_addr = ioremap(phys_addr, size); \ + __memtrack_addr; \ +}) +#else +#define ioremap_nocache(phys_addr, size) ({ \ + void __iomem *__memtrack_addr = NULL; \ + \ + if 
(memtrack_inject_error(THIS_MODULE, __FILE__, "ioremap_nocache", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "ioremap_nocache"); \ + else \ + __memtrack_addr = ioremap_nocache(phys_addr, size); \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_addr; \ +}) +#endif /* PPC */ + +#ifdef iounmap + #undef iounmap +#endif +#define iounmap(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (IS_VALID_ADDR(__memtrack_addr)) { \ + memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + iounmap(__memtrack_addr); \ +}) + + +/* All Page handlers */ +/* TODO: Catch netif_rx for page dereference */ +#define alloc_pages_node(nid, gfp_mask, order) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "alloc_pages_node", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "alloc_pages_node"); \ + else \ + page_addr = (struct page *)alloc_pages_node(nid, gfp_mask, order); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + page_addr; \ +}) + +#define dev_alloc_pages(order) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "dev_alloc_pages", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "dev_alloc_pages"); \ + else \ + page_addr = (struct page *)dev_alloc_pages(order); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + page_addr; \ +}) + +#ifdef HAVE_SPLIT_PAGE_EXPORTED +#define split_page(pg, order) ({ \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "split_page", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "split_page"); \ + else { \ + int i; \ + split_page(pg, order); \ + for (i = 1; i < (1 << order); i++) { \ + struct page *page_addr = &pg[i]; \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + } \ + } \ +}) +#endif + +#ifdef CONFIG_NUMA +#define alloc_pages(gfp_mask, order) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "alloc_pages", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "alloc_pages"); \ + else \ + page_addr = (struct page *)alloc_pages(gfp_mask, order); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + page_addr; \ +}) +#else +#ifdef alloc_pages + #undef alloc_pages +#endif +#define alloc_pages(gfp_mask, order) ({ \ + struct page *page_addr; \ + \ + page_addr = (struct page *)alloc_pages_node(numa_node_id(), gfp_mask, order); \ + page_addr; \ +}) +#endif + +#define __get_free_pages(gfp_mask, order) ({ \ + struct page *page_addr = NULL; \ + \ + if 
(memtrack_inject_error(THIS_MODULE, __FILE__, "__get_free_pages", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "__get_free_pages"); \ + else \ + page_addr = (struct page *)__get_free_pages(gfp_mask, order); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + (unsigned long)page_addr; \ +}) + +#define get_zeroed_page(gfp_mask) ({ \ + struct page *page_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "get_zeroed_page", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "get_zeroed_page"); \ + else \ + page_addr = (struct page *)get_zeroed_page(gfp_mask); \ + if (page_addr && !is_non_trackable_alloc_func(__func__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + (unsigned long)page_addr; \ +}) + +#define __free_pages(addr, order) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ + memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + __free_pages(addr, order); \ +}) + + +#define free_pages(addr, order) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \ + memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + free_pages(addr, order); \ +}) + + +#define get_page(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + if (memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 0, __FILE__, __LINE__)) { \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + } \ + get_page(addr); \ +}) + +#define get_user_pages_fast(start, nr_pages, write, pages) ({ \ + int __memtrack_rc = -1; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "get_user_pages_fast", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "get_user_pages_fast"); \ + else \ + __memtrack_rc = get_user_pages_fast(start, nr_pages, write, pages); \ + if (__memtrack_rc > 0 && !is_non_trackable_alloc_func(__func__)) { \ + int __memtrack_i; \ + \ + for (__memtrack_i = 0; __memtrack_i < __memtrack_rc; __memtrack_i++) \ + memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(pages[__memtrack_i]), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + __memtrack_rc; \ +}) + +#define put_page(addr) ({ \ + void *__memtrack_addr = (void *)addr; \ + \ + if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \ + /* Check whether this is not part of umem put page & not */\ + /* a new addr and the ref-count is 1 then we'll free this addr */\ + /* Don't change the order these conditions */ \ + if (!is_umem_put_page(__func__) && \ + !memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__) && \ + 
(memtrack_get_page_ref_count((unsigned long)(__memtrack_addr)) == 1)) { \ + memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + } \ + put_page(addr); \ +}) + + +/* Work-Queue handlers */ +#ifdef create_workqueue + #undef create_workqueue +#endif +#ifdef create_rt_workqueue + #undef create_rt_workqueue +#endif +#ifdef create_freezeable_workqueue + #undef create_freezeable_workqueue +#endif +#ifdef create_singlethread_workqueue + #undef create_singlethread_workqueue +#endif + +#if defined(alloc_ordered_workqueue) + #undef alloc_ordered_workqueue +#endif + +#ifdef alloc_workqueue +/* In kernels < 5.1, alloc_workqueue was a macro */ +#undef alloc_workqueue +#ifdef CONFIG_LOCKDEP +#define alloc_workqueue(name, flags, max_active, args...) \ +({ \ + static struct lock_class_key __key; \ + const char *__lock_name; \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (__builtin_constant_p(name)) \ + __lock_name = (name); \ + else \ + __lock_name = #name; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "alloc_workqueue", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "alloc_workqueue"); \ + else \ + wq_addr = __alloc_workqueue_key((name), (flags), (max_active), \ + &__key, __lock_name, ##args); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) +#else +#define alloc_workqueue(name, flags, max_active, args...) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "alloc_workqueue", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "alloc_workqueue"); \ + else \ + wq_addr = __alloc_workqueue_key((name), (flags), (max_active), \ + NULL, NULL, ##args); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) +#endif +#else +/* In kernels >= 5.1, alloc_workqueue is a function */ +#define alloc_workqueue(name, flags, max_active, args...) ({ \ + struct workqueue_struct *wq_addr = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "alloc_workqueue", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "alloc_workqueue"); \ + else \ + wq_addr = alloc_workqueue(name, flags, max_active, ##args); \ + if (wq_addr) { \ + memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \ + } \ + wq_addr; \ +}) +#endif + +#define WQ_RESCUER 1 << 7 /* internal: workqueue has rescuer */ + +#define create_workqueue(name) \ + alloc_workqueue((name), WQ_RESCUER, 1); + +#define create_freezeable_workqueue(name) \ + alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1); + +#define create_singlethread_workqueue(name) \ + alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1); + +#define alloc_ordered_workqueue(name, flags, args...) 
\ + alloc_workqueue((name), WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) + +#define destroy_workqueue(wq_addr) ({ \ + void *__memtrack_addr = (void *)wq_addr; \ + \ + if (__memtrack_addr) { \ + memtrack_free(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \ + } \ + destroy_workqueue(wq_addr); \ +}) + +/* ONLY error injection to functions that we don't monitor */ +#define alloc_skb(size, prio) ({ \ + struct sk_buff *__memtrack_skb = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "alloc_skb", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "alloc_skb"); \ + else \ + __memtrack_skb = alloc_skb(size, prio); \ + __memtrack_skb; \ +}) + +#define dev_alloc_skb(size) ({ \ + struct sk_buff *__memtrack_skb = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "dev_alloc_skb", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "dev_alloc_skb"); \ + else \ + __memtrack_skb = dev_alloc_skb(size); \ + __memtrack_skb; \ +}) + +#define alloc_skb_fclone(size, prio) ({ \ + struct sk_buff *__memtrack_skb = NULL; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "alloc_skb_fclone", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "alloc_skb_fclone"); \ + else \ + __memtrack_skb = alloc_skb_fclone(size, prio); \ + __memtrack_skb; \ +}) + +#define copy_from_user(to, from, n) ({ \ + int ret = n; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "copy_from_user", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "copy_from_user"); \ + else \ + ret = copy_from_user(to, from, n); \ + ret; \ +}) + +#define copy_to_user(to, from, n) ({ \ + int ret = n; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "copy_to_user", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "copy_to_user"); \ + else \ + ret = copy_to_user(to, from, n); \ + ret; \ +}) + +#define sysfs_create_file(kobj, attr) ({ \ + int ret = -ENOSYS; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "sysfs_create_file", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "sysfs_create_file"); \ + else \ + ret = sysfs_create_file(kobj, attr); \ + ret; \ +}) + +#define sysfs_create_link(kobj, target, name) ({ \ + int ret = -ENOSYS; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "sysfs_create_link", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "sysfs_create_link"); \ + else \ + ret = sysfs_create_link(kobj, target, name); \ + ret; \ +}) + +#define sysfs_create_group(kobj, grp) ({ \ + int ret = -ENOSYS; \ + \ + if (memtrack_inject_error(THIS_MODULE, __FILE__, "sysfs_create_group", __func__, __LINE__)) \ + MEMTRACK_ERROR_INJECTION_MESSAGE(THIS_MODULE, __FILE__, __LINE__, __func__, "sysfs_create_group"); \ + else \ + ret = sysfs_create_group(kobj, grp); \ + ret; \ +}) + +#endif /* __mtrack_h_ */ + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/Makefile new file mode 100644 index 0000000..023476c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/ +obj-$(CONFIG_INFINIBAND_QIB) += qib/ +obj-$(CONFIG_INFINIBAND_CXGB3) += 
cxgb3/ +obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ +obj-m += efa/ # For SLES12 SP5 errata kernels. +obj-$(CONFIG_INFINIBAND_IRDMA) += irdma/ +obj-$(CONFIG_INFINIBAND_I40IW) += i40iw/ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ +obj-$(CONFIG_INFINIBAND_NES) += nes/ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ +obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ +obj-$(CONFIG_INFINIBAND_HNS) += hns/ +obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ +obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/Makefile new file mode 100644 index 0000000..d83b9f5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o + +iw_c2-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/main.c new file mode 100644 index 0000000..fc97942 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/amso1100/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "iw_c2" +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Vladimir Sokolovsky"); +MODULE_DESCRIPTION("iw_c2 dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init iw_c2_init(void) +{ + return 0; +} + +static void __exit iw_c2_cleanup(void) +{ +} + +module_init(iw_c2_init); +module_exit(iw_c2_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/Makefile new file mode 100644 index 0000000..79d63fc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o + +bnxt_re-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/main.c new file mode 100644 index 0000000..36da31e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/bnxt_re/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "bnxt_re" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "March 06, 2017" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("bnxt_re dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init bnxt_re_init(void) +{ + return 0; +} + +static void __exit bnxt_re_cleanup(void) +{ +} + +module_init(bnxt_re_init); +module_exit(bnxt_re_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/Makefile new file mode 100644 index 0000000..0e3cd4f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o + +iw_cxgb3-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/main.c new file mode 100644 index 0000000..c5d51ae --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb3/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "iw_cxgb3" +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("iw_cxgb3 dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init iw_cxgb3_init(void) +{ + return 0; +} + +static void __exit iw_cxgb3_cleanup(void) +{ +} + +module_init(iw_cxgb3_init); +module_exit(iw_cxgb3_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/Makefile new file mode 100644 index 0000000..64b3c40 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o + +iw_cxgb4-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/main.c new file mode 100644 index 0000000..9fc3839 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/cxgb4/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "iw_cxgb4" +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("iw_cxgb4 dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init iw_cxgb4_init(void) +{ + return 0; +} + +static void __exit iw_cxgb4_cleanup(void) +{ +} + +module_init(iw_cxgb4_init); +module_exit(iw_cxgb4_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/Makefile new file mode 100644 index 0000000..c692717 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/Makefile @@ -0,0 +1,3 @@ +obj-m += efa.o + +efa-y := efa_main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/efa_main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/efa_main.c new file mode 100644 index 0000000..822bc26 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/efa/efa_main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "efa" +#define DRV_VERSION "1.1" +#define DRV_RELDATE "Jun 11, 2019" + +MODULE_AUTHOR("Talat Batheesh"); +MODULE_DESCRIPTION("efa dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init efa_init(void) +{ + return 0; +} + +static void __exit efa_cleanup(void) +{ +} + +module_init(efa_init); +module_exit(efa_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/Makefile new file mode 100644 index 0000000..8c72d47 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/ib_ehca.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/ib_ehca.c new file mode 100644 index 0000000..00640e6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ehca/ib_ehca.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ib_ehca" +#define DRV_VERSION "1.1" +#define DRV_RELDATE "April 03, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_ehca dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_ehca_init(void) +{ + return 0; +} + +static void __exit ib_ehca_cleanup(void) +{ +} + +module_init(ib_ehca_init); +module_exit(ib_ehca_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/Makefile new file mode 100644 index 0000000..75e38f1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o + +hfi1-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/main.c new file mode 100644 index 0000000..27e7079 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hfi1/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "hfi1" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("hfi1 dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init hfi1_init(void) +{ + return 0; +} + +static void __exit hfi1_cleanup(void) +{ +} + +module_init(hfi1_init); +module_exit(hfi1_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/Makefile new file mode 100644 index 0000000..1f27456 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/Makefile @@ -0,0 +1,11 @@ +ifneq ($(filter m,$(CONFIG_INFINIBAND_HNS) $(CONFIG_INFINIBAND_HISILICON_HNS)),) +obj-m += hns-roce.o +endif + +hns-roce-y := main.o + +obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o +hns-roce-hw-v1-objs := hns_roce_hw_v1.o +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o +hns-roce-hw-v2-objs := hns_roce_hw_v2.o + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v1.c new file mode 100644 index 0000000..97c225e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "hns_roce_hw_v1" +#define DRV_VERSION "4.6.0" +#define DRV_RELDATE "April 10, 2019" + +MODULE_AUTHOR("Talat Batheesh"); +MODULE_DESCRIPTION("hns_roce_hw_v1 dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init hns_roce_hw_v1_init(void) +{ + return 0; +} + +static void __exit hns_roce_hw_v1_exit(void) +{ +} + +module_init(hns_roce_hw_v1_init); +module_exit(hns_roce_hw_v1_exit); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v2.c new file mode 100644 index 0000000..c616318 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "hns_roce_hw_v2" +#define DRV_VERSION "4.6.0" +#define DRV_RELDATE "April 10, 2019" + +MODULE_AUTHOR("Talat Batheesh"); +MODULE_DESCRIPTION("hns_roce_hw_v2 dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init hns_roce_hw_v2_init(void) +{ + return 0; +} + +static void __exit hns_roce_hw_v2_exit(void) +{ +} + +module_init(hns_roce_hw_v2_init); +module_exit(hns_roce_hw_v2_exit); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/main.c new file mode 100644 index 0000000..504156f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/hns/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "hns-roce" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 06, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("hns-roce dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init hns_roce_init(void) +{ + return 0; +} + +static void __exit hns_roce_cleanup(void) +{ +} + +module_init(hns_roce_init); +module_exit(hns_roce_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/Makefile new file mode 100644 index 0000000..d390d6e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_I40IW) += i40iw.o + +i40iw-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/main.c new file mode 100644 index 0000000..6b5937b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/i40iw/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "i40iw" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("i40iw dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init i40iw_init(void) +{ + return 0; +} + +static void __exit i40iw_cleanup(void) +{ +} + +module_init(i40iw_init); +module_exit(i40iw_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/Makefile new file mode 100644 index 0000000..b3ab92b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/Makefile @@ -0,0 +1,4 @@ +ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \ + -DIPATH_KERN_TYPE=0 + +obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/ib_ipath.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/ib_ipath.c new file mode 100644 index 0000000..5bb8244 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ipath/ib_ipath.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ib_ipath" +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_ipath dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_ipath_init(void) +{ + return 0; +} + +static void __exit ib_ipath_cleanup(void) +{ +} + +module_init(ib_ipath_init); +module_exit(ib_ipath_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/Makefile new file mode 100644 index 0000000..1ae77c4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_IRDMA) += irdma.o + +irdma-objs := irdma_dummy.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/irdma_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/irdma_dummy.c new file mode 100644 index 0000000..c0cbba0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/irdma/irdma_dummy.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "irdma" +#define DRV_VERSION "5.7" +#define DRV_RELDATE "Feb 13, 2022" + +MODULE_AUTHOR("Roy Novich"); +MODULE_DESCRIPTION("irdma dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init irdma_init(void) +{ + return 0; +} + +static void __exit irdma_cleanup(void) +{ +} + +module_init(irdma_init); +module_exit(irdma_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/Makefile new file mode 100644 index 0000000..147aabe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o + +mlx4_ib-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/main.c new file mode 100644 index 0000000..45581bd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx4/main.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME mlx4_ib +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Tzafrir Cohen"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +#define dummy_module_code(name) \ +MODULE_DESCRIPTION(#name " dummy kernel module"); \ +\ +static int __init name##_init(void) \ +{ \ + return 0; \ +} \ +\ +static void __exit name##_cleanup(void) \ +{ \ +} \ +\ +module_init(name##_init); \ +module_exit(name##_cleanup); \ + +dummy_module_code(DRV_NAME); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Kconfig new file mode 100644 index 0000000..ef1ff42 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Kconfig @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +config MLX5_INFINIBAND + tristate "Mellanox 5th generation network adapters (ConnectX series) support" + depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE + help + This driver provides low-level InfiniBand support for + Mellanox Connect-IB PCI Express host channel adapters (HCAs). + This is required to use InfiniBand protocols such as + IP-over-IB or SRP with these devices. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Makefile new file mode 100644 index 0000000..e514706 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/Makefile @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o + +mlx5_ib-y := ah.o \ + cmd.o \ + cong.o \ + counters.o \ + cq.o \ + dm.o \ + doorbell.o \ + gsi.o \ + ib_virt.o \ + mad.o \ + main.o \ + main_ext.o \ + mem.o \ + mr.o \ + nvmf.o \ + qp.o \ + qpc.o \ + qp_nvmf.o \ + restrack.o \ + srq.o \ + srq_cmd.o \ + srq_nvmf.o \ + wr.o + +mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o +mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o +mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \ + fs.o \ + qos.o \ + std_types.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ah.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ah.c new file mode 100644 index 0000000..671004c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ah.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "mlx5_ib.h" + +static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev, + const struct rdma_ah_attr *ah_attr) +{ + enum ib_gid_type gid_type = ah_attr->grh.sgid_attr->gid_type; + __be16 sport; + + if ((gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) && + (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) && + (ah_attr->grh.flow_label & IB_GRH_FLOWLABEL_MASK)) + sport = cpu_to_be16( + rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label)); + else + sport = mlx5_get_roce_udp_sport_min(dev, + ah_attr->grh.sgid_attr); + + return sport; +} + +static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, + struct rdma_ah_init_attr *init_attr) +{ + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; + enum ib_gid_type gid_type; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + memcpy(ah->av.rgid, &grh->dgid, 16); + ah->av.grh_gid_fl = cpu_to_be32(grh->flow_label | + (1 << 30) | + grh->sgid_index << 20); + ah->av.hop_limit = grh->hop_limit; + ah->av.tclass = grh->traffic_class; + } + + ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + u8 tmp_hop_limit = 0; + if (init_attr->xmit_slave) + ah->xmit_port = + mlx5_lag_get_slave_port(dev->mdev, + init_attr->xmit_slave); + gid_type = ah_attr->grh.sgid_attr->gid_type; + + if ((gid_type != IB_GID_TYPE_IB) && + (ah_attr->grh.hop_limit < 2)) + tmp_hop_limit = IPV6_DEFAULT_HOPLIMIT; + else + tmp_hop_limit = ah_attr->grh.hop_limit; + + ah->av.hop_limit = (dev->ttld[ah_attr->port_num - 1].val) ? 
+ dev->ttld[ah_attr->port_num - 1].val : tmp_hop_limit; + + memcpy(ah->av.rmac, ah_attr->roce.dmac, + sizeof(ah_attr->roce.dmac)); + ah->av.udp_sport = mlx5_ah_get_udp_sport(dev, ah_attr); + ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0x7) << 1; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) +#define MLX5_ECN_ENABLED BIT(1) + ah->av.tclass |= MLX5_ECN_ENABLED; + } else { + ah->av.rlid = cpu_to_be16(rdma_ah_get_dlid(ah_attr)); + ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; + ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf); + } +} + +int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) + +{ + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; + struct mlx5_ib_ah *ah = to_mah(ibah); + struct mlx5_ib_dev *dev = to_mdev(ibah->device); + enum rdma_ah_attr_type ah_type = ah_attr->type; + + if ((ah_type == RDMA_AH_ATTR_TYPE_ROCE) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + + if (ah_type == RDMA_AH_ATTR_TYPE_ROCE && udata) { + int err; + struct mlx5_ib_create_ah_resp resp = {}; + u32 min_resp_len = + offsetofend(struct mlx5_ib_create_ah_resp, dmac); + + if (udata->outlen < min_resp_len) + return -EINVAL; + + resp.response_length = min_resp_len; + + memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + return err; + } + + create_ib_ah(dev, ah, init_attr); + return 0; +} + +int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct mlx5_ib_ah *ah = to_mah(ibah); + u32 tmp; + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = ibah->type; + + tmp = be32_to_cpu(ah->av.grh_gid_fl); + if (tmp & (1 << 30)) { + rdma_ah_set_grh(ah_attr, NULL, + tmp & 0xfffff, + (tmp >> 20) & 0xff, + ah->av.hop_limit, + ah->av.tclass); + rdma_ah_set_dgid_raw(ah_attr, ah->av.rgid); + } + rdma_ah_set_dlid(ah_attr, be16_to_cpu(ah->av.rlid)); + rdma_ah_set_static_rate(ah_attr, ah->av.stat_rate_sl >> 4); + rdma_ah_set_sl(ah_attr, ah->av.stat_rate_sl & 0xf); + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.c new file mode 100644 index 0000000..ff3742b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2017-2020, Mellanox Technologies inc. All rights reserved. 
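In mlx5_ah_get_udp_sport() above, a RoCE v2 address handle takes its UDP source port from the GRH flow label whenever one is set, so that all traffic for a given flow hashes onto the same ECMP path; without a flow label it falls back to the minimal port for the sGID entry. The folding itself is done by rdma_flow_label_to_udp_sport(); the helper below is only a hedged illustration of that kind of mapping (the demo_* name and the exact bit layout are assumptions, not copied from the RDMA core):

#include <linux/types.h>

/* RoCE v2 keeps these source ports in the ephemeral range 0xC000-0xFFFF. */
#define DEMO_ROCE_UDP_SPORT_MIN 0xC000

/* Fold a 20-bit flow label into the reserved UDP source-port range. */
static inline u16 demo_flow_label_to_udp_sport(u32 fl)
{
	u32 low  = fl & 0x03fff;		/* lower 14 bits */
	u32 high = (fl & 0xfc000) >> 14;	/* upper 6 bits */

	return (u16)((low ^ high) | DEMO_ROCE_UDP_SPORT_MIN);
}

Because the mapping is a pure function of the flow label, two address handles created with the same label resolve to the same source port, which is exactly the property the sport selection in create_ib_ah() relies on.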
+ */ + +#include "cmd.h" + +int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); + if (!err) + *mkey = MLX5_GET(query_special_contexts_out, out, + dump_fill_mkey); + return err; +} + +int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); + if (!err) + *null_mkey = MLX5_GET(query_special_contexts_out, out, + null_mkey); + return err; +} + +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = {}; + + MLX5_SET(query_cong_params_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_PARAMS); + MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point); + + return mlx5_cmd_exec_inout(dev, query_cong_params, in, out); +} + +void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; + + MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, in, tirn, tirn); + MLX5_SET(destroy_tir_in, in, uid, uid); + mlx5_cmd_exec_in(dev, destroy_tir, in); +} + +void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {}; + + MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); + MLX5_SET(destroy_tis_in, in, tisn, tisn); + MLX5_SET(destroy_tis_in, in, uid, uid); + mlx5_cmd_exec_in(dev, destroy_tis, in); +} + +int mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; + + MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); + MLX5_SET(destroy_rqt_in, in, uid, uid); + return mlx5_cmd_exec_in(dev, destroy_rqt, in); +} + +int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, + u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {}; + int err; + + MLX5_SET(alloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); + MLX5_SET(alloc_transport_domain_in, in, uid, uid); + + err = mlx5_cmd_exec_inout(dev, alloc_transport_domain, in, out); + if (!err) + *tdn = MLX5_GET(alloc_transport_domain_out, out, + transport_domain); + + return err; +} + +void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn, + u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {}; + + MLX5_SET(dealloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + MLX5_SET(dealloc_transport_domain_in, in, uid, uid); + MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); + mlx5_cmd_exec_in(dev, dealloc_transport_domain, in); +} + +int mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; + + MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); + MLX5_SET(dealloc_pd_in, in, pd, pdn); + MLX5_SET(dealloc_pd_in, in, uid, uid); + return mlx5_cmd_exec_in(dev, dealloc_pd, in); +} + +int 
mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {}; + void *gid; + + MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); + MLX5_SET(attach_to_mcg_in, in, qpn, qpn); + MLX5_SET(attach_to_mcg_in, in, uid, uid); + gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec_in(dev, attach_to_mcg, in); +} + +int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {}; + void *gid; + + MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + MLX5_SET(detach_from_mcg_in, in, qpn, qpn); + MLX5_SET(detach_from_mcg_in, in, uid, uid); + gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec_in(dev, detach_from_mcg, in); +} + +int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; + int err; + + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + MLX5_SET(alloc_xrcd_in, in, uid, uid); + err = mlx5_cmd_exec_inout(dev, alloc_xrcd, in, out); + if (!err) + *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); + return err; +} + +int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; + + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + MLX5_SET(dealloc_xrcd_in, in, uid, uid); + return mlx5_cmd_exec_in(dev, dealloc_xrcd, in); +} + +int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, + u16 opmod, u8 port) +{ + int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out); + int inlen = MLX5_ST_SZ_BYTES(mad_ifc_in); + int err = -ENOMEM; + void *data; + void *resp; + u32 *out; + u32 *in; + + in = kzalloc(inlen, GFP_KERNEL); + out = kzalloc(outlen, GFP_KERNEL); + if (!in || !out) + goto out; + + MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC); + MLX5_SET(mad_ifc_in, in, op_mod, opmod); + MLX5_SET(mad_ifc_in, in, port, port); + + data = MLX5_ADDR_OF(mad_ifc_in, in, mad); + memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); + + err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out); + if (err) + goto out; + + resp = MLX5_ADDR_OF(mad_ifc_out, out, response_mad_packet); + memcpy(outb, resp, + MLX5_FLD_SZ_BYTES(mad_ifc_out, response_mad_packet)); + +out: + kfree(out); + kfree(in); + return err; +} + +int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_uar_in)] = {}; + int err; + + MLX5_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR); + MLX5_SET(alloc_uar_in, in, uid, uid); + err = mlx5_cmd_exec_inout(dev, alloc_uar, in, out); + if (err) + return err; + + *uarn = MLX5_GET(alloc_uar_out, out, uar); + return 0; +} + +int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_uar_in)] = {}; + + MLX5_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR); + MLX5_SET(dealloc_uar_in, in, uar, uarn); + MLX5_SET(dealloc_uar_in, in, uid, uid); + return mlx5_cmd_exec_in(dev, dealloc_uar, in); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.h new file mode 100644 index 0000000..ee46638 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cmd.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_CMD_H +#define MLX5_IB_CMD_H + +#include "mlx5_ib.h" +#include +#include + +int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey); +int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out); +int mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); +void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid); +void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid); +int mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid); +int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, + u16 uid); +void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn, + u16 uid); +int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid); +int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid); +int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid); +int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid); +int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, + u16 opmod, u8 port); +int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); +int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); +#endif /* MLX5_IB_CMD_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cong.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cong.c new file mode 100644 index 0000000..88e7c58 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cong.c @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx5_ib.h" +#include "cmd.h" + +enum mlx5_ib_cong_node_type { + MLX5_IB_RROCE_ECN_RP = 1, + MLX5_IB_RROCE_ECN_NP = 2, +}; + +static const char * const mlx5_ib_dbg_cc_name[] = { + "rp_clamp_tgt_rate", + "rp_clamp_tgt_rate_ati", + "rp_time_reset", + "rp_byte_reset", + "rp_threshold", + "rp_ai_rate", + "rp_max_rate", + "rp_hai_rate", + "rp_min_dec_fac", + "rp_min_rate", + "rp_rate_to_set_on_first_cnp", + "rp_dce_tcp_g", + "rp_dce_tcp_rtt", + "rp_rate_reduce_monitor_period", + "rp_initial_alpha_value", + "rp_gd", + "np_min_time_between_cnps", + "np_cnp_dscp", + "np_cnp_prio_mode", + "np_cnp_prio", +}; + +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR BIT(2) +#define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) +#define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) +#define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) +#define MLX5_IB_RP_MAX_RATE_ATTR BIT(6) +#define MLX5_IB_RP_AI_RATE_ATTR BIT(7) +#define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) +#define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) +#define MLX5_IB_RP_MIN_RATE_ATTR BIT(10) +#define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR BIT(11) +#define MLX5_IB_RP_DCE_TCP_G_ATTR BIT(12) +#define MLX5_IB_RP_DCE_TCP_RTT_ATTR BIT(13) +#define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR BIT(14) +#define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) +#define MLX5_IB_RP_GD_ATTR BIT(16) + +#define MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR BIT(2) +#define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) +#define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) + +static enum mlx5_ib_cong_node_type +mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset) +{ + if (param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE && + param_offset <= MLX5_IB_DBG_CC_RP_GD) + return MLX5_IB_RROCE_ECN_RP; + else + return MLX5_IB_RROCE_ECN_NP; +} + +static u32 mlx5_get_cc_param_val(void *field, int offset) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate); + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc); + case MLX5_IB_DBG_CC_RP_TIME_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset); + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, 
field, + rpg_byte_reset); + case MLX5_IB_DBG_CC_RP_THRESHOLD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_threshold); + case MLX5_IB_DBG_CC_RP_AI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate); + case MLX5_IB_DBG_CC_RP_MAX_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_max_rate); + case MLX5_IB_DBG_CC_RP_HAI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate); + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac); + case MLX5_IB_DBG_CC_RP_MIN_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate); + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp); + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g); + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt); + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period); + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value); + case MLX5_IB_DBG_CC_RP_GD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_gd); + case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + min_time_between_cnps); + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_dscp); + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_prio_mode); + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_802p_prio); + default: + return 0; + } +} + +static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, + u32 var, u32 *attr_mask) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate, var); + break; + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc, var); + break; + case MLX5_IB_DBG_CC_RP_TIME_RESET: + *attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset, var); + break; + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + *attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset, var); + break; + case MLX5_IB_DBG_CC_RP_THRESHOLD: + *attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_threshold, var); + break; + case MLX5_IB_DBG_CC_RP_AI_RATE: + *attr_mask |= MLX5_IB_RP_AI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_MAX_RATE: + *attr_mask |= MLX5_IB_RP_MAX_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_max_rate, var); + break; + case MLX5_IB_DBG_CC_RP_HAI_RATE: + *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + *attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_RATE: + *attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate, 
var); + break; + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + *attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + *attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + *attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + *attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period, var); + break; + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + *attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value, var); + break; + case MLX5_IB_DBG_CC_RP_GD: + *attr_mask |= MLX5_IB_RP_GD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_gd, var); + break; + case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS: + *attr_mask |= MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, + min_time_between_cnps, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0); + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); + break; + } +} + +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u32 port_num, + int offset, u32 *var) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); + void *out; + void *field; + int err; + enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) { + err = -ENOMEM; + goto alloc_err; + } + + node = mlx5_ib_param_to_node(offset); + + err = mlx5_cmd_query_cong_params(mdev, node, out); + if (err) + goto free; + + field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters); + *var = mlx5_get_cc_param_val(field, offset); + +free: + kvfree(out); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + return err; +} + +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 port_num, + int offset, u32 var) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); + void *in; + void *field; + enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + u32 attr_mask = 0; + int err; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto alloc_err; + } + + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + + node = mlx5_ib_param_to_node(offset); + MLX5_SET(modify_cong_params_in, in, cong_protocol, node); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters); + mlx5_ib_set_cc_param_mask_val(field, offset, var, &attr_mask); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field, 
field_select_r_roce_rp, + attr_mask); + + err = mlx5_cmd_exec_in(dev->mdev, modify_cong_params, in); + kvfree(in); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + return err; +} + +static ssize_t set_param(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + char lbuf[11] = { }; + u32 var; + int ret; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &var)) + return -EINVAL; + + ret = mlx5_ib_set_cc_params(param->dev, param->port_num, offset, var); + return ret ? ret : count; +} + +static ssize_t get_param(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + u32 var = 0; + int ret; + char lbuf[11]; + + ret = mlx5_ib_get_cc_params(param->dev, param->port_num, offset, &var); + if (ret) + return ret; + + ret = snprintf(lbuf, sizeof(lbuf), "%d\n", var); + if (ret < 0) + return ret; + + return simple_read_from_buffer(buf, count, pos, lbuf, ret); +} + +static const struct file_operations dbg_cc_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = set_param, + .read = get_param, +}; + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num) +{ + if (!mlx5_debugfs_root || + dev->is_rep || + !dev->port[port_num].dbg_cc_params || + !dev->port[port_num].dbg_cc_params->root) + return; + + debugfs_remove_recursive(dev->port[port_num].dbg_cc_params->root); + kfree(dev->port[port_num].dbg_cc_params); + dev->port[port_num].dbg_cc_params = NULL; +} + +void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num) +{ + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + struct mlx5_core_dev *mdev; + int i; + + if (!mlx5_debugfs_root || dev->is_rep) + return; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return; + + if (!MLX5_CAP_GEN(mdev, cc_query_allowed) || + !MLX5_CAP_GEN(mdev, cc_modify_allowed)) + goto put_mdev; + + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); + if (!dbg_cc_params) + goto err; + + dev->port[port_num].dbg_cc_params = dbg_cc_params; + + dbg_cc_params->root = debugfs_create_dir("cc_params", mlx5_debugfs_get_dev_root(mdev)); + + for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { + dbg_cc_params->params[i].offset = i; + dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].port_num = port_num; + dbg_cc_params->params[i].dentry = + debugfs_create_file(mlx5_ib_dbg_cc_name[i], + 0600, dbg_cc_params->root, + &dbg_cc_params->params[i], + &dbg_cc_fops); + } + +put_mdev: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + return; + +err: + mlx5_ib_warn(dev, "cong debugfs failure\n"); + mlx5_ib_cleanup_cong_debugfs(dev, port_num); + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. + */ + return; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.c new file mode 100644 index 0000000..b79b036 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.c @@ -0,0 +1,931 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include "mlx5_ib.h" +#include +#include "counters.h" +#include "ib_rep.h" +#include "qp.h" + +struct mlx5_ib_counter { + const char *name; + size_t offset; + u32 type; +}; + +#define INIT_Q_COUNTER(_name) \ + { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} + +static const struct mlx5_ib_counter basic_q_cnts[] = { + INIT_Q_COUNTER(rx_write_requests), + INIT_Q_COUNTER(rx_read_requests), + INIT_Q_COUNTER(rx_atomic_requests), + INIT_Q_COUNTER(out_of_buffer), +}; + +static const struct mlx5_ib_counter out_of_seq_q_cnts[] = { + INIT_Q_COUNTER(out_of_sequence), +}; + +static const struct mlx5_ib_counter retrans_q_cnts[] = { + INIT_Q_COUNTER(duplicate_request), + INIT_Q_COUNTER(rnr_nak_retry_err), + INIT_Q_COUNTER(packet_seq_err), + INIT_Q_COUNTER(implied_nak_seq_err), + INIT_Q_COUNTER(local_ack_timeout_err), + INIT_Q_COUNTER(rx_dct_connect), +}; + +#define INIT_CONG_COUNTER(_name) \ + { .name = #_name, .offset = \ + MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)} + +static const struct mlx5_ib_counter cong_cnts[] = { + INIT_CONG_COUNTER(rp_cnp_ignored), + INIT_CONG_COUNTER(rp_cnp_handled), + INIT_CONG_COUNTER(np_ecn_marked_roce_packets), + INIT_CONG_COUNTER(np_cnp_sent), +}; + +static const struct mlx5_ib_counter extended_err_cnts[] = { + INIT_Q_COUNTER(resp_local_length_error), + INIT_Q_COUNTER(resp_cqe_error), + INIT_Q_COUNTER(req_cqe_error), + INIT_Q_COUNTER(req_remote_invalid_request), + INIT_Q_COUNTER(req_remote_access_errors), + INIT_Q_COUNTER(resp_remote_access_errors), + INIT_Q_COUNTER(resp_cqe_flush_error), + INIT_Q_COUNTER(req_cqe_flush_error), +}; + +static const struct mlx5_ib_counter roce_accl_cnts[] = { + INIT_Q_COUNTER(roce_adp_retrans), + INIT_Q_COUNTER(roce_adp_retrans_to), + INIT_Q_COUNTER(roce_slow_restart), + INIT_Q_COUNTER(roce_slow_restart_cnps), + INIT_Q_COUNTER(roce_slow_restart_trans), +}; + +#define INIT_EXT_PPCNT_COUNTER(_name) \ + { .name = #_name, .offset = \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_extended_cntrs_grp_data_layout._name##_high)} + +static const struct mlx5_ib_counter ext_ppcnt_cnts[] = { + INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated), +}; + +#define INIT_OP_COUNTER(_name, _type) \ + { .name = #_name, .type = MLX5_IB_OPCOUNTER_##_type} + +static const struct mlx5_ib_counter basic_op_cnts[] = { + INIT_OP_COUNTER(cc_rx_ce_pkts, CC_RX_CE_PKTS), +}; + +static const struct mlx5_ib_counter rdmarx_cnp_op_cnts[] = { + INIT_OP_COUNTER(cc_rx_cnp_pkts, CC_RX_CNP_PKTS), +}; + +static const struct mlx5_ib_counter rdmatx_cnp_op_cnts[] = { + INIT_OP_COUNTER(cc_tx_cnp_pkts, CC_TX_CNP_PKTS), +}; + +static int mlx5_ib_read_counters(struct ib_counters *counters, + struct ib_counters_read_attr *read_attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + struct mlx5_read_counters_attr mread_attr = {}; + struct mlx5_ib_flow_counters_desc *desc; + int ret, i; + + mutex_lock(&mcounters->mcntrs_mutex); + if (mcounters->cntrs_max_index > read_attr->ncounters) { + ret = -EINVAL; + goto err_bound; + } + + mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64), + GFP_KERNEL); + if (!mread_attr.out) { + ret = -ENOMEM; + goto err_bound; + } + + mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl; + mread_attr.flags = read_attr->flags; + ret = mcounters->read_counters(counters->device, &mread_attr); + if (ret) + goto err_read; + + /* do the pass over the counters data array to assign according to the + * descriptions and indexing pairs + */ + desc = mcounters->counters_data; 
+ for (i = 0; i < mcounters->ncounters; i++) + read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description]; + +err_read: + kfree(mread_attr.out); +err_bound: + mutex_unlock(&mcounters->mcntrs_mutex); + return ret; +} + +static int mlx5_ib_destroy_counters(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + mlx5_ib_counters_clear_description(counters); + if (mcounters->hw_cntrs_hndl) + mlx5_fc_destroy(to_mdev(counters->device)->mdev, + mcounters->hw_cntrs_hndl); + return 0; +} + +static int mlx5_ib_create_counters(struct ib_counters *counters, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + mutex_init(&mcounters->mcntrs_mutex); + return 0; +} + + +static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev, + u32 port_num) +{ + return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts : + &dev->port[port_num].cnts; +} + +/** + * mlx5_ib_get_counters_id - Returns counters id to use for device+port + * @dev: Pointer to mlx5 IB device + * @port_num: Zero based port number + * + * mlx5_ib_get_counters_id() Returns counters set id to use for given + * device port combination in switchdev and non switchdev mode of the + * parent device. + */ +u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num) +{ + const struct mlx5_ib_counters *cnts = get_counters(dev, port_num); + + return cnts->set_id; +} + +static struct rdma_hw_stats *do_alloc_stats(const struct mlx5_ib_counters *cnts) +{ + struct rdma_hw_stats *stats; + u32 num_hw_counters; + int i; + + num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + stats = rdma_alloc_hw_stats_struct(cnts->descs, + num_hw_counters + + cnts->num_op_counters, + RDMA_HW_STATS_DEFAULT_LIFESPAN); + if (!stats) + return NULL; + + for (i = 0; i < cnts->num_op_counters; i++) + set_bit(num_hw_counters + i, stats->is_disabled); + + return stats; +} + +static struct rdma_hw_stats * +mlx5_ib_alloc_hw_device_stats(struct ib_device *ibdev) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct mlx5_ib_counters *cnts = &dev->port[0].cnts; + + return do_alloc_stats(cnts); +} + +static struct rdma_hw_stats * +mlx5_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct mlx5_ib_counters *cnts = &dev->port[port_num - 1].cnts; + + return do_alloc_stats(cnts); +} + +static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, + const struct mlx5_ib_counters *cnts, + struct rdma_hw_stats *stats, + u16 set_id) +{ + u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; + __be32 val; + int ret, i; + + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); + MLX5_SET(query_q_counter_in, in, counter_set_id, set_id); + ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out); + if (ret) + return ret; + + for (i = 0; i < cnts->num_q_counters; i++) { + val = *(__be32 *)((void *)out + cnts->offsets[i]); + stats->value[i] = (u64)be32_to_cpu(val); + } + + return 0; +} + +static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev, + const struct mlx5_ib_counters *cnts, + struct rdma_hw_stats *stats) +{ + int offset = cnts->num_q_counters + cnts->num_cong_counters; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + int ret, i; + void *out; + + out = kvzalloc(sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + 
MLX5_SET(ppcnt_reg, in, local_port, 1); + MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); + ret = mlx5_core_access_reg(dev->mdev, in, sz, out, sz, MLX5_REG_PPCNT, + 0, 0); + if (ret) + goto free; + + for (i = 0; i < cnts->num_ext_ppcnt_counters; i++) + stats->value[i + offset] = + be64_to_cpup((__be64 *)(out + + cnts->offsets[i + offset])); +free: + kvfree(out); + return ret; +} + +static int do_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1); + struct mlx5_core_dev *mdev; + int ret, num_counters; + u32 mdev_port_num; + + if (!stats) + return -EINVAL; + + num_counters = cnts->num_q_counters + + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + + /* q_counters are per IB device, query the master mdev */ + ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id); + if (ret) + return ret; + + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + ret = mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats); + if (ret) + return ret; + } + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, + &mdev_port_num); + if (!mdev) { + /* If port is not affiliated yet, its in down state + * which doesn't have any counters yet, so it would be + * zero. So no need to read from the HCA. + */ + goto done; + } + ret = mlx5_lag_query_cong_counters(dev->mdev, + stats->value + + cnts->num_q_counters, + cnts->num_cong_counters, + cnts->offsets + + cnts->num_q_counters); + + mlx5_ib_put_native_port_mdev(dev, port_num); + if (ret) + return ret; + } + +done: + return num_counters; +} + +static int do_get_op_stat(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct mlx5_ib_counters *cnts; + const struct mlx5_ib_op_fc *opfcs; + u64 packets = 0, bytes; + u32 type; + int ret; + + cnts = get_counters(dev, port_num - 1); + opfcs = cnts->opfcs; + type = *(u32 *)cnts->descs[index].priv; + if (type >= MLX5_IB_OPCOUNTER_MAX) + return -EINVAL; + + if (!opfcs[type].fc) + goto out; + + ret = mlx5_fc_query(dev->mdev, opfcs[type].fc, + &packets, &bytes); + if (ret) + return ret; + +out: + stats->value[index] = packets; + return index; +} + +static int do_get_op_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u32 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct mlx5_ib_counters *cnts; + int index, ret, num_hw_counters; + + cnts = get_counters(dev, port_num - 1); + num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + for (index = num_hw_counters; + index < (num_hw_counters + cnts->num_op_counters); index++) { + ret = do_get_op_stat(ibdev, stats, port_num, index); + if (ret != index) + return ret; + } + + return cnts->num_op_counters; +} + +static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u32 port_num, int index) +{ + int num_counters, num_hw_counters, num_op_counters; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + const struct mlx5_ib_counters *cnts; + + cnts = get_counters(dev, port_num - 1); + num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + num_counters = num_hw_counters + cnts->num_op_counters; + + if (index < 0 || index > num_counters) + return -EINVAL; + else if (index > 0 && index < 
num_hw_counters) + return do_get_hw_stats(ibdev, stats, port_num, index); + else if (index >= num_hw_counters && index < num_counters) + return do_get_op_stat(ibdev, stats, port_num, index); + + num_hw_counters = do_get_hw_stats(ibdev, stats, port_num, index); + if (num_hw_counters < 0) + return num_hw_counters; + + num_op_counters = do_get_op_stats(ibdev, stats, port_num); + if (num_op_counters < 0) + return num_op_counters; + + return num_hw_counters + num_op_counters; +} + +static struct rdma_hw_stats * +mlx5_ib_counter_alloc_stats(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + const struct mlx5_ib_counters *cnts = + get_counters(dev, counter->port - 1); + + return do_alloc_stats(cnts); +} + +static int mlx5_ib_counter_update_stats(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + const struct mlx5_ib_counters *cnts = + get_counters(dev, counter->port - 1); + + return mlx5_ib_query_q_counters(dev->mdev, cnts, + counter->stats, counter->id); +} + +static int mlx5_ib_counter_dealloc(struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(counter->device); + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; + + if (!counter->id) + return 0; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id); + return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in); +} + +static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + int err; + + if (!counter->id) { + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; + + MLX5_SET(alloc_q_counter_in, in, opcode, + MLX5_CMD_OP_ALLOC_Q_COUNTER); + MLX5_SET(alloc_q_counter_in, in, uid, MLX5_SHARED_RESOURCE_UID); + err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out); + if (err) + return err; + counter->id = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); + } + + err = mlx5_ib_qp_set_counter(qp, counter); + if (err) + goto fail_set_counter; + + return 0; + +fail_set_counter: + mlx5_ib_counter_dealloc(counter); + counter->id = 0; + + return err; +} + +static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp) +{ + return mlx5_ib_qp_set_counter(qp, NULL); +} + +static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, + struct rdma_stat_desc *descs, size_t *offsets) +{ + int i; + int j = 0; + + for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { + descs[j].name = basic_q_cnts[i].name; + offsets[j] = basic_q_cnts[i].offset; + } + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { + for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { + descs[j].name = out_of_seq_q_cnts[i].name; + offsets[j] = out_of_seq_q_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { + descs[j].name = retrans_q_cnts[i].name; + offsets[j] = retrans_q_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { + for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + descs[j].name = extended_err_cnts[i].name; + offsets[j] = extended_err_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, roce_accl)) { + for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) { + descs[j].name = roce_accl_cnts[i].name; + offsets[j] = roce_accl_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + for (i = 0; i < 
ARRAY_SIZE(cong_cnts); i++, j++) { + descs[j].name = cong_cnts[i].name; + offsets[j] = cong_cnts[i].offset; + } + } + + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) { + descs[j].name = ext_ppcnt_cnts[i].name; + offsets[j] = ext_ppcnt_cnts[i].offset; + } + } + + for (i = 0; i < ARRAY_SIZE(basic_op_cnts); i++, j++) { + descs[j].name = basic_op_cnts[i].name; + descs[j].flags |= IB_STAT_FLAG_OPTIONAL; + descs[j].priv = &basic_op_cnts[i].type; + } + + if (MLX5_CAP_FLOWTABLE(dev->mdev, + ft_field_support_2_nic_receive_rdma.bth_opcode)) { + for (i = 0; i < ARRAY_SIZE(rdmarx_cnp_op_cnts); i++, j++) { + descs[j].name = rdmarx_cnp_op_cnts[i].name; + descs[j].flags |= IB_STAT_FLAG_OPTIONAL; + descs[j].priv = &rdmarx_cnp_op_cnts[i].type; + } + } + + if (MLX5_CAP_FLOWTABLE(dev->mdev, + ft_field_support_2_nic_transmit_rdma.bth_opcode)) { + for (i = 0; i < ARRAY_SIZE(rdmatx_cnp_op_cnts); i++, j++) { + descs[j].name = rdmatx_cnp_op_cnts[i].name; + descs[j].flags |= IB_STAT_FLAG_OPTIONAL; + descs[j].priv = &rdmatx_cnp_op_cnts[i].type; + } + } +} + + +static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_counters *cnts) +{ + u32 num_counters, num_op_counters; + + num_counters = ARRAY_SIZE(basic_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) + num_counters += ARRAY_SIZE(out_of_seq_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) + num_counters += ARRAY_SIZE(retrans_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) + num_counters += ARRAY_SIZE(extended_err_cnts); + + if (MLX5_CAP_GEN(dev->mdev, roce_accl)) + num_counters += ARRAY_SIZE(roce_accl_cnts); + + cnts->num_q_counters = num_counters; + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + cnts->num_cong_counters = ARRAY_SIZE(cong_cnts); + num_counters += ARRAY_SIZE(cong_cnts); + } + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts); + num_counters += ARRAY_SIZE(ext_ppcnt_cnts); + } + + num_op_counters = ARRAY_SIZE(basic_op_cnts); + + if (MLX5_CAP_FLOWTABLE(dev->mdev, + ft_field_support_2_nic_receive_rdma.bth_opcode)) + num_op_counters += ARRAY_SIZE(rdmarx_cnp_op_cnts); + + if (MLX5_CAP_FLOWTABLE(dev->mdev, + ft_field_support_2_nic_transmit_rdma.bth_opcode)) + num_op_counters += ARRAY_SIZE(rdmatx_cnp_op_cnts); + + cnts->num_op_counters = num_op_counters; + num_counters += num_op_counters; + cnts->descs = kcalloc(num_counters, + sizeof(struct rdma_stat_desc), GFP_KERNEL); + if (!cnts->descs) + return -ENOMEM; + + cnts->offsets = kcalloc(num_counters, + sizeof(*cnts->offsets), GFP_KERNEL); + if (!cnts->offsets) + goto err; + + return 0; + +err: + kfree(cnts->descs); + cnts->descs = NULL; + return -ENOMEM; +} + +static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; + int num_cnt_ports; + int i, j; + + num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 
1 : dev->num_ports; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + + for (i = 0; i < num_cnt_ports; i++) { + if (dev->port[i].cnts.set_id) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + dev->port[i].cnts.set_id); + mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in); + } + kfree(dev->port[i].cnts.descs); + kfree(dev->port[i].cnts.offsets); + + for (j = 0; j < MLX5_IB_OPCOUNTER_MAX; j++) { + if (!dev->port[i].cnts.opfcs[j].fc) + continue; + + if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) + mlx5_ib_fs_remove_op_fc(dev, + &dev->port[i].cnts.opfcs[j], j); + mlx5_fc_destroy(dev->mdev, + dev->port[i].cnts.opfcs[j].fc); + dev->port[i].cnts.opfcs[j].fc = NULL; + } + } +} + +static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; + int num_cnt_ports; + int err = 0; + int i; + bool is_shared; + + MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); + is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0; + num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports; + + for (i = 0; i < num_cnt_ports; i++) { + err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts); + if (err) + goto err_alloc; + + mlx5_ib_fill_counters(dev, dev->port[i].cnts.descs, + dev->port[i].cnts.offsets); + + MLX5_SET(alloc_q_counter_in, in, uid, + is_shared ? MLX5_SHARED_RESOURCE_UID : 0); + + err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out); + if (err) { + mlx5_ib_warn(dev, + "couldn't allocate queue counter for port %d, err %d\n", + i + 1, err); + goto err_alloc; + } + + dev->port[i].cnts.set_id = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); + } + return 0; + +err_alloc: + mlx5_ib_dealloc_counters(dev); + return err; +} + +static int read_flow_counters(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr) +{ + struct mlx5_fc *fc = read_attr->hw_cntrs_hndl; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_fc_query(dev->mdev, fc, + &read_attr->out[IB_COUNTER_PACKETS], + &read_attr->out[IB_COUNTER_BYTES]); +} + +/* flow counters currently expose two counters packets and bytes */ +#define FLOW_COUNTERS_NUM 2 +static int counters_set_description( + struct ib_counters *counters, enum mlx5_ib_counters_type counters_type, + struct mlx5_ib_flow_counters_desc *desc_data, u32 ncounters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + u32 cntrs_max_index = 0; + int i; + + if (counters_type != MLX5_IB_COUNTERS_FLOW) + return -EINVAL; + + /* init the fields for the object */ + mcounters->type = counters_type; + mcounters->read_counters = read_flow_counters; + mcounters->counters_num = FLOW_COUNTERS_NUM; + mcounters->ncounters = ncounters; + /* each counter entry have both description and index pair */ + for (i = 0; i < ncounters; i++) { + if (desc_data[i].description > IB_COUNTER_BYTES) + return -EINVAL; + + if (cntrs_max_index <= desc_data[i].index) + cntrs_max_index = desc_data[i].index + 1; + } + + mutex_lock(&mcounters->mcntrs_mutex); + mcounters->counters_data = desc_data; + mcounters->cntrs_max_index = cntrs_max_index; + mutex_unlock(&mcounters->mcntrs_mutex); + + return 0; +} + +#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2)) +int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters, + struct mlx5_ib_create_flow *ucmd) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters); + struct mlx5_ib_flow_counters_data *cntrs_data = NULL; + struct 
mlx5_ib_flow_counters_desc *desc_data = NULL; + bool hw_hndl = false; + int ret = 0; + + if (ucmd && ucmd->ncounters_data != 0) { + cntrs_data = ucmd->data; + if (cntrs_data->ncounters > MAX_COUNTERS_NUM) + return -EINVAL; + + desc_data = kcalloc(cntrs_data->ncounters, + sizeof(*desc_data), + GFP_KERNEL); + if (!desc_data) + return -ENOMEM; + + if (copy_from_user(desc_data, + u64_to_user_ptr(cntrs_data->counters_data), + sizeof(*desc_data) * cntrs_data->ncounters)) { + ret = -EFAULT; + goto free; + } + } + + if (!mcounters->hw_cntrs_hndl) { + mcounters->hw_cntrs_hndl = mlx5_fc_create( + to_mdev(ibcounters->device)->mdev, false); + if (IS_ERR(mcounters->hw_cntrs_hndl)) { + ret = PTR_ERR(mcounters->hw_cntrs_hndl); + goto free; + } + hw_hndl = true; + } + + if (desc_data) { + /* counters already bound to at least one flow */ + if (mcounters->cntrs_max_index) { + ret = -EINVAL; + goto free_hndl; + } + + ret = counters_set_description(ibcounters, + MLX5_IB_COUNTERS_FLOW, + desc_data, + cntrs_data->ncounters); + if (ret) + goto free_hndl; + + } else if (!mcounters->cntrs_max_index) { + /* counters not bound yet, must have udata passed */ + ret = -EINVAL; + goto free_hndl; + } + + return 0; + +free_hndl: + if (hw_hndl) { + mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev, + mcounters->hw_cntrs_hndl); + mcounters->hw_cntrs_hndl = NULL; + } +free: + kfree(desc_data); + return ret; +} + +void mlx5_ib_counters_clear_description(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters; + + if (!counters || atomic_read(&counters->usecnt) != 1) + return; + + mcounters = to_mcounters(counters); + + mutex_lock(&mcounters->mcntrs_mutex); + kfree(mcounters->counters_data); + mcounters->counters_data = NULL; + mcounters->cntrs_max_index = 0; + mutex_unlock(&mcounters->mcntrs_mutex); +} + +static int mlx5_ib_modify_stat(struct ib_device *device, u32 port, + unsigned int index, bool enable) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_counters *cnts; + struct mlx5_ib_op_fc *opfc; + u32 num_hw_counters, type; + int ret; + + cnts = &dev->port[port - 1].cnts; + num_hw_counters = cnts->num_q_counters + cnts->num_cong_counters + + cnts->num_ext_ppcnt_counters; + if (index < num_hw_counters || + index >= (num_hw_counters + cnts->num_op_counters)) + return -EINVAL; + + if (!(cnts->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) + return -EINVAL; + + type = *(u32 *)cnts->descs[index].priv; + if (type >= MLX5_IB_OPCOUNTER_MAX) + return -EINVAL; + + opfc = &cnts->opfcs[type]; + + if (enable) { + if (opfc->fc) + return -EEXIST; + + opfc->fc = mlx5_fc_create(dev->mdev, false); + if (IS_ERR(opfc->fc)) + return PTR_ERR(opfc->fc); + + ret = mlx5_ib_fs_add_op_fc(dev, port, opfc, type); + if (ret) { + mlx5_fc_destroy(dev->mdev, opfc->fc); + opfc->fc = NULL; + } + return ret; + } + + if (!opfc->fc) + return -EINVAL; + + mlx5_ib_fs_remove_op_fc(dev, opfc, type); + mlx5_fc_destroy(dev->mdev, opfc->fc); + opfc->fc = NULL; + return 0; +} + +static const struct ib_device_ops hw_stats_ops = { + .alloc_hw_port_stats = mlx5_ib_alloc_hw_port_stats, + .get_hw_stats = mlx5_ib_get_hw_stats, + .counter_bind_qp = mlx5_ib_counter_bind_qp, + .counter_unbind_qp = mlx5_ib_counter_unbind_qp, + .counter_dealloc = mlx5_ib_counter_dealloc, + .counter_alloc_stats = mlx5_ib_counter_alloc_stats, + .counter_update_stats = mlx5_ib_counter_update_stats, + .modify_hw_stat = IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) ? 
+ mlx5_ib_modify_stat : NULL, +}; + +static const struct ib_device_ops hw_switchdev_stats_ops = { + .alloc_hw_device_stats = mlx5_ib_alloc_hw_device_stats, + .get_hw_stats = mlx5_ib_get_hw_stats, + .counter_bind_qp = mlx5_ib_counter_bind_qp, + .counter_unbind_qp = mlx5_ib_counter_unbind_qp, + .counter_dealloc = mlx5_ib_counter_dealloc, + .counter_alloc_stats = mlx5_ib_counter_alloc_stats, + .counter_update_stats = mlx5_ib_counter_update_stats, +}; + +static const struct ib_device_ops counters_ops = { + .create_counters = mlx5_ib_create_counters, + .destroy_counters = mlx5_ib_destroy_counters, + .read_counters = mlx5_ib_read_counters, + + INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), +}; + +int mlx5_ib_counters_init(struct mlx5_ib_dev *dev) +{ + ib_set_device_ops(&dev->ib_dev, &counters_ops); + + if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + return 0; + + if (is_mdev_switchdev_mode(dev->mdev)) + ib_set_device_ops(&dev->ib_dev, &hw_switchdev_stats_ops); + else + ib_set_device_ops(&dev->ib_dev, &hw_stats_ops); + return mlx5_ib_alloc_counters(dev); +} + +void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + return; + + mlx5_ib_dealloc_counters(dev); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.h new file mode 100644 index 0000000..6bcaaa5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/counters.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + */ + +#ifndef _MLX5_IB_COUNTERS_H +#define _MLX5_IB_COUNTERS_H + +#include "mlx5_ib.h" + +int mlx5_ib_counters_init(struct mlx5_ib_dev *dev); +void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev); +void mlx5_ib_counters_clear_description(struct ib_counters *counters); +int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters, + struct mlx5_ib_create_flow *ucmd); +u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u32 port_num); +#endif /* _MLX5_IB_COUNTERS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cq.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cq.c new file mode 100644 index 0000000..a239e22 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/cq.c @@ -0,0 +1,1439 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "srq.h" +#include "qp.h" + +static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) +{ + struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; + + ibcq->comp_handler(ibcq, ibcq->cq_context); +} + +static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq); + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct ib_cq *ibcq = &cq->ibcq; + struct ib_event event; + + if (type != MLX5_EVENT_TYPE_CQ_ERROR) { + mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n", + type, mcq->cqn); + return; + } + + if (ibcq->event_handler) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_CQ_ERR; + event.element.cq = ibcq; + ibcq->event_handler(&event, ibcq->cq_context); + } +} + +static void *get_cqe(struct mlx5_ib_cq *cq, int n) +{ + return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n); +} + +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & cq->ibcq.cqe); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } +} + +static void *next_cqe_sw(struct mlx5_ib_cq *cq) +{ + return get_sw_cqe(cq, cq->mcq.cons_index); +} + +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) +{ + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + + case IB_WR_LOCAL_INV: + return IB_WC_LOCAL_INV; + + case IB_WR_REG_MR: + return IB_WC_REG_MR; + + default: + pr_warn("unknown completion status\n"); + return 0; + } +} + +static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_wq *wq, int idx) +{ + wc->wc_flags = 0; + switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) { + case MLX5_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + fallthrough; + case MLX5_OPCODE_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case MLX5_OPCODE_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + fallthrough; + case MLX5_OPCODE_NOP: + case MLX5_OPCODE_SEND: + case MLX5_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_SEND; + break; + case MLX5_OPCODE_RDMA_READ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + break; + case MLX5_OPCODE_ATOMIC_CS: + wc->opcode = IB_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_FA: + wc->opcode = IB_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_CS: + wc->opcode = IB_WC_MASKED_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX5_OPCODE_ATOMIC_MASKED_FA: + wc->opcode = IB_WC_MASKED_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX5_OPCODE_UMR: + wc->opcode = get_umr_comp(wq, idx); + break; + } +} + +enum { + MLX5_GRH_IN_BUFFER = 1, + MLX5_GRH_IN_CQE = 2, +}; + +static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, + struct mlx5_ib_qp *qp) +{ + enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1); + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_ib_srq *srq = NULL; 
+ struct mlx5_ib_wq *wq; + u16 wqe_ctr; + u8 roce_packet_type; + bool vlan_present; + u8 g; + + if (qp->ibqp.srq || qp->ibqp.xrcd) { + struct mlx5_core_srq *msrq = NULL; + + if (qp->ibqp.xrcd) { + msrq = mlx5_cmd_get_srq(dev, be32_to_cpu(cqe->srqn)); + if (msrq) + srq = to_mibsrq(msrq); + } else { + srq = to_msrq(qp->ibqp.srq); + } + if (srq) { + wqe_ctr = be16_to_cpu(cqe->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + if (msrq) + mlx5_core_res_put(&msrq->common); + } + } else { + wq = &qp->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + wc->byte_len = be32_to_cpu(cqe->byte_cnt); + + switch (get_cqe_opcode(cqe)) { + case MLX5_CQE_RESP_WR_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immediate; + break; + case MLX5_CQE_RESP_SEND: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_IP_CSUM_OK; + if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) + wc->wc_flags = 0; + break; + case MLX5_CQE_RESP_SEND_IMM: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immediate; + break; + case MLX5_CQE_RESP_SEND_INV: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->inval_rkey); + break; + } + wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff; + wc->dlid_path_bits = cqe->ml_path; + g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; + wc->wc_flags |= g ? IB_WC_GRH : 0; + if (is_qp1(qp->type)) { + u16 pkey = be32_to_cpu(cqe->pkey) & 0xffff; + + ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey, + &wc->pkey_index); + } else { + wc->pkey_index = 0; + } + + if (ll != IB_LINK_LAYER_ETHERNET) { + wc->slid = be16_to_cpu(cqe->slid); + wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf; + return; + } + + wc->slid = 0; + vlan_present = cqe->l4_l3_hdr_type & 0x1; + roce_packet_type = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0x3; + if (vlan_present) { + wc->vlan_id = (be16_to_cpu(cqe->vlan_info)) & 0xfff; + wc->sl = (be16_to_cpu(cqe->vlan_info) >> 13) & 0x7; + wc->wc_flags |= IB_WC_WITH_VLAN; + } else { + wc->sl = 0; + } + + switch (roce_packet_type) { + case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_ROCE_V1; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; +} + +static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) +{ + mlx5_ib_warn(dev, "dump error cqe\n"); + mlx5_dump_err_cqe(dev->mdev, cqe); +} + +static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, + struct mlx5_err_cqe *cqe, + struct ib_wc *wc) +{ + int dump = 1; + + switch (cqe->syndrome) { + case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IB_WC_LOC_QP_OP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: + dump = 0; + wc->status = IB_WC_WR_FLUSH_ERR; + break; + case MLX5_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IB_WC_MW_BIND_ERR; + break; + case MLX5_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IB_WC_BAD_RESP_ERR; + break; + case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = 
IB_WC_REM_INV_REQ_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IB_WC_REM_ACCESS_ERR; + break; + case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IB_WC_REM_OP_ERR; + break; + case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IB_WC_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IB_WC_RNR_RETRY_EXC_ERR; + dump = 0; + break; + case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IB_WC_REM_ABORT_ERR; + break; + default: + wc->status = IB_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err_synd; + if (dump) { + mlx5_ib_warn(dev, "WC error: %d, Message: %s\n", wc->status, + ib_wc_status_msg(wc->status)); + dump_cqe(dev, cqe); + } +} + +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, + u16 tail, u16 head) +{ + u16 idx; + + do { + idx = tail & (qp->sq.wqe_cnt - 1); + if (idx == head) + break; + + tail = qp->sq.w_list[idx].next; + } while (1); + tail = qp->sq.w_list[idx].next; + qp->sq.last_poll = tail; +} + +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_frag_buf_free(dev->mdev, &buf->frag_buf); +} + +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + +static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc, + int *npolled, bool is_send) +{ + struct mlx5_ib_wq *wq; + unsigned int cur; + int np; + int i; + + wq = (is_send) ? &qp->sq : &qp->rq; + cur = wq->head - wq->tail; + np = *npolled; + + if (cur == 0) + return; + + for (i = 0; i < cur && np < num_entries; i++) { + unsigned int idx; + + idx = (is_send) ? 
wq->last_poll : wq->tail;
+		idx &= (wq->wqe_cnt - 1);
+		wc->wr_id = wq->wrid[idx];
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+		wq->tail++;
+		if (is_send)
+			wq->last_poll = wq->w_list[idx].next;
+		np++;
+		wc->qp = &qp->ibqp;
+		wc++;
+	}
+	*npolled = np;
+}
+
+static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries,
+				 struct ib_wc *wc, int *npolled)
+{
+	struct mlx5_ib_qp *qp;
+
+	*npolled = 0;
+	/* Find uncompleted WQEs belonging to that cq and return mimicked flush completions */
+	list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
+		sw_comp(qp, num_entries, wc + *npolled, npolled, true);
+		if (*npolled >= num_entries)
+			return;
+	}
+
+	list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
+		sw_comp(qp, num_entries, wc + *npolled, npolled, false);
+		if (*npolled >= num_entries)
+			return;
+	}
+}
+
+static int mlx5_poll_one(struct mlx5_ib_cq *cq,
+			 struct mlx5_ib_qp **cur_qp,
+			 struct ib_wc *wc)
+{
+	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+	struct mlx5_err_cqe *err_cqe;
+	struct mlx5_cqe64 *cqe64;
+	struct mlx5_core_qp *mqp;
+	struct mlx5_ib_wq *wq;
+	uint8_t opcode;
+	uint32_t qpn;
+	u16 wqe_ctr;
+	void *cqe;
+	int idx;
+
+repoll:
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+	++cq->mcq.cons_index;
+
+	/* Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	opcode = get_cqe_opcode(cqe64);
+	if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) {
+		if (likely(cq->resize_buf)) {
+			free_cq_buf(dev, &cq->buf);
+			cq->buf = *cq->resize_buf;
+			kfree(cq->resize_buf);
+			cq->resize_buf = NULL;
+			goto repoll;
+		} else {
+			mlx5_ib_warn(dev, "unexpected resize cqe\n");
+		}
+	}
+
+	qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
+	if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) {
+		/* We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		mqp = radix_tree_lookup(&dev->qp_table.tree, qpn);
+		*cur_qp = to_mibqp(mqp);
+	}
+
+	wc->qp = &(*cur_qp)->ibqp;
+	switch (opcode) {
+	case MLX5_CQE_REQ:
+		wq = &(*cur_qp)->sq;
+		wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+		idx = wqe_ctr & (wq->wqe_cnt - 1);
+		handle_good_req(wc, cqe64, wq, idx);
+		handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
+		wc->wr_id = wq->wrid[idx];
+		wq->tail = wq->wqe_head[idx] + 1;
+		if (unlikely(wq->wr_data[idx] == MLX5_IB_WR_SIG_CANCELED))
+			wc->status = IB_WC_SIG_PIPELINE_CANCELED;
+		else
+			wc->status = IB_WC_SUCCESS;
+		break;
+	case MLX5_CQE_RESP_WR_IMM:
+	case MLX5_CQE_RESP_SEND:
+	case MLX5_CQE_RESP_SEND_IMM:
+	case MLX5_CQE_RESP_SEND_INV:
+		handle_responder(wc, cqe64, *cur_qp);
+		wc->status = IB_WC_SUCCESS;
+		break;
+	case MLX5_CQE_RESIZE_CQ:
+		break;
+	case MLX5_CQE_REQ_ERR:
+	case MLX5_CQE_RESP_ERR:
+		err_cqe = (struct mlx5_err_cqe *)cqe64;
+		mlx5_handle_error_cqe(dev, err_cqe, wc);
+		mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n",
+			    opcode == MLX5_CQE_REQ_ERR ?
+ "Requestor" : "Responder", cq->mcq.cqn); + mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", + err_cqe->syndrome, err_cqe->vendor_err_synd); + if (wc->status != IB_WC_WR_FLUSH_ERR && + (*cur_qp)->type == MLX5_IB_QPT_REG_UMR) + dev->umrc.state = MLX5_UMR_STATE_RECOVER; + + if (opcode == MLX5_CQE_REQ_ERR) { + wq = &(*cur_qp)->sq; + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + idx = wqe_ctr & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + } else { + struct mlx5_ib_srq *srq; + + if ((*cur_qp)->ibqp.srq) { + srq = to_msrq((*cur_qp)->ibqp.srq); + wqe_ctr = be16_to_cpu(cqe64->wqe_counter); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx5_ib_free_srq_wqe(srq, wqe_ctr); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + } + break; + case MLX5_CQE_SIG_ERR: { + struct mlx5_sig_err_cqe *sig_err_cqe = + (struct mlx5_sig_err_cqe *)cqe64; + struct mlx5_core_sig_ctx *sig; + + xa_lock(&dev->sig_mrs); + sig = xa_load(&dev->sig_mrs, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + get_sig_err_item(sig_err_cqe, &sig->err_item); + sig->sig_err_exists = true; + sig->sigerr_count++; + + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, sig->err_item.key, + sig->err_item.err_type, + sig->err_item.sig_err_offset, + sig->err_item.expected, + sig->err_item.actual); + + xa_unlock(&dev->sig_mrs); + goto repoll; + } + } + + return 0; +} + +static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc, bool is_fatal_err) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_ib_wc *soft_wc, *next; + int npolled = 0; + + list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) { + if (npolled >= num_entries) + break; + + mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", + cq->mcq.cqn); + + if (unlikely(is_fatal_err)) { + soft_wc->wc.status = IB_WC_WR_FLUSH_ERR; + soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + } + wc[npolled++] = soft_wc->wc; + list_del(&soft_wc->list); + atomic_set(&soft_wc->in_use, 0); + } + + return npolled; +} + +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_ib_qp *cur_qp = NULL; + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int soft_polled = 0; + int npolled; + + spin_lock_irqsave(&cq->lock, flags); + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + /* make sure no soft wqe's are waiting */ + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc, true); + + mlx5_ib_poll_sw_comp(cq, num_entries - soft_polled, + wc + soft_polled, &npolled); + goto out; + } + + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc, false); + + for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { + if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled)) + break; + } + + if (npolled) + mlx5_cq_set_ci(&cq->mcq); +out: + spin_unlock_irqrestore(&cq->lock, flags); + + return soft_polled + npolled; +} + +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + void __iomem *uar_page = mdev->priv.uar->map; + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, irq_flags); + if (cq->notify_flags 
!= IB_CQ_NEXT_COMP) + cq->notify_flags = flags & IB_CQ_SOLICITED_MASK; + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list)) + ret = 1; + spin_unlock_irqrestore(&cq->lock, irq_flags); + + mlx5_cq_arm(&cq->mcq, + (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, + uar_page, to_mcq(ibcq)->mcq.cons_index); + + return ret; +} + +static int alloc_cq_frag_buf(struct mlx5_ib_dev *dev, + struct mlx5_ib_cq_buf *buf, + int nent, + int cqe_size) +{ + struct mlx5_frag_buf *frag_buf = &buf->frag_buf; + u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0); + u8 log_wq_sz = ilog2(cqe_size); + int err; + + err = mlx5_frag_buf_alloc_node(dev->mdev, + nent * cqe_size, + frag_buf, + dev->mdev->priv.numa_node); + if (err) + return err; + + mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); + + buf->cqe_size = cqe_size; + buf->nent = nent; + + return 0; +} + +enum { + MLX5_CQE_RES_FORMAT_HASH = 0, + MLX5_CQE_RES_FORMAT_CSUM = 1, + MLX5_CQE_RES_FORMAT_CSUM_STRIDX = 3, +}; + +static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format) +{ + switch (format) { + case MLX5_IB_CQE_RES_FORMAT_HASH: + return MLX5_CQE_RES_FORMAT_HASH; + case MLX5_IB_CQE_RES_FORMAT_CSUM: + return MLX5_CQE_RES_FORMAT_CSUM; + case MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX: + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + return MLX5_CQE_RES_FORMAT_CSUM_STRIDX; + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, + struct mlx5_ib_cq *cq, int entries, u32 **cqb, + int *cqe_size, int *index, int *inlen) +{ + struct mlx5_ib_create_cq ucmd = {}; + unsigned long page_size; + unsigned int page_offset_quantized; + size_t ucmdlen; + __be64 *pas; + int ncont; + void *cqc; + int err; + struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + ucmdlen = min(udata->inlen, sizeof(ucmd)); + if (ucmdlen < offsetof(struct mlx5_ib_create_cq, flags)) + return -EINVAL; + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) + return -EFAULT; + + if ((ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD | + MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX | + MLX5_IB_CREATE_CQ_FLAGS_REAL_TIME_TS))) + return -EINVAL; + + if ((ucmd.cqe_size != 64 && ucmd.cqe_size != 128) || + ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + *cqe_size = ucmd.cqe_size; + + cq->buf.umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr, + entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 0); + if (IS_ERR(cq->buf.umem)) { + err = PTR_ERR(cq->buf.umem); + return err; + } + + page_size = mlx5_umem_find_best_cq_quantized_pgoff( + cq->buf.umem, cqc, log_page_size, MLX5_ADAPTER_PAGE_SHIFT, + page_offset, 64, &page_offset_quantized); + if (!page_size) { + err = -EINVAL; + goto err_umem; + } + + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &cq->db); + if (err) + goto err_umem; + + ncont = ib_umem_num_dma_blocks(cq->buf.umem, page_size); + mlx5_ib_dbg( + dev, + "addr 0x%llx, size %u, npages %zu, page_size %lu, ncont %d\n", + ucmd.buf_addr, entries * ucmd.cqe_size, + ib_umem_num_pages(cq->buf.umem), page_size, ncont); + + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont; + *cqb = kvzalloc(*inlen, GFP_KERNEL); + if (!*cqb) { + err = -ENOMEM; + goto err_db; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_ib_populate_pas(cq->buf.umem, page_size, pas, 0); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, 
cq_context); + MLX5_SET(cqc, cqc, log_page_size, + order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(cqc, cqc, page_offset, page_offset_quantized); + + if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { + *index = ucmd.uar_page_index; + } else if (context->bfregi.lib_uar_dyn) { + err = -EINVAL; + goto err_cqb; + } else { + *index = context->bfregi.sys_pages[0]; + } + + if (ucmd.cqe_comp_en == 1) { + int mini_cqe_format; + + if (!((*cqe_size == 128 && + MLX5_CAP_GEN(dev->mdev, cqe_compression_128)) || + (*cqe_size == 64 && + MLX5_CAP_GEN(dev->mdev, cqe_compression)))) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, "CQE compression is not supported for size %d!\n", + *cqe_size); + goto err_cqb; + } + + mini_cqe_format = + mini_cqe_res_format_to_hw(dev, + ucmd.cqe_comp_res_format); + if (mini_cqe_format < 0) { + err = mini_cqe_format; + mlx5_ib_dbg(dev, "CQE compression res format %d error: %d\n", + ucmd.cqe_comp_res_format, err); + goto err_cqb; + } + + MLX5_SET(cqc, cqc, cqe_comp_en, 1); + MLX5_SET(cqc, cqc, mini_cqe_res_format, mini_cqe_format); + } + + if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD) { + if (*cqe_size != 128 || + !MLX5_CAP_GEN(dev->mdev, cqe_128_always)) { + err = -EOPNOTSUPP; + mlx5_ib_warn(dev, + "CQE padding is not supported for CQE size of %dB!\n", + *cqe_size); + goto err_cqb; + } + + cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD; + } + + if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_REAL_TIME_TS) + cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_REAL_TIME_TS; + + MLX5_SET(create_cq_in, *cqb, uid, context->devx_uid); + return 0; + +err_cqb: + kvfree(*cqb); + +err_db: + mlx5_ib_db_unmap_user(context, &cq->db); + +err_umem: + ib_umem_release(cq->buf.umem); + return err; +} + +static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + mlx5_ib_db_unmap_user(context, &cq->db); + ib_umem_release(cq->buf.umem); +} + +static void init_cq_frag_buf(struct mlx5_ib_cq_buf *buf) +{ + int i; + void *cqe; + struct mlx5_cqe64 *cqe64; + + for (i = 0; i < buf->nent; i++) { + cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i); + cqe64 = buf->cqe_size == 64 ? 
cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size, + u32 **cqb, int *index, int *inlen) +{ + __be64 *pas; + void *cqc; + int err; + + err = mlx5_db_alloc(dev->mdev, &cq->db); + if (err) + return err; + + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + + err = alloc_cq_frag_buf(dev, &cq->buf, entries, cqe_size); + if (err) + goto err_db; + + init_cq_frag_buf(&cq->buf); + + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * + cq->buf.frag_buf.npages; + *cqb = kvzalloc(*inlen, GFP_KERNEL); + if (!*cqb) { + err = -ENOMEM; + goto err_buf; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + cq->buf.frag_buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + + *index = dev->mdev->priv.uar->index; + + return 0; + +err_buf: + free_cq_buf(dev, &cq->buf); + +err_db: + mlx5_db_free(dev->mdev, &cq->db); + return err; +} + +static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, &cq->buf); + mlx5_db_free(dev->mdev, &cq->db); +} + +static void notify_soft_wc_handler(struct work_struct *work) +{ + struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq, + notify_work); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct ib_device *ibdev = ibcq->device; + int entries = attr->cqe; + int vector = attr->comp_vector; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + int index; + int inlen; + u32 *cqb = NULL; + void *cqc; + int cqe_size; + int eqn; + int err; + + if (entries < 0 || + (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) + return -EINVAL; + + if (check_cq_create_flags(attr->flags)) + return -EOPNOTSUPP; + + entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) + return -EINVAL; + + cq->ibcq.cqe = entries - 1; + mutex_init(&cq->resize_mutex); + spin_lock_init(&cq->lock); + cq->resize_buf = NULL; + cq->resize_umem = NULL; + cq->create_flags = attr->flags; + INIT_LIST_HEAD(&cq->list_send_qp); + INIT_LIST_HEAD(&cq->list_recv_qp); + + if (udata) { + err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, + &index, &inlen); + if (err) + return err; + } else { + cqe_size = cache_line_size() == 128 ? 
128 : 64; + err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, + &index, &inlen); + if (err) + return err; + + INIT_WORK(&cq->notify_work, notify_soft_wc_handler); + } + + err = mlx5_vector2eqn(dev->mdev, vector, &eqn); + if (err) + goto err_cqb; + + cq->cqe_size = cqe_size; + + cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + MLX5_SET(cqc, cqc, uar_page, index); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); + if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) + MLX5_SET(cqc, cqc, oi, 1); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out)); + if (err) + goto err_cqb; + + mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); + if (udata) + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + else + cq->mcq.comp = mlx5_ib_cq_comp; + cq->mcq.event = mlx5_ib_cq_event; + + INIT_LIST_HEAD(&cq->wc_list); + + if (udata) + if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { + err = -EFAULT; + goto err_cmd; + } + + + kvfree(cqb); + return 0; + +err_cmd: + mlx5_core_destroy_cq(dev->mdev, &cq->mcq); + +err_cqb: + kvfree(cqb); + if (udata) + destroy_cq_user(cq, udata); + else + destroy_cq_kernel(dev, cq); + return err; +} + +int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int ret; + + ret = mlx5_core_destroy_cq(dev->mdev, &mcq->mcq); + if (ret) + return ret; + + if (udata) + destroy_cq_user(mcq, udata); + else + destroy_cq_kernel(dev, mcq); + return 0; +} + +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) +{ + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); +} + +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) +{ + struct mlx5_cqe64 *cqe64, *dest64; + void *cqe, *dest; + u32 prod_index; + int nfreed = 0; + u8 owner_bit; + + if (!cq) + return; + + /* First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++) + if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe) + break; + + /* Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) + mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe); + dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64; + owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; + memcpy(dest, cqe, cq->mcq.cqe_sz); + dest64->op_own = owner_bit | + (dest64->op_own & ~MLX5_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->mcq.cons_index += nfreed; + /* Make sure update of buffer contents is done before + * updating consumer index. 
+ */ + wmb(); + mlx5_cq_set_ci(&cq->mcq); + } +} + +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) +{ + if (!cq) + return; + + spin_lock_irq(&cq->lock); + __mlx5_ib_cq_clean(cq, qpn, srq); + spin_unlock_irq(&cq->lock); +} + +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->device); + struct mlx5_ib_cq *mcq = to_mcq(cq); + int err; + + if (!MLX5_CAP_GEN(dev->mdev, cq_moderation)) + return -EOPNOTSUPP; + + if (cq_period > MLX5_MAX_CQ_PERIOD) + return -EINVAL; + + err = mlx5_core_modify_cq_moderation(dev->mdev, &mcq->mcq, + cq_period, cq_count); + if (err) + mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); + + return err; +} + +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, + int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + /* check multiplication overflow */ + if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) + return -EINVAL; + + umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr, + (size_t)ucmd.cqe_size * entries, + IB_ACCESS_LOCAL_WRITE, 0); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_frag_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_frag_buf(cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while (get_cqe_opcode(scqe64) != MLX5_CQE_RESIZE_CQ) { + dcqe = mlx5_frag_buf_get_wqe(&cq->resize_buf->fbc, + (i + 1) & cq->resize_buf->nent); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? 
scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + void *cqc; + u32 *in; + int err; + int npas; + __be64 *pas; + unsigned int page_offset_quantized = 0; + unsigned int page_shift; + int inlen; + int cqe_size; + unsigned long flags; + + if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1 || + entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d, max %d\n", + entries, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + return -EINVAL; + } + + entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + unsigned long page_size; + + err = resize_user(dev, cq, entries, udata, &cqe_size); + if (err) + goto ex; + + page_size = mlx5_umem_find_best_cq_quantized_pgoff( + cq->resize_umem, cqc, log_page_size, + MLX5_ADAPTER_PAGE_SHIFT, page_offset, 64, + &page_offset_quantized); + if (!page_size) { + err = -EINVAL; + goto ex_resize; + } + npas = ib_umem_num_dma_blocks(cq->resize_umem, page_size); + page_shift = order_base_2(page_size); + } else { + struct mlx5_frag_buf *frag_buf; + + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (err) + goto ex; + frag_buf = &cq->resize_buf->frag_buf; + npas = frag_buf->npages; + page_shift = frag_buf->page_shift; + } + + inlen = MLX5_ST_SZ_BYTES(modify_cq_in) + + MLX5_FLD_SZ_BYTES(modify_cq_in, pas[0]) * npas; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas); + if (udata) + mlx5_ib_populate_pas(cq->resize_umem, 1UL << page_shift, pas, + 0); + else + mlx5_fill_page_frag_array(&cq->resize_buf->frag_buf, pas); + + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.resize_field_select.resize_field_select, + MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(cqc, cqc, page_offset, page_offset_quantized); + MLX5_SET(cqc, cqc, cqe_sz, + cqe_sz_to_mlx_sz(cqe_size, + cq->private_flags & + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + + MLX5_SET(modify_cq_in, in, op_mod, MLX5_CQ_OPMOD_RESIZE); + MLX5_SET(modify_cq_in, in, cqn, cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, 
flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + kvfree(in); + return 0; + +ex_alloc: + kvfree(in); + +ex_resize: + ib_umem_release(cq->resize_umem); + if (!udata) { + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; + } +ex: + mutex_unlock(&cq->resize_mutex); + return err; +} + +int mlx5_ib_get_cqe_size(struct ib_cq *ibcq) +{ + struct mlx5_ib_cq *cq; + + if (!ibcq) + return 128; + + cq = to_mcq(ibcq); + return cq->cqe_size; +} + +/* Called from atomic context */ +void mlx5_ib_generate_wc(struct ib_cq *ibcq, struct mlx5_ib_wc *soft_wc) +{ + struct mlx5_ib_cq *cq = to_mcq(ibcq); + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&soft_wc->list, &cq->wc_list); + atomic_set(&soft_wc->in_use, 1); + if (cq->notify_flags == IB_CQ_NEXT_COMP || + soft_wc->wc.status != IB_WC_SUCCESS) { + cq->notify_flags = 0; + schedule_work(&cq->notify_work); + } + spin_unlock_irqrestore(&cq->lock, flags); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.c new file mode 100644 index 0000000..4ec0222 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.c @@ -0,0 +1,3286 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "ib_rep.h" +#include "devx.h" +#include "qp.h" +#include + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +static void dispatch_event_fd(struct list_head *fd_list, const void *data); + +enum devx_obj_flags { + DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, + DEVX_OBJ_FLAGS_DCT = 1 << 1, + DEVX_OBJ_FLAGS_CQ = 1 << 2, +}; + +struct devx_async_data { + struct mlx5_ib_dev *mdev; + struct list_head list; + struct devx_async_cmd_event_file *ev_file; + struct mlx5_async_work cb_work; + u16 cmd_out_len; + /* must be last field in this structure */ + struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; +}; + +struct devx_async_event_data { + struct list_head list; /* headed in ev_file->event_list */ + struct mlx5_ib_uapi_devx_async_event_hdr hdr; +}; + +/* first level XA value data structure */ +struct devx_event { + struct xarray object_ids; /* second XA level, Key = object id */ + struct list_head unaffiliated_list; +}; + +/* second level XA value data structure */ +struct devx_obj_event { + struct rcu_head rcu; + struct list_head obj_sub_list; +}; + +struct devx_event_subscription { + struct list_head file_list; /* headed in ev_file-> + * subscribed_events_list + */ + struct list_head xa_list; /* headed in devx_event->unaffiliated_list or + * devx_obj_event->obj_sub_list + */ + struct list_head obj_list; /* headed in devx_object */ + struct list_head event_list; /* headed in ev_file->event_list or in + * temp list via subscription + */ + + u8 is_cleaned:1; + u32 xa_key_level1; + u32 xa_key_level2; + struct rcu_head rcu; + u64 cookie; + struct devx_async_event_file *ev_file; + struct eventfd_ctx *eventfd; +}; + +struct devx_async_event_file { + struct ib_uobject uobj; + /* Head of events that are subscribed to this FD */ + struct list_head subscribed_events_list; + spinlock_t lock; + wait_queue_head_t poll_wait; + struct list_head event_list; + struct mlx5_ib_dev *dev; + u8 omit_data:1; + u8 is_overflow_err:1; + u8 is_destroyed:1; +}; + +struct devx_umem { + struct mlx5_core_dev *mdev; + struct 
ib_umem *umem; + u32 dinlen; + u32 dinbox[MLX5_ST_SZ_DW(destroy_umem_in)]; +}; + +struct devx_umem_reg_cmd { + void *in; + u32 inlen; + u32 out[MLX5_ST_SZ_DW(create_umem_out)]; +}; + +static struct mlx5_ib_ucontext * +devx_ufile2uctx(const struct uverbs_attr_bundle *attrs) +{ + return to_mucontext(ib_uverbs_get_ucontext(attrs)); +} + +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +{ + u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {}; + u32 out[MLX5_ST_SZ_DW(create_uctx_out)] = {}; + void *uctx; + int err; + u16 uid; + u32 cap = 0; + + /* 0 means not supported */ + if (!MLX5_CAP_GEN(dev->mdev, log_max_uctx)) + return -EINVAL; + + uctx = MLX5_ADDR_OF(create_uctx_in, in, uctx); + if (is_user && capable(CAP_NET_RAW) && + (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX)) + cap |= MLX5_UCTX_CAP_RAW_TX; + if (is_user && capable(CAP_SYS_RAWIO) && + (MLX5_CAP_GEN(dev->mdev, uctx_cap) & + MLX5_UCTX_CAP_INTERNAL_DEV_RES)) + cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES; + + MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX); + MLX5_SET(uctx, uctx, cap, cap); + + err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + uid = MLX5_GET(create_uctx_out, out, uid); + return uid; +} + +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_uctx_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_uctx_out)] = {}; + + MLX5_SET(destroy_uctx_in, in, opcode, MLX5_CMD_OP_DESTROY_UCTX); + MLX5_SET(destroy_uctx_in, in, uid, uid); + + mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +static bool is_legacy_unaffiliated_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PORT_CHANGE: + return true; + default: + return false; + } +} + +static bool is_legacy_obj_event_num(u16 event_num) +{ + switch (event_num) { + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_CQ_ERROR: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_DCT_DRAINED: + case MLX5_EVENT_TYPE_COMP: + case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION: + case MLX5_EVENT_TYPE_XRQ_ERROR: + return true; + default: + return false; + } +} + +static u16 get_legacy_obj_type(u16 opcode) +{ + switch (opcode) { + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_EVENT_QUEUE_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_EVENT_QUEUE_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_EVENT_QUEUE_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return 0; + } +} + +static u16 get_dec_obj_type(struct devx_obj *obj, u16 event_num) +{ + u16 opcode; + + opcode = (obj->obj_id >> 32) & 0xffff; + + if (is_legacy_obj_event_num(event_num)) + return get_legacy_obj_type(opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + return (obj->obj_id >> 48); + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_OBJ_TYPE_RQ; + case MLX5_CMD_OP_CREATE_QP: + return MLX5_OBJ_TYPE_QP; + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_OBJ_TYPE_SQ; + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_OBJ_TYPE_DCT; + case MLX5_CMD_OP_CREATE_TIR: + return MLX5_OBJ_TYPE_TIR; + case MLX5_CMD_OP_CREATE_TIS: + return MLX5_OBJ_TYPE_TIS; + case MLX5_CMD_OP_CREATE_PSV: + return MLX5_OBJ_TYPE_PSV; + case MLX5_OBJ_TYPE_MKEY: + return 
MLX5_OBJ_TYPE_MKEY; + case MLX5_CMD_OP_CREATE_RMP: + return MLX5_OBJ_TYPE_RMP; + case MLX5_CMD_OP_CREATE_XRC_SRQ: + return MLX5_OBJ_TYPE_XRC_SRQ; + case MLX5_CMD_OP_CREATE_XRQ: + return MLX5_OBJ_TYPE_XRQ; + case MLX5_CMD_OP_CREATE_RQT: + return MLX5_OBJ_TYPE_RQT; + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + return MLX5_OBJ_TYPE_FLOW_COUNTER; + case MLX5_CMD_OP_CREATE_CQ: + return MLX5_OBJ_TYPE_CQ; + default: + return 0; + } +} + +static u16 get_event_obj_type(unsigned long event_type, struct mlx5_eqe *eqe) +{ + switch (event_type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + return eqe->data.qp_srq.type; + case MLX5_EVENT_TYPE_CQ_ERROR: + case MLX5_EVENT_TYPE_XRQ_ERROR: + return 0; + case MLX5_EVENT_TYPE_DCT_DRAINED: + case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION: + return MLX5_EVENT_QUEUE_TYPE_DCT; + default: + return MLX5_GET(affiliated_event_header, &eqe->data, obj_type); + } +} + +static u32 get_dec_obj_id(u64 obj_id) +{ + return (obj_id & 0xffffffff); +} + +/* + * As the obj_id in the firmware is not globally unique the object type + * must be considered upon checking for a valid object id. + * For that the opcode of the creator command is encoded as part of the obj_id. + */ +static u64 get_enc_obj_id(u32 opcode, u32 obj_id) +{ + return ((u64)opcode << 32) | obj_id; +} + +static u32 devx_get_created_obj_id(const void *in, const void *out, u16 opcode) +{ + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + return MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + case MLX5_CMD_OP_CREATE_UMEM: + return MLX5_GET(create_umem_out, out, umem_id); + case MLX5_CMD_OP_CREATE_MKEY: + return MLX5_GET(create_mkey_out, out, mkey_index); + case MLX5_CMD_OP_CREATE_CQ: + return MLX5_GET(create_cq_out, out, cqn); + case MLX5_CMD_OP_ALLOC_PD: + return MLX5_GET(alloc_pd_out, out, pd); + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + return MLX5_GET(alloc_transport_domain_out, out, + transport_domain); + case MLX5_CMD_OP_CREATE_RMP: + return MLX5_GET(create_rmp_out, out, rmpn); + case MLX5_CMD_OP_CREATE_SQ: + return MLX5_GET(create_sq_out, out, sqn); + case MLX5_CMD_OP_CREATE_RQ: + return MLX5_GET(create_rq_out, out, rqn); + case MLX5_CMD_OP_CREATE_RQT: + return MLX5_GET(create_rqt_out, out, rqtn); + case MLX5_CMD_OP_CREATE_TIR: + return MLX5_GET(create_tir_out, out, tirn); + case MLX5_CMD_OP_CREATE_TIS: + return MLX5_GET(create_tis_out, out, tisn); + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + return MLX5_GET(alloc_q_counter_out, out, counter_set_id); + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + return MLX5_GET(create_flow_table_out, out, table_id); + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + return MLX5_GET(create_flow_group_out, out, group_id); + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + return MLX5_GET(set_fte_in, in, flow_index); + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + return MLX5_GET(alloc_flow_counter_out, out, flow_counter_id); + case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: + return MLX5_GET(alloc_packet_reformat_context_out, out, + packet_reformat_id); + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + return MLX5_GET(alloc_modify_header_context_out, out, + modify_header_id); + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + return MLX5_GET(create_scheduling_element_out, out, + 
scheduling_element_id); + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + return MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + return MLX5_GET(set_l2_table_entry_in, in, table_index); + case MLX5_CMD_OP_CREATE_QP: + return MLX5_GET(create_qp_out, out, qpn); + case MLX5_CMD_OP_CREATE_SRQ: + return MLX5_GET(create_srq_out, out, srqn); + case MLX5_CMD_OP_CREATE_XRC_SRQ: + return MLX5_GET(create_xrc_srq_out, out, xrc_srqn); + case MLX5_CMD_OP_CREATE_DCT: + return MLX5_GET(create_dct_out, out, dctn); + case MLX5_CMD_OP_CREATE_XRQ: + return MLX5_GET(create_xrq_out, out, xrqn); + case MLX5_CMD_OP_ATTACH_TO_MCG: + return MLX5_GET(attach_to_mcg_in, in, qpn); + case MLX5_CMD_OP_ALLOC_XRCD: + return MLX5_GET(alloc_xrcd_out, out, xrcd); + case MLX5_CMD_OP_CREATE_PSV: + return MLX5_GET(create_psv_out, out, psv0_index); + default: + /* The entry must match to one of the devx_is_obj_create_cmd */ + WARN_ON(true); + return 0; + } +} + +static u64 devx_get_obj_id(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + u64 obj_id; + + switch (opcode) { + case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: + case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_GENERAL_OBJECT | + MLX5_GET(general_obj_in_cmd_hdr, in, + obj_type) << 16, + MLX5_GET(general_obj_in_cmd_hdr, in, + obj_id)); + break; + case MLX5_CMD_OP_QUERY_MKEY: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_MKEY, + MLX5_GET(query_mkey_in, in, + mkey_index)); + break; + case MLX5_CMD_OP_QUERY_CQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ, + MLX5_GET(query_cq_in, in, cqn)); + break; + case MLX5_CMD_OP_MODIFY_CQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ, + MLX5_GET(modify_cq_in, in, cqn)); + break; + case MLX5_CMD_OP_QUERY_SQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ, + MLX5_GET(query_sq_in, in, sqn)); + break; + case MLX5_CMD_OP_MODIFY_SQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ, + MLX5_GET(modify_sq_in, in, sqn)); + break; + case MLX5_CMD_OP_QUERY_RQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(query_rq_in, in, rqn)); + break; + case MLX5_CMD_OP_MODIFY_RQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(modify_rq_in, in, rqn)); + break; + case MLX5_CMD_OP_QUERY_RMP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP, + MLX5_GET(query_rmp_in, in, rmpn)); + break; + case MLX5_CMD_OP_MODIFY_RMP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP, + MLX5_GET(modify_rmp_in, in, rmpn)); + break; + case MLX5_CMD_OP_QUERY_RQT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT, + MLX5_GET(query_rqt_in, in, rqtn)); + break; + case MLX5_CMD_OP_MODIFY_RQT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT, + MLX5_GET(modify_rqt_in, in, rqtn)); + break; + case MLX5_CMD_OP_QUERY_TIR: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR, + MLX5_GET(query_tir_in, in, tirn)); + break; + case MLX5_CMD_OP_MODIFY_TIR: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR, + MLX5_GET(modify_tir_in, in, tirn)); + break; + case MLX5_CMD_OP_QUERY_TIS: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS, + MLX5_GET(query_tis_in, in, tisn)); + break; + case MLX5_CMD_OP_MODIFY_TIS: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS, + MLX5_GET(modify_tis_in, in, tisn)); + break; + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE, + MLX5_GET(query_flow_table_in, in, + table_id)); + break; + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE, + MLX5_GET(modify_flow_table_in, 
in, + table_id)); + break; + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_GROUP, + MLX5_GET(query_flow_group_in, in, + group_id)); + break; + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY, + MLX5_GET(query_fte_in, in, + flow_index)); + break; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY, + MLX5_GET(set_fte_in, in, flow_index)); + break; + case MLX5_CMD_OP_QUERY_Q_COUNTER: + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_Q_COUNTER, + MLX5_GET(query_q_counter_in, in, + counter_set_id)); + break; + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_FLOW_COUNTER, + MLX5_GET(query_flow_counter_in, in, + flow_counter_id)); + break; + case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT, + MLX5_GET(query_modify_header_context_in, + in, modify_header_id)); + break; + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT, + MLX5_GET(query_scheduling_element_in, + in, scheduling_element_id)); + break; + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT, + MLX5_GET(modify_scheduling_element_in, + in, scheduling_element_id)); + break; + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT, + MLX5_GET(add_vxlan_udp_dport_in, in, + vxlan_udp_port)); + break; + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY, + MLX5_GET(query_l2_table_entry_in, in, + table_index)); + break; + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY, + MLX5_GET(set_l2_table_entry_in, in, + table_index)); + break; + case MLX5_CMD_OP_QUERY_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(query_qp_in, in, qpn)); + break; + case MLX5_CMD_OP_RST2INIT_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rst2init_qp_in, in, qpn)); + break; + case MLX5_CMD_OP_INIT2INIT_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(init2init_qp_in, in, qpn)); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(init2rtr_qp_in, in, qpn)); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rtr2rts_qp_in, in, qpn)); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rts2rts_qp_in, in, qpn)); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(sqerr2rts_qp_in, in, qpn)); + break; + case MLX5_CMD_OP_2ERR_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(qp_2err_in, in, qpn)); + break; + case MLX5_CMD_OP_2RST_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(qp_2rst_in, in, qpn)); + break; + case MLX5_CMD_OP_QUERY_DCT: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, + MLX5_GET(query_dct_in, in, dctn)); + break; + case MLX5_CMD_OP_QUERY_XRQ: + case MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY: + case MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ, + MLX5_GET(query_xrq_in, in, xrqn)); + break; + case MLX5_CMD_OP_QUERY_XRC_SRQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ, + MLX5_GET(query_xrc_srq_in, in, + xrc_srqn)); + break; + case MLX5_CMD_OP_ARM_XRC_SRQ: + obj_id = 
get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ, + MLX5_GET(arm_xrc_srq_in, in, xrc_srqn)); + break; + case MLX5_CMD_OP_QUERY_SRQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SRQ, + MLX5_GET(query_srq_in, in, srqn)); + break; + case MLX5_CMD_OP_ARM_RQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(arm_rq_in, in, srq_number)); + break; + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, + MLX5_GET(drain_dct_in, in, dctn)); + break; + case MLX5_CMD_OP_ARM_XRQ: + case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY: + case MLX5_CMD_OP_RELEASE_XRQ_ERROR: + case MLX5_CMD_OP_MODIFY_XRQ: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ, + MLX5_GET(arm_xrq_in, in, xrqn)); + break; + case MLX5_CMD_OP_QUERY_PACKET_REFORMAT_CONTEXT: + obj_id = get_enc_obj_id + (MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT, + MLX5_GET(query_packet_reformat_context_in, + in, packet_reformat_id)); + break; + default: + obj_id = 0; + } + + return obj_id; +} + +static bool devx_is_valid_obj_id(struct uverbs_attr_bundle *attrs, + struct ib_uobject *uobj, const void *in) +{ + struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata); + u64 obj_id = devx_get_obj_id(in); + + if (!obj_id) + return false; + + switch (uobj_get_object_id(uobj)) { + case UVERBS_OBJECT_CQ: + return get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ, + to_mcq(uobj->object)->mcq.cqn) == + obj_id; + + case UVERBS_OBJECT_SRQ: + { + struct mlx5_core_srq *srq = &(to_msrq(uobj->object)->msrq); + u16 opcode; + + switch (srq->common.res) { + case MLX5_RES_XSRQ: + opcode = MLX5_CMD_OP_CREATE_XRC_SRQ; + break; + case MLX5_RES_XRQ: + opcode = MLX5_CMD_OP_CREATE_XRQ; + break; + default: + if (!dev->mdev->issi) + opcode = MLX5_CMD_OP_CREATE_SRQ; + else + opcode = MLX5_CMD_OP_CREATE_RMP; + } + + return get_enc_obj_id(opcode, + to_msrq(uobj->object)->msrq.srqn) == + obj_id; + } + + case UVERBS_OBJECT_QP: + { + struct mlx5_ib_qp *qp = to_mqp(uobj->object); + + if (qp->type == IB_QPT_RAW_PACKET || + (qp->flags & IB_QP_CREATE_SOURCE_QPN)) { + struct mlx5_ib_raw_packet_qp *raw_packet_qp = + &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + + return (get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + rq->base.mqp.qpn) == obj_id || + get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ, + sq->base.mqp.qpn) == obj_id || + get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR, + rq->tirn) == obj_id || + get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS, + sq->tisn) == obj_id); + } + + if (qp->type == MLX5_IB_QPT_DCT) + return get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, + qp->dct.mdct.mqp.qpn) == obj_id; + return get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + qp->ibqp.qp_num) == obj_id; + } + + case UVERBS_OBJECT_WQ: + return get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + to_mrwq(uobj->object)->core_qp.qpn) == + obj_id; + + case UVERBS_OBJECT_RWQ_IND_TBL: + return get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT, + to_mrwq_ind_table(uobj->object)->rqtn) == + obj_id; + + case MLX5_IB_OBJECT_DEVX_OBJ: + return ((struct devx_obj *)uobj->object)->obj_id == obj_id; + + default: + return false; + } +} + +static void devx_set_umem_valid(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_MKEY: + MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); + break; + case MLX5_CMD_OP_CREATE_CQ: + { + void *cqc; + + MLX5_SET(create_cq_in, in, cq_umem_valid, 1); + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, dbr_umem_valid, 1); + break; + } + case MLX5_CMD_OP_CREATE_QP: + { + void 
*qpc; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, dbr_umem_valid, 1); + MLX5_SET(create_qp_in, in, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_RQ: + { + void *rqc, *wq; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_SQ: + { + void *sqc, *wq; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_MODIFY_CQ: + MLX5_SET(modify_cq_in, in, cq_umem_valid, 1); + break; + + case MLX5_CMD_OP_CREATE_RMP: + { + void *rmpc, *wq; + + rmpc = MLX5_ADDR_OF(create_rmp_in, in, ctx); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_XRQ: + { + void *xrqc, *wq; + + xrqc = MLX5_ADDR_OF(create_xrq_in, in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_XRC_SRQ: + { + void *xrc_srqc; + + MLX5_SET(create_xrc_srq_in, in, xrc_srq_umem_valid, 1); + xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xrc_srqc, dbr_umem_valid, 1); + break; + } + + default: + return; + } +} + +static bool devx_is_obj_create_cmd(const void *in, u16 *opcode) +{ + *opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (*opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + case MLX5_CMD_OP_CREATE_MKEY: + case MLX5_CMD_OP_CREATE_CQ: + case MLX5_CMD_OP_ALLOC_PD: + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + case MLX5_CMD_OP_CREATE_RMP: + case MLX5_CMD_OP_CREATE_SQ: + case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_CREATE_RQT: + case MLX5_CMD_OP_CREATE_TIR: + case MLX5_CMD_OP_CREATE_TIS: + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_CREATE_QP: + case MLX5_CMD_OP_CREATE_SRQ: + case MLX5_CMD_OP_CREATE_XRC_SRQ: + case MLX5_CMD_OP_CREATE_DCT: + case MLX5_CMD_OP_CREATE_XRQ: + case MLX5_CMD_OP_ATTACH_TO_MCG: + case MLX5_CMD_OP_ALLOC_XRCD: + return true; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + { + u16 op_mod = MLX5_GET(set_fte_in, in, op_mod); + if (op_mod == 0) + return true; + return false; + } + case MLX5_CMD_OP_CREATE_PSV: + { + u8 num_psv = MLX5_GET(create_psv_in, in, num_psv); + + if (num_psv == 1) + return true; + return false; + } + default: + return false; + } +} + +static bool devx_is_obj_modify_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: + case MLX5_CMD_OP_MODIFY_CQ: + case MLX5_CMD_OP_MODIFY_RMP: + case MLX5_CMD_OP_MODIFY_SQ: + case MLX5_CMD_OP_MODIFY_RQ: + case MLX5_CMD_OP_MODIFY_RQT: + case MLX5_CMD_OP_MODIFY_TIR: + case MLX5_CMD_OP_MODIFY_TIS: + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_RST2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case 
MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + case MLX5_CMD_OP_SQERR2RTS_QP: + case MLX5_CMD_OP_2ERR_QP: + case MLX5_CMD_OP_2RST_QP: + case MLX5_CMD_OP_ARM_XRC_SRQ: + case MLX5_CMD_OP_ARM_RQ: + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + case MLX5_CMD_OP_ARM_XRQ: + case MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY: + case MLX5_CMD_OP_RELEASE_XRQ_ERROR: + case MLX5_CMD_OP_MODIFY_XRQ: + return true; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + { + u16 op_mod = MLX5_GET(set_fte_in, in, op_mod); + + if (op_mod == 1) + return true; + return false; + } + default: + return false; + } +} + +static bool devx_is_obj_query_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: + case MLX5_CMD_OP_QUERY_MKEY: + case MLX5_CMD_OP_QUERY_CQ: + case MLX5_CMD_OP_QUERY_RMP: + case MLX5_CMD_OP_QUERY_SQ: + case MLX5_CMD_OP_QUERY_RQ: + case MLX5_CMD_OP_QUERY_RQT: + case MLX5_CMD_OP_QUERY_TIR: + case MLX5_CMD_OP_QUERY_TIS: + case MLX5_CMD_OP_QUERY_Q_COUNTER: + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_QP: + case MLX5_CMD_OP_QUERY_SRQ: + case MLX5_CMD_OP_QUERY_XRC_SRQ: + case MLX5_CMD_OP_QUERY_DCT: + case MLX5_CMD_OP_QUERY_XRQ: + case MLX5_CMD_OP_QUERY_XRQ_DC_PARAMS_ENTRY: + case MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS: + case MLX5_CMD_OP_QUERY_PACKET_REFORMAT_CONTEXT: + return true; + default: + return false; + } +} + +static bool devx_is_whitelist_cmd(void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS: + return true; + default: + return false; + } +} + +static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in) +{ + if (devx_is_whitelist_cmd(cmd_in)) { + struct mlx5_ib_dev *dev; + + if (c->devx_uid) + return c->devx_uid; + + dev = to_mdev(c->ibucontext.device); + if (dev->devx_whitelist_uid) + return dev->devx_whitelist_uid; + + return -EOPNOTSUPP; + } + + if (!c->devx_uid) + return -EINVAL; + + return c->devx_uid; +} + +static bool devx_is_general_cmd(void *in, struct mlx5_ib_dev *dev) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + /* Pass all cmds for vhca_tunnel as general, tracking is done in FW */ + if ((MLX5_CAP_GEN_64(dev->mdev, vhca_tunnel_commands) && + MLX5_GET(general_obj_in_cmd_hdr, in, vhca_tunnel_id)) || + (opcode >= MLX5_CMD_OP_GENERAL_START && + opcode < MLX5_CMD_OP_GENERAL_END)) + return true; + + switch (opcode) { + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_VPORT_STATE: + case MLX5_CMD_OP_QUERY_ADAPTER: + case MLX5_CMD_OP_QUERY_ISSI: + case MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ROCE_ADDRESS: + case MLX5_CMD_OP_QUERY_VNIC_ENV: + case MLX5_CMD_OP_QUERY_VPORT_COUNTER: + case MLX5_CMD_OP_GET_DROPPED_PACKET_LOG: + case MLX5_CMD_OP_NOP: + case MLX5_CMD_OP_QUERY_CONG_STATUS: + case MLX5_CMD_OP_QUERY_CONG_PARAMS: + case MLX5_CMD_OP_QUERY_CONG_STATISTICS: + case MLX5_CMD_OP_QUERY_LAG: + case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS: + return true; + default: + return 
false; + } +} + +static int mlx5_ib_fill_vport_vhca_id(struct mlx5_core_dev *mdev, + struct uverbs_attr_bundle *attrs, + u16 vport_num, + u64 *comp_mask) +{ + size_t out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + u32 in [MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; + u16 vport_vhca_id; + int err = 0; + void *out; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + if (!uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_VHCA_ID) || + (vport_num == MLX5_VPORT_UPLINK)) + goto out; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, other_function, true); + MLX5_SET(query_hca_cap_in, in, function_id, vport_num); + MLX5_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + vport_vhca_id = MLX5_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.vhca_id); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_VHCA_ID, + &vport_vhca_id, sizeof(vport_vhca_id)); + if (!err) + *comp_mask |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID_OLD; + +out: + kfree(out); + return err; +} + +static int mlx5_ib_fill_vport_icm_addr(struct mlx5_core_dev *mdev, + struct uverbs_attr_bundle *attrs, + u16 vport_num, + u64 *comp_mask) +{ + u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {}; + u64 icm_rx; + u64 icm_tx; + int err; + + if (vport_num == MLX5_VPORT_UPLINK) { + /* TODO: FDB sw_owner check */ + icm_rx = MLX5_CAP_ESW_FLOWTABLE_64(mdev, sw_steering_uplink_icm_address_rx); + icm_tx = MLX5_CAP_ESW_FLOWTABLE_64(mdev, sw_steering_uplink_icm_address_tx); + } else { + MLX5_SET(query_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); + MLX5_SET(query_esw_vport_context_in, in, vport_number, + vport_num); + MLX5_SET(query_esw_vport_context_in, in, other_vport, true); + + err = mlx5_cmd_exec_inout(mdev, query_esw_vport_context, in, out); + + if (err) + return err; + + icm_rx = MLX5_GET64(query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_rx); + + icm_tx = MLX5_GET64(query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_tx); + } + + /* TODO: FDB sw_owner check */ + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_ICM_RX) && + icm_rx) { + if (uverbs_copy_to(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_ICM_RX, + &icm_rx, sizeof(icm_rx))) + return -EFAULT; + *comp_mask |= MLX5_IB_UAPI_QUERY_PORT_VPORT_ICM_RX_OLD; + } + + /* TODO: FDB sw_owner check */ + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_ICM_TX) && + icm_tx) { + if (uverbs_copy_to(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_ICM_TX, + &icm_tx, sizeof(icm_tx))) + return -EFAULT; + *comp_mask |= MLX5_IB_UAPI_QUERY_PORT_VPORT_ICM_TX_OLD; + } + + return 0; +} + +static int mlx5_ib_fill_vport_ctx(struct mlx5_ib_dev *dev, + struct uverbs_attr_bundle *attrs, + u32 port_num) +{ + struct mlx5_eswitch_rep *rep; + struct mlx5_core_dev *mdev; + u64 comp_mask = 0; + u16 vport_num; + int err; + + rep = dev->port[port_num - 1].rep; + + if (!rep) + goto fill_comp_mask; + + mdev = mlx5_eswitch_get_core_dev(rep->esw); + vport_num = rep->vport; + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT)) { + if (uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT, + &vport_num, sizeof(vport_num))) + return -EFAULT; + comp_mask |= 
MLX5_IB_UAPI_QUERY_PORT_VPORT_OLD; + } + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_ESW_OWNER_VHCA_ID)) { + u16 vhca_id = MLX5_CAP_GEN(mdev, vhca_id); + + if (uverbs_copy_to(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_ESW_OWNER_VHCA_ID, + &vhca_id, sizeof(vhca_id))) + return -EFAULT; + comp_mask |= MLX5_IB_UAPI_QUERY_PORT_ESW_OWNER_VHCA_ID_OLD; + } + + err = mlx5_ib_fill_vport_vhca_id(mdev, attrs, vport_num, &comp_mask); + if (err) + return err; + + err = mlx5_ib_fill_vport_icm_addr(mdev, attrs, vport_num, &comp_mask); + if (err) + return err; + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_MATCH_REG_C_0) && + mlx5_ib_eswitch_vport_match_metadata_enabled(rep->esw)) { + struct mlx5_ib_uapi_devx_reg_32 reg_c0 = {}; + reg_c0.value = + mlx5_ib_eswitch_get_vport_metadata_for_match(rep->esw, + rep->vport); + reg_c0.mask = mlx5_ib_eswitch_get_vport_metadata_mask(); + + if (uverbs_copy_to(attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_MATCH_REG_C_0, + ®_c0, sizeof(reg_c0))) + return -EFAULT; + comp_mask |= MLX5_IB_UAPI_QUERY_PORT_MATCH_REG_C_0_OLD; + } + +fill_comp_mask: + return uverbs_set_flags64(attrs, MLX5_IB_ATTR_DEVX_QUERY_PORT_COMP_MASK, + comp_mask); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_PORT)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + u32 port_num; + + if (uverbs_copy_from(&port_num, attrs, + MLX5_IB_ATTR_DEVX_QUERY_PORT_NUM)) + return -EFAULT; + + c = devx_ufile2uctx(attrs); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + if (!rdma_is_port_valid(&dev->ib_dev, port_num)) + return -EINVAL; + + if (!is_mdev_switchdev_mode(dev->mdev)) + return -EOPNOTSUPP; + + return mlx5_ib_fill_vport_ctx(dev, attrs, port_num); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int user_vector; + int dev_eqn; + int err; + + if (uverbs_copy_from(&user_vector, attrs, + MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC)) + return -EFAULT; + + c = devx_ufile2uctx(attrs); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn); + if (err < 0) + return err; + + if (uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, + &dev_eqn, sizeof(dev_eqn))) + return -EFAULT; + + return 0; +} + +/* + *Security note: + * The hardware protection mechanism works like this: Each device object that + * is subject to UAR doorbells (QP/SQ/CQ) gets a UAR ID (called uar_page in + * the device specification manual) upon its creation. Then upon doorbell, + * hardware fetches the object context for which the doorbell was rang, and + * validates that the UAR through which the DB was rang matches the UAR ID + * of the object. + * If no match the doorbell is silently ignored by the hardware. Of course, + * the user cannot ring a doorbell on a UAR that was not mapped to it. + * Now in devx, as the devx kernel does not manipulate the QP/SQ/CQ command + * mailboxes (except tagging them with UID), we expose to the user its UAR + * ID, so it can embed it in these objects in the expected specification + * format. So the only thing the user can do is hurt itself by creating a + * QP/SQ/CQ with a UAR ID other than his, and then in this case other users + * may ring a doorbell on its objects. 
+ * The consequence of that will be that another user can schedule a QP/SQ + * of the buggy user for execution (just insert it to the hardware schedule + * queue or arm its CQ for event generation), no further harm is expected. + */ +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_UAR)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + u32 user_idx; + s32 dev_idx; + + c = devx_ufile2uctx(attrs); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + if (uverbs_copy_from(&user_idx, attrs, + MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX)) + return -EFAULT; + + dev_idx = bfregn_to_uar_index(dev, &c->bfregi, user_idx, true); + if (dev_idx < 0) + return dev_idx; + + if (uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, + &dev_idx, sizeof(dev_idx))) + return -EFAULT; + + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + void *cmd_in = uverbs_attr_get_alloced_ptr( + attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT); + void *cmd_out; + int err, err2; + int uid; + + c = devx_ufile2uctx(attrs); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; + + /* Only white list of some general HCA commands are allowed for this method. */ + if (!devx_is_general_cmd(cmd_in, dev)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + err = mlx5_cmd_do(dev->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN), + cmd_out, cmd_out_len); + if (err && err != -EREMOTEIO) + return err; + + err2 = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, cmd_out, + cmd_out_len); + + return err2 ?: err; +} + +static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, + u32 *dinlen, + u32 *obj_id) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + u16 uid = MLX5_GET(general_obj_in_cmd_hdr, in, uid); + + *obj_id = devx_get_created_obj_id(in, out, opcode); + *dinlen = MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr); + MLX5_SET(general_obj_in_cmd_hdr, din, uid, uid); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, din, obj_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, obj_type, + MLX5_GET(general_obj_in_cmd_hdr, in, obj_type)); + break; + + case MLX5_CMD_OP_CREATE_UMEM: + MLX5_SET(destroy_umem_in, din, opcode, + MLX5_CMD_OP_DESTROY_UMEM); + MLX5_SET(destroy_umem_in, din, umem_id, *obj_id); + break; + case MLX5_CMD_OP_CREATE_MKEY: + MLX5_SET(destroy_mkey_in, din, opcode, + MLX5_CMD_OP_DESTROY_MKEY); + MLX5_SET(destroy_mkey_in, din, mkey_index, *obj_id); + break; + case MLX5_CMD_OP_CREATE_CQ: + MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, din, cqn, *obj_id); + break; + case MLX5_CMD_OP_ALLOC_PD: + MLX5_SET(dealloc_pd_in, din, opcode, MLX5_CMD_OP_DEALLOC_PD); + MLX5_SET(dealloc_pd_in, din, pd, *obj_id); + break; + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + MLX5_SET(dealloc_transport_domain_in, din, opcode, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + MLX5_SET(dealloc_transport_domain_in, din, transport_domain, + *obj_id); + break; + case 
MLX5_CMD_OP_CREATE_RMP: + MLX5_SET(destroy_rmp_in, din, opcode, MLX5_CMD_OP_DESTROY_RMP); + MLX5_SET(destroy_rmp_in, din, rmpn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_SQ: + MLX5_SET(destroy_sq_in, din, opcode, MLX5_CMD_OP_DESTROY_SQ); + MLX5_SET(destroy_sq_in, din, sqn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_RQ: + MLX5_SET(destroy_rq_in, din, opcode, MLX5_CMD_OP_DESTROY_RQ); + MLX5_SET(destroy_rq_in, din, rqn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_RQT: + MLX5_SET(destroy_rqt_in, din, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, din, rqtn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_TIR: + MLX5_SET(destroy_tir_in, din, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, din, tirn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_TIS: + MLX5_SET(destroy_tis_in, din, opcode, MLX5_CMD_OP_DESTROY_TIS); + MLX5_SET(destroy_tis_in, din, tisn, *obj_id); + break; + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + MLX5_SET(dealloc_q_counter_in, din, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + MLX5_SET(dealloc_q_counter_in, din, counter_set_id, *obj_id); + break; + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + *dinlen = MLX5_ST_SZ_BYTES(destroy_flow_table_in); + MLX5_SET(destroy_flow_table_in, din, other_vport, + MLX5_GET(create_flow_table_in, in, other_vport)); + MLX5_SET(destroy_flow_table_in, din, vport_number, + MLX5_GET(create_flow_table_in, in, vport_number)); + MLX5_SET(destroy_flow_table_in, din, table_type, + MLX5_GET(create_flow_table_in, in, table_type)); + MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id); + MLX5_SET(destroy_flow_table_in, din, opcode, + MLX5_CMD_OP_DESTROY_FLOW_TABLE); + break; + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + *dinlen = MLX5_ST_SZ_BYTES(destroy_flow_group_in); + MLX5_SET(destroy_flow_group_in, din, other_vport, + MLX5_GET(create_flow_group_in, in, other_vport)); + MLX5_SET(destroy_flow_group_in, din, vport_number, + MLX5_GET(create_flow_group_in, in, vport_number)); + MLX5_SET(destroy_flow_group_in, din, table_type, + MLX5_GET(create_flow_group_in, in, table_type)); + MLX5_SET(destroy_flow_group_in, din, table_id, + MLX5_GET(create_flow_group_in, in, table_id)); + MLX5_SET(destroy_flow_group_in, din, group_id, *obj_id); + MLX5_SET(destroy_flow_group_in, din, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); + break; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + *dinlen = MLX5_ST_SZ_BYTES(delete_fte_in); + MLX5_SET(delete_fte_in, din, other_vport, + MLX5_GET(set_fte_in, in, other_vport)); + MLX5_SET(delete_fte_in, din, vport_number, + MLX5_GET(set_fte_in, in, vport_number)); + MLX5_SET(delete_fte_in, din, table_type, + MLX5_GET(set_fte_in, in, table_type)); + MLX5_SET(delete_fte_in, din, table_id, + MLX5_GET(set_fte_in, in, table_id)); + MLX5_SET(delete_fte_in, din, flow_index, *obj_id); + MLX5_SET(delete_fte_in, din, opcode, + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); + break; + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + MLX5_SET(dealloc_flow_counter_in, din, opcode, + MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); + MLX5_SET(dealloc_flow_counter_in, din, flow_counter_id, + *obj_id); + break; + case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: + MLX5_SET(dealloc_packet_reformat_context_in, din, opcode, + MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT); + MLX5_SET(dealloc_packet_reformat_context_in, din, + packet_reformat_id, *obj_id); + break; + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + MLX5_SET(dealloc_modify_header_context_in, din, opcode, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); + MLX5_SET(dealloc_modify_header_context_in, din, + modify_header_id, *obj_id); + 
break; + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + *dinlen = MLX5_ST_SZ_BYTES(destroy_scheduling_element_in); + MLX5_SET(destroy_scheduling_element_in, din, + scheduling_hierarchy, + MLX5_GET(create_scheduling_element_in, in, + scheduling_hierarchy)); + MLX5_SET(destroy_scheduling_element_in, din, + scheduling_element_id, *obj_id); + MLX5_SET(destroy_scheduling_element_in, din, opcode, + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT); + break; + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + *dinlen = MLX5_ST_SZ_BYTES(delete_vxlan_udp_dport_in); + MLX5_SET(delete_vxlan_udp_dport_in, din, vxlan_udp_port, *obj_id); + MLX5_SET(delete_vxlan_udp_dport_in, din, opcode, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); + break; + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + *dinlen = MLX5_ST_SZ_BYTES(delete_l2_table_entry_in); + MLX5_SET(delete_l2_table_entry_in, din, table_index, *obj_id); + MLX5_SET(delete_l2_table_entry_in, din, opcode, + MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); + break; + case MLX5_CMD_OP_CREATE_QP: + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, din, qpn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_SRQ: + MLX5_SET(destroy_srq_in, din, opcode, MLX5_CMD_OP_DESTROY_SRQ); + MLX5_SET(destroy_srq_in, din, srqn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_XRC_SRQ: + MLX5_SET(destroy_xrc_srq_in, din, opcode, + MLX5_CMD_OP_DESTROY_XRC_SRQ); + MLX5_SET(destroy_xrc_srq_in, din, xrc_srqn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_DCT: + MLX5_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, din, dctn, *obj_id); + break; + case MLX5_CMD_OP_CREATE_XRQ: + MLX5_SET(destroy_xrq_in, din, opcode, MLX5_CMD_OP_DESTROY_XRQ); + MLX5_SET(destroy_xrq_in, din, xrqn, *obj_id); + break; + case MLX5_CMD_OP_ATTACH_TO_MCG: + *dinlen = MLX5_ST_SZ_BYTES(detach_from_mcg_in); + MLX5_SET(detach_from_mcg_in, din, qpn, + MLX5_GET(attach_to_mcg_in, in, qpn)); + memcpy(MLX5_ADDR_OF(detach_from_mcg_in, din, multicast_gid), + MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid), + MLX5_FLD_SZ_BYTES(attach_to_mcg_in, multicast_gid)); + MLX5_SET(detach_from_mcg_in, din, opcode, + MLX5_CMD_OP_DETACH_FROM_MCG); + MLX5_SET(detach_from_mcg_in, din, qpn, *obj_id); + break; + case MLX5_CMD_OP_ALLOC_XRCD: + MLX5_SET(dealloc_xrcd_in, din, opcode, + MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, din, xrcd, *obj_id); + break; + case MLX5_CMD_OP_CREATE_PSV: + MLX5_SET(destroy_psv_in, din, opcode, + MLX5_CMD_OP_DESTROY_PSV); + MLX5_SET(destroy_psv_in, din, psvn, *obj_id); + break; + default: + /* The entry must match to one of the devx_is_obj_create_cmd */ + WARN_ON(true); + break; + } +} + +static int devx_handle_mkey_indirect(struct devx_obj *obj, + struct mlx5_ib_dev *dev, + void *in, void *out) +{ + struct mlx5_ib_mkey *mkey = &obj->mkey; + void *mkc; + u8 key; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + key = MLX5_GET(mkc, mkc, mkey_7_0); + mkey->key = mlx5_idx_to_mkey( + MLX5_GET(create_mkey_out, out, mkey_index)) | key; + mkey->type = MLX5_MKEY_INDIRECT_DEVX; + mkey->ndescs = MLX5_GET(mkc, mkc, translations_octword_size); + init_waitqueue_head(&mkey->wait); + + return mlx5r_store_odp_mkey(dev, mkey); +} + +static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, + struct devx_obj *obj, + void *in, int in_len) +{ + int min_len = MLX5_BYTE_OFF(create_mkey_in, memory_key_mkey_entry) + + MLX5_FLD_SZ_BYTES(create_mkey_in, + memory_key_mkey_entry); + void *mkc; + u8 access_mode; + + if (in_len < min_len) + return -EINVAL; + + mkc = 
MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + access_mode = MLX5_GET(mkc, mkc, access_mode_1_0); + access_mode |= MLX5_GET(mkc, mkc, access_mode_4_2) << 2; + + if (access_mode == MLX5_MKC_ACCESS_MODE_KLMS || + access_mode == MLX5_MKC_ACCESS_MODE_KSM) { + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + obj->flags |= DEVX_OBJ_FLAGS_INDIRECT_MKEY; + return 0; + } + + MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); + return 0; +} + +static void devx_cleanup_subscription(struct mlx5_ib_dev *dev, + struct devx_event_subscription *sub) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + if (sub->is_cleaned) + return; + + sub->is_cleaned = 1; + list_del_rcu(&sub->xa_list); + + if (list_empty(&sub->obj_list)) + return; + + list_del_rcu(&sub->obj_list); + /* check whether key level 1 for this obj_sub_list is empty */ + event = xa_load(&dev->devx_event_table.event_xa, + sub->xa_key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, sub->xa_key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + sub->xa_key_level2); + kfree_rcu(xa_val_level2, rcu); + } +} + +static int devx_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct mlx5_devx_event_table *devx_event_table; + struct devx_obj *obj = uobject->object; + struct devx_event_subscription *sub_entry, *tmp; + struct mlx5_ib_dev *dev; + int ret; + + dev = mlx5_udata_to_mdev(&attrs->driver_udata); + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY && + xa_erase(&obj->ib_dev->odp_mkeys, + mlx5_base_mkey(obj->mkey.key))) + /* + * The pagefault_single_data_segment() does commands against + * the mmkey, we must wait for that to stop before freeing the + * mkey, as another allocation could get the same mkey #. 
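+ * mlx5r_deref_wait_odp_mkey() below waits until no ODP page-fault handler + * still holds a reference on this mkey before the destroy command is issued.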
+ */ + mlx5r_deref_wait_odp_mkey(&obj->mkey); + + if (obj->flags & DEVX_OBJ_FLAGS_DCT) + ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); + else if (obj->flags & DEVX_OBJ_FLAGS_CQ) + ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); + else + ret = mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, + obj->dinlen, out, sizeof(out)); + if (ret) + return ret; + + devx_event_table = &dev->devx_event_table; + + mutex_lock(&devx_event_table->event_xa_lock); + list_for_each_entry_safe(sub_entry, tmp, &obj->event_sub, obj_list) + devx_cleanup_subscription(dev, sub_entry); + mutex_unlock(&devx_event_table->event_xa_lock); + + kfree(obj); + return ret; +} + +static void devx_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) +{ + struct devx_obj *obj = container_of(mcq, struct devx_obj, core_cq); + struct mlx5_devx_event_table *table; + struct devx_event *event; + struct devx_obj_event *obj_event; + u32 obj_id = mcq->cqn; + + table = &obj->ib_dev->devx_event_table; + rcu_read_lock(); + event = xa_load(&table->event_xa, MLX5_EVENT_TYPE_COMP); + if (!event) + goto out; + + obj_event = xa_load(&event->object_ids, obj_id); + if (!obj_event) + goto out; + + dispatch_event_fd(&obj_event->obj_sub_list, eqe); +out: + rcu_read_unlock(); +} + +static bool is_apu_cq(struct mlx5_ib_dev *dev, const void *in) +{ + if (!MLX5_CAP_GEN(dev->mdev, apu) || + !MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), apu_cq)) + return false; + + return true; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT); + int cmd_in_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN); + void *cmd_out; + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct devx_obj *obj; + u16 obj_type = 0; + int err, err2 = 0; + int uid; + u32 obj_id; + u16 opcode; + + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; + + if (!devx_is_obj_create_cmd(cmd_in, &opcode)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + obj = kzalloc(sizeof(struct devx_obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + if (opcode == MLX5_CMD_OP_CREATE_MKEY) { + err = devx_handle_mkey_create(dev, obj, cmd_in, cmd_in_len); + if (err) + goto obj_free; + } else { + devx_set_umem_valid(cmd_in); + } + + if (opcode == MLX5_CMD_OP_CREATE_DCT) { + obj->flags |= DEVX_OBJ_FLAGS_DCT; + err = mlx5_core_create_dct(dev, &obj->core_dct, cmd_in, + cmd_in_len, cmd_out, cmd_out_len); + } else if (opcode == MLX5_CMD_OP_CREATE_CQ && + !is_apu_cq(dev, cmd_in)) { + obj->flags |= DEVX_OBJ_FLAGS_CQ; + obj->core_cq.comp = devx_cq_comp; + err = mlx5_create_cq(dev->mdev, &obj->core_cq, + cmd_in, cmd_in_len, cmd_out, + cmd_out_len); + } else { + err = mlx5_cmd_do(dev->mdev, cmd_in, cmd_in_len, + cmd_out, cmd_out_len); + } + + if (err == -EREMOTEIO) + err2 = uverbs_copy_to(attrs, + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, + cmd_out, 
cmd_out_len); + if (err) + goto obj_free; + + if (opcode == MLX5_CMD_OP_ALLOC_FLOW_COUNTER) { + u8 bulk = MLX5_GET(alloc_flow_counter_in, + cmd_in, + flow_counter_bulk); + obj->flow_counter_bulk_size = 128UL * bulk; + } + + uobj->object = obj; + INIT_LIST_HEAD(&obj->event_sub); + obj->ib_dev = dev; + devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, + &obj_id); + WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); + if (err) + goto obj_destroy; + + if (opcode == MLX5_CMD_OP_CREATE_GENERAL_OBJECT) + obj_type = MLX5_GET(general_obj_in_cmd_hdr, cmd_in, obj_type); + obj->obj_id = get_enc_obj_id(opcode | obj_type << 16, obj_id); + + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { + err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out); + if (err) + goto obj_destroy; + } + return 0; + +obj_destroy: + if (obj->flags & DEVX_OBJ_FLAGS_DCT) + mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); + else if (obj->flags & DEVX_OBJ_FLAGS_CQ) + mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); + else + mlx5_cmd_exec(obj->ib_dev->mdev, obj->dinbox, obj->dinlen, out, + sizeof(out)); +obj_free: + kfree(obj); + return err2 ?: err; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( + struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT); + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); + void *cmd_out; + int err, err2; + int uid; + + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; + + if (!devx_is_obj_modify_cmd(cmd_in)) + return -EINVAL; + + if (!devx_is_valid_obj_id(attrs, uobj, cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + devx_set_umem_valid(cmd_in); + + err = mlx5_cmd_do(mdev->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN), + cmd_out, cmd_out_len); + if (err && err != -EREMOTEIO) + return err; + + err2 = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, + cmd_out, cmd_out_len); + + return err2 ?: err; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT); + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + void *cmd_out; + int err, err2; + int uid; + struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); + + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; + + if (!devx_is_obj_query_cmd(cmd_in)) + return -EINVAL; + + if (!devx_is_valid_obj_id(attrs, uobj, cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, 
cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + err = mlx5_cmd_do(mdev->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN), + cmd_out, cmd_out_len); + if (err && err != -EREMOTEIO) + return err; + + err2 = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, + cmd_out, cmd_out_len); + + return err2 ?: err; +} + +struct devx_async_event_queue { + spinlock_t lock; + wait_queue_head_t poll_wait; + struct list_head event_list; + atomic_t bytes_in_use; + u8 is_destroyed:1; +}; + +struct devx_async_cmd_event_file { + struct ib_uobject uobj; + struct devx_async_event_queue ev_queue; + struct mlx5_async_ctx async_ctx; +}; + +static void devx_init_event_queue(struct devx_async_event_queue *ev_queue) +{ + spin_lock_init(&ev_queue->lock); + INIT_LIST_HEAD(&ev_queue->event_list); + init_waitqueue_head(&ev_queue->poll_wait); + atomic_set(&ev_queue->bytes_in_use, 0); + ev_queue->is_destroyed = 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct devx_async_cmd_event_file *ev_file; + + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE); + struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata); + + ev_file = container_of(uobj, struct devx_async_cmd_event_file, + uobj); + devx_init_event_queue(&ev_file->ev_queue); + mlx5_cmd_init_async_ctx(mdev->mdev, &ev_file->async_ctx); + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE); + struct devx_async_event_file *ev_file; + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + u32 flags; + int err; + + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA); + + if (err) + return err; + + ev_file = container_of(uobj, struct devx_async_event_file, + uobj); + spin_lock_init(&ev_file->lock); + INIT_LIST_HEAD(&ev_file->event_list); + init_waitqueue_head(&ev_file->poll_wait); + if (flags & MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA) + ev_file->omit_data = 1; + INIT_LIST_HEAD(&ev_file->subscribed_events_list); + ev_file->dev = dev; + get_device(&dev->ib_dev.dev); + return 0; +} + +static void devx_query_callback(int status, struct mlx5_async_work *context) +{ + struct devx_async_data *async_data = + container_of(context, struct devx_async_data, cb_work); + struct devx_async_cmd_event_file *ev_file = async_data->ev_file; + struct devx_async_event_queue *ev_queue = &ev_file->ev_queue; + unsigned long flags; + + /* + * Note that if the struct devx_async_cmd_event_file uobj begins to be + * destroyed it will block at mlx5_cmd_cleanup_async_ctx() until this + * routine returns, ensuring that it always remains valid here. 
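+ * For that reason no extra reference on the event file is taken here; the + * completed command is simply queued and any readers are woken up below.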
+ */ + spin_lock_irqsave(&ev_queue->lock, flags); + list_add_tail(&async_data->list, &ev_queue->event_list); + spin_unlock_irqrestore(&ev_queue->lock, flags); + + wake_up_interruptible(&ev_queue->poll_wait); +} + +#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */ + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN); + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE); + u16 cmd_out_len; + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct ib_uobject *fd_uobj; + int err; + int uid; + struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); + struct devx_async_cmd_event_file *ev_file; + struct devx_async_data *async_data; + + if (MLX5_GET(general_obj_in_cmd_hdr, cmd_in, vhca_tunnel_id)) + return -EINVAL; + + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; + + if (!devx_is_obj_query_cmd(cmd_in)) + return -EINVAL; + + err = uverbs_get_const(&cmd_out_len, attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN); + if (err) + return err; + + if (!devx_is_valid_obj_id(attrs, uobj, cmd_in)) + return -EINVAL; + + fd_uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD); + if (IS_ERR(fd_uobj)) + return PTR_ERR(fd_uobj); + + ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file, + uobj); + + if (atomic_add_return(cmd_out_len, &ev_file->ev_queue.bytes_in_use) > + MAX_ASYNC_BYTES_IN_USE) { + atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use); + return -EAGAIN; + } + + async_data = kvzalloc(struct_size(async_data, hdr.out_data, + cmd_out_len), GFP_KERNEL); + if (!async_data) { + err = -ENOMEM; + goto sub_bytes; + } + + err = uverbs_copy_from(&async_data->hdr.wr_id, attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID); + if (err) + goto free_async; + + async_data->cmd_out_len = cmd_out_len; + async_data->mdev = mdev; + async_data->ev_file = ev_file; + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in, + uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN), + async_data->hdr.out_data, + async_data->cmd_out_len, + devx_query_callback, &async_data->cb_work); + + if (err) + goto free_async; + + return 0; + +free_async: + kvfree(async_data); +sub_bytes: + atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use); + return err; +} + +static void +subscribe_event_xa_dealloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_event *event; + struct devx_obj_event *xa_val_level2; + + /* Level 1 is valid for future use, no need to free */ + if (!is_level2) + return; + + event = xa_load(&devx_event_table->event_xa, key_level1); + WARN_ON(!event); + + xa_val_level2 = xa_load(&event->object_ids, + key_level2); + if (list_empty(&xa_val_level2->obj_sub_list)) { + xa_erase(&event->object_ids, + key_level2); + kfree_rcu(xa_val_level2, rcu); + } +} + +static int +subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, + u32 key_level1, + bool is_level2, + u32 key_level2) +{ + struct devx_obj_event *obj_event; + struct devx_event *event; + int err; + + event = xa_load(&devx_event_table->event_xa, key_level1); + if (!event) { + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + 
INIT_LIST_HEAD(&event->unaffiliated_list); + xa_init(&event->object_ids); + + err = xa_insert(&devx_event_table->event_xa, + key_level1, + event, + GFP_KERNEL); + if (err) { + kfree(event); + return err; + } + } + + if (!is_level2) + return 0; + + obj_event = xa_load(&event->object_ids, key_level2); + if (!obj_event) { + obj_event = kzalloc(sizeof(*obj_event), GFP_KERNEL); + if (!obj_event) + /* Level1 is valid for future use, no need to free */ + return -ENOMEM; + + err = xa_insert(&event->object_ids, + key_level2, + obj_event, + GFP_KERNEL); + if (err) { + kfree(obj_event); + return err; + } + INIT_LIST_HEAD(&obj_event->obj_sub_list); + } + + return 0; +} + +static bool is_valid_events_legacy(int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + int i; + + for (i = 0; i < num_events; i++) { + if (obj) { + if (!is_legacy_obj_event_num(event_type_num_list[i])) + return false; + } else if (!is_legacy_unaffiliated_event_num( + event_type_num_list[i])) { + return false; + } + } + + return true; +} + +#define MAX_SUPP_EVENT_NUM 255 +static bool is_valid_events(struct mlx5_core_dev *dev, + int num_events, u16 *event_type_num_list, + struct devx_obj *obj) +{ + __be64 *aff_events; + __be64 *unaff_events; + int mask_entry; + int mask_bit; + int i; + + if (MLX5_CAP_GEN(dev, event_cap)) { + aff_events = MLX5_CAP_DEV_EVENT(dev, + user_affiliated_events); + unaff_events = MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + } else { + return is_valid_events_legacy(num_events, event_type_num_list, + obj); + } + + for (i = 0; i < num_events; i++) { + if (event_type_num_list[i] > MAX_SUPP_EVENT_NUM) + return false; + + mask_entry = event_type_num_list[i] / 64; + mask_bit = event_type_num_list[i] % 64; + + if (obj) { + /* CQ completion */ + if (event_type_num_list[i] == 0) + continue; + + if (!(be64_to_cpu(aff_events[mask_entry]) & + (1ull << mask_bit))) + return false; + + continue; + } + + if (!(be64_to_cpu(unaff_events[mask_entry]) & + (1ull << mask_bit))) + return false; + } + + return true; +} + +#define MAX_NUM_EVENTS 16 +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *devx_uobj = uverbs_attr_get_uobject( + attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + struct ib_uobject *fd_uobj; + struct devx_obj *obj = NULL; + struct devx_async_event_file *ev_file; + struct mlx5_devx_event_table *devx_event_table = &dev->devx_event_table; + u16 *event_type_num_list; + struct devx_event_subscription *event_sub, *tmp_sub; + struct list_head sub_list; + int redirect_fd; + bool use_eventfd = false; + int num_events; + int num_alloc_xa_entries = 0; + u16 obj_type = 0; + u64 cookie = 0; + u32 obj_id = 0; + int err; + int i; + + if (!c->devx_uid) + return -EINVAL; + + if (!IS_ERR(devx_uobj)) { + obj = (struct devx_obj *)devx_uobj->object; + if (obj) + obj_id = get_dec_obj_id(obj->obj_id); + } + + fd_uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE); + if (IS_ERR(fd_uobj)) + return PTR_ERR(fd_uobj); + + ev_file = container_of(fd_uobj, struct devx_async_event_file, + uobj); + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM)) { + err = uverbs_copy_from(&redirect_fd, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM); + if (err) + return err; + + use_eventfd = true; + } + + if 
(uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE)) { + if (use_eventfd) + return -EINVAL; + + err = uverbs_copy_from(&cookie, attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE); + if (err) + return err; + } + + num_events = uverbs_attr_ptr_get_array_size( + attrs, MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + sizeof(u16)); + + if (num_events < 0) + return num_events; + + if (num_events > MAX_NUM_EVENTS) + return -EINVAL; + + event_type_num_list = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST); + + if (!is_valid_events(dev->mdev, num_events, event_type_num_list, obj)) + return -EINVAL; + + INIT_LIST_HEAD(&sub_list); + + /* Protect from concurrent subscriptions to same XA entries to allow + * both to succeed + */ + mutex_lock(&devx_event_table->event_xa_lock); + for (i = 0; i < num_events; i++) { + u32 key_level1; + + if (obj) + obj_type = get_dec_obj_type(obj, + event_type_num_list[i]); + key_level1 = event_type_num_list[i] | obj_type << 16; + + err = subscribe_event_xa_alloc(devx_event_table, + key_level1, + obj, + obj_id); + if (err) + goto err; + + num_alloc_xa_entries++; + event_sub = kzalloc(sizeof(*event_sub), GFP_KERNEL); + if (!event_sub) { + err = -ENOMEM; + goto err; + } + + list_add_tail(&event_sub->event_list, &sub_list); + uverbs_uobject_get(&ev_file->uobj); + if (use_eventfd) { + event_sub->eventfd = + eventfd_ctx_fdget(redirect_fd); + + if (IS_ERR(event_sub->eventfd)) { + err = PTR_ERR(event_sub->eventfd); + event_sub->eventfd = NULL; + goto err; + } + } + + event_sub->cookie = cookie; + event_sub->ev_file = ev_file; + /* May be needed upon cleanup the devx object/subscription */ + event_sub->xa_key_level1 = key_level1; + event_sub->xa_key_level2 = obj_id; + INIT_LIST_HEAD(&event_sub->obj_list); + } + + /* Once all the allocations and the XA data insertions were done we + * can go ahead and add all the subscriptions to the relevant lists + * without concern of a failure. 
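+ * The loop below therefore has no error handling; the err label at the end + * only unwinds subscriptions whose setup above did not complete.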
+ */ + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + struct devx_event *event; + struct devx_obj_event *obj_event; + + list_del_init(&event_sub->event_list); + + spin_lock_irq(&ev_file->lock); + list_add_tail_rcu(&event_sub->file_list, + &ev_file->subscribed_events_list); + spin_unlock_irq(&ev_file->lock); + + event = xa_load(&devx_event_table->event_xa, + event_sub->xa_key_level1); + WARN_ON(!event); + + if (!obj) { + list_add_tail_rcu(&event_sub->xa_list, + &event->unaffiliated_list); + continue; + } + + obj_event = xa_load(&event->object_ids, obj_id); + WARN_ON(!obj_event); + list_add_tail_rcu(&event_sub->xa_list, + &obj_event->obj_sub_list); + list_add_tail_rcu(&event_sub->obj_list, + &obj->event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return 0; + +err: + list_for_each_entry_safe(event_sub, tmp_sub, &sub_list, event_list) { + list_del(&event_sub->event_list); + + subscribe_event_xa_dealloc(devx_event_table, + event_sub->xa_key_level1, + obj, + obj_id); + + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + uverbs_uobject_put(&event_sub->ev_file->uobj); + kfree(event_sub); + } + + mutex_unlock(&devx_event_table->event_xa_lock); + return err; +} + +static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, + struct uverbs_attr_bundle *attrs, + struct devx_umem *obj) +{ + u64 addr; + size_t size; + u32 access; + int err; + + if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) || + uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN)) + return -EFAULT; + + err = uverbs_get_flags32(&access, attrs, + MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (err) + return err; + + err = ib_check_mr_access(&dev->ib_dev, access); + if (err) + return err; + + obj->umem = ib_umem_get_peer(&dev->ib_dev, addr, size, access, 0); + if (IS_ERR(obj->umem)) + return PTR_ERR(obj->umem); + return 0; +} + +static unsigned int devx_umem_find_best_pgsize(struct ib_umem *umem, + unsigned long pgsz_bitmap) +{ + unsigned long page_size; + + /* Don't bother checking larger page sizes as offset must be zero and + * total DEVX umem length must be equal to total umem length. + */ + pgsz_bitmap &= GENMASK_ULL(max_t(u64, order_base_2(umem->length), + PAGE_SHIFT), + MLX5_ADAPTER_PAGE_SHIFT); + if (!pgsz_bitmap) + return 0; + + page_size = ib_umem_find_best_pgoff(umem, pgsz_bitmap, U64_MAX); + if (!page_size) + return 0; + + /* If the page_size is less than the CPU page size then we can use the + * offset and create a umem which is a subset of the page list. + * For larger page sizes we can't be sure the DMA list reflects the + * VA so we must ensure that the umem extent is exactly equal to the + * page list. Reduce the page size until one of these cases is true. + */ + while ((ib_umem_dma_offset(umem, page_size) != 0 || + (umem->length % page_size) != 0) && + page_size > PAGE_SIZE) + page_size /= 2; + + return page_size; +} + +static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev, + struct uverbs_attr_bundle *attrs, + struct devx_umem *obj, + struct devx_umem_reg_cmd *cmd) +{ + unsigned long pgsz_bitmap; + unsigned int page_size; + __be64 *mtt; + void *umem; + int ret; + + /* + * If the user does not pass in pgsz_bitmap then the user promises not + * to use umem_offset!=0 in any commands that allocate on top of the + * umem. 
+ * + * If the user wants to use a umem_offset then it must pass in + * pgsz_bitmap which guides the maximum page size and thus maximum + * object alignment inside the umem. See the PRM. + * + * Users are not allowed to use IOVA here, mkeys are not supported on + * umem. + */ + ret = uverbs_get_const_default(&pgsz_bitmap, attrs, + MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP, + GENMASK_ULL(63, + min(PAGE_SHIFT, MLX5_ADAPTER_PAGE_SHIFT))); + if (ret) + return ret; + + page_size = devx_umem_find_best_pgsize(obj->umem, pgsz_bitmap); + if (!page_size) + return -EINVAL; + + cmd->inlen = MLX5_ST_SZ_BYTES(create_umem_in) + + (MLX5_ST_SZ_BYTES(mtt) * + ib_umem_num_dma_blocks(obj->umem, page_size)); + cmd->in = uverbs_zalloc(attrs, cmd->inlen); + if (IS_ERR(cmd->in)) + return PTR_ERR(cmd->in); + + umem = MLX5_ADDR_OF(create_umem_in, cmd->in, umem); + mtt = (__be64 *)MLX5_ADDR_OF(umem, umem, mtt); + + MLX5_SET(create_umem_in, cmd->in, opcode, MLX5_CMD_OP_CREATE_UMEM); + MLX5_SET64(umem, umem, num_of_mtt, + ib_umem_num_dma_blocks(obj->umem, page_size)); + MLX5_SET(umem, umem, log_page_size, + order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(umem, umem, page_offset, + ib_umem_dma_offset(obj->umem, page_size)); + if (obj->umem->is_peer) + MLX5_SET(umem, umem, ats, MLX5_CAP_GEN(dev->mdev, ats)); + + mlx5_ib_populate_pas(obj->umem, page_size, mtt, + (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) | + MLX5_IB_MTT_READ); + if (obj->umem->is_peer) + MLX5_SET(umem, umem, ats, MLX5_CAP_GEN(dev->mdev, ats)); + + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( + struct uverbs_attr_bundle *attrs) +{ + struct devx_umem_reg_cmd cmd; + struct devx_umem *obj; + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE); + u32 obj_id; + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + int err; + + if (!c->devx_uid) + return -EINVAL; + + obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + err = devx_umem_get(dev, &c->ibucontext, attrs, obj); + if (err) + goto err_obj_free; + + err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd); + if (err) + goto err_umem_release; + + MLX5_SET(create_umem_in, cmd.in, uid, c->devx_uid); + err = mlx5_cmd_exec(dev->mdev, cmd.in, cmd.inlen, cmd.out, + sizeof(cmd.out)); + if (err) + goto err_umem_release; + + obj->mdev = dev->mdev; + uobj->object = obj; + devx_obj_build_destroy_cmd(cmd.in, cmd.out, obj->dinbox, &obj->dinlen, &obj_id); + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, &obj_id, + sizeof(obj_id)); + return err; + +err_umem_release: + ib_umem_release(obj->umem); +err_obj_free: + kfree(obj); + return err; +} + +static int devx_umem_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct devx_umem *obj = uobject->object; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + int err; + + err = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (err) + return err; + + ib_umem_release(obj->umem); + kfree(obj); + return 0; +} + +static bool is_unaffiliated_event(struct mlx5_core_dev *dev, + unsigned long event_type) +{ + __be64 *unaff_events; + int mask_entry; + int mask_bit; + + if (!MLX5_CAP_GEN(dev, event_cap)) + return is_legacy_unaffiliated_event_num(event_type); + + unaff_events 
= MLX5_CAP_DEV_EVENT(dev, + user_unaffiliated_events); + WARN_ON(event_type > MAX_SUPP_EVENT_NUM); + + mask_entry = event_type / 64; + mask_bit = event_type % 64; + + if (!(be64_to_cpu(unaff_events[mask_entry]) & (1ull << mask_bit))) + return false; + + return true; +} + +static u32 devx_get_obj_id_from_event(unsigned long event_type, void *data) +{ + struct mlx5_eqe *eqe = data; + u32 obj_id = 0; + + switch (event_type) { + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + obj_id = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + break; + case MLX5_EVENT_TYPE_XRQ_ERROR: + obj_id = be32_to_cpu(eqe->data.xrq_err.type_xrqn) & 0xffffff; + break; + case MLX5_EVENT_TYPE_DCT_DRAINED: + case MLX5_EVENT_TYPE_DCT_KEY_VIOLATION: + obj_id = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + break; + case MLX5_EVENT_TYPE_CQ_ERROR: + obj_id = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff; + break; + default: + obj_id = MLX5_GET(affiliated_event_header, &eqe->data, obj_id); + break; + } + + return obj_id; +} + +static int deliver_event(struct devx_event_subscription *event_sub, + const void *data) +{ + struct devx_async_event_file *ev_file; + struct devx_async_event_data *event_data; + unsigned long flags; + + ev_file = event_sub->ev_file; + + if (ev_file->omit_data) { + spin_lock_irqsave(&ev_file->lock, flags); + if (!list_empty(&event_sub->event_list) || + ev_file->is_destroyed) { + spin_unlock_irqrestore(&ev_file->lock, flags); + return 0; + } + + list_add_tail(&event_sub->event_list, &ev_file->event_list); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + return 0; + } + + event_data = kzalloc(sizeof(*event_data) + sizeof(struct mlx5_eqe), + GFP_ATOMIC); + if (!event_data) { + spin_lock_irqsave(&ev_file->lock, flags); + ev_file->is_overflow_err = 1; + spin_unlock_irqrestore(&ev_file->lock, flags); + return -ENOMEM; + } + + event_data->hdr.cookie = event_sub->cookie; + memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe)); + + spin_lock_irqsave(&ev_file->lock, flags); + if (!ev_file->is_destroyed) + list_add_tail(&event_data->list, &ev_file->event_list); + else + kfree(event_data); + spin_unlock_irqrestore(&ev_file->lock, flags); + wake_up_interruptible(&ev_file->poll_wait); + + return 0; +} + +static void dispatch_event_fd(struct list_head *fd_list, + const void *data) +{ + struct devx_event_subscription *item; + + list_for_each_entry_rcu(item, fd_list, xa_list) { + if (item->eventfd) + eventfd_signal(item->eventfd, 1); + else + deliver_event(item, data); + } +} + +static int devx_event_notifier(struct notifier_block *nb, + unsigned long event_type, void *data) +{ + struct mlx5_devx_event_table *table; + struct mlx5_ib_dev *dev; + struct devx_event *event; + struct devx_obj_event *obj_event; + u16 obj_type = 0; + bool is_unaffiliated; + u32 obj_id; + + /* Explicit filtering to kernel events which may occur frequently */ + if (event_type == MLX5_EVENT_TYPE_CMD || + event_type == MLX5_EVENT_TYPE_PAGE_REQUEST) + return NOTIFY_OK; + + table = container_of(nb, struct mlx5_devx_event_table, devx_nb.nb); + dev = container_of(table, struct mlx5_ib_dev, devx_event_table); + is_unaffiliated = is_unaffiliated_event(dev->mdev, 
event_type); + + if (!is_unaffiliated) + obj_type = get_event_obj_type(event_type, data); + + rcu_read_lock(); + event = xa_load(&table->event_xa, event_type | (obj_type << 16)); + if (!event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + if (is_unaffiliated) { + dispatch_event_fd(&event->unaffiliated_list, data); + rcu_read_unlock(); + return NOTIFY_OK; + } + + obj_id = devx_get_obj_id_from_event(event_type, data); + obj_event = xa_load(&event->object_ids, obj_id); + if (!obj_event) { + rcu_read_unlock(); + return NOTIFY_DONE; + } + + dispatch_event_fd(&obj_event->obj_sub_list, data); + + rcu_read_unlock(); + return NOTIFY_OK; +} + +int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_devx_event_table *table = &dev->devx_event_table; + int uid; + + uid = mlx5_ib_devx_create(dev, false); + if (uid > 0) { + dev->devx_whitelist_uid = uid; + xa_init(&table->event_xa); + mutex_init(&table->event_xa_lock); + MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY); + mlx5_eq_notifier_register(dev->mdev, &table->devx_nb); + } + + return 0; +} + +void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_devx_event_table *table = &dev->devx_event_table; + struct devx_event_subscription *sub, *tmp; + struct devx_event *event; + void *entry; + unsigned long id; + + if (dev->devx_whitelist_uid) { + mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb); + mutex_lock(&dev->devx_event_table.event_xa_lock); + xa_for_each(&table->event_xa, id, entry) { + event = entry; + list_for_each_entry_safe( + sub, tmp, &event->unaffiliated_list, xa_list) + devx_cleanup_subscription(dev, sub); + kfree(entry); + } + mutex_unlock(&dev->devx_event_table.event_xa_lock); + xa_destroy(&table->event_xa); + + mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid); + } +} + +static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct devx_async_cmd_event_file *comp_ev_file = filp->private_data; + struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue; + struct devx_async_data *event; + int ret = 0; + size_t eventsz; + + spin_lock_irq(&ev_queue->lock); + + while (list_empty(&ev_queue->event_list)) { + spin_unlock_irq(&ev_queue->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible( + ev_queue->poll_wait, + (!list_empty(&ev_queue->event_list) || + ev_queue->is_destroyed))) { + return -ERESTARTSYS; + } + + spin_lock_irq(&ev_queue->lock); + if (ev_queue->is_destroyed) { + spin_unlock_irq(&ev_queue->lock); + return -EIO; + } + } + + event = list_entry(ev_queue->event_list.next, + struct devx_async_data, list); + eventsz = event->cmd_out_len + + sizeof(struct mlx5_ib_uapi_devx_async_cmd_hdr); + + if (eventsz > count) { + spin_unlock_irq(&ev_queue->lock); + return -ENOSPC; + } + + list_del(ev_queue->event_list.next); + spin_unlock_irq(&ev_queue->lock); + + if (copy_to_user(buf, &event->hdr, eventsz)) + ret = -EFAULT; + else + ret = eventsz; + + atomic_sub(event->cmd_out_len, &ev_queue->bytes_in_use); + kvfree(event); + return ret; +} + +static __poll_t devx_async_cmd_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct devx_async_cmd_event_file *comp_ev_file = filp->private_data; + struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue; + __poll_t pollflags = 0; + + poll_wait(filp, &ev_queue->poll_wait, wait); + + spin_lock_irq(&ev_queue->lock); + if (ev_queue->is_destroyed) + pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + else if 
(!list_empty(&ev_queue->event_list)) + pollflags = EPOLLIN | EPOLLRDNORM; + spin_unlock_irq(&ev_queue->lock); + + return pollflags; +} + +static const struct file_operations devx_async_cmd_event_fops = { + .owner = THIS_MODULE, + .read = devx_async_cmd_event_read, + .poll = devx_async_cmd_event_poll, + .release = uverbs_uobject_fd_release, + .llseek = no_llseek, +}; + +static ssize_t devx_async_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct devx_async_event_file *ev_file = filp->private_data; + struct devx_event_subscription *event_sub; + struct devx_async_event_data *event; + int ret = 0; + size_t eventsz; + bool omit_data; + void *event_data; + + omit_data = ev_file->omit_data; + + spin_lock_irq(&ev_file->lock); + + if (ev_file->is_overflow_err) { + ev_file->is_overflow_err = 0; + spin_unlock_irq(&ev_file->lock); + return -EOVERFLOW; + } + + + while (list_empty(&ev_file->event_list)) { + spin_unlock_irq(&ev_file->lock); + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible(ev_file->poll_wait, + (!list_empty(&ev_file->event_list) || + ev_file->is_destroyed))) { + return -ERESTARTSYS; + } + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) { + spin_unlock_irq(&ev_file->lock); + return -EIO; + } + } + + if (omit_data) { + event_sub = list_first_entry(&ev_file->event_list, + struct devx_event_subscription, + event_list); + eventsz = sizeof(event_sub->cookie); + event_data = &event_sub->cookie; + } else { + event = list_first_entry(&ev_file->event_list, + struct devx_async_event_data, list); + eventsz = sizeof(struct mlx5_eqe) + + sizeof(struct mlx5_ib_uapi_devx_async_event_hdr); + event_data = &event->hdr; + } + + if (eventsz > count) { + spin_unlock_irq(&ev_file->lock); + return -EINVAL; + } + + if (omit_data) + list_del_init(&event_sub->event_list); + else + list_del(&event->list); + + spin_unlock_irq(&ev_file->lock); + + if (copy_to_user(buf, event_data, eventsz)) + /* This points to an application issue, not a kernel concern */ + ret = -EFAULT; + else + ret = eventsz; + + if (!omit_data) + kfree(event); + return ret; +} + +static __poll_t devx_async_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct devx_async_event_file *ev_file = filp->private_data; + __poll_t pollflags = 0; + + poll_wait(filp, &ev_file->poll_wait, wait); + + spin_lock_irq(&ev_file->lock); + if (ev_file->is_destroyed) + pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + else if (!list_empty(&ev_file->event_list)) + pollflags = EPOLLIN | EPOLLRDNORM; + spin_unlock_irq(&ev_file->lock); + + return pollflags; +} + +static void devx_free_subscription(struct rcu_head *rcu) +{ + struct devx_event_subscription *event_sub = + container_of(rcu, struct devx_event_subscription, rcu); + + if (event_sub->eventfd) + eventfd_ctx_put(event_sub->eventfd); + uverbs_uobject_put(&event_sub->ev_file->uobj); + kfree(event_sub); +} + +static const struct file_operations devx_async_event_fops = { + .owner = THIS_MODULE, + .read = devx_async_event_read, + .poll = devx_async_event_poll, + .release = uverbs_uobject_fd_release, + .llseek = no_llseek, +}; + +static void devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct devx_async_cmd_event_file *comp_ev_file = + container_of(uobj, struct devx_async_cmd_event_file, + uobj); + struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue; + struct devx_async_data *entry, *tmp; + + spin_lock_irq(&ev_queue->lock); + ev_queue->is_destroyed 
= 1; + spin_unlock_irq(&ev_queue->lock); + wake_up_interruptible(&ev_queue->poll_wait); + + mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx); + + spin_lock_irq(&comp_ev_file->ev_queue.lock); + list_for_each_entry_safe(entry, tmp, + &comp_ev_file->ev_queue.event_list, list) { + list_del(&entry->list); + kvfree(entry); + } + spin_unlock_irq(&comp_ev_file->ev_queue.lock); +}; + +static void devx_async_event_destroy_uobj(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + struct devx_async_event_file *ev_file = + container_of(uobj, struct devx_async_event_file, + uobj); + struct devx_event_subscription *event_sub, *event_sub_tmp; + struct mlx5_ib_dev *dev = ev_file->dev; + + spin_lock_irq(&ev_file->lock); + ev_file->is_destroyed = 1; + + /* free the pending events allocation */ + if (ev_file->omit_data) { + struct devx_event_subscription *event_sub, *tmp; + + list_for_each_entry_safe(event_sub, tmp, &ev_file->event_list, + event_list) + list_del_init(&event_sub->event_list); + + } else { + struct devx_async_event_data *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &ev_file->event_list, + list) { + list_del(&entry->list); + kfree(entry); + } + } + + spin_unlock_irq(&ev_file->lock); + wake_up_interruptible(&ev_file->poll_wait); + + mutex_lock(&dev->devx_event_table.event_xa_lock); + /* delete the subscriptions which are related to this FD */ + list_for_each_entry_safe(event_sub, event_sub_tmp, + &ev_file->subscribed_events_list, file_list) { + devx_cleanup_subscription(dev, event_sub); + list_del_rcu(&event_sub->file_list); + /* subscription may not be used by the read API any more */ + call_rcu(&event_sub->rcu, devx_free_subscription); + } + mutex_unlock(&dev->devx_event_table.event_xa_lock); + + put_device(&dev->ib_dev.dev); +}; + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_UMEM_REG, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, + MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + enum ib_access_flags), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP, + u64), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DEVX_UMEM_DEREG, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_DEREG_HANDLE, + MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_QUERY_PORT, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_QUERY_PORT_NUM, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_OUT(MLX5_IB_ATTR_DEVX_QUERY_PORT_COMP_MASK, + enum mlx5_ib_uapi_devx_query_port_comp_mask, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT, + UVERBS_ATTR_TYPE(u16), + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_VHCA_ID, + UVERBS_ATTR_TYPE(u16), + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_PORT_ESW_OWNER_VHCA_ID, + UVERBS_ATTR_TYPE(u16), + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_ICM_RX, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_PORT_VPORT_ICM_TX, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_PORT_MATCH_REG_C_0, + UVERBS_ATTR_STRUCT(struct mlx5_ib_uapi_devx_reg_32, + mask), + 
UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_QUERY_EQN, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_QUERY_UAR, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OTHER, + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OTHER_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DEVX_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, + UVERBS_IDR_ANY_OBJECT, + UVERBS_ACCESS_WRITE, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, + UVERBS_IDR_ANY_OBJECT, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, + UVERBS_IDR_ANY_OBJECT, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, + u16, UA_MANDATORY), + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + 
UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_OBJ_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_TYPE_NUM_LIST, + UVERBS_ATTR_MIN_SIZE(sizeof(u16)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_COOKIE, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_SUBSCRIBE_EVENT_FD_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); + +DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_PORT), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG)); + + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_ACCESS_NEW, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file), + devx_async_cmd_event_destroy_uobj, + &devx_async_cmd_event_fops, "[devx_async_cmd]", + O_RDONLY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_ASYNC_EVENT_FD_ALLOC_FLAGS, + enum mlx5_ib_uapi_devx_create_event_channel_flags, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file), + devx_async_event_destroy_uobj, + &devx_async_event_fops, "[devx_async_event]", + O_RDONLY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC)); + +static bool devx_is_supported(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + return MLX5_CAP_GEN(dev->mdev, log_max_uctx); +} + +const struct uapi_definition mlx5_ib_devx_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_OBJ, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_UMEM, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + {}, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.h new file mode 100644 index 0000000..ee22132 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/devx.h @@ -0,0 +1,45 @@ +/* 
SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved. + */ + +#ifndef _MLX5_IB_DEVX_H +#define _MLX5_IB_DEVX_H + +#include "mlx5_ib.h" + +#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) +struct devx_obj { + struct mlx5_ib_dev *ib_dev; + u64 obj_id; + u32 dinlen; /* destroy inbox length */ + u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; + u32 flags; + union { + struct mlx5_ib_mkey mkey; + struct mlx5_core_dct core_dct; + struct mlx5_core_cq core_cq; + u32 flow_counter_bulk_size; + }; + struct list_head event_sub; /* holds devx_event_subscription entries */ +}; +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); +int mlx5_ib_devx_init(struct mlx5_ib_dev *dev); +void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev); +#else +static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) +{ + return -EOPNOTSUPP; +} +static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {} +static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) +{ + return 0; +} +static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) +{ +} +#endif +#endif /* _MLX5_IB_DEVX_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.c new file mode 100644 index 0000000..f53f129 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.c @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "dm.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +static int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr, + u64 length, u32 alignment) +{ + struct mlx5_core_dev *dev = dm->dev; + u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size) + >> PAGE_SHIFT; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 max_alignment = MLX5_CAP_DEV_MEM(dev, log_max_memic_addr_alignment); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 out[MLX5_ST_SZ_DW(alloc_memic_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_memic_in)] = {}; + u32 mlx5_alignment; + u64 page_idx = 0; + int ret = 0; + + if (!length || (length & MLX5_MEMIC_ALLOC_SIZE_MASK)) + return -EINVAL; + + /* mlx5 device sets alignment as 64*2^driver_value + * so normalizing is needed. + */ + mlx5_alignment = (alignment < MLX5_MEMIC_BASE_ALIGN) ? 
0 : + alignment - MLX5_MEMIC_BASE_ALIGN; + if (mlx5_alignment > max_alignment) + return -EINVAL; + + MLX5_SET(alloc_memic_in, in, opcode, MLX5_CMD_OP_ALLOC_MEMIC); + MLX5_SET(alloc_memic_in, in, range_size, num_pages * PAGE_SIZE); + MLX5_SET(alloc_memic_in, in, memic_size, length); + MLX5_SET(alloc_memic_in, in, log_memic_addr_alignment, + mlx5_alignment); + + while (page_idx < num_memic_hw_pages) { + spin_lock(&dm->lock); + page_idx = bitmap_find_next_zero_area(dm->memic_alloc_pages, + num_memic_hw_pages, + page_idx, + num_pages, 0); + + if (page_idx < num_memic_hw_pages) + bitmap_set(dm->memic_alloc_pages, + page_idx, num_pages); + + spin_unlock(&dm->lock); + + if (page_idx >= num_memic_hw_pages) + break; + + MLX5_SET64(alloc_memic_in, in, range_start_addr, + hw_start_addr + (page_idx * PAGE_SIZE)); + + ret = mlx5_cmd_exec_inout(dev, alloc_memic, in, out); + if (ret) { + spin_lock(&dm->lock); + bitmap_clear(dm->memic_alloc_pages, + page_idx, num_pages); + spin_unlock(&dm->lock); + + if (ret == -EAGAIN) { + page_idx++; + continue; + } + + return ret; + } + + *addr = dev->bar_addr + + MLX5_GET64(alloc_memic_out, out, memic_start_addr); + + return 0; + } + + return -ENOMEM; +} + +void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, + u64 length) +{ + struct mlx5_core_dev *dev = dm->dev; + u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); + u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); + u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {}; + u64 start_page_idx; + int err; + + addr -= dev->bar_addr; + start_page_idx = (addr - hw_start_addr) >> PAGE_SHIFT; + + MLX5_SET(dealloc_memic_in, in, opcode, MLX5_CMD_OP_DEALLOC_MEMIC); + MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr); + MLX5_SET(dealloc_memic_in, in, memic_size, length); + + err = mlx5_cmd_exec_in(dev, dealloc_memic, in); + if (err) + return; + + spin_lock(&dm->lock); + bitmap_clear(dm->memic_alloc_pages, + start_page_idx, num_pages); + spin_unlock(&dm->lock); +} + +void mlx5_cmd_dealloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr, + u8 operation) +{ + u32 in[MLX5_ST_SZ_DW(modify_memic_in)] = {}; + struct mlx5_core_dev *dev = dm->dev; + + MLX5_SET(modify_memic_in, in, opcode, MLX5_CMD_OP_MODIFY_MEMIC); + MLX5_SET(modify_memic_in, in, op_mod, MLX5_MODIFY_MEMIC_OP_MOD_DEALLOC); + MLX5_SET(modify_memic_in, in, memic_operation_type, operation); + MLX5_SET64(modify_memic_in, in, memic_start_addr, addr - dev->bar_addr); + + mlx5_cmd_exec_in(dev, modify_memic, in); +} + +static int mlx5_cmd_alloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr, + u8 operation, phys_addr_t *op_addr) +{ + u32 out[MLX5_ST_SZ_DW(modify_memic_out)] = {}; + u32 in[MLX5_ST_SZ_DW(modify_memic_in)] = {}; + struct mlx5_core_dev *dev = dm->dev; + int err; + + MLX5_SET(modify_memic_in, in, opcode, MLX5_CMD_OP_MODIFY_MEMIC); + MLX5_SET(modify_memic_in, in, op_mod, MLX5_MODIFY_MEMIC_OP_MOD_ALLOC); + MLX5_SET(modify_memic_in, in, memic_operation_type, operation); + MLX5_SET64(modify_memic_in, in, memic_start_addr, addr - dev->bar_addr); + + err = mlx5_cmd_exec_inout(dev, modify_memic, in, out); + if (err) + return err; + + *op_addr = dev->bar_addr + + MLX5_GET64(modify_memic_out, out, memic_operation_addr); + return 0; +} + +static int add_dm_mmap_entry(struct ib_ucontext *context, + struct mlx5_user_mmap_entry *mentry, u8 mmap_flag, + size_t size, u64 address) +{ + mentry->mmap_flag = mmap_flag; + mentry->address = address; + + return rdma_user_mmap_entry_insert_range( + context, &mentry->rdma_entry, size, + MLX5_IB_MMAP_DEVICE_MEM << 16, 
+ (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1); +} + +static void mlx5_ib_dm_memic_free(struct kref *kref) +{ + struct mlx5_ib_dm_memic *dm = + container_of(kref, struct mlx5_ib_dm_memic, ref); + struct mlx5_ib_dev *dev = to_mdev(dm->base.ibdm.device); + + mlx5_cmd_dealloc_memic(&dev->dm, dm->base.dev_addr, dm->base.size); + kfree(dm); +} + +static int copy_op_to_user(struct mlx5_ib_dm_op_entry *op_entry, + struct uverbs_attr_bundle *attrs) +{ + u64 start_offset; + u16 page_idx; + int err; + + page_idx = op_entry->mentry.rdma_entry.start_pgoff & 0xFFFF; + start_offset = op_entry->op_addr & ~PAGE_MASK; + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX, + &page_idx, sizeof(page_idx)); + if (err) + return err; + + return uverbs_copy_to(attrs, + MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); +} + +static int map_existing_op(struct mlx5_ib_dm_memic *dm, u8 op, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dm_op_entry *op_entry; + + op_entry = xa_load(&dm->ops, op); + if (!op_entry) + return -ENOENT; + + return copy_op_to_user(op_entry, attrs); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DM_MAP_OP_ADDR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE); + struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + struct ib_dm *ibdm = uobj->object; + struct mlx5_ib_dm_memic *dm = to_memic(ibdm); + struct mlx5_ib_dm_op_entry *op_entry; + int err; + u8 op; + + err = uverbs_copy_from(&op, attrs, MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP); + if (err) + return err; + + if (op >= BITS_PER_TYPE(u32)) + return -EOPNOTSUPP; + + if (!(MLX5_CAP_DEV_MEM(dev->mdev, memic_operations) & BIT(op))) + return -EOPNOTSUPP; + + mutex_lock(&dm->ops_xa_lock); + err = map_existing_op(dm, op, attrs); + if (!err || err != -ENOENT) + goto err_unlock; + + op_entry = kzalloc(sizeof(*op_entry), GFP_KERNEL); + if (!op_entry) + goto err_unlock; + + err = mlx5_cmd_alloc_memic_op(&dev->dm, dm->base.dev_addr, op, + &op_entry->op_addr); + if (err) { + kfree(op_entry); + goto err_unlock; + } + op_entry->op = op; + op_entry->dm = dm; + + err = add_dm_mmap_entry(uobj->context, &op_entry->mentry, + MLX5_IB_MMAP_TYPE_MEMIC_OP, dm->base.size, + op_entry->op_addr & PAGE_MASK); + if (err) { + mlx5_cmd_dealloc_memic_op(&dev->dm, dm->base.dev_addr, op); + kfree(op_entry); + goto err_unlock; + } + /* From this point, entry will be freed by mmap_free */ + kref_get(&dm->ref); + + err = copy_op_to_user(op_entry, attrs); + if (err) + goto err_remove; + + err = xa_insert(&dm->ops, op, op_entry, GFP_KERNEL); + if (err) + goto err_remove; + mutex_unlock(&dm->ops_xa_lock); + + return 0; + +err_remove: + rdma_user_mmap_entry_remove(&op_entry->mentry.rdma_entry); +err_unlock: + mutex_unlock(&dm->ops_xa_lock); + + return err; +} + +static struct ib_dm *handle_alloc_dm_memic(struct ib_ucontext *ctx, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm; + struct mlx5_ib_dm_memic *dm; + u64 start_offset; + u16 page_idx; + int err; + u64 address; + + if (!MLX5_CAP_DEV_MEM(dm_db->dev, memic)) + return ERR_PTR(-EOPNOTSUPP); + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + dm->base.type = MLX5_IB_UAPI_DM_TYPE_MEMIC; + dm->base.size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE); + dm->base.ibdm.device = ctx->device; + + kref_init(&dm->ref); + xa_init(&dm->ops); + 
mutex_init(&dm->ops_xa_lock); + dm->req_length = attr->length; + + err = mlx5_cmd_alloc_memic(dm_db, &dm->base.dev_addr, + dm->base.size, attr->alignment); + if (err) { + kfree(dm); + return ERR_PTR(err); + } + + address = dm->base.dev_addr & PAGE_MASK; + err = add_dm_mmap_entry(ctx, &dm->mentry, MLX5_IB_MMAP_TYPE_MEMIC, + dm->base.size, address); + if (err) { + mlx5_cmd_dealloc_memic(dm_db, dm->base.dev_addr, dm->base.size); + kfree(dm); + return ERR_PTR(err); + } + + page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF; + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + &page_idx, sizeof(page_idx)); + if (err) + goto err_copy; + + start_offset = dm->base.dev_addr & ~PAGE_MASK; + err = uverbs_copy_to(attrs, + MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); + if (err) + goto err_copy; + + return &dm->base.ibdm; + +err_copy: + rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry); + return ERR_PTR(err); +} + +static enum mlx5_sw_icm_type get_icm_type(int uapi_type) +{ + switch (uapi_type) { + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + return MLX5_SW_ICM_TYPE_HEADER_MODIFY; + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: + return MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN; + case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM: + return MLX5_SW_ICM_TYPE_SW_ENCAP; + default: + return MLX5_SW_ICM_TYPE_STEERING; + } +} + +static struct ib_dm *handle_alloc_dm_sw_icm(struct ib_ucontext *ctx, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs, + int type) +{ + struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev; + enum mlx5_sw_icm_type icm_type = get_icm_type(type); + struct mlx5_ib_dm_icm *dm; + u64 act_size; + int err; + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + dm->base.type = type; + dm->base.ibdm.device = ctx->device; + + if (!capable(CAP_SYS_RAWIO) || !capable(CAP_NET_RAW)) { + err = -EPERM; + goto free; + } + + if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner) || + MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2))) { + err = -EOPNOTSUPP; + goto free; + } + + /* Allocation size must be a multiple of the basic block size + * and a power of 2. 
+ */ + act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev)); + act_size = roundup_pow_of_two(act_size); + + dm->base.size = act_size; + err = mlx5_dm_sw_icm_alloc(dev, icm_type, act_size, attr->alignment, + to_mucontext(ctx)->devx_uid, + &dm->base.dev_addr, &dm->obj_id); + if (err) + goto free; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + &dm->base.dev_addr, sizeof(dm->base.dev_addr)); + if (err) { + mlx5_dm_sw_icm_dealloc(dev, icm_type, dm->base.size, + to_mucontext(ctx)->devx_uid, + dm->base.dev_addr, dm->obj_id); + goto free; + } + return &dm->base.ibdm; +free: + kfree(dm); + return ERR_PTR(err); +} + +struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + enum mlx5_ib_uapi_dm_type type; + int err; + + err = uverbs_get_const_default(&type, attrs, + MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE, + MLX5_IB_UAPI_DM_TYPE_MEMIC); + if (err) + return ERR_PTR(err); + + mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n", + type, attr->length, attr->alignment); + + switch (type) { + case MLX5_IB_UAPI_DM_TYPE_MEMIC: + return handle_alloc_dm_memic(context, attr, attrs); + case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM: + return handle_alloc_dm_sw_icm(context, attr, attrs, type); + default: + return ERR_PTR(-EOPNOTSUPP); + } +} + +static void dm_memic_remove_ops(struct mlx5_ib_dm_memic *dm) +{ + struct mlx5_ib_dm_op_entry *entry; + unsigned long idx; + + mutex_lock(&dm->ops_xa_lock); + xa_for_each(&dm->ops, idx, entry) { + xa_erase(&dm->ops, idx); + rdma_user_mmap_entry_remove(&entry->mentry.rdma_entry); + } + mutex_unlock(&dm->ops_xa_lock); +} + +static void mlx5_dm_memic_dealloc(struct mlx5_ib_dm_memic *dm) +{ + dm_memic_remove_ops(dm); + rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry); +} + +static int mlx5_dm_icm_dealloc(struct mlx5_ib_ucontext *ctx, + struct mlx5_ib_dm_icm *dm) +{ + enum mlx5_sw_icm_type type = get_icm_type(dm->base.type); + struct mlx5_core_dev *dev = to_mdev(dm->base.ibdm.device)->mdev; + int err; + + err = mlx5_dm_sw_icm_dealloc(dev, type, dm->base.size, ctx->devx_uid, + dm->base.dev_addr, dm->obj_id); + if (!err) + kfree(dm); + return 0; +} + +static int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dm *dm = to_mdm(ibdm); + + switch (dm->type) { + case MLX5_IB_UAPI_DM_TYPE_MEMIC: + mlx5_dm_memic_dealloc(to_memic(ibdm)); + return 0; + case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM: + return mlx5_dm_icm_dealloc(ctx, to_icm(ibdm)); + default: + return -EOPNOTSUPP; + } +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DM_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_dm *ibdm = + uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_QUERY_DM_REQ_HANDLE); + struct mlx5_ib_dm *dm = to_mdm(ibdm); + struct mlx5_ib_dm_memic *memic; + u64 start_offset; + u16 page_idx; + int err; + + if (dm->type != MLX5_IB_UAPI_DM_TYPE_MEMIC) + return -EOPNOTSUPP; + + memic = to_memic(ibdm); + page_idx = memic->mentry.rdma_entry.start_pgoff & 0xFFFF; + err = 
uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_DM_RESP_PAGE_INDEX, + &page_idx, sizeof(page_idx)); + if (err) + return err; + + start_offset = memic->base.dev_addr & ~PAGE_MASK; + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_DM_RESP_START_OFFSET, + &start_offset, sizeof(start_offset)); + if (err) + return err; + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_DM_RESP_LENGTH, + &memic->req_length, + sizeof(memic->req_length)); +} + +void mlx5_ib_dm_mmap_free(struct mlx5_ib_dev *dev, + struct mlx5_user_mmap_entry *mentry) +{ + struct mlx5_ib_dm_op_entry *op_entry; + struct mlx5_ib_dm_memic *mdm; + + switch (mentry->mmap_flag) { + case MLX5_IB_MMAP_TYPE_MEMIC: + mdm = container_of(mentry, struct mlx5_ib_dm_memic, mentry); + kref_put(&mdm->ref, mlx5_ib_dm_memic_free); + break; + case MLX5_IB_MMAP_TYPE_MEMIC_OP: + op_entry = container_of(mentry, struct mlx5_ib_dm_op_entry, + mentry); + mdm = op_entry->dm; + mlx5_cmd_dealloc_memic_op(&dev->dm, mdm->base.dev_addr, + op_entry->op); + kfree(op_entry); + kref_put(&mdm->ref, mlx5_ib_dm_memic_free); + break; + default: + WARN_ON(true); + } +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DM_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_DM_REQ_HANDLE, UVERBS_OBJECT_DM, + UVERBS_ACCESS_READ, UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_DM_RESP_START_OFFSET, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_DM_RESP_PAGE_INDEX, + UVERBS_ATTR_TYPE(u16), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_DM_RESP_LENGTH, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY)); + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_dm, UVERBS_OBJECT_DM, UVERBS_METHOD_DM_ALLOC, + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + UVERBS_ATTR_TYPE(u64), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + UVERBS_ATTR_TYPE(u16), UA_OPTIONAL), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE, + enum mlx5_ib_uapi_dm_type, UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DM_MAP_OP_ADDR, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DM_MAP_OP_ADDR_REQ_OP, + UVERBS_ATTR_TYPE(u8), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_START_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DM_MAP_OP_ADDR_RESP_PAGE_INDEX, + UVERBS_ATTR_TYPE(u16), + UA_OPTIONAL)); + +DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DM, + &UVERBS_METHOD(MLX5_IB_METHOD_DM_MAP_OP_ADDR), + &UVERBS_METHOD(MLX5_IB_METHOD_DM_QUERY)); + +const struct uapi_definition mlx5_ib_dm_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DM), + {}, +}; + +const struct ib_device_ops mlx5_ib_dev_dm_ops = { + .alloc_dm = mlx5_ib_alloc_dm, + .dealloc_dm = mlx5_ib_dealloc_dm, + .reg_dm_mr = mlx5_ib_reg_dm_mr, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.h new file mode 100644 index 0000000..9674a80 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/dm.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. 
+ */ + +#ifndef _MLX5_IB_DM_H +#define _MLX5_IB_DM_H + +#include "mlx5_ib.h" + +extern const struct ib_device_ops mlx5_ib_dev_dm_ops; +extern const struct uapi_definition mlx5_ib_dm_defs[]; + +struct mlx5_ib_dm { + struct ib_dm ibdm; + u32 type; + phys_addr_t dev_addr; + size_t size; +}; + +struct mlx5_ib_dm_op_entry { + struct mlx5_user_mmap_entry mentry; + phys_addr_t op_addr; + struct mlx5_ib_dm_memic *dm; + u8 op; +}; + +struct mlx5_ib_dm_memic { + struct mlx5_ib_dm base; + struct mlx5_user_mmap_entry mentry; + struct xarray ops; + struct mutex ops_xa_lock; + struct kref ref; + size_t req_length; +}; + +struct mlx5_ib_dm_icm { + struct mlx5_ib_dm base; + u32 obj_id; +}; + +static inline struct mlx5_ib_dm *to_mdm(struct ib_dm *ibdm) +{ + return container_of(ibdm, struct mlx5_ib_dm, ibdm); +} + +static inline struct mlx5_ib_dm_memic *to_memic(struct ib_dm *ibdm) +{ + return container_of(ibdm, struct mlx5_ib_dm_memic, base.ibdm); +} + +static inline struct mlx5_ib_dm_icm *to_icm(struct ib_dm *ibdm) +{ + return container_of(ibdm, struct mlx5_ib_dm_icm, base.ibdm); +} + +struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_dm_alloc_attr *attr, + struct uverbs_attr_bundle *attrs); +void mlx5_ib_dm_mmap_free(struct mlx5_ib_dev *dev, + struct mlx5_user_mmap_entry *mentry); +void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, + u64 length); +void mlx5_cmd_dealloc_memic_op(struct mlx5_dm *dm, phys_addr_t addr, + u8 operation); + +#endif /* _MLX5_IB_DM_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/doorbell.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/doorbell.c new file mode 100644 index 0000000..b5200b4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/doorbell.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include "mlx5_ib.h" + +struct mlx5_ib_user_db_page { + struct list_head list; + struct ib_umem *umem; + unsigned long user_virt; + int refcnt; + struct mm_struct *mm; +}; + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db) +{ + struct mlx5_ib_user_db_page *page; + int err = 0; + + mutex_lock(&context->db_page_mutex); + + list_for_each_entry(page, &context->db_page_list, list) + if ((current->mm == page->mm) && + (page->user_virt == (virt & PAGE_MASK))) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + page->user_virt = (virt & PAGE_MASK); + page->refcnt = 0; + page->umem = + ib_umem_get_peer(context->ibucontext.device, virt & PAGE_MASK, + PAGE_SIZE, 0, 0); + if (IS_ERR(page->umem)) { + err = PTR_ERR(page->umem); + kfree(page); + goto out; + } + mmgrab(current->mm); + page->mm = current->mm; + + list_add(&page->list, &context->db_page_list); + +found: + db->dma = sg_dma_address(page->umem->sgt_append.sgt.sgl) + + (virt & ~PAGE_MASK); + db->u.user_page = page; + ++page->refcnt; + +out: + mutex_unlock(&context->db_page_mutex); + + return err; +} + +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) +{ + mutex_lock(&context->db_page_mutex); + + if (!--db->u.user_page->refcnt) { + list_del(&db->u.user_page->list); + mmdrop(db->u.user_page->mm); + ib_umem_release(db->u.user_page->umem); + kfree(db->u.user_page); + } + + mutex_unlock(&context->db_page_mutex); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.c new file mode 100644 index 0000000..a713de7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.c @@ -0,0 +1,2819 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "counters.h" +#include "devx.h" +#include "fs.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +enum { + MATCH_CRITERIA_ENABLE_OUTER_BIT, + MATCH_CRITERIA_ENABLE_MISC_BIT, + MATCH_CRITERIA_ENABLE_INNER_BIT, + MATCH_CRITERIA_ENABLE_MISC2_BIT +}; + +#define HEADER_IS_ZERO(match_criteria, headers) \ + !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ + 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ + +static u8 get_match_criteria_enable(u32 *match_criteria) +{ + u8 match_criteria_enable; + + match_criteria_enable = + (!HEADER_IS_ZERO(match_criteria, outer_headers)) << + MATCH_CRITERIA_ENABLE_OUTER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << + MATCH_CRITERIA_ENABLE_MISC_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, inner_headers)) << + MATCH_CRITERIA_ENABLE_INNER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << + MATCH_CRITERIA_ENABLE_MISC2_BIT; + + return match_criteria_enable; +} + +static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + u8 entry_mask; + u8 entry_val; + int err = 0; + + if (!mask) + goto out; + + entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c, + ip_protocol); + entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v, + ip_protocol); + if (!entry_mask) { + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); + goto out; + } + /* Don't override existing ip protocol */ + if (mask != entry_mask || val != entry_val) + err = -EINVAL; +out: + return err; +} + +static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val, + bool inner) +{ + if (inner) { + MLX5_SET(fte_match_set_misc, + misc_c, inner_ipv6_flow_label, mask); + MLX5_SET(fte_match_set_misc, + misc_v, inner_ipv6_flow_label, val); + } else { + MLX5_SET(fte_match_set_misc, + misc_c, outer_ipv6_flow_label, mask); + MLX5_SET(fte_match_set_misc, + misc_v, outer_ipv6_flow_label, val); + } +} + +static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); +} + +static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) +{ + if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL)) + return -EOPNOTSUPP; + + return 0; +} + +#define LAST_ETH_FIELD vlan_tag +#define LAST_IB_FIELD sl +#define LAST_IPV4_FIELD tos +#define LAST_IPV6_FIELD traffic_class +#define LAST_TCP_UDP_FIELD src_port +#define LAST_TUNNEL_FIELD tunnel_id +#define LAST_FLOW_TAG_FIELD tag_id +#define LAST_DROP_FIELD size +#define LAST_COUNTERS_FIELD counters + +/* Field is the last supported field */ +#define FIELDS_NOT_SUPPORTED(filter, field) \ + 
memchr_inv((void *)&filter.field + sizeof(filter.field), 0, \ + sizeof(filter) - offsetofend(typeof(filter), field)) + +int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, + bool is_egress, + struct mlx5_flow_act *action) +{ + + switch (maction->ib_action.type) { + case IB_FLOW_ACTION_ESP: + if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT)) + return -EINVAL; + /* Currently only AES_GCM keymat is supported by the driver */ + action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; + action->action |= is_egress ? + MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : + MLX5_FLOW_CONTEXT_ACTION_DECRYPT; + return 0; + case IB_FLOW_ACTION_UNSPECIFIED: + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_MODIFY_HEADER) { + if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + return -EINVAL; + action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + action->modify_hdr = + maction->flow_action_raw.modify_hdr; + return 0; + } + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_DECAP) { + if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) + return -EINVAL; + action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + return 0; + } + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) { + if (action->action & + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) + return -EINVAL; + action->action |= + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + action->pkt_reformat = + maction->flow_action_raw.pkt_reformat; + return 0; + } + fallthrough; + default: + return -EOPNOTSUPP; + } +} + +static int parse_flow_attr(struct mlx5_core_dev *mdev, + struct mlx5_flow_spec *spec, + const union ib_flow_spec *ib_spec, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_act *action, u32 prev_type) +{ + struct mlx5_flow_context *flow_context = &spec->flow_context; + u32 *match_c = spec->match_criteria; + u32 *match_v = spec->match_value; + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters); + void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters_2); + void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters_2); + void *headers_c; + void *headers_v; + int match_ipv; + int ret; + + if (ib_spec->type & IB_FLOW_SPEC_INNER) { + headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + inner_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + inner_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version); + } else { + headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + } + + switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { + case IB_FLOW_SPEC_ETH: + if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) + return -EOPNOTSUPP; + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dmac_47_16), + ib_spec->eth.mask.dst_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dmac_47_16), + ib_spec->eth.val.dst_mac); + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + smac_47_16), + ib_spec->eth.mask.src_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + smac_47_16), + ib_spec->eth.val.src_mac); + + if (ib_spec->eth.mask.vlan_tag) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + cvlan_tag, 1); + 
MLX5_SET(fte_match_set_lyr_2_4, headers_v, + cvlan_tag, 1); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_vid, ntohs(ib_spec->eth.val.vlan_tag)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_cfi, + ntohs(ib_spec->eth.mask.vlan_tag) >> 12); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_cfi, + ntohs(ib_spec->eth.val.vlan_tag) >> 12); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + first_prio, + ntohs(ib_spec->eth.mask.vlan_tag) >> 13); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + first_prio, + ntohs(ib_spec->eth.val.vlan_tag) >> 13); + } + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, ntohs(ib_spec->eth.mask.ether_type)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ntohs(ib_spec->eth.val.ether_type)); + break; + case IB_FLOW_SPEC_IPV4: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) + return -EOPNOTSUPP; + + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, MLX5_FS_IPV4_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IP); + } + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.src_ip, + sizeof(ib_spec->ipv4.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.src_ip, + sizeof(ib_spec->ipv4.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.dst_ip, + sizeof(ib_spec->ipv4.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.dst_ip, + sizeof(ib_spec->ipv4.val.dst_ip)); + + set_tos(headers_c, headers_v, + ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); + + if (set_proto(headers_c, headers_v, + ib_spec->ipv4.mask.proto, + ib_spec->ipv4.val.proto)) + return -EINVAL; + break; + case IB_FLOW_SPEC_IPV6: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) + return -EOPNOTSUPP; + + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, MLX5_FS_IPV6_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IPV6); + } + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.src_ip, + sizeof(ib_spec->ipv6.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.src_ip, + sizeof(ib_spec->ipv6.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.dst_ip, + sizeof(ib_spec->ipv6.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.dst_ip, + sizeof(ib_spec->ipv6.val.dst_ip)); + + set_tos(headers_c, headers_v, + ib_spec->ipv6.mask.traffic_class, + ib_spec->ipv6.val.traffic_class); + + if (set_proto(headers_c, headers_v, + ib_spec->ipv6.mask.next_hdr, + ib_spec->ipv6.val.next_hdr)) + return -EINVAL; + + set_flow_label(misc_params_c, misc_params_v, + ntohl(ib_spec->ipv6.mask.flow_label), + 
ntohl(ib_spec->ipv6.val.flow_label), + ib_spec->type & IB_FLOW_SPEC_INNER); + break; + case IB_FLOW_SPEC_ESP: + if (ib_spec->esp.mask.seq) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, + ntohl(ib_spec->esp.mask.spi)); + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, + ntohl(ib_spec->esp.val.spi)); + break; + case IB_FLOW_SPEC_TCP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -EOPNOTSUPP; + + if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_UDP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -EOPNOTSUPP; + + if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_GRE: + if (ib_spec->gre.mask.c_ks_res0_ver) + return -EOPNOTSUPP; + + if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_GRE); + + MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol, + ntohs(ib_spec->gre.mask.protocol)); + MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol, + ntohs(ib_spec->gre.val.protocol)); + + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c, + gre_key.nvgre.hi), + &ib_spec->gre.mask.key, + sizeof(ib_spec->gre.mask.key)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v, + gre_key.nvgre.hi), + &ib_spec->gre.val.key, + sizeof(ib_spec->gre.val.key)); + break; + case IB_FLOW_SPEC_MPLS: + switch (prev_type) { + case IB_FLOW_SPEC_UDP: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_udp), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + break; + case IB_FLOW_SPEC_GRE: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_gre), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_gre), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_gre), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + break; + default: + if (ib_spec->type & IB_FLOW_SPEC_INNER) { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_first_mpls), + 
&ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + inner_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + inner_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } else { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } + } + break; + case IB_FLOW_SPEC_VXLAN_TUNNEL: + if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, + LAST_TUNNEL_FIELD)) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni, + ntohl(ib_spec->tunnel.mask.tunnel_id)); + MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni, + ntohl(ib_spec->tunnel.val.tunnel_id)); + break; + case IB_FLOW_SPEC_ACTION_TAG: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag, + LAST_FLOW_TAG_FIELD)) + return -EOPNOTSUPP; + if (ib_spec->flow_tag.tag_id >= BIT(24)) + return -EINVAL; + + flow_context->flow_tag = ib_spec->flow_tag.tag_id; + flow_context->flags |= FLOW_CONTEXT_HAS_TAG; + break; + case IB_FLOW_SPEC_ACTION_DROP: + if (FIELDS_NOT_SUPPORTED(ib_spec->drop, + LAST_DROP_FIELD)) + return -EOPNOTSUPP; + action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + break; + case IB_FLOW_SPEC_ACTION_HANDLE: + ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act), + flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action); + if (ret) + return ret; + break; + case IB_FLOW_SPEC_ACTION_COUNT: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count, + LAST_COUNTERS_FIELD)) + return -EOPNOTSUPP; + + /* for now support only one counters spec per flow */ + if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) + return -EINVAL; + + action->counters = ib_spec->flow_count.counters; + action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* If a flow could catch both multicast and unicast packets, + * it won't fall into the multicast flow steering table and this rule + * could steal other multicast packets. + */ +static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) +{ + union ib_flow_spec *flow_spec; + + if (ib_attr->type != IB_FLOW_ATTR_NORMAL || + ib_attr->num_of_specs < 1) + return false; + + flow_spec = (union ib_flow_spec *)(ib_attr + 1); + if (flow_spec->type == IB_FLOW_SPEC_IPV4) { + struct ib_flow_spec_ipv4 *ipv4_spec; + + ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec; + if (ipv4_is_multicast(ipv4_spec->val.dst_ip)) + return true; + + return false; + } + + if (flow_spec->type == IB_FLOW_SPEC_ETH) { + struct ib_flow_spec_eth *eth_spec; + + eth_spec = (struct ib_flow_spec_eth *)flow_spec; + return is_multicast_ether_addr(eth_spec->mask.dst_mac) && + is_multicast_ether_addr(eth_spec->val.dst_mac); + } + + return false; +} + +enum valid_spec { + VALID_SPEC_INVALID, + VALID_SPEC_VALID, + VALID_SPEC_NA, +}; + +static bool is_valid_ethertype(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr, + bool check_inner) +{ + union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); + int match_ipv = check_inner ? 
+ MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version) : + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0; + bool ipv4_spec_valid, ipv6_spec_valid; + unsigned int ip_spec_type = 0; + bool has_ethertype = false; + unsigned int spec_index; + bool mask_valid = true; + u16 eth_type = 0; + bool type_valid; + + /* Validate that ethertype is correct */ + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) && + ib_spec->eth.mask.ether_type) { + mask_valid = (ib_spec->eth.mask.ether_type == + htons(0xffff)); + has_ethertype = true; + eth_type = ntohs(ib_spec->eth.val.ether_type); + } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) || + (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) { + ip_spec_type = ib_spec->type; + } + ib_spec = (void *)ib_spec + ib_spec->size; + } + + type_valid = (!has_ethertype) || (!ip_spec_type); + if (!type_valid && mask_valid) { + ipv4_spec_valid = (eth_type == ETH_P_IP) && + (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit)); + ipv6_spec_valid = (eth_type == ETH_P_IPV6) && + (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit)); + + type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) || + (((eth_type == ETH_P_MPLS_UC) || + (eth_type == ETH_P_MPLS_MC)) && match_ipv); + } + + return type_valid; +} + +static bool is_valid_attr(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr) +{ + return is_valid_ethertype(mdev, flow_attr, false) && + is_valid_ethertype(mdev, flow_attr, true); +} + +static void put_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *prio, bool ft_added) +{ + prio->refcount -= !!ft_added; + if (!prio->refcount) { + mlx5_destroy_flow_table(prio->flow_table); + prio->flow_table = NULL; + } +} + +static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) +{ + struct mlx5_ib_flow_handler *handler = container_of(flow_id, + struct mlx5_ib_flow_handler, + ibflow); + struct mlx5_ib_flow_handler *iter, *tmp; + struct mlx5_ib_dev *dev = handler->dev; + + mutex_lock(&dev->flow_db->lock); + + list_for_each_entry_safe(iter, tmp, &handler->list, list) { + mlx5_del_flow_rules(iter->rule); + put_flow_table(dev, iter->prio, true); + list_del(&iter->list); + kfree(iter); + } + + mlx5_del_flow_rules(handler->rule); + put_flow_table(dev, handler->prio, true); + mlx5_ib_counters_clear_description(handler->ibcounters); + mutex_unlock(&dev->flow_db->lock); + if (handler->flow_matcher) + atomic_dec(&handler->flow_matcher->usecnt); + kfree(handler); + + return 0; +} + +static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) +{ + priority *= 2; + if (!dont_trap) + priority++; + return priority; +} + +enum flow_table_type { + MLX5_IB_FT_RX, + MLX5_IB_FT_TX +}; + +#define MLX5_FS_MAX_TYPES 6 +#define MLX5_FS_MAX_ENTRIES BIT(16) + +static bool mlx5_ib_shared_ft_allowed(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); +} + +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, + struct mlx5_flow_namespace *ns, + struct mlx5_ib_flow_prio *prio, + int priority, + int num_entries, int num_groups, + u32 flags) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *ft; + + if (mlx5_ib_shared_ft_allowed(&dev->ib_dev)) + ft_attr.uid = MLX5_SHARED_RESOURCE_UID; + ft_attr.prio = priority; + ft_attr.max_fte = num_entries; + ft_attr.flags = flags; 
+ ft_attr.autogroup.max_num_groups = num_groups; + ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) + return ERR_CAST(ft); + + prio->flow_table = ft; + prio->refcount = 0; + return prio; +} + +static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, + struct ib_flow_attr *flow_attr, + enum flow_table_type ft_type) +{ + bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_namespace *ns = NULL; + enum mlx5_flow_namespace_type fn_type; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_table *ft; + int max_table_size; + int num_entries; + int num_groups; + bool esw_encap; + u32 flags = 0; + int priority; + + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + log_max_ft_size)); + esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != + DEVLINK_ESWITCH_ENCAP_MODE_NONE; + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + if (flow_is_multicast_only(flow_attr) && !dont_trap) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = ib_prio_to_core_prio(flow_attr->priority, + dont_trap); + if (ft_type == MLX5_IB_FT_RX) { + fn_type = MLX5_FLOW_NAMESPACE_BYPASS; + prio = &dev->flow_db->prios[priority]; + if (!dev->is_rep && !esw_encap && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (!dev->is_rep && !esw_encap && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + reformat_l3_tunnel_to_l2)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } else { + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX( + dev->mdev, log_max_ft_size)); + fn_type = MLX5_FLOW_NAMESPACE_EGRESS; + prio = &dev->flow_db->egress_prios[priority]; + if (!dev->is_rep && !esw_encap && + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } + ns = mlx5_get_flow_namespace(dev->mdev, fn_type); + num_entries = MLX5_FS_MAX_ENTRIES; + num_groups = MLX5_FS_MAX_TYPES; + break; + case IB_FLOW_ATTR_ALL_DEFAULT: + case IB_FLOW_ATTR_MC_DEFAULT: + ns = mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_LEFTOVERS); + build_leftovers_ft_param(&priority, &num_entries, &num_groups); + prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; + break; + case IB_FLOW_ATTR_SNIFFER: + if (!MLX5_CAP_FLOWTABLE(dev->mdev, + allow_sniffer_and_nic_rx_shared_tir)) + return ERR_PTR(-EOPNOTSUPP); + + ns = mlx5_get_flow_namespace( + dev->mdev, ft_type == MLX5_IB_FT_RX ? 
+ MLX5_FLOW_NAMESPACE_SNIFFER_RX : + MLX5_FLOW_NAMESPACE_SNIFFER_TX); + + prio = &dev->flow_db->sniffer[ft_type]; + priority = 0; + num_entries = 1; + num_groups = 1; + break; + default: + break; + } + + if (!ns) + return ERR_PTR(-EOPNOTSUPP); + + max_table_size = min_t(int, num_entries, max_table_size); + + ft = prio->flow_table; + if (!ft) + return _get_prio(dev, ns, prio, priority, max_table_size, + num_groups, flags); + + return prio; +} + +enum { + RDMA_RX_ECN_OPCOUNTER_PRIO, + RDMA_RX_CNP_OPCOUNTER_PRIO, +}; + +enum { + RDMA_TX_CNP_OPCOUNTER_PRIO, +}; + +static int set_vhca_port_spec(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_flow_spec *spec) +{ + if (!MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, + ft_field_support.source_vhca_port) || + !MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, + ft_field_support.source_vhca_port)) + return -EOPNOTSUPP; + + MLX5_SET_TO_ONES(fte_match_param, &spec->match_criteria, + misc_parameters.source_vhca_port); + MLX5_SET(fte_match_param, &spec->match_value, + misc_parameters.source_vhca_port, port_num); + + return 0; +} + +static int set_ecn_ce_spec(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_flow_spec *spec, int ipv) +{ + if (!MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, + ft_field_support.outer_ip_version)) + return -EOPNOTSUPP; + + if (mlx5_core_mp_enabled(dev->mdev) && + set_vhca_port_spec(dev, port_num, spec)) + return -EOPNOTSUPP; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ip_ecn); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_ecn, + INET_ECN_CE); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, + ipv); + + spec->match_criteria_enable = + get_match_criteria_enable(spec->match_criteria); + + return 0; +} + +static int set_cnp_spec(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_flow_spec *spec) +{ + if (mlx5_core_mp_enabled(dev->mdev) && + set_vhca_port_spec(dev, port_num, spec)) + return -EOPNOTSUPP; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters.bth_opcode); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters.bth_opcode, + IB_BTH_OPCODE_CNP); + + spec->match_criteria_enable = + get_match_criteria_enable(spec->match_criteria); + + return 0; +} + +int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_op_fc *opfc, + enum mlx5_ib_optional_counter_type type) +{ + enum mlx5_flow_namespace_type fn_type; + int priority, i, err, spec_num; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_destination dst; + struct mlx5_flow_namespace *ns; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_spec *spec; + + spec = kcalloc(MAX_OPFC_RULES, sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + switch (type) { + case MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS: + if (set_ecn_ce_spec(dev, port_num, &spec[0], + MLX5_FS_IPV4_VERSION) || + set_ecn_ce_spec(dev, port_num, &spec[1], + MLX5_FS_IPV6_VERSION)) { + err = -EOPNOTSUPP; + goto free; + } + spec_num = 2; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_ECN_OPCOUNTER_PRIO; + break; + + case MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS: + if (!MLX5_CAP_FLOWTABLE(dev->mdev, + ft_field_support_2_nic_receive_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free; + } + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS; + priority = RDMA_RX_CNP_OPCOUNTER_PRIO; + break; + + case MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS: + if 
(!MLX5_CAP_FLOWTABLE(dev->mdev, + ft_field_support_2_nic_transmit_rdma.bth_opcode) || + set_cnp_spec(dev, port_num, &spec[0])) { + err = -EOPNOTSUPP; + goto free; + } + spec_num = 1; + fn_type = MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS; + priority = RDMA_TX_CNP_OPCOUNTER_PRIO; + break; + + default: + err = -EOPNOTSUPP; + goto free; + } + + ns = mlx5_get_flow_namespace(dev->mdev, fn_type); + if (!ns) { + err = -EOPNOTSUPP; + goto free; + } + + prio = &dev->flow_db->opfcs[type]; + if (!prio->flow_table) { + prio = _get_prio(dev, ns, prio, priority, + dev->num_ports * MAX_OPFC_RULES, 1, 0); + if (IS_ERR(prio)) { + err = PTR_ERR(prio); + goto free; + } + } + + dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dst.counter_id = mlx5_fc_id(opfc->fc); + + flow_act.action = + MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + for (i = 0; i < spec_num; i++) { + opfc->rule[i] = mlx5_add_flow_rules(prio->flow_table, &spec[i], + &flow_act, &dst, 1); + if (IS_ERR(opfc->rule[i])) { + err = PTR_ERR(opfc->rule[i]); + goto del_rules; + } + } + prio->refcount += spec_num; + kfree(spec); + + return 0; + +del_rules: + for (i -= 1; i >= 0; i--) + mlx5_del_flow_rules(opfc->rule[i]); + put_flow_table(dev, prio, false); +free: + kfree(spec); + return err; +} + +void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, + struct mlx5_ib_op_fc *opfc, + enum mlx5_ib_optional_counter_type type) +{ + int i; + + for (i = 0; i < MAX_OPFC_RULES && opfc->rule[i]; i++) { + mlx5_del_flow_rules(opfc->rule[i]); + put_flow_table(dev, &dev->flow_db->opfcs[type], true); + } +} + +static void set_underlay_qp(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + u32 underlay_qpn) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + spec->match_criteria, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + if (underlay_qpn && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + ft_field_support.bth_dst_qp)) { + MLX5_SET(fte_match_set_misc, + misc_params_v, bth_dst_qp, underlay_qpn); + MLX5_SET(fte_match_set_misc, + misc_params_c, bth_dst_qp, 0xffffff); + } +} + +static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + struct mlx5_eswitch_rep *rep) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + void *misc; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(rep->esw, + rep->vport)); + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + } else { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + } +} + +static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst, + u32 underlay_qpn, + struct mlx5_ib_create_flow *ucmd) +{ + struct mlx5_flow_table *ft = ft_prio->flow_table; + struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_destination dest_arr[2] = {}; + struct 
mlx5_flow_destination *rule_dst = dest_arr; + const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); + unsigned int spec_index; + u32 prev_type = 0; + int err = 0; + int dest_num = 0; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + + if (!is_valid_attr(dev->mdev, flow_attr)) + return ERR_PTR(-EINVAL); + + if (dev->is_rep && is_egress) + return ERR_PTR(-EINVAL); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !spec) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + err = parse_flow_attr(dev->mdev, spec, + ib_flow, flow_attr, &flow_act, + prev_type); + if (err < 0) + goto free; + + prev_type = ((union ib_flow_spec *)ib_flow)->type; + ib_flow += ((union ib_flow_spec *)ib_flow)->size; + } + + if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) { + memcpy(&dest_arr[0], dst, sizeof(*dst)); + dest_num++; + } + + if (!flow_is_multicast_only(flow_attr)) + set_underlay_qp(dev, spec, underlay_qpn); + + if (dev->is_rep && flow_attr->type != IB_FLOW_ATTR_SNIFFER) { + struct mlx5_eswitch_rep *rep; + + rep = dev->port[flow_attr->port - 1].rep; + if (!rep) { + err = -EINVAL; + goto free; + } + + mlx5_ib_set_rule_source_port(dev, spec, rep); + } + + spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + struct mlx5_ib_mcounters *mcounters; + + err = mlx5_ib_flow_counters_set_data(flow_act.counters, ucmd); + if (err) + goto free; + + mcounters = to_mcounters(flow_act.counters); + handler->ibcounters = flow_act.counters; + dest_arr[dest_num].type = + MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest_arr[dest_num].counter_id = + mlx5_fc_id(mcounters->hw_cntrs_hndl); + dest_num++; + } + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { + if (!dest_num) + rule_dst = NULL; + } else { + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) + flow_act.action |= + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + if (is_egress) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; + else if (dest_num) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } + + if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG) && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { + mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n", + spec->flow_context.flow_tag, flow_attr->type); + err = -EINVAL; + goto free; + } + handler->rule = mlx5_add_flow_rules(ft, spec, + &flow_act, + rule_dst, dest_num); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + ft_prio->refcount++; + handler->prio = ft_prio; + handler->dev = dev; + + ft_prio->flow_table = ft; +free: + if (err && handler) { + mlx5_ib_counters_clear_description(handler->ibcounters); + kfree(handler); + } + kvfree(spec); + return err ? 
ERR_PTR(err) : handler; +} + +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); +} + +enum { + LEFTOVERS_MC, + LEFTOVERS_UC, +}; + +static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_ucast = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + static struct { + struct ib_flow_attr flow_attr; + struct ib_flow_spec_eth eth_flow; + } leftovers_specs[] = { + [LEFTOVERS_MC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {0x1} } + } + }, + [LEFTOVERS_UC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {} } + } + } + }; + + handler = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_MC].flow_attr, + dst); + if (!IS_ERR(handler) && + flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { + handler_ucast = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_UC].flow_attr, + dst); + if (IS_ERR(handler_ucast)) { + mlx5_del_flow_rules(handler->rule); + ft_prio->refcount--; + kfree(handler); + handler = handler_ucast; + } else { + list_add(&handler_ucast->list, &handler->list); + } + } + + return handler; +} + +static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_rx, + struct mlx5_ib_flow_prio *ft_tx, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_rx; + struct mlx5_ib_flow_handler *handler_tx; + int err; + static const struct ib_flow_attr flow_attr = { + .num_of_specs = 0, + .type = IB_FLOW_ATTR_SNIFFER, + .size = sizeof(flow_attr) + }; + + handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); + if (IS_ERR(handler_rx)) { + err = PTR_ERR(handler_rx); + goto err; + } + + handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); + if (IS_ERR(handler_tx)) { + err = PTR_ERR(handler_tx); + goto err_tx; + } + + list_add(&handler_tx->list, &handler_rx->list); + + return handler_rx; + +err_tx: + mlx5_del_flow_rules(handler_rx->rule); + ft_rx->refcount--; + kfree(handler_rx); +err: + return ERR_PTR(err); +} + +static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_flow_handler *handler = NULL; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_ib_flow_prio *ft_prio_tx = NULL; + struct mlx5_ib_flow_prio *ft_prio; + bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr; + size_t min_ucmd_sz, required_ucmd_sz; + int err; + int underlay_qpn; + + if (udata && udata->inlen) { + min_ucmd_sz = offsetofend(struct mlx5_ib_create_flow, reserved); + if (udata->inlen < min_ucmd_sz) + return ERR_PTR(-EOPNOTSUPP); + + err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz); + if (err) + return ERR_PTR(err); + + /* currently supports only one 
counters data */ + if (ucmd_hdr.ncounters_data > 1) + return ERR_PTR(-EINVAL); + + required_ucmd_sz = min_ucmd_sz + + sizeof(struct mlx5_ib_flow_counters_data) * + ucmd_hdr.ncounters_data; + if (udata->inlen > required_ucmd_sz && + !ib_is_udata_cleared(udata, required_ucmd_sz, + udata->inlen - required_ucmd_sz)) + return ERR_PTR(-EOPNOTSUPP); + + ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL); + if (!ucmd) + return ERR_PTR(-ENOMEM); + + err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz); + if (err) + goto free_ucmd; + } + + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) { + err = -ENOMEM; + goto free_ucmd; + } + + if (flow_attr->flags & + ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP | IB_FLOW_ATTR_FLAGS_EGRESS)) { + err = -EINVAL; + goto free_ucmd; + } + + if (is_egress && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { + err = -EINVAL; + goto free_ucmd; + } + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) { + err = -ENOMEM; + goto free_ucmd; + } + + mutex_lock(&dev->flow_db->lock); + + ft_prio = get_flow_table(dev, flow_attr, + is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); + if (IS_ERR(ft_prio_tx)) { + err = PTR_ERR(ft_prio_tx); + ft_prio_tx = NULL; + goto destroy_ft; + } + } + + if (is_egress) { + dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + if (mqp->is_rss) + dst->tir_num = mqp->rss_qp.tirn; + else + dst->tir_num = mqp->raw_packet_qp.rq.tirn; + } + + switch (flow_attr->type) { + case IB_FLOW_ATTR_NORMAL: + underlay_qpn = (mqp->flags & IB_QP_CREATE_SOURCE_QPN) ? + mqp->underlay_qpn : + 0; + handler = _create_flow_rule(dev, ft_prio, flow_attr, dst, + underlay_qpn, ucmd); + break; + case IB_FLOW_ATTR_ALL_DEFAULT: + case IB_FLOW_ATTR_MC_DEFAULT: + handler = create_leftovers_rule(dev, ft_prio, flow_attr, dst); + break; + case IB_FLOW_ATTR_SNIFFER: + handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); + break; + default: + err = -EINVAL; + goto destroy_ft; + } + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + handler = NULL; + goto destroy_ft; + } + + mutex_unlock(&dev->flow_db->lock); + kfree(dst); + kfree(ucmd); + + return &handler->ibflow; + +destroy_ft: + put_flow_table(dev, ft_prio, false); + if (ft_prio_tx) + put_flow_table(dev, ft_prio_tx, false); +unlock: + mutex_unlock(&dev->flow_db->lock); + kfree(dst); +free_ucmd: + kfree(ucmd); + return ERR_PTR(err); +} + +static struct mlx5_ib_flow_prio * +_get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, + enum mlx5_flow_namespace_type ns_type, + bool mcast) +{ + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio = NULL; + int max_table_size = 0; + bool esw_encap; + u32 flags = 0; + int priority; + + if (mcast) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = ib_prio_to_core_prio(user_priority, false); + + esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != + DEVLINK_ESWITCH_ENCAP_MODE_NONE; + switch (ns_type) { + case MLX5_FLOW_NAMESPACE_BYPASS: + max_table_size = BIT( + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + reformat_l3_tunnel_to_l2) && + !esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + break; + case 
MLX5_FLOW_NAMESPACE_EGRESS: + max_table_size = BIT( + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size)); + if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && + !esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + break; + case MLX5_FLOW_NAMESPACE_FDB_BYPASS: + max_table_size = BIT( + MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size)); + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, + reformat_l3_tunnel_to_l2) && + esw_encap) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + priority = user_priority; + break; + case MLX5_FLOW_NAMESPACE_RDMA_RX: + max_table_size = BIT( + MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size)); + priority = user_priority; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TX: + max_table_size = BIT( + MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size)); + priority = user_priority; + break; + default: + break; + } + + max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); + + ns = mlx5_get_flow_namespace(dev->mdev, ns_type); + if (!ns) + return ERR_PTR(-EOPNOTSUPP); + + switch (ns_type) { + case MLX5_FLOW_NAMESPACE_BYPASS: + prio = &dev->flow_db->prios[priority]; + break; + case MLX5_FLOW_NAMESPACE_EGRESS: + prio = &dev->flow_db->egress_prios[priority]; + break; + case MLX5_FLOW_NAMESPACE_FDB_BYPASS: + prio = &dev->flow_db->fdb[priority]; + break; + case MLX5_FLOW_NAMESPACE_RDMA_RX: + prio = &dev->flow_db->rdma_rx[priority]; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TX: + prio = &dev->flow_db->rdma_tx[priority]; + break; + default: return ERR_PTR(-EINVAL); + } + + if (!prio) + return ERR_PTR(-EINVAL); + + if (prio->flow_table) + return prio; + + return _get_prio(dev, ns, prio, priority, max_table_size, + MLX5_FS_MAX_TYPES, flags); +} + +static struct mlx5_ib_flow_handler * +_create_raw_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct mlx5_flow_destination *dst, + struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_context *flow_context, + struct mlx5_flow_act *flow_act, + void *cmd_in, int inlen, + int dst_num) +{ + struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft = ft_prio->flow_table; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !spec) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + memcpy(spec->match_value, cmd_in, inlen); + memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params, + fs_matcher->mask_len); + spec->match_criteria_enable = fs_matcher->match_criteria_enable; + spec->flow_context = *flow_context; + + handler->rule = mlx5_add_flow_rules(ft, spec, + flow_act, dst, dst_num); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + ft_prio->refcount++; + handler->prio = ft_prio; + handler->dev = dev; + ft_prio->flow_table = ft; + +free: + if (err) + kfree(handler); + kvfree(spec); + return err ? 
ERR_PTR(err) : handler; +} + +static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, + void *match_v) +{ + void *match_c; + void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4; + void *dmac, *dmac_mask; + void *ipv4, *ipv4_mask; + + if (!(fs_matcher->match_criteria_enable & + (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT))) + return false; + + match_c = fs_matcher->matcher_mask.match_params; + match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + + dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, + dmac_47_16); + dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, + dmac_47_16); + + if (is_multicast_ether_addr(dmac) && + is_multicast_ether_addr(dmac_mask)) + return true; + + ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + if (ipv4_is_multicast(*(__be32 *)(ipv4)) && + ipv4_is_multicast(*(__be32 *)(ipv4_mask))) + return true; + + return false; +} + +static struct mlx5_ib_flow_handler *raw_fs_rule_add( + struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, + u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type) +{ + struct mlx5_flow_destination *dst; + struct mlx5_ib_flow_prio *ft_prio; + struct mlx5_ib_flow_handler *handler; + int dst_num = 0; + bool mcast; + int err; + + if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL) + return ERR_PTR(-EOPNOTSUPP); + + if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO) + return ERR_PTR(-ENOMEM); + + dst = kcalloc(2, sizeof(*dst), GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + mcast = raw_fs_is_multicast(fs_matcher, cmd_in); + mutex_lock(&dev->flow_db->lock); + + ft_prio = _get_flow_table(dev, fs_matcher->priority, + fs_matcher->ns_type, mcast); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + + switch (dest_type) { + case MLX5_FLOW_DESTINATION_TYPE_TIR: + dst[dst_num].type = dest_type; + dst[dst_num++].tir_num = dest_id; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; + dst[dst_num++].ft_num = dest_id; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + break; + case MLX5_FLOW_DESTINATION_TYPE_PORT: + dst[dst_num++].type = MLX5_FLOW_DESTINATION_TYPE_PORT; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; + break; + default: + break; + } + + if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dst[dst_num].counter_id = counter_id; + dst_num++; + } + + handler = _create_raw_flow_rule(dev, ft_prio, dst_num ? 
dst : NULL, + fs_matcher, flow_context, flow_act, + cmd_in, inlen, dst_num); + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + goto destroy_ft; + } + + mutex_unlock(&dev->flow_db->lock); + atomic_inc(&fs_matcher->usecnt); + handler->flow_matcher = fs_matcher; + + kfree(dst); + + return handler; + +destroy_ft: + put_flow_table(dev, ft_prio, false); +unlock: + mutex_unlock(&dev->flow_db->lock); + kfree(dst); + + return ERR_PTR(err); +} + +static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags) +{ + u32 flags = 0; + + if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA) + flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA; + + return flags; +} + +#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED \ + MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA +static struct ib_flow_action * +mlx5_ib_create_flow_action_esp(struct ib_device *device, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *mdev = to_mdev(device); + struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm; + struct mlx5_accel_esp_xfrm_attrs accel_attrs = {}; + struct mlx5_ib_flow_action *action; + u64 action_flags; + u64 flags; + int err = 0; + + err = uverbs_get_flags64( + &action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1)); + if (err) + return ERR_PTR(err); + + flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags); + + /* We current only support a subset of the standard features. Only a + * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn + * (with overlap). Full offload mode isn't supported. + */ + if (!attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT))) + return ERR_PTR(-EOPNOTSUPP); + + if (attr->keymat->protocol != + IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM) + return ERR_PTR(-EOPNOTSUPP); + + aes_gcm = &attr->keymat->keymat.aes_gcm; + + if (aes_gcm->icv_len != 16 || + aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ) + return ERR_PTR(-EOPNOTSUPP); + + action = kmalloc(sizeof(*action), GFP_KERNEL); + if (!action) + return ERR_PTR(-ENOMEM); + + action->esp_aes_gcm.ib_flags = attr->flags; + memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key, + sizeof(accel_attrs.keymat.aes_gcm.aes_key)); + accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8; + memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt, + sizeof(accel_attrs.keymat.aes_gcm.salt)); + memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv, + sizeof(accel_attrs.keymat.aes_gcm.seq_iv)); + accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8; + accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ; + accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT) + accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT; + + action->esp_aes_gcm.ctx = + mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags); + if (IS_ERR(action->esp_aes_gcm.ctx)) { + err = PTR_ERR(action->esp_aes_gcm.ctx); + goto err_parse; + } + + 
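+	/* xfrm context created; record the accepted ESP flags before returning the action to the uverbs layer. */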
action->esp_aes_gcm.ib_flags = attr->flags; + + return &action->ib_action; + +err_parse: + kfree(action); + return ERR_PTR(err); +} + +static int +mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action, + const struct ib_flow_action_attrs_esp *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + struct mlx5_accel_esp_xfrm_attrs accel_attrs; + int err = 0; + + if (attr->keymat || attr->replay || attr->encap || + attr->spi || attr->seq || attr->tfc_pad || + attr->hard_limit_pkts || + (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))) + return -EOPNOTSUPP; + + /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can + * be modified. + */ + if (!(maction->esp_aes_gcm.ib_flags & + IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) && + attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED | + IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)) + return -EINVAL; + + memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs, + sizeof(accel_attrs)); + + accel_attrs.esn = attr->esn; + if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW) + accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + else + accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + + err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx, + &accel_attrs); + if (err) + return err; + + maction->esp_aes_gcm.ib_flags &= + ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + maction->esp_aes_gcm.ib_flags |= + attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW; + + return 0; +} + +static void destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) +{ + switch (maction->flow_action_raw.sub_type) { + case MLX5_IB_FLOW_ACTION_MODIFY_HEADER: + mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.modify_hdr); + break; + case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT: + mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.pkt_reformat); + break; + case MLX5_IB_FLOW_ACTION_DECAP: + break; + default: + break; + } +} + +static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action) +{ + struct mlx5_ib_flow_action *maction = to_mflow_act(action); + + switch (action->type) { + case IB_FLOW_ACTION_ESP: + /* + * We only support aes_gcm by now, so we implicitly know this is + * the underline crypto. 
+ */ + mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); + break; + case IB_FLOW_ACTION_UNSPECIFIED: + destroy_flow_action_raw(maction); + break; + default: + WARN_ON(true); + break; + } + + kfree(maction); + return 0; +} + +static int +mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, + enum mlx5_flow_namespace_type *namespace) +{ + switch (table_type) { + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX: + *namespace = MLX5_FLOW_NAMESPACE_BYPASS; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX: + *namespace = MLX5_FLOW_NAMESPACE_EGRESS; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB: + *namespace = MLX5_FLOW_NAMESPACE_FDB_BYPASS; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_RX; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TX; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { + [MLX5_IB_FLOW_TYPE_NORMAL] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + .u.ptr = { + .len = sizeof(u16), /* data is priority */ + .min_len = sizeof(u16), + } + }, + [MLX5_IB_FLOW_TYPE_SNIFFER] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [MLX5_IB_FLOW_TYPE_ALL_DEFAULT] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [MLX5_IB_FLOW_TYPE_MC_DEFAULT] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, +}; + +static bool is_flow_dest(void *obj, int *dest_id, int *dest_type) +{ + struct devx_obj *devx_obj = obj; + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); + + switch (opcode) { + case MLX5_CMD_OP_DESTROY_TIR: + *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + *dest_id = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, + obj_id); + return true; + + case MLX5_CMD_OP_DESTROY_FLOW_TABLE: + *dest_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + *dest_id = MLX5_GET(destroy_flow_table_in, devx_obj->dinbox, + table_id); + return true; + default: + return false; + } +} + +static int get_dests(struct uverbs_attr_bundle *attrs, + struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id, + int *dest_type, struct ib_qp **qp, u32 *flags) +{ + bool dest_devx, dest_qp; + void *devx_obj; + int err; + + dest_devx = uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); + dest_qp = uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); + + *flags = 0; + err = uverbs_get_flags32(flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS | + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP); + if (err) + return err; + + /* Both flags are not allowed */ + if (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS && + *flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP) + return -EINVAL; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { + if (dest_devx && (dest_qp || *flags)) + return -EINVAL; + else if (dest_qp && *flags) + return -EINVAL; + } + + /* Allow only DEVX object, drop as dest for FDB */ + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB_BYPASS && + !(dest_devx || (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP))) + return -EINVAL; + + /* Allow only DEVX object or QP as dest when inserting to RDMA_RX */ + if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + ((!dest_devx && !dest_qp) || (dest_devx && dest_qp))) + return -EINVAL; + + *qp = NULL; + if (dest_devx) { + devx_obj = + uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); + + /* Verify that the given DEVX object is a flow + * 
steering destination. + */ + if (!is_flow_dest(devx_obj, dest_id, dest_type)) + return -EINVAL; + /* Allow only flow table as dest when inserting to FDB or RDMA_RX */ + if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB_BYPASS || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && + *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + return -EINVAL; + } else if (dest_qp) { + struct mlx5_ib_qp *mqp; + + *qp = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); + if (IS_ERR(*qp)) + return PTR_ERR(*qp); + + if ((*qp)->qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + mqp = to_mqp(*qp); + if (mqp->is_rss) + *dest_id = mqp->rss_qp.tirn; + else + *dest_id = mqp->raw_packet_qp.rq.tirn; + *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + } else if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) && + !(*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)) { + *dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; + } + + if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR && + (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS || + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)) + return -EINVAL; + + return 0; +} + +static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) +{ + struct devx_obj *devx_obj = obj; + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); + + if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) { + + if (offset && offset >= devx_obj->flow_counter_bulk_size) + return false; + + *counter_id = MLX5_GET(dealloc_flow_counter_in, + devx_obj->dinbox, + flow_counter_id); + *counter_id += offset; + return true; + } + + return false; +} + +#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2 +static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_flow_context flow_context = {.flow_tag = + MLX5_FS_DEFAULT_FLOW_TAG}; + u32 *offset_attr, offset = 0, counter_id = 0; + int dest_id, dest_type = -1, inlen, len, ret, i; + struct mlx5_ib_flow_handler *flow_handler; + struct mlx5_ib_flow_matcher *fs_matcher; + struct ib_uobject **arr_flow_actions; + struct ib_uflow_resources *uflow_res; + struct mlx5_flow_act flow_act = {}; + struct ib_qp *qp = NULL; + void *devx_obj, *cmd_in; + struct ib_uobject *uobj; + struct mlx5_ib_dev *dev; + u32 flags; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + fs_matcher = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); + dev = mlx5_udata_to_mdev(&attrs->driver_udata); + + if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &flags)) + return -EINVAL; + + if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS; + + if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + + len = uverbs_attr_get_uobjs_arr(attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); + if (len) { + devx_obj = arr_flow_actions[0]->object; + + if (uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET)) { + + int num_offsets = uverbs_attr_ptr_get_array_size( + attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + sizeof(u32)); + + if (num_offsets != 1) + return -EINVAL; + + offset_attr = uverbs_attr_get_alloced_ptr( + attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET); + offset = *offset_attr; + } + + if (!is_flow_counter(devx_obj, offset, &counter_id)) + return -EINVAL; + + flow_act.action |= 
MLX5_FLOW_CONTEXT_ACTION_COUNT; + } + + cmd_in = uverbs_attr_get_alloced_ptr( + attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); + inlen = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); + + uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); + if (!uflow_res) + return -ENOMEM; + + len = uverbs_attr_get_uobjs_arr(attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); + for (i = 0; i < len; i++) { + struct mlx5_ib_flow_action *maction = + to_mflow_act(arr_flow_actions[i]->object); + + ret = parse_flow_flow_action(maction, false, &flow_act); + if (ret) + goto err_out; + flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE, + arr_flow_actions[i]->object); + } + + ret = uverbs_copy_from(&flow_context.flow_tag, attrs, + MLX5_IB_ATTR_CREATE_FLOW_TAG); + if (!ret) { + if (flow_context.flow_tag >= BIT(24)) { + ret = -EINVAL; + goto err_out; + } + flow_context.flags |= FLOW_CONTEXT_HAS_TAG; + } + + flow_handler = + raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act, + counter_id, cmd_in, inlen, dest_id, dest_type); + if (IS_ERR(flow_handler)) { + ret = PTR_ERR(flow_handler); + goto err_out; + } + + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res); + + return 0; +err_out: + ib_uverbs_flow_resources_free(uflow_res); + return ret; +} + +static int flow_matcher_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_flow_matcher *obj = uobject->object; + + if (atomic_read(&obj->usecnt)) + return -EBUSY; + + kfree(obj); + return 0; +} + +static int steering_anchor_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_steering_anchor *obj = uobject->object; + + if (atomic_read(&obj->usecnt)) + return -EBUSY; + + mutex_lock(&obj->dev->flow_db->lock); + put_flow_table(obj->dev, obj->ft_prio, true); + mutex_unlock(&obj->dev->flow_db->lock); + + kfree(obj); + return 0; +} + +static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, + struct mlx5_ib_flow_matcher *obj) +{ + enum mlx5_ib_uapi_flow_table_type ft_type = + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX; + u32 flags; + int err; + + /* New users should use MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE and older + * users should switch to it. 
We leave this to not break userspace + */ + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE) && + uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) + return -EINVAL; + + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE)) { + err = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE); + if (err) + return err; + + err = mlx5_ib_ft_type_to_namespace(ft_type, &obj->ns_type); + if (err) + return err; + + return 0; + } + + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) { + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + IB_FLOW_ATTR_FLAGS_EGRESS); + if (err) + return err; + + if (flags) + return mlx5_ib_ft_type_to_namespace( + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, + &obj->ns_type); + } + + obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS; + + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE); + struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata); + struct mlx5_ib_flow_matcher *obj; + int err; + + obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + obj->mask_len = uverbs_attr_get_len( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); + err = uverbs_copy_from(&obj->matcher_mask, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); + if (err) + goto end; + + obj->flow_type = uverbs_attr_get_enum_id( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE); + + if (obj->flow_type == MLX5_IB_FLOW_TYPE_NORMAL) { + err = uverbs_copy_from(&obj->priority, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE); + if (err) + goto end; + } + + err = uverbs_copy_from(&obj->match_criteria_enable, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA); + if (err) + goto end; + + err = mlx5_ib_matcher_ns(attrs, obj); + if (err) + goto end; + + if (obj->ns_type == MLX5_FLOW_NAMESPACE_FDB_BYPASS && + mlx5_eswitch_mode(dev->mdev) != MLX5_ESWITCH_OFFLOADS) { + err = -EINVAL; + goto end; + } + + uobj->object = obj; + obj->mdev = dev->mdev; + atomic_set(&obj->usecnt, 0); + return 0; + +end: + kfree(obj); + return err; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE); + struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata); + enum mlx5_ib_uapi_flow_table_type ib_uapi_ft_type; + enum mlx5_flow_namespace_type ns_type; + struct mlx5_ib_steering_anchor *obj; + struct mlx5_ib_flow_prio *ft_prio; + u16 priority; + u32 ft_id; + int err; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + err = uverbs_get_const(&ib_uapi_ft_type, attrs, + MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE); + if (err) + return err; + + err = mlx5_ib_ft_type_to_namespace(ib_uapi_ft_type, &ns_type); + if (err) + return err; + + err = uverbs_copy_from(&priority, attrs, + MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY); + if (err) + return err; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + mutex_lock(&dev->flow_db->lock); + ft_prio = _get_flow_table(dev, priority, ns_type, 0); + if (IS_ERR(ft_prio)) { + mutex_unlock(&dev->flow_db->lock); + err = PTR_ERR(ft_prio); + goto free_obj; + } + + ft_prio->refcount++; + ft_id = mlx5_flow_table_id(ft_prio->flow_table); + mutex_unlock(&dev->flow_db->lock); + + err = uverbs_copy_to(attrs, 
MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, + &ft_id, sizeof(ft_id)); + if (err) + goto put_flow_table; + + uobj->object = obj; + obj->dev = dev; + obj->ft_prio = ft_prio; + atomic_set(&obj->usecnt, 0); + + return 0; + +put_flow_table: + mutex_lock(&dev->flow_db->lock); + put_flow_table(dev, ft_prio, true); + mutex_unlock(&dev->flow_db->lock); +free_obj: + kfree(obj); + + return err; +} + +static struct ib_flow_action * +mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev, + enum mlx5_ib_uapi_flow_table_type ft_type, + u8 num_actions, void *in) +{ + enum mlx5_flow_namespace_type namespace; + struct mlx5_ib_flow_action *maction; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ERR_PTR(-EINVAL); + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return ERR_PTR(-ENOMEM); + + maction->flow_action_raw.modify_hdr = + mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in); + + if (IS_ERR(maction->flow_action_raw.modify_hdr)) { + ret = PTR_ERR(maction->flow_action_raw.modify_hdr); + kfree(maction); + return ERR_PTR(ret); + } + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_MODIFY_HEADER; + maction->flow_action_raw.dev = dev; + + return &maction->ib_action; +} + +static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + max_modify_header_actions) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, + max_modify_header_actions) || + MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, + max_modify_header_actions); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE); + struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata); + enum mlx5_ib_uapi_flow_table_type ft_type; + struct ib_flow_action *action; + int num_actions; + void *in; + int ret; + + if (!mlx5_ib_modify_header_supported(mdev)) + return -EOPNOTSUPP; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM); + + num_actions = uverbs_attr_ptr_get_array_size( + attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)); + if (num_actions < 0) + return num_actions; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE); + if (ret) + return ret; + action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in); + if (IS_ERR(action)) + return PTR_ERR(action); + + uverbs_flow_action_fill_action(action, uobj, &mdev->ib_dev, + IB_FLOW_ACTION_UNSPECIFIED); + + return 0; +} + +static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, + u8 packet_reformat_type, + u8 ft_type) +{ + switch (packet_reformat_type) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE(ibdev->mdev, + encap_general_header); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev, + reformat_l2_to_l3_tunnel); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, + reformat_l3_tunnel_to_l2); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: + if (ft_type == 
MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap); + break; + default: + break; + } + + return false; +} + +static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt) +{ + switch (dv_prt) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + *prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_ib_flow_action_create_packet_reformat_ctx( + struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_action *maction, + u8 ft_type, u8 dv_prt, + void *in, size_t len) +{ + struct mlx5_pkt_reformat_params reformat_params; + enum mlx5_flow_namespace_type namespace; + u8 prm_prt; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ret; + + ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt); + if (ret) + return ret; + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = prm_prt; + reformat_params.size = len; + reformat_params.data = in; + maction->flow_action_raw.pkt_reformat = + mlx5_packet_reformat_alloc(dev->mdev, &reformat_params, + namespace); + if (IS_ERR(maction->flow_action_raw.pkt_reformat)) { + ret = PTR_ERR(maction->flow_action_raw.pkt_reformat); + return ret; + } + + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT; + maction->flow_action_raw.dev = dev; + + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE); + struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata); + enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt; + enum mlx5_ib_uapi_flow_table_type ft_type; + struct mlx5_ib_flow_action *maction; + int ret; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE); + if (ret) + return ret; + + ret = uverbs_get_const(&dv_prt, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE); + if (ret) + return ret; + + if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type)) + return -EOPNOTSUPP; + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return -ENOMEM; + + if (dv_prt == + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) { + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_DECAP; + maction->flow_action_raw.dev = mdev; + } else { + void *in; + int len; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto free_maction; + } + + len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + + ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev, + maction, ft_type, dv_prt, in, len); + if (ret) + goto free_maction; + } + + uverbs_flow_action_fill_action(&maction->ib_action, uobj, &mdev->ib_dev, + IB_FLOW_ACTION_UNSPECIFIED); + return 0; + +free_maction: + kfree(maction); + return ret; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_CREATE_FLOW, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, + UVERBS_OBJECT_FLOW, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + 
MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, + UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_READ), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ), + UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_READ, 1, + MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ, 1, 1, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + UVERBS_ATTR_MIN_SIZE(sizeof(u32)), + UA_OPTIONAL, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + enum mlx5_ib_create_flow_flags, + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DESTROY_FLOW, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, + UVERBS_OBJECT_FLOW, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +ADD_UVERBS_METHODS(mlx5_ib_fs, + UVERBS_OBJECT_FLOW, + &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW), + &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES( + set_add_copy_action_in_auto)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, + UVERBS_ATTR_MIN_SIZE(1), + UA_ALLOC_AND_COPY, + UA_OPTIONAL), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + enum mlx5_ib_uapi_flow_action_packet_reformat_type, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + +ADD_UVERBS_METHODS( + mlx5_ib_flow_actions, + UVERBS_OBJECT_FLOW_ACTION, + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_MATCHER_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, + UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)), + UA_MANDATORY), + UVERBS_ATTR_ENUM_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, + mlx5_ib_flow_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + UVERBS_ATTR_TYPE(u8), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + enum ib_flow_flags, + UA_OPTIONAL), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_OPTIONAL)); + 
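+/* The create method above pairs with the destroy method and the MLX5_IB_OBJECT_FLOW_MATCHER object declared below. */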
+DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_TYPE_ALLOC_IDR(flow_matcher_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_STEERING_ANCHOR_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE, + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE, + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_TYPE_ALLOC_IDR(steering_anchor_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY)); + +const struct uapi_definition mlx5_ib_flow_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_FLOW_MATCHER), + UAPI_DEF_CHAIN_OBJ_TREE( + UVERBS_OBJECT_FLOW, + &mlx5_ib_fs), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, + &mlx5_ib_flow_actions), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_STEERING_ANCHOR, + UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)), + {}, +}; + +static const struct ib_device_ops flow_ops = { + .create_flow = mlx5_ib_create_flow, + .destroy_flow = mlx5_ib_destroy_flow, + .destroy_flow_action = mlx5_ib_destroy_flow_action, +}; + +static const struct ib_device_ops flow_ipsec_ops = { + .create_flow_action_esp = mlx5_ib_create_flow_action_esp, + .modify_flow_action_esp = mlx5_ib_modify_flow_action_esp, +}; + +int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) +{ + dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); + + if (!dev->flow_db) + return -ENOMEM; + + mutex_init(&dev->flow_db->lock); + + ib_set_device_ops(&dev->ib_dev, &flow_ops); + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) + ib_set_device_ops(&dev->ib_dev, &flow_ipsec_ops); + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.h new file mode 100644 index 0000000..ad320ad --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/fs.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. 
+ */ + +#ifndef _MLX5_IB_FS_H +#define _MLX5_IB_FS_H + +#include "mlx5_ib.h" + +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int mlx5_ib_fs_init(struct mlx5_ib_dev *dev); +#else +static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) +{ + dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL); + + if (!dev->flow_db) + return -ENOMEM; + + mutex_init(&dev->flow_db->lock); + return 0; +} +#endif +static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) +{ + kfree(dev->flow_db); +} +#endif /* _MLX5_IB_FS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/gsi.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/gsi.c new file mode 100644 index 0000000..43e6ddc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/gsi.c @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "mlx5_ib.h" + +struct mlx5_ib_gsi_wr { + struct ib_cqe cqe; + struct mlx5_ib_wc mwc; + bool completed:1; +}; + +static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); +} + +/* Call with gsi->lock locked */ +static void generate_completions(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_gsi_qp *gsi = &mqp->gsi; + struct ib_cq *gsi_cq = mqp->ibqp.send_cq; + struct mlx5_ib_gsi_wr *wr; + u32 index; + + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; + index++) { + wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr]; + + if (!wr->completed) + break; + + mlx5_ib_generate_wc(gsi_cq, &wr->mwc); + wr->completed = false; + } + + gsi->outstanding_ci = index; +} + +static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = cq->cq_context; + struct mlx5_ib_gsi_wr *wr = + container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); + struct mlx5_ib_qp *mqp = container_of(gsi, struct mlx5_ib_qp, gsi); + u64 wr_id; + unsigned long flags; + + spin_lock_irqsave(&gsi->lock, flags); + wr->completed = true; + wr_id = wr->mwc.wc.wr_id; + wr->mwc.wc = *wc; + wr->mwc.wc.wr_id = wr_id; + wr->mwc.wc.qp = &mqp->ibqp; + + generate_completions(mqp); + spin_unlock_irqrestore(&gsi->lock, flags); +} + +int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, + struct ib_qp_init_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_gsi_qp *gsi; + struct ib_qp_init_attr hw_init_attr = *attr; + const u8 port_num = attr->port_num; + int num_qps = 0; + int ret, i; + + if (mlx5_ib_deth_sqpn_cap(dev)) { + if (MLX5_CAP_GEN(dev->mdev, + port_type) == MLX5_CAP_PORT_TYPE_IB) + num_qps = pd->device->attrs.max_pkeys; + else if (dev->lag_active) + num_qps = dev->lag_ports; + } + + gsi = &mqp->gsi; + gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); + if (!gsi->tx_qps) + return -ENOMEM; + + gsi->outstanding_wrs = + kcalloc(attr->cap.max_send_wr, sizeof(*gsi->outstanding_wrs), + GFP_KERNEL); + if (!gsi->outstanding_wrs) { + ret = -ENOMEM; + goto err_free_tx; + } + + for (i = 0; i < attr->cap.max_send_wr; i++) + atomic_set(&gsi->outstanding_wrs[i].mwc.in_use, 0); + + if (dev->devr.ports[port_num - 1].gsi) { + mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", + port_num); + ret = -EBUSY; + goto err_free_wrs; + } + gsi->num_qps = num_qps; + spin_lock_init(&gsi->lock); + + gsi->cap = attr->cap; + gsi->port_num = port_num; + + gsi->cq = ib_alloc_cq(pd->device, gsi, attr->cap.max_send_wr, 0, + IB_POLL_SOFTIRQ); + if (IS_ERR(gsi->cq)) { + mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", + PTR_ERR(gsi->cq)); + ret = PTR_ERR(gsi->cq); + goto err_free_wrs; + } + + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + hw_init_attr.send_cq = gsi->cq; + if (num_qps) { + hw_init_attr.cap.max_send_wr = 0; + hw_init_attr.cap.max_send_sge = 0; + hw_init_attr.cap.max_inline_data = 0; + } + + gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); + if (IS_ERR(gsi->rx_qp)) { + mlx5_ib_warn(dev, "unable to create hardware GSI QP. 
error %ld\n", + PTR_ERR(gsi->rx_qp)); + ret = PTR_ERR(gsi->rx_qp); + goto err_destroy_cq; + } + + dev->devr.ports[attr->port_num - 1].gsi = gsi; + return 0; + +err_destroy_cq: + ib_free_cq(gsi->cq); +err_free_wrs: + kfree(gsi->outstanding_wrs); +err_free_tx: + kfree(gsi->tx_qps); + return ret; +} + +int mlx5_ib_destroy_gsi(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); + struct mlx5_ib_gsi_qp *gsi = &mqp->gsi; + const int port_num = gsi->port_num; + int qp_index; + int ret; + + ret = ib_destroy_qp(gsi->rx_qp); + if (ret) { + mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n", + ret); + return ret; + } + dev->devr.ports[port_num - 1].gsi = NULL; + gsi->rx_qp = NULL; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { + if (!gsi->tx_qps[qp_index]) + continue; + WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); + gsi->tx_qps[qp_index] = NULL; + } + + ib_free_cq(gsi->cq); + + kfree(gsi->outstanding_wrs); + kfree(gsi->tx_qps); + return 0; +} + +static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_pd *pd = gsi->rx_qp->pd; + struct ib_qp_init_attr init_attr = { + .event_handler = gsi->rx_qp->event_handler, + .qp_context = gsi->rx_qp->qp_context, + .send_cq = gsi->cq, + .recv_cq = gsi->rx_qp->recv_cq, + .cap = { + .max_send_wr = gsi->cap.max_send_wr, + .max_send_sge = gsi->cap.max_send_sge, + .max_inline_data = gsi->cap.max_inline_data, + }, + .qp_type = IB_QPT_UD, + .create_flags = MLX5_IB_QP_CREATE_SQPN_QP1, + }; + + return ib_create_qp(pd, &init_attr); +} + +static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, + u16 pkey_index) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct ib_qp_attr attr; + int mask; + int ret; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = pkey_index; + attr.qkey = IB_QP1_QKEY; + attr.port_num = gsi->port_num; + ret = ib_modify_qp(qp, &attr, mask); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", + qp->qp_num, ret); + return ret; + } + + return 0; +} + +static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) +{ + struct ib_device *device = gsi->rx_qp->device; + struct mlx5_ib_dev *dev = to_mdev(device); + int pkey_index = qp_index; + struct mlx5_ib_qp *mqp; + struct ib_qp *qp; + unsigned long flags; + u16 pkey; + int ret; + + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + pkey_index = 0; + + ret = ib_query_pkey(device, gsi->port_num, pkey_index, &pkey); + if (ret) { + mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", + gsi->port_num, qp_index); + return; + } + + if (!pkey) { + mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. Skipping.\n", + gsi->port_num, qp_index); + return; + } + + spin_lock_irqsave(&gsi->lock, flags); + qp = gsi->tx_qps[qp_index]; + spin_unlock_irqrestore(&gsi->lock, flags); + if (qp) { + mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. 
Skipping\n", + gsi->port_num, qp_index); + return; + } + + qp = create_gsi_ud_qp(gsi); + if (IS_ERR(qp)) { + mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", + PTR_ERR(qp)); + return; + } + + mqp = to_mqp(qp); + if (dev->lag_active) + mqp->gsi_lag_port = qp_index + 1; + ret = modify_to_rts(gsi, qp, pkey_index); + if (ret) + goto err_destroy_qp; + + spin_lock_irqsave(&gsi->lock, flags); + WARN_ON_ONCE(gsi->tx_qps[qp_index]); + gsi->tx_qps[qp_index] = qp; + spin_unlock_irqrestore(&gsi->lock, flags); + + return; + +err_destroy_qp: + WARN_ON_ONCE(qp); +} + +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_gsi_qp *gsi = &mqp->gsi; + u16 qp_index; + int ret; + + mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); + + ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + if (ret) { + mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); + return ret; + } + + if (to_mqp(gsi->rx_qp)->state != IB_QPS_RTS) + return 0; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); + return 0; +} + +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_gsi_qp *gsi = &mqp->gsi; + int ret; + + ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); + qp_init_attr->cap = gsi->cap; + return ret; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_qp *mqp, + struct ib_ud_wr *wr, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = &mqp->gsi; + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_gsi_wr *gsi_wr; + + if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { + mlx5_ib_warn(dev, "no available GSI work request.\n"); + return -ENOMEM; + } + + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi % + gsi->cap.max_send_wr]; + if (atomic_read(&gsi_wr->mwc.in_use)) { + mlx5_ib_warn(dev, "no available GSI work completion.\n"); + return -ENOMEM; + } + + gsi->outstanding_pi++; + + if (!wc) { + memset(&gsi_wr->mwc.wc, 0, sizeof(gsi_wr->mwc.wc)); + gsi_wr->mwc.wc.pkey_index = wr->pkey_index; + gsi_wr->mwc.wc.wr_id = wr->wr.wr_id; + } else { + gsi_wr->mwc.wc = *wc; + gsi_wr->completed = true; + } + + gsi_wr->cqe.done = &handle_single_completion; + wr->wr.wr_cqe = &gsi_wr->cqe; + + return 0; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_qp *mqp, struct ib_ud_wr *wr) +{ + struct ib_wc wc = { + { .wr_id = wr->wr.wr_id }, + .status = IB_WC_SUCCESS, + .opcode = IB_WC_SEND, + .qp = &mqp->ibqp, + }; + int ret; + + ret = mlx5_ib_add_outstanding_wr(mqp, wr, &wc); + if (ret) + return ret; + + generate_completions(mqp); + + return 0; +} + +/* Call with gsi->lock locked */ +static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_ah *ah = to_mah(wr->ah); + int qp_index = wr->pkey_index; + + if (!gsi->num_qps) + return gsi->rx_qp; + + if (dev->lag_active && ah->xmit_port) + qp_index = ah->xmit_port - 1; + + if (qp_index >= gsi->num_qps) + return NULL; + + return gsi->tx_qps[qp_index]; +} + +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct 
mlx5_ib_gsi_qp *gsi = &mqp->gsi; + struct ib_qp *tx_qp; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ib_ud_wr cur_wr = *ud_wr(wr); + + cur_wr.wr.next = NULL; + + spin_lock_irqsave(&gsi->lock, flags); + tx_qp = get_tx_qp(gsi, &cur_wr); + if (!tx_qp) { + ret = mlx5_ib_gsi_silent_drop(mqp, &cur_wr); + if (ret) + goto err; + spin_unlock_irqrestore(&gsi->lock, flags); + continue; + } + + ret = mlx5_ib_add_outstanding_wr(mqp, &cur_wr, NULL); + if (ret) + goto err; + + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); + if (ret) { + /* Undo the effect of adding the outstanding wr */ + gsi->outstanding_pi--; + goto err; + } + spin_unlock_irqrestore(&gsi->lock, flags); + } + + return 0; + +err: + spin_unlock_irqrestore(&gsi->lock, flags); + *bad_wr = wr; + return ret; +} + +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_gsi_qp *gsi = &mqp->gsi; + + return ib_post_recv(gsi->rx_qp, wr, bad_wr); +} + +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) +{ + u16 qp_index; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.c new file mode 100644 index 0000000..be4280e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + */ + +#include +#include "ib_rep.h" +#include "srq.h" + +static int +mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, + struct mlx5_eswitch_rep *rep, + int vport_index) +{ + struct mlx5_ib_dev *ibdev; + + ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); + if (!ibdev) + return -EINVAL; + + ibdev->port[vport_index].rep = rep; + rep->rep_data[REP_IB].priv = ibdev; + write_lock(&ibdev->port[vport_index].roce.netdev_lock); + ibdev->port[vport_index].roce.netdev = + mlx5_ib_get_rep_netdev(rep->esw, rep->vport); + write_unlock(&ibdev->port[vport_index].roce.netdev_lock); + + return 0; +} + +static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); + +static int +mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + u32 num_ports = mlx5_eswitch_get_total_vports(dev); + const struct mlx5_ib_profile *profile; + struct mlx5_core_dev *peer_dev; + struct mlx5_ib_dev *ibdev; + u32 peer_num_ports; + int vport_index; + int ret; + + vport_index = rep->vport_index; + + if (mlx5_lag_is_shared_fdb(dev)) { + peer_dev = mlx5_lag_get_peer_mdev(dev); + peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev); + if (mlx5_lag_is_master(dev)) { + /* Only 1 ib port is the representor for both uplinks */ + num_ports += peer_num_ports - 1; + } else { + if (rep->vport == MLX5_VPORT_UPLINK) + return 0; + vport_index += peer_num_ports; + dev = peer_dev; + } + } + + if (rep->vport == MLX5_VPORT_UPLINK) + profile = &raw_eth_profile; + else + return mlx5_ib_set_vport_rep(dev, rep, vport_index); + + ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev); + if (!ibdev) + return -ENOMEM; + + ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port), + GFP_KERNEL); + if (!ibdev->port) { + ret = -ENOMEM; + goto fail_port; + } + + ibdev->is_rep = true; + ibdev->port[vport_index].rep = rep; + ibdev->port[vport_index].roce.netdev = + mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport); + 
ibdev->mdev = dev; + ibdev->num_ports = num_ports; + + ret = __mlx5_ib_add(ibdev, profile); + if (ret) + goto fail_add; + + rep->rep_data[REP_IB].priv = ibdev; + if (mlx5_lag_is_shared_fdb(dev)) + mlx5_ib_register_peer_vport_reps(dev); + + return 0; + +fail_add: + kfree(ibdev->port); +fail_port: + ib_dealloc_device(&ibdev->ib_dev); + return ret; +} + +static void *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep) +{ + return rep->rep_data[REP_IB].priv; +} + +static void +mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) +{ + struct mlx5_core_dev *mdev = mlx5_eswitch_get_core_dev(rep->esw); + struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep); + int vport_index = rep->vport_index; + struct mlx5_ib_port *port; + + if (WARN_ON(!mdev)) + return; + + if (mlx5_lag_is_shared_fdb(mdev) && + !mlx5_lag_is_master(mdev)) { + struct mlx5_core_dev *peer_mdev; + + if (rep->vport == MLX5_VPORT_UPLINK) + return; + peer_mdev = mlx5_lag_get_peer_mdev(mdev); + vport_index += mlx5_eswitch_get_total_vports(peer_mdev); + } + + if (!dev) + return; + + port = &dev->port[vport_index]; + write_lock(&port->roce.netdev_lock); + port->roce.netdev = NULL; + write_unlock(&port->roce.netdev_lock); + rep->rep_data[REP_IB].priv = NULL; + port->rep = NULL; + + if (rep->vport == MLX5_VPORT_UPLINK) { + struct mlx5_core_dev *peer_mdev; + struct mlx5_eswitch *esw; + + if (mlx5_lag_is_shared_fdb(mdev)) { + peer_mdev = mlx5_lag_get_peer_mdev(mdev); + esw = peer_mdev->priv.eswitch; + mlx5_eswitch_unregister_vport_reps(esw, REP_IB); + } + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); + } +} + +static const struct mlx5_eswitch_rep_ops rep_ops = { + .load = mlx5_ib_vport_rep_load, + .unload = mlx5_ib_vport_rep_unload, + .get_proto_dev = mlx5_ib_rep_to_dev, +}; + +static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev) +{ + struct mlx5_core_dev *peer_mdev = mlx5_lag_get_peer_mdev(mdev); + struct mlx5_eswitch *esw; + + if (!peer_mdev) + return; + + esw = peer_mdev->priv.eswitch; + mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB); +} + +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + u16 vport_num) +{ + return mlx5_eswitch_get_proto_dev(esw, vport_num, REP_ETH); +} + +u32 mlx5_ib_eswitch_vport_match_metadata_enabled(struct mlx5_eswitch *esw) +{ + return mlx5_eswitch_vport_match_metadata_enabled(esw); +} + +u32 mlx5_ib_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, + u16 vport) +{ + return mlx5_eswitch_get_vport_metadata_for_match(esw, vport); +} + +u32 mlx5_ib_eswitch_get_vport_metadata_mask(void) +{ + return mlx5_eswitch_get_vport_metadata_mask(); +} + +struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u32 port) +{ + struct mlx5_eswitch *esw = dev->mdev->priv.eswitch; + struct mlx5_eswitch_rep *rep; + + if (!dev->is_rep || !port) + return NULL; + + if (!dev->port[port - 1].rep) + return ERR_PTR(-EINVAL); + + rep = dev->port[port - 1].rep; + + return mlx5_eswitch_add_send_to_vport_rule(esw, esw, rep, sq->base.mqp.qpn); +} + +static int mlx5r_rep_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = idev->mdev; + struct mlx5_eswitch *esw; + + esw = mdev->priv.eswitch; + mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB); + return 0; +} + +static void mlx5r_rep_remove(struct auxiliary_device *adev) +{ + struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev); + struct 
mlx5_core_dev *mdev = idev->mdev; + struct mlx5_eswitch *esw; + + esw = mdev->priv.eswitch; + mlx5_eswitch_unregister_vport_reps(esw, REP_IB); +} + +static const struct auxiliary_device_id mlx5r_rep_id_table[] = { + { .name = MLX5_ADEV_NAME ".rdma-rep", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, mlx5r_rep_id_table); + +static struct auxiliary_driver mlx5r_rep_driver = { + .name = "rep", + .probe = mlx5r_rep_probe, + .remove = mlx5r_rep_remove, + .id_table = mlx5r_rep_id_table, +}; + +int mlx5r_rep_init(void) +{ + return auxiliary_driver_register(&mlx5r_rep_driver); +} + +void mlx5r_rep_cleanup(void) +{ + auxiliary_driver_unregister(&mlx5r_rep_driver); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.h new file mode 100644 index 0000000..77c6d48 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_rep.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + */ + +#ifndef __MLX5_IB_REP_H__ +#define __MLX5_IB_REP_H__ + +#include +#include "mlx5_ib.h" + +extern const struct mlx5_ib_profile raw_eth_profile; + +#ifdef CONFIG_MLX5_ESWITCH +int mlx5r_rep_init(void); +void mlx5r_rep_cleanup(void); +struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u32 port); +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + u16 vport_num); +u32 mlx5_ib_eswitch_vport_match_metadata_enabled(struct mlx5_eswitch *esw); +u32 mlx5_ib_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, + u16 vport); +u32 mlx5_ib_eswitch_get_vport_metadata_mask(void); +#else /* CONFIG_MLX5_ESWITCH */ +static inline int mlx5r_rep_init(void) { return 0; } +static inline void mlx5r_rep_cleanup(void) {} +static inline +struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u32 port) +{ + return NULL; +} + +static inline +struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, + u16 vport_num) +{ + return NULL; +} + +static inline +u32 mlx5_ib_eswitch_vport_match_metadata_enabled(struct mlx5_eswitch *esw) +{ + return 0; +}; + +static inline +u32 mlx5_ib_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, + u16 vport) +{ + return 0; +}; + +static inline +u32 mlx5_ib_eswitch_get_vport_metadata_mask(void) +{ + return 0; +}; +#endif +#endif /* __MLX5_IB_REP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_virt.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_virt.c new file mode 100644 index 0000000..f2f6287 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/ib_virt.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "mlx5_ib.h" + +static inline u32 mlx_to_net_policy(enum port_state_policy mlx_policy) +{ + switch (mlx_policy) { + case MLX5_POLICY_DOWN: + return IFLA_VF_LINK_STATE_DISABLE; + case MLX5_POLICY_UP: + return IFLA_VF_LINK_STATE_ENABLE; + case MLX5_POLICY_FOLLOW: + return IFLA_VF_LINK_STATE_AUTO; + default: + return __IFLA_VF_LINK_STATE_MAX; + } +} + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u32 port, + struct ifla_vf_info *info) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(mdev, 1, 1, vf + 1, rep); + if (err) { + mlx5_ib_warn(dev, "failed to query port policy for vf %d (%d)\n", + vf, err); + goto free; + } + memset(info, 0, sizeof(*info)); + info->linkstate = mlx_to_net_policy(rep->policy); + if (info->linkstate == __IFLA_VF_LINK_STATE_MAX) + err = -EINVAL; + +free: + kfree(rep); + return err; +} + +static inline enum port_state_policy net_to_mlx_policy(int policy) +{ + switch (policy) { + case IFLA_VF_LINK_STATE_DISABLE: + return MLX5_POLICY_DOWN; + case IFLA_VF_LINK_STATE_ENABLE: + return MLX5_POLICY_UP; + case IFLA_VF_LINK_STATE_AUTO: + return MLX5_POLICY_FOLLOW; + default: + return MLX5_POLICY_INVALID; + } +} + +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u32 port, int state) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->policy = net_to_mlx_policy(state); + if (in->policy == MLX5_POLICY_INVALID) { + err = -EINVAL; + goto out; + } + in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].policy = in->policy; + +out: + kfree(in); + return err; +} + +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u32 port, struct ifla_vf_stats *stats) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *dev; + void *out; + int err; + + dev = to_mdev(device); + mdev = dev->mdev; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_vport_counter(mdev, true, vf, port, out); + if (err) + goto ex; + + stats->rx_packets = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.packets); + stats->tx_packets = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.packets); + stats->rx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_unicast.octets); + 
stats->tx_bytes = MLX5_GET64_PR(query_vport_counter_out, out, transmitted_ib_unicast.octets); + stats->multicast = MLX5_GET64_PR(query_vport_counter_out, out, received_ib_multicast.packets); + +ex: + kfree(out); + return err; +} + +static int set_vf_node_guid(struct ib_device *device, int vf, u32 port, + u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; + in->node_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) { + vfs_ctx[vf].node_guid = guid; + vfs_ctx[vf].node_guid_valid = 1; + } + kfree(in); + return err; +} + +static int set_vf_port_guid(struct ib_device *device, int vf, u32 port, + u64 guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; + in->port_guid = guid; + err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) { + vfs_ctx[vf].port_guid = guid; + vfs_ctx[vf].port_guid_valid = 1; + } + kfree(in); + return err; +} + +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u32 port, + u64 guid, int type) +{ + if (type == IFLA_VF_IB_NODE_GUID) + return set_vf_node_guid(device, vf, port, guid); + else if (type == IFLA_VF_IB_PORT_GUID) + return set_vf_port_guid(device, vf, port, guid); + + return -EINVAL; +} + +int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u32 port, + struct ifla_vf_guid *node_guid, + struct ifla_vf_guid *port_guid) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; + + node_guid->guid = + vfs_ctx[vf].node_guid_valid ? vfs_ctx[vf].node_guid : 0; + port_guid->guid = + vfs_ctx[vf].port_guid_valid ? vfs_ctx[vf].port_guid : 0; + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mad.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mad.c new file mode 100644 index 0000000..209ec08 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mad.c @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "cmd.h" + +enum { + MLX5_IB_VENDOR_CLASS1 = 0x9, + MLX5_IB_VENDOR_CLASS2 = 0xa +}; + +static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u32 port_num, + struct ib_mad *in_mad) +{ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED && + in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return true; + return dev->port_caps[port_num - 1].has_smi; +} + +static int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, + int ignore_bkey, u32 port, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, const void *in_mad, + void *response_mad) +{ + u8 op_modifier = 0; + + if (!can_do_mad_ifc(dev, port, (struct ib_mad *)in_mad)) + return -EPERM; + + /* Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + return mlx5_cmd_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, + port); +} + +static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, + void *out) +{ +#define MLX5_SUM_CNT(p, cntr1, cntr2) \ + (MLX5_GET64(query_vport_counter_out, p, cntr1) + \ + MLX5_GET64(query_vport_counter_out, p, cntr2)) + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, + transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, + received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.packets, + transmitted_ib_multicast.packets)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.packets, + received_ib_multicast.packets)); + pma_cnt_ext->port_unicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_multicast.packets); +} + +static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, + void *out) +{ + /* Traffic counters will be reported in + * their 64bit form via ib_pma_portcounters_ext by default. 
+ */ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_ASSIGN_PMA_CNTR(counter_var, counter_name) { \ + counter_var = MLX5_GET_BE(typeof(counter_var), \ + ib_port_cntrs_grp_data_layout, \ + out_pma, counter_name); \ + } + + MLX5_ASSIGN_PMA_CNTR(pma_cnt->symbol_error_counter, + symbol_error_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_error_recovery_counter, + link_error_recovery_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_downed_counter, + link_downed_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_errors, + port_rcv_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_remphys_errors, + port_rcv_remote_physical_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_switch_relay_errors, + port_rcv_switch_relay_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_discards, + port_xmit_discards); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, + port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_wait, + port_xmit_wait); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, + port_rcv_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, + link_overrun_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->vl15_dropped, + vl_15_dropped); +} + +static int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, + void *out, size_t sz) +{ + u32 *in; + int err; + + in = kvzalloc(sz, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(ppcnt_reg, in, local_port, 1); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP); + err = mlx5_core_access_reg(dev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + + kvfree(in); + return err; +} + +static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx5_core_dev *mdev; + bool native_port = true; + u32 mdev_port_num; + void *out_cnt; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* Fail to get the native port, likely due to 2nd port is still + * unaffiliated. In such case default to 1st port and attached + * PF device. + */ + native_port = false; + mdev = dev->mdev; + mdev_port_num = 1; + } + if (MLX5_CAP_GEN(dev->mdev, num_ports) == 1) { + /* set local port to one for Function-Per-Port HCA. */ + mdev = dev->mdev; + mdev_port_num = 1; + } + + /* Declaring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; + + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + goto done; + } + + if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + + out_cnt = kvzalloc(sz, GFP_KERNEL); + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } + + err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num, + out_cnt); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + out_cnt = kvzalloc(sz, GFP_KERNEL); + if (!out_cnt) { + err = IB_MAD_RESULT_FAILURE; + goto done; + } + + err = mlx5_core_query_ib_ppcnt(mdev, out_cnt, sz); + if (!err) + pma_cnt_assign(pma_cnt, out_cnt); + } + kvfree(out_cnt); + err = err ? 
IB_MAD_RESULT_FAILURE : + IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +done: + if (native_port) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; +} + +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in, struct ib_mad *out, + size_t *out_mad_size, u16 *out_mad_pkey_index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + u8 mgmt_class = in->mad_hdr.mgmt_class; + u8 method = in->mad_hdr.method; + u16 slid; + int err; + + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : + be16_to_cpu(IB_LID_PERMISSIVE); + + if (method == IB_MGMT_METHOD_TRAP && !slid) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + switch (mgmt_class) { + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: { + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET && + method != IB_MGMT_METHOD_TRAP_REPRESS) + return IB_MAD_RESULT_SUCCESS; + + /* Don't process SMInfo queries -- the SMA can't handle them. + */ + if (in->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) + return IB_MAD_RESULT_SUCCESS; + } break; + case IB_MGMT_CLASS_PERF_MGMT: + if (MLX5_CAP_GEN(dev->mdev, vport_counters) && + method == IB_MGMT_METHOD_GET) + return process_pma_cmd(dev, port_num, in, out); + fallthrough; + case MLX5_IB_VENDOR_CLASS1: + case MLX5_IB_VENDOR_CLASS2: + case IB_MGMT_CLASS_CONG_MGMT: { + if (method != IB_MGMT_METHOD_GET && + method != IB_MGMT_METHOD_SET) + return IB_MAD_RESULT_SUCCESS; + } break; + default: + return IB_MAD_RESULT_SUCCESS; + } + + err = mlx5_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY, + mad_flags & IB_MAD_IGNORE_BKEY, port_num, in_wc, + in_grh, in, out); + if (err) + return IB_MAD_RESULT_FAILURE; + + /* set return bit in status of directed route responses */ + if (mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + out->mad_hdr.status |= cpu_to_be16(1 << 15); + + if (method == IB_MGMT_METHOD_TRAP_REPRESS) + /* no response for trap repress */ + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, unsigned int port) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + u16 packet_error; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + ib_init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + + packet_error = be16_to_cpu(out_mad->status); + + dev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ? 
+ MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0; + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +static int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, + struct ib_smp *out_mad) +{ + struct ib_smp *in_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + if (!in_mad) + return -ENOMEM; + + ib_init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, + out_mad); + + kfree(in_mad); + return err; +} + +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + memcpy(sys_image_guid, out_mad->data + 4, 8); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + *max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28)); + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!out_mad) + return -ENOMEM; + + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); + if (err) + goto out; + + *vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) & 0xffff; + +out: + kfree(out_mad); + + return err; +} + +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + ib_init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + ib_init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; + + err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(node_guid, out_mad->data + 12, 8); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u32 port, u16 index, + u16 *pkey) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + ib_init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + *pkey = 
be16_to_cpu(((__be16 *)out_mad->data)[index % 32]); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + ib_init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + ib_init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, + out_mad); + if (err) + goto out; + + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int ext_active_speed; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); + out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + /* props being zeroed by the caller, avoid zeroing it here */ + + ib_init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) { + mlx5_ib_warn(dev, "err %d\n", err); + goto out; + } + + props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16)); + props->lmc = out_mad->data[34] & 0x7; + props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18)); + props->sm_sl = out_mad->data[36] & 0xf; + props->state = out_mad->data[32] & 0xf; + props->phys_state = out_mad->data[33] >> 4; + props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20)); + props->gid_tbl_len = out_mad->data[50]; + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = dev->pkey_table_len; + props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46)); + props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48)); + props->active_width = out_mad->data[31] & 0xf; + props->active_speed = out_mad->data[35] >> 4; + props->max_mtu = out_mad->data[41] & 0xf; + props->active_mtu = out_mad->data[36] >> 4; + props->subnet_timeout = out_mad->data[51] & 0x1f; + props->max_vl_num = out_mad->data[37] >> 4; + props->init_type_reply = out_mad->data[41] >> 4; + + if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP) { + props->port_cap_flags2 = + be16_to_cpup((__be16 *)(out_mad->data + 60)); + + if (props->port_cap_flags2 & IB_PORT_LINK_WIDTH_2X_SUP) + props->active_width = out_mad->data[31] & 0x1f; + } + + /* Check if extended speeds (EDR/FDR/...) 
are supported */ + if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { + ext_active_speed = out_mad->data[62] >> 4; + + switch (ext_active_speed) { + case 1: + props->active_speed = 16; /* FDR */ + break; + case 2: + props->active_speed = 32; /* EDR */ + break; + case 4: + if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP && + props->port_cap_flags2 & IB_PORT_LINK_SPEED_HDR_SUP) + props->active_speed = IB_SPEED_HDR; + break; + case 8: + if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP && + props->port_cap_flags2 & IB_PORT_LINK_SPEED_NDR_SUP) + props->active_speed = IB_SPEED_NDR; + break; + } + } + + /* If reported active speed is QDR, check if is FDR-10 */ + if (props->active_speed == 4) { + if (dev->port_caps[port - 1].ext_port_cap & + MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) { + ib_init_query_mad(in_mad); + in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx5_MAD_IFC(dev, 1, 1, port, + NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + /* Checking LinkSpeedActive for FDR-10 */ + if (out_mad->data[15] & 0x1) + props->active_speed = 8; + } + } + +out: + kfree(in_mad); + kfree(out_mad); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main.c new file mode 100644 index 0000000..7216f05 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main.c @@ -0,0 +1,4893 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MLX5_EN_MACSEC +#include +#endif +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "ib_rep.h" +#include "cmd.h" +#include "devx.h" +#include "dm.h" +#include "fs.h" +#include "srq.h" +#include "qp.h" +#include "wr.h" +#include "restrack.h" +#include "counters.h" +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MLX5_EN_MACSEC +#include +#endif + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_ALIAS("auxiliary:mlx5_core.rdma"); +MODULE_ALIAS("auxiliary:mlx5_core.multiport"); +MODULE_ALIAS("auxiliary:mlx5_core.rdma-rep"); + +struct mlx5_ib_event_work { + struct work_struct work; + union { + struct mlx5_ib_dev *dev; + struct mlx5_ib_multiport_info *mpi; + }; + bool is_slave; + unsigned int event; + void *param; +}; + +enum { + MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, +}; + +struct workqueue_struct *mlx5_ib_sigerr_sqd_wq; +static struct workqueue_struct *mlx5_ib_event_wq; +static LIST_HEAD(mlx5_ib_unaffiliated_port_list); +static LIST_HEAD(mlx5_ib_dev_list); +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_ib_multiport_mutex); + +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi) +{ + struct mlx5_ib_dev *dev; + + mutex_lock(&mlx5_ib_multiport_mutex); + dev = mpi->ibdev; + mutex_unlock(&mlx5_ib_multiport_mutex); + return dev; +} + +static enum rdma_link_layer +mlx5_port_type_cap_to_rdma_ll(int port_type_cap) +{ + switch (port_type_cap) { + 
case MLX5_CAP_PORT_TYPE_IB: + return IB_LINK_LAYER_INFINIBAND; + case MLX5_CAP_PORT_TYPE_ETH: + return IB_LINK_LAYER_ETHERNET; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +} + +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u32 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + + return mlx5_port_type_cap_to_rdma_ll(port_type_cap); +} + +static int get_port_state(struct ib_device *ibdev, + u32 port_num, + enum ib_port_state *state) +{ + struct ib_port_attr attr; + int ret; + + memset(&attr, 0, sizeof(attr)); + ret = ibdev->ops.query_port(ibdev, port_num, &attr); + if (!ret) + *state = attr.state; + return ret; +} + +static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + u32 *port_num) +{ + struct net_device *rep_ndev; + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (!port->rep) + continue; + + if (upper == ndev && port->rep->vport == MLX5_VPORT_UPLINK) { + *port_num = i + 1; + return &port->roce; + } + + if (upper && port->rep->vport == MLX5_VPORT_UPLINK) + continue; + + read_lock(&port->roce.netdev_lock); + rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw, + port->rep->vport); + if (rep_ndev == ndev) { + read_unlock(&port->roce.netdev_lock); + *port_num = i + 1; + return &port->roce; + } + read_unlock(&port->roce.netdev_lock); + } + + return NULL; +} + +static int mlx5_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + u32 port_num = roce->native_port_num; + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *ibdev; + + ibdev = roce->dev; + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + /* Should already be registered during the load */ + if (ibdev->is_rep) + break; + write_lock(&roce->netdev_lock); + if (ndev->dev.parent == mdev->device) + roce->netdev = ndev; + write_unlock(&roce->netdev_lock); + break; + + case NETDEV_UNREGISTER: + /* In case of reps, ib device goes away before the netdevs */ + write_lock(&roce->netdev_lock); + if (roce->netdev == ndev) + roce->netdev = NULL; + write_unlock(&roce->netdev_lock); + break; + + case NETDEV_CHANGE: + case NETDEV_UP: + case NETDEV_DOWN: { + struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); + struct net_device *upper = NULL; + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } + + if (ibdev->is_rep) + roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); + if (!roce) + return NOTIFY_DONE; + if ((upper == ndev || + ((!upper || ibdev->is_rep) && ndev == roce->netdev)) && + ibdev->ib_active) { + struct ib_event ibev = { }; + enum ib_port_state port_state; + + if (get_port_state(&ibdev->ib_dev, port_num, + &port_state)) + goto done; + + if (roce->last_port_state == port_state) + goto done; + + roce->last_port_state = port_state; + ibev.device = &ibdev->ib_dev; + if (port_state == IB_PORT_DOWN) + ibev.event = IB_EVENT_PORT_ERR; + else if (port_state == IB_PORT_ACTIVE) + ibev.event = IB_EVENT_PORT_ACTIVE; + else + goto done; + + ibev.element.port_num = port_num; + ib_dispatch_event(&ibev); + } + break; + } + + default: + break; + } +done: + mlx5_ib_put_native_port_mdev(ibdev, port_num); + return NOTIFY_DONE; +} 
+ +static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, + u32 port_num) +{ + struct mlx5_ib_dev *ibdev = to_mdev(device); + struct net_device *ndev; + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NULL; + if (ibdev->is_rep) + goto reg; + + ndev = mlx5_lag_get_roce_netdev(mdev); + if (ndev) + goto out; + +reg: + /* Ensure ndev does not disappear before we invoke dev_hold() + */ + read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); + ndev = ibdev->port[port_num - 1].roce.netdev; + if (ndev) + dev_hold(ndev); + read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); + +out: + mlx5_ib_put_native_port_mdev(ibdev, port_num); + return ndev; +} + +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, + u32 ib_port_num, + u32 *native_port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + ib_port_num); + struct mlx5_core_dev *mdev = NULL; + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || + ll != IB_LINK_LAYER_ETHERNET) { + if (native_port_num) + *native_port_num = ib_port_num; + return ibdev->mdev; + } + + if (native_port_num) + *native_port_num = 1; + + port = &ibdev->port[ib_port_num - 1]; + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[ib_port_num - 1].mp.mpi; + if (mpi && !mpi->unaffiliate) { + mdev = mpi->mdev; + /* If it's the master no need to refcount, it'll exist + * as long as the ib_dev exists. + */ + if (!mpi->is_master) + mpi->mdev_refcnt++; + } + spin_unlock(&port->mp.mpi_lock); + + return mdev; +} + +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + port_num); + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + port = &ibdev->port[port_num - 1]; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[port_num - 1].mp.mpi; + if (mpi->is_master) + goto out; + + mpi->mdev_refcnt--; + if (mpi->unaffiliate) + complete(&mpi->unref_comp); +out: + spin_unlock(&port->mp.mpi_lock); +} + +static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, + u16 *active_speed, u8 *active_width) +{ + switch (eth_proto_oper) { + case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII): + case MLX5E_PROT_MASK(MLX5E_1000BASE_KX): + case MLX5E_PROT_MASK(MLX5E_100BASE_TX): + case MLX5E_PROT_MASK(MLX5E_1000BASE_T): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + break; + case MLX5E_PROT_MASK(MLX5E_10GBASE_T): + case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4): + case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4): + case MLX5E_PROT_MASK(MLX5E_10GBASE_KR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_CR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_SR): + case MLX5E_PROT_MASK(MLX5E_10GBASE_ER): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_25GBASE_CR): + case MLX5E_PROT_MASK(MLX5E_25GBASE_KR): + case MLX5E_PROT_MASK(MLX5E_25GBASE_SR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4): + case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2): + case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2): + case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2): + 
*active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_56GBASE_R4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_FDR; + break; + case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4): + case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed, + u8 *active_width) +{ + switch (eth_proto_oper) { + case MLX5E_PROT_MASK(MLX5E_SGMII_100M): + case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_SDR; + break; + case MLX5E_PROT_MASK(MLX5E_5GBASE_R): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_DDR; + break; + case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_QDR; + break; + case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_EDR; + break; + case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_100GAUI_1_100GBASE_CR_KR): + *active_width = IB_WIDTH_1X; + *active_speed = IB_SPEED_NDR; + break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_200GAUI_2_200GBASE_CR2_KR2): + *active_width = IB_WIDTH_2X; + *active_speed = IB_SPEED_NDR; + break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_8): + *active_width = IB_WIDTH_8X; + *active_speed = IB_SPEED_HDR; + break; + case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4): + *active_width = IB_WIDTH_4X; + *active_speed = IB_SPEED_NDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int translate_eth_proto_oper(u32 eth_proto_oper, u16 *active_speed, + u8 *active_width, bool ext) +{ + return ext ? + translate_eth_ext_proto_oper(eth_proto_oper, active_speed, + active_width) : + translate_eth_legacy_proto_oper(eth_proto_oper, active_speed, + active_width); +} + +static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0}; + struct mlx5_core_dev *mdev; + struct net_device *ndev, *upper; + enum ib_mtu ndev_ib_mtu; + bool put_mdev = true; + u32 eth_prot_oper; + u32 mdev_port_num; + bool ext; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* This means the port isn't affiliated yet. Get the + * info for the master port instead. + */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + port_num = 1; + } + + /* Possible bad flows are checked before filling out props so in case + * of an error it will still be zeroed out. 
+ * Use native port in case of reps + */ + if (dev->is_rep) + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, + 1); + else + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, + mdev_port_num); + if (err) + goto out; + ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); + eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); + + props->active_width = IB_WIDTH_4X; + props->active_speed = IB_SPEED_QDR; + + translate_eth_proto_oper(eth_prot_oper, &props->active_speed, + &props->active_width, ext); + + if (!dev->is_rep && dev->mdev->roce.roce_en) { + u16 qkey_viol_cntr; + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->ip_gids = true; + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + } + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = IB_PORT_PHYS_STATE_DISABLED; + + /* If this is a stub query for an unaffiliated port stop here */ + if (!put_mdev) { + props->state = IB_PORT_NOP; + goto out; + } + + ndev = mlx5_ib_get_netdev(device, port_num); + if (!ndev) { + /* Port may not be even created in HW, like SF's port */ + props->state = IB_PORT_NOP; + goto out; + } + + if (dev->lag_active) { + rcu_read_lock(); + upper = netdev_master_upper_dev_get_rcu(ndev); + if (upper) { + dev_put(ndev); + ndev = upper; + dev_hold(ndev); + } + rcu_read_unlock(); + } + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } + + ndev_ib_mtu = iboe_get_mtu(ndev->mtu); + + dev_put(ndev); + + props->active_mtu = min(props->max_mtu, ndev_ib_mtu); +out: + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; +} + +static int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + enum ib_gid_type gid_type; + u16 vlan_id = 0xffff; + u8 roce_version = 0; + u8 roce_l3_type = 0; + u8 mac[ETH_ALEN]; + int ret; + + gid_type = attr->gid_type; + if (gid) { + ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]); + if (ret) + return ret; + } + + switch (gid_type) { + case IB_GID_TYPE_ROCE: + roce_version = MLX5_ROCE_VERSION_1; + break; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + roce_version = MLX5_ROCE_VERSION_2; + if (gid && ipv6_addr_v4mapped((void *)gid)) + roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4; + else + roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6; + break; + + default: + mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type); + } + + return mlx5_core_roce_gid_set(dev->mdev, index, roce_version, + roce_l3_type, gid->raw, mac, + vlan_id < VLAN_CFI_MASK, vlan_id, + port_num); +} + +#ifdef CONFIG_MLX5_EN_MACSEC +#define NIC_RDMA_BOTH_DIRS_CAPS (MLX5_FT_NIC_RX_2_NIC_RX_RDMA | MLX5_FT_NIC_TX_RDMA_2_NIC_TX) + +static int add_gid_macsec_operations(const struct ib_gid_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(attr->device); + const struct ib_gid_attr *physical_gid; + struct mlx5_reserved_gids *mgids; + struct net_device *ndev; + int ret = 0, i; + union { + struct sockaddr_in sockaddr_in; + struct sockaddr_in6 sockaddr_in6; + } addr; + + if (((MLX5_CAP_GEN_2(dev->mdev, flow_table_type_2_type) & + NIC_RDMA_BOTH_DIRS_CAPS) != NIC_RDMA_BOTH_DIRS_CAPS) || + !MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, max_modify_header_actions)) { + 
mlx5_ib_dbg(dev, "Failed to add RoCE MACsec, capabilities not supported\n"); + return 0; + } + + rcu_read_lock(); + ndev = rcu_dereference(attr->ndev); + if (!ndev) { + rcu_read_unlock(); + ret = -ENODEV; + goto out_rcu; + } + dev_hold(ndev); + rcu_read_unlock(); + + if (!netif_is_macsec(ndev)) + goto out; + + if (!netdev_macsec_is_offloaded(ndev)) + goto out; + + if (!(macsec_get_real_dev(ndev)->features & + NETIF_F_HW_MACSEC)) { + ret = -EOPNOTSUPP; + goto out; + } + + physical_gid = rdma_find_gid(attr->device, &attr->gid, + attr->gid_type, NULL); + if (IS_ERR(physical_gid)) + goto no_gid_ambig; + + ret = set_roce_addr(to_mdev(physical_gid->device), + physical_gid->port_num, + physical_gid->index, NULL, + physical_gid); + if (ret) + goto gid_err; + for (i = 0; i < MLX5_MAX_MACSEC_GIDS; i++) { + mgids = &dev->reserved_gids[attr->port_num - 1][i]; + if (mgids->macsec_index == -1) { + mgids->macsec_index = attr->index; + mgids->physical_gid = physical_gid; + break; + } + } +no_gid_ambig: + rdma_gid2ip((struct sockaddr *)&addr, &attr->gid); + ret = mlx5e_macsec_fs_add_roce_rule(ndev, (struct sockaddr *)&addr); + if (ret) + goto rule_err; + + dev_put(ndev); + return 0; + +rule_err: + if (!IS_ERR(physical_gid)) { + set_roce_addr(to_mdev(physical_gid->device), physical_gid->port_num, + physical_gid->index, &physical_gid->gid, physical_gid); + dev->reserved_gids[attr->port_num - 1][i].macsec_index = -1; + } +gid_err: + if (!IS_ERR(physical_gid)) + rdma_put_gid_attr(physical_gid); +out: + dev_put(ndev); +out_rcu: + return ret; +} +#endif + +static int mlx5_ib_add_gid(const struct ib_gid_attr *attr, + __always_unused void **context) +{ + int ret; + + ret = set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, &attr->gid, attr); + if (ret) + return ret; + +#ifdef CONFIG_MLX5_EN_MACSEC + ret = add_gid_macsec_operations(attr); + if (ret) + set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, NULL, attr); +#endif + return ret; +} + +#ifdef CONFIG_MLX5_EN_MACSEC +static void del_gid_macsec_operations(const struct ib_gid_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(attr->device); + struct mlx5_reserved_gids *mgids; + int i; + + if (((MLX5_CAP_GEN_2(dev->mdev, flow_table_type_2_type) & + NIC_RDMA_BOTH_DIRS_CAPS) != NIC_RDMA_BOTH_DIRS_CAPS) || + !MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, max_modify_header_actions)) { + mlx5_ib_dbg(dev, "Failed to add RoCE MACsec, capabilities not supported\n"); + return; + } + + for (i = 0; i < MLX5_MAX_MACSEC_GIDS; i++) { + mgids = &dev->reserved_gids[attr->port_num - 1][i]; + if (mgids->macsec_index == attr->index) { + const struct ib_gid_attr *physical_gid = mgids->physical_gid; + + set_roce_addr(to_mdev(physical_gid->device), + physical_gid->port_num, + physical_gid->index, + &physical_gid->gid, physical_gid); + + rdma_put_gid_attr(physical_gid); + mgids->macsec_index = -1; + break; + } + } +} +#endif + +static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, + __always_unused void **context) +{ + int ret; + + ret = set_roce_addr(to_mdev(attr->device), attr->port_num, + attr->index, NULL, attr); +#ifdef CONFIG_MLX5_EN_MACSEC + if (ret) + return ret; + + del_gid_macsec_operations(attr); +#endif + + return ret; +} + +__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr) +{ + if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return 0; + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + +static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) +{ + if 
(MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) + return !MLX5_CAP_GEN(dev->mdev, ib_virt); + return 0; +} + +enum { + MLX5_VPORT_ACCESS_METHOD_MAD, + MLX5_VPORT_ACCESS_METHOD_HCA, + MLX5_VPORT_ACCESS_METHOD_NIC, +}; + +static int mlx5_get_vport_access_method(struct ib_device *ibdev) +{ + if (mlx5_use_mad_ifc(to_mdev(ibdev))) + return MLX5_VPORT_ACCESS_METHOD_MAD; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) + return MLX5_VPORT_ACCESS_METHOD_NIC; + + return MLX5_VPORT_ACCESS_METHOD_HCA; +} + +static void get_atomic_caps(struct mlx5_ib_dev *dev, + u8 atomic_size_qp, + struct ib_device_attr *props) +{ + u8 tmp; + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic_req_8B_endianness_mode = + MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode); + + /* Check if HW supports 8 bytes standard atomic operations and capable + * of host endianness respond + */ + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) && + (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && + (atomic_req_8B_endianness_mode)) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } +} + +static void get_atomic_caps_qp(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +static int mlx5_query_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_system_image_guid(ibdev, + sys_image_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + break; + + default: + return -EINVAL; + } + + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + + return err; + +} + +static int mlx5_query_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, + pkey_table_size)); + return 0; + + default: + return -EINVAL; + } +} + +static int mlx5_query_vendor_id(struct ib_device *ibdev, + u32 *vendor_id) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_core_query_vendor_id(dev->mdev, vendor_id); + + default: + return -EINVAL; + } +} + +static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, + __be64 *node_guid) +{ + u64 tmp; + int err; + + switch (mlx5_get_vport_access_method(&dev->ib_dev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_node_guid(dev, node_guid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, 0, &tmp); + break; + + 
default: + return -EINVAL; + } + + if (!err) + *node_guid = cpu_to_be64(tmp); + + return err; +} + +struct mlx5_reg_node_desc { + u8 desc[IB_DEVICE_NODE_DESC_MAX]; +}; + +static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) +{ + struct mlx5_reg_node_desc in; + + if (mlx5_use_mad_ifc(dev)) + return mlx5_query_mad_ifc_node_desc(dev, node_desc); + + memset(&in, 0, sizeof(in)); + + return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, + sizeof(struct mlx5_reg_node_desc), + MLX5_REG_NODE_DESC, 0, 0); +} + +static int mlx5_ib_query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + size_t uhw_outlen = (uhw) ? uhw->outlen : 0; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + int err = -ENOMEM; + int max_sq_desc; + int max_rq_sg; + int max_sq_sg; + u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); + bool raw_support = !mlx5_core_mp_enabled(mdev); + struct mlx5_ib_query_device_resp resp = {}; + size_t resp_len; + u64 max_tso; + + resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); + if (uhw_outlen && uhw_outlen < resp_len) + return -EINVAL; + + resp.response_length = resp_len; + + if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + err = mlx5_query_system_image_guid(ibdev, + &props->sys_image_guid); + if (err) + return err; + + props->max_pkeys = dev->pkey_table_len; + + err = mlx5_query_vendor_id(ibdev, &props->vendor_id); + if (err) + return err; + + props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | + (fw_rev_min(dev->mdev) << 16) | + fw_rev_sub(dev->mdev); + props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | + IB_DEVICE_PORT_ACTIVE_EVENT | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN; + + if (MLX5_CAP_GEN(mdev, pkv)) + props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; + if (MLX5_CAP_GEN(mdev, qkv)) + props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; + if (MLX5_CAP_GEN(mdev, apm)) + props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; + if (MLX5_CAP_GEN(mdev, xrc)) + props->device_cap_flags |= IB_DEVICE_XRC; + if (MLX5_CAP_GEN(mdev, imaicl)) { + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_WINDOW_TYPE_2B; + props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + /* We support 'Gappy' memory registration too */ +#ifdef CONFIG_GPU_DIRECT_STORAGE + if (MLX5_CAP_GEN(mdev, ats) == MLX5_CAP_GEN(mdev, relaxed_ordering_read)) + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; +#else + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; +#endif + } + /* IB_WR_REG_MR always requires changing the entity size with UMR */ + if (!MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (MLX5_CAP_GEN(mdev, sho)) { + props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } + if (MLX5_CAP_GEN(mdev, block_lb_mc)) + props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + + if (MLX5_CAP_GEN(mdev, nvmf_target_offload)) { + props->device_cap_flags |= IB_DEVICE_NVMF_TARGET_OFFLOAD; + props->nvmf_caps = dev->nvmf_caps; + } + + if (MLX5_CAP_GEN(mdev, rts2rts_qp_rmp)) + props->device_cap_flags |= IB_DEVICE_QP_MODIFY_RMP; + + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && 
raw_support) { + if (MLX5_CAP_ETH(mdev, csum_cap)) { + /* Legacy bit to support old userspace libraries */ + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM; + } + + if (MLX5_CAP_ETH(dev->mdev, vlan_cap)) + props->raw_packet_caps |= + IB_RAW_PACKET_CAP_CVLAN_STRIPPING; + + if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) { + max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); + if (max_tso) { + resp.tso_caps.max_tso = 1 << max_tso; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + resp.response_length += sizeof(resp.tso_caps); + } + } + + if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) { + resp.rss_caps.rx_hash_function = + MLX5_RX_HASH_FUNC_TOEPLITZ; + resp.rss_caps.rx_hash_fields_mask = + MLX5_RX_HASH_SRC_IPV4 | + MLX5_RX_HASH_DST_IPV4 | + MLX5_RX_HASH_SRC_IPV6 | + MLX5_RX_HASH_DST_IPV6 | + MLX5_RX_HASH_SRC_PORT_TCP | + MLX5_RX_HASH_DST_PORT_TCP | + MLX5_RX_HASH_SRC_PORT_UDP | + MLX5_RX_HASH_DST_PORT_UDP | + MLX5_RX_HASH_INNER; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) + resp.rss_caps.rx_hash_fields_mask |= + MLX5_RX_HASH_IPSEC_SPI; + resp.response_length += sizeof(resp.rss_caps); + } + } else { + if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) + resp.response_length += sizeof(resp.tso_caps); + if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) + resp.response_length += sizeof(resp.rss_caps); + } + + if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + props->device_cap_flags |= IB_DEVICE_UD_TSO; + } + + if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && + MLX5_CAP_GEN(dev->mdev, general_notification_event) && + raw_support) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; + + if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && + MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap)) + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, scatter_fcs) && + raw_support) { + /* Legacy bit to support old userspace libraries */ + props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; + } + + if (MLX5_CAP_DEV_MEM(mdev, memic)) { + props->max_dm_size = + MLX5_CAP_DEV_MEM(mdev, max_memic_size); + } + + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + + if (MLX5_CAP_GEN(mdev, end_pad)) + props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING; + + if (MLX5_CAP_GEN(mdev, drain_sigerr)) + props->device_cap_flags |= IB_DEVICE_SIGNATURE_PIPELINE; + + props->vendor_part_id = mdev->pdev->device; + props->hw_ver = mdev->pdev->revision; + + props->max_mr_size = ~0ull; + props->page_size_cap = ~(min_page_size - 1); + props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); + props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / + sizeof(struct mlx5_wqe_data_seg); + max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); + max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + props->max_send_sge = max_sq_sg; + props->max_recv_sge = max_rq_sg; + props->max_sge_rd = MLX5_MAX_SGE_RD; + props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); + props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; + props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + props->max_pd = 1 << 
MLX5_CAP_GEN(mdev, log_max_pd); + props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); + props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); + props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); + props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; + props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); + props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; + props->max_srq_sge = max_rq_sg - 1; + props->max_fast_reg_page_list_len = + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); + props->max_pi_fast_reg_page_list_len = + props->max_fast_reg_page_list_len / 2; + props->max_sgl_rd = + MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance); + get_atomic_caps_qp(dev, props); + props->masked_atomic_cap = IB_ATOMIC_NONE; + props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); + props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + props->max_ah = INT_MAX; + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; + + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) + props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; + if (!uhw) { + /* ODP for kernel QPs is not implemented for receive + * WQEs and SRQ WQEs + */ + props->odp_caps.per_transport_caps.rc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.uc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.ud_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + props->odp_caps.per_transport_caps.xrc_odp_caps &= + ~(IB_ODP_SUPPORT_READ | + IB_ODP_SUPPORT_SRQ_RECV); + } + } + + if (MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + + if (mlx5_core_is_vf(mdev)) + props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET && raw_support) { + props->rss_caps.max_rwq_indirection_tables = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); + props->rss_caps.max_rwq_indirection_table_size = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); + } + + if (MLX5_CAP_GEN(mdev, tag_matching)) { + props->tm_caps.max_num_tags = + (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1; + props->tm_caps.max_ops = + 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->tm_caps.max_sge = MLX5_TM_MAX_SGE; + } + + if (MLX5_CAP_GEN(mdev, tag_matching) && + MLX5_CAP_GEN(mdev, rndv_offload_rc)) { + props->tm_caps.flags = IB_TM_CAP_RNDV_RC; + props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE; + } + + if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { + props->cq_caps.max_cq_moderation_count = + MLX5_MAX_CQ_COUNT; + props->cq_caps.max_cq_moderation_period = + MLX5_MAX_CQ_PERIOD; + } + + if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) { + resp.response_length += sizeof(resp.cqe_comp_caps); + + if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { + resp.cqe_comp_caps.max_num = + MLX5_CAP_GEN(dev->mdev, + cqe_compression_max_num); + + resp.cqe_comp_caps.supported_format = + MLX5_IB_CQE_RES_FORMAT_HASH | + MLX5_IB_CQE_RES_FORMAT_CSUM; + + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + 
resp.cqe_comp_caps.supported_format |= + MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX; + } + } + + if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen && + raw_support) { + if (MLX5_CAP_QOS(mdev, packet_pacing) && + MLX5_CAP_GEN(mdev, qos)) { + resp.packet_pacing_caps.qp_rate_limit_max = + MLX5_CAP_QOS(mdev, packet_pacing_max_rate); + resp.packet_pacing_caps.qp_rate_limit_min = + MLX5_CAP_QOS(mdev, packet_pacing_min_rate); + resp.packet_pacing_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) && + MLX5_CAP_QOS(mdev, packet_pacing_typical_size)) + resp.packet_pacing_caps.cap_flags |= + MLX5_IB_PP_SUPPORT_BURST; + } + resp.response_length += sizeof(resp.packet_pacing_caps); + } + + if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <= + uhw_outlen) { + if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_IB_ALLOW_MPW; + + if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes |= + MLX5_IB_SUPPORT_EMPW; + + resp.response_length += + sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); + } + + if (offsetofend(typeof(resp), flags) <= uhw_outlen) { + resp.response_length += sizeof(resp.flags); + + if (MLX5_CAP_GEN(mdev, cqe_compression_128)) + resp.flags |= + MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP; + + if (MLX5_CAP_GEN(mdev, cqe_128_always)) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD; + if (MLX5_CAP_GEN(mdev, qp_packet_based)) + resp.flags |= + MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; + + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; + } + + if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) { + resp.response_length += sizeof(resp.sw_parsing_caps); + if (MLX5_CAP_ETH(mdev, swp)) { + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING; + + if (MLX5_CAP_ETH(mdev, swp_csum)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_CSUM; + + if (MLX5_CAP_ETH(mdev, swp_lso)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_LSO; + + if (resp.sw_parsing_caps.sw_parsing_offloads) + resp.sw_parsing_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + + if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen && + raw_support) { + resp.response_length += sizeof(resp.striding_rq_caps); + if (MLX5_CAP_GEN(mdev, striding_rq)) { + resp.striding_rq_caps.min_single_stride_log_num_of_bytes = + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES; + resp.striding_rq_caps.max_single_stride_log_num_of_bytes = + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES; + if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range)) + resp.striding_rq_caps + .min_single_wqe_log_num_of_strides = + MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + else + resp.striding_rq_caps + .min_single_wqe_log_num_of_strides = + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES; + resp.striding_rq_caps.max_single_wqe_log_num_of_strides = + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES; + resp.striding_rq_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + + if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) { + resp.response_length += sizeof(resp.tunnel_offloads_caps); + if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_VXLAN; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_GENEVE; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_GRE; + if (MLX5_CAP_ETH(mdev, 
tunnel_stateless_mpls_over_gre)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE; + if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp)) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; + } + + if (offsetofend(typeof(resp), dci_streams_caps) <= uhw_outlen) { + resp.response_length += sizeof(resp.dci_streams_caps); + + resp.dci_streams_caps.max_log_num_concurent = + MLX5_CAP_GEN(mdev, log_max_dci_stream_channels); + + resp.dci_streams_caps.max_log_num_errored = + MLX5_CAP_GEN(mdev, log_max_dci_errored_streams); + } + + if (uhw_outlen) { + err = ib_copy_to_udata(uhw, &resp, resp.response_length); + + if (err) + return err; + } + + return 0; +} + +static void translate_active_width(struct ib_device *ibdev, u16 active_width, + u8 *ib_width) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + if (active_width & MLX5_PTYS_WIDTH_1X) + *ib_width = IB_WIDTH_1X; + else if (active_width & MLX5_PTYS_WIDTH_2X) + *ib_width = IB_WIDTH_2X; + else if (active_width & MLX5_PTYS_WIDTH_4X) + *ib_width = IB_WIDTH_4X; + else if (active_width & MLX5_PTYS_WIDTH_8X) + *ib_width = IB_WIDTH_8X; + else if (active_width & MLX5_PTYS_WIDTH_12X) + *ib_width = IB_WIDTH_12X; + else { + mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n", + active_width); + *ib_width = IB_WIDTH_4X; + } + + return; +} + +static int mlx5_mtu_to_ib_mtu(int mtu) +{ + switch (mtu) { + case 256: return 1; + case 512: return 2; + case 1024: return 3; + case 2048: return 4; + case 4096: return 5; + default: + pr_warn("invalid mtu\n"); + return -1; + } +} + +enum ib_max_vl_num { + __IB_MAX_VL_0 = 1, + __IB_MAX_VL_0_1 = 2, + __IB_MAX_VL_0_3 = 3, + __IB_MAX_VL_0_7 = 4, + __IB_MAX_VL_0_14 = 5, +}; + +enum mlx5_vl_hw_cap { + MLX5_VL_HW_0 = 1, + MLX5_VL_HW_0_1 = 2, + MLX5_VL_HW_0_2 = 3, + MLX5_VL_HW_0_3 = 4, + MLX5_VL_HW_0_4 = 5, + MLX5_VL_HW_0_5 = 6, + MLX5_VL_HW_0_6 = 7, + MLX5_VL_HW_0_7 = 8, + MLX5_VL_HW_0_14 = 15 +}; + +static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, + u8 *max_vl_num) +{ + switch (vl_hw_cap) { + case MLX5_VL_HW_0: + *max_vl_num = __IB_MAX_VL_0; + break; + case MLX5_VL_HW_0_1: + *max_vl_num = __IB_MAX_VL_0_1; + break; + case MLX5_VL_HW_0_3: + *max_vl_num = __IB_MAX_VL_0_3; + break; + case MLX5_VL_HW_0_7: + *max_vl_num = __IB_MAX_VL_0_7; + break; + case MLX5_VL_HW_0_14: + *max_vl_num = __IB_MAX_VL_0_14; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_hca_vport_context *rep; + u16 max_mtu; + u16 oper_mtu; + int err; + u16 ib_link_width_oper; + u8 vl_hw_cap; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + /* props being zeroed by the caller, avoid zeroing it here */ + + err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); + if (err) + goto out; + + props->lid = rep->lid; + props->lmc = rep->lmc; + props->sm_lid = rep->sm_lid; + props->has_smi = rep->has_smi; + props->sm_sl = rep->sm_sl; + props->state = rep->vport_state; + props->phys_state = rep->port_physical_state; + props->port_cap_flags = rep->cap_mask1; + props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); + props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); + props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); + props->bad_pkey_cntr = rep->pkey_violation_counter; 
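+	/* The qkey violation counter, subnet timeout and init type reply below are likewise taken from the queried vport context. */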
+ props->qkey_viol_cntr = rep->qkey_violation_counter; + props->subnet_timeout = rep->subnet_timeout; + props->init_type_reply = rep->init_type_reply; + + if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP) + props->port_cap_flags2 = rep->cap_mask2; + + err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper, + &props->active_speed, port); + if (err) + goto out; + + translate_active_width(ibdev, ib_link_width_oper, &props->active_width); + + mlx5_query_port_max_mtu(mdev, &max_mtu, port); + + props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu); + + mlx5_query_port_oper_mtu(mdev, &oper_mtu, port); + + props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu); + + err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port); + if (err) + goto out; + + err = translate_max_vl_num(ibdev, vl_hw_cap, + &props->max_vl_num); +out: + kfree(rep); + return err; +} + +int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + unsigned int count; + int ret; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + ret = mlx5_query_mad_ifc_port(ibdev, port, props); + break; + + case MLX5_VPORT_ACCESS_METHOD_HCA: + ret = mlx5_query_hca_port(ibdev, port, props); + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + ret = mlx5_query_port_roce(ibdev, port, props); + break; + + default: + ret = -EINVAL; + } + + if (!ret && props) { + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL); + if (!mdev) { + /* If the port isn't affiliated yet query the master. + * The master and slave will have the same values. + */ + mdev = dev->mdev; + port = 1; + put_mdev = false; + } + count = mlx5_core_reserved_gids_count(mdev); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + props->gid_tbl_len -= count; + } + return ret; +} + +static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + return mlx5_query_port_roce(ibdev, port, props); +} + +static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index, + u16 *pkey) +{ + /* Default special Pkey for representor device port as per the + * IB specification 1.3 section 10.9.1.2. + */ + *pkey = 0xffff; + return 0; +} + +static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev = dev->mdev; + + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_gids(ibdev, port, index, gid); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid); + + default: + return -EINVAL; + } + +} + +static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u32 port, + u16 index, u16 *pkey) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + u32 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num); + if (!mdev) { + /* The port isn't affiliated yet, get the PKey from the master + * port. For RoCE the PKey tables will be the same. 
+ */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + } + + err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0, + index, pkey); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index, + u16 *pkey) +{ + switch (mlx5_get_vport_access_method(ibdev)) { + case MLX5_VPORT_ACCESS_METHOD_MAD: + return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); + + case MLX5_VPORT_ACCESS_METHOD_HCA: + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey); + default: + return -EINVAL; + } +} + +static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_reg_node_desc in; + struct mlx5_reg_node_desc out; + int err; + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) + return 0; + + /* + * If possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. + */ + memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_NODE_DESC, 0, 1); + if (err) + return err; + + memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); + + return err; +} + +static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u32 port_num, u32 mask, + u32 value) +{ + struct mlx5_hca_vport_context ctx = {}; + struct mlx5_core_dev *mdev; + u32 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return -ENODEV; + + err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx); + if (err) + goto out; + + if (~ctx.cap_mask1_perm & mask) { + mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n", + mask, ctx.cap_mask1_perm); + err = -EINVAL; + goto out; + } + + ctx.cap_mask1 = value; + ctx.cap_mask1_perm = mask; + err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num, + 0, &ctx); + +out: + mlx5_ib_put_native_port_mdev(dev, port_num); + + return err; +} + +static int mlx5_ib_modify_port(struct ib_device *ibdev, u32 port, int mask, + struct ib_port_modify *props) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct ib_port_attr attr; + u32 tmp; + int err; + u32 change_mask; + u32 value; + bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) == + IB_LINK_LAYER_INFINIBAND); + + /* CM layer calls ib_modify_port() regardless of the link layer. For + * Ethernet ports, qkey violation and Port capabilities are meaningless. + */ + if (!is_ib) + return 0; + + if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) { + change_mask = props->clr_port_cap_mask | props->set_port_cap_mask; + value = ~props->clr_port_cap_mask | props->set_port_cap_mask; + return set_port_caps_atomic(dev, port, change_mask, value); + } + + mutex_lock(&dev->cap_mask_mutex); + + err = ib_query_port(ibdev, port, &attr); + if (err) + goto out; + + tmp = (attr.port_cap_flags | props->set_port_cap_mask) & + ~props->clr_port_cap_mask; + + err = mlx5_set_port_caps(dev->mdev, port, tmp); + +out: + mutex_unlock(&dev->cap_mask_mutex); + return err; +} + +static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) +{ + mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n", + caps & MLX5_LIB_CAP_4K_UAR ? 
"y" : "n"); +} + +static u16 calc_dynamic_bfregs(int uars_per_sys_page) +{ + /* Large page with non 4k uar support might limit the dynamic size */ + if (uars_per_sys_page == 1 && PAGE_SIZE > 4096) + return MLX5_MIN_DYN_BFREGS; + + return MLX5_MAX_DYN_BFREGS; +} + +static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, + struct mlx5_ib_alloc_ucontext_req_v2 *req, + struct mlx5_bfreg_info *bfregi) +{ + int uars_per_sys_page; + int bfregs_per_sys_page; + int ref_bfregs = req->total_num_bfregs; + + if (req->total_num_bfregs == 0) + return -EINVAL; + + BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE); + BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE); + + if (req->total_num_bfregs > MLX5_MAX_BFREGS) + return -ENOMEM; + + uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); + bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; + /* This holds the required static allocation asked by the user */ + req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); + if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) + return -EINVAL; + + bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; + bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page); + bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs; + bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page; + + mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n", + MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", + lib_uar_4k ? "yes" : "no", ref_bfregs, + req->total_num_bfregs, bfregi->total_num_bfregs, + bfregi->num_sys_pages); + + return 0; +} + +static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int err; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_static_sys_pages; i++) { + err = mlx5_cmd_uar_alloc(dev->mdev, &bfregi->sys_pages[i], + context->devx_uid); + if (err) + goto error; + + mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); + } + + for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++) + bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX; + + return 0; + +error: + for (--i; i >= 0; i--) + if (mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i], + context->devx_uid)) + mlx5_ib_warn(dev, "failed to free uar %d\n", i); + + return err; +} + +static void deallocate_uars(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_sys_pages; i++) + if (i < bfregi->num_static_sys_pages || + bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) + mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i], + context->devx_uid); +} + +int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) +{ + int err = 0; + + mutex_lock(&dev->lb.mutex); + if (td) + dev->lb.user_td++; + if (qp) + dev->lb.qps++; + + if (dev->lb.user_td == 2 || + dev->lb.qps == 1) { + if (!dev->lb.enabled) { + err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + dev->lb.enabled = true; + } + } + + mutex_unlock(&dev->lb.mutex); + + return err; +} + +void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) +{ + mutex_lock(&dev->lb.mutex); + if (td) + dev->lb.user_td--; + if (qp) + dev->lb.qps--; + + if (dev->lb.user_td == 1 && + dev->lb.qps == 0) { + if (dev->lb.enabled) { + 
mlx5_nic_vport_update_local_lb(dev->mdev, false); + dev->lb.enabled = false; + } + } + + mutex_unlock(&dev->lb.mutex); +} + +static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn, + u16 uid) +{ + int err; + + if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + return 0; + + err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + return err; + + return mlx5_ib_enable_lb(dev, true, false); +} + +static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn, + u16 uid) +{ + if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + return; + + mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid); + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || + (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && + !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + return; + + mlx5_ib_disable_lb(dev, true, false); +} + +static int set_ucontext_resp(struct ib_ucontext *uctx, + struct mlx5_ib_alloc_ucontext_resp *resp) +{ + struct ib_device *ibdev = uctx->device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_ucontext *context = to_mucontext(uctx); + struct mlx5_bfreg_info *bfregi = &context->bfregi; + int err; + + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { + err = mlx5_cmd_dump_fill_mkey(dev->mdev, + &resp->dump_fill_mkey); + if (err) + return err; + resp->comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY; + } + + resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); + if (dev->wc_support) + resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, + log_bf_reg_size); + resp->cache_line_size = cache_line_size(); + resp->max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); + resp->max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); + resp->max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp->max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); + resp->max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + resp->cqe_version = context->cqe_version; + resp->log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; + resp->num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_CAP_GEN(dev->mdev, + num_of_uars_per_page) : 1; + + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) { + if (mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_EGRESS)) + resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA) + resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA; + if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN) + resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN; + /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */ + } + + resp->tot_bfregs = bfregi->lib_uar_dyn ? 
0 : + bfregi->total_num_bfregs - bfregi->num_dyn_bfregs; + resp->num_ports = dev->num_ports; + resp->cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | + MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; + + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { + mlx5_query_min_inline(dev->mdev, &resp->eth_min_inline); + resp->eth_min_inline++; + } + + if (dev->mdev->clock_info) + resp->clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); + + /* + * We don't want to expose information from the PCI bar that is located + * after 4096 bytes, so if the arch only supports larger pages, let's + * pretend we don't support reading the HCA's core clock. This is also + * forced by mmap function. + */ + if (PAGE_SIZE <= 4096) { + resp->comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp->hca_core_clock_offset = + offsetof(struct mlx5_init_seg, + internal_timer_h) % PAGE_SIZE; + } + + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE; + + if (rt_supported(MLX5_CAP_GEN(dev->mdev, sq_ts_format)) && + rt_supported(MLX5_CAP_GEN(dev->mdev, rq_ts_format)) && + rt_supported(MLX5_CAP_ROCE(dev->mdev, qp_ts_format))) + resp->comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_REAL_TIME_TS; + + resp->num_dyn_bfregs = bfregi->num_dyn_bfregs; + + if (MLX5_CAP_GEN(dev->mdev, drain_sigerr)) + resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_SQD2RTS; + + return 0; +} + +static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, + struct ib_udata *udata) +{ + struct ib_device *ibdev = uctx->device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_alloc_ucontext_req_v2 req = {}; + struct mlx5_ib_alloc_ucontext_resp resp = {}; + struct mlx5_ib_ucontext *context = to_mucontext(uctx); + struct mlx5_bfreg_info *bfregi; + int ver; + int err; + size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, + max_cqe_version); + bool lib_uar_4k; + bool lib_uar_dyn; + + if (!dev->ib_active) + return -EAGAIN; + + if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (udata->inlen >= min_req_v2) + ver = 2; + else + return -EINVAL; + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return err; + + if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX) + return -EOPNOTSUPP; + + if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) + return -EOPNOTSUPP; + + req.total_num_bfregs = ALIGN(req.total_num_bfregs, + MLX5_NON_FP_BFREGS_PER_UAR); + if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) + return -EINVAL; + + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { + err = mlx5_ib_devx_create(dev, true); + if (err < 0) + goto out_ctx; + context->devx_uid = err; + } + + lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; + lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR; + bfregi = &context->bfregi; + + if (lib_uar_dyn) { + bfregi->lib_uar_dyn = lib_uar_dyn; + goto uar_done; + } + + /* updates req->total_num_bfregs */ + err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); + if (err) + goto out_devx; + + mutex_init(&bfregi->lock); + bfregi->lib_uar_4k = lib_uar_4k; + bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count), + GFP_KERNEL); + if (!bfregi->count) { + err = -ENOMEM; + goto out_devx; + } + + bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, + sizeof(*bfregi->sys_pages), + GFP_KERNEL); + if (!bfregi->sys_pages) { + err = -ENOMEM; + goto out_count; + } + + err = allocate_uars(dev, context); + if (err) + goto out_sys_pages; + +uar_done: + err = 
mlx5_ib_alloc_transport_domain(dev, &context->tdn, + context->devx_uid); + if (err) + goto out_uars; + + INIT_LIST_HEAD(&context->db_page_list); + mutex_init(&context->db_page_mutex); + + context->cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); + + err = set_ucontext_resp(uctx, &resp); + if (err) + goto out_mdev; + + resp.response_length = min(udata->outlen, sizeof(resp)); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto out_mdev; + + bfregi->ver = ver; + bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; + context->lib_caps = req.lib_caps; + print_lib_caps(dev, context->lib_caps); + + if (mlx5_ib_lag_should_assign_affinity(dev)) { + u32 port = mlx5_core_native_port_num(dev->mdev) - 1; + + atomic_set(&context->tx_port_affinity, + atomic_add_return( + 1, &dev->port[port].roce.tx_port_affinity)); + } + + return 0; + +out_mdev: + mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); + +out_uars: + deallocate_uars(dev, context); + +out_sys_pages: + kfree(bfregi->sys_pages); + +out_count: + kfree(bfregi->count); + +out_devx: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) + mlx5_ib_devx_destroy(dev, context->devx_uid); + +out_ctx: + return err; +} + +static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_alloc_ucontext_resp uctx_resp = {}; + int ret; + + ret = set_ucontext_resp(ibcontext, &uctx_resp); + if (ret) + return ret; + + uctx_resp.response_length = + min_t(size_t, + uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX), + sizeof(uctx_resp)); + + ret = uverbs_copy_to_struct_or_zero(attrs, + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + &uctx_resp, + sizeof(uctx_resp)); + return ret; +} + +static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + struct mlx5_bfreg_info *bfregi; + + bfregi = &context->bfregi; + mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); + + deallocate_uars(dev, context); + kfree(bfregi->sys_pages); + kfree(bfregi->count); + + if (context->devx_uid) + mlx5_ib_devx_destroy(dev, context->devx_uid); +} + +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, + int uar_idx) +{ + int fw_uars_per_page; + + fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; + + return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; +} + +static u64 uar_index2paddress(struct mlx5_ib_dev *dev, + int uar_idx) +{ + unsigned int fw_uars_per_page; + + fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? 
+ MLX5_UARS_IN_PAGE : 1; + + return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE); +} + +static int get_command(unsigned long offset) +{ + return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; +} + +static int get_arg(unsigned long offset) +{ + return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); +} + +static int get_index(unsigned long offset) +{ + return get_arg(offset); +} + +/* Index resides in an extra byte to enable larger values than 255 */ +static int get_extended_index(unsigned long offset) +{ + return get_arg(offset) | ((offset >> 16) & 0xff) << 8; +} + + +static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} + +static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) +{ + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + return "WC"; + case MLX5_IB_MMAP_REGULAR_PAGE: + return "best effort WC"; + case MLX5_IB_MMAP_NC_PAGE: + return "NC"; + case MLX5_IB_MMAP_DEVICE_MEM: + return "Device Memory"; + default: + return NULL; + } +} + +static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + if ((vma->vm_end - vma->vm_start != PAGE_SIZE) || + !(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) + return -EOPNOTSUPP; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; + + if (!dev->mdev->clock_info) + return -EOPNOTSUPP; + + return vm_insert_page(vma, vma->vm_start, + virt_to_page(dev->mdev->clock_info)); +} + +static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry) +{ + struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); + struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device); + struct mlx5_var_table *var_table = &dev->var_table; + struct mlx5_ib_ucontext *context = to_mucontext(entry->ucontext); + + switch (mentry->mmap_flag) { + case MLX5_IB_MMAP_TYPE_MEMIC: + case MLX5_IB_MMAP_TYPE_MEMIC_OP: + mlx5_ib_dm_mmap_free(dev, mentry); + break; + case MLX5_IB_MMAP_TYPE_VAR: + mutex_lock(&var_table->bitmap_lock); + clear_bit(mentry->page_idx, var_table->bitmap); + mutex_unlock(&var_table->bitmap_lock); + kfree(mentry); + break; + case MLX5_IB_MMAP_TYPE_UAR_WC: + case MLX5_IB_MMAP_TYPE_UAR_NC: + mlx5_cmd_uar_dealloc(dev->mdev, mentry->page_idx, + context->devx_uid); + kfree(mentry); + break; + default: + WARN_ON(true); + } +} + +static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi = &context->bfregi; + int err; + unsigned long idx; + phys_addr_t pfn; + pgprot_t prot; + u32 bfreg_dyn_idx = 0; + u32 uar_index; + int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC); + int max_valid_idx = dyn_uar ? 
bfregi->num_sys_pages : + bfregi->num_static_sys_pages; + + if (bfregi->lib_uar_dyn) + return -EINVAL; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (dyn_uar) + idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages; + else + idx = get_index(vma->vm_pgoff); + + if (idx >= max_valid_idx) { + mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n", + idx, max_valid_idx); + return -EINVAL; + } + + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: + case MLX5_IB_MMAP_REGULAR_PAGE: + /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ + prot = pgprot_writecombine(vma->vm_page_prot); + break; + case MLX5_IB_MMAP_NC_PAGE: + prot = pgprot_noncached(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + if (dyn_uar) { + int uars_per_page; + + uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); + bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR); + if (bfreg_dyn_idx >= bfregi->total_num_bfregs) { + mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n", + bfreg_dyn_idx, bfregi->total_num_bfregs); + return -EINVAL; + } + + mutex_lock(&bfregi->lock); + /* Fail if uar already allocated, first bfreg index of each + * page holds its count. + */ + if (bfregi->count[bfreg_dyn_idx]) { + mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx); + mutex_unlock(&bfregi->lock); + return -EINVAL; + } + + bfregi->count[bfreg_dyn_idx]++; + mutex_unlock(&bfregi->lock); + + err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index, + context->devx_uid); + if (err) { + mlx5_ib_warn(dev, "UAR alloc failed\n"); + goto free_bfreg; + } + } else { + uar_index = bfregi->sys_pages[idx]; + } + + pfn = uar_index2pfn(dev, uar_index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); + + err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE, + prot, NULL); + if (err) { + mlx5_ib_err(dev, + "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n", + err, mmap_cmd2str(cmd)); + goto err; + } + + if (dyn_uar) + bfregi->sys_pages[idx] = uar_index; + return 0; + +err: + if (!dyn_uar) + return err; + + mlx5_cmd_uar_dealloc(dev->mdev, idx, context->devx_uid); + +free_bfreg: + mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); + + return err; +} + +static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma) +{ + unsigned long idx; + u8 command; + + command = get_command(vma->vm_pgoff); + idx = get_extended_index(vma->vm_pgoff); + + return (command << 16 | idx); +} + +static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma, + struct ib_ucontext *ucontext) +{ + struct mlx5_user_mmap_entry *mentry; + struct rdma_user_mmap_entry *entry; + unsigned long pgoff; + pgprot_t prot; + phys_addr_t pfn; + int ret; + + pgoff = mlx5_vma_to_pgoff(vma); + entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff); + if (!entry) + return -EINVAL; + + mentry = to_mmmap(entry); + pfn = (mentry->address >> PAGE_SHIFT); + if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR || + mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC) + prot = pgprot_noncached(vma->vm_page_prot); + else + prot = pgprot_writecombine(vma->vm_page_prot); + ret = rdma_user_mmap_io(ucontext, vma, pfn, + entry->npages * PAGE_SIZE, + prot, + entry); + rdma_user_mmap_entry_put(&mentry->rdma_entry); + return ret; +} + +static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry) +{ + u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF; + u64 index = entry->rdma_entry.start_pgoff & 0xFFFF; + + return 
(((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) | + (index & 0xFF)) << PAGE_SHIFT; +} + +static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); + unsigned long command; + phys_addr_t pfn; + + command = get_command(vma->vm_pgoff); + switch (command) { + case MLX5_IB_MMAP_MAP_DC_INFO_PAGE: + return mlx5_ib_mmap_dc_info_page(dev, vma); + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: + if (!dev->wc_support) + return -EPERM; + fallthrough; + case MLX5_IB_MMAP_NC_PAGE: + case MLX5_IB_MMAP_REGULAR_PAGE: + return uar_mmap(dev, command, vma, context); + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + case MLX5_IB_MMAP_CORE_CLOCK: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; + + /* Don't expose to user-space information it shouldn't have */ + if (PAGE_SIZE > 4096) + return -EOPNOTSUPP; + + pfn = (dev->mdev->iseg_base + + offsetof(struct mlx5_init_seg, internal_timer_h)) >> + PAGE_SHIFT; + return rdma_user_mmap_io(&context->ibucontext, vma, pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + NULL); + case MLX5_IB_MMAP_CLOCK_INFO: + return mlx5_ib_mmap_clock_info_page(dev, vma, context); + + default: + return mlx5_ib_mmap_offset(dev, vma, ibcontext); + } + + return 0; +} + +static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mlx5_ib_pd *pd = to_mpd(ibpd); + struct ib_device *ibdev = ibpd->device; + struct mlx5_ib_alloc_pd_resp resp; + int err; + u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; + u16 uid = 0; + struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + uid = context ? context->devx_uid : 0; + MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); + MLX5_SET(alloc_pd_in, in, uid, uid); + err = mlx5_cmd_exec_inout(to_mdev(ibdev)->mdev, alloc_pd, in, out); + if (err) + return err; + + pd->pdn = MLX5_GET(alloc_pd_out, out, pd); + pd->uid = uid; + if (udata) { + resp.pdn = pd->pdn; + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid); + return -EFAULT; + } + } + + return 0; +} + +static int mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + return mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); +} + +static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *mqp = to_mqp(ibqp); + int err; + u16 uid; + + uid = ibqp->pd ? + to_mpd(ibqp->pd)->uid : 0; + + if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) { + mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n"); + return -EOPNOTSUPP; + } + + err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid); + if (err) + mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + int err; + u16 uid; + + uid = ibqp->pd ? 
+ to_mpd(ibqp->pd)->uid : 0; + err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid); + if (err) + mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", + ibqp->qp_num, gid->raw); + + return err; +} + +static int init_node_data(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); + if (err) + return err; + + dev->mdev->rev_id = dev->mdev->pdev->revision; + + return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); +} + +static ssize_t fw_pages_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "%d\n", dev->mdev->priv.fw_pages); +} +static DEVICE_ATTR_RO(fw_pages); + +static ssize_t reg_pages_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); +} +static DEVICE_ATTR_RO(reg_pages); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "MT%d\n", dev->mdev->pdev->device); +} +static DEVICE_ATTR_RO(hca_type); + +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "%x\n", dev->mdev->rev_id); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct mlx5_ib_dev *dev = + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + + return sysfs_emit(buf, "%.*s\n", MLX5_BOARD_ID_LEN, + dev->mdev->board_id); +} +static DEVICE_ATTR_RO(board_id); + +static struct attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + &dev_attr_fw_pages.attr, + &dev_attr_reg_pages.attr, + NULL, +}; + +static const struct attribute_group mlx5_attr_group = { + .attrs = mlx5_class_attributes, +}; + +static void pkey_change_handler(struct work_struct *work) +{ + struct mlx5_ib_port_resources *ports = + container_of(work, struct mlx5_ib_port_resources, + pkey_change_work); + + if (!ports->gsi) + /* + * We got this event before device was fully configured + * and MAD registration code wasn't called/finished yet. 
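+		 * Nothing to do until the GSI agent has been created.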
+ */ + return; + + mlx5_ib_gsi_pkey_change(ports->gsi); +} + +static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_qp *mqp; + struct mlx5_ib_cq *send_mcq, *recv_mcq; + struct mlx5_core_cq *mcq; + struct list_head cq_armed_list; + unsigned long flags_qp; + unsigned long flags_cq; + unsigned long flags; + + INIT_LIST_HEAD(&cq_armed_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_armed_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + /*At that point all inflight post send were put to be executed as of we + * lock/unlock above locks Now need to arm all involved CQs. + */ + list_for_each_entry(mcq, &cq_armed_list, reset_notify) { + mcq->comp(mcq, NULL); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); +} + +static void delay_drop_handler(struct work_struct *work) +{ + int err; + struct mlx5_ib_delay_drop *delay_drop = + container_of(work, struct mlx5_ib_delay_drop, + delay_drop_work); + + atomic_inc(&delay_drop->events_cnt); + + mutex_lock(&delay_drop->lock); + err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout); + if (err) { + mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", + delay_drop->timeout); + delay_drop->activate = false; + } + mutex_unlock(&delay_drop->lock); +} + +static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe, + struct ib_event *ibev) +{ + u32 port = (eqe->data.port.port >> 4) & 0xf; + + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT: + if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET) + schedule_work(&ibdev->delay_drop.delay_drop_work); + break; + default: /* do nothing */ + return; + } +} + +static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe, + struct ib_event *ibev) +{ + u32 port = (eqe->data.port.port >> 4) & 0xf; + + ibev->element.port_num = port; + + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: + /* In RoCE, port up/down events are handled in + * mlx5_netdev_event(). + */ + if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ? 
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + break; + + case MLX5_PORT_CHANGE_SUBTYPE_LID: + ibev->event = IB_EVENT_LID_CHANGE; + break; + + case MLX5_PORT_CHANGE_SUBTYPE_PKEY: + ibev->event = IB_EVENT_PKEY_CHANGE; + schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); + break; + + case MLX5_PORT_CHANGE_SUBTYPE_GUID: + ibev->event = IB_EVENT_GID_CHANGE; + break; + + case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG: + ibev->event = IB_EVENT_CLIENT_REREGISTER; + break; + default: + return -EINVAL; + } + + return 0; +} + +static void mlx5_ib_handle_event(struct work_struct *_work) +{ + struct mlx5_ib_event_work *work = + container_of(_work, struct mlx5_ib_event_work, work); + struct mlx5_ib_dev *ibdev; + struct ib_event ibev; + bool fatal = false; + + if (work->is_slave) + ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi); + else + ibdev = work->dev; + + if (!ibdev) + goto out; + + switch (work->event) { + case MLX5_DEV_EVENT_SYS_ERROR: + ibev.event = IB_EVENT_DEVICE_FATAL; + mlx5_ib_handle_internal_error(ibdev); + ibev.element.port_num = (u8)(unsigned long)work->param; + fatal = true; + break; + case MLX5_EVENT_TYPE_PORT_CHANGE: + if (handle_port_change(ibdev, work->param, &ibev)) + goto out; + break; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + handle_general_event(ibdev, work->param, &ibev); + fallthrough; + default: + goto out; + } + + ibev.device = &ibdev->ib_dev; + + if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) { + mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num); + goto out; + } + + if (ibdev->ib_active) + ib_dispatch_event(&ibev); + + if (fatal) + ibdev->ib_active = false; +out: + kfree(work); +} + +static int mlx5_ib_event(struct notifier_block *nb, + unsigned long event, void *param) +{ + struct mlx5_ib_event_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + + INIT_WORK(&work->work, mlx5_ib_handle_event); + work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events); + work->is_slave = false; + work->param = param; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); + + return NOTIFY_OK; +} + +static int mlx5_ib_event_slave_port(struct notifier_block *nb, + unsigned long event, void *param) +{ + struct mlx5_ib_event_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + + INIT_WORK(&work->work, mlx5_ib_handle_event); + work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events); + work->is_slave = true; + work->param = param; + work->event = event; + queue_work(mlx5_ib_event_wq, &work->work); + + return NOTIFY_OK; +} + +static int set_has_smi_cap(struct mlx5_ib_dev *dev) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + int port; + + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + return 0; + + for (port = 1; port <= dev->num_ports; port++) { + if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) { + dev->port_caps[port - 1].has_smi = true; + continue; + } + err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0, + &vport_ctx); + if (err) { + mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", + port, err); + return err; + } + dev->port_caps[port - 1].has_smi = vport_ctx.has_smi; + } + + return 0; +} + +static void get_ext_port_caps(struct mlx5_ib_dev *dev) +{ + unsigned int port; + + rdma_for_each_port (&dev->ib_dev, port) + mlx5_query_ext_port_caps(dev, port); +} + +static u8 mlx5_get_umr_fence(u8 umr_fence_cap) +{ + switch (umr_fence_cap) { + case MLX5_CAP_UMR_FENCE_NONE: + return 
MLX5_FENCE_MODE_NONE; + case MLX5_CAP_UMR_FENCE_SMALL: + return MLX5_FENCE_MODE_INITIATOR_SMALL; + default: + return MLX5_FENCE_MODE_STRONG_ORDERING; + } +} + +static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_srq_init_attr attr; + struct ib_device *ibdev; + struct ib_cq_init_attr cq_attr = {.cqe = 1}; + int port; + int ret = 0; + + ibdev = &dev->ib_dev; + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return -EOPNOTSUPP; + + devr->p0 = ib_alloc_pd(ibdev, 0); + if (IS_ERR(devr->p0)) + return PTR_ERR(devr->p0); + + devr->c0 = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); + if (IS_ERR(devr->c0)) { + ret = PTR_ERR(devr->c0); + goto error1; + } + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0); + if (ret) + goto error2; + + ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0); + if (ret) + goto error3; + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_XRC; + attr.ext.cq = devr->c0; + + devr->s0 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(devr->s0)) { + ret = PTR_ERR(devr->s0); + goto err_create; + } + + memset(&attr, 0, sizeof(attr)); + attr.attr.max_sge = 1; + attr.attr.max_wr = 1; + attr.srq_type = IB_SRQT_BASIC; + + devr->s1 = ib_create_srq(devr->p0, &attr); + if (IS_ERR(devr->s1)) { + ret = PTR_ERR(devr->s1); + goto error6; + } + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + + return 0; + +error6: + ib_destroy_srq(devr->s0); +err_create: + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); +error3: + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); +error2: + ib_destroy_cq(devr->c0); +error1: + ib_dealloc_pd(devr->p0); + return ret; +} + +static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + /* + * Make sure no change P_Key work items are still executing. + * + * At this stage, the mlx5_ib_event should be unregistered + * and it ensures that no new works are added. 
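+	 * cancel_work_sync() below can therefore complete without racing against new work submissions.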
+ */ + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); + + ib_destroy_srq(devr->s1); + ib_destroy_srq(devr->s0); + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0); + mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0); + ib_destroy_cq(devr->c0); + ib_dealloc_pd(devr->p0); +} + +static u32 get_core_cap_flags(struct ib_device *ibdev, + struct mlx5_hca_vport_context *rep) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); + u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + bool raw_support = !mlx5_core_mp_enabled(dev->mdev); + u32 ret = 0; + + if (rep->grh_required) + ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED; + + if (ll == IB_LINK_LAYER_INFINIBAND) + return ret | RDMA_CORE_PORT_IBA_IB; + + if (raw_support) + ret |= RDMA_CORE_PORT_RAW_PACKET; + if (dev->is_rep) + return ret; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) + return ret; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) + return ret; + + if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE; + + if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return ret; +} + +static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); + struct mlx5_hca_vport_context rep = {0}; + int err; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + if (ll == IB_LINK_LAYER_INFINIBAND) { + err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0, + &rep); + if (err) + return err; + } + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep); + if (!dev->is_rep && + ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))) + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static int mlx5_port_rep_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + return 0; +} + +static void get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct mlx5_ib_dev *dev = + container_of(ibdev, struct mlx5_ib_dev, ib_dev); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d", + fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), + fw_rev_sub(dev->mdev)); +} + +static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev, + MLX5_FLOW_NAMESPACE_LAG); + struct mlx5_flow_table *ft; + int err; + + if (!ns || !mlx5_lag_is_active(mdev) || mlx5_lag_is_mpesw(mdev)) + return 0; + + err = mlx5_cmd_create_vport_lag(mdev); + if (err) + return err; + + ft = mlx5_create_lag_demux_flow_table(ns, 0, 0); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto err_destroy_vport_lag; + } + + dev->flow_db->lag_demux_ft = ft; + dev->lag_ports = mlx5_lag_get_num_ports(mdev); + dev->lag_active = true; + return 0; + +err_destroy_vport_lag: + 
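+	/* Creating the LAG demux flow table failed: tear down the vport LAG created above. */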
mlx5_cmd_destroy_vport_lag(mdev); + return err; +} + +static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + + if (dev->lag_active) { + dev->lag_active = false; + + mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); + dev->flow_db->lag_demux_ft = NULL; + + mlx5_cmd_destroy_vport_lag(mdev); + } +} + +static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num) +{ + int err; + + dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->port[port_num].roce.nb); + if (err) { + dev->port[port_num].roce.nb.notifier_call = NULL; + return err; + } + + return 0; +} + +static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num) +{ + if (dev->port[port_num].roce.nb.notifier_call) { + unregister_netdevice_notifier(&dev->port[port_num].roce.nb); + dev->port[port_num].roce.nb.notifier_call = NULL; + } +} + +static int mlx5_enable_eth(struct mlx5_ib_dev *dev) +{ + int err; + + if (!dev->is_rep && dev->profile != &raw_eth_profile) { + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + } + + err = mlx5_eth_lag_init(dev); + if (err) + goto err_disable_roce; + + return 0; + +err_disable_roce: + if (!dev->is_rep && dev->profile != &raw_eth_profile) + mlx5_nic_vport_disable_roce(dev->mdev); + + return err; +} + +static void mlx5_disable_eth(struct mlx5_ib_dev *dev) +{ + mlx5_eth_lag_cleanup(dev); + if (!dev->is_rep && dev->profile != &raw_eth_profile) + mlx5_nic_vport_disable_roce(dev->mdev); +} + +static int mlx5_ib_rn_get_params(struct ib_device *device, u32 port_num, + enum rdma_netdev_t type, + struct rdma_netdev_alloc_params *params) +{ + if (type != RDMA_NETDEV_IPOIB) + return -EOPNOTSUPP; + + return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params); +} + +static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + char lbuf[20]; + int len; + + len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout); + return simple_read_from_buffer(buf, count, pos, lbuf, len); +} + +static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_delay_drop *delay_drop = filp->private_data; + u32 timeout; + u32 var; + + if (kstrtouint_from_user(buf, count, 0, &var)) + return -EFAULT; + + timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS * + 1000); + if (timeout != var) + mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n", + timeout); + + delay_drop->timeout = timeout; + + return count; +} + +static const struct file_operations fops_delay_drop_timeout = { + .owner = THIS_MODULE, + .open = simple_open, + .write = delay_drop_timeout_write, + .read = delay_drop_timeout_read, +}; + +static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + struct mlx5_ib_port *port = &ibdev->port[port_num]; + int comps; + int err; + int i; + + lockdep_assert_held(&mlx5_ib_multiport_mutex); + + mlx5_ib_cleanup_cong_debugfs(ibdev, port_num); + + spin_lock(&port->mp.mpi_lock); + if (!mpi->ibdev) { + spin_unlock(&port->mp.mpi_lock); + return; + } + + mpi->ibdev = NULL; + + spin_unlock(&port->mp.mpi_lock); + if (mpi->mdev_events.notifier_call) + mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events); + mpi->mdev_events.notifier_call = NULL; + 
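+	/* Drop the netdev notifier before waiting out the remaining mdev references. */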
mlx5_remove_netdev_notifier(ibdev, port_num); + spin_lock(&port->mp.mpi_lock); + + comps = mpi->mdev_refcnt; + if (comps) { + mpi->unaffiliate = true; + init_completion(&mpi->unref_comp); + spin_unlock(&port->mp.mpi_lock); + + for (i = 0; i < comps; i++) + wait_for_completion(&mpi->unref_comp); + + spin_lock(&port->mp.mpi_lock); + mpi->unaffiliate = false; + } + + port->mp.mpi = NULL; + + spin_unlock(&port->mp.mpi_lock); + + err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev); + + mlx5_ib_dbg(ibdev, "unaffiliated port %u\n", port_num + 1); + /* Log an error, still needed to cleanup the pointers and add + * it back to the list. + */ + if (err) + mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n", + port_num + 1); + + ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN; +} + +static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + int err; + + lockdep_assert_held(&mlx5_ib_multiport_mutex); + + spin_lock(&ibdev->port[port_num].mp.mpi_lock); + if (ibdev->port[port_num].mp.mpi) { + mlx5_ib_dbg(ibdev, "port %u already affiliated.\n", + port_num + 1); + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + return false; + } + + ibdev->port[port_num].mp.mpi = mpi; + mpi->ibdev = ibdev; + mpi->mdev_events.notifier_call = NULL; + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + + err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev); + if (err) + goto unbind; + + err = mlx5_add_netdev_notifier(ibdev, port_num); + if (err) { + mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n", + port_num + 1); + goto unbind; + } + + mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port; + mlx5_notifier_register(mpi->mdev, &mpi->mdev_events); + + mlx5_ib_init_cong_debugfs(ibdev, port_num); + + return true; + +unbind: + mlx5_ib_unbind_slave_port(ibdev, mpi); + return false; +} + +static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) +{ + u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + struct mlx5_ib_multiport_info *mpi; + int err; + u32 i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return 0; + + err = mlx5_query_nic_vport_system_image_guid(dev->mdev, + &dev->sys_image_guid); + if (err) + return err; + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + bool bound = false; + + /* build a stub multiport info struct for the native port. 
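+		 * The stub is freed directly in mlx5_ib_cleanup_multiport_master()
+		 * and never moves to the unaffiliated list.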
*/ + if (i == port_num) { + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) { + mutex_unlock(&mlx5_ib_multiport_mutex); + mlx5_nic_vport_disable_roce(dev->mdev); + return -ENOMEM; + } + + mpi->is_master = true; + mpi->mdev = dev->mdev; + mpi->sys_image_guid = dev->sys_image_guid; + dev->port[i].mp.mpi = mpi; + mpi->ibdev = dev; + mpi = NULL; + continue; + } + + list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, + list) { + if (dev->sys_image_guid == mpi->sys_image_guid && + (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + bound = mlx5_ib_bind_slave_port(dev, mpi); + } + + if (bound) { + dev_dbg(mpi->mdev->device, + "removing port from unaffiliated list.\n"); + mlx5_ib_dbg(dev, "port %d bound\n", i + 1); + list_del(&mpi->list); + break; + } + } + if (!bound) + mlx5_ib_dbg(dev, "no free port found for port %d\n", + i + 1); + } + + list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return err; +} + +static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) +{ + u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + u32 i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].mp.mpi) { + /* Destroy the native port stub */ + if (i == port_num) { + kfree(dev->port[i].mp.mpi); + dev->port[i].mp.mpi = NULL; + } else { + mlx5_ib_dbg(dev, "unbinding port_num: %u\n", + i + 1); + list_add_tail(&dev->port[i].mp.mpi->list, + &mlx5_ib_unaffiliated_port_list); + mlx5_ib_unbind_slave_port(dev, + dev->port[i].mp.mpi); + } + } + } + + mlx5_ib_dbg(dev, "removing from devlist\n"); + list_del(&dev->ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + + mlx5_nic_vport_disable_roce(dev->mdev); +} + +static int mmap_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_user_mmap_entry *obj = uobject->object; + + rdma_user_mmap_entry_remove(&obj->rdma_entry); + return 0; +} + +static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c, + struct mlx5_user_mmap_entry *entry, + size_t length) +{ + return rdma_user_mmap_entry_insert_range( + &c->ibucontext, &entry->rdma_entry, length, + (MLX5_IB_MMAP_OFFSET_START << 16), + ((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1)); +} + +static struct mlx5_user_mmap_entry * +alloc_var_entry(struct mlx5_ib_ucontext *c) +{ + struct mlx5_user_mmap_entry *entry; + struct mlx5_var_table *var_table; + u32 page_idx; + int err; + + var_table = &to_mdev(c->ibucontext.device)->var_table; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + mutex_lock(&var_table->bitmap_lock); + page_idx = find_first_zero_bit(var_table->bitmap, + var_table->num_var_hw_entries); + if (page_idx >= var_table->num_var_hw_entries) { + err = -ENOSPC; + mutex_unlock(&var_table->bitmap_lock); + goto end; + } + + set_bit(page_idx, var_table->bitmap); + mutex_unlock(&var_table->bitmap_lock); + + entry->address = var_table->hw_start_addr + + (page_idx * var_table->stride_size); + entry->page_idx = page_idx; + entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR; + + err = mlx5_rdma_user_mmap_entry_insert(c, entry, + var_table->stride_size); + if (err) + goto err_insert; + + return entry; + +err_insert: + mutex_lock(&var_table->bitmap_lock); + clear_bit(page_idx, var_table->bitmap); + 
mutex_unlock(&var_table->bitmap_lock); +end: + kfree(entry); + return ERR_PTR(err); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE); + struct mlx5_ib_ucontext *c; + struct mlx5_user_mmap_entry *entry; + u64 mmap_offset; + u32 length; + int err; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + + entry = alloc_var_entry(c); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + mmap_offset = mlx5_entry_to_mmap_offset(entry); + length = entry->rdma_entry.npages * PAGE_SIZE; + uobj->object = entry; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, + &entry->page_idx, sizeof(entry->page_idx)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, + &length, sizeof(length)); + return err; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_VAR_OBJ_ALLOC, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE, + MLX5_IB_OBJECT_VAR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_VAR_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_VAR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR, + UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC), + &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY)); + +static bool var_is_supported(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q); +} + +static struct mlx5_user_mmap_entry * +alloc_uar_entry(struct mlx5_ib_ucontext *c, + enum mlx5_ib_uapi_uar_alloc_type alloc_type) +{ + struct mlx5_user_mmap_entry *entry; + struct mlx5_ib_dev *dev; + u32 uar_index; + int err; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + dev = to_mdev(c->ibucontext.device); + err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index, c->devx_uid); + if (err) + goto end; + + entry->page_idx = uar_index; + entry->address = uar_index2paddress(dev, uar_index); + if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) + entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC; + else + entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC; + + err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE); + if (err) + goto err_insert; + + return entry; + +err_insert: + mlx5_cmd_uar_dealloc(dev->mdev, uar_index, c->devx_uid); +end: + kfree(entry); + return ERR_PTR(err); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE); + enum mlx5_ib_uapi_uar_alloc_type alloc_type; + struct mlx5_ib_ucontext *c; + struct mlx5_user_mmap_entry *entry; + u64 mmap_offset; + u32 length; + int err; + + c = 
to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + + err = uverbs_get_const(&alloc_type, attrs, + MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE); + if (err) + return err; + + if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF && + alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) + return -EOPNOTSUPP; + + if (!to_mdev(c->ibucontext.device)->wc_support && + alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) + return -EOPNOTSUPP; + + entry = alloc_uar_entry(c, alloc_type); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + mmap_offset = mlx5_entry_to_mmap_offset(entry); + length = entry->rdma_entry.npages * PAGE_SIZE; + uobj->object = entry; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, + &entry->page_idx, sizeof(entry->page_idx)); + if (err) + return err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, + &length, sizeof(length)); + return err; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_UAR_OBJ_ALLOC, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE, + MLX5_IB_OBJECT_UAR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE, + enum mlx5_ib_uapi_uar_alloc_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_UAR_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_UAR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR, + UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC), + &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY)); + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_flow_action, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + enum mlx5_ib_uapi_flow_action_flags)); + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_query_context, + UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, + UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp, + dump_fill_mkey), + UA_MANDATORY)); + +static const struct uapi_definition mlx5_ib_defs[] = { + UAPI_DEF_CHAIN(mlx5_ib_devx_defs), + UAPI_DEF_CHAIN(mlx5_ib_flow_defs), + UAPI_DEF_CHAIN(mlx5_ib_qos_defs), + UAPI_DEF_CHAIN(mlx5_ib_std_types_defs), + UAPI_DEF_CHAIN(mlx5_ib_dm_defs), + + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, + &mlx5_ib_flow_action), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, + UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), + {} +}; + +static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_multiport_master(dev); + WARN_ON(!xa_empty(&dev->odp_mkeys)); + mutex_destroy(&dev->cap_mask_mutex); + WARN_ON(!xa_empty(&dev->sig_mrs)); + WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES)); +} + +static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev 
*mdev = dev->mdev; + int err; + int i, j; + + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; + dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.dev.parent = mdev->device; + dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; + + for (i = 0; i < dev->num_ports; i++) { + spin_lock_init(&dev->port[i].mp.mpi_lock); + rwlock_init(&dev->port[i].roce.netdev_lock); + dev->port[i].roce.dev = dev; + dev->port[i].roce.native_port_num = i + 1; + dev->port[i].roce.last_port_state = IB_PORT_DOWN; + } + + for (i = 0; i < MLX5_MAX_PORTS; i++) { + for (j = 0; j < MLX5_MAX_MACSEC_GIDS; j++) + dev->reserved_gids[i][j].macsec_index = -1; + } + + err = mlx5_ib_init_multiport_master(dev); + if (err) + return err; + + err = set_has_smi_cap(dev); + if (err) + goto err_mp; + + err = mlx5_query_max_pkeys(&dev->ib_dev, &dev->pkey_table_len); + if (err) + goto err_mp; + + if (mlx5_use_mad_ifc(dev)) + get_ext_port_caps(dev); + + dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_count(mdev); + if (mlx5_lag_is_active(dev->mdev)) + dev->ib_dev.dev_immutable.bond_device = true; + + mutex_init(&dev->cap_mask_mutex); + INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); + xa_init(&dev->odp_mkeys); + xa_init(&dev->sig_mrs); + atomic_set(&dev->mkey_var, 0); + + spin_lock_init(&dev->dm.lock); + dev->dm.dev = mdev; + return 0; + +err_mp: + mlx5_ib_cleanup_multiport_master(dev); + return err; +} + +static int mlx5_ib_enable_driver(struct ib_device *dev) +{ + struct mlx5_ib_dev *mdev = to_mdev(dev); + int ret; + + ret = mlx5_ib_test_wc(mdev); + mlx5_ib_dbg(mdev, "Write-Combining %s", + mdev->wc_support ? "supported" : "not supported"); + + return ret; +} + +static const struct ib_device_ops mlx5_ib_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_MLX5, + .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, + + .add_gid = mlx5_ib_add_gid, + .alloc_mr = mlx5_ib_alloc_mr, + .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity, + .alloc_pd = mlx5_ib_alloc_pd, + .alloc_ucontext = mlx5_ib_alloc_ucontext, + .attach_mcast = mlx5_ib_mcg_attach, + .check_mr_status = mlx5_ib_check_mr_status, + .create_ah = mlx5_ib_create_ah, + .create_cq = mlx5_ib_create_cq, + .create_qp = mlx5_ib_create_qp, + .create_srq = mlx5_ib_create_srq, + .create_user_ah = mlx5_ib_create_ah, + .dealloc_pd = mlx5_ib_dealloc_pd, + .dealloc_ucontext = mlx5_ib_dealloc_ucontext, + .del_gid = mlx5_ib_del_gid, + .dereg_mr = mlx5_ib_dereg_mr, + .destroy_ah = mlx5_ib_destroy_ah, + .destroy_cq = mlx5_ib_destroy_cq, + .destroy_qp = mlx5_ib_destroy_qp, + .destroy_srq = mlx5_ib_destroy_srq, + .detach_mcast = mlx5_ib_mcg_detach, + .disassociate_ucontext = mlx5_ib_disassociate_ucontext, + .drain_rq = mlx5_ib_drain_rq, + .drain_sq = mlx5_ib_drain_sq, + .device_group = &mlx5_attr_group, + .enable_driver = mlx5_ib_enable_driver, + .get_dev_fw_str = get_dev_fw_str, + .get_dma_mr = mlx5_ib_get_dma_mr, + .get_link_layer = mlx5_ib_port_link_layer, + .map_mr_sg = mlx5_ib_map_mr_sg, + .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi, + .mmap = mlx5_ib_mmap, + .mmap_free = mlx5_ib_mmap_free, + .modify_cq = mlx5_ib_modify_cq, + .modify_device = mlx5_ib_modify_device, + .modify_port = mlx5_ib_modify_port, + .modify_qp = mlx5_ib_modify_qp, + .modify_srq = mlx5_ib_modify_srq, + .poll_cq = mlx5_ib_poll_cq, + .post_recv = mlx5_ib_post_recv_nodrain, + .post_send = mlx5_ib_post_send_nodrain, + .post_srq_recv = mlx5_ib_post_srq_recv, + .process_mad = mlx5_ib_process_mad, + .query_ah = mlx5_ib_query_ah, + 
.query_device = mlx5_ib_query_device, + .query_gid = mlx5_ib_query_gid, + .query_pkey = mlx5_ib_query_pkey, + .query_qp = mlx5_ib_query_qp, + .query_srq = mlx5_ib_query_srq, + .query_ucontext = mlx5_ib_query_ucontext, + .reg_user_mr = mlx5_ib_reg_user_mr, + .reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf, + .req_notify_cq = mlx5_ib_arm_cq, + .rereg_user_mr = mlx5_ib_rereg_user_mr, + .resize_cq = mlx5_ib_resize_cq, + + INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), + INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, mlx5_ib_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext), +}; + +static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = { + .rdma_netdev_get_params = mlx5_ib_rn_get_params, +}; + +static const struct ib_device_ops mlx5_ib_dev_nvmf_ops = { + .create_nvmf_backend_ctrl = mlx5_ib_create_nvmf_backend_ctrl, + .destroy_nvmf_backend_ctrl = mlx5_ib_destroy_nvmf_backend_ctrl, + .attach_nvmf_ns = mlx5_ib_attach_nvmf_ns, + .detach_nvmf_ns = mlx5_ib_detach_nvmf_ns, +}; + +static const struct ib_device_ops mlx5_ib_dev_frontend_ns_context_ops = { + .query_nvmf_ns = mlx5_ib_query_nvmf_ns, +}; + +static const struct ib_device_ops mlx5_ib_dev_sriov_ops = { + .get_vf_config = mlx5_ib_get_vf_config, + .get_vf_guid = mlx5_ib_get_vf_guid, + .get_vf_stats = mlx5_ib_get_vf_stats, + .set_vf_guid = mlx5_ib_set_vf_guid, + .set_vf_link_state = mlx5_ib_set_vf_link_state, +}; + +static const struct ib_device_ops mlx5_ib_dev_mw_ops = { + .alloc_mw = mlx5_ib_alloc_mw, + .dealloc_mw = mlx5_ib_dealloc_mw, + + INIT_RDMA_OBJ_SIZE(ib_mw, mlx5_ib_mw, ibmw), +}; + +static const struct ib_device_ops mlx5_ib_dev_xrc_ops = { + .alloc_xrcd = mlx5_ib_alloc_xrcd, + .dealloc_xrcd = mlx5_ib_dealloc_xrcd, + + INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd), +}; + +static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_var_table *var_table = &dev->var_table; + u8 log_doorbell_bar_size; + u8 log_doorbell_stride; + u64 bar_size; + + log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev, + log_doorbell_bar_size); + log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev, + log_doorbell_stride); + var_table->hw_start_addr = dev->mdev->bar_addr + + MLX5_CAP64_DEV_VDPA_EMULATION(mdev, + doorbell_bar_offset); + bar_size = (1ULL << log_doorbell_bar_size) * 4096; + var_table->stride_size = 1ULL << log_doorbell_stride; + var_table->num_var_hw_entries = div_u64(bar_size, + var_table->stride_size); + mutex_init(&var_table->bitmap_lock); + var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries, + GFP_KERNEL); + return (var_table->bitmap) ? 
0 : -ENOMEM; +} + +static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev) +{ + bitmap_free(dev->var_table.bitmap); +} + +static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + int err; + + if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && + IS_ENABLED(CONFIG_MLX5_CORE_IPOIB)) + ib_set_device_ops(&dev->ib_dev, + &mlx5_ib_dev_ipoib_enhanced_ops); + + if (MLX5_CAP_GEN(mdev, nvmf_target_offload)) { + mlx5_ib_internal_fill_nvmf_caps(dev); + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_nvmf_ops); + + if (MLX5_CAP_NVMF(mdev, frontend_namespace_context)) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_frontend_ns_context_ops); + } + + if (mlx5_core_is_pf(mdev)) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops); + + dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence)); + + if (MLX5_CAP_GEN(mdev, imaicl)) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops); + + if (MLX5_CAP_GEN(mdev, xrc)) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops); + + if (MLX5_CAP_DEV_MEM(mdev, memic) || + MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops); + + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops); + + if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) + dev->ib_dev.driver_def = mlx5_ib_defs; + + err = init_node_data(dev); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) || + MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + mutex_init(&dev->lb.mutex); + + if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) { + err = mlx5_ib_init_var_table(dev); + if (err) + return err; + } + + dev->ib_dev.use_cq_dim = true; + + return 0; +} + +static const struct ib_device_ops mlx5_ib_dev_port_ops = { + .get_port_immutable = mlx5_port_immutable, + .query_port = mlx5_ib_query_port, +}; + +static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev) +{ + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops); + return 0; +} + +static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = { + .get_port_immutable = mlx5_port_rep_immutable, + .query_port = mlx5_ib_rep_query_port, + .query_pkey = mlx5_ib_rep_query_pkey, +}; + +static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev) +{ + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops); + return 0; +} + +static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = { + .create_rwq_ind_table = mlx5_ib_create_rwq_ind_table, + .create_wq = mlx5_ib_create_wq, + .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, + .destroy_wq = mlx5_ib_destroy_wq, + .get_netdev = mlx5_ib_get_netdev, + .modify_wq = mlx5_ib_modify_wq, + + INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table, + ib_rwq_ind_tbl), +}; + +static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + u32 port_num = 0; + int err; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops); + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + + /* Register only for native ports */ + err = mlx5_add_netdev_notifier(dev, port_num); + if (err) + return err; + + err = mlx5_enable_eth(dev); + if (err) + goto cleanup; + } + + return 0; +cleanup: + 
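+	/* mlx5_enable_eth() failed: unregister the netdev notifier that was just added. */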
mlx5_remove_netdev_notifier(dev, port_num); + return err; +} + +static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + u32 port_num; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + mlx5_disable_eth(dev); + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + mlx5_remove_netdev_notifier(dev, port_num); + } +} + +static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev) +{ + mlx5_ib_init_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); + return 0; +} + +static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); +} + +static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) +{ + dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); + return PTR_ERR_OR_ZERO(dev->mdev->priv.uar); +} + +static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); +} + +static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); + if (err) + return err; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); + if (err) + mlx5_free_bfreg(dev->mdev, &dev->bfreg); + + return err; +} + +static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); +} + +static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) +{ + const char *name; + + if (!mlx5_lag_is_active(dev->mdev)) + name = "mlx5_%d"; + else + name = "mlx5_bond_%d"; + return ib_register_device(&dev->ib_dev, name, &dev->mdev->pdev->dev); +} + +static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) +{ + int err; + + err = mlx5_mr_cache_cleanup(dev); + if (err) + mlx5_ib_warn(dev, "mr cache cleanup failed\n"); + + if (dev->umrc.qp) + ib_destroy_qp(dev->umrc.qp); + if (dev->umrc.cq) + ib_free_cq(dev->umrc.cq); + if (dev->umrc.pd) + ib_dealloc_pd(dev->umrc.pd); +} + +static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) +{ + ib_unregister_device(&dev->ib_dev); +} + +enum { + MAX_UMR_WR = 128, +}; + +static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp) +{ + struct ib_qp_attr attr = {}; + int ret; + + attr.qp_state = IB_QPS_INIT; + attr.port_num = 1; + ret = ib_modify_qp(qp, &attr, + IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + return ret; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTR; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + return ret; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTS; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + return ret; + } + + return 0; +} + +static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr *init_attr; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + int ret; + + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!init_attr) + return -ENOMEM; + + pd = ib_alloc_pd_notrack(&dev->ib_dev, 0); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + ret = PTR_ERR(pd); + goto error_0; + } + 
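+	/* The UMR CQ is polled in softirq context and sized to the same
+	 * 128 entries as the MAX_UMR_WR send queue depth.
+	 */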
+ cq = ib_alloc_cq_notrack(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ, NULL); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; + } + + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = ib_create_qp(pd, init_attr); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + + ret = mlx5r_umr_qp_rst2rts(dev, qp); + if (ret) + goto error_4; + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + mutex_init(&dev->umrc.lock); + ret = mlx5_mr_cache_init(dev); + if (ret) { + mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); + goto error_4; + } + + kfree(init_attr); + + return 0; + +error_4: + ib_destroy_qp(qp); + dev->umrc.qp = NULL; + +error_3: + ib_free_cq(cq); + dev->umrc.cq = NULL; + +error_2: + ib_dealloc_pd(pd); + dev->umrc.pd = NULL; + +error_0: + kfree(init_attr); + return ret; +} + +int mlx5r_umr_recover(struct mlx5_ib_dev *dev) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) + goto err; + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + +static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev) +{ + struct dentry *root; + + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return 0; + + mutex_init(&dev->delay_drop.lock); + dev->delay_drop.dev = dev; + dev->delay_drop.activate = false; + dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); + atomic_set(&dev->delay_drop.rqs_cnt, 0); + atomic_set(&dev->delay_drop.events_cnt, 0); + + if (!mlx5_debugfs_root || dev->is_rep) + return 0; + + root = debugfs_create_dir("delay_drop", mlx5_debugfs_get_dev_root(dev->mdev)); + dev->delay_drop.dir_debugfs = root; + + debugfs_create_atomic_t("num_timeout_events", 0400, root, + &dev->delay_drop.events_cnt); + debugfs_create_atomic_t("num_rqs", 0400, root, + &dev->delay_drop.rqs_cnt); + debugfs_create_file("timeout", 0600, root, &dev->delay_drop, + &fops_delay_drop_timeout); + return 0; +} + +static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + cancel_work_sync(&dev->delay_drop.delay_drop_work); + if (!dev->delay_drop.dir_debugfs) + return; + + debugfs_remove_recursive(dev->delay_drop.dir_debugfs); + dev->delay_drop.dir_debugfs = NULL; +} + +static int mlx5_ib_stage_dc_tracer_init(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) { + if (mlx5_ib_init_dc_improvements(dev)) + mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n"); + } + + return 0; +} + +static void mlx5_ib_stage_dc_tracer_cleanup(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) + mlx5_ib_cleanup_dc_improvements(dev); +} + +static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) +{ + dev->mdev_events.notifier_call = 
mlx5_ib_event; + mlx5_notifier_register(dev->mdev, &dev->mdev_events); + return 0; +} + +static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); +} + +int mlx5_ib_stage_ttl_sysfs_init(struct mlx5_ib_dev *dev) +{ + int err; + err = init_ttl_sysfs(dev); + if (err) { + mlx5_ib_err(dev, "Fail to init ttl sysfs\n"); + return err; + } + + return 0; +} + +void mlx5_ib_stage_ttl_sysfs_cleanup(struct mlx5_ib_dev *dev) +{ + cleanup_ttl_sysfs(dev); +} + +void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, + int stage) +{ + dev->ib_active = false; + + /* Number of stages to cleanup */ + while (stage) { + stage--; + if (profile->stage[stage].cleanup) + profile->stage[stage].cleanup(dev); + } + + kfree(dev->port); + ib_dealloc_device(&dev->ib_dev); +} + +int mlx5_ib_stage_tc_sysfs_init(struct mlx5_ib_dev *dev) +{ + int err; + err = init_tc_sysfs(dev); + if (err) { + mlx5_ib_err(dev, "Fail to init tc sysfs\n"); + return err; + } + + return 0; +} + +void mlx5_ib_stage_tc_sysfs_cleanup(struct mlx5_ib_dev *dev) +{ + cleanup_tc_sysfs(dev); +} + +int __mlx5_ib_add(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile) +{ + int err; + int i; + + dev->profile = profile; + + for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { + if (profile->stage[i].init) { + err = profile->stage[i].init(dev); + if (err) + goto err_out; + } + } + + dev->ib_active = true; + return 0; + +err_out: + /* Clean up stages which were initialized */ + while (i) { + i--; + if (profile->stage[i].cleanup) + profile->stage[i].cleanup(dev); + } + return -ENOMEM; +} + +static const struct mlx5_ib_profile pf_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FS, + mlx5_ib_fs_init, + mlx5_ib_fs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + mlx5_ib_stage_caps_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_roce_init, + mlx5_ib_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), + STAGE_CREATE(MLX5_IB_STAGE_SRQ, + mlx5_init_srq_table, + mlx5_cleanup_srq_table), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_ODP, + mlx5_ib_odp_init_one, + mlx5_ib_odp_cleanup_one), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_counters_init, + mlx5_ib_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + mlx5_ib_stage_cong_debugfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, + mlx5_ib_devx_init, + mlx5_ib_devx_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DC_TRACER, + 
mlx5_ib_stage_dc_tracer_init, + mlx5_ib_stage_dc_tracer_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_RESTRACK, + mlx5_ib_restrack_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_TTL_SYSFS, + mlx5_ib_stage_ttl_sysfs_init, + mlx5_ib_stage_ttl_sysfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_TC_SYSFS, + mlx5_ib_stage_tc_sysfs_init, + mlx5_ib_stage_tc_sysfs_cleanup), +}; + +const struct mlx5_ib_profile raw_eth_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_FS, + mlx5_ib_fs_init, + mlx5_ib_fs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + mlx5_ib_stage_caps_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB, + mlx5_ib_stage_raw_eth_non_default_cb, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_roce_init, + mlx5_ib_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_QP, + mlx5_init_qp_table, + mlx5_cleanup_qp_table), + STAGE_CREATE(MLX5_IB_STAGE_SRQ, + mlx5_init_srq_table, + mlx5_cleanup_srq_table), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_dev_res_init, + mlx5_ib_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_counters_init, + mlx5_ib_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + mlx5_ib_stage_cong_debugfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR, + NULL, + mlx5_ib_stage_pre_ib_reg_umr_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, + mlx5_ib_devx_init, + mlx5_ib_devx_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, + mlx5_ib_stage_post_ib_reg_umr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_RESTRACK, + mlx5_ib_restrack_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_TTL_SYSFS, + mlx5_ib_stage_ttl_sysfs_init, + mlx5_ib_stage_ttl_sysfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_TC_SYSFS, + mlx5_ib_stage_tc_sysfs_init, + mlx5_ib_stage_tc_sysfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), +}; + +static int mlx5r_mp_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = idev->mdev; + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + bool bound = false; + int err; + + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) + return -ENOMEM; + + mpi->mdev = mdev; + err = mlx5_query_nic_vport_system_image_guid(mdev, + &mpi->sys_image_guid); + if (err) { + kfree(mpi); + return err; + } + + mutex_lock(&mlx5_ib_multiport_mutex); + list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { + if (dev->sys_image_guid == mpi->sys_image_guid) + bound = mlx5_ib_bind_slave_port(dev, mpi); + + if (bound) { + rdma_roce_rescan_device(&dev->ib_dev); + mpi->ibdev->ib_active = true; + break; + } + } + + if (!bound) { + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + dev_dbg(mdev->device, + "no suitable IB device found to bind to, added to unaffiliated list.\n"); + } + mutex_unlock(&mlx5_ib_multiport_mutex); + + auxiliary_set_drvdata(adev, mpi); + return 0; +} + +static void 
mlx5r_mp_remove(struct auxiliary_device *adev) +{ + struct mlx5_ib_multiport_info *mpi; + + mpi = auxiliary_get_drvdata(adev); + mutex_lock(&mlx5_ib_multiport_mutex); + if (mpi->ibdev) + mlx5_ib_unbind_slave_port(mpi->ibdev, mpi); + else + list_del(&mpi->list); + mutex_unlock(&mlx5_ib_multiport_mutex); + kfree(mpi); +} + +static int mlx5r_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = idev->mdev; + const struct mlx5_ib_profile *profile; + int port_type_cap, num_ports, ret; + enum rdma_link_layer ll; + struct mlx5_ib_dev *dev; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + num_ports = max(MLX5_CAP_GEN(mdev, num_ports), + MLX5_CAP_GEN(mdev, num_vhca_ports)); + dev = ib_alloc_device(mlx5_ib_dev, ib_dev); + if (!dev) + return -ENOMEM; + dev->port = kcalloc(num_ports, sizeof(*dev->port), + GFP_KERNEL); + if (!dev->port) { + ib_dealloc_device(&dev->ib_dev); + return -ENOMEM; + } + + dev->mdev = mdev; + dev->num_ports = num_ports; + + mdev->roce.enabled = mlx5_get_roce_state(mdev); + if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) + profile = &raw_eth_profile; + else + profile = &pf_profile; + + ret = __mlx5_ib_add(dev, profile); + if (ret) { + kfree(dev->port); + ib_dealloc_device(&dev->ib_dev); + return ret; + } + + auxiliary_set_drvdata(adev, dev); + return 0; +} + +static void mlx5r_remove(struct auxiliary_device *adev) +{ + struct mlx5_ib_dev *dev; + + dev = auxiliary_get_drvdata(adev); + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); +} + +static const struct auxiliary_device_id mlx5r_mp_id_table[] = { + { .name = MLX5_ADEV_NAME ".multiport", }, + {}, +}; + +static const struct auxiliary_device_id mlx5r_id_table[] = { + { .name = MLX5_ADEV_NAME ".rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary_mlx5r_mp_id_table, mlx5r_mp_id_table); +MODULE_DEVICE_TABLE(auxiliary_mlx5r_id_table, mlx5r_id_table); + +static struct auxiliary_driver mlx5r_mp_driver = { + .name = "multiport", + .probe = mlx5r_mp_probe, + .remove = mlx5r_mp_remove, + .id_table = mlx5r_mp_id_table, +}; + +static struct auxiliary_driver mlx5r_driver = { + .name = "rdma", + .probe = mlx5r_probe, + .remove = mlx5r_remove, + .id_table = mlx5r_id_table, +}; + +static int __init mlx5_ib_init(void) +{ + int ret; + + xlt_emergency_page = (void *)__get_free_page(GFP_KERNEL); + if (!xlt_emergency_page) + return -ENOMEM; + + mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); + if (!mlx5_ib_event_wq) { + ret = -ENOMEM; + goto err_free_xlt_page; + } + + mlx5_ib_sigerr_sqd_wq = + create_singlethread_workqueue("mlx5_ib_sigerr_sqd_wq"); + if (!mlx5_ib_sigerr_sqd_wq) { + ret = -ENOMEM; + goto rep_err; + } + + mlx5_ib_odp_init(); + ret = mlx5r_rep_init(); + if (ret) + goto rep_err; + ret = auxiliary_driver_register(&mlx5r_mp_driver); + if (ret) + goto mp_err; + ret = auxiliary_driver_register(&mlx5r_driver); + if (ret) + goto drv_err; + return 0; + +drv_err: + auxiliary_driver_unregister(&mlx5r_mp_driver); +mp_err: + mlx5r_rep_cleanup(); +rep_err: + destroy_workqueue(mlx5_ib_event_wq); +err_free_xlt_page: + free_page((unsigned long)xlt_emergency_page); + + return ret; +} + +static void __exit mlx5_ib_cleanup(void) +{ + auxiliary_driver_unregister(&mlx5r_driver); + auxiliary_driver_unregister(&mlx5r_mp_driver); + mlx5r_rep_cleanup(); + destroy_workqueue(mlx5_ib_sigerr_sqd_wq); + 
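+	/* Both auxiliary drivers are already unregistered, so no new
+	 * events can be queued on this workqueue.
+	 */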
destroy_workqueue(mlx5_ib_event_wq); + free_page((unsigned long)xlt_emergency_page); +} + +module_init(mlx5_ib_init); +module_exit(mlx5_ib_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main_ext.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main_ext.c new file mode 100644 index 0000000..0d85ad2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/main_ext.c @@ -0,0 +1,1725 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "qp.h" + +#include "../../core/restrack.h" + +/* mlx5_set_ttl feature infra */ +struct ttl_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_ttl_data *, struct ttl_attribute *, char *buf); + ssize_t (*store)(struct mlx5_ttl_data *, struct ttl_attribute *, + const char *buf, size_t count); +}; + +#define TTL_ATTR(_name, _mode, _show, _store) \ +struct ttl_attribute ttl_attr_##_name = __ATTR(_name, _mode, _show, _store) + +static ssize_t ttl_show(struct mlx5_ttl_data *ttld, struct ttl_attribute *unused, char *buf) +{ + return sprintf(buf, "%d\n", ttld->val); +} + +static ssize_t ttl_store(struct mlx5_ttl_data *ttld, struct ttl_attribute *unused, + const char *buf, size_t count) +{ + unsigned long var; + + if (kstrtol(buf, 0, &var) || var > 0xff) + return -EINVAL; + + ttld->val = var; + return count; +} + +static TTL_ATTR(ttl, 0644, ttl_show, ttl_store); + +static struct attribute *ttl_attrs[] = { + &ttl_attr_ttl.attr, + NULL +}; + +static ssize_t ttl_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ttl_attribute *ttl_attr = container_of(attr, struct ttl_attribute, attr); + struct mlx5_ttl_data *d = container_of(kobj, struct mlx5_ttl_data, kobj); + + return ttl_attr->show(d, ttl_attr, buf); +} + +static ssize_t ttl_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + struct ttl_attribute *ttl_attr = container_of(attr, struct ttl_attribute, attr); + struct mlx5_ttl_data *d = container_of(kobj, struct mlx5_ttl_data, kobj); + + return ttl_attr->store(d, ttl_attr, buf, count); +} + +static const struct sysfs_ops ttl_sysfs_ops = { + .show = ttl_attr_show, + .store = ttl_attr_store +}; + +static struct kobj_type ttl_type = { + .sysfs_ops = &ttl_sysfs_ops, + .default_attrs = ttl_attrs +}; + +int init_ttl_sysfs(struct mlx5_ib_dev *dev) +{ + struct device *device = &dev->ib_dev.dev; + int num_ports; + int port; + int err; + + dev->ttl_kobj = kobject_create_and_add("ttl", &device->kobj); + if (!dev->ttl_kobj) + return -ENOMEM; + num_ports = max(MLX5_CAP_GEN(dev->mdev, num_ports), + MLX5_CAP_GEN(dev->mdev, num_vhca_ports)); + for (port = 1; port <= num_ports; port++) { + struct mlx5_ttl_data *ttld = &dev->ttld[port - 1]; + + err = kobject_init_and_add(&ttld->kobj, &ttl_type, dev->ttl_kobj, "%d", port); + if (err) + goto err; + ttld->val = 0; + } + return 0; +err: + cleanup_ttl_sysfs(dev); + return err; +} + +void cleanup_ttl_sysfs(struct mlx5_ib_dev *dev) +{ + if (dev->ttl_kobj) { + int num_ports; + int port; + + kobject_put(dev->ttl_kobj); + dev->ttl_kobj = NULL; + num_ports = max(MLX5_CAP_GEN(dev->mdev, num_ports), + MLX5_CAP_GEN(dev->mdev, num_vhca_ports)); + for (port = 1; port <= num_ports; port++) { + struct mlx5_ttl_data *ttld = &dev->ttld[port - 1]; + + if (ttld->kobj.state_initialized) + kobject_put(&ttld->kobj); + } + } +} + +/* mlx5_force_tc feature*/ + +static int check_string_match(const char *str, const char *str2) +{ + int str2_len; + int str_len; + + if (!str || !str2) + return -EINVAL; + + str_len = strlen(str); + str2_len = strlen(str2); + + if (str_len <= str2_len) + return -EINVAL; + + return memcmp(str, str2, str2_len); +} + +static void tclass_set_mask_32(u32 *mask, int bits) +{ + *mask = 0; + if (!bits) + bits = 32; + while (bits) { + *mask = (*mask << 1) | 1; + --bits; + } +} + +static int tclass_parse_src_ip(const char *str, void *store, void *store_mask) +{ + const char *end = NULL; + + return !in4_pton(str, 
-1, (u8 *)store, -1, &end); +} + +static int tclass_parse_dst_ip(const char *str, void *store, void *store_mask) +{ + const char *end = NULL; + int mask = 0; + int ret; + + ret = !in4_pton(str, -1, (u8 *)store, -1, &end); + + if (ret) + return -EINVAL; + + if (strlen(end)) { + if (*end != '/') + return -EINVAL; + ret = kstrtoint(end + 1, 0, &mask); + if (ret || mask < 0 || mask > 32) + return -EINVAL; + } + + tclass_set_mask_32(store_mask, mask); + + return ret; +} + +static int tclass_parse_ip6(const char *str, void *store, void *store_mask) +{ + const char *end = NULL; + + return !in6_pton(str, -1, (u8 *)store, -1, &end); +} + +static int tclass_parse_tclass(const char *str, void *ptr, void *store_mask) +{ + int *tclass = ptr; + int ret; + + ret = kstrtoint(str, 0, tclass); + + if (ret || *tclass > 0xff) + return -EINVAL; + + return 0; +} + +static int tclass_compare_src_ips(struct tclass_match *match, + struct tclass_match *match2, + bool with_mask) +{ + return (*(u32 *)match->s_addr != *(u32 *)match2->s_addr); +} + +static int tclass_compare_dst_ips(struct tclass_match *match, + struct tclass_match *match2, + bool with_mask) +{ + u32 mask = -1; + + if (with_mask) + mask = *(u32 *)match->d_addr_m; + + return ((*(u32 *)match->d_addr & mask) != + ((*(u32 *)match2->d_addr) & mask)); +} + +static int tclass_compare_ip6s(void *ip1, void *ip2, int size) +{ + return memcmp(ip1, ip2, size); +} + +static int tclass_compare_src_ip6s(struct tclass_match *match, + struct tclass_match *match2, + bool with_mask) +{ + return tclass_compare_ip6s(match->s_addr, match2->s_addr, + sizeof(match->s_addr)); +} + +static int tclass_compare_dst_ip6s(struct tclass_match *match, + struct tclass_match *match2, + bool with_mask) +{ + return tclass_compare_ip6s(match->d_addr, match2->d_addr, + sizeof(match->d_addr)); +} + +static size_t tclass_print_src_ip(struct tclass_match *match, + char *buf, size_t size) +{ + return snprintf(buf, size, "src_ip=%pI4,", match->s_addr); +} + +static size_t tclass_print_dst_ip(struct tclass_match *match, + char *buf, size_t size) +{ + return snprintf(buf, size, "dst_ip=%pI4/%d,", + match->d_addr, hweight32(*(int *)match->d_addr_m)); +} + +static size_t tclass_print_src_ip6(struct tclass_match *match, + char *buf, size_t size) +{ + return snprintf(buf, size, "src_ip6=%pI6,", match->s_addr); +} + +static size_t tclass_print_dst_ip6(struct tclass_match *match, + char *buf, size_t size) +{ + return snprintf(buf, size, "dst_ip6=%pI6,", match->d_addr); +} + +static size_t tclass_print_tclass(struct tclass_match *match, + char *buf, size_t size) +{ + return snprintf(buf, size, "tclass=%d\n", match->tclass); +} + +static const struct tclass_parse_node parse_tree[] = { + TCLASS_CREATE_PARSE_NODE(TCLASS_MATCH_SRC_ADDR_IP, tclass_parse_src_ip, + tclass_compare_src_ips, + tclass_print_src_ip, "src_ip=", + TCLASS_MATCH_MASK_SRC_ADDR_IP, + s_addr, s_addr), + TCLASS_CREATE_PARSE_NODE(TCLASS_MATCH_DST_ADDR_IP, tclass_parse_dst_ip, + tclass_compare_dst_ips, + tclass_print_dst_ip, "dst_ip=", + TCLASS_MATCH_MASK_DST_ADDR_IP, + d_addr, d_addr_m), + TCLASS_CREATE_PARSE_NODE(TCLASS_MATCH_SRC_ADDR_IP6, tclass_parse_ip6, + tclass_compare_src_ip6s, + tclass_print_src_ip6, "src_ip6=", + TCLASS_MATCH_MASK_SRC_ADDR_IP6, + s_addr, s_addr), + TCLASS_CREATE_PARSE_NODE(TCLASS_MATCH_DST_ADDR_IP6, tclass_parse_ip6, + tclass_compare_dst_ip6s, + tclass_print_dst_ip6, "dst_ip6=", + TCLASS_MATCH_MASK_DST_ADDR_IP6, + d_addr, d_addr_m), + TCLASS_CREATE_PARSE_NODE(TCLASS_MATCH_TCLASS, tclass_parse_tclass, + NULL, + 
tclass_print_tclass, "tclass=", + TCLASS_MATCH_MASK_TCLASS, tclass, tclass), + TCLASS_CREATE_PARSE_NODE(TCLASS_MATCH_TCLASS_NO_PREFIX, + tclass_parse_tclass, + NULL, + NULL, "", + TCLASS_MATCH_MASK_TCLASS, tclass, tclass), +}; + +static int tclass_verify_match(struct tclass_match *match) +{ + if (!(match->mask & TCLASS_MATCH_MASK_TCLASS)) + return -EINVAL; + + if ((match->mask & (TCLASS_MATCH_MASK_SRC_ADDR_IP | + TCLASS_MATCH_MASK_DST_ADDR_IP)) && + (match->mask & (TCLASS_MATCH_MASK_SRC_ADDR_IP6 | + TCLASS_MATCH_MASK_DST_ADDR_IP6))) + return -EINVAL; + + return 0; +} + +static int tclass_parse_input(char *str, struct tclass_match *match) +{ + char *p; + int ret; + int i; + + while ((p = strsep(&str, ",")) != NULL) { + if (!*p) + continue; + + p = strim(p); /* Removing whitespace */ + for (i = 0; i < ARRAY_SIZE(parse_tree); i++) { + const struct tclass_parse_node *node; + + node = &parse_tree[i]; + if (!check_string_match(p, node->pattern)) { + ret = parse_tree[i].parse(p + + strlen(node->pattern), + (char *)match + + node->v_offset, + (char *)match + + node->m_offset); + if (ret) + return -EINVAL; + match->mask |= node->mask; + break; + } + } + if (i == ARRAY_SIZE(parse_tree)) + return -EINVAL; + } + + return tclass_verify_match(match); +} + +static struct tclass_match *tclass_find_empty(struct mlx5_tc_data *tcd) +{ + int i; + + for (i = 0; i < TCLASS_MAX_RULES; i++) + if (!tcd->rule[i].mask) + return &tcd->rule[i]; + return NULL; +} + +static struct tclass_match *tclass_find_match(struct mlx5_tc_data *tcd, + struct tclass_match *match, + u32 mask, + bool with_mask) +{ + int ret; + int i; + int j; + + mask |= TCLASS_MATCH_MASK_TCLASS; + + for (i = 0; i < TCLASS_MAX_RULES; i++) { + if (tcd->rule[i].mask == mask) { + ret = -1; + for (j = 0; j < ARRAY_SIZE(parse_tree); j++) { + const struct tclass_parse_node *node; + + node = &parse_tree[j]; + if (mask & node->mask && node->compare) { + ret = node->compare(&tcd->rule[i], + match, + with_mask); + if (ret) + break; + } + } + if (!ret) + return &tcd->rule[i]; + } + } + + return NULL; +} + +void tclass_get_tclass_locked(struct mlx5_ib_dev *dev, + struct mlx5_tc_data *tcd, + const struct rdma_ah_attr *ah, + u8 port, + u8 *tclass, + bool *global_tc) +{ + struct tclass_match *res_match = NULL; + struct tclass_match match = {}; + enum ib_gid_type gid_type; + union ib_gid gid; + int mask; + int err; + + if (tcd->val >= 0) { + *global_tc = true; + *tclass = tcd->val; + } else if (ah && ah->type == RDMA_AH_ATTR_TYPE_ROCE) { + *global_tc = false; + err = rdma_query_gid(&dev->ib_dev, port, ah->grh.sgid_index, + &gid); + if (err) + goto out; + + gid_type = ah->grh.sgid_attr->gid_type; + if (gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + goto out; + + if (ipv6_addr_v4mapped((struct in6_addr *)&gid)) { + match.mask = TCLASS_MATCH_MASK_SRC_ADDR_IP | + TCLASS_MATCH_MASK_DST_ADDR_IP; + memcpy(match.s_addr, gid.raw + 12, 4); + memcpy(match.d_addr, ah->grh.dgid.raw + 12, 4); + } else { + match.mask = TCLASS_MATCH_MASK_SRC_ADDR_IP6 | + TCLASS_MATCH_MASK_DST_ADDR_IP6; + memcpy(match.s_addr, gid.raw, sizeof(match.s_addr)); + memcpy(match.d_addr, ah->grh.dgid.raw, + sizeof(match.d_addr)); + } + + mask = match.mask; + res_match = tclass_find_match(tcd, &match, mask, true); + if (!res_match) + res_match = tclass_find_match(tcd, &match, mask & + ~(TCLASS_MATCH_MASK_SRC_ADDR_IP | TCLASS_MATCH_MASK_SRC_ADDR_IP6), + true); + else + goto out; + mask = match.mask; + if (!res_match) + res_match = tclass_find_match(tcd, &match, mask & + ~(TCLASS_MATCH_MASK_DST_ADDR_IP | 
TCLASS_MATCH_MASK_DST_ADDR_IP6), + true); + } +out: + if (res_match) + *tclass = res_match->tclass; +} + +void tclass_get_tclass(struct mlx5_ib_dev *dev, + struct mlx5_tc_data *tcd, + const struct rdma_ah_attr *ah, + u8 port, + u8 *tclass, + bool *global_tc) +{ + mutex_lock(&tcd->lock); + tclass_get_tclass_locked(dev, tcd, ah, port, tclass, global_tc); + mutex_unlock(&tcd->lock); +} +struct tc_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_tc_data *, struct tc_attribute *, char *buf); + ssize_t (*store)(struct mlx5_tc_data *, struct tc_attribute *, + const char *buf, size_t count); +}; + +#define TC_ATTR(_name, _mode, _show, _store) \ + struct tc_attribute tc_attr_##_name = __ATTR(_name, _mode, _show, _store) + + +static ssize_t traffic_class_show(struct mlx5_tc_data *tcd, struct tc_attribute *unused, char *buf) +{ + size_t count = 0; + int j; + int i; + + mutex_lock(&tcd->lock); + if (tcd->val >= 0) + count = snprintf(buf, PAGE_SIZE, "Global tclass=%d\n", + tcd->val); + + for (i = 0; i < TCLASS_MAX_RULES && + count < (PAGE_SIZE - TCLASS_MAX_CMD); i++) { + if (!tcd->rule[i].mask) + continue; + for (j = 0; j < ARRAY_SIZE(parse_tree); j++) { + if (tcd->rule[i].mask & parse_tree[j].mask && + parse_tree[j].print) + count += parse_tree[j].print(&tcd->rule[i], + buf + count, + PAGE_SIZE - count); + } + } + mutex_unlock(&tcd->lock); + + return count; +} + +static int tclass_compare_match(const void *ptr1, const void *ptr2) +{ + const struct tclass_match *m1 = ptr1; + const struct tclass_match *m2 = ptr2; + + if (m1->mask & TCLASS_MATCH_MASK_DST_ADDR_IP && + m2->mask & TCLASS_MATCH_MASK_DST_ADDR_IP) + return hweight32(*(u32 *)m2->d_addr_m) - + hweight32(*(u32 *)m1->d_addr_m); + + if (m1->mask & TCLASS_MATCH_MASK_DST_ADDR_IP) + return -1; + + if (m2->mask & TCLASS_MATCH_MASK_DST_ADDR_IP) + return 1; + + return 0; + +} +static int tclass_update_qp(struct mlx5_ib_dev *ibdev, struct mlx5_ib_qp *mqp, + u8 tclass, void *qpc) +{ + enum mlx5_qp_optpar optpar = MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH_DSCP; + struct mlx5_ib_qp_base *base = &mqp->trans_qp.base; + u16 op = MLX5_CMD_OP_RTS2RTS_QP; + int err; + + MLX5_SET(qpc, qpc, primary_address_path.dscp, tclass >> 2); + err = mlx5_core_qp_modify(ibdev, op, optpar, qpc, &base->mqp, 0); + + return err; +} + +static void tclass_update_qps(struct mlx5_tc_data *tcd) +{ + struct mlx5_ib_dev *ibdev = tcd->ibdev; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct mlx5_ib_qp *mqp; + unsigned long id = 0; + struct ib_qp *ibqp; + bool global_tc; + u8 tclass; + int ret; + void *qpc; + + if (!tcd->ibdev || !MLX5_CAP_GEN(ibdev->mdev, rts2rts_qp_dscp)) + return; + + qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); + if (!qpc) + return; + + rt = &ibdev->ib_dev.res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + xa_unlock(&rt->xa); + + ibqp = container_of(res, struct ib_qp, res); + mqp = to_mqp(ibqp); + + if (ibqp->qp_type == IB_QPT_GSI || + mqp->type == MLX5_IB_QPT_DCT) + goto cont; + + mutex_lock(&mqp->mutex); + + if (mqp->state == IB_QPS_RTS && + rdma_ah_get_ah_flags(&mqp->ah) & IB_AH_GRH) { + + tclass = mqp->tclass; + tclass_get_tclass_locked(ibdev, tcd, &mqp->ah, + mqp->ah.port_num, + &tclass, &global_tc); + + if (tclass != mqp->tclass) { + ret = tclass_update_qp(ibdev, mqp, tclass, + qpc); + if (!ret) + mqp->tclass = tclass; + } + } + mutex_unlock(&mqp->mutex); +cont: + rdma_restrack_put(res); + xa_lock(&rt->xa); + } + xa_unlock(&rt->xa); +} +static ssize_t 
traffic_class_store(struct mlx5_tc_data *tcd, struct tc_attribute *unused, + const char *buf, size_t count) +{ + struct tclass_match *dst_match = NULL; + char cmd[TCLASS_MAX_CMD + 1] = {}; + struct tclass_match match = {}; + int ret; + + if (count > TCLASS_MAX_CMD) + return -EINVAL; + memcpy(cmd, buf, count); + + ret = tclass_parse_input(cmd, &match); + + if (ret) + return -EINVAL; + + mutex_lock(&tcd->lock); + + if (match.mask == TCLASS_MATCH_MASK_TCLASS) { + tcd->val = match.tclass; + } else { + dst_match = tclass_find_match(tcd, &match, match.mask, false); + if (!dst_match) { + dst_match = tclass_find_empty(tcd); + if (!dst_match) { + mutex_unlock(&tcd->lock); + return -ENOMEM; + } + } + if (match.tclass < 0) + memset(dst_match, 0, sizeof(*dst_match)); + else + memcpy(dst_match, &match, sizeof(*dst_match)); + } + + /* Sort the list based on subnet mask */ + sort(tcd->rule, TCLASS_MAX_RULES, sizeof(tcd->rule[0]), + tclass_compare_match, NULL); + tclass_update_qps(tcd); + mutex_unlock(&tcd->lock); + + return count; +} + +static TC_ATTR(traffic_class, 0644, traffic_class_show, traffic_class_store); + +static struct attribute *tc_attrs[] = { + &tc_attr_traffic_class.attr, + NULL +}; + +static ssize_t tc_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct tc_attribute *tc_attr = container_of(attr, struct tc_attribute, attr); + struct mlx5_tc_data *d = container_of(kobj, struct mlx5_tc_data, kobj); + + if (!tc_attr->show) + return -EIO; + + return tc_attr->show(d, tc_attr, buf); +} + +static ssize_t tc_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + struct tc_attribute *tc_attr = container_of(attr, struct tc_attribute, attr); + struct mlx5_tc_data *d = container_of(kobj, struct mlx5_tc_data, kobj); + + if (!tc_attr->store) + return -EIO; + + return tc_attr->store(d, tc_attr, buf, count); +} + +static const struct sysfs_ops tc_sysfs_ops = { + .show = tc_attr_show, + .store = tc_attr_store +}; + +static struct kobj_type tc_type = { + .sysfs_ops = &tc_sysfs_ops, + .default_attrs = tc_attrs +}; + +int init_tc_sysfs(struct mlx5_ib_dev *dev) +{ + struct device *device = &dev->ib_dev.dev; + int num_ports; + int port; + int err; + + dev->tc_kobj = kobject_create_and_add("tc", &device->kobj); + if (!dev->tc_kobj) + return -ENOMEM; + num_ports = max(MLX5_CAP_GEN(dev->mdev, num_ports), + MLX5_CAP_GEN(dev->mdev, num_vhca_ports)); + for (port = 1; port <= num_ports; port++) { + struct mlx5_tc_data *tcd = &dev->tcd[port - 1]; + + err = kobject_init_and_add(&tcd->kobj, &tc_type, dev->tc_kobj, "%d", port); + if (err) + goto err; + tcd->val = -1; + tcd->ibdev = dev; + tcd->initialized = true; + mutex_init(&tcd->lock); + } + return 0; +err: + cleanup_tc_sysfs(dev); + return err; +} + +void cleanup_tc_sysfs(struct mlx5_ib_dev *dev) +{ + if (dev->tc_kobj) { + int num_ports; + int port; + + kobject_put(dev->tc_kobj); + dev->tc_kobj = NULL; + num_ports = max(MLX5_CAP_GEN(dev->mdev, num_ports), + MLX5_CAP_GEN(dev->mdev, num_vhca_ports)); + for (port = 1; port <= num_ports; port++) { + struct mlx5_tc_data *tcd = &dev->tcd[port - 1]; + + if (tcd->initialized) + kobject_put(&tcd->kobj); + } + } +} + +/* DC_cnak feature*/ + +static unsigned int dc_cnak_qp_depth = MLX5_DC_CONNECT_QP_DEPTH; +module_param_named(dc_cnak_qp_depth, dc_cnak_qp_depth, uint, 0444); +MODULE_PARM_DESC(dc_cnak_qp_depth, "DC CNAK QP depth"); + +static void mlx5_ib_enable_dc_tracer(struct mlx5_ib_dev *dev) +{ + struct device *device = dev->ib_dev.dma_device; + struct 
mlx5_dc_tracer *dct = &dev->dctr; + int order; + void *tmp; + int size; + int err; + + size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096; + if (size <= PAGE_SIZE) + order = 0; + else + order = 1; + + dct->pg = alloc_pages(GFP_KERNEL, order); + if (!dct->pg) { + mlx5_ib_err(dev, "failed to allocate %d pages\n", order); + return; + } + + tmp = kmap(dct->pg); + if (!tmp) { + mlx5_ib_err(dev, "failed to kmap one page\n"); + err = -ENOMEM; + goto map_err; + } + + memset(tmp, 0xff, size); + kunmap(dct->pg); + + dct->size = size; + dct->order = order; + dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE); + if (dma_mapping_error(device, dct->dma)) { + mlx5_ib_err(dev, "dma mapping error\n"); + goto map_err; + } + + err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma); + if (err) { + mlx5_ib_warn(dev, "failed to enable DC tracer\n"); + goto cmd_err; + } + + return; + +cmd_err: + dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE); +map_err: + __free_pages(dct->pg, dct->order); + dct->pg = NULL; +} + +static void mlx5_ib_disable_dc_tracer(struct mlx5_ib_dev *dev) +{ + struct device *device = dev->ib_dev.dma_device; + struct mlx5_dc_tracer *dct = &dev->dctr; + int err; + + if (!dct->pg) + return; + + err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma); + if (err) { + mlx5_ib_warn(dev, "failed to disable DC tracer\n"); + return; + } + + dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE); + __free_pages(dct->pg, dct->order); + dct->pg = NULL; +} + +enum { + MLX5_DC_CNAK_SIZE = 128, + MLX5_NUM_BUF_IN_PAGE = PAGE_SIZE / MLX5_DC_CNAK_SIZE, + MLX5_CNAK_TX_CQ_SIGNAL_FACTOR = 128, + MLX5_DC_CNAK_SL = 0, + MLX5_DC_CNAK_VL = 0, +}; + +int mlx5_ib_mmap_dc_info_page(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma) +{ + struct mlx5_dc_tracer *dct; + phys_addr_t pfn; + int err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) != + MLX5_CAP_PORT_TYPE_IB) || + (!mlx5_core_is_pf(dev->mdev)) || + (!MLX5_CAP_GEN(dev->mdev, dc_cnak_trace))) + return -ENOTSUPP; + + dct = &dev->dctr; + if (!dct->pg) { + mlx5_ib_err(dev, "mlx5_ib_mmap DC no page\n"); + return -ENOMEM; + } + + pfn = page_to_pfn(dct->pg); + err = remap_pfn_range(vma, vma->vm_start, pfn, dct->size, vma->vm_page_prot); + if (err) { + mlx5_ib_err(dev, "mlx5_ib_mmap DC remap_pfn_range failed\n"); + return err; + } + return 0; +} + +static void dump_buf(void *buf, int size) +{ + __be32 *p = buf; + int offset; + int i; + + for (i = 0, offset = 0; i < size; i += 16) { + pr_info("%03x: %08x %08x %08x %08x\n", offset, be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), be32_to_cpu(p[3])); + p += 4; + offset += 16; + } + pr_info("\n"); +} + +enum { + CNAK_LENGTH_WITHOUT_GRH = 32, + CNAK_LENGTH_WITH_GRH = 72, +}; + +static struct mlx5_dc_desc *get_desc_from_index(struct mlx5_dc_desc *desc, u64 index, unsigned *offset) +{ + struct mlx5_dc_desc *d; + + int i; + int j; + + i = index / MLX5_NUM_BUF_IN_PAGE; + j = index % MLX5_NUM_BUF_IN_PAGE; + d = desc + i; + *offset = j * MLX5_DC_CNAK_SIZE; + return d; +} + +static void build_cnak_msg(void *rbuf, void *sbuf, u32 *length, u16 *dlid) +{ + void *rdceth, *sdceth; + void *rlrh, *slrh; + void *rgrh, *sgrh; + void *rbth, *sbth; + int is_global; + void *saeth; + + memset(sbuf, 0, MLX5_DC_CNAK_SIZE); + rlrh = rbuf; + is_global = MLX5_GET(lrh, rlrh, lnh) == 0x3; + rgrh = is_global ? rlrh + MLX5_ST_SZ_BYTES(lrh) : NULL; + rbth = rgrh ? rgrh + MLX5_ST_SZ_BYTES(grh) : rlrh + MLX5_ST_SZ_BYTES(lrh); + rdceth = rbth + MLX5_ST_SZ_BYTES(bth); + + slrh = sbuf; + sgrh = is_global ? 
slrh + MLX5_ST_SZ_BYTES(lrh) : NULL; + sbth = sgrh ? sgrh + MLX5_ST_SZ_BYTES(grh) : slrh + MLX5_ST_SZ_BYTES(lrh); + sdceth = sbth + MLX5_ST_SZ_BYTES(bth); + saeth = sdceth + MLX5_ST_SZ_BYTES(dceth); + + *dlid = MLX5_GET(lrh, rlrh, slid); + MLX5_SET(lrh, slrh, vl, MLX5_DC_CNAK_VL); + MLX5_SET(lrh, slrh, lver, MLX5_GET(lrh, rlrh, lver)); + MLX5_SET(lrh, slrh, sl, MLX5_DC_CNAK_SL); + MLX5_SET(lrh, slrh, lnh, MLX5_GET(lrh, rlrh, lnh)); + MLX5_SET(lrh, slrh, dlid, MLX5_GET(lrh, rlrh, slid)); + MLX5_SET(lrh, slrh, pkt_len, 0x9 + ((is_global ? MLX5_ST_SZ_BYTES(grh) : 0) >> 2)); + MLX5_SET(lrh, slrh, slid, MLX5_GET(lrh, rlrh, dlid)); + + if (is_global) { + void *rdgid, *rsgid; + void *ssgid, *sdgid; + + MLX5_SET(grh, sgrh, ip_version, MLX5_GET(grh, rgrh, ip_version)); + MLX5_SET(grh, sgrh, traffic_class, MLX5_GET(grh, rgrh, traffic_class)); + MLX5_SET(grh, sgrh, flow_label, MLX5_GET(grh, rgrh, flow_label)); + MLX5_SET(grh, sgrh, payload_length, 0x1c); + MLX5_SET(grh, sgrh, next_header, 0x1b); + MLX5_SET(grh, sgrh, hop_limit, MLX5_GET(grh, rgrh, hop_limit)); + + rdgid = MLX5_ADDR_OF(grh, rgrh, dgid); + rsgid = MLX5_ADDR_OF(grh, rgrh, sgid); + ssgid = MLX5_ADDR_OF(grh, sgrh, sgid); + sdgid = MLX5_ADDR_OF(grh, sgrh, dgid); + memcpy(ssgid, rdgid, 16); + memcpy(sdgid, rsgid, 16); + *length = CNAK_LENGTH_WITH_GRH; + } else { + *length = CNAK_LENGTH_WITHOUT_GRH; + } + + MLX5_SET(bth, sbth, opcode, 0x51); + MLX5_SET(bth, sbth, migreq, 0x1); + MLX5_SET(bth, sbth, p_key, MLX5_GET(bth, rbth, p_key)); + MLX5_SET(bth, sbth, dest_qp, MLX5_GET(dceth, rdceth, dci_dct)); + MLX5_SET(bth, sbth, psn, MLX5_GET(bth, rbth, psn)); + + MLX5_SET(dceth, sdceth, dci_dct, MLX5_GET(bth, rbth, dest_qp)); + + MLX5_SET(aeth, saeth, syndrome, 0x64); + + if (0) { + pr_info("===dump packet ====\n"); + dump_buf(sbuf, *length); + } +} + +static int reduce_tx_pending(struct mlx5_dc_data *dcd, int num) +{ + struct mlx5_ib_dev *dev = dcd->dev; + struct ib_cq *cq = dcd->scq; + unsigned int send_completed; + unsigned int polled; + struct ib_wc wc; + int n; + + while (num > 0) { + n = ib_poll_cq(cq, 1, &wc); + if (unlikely(n < 0)) { + mlx5_ib_warn(dev, "error polling cnak send cq\n"); + return n; + } + if (unlikely(!n)) + return -EAGAIN; + + if (unlikely(wc.status != IB_WC_SUCCESS)) { + mlx5_ib_warn(dev, "cnak send completed with error, status %d vendor_err %d\n", + wc.status, wc.vendor_err); + dcd->last_send_completed++; + dcd->tx_pending--; + num--; + } else { + send_completed = wc.wr_id; + polled = send_completed - dcd->last_send_completed; + dcd->tx_pending = (unsigned int)(dcd->cur_send - send_completed); + num -= polled; + dcd->last_send_completed = send_completed; + } + } + + return 0; +} + +static bool signal_wr(int wr_count, struct mlx5_dc_data *dcd) +{ + return !(wr_count % dcd->tx_signal_factor); +} + +static int send_cnak(struct mlx5_dc_data *dcd, struct mlx5_send_wr *mlx_wr, + u64 rcv_buff_id) +{ + struct ib_send_wr *wr = &mlx_wr->wr; + struct mlx5_ib_dev *dev = dcd->dev; + const struct ib_send_wr *bad_wr; + struct mlx5_dc_desc *rxd; + struct mlx5_dc_desc *txd; + unsigned int offset; + unsigned int cur; + __be32 *sbuf; + void *rbuf; + int err; + + if (unlikely(dcd->tx_pending > dcd->max_wqes)) { + mlx5_ib_warn(dev, "SW error in cnak send: tx_pending(%d) > max_wqes(%d)\n", + dcd->tx_pending, dcd->max_wqes); + return -EFAULT; + } + + if (unlikely(dcd->tx_pending == dcd->max_wqes)) { + err = reduce_tx_pending(dcd, 1); + if (err) + return err; + if (dcd->tx_pending == dcd->max_wqes) + return -EAGAIN; + } + + cur = dcd->cur_send; + 
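+	/* map the wrapped send index onto its TX descriptor page and byte offset */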
txd = get_desc_from_index(dcd->txdesc, cur % dcd->max_wqes, &offset); + sbuf = txd->buf + offset; + + wr->sg_list[0].addr = txd->dma + offset; + wr->sg_list[0].lkey = dcd->mr->lkey; + wr->opcode = IB_WR_SEND; + wr->num_sge = 1; + wr->wr_id = cur; + if (!signal_wr(cur, dcd)) + wr->send_flags &= ~IB_SEND_SIGNALED; + else + wr->send_flags |= IB_SEND_SIGNALED; + + rxd = get_desc_from_index(dcd->rxdesc, rcv_buff_id, &offset); + rbuf = rxd->buf + offset; + build_cnak_msg(rbuf, sbuf, &wr->sg_list[0].length, &mlx_wr->sel.mlx.dlid); + + mlx_wr->sel.mlx.sl = MLX5_DC_CNAK_SL; + mlx_wr->sel.mlx.icrc = 1; + + err = ib_post_send(dcd->dcqp, wr, &bad_wr); + if (likely(!err)) { + dcd->tx_pending++; + dcd->cur_send++; + atomic64_inc(&dcd->dev->dc_stats[dcd->port - 1].cnaks); + } + + return err; +} + +static int mlx5_post_one_rxdc(struct mlx5_dc_data *dcd, int index) +{ + const struct ib_recv_wr *bad_wr; + struct ib_recv_wr wr; + struct ib_sge sge; + u64 addr; + int err; + int i; + int j; + + i = index / (PAGE_SIZE / MLX5_DC_CNAK_SIZE); + j = index % (PAGE_SIZE / MLX5_DC_CNAK_SIZE); + addr = dcd->rxdesc[i].dma + j * MLX5_DC_CNAK_SIZE; + + memset(&wr, 0, sizeof(wr)); + wr.num_sge = 1; + sge.addr = addr; + sge.length = MLX5_DC_CNAK_SIZE; + sge.lkey = dcd->mr->lkey; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.wr_id = index; + err = ib_post_recv(dcd->dcqp, &wr, &bad_wr); + if (unlikely(err)) + mlx5_ib_warn(dcd->dev, "failed to post dc rx buf at index %d\n", index); + + return err; +} + +static void dc_cnack_rcv_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct mlx5_dc_data *dcd = cq_context; + struct mlx5_ib_dev *dev = dcd->dev; + struct mlx5_send_wr mlx_wr; + struct ib_send_wr *wr = &mlx_wr.wr; + struct ib_wc *wc = dcd->wc_tbl; + struct ib_sge sge; + int err; + int n; + int i; + + memset(&mlx_wr, 0, sizeof(mlx_wr)); + wr->sg_list = &sge; + + n = ib_poll_cq(cq, MLX5_CNAK_RX_POLL_CQ_QUOTA, wc); + if (unlikely(n < 0)) { + /* mlx5 never returns negative values but leave a message just in case */ + mlx5_ib_warn(dev, "DC cnak[%d]: failed to poll cq (%d), aborting\n", + dcd->index, n); + return; + } + if (likely(n > 0)) { + for (i = 0; i < n; i++) { + if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return; + + if (unlikely(wc[i].status != IB_WC_SUCCESS)) { + mlx5_ib_warn(dev, "DC cnak[%d]: completed with error, status = %d vendor_err = %d\n", + wc[i].status, wc[i].vendor_err, dcd->index); + } else { + atomic64_inc(&dcd->dev->dc_stats[dcd->port - 1].connects); + dev->dc_stats[dcd->port - 1].rx_scatter[dcd->index]++; + if (unlikely(send_cnak(dcd, &mlx_wr, wc[i].wr_id))) + mlx5_ib_warn(dev, "DC cnak[%d]: failed to allocate send buf - dropped\n", + dcd->index); + } + + if (unlikely(mlx5_post_one_rxdc(dcd, wc[i].wr_id))) { + atomic64_inc(&dcd->dev->dc_stats[dcd->port - 1].discards); + mlx5_ib_warn(dev, "DC cnak[%d]: repost rx failed, will leak rx queue\n", + dcd->index); + } + } + } + + err = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (unlikely(err)) + mlx5_ib_warn(dev, "DC cnak[%d]: failed to re-arm receive cq (%d)\n", + dcd->index, err); +} + +static int alloc_dc_buf(struct mlx5_dc_data *dcd, int rx) +{ + struct mlx5_ib_dev *dev = dcd->dev; + struct mlx5_dc_desc **desc; + struct mlx5_dc_desc *d; + struct device *ddev; + int max_wqes; + int err = 0; + int npages; + int totsz; + int i; + + ddev = &dev->mdev->pdev->dev; + max_wqes = dcd->max_wqes; + totsz = max_wqes * MLX5_DC_CNAK_SIZE; + npages = DIV_ROUND_UP(totsz, PAGE_SIZE); + desc = rx ? 
&dcd->rxdesc : &dcd->txdesc; + *desc = kcalloc(npages, sizeof(*dcd->rxdesc), GFP_KERNEL); + if (!*desc) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < npages; i++) { + d = *desc + i; + d->buf = dma_alloc_coherent(ddev, PAGE_SIZE, &d->dma, GFP_KERNEL); + if (!d->buf) { + mlx5_ib_err(dev, "dma alloc failed at %d\n", i); + goto out_free; + } + } + if (rx) + dcd->rx_npages = npages; + else + dcd->tx_npages = npages; + + return 0; + +out_free: + for (i--; i >= 0; i--) { + d = *desc + i; + dma_free_coherent(ddev, PAGE_SIZE, d->buf, d->dma); + } + kfree(*desc); +out: + return err; +} + +static int alloc_dc_rx_buf(struct mlx5_dc_data *dcd) +{ + return alloc_dc_buf(dcd, 1); +} + +static int alloc_dc_tx_buf(struct mlx5_dc_data *dcd) +{ + return alloc_dc_buf(dcd, 0); +} + +static void free_dc_buf(struct mlx5_dc_data *dcd, int rx) +{ + struct mlx5_ib_dev *dev = dcd->dev; + struct mlx5_dc_desc *desc; + struct mlx5_dc_desc *d; + struct device *ddev; + int npages; + int i; + + ddev = &dev->mdev->pdev->dev; + npages = rx ? dcd->rx_npages : dcd->tx_npages; + desc = rx ? dcd->rxdesc : dcd->txdesc; + for (i = 0; i < npages; i++) { + d = desc + i; + dma_free_coherent(ddev, PAGE_SIZE, d->buf, d->dma); + } + kfree(desc); +} + +static void free_dc_rx_buf(struct mlx5_dc_data *dcd) +{ + free_dc_buf(dcd, 1); +} + +static void free_dc_tx_buf(struct mlx5_dc_data *dcd) +{ + free_dc_buf(dcd, 0); +} + +struct dc_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_dc_stats *, struct dc_attribute *, char *buf); + ssize_t (*store)(struct mlx5_dc_stats *, struct dc_attribute *, + const char *buf, size_t count); +}; + +static ssize_t qp_count_show(struct mlx5_dc_stats *dc_stats, + struct dc_attribute *unused, + char *buf) +{ + return sprintf(buf, "%u\n", dc_stats->dev->num_dc_cnak_qps); +} + +static int init_driver_cnak(struct mlx5_ib_dev *dev, int port, int index); +static ssize_t qp_count_store(struct mlx5_dc_stats *dc_stats, + struct dc_attribute *unused, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = dc_stats->dev; + int port = dc_stats->port; + unsigned long var; + int i; + int err = 0; + int qp_add = 0; + + if (kstrtol(buf, 0, &var)) { + err = -EINVAL; + goto err; + } + if ((var > dev->max_dc_cnak_qps) || + (dev->num_dc_cnak_qps >= var)) { + err = -EINVAL; + goto err; + } + + for (i = dev->num_dc_cnak_qps; i < var; i++) { + err = init_driver_cnak(dev, port, i); + if (err) { + mlx5_ib_warn(dev, "Fail to set %ld CNAK QPs. Only %d were added\n", + var, qp_add); + break; + } + dev->num_dc_cnak_qps++; + qp_add++; + } +err: + + return err ? 
err : count; +} + +#define DC_ATTR(_name, _mode, _show, _store) \ +struct dc_attribute dc_attr_##_name = __ATTR(_name, _mode, _show, _store) + +static DC_ATTR(qp_count, 0644, qp_count_show, qp_count_store); + +static ssize_t rx_connect_show(struct mlx5_dc_stats *dc_stats, + struct dc_attribute *unused, + char *buf) +{ + unsigned long num; + + num = atomic64_read(&dc_stats->connects); + + return sprintf(buf, "%lu\n", num); +} + +static ssize_t tx_cnak_show(struct mlx5_dc_stats *dc_stats, + struct dc_attribute *unused, + char *buf) +{ + unsigned long num; + + num = atomic64_read(&dc_stats->cnaks); + + return sprintf(buf, "%lu\n", num); +} + +static ssize_t tx_discard_show(struct mlx5_dc_stats *dc_stats, + struct dc_attribute *unused, + char *buf) +{ + unsigned long num; + + num = atomic64_read(&dc_stats->discards); + + return sprintf(buf, "%lu\n", num); +} + +static ssize_t rx_scatter_show(struct mlx5_dc_stats *dc_stats, + struct dc_attribute *unused, + char *buf) +{ + int i; + int ret; + int res = 0; + + buf[0] = 0; + + for (i = 0; i < dc_stats->dev->max_dc_cnak_qps ; i++) { + unsigned long num = dc_stats->rx_scatter[i]; + + if (!dc_stats->dev->dcd[dc_stats->port - 1][i].initialized) + continue; + ret = sprintf(buf + strlen(buf), "%d:%lu\n", i, num); + if (ret < 0) { + res = ret; + break; + } + res += ret; + } + return res; +} + +#define DC_ATTR_RO(_name) \ +struct dc_attribute dc_attr_##_name = __ATTR_RO(_name) + +static DC_ATTR_RO(rx_connect); +static DC_ATTR_RO(tx_cnak); +static DC_ATTR_RO(tx_discard); +static DC_ATTR_RO(rx_scatter); + +static struct attribute *dc_attrs[] = { + &dc_attr_rx_connect.attr, + &dc_attr_tx_cnak.attr, + &dc_attr_tx_discard.attr, + &dc_attr_rx_scatter.attr, + &dc_attr_qp_count.attr, + NULL +}; + +static ssize_t dc_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct dc_attribute *dc_attr = container_of(attr, struct dc_attribute, attr); + struct mlx5_dc_stats *d = container_of(kobj, struct mlx5_dc_stats, kobj); + + if (!dc_attr->show) + return -EIO; + + return dc_attr->show(d, dc_attr, buf); +} + +static ssize_t dc_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t size) +{ + struct dc_attribute *dc_attr = container_of(attr, struct dc_attribute, attr); + struct mlx5_dc_stats *d = container_of(kobj, struct mlx5_dc_stats, kobj); + + if (!dc_attr->store) + return -EIO; + + return dc_attr->store(d, dc_attr, buf, size); +} + +static const struct sysfs_ops dc_sysfs_ops = { + .show = dc_attr_show, + .store = dc_attr_store +}; + +static struct kobj_type dc_type = { + .sysfs_ops = &dc_sysfs_ops, + .default_attrs = dc_attrs +}; + +static int init_sysfs(struct mlx5_ib_dev *dev) +{ + struct device *device = &dev->ib_dev.dev; + + dev->dc_kobj = kobject_create_and_add("dct", &device->kobj); + if (!dev->dc_kobj) { + mlx5_ib_err(dev, "failed to register DCT sysfs object\n"); + return -ENOMEM; + } + + return 0; +} + +static void cleanup_sysfs(struct mlx5_ib_dev *dev) +{ + if (dev->dc_kobj) { + kobject_put(dev->dc_kobj); + dev->dc_kobj = NULL; + } +} + +static int init_port_sysfs(struct mlx5_dc_stats *dc_stats, + struct mlx5_ib_dev *dev, int port) +{ + int ret; + + dc_stats->dev = dev; + dc_stats->port = port; + ret = kobject_init_and_add(&dc_stats->kobj, &dc_type, + dc_stats->dev->dc_kobj, "%d", dc_stats->port); + if (!ret) + dc_stats->initialized = 1; + return ret; +} + +static void cleanup_port_sysfs(struct mlx5_dc_stats *dc_stats) +{ + if (!dc_stats->initialized) + return; + kobject_put(&dc_stats->kobj); +} + +static 
int comp_vector(struct ib_device *dev, int port, int index) +{ + int comp_per_port = dev->num_comp_vectors / dev->phys_port_cnt; + + return (port - 1) * comp_per_port + (index % comp_per_port); +} + +static int init_driver_cnak(struct mlx5_ib_dev *dev, int port, int index) +{ + struct mlx5_dc_data *dcd = &dev->dcd[port - 1][index]; + struct mlx5_ib_resources *devr = &dev->devr; + struct ib_cq_init_attr cq_attr = {}; + struct ib_qp_init_attr init_attr; + struct ib_pd *pd = devr->p0; + struct ib_qp_attr attr; + int ncqe; + int nwr; + int err; + int i; + + dcd->dev = dev; + dcd->port = port; + dcd->index = index; + dcd->mr = pd->device->ops.get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(dcd->mr)) { + mlx5_ib_warn(dev, "failed to create dc DMA MR\n"); + err = PTR_ERR(dcd->mr); + goto error1; + } + + dcd->mr->device = pd->device; + dcd->mr->pd = pd; + dcd->mr->uobject = NULL; + dcd->mr->need_inval = false; + + ncqe = min_t(int, dc_cnak_qp_depth, + BIT(MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))); + nwr = min_t(int, ncqe, + BIT(MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))); + + if (dc_cnak_qp_depth > nwr) { + mlx5_ib_warn(dev, "Can't set DC CNAK QP size to %d. Set to default %d\n", + dc_cnak_qp_depth, nwr); + dc_cnak_qp_depth = nwr; + } + + cq_attr.cqe = ncqe; + cq_attr.comp_vector = comp_vector(&dev->ib_dev, port, index); + dcd->rcq = ib_create_cq(&dev->ib_dev, dc_cnack_rcv_comp_handler, NULL, + dcd, &cq_attr); + if (IS_ERR(dcd->rcq)) { + err = PTR_ERR(dcd->rcq); + mlx5_ib_warn(dev, "failed to create dc cnack rx cq (%d)\n", err); + goto error2; + } + + err = ib_req_notify_cq(dcd->rcq, IB_CQ_NEXT_COMP); + if (err) { + mlx5_ib_warn(dev, "failed to setup dc cnack rx cq (%d)\n", err); + goto error3; + } + + dcd->scq = ib_create_cq(&dev->ib_dev, NULL, NULL, + dcd, &cq_attr); + if (IS_ERR(dcd->scq)) { + err = PTR_ERR(dcd->scq); + mlx5_ib_warn(dev, "failed to create dc cnack tx cq (%d)\n", err); + goto error3; + } + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.qp_type = MLX5_IB_QPT_SW_CNAK; + init_attr.cap.max_recv_wr = nwr; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_wr = nwr; + init_attr.cap.max_send_sge = 1; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.recv_cq = dcd->rcq; + init_attr.send_cq = dcd->scq; + dcd->dcqp = ib_create_qp(pd, &init_attr); + if (IS_ERR(dcd->dcqp)) { + mlx5_ib_warn(dev, "failed to create qp (%d)\n", err); + err = PTR_ERR(dcd->dcqp); + goto error4; + } + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_INIT; + attr.port_num = port; + err = ib_modify_qp(dcd->dcqp, &attr, + IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT); + if (err) { + mlx5_ib_warn(dev, "failed to modify qp to init\n"); + goto error5; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTR; + attr.path_mtu = IB_MTU_4096; + err = ib_modify_qp(dcd->dcqp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "failed to modify qp to rtr\n"); + goto error5; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTS; + err = ib_modify_qp(dcd->dcqp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "failed to modify qp to rts\n"); + goto error5; + } + + dcd->max_wqes = nwr; + err = alloc_dc_rx_buf(dcd); + if (err) { + mlx5_ib_warn(dev, "failed to allocate rx buf\n"); + goto error5; + } + + err = alloc_dc_tx_buf(dcd); + if (err) { + mlx5_ib_warn(dev, "failed to allocate tx buf\n"); + goto error6; + } + + for (i = 0; i < nwr; i++) { + err = mlx5_post_one_rxdc(dcd, i); + if (err) + goto error7; + } + + dcd->tx_signal_factor = min_t(int, 
DIV_ROUND_UP(dcd->max_wqes, 2), + MLX5_CNAK_TX_CQ_SIGNAL_FACTOR); + + dcd->initialized = 1; + return 0; + +error7: + free_dc_tx_buf(dcd); +error6: + free_dc_rx_buf(dcd); +error5: + if (ib_destroy_qp(dcd->dcqp)) + mlx5_ib_warn(dev, "failed to destroy dc qp\n"); +error4: + if (ib_destroy_cq(dcd->scq)) + mlx5_ib_warn(dev, "failed to destroy dc scq\n"); +error3: + if (ib_destroy_cq(dcd->rcq)) + mlx5_ib_warn(dev, "failed to destroy dc rcq\n"); +error2: + ib_dereg_mr(dcd->mr); +error1: + return err; +} + +static void cleanup_driver_cnak(struct mlx5_ib_dev *dev, int port, int index) +{ + struct mlx5_dc_data *dcd = &dev->dcd[port - 1][index]; + + if (!dcd->initialized) + return; + + if (ib_destroy_qp(dcd->dcqp)) + mlx5_ib_warn(dev, "destroy qp failed\n"); + + if (ib_destroy_cq(dcd->scq)) + mlx5_ib_warn(dev, "destroy scq failed\n"); + + if (ib_destroy_cq(dcd->rcq)) + mlx5_ib_warn(dev, "destroy rcq failed\n"); + + ib_dereg_mr(dcd->mr); + free_dc_tx_buf(dcd); + free_dc_rx_buf(dcd); + dcd->initialized = 0; +} + +int mlx5_ib_init_dc_improvements(struct mlx5_ib_dev *dev) +{ + int port; + int err; + int i; + struct mlx5_core_dev *mdev = dev->mdev; + int max_dc_cnak_qps; + int ini_dc_cnak_qps; + + if (!mlx5_core_is_pf(dev->mdev) || + !(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace))) + return 0; + + mlx5_ib_enable_dc_tracer(dev); + + max_dc_cnak_qps = min_t(int, 1 << MLX5_CAP_GEN(mdev, log_max_dc_cnak_qps), + dev->ib_dev.num_comp_vectors / MLX5_CAP_GEN(mdev, num_ports)); + + if (!MLX5_CAP_GEN(dev->mdev, dc_connect_qp)) + return 0; + + err = init_sysfs(dev); + if (err) + return err; + + /* start with 25% of maximum CNAK QPs */ + ini_dc_cnak_qps = DIV_ROUND_UP(max_dc_cnak_qps, 4); + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + dev->dcd[port - 1] = + kcalloc(max_dc_cnak_qps, sizeof(struct mlx5_dc_data), GFP_KERNEL); + if (!dev->dcd[port - 1]) { + err = -ENOMEM; + goto err; + } + dev->dc_stats[port - 1].rx_scatter = + kcalloc(max_dc_cnak_qps, sizeof(int), GFP_KERNEL); + if (!dev->dc_stats[port - 1].rx_scatter) { + err = -ENOMEM; + goto err; + } + for (i = 0; i < ini_dc_cnak_qps; i++) { + err = init_driver_cnak(dev, port, i); + if (err) + goto err; + } + err = init_port_sysfs(&dev->dc_stats[port - 1], dev, port); + if (err) { + mlx5_ib_warn(dev, "failed to initialize DC cnak sysfs\n"); + goto err; + } + } + dev->num_dc_cnak_qps = ini_dc_cnak_qps; + dev->max_dc_cnak_qps = max_dc_cnak_qps; + + return 0; + +err: + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + for (i = 0; i < ini_dc_cnak_qps; i++) + cleanup_driver_cnak(dev, port, i); + cleanup_port_sysfs(&dev->dc_stats[port - 1]); + kfree(dev->dc_stats[port - 1].rx_scatter); + kfree(dev->dcd[port - 1]); + } + cleanup_sysfs(dev); + + return err; +} + +void mlx5_ib_cleanup_dc_improvements(struct mlx5_ib_dev *dev) +{ + int port; + int i; + + if (dev->num_dc_cnak_qps) { + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + for (i = 0; i < dev->num_dc_cnak_qps; i++) + cleanup_driver_cnak(dev, port, i); + cleanup_port_sysfs(&dev->dc_stats[port - 1]); + kfree(dev->dc_stats[port - 1].rx_scatter); + kfree(dev->dcd[port - 1]); + } + cleanup_sysfs(dev); + } + + mlx5_ib_disable_dc_tracer(dev); +} + + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mem.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mem.c new file mode 100644 index 0000000..8445450 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mem.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2013-2015, Mellanox 
Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include "mlx5_ib.h"
+#include
+
+/*
+ * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be
+ * filled in the pas array.
+ */
+void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
+			  u64 access_flags)
+{
+	struct ib_block_iter biter;
+
+	rdma_umem_for_each_dma_block (umem, &biter, page_size) {
+		*pas = cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+				   access_flags);
+		pas++;
+	}
+}
+
+/*
+ * Compute the page shift and page_offset for mailboxes that use a quantized
+ * page_offset. The granularity of the page offset scales according to page
+ * size.
+ */
+unsigned long __mlx5_umem_find_best_quantized_pgoff(
+	struct ib_umem *umem, unsigned long pgsz_bitmap,
+	unsigned int page_offset_bits, u64 pgoff_bitmask, unsigned int scale,
+	unsigned int *page_offset_quantized)
+{
+	const u64 page_offset_mask = (1UL << page_offset_bits) - 1;
+	unsigned long page_size;
+	u64 page_offset;
+
+	page_size = ib_umem_find_best_pgoff(umem, pgsz_bitmap, pgoff_bitmask);
+	if (!page_size)
+		return 0;
+
+	/*
+	 * page size is the largest possible page size.
+	 *
+	 * Reduce the page_size, and thus the page_offset and quanta, until the
+	 * page_offset fits into the mailbox field. Once page_size < scale this
+	 * loop is guaranteed to terminate.
+	 */
+	page_offset = ib_umem_dma_offset(umem, page_size);
+	while (page_offset & ~(u64)(page_offset_mask * (page_size / scale))) {
+		page_size /= 2;
+		page_offset = ib_umem_dma_offset(umem, page_size);
+	}
+
+	/*
+	 * The address is not aligned, or otherwise cannot be represented by the
+	 * page_offset.
+ */ + if (!(pgsz_bitmap & page_size)) + return 0; + + *page_offset_quantized = + (unsigned long)page_offset / (page_size / scale); + if (WARN_ON(*page_offset_quantized > page_offset_mask)) + return 0; + return page_size; +} + +#define WR_ID_BF 0xBF +#define WR_ID_END 0xBAD +#define TEST_WC_NUM_WQES 255 +#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100) +static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id, + bool signaled) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_ctrl_seg *ctrl; + struct mlx5_bf *bf = &qp->bf; + __be32 mmio_wqe[16] = {}; + unsigned long flags; + unsigned int idx; + int i; + + if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) + return -EIO; + + spin_lock_irqsave(&qp->sq.lock, flags); + + idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx); + + memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg)); + ctrl->fm_ce_se = signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0; + ctrl->opmod_idx_opcode = + cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP); + ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) | + (qp->trans_qp.base.mqp.qpn << 8)); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP; + qp->sq.wqe_head[idx] = qp->sq.head + 1; + qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg), + MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; + qp->sq.head++; + + memcpy(mmio_wqe, ctrl, sizeof(*ctrl)); + ((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |= + MLX5_WQE_CTRL_CQ_UPDATE; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell + */ + wmb(); + for (i = 0; i < 8; i++) + mlx5_write64(&mmio_wqe[i * 2], + bf->bfreg->map + bf->offset + i * 8); + + bf->offset ^= bf->buf_size; + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return 0; +} + +static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq) +{ + int ret; + struct ib_wc wc = {}; + unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; + + do { + ret = ib_poll_cq(cq, 1, &wc); + if (ret < 0 || wc.status) + return ret < 0 ? 
ret : -EINVAL; + if (ret) + break; + } while (!time_after(jiffies, end)); + + if (!ret) + return -ETIMEDOUT; + + if (wc.wr_id != WR_ID_BF) + ret = 0; + + return ret; +} + +static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp) +{ + int err, i; + + for (i = 0; i < TEST_WC_NUM_WQES; i++) { + err = post_send_nop(dev, qp, WR_ID_BF, false); + if (err) + return err; + } + + return post_send_nop(dev, qp, WR_ID_END, true); +} + +int mlx5_ib_test_wc(struct mlx5_ib_dev *dev) +{ + struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 }; + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + struct ib_qp_init_attr qp_init_attr = { + .cap = { .max_send_wr = TEST_WC_NUM_WQES }, + .qp_type = IB_QPT_UD, + .sq_sig_type = IB_SIGNAL_REQ_WR, + .create_flags = MLX5_IB_QP_CREATE_WC_TEST, + }; + struct ib_qp_attr qp_attr = { .port_num = 1 }; + struct ib_device *ibdev = &dev->ib_dev; + struct ib_qp *qp; + struct ib_cq *cq; + struct ib_pd *pd; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, bf)) + return 0; + + if (!dev->mdev->roce.roce_en && + port_type_cap == MLX5_CAP_PORT_TYPE_ETH) { + if (mlx5_core_is_pf(dev->mdev)) + dev->wc_support = arch_can_pci_mmap_wc(); + return 0; + } + + ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false); + if (ret) + goto print_err; + + if (!dev->wc_bfreg.wc) + goto out1; + + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + goto out1; + } + + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto out2; + } + + qp_init_attr.recv_cq = cq; + qp_init_attr.send_cq = cq; + qp = ib_create_qp(pd, &qp_init_attr); + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto out3; + } + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_modify_qp(qp, &qp_attr, + IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX | + IB_QP_QKEY); + if (ret) + goto out4; + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (ret) + goto out4; + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) + goto out4; + + ret = test_wc_do_send(dev, qp); + if (ret < 0) + goto out4; + + ret = test_wc_poll_cq_result(dev, cq); + if (ret > 0) { + dev->wc_support = true; + ret = 0; + } + +out4: + ib_destroy_qp(qp); +out3: + ib_destroy_cq(cq); +out2: + ib_dealloc_pd(pd); +out1: + mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg); +print_err: + if (ret) + mlx5_ib_err( + dev, + "Error %d while trying to test write-combining support\n", + ret); + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib.h new file mode 100644 index 0000000..d62e5c4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -0,0 +1,1741 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. + */ + +#ifndef MLX5_IB_H +#define MLX5_IB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "srq.h" +#include "mlx5_ib_nvmf.h" +#include "mlx5_ib_ext.h" + +#define MLX5_PAS_ALIGN 64 +#define mlx5_ib_dbg(_dev, format, arg...) \ + dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_err(_dev, format, arg...) 
\ + dev_err(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) + +#define mlx5_ib_warn(_dev, format, arg...) \ + dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) + +#define MLX5_IB_DEFAULT_UIDX 0xffffff +#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) + +static __always_inline unsigned long +__mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits, + unsigned int pgsz_shift) +{ + unsigned int largest_pg_shift = + min_t(unsigned long, (1ULL << log_pgsz_bits) - 1 + pgsz_shift, + BITS_PER_LONG - 1); + + /* + * Despite a command allowing it, the device does not support lower than + * 4k page size. + */ + pgsz_shift = max_t(unsigned int, MLX5_ADAPTER_PAGE_SHIFT, pgsz_shift); + return GENMASK(largest_pg_shift, pgsz_shift); +} + +/* + * For mkc users, instead of a page_offset the command has a start_iova which + * specifies both the page_offset and the on-the-wire IOVA + */ +#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \ + ib_umem_find_best_pgsz(umem, \ + __mlx5_log_page_size_to_bitmap( \ + __mlx5_bit_sz(typ, log_pgsz_fld), \ + pgsz_shift), \ + iova) + +static __always_inline unsigned long +__mlx5_page_offset_to_bitmask(unsigned int page_offset_bits, + unsigned int offset_shift) +{ + unsigned int largest_offset_shift = + min_t(unsigned long, page_offset_bits - 1 + offset_shift, + BITS_PER_LONG - 1); + + return GENMASK(largest_offset_shift, offset_shift); +} + +/* + * QP/CQ/WQ/etc type commands take a page offset that satisifies: + * page_offset_quantized * (page_size/scale) = page_offset + * Which restricts allowed page sizes to ones that satisify the above. + */ +unsigned long __mlx5_umem_find_best_quantized_pgoff( + struct ib_umem *umem, unsigned long pgsz_bitmap, + unsigned int page_offset_bits, u64 pgoff_bitmask, unsigned int scale, + unsigned int *page_offset_quantized); +#define mlx5_umem_find_best_quantized_pgoff(umem, typ, log_pgsz_fld, \ + pgsz_shift, page_offset_fld, \ + scale, page_offset_quantized) \ + __mlx5_umem_find_best_quantized_pgoff( \ + umem, \ + __mlx5_log_page_size_to_bitmap( \ + __mlx5_bit_sz(typ, log_pgsz_fld), pgsz_shift), \ + __mlx5_bit_sz(typ, page_offset_fld), \ + GENMASK(31, order_base_2(scale)), scale, \ + page_offset_quantized) + +#define mlx5_umem_find_best_cq_quantized_pgoff(umem, typ, log_pgsz_fld, \ + pgsz_shift, page_offset_fld, \ + scale, page_offset_quantized) \ + __mlx5_umem_find_best_quantized_pgoff( \ + umem, \ + __mlx5_log_page_size_to_bitmap( \ + __mlx5_bit_sz(typ, log_pgsz_fld), pgsz_shift), \ + __mlx5_bit_sz(typ, page_offset_fld), 0, scale, \ + page_offset_quantized) + +extern struct workqueue_struct *mlx5_ib_sigerr_sqd_wq; + +enum { + MLX5_IB_MMAP_OFFSET_START = 9, + MLX5_IB_MMAP_OFFSET_END = 255, +}; + +enum { + MLX5_IB_MMAP_CMD_SHIFT = 8, + MLX5_IB_MMAP_CMD_MASK = 0xff, +}; + +enum { + MLX5_RES_SCAT_DATA32_CQE = 0x1, + MLX5_RES_SCAT_DATA64_CQE = 0x2, + MLX5_REQ_SCAT_DATA32_CQE = 0x11, + MLX5_REQ_SCAT_DATA64_CQE = 0x22, +}; + +enum mlx5_ib_mad_ifc_flags { + MLX5_MAD_IFC_IGNORE_MKEY = 1, + MLX5_MAD_IFC_IGNORE_BKEY = 2, + MLX5_MAD_IFC_NET_VIEW = 4, +}; + +enum { + MLX5_CROSS_CHANNEL_BFREG = 0, +}; + +enum { + MLX5_CQE_VERSION_V0, + MLX5_CQE_VERSION_V1, +}; + +enum { + MLX5_TM_MAX_RNDV_MSG_SIZE = 64, + MLX5_TM_MAX_SGE = 1, +}; + +enum { + MLX5_IB_INVALID_UAR_INDEX = BIT(31), + MLX5_IB_INVALID_BFREG = BIT(31), +}; + +enum { + MLX5_MAX_MEMIC_PAGES = 0x100, + MLX5_MEMIC_ALLOC_SIZE_MASK = 0x3f, +}; + +enum { + 
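+	/* MLX5_MEMIC_BASE_SIZE below evaluates to 1 << 6 = 64 bytes, the base unit for device-memory (MEMIC) sizing */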
MLX5_MEMIC_BASE_ALIGN = 6, + MLX5_MEMIC_BASE_SIZE = 1 << MLX5_MEMIC_BASE_ALIGN, +}; + +enum mlx5_ib_mmap_type { + MLX5_IB_MMAP_TYPE_MEMIC = 1, + MLX5_IB_MMAP_TYPE_VAR = 2, + MLX5_IB_MMAP_TYPE_UAR_WC = 3, + MLX5_IB_MMAP_TYPE_UAR_NC = 4, + MLX5_IB_MMAP_TYPE_MEMIC_OP = 5, +}; + +struct mlx5_bfreg_info { + u32 *sys_pages; + int num_low_latency_bfregs; + unsigned int *count; + + /* + * protect bfreg allocation data structs + */ + struct mutex lock; + u32 ver; + u8 lib_uar_4k : 1; + u8 lib_uar_dyn : 1; + u32 num_sys_pages; + u32 num_static_sys_pages; + u32 total_num_bfregs; + u32 num_dyn_bfregs; +}; + +struct mlx5_ib_ucontext { + struct ib_ucontext ibucontext; + struct list_head db_page_list; + + /* protect doorbell record alloc/free + */ + struct mutex db_page_mutex; + struct mlx5_bfreg_info bfregi; + u8 cqe_version; + /* Transport Domain number */ + u32 tdn; + + u64 lib_caps; + u16 devx_uid; + /* For RoCE LAG TX affinity */ + atomic_t tx_port_affinity; +}; + +static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext); +} + +struct mlx5_ib_pd { + struct ib_pd ibpd; + u32 pdn; + u16 uid; +}; + +enum { + MLX5_IB_FLOW_ACTION_MODIFY_HEADER, + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT, + MLX5_IB_FLOW_ACTION_DECAP, +}; + +#define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) +#if (MLX5_IB_FLOW_LAST_PRIO <= 0) +#error "Invalid number of bypass priorities" +#endif +#define MLX5_IB_FLOW_LEFTOVERS_PRIO (MLX5_IB_FLOW_MCAST_PRIO + 1) + +#define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) +#define MLX5_IB_NUM_SNIFFER_FTS 2 +#define MLX5_IB_NUM_EGRESS_FTS 1 +#define MLX5_IB_NUM_FDB_FTS MLX5_BY_PASS_NUM_REGULAR_PRIOS +struct mlx5_ib_flow_prio { + struct mlx5_flow_table *flow_table; + unsigned int refcount; +}; + +struct mlx5_ib_flow_handler { + struct list_head list; + struct ib_flow ibflow; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_handle *rule; + struct ib_counters *ibcounters; + struct mlx5_ib_dev *dev; + struct mlx5_ib_flow_matcher *flow_matcher; +}; + +struct mlx5_ib_flow_matcher { + struct mlx5_ib_match_params matcher_mask; + int mask_len; + enum mlx5_ib_flow_type flow_type; + enum mlx5_flow_namespace_type ns_type; + u16 priority; + struct mlx5_core_dev *mdev; + atomic_t usecnt; + u8 match_criteria_enable; +}; + +struct mlx5_ib_steering_anchor { + struct mlx5_ib_flow_prio *ft_prio; + struct mlx5_ib_dev *dev; + atomic_t usecnt; +}; + +struct mlx5_ib_pp { + u16 index; + struct mlx5_core_dev *mdev; +}; + +enum mlx5_ib_optional_counter_type { + MLX5_IB_OPCOUNTER_CC_RX_CE_PKTS, + MLX5_IB_OPCOUNTER_CC_RX_CNP_PKTS, + MLX5_IB_OPCOUNTER_CC_TX_CNP_PKTS, + + MLX5_IB_OPCOUNTER_MAX, +}; + +struct mlx5_ib_flow_db { + struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio egress_prios[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; + struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS]; + struct mlx5_ib_flow_prio fdb[MLX5_IB_NUM_FDB_FTS]; + struct mlx5_ib_flow_prio rdma_rx[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; + struct mlx5_flow_table *lag_demux_ft; + /* Protect flow steering bypass flow tables + * when add/del flow rules. + * only single add/removal of flow steering rule could be done + * simultaneously. 
+ */ + struct mutex lock; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_ENABLE_MR (IB_SEND_RESERVED_START << 0) +#define MLX5_IB_SEND_UMR_DISABLE_MR (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_SEND_UMR_UPDATE_XLT (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS IB_SEND_RESERVED_END + +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +/* + * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI + * creates the actual hardware QP. + */ +#define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 +#define MLX5_IB_QPT_DCI IB_QPT_RESERVED3 +#define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 +#define MLX5_IB_WR_SIG_PIPED IB_WR_RESERVED2 +#define MLX5_IB_WR_SIG_CANCELED IB_WR_RESERVED3 + +#define MLX5_IB_UMR_OCTOWORD 16 +#define MLX5_IB_UMR_XLT_ALIGNMENT 64 + +#define MLX5_IB_UPD_XLT_ZAP BIT(0) +#define MLX5_IB_UPD_XLT_ENABLE BIT(1) +#define MLX5_IB_UPD_XLT_ATOMIC BIT(2) +#define MLX5_IB_UPD_XLT_ADDR BIT(3) +#define MLX5_IB_UPD_XLT_PD BIT(4) +#define MLX5_IB_UPD_XLT_ACCESS BIT(5) +#define MLX5_IB_UPD_XLT_INDIRECT BIT(6) + +/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the mlx5_ib driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. + */ +#define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START +#define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1) + +struct wr_list { + u16 opcode; + u16 next; +}; + +enum mlx5_ib_rq_flags { + MLX5_IB_RQ_CVLAN_STRIPPING = 1 << 0, + MLX5_IB_RQ_PCI_WRITE_END_PADDING = 1 << 1, +}; + +struct mlx5_ib_wq { + struct mlx5_frag_buf_ctrl fbc; + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; + u16 unsig_count; + + /* serialize post to the work queue + */ + spinlock_t lock; + int wqe_cnt; + int max_post; + int max_gs; + int offset; + int wqe_shift; + unsigned head; + unsigned tail; + u16 cur_post; + u16 last_poll; + void *cur_edge; +}; + +enum mlx5_ib_wq_flags { + MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1, + MLX5_IB_WQ_FLAGS_STRIDING_RQ = 0x2, +}; + +#define MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES 9 +#define MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES 16 +#define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6 +#define MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES 13 +#define MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES 3 + +struct mlx5_ib_rwq { + struct ib_wq ibwq; + struct mlx5_core_qp core_qp; + u32 rq_num_pas; + u32 log_rq_stride; + u32 log_rq_size; + u32 rq_page_offset; + u32 log_page_size; + u32 log_num_strides; + u32 two_byte_shift_en; + u32 single_stride_log_num_of_bytes; + struct ib_umem *umem; + size_t buf_size; + unsigned int page_shift; + struct mlx5_db db; + u32 user_index; + u32 wqe_count; + u32 wqe_shift; + int wq_sig; + u32 create_flags; /* Use enum mlx5_ib_wq_flags */ +}; + +struct mlx5_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_rwq_ind_tbl; + u32 rqtn; + u16 uid; +}; + +struct mlx5_ib_ubuffer { + struct ib_umem *umem; + int buf_size; + u64 buf_addr; +}; + +struct mlx5_ib_qp_base { + struct mlx5_ib_qp *container_mibqp; + struct mlx5_core_qp mqp; + struct mlx5_ib_ubuffer ubuffer; +}; + +struct mlx5_ib_qp_trans { + struct mlx5_ib_qp_base base; + u16 xrcdn; + u32 alt_port; + u8 atomic_rd_en; + u8 resp_depth; +}; + +struct 
mlx5_ib_rss_qp { + u32 tirn; +}; + +struct mlx5_ib_rq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *rq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tirn; + u8 state; + u32 flags; +}; + +struct mlx5_ib_sq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *sq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + struct mlx5_flow_handle *flow_rule; + u32 tisn; + u8 state; +}; + +struct mlx5_ib_raw_packet_qp { + struct mlx5_ib_sq sq; + struct mlx5_ib_rq rq; +}; + +struct mlx5_bf { + int buf_size; + unsigned long offset; + struct mlx5_sq_bfreg *bfreg; +}; + +struct mlx5_ib_dct { + struct mlx5_core_dct mdct; + u32 *in; +}; + +struct mlx5_ib_gsi_qp { + struct ib_qp *rx_qp; + u32 port_num; + struct ib_qp_cap cap; + struct ib_cq *cq; + struct mlx5_ib_gsi_wr *outstanding_wrs; + u32 outstanding_pi, outstanding_ci; + int num_qps; + /* Protects access to the tx_qps. Post send operations synchronize + * with tx_qp creation in setup_qp(). Also protects the + * outstanding_wrs array and indices. + */ + spinlock_t lock; + struct ib_qp **tx_qps; +}; + +struct mlx5_ib_qp { + struct ib_qp ibqp; + union { + struct mlx5_ib_qp_trans trans_qp; + struct mlx5_ib_raw_packet_qp raw_packet_qp; + struct mlx5_ib_rss_qp rss_qp; + struct mlx5_ib_dct dct; + struct mlx5_ib_gsi_qp gsi; + }; + struct mlx5_frag_buf buf; + + struct mlx5_db db; + struct mlx5_ib_wq rq; + + u8 sq_signal_bits; + u8 next_fence; + struct mlx5_ib_wq sq; + + /* serialize qp state modifications + */ + struct mutex mutex; + /* cached variant of create_flags from struct ib_qp_init_attr */ + u32 flags; + u32 port; + u8 state; + int max_inline_data; + struct mlx5_bf bf; + u8 has_rq:1; + u8 is_rss:1; + u32 rq_type; + + /* only for user space QPs. For kernel + * we have it from the bf object + */ + int bfregn; + + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; + struct mlx5_rate_limit rl; + u32 underlay_qpn; + u32 flags_en; + /* + * IB/core doesn't store low-level QP types, so + * store both MLX and IBTA types in the field below. 
+ */ + enum ib_qp_type type; + struct rdma_ah_attr ah; + u8 tclass; + /* A flag to indicate if there's a new counter is configured + * but not take effective + */ + u32 counter_pending; + u16 gsi_lag_port; +}; + +struct mlx5_ib_cq_buf { + struct mlx5_frag_buf_ctrl fbc; + struct mlx5_frag_buf frag_buf; + struct ib_umem *umem; + int cqe_size; + int nent; +}; + +struct mlx5_umr_wr { + struct ib_send_wr wr; + u64 virt_addr; + u64 offset; + struct ib_pd *pd; + unsigned int page_shift; + unsigned int xlt_size; + u64 length; + int access_flags; + u32 mkey; + u8 ignore_free_state:1; +}; + +static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr) +{ + return container_of(wr, struct mlx5_umr_wr, wr); +} + +enum mlx5_ib_cq_pr_flags { + MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD = 1 << 0, + MLX5_IB_CQ_PR_FLAGS_REAL_TIME_TS = 1 << 1, +}; + +struct mlx5_ib_cq { + struct ib_cq ibcq; + struct mlx5_core_cq mcq; + struct mlx5_ib_cq_buf buf; + struct mlx5_db db; + + /* serialize access to the CQ + */ + spinlock_t lock; + + /* protect resize cq + */ + struct mutex resize_mutex; + struct mlx5_ib_cq_buf *resize_buf; + struct ib_umem *resize_umem; + int cqe_size; + struct list_head list_send_qp; + struct list_head list_recv_qp; + u32 create_flags; + struct list_head wc_list; + enum ib_cq_notify_flags notify_flags; + struct work_struct notify_work; + u16 private_flags; /* Use mlx5_ib_cq_pr_flags */ +}; + +struct mlx5_ib_wc { + struct ib_wc wc; + struct list_head list; + atomic_t in_use; +}; + +struct mlx5_ib_srq { + struct ib_srq ibsrq; + struct mlx5_core_srq msrq; + struct mlx5_frag_buf buf; + struct mlx5_db db; + struct mlx5_frag_buf_ctrl fbc; + u64 *wrid; + /* protect SRQ hanlding + */ + spinlock_t lock; + int head; + int tail; + u16 wqe_ctr; + struct ib_umem *umem; + /* serialize arming a SRQ + */ + struct mutex mutex; + int wq_sig; +}; + +struct mlx5_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; +}; + +enum mlx5_ib_mtt_access_flags { + MLX5_IB_MTT_READ = (1 << 0), + MLX5_IB_MTT_WRITE = (1 << 1), +}; + +struct mlx5_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + u8 mmap_flag; + u64 address; + u32 page_idx; +}; + +enum mlx5_mkey_type { + MLX5_MKEY_MR = 1, + MLX5_MKEY_MW, + MLX5_MKEY_INDIRECT_DEVX, +}; + +struct mlx5_ib_mkey { + u32 key; + enum mlx5_mkey_type type; + unsigned int ndescs; + struct wait_queue_head wait; + refcount_t usecount; +}; + +#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) + +#define MLX5_IB_DM_MEMIC_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE |\ + IB_ACCESS_REMOTE_WRITE |\ + IB_ACCESS_REMOTE_READ |\ + IB_ACCESS_REMOTE_ATOMIC |\ + IB_ZERO_BASED) + +#define MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE |\ + IB_ACCESS_REMOTE_WRITE |\ + IB_ACCESS_REMOTE_READ |\ + IB_ZERO_BASED) + +#define mlx5_update_odp_stats(mr, counter_name, value) \ + atomic64_add(value, &((mr)->odp_stats.counter_name)) + +struct mlx5_ib_mr { + struct ib_mr ibmr; + struct mlx5_ib_mkey mmkey; + + /* User MR data */ + struct mlx5_cache_ent *cache_ent; + /* Everything after cache_ent is zero'd when MR allocated */ + struct ib_umem *umem; + + struct { + /* Used only while the MR is in the cache */ + struct { + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; + struct mlx5_async_work cb_work; + /* Cache list element */ + struct list_head list; + }; + + /* Used only by kernel MRs (umem == NULL) */ + struct { + void *descs; + void *descs_alloc; + dma_addr_t desc_map; + int max_descs; + int desc_size; + int access_mode; + + /* For Kernel IB_MR_TYPE_INTEGRITY */ + struct mlx5_core_sig_ctx 
*sig; + struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr *klm_mr; + struct mlx5_ib_mr *mtt_mr; + u64 data_iova; + u64 pi_iova; + int meta_ndescs; + int meta_length; + int data_length; + }; + + /* Used only by User MRs (umem != NULL) */ + struct { + unsigned int page_shift; + /* Current access_flags */ + int access_flags; + + /* For User ODP */ + struct mlx5_ib_mr *parent; + struct xarray implicit_children; + union { + struct work_struct work; + } odp_destroy; + struct ib_odp_counters odp_stats; + bool is_odp_implicit; + }; + }; +}; + +/* Zero the fields in the mr that are variant depending on usage */ +static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr) +{ + memset_after(mr, 0, cache_ent); +} + +static inline bool is_odp_mr(struct mlx5_ib_mr *mr) +{ + return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && + mr->umem->is_odp; +} + +static inline bool is_dmabuf_mr(struct mlx5_ib_mr *mr) +{ + return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && + mr->umem->is_dmabuf; +} + +struct mlx5_ib_mw { + struct ib_mw ibmw; + struct mlx5_ib_mkey mmkey; +}; + +struct mlx5_ib_umr_context { + struct ib_cqe cqe; + enum ib_wc_status status; + struct completion done; +}; + +enum { + MLX5_UMR_STATE_ACTIVE, + MLX5_UMR_STATE_RECOVER, + MLX5_UMR_STATE_ERR, +}; + +struct umr_common { + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + /* Protects from UMR QP overflow + */ + struct semaphore sem; + /* Protects from using UMR while the UMR is not active + */ + struct mutex lock; + unsigned int state; +}; + +struct cache_order { + struct kobject kobj; + int order; + int index; + struct mlx5_ib_dev *dev; +}; + +struct mlx5_cache_ent { + struct list_head head; + /* sync access to the cahce entry + */ + spinlock_t lock; + + u32 order; + u32 xlt; + u32 access_mode; + u32 page; + + u8 disabled:1; + u8 fill_to_high_water:1; + + /* + * - available_mrs is the length of list head, ie the number of MRs + * available for immediate allocation. + * - total_mrs is available_mrs plus all in use MRs that could be + * returned to the cache. + * - limit is the low water mark for available_mrs, 2* limit is the + * upper water mark. 
+ * - pending is the number of MRs currently being created + */ + u32 total_mrs; + u32 available_mrs; + u32 limit; + u32 pending; + + /* Statistics */ + u32 miss; + + struct mlx5_ib_dev *dev; + struct work_struct work; + struct delayed_work dwork; + struct cache_order co; +}; + +struct mlx5_mr_cache { + struct workqueue_struct *wq; + struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + struct dentry *root; + unsigned long last_add; + int rel_timeout; + int rel_imm; +}; + +struct mlx5_ib_port_resources { + struct mlx5_ib_gsi_qp *gsi; + struct work_struct pkey_change_work; +}; + +struct mlx5_ib_resources { + struct ib_cq *c0; + u32 xrcdn0; + u32 xrcdn1; + struct ib_pd *p0; + struct ib_srq *s0; + struct ib_srq *s1; + struct mlx5_ib_port_resources ports[2]; +}; + +#define MAX_OPFC_RULES 2 + +struct mlx5_ib_op_fc { + struct mlx5_fc *fc; + struct mlx5_flow_handle *rule[MAX_OPFC_RULES]; +}; + +struct mlx5_ib_counters { + struct rdma_stat_desc *descs; + size_t *offsets; + u32 num_q_counters; + u32 num_cong_counters; + u32 num_ext_ppcnt_counters; + u32 num_op_counters; + u16 set_id; + struct mlx5_ib_op_fc opfcs[MLX5_IB_OPCOUNTER_MAX]; +}; + +int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_op_fc *opfc, + enum mlx5_ib_optional_counter_type type); + +void mlx5_ib_fs_remove_op_fc(struct mlx5_ib_dev *dev, + struct mlx5_ib_op_fc *opfc, + enum mlx5_ib_optional_counter_type type); + +struct mlx5_ib_multiport_info; + +struct mlx5_ib_multiport { + struct mlx5_ib_multiport_info *mpi; + /* To be held when accessing the multiport info */ + spinlock_t mpi_lock; +}; + +struct mlx5_roce { + /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL + * netdev pointer + */ + rwlock_t netdev_lock; + struct net_device *netdev; + struct notifier_block nb; + atomic_t tx_port_affinity; + enum ib_port_state last_port_state; + struct mlx5_ib_dev *dev; + u32 native_port_num; +}; + +struct mlx5_ib_port { + struct mlx5_ib_counters cnts; + struct mlx5_ib_multiport mp; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + struct mlx5_roce roce; + struct mlx5_eswitch_rep *rep; +}; + +struct mlx5_ib_dbg_param { + int offset; + struct mlx5_ib_dev *dev; + struct dentry *dentry; + u32 port_num; +}; + +enum mlx5_ib_dbg_cc_types { + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE, + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI, + MLX5_IB_DBG_CC_RP_TIME_RESET, + MLX5_IB_DBG_CC_RP_BYTE_RESET, + MLX5_IB_DBG_CC_RP_THRESHOLD, + MLX5_IB_DBG_CC_RP_AI_RATE, + MLX5_IB_DBG_CC_RP_MAX_RATE, + MLX5_IB_DBG_CC_RP_HAI_RATE, + MLX5_IB_DBG_CC_RP_MIN_DEC_FAC, + MLX5_IB_DBG_CC_RP_MIN_RATE, + MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP, + MLX5_IB_DBG_CC_RP_DCE_TCP_G, + MLX5_IB_DBG_CC_RP_DCE_TCP_RTT, + MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD, + MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE, + MLX5_IB_DBG_CC_RP_GD, + MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS, + MLX5_IB_DBG_CC_NP_CNP_DSCP, + MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, + MLX5_IB_DBG_CC_NP_CNP_PRIO, + MLX5_IB_DBG_CC_MAX, +}; + +struct mlx5_ib_dbg_cc_params { + struct dentry *root; + struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; +}; + +struct mlx5_ib_delay_drop { + struct mlx5_ib_dev *dev; + struct work_struct delay_drop_work; + /* serialize setting of delay drop */ + struct mutex lock; + u32 timeout; + bool activate; + atomic_t events_cnt; + atomic_t rqs_cnt; + struct dentry *dir_debugfs; +}; + +enum mlx5_ib_stages { + MLX5_IB_STAGE_INIT, + MLX5_IB_STAGE_FS, + MLX5_IB_STAGE_CAPS, + MLX5_IB_STAGE_NON_DEFAULT_CB, + MLX5_IB_STAGE_ROCE, + MLX5_IB_STAGE_QP, + MLX5_IB_STAGE_SRQ, + 
MLX5_IB_STAGE_DEVICE_RESOURCES, + MLX5_IB_STAGE_DEVICE_NOTIFIER, + MLX5_IB_STAGE_ODP, + MLX5_IB_STAGE_COUNTERS, + MLX5_IB_STAGE_CONG_DEBUGFS, + MLX5_IB_STAGE_UAR, + MLX5_IB_STAGE_BFREG, + MLX5_IB_STAGE_PRE_IB_REG_UMR, + MLX5_IB_STAGE_WHITELIST_UID, + MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_POST_IB_REG_UMR, + MLX5_IB_STAGE_DELAY_DROP, + MLX5_IB_STAGE_DC_TRACER, + MLX5_IB_STAGE_RESTRACK, + MLX5_IB_STAGE_TTL_SYSFS, + MLX5_IB_STAGE_TC_SYSFS, + MLX5_IB_STAGE_MAX, +}; + +struct mlx5_ib_stage { + int (*init)(struct mlx5_ib_dev *dev); + void (*cleanup)(struct mlx5_ib_dev *dev); +}; + +#define STAGE_CREATE(_stage, _init, _cleanup) \ + .stage[_stage] = {.init = _init, .cleanup = _cleanup} + +struct mlx5_ib_profile { + struct mlx5_ib_stage stage[MLX5_IB_STAGE_MAX]; +}; + +struct mlx5_ib_multiport_info { + struct list_head list; + struct mlx5_ib_dev *ibdev; + struct mlx5_core_dev *mdev; + struct notifier_block mdev_events; + struct completion unref_comp; + u64 sys_image_guid; + u32 mdev_refcnt; + bool is_master; + bool unaffiliate; +}; + +struct mlx5_ib_flow_action { + struct ib_flow_action ib_action; + union { + struct { + u64 ib_flags; + struct mlx5_accel_esp_xfrm *ctx; + } esp_aes_gcm; + struct { + struct mlx5_ib_dev *dev; + u32 sub_type; + union { + struct mlx5_modify_hdr *modify_hdr; + struct mlx5_pkt_reformat *pkt_reformat; + }; + } flow_action_raw; + }; +}; + +struct mlx5_dm { + struct mlx5_core_dev *dev; + /* This lock is used to protect the access to the shared + * allocation map when concurrent requests by different + * processes are handled. + */ + spinlock_t lock; + DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); +}; + +struct mlx5_read_counters_attr { + struct mlx5_fc *hw_cntrs_hndl; + u64 *out; + u32 flags; +}; + +enum mlx5_ib_counters_type { + MLX5_IB_COUNTERS_FLOW, +}; + +struct mlx5_ib_mcounters { + struct ib_counters ibcntrs; + enum mlx5_ib_counters_type type; + /* number of counters supported for this counters type */ + u32 counters_num; + struct mlx5_fc *hw_cntrs_hndl; + /* read function for this counters type */ + int (*read_counters)(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr); + /* max index set as part of create_flow */ + u32 cntrs_max_index; + /* number of counters data entries ( pair) */ + u32 ncounters; + /* counters data array for descriptions and indexes */ + struct mlx5_ib_flow_counters_desc *counters_data; + /* protects access to mcounters internal data */ + struct mutex mcntrs_mutex; +}; + +static inline struct mlx5_ib_mcounters * +to_mcounters(struct ib_counters *ibcntrs) +{ + return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs); +} + +int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, + bool is_egress, + struct mlx5_flow_act *action); +struct mlx5_ib_lb_state { + /* protect the user_td */ + struct mutex mutex; + u32 user_td; + int qps; + bool enabled; +}; + +struct mlx5_ib_pf_eq { + struct notifier_block irq_nb; + struct mlx5_ib_dev *dev; + struct mlx5_eq *core; + struct work_struct work; + spinlock_t lock; /* Pagefaults spinlock */ + struct workqueue_struct *wq; + mempool_t *pool; +}; + +struct mlx5_devx_event_table { + struct mlx5_nb devx_nb; + /* serialize updating the event_xa */ + struct mutex event_xa_lock; + struct xarray event_xa; +}; + +struct mlx5_var_table { + /* serialize updating the bitmap */ + struct mutex bitmap_lock; + unsigned long *bitmap; + u64 hw_start_addr; + u32 stride_size; + u64 num_var_hw_entries; +}; + +struct mlx5_port_caps { + bool has_smi; + u8 ext_port_cap; +}; + +struct 
mlx5_reserved_gids { + int macsec_index; + const struct ib_gid_attr *physical_gid; +}; + +struct mlx5_ib_dev { + struct ib_device ib_dev; + struct mlx5_core_dev *mdev; + struct notifier_block mdev_events; + int num_ports; + /* serialize update of capability mask + */ + struct mutex cap_mask_mutex; + u8 ib_active:1; + u8 is_rep:1; + u8 lag_active:1; + u8 wc_support:1; + u8 fill_delay; + struct umr_common umrc; + /* sync used page count stats + */ + struct mlx5_ib_resources devr; + + atomic_t mkey_var; + struct mlx5_mr_cache cache; + struct timer_list delay_timer; + /* Prevents soft lock on massive reg MRs */ + struct mutex slow_path_mutex; + struct ib_odp_caps odp_caps; + u64 odp_max_size; + struct mutex odp_eq_mutex; + struct mlx5_ib_pf_eq odp_pf_eq; + + struct xarray odp_mkeys; + + u32 null_mkey; + struct mlx5_ib_flow_db *flow_db; + struct ib_nvmf_caps nvmf_caps; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; + struct mlx5_dc_tracer dctr; + u32 num_dc_cnak_qps; + u32 max_dc_cnak_qps; + struct mlx5_dc_stats dc_stats[MLX5_MAX_PORTS]; + struct mlx5_dc_data *dcd[MLX5_MAX_PORTS]; + struct mlx5_ttl_data ttld[MLX5_MAX_PORTS]; + struct mlx5_tc_data tcd[MLX5_MAX_PORTS]; + struct kobject *dc_kobj; + struct kobject *ttl_kobj; + struct kobject *tc_kobj; + /* Array with num_ports elements */ + struct mlx5_ib_port *port; + struct mlx5_sq_bfreg bfreg; + struct mlx5_sq_bfreg wc_bfreg; + struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_delay_drop delay_drop; + const struct mlx5_ib_profile *profile; + + struct mlx5_ib_lb_state lb; + u8 umr_fence; + struct list_head ib_dev_list; + u64 sys_image_guid; + struct mlx5_dm dm; + u16 devx_whitelist_uid; + struct kobject mr_cache; + struct mlx5_srq_table srq_table; + struct mlx5_qp_table qp_table; + struct mlx5_async_ctx async_ctx; + struct mlx5_devx_event_table devx_event_table; + struct mlx5_var_table var_table; + + struct xarray sig_mrs; + struct mlx5_port_caps port_caps[MLX5_MAX_PORTS]; + struct mlx5_reserved_gids reserved_gids[MLX5_MAX_PORTS][MLX5_MAX_MACSEC_GIDS]; + u16 pkey_table_len; + u8 lag_ports; +}; + +static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) +{ + return container_of(mcq, struct mlx5_ib_cq, mcq); +} + +static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd); +} + +static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct mlx5_ib_dev, ib_dev); +} + +static inline struct mlx5_ib_dev *mr_to_mdev(struct mlx5_ib_mr *mr) +{ + return to_mdev(mr->ibmr.device); +} + +static inline struct mlx5_ib_dev *mlx5_udata_to_mdev(struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + return to_mdev(context->ibucontext.device); +} + +static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct mlx5_ib_cq, ibcq); +} + +static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) +{ + return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; +} + +static inline struct mlx5_ib_rwq *to_mibrwq(struct mlx5_core_qp *core_qp) +{ + return container_of(core_qp, struct mlx5_ib_rwq, core_qp); +} + +static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct mlx5_ib_pd, ibpd); +} + +static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct 
mlx5_ib_srq, ibsrq); +} + +static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct mlx5_ib_qp, ibqp); +} + +static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) +{ + return container_of(ibwq, struct mlx5_ib_rwq, ibwq); +} + +static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); +} + +static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) +{ + return container_of(msrq, struct mlx5_ib_srq, msrq); +} + +static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct mlx5_ib_mr, ibmr); +} + +static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct mlx5_ib_mw, ibmw); +} + +static inline struct mlx5_ib_flow_action * +to_mflow_act(struct ib_flow_action *ibact) +{ + return container_of(ibact, struct mlx5_ib_flow_action, ib_action); +} + +static inline struct mlx5_user_mmap_entry * +to_mmmap(struct rdma_user_mmap_entry *rdma_entry) +{ + return container_of(rdma_entry, + struct mlx5_user_mmap_entry, rdma_entry); +} + +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, + struct mlx5_db *db); +void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); +void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); +int mlx5_ib_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +static inline int mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags) +{ + return 0; +} +int mlx5_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); +int mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp); +void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp); +int mlx5_ib_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); +void mlx5_ib_drain_sq(struct ib_qp *qp); +void mlx5_ib_drain_rq(struct ib_qp *qp); +int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc); +int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc); +int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, + size_t buflen, size_t *bc); +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +int mlx5_ib_poll_cq(struct ib_cq *ibcq, int 
num_entries, struct ib_wc *wc); +int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); +int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int access_flags, + struct ib_udata *udata); +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata); +int mlx5_ib_advise_mr(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, + struct ib_sge *sg_list, + u32 num_sge, + struct uverbs_attr_bundle *attrs); +int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); +int mlx5_ib_dealloc_mw(struct ib_mw *mw); +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags); +int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags); +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); +void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); +struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int access_flags, + struct ib_pd *pd, struct ib_udata *udata); +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg); +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_sg, + u32 max_num_meta_sg); +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset); +int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in, struct ib_mad *out, + size_t *out_mad_size, u16 *out_mad_pkey_index); +int mlx5_ib_alloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata); +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata); +int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, unsigned int port); +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, + __be64 *sys_image_guid); +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, + u16 *max_pkeys); +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, + u32 *vendor_id); +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc); +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid); +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u32 port, u16 index, + u16 *pkey); +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid); +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props); +int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props); +void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, + u64 access_flags); +void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); +int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); 
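/*
 * Illustrative sketch, not part of the patch: the to_mdev()/to_mcq()/to_mqp()
 * helpers declared above all use the same container_of pattern -- the IB core
 * hands the driver a pointer to the embedded ib_* object, and the driver
 * recovers its private wrapper structure from that pointer. The standalone
 * program below demonstrates the idea with hypothetical demo_* types and a
 * local macro; the real driver relies on the kernel's container_of() and the
 * ib_core structures instead.
 */
#include <stddef.h>
#include <stdio.h>

struct demo_ib_qp { int qp_num; };      /* stands in for struct ib_qp      */

struct demo_qp {                        /* stands in for struct mlx5_ib_qp */
	struct demo_ib_qp ibqp;         /* embedded core object            */
	int private_state;              /* driver-private field            */
};

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Same shape as the to_mqp()-style helpers in this header. */
static struct demo_qp *to_demo_qp(struct demo_ib_qp *ibqp)
{
	return demo_container_of(ibqp, struct demo_qp, ibqp);
}

int main(void)
{
	struct demo_qp qp = { .ibqp = { .qp_num = 7 }, .private_state = 42 };
	struct demo_ib_qp *core = &qp.ibqp; /* what the core layer passes back */

	/* Recover the driver-private wrapper from the core pointer. */
	printf("qp_num=%d private_state=%d\n",
	       core->qp_num, to_demo_qp(core)->private_state);
	return 0;
}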
+ +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + unsigned int entry, int access_flags); + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status); +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata); +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); +int mlx5_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs); +int mlx5r_umr_recover(struct mlx5_ib_dev *dev); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq); +void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); +void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags); + +int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge); +int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr); +int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr); +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, + struct mlx5_ib_pf_eq *eq) +{ + return 0; +} +static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {} +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} +static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) {} + +static inline int +mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, u32 flags, + struct ib_sge *sg_list, u32 num_sge) +{ + return -EOPNOTSUPP; +} +static inline int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr) +{ + return -EOPNOTSUPP; +} +static inline int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +extern const struct mmu_interval_notifier_ops mlx5_mn_ops; + +/* Needed for rep profile */ +void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, + int stage); +int __mlx5_ib_add(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile); + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, + u32 port, struct ifla_vf_info *info); +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u32 port, int state); +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u32 port, struct ifla_vf_stats *stats); +int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u32 port, + struct ifla_vf_guid *node_guid, + struct ifla_vf_guid *port_guid); +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u32 port, + u64 guid, int type); + +__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev, + 
const struct ib_gid_attr *attr); + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num); +void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num); + +/* GSI QP helper functions */ +int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, + struct ib_qp_init_attr *attr); +int mlx5_ib_destroy_gsi(struct mlx5_ib_qp *mqp); +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask); +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); + +void mlx5_ib_generate_wc(struct ib_cq *ibcq, struct mlx5_ib_wc *soft_wc); + +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, + int bfregn); +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi); +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev, + u32 ib_port_num, + u32 *native_port_num); +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, + u32 port_num); + +extern const struct uapi_definition mlx5_ib_devx_defs[]; +extern const struct uapi_definition mlx5_ib_flow_defs[]; +extern const struct uapi_definition mlx5_ib_qos_defs[]; +extern const struct uapi_definition mlx5_ib_std_types_defs[]; + +static inline int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == MLX5_IB_QPT_HW_GSI || qp_type == IB_QPT_GSI; +} + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +static inline u32 check_cq_create_flags(u32 flags) +{ + /* + * It returns non-zero value for unsupported CQ + * create flags, otherwise it returns zero. + */ + return (flags & ~(IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN | + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); +} + +static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, + u32 *user_index) +{ + if (cqe_version) { + if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || + (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) + return -EINVAL; + *user_index = cmd_uidx; + } else { + *user_index = MLX5_IB_DEFAULT_UIDX; + } + + return 0; +} + +static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_qp *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version && + (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_srq *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version && + (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_support) +{ + return lib_support && MLX5_CAP_GEN(dev->mdev, uar_4k) ? 
+ MLX5_UARS_IN_PAGE : 1; +} + +static inline int get_num_static_uars(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; +} + +extern void *xlt_emergency_page; + +int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, u32 bfregn, + bool dyn_bfreg); + +static inline bool mlx5_ib_can_load_pas_with_umr(struct mlx5_ib_dev *dev, + size_t length) +{ + /* + * umr_check_mkey_mask() rejects MLX5_MKEY_MASK_PAGE_SIZE which is + * always set if MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (aka + * MLX5_IB_UPD_XLT_ADDR and MLX5_IB_UPD_XLT_ENABLE) is set. Thus, a mkey + * can never be enabled without this capability. Simplify this weird + * quirky hardware by just saying it can't use PAS lists with UMR at + * all. + */ + if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) + return false; + + /* + * length is the size of the MR in bytes when mlx5_ib_update_xlt() is + * used. + */ + if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && + length >= MLX5_MAX_UMR_PAGES * PAGE_SIZE) + return false; + + return true; +} + +/* + * true if an existing MR can be reconfigured to new access_flags using UMR. + * Older HW cannot use UMR to update certain elements of the MKC. See + * umr_check_mkey_mask(), get_umr_update_access_mask() and umr_check_mkey_mask() + */ +static inline bool mlx5_ib_can_reconfig_with_umr(struct mlx5_ib_dev *dev, + unsigned int current_access_flags, + unsigned int target_access_flags) +{ + unsigned int diffs = current_access_flags ^ target_access_flags; + + if ((diffs & IB_ACCESS_REMOTE_ATOMIC) && + MLX5_CAP_GEN(dev->mdev, atomic) && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) + return false; + + if ((diffs & IB_ACCESS_RELAXED_ORDERING) && + MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) + return false; + + if ((diffs & IB_ACCESS_RELAXED_ORDERING) && + MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) + return false; + + return true; +} + +static inline int mlx5r_store_odp_mkey(struct mlx5_ib_dev *dev, + struct mlx5_ib_mkey *mmkey) +{ + refcount_set(&mmkey->usecount, 1); + + return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mmkey->key), + mmkey, GFP_KERNEL)); +} + +/* deref an mkey that can participate in ODP flow */ +static inline void mlx5r_deref_odp_mkey(struct mlx5_ib_mkey *mmkey) +{ + if (refcount_dec_and_test(&mmkey->usecount)) + wake_up(&mmkey->wait); +} + +/* deref an mkey that can participate in ODP flow and wait for relese */ +static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey) +{ + mlx5r_deref_odp_mkey(mmkey); + wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0); +} + +int mlx5_ib_test_wc(struct mlx5_ib_dev *dev); + +static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) +{ + if (dev->lag_active && + mlx5_lag_mode_is_hash(dev->mdev) && + MLX5_CAP_PORT_SELECTION(dev->mdev, port_select_flow_table_bypass)) + return 0; + + return dev->lag_active || + (MLX5_CAP_GEN(dev->mdev, num_lag_ports) > 1 && + MLX5_CAP_GEN(dev->mdev, lag_tx_port_affinity)); +} + +static inline bool rt_supported(int ts_cap) +{ + return ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME || + ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME; +} + +/* + * PCI Peer to Peer is a trainwreck. 
If no switch is present then things + * sometimes work, depending on the pci_distance_p2p logic for excluding broken + * root complexes. However if a switch is present in the path, then things get + * really ugly depending on how the switch is setup. This table assumes that the + * root complex is strict and is validating that all req/reps are matches + * perfectly - so any scenario where it sees only half the transaction is a + * failure. + * + * CR/RR/DT ATS RO P2P + * 00X X X OK + * 010 X X fails (request is routed to root but root never sees comp) + * 011 0 X fails (request is routed to root but root never sees comp) + * 011 1 X OK + * 10X X 1 OK + * 101 X 0 fails (completion is routed to root but root didn't see req) + * 110 X 0 SLOW + * 111 0 0 SLOW + * 111 1 0 fails (completion is routed to root but root didn't see req) + * 111 1 1 OK + * + * Unfortunately we cannot reliably know if a switch is present or what the + * CR/RR/DT ACS settings are, as in a VM that is all hidden. Assume that + * CR/RR/DT is 111 if the ATS cap is enabled and follow the last three rows. + * + * For now assume if the umem is a dma_buf then it is P2P. + */ +static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev, + struct ib_umem *umem, int access_flags) +{ + if (!MLX5_CAP_GEN(dev->mdev, ats) || + (!umem->is_dmabuf && !umem->is_peer)) + return false; + return access_flags & IB_ACCESS_RELAXED_ORDERING; +} + +#endif /* MLX5_IB_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_ext.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_ext.h new file mode 100644 index 0000000..d6d936f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_ext.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX5_IB_EXT_H +#define MLX5_IB_EXT_H + +#include + +/* mlx5_set_ttl feature infra */ +struct mlx5_ttl_data { + int val; + struct kobject kobj; +}; + +int init_ttl_sysfs(struct mlx5_ib_dev *dev); +void cleanup_ttl_sysfs(struct mlx5_ib_dev *dev); + +/* mlx5_force_tc feature */ +enum { + TCLASS_MATCH_SRC_ADDR_IP, + TCLASS_MATCH_DST_ADDR_IP, + TCLASS_MATCH_SRC_ADDR_IP6, + TCLASS_MATCH_DST_ADDR_IP6, + TCLASS_MATCH_TCLASS, + TCLASS_MATCH_TCLASS_NO_PREFIX, + TCLASS_MATCH_MAX, +}; + +struct tclass_match { + u32 mask; + u8 s_addr[16]; + u8 d_addr[16]; + u8 d_addr_m[16]; + int tclass; /* Should be always last! */ +}; + +struct tclass_parse_node { + int (*parse)(const char *str, void *store, void *store_mask); + int (*compare)(struct tclass_match *match, struct tclass_match *match2, + bool with_mask); + size_t (*print)(struct tclass_match *match, char *buf, size_t size); + const char *pattern; + size_t v_offset; + size_t m_offset; + u32 mask; +}; + +#define TCLASS_CREATE_PARSE_NODE(type, parse, compare, print, pattern, \ + mask, v_member, m_member) \ + [(type)] = {parse, compare, print, pattern, \ + offsetof(struct tclass_match, v_member), \ + offsetof(struct tclass_match, m_member), mask} + +enum { + TCLASS_MATCH_MASK_SRC_ADDR_IP = BIT(TCLASS_MATCH_SRC_ADDR_IP), + TCLASS_MATCH_MASK_DST_ADDR_IP = BIT(TCLASS_MATCH_DST_ADDR_IP), + TCLASS_MATCH_MASK_SRC_ADDR_IP6 = BIT(TCLASS_MATCH_SRC_ADDR_IP6), + TCLASS_MATCH_MASK_DST_ADDR_IP6 = BIT(TCLASS_MATCH_DST_ADDR_IP6), + TCLASS_MATCH_MASK_TCLASS = BIT(TCLASS_MATCH_TCLASS), + TCLASS_MATCH_MASK_MAX = BIT(TCLASS_MATCH_MAX), +}; + +#define TCLASS_MAX_RULES 40 +#define TCLASS_MAX_CMD 100 + +struct mlx5_tc_data { + struct tclass_match rule[TCLASS_MAX_RULES]; + struct mutex lock; + bool initialized; + int val; + struct kobject kobj; + struct mlx5_ib_dev *ibdev; +}; + +int init_tc_sysfs(struct mlx5_ib_dev *dev); +void cleanup_tc_sysfs(struct mlx5_ib_dev *dev); +void tclass_get_tclass(struct mlx5_ib_dev *dev, + struct mlx5_tc_data *tcd, + const struct rdma_ah_attr *ah, + u8 port, + u8 *tclass, + bool *global_tc); + +/* DC_cnak feature */ + +#define MLX5_DC_CONNECT_QP_DEPTH 8192 +#define MLX5_IB_QPT_SW_CNAK IB_QPT_RESERVED5 + +enum { + MLX5_DCT_CS_RES_64 = 2, + MLX5_CNAK_RX_POLL_CQ_QUOTA = 256, +}; + +struct mlx5_ib_dev; + +struct mlx5_dc_tracer { + struct page *pg; + dma_addr_t dma; + int size; + int order; +}; + +struct mlx5_dc_desc { + dma_addr_t dma; + void *buf; +}; + +enum mlx5_op { + MLX5_WR_OP_MLX = 1, +}; + +struct mlx5_mlx_wr { + u8 sl; + u16 dlid; + int icrc; +}; + +struct mlx5_send_wr { + struct ib_send_wr wr; + union { + struct mlx5_mlx_wr mlx; + } sel; +}; + +struct mlx5_dc_stats { + struct kobject kobj; + struct mlx5_ib_dev *dev; + int port; + atomic64_t connects; + atomic64_t cnaks; + atomic64_t discards; + int *rx_scatter; + int initialized; +}; + +struct mlx5_dc_data { + struct ib_mr *mr; + struct ib_qp *dcqp; + struct ib_cq *rcq; + struct ib_cq *scq; + unsigned int rx_npages; + unsigned int tx_npages; + struct mlx5_dc_desc *rxdesc; + struct mlx5_dc_desc *txdesc; + unsigned int max_wqes; + unsigned int cur_send; + unsigned int last_send_completed; + int tx_pending; + struct mlx5_ib_dev *dev; + int port; + int initialized; + int index; + int tx_signal_factor; + struct ib_wc wc_tbl[MLX5_CNAK_RX_POLL_CQ_QUOTA]; +}; + +int mlx5_ib_mmap_dc_info_page(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma); +int mlx5_ib_init_dc_improvements(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_dc_improvements(struct mlx5_ib_dev *dev); + +void 
mlx5_ib_set_mlx_seg(struct mlx5_mlx_seg *seg, struct mlx5_mlx_wr *wr); +#endif /*MLX5_IB_EXT_H*/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_nvmf.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_nvmf.h new file mode 100644 index 0000000..89f11d7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mlx5_ib_nvmf.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX5_IB_EXP_H +#define MLX5_IB_EXP_H + +#include +#include +#include "srq.h" + +int mlx5_ib_set_qp_offload_type(void *qpc, struct ib_qp *qp, + enum ib_qp_offload_type offload_type); + +int mlx5_ib_exp_set_nvmf_srq_attrs(struct mlx5_nvmf_attr *nvmf, + struct ib_srq_init_attr *init_attr); +int mlx5_ib_set_qp_srqn(void *qpc, struct ib_qp *qp, + u32 srqn); +void mlx5_ib_internal_fill_nvmf_caps(struct mlx5_ib_dev *dev); +struct mlx5_ib_nvmf_be_ctrl { + struct ib_nvmf_ctrl ibctrl; + struct mlx5_core_nvmf_be_ctrl mctrl; +}; + +struct mlx5_ib_nvmf_ns { + struct ib_nvmf_ns ibns; + struct mlx5_core_nvmf_ns mns; +}; + +static inline struct mlx5_ib_nvmf_be_ctrl * +to_mibctrl(struct mlx5_core_nvmf_be_ctrl *mctrl) +{ + return container_of(mctrl, struct mlx5_ib_nvmf_be_ctrl, mctrl); +} + +static inline struct mlx5_ib_nvmf_be_ctrl *to_mctrl(struct ib_nvmf_ctrl *ibctrl) +{ + return container_of(ibctrl, struct mlx5_ib_nvmf_be_ctrl, ibctrl); +} + +static inline struct mlx5_ib_nvmf_ns *to_mns(struct ib_nvmf_ns *ibns) +{ + return container_of(ibns, struct mlx5_ib_nvmf_ns, ibns); +} + +struct ib_nvmf_ctrl *mlx5_ib_create_nvmf_backend_ctrl(struct ib_srq *srq, + struct ib_nvmf_backend_ctrl_init_attr *init_attr); +int mlx5_ib_destroy_nvmf_backend_ctrl(struct ib_nvmf_ctrl *ctrl); +struct ib_nvmf_ns *mlx5_ib_attach_nvmf_ns(struct ib_nvmf_ctrl *ctrl, + struct ib_nvmf_ns_init_attr *init_attr); +int mlx5_ib_detach_nvmf_ns(struct ib_nvmf_ns *ns); +int mlx5_ib_query_nvmf_ns(struct ib_nvmf_ns *ns, + struct ib_nvmf_ns_attr *ns_attr); + + + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mr.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mr.c new file mode 100644 index 0000000..edcb3f2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/mr.c @@ -0,0 +1,3109 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dm.h" +#include "mlx5_ib.h" +#include "ib_rep.h" + +/* + * We can't use an array for xlt_emergency_page because dma_map_single doesn't + * work on kernel modules memory + */ +void *xlt_emergency_page; +static DEFINE_MUTEX(xlt_emergency_page_mutex); +static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv); + +enum { + MAX_PENDING_REG_MR = 8, + MAX_MR_RELEASE_TIMEOUT = (60 * 20) /* Allow release timeout up to 20 min */ +}; + +#define MLX5_UMR_ALIGN 2048 + +static void +create_mkey_callback(int status, struct mlx5_async_work *context); +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, + u64 iova, int access_flags, + unsigned int page_size, bool populate); + +static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, + struct ib_pd *pd) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + + MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) + MLX5_SET(mkc, mkc, relaxed_ordering_write, + !!(acc & IB_ACCESS_RELAXED_ORDERING)); + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) + MLX5_SET(mkc, mkc, relaxed_ordering_read, + !!(acc & IB_ACCESS_RELAXED_ORDERING)); + + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, start_addr); +} + +static void assign_mkey_variant(struct mlx5_ib_dev *dev, + struct mlx5_ib_mkey *mkey, u32 *in) +{ + u8 key = atomic_inc_return(&dev->mkey_var); + void *mkc; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, mkey_7_0, key); + mkey->key = key; +} + +static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, + struct mlx5_ib_mkey *mkey, u32 *in, int inlen) +{ + int ret; + + assign_mkey_variant(dev, mkey, in); + ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); + if (!ret) + init_waitqueue_head(&mkey->wait); + + return ret; +} + +static int +mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, + struct mlx5_ib_mkey *mkey, + struct mlx5_async_ctx *async_ctx, + u32 *in, int inlen, u32 *out, int outlen, + struct mlx5_async_work *context) +{ + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + assign_mkey_variant(dev, mkey, in); + return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, + create_mkey_callback, context); +} + + +static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev); +static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev); + +static int mr_cache_max_order(struct mlx5_ib_dev *dev); +static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); + +static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) +{ + return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); +} + +static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); + + return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); +} + +static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) +{ + if (status == -ENXIO) /* core driver is not available */ + return; + + mlx5_ib_warn(dev, "async reg mr failed. 
status %d\n", status); + if (status != -EREMOTEIO) /* driver specific failure */ + return; + + /* Failed in FW, print cmd out failure details */ + mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); +} + +static void create_mkey_callback(int status, struct mlx5_async_work *context) +{ + struct mlx5_ib_mr *mr = + container_of(context, struct mlx5_ib_mr, cb_work); + struct mlx5_cache_ent *ent = mr->cache_ent; + struct mlx5_ib_dev *dev = ent->dev; + unsigned long flags; + + if (status) { + create_mkey_warn(dev, status, mr->out); + kfree(mr); + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + WRITE_ONCE(dev->fill_delay, 1); + spin_unlock_irqrestore(&ent->lock, flags); + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + mr->mmkey.type = MLX5_MKEY_MR; + mr->mmkey.key |= mlx5_idx_to_mkey( + MLX5_GET(create_mkey_out, mr->out, mkey_index)); + init_waitqueue_head(&mr->mmkey.wait); + + WRITE_ONCE(dev->cache.last_add, jiffies); + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->available_mrs++; + ent->total_mrs++; + /* If we are doing fill_to_high_water then keep going. */ + queue_adjust_cache_locked(ent); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); +} + +static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) +{ + struct mlx5_ib_mr *mr; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return NULL; + mr->cache_ent = ent; + + set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); + + MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); + MLX5_SET(mkc, mkc, log_page_size, ent->page); + return mr; +} + +/* Asynchronously schedule new MRs to be populated in the cache. 
*/ +static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) +{ + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err = 0; + int i; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + for (i = 0; i < num; i++) { + mr = alloc_cache_mr(ent, mkc); + if (!mr) { + err = -ENOMEM; + break; + } + spin_lock_irq(&ent->lock); + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + spin_unlock_irq(&ent->lock); + kfree(mr); + break; + } + ent->pending++; + spin_unlock_irq(&ent->lock); + err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, + &ent->dev->async_ctx, in, inlen, + mr->out, sizeof(mr->out), + &mr->cb_work); + if (err) { + spin_lock_irq(&ent->lock); + ent->pending--; + spin_unlock_irq(&ent->lock); + mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); + kfree(mr); + break; + } + } + + kfree(in); + return err; +} + +/* Synchronously create a MR in the cache */ +static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) +{ + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + mr = alloc_cache_mr(ent, mkc); + if (!mr) { + err = -ENOMEM; + goto free_in; + } + + err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen); + if (err) + goto free_mr; + + init_waitqueue_head(&mr->mmkey.wait); + mr->mmkey.type = MLX5_MKEY_MR; + WRITE_ONCE(ent->dev->cache.last_add, jiffies); + spin_lock_irq(&ent->lock); + ent->total_mrs++; + spin_unlock_irq(&ent->lock); + kfree(in); + return mr; +free_mr: + kfree(mr); +free_in: + kfree(in); + return ERR_PTR(err); +} + +static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_mr *mr; + + lockdep_assert_held(&ent->lock); + if (list_empty(&ent->head)) + return; + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->available_mrs--; + ent->total_mrs--; + spin_unlock_irq(&ent->lock); + mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key); + kfree(mr); + spin_lock_irq(&ent->lock); +} + +static bool someone_adding(struct mlx5_mr_cache *cache) +{ + unsigned int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + struct mlx5_cache_ent *ent = &cache->ent[i]; + bool ret; + + spin_lock_irq(&ent->lock); + ret = ent->available_mrs < ent->limit; + spin_unlock_irq(&ent->lock); + if (ret) + return true; + } + return false; +} + +/* + * Check if the bucket is outside the high/low water mark and schedule an async + * update. The cache refill has hysteresis, once the low water mark is hit it is + * refilled up to the high mark. + */ +static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) +{ + lockdep_assert_held(&ent->lock); + + if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) + return; + if (ent->available_mrs < ent->limit) { + ent->fill_to_high_water = true; + queue_work(ent->dev->cache.wq, &ent->work); + } else if (ent->fill_to_high_water && + ent->available_mrs + ent->pending < 2 * ent->limit) { + /* + * Once we start populating due to hitting a low water mark + * continue until we pass the high water mark. 
+ */ + queue_work(ent->dev->cache.wq, &ent->work); + } else if (ent->available_mrs == 2 * ent->limit) { + ent->fill_to_high_water = false; + } else if (ent->available_mrs > 2 * ent->limit) { + /* Queue deletion of excess entries */ + ent->fill_to_high_water = false; + if (ent->pending) { + cancel_delayed_work(&ent->dwork); + queue_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(1000)); + } + else + queue_work(ent->dev->cache.wq, &ent->work); + } +} + +static int someone_releasing(struct mlx5_mr_cache *cache) +{ + int i; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].available_mrs > 2 * cache->ent[i].limit) + return 1; + } + + return 0; +} + +static void __cache_work_func(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_dev *dev = ent->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int err; + s64 dtime; + + spin_lock_irq(&ent->lock); + if (ent->disabled) + goto out; + + if (ent->fill_to_high_water && + ent->available_mrs + ent->pending < 2 * ent->limit && + !READ_ONCE(dev->fill_delay)) { + spin_unlock_irq(&ent->lock); + err = add_keys(ent, 1); + spin_lock_irq(&ent->lock); + if (ent->disabled) + goto out; + if (err) { + /* + * EAGAIN only happens if pending is positive, so we + * will be rescheduled from reg_mr_callback(). The only + * failure path here is ENOMEM. + */ + if (err != -EAGAIN) { + mlx5_ib_warn( + dev, + "command failed order %d, err %d\n", + ent->order, err); + cancel_delayed_work(&ent->dwork); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } + } + } else if (ent->available_mrs > 2 * ent->limit) { + bool need_delay; + + /* + * The remove_cache_mr() logic is performed as garbage + * collection task. Such task is intended to be run when no + * other active processes are running. + * + * The need_resched() will return TRUE if there are user tasks + * to be activated in near future. + * + * In such case, we don't execute remove_cache_mr() and postpone + * the garbage collection work to try to run in next cycle, in + * order to free CPU resources to other tasks. 
+ */ + spin_unlock_irq(&ent->lock); + dtime = (cache->last_add + (s64)cache->rel_timeout * HZ) - + jiffies; + need_delay = !(cache->rel_imm || (!need_resched() && + cache->rel_timeout >= 0 && !someone_adding(cache) && + dtime <= 0)); + + spin_lock_irq(&ent->lock); + if (ent->disabled) + goto out; + if (need_delay && cache->rel_timeout >= 0) { + dtime = max_t(s64, dtime, 0); + dtime = min_t(s64, dtime, (MAX_MR_RELEASE_TIMEOUT * HZ)); + cancel_delayed_work(&ent->dwork); + queue_delayed_work(cache->wq, &ent->dwork, dtime); + goto out; + } + remove_cache_mr_locked(ent); + queue_adjust_cache_locked(ent); + } else if (cache->rel_imm && !someone_releasing(cache)) { + cache->rel_imm = 0; + } +out: + spin_unlock_irq(&ent->lock); +} + +static void delayed_cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, dwork.work); + __cache_work_func(ent); +} + +static void cache_work_func(struct work_struct *work) +{ + struct mlx5_cache_ent *ent; + + ent = container_of(work, struct mlx5_cache_ent, work); + __cache_work_func(ent); +} + +/* Allocate a special entry from the cache */ +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + unsigned int entry, int access_flags) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + struct mlx5_ib_mr *mr; + + if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY || + entry >= ARRAY_SIZE(cache->ent))) + return ERR_PTR(-EINVAL); + + /* Matches access in alloc_cache_mr() */ + if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) + return ERR_PTR(-EOPNOTSUPP); + + ent = &cache->ent[entry]; + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + queue_adjust_cache_locked(ent); + ent->miss++; + spin_unlock_irq(&ent->lock); + mr = create_cache_mr(ent); + if (IS_ERR(mr)) + return mr; + } else { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->available_mrs--; + queue_adjust_cache_locked(ent); + spin_unlock_irq(&ent->lock); + + mlx5_clear_mr(mr); + } + mr->access_flags = access_flags; + return mr; +} + +static u32 max_order_to_search(u32 order) +{ + if (order <= 3) + return 3; + if (order <= 5) + return 2; + if (order <= 10) + return 1; + return 0; +} + +/* Return a MR already available in the cache */ +static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent) +{ + u32 max_order = max_order_to_search(req_ent->order); + struct mlx5_ib_dev *dev = req_ent->dev; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent = req_ent; + int i; + + /* Try larger MR pools from the cache to satisfy the allocation */ + for (i = 0; i <= max_order && + ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; + ent++, i++) { + mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order, + ent - dev->cache.ent); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->available_mrs--; + queue_adjust_cache_locked(ent); + spin_unlock_irq(&ent->lock); + mlx5_clear_mr(mr); + return mr; + } + queue_adjust_cache_locked(ent); + spin_unlock_irq(&ent->lock); + } + req_ent->miss++; + return NULL; +} + +static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_cache_ent *ent = mr->cache_ent; + + WRITE_ONCE(dev->cache.last_add, jiffies); + spin_lock_irq(&ent->lock); + list_add_tail(&mr->list, &ent->head); + ent->available_mrs++; + queue_adjust_cache_locked(ent); + spin_unlock_irq(&ent->lock); +} + 
+static void clean_keys(struct mlx5_ib_dev *dev, int c) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_ib_mr *tmp_mr; + struct mlx5_ib_mr *mr; + LIST_HEAD(del_list); + + cancel_delayed_work(&ent->dwork); + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + break; + } + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_move(&mr->list, &del_list); + ent->available_mrs--; + ent->total_mrs--; + spin_unlock_irq(&ent->lock); + mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); + } + + list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { + list_del(&mr->list); + kfree(mr); + } +} + +static void delay_time_func(struct timer_list *t) +{ + struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); + + WRITE_ONCE(dev->fill_delay, 0); +} + +int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + int i; + + mutex_init(&dev->slow_path_mutex); + cache->rel_timeout = 300; + cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); + if (!cache->wq) { + mlx5_ib_warn(dev, "failed to create work queue\n"); + return -ENOMEM; + } + + mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); + timer_setup(&dev->delay_timer, delay_time_func, 0); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + INIT_LIST_HEAD(&ent->head); + spin_lock_init(&ent->lock); + ent->order = i + 2; + ent->dev = dev; + ent->limit = 0; + + INIT_WORK(&ent->work, cache_work_func); + INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); + + if (i > MR_CACHE_LAST_STD_ENTRY) { + mlx5_odp_init_mr_cache_entry(ent); + continue; + } + + if (ent->order > mr_cache_max_order(dev)) + continue; + + ent->page = PAGE_SHIFT; + ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && + !dev->is_rep && mlx5_core_is_pf(dev->mdev) && + mlx5_ib_can_load_pas_with_umr(dev, 0)) + ent->limit = dev->mdev->profile.mr_cache[i].limit; + else + ent->limit = 0; + spin_lock_irq(&ent->lock); + queue_adjust_cache_locked(ent); + spin_unlock_irq(&ent->lock); + } + + mlx5_mr_sysfs_init(dev); + + return 0; +} + +int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +{ + unsigned int i; + + if (!dev->cache.wq) + return 0; + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + struct mlx5_cache_ent *ent = &dev->cache.ent[i]; + + spin_lock_irq(&ent->lock); + ent->disabled = true; + spin_unlock_irq(&ent->lock); + cancel_work_sync(&ent->work); + cancel_delayed_work_sync(&ent->dwork); + } + + mlx5_mr_sysfs_cleanup(dev); + mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); + + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + clean_keys(dev, i); + + destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); + + return 0; +} + +struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, length64, 1); + set_mkc_access_pd_addr_fields(mkc, acc | 
IB_ACCESS_RELAXED_ORDERING, 0, + pd); + +#ifdef CONFIG_GPU_DIRECT_STORAGE + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); +#endif + + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_in; + + kfree(in); + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->umem = NULL; + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +static int get_octo_len(u64 addr, u64 len, int page_shift) +{ + u64 page_size = 1ULL << page_shift; + u64 offset; + int npages; + + offset = addr & (page_size - 1); + npages = ALIGN(len + offset, page_size) >> page_shift; + return (npages + 1) / 2; +} + +static int mr_cache_max_order(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + return MR_CACHE_LAST_STD_ENTRY + 2; + return MLX5_MAX_UMR_SHIFT; +} + +static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); + + context->status = wc->status; + complete(&context->done); +} + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->cqe.done = mlx5_ib_umr_done; + context->status = -1; + init_completion(&context->done); +} + +static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, + struct mlx5_umr_wr *umrwr) +{ + struct umr_common *umrc = &dev->umrc; + const struct ib_send_wr *bad; + int err; + struct mlx5_ib_umr_context umr_context; + + mlx5_ib_init_umr_context(&umr_context); + umrwr->wr.wr_cqe = &umr_context.cqe; + + down(&umrc->sem); + while (true) { + mutex_lock(&umrc->lock); + if (umrc->state == MLX5_UMR_STATE_ERR) { + mutex_unlock(&umrc->lock); + err = -EFAULT; + break; + } + + if (umrc->state == MLX5_UMR_STATE_RECOVER) { + mutex_unlock(&umrc->lock); + usleep_range(3000, 5000); + continue; + } + + err = ib_post_send(umrc->qp, &umrwr->wr, &bad); + mutex_unlock(&umrc->lock); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); + break; + } + + wait_for_completion(&umr_context.done); + + if (umr_context.status == IB_WC_SUCCESS) + break; + + if (umr_context.status == IB_WC_WR_FLUSH_ERR) + continue; + + WARN_ON_ONCE(1); + mlx5_ib_warn(dev, + "reg umr failed (%u). 
Trying to recover and resubmit the flushed WQEs\n", + umr_context.status); + mutex_lock(&umrc->lock); + err = mlx5r_umr_recover(dev); + mutex_unlock(&umrc->lock); + if (err) + mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", err); + err = -EFAULT; + break; + } + up(&umrc->sem); + return err; +} + +static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, + unsigned int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return &cache->ent[0]; + order = order - cache->ent[0].order; + if (order > MR_CACHE_LAST_STD_ENTRY) + return NULL; + return &cache->ent[order]; +} + +static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + u64 length, int access_flags, u64 iova) +{ + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->ibmr.length = length; + mr->ibmr.device = &dev->ib_dev; + mr->ibmr.iova = iova; + mr->access_flags = access_flags; +} + +static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, + u64 iova) +{ + /* + * The alignment of iova has already been checked upon entering + * UVERBS_METHOD_REG_DMABUF_MR + */ + umem->iova = iova; + return PAGE_SIZE; +} + +static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, + struct ib_umem *umem, u64 iova, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_cache_ent *ent; + struct mlx5_ib_mr *mr; + unsigned int page_size; + + if (umem->is_dmabuf) + page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); + else + page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, + 0, iova); + if (WARN_ON(!page_size)) + return ERR_PTR(-EINVAL); + ent = mr_cache_ent_from_order( + dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); + /* + * Matches access in alloc_cache_mr(). If the MR can't come from the + * cache then synchronously create an uncached one. + */ + if (!ent || ent->limit == 0 || + !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags) || + mlx5_umem_needs_ats(dev, umem, access_flags)) { + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mutex_unlock(&dev->slow_path_mutex); + return mr; + } + + mr = get_cache_mr(ent); + if (!mr) { + mr = create_cache_mr(ent); + /* + * The above already tried to do the same stuff as reg_create(), + * no reason to try it again. + */ + if (IS_ERR(mr)) + return mr; + } + + mr->ibmr.pd = pd; + mr->umem = umem; + mr->page_shift = order_base_2(page_size); + set_mr_fields(dev, mr, umem->length, access_flags, iova); + + return mr; +} + +#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ + MLX5_UMR_MTT_ALIGNMENT) +#define MLX5_SPARE_UMR_CHUNK 0x10000 + +/* + * Allocate a temporary buffer to hold the per-page information to transfer to + * HW. For efficiency this should be as large as it can be, but buffer + * allocation failure is not allowed, so try smaller sizes. + */ +static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask) +{ + const size_t xlt_chunk_align = + MLX5_UMR_MTT_ALIGNMENT / ent_size; + size_t size; + void *res = NULL; + + static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0); + + /* + * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the + * allocation can't trigger any kind of reclaim. + */ + might_sleep(); + + gfp_mask |= __GFP_ZERO | __GFP_NORETRY; + + /* + * If the system already has a suitable high order page then just use + * that, but don't try hard to create one. 
This max is about 1M, so a + * free x86 huge page will satisfy it. + */ + size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align), + MLX5_MAX_UMR_CHUNK); + *nents = size / ent_size; + res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, + get_order(size)); + if (res) + return res; + + if (size > MLX5_SPARE_UMR_CHUNK) { + size = MLX5_SPARE_UMR_CHUNK; + *nents = size / ent_size; + res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, + get_order(size)); + if (res) + return res; + } + + *nents = PAGE_SIZE / ent_size; + res = (void *)__get_free_page(gfp_mask); + if (res) + return res; + + mutex_lock(&xlt_emergency_page_mutex); + memset(xlt_emergency_page, 0, PAGE_SIZE); + return xlt_emergency_page; +} + +static void mlx5_ib_free_xlt(void *xlt, size_t length) +{ + if (xlt == xlt_emergency_page) { + mutex_unlock(&xlt_emergency_page_mutex); + return; + } + + free_pages((unsigned long)xlt, get_order(length)); +} + +/* + * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for + * submission. + */ +static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr, + struct mlx5_umr_wr *wr, struct ib_sge *sg, + size_t nents, size_t ent_size, + unsigned int flags) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct device *ddev = &dev->mdev->pdev->dev; + dma_addr_t dma; + void *xlt; + + xlt = mlx5_ib_alloc_xlt(&nents, ent_size, + flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : + GFP_KERNEL); + sg->length = nents * ent_size; + dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); + mlx5_ib_free_xlt(xlt, sg->length); + return NULL; + } + sg->addr = dma; + sg->lkey = dev->umrc.pd->local_dma_lkey; + + memset(wr, 0, sizeof(*wr)); + wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; + if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) + wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr->wr.sg_list = sg; + wr->wr.num_sge = 1; + wr->wr.opcode = MLX5_IB_WR_UMR; + wr->pd = mr->ibmr.pd; + wr->mkey = mr->mmkey.key; + wr->length = mr->ibmr.length; + wr->virt_addr = mr->ibmr.iova; + wr->access_flags = mr->access_flags; + wr->page_shift = mr->page_shift; + wr->xlt_size = sg->length; + return xlt; +} + +static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt, + struct ib_sge *sg) +{ + struct device *ddev = &dev->mdev->pdev->dev; + + dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE); + mlx5_ib_free_xlt(xlt, sg->length); +} + +static unsigned int xlt_wr_final_send_flags(unsigned int flags) +{ + unsigned int res = 0; + + if (flags & MLX5_IB_UPD_XLT_ENABLE) + res |= MLX5_IB_SEND_UMR_ENABLE_MR | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS) + res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + if (flags & MLX5_IB_UPD_XLT_ADDR) + res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + return res; +} + +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct device *ddev = &dev->mdev->pdev->dev; + void *xlt; + struct mlx5_umr_wr wr; + struct ib_sge sg; + int err = 0; + int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) + ? 
sizeof(struct mlx5_klm) + : sizeof(struct mlx5_mtt); + const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; + const int page_mask = page_align - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter; + size_t size_to_map = 0; + size_t orig_sg_length; + + if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && + !umr_can_use_indirect_mkey(dev)) + return -EPERM; + + if (WARN_ON(!mr->umem->is_odp)) + return -EINVAL; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly + */ + if (idx & page_mask) { + npages += idx & page_mask; + idx &= ~page_mask; + } + pages_to_map = ALIGN(npages, page_align); + + xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags); + if (!xlt) + return -ENOMEM; + pages_iter = sg.length / desc_size; + orig_sg_length = sg.length; + + if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) { + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + size_t max_pages = ib_umem_odp_num_pages(odp) - idx; + + pages_to_map = min_t(size_t, pages_to_map, max_pages); + } + + wr.page_shift = page_shift; + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, idx += pages_iter) { + npages = min_t(int, pages_iter, pages_to_map - pages_mapped); + size_to_map = npages * desc_size; + dma_sync_single_for_cpu(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); + dma_sync_single_for_device(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + + sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT); + + if (pages_mapped + pages_iter >= pages_to_map) + wr.wr.send_flags |= xlt_wr_final_send_flags(flags); + + wr.offset = idx * desc_size; + wr.xlt_size = sg.length; + + err = mlx5_ib_post_send_wait(dev, &wr); + } + sg.length = orig_sg_length; + mlx5_ib_unmap_free_xlt(dev, xlt, &sg); + return err; +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. 
+ */ +int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct device *ddev = &dev->mdev->pdev->dev; + struct ib_block_iter biter; + struct mlx5_mtt *cur_mtt; + struct mlx5_umr_wr wr; + size_t orig_sg_length; + struct mlx5_mtt *mtt; + size_t final_size; + struct ib_sge sg; + int err = 0; + + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, + ib_umem_num_dma_blocks(mr->umem, + 1 << mr->page_shift), + sizeof(*mtt), flags); + if (!mtt) + return -ENOMEM; + orig_sg_length = sg.length; + + cur_mtt = mtt; + rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter, + mr->umem->sgt_append.sgt.nents, + BIT(mr->page_shift)) { + if (cur_mtt == (void *)mtt + sg.length) { + dma_sync_single_for_device(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + err = mlx5_ib_post_send_wait(dev, &wr); + if (err) + goto err; + dma_sync_single_for_cpu(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + wr.offset += sg.length; + cur_mtt = mtt; + } + + cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + + cur_mtt++; + } + + final_size = (void *)cur_mtt - (void *)mtt; + sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT); + memset(cur_mtt, 0, sg.length - final_size); + wr.wr.send_flags |= xlt_wr_final_send_flags(flags); + wr.xlt_size = sg.length; + + dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); + err = mlx5_ib_post_send_wait(dev, &wr); + +err: + sg.length = orig_sg_length; + mlx5_ib_unmap_free_xlt(dev, mtt, &sg); + return err; +} + +/* + * If ibmr is NULL it will be allocated by reg_create. + * Else, the given ibmr will be used. + */ +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, + u64 iova, int access_flags, + unsigned int page_size, bool populate) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + __be64 *pas; + void *mkc; + int inlen; + u32 *in; + int err; + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + + if (!page_size) + return ERR_PTR(-EINVAL); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + mr->page_shift = order_base_2(page_size); + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + if (populate) + inlen += sizeof(*pas) * + roundup(ib_umem_num_dma_blocks(umem, page_size), 2); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + if (populate) { + if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { + err = -EINVAL; + goto err_2; + } + mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); + } + + /* The pg_access bit allows setting the access flags + * in the page list submitted with the command. */ + MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + set_mkc_access_pd_addr_fields(mkc, access_flags, iova, + populate ? 
pd : dev->umrc.pd); + MLX5_SET(mkc, mkc, free, !populate); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, umr_en, 1); + + MLX5_SET64(mkc, mkc, len, umem->length); + MLX5_SET(mkc, mkc, bsf_octword_size, 0); + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift)); + MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); + + if (mlx5_umem_needs_ats(dev, umem, access_flags)) + MLX5_SET(mkc, mkc, ma_translation_mode, + MLX5_CAP_GEN(dev->mdev, ats)); + if (populate) { + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + get_octo_len(iova, umem->length, mr->page_shift)); + } + + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) { + mlx5_ib_warn(dev, "create mkey failed\n"); + goto err_2; + } + mr->mmkey.type = MLX5_MKEY_MR; + mr->umem = umem; + set_mr_fields(dev, mr, umem->length, access_flags, iova); + kvfree(in); + + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); + + return mr; + +err_2: + kvfree(in); +err_1: + kfree(mr); + return ERR_PTR(err); +} + +static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, + u64 length, int acc, int mode) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); + MLX5_SET64(mkc, mkc, len, length); + set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); + + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_in; + + kfree(in); + + set_mr_fields(dev, mr, length, acc, start_addr); + + return &mr->ibmr; + +err_in: + kfree(in); + +err_free: + kfree(mr); + + return ERR_PTR(err); +} + +int mlx5_ib_advise_mr(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, + struct ib_sge *sg_list, + u32 num_sge, + struct uverbs_attr_bundle *attrs) +{ + if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && + advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && + advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) + return -EOPNOTSUPP; + + return mlx5_ib_advise_mr_prefetch(pd, advice, flags, + sg_list, num_sge); +} + +struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, + struct ib_dm_mr_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dm *mdm = to_mdm(dm); + struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; + u64 start_addr = mdm->dev_addr + attr->offset; + int mode; + + switch (mdm->type) { + case MLX5_IB_UAPI_DM_TYPE_MEMIC: + if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) + return ERR_PTR(-EINVAL); + + mode = MLX5_MKC_ACCESS_MODE_MEMIC; + start_addr -= pci_resource_start(dev->pdev, 0); + break; + case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: + case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM: + if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) + return ERR_PTR(-EINVAL); + + mode = MLX5_MKC_ACCESS_MODE_SW_ICM; + break; + default: + return ERR_PTR(-EINVAL); + } + + return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, + attr->access_flags, mode); +} + +static struct ib_mr *create_real_mr(struct ib_pd *pd, 
struct ib_umem *umem, + u64 iova, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + bool xlt_with_umr; + int err; + + xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length); + if (xlt_with_umr && !umem->is_peer) { + mr = alloc_cacheable_mr(pd, umem, iova, access_flags); + } else { + unsigned int page_size = mlx5_umem_find_best_pgsz( + umem, mkc, log_page_size, 0, iova); + + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(pd, umem, iova, access_flags, page_size, + !xlt_with_umr); + mutex_unlock(&dev->slow_path_mutex); + } + if (IS_ERR(mr)) { + if (umem->is_peer) + ib_umem_stop_invalidation_notifier(umem); + ib_umem_release(umem); + return ERR_CAST(mr); + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); + + atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); + + if (xlt_with_umr) { + /* + * If the MR was created with reg_create then it will be + * configured properly but left disabled. It is safe to go ahead + * and configure it again via UMR while enabling it. + */ + err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); + if (err) { + mlx5_ib_dereg_mr(&mr->ibmr, NULL); + return ERR_PTR(err); + } + } + + if (umem->is_peer) + ib_umem_activate_invalidation_notifier( + umem, mlx5_invalidate_umem, mr); + + return &mr->ibmr; +} + +static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; + struct mlx5_ib_mr *mr; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return ERR_PTR(-EOPNOTSUPP); + + err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq); + if (err) + return ERR_PTR(err); + if (!start && length == U64_MAX) { + if (iova != 0) + return ERR_PTR(-EINVAL); + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return ERR_PTR(-EINVAL); + + mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); + if (IS_ERR(mr)) + return ERR_CAST(mr); + return &mr->ibmr; + } + + /* ODP requires xlt update via umr to work. 
*/ + if (!mlx5_ib_can_load_pas_with_umr(dev, length)) + return ERR_PTR(-EINVAL); + + odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, + &mlx5_mn_ops); + if (IS_ERR(odp)) + return ERR_CAST(odp); + + mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); + if (IS_ERR(mr)) { + ib_umem_release(&odp->umem); + return ERR_CAST(mr); + } + xa_init(&mr->implicit_children); + + odp->private = mr; + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + + err = mlx5_ib_init_odp_mr(mr); + if (err) + goto err_dereg_mr; + return &mr->ibmr; + +err_dereg_mr: + mlx5_ib_dereg_mr(&mr->ibmr, NULL); + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem *umem; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + return ERR_PTR(-EOPNOTSUPP); + + mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, iova, length, access_flags); + + if ((access_flags & IB_ACCESS_ON_DEMAND) && (dev->profile != &raw_eth_profile)) + return create_user_odp_mr(pd, start, length, iova, access_flags, + udata); + umem = ib_umem_get_peer(&dev->ib_dev, start, length, access_flags, + IB_PEER_MEM_INVAL_SUPP); + if (IS_ERR(umem)) + return ERR_CAST(umem); + return create_real_mr(pd, umem, iova, access_flags); +} + +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) +{ + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; + struct mlx5_ib_mr *mr = umem_dmabuf->private; + + dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + + if (!umem_dmabuf->sgt) + return; + + mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); + ib_umem_dmabuf_unmap_pages(umem_dmabuf); +} + +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { + .allow_peer2peer = 1, + .move_notify = mlx5_ib_dmabuf_invalidate_cb, +}; + +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem_dmabuf *umem_dmabuf; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return ERR_PTR(-EOPNOTSUPP); + + mlx5_ib_dbg(dev, + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", + offset, virt_addr, length, fd, access_flags); + + /* dmabuf requires xlt update via umr to work. 
*/ + if (!mlx5_ib_can_load_pas_with_umr(dev, length)) + return ERR_PTR(-EINVAL); + + umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, + access_flags, + &mlx5_ib_dmabuf_attach_ops); + if (IS_ERR(umem_dmabuf)) { + mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", + PTR_ERR(umem_dmabuf)); + return ERR_CAST(umem_dmabuf); + } + + mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, + access_flags); + if (IS_ERR(mr)) { + ib_umem_release(&umem_dmabuf->umem); + return ERR_CAST(mr); + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); + + atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); + umem_dmabuf->private = mr; + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + + err = mlx5_ib_init_dmabuf_mr(mr); + if (err) + goto err_dereg_mr; + return &mr->ibmr; + +err_dereg_mr: + mlx5_ib_dereg_mr(&mr->ibmr, NULL); + return ERR_PTR(err); +} + +/** + * revoke_mr - Fence all DMA on the MR + * @mr: The MR to fence + * + * Upon return the NIC will not be doing any DMA to the pages under the MR, + * and any DMA in progress will be completed. Failure of this function + * indicates the HW has failed catastrophically. + */ +static int revoke_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_umr_wr umrwr = {}; + + if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.pd = mr_to_mdev(mr)->umrc.pd; + umrwr.mkey = mr->mmkey.key; + umrwr.ignore_free_state = 1; + + return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr); +} + +/* + * True if the change in access flags can be done via UMR, only some access + * flags can be updated. + */ +static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, + unsigned int current_access_flags, + unsigned int target_access_flags) +{ + unsigned int diffs = current_access_flags ^ target_access_flags; + + if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) + return false; + return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags, + target_access_flags); +} + +static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct mlx5_umr_wr umrwr = { + .wr = { + .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS, + .opcode = MLX5_IB_WR_UMR, + }, + .mkey = mr->mmkey.key, + .pd = pd, + .access_flags = access_flags, + }; + int err; + + err = mlx5_ib_post_send_wait(dev, &umrwr); + if (err) + return err; + + mr->access_flags = access_flags; + return 0; +} + +static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, + struct ib_umem *new_umem, + int new_access_flags, u64 iova, + unsigned long *page_size) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + + /* We only track the allocated sizes of MRs from the cache */ + if (!mr->cache_ent) + return false; + if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length)) + return false; + + *page_size = + mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); + if (WARN_ON(!*page_size)) + return false; + return (1ULL << mr->cache_ent->order) >= + ib_umem_num_dma_blocks(new_umem, *page_size); +} + +static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, + int access_flags, int flags, struct ib_umem *new_umem, + u64 iova, unsigned long page_size) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + int 
upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; + struct ib_umem *old_umem = mr->umem; + int err; + + /* + * To keep everything simple the MR is revoked before we start to mess + * with it. This ensure the change is atomic relative to any use of the + * MR. + */ + err = revoke_mr(mr); + if (err) + return err; + + if (flags & IB_MR_REREG_PD) { + mr->ibmr.pd = pd; + upd_flags |= MLX5_IB_UPD_XLT_PD; + } + if (flags & IB_MR_REREG_ACCESS) { + mr->access_flags = access_flags; + upd_flags |= MLX5_IB_UPD_XLT_ACCESS; + } + + mr->ibmr.length = new_umem->length; + mr->ibmr.iova = iova; + mr->ibmr.length = new_umem->length; + mr->page_shift = order_base_2(page_size); + mr->umem = new_umem; + err = mlx5_ib_update_mr_pas(mr, upd_flags); + if (err) { + /* + * The MR is revoked at this point so there is no issue to free + * new_umem. + */ + mr->umem = old_umem; + return err; + } + + if (new_umem->is_peer) + ib_umem_activate_invalidation_notifier( + new_umem, mlx5_invalidate_umem, mr); + + atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); + ib_umem_release(old_umem); + atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); + return 0; +} + +struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 iova, int new_access_flags, + struct ib_pd *new_pd, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); + struct mlx5_ib_mr *mr = to_mmr(ib_mr); + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + return ERR_PTR(-EOPNOTSUPP); + + mlx5_ib_dbg( + dev, + "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", + start, iova, length, new_access_flags); + + if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) + return ERR_PTR(-EOPNOTSUPP); + + if (!(flags & IB_MR_REREG_ACCESS)) + new_access_flags = mr->access_flags; + if (!(flags & IB_MR_REREG_PD)) + new_pd = ib_mr->pd; + + if (!(flags & IB_MR_REREG_TRANS)) { + struct ib_umem *umem; + + /* Fast path for PD/access change */ + if (can_use_umr_rereg_access(dev, mr->access_flags, + new_access_flags)) { + err = umr_rereg_pd_access(mr, new_pd, new_access_flags); + if (err) + return ERR_PTR(err); + return NULL; + } + /* + * DM or ODP MR's don't have a normal umem so we can't re-use it. + * Peer umems cannot have their MR's changed once created due + * to races with invalidation. + */ + if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) || + mr->umem->is_peer) + goto recreate; + + /* + * Only one active MR can refer to a umem at one time, revoke + * the old MR before assigning the umem to the new one. + */ + err = revoke_mr(mr); + if (err) + return ERR_PTR(err); + umem = mr->umem; + mr->umem = NULL; + atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); + + return create_real_mr(new_pd, umem, mr->ibmr.iova, + new_access_flags); + } + + /* + * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does but + * the logic around releasing the umem is different, peer memory + * invalidation semantics are incompatible. 
+ */ + if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) || mr->umem->is_peer) + goto recreate; + + if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && + can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { + struct ib_umem *new_umem; + unsigned long page_size; + + new_umem = ib_umem_get_peer(&dev->ib_dev, start, length, + new_access_flags, + IB_PEER_MEM_INVAL_SUPP); + if (IS_ERR(new_umem)) + return ERR_CAST(new_umem); + + /* Fast path for PAS change */ + if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, + &page_size)) { + err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, + new_umem, iova, page_size); + if (err) { + ib_umem_release(new_umem); + return ERR_PTR(err); + } + return NULL; + } + return create_real_mr(new_pd, new_umem, iova, new_access_flags); + } + + /* + * Everything else has no state we can preserve, just create a new MR + * from scratch + */ +recreate: + return mlx5_ib_reg_user_mr(new_pd, start, length, iova, + new_access_flags, udata); +} + +static int +mlx5_alloc_priv_descs(struct ib_device *device, + struct mlx5_ib_mr *mr, + int ndescs, + int desc_size) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct device *ddev = &dev->mdev->pdev->dev; + int size = ndescs * desc_size; + int add_size; + int ret; + + add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); + + mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); + if (!mr->descs_alloc) + return -ENOMEM; + + mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); + + mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, mr->desc_map)) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + kfree(mr->descs_alloc); + + return ret; +} + +static void +mlx5_free_priv_descs(struct mlx5_ib_mr *mr) +{ + if (!mr->umem && mr->descs) { + struct ib_device *device = mr->ibmr.device; + int size = mr->max_descs * mr->desc_size; + struct mlx5_ib_dev *dev = to_mdev(device); + + dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, + DMA_TO_DEVICE); + kfree(mr->descs_alloc); + mr->descs = NULL; + } +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + int rc; + + /* + * Any async use of the mr must hold the refcount, once the refcount + * goes to zero no other thread, such as ODP page faults, prefetch, any + * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 
+ */ + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && + refcount_read(&mr->mmkey.usecount) != 0 && + xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) + mlx5r_deref_wait_odp_mkey(&mr->mmkey); + + if (ibmr->type == IB_MR_TYPE_INTEGRITY) { + xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), + mr->sig, NULL, GFP_KERNEL); + + if (mr->mtt_mr) { + rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); + if (rc) + return rc; + mr->mtt_mr = NULL; + } + if (mr->klm_mr) { + rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); + if (rc) + return rc; + mr->klm_mr = NULL; + } + + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + kfree(mr->sig); + mr->sig = NULL; + } + + /* Stop DMA */ + rc = 0; + if (mr->cache_ent || (mr->umem && mr->umem->is_peer)) { + rc = revoke_mr(mr); + if (mr->umem && mr->umem->is_peer) { + if (rc) + return rc; + ib_umem_stop_invalidation_notifier(mr->umem); + } + } + + if (mr->cache_ent && rc) { + spin_lock_irq(&mr->cache_ent->lock); + mr->cache_ent->total_mrs--; + spin_unlock_irq(&mr->cache_ent->lock); + mr->cache_ent = NULL; + } + + if (!mr->cache_ent) { + rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); + if (rc) + return rc; + } + + if (mr->umem) { + bool is_odp = is_odp_mr(mr); + + if (!is_odp) + atomic_sub(ib_umem_num_pages(mr->umem), + &dev->mdev->priv.reg_pages); + ib_umem_release(mr->umem); + if (is_odp) + mlx5_ib_free_odp_mr(mr); + } + + if (mr->cache_ent) { + mlx5_mr_cache_free(dev, mr); + } else { + mlx5_free_priv_descs(mr); + kfree(mr); + } + return 0; +} + +static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, + int access_mode, int page_shift) +{ + void *mkc; +#ifdef CONFIG_GPU_DIRECT_STORAGE + struct mlx5_ib_dev *dev = to_mdev(pd->device); +#endif + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + /* This is only used from the kernel, so setting the PD is OK. 
*/ + set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, log_page_size, page_shift); +#ifdef CONFIG_GPU_DIRECT_STORAGE + if (access_mode == MLX5_MKC_ACCESS_MODE_PA || + access_mode == MLX5_MKC_ACCESS_MODE_MTT) + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, + ats)); + else + pr_err_once("mlx5_ib: %s: Translation mode supported only when access_mode is MTT or PA\n", __func__); +#endif +} + +static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, int desc_size, int page_shift, + int access_mode, u32 *in, int inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int err; + + mr->access_mode = access_mode; + mr->desc_size = desc_size; + mr->max_descs = ndescs; + + err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); + if (err) + return err; + + mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); + + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_free_descs; + + mr->mmkey.type = MLX5_MKEY_MR; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + + return 0; + +err_free_descs: + mlx5_free_priv_descs(mr); + return err; +} + +static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, + u32 max_num_sg, u32 max_num_meta_sg, + int desc_size, int access_mode) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); + int page_shift = 0; + struct mlx5_ib_mr *mr; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->ibmr.pd = pd; + mr->ibmr.device = pd->device; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) + page_shift = PAGE_SHIFT; + + err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, + access_mode, in, inlen); + if (err) + goto err_free_in; + + mr->umem = NULL; + kfree(in); + + return mr; + +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, u32 *in, int inlen) +{ + return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), + PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, + inlen); +} + +static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int ndescs, u32 *in, int inlen) +{ + return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), + 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); +} + +static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, + int max_num_sg, int max_num_meta_sg, + u32 *in, int inlen) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + u32 psv_index[2]; + void *mkc; + int err; + + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) + return -ENOMEM; + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); + if (err) + goto err_free_sig; + + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, + 
sizeof(struct mlx5_klm), + MLX5_MKC_ACCESS_MODE_KLMS); + if (IS_ERR(mr->klm_mr)) { + err = PTR_ERR(mr->klm_mr); + goto err_destroy_psv; + } + mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, + sizeof(struct mlx5_mtt), + MLX5_MKC_ACCESS_MODE_MTT); + if (IS_ERR(mr->mtt_mr)) { + err = PTR_ERR(mr->mtt_mr); + goto err_free_klm_mr; + } + + /* Set bsf descriptors for mkey */ + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, bsf_en, 1); + MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); + + err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, + MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); + if (err) + goto err_free_mtt_mr; + + err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), + mr->sig, GFP_KERNEL)); + if (err) + goto err_free_descs; + return 0; + +err_free_descs: + destroy_mkey(dev, mr); + mlx5_free_priv_descs(mr); +err_free_mtt_mr: + mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); + mr->mtt_mr = NULL; +err_free_klm_mr: + mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); + mr->klm_mr = NULL; +err_destroy_psv: + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); +err_free_sig: + kfree(mr->sig); + + return err; +} + +static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, u32 max_num_sg, + u32 max_num_meta_sg) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + int ndescs = ALIGN(max_num_sg, 4); + struct mlx5_ib_mr *mr; + u32 *in; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_free; + } + + mr->ibmr.device = pd->device; + mr->umem = NULL; + + switch (mr_type) { + case IB_MR_TYPE_MEM_REG: + err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); + break; + case IB_MR_TYPE_SG_GAPS: + err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); + break; + case IB_MR_TYPE_INTEGRITY: + err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, + max_num_meta_sg, in, inlen); + break; + default: + mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); + err = -EINVAL; + } + + if (err) + goto err_free_in; + + kfree(in); + + return &mr->ibmr; + +err_free_in: + kfree(in); +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); +} + +struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_sg, u32 max_num_meta_sg) +{ + return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, + max_num_meta_sg); +} + +int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmw->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mw *mw = to_mmw(ibmw); + unsigned int ndescs; + u32 *in = NULL; + void *mkc; + int err; + struct mlx5_ib_alloc_mw req = {}; + struct { + __u32 comp_mask; + __u32 response_length; + } resp = {}; + + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); + if (err) + return err; + + if (req.comp_mask || req.reserved1 || req.reserved2) + return -EOPNOTSUPP; + + if (udata->inlen > sizeof(req) && + !ib_is_udata_cleared(udata, 
sizeof(req), + udata->inlen - sizeof(req))) + return -EOPNOTSUPP; + + ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); + MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2))); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen); + if (err) + goto free; + + mw->mmkey.type = MLX5_MKEY_MW; + ibmw->rkey = mw->mmkey.key; + mw->mmkey.ndescs = ndescs; + + resp.response_length = + min(offsetofend(typeof(resp), response_length), udata->outlen); + if (resp.response_length) { + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto free_mkey; + } + + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + err = mlx5r_store_odp_mkey(dev, &mw->mmkey); + if (err) + goto free_mkey; + } + + kfree(in); + return 0; + +free_mkey: + mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key); +free: + kfree(in); + return err; +} + +int mlx5_ib_dealloc_mw(struct ib_mw *mw) +{ + struct mlx5_ib_dev *dev = to_mdev(mw->device); + struct mlx5_ib_mw *mmw = to_mmw(mw); + + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key))) + /* + * pagefault_single_data_segment() may be accessing mmw + * if the user bound an ODP MR to this MW. + */ + mlx5r_deref_wait_odp_mkey(&mmw->mmkey); + + return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key); +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} + +static int +mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + unsigned int sg_offset = 0; + int n = 0; + + mr->meta_length = 0; + if (data_sg_nents == 1) { + n++; + mr->mmkey.ndescs = 1; + if (data_sg_offset) + sg_offset = *data_sg_offset; + mr->data_length = sg_dma_len(data_sg) - sg_offset; + mr->data_iova = sg_dma_address(data_sg) + sg_offset; + if (meta_sg_nents == 1) { + n++; + mr->meta_ndescs = 1; + if (meta_sg_offset) + sg_offset = *meta_sg_offset; + else + sg_offset = 0; + mr->meta_length = sg_dma_len(meta_sg) - sg_offset; + mr->pi_iova = sg_dma_address(meta_sg) 
+ sg_offset; + } + ibmr->length = mr->data_length + mr->meta_length; + } + + return n; +} + +static int +mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, + struct scatterlist *sgl, + unsigned short sg_nents, + unsigned int *sg_offset_p, + struct scatterlist *meta_sgl, + unsigned short meta_sg_nents, + unsigned int *meta_sg_offset_p) +{ + struct scatterlist *sg = sgl; + struct mlx5_klm *klms = mr->descs; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + u32 lkey = mr->ibmr.pd->local_dma_lkey; + int i, j = 0; + + mr->ibmr.iova = sg_dma_address(sg) + sg_offset; + mr->ibmr.length = 0; + + for_each_sg(sgl, sg, sg_nents, i) { + if (unlikely(i >= mr->max_descs)) + break; + klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); + klms[i].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg) - sg_offset; + + sg_offset = 0; + } + + if (sg_offset_p) + *sg_offset_p = sg_offset; + + mr->mmkey.ndescs = i; + mr->data_length = mr->ibmr.length; + + if (meta_sg_nents) { + sg = meta_sgl; + sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0; + for_each_sg(meta_sgl, sg, meta_sg_nents, j) { + if (unlikely(i + j >= mr->max_descs)) + break; + klms[i + j].va = cpu_to_be64(sg_dma_address(sg) + + sg_offset); + klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) - + sg_offset); + klms[i + j].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg) - sg_offset; + + sg_offset = 0; + } + if (meta_sg_offset_p) + *meta_sg_offset_p = sg_offset; + + mr->meta_ndescs = j; + mr->meta_length = mr->ibmr.length - mr->data_length; + } + + return i + j; +} + +static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + __be64 *descs; + + if (unlikely(mr->mmkey.ndescs == mr->max_descs)) + return -ENOMEM; + + descs = mr->descs; + descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); + + return 0; +} + +static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + __be64 *descs; + + if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs)) + return -ENOMEM; + + descs = mr->descs; + descs[mr->mmkey.ndescs + mr->meta_ndescs++] = + cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); + + return 0; +} + +static int +mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = mr->mtt_mr; + int n; + + pi_mr->mmkey.ndescs = 0; + pi_mr->meta_ndescs = 0; + pi_mr->meta_length = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + pi_mr->ibmr.page_size = ibmr->page_size; + n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset, + mlx5_set_page); + if (n != data_sg_nents) + return n; + + pi_mr->data_iova = pi_mr->ibmr.iova; + pi_mr->data_length = pi_mr->ibmr.length; + pi_mr->ibmr.length = pi_mr->data_length; + ibmr->length = pi_mr->data_length; + + if (meta_sg_nents) { + u64 page_mask = ~((u64)ibmr->page_size - 1); + u64 iova = pi_mr->data_iova; + + n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents, + meta_sg_offset, mlx5_set_page_pi); + + pi_mr->meta_length = pi_mr->ibmr.length; + /* + * PI address for the HW is the offset of the metadata address + * relative to the first data page address. 
+ * It equals to first data page address + size of data pages + + * metadata offset at the first metadata page + */ + pi_mr->pi_iova = (iova & page_mask) + + pi_mr->mmkey.ndescs * ibmr->page_size + + (pi_mr->ibmr.iova & ~page_mask); + /* + * In order to use one MTT MR for data and metadata, we register + * also the gaps between the end of the data and the start of + * the metadata (the sig MR will verify that the HW will access + * to right addresses). This mapping is safe because we use + * internal mkey for the registration. + */ + pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova; + pi_mr->ibmr.iova = iova; + ibmr->length += pi_mr->meta_length; + } + + ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + return n; +} + +static int +mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = mr->klm_mr; + int n; + + pi_mr->mmkey.ndescs = 0; + pi_mr->meta_ndescs = 0; + pi_mr->meta_length = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset, + meta_sg, meta_sg_nents, meta_sg_offset); + + ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map, + pi_mr->desc_size * pi_mr->max_descs, + DMA_TO_DEVICE); + + /* This is zero-based memory region */ + pi_mr->data_iova = 0; + pi_mr->ibmr.iova = 0; + pi_mr->pi_iova = pi_mr->data_length; + ibmr->length = pi_mr->ibmr.length; + + return n; +} + +int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_mr *pi_mr = NULL; + int n; + + WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY); + + mr->mmkey.ndescs = 0; + mr->data_length = 0; + mr->data_iova = 0; + mr->meta_ndescs = 0; + mr->pi_iova = 0; + /* + * As a performance optimization, if possible, there is no need to + * perform UMR operation to register the data/metadata buffers. + * First try to map the sg lists to PA descriptors with local_dma_lkey. + * Fallback to UMR only in case of a failure. + */ + n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (n == data_sg_nents + meta_sg_nents) + goto out; + /* + * As a performance optimization, if possible, there is no need to map + * the sg lists to KLM descriptors. First try to map the sg lists to MTT + * descriptors and fallback to KLM only in case of a failure. + * It's more efficient for the HW to work with MTT descriptors + * (especially in high load). + * Use KLM (indirect access) only if it's mandatory. 
+ */ + pi_mr = mr->mtt_mr; + n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (n == data_sg_nents + meta_sg_nents) + goto out; + + pi_mr = mr->klm_mr; + n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, meta_sg_nents, + meta_sg_offset); + if (unlikely(n != data_sg_nents + meta_sg_nents)) + return -ENOMEM; + +out: + /* This is zero-based memory region */ + ibmr->iova = 0; + mr->pi_mr = pi_mr; + if (pi_mr) + ibmr->sig_attrs->meta_length = pi_mr->meta_length; + else + ibmr->sig_attrs->meta_length = mr->meta_length; + + return 0; +} + +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int n; + + mr->mmkey.ndescs = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0, + NULL); + else + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + mlx5_set_page); + + ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + return n; +} + +static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv) +{ + struct mlx5_ib_mr *mr = priv; + + /* + * DMA is turned off for the mkey, but the mkey remains otherwise + * untouched until the normal flow of dereg_mr happens. Any access to + * this mkey will generate CQEs. + */ + revoke_mr(mr); +} + +static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, + bool limit_fill) +{ + int err; + + lockdep_assert_held(&ent->lock); + while (true) { + if (limit_fill) + target = ent->limit * 2; + if (target == ent->available_mrs + ent->pending) + return 0; + if (target > ent->available_mrs + ent->pending) { + u32 todo = target - (ent->available_mrs + ent->pending); + + spin_unlock_irq(&ent->lock); + err = add_keys(ent, todo); + if (err == -EAGAIN) + usleep_range(3000, 5000); + spin_lock_irq(&ent->lock); + if (err) { + if (err != -EAGAIN) + return err; + } else + return 0; + } else { + remove_cache_mr_locked(ent); + } + } +} + +struct order_attribute { + struct attribute attr; + ssize_t (*show)(struct cache_order *, struct order_attribute *, char *buf); + ssize_t (*store)(struct cache_order *, struct order_attribute *, + const char *buf, size_t count); +}; + +static ssize_t cur_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->available_mrs); + return err; +} + +static ssize_t limit_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->limit); + return err; +} + +static ssize_t limit_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + int err; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + /* + * Upon set we immediately fill the cache to high water mark implied by + * the limit. 
+ */ + spin_lock_irq(&ent->lock); + ent->limit = var; + err = resize_available_mrs(ent, 0, true); + spin_unlock_irq(&ent->lock); + if (err) + return err; + return count; +} + +static ssize_t miss_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->miss); + return err; +} + +static ssize_t miss_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 var; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var != 0) + return -EINVAL; + + ent->miss = var; + + return count; +} + +static ssize_t size_show(struct cache_order *co, struct order_attribute *oa, + char *buf) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + int err; + + err = snprintf(buf, 20, "%d\n", ent->total_mrs); + return err; +} + +static ssize_t size_store(struct cache_order *co, struct order_attribute *oa, + const char *buf, size_t count) +{ + struct mlx5_ib_dev *dev = co->dev; + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent = &cache->ent[co->index]; + u32 target; + int err; + + if (kstrtouint(buf, 0, &target)) + return -EINVAL; + /* + * Target is the new value of total_mrs the user requests, however we + * cannot free MRs that are in use. Compute the target value for + * available_mrs. + */ + spin_lock_irq(&ent->lock); + if (target < ent->total_mrs - ent->available_mrs) { + err = -EINVAL; + goto err_unlock; + + } + target = target - (ent->total_mrs - ent->available_mrs); + if (target < ent->limit || target > ent->limit*2) { + err = -EINVAL; + goto err_unlock; + + } + err = resize_available_mrs(ent, target, false); + if (err) + goto err_unlock; + spin_unlock_irq(&ent->lock); + + return count; + +err_unlock: + spin_unlock_irq(&ent->lock); + return err; +} + +static ssize_t order_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct order_attribute *oa = + container_of(attr, struct order_attribute, attr); + struct cache_order *co = container_of(kobj, struct cache_order, kobj); + + if (!oa->show) + return -EIO; + + return oa->show(co, oa, buf); +} + +static ssize_t order_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t size) +{ + struct order_attribute *oa = + container_of(attr, struct order_attribute, attr); + struct cache_order *co = container_of(kobj, struct cache_order, kobj); + + if (!oa->store) + return -EIO; + + return oa->store(co, oa, buf, size); +} + +static const struct sysfs_ops order_sysfs_ops = { + .show = order_attr_show, + .store = order_attr_store, +}; + +#define ORDER_ATTR(_name) struct order_attribute order_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) +#define ORDER_ATTR_RO(_name) struct order_attribute order_attr_##_name = \ + __ATTR(_name, 0444, _name##_show, NULL) + +static ORDER_ATTR_RO(cur); +static ORDER_ATTR(limit); +static ORDER_ATTR(miss); +static ORDER_ATTR(size); + +static struct attribute *order_default_attrs[] = { + &order_attr_cur.attr, + &order_attr_limit.attr, + &order_attr_miss.attr, + &order_attr_size.attr, + NULL +}; + +static struct kobj_type order_type = { + .sysfs_ops = &order_sysfs_ops, + 
.default_attrs = order_default_attrs +}; + + + +struct cache_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx5_ib_dev *dev, char *buf); + ssize_t (*store)(struct mlx5_ib_dev *dev, const char *buf, size_t count); +}; + +static ssize_t rel_imm_show(struct mlx5_ib_dev *dev, char *buf) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int err; + + err = snprintf(buf, 20, "%d\n", cache->rel_imm); + return err; +} + +static ssize_t rel_imm_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) +{ + struct mlx5_mr_cache *cache = &dev->cache; + u32 var; + int i; + int found = 0; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var > 1) + return -EINVAL; + + if (var == cache->rel_imm) + return count; + + cache->rel_imm = var; + if (cache->rel_imm == 1) { + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].available_mrs > 2 * cache->ent[i].limit) { + queue_work(cache->wq, &cache->ent[i].work); + found = 1; + } + } + if (!found) + cache->rel_imm = 0; + } + + return count; +} +static ssize_t rel_timeout_show(struct mlx5_ib_dev *dev, char *buf) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int err; + + err = snprintf(buf, 20, "%d\n", cache->rel_timeout); + return err; +} + +static ssize_t rel_timeout_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) +{ + struct mlx5_mr_cache *cache = &dev->cache; + int var; + int i; + + if (kstrtoint(buf, 0, &var)) + return -EINVAL; + + if (var < -1 || var > MAX_MR_RELEASE_TIMEOUT) + return -EINVAL; + + if (var == cache->rel_timeout) + return count; + + if (cache->rel_timeout == -1 || (var < cache->rel_timeout && var != -1)) { + cache->rel_timeout = var; + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + if (cache->ent[i].available_mrs > 2 * cache->ent[i].limit) + queue_work(cache->wq, &cache->ent[i].work); + } + } else { + cache->rel_timeout = var; + } + + return count; +} + +static ssize_t cache_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct cache_attribute *ca = + container_of(attr, struct cache_attribute, attr); + struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); + + if (!ca->show) + return -EIO; + + return ca->show(dev, buf); +} + +static ssize_t cache_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t size) +{ + struct cache_attribute *ca = + container_of(attr, struct cache_attribute, attr); + struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); + + if (!ca->store) + return -EIO; + + return ca->store(dev, buf, size); +} + +static const struct sysfs_ops cache_sysfs_ops = { + .show = cache_attr_show, + .store = cache_attr_store, +}; + +#define CACHE_ATTR(_name) struct cache_attribute cache_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static CACHE_ATTR(rel_imm); +static CACHE_ATTR(rel_timeout); + +static struct attribute *cache_default_attrs[] = { + &cache_attr_rel_imm.attr, + &cache_attr_rel_timeout.attr, + NULL +}; + +static struct kobj_type cache_type = { + .sysfs_ops = &cache_sysfs_ops, + .default_attrs = cache_default_attrs +}; + +static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct device *device = &dev->ib_dev.dev; + struct cache_order *co; + int o; + int i; + int err; + + if (dev->is_rep) + return 0; + + err = kobject_init_and_add(&dev->mr_cache, &cache_type, + &device->kobj, "mr_cache"); + if (err) + return -ENOMEM; + + for (o = 2, i = 0; i < MAX_MR_CACHE_ENTRIES; o++, i++) { + co = 
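As a reading aid for the branch in rel_timeout_store() above: release work is queued immediately only when the timeout is being enabled for the first time or shortened; merely lengthening it just records the new value. A minimal restatement of that condition, where should_flush_now() is an illustrative name rather than a driver symbol:

static bool should_flush_now(int cur_timeout, int new_timeout)
{
        /* flush over-limit cache entries when enabling or shortening the timeout */
        return cur_timeout == -1 ||
               (new_timeout < cur_timeout && new_timeout != -1);
}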
&cache->ent[i].co; + co->order = o; + co->index = i; + co->dev = dev; + err = kobject_init_and_add(&co->kobj, &order_type, + &dev->mr_cache, "%d", o); + if (err) + goto err_put; + + kobject_uevent(&co->kobj, KOBJ_ADD); + } + + return 0; + +err_put: + for (; i >= 0; i--) { + co = &cache->ent[i].co; + kobject_put(&co->kobj); + } + kobject_put(&dev->mr_cache); + + return err; +} + +static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct cache_order *co; + int i; + + if (dev->is_rep) + return; + + for (i = MAX_MR_CACHE_ENTRIES - 1; i >= 0; i--) { + co = &cache->ent[i].co; + kobject_put(&co->kobj); + } + kobject_put(&dev->mr_cache); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/nvmf.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/nvmf.c new file mode 100644 index 0000000..e290b32 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/nvmf.c @@ -0,0 +1,552 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "srq.h" +#include +#include +#include +#include +#include "mlx5_ib.h" + +void mlx5_ib_internal_fill_nvmf_caps(struct mlx5_ib_dev *dev) +{ + struct ib_nvmf_caps *caps = &dev->nvmf_caps; + struct mlx5_core_dev *mdev = dev->mdev; + + memset(caps, 0, sizeof(*caps)); + + if (MLX5_CAP_NVMF(mdev, write_offload_dc)) + caps->offload_type_dc |= IB_NVMF_WRITE_OFFLOAD; + if (MLX5_CAP_NVMF(mdev, read_offload_dc)) + caps->offload_type_dc |= IB_NVMF_READ_OFFLOAD; + if (MLX5_CAP_NVMF(mdev, read_write_offload_dc)) + caps->offload_type_dc |= IB_NVMF_READ_WRITE_OFFLOAD; + if (MLX5_CAP_NVMF(mdev, read_write_flush_offload_dc)) + caps->offload_type_dc |= IB_NVMF_READ_WRITE_FLUSH_OFFLOAD; + + if (MLX5_CAP_NVMF(mdev, write_offload_rc)) + caps->offload_type_rc |= IB_NVMF_WRITE_OFFLOAD; + if (MLX5_CAP_NVMF(mdev, read_offload_rc)) + caps->offload_type_rc |= IB_NVMF_READ_OFFLOAD; + if (MLX5_CAP_NVMF(mdev, read_write_offload_rc)) + caps->offload_type_rc |= IB_NVMF_READ_WRITE_OFFLOAD; + if (MLX5_CAP_NVMF(mdev, read_write_flush_offload_rc)) + caps->offload_type_rc |= IB_NVMF_READ_WRITE_FLUSH_OFFLOAD; + + caps->max_namespace = + 1 << MLX5_CAP_NVMF(mdev, log_max_namespace_per_xrq); + caps->max_staging_buffer_sz = + 1 << MLX5_CAP_NVMF(mdev, log_max_staging_buffer_size); + caps->min_staging_buffer_sz = + 1 << MLX5_CAP_NVMF(mdev, log_min_staging_buffer_size); + caps->max_io_sz = 1 << MLX5_CAP_NVMF(mdev, log_max_io_size); + caps->max_be_ctrl = + 1 << MLX5_CAP_NVMF(mdev, log_max_backend_controller_per_xrq); + caps->max_queue_sz = + 1 << MLX5_CAP_NVMF(mdev, log_max_queue_size); + caps->min_queue_sz = + 1 << MLX5_CAP_NVMF(mdev, log_min_queue_size); + caps->min_cmd_size = MLX5_CAP_NVMF(mdev, min_ioccsz); + caps->max_cmd_size = MLX5_CAP_NVMF(mdev, max_ioccsz); + caps->max_data_offset = MLX5_CAP_NVMF(mdev, max_icdoff); + caps->passthrough_sqe_rw_service = + MLX5_CAP_NVMF(mdev, passthrough_sqe_rw_service); + /* log_min_cmd_timeout = 0 means use default timeout from HCA */ + if (MLX5_CAP_NVMF(mdev, log_min_cmd_timeout)) + caps->min_cmd_timeout_us = 1 << MLX5_CAP_NVMF(mdev, log_min_cmd_timeout); + else + caps->min_cmd_timeout_us = 0; + /* log_max_cmd_timeout = 0 means use default timeout from HCA */ + if (MLX5_CAP_NVMF(mdev, log_max_cmd_timeout)) + caps->max_cmd_timeout_us = 1 << MLX5_CAP_NVMF(mdev, log_max_cmd_timeout); + else + caps->max_cmd_timeout_us = 0; + if (MLX5_CAP_NVMF(mdev, log_max_frontend_nsid)) + caps->max_frontend_nsid = 1 << MLX5_CAP_NVMF(mdev, log_max_frontend_nsid); + else + caps->max_frontend_nsid = 0; +} + +static void set_nvmf_backend_ctrl_attrs(struct ib_nvmf_backend_ctrl_init_attr *attr, + struct mlx5_be_ctrl_attr *in) +{ + in->cq_page_offset = attr->cq_page_offset; + in->sq_page_offset = attr->sq_page_offset; + in->cq_log_page_size = attr->cq_log_page_size; + in->sq_log_page_size = attr->sq_log_page_size; + in->initial_cqh_db_value = attr->initial_cqh_db_value; + in->initial_sqt_db_value = attr->initial_sqt_db_value; + in->log_cmd_timeout_us = attr->cmd_timeout_us ?
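The capability conversion above follows the usual mlx5 convention of log2-encoded limits, and for the command-timeout and frontend-nsid fields a raw value of 0 means "use the HCA default" rather than 2^0. A minimal sketch of that convention, with log2_cap_or_default() as an illustrative helper rather than a driver symbol:

static inline u32 log2_cap_or_default(u8 log_val)
{
        /* 0 means "not reported, keep the HCA default", otherwise 2^log_val */
        return log_val ? 1U << log_val : 0;
}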
ilog2(attr->cmd_timeout_us) : 0; + in->cqh_dbr_addr = attr->cqh_dbr_addr; + in->sqt_dbr_addr = attr->sqt_dbr_addr; + in->cq_pas = attr->cq_pas; + in->sq_pas = attr->sq_pas; +} + +static void mlx5_ib_nvmf_backend_ctrl_event(struct mlx5_core_nvmf_be_ctrl *ctrl, + int event_type, int event_subtype, + int error_type) +{ + struct ib_nvmf_ctrl *ibctrl = &to_mibctrl(ctrl)->ibctrl; + struct mlx5_ib_dev *dev = to_mdev(ibctrl->srq->device); + struct ib_event event; + + if (event_type != MLX5_EVENT_TYPE_XRQ_ERROR) { + /* This is the only valid event type for nvmf backend ctrl */ + return; + } + + if (!ibctrl->event_handler) + return; + + event.device = ibctrl->srq->device; + switch (error_type) { + case MLX5_XRQ_ERROR_TYPE_BACKEND_CONTROLLER_ERROR: + switch (event_subtype) { + case MLX5_XRQ_SUBTYPE_BACKEND_CONTROLLER_PCI_ERROR: + event.event = IB_EVENT_XRQ_NVMF_BACKEND_CTRL_PCI_ERR; + break; + case MLX5_XRQ_SUBTYPE_BACKEND_CONTROLLER_TO_ERROR: + event.event = IB_EVENT_XRQ_NVMF_BACKEND_CTRL_TO_ERR; + break; + default: + mlx5_ib_warn(dev, + "Unexpected event subtype %d on CTRL %06x\n", + event_subtype, ibctrl->id); + return; + } + break; + default: + mlx5_ib_warn(dev, + "Unexpected event error type %d on CTRL %06x\n", + error_type, ibctrl->id); + return; + } + + ibctrl->event_handler(&event, ibctrl->be_context); +} + +struct ib_nvmf_ctrl *mlx5_ib_create_nvmf_backend_ctrl(struct ib_srq *srq, + struct ib_nvmf_backend_ctrl_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + struct mlx5_ib_nvmf_be_ctrl *ctrl; + struct mlx5_be_ctrl_attr in = {0}; + int err; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + + set_nvmf_backend_ctrl_attrs(init_attr, &in); + err = mlx5_core_create_nvmf_backend_ctrl(dev->mdev, + &msrq->msrq, + &ctrl->mctrl, + &in); + if (err) { + mlx5_ib_dbg(dev, "create NVMF backend ctrl failed, err %d\n", err); + goto err_ctrl; + } + + mlx5_ib_dbg(dev, "create NVMF backend ctrl with ctrlid 0x%x\n", + ctrl->mctrl.id); + + ctrl->ibctrl.id = ctrl->mctrl.id; + ctrl->mctrl.event = mlx5_ib_nvmf_backend_ctrl_event; + return &ctrl->ibctrl; + +err_ctrl: + kfree(ctrl); + + return ERR_PTR(err); + +} + +int mlx5_ib_destroy_nvmf_backend_ctrl(struct ib_nvmf_ctrl *ctrl) +{ + struct mlx5_ib_dev *dev = to_mdev(ctrl->srq->device); + struct mlx5_ib_nvmf_be_ctrl *mctrl = to_mctrl(ctrl); + struct mlx5_ib_srq *msrq = to_msrq(ctrl->srq); + + mlx5_core_destroy_nvmf_backend_ctrl(dev->mdev, + &msrq->msrq, + &mctrl->mctrl); + + kfree(mctrl); + return 0; +} + +static void set_nvmf_ns_attrs(struct ib_nvmf_ns_init_attr *attr, + struct mlx5_ns_attr *in) +{ + in->frontend_namespace = attr->frontend_namespace; + in->backend_namespace = attr->backend_namespace; + in->lba_data_size = attr->lba_data_size; + in->backend_ctrl_id = attr->backend_ctrl_id; +} + + +struct ib_nvmf_ns *mlx5_ib_attach_nvmf_ns(struct ib_nvmf_ctrl *ctrl, + struct ib_nvmf_ns_init_attr *init_attr) +{ + struct mlx5_ib_nvmf_be_ctrl *mctrl = to_mctrl(ctrl); + struct mlx5_ib_dev *dev = to_mdev(ctrl->srq->device); + struct mlx5_ib_srq *msrq = to_msrq(ctrl->srq); + struct mlx5_ib_nvmf_ns *ns; + struct mlx5_ns_attr in = {0}; + int err; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return ERR_PTR(-ENOMEM); + + set_nvmf_ns_attrs(init_attr, &in); + err = mlx5_core_attach_nvmf_ns(dev->mdev, + &msrq->msrq, + &mctrl->mctrl, + &ns->mns, + &in); + if (err) { + mlx5_ib_dbg(dev, "attach NVMF ns failed, err %d\n", err); + goto err_ns; + } + + mlx5_ib_dbg(dev, 
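To make the intended call order explicit, here is a hedged consumer sketch that is not part of the patch: a backend controller is created on an NVMf-capable SRQ/XRQ first, and a namespace is then attached using the returned controller id. The attribute values are placeholders, the PAS/doorbell fields are omitted, and error handling is abbreviated.

static struct ib_nvmf_ns *example_nvmf_offload_setup(struct ib_srq *srq)
{
        struct ib_nvmf_backend_ctrl_init_attr ctrl_attr = {
                .cmd_timeout_us = 0,    /* 0: keep the HCA default timeout */
        };
        struct ib_nvmf_ns_init_attr ns_attr = {
                .frontend_namespace = 1,
                .backend_namespace  = 1,
                .lba_data_size      = 512,
        };
        struct ib_nvmf_ctrl *ctrl;

        ctrl = mlx5_ib_create_nvmf_backend_ctrl(srq, &ctrl_attr);
        if (IS_ERR(ctrl))
                return ERR_CAST(ctrl);

        ns_attr.backend_ctrl_id = ctrl->id;
        return mlx5_ib_attach_nvmf_ns(ctrl, &ns_attr);
}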
"NVMF ns %p was attached\n", ns); + + return &ns->ibns; + +err_ns: + kfree(ns); + + return ERR_PTR(err); + +} + +int mlx5_ib_detach_nvmf_ns(struct ib_nvmf_ns *ns) +{ + struct mlx5_ib_nvmf_ns *mns = to_mns(ns); + struct mlx5_ib_nvmf_be_ctrl *mctrl = to_mctrl(ns->ctrl); + struct mlx5_ib_dev *dev = to_mdev(ns->ctrl->srq->device); + struct mlx5_ib_srq *msrq = to_msrq(ns->ctrl->srq); + + mlx5_core_detach_nvmf_ns(dev->mdev, + &msrq->msrq, + &mctrl->mctrl, + &mns->mns); + + kfree(mns); + return 0; +} + +int mlx5_ib_query_nvmf_ns(struct ib_nvmf_ns *ns, + struct ib_nvmf_ns_attr *ns_attr) +{ + struct mlx5_ib_nvmf_ns *mns = to_mns(ns); + struct mlx5_ib_dev *dev = to_mdev(ns->ctrl->srq->device); + struct mlx5_ib_srq *msrq = to_msrq(ns->ctrl->srq); + int ret; + + ret = mlx5_core_query_nvmf_ns(dev->mdev, &msrq->msrq, &mns->mns); + if (!ret) { + ns_attr->num_read_cmd = mns->mns.counters.num_read_cmd; + ns_attr->num_read_blocks = mns->mns.counters.num_read_blocks; + ns_attr->num_write_cmd = mns->mns.counters.num_write_cmd; + ns_attr->num_write_blocks = mns->mns.counters.num_write_blocks; + ns_attr->num_write_inline_cmd = mns->mns.counters.num_write_inline_cmd; + ns_attr->num_flush_cmd = mns->mns.counters.num_flush_cmd; + ns_attr->num_error_cmd = mns->mns.counters.num_error_cmd; + ns_attr->num_backend_error_cmd = mns->mns.counters.num_backend_error_cmd; + ns_attr->last_read_latency = mns->mns.counters.last_read_latency; + ns_attr->last_write_latency = mns->mns.counters.last_write_latency; + ns_attr->queue_depth = mns->mns.counters.queue_depth; + } + + return ret; +} + +static int get_pas_size(struct mlx5_be_ctrl_attr *in) +{ + /* + * Currently we support only contig sq/cq aligned to 64B. + * In the future we might add number sq/cq physical addresses + * and set the size accordingly + */ + return 2 * MLX5_PAS_ALIGN; +} + +static inline void set_nvmf_pas(struct mlx5_be_ctrl_attr *in, + void *start, + int align) +{ + dma_addr_t dma_addr_be; + + /* set cq PAS */ + dma_addr_be = cpu_to_be64(in->cq_pas); + memcpy(start, &dma_addr_be, sizeof(u64)); + + /* set sq PAS */ + dma_addr_be = cpu_to_be64(in->sq_pas); + memcpy(start + align, &dma_addr_be, sizeof(u64)); +} + +int mlx5_core_create_nvmf_backend_ctrl(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_core_nvmf_be_ctrl *ctrl, + struct mlx5_be_ctrl_attr *attr_in) +{ + u32 out[MLX5_ST_SZ_DW(create_nvmf_be_ctrl_out)] = {0}; + void *in; + void *pas_addr; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(attr_in); + inlen = MLX5_ST_SZ_BYTES(create_nvmf_be_ctrl_in) + pas_size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + pas_addr = MLX5_ADDR_OF(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.pas); + set_nvmf_pas(attr_in, pas_addr, MLX5_PAS_ALIGN); + + MLX5_SET(create_nvmf_be_ctrl_in, in, + opcode, + MLX5_CMD_OP_CREATE_NVMF_BACKEND_CTRL); + MLX5_SET(create_nvmf_be_ctrl_in, in, + xrqn, + srq->srqn); + MLX5_SET(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.cq_page_offset, + attr_in->cq_page_offset); + MLX5_SET(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.sq_page_offset, + attr_in->sq_page_offset); + MLX5_SET(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.cq_log_page_size, + attr_in->cq_log_page_size); + MLX5_SET(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.sq_log_page_size, + attr_in->sq_log_page_size); + MLX5_SET(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.initial_cqh_db_value, + attr_in->initial_cqh_db_value); + MLX5_SET(create_nvmf_be_ctrl_in, in, + 
nvmf_be_ctrl_entry.initial_sqt_db_value, + attr_in->initial_sqt_db_value); + MLX5_SET(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.log_cmd_timeout_us, + attr_in->log_cmd_timeout_us); + MLX5_SET64(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.cqh_dbr_addr, + attr_in->cqh_dbr_addr); + MLX5_SET64(create_nvmf_be_ctrl_in, in, + nvmf_be_ctrl_entry.sqt_dbr_addr, + attr_in->sqt_dbr_addr); + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + kvfree(in); + if (err) + return err; + + ctrl->id = MLX5_GET(create_nvmf_be_ctrl_out, out, + backend_controller_id); + + spin_lock(&srq->lock); + list_add_tail(&ctrl->entry, &srq->ctrl_list); + spin_unlock(&srq->lock); + + spin_lock_init(&ctrl->lock); + INIT_LIST_HEAD(&ctrl->ns_list); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_core_create_nvmf_backend_ctrl); + +int mlx5_core_destroy_nvmf_backend_ctrl(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_core_nvmf_be_ctrl *ctrl) +{ + u32 in[MLX5_ST_SZ_DW(destroy_nvmf_be_ctrl_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_nvmf_be_ctrl_out)] = {0}; + + spin_lock(&srq->lock); + list_del(&ctrl->entry); + spin_unlock(&srq->lock); + + MLX5_SET(destroy_nvmf_be_ctrl_in, in, opcode, + MLX5_CMD_OP_DESTROY_NVMF_BACKEND_CTRL); + MLX5_SET(destroy_nvmf_be_ctrl_in, in, xrqn, srq->srqn); + MLX5_SET(destroy_nvmf_be_ctrl_in, in, backend_controller_id, ctrl->id); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_destroy_nvmf_backend_ctrl); + +int mlx5_core_attach_nvmf_ns(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_core_nvmf_be_ctrl *ctrl, + struct mlx5_core_nvmf_ns *ns, + struct mlx5_ns_attr *attr_in) + +{ + u32 in[MLX5_ST_SZ_DW(attach_nvmf_namespace_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(attach_nvmf_namespace_out)] = {0}; + int err; + + MLX5_SET(attach_nvmf_namespace_in, in, + opcode, + MLX5_CMD_OP_ATTACH_NVMF_NAMESPACE); + MLX5_SET(attach_nvmf_namespace_in, in, + xrqn, + srq->srqn); + MLX5_SET(attach_nvmf_namespace_in, in, + frontend_namespace, + attr_in->frontend_namespace); + MLX5_SET(attach_nvmf_namespace_in, in, + backend_namespace, + attr_in->backend_namespace); + MLX5_SET(attach_nvmf_namespace_in, in, + lba_data_size, + attr_in->lba_data_size); + MLX5_SET(attach_nvmf_namespace_in, in, + backend_controller_id, + attr_in->backend_ctrl_id); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + ns->frontend_nsid = attr_in->frontend_namespace; + ns->backend_nsid = attr_in->backend_namespace; + + spin_lock(&ctrl->lock); + list_add_tail(&ns->entry, &ctrl->ns_list); + spin_unlock(&ctrl->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_core_attach_nvmf_ns); + +int mlx5_core_detach_nvmf_ns(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_core_nvmf_be_ctrl *ctrl, + struct mlx5_core_nvmf_ns *ns) +{ + u32 in[MLX5_ST_SZ_DW(detach_nvmf_namespace_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(detach_nvmf_namespace_out)] = {0}; + int err; + + MLX5_SET(detach_nvmf_namespace_in, in, + opcode, + MLX5_CMD_OP_DETACH_NVMF_NAMESPACE); + MLX5_SET(detach_nvmf_namespace_in, in, + xrqn, + srq->srqn); + MLX5_SET(detach_nvmf_namespace_in, in, + frontend_namespace, + ns->frontend_nsid); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + spin_lock(&ctrl->lock); + list_del(&ns->entry); + spin_unlock(&ctrl->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_core_detach_nvmf_ns); + +int mlx5_core_query_nvmf_ns(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + struct 
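For clarity on the list and lock relationships relied on above: backend controllers are linked on the XRQ's ctrl_list under srq->lock, and namespaces on each controller's ns_list under ctrl->lock. A minimal illustrative walker under those assumptions, where example_count_namespaces() is not a driver symbol:

static int example_count_namespaces(struct mlx5_core_nvmf_be_ctrl *ctrl)
{
        struct mlx5_core_nvmf_ns *ns;
        int count = 0;

        spin_lock(&ctrl->lock);
        list_for_each_entry(ns, &ctrl->ns_list, entry)
                count++;
        spin_unlock(&ctrl->lock);

        return count;
}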
mlx5_core_nvmf_ns *ns) +{ + u32 in[MLX5_ST_SZ_DW(query_nvmf_namespace_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_nvmf_namespace_out)] = {0}; + int err; + + MLX5_SET(query_nvmf_namespace_in, in, + opcode, + MLX5_CMD_OP_QUERY_NVMF_NAMESPACE_CONTEXT); + MLX5_SET(query_nvmf_namespace_in, in, + xrqn, + srq->srqn); + MLX5_SET(query_nvmf_namespace_in, in, + frontend_namespace, + ns->frontend_nsid); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + ns->counters.num_read_cmd = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_read_cmd_low); + ns->counters.num_read_blocks = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_read_blocks_low); + ns->counters.num_write_cmd = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_write_cmd_low); + ns->counters.num_write_blocks = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_write_blocks_low); + ns->counters.num_write_inline_cmd = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_write_inline_cmd_low); + ns->counters.num_flush_cmd = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_flush_cmd_low); + ns->counters.num_error_cmd = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_error_cmd_low); + ns->counters.num_backend_error_cmd = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.num_backend_error_cmd_low); + if (MLX5_CAP_NVMF(dev, last_req_latency)) { + ns->counters.last_read_latency = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.last_read_req_latency); + ns->counters.last_write_latency = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.last_write_req_latency); + } else { + ns->counters.last_read_latency = 0; + ns->counters.last_write_latency = 0; + } + if (MLX5_CAP_NVMF(dev, current_q_depth)) + ns->counters.queue_depth = MLX5_GET(query_nvmf_namespace_out, out, + ns_ctx.current_q_depth); + else + ns->counters.queue_depth = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_nvmf_ns); + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/odp.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 0000000..e3a3e7a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/odp.c @@ -0,0 +1,1824 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "mlx5_ib.h" +#include "cmd.h" +#include "qp.h" + +#include + +/* Contains the details of a pagefault. */ +struct mlx5_pagefault { + u32 bytes_committed; + u32 token; + u8 event_subtype; + u8 type; + union { + /* Initiator or send message responder pagefault details. */ + struct { + /* Received packet size, only valid for responders. */ + u32 packet_size; + /* + * Number of resource holding WQE, depends on type. + */ + u32 wq_num; + /* + * WQE index. Refers to either the send queue or + * receive queue, according to event_subtype. + */ + u16 wqe_index; + } wqe; + /* RDMA responder pagefault details */ + struct { + u32 r_key; + /* + * Received packet size, minimal size page fault + * resolution required for forward progress. + */ + u32 packet_size; + u32 rdma_op_len; + u64 rdma_va; + } rdma; + }; + + struct mlx5_ib_pf_eq *eq; + struct work_struct work; +}; + +#define MAX_PREFETCH_LEN (4*1024*1024U) + +/* Timeout in ms to wait for an active mmu notifier to complete when handling + * a pagefault. */ +#define MMU_NOTIFIER_TIMEOUT 60000 + +#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) +#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) +#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) +#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) +#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) + +#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT + +static u64 mlx5_imr_ksm_entries; + +static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, + struct mlx5_ib_mr *imr, int flags) +{ + struct mlx5_klm *end = pklm + nentries; + + if (flags & MLX5_IB_UPD_XLT_ZAP) { + for (; pklm != end; pklm++, idx++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey); + pklm->va = 0; + } + return; + } + + /* + * The locking here is pretty subtle. Ideally the implicit_children + * xarray would be protected by the umem_mutex, however that is not + * possible. Instead this uses a weaker update-then-lock pattern: + * + * xa_store() + * mutex_lock(umem_mutex) + * mlx5_ib_update_xlt() + * mutex_unlock(umem_mutex) + * destroy lkey + * + * ie any change the xarray must be followed by the locked update_xlt + * before destroying. + * + * The umem_mutex provides the acquire/release semantic needed to make + * the xa_store() visible to a racing thread. 
+ */ + lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); + + for (; pklm != end; pklm++, idx++) { + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); + + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + if (mtt) { + pklm->key = cpu_to_be32(mtt->ibmr.lkey); + pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); + } else { + pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey); + pklm->va = 0; + } + } +} + +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} + +static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + dma_addr_t pa; + size_t i; + + if (flags & MLX5_IB_UPD_XLT_ZAP) + return; + + for (i = 0; i < nentries; i++) { + pa = odp->dma_list[idx + i]; + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } +} + +void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, + struct mlx5_ib_mr *mr, int flags) +{ + if (flags & MLX5_IB_UPD_XLT_INDIRECT) { + populate_klm(xlt, idx, nentries, mr, flags); + } else { + populate_mtt(xlt, idx, nentries, mr, flags); + } +} + +/* + * This must be called after the mr has been removed from implicit_children. + * NOTE: The MR does not necessarily have to be + * empty here, parallel page faults could have raced with the free process and + * added pages to it. + */ +static void free_implicit_child_mr_work(struct work_struct *work) +{ + struct mlx5_ib_mr *mr = + container_of(work, struct mlx5_ib_mr, odp_destroy.work); + struct mlx5_ib_mr *imr = mr->parent; + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + + mlx5r_deref_wait_odp_mkey(&mr->mmkey); + + mutex_lock(&odp_imr->umem_mutex); + mlx5_ib_update_xlt(mr->parent, ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, + 1, 0, + MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); + mutex_unlock(&odp_imr->umem_mutex); + mlx5_ib_dereg_mr(&mr->ibmr, NULL); + + mlx5r_deref_odp_mkey(&imr->mmkey); +} + +static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *imr = mr->parent; + + if (!refcount_inc_not_zero(&imr->mmkey.usecount)) + return; + + xa_erase(&imr->implicit_children, idx); + + /* Freeing a MR is a sleeping operation, so bounce to a work queue */ + INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); + queue_work(system_unbound_wq, &mr->odp_destroy.work); +} + +static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct ib_umem_odp *umem_odp = + container_of(mni, struct ib_umem_odp, notifier); + struct mlx5_ib_mr *mr; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / + sizeof(struct mlx5_mtt)) - 1; + u64 idx = 0, blk_start_idx = 0; + u64 invalidations = 0; + unsigned long start; + unsigned long end; + int in_block = 0; + u64 addr; + + if (!mmu_notifier_range_blockable(range)) + return false; + + mutex_lock(&umem_odp->umem_mutex); + mmu_interval_set_seq(mni, cur_seq); + /* + * If npages is zero then umem_odp->private may not be setup yet. This + * does not complete until after the first page is mapped for DMA. 
+ */ + if (!umem_odp->npages) + goto out; + mr = umem_odp->private; + + start = max_t(u64, ib_umem_start(umem_odp), range->start); + end = min_t(u64, ib_umem_end(umem_odp), range->end); + + /* + * Iteration one - zap the HW's MTTs. The notifiers_count ensures that + * while we are doing the invalidation, no page fault will attempt to + * overwrite the same MTTs. Concurent invalidations might race us, + * but they will write 0s as well, so no difference in the end result. + */ + for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; + /* + * Strive to write the MTTs in chunks, but avoid overwriting + * non-existing MTTs. The huristic here can be improved to + * estimate the cost of another UMR vs. the cost of bigger + * UMR. + */ + if (umem_odp->dma_list[idx] & + (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (!in_block) { + blk_start_idx = idx; + in_block = 1; + } + + /* Count page invalidations */ + invalidations += idx - blk_start_idx + 1; + } else { + u64 umr_offset = idx & umr_block_mask; + + if (in_block && umr_offset == 0) { + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); + in_block = 0; + } + } + } + if (in_block) + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx + 1, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); + + mlx5_update_odp_stats(mr, invalidations, invalidations); + + /* + * We are now sure that the device will not access the + * memory. We can safely unmap it, and mark it as dirty if + * needed. + */ + + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); + + if (unlikely(!umem_odp->npages && mr->parent)) + destroy_unused_implicit_child_mr(mr); +out: + mutex_unlock(&umem_odp->umem_mutex); + return true; +} + +const struct mmu_interval_notifier_ops mlx5_mn_ops = { + .invalidate = mlx5_ib_invalidate_range, +}; + +static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) +{ + struct ib_odp_caps *caps = &dev->odp_caps; + + memset(caps, 0, sizeof(*caps)); + + if (!MLX5_CAP_GEN(dev->mdev, pg) || + !mlx5_ib_can_load_pas_with_umr(dev, 0)) + return; + + caps->general_caps = IB_ODP_SUPPORT; + + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + dev->odp_max_size = U64_MAX; + else + dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); + + if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; + + if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive)) + caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send)) + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND; + + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive)) + caps->per_transport_caps.xrc_odp_caps |= 
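A worked example for the chunking in mlx5_ib_invalidate_range() above, assuming the usual 64-byte UMR translation alignment and 8-byte MTT entries (both values assumed, not taken from this patch): umr_block_mask = 64 / 8 - 1 = 7, so zapping UMRs are flushed in runs that close on 8-entry boundaries. The predicate below restates that boundary test; the example_* name is illustrative only.

static bool example_umr_block_boundary(u64 idx)
{
        const u64 umr_block_mask = (64 / sizeof(__be64)) - 1;  /* assumed: 7 */

        return (idx & umr_block_mask) == 0;
}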
IB_ODP_SUPPORT_RECV; + + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write)) + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE; + + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; + + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + + if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) + caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + + if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && + MLX5_CAP_GEN(dev->mdev, null_mkey) && + MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && + !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled)) + caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; +} + +static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + int error) +{ + int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? + pfault->wqe.wq_num : pfault->token; + u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; + int err; + + MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); + MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); + MLX5_SET(page_fault_resume_in, in, token, pfault->token); + MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); + MLX5_SET(page_fault_resume_in, in, error, !!error); + + err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); + if (err) + mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n", + wq_num, err); +} + +static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + unsigned long idx) +{ + struct ib_umem_odp *odp; + struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *ret; + int err; + + odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), + idx * MLX5_IMR_MTT_SIZE, + MLX5_IMR_MTT_SIZE, &mlx5_mn_ops); + if (IS_ERR(odp)) + return ERR_CAST(odp); + + mr = mlx5_mr_cache_alloc( + mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags); + if (IS_ERR(mr)) { + ib_umem_odp_release(odp); + return mr; + } + + mr->ibmr.pd = imr->ibmr.pd; + mr->ibmr.device = &mr_to_mdev(imr)->ib_dev; + mr->umem = &odp->umem; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE; + mr->parent = imr; + odp->private = mr; + + /* + * First refcount is owned by the xarray and second refconut + * is returned to the caller. + */ + refcount_set(&mr->mmkey.usecount, 2); + + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + if (err) { + ret = ERR_PTR(err); + goto out_mr; + } + + xa_lock(&imr->implicit_children); + ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, + GFP_KERNEL); + if (unlikely(ret)) { + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + goto out_lock; + } + /* + * Another thread beat us to creating the child mr, use + * theirs. 
+ */ + refcount_inc(&ret->mmkey.usecount); + goto out_lock; + } + xa_unlock(&imr->implicit_children); + + mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); + return mr; + +out_lock: + xa_unlock(&imr->implicit_children); +out_mr: + mlx5_ib_dereg_mr(&mr->ibmr, NULL); + return ret; +} + +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); + struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *imr; + int err; + + if (!mlx5_ib_can_load_pas_with_umr(dev, + MLX5_IMR_MTT_ENTRIES * PAGE_SIZE)) + return ERR_PTR(-EOPNOTSUPP); + + umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); + if (IS_ERR(umem_odp)) + return ERR_CAST(umem_odp); + + imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags); + if (IS_ERR(imr)) { + ib_umem_odp_release(umem_odp); + return imr; + } + + imr->ibmr.pd = &pd->ibpd; + imr->ibmr.iova = 0; + imr->umem = &umem_odp->umem; + imr->ibmr.lkey = imr->mmkey.key; + imr->ibmr.rkey = imr->mmkey.key; + imr->ibmr.device = &dev->ib_dev; + imr->umem = &umem_odp->umem; + imr->is_odp_implicit = true; + xa_init(&imr->implicit_children); + + err = mlx5_ib_update_xlt(imr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + if (err) + goto out_mr; + + err = mlx5r_store_odp_mkey(dev, &imr->mmkey); + if (err) + goto out_mr; + + mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr); + return imr; +out_mr: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_ib_dereg_mr(&imr->ibmr, NULL); + return ERR_PTR(err); +} + +void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_mr *mtt; + unsigned long idx; + + /* + * If this is an implicit MR it is already invalidated so we can just + * delete the children mkeys. + */ + xa_for_each(&mr->implicit_children, idx, mtt) { + xa_erase(&mr->implicit_children, idx); + mlx5_ib_dereg_mr(&mtt->ibmr, NULL); + } +} + +#define MLX5_PF_FLAGS_DOWNGRADE BIT(1) +#define MLX5_PF_FLAGS_SNAPSHOT BIT(2) +#define MLX5_PF_FLAGS_ENABLE BIT(3) +static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, + u64 user_va, size_t bcnt, u32 *bytes_mapped, + u32 flags) +{ + int page_shift, ret, np; + bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; + u64 access_mask; + u64 start_idx; + bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT); + u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC; + + if (flags & MLX5_PF_FLAGS_ENABLE) + xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; + + page_shift = odp->page_shift; + start_idx = (user_va - ib_umem_start(odp)) >> page_shift; + access_mask = ODP_READ_ALLOWED_BIT; + + if (odp->umem.writable && !downgrade) + access_mask |= ODP_WRITE_ALLOWED_BIT; + + np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault); + if (np < 0) + return np; + + /* + * No need to check whether the MTTs really belong to this MR, since + * ib_umem_odp_map_dma_and_lock already checks this. 
+ */ + ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags); + mutex_unlock(&odp->umem_mutex); + + if (ret < 0) { + if (ret != -EAGAIN) + mlx5_ib_err(mr_to_mdev(mr), + "Failed to update mkey page tables\n"); + goto out; + } + + if (bytes_mapped) { + u32 new_mappings = (np << page_shift) - + (user_va - round_down(user_va, 1 << page_shift)); + + *bytes_mapped += min_t(u32, new_mappings, bcnt); + } + + return np << (page_shift - PAGE_SHIFT); + +out: + return ret; +} + +static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, + struct ib_umem_odp *odp_imr, u64 user_va, + size_t bcnt, u32 *bytes_mapped, u32 flags) +{ + unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long upd_start_idx = end_idx + 1; + unsigned long upd_len = 0; + unsigned long npages = 0; + int err; + int ret; + + if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || + mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) + return -EFAULT; + + /* Fault each child mr that intersects with our interval. */ + while (bcnt) { + unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; + struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *mtt; + u64 len; + + xa_lock(&imr->implicit_children); + mtt = xa_load(&imr->implicit_children, idx); + if (unlikely(!mtt)) { + xa_unlock(&imr->implicit_children); + mtt = implicit_get_child_mr(imr, idx); + if (IS_ERR(mtt)) { + ret = PTR_ERR(mtt); + goto out; + } + upd_start_idx = min(upd_start_idx, idx); + upd_len = idx - upd_start_idx + 1; + } else { + refcount_inc(&mtt->mmkey.usecount); + xa_unlock(&imr->implicit_children); + } + + umem_odp = to_ib_umem_odp(mtt->umem); + len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) - + user_va; + + ret = pagefault_real_mr(mtt, umem_odp, user_va, len, + bytes_mapped, flags); + + mlx5r_deref_odp_mkey(&mtt->mmkey); + + if (ret < 0) + goto out; + user_va += len; + bcnt -= len; + npages += ret; + } + + ret = npages; + + /* + * Any time the implicit_children are changed we must perform an + * update of the xlt before exiting to ensure the HW and the + * implicit_children remains synchronized. + */ +out: + if (likely(!upd_len)) + return ret; + + /* + * Notice this is not strictly ordered right, the KSM is updated after + * the implicit_children is updated, so a parallel page fault could + * see a MR that is not yet visible in the KSM. This is similar to a + * parallel page fault seeing a MR that is being concurrently removed + * from the KSM. Both of these improbable situations are resolved + * safely by resuming the HW and then taking another page fault. The + * next pagefault handler will see the new information. 
+ */ + mutex_lock(&odp_imr->umem_mutex); + err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mutex_unlock(&odp_imr->umem_mutex); + if (err) { + mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n"); + return err; + } + return ret; +} + +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, + u32 *bytes_mapped, u32 flags) +{ + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + u32 xlt_flags = 0; + int err; + unsigned int page_size; + + if (flags & MLX5_PF_FLAGS_ENABLE) + xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; + + dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); + err = ib_umem_dmabuf_map_pages(umem_dmabuf); + if (err) { + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + return err; + } + + page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc, + log_page_size, 0, + umem_dmabuf->umem.iova); + if (unlikely(page_size < PAGE_SIZE)) { + ib_umem_dmabuf_unmap_pages(umem_dmabuf); + err = -EINVAL; + } else { + err = mlx5_ib_update_mr_pas(mr, xlt_flags); + } + dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); + + if (err) + return err; + + if (bytes_mapped) + *bytes_mapped += bcnt; + + return ib_umem_num_pages(mr->umem); +} + +/* + * Returns: + * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are + * not accessible, or the MR is no longer valid. + * -EAGAIN/-ENOMEM: The operation should be retried + * + * -EINVAL/others: General internal malfunction + * >0: Number of pages mapped + */ +static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, + u32 *bytes_mapped, u32 flags) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + + if (unlikely(io_virt < mr->ibmr.iova)) + return -EFAULT; + + if (mr->umem->is_dmabuf) + return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); + + if (!odp->is_implicit_odp) { + u64 user_va; + + if (check_add_overflow(io_virt - mr->ibmr.iova, + (u64)odp->umem.address, &user_va)) + return -EFAULT; + if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) + return -EFAULT; + return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, + flags); + } + return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, + flags); +} + +int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr) +{ + int ret; + + ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address, + mr->umem->length, NULL, + MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE); + return ret >= 0 ? 0 : ret; +} + +int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr) +{ + int ret; + + ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL, + MLX5_PF_FLAGS_ENABLE); + + return ret >= 0 ? 0 : ret; +} + +struct pf_frame { + struct pf_frame *next; + u32 key; + u64 io_virt; + size_t bcnt; + int depth; +}; + +static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key) +{ + if (!mmkey) + return false; + if (mmkey->type == MLX5_MKEY_MW) + return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key); + return mmkey->key == key; +} + +/* + * Handle a single data segment in a page-fault WQE or RDMA region. + * + * Returns number of OS pages retrieved on success. The caller may continue to + * the next data segment. + * Can return the following error codes: + * -EAGAIN to designate a temporary error. The caller will abort handling the + * page fault and resolve it. + * -EFAULT when there's an error mapping the requested pages. The caller will + * abort the page fault handling. 
+ */ +static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, + struct ib_pd *pd, u32 key, + u64 io_virt, size_t bcnt, + u32 *bytes_committed, + u32 *bytes_mapped) +{ + int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0; + struct pf_frame *head = NULL, *frame; + struct mlx5_ib_mkey *mmkey; + struct mlx5_ib_mr *mr; + struct mlx5_klm *pklm; + u32 *out = NULL; + size_t offset; + + io_virt += *bytes_committed; + bcnt -= *bytes_committed; + +next_mr: + xa_lock(&dev->odp_mkeys); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + xa_unlock(&dev->odp_mkeys); + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and only + * some of them are ODP. Treat the non-ODP ones as fully + * faulted. + */ + ret = 0; + goto end; + } + refcount_inc(&mmkey->usecount); + xa_unlock(&dev->odp_mkeys); + + if (!mkey_is_eq(mmkey, key)) { + mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); + ret = -EFAULT; + goto end; + } + + switch (mmkey->type) { + case MLX5_MKEY_MR: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); + if (ret < 0) + goto end; + + mlx5_update_odp_stats(mr, faults, ret); + + npages += ret; + ret = 0; + break; + + case MLX5_MKEY_MW: + case MLX5_MKEY_INDIRECT_DEVX: + if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) { + mlx5_ib_dbg(dev, "indirection level exceeded\n"); + ret = -EFAULT; + goto end; + } + + outlen = MLX5_ST_SZ_BYTES(query_mkey_out) + + sizeof(*pklm) * (mmkey->ndescs - 2); + + if (outlen > cur_outlen) { + kfree(out); + out = kzalloc(outlen, GFP_KERNEL); + if (!out) { + ret = -ENOMEM; + goto end; + } + cur_outlen = outlen; + } + + pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out, + bsf0_klm0_pas_mtt0_1); + + ret = mlx5_core_query_mkey(dev->mdev, mmkey->key, out, outlen); + if (ret) + goto end; + + offset = io_virt - MLX5_GET64(query_mkey_out, out, + memory_key_mkey_entry.start_addr); + + for (i = 0; bcnt && i < mmkey->ndescs; i++, pklm++) { + if (offset >= be32_to_cpu(pklm->bcount)) { + offset -= be32_to_cpu(pklm->bcount); + continue; + } + + frame = kzalloc(sizeof(*frame), GFP_KERNEL); + if (!frame) { + ret = -ENOMEM; + goto end; + } + + frame->key = be32_to_cpu(pklm->key); + frame->io_virt = be64_to_cpu(pklm->va) + offset; + frame->bcnt = min_t(size_t, bcnt, + be32_to_cpu(pklm->bcount) - offset); + frame->depth = depth + 1; + frame->next = head; + head = frame; + + bcnt -= frame->bcnt; + offset = 0; + } + break; + + default: + mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type); + ret = -EFAULT; + goto end; + } + + if (head) { + frame = head; + head = frame->next; + + key = frame->key; + io_virt = frame->io_virt; + bcnt = frame->bcnt; + depth = frame->depth; + kfree(frame); + + mlx5r_deref_odp_mkey(mmkey); + goto next_mr; + } + +end: + if (mmkey) + mlx5r_deref_odp_mkey(mmkey); + while (head) { + frame = head; + head = frame->next; + kfree(frame); + } + kfree(out); + + *bytes_committed = 0; + return ret ? ret : npages; +} + +/* + * Parse a series of data segments for page fault handling. + * + * @dev: Pointer to mlx5 IB device + * @pfault: contains page fault information. + * @wqe: points at the first data segment in the WQE. + * @wqe_end: points after the end of the WQE. + * @bytes_mapped: receives the number of bytes that the function was able to + * map. 
This allows the caller to decide intelligently whether + * enough memory was mapped to resolve the page fault + * successfully (e.g. enough for the next MTU, or the entire + * WQE). + * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus + * the committed bytes). + * @receive_queue: receive WQE end of sg list + * + * Returns the number of pages loaded if positive, zero for an empty WQE, or a + * negative error code. + */ +static int pagefault_data_segments(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + void *wqe, + void *wqe_end, u32 *bytes_mapped, + u32 *total_wqe_bytes, bool receive_queue) +{ + int ret = 0, npages = 0; + u64 io_virt; + u32 key; + u32 byte_count; + size_t bcnt; + int inline_segment; + + if (bytes_mapped) + *bytes_mapped = 0; + if (total_wqe_bytes) + *total_wqe_bytes = 0; + + while (wqe < wqe_end) { + struct mlx5_wqe_data_seg *dseg = wqe; + + io_virt = be64_to_cpu(dseg->addr); + key = be32_to_cpu(dseg->lkey); + byte_count = be32_to_cpu(dseg->byte_count); + inline_segment = !!(byte_count & MLX5_INLINE_SEG); + bcnt = byte_count & ~MLX5_INLINE_SEG; + + if (inline_segment) { + bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; + wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, + 16); + } else { + wqe += sizeof(*dseg); + } + + /* receive WQE end of sg list. */ + if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && + io_virt == 0) + break; + + if (!inline_segment && total_wqe_bytes) { + *total_wqe_bytes += bcnt - min_t(size_t, bcnt, + pfault->bytes_committed); + } + + /* A zero length data segment designates a length of 2GB. */ + if (bcnt == 0) + bcnt = 1U << 31; + + if (inline_segment || bcnt <= pfault->bytes_committed) { + pfault->bytes_committed -= + min_t(size_t, bcnt, + pfault->bytes_committed); + continue; + } + + ret = pagefault_single_data_segment(dev, NULL, key, + io_virt, bcnt, + &pfault->bytes_committed, + bytes_mapped); + if (ret < 0) + break; + npages += ret; + } + + return ret < 0 ? ret : npages; +} + +/* + * Parse initiator WQE. Advances the wqe pointer to point at the + * scatter-gather list, and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_initiator_pfault_handler( + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) +{ + struct mlx5_wqe_ctrl_seg *ctrl = *wqe; + u16 wqe_index = pfault->wqe.wqe_index; + struct mlx5_base_av *av; + unsigned ds, opcode; + u32 qpn = qp->trans_qp.base.mqp.qpn; + + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + if (ds * MLX5_WQE_DS_UNITS > wqe_length) { + mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", + ds, wqe_length); + return -EFAULT; + } + + if (ds == 0) { + mlx5_ib_err(dev, "Got WQE with zero DS. 
wqe_index=%x, qpn=%x\n", + wqe_index, qpn); + return -EFAULT; + } + + *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; + *wqe += sizeof(*ctrl); + + opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & + MLX5_WQE_CTRL_OPCODE_MASK; + + if (qp->type == IB_QPT_XRC_INI) + *wqe += sizeof(struct mlx5_wqe_xrc_seg); + + if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) { + av = *wqe; + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) + *wqe += sizeof(struct mlx5_av); + else + *wqe += sizeof(struct mlx5_base_av); + } + + switch (opcode) { + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + case MLX5_OPCODE_RDMA_READ: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_ATOMIC_CS: + case MLX5_OPCODE_ATOMIC_FA: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + *wqe += sizeof(struct mlx5_wqe_atomic_seg); + break; + } + + return 0; +} + +/* + * Parse responder WQE and set wqe_end to the end of the WQE. + */ +static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev, + struct mlx5_ib_srq *srq, + void **wqe, void **wqe_end, + int wqe_length) +{ + int wqe_size = 1 << srq->msrq.wqe_shift; + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + *wqe_end = *wqe + wqe_size; + *wqe += sizeof(struct mlx5_wqe_srq_next_seg); + + return 0; +} + +static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + void *wqe, void **wqe_end, + int wqe_length) +{ + struct mlx5_ib_wq *wq = &qp->rq; + int wqe_size = 1 << wq->wqe_shift; + + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) { + mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); + return -EFAULT; + } + + if (wqe_size > wqe_length) { + mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); + return -EFAULT; + } + + *wqe_end = wqe + wqe_size; + + return 0; +} + +static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev, + u32 wq_num, int pf_type) +{ + struct mlx5_core_rsc_common *common = NULL; + struct mlx5_core_srq *srq; + + switch (pf_type) { + case MLX5_WQE_PF_TYPE_RMP: + srq = mlx5_cmd_get_srq(dev, wq_num); + if (srq) + common = &srq->common; + break; + case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE: + case MLX5_WQE_PF_TYPE_RESP: + case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC: + common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP); + break; + default: + break; + } + + return common; +} + +static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res) +{ + struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res; + + return to_mibqp(mqp); +} + +static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res) +{ + struct mlx5_core_srq *msrq = + container_of(res, struct mlx5_core_srq, common); + + return to_mibsrq(msrq); +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + bool sq = pfault->type & MLX5_PFAULT_REQUESTOR; + u16 wqe_index = pfault->wqe.wqe_index; + void *wqe, *wqe_start = NULL, *wqe_end = NULL; + u32 bytes_mapped, total_wqe_bytes; + struct mlx5_core_rsc_common *res; + int resume_with_error = 1; + struct mlx5_ib_qp *qp; + size_t bytes_copied; + int ret = 0; + + res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type); + if (!res) { + mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num); + return; + } + + if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ && + res->res != MLX5_RES_XSRQ) { + mlx5_ib_err(dev, "wqe page fault for unsupported 
type %d\n", + pfault->type); + goto resolve_page_fault; + } + + wqe_start = (void *)__get_free_page(GFP_KERNEL); + if (!wqe_start) { + mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); + goto resolve_page_fault; + } + + wqe = wqe_start; + qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL; + if (qp && sq) { + ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); + if (ret) + goto read_user; + ret = mlx5_ib_mr_initiator_pfault_handler( + dev, pfault, qp, &wqe, &wqe_end, bytes_copied); + } else if (qp && !sq) { + ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); + if (ret) + goto read_user; + ret = mlx5_ib_mr_responder_pfault_handler_rq( + dev, qp, wqe, &wqe_end, bytes_copied); + } else if (!qp) { + struct mlx5_ib_srq *srq = res_to_srq(res); + + ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, + &bytes_copied); + if (ret) + goto read_user; + ret = mlx5_ib_mr_responder_pfault_handler_srq( + dev, srq, &wqe, &wqe_end, bytes_copied); + } + + if (ret < 0 || wqe >= wqe_end) + goto resolve_page_fault; + + ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped, + &total_wqe_bytes, !sq); + if (ret == -EAGAIN) + goto out; + + if (ret < 0 || total_wqe_bytes > bytes_mapped) + goto resolve_page_fault; + +out: + ret = 0; + resume_with_error = 0; + +read_user: + if (ret) + mlx5_ib_err( + dev, + "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", + ret, wqe_index, pfault->token); + +resolve_page_fault: + mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", + pfault->wqe.wq_num, resume_with_error, + pfault->type); + mlx5_core_res_put(res); + free_page((unsigned long)wqe_start); +} + +static int pages_in_range(u64 address, u32 length) +{ + return (ALIGN(address + length, PAGE_SIZE) - + (address & PAGE_MASK)) >> PAGE_SHIFT; +} + +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + u64 address; + u32 length; + u32 prefetch_len = pfault->bytes_committed; + int prefetch_activated = 0; + u32 rkey = pfault->rdma.r_key; + int ret; + + /* The RDMA responder handler handles the page fault in two parts. + * First it brings the necessary pages for the current packet + * (and uses the pfault context), and then (after resuming the QP) + * prefetches more pages. The second operation cannot use the pfault + * context and therefore uses the dummy_pfault context allocated on + * the stack */ + pfault->rdma.rdma_va += pfault->bytes_committed; + pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, + pfault->rdma.rdma_op_len); + pfault->bytes_committed = 0; + + address = pfault->rdma.rdma_va; + length = pfault->rdma.rdma_op_len; + + /* For some operations, the hardware cannot tell the exact message + * length, and in those cases it reports zero. Use prefetch + * logic. */ + if (length == 0) { + prefetch_activated = 1; + length = pfault->rdma.packet_size; + prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); + } + + ret = pagefault_single_data_segment(dev, NULL, rkey, address, length, + &pfault->bytes_committed, NULL); + if (ret == -EAGAIN) { + /* We're racing with an invalidation, don't prefetch */ + prefetch_activated = 0; + } else if (ret < 0 || pages_in_range(address, length) > ret) { + mlx5_ib_page_fault_resume(dev, pfault, 1); + if (ret != -ENOENT) + mlx5_ib_dbg(dev, "PAGE FAULT error %d. 
QP 0x%x, type: 0x%x\n", + ret, pfault->token, pfault->type); + return; + } + + mlx5_ib_page_fault_resume(dev, pfault, 0); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + pfault->token, pfault->type, + prefetch_activated); + + /* At this point, there might be a new pagefault already arriving in + * the eq, switch to the dummy pagefault for the rest of the + * processing. We're still OK with the objects being alive as the + * work-queue is being fenced. */ + + if (prefetch_activated) { + u32 bytes_committed = 0; + + ret = pagefault_single_data_segment(dev, NULL, rkey, address, + prefetch_len, + &bytes_committed, NULL); + if (ret < 0 && ret != -EAGAIN) { + mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + ret, pfault->token, address, prefetch_len); + } + } +} + +static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) +{ + u8 event_subtype = pfault->event_subtype; + + switch (event_subtype) { + case MLX5_PFAULT_SUBTYPE_WQE: + mlx5_ib_mr_wqe_pfault_handler(dev, pfault); + break; + case MLX5_PFAULT_SUBTYPE_RDMA: + mlx5_ib_mr_rdma_pfault_handler(dev, pfault); + break; + default: + mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(dev, pfault, 1); + } +} + +static void mlx5_ib_eqe_pf_action(struct work_struct *work) +{ + struct mlx5_pagefault *pfault = container_of(work, + struct mlx5_pagefault, + work); + struct mlx5_ib_pf_eq *eq = pfault->eq; + + mlx5_ib_pfault(eq->dev, pfault); + mempool_free(pfault, eq->pool); +} + +static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) +{ + struct mlx5_eqe_page_fault *pf_eqe; + struct mlx5_pagefault *pfault; + struct mlx5_eqe *eqe; + int cc = 0; + + while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) { + pfault = mempool_alloc(eq->pool, GFP_ATOMIC); + if (!pfault) { + schedule_work(&eq->work); + break; + } + + pf_eqe = &eqe->data.page_fault; + pfault->event_subtype = eqe->sub_type; + pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); + + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", + eqe->sub_type, pfault->bytes_committed); + + switch (eqe->sub_type) { + case MLX5_PFAULT_SUBTYPE_RDMA: + /* RDMA based event */ + pfault->type = + be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; + pfault->token = + be32_to_cpu(pf_eqe->rdma.pftype_token) & + MLX5_24BIT_MASK; + pfault->rdma.r_key = + be32_to_cpu(pf_eqe->rdma.r_key); + pfault->rdma.packet_size = + be16_to_cpu(pf_eqe->rdma.packet_length); + pfault->rdma.rdma_op_len = + be32_to_cpu(pf_eqe->rdma.rdma_op_len); + pfault->rdma.rdma_va = + be64_to_cpu(pf_eqe->rdma.rdma_va); + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", + pfault->type, pfault->token, + pfault->rdma.r_key); + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", + pfault->rdma.rdma_op_len, + pfault->rdma.rdma_va); + break; + + case MLX5_PFAULT_SUBTYPE_WQE: + /* WQE based event */ + pfault->type = + (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; + pfault->token = + be32_to_cpu(pf_eqe->wqe.token); + pfault->wqe.wq_num = + be32_to_cpu(pf_eqe->wqe.pftype_wq) & + MLX5_24BIT_MASK; + pfault->wqe.wqe_index = + be16_to_cpu(pf_eqe->wqe.wqe_index); + pfault->wqe.packet_size = + be16_to_cpu(pf_eqe->wqe.packet_length); + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", + pfault->type, pfault->token, + pfault->wqe.wq_num, + 
pfault->wqe.wqe_index); + break; + + default: + mlx5_ib_warn(eq->dev, + "Unsupported page fault event sub-type: 0x%02hhx\n", + eqe->sub_type); + /* Unsupported page faults should still be + * resolved by the page fault handler + */ + } + + pfault->eq = eq; + INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action); + queue_work(eq->wq, &pfault->work); + + cc = mlx5_eq_update_cc(eq->core, ++cc); + } + + mlx5_eq_update_ci(eq->core, cc, 1); +} + +static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type, + void *data) +{ + struct mlx5_ib_pf_eq *eq = + container_of(nb, struct mlx5_ib_pf_eq, irq_nb); + unsigned long flags; + + if (spin_trylock_irqsave(&eq->lock, flags)) { + mlx5_ib_eq_pf_process(eq); + spin_unlock_irqrestore(&eq->lock, flags); + } else { + schedule_work(&eq->work); + } + + return IRQ_HANDLED; +} + +/* mempool_refill() was proposed but unfortunately wasn't accepted + * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html + * Cheap workaround. + */ +static void mempool_refill(mempool_t *pool) +{ + while (pool->curr_nr < pool->min_nr) + mempool_free(mempool_alloc(pool, GFP_KERNEL), pool); +} + +static void mlx5_ib_eq_pf_action(struct work_struct *work) +{ + struct mlx5_ib_pf_eq *eq = + container_of(work, struct mlx5_ib_pf_eq, work); + + mempool_refill(eq->pool); + + spin_lock_irq(&eq->lock); + mlx5_ib_eq_pf_process(eq); + spin_unlock_irq(&eq->lock); +} + +enum { + MLX5_IB_NUM_PF_EQE = 0x1000, + MLX5_IB_NUM_PF_DRAIN = 64, +}; + +int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) +{ + struct mlx5_eq_param param = {}; + int err = 0; + + mutex_lock(&dev->odp_eq_mutex); + if (eq->core) { + mutex_unlock(&dev->odp_eq_mutex); + return 0; + } + INIT_WORK(&eq->work, mlx5_ib_eq_pf_action); + spin_lock_init(&eq->lock); + eq->dev = dev; + + eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN, + sizeof(struct mlx5_pagefault)); + if (!eq->pool) { + err = -ENOMEM; + goto unlock; + } + + eq->wq = alloc_workqueue("mlx5_ib_page_fault", + WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, + MLX5_NUM_CMD_EQE); + if (!eq->wq) { + err = -ENOMEM; + goto err_mempool; + } + + eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int; + param = (struct mlx5_eq_param) { + .nent = MLX5_IB_NUM_PF_EQE, + }; + param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT; + eq->core = mlx5_eq_create_generic(dev->mdev, ¶m); + if (IS_ERR(eq->core)) { + err = PTR_ERR(eq->core); + goto err_wq; + } + err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb); + if (err) { + mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err); + goto err_eq; + } + + mutex_unlock(&dev->odp_eq_mutex); + return 0; +err_eq: + mlx5_eq_destroy_generic(dev->mdev, eq->core); +err_wq: + eq->core = NULL; + destroy_workqueue(eq->wq); +err_mempool: + mempool_destroy(eq->pool); +unlock: + mutex_unlock(&dev->odp_eq_mutex); + return err; +} + +static int +mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) +{ + int err; + + if (!eq->core) + return 0; + mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb); + err = mlx5_eq_destroy_generic(dev->mdev, eq->core); + cancel_work_sync(&eq->work); + destroy_workqueue(eq->wq); + mempool_destroy(eq->pool); + + return err; +} + +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) +{ + if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return; + + switch (ent->order - 2) { + case MLX5_IMR_MTT_CACHE_ENTRY: + ent->page = PAGE_SHIFT; + ent->xlt = MLX5_IMR_MTT_ENTRIES * + sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = 
MLX5_MKC_ACCESS_MODE_MTT; + ent->limit = 0; + break; + + case MLX5_IMR_KSM_CACHE_ENTRY: + ent->page = MLX5_KSM_PAGE_SHIFT; + ent->xlt = mlx5_imr_ksm_entries * + sizeof(struct mlx5_klm) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; + ent->limit = 0; + break; + } +} + +static const struct ib_device_ops mlx5_ib_dev_odp_ops = { + .advise_mr = mlx5_ib_advise_mr, +}; + +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) +{ + int ret = 0; + + internal_fill_odp_caps(dev); + + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) + return ret; + + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); + + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { + ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); + if (ret) { + mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); + return ret; + } + } + + mutex_init(&dev->odp_eq_mutex); + return ret; +} + +void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) +{ + if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) + return; + + mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq); +} + +int mlx5_ib_odp_init(void) +{ + mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - + MLX5_IMR_MTT_BITS); + + return 0; +} + +struct prefetch_mr_work { + struct work_struct work; + u32 pf_flags; + u32 num_sge; + struct { + u64 io_virt; + struct mlx5_ib_mr *mr; + size_t length; + } frags[]; +}; + +static void destroy_prefetch_work(struct prefetch_mr_work *work) +{ + u32 i; + + for (i = 0; i < work->num_sge; ++i) + mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey); + + kvfree(work); +} + +static struct mlx5_ib_mr * +get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 lkey) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct mlx5_ib_mkey *mmkey; + + xa_lock(&dev->odp_mkeys); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey)); + if (!mmkey || mmkey->key != lkey) { + mr = ERR_PTR(-ENOENT); + goto end; + } + if (mmkey->type != MLX5_MKEY_MR) { + mr = ERR_PTR(-EINVAL); + goto end; + } + + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + + if (mr->ibmr.pd != pd) { + mr = ERR_PTR(-EPERM); + goto end; + } + + /* prefetch with write-access must be supported by the MR */ + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && + !mr->umem->writable) { + mr = ERR_PTR(-EPERM); + goto end; + } + + refcount_inc(&mmkey->usecount); +end: + xa_unlock(&dev->odp_mkeys); + return mr; +} + +static void mlx5_ib_prefetch_mr_work(struct work_struct *w) +{ + struct prefetch_mr_work *work = + container_of(w, struct prefetch_mr_work, work); + u32 bytes_mapped = 0; + int ret; + u32 i; + + /* We rely on IB/core that work is executed if we have num_sge != 0 only. 
*/ + WARN_ON(!work->num_sge); + for (i = 0; i < work->num_sge; ++i) { + ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, + work->frags[i].length, &bytes_mapped, + work->pf_flags); + if (ret <= 0) + continue; + mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); + } + + destroy_prefetch_work(work); +} + +static int init_prefetch_work(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct prefetch_mr_work *work, + struct ib_sge *sg_list, u32 num_sge) +{ + u32 i; + + INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); + work->pf_flags = pf_flags; + + for (i = 0; i < num_sge; ++i) { + struct mlx5_ib_mr *mr; + + mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (IS_ERR(mr)) { + work->num_sge = i; + return PTR_ERR(mr); + } + work->frags[i].io_virt = sg_list[i].addr; + work->frags[i].length = sg_list[i].length; + work->frags[i].mr = mr; + } + work->num_sge = num_sge; + return 0; +} + +static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct ib_sge *sg_list, + u32 num_sge) +{ + u32 bytes_mapped = 0; + int ret = 0; + u32 i; + + for (i = 0; i < num_sge; ++i) { + struct mlx5_ib_mr *mr; + + mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (IS_ERR(mr)) + return PTR_ERR(mr); + ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, + &bytes_mapped, pf_flags); + if (ret < 0) { + mlx5r_deref_odp_mkey(&mr->mmkey); + return ret; + } + mlx5_update_odp_stats(mr, prefetch, ret); + mlx5r_deref_odp_mkey(&mr->mmkey); + } + + return 0; +} + +int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge) +{ + u32 pf_flags = 0; + struct prefetch_mr_work *work; + int rc; + + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) + pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; + + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) + pf_flags |= MLX5_PF_FLAGS_SNAPSHOT; + + if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) + return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list, + num_sge); + + work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL); + if (!work) + return -ENOMEM; + + rc = init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge); + if (rc) { + destroy_prefetch_work(work); + return rc; + } + queue_work(system_unbound_wq, &work->work); + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qos.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qos.c new file mode 100644 index 0000000..dce9255 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qos.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +static bool pp_is_supported(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + return (MLX5_CAP_GEN(dev->mdev, qos) && + MLX5_CAP_QOS(dev->mdev, packet_pacing) && + MLX5_CAP_QOS(dev->mdev, packet_pacing_uid)); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_PP_OBJ_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {}; + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE); + struct mlx5_ib_dev *dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_pp *pp_entry; + void *in_ctx; + u16 uid; + int inlen; + u32 flags; + int err; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + + /* The allocated entry can be used only by a DEVX context */ + if (!c->devx_uid) + return -EINVAL; + + dev = to_mdev(c->ibucontext.device); + pp_entry = kzalloc(sizeof(*pp_entry), GFP_KERNEL); + if (!pp_entry) + return -ENOMEM; + + in_ctx = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX); + inlen = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX); + memcpy(rl_raw, in_ctx, inlen); + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS, + MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX); + if (err) + goto err; + + uid = (flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX) ? + c->devx_uid : MLX5_SHARED_RESOURCE_UID; + + err = mlx5_rl_add_rate_raw(dev->mdev, rl_raw, uid, + (flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX), + &pp_entry->index); + if (err) + goto err; + + pp_entry->mdev = dev->mdev; + uobj->object = pp_entry; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, + &pp_entry->index, sizeof(pp_entry->index)); + return err; + +err: + kfree(pp_entry); + return err; +} + +static int pp_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_pp *pp_entry = uobject->object; + + mlx5_rl_remove_rate_raw(pp_entry->mdev, pp_entry->index); + kfree(pp_entry); + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_PP_OBJ_ALLOC, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE, + MLX5_IB_OBJECT_PP, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX, + UVERBS_ATTR_SIZE(1, + MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS, + enum mlx5_ib_uapi_pp_alloc_flags, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_PP_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_PP, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_PP, + UVERBS_TYPE_ALLOC_IDR(pp_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_ALLOC), + &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_DESTROY)); + + +const struct uapi_definition mlx5_ib_qos_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_PP, + UAPI_DEF_IS_OBJ_SUPPORTED(pp_is_supported)), + {}, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.c new file mode 100644 index 0000000..d35206d --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.c @@ -0,0 +1,5958 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "mlx5_ib_ext.h" +#include "ib_rep.h" +#include "counters.h" +#include "cmd.h" +#include "qp.h" +#include "wr.h" + +enum { + MLX5_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX5_IB_LINK_TYPE_IB = 0, + MLX5_IB_LINK_TYPE_ETH = 1 +}; + +enum raw_qp_set_mask_map { + MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID = 1UL << 0, + MLX5_RAW_QP_RATE_LIMIT = 1UL << 1, +}; + +struct mlx5_modify_raw_qp_param { + u16 operation; + + u32 set_mask; /* raw_qp_set_mask_map */ + + struct mlx5_rate_limit rl; + + u8 rq_q_ctr_id; + u32 port; +}; + +struct mlx5_ib_sqd { + struct mlx5_ib_qp *qp; + struct work_struct work; +}; + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state, + const struct mlx5_ib_modify_qp *ucmd, + struct mlx5_ib_modify_qp_resp *resp, + struct ib_udata *udata); + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); + +static int is_qp0(enum ib_qp_type qp_type) +{ + return qp_type == IB_QPT_SMI; +} + +static int is_sqp(enum ib_qp_type qp_type) +{ + return is_qp0(qp_type) || is_qp1(qp_type); +} + +/** + * mlx5_ib_read_user_wqe_common() - Copy a WQE (or part of) from user WQ + * to kernel buffer + * + * @umem: User space memory where the WQ is + * @buffer: buffer to copy to + * @buflen: buffer length + * @wqe_index: index of WQE to copy from + * @wq_offset: offset to start of WQ + * @wq_wqe_cnt: number of WQEs in WQ + * @wq_wqe_shift: log2 of WQE size + * @bcnt: number of bytes to copy + * @bytes_copied: number of bytes to copy (return value) + * + * Copies from start of WQE bcnt or less bytes. + * Does not gurantee to copy the entire WQE. + * + * Return: zero on success, or an error code. 
+ */ +static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer, + size_t buflen, int wqe_index, + int wq_offset, int wq_wqe_cnt, + int wq_wqe_shift, int bcnt, + size_t *bytes_copied) +{ + size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift); + size_t wq_end = wq_offset + (wq_wqe_cnt << wq_wqe_shift); + size_t copy_length; + int ret; + + /* don't copy more than requested, more than buffer length or + * beyond WQ end + */ + copy_length = min_t(u32, buflen, wq_end - offset); + copy_length = min_t(u32, copy_length, bcnt); + + ret = ib_umem_copy_from(buffer, umem, offset, copy_length); + if (ret) + return ret; + + if (!ret && bytes_copied) + *bytes_copied = copy_length; + + return 0; +} + +static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) +{ + struct mlx5_wqe_ctrl_seg *ctrl; + size_t bytes_copied = 0; + size_t wqe_length; + void *p; + int ds; + + wqe_index = wqe_index & qp->sq.fbc.sz_m1; + + /* read the control segment first */ + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index); + ctrl = p; + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + wqe_length = ds * MLX5_WQE_DS_UNITS; + + /* read rest of WQE if it spreads over more than one stride */ + while (bytes_copied < wqe_length) { + size_t copy_length = + min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB); + + if (!copy_length) + break; + + memcpy(buffer + bytes_copied, p, copy_length); + bytes_copied += copy_length; + + wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1; + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index); + } + *bc = bytes_copied; + return 0; +} + +static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + struct mlx5_ib_wq *wq = &qp->sq; + struct mlx5_wqe_ctrl_seg *ctrl; + size_t bytes_copied; + size_t bytes_copied2; + size_t wqe_length; + int ret; + int ds; + + /* at first read as much as possible */ + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, + wq->offset, wq->wqe_cnt, + wq->wqe_shift, buflen, + &bytes_copied); + if (ret) + return ret; + + /* we need at least control segment size to proceed */ + if (bytes_copied < sizeof(*ctrl)) + return -EINVAL; + + ctrl = buffer; + ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; + wqe_length = ds * MLX5_WQE_DS_UNITS; + + /* if we copied enough then we are done */ + if (bytes_copied >= wqe_length) { + *bc = bytes_copied; + return 0; + } + + /* otherwise this a wrapped around wqe + * so read the remaining bytes starting + * from wqe_index 0 + */ + ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied, + buflen - bytes_copied, 0, wq->offset, + wq->wqe_cnt, wq->wqe_shift, + wqe_length - bytes_copied, + &bytes_copied2); + + if (ret) + return ret; + *bc = bytes_copied + bytes_copied2; + return 0; +} + +int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + + if (buflen < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (!umem) + return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer, + buflen, bc); + + return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc); +} + +static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, + void *buffer, size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base 
*base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + struct mlx5_ib_wq *wq = &qp->rq; + size_t bytes_copied; + int ret; + + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, + wq->offset, wq->wqe_cnt, + wq->wqe_shift, buflen, + &bytes_copied); + + if (ret) + return ret; + *bc = bytes_copied; + return 0; +} + +int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct ib_umem *umem = base->ubuffer.umem; + struct mlx5_ib_wq *wq = &qp->rq; + size_t wqe_size = 1 << wq->wqe_shift; + + if (buflen < wqe_size) + return -EINVAL; + + if (!umem) + return -EOPNOTSUPP; + + return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc); +} + +static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, + void *buffer, size_t buflen, size_t *bc) +{ + struct ib_umem *umem = srq->umem; + size_t bytes_copied; + int ret; + + ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0, + srq->msrq.max, srq->msrq.wqe_shift, + buflen, &bytes_copied); + + if (ret) + return ret; + *bc = bytes_copied; + return 0; +} + +int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, + size_t buflen, size_t *bc) +{ + struct ib_umem *umem = srq->umem; + size_t wqe_size = 1 << srq->msrq.wqe_shift; + + if (buflen < wqe_size) + return -EINVAL; + + if (!umem) + return -EOPNOTSUPP; + + return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc); +} + +static int query_wqe_idx(struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + u32 *outb; + void *qpc; + int ret; + + mutex_lock(&qp->mutex); + outb = kzalloc(outlen, GFP_KERNEL); + if (!outb) { + ret = -ENOMEM; + goto out; + } + ret = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, + outlen); + if (ret) + goto out_free; + + qpc = MLX5_ADDR_OF(query_qp_out, outb, qpc); + mutex_unlock(&qp->mutex); + + return MLX5_GET(qpc, qpc, hw_sq_wqebb_counter) & (qp->sq.wqe_cnt - 1); + +out_free: + kfree(outb); +out: + mutex_unlock(&qp->mutex); + return ret; +} + +static int mlx5_handle_sig_pipelining(struct mlx5_ib_qp *qp) +{ + int wqe_idx, ret = 0; + + wqe_idx = query_wqe_idx(qp); + if (wqe_idx < 0) { + ret = wqe_idx; + pr_err("Failed to query QP 0x%x wqe index\n", + qp->trans_qp.base.mqp.qpn); + goto out; + } + + if (qp->sq.wr_data[wqe_idx] == MLX5_IB_WR_SIG_PIPED) { + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + struct mlx5_wqe_ctrl_seg *cwqe; + + cwqe = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_idx); + cwqe->opmod_idx_opcode = cpu_to_be32( + (be32_to_cpu(cwqe->opmod_idx_opcode) & 0xffffff00) | + MLX5_OPCODE_NOP); + qp->sq.wr_data[wqe_idx] = MLX5_IB_WR_SIG_CANCELED; + mlx5_ib_dbg(dev, "Cancel QP 0x%x wqe_index 0x%x\n", + qp->trans_qp.base.mqp.qpn, wqe_idx); + } +out: + return ret; +} + +static void mlx5_ib_sqd_work(struct work_struct *work) +{ + struct mlx5_ib_sqd *sqd; + struct mlx5_ib_qp *qp; + struct ib_qp_attr qp_attr; + struct mlx5_ib_modify_qp_resp resp = {}; + + sqd = container_of(work, struct mlx5_ib_sqd, work); + qp = sqd->qp; + + if (mlx5_handle_sig_pipelining(qp)) + goto out; + + mutex_lock(&qp->mutex); + + if (__mlx5_ib_modify_qp(&qp->ibqp, &qp_attr, 0, IB_QPS_SQD, IB_QPS_RTS, + NULL, &resp, NULL)) + + pr_err("Failed to resume QP 0x%x\n", qp->trans_qp.base.mqp.qpn); + mutex_unlock(&qp->mutex); +out: + kfree(sqd); +} + +static void mlx5_ib_sigerr_sqd_event(struct mlx5_ib_qp *qp) +{ + 
struct mlx5_ib_sqd *sqd; + + sqd = kzalloc(sizeof(*sqd), GFP_ATOMIC); + if (!sqd) + return; + + sqd->qp = qp; + INIT_WORK(&sqd->work, mlx5_ib_sqd_work); + queue_work(mlx5_ib_sigerr_sqd_wq, &sqd->work); +} + +/** + * mlx5_ib_qp_event() - Raise an IB event on dedicated qp. + * + * @qp: low level QP. + * @event_info: holds event information such as event type and error type. + * + * In case that dedicated IB QP assigned an event handler, raise the incoming event. + */ +static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int event_info) +{ + struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct ib_event event; + u8 type = event_info & 0xff; + u8 error_type = (event_info >> 8) & 0xff; + + if (type == MLX5_EVENT_TYPE_SQ_DRAINED && + to_mibqp(qp)->flags & IB_QP_CREATE_SIGNATURE_PIPELINE && + to_mibqp(qp)->state != IB_QPS_SQD) { + mlx5_ib_sigerr_sqd_event(to_mibqp(qp)); + return; + } + + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; + } + + if (ibqp->event_handler) { + event.device = ibqp->device; + event.element.qp = ibqp; + switch (type) { + case MLX5_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX5_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX5_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + case MLX5_EVENT_TYPE_XRQ_ERROR: + switch (error_type) { + case MLX5_XRQ_ERROR_TYPE_QP_ERROR: + event.event = IB_EXP_EVENT_XRQ_QP_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d error type %d on QP %06x\n", + type, error_type, qp->qpn); + return; + } + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); + return; + } + + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, + int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd) +{ + int wqe_size; + int wq_size; + + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) + return -EINVAL; + + if (!has_rq) { + qp->rq.max_gs = 0; + qp->rq.wqe_cnt = 0; + qp->rq.wqe_shift = 0; + cap->max_recv_wr = 0; + cap->max_recv_sge = 0; + } else { + int wq_sig = !!(qp->flags_en & MLX5_QP_FLAG_SIGNATURE); + + if (ucmd) { + qp->rq.wqe_cnt = ucmd->rq_wqe_count; + if (ucmd->rq_wqe_shift > BITS_PER_BYTE * sizeof(ucmd->rq_wqe_shift)) + return -EINVAL; + qp->rq.wqe_shift = ucmd->rq_wqe_shift; + if ((1 << qp->rq.wqe_shift) / + sizeof(struct mlx5_wqe_data_seg) < + wq_sig) + return -EINVAL; + qp->rq.max_gs = + (1 << qp->rq.wqe_shift) / + sizeof(struct mlx5_wqe_data_seg) - + wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } else { + wqe_size = + wq_sig ? 
sizeof(struct mlx5_wqe_signature_seg) : + 0; + wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); + wqe_size = roundup_pow_of_two(wqe_size); + wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; + wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB); + qp->rq.wqe_cnt = wq_size / wqe_size; + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) { + mlx5_ib_dbg(dev, "wqe_size %d, max %d\n", + wqe_size, + MLX5_CAP_GEN(dev->mdev, + max_wqe_sz_rq)); + return -EINVAL; + } + qp->rq.wqe_shift = ilog2(wqe_size); + qp->rq.max_gs = + (1 << qp->rq.wqe_shift) / + sizeof(struct mlx5_wqe_data_seg) - + wq_sig; + qp->rq.max_post = qp->rq.wqe_cnt; + } + } + + return 0; +} + +static int sq_overhead(struct ib_qp_init_attr *attr) +{ + int size = 0; + + switch (attr->qp_type) { + case IB_QPT_XRC_INI: + size += sizeof(struct mlx5_wqe_xrc_seg); + fallthrough; + case IB_QPT_RC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + max(sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / + MLX5_IB_UMR_OCTOWORD); + break; + + case IB_QPT_XRC_TGT: + return 0; + + case IB_QPT_UC: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + max(sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); + break; + + case IB_QPT_UD: + if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + size += sizeof(struct mlx5_wqe_eth_pad) + + sizeof(struct mlx5_wqe_eth_seg); + fallthrough; + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_datagram_seg); + break; + + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + + case MLX5_IB_QPT_SW_CNAK: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_mlx_seg); + break; + + default: + return -EINVAL; + } + + return size; +} + +static int calc_send_wqe(struct ib_qp_init_attr *attr) +{ + int inl_size = 0; + int size; + + size = sq_overhead(attr); + if (size < 0) + return size; + + if (attr->cap.max_inline_data) { + inl_size = size + sizeof(struct mlx5_wqe_inline_seg) + + attr->cap.max_inline_data; + } + + size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; + else + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); +} + +static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size) +{ + int max_sge; + + if (attr->qp_type == IB_QPT_RC) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else if (attr->qp_type == IB_QPT_XRC_INI) + max_sge = (min_t(int, wqe_size, 512) - + sizeof(struct mlx5_wqe_ctrl_seg) - + sizeof(struct mlx5_wqe_xrc_seg) - + sizeof(struct mlx5_wqe_raddr_seg)) / + sizeof(struct mlx5_wqe_data_seg); + else + max_sge = (wqe_size - sq_overhead(attr)) / + sizeof(struct mlx5_wqe_data_seg); + + return min_t(int, max_sge, wqe_size - sq_overhead(attr) / + sizeof(struct mlx5_wqe_data_seg)); +} + +static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp) +{ + int wqe_size; + int wq_size; + + if (!attr->cap.max_send_wr) + return 0; + + wqe_size = calc_send_wqe(attr); + mlx5_ib_dbg(dev, "wqe_size %d\n", 
wqe_size); + if (wqe_size < 0) + return wqe_size; + + if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + qp->max_inline_data = wqe_size - sq_overhead(attr) - + sizeof(struct mlx5_wqe_inline_seg); + attr->cap.max_inline_data = qp->max_inline_data; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); + qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_dbg(dev, "send queue size (%d * %d / %d -> %d) exceeds limits(%d)\n", + attr->cap.max_send_wr, wqe_size, MLX5_SEND_WQE_BB, + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -ENOMEM; + } + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.max_gs = get_send_sge(attr, wqe_size); + if (qp->sq.max_gs < attr->cap.max_send_sge) + return -ENOMEM; + + attr->cap.max_send_sge = qp->sq.max_gs; + qp->sq.max_post = wq_size / wqe_size; + attr->cap.max_send_wr = qp->sq.max_post; + + return wq_size; +} + +static int set_user_buf_size(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_create_qp *ucmd, + struct mlx5_ib_qp_base *base, + struct ib_qp_init_attr *attr) +{ + int desc_sz = 1 << qp->sq.wqe_shift; + + if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { + mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n", + desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + return -EINVAL; + } + + if (ucmd->sq_wqe_count && !is_power_of_2(ucmd->sq_wqe_count)) { + mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n", + ucmd->sq_wqe_count); + return -EINVAL; + } + + qp->sq.wqe_cnt = ucmd->sq_wqe_count; + + if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { + mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + return -EINVAL; + } + + if (attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & IB_QP_CREATE_SOURCE_QPN) { + base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; + } else { + base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + } + + return 0; +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || + attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || + !attr->cap.max_recv_wr) + return 0; + + return 1; +} + +enum { + /* this is the first blue flame register in the array of bfregs assigned + * to a processes. Since we do not use it for blue flame but rather + * regular 64 bit doorbells, we do not need a lock for maintaiing + * "odd/even" order + */ + NUM_NON_BLUE_FLAME_BFREGS = 1, +}; + +static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) +{ + return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; +} + +static int num_med_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int n; + + n = max_bfregs(dev, bfregi) - bfregi->num_low_latency_bfregs - + NUM_NON_BLUE_FLAME_BFREGS; + + return n >= 0 ? n : 0; +} + +static int first_med_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + return num_med_bfreg(dev, bfregi) ? 
1 : -ENOMEM; +} + +static int first_hi_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int med; + + med = num_med_bfreg(dev, bfregi); + return ++med; +} + +static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int i; + + for (i = first_hi_bfreg(dev, bfregi); i < max_bfregs(dev, bfregi); i++) { + if (!bfregi->count[i]) { + bfregi->count[i]++; + return i; + } + } + + return -ENOMEM; +} + +static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int minidx = first_med_bfreg(dev, bfregi); + int i; + + if (minidx < 0) + return minidx; + + for (i = minidx; i < first_hi_bfreg(dev, bfregi); i++) { + if (bfregi->count[i] < bfregi->count[minidx]) + minidx = i; + if (!bfregi->count[minidx]) + break; + } + + bfregi->count[minidx]++; + return minidx; +} + +static int alloc_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + int bfregn = -ENOMEM; + + if (bfregi->lib_uar_dyn) + return -EINVAL; + + mutex_lock(&bfregi->lock); + if (bfregi->ver >= 2) { + bfregn = alloc_high_class_bfreg(dev, bfregi); + if (bfregn < 0) + bfregn = alloc_med_class_bfreg(dev, bfregi); + } + + if (bfregn < 0) { + BUILD_BUG_ON(NUM_NON_BLUE_FLAME_BFREGS != 1); + bfregn = 0; + bfregi->count[bfregn]++; + } + mutex_unlock(&bfregi->lock); + + return bfregn; +} + +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) +{ + mutex_lock(&bfregi->lock); + bfregi->count[bfregn]--; + mutex_unlock(&bfregi->lock); +} + +static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX5_QP_STATE_RST; + case IB_QPS_INIT: return MLX5_QP_STATE_INIT; + case IB_QPS_RTR: return MLX5_QP_STATE_RTR; + case IB_QPS_RTS: return MLX5_QP_STATE_RTS; + case IB_QPS_SQD: return MLX5_QP_STATE_SQD; + case IB_QPS_SQE: return MLX5_QP_STATE_SQER; + case IB_QPS_ERR: return MLX5_QP_STATE_ERR; + default: return -1; + } +} + +static int to_mlx5_st(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_RC: return MLX5_QP_ST_RC; + case IB_QPT_UC: return MLX5_QP_ST_UC; + case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; + case MLX5_IB_QPT_SW_CNAK: return MLX5_QP_ST_SW_CNAK; + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; + case IB_QPT_SMI: return MLX5_QP_ST_QP0; + case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_DCI: return MLX5_QP_ST_DCI; + case IB_QPT_RAW_PACKET: return MLX5_QP_ST_RAW_ETHERTYPE; + default: return -EINVAL; + } +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq); + +int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, u32 bfregn, + bool dyn_bfreg) +{ + unsigned int bfregs_per_sys_page; + u32 index_of_sys_page; + u32 offset; + + if (bfregi->lib_uar_dyn) + return -EINVAL; + + bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * + MLX5_NON_FP_BFREGS_PER_UAR; + index_of_sys_page = bfregn / bfregs_per_sys_page; + + if (dyn_bfreg) { + index_of_sys_page += bfregi->num_static_sys_pages; + + if (index_of_sys_page >= bfregi->num_sys_pages) + return -EINVAL; + + if (bfregn > bfregi->num_dyn_bfregs || + bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_dbg(dev, "Invalid dynamic uar index\n"); + return -EINVAL; + } + } + + offset = bfregn % bfregs_per_sys_page / 
MLX5_NON_FP_BFREGS_PER_UAR; + return bfregi->sys_pages[index_of_sys_page] + offset; +} + +static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq, struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *context = + rdma_udata_to_drv_context( + udata, + struct mlx5_ib_ucontext, + ibucontext); + + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_DELAY_DROP) + atomic_dec(&dev->delay_drop.rqs_cnt); + + mlx5_ib_db_unmap_user(context, &rwq->db); + ib_umem_release(rwq->umem); +} + +static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_udata *udata, struct mlx5_ib_rwq *rwq, + struct mlx5_ib_create_wq *ucmd) +{ + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + unsigned long page_size = 0; + u32 offset = 0; + int err; + + if (!ucmd->buf_addr) + return -EINVAL; + + rwq->umem = ib_umem_get_peer(&dev->ib_dev, ucmd->buf_addr, + rwq->buf_size, 0, 0); + if (IS_ERR(rwq->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(rwq->umem); + return err; + } + + page_size = mlx5_umem_find_best_quantized_pgoff( + rwq->umem, wq, log_wq_pg_sz, MLX5_ADAPTER_PAGE_SHIFT, + page_offset, 64, &rwq->rq_page_offset); + if (!page_size) { + mlx5_ib_warn(dev, "bad offset\n"); + err = -EINVAL; + goto err_umem; + } + + rwq->rq_num_pas = ib_umem_num_dma_blocks(rwq->umem, page_size); + rwq->page_shift = order_base_2(page_size); + rwq->log_page_size = rwq->page_shift - MLX5_ADAPTER_PAGE_SHIFT; + rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE); + + mlx5_ib_dbg( + dev, + "addr 0x%llx, size %zd, npages %zu, page_size %ld, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, rwq->buf_size, + ib_umem_num_pages(rwq->umem), page_size, rwq->rq_num_pas, + offset); + + err = mlx5_ib_db_map_user(ucontext, ucmd->db_addr, &rwq->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_umem; + } + + return 0; + +err_umem: + ib_umem_release(rwq->umem); + return err; +} + +static int adjust_bfregn(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, int bfregn) +{ + return bfregn / MLX5_NON_FP_BFREGS_PER_UAR * MLX5_BFREGS_PER_UAR + + bfregn % MLX5_NON_FP_BFREGS_PER_UAR; +} + +static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, u32 **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_ib_ucontext *context; + struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; + unsigned int page_offset_quantized = 0; + unsigned long page_size = 0; + int uar_index = 0; + int bfregn; + int ncont = 0; + __be64 *pas; + void *qpc; + int err; + u16 uid; + u32 uar_flags; + + context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext, + ibucontext); + uar_flags = qp->flags_en & + (MLX5_QP_FLAG_UAR_PAGE_INDEX | MLX5_QP_FLAG_BFREG_INDEX); + switch (uar_flags) { + case MLX5_QP_FLAG_UAR_PAGE_INDEX: + uar_index = ucmd->bfreg_index; + bfregn = MLX5_IB_INVALID_BFREG; + break; + case MLX5_QP_FLAG_BFREG_INDEX: + uar_index = bfregn_to_uar_index(dev, &context->bfregi, + ucmd->bfreg_index, true); + if (uar_index < 0) + return uar_index; + bfregn = MLX5_IB_INVALID_BFREG; + break; + case 0: + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) + return -EINVAL; + bfregn = alloc_bfreg(dev, &context->bfregi); + if (bfregn < 0) + return bfregn; + break; + default: + return -EINVAL; + } + + mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, 
uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn, + false); + + qp->rq.offset = 0; + qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + + err = set_user_buf_size(dev, qp, ucmd, base, attr); + if (err) + goto err_bfreg; + + if (ucmd->buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd->buf_addr; + ubuffer->umem = + ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr, + ubuffer->buf_size, 0, 0); + if (IS_ERR(ubuffer->umem)) { + err = PTR_ERR(ubuffer->umem); + goto err_bfreg; + } + page_size = mlx5_umem_find_best_quantized_pgoff( + ubuffer->umem, qpc, log_page_size, + MLX5_ADAPTER_PAGE_SHIFT, page_offset, 64, + &page_offset_quantized); + if (!page_size) { + err = -EINVAL; + goto err_umem; + } + ncont = ib_umem_num_dma_blocks(ubuffer->umem, page_size); + } else { + ubuffer->umem = NULL; + } + + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * ncont; + *in = kvzalloc(*inlen, GFP_KERNEL); + if (!*in) { + err = -ENOMEM; + goto err_umem; + } + + uid = (attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0; + MLX5_SET(create_qp_in, *in, uid, uid); + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); + if (ubuffer->umem) { + mlx5_ib_populate_pas(ubuffer->umem, page_size, pas, 0); + MLX5_SET(qpc, qpc, log_page_size, + order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, page_offset, page_offset_quantized); + } + MLX5_SET(qpc, qpc, uar_page, uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); + else + resp->bfreg_index = MLX5_IB_INVALID_BFREG; + qp->bfregn = bfregn; + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_free; + } + + return 0; + +err_free: + kvfree(*in); + +err_umem: + ib_umem_release(ubuffer->umem); + +err_bfreg: + if (bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); + return err; +} + +static void destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_ib_qp_base *base, struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + if (udata) { + /* User QP */ + mlx5_ib_db_unmap_user(context, &qp->db); + ib_umem_release(base->ubuffer.umem); + + /* + * Free only the BFREGs which are handled by the kernel. + * BFREGs of UARs allocated dynamically are handled by user. 
+ */ + if (qp->bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); + return; + } + + /* Kernel QP */ + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); + if (qp->db.db) + mlx5_db_free(dev->mdev, &qp->db); + if (qp->buf.frags) + mlx5_frag_buf_free(dev->mdev, &qp->buf); +} + +static int _create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, u32 **in, int *inlen, + struct mlx5_ib_qp_base *base) +{ + int uar_index; + void *qpc; + int err; + + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + qp->bf.bfreg = &dev->fp_bfreg; + else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) + qp->bf.bfreg = &dev->wc_bfreg; + else + qp->bf.bfreg = &dev->bfreg; + + /* We need to divide by two since each register is comprised of + * two buffers of identical size, namely odd and even + */ + qp->bf.buf_size = (1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size)) / 2; + uar_index = qp->bf.bfreg->index; + + err = calc_sq_size(dev, init_attr, qp); + if (err < 0) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + + err = mlx5_frag_buf_alloc_node(dev->mdev, base->ubuffer.buf_size, + &qp->buf, dev->mdev->priv.numa_node); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (qp->rq.wqe_cnt) + mlx5_init_fbc(qp->buf.frags, qp->rq.wqe_shift, + ilog2(qp->rq.wqe_cnt), &qp->rq.fbc); + + if (qp->sq.wqe_cnt) { + int sq_strides_offset = (qp->sq.offset & (PAGE_SIZE - 1)) / + MLX5_SEND_WQE_BB; + mlx5_init_fbc_offset(qp->buf.frags + + (qp->sq.offset / PAGE_SIZE), + ilog2(MLX5_SEND_WQE_BB), + ilog2(qp->sq.wqe_cnt), + sq_strides_offset, &qp->sq.fbc); + + qp->sq.cur_edge = get_sq_edge(&qp->sq, 0); + } + + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages; + *in = kvzalloc(*inlen, GFP_KERNEL); + if (!*in) { + err = -ENOMEM; + goto err_buf; + } + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + MLX5_SET(qpc, qpc, uar_page, uar_index); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev)); + MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + /* Set "fast registration enabled" for all kernel QPs */ + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); + + if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) + MLX5_SET(qpc, qpc, deth_sqpn, 1); + + if (qp->flags & IB_QP_CREATE_SIGNATURE_PIPELINE) + MLX5_SET(qpc, qpc, drain_sigerr, 1); + + mlx5_fill_page_frag_array(&qp->buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, + *in, pas)); + + err = mlx5_db_alloc(dev->mdev, &qp->db); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + goto err_free; + } + + qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt, + sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kvmalloc_array(qp->sq.wqe_cnt, + sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { + err = -ENOMEM; + goto err_wrid; + } + + return 0; + +err_wrid: + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + 
kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); + mlx5_db_free(dev->mdev, &qp->db); + +err_free: + kvfree(*in); + +err_buf: + mlx5_frag_buf_free(dev->mdev, &qp->buf); + return err; +} + +static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +{ + if (attr->srq || (qp->type == IB_QPT_XRC_TGT) || + (qp->type == MLX5_IB_QPT_DCI) || (qp->type == IB_QPT_XRC_INI)) + return MLX5_SRQ_RQ; + else if (!qp->has_rq) + return MLX5_ZERO_LEN_RQ; + + return MLX5_NON_ZERO_RQ; +} + +static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct mlx5_ib_sq *sq, u32 tdn, + struct ib_pd *pd) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); + MLX5_SET(tisc, tisc, transport_domain, tdn); + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) + MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); + + return mlx5_core_create_tis(dev->mdev, in, &sq->tisn); +} + +static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, struct ib_pd *pd) +{ + mlx5_cmd_destroy_tis(dev->mdev, sq->tisn, to_mpd(pd)->uid); +} + +static void destroy_flow_rule_vport_sq(struct mlx5_ib_sq *sq) +{ + if (sq->flow_rule) + mlx5_del_flow_rules(sq->flow_rule); + sq->flow_rule = NULL; +} + +static bool fr_supported(int ts_cap) +{ + return ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING || + ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME; +} + +static int get_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + bool fr_sup, bool rt_sup) +{ + if (cq->private_flags & MLX5_IB_CQ_PR_FLAGS_REAL_TIME_TS) { + if (!rt_sup) { + mlx5_ib_dbg(dev, + "Real time TS format is not supported\n"); + return -EOPNOTSUPP; + } + return MLX5_TIMESTAMP_FORMAT_REAL_TIME; + } + if (cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) { + if (!fr_sup) { + mlx5_ib_dbg(dev, + "Free running TS format is not supported\n"); + return -EOPNOTSUPP; + } + return MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; + } + return fr_sup ? MLX5_TIMESTAMP_FORMAT_FREE_RUNNING : + MLX5_TIMESTAMP_FORMAT_DEFAULT; +} + +static int get_rq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *recv_cq) +{ + u8 ts_cap = MLX5_CAP_GEN(dev->mdev, rq_ts_format); + + return get_ts_format(dev, recv_cq, fr_supported(ts_cap), + rt_supported(ts_cap)); +} + +static int get_sq_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq) +{ + u8 ts_cap = MLX5_CAP_GEN(dev->mdev, sq_ts_format); + + return get_ts_format(dev, send_cq, fr_supported(ts_cap), + rt_supported(ts_cap)); +} + +static int get_qp_ts_format(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *send_cq, + struct mlx5_ib_cq *recv_cq) +{ + u8 ts_cap = MLX5_CAP_ROCE(dev->mdev, qp_ts_format); + bool fr_sup = fr_supported(ts_cap); + bool rt_sup = rt_supported(ts_cap); + u8 default_ts = fr_sup ? MLX5_TIMESTAMP_FORMAT_FREE_RUNNING : + MLX5_TIMESTAMP_FORMAT_DEFAULT; + int send_ts_format = + send_cq ? get_ts_format(dev, send_cq, fr_sup, rt_sup) : + default_ts; + int recv_ts_format = + recv_cq ? get_ts_format(dev, recv_cq, fr_sup, rt_sup) : + default_ts; + + if (send_ts_format < 0 || recv_ts_format < 0) + return -EOPNOTSUPP; + + if (send_ts_format != MLX5_TIMESTAMP_FORMAT_DEFAULT && + recv_ts_format != MLX5_TIMESTAMP_FORMAT_DEFAULT && + send_ts_format != recv_ts_format) { + mlx5_ib_dbg( + dev, + "The send ts_format does not match the receive ts_format\n"); + return -EOPNOTSUPP; + } + + return send_ts_format == default_ts ? 
recv_ts_format : send_ts_format; +} + +static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct ib_udata *udata, + struct mlx5_ib_sq *sq, void *qpin, + struct ib_pd *pd, struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; + int err; + unsigned int page_offset_quantized; + unsigned long page_size; + int ts_format; + + ts_format = get_sq_ts_format(dev, cq); + if (ts_format < 0) + return ts_format; + + sq->ubuffer.umem = ib_umem_get_peer(&dev->ib_dev, ubuffer->buf_addr, + ubuffer->buf_size, 0, 0); + if (IS_ERR(sq->ubuffer.umem)) + return PTR_ERR(sq->ubuffer.umem); + page_size = mlx5_umem_find_best_quantized_pgoff( + ubuffer->umem, wq, log_wq_pg_sz, MLX5_ADAPTER_PAGE_SHIFT, + page_offset, 64, &page_offset_quantized); + if (!page_size) { + err = -EINVAL; + goto err_umem; + } + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + + sizeof(u64) * + ib_umem_num_dma_blocks(sq->ubuffer.umem, page_size); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_umem; + } + + MLX5_SET(create_sq_in, in, uid, to_mpd(pd)->uid); + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) + MLX5_SET(sqc, sqc, allow_multi_pkt_send_wqe, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, ts_format, ts_format); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, swp)) + MLX5_SET(sqc, sqc, allow_swp, 1); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, + order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, page_offset, page_offset_quantized); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(sq->ubuffer.umem, page_size, pas, 0); + + err = mlx5_core_create_sq_tracked(dev, in, inlen, &sq->base.mqp); + + kvfree(in); + + if (err) + goto err_umem; + + return 0; + +err_umem: + ib_umem_release(sq->ubuffer.umem); + sq->ubuffer.umem = NULL; + + return err; +} + +static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + destroy_flow_rule_vport_sq(sq); + mlx5_core_destroy_sq_tracked(dev, &sq->base.mqp); + ib_umem_release(sq->ubuffer.umem); +} + +static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, void *qpin, + struct ib_pd *pd, struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_qp *mqp = rq->base.container_mibqp; + __be64 *pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + struct ib_umem *umem = rq->base.ubuffer.umem; + unsigned int page_offset_quantized; + unsigned long page_size = 0; + int ts_format; + size_t inlen; + int err; + + ts_format = get_rq_ts_format(dev, cq); + if (ts_format < 0) + return ts_format; + + page_size = mlx5_umem_find_best_quantized_pgoff(umem, wq, log_wq_pg_sz, + MLX5_ADAPTER_PAGE_SHIFT, + page_offset, 64, + 
&page_offset_quantized); + if (!page_size) + return -EINVAL; + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + + sizeof(u64) * ib_umem_num_dma_blocks(umem, page_size); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid); + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING)) + MLX5_SET(rqc, rqc, vsd, 1); + MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, ts_format, ts_format); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + if (mqp->flags & IB_QP_CREATE_SCATTER_FCS) + MLX5_SET(rqc, rqc, scatter_fcs, 1); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + if (rq->flags & MLX5_IB_RQ_PCI_WRITE_END_PADDING) + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + MLX5_SET(wq, wq, page_offset, page_offset_quantized); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, + order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(umem, page_size, pas, 0); + + err = mlx5_core_create_rq_tracked(dev, in, inlen, &rq->base.mqp); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_rq_tracked(dev, &rq->base.mqp); +} + +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u32 qp_flags_en, + struct ib_pd *pd) +{ + if (qp_flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) + mlx5_ib_disable_lb(dev, false, true); + mlx5_cmd_destroy_tir(dev->mdev, rq->tirn, to_mpd(pd)->uid); +} + +static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, u32 tdn, + u32 *qp_flags_en, struct ib_pd *pd, + u32 *out) +{ + u8 lb_flag = 0; + u32 *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid); + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + if (*qp_flags_en & MLX5_QP_FLAG_TUNNEL_OFFLOADS) + MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + + if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + + if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + + if (dev->is_rep) { + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + *qp_flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + + MLX5_SET(tirc, tirc, self_lb_block, lb_flag); + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out); + rq->tirn = MLX5_GET(create_tir_out, out, tirn); + if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { + err = mlx5_ib_enable_lb(dev, false, true); + + if (err) + destroy_raw_packet_qp_tir(dev, rq, 0, pd); + } + kvfree(in); + + 
return err; +} + +static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u32 *in, size_t inlen, struct ib_pd *pd, + struct ib_udata *udata, + struct mlx5_ib_create_qp_resp *resp, + struct ib_qp_init_attr *init_attr) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + int err; + u32 tdn = mucontext->tdn; + u16 uid = to_mpd(pd)->uid; + u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {}; + + if (!qp->sq.wqe_cnt && !qp->rq.wqe_cnt) + return -EINVAL; + if (qp->sq.wqe_cnt) { + err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd); + if (err) + return err; + + err = create_raw_packet_qp_sq(dev, udata, sq, in, pd, + to_mcq(init_attr->send_cq)); + if (err) + goto err_destroy_tis; + + if (uid) { + resp->tisn = sq->tisn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TISN; + resp->sqn = sq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_SQN; + } + + sq->base.container_mibqp = qp; + sq->base.mqp.event = mlx5_ib_qp_event; + } + + if (qp->rq.wqe_cnt) { + rq->base.container_mibqp = qp; + + if (qp->flags & IB_QP_CREATE_CVLAN_STRIPPING) + rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING; + if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) + rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING; + err = create_raw_packet_qp_rq(dev, rq, in, pd, + to_mcq(init_attr->recv_cq)); + if (err) + goto err_destroy_sq; + + err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd, + out); + if (err) + goto err_destroy_rq; + + if (uid) { + resp->rqn = rq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_RQN; + resp->tirn = rq->tirn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) || + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner_v2)) { + resp->tir_icm_addr = MLX5_GET( + create_tir_out, out, icm_address_31_0); + resp->tir_icm_addr |= + (u64)MLX5_GET(create_tir_out, out, + icm_address_39_32) + << 32; + resp->tir_icm_addr |= + (u64)MLX5_GET(create_tir_out, out, + icm_address_63_40) + << 40; + resp->comp_mask |= + MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR; + } + } + } + + qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? 
sq->base.mqp.qpn : + rq->base.mqp.qpn; + return 0; + +err_destroy_rq: + destroy_raw_packet_qp_rq(dev, rq); +err_destroy_sq: + if (!qp->sq.wqe_cnt) + return err; + destroy_raw_packet_qp_sq(dev, sq); +err_destroy_tis: + destroy_raw_packet_qp_tis(dev, sq, pd); + + return err; +} + +static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + if (qp->rq.wqe_cnt) { + destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, qp->ibqp.pd); + destroy_raw_packet_qp_rq(dev, rq); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_packet_qp_sq(dev, sq); + destroy_raw_packet_qp_tis(dev, sq, qp->ibqp.pd); + } +} + +static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, + struct mlx5_ib_raw_packet_qp *raw_packet_qp) +{ + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + sq->sq = &qp->sq; + rq->rq = &qp->rq; + sq->doorbell = &qp->db; + rq->doorbell = &qp->db; +} + +static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + if (qp->flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) + mlx5_ib_disable_lb(dev, false, true); + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, + to_mpd(qp->ibqp.pd)->uid); +} + +struct mlx5_create_qp_params { + struct ib_udata *udata; + size_t inlen; + size_t outlen; + size_t ucmd_size; + void *ucmd; + u8 is_rss_raw : 1; + struct ib_qp_init_attr *attr; + u32 uidx; + struct mlx5_ib_create_qp_resp resp; +}; + +static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *init_attr = params->attr; + struct mlx5_ib_create_qp_rss *ucmd = params->ucmd; + struct ib_udata *udata = params->udata; + struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + int inlen; + int outlen; + int err; + u32 *in; + u32 *out; + void *tirc; + void *hfso; + u32 selected_fields = 0; + u32 outer_l4; + u32 tdn = mucontext->tdn; + u8 lb_flag = 0; + + if (ucmd->comp_mask) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } + + if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_INNER && + !(ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) { + mlx5_ib_dbg(dev, "Tunnel offloads must be set for inner RSS\n"); + return -EOPNOTSUPP; + } + + if (dev->is_rep) + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + + if (qp->flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + + if (qp->flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + outlen = MLX5_ST_SZ_BYTES(create_tir_out); + in = kvzalloc(inlen + outlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + out = in + MLX5_ST_SZ_DW(create_tir_in); + MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid); + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, + init_attr->rwq_ind_tbl->ind_tbl_num); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + + if (ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) + MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + + MLX5_SET(tirc, tirc, self_lb_block, lb_flag); + + if (ucmd->rx_hash_fields_mask 
& MLX5_RX_HASH_INNER) + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); + else + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + + switch (ucmd->rx_hash_function) { + case MLX5_RX_HASH_FUNC_TOEPLITZ: + { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); + + if (len != ucmd->rx_key_len) { + err = -EINVAL; + goto err; + } + + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ); + memcpy(rss_key, ucmd->rx_hash_key, len); + break; + } + default: + err = -EOPNOTSUPP; + goto err; + } + + if (!ucmd->rx_hash_fields_mask) { + /* special case when this TIR serves as steering entry without hashing */ + if (!init_attr->rwq_ind_tbl->log_ind_tbl_size) + goto create_tir; + err = -EINVAL; + goto err; + } + + if (((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && + ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { + err = -EINVAL; + goto err; + } + + /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + else if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + + outer_l4 = ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + << 0 | + ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + << 1 | + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2; + + /* Check that only one l4 protocol is set */ + if (outer_l4 & (outer_l4 - 1)) { + err = -EINVAL; + goto err; + } + + /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + else if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; + + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; + + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; + + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; + + if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) + selected_fields |= MLX5_HASH_FIELD_SEL_IPSEC_SPI; + + MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); + +create_tir: + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out); + + qp->rss_qp.tirn = 
MLX5_GET(create_tir_out, out, tirn); + if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { + err = mlx5_ib_enable_lb(dev, false, true); + + if (err) + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, + to_mpd(pd)->uid); + } + + if (err) + goto err; + + if (mucontext->devx_uid) { + params->resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + params->resp.tirn = qp->rss_qp.tirn; + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) || + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner_v2)) { + params->resp.tir_icm_addr = + MLX5_GET(create_tir_out, out, icm_address_31_0); + params->resp.tir_icm_addr |= + (u64)MLX5_GET(create_tir_out, out, + icm_address_39_32) + << 32; + params->resp.tir_icm_addr |= + (u64)MLX5_GET(create_tir_out, out, + icm_address_63_40) + << 40; + params->resp.comp_mask |= + MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR; + } + } + + kvfree(in); + /* qpn is reserved for that QP */ + qp->trans_qp.base.mqp.qpn = 0; + qp->is_rss = true; + return 0; + +err: + kvfree(in); + return err; +} + +static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *init_attr, + void *qpc) +{ + int scqe_sz; + bool allow_scat_cqe = false; + + allow_scat_cqe = qp->flags_en & MLX5_QP_FLAG_ALLOW_SCATTER_CQE; + + if (!allow_scat_cqe && init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) + return; + + scqe_sz = mlx5_ib_get_cqe_size(init_attr->send_cq); + if (scqe_sz == 128) { + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); + return; + } + + if (init_attr->qp_type != MLX5_IB_QPT_DCI || + MLX5_CAP_GEN(dev->mdev, dc_req_scat_data_cqe)) + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); +} + +static int atomic_size_to_mode(int size_mask) +{ + /* driver does not support atomic_size > 256B + * and does not know how to translate bigger sizes + */ + int supported_size_mask = size_mask & 0x1ff; + int log_max_size; + + if (!supported_size_mask) + return -EOPNOTSUPP; + + log_max_size = __fls(supported_size_mask); + + if (log_max_size > 3) + return log_max_size; + + return MLX5_ATOMIC_MODE_8B; +} + +static int get_atomic_mode(struct mlx5_ib_dev *dev, + enum ib_qp_type qp_type) +{ + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); + int atomic_mode = -EOPNOTSUPP; + int atomic_size_mask; + + if (!atomic) + return -EOPNOTSUPP; + + if (qp_type == MLX5_IB_QPT_DCT) + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + else + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + if ((atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP) || + (atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD)) + atomic_mode = atomic_size_to_mode(atomic_size_mask); + + if (atomic_mode <= 0 && + (atomic_operations & MLX5_ATOMIC_OPS_CMP_SWAP && + atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) + atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + + return atomic_mode; +} + +static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *attr = params->attr; + u32 uidx = params->uidx; + struct mlx5_ib_resources *devr = &dev->devr; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp_base *base; + unsigned long flags; + void *qpc; + u32 *in; + int err; + + if (attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + qpc = 
MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_XRC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, to_mpd(devr->p0)->pdn); + + if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + MLX5_SET(qpc, qpc, block_lb_mc, 1); + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) + MLX5_SET(qpc, qpc, cd_master, 1); + if (qp->flags & IB_QP_CREATE_MANAGED_SEND) + MLX5_SET(qpc, qpc, cd_slave_send, 1); + if (qp->flags & IB_QP_CREATE_MANAGED_RECV) + MLX5_SET(qpc, qpc, cd_slave_receive, 1); + + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev)); + MLX5_SET(qpc, qpc, rq_type, MLX5_SRQ_RQ); + MLX5_SET(qpc, qpc, no_sq, 1); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(attr->xrcd)->xrcdn); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) { + MLX5_SET(qpc, qpc, end_padding_mode, + MLX5_WQ_END_PAD_MODE_ALIGN); + /* Special case to clean flag */ + qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING; + } + + base = &qp->trans_qp.base; + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); + kvfree(in); + if (err) + return err; + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + if (MLX5_CAP_GEN(mdev, ece_support)) + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + list_add_tail(&qp->qps_list, &dev->qp_list); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + qp->trans_qp.xrcdn = to_mxrcd(attr->xrcd)->xrcdn; + return 0; +} + +static int create_dci(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *init_attr = params->attr; + struct mlx5_ib_create_qp *ucmd = params->ucmd; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + struct ib_udata *udata = params->udata; + u32 uidx = params->uidx; + struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + struct mlx5_ib_qp_base *base; + int ts_format; + int mlx5_st; + void *qpc; + u32 *in; + int err; + + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + mlx5_st = to_mlx5_st(qp->type); + if (mlx5_st < 0) + return -EINVAL; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + base = &qp->trans_qp.base; + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, qp, ucmd); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (ucmd->rq_wqe_shift != qp->rq.wqe_shift || + ucmd->rq_wqe_count != qp->rq.wqe_cnt) + return -EINVAL; + + if (ucmd->sq_wqe_count > (1 << MLX5_CAP_GEN(mdev, log_max_qp_sz))) + return -EINVAL; + + ts_format = get_qp_ts_format(dev, to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + + if (ts_format < 0) + return ts_format; + + err = _create_user_qp(dev, pd, qp, udata, init_attr, &in, &params->resp, + &inlen, base, ucmd); + if (err) + return err; + + if (MLX5_CAP_GEN(mdev, ece_support)) + MLX5_SET(create_qp_in, in, ece,
ucmd->ece_options); + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, mlx5_st); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, to_mpd(pd)->pdn); + + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) + MLX5_SET(qpc, qpc, wq_signature, 1); + + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) + MLX5_SET(qpc, qpc, cd_master, 1); + if (qp->flags & IB_QP_CREATE_MANAGED_SEND) + MLX5_SET(qpc, qpc, cd_slave_send, 1); + if (qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) + configure_requester_scat_cqe(dev, qp, init_attr, qpc); + + if (qp->rq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); + } + + if (qp->flags_en & MLX5_QP_FLAG_DCI_STREAM) { + MLX5_SET(qpc, qpc, log_num_dci_stream_channels, + ucmd->dci_streams.log_num_concurent); + MLX5_SET(qpc, qpc, log_num_dci_errored_streams, + ucmd->dci_streams.log_num_errored); + } + + MLX5_SET(qpc, qpc, ts_format, ts_format); + MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr)); + + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); + + /* Set default resources */ + if (init_attr->srq) { + MLX5_SET(qpc, qpc, xrcd, devr->xrcdn0); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, + to_msrq(init_attr->srq)->msrq.srqn); + } else { + MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, + to_msrq(devr->s1)->msrq.srqn); + } + + if (init_attr->send_cq) + MLX5_SET(qpc, qpc, cqn_snd, + to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, + to_mcq(init_attr->recv_cq)->mcq.cqn); + + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) { + MLX5_SET(qpc, qpc, end_padding_mode, + MLX5_WQ_END_PAD_MODE_ALIGN); + /* Special case to clean flag */ + qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING; + } + + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); + + kvfree(in); + if (err) + goto err_create; + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + if (MLX5_CAP_GEN(mdev, ece_support)) + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); + + get_cqs(qp->type, init_attr->send_cq, init_attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + destroy_qp(dev, qp, base, udata); + return err; +} + +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *init_attr = params->attr; + struct mlx5_ib_create_qp *ucmd = params->ucmd; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + struct ib_udata *udata = params->udata; + u32 uidx = params->uidx; + struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_cq 
*send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + struct mlx5_ib_qp_base *base; + int ts_format; + int mlx5_st; + void *qpc; + u32 *in; + int err; + + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + mlx5_st = to_mlx5_st(qp->type); + if (mlx5_st < 0) + return -EINVAL; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) + qp->underlay_qpn = init_attr->source_qpn; + + base = (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & IB_QP_CREATE_SOURCE_QPN) ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + + qp->has_rq = qp_has_rq(init_attr); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, qp, ucmd); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + if (ucmd->rq_wqe_shift != qp->rq.wqe_shift || + ucmd->rq_wqe_count != qp->rq.wqe_cnt) + return -EINVAL; + + if (ucmd->sq_wqe_count > (1 << MLX5_CAP_GEN(mdev, log_max_qp_sz))) + return -EINVAL; + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + ts_format = get_qp_ts_format(dev, to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + if (ts_format < 0) + return ts_format; + } + + err = _create_user_qp(dev, pd, qp, udata, init_attr, &in, &params->resp, + &inlen, base, ucmd); + if (err) + return err; + + if (is_sqp(init_attr->qp_type)) + qp->port = init_attr->port_num; + + if (MLX5_CAP_GEN(mdev, ece_support)) + MLX5_SET(create_qp_in, in, ece, ucmd->ece_options); + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, mlx5_st); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, to_mpd(pd)->pdn); + + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) + MLX5_SET(qpc, qpc, wq_signature, 1); + + if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + MLX5_SET(qpc, qpc, block_lb_mc, 1); + + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) + MLX5_SET(qpc, qpc, cd_master, 1); + if (qp->flags & IB_QP_CREATE_MANAGED_SEND) + MLX5_SET(qpc, qpc, cd_slave_send, 1); + if (qp->flags & IB_QP_CREATE_MANAGED_RECV) + MLX5_SET(qpc, qpc, cd_slave_receive, 1); + if (qp->flags_en & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) + MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1); + if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) && + (init_attr->qp_type == IB_QPT_RC || + init_attr->qp_type == IB_QPT_UC)) { + int rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); + + MLX5_SET(qpc, qpc, cs_res, + rcqe_sz == 128 ?
MLX5_RES_SCAT_DATA64_CQE : + MLX5_RES_SCAT_DATA32_CQE); + } + if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) && + (qp->type == MLX5_IB_QPT_DCI || qp->type == IB_QPT_RC)) + configure_requester_scat_cqe(dev, qp, init_attr, qpc); + + if (qp->rq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); + } + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + MLX5_SET(qpc, qpc, ts_format, ts_format); + + qp->rq_type = get_rx_type(qp, init_attr); + MLX5_SET(qpc, qpc, rq_type, qp->rq_type); + + if (qp->sq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); + } else { + MLX5_SET(qpc, qpc, no_sq, 1); + if (init_attr->srq && + init_attr->srq->srq_type == IB_SRQT_TM) + MLX5_SET(qpc, qpc, offload_type, + MLX5_QPC_OFFLOAD_TYPE_RNDV); + } + + /* Set default resources */ + switch (init_attr->qp_type) { + case IB_QPT_XRC_INI: + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + break; + default: + if (init_attr->srq) { + MLX5_SET(qpc, qpc, xrcd, devr->xrcdn0); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(init_attr->srq)->msrq.srqn); + } else { + MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s1)->msrq.srqn); + } + } + + if (init_attr->send_cq) + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(init_attr->send_cq)->mcq.cqn); + + if (init_attr->recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(init_attr->recv_cq)->mcq.cqn); + + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING && + init_attr->qp_type != IB_QPT_RAW_PACKET) { + MLX5_SET(qpc, qpc, end_padding_mode, + MLX5_WQ_END_PAD_MODE_ALIGN); + /* Special case to clean flag */ + qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING; + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET || + qp->flags & IB_QP_CREATE_SOURCE_QPN) { + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd->sq_buf_addr; + raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); + err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, + &params->resp, init_attr); + } else + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); + + kvfree(in); + if (err) + goto err_create; + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + if (MLX5_CAP_GEN(mdev, ece_support)) + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); + + get_cqs(qp->type, init_attr->send_cq, init_attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + destroy_qp(dev, qp, base, udata); + return err; +} + +static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *attr = params->attr; + u32 uidx =
params->uidx; + struct mlx5_ib_resources *devr = &dev->devr; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + struct mlx5_ib_qp_base *base; + int mlx5_st; + void *qpc; + u32 *in = NULL; + int err; + + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + mlx5_st = to_mlx5_st(qp->type); + if (mlx5_st < 0) + return -EINVAL; + + if (attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + base = &qp->trans_qp.base; + + qp->has_rq = qp_has_rq(attr); + err = set_rq_size(dev, &attr->cap, qp->has_rq, qp, NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + err = _create_kernel_qp(dev, attr, qp, &in, &inlen, base); + if (err) + return err; + + if (is_sqp(attr->qp_type)) + qp->port = attr->port_num; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, mlx5_st); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + + if (attr->qp_type != MLX5_IB_QPT_REG_UMR) + MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn); + else + MLX5_SET(qpc, qpc, latency_sensitive, 1); + + + if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + MLX5_SET(qpc, qpc, block_lb_mc, 1); + + if (qp->rq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); + } + + qp->rq_type = get_rx_type(qp, attr); + MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, attr)); + + if (qp->sq.wqe_cnt) + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); + else + MLX5_SET(qpc, qpc, no_sq, 1); + + if (attr->srq) { + MLX5_SET(qpc, qpc, xrcd, devr->xrcdn0); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, + to_msrq(attr->srq)->msrq.srqn); + } else { + MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, + to_msrq(devr->s1)->msrq.srqn); + } + + if (attr->send_cq) + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(attr->send_cq)->mcq.cqn); + + if (attr->recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(attr->recv_cq)->mcq.cqn); + + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); + kvfree(in); + if (err) + goto err_create; + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + + get_cqs(qp->type, attr->send_cq, attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + destroy_qp(dev, qp, base, NULL); + return err; +} + +static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __acquires(&send_cq->lock) __acquires(&recv_cq->lock) +{ + if 
(send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, + SINGLE_DEPTH_NESTING); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } else { + spin_lock(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, + SINGLE_DEPTH_NESTING); + } + } else { + spin_lock(&send_cq->lock); + __acquire(&recv_cq->lock); + } + } else if (recv_cq) { + spin_lock(&recv_cq->lock); + __acquire(&send_cq->lock); + } else { + __acquire(&send_cq->lock); + __acquire(&recv_cq->lock); + } +} + +static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) + __releases(&send_cq->lock) __releases(&recv_cq->lock) +{ + if (send_cq) { + if (recv_cq) { + if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + spin_unlock(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } + } else { + __release(&recv_cq->lock); + spin_unlock(&send_cq->lock); + } + } else if (recv_cq) { + __release(&send_cq->lock); + spin_unlock(&recv_cq->lock); + } else { + __release(&recv_cq->lock); + __release(&send_cq->lock); + } +} + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case MLX5_IB_QPT_SW_CNAK: + case IB_QPT_RAW_PACKET: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL; + break; + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 lag_tx_affinity); + +static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_udata *udata) +{ + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_ib_qp_base *base; + unsigned long flags; + int err; + + if (qp->is_rss) { + destroy_rss_raw_qp_tir(dev, qp); + return; + } + + base = (qp->type == IB_QPT_RAW_PACKET || + qp->flags & IB_QP_CREATE_SOURCE_QPN) ? 
+ &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + + if (qp->state != IB_QPS_RESET) { + if (qp->type != IB_QPT_RAW_PACKET && + !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) { + err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, + NULL, &base->mqp, NULL); + } else { + struct mlx5_modify_raw_qp_param raw_qp_param = { + .operation = MLX5_CMD_OP_2RST_QP + }; + + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0); + } + if (err) + mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n", + base->mqp.qpn); + } + + get_cqs(qp->type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, + &recv_cq); + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + if (send_cq) + list_del(&qp->cq_send_list); + + if (recv_cq) + list_del(&qp->cq_recv_list); + + if (!udata) { + __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, + qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); + if (send_cq != recv_cq) + __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, + NULL); + } + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (qp->type == IB_QPT_RAW_PACKET || + qp->flags & IB_QP_CREATE_SOURCE_QPN) { + destroy_raw_packet_qp(dev, qp); + } else { + err = mlx5_core_destroy_qp(dev, &base->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + base->mqp.qpn); + } + + destroy_qp(dev, qp, base, udata); +} + +static int create_dct(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *attr = params->attr; + struct mlx5_ib_create_qp *ucmd = params->ucmd; + u32 uidx = params->uidx; + void *dctc; + + if (mlx5_lag_is_active(dev->mdev) && !MLX5_CAP_GEN(dev->mdev, lag_dct)) + return -EOPNOTSUPP; + + qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); + if (!qp->dct.in) + return -ENOMEM; + + MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid); + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); + MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn); + MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); + MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); + MLX5_SET(dctc, dctc, user_index, uidx); + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + MLX5_SET(dctc, dctc, ece, ucmd->ece_options); + + if (qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) { + int rcqe_sz = mlx5_ib_get_cqe_size(attr->recv_cq); + + if (rcqe_sz == 128) + MLX5_SET(dctc, dctc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + } + + qp->state = IB_QPS_RESET; + return 0; +} + +static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + enum ib_qp_type *type) +{ + if (attr->qp_type == IB_QPT_DRIVER && !MLX5_CAP_GEN(dev->mdev, dct)) + goto out; + + switch (attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + goto out; + fallthrough; + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case IB_QPT_DRIVER: + case IB_QPT_GSI: + case IB_QPT_RAW_PACKET: + case IB_QPT_UD: + case MLX5_IB_QPT_REG_UMR: + case MLX5_IB_QPT_SW_CNAK: + break; + default: + goto out; + } + + *type = attr->qp_type; + return 0; + +out: + mlx5_ib_dbg(dev, "Unsupported QP type %d\n", attr->qp_type); + return -EOPNOTSUPP; +} + +static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct 
ib_udata *udata) +{ + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + if (!udata) { + /* Kernel create_qp callers */ + if (attr->rwq_ind_tbl) + return -EOPNOTSUPP; + + switch (attr->qp_type) { + case IB_QPT_RAW_PACKET: + case IB_QPT_DRIVER: + return -EOPNOTSUPP; + default: + return 0; + } + } + + /* Userspace create_qp callers */ + if (attr->qp_type == IB_QPT_RAW_PACKET && !ucontext->cqe_version) { + mlx5_ib_dbg(dev, + "Raw Packet QP is only supported for CQE version > 0\n"); + return -EINVAL; + } + + if (attr->qp_type != IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) { + mlx5_ib_dbg(dev, + "Wrong QP type %d for the RWQ indirect table\n", + attr->qp_type); + return -EINVAL; + } + + /* + * We don't need to see this warning, it means that kernel code + * missing ib_pd. Placed here to catch developer's mistakes. + */ + WARN_ONCE(!pd && attr->qp_type != IB_QPT_XRC_TGT, + "There is a missing PD pointer assignment\n"); + return 0; +} + +static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, + bool cond, struct mlx5_ib_qp *qp) +{ + if (!(*flags & flag)) + return; + + if (cond) { + qp->flags_en |= flag; + *flags &= ~flag; + return; + } + + switch (flag) { + case MLX5_QP_FLAG_SCATTER_CQE: + case MLX5_QP_FLAG_ALLOW_SCATTER_CQE: + /* + * We don't return error if these flags were provided, + * and mlx5 doesn't have right capability. + */ + *flags &= ~(MLX5_QP_FLAG_SCATTER_CQE | + MLX5_QP_FLAG_ALLOW_SCATTER_CQE); + return; + default: + break; + } + mlx5_ib_dbg(dev, "Vendor create QP flag 0x%X is not supported\n", flag); +} + +static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + void *ucmd, struct ib_qp_init_attr *attr) +{ + struct mlx5_core_dev *mdev = dev->mdev; + bool cond; + int flags; + + if (attr->rwq_ind_tbl) + flags = ((struct mlx5_ib_create_qp_rss *)ucmd)->flags; + else + flags = ((struct mlx5_ib_create_qp *)ucmd)->flags; + + switch (flags & (MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI)) { + case MLX5_QP_FLAG_TYPE_DCI: + qp->type = MLX5_IB_QPT_DCI; + break; + case MLX5_QP_FLAG_TYPE_DCT: + qp->type = MLX5_IB_QPT_DCT; + break; + default: + if (qp->type != IB_QPT_DRIVER) + break; + /* + * It is IB_QPT_DRIVER and or no subtype or + * wrong subtype were provided. 
+ */ + return -EINVAL; + } + + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCI, true, qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCT, true, qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_DCI_STREAM, + MLX5_CAP_GEN(mdev, log_max_dci_stream_channels), + qp); + + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SIGNATURE, true, qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SCATTER_CQE, + MLX5_CAP_GEN(mdev, sctr_data_cqe), qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_ALLOW_SCATTER_CQE, + MLX5_CAP_GEN(mdev, sctr_data_cqe), qp); + + if (qp->type == IB_QPT_RAW_PACKET) { + cond = MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || + MLX5_CAP_ETH(mdev, tunnel_stateless_gre) || + MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TUNNEL_OFFLOADS, + cond, qp); + process_vendor_flag(dev, &flags, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC, true, + qp); + process_vendor_flag(dev, &flags, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC, true, + qp); + } + + if (qp->type == IB_QPT_RC) + process_vendor_flag(dev, &flags, + MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE, + MLX5_CAP_GEN(mdev, qp_packet_based), qp); + + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_BFREG_INDEX, true, qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_UAR_PAGE_INDEX, true, qp); + + cond = qp->flags_en & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC); + if (attr->rwq_ind_tbl && cond) { + mlx5_ib_dbg(dev, "RSS RAW QP has unsupported flags 0x%X\n", + cond); + return -EINVAL; + } + + if (flags) + mlx5_ib_dbg(dev, "udata has unsupported flags 0x%X\n", flags); + + return (flags) ? -EINVAL : 0; + } + +static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, + bool cond, struct mlx5_ib_qp *qp) +{ + if (!(*flags & flag)) + return; + + if (cond) { + qp->flags |= flag; + *flags &= ~flag; + return; + } + + if (flag == MLX5_IB_QP_CREATE_WC_TEST) { + /* + * Special case, if condition didn't meet, it won't be error, + * just different in-kernel flow. + */ + *flags &= ~MLX5_IB_QP_CREATE_WC_TEST; + return; + } + mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag); +} + +static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr) +{ + enum ib_qp_type qp_type = qp->type; + struct mlx5_core_dev *mdev = dev->mdev; + int create_flags = attr->create_flags; + bool cond; + + if (qp_type == MLX5_IB_QPT_DCT) + return (create_flags) ? -EINVAL : 0; + + if (qp_type == IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) + return (create_flags) ? 
-EINVAL : 0; + + process_create_flag(dev, &create_flags, IB_QP_CREATE_NETIF_QP, + mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_BYPASS), + qp); + process_create_flag(dev, &create_flags, + IB_QP_CREATE_INTEGRITY_EN, + MLX5_CAP_GEN(mdev, sho), qp); + process_create_flag(dev, &create_flags, + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_CAP_GEN(mdev, block_lb_mc), qp); + process_create_flag(dev, &create_flags, IB_QP_CREATE_CROSS_CHANNEL, + MLX5_CAP_GEN(mdev, cd), qp); + process_create_flag(dev, &create_flags, IB_QP_CREATE_MANAGED_SEND, + MLX5_CAP_GEN(mdev, cd), qp); + process_create_flag(dev, &create_flags, IB_QP_CREATE_MANAGED_RECV, + MLX5_CAP_GEN(mdev, cd), qp); + + if (qp_type == IB_QPT_UD) { + process_create_flag(dev, &create_flags, + IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_CAP_GEN(mdev, ipoib_basic_offloads), + qp); + cond = MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_IB; + process_create_flag(dev, &create_flags, IB_QP_CREATE_SOURCE_QPN, + cond, qp); + } + + if (qp_type == IB_QPT_RAW_PACKET) { + cond = MLX5_CAP_GEN(mdev, eth_net_offloads) && + MLX5_CAP_ETH(mdev, scatter_fcs); + process_create_flag(dev, &create_flags, + IB_QP_CREATE_SCATTER_FCS, cond, qp); + + cond = MLX5_CAP_GEN(mdev, eth_net_offloads) && + MLX5_CAP_ETH(mdev, vlan_cap); + process_create_flag(dev, &create_flags, + IB_QP_CREATE_CVLAN_STRIPPING, cond, qp); + } + + process_create_flag(dev, &create_flags, + IB_QP_CREATE_PCI_WRITE_END_PADDING, + MLX5_CAP_GEN(mdev, end_pad), qp); + + process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST, + qp_type != MLX5_IB_QPT_REG_UMR, qp); + process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1, + true, qp); + process_create_flag(dev, &create_flags, IB_QP_CREATE_SIGNATURE_PIPELINE, + MLX5_CAP_GEN(mdev, drain_sigerr), qp); + + if (create_flags) { + mlx5_ib_dbg(dev, "Create QP has unsupported flags 0x%X\n", + create_flags); + return -EOPNOTSUPP; + } + return 0; +} + +static int process_udata_size(struct mlx5_ib_dev *dev, + struct mlx5_create_qp_params *params) +{ + size_t ucmd = sizeof(struct mlx5_ib_create_qp); + struct ib_udata *udata = params->udata; + size_t outlen = udata->outlen; + size_t inlen = udata->inlen; + + params->outlen = min(outlen, sizeof(struct mlx5_ib_create_qp_resp)); + params->ucmd_size = ucmd; + if (!params->is_rss_raw) { + /* User has old rdma-core, which doesn't support ECE */ + size_t min_inlen = + offsetof(struct mlx5_ib_create_qp, ece_options); + + /* + * We will check in check_ucmd_data() that user + * cleared everything after inlen. + */ + params->inlen = (inlen < min_inlen) ? 0 : min(inlen, ucmd); + goto out; + } + + /* RSS RAW QP */ + if (inlen < offsetofend(struct mlx5_ib_create_qp_rss, flags)) + return -EINVAL; + + if (outlen < offsetofend(struct mlx5_ib_create_qp_resp, bfreg_index)) + return -EINVAL; + + ucmd = sizeof(struct mlx5_ib_create_qp_rss); + params->ucmd_size = ucmd; + if (inlen > ucmd && !ib_is_udata_cleared(udata, ucmd, inlen - ucmd)) + return -EINVAL; + + params->inlen = min(ucmd, inlen); +out: + if (!params->inlen) + mlx5_ib_dbg(dev, "udata is too small\n"); + + return (params->inlen) ? 
0 : -EINVAL; +} + +static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + int err; + + if (params->is_rss_raw) { + err = create_rss_raw_qp_tir(dev, pd, qp, params); + goto out; + } + + switch (qp->type) { + case MLX5_IB_QPT_DCT: + err = create_dct(dev, pd, qp, params); + rdma_restrack_no_track(&qp->ibqp.res); + break; + case MLX5_IB_QPT_DCI: + err = create_dci(dev, pd, qp, params); + break; + case IB_QPT_XRC_TGT: + err = create_xrc_tgt_qp(dev, qp, params); + break; + case IB_QPT_GSI: + err = mlx5_ib_create_gsi(pd, qp, params->attr); + break; + case MLX5_IB_QPT_HW_GSI: + case MLX5_IB_QPT_REG_UMR: + rdma_restrack_no_track(&qp->ibqp.res); + fallthrough; + default: + if (params->udata) + err = create_user_qp(dev, pd, qp, params); + else + err = create_kernel_qp(dev, pd, qp, params); + } + +out: + if (err) { + mlx5_ib_err(dev, "Create QP type %d failed\n", qp->type); + return err; + } + + if (is_qp0(qp->type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(qp->type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; + + mlx5_ib_dbg(dev, + "QP type %d, ib qpn 0x%X, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x, ece 0x%x\n", + qp->type, qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + params->attr->recv_cq ? to_mcq(params->attr->recv_cq)->mcq.cqn : + -1, + params->attr->send_cq ? to_mcq(params->attr->send_cq)->mcq.cqn : + -1, + params->resp.ece_options); + + return 0; +} + +static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr) +{ + int ret = 0; + + switch (qp->type) { + case MLX5_IB_QPT_DCT: + ret = (!attr->srq || !attr->recv_cq) ? -EINVAL : 0; + break; + case MLX5_IB_QPT_DCI: + ret = (attr->cap.max_recv_wr || attr->cap.max_recv_sge) ? + -EINVAL : + 0; + break; + case IB_QPT_RAW_PACKET: + ret = (attr->rwq_ind_tbl && attr->send_cq) ? -EINVAL : 0; + break; + default: + break; + } + + if (ret) + mlx5_ib_dbg(dev, "QP type %d has wrong attributes\n", qp->type); + + return ret; +} + +static int get_qp_uidx(struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) +{ + struct mlx5_ib_create_qp *ucmd = params->ucmd; + struct ib_udata *udata = params->udata; + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + if (params->is_rss_raw) + return 0; + + return get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &params->uidx); +} + +static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); + + if (mqp->state == IB_QPS_RTR) { + int err; + + err = mlx5_core_destroy_dct(dev, &mqp->dct.mdct); + if (err) { + mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); + return err; + } + } + + kfree(mqp->dct.in); + return 0; +} + +static int check_ucmd_data(struct mlx5_ib_dev *dev, + struct mlx5_create_qp_params *params) +{ + struct ib_udata *udata = params->udata; + size_t size, last; + int ret; + + if (params->is_rss_raw) + /* + * These QPs don't have "reserved" field in their + * create_qp input struct, so their data is always valid. + */ + last = sizeof(struct mlx5_ib_create_qp_rss); + else + last = offsetof(struct mlx5_ib_create_qp, reserved); + + if (udata->inlen <= last) + return 0; + + /* + * User provides different create_qp structures based on the + * flow and we need to know if he cleared memory after our + * struct create_qp ends.
+ */ + size = udata->inlen - last; + ret = ib_is_udata_cleared(params->udata, last, size); + if (!ret) + mlx5_ib_dbg( + dev, + "udata is not cleared, inlen = %zu, ucmd = %zu, last = %zu, size = %zu\n", + udata->inlen, params->ucmd_size, last, size); + return ret ? 0 : -EINVAL; +} + +int mlx5_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mlx5_create_qp_params params = {}; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct ib_pd *pd = ibqp->pd; + enum ib_qp_type type; + int err; + + err = check_qp_type(dev, attr, &type); + if (err) + return err; + + err = check_valid_flow(dev, pd, attr, udata); + if (err) + return err; + + params.udata = udata; + params.uidx = MLX5_IB_DEFAULT_UIDX; + params.attr = attr; + params.is_rss_raw = !!attr->rwq_ind_tbl; + + if (udata) { + err = process_udata_size(dev, &params); + if (err) + return err; + + err = check_ucmd_data(dev, &params); + if (err) + return err; + + params.ucmd = kzalloc(params.ucmd_size, GFP_KERNEL); + if (!params.ucmd) + return -ENOMEM; + + err = ib_copy_from_udata(params.ucmd, udata, params.inlen); + if (err) + goto free_ucmd; + } + + mutex_init(&qp->mutex); + qp->type = type; + if (udata) { + err = process_vendor_flags(dev, qp, params.ucmd, attr); + if (err) + goto free_ucmd; + + err = get_qp_uidx(qp, &params); + if (err) + goto free_ucmd; + } + err = process_create_flags(dev, qp, attr); + if (err) + goto free_ucmd; + + err = check_qp_attr(dev, qp, attr); + if (err) + goto free_ucmd; + + err = create_qp(dev, pd, qp, &params); + if (err) + goto free_ucmd; + + kfree(params.ucmd); + params.ucmd = NULL; + + if (udata) + /* + * It is safe to copy response for all user create QP flows, + * including MLX5_IB_QPT_DCT, which doesn't need it. + * In that case, resp will be filled with zeros.
+ */ + err = ib_copy_to_udata(udata, &params.resp, params.outlen); + if (err) + goto destroy_qp; + + return 0; + +destroy_qp: + switch (qp->type) { + case MLX5_IB_QPT_DCT: + mlx5_ib_destroy_dct(qp); + break; + case IB_QPT_GSI: + mlx5_ib_destroy_gsi(qp); + break; + default: + destroy_qp_common(dev, qp, udata); + } + +free_ucmd: + kfree(params.ucmd); + return err; +} + +int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + + if (mqp->type == IB_QPT_GSI) + return mlx5_ib_destroy_gsi(mqp); + + if (mqp->type == MLX5_IB_QPT_DCT) + return mlx5_ib_destroy_dct(mqp); + + destroy_qp_common(dev, mqp, udata); + return 0; +} + +static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp, + const struct ib_qp_attr *attr, int attr_mask, + void *qpc) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + u8 dest_rd_atomic; + u32 access_flags; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + dest_rd_atomic = attr->max_dest_rd_atomic; + else + dest_rd_atomic = qp->trans_qp.resp_depth; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + access_flags = attr->qp_access_flags; + else + access_flags = qp->trans_qp.atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + MLX5_SET(qpc, qpc, rre, !!(access_flags & IB_ACCESS_REMOTE_READ)); + + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { + int atomic_mode; + + atomic_mode = get_atomic_mode(dev, qp->type); + if (atomic_mode < 0) + return -EOPNOTSUPP; + + MLX5_SET(qpc, qpc, rae, 1); + MLX5_SET(qpc, qpc, atomic_mode, atomic_mode); + } + + MLX5_SET(qpc, qpc, rwe, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + return 0; +} + +enum { + MLX5_PATH_FLAG_FL = 1 << 0, + MLX5_PATH_FLAG_FREE_AR = 1 << 1, + MLX5_PATH_FLAG_COUNTER = 1 << 2, +}; + +static int mlx5_to_ib_rate_map(u8 rate) +{ + static const int rates[] = { IB_RATE_PORT_CURRENT, IB_RATE_56_GBPS, + IB_RATE_25_GBPS, IB_RATE_100_GBPS, + IB_RATE_200_GBPS, IB_RATE_50_GBPS, + IB_RATE_400_GBPS }; + + if (rate < ARRAY_SIZE(rates)) + return rates[rate]; + + return rate - MLX5_STAT_RATE_OFFSET; +} + +static int ib_to_mlx5_rate_map(u8 rate) +{ + switch (rate) { + case IB_RATE_PORT_CURRENT: + return 0; + case IB_RATE_56_GBPS: + return 1; + case IB_RATE_25_GBPS: + return 2; + case IB_RATE_100_GBPS: + return 3; + case IB_RATE_200_GBPS: + return 4; + case IB_RATE_50_GBPS: + return 5; + case IB_RATE_400_GBPS: + return 6; + default: + return rate + MLX5_STAT_RATE_OFFSET; + } + + return 0; +} + +static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +{ + u32 stat_rate_support; + + if (rate == IB_RATE_PORT_CURRENT) + return 0; + + if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS) + return -EINVAL; + + stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support); + while (rate != IB_RATE_PORT_CURRENT && + !(1 << ib_to_mlx5_rate_map(rate) & stat_rate_support)) + --rate; + + return ib_to_mlx5_rate_map(rate); +} + +static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 sl, + struct ib_pd *pd) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); + + err = mlx5_core_modify_tis(dev, sq->tisn, in); + + kvfree(in); + + return err; +} + +static int
modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 tx_affinity, + struct ib_pd *pd) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1); + MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); + + err = mlx5_core_modify_tis(dev, sq->tisn, in); + + kvfree(in); + + return err; +} + +static void mlx5_set_path_udp_sport(void *path, const struct rdma_ah_attr *ah, + u32 lqpn, u32 rqpn) + +{ + u32 fl = ah->grh.flow_label; + + if (!fl) + fl = rdma_calc_flow_label(lqpn, rqpn); + + MLX5_SET(ads, path, udp_sport, rdma_flow_label_to_udp_sport(fl)); +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct rdma_ah_attr *ah, void *path, u8 port, + int attr_mask, u32 path_flags, + const struct ib_qp_attr *attr, bool alt) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(ah); + bool global_tc; + int err; + u8 ah_flags = rdma_ah_get_ah_flags(ah); + u8 sl = rdma_ah_get_sl(ah); + u8 tclass = grh->traffic_class; + + if (attr_mask & IB_QP_PKEY_INDEX) + MLX5_SET(ads, path, pkey_index, + alt ? attr->alt_pkey_index : attr->pkey_index); + + if (ah_flags & IB_AH_GRH) { + const struct ib_port_immutable *immutable; + u8 tmp_hop_limit = 0; + enum ib_gid_type gid_type; + + immutable = ib_port_immutable_read(&dev->ib_dev, port); + if (grh->sgid_index >= immutable->gid_tbl_len) { + pr_err("sgid_index (%u) too large. max is %d\n", + grh->sgid_index, + immutable->gid_tbl_len); + return -EINVAL; + } + + tclass_get_tclass(dev, &dev->tcd[port - 1], ah, port, &tclass, + &global_tc); + memcpy(&qp->ah, ah, sizeof(qp->ah)); + qp->tclass = tclass; + + gid_type = ah->grh.sgid_attr->gid_type; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + if (qp->type == MLX5_IB_QPT_DCI) + MLX5_SET(ads, path, f_dscp, global_tc); + MLX5_SET(ads, path, dscp, tclass >> 2); + } + + MLX5_SET(ads, path, src_addr_index, grh->sgid_index); + if ((ah->type == RDMA_AH_ATTR_TYPE_ROCE) && + (gid_type != IB_GID_TYPE_IB) && + (ah->grh.hop_limit < 2)) + tmp_hop_limit = IPV6_DEFAULT_HOPLIMIT; + else + tmp_hop_limit = ah->grh.hop_limit; + + tmp_hop_limit = (dev->ttld[port - 1].val) ? 
+ dev->ttld[port - 1].val : tmp_hop_limit; + MLX5_SET(ads, path, hop_limit,tmp_hop_limit); + MLX5_SET(ads, path, flow_label, grh->flow_label); + memcpy(MLX5_ADDR_OF(ads, path, rgid_rip), grh->dgid.raw, + sizeof(grh->dgid.raw)); + } + + if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (!(ah_flags & IB_AH_GRH)) + return -EINVAL; + + ether_addr_copy(MLX5_ADDR_OF(ads, path, rmac_47_32), + ah->roce.dmac); + if ((qp->type == IB_QPT_RC || + qp->type == IB_QPT_UC || + qp->type == IB_QPT_XRC_INI || + qp->type == IB_QPT_XRC_TGT) && + (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) && + (attr_mask & IB_QP_DEST_QPN)) + mlx5_set_path_udp_sport(path, ah, + qp->ibqp.qp_num, + attr->dest_qp_num); + MLX5_SET(ads, path, eth_prio, sl & 0x7); + } else { + MLX5_SET(ads, path, fl, !!(path_flags & MLX5_PATH_FLAG_FL)); + MLX5_SET(ads, path, free_ar, + !!(path_flags & MLX5_PATH_FLAG_FREE_AR)); + MLX5_SET(ads, path, rlid, rdma_ah_get_dlid(ah)); + MLX5_SET(ads, path, mlid, rdma_ah_get_path_bits(ah)); + MLX5_SET(ads, path, grh, !!(ah_flags & IB_AH_GRH)); + MLX5_SET(ads, path, sl, sl); + } + + err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); + if (err < 0) + return err; + MLX5_SET(ads, path, stat_rate, err); + MLX5_SET(ads, path, vhca_port_num, port); + + if (attr_mask & IB_QP_TIMEOUT) + MLX5_SET(ads, path, ack_timeout, + alt ? attr->alt_timeout : attr->timeout); + + if ((qp->type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) + return modify_raw_packet_eth_prio(dev->mdev, + &qp->raw_packet_qp.sq, + sl & 0xf, qp->ibqp.pd); + + return 0; +} + +static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_PRI_PORT, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, + }, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_OFFLOAD_TYPE, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_RNR_TIMEOUT, + }, + }, + 
[MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH | + MLX5_QP_OPTPAR_OFFLOAD_TYPE | + MLX5_QP_OPTPAR_RMPN_XRQN, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | + MLX5_QP_OPTPAR_CQN_RCV, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, + }, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, + [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, + [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | + MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RAE | + MLX5_QP_OPTPAR_RRE, + }, + }, +}; + +static int ib_nr_to_mlx5_nr(int ib_mask) +{ + switch (ib_mask) { + case IB_QP_STATE: + return 0; + case IB_QP_CUR_STATE: + return 0; + case IB_QP_EN_SQD_ASYNC_NOTIFY: + return 0; + case IB_QP_ACCESS_FLAGS: + return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE | + MLX5_QP_OPTPAR_RAE; + case IB_QP_PKEY_INDEX: + return MLX5_QP_OPTPAR_PKEY_INDEX; + case IB_QP_PORT: + return MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_QKEY: + return MLX5_QP_OPTPAR_Q_KEY; + case IB_QP_AV: + return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH | + MLX5_QP_OPTPAR_PRI_PORT; + case IB_QP_PATH_MTU: + return 0; + case IB_QP_TIMEOUT: + return MLX5_QP_OPTPAR_ACK_TIMEOUT; + case IB_QP_RETRY_CNT: + return MLX5_QP_OPTPAR_RETRY_COUNT; + case IB_QP_RNR_RETRY: + return MLX5_QP_OPTPAR_RNR_RETRY; + case IB_QP_RQ_PSN: + return 0; + case IB_QP_MAX_QP_RD_ATOMIC: + return MLX5_QP_OPTPAR_SRA_MAX; + case IB_QP_ALT_PATH: + return MLX5_QP_OPTPAR_ALT_ADDR_PATH; + case IB_QP_MIN_RNR_TIMER: + return MLX5_QP_OPTPAR_RNR_TIMEOUT; + case IB_QP_SQ_PSN: + return 0; + case IB_QP_MAX_DEST_RD_ATOMIC: + return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE | + MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE; + case IB_QP_PATH_MIG_STATE: + return MLX5_QP_OPTPAR_PM_STATE; + case IB_QP_OFFLOAD_TYPE: + return MLX5_QP_OPTPAR_OFFLOAD_TYPE; + case IB_QP_RMPN_XRQN: + return MLX5_QP_OPTPAR_RMPN_XRQN; + case IB_QP_CAP: + return 0; + case IB_QP_DEST_QPN: + return 0; + } + return 0; +} + +static int ib_mask_to_mlx5_opt(int ib_mask) +{ + int result = 0; + int i; + + for (i = 0; i < 8 * sizeof(int); i++) { + if ((1 << i) & ib_mask) + result |= ib_nr_to_mlx5_nr(1 << i); + } + + return result; +} + +static int modify_raw_packet_qp_rq( + struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rq_state, rq->state); + MLX5_SET(modify_rq_in, in, uid, to_mpd(pd)->uid); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + if 
(raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); + } else + dev_info_once( + &dev->ib_dev.dev, + "RAW PACKET QP counters are not supported on current FW\n"); + } + + err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in); + if (err) + goto out; + + rq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp_sq( + struct mlx5_core_dev *dev, struct mlx5_ib_sq *sq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd) +{ + struct mlx5_ib_qp *ibqp = sq->base.container_mibqp; + struct mlx5_rate_limit old_rl = ibqp->rl; + struct mlx5_rate_limit new_rl = old_rl; + bool new_rate_added = false; + u16 rl_index = 0; + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, uid, to_mpd(pd)->uid); + MLX5_SET(modify_sq_in, in, sq_state, sq->state); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + if (raw_qp_param->set_mask & MLX5_RAW_QP_RATE_LIMIT) { + if (new_state != MLX5_SQC_STATE_RDY) + pr_warn("%s: Rate limit can only be changed when SQ is moving to RDY\n", + __func__); + else + new_rl = raw_qp_param->rl; + } + + if (!mlx5_rl_are_equal(&old_rl, &new_rl)) { + if (new_rl.rate) { + err = mlx5_rl_add_rate(dev, &rl_index, &new_rl); + if (err) { + pr_err("Failed configuring rate limit(err %d): \ + rate %u, max_burst_sz %u, typical_pkt_sz %u\n", + err, new_rl.rate, new_rl.max_burst_sz, + new_rl.typical_pkt_sz); + + goto out; + } + new_rate_added = true; + } + + MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); + /* index 0 means no limit */ + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); + } + + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in); + if (err) { + /* Remove new rate from table if failed */ + if (new_rate_added) + mlx5_rl_remove_rate(dev, &new_rl); + goto out; + } + + /* Only remove the old rate after new rate was set */ + if ((old_rl.rate && !mlx5_rl_are_equal(&old_rl, &new_rl)) || + (new_state != MLX5_SQC_STATE_RDY)) { + mlx5_rl_remove_rate(dev, &old_rl); + if (new_state != MLX5_SQC_STATE_RDY) + memset(&new_rl, 0, sizeof(new_rl)); + } + + ibqp->rl = new_rl; + sq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 tx_affinity) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + int modify_rq = !!qp->rq.wqe_cnt; + int modify_sq = !!qp->sq.wqe_cnt; + int rq_state; + int sq_state; + int err; + + switch (raw_qp_param->operation) { + case MLX5_CMD_OP_RST2INIT_QP: + rq_state = MLX5_RQC_STATE_RDY; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_2ERR_QP: + rq_state = MLX5_RQC_STATE_ERR; + sq_state = MLX5_SQC_STATE_ERR; + break; + case MLX5_CMD_OP_2RST_QP: + rq_state = MLX5_RQC_STATE_RST; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + if (raw_qp_param->set_mask & ~MLX5_RAW_QP_RATE_LIMIT) + return -EINVAL; + + modify_rq = 0; + sq_state = MLX5_SQC_STATE_RDY; + break; + case 
MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + if (raw_qp_param->set_mask) + return -EINVAL; + else + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if (modify_rq) { + err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param, + qp->ibqp.pd); + if (err) + return err; + } + + if (modify_sq) { + struct mlx5_flow_handle *flow_rule; + + if (tx_affinity) { + err = modify_raw_packet_tx_affinity(dev->mdev, sq, + tx_affinity, + qp->ibqp.pd); + if (err) + return err; + } + + flow_rule = create_flow_rule_vport_sq(dev, sq, + raw_qp_param->port); + if (IS_ERR(flow_rule)) + return PTR_ERR(flow_rule); + + err = modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, + raw_qp_param, qp->ibqp.pd); + if (err) { + if (flow_rule) + mlx5_del_flow_rules(flow_rule); + return err; + } + + if (flow_rule) { + destroy_flow_rule_vport_sq(sq); + sq->flow_rule = flow_rule; + } + + return err; + } + + return 0; +} + +static unsigned int get_tx_affinity_rr(struct mlx5_ib_dev *dev, + struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1; + atomic_t *tx_port_affinity; + + if (ucontext) + tx_port_affinity = &ucontext->tx_port_affinity; + else + tx_port_affinity = &dev->port[port_num].roce.tx_port_affinity; + + return (unsigned int)atomic_add_return(1, tx_port_affinity) % + (dev->lag_active ? dev->lag_ports : MLX5_CAP_GEN(dev->mdev, num_lag_ports)) + 1; +} + +static bool qp_supports_affinity(struct mlx5_ib_qp *qp) +{ + if ((qp->type == IB_QPT_RC) || (qp->type == IB_QPT_UD) || + (qp->type == IB_QPT_UC) || (qp->type == IB_QPT_RAW_PACKET) || + (qp->type == IB_QPT_XRC_INI) || (qp->type == IB_QPT_XRC_TGT) || + (qp->type == MLX5_IB_QPT_DCI)) + return true; + return false; +} + +static unsigned int get_tx_affinity(struct ib_qp *qp, + const struct ib_qp_attr *attr, + int attr_mask, u8 init, + struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_qp_base *qp_base; + unsigned int tx_affinity; + + if (!(mlx5_ib_lag_should_assign_affinity(dev) && + qp_supports_affinity(mqp))) + return 0; + + if (mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) + tx_affinity = mqp->gsi_lag_port; + else if (init) + tx_affinity = get_tx_affinity_rr(dev, udata); + else if ((attr_mask & IB_QP_AV) && attr->xmit_slave) + tx_affinity = + mlx5_lag_get_slave_port(dev->mdev, attr->xmit_slave); + else + return 0; + + qp_base = &mqp->trans_qp.base; + if (ucontext) + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n", + tx_affinity, qp_base->mqp.qpn, ucontext); + else + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n", + tx_affinity, qp_base->mqp.qpn); + return tx_affinity; +} + +static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, + struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + u32 in[MLX5_ST_SZ_DW(rts2rts_qp_in)] = {}; + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_qp_base *base; + u32 set_id; + u32 *qpc; + + if (counter) + set_id = counter->id; + else + set_id = mlx5_ib_get_counters_id(dev, mqp->port - 1); + + base = &mqp->trans_qp.base; + MLX5_SET(rts2rts_qp_in, in, opcode, MLX5_CMD_OP_RTS2RTS_QP); + MLX5_SET(rts2rts_qp_in, in, qpn, base->mqp.qpn); + MLX5_SET(rts2rts_qp_in, in, uid, base->mqp.uid); + MLX5_SET(rts2rts_qp_in, in, opt_param_mask, + 
MLX5_QP_OPTPAR_COUNTER_SET_ID); + + qpc = MLX5_ADDR_OF(rts2rts_qp_in, in, qpc); + MLX5_SET(qpc, qpc, counter_set_id, set_id); + return mlx5_cmd_exec_in(dev->mdev, rts2rts_qp, in); +} + +static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + enum ib_qp_state cur_state, + enum ib_qp_state new_state, + const struct mlx5_ib_modify_qp *ucmd, + struct mlx5_ib_modify_qp_resp *resp, + struct ib_udata *udata) +{ + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQD_RTS_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; + struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_ib_pd *pd; + enum mlx5_qp_state mlx5_cur, mlx5_new; + void *qpc, *pri_path, *alt_path; + enum mlx5_qp_optpar optpar = 0; + u32 set_id = 0; + int mlx5_st; + int err; + u16 op; + u8 tx_affinity = 0; + struct ib_srq *old_srq = NULL; + + mlx5_st = to_mlx5_st(qp->type); + if (mlx5_st < 0) + return -EINVAL; + + qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); + if (!qpc) + return -ENOMEM; + + pd = to_mpd(qp->ibqp.pd); + MLX5_SET(qpc, qpc, st, mlx5_st); + + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + } else { + switch (attr->path_mig_state) { + case IB_MIG_MIGRATED: + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + break; + case IB_MIG_REARM: + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_REARM); + break; + case IB_MIG_ARMED: + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_ARMED); + break; + } + } + + tx_affinity = get_tx_affinity(ibqp, attr, attr_mask, + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT, udata); + + MLX5_SET(qpc, qpc, lag_tx_port_affinity, tx_affinity); + if (tx_affinity && new_state == IB_QPS_RTR && + MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity)) + optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF; + + if (is_sqp(qp->type)) { + MLX5_SET(qpc, qpc, mtu, IB_MTU_256); + MLX5_SET(qpc, qpc, log_msg_max, 8); + } else if ((qp->type == IB_QPT_UD && + !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) || + qp->type == MLX5_IB_QPT_REG_UMR) { + MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); + MLX5_SET(qpc, qpc, log_msg_max, 12); + } else if (attr_mask & IB_QP_PATH_MTU) { + if (attr->path_mtu < IB_MTU_256 || + attr->path_mtu > IB_MTU_4096) { + mlx5_ib_warn(dev, "invalid mtu %d\n", 
attr->path_mtu); + err = -EINVAL; + goto out; + } + MLX5_SET(qpc, qpc, mtu, attr->path_mtu); + MLX5_SET(qpc, qpc, log_msg_max, + MLX5_CAP_GEN(dev->mdev, log_max_msg)); + } + + if (attr_mask & IB_QP_DEST_QPN) + MLX5_SET(qpc, qpc, remote_qpn, attr->dest_qp_num); + + pri_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + alt_path = MLX5_ADDR_OF(qpc, qpc, secondary_address_path); + + if (attr_mask & IB_QP_PKEY_INDEX) + MLX5_SET(ads, pri_path, pkey_index, attr->pkey_index); + + /* todo implement counter_index functionality */ + + if (is_sqp(qp->type)) + MLX5_SET(ads, pri_path, vhca_port_num, qp->port); + + if (attr_mask & IB_QP_PORT) + MLX5_SET(ads, pri_path, vhca_port_num, attr->port_num); + + if (attr_mask & IB_QP_AV) { + err = mlx5_set_path(dev, qp, &attr->ah_attr, pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : + qp->port, + attr_mask, 0, attr, false); + if (err) + goto out; + } + + if (attr_mask & IB_QP_TIMEOUT) + MLX5_SET(ads, pri_path, ack_timeout, attr->timeout); + + if (attr_mask & IB_QP_ALT_PATH) { + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, alt_path, + attr->alt_port_num, + attr_mask | IB_QP_PKEY_INDEX | + IB_QP_TIMEOUT, + 0, attr, true); + if (err) + goto out; + } + + get_cqs(qp->type, qp->ibqp.send_cq, qp->ibqp.recv_cq, + &send_cq, &recv_cq); + + MLX5_SET(qpc, qpc, pd, pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + if (send_cq) + MLX5_SET(qpc, qpc, cqn_snd, send_cq->mcq.cqn); + if (recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, recv_cq->mcq.cqn); + + MLX5_SET(qpc, qpc, log_ack_req_freq, MLX5_IB_ACK_REQ_FREQ); + + if (attr_mask & IB_QP_RNR_RETRY) + MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry); + + if (attr_mask & IB_QP_RETRY_CNT) + MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic) + MLX5_SET(qpc, qpc, log_sra_max, ilog2(attr->max_rd_atomic)); + + if (attr_mask & IB_QP_SQ_PSN) + MLX5_SET(qpc, qpc, next_send_psn, attr->sq_psn); + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic) + MLX5_SET(qpc, qpc, log_rra_max, + ilog2(attr->max_dest_rd_atomic)); + + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + err = set_qpc_atomic_flags(qp, attr, attr_mask, qpc); + if (err) + goto out; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + MLX5_SET(qpc, qpc, min_rnr_nak, attr->min_rnr_timer); + + if (attr_mask & IB_QP_RQ_PSN) + MLX5_SET(qpc, qpc, next_rcv_psn, attr->rq_psn); + + if (attr_mask & IB_QP_QKEY) + MLX5_SET(qpc, qpc, q_key, attr->qkey); + + if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + u8 port_num = (attr_mask & IB_QP_PORT ? 
attr->port_num : + qp->port) - 1; + + /* Underlay port should be used - index 0 function per port */ + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) + port_num = 0; + + if (ibqp->counter) + set_id = ibqp->counter->id; + else + set_id = mlx5_ib_get_counters_id(dev, port_num); + MLX5_SET(qpc, qpc, counter_set_id, set_id); + } + + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + MLX5_SET(qpc, qpc, rlky, 1); + + if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) + MLX5_SET(qpc, qpc, deth_sqpn, 1); + + mlx5_cur = to_mlx5_state(cur_state); + mlx5_new = to_mlx5_state(new_state); + + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) { + err = -EINVAL; + goto out; + } + + op = optab[mlx5_cur][mlx5_new]; + optpar |= ib_mask_to_mlx5_opt(attr_mask); + optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; + + if (attr_mask & IB_QP_RMPN_XRQN) { + old_srq = ibqp->srq; + err = mlx5_ib_set_qp_srqn(qpc, ibqp, attr->rmpn_xrqn); + if (err) { + old_srq = NULL; + goto out; + } + atomic_dec(&old_srq->usecnt); + atomic_inc(&ibqp->srq->usecnt); + } + + if (attr_mask & IB_QP_OFFLOAD_TYPE) { + err = mlx5_ib_set_qp_offload_type(qpc, ibqp, + attr->offload_type); + if (err) + goto out; + } + + if (qp->type == IB_QPT_RAW_PACKET || + qp->flags & IB_QP_CREATE_SOURCE_QPN) { + struct mlx5_modify_raw_qp_param raw_qp_param = {}; + + raw_qp_param.operation = op; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + raw_qp_param.rq_q_ctr_id = set_id; + raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; + } + + if (attr_mask & IB_QP_PORT) + raw_qp_param.port = attr->port_num; + + if (attr_mask & IB_QP_RATE_LIMIT) { + raw_qp_param.rl.rate = attr->rate_limit; + + if (ucmd->burst_info.max_burst_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_burst_bound)) { + raw_qp_param.rl.max_burst_sz = + ucmd->burst_info.max_burst_sz; + } else { + err = -EINVAL; + goto out; + } + } + + if (ucmd->burst_info.typical_pkt_sz) { + if (attr->rate_limit && + MLX5_CAP_QOS(dev->mdev, packet_pacing_typical_size)) { + raw_qp_param.rl.typical_pkt_sz = + ucmd->burst_info.typical_pkt_sz; + } else { + err = -EINVAL; + goto out; + } + } + + raw_qp_param.set_mask |= MLX5_RAW_QP_RATE_LIMIT; + } + + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); + } else { + if (udata) { + /* For the kernel flows, the resp will stay zero */ + resp->ece_options = + MLX5_CAP_GEN(dev->mdev, ece_support) ? + ucmd->ece_options : 0; + resp->response_length = sizeof(*resp); + } + err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp, + &resp->ece_options); + } + + if (err) + goto out; + + qp->state = new_state; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->trans_qp.atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_ALT_PATH) + qp->trans_qp.alt_port = attr->alt_port_num; + + /* + * If we moved a kernel QP to RESET, clean up all old CQ + * entries and reinitialize the QP. + */ + if (new_state == IB_QPS_RESET && + !ibqp->uobject && qp->type != IB_QPT_XRC_TGT) { + mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, + ibqp->srq ? 
to_msrq(ibqp->srq) : NULL); + if (send_cq != recv_cq) + mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); + + qp->rq.head = 0; + qp->rq.tail = 0; + qp->sq.head = 0; + qp->sq.tail = 0; + qp->sq.cur_post = 0; + if (qp->sq.wqe_cnt) + qp->sq.cur_edge = get_sq_edge(&qp->sq, 0); + qp->sq.last_poll = 0; + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; + } + + if ((new_state == IB_QPS_RTS) && qp->counter_pending) { + err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter); + if (!err) + qp->counter_pending = 0; + } + +out: + if (err && old_srq) { + atomic_dec(&ibqp->srq->usecnt); + atomic_inc(&old_srq->usecnt); + ibqp->srq = old_srq; + } + kfree(qpc); + return err; +} + +static inline bool is_valid_mask(int mask, int req, int opt) +{ + if ((mask & req) != req) + return false; + + if (mask & ~(req | opt)) + return false; + + return true; +} + +/* check valid transition for driver QP types + * for now the only QP type that this function supports is DCI + */ +static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new_state, + enum ib_qp_attr_mask attr_mask) +{ + int req = IB_QP_STATE; + int opt = 0; + + if (new_state == IB_QPS_RESET) { + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + req |= IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + opt = IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + req |= IB_QP_PATH_MTU; + opt = IB_QP_PKEY_INDEX | IB_QP_AV; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { + req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_SQ_PSN; + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) { + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state != IB_QPS_RESET && new_state == IB_QPS_ERR) { + return is_valid_mask(attr_mask, req, opt); + } + return false; +} + +/* mlx5_ib_modify_dct: modify a DCT QP + * valid transitions are: + * RESET to INIT: must set access_flags, pkey_index and port + * INIT to RTR : must set min_rnr_timer, tclass, flow_label, + * mtu, gid_index and hop_limit + * Other transitions and attributes are illegal + */ +static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct mlx5_ib_modify_qp *ucmd, + struct ib_udata *udata) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + enum ib_qp_state cur_state, new_state; + int required = IB_QP_STATE; + void *dctc; + int err; + u8 tmp_hop_limit; + + if (!(attr_mask & IB_QP_STATE)) + return -EINVAL; + + cur_state = qp->state; + new_state = attr->qp_state; + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + if (MLX5_CAP_GEN(dev->mdev, ece_support) && ucmd->ece_options) + /* + * DCT doesn't initialize QP till modify command is executed, + * so we need to overwrite previously set ECE field if user + * provided any value except zero, which means not set/not + * valid. 
+ */ + MLX5_SET(dctc, dctc, ece, ucmd->ece_options); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + u16 set_id; + + required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + + if (attr->port_num == 0 || + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); + return -EINVAL; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + MLX5_SET(dctc, dctc, rre, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + MLX5_SET(dctc, dctc, rwe, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { + int atomic_mode; + + atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + if (atomic_mode < 0) + return -EOPNOTSUPP; + + MLX5_SET(dctc, dctc, atomic_mode, atomic_mode); + MLX5_SET(dctc, dctc, rae, 1); + } + MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); + if (mlx5_lag_is_active(dev->mdev)) + MLX5_SET(dctc, dctc, port, + get_tx_affinity_rr(dev, udata)); + else + MLX5_SET(dctc, dctc, port, attr->port_num); + + set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); + MLX5_SET(dctc, dctc, counter_set_id, set_id); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + struct mlx5_ib_modify_qp_resp resp = {}; + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; + u32 min_resp_len = offsetofend(typeof(resp), dctn); + u8 tclass = attr->ah_attr.grh.traffic_class; + u8 port = MLX5_GET(dctc, dctc, port); + + if (udata->outlen < min_resp_len) + return -EINVAL; + /* + * If we don't have enough space for the ECE options, + * simply indicate it with resp.response_length. + */ + resp.response_length = (udata->outlen < sizeof(resp)) ? + min_resp_len : + sizeof(resp); + + required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + MLX5_SET(dctc, dctc, min_rnr_nak, attr->min_rnr_timer); + if (dev->tcd[port - 1].val >= 0) + tclass = dev->tcd[port - 1].val; + MLX5_SET(dctc, dctc, tclass, tclass); + MLX5_SET(dctc, dctc, flow_label, attr->ah_attr.grh.flow_label); + MLX5_SET(dctc, dctc, mtu, attr->path_mtu); + MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); + + attr->ah_attr.type = rdma_ah_find_type(&dev->ib_dev, port); + if ((attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) && + (attr->ah_attr.grh.hop_limit < 2)) + tmp_hop_limit = IPV6_DEFAULT_HOPLIMIT; + else + tmp_hop_limit = attr->ah_attr.grh.hop_limit; + + tmp_hop_limit = (dev->ttld[port - 1].val) ? 
+ dev->ttld[port - 1].val : tmp_hop_limit; + MLX5_SET(dctc, dctc, hop_limit, tmp_hop_limit); + + if (rdma_protocol_roce(ibqp->device, port)) + MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); + + err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, + MLX5_ST_SZ_BYTES(create_dct_in), out, + sizeof(out)); + err = mlx5_cmd_check(dev->mdev, err, qp->dct.in, out); + if (err) + return err; + resp.dctn = qp->dct.mdct.mqp.qpn; + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + resp.ece_options = MLX5_GET(create_dct_out, out, ece); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_dct(dev, &qp->dct.mdct); + return err; + } + } else { + mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state); + return -EINVAL; + } + + qp->state = new_state; + return 0; +} + +static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + if (dev->profile != &raw_eth_profile) + return true; + + if (qp->type == IB_QPT_RAW_PACKET || qp->type == MLX5_IB_QPT_REG_UMR) + return true; + + /* Internal QP used for wc testing, with NOPs in wq */ + if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) + return true; + + return false; +} + +static int ignored_ts_check(enum ib_qp_type qp_type) +{ + if (qp_type == MLX5_IB_QPT_REG_UMR || + qp_type == MLX5_IB_QPT_SW_CNAK || + qp_type == MLX5_IB_QPT_DCI) + return 1; + + return 0; +} + +static int validate_rd_atomic(struct mlx5_ib_dev *dev, struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_type qp_type) +{ + int log_max_ra_res; + int log_max_ra_req; + + if (qp_type == MLX5_IB_QPT_DCI) { + log_max_ra_res = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_res_dc); + log_max_ra_req = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_req_dc); + } else { + log_max_ra_res = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_res_qp); + log_max_ra_req = 1 << MLX5_CAP_GEN(dev->mdev, + log_max_ra_req_qp); + } + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && + attr->max_rd_atomic > log_max_ra_res) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); + return false; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && + attr->max_dest_rd_atomic > log_max_ra_req) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); + return false; + } + return true; +} + +int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_modify_qp_resp resp = {}; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_modify_qp ucmd = {}; + enum ib_qp_type qp_type; + enum ib_qp_state cur_state, new_state; + int err = -EINVAL; + + if (!mlx5_ib_modify_qp_allowed(dev, qp)) + return -EOPNOTSUPP; + + if (attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT | + IB_QP_OFFLOAD_TYPE | IB_QP_RMPN_XRQN)) + return -EOPNOTSUPP; + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (udata && udata->inlen) { + if (udata->inlen < offsetofend(typeof(ucmd), ece_options)) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd)))) + return -EFAULT; + + if (ucmd.comp_mask || + memchr_inv(&ucmd.burst_info.reserved, 0, + sizeof(ucmd.burst_info.reserved))) + return -EOPNOTSUPP; + + } + + if (qp->type == IB_QPT_GSI) + return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); + + qp_type = (qp->type == 
MLX5_IB_QPT_HW_GSI) ? IB_QPT_GSI : qp->type; + + if (qp_type == MLX5_IB_QPT_DCT) + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, &ucmd, udata); + + mutex_lock(&qp->mutex); + + cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) { + if (attr_mask & ~(IB_QP_STATE | IB_QP_CUR_STATE)) { + mlx5_ib_dbg(dev, "invalid attr_mask 0x%x when underlay QP is used\n", + attr_mask); + goto out; + } + } else if (!ignored_ts_check(qp_type) && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, + attr_mask)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, qp->type, attr_mask); + goto out; + } else if (qp_type == MLX5_IB_QPT_DCI && + !modify_dci_qp_is_ok(cur_state, new_state, attr_mask)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, qp_type, attr_mask); + goto out; + } + + if ((attr_mask & IB_QP_PORT) && + (attr->port_num == 0 || + attr->port_num > dev->num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); + goto out; + } + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= dev->pkey_table_len) { + mlx5_ib_dbg(dev, "invalid pkey index %d\n", attr->pkey_index); + goto out; + } + + if (!validate_rd_atomic(dev, attr, attr_mask, qp_type)) + goto out; + + if (cur_state == new_state && cur_state == IB_QPS_RESET) { + err = 0; + goto out; + } + + err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, + new_state, &ucmd, &resp, udata); + + /* resp.response_length is set in ECE supported flows only */ + if (!err && resp.response_length && + udata->outlen >= resp.response_length) + /* Return -EFAULT to the user and expect him to destroy QP. 
*/ + err = ib_copy_to_udata(udata, &resp, resp.response_length); + +out: + mutex_unlock(&qp->mutex); + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) +{ + switch (mlx5_state) { + case MLX5_QP_STATE_RST: return IB_QPS_RESET; + case MLX5_QP_STATE_INIT: return IB_QPS_INIT; + case MLX5_QP_STATE_RTR: return IB_QPS_RTR; + case MLX5_QP_STATE_RTS: return IB_QPS_RTS; + case MLX5_QP_STATE_SQ_DRAINING: + case MLX5_QP_STATE_SQD: return IB_QPS_SQD; + case MLX5_QP_STATE_SQER: return IB_QPS_SQE; + case MLX5_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) +{ + switch (mlx5_mig_state) { + case MLX5_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX5_QP_PM_REARM: return IB_MIG_REARM; + case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, + struct rdma_ah_attr *ah_attr, void *path) +{ + int port = MLX5_GET(ads, path, vhca_port_num); + int static_rate; + + memset(ah_attr, 0, sizeof(*ah_attr)); + + if (!port || port > ibdev->num_ports) + return; + + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port); + + rdma_ah_set_port_num(ah_attr, port); + rdma_ah_set_sl(ah_attr, MLX5_GET(ads, path, sl)); + + rdma_ah_set_dlid(ah_attr, MLX5_GET(ads, path, rlid)); + rdma_ah_set_path_bits(ah_attr, MLX5_GET(ads, path, mlid)); + + static_rate = MLX5_GET(ads, path, stat_rate); + rdma_ah_set_static_rate(ah_attr, mlx5_to_ib_rate_map(static_rate)); + if (MLX5_GET(ads, path, grh) || + ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + rdma_ah_set_grh(ah_attr, NULL, MLX5_GET(ads, path, flow_label), + MLX5_GET(ads, path, src_addr_index), + MLX5_GET(ads, path, hop_limit), + MLX5_GET(ads, path, tclass)); + rdma_ah_set_dgid_raw(ah_attr, MLX5_ADDR_OF(ads, path, rgid_rip)); + } +} + +static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u8 *sq_state) +{ + int err; + + err = mlx5_core_query_sq_state(dev->mdev, sq->base.mqp.qpn, sq_state); + if (err) + goto out; + sq->state = *sq_state; + +out: + return err; +} + +static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = kvzalloc(inlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + rq->state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + [MLX5_SQ_STATE_NA] = IB_QPS_RESET, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + [MLX5_SQ_STATE_NA] = IB_QPS_ERR, + }, + [MLX5_RQ_STATE_NA] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = 
MLX5_QP_STATE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) { + WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", + qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, + qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); + return -EINVAL; + } + + if (*qp_state == MLX5_QP_STATE) + *qp_state = qp->state; + + return 0; +} + +static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u8 *raw_packet_qp_state) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + int err; + u8 sq_state = MLX5_SQ_STATE_NA; + u8 rq_state = MLX5_RQ_STATE_NA; + + if (qp->sq.wqe_cnt) { + err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); + if (err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); + if (err) + return err; + } + + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_packet_qp_state); +} + +static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_attr *qp_attr) +{ + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + void *qpc, *pri_path, *alt_path; + u32 *outb; + int err; + + outb = kzalloc(outlen, GFP_KERNEL); + if (!outb) + return -ENOMEM; + + err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen); + if (err) + goto out; + + qpc = MLX5_ADDR_OF(query_qp_out, outb, qpc); + + qp->state = to_ib_qp_state(MLX5_GET(qpc, qpc, state)); + if (MLX5_GET(qpc, qpc, state) == MLX5_QP_STATE_SQ_DRAINING) + qp_attr->sq_draining = 1; + + qp_attr->path_mtu = MLX5_GET(qpc, qpc, mtu); + qp_attr->path_mig_state = to_ib_mig_state(MLX5_GET(qpc, qpc, pm_state)); + qp_attr->qkey = MLX5_GET(qpc, qpc, q_key); + qp_attr->rq_psn = MLX5_GET(qpc, qpc, next_rcv_psn); + qp_attr->sq_psn = MLX5_GET(qpc, qpc, next_send_psn); + qp_attr->dest_qp_num = MLX5_GET(qpc, qpc, remote_qpn); + + if (MLX5_GET(qpc, qpc, rre)) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ; + if (MLX5_GET(qpc, qpc, rwe)) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_WRITE; + if (MLX5_GET(qpc, qpc, rae)) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_ATOMIC; + + qp_attr->max_rd_atomic = 1 << MLX5_GET(qpc, qpc, log_sra_max); + qp_attr->max_dest_rd_atomic = 1 << MLX5_GET(qpc, qpc, log_rra_max); + qp_attr->min_rnr_timer = MLX5_GET(qpc, qpc, min_rnr_nak); + qp_attr->retry_cnt = MLX5_GET(qpc, qpc, retry_count); + qp_attr->rnr_retry = MLX5_GET(qpc, qpc, rnr_retry); + + pri_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + alt_path = MLX5_ADDR_OF(qpc, qpc, secondary_address_path); + + if (qp->type == IB_QPT_RC || qp->type == IB_QPT_UC || + qp->type == IB_QPT_XRC_INI || qp->type == IB_QPT_XRC_TGT) { + to_rdma_ah_attr(dev, &qp_attr->ah_attr, pri_path); + to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, alt_path); + qp_attr->alt_pkey_index = MLX5_GET(ads, alt_path, pkey_index); + qp_attr->alt_port_num = MLX5_GET(ads, alt_path, vhca_port_num); + } + + qp_attr->pkey_index = MLX5_GET(ads, pri_path, pkey_index); + qp_attr->port_num = MLX5_GET(ads, pri_path, vhca_port_num); + qp_attr->timeout = MLX5_GET(ads, pri_path, ack_timeout); + qp_attr->alt_timeout = MLX5_GET(ads, alt_path, ack_timeout); + +out: + kfree(outb); + return err; +} + +static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_core_dct 
*dct = &mqp->dct.mdct; + u32 *out; + u32 access_flags = 0; + int outlen = MLX5_ST_SZ_BYTES(query_dct_out); + void *dctc; + int err; + int supported_mask = IB_QP_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT | + IB_QP_MIN_RNR_TIMER | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_PKEY_INDEX; + + if (qp_attr_mask & ~supported_mask) + return -EINVAL; + if (mqp->state != IB_QPS_RTR) + return -EINVAL; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_dct_query(dev, dct, out, outlen); + if (err) + goto out; + + dctc = MLX5_ADDR_OF(query_dct_out, out, dct_context_entry); + + if (qp_attr_mask & IB_QP_STATE) + qp_attr->qp_state = IB_QPS_RTR; + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + if (MLX5_GET(dctc, dctc, rre)) + access_flags |= IB_ACCESS_REMOTE_READ; + if (MLX5_GET(dctc, dctc, rwe)) + access_flags |= IB_ACCESS_REMOTE_WRITE; + if (MLX5_GET(dctc, dctc, rae)) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + qp_attr->qp_access_flags = access_flags; + } + + if (qp_attr_mask & IB_QP_PORT) + qp_attr->port_num = MLX5_GET(dctc, dctc, port); + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); + if (qp_attr_mask & IB_QP_AV) { + qp_attr->ah_attr.grh.traffic_class = MLX5_GET(dctc, dctc, tclass); + qp_attr->ah_attr.grh.flow_label = MLX5_GET(dctc, dctc, flow_label); + qp_attr->ah_attr.grh.sgid_index = MLX5_GET(dctc, dctc, my_addr_index); + qp_attr->ah_attr.grh.hop_limit = MLX5_GET(dctc, dctc, hop_limit); + } + if (qp_attr_mask & IB_QP_PATH_MTU) + qp_attr->path_mtu = MLX5_GET(dctc, dctc, mtu); + if (qp_attr_mask & IB_QP_PKEY_INDEX) + qp_attr->pkey_index = MLX5_GET(dctc, dctc, pkey_index); +out: + kfree(out); + return err; +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int err = 0; + u8 raw_packet_qp_state; + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (qp->type == IB_QPT_GSI) + return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, + qp_init_attr); + + /* Not all of output fields are applicable, make sure to zero them */ + memset(qp_init_attr, 0, sizeof(*qp_init_attr)); + memset(qp_attr, 0, sizeof(*qp_attr)); + + if (unlikely(qp->type == MLX5_IB_QPT_DCT)) + return mlx5_ib_dct_query_qp(dev, qp, qp_attr, + qp_attr_mask, qp_init_attr); + + mutex_lock(&qp->mutex); + + if (qp->type == IB_QPT_RAW_PACKET || + qp->flags & IB_QP_CREATE_SOURCE_QPN) { + err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); + if (err) + goto out; + qp->state = raw_packet_qp_state; + qp_attr->port_num = 1; + } else { + err = query_qp_attr(dev, qp, qp_attr); + if (err) + goto out; + } + + qp_attr->qp_state = qp->state; + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.max_post; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_init_attr->qp_context = ibqp->qp_context; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + qp_init_attr->qp_type = qp->type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->srq = ibqp->srq; + qp_attr->cap.max_inline_data = qp->max_inline_data; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = qp->flags; + + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? 
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + +out: + mutex_unlock(&qp->mutex); + return err; +} + +int mlx5_ib_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibxrcd->device); + struct mlx5_ib_xrcd *xrcd = to_mxrcd(ibxrcd); + + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + return -EOPNOTSUPP; + + return mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, 0); +} + +int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(xrcd->device); + u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + + return mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, 0); +} + +static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) +{ + struct mlx5_ib_rwq *rwq = to_mibrwq(core_qp); + struct mlx5_ib_dev *dev = to_mdev(rwq->ibwq.device); + struct ib_event event; + + if (rwq->ibwq.event_handler) { + event.device = rwq->ibwq.device; + event.element.wq = &rwq->ibwq; + switch (type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_WQ_FATAL; + break; + default: + mlx5_ib_warn(dev, "Unexpected event type %d on WQ %06x\n", type, core_qp->qpn); + return; + } + + rwq->ibwq.event_handler(&event, rwq->ibwq.wq_context); + } +} + +static int set_delay_drop(struct mlx5_ib_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->delay_drop.lock); + if (dev->delay_drop.activate) + goto out; + + err = mlx5_core_set_delay_drop(dev, dev->delay_drop.timeout); + if (err) + goto out; + + dev->delay_drop.activate = true; +out: + mutex_unlock(&dev->delay_drop.lock); + + if (!err) + atomic_inc(&dev->delay_drop.rqs_cnt); + return err; +} + +static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, + struct ib_wq_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev; + int has_net_offloads; + __be64 *rq_pas0; + int ts_format; + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + dev = to_mdev(pd->device); + + ts_format = get_rq_ts_format(dev, to_mcq(init_attr->cq)); + if (ts_format < 0) + return ts_format; + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid); + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, mem_rq_type, + MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, ts_format, ts_format); + MLX5_SET(rqc, rqc, user_index, rwq->user_index); + MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, + rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ ? 
+ MLX5_WQ_TYPE_CYCLIC_STRIDING_RQ : MLX5_WQ_TYPE_CYCLIC); + if (init_attr->create_flags & IB_WQ_FLAGS_PCI_WRITE_END_PADDING) { + if (!MLX5_CAP_GEN(dev->mdev, end_pad)) { + mlx5_ib_dbg(dev, "Scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } else { + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + } + } + MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); + if (rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ) { + /* + * In Firmware number of strides in each WQE is: + * "512 * 2^single_wqe_log_num_of_strides" + * Values 3 to 8 are accepted as 10 to 15, 9 to 18 are + * accepted as 0 to 9 + */ + static const u8 fw_map[] = { 10, 11, 12, 13, 14, 15, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9 }; + MLX5_SET(wq, wq, two_byte_shift_en, rwq->two_byte_shift_en); + MLX5_SET(wq, wq, log_wqe_stride_size, + rwq->single_stride_log_num_of_bytes - + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES); + MLX5_SET(wq, wq, log_wqe_num_of_strides, + fw_map[rwq->log_num_strides - + MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES]); + } + MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); + MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); + MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); + MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); + MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); + MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); + has_net_offloads = MLX5_CAP_GEN(dev->mdev, eth_net_offloads); + if (init_attr->create_flags & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + } else { + MLX5_SET(rqc, rqc, vsd, 1); + } + if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, scatter_fcs))) { + mlx5_ib_dbg(dev, "Scatter FCS is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, scatter_fcs, 1); + } + if (init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + if (!(dev->ib_dev.attrs.raw_packet_caps & + IB_RAW_PACKET_CAP_DELAY_DROP)) { + mlx5_ib_dbg(dev, "Delay drop is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, delay_drop_en, 1); + } + rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(rwq->umem, 1UL << rwq->page_shift, rq_pas0, 0); + err = mlx5_core_create_rq_tracked(dev, in, inlen, &rwq->core_qp); + if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + err = set_delay_drop(dev); + if (err) { + mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", + err); + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); + } else { + rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; + } + } +out: + kvfree(in); + return err; +} + +static int set_user_rq_size(struct mlx5_ib_dev *dev, + struct ib_wq_init_attr *wq_init_attr, + struct mlx5_ib_create_wq *ucmd, + struct mlx5_ib_rwq *rwq) +{ + /* Sanity check RQ size before proceeding */ + if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz))) + return -EINVAL; + + if (!ucmd->rq_wqe_count) + return -EINVAL; + + rwq->wqe_count = ucmd->rq_wqe_count; + rwq->wqe_shift = ucmd->rq_wqe_shift; + if (check_shl_overflow(rwq->wqe_count, rwq->wqe_shift, &rwq->buf_size)) + return -EINVAL; + + rwq->log_rq_stride = rwq->wqe_shift; + rwq->log_rq_size = ilog2(rwq->wqe_count); + return 0; +} + +static bool log_of_strides_valid(struct mlx5_ib_dev *dev, u32 log_num_strides) +{ + if ((log_num_strides > MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) || + (log_num_strides < MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES)) 
+ return false; + + if (!MLX5_CAP_GEN(dev->mdev, ext_stride_num_range) && + (log_num_strides < MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES)) + return false; + + return true; +} + +static int prepare_user_rq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_wq ucmd = {}; + int err; + size_t required_cmd_sz; + + required_cmd_sz = offsetofend(struct mlx5_ib_create_wq, + single_stride_log_num_of_bytes); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask & (~MLX5_IB_CREATE_WQ_STRIDING_RQ)) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } else if (ucmd.comp_mask & MLX5_IB_CREATE_WQ_STRIDING_RQ) { + if (!MLX5_CAP_GEN(dev->mdev, striding_rq)) { + mlx5_ib_dbg(dev, "Striding RQ is not supported\n"); + return -EOPNOTSUPP; + } + if ((ucmd.single_stride_log_num_of_bytes < + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES) || + (ucmd.single_stride_log_num_of_bytes > + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES)) { + mlx5_ib_dbg(dev, "Invalid log stride size (%u. Range is %u - %u)\n", + ucmd.single_stride_log_num_of_bytes, + MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES, + MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES); + return -EINVAL; + } + if (!log_of_strides_valid(dev, + ucmd.single_wqe_log_num_of_strides)) { + mlx5_ib_dbg( + dev, + "Invalid log num strides (%u. Range is %u - %u)\n", + ucmd.single_wqe_log_num_of_strides, + MLX5_CAP_GEN(dev->mdev, ext_stride_num_range) ? 
+ MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES : + MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES, + MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES); + return -EINVAL; + } + rwq->single_stride_log_num_of_bytes = + ucmd.single_stride_log_num_of_bytes; + rwq->log_num_strides = ucmd.single_wqe_log_num_of_strides; + rwq->two_byte_shift_en = !!ucmd.two_byte_shift_en; + rwq->create_flags |= MLX5_IB_WQ_FLAGS_STRIDING_RQ; + } + + err = set_user_rq_size(dev, init_attr, &ucmd, rwq); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + err = create_user_rq(dev, pd, udata, rwq, &ucmd); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + rwq->user_index = ucmd.user_index; + return 0; +} + +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_rwq *rwq; + struct mlx5_ib_create_wq_resp resp = {}; + size_t min_resp_len; + int err; + + if (!udata) + return ERR_PTR(-ENOSYS); + + min_resp_len = offsetofend(struct mlx5_ib_create_wq_resp, reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + if (!capable(CAP_SYS_RAWIO) && + init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) + return ERR_PTR(-EPERM); + + dev = to_mdev(pd->device); + switch (init_attr->wq_type) { + case IB_WQT_RQ: + rwq = kzalloc(sizeof(*rwq), GFP_KERNEL); + if (!rwq) + return ERR_PTR(-ENOMEM); + err = prepare_user_rq(pd, init_attr, udata, rwq); + if (err) + goto err; + err = create_rq(rwq, pd, init_attr); + if (err) + goto err_user_rq; + break; + default: + mlx5_ib_dbg(dev, "unsupported wq type %d\n", + init_attr->wq_type); + return ERR_PTR(-EINVAL); + } + + rwq->ibwq.wq_num = rwq->core_qp.qpn; + rwq->ibwq.state = IB_WQS_RESET; + if (udata->outlen) { + resp.response_length = offsetofend( + struct mlx5_ib_create_wq_resp, response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + rwq->core_qp.event = mlx5_ib_wq_event; + rwq->ibwq.event_handler = init_attr->event_handler; + return &rwq->ibwq; + +err_copy: + mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); +err_user_rq: + destroy_user_rq(dev, pd, rwq, udata); +err: + kfree(rwq); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + int ret; + + ret = mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp); + if (ret) + return ret; + destroy_user_rq(dev, wq->pd, rwq, udata); + kfree(rwq); + return 0; +} + +int mlx5_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = + to_mrwq_ind_table(ib_rwq_ind_table); + struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_table->device); + int sz = 1 << init_attr->log_ind_tbl_size; + struct mlx5_ib_create_rwq_ind_tbl_resp resp = {}; + size_t min_resp_len; + int inlen; + int err; + int i; + u32 *in; + void *rqtc; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return -EOPNOTSUPP; + + if (init_attr->log_ind_tbl_size > + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) { + mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n", + init_attr->log_ind_tbl_size, + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)); + return -EINVAL; + } + + min_resp_len = + offsetofend(struct mlx5_ib_create_rwq_ind_tbl_resp, reserved); + if (udata->outlen && udata->outlen < 
min_resp_len) + return -EINVAL; + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); + + rwq_ind_tbl->uid = to_mpd(init_attr->ind_tbl[0]->pd)->uid; + MLX5_SET(create_rqt_in, in, uid, rwq_ind_tbl->uid); + + err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); + kvfree(in); + if (err) + return err; + + rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn; + if (udata->outlen) { + resp.response_length = + offsetofend(struct mlx5_ib_create_rwq_ind_tbl_resp, + response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + return 0; + +err_copy: + mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid); + return err; +} + +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); + struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device); + + return mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid); +} + +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + struct mlx5_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + int curr_wq_state; + int wq_state; + int inlen; + int err; + void *rqc; + void *in; + + required_cmd_sz = offsetofend(struct mlx5_ib_modify_wq, reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + curr_wq_state = wq_attr->curr_wq_state; + wq_state = wq_attr->wq_state; + if (curr_wq_state == IB_WQS_ERR) + curr_wq_state = MLX5_RQC_STATE_ERR; + if (wq_state == IB_WQS_ERR) + wq_state = MLX5_RQC_STATE_ERR; + MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(modify_rq_in, in, uid, to_mpd(wq->pd)->uid); + MLX5_SET(rqc, rqc, state, wq_state); + + if (wq_attr_mask & IB_WQ_FLAGS) { + if (wq_attr->flags_mask & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not " + "supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD); + MLX5_SET(rqc, rqc, vsd, + (wq_attr->flags & IB_WQ_FLAGS_CVLAN_STRIPPING) ? 
0 : 1); + } + + if (wq_attr->flags_mask & IB_WQ_FLAGS_PCI_WRITE_END_PADDING) { + mlx5_ib_dbg(dev, "Modifying scatter end padding is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + } + + if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) { + u16 set_id; + + set_id = mlx5_ib_get_counters_id(dev, 0); + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, set_id); + } else + dev_info_once( + &dev->ib_dev.dev, + "Receive WQ counters are not supported on current FW\n"); + } + + err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in); + if (!err) + rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; + +out: + kvfree(in); + return err; +} + +struct mlx5_ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void mlx5_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_drain_cqe *cqe = container_of(wc->wr_cqe, + struct mlx5_ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* This function returns only once the drained WR was completed */ +static void handle_drain_completion(struct ib_cq *cq, + struct mlx5_ib_drain_cqe *sdrain, + struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + + if (cq->poll_ctx == IB_POLL_DIRECT) { + while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + return; + } + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + struct mlx5_ib_cq *mcq = to_mcq(cq); + bool triggered = false; + unsigned long flags; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + /* Make sure that the CQ handler won't run if wasn't run yet */ + if (!mcq->mcq.reset_notify_added) + mcq->mcq.reset_notify_added = 1; + else + triggered = true; + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (triggered) { + /* Wait for any scheduled/running task to be ended */ + switch (cq->poll_ctx) { + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + irq_poll_enable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + cancel_work_sync(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + } + + /* Run the CQ handler - this makes sure that the drain WR will + * be processed if wasn't processed yet. 
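+	 * The surrounding drain flow (mlx5_ib_drain_sq()/mlx5_ib_drain_rq()
+	 * below) first moves the QP to the error state and then posts a
+	 * marker WR whose wr_cqe points at the on-stack drain cqe, so once
+	 * that cqe's done() callback completes sdrain->done, every WR posted
+	 * before it has been flushed through this CQ.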
+ */ + mcq->mcq.comp(&mcq->mcq, NULL); + } + + wait_for_completion(&sdrain->done); +} + +void mlx5_ib_drain_sq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->send_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx5_ib_drain_cqe sdrain; + const struct ib_send_wr *bad_swr; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_core_dev *mdev = dev->mdev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + sdrain.cqe.done = mlx5_ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = mlx5_ib_post_send_drain(qp, &swr.wr, &bad_swr); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &sdrain, dev); +} + +void mlx5_ib_drain_rq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->recv_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx5_ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}; + const struct ib_recv_wr *bad_rwr; + int ret; + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_core_dev *mdev = dev->mdev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = mlx5_ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = mlx5_ib_post_recv_drain(qp, &rwr, &bad_rwr); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &rdrain, dev); +} + +/* + * Bind a qp to a counter. If @counter is NULL then bind the qp to + * the default counter + */ +int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + int err = 0; + + mutex_lock(&mqp->mutex); + if (mqp->state == IB_QPS_RESET) { + qp->counter = counter; + goto out; + } + + if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id)) { + err = -EOPNOTSUPP; + goto out; + } + + if (mqp->state == IB_QPS_RTS) { + err = __mlx5_ib_qp_set_counter(qp, counter); + if (!err) + qp->counter = counter; + + goto out; + } + + mqp->counter_pending = 1; + qp->counter = counter; + +out: + mutex_unlock(&mqp->mutex); + return err; +} + +void mlx5_ib_set_mlx_seg(struct mlx5_mlx_seg *seg, struct mlx5_mlx_wr *wr) +{ + memset(seg, 0, sizeof(*seg)); + seg->stat_rate_sl = wr->sl & 0xf; + seg->dlid = cpu_to_be16(wr->dlid); + seg->flags = wr->icrc ? 8 : 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.h new file mode 100644 index 0000000..5d4e140 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. 
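+ *
+ * Usage sketch (illustrative only; `in' is assumed to be a prepared
+ * create_rq_in mailbox and my_rq_event_handler a hypothetical callback):
+ * the *_tracked helpers declared below pair a firmware object with an
+ * entry in the driver's QP table so that async events reach it, e.g.:
+ *
+ *	struct mlx5_core_qp rq = {};
+ *
+ *	rq.event = my_rq_event_handler;
+ *	err = mlx5_core_create_rq_tracked(dev, in, inlen, &rq);
+ *	...
+ *	err = mlx5_core_destroy_rq_tracked(dev, &rq);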
+ */ + +#ifndef _MLX5_IB_QP_H +#define _MLX5_IB_QP_H + +#include "mlx5_ib.h" + +int mlx5_init_qp_table(struct mlx5_ib_dev *dev); +void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); + +int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp, + u32 *in, int inlen, u32 *out, int outlen); +int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen, u32 *out); +int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, + void *qpc, struct mlx5_core_qp *qp, u32 *ece); +int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); +int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct); +int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen); +int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen); + +int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, u32 timeout_usec); + +int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *rq); +int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq); +void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *sq); + +int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq); + +struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev, + int res_num, + enum mlx5_res_type res_type); +void mlx5_core_res_put(struct mlx5_core_rsc_common *res); + +int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn); +int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); +int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); +#endif /* _MLX5_IB_QP_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp_nvmf.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp_nvmf.c new file mode 100644 index 0000000..62d5351 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qp_nvmf.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
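+ *
+ * Usage sketch (illustrative only; `in', `ibqp' and `nvmf_srqn' are assumed
+ * to be a prepared create_qp mailbox, the QP being set up, and the number
+ * of an NVMf-capable SRQ): the helpers in this file stamp NVMf offload
+ * state into the QP context before CREATE_QP is issued, e.g.:
+ *
+ *	void *qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ *
+ *	err = mlx5_ib_set_qp_srqn(qpc, ibqp, nvmf_srqn);
+ *	if (!err)
+ *		err = mlx5_ib_set_qp_offload_type(qpc, ibqp,
+ *						  IB_QP_OFFLOAD_NVMF);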
+ */ + +#include "mlx5_ib.h" +#include +#include + +int mlx5_ib_set_qp_offload_type(void *qpc, struct ib_qp *qp, + enum ib_qp_offload_type offload_type) +{ + switch (offload_type) { + case IB_QP_OFFLOAD_NVMF: + if (qp->srq && + qp->srq->srq_type == IB_EXP_SRQT_NVMF) { + MLX5_SET(qpc, qpc, offload_type, MLX5_QPC_OFFLOAD_TYPE_NVMF); + break; + } + fallthrough; + default: + return -EINVAL; + } + + return 0; +} + +int mlx5_ib_set_qp_srqn(void *qpc, struct ib_qp *qp, + u32 srqn) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_srq_table *table = &dev->srq_table; + struct mlx5_core_srq *msrq; + + if (to_mqp(qp)->rq_type != MLX5_SRQ_RQ) + return -EINVAL; + + xa_lock(&table->array); + msrq = xa_load(&table->array, srqn); + xa_unlock(&table->array); + if (!msrq) + return -EINVAL; + + qp->srq = &to_mibsrq(msrq)->ibsrq; + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, srqn); + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qpc.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qpc.c new file mode 100644 index 0000000..14984ba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/qpc.c @@ -0,0 +1,663 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include +#include "mlx5_ib.h" +#include "qp.h" + +static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct); + +static struct mlx5_core_rsc_common * +mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn) +{ + struct mlx5_core_rsc_common *common; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + + common = radix_tree_lookup(&table->tree, rsn); + if (common) + refcount_inc(&common->refcount); + + spin_unlock_irqrestore(&table->lock, flags); + + return common; +} + +void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) +{ + if (refcount_dec_and_test(&common->refcount)) + complete(&common->free); +} + +static u64 qp_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | + BIT(MLX5_EVENT_TYPE_COMM_EST) | + BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | + BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | + BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | + BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | + BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR) | + BIT(MLX5_EVENT_TYPE_XRQ_ERROR); + + return mask; +} + +static u64 rq_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); + + return mask; +} + +static u64 sq_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); +} + +static u64 dct_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); +} + +static bool is_event_type_allowed(int rsc_type, int event_type) +{ + switch (rsc_type) { + case MLX5_EVENT_QUEUE_TYPE_QP: + return BIT(event_type) & qp_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_RQ: + return BIT(event_type) & rq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_SQ: + return BIT(event_type) & sq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_DCT: + return BIT(event_type) & dct_allowed_event_types(); + default: + WARN(1, "Event arrived for unknown resource type"); + return false; + } +} + +static int rsc_event_notifier(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_core_rsc_common *common; + struct mlx5_qp_table *table; + struct mlx5_core_dct *dct; + u8 event_type = (u8)type; + struct mlx5_core_qp *qp; + struct 
mlx5_eqe *eqe; + u32 rsn; + + switch (event_type) { + case MLX5_EVENT_TYPE_DCT_DRAINED: + eqe = data; + rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); + break; + case MLX5_EVENT_TYPE_PATH_MIG: + case MLX5_EVENT_TYPE_COMM_EST: + case MLX5_EVENT_TYPE_SQ_DRAINED: + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + eqe = data; + rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); + break; + case MLX5_EVENT_TYPE_XRQ_ERROR: + { + u8 error_type; + + eqe = data; + error_type = be32_to_cpu(eqe->data.xrq.type_xrqn) >> 24; + if (error_type != MLX5_XRQ_ERROR_TYPE_QP_ERROR) + return NOTIFY_DONE; + rsn = be32_to_cpu(eqe->data.xrq.qpn_id_handle) & + MLX5_24BIT_MASK; + rsn |= (MLX5_EVENT_QUEUE_TYPE_QP << MLX5_USER_INDEX_LEN); + } + break; + default: + return NOTIFY_DONE; + } + + table = container_of(nb, struct mlx5_qp_table, nb); + common = mlx5_get_rsc(table, rsn); + if (!common) + return NOTIFY_OK; + + if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) + goto out; + + switch (common->res) { + case MLX5_RES_QP: + case MLX5_RES_RQ: + case MLX5_RES_SQ: + qp = (struct mlx5_core_qp *)common; + qp->event(qp, event_type); + break; + case MLX5_RES_DCT: + dct = (struct mlx5_core_dct *)common; + if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) + complete(&dct->drained); + break; + default: + break; + } +out: + mlx5_core_put_rsc(common); + + return NOTIFY_OK; +} + +static int create_resource_common(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp, int rsc_type) +{ + struct mlx5_qp_table *table = &dev->qp_table; + int err; + + qp->common.res = rsc_type; + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, + qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), + qp); + spin_unlock_irq(&table->lock); + if (err) + return err; + + refcount_set(&qp->common.refcount, 1); + init_completion(&qp->common.free); + qp->pid = current->pid; + + return 0; +} + +static void destroy_resource_common(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *qp) +{ + struct mlx5_qp_table *table = &dev->qp_table; + unsigned long flags; + + if (refcount_read(&qp->common.refcount) == 0) + return; + + spin_lock_irqsave(&table->lock, flags); + radix_tree_delete(&table->tree, + qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); + spin_unlock_irqrestore(&table->lock, flags); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); +} + +static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct, bool need_cleanup) +{ + u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto destroy; + + return err; + } + wait_for_completion(&dct->drained); +destroy: + if (need_cleanup) + destroy_resource_common(dev, &dct->mqp); + MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); + MLX5_SET(destroy_dct_in, in, uid, qp->uid); + err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in); + return err; +} + +int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *in, int inlen, u32 *out, int outlen) +{ + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + 
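+	/*
+	 * DCT lifecycle: the "drained" completion initialised below is
+	 * signalled by rsc_event_notifier() when MLX5_EVENT_TYPE_DCT_DRAINED
+	 * arrives; _mlx5_core_destroy_dct() waits on it after DRAIN_DCT and
+	 * before issuing DESTROY_DCT.
+	 */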
init_completion(&dct->drained); + MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); + + err = mlx5_cmd_do(dev->mdev, in, inlen, out, outlen); + if (err) + return err; + + qp->qpn = MLX5_GET(create_dct_out, out, dctn); + qp->uid = MLX5_GET(create_dct_in, in, uid); + err = create_resource_common(dev, qp, MLX5_RES_DCT); + if (err) + goto err_cmd; + + return 0; +err_cmd: + _mlx5_core_destroy_dct(dev, dct, false); + return err; +} + +int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen, u32 *out) +{ + u32 din[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + int err; + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, + MLX5_ST_SZ_BYTES(create_qp_out)); + if (err) + return err; + + qp->uid = MLX5_GET(create_qp_in, in, uid); + qp->qpn = MLX5_GET(create_qp_out, out, qpn); + + err = create_resource_common(dev, qp, MLX5_RES_QP); + if (err) + goto err_cmd; + + mlx5_debug_qp_add(dev->mdev, qp); + + return 0; + +err_cmd: + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, din, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, din, uid, qp->uid); + mlx5_cmd_exec_in(dev->mdev, destroy_qp, din); + return err; +} + +static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); + MLX5_SET(drain_dct_in, in, dctn, qp->qpn); + MLX5_SET(drain_dct_in, in, uid, qp->uid); + return mlx5_cmd_exec_in(dev->mdev, drain_dct, in); +} + +int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, + struct mlx5_core_dct *dct) +{ + return _mlx5_core_destroy_dct(dev, dct, true); +} + +int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + mlx5_debug_qp_remove(dev->mdev, qp); + + destroy_resource_common(dev, qp); + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, in, uid, qp->uid); + return mlx5_cmd_exec_in(dev->mdev, destroy_qp, in); +} + +int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, + u32 timeout_usec) +{ + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {}; + + MLX5_SET(set_delay_drop_params_in, in, opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec_in(dev->mdev, set_delay_drop_params, in); +} + +struct mbox_info { + u32 *in; + u32 *out; + int inlen; + int outlen; +}; + +static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen) +{ + mbox->inlen = inlen; + mbox->outlen = outlen; + mbox->in = kzalloc(mbox->inlen, GFP_KERNEL); + mbox->out = kzalloc(mbox->outlen, GFP_KERNEL); + if (!mbox->in || !mbox->out) { + kfree(mbox->in); + kfree(mbox->out); + return -ENOMEM; + } + + return 0; +} + +static void mbox_free(struct mbox_info *mbox) +{ + kfree(mbox->in); + kfree(mbox->out); +} + +static int get_ece_from_mbox(void *out, u16 opcode) +{ + int ece = 0; + + switch (opcode) { + case MLX5_CMD_OP_INIT2INIT_QP: + ece = MLX5_GET(init2init_qp_out, out, ece); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + ece = MLX5_GET(init2rtr_qp_out, out, ece); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + ece = MLX5_GET(rtr2rts_qp_out, out, ece); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + ece = MLX5_GET(rts2rts_qp_out, out, ece); + break; + case MLX5_CMD_OP_RST2INIT_QP: + ece = 
MLX5_GET(rst2init_qp_out, out, ece); + break; + default: + break; + } + + return ece; +} + +static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, + u32 opt_param_mask, void *qpc, + struct mbox_info *mbox, u16 uid, u32 ece) +{ + mbox->out = NULL; + mbox->in = NULL; + +#define MBOX_ALLOC(mbox, typ) \ + mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out)) + +#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid) \ + do { \ + MLX5_SET(typ##_in, in, opcode, _opcode); \ + MLX5_SET(typ##_in, in, qpn, _qpn); \ + MLX5_SET(typ##_in, in, uid, _uid); \ + } while (0) + +#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid) \ + do { \ + MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid); \ + MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \ + memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, \ + MLX5_ST_SZ_BYTES(qpc)); \ + } while (0) + + switch (opcode) { + /* 2RST & 2ERR */ + case MLX5_CMD_OP_2RST_QP: + if (MBOX_ALLOC(mbox, qp_2rst)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn, uid); + break; + case MLX5_CMD_OP_2ERR_QP: + if (MBOX_ALLOC(mbox, qp_2err)) + return -ENOMEM; + MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn, uid); + break; + + /* MODIFY with QPC */ + case MLX5_CMD_OP_RST2INIT_QP: + if (MBOX_ALLOC(mbox, rst2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + MLX5_SET(rst2init_qp_in, mbox->in, ece, ece); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + if (MBOX_ALLOC(mbox, init2rtr_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + MLX5_SET(init2rtr_qp_in, mbox->in, ece, ece); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + if (MBOX_ALLOC(mbox, rtr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + MLX5_SET(rtr2rts_qp_in, mbox->in, ece, ece); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + if (MBOX_ALLOC(mbox, rts2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + MLX5_SET(rts2rts_qp_in, mbox->in, ece, ece); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + if (MBOX_ALLOC(mbox, sqerr2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_SQD_RTS_QP: + if (MBOX_ALLOC(mbox, sqd2rts_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(sqd2rts_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + break; + case MLX5_CMD_OP_INIT2INIT_QP: + if (MBOX_ALLOC(mbox, init2init_qp)) + return -ENOMEM; + MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, + opt_param_mask, qpc, uid); + MLX5_SET(init2init_qp_in, mbox->in, ece, ece); + break; + default: + return -EINVAL; + } + return 0; +} + +int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, + void *qpc, struct mlx5_core_qp *qp, u32 *ece) +{ + struct mbox_info mbox; + int err; + + err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, opt_param_mask, + qpc, &mbox, qp->uid, (ece) ? 
*ece : 0); + if (err) + return err; + + err = mlx5_cmd_exec(dev->mdev, mbox.in, mbox.inlen, mbox.out, + mbox.outlen); + + if (ece) + *ece = get_ece_from_mbox(mbox.out, opcode); + + mbox_free(&mbox); + return err; +} + +int mlx5_init_qp_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_qp_table *table = &dev->qp_table; + + spin_lock_init(&table->lock); + INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + mlx5_qp_debugfs_init(dev->mdev); + + table->nb.notifier_call = rsc_event_notifier; + mlx5_notifier_register(dev->mdev, &table->nb); + + return 0; +} + +void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_qp_table *table = &dev->qp_table; + + mlx5_notifier_unregister(dev->mdev, &table->nb); + mlx5_qp_debugfs_cleanup(dev->mdev); +} + +int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {}; + + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); + MLX5_SET(query_qp_in, in, qpn, qp->qpn); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, outlen); +} + +int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); + MLX5_SET(query_dct_in, in, dctn, qp->qpn); + + return mlx5_cmd_exec(dev->mdev, (void *)&in, sizeof(in), (void *)out, + outlen); +} + +int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; + int err; + + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + err = mlx5_cmd_exec_inout(dev->mdev, alloc_xrcd, in, out); + if (!err) + *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); + return err; +} + +int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; + + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + return mlx5_cmd_exec_in(dev->mdev, dealloc_xrcd, in); +} + +static int destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; + + MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); + MLX5_SET(destroy_rq_in, in, rqn, rqn); + MLX5_SET(destroy_rq_in, in, uid, uid); + return mlx5_cmd_exec_in(dev->mdev, destroy_rq, in); +} + +int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq) +{ + int err; + u32 rqn; + + err = mlx5_core_create_rq(dev->mdev, in, inlen, &rqn); + if (err) + return err; + + rq->uid = MLX5_GET(create_rq_in, in, uid); + rq->qpn = rqn; + err = create_resource_common(dev, rq, MLX5_RES_RQ); + if (err) + goto err_destroy_rq; + + return 0; + +err_destroy_rq: + destroy_rq_tracked(dev, rq->qpn, rq->uid); + + return err; +} + +int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *rq) +{ + destroy_resource_common(dev, rq); + return destroy_rq_tracked(dev, rq->qpn, rq->uid); +} + +static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; + + MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); + MLX5_SET(destroy_sq_in, in, sqn, sqn); + MLX5_SET(destroy_sq_in, in, uid, uid); + mlx5_cmd_exec_in(dev->mdev, destroy_sq, in); +} + +int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq) +{ + u32 
out[MLX5_ST_SZ_DW(create_sq_out)] = {}; + int err; + + MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out)); + if (err) + return err; + + sq->qpn = MLX5_GET(create_sq_out, out, sqn); + sq->uid = MLX5_GET(create_sq_in, in, uid); + err = create_resource_common(dev, sq, MLX5_RES_SQ); + if (err) + goto err_destroy_sq; + + return 0; + +err_destroy_sq: + destroy_sq_tracked(dev, sq->qpn, sq->uid); + + return err; +} + +void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev, + struct mlx5_core_qp *sq) +{ + destroy_resource_common(dev, sq); + destroy_sq_tracked(dev, sq->qpn, sq->uid); +} + +struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev, + int res_num, + enum mlx5_res_type res_type) +{ + u32 rsn = res_num | (res_type << MLX5_USER_INDEX_LEN); + struct mlx5_qp_table *table = &dev->qp_table; + + return mlx5_get_rsc(table, rsn); +} + +void mlx5_core_res_put(struct mlx5_core_rsc_common *res) +{ + mlx5_core_put_rsc(res); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.c new file mode 100644 index 0000000..4ac429e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019-2020, Mellanox Technologies Ltd. All rights reserved. + */ + +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "restrack.h" + +#define MAX_DUMP_SIZE 1024 + +static int dump_rsc(struct mlx5_core_dev *dev, enum mlx5_sgmt_type type, + int index, void *data, int *data_len) +{ + struct mlx5_core_dev *mdev = dev; + struct mlx5_rsc_dump_cmd *cmd; + struct mlx5_rsc_key key = {}; + struct page *page; + int offset = 0; + int err = 0; + int cmd_err; + int size; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + key.size = PAGE_SIZE; + key.rsc = type; + key.index1 = index; + key.num_of_obj1 = 1; + + cmd = mlx5_rsc_dump_cmd_create(mdev, &key); + if (IS_ERR(cmd)) { + err = PTR_ERR(cmd); + goto free_page; + } + + do { + cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size); + if (cmd_err < 0 || size + offset > MAX_DUMP_SIZE) { + err = cmd_err; + goto destroy_cmd; + } + memcpy(data + offset, page_address(page), size); + offset += size; + } while (cmd_err > 0); + *data_len = offset; + +destroy_cmd: + mlx5_rsc_dump_cmd_destroy(cmd); +free_page: + __free_page(page); + return err; +} + +static int fill_res_raw(struct sk_buff *msg, struct mlx5_ib_dev *dev, + enum mlx5_sgmt_type type, u32 key) +{ + int len = 0; + void *data; + int err; + + data = kzalloc(MAX_DUMP_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = dump_rsc(dev->mdev, type, key, data, &len); + if (err) + goto out; + + err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data); +out: + kfree(data); + return err; +} + +static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct nlattr *table_attr; + + if (!(mr->access_flags & IB_ACCESS_ON_DEMAND)) + return 0; + + table_attr = nla_nest_start(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + + if (!table_attr) + goto err; + + if (rdma_nl_stat_hwcounter_entry(msg, "page_faults", + atomic64_read(&mr->odp_stats.faults))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry( + msg, "page_invalidations", + atomic64_read(&mr->odp_stats.invalidations))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch", + 
atomic64_read(&mr->odp_stats.prefetch))) + goto err_table; + + nla_nest_end(msg, table_attr); + return 0; + +err_table: + nla_nest_cancel(msg, table_attr); +err: + return -EMSGSIZE; +} + +static int fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ibmr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + + return fill_res_raw(msg, mr_to_mdev(mr), MLX5_SGMT_TYPE_PRM_QUERY_MKEY, + mlx5_mkey_to_idx(mr->mmkey.key)); +} + +static int fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct nlattr *table_attr; + + if (!(mr->access_flags & IB_ACCESS_ON_DEMAND)) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err; + + if (mr->is_odp_implicit) { + if (rdma_nl_put_driver_string(msg, "odp", "implicit")) + goto err; + } else { + if (rdma_nl_put_driver_string(msg, "odp", "explicit")) + goto err; + } + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ibcq) +{ + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + + return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_CQ, cq->mcq.cqn); +} + +static int fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + + return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_QP, + ibqp->qp_num); +} + +static const struct ib_device_ops restrack_ops = { + .fill_res_cq_entry_raw = fill_res_cq_entry_raw, + .fill_res_mr_entry = fill_res_mr_entry, + .fill_res_mr_entry_raw = fill_res_mr_entry_raw, + .fill_res_qp_entry_raw = fill_res_qp_entry_raw, + .fill_stat_mr_entry = fill_stat_mr_entry, +}; + +int mlx5_ib_restrack_init(struct mlx5_ib_dev *dev) +{ + ib_set_device_ops(&dev->ib_dev, &restrack_ops); + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.h new file mode 100644 index 0000000..e8d8127 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/restrack.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2020, Mellanox Technologies Ltd. All rights reserved. + */ + +#ifndef _MLX5_IB_RESTRACK_H +#define _MLX5_IB_RESTRACK_H + +#include "mlx5_ib.h" + +int mlx5_ib_restrack_init(struct mlx5_ib_dev *dev); + +#endif /* _MLX5_IB_RESTRACK_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.c new file mode 100644 index 0000000..95c8bfc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2018, Mellanox Technologies inc. All rights reserved. 
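+ *
+ * Sizing note (illustrative only): in mlx5_ib_create_srq() below, each WQE
+ * is the next-seg header plus one data segment per SGE, rounded up to a
+ * power of two of at least 32 bytes.  For example, with max_sge = 3:
+ *
+ *	desc_size = sizeof(struct mlx5_wqe_srq_next_seg)	// 16 bytes
+ *		  + 3 * sizeof(struct mlx5_wqe_data_seg);	// + 48 = 64
+ *	desc_size = roundup_pow_of_two(desc_size);		// 64
+ *	buf_size  = srq->msrq.max * desc_size;			// WQE count * 64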
+ */ + +#include +#include +#include +#include +#include +#include "mlx5_ib.h" +#include "srq.h" + +static void *get_wqe(struct mlx5_ib_srq *srq, int n) +{ + return mlx5_frag_buf_get_wqe(&srq->fbc, n); +} + +static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) +{ + struct ib_event event; + struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq; + + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + switch (type) { + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + break; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + event.event = IB_EVENT_SRQ_ERR; + break; + default: + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); + return; + } + + ibsrq->event_handler(&event, ibsrq->srq_context); + } +} + +static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct mlx5_srq_attr *in, + struct ib_udata *udata, int buf_size) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_srq ucmd = {}; + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + size_t ucmdlen; + int err; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + + ucmdlen = min(udata->inlen, sizeof(ucmd)); + + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { + mlx5_ib_dbg(dev, "failed copy udata\n"); + return -EFAULT; + } + + if (ucmd.reserved0 || ucmd.reserved1) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EINVAL; + + if (in->type != IB_SRQT_BASIC) { + err = get_srq_user_index(ucontext, &ucmd, udata->inlen, &uidx); + if (err) + return err; + } + + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); + + srq->umem = ib_umem_get_peer(pd->device, ucmd.buf_addr, buf_size, 0, 0); + if (IS_ERR(srq->umem)) { + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); + err = PTR_ERR(srq->umem); + return err; + } + in->umem = srq->umem; + + err = mlx5_ib_db_map_user(ucontext, ucmd.db_addr, &srq->db); + if (err) { + mlx5_ib_dbg(dev, "map doorbell failed\n"); + goto err_umem; + } + + in->uid = (in->type != IB_SRQT_XRC) ? 
to_mpd(pd)->uid : 0; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type != IB_SRQT_BASIC) + in->user_index = uidx; + + return 0; + +err_umem: + ib_umem_release(srq->umem); + + return err; +} + +static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, + struct mlx5_srq_attr *in, int buf_size) +{ + int err; + int i; + struct mlx5_wqe_srq_next_seg *next; + + err = mlx5_db_alloc(dev->mdev, &srq->db); + if (err) { + mlx5_ib_warn(dev, "alloc dbell rec failed\n"); + return err; + } + + if (mlx5_frag_buf_alloc_node(dev->mdev, buf_size, &srq->buf, + dev->mdev->priv.numa_node)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); + err = -ENOMEM; + goto err_db; + } + + mlx5_init_fbc(srq->buf.frags, srq->msrq.wqe_shift, ilog2(srq->msrq.max), + &srq->fbc); + + srq->head = 0; + srq->tail = srq->msrq.max - 1; + srq->wqe_ctr = 0; + + for (i = 0; i < srq->msrq.max; i++) { + next = get_wqe(srq, i); + next->next_wqe_index = + cpu_to_be16((i + 1) & (srq->msrq.max - 1)); + } + + mlx5_ib_dbg(dev, "srq->buf.page_shift = %d\n", srq->buf.page_shift); + in->pas = kvcalloc(srq->buf.npages, sizeof(*in->pas), GFP_KERNEL); + if (!in->pas) { + err = -ENOMEM; + goto err_buf; + } + mlx5_fill_page_frag_array(&srq->buf, in->pas); + + srq->wrid = kvmalloc_array(srq->msrq.max, sizeof(u64), GFP_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_in; + } + srq->wq_sig = 0; + + in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + in->type != IB_SRQT_BASIC) + in->user_index = MLX5_IB_DEFAULT_UIDX; + + return 0; + +err_in: + kvfree(in->pas); + +err_buf: + mlx5_frag_buf_free(dev->mdev, &srq->buf); + +err_db: + mlx5_db_free(dev->mdev, &srq->db); + return err; +} + +static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, + struct ib_udata *udata) +{ + mlx5_ib_db_unmap_user( + rdma_udata_to_drv_context( + udata, + struct mlx5_ib_ucontext, + ibucontext), + &srq->db); + ib_umem_release(srq->umem); +} + + +static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq) +{ + kvfree(srq->wrid); + mlx5_frag_buf_free(dev->mdev, &srq->buf); + mlx5_db_free(dev->mdev, &srq->db); +} + +int mlx5_ib_create_srq(struct ib_srq *ib_srq, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ib_srq->device); + struct mlx5_ib_srq *srq = to_msrq(ib_srq); + size_t desc_size; + size_t buf_size; + int err; + struct mlx5_srq_attr in = {}; + __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + + if (init_attr->srq_type != IB_SRQT_BASIC && + init_attr->srq_type != IB_SRQT_XRC && + init_attr->srq_type != IB_SRQT_TM && + init_attr->srq_type != IB_EXP_SRQT_NVMF) + return -EOPNOTSUPP; + + /* Sanity check SRQ size before proceeding */ + if (init_attr->attr.max_wr >= max_srq_wqes) { + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + max_srq_wqes); + return -EINVAL; + } + + mutex_init(&srq->mutex); + spin_lock_init(&srq->lock); + srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1); + srq->msrq.max_gs = init_attr->attr.max_sge; + + desc_size = sizeof(struct mlx5_wqe_srq_next_seg) + + srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg); + if (desc_size == 0 || srq->msrq.max_gs > desc_size) + return -EINVAL; + + desc_size = roundup_pow_of_two(desc_size); + desc_size = max_t(size_t, 32, desc_size); + if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) + return -EINVAL; + + srq->msrq.max_avail_gather = (desc_size - 
sizeof(struct mlx5_wqe_srq_next_seg)) / + sizeof(struct mlx5_wqe_data_seg); + srq->msrq.wqe_shift = ilog2(desc_size); + buf_size = srq->msrq.max * desc_size; + if (buf_size < desc_size) + return -EINVAL; + + in.type = init_attr->srq_type; + + if (udata) + err = create_srq_user(ib_srq->pd, srq, &in, udata, buf_size); + else + err = create_srq_kernel(dev, srq, &in, buf_size); + + if (err) { + mlx5_ib_warn(dev, "create srq %s failed, err %d\n", + udata ? "user" : "kernel", err); + return err; + } + + in.log_size = ilog2(srq->msrq.max); + in.wqe_shift = srq->msrq.wqe_shift - 4; + if (srq->wq_sig) + in.flags |= MLX5_SRQ_FLAG_WQ_SIG; + + if (init_attr->srq_type == IB_SRQT_XRC && init_attr->ext.xrc.xrcd) + in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; + else + in.xrcd = dev->devr.xrcdn0; + + if (init_attr->srq_type == IB_EXP_SRQT_NVMF) { + err = mlx5_ib_exp_set_nvmf_srq_attrs(&in.nvmf, init_attr); + if (err) { + mlx5_ib_warn(dev, "setting nvmf srq attrs failed, err %d\n", err); + goto err_usr_kern_srq; + } + } + + if (init_attr->srq_type == IB_SRQT_TM) { + in.tm_log_list_size = + ilog2(init_attr->ext.tag_matching.max_num_tags) + 1; + if (in.tm_log_list_size > + MLX5_CAP_GEN(dev->mdev, log_tag_matching_list_sz)) { + mlx5_ib_dbg(dev, "TM SRQ max_num_tags exceeding limit\n"); + err = -EINVAL; + goto err_usr_kern_srq; + } + in.flags |= MLX5_SRQ_FLAG_RNDV; + } + + if (ib_srq_has_cq(init_attr->srq_type)) + in.cqn = to_mcq(init_attr->ext.cq)->mcq.cqn; + else + in.cqn = to_mcq(dev->devr.c0)->mcq.cqn; + + in.pd = to_mpd(ib_srq->pd)->pdn; + in.db_record = srq->db.dma; + err = mlx5_cmd_create_srq(dev, &srq->msrq, &in); + kvfree(in.pas); + if (err) { + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); + goto err_usr_kern_srq; + } + + mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn); + + srq->msrq.event = mlx5_ib_srq_event; + srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; + + if (udata) { + struct mlx5_ib_create_srq_resp resp = { + .srqn = srq->msrq.srqn, + }; + + if (ib_copy_to_udata(udata, &resp, min(udata->outlen, + sizeof(resp)))) { + mlx5_ib_dbg(dev, "copy to user failed\n"); + err = -EFAULT; + goto err_core; + } + } + + init_attr->attr.max_wr = srq->msrq.max - 1; + + return 0; + +err_core: + mlx5_cmd_destroy_srq(dev, &srq->msrq); + +err_usr_kern_srq: + if (udata) + destroy_srq_user(ib_srq->pd, srq, udata); + else + destroy_srq_kernel(dev, srq); + + return err; +} + +int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + + /* We don't support resizing SRQs yet */ + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->msrq.max) + return -EINVAL; + + mutex_lock(&srq->mutex); + ret = mlx5_cmd_arm_srq(dev, &srq->msrq, attr->srq_limit, 1); + mutex_unlock(&srq->mutex); + + if (ret) + return ret; + } + + return 0; +} + +int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + int ret; + struct mlx5_srq_attr *out; + + out = kzalloc(sizeof(*out), GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_cmd_query_srq(dev, &srq->msrq, out); + if (ret) + goto out_box; + + srq_attr->srq_limit = out->lwm; + srq_attr->max_wr = srq->msrq.max - 1; + srq_attr->max_sge = srq->msrq.max_gs; + if (ibsrq->srq_type == IB_EXP_SRQT_NVMF) + 
srq_attr->nvmf.cmd_unknown_namespace_cnt = + out->nvmf.cmd_unknown_namespace_cnt; + +out_box: + kfree(out); + return ret; +} + +int mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(srq->device); + struct mlx5_ib_srq *msrq = to_msrq(srq); + int ret; + + ret = mlx5_cmd_destroy_srq(dev, &msrq->msrq); + if (ret) + return ret; + + if (udata) + destroy_srq_user(srq->pd, msrq, udata); + else + destroy_srq_kernel(dev, msrq); + return 0; +} + +void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) +{ + struct mlx5_wqe_srq_next_seg *next; + + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = cpu_to_be16(wqe_index); + srq->tail = wqe_index; + + spin_unlock(&srq->lock); +} + +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_srq *srq = to_msrq(ibsrq); + struct mlx5_wqe_srq_next_seg *next; + struct mlx5_wqe_data_seg *scat; + struct mlx5_ib_dev *dev = to_mdev(ibsrq->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + goto out; + } + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = be16_to_cpu(next->next_wqe_index); + scat = (struct mlx5_wqe_data_seg *)(next + 1); + + for (i = 0; i < wr->num_sge; i++) { + scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length); + scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey); + scat[i].addr = cpu_to_be64(wr->sg_list[i].addr); + } + + if (i < srq->msrq.max_avail_gather) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (likely(nreq)) { + srq->wqe_ctr += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *srq->db.db = cpu_to_be32(srq->wqe_ctr); + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.h new file mode 100644 index 0000000..1295a7b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2013-2018, Mellanox Technologies. All rights reserved. 
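+ *
+ * Usage sketch (illustrative only; only a subset of mlx5_srq_attr is shown,
+ * the remaining fields are filled in by mlx5_ib_create_srq()): the command
+ * wrappers declared below operate on a caller-initialised attribute block:
+ *
+ *	struct mlx5_srq_attr in = {
+ *		.type      = IB_SRQT_BASIC,
+ *		.log_size  = ilog2(srq->msrq.max),
+ *		.wqe_shift = srq->msrq.wqe_shift - 4,
+ *		.pd        = to_mpd(pd)->pdn,
+ *		.db_record = srq->db.dma,
+ *	};
+ *
+ *	err = mlx5_cmd_create_srq(dev, &srq->msrq, &in);
+ *	if (!err)
+ *		err = mlx5_cmd_arm_srq(dev, &srq->msrq, lwm, 1);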
+ */ + +#ifndef MLX5_IB_SRQ_H +#define MLX5_IB_SRQ_H + +enum { + MLX5_SRQ_FLAG_ERR = (1 << 0), + MLX5_SRQ_FLAG_WQ_SIG = (1 << 1), + MLX5_SRQ_FLAG_RNDV = (1 << 2), +}; + +enum mlx5_nvmf_offload_type { + MLX5_NVMF_WRITE_OFFLOAD = 1, + MLX5_NVMF_READ_OFFLOAD = 2, + MLX5_NVMF_READ_WRITE_OFFLOAD = 3, + MLX5_NVMF_READ_WRITE_FLUSH_OFFLOAD = 4, +}; + +struct mlx5_nvmf_attr { + enum mlx5_nvmf_offload_type type; + u8 passthrough_sqe_rw_service_en; + u8 log_max_namespace; + u32 cmd_unknown_namespace_cnt; + u32 ioccsz; + u8 icdoff; + u8 log_max_io_size; + u8 nvme_memory_log_page_size; + u8 staging_buffer_log_page_size; + u16 staging_buffer_number_of_pages; + u8 staging_buffer_page_offset; + u32 nvme_queue_size; + u64 *staging_buffer_pas; +}; + +struct mlx5_srq_attr { + u32 type; + u32 flags; + u32 log_size; + u32 wqe_shift; + u32 log_page_size; + u32 wqe_cnt; + u32 srqn; + u32 xrcd; + u32 page_offset; + u32 cqn; + u32 pd; + u32 lwm; + u32 user_index; + u64 db_record; + __be64 *pas; + struct ib_umem *umem; + u32 tm_log_list_size; + u32 tm_next_tag; + u32 tm_hw_phase_cnt; + u32 tm_sw_phase_cnt; + u16 uid; + struct mlx5_nvmf_attr nvmf; +}; + +struct mlx5_ib_dev; + +struct mlx5_core_srq { + struct mlx5_core_rsc_common common; /* must be first */ + u32 srqn; + int max; + size_t max_gs; + size_t max_avail_gather; + int wqe_shift; + void (*event)(struct mlx5_core_srq *srq, enum mlx5_event e); + + /* protect ctrl list */ + spinlock_t lock; + struct list_head ctrl_list; + u16 uid; +}; + +struct mlx5_srq_table { + struct notifier_block nb; + struct xarray array; +}; + +int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in); +int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq); +int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out); +int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq); +struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn); + +int mlx5_init_srq_table(struct mlx5_ib_dev *dev); +void mlx5_cleanup_srq_table(struct mlx5_ib_dev *dev); +#endif /* MLX5_IB_SRQ_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_cmd.c new file mode 100644 index 0000000..10cde7e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -0,0 +1,833 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2018, Mellanox Technologies inc. All rights reserved. 
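+ *
+ * Worked example (illustrative only) for get_pas_size() below: with
+ * log_page_size = 0 (4KB pages), log_size = 5 (32 WQEs), wqe_shift = 2
+ * (64-byte strides) and page_offset = 0:
+ *
+ *	rq_sz      = 1 << (5 + 4 + 2);		// 2048 bytes of WQEs
+ *	rq_num_pas = DIV_ROUND_UP(2048, 4096);	// one 4KB page
+ *	pas_bytes  = 1 * sizeof(u64);		// 8 bytes of PAS entries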
+ */ + +#include +#include +#include "mlx5_ib.h" +#include "srq.h" +#include "qp.h" +#include "srq_nvmf.h" + +static int get_pas_size(struct mlx5_srq_attr *in) +{ + u32 log_page_size = in->log_page_size + 12; + u32 log_srq_size = in->log_size; + u32 log_rq_stride = in->wqe_shift; + u32 page_offset = in->page_offset; + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_srq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = DIV_ROUND_UP(rq_sz_po, page_size); + + return rq_num_pas * sizeof(u64); +} + +static void set_wq(void *wq, struct mlx5_srq_attr *in) +{ + MLX5_SET(wq, wq, wq_signature, !!(in->flags + & MLX5_SRQ_FLAG_WQ_SIG)); + MLX5_SET(wq, wq, log_wq_pg_sz, in->log_page_size); + MLX5_SET(wq, wq, log_wq_stride, in->wqe_shift + 4); + MLX5_SET(wq, wq, log_wq_sz, in->log_size); + MLX5_SET(wq, wq, page_offset, in->page_offset); + MLX5_SET(wq, wq, lwm, in->lwm); + MLX5_SET(wq, wq, pd, in->pd); + MLX5_SET64(wq, wq, dbr_addr, in->db_record); +} + +static void set_srqc(void *srqc, struct mlx5_srq_attr *in) +{ + MLX5_SET(srqc, srqc, wq_signature, !!(in->flags + & MLX5_SRQ_FLAG_WQ_SIG)); + MLX5_SET(srqc, srqc, log_page_size, in->log_page_size); + MLX5_SET(srqc, srqc, log_rq_stride, in->wqe_shift); + MLX5_SET(srqc, srqc, log_srq_size, in->log_size); + MLX5_SET(srqc, srqc, page_offset, in->page_offset); + MLX5_SET(srqc, srqc, lwm, in->lwm); + MLX5_SET(srqc, srqc, pd, in->pd); + MLX5_SET64(srqc, srqc, dbr_addr, in->db_record); + MLX5_SET(srqc, srqc, xrcd, in->xrcd); + MLX5_SET(srqc, srqc, cqn, in->cqn); +} + +static void get_wq(void *wq, struct mlx5_srq_attr *in) +{ + if (MLX5_GET(wq, wq, wq_signature)) + in->flags &= MLX5_SRQ_FLAG_WQ_SIG; + in->log_page_size = MLX5_GET(wq, wq, log_wq_pg_sz); + in->wqe_shift = MLX5_GET(wq, wq, log_wq_stride) - 4; + in->log_size = MLX5_GET(wq, wq, log_wq_sz); + in->page_offset = MLX5_GET(wq, wq, page_offset); + in->lwm = MLX5_GET(wq, wq, lwm); + in->pd = MLX5_GET(wq, wq, pd); + in->db_record = MLX5_GET64(wq, wq, dbr_addr); +} + +static void get_srqc(void *srqc, struct mlx5_srq_attr *in) +{ + if (MLX5_GET(srqc, srqc, wq_signature)) + in->flags &= MLX5_SRQ_FLAG_WQ_SIG; + in->log_page_size = MLX5_GET(srqc, srqc, log_page_size); + in->wqe_shift = MLX5_GET(srqc, srqc, log_rq_stride); + in->log_size = MLX5_GET(srqc, srqc, log_srq_size); + in->page_offset = MLX5_GET(srqc, srqc, page_offset); + in->lwm = MLX5_GET(srqc, srqc, lwm); + in->pd = MLX5_GET(srqc, srqc, pd); + in->db_record = MLX5_GET64(srqc, srqc, dbr_addr); +} + +struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn) +{ + struct mlx5_srq_table *table = &dev->srq_table; + struct mlx5_core_srq *srq; + + xa_lock_irq(&table->array); + srq = xa_load(&table->array, srqn); + if (srq) + refcount_inc(&srq->common.refcount); + xa_unlock_irq(&table->array); + + return srq; +} + +static int __set_srq_page_size(struct mlx5_srq_attr *in, + unsigned long page_size) +{ + if (!page_size) + return -EINVAL; + in->log_page_size = order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT; + + if (WARN_ON(get_pas_size(in) != + ib_umem_num_dma_blocks(in->umem, page_size) * sizeof(u64))) + return -EINVAL; + return 0; +} + +#define set_srq_page_size(in, typ, log_pgsz_fld) \ + __set_srq_page_size(in, mlx5_umem_find_best_quantized_pgoff( \ + (in)->umem, typ, log_pgsz_fld, \ + MLX5_ADAPTER_PAGE_SHIFT, page_offset, \ + 64, &(in)->page_offset)) + +static int create_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct 
mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0}; + void *create_in; + void *srqc; + void *pas; + int pas_size; + int inlen; + int err; + + if (in->umem) { + err = set_srq_page_size(in, srqc, log_page_size); + if (err) + return err; + } + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + MLX5_SET(create_srq_in, create_in, uid, in->uid); + srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry); + pas = MLX5_ADDR_OF(create_srq_in, create_in, pas); + + set_srqc(srqc, in); + if (in->umem) + mlx5_ib_populate_pas( + in->umem, + 1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT), + pas, 0); + else + memcpy(pas, in->pas, pas_size); + + MLX5_SET(create_srq_in, create_in, opcode, + MLX5_CMD_OP_CREATE_SRQ); + + err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) { + srq->srqn = MLX5_GET(create_srq_out, create_out, srqn); + srq->uid = in->uid; + } + + return err; +} + +static int destroy_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_srq_in)] = {}; + + MLX5_SET(destroy_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_SRQ); + MLX5_SET(destroy_srq_in, in, srqn, srq->srqn); + MLX5_SET(destroy_srq_in, in, uid, srq->uid); + + return mlx5_cmd_exec_in(dev->mdev, destroy_srq, in); +} + +static int arm_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq) +{ + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {}; + + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ); + MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, in, lwm, lwm); + MLX5_SET(arm_rq_in, in, uid, srq->uid); + + return mlx5_cmd_exec_in(dev->mdev, arm_rq, in); +} + +static int query_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 in[MLX5_ST_SZ_DW(query_srq_in)] = {}; + u32 *srq_out; + void *srqc; + int err; + + srq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_srq_out), GFP_KERNEL); + if (!srq_out) + return -ENOMEM; + + MLX5_SET(query_srq_in, in, opcode, MLX5_CMD_OP_QUERY_SRQ); + MLX5_SET(query_srq_in, in, srqn, srq->srqn); + err = mlx5_cmd_exec_inout(dev->mdev, query_srq, in, srq_out); + if (err) + goto out; + + srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry); + get_srqc(srqc, out); + if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; +out: + kvfree(srq_out); + return err; +} + +static int create_xrc_srq_cmd(struct mlx5_ib_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)]; + void *create_in; + void *xrc_srqc; + void *pas; + int pas_size; + int inlen; + int err; + + if (in->umem) { + err = set_srq_page_size(in, xrc_srqc, log_page_size); + if (err) + return err; + } + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + MLX5_SET(create_xrc_srq_in, create_in, uid, in->uid); + xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, create_in, + xrc_srq_context_entry); + pas = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas); + + set_srqc(xrc_srqc, in); + MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index); + if (in->umem) + mlx5_ib_populate_pas( + in->umem, + 1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT), + pas, 0); 
+ else + memcpy(pas, in->pas, pas_size); + MLX5_SET(create_xrc_srq_in, create_in, opcode, + MLX5_CMD_OP_CREATE_XRC_SRQ); + + memset(create_out, 0, sizeof(create_out)); + err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out, + sizeof(create_out)); + if (err) + goto out; + + srq->srqn = MLX5_GET(create_xrc_srq_out, create_out, xrc_srqn); + srq->uid = in->uid; +out: + kvfree(create_in); + return err; +} + +static int destroy_xrc_srq_cmd(struct mlx5_ib_dev *dev, + struct mlx5_core_srq *srq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {}; + + MLX5_SET(destroy_xrc_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRC_SRQ); + MLX5_SET(destroy_xrc_srq_in, in, xrc_srqn, srq->srqn); + MLX5_SET(destroy_xrc_srq_in, in, uid, srq->uid); + + return mlx5_cmd_exec_in(dev->mdev, destroy_xrc_srq, in); +} + +static int arm_xrc_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + u16 lwm) +{ + u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {}; + + MLX5_SET(arm_xrc_srq_in, in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, in, op_mod, + MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, in, xrc_srqn, srq->srqn); + MLX5_SET(arm_xrc_srq_in, in, lwm, lwm); + MLX5_SET(arm_xrc_srq_in, in, uid, srq->uid); + + return mlx5_cmd_exec_in(dev->mdev, arm_xrc_srq, in); +} + +static int query_xrc_srq_cmd(struct mlx5_ib_dev *dev, + struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 in[MLX5_ST_SZ_DW(query_xrc_srq_in)] = {}; + u32 *xrcsrq_out; + void *xrc_srqc; + int err; + + xrcsrq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out), GFP_KERNEL); + if (!xrcsrq_out) + return -ENOMEM; + + MLX5_SET(query_xrc_srq_in, in, opcode, MLX5_CMD_OP_QUERY_XRC_SRQ); + MLX5_SET(query_xrc_srq_in, in, xrc_srqn, srq->srqn); + + err = mlx5_cmd_exec_inout(dev->mdev, query_xrc_srq, in, xrcsrq_out); + if (err) + goto out; + + xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out, + xrc_srq_context_entry); + get_srqc(xrc_srqc, out); + if (MLX5_GET(xrc_srqc, xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; + +out: + kvfree(xrcsrq_out); + return err; +} + +static int create_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + void *create_out = NULL; + void *create_in = NULL; + void *rmpc; + void *wq; + void *pas; + int pas_size; + int outlen; + int inlen; + int err; + + if (in->umem) { + err = set_srq_page_size(in, wq, log_wq_pg_sz); + if (err) + return err; + } + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size; + outlen = MLX5_ST_SZ_BYTES(create_rmp_out); + create_in = kvzalloc(inlen, GFP_KERNEL); + create_out = kvzalloc(outlen, GFP_KERNEL); + if (!create_in || !create_out) { + err = -ENOMEM; + goto out; + } + + rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + + MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); + MLX5_SET(create_rmp_in, create_in, uid, in->uid); + pas = MLX5_ADDR_OF(rmpc, rmpc, wq.pas); + + set_wq(wq, in); + if (in->umem) + mlx5_ib_populate_pas( + in->umem, + 1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT), + pas, 0); + else + memcpy(pas, in->pas, pas_size); + + MLX5_SET(create_rmp_in, create_in, opcode, MLX5_CMD_OP_CREATE_RMP); + err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out, outlen); + if (!err) { + srq->srqn = MLX5_GET(create_rmp_out, create_out, rmpn); + srq->uid = in->uid; + } + +out: + kvfree(create_in); + kvfree(create_out); + return err; +} + +static int destroy_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq 
*srq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)] = {}; + + MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP); + MLX5_SET(destroy_rmp_in, in, rmpn, srq->srqn); + MLX5_SET(destroy_rmp_in, in, uid, srq->uid); + return mlx5_cmd_exec_in(dev->mdev, destroy_rmp, in); +} + +static int arm_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + u16 lwm) +{ + void *out = NULL; + void *in = NULL; + void *rmpc; + void *wq; + void *bitmask; + int outlen; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rmp_in); + outlen = MLX5_ST_SZ_BYTES(modify_rmp_out); + + in = kvzalloc(inlen, GFP_KERNEL); + out = kvzalloc(outlen, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + rmpc = MLX5_ADDR_OF(modify_rmp_in, in, ctx); + bitmask = MLX5_ADDR_OF(modify_rmp_in, in, bitmask); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + + MLX5_SET(modify_rmp_in, in, rmp_state, MLX5_RMPC_STATE_RDY); + MLX5_SET(modify_rmp_in, in, rmpn, srq->srqn); + MLX5_SET(modify_rmp_in, in, uid, srq->uid); + MLX5_SET(wq, wq, lwm, lwm); + MLX5_SET(rmp_bitmask, bitmask, lwm, 1); + MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); + MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP); + + err = mlx5_cmd_exec_inout(dev->mdev, modify_rmp, in, out); + +out: + kvfree(in); + kvfree(out); + return err; +} + +static int query_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 *rmp_out = NULL; + u32 *rmp_in = NULL; + void *rmpc; + int outlen; + int inlen; + int err; + + outlen = MLX5_ST_SZ_BYTES(query_rmp_out); + inlen = MLX5_ST_SZ_BYTES(query_rmp_in); + + rmp_out = kvzalloc(outlen, GFP_KERNEL); + rmp_in = kvzalloc(inlen, GFP_KERNEL); + if (!rmp_out || !rmp_in) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(query_rmp_in, rmp_in, opcode, MLX5_CMD_OP_QUERY_RMP); + MLX5_SET(query_rmp_in, rmp_in, rmpn, srq->srqn); + err = mlx5_cmd_exec_inout(dev->mdev, query_rmp, rmp_in, rmp_out); + if (err) + goto out; + + rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context); + get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out); + if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY) + out->flags |= MLX5_SRQ_FLAG_ERR; + +out: + kvfree(rmp_out); + kvfree(rmp_in); + return err; +} + +static int create_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0}; + void *create_in; + void *xrqc; + void *wq; + void *pas; + int pas_size, rq_pas_size; + int inlen; + int err; + + if (in->umem) { + err = set_srq_page_size(in, wq, log_wq_pg_sz); + if (err) + return err; + } + + rq_pas_size = get_pas_size(in); + if (in->type == IB_EXP_SRQT_NVMF) + pas_size = roundup(rq_pas_size, MLX5_PAS_ALIGN) + + roundup(get_nvmf_pas_size(&in->nvmf), MLX5_PAS_ALIGN); + else if (in->type == IB_SRQT_TM) + pas_size = rq_pas_size; + else + return -EINVAL; + + inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + pas = MLX5_ADDR_OF(xrqc, xrqc, wq.pas); + + set_wq(wq, in); + if (in->umem) + mlx5_ib_populate_pas( + in->umem, + 1UL << (in->log_page_size + MLX5_ADAPTER_PAGE_SHIFT), + pas, 0); + else + memcpy(pas, in->pas, rq_pas_size); + + if (in->type == IB_SRQT_TM) { + MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING); + if (in->flags & MLX5_SRQ_FLAG_RNDV) + MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV); + MLX5_SET(xrqc, xrqc, + 
tag_matching_topology_context.log_matching_list_sz, + in->tm_log_list_size); + } else if (in->type == IB_EXP_SRQT_NVMF) { + MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_NVMF); + set_nvmf_srq_pas(&in->nvmf, + pas + roundup(rq_pas_size, MLX5_PAS_ALIGN)); + set_nvmf_xrq_context(&in->nvmf, xrqc); + } + MLX5_SET(xrqc, xrqc, user_index, in->user_index); + MLX5_SET(xrqc, xrqc, cqn, in->cqn); + MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ); + MLX5_SET(create_xrq_in, create_in, uid, in->uid); + err = mlx5_cmd_exec(dev->mdev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) { + srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn); + srq->uid = in->uid; + if (in->type == IB_EXP_SRQT_NVMF) + INIT_LIST_HEAD(&srq->ctrl_list); + } + + return err; +} + +static int destroy_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {}; + + MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ); + MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn); + MLX5_SET(destroy_xrq_in, in, uid, srq->uid); + + return mlx5_cmd_exec_in(dev->mdev, destroy_xrq, in); +} + +static int arm_xrq_cmd(struct mlx5_ib_dev *dev, + struct mlx5_core_srq *srq, + u16 lwm) +{ + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {}; + + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ); + MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, in, lwm, lwm); + MLX5_SET(arm_rq_in, in, uid, srq->uid); + + return mlx5_cmd_exec_in(dev->mdev, arm_rq, in); +} + +static int query_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {}; + u32 *xrq_out; + int outlen = MLX5_ST_SZ_BYTES(query_xrq_out); + void *xrqc; + int err; + + xrq_out = kvzalloc(outlen, GFP_KERNEL); + if (!xrq_out) + return -ENOMEM; + + MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ); + MLX5_SET(query_xrq_in, in, xrqn, srq->srqn); + + err = mlx5_cmd_exec_inout(dev->mdev, query_xrq, in, xrq_out); + if (err) + goto out; + + xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context); + get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out); + if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; + out->tm_next_tag = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.append_next_index); + out->tm_hw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.hw_phase_cnt); + out->tm_sw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.sw_phase_cnt); + + if (MLX5_CAP_NVMF(dev->mdev, cmd_unknown_namespace_cnt)) { + out->nvmf.cmd_unknown_namespace_cnt = + MLX5_GET(xrqc, xrqc, + nvme_offload_context.cmd_unknown_namespace_cnt); + } + +out: + kvfree(xrq_out); + return err; +} + +static int create_srq_split(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + if (!dev->mdev->issi) + return create_srq_cmd(dev, srq, in); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return create_xrc_srq_cmd(dev, srq, in); + case MLX5_RES_XRQ: + return create_xrq_cmd(dev, srq, in); + default: + return create_rmp_cmd(dev, srq, in); + } +} + +static int destroy_srq_split(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) +{ + if (!dev->mdev->issi) + return destroy_srq_cmd(dev, srq); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return destroy_xrc_srq_cmd(dev, srq); + case MLX5_RES_XRQ: + return destroy_xrq_cmd(dev, srq); + default: + return 
destroy_rmp_cmd(dev, srq); + } +} + +int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + struct mlx5_srq_table *table = &dev->srq_table; + int err; + + switch (in->type) { + case IB_SRQT_XRC: + srq->common.res = MLX5_RES_XSRQ; + break; + case IB_SRQT_TM: + case IB_EXP_SRQT_NVMF: + srq->common.res = MLX5_RES_XRQ; + break; + default: + srq->common.res = MLX5_RES_SRQ; + } + + err = create_srq_split(dev, srq, in); + if (err) + return err; + + refcount_set(&srq->common.refcount, 1); + init_completion(&srq->common.free); + + err = xa_err(xa_store_irq(&table->array, srq->srqn, srq, GFP_KERNEL)); + if (err) + goto err_destroy_srq_split; + + return 0; + +err_destroy_srq_split: + destroy_srq_split(dev, srq); + + return err; +} + +int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) +{ + struct mlx5_srq_table *table = &dev->srq_table; + struct mlx5_core_srq *tmp; + int err; + + /* Delete entry, but leave index occupied */ + tmp = xa_cmpxchg_irq(&table->array, srq->srqn, srq, XA_ZERO_ENTRY, 0); + if (WARN_ON(tmp != srq)) + return xa_err(tmp) ?: -EINVAL; + + err = destroy_srq_split(dev, srq); + if (err) { + /* + * We don't need to check returned result for an error, + * because we are storing in pre-allocated space xarray + * entry and it can't fail at this stage. + */ + xa_cmpxchg_irq(&table->array, srq->srqn, XA_ZERO_ENTRY, srq, 0); + return err; + } + xa_erase_irq(&table->array, srq->srqn); + + mlx5_core_res_put(&srq->common); + wait_for_completion(&srq->common.free); + return 0; +} + +int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + if (!dev->mdev->issi) + return query_srq_cmd(dev, srq, out); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return query_xrc_srq_cmd(dev, srq, out); + case MLX5_RES_XRQ: + return query_xrq_cmd(dev, srq, out); + default: + return query_rmp_cmd(dev, srq, out); + } +} + +int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, + u16 lwm, int is_srq) +{ + if (!dev->mdev->issi) + return arm_srq_cmd(dev, srq, lwm, is_srq); + switch (srq->common.res) { + case MLX5_RES_XSRQ: + return arm_xrc_srq_cmd(dev, srq, lwm); + case MLX5_RES_XRQ: + return arm_xrq_cmd(dev, srq, lwm); + default: + return arm_rmp_cmd(dev, srq, lwm); + } +} + +static int srq_event_notifier(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_srq_table *table; + struct mlx5_core_srq *srq; + struct mlx5_eqe *eqe; + u32 srqn; + struct mlx5_core_nvmf_be_ctrl *ctrl; + u32 qpn_id_handle = 0; + bool found = false; + + if (type != MLX5_EVENT_TYPE_SRQ_CATAS_ERROR && + type != MLX5_EVENT_TYPE_SRQ_RQ_LIMIT && + type != MLX5_EVENT_TYPE_XRQ_ERROR) + return NOTIFY_DONE; + + table = container_of(nb, struct mlx5_srq_table, nb); + + eqe = data; + if (type == MLX5_EVENT_TYPE_XRQ_ERROR) { + u8 error_type; + + error_type = be32_to_cpu(eqe->data.xrq.type_xrqn) >> 24; + if (error_type != MLX5_XRQ_ERROR_TYPE_BACKEND_CONTROLLER_ERROR) + return NOTIFY_DONE; + srqn = be32_to_cpu(eqe->data.xrq.type_xrqn) & + MLX5_24BIT_MASK; + qpn_id_handle = be32_to_cpu(eqe->data.xrq.qpn_id_handle) & + MLX5_24BIT_MASK; + } else { + srqn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + } + + xa_lock(&table->array); + srq = xa_load(&table->array, srqn); + if (srq) + refcount_inc(&srq->common.refcount); + xa_unlock(&table->array); + + if (!srq) + return NOTIFY_OK; + + if (type == MLX5_EVENT_TYPE_XRQ_ERROR) { + spin_lock(&srq->lock); + list_for_each_entry(ctrl, 
&srq->ctrl_list, entry) { + if (ctrl->id == qpn_id_handle) { + found = true; + break; + } + } + spin_unlock(&srq->lock); + + if (found) + ctrl->event(ctrl, type, eqe->sub_type, + MLX5_XRQ_ERROR_TYPE_BACKEND_CONTROLLER_ERROR); + + if (refcount_dec_and_test(&srq->common.refcount)) + complete(&srq->common.free); + + return NOTIFY_OK; + } + + srq->event(srq, eqe->type); + + mlx5_core_res_put(&srq->common); + + return NOTIFY_OK; +} + +int mlx5_init_srq_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_srq_table *table = &dev->srq_table; + + memset(table, 0, sizeof(*table)); + xa_init_flags(&table->array, XA_FLAGS_LOCK_IRQ); + + table->nb.notifier_call = srq_event_notifier; + mlx5_notifier_register(dev->mdev, &table->nb); + + return 0; +} + +void mlx5_cleanup_srq_table(struct mlx5_ib_dev *dev) +{ + struct mlx5_srq_table *table = &dev->srq_table; + + mlx5_notifier_unregister(dev->mdev, &table->nb); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.c new file mode 100644 index 0000000..8172e5f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "mlx5_ib.h" +#include "srq_nvmf.h" + +int get_nvmf_pas_size(struct mlx5_nvmf_attr *nvmf) +{ + return nvmf->staging_buffer_number_of_pages * sizeof(u64); +} + +void set_nvmf_srq_pas(struct mlx5_nvmf_attr *nvmf, __be64 *pas) +{ + int i; + + for (i = 0; i < nvmf->staging_buffer_number_of_pages; i++) + pas[i] = cpu_to_be64(nvmf->staging_buffer_pas[i]); +} + +void set_nvmf_xrq_context(struct mlx5_nvmf_attr *nvmf, void *xrqc) +{ + u16 nvme_queue_size; + + /* + * According to the PRM, nvme_queue_size is a 16 bit field and + * setting it to 0 means setting size to 2^16 (The maximum queue size + * possible for an NVMe device). 
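+	 * For example, a requested queue depth of 0x10000 (65536) entries is
+	 * encoded as 0 below, while any smaller depth is passed through unchanged.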
+	 */
+	if (nvmf->nvme_queue_size < 0x10000)
+		nvme_queue_size = nvmf->nvme_queue_size;
+	else
+		nvme_queue_size = 0;
+
+
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.nvmf_offload_type,
+		 nvmf->type);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.passthrough_sqe_rw_service_en,
+		 nvmf->passthrough_sqe_rw_service_en);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.log_max_namespace,
+		 nvmf->log_max_namespace);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.ioccsz,
+		 nvmf->ioccsz);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.icdoff,
+		 nvmf->icdoff);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.log_max_io_size,
+		 nvmf->log_max_io_size);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.nvme_memory_log_page_size,
+		 nvmf->nvme_memory_log_page_size);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.staging_buffer_log_page_size,
+		 nvmf->staging_buffer_log_page_size);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.staging_buffer_number_of_pages,
+		 nvmf->staging_buffer_number_of_pages);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.staging_buffer_page_offset,
+		 nvmf->staging_buffer_page_offset);
+	MLX5_SET(xrqc, xrqc,
+		 nvme_offload_context.nvme_queue_size,
+		 nvme_queue_size);
+}
+
+static int mlx5_ib_check_nvmf_srq_attrs(struct ib_srq_init_attr *init_attr)
+{
+	switch (init_attr->ext.nvmf.type) {
+	case IB_NVMF_WRITE_OFFLOAD:
+	case IB_NVMF_READ_OFFLOAD:
+	case IB_NVMF_READ_WRITE_OFFLOAD:
+	case IB_NVMF_READ_WRITE_FLUSH_OFFLOAD:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Must be called after checking that offload type values are valid */
+static enum mlx5_nvmf_offload_type to_mlx5_nvmf_offload_type(enum ib_nvmf_offload_type type)
+{
+	switch (type) {
+	case IB_NVMF_WRITE_OFFLOAD:
+		return MLX5_NVMF_WRITE_OFFLOAD;
+	case IB_NVMF_READ_OFFLOAD:
+		return MLX5_NVMF_READ_OFFLOAD;
+	case IB_NVMF_READ_WRITE_OFFLOAD:
+		return MLX5_NVMF_READ_WRITE_OFFLOAD;
+	case IB_NVMF_READ_WRITE_FLUSH_OFFLOAD:
+		return MLX5_NVMF_READ_WRITE_FLUSH_OFFLOAD;
+	default:
+		return -EINVAL;
+	}
+}
+
+int mlx5_ib_exp_set_nvmf_srq_attrs(struct mlx5_nvmf_attr *nvmf,
+				   struct ib_srq_init_attr *init_attr)
+{
+	int err;
+
+	err = mlx5_ib_check_nvmf_srq_attrs(init_attr);
+	if (err)
+		return -EINVAL;
+
+	nvmf->type = to_mlx5_nvmf_offload_type(init_attr->ext.nvmf.type);
+	nvmf->passthrough_sqe_rw_service_en =
+		init_attr->ext.nvmf.passthrough_sqe_rw_service_en;
+	nvmf->log_max_namespace = init_attr->ext.nvmf.log_max_namespace;
+	nvmf->ioccsz = init_attr->ext.nvmf.cmd_size;
+	nvmf->icdoff = init_attr->ext.nvmf.data_offset;
+	nvmf->log_max_io_size = init_attr->ext.nvmf.log_max_io_size;
+	nvmf->nvme_memory_log_page_size = init_attr->ext.nvmf.nvme_memory_log_page_size;
+	nvmf->staging_buffer_log_page_size = init_attr->ext.nvmf.staging_buffer_log_page_size;
+	nvmf->staging_buffer_number_of_pages = init_attr->ext.nvmf.staging_buffer_number_of_pages;
+	nvmf->staging_buffer_page_offset = init_attr->ext.nvmf.staging_buffer_page_offset;
+	nvmf->nvme_queue_size = init_attr->ext.nvmf.nvme_queue_size;
+	nvmf->staging_buffer_pas = init_attr->ext.nvmf.staging_buffer_pas;
+
+	return err;
+}
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.h
new file mode 100644
index 0000000..ebd99ed
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/srq_nvmf.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef MLX5_IB_SRQ_EXP_H +#define MLX5_IB_SRQ_EXP_H + +/* NVMEoF API */ +int get_nvmf_pas_size(struct mlx5_nvmf_attr *nvmf); +void set_nvmf_srq_pas(struct mlx5_nvmf_attr *nvmf, __be64 *pas); +void set_nvmf_xrq_context(struct mlx5_nvmf_attr *nvmf, void *xrqc); + +#endif /* __MLX5_SRQ_EXP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/std_types.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/std_types.c new file mode 100644 index 0000000..bbfcce3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/std_types.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include + +static int UVERBS_HANDLER(MLX5_IB_METHOD_PD_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_QUERY_PD_HANDLE); + struct mlx5_ib_pd *mpd = to_mpd(pd); + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_PD_RESP_PDN, + &mpd->pdn, sizeof(mpd->pdn)); +} + +static int fill_vport_icm_addr(struct mlx5_core_dev *mdev, u16 vport, + struct mlx5_ib_uapi_query_port *info) +{ + u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {}; + bool sw_owner_supp; + u64 icm_rx; + u64 icm_tx; + int err; + + sw_owner_supp = MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, sw_owner) || + MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, sw_owner_v2); + + if (vport == MLX5_VPORT_UPLINK) { + icm_rx = MLX5_CAP64_ESW_FLOWTABLE(mdev, + sw_steering_uplink_icm_address_rx); + icm_tx = MLX5_CAP64_ESW_FLOWTABLE(mdev, + sw_steering_uplink_icm_address_tx); + } else { + MLX5_SET(query_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); + MLX5_SET(query_esw_vport_context_in, in, vport_number, vport); + MLX5_SET(query_esw_vport_context_in, in, other_vport, true); + + err = mlx5_cmd_exec_inout(mdev, query_esw_vport_context, in, + out); + + if (err) + return err; + + icm_rx = MLX5_GET64( + query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_rx); + + icm_tx = MLX5_GET64( + query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_tx); + } + + if (sw_owner_supp && icm_rx) { + info->vport_steering_icm_rx = icm_rx; + info->flags |= + MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_RX; + } + + if (sw_owner_supp && icm_tx) { + info->vport_steering_icm_tx = icm_tx; + info->flags |= + MLX5_IB_UAPI_QUERY_PORT_VPORT_STEERING_ICM_TX; + } + + return 0; +} + +static int fill_vport_vhca_id(struct mlx5_core_dev *mdev, u16 vport, + struct mlx5_ib_uapi_query_port *info) +{ + size_t out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; + void *out; + int err; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, other_function, true); + MLX5_SET(query_hca_cap_in, in, function_id, vport); + MLX5_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + info->vport_vhca_id = MLX5_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.vhca_id); + + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; +out: + kfree(out); + return err; +} + +static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_uapi_query_port *info) +{ + struct mlx5_eswitch_rep *rep; + struct mlx5_core_dev *mdev; + int err; + + rep = dev->port[port_num - 1].rep; + if (!rep) + return -EOPNOTSUPP; + + mdev = mlx5_eswitch_get_core_dev(rep->esw); + if (!mdev) + return -EINVAL; + + info->vport = rep->vport; + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT; + + if (rep->vport != MLX5_VPORT_UPLINK) { + err = fill_vport_vhca_id(mdev, rep->vport, info); + if (err) + return err; + } + + info->esw_owner_vhca_id = MLX5_CAP_GEN(mdev, vhca_id); + info->flags |= MLX5_IB_UAPI_QUERY_PORT_ESW_OWNER_VHCA_ID; + + err = fill_vport_icm_addr(mdev, rep->vport, info); + if (err) + return err; + + if 
(mlx5_eswitch_vport_match_metadata_enabled(rep->esw)) { + info->reg_c0.value = mlx5_eswitch_get_vport_metadata_for_match( + rep->esw, rep->vport); + info->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask(); + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_REG_C0; + } + + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_uapi_query_port info = {}; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + u32 port_num; + int ret; + + if (uverbs_copy_from(&port_num, attrs, + MLX5_IB_ATTR_QUERY_PORT_PORT_NUM)) + return -EFAULT; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + if (!rdma_is_port_valid(&dev->ib_dev, port_num)) + return -EINVAL; + + if (mlx5_eswitch_mode(dev->mdev) == MLX5_ESWITCH_OFFLOADS) { + ret = fill_switchdev_info(dev, port_num, &info); + if (ret) + return ret; + } + + return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info, + sizeof(info)); +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_QUERY_PORT, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM, + UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_QUERY_PORT, + UVERBS_ATTR_STRUCT(struct mlx5_ib_uapi_query_port, + reg_c0), + UA_MANDATORY)); + +ADD_UVERBS_METHODS(mlx5_ib_device, + UVERBS_OBJECT_DEVICE, + &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_PD_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_PD_RESP_PDN, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +ADD_UVERBS_METHODS(mlx5_ib_pd, + UVERBS_OBJECT_PD, + &UVERBS_METHOD(MLX5_IB_METHOD_PD_QUERY)); + +const struct uapi_definition mlx5_ib_std_types_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE( + UVERBS_OBJECT_PD, + &mlx5_ib_pd), + UAPI_DEF_CHAIN_OBJ_TREE( + UVERBS_OBJECT_DEVICE, + &mlx5_ib_device), + {}, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.c new file mode 100644 index 0000000..0a74df2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.c @@ -0,0 +1,1555 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include +#include "wr.h" + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + +/* handle_post_send_edge - Check if we get to SQ edge. If yes, update to the + * next nearby edge and get new address translation for current WQE position. + * @sq - SQ buffer. + * @seg: Current WQE position (16B aligned). + * @wqe_sz: Total current WQE size [16B]. + * @cur_edge: Updated current edge. 
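+ * If @seg has not reached @cur_edge nothing is changed; otherwise both
+ * @seg and @cur_edge are advanced to the next fragment of the SQ buffer.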
+ */ +static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg, + u32 wqe_sz, void **cur_edge) +{ + u32 idx; + + if (likely(*seg != *cur_edge)) + return; + + idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1); + *cur_edge = get_sq_edge(sq, idx); + + *seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx); +} + +/* memcpy_send_wqe - copy data from src to WQE and update the relevant WQ's + * pointers. At the end @seg is aligned to 16B regardless the copied size. + * @sq - SQ buffer. + * @cur_edge: Updated current edge. + * @seg: Current WQE position (16B aligned). + * @wqe_sz: Total current WQE size [16B]. + * @src: Pointer to copy from. + * @n: Number of bytes to copy. + */ +static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge, + void **seg, u32 *wqe_sz, const void *src, + size_t n) +{ + while (likely(n)) { + size_t leftlen = *cur_edge - *seg; + size_t copysz = min_t(size_t, leftlen, n); + size_t stride; + + memcpy(*seg, src, copysz); + + n -= copysz; + src += copysz; + stride = !n ? ALIGN(copysz, 16) : copysz; + *seg += stride; + *wqe_sz += stride >> 4; + handle_post_send_edge(sq, seg, *wqe_sz, cur_edge); + } +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, + struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned int cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size, void **cur_edge) +{ + struct mlx5_wqe_eth_seg *eseg = *seg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + size_t left, copysz; + void *pdata = ud_wr->header; + size_t stride; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr.sz = cpu_to_be16(left); + + /* memcpy_send_wqe should get a 16B align address. Hence, we + * first copy up to the current edge and then, if needed, + * continue to memcpy_send_wqe. 
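+		 * The inline packet header starts at an unaligned offset inside
+		 * the eth segment, so the first chunk is copied with a plain
+		 * memcpy() here.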
+ */ + copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start, + left); + memcpy(eseg->inline_hdr.start, pdata, copysz); + stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) - + sizeof(eseg->inline_hdr.start) + copysz, 16); + *size += stride / 16; + *seg += stride; + + if (copysz < left) { + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + left -= copysz; + pdata += copysz; + memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata, + left); + } + + return; + } + + *seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + const struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = + cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static u64 get_xlt_octo(u64 bytes) +{ + return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / + MLX5_IB_UMR_OCTOWORD; +} + +static __be64 frwr_mkey_mask(bool atomic) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + if (atomic) + result |= MLX5_MKEY_MASK_A; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, + struct mlx5_ib_mr *mr, u8 flags, bool atomic) +{ + int size = (mr->mmkey.ndescs + mr->meta_ndescs) * mr->desc_size; + + memset(umr, 0, sizeof(*umr)); + + umr->flags = flags; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->mkey_mask = frwr_mkey_mask(atomic); +} + +static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) +{ + memset(umr, 0, sizeof(*umr)); + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = MLX5_UMR_INLINE; +} + +static __be64 get_umr_enable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_disable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(int atomic, + int relaxed_ordering_write, + int relaxed_ordering_read) +{ + u64 result; + + result = MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW; + + if (atomic) + result |= MLX5_MKEY_MASK_A; + + if (relaxed_ordering_write) + result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE; + + if (relaxed_ordering_read) + result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ; + + return 
cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD; + + return cpu_to_be64(result); +} + +static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) +{ + if (mask & MLX5_MKEY_MASK_PAGE_SIZE && + MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) + return -EPERM; + + if (mask & MLX5_MKEY_MASK_A && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) + return -EPERM; + + if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) + return -EPERM; + + if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) + return -EPERM; + + return 0; +} + +static int set_reg_umr_segment(struct mlx5_ib_dev *dev, + struct mlx5_wqe_umr_ctrl_seg *umr, + const struct ib_send_wr *wr) +{ + const struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(umr, 0, sizeof(*umr)); + + if (!umrwr->ignore_free_state) { + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + /* fail if free */ + umr->flags = MLX5_UMR_CHECK_FREE; + else + /* fail if not free */ + umr->flags = MLX5_UMR_CHECK_NOT_FREE; + } + + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { + u64 offset = get_xlt_octo(umrwr->offset); + + umr->xlt_offset = cpu_to_be16(offset & 0xffff); + umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { + umr->mkey_mask |= get_umr_update_access_mask( + !!(MLX5_CAP_GEN(dev->mdev, atomic)), + !!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)), + !!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))); + umr->mkey_mask |= get_umr_update_pd_mask(); + } + if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) + umr->mkey_mask |= get_umr_enable_mr_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + umr->mkey_mask |= get_umr_disable_mr_mask(); + + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; + + return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask)); +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? 
MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, + struct mlx5_ib_mr *mr, + u32 key, int access) +{ + int ndescs = ALIGN(mr->mmkey.ndescs + mr->meta_ndescs, 8) >> 1; + + memset(seg, 0, sizeof(*seg)); + + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; + seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(mr->ibmr.iova); + seg->len = cpu_to_be64(mr->ibmr.length); + seg->xlt_oct_size = cpu_to_be32(ndescs); +} + +static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) +{ + memset(seg, 0, sizeof(*seg)); + seg->status = MLX5_MKEY_STATUS_FREE; +} + +static void set_reg_mkey_segment(struct mlx5_ib_dev *dev, + struct mlx5_mkey_seg *seg, + const struct ib_send_wr *wr) +{ + const struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + MLX5_SET(mkc, seg, free, 1); + + MLX5_SET(mkc, seg, a, + !!(umrwr->access_flags & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, seg, rw, + !!(umrwr->access_flags & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, seg, rr, !!(umrwr->access_flags & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, seg, lw, !!(umrwr->access_flags & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, seg, lr, 1); + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) + MLX5_SET(mkc, seg, relaxed_ordering_write, + !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING)); + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) + MLX5_SET(mkc, seg, relaxed_ordering_read, + !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING)); + + if (umrwr->pd) + MLX5_SET(mkc, seg, pd, to_mpd(umrwr->pd)->pdn); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && + !umrwr->length) + MLX5_SET(mkc, seg, length64, 1); + + MLX5_SET64(mkc, seg, start_addr, umrwr->virt_addr); + MLX5_SET64(mkc, seg, len, umrwr->length); + MLX5_SET(mkc, seg, log_page_size, umrwr->page_shift); + MLX5_SET(mkc, seg, qpn, 0xffffff); + MLX5_SET(mkc, seg, mkey_7_0, mlx5_mkey_variant(umrwr->mkey)); +} + +static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, + struct mlx5_ib_mr *mr, + struct mlx5_ib_pd *pd) +{ + int bcount = mr->desc_size * (mr->mmkey.ndescs + mr->meta_ndescs); + + dseg->addr = cpu_to_be64(mr->desc_map); + dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); + dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); +} + +static __be32 send_ieth(const struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + void **wqe, int *wqe_sz, void **cur_edge) +{ + struct mlx5_wqe_inline_seg *seg; + size_t offset; + int inl = 0; + int i; + + seg = *wqe; + *wqe += sizeof(*seg); + offset = sizeof(*seg); + + for (i = 0; i < wr->num_sge; i++) { + size_t len = wr->sg_list[i].length; + 
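+		/* Copy each SGE into the WQE inline segment in chunks, letting
+		 * handle_post_send_edge() wrap to the next SQ fragment as needed.
+		 */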
void *addr = (void *)(unsigned long)(wr->sg_list[i].addr); + + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + while (likely(len)) { + size_t leftlen; + size_t copysz; + + handle_post_send_edge(&qp->sq, wqe, + *wqe_sz + (offset >> 4), + cur_edge); + + leftlen = *cur_edge - *wqe; + copysz = min_t(size_t, leftlen, len); + + memcpy(*wqe, addr, copysz); + len -= copysz; + addr += copysz; + *wqe += copysz; + offset += copysz; + } + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *wqe_sz += ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? + MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = + bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + 
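+		/* only T10-DIF (or no signature) is supported for the wire domain */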
return -EINVAL; + } + + return 0; +} + + +static int set_sig_data_segment(const struct ib_send_wr *send_wr, + struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + struct mlx5_bsf *bsf; + u32 data_len; + u32 data_key; + u64 data_va; + u32 prot_len = 0; + u32 prot_key = 0; + u64 prot_va = 0; + bool prot = false; + int ret; + int wqe_size; + struct mlx5_ib_mr *mr = to_mmr(sig_mr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; + + data_len = pi_mr->data_length; + data_key = pi_mr->ibmr.lkey; + data_va = pi_mr->data_iova; + if (pi_mr->meta_ndescs) { + prot_len = pi_mr->meta_length; + prot_key = pi_mr->ibmr.lkey; + prot_va = pi_mr->pi_iova; + prot = true; + } + + if (!prot || (data_key == prot_key && data_va == prot_va && + data_len == prot_len)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. + * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_mr *sig_mr, int access_flags, + u32 size, u32 length, u32 pdn) +{ + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 
0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + u32 size) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + +static int set_pi_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + const struct ib_reg_wr *wr = reg_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr); + struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr; + struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs; + u32 pdn = to_mpd(qp->ibqp.pd)->pdn; + u32 xlt_size; + int region_len, ret; + + if (unlikely(send_wr->num_sge != 0) || + unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = pi_mr->ibmr.length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE) + xlt_size = 0x30; + else + xlt_size = sizeof(struct mlx5_klm); + + set_sig_umr_segment(*seg, xlt_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len, + pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size, + cur_edge); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type (%d) is given.\n", + domain->sig_type); + return -EINVAL; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_reg_wr(struct mlx5_ib_qp *qp, + const struct ib_reg_wr *wr, + void **seg, int *size, void **cur_edge, + bool check_not_free) +{ + struct mlx5_ib_mr *mr = to_mmr(wr->mr); + struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); + int mr_list_size = (mr->mmkey.ndescs + mr->meta_ndescs) * mr->desc_size; + bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; + bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC; + u8 flags = 0; + + /* Matches access in mlx5_set_umr_free_mkey(). + * Relaxed Ordering is set implicitly in mlx5_set_umr_free_mkey() and + * kernel ULPs are not aware of it, so we don't set it here. 
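+	 * mlx5_ib_can_reconfig_with_umr() below rejects access flag
+	 * combinations that UMR cannot apply on this device.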
+ */ + if (!mlx5_ib_can_reconfig_with_umr(dev, 0, wr->access)) { + mlx5_ib_warn( + to_mdev(qp->ibqp.device), + "Fast update for MR access flags is not possible\n"); + return -EINVAL; + } + + if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), + "Invalid IB_SEND_INLINE send flag\n"); + return -EINVAL; + } + + if (check_not_free) + flags |= MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + flags |= MLX5_UMR_INLINE; + + set_reg_umr_seg(*seg, mr, flags, atomic); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + set_reg_mkey_seg(*seg, mr, wr->key, wr->access); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + if (umr_inline) { + memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs, + mr_list_size); + *size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4); + } else { + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + set_linv_umr_seg(*seg); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + set_linv_mkey_seg(*seg); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +} + +static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16) +{ + __be32 *p = NULL; + int i, j; + + pr_debug("dump WQE index %u:\n", idx); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx); + pr_debug("WQBB at %p:\n", (void *)p); + j = 0; + idx = (idx + 1) & (qp->sq.wqe_cnt - 1); + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned int *idx, + int *size, void **cur_edge, int nreq, + bool send_signaled, bool solicited) +{ + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) + return -ENOMEM; + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (solicited ? 
MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + *cur_edge = qp->sq.cur_edge; + + return 0; +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned int *idx, int *size, + void **cur_edge, int nreq) +{ + return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, + wr->send_flags & IB_SEND_SIGNALED, + wr->send_flags & IB_SEND_SOLICITED); +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + void *seg, u8 size, void *cur_edge, + unsigned int idx, u64 wr_id, int nreq, u8 fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + if (unlikely(qp->flags_en & MLX5_QP_FLAG_SIGNATURE)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; + + /* We save the edge which was possibly updated during the WQE + * construction, into SQ's cache. + */ + seg = PTR_ALIGN(seg, MLX5_SEND_WQE_BB); + qp->sq.cur_edge = (unlikely(seg == cur_edge)) ? + get_sq_edge(&qp->sq, qp->sq.cur_post & + (qp->sq.wqe_cnt - 1)) : + cur_edge; +} + +static void handle_rdma_op(const struct ib_send_wr *wr, void **seg, int *size) +{ + set_raddr_seg(*seg, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); + *seg += sizeof(struct mlx5_wqe_raddr_seg); + *size += sizeof(struct mlx5_wqe_raddr_seg) / 16; +} + +static void handle_local_inv(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, unsigned int idx) +{ + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + (*ctrl)->imm = cpu_to_be32(wr->ex.invalidate_rkey); + set_linv_wr(qp, seg, size, cur_edge); +} + +static int handle_reg_mr(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int idx) +{ + qp->sq.wr_data[idx] = IB_WR_REG_MR; + (*ctrl)->imm = cpu_to_be32(reg_wr(wr)->key); + return set_reg_wr(qp, reg_wr(wr), seg, size, cur_edge, true); +} + +static int handle_psv(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int *idx, int nreq, + struct ib_sig_domain *domain, u32 psv_index, + u8 next_fence) +{ + int err; + + /* + * SET_PSV WQEs are not signaled and solicited on error. 
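+	 * Hence __begin_wqe() is called with send_signaled = false and
+	 * solicited = true.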
+ */ + err = __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, + false, true); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + goto out; + } + err = set_psv_wr(domain, psv_index, seg, size); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + goto out; + } + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, + next_fence, MLX5_OPCODE_SET_PSV); + +out: + return err; +} + +static int handle_reg_mr_integrity(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, + unsigned int *idx, int nreq, u8 fence, + u8 next_fence) +{ + struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr pa_pi_mr; + struct ib_sig_attrs *sig_attrs; + struct ib_reg_wr reg_pi_wr; + int err; + + qp->sq.wr_data[*idx] = IB_WR_REG_MR_INTEGRITY; + + mr = to_mmr(reg_wr(wr)->mr); + pi_mr = mr->pi_mr; + + if (pi_mr) { + memset(®_pi_wr, 0, + sizeof(struct ib_reg_wr)); + + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; + + (*ctrl)->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + prot registration */ + err = set_reg_wr(qp, ®_pi_wr, seg, size, cur_edge, false); + if (unlikely(err)) + goto out; + + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, + nreq, fence, MLX5_OPCODE_UMR); + + err = begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + goto out; + } + } else { + memset(&pa_pi_mr, 0, sizeof(struct mlx5_ib_mr)); + /* No UMR, use local_dma_lkey */ + pa_pi_mr.ibmr.lkey = mr->ibmr.pd->local_dma_lkey; + pa_pi_mr.mmkey.ndescs = mr->mmkey.ndescs; + pa_pi_mr.data_length = mr->data_length; + pa_pi_mr.data_iova = mr->data_iova; + if (mr->meta_ndescs) { + pa_pi_mr.meta_ndescs = mr->meta_ndescs; + pa_pi_mr.meta_length = mr->meta_length; + pa_pi_mr.pi_iova = mr->pi_iova; + } + + pa_pi_mr.ibmr.length = mr->ibmr.length; + mr->pi_mr = &pa_pi_mr; + } + (*ctrl)->imm = cpu_to_be32(mr->ibmr.rkey); + /* UMR for sig MR */ + err = set_pi_umr_wr(wr, qp, seg, size, cur_edge); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + goto out; + } + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_UMR); + + sig_attrs = mr->ibmr.sig_attrs; + err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, + &sig_attrs->mem, mr->sig->psv_memory.psv_idx, + next_fence); + if (unlikely(err)) + goto out; + + err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, + &sig_attrs->wire, mr->sig->psv_wire.psv_idx, + next_fence); + if (unlikely(err)) + goto out; + + qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + +out: + return err; +} + +static int handle_qpt_rc(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int *idx, int nreq, u8 fence, + u8 next_fence, int *num_sge) +{ + int err = 0; + + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + handle_rdma_op(wr, seg, size); + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -EOPNOTSUPP; + goto out; + + case IB_WR_LOCAL_INV: + handle_local_inv(qp, wr, ctrl, seg, size, cur_edge, *idx); + *num_sge = 0; + break; + + case IB_WR_REG_MR: + err = 
handle_reg_mr(qp, wr, ctrl, seg, size, cur_edge, *idx); + if (unlikely(err)) + goto out; + *num_sge = 0; + break; + + case IB_WR_REG_MR_INTEGRITY: + err = handle_reg_mr_integrity(dev, qp, wr, ctrl, seg, size, + cur_edge, idx, nreq, fence, + next_fence); + if (unlikely(err)) + goto out; + *num_sge = 0; + break; + + default: + break; + } + +out: + return err; +} + +static void handle_qpt_uc(const struct ib_send_wr *wr, void **seg, int *size) +{ + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + handle_rdma_op(wr, seg, size); + break; + default: + break; + } +} + +static void handle_qpt_hw_gsi(struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, void **seg, + int *size, void **cur_edge) +{ + set_datagram_seg(*seg, wr); + *seg += sizeof(struct mlx5_wqe_datagram_seg); + *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +} + +static void handle_qpt_ud(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + void **seg, int *size, void **cur_edge) +{ + set_datagram_seg(*seg, wr); + *seg += sizeof(struct mlx5_wqe_datagram_seg); + *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = *seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + *seg += sizeof(struct mlx5_wqe_eth_pad); + *size += sizeof(struct mlx5_wqe_eth_pad) / 16; + set_eth_seg(wr, qp, seg, size, cur_edge); + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + } +} + +static int handle_qpt_reg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, unsigned int idx) +{ + int err = 0; + + if (unlikely(wr->opcode != MLX5_IB_WR_UMR)) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode %d\n", wr->opcode); + goto out; + } + + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + (*ctrl)->imm = cpu_to_be32(umr_wr(wr)->mkey); + err = set_reg_umr_segment(dev, *seg, wr); + if (unlikely(err)) + goto out; + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + set_reg_mkey_segment(dev, *seg, wr); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +out: + return err; +} + +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf; + void *cur_edge; + int size; + unsigned long flags; + unsigned int idx; + int err = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + + if (qp->type == IB_QPT_GSI) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + bf = &qp->bf; + + spin_lock_irqsave(&qp->sq.lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + num_sge = wr->num_sge; + if (unlikely(num_sge 
> qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge, + nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->opcode == IB_WR_REG_MR || + wr->opcode == IB_WR_REG_MR_INTEGRITY) { + fence = dev->umr_fence; + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + } else if (qp->type == MLX5_IB_QPT_REG_UMR) { + fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + } else { + if (wr->send_flags & IB_SEND_FENCE) { + if (qp->next_fence) + fence = MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + fence = MLX5_FENCE_MODE_FENCE; + } else { + fence = qp->next_fence; + } + } + + qp->sq.wr_data[idx] = 0; + if ((wr->opcode == IB_WR_SEND || + wr->opcode == IB_WR_SEND_WITH_IMM || + wr->opcode == IB_WR_SEND_WITH_INV) && + (wr->send_flags & IB_SEND_SIG_PIPELINED)) { + if (unlikely(!(qp->flags & IB_QP_CREATE_SIGNATURE_PIPELINE))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_SIG_PIPED; + } + + switch (qp->type) { + case IB_QPT_XRC_INI: + xrc = seg; + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + fallthrough; + case IB_QPT_RC: + err = handle_qpt_rc(dev, qp, wr, &ctrl, &seg, &size, + &cur_edge, &idx, nreq, fence, + next_fence, &num_sge); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } else if (wr->opcode == IB_WR_REG_MR_INTEGRITY) { + goto skip_psv; + } + break; + + case IB_QPT_UC: + handle_qpt_uc(wr, &seg, &size); + break; + case IB_QPT_SMI: + if (unlikely(!dev->port_caps[qp->port - 1].has_smi)) { + mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n"); + err = -EPERM; + *bad_wr = wr; + goto out; + } + fallthrough; + case MLX5_IB_QPT_HW_GSI: + handle_qpt_hw_gsi(qp, wr, &seg, &size, &cur_edge); + break; + case IB_QPT_UD: + handle_qpt_ud(qp, wr, &seg, &size, &cur_edge); + break; + case MLX5_IB_QPT_REG_UMR: + err = handle_qpt_reg_umr(dev, qp, wr, &ctrl, &seg, + &size, &cur_edge, idx); + if (unlikely(err)) + goto out; + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + } else { + for (i = 0; i < num_sge; i++) { + handle_post_send_edge(&qp->sq, &seg, size, + &cur_edge); + if (unlikely(!wr->sg_list[i].length)) + continue; + + set_data_ptr_seg( + (struct mlx5_wqe_data_seg *)seg, + wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + seg += sizeof(struct mlx5_wqe_data_seg); + } + } + + qp->next_fence = next_fence; + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq, + fence, mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell. + */ + wmb(); + + mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. 
+ */ + bf->offset ^= bf->buf_size; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int max_gs) +{ + sig->signature = calc_sig(sig, (max_gs + 1) << 2); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + + if (qp->type == IB_QPT_GSI) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind); + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, qp->rq.max_gs); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.h new file mode 100644 index 0000000..4f00575 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mlx5/wr.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#ifndef _MLX5_IB_WR_H +#define _MLX5_IB_WR_H + +#include "mlx5_ib.h" + +enum { + MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64, +}; + +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; + + +/* get_sq_edge - Get the next nearby edge. + * + * An 'edge' is defined as the first following address after the end + * of the fragment or the SQ. Accordingly, during the WQE construction + * which repetitively increases the pointer to write the next data, it + * simply should check if it gets to an edge. + * + * @sq - SQ buffer. + * @idx - Stride index in the SQ buffer. + * + * Return: + * The new edge. 
+ */ +static inline void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx) +{ + void *fragment_end; + + fragment_end = mlx5_frag_buf_get_wqe + (&sq->fbc, + mlx5_frag_buf_get_idx_last_contig_stride(&sq->fbc, idx)); + + return fragment_end + MLX5_SEND_WQE_BB; +} + +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain); +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain); + +static inline int mlx5_ib_post_send_nodrain(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return mlx5_ib_post_send(ibqp, wr, bad_wr, false); +} + +static inline int mlx5_ib_post_send_drain(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return mlx5_ib_post_send(ibqp, wr, bad_wr, true); +} + +static inline int mlx5_ib_post_recv_nodrain(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return mlx5_ib_post_recv(ibqp, wr, bad_wr, false); +} + +static inline int mlx5_ib_post_recv_drain(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return mlx5_ib_post_recv(ibqp, wr, bad_wr, true); +} +#endif /* _MLX5_IB_WR_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/Makefile new file mode 100644 index 0000000..e3064b8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o + +ib_mthca-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/main.c new file mode 100644 index 0000000..394df2f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/mthca/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ib_mthca" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 17, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_mthca dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_mthca_init(void) +{ + return 0; +} + +static void __exit ib_mthca_cleanup(void) +{ +} + +module_init(ib_mthca_init); +module_exit(ib_mthca_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/Makefile new file mode 100644 index 0000000..88165e9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o + +iw_nes-objs := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/main.c new file mode 100644 index 0000000..c4d52d6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/nes/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "iw_nes" +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("iw_nes dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init iw_nes_init(void) +{ + return 0; +} + +static void __exit iw_nes_cleanup(void) +{ +} + +module_init(iw_nes_init); +module_exit(iw_nes_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/Makefile new file mode 100644 index 0000000..470a568 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/ocrdma.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/ocrdma.c new file mode 100644 index 0000000..8b27cc3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/ocrdma/ocrdma.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ocrdma" +#define DRV_VERSION "100.1" +#define DRV_RELDATE "April 06, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ocrdma dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ocrdma_init(void) +{ + return 0; +} + +static void __exit ocrdma_cleanup(void) +{ +} + +module_init(ocrdma_init); +module_exit(ocrdma_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/Makefile new file mode 100644 index 0000000..b647443 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_QEDR) += qedr.o + +qedr-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/main.c new file mode 100644 index 0000000..dd0b148 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qedr/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "qedr" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 06, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("qedr dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init qedr_init(void) +{ + return 0; +} + +static void __exit qedr_cleanup(void) +{ +} + +module_init(qedr_init); +module_exit(qedr_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/Makefile new file mode 100644 index 0000000..1fde555 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_QIB) += ib_qib.o + +ib_qib-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/main.c new file mode 100644 index 0000000..921f066 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/qib/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ib_qib" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 07, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_qib dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_qib_init(void) +{ + return 0; +} + +static void __exit ib_qib_cleanup(void) +{ +} + +module_init(ib_qib_init); +module_exit(ib_qib_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/Makefile new file mode 100644 index 0000000..87844a2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_USNIC) += usnic_verbs.o + +usnic_verbs-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/main.c new file mode 100644 index 0000000..82f26d6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/usnic/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "usnic_verbs" +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("usnic_verbs dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init usnic_verbs_init(void) +{ + return 0; +} + +static void __exit usnic_verbs_cleanup(void) +{ +} + +module_init(usnic_verbs_init); +module_exit(usnic_verbs_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/Makefile new file mode 100644 index 0000000..78227cd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma.o + +vmw_pvrdma-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/main.c new file mode 100644 index 0000000..d7d3c56 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/hw/vmw_pvrdma/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "vmw_pvrdma" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "March 06, 2017" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("vmw_pvrdma dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init vmw_pvrdma_init(void) +{ + return 0; +} + +static void __exit vmw_pvrdma_cleanup(void) +{ +} + +module_init(vmw_pvrdma_init); +module_exit(vmw_pvrdma_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/Makefile new file mode 100644 index 0000000..68e0230 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ +obj-$(CONFIG_RDMA_RXE) += rxe/ +obj-$(CONFIG_RDMA_SIW) += siw/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/Makefile new file mode 100644 index 0000000..92df3a3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt.o + +rdmavt-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/main.c new file mode 100644 index 0000000..02cb021 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rdmavt/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "rdmavt" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 07, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("rdmavt dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init rdmavt_init(void) +{ + return 0; +} + +static void __exit rdmavt_cleanup(void) +{ +} + +module_init(rdmavt_init); +module_exit(rdmavt_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rxe/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rxe/Makefile new file mode 100644 index 0000000..546126c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/rxe/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Retpoline support: check if this is the right architecture and that +# the kernel does not support it already. +# Alternatively, if we are called from the main mlnx-ofa build system, +# CONFIG_RETPOLINE will be set by the configure script, however +# subdir-ccflags-y will be set by the toplevel Makefile. +ifneq (,$(findstring $(ARCH),i386 x86_64)) + ifndef CONFIG_RETPOLINE + ifneq (,$(shell awk 'BEGIN {if ($(VERSION).$(PATCHLEVEL) < 4.15) {print 1}}' +#include +#include + +#define DRV_NAME "rdma_rxe" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 20, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("rdma_rxe dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init rdma_rxe_init(void) +{ + return 0; +} + +static void __exit rdma_rxe_cleanup(void) +{ +} + +module_init(rdma_rxe_init); +module_exit(rdma_rxe_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/Makefile new file mode 100644 index 0000000..ee936f9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_RDMA_SIW) += siw.o + +siw-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/main.c new file mode 100644 index 0000000..47b54bf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/sw/siw/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "siw" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "February 09, 2020" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("siw dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init siw_init(void) +{ + return 0; +} + +static void __exit siw_cleanup(void) +{ +} + +module_init(siw_init); +module_exit(siw_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/Makefile new file mode 100644 index 0000000..82a7170 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib$(CONFIG_IPOIB_VERSION)/ +obj-$(CONFIG_INFINIBAND_SRP) += srp/ +obj-$(CONFIG_INFINIBAND_SRP_DUMMY) += srp/ +obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ +obj-$(CONFIG_INFINIBAND_ISER) += iser/ +obj-$(CONFIG_INFINIBAND_ISER_DUMMY) += iser/ +obj-$(CONFIG_INFINIBAND_ISERT) += isert/ +obj-$(CONFIG_INFINIBAND_ISERT_DUMMY) += isert/ +obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic/ +obj-$(CONFIG_INFINIBAND_RTRS) += rtrs/ +obj-$(CONFIG_INFINIBAND_XSCORE) += xsigo/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Kconfig new file mode 100644 index 0000000..254e31a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Kconfig @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_IPOIB + tristate "IP-over-InfiniBand" + depends on NETDEVICES && INET && (IPV6 || IPV6=n) + help + Support for the IP-over-InfiniBand protocol (IPoIB). This + transports IP packets over InfiniBand so you can use your IB + device as a fancy NIC. + + See Documentation/infiniband/ipoib.rst for more information + +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB + default n + help + This option enables support for IPoIB connected mode. After + enabling this option, you need to switch to connected mode + through /sys/class/net/ibXXX/mode to actually create + connections, and then increase the interface MTU with + e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some packet + drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. + +config INFINIBAND_IPOIB_DEBUG + bool "IP-over-InfiniBand debugging" if EXPERT + depends on INFINIBAND_IPOIB + default y + help + This option causes debugging code to be compiled into the + IPoIB driver. The output can be turned on via the + debug_level and mcast_debug_level module parameters (which + can also be set after the driver is loaded through sysfs). + + This option also creates a directory tree under ipoib/ in + debugfs, which contains files that expose debugging + information about IB multicast groups used by the IPoIB + driver. + +config INFINIBAND_IPOIB_DEBUG_DATA + bool "IP-over-InfiniBand data path debugging" + depends on INFINIBAND_IPOIB_DEBUG + help + This option compiles debugging code into the data path + of the IPoIB driver. 
The output can be turned on via the + data_debug_level module parameter; however, even with output + turned off, this debugging code will have some performance + impact. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Makefile new file mode 100644 index 0000000..ba82238 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o + +ib_ipoib-y := ipoib_main.o \ + ipoib_ib.o \ + ipoib_multicast.o \ + ipoib_verbs.o \ + ipoib_vlan.o \ + ipoib_ethtool.o \ + ipoib_netlink.o \ + ipoib_genetlink.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib.h new file mode 100644 index 0000000..3ecee88 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib.h @@ -0,0 +1,868 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _IPOIB_H +#define _IPOIB_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +/* constants */ + +enum ipoib_flush_level { + IPOIB_FLUSH_LIGHT, + IPOIB_FLUSH_NORMAL, + IPOIB_FLUSH_HEAVY +}; + +enum { + IPOIB_ENCAP_LEN = 4, + IPOIB_PSEUDO_LEN = 20, + IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN, + + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, + IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ + + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_RX_RING_SIZE = 256, + IPOIB_TX_RING_SIZE = 128, + IPOIB_MAX_QUEUE_SIZE = 8192, + IPOIB_MIN_QUEUE_SIZE = 2, + IPOIB_CM_MAX_CONN_QP = 4096, + + IPOIB_NUM_WC = 64, + + IPOIB_MAX_PATH_REC_QUEUE = 3, + IPOIB_MAX_MCAST_QUEUE = 64, + + IPOIB_FLAG_OPER_UP = 0, + IPOIB_FLAG_INITIALIZED = 1, + IPOIB_FLAG_ADMIN_UP = 2, + IPOIB_PKEY_ASSIGNED = 3, + IPOIB_FLAG_SUBINTERFACE = 5, + IPOIB_STOP_REAPER = 7, + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, + IPOIB_NEIGH_TBL_FLUSH = 12, + IPOIB_FLAG_DEV_ADDR_SET = 13, + IPOIB_FLAG_DEV_ADDR_CTRL = 14, + + IPOIB_MAX_BACKOFF_SECONDS = 16, + + IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ + IPOIB_MCAST_FLAG_SENDONLY = 1, + /* + * For IPOIB_MCAST_FLAG_BUSY + * When set, in flight join and mcast->mc is unreliable + * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or + * haven't started yet + * When clear and mcast->mc is valid pointer, join was successful + */ + IPOIB_MCAST_FLAG_BUSY = 2, + IPOIB_MCAST_FLAG_ATTACHED = 3, + + MAX_SEND_CQE = 64, + IPOIB_CM_COPYBREAK = 256, + + IPOIB_MAX_INLINE_SIZE = 120, + + IPOIB_NON_CHILD = 0, + IPOIB_LEGACY_CHILD = 1, + IPOIB_RTNL_CHILD = 2, +}; + +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_OP_CM (1ul << 30) +#else +#define IPOIB_OP_CM (0) +#endif + +#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF)) + +/* structs */ + +struct ipoib_header { + __be16 proto; + u16 reserved; +}; + +struct ipoib_pseudo_header { + u8 hwaddr[INFINIBAND_ALEN]; +}; + +static inline void skb_add_pseudo_hdr(struct sk_buff *skb) +{ + char *data = skb_push(skb, IPOIB_PSEUDO_LEN); + + /* + * only the ipoib header is present now, make room for a dummy + * pseudo header and set skb field accordingly + */ + memset(data, 0, IPOIB_PSEUDO_LEN); + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_HARD_LEN); +} + +static inline struct ipoib_dev_priv *ipoib_priv(const struct net_device *dev) +{ + struct rdma_netdev *rn = netdev_priv(dev); + + return rn->clnt_priv; +} + +/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +struct ipoib_mcast { + struct ib_sa_mcmember_rec mcmember; + struct ib_sa_multicast *mc; + struct ipoib_ah *ah; + + struct rb_node rb_node; + struct list_head list; + + unsigned long created; + unsigned long backoff; + unsigned long delay_until; + + unsigned long flags; + unsigned char logcount; + + struct list_head neigh_list; + + struct sk_buff_head pkt_queue; + + struct net_device *dev; + struct completion done; +}; + +struct ipoib_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; +}; + +struct ipoib_tx_buf { + struct sk_buff *skb; + u64 mapping[MAX_SKB_FRAGS + 1]; + u32 is_inline; +}; + +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte MUST be 
ignored on receive */ + __be32 mtu; +}; + +/* + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the second option and wait for a completion on the + * same CQ before destroying QPs attached to our SRQ. + */ + +enum ipoib_cm_state { + IPOIB_CM_RX_LIVE, + IPOIB_CM_RX_ERROR, /* Ignored by stale task */ + IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct ipoib_cm_rx_buf *rx_ring; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; + enum ipoib_cm_state state; + int recv_count; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_tx_buf *tx_ring; + unsigned int tx_head; + unsigned int tx_tail; + unsigned long flags; + u32 mtu; + unsigned int max_send_sge; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; /* state: LIVE */ + struct list_head rx_error_list; /* state: ERROR */ + struct list_head rx_flush_list; /* state: FLUSH, drain not started */ + struct list_head rx_drain_list; /* state: FLUSH, drain started */ + struct list_head rx_reap_list; /* state: FLUSH, drain done */ + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct work_struct rx_reap_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; + int nonsrq_conn_qp; + int max_cm_mtu; + int num_frags; +}; + +struct ipoib_ethtool_st { + u16 coalesce_usecs; + u16 max_coalesced_frames; +}; + +struct ipoib_neigh_table; + +struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; + struct ipoib_neigh __rcu **buckets; + struct rcu_head rcu; + u32 mask; + u32 size; +}; + +struct ipoib_neigh_table { + struct ipoib_neigh_hash __rcu *htbl; + atomic_t entries; + struct completion flushed; + struct completion deleted; +}; + +struct ipoib_qp_state_validate { + struct work_struct work; + struct ipoib_dev_priv *priv; +}; + +struct ipoib_arp_repath { + struct work_struct work; + u16 lid; + union ib_gid sgid; + struct net_device *dev; +}; + +/* + * Device private locking: 
network stack tx_lock protects members used + * in TX fast path, lock protects everything else. lock nests inside + * of tx_lock (ie tx_lock must be acquired first if needed). + */ +struct ipoib_dev_priv { + spinlock_t lock; + + struct net_device *dev; + void (*next_priv_destructor)(struct net_device *dev); + + struct napi_struct send_napi; + struct napi_struct recv_napi; + + unsigned long flags; + + /* + * This protects access to the child_intfs list. + * To READ from child_intfs the RTNL or vlan_rwsem read side must be + * held. To WRITE RTNL and the vlan_rwsem write side must be held (in + * that order) This lock exists because we have a few contexts where + * we need the child_intfs, but do not want to grab the RTNL. + */ + struct rw_semaphore vlan_rwsem; + struct mutex mcast_mutex; + + struct rb_root path_tree; + struct list_head path_list; + + struct ipoib_neigh_table ntbl; + + struct ipoib_mcast *broadcast; + struct list_head multicast_list; + struct rb_root multicast_tree; + + struct workqueue_struct *wq; + struct delayed_work mcast_task; + struct work_struct carrier_on_task; + struct work_struct reschedule_napi_work; + struct work_struct flush_light; + struct work_struct flush_normal; + struct work_struct flush_heavy; + struct work_struct restart_task; + struct work_struct tx_timeout_work; + struct delayed_work ah_reap_task; + struct delayed_work neigh_reap_task; + struct ib_device *ca; + u8 port; + u16 pkey; + u16 pkey_index; + struct ib_pd *pd; + struct ib_cq *recv_cq; + struct ib_cq *send_cq; + struct ib_qp *qp; + u32 qkey; + + union ib_gid local_gid; + u32 local_lid; + + unsigned int admin_mtu; + unsigned int mcast_mtu; + unsigned int max_ib_mtu; + + struct ipoib_rx_buf *rx_ring; + + struct ipoib_tx_buf *tx_ring; + /* cyclic ring variables for managing tx_ring, for UD only */ + unsigned int tx_head; + unsigned int tx_tail; + /* cyclic ring variables for counting overall outstanding send WRs */ + unsigned int global_tx_head; + unsigned int global_tx_tail; + struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; + struct ib_ud_wr tx_wr; + struct ib_wc send_wc[MAX_SEND_CQE]; + + struct ib_recv_wr rx_wr; + struct ib_sge rx_sge[IPOIB_UD_RX_SG]; + + struct ib_wc ibwc[IPOIB_NUM_WC]; + + struct list_head dead_ahs; + + struct ib_event_handler event_handler; + + struct net_device *parent; + struct list_head child_intfs; + struct list_head list; + int child_type; + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + struct list_head fs_list; + struct dentry *mcg_dentry; + struct dentry *path_dentry; +#endif + u64 hca_caps; + struct ipoib_ethtool_st ethtool; + u32 sendq_size; + u32 recvq_size; + unsigned int max_send_sge; + const struct net_device_ops *rn_ops; +}; + +struct ipoib_ah { + struct net_device *dev; + struct ib_ah *ah; + struct list_head list; + struct kref ref; + unsigned int last_send; + int valid; +}; + +struct ipoib_path { + struct net_device *dev; + struct sa_path_rec pathrec; + struct ipoib_ah *ah; + struct sk_buff_head queue; + + struct list_head neigh_list; + + int query_id; + struct ib_sa_query *query; + struct completion done; + + struct rb_node rb_node; + struct list_head list; +}; + +struct ipoib_neigh { + struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif + u8 daddr[INFINIBAND_ALEN]; + struct sk_buff_head queue; + + struct net_device *dev; + + struct list_head list; + struct ipoib_neigh __rcu *hnext; + struct rcu_head rcu; + refcount_t refcnt; + unsigned long alive; +}; + 
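/*
 * Editor's illustrative sketch (not part of the original patch): the
 * "Device private locking" comment above states that priv->lock nests
 * inside the network stack TX lock, i.e. tx_lock must be taken first
 * when both are needed.  The struct and field names are the real ipoib
 * ones; the function itself is hypothetical and only shows the order.
 */
static inline void ipoib_locking_order_sketch(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	unsigned long flags;

	netif_tx_lock_bh(dev);				/* outer: stack TX lock */
	spin_lock_irqsave(&priv->lock, flags);		/* inner: device private lock */

	/* ... touch TX fast-path state and other private members ... */

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}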
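/*
 * Editor's illustrative sketch (not part of the original patch): the
 * IB spec excerpt quoted earlier in this header (10.3.1) describes how
 * a QP associated with an SRQ should be torn down.  This hypothetical
 * helper only shows the shape of that sequence using standard verbs;
 * waiting for the Last WQE Reached event and draining the CQ are left
 * as comments, and error handling is omitted.
 */
static inline void ipoib_srq_qp_teardown_sketch(struct ib_qp *qp)
{
	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };

	/* 1. Move the QP to the Error state. */
	ib_modify_qp(qp, &attr, IB_QP_STATE);

	/*
	 * 2. Wait for the Affiliated Asynchronous Last WQE Reached event,
	 *    then either poll the CQ until it is empty or post one more WR
	 *    on the same CQ and wait for its completion (the comment above
	 *    notes that the second option is used here).
	 */

	/* 3. Only then destroy (or reset) the QP. */
	ib_destroy_qp(qp);
}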
+#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) +#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) +{ + if (refcount_dec_and_test(&neigh->refcnt)) + ipoib_neigh_dtor(neigh); +} +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); + +extern struct workqueue_struct *ipoib_workqueue; + +/* functions */ + +int ipoib_rx_poll(struct napi_struct *napi, int budget); +int ipoib_tx_poll(struct napi_struct *napi, int budget); +void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr); +void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct rdma_ah_attr *attr); +void ipoib_free_ah(struct kref *kref); +static inline void ipoib_put_ah(struct ipoib_ah *ah) +{ + kref_put(&ah->ref, ipoib_free_ah); +} +int ipoib_open(struct net_device *dev); +void ipoib_intf_free(struct net_device *dev); +int ipoib_add_pkey_attr(struct net_device *dev); +int ipoib_add_umcast_attr(struct net_device *dev); + +int ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn); +void ipoib_reap_ah(struct work_struct *work); +void ipoib_repath_ah(struct work_struct *work); + +void ipoib_napi_schedule_work(struct work_struct *work); +struct ipoib_path *__path_find(struct net_device *dev, void *gid); +void ipoib_mark_paths_invalid(struct net_device *dev); +void ipoib_flush_paths(struct net_device *dev); +struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, + const char *format); +int ipoib_intf_init(struct ib_device *hca, u32 port, const char *format, + struct net_device *dev); +void ipoib_ib_tx_timer_func(struct timer_list *t); +void ipoib_ib_dev_flush_light(struct work_struct *work); +void ipoib_ib_dev_flush_normal(struct work_struct *work); +void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_ib_tx_timeout_work(struct work_struct *work); +void ipoib_pkey_event(struct work_struct *work); +void ipoib_ib_dev_cleanup(struct net_device *dev); + +int ipoib_ib_dev_open_default(struct net_device *dev); +int ipoib_ib_dev_open(struct net_device *dev); +void ipoib_ib_dev_stop(struct net_device *dev); +void ipoib_ib_dev_up(struct net_device *dev); +void ipoib_ib_dev_down(struct net_device *dev); +int ipoib_ib_dev_stop_default(struct net_device *dev); +void ipoib_pkey_dev_check_presence(struct net_device *dev); + +void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_carrier_on_task(struct work_struct *work); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); + +void ipoib_mcast_restart_task(struct work_struct *work); +void ipoib_mcast_start_thread(struct net_device *dev); +void ipoib_mcast_stop_thread(struct net_device *dev); + +void ipoib_mcast_dev_down(struct net_device *dev); +void ipoib_mcast_dev_flush(struct net_device *dev); + +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); +void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req); + +struct rtnl_link_ops *ipoib_get_link_ops(void); + +static inline void ipoib_build_sge(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req) +{ + int i, off; + struct sk_buff *skb = tx_req->skb; + 
skb_frag_t *frags = skb_shinfo(skb)->frags; + int nr_frags = skb_shinfo(skb)->nr_frags; + u64 *mapping = tx_req->mapping; + + if (skb_headlen(skb)) { + priv->tx_sge[0].addr = mapping[0]; + priv->tx_sge[0].length = skb_headlen(skb); + off = 1; + } else + off = 0; + + for (i = 0; i < nr_frags; ++i) { + priv->tx_sge[i + off].addr = mapping[i + off]; + priv->tx_sge[i + off].length = skb_frag_size(&frags[i]); + } + priv->tx_wr.wr.num_sge = nr_frags + off; +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *gid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only); + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev); +int ipoib_path_iter_next(struct ipoib_path_iter *iter); +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path); +#endif + +int ipoib_mcast_attach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid, int set_qkey, u32 qkey); +int ipoib_mcast_detach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid); +void ipoib_mcast_remove_list(struct list_head *remove_list); +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list); + +int ipoib_init_qp(struct net_device *dev); +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); +void ipoib_transport_dev_cleanup(struct net_device *dev); + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record); + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); + +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int child_type); + +int __init ipoib_netlink_init(void); +void __exit ipoib_netlink_fini(void); + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val); +int ipoib_set_mode(struct net_device *dev, const char *buf); + +void ipoib_setup_common(struct net_device *dev); + +void ipoib_pkey_open(struct ipoib_dev_priv *priv); +void ipoib_drain_cq(struct net_device *dev); + +void ipoib_set_ethtool_ops(struct net_device *dev); + +int ipoib_register_genl(void); +void ipoib_unregister_genl(void); + +void ipoib_path_add_notify(struct ipoib_dev_priv *priv, + struct sa_path_rec *pathrec); + +void ipoib_path_del_notify(struct ipoib_dev_priv *priv, + struct sa_path_rec *pathrec); +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +extern int ipoib_max_conn_qp; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return IPOIB_CM_SUPPORTED(hwaddr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void 
ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return !!priv->cm.srq; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + return priv->cm.max_cm_mtu; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +#define ipoib_max_conn_qp 0 + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + return 0; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -EOPNOTSUPP; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +void ipoib_create_debug_files(struct net_device *dev); +void ipoib_delete_debug_files(struct net_device *dev); +void ipoib_register_debugfs(void); +void ipoib_unregister_debugfs(void); +#else +static inline void ipoib_create_debug_files(struct net_device *dev) { } +static inline void ipoib_delete_debug_files(struct net_device *dev) { } +static inline void ipoib_register_debugfs(void) { } +static inline void ipoib_unregister_debugfs(void) { } +#endif + +#define ipoib_printk(level, priv, format, arg...) 
\ + printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg) +#define ipoib_warn(priv, format, arg...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + 10 * HZ /*10 seconds */, \ + 100); \ + if (__ratelimit(&_rs)) \ + ipoib_printk(KERN_WARNING, priv, format , ## arg);\ +} while (0) + +extern int ipoib_sendq_size; +extern int ipoib_recvq_size; +extern u32 ipoib_inline_thold; + +extern struct ib_sa_client ipoib_sa_client; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +extern int ipoib_debug_level; + +#define ipoib_dbg(priv, format, arg...) \ + do { \ + if (ipoib_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { \ + if (mcast_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ +#define ipoib_dbg(priv, format, arg...) \ + do { (void) (priv); } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +#define ipoib_dbg_data(priv, format, arg...) \ + do { \ + if (data_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ +#define ipoib_dbg_data(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ + +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) + +#endif /* _IPOIB_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_cm.c new file mode 100644 index 0000000..5ac8a29 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -0,0 +1,1708 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipoib.h" + +int ipoib_max_conn_qp = 128; +u32 ipoib_inline_thold = IPOIB_MAX_INLINE_SIZE; + +module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); +MODULE_PARM_DESC(max_nonsrq_conn_qp, + "Max number of connected-mode QPs per interface " + "(applied only if shared receive queue is not available)"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN) + +static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR +}; + +#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff + +static struct ib_send_wr ipoib_cm_rx_drain_wr = { + .opcode = IB_WR_SEND, +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < frags; ++i) + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < priv->cm.num_frags; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, NULL); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, + priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i, ret; + + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + sge[i].addr = rx->rx_ring[id].mapping[i]; + + ret = ib_post_recv(rx->qp, wr, NULL); + if (unlikely(ret)) { + ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx->rx_ring[id].mapping); + dev_kfree_skb_any(rx->rx_ring[id].skb); + rx->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring, + int id, int frags, + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16)); + if (unlikely(!skb)) + return NULL; + + /* + * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the + * IP header to a multiple of 16. 
+ */ + skb_reserve(skb, IPOIB_CM_RX_RESERVE); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return NULL; + } + + for (i = 0; i < frags; i++) { + struct page *page = alloc_page(gfp); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + rx_ring[id].skb = skb; + return skb; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i > 0; --i) + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + + dev_kfree_skb_any(skb); + return NULL; +} + +static void ipoib_cm_free_rx_ring(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i; + + for (i = 0; i < priv->recvq_size; ++i) + if (rx_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx_ring[i].mapping); + dev_kfree_skb_any(rx_ring[i].skb); + } + + vfree(rx_ring); +} + +static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) +{ + struct ipoib_cm_rx *p; + + /* We only reserved 1 extra slot in CQ for drain WRs, so + * make sure we have at most 1 outstanding WR. */ + if (list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) + return; + + /* + * QPs on flush list are error state. This way, a "flush + * error" WC will be immediately generated for each WR we post. + */ + p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID; + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, NULL)) + ipoib_warn(priv, "failed to post drain wr\n"); + + list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); +} + +static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) +{ + struct ipoib_cm_rx *p = ctx; + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + unsigned long flags; + + if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) + return; + + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_flush_list); + p->state = IPOIB_CM_RX_FLUSH; + ipoib_cm_start_rx_drain(priv); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_init_attr attr = { + .event_handler = ipoib_cm_rx_event_handler, + .send_cq = priv->recv_cq, /* For drain WR */ + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* For drain WR */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + + if (!ipoib_cm_has_srq(dev)) { + attr.cap.max_recv_wr = priv->recvq_size; + attr.cap.max_recv_sge = IPOIB_CM_RX_SG; + } + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned int psn) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + return ret; + } + ret = ib_modify_qp(qp, 
&qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + /* + * Current Mellanox HCA firmware won't generate completions + * with error for drain WRs unless the QP has been moved to + * RTS first. This work-around leaves a window where a QP has + * moved to error asynchronously, but this will eventually get + * fixed in firmware, so let's not error out if modify QP + * fails. + */ + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return 0; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return 0; + } + + return 0; +} + +static void ipoib_cm_init_rx_wr(struct net_device *dev, + struct ib_recv_wr *wr, + struct ib_sge *sge) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->pd->local_dma_lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = sge; + wr->num_sge = priv->cm.num_frags; +} + +static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, + struct ipoib_cm_rx *rx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct { + struct ib_recv_wr wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; + int ret; + int i; + + rx->rx_ring = vzalloc(array_size(priv->recvq_size, + sizeof(*rx->rx_ring))); + if (!rx->rx_ring) + return -ENOMEM; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free_1; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + + spin_lock_irq(&priv->lock); + + if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { + spin_unlock_irq(&priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); + ret = -EINVAL; + goto err_free; + } else + ++priv->cm.nonsrq_conn_qp; + + spin_unlock_irq(&priv->lock); + + for (i = 0; i < priv->recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, + rx->rx_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ret = -ENOMEM; + goto err_count; + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); + if (ret) { + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " + "failed for buf %d\n", i); + ret = -EIO; + goto err_count; + } + } + + rx->recv_count = priv->recvq_size; + + kfree(t); + + return 0; + +err_count: + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + +err_free: + kfree(t); + +err_free_1: + ipoib_cm_free_rx_ring(dev, rx->rx_ring); + + return ret; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, + const struct ib_cm_req_event_param *req, + unsigned int psn) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + 
rep.private_data_len = sizeof(data); + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.srq = ipoib_cm_has_srq(dev); + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx *p; + unsigned int psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + cm_id->context = p; + p->state = IPOIB_CM_RX_LIVE; + p->jiffies = jiffies; + INIT_LIST_HEAD(&p->list); + + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = prandom_u32() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + if (!ipoib_cm_has_srq(dev)) { + ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p); + if (ret) + goto err_modify; + } + + spin_lock_irq(&priv->lock); + queue_delayed_work(priv->wq, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + /* Add this entry to passive ids list head, but do not re-add it + * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ + p->jiffies = jiffies; + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irq(&priv->lock); + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + } + return 0; + +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + ib_send_cm_drep(cm_id, NULL, 0); + fallthrough; + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = ipoib_priv(p->dev); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + fallthrough; + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length, struct sk_buff *toskb) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + skb_fill_page_desc(toskb, i, skb_frag_page(frag), + 0, PAGE_SIZE); + --skb_shinfo(skb)->nr_frags; + } else { + size = min_t(unsigned int, length, PAGE_SIZE); + + skb_frag_size_set(frag, size); + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx_buf *rx_ring; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); + struct sk_buff *skb, *newskb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + int frags; + int 
has_srq; + struct sk_buff *small_skb; + + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= priv->recvq_size)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { + spin_lock_irqsave(&priv->lock, flags); + list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); + ipoib_cm_start_rx_drain(priv); + queue_work(priv->wq, &priv->cm.rx_reap_task); + spin_unlock_irqrestore(&priv->lock, flags); + } else + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, priv->recvq_size); + return; + } + + p = wc->qp->qp_context; + + has_srq = ipoib_cm_has_srq(dev); + rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; + + skb = rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, + "cm recv error (status=%d, wrid=%d vend_err %#x)\n", + wc->status, wr_id, wc->vendor_err); + ++dev->stats.rx_dropped; + if (has_srq) + goto repost; + else { + if (!--p->recv_count) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_reap_list); + spin_unlock_irqrestore(&priv->lock, flags); + queue_work(priv->wq, &priv->cm.rx_reap_task); + } + return; + } + } + + if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { + if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do not re-add it + * if it has been moved out of list. */ + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + } + } + + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + + small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE); + if (small_skb) { + skb_reserve(small_skb, IPOIB_CM_RX_RESERVE); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); + ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_put(small_skb, dlen); + skb = small_skb; + goto copied; + } + } + + frags = PAGE_ALIGN(wc->byte_len - + min_t(u32, wc->byte_len, IPOIB_CM_HEAD_SIZE)) / + PAGE_SIZE; + + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); + if (unlikely(!newskb)) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. 
+ */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof(*mapping)); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + +copied: + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_add_pseudo_hdr(skb); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + +repost: + if (has_srq) { + if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " + "for buf %d\n", wr_id); + } else { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { + --p->recv_count; + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " + "for buf %d\n", wr_id); + } + } +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + struct ipoib_tx_buf *tx_req) +{ + ipoib_build_sge(priv, tx_req); + + priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM; + + return ib_post_send(tx->qp, &priv->tx_wr.wr, NULL); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_tx_buf *tx_req; + int rc; + unsigned int usable_sge = tx->max_send_sge - !!skb_headlen(skb); + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, tx->mtu); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + return; + } + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). 
+ */ + tx_req = &tx->tx_ring[tx->tx_head & (priv->sendq_size - 1)]; + tx_req->skb = skb; + + if (skb->len < ipoib_inline_thold && !skb_shinfo(skb)->nr_frags) { + tx_req->mapping[0] = (u64)skb->data; + priv->tx_wr.wr.send_flags |= IB_SEND_INLINE; + tx_req->is_inline = 1; + } else { + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + tx_req->is_inline = 0; + priv->tx_wr.wr.send_flags &= ~IB_SEND_INLINE; + } + + if ((priv->global_tx_head - priv->global_tx_tail) == + priv->sendq_size - 1) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + } + + skb_orphan(skb); + skb_dst_drop(skb); + + if (netif_queue_stopped(dev)) { + rc = ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + if (unlikely(rc < 0)) + ipoib_warn(priv, "IPoIB/CM:request notify on send CQ failed\n"); + else if (rc) + napi_schedule(&priv->send_napi); + } + + rc = post_send(priv, tx, tx->tx_head & (priv->sendq_size - 1), tx_req); + if (unlikely(rc)) { + ipoib_warn(priv, "IPoIB/CM:post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + if (!tx_req->is_inline) + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(skb); + + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + } else { + netif_trans_update(dev); + ++tx->tx_head; + ++priv->global_tx_head; + } +} + +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; + struct ipoib_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= priv->sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, priv->sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + /* Checking whether inline send was used - nothing to unmap */ + if (!tx_req->is_inline) + ipoib_dma_unmap_tx(priv, tx_req); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + netif_tx_lock(dev); + + ++tx->tx_tail; + ++priv->global_tx_tail; + + if (unlikely(netif_queue_stopped(dev) && + ((priv->global_tx_head - priv->global_tx_tail) <= + priv->sendq_size >> 1) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + /* IB_WC[_RNR]_RETRY_EXC_ERR error is part of the life cycle, + * so don't make waves. 
+ */ + if (wc->status == IB_WC_RNR_RETRY_EXC_ERR || + wc->status == IB_WC_RETRY_EXC_ERR) + ipoib_dbg(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n", + __func__, wc->status, wr_id, wc->vendor_err); + else + ipoib_warn(priv, + "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n", + __func__, wc->status, wr_id, wc->vendor_err); + + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(priv->wq, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock_irqrestore(&priv->lock, flags); + } + + netif_tx_unlock(dev); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + pr_warn("%s: failed to create CM ID\n", priv->ca->name); + ret = PTR_ERR(priv->cm.id); + goto err_cm; + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0); + if (ret) { + pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); + goto err_listen; + } + + return 0; + +err_listen: + ib_destroy_cm_id(priv->cm.id); +err_cm: + priv->cm.id = NULL; + return ret; +} + +static void ipoib_cm_free_rx_reap_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx *rx, *n; + LIST_HEAD(list); + + spin_lock_irq(&priv->lock); + list_splice_init(&priv->cm.rx_reap_list, &list); + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(rx, n, &list, list) { + ib_destroy_cm_id(rx->id); + ib_destroy_qp(rx->qp); + if (!ipoib_cm_has_srq(dev)) { + ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring); + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + } + kfree(rx); + } +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_rx *p; + unsigned long begin; + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) + return; + + ib_destroy_cm_id(priv->cm.id); + priv->cm.id = NULL; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + /* Wait for all RX to be drained */ + begin = jiffies; + + while (!list_empty(&priv->cm.rx_error_list) || + !list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "RX drain timing out\n"); + + /* + * assume the HW is wedged and just free up everything. 
+ */ + list_splice_init(&priv->cm.rx_flush_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_error_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_drain_list, + &priv->cm.rx_reap_list); + break; + } + spin_unlock_irq(&priv->lock); + usleep_range(1000, 2000); + ipoib_drain_cq(dev); + spin_lock_irq(&priv->lock); + } + + spin_unlock_irq(&priv->lock); + + ipoib_cm_free_rx_reap_list(dev); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu <= IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", + p->mtu, IPOIB_ENCAP_LEN); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + netif_tx_lock_bh(p->dev); + spin_lock_irq(&priv->lock); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + __skb_queue_tail(&skqueue, skb); + spin_unlock_irq(&priv->lock); + netif_tx_unlock_bh(p->dev); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->send_cq, + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = priv->sendq_size, + .cap.max_send_sge = 1, + .cap.max_inline_data = IPOIB_MAX_INLINE_SIZE, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = tx, + .create_flags = 0 + }; + struct ib_qp *tx_qp; + + if (dev->features & NETIF_F_SG) + attr.cap.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge, + MAX_SKB_FRAGS + 1); + + tx_qp = ib_create_qp(priv->pd, &attr); + tx->max_send_sge = attr.cap.max_send_sge; + return tx_qp; +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); 
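+ /*
+ * The CM service ID is the fixed IETF IPoIB prefix OR'd with the
+ * destination's UD QPN; this matches the ID the passive side
+ * listens on in ipoib_cm_dev_open().
+ */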
+ req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof(data); + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = ipoib_cm_has_srq(dev); + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.pkey_index = priv->pkey_index; + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + unsigned int noio_flag; + int ret; + + noio_flag = memalloc_noio_save(); + p->tx_ring = vzalloc(array_size(priv->sendq_size, sizeof(*p->tx_ring))); + if (!p->tx_ring) { + memalloc_noio_restore(noio_flag); + ret = -ENOMEM; + goto err_tx; + } + + p->qp = ipoib_cm_create_tx_qp(p->dev, p); + memalloc_noio_restore(noio_flag); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to create tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify_send; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_modify_send; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n", + p->qp->qp_num, pathrec->dgid.raw, qpn); + + return 0; + +err_modify_send: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_qp: + p->qp = NULL; + vfree(p->tx_ring); +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = ipoib_priv(p->dev); + struct ipoib_tx_buf *tx_req; + unsigned long begin; + int num_tries = 0; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? 
p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + /* arming cq*/ + ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + + if (p->id) + ib_destroy_cm_id(p->id); + + if (p->qp) { + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "%s: Failed to modify QP to ERROR state\n", + __func__); + } + + if (p->tx_ring) { + /* Wait for all sends to complete */ + begin = jiffies; + while ((int) p->tx_tail - (int) p->tx_head < 0) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends not completed\n", + p->tx_head - p->tx_tail); + /* + * check if we are in napi_disable state + * (in port/module down etc.), or if send queue + * is closed, then force drain over the qp + * in order to get all the wc's. + */ + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) || + netif_queue_stopped(p->dev)) + ipoib_drain_cq(p->dev); + + /* arming cq*/ + ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + + begin = jiffies; + num_tries++; + if (num_tries == 5) { + ipoib_warn(priv, "%s: %d not completed for QP: 0x%x force cleanup.\n", + __func__, p->tx_head - p->tx_tail, p->qp->qp_num); + goto timeout; + } + } + + msleep(5); + } + } + +timeout: + + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (priv->sendq_size - 1)]; + if (!tx_req->is_inline) + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(tx_req->skb); + netif_tx_lock_bh(p->dev); + ++p->tx_tail; + ++priv->global_tx_tail; + if (unlikely((priv->global_tx_head - priv->global_tx_tail) <= + priv->sendq_size >> 1) && + netif_queue_stopped(p->dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(p->dev); + netif_tx_unlock_bh(p->dev); + } + + if (p->qp) + ib_destroy_qp(p->qp); + + vfree(p->tx_ring); + kfree(p); +} + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) +{ + struct ipoib_cm_tx *tx = cm_id->context; + struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_DREQ_RECEIVED: + ipoib_dbg(priv, "DREQ received.\n"); + ib_send_cm_drep(cm_id, NULL, 0); + break; + case IB_CM_REP_RECEIVED: + ipoib_dbg(priv, "REP received.\n"); + ret = ipoib_cm_rep_handler(cm_id, event); + if (ret) + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_REQ_ERROR: + case IB_CM_REJ_RECEIVED: + case IB_CM_TIMEWAIT_EXIT: + ipoib_dbg(priv, "CM error %d.\n", event->event); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(priv->wq, &priv->cm.reap_task); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + break; + default: + break; + } + + return 0; +} + +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_cm_tx *tx; + + tx = kzalloc(sizeof(*tx), GFP_ATOMIC); + if (!tx) + return NULL; + + neigh->cm = tx; + tx->neigh = neigh; + tx->dev = dev; + list_add(&tx->list, &priv->cm.start_list); + set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); + queue_work(priv->wq, &priv->cm.start_task); + return tx; +} + 
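+/*
+ * Lifecycle note: a connected-mode TX context created above is queued
+ * on cm.start_list and brought up from ipoib_cm_tx_start(), which looks
+ * up the path and calls ipoib_cm_tx_init() to create the RC QP and send
+ * the CM REQ; the REP is handled by ipoib_cm_rep_handler(), which moves
+ * the QP to RTS and re-queues any skbs parked on the neighbour.
+ * Teardown normally goes through cm.reap_list, where ipoib_cm_tx_reap()
+ * calls ipoib_cm_tx_destroy().
+ */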
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); + unsigned long flags; + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&tx->list, &priv->cm.reap_list); + queue_work(priv->wq, &priv->cm.reap_task); + ipoib_dbg(priv, "Reap connection for gid %pI6\n", + tx->neigh->daddr + 4); + tx->neigh = NULL; + spin_unlock_irqrestore(&priv->lock, flags); + } +} + +#define QPN_AND_OPTIONS_OFFSET 4 + +static void ipoib_cm_tx_start(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.start_task); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + struct ipoib_cm_tx *p; + unsigned long flags; + struct ipoib_path *path; + int ret; + + struct sa_path_rec pathrec; + u32 qpn; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.start_list)) { + p = list_entry(priv->cm.start_list.next, typeof(*p), list); + list_del_init(&p->list); + neigh = p->neigh; + + qpn = IPOIB_QPN(neigh->daddr); + /* + * As long as the search is with these 2 locks, + * path existence indicates its validity. + */ + path = __path_find(dev, neigh->daddr + QPN_AND_OPTIONS_OFFSET); + if (!path) { + pr_info("%s ignore not valid path %pI6\n", + __func__, + neigh->daddr + QPN_AND_OPTIONS_OFFSET); + goto free_neigh; + } + memcpy(&pathrec, &path->pathrec, sizeof(pathrec)); + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + ret = ipoib_cm_tx_init(p, qpn, &pathrec); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + if (ret) { +free_neigh: + neigh = p->neigh; + if (neigh) { + neigh->cm = NULL; + ipoib_neigh_free(neigh); + } + list_del(&p->list); + kfree(p); + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_tx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.reap_task); + struct net_device *dev = priv->dev; + struct ipoib_cm_tx *p; + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while (!list_empty(&priv->cm.reap_list)) { + p = list_entry(priv->cm.reap_list.next, typeof(*p), list); + list_del_init(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + ipoib_cm_tx_destroy(p); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void ipoib_cm_skb_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.skb_task); + struct net_device *dev = priv->dev; + struct sk_buff *skb; + unsigned long flags; + unsigned int mtu = priv->mcast_mtu; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + if (skb->protocol == htons(ETH_P_IP)) { + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + } +#if IS_ENABLED(CONFIG_IPV6) + else if (skb->protocol == htons(ETH_P_IPV6)) { + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } +#endif + dev_kfree_skb_any(skb); + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + 
netif_tx_unlock_bh(dev); +} + +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int e = skb_queue_empty(&priv->cm.skb_queue); + + skb_dst_update_pmtu(skb, mtu); + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(priv->wq, &priv->cm.skb_task); +} + +static void ipoib_cm_rx_reap(struct work_struct *work) +{ + ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, + cm.rx_reap_task)->dev); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + int ret; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + /* List is sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(priv->wq, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + spin_unlock_irq(&priv->lock); +} + +static ssize_t mode_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sysfs_emit(buf, "connected\n"); + else + return sysfs_emit(buf, "datagram\n"); +} + +static ssize_t mode_store(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + int ret; + + if (!rtnl_trylock()) { + return restart_syscall(); + } + + if (dev->reg_state != NETREG_REGISTERED) { + rtnl_unlock(); + return -EPERM; + } + + ret = ipoib_set_mode(dev, buf); + + /* The assumption is that the function ipoib_set_mode returned + * with the rtnl held by it, if not the value -EBUSY returned, + * then no need to rtnl_unlock + */ + if (ret != -EBUSY) + rtnl_unlock(); + + return (!ret || ret == -EBUSY) ? 
count : ret; +} + +static DEVICE_ATTR_RW(mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_mode); +} + +static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .srq_type = IB_SRQT_BASIC, + .attr = { + .max_wr = priv->recvq_size, + .max_sge = max_sge + } + }; + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + if (PTR_ERR(priv->cm.srq) != -EOPNOTSUPP) + pr_warn("%s: failed to allocate SRQ, error %ld\n", + priv->ca->name, PTR_ERR(priv->cm.srq)); + priv->cm.srq = NULL; + return; + } + + priv->cm.srq_ring = vzalloc(array_size(priv->recvq_size, + sizeof(*priv->cm.srq_ring))); + if (!priv->cm.srq_ring) { + ib_destroy_srq(priv->cm.srq); + priv->cm.srq = NULL; + return; + } + +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int max_srq_sge, i; + u8 addr; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_LIST_HEAD(&priv->cm.rx_error_list); + INIT_LIST_HEAD(&priv->cm.rx_flush_list); + INIT_LIST_HEAD(&priv->cm.rx_drain_list); + INIT_LIST_HEAD(&priv->cm.rx_reap_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge); + + max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge); + ipoib_cm_create_srq(dev, max_srq_sge); + if (ipoib_cm_has_srq(dev)) { + priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = max_srq_sge; + ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", + priv->cm.max_cm_mtu, priv->cm.num_frags); + } else { + priv->cm.max_cm_mtu = IPOIB_CM_MTU; + priv->cm.num_frags = IPOIB_CM_RX_SG; + } + + ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); + + if (ipoib_cm_has_srq(dev)) { + for (i = 0; i < priv->recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, + priv->cm.num_frags - 1, + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { + ipoib_warn(priv, "failed to allocate " + "receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + if (ipoib_cm_post_receive_srq(dev, i)) { + ipoib_warn(priv, "ipoib_cm_post_receive_srq " + "failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + } + + addr = IPOIB_FLAGS_RC; + dev_addr_mod(dev, 0, &addr, 1); + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ib_destroy_srq(priv->cm.srq); + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + + ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c new file mode 100644 index 0000000..a64874d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipoib.h" + +struct ipoib_stats { + char stat_string[ETH_GSTRING_LEN]; + int stat_offset; +}; + +#define IPOIB_NETDEV_STAT(m) { \ + .stat_string = #m, \ + .stat_offset = offsetof(struct rtnl_link_stats64, m) } + +static const struct ipoib_stats ipoib_gstrings_stats[] = { + IPOIB_NETDEV_STAT(rx_packets), + IPOIB_NETDEV_STAT(tx_packets), + IPOIB_NETDEV_STAT(rx_bytes), + IPOIB_NETDEV_STAT(tx_bytes), + IPOIB_NETDEV_STAT(tx_errors), + IPOIB_NETDEV_STAT(rx_dropped), + IPOIB_NETDEV_STAT(tx_dropped), + IPOIB_NETDEV_STAT(multicast), +}; + +#define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) + +static int ipoib_set_ring_param(struct net_device *dev, + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned int new_recvq_size, new_sendq_size; + unsigned long priv_current_flags; + unsigned int dev_current_flags; + bool init = false, init_fail = false; + bool is_changed_rx = false, is_changed_tx = false; + + if (ringparam->rx_pending <= IPOIB_MAX_QUEUE_SIZE && + ringparam->rx_pending >= IPOIB_MIN_QUEUE_SIZE) { + new_recvq_size = roundup_pow_of_two(ringparam->rx_pending); + is_changed_rx = (new_recvq_size != priv->recvq_size); + if (ringparam->rx_pending != new_recvq_size) + pr_warn("%s: %s: rx_pending should be power of two. rx_pending is %d\n", + dev->name, __func__, new_recvq_size); + } else { + pr_err("rx_pending (%d) is out of bounds [%d-%d]\n", + ringparam->rx_pending, + IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE); + return -EINVAL; + } + + if (ringparam->tx_pending <= IPOIB_MAX_QUEUE_SIZE && + ringparam->tx_pending >= IPOIB_MIN_QUEUE_SIZE) { + new_sendq_size = roundup_pow_of_two(ringparam->tx_pending); + is_changed_tx = (new_sendq_size != priv->sendq_size); + if (ringparam->tx_pending != new_sendq_size) + pr_warn("%s: %s: tx_pending should be power of two. 
tx_pending is %d\n", + dev->name, __func__, new_sendq_size); + } else { + pr_err("tx_pending (%d) is out of bounds [%d-%d]\n", + ringparam->tx_pending, + IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE); + return -EINVAL; + } + + if (is_changed_rx || is_changed_tx) { + priv_current_flags = priv->flags; + dev_current_flags = dev->flags; + + dev_change_flags(dev, dev->flags & ~IFF_UP, NULL); + priv->rn_ops->ndo_uninit(dev); + + do { + priv->recvq_size = new_recvq_size; + priv->sendq_size = new_sendq_size; + if (priv->rn_ops->ndo_init(dev)) { + new_recvq_size >>= is_changed_rx; + new_sendq_size >>= is_changed_tx; + /* keep the values always legal */ + new_recvq_size = max_t(unsigned int, + new_recvq_size, + IPOIB_MIN_QUEUE_SIZE); + new_sendq_size = max_t(unsigned int, + new_sendq_size, + IPOIB_MIN_QUEUE_SIZE); + init_fail = true; + } else { + init = true; + } + } while (!init && + !(priv->recvq_size == IPOIB_MIN_QUEUE_SIZE && + priv->sendq_size == IPOIB_MIN_QUEUE_SIZE)); + + if (!init) { + pr_err("%s: Failed to init interface %s, removing it\n", + __func__, dev->name); + return -ENOMEM; + } + + if (init_fail) + pr_warn("%s: Unable to set the requested ring size values, " + "new values are rx = %d, tx = %d\n", + dev->name, new_recvq_size, new_sendq_size); + + if (dev_current_flags & IFF_UP) + dev_change_flags(dev, dev_current_flags, NULL); + } + + return 0; +} + +static void ipoib_get_ring_param(struct net_device *dev, + struct ethtool_ringparam *ringparam, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ringparam->rx_max_pending = IPOIB_MAX_QUEUE_SIZE; + ringparam->tx_max_pending = IPOIB_MAX_QUEUE_SIZE; + ringparam->rx_mini_max_pending = 0; + ringparam->rx_jumbo_max_pending = 0; + ringparam->rx_pending = priv->recvq_size; + ringparam->tx_pending = priv->sendq_size; + ringparam->rx_mini_pending = 0; + ringparam->rx_jumbo_pending = 0; +} + +static void ipoib_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + struct ipoib_dev_priv *priv = ipoib_priv(netdev); + + ib_get_device_fw_str(priv->ca, drvinfo->fw_version); + + strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent), + sizeof(drvinfo->bus_info)); + + strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver)); +} + +static int ipoib_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; + + return 0; +} + +static int ipoib_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + /* + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called + */ + if (coal->rx_coalesce_usecs > 0xffff || + coal->rx_max_coalesced_frames > 0xffff) + return -EINVAL; + + ret = rdma_set_cq_moderation(priv->recv_cq, + coal->rx_max_coalesced_frames, + coal->rx_coalesce_usecs); + if (ret && ret != -EOPNOTSUPP) { + ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); + return ret; + } + + priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; + priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; + + return 0; +} +static void ipoib_get_ethtool_stats(struct 
net_device *dev, + struct ethtool_stats __always_unused *stats, + u64 *data) +{ + int i; + struct net_device_stats *net_stats = &dev->stats; + u8 *p = (u8 *)net_stats; + + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) + data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset); + +} +static void ipoib_get_strings(struct net_device __always_unused *dev, + u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) { + memcpy(p, ipoib_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + break; + default: + break; + } +} +static int ipoib_get_sset_count(struct net_device __always_unused *dev, + int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return IPOIB_GLOBAL_STATS_LEN; + default: + break; + } + return -EOPNOTSUPP; +} + +/* Return lane speed in unit of 1e6 bit/sec */ +static inline int ib_speed_enum_to_int(int speed) +{ + switch (speed) { + case IB_SPEED_SDR: + return SPEED_2500; + case IB_SPEED_DDR: + return SPEED_5000; + case IB_SPEED_QDR: + case IB_SPEED_FDR10: + return SPEED_10000; + case IB_SPEED_FDR: + return SPEED_14000; + case IB_SPEED_EDR: + return SPEED_25000; + case IB_SPEED_HDR: + return SPEED_50000; + case IB_SPEED_NDR: + return SPEED_100000; + } + + return SPEED_UNKNOWN; +} + +static int ipoib_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *cmd) +{ + struct ipoib_dev_priv *priv = ipoib_priv(netdev); + struct ib_port_attr attr; + int ret, speed, width; + + if (!netif_carrier_ok(netdev)) { + cmd->base.speed = SPEED_UNKNOWN; + cmd->base.duplex = DUPLEX_UNKNOWN; + return 0; + } + + ret = ib_query_port(priv->ca, priv->port, &attr); + if (ret < 0) + return -EINVAL; + + speed = ib_speed_enum_to_int(attr.active_speed); + width = ib_width_enum_to_int(attr.active_width); + + if (speed < 0 || width < 0) + return -EINVAL; + + /* Except the following are set, the other members of + * the struct ethtool_link_settings are initialized to + * zero in the function __ethtool_get_link_ksettings. + */ + cmd->base.speed = speed * width; + cmd->base.duplex = DUPLEX_FULL; + + cmd->base.phy_address = 0xFF; + + cmd->base.autoneg = AUTONEG_ENABLE; + cmd->base.port = PORT_OTHER; + + return 0; +} + +static const struct ethtool_ops ipoib_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS | + ETHTOOL_COALESCE_RX_MAX_FRAMES, + .get_link_ksettings = ipoib_get_link_ksettings, + .get_drvinfo = ipoib_get_drvinfo, + .get_coalesce = ipoib_get_coalesce, + .set_coalesce = ipoib_set_coalesce, + .get_strings = ipoib_get_strings, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_sset_count = ipoib_get_sset_count, + .get_link = ethtool_op_get_link, + .set_ringparam = ipoib_set_ring_param, + .get_ringparam = ipoib_get_ring_param, +}; + +void ipoib_set_ethtool_ops(struct net_device *dev) +{ + dev->ethtool_ops = &ipoib_ethtool_ops; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_fs.c new file mode 100644 index 0000000..12ba7a0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +struct file_operations; + +#include +#include + +#include "ipoib.h" + +static struct dentry *ipoib_root; + +static void format_gid(union ib_gid *gid, char *buf) +{ + int i, n; + + for (n = 0, i = 0; i < 8; ++i) { + n += sprintf(buf + n, "%x", + be16_to_cpu(((__be16 *) gid->raw)[i])); + if (i < 7) + buf[n++] = ':'; + } +} + +static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_mcast_iter *iter; + loff_t n = *pos; + + iter = ipoib_mcast_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen, complete, send_only; + + if (!iter) + return 0; + + ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen, + &complete, &send_only); + + format_gid(&mgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " created: %10ld\n" + " queuelen: %9d\n" + " complete: %9s\n" + " send_only: %8s\n" + "\n", + gid_buf, created, queuelen, + complete ? "yes" : "no", + send_only ? 
"yes" : "no"); + + return 0; +} + +static const struct seq_operations ipoib_mcg_sops = { + .start = ipoib_mcg_seq_start, + .next = ipoib_mcg_seq_next, + .stop = ipoib_mcg_seq_stop, + .show = ipoib_mcg_seq_show, +}; + +DEFINE_SEQ_ATTRIBUTE(ipoib_mcg); + +static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_path_iter *iter; + loff_t n = *pos; + + iter = ipoib_path_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_path_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_path_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + struct ipoib_path path; + int rate; + + if (!iter) + return 0; + + ipoib_path_iter_read(iter, &path); + + format_gid(&path.pathrec.dgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " complete: %6s\n", + gid_buf, sa_path_get_dlid(&path.pathrec) ? "yes" : "no"); + + if (sa_path_get_dlid(&path.pathrec)) { + rate = ib_rate_to_mbps(path.pathrec.rate); + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %8d.%d Gb/sec\n", + be32_to_cpu(sa_path_get_dlid(&path.pathrec)), + path.pathrec.sl, + rate / 1000, rate % 1000); + } + + seq_putc(file, '\n'); + + return 0; +} + +static const struct seq_operations ipoib_path_sops = { + .start = ipoib_path_seq_start, + .next = ipoib_path_seq_next, + .stop = ipoib_path_seq_stop, + .show = ipoib_path_seq_show, +}; + +DEFINE_SEQ_ATTRIBUTE(ipoib_path); + +void ipoib_create_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + char name[IFNAMSIZ + sizeof("_path")]; + + snprintf(name, sizeof(name), "%s_mcg", dev->name); + priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_mcg_fops); + + snprintf(name, sizeof(name), "%s_path", dev->name); + priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_path_fops); +} + +void ipoib_delete_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + debugfs_remove(priv->mcg_dentry); + debugfs_remove(priv->path_dentry); + priv->mcg_dentry = priv->path_dentry = NULL; +} + +void ipoib_register_debugfs(void) +{ + ipoib_root = debugfs_create_dir("ipoib", NULL); +} + +void ipoib_unregister_debugfs(void) +{ + debugfs_remove(ipoib_root); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_genetlink.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_genetlink.c new file mode 100644 index 0000000..c1ee08a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_genetlink.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2012 Mellanox Technologies, Inc. - All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipoib.h" +#include +#include +#include + +/* netlink flags bits */ +#define GENL_PATH_NOTIFICATIONS_ACTIVE 2 +#define GENL_MC_NOTIFICATIONS_ACTIVE 4 + +/* attributes types + * 0 causes issues with Netlink */ +enum { + ATTRIBUTE_UNSPECIFIED, + PATH_ADD, + PATH_DEL, + __IPOIB_NETLINK_ATT_MAX +}; + +#define IPOIB_NETLINK_ATT_MAX (__IPOIB_NETLINK_ATT_MAX - 1) + +/* command types + * 0 causes issues with Netlink */ +enum { + COMMAND_UNSPECIFIED, + REPORT_PATH +}; + +enum ipoib_genl_grps_id { + IPOIB_PATH_NOTIFY_GRP_ID, +}; + +struct genl_multicast_group ipoib_genl_grps[] = { + /* ipoib mcast group for path rec */ + [IPOIB_PATH_NOTIFY_GRP_ID] = { + .name = "PATH_NOTIFY" + }, +}; + +struct ipoib_family_header { + char name[IFNAMSIZ]; +}; + +struct ipoib_path_notice { + u8 gid[16]; + __be16 lid; + u8 sl; + u8 hop_limit; +}; + +struct ipoib_path_del_notice { + u8 gid[16]; +}; + +struct ipoib_ge_netlink_notify { + union { + struct ipoib_path_notice path_rec; + struct ipoib_path_del_notice path_del; + }; +}; + +struct ipoib_genl_work { + struct work_struct work; + struct ipoib_dev_priv *priv; + struct ipoib_ge_netlink_notify record; + int type; +}; + +/* genl_registered's value is changed only on module load/unload */ +static int genl_registered; + +/* + * Handler module, contains the logic to process notifications and user + * requests but not the sending-via-GENL logic. 
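+ * Note (editorial, based on the code below): ipoib_path_add_notify() and
+ * ipoib_path_del_notify() only queue an ipoib_genl_work item on priv->wq;
+ * the GENL message itself is built and multicast later by generate_reply().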
+ */ + +void generate_reply(struct work_struct *work); + +void ipoib_path_add_notify(struct ipoib_dev_priv *priv, + struct sa_path_rec *pathrec) +{ + struct ipoib_genl_work *genl_work; + + genl_work = kzalloc(sizeof(struct ipoib_genl_work), + GFP_KERNEL); + if (!genl_work) { + ipoib_warn(priv, "%s: allocation of ipoib_genl_work failed\n", + __func__); + return; + } + + memcpy(genl_work->record.path_rec.gid, pathrec->dgid.raw, + sizeof(union ib_gid)); + genl_work->record.path_rec.lid = be32_to_cpu(sa_path_get_dlid(pathrec)); + genl_work->record.path_rec.sl = pathrec->sl; + genl_work->record.path_rec.hop_limit = pathrec->hop_limit; + + INIT_WORK(&genl_work->work, generate_reply); + genl_work->priv = priv; + genl_work->type = PATH_ADD; + queue_work(priv->wq, &genl_work->work); +} + +void ipoib_path_del_notify(struct ipoib_dev_priv *priv, + struct sa_path_rec *pathrec) +{ + struct ipoib_genl_work *genl_work; + + genl_work = kzalloc(sizeof(struct ipoib_genl_work), + GFP_ATOMIC); + if (!genl_work) { + ipoib_warn(priv, "%s: allocation of ipoib_genl_work failed\n", + __func__); + return; + } + + memcpy(genl_work->record.path_del.gid, pathrec->dgid.raw, + sizeof(union ib_gid)); + INIT_WORK(&genl_work->work, generate_reply); + genl_work->priv = priv; + genl_work->type = PATH_DEL; + queue_work(priv->wq, &genl_work->work); +} + +/* + * Notifier module. Contains the needed functions to send messages to + * userspace using GENL. + */ + +static struct genl_family ipoib_genl_family = { + .hdrsize = sizeof(struct ipoib_family_header), + .name = "GENETLINK_IPOIB", + .version = 1, + .maxattr = IPOIB_NETLINK_ATT_MAX, + .mcgrps = ipoib_genl_grps, + .n_mcgrps = 1, +}; + +static inline char *get_command(int command) +{ + switch (command) { + case PATH_ADD: + return "PATH_ADD"; + case PATH_DEL: + return "PATH_DEL"; + default: + return ""; + } +} + +void generate_reply(struct work_struct *work) +{ + struct ipoib_genl_work *genl_work = container_of(work, + struct ipoib_genl_work, + work); + struct ipoib_dev_priv *priv; + struct sk_buff *skb; + void *msg_head; + struct nlattr *nla; + unsigned int seq = 0; + int i = 0; + int type = genl_work->type; + struct ipoib_ge_netlink_notify *record = &genl_work->record; + + priv = genl_work->priv; + if (!priv) { + pr_crit("%s: priv is NULL\n", __func__); + return; + } + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) { + ipoib_printk(KERN_CRIT, priv, "%s: skb allocation failed\n", + __func__); + goto out; + } + + msg_head = genlmsg_put(skb, 0, seq++, &ipoib_genl_family, 0, + REPORT_PATH); + /* Warning: + * genlmsg_put can return NULL in case there is not enough room + * in the skb for the family and netlink headers. As long as + * allock succeeded and is NLMSG_GOODSIZE the command can't + * fail. 
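+ * For that reason msg_head is used below without a NULL check.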
+ */ + + memcpy(msg_head, priv->dev->name, IFNAMSIZ); + nla = __nla_reserve(skb, type, 0); + + nla->nla_type = type; + switch (type) { + case PATH_ADD: + { + struct ipoib_path_notice *p; + nla->nla_len += sizeof(struct ipoib_path_notice); + p = (struct ipoib_path_notice *)skb_put(skb, + sizeof(struct ipoib_path_notice)); + memcpy(p, &record->path_rec, + sizeof(struct ipoib_path_notice)); + genlmsg_end(skb, msg_head); + i = genlmsg_multicast(&ipoib_genl_family, skb, 0, IPOIB_PATH_NOTIFY_GRP_ID, + GFP_KERNEL); + break; + } + case PATH_DEL: + { + struct ipoib_path_del_notice *p; + nla->nla_len += sizeof(struct ipoib_path_del_notice); + p = (struct ipoib_path_del_notice *)skb_put(skb, + sizeof(struct ipoib_path_del_notice)); + memcpy(p, &record->path_del, + sizeof(struct ipoib_path_del_notice)); + genlmsg_end(skb, msg_head); + i = genlmsg_multicast(&ipoib_genl_family, skb, 0, IPOIB_PATH_NOTIFY_GRP_ID, + GFP_KERNEL); + break; + } + } + if (i && i != -ESRCH) { + pr_err("%s: sending GENL %s message returned %d\n", __func__, + get_command(type), i); + } + +out: + kfree(genl_work); + return; +} + +/* If needed, deletes the netlink interfaces from the ipoib_genl_if list + * and resets the flags. */ +void ipoib_unregister_genl(void) +{ + if (!genl_registered) + return; + genl_registered = 0; + genl_unregister_family(&ipoib_genl_family); +} + +int ipoib_register_genl(void) +{ + int rc; + rc = genl_register_family(&ipoib_genl_family); + if (rc != 0) + goto out; + genl_registered = 1; + + return 0; +/* unregistering the family will cause: + * all assigned operations to be unregistered automatically. + * all assigned multicast groups to be unregistered automatically. */ +out: + return rc; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ib.c new file mode 100644 index 0000000..fcbe126 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "ipoib.h" +#include /* For ARPHRD_xxx */ + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param(data_debug_level, int, 0644); +MODULE_PARM_DESC(data_debug_level, + "Enable data path debug tracing if > 0"); +#endif + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct rdma_ah_attr *attr) +{ + struct ipoib_ah *ah; + struct ib_ah *vah; + + ah = kmalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) + return ERR_PTR(-ENOMEM); + + ah->dev = dev; + ah->last_send = 0; + kref_init(&ah->ref); + + vah = rdma_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE); + if (IS_ERR(vah)) { + kfree(ah); + ah = (struct ipoib_ah *)vah; + } else { + ah->ah = vah; + ipoib_dbg(ipoib_priv(dev), "Created ah %p\n", ah->ah); + } + + return ah; +} + +void ipoib_free_ah(struct kref *kref) +{ + struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); + struct ipoib_dev_priv *priv = ipoib_priv(ah->dev); + + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + list_add_tail(&ah->list, &priv->dead_ahs); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_UD_RX_SG]) +{ + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); +} + +static int ipoib_ib_post_receive(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + priv->rx_wr.wr_id = id | IPOIB_OP_RECV; + priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0]; + priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; + + + ret = ib_post_recv(priv->qp, &priv->rx_wr, NULL); + if (unlikely(ret)) { + ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); + dev_kfree_skb_any(priv->rx_ring[id].skb); + priv->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sk_buff *skb; + int buf_size; + u64 *mapping; + + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + + skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN); + if (unlikely(!skb)) + return NULL; + + /* + * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is + * 64 bytes aligned + */ + skb_reserve(skb, sizeof(struct ipoib_pseudo_header)); + + mapping = priv->rx_ring[id].mapping; + mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) + goto error; + + priv->rx_ring[id].skb = skb; + return skb; +error: + dev_kfree_skb_any(skb); + return NULL; +} + +static int ipoib_ib_post_receives(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i; + + for (i = 0; i < priv->recvq_size; ++i) { + if (!ipoib_alloc_rx_skb(dev, i)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + return -ENOMEM; + } + if (ipoib_ib_post_receive(dev, i)) { + ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); + return -EIO; + } + } + + return 0; +} + +static inline void ipoib_create_repath_ent(struct net_device *dev, + 
struct sk_buff *skb, u16 lid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_arp_repath *arp_repath; + struct arphdr *parphdr; + + parphdr = (struct arphdr *)(skb->data); + if ((be16_to_cpu(parphdr->ar_op) != ARPOP_REPLY) && + (be16_to_cpu(parphdr->ar_op) != ARPOP_REQUEST)) { + return; + } + + arp_repath = kzalloc(sizeof(*arp_repath), GFP_ATOMIC); + if (!arp_repath) { + ipoib_warn(priv, "Failed alloc ipoib_arp_repath.\n"); + return; + } + + INIT_WORK(&arp_repath->work, ipoib_repath_ah); + + arp_repath->lid = lid; + memcpy(&arp_repath->sgid, skb->data + sizeof(struct arphdr) + 4, + sizeof(union ib_gid)); + arp_repath->dev = dev; + + if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) + queue_work(ipoib_workqueue, &arp_repath->work); + else + kfree(arp_repath); +} + +static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; + union ib_gid *sgid; + + ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= priv->recvq_size)) { + ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", + wr_id, priv->recvq_size); + return; + } + + skb = priv->rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, + "failed recv event (status=%d, wrid=%d vend_err %#x)\n", + wc->status, wr_id, wc->vendor_err); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping); + dev_kfree_skb_any(skb); + priv->rx_ring[wr_id].skb = NULL; + return; + } + + memcpy(mapping, priv->rx_ring[wr_id].mapping, + IPOIB_UD_RX_SG * sizeof(*mapping)); + + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) { + ++dev->stats.rx_dropped; + goto repost; + } + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + ipoib_ud_dma_unmap_rx(priv, mapping); + + skb_put(skb, wc->byte_len); + + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + sgid = &((struct ib_grh *)skb->data)->sgid; + + /* + * Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. 
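+ * Exception (see need_repost below): if the GRH source GID carries an
+ * interface_id different from our local_gid, the packet is not treated
+ * as our own echo and is delivered rather than dropped.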
+ */ + if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) { + int need_repost = 1; + + if ((wc->wc_flags & IB_WC_GRH) && + sgid->global.interface_id != priv->local_gid.global.interface_id) + need_repost = 0; + + if (need_repost) { + dev_kfree_skb_any(skb); + goto repost; + } + } + + skb_pull(skb, IB_GRH_BYTES); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_add_pseudo_hdr(skb); + + ++dev->stats.rx_packets; + dev->stats.rx_bytes += skb->len; + if (skb->pkt_type == PACKET_MULTICAST) + dev->stats.multicast++; + + if (unlikely(be16_to_cpu(skb->protocol) == ETH_P_ARP)) + ipoib_create_repath_ent(dev, skb, wc->slid); + + skb->dev = dev; + if ((dev->features & NETIF_F_RXCSUM) && + likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + napi_gro_receive(&priv->recv_napi, skb); + +repost: + if (unlikely(ipoib_ib_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_ib_post_receive failed " + "for buf %d\n", wr_id); +} + +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[0]))) + return -EIO; + + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, + skb_frag_page(frag), + skb_frag_off(frag), + skb_frag_size(frag), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + return 0; + +partial_error: + for (; i > 0; --i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + + ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE); + } + + if (off) + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + + return -EIO; +} + +void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb), + DMA_TO_DEVICE); + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ib_dma_unmap_page(priv->ca, mapping[i + off], + skb_frag_size(frag), DMA_TO_DEVICE); + } +} + +/* + * As the result of a completion error the QP Can be transferred to SQE states. + * The function checks if the (send)QP is in SQE state and + * moves it back to RTS state, that in order to have it functional again. 
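+ * Only the SQE -> RTS transition is handled; a QP found in any other
+ * state is left unchanged and merely reported.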
+ */ +static void ipoib_qp_state_validate_work(struct work_struct *work) +{ + struct ipoib_qp_state_validate *qp_work = + container_of(work, struct ipoib_qp_state_validate, work); + + struct ipoib_dev_priv *priv = qp_work->priv; + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", + __func__, ret); + goto free_res; + } + pr_info("%s: QP: 0x%x is in state: %d\n", + __func__, priv->qp->qp_num, qp_attr.qp_state); + + /* currently support only in SQE->RTS transition*/ + if (qp_attr.qp_state == IB_QPS_SQE) { + qp_attr.qp_state = IB_QPS_RTS; + + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); + if (ret) { + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", + ret, priv->qp->qp_num); + goto free_res; + } + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", + __func__, priv->qp->qp_num); + } else { + pr_warn("QP (%d) will stay in state: %d\n", + priv->qp->qp_num, qp_attr.qp_state); + } + +free_res: + kfree(qp_work); +} + +static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; + + ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= priv->sendq_size)) { + ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", + wr_id, priv->sendq_size); + return; + } + + tx_req = &priv->tx_ring[wr_id]; + + if (!tx_req->is_inline) + ipoib_dma_unmap_tx(priv, tx_req); + + ++dev->stats.tx_packets; + dev->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + ++priv->tx_tail; + ++priv->global_tx_tail; + + if (unlikely(netif_queue_stopped(dev) && + ((priv->global_tx_head - priv->global_tx_tail) <= + priv->sendq_size >> 1) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_qp_state_validate *qp_work; + ipoib_warn(priv, + "failed send event (status=%d, wrid=%d vend_err %#x)\n", + wc->status, wr_id, wc->vendor_err); + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); + if (!qp_work) + return; + + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); + qp_work->priv = priv; + queue_work(priv->wq, &qp_work->work); + } +} + +static int poll_tx(struct ipoib_dev_priv *priv) +{ + int n, i; + struct ib_wc *wc; + + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + for (i = 0; i < n; ++i) { + wc = priv->send_wc + i; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(priv->dev, priv->send_wc + i); + else + ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i); + } + return n == MAX_SEND_CQE; +} + +int ipoib_rx_poll(struct napi_struct *napi, int budget) +{ + struct ipoib_dev_priv *priv = + container_of(napi, struct ipoib_dev_priv, recv_napi); + struct net_device *dev = priv->dev; + int done; + int t; + int n, i; + + done = 0; + +poll_more: + while (done < budget) { + int max = (budget - done); + + t = min(IPOIB_NUM_WC, max); + n = ib_poll_cq(priv->recv_cq, t, priv->ibwc); + + for (i = 0; i < n; i++) { + struct ib_wc *wc = priv->ibwc + i; + + if (wc->wr_id & IPOIB_OP_RECV) { + ++done; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else { + pr_warn("%s: Got unexpected wqe id\n", __func__); + } + } + + if (n != t) + break; + } + + if (done < 
budget) { + napi_complete(napi); + if (unlikely(ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + napi_reschedule(napi)) + goto poll_more; + } + + return done; +} + +int ipoib_tx_poll(struct napi_struct *napi, int budget) +{ + struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, + send_napi); + struct net_device *dev = priv->dev; + int n, i; + struct ib_wc *wc; + +poll_more: + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + + for (i = 0; i < n; i++) { + wc = priv->send_wc + i; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(dev, wc); + else + ipoib_ib_handle_tx_wc(dev, wc); + } + + if (n < budget) { + napi_complete(napi); + if (unlikely(ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + napi_reschedule(napi)) + goto poll_more; + } + return n < 0 ? 0 : n; +} + +void ipoib_ib_rx_completion(struct ib_cq *cq, void *ctx_ptr) +{ + struct ipoib_dev_priv *priv = ctx_ptr; + + napi_schedule(&priv->recv_napi); +} + +/* The function will force napi_schedule */ +void ipoib_napi_schedule_work(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, reschedule_napi_work); + bool ret; + + do { + ret = napi_reschedule(&priv->send_napi); + if (!ret) + msleep(3); + } while (!ret && netif_queue_stopped(priv->dev) && + test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)); +} + +void ipoib_ib_tx_completion(struct ib_cq *cq, void *ctx_ptr) +{ + struct ipoib_dev_priv *priv = ctx_ptr; + bool ret; + + ret = napi_reschedule(&priv->send_napi); + /* + * if the queue is closed the driver must be able to schedule napi, + * otherwise we can end with closed queue forever, because no new + * packets to send and napi callback might not get new event after + * its re-arm of the napi. 
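+ * napi_reschedule() can legitimately fail while the poll routine is
+ * still running, so the retry is deferred to reschedule_napi_work, whose
+ * handler (ipoib_napi_schedule_work() above) keeps retrying while the
+ * queue is stopped and the device is still initialized.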
+ */ + if (!ret && netif_queue_stopped(priv->dev)) + schedule_work(&priv->reschedule_napi_work); +} + +static inline int post_send(struct ipoib_dev_priv *priv, + unsigned int wr_id, + struct ib_ah *address, u32 dqpn, + struct ipoib_tx_buf *tx_req, + void *head, int hlen) +{ + struct sk_buff *skb = tx_req->skb; + + if (tx_req->is_inline) { + priv->tx_sge[0].addr = (u64)skb->data; + priv->tx_sge[0].length = skb->len; + priv->tx_wr.wr.num_sge = 1; + } else { + ipoib_build_sge(priv, tx_req); + } + + priv->tx_wr.wr.wr_id = wr_id; + priv->tx_wr.remote_qpn = dqpn; + priv->tx_wr.ah = address; + + if (head) { + priv->tx_wr.mss = skb_shinfo(skb)->gso_size; + priv->tx_wr.header = head; + priv->tx_wr.hlen = hlen; + priv->tx_wr.wr.opcode = IB_WR_LSO; + } else + priv->tx_wr.wr.opcode = IB_WR_SEND; + + return ib_post_send(priv->qp, &priv->tx_wr.wr, NULL); +} + +int ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_tx_buf *tx_req; + int hlen, rc; + void *phead; + unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb); + + if (skb_is_gso(skb)) { + hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + phead = skb->data; + if (unlikely(!skb_pull(skb, hlen))) { + ipoib_warn(priv, "linear data too small\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + } else { + if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + return -1; + } + phead = NULL; + hlen = 0; + } + if (skb_shinfo(skb)->nr_frags > usable_sge) { + if (skb_linearize(skb) < 0) { + ipoib_warn(priv, "skb could not be linearized\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + /* Does skb_linearize return ok without reducing nr_frags? */ + if (skb_shinfo(skb)->nr_frags > usable_sge) { + ipoib_warn(priv, "too many frags after skb linearize\n"); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + } + + ipoib_dbg_data(priv, + "sending packet, length=%d address=%p dqpn=0x%06x\n", + skb->len, address, dqpn); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). 
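+ * The ring slot below is tx_head & (sendq_size - 1), which assumes
+ * sendq_size is a power of two (the ethtool path rounds requested sizes
+ * up with roundup_pow_of_two(); see ipoib_set_ring_param()).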
+ */ + tx_req = &priv->tx_ring[priv->tx_head & (priv->sendq_size - 1)]; + tx_req->skb = skb; + + if (skb->len < ipoib_inline_thold && + !skb_shinfo(skb)->nr_frags) { + tx_req->is_inline = 1; + priv->tx_wr.wr.send_flags |= IB_SEND_INLINE; + } else { + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return -1; + } + tx_req->is_inline = 0; + priv->tx_wr.wr.send_flags &= ~IB_SEND_INLINE; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) + priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM; + else + priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; + /* increase the tx_head after send success, but use it for queue state */ + if ((priv->global_tx_head - priv->global_tx_tail) == + priv->sendq_size - 1) { + ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); + netif_stop_queue(dev); + } + + skb_orphan(skb); + skb_dst_drop(skb); + + if (netif_queue_stopped(dev)) + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + + rc = post_send(priv, priv->tx_head & (priv->sendq_size - 1), + address, dqpn, tx_req, phead, hlen); + if (unlikely(rc)) { + ipoib_warn(priv, "post_send failed, error %d\n", rc); + ++dev->stats.tx_errors; + if (!tx_req->is_inline) + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(skb); + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + rc = 0; + } else { + netif_trans_update(dev); + + rc = priv->tx_head; + ++priv->tx_head; + ++priv->global_tx_head; + } + return rc; +} + +static void ipoib_reap_dead_ahs(struct ipoib_dev_priv *priv) +{ + struct ipoib_ah *ah, *tah; + unsigned long flags; + + netif_tx_lock_bh(priv->dev); + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) + if ((int) priv->tx_tail - (int) ah->last_send >= 0) { + list_del(&ah->list); + rdma_destroy_ah(ah->ah, 0); + kfree(ah); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(priv->dev); +} + +void ipoib_reap_ah(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, ah_reap_task.work); + + ipoib_reap_dead_ahs(priv); + + if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) + queue_delayed_work(priv->wq, &priv->ah_reap_task, + round_jiffies_relative(HZ)); +} + +static void ipoib_start_ah_reaper(struct ipoib_dev_priv *priv) +{ + clear_bit(IPOIB_STOP_REAPER, &priv->flags); + queue_delayed_work(priv->wq, &priv->ah_reap_task, + round_jiffies_relative(HZ)); +} + +static void ipoib_stop_ah_reaper(struct ipoib_dev_priv *priv) +{ + set_bit(IPOIB_STOP_REAPER, &priv->flags); + cancel_delayed_work(&priv->ah_reap_task); + /* + * After ipoib_stop_ah_reaper() we always go through + * ipoib_reap_dead_ahs() which ensures the work is really stopped and + * does a final flush out of the dead_ah's list + */ +} + +static int recvs_pending(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int pending = 0; + int i; + + for (i = 0; i < priv->recvq_size; ++i) + if (priv->rx_ring[i].skb) + ++pending; + + return pending; +} + +static void check_qp_movement_and_print(struct ipoib_dev_priv *priv, + struct ib_qp *qp, + enum ib_qp_state new_state) +{ + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP\n", __func__); + return; + } + /* print according to the new-state and the previous state.*/ + if 
(new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) + ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n"); + else + ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n", + new_state, qp_attr.qp_state); +} + +static void ipoib_napi_enable(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + napi_enable(&priv->recv_napi); + napi_enable(&priv->send_napi); +} + +static void ipoib_napi_disable(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + napi_disable(&priv->recv_napi); + napi_disable(&priv->send_napi); +} + +int ipoib_ib_dev_stop_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr qp_attr; + unsigned long begin; + struct ipoib_tx_buf *tx_req; + int i; + + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_napi_disable(dev); + + ipoib_cm_dev_stop(dev); + + /* + * Move our QP to the error state and then reinitialize in + * when all work requests have completed or have been flushed. + */ + qp_attr.qp_state = IB_QPS_ERR; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR); + + /* Wait for all sends and receives to complete */ + begin = jiffies; + + while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, + "timing out; %d sends %d receives not completed\n", + priv->tx_head - priv->tx_tail, + recvs_pending(dev)); + + /* + * assume the HW is wedged and just free up + * all our pending work requests. + */ + while ((int)priv->tx_tail - (int)priv->tx_head < 0) { + tx_req = &priv->tx_ring[priv->tx_tail & + (priv->sendq_size - 1)]; + if (!tx_req->is_inline) + ipoib_dma_unmap_tx(priv, tx_req); + dev_kfree_skb_any(tx_req->skb); + ++priv->tx_tail; + ++priv->global_tx_tail; + } + + for (i = 0; i < priv->recvq_size; ++i) { + struct ipoib_rx_buf *rx_req; + + rx_req = &priv->rx_ring[i]; + if (!rx_req->skb) + continue; + ipoib_ud_dma_unmap_rx(priv, + priv->rx_ring[i].mapping); + dev_kfree_skb_any(rx_req->skb); + rx_req->skb = NULL; + } + + goto timeout; + } + + ipoib_drain_cq(dev); + + usleep_range(1000, 2000); + } + + ipoib_dbg(priv, "All sends and receives done.\n"); + +timeout: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + check_qp_movement_and_print(priv, priv->qp, IB_QPS_RESET); + + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + + return 0; +} + +int ipoib_ib_dev_open_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + /* Re-arm the RX CQs due to a race between completion event and arm + * during default stop. 
This fix is temporary and should be removed + * once the mlx4/5 bug is solved + */ + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + + ret = ipoib_init_qp(dev); + if (ret) { + ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); + return -1; + } + + ret = ipoib_ib_post_receives(dev); + if (ret) { + ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); + goto out; + } + + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); + goto out; + } + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_napi_enable(dev); + + return 0; +out: + return -1; +} + +int ipoib_ib_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey, + (!(priv->pkey & 0x7fff) ? "Invalid" : "not found")); + return -1; + } + + ipoib_start_ah_reaper(priv); + if (priv->rn_ops->ndo_open(dev)) { + pr_warn("%s: Failed to open dev\n", dev->name); + goto dev_stop; + } + + set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + + return 0; + +dev_stop: + ipoib_stop_ah_reaper(priv); + return -1; +} + +void ipoib_ib_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + priv->rn_ops->ndo_stop(dev); + + clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + ipoib_stop_ah_reaper(priv); +} + +void ipoib_pkey_dev_check_presence(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + + if (!(priv->pkey & 0x7fff) || + ib_find_pkey(priv->ca, priv->port, priv->pkey, + &priv->pkey_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + } else { + if (rn->set_id) + rn->set_id(dev, priv->pkey_index); + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + } +} + +void ipoib_ib_dev_up(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_dbg(priv, "PKEY is not assigned.\n"); + return; + } + + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + + ipoib_mcast_start_thread(dev); +} + +void ipoib_ib_dev_down(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "downing ib_dev\n"); + + clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + netif_carrier_off(dev); + + ipoib_mcast_stop_thread(dev); + ipoib_mcast_dev_flush(dev); + + ipoib_flush_paths(dev); +} + +void ipoib_drain_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int i, n; + + /* + * We call completion handling routines that expect to be + * called from the BH-disabled NAPI poll context, so disable + * BHs here too. + */ + local_bh_disable(); + + do { + n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); + for (i = 0; i < n; ++i) { + /* + * Convert any successful completions to flush + * errors to avoid passing packets up the + * stack after bringing the device down. 
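+ * (ipoib_ib_handle_rx_wc() then frees the receive buffer for any
+ * non-success status instead of handing the skb up the stack.)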
+ */ + if (priv->ibwc[i].status == IB_WC_SUCCESS) + priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; + + if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + } else { + pr_warn("%s: Got unexpected wqe id\n", __func__); + } + } + } while (n == IPOIB_NUM_WC); + + while (poll_tx(priv)) + ; /* nothing */ + + local_bh_enable(); +} + +/* + * Takes whatever value which is in pkey index 0 and updates priv->pkey + * returns 0 if the pkey value was changed. + */ +static inline int update_parent_pkey(struct ipoib_dev_priv *priv) +{ + int result; + u16 prev_pkey; + + prev_pkey = priv->pkey; + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n", + priv->port, result); + return result; + } + + priv->pkey |= 0x8000; + + if (prev_pkey != priv->pkey) { + ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n", + prev_pkey, priv->pkey); + /* + * Update the pkey in the broadcast address, while making sure to set + * the full membership bit, so that we join the right broadcast group. + */ + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + return 0; + } + + return 1; +} +/* + * returns 0 if pkey value was found in a different slot. + */ +static inline int update_child_pkey(struct ipoib_dev_priv *priv) +{ + u16 old_index = priv->pkey_index; + + priv->pkey_index = 0; + ipoib_pkey_dev_check_presence(priv->dev); + + if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + (old_index == priv->pkey_index)) + return 1; + return 0; +} + +/* + * returns true if the device address of the ipoib interface has changed and the + * new address is a valid one (i.e in the gid table), return false otherwise. + */ +static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) +{ + union ib_gid search_gid; + union ib_gid gid0; + int err; + u16 index; + u32 port; + bool ret = false; + + if (rdma_query_gid(priv->ca, priv->port, 0, &gid0)) + return false; + + netif_addr_lock_bh(priv->dev); + + /* The subnet prefix may have changed, update it now so we won't have + * to do it later + */ + priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix; + dev_addr_mod(priv->dev, 4, (u8 *)&gid0.global.subnet_prefix, + sizeof(gid0.global.subnet_prefix)); + search_gid.global.subnet_prefix = gid0.global.subnet_prefix; + + search_gid.global.interface_id = priv->local_gid.global.interface_id; + + netif_addr_unlock_bh(priv->dev); + + err = ib_find_gid(priv->ca, &search_gid, &port, &index); + + netif_addr_lock_bh(priv->dev); + + if (search_gid.global.interface_id != + priv->local_gid.global.interface_id) + /* There was a change while we were looking up the gid, bail + * here and let the next work sort this out + */ + goto out; + + /* The next section of code needs some background: + * Per IB spec the port GUID can't change if the HCA is powered on. + * port GUID is the basis for GID at index 0 which is the basis for + * the default device address of a ipoib interface. 
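+ * (The GID occupies bytes 4..19 of the IPoIB hardware address, which is
+ * why the dev_addr_mod(priv->dev, 4, ...) calls in this function update
+ * that range.)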
+ * + * so it seems the flow should be: + * if user_changed_dev_addr && gid in gid tbl + * set bit dev_addr_set + * return true + * else + * return false + * + * The issue is that there are devices that don't follow the spec, + * they change the port GUID when the HCA is powered, so in order + * not to break userspace applications, We need to check if the + * user wanted to control the device address and we assume that + * if he sets the device address back to be based on GID index 0, + * he no longer wishs to control it. + * + * If the user doesn't control the the device address, + * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means + * the port GUID has changed and GID at index 0 has changed + * so we need to change priv->local_gid and priv->dev->dev_addr + * to reflect the new GID. + */ + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + if (!err && port == priv->port) { + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + if (index == 0) + clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL, + &priv->flags); + else + set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags); + ret = true; + } else { + ret = false; + } + } else { + if (!err && port == priv->port) { + ret = true; + } else { + if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) { + memcpy(&priv->local_gid, &gid0, + sizeof(priv->local_gid)); + dev_addr_mod(priv->dev, 4, (u8 *)&gid0, + sizeof(priv->local_gid)); + ret = true; + } + } + } + +out: + netif_addr_unlock_bh(priv->dev); + + return ret; +} + +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level, + int nesting) +{ + struct ipoib_dev_priv *cpriv; + struct net_device *dev = priv->dev; + int result; + + down_read_nested(&priv->vlan_rwsem, nesting); + + /* + * Flush any child interfaces too -- they might be up even if + * the parent is down. + */ + list_for_each_entry(cpriv, &priv->child_intfs, list) + __ipoib_ib_dev_flush(cpriv, level, nesting + 1); + + up_read(&priv->vlan_rwsem); + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && + level != IPOIB_FLUSH_HEAVY) { + /* Make sure the dev_addr is set even if not flushing */ + if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); + return; + } + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + /* interface is down. update pkey and leave. */ + if (level == IPOIB_FLUSH_HEAVY) { + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); + else + update_child_pkey(priv); + } else if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); + return; + } + + if (level == IPOIB_FLUSH_HEAVY) { + /* child devices chase their origin pkey value, while non-child + * (parent) devices should always takes what present in pkey index 0 + */ + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + result = update_child_pkey(priv); + if (result) { + /* restart QP only if P_Key index is changed */ + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); + return; + } + + } else { + result = update_parent_pkey(priv); + /* restart QP only if P_Key value changed */ + if (result) { + ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n"); + return; + } + } + } + + if (level == IPOIB_FLUSH_LIGHT) { + int oper_up; + ipoib_mark_paths_invalid(dev); + /* Set IPoIB operation as down to prevent races between: + * the flush flow which leaves MCG and on the fly joins + * which can happen during that time. 
mcast restart task + * should deal with join requests we missed. + */ + oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + ipoib_mcast_dev_flush(dev); + ipoib_reap_dead_ahs(priv); + if (oper_up) + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + } + + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_down(dev); + + if (level == IPOIB_FLUSH_HEAVY) { + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_ib_dev_stop(dev); + + if (ipoib_ib_dev_open(dev)) + return; + + if (netif_queue_stopped(dev)) + netif_start_queue(dev); + } + + /* + * The device could have been brought down between the start and when + * we get here, don't bring it back up if it's not configured up + */ + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_up(dev); + if (ipoib_dev_addr_changed_valid(priv)) + ipoib_mcast_restart_task(&priv->restart_task); + } +} + +void ipoib_ib_dev_flush_light(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_light); + + rtnl_lock(); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT, 0); + rtnl_unlock(); +} + +void ipoib_ib_dev_flush_normal(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_normal); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL, 0); +} + +void ipoib_ib_dev_flush_heavy(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_heavy); + + rtnl_lock(); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY, 0); + rtnl_unlock(); +} + +void ipoib_ib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); + + ipoib_mcast_stop_thread(dev); + ipoib_mcast_dev_flush(dev); + + /* + * All of our ah references aren't free until after + * ipoib_mcast_dev_flush(), ipoib_flush_paths, and + * the neighbor garbage collection is stopped and reaped. + * That should all be done now, so make a final ah flush. + */ + ipoib_reap_dead_ahs(priv); + + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + priv->rn_ops->ndo_uninit(dev); + + if (priv->pd) { + ib_dealloc_pd(priv->pd); + priv->pd = NULL; + } +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_main.c new file mode 100644 index 0000000..b4b9e4e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -0,0 +1,2886 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipoib.h" + +#include + +#include +#include +#include +#include + +#include /* For ARPHRD_xxx */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; +int ipoib_enhanced_enabled = 1; + +module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +int ipoib_debug_level; + +module_param_named(debug_level, ipoib_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); +#endif + +module_param_named(ipoib_enhanced, ipoib_enhanced_enabled, int, 0444); +MODULE_PARM_DESC(ipoib_enhanced, "Enable IPoIB enhanced for capable devices (default = 1) (0-1)"); + +#define IPOIB_MAX_NEIGH_TIME (240UL * HZ) +#define IPOIB_MIN_NEIGH_TIME (30UL * HZ) + +struct ipoib_path_iter { + struct net_device *dev; + struct ipoib_path path; +}; + +static const u8 ipv4_bcast_addr[] = { + 0x00, 0xff, 0xff, 0xff, + 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff +}; + +struct workqueue_struct *ipoib_workqueue; + +struct ib_sa_client ipoib_sa_client; + +static int ipoib_add_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device, void *client_data); +static void ipoib_neigh_reclaim(struct rcu_head *rp); +static struct net_device *ipoib_get_net_dev_by_params( + struct ib_device *dev, u32 port, u16 pkey, + const union ib_gid *gid, const struct sockaddr *addr, + void *client_data); +static int ipoib_set_mac(struct net_device *dev, void *addr); +static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, + int cmd); + +static struct ib_client ipoib_client = { + .name = "ipoib", + .add = ipoib_add_one, + .remove = ipoib_remove_one, + .get_net_dev_by_params = ipoib_get_net_dev_by_params, +}; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int ipoib_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netdev_notifier_info *ni = ptr; + struct net_device *dev = ni->dev; + + if (dev->netdev_ops->ndo_open != ipoib_open) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + ipoib_create_debug_files(dev); + break; + case NETDEV_CHANGENAME: + ipoib_delete_debug_files(dev); + ipoib_create_debug_files(dev); + break; + case NETDEV_UNREGISTER: + ipoib_delete_debug_files(dev); + break; + } + + return NOTIFY_DONE; +} +#endif + +int 
ipoib_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "bringing up interface\n"); + + netif_carrier_off(dev); + + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + if (ipoib_ib_dev_open(dev)) { + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return 0; + goto err_disable; + } + + ipoib_ib_dev_up(dev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring up any child interfaces too */ + down_read(&priv->vlan_rwsem); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (flags & IFF_UP) + continue; + + dev_change_flags(cpriv->dev, flags | IFF_UP, NULL); + } + up_read(&priv->vlan_rwsem); + } else if (priv->parent) { + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags)) + ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n", + ppriv->dev->name); + } + netif_start_queue(dev); + + return 0; + +err_disable: + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + return -EINVAL; +} + +static int ipoib_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "stopping interface\n"); + + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + netif_stop_queue(dev); + + ipoib_ib_dev_down(dev); + ipoib_ib_dev_stop(dev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring down any child interfaces too */ + down_read(&priv->vlan_rwsem); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (!(flags & IFF_UP)) + continue; + + dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL); + } + up_read(&priv->vlan_rwsem); + } + + return 0; +} + +static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) { + features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO | NETIF_F_SG); + } else { + if (priv->max_send_sge > 1) + features |= NETIF_F_SG; + } + + return features; +} + +static int ipoib_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret = 0; + + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev)) { + if (new_mtu > ipoib_cm_max_mtu(dev)) + return -EINVAL; + + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || + new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) + return -EINVAL; + + priv->admin_mtu = new_mtu; + + if (priv->mcast_mtu < priv->admin_mtu) + ipoib_dbg(priv, "MTU must be smaller than the underlying " + "link layer MTU - 4 (%u)\n", priv->mcast_mtu); + + new_mtu = min(priv->mcast_mtu, priv->admin_mtu); + + if (priv->rn_ops->ndo_change_mtu) { + bool carrier_status = netif_carrier_ok(dev); + + netif_carrier_off(dev); + + /* notify lower level on the real mtu */ + ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); + + if (carrier_status) + netif_carrier_on(dev); + } else { + dev->mtu = new_mtu; + } + + return ret; +} + +static void ipoib_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (priv->rn_ops->ndo_get_stats64) + priv->rn_ops->ndo_get_stats64(dev, stats); + else + 
netdev_stats_to_stats64(stats, &dev->stats); +} + +/* Called with an RCU read lock taken */ +static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, + struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct in_device *in_dev; + struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; + struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; + __be32 ret_addr; + + switch (addr->sa_family) { + case AF_INET: + in_dev = in_dev_get(dev); + if (!in_dev) + return false; + + ret_addr = inet_confirm_addr(net, in_dev, 0, + addr_in->sin_addr.s_addr, + RT_SCOPE_HOST); + in_dev_put(in_dev); + if (ret_addr) + return true; + + break; + case AF_INET6: + if (IS_ENABLED(CONFIG_IPV6) && + ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) + return true; + + break; + } + return false; +} + +/* + * Find the master net_device on top of the given net_device. + * @dev: base IPoIB net_device + * + * Returns the master net_device with a reference held, or the same net_device + * if no master exists. + */ +static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) +{ + struct net_device *master; + + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + dev_hold(master); + rcu_read_unlock(); + + if (master) + return master; + + dev_hold(dev); + return dev; +} + +struct ipoib_walk_data { + const struct sockaddr *addr; + struct net_device *result; +}; + +static int ipoib_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) +{ + struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; + int ret = 0; + + if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { + dev_hold(upper); + data->result = upper; + ret = 1; + } + + return ret; +} + +/** + * ipoib_get_net_dev_match_addr - Find a net_device matching + * the given address, which is an upper device of the given net_device. + * + * @addr: IP address to look for. + * @dev: base IPoIB net_device + * + * If found, returns the net_device with a reference held. Otherwise return + * NULL. + */ +static struct net_device *ipoib_get_net_dev_match_addr( + const struct sockaddr *addr, struct net_device *dev) +{ + struct netdev_nested_priv priv; + struct ipoib_walk_data data = { + .addr = addr, + }; + + priv.data = (void *)&data; + rcu_read_lock(); + if (ipoib_is_dev_match_addr_rcu(addr, dev)) { + dev_hold(dev); + data.result = dev; + goto out; + } + + netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); +out: + rcu_read_unlock(); + return data.result; +} + +/* returns the number of IPoIB netdevs on top a given ipoib device matching a + * pkey_index and address, if one exists. + * + * @found_net_dev: contains a matching net_device if the return value >= 1, + * with a reference held. */ +static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, + const union ib_gid *gid, + u16 pkey_index, + const struct sockaddr *addr, + int nesting, + struct net_device **found_net_dev) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *net_dev = NULL; + int matches = 0; + + if (priv->pkey_index == pkey_index && + (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { + if (!addr) { + net_dev = ipoib_get_master_net_dev(priv->dev); + } else { + /* Verify the net_device matches the IP address, as + * IPoIB child devices currently share a GID. 
*/ + net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev); + } + if (net_dev) { + if (!*found_net_dev) + *found_net_dev = net_dev; + else + dev_put(net_dev); + ++matches; + } + } + + /* Check child interfaces */ + down_read_nested(&priv->vlan_rwsem, nesting); + list_for_each_entry(child_priv, &priv->child_intfs, list) { + matches += ipoib_match_gid_pkey_addr(child_priv, gid, + pkey_index, addr, + nesting + 1, + found_net_dev); + if (matches > 1) + break; + } + up_read(&priv->vlan_rwsem); + + return matches; +} + +/* Returns the number of matching net_devs found (between 0 and 2). Also + * return the matching net_device in the @net_dev parameter, holding a + * reference to the net_device, if the number of matches >= 1 */ +static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port, + u16 pkey_index, + const union ib_gid *gid, + const struct sockaddr *addr, + struct net_device **net_dev) +{ + struct ipoib_dev_priv *priv; + int matches = 0; + + *net_dev = NULL; + + list_for_each_entry(priv, dev_list, list) { + if (priv->port != port) + continue; + + matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, + addr, 0, net_dev); + if (matches > 1) + break; + } + + return matches; +} + +static struct net_device *ipoib_get_net_dev_by_params( + struct ib_device *dev, u32 port, u16 pkey, + const union ib_gid *gid, const struct sockaddr *addr, + void *client_data) +{ + struct net_device *net_dev; + struct list_head *dev_list = client_data; + u16 pkey_index; + int matches; + int ret; + + if (!rdma_protocol_ib(dev, port)) + return NULL; + + ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); + if (ret) + return NULL; + + /* See if we can find a unique device matching the L2 parameters */ + matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, + gid, NULL, &net_dev); + + switch (matches) { + case 0: + return NULL; + case 1: + return net_dev; + } + + dev_put(net_dev); + + /* Couldn't find a unique device with L2 parameters only. Use L3 + * address to uniquely match the net device */ + matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, + gid, addr, &net_dev); + switch (matches) { + case 0: + return NULL; + default: + dev_warn_ratelimited(&dev->dev, + "duplicate IP address detected\n"); + fallthrough; + case 1: + return net_dev; + } +} + +int ipoib_set_mode(struct net_device *dev, const char *buf) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + !strcmp(buf, "connected\n")) || + (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + !strcmp(buf, "datagram\n"))) { + return 0; + } + + /* flush paths if we switch modes so that connections are restarted */ + if (!strcmp(buf, "connected\n")) { + if (IPOIB_CM_SUPPORTED(dev->dev_addr)) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + netdev_update_features(dev); + dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); + rtnl_unlock(); + priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; + priv->tx_wr.wr.opcode = IB_WR_SEND; + + ipoib_flush_paths(dev); + return (!rtnl_trylock()) ? 
-EBUSY : 0; + } else { + ipoib_warn(priv, "Setting Connected Mode failed, " + "not supported by this device"); + return -EINVAL; + } + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + netdev_update_features(dev); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + netif_set_real_num_tx_queues(dev, dev->num_tx_queues); + rtnl_unlock(); + ipoib_flush_paths(dev); + return (!rtnl_trylock()) ? -EBUSY : 0; + } + + return -EINVAL; +} + +struct ipoib_path *__path_find(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node *n = priv->path_tree.rb_node; + struct ipoib_path *path; + int ret; + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + ret = memcmp(gid, path->pathrec.dgid.raw, + sizeof (union ib_gid)); + + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return path; + } + + return NULL; +} + +static int __path_add(struct net_device *dev, struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node **n = &priv->path_tree.rb_node; + struct rb_node *pn = NULL; + struct ipoib_path *tpath; + int ret; + + while (*n) { + pn = *n; + tpath = rb_entry(pn, struct ipoib_path, rb_node); + + ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&path->rb_node, pn, n); + rb_insert_color(&path->rb_node, &priv->path_tree); + + list_add_tail(&path->list, &priv->path_list); + + return 0; +} + +static void path_free(struct net_device *dev, struct ipoib_path *path) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&path->queue))) + dev_kfree_skb_irq(skb); + if (sa_path_get_dlid(&path->pathrec)) + ipoib_path_del_notify(ipoib_priv(dev), &path->pathrec); + + ipoib_dbg(ipoib_priv(dev), "%s\n", __func__); + + /* remove all neigh connected to this path */ + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (path->ah) + ipoib_put_ah(path->ah); + + kfree(path); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) +{ + struct ipoib_path_iter *iter; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->path.pathrec.dgid.raw, 0, 16); + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_path_iter_next(struct ipoib_path_iter *iter) +{ + struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); + struct rb_node *n; + struct ipoib_path *path; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->path_tree); + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, + sizeof (union ib_gid)) < 0) { + iter->path = *path; + ret = 0; + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path) +{ + *path = iter->path; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +void ipoib_mark_paths_invalid(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path, *tp; + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(path, tp, &priv->path_list, list) { + ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", + be32_to_cpu(sa_path_get_dlid(&path->pathrec)), + 
path->pathrec.dgid.raw); + if (path->ah) + path->ah->valid = 0; + ipoib_path_del_notify(priv, &path->pathrec); + } + + spin_unlock_irq(&priv->lock); +} + +static void push_pseudo_header(struct sk_buff *skb, const char *daddr) +{ + struct ipoib_pseudo_header *phdr; + + phdr = skb_push(skb, sizeof(*phdr)); + memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); +} + +void ipoib_flush_paths(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path, *tp; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_splice_init(&priv->path_list, &remove_list); + + list_for_each_entry(path, &remove_list, list) + rb_erase(&path->rb_node, &priv->path_tree); + + list_for_each_entry_safe(path, tp, &remove_list, list) { + if (path->query) + ib_sa_cancel_query(path->query_id, path->query); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + wait_for_completion(&path->done); + path_free(dev, path); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void path_rec_completion(int status, + struct sa_path_rec *pathrec, + int num_prs, void *path_ptr) +{ + struct ipoib_path *path = path_ptr; + struct net_device *dev = path->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_ah *ah = NULL; + struct ipoib_ah *old_ah = NULL; + struct ipoib_neigh *neigh, *tn; + struct sk_buff_head skqueue; + struct sk_buff *skb; + unsigned long flags; + + if (!status) + ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", + be32_to_cpu(sa_path_get_dlid(pathrec)), + pathrec->dgid.raw); + else + ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", + status, path->pathrec.dgid.raw); + + skb_queue_head_init(&skqueue); + + if (!status) { + struct rdma_ah_attr av; + + if (!ib_init_ah_attr_from_path(priv->ca, priv->port, + pathrec, &av, NULL)) { + ah = ipoib_create_ah(dev, priv->pd, &av); + rdma_destroy_ah_attr(&av); + } + ipoib_path_add_notify(priv, pathrec); + } + + spin_lock_irqsave(&priv->lock, flags); + + if (!IS_ERR_OR_NULL(ah)) { + /* + * pathrec.dgid is used as the database key from the LLADDR, + * it must remain unchanged even if the SA returns a different + * GID to use in the AH. + */ + if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid))) { + ipoib_dbg( + priv, + "%s got PathRec for gid %pI6 while asked for %pI6\n", + dev->name, pathrec->dgid.raw, + path->pathrec.dgid.raw); + memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid)); + } + + path->pathrec = *pathrec; + + old_ah = path->ah; + path->ah = ah; + + ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", + ah, be32_to_cpu(sa_path_get_dlid(pathrec)), + pathrec->sl); + + while ((skb = __skb_dequeue(&path->queue))) + __skb_queue_tail(&skqueue, skb); + + list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + if (neigh->ah) { + WARN_ON(neigh->ah != old_ah); + /* + * Dropping the ah reference inside + * priv->lock is safe here, because we + * will hold one more reference from + * the original value of path->ah (ie + * old_ah). 
+ */ + ipoib_put_ah(neigh->ah); + } + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + continue; + } + } + + while ((skb = __skb_dequeue(&neigh->queue))) + __skb_queue_tail(&skqueue, skb); + } + path->ah->valid = 1; + } + + path->query = NULL; + complete(&path->done); + + spin_unlock_irqrestore(&priv->lock, flags); + + if (IS_ERR_OR_NULL(ah)) + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (old_ah) + ipoib_put_ah(old_ah); + + while ((skb = __skb_dequeue(&skqueue))) { + int ret; + skb->dev = dev; + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); + } +} + +static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, + void *gid) +{ + path->dev = priv->dev; + + if (rdma_cap_opa_ah(priv->ca, priv->port)) + path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; + else + path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; + + memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; +} + +static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path; + + if (!priv->broadcast) + return NULL; + + path = kzalloc(sizeof(*path), GFP_ATOMIC); + if (!path) + return NULL; + + skb_queue_head_init(&path->queue); + + INIT_LIST_HEAD(&path->neigh_list); + + init_path_rec(priv, path, gid); + + return path; +} + +static int path_rec_start(struct net_device *dev, + struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "Start path record lookup for %pI6\n", + path->pathrec.dgid.raw); + + init_completion(&path->done); + + path->query_id = + ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, + &path->pathrec, + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_TRAFFIC_CLASS | + IB_SA_PATH_REC_PKEY, + 1000, 0, GFP_ATOMIC, + path_rec_completion, + path, &path->query); + if (path->query_id < 0) { + ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); + path->query = NULL; + complete(&path->done); + return path->query_id; + } + + return 0; +} + +static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + path = __path_find(dev, daddr + 4); + if (!path) + goto out; + if (!path->query) + path_rec_start(dev, path); +out: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_path *path; + struct ipoib_neigh *neigh; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + neigh = ipoib_neigh_alloc(daddr, dev); + if (!neigh) { + spin_unlock_irqrestore(&priv->lock, flags); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NULL; + } + + /* To avoid race condition, make sure that the + * neigh will be added only once. 
+ */ + if (unlikely(!list_empty(&neigh->list))) { + spin_unlock_irqrestore(&priv->lock, flags); + return neigh; + } + + path = __path_find(dev, daddr + 4); + if (!path) { + path = path_rec_create(dev, daddr + 4); + if (!path) + goto err_path; + + __path_add(dev, path); + } + + list_add_tail(&neigh->list, &path->neigh_list); + + if (path->ah && path->ah->valid) { + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + ipoib_neigh_free(neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < + IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, neigh->daddr); + __skb_queue_tail(&neigh->queue, skb); + } else { + ipoib_warn(priv, "queue length limit %d. Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } + } else { + spin_unlock_irqrestore(&priv->lock, flags); + path->ah->last_send = rn->send(dev, skb, path->ah->ah, + IPOIB_QPN(daddr)); + ipoib_neigh_put(neigh); + return NULL; + } + } else { + neigh->ah = NULL; + + if (!path->query && path_rec_start(dev, path)) + goto err_path; + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, neigh->daddr); + __skb_queue_tail(&neigh->queue, skb); + } else { + goto err_drop; + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); + return NULL; + +err_path: + ipoib_neigh_free(neigh); +err_drop: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); + + return NULL; +} + +/* + * Clean_path_from_cache: free path from both caches (list and rb tree) + * Call this function under lock. (netif_tx_lock_bh && priv->lock) + */ +static inline void clean_path_from_cache(struct ipoib_path *path, + struct ipoib_dev_priv *priv) +{ + list_del(&path->list); + rb_erase(&path->rb_node, &priv->path_tree); + if (path->query) + ib_sa_cancel_query(path->query_id, path->query); +} + +/* + * Clean_path_dependencies: free path from neighs. + * Do not call this function under locks. + */ +static inline void clean_path_references(struct ipoib_path *path, + struct net_device *dev) +{ + wait_for_completion(&path->done); + path_free(dev, path); +} + +/* + * For each arp response/request, check that the lid ipoib kept for this + * gid is the same as it has in the arp packet. If not, delete that + * path from the cache. 
+ */ +void ipoib_repath_ah(struct work_struct *work) +{ + struct ipoib_arp_repath *repath = + container_of(work, struct ipoib_arp_repath, work); + struct net_device *dev = repath->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path_from_cache; + u16 lid_from_cache; + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + path_from_cache = __path_find(dev, &repath->sgid); + + if (path_from_cache) { + lid_from_cache = be32_to_cpu(sa_path_get_dlid(&path_from_cache->pathrec)); + /*check if we have the same path in the path cache:*/ + if ((lid_from_cache && repath->lid) && + (repath->lid != lid_from_cache)) { + ipoib_warn(priv, "Found gid with mismatch lids.(cache:%d,from arp: %d)\n", + lid_from_cache, repath->lid); + clean_path_from_cache(path_from_cache, priv); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + clean_path_references(path_from_cache, dev); + goto free_res; + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + +free_res: + kfree(repath); +} + +static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, + struct ipoib_pseudo_header *phdr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_path *path; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + /* no broadcast means that all paths are (going to be) not valid */ + if (!priv->broadcast) + goto drop_and_unlock; + + path = __path_find(dev, phdr->hwaddr + 4); + if (!path || !path->ah || !path->ah->valid) { + if (!path) { + path = path_rec_create(dev, phdr->hwaddr + 4); + if (!path) + goto drop_and_unlock; + __path_add(dev, path); + } else { + /* + * make sure there are no changes in the existing + * path record + */ + init_path_rec(priv, path, phdr->hwaddr + 4); + } + if (!path->query && path_rec_start(dev, path)) { + goto drop_and_unlock; + } + + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, phdr->hwaddr); + __skb_queue_tail(&path->queue, skb); + goto unlock; + } else { + goto drop_and_unlock; + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_dbg(priv, "Send unicast ARP to %08x\n", + be32_to_cpu(sa_path_get_dlid(&path->pathrec))); + path->ah->last_send = rn->send(dev, skb, path->ah->ah, + IPOIB_QPN(phdr->hwaddr)); + return; + +drop_and_unlock: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); +unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_neigh *neigh; + struct ipoib_pseudo_header *phdr; + struct ipoib_header *header; + unsigned long flags; + + phdr = (struct ipoib_pseudo_header *) skb->data; + skb_pull(skb, sizeof(*phdr)); + header = (struct ipoib_header *) skb->data; + + if (unlikely(phdr->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && + (header->proto != htons(ETH_P_ARP)) && + (header->proto != htons(ETH_P_RARP)) && + (header->proto != htons(ETH_P_TIPC))) { + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + /* Add in the P_Key for multicast*/ + phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; + phdr->hwaddr[9] = priv->pkey & 0xff; + + neigh = ipoib_neigh_get(dev, 
phdr->hwaddr); + if (likely(neigh)) + goto send_using_neigh; + ipoib_mcast_send(dev, phdr->hwaddr, skb); + return NETDEV_TX_OK; + } + + /* unicast, arrange "switch" according to probability */ + switch (header->proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TIPC): + neigh = ipoib_neigh_get(dev, phdr->hwaddr); + if (unlikely(!neigh)) { + neigh = neigh_add_path(skb, phdr->hwaddr, dev); + if (likely(!neigh)) + return NETDEV_TX_OK; + } + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + /* for unicast ARP and RARP should always perform path find */ + unicast_arp_send(skb, dev, phdr); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + +send_using_neigh: + /* note we now hold a ref to neigh */ + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unref; + } + } else if (neigh->ah && neigh->ah->valid) { + neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, + IPOIB_QPN(phdr->hwaddr)); + goto unref; + } else if (neigh->ah) { + neigh_refresh_path(neigh, phdr->hwaddr, dev); + } + + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + spin_lock_irqsave(&priv->lock, flags); + /* + * to avoid race with path_rec_completion check if it already + * done, if yes re-send the packet, otherwise push the skb into + * the queue. + * it is safe to check it here while priv->lock around. + */ + if (neigh->ah && neigh->ah->valid) + if (!ipoib_cm_get(neigh) || + (ipoib_cm_get(neigh) && ipoib_cm_up(neigh))) { + spin_unlock_irqrestore(&priv->lock, flags); + goto send_using_neigh; + } + push_pseudo_header(skb, phdr->hwaddr); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + +unref: + ipoib_neigh_put(neigh); + + return NETDEV_TX_OK; +} + +static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + + if (rn->tx_timeout) { + rn->tx_timeout(dev, txqueue); + return; + } + ipoib_warn(priv, "transmit timeout: latency %d msecs\n", + jiffies_to_msecs(jiffies - dev_trans_start(dev))); + ipoib_warn(priv, + "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n", + netif_queue_stopped(dev), priv->tx_head, priv->tx_tail, + priv->global_tx_head, priv->global_tx_tail); + + + schedule_work(&priv->tx_timeout_work); +} + +void ipoib_ib_tx_timeout_work(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, + struct ipoib_dev_priv, + tx_timeout_work); + int err; + + rtnl_lock(); + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + goto unlock; + + ipoib_stop(priv->dev); + err = ipoib_open(priv->dev); + if (err) { + ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n", + err); + goto unlock; + } + + netif_tx_wake_all_queues(priv->dev); +unlock: + rtnl_unlock(); + +} + +static int ipoib_hard_header(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + const void *daddr, + const void *saddr, + unsigned int len) +{ + struct ipoib_header *header; + + header = skb_push(skb, sizeof(*header)); + + header->proto = htons(type); + header->reserved = 0; + + /* + * we don't rely on dst_entry structure, always stuff the + * destination address into skb hard header so we can figure out where + * to send the packet later. 
+ */ + push_pseudo_header(skb, daddr); + + return IPOIB_HARD_LEN; +} + +static void ipoib_set_mcast_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); + return; + } + + queue_work(priv->wq, &priv->restart_task); +} + +static int ipoib_get_iflink(const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + /* parent interface */ + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + return dev->ifindex; + + /* child/vlan interface */ + return priv->parent->ifindex; +} + +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) +{ + /* + * Use only the address parts that contributes to spreading + * The subnet prefix is not used as one can not connect to + * same remote port (GUID) using the same remote QPN via two + * different subnets. + */ + /* qpn octets[1:4) & port GUID octets[12:20) */ + u32 *d32 = (u32 *) daddr; + u32 hv; + + hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); + return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh = NULL; + u32 hash_val; + + rcu_read_lock_bh(); + + htbl = rcu_dereference_bh(ntbl->htbl); + + if (!htbl) + goto out_unlock; + + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); + neigh != NULL; + neigh = rcu_dereference_bh(neigh->hnext)) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!refcount_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + goto out_unlock; + } + + if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) + neigh->alive = jiffies; + goto out_unlock; + } + } + +out_unlock: + rcu_read_unlock_bh(); + return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long neigh_obsolete; + unsigned long dt; + unsigned long flags; + int i; + LIST_HEAD(remove_list); + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + /* neigh is obsolete if it was idle for two GC periods */ + dt = 2 * arp_tbl.gc_interval; + neigh_obsolete = jiffies - dt; + dt = min(dt, IPOIB_MAX_NEIGH_TIME); + dt = max(dt, IPOIB_MIN_NEIGH_TIME); + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* was the neigh idle for two GC periods */ + if (time_after(neigh_obsolete, neigh->alive)) { + + ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); + + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_mcast_remove_list(&remove_list); +} + +static void ipoib_reap_neigh(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, 
neigh_reap_task.work); + unsigned long gc_time; + + gc_time = arp_tbl.gc_interval; + gc_time = min(gc_time, IPOIB_MAX_NEIGH_TIME); + gc_time = max(gc_time, IPOIB_MIN_NEIGH_TIME); + + __ipoib_reap_neigh(priv); + + queue_delayed_work(priv->wq, &priv->neigh_reap_task, + gc_time); +} + + +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_neigh *neigh; + + neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC); + if (!neigh) + return NULL; + + neigh->dev = dev; + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); + skb_queue_head_init(&neigh->queue); + INIT_LIST_HEAD(&neigh->list); + ipoib_cm_set(neigh, NULL); + /* one ref on behalf of the caller */ + refcount_set(&neigh->refcnt, 1); + + return neigh; +} + +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) { + neigh = NULL; + goto out_unlock; + } + + /* need to add a new neigh, but maybe some other thread succeeded? + * recalc hash, maybe hash resize took place so we do a search + */ + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock)); + neigh != NULL; + neigh = rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))) { + if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { + /* found, take one ref on behalf of the caller */ + if (!refcount_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + break; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + + neigh = ipoib_neigh_ctor(daddr, dev); + if (!neigh) + goto out_unlock; + + /* one ref on behalf of the hash table */ + refcount_inc(&neigh->refcnt); + neigh->alive = jiffies; + /* put in hash */ + rcu_assign_pointer(neigh->hnext, + rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock))); + rcu_assign_pointer(htbl->buckets[hash_val], neigh); + atomic_inc(&ntbl->entries); + +out_unlock: + + return neigh; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) +{ + /* neigh reference count was dropprd to zero */ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sk_buff *skb; + if (neigh->ah) + ipoib_put_ah(neigh->ah); + while ((skb = __skb_dequeue(&neigh->queue))) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + ipoib_dbg(ipoib_priv(dev), + "neigh free for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); + kfree(neigh); + if (atomic_dec_and_test(&priv->ntbl.entries)) { + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) + complete(&priv->ntbl.flushed); + } +} + +static void ipoib_neigh_reclaim(struct rcu_head *rp) +{ + /* Called as a result of removal from hash table */ + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); + /* note TX context may hold another ref */ + ipoib_neigh_put(neigh); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **np; + struct ipoib_neigh *n; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + 
lockdep_is_held(&priv->lock)); + if (!htbl) + return; + + hash_val = ipoib_addr_hash(htbl, neigh->daddr); + np = &htbl->buckets[hash_val]; + for (n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock)); + n != NULL; + n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) { + if (n == neigh) { + /* found */ + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + return; + } else { + np = &n->hnext; + } + } +} + +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **buckets; + u32 size; + + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + ntbl->htbl = NULL; + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); + if (!htbl) + return -ENOMEM; + size = roundup_pow_of_two(arp_tbl.gc_thresh3); + buckets = kvcalloc(size, sizeof(*buckets), GFP_KERNEL); + if (!buckets) { + kfree(htbl); + return -ENOMEM; + } + htbl->size = size; + htbl->mask = (size - 1); + htbl->buckets = buckets; + RCU_INIT_POINTER(ntbl->htbl, htbl); + htbl->ntbl = ntbl; + atomic_set(&ntbl->entries, 0); + + /* start garbage collection */ + queue_delayed_work(priv->wq, &priv->neigh_reap_task, + IPOIB_MIN_NEIGH_TIME); + + return 0; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct ipoib_neigh_hash *htbl = container_of(head, + struct ipoib_neigh_hash, + rcu); + struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; + + kvfree(buckets); + kfree(htbl); + complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + /* remove all neigh connected to a given path or mcast */ + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* delete neighs belong to this parent */ + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i, wait_flushed = 0; + + init_completion(&priv->ntbl.flushed); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + goto out_unlock; + + wait_flushed = atomic_read(&priv->ntbl.entries); + if (!wait_flushed) + goto free_htbl; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) 
!= NULL) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } + } + +free_htbl: + rcu_assign_pointer(ntbl->htbl, NULL); + call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); + if (wait_flushed) + wait_for_completion(&priv->ntbl.flushed); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg(priv, "%s\n", __func__); + init_completion(&priv->ntbl.deleted); + + cancel_delayed_work_sync(&priv->neigh_reap_task); + + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); +} + +static void ipoib_napi_add(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); + netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); +} + +static void ipoib_napi_del(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + netif_napi_del(&priv->recv_napi); + netif_napi_del(&priv->send_napi); +} + +static void ipoib_dev_uninit_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_transport_dev_cleanup(dev); + + ipoib_napi_del(dev); + + ipoib_cm_dev_cleanup(dev); + + kfree(priv->rx_ring); + vfree(priv->tx_ring); + + priv->rx_ring = NULL; + priv->tx_ring = NULL; +} + +static int ipoib_dev_init_default(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + u8 addr_mod[3]; + + ipoib_napi_add(dev); + + /* Allocate RX/TX "rings" to hold queued skbs */ + priv->rx_ring = kcalloc(priv->recvq_size, + sizeof(*priv->rx_ring), + GFP_KERNEL); + if (!priv->rx_ring) + goto out; + + priv->tx_ring = vzalloc(array_size(priv->sendq_size, + sizeof(*priv->tx_ring))); + if (!priv->tx_ring) { + pr_warn("%s: failed to allocate TX ring (%d entries)\n", + priv->ca->name, priv->sendq_size); + goto out_rx_ring_cleanup; + } + + /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */ + + if (ipoib_transport_dev_init(dev, priv->ca)) { + pr_warn("%s: ipoib_transport_dev_init failed\n", + priv->ca->name); + goto out_tx_ring_cleanup; + } + + /* after qp created set dev address */ + addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff; + addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff; + addr_mod[2] = (priv->qp->qp_num) & 0xff; + dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod)); + + return 0; + +out_tx_ring_cleanup: + vfree(priv->tx_ring); + +out_rx_ring_cleanup: + kfree(priv->rx_ring); + +out: + ipoib_napi_del(dev); + return -ENOMEM; +} + +static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, + int cmd) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (!priv->rn_ops->ndo_eth_ioctl) + return -EOPNOTSUPP; + + return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd); +} + +static int ipoib_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret = -ENOMEM; + + priv->qp = NULL; + + /* + * the various IPoIB tasks assume they will never race against + * themselves, so always use a single thread workqueue + */ + priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); + if (!priv->wq) { + pr_warn("%s: failed to allocate device WQ\n", dev->name); + goto out; + } + + /* create pd, which used both for control and datapath*/ + priv->pd = ib_alloc_pd(priv->ca, 0); + if (IS_ERR(priv->pd)) { + pr_warn("%s: 
failed to allocate PD\n", priv->ca->name); + goto clean_wq; + } + + ret = priv->rn_ops->ndo_init(dev); + if (ret) { + pr_warn("%s failed to init HW resource\n", dev->name); + goto out_free_pd; + } + + ret = ipoib_neigh_hash_init(priv); + if (ret) { + pr_warn("%s failed to init neigh hash\n", dev->name); + goto out_dev_uninit; + } + + if (dev->flags & IFF_UP) { + if (ipoib_ib_dev_open(dev)) { + pr_warn("%s failed to open device\n", dev->name); + ret = -ENODEV; + goto out_hash_uninit; + } + } + + return 0; + +out_hash_uninit: + ipoib_neigh_hash_uninit(dev); + +out_dev_uninit: + ipoib_ib_dev_cleanup(dev); + +out_free_pd: + if (priv->pd) { + ib_dealloc_pd(priv->pd); + priv->pd = NULL; + } + +clean_wq: + if (priv->wq) { + destroy_workqueue(priv->wq); + priv->wq = NULL; + } + +out: + return ret; +} + +/* + * This must be called before doing an unregister_netdev on a parent device to + * shutdown the IB event handler. + */ +static void ipoib_parent_unregister_pre(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + + /* + * ipoib_set_mac checks netif_running before pushing work, clearing + * running ensures the it will not add more work. + */ + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL); + rtnl_unlock(); + + /* ipoib_event() cannot be running once this returns */ + ib_unregister_event_handler(&priv->event_handler); + + /* + * Work on the queue grabs the rtnl lock, so this cannot be done while + * also holding it. + */ + flush_workqueue(ipoib_workqueue); +} + +static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) +{ + priv->hca_caps = priv->ca->attrs.device_cap_flags; + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; + + priv->dev->features |= priv->dev->hw_features; + } +} + +static int ipoib_parent_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + struct ib_port_attr attr; + int result; + + result = ib_query_port(priv->ca, priv->port, &attr); + if (result) { + pr_warn("%s: ib_query_port %d failed\n", priv->ca->name, + priv->port); + return result; + } + priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr); + + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", + priv->ca->name, priv->port, result); + return result; + } + + result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); + if (result) { + pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n", + priv->ca->name, priv->port, result); + return result; + } + dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid)); + + SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); + priv->dev->dev_port = priv->port - 1; + /* Let's set this one too for backwards compatibility. 
*/ + priv->dev->dev_id = priv->port - 1; + + return 0; +} + +static void ipoib_child_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + priv->max_ib_mtu = ppriv->max_ib_mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN)) + memcpy(&priv->local_gid, priv->dev->dev_addr + 4, + sizeof(priv->local_gid)); + else { + __dev_addr_set(priv->dev, ppriv->dev->dev_addr, + INFINIBAND_ALEN); + memcpy(&priv->local_gid, &ppriv->local_gid, + sizeof(priv->local_gid)); + } +} + +static int ipoib_ndo_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + int rc; + struct rdma_netdev *rn = netdev_priv(ndev); + + if (priv->parent) { + ipoib_child_init(ndev); + } else { + rc = ipoib_parent_init(ndev); + if (rc) + return rc; + } + + /* Initial ring params*/ + priv->sendq_size = ipoib_sendq_size; + priv->recvq_size = ipoib_recvq_size; + + /* MTU will be reset when mcast join happens */ + ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = ndev->mtu; + rn->mtu = priv->mcast_mtu; + ndev->max_mtu = IPOIB_CM_MTU; + + ndev->neigh_priv_len = sizeof(struct ipoib_neigh); + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + priv->pkey |= 0x8000; + + ndev->broadcast[8] = priv->pkey >> 8; + ndev->broadcast[9] = priv->pkey & 0xff; + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + ipoib_set_dev_features(priv); + + rc = ipoib_dev_init(ndev); + if (rc) { + pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", + priv->ca->name, priv->dev->name, priv->port, rc); + return rc; + } + + if (priv->parent) { + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + dev_hold(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_add_tail(&priv->list, &ppriv->child_intfs); + up_write(&ppriv->vlan_rwsem); + } + + return 0; +} + +static void ipoib_ndo_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ASSERT_RTNL(); + + /* + * ipoib_remove_one guarantees the children are removed before the + * parent, and that is the only place where a parent can be removed. 
+ */ + WARN_ON(!list_empty(&priv->child_intfs)); + + if (priv->parent) { + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); + } + + ipoib_neigh_hash_uninit(dev); + + ipoib_ib_dev_cleanup(dev); + + /* no more works over the priv->wq */ + if (priv->wq) { + /* See ipoib_mcast_carrier_on_task() */ + WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)); + destroy_workqueue(priv->wq); + priv->wq = NULL; + } + + if (priv->parent) + dev_put(priv->parent); +} + +static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); +} + +static int ipoib_get_vf_config(struct net_device *dev, int vf, + struct ifla_vf_info *ivf) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int err; + + err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); + if (err) + return err; + + ivf->vf = vf; + memcpy(ivf->mac, dev->dev_addr, dev->addr_len); + + return 0; +} + +static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) + return -EINVAL; + + return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); +} + +static int ipoib_get_vf_guid(struct net_device *dev, int vf, + struct ifla_vf_guid *node_guid, + struct ifla_vf_guid *port_guid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid); +} + +static int ipoib_get_vf_stats(struct net_device *dev, int vf, + struct ifla_vf_stats *vf_stats) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); +} + +static int ipoib_set_vf_local_mac(struct net_device *dev, void *addr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sockaddr_storage *ss = addr; + int ret = 0; + + netif_addr_lock_bh(dev); + if (memcmp(dev->dev_addr, ss->__data, 4 + sizeof(union ib_gid))) { + ipoib_warn(priv, "mac address change is unsupported.\n"); + ret = -EINVAL; + } + netif_addr_unlock_bh(dev); + return ret; +} + +static const struct header_ops ipoib_header_ops = { + .create = ipoib_hard_header, +}; + +static const struct net_device_ops ipoib_netdev_ops_pf = { + .ndo_init = ipoib_ndo_init, + .ndo_uninit = ipoib_ndo_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, + .ndo_get_iflink = ipoib_get_iflink, + .ndo_set_vf_link_state = ipoib_set_vf_link_state, + .ndo_get_vf_config = ipoib_get_vf_config, + .ndo_get_vf_stats = ipoib_get_vf_stats, + .ndo_get_vf_guid = ipoib_get_vf_guid, + .ndo_set_vf_guid = ipoib_set_vf_guid, + .ndo_set_mac_address = ipoib_set_mac, + .ndo_get_stats64 = ipoib_get_stats, + .ndo_eth_ioctl = ipoib_ioctl, +}; + +static const struct net_device_ops ipoib_netdev_ops_vf = { + .ndo_init = ipoib_ndo_init, + .ndo_uninit = ipoib_ndo_uninit, + .ndo_open = ipoib_open, + .ndo_stop = ipoib_stop, + .ndo_change_mtu = ipoib_change_mtu, + .ndo_fix_features = ipoib_fix_features, + .ndo_start_xmit = ipoib_start_xmit, + .ndo_tx_timeout = ipoib_timeout, + .ndo_set_rx_mode = ipoib_set_mcast_list, + .ndo_get_iflink = ipoib_get_iflink, + .ndo_set_mac_address = 
ipoib_set_vf_local_mac, + .ndo_get_stats64 = ipoib_get_stats, + .ndo_eth_ioctl = ipoib_ioctl, +}; + +static const struct net_device_ops ipoib_netdev_default_pf = { + .ndo_init = ipoib_dev_init_default, + .ndo_uninit = ipoib_dev_uninit_default, + .ndo_open = ipoib_ib_dev_open_default, + .ndo_stop = ipoib_ib_dev_stop_default, +}; + +void ipoib_setup_common(struct net_device *dev) +{ + dev->header_ops = &ipoib_header_ops; + dev->netdev_ops = &ipoib_netdev_default_pf; + + ipoib_set_ethtool_ops(dev); + + dev->watchdog_timeo = 10 * HZ; + + dev->flags |= IFF_BROADCAST | IFF_MULTICAST; + + dev->hard_header_len = IPOIB_HARD_LEN; + dev->addr_len = INFINIBAND_ALEN; + dev->type = ARPHRD_INFINIBAND; + dev->tx_queue_len = ipoib_sendq_size * 2; + dev->features = (NETIF_F_VLAN_CHALLENGED | + NETIF_F_HIGHDMA); + netif_keep_dst(dev); + + memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + + /* + * unregister_netdev always frees the netdev, we use this mode + * consistently to unify all the various unregister paths, including + * those connected to rtnl_link_ops which require it. + */ + dev->needs_free_netdev = true; +} + +static void ipoib_build_priv(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + priv->dev = dev; + spin_lock_init(&priv->lock); + init_rwsem(&priv->vlan_rwsem); + mutex_init(&priv->mcast_mutex); + + INIT_LIST_HEAD(&priv->path_list); + INIT_LIST_HEAD(&priv->child_intfs); + INIT_LIST_HEAD(&priv->dead_ahs); + INIT_LIST_HEAD(&priv->multicast_list); + + INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); + INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); + INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); + INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work); + INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); +} + +static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port, + const char *name) +{ + struct net_device *dev = NULL; + + dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name, + NET_NAME_UNKNOWN, ipoib_setup_common, + !ipoib_enhanced_enabled); + if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP) + return dev; + + dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN, + ipoib_setup_common); + if (!dev) + return ERR_PTR(-ENOMEM); + return dev; +} + +int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, + struct net_device *dev) +{ + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_dev_priv *priv; + int rc; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ca = hca; + priv->port = port; + + rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name, + NET_NAME_UNKNOWN, ipoib_setup_common, dev, + !ipoib_enhanced_enabled); + if (rc) { + if (rc != -EOPNOTSUPP) + goto out; + + rn->send = ipoib_send; + rn->attach_mcast = ipoib_mcast_attach; + rn->detach_mcast = ipoib_mcast_detach; + rn->hca = hca; + } + + priv->rn_ops = dev->netdev_ops; + + if (hca->attrs.device_cap_flags & IB_DEVICE_VIRTUAL_FUNCTION) + dev->netdev_ops = &ipoib_netdev_ops_vf; + else + dev->netdev_ops = &ipoib_netdev_ops_pf; + + rn->clnt_priv = priv; + /* + * Only the child register_netdev flows can handle priv_destructor + * being set, so we 
force it to NULL here and handle manually until it + * is safe to turn on. + */ + priv->next_priv_destructor = dev->priv_destructor; + dev->priv_destructor = NULL; + + ipoib_build_priv(dev); + + return 0; + +out: + kfree(priv); + return rc; +} + +struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, + const char *name) +{ + struct net_device *dev; + int rc; + + dev = ipoib_alloc_netdev(hca, port, name); + if (IS_ERR(dev)) + return dev; + + rc = ipoib_intf_init(hca, port, name, dev); + if (rc) { + free_netdev(dev); + return ERR_PTR(rc); + } + + /* + * Upon success the caller must ensure ipoib_intf_free is called or + * register_netdevice succeed'd and priv_destructor is set to + * ipoib_intf_free. + */ + return dev; +} + +void ipoib_intf_free(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + + dev->priv_destructor = priv->next_priv_destructor; + if (dev->priv_destructor) + dev->priv_destructor(dev); + + /* + * There are some error flows around register_netdev failing that may + * attempt to call priv_destructor twice, prevent that from happening. + */ + dev->priv_destructor = NULL; + + /* unregister/destroy is very complicated. Make bugs more obvious. */ + rn->clnt_priv = NULL; + + kfree(priv); +} + +static ssize_t pkey_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + + return sysfs_emit(buf, "0x%04x\n", priv->pkey); +} +static DEVICE_ATTR_RO(pkey); + +static ssize_t umcast_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + + return sysfs_emit(buf, "%d\n", + test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); +} + +void ipoib_set_umcast(struct net_device *ndev, int umcast_val) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + + if (umcast_val > 0) { + set_bit(IPOIB_FLAG_UMCAST, &priv->flags); + ipoib_warn(priv, "ignoring multicast groups joined directly " + "by userspace\n"); + } else + clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); +} + +static ssize_t umcast_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + ipoib_set_umcast(to_net_dev(dev), umcast_val); + + return count; +} +static DEVICE_ATTR_RW(umcast); + +int ipoib_add_umcast_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_umcast); +} + +static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *netdev = priv->dev; + + netif_addr_lock_bh(netdev); + + memcpy(&priv->local_gid.global.interface_id, + &gid->global.interface_id, + sizeof(gid->global.interface_id)); + dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid)); + clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + netif_addr_unlock_bh(netdev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + down_read(&priv->vlan_rwsem); + list_for_each_entry(child_priv, &priv->child_intfs, list) + set_base_guid(child_priv, gid); + up_read(&priv->vlan_rwsem); + } +} + +static int ipoib_check_lladdr(struct net_device *dev, + struct sockaddr_storage *ss) +{ + union ib_gid *gid = (union ib_gid *)(ss->__data + 4); + int ret = 0; + + netif_addr_lock_bh(dev); + + /* Make sure the QPN, reserved and subnet prefix match the current + * lladdr, it 
also makes sure the lladdr is unicast. + */ + if (memcmp(dev->dev_addr, ss->__data, + 4 + sizeof(gid->global.subnet_prefix)) || + gid->global.interface_id == 0) + ret = -EINVAL; + + netif_addr_unlock_bh(dev); + + return ret; +} + +static int ipoib_set_mac(struct net_device *dev, void *addr) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct sockaddr_storage *ss = addr; + int ret; + + if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) + return -EBUSY; + + ret = ipoib_check_lladdr(dev, ss); + if (ret) + return ret; + + set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); + + queue_work(ipoib_workqueue, &priv->flush_light); + + return 0; +} + +static ssize_t ipoib_set_mac_using_sysfs(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ipoib_dev_priv *priv = ipoib_priv(to_net_dev(dev)); + union ib_gid gid; + + if (!in6_pton(buf, count, gid.raw, -1, NULL)) + return -EINVAL; + + set_base_guid(priv, &gid); + queue_work(ipoib_workqueue, &priv->flush_light); + + return count; +} +static DEVICE_ATTR(set_mac, S_IWUSR, NULL, ipoib_set_mac_using_sysfs); + +static ssize_t create_child_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) + return -EINVAL; + + ret = ipoib_vlan_add(to_net_dev(dev), pkey); + + return ret ? ret : count; +} +static DEVICE_ATTR_WO(create_child); + +static ssize_t delete_child_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int pkey; + int ret; + + if (sscanf(buf, "%i", &pkey) != 1) + return -EINVAL; + + if (pkey < 0 || pkey > 0xffff) + return -EINVAL; + + ret = ipoib_vlan_delete(to_net_dev(dev), pkey); + + return ret ? ret : count; + +} +static DEVICE_ATTR_WO(delete_child); + +int ipoib_add_pkey_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_pkey); +} + +/* + * We erroneously exposed the iface's port number in the dev_id + * sysfs field long after dev_port was introduced for that purpose[1], + * and we need to stop everyone from relying on that. + * Let's overload the shower routine for the dev_id file here + * to gently bring the issue up. + * + * [1] https://www.spinics.net/lists/netdev/msg272123.html + */ +static ssize_t dev_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + /* + * ndev->dev_port will be equal to 0 in old kernel prior to commit + * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface + * port numbers") Zero was chosen as special case for user space + * applications to fallback and query dev_id to check if it has + * different value or not. + * + * Don't print warning in such scenario. + * + * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358 + */ + if (ndev->dev_port && ndev->dev_id == ndev->dev_port) + netdev_info_once(ndev, + "\"%s\" wants to know my dev_id. Should it look at dev_port instead? 
See Documentation/ABI/testing/sysfs-class-net for more info.\n", + current->comm); + + return sysfs_emit(buf, "%#x\n", ndev->dev_id); +} +static DEVICE_ATTR_RO(dev_id); + +static int ipoib_intercept_dev_id_attr(struct net_device *dev) +{ + device_remove_file(&dev->dev, &dev_attr_dev_id); + return device_create_file(&dev->dev, &dev_attr_dev_id); +} + +static struct net_device *ipoib_add_port(const char *format, + struct ib_device *hca, u32 port) +{ + struct rtnl_link_ops *ops = ipoib_get_link_ops(); + struct rdma_netdev_alloc_params params; + struct ipoib_dev_priv *priv; + struct net_device *ndev; + int result; + + ndev = ipoib_intf_alloc(hca, port, format); + if (IS_ERR(ndev)) { + pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port, + PTR_ERR(ndev)); + return ndev; + } + priv = ipoib_priv(ndev); + + INIT_IB_EVENT_HANDLER(&priv->event_handler, + priv->ca, ipoib_event); + ib_register_event_handler(&priv->event_handler); + + /* call event handler to ensure pkey in sync */ + queue_work(ipoib_workqueue, &priv->flush_heavy); + + ndev->rtnl_link_ops = ipoib_get_link_ops(); + + result = register_netdev(ndev); + if (result) { + pr_warn("%s: couldn't register ipoib port %d; error %d\n", + hca->name, port, result); + + ipoib_parent_unregister_pre(ndev); + ipoib_intf_free(ndev); + free_netdev(ndev); + + return ERR_PTR(result); + } + + if (hca->ops.rdma_netdev_get_params) { + int rc = hca->ops.rdma_netdev_get_params(hca, port, + RDMA_NETDEV_IPOIB, + ¶ms); + + if (!rc && ops->priv_size < params.sizeof_priv) + ops->priv_size = params.sizeof_priv; + } + /* + * We cannot set priv_destructor before register_netdev because we + * need priv to be always valid during the error flow to execute + * ipoib_parent_unregister_pre(). Instead handle it manually and only + * enter priv_destructor mode once we are completely registered. 
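+ *
+ * In condensed form (taken from the code around this point; the pr_warn
+ * and the sysfs setup are omitted), the ordering this enforces is:
+ *
+ *	result = register_netdev(ndev);
+ *	if (result) {
+ *		ipoib_parent_unregister_pre(ndev);
+ *		ipoib_intf_free(ndev);	// manual cleanup, destructor not set yet
+ *		free_netdev(ndev);
+ *		return ERR_PTR(result);
+ *	}
+ *	ndev->priv_destructor = ipoib_intf_free;  // from here the core frees priv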
+ */ + ndev->priv_destructor = ipoib_intf_free; + + if (ipoib_intercept_dev_id_attr(ndev)) + goto sysfs_failed; + if (ipoib_cm_add_mode_attr(ndev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(ndev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(ndev)) + goto sysfs_failed; + if (device_create_file(&ndev->dev, &dev_attr_create_child)) + goto sysfs_failed; + if (device_create_file(&ndev->dev, &dev_attr_delete_child)) + goto sysfs_failed; + if (device_create_file(&priv->dev->dev, &dev_attr_set_mac)) + goto sysfs_failed; + + return ndev; + +sysfs_failed: + ipoib_parent_unregister_pre(ndev); + unregister_netdev(ndev); + return ERR_PTR(-ENOMEM); +} + +static int ipoib_add_one(struct ib_device *device) +{ + struct list_head *dev_list; + struct net_device *dev; + struct ipoib_dev_priv *priv; + unsigned int p; + int count = 0; + + dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); + if (!dev_list) + return -ENOMEM; + + INIT_LIST_HEAD(dev_list); + + rdma_for_each_port (device, p) { + if (!rdma_protocol_ib(device, p)) + continue; + dev = ipoib_add_port("ib%d", device, p); + if (!IS_ERR(dev)) { + priv = ipoib_priv(dev); + list_add_tail(&priv->list, dev_list); + count++; + } + } + + if (!count) { + kfree(dev_list); + return -EOPNOTSUPP; + } + + ib_set_client_data(device, &ipoib_client, dev_list); + return 0; +} + +static void ipoib_remove_one(struct ib_device *device, void *client_data) +{ + struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; + struct list_head *dev_list = client_data; + + list_for_each_entry_safe(priv, tmp, dev_list, list) { + LIST_HEAD(head); + ipoib_parent_unregister_pre(priv->dev); + + rtnl_lock(); + + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, + list) + unregister_netdevice_queue(cpriv->dev, &head); + unregister_netdevice_queue(priv->dev, &head); + unregister_netdevice_many(&head); + + rtnl_unlock(); + } + + kfree(dev_list); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static struct notifier_block ipoib_netdev_notifier = { + .notifier_call = ipoib_netdev_event, +}; +#endif + +static int __init ipoib_init_module(void) +{ + int ret; + + if (ipoib_recvq_size <= IPOIB_MAX_QUEUE_SIZE && + ipoib_recvq_size >= IPOIB_MIN_QUEUE_SIZE) { + ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); + ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); + } else { + pr_warn("ipoib_recvq_size is out of bounds [%d-%d], setting to default %d\n", + IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE, + IPOIB_RX_RING_SIZE); + ipoib_recvq_size = IPOIB_RX_RING_SIZE; + } + + if (ipoib_sendq_size <= IPOIB_MAX_QUEUE_SIZE && + ipoib_sendq_size >= IPOIB_MIN_QUEUE_SIZE) { + ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); + ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, + IPOIB_MIN_QUEUE_SIZE); + } else { + pr_warn("ipoib_sendq_size is out of bounds [%d-%d], setting to default %d\n", + IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE, + IPOIB_TX_RING_SIZE); + ipoib_sendq_size = IPOIB_TX_RING_SIZE; + } + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); + ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); +#endif + + /* + * When copying small received packets, we only copy from the + * linear data part of the SKB, so we rely on this condition. 
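+ *
+ * BUILD_BUG_ON() turns this into a compile-time assertion: the build
+ * breaks if the condition is ever true, and no code is emitted when it
+ * is false.  A minimal stand-alone illustration of the idiom (the
+ * constants here are made up, not the driver's):
+ *
+ *	#define MY_COPYBREAK	256
+ *	#define MY_HEAD_SIZE	1024
+ *
+ *	BUILD_BUG_ON(MY_COPYBREAK > MY_HEAD_SIZE);	// compiles to nothing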
+ */ + BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + + ipoib_register_debugfs(); + + /* + * We create a global workqueue here that is used for all flush + * operations. However, if you attempt to flush a workqueue + * from a task on that same workqueue, it deadlocks the system. + * We want to be able to flush the tasks associated with a + * specific net device, so we also create a workqueue for each + * netdevice. We queue up the tasks for that device only on + * its private workqueue, and we only queue up flush events + * on our global flush workqueue. This avoids the deadlocks. + */ + ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0); + if (!ipoib_workqueue) { + ret = -ENOMEM; + goto err_fs; + } + + ib_sa_register_client(&ipoib_sa_client); + + ret = ib_register_client(&ipoib_client); + if (ret) + goto err_sa; + + ret = ipoib_netlink_init(); + if (ret) + goto err_client; + + if (ipoib_register_genl()) + pr_warn("IPoIB: ipoib_register_genl failed\n"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + register_netdevice_notifier(&ipoib_netdev_notifier); +#endif + return 0; + +err_client: + ib_unregister_client(&ipoib_client); + +err_sa: + ib_sa_unregister_client(&ipoib_sa_client); + destroy_workqueue(ipoib_workqueue); + +err_fs: + ipoib_unregister_debugfs(); + + return ret; +} + +static void __exit ipoib_cleanup_module(void) +{ +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + unregister_netdevice_notifier(&ipoib_netdev_notifier); +#endif + ipoib_unregister_genl(); + ipoib_netlink_fini(); + ib_unregister_client(&ipoib_client); + ib_sa_unregister_client(&ipoib_sa_client); + ipoib_unregister_debugfs(); + destroy_workqueue(ipoib_workqueue); +} + +module_init(ipoib_init_module); +module_exit(ipoib_cleanup_module); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_multicast.c new file mode 100644 index 0000000..9063932 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -0,0 +1,1055 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int mcast_debug_level; + +module_param(mcast_debug_level, int, 0644); +MODULE_PARM_DESC(mcast_debug_level, + "Enable multicast debug tracing if > 0"); +#endif + +struct ipoib_mcast_iter { + struct net_device *dev; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen; + unsigned int complete; + unsigned int send_only; +}; + +/* join state that allows creating mcg with sendonly member request */ +#define SENDONLY_FULLMEMBER_JOIN 8 + +/* + * This should be called with the priv->lock held + */ +static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast, + bool delay) +{ + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + + /* + * We will be scheduling *something*, so cancel whatever is + * currently scheduled first + */ + cancel_delayed_work(&priv->mcast_task); + if (mcast && delay) { + /* + * We had a failure and want to schedule a retry later + */ + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + mcast->delay_until = jiffies + (mcast->backoff * HZ); + /* + * Mark this mcast for its delay, but restart the + * task immediately. The join task will make sure to + * clear out all entries without delays, and then + * schedule itself to run again when the earliest + * delay expires + */ + queue_delayed_work(priv->wq, &priv->mcast_task, 0); + } else if (delay) { + /* + * Special case of retrying after a failure to + * allocate the broadcast multicast group, wait + * 1 second and try again + */ + queue_delayed_work(priv->wq, &priv->mcast_task, HZ); + } else + queue_delayed_work(priv->wq, &priv->mcast_task, 0); +} + +static void ipoib_mcast_free(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + int tx_dropped = 0; + + ipoib_dbg_mcast(ipoib_priv(dev), "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + /* remove all neigh connected to this mcast */ + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); + + if (mcast->ah) + ipoib_put_ah(mcast->ah); + + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + + netif_tx_lock_bh(dev); + dev->stats.tx_dropped += tx_dropped; + netif_tx_unlock_bh(dev); + + kfree(mcast); +} + +static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev) +{ + struct ipoib_mcast *mcast; + + mcast = kzalloc(sizeof(*mcast), GFP_ATOMIC); + if (!mcast) + return NULL; + + mcast->dev = dev; + mcast->created = jiffies; + mcast->delay_until = jiffies; + mcast->backoff = 1; + + INIT_LIST_HEAD(&mcast->list); + INIT_LIST_HEAD(&mcast->neigh_list); + skb_queue_head_init(&mcast->pkt_queue); + + return mcast; +} + +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node *n = priv->multicast_tree.rb_node; + + while (n) { + struct ipoib_mcast *mcast; + int ret; + + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + ret = memcmp(mgid, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if 
(ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return mcast; + } + + return NULL; +} + +static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; + + while (*n) { + struct ipoib_mcast *tmcast; + int ret; + + pn = *n; + tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); + + ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &priv->multicast_tree); + + return 0; +} + +static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, + struct ib_sa_mcmember_rec *mcmember) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_ah *ah; + struct rdma_ah_attr av; + int ret; + int set_qkey = 0; + int mtu; + + mcast->mcmember = *mcmember; + + /* Set the multicast MTU and cached Q_Key before we attach if it's + * the broadcast group. + */ + if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid))) { + spin_lock_irq(&priv->lock); + if (!priv->broadcast) { + spin_unlock_irq(&priv->lock); + return -EAGAIN; + } + /*update priv member according to the new mcast*/ + priv->broadcast->mcmember.qkey = mcmember->qkey; + priv->broadcast->mcmember.mtu = mcmember->mtu; + priv->broadcast->mcmember.traffic_class = mcmember->traffic_class; + priv->broadcast->mcmember.rate = mcmember->rate; + priv->broadcast->mcmember.sl = mcmember->sl; + priv->broadcast->mcmember.flow_label = mcmember->flow_label; + priv->broadcast->mcmember.hop_limit = mcmember->hop_limit; + /* assume if the admin and the mcast are the same both can be changed */ + mtu = rdma_mtu_enum_to_int(priv->ca, priv->port, + priv->broadcast->mcmember.mtu); + if (priv->mcast_mtu == priv->admin_mtu) + priv->admin_mtu = IPOIB_UD_MTU(mtu); + priv->mcast_mtu = IPOIB_UD_MTU(mtu); + rn->mtu = priv->mcast_mtu; + + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); + spin_unlock_irq(&priv->lock); + priv->tx_wr.remote_qkey = priv->qkey; + set_qkey = 1; + } + + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_warn(priv, "multicast group %pI6 already attached\n", + mcast->mcmember.mgid.raw); + + return 0; + } + + ret = rn->attach_mcast(dev, priv->ca, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid), + set_qkey, priv->qkey); + if (ret < 0) { + ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); + return ret; + } + } + + memset(&av, 0, sizeof(av)); + av.type = rdma_ah_find_type(priv->ca, priv->port); + rdma_ah_set_dlid(&av, be16_to_cpu(mcast->mcmember.mlid)); + rdma_ah_set_port_num(&av, priv->port); + rdma_ah_set_sl(&av, mcast->mcmember.sl); + rdma_ah_set_static_rate(&av, mcast->mcmember.rate); + + rdma_ah_set_grh(&av, &mcast->mcmember.mgid, + be32_to_cpu(mcast->mcmember.flow_label), + 0, mcast->mcmember.hop_limit, + mcast->mcmember.traffic_class); + + ah = ipoib_create_ah(dev, priv->pd, &av); + if (IS_ERR(ah)) { + ipoib_warn(priv, "ib_address_create failed %ld\n", + -PTR_ERR(ah)); + /* use original error */ + return PTR_ERR(ah); + } + spin_lock_irq(&priv->lock); + 
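+	/*
+	 * Publish the new AH under priv->lock: the TX path in
+	 * ipoib_mcast_send() tests mcast->ah under the same lock, so it
+	 * either sees NULL (and queues the skb) or a fully set up handle.
+	 */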
mcast->ah = ah; + spin_unlock_irq(&priv->lock); + + ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n", + mcast->mcmember.mgid.raw, + mcast->ah->ah, + be16_to_cpu(mcast->mcmember.mlid), + mcast->mcmember.sl); + + /* actually send any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); + + netif_tx_unlock_bh(dev); + + skb->dev = dev; + + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n", + __func__, ret); + netif_tx_lock_bh(dev); + } + netif_tx_unlock_bh(dev); + + return 0; +} + +void ipoib_mcast_carrier_on_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + carrier_on_task); + struct ib_port_attr attr; + + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); + return; + } + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. However, ipoib_stop() will attempt to flush + * the workqueue while holding the rtnl lock, so loop + * on trylock until either we get the lock or we see + * FLAG_OPER_UP go away as that signals that we are bailing + * and can safely ignore the carrier on work. + */ + while (!rtnl_trylock()) { + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + else + msleep(20); + } + if (!ipoib_cm_admin_enabled(priv->dev)) + dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); + netif_carrier_on(priv->dev); + rtnl_unlock(); +} + +static int ipoib_mcast_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? + "sendonly " : "", + mcast->mcmember.mgid.raw, status); + + /* We trap for port events ourselves. */ + if (status == -ENETRESET) { + status = 0; + goto out; + } + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (!status) { + mcast->backoff = 1; + mcast->delay_until = jiffies; + + /* + * Defer carrier on work to priv->wq to avoid a + * deadlock on rtnl_lock here. Requeue our multicast + * work too, which will end up happening right after + * our carrier on task work and will allow us to + * send out all of the non-broadcast joins + */ + if (mcast == priv->broadcast) { + spin_lock_irq(&priv->lock); + queue_work(priv->wq, &priv->carrier_on_task); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + goto out_locked; + } + } else { + bool silent_fail = + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + status == -EINVAL; + + if (mcast->logcount < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN || + silent_fail) { + ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? 
"sendonly " : "", + mcast->mcmember.mgid.raw, status); + } + + if (!silent_fail) + mcast->logcount++; + } + + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + mcast->backoff >= 2) { + /* + * We only retry sendonly joins once before we drop + * the packet and quit trying to deal with the + * group. However, we leave the group in the + * mcast list as an unjoined group. If we want to + * try joining again, we simply queue up a packet + * and restart the join thread. The empty queue + * is why the join thread ignores this group. + */ + mcast->backoff = 1; + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); + } else { + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); + goto out_locked; + } + } +out: + spin_lock_irq(&priv->lock); +out_locked: + /* + * Make sure to set mcast->mc before we clear the busy flag to avoid + * racing with code that checks for BUSY before checking mcast->mc + */ + if (status) + mcast->mc = NULL; + else + mcast->mc = multicast; + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + spin_unlock_irq(&priv->lock); + complete(&mcast->done); + + return status; +} + +/* + * Caller must hold 'priv->lock' + */ +static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_sa_multicast *multicast; + struct ib_sa_mcmember_rec rec = { + .join_state = 1 + }; + ib_sa_comp_mask comp_mask; + int ret = 0; + + if (!priv->broadcast || + !test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return -EINVAL; + + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + + ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw); + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (mcast != priv->broadcast) { + /* + * RFC 4391: + * The MGID MUST use the same P_Key, Q_Key, SL, MTU, + * and HopLimit as those used in the broadcast-GID. The rest + * of attributes SHOULD follow the values used in the + * broadcast-GID as well. + */ + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; + rec.sl = priv->broadcast->mcmember.sl; + rec.flow_label = priv->broadcast->mcmember.flow_label; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; + + /* + * Send-only IB Multicast joins work at the core IB layer but + * require specific SM support. + * We can use such joins here only if the current SM supports that feature. + * However, if not, we emulate an Ethernet multicast send, + * which does not require a multicast subscription and will + * still send properly. 
The most appropriate thing to + * do is to create the group if it doesn't exist as that + * most closely emulates the behavior, from a user space + * application perspective, of Ethernet multicast operation. + */ + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + rec.join_state = SENDONLY_FULLMEMBER_JOIN; + } + spin_unlock_irq(&priv->lock); + + multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + &rec, comp_mask, GFP_KERNEL, + ipoib_mcast_join_complete, mcast); + spin_lock_irq(&priv->lock); + if (IS_ERR(multicast)) { + ret = PTR_ERR(multicast); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + spin_unlock_irq(&priv->lock); + complete(&mcast->done); + spin_lock_irq(&priv->lock); + return ret; + } + return 0; +} + +void ipoib_mcast_join_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_task.work); + struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; + unsigned long delay_until = 0; + struct ipoib_mcast *mcast = NULL; + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + + if (ib_query_port(priv->ca, priv->port, &port_attr)) { + ipoib_dbg(priv, "ib_query_port() failed\n"); + return; + } + if (port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + priv->local_lid = port_attr.lid; + netif_addr_lock_bh(dev); + + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + netif_addr_unlock_bh(dev); + return; + } + netif_addr_unlock_bh(dev); + + spin_lock_irq(&priv->lock); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + goto out; + + if (!priv->broadcast) { + struct ipoib_mcast *broadcast; + + broadcast = ipoib_mcast_alloc(dev); + if (!broadcast) { + ipoib_warn(priv, "failed to allocate broadcast group\n"); + /* + * Restart us after a 1 second delay to retry + * creating our broadcast group and attaching to + * it. Until this succeeds, this ipoib dev is + * completely stalled (multicast wise). 
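+			 *
+			 * Calling the scheduler below with mcast == NULL and
+			 * delay == 1 takes the "special case" branch of
+			 * __ipoib_mcast_schedule_join_thread() above: a plain
+			 * one-second (HZ) requeue with no per-mcast backoff
+			 * state to touch.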
+ */ + __ipoib_mcast_schedule_join_thread(priv, NULL, 1); + goto out; + } + + memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid)); + priv->broadcast = broadcast; + + __ipoib_mcast_add(dev, priv->broadcast); + } + + if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + if (IS_ERR_OR_NULL(priv->broadcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) { + mcast = priv->broadcast; + if (mcast->backoff > 1 && + time_before(jiffies, mcast->delay_until)) { + delay_until = mcast->delay_until; + mcast = NULL; + } + } + goto out; + } + + /* + * We'll never get here until the broadcast group is both allocated + * and attached + */ +rescan: + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (IS_ERR_OR_NULL(mcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && + (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) || + !skb_queue_empty(&mcast->pkt_queue))) { + if (mcast->backoff == 1 || + time_after_eq(jiffies, mcast->delay_until)) { + /* Found the next unjoined group */ + if (ipoib_mcast_join(dev, mcast)) { + spin_unlock_irq(&priv->lock); + return; + } + /*ipoib_mcast_join() drops lock, restart foreach loop*/ + goto rescan; + } else if (!delay_until || + time_before(mcast->delay_until, delay_until)) + delay_until = mcast->delay_until; + } + } + + mcast = NULL; + ipoib_dbg_mcast(priv, "successfully started all multicast joins\n"); + +out: + if (delay_until) { + cancel_delayed_work(&priv->mcast_task); + queue_delayed_work(priv->wq, &priv->mcast_task, + delay_until - jiffies); + } + if (mcast) + ipoib_mcast_join(dev, mcast); + + spin_unlock_irq(&priv->lock); +} + +void ipoib_mcast_start_thread(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + unsigned long flags; + + ipoib_dbg_mcast(priv, "starting multicast thread\n"); + + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); +} + +void ipoib_mcast_stop_thread(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ipoib_dbg_mcast(priv, "stopping multicast thread\n"); + + cancel_delayed_work_sync(&priv->mcast_task); +} + +static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + int ret = 0; + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); + + if (!IS_ERR_OR_NULL(mcast->mc)) + ib_sa_free_multicast(mcast->mc); + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_dbg_mcast(priv, "leaving MGID %pI6\n", + mcast->mcmember.mgid.raw); + + /* Remove ourselves from the multicast group */ + ret = rn->detach_mcast(dev, priv->ca, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); + if (ret) + ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); + } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_dbg(priv, "leaving with no mcmember but not a " + "SENDONLY join\n"); + + return 0; +} + +/* + * Check if the multicast group is sendonly. If so remove it from the maps + * and add to the remove list + */ +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list) +{ + /* Is this multicast ? 
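+	 * (The 20-byte IPoIB hardware address is 4 bytes of QPN/reserved
+	 * bits followed by a 16-byte GID; callers hand this helper a
+	 * pointer to the GID part, and multicast GIDs always carry the
+	 * 0xff prefix tested below.)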
*/ + if (*mgid == 0xff) { + struct ipoib_mcast *mcast = __ipoib_mcast_find(priv->dev, mgid); + + if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, remove_list); + } + } +} + +void ipoib_mcast_remove_list(struct list_head *remove_list) +{ + struct ipoib_mcast *mcast, *tmcast; + + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + + list_for_each_entry_safe(mcast, tmcast, remove_list, list) { + ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_free(mcast); + } +} + +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + struct ipoib_mcast *mcast; + unsigned long flags; + void *mgid = daddr + 4; + + spin_lock_irqsave(&priv->lock, flags); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || + !priv->broadcast || + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + + mcast = __ipoib_mcast_find(dev, mgid); + if (!mcast || !mcast->ah) { + if (!mcast) { + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); + + mcast = ipoib_mcast_alloc(dev); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory " + "for multicast structure\n"); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, + sizeof (union ib_gid)); + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) { + /* put pseudoheader back on for next time */ + skb_push(skb, sizeof(struct ipoib_pseudo_header)); + skb_queue_tail(&mcast->pkt_queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + } + } else { + struct ipoib_neigh *neigh; + + spin_unlock_irqrestore(&priv->lock, flags); + neigh = ipoib_neigh_get(dev, daddr); + spin_lock_irqsave(&priv->lock, flags); + if (!neigh) { + neigh = ipoib_neigh_alloc(daddr, dev); + /* Make sure that the neigh will be added only + * once to mcast list. 
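+			 * A freshly allocated neigh starts with an empty
+			 * list node, so list_empty(&neigh->list) below is
+			 * the "not linked into any mcast yet" test.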
+ */ + if (neigh && list_empty(&neigh->list)) { + kref_get(&mcast->ah->ref); + neigh->ah = mcast->ah; + neigh->ah->valid = 1; + list_add_tail(&neigh->list, &mcast->neigh_list); + } + } + spin_unlock_irqrestore(&priv->lock, flags); + mcast->ah->last_send = rn->send(dev, skb, mcast->ah->ah, + IB_MULTICAST_QPN); + if (neigh) + ipoib_neigh_put(neigh); + return; + } + +unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +void ipoib_mcast_dev_flush(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + LIST_HEAD(remove_list); + struct ipoib_mcast *mcast, *tmcast; + unsigned long flags; + + mutex_lock(&priv->mcast_mutex); + ipoib_dbg_mcast(priv, "flushing multicast list\n"); + + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, &remove_list); + } + + if (priv->broadcast) { + rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); + list_add_tail(&priv->broadcast->list, &remove_list); + priv->broadcast = NULL; + } + + spin_unlock_irqrestore(&priv->lock, flags); + + ipoib_mcast_remove_list(&remove_list); + mutex_unlock(&priv->mcast_mutex); +} + +static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) +{ + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 6)) + return 0; + /* signature lower, pkey */ + if (memcmp(addr + 7, broadcast + 7, 3)) + return 0; + return 1; +} + +void ipoib_mcast_restart_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, restart_task); + struct net_device *dev = priv->dev; + struct netdev_hw_addr *ha; + struct ipoib_mcast *mcast, *tmcast; + LIST_HEAD(remove_list); + struct ib_sa_mcmember_rec rec; + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + /* + * shortcut...on shutdown flush is called next, just + * let it do all the work + */ + return; + + ipoib_dbg_mcast(priv, "restarting multicast task\n"); + + netif_addr_lock_bh(dev); + spin_lock_irq(&priv->lock); + + /* + * Unfortunately, the networking core only gives us a list of all of + * the multicast hardware addresses. 
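+	 * (That list is what the netdev_for_each_mc_addr() walk below
+	 * iterates.)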
We need to figure out which ones + * are new and which ones have been removed + */ + + /* Clear out the found flag */ + list_for_each_entry(mcast, &priv->multicast_list, list) + clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + + /* Mark all of the entries that are found or don't exist */ + netdev_for_each_mc_addr(ha, dev) { + union ib_gid mgid; + + if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) + continue; + + memcpy(mgid.raw, ha->addr + 4, sizeof(mgid)); + + mcast = __ipoib_mcast_find(dev, &mgid); + if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + struct ipoib_mcast *nmcast; + + /* ignore group which is directly joined by userspace */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n", + mgid.raw); + continue; + } + + /* Not found or send-only group, let's add a new entry */ + ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n", + mgid.raw); + + nmcast = ipoib_mcast_alloc(dev); + if (!nmcast) { + ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); + continue; + } + + set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags); + + nmcast->mcmember.mgid = mgid; + + if (mcast) { + /* Destroy the send only entry */ + list_move_tail(&mcast->list, &remove_list); + + rb_replace_node(&mcast->rb_node, + &nmcast->rb_node, + &priv->multicast_tree); + } else + __ipoib_mcast_add(dev, nmcast); + + list_add_tail(&nmcast->list, &priv->multicast_list); + } + + if (mcast) + set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + } + + /* Remove all of the entries don't exist anymore */ + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && + !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n", + mcast->mcmember.mgid.raw); + + rb_erase(&mcast->rb_node, &priv->multicast_tree); + + /* Move to the remove list */ + list_move_tail(&mcast->list, &remove_list); + } + } + + spin_unlock_irq(&priv->lock); + netif_addr_unlock_bh(dev); + + ipoib_mcast_remove_list(&remove_list); + + /* + * Double check that we are still up + */ + if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + spin_lock_irq(&priv->lock); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irq(&priv->lock); + } +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) +{ + struct ipoib_mcast_iter *iter; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->mgid.raw, 0, 16); + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) +{ + struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); + struct rb_node *n; + struct ipoib_mcast *mcast; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->multicast_tree); + + while (n) { + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)) < 0) { + iter->mgid = mcast->mcmember.mgid; + iter->created = mcast->created; + iter->queuelen = skb_queue_len(&mcast->pkt_queue); + iter->complete = !!mcast->ah; + iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); + + ret = 0; + + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void 
ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *mgid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only) +{ + *mgid = iter->mgid; + *created = iter->created; + *queuelen = iter->queuelen; + *complete = iter->complete; + *send_only = iter->send_only; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_netlink.c new file mode 100644 index 0000000..5b05cf3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2012 Mellanox Technologies. - All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include /* For ARPHRD_xxx */ +#include +#include +#include "ipoib.h" + +static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = { + [IFLA_IPOIB_PKEY] = { .type = NLA_U16 }, + [IFLA_IPOIB_MODE] = { .type = NLA_U16 }, + [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 }, +}; + +static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + u16 val; + + if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_MODE, val)) + goto nla_put_failure; + + val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags); + if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ipoib_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + u16 mode, umcast; + int ret = 0; + + if (data[IFLA_IPOIB_MODE]) { + mode = nla_get_u16(data[IFLA_IPOIB_MODE]); + if (mode == IPOIB_MODE_DATAGRAM) + ret = ipoib_set_mode(dev, "datagram\n"); + else if (mode == IPOIB_MODE_CONNECTED) + ret = ipoib_set_mode(dev, "connected\n"); + else + ret = -EINVAL; + + if (ret < 0) + goto out_err; + } + + if (data[IFLA_IPOIB_UMCAST]) { + umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]); + ipoib_set_umcast(dev, umcast); + } + +out_err: + return ret; +} + +static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct net_device *pdev; + struct ipoib_dev_priv *ppriv; + u16 child_pkey; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!pdev || pdev->type != ARPHRD_INFINIBAND) + return -ENODEV; + + ppriv = ipoib_priv(pdev); + + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) { + ipoib_warn(ppriv, "child creation disallowed for child devices\n"); + return -EINVAL; + } + + if (!data || !data[IFLA_IPOIB_PKEY]) { + ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n"); + child_pkey = ppriv->pkey; + } else + child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); + + err = ipoib_intf_init(ppriv->ca, ppriv->port, dev->name, dev); + if (err) { + ipoib_warn(ppriv, "failed to initialize pkey device\n"); + return err; + } + + err = __ipoib_vlan_add(ppriv, ipoib_priv(dev), + child_pkey, IPOIB_RTNL_CHILD); + if (err) + return err; + + if (data) { + err = ipoib_changelink(dev, tb, data, extack); + if (err) { + unregister_netdevice(dev); + return err; + } + } + + return 0; +} + +static void ipoib_del_child_link(struct net_device *dev, struct list_head *head) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (!priv->parent) + return; + + unregister_netdevice_queue(dev, head); +} + +static size_t ipoib_get_size(const struct net_device *dev) +{ + return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ + nla_total_size(2) + /* IFLA_IPOIB_MODE */ + nla_total_size(2); /* IFLA_IPOIB_UMCAST */ +} + +static struct rtnl_link_ops ipoib_link_ops __read_mostly = { + .kind = "ipoib", + .netns_refund = true, + .maxtype = IFLA_IPOIB_MAX, + .policy = ipoib_policy, + .priv_size = sizeof(struct ipoib_dev_priv), + .setup = ipoib_setup_common, + .newlink = ipoib_new_child_link, + .dellink = ipoib_del_child_link, + .changelink = ipoib_changelink, + .get_size = ipoib_get_size, + .fill_info = ipoib_fill_info, +}; + +struct rtnl_link_ops *ipoib_get_link_ops(void) +{ + 
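+	/*
+	 * Plain accessor so ipoib_main.c and ipoib_vlan.c can do
+	 *
+	 *	ndev->rtnl_link_ops = ipoib_get_link_ops();
+	 *
+	 * without the ops struct being visible outside this file.  The same
+	 * ops back the userspace netlink path, e.g. (assuming an iproute2
+	 * built with IPoIB support):
+	 *
+	 *	ip link add link ib0 name ib0.8001 type ipoib pkey 0x8001
+	 */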
return &ipoib_link_ops; +} + +int __init ipoib_netlink_init(void) +{ + return rtnl_link_register(&ipoib_link_ops); +} + +void __exit ipoib_netlink_fini(void) +{ + rtnl_link_unregister(&ipoib_link_ops); +} + +MODULE_ALIAS_RTNL_LINK("ipoib"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_verbs.c new file mode 100644 index 0000000..e0878fa --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "ipoib.h" + +int ipoib_mcast_attach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid, int set_qkey, u32 qkey) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_attr *qp_attr = NULL; + int ret; + u16 pkey_index; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ret = -ENXIO; + goto out; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + if (set_qkey) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL); + if (!qp_attr) + goto out; + + /* set correct QKey for QP */ + qp_attr->qkey = qkey; + ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); + if (ret) { + ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); + goto out; + } + } + + /* attach QP to multicast group */ + ret = ib_attach_mcast(priv->qp, mgid, mlid); + if (ret) + ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); + +out: + kfree(qp_attr); + return ret; +} + +int ipoib_mcast_detach(struct net_device *dev, struct ib_device *hca, + union ib_gid *mgid, u16 mlid) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + + ret = ib_detach_mcast(priv->qp, mgid, mlid); + + return ret; +} + +int ipoib_init_qp(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + int ret; + struct ib_qp_attr qp_attr; + int attr_mask; + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return -1; + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = 0; + qp_attr.port_num = priv->port; + qp_attr.pkey_index = priv->pkey_index; + attr_mask = + IB_QP_QKEY | + IB_QP_PORT | + IB_QP_PKEY_INDEX | + IB_QP_STATE; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTR; + /* Can't set this in a INIT->RTR transition */ + attr_mask &= ~IB_QP_PORT; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret); + goto out_fail; + } + + return 0; + +out_fail: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + return ret; +} + +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ib_qp_init_attr init_attr = { + .cap = { + .max_send_wr = priv->sendq_size, + .max_recv_wr = priv->recvq_size, + .max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge, + MAX_SKB_FRAGS + 1), + .max_recv_sge = IPOIB_UD_RX_SG, + .max_inline_data = IPOIB_MAX_INLINE_SIZE, + }, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_UD + }; + struct ib_cq_init_attr cq_attr = {}; + + int ret, size, req_vec; + int i; + static atomic_t counter; + + size = priv->recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) { + size += priv->sendq_size; + if (ipoib_cm_has_srq(dev)) + size += priv->recvq_size + 1; /* 1 extra for rx_drain_qp */ + else + size += priv->recvq_size * ipoib_max_conn_qp; + } else + if (ret != -EOPNOTSUPP) + return ret; + + req_vec = atomic_inc_return(&counter) * 2; + cq_attr.cqe = size; + cq_attr.comp_vector = 
req_vec % priv->ca->num_comp_vectors; + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_rx_completion, NULL, + priv, &cq_attr); + if (IS_ERR(priv->recv_cq)) { + pr_warn("%s: failed to create receive CQ\n", ca->name); + goto out_cm_dev_cleanup; + } + + cq_attr.cqe = priv->sendq_size; + cq_attr.comp_vector = (req_vec + 1) % priv->ca->num_comp_vectors; + priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_tx_completion, NULL, + priv, &cq_attr); + if (IS_ERR(priv->send_cq)) { + pr_warn("%s: failed to create send CQ\n", ca->name); + goto out_free_recv_cq; + } + + if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + init_attr.send_cq = priv->send_cq; + init_attr.recv_cq = priv->recv_cq; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) + init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + + if (priv->hca_caps & IB_DEVICE_RDMA_NETDEV_OPA) + init_attr.create_flags |= IB_QP_CREATE_NETDEV_USE; + + priv->qp = ib_create_qp(priv->pd, &init_attr); + if (IS_ERR(priv->qp)) { + pr_warn("%s: failed to create QP\n", ca->name); + goto out_free_send_cq; + } + + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) + priv->tx_sge[i].lkey = priv->pd->local_dma_lkey; + + priv->tx_wr.wr.opcode = IB_WR_SEND; + priv->tx_wr.wr.sg_list = priv->tx_sge; + priv->tx_wr.wr.send_flags = IB_SEND_SIGNALED; + + priv->rx_sge[0].lkey = priv->pd->local_dma_lkey; + + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + + priv->rx_wr.next = NULL; + priv->rx_wr.sg_list = priv->rx_sge; + + if (init_attr.cap.max_send_sge > 1) + dev->features |= NETIF_F_SG; + + priv->max_send_sge = init_attr.cap.max_send_sge; + + return 0; + +out_free_send_cq: + ib_destroy_cq(priv->send_cq); + +out_free_recv_cq: + ib_destroy_cq(priv->recv_cq); + +out_cm_dev_cleanup: + ipoib_cm_dev_cleanup(dev); + + return -ENODEV; +} + +void ipoib_transport_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + if (priv->qp) { + if (ib_destroy_qp(priv->qp)) + ipoib_warn(priv, "ib_qp_destroy failed\n"); + + priv->qp = NULL; + } + + ib_destroy_cq(priv->send_cq); + ib_destroy_cq(priv->recv_cq); +} + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct ipoib_dev_priv *priv = + container_of(handler, struct ipoib_dev_priv, event_handler); + + if (record->element.port_num != priv->port) + return; + + ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, + dev_name(&record->device->dev), record->element.port_num); + + if (record->event == IB_EVENT_CLIENT_REREGISTER) { + queue_work(ipoib_workqueue, &priv->flush_light); + } else if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_normal); + } else if (record->event == IB_EVENT_PKEY_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_heavy); + } else if (record->event == IB_EVENT_GID_CHANGE && + !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + queue_work(ipoib_workqueue, &priv->flush_light); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_vlan.c 
b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_vlan.c new file mode 100644 index 0000000..0322dc7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include + +#include + +#include "ipoib.h" + +static ssize_t parent_show(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + return sysfs_emit(buf, "%s\n", priv->parent->name); +} +static DEVICE_ATTR_RO(parent); + +static bool is_child_unique(struct ipoib_dev_priv *ppriv, + struct ipoib_dev_priv *priv) +{ + struct ipoib_dev_priv *tpriv; + + ASSERT_RTNL(); + + /* + * Since the legacy sysfs interface uses pkey for deletion it cannot + * support more than one interface with the same pkey, it creates + * ambiguity. The RTNL interface deletes using the netdev so it does + * not have a problem to support duplicated pkeys. + */ + if (priv->child_type != IPOIB_LEGACY_CHILD) + return true; + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == priv->pkey) + return false; + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == priv->pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) + return false; + } + + return true; +} + +/* + * NOTE: If this function fails then the priv->dev will remain valid, however + * priv will have been freed and must not be touched by caller in the error + * case. + * + * If (ndev->reg_state == NETREG_UNINITIALIZED) then it is up to the caller to + * free the net_device (just as rtnl_newlink does) otherwise the net_device + * will be freed when the rtnl is unlocked. 
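 *
 * (Illustrative note, not part of the patch.)  In practice this contract
 * means a caller that allocated the net_device itself cleans up like so,
 * exactly as ipoib_vlan_add() further below does:
 *
 *	result = __ipoib_vlan_add(ppriv, priv, pkey, type);
 *	if (result && ndev->reg_state == NETREG_UNINITIALIZED)
 *		free_netdev(ndev);
 *
 * On success, or once register_netdevice() has taken ownership, the
 * net_device is instead freed by the core when the rtnl lock is dropped.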
+ */ +int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, + u16 pkey, int type) +{ + struct net_device *ndev = priv->dev; + int result; + struct rdma_netdev *rn = netdev_priv(ndev); + + ASSERT_RTNL(); + + /* + * We do not need to touch priv if register_netdevice fails, so just + * always use this flow. + */ + ndev->priv_destructor = ipoib_intf_free; + + /* + * Racing with unregister of the parent must be prevented by the + * caller. + */ + WARN_ON(ppriv->dev->reg_state != NETREG_REGISTERED); + + if (pkey == 0 || pkey == 0x8000) { + result = -EINVAL; + goto out_early; + } + + rn->mtu = priv->mcast_mtu; + + priv->parent = ppriv->dev; + priv->pkey = pkey; + priv->child_type = type; + + if (!is_child_unique(ppriv, priv)) { + result = -ENOTUNIQ; + goto out_early; + } + + result = register_netdevice(ndev); + if (result) { + ipoib_warn(priv, "failed to initialize; error %i", result); + + /* + * register_netdevice sometimes calls priv_destructor, + * sometimes not. Make sure it was done. + */ + goto out_early; + } + + /* RTNL childs don't need proprietary sysfs entries */ + if (type == IPOIB_LEGACY_CHILD) { + if (ipoib_cm_add_mode_attr(ndev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(ndev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(ndev)) + goto sysfs_failed; + + if (device_create_file(&ndev->dev, &dev_attr_parent)) + goto sysfs_failed; + } + + return 0; + +sysfs_failed: + unregister_netdevice(priv->dev); + return -ENOMEM; + +out_early: + if (ndev->priv_destructor) + ndev->priv_destructor(ndev); + return result; +} + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + struct net_device *ndev; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (pdev->reg_state != NETREG_REGISTERED) { + rtnl_unlock(); + return -EPERM; + } + + ppriv = ipoib_priv(pdev); + + snprintf(intf_name, sizeof(intf_name), "%s.%04x", + ppriv->dev->name, pkey); + + ndev = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); + if (IS_ERR(ndev)) { + result = PTR_ERR(ndev); + goto out; + } + priv = ipoib_priv(ndev); + + ndev->rtnl_link_ops = ipoib_get_link_ops(); + + result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + + if (result && ndev->reg_state == NETREG_UNINITIALIZED) + free_netdev(ndev); + +out: + rtnl_unlock(); + + return result; +} + +struct ipoib_vlan_delete_work { + struct work_struct work; + struct net_device *dev; +}; + +/* + * sysfs callbacks of a netdevice cannot obtain the rtnl lock as + * unregister_netdev ultimately deletes the sysfs files while holding the rtnl + * lock. This deadlocks the system. + * + * A callback can use rtnl_trylock to avoid the deadlock but it cannot call + * unregister_netdev as that internally takes and releases the rtnl_lock. So + * instead we find the netdev to unregister and then do the actual unregister + * from the global work queue where we can obtain the rtnl_lock safely. 
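 *
 * (Illustrative note, not part of the patch.)  The trylock half of the
 * idiom, as used by ipoib_vlan_add() above and ipoib_vlan_delete() below,
 * is simply:
 *
 *	if (!rtnl_trylock())
 *		return restart_syscall();
 *	...
 *	rtnl_unlock();
 *
 * restart_syscall() arranges for the system call that entered the sysfs
 * handler to be restarted, so the operation is retried later instead of
 * sleeping on the rtnl lock inside the handler.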
+ */ +static void ipoib_vlan_delete_task(struct work_struct *work) +{ + struct ipoib_vlan_delete_work *pwork = + container_of(work, struct ipoib_vlan_delete_work, work); + struct net_device *dev = pwork->dev; + + rtnl_lock(); + + /* Unregistering tasks can race with another task or parent removal */ + if (dev->reg_state == NETREG_REGISTERED) { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); + } + + rtnl_unlock(); + + kfree(pwork); +} + +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) +{ + struct ipoib_dev_priv *ppriv, *priv, *tpriv; + int rc; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (pdev->reg_state != NETREG_REGISTERED) { + rtnl_unlock(); + return -EPERM; + } + + ppriv = ipoib_priv(pdev); + + rc = -ENODEV; + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey && + priv->child_type == IPOIB_LEGACY_CHILD) { + struct ipoib_vlan_delete_work *work; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + rc = -ENOMEM; + goto out; + } + + down_write(&ppriv->vlan_rwsem); + list_del_init(&priv->list); + up_write(&ppriv->vlan_rwsem); + work->dev = priv->dev; + INIT_WORK(&work->work, ipoib_vlan_delete_task); + queue_work(ipoib_workqueue, &work->work); + + rc = 0; + break; + } + } + +out: + rtnl_unlock(); + + return rc; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Kconfig new file mode 100644 index 0000000..9d9a9dc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Kconfig @@ -0,0 +1,50 @@ +config INFINIBAND_IPOIB + tristate "IP-over-InfiniBand" + depends on NETDEVICES && INET && (IPV6 || IPV6=n) + select INET_LRO + ---help--- + Support for the IP-over-InfiniBand protocol (IPoIB). This + transports IP packets over InfiniBand so you can use your IB + device as a fancy NIC. + + See Documentation/infiniband/ipoib.txt for more information + +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB + default n + ---help--- + This option enables support for IPoIB connected mode. After + enabling this option, you need to switch to connected mode + through /sys/class/net/ibXXX/mode to actually create + connections, and then increase the interface MTU with + e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some packet + drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. + +config INFINIBAND_IPOIB_DEBUG + bool "IP-over-InfiniBand debugging" if EMBEDDED + depends on INFINIBAND_IPOIB + default y + ---help--- + This option causes debugging code to be compiled into the + IPoIB driver. The output can be turned on via the + debug_level and mcast_debug_level module parameters (which + can also be set after the driver is loaded through sysfs). + + This option also creates a directory tree under ipoib/ in + debugfs, which contains files that expose debugging + information about IB multicast groups used by the IPoIB + driver. + +config INFINIBAND_IPOIB_DEBUG_DATA + bool "IP-over-InfiniBand data path debugging" + depends on INFINIBAND_IPOIB_DEBUG + ---help--- + This option compiles debugging code into the data path + of the IPoIB driver. 
The output can be turned on via the + data_debug_level module parameter; however, even with output + turned off, this debugging code will have some performance + impact. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Makefile new file mode 100644 index 0000000..3006845 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o + +ib_ipoib-y := ipoib_main.o \ + ipoib_ib.o \ + ipoib_multicast.o \ + ipoib_verbs.o \ + ipoib_vlan.o \ + ipoib_ethtool.o \ + inet_lro.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/inet_lro.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/inet_lro.c new file mode 100644 index 0000000..daaadea --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/inet_lro.c @@ -0,0 +1,605 @@ +/* + * linux/net/ipv4/inet_lro.c + * + * Large Receive Offload (ipv4 / tcp) + * + * (C) Copyright IBM Corp. 2007 + * + * Authors: + * Jan-Bernd Themann + * Christoph Raisch + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_AUTHOR("Jan-Bernd Themann "); +MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); + +#define TCP_HDR_LEN(tcph) (tcph->doff << 2) +#define IP_HDR_LEN(iph) (iph->ihl << 2) +#define TCP_PAYLOAD_LENGTH(iph, tcph) \ + (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) + +#define IPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_W_TIMESTAMP 8 + +#define LRO_MAX_PG_HLEN 64 + +#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } + +/* + * Basic tcp checks whether packet is suitable for LRO + */ + +static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, + int len, struct net_lro_desc *lro_desc) +{ + /* check ip header: don't aggregate padded frames */ + if (ntohs(iph->tot_len) != len) + return -1; + + if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) + return -1; + + if (iph->ihl != IPH_LEN_WO_OPTIONS) + return -1; + + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack + || tcph->rst || tcph->syn || tcph->fin) + return -1; + + if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) + return -1; + + if (tcph->doff != TCPH_LEN_WO_OPTIONS + && tcph->doff != TCPH_LEN_W_TIMESTAMP) + return -1; + + /* check tcp options (only timestamp allowed) */ + if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { + __be32 *topt = (__be32 *)(tcph + 1); + + if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return -1; + + /* timestamp should be in right order */ + topt++; + if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), + ntohl(*topt))) + return -1; + + /* timestamp reply should not be zero */ + topt++; + if (*topt == 0) + return -1; + } + + return 0; +} + +static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) +{ + struct iphdr *iph = lro_desc->iph; + struct tcphdr *tcph = lro_desc->tcph; + __be32 *p; + __wsum tcp_hdr_csum; + + tcph->ack_seq = lro_desc->tcp_ack; + tcph->window = lro_desc->tcp_window; + + if (lro_desc->tcp_saw_tstamp) { + p = (__be32 *)(tcph + 1); + *(p+2) = lro_desc->tcp_rcv_tsecr; + } + + iph->tot_len = htons(lro_desc->ip_tot_len); + + iph->check = 0; + iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); + + tcph->check = 0; + tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0); + lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); + tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + lro_desc->ip_tot_len - + IP_HDR_LEN(iph), IPPROTO_TCP, + lro_desc->data_csum); +} + +static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) +{ + __wsum tcp_csum; + __wsum tcp_hdr_csum; + __wsum tcp_ps_hdr_csum; + + tcp_csum = ~csum_unfold(tcph->check); + tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum); + + tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + len + TCP_HDR_LEN(tcph), + IPPROTO_TCP, 0); + + return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), + tcp_ps_hdr_csum); +} + +static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph, + u16 vlan_tag, struct vlan_group *vgrp) +{ + int nr_frags; + __be32 *ptr; + u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + nr_frags = skb_shinfo(skb)->nr_frags; + lro_desc->parent = skb; + lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); + lro_desc->iph = iph; + lro_desc->tcph = tcph; + lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; + lro_desc->tcp_ack = 
tcph->ack_seq; + lro_desc->tcp_window = tcph->window; + + lro_desc->pkt_aggr_cnt = 1; + lro_desc->ip_tot_len = ntohs(iph->tot_len); + + if (tcph->doff == 8) { + ptr = (__be32 *)(tcph+1); + lro_desc->tcp_saw_tstamp = 1; + lro_desc->tcp_rcv_tsval = *(ptr+1); + lro_desc->tcp_rcv_tsecr = *(ptr+2); + } + + lro_desc->mss = tcp_data_len; + lro_desc->vgrp = vgrp; + lro_desc->vlan_tag = vlan_tag; + lro_desc->active = 1; + + lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, + tcp_data_len); +} + +static inline void lro_clear_desc(struct net_lro_desc *lro_desc) +{ + memset(lro_desc, 0, sizeof(struct net_lro_desc)); +} + +static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, + struct tcphdr *tcph, int tcp_data_len) +{ + struct sk_buff *parent = lro_desc->parent; + __be32 *topt; + + lro_desc->pkt_aggr_cnt++; + lro_desc->ip_tot_len += tcp_data_len; + lro_desc->tcp_next_seq += tcp_data_len; + lro_desc->tcp_window = tcph->window; + lro_desc->tcp_ack = tcph->ack_seq; + + /* don't update tcp_rcv_tsval, would not work with PAWS */ + if (lro_desc->tcp_saw_tstamp) { + topt = (__be32 *) (tcph + 1); + lro_desc->tcp_rcv_tsecr = *(topt + 2); + } + + lro_desc->data_csum = csum_block_add(lro_desc->data_csum, + lro_tcp_data_csum(iph, tcph, + tcp_data_len), + parent->len); + + parent->len += tcp_data_len; + parent->data_len += tcp_data_len; + if (tcp_data_len > lro_desc->mss) + lro_desc->mss = tcp_data_len; +} + +static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *parent = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb_pull(skb, (skb->len - tcp_data_len)); + parent->truesize += skb->truesize; + + if (lro_desc->last_skb) + lro_desc->last_skb->next = skb; + else + skb_shinfo(parent)->frag_list = skb; + + lro_desc->last_skb = skb; +} + +static void lro_add_frags(struct net_lro_desc *lro_desc, + int len, int hlen, int truesize, + struct skb_frag_struct *skb_frags, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *skb = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb->truesize += truesize; + + skb_frags[0].page_offset += hlen; + skb_frags[0].size -= hlen; + + while (tcp_data_len > 0) { + *(lro_desc->next_frag) = *skb_frags; + tcp_data_len -= skb_frags->size; + lro_desc->next_frag++; + skb_frags++; + skb_shinfo(skb)->nr_frags++; + } +} + +static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, + struct iphdr *iph, + struct tcphdr *tcph) +{ + if ((lro_desc->iph->saddr != iph->saddr) + || (lro_desc->iph->daddr != iph->daddr) + || (lro_desc->tcph->source != tcph->source) + || (lro_desc->tcph->dest != tcph->dest)) + return -1; + return 0; +} + +static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_arr, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc = NULL; + struct net_lro_desc *tmp; + int max_desc = lro_mgr->max_desc; + int i; + + for (i = 0; i < max_desc; i++) { + tmp = &lro_arr[i]; + if (tmp->active) + if (!lro_check_tcp_conn(tmp, iph, tcph)) { + lro_desc = tmp; + goto out; + } + } + + for (i = 0; i < max_desc; i++) { + if (!lro_arr[i].active) { + lro_desc = &lro_arr[i]; + goto out; + } + } + + LRO_INC_STATS(lro_mgr, no_desc); +out: + return lro_desc; +} + +static void lro_flush(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_desc) +{ + if 
(lro_desc->pkt_aggr_cnt > 1) + lro_update_tcp_ip_header(lro_desc); + + skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; + + if (lro_desc->vgrp) { + if (lro_mgr->features & LRO_F_NAPI) + vlan_hwaccel_receive_skb(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + else + vlan_hwaccel_rx(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + + } else { + if (lro_mgr->features & LRO_F_NAPI) + netif_receive_skb(lro_desc->parent); + else + netif_rx(lro_desc->parent); + } + + LRO_INC_STATS(lro_mgr, flushed); + lro_clear_desc(lro_desc); +} + +static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, + struct vlan_group *vgrp, u16 vlan_tag, void *priv) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + u64 flags; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_skb_header + || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, + &flags, priv)) + goto out; + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) + && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + vlan_hdr_len = VLAN_HLEN; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) + goto out; + + skb->ip_summed = lro_mgr->ip_summed_aggr; + lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp); + LRO_INC_STATS(lro_mgr, aggregated); + return 0; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) + goto out2; + + lro_add_packet(lro_desc, skb, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return 0; + +out2: /* send aggregated SKBs to stack */ + lro_flush(lro_mgr, lro_desc); + +out: /* Original SKB has to be posted to stack */ + skb->ip_summed = lro_mgr->ip_summed; + return 1; +} + + +static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + void *mac_hdr, + int hlen, __wsum sum, + u32 ip_summed) +{ + struct sk_buff *skb; + struct skb_frag_struct *skb_frags; + int data_len = len; + int hdr_len = min(len, hlen); + + skb = netdev_alloc_skb(lro_mgr->dev, hlen + lro_mgr->frag_align_pad); + if (!skb) + return NULL; + + skb_reserve(skb, lro_mgr->frag_align_pad); + skb->len = len; + skb->data_len = len - hdr_len; + skb->truesize += true_size; + skb->tail += hdr_len; + + memcpy(skb->data, mac_hdr, hdr_len); + + skb_frags = skb_shinfo(skb)->frags; + while (data_len > 0) { + *skb_frags = *frags; + data_len -= frags->size; + skb_frags++; + frags++; + skb_shinfo(skb)->nr_frags++; + } + + skb_shinfo(skb)->frags[0].page_offset += hdr_len; + skb_shinfo(skb)->frags[0].size -= hdr_len; + + skb->ip_summed = ip_summed; + skb->csum = sum; + skb->protocol = eth_type_trans(skb, lro_mgr->dev); + return skb; +} + +static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + struct sk_buff *skb; + u64 flags; + void *mac_hdr; + int mac_hdr_len; + int hdr_len = LRO_MAX_PG_HLEN; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_frag_header + || lro_mgr->get_frag_header(frags, (void 
*)&mac_hdr, (void *)&iph, + (void *)&tcph, &flags, priv)) { + mac_hdr = page_address(frags->page) + frags->page_offset; + goto out1; + } + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out1; + + hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); + mac_hdr_len = (int)((void *)(iph) - mac_hdr); + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out1; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) + goto out1; + + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, 0, lro_mgr->ip_summed_aggr); + if (!skb) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) + && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + vlan_hdr_len = VLAN_HLEN; + + iph = (void *)(skb->data + vlan_hdr_len); + tcph = (void *)((u8 *)skb->data + vlan_hdr_len + + IP_HDR_LEN(iph)); + + lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL); + LRO_INC_STATS(lro_mgr, aggregated); + return NULL; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) + goto out2; + + lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return NULL; + +out2: /* send aggregated packets to the stack */ + lro_flush(lro_mgr, lro_desc); + +out1: /* Original packet has to be posted to the stack */ + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, sum, lro_mgr->ip_summed); +out: + return skb; +} + +void lro_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) { + if (lro_mgr->features & LRO_F_NAPI) + netif_receive_skb(skb); + else + netif_rx(skb); + } +} +EXPORT_SYMBOL(lro_receive_skb); + +void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + struct vlan_group *vgrp, + u16 vlan_tag, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) { + if (lro_mgr->features & LRO_F_NAPI) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); + } +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb); + +void lro_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0, + priv, sum); + if (!skb) + return; + + if (lro_mgr->features & LRO_F_NAPI) + netif_receive_skb(skb); + else + netif_rx(skb); +} +EXPORT_SYMBOL(lro_receive_frags); + +void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp, + vlan_tag, priv, sum); + if (!skb) + return; + + if (lro_mgr->features & LRO_F_NAPI) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags); + +void lro_flush_all(struct net_lro_mgr *lro_mgr) +{ + int i; + struct net_lro_desc *lro_desc = lro_mgr->lro_arr; + + for (i = 0; i < lro_mgr->max_desc; i++) { + if (lro_desc[i].active) + lro_flush(lro_mgr, &lro_desc[i]); + } +} 
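The exported functions above (lro_receive_skb, lro_receive_frags, lro_flush_all,
lro_flush_pkt) are the whole consumer-facing surface of this LRO engine. Below is
a minimal sketch of how a driver might wire it up; it is not part of the patch:
example_get_skb_header and example_lro_setup are hypothetical names, the callback
body is simplified, and it assumes skb->data already points at the IP header. The
net_lro_mgr fields and IPOIB_* constants are the ones used or defined elsewhere in
this patch.

#include <linux/inet_lro.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include "ipoib.h"	/* struct ipoib_dev_priv, IPOIB_* constants (later in this patch) */

/* Hypothetical header-parsing callback; a real driver validates the frame
 * more carefully before handing it to the LRO engine. */
static int example_get_skb_header(struct sk_buff *skb, void **iphdr,
				  void **tcph, u64 *hdr_flags, void *priv)
{
	struct iphdr *iph = (struct iphdr *)skb->data; /* assumes data == IP header */

	if (iph->protocol != IPPROTO_TCP)
		return -1;	/* not aggregatable: falls back to netif_receive_skb() */

	*iphdr = iph;
	*tcph = (u8 *)iph + (iph->ihl << 2);
	*hdr_flags = LRO_IPV4 | LRO_TCP;
	return 0;
}

/* One-time setup, mirroring the net_lro_mgr fields the engine above reads. */
static void example_lro_setup(struct ipoib_dev_priv *priv)
{
	priv->lro.lro_mgr.max_aggr	 = IPOIB_LRO_MAX_AGGR;
	priv->lro.lro_mgr.max_desc	 = IPOIB_MAX_LRO_DESCRIPTORS;
	priv->lro.lro_mgr.lro_arr	 = priv->lro.lro_desc;
	priv->lro.lro_mgr.get_skb_header = example_get_skb_header;
	priv->lro.lro_mgr.features	 = LRO_F_NAPI;
	priv->lro.lro_mgr.dev		 = priv->dev;
	priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
}

In the NAPI poll loop such a driver would then call
lro_receive_skb(&priv->lro.lro_mgr, skb, NULL) for each received packet and
lro_flush_all(&priv->lro.lro_mgr) once the queue is drained, so partially
aggregated sessions are not held back from the stack.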
+EXPORT_SYMBOL(lro_flush_all); + +void lro_flush_pkt(struct net_lro_mgr *lro_mgr, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (lro_desc->active) + lro_flush(lro_mgr, lro_desc); +} +EXPORT_SYMBOL(lro_flush_pkt); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib.h new file mode 100644 index 0000000..7b94026 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib.h @@ -0,0 +1,869 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _IPOIB_H +#define _IPOIB_H + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +#ifndef __rcu +#define __rcu +#endif + +/* constants */ + +enum ipoib_flush_level { + IPOIB_FLUSH_LIGHT, + IPOIB_FLUSH_NORMAL, + IPOIB_FLUSH_HEAVY +}; + +enum { + IPOIB_ENCAP_LEN = 4, + + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, + IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ + + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_RX_RING_SIZE = 512, + IPOIB_TX_RING_SIZE = 512, + IPOIB_MAX_QUEUE_SIZE = 8192, + IPOIB_MIN_QUEUE_SIZE = 2, + IPOIB_CM_MAX_CONN_QP = 4096, + + IPOIB_NUM_WC = 4, + + IPOIB_MAX_PATH_REC_QUEUE = 3, + IPOIB_MAX_MCAST_QUEUE = 3, + + IPOIB_FLAG_OPER_UP = 0, + IPOIB_FLAG_INITIALIZED = 1, + IPOIB_FLAG_ADMIN_UP = 2, + IPOIB_PKEY_ASSIGNED = 3, + IPOIB_PKEY_STOP = 4, + IPOIB_FLAG_SUBINTERFACE = 5, + IPOIB_MCAST_RUN = 6, + IPOIB_STOP_REAPER = 7, + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, + IPOIB_FLAG_CSUM = 11, + IPOIB_MCAST_RUN_GC = 12, + IPOIB_FLAG_AUTO_MODER = 13, /*indicates moderation is running*/ + IPOIB_STOP_NEIGH_GC = 14, + IPOIB_NEIGH_TBL_FLUSH = 15, + + IPOIB_MAX_BACKOFF_SECONDS = 16, + IPOIB_FLAG_MODULE_DOWN = 17, /*indicates module is his way down*/ + + IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ + IPOIB_MCAST_FLAG_SENDONLY = 1, + IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, + IPOIB_MCAST_UMCAST_ATTACHED = 5, + + IPOIB_MAX_LRO_DESCRIPTORS = 8, + IPOIB_LRO_MAX_AGGR = 64, + + MAX_SEND_CQE = 16, + IPOIB_CM_COPYBREAK = 256, +}; + +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_OP_CM (1ul << 30) +#else +#define IPOIB_OP_CM (0) +#endif + +/* structs */ + +struct ipoib_header { + __be16 proto; + u16 reserved; +}; + +/*copy it from kernel 2.6.2X */ +struct qdisc_skb_cb { + unsigned int pkt_len; + char data[]; +}; + +struct ipoib_cb { + struct qdisc_skb_cb qdisc_cb; + u8 hwaddr[INFINIBAND_ALEN]; +}; + +/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ +struct ipoib_mcast { + struct ib_sa_mcmember_rec mcmember; + struct ib_sa_multicast *mc; + struct ipoib_ah *ah; + + struct rb_node rb_node; + struct list_head list; + + unsigned long created; + unsigned long used; + unsigned long backoff; + + unsigned long flags; + unsigned char logcount; + + struct list_head neigh_list; + + struct sk_buff_head pkt_queue; + + struct net_device *dev; + struct completion done; +}; + +struct ipoib_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; +}; + +struct ipoib_tx_buf { + struct sk_buff *skb; + u64 mapping[MAX_SKB_FRAGS + 1]; +}; + +struct ipoib_cm_tx_buf { + struct sk_buff *skb; + u64 mapping; +}; + +/* in order to call dst->ops->update_pmtu out of spin-lock*/ +struct ipoib_pmtu_update { + struct work_struct work; + struct sk_buff *skb; + unsigned int mtu; +}; + +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte MUST be ignored on receive */ + __be32 mtu; +}; + +/* + * Quoting 10.3.1 Queue Pair and EE Context States: + * + * Note, for QPs that are associated with an SRQ, the Consumer should take the + * QP through the Error State before invoking a Destroy QP or a Modify QP to the + * 
Reset State. The Consumer may invoke the Destroy QP without first performing + * a Modify QP to the Error State and waiting for the Affiliated Asynchronous + * Last WQE Reached Event. However, if the Consumer does not wait for the + * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment + * leakage may occur. Therefore, it is good programming practice to tear down a + * QP that is associated with an SRQ by using the following process: + * + * - Put the QP in the Error State + * - Wait for the Affiliated Asynchronous Last WQE Reached Event; + * - either: + * drain the CQ by invoking the Poll CQ verb and either wait for CQ + * to be empty or the number of Poll CQ operations has exceeded + * CQ capacity size; + * - or + * post another WR that completes on the same CQ and wait for this + * WR to return as a WC; + * - and then invoke a Destroy QP or Reset QP. + * + * We use the second option and wait for a completion on the + * same CQ before destroying QPs attached to our SRQ. + */ + +enum ipoib_cm_state { + IPOIB_CM_RX_LIVE, + IPOIB_CM_RX_ERROR, /* Ignored by stale task */ + IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */ +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct ipoib_cm_rx_buf *rx_ring; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; + enum ipoib_cm_state state; + int recv_count; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_path *path; + struct ipoib_cm_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + unsigned long flags; + u32 mtu; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; /* state: LIVE */ + struct list_head rx_error_list; /* state: ERROR */ + struct list_head rx_flush_list; /* state: FLUSH, drain not started */ + struct list_head rx_drain_list; /* state: FLUSH, drain started */ + struct list_head rx_reap_list; /* state: FLUSH, drain done */ + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct work_struct rx_reap_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; + int nonsrq_conn_qp; + int max_cm_mtu; + int num_frags; +}; + +/* adaptive moderation parameters: */ +enum { + /* Target number of packets to coalesce with interrupt moderation */ + IPOIB_RX_COAL_TARGET = 88, + IPOIB_RX_COAL_TIME = 16, + IPOIB_TX_COAL_PKTS = 5, + IPOIB_TX_COAL_TIME = 0x80, + IPOIB_RX_RATE_LOW = 400000, + IPOIB_RX_COAL_TIME_LOW = 0, + IPOIB_RX_RATE_HIGH = 450000, + IPOIB_RX_COAL_TIME_HIGH = 128, + IPOIB_RX_SIZE_THRESH = 1024, + IPOIB_RX_RATE_THRESH = 1000000 / IPOIB_RX_COAL_TIME_HIGH, + IPOIB_SAMPLE_INTERVAL = 0, + IPOIB_AVG_PKT_SMALL = 256, + IPOIB_AUTO_CONF = 0xffff, + ADAPT_MODERATION_DELAY = HZ / 4, +}; + +struct ipoib_ethtool_st { + __u32 rx_max_coalesced_frames; + __u32 rx_coalesce_usecs; +/* u16 coalesce_usecs; + u16 max_coalesced_frames; +*/ + __u32 pkt_rate_low; + __u32 pkt_rate_high; + __u32 rx_coalesce_usecs_low; + __u32 rx_coalesce_usecs_high; + __u32 rate_sample_interval; + __u32 use_adaptive_rx_coalesce; + int last_moder_time; + u16 sample_interval; + 
unsigned long last_moder_jiffies; + unsigned long last_moder_packets; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes; +}; + +struct ipoib_lro { + struct net_lro_mgr lro_mgr; + struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; +}; + +#define SOCK_ACCL_POLL_TCP 1UL << 28 +#define SOCK_ACCL_POLL_UDP 1UL << 29 + +struct sock_accl_ops { + void (*poll)(struct net_device *dev, int ring_num); + void (*get_tcp_ring)(struct net_device *dev, u8 *poll_ring, + u32 saddr, u32 daddr, u16 sport, u16 dport); + void (*get_udp_rings)(struct net_device *dev, u8 *poll_rings, + u8 *num_rings); +}; + +struct ipoib_neigh_table; +struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; + struct ipoib_neigh __rcu **buckets; + struct rcu_head rcu; + u32 mask; + u32 size; +}; + +struct ipoib_neigh_table { + struct ipoib_neigh_hash __rcu *htbl; + atomic_t entries; + struct completion flushed; + struct completion deleted; +}; + +/* + * Device private locking: network stack tx_lock protects members used + * in TX fast path, lock protects everything else. lock nests inside + * of tx_lock (ie tx_lock must be acquired first if needed). + */ +struct ipoib_dev_priv { + + struct sock_accl_ops accl_priv; + spinlock_t rx_ring_lock; + + spinlock_t lock; + + struct net_device *dev; + + unsigned long flags; + + struct mutex vlan_mutex; + + struct rb_root path_tree; + struct list_head path_list; + + struct ipoib_neigh_table ntbl; + + struct ipoib_mcast *broadcast; + struct list_head multicast_list; + struct rb_root multicast_tree; + + struct delayed_work pkey_poll_task; + struct delayed_work mcast_join_task; + struct delayed_work mcast_leave_task; + struct work_struct carrier_on_task; + struct work_struct flush_light; + struct work_struct flush_normal; + struct work_struct flush_heavy; + struct work_struct restart_task; + struct delayed_work ah_reap_task; + struct delayed_work adaptive_moder_task; + struct delayed_work neigh_reap_task; + + struct ib_device *ca; + u8 port; + u16 pkey; + u16 pkey_index; + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_cq *recv_cq; + struct ib_cq *send_cq; + struct ib_qp *qp; + u32 qkey; + + union ib_gid local_gid; + u16 local_lid; + + unsigned int admin_mtu; + unsigned int mcast_mtu; + unsigned int max_ib_mtu; + + struct ipoib_rx_buf *rx_ring; + + struct ipoib_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; + struct ib_send_wr tx_wr; + unsigned tx_outstanding; + struct ib_wc send_wc[MAX_SEND_CQE]; + + struct ib_recv_wr rx_wr; + struct ib_sge rx_sge[IPOIB_UD_RX_SG]; + + struct ib_wc ibwc[IPOIB_NUM_WC]; + + struct list_head dead_ahs; + + struct ib_event_handler event_handler; + + struct net_device_stats stats; + + struct net_device *parent; + struct list_head child_intfs; + struct list_head list; + int child_index; + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + struct list_head fs_list; + struct dentry *mcg_dentry; + struct dentry *path_dentry; +#endif + int hca_caps; + struct ipoib_ethtool_st ethtool; + struct timer_list poll_timer; + + struct ipoib_lro lro; + struct mutex state_lock; +}; + +struct ipoib_ah { + struct net_device *dev; + struct ib_ah *ah; + struct list_head list; + struct kref ref; + unsigned last_send; +}; + +struct ipoib_path { + struct net_device *dev; + struct ib_sa_path_rec pathrec; + struct ipoib_ah *ah; + struct sk_buff_head queue; + + struct list_head neigh_list; + + int query_id; + struct ib_sa_query *query; + struct 
completion done; + + struct rb_node rb_node; + struct list_head list; + int valid; +}; + +struct ipoib_neigh { + struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif + u8 daddr[INFINIBAND_ALEN]; + struct sk_buff_head queue; + + struct net_device *dev; + + struct list_head all_neigh_list; + struct list_head list; + struct ipoib_neigh __rcu *hnext; + struct rcu_head rcu; + atomic_t refcnt; + unsigned long alive; +}; + +#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) +#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +static inline int ipoib_ud_need_sg(unsigned int ib_mtu) +{ + return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) +{ + if (atomic_dec_and_test(&neigh->refcnt)) + ipoib_neigh_dtor(neigh); +} +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); + +extern struct workqueue_struct *ipoib_workqueue; +extern struct workqueue_struct *ipoib_auto_moder_workqueue; + +extern int ipoib_mc_sendonly_timeout; + +/* functions */ + +void ipoib_get_tcp_ring(struct net_device *dev, u8 *poll_ring, u32 saddr, u32 daddr, u16 sport, u16 dport); +void ipoib_get_udp_rings(struct net_device *dev, u8 *poll_rings, u8 *num_rings); +void ipoib_accl_poll(struct net_device *dev, int ring_num); + +int ipoib_poll(struct net_device *dev, int *budget); +void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct ib_ah_attr *attr); +void ipoib_free_ah(struct kref *kref); +static inline void ipoib_put_ah(struct ipoib_ah *ah) +{ + kref_put(&ah->ref, ipoib_free_ah); +} +int ipoib_open(struct net_device *dev); +int ipoib_add_pkey_attr(struct net_device *dev); +int ipoib_add_umcast_attr(struct net_device *dev); + +void ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ipoib_ah *address, u32 qpn); +void ipoib_reap_ah(struct work_struct *work); + +void ipoib_mark_paths_invalid(struct net_device *dev); +void ipoib_flush_paths(struct net_device *dev); +struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); + +int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +void ipoib_ib_dev_flush_light(struct work_struct *work); +void ipoib_ib_dev_flush_normal(struct work_struct *work); +void ipoib_ib_dev_flush_heavy(struct work_struct *work); +void ipoib_pkey_event(struct work_struct *work); +void ipoib_ib_dev_cleanup(struct net_device *dev); + +int ipoib_ib_dev_open(struct net_device *dev); +int ipoib_ib_dev_up(struct net_device *dev); +int ipoib_ib_dev_down(struct net_device *dev, int flush); +int ipoib_ib_dev_stop(struct net_device *dev, int flush); + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); +void ipoib_dev_cleanup(struct net_device *dev); + +void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_leave_task(struct work_struct *work); +void ipoib_mcast_carrier_on_task(struct work_struct *work); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); + +void ipoib_mcast_restart_task(struct work_struct *work); +int ipoib_mcast_start_thread(struct net_device *dev); +int 
ipoib_mcast_stop_thread(struct net_device *dev, int flush); + +void ipoib_mcast_dev_down(struct net_device *dev); +void ipoib_mcast_dev_flush(struct net_device *dev); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter); +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *gid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only); + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev); +int ipoib_path_iter_next(struct ipoib_path_iter *iter); +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path); +#endif + +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, + union ib_gid *mgid, int set_qkey); + +int ipoib_init_qp(struct net_device *dev); +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); +void ipoib_transport_dev_cleanup(struct net_device *dev); + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record); + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey, + unsigned char clone_index); +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey, + unsigned char clone_index); + +void ipoib_pkey_poll(struct work_struct *work); +int ipoib_pkey_dev_delay_open(struct net_device *dev); +void ipoib_drain_cq(struct net_device *dev); + +void ipoib_set_ethtool_ops(struct net_device *dev); +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); + +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +extern int ipoib_max_conn_qp; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(hwaddr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return !!priv->cm.srq; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return priv->cm.max_cm_mtu; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct 
net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +struct ipoib_cm_tx; + +#define ipoib_max_conn_qp 0 + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline int ipoib_cm_has_srq(struct net_device *dev) +{ + return 0; +} + +static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -ENOSYS; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} +#endif + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +void ipoib_create_debug_files(struct net_device *dev); +void ipoib_delete_debug_files(struct net_device *dev); +int ipoib_register_debugfs(void); +void ipoib_unregister_debugfs(void); +#else +static inline void ipoib_create_debug_files(struct net_device *dev) { } +static inline void ipoib_delete_debug_files(struct net_device *dev) { } +static inline int ipoib_register_debugfs(void) { return 0; } +static inline void ipoib_unregister_debugfs(void) { } +#endif + +#define ipoib_printk(level, priv, format, arg...) \ + printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg) +#define ipoib_warn(priv, format, arg...) \ + ipoib_printk(KERN_WARNING, priv, format , ## arg) + +extern int ipoib_sendq_size; +extern int ipoib_recvq_size; + +extern struct ib_sa_client ipoib_sa_client; + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +extern int ipoib_debug_level; + +#define ipoib_dbg(priv, format, arg...) \ + do { \ + if (ipoib_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { \ + if (mcast_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */ +#define ipoib_dbg(priv, format, arg...) \ + do { (void) (priv); } while (0) +#define ipoib_dbg_mcast(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +#define ipoib_dbg_data(priv, format, arg...) 
\ + do { \ + if (data_debug_level > 0) \ + ipoib_printk(KERN_DEBUG, priv, format , ## arg); \ + } while (0) +#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ +#define ipoib_dbg_data(priv, format, arg...) \ + do { (void) (priv); } while (0) +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ + + +#define IPOIB_GID_FMT "%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x:" \ + "%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x" + +#define IPOIB_GID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7], \ + ((u8 *)(gid))[8], \ + ((u8 *)(gid))[9], \ + ((u8 *)(gid))[10],\ + ((u8 *)(gid))[11],\ + ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] + +#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw) + +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) + +#endif /* _IPOIB_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_cm.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_cm.c new file mode 100644 index 0000000..8d1f9d7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_cm.c @@ -0,0 +1,1675 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ipoib.h" + +int ipoib_max_conn_qp = 128; + +module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); +MODULE_PARM_DESC(max_nonsrq_conn_qp, + "Max number of connected-mode QPs per interface " + "(applied only if shared receive queue is not available)"); + +#define to_net_dev(class) container_of(class, struct net_device, class_dev) + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +static struct ib_qp_attr ipoib_cm_err_attr = { + .qp_state = IB_QPS_ERR +}; + +#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff + +static struct ib_send_wr ipoib_cm_rx_drain_wr = { + .opcode = IB_WR_SEND, +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < frags; ++i) + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < priv->cm.num_frags; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, + priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + sge[i].addr = rx->rx_ring[id].mapping[i]; + + ret = ib_post_recv(rx->qp, wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx->rx_ring[id].mapping); + dev_kfree_skb_any(rx->rx_ring[id].skb); + rx->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring, + int id, int frags, + u64 mapping[IPOIB_CM_RX_SG]) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); + if (unlikely(!skb)) + return NULL; + + /* + * IPoIB adds a 4 byte header. So we need 12 more bytes to align the + * IP header to a multiple of 16. 
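 *
 * (Illustrative arithmetic, not part of the patch: the received CM payload
 * starts with the 4-byte IPoIB encapsulation header, so reserving 12 bytes
 * gives 12 + IPOIB_ENCAP_LEN == 16 and the IP header that follows the
 * encapsulation header lands on a 16-byte boundary.)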
+ */ + skb_reserve(skb, 12); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return NULL; + } + + for (i = 0; i < frags; i++) { + struct page *page = alloc_page(GFP_ATOMIC); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + rx_ring[id].skb = skb; + return skb; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i > 0; --i) + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + + dev_kfree_skb_any(skb); + return NULL; +} + +static void ipoib_cm_free_rx_ring(struct net_device *dev, + struct ipoib_cm_rx_buf *rx_ring) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (rx_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, + rx_ring[i].mapping); + dev_kfree_skb_any(rx_ring[i].skb); + } + + vfree(rx_ring); +} + +static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) +{ + struct ib_send_wr *bad_wr; + struct ipoib_cm_rx *p; + + /* We only reserved 1 extra slot in CQ for drain WRs, so + * make sure we have at most 1 outstanding WR. */ + if (list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) + return; + + /* + * QPs on flush list are error state. This way, a "flush + * error" WC will be immediately generated for each WR we post. + */ + p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID; + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + ipoib_warn(priv, "failed to post drain wr\n"); + + list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); +} + +static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) +{ + struct ipoib_cm_rx *p = ctx; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + unsigned long flags; + + if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) + return; + + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_flush_list); + p->state = IPOIB_CM_RX_FLUSH; + ipoib_cm_start_rx_drain(priv); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .event_handler = ipoib_cm_rx_event_handler, + .send_cq = priv->recv_cq, /* For drain WR */ + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* For drain WR */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + + if (!ipoib_cm_has_srq(dev)) { + attr.cap.max_recv_wr = ipoib_recvq_size; + attr.cap.max_recv_sge = IPOIB_CM_RX_SG; + } + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + 
return ret; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + /* + * Current Mellanox HCA firmware won't generate completions + * with error for drain WRs unless the QP has been moved to + * RTS first. This work-around leaves a window where a QP has + * moved to error asynchronously, but this will eventually get + * fixed in firmware, so let's not error out if modify QP + * fails. + */ + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return 0; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return 0; + } + + return 0; +} + +static void ipoib_cm_init_rx_wr(struct net_device *dev, + struct ib_recv_wr *wr, + struct ib_sge *sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->mr->lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = sge; + wr->num_sge = priv->cm.num_frags; +} + +static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, + struct ipoib_cm_rx *rx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct { + struct ib_recv_wr wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; + int ret; + int i; + + rx->rx_ring = vmalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + if (!rx->rx_ring) { + printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + return -ENOMEM; + } + + memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring); + + t = kmalloc(sizeof *t, GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + + spin_lock_irq(&priv->lock); + + if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { + spin_unlock_irq(&priv->lock); + ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); + ret = -EINVAL; + goto err_free; + } else + ++priv->cm.nonsrq_conn_qp; + + spin_unlock_irq(&priv->lock); + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, + rx->rx_ring[i].mapping)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ret = -ENOMEM; + goto err_count; + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); + if (ret) { + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " + "failed for buf %d\n", i); + ret = -EIO; + goto err_count; + } + } + + rx->recv_count = ipoib_recvq_size; + + kfree(t); + + return 0; + +err_count: + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + +err_free: + kfree(t); + ipoib_cm_free_rx_ring(dev, rx->rx_ring); + + return ret; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, struct ib_cm_req_event_param *req, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + 
struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + rep.private_data_len = sizeof data; + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.srq = ipoib_cm_has_srq(dev); + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + cm_id->context = p; + p->state = IPOIB_CM_RX_LIVE; + p->jiffies = jiffies; + INIT_LIST_HEAD(&p->list); + + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = random32() & 0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + if (!ipoib_cm_has_srq(dev)) { + ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p); + if (ret) + goto err_modify; + } + + spin_lock_irq(&priv->lock); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + /* Add this entry to passive ids list head, but do not re-add it + * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ + p->jiffies = jiffies; + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irq(&priv->lock); + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + } + return 0; + +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + p = cm_id->context; + ib_send_cm_drep(cm_id, NULL, 0); + /* Fall through */ + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = netdev_priv(p->dev); + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "unable to move qp to error state\n"); + /* Fall through */ + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length, struct sk_buff *toskb) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE); + --skb_shinfo(skb)->nr_frags; + } else { + size = min(length, (unsigned) PAGE_SIZE); + + frag->size = size; + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx_buf *rx_ring; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM 
| IPOIB_OP_RECV); + struct sk_buff *skb, *newskb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + int frags; + int has_srq; + struct sk_buff *small_skb; + + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { + spin_lock_irqsave(&priv->lock, flags); + list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); + ipoib_cm_start_rx_drain(priv); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + spin_unlock_irqrestore(&priv->lock, flags); + } else + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + p = wc->qp->qp_context; + + has_srq = ipoib_cm_has_srq(dev); + rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; + + skb = rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, "cm recv error " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ++priv->stats.rx_dropped; + if (has_srq) + goto repost; + else { + if (!--p->recv_count) { + spin_lock_irqsave(&priv->lock, flags); + list_move(&p->list, &priv->cm.rx_reap_list); + spin_unlock_irqrestore(&priv->lock, flags); + queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + } + return; + } + } + + if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { + if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do not re-add it + * if it has been moved out of list. */ + if (p->state == IPOIB_CM_RX_LIVE) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + } + } + + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + + small_skb = dev_alloc_skb(dlen + 12); + if (small_skb) { + skb_reserve(small_skb, 12); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); + ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_put(small_skb, dlen); + skb = small_skb; + goto copied; + } + } + + frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, + (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + if (unlikely(!newskb)) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. 
+ */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++priv->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); + +copied: + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + dev->last_rx = jiffies; + ++priv->stats.rx_packets; + priv->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + +repost: + if (has_srq) { + if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " + "for buf %d\n", wr_id); + } else { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { + --p->recv_count; + ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " + "for buf %d\n", wr_id); + } + } +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + u64 addr, int len) +{ + struct ib_send_wr *bad_wr; + + priv->tx_sge[0].addr = addr; + priv->tx_sge[0].length = len; + + priv->tx_wr.num_sge = 1; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; + + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx_buf *tx_req; + u64 addr; + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "%s: packet len %d (> %d) too long to send, dropping\n", + __func__, skb->len, tx->mtu); + ++priv->stats.tx_dropped; + ++priv->stats.tx_errors; + + ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + + return; + } + + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). 
+ */ + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { + ++priv->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + tx_req->mapping = addr; + + if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len))) { + ipoib_warn(priv, "post_send failed\n"); + ++priv->stats.tx_errors; + ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); + dev_kfree_skb_any(skb); + } else { + dev->trans_start = jiffies; + ++tx->tx_head; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on send CQ failed\n"); + netif_stop_queue(dev); + } + } +} + +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; + struct ipoib_cm_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++priv->stats.tx_packets; + priv->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + netif_tx_lock(dev); + + ++tx->tx_tail; + if (unlikely(--priv->tx_outstanding <= ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + /*IB_WC_RNR_RETRY_EXC_ERR error is part of the life cycle, so don't make waves.*/ + if (IB_WC_RNR_RETRY_EXC_ERR != wc->status) + ipoib_warn(priv, "%s: failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + __func__, wc->status, wr_id, wc->vendor_err); + else + ipoib_dbg(priv, "%s: failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + __func__, wc->status, wr_id, wc->vendor_err); + + spin_lock_irqsave(&priv->lock, flags); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock_irqrestore(&priv->lock, flags); + } + + netif_tx_unlock(dev); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + ret = PTR_ERR(priv->cm.id); + goto err_cm; + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0, NULL); + if (ret) { + printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | 
priv->qp->qp_num); + goto err_listen; + } + + return 0; + +err_listen: + ib_destroy_cm_id(priv->cm.id); +err_cm: + priv->cm.id = NULL; + return ret; +} + +static void ipoib_cm_free_rx_reap_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *rx, *n; + LIST_HEAD(list); + + spin_lock_irq(&priv->lock); + list_splice_init(&priv->cm.rx_reap_list, &list); + spin_unlock_irq(&priv->lock); + + list_for_each_entry_safe(rx, n, &list, list) { + ib_destroy_cm_id(rx->id); + ib_destroy_qp(rx->qp); + if (!ipoib_cm_has_srq(dev)) { + ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring); + spin_lock_irq(&priv->lock); + --priv->cm.nonsrq_conn_qp; + spin_unlock_irq(&priv->lock); + } + kfree(rx); + } +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long begin; + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id) + return; + + ib_destroy_cm_id(priv->cm.id); + priv->cm.id = NULL; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + /* Wait for all RX to be drained */ + begin = jiffies; + + while (!list_empty(&priv->cm.rx_error_list) || + !list_empty(&priv->cm.rx_flush_list) || + !list_empty(&priv->cm.rx_drain_list)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "RX drain timing out\n"); + + /* + * assume the HW is wedged and just free up everything. 
+ */ + list_splice_init(&priv->cm.rx_flush_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_error_list, + &priv->cm.rx_reap_list); + list_splice_init(&priv->cm.rx_drain_list, + &priv->cm.rx_reap_list); + break; + } + spin_unlock_irq(&priv->lock); + msleep(1); + ipoib_drain_cq(dev); + spin_lock_irq(&priv->lock); + } + + spin_unlock_irq(&priv->lock); + + ipoib_cm_free_rx_reap_list(dev); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu <= IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", + p->mtu, IPOIB_ENCAP_LEN); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + spin_lock_irq(&priv->lock); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + __skb_queue_tail(&skqueue, skb); + spin_unlock_irq(&priv->lock); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed (ret = %d) " + "to requeue packet\n",__func__, ret); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->recv_cq, + .recv_cq = priv->recv_cq, + .srq = priv->cm.srq, + .cap.max_send_wr = ipoib_sendq_size, + .cap.max_send_sge = 1, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = tx + }; + + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); + req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof data; + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. 
+ */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = ipoib_cm_has_srq(dev); + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); + if (ret) { + ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + int ret; + + p->tx_ring = vmalloc(ipoib_sendq_size * sizeof *p->tx_ring); + if (!p->tx_ring) { + ipoib_warn(priv, "failed to allocate tx ring\n"); + ret = -ENOMEM; + goto err_tx; + } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); + + p->qp = ipoib_cm_create_tx_qp(p->dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_send_cm; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n", + p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn); + + return 0; + +err_send_cm: +err_modify: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_qp: + p->qp = NULL; + vfree(p->tx_ring); +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + unsigned long begin; + int num_tries = 0; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + if (p->id) + ib_destroy_cm_id(p->id); + + /*move the qp to ERROR state*/ + if (p->qp) { + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "%s: Failed to modify QP to ERROR state\n", + __func__); + } + + if (p->tx_ring) { + /* + * Wait for all sends to complete, + * All of them should return here after ERROR state in the qp. 
+ */
+ begin = jiffies;
+ while (p->tx_tail != p->tx_head) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "timing out: %d sends not completed, still waiting...\n",
+ p->tx_head - p->tx_tail);
+ /*
+ * If we are in the napi_disable state (e.g. port/module down),
+ * force a drain over the QP in order to reap all completions;
+ * otherwise re-arm the CQ with ib_req_notify_cq so poll_tx runs next time.
+ */
+ if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
+ ipoib_warn(priv, "%s: start drain CQ\n", __func__);
+ ipoib_drain_cq(p->dev);
+
+ ipoib_warn(priv, "%s: re-arm CQ\n", __func__);
+ if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+ ipoib_warn(priv, "request notify on receive CQ failed\n");
+ }
+ begin = jiffies;
+ num_tries++;
+ if (num_tries == 5) {
+ ipoib_warn(priv, "%s: %d sends not completed; giving up.\n",
+ __func__, p->tx_head - p->tx_tail);
+ goto out;
+ }
+ }
+ /* let the WCs arrive. */
+ msleep(2);
+ }
+ }
+out:
+ /* assume all the WCs have arrived. */
+ if (p->qp)
+ ib_destroy_qp(p->qp);
+
+ vfree(p->tx_ring);
+ kfree(p);
+}
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event)
+{
+ struct ipoib_cm_tx *tx = cm_id->context;
+ struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+ struct net_device *dev = priv->dev;
+ struct ipoib_neigh *neigh;
+ unsigned long flags;
+ int ret;
+
+ switch (event->event) {
+ case IB_CM_DREQ_RECEIVED:
+ ipoib_dbg(priv, "DREQ received.\n");
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ipoib_dbg(priv, "REP received.\n");
+ ret = ipoib_cm_rep_handler(cm_id, event);
+ if (ret)
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REJ_RECEIVED:
+ case IB_CM_TIMEWAIT_EXIT:
+ ipoib_dbg(priv, "CM error %d.\n", event->event);
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ neigh = tx->neigh;
+
+ if (neigh) {
+ neigh->cm = NULL;
+ list_del_init(&neigh->list);
+ ipoib_neigh_free(neigh);
+
+ tx->neigh = NULL;
+ }
+
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+ struct ipoib_neigh *neigh)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_tx *tx;
+
+ tx = kzalloc(sizeof *tx, GFP_ATOMIC);
+ if (!tx)
+ return NULL;
+
+ neigh->cm = tx;
+ tx->neigh = neigh;
+ tx->path = path;
+ tx->dev = dev;
+ list_add(&tx->list, &priv->cm.start_list);
+ set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
+ queue_work(ipoib_workqueue, &priv->cm.start_task);
+ return tx;
+}
+
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+ unsigned long flags;
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ spin_lock_irqsave(&priv->lock, flags);
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ tx->neigh = NULL;
+ spin_unlock_irqrestore(&priv->lock, flags);
+ }
+}
+
+static void ipoib_cm_tx_start(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.start_task);
+ struct net_device *dev = priv->dev;
+ struct ipoib_neigh *neigh;
+ struct ipoib_cm_tx *p;
+ unsigned long flags;
+ int ret;
+
+ struct ib_sa_path_rec pathrec;
+ u32 qpn;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while (!list_empty(&priv->cm.start_list)) {
+ p = list_entry(priv->cm.start_list.next, typeof(*p), list);
+ list_del_init(&p->list);
+ neigh = p->neigh;
+ qpn = IPOIB_QPN(neigh->daddr);
+ memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+
+ ret = ipoib_cm_tx_init(p, qpn, &pathrec);
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (ret) {
+ neigh = p->neigh;
+ if (neigh) {
+ neigh->cm = NULL;
+ list_del_init(&neigh->list);
+ ipoib_neigh_free(neigh);
+ }
+ list_del_init(&p->list);
+ kfree(p);
+ }
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+static void ipoib_cm_tx_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.reap_task);
+ struct net_device *dev = priv->dev;
+ struct ipoib_cm_tx *p;
+ unsigned long flags;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while (!list_empty(&priv->cm.reap_list)) {
+ p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
+ list_del_init(&p->list);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+ ipoib_cm_tx_destroy(p);
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+static void ipoib_cm_skb_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.skb_task);
+ struct net_device *dev = priv->dev;
+ struct sk_buff *skb;
+ unsigned long flags;
+/* unsigned mtu = priv->mcast_mtu; */
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+/*
+ if (skb->protocol == htons(ETH_P_IP))
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, priv->dev);
+#endif
+*/
+ dev_kfree_skb_any(skb);
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+static void ipoib_cm_update_pmtu_task(struct work_struct *work)
+{
+ struct ipoib_pmtu_update *pmtu_update =
+ container_of(work, struct ipoib_pmtu_update, work);
+ struct sk_buff *skb = pmtu_update->skb;
+
+ skb->dst->ops->update_pmtu(skb->dst, pmtu_update->mtu);
+
+ atomic_dec(&skb->users);
+
+ kfree(pmtu_update);
+}
+
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+ unsigned int mtu)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int e = skb_queue_empty(&priv->cm.skb_queue);
+ struct ipoib_pmtu_update *pmtu_update;
+
+ if (skb->dst) {
+ /* take the pmtu_update out of spin-lock context */
+ pmtu_update = kzalloc(sizeof *pmtu_update, GFP_ATOMIC);
+ if (pmtu_update) {
+ pmtu_update->skb = skb;
+ pmtu_update->mtu = mtu;
+ /* hold a reference in order to keep the skb available */
+ skb_get(skb);
+
+ INIT_WORK(&pmtu_update->work, ipoib_cm_update_pmtu_task);
+ /*
+ * To keep this serialized, push the task onto
+ * the same workqueue that the priv->cm.skb_task
+ * work is queued on.
+ */ + queue_work(ipoib_workqueue, &pmtu_update->work); + } else + ipoib_warn(priv, "Failed alloc pmtu_update and update_pmtu(skb->dst, mtu)\n"); + } + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(ipoib_workqueue, &priv->cm.skb_task); + +} + +static void ipoib_cm_rx_reap(struct work_struct *work) +{ + ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, + cm.rx_reap_task)->dev); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + int ret; + + spin_lock_irq(&priv->lock); + while (!list_empty(&priv->cm.passive_ids)) { + /* List is sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_move(&p->list, &priv->cm.rx_error_list); + p->state = IPOIB_CM_RX_ERROR; + spin_unlock_irq(&priv->lock); + ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); + if (ret) + ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); + spin_lock_irq(&priv->lock); + } + + if (!list_empty(&priv->cm.passive_ids)) + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + spin_unlock_irq(&priv->lock); +} + +static ssize_t show_mode(struct class_device *d, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sprintf(buf, "connected\n"); + else + return sprintf(buf, "datagram\n"); +} + +static ssize_t set_mode(struct class_device *d, const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + + rtnl_lock(); + dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + if (ipoib_cm_max_mtu(dev) > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); + rtnl_unlock(); + + ipoib_flush_paths(dev); + return count; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + + rtnl_lock(); + if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) { + dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; + if (priv->hca_caps & IB_DEVICE_UD_TSO) + dev->features |= NETIF_F_TSO; + } + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + rtnl_unlock(); + ipoib_flush_paths(dev); + + return count; + } + + return -EINVAL; +} + +static CLASS_DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return class_device_create_file(&dev->class_dev, &class_device_attr_mode); +} + +static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .attr = { + .max_wr = ipoib_recvq_size, + .max_sge = max_sge + } + }; + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + if (PTR_ERR(priv->cm.srq) != -ENOSYS) + printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", + priv->ca->name, 
PTR_ERR(priv->cm.srq)); + priv->cm.srq = NULL; + return; + } + + priv->cm.srq_ring = vmalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + if (!priv->cm.srq_ring) { + printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + ib_destroy_srq(priv->cm.srq); + priv->cm.srq = NULL; + return; + } + + memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring); +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, ret; + struct ib_device_attr attr; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_LIST_HEAD(&priv->cm.rx_error_list); + INIT_LIST_HEAD(&priv->cm.rx_flush_list); + INIT_LIST_HEAD(&priv->cm.rx_drain_list); + INIT_LIST_HEAD(&priv->cm.rx_reap_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + ret = ib_query_device(priv->ca, &attr); + if (ret) { + printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); + return ret; + } + + ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); + + attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); + ipoib_cm_create_srq(dev, attr.max_srq_sge); + if (ipoib_cm_has_srq(dev)) { + + priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = attr.max_srq_sge; + ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", + priv->cm.max_cm_mtu, priv->cm.num_frags); + } else { + priv->cm.max_cm_mtu = IPOIB_CM_MTU; + priv->cm.num_frags = IPOIB_CM_RX_SG; + } + + ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge); + + if (ipoib_cm_has_srq(dev)) { + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, + priv->cm.num_frags - 1, + priv->cm.srq_ring[i].mapping)) { + ipoib_warn(priv, "failed to allocate " + "receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + if (ipoib_cm_post_receive_srq(dev, i)) { + ipoib_warn(priv, "ipoib_cm_post_receive_srq " + "failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + } + + priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ret = ib_destroy_srq(priv->cm.srq); + if (ret) + ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); + + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + + ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ethtool.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ethtool.c new file mode 100644 index 0000000..0a428f4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ethtool.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipoib.h" + +static void ipoib_get_drvinfo(struct net_device *netdev, + struct ethtool_drvinfo *drvinfo) +{ + strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); +} + +static u32 ipoib_get_rx_csum(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return test_bit(IPOIB_FLAG_CSUM, &priv->flags) && + !test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static int ipoib_set_tso(struct net_device *dev, u32 data) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (data) { + if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + (dev->features & NETIF_F_SG) && + (priv->hca_caps & IB_DEVICE_UD_TSO)) { + dev->features |= NETIF_F_TSO; + } else { + ipoib_warn(priv, "can't set TSO on\n"); + return -EOPNOTSUPP; + } + } else + dev->features &= ~NETIF_F_TSO; + + return 0; +} + +static int ipoib_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + coal->rx_coalesce_usecs = priv->ethtool.rx_coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.rx_max_coalesced_frames; + coal->pkt_rate_low = priv->ethtool.pkt_rate_low; + coal->rx_coalesce_usecs_low = priv->ethtool.rx_coalesce_usecs_low; + coal->rx_coalesce_usecs_high = priv->ethtool.rx_coalesce_usecs_high; + coal->pkt_rate_high = priv->ethtool.pkt_rate_high; + coal->rate_sample_interval = priv->ethtool.rate_sample_interval; + coal->use_adaptive_rx_coalesce = priv->ethtool.use_adaptive_rx_coalesce; +/* coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; +*/ + return 0; +} + +enum ipoib_auto_moder_operation { + NONE, + MOVING_TO_ON, + MOVING_TO_OFF +}; + +static int ipoib_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + enum ipoib_auto_moder_operation moder_operation = NONE; + struct ib_cq_attr attr; + + /* + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called + */ + if (coal->rx_coalesce_usecs > 0xffff || + coal->rx_max_coalesced_frames > 0xffff) + return -EINVAL; + priv->ethtool.rx_max_coalesced_frames = + (coal->rx_max_coalesced_frames == + 
IPOIB_AUTO_CONF) ?
+ IPOIB_RX_COAL_TARGET :
+ coal->rx_max_coalesced_frames;
+ priv->ethtool.rx_coalesce_usecs = (coal->rx_coalesce_usecs ==
+ IPOIB_AUTO_CONF) ?
+ IPOIB_RX_COAL_TIME :
+ coal->rx_coalesce_usecs;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.moderation.cq_count = coal->rx_max_coalesced_frames;
+ attr.moderation.cq_period = coal->rx_coalesce_usecs;
+
+ ret = ib_modify_cq(priv->recv_cq, &attr,
+ IB_CQ_MODERATE);
+ if (ret && ret != -ENOSYS) {
+ ipoib_warn(priv, "failed modifying CQ (%d)\n", ret);
+ return ret;
+ }
+
+ priv->ethtool.pkt_rate_low = coal->pkt_rate_low;
+ priv->ethtool.rx_coalesce_usecs_low = coal->rx_coalesce_usecs_low;
+ priv->ethtool.rx_coalesce_usecs_high = coal->rx_coalesce_usecs_high;
+ priv->ethtool.pkt_rate_high = coal->pkt_rate_high;
+ priv->ethtool.rate_sample_interval = coal->rate_sample_interval;
+
+ if (priv->ethtool.use_adaptive_rx_coalesce &&
+ !coal->use_adaptive_rx_coalesce) {
+ /* switch from adaptive mode to non-adaptive mode:
+ cancel the adaptive moderation task. */
+ clear_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags);
+ cancel_delayed_work(&priv->adaptive_moder_task);
+ moder_operation = MOVING_TO_OFF;
+ } else if ((!priv->ethtool.use_adaptive_rx_coalesce &&
+ coal->use_adaptive_rx_coalesce)) {
+ /* switch from non-adaptive mode to adaptive mode;
+ start it now */
+ set_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags);
+ moder_operation = MOVING_TO_ON;
+ priv->ethtool.use_adaptive_rx_coalesce = 1;
+ queue_delayed_work(ipoib_auto_moder_workqueue,
+ &priv->adaptive_moder_task, 0);
+ }
+
+ if (MOVING_TO_OFF == moder_operation)
+ flush_workqueue(ipoib_auto_moder_workqueue);
+ else if (MOVING_TO_ON == moder_operation) {
+ /* move to initial values */
+ ret = ib_modify_cq(priv->recv_cq,
+ &attr,
+ IB_CQ_MODERATE);
+ if (ret && ret != -ENOSYS) {
+ ipoib_warn(priv, "failed modifying CQ (%d) "
+ "(when moving to auto-moderation)\n",
+ ret);
+ return ret;
+ }
+ }
+ priv->ethtool.use_adaptive_rx_coalesce = coal->use_adaptive_rx_coalesce;
+
+ return 0;
+}
+
+static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = {
+ "LRO aggregated", "LRO flushed",
+ "LRO avg aggr", "LRO no desc"
+};
+
+static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+ switch (stringset) {
+ case ETH_SS_STATS:
+ memcpy(data, *ipoib_stats_keys, sizeof(ipoib_stats_keys));
+ break;
+ }
+}
+
+static int ipoib_get_stats_count(struct net_device *dev)
+{
+ return ARRAY_SIZE(ipoib_stats_keys);
+}
+
+static void ipoib_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, uint64_t *data)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int index = 0;
+
+ /* Get LRO statistics */
+ data[index++] = priv->lro.lro_mgr.stats.aggregated;
+ data[index++] = priv->lro.lro_mgr.stats.flushed;
+ if (priv->lro.lro_mgr.stats.flushed)
+ data[index++] = priv->lro.lro_mgr.stats.aggregated /
+ priv->lro.lro_mgr.stats.flushed;
+ else
+ data[index++] = 0;
+ data[index++] = priv->lro.lro_mgr.stats.no_desc;
+}
+
+static void ipoib_get_ringparam(struct net_device *dev,
+ struct ethtool_ringparam *param)
+{
+
+ memset(param, 0, sizeof(*param));
+ param->rx_max_pending = IPOIB_MAX_QUEUE_SIZE;
+ param->tx_max_pending = IPOIB_MAX_QUEUE_SIZE;
+ param->rx_pending = ipoib_recvq_size;
+ param->tx_pending = ipoib_sendq_size;
+}
+
+static const struct ethtool_ops ipoib_ethtool_ops = {
+ .get_drvinfo = ipoib_get_drvinfo,
+ .get_rx_csum = ipoib_get_rx_csum,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = ipoib_set_tso,
+ .get_coalesce = ipoib_get_coalesce,
+ .set_coalesce =
ipoib_set_coalesce, + .get_strings = ipoib_get_strings, + .get_stats_count = ipoib_get_stats_count, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_ringparam = ipoib_get_ringparam, +}; + +void ipoib_set_ethtool_ops(struct net_device *dev) +{ + SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_fs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_fs.c new file mode 100644 index 0000000..ebf1dbb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_fs.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +struct file_operations; + +#include + +#include "ipoib.h" + +static struct dentry *ipoib_root; + +static void format_gid(union ib_gid *gid, char *buf) +{ + int i, n; + + for (n = 0, i = 0; i < 8; ++i) { + n += sprintf(buf + n, "%x", + be16_to_cpu(((__be16 *) gid->raw)[i])); + if (i < 7) + buf[n++] = ':'; + } +} + +static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_mcast_iter *iter; + loff_t n = *pos; + + iter = ipoib_mcast_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_mcast_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen, complete, send_only; + + if (!iter) + return 0; + + ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen, + &complete, &send_only); + + format_gid(&mgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " created: %10ld\n" + " queuelen: %9d\n" + " complete: %9s\n" + " send_only: %8s\n" + "\n", + gid_buf, created, queuelen, + complete ? "yes" : "no", + send_only ? "yes" : "no"); + + return 0; +} + +static const struct seq_operations ipoib_mcg_seq_ops = { + .start = ipoib_mcg_seq_start, + .next = ipoib_mcg_seq_next, + .stop = ipoib_mcg_seq_stop, + .show = ipoib_mcg_seq_show, +}; + +static int ipoib_mcg_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, (struct seq_operations *) &ipoib_mcg_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_mcg_fops = { + .owner = THIS_MODULE, + .open = ipoib_mcg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos) +{ + struct ipoib_path_iter *iter; + loff_t n = *pos; + + iter = ipoib_path_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct ipoib_path_iter *iter = iter_ptr; + + (*pos)++; + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct ipoib_path_iter *iter = iter_ptr; + char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"]; + struct ipoib_path path; + int rate; + + if (!iter) + return 0; + + ipoib_path_iter_read(iter, &path); + + format_gid(&path.pathrec.dgid, gid_buf); + + seq_printf(file, + "GID: %s\n" + " complete: %6s\n", + gid_buf, path.pathrec.dlid ? 
"yes" : "no"); + + if (path.pathrec.dlid) { + rate = ib_rate_to_mbps(path.pathrec.rate); + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %8d.%d Gb/sec\n", + be16_to_cpu(path.pathrec.dlid), + path.pathrec.sl, + rate / 1000, rate % 1000); + } + + seq_putc(file, '\n'); + + return 0; +} + +static const struct seq_operations ipoib_path_seq_ops = { + .start = ipoib_path_seq_start, + .next = ipoib_path_seq_next, + .stop = ipoib_path_seq_stop, + .show = ipoib_path_seq_show, +}; + +static int ipoib_path_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, (struct seq_operations *) &ipoib_path_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations ipoib_path_fops = { + .owner = THIS_MODULE, + .open = ipoib_path_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +void ipoib_create_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + char name[IFNAMSIZ + sizeof "_path"]; + + snprintf(name, sizeof name, "%s_mcg", dev->name); + priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_mcg_fops); + if (!priv->mcg_dentry) + ipoib_warn(priv, "failed to create mcg debug file\n"); + + snprintf(name, sizeof name, "%s_path", dev->name); + priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, + ipoib_root, dev, &ipoib_path_fops); + if (!priv->path_dentry) + ipoib_warn(priv, "failed to create path debug file\n"); +} + +void ipoib_delete_debug_files(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (priv->mcg_dentry) + debugfs_remove(priv->mcg_dentry); + if (priv->path_dentry) + debugfs_remove(priv->path_dentry); +} + +int ipoib_register_debugfs(void) +{ + ipoib_root = debugfs_create_dir("ipoib", NULL); + return ipoib_root ? 0 : -ENOMEM; +} + +void ipoib_unregister_debugfs(void) +{ + debugfs_remove(ipoib_root); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ib.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ib.c new file mode 100644 index 0000000..dc0a459 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_ib.c @@ -0,0 +1,1163 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param(data_debug_level, int, 0644); +MODULE_PARM_DESC(data_debug_level, + "Enable data path debug tracing if > 0"); +#endif + +static DEFINE_MUTEX(pkey_mutex); + +struct ipoib_ah *ipoib_create_ah(struct net_device *dev, + struct ib_pd *pd, struct ib_ah_attr *attr) +{ + struct ipoib_ah *ah; + + ah = kmalloc(sizeof *ah, GFP_KERNEL); + if (!ah) + return NULL; + + ah->dev = dev; + ah->last_send = 0; + kref_init(&ah->ref); + + ah->ah = ib_create_ah(pd, attr); + if (IS_ERR(ah->ah)) { + kfree(ah); + ah = NULL; + } else + ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah); + + return ah; +} + +void ipoib_free_ah(struct kref *kref) +{ + struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref); + struct ipoib_dev_priv *priv = netdev_priv(ah->dev); + + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + list_add_tail(&ah->list, &priv->dead_ahs); + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_UD_RX_SG]) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, + DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, + DMA_FROM_DEVICE); + } else + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); +} + +static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, + struct sk_buff *skb, + unsigned int length) +{ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; + unsigned int size; + /* + * There is only two buffers needed for max_payload = 4K, + * first buf size is IPOIB_UD_HEAD_SIZE + */ + skb->tail += IPOIB_UD_HEAD_SIZE; + skb->len += length; + + size = length - IPOIB_UD_HEAD_SIZE; + + frag->size = size; + skb->data_len += size; + skb->truesize += size; + } else + skb_put(skb, length); + +} + +static int ipoib_ib_post_receive(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int ret; + + priv->rx_wr.wr_id = id | IPOIB_OP_RECV; + priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0]; + priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; + + + ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); + dev_kfree_skb_any(priv->rx_ring[id].skb); + priv->rx_ring[id].skb = NULL; + } + + return ret; +} + +static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int buf_size; + u64 *mapping; + + if (ipoib_ud_need_sg(priv->max_ib_mtu)) + buf_size = IPOIB_UD_HEAD_SIZE; + else + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + + skb = dev_alloc_skb(buf_size + 4); + if 
(unlikely(!skb)) + return NULL; + + /* + * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte + * header. So we need 4 more bytes to get to 48 and align the + * IP header to a multiple of 16. + */ + skb_reserve(skb, 4); + + mapping = priv->rx_ring[id].mapping; + mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) + goto error; + + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + struct page *page = alloc_page(GFP_ATOMIC); + if (!page) + goto partial_error; + skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); + mapping[1] = + ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page, + 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) + goto partial_error; + } + + priv->rx_ring[id].skb = skb; + return skb; + +partial_error: + ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE); +error: + dev_kfree_skb_any(skb); + return NULL; +} + +static int ipoib_ib_post_receives(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (!ipoib_alloc_rx_skb(dev, i)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + return -ENOMEM; + } + if (ipoib_ib_post_receive(dev, i)) { + ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); + return -EIO; + } + } + + return 0; +} + +static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV; + struct sk_buff *skb; + u64 mapping[IPOIB_UD_RX_SG]; + union ib_gid *dgid; + + ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + skb = priv->rx_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed recv event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping); + dev_kfree_skb_any(skb); + priv->rx_ring[wr_id].skb = NULL; + return; + } + + /* + * Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. + */ + if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num) + goto repost; + + memcpy(mapping, priv->rx_ring[wr_id].mapping, + IPOIB_UD_RX_SG * sizeof *mapping); + + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. 
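+	 * The old skb stays in rx_ring[wr_id], the drop is counted in
+	 * rx_dropped, and the same ring entry is simply reposted below.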
+ */ + if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) { + ++priv->stats.rx_dropped; + goto repost; + } + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + ipoib_ud_dma_unmap_rx(priv, mapping); + ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + + /* First byte of dgid signals multicast when 0xff */ + dgid = &((struct ib_grh *)skb->data)->dgid; + + if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + skb_pull(skb, IB_GRH_BYTES); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb_reset_mac_header(skb); + skb_pull(skb, IPOIB_ENCAP_LEN); + + dev->last_rx = jiffies; + ++priv->stats.rx_packets; + priv->stats.rx_bytes += skb->len; + + skb->dev = dev; + if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (dev->features & NETIF_F_LRO) + lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); + else + netif_receive_skb(skb); + +repost: + if (unlikely(ipoib_ib_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_ib_post_receive failed " + "for buf %d\n", wr_id); +} + +static int ipoib_dma_map_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb), + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[0]))) + return -EIO; + + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, frag->page, + frag->page_offset, frag->size, + DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + return 0; + +partial_error: + for (; i > 0; --i) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; + ib_dma_unmap_page(ca, mapping[i - !off], frag->size, DMA_TO_DEVICE); + } + + if (off) + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + + return -EIO; +} + +static void ipoib_dma_unmap_tx(struct ib_device *ca, + struct ipoib_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + u64 *mapping = tx_req->mapping; + int i; + int off; + + if (skb_headlen(skb)) { + ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE); + off = 1; + } else + off = 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + ib_dma_unmap_page(ca, mapping[i + off], frag->size, + DMA_TO_DEVICE); + } +} + +static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; + + ipoib_dbg_data(priv, "send completion: id %d, status: %d\n", + wr_id, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &priv->tx_ring[wr_id]; + + ipoib_dma_unmap_tx(priv->ca, tx_req); + + ++priv->stats.tx_packets; + priv->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + ++priv->tx_tail; + if (unlikely(--priv->tx_outstanding <= ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + 
netif_wake_queue(dev); + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) + ipoib_warn(priv, "failed send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); +} + +static int poll_tx(struct ipoib_dev_priv *priv) +{ + int n, i; + + n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc); + for (i = 0; i < n; ++i) + ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i); + + return n == MAX_SEND_CQE; +} + +int ipoib_poll(struct net_device *dev, int *budget) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int max = min(*budget, dev->quota); + int done; + int t; + int n, i; + int ret; + + done = 0; + + spin_lock(&priv->rx_ring_lock); +poll_more: + while (max) { + + t = min(IPOIB_NUM_WC, max); + n = ib_poll_cq(priv->recv_cq, t, priv->ibwc); + + for (i = 0; i < n; i++) { + struct ib_wc *wc = priv->ibwc + i; + + if (wc->wr_id & IPOIB_OP_RECV) { + ++done; + --max; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else + ipoib_cm_handle_tx_wc(priv->dev, wc); + } + + if (n != t) + break; + } + + if (max) { + if (dev->features & NETIF_F_LRO) + lro_flush_all(&priv->lro.lro_mgr); + + netif_rx_complete(dev); + if (unlikely(ib_req_notify_cq(priv->recv_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) && + netif_rx_reschedule(dev, 0)) + goto poll_more; + ret = 0; + } else + ret = 1; + + dev->quota -= done; + *budget -= done; + + spin_unlock(&priv->rx_ring_lock); + return ret; +} + +void ipoib_get_tcp_ring(struct net_device *dev, u8 *poll_ring, u32 saddr, u32 daddr, u16 sport, u16 dport) +{ + *poll_ring = 0; +} + +void ipoib_get_udp_rings(struct net_device *dev, u8 *poll_rings, u8 *num_rings) +{ + *poll_rings = 0; + *num_rings = 1; +} + +void ipoib_accl_poll(struct net_device *dev, int ring_num) +{ + int budget = 64; + struct ipoib_dev_priv *priv = netdev_priv(dev); + int n, i, num_recv = 0; + struct ib_wc *wc; + + if (!spin_trylock_bh(&priv->rx_ring_lock)) + return; + while (num_recv < budget) { + n = ib_poll_cq(priv->recv_cq, budget, priv->ibwc); + for (i = 0; i < n; i++) { + wc = priv->ibwc + i; + + if (wc->wr_id & IPOIB_OP_RECV) { + num_recv++; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else + ipoib_cm_handle_tx_wc(priv->dev, wc); + } + if (n < budget) + break; + } + /* We always want to flush all of the accumulated skb's */ + if (dev->features & NETIF_F_LRO) + lro_flush_all(&priv->lro.lro_mgr); + + spin_unlock_bh(&priv->rx_ring_lock); +} + +void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) +{ + netif_rx_schedule(dev_ptr); +} + +static void drain_tx_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + netif_tx_lock(dev); + while (poll_tx(priv)) + ; /* nothing */ + + if (netif_queue_stopped(dev)) + mod_timer(&priv->poll_timer, jiffies + 1); + + netif_tx_unlock(dev); +} + +void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev_ptr); + + mod_timer(&priv->poll_timer, jiffies); +} + +static inline int post_send(struct ipoib_dev_priv *priv, + unsigned int wr_id, + struct ib_ah *address, u32 qpn, + struct ipoib_tx_buf *tx_req, + void *head, int hlen) +{ + struct ib_send_wr *bad_wr; + int i, off; + struct sk_buff *skb = tx_req->skb; + skb_frag_t *frags = skb_shinfo(skb)->frags; + int nr_frags = skb_shinfo(skb)->nr_frags; + u64 *mapping = tx_req->mapping; + + if (skb_headlen(skb)) { + priv->tx_sge[0].addr = mapping[0]; + 
priv->tx_sge[0].length = skb_headlen(skb); + off = 1; + } else + off = 0; + + for (i = 0; i < nr_frags; ++i) { + priv->tx_sge[i + off].addr = mapping[i + off]; + priv->tx_sge[i + off].length = frags[i].size; + } + priv->tx_wr.num_sge = nr_frags + off; + priv->tx_wr.wr_id = wr_id; + priv->tx_wr.wr.ud.remote_qpn = qpn; + priv->tx_wr.wr.ud.ah = address; + + if (head) { + priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size; + priv->tx_wr.wr.ud.header = head; + priv->tx_wr.wr.ud.hlen = hlen; + priv->tx_wr.opcode = IB_WR_LSO; + } else + priv->tx_wr.opcode = IB_WR_SEND; + + return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_send(struct net_device *dev, struct sk_buff *skb, + struct ipoib_ah *address, u32 qpn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_tx_buf *tx_req; + int hlen; + void *phead; + + if (skb_is_gso(skb)) { + hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + phead = skb->data; + if (unlikely(!skb_pull(skb, hlen))) { + ipoib_warn(priv, "linear data too small\n"); + ++priv->stats.tx_dropped; + ++priv->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + } else { + if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { + ipoib_warn(priv, "%s: packet len %d (> %d) too long to send, dropping\n", + __func__, skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); + ++priv->stats.tx_dropped; + ++priv->stats.tx_errors; + + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + + return; + } + phead = NULL; + hlen = 0; + } + + ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n", + skb->len, address, qpn); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). 
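+	 * In particular, the send completion handler unmaps and frees
+	 * tx_req->skb and advances tx_tail, so tx_ring[] must already
+	 * describe this send before post_send() is issued.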
+ */ + tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) { + ++priv->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) + priv->tx_wr.send_flags |= IB_SEND_IP_CSUM; + else + priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); + if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on send CQ failed\n"); + netif_stop_queue(dev); + } + + if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), + address->ah, qpn, tx_req, phead, hlen))) { + ipoib_warn(priv, "post_send failed\n"); + ++priv->stats.tx_errors; + --priv->tx_outstanding; + ipoib_dma_unmap_tx(priv->ca, tx_req); + dev_kfree_skb_any(skb); + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + } else { + dev->trans_start = jiffies; + + address->last_send = priv->tx_head; + ++priv->tx_head; + skb_orphan(skb); + + } + + if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) + while (poll_tx(priv)) + ; /* nothing */ +} + +static void __ipoib_reap_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah, *tah; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) + if ((int) priv->tx_tail - (int) ah->last_send >= 0) { + list_del(&ah->list); + ib_destroy_ah(ah->ah); + kfree(ah); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +void ipoib_reap_ah(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, ah_reap_task.work); + struct net_device *dev = priv->dev; + + __ipoib_reap_ah(dev); + + if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); +} + +static void ipoib_ah_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long begin; + + begin = jiffies; + + while (!list_empty(&priv->dead_ahs)) { + __ipoib_reap_ah(dev); + + if (time_after(jiffies, begin + HZ)) { + ipoib_warn(priv, "timing out; will leak address handles\n"); + break; + } + + msleep(1); + } +} + +static void ipoib_ib_tx_timer_func(unsigned long ctx) +{ + drain_tx_cq((struct net_device *)ctx); +} + +int ipoib_ib_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { + ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + return -1; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + ret = ipoib_init_qp(dev); + if (ret) { + ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret); + return -1; + } + + ret = ipoib_ib_post_receives(dev); + if (ret) { + ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); + ipoib_ib_dev_stop(dev, 1); + return -1; + } + + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); + ipoib_ib_dev_stop(dev, 1); + return -1; + } + + clear_bit(IPOIB_STOP_REAPER, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); + + set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + + return 0; +} + +static void 
ipoib_pkey_dev_check_presence(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + u16 pkey_index = 0; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + else + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); +} + +int ipoib_ib_dev_up(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_dbg(priv, "PKEY is not assigned.\n"); + return 0; + } + + set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + + set_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags); + + return ipoib_mcast_start_thread(dev); +} + +int ipoib_ib_dev_down(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "downing ib_dev\n"); + + clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + netif_carrier_off(dev); + + /* Shutdown the P_Key thread if still active */ + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + mutex_lock(&pkey_mutex); + set_bit(IPOIB_PKEY_STOP, &priv->flags); + cancel_delayed_work(&priv->pkey_poll_task); + mutex_unlock(&pkey_mutex); + if (flush) + flush_workqueue(ipoib_workqueue); + } + + /* cancell the adaptive moderation task. */ + if (test_and_clear_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags)) + cancel_delayed_work(&priv->adaptive_moder_task); + + flush_workqueue(ipoib_auto_moder_workqueue); + + ipoib_mcast_stop_thread(dev, flush); + ipoib_mcast_dev_flush(dev); + + ipoib_flush_paths(dev); + + return 0; +} + +static int recvs_pending(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int pending = 0; + int i; + + for (i = 0; i < ipoib_recvq_size; ++i) + if (priv->rx_ring[i].skb) + ++pending; + + return pending; +} + +void ipoib_drain_cq(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, n; + + /* + * We call completion handling routines that expect to be + * called from the BH-disabled NAPI poll context, so disable + * BHs here too. + */ + local_bh_disable(); + + do { + n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc); + for (i = 0; i < n; ++i) { + /* + * Convert any successful completions to flush + * errors to avoid passing packets up the + * stack after bringing the device down. + */ + if (priv->ibwc[i].status == IB_WC_SUCCESS) + priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; + + if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + } else + ipoib_cm_handle_tx_wc(dev, priv->ibwc + i); + } + } while (n == IPOIB_NUM_WC); + + while (poll_tx(priv)) + ; /* nothing */ + + local_bh_enable(); +} + +int ipoib_ib_dev_stop(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + unsigned long begin; + struct ipoib_tx_buf *tx_req; + int i; + + clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + netif_poll_disable(dev); + + ipoib_cm_dev_stop(dev); + + /* + * Move our QP to the error state and then reinitialize in + * when all work requests have completed or have been flushed. 
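+	 * Transitioning to IB_QPS_ERR makes the HCA complete all
+	 * outstanding work requests with a flush status, which the
+	 * drain loop below then reaps before the QP is reset.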
+ */ + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + + /* Cannot move to Error state if we still in RESET state.*/ + if (!ret && qp_attr.qp_state != IB_QPS_RESET) { + qp_attr.qp_state = IB_QPS_ERR; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + } else + ipoib_dbg(priv, "ib_query_qp returned: %d," + "qp state is %d, no need to move to ERROR.\n", + ret, qp_attr.qp_state); + + /* Wait for all sends and receives to complete */ + begin = jiffies; + + while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) { + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends %d receives not completed\n", + priv->tx_head - priv->tx_tail, recvs_pending(dev)); + + /* + * assume the HW is wedged and just free up + * all our pending work requests. + */ + while ((int) priv->tx_tail - (int) priv->tx_head < 0) { + tx_req = &priv->tx_ring[priv->tx_tail & + (ipoib_sendq_size - 1)]; + ipoib_dma_unmap_tx(priv->ca, tx_req); + dev_kfree_skb_any(tx_req->skb); + ++priv->tx_tail; + --priv->tx_outstanding; + } + + for (i = 0; i < ipoib_recvq_size; ++i) { + struct ipoib_rx_buf *rx_req; + + rx_req = &priv->rx_ring[i]; + if (!rx_req->skb) + continue; + ipoib_ud_dma_unmap_rx(priv, + priv->rx_ring[i].mapping); + dev_kfree_skb_any(rx_req->skb); + rx_req->skb = NULL; + } + + goto timeout; + } + + ipoib_drain_cq(dev); + + msleep(1); + } + + ipoib_dbg(priv, "All sends and receives done.\n"); + +timeout: + del_timer_sync(&priv->poll_timer); + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + /* Wait for all AHs to be reaped */ + set_bit(IPOIB_STOP_REAPER, &priv->flags); + cancel_delayed_work(&priv->ah_reap_task); + if (flush) + flush_workqueue(ipoib_workqueue); + + ipoib_ah_dev_cleanup(dev); + + netif_poll_enable(dev); + ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); + + return 0; +} + +int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + priv->ca = ca; + priv->port = port; + priv->qp = NULL; + + if (ipoib_transport_dev_init(dev, ca)) { + printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name); + return -ENODEV; + } + + setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func, + (unsigned long) dev); + + if (dev->flags & IFF_UP) { + if (ipoib_ib_dev_open(dev)) { + ipoib_transport_dev_cleanup(dev); + return -ENODEV; + } + } + + return 0; +} + +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level) +{ + struct ipoib_dev_priv *cpriv; + struct net_device *dev = priv->dev; + u16 new_index; + + mutex_lock(&priv->vlan_mutex); + + /* + * Flush any child interfaces too -- they might be up even if + * the parent is down. 
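+	 * The children are flushed first, under vlan_mutex, before the
+	 * parent's own INITIALIZED/ADMIN_UP checks below.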
+ */ + list_for_each_entry(cpriv, &priv->child_intfs, list) + __ipoib_ib_dev_flush(cpriv, level); + + mutex_unlock(&priv->vlan_mutex); + + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); + return; + } + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); + return; + } + + if (level == IPOIB_FLUSH_HEAVY) { + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + if (ipoib_pkey_dev_delay_open(dev)) + return; + } + + /* restart QP only if P_Key index is changed */ + if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + new_index == priv->pkey_index) { + ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); + return; + } + priv->pkey_index = new_index; + } + + if (level == IPOIB_FLUSH_LIGHT) { + ipoib_mark_paths_invalid(dev); + ipoib_mcast_dev_flush(dev); + } + + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_down(dev, 0); + + if (level == IPOIB_FLUSH_HEAVY) { + ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_open(dev); + } + + /* + * The device could have been brought down between the start and when + * we get here, don't bring it back up if it's not configured up + */ + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_up(dev); + ipoib_mcast_restart_task(&priv->restart_task); + } +} + +void ipoib_ib_dev_flush_light(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_light); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); +} + +void ipoib_ib_dev_flush_normal(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_normal); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); +} + +void ipoib_ib_dev_flush_heavy(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_heavy); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); +} + +void ipoib_ib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "cleaning up ib_dev\n"); + + ipoib_mcast_stop_thread(dev, 1); + ipoib_mcast_dev_flush(dev); + + ipoib_ah_dev_cleanup(dev); + ipoib_transport_dev_cleanup(dev); +} + +/* + * Delayed P_Key Assigment Interim Support + * + * The following is initial implementation of delayed P_Key assigment + * mechanism. It is using the same approach implemented for the multicast + * group join. The single goal of this implementation is to quickly address + * Bug #2507. This implementation will probably be removed when the P_Key + * change async notification is available. 
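+ * ipoib_pkey_dev_delay_open() checks the port's P_Key table and, if the
+ * interface P_Key is not present yet, re-queues pkey_poll_task every HZ
+ * until it shows up, at which point ipoib_open() is retried.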
+ */ + +void ipoib_pkey_poll(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); + struct net_device *dev = priv->dev; + + ipoib_pkey_dev_check_presence(dev); + + if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + ipoib_open(dev); + else { + mutex_lock(&pkey_mutex); + if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->pkey_poll_task, + HZ); + mutex_unlock(&pkey_mutex); + } +} + +int ipoib_pkey_dev_delay_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* Look for the interface pkey value in the IB Port P_Key table and */ + /* set the interface pkey assigment flag */ + ipoib_pkey_dev_check_presence(dev); + + /* P_Key value not assigned yet - start polling */ + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + mutex_lock(&pkey_mutex); + clear_bit(IPOIB_PKEY_STOP, &priv->flags); + queue_delayed_work(ipoib_workqueue, + &priv->pkey_poll_task, + HZ); + mutex_unlock(&pkey_mutex); + return 1; + } + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_main.c new file mode 100644 index 0000000..d9e5aa1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_main.c @@ -0,0 +1,2067 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ipoib.h" + +#include + +#include +#include +#include +#include + +#include /* For ARPHRD_xxx */ + +#include +#include + +#include +#include + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif + +int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; +int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; + +module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); + +static int lro = 1; +module_param(lro, bool, 0444); +MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)"); + +static int lro_max_aggr = IPOIB_LRO_MAX_AGGR; +module_param(lro_max_aggr, int, 0644); +MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated " + "(default = 64)"); + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +int ipoib_debug_level; + +module_param_named(debug_level, ipoib_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); +#endif + +int ipoib_mc_sendonly_timeout; + +module_param_named(mc_sendonly_timeout, ipoib_mc_sendonly_timeout, int, 0644); +MODULE_PARM_DESC(mc_sendonly_timeout, "Enable debug tracing if > 0"); + +#ifndef rcu_dereference_protected +#define rcu_dereference_protected(p, c) \ + rcu_dereference((p)) +#endif + +#ifndef rcu_dereference_bh +#define rcu_dereference_bh(p) \ + rcu_dereference((p)) +#endif + + +struct ipoib_path_iter { + struct net_device *dev; + struct ipoib_path path; +}; + +static const u8 ipv4_bcast_addr[] = { + 0x00, 0xff, 0xff, 0xff, + 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff +}; + +struct workqueue_struct *ipoib_workqueue; + +struct workqueue_struct *ipoib_auto_moder_workqueue; + +struct ib_sa_client ipoib_sa_client; + +static LIST_HEAD(ipoib_all_neigh_list); + +static void ipoib_add_one(struct ib_device *device); +static void ipoib_remove_one(struct ib_device *device); +static void ipoib_neigh_reclaim(struct rcu_head *rp); + +static struct ib_client ipoib_client = { + .name = "ipoib", + .add = ipoib_add_one, + .remove = ipoib_remove_one +}; + +#define to_net_dev(class) container_of(class, struct net_device, class_dev) + +int ipoib_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "bringing up interface\n"); + + set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + if (ipoib_pkey_dev_delay_open(dev)) + return 0; + + if (ipoib_ib_dev_open(dev)) + goto err_disable; + + if (ipoib_ib_dev_up(dev)) + goto err_stop; + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring up any child interfaces too */ + mutex_lock(&priv->vlan_mutex); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (flags & IFF_UP) + continue; + + dev_change_flags(cpriv->dev, flags | IFF_UP); + } + mutex_unlock(&priv->vlan_mutex); + } + + netif_start_queue(dev); + + if (priv->ethtool.use_adaptive_rx_coalesce) { + set_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags); + queue_delayed_work(ipoib_auto_moder_workqueue, + &priv->adaptive_moder_task, + ADAPT_MODERATION_DELAY); + } + + return 0; + +err_stop: + ipoib_ib_dev_stop(dev, 1); + +err_disable: + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + + return 
-EINVAL; +} + +static int ipoib_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg(priv, "stopping interface\n"); + mutex_lock(&priv->state_lock); + clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + mutex_unlock(&priv->state_lock); + + netif_stop_queue(dev); + + ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_stop(dev, 0); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + struct ipoib_dev_priv *cpriv; + + /* Bring down any child interfaces too */ + mutex_lock(&priv->vlan_mutex); + list_for_each_entry(cpriv, &priv->child_intfs, list) { + int flags; + + flags = cpriv->dev->flags; + if (!(flags & IFF_UP)) + continue; + + dev_change_flags(cpriv->dev, flags & ~IFF_UP); + } + mutex_unlock(&priv->vlan_mutex); + } + + return 0; +} + +static int ipoib_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev)) { + if (new_mtu > ipoib_cm_max_mtu(dev)) + return -EINVAL; + + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) + return -EINVAL; + + priv->admin_mtu = new_mtu; + + dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + + queue_work(ipoib_workqueue, &priv->flush_light); + + return 0; +} + +static struct ipoib_path *__path_find(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node *n = priv->path_tree.rb_node; + struct ipoib_path *path; + int ret; + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + ret = memcmp(gid, path->pathrec.dgid.raw, + sizeof (union ib_gid)); + + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return path; + } + + return NULL; +} + +static int __path_add(struct net_device *dev, struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node **n = &priv->path_tree.rb_node; + struct rb_node *pn = NULL; + struct ipoib_path *tpath; + int ret; + + while (*n) { + pn = *n; + tpath = rb_entry(pn, struct ipoib_path, rb_node); + + ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&path->rb_node, pn, n); + rb_insert_color(&path->rb_node, &priv->path_tree); + + list_add_tail(&path->list, &priv->path_list); + + return 0; +} + +static void path_free(struct net_device *dev, struct ipoib_path *path) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&path->queue))) + dev_kfree_skb_irq(skb); + + ipoib_dbg(netdev_priv(dev), "path_free\n"); + + /* remove all neigh connected to this path */ + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); + + if (path->ah) + ipoib_put_ah(path->ah); + + kfree(path); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) +{ + struct ipoib_path_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->path.pathrec.dgid.raw, 0, 16); + + if (ipoib_path_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_path_iter_next(struct ipoib_path_iter *iter) +{ + struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct rb_node *n; + struct ipoib_path *path; + int ret = 1; + + 
spin_lock_irq(&priv->lock); + + n = rb_first(&priv->path_tree); + + while (n) { + path = rb_entry(n, struct ipoib_path, rb_node); + + if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, + sizeof (union ib_gid)) < 0) { + iter->path = *path; + ret = 0; + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_path_iter_read(struct ipoib_path_iter *iter, + struct ipoib_path *path) +{ + *path = iter->path; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ + +void ipoib_mark_paths_invalid(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(path, tp, &priv->path_list, list) { + ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n", + be16_to_cpu(path->pathrec.dlid), + IPOIB_GID_ARG(path->pathrec.dgid)); + path->valid = 0; + } + + spin_unlock_irq(&priv->lock); +} + +void ipoib_flush_paths(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + LIST_HEAD(remove_list); + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + list_splice_init(&priv->path_list, &remove_list); + + list_for_each_entry(path, &remove_list, list) + rb_erase(&path->rb_node, &priv->path_tree); + + list_for_each_entry_safe(path, tp, &remove_list, list) { + if (path->query) + ib_sa_cancel_query(path->query_id, path->query); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + wait_for_completion(&path->done); + list_del(&path->list); + path_free(dev, path); + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); +} + +static void path_rec_completion(int status, + struct ib_sa_path_rec *pathrec, + void *path_ptr) +{ + struct ipoib_path *path = path_ptr; + struct net_device *dev = path->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah = NULL; + struct ipoib_ah *old_ah = NULL; + struct ipoib_neigh *neigh, *tn; + struct sk_buff_head skqueue; + struct sk_buff *skb; + unsigned long flags; + int ret; + + if (!status) + ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n", + be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid)); + else + ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n", + status, IPOIB_GID_ARG(path->pathrec.dgid)); + + skb_queue_head_init(&skqueue); + + if (!status) { + struct ib_ah_attr av; + + if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) + ah = ipoib_create_ah(dev, priv->pd, &av); + } + + spin_lock_irqsave(&priv->lock, flags); + + if (ah) { + path->pathrec = *pathrec; + + old_ah = path->ah; + path->ah = ah; + + ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", + ah, be16_to_cpu(pathrec->dlid), pathrec->sl); + + while ((skb = __skb_dequeue(&path->queue))) + __skb_queue_tail(&skqueue, skb); + + list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + if (neigh->ah) { + WARN_ON(neigh->ah != old_ah); + /* + * Dropping the ah reference inside + * priv->lock is safe here, because we + * will hold one more reference from + * the original value of path->ah (ie + * old_ah). 
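+				 * old_ah itself is only released further
+				 * down, after priv->lock has been dropped.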
+ */ + ipoib_put_ah(neigh->ah); + } + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); + continue; + } + } + + while ((skb = __skb_dequeue(&neigh->queue))) + __skb_queue_tail(&skqueue, skb); + } + path->valid = 1; + } + + path->query = NULL; + complete(&path->done); + + spin_unlock_irqrestore(&priv->lock, flags); + + if (old_ah) + ipoib_put_ah(old_ah); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = dev; + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s: dev_queue_xmit failed to requeue" + " packet (ret:%d)\n", __func__, ret); + } +} + +static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + + if (!priv->broadcast) + return NULL; + + path = kzalloc(sizeof *path, GFP_ATOMIC); + if (!path) + return NULL; + + path->dev = dev; + + skb_queue_head_init(&path->queue); + + INIT_LIST_HEAD(&path->neigh_list); + + memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; + + return path; +} + +static int path_rec_start(struct net_device *dev, + struct ipoib_path *path) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; + struct ib_sa_path_rec p_rec; + + p_rec = path->pathrec; + p_rec.mtu_selector = IB_SA_GT; + + switch (roundup_pow_of_two(dev->mtu + IPOIB_ENCAP_LEN)) { + case 512: + p_rec.mtu = IB_MTU_256; + break; + case 1024: + p_rec.mtu = IB_MTU_512; + break; + case 2048: + p_rec.mtu = IB_MTU_1024; + break; + case 4096: + p_rec.mtu = IB_MTU_2048; + break; + default: + /* Wildcard everything */ + comp_mask = 0; + p_rec.mtu = 0; + p_rec.mtu_selector = 0; + } + + ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT " > %d\n", + IPOIB_GID_ARG(p_rec.dgid), + comp_mask ? 
ib_mtu_enum_to_int(p_rec.mtu) : 0); + + init_completion(&path->done); + + path->query_id = + ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, + &p_rec, comp_mask | + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_TRAFFIC_CLASS | + IB_SA_PATH_REC_PKEY, + 1000, 0, GFP_ATOMIC, + path_rec_completion, + path, &path->query); + if (path->query_id < 0) { + ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); + path->query = NULL; + complete(&path->done); + return path->query_id; + } + + return 0; +} + +static void neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) + +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + struct ipoib_neigh *neigh; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + neigh = ipoib_neigh_alloc(daddr, dev); + if (!neigh) { + spin_unlock_irqrestore(&priv->lock, flags); + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + return; + } + + path = __path_find(dev, daddr + 4); + if (!path) { + path = path_rec_create(dev, daddr + 4); + if (!path) + goto err_path; + + __path_add(dev, path); + } + + list_add_tail(&neigh->list, &path->neigh_list); + + if (path->ah) { + kref_get(&path->ah->ref); + neigh->ah = path->ah; + + if (ipoib_cm_enabled(dev, neigh->daddr)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else { + ipoib_warn(priv, "queue length limit %d. Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } + } else + ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); + } else { + neigh->ah = NULL; + + if (!path->query && path_rec_start(dev, path)) + goto err_list; + + __skb_queue_tail(&neigh->queue, skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); + return; + +err_list: + list_del_init(&neigh->list); + +err_path: + ipoib_neigh_free(neigh); +err_drop: + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + + spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); +} + +static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, + struct ipoib_cb *cb) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + path = __path_find(dev, cb->hwaddr + 4); + if (!path || !path->valid) { + int new_path = 0; + + if (!path) { + path = path_rec_create(dev, cb->hwaddr + 4); + new_path = 1; + } + if (path) { + __skb_queue_tail(&path->queue, skb); + + if (!path->query && path_rec_start(dev, path)) { + spin_unlock_irqrestore(&priv->lock, flags); + if (new_path) + path_free(dev, path); + return; + } else + __path_add(dev, path); + } else { + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); + return; + } + + if (path->ah) { + ipoib_dbg(priv, "Send unicast ARP to %04x\n", + be16_to_cpu(path->pathrec.dlid)); + + ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); + } else if ((path->query || !path_rec_start(dev, path)) && + skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + __skb_queue_tail(&path->queue, skb); + } else { + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + spin_unlock_irqrestore(&priv->lock, flags); +} + +static int ipoib_start_xmit(struct sk_buff 
*skb, struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh *neigh; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + struct ipoib_header *header; + unsigned long flags; + + header = (struct ipoib_header *) skb->data; + + if (unlikely(cb->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && + (header->proto != htons(ETH_P_ARP)) && + (header->proto != htons(ETH_P_RARP))) { + /* ethertype not supported by IPoIB */ + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + /* Add in the P_Key for multicast*/ + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; + cb->hwaddr[9] = priv->pkey & 0xff; + + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (likely(neigh)) + goto send_using_neigh; + ipoib_mcast_send(dev, cb->hwaddr, skb); + return NETDEV_TX_OK; + } + /* unicast, arrange "switch" according to probability */ + switch (htons(header->proto)) { + case ETH_P_IP: + case ETH_P_IPV6: + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (unlikely(!neigh)) { + neigh_add_path(skb, cb->hwaddr, dev); + return NETDEV_TX_OK; + } + break; + case ETH_P_ARP: + case ETH_P_RARP: + /* for unicast ARP and RARP should always perform path find */ + unicast_arp_send(skb, dev, cb); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } +send_using_neigh: + /* note we now hold a ref to neigh */ + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unref; + } + } else if (neigh->ah) { + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); + goto unref; + } + + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + } else { + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + +unref: + ipoib_neigh_put(neigh); + + return NETDEV_TX_OK; +} + +static struct net_device_stats *ipoib_get_stats(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + return &priv->stats; +} + +static void ipoib_timeout(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_warn(priv, "transmit timeout: latency %d msecs\n", + jiffies_to_msecs(jiffies - dev->trans_start)); + ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u, tx_outstanding %u ipoib_sendq_size: %d \n", + netif_queue_stopped(dev),priv->tx_head, priv->tx_tail, priv->tx_outstanding, ipoib_sendq_size); + + if (unlikely(priv->tx_outstanding < ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + ipoib_warn(priv, "%s: waking the queue\n", __func__); + netif_wake_queue(dev); + } + + /* XXX reset QP, etc. */ +} + +static int ipoib_hard_header(struct sk_buff *skb, + struct net_device *dev, + unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct ipoib_header *header; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + + header = (struct ipoib_header *) skb_push(skb, sizeof *header); + + header->proto = htons(type); + header->reserved = 0; + + /* + * we don't rely on dst_entry structure, always stuff the + * destination address into skb->cb so we can figure out where + * to send the packet later. 
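+	 * cb->hwaddr holds the 20-byte IPoIB hardware address (flags/QPN
+	 * plus GID); ipoib_start_xmit() reads it to choose multicast vs.
+	 * unicast handling and to look up the path.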
+ */ + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); + + return 0; +} + +static void ipoib_set_mcast_list(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); + return; + } + + queue_work(ipoib_workqueue, &priv->restart_task); +} + +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) +{ + /* + * * Use only the address parts that contributes to spreading + * * The subnet prefix is not used as one can not connect to + * * same remote port (GUID) using the same remote QPN via two + * * different subnets. + * */ + /* qpn octets[1:4) & port GUID octets[12:20) */ + u32 *daddr_32 = (u32 *) daddr; + u32 hv; + + hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0); + return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh = NULL; + u32 hash_val; + + rcu_read_lock_bh(); + + htbl = rcu_dereference_bh(ntbl->htbl); + + if (!htbl) + goto out_unlock; + + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); + neigh != NULL; + neigh = rcu_dereference_bh(neigh->hnext)) { + /* don't use flags for the comapre */ + if (memcmp(daddr+1, neigh->daddr+1, INFINIBAND_ALEN-1) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + goto out_unlock; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + +out_unlock: + rcu_read_unlock_bh(); + return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long neigh_obsolete; + unsigned long dt; + unsigned long flags; + int i; + + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + return; + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + /* neigh is obsolete if it was idle for two GC periods */ + dt = 2 * arp_tbl.gc_interval; + neigh_obsolete = jiffies - dt; + /* handle possible race condition */ + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* was the neigh idle for two GC periods */ + if (time_after(neigh_obsolete, neigh->alive)) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_reap_neigh(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); + + __ipoib_reap_neigh(priv); + + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); +} + +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_neigh *neigh; + + neigh = kzalloc(sizeof 
*neigh, GFP_ATOMIC); + if (!neigh) + return NULL; + + neigh->dev = dev; + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); + skb_queue_head_init(&neigh->queue); + INIT_LIST_HEAD(&neigh->list); + ipoib_cm_set(neigh, NULL); + /* one ref on behalf of the caller */ + atomic_set(&neigh->refcnt, 1); + + ipoib_dbg(netdev_priv(dev), + "neigh ctor for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); + + return neigh; +} + +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) { + neigh = NULL; + goto out_unlock; + } + + /* need to add a new neigh, but maybe some other thread succeeded? + * recalc hash, maybe hash resize took place so we do a search + */ + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock)); + neigh != NULL; + neigh = rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))) { + /* don't use flags for the comapre */ + if (memcmp(daddr+1, neigh->daddr+1, INFINIBAND_ALEN-1) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + break; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + + neigh = ipoib_neigh_ctor(daddr, dev); + if (!neigh) + goto out_unlock; + + /* one ref on behalf of the hash table */ + atomic_inc(&neigh->refcnt); + neigh->alive = jiffies; + /* put in hash */ + rcu_assign_pointer(neigh->hnext, + rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock))); + rcu_assign_pointer(htbl->buckets[hash_val], neigh); + atomic_inc(&ntbl->entries); + +out_unlock: + return neigh; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) +{ + /* neigh reference count was dropprd to zero */ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + if (neigh->ah) + ipoib_put_ah(neigh->ah); + while ((skb = __skb_dequeue(&neigh->queue))) { + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + ipoib_dbg(netdev_priv(dev), + "neigh free for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); + kfree(neigh); + if (atomic_dec_and_test(&priv->ntbl.entries)) { + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) + complete(&priv->ntbl.flushed); + } +} + +static void ipoib_neigh_reclaim(struct rcu_head *rp) +{ + /* Called as a result of removal from hash table */ + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); + /* note TX context may hold another ref */ + ipoib_neigh_put(neigh); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **np; + struct ipoib_neigh *n; + u32 hash_val; + + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + return; + + hash_val = ipoib_addr_hash(htbl, neigh->daddr); + np = &htbl->buckets[hash_val]; + for (n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock)); + n != NULL; + n = rcu_dereference_protected(*np, + 
lockdep_is_held(&priv->lock))) { + if (n == neigh) { + /* found */ + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + return; + } else { + np = &n->hnext; + } + } +} + +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh **buckets; + u32 size; + + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + ntbl->htbl = NULL; + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); + if (!htbl) + return -ENOMEM; + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + size = roundup_pow_of_two(arp_tbl.gc_thresh3); + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); + if (!buckets) { + kfree(htbl); + return -ENOMEM; + } + htbl->size = size; + htbl->mask = (size - 1); + htbl->buckets = buckets; + ntbl->htbl = htbl; + htbl->ntbl = ntbl; + atomic_set(&ntbl->entries, 0); + + /* start garbage collection */ + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); + + return 0; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct ipoib_neigh_hash *htbl = container_of(head, + struct ipoib_neigh_hash, + rcu); + struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; + + kfree(buckets); + kfree(htbl); + complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + /* remove all neigh connected to a given path or mcast */ + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* delete neighs belong to this parent */ + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&ntbl->rwlock))) != NULL) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&ntbl->rwlock))); + /* remove from path/mc list */ + spin_lock_irqsave(&priv->lock, flags); + list_del_init(&neigh->list); + spin_unlock_irqrestore(&priv->lock, flags); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } + } + + rcu_assign_pointer(ntbl->htbl, NULL); + call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: + 
spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int stopped; + + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); + init_completion(&priv->ntbl.deleted); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + + /* Stop GC if called at init fail need to cancel work */ + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + if (!stopped) + cancel_delayed_work_sync(&priv->neigh_reap_task); + + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); + +} + +static void ipoib_set_default_moderation(struct ipoib_dev_priv *priv) +{ + + /* If we haven't received a specific coalescing setting + * (module param), we set the moderation parameters as follows: + * - moder_cnt is set to the number of mtu sized packets to + * satisfy our coaelscing target. + * - moder_time is set to a fixed value. + */ + priv->ethtool.rx_max_coalesced_frames = IPOIB_RX_COAL_TARGET; + priv->ethtool.rx_coalesce_usecs = IPOIB_RX_COAL_TIME; + printk(KERN_ERR "Default coalesing params for mtu:%d - " + "rx_frames:%d rx_usecs:%d\n", + priv->dev->mtu, priv->ethtool.rx_max_coalesced_frames, + priv->ethtool.rx_coalesce_usecs); + + /* Reset auto-moderation params */ + priv->ethtool.pkt_rate_low = IPOIB_RX_RATE_LOW; + priv->ethtool.rx_coalesce_usecs_low = IPOIB_RX_COAL_TIME_LOW; + priv->ethtool.pkt_rate_high = IPOIB_RX_RATE_HIGH; + priv->ethtool.rx_coalesce_usecs_high = IPOIB_RX_COAL_TIME_HIGH; + priv->ethtool.sample_interval = IPOIB_SAMPLE_INTERVAL; + priv->ethtool.use_adaptive_rx_coalesce = 1; + priv->ethtool.last_moder_time = IPOIB_AUTO_CONF; + priv->ethtool.last_moder_jiffies = 0; + priv->ethtool.last_moder_packets = 0; + priv->ethtool.last_moder_tx_packets = 0; + priv->ethtool.last_moder_bytes = 0; +} +/* +The function classifies the incoming traffic during each sampling interval +into classes. The rx_usec value (i.e., moderation time) is then adjusted +appropriately per class. +There are two classes defined: + A. Bulk traffic: for heavy traffic consisting of packets of normal size. + This class is further divided into two sub-classes: + 1. Traffic that is mainly BW bound + - This traffic will get maximum moderation. + 2. Traffic that is mostly latency bound + - For situations where low latency is vital + - The rx_usec will be changed to a value in the range: + (ethtool.pkt_rate_low .. ethtool.pkt_rate_high) + depending on sampled packet rate. + B. Low latency traffic: for minimal traffic, or small packets. + - This traffic will get minimum moderation. +*/ +static void ipoib_auto_moderation(struct ipoib_dev_priv *priv) +{ + unsigned long period = jiffies - priv->ethtool.last_moder_jiffies; + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_pkt_diff; + unsigned long rx_pkt_diff; + int moder_time; + int ret; + + if (!priv->ethtool.use_adaptive_rx_coalesce) + return; + + rx_packets = priv->stats.rx_packets; + rx_bytes = priv->stats.rx_bytes; + tx_packets = priv->stats.tx_packets; + + tx_pkt_diff = tx_packets - priv->ethtool.last_moder_tx_packets; + rx_pkt_diff = rx_packets - priv->ethtool.last_moder_packets; + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? 
+ (rx_bytes - priv->ethtool.last_moder_bytes) / packets : 0; + + /* Apply auto-moderation only when packet rate exceeds a rate that + * it matters */ + if (rate > IPOIB_RX_RATE_THRESH && + avg_pkt_size > IPOIB_AVG_PKT_SMALL) { + if (rate < priv->ethtool.pkt_rate_low) + moder_time = + priv->ethtool.rx_coalesce_usecs_low; + else if (rate > priv->ethtool.pkt_rate_high) + moder_time = + priv->ethtool.rx_coalesce_usecs_high; + else + moder_time = (rate - priv->ethtool.pkt_rate_low) * + (priv->ethtool.rx_coalesce_usecs_high - priv->ethtool.rx_coalesce_usecs_low) / + (priv->ethtool.pkt_rate_high - priv->ethtool.pkt_rate_low) + + priv->ethtool.rx_coalesce_usecs_low; + + } else + moder_time = priv->ethtool.rx_coalesce_usecs_low; + + if (moder_time != priv->ethtool.last_moder_time) { + struct ib_cq_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.moderation.cq_count = priv->ethtool.rx_max_coalesced_frames; + attr.moderation.cq_period = moder_time; + ipoib_dbg(priv, "%s: Rx moder_time changed from:%d to %d\n", + __func__, priv->ethtool.last_moder_time, moder_time); + priv->ethtool.last_moder_time = moder_time; + ret = ib_modify_cq(priv->recv_cq, + &attr, + IB_CQ_MODERATE); + if (ret && ret != -ENOSYS) + ipoib_warn(priv, "%s: failed modifying CQ (%d)\n", + __func__, ret); + } + + priv->ethtool.last_moder_packets = rx_packets; + priv->ethtool.last_moder_tx_packets = tx_packets; + priv->ethtool.last_moder_bytes = rx_bytes; + priv->ethtool.last_moder_jiffies = jiffies; +} + +static void ipoib_config_adapt_moder(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct ipoib_dev_priv *priv = container_of(delay, + struct ipoib_dev_priv, + adaptive_moder_task); + + if (!(netif_running(priv->dev) && netif_carrier_ok(priv->dev))) { + ipoib_dbg(priv, "%s: port is not ACTIVE, no configuration" + " for adaptive moderation\n", + __func__); + return; + } + + ipoib_auto_moderation(priv); + + if (test_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags) && + priv->ethtool.use_adaptive_rx_coalesce) + queue_delayed_work(ipoib_auto_moder_workqueue, + &priv->adaptive_moder_task, + ADAPT_MODERATION_DELAY); +} + +int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (ipoib_neigh_hash_init(priv) < 0) + goto out; + + /* Allocate RX/TX "rings" to hold queued skbs */ + priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, + GFP_KERNEL); + if (!priv->rx_ring) { + printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", + ca->name, ipoib_recvq_size); + goto out_neigh_hash_cleanup; + } + + priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring); + if (!priv->tx_ring) { + printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", + ca->name, ipoib_sendq_size); + goto out_rx_ring_cleanup; + } + memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); + + /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ + if (ipoib_ib_dev_init(dev, ca, port)) + goto out_tx_ring_cleanup; + + ipoib_set_default_moderation(priv); + + return 0; + +out_tx_ring_cleanup: + vfree(priv->tx_ring); + +out_rx_ring_cleanup: + kfree(priv->rx_ring); + +out_neigh_hash_cleanup: + ipoib_neigh_hash_uninit(dev); +out: + return -ENOMEM; +} + +void ipoib_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv; + + ipoib_delete_debug_files(dev); + + /* Delete any child interfaces first */ + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, 
list) { + /* Stop GC on child */ + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); + cancel_delayed_work(&cpriv->neigh_reap_task); + unregister_netdev(cpriv->dev); + ipoib_dev_cleanup(cpriv->dev); + free_netdev(cpriv->dev); + } + + ipoib_ib_dev_cleanup(dev); + + kfree(priv->rx_ring); + vfree(priv->tx_ring); + + priv->rx_ring = NULL; + priv->tx_ring = NULL; + + ipoib_neigh_hash_uninit(dev); +} + +static int get_skb_hdr(struct sk_buff *skb, void **iphdr, + void **tcph, u64 *hdr_flags, void *priv) +{ + unsigned int ip_len; + struct iphdr *iph; + + if (unlikely(skb->protocol != htons(ETH_P_IP))) + return -1; + + /* + * In the future we may add an else clause that verifies the + * checksum and allows devices which do not calculate checksum + * to use LRO. + */ + if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) + return -1; + + /* Check for non-TCP packet */ + skb_reset_network_header(skb); + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_TCP) + return -1; + + ip_len = ip_hdrlen(skb); + skb_set_transport_header(skb, ip_len); + *tcph = tcp_hdr(skb); + + /* check if IP header and TCP header are complete */ + if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) + return -1; + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + + return 0; +} + +static void ipoib_lro_setup(struct ipoib_dev_priv *priv) +{ + priv->lro.lro_mgr.max_aggr = lro_max_aggr; + priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS; + priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc; + priv->lro.lro_mgr.get_skb_header = get_skb_hdr; + priv->lro.lro_mgr.features = LRO_F_NAPI; + priv->lro.lro_mgr.dev = priv->dev; + priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; +} + +static void ipoib_setup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + dev->open = ipoib_open; + dev->stop = ipoib_stop; + dev->change_mtu = ipoib_change_mtu; + dev->hard_start_xmit = ipoib_start_xmit; + dev->get_stats = ipoib_get_stats; + dev->tx_timeout = ipoib_timeout; + dev->set_multicast_list = ipoib_set_mcast_list; + + dev->hard_header = ipoib_hard_header; + + ipoib_set_ethtool_ops(dev); + + dev->poll = ipoib_poll; + dev->weight = 100; + + dev->watchdog_timeo = 5 * HZ; + + dev->flags |= IFF_BROADCAST | IFF_MULTICAST; + + dev->hard_header_len = IPOIB_ENCAP_LEN; + dev->addr_len = INFINIBAND_ALEN; + dev->type = ARPHRD_INFINIBAND; + dev->tx_queue_len = ipoib_sendq_size * 2; + dev->features = (NETIF_F_VLAN_CHALLENGED | + NETIF_F_HIGHDMA); + + memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + + netif_carrier_off(dev); + + priv->dev = dev; + + ipoib_lro_setup(priv); + + spin_lock_init(&priv->lock); + spin_lock_init(&priv->rx_ring_lock); + + mutex_init(&priv->vlan_mutex); + mutex_init(&priv->state_lock); + + INIT_LIST_HEAD(&priv->path_list); + INIT_LIST_HEAD(&priv->child_intfs); + INIT_LIST_HEAD(&priv->dead_ahs); + INIT_LIST_HEAD(&priv->multicast_list); + + INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); + INIT_DELAYED_WORK(&priv->mcast_join_task, ipoib_mcast_join_task); + INIT_DELAYED_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task); + INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); + INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); + INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); + INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + INIT_DELAYED_WORK(&priv->adaptive_moder_task, ipoib_config_adapt_moder); + 
INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); +} + +struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) +{ + struct net_device *dev; + + dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name, + ipoib_setup); + if (!dev) + return NULL; + + return netdev_priv(dev); +} + +static ssize_t show_pkey(struct class_device *dev, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "0x%04x\n", priv->pkey); +} +static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); + +static ssize_t show_dev_id(struct class_device *d, char *buf) +{ + return sprintf(buf, "%d\n", to_net_dev(d)->dev_id); +} +static CLASS_DEVICE_ATTR(dev_id, S_IRUGO, show_dev_id, NULL); + +static ssize_t show_umcast(struct class_device *dev, char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); +} + +static ssize_t set_umcast(struct class_device *dev, + const char *buf, size_t count) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + unsigned long umcast_val = simple_strtoul(buf, NULL, 0); + + if (umcast_val > 0) { + set_bit(IPOIB_FLAG_UMCAST, &priv->flags); + ipoib_warn(priv, "ignoring multicast groups joined directly " + "by userspace\n"); + } else + clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); + + return count; +} +static CLASS_DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast); + +int ipoib_add_umcast_attr(struct net_device *dev) +{ + return class_device_create_file(&dev->class_dev, + &class_device_attr_umcast); +} + +static int parse_child(struct device *dev, const char *buf, int *pkey, + int *child_index) +{ + int ret; + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + *pkey = *child_index = -1; + + /* 'pkey' or 'pkey.child_index' or '.child_index' are allowed */ + ret = sscanf(buf, "%i.%i", pkey, child_index); + if (ret == 1) /* just pkey, implicit child index is 0 */ + *child_index = 0; + else if (ret != 2) { /* pkey same as parent, specified child index */ + *pkey = priv->pkey; + ret = sscanf(buf, ".%i", child_index); + if (ret != 1 || *child_index == 0) + return -EINVAL; + } + + if (*child_index < 0 || *child_index > 0xff) + return -EINVAL; + + if (*pkey < 0 || *pkey > 0xffff) + return -EINVAL; + + ipoib_dbg(priv, "parse_child inp %s out pkey %04x index %d\n", + buf, *pkey, *child_index); + return 0; +} + +static ssize_t create_child(struct class_device *dev, + const char *buf, size_t count) +{ + int pkey, child_index; + int ret; + + if (parse_child(dev, buf, &pkey, &child_index)) + return -EINVAL; + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. + */ + pkey |= 0x8000; + + ret = ipoib_vlan_add(to_net_dev(dev), pkey, child_index); + + return ret ? ret : count; +} +static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); + +static ssize_t delete_child(struct class_device *dev, + const char *buf, size_t count) +{ + int pkey, child_index; + int ret; + + if (parse_child(dev, buf, &pkey, &child_index)) + return -EINVAL; + + ret = ipoib_vlan_delete(to_net_dev(dev), pkey, child_index); + + return ret ? 
ret : count; + +} +static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); + +int ipoib_add_pkey_attr(struct net_device *dev) +{ + return class_device_create_file(&dev->class_dev, + &class_device_attr_pkey); +} + +void set_lro_features_bit(struct ipoib_dev_priv *priv) +{ + if (lro) + priv->dev->features |= NETIF_F_LRO; + /*no support in LRO with 4k mtu.*/ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) + priv->dev->features &= ~NETIF_F_LRO; +} + +int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) +{ + struct ib_device_attr *device_attr; + int result = -ENOMEM; + + device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); + if (!device_attr) { + printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", + hca->name, sizeof *device_attr); + return result; + } + + result = ib_query_device(hca, device_attr); + if (result) { + printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", + hca->name, result); + kfree(device_attr); + return result; + } + priv->hca_caps = device_attr->device_cap_flags; + + kfree(device_attr); + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + set_bit(IPOIB_FLAG_CSUM, &priv->flags); + priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; + } + + set_lro_features_bit(priv); + + if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->features |= NETIF_F_TSO; + + priv->dev->features |= SOCK_ACCL_POLL_TCP | SOCK_ACCL_POLL_UDP; + + return 0; +} + + +static struct net_device *ipoib_add_port(const char *format, + struct ib_device *hca, u8 port) +{ + struct ipoib_dev_priv *priv; + struct ib_port_attr attr; + int result = -ENOMEM; + + priv = ipoib_intf_alloc(format); + if (!priv) + goto alloc_mem_failed; + + SET_NETDEV_DEV(priv->dev, hca->dma_device); + priv->dev->dev_id = port - 1; + + if (!ib_query_port(hca, port, &attr)) + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + else { + printk(KERN_WARNING "%s: ib_query_port %d failed\n", + hca->name, port); + goto device_init_failed; + } + + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + + result = ib_query_pkey(hca, port, 0, &priv->pkey); + if (result) { + printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + result = ipoib_set_dev_features(priv, hca); + if (result) { + printk(KERN_WARNING "%s: failed to set device features for port %d (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. 
+ */ + priv->pkey |= 0x8000; + + priv->dev->broadcast[8] = priv->pkey >> 8; + priv->dev->broadcast[9] = priv->pkey & 0xff; + + priv->accl_priv.poll = &ipoib_accl_poll; + priv->accl_priv.get_tcp_ring = ipoib_get_tcp_ring; + priv->accl_priv.get_udp_rings = ipoib_get_udp_rings; + + result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); + if (result) { + printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } else + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + + result = ipoib_dev_init(priv->dev, hca, port); + if (result < 0) { + printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", + hca->name, port, result); + goto device_init_failed; + } + + INIT_IB_EVENT_HANDLER(&priv->event_handler, + priv->ca, ipoib_event); + result = ib_register_event_handler(&priv->event_handler); + if (result < 0) { + printk(KERN_WARNING "%s: ib_register_event_handler failed for " + "port %d (ret = %d)\n", + hca->name, port, result); + goto event_failed; + } + + result = register_netdev(priv->dev); + if (result) { + printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", + hca->name, port, result); + goto register_failed; + } + /*force lro on the dev->features, because the function + register_netdev disable it according to our private lro*/ + set_lro_features_bit(priv); + + ipoib_create_debug_files(priv->dev); + + result = -ENOMEM; + + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + if (class_device_create_file(&priv->dev->class_dev, + &class_device_attr_create_child)) + goto sysfs_failed; + if (class_device_create_file(&priv->dev->class_dev, + &class_device_attr_delete_child)) + goto sysfs_failed; + if (class_device_create_file(&priv->dev->class_dev, + &class_device_attr_dev_id)) + goto sysfs_failed; + + return priv->dev; + +sysfs_failed: + ipoib_delete_debug_files(priv->dev); + unregister_netdev(priv->dev); + +register_failed: + ib_unregister_event_handler(&priv->event_handler); + /* Stop GC if started before flush */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(ipoib_workqueue); + +event_failed: + ipoib_dev_cleanup(priv->dev); + +device_init_failed: + free_netdev(priv->dev); + +alloc_mem_failed: + return ERR_PTR(result); +} + +static void ipoib_add_one(struct ib_device *device) +{ + struct list_head *dev_list; + struct net_device *dev; + struct ipoib_dev_priv *priv; + int s, e, p; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); + if (!dev_list) + return; + + INIT_LIST_HEAD(dev_list); + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + s = 0; + e = 0; + } else { + s = 1; + e = device->phys_port_cnt; + } + + for (p = s; p <= e; ++p) { + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; + dev = ipoib_add_port("ib%d", device, p); + if (!IS_ERR(dev)) { + priv = netdev_priv(dev); + list_add_tail(&priv->list, dev_list); + } + } + + ib_set_client_data(device, &ipoib_client, dev_list); +} + +static void ipoib_remove_one(struct ib_device *device) +{ + struct ipoib_dev_priv *priv, *tmp; + struct list_head *dev_list; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev_list = ib_get_client_data(device, &ipoib_client); + if (!dev_list) 
+ return; + + list_for_each_entry_safe(priv, tmp, dev_list, list) { + if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) + continue; + + set_bit(IPOIB_FLAG_MODULE_DOWN, &priv->flags); + ib_unregister_event_handler(&priv->event_handler); + + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + rtnl_unlock(); + + /* Stop GC */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); + flush_workqueue(ipoib_workqueue); + flush_workqueue(ipoib_auto_moder_workqueue); + + unregister_netdev(priv->dev); + ipoib_dev_cleanup(priv->dev); + free_netdev(priv->dev); + } + + kfree(dev_list); +} + +static int __init ipoib_init_module(void) +{ + int ret; + + ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); + ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); + + ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); + ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); + ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, + IPOIB_MIN_QUEUE_SIZE)); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); +#endif + + /* + * When copying small received packets, we only copy from the + * linear data part of the SKB, so we rely on this condition. + */ + BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + + ret = ipoib_register_debugfs(); + if (ret) + return ret; + + /* + * We create our own workqueue mainly because we want to be + * able to flush it when devices are being removed. We can't + * use schedule_work()/flush_scheduled_work() because both + * unregister_netdev() and linkwatch_event take the rtnl lock, + * so flush_scheduled_work() can deadlock during device + * removal. + */ + ipoib_workqueue = create_singlethread_workqueue("ipoib"); + if (!ipoib_workqueue) { + ret = -ENOMEM; + goto err_fs; + } + + ipoib_auto_moder_workqueue = + create_singlethread_workqueue("ipoib_auto_moder"); + if (!ipoib_auto_moder_workqueue) { + ret = -ENOMEM; + goto err_am; + } + + + ib_sa_register_client(&ipoib_sa_client); + + ret = ib_register_client(&ipoib_client); + if (ret) + goto err_sa; + + return 0; + +err_sa: + ib_sa_unregister_client(&ipoib_sa_client); + destroy_workqueue(ipoib_auto_moder_workqueue); +err_am: + destroy_workqueue(ipoib_workqueue); + +err_fs: + ipoib_unregister_debugfs(); + + return ret; +} + +static void __exit ipoib_cleanup_module(void) +{ + ib_unregister_client(&ipoib_client); + ib_sa_unregister_client(&ipoib_sa_client); + ipoib_unregister_debugfs(); + destroy_workqueue(ipoib_workqueue); + destroy_workqueue(ipoib_auto_moder_workqueue); +} + +module_init(ipoib_init_module); +module_exit(ipoib_cleanup_module); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_multicast.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_multicast.c new file mode 100644 index 0000000..891abfc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_multicast.c @@ -0,0 +1,1094 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ipoib.h" + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG +static int mcast_debug_level; + +module_param(mcast_debug_level, int, 0644); +MODULE_PARM_DESC(mcast_debug_level, + "Enable multicast debug tracing if > 0"); +#endif + +static DEFINE_MUTEX(mcast_mutex); + +struct ipoib_mcast_iter { + struct net_device *dev; + union ib_gid mgid; + unsigned long created; + unsigned int queuelen; + unsigned int complete; + unsigned int send_only; +}; + +static void ipoib_mcast_free(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + int tx_dropped = 0; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(netdev_priv(dev), + "deleting multicast group " IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + + + /* remove all neigh connected to this mcast */ + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); + + if (mcast->ah) + ipoib_put_ah(mcast->ah); + + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + + netif_tx_lock_bh(dev); + priv->stats.tx_dropped += tx_dropped; + netif_tx_unlock_bh(dev); + + kfree(mcast); +} + +static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, + int can_sleep) +{ + struct ipoib_mcast *mcast; + + mcast = kzalloc(sizeof *mcast, can_sleep ? 
GFP_KERNEL : GFP_ATOMIC); + if (!mcast) + return NULL; + + mcast->dev = dev; + mcast->created = jiffies; + mcast->used = jiffies; + mcast->backoff = 1; + + INIT_LIST_HEAD(&mcast->list); + INIT_LIST_HEAD(&mcast->neigh_list); + skb_queue_head_init(&mcast->pkt_queue); + + return mcast; +} + +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node *n = priv->multicast_tree.rb_node; + + while (n) { + struct ipoib_mcast *mcast; + int ret; + + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + ret = memcmp(mgid, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + return mcast; + } + + return NULL; +} + +static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL; + + while (*n) { + struct ipoib_mcast *tmcast; + int ret; + + pn = *n; + tmcast = rb_entry(pn, struct ipoib_mcast, rb_node); + + ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw, + sizeof (union ib_gid)); + if (ret < 0) + n = &pn->rb_left; + else if (ret > 0) + n = &pn->rb_right; + else + return -EEXIST; + } + + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &priv->multicast_tree); + + return 0; +} + +static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, + struct ib_sa_mcmember_rec *mcmember) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_ah *ah; + int ret; + int set_qkey = 0; + + mcast->mcmember = *mcmember; + + /* Set the cached Q_Key before we attach if it's the broadcast group */ + if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid))) { + spin_lock_irq(&priv->lock); + if (!priv->broadcast) { + spin_unlock_irq(&priv->lock); + return -EAGAIN; + } + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); + spin_unlock_irq(&priv->lock); + priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + set_qkey = 1; + } + + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_warn(priv, "multicast group " IPOIB_GID_FMT + " already attached\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + + return 0; + } + + ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid), + &mcast->mcmember.mgid, set_qkey); + if (ret < 0) { + ipoib_warn(priv, "couldn't attach QP to multicast group " + IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + + clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags); + return ret; + } + } + + { + struct ib_ah_attr av = { + .dlid = be16_to_cpu(mcast->mcmember.mlid), + .port_num = priv->port, + .sl = mcast->mcmember.sl, + .ah_flags = IB_AH_GRH, + .static_rate = mcast->mcmember.rate, + .grh = { + .flow_label = be32_to_cpu(mcast->mcmember.flow_label), + .hop_limit = mcast->mcmember.hop_limit, + .sgid_index = 0, + .traffic_class = mcast->mcmember.traffic_class + } + }; + av.grh.dgid = mcast->mcmember.mgid; + + ah = ipoib_create_ah(dev, priv->pd, &av); + if (!ah) { + ipoib_warn(priv, "ib_address_create failed\n"); + } else { + spin_lock_irq(&priv->lock); + mcast->ah = ah; + spin_unlock_irq(&priv->lock); + + ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT + " AV %p, LID 0x%04x, SL %d\n", + IPOIB_GID_ARG(mcast->mcmember.mgid), + mcast->ah->ah, + be16_to_cpu(mcast->mcmember.mlid), + mcast->mcmember.sl); 
+ } + } + + /* actually send any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); + netif_tx_unlock_bh(dev); + + skb->dev = dev; + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s: dev_queue_xmit failed to " + "requeue packet(ret: %d)\n", __func__, ret); + netif_tx_lock_bh(dev); + } + netif_tx_unlock_bh(dev); + + return 0; +} + +static int +ipoib_mcast_sendonly_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* We trap for port events ourselves. */ + if (status == -ENETRESET) + return 0; + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (status) { + if (mcast->logcount++ < 20) + ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for " + IPOIB_GID_FMT ", status %d\n", + IPOIB_GID_ARG(mcast->mcmember.mgid), status); + + /* Flush out any queued packets */ + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); + + /* Clear the busy flag so we try again */ + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, + &mcast->flags); + } + return status; +} + +static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) +{ + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_mcmember_rec rec = { +#if 0 /* Some SMs don't support send-only yet */ + .join_state = 4 +#else + .join_state = 1 +#endif + }; + ib_sa_comp_mask comp_mask; + int ret = 0; + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); + return -ENODEV; + } + + if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); + return -EBUSY; + } + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (priv->broadcast) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; + rec.sl = priv->broadcast->mcmember.sl; + rec.flow_label = priv->broadcast->mcmember.flow_label; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; + } + + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, + priv->port, &rec, + comp_mask, + GFP_ATOMIC, + ipoib_mcast_sendonly_join_complete, + mcast); + if (IS_ERR(mcast->mc)) { + ret = PTR_ERR(mcast->mc); + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", + ret); + } else { + ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT + ", starting join\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + } + + return ret; +} + +void 
ipoib_mcast_carrier_on_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + carrier_on_task); + struct ib_port_attr attr; + + mutex_lock(&priv->state_lock); + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + ipoib_dbg(priv, "Keeping carrier off - IPOIB_FLAG_ADMIN_UP not set.\n"); + goto out; + } + + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); + goto out; + } + + netif_carrier_on(priv->dev); + + /* enable auto-moderation */ + if (priv->ethtool.use_adaptive_rx_coalesce && + test_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags)) + queue_delayed_work(ipoib_auto_moder_workqueue, + &priv->adaptive_moder_task, + ADAPT_MODERATION_DELAY); + +out: + mutex_unlock(&priv->state_lock); + + +} + +static int ipoib_mcast_join_complete(int status, + struct ib_sa_multicast *multicast) +{ + struct ipoib_mcast *mcast = multicast->context; + struct net_device *dev = mcast->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "join completion for " IPOIB_GID_FMT + " (status %d)\n", + IPOIB_GID_ARG(mcast->mcmember.mgid), status); + + /* We trap for port events ourselves. */ + if (status == -ENETRESET){ + status = 0; + goto out; + } + + if (!status) + status = ipoib_mcast_join_finish(mcast, &multicast->rec); + + if (!status) { + mcast->backoff = 1; + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_join_task, 0); + mutex_unlock(&mcast_mutex); + + /* + * Defer carrier on work to ipoib_workqueue to avoid a + * deadlock on rtnl_lock here. + */ + if (mcast == priv->broadcast) + queue_work(ipoib_workqueue, &priv->carrier_on_task); + + status = 0; + goto out; + } + + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT + ", status %d\n", + IPOIB_GID_ARG(mcast->mcmember.mgid), + status); + } else { + ipoib_warn(priv, "multicast join failed for " + IPOIB_GID_FMT ", status %d\n", + IPOIB_GID_ARG(mcast->mcmember.mgid), + status); + } + } + + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + + /* Clear the busy flag so we try again */ + status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + + mutex_lock(&mcast_mutex); + spin_lock_irq(&priv->lock); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, + mcast->backoff * HZ); + spin_unlock_irq(&priv->lock); + mutex_unlock(&mcast_mutex); +out: + complete(&mcast->done); + return status; +} + +static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, + int create) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_mcmember_rec rec = { + .join_state = 1 + }; + ib_sa_comp_mask comp_mask; + int ret = 0; + + ipoib_dbg_mcast(priv, "joining MGID " IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + + rec.mgid = mcast->mcmember.mgid; + rec.port_gid = priv->local_gid; + rec.pkey = cpu_to_be16(priv->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (create) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + 
IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; + rec.sl = priv->broadcast->mcmember.sl; + rec.flow_label = priv->broadcast->mcmember.flow_label; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; + } + + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + &rec, comp_mask, GFP_KERNEL, + ipoib_mcast_join_complete, mcast); + if (IS_ERR(mcast->mc)) { + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); + ret = PTR_ERR(mcast->mc); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_join_task, + mcast->backoff * HZ); + mutex_unlock(&mcast_mutex); + } +} + +void ipoib_mcast_join_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_join_task.work); + struct net_device *dev = priv->dev; + struct ib_port_attr attr; + + if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) + return; + + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n", + __func__, attr.state); + return; + } + + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL)) + ipoib_warn(priv, "ib_query_gid() failed\n"); + else + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + + { + struct ib_port_attr attr; + + if (!ib_query_port(priv->ca, priv->port, &attr)) + priv->local_lid = attr.lid; + else + ipoib_warn(priv, "ib_query_port failed\n"); + } + + if (!priv->broadcast) { + struct ipoib_mcast *broadcast; + + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + return; + + broadcast = ipoib_mcast_alloc(dev, 1); + if (!broadcast) { + ipoib_warn(priv, "failed to allocate broadcast group\n"); + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, + &priv->mcast_join_task, HZ); + mutex_unlock(&mcast_mutex); + return; + } + + spin_lock_irq(&priv->lock); + memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, + sizeof (union ib_gid)); + priv->broadcast = broadcast; + + __ipoib_mcast_add(dev, priv->broadcast); + spin_unlock_irq(&priv->lock); + } + + if (priv->broadcast && + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + if (priv->broadcast && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) + ipoib_mcast_join(dev, priv->broadcast, 0); + return; + } + + while (1) { + struct ipoib_mcast *mcast = NULL; + + spin_lock_irq(&priv->lock); + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) + && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) + && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + /* Found the next unjoined group */ + break; + } + } + spin_unlock_irq(&priv->lock); + + if (&mcast->list == 
&priv->multicast_list) { + /* All done */ + break; + } + + ipoib_mcast_join(dev, mcast, 1); + return; + } + + spin_lock_irq(&priv->lock); + if (priv->broadcast) + priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + else + priv->mcast_mtu = priv->admin_mtu; + spin_unlock_irq(&priv->lock); + + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); + rtnl_unlock(); + } + + ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); + + clear_bit(IPOIB_MCAST_RUN, &priv->flags); +} + +int ipoib_mcast_start_thread(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "starting multicast thread\n"); + + mutex_lock(&mcast_mutex); + if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, 0); + if (!test_and_set_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 0); + mutex_unlock(&mcast_mutex); + + return 0; +} + +int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + ipoib_dbg_mcast(priv, "stopping multicast thread\n"); + + mutex_lock(&mcast_mutex); + clear_bit(IPOIB_MCAST_RUN, &priv->flags); + clear_bit(IPOIB_MCAST_RUN_GC, &priv->flags); + cancel_delayed_work(&priv->mcast_join_task); + cancel_delayed_work(&priv->mcast_leave_task); + mutex_unlock(&mcast_mutex); + + if (flush) + flush_workqueue(ipoib_workqueue); + + return 0; +} + +static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret = 0; + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ib_sa_free_multicast(mcast->mc); + + if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { + ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + + /* Remove ourselves from the multicast group */ + ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); + if (ret) + ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); + } + + return 0; +} + +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_mcast *mcast; + unsigned long flags; + void *mgid = daddr + 4; + + spin_lock_irqsave(&priv->lock, flags); + + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) || + !priv->broadcast || + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } + + mcast = __ipoib_mcast_find(dev, mgid); + if (!mcast) { + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for " + IPOIB_GID_FMT "\n", IPOIB_GID_RAW_ARG(mgid)); + + mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory for " + "multicast structure\n"); + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto out; + } + + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); + + /* + * Check if user-space already attached to that mcg. + * if yes, marks the mcg as user-space-attached, and when + * the kernel will call ipoib to add it as full memeber + * in set_mc_list callback, ipoib ignores that mcg. 
+ */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags)) { + union ib_gid sa_mgid; + struct ib_sa_mcmember_rec rec; + + memcpy(sa_mgid.raw, mgid, sizeof sa_mgid); + if (!ib_sa_get_mcmember_rec(priv->ca, priv->port, &sa_mgid, &rec)) { + ipoib_dbg_mcast(priv, "Found send-only that already attached" + " by user-space mgid "IPOIB_GID_FMT"\n" ,IPOIB_GID_ARG(sa_mgid)); + set_bit(IPOIB_MCAST_UMCAST_ATTACHED, &mcast->flags); + } + } + + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } + + if (!mcast->ah) { + if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) + skb_queue_tail(&mcast->pkt_queue, skb); + else { + ++priv->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_dbg_mcast(priv, "no address vector, " + "but multicast join already started\n"); + else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_mcast_sendonly_join(mcast); + + /* + * If lookup completes between here and out:, don't + * want to send packet twice. + */ + mcast = NULL; + } + +out: + if (mcast && mcast->ah) { + struct ipoib_neigh *neigh; + + spin_unlock_irqrestore(&priv->lock, flags); + neigh = ipoib_neigh_get(dev, daddr); + spin_lock_irqsave(&priv->lock, flags); + if (!neigh) { + neigh = ipoib_neigh_alloc(daddr, dev); + if (neigh) { + kref_get(&mcast->ah->ref); + neigh->ah = mcast->ah; + list_add_tail(&neigh->list, &mcast->neigh_list); + } + } + spin_unlock_irqrestore(&priv->lock, flags); + mcast->used = jiffies; + ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + if (neigh) + ipoib_neigh_put(neigh); + return; + + } + +unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +void ipoib_mcast_dev_flush(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + LIST_HEAD(remove_list); + struct ipoib_mcast *mcast, *tmcast; + unsigned long flags; + + ipoib_dbg_mcast(priv, "flushing multicast list\n"); + + spin_lock_irqsave(&priv->lock, flags); + + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, &remove_list); + } + + if (priv->broadcast) { + rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree); + list_add_tail(&priv->broadcast->list, &remove_list); + priv->broadcast = NULL; + } + + spin_unlock_irqrestore(&priv->lock, flags); + + /*seperate between the wait to the leav.*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(dev, mcast); + ipoib_mcast_free(mcast); + } +} + +static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen, + const u8 *broadcast) +{ + if (addrlen != INFINIBAND_ALEN) + return 0; + /* reserved QPN, prefix, scope */ + if (memcmp(addr, broadcast, 5)) + return 0; + /* signature lower */ + if (addr[7] != broadcast[7]) + return 0; + return 1; +} + +void ipoib_mcast_restart_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, restart_task); + struct net_device *dev = priv->dev; + struct dev_mc_list *mclist; + struct ipoib_mcast *mcast, *tmcast; + LIST_HEAD(remove_list); + unsigned long flags; + struct ib_sa_mcmember_rec rec; + + ipoib_dbg_mcast(priv, "restarting multicast task\n"); + + ipoib_mcast_stop_thread(dev, 0); + + local_irq_save(flags); + netif_tx_lock(dev); + 
spin_lock(&priv->lock); + + /* + * Unfortunately, the networking core only gives us a list of all of + * the multicast hardware addresses. We need to figure out which ones + * are new and which ones have been removed + */ + + /* Clear out the found flag */ + list_for_each_entry(mcast, &priv->multicast_list, list) + clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + + /* Mark all of the entries that are found or don't exist */ + for (mclist = dev->mc_list; mclist; mclist = mclist->next) { + union ib_gid mgid; + + if (!ipoib_mcast_addr_is_valid(mclist->dmi_addr, + mclist->dmi_addrlen, + dev->broadcast)) + continue; + + memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid); + + /* update scope */ + mgid.raw[1] = 0x10 | (dev->broadcast[5] & 0xF); + /* Add in the P_Key */ + mgid.raw[4] = dev->broadcast[8]; + mgid.raw[5] = dev->broadcast[9]; + mcast = __ipoib_mcast_find(dev, &mgid); + if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + struct ipoib_mcast *nmcast; + + /* ignore group which is directly joined by userspace */ + if ((!mcast && test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) || + (mcast && test_bit(IPOIB_MCAST_UMCAST_ATTACHED, &mcast->flags))) { + ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid " + IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid)); + continue; + } + + /* Not found or send-only group, let's add a new entry */ + ipoib_dbg_mcast(priv, "adding multicast entry for mgid " + IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid)); + + nmcast = ipoib_mcast_alloc(dev, 0); + if (!nmcast) { + ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); + continue; + } + + set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags); + + nmcast->mcmember.mgid = mgid; + + if (mcast) { + /* Destroy the send only entry */ + list_move_tail(&mcast->list, &remove_list); + + rb_replace_node(&mcast->rb_node, + &nmcast->rb_node, + &priv->multicast_tree); + } else + __ipoib_mcast_add(dev, nmcast); + + list_add_tail(&nmcast->list, &priv->multicast_list); + } + + if (mcast) + set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags); + } + + /* Remove all of the entries don't exist anymore */ + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) && + !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + ipoib_dbg_mcast(priv, "deleting multicast group " IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(mcast->mcmember.mgid)); + + rb_erase(&mcast->rb_node, &priv->multicast_tree); + + /* Move to the remove list */ + list_move_tail(&mcast->list, &remove_list); + } + } + + spin_unlock(&priv->lock); + netif_tx_unlock(dev); + local_irq_restore(flags); + + /* We have to cancel outside of the spinlock */ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_free(mcast); + } + + if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + ipoib_mcast_start_thread(dev); +} + +void ipoib_mcast_leave_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_leave_task.work); + struct net_device *dev = priv->dev; + struct ipoib_mcast *mcast, *tmcast; + LIST_HEAD(remove_list); + + if (!test_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) + return; + + if (ipoib_mc_sendonly_timeout > 0) { + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + time_before(mcast->used, jiffies - ipoib_mc_sendonly_timeout * HZ)) { + 
rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_move_tail(&mcast->list, &remove_list); + } + } + + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(dev, mcast); + ipoib_mcast_free(mcast); + } + } + + mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 60 * HZ); + mutex_unlock(&mcast_mutex); +} + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG + +struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) +{ + struct ipoib_mcast_iter *iter; + + iter = kmalloc(sizeof *iter, GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + memset(iter->mgid.raw, 0, 16); + + if (ipoib_mcast_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter) +{ + struct ipoib_dev_priv *priv = netdev_priv(iter->dev); + struct rb_node *n; + struct ipoib_mcast *mcast; + int ret = 1; + + spin_lock_irq(&priv->lock); + + n = rb_first(&priv->multicast_tree); + + while (n) { + mcast = rb_entry(n, struct ipoib_mcast, rb_node); + + if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw, + sizeof (union ib_gid)) < 0) { + iter->mgid = mcast->mcmember.mgid; + iter->created = mcast->created; + iter->queuelen = skb_queue_len(&mcast->pkt_queue); + iter->complete = !!mcast->ah; + iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY)); + + ret = 0; + + break; + } + + n = rb_next(n); + } + + spin_unlock_irq(&priv->lock); + + return ret; +} + +void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, + union ib_gid *mgid, + unsigned long *created, + unsigned int *queuelen, + unsigned int *complete, + unsigned int *send_only) +{ + *mgid = iter->mgid; + *created = iter->created; + *queuelen = iter->queuelen; + *complete = iter->complete; + *send_only = iter->send_only; +} + +#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_verbs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_verbs.c new file mode 100644 index 0000000..2e94e2d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_verbs.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipoib.h" +#include + +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr *qp_attr = NULL; + int ret; + u16 pkey_index; + + if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ret = -ENXIO; + goto out; + } + set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + + if (set_qkey) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto out; + + /* set correct QKey for QP */ + qp_attr->qkey = priv->qkey; + ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); + if (ret) { + ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); + goto out; + } + } + + /* attach QP to multicast group */ + ret = ib_attach_mcast(priv->qp, mgid, mlid); + if (ret) + ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); + +out: + kfree(qp_attr); + return ret; +} + +int ipoib_init_qp(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + struct ib_qp_attr qp_attr; + int attr_mask; + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return -1; + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = 0; + qp_attr.port_num = priv->port; + qp_attr.pkey_index = priv->pkey_index; + attr_mask = + IB_QP_QKEY | + IB_QP_PORT | + IB_QP_PKEY_INDEX | + IB_QP_STATE; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTR; + /* Can't set this in a INIT->RTR transition */ + attr_mask &= ~IB_QP_PORT; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret); + goto out_fail; + } + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret); + goto out_fail; + } + + return 0; + +out_fail: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to RESET state\n"); + + return ret; +} + +int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr init_attr = { + .cap = { + .max_send_wr = ipoib_sendq_size, + .max_recv_wr = ipoib_recvq_size, + .max_send_sge = 1, + .max_recv_sge = IPOIB_UD_RX_SG + }, + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_UD + }; + + int ret, size; + int i; + struct ethtool_coalesce *coal; + + priv->pd = ib_alloc_pd(priv->ca); + if (IS_ERR(priv->pd)) { + printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); + return -ENODEV; + } + + priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(priv->mr)) { + printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); + goto out_free_pd; + } + + size = ipoib_recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) { + size += ipoib_sendq_size; + if (ipoib_cm_has_srq(dev)) + size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ + else + 
size += ipoib_recvq_size * ipoib_max_conn_qp; + } + + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, + priv->child_index % priv->ca->num_comp_vectors); + if (IS_ERR(priv->recv_cq)) { + printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); + goto out_free_mr; + } + + priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, + dev, ipoib_sendq_size, 0); + if (IS_ERR(priv->send_cq)) { + printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); + goto out_free_recv_cq; + } + + if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) + goto out_free_send_cq; + + coal = kzalloc(sizeof *coal, GFP_KERNEL); + if (coal) { + coal->rx_coalesce_usecs = 10; + coal->tx_coalesce_usecs = 10; + coal->rx_max_coalesced_frames = 16; + coal->tx_max_coalesced_frames = 16; + dev->ethtool_ops->set_coalesce(dev, coal); + kfree(coal); + } + + init_attr.send_cq = priv->send_cq; + init_attr.recv_cq = priv->recv_cq; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (dev->features & NETIF_F_SG) + init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; + + priv->qp = ib_create_qp(priv->pd, &init_attr); + if (IS_ERR(priv->qp)) { + printk(KERN_WARNING "%s: failed to create QP\n", ca->name); + goto out_free_send_cq; + } + + priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; + priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; + priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; + + for (i = 0; i < MAX_SKB_FRAGS + 1; ++i) + priv->tx_sge[i].lkey = priv->mr->lkey; + + priv->tx_wr.opcode = IB_WR_SEND; + priv->tx_wr.sg_list = priv->tx_sge; + priv->tx_wr.send_flags = IB_SEND_SIGNALED; + + priv->rx_sge[0].lkey = priv->mr->lkey; + if (ipoib_ud_need_sg(priv->max_ib_mtu)) { + priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; + priv->rx_sge[1].length = PAGE_SIZE; + priv->rx_sge[1].lkey = priv->mr->lkey; + priv->rx_wr.num_sge = IPOIB_UD_RX_SG; + } else { + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + } + priv->rx_wr.next = NULL; + priv->rx_wr.sg_list = priv->rx_sge; + + return 0; + +out_free_send_cq: + ib_destroy_cq(priv->send_cq); + +out_free_recv_cq: + ib_destroy_cq(priv->recv_cq); + +out_free_mr: + ib_dereg_mr(priv->mr); + ipoib_cm_dev_cleanup(dev); + +out_free_pd: + ib_dealloc_pd(priv->pd); + return -ENODEV; +} + +void ipoib_transport_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (priv->qp) { + if (ib_destroy_qp(priv->qp)) + ipoib_warn(priv, "ib_qp_destroy failed\n"); + + priv->qp = NULL; + clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + } + + if (ib_destroy_cq(priv->send_cq)) + ipoib_warn(priv, "ib_cq_destroy (send) failed\n"); + + if (ib_destroy_cq(priv->recv_cq)) + ipoib_warn(priv, "ib_cq_destroy (recv) failed\n"); + + ipoib_cm_dev_cleanup(dev); + + if (ib_dereg_mr(priv->mr)) + ipoib_warn(priv, "ib_dereg_mr failed\n"); + + if (ib_dealloc_pd(priv->pd)) + ipoib_warn(priv, "ib_dealloc_pd failed\n"); +} + +void ipoib_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct ipoib_dev_priv *priv = + container_of(handler, struct ipoib_dev_priv, event_handler); + + if (record->element.port_num != priv->port) + return; + + ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, + record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_SM_CHANGE 
|| + record->event == IB_EVENT_CLIENT_REREGISTER) { + queue_work(ipoib_workqueue, &priv->flush_light); + } else if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_normal); + } else if (record->event == IB_EVENT_PKEY_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_heavy); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_vlan.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_vlan.c new file mode 100644 index 0000000..b605624 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/ipoib_1.5.3/ipoib_vlan.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include + +#include + +#include "ipoib.h" + +#define to_net_dev(class) container_of(class, struct net_device, class_dev) + +static ssize_t show_parent(struct class_device *d, char *buf) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = netdev_priv(dev); + + return sprintf(buf, "%s\n", priv->parent->name); +} +static CLASS_DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); + +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey, + unsigned char child_index) +{ + struct ipoib_dev_priv *ppriv, *priv; + char intf_name[IFNAMSIZ]; + int result; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + while (!rtnl_trylock()) { + if (test_bit(IPOIB_FLAG_MODULE_DOWN, &ppriv->flags)) { + ipoib_dbg(ppriv, "%s: module is going down - nop\n", + __func__); + return -ENODEV; + } + /* enable other tasks to unlock the rtnl */ + msleep(5); + } + mutex_lock(&ppriv->vlan_mutex); + + /* + * First ensure this isn't a duplicate. We check all of the child + * interfaces to make sure the Pkey AND the child index + * don't match. 
+ */ + list_for_each_entry(priv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey && priv->child_index == child_index) { + result = -ENOTUNIQ; + priv = NULL; + goto err; + } + } + + /* + * for the case of non-legacy and same pkey childs we wanted to use + * a notation of ibN.pkey:index and ibN:index but this is problematic + * with tools like ifconfig who treat devices with ":" in their names + * as aliases which are restriced, e.t w.r.t counters, etc + */ + if (ppriv->pkey != pkey && child_index == 0) /* legacy child */ + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + else if (ppriv->pkey != pkey && child_index != 0) /* non-legacy child */ + snprintf(intf_name, sizeof intf_name, "%s.%04x.%d", + ppriv->dev->name, pkey, child_index); + else if (ppriv->pkey == pkey && child_index != 0) /* same pkey child */ + snprintf(intf_name, sizeof intf_name, "%s.%d", + ppriv->dev->name, child_index); + else { + ipoib_warn(ppriv, "wrong pkey/child_index pairing %04x %d\n", + pkey, child_index); + result = -EINVAL; + goto err; + } + + priv = ipoib_intf_alloc(intf_name); + if (!priv) { + result = -ENOMEM; + goto err; + } + + priv->max_ib_mtu = ppriv->max_ib_mtu; + /* MTU will be reset when mcast join happens */ + priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + + result = ipoib_set_dev_features(priv, ppriv->ca); + if (result) + goto err; + + priv->pkey = pkey; + priv->child_index = child_index; + + memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + priv->dev->broadcast[8] = pkey >> 8; + priv->dev->broadcast[9] = pkey & 0xff; + + result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port); + if (result < 0) { + ipoib_warn(ppriv, "failed to initialize subinterface: " + "device %s, port %d", + ppriv->ca->name, ppriv->port); + goto err; + } + + result = register_netdevice(priv->dev); + if (result) { + ipoib_warn(priv, "failed to initialize; error %i", result); + goto register_failed; + } + + priv->parent = ppriv->dev; + + ipoib_create_debug_files(priv->dev); + + rtnl_unlock(); + rtnl_lock(); + + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_umcast_attr(priv->dev)) + goto sysfs_failed; + + if (class_device_create_file(&priv->dev->class_dev, + &class_device_attr_parent)) + goto sysfs_failed; + + list_add_tail(&priv->list, &ppriv->child_intfs); + + mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); + + return 0; + +sysfs_failed: + ipoib_delete_debug_files(priv->dev); + unregister_netdevice(priv->dev); + +register_failed: + ipoib_dev_cleanup(priv->dev); + +err: + mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); + if (priv) + free_netdev(priv->dev); + + return result; +} + +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey, + unsigned char child_index) +{ + struct ipoib_dev_priv *ppriv, *priv, *tpriv; + struct net_device *dev = NULL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + ppriv = netdev_priv(pdev); + while (!rtnl_trylock()) { + if (test_bit(IPOIB_FLAG_MODULE_DOWN, &ppriv->flags)) { + ipoib_dbg(ppriv, "%s: module is going down - nop\n", + __func__); + return -ENODEV; + } + /* enable other tasks to unlock the rtnl */ + msleep(5); + } + + mutex_lock(&ppriv->vlan_mutex); + list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { + if (priv->pkey == pkey && priv->child_index == child_index) { + unregister_netdevice(priv->dev); + 
list_del(&priv->list); + dev = priv->dev; + break; + } + } + mutex_unlock(&ppriv->vlan_mutex); + rtnl_unlock(); + + if (dev) { + ipoib_dev_cleanup(dev); + free_netdev(dev); + return 0; + } + + return -ENODEV; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Kconfig new file mode 100644 index 0000000..3016a0c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_ISER + tristate "iSCSI Extensions for RDMA (iSER)" + depends on SCSI && INET && INFINIBAND_ADDR_TRANS + select SCSI_ISCSI_ATTRS + help + Support for the iSCSI Extensions for RDMA (iSER) Protocol + over InfiniBand. This allows you to access storage devices + that speak iSCSI over iSER over InfiniBand. + + The iSER protocol is defined by IETF. + See + and diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Makefile new file mode 100644 index 0000000..c2cc39a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Retpoline support: check if this is the right architecture and that +# the kernel does not support it already. +# Alternatively, if we are called from the main mlnx-ofa build system, +# CONFIG_RETPOLINE will be set by the configure script, however +# subdir-ccflags-y will be set by the toplevel Makefile. +ifneq (,$(findstring $(ARCH),i386 x86_64)) + ifndef CONFIG_RETPOLINE + ifneq (,$(shell awk 'BEGIN {if ($(VERSION).$(PATCHLEVEL) < 4.15) {print 1}}' /dev/null | head -1) +kconfig_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/kconfig.h 2> /dev/null | head -1) + +ifneq ($(kconfig_h),) +KCONFIG_H = -include $(kconfig_h) +endif + +V ?= 0 + +# GCC earlier than 4.6.0 will build modules which require 'mcount', +# and this symbol will not be available in the kernel if the kernel was +# compiled with GCC 4.6.0 and above. +# therefore, to prevent unknown symbol issues we disable function tracing. +# +CC = $(CROSS_COMPILE)gcc +CPP = $(CC) -E + +CPP_MAJOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f1) +CPP_MINOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f2) +CPP_PATCH := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f3) +# Assumes that major, minor, and patch cannot exceed 999 +CPP_VERS := $(shell expr 0$(CPP_MAJOR) \* 1000000 + 0$(CPP_MINOR) \* 1000 + 0$(CPP_PATCH)) +compile_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/compile.h 2> /dev/null | head -1) +ifneq ($(compile_h),) +KERNEL_GCC_MAJOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f1) +KERNEL_GCC_MINOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f2) +KERNEL_GCC_PATCH := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f3) +KERNEL_GCC_VER := $(shell expr 0$(KERNEL_GCC_MAJOR) \* 1000000 + 0$(KERNEL_GCC_MINOR) \* 1000 + 0$(KERNEL_GCC_PATCH)) +ifneq ($(shell if [ $(CPP_VERS) -lt 4006000 ] && [ $(KERNEL_GCC_VER) -ge 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC newer than 4.6.0, while the current GCC is older than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) 
+override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +ifneq ($(shell if [ $(CPP_VERS) -ge 4006000 ] && [ $(KERNEL_GCC_VER) -lt 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC older than 4.6.0, while the current GCC is newer than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) +override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +endif + +ifneq ($(shell if (echo $(KVER) | grep -qE 'uek'); then \ + echo "YES"; else echo ""; fi),) +override WITH_MAKE_PARAMS += ctf-dir=$(CWD)/.ctf +endif + +name := iser +VERSION=$(shell grep "define _version" $(name).spec | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +RELEASE=$(shell grep "define _release" $(name).spec | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') +PACKAGE := $(name)-$(VERSION) +SHELL = /bin/bash +rpmspec := $(name).spec +rpmroot = $(PWD)/rpm-dist/ +rpmopts = --nodeps --buildroot='$(rpmroot)/_rpm' --define '_source_filedigest_algorithm md5' --define '_binary_filedigest_algorithm md5' +rpmmacros =\ + --define='_topdir $(rpmroot)'\ + --define='_rpmdir $(rpmroot)'\ + --define='_srcrpmdir $(rpmroot)'\ + --define='_sourcedir $(rpmroot)'\ + --define='_specdir $(PWD)' +override WITH_MAKE_PARAMS += KBUILD_EXTRA_SYMBOLS=$(OFA)/Module.symvers + +LINUXINCLUDE=\ + $(EXTRA_CFLAGS) \ + -include $(autoconf_h) \ + $(KCONFIG_H) \ + -include $(OFA)/include/linux/compat-2.6.h \ + -I$(OFA)/include \ + -I$(OFA)/include/uapi \ + $(BACKPORT_INCLUDES) \ + $$(if $$(CONFIG_XEN),-D__XEN_INTERFACE_VERSION__=$$(CONFIG_XEN_INTERFACE_VERSION)) \ + $$(if $$(CONFIG_XEN),-I$$(srctree)/arch/x86/include/mach-xen) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + -Iinclude \ + -I$$(srctree)/arch/$$(SRCARCH)/include/uapi \ + -Iarch/$$(SRCARCH)/include/generated/uapi \ + -I$$(srctree)/include \ + -I$$(srctree)/include/uapi \ + -Iinclude/generated/uapi \ + $$(if $$(KBUILD_SRC),-Iinclude2 -I$$(srctree)/include) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + # + +default: +ifneq ($(shell test -d $(OFA) && echo "true" || echo "" ),) +# compile with ofed driver + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) $(WITH_MAKE_PARAMS) \ + CONFIG_INFINIBAND_ISER=m \ + CONFIG_INFINIBAND_ISER_DUMMY= \ + CONFIG_DTRACE= \ + CONFIG_CTF= \ + LINUXINCLUDE='$(LINUXINCLUDE)' \ + modules +else +# compile with inbox driver + make EXTRA_CFLAGS="$(EXTRA_CFLAGS)" -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) $(WITH_MAKE_PARAMS) \ + modules +endif + +install: + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) $(WITH_MAKE_PARAMS) modules_install + if [ ! -n "$(INSTALL_MOD_PATH)" ]; then /sbin/depmod $(KVER);fi; + +rpmcheck: + @which rpmbuild &> /dev/null; \ + if [ $$? -ne 0 ]; then \ + echo "*** This make target requires an rpm-based linux distribution."; \ + (exit 1); exit 1; \ + fi + -mkdir -p $(rpmroot)/BUILD + +srcrpm: dist rpmcheck $(rpmspec) + -rpmbuild -bs --define 'src_release $(RELEASE)' $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +binrpm: rpmcheck $(rpmspec) + -rpmbuild -bb $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? 
-ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +dist: + mkdir -p $(rpmroot)/$(PACKAGE)/ + cp {$(rpmspec),Kconfig,makefile,Makefile,dkms.conf,Module.supported} $(rpmroot)/$(PACKAGE)/ + cp common.postinst $(rpmroot)/$(PACKAGE)/ + cp *.c $(rpmroot)/$(PACKAGE)/ + cp *.h $(rpmroot)/$(PACKAGE)/ + cp -r debian $(rpmroot)/$(PACKAGE)/ + cp -r tools $(rpmroot)/$(PACKAGE)/ + cd $(rpmroot) && tar czf $(PACKAGE).tgz $(PACKAGE) + cd $(rpmroot) && tar czf $(name)_$(VERSION).orig.tar.gz $(PACKAGE) + +clean: + rm -f *.o + rm -f *.ko *.ko.gz + rm -f *.mod.c + rm -f Module*.symvers modules*.order + +distclean: clean + @rm -rf $(PWD)/rpm-dist + rm -f makefile *.spec + +all: clean distclean dist srcrpm binrpm diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/autogen.sh b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/autogen.sh new file mode 100755 index 0000000..3873a8a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/autogen.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +name=iser +version=$(grep "define _version" ${name}_spec_ | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +release=$(grep "define _release" ${name}_spec_ | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') +/bin/cp -f ${name}_spec_ ${name}.spec +/bin/cp -f _makefile_ makefile +/bin/sed -i -r "s/^$name \(([0-9.-]+)\) (.*)/$name \($version-$release\) \2/" debian/changelog diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/common.postinst b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/common.postinst new file mode 100755 index 0000000..bbf9aad --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/common.postinst @@ -0,0 +1,296 @@ +#!/bin/sh +# Copyright (C) 2002-2005 Flavio Stanchina +# Copyright (C) 2005-2006 Aric Cyr +# Copyright (C) 2007 Mario Limonciello +# Copyright (C) 2009 Alberto Milone + +set -e + +uname_s=$(uname -s) + +_get_kernel_dir() { + KVER=$1 + case ${uname_s} in + Linux) DIR="/lib/modules/$KVER/build" ;; + GNU/kFreeBSD) DIR="/usr/src/kfreebsd-headers-$KVER/sys" ;; + esac + echo $DIR +} + +_check_kernel_dir() { + DIR=$(_get_kernel_dir $1) + case ${uname_s} in + Linux) test -e $DIR/include ;; + GNU/kFreeBSD) test -e $DIR/kern && test -e $DIR/conf/kmod.mk ;; + *) return 1 ;; + esac + return $? +} + +# Check the existence of a kernel named as $1 +_is_kernel_name_correct() { + CORRECT="no" + KERNEL_NAME=$1 + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + if [ "${KERNEL}" = "${KERNEL_NAME}" ]; then + CORRECT="yes" + break + fi + done + + echo $CORRECT +} + + +# Get the most recent kernel on Debian based systems. This keeps +# into account both the version and the ABI. If the current kernel +# is the most recent kernel then the function will print a null string. +_get_newest_kernel_debian() { + NEWEST_KERNEL= + NEWEST_VERSION= + NEWEST_ABI= + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + KERNEL_VERSION=${KERNEL%%-*} + ABI=${KERNEL#*-} + ABI=${ABI%%-*} + + if [ -z "$NEWEST_KERNEL" ]; then + # The 1st time get a version which is bigger than $1 + COMPARE_TO=$1 + else + # Get the biggest version + COMPARE_TO="$NEWEST_VERSION-$NEWEST_ABI" + fi + + # if $kernel is greater than $COMPARE_TO + if [ `dpkg --compare-versions "$KERNEL_VERSION-$ABI" gt "$COMPARE_TO" && echo "yes" || \ + echo "no"` = "yes" ]; then + NEWEST_KERNEL=$KERNEL + NEWEST_VERSION=$KERNEL_VERSION + NEWEST_ABI=$ABI + fi + done + + echo "$NEWEST_KERNEL" +} + +# Get the most recent kernel in Rhel based systems. 
If the current kernel +# is the most recent kernel then the function will print a null string. +_get_newest_kernel_rhel() { + NEWEST_KERNEL= + + LAST_INSTALLED_KERNEL=$(rpm -q --whatprovides kernel --last | grep kernel -m1 | cut -f1 -d' ') + + LIK_FORMATTED_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{VERSION}-%{RELEASE}.%{ARCH}\n") + + if [ `echo $LIK_FORMATTED_NAME | grep 2.6 >/dev/null` ]; then + # Fedora and Suse + NEWEST_KERNEL=$LIK_FORMATTED_NAME + else + # Hack for Mandriva where $LIK_FORMATTED_NAME is broken + LIK_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{NAME}\n") + LIK_TYPE=${LIK_NAME#kernel-} + LIK_TYPE=${LIK_TYPE%%-*} + LIK_STRIPPED=${LIK_NAME#kernel-} + LIK_STRIPPED=${LIK_STRIPPED#$LIK_TYPE-} + LIK_STRIPPED_BASE=${LIK_STRIPPED%%-*} + LIK_STRIPPED_END=${LIK_STRIPPED#$LIK_STRIPPED_BASE-} + LIK_FINAL=$LIK_STRIPPED_BASE-$LIK_TYPE-$LIK_STRIPPED_END + + NEWEST_KERNEL=$LIK_FINAL + fi + + echo $NEWEST_KERNEL +} + +# Get the newest kernel on Debian and Rhel based systems. +get_newest_kernel() { + NEWEST_KERNEL= + # Try Debian first as rpm can be installed in Debian based distros + if [ -e /usr/bin/dpkg ]; then + # If DEB based + CURRENT_KERNEL=$1 + CURRENT_VERSION=${CURRENT_KERNEL%%-*} + CURRENT_ABI=${CURRENT_KERNEL#*-} + CURRENT_FLAVOUR=${CURRENT_ABI#*-} + CURRENT_ABI=${CURRENT_ABI%%-*} + NEWEST_KERNEL=$(_get_newest_kernel_debian "$CURRENT_VERSION-$CURRENT_ABI") + + elif [ `which rpm >/dev/null` ]; then + # If RPM based + NEWEST_KERNEL=$(_get_newest_kernel_rhel) + fi + + # Make sure that kernel name that we extracted corresponds to an installed + # kernel + if [ -n "$NEWEST_KERNEL" ] && [ `_is_kernel_name_correct $NEWEST_KERNEL` = "no" ]; then + NEWEST_KERNEL= + fi + + echo $NEWEST_KERNEL +} + +NAME=$1 +VERSION=$2 +TARBALL_ROOT=$3 +ARCH=$4 +UPGRADE=$5 + +if [ -z "$NAME" ] || [ -z "$VERSION" ]; then + echo "Need NAME, and VERSION defined" + echo "ARCH is optional" + exit 1 +fi + +KERNELS=$(ls /lib/modules/ 2>/dev/null || true) +CURRENT_KERNEL=$(uname -r) + +#We never want to keep an older version side by side to prevent conflicts +if [ -e "/var/lib/dkms/$NAME/$VERSION" ]; then + echo "Removing old $NAME-$VERSION DKMS files..." + dkms remove -m $NAME -v $VERSION --all +fi + +#Load new files, by source package and by tarball +if [ -f "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz" ]; then + if ! dkms ldtarball --archive "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz"; then + echo "" + echo "" + echo "Unable to load DKMS tarball $TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz." + echo "Common causes include: " + echo " - You must be using DKMS 2.1.0.0 or later to support binaries only" + echo " distribution specific archives." + echo " - Corrupt distribution specific archive" + echo "" + echo "" + exit 2 + fi +elif [ -d "/usr/src/$NAME-$VERSION" ]; then + echo "Loading new $NAME-$VERSION DKMS files..." + dkms add -m $NAME -v $VERSION > /dev/null +fi + +# On 1st installation, let us look for a directory +# in /lib/modules which matches `uname -r`. If none +# is found it is possible that buildd is being used +# and that uname -r is giving us the name of the +# kernel used by the buildd machine. +# +# If this is the case we try to build the kernel +# module for each kernel which has a directory in +# /lib/modules. Furthermore we will have to tell +# DKMS which architecture it should build the module +# for (e.g. if the buildd machine is using a +# 2.6.24-23-xen 64bit kernel). 
+# +# NOTE: if the headers are not installed then the +# module won't be built, as usual +if [ -z "$UPGRADE" ]; then + echo "First Installation: checking all kernels..." + for KERNEL in $KERNELS; do + if [ ${KERNEL} = ${CURRENT_KERNEL} ]; then + # Kernel found + KERNELS=$CURRENT_KERNEL + break + fi + done +else + KERNELS=$CURRENT_KERNEL +fi + +# Here we look for the most recent kernel so that we can +# build the module for it (in addition to doing it for the +# current kernel. +NEWEST_KERNEL=$(get_newest_kernel "$KERNELS") + +# If the current kernel doesn't come from the host of a chroot +if [ `_is_kernel_name_correct $CURRENT_KERNEL` = "yes" ]; then + # See if it's worth building the module for both the newest kernel + # and for the current kernel + if [ -n "$NEWEST_KERNEL" ] && [ ${CURRENT_KERNEL} != ${NEWEST_KERNEL} ]; then + echo "Building for $CURRENT_KERNEL and $NEWEST_KERNEL" + KERNELS="$CURRENT_KERNEL $NEWEST_KERNEL" + else + echo "Building only for $CURRENT_KERNEL" + fi +# The current kernel is not useful as it's the host's +else + echo "It is likely that $CURRENT_KERNEL belongs to a chroot's host" + + # Let's use only the newest kernel + if [ -n "$NEWEST_KERNEL" ]; then + KERNELS="$NEWEST_KERNEL" + echo "Building only for $NEWEST_KERNEL" + fi +fi + +if [ -n "$ARCH" ]; then + if which lsb_release >/dev/null && [ $(lsb_release -s -i) = "Ubuntu" ]; then + case $ARCH in + amd64) + ARCH="x86_64" + ;; + lpia|i?86) + ARCH="i686" + ;; + esac + fi + echo "Building for architecture $ARCH" + ARCH="-a $ARCH" +fi + +for KERNEL in $KERNELS; do + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + if [ `echo $KERNEL | grep -c "BOOT"` -gt 0 ]; then + echo "" + echo "Module build and install for $KERNEL was skipped as " + echo "it is a BOOT variant" + continue + fi + + + #if the module isn't yet built, try to build it + if [ `echo $dkms_status | grep -c ": built"` -eq 0 ]; then + if [ ! -L /var/lib/dkms/$NAME/$VERSION/source ]; then + echo "This package appears to be a binaries-only package" + echo " you will not be able to build against kernel $KERNEL" + echo " since the package source was not provided" + continue + fi + if _check_kernel_dir $KERNEL; then + echo "Building initial module for $KERNEL" + set +e + dkms build -m $NAME -v $VERSION -k $KERNEL $ARCH > /dev/null + rc=$? + case $rc in + 9) + set -e + echo "Skipped." + continue + ;; + 0) + set -e + echo "Done." + ;; + *) + exit $rc + ;; + esac + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + else + echo "Module build for the currently running kernel was skipped since the" + echo "kernel source for this kernel does not seem to be installed." + fi + fi + + #if the module is built (either pre-built or just now), install it + if [ `echo $dkms_status | grep -c ": built"` -eq 1 ] && + [ `echo $dkms_status | grep -c ": installed"` -eq 0 ]; then + dkms install -m $NAME -v $VERSION -k $KERNEL $ARCH --force + fi +done + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/changelog b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/changelog new file mode 100644 index 0000000..24088a1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/changelog @@ -0,0 +1,5 @@ +iser (0.0-0) unstable; urgency=low + + * Initial release. 
+ + -- Alaa Hleihel Sun, 16 Feb 2014 17:30:53 +0200 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/compat b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/compat new file mode 100644 index 0000000..45a4fb7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/compat @@ -0,0 +1 @@ +8 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control new file mode 100644 index 0000000..2b1c855 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control @@ -0,0 +1,17 @@ +Source: iser +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, dkms +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: iser-dkms +Section: kernel +Architecture: all +Depends: dkms, make, mlnx-ofed-kernel-dkms, ${misc:Depends} +Recommends: linux-headers-arm64 | linux-headers-powerpc | linux-headers-ppc64 | linux-headers-ppc64le | linux-headers-amd64 | linux-headers | linux-headers-generic +Description: DKMS support fo iser kernel modules + This package provides integration with the DKMS infrastructure for automatically building out of tree kernel modules. + . + This package contains the source to be built with dkms. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control.no_dkms b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control.no_dkms new file mode 100644 index 0000000..217d2c3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/control.no_dkms @@ -0,0 +1,14 @@ +Source: iser +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, make +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: iser-modules +Section: kernel +Architecture: any +Depends: mlnx-ofed-kernel-modules, ${misc:Depends} +Description: iser kernel modules + This package provides the binary code for the iser kernel modules. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/copyright b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/copyright new file mode 100644 index 0000000..53aa878 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/copyright @@ -0,0 +1,19 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ + +Files: * +Copyright: Copyright 2017 Mellanox Technologies +License: GPL-2 + Mellanox OFED (MLNX_OFED) Software distributed under the terms of the GNU General Public License ("GPL") version 2 as published by the Free Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.postinst b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.postinst new file mode 100755 index 0000000..a5e5d39 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.postinst @@ -0,0 +1,43 @@ +#!/bin/sh +set -e + +# Get the package version +NAME=iser +PACKAGE_NAME=$NAME-dkms +CVERSION=`dpkg-query -W -f='${Version}' $PACKAGE_NAME | awk -F "-" '{print $1}' | cut -d\: -f2` +ARCH=`uname -m` + +dkms_configure () { + POSTINST="/usr/src/$NAME-$CVERSION/common.postinst" + if [ -f "$POSTINST" ]; then + "$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2" + return $? + fi + echo "WARNING: $POSTINST does not exist." >&2 + echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2 + echo "built with legacy DKMS support." >&2 + echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2 + echo "support or upgrade DKMS to a more current version." >&2 + return 1 +} + +case "$1" in + configure) + dkms_configure + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.prerm b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.prerm new file mode 100755 index 0000000..c520442 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/iser-dkms.prerm @@ -0,0 +1,13 @@ +#!/bin/sh +set -e + +# Get the package version +package=iser +version=`dpkg-query -W -f='${Version}' "$package-dkms" \ + | sed -e 's/[+-].*//'` + +dkms remove -m "$package" -v "$version" --all || true + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/rules b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/rules new file mode 100755 index 0000000..28a4ca5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/rules @@ -0,0 +1,109 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. +# +# This version is for a hypothetical package that can build a kernel modules +# architecture-dependant package via make-kpkg, as well as an +# architecture-independent module source package, and other packages +# either dep/indep for things like common files or userspace components +# needed for the kernel modules. + +# Uncomment this to turn on verbose mode. 
+#export DH_VERBOSE=1 + +WITH_DKMS ?= 1 +WITH_MOD_SIGN ?= 0 +MLXNUMC = $(shell grep ^processor /proc/cpuinfo | wc -l) +NJOBS ?= $(shell if [ $(MLXNUMC) -lt 16 ]; then echo $(MLXNUMC); else echo 16; fi) + +pname:=iser +psource:=$(pname)-source +ifeq ($(WITH_DKMS),1) +pdkms:=$(pname)-dkms +else +pdkms:=$(pname)-modules +endif + +pversion := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-.\+/\1/p') +prel := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-\(.\+\)/\2/p') + +export INSTALL_MOD_DIR:=updates +export INSTALL_MOD_PATH:=$(CURDIR)/debian/$(pdkms) + +DIST_NAME := $(shell lsb_release -si) +DIST_RELEASE := $(DIST_NAME)/$(shell lsb_release -sc) + + +KVER ?= $(shell uname -r) +KVER1 = $(shell echo $(KVER) | sed -e 's/_/-/g') +K_BUILD ?= "/lib/modules/$(KVER)/build" + +%: +ifeq ($(WITH_DKMS),1) + dh $@ --with dkms +else + dh $@ +endif + +override_dh_auto_clean: + +override_dh_auto_configure: + +override_dh_auto_build: +ifneq ($(WITH_DKMS),1) + @echo Building for $(KVER) + make clean || true + make -j$(NJOBS) KVER=$(KVER) K_BUILD=$(K_BUILD) +endif + +override_dh_auto_test: + +override_dh_auto_install: +ifneq ($(WITH_DKMS),1) + make install INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) KERNELRELEASE=$(KVER) KVER=$(KVER) K_BUILD=$(K_BUILD) + find $(INSTALL_MOD_PATH) \( -type f -a -name "modules.*" \) -delete +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif + + # For dkms +ifeq ($(WITH_DKMS),1) + dh_installdirs -p$(pdkms) usr/src/$(pname)-$(pversion) + cp Kconfig debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp makefile debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp Makefile debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp dkms.conf debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp common.postinst debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp *.c debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp *.h debian/$(pdkms)/usr/src/$(pname)-$(pversion)/ + + # Force DKMS to install our modules. + # This is mostly needed for modules that do not have a version number info, as DKMS + # will compare their srcversion field, which does not really say which module is newer. 
+ dh_installdirs -p$(pdkms) usr/share/dkms/modules_to_force_install/ + echo "$(pname)" > debian/$(pdkms)/usr/share/dkms/modules_to_force_install/$(pname).force +endif + +override_dh_installinit: + + +ifneq ($(WITH_DKMS),1) +override_dh_gencontrol: + dh_gencontrol -- -v$(pversion)-$(prel).kver.$(KVER1) +endif + +ifneq ($(MLNX_KO_NO_STRIP),1) +ifneq ($(WITH_DKMS),1) +override_dh_strip: + dh_strip + find debian -name '*.ko' | xargs strip -g +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif +endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/source/format b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/dkms.conf b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/dkms.conf new file mode 100644 index 0000000..1c6301c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/dkms.conf @@ -0,0 +1,19 @@ +# DKMS module name and version +PACKAGE_NAME="iser" +PACKAGE_VERSION="4.0" + +kernelver=${kernelver:-$(uname -r)} +kernel_source_dir=${kernel_source_dir:-/lib/modules/$kernelver/build} + +# Module name, source and destination directories, and build command-line +BUILT_MODULE_NAME[0]="ib_iser" +BUILT_MODULE_LOCATION[0]="./" +DEST_MODULE_LOCATION[0]="/kernel/../updates/" +MAKE="make -j`MLXNUMC=$(grep ^processor /proc/cpuinfo | wc -l) && echo $(($MLXNUMC<16?$MLXNUMC:16))` KVER=$kernelver K_BUILD=$kernel_source_dir" + +# Cleanup command-line +CLEAN="make clean" + +# disable autoinstall since this module depends on mlnx-ofed-kernel-dkms +# mlnx-ofed-kernel-dkms will build this module on POST_INSTALL +AUTOINSTALL= diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/ib_iser_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/ib_iser_dummy.c new file mode 100644 index 0000000..35f07e7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/ib_iser_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "ib_iser" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_iser dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init iser_init(void) +{ + return 0; +} + +static void __exit iser_cleanup(void) +{ +} + +module_init(iser_init); +module_exit(iser_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.c new file mode 100644 index 0000000..07e4702 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -0,0 +1,1086 @@ +/* + * iSCSI Initiator over iSER Data-Path + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * maintained by openib-general@openib.org + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Credits: + * Christoph Hellwig + * FUJITA Tomonori + * Arne Redlich + * Zhenyu Wang + * Modified by: + * Erez Zilber + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); + +static struct scsi_host_template iscsi_iser_sht; +static struct iscsi_transport iscsi_iser_transport; +static struct scsi_transport_template *iscsi_iser_scsi_transport; +static struct workqueue_struct *release_wq; +static DEFINE_MUTEX(unbind_iser_conn_mutex); +struct iser_global ig; + +int iser_debug_level = 0; +module_param_named(debug_level, iser_debug_level, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); + +static int iscsi_iser_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops iscsi_iser_size_ops = { + .set = iscsi_iser_set, + .get = param_get_uint, +}; + +static unsigned int iscsi_max_lun = 512; +module_param_cb(max_lun, &iscsi_iser_size_ops, &iscsi_max_lun, S_IRUGO); +MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session, should > 0 (default:512)"); + +unsigned int iser_max_sectors = ISER_DEF_MAX_SECTORS; +module_param_cb(max_sectors, &iscsi_iser_size_ops, &iser_max_sectors, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command, should > 0 (default:1024)"); + +bool iser_always_reg = true; +module_param_named(always_register, iser_always_reg, bool, S_IRUGO); +MODULE_PARM_DESC(always_register, + "Always register memory, even for continuous memory regions (default:true)"); + +bool iser_pi_enable = false; +module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO); +MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); + +static int iscsi_iser_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned int n = 0; + + ret = kstrtouint(val, 10, &n); + if (ret != 0 || n == 0) + return -EINVAL; + + return param_set_uint(val, kp); +} + +/* + * iscsi_iser_recv() - Process a successful recv completion + * @conn: iscsi connection + * @hdr: iscsi header + * @rx_data: buffer containing receive data payload + * @rx_data_len: length of rx_data + * + * Notes: In case of data length errors or iscsi PDU completion failures + * this routine will signal iscsi layer of connection failure. + */ +void iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr, + char *rx_data, int rx_data_len) +{ + int rc = 0; + int datalen; + + /* verify PDU length */ + datalen = ntoh24(hdr->dlength); + if (datalen > rx_data_len || (datalen + 4) < rx_data_len) { + iser_err("wrong datalen %d (hdr), %d (IB)\n", + datalen, rx_data_len); + rc = ISCSI_ERR_DATALEN; + goto error; + } + + if (datalen != rx_data_len) + iser_dbg("aligned datalen (%d) hdr, %d (IB)\n", + datalen, rx_data_len); + + rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len); + if (rc && rc != ISCSI_ERR_NO_SCSI_CMD) + goto error; + + return; +error: + iscsi_conn_failure(conn, rc); +} + +/** + * iscsi_iser_pdu_alloc() - allocate an iscsi-iser PDU + * @task: iscsi task + * @opcode: iscsi command opcode + * + * Netes: This routine can't fail, just assign iscsi task + * hdr and max hdr size. 
+ */ +static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + + task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header; + task->hdr_max = sizeof(iser_task->desc.iscsi_header); + + return 0; +} + +/** + * iser_initialize_task_headers() - Initialize task headers + * @task: iscsi task + * @tx_desc: iser tx descriptor + * + * Notes: + * This routine may race with iser teardown flow for scsi + * error handling TMFs. So for TMF we should acquire the + * state mutex to avoid dereferencing the IB device which + * may have already been terminated. + */ +int iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc) +{ + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; + struct iscsi_iser_task *iser_task = task->dd_data; + u64 dma_addr; + + if (unlikely(iser_conn->state != ISER_CONN_UP)) + return -ENODEV; + + dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + return -ENOMEM; + + tx_desc->inv_wr.next = NULL; + tx_desc->reg_wr.wr.next = NULL; + tx_desc->mapped = true; + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; + + iser_task->iser_conn = iser_conn; + + return 0; +} + +/** + * iscsi_iser_task_init() - Initialize iscsi-iser task + * @task: iscsi task + * + * Initialize the task for the scsi command or mgmt command. + * + * Return: Returns zero on success or -ENOMEM when failing + * to init task headers (dma mapping error). + */ +static int iscsi_iser_task_init(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + int ret; + + ret = iser_initialize_task_headers(task, &iser_task->desc); + if (ret) { + iser_err("Failed to init task %p, err = %d\n", + iser_task, ret); + return ret; + } + + /* mgmt task */ + if (!task->sc) + return 0; + + iser_task->command_sent = 0; + iser_task_rdma_init(iser_task); + iser_task->sc = task->sc; + + return 0; +} + +/** + * iscsi_iser_mtask_xmit() - xmit management (immediate) task + * @conn: iscsi connection + * @task: task management task + * + * Notes: + * The function can return -EAGAIN in which case caller must + * call it again later, or recover. '0' return code means successful + * xmit. + * + **/ +static int iscsi_iser_mtask_xmit(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + int error = 0; + + iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt); + + error = iser_send_control(conn, task); + + /* since iser xmits control with zero copy, tasks can not be recycled + * right after sending them. 
+ * The recycling scheme is based on whether a response is expected + * - if yes, the task is recycled at iscsi_complete_pdu + * - if no, the task is recycled at iser_snd_completion + */ + return error; +} + +static int iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn, + struct iscsi_task *task) +{ + struct iscsi_r2t_info *r2t = &task->unsol_r2t; + struct iscsi_data hdr; + int error = 0; + + /* Send data-out PDUs while there's still unsolicited data to send */ + while (iscsi_task_has_unsol_data(task)) { + iscsi_prep_data_out_pdu(task, r2t, &hdr); + iser_dbg("Sending data-out: itt 0x%x, data count %d\n", + hdr.itt, r2t->data_count); + + /* the buffer description has been passed with the command */ + /* Send the command */ + error = iser_send_data_out(conn, task, &hdr); + if (error) { + r2t->datasn--; + goto iscsi_iser_task_xmit_unsol_data_exit; + } + r2t->sent += r2t->data_count; + iser_dbg("Need to send %d more as data-out PDUs\n", + r2t->data_length - r2t->sent); + } + +iscsi_iser_task_xmit_unsol_data_exit: + return error; +} + +/** + * iscsi_iser_task_xmit() - xmit iscsi-iser task + * @task: iscsi task + * + * Return: zero on success or escalates $error on failure. + */ +static int iscsi_iser_task_xmit(struct iscsi_task *task) +{ + struct iscsi_conn *conn = task->conn; + struct iscsi_iser_task *iser_task = task->dd_data; + int error = 0; + + if (!task->sc) + return iscsi_iser_mtask_xmit(conn, task); + + if (task->sc->sc_data_direction == DMA_TO_DEVICE) { + BUG_ON(scsi_bufflen(task->sc) == 0); + + iser_dbg("cmd [itt %x total %d imm %d unsol_data %d\n", + task->itt, scsi_bufflen(task->sc), + task->imm_count, task->unsol_r2t.data_length); + } + + iser_dbg("ctask xmit [cid %d itt 0x%x]\n", + conn->id, task->itt); + + /* Send the cmd PDU */ + if (!iser_task->command_sent) { + error = iser_send_command(conn, task); + if (error) + goto iscsi_iser_task_xmit_exit; + iser_task->command_sent = 1; + } + + /* Send unsolicited data-out PDU(s) if necessary */ + if (iscsi_task_has_unsol_data(task)) + error = iscsi_iser_task_xmit_unsol_data(conn, task); + + iscsi_iser_task_xmit_exit: + return error; +} + +/** + * iscsi_iser_cleanup_task() - cleanup an iscsi-iser task + * @task: iscsi task + * + * Notes: In case the RDMA device is already NULL (might have + * been removed in DEVICE_REMOVAL CM event it will bail-out + * without doing dma unmapping. + */ +static void iscsi_iser_cleanup_task(struct iscsi_task *task) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct iser_conn *iser_conn = task->conn->dd_data; + struct iser_device *device = iser_conn->ib_conn.device; + + /* DEVICE_REMOVAL event might have already released the device */ + if (!device) + return; + + if (likely(tx_desc->mapped)) { + ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->mapped = false; + } + + /* mgmt tasks do not need special cleanup */ + if (!task->sc) + return; + + if (iser_task->status == ISER_TASK_STATUS_STARTED) { + iser_task->status = ISER_TASK_STATUS_COMPLETED; + iser_task_rdma_finalize(iser_task); + } +} + +/** + * iscsi_iser_check_protection() - check protection information status of task. 
+ * @task: iscsi task + * @sector: error sector if exsists (output) + * + * Return: zero if no data-integrity errors have occured + * 0x1: data-integrity error occured in the guard-block + * 0x2: data-integrity error occured in the reference tag + * 0x3: data-integrity error occured in the application tag + * + * In addition the error sector is marked. + */ +static u8 iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + enum iser_data_dir dir = iser_task->dir[ISER_DIR_IN] ? + ISER_DIR_IN : ISER_DIR_OUT; + + return iser_check_task_pi_status(iser_task, dir, sector); +} + +/** + * iscsi_iser_conn_create() - create a new iscsi-iser connection + * @cls_session: iscsi class connection + * @conn_idx: connection index within the session (for MCS) + * + * Return: iscsi_cls_conn when iscsi_conn_setup succeeds or NULL + * otherwise. + */ +static struct iscsi_cls_conn * +iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, + uint32_t conn_idx) +{ + struct iscsi_conn *conn; + struct iscsi_cls_conn *cls_conn; + + cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx); + if (!cls_conn) + return NULL; + conn = cls_conn->dd_data; + + /* + * due to issues with the login code re iser sematics + * this not set in iscsi_conn_setup - FIXME + */ + conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; + + return cls_conn; +} + +/** + * iscsi_iser_conn_bind() - bind iscsi and iser connection structures + * @cls_session: iscsi class session + * @cls_conn: iscsi class connection + * @transport_eph: transport end-point handle + * @is_leading: indicate if this is the session leading connection (MCS) + * + * Return: zero on success, $error if iscsi_conn_bind fails and + * -EINVAL in case end-point doesn't exsits anymore or iser connection + * state is not UP (teardown already started). + */ +static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, + uint64_t transport_eph, int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iser_conn *iser_conn; + struct iscsi_endpoint *ep; + int error; + + error = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (error) + return error; + + /* the transport ep handle comes from user space so it must be + * verified against the global ib connections list */ + ep = iscsi_lookup_endpoint(transport_eph); + if (!ep) { + iser_err("can't bind eph %llx\n", + (unsigned long long)transport_eph); + return -EINVAL; + } + iser_conn = ep->dd_data; + + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_UP) { + error = -EINVAL; + iser_err("iser_conn %p state is %d, teardown started\n", + iser_conn, iser_conn->state); + goto out; + } + + error = iser_alloc_rx_descriptors(iser_conn, conn->session); + if (error) + goto out; + + /* binds the iSER connection retrieved from the previously + * connected ep_handle to the iSCSI layer connection. exchanges + * connection pointers */ + iser_info("binding iscsi conn %p to iser_conn %p\n", conn, iser_conn); + + conn->dd_data = iser_conn; + iser_conn->iscsi_conn = conn; + +out: + iscsi_put_endpoint(ep); + mutex_unlock(&iser_conn->state_mutex); + return error; +} + +/** + * iscsi_iser_conn_start() - start iscsi-iser connection + * @cls_conn: iscsi class connection + * + * Notes: Here iser intialize (or re-initialize) stop_completion as + * from this point iscsi must call conn_stop in session/connection + * teardown so iser transport must wait for it. 
+ */ +static int iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *iscsi_conn; + struct iser_conn *iser_conn; + + iscsi_conn = cls_conn->dd_data; + iser_conn = iscsi_conn->dd_data; + reinit_completion(&iser_conn->stop_completion); + + return iscsi_conn_start(cls_conn); +} + +/** + * iscsi_iser_conn_stop() - stop iscsi-iser connection + * @cls_conn: iscsi class connection + * @flag: indicate if recover or terminate (passed as is) + * + * Notes: Calling iscsi_conn_stop might theoretically race with + * DEVICE_REMOVAL event and dereference a previously freed RDMA device + * handle, so we call it under iser the state lock to protect against + * this kind of race. + */ +static void iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iser_conn *iser_conn = conn->dd_data; + + iser_info("stopping iscsi_conn: %p, iser_conn: %p\n", conn, iser_conn); + + /* + * Userspace may have goofed up and not bound the connection or + * might have only partially setup the connection. + */ + if (iser_conn) { + mutex_lock(&iser_conn->state_mutex); + mutex_lock(&unbind_iser_conn_mutex); + iser_conn_terminate(iser_conn); + iscsi_conn_stop(cls_conn, flag); + + /* unbind */ + iser_conn->iscsi_conn = NULL; + conn->dd_data = NULL; + mutex_unlock(&unbind_iser_conn_mutex); + + complete(&iser_conn->stop_completion); + mutex_unlock(&iser_conn->state_mutex); + } else { + iscsi_conn_stop(cls_conn, flag); + } +} + +/** + * iscsi_iser_session_destroy() - destroy iscsi-iser session + * @cls_session: iscsi class session + * + * Removes and free iscsi host. + */ +static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + + iscsi_session_teardown(cls_session); + iscsi_host_remove(shost); + iscsi_host_free(shost); +} + +static inline unsigned int iser_dif_prot_caps(int prot_caps) +{ + int ret = 0; + + if (prot_caps & IB_PROT_T10DIF_TYPE_1) + ret |= SHOST_DIF_TYPE1_PROTECTION | + SHOST_DIX_TYPE0_PROTECTION | + SHOST_DIX_TYPE1_PROTECTION; + if (prot_caps & IB_PROT_T10DIF_TYPE_2) + ret |= SHOST_DIF_TYPE2_PROTECTION | + SHOST_DIX_TYPE2_PROTECTION; + if (prot_caps & IB_PROT_T10DIF_TYPE_3) + ret |= SHOST_DIF_TYPE3_PROTECTION | + SHOST_DIX_TYPE3_PROTECTION; + + return ret; +} + +/** + * iscsi_iser_session_create() - create an iscsi-iser session + * @ep: iscsi end-point handle + * @cmds_max: maximum commands in this session + * @qdepth: session command queue depth + * @initial_cmdsn: initiator command sequnce number + * + * Allocates and adds a scsi host, expose DIF supprot if + * exists, and sets up an iscsi session. 
+ */ +static struct iscsi_cls_session * +iscsi_iser_session_create(struct iscsi_endpoint *ep, + uint16_t cmds_max, uint16_t qdepth, + uint32_t initial_cmdsn) +{ + struct iscsi_cls_session *cls_session; + struct Scsi_Host *shost; + struct iser_conn *iser_conn = NULL; + struct ib_conn *ib_conn; + struct ib_device *ib_dev; + u32 max_fr_sectors; + + shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); + if (!shost) + return NULL; + shost->transportt = iscsi_iser_scsi_transport; + shost->cmd_per_lun = qdepth; + shost->max_lun = iscsi_max_lun; + shost->max_id = 0; + shost->max_channel = 0; + shost->max_cmd_len = 16; + + /* + * older userspace tools (before 2.0-870) did not pass us + * the leading conn's ep so this will be NULL; + */ + if (ep) { + iser_conn = ep->dd_data; + shost->sg_tablesize = iser_conn->scsi_sg_tablesize; + shost->can_queue = min_t(u16, cmds_max, iser_conn->max_cmds); + + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state != ISER_CONN_UP) { + iser_err("iser conn %p already started teardown\n", + iser_conn); + mutex_unlock(&iser_conn->state_mutex); + goto free_host; + } + + ib_conn = &iser_conn->ib_conn; + ib_dev = ib_conn->device->ib_device; + if (ib_conn->pi_support) { + u32 sig_caps = ib_dev->attrs.sig_prot_cap; + + shost->sg_prot_tablesize = shost->sg_tablesize; + scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); + scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP | + SHOST_DIX_GUARD_CRC); + } + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + shost->virt_boundary_mask = SZ_4K - 1; + + if (iscsi_host_add(shost, ib_dev->dev.parent)) { + mutex_unlock(&iser_conn->state_mutex); + goto free_host; + } + mutex_unlock(&iser_conn->state_mutex); + } else { + shost->can_queue = min_t(u16, cmds_max, ISER_DEF_XMIT_CMDS_MAX); + if (iscsi_host_add(shost, NULL)) + goto free_host; + } + + max_fr_sectors = (shost->sg_tablesize * PAGE_SIZE) >> 9; + shost->max_sectors = min(iser_max_sectors, max_fr_sectors); + + iser_dbg("iser_conn %p, sg_tablesize %u, max_sectors %u\n", + iser_conn, shost->sg_tablesize, + shost->max_sectors); + + if (shost->max_sectors < iser_max_sectors) + iser_warn("max_sectors was reduced from %u to %u\n", + iser_max_sectors, shost->max_sectors); + + cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, + shost->can_queue, 0, + sizeof(struct iscsi_iser_task), + initial_cmdsn, 0); + if (!cls_session) + goto remove_host; + + return cls_session; + +remove_host: + iscsi_host_remove(shost); +free_host: + iscsi_host_free(shost); + return NULL; +} + +static int iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, char *buf, int buflen) +{ + int value; + + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + /* TBD */ + break; + case ISCSI_PARAM_HDRDGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_DATADGST_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_IFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("IFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_OFMARKER_EN: + sscanf(buf, "%d", &value); + if (value) { + iser_err("OFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + default: + return iscsi_set_param(cls_conn, param, buf, buflen); + } + + return 0; +} + +/** + * iscsi_iser_conn_get_stats() - get iscsi connection statistics + * 
@cls_conn: iscsi class connection + * @stats: iscsi stats to output + * + * Output connection statistics. + */ +static void iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, + struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */ + stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->custom_length = 0; +} + +static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, + enum iscsi_param param, char *buf) +{ + struct iser_conn *iser_conn = ep->dd_data; + + switch (param) { + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_CONN_ADDRESS: + if (!iser_conn || !iser_conn->ib_conn.cma_id) + return -ENOTCONN; + + return iscsi_conn_get_addr_param((struct sockaddr_storage *) + &iser_conn->ib_conn.cma_id->route.addr.dst_addr, + param, buf); + default: + break; + } + return -ENOSYS; +} + +/** + * iscsi_iser_ep_connect() - Initiate iSER connection establishment + * @shost: scsi_host + * @dst_addr: destination address + * @non_blocking: indicate if routine can block + * + * Allocate an iscsi endpoint, an iser_conn structure and bind them. + * After that start RDMA connection establishment via rdma_cm. We + * don't allocate iser_conn embedded in iscsi_endpoint since in teardown + * the endpoint will be destroyed at ep_disconnect while iser_conn will + * clean up its resources asynchronously. + * + * Return: iscsi_endpoint created by iscsi layer or ERR_PTR(error) + * if it fails. + */ +static struct iscsi_endpoint *iscsi_iser_ep_connect(struct Scsi_Host *shost, + struct sockaddr *dst_addr, + int non_blocking) +{ + int err; + struct iser_conn *iser_conn; + struct iscsi_endpoint *ep; + + ep = iscsi_create_endpoint(0); + if (!ep) + return ERR_PTR(-ENOMEM); + + iser_conn = kzalloc(sizeof(*iser_conn), GFP_KERNEL); + if (!iser_conn) { + err = -ENOMEM; + goto failure; + } + + ep->dd_data = iser_conn; + iser_conn->ep = ep; + iser_conn_init(iser_conn); + + err = iser_connect(iser_conn, NULL, dst_addr, non_blocking); + if (err) + goto failure; + + return ep; +failure: + iscsi_destroy_endpoint(ep); + return ERR_PTR(err); +} + +/** + * iscsi_iser_ep_poll() - poll for iser connection establishment to complete + * @ep: iscsi endpoint (created at ep_connect) + * @timeout_ms: polling timeout allowed in ms. + * + * This routine boils down to waiting for up_completion signaling + * that cma_id got CONNECTED event. + * + * Return: 1 if succeeded in connection establishment, 0 if timeout expired + * (libiscsi retry will kick in) or -1 if interrupted by signal + * or more likely iser connection state transitioned to TERMINATING or + * DOWN during the wait period.
+ */ +static int iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) +{ + struct iser_conn *iser_conn = ep->dd_data; + int rc; + + rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion, + msecs_to_jiffies(timeout_ms)); + /* if conn establishment failed, return error code to iscsi */ + if (rc == 0) { + mutex_lock(&iser_conn->state_mutex); + if (iser_conn->state == ISER_CONN_TERMINATING || + iser_conn->state == ISER_CONN_DOWN) + rc = -1; + mutex_unlock(&iser_conn->state_mutex); + } + + iser_info("iser conn %p rc = %d\n", iser_conn, rc); + + if (rc > 0) + return 1; /* success, this is the equivalent of EPOLLOUT */ + else if (!rc) + return 0; /* timeout */ + else + return rc; /* signal */ +} + +/** + * iscsi_iser_ep_disconnect() - Initiate connection teardown process + * @ep: iscsi endpoint handle + * + * This routine is not blocked by iser and RDMA termination process + * completion as we queue a deffered work for iser/RDMA destruction + * and cleanup or actually call it immediately in case we didn't pass + * iscsi conn bind/start stage, thus it is safe. + */ +static void iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) +{ + struct iser_conn *iser_conn = ep->dd_data; + + iser_info("ep %p iser conn %p\n", ep, iser_conn); + + mutex_lock(&iser_conn->state_mutex); + iser_conn_terminate(iser_conn); + + /* + * if iser_conn and iscsi_conn are bound, we must wait for + * iscsi_conn_stop and flush errors completion before freeing + * the iser resources. Otherwise we are safe to free resources + * immediately. + */ + if (iser_conn->iscsi_conn) { + INIT_WORK(&iser_conn->release_work, iser_release_work); + queue_work(release_wq, &iser_conn->release_work); + mutex_unlock(&iser_conn->state_mutex); + } else { + iser_conn->state = ISER_CONN_DOWN; + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); + } + + iscsi_destroy_endpoint(ep); +} + +static umode_t iser_attr_is_visible(int param_type, int param) +{ + switch (param_type) { + case ISCSI_HOST_PARAM: + switch (param) { + case ISCSI_HOST_PARAM_NETDEV_NAME: + case ISCSI_HOST_PARAM_HWADDRESS: + case ISCSI_HOST_PARAM_INITIATOR_NAME: + return S_IRUGO; + default: + return 0; + } + case ISCSI_PARAM: + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + case ISCSI_PARAM_HDRDGST_EN: + case ISCSI_PARAM_DATADGST_EN: + case ISCSI_PARAM_CONN_ADDRESS: + case ISCSI_PARAM_CONN_PORT: + case ISCSI_PARAM_EXP_STATSN: + case ISCSI_PARAM_PERSISTENT_ADDRESS: + case ISCSI_PARAM_PERSISTENT_PORT: + case ISCSI_PARAM_PING_TMO: + case ISCSI_PARAM_RECV_TMO: + case ISCSI_PARAM_INITIAL_R2T_EN: + case ISCSI_PARAM_MAX_R2T: + case ISCSI_PARAM_IMM_DATA_EN: + case ISCSI_PARAM_FIRST_BURST: + case ISCSI_PARAM_MAX_BURST: + case ISCSI_PARAM_PDU_INORDER_EN: + case ISCSI_PARAM_DATASEQ_INORDER_EN: + case ISCSI_PARAM_TARGET_NAME: + case ISCSI_PARAM_TPGT: + case ISCSI_PARAM_USERNAME: + case ISCSI_PARAM_PASSWORD: + case ISCSI_PARAM_USERNAME_IN: + case ISCSI_PARAM_PASSWORD_IN: + case ISCSI_PARAM_FAST_ABORT: + case ISCSI_PARAM_ABORT_TMO: + case ISCSI_PARAM_LU_RESET_TMO: + case ISCSI_PARAM_TGT_RESET_TMO: + case ISCSI_PARAM_IFACE_NAME: + case ISCSI_PARAM_INITIATOR_NAME: + case ISCSI_PARAM_DISCOVERY_SESS: + return S_IRUGO; + default: + return 0; + } + } + + return 0; +} + +static struct scsi_host_template iscsi_iser_sht = { + .module = THIS_MODULE, + .name = "iSCSI Initiator over iSER", + .queuecommand = iscsi_queuecommand, + .change_queue_depth = scsi_change_queue_depth, + .sg_tablesize = 
ISCSI_ISER_DEF_SG_TABLESIZE, + .cmd_per_lun = ISER_DEF_CMD_PER_LUN, + .eh_timed_out = iscsi_eh_cmd_timed_out, + .eh_abort_handler = iscsi_eh_abort, + .eh_device_reset_handler= iscsi_eh_device_reset, + .eh_target_reset_handler = iscsi_eh_recover_target, + .target_alloc = iscsi_target_alloc, + .proc_name = "iscsi_iser", + .this_id = -1, + .track_queue_depth = 1, +}; + +static struct iscsi_transport iscsi_iser_transport = { + .owner = THIS_MODULE, + .name = "iser", + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO, + /* session management */ + .create_session = iscsi_iser_session_create, + .destroy_session = iscsi_iser_session_destroy, + /* connection management */ + .create_conn = iscsi_iser_conn_create, + .bind_conn = iscsi_iser_conn_bind, + .unbind_conn = iscsi_conn_unbind, + .destroy_conn = iscsi_conn_teardown, + .attr_is_visible = iser_attr_is_visible, + .set_param = iscsi_iser_set_param, + .get_conn_param = iscsi_conn_get_param, + .get_ep_param = iscsi_iser_get_ep_param, + .get_session_param = iscsi_session_get_param, + .start_conn = iscsi_iser_conn_start, + .stop_conn = iscsi_iser_conn_stop, + /* iscsi host params */ + .get_host_param = iscsi_host_get_param, + .set_host_param = iscsi_host_set_param, + /* IO */ + .send_pdu = iscsi_conn_send_pdu, + .get_stats = iscsi_iser_conn_get_stats, + .init_task = iscsi_iser_task_init, + .xmit_task = iscsi_iser_task_xmit, + .cleanup_task = iscsi_iser_cleanup_task, + .alloc_pdu = iscsi_iser_pdu_alloc, + .check_protection = iscsi_iser_check_protection, + /* recovery */ + .session_recovery_timedout = iscsi_session_recovery_timedout, + + .ep_connect = iscsi_iser_ep_connect, + .ep_poll = iscsi_iser_ep_poll, + .ep_disconnect = iscsi_iser_ep_disconnect +}; + +static int __init iser_init(void) +{ + int err; + + iser_dbg("Starting iSER datamover...\n"); + + memset(&ig, 0, sizeof(struct iser_global)); + + ig.desc_cache = kmem_cache_create("iser_descriptors", + sizeof(struct iser_tx_desc), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (ig.desc_cache == NULL) + return -ENOMEM; + + /* device init is called only after the first addr resolution */ + mutex_init(&ig.device_list_mutex); + INIT_LIST_HEAD(&ig.device_list); + mutex_init(&ig.connlist_mutex); + INIT_LIST_HEAD(&ig.connlist); + + release_wq = alloc_workqueue("release workqueue", 0, 0); + if (!release_wq) { + iser_err("failed to allocate release workqueue\n"); + err = -ENOMEM; + goto err_alloc_wq; + } + + iscsi_iser_scsi_transport = iscsi_register_transport( + &iscsi_iser_transport); + if (!iscsi_iser_scsi_transport) { + iser_err("iscsi_register_transport failed\n"); + err = -EINVAL; + goto err_reg; + } + + return 0; + +err_reg: + destroy_workqueue(release_wq); +err_alloc_wq: + kmem_cache_destroy(ig.desc_cache); + + return err; +} + +static void __exit iser_exit(void) +{ + struct iser_conn *iser_conn, *n; + int connlist_empty; + + iser_dbg("Removing iSER datamover...\n"); + destroy_workqueue(release_wq); + + mutex_lock(&ig.connlist_mutex); + connlist_empty = list_empty(&ig.connlist); + mutex_unlock(&ig.connlist_mutex); + + if (!connlist_empty) { + iser_err("Error cleanup stage completed but we still have iser " + "connections, destroying them anyway\n"); + list_for_each_entry_safe(iser_conn, n, &ig.connlist, + conn_list) { + iser_conn_release(iser_conn); + } + } + + iscsi_unregister_transport(&iscsi_iser_transport); + kmem_cache_destroy(ig.desc_cache); +} + +module_init(iser_init); +module_exit(iser_exit); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.h 
b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.h new file mode 100644 index 0000000..0970005 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -0,0 +1,585 @@ +/* + * iSER transport for the Open iSCSI Initiator & iSER transport internals + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * based on code maintained by open-iscsi@googlegroups.com + * + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ISCSI_ISER_H__ +#define __ISCSI_ISER_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#define DRV_NAME "iser" +#define PFX DRV_NAME ": " +#define DRV_VER "1.6" + +#define iser_dbg(fmt, arg...) \ + do { \ + if (unlikely(iser_debug_level > 2)) \ + printk(KERN_DEBUG PFX "%s: " fmt,\ + __func__ , ## arg); \ + } while (0) + +#define iser_warn(fmt, arg...) \ + do { \ + if (unlikely(iser_debug_level > 0)) \ + pr_warn(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_info(fmt, arg...) \ + do { \ + if (unlikely(iser_debug_level > 1)) \ + pr_info(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define iser_err(fmt, arg...) 
\ + pr_err(PFX "%s: " fmt, __func__ , ## arg) + +/* Default support is 512KB I/O size */ +#define ISER_DEF_MAX_SECTORS 1024 +#define ISCSI_ISER_DEF_SG_TABLESIZE \ + ((ISER_DEF_MAX_SECTORS * SECTOR_SIZE) >> ilog2(SZ_4K)) +/* Maximum support is 16MB I/O size */ +#define ISCSI_ISER_MAX_SG_TABLESIZE ((32768 * SECTOR_SIZE) >> ilog2(SZ_4K)) + +#define ISER_DEF_XMIT_CMDS_DEFAULT 512 +#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT + #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX +#else + #define ISER_DEF_XMIT_CMDS_MAX ISER_DEF_XMIT_CMDS_DEFAULT +#endif +#define ISER_DEF_CMD_PER_LUN ISER_DEF_XMIT_CMDS_MAX + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) + +/* the max TX (send) WR supported by the iSER QP is defined by * + * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * + * to have at max for SCSI command. The tx posting & completion handling code * + * supports -EAGAIN scheme where tx is suspended till the QP has room for more * + * send WR. D=8 comes from 64K/8K */ + +#define ISER_INFLIGHT_DATAOUTS 8 + +#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_INFLIGHT_DATAOUTS) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +/* Max registration work requests per command */ +#define ISER_MAX_REG_WR_PER_CMD 5 + +/* For Signature we don't support DATAOUTs so no need to make room for them */ +#define ISER_QP_SIG_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ + (1 + ISER_MAX_REG_WR_PER_CMD) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \ + - ISER_MAX_TX_MISC_PDUS \ + - ISER_MAX_RX_MISC_PDUS) / \ + (1 + ISER_INFLIGHT_DATAOUTS)) + +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 128 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* Length of an object name string */ +#define ISER_OBJECT_NAME_SIZE 64 + +/* Timeout in milliseconds to wait for rdma_cm events in disconnect +* flows. +* This timeout is needed for a workaround in iSER for a bug in the +* CM layer that doesn't handle CM_DREQ timeouts correctly. +* TODO: remove this workaround once the CM bug fixed. 
+*/ +#define IB_COMPLETION_TMO 4096 + +enum iser_conn_state { + ISER_CONN_INIT, /* descriptor allocd, no conn */ + ISER_CONN_PENDING, /* in the process of being established */ + ISER_CONN_UP, /* up and running */ + ISER_CONN_TERMINATING, /* in the process of being terminated */ + ISER_CONN_DOWN, /* shut down */ + ISER_CONN_STATES_NUM +}; + +enum iser_task_status { + ISER_TASK_STATUS_INIT = 0, + ISER_TASK_STATUS_STARTED, + ISER_TASK_STATUS_COMPLETED +}; + +enum iser_data_dir { + ISER_DIR_IN = 0, /* to initiator */ + ISER_DIR_OUT, /* from initiator */ + ISER_DIRS_NUM +}; + +/** + * struct iser_data_buf - iSER data buffer + * + * @sg: pointer to the sg list + * @size: num entries of this sg + * @data_len: total beffer byte len + * @dma_nents: returned by dma_map_sg + */ +struct iser_data_buf { + struct scatterlist *sg; + int size; + unsigned long data_len; + int dma_nents; +}; + +/* fwd declarations */ +struct iser_device; +struct iscsi_iser_task; +struct iscsi_endpoint; +struct iser_reg_resources; + +/** + * struct iser_mem_reg - iSER memory registration info + * + * @sge: memory region sg element + * @rkey: memory region remote key + * @mem_h: pointer to registration context (FMR/Fastreg) + */ +struct iser_mem_reg { + struct ib_sge sge; + u32 rkey; + void *mem_h; +}; + +enum iser_desc_type { + ISCSI_TX_CONTROL , + ISCSI_TX_SCSI_COMMAND, + ISCSI_TX_DATAOUT +}; + +/** + * struct iser_tx_desc - iSER TX descriptor + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @type: command/control/dataout + * @dma_addr: header buffer dma_address + * @tx_sg: sg[0] points to iser/iscsi headers + * sg[1] optionally points to either of immediate data + * unsolicited data-out or control + * @num_sge: number sges used on this TX task + * @cqe: completion handler + * @mapped: Is the task header mapped + * @reg_wr: registration WR + * @send_wr: send WR + * @inv_wr: invalidate WR + */ +struct iser_tx_desc { + struct iser_ctrl iser_header; + struct iscsi_hdr iscsi_header; + enum iser_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + int num_sge; + struct ib_cqe cqe; + bool mapped; + struct ib_reg_wr reg_wr; + struct ib_send_wr send_wr; + struct ib_send_wr inv_wr; +}; + +#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ + sizeof(u64) + sizeof(struct ib_sge) + \ + sizeof(struct ib_cqe))) +/** + * struct iser_rx_desc - iSER RX descriptor + * + * @iser_header: iser header + * @iscsi_header: iscsi header + * @data: received data segment + * @dma_addr: receive buffer dma address + * @rx_sg: ib_sge of receive buffer + * @cqe: completion handler + * @pad: for sense data TODO: Modify to maximum sense length supported + */ +struct iser_rx_desc { + struct iser_ctrl iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sge rx_sg; + struct ib_cqe cqe; + char pad[ISER_RX_PAD_SIZE]; +} __packed; + +/** + * struct iser_login_desc - iSER login descriptor + * + * @req: pointer to login request buffer + * @rsp: pointer to login response buffer + * @req_dma: DMA address of login request buffer + * @rsp_dma: DMA address of login response buffer + * @sge: IB sge for login post recv + * @cqe: completion handler + */ +struct iser_login_desc { + void *req; + void *rsp; + u64 req_dma; + u64 rsp_dma; + struct ib_sge sge; + struct ib_cqe cqe; +} __packed; + +struct iser_conn; +struct ib_conn; + +/** + * struct iser_device - iSER device handle + * + * @ib_device: RDMA device + * @pd: Protection Domain for this device + * @mr: Global DMA memory region 
+ * @event_handler: IB events handler routine + * @ig_list: entry in devices list + * @refcount: Reference counter, dominated by open iser connections + */ +struct iser_device { + struct ib_device *ib_device; + struct ib_pd *pd; + struct ib_event_handler event_handler; + struct list_head ig_list; + int refcount; +}; + +/** + * struct iser_reg_resources - Fast registration resources + * + * @mr: memory region + * @sig_mr: signature memory region + * @mr_valid: is mr valid indicator + */ +struct iser_reg_resources { + struct ib_mr *mr; + struct ib_mr *sig_mr; + u8 mr_valid:1; +}; + +/** + * struct iser_fr_desc - Fast registration descriptor + * + * @list: entry in connection fastreg pool + * @rsc: data buffer registration resources + * @sig_protected: is region protected indicator + * @all_list: first and last list members + */ +struct iser_fr_desc { + struct list_head list; + struct iser_reg_resources rsc; + bool sig_protected; + struct list_head all_list; +}; + +/** + * struct iser_fr_pool - connection fast registration pool + * + * @list: list of fastreg descriptors + * @lock: protects fastreg pool + * @size: size of the pool + * @all_list: first and last list members + */ +struct iser_fr_pool { + struct list_head list; + spinlock_t lock; + int size; + struct list_head all_list; +}; + +/** + * struct ib_conn - Infiniband related objects + * + * @cma_id: rdma_cm connection manager handle + * @qp: Connection Queue-pair + * @cq: Connection completion queue + * @cq_size: The number of max outstanding completions + * @device: reference to iser device + * @fr_pool: connection fast registration pool + * @pi_support: Indicate device T10-PI support + * @reg_cqe: completion handler + */ +struct ib_conn { + struct rdma_cm_id *cma_id; + struct ib_qp *qp; + struct ib_cq *cq; + u32 cq_size; + struct iser_device *device; + struct iser_fr_pool fr_pool; + bool pi_support; + struct ib_cqe reg_cqe; +}; + +/** + * struct iser_conn - iSER connection context + * + * @ib_conn: connection RDMA resources + * @iscsi_conn: link to matching iscsi connection + * @ep: transport handle + * @state: connection logical state + * @qp_max_recv_dtos: maximum number of data outs, corresponds + * to max number of post recvs + * @max_cmds: maximum cmds allowed for this connection + * @name: connection peer portal + * @release_work: deferred work for release job + * @state_mutex: protects iser connection state + * @stop_completion: conn_stop completion + * @ib_completion: RDMA cleanup completion + * @up_completion: connection establishment completed + * (state is ISER_CONN_UP) + * @conn_list: entry in ig conn list + * @login_desc: login descriptor + * @rx_descs: rx buffers array (cyclic buffer) + * @num_rx_descs: number of rx descriptors + * @scsi_sg_tablesize: scsi host sg_tablesize + * @pages_per_mr: maximum pages available for registration + * @snd_w_inv: connection uses remote invalidation + */ +struct iser_conn { + struct ib_conn ib_conn; + struct iscsi_conn *iscsi_conn; + struct iscsi_endpoint *ep; + enum iser_conn_state state; + unsigned qp_max_recv_dtos; + u16 max_cmds; + char name[ISER_OBJECT_NAME_SIZE]; + struct work_struct release_work; + struct mutex state_mutex; + struct completion stop_completion; + struct completion ib_completion; + struct completion up_completion; + struct list_head conn_list; + struct iser_login_desc login_desc; + struct iser_rx_desc *rx_descs; + u32 num_rx_descs; + unsigned short scsi_sg_tablesize; + unsigned short pages_per_mr; + bool snd_w_inv; +}; + +/** + * struct iscsi_iser_task - iser task
context + * + * @desc: TX descriptor + * @iser_conn: link to iser connection + * @status: current task status + * @sc: link to scsi command + * @command_sent: indicate if command was sent + * @dir: iser data direction + * @rdma_reg: task rdma registration desc + * @data: iser data buffer desc + * @prot: iser protection buffer desc + */ +struct iscsi_iser_task { + struct iser_tx_desc desc; + struct iser_conn *iser_conn; + enum iser_task_status status; + struct scsi_cmnd *sc; + int command_sent; + int dir[ISER_DIRS_NUM]; + struct iser_mem_reg rdma_reg[ISER_DIRS_NUM]; + struct iser_data_buf data[ISER_DIRS_NUM]; + struct iser_data_buf prot[ISER_DIRS_NUM]; +}; + +/** + * struct iser_global - iSER global context + * + * @device_list_mutex: protects device_list + * @device_list: iser devices global list + * @connlist_mutex: protects connlist + * @connlist: iser connections global list + * @desc_cache: kmem cache for tx dataout + */ +struct iser_global { + struct mutex device_list_mutex; + struct list_head device_list; + struct mutex connlist_mutex; + struct list_head connlist; + struct kmem_cache *desc_cache; +}; + +extern struct iser_global ig; +extern int iser_debug_level; +extern bool iser_pi_enable; +extern unsigned int iser_max_sectors; +extern bool iser_always_reg; + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_task *task); + +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_task *task, + struct iscsi_data *hdr); + +void iscsi_iser_recv(struct iscsi_conn *conn, + struct iscsi_hdr *hdr, + char *rx_data, + int rx_data_len); + +void iser_conn_init(struct iser_conn *iser_conn); + +void iser_conn_release(struct iser_conn *iser_conn); + +int iser_conn_terminate(struct iser_conn *iser_conn); + +void iser_release_work(struct work_struct *work); + +void iser_err_comp(struct ib_wc *wc, const char *type); +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc); + +void iser_task_rdma_init(struct iscsi_iser_task *task); + +void iser_task_rdma_finalize(struct iscsi_iser_task *task); + +void iser_free_rx_descriptors(struct iser_conn *iser_conn); + +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + enum iser_data_dir cmd_dir); + +int iser_reg_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir dir, + bool all_imm); +void iser_unreg_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir dir); + +int iser_connect(struct iser_conn *iser_conn, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + int non_blocking); + +int iser_post_recvl(struct iser_conn *iser_conn); +int iser_post_recvm(struct iser_conn *iser_conn, + struct iser_rx_desc *rx_desc); +int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc); + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir); + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum dma_data_direction dir); + +int iser_initialize_task_headers(struct iscsi_task *task, + struct iser_tx_desc *tx_desc); +int iser_alloc_rx_descriptors(struct 
iser_conn *iser_conn, + struct iscsi_session *session); +int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size); +void iser_free_fastreg_pool(struct ib_conn *ib_conn); +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector); + +static inline struct iser_conn * +to_iser_conn(struct ib_conn *ib_conn) +{ + return container_of(ib_conn, struct iser_conn, ib_conn); +} + +static inline struct iser_rx_desc * +iser_rx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_rx_desc, cqe); +} + +static inline struct iser_tx_desc * +iser_tx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_tx_desc, cqe); +} + +static inline struct iser_login_desc * +iser_login(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_login_desc, cqe); +} + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_initiator.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_initiator.c new file mode 100644 index 0000000..2e160c8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_initiator.c @@ -0,0 +1,771 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. 
Data size is stored in + * task->data[ISER_DIR_IN].data_len, Protection size + * os stored in task->prot[ISER_DIR_IN].data_len + */ +static int iser_prepare_read_cmd(struct iscsi_task *task) + +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_mem_reg *mem_reg; + int err; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + buf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_task, + pbuf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + } + + err = iser_reg_mem_fastreg(iser_task, ISER_DIR_IN, false); + if (err) { + iser_err("Failed to set up Data-IN RDMA\n"); + return err; + } + mem_reg = &iser_task->rdma_reg[ISER_DIR_IN]; + + hdr->flags |= ISER_RSV; + hdr->read_stag = cpu_to_be32(mem_reg->rkey); + hdr->read_va = cpu_to_be64(mem_reg->sge.addr); + + iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n", + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr); + + return 0; +} + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Data size is stored in + * task->data[ISER_DIR_OUT].data_len, Protection size + * is stored at task->prot[ISER_DIR_OUT].data_len + */ +static int iser_prepare_write_cmd(struct iscsi_task *task, unsigned int imm_sz, + unsigned int unsol_sz, unsigned int edtl) +{ + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_mem_reg *mem_reg; + int err; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; + struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; + struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; + + err = iser_dma_map_task_data(iser_task, + buf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; + + err = iser_dma_map_task_data(iser_task, + pbuf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + } + + err = iser_reg_mem_fastreg(iser_task, ISER_DIR_OUT, + buf_out->data_len == imm_sz); + if (err != 0) { + iser_err("Failed to register write cmd RDMA mem\n"); + return err; + } + + mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; + + if (unsol_sz < edtl) { + hdr->flags |= ISER_WSV; + if (buf_out->data_len > imm_sz) { + hdr->write_stag = cpu_to_be32(mem_reg->rkey); + hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); + } + + iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X VA:%#llX + unsol:%d\n", + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr, unsol_sz); + } + + if (imm_sz > 0) { + iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", + task->itt, imm_sz); + tx_dsg->addr = mem_reg->sge.addr; + tx_dsg->length = imm_sz; + tx_dsg->lkey = mem_reg->sge.lkey; + iser_task->desc.num_sge = 2; + } + + return 0; +} + +/* creates a new tx descriptor and adds header regd buffer */ +static void iser_create_send_desc(struct iser_conn *iser_conn, + struct iser_tx_desc *tx_desc) +{ + struct iser_device *device = iser_conn->ib_conn.device; + + ib_dma_sync_single_for_cpu(device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); + tx_desc->iser_header.flags = ISER_VER; + tx_desc->num_sge = 1; +} + +static void iser_free_login_buf(struct iser_conn *iser_conn) +{ + struct 
iser_device *device = iser_conn->ib_conn.device; + struct iser_login_desc *desc = &iser_conn->login_desc; + + if (!desc->req) + return; + + ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + + ib_dma_unmap_single(device->ib_device, desc->rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + + kfree(desc->req); + kfree(desc->rsp); + + /* make sure we never redo any unmapping */ + desc->req = NULL; + desc->rsp = NULL; +} + +static int iser_alloc_login_buf(struct iser_conn *iser_conn) +{ + struct iser_device *device = iser_conn->ib_conn.device; + struct iser_login_desc *desc = &iser_conn->login_desc; + + desc->req = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN, GFP_KERNEL); + if (!desc->req) + return -ENOMEM; + + desc->req_dma = ib_dma_map_single(device->ib_device, desc->req, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->req_dma)) + goto free_req; + + desc->rsp = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!desc->rsp) + goto unmap_req; + + desc->rsp_dma = ib_dma_map_single(device->ib_device, desc->rsp, + ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->rsp_dma)) + goto free_rsp; + + return 0; + +free_rsp: + kfree(desc->rsp); +unmap_req: + ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); +free_req: + kfree(desc->req); + + return -ENOMEM; +} + +int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, + struct iscsi_session *session) +{ + int i, j; + u64 dma_addr; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + iser_conn->qp_max_recv_dtos = session->cmds_max; + + if (iser_alloc_fastreg_pool(ib_conn, session->scsi_cmds_max, + iser_conn->pages_per_mr)) + goto create_rdma_reg_res_failed; + + if (iser_alloc_login_buf(iser_conn)) + goto alloc_login_buf_fail; + + iser_conn->num_rx_descs = session->cmds_max; + iser_conn->rx_descs = kmalloc_array(iser_conn->num_rx_descs, + sizeof(struct iser_rx_desc), + GFP_KERNEL); + if (!iser_conn->rx_descs) + goto rx_desc_alloc_fail; + + rx_desc = iser_conn->rx_descs; + + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) { + dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, dma_addr)) + goto rx_desc_dma_map_failed; + + rx_desc->dma_addr = dma_addr; + rx_desc->cqe.done = iser_task_rsp; + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr; + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = device->pd->local_dma_lkey; + } + + return 0; + +rx_desc_dma_map_failed: + rx_desc = iser_conn->rx_descs; + for (j = 0; j < i; j++, rx_desc++) + ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(iser_conn->rx_descs); + iser_conn->rx_descs = NULL; +rx_desc_alloc_fail: + iser_free_login_buf(iser_conn); +alloc_login_buf_fail: + iser_free_fastreg_pool(ib_conn); +create_rdma_reg_res_failed: + iser_err("failed allocating rx descriptors / data buffers\n"); + return -ENOMEM; +} + +void iser_free_rx_descriptors(struct iser_conn *iser_conn) +{ + int i; + struct iser_rx_desc *rx_desc; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + iser_free_fastreg_pool(ib_conn); + + rx_desc = iser_conn->rx_descs; + for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) 
+ ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, + ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); + kfree(iser_conn->rx_descs); + /* make sure we never redo any unmapping */ + iser_conn->rx_descs = NULL; + + iser_free_login_buf(iser_conn); +} + +static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_session *session = conn->session; + int err = 0; + int i; + + iser_dbg("req op %x flags %x\n", req->opcode, req->flags); + /* check if this is the last login - going to full feature phase */ + if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) + goto out; + + if (session->discovery_sess) { + iser_info("Discovery session, re-using login RX buffer\n"); + goto out; + } + + iser_info("Normal session, posting batch of RX %d buffers\n", + iser_conn->qp_max_recv_dtos - 1); + + /* + * Initial post receive buffers. + * There is one already posted recv buffer (for the last login + * response). Therefore, the first recv buffer is skipped here. + */ + for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) { + err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]); + if (err) + goto out; + } +out: + return err; +} + +/** + * iser_send_command - send command PDU + * @conn: link to matching iscsi connection + * @task: SCSI command task + */ +int iser_send_command(struct iscsi_conn *conn, struct iscsi_task *task) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + unsigned long edtl; + int err; + struct iser_data_buf *data_buf, *prot_buf; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; + struct scsi_cmnd *sc = task->sc; + struct iser_tx_desc *tx_desc = &iser_task->desc; + + edtl = ntohl(hdr->data_length); + + /* build the tx desc regd header and add it to the tx desc dto */ + tx_desc->type = ISCSI_TX_SCSI_COMMAND; + tx_desc->cqe.done = iser_cmd_comp; + iser_create_send_desc(iser_conn, tx_desc); + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + data_buf = &iser_task->data[ISER_DIR_IN]; + prot_buf = &iser_task->prot[ISER_DIR_IN]; + } else { + data_buf = &iser_task->data[ISER_DIR_OUT]; + prot_buf = &iser_task->prot[ISER_DIR_OUT]; + } + + if (scsi_sg_count(sc)) { /* using a scatter list */ + data_buf->sg = scsi_sglist(sc); + data_buf->size = scsi_sg_count(sc); + } + data_buf->data_len = scsi_bufflen(sc); + + if (scsi_prot_sg_count(sc)) { + prot_buf->sg = scsi_prot_sglist(sc); + prot_buf->size = scsi_prot_sg_count(sc); + prot_buf->data_len = (data_buf->data_len >> + ilog2(sc->device->sector_size)) * 8; + } + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + err = iser_prepare_read_cmd(task); + if (err) + goto send_command_error; + } + if (hdr->flags & ISCSI_FLAG_CMD_WRITE) { + err = iser_prepare_write_cmd(task, + task->imm_count, + task->imm_count + + task->unsol_r2t.data_length, + edtl); + if (err) + goto send_command_error; + } + + iser_task->status = ISER_TASK_STATUS_STARTED; + + err = iser_post_send(&iser_conn->ib_conn, tx_desc); + if (!err) + return 0; + +send_command_error: + iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err); + return err; +} + +/** + * iser_send_data_out - send data out PDU + * @conn: link to matching iscsi connection + * @task: SCSI command task + * @hdr: pointer to the LLD's iSCSI message header + */ +int iser_send_data_out(struct iscsi_conn *conn, struct iscsi_task *task, + struct iscsi_data *hdr) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; 
+ struct iser_tx_desc *tx_desc; + struct iser_mem_reg *mem_reg; + unsigned long buf_offset; + unsigned long data_seg_len; + uint32_t itt; + int err; + struct ib_sge *tx_dsg; + + itt = (__force uint32_t)hdr->itt; + data_seg_len = ntoh24(hdr->dlength); + buf_offset = ntohl(hdr->offset); + + iser_dbg("%s itt %d dseg_len %d offset %d\n", + __func__,(int)itt,(int)data_seg_len,(int)buf_offset); + + tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); + if (!tx_desc) + return -ENOMEM; + + tx_desc->type = ISCSI_TX_DATAOUT; + tx_desc->cqe.done = iser_dataout_comp; + tx_desc->iser_header.flags = ISER_VER; + memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); + + /* build the tx desc */ + err = iser_initialize_task_headers(task, tx_desc); + if (err) + goto send_data_out_error; + + mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; + tx_dsg = &tx_desc->tx_sg[1]; + tx_dsg->addr = mem_reg->sge.addr + buf_offset; + tx_dsg->length = data_seg_len; + tx_dsg->lkey = mem_reg->sge.lkey; + tx_desc->num_sge = 2; + + if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { + iser_err("Offset:%ld & DSL:%ld in Data-Out inconsistent with total len:%ld, itt:%d\n", + buf_offset, data_seg_len, + iser_task->data[ISER_DIR_OUT].data_len, itt); + err = -EINVAL; + goto send_data_out_error; + } + iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n", + itt, buf_offset, data_seg_len); + + + err = iser_post_send(&iser_conn->ib_conn, tx_desc); + if (!err) + return 0; + +send_data_out_error: + kmem_cache_free(ig.desc_cache, tx_desc); + iser_err("conn %p failed err %d\n", conn, err); + return err; +} + +int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task) +{ + struct iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_tx_desc *mdesc = &iser_task->desc; + unsigned long data_seg_len; + int err = 0; + struct iser_device *device; + + /* build the tx desc regd header and add it to the tx desc dto */ + mdesc->type = ISCSI_TX_CONTROL; + mdesc->cqe.done = iser_ctrl_comp; + iser_create_send_desc(iser_conn, mdesc); + + device = iser_conn->ib_conn.device; + + data_seg_len = ntoh24(task->hdr->dlength); + + if (data_seg_len > 0) { + struct iser_login_desc *desc = &iser_conn->login_desc; + struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; + + if (task != conn->login_task) { + iser_err("data present on non login task!!!\n"); + goto send_control_error; + } + + ib_dma_sync_single_for_cpu(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); + + memcpy(desc->req, task->data, task->data_count); + + ib_dma_sync_single_for_device(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); + + tx_dsg->addr = desc->req_dma; + tx_dsg->length = task->data_count; + tx_dsg->lkey = device->pd->local_dma_lkey; + mdesc->num_sge = 2; + } + + if (task == conn->login_task) { + iser_dbg("op %x dsl %lx, posting login rx buffer\n", + task->hdr->opcode, data_seg_len); + err = iser_post_recvl(iser_conn); + if (err) + goto send_control_error; + err = iser_post_rx_bufs(conn, task->hdr); + if (err) + goto send_control_error; + } + + err = iser_post_send(&iser_conn->ib_conn, mdesc); + if (!err) + return 0; + +send_control_error: + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_login_desc *desc = iser_login(wc->wr_cqe); + struct iscsi_hdr *hdr; + char *data; + int 
length; + bool full_feature_phase; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "login_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + hdr = desc->rsp + sizeof(struct iser_ctrl); + data = desc->rsp + ISER_HEADERS_LEN; + length = wc->byte_len - ISER_HEADERS_LEN; + full_feature_phase = ((hdr->flags & ISCSI_FULL_FEATURE_PHASE) == + ISCSI_FULL_FEATURE_PHASE) && + (hdr->flags & ISCSI_FLAG_CMD_FINAL); + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, length); + + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data, length); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + if (!full_feature_phase || + iser_conn->iscsi_conn->session->discovery_sess) + return; + + /* Post the first RX buffer that is skipped in iser_post_rx_bufs() */ + iser_post_recvm(iser_conn, iser_conn->rx_descs); +} + +static inline int iser_inv_desc(struct iser_fr_desc *desc, u32 rkey) +{ + if (unlikely((!desc->sig_protected && rkey != desc->rsc.mr->rkey) || + (desc->sig_protected && rkey != desc->rsc.sig_mr->rkey))) { + iser_err("Bogus remote invalidation for rkey %#x\n", rkey); + return -EINVAL; + } + + desc->rsc.mr_valid = 0; + + return 0; +} + +static int iser_check_remote_inv(struct iser_conn *iser_conn, struct ib_wc *wc, + struct iscsi_hdr *hdr) +{ + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + struct iscsi_task *task; + u32 rkey = wc->ex.invalidate_rkey; + + iser_dbg("conn %p: remote invalidation for rkey %#x\n", + iser_conn, rkey); + + if (unlikely(!iser_conn->snd_w_inv)) { + iser_err("conn %p: unexpected remote invalidation, terminating connection\n", + iser_conn); + return -EPROTO; + } + + task = iscsi_itt_to_ctask(iser_conn->iscsi_conn, hdr->itt); + if (likely(task)) { + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_fr_desc *desc; + + if (iser_task->dir[ISER_DIR_IN]) { + desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h; + if (unlikely(iser_inv_desc(desc, rkey))) + return -EINVAL; + } + + if (iser_task->dir[ISER_DIR_OUT]) { + desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h; + if (unlikely(iser_inv_desc(desc, rkey))) + return -EINVAL; + } + } else { + iser_err("failed to get task for itt=%d\n", hdr->itt); + return -EINVAL; + } + } + + return 0; +} + + +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); + struct iscsi_hdr *hdr; + int length, err; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "task_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); + + hdr = &desc->iscsi_header; + length = wc->byte_len - ISER_HEADERS_LEN; + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, length); + + if (iser_check_remote_inv(iser_conn, wc, hdr)) { + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + return; + } + + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, desc->data, length); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); + + err = iser_post_recvm(iser_conn, desc); + if (err) + iser_err("posting rx buffer err %d\n", err); +} + +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + 
iser_err_comp(wc, "command"); +} + +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); + struct iscsi_task *task; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "control"); + return; + } + + /* this arithmetic is legal by libiscsi dd_data allocation */ + task = (void *)desc - sizeof(struct iscsi_task); + if (task->hdr->itt == RESERVED_ITT) + iscsi_put_task(task); +} + +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_device *device = ib_conn->device; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + iser_err_comp(wc, "dataout"); + + ib_dma_unmap_single(device->ib_device, desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + kmem_cache_free(ig.desc_cache, desc); +} + +void iser_task_rdma_init(struct iscsi_iser_task *iser_task) + +{ + iser_task->status = ISER_TASK_STATUS_INIT; + + iser_task->dir[ISER_DIR_IN] = 0; + iser_task->dir[ISER_DIR_OUT] = 0; + + iser_task->data[ISER_DIR_IN].data_len = 0; + iser_task->data[ISER_DIR_OUT].data_len = 0; + + iser_task->prot[ISER_DIR_IN].data_len = 0; + iser_task->prot[ISER_DIR_OUT].data_len = 0; + + iser_task->prot[ISER_DIR_IN].dma_nents = 0; + iser_task->prot[ISER_DIR_OUT].dma_nents = 0; + + memset(&iser_task->rdma_reg[ISER_DIR_IN], 0, + sizeof(struct iser_mem_reg)); + memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0, + sizeof(struct iser_mem_reg)); +} + +void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) +{ + int prot_count = scsi_prot_sg_count(iser_task->sc); + + if (iser_task->dir[ISER_DIR_IN]) { + iser_unreg_mem_fastreg(iser_task, ISER_DIR_IN); + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_IN], + DMA_FROM_DEVICE); + if (prot_count) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_IN], + DMA_FROM_DEVICE); + } + + if (iser_task->dir[ISER_DIR_OUT]) { + iser_unreg_mem_fastreg(iser_task, ISER_DIR_OUT); + iser_dma_unmap_task_data(iser_task, + &iser_task->data[ISER_DIR_OUT], + DMA_TO_DEVICE); + if (prot_count) + iser_dma_unmap_task_data(iser_task, + &iser_task->prot[ISER_DIR_OUT], + DMA_TO_DEVICE); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_memory.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_memory.c new file mode 100644 index 0000000..6609826 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_memory.c @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +#include "iscsi_iser.h" + +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + iser_err_comp(wc, "memreg"); +} + +static struct iser_fr_desc *iser_reg_desc_get_fr(struct ib_conn *ib_conn) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; + unsigned long flags; + + spin_lock_irqsave(&fr_pool->lock, flags); + desc = list_first_entry(&fr_pool->list, + struct iser_fr_desc, list); + list_del(&desc->list); + spin_unlock_irqrestore(&fr_pool->lock, flags); + + return desc; +} + +static void iser_reg_desc_put_fr(struct ib_conn *ib_conn, + struct iser_fr_desc *desc) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + unsigned long flags; + + spin_lock_irqsave(&fr_pool->lock, flags); + list_add(&desc->list, &fr_pool->list); + spin_unlock_irqrestore(&fr_pool->lock, flags); +} + +int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir) +{ + struct ib_device *dev; + + iser_task->dir[iser_dir] = 1; + dev = iser_task->iser_conn->ib_conn.device->ib_device; + + data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size, dma_dir); + if (unlikely(data->dma_nents == 0)) { + iser_err("dma_map_sg failed!!!\n"); + return -EINVAL; + } + return 0; +} + +void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum dma_data_direction dir) +{ + struct ib_device *dev; + + dev = iser_task->iser_conn->ib_conn.device->ib_device; + ib_dma_unmap_sg(dev, data->sg, data->size, dir); +} + +static int iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, + struct iser_mem_reg *reg) +{ + struct scatterlist *sg = mem->sg; + + reg->sge.lkey = device->pd->local_dma_lkey; + /* + * FIXME: rework the registration code path to differentiate + * rkey/lkey use cases + */ + + if (device->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) + reg->rkey = device->pd->unsafe_global_rkey; + else + reg->rkey = 0; + reg->sge.addr = sg_dma_address(&sg[0]); + reg->sge.length = sg_dma_len(&sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx," + " length=0x%x\n", reg->sge.lkey, reg->rkey, + reg->sge.addr, reg->sge.length); + + return 0; +} + +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + struct iser_fr_desc *desc; + struct ib_mr_status mr_status; + + desc = reg->mem_h; + if (!desc) + return; + + /* + * The signature MR cannot be invalidated and reused without checking. + * libiscsi calls the check_protection transport handler only if + * SCSI-Response is received. And the signature MR is not checked if + * the task is completed for some other reason like a timeout or error + * handling. That's why we must check the signature MR here before + * putting it to the free pool. 
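+ * Note that neither the return value of ib_check_mr_status() nor the + * reported mr_status is inspected here; the call is made only so that + * the signature MR can be safely reused.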
+ */ + if (unlikely(desc->sig_protected)) { + desc->sig_protected = false; + ib_check_mr_status(desc->rsc.sig_mr, IB_MR_CHECK_SIG_STATUS, + &mr_status); + } + iser_reg_desc_put_fr(&iser_task->iser_conn->ib_conn, reg->mem_h); + reg->mem_h = NULL; +} + +static void iser_set_dif_domain(struct scsi_cmnd *sc, + struct ib_sig_domain *domain) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.pi_interval = scsi_prot_interval(sc); + domain->sig.dif.ref_tag = t10_pi_ref_tag(scsi_cmd_to_rq(sc)); + /* + * At the moment we hard code those, but in the future + * we will take them from sc. + */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (sc->prot_flags & SCSI_PROT_REF_INCREMENT) + domain->sig.dif.ref_remap = true; +} + +static int iser_set_sig_attrs(struct scsi_cmnd *sc, + struct ib_sig_attrs *sig_attrs) +{ + switch (scsi_get_prot_op(sc)) { + case SCSI_PROT_WRITE_INSERT: + case SCSI_PROT_READ_STRIP: + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, &sig_attrs->wire); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + break; + case SCSI_PROT_READ_INSERT: + case SCSI_PROT_WRITE_STRIP: + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + iser_set_dif_domain(sc, &sig_attrs->mem); + sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? + IB_T10DIF_CSUM : IB_T10DIF_CRC; + break; + case SCSI_PROT_READ_PASS: + case SCSI_PROT_WRITE_PASS: + iser_set_dif_domain(sc, &sig_attrs->wire); + sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; + iser_set_dif_domain(sc, &sig_attrs->mem); + sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? + IB_T10DIF_CSUM : IB_T10DIF_CRC; + break; + default: + iser_err("Unsupported PI operation %d\n", + scsi_get_prot_op(sc)); + return -EINVAL; + } + + return 0; +} + +static inline void iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) +{ + *mask = 0; + if (sc->prot_flags & SCSI_PROT_REF_CHECK) + *mask |= IB_SIG_CHECK_REFTAG; + if (sc->prot_flags & SCSI_PROT_GUARD_CHECK) + *mask |= IB_SIG_CHECK_GUARD; +} + +static inline void iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr, + struct ib_cqe *cqe, struct ib_send_wr *next_wr) +{ + inv_wr->opcode = IB_WR_LOCAL_INV; + inv_wr->wr_cqe = cqe; + inv_wr->ex.invalidate_rkey = mr->rkey; + inv_wr->send_flags = 0; + inv_wr->num_sge = 0; + inv_wr->next = next_wr; +} + +static int iser_reg_sig_mr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_data_buf *sig_mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *sig_reg) +{ + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; + struct ib_mr *mr = rsc->sig_mr; + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; + struct ib_reg_wr *wr = &tx_desc->reg_wr; + int ret; + + memset(sig_attrs, 0, sizeof(*sig_attrs)); + ret = iser_set_sig_attrs(iser_task->sc, sig_attrs); + if (ret) + goto err; + + iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); + + if (rsc->mr_valid) + iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); + + ret = ib_map_mr_sg_pi(mr, mem->sg, mem->dma_nents, NULL, + sig_mem->sg, sig_mem->dma_nents, NULL, SZ_4K); + if (unlikely(ret)) { + iser_err("failed to map PI sg (%d)\n", + mem->dma_nents + sig_mem->dma_nents); + goto err; + } + + memset(wr, 0, sizeof(*wr)); + wr->wr.next = &tx_desc->send_wr; + wr->wr.opcode = IB_WR_REG_MR_INTEGRITY; + wr->wr.wr_cqe = cqe; + wr->wr.num_sge 
= 0; + wr->wr.send_flags = 0; + wr->mr = mr; + wr->key = mr->rkey; + wr->access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + rsc->mr_valid = 1; + + sig_reg->sge.lkey = mr->lkey; + sig_reg->rkey = mr->rkey; + sig_reg->sge.addr = mr->iova; + sig_reg->sge.length = mr->length; + + iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=%u\n", + sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, + sig_reg->sge.length); +err: + return ret; +} + +static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_reg_resources *rsc, + struct iser_mem_reg *reg) +{ + struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; + struct ib_mr *mr = rsc->mr; + struct ib_reg_wr *wr = &tx_desc->reg_wr; + int n; + + if (rsc->mr_valid) + iser_inv_rkey(&tx_desc->inv_wr, mr, cqe, &wr->wr); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); + + n = ib_map_mr_sg(mr, mem->sg, mem->dma_nents, NULL, SZ_4K); + if (unlikely(n != mem->dma_nents)) { + iser_err("failed to map sg (%d/%d)\n", + n, mem->dma_nents); + return n < 0 ? n : -EINVAL; + } + + wr->wr.next = &tx_desc->send_wr; + wr->wr.opcode = IB_WR_REG_MR; + wr->wr.wr_cqe = cqe; + wr->wr.send_flags = 0; + wr->wr.num_sge = 0; + wr->mr = mr; + wr->key = mr->rkey; + wr->access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + rsc->mr_valid = 1; + + reg->sge.lkey = mr->lkey; + reg->rkey = mr->rkey; + reg->sge.addr = mr->iova; + reg->sge.length = mr->length; + + iser_dbg("lkey=0x%x rkey=0x%x addr=0x%llx length=0x%x\n", + reg->sge.lkey, reg->rkey, reg->sge.addr, reg->sge.length); + + return 0; +} + +static int iser_reg_data_sg(struct iscsi_iser_task *task, + struct iser_data_buf *mem, + struct iser_fr_desc *desc, bool use_dma_key, + struct iser_mem_reg *reg) +{ + struct iser_device *device = task->iser_conn->ib_conn.device; + + if (use_dma_key) + return iser_reg_dma(device, mem, reg); + + return iser_fast_reg_mr(task, mem, &desc->rsc, reg); +} + +int iser_reg_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir dir, + bool all_imm) +{ + struct ib_conn *ib_conn = &task->iser_conn->ib_conn; + struct iser_data_buf *mem = &task->data[dir]; + struct iser_mem_reg *reg = &task->rdma_reg[dir]; + struct iser_fr_desc *desc = NULL; + bool use_dma_key; + int err; + + use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) && + scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; + + if (!use_dma_key) { + desc = iser_reg_desc_get_fr(ib_conn); + reg->mem_h = desc; + } + + if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) { + err = iser_reg_data_sg(task, mem, desc, use_dma_key, reg); + if (unlikely(err)) + goto err_reg; + } else { + err = iser_reg_sig_mr(task, mem, &task->prot[dir], + &desc->rsc, reg); + if (unlikely(err)) + goto err_reg; + + desc->sig_protected = true; + } + + return 0; + +err_reg: + if (desc) + iser_reg_desc_put_fr(ib_conn, desc); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_spec_ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_spec_ new file mode 100644 index 0000000..fdd7b3c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_spec_ @@ -0,0 +1,216 @@ +# +# Copyright (c) 2014 Mellanox Technologies. All rights reserved. 
+# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. +# +# + +%{!?_name: %define _name iser} +%{!?_version: %define _version 4.0} +%{!?_release: %define _release 0} + +# KMP is disabled by default +%{!?KMP: %global KMP 0} + +# take kernel version or default to uname -r +%{!?KVERSION: %global KVERSION %(uname -r)} +%global kernel_version %{KVERSION} +%global krelver %(echo -n %{KVERSION} | sed -e 's/-/_/g') +# take path to kernel sources if provided, otherwise look in default location (for non KMP rpms). +%{!?K_SRC: %global K_SRC /lib/modules/%{KVERSION}/build} + +# define release version +%{!?src_release: %global src_release %{_release}_%{krelver}} +%if "%{KMP}" != "1" +%global _release1 %{src_release} +%else +%global _release1 %{_release} +%endif +%global _kmp_rel %{_release1}%{?_kmp_build_num}%{?_dist} + +Summary: %{_name} Driver +Name: %{_name} +Version: %{_version} +Release: %{_release1}%{?_dist} +License: GPLv2 +Url: http://www.mellanox.com +Group: System Environment/Base +Source: %{_name}-%{_version}.tgz +BuildRoot: %{?build_root:%{build_root}}%{!?build_root:/var/tmp/OFED} +Vendor: Mellanox Technologies +%description +%{name} kernel modules + +# build KMP rpms? +%if "%{KMP}" == "1" +%global kernel_release() $(make -s -C %{1} kernelrelease M=$PWD) +BuildRequires: %kernel_module_package_buildreqs +%(mkdir -p %{buildroot}) +%(echo '%defattr (-,root,root)' > %{buildroot}/file_list) +%(echo '/lib/modules/%2-%1' >> %{buildroot}/file_list) +%(echo '%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*-%1.conf' >> %{buildroot}/file_list) +%{kernel_module_package -f %{buildroot}/file_list -x xen -r %{_kmp_rel} } +%else +%global kernel_source() %{K_SRC} +%global kernel_release() %{KVERSION} +%global flavors_to_build default +%endif + +# +# setup module sign scripts if paths to the keys are given +# +%global WITH_MOD_SIGN %(if ( test -f "$MODULE_SIGN_PRIV_KEY" && test -f "$MODULE_SIGN_PUB_KEY" ); \ + then \ + echo -n '1'; \ + else \ + echo -n '0'; fi) + +%if "%{WITH_MOD_SIGN}" == "1" +# call module sign script +%global __modsign_install_post \ + %{_builddir}/%{name}-%{version}/source/tools/sign-modules %{buildroot}/lib/modules/ %{kernel_source default} || exit 1 \ +%{nil} + +%global __debug_package 1 +%global buildsubdir %{name}-%{version} +# Disgusting hack alert! We need to ensure we sign modules *after* all +# invocations of strip occur, which is in __debug_install_post if +# find-debuginfo.sh runs, and __os_install_post if not. 
+# +%global __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + %{__modsign_install_post} \ +%{nil} + +%endif # end of setup module sign scripts +# + +%if "%{_vendor}" == "suse" +%debug_package +%endif + +# set modules dir +%if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") +%if 0%{?fedora} +%global install_mod_dir updates/%{name} +%else +%global install_mod_dir extra/%{name} +%endif +%endif + +%if "%{_vendor}" == "suse" +%global install_mod_dir updates/%{name} +%endif + +%{!?install_mod_dir: %global install_mod_dir updates/%{name}} + +%prep +%setup +set -- * +mkdir source +mv "$@" source/ +mkdir obj + +%build +export EXTRA_CFLAGS='-DVERSION=\"%version\"' +export INSTALL_MOD_DIR=%{install_mod_dir} +export CONF_OPTIONS="%{configure_options}" +for flavor in %{flavors_to_build}; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + export LIB_MOD_DIR=/lib/modules/$KVER/$INSTALL_MOD_DIR + rm -rf obj/$flavor + cp -r source obj/$flavor + cd $PWD/obj/$flavor + make + cd - +done + +%install +export INSTALL_MOD_PATH=%{buildroot} +export INSTALL_MOD_DIR=%{install_mod_dir} +export PREFIX=%{_prefix} +for flavor in %flavors_to_build; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + cd $PWD/obj/$flavor + make install KERNELRELEASE=$KVER + # Cleanup unnecessary kernel-generated module dependency files. + find $INSTALL_MOD_PATH/lib/modules -iname 'modules.*' -exec rm {} \; + cd - +done + +# Set the module(s) to be executable, so that they will be stripped when packaged. +find %{buildroot} \( -type f -name '*.ko' -o -name '*ko.gz' \) -exec %{__chmod} u+x \{\} \; + +%{__install} -d %{buildroot}%{_sysconfdir}/depmod.d/ +for module in `find %{buildroot}/ -name '*.ko' -o -name '*.ko.gz' | sort` +do +ko_name=${module##*/} +mod_name=${ko_name/.ko*/} +mod_path=${module/*\/%{name}} +mod_path=${mod_path/\/${ko_name}} +%if "%{_vendor}" == "suse" + for flavor in %{flavors_to_build}; do + if [[ $module =~ $flavor ]] || [ "X%{KMP}" != "X1" ];then + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}-$flavor.conf + fi + done +%else + %if 0%{?fedora} + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %else + %if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") + echo "override ${mod_name} * weak-updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif + echo "override ${mod_name} * extra/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif +%endif +done + + +%clean +rm -rf %{buildroot} + +%post +if [ $1 -ge 1 ]; then # 1 : This package is being installed or reinstalled + /sbin/depmod %{KVERSION} +fi # 1 : closed +# END of post + +%postun +/sbin/depmod %{KVERSION} + +%if "%{KMP}" != "1" +%files +%defattr(-,root,root,-) +/lib/modules/%{KVERSION}/%{install_mod_dir}/ +%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*.conf +%endif + +%changelog +* Thu Feb 20 2014 Alaa Hleihel +- Initial packaging diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_verbs.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_verbs.c new file mode 100644 index 0000000..11eecd6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/iser_verbs.c @@ -0,0 +1,970 @@ +/* + * Copyright (c) 2004, 2005, 
2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include + +#include "iscsi_iser.h" + +#define ISCSI_ISER_MAX_CONN 8 +#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ + ISCSI_ISER_MAX_CONN) + +static void iser_qp_event_callback(struct ib_event *cause, void *context) +{ + iser_err("qp event %s (%d)\n", + ib_event_msg(cause->event), cause->event); +} + +static void iser_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + iser_err("async event %s (%d) on device %s port %d\n", + ib_event_msg(event->event), event->event, + dev_name(&event->device->dev), event->element.port_num); +} + +/* + * iser_create_device_ib_res - creates Protection Domain (PD), Completion + * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with + * the adaptor. + * + * Return: 0 on success, -1 on failure + */ +static int iser_create_device_ib_res(struct iser_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { + iser_err("IB device does not support memory registrations\n"); + return -1; + } + + device->pd = ib_alloc_pd(ib_dev, + iser_always_reg ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY); + if (IS_ERR(device->pd)) + goto pd_err; + + INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev, + iser_event_handler); + ib_register_event_handler(&device->event_handler); + return 0; + +pd_err: + iser_err("failed to allocate an IB resource\n"); + return -1; +} + +/* + * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, + * CQ and PD created with the device associated with the adaptor. 
+ */ +static void iser_free_device_ib_res(struct iser_device *device) +{ + ib_unregister_event_handler(&device->event_handler); + ib_dealloc_pd(device->pd); + + device->pd = NULL; +} + +static struct iser_fr_desc * +iser_create_fastreg_desc(struct iser_device *device, + struct ib_pd *pd, + bool pi_enable, + unsigned int size) +{ + struct iser_fr_desc *desc; + struct ib_device *ib_dev = device->ib_device; + enum ib_mr_type mr_type; + int ret; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + return ERR_PTR(-ENOMEM); + + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + mr_type = IB_MR_TYPE_SG_GAPS; + else + mr_type = IB_MR_TYPE_MEM_REG; + + desc->rsc.mr = ib_alloc_mr(pd, mr_type, size); + if (IS_ERR(desc->rsc.mr)) { + ret = PTR_ERR(desc->rsc.mr); + iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); + goto err_alloc_mr; + } + + if (pi_enable) { + desc->rsc.sig_mr = ib_alloc_mr_integrity(pd, size, size); + if (IS_ERR(desc->rsc.sig_mr)) { + ret = PTR_ERR(desc->rsc.sig_mr); + iser_err("Failed to allocate sig_mr err=%d\n", ret); + goto err_alloc_mr_integrity; + } + } + desc->rsc.mr_valid = 0; + + return desc; + +err_alloc_mr_integrity: + ib_dereg_mr(desc->rsc.mr); +err_alloc_mr: + kfree(desc); + + return ERR_PTR(ret); +} + +static void iser_destroy_fastreg_desc(struct iser_fr_desc *desc) +{ + struct iser_reg_resources *res = &desc->rsc; + + ib_dereg_mr(res->mr); + if (res->sig_mr) { + ib_dereg_mr(res->sig_mr); + res->sig_mr = NULL; + } + kfree(desc); +} + +/** + * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors + * for fast registration work requests. + * @ib_conn: connection RDMA resources + * @cmds_max: max number of SCSI commands for this connection + * @size: max number of pages per map request + * + * Return: 0 on success, or errno code on failure + */ +int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, + unsigned cmds_max, + unsigned int size) +{ + struct iser_device *device = ib_conn->device; + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc; + int i, ret; + + INIT_LIST_HEAD(&fr_pool->list); + INIT_LIST_HEAD(&fr_pool->all_list); + spin_lock_init(&fr_pool->lock); + fr_pool->size = 0; + for (i = 0; i < cmds_max; i++) { + desc = iser_create_fastreg_desc(device, device->pd, + ib_conn->pi_support, size); + if (IS_ERR(desc)) { + ret = PTR_ERR(desc); + goto err; + } + + list_add_tail(&desc->list, &fr_pool->list); + list_add_tail(&desc->all_list, &fr_pool->all_list); + fr_pool->size++; + } + + return 0; + +err: + iser_free_fastreg_pool(ib_conn); + return ret; +} + +/** + * iser_free_fastreg_pool - releases the pool of fast_reg descriptors + * @ib_conn: connection RDMA resources + */ +void iser_free_fastreg_pool(struct ib_conn *ib_conn) +{ + struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; + struct iser_fr_desc *desc, *tmp; + int i = 0; + + if (list_empty(&fr_pool->all_list)) + return; + + iser_info("freeing conn %p fr pool\n", ib_conn); + + list_for_each_entry_safe(desc, tmp, &fr_pool->all_list, all_list) { + list_del(&desc->all_list); + iser_destroy_fastreg_desc(desc); + ++i; + } + + if (i < fr_pool->size) + iser_warn("pool still has %d regions registered\n", + fr_pool->size - i); +} + +/* + * iser_create_ib_conn_res - Queue-Pair (QP) + * + * Return: 0 on success, -1 on failure + */ +static int iser_create_ib_conn_res(struct ib_conn *ib_conn) +{ + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_device *device; + struct ib_device *ib_dev; + struct ib_qp_init_attr init_attr; + int ret = -ENOMEM; + 
unsigned int max_send_wr, cq_size; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + ib_dev = device->ib_device; + + if (ib_conn->pi_support) + max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; + else + max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; + max_send_wr = min_t(unsigned int, max_send_wr, + (unsigned int)ib_dev->attrs.max_qp_wr); + + cq_size = max_send_wr + ISER_QP_MAX_RECV_DTOS; + ib_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_SOFTIRQ); + if (IS_ERR(ib_conn->cq)) { + ret = PTR_ERR(ib_conn->cq); + goto cq_err; + } + ib_conn->cq_size = cq_size; + + memset(&init_attr, 0, sizeof(init_attr)); + + init_attr.event_handler = iser_qp_event_callback; + init_attr.qp_context = (void *)ib_conn; + init_attr.send_cq = ib_conn->cq; + init_attr.recv_cq = ib_conn->cq; + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + init_attr.cap.max_send_sge = 2; + init_attr.cap.max_recv_sge = 1; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + init_attr.cap.max_send_wr = max_send_wr; + if (ib_conn->pi_support) + init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(max_send_wr - 1); + + ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); + if (ret) + goto out_err; + + ib_conn->qp = ib_conn->cma_id->qp; + iser_info("setting conn %p cma_id %p qp %p max_send_wr %d\n", ib_conn, + ib_conn->cma_id, ib_conn->cma_id->qp, max_send_wr); + return ret; + +out_err: + ib_cq_pool_put(ib_conn->cq, ib_conn->cq_size); +cq_err: + iser_err("unable to alloc mem or create resource, err %d\n", ret); + + return ret; +} + +/* + * based on the resolved device node GUID see if there already allocated + * device for this device. If there's no such, create one. + */ +static +struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + + mutex_lock(&ig.device_list_mutex); + + list_for_each_entry(device, &ig.device_list, ig_list) + /* find if there's a match using the node GUID */ + if (device->ib_device->node_guid == cma_id->device->node_guid) + goto inc_refcnt; + + device = kzalloc(sizeof *device, GFP_KERNEL); + if (!device) + goto out; + + /* assign this device to the device */ + device->ib_device = cma_id->device; + /* init the device and link it into ig device list */ + if (iser_create_device_ib_res(device)) { + kfree(device); + device = NULL; + goto out; + } + list_add(&device->ig_list, &ig.device_list); + +inc_refcnt: + device->refcount++; +out: + mutex_unlock(&ig.device_list_mutex); + return device; +} + +/* if there's no demand for this device, release it */ +static void iser_device_try_release(struct iser_device *device) +{ + mutex_lock(&ig.device_list_mutex); + device->refcount--; + iser_info("device %p refcount %d\n", device, device->refcount); + if (!device->refcount) { + iser_free_device_ib_res(device); + list_del(&device->ig_list); + kfree(device); + } + mutex_unlock(&ig.device_list_mutex); +} + +/* + * Called with state mutex held + */ +static int iser_conn_state_comp_exch(struct iser_conn *iser_conn, + enum iser_conn_state comp, + enum iser_conn_state exch) +{ + int ret; + + ret = (iser_conn->state == comp); + if (ret) + iser_conn->state = exch; + + return ret; +} + +void iser_release_work(struct work_struct *work) +{ + struct iser_conn *iser_conn; + + iser_conn = container_of(work, struct iser_conn, release_work); + + /* Wait for conn_stop to complete */ + wait_for_completion(&iser_conn->stop_completion); + /* Wait for IB resouces cleanup to complete */ + if 
(!wait_for_completion_timeout(&iser_conn->ib_completion, + msecs_to_jiffies(IB_COMPLETION_TMO))) + iser_info("RDMA cleanup completion timeout expired, conn %p\n", + iser_conn); + + mutex_lock(&iser_conn->state_mutex); + iser_conn->state = ISER_CONN_DOWN; + mutex_unlock(&iser_conn->state_mutex); + + iser_conn_release(iser_conn); +} + +/** + * iser_free_ib_conn_res - release IB related resources + * @iser_conn: iser connection struct + * @destroy: indicator if we need to try to release the + * iser device and memory regoins pool (only iscsi + * shutdown and DEVICE_REMOVAL will use this). + * + * This routine is called with the iser state mutex held + * so the cm_id removal is out of here. It is Safe to + * be invoked multiple times. + */ +static void iser_free_ib_conn_res(struct iser_conn *iser_conn, bool destroy) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + + iser_info("freeing conn %p cma_id %p qp %p\n", + iser_conn, ib_conn->cma_id, ib_conn->qp); + + if (ib_conn->qp) { + rdma_destroy_qp(ib_conn->cma_id); + ib_cq_pool_put(ib_conn->cq, ib_conn->cq_size); + ib_conn->qp = NULL; + } + + if (destroy) { + if (iser_conn->rx_descs) + iser_free_rx_descriptors(iser_conn); + + if (device) { + iser_device_try_release(device); + ib_conn->device = NULL; + } + } +} + +/** + * iser_conn_release - Frees all conn objects and deallocs conn descriptor + * @iser_conn: iSER connection context + */ +void iser_conn_release(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + + mutex_lock(&ig.connlist_mutex); + list_del(&iser_conn->conn_list); + mutex_unlock(&ig.connlist_mutex); + + mutex_lock(&iser_conn->state_mutex); + /* In case we endup here without ep_disconnect being invoked. */ + if (iser_conn->state != ISER_CONN_DOWN) { + iser_warn("iser conn %p state %d, expected state down.\n", + iser_conn, iser_conn->state); + iscsi_destroy_endpoint(iser_conn->ep); + iser_conn->state = ISER_CONN_DOWN; + } + /* + * In case we never got to bind stage, we still need to + * release IB resources (which is safe to call more than once). + */ + iser_free_ib_conn_res(iser_conn, true); + mutex_unlock(&iser_conn->state_mutex); + + if (ib_conn->cma_id) { + rdma_destroy_id(ib_conn->cma_id); + ib_conn->cma_id = NULL; + } + + kfree(iser_conn); +} + +/** + * iser_conn_terminate - triggers start of the disconnect procedures and + * waits for them to be done + * @iser_conn: iSER connection context + * + * Called with state mutex held + */ +int iser_conn_terminate(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + int err = 0; + + /* terminate the iser conn only if the conn state is UP */ + if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + return 0; + + iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state); + + /* suspend queuing of new iscsi commands */ + if (iser_conn->iscsi_conn) + iscsi_suspend_queue(iser_conn->iscsi_conn); + + /* + * In case we didn't already clean up the cma_id (peer initiated + * a disconnection), we need to Cause the CMA to change the QP + * state to ERROR. 
+ */ + if (ib_conn->cma_id) { + err = rdma_disconnect(ib_conn->cma_id); + if (err) + iser_err("Failed to disconnect, conn: 0x%p err %d\n", + iser_conn, err); + + /* block until all flush errors are consumed */ + ib_drain_sq(ib_conn->qp); + } + + return 1; +} + +/* + * Called with state mutex held + */ +static void iser_connect_error(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn; + + iser_conn = cma_id->context; + iser_conn->state = ISER_CONN_TERMINATING; +} + +static void iser_calc_scsi_params(struct iser_conn *iser_conn, + unsigned int max_sectors) +{ + struct iser_device *device = iser_conn->ib_conn.device; + struct ib_device_attr *attr = &device->ib_device->attrs; + unsigned short sg_tablesize, sup_sg_tablesize; + unsigned short reserved_mr_pages; + u32 max_num_sg; + + /* + * FRs without SG_GAPS can only map up to a (device) page per entry, + * but if the first entry is misaligned we'll end up using two entries + * (head and tail) for a single page worth data, so one additional + * entry is required. + */ + if (attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + reserved_mr_pages = 0; + else + reserved_mr_pages = 1; + + if (iser_conn->ib_conn.pi_support) + max_num_sg = attr->max_pi_fast_reg_page_list_len; + else + max_num_sg = attr->max_fast_reg_page_list_len; + + sg_tablesize = DIV_ROUND_UP(max_sectors * SECTOR_SIZE, SZ_4K); + sup_sg_tablesize = min_t(uint, ISCSI_ISER_MAX_SG_TABLESIZE, + max_num_sg - reserved_mr_pages); + iser_conn->scsi_sg_tablesize = min(sg_tablesize, sup_sg_tablesize); + iser_conn->pages_per_mr = + iser_conn->scsi_sg_tablesize + reserved_mr_pages; +} + +/* + * Called with state mutex held + */ +static void iser_addr_handler(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + struct iser_conn *iser_conn; + struct ib_conn *ib_conn; + int ret; + + iser_conn = cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + ib_conn = &iser_conn->ib_conn; + device = iser_device_find_by_ib_device(cma_id); + if (!device) { + iser_err("device lookup/creation failed\n"); + iser_connect_error(cma_id); + return; + } + + ib_conn->device = device; + + /* connection T10-PI support */ + if (iser_pi_enable) { + if (!(device->ib_device->attrs.device_cap_flags & + IB_DEVICE_INTEGRITY_HANDOVER)) { + iser_warn("T10-PI requested but not supported on %s, " + "continue without T10-PI\n", + dev_name(&ib_conn->device->ib_device->dev)); + ib_conn->pi_support = false; + } else { + ib_conn->pi_support = true; + } + } + + iser_calc_scsi_params(iser_conn, iser_max_sectors); + + ret = rdma_resolve_route(cma_id, 1000); + if (ret) { + iser_err("resolve route failed: %d\n", ret); + iser_connect_error(cma_id); + return; + } +} + +/* + * Called with state mutex held + */ +static void iser_route_handler(struct rdma_cm_id *cma_id) +{ + struct rdma_conn_param conn_param; + int ret; + struct iser_cm_hdr req_hdr; + struct iser_conn *iser_conn = cma_id->context; + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct ib_device *ib_dev = ib_conn->device->ib_device; + + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + ret = iser_create_ib_conn_res(ib_conn); + if (ret) + goto failure; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = ib_dev->attrs.max_qp_rd_atom; + conn_param.initiator_depth = 1; + conn_param.retry_count = 7; + conn_param.rnr_retry_count = 6; + + memset(&req_hdr, 0, sizeof(req_hdr)); + req_hdr.flags = ISER_ZBVA_NOT_SUP; + if (!iser_always_reg) + req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP; + 
conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); + + ret = rdma_connect_locked(cma_id, &conn_param); + if (ret) { + iser_err("failure connecting: %d\n", ret); + goto failure; + } + + return; +failure: + iser_connect_error(cma_id); +} + +static void iser_connected_handler(struct rdma_cm_id *cma_id, + const void *private_data) +{ + struct iser_conn *iser_conn; + struct ib_qp_attr attr; + struct ib_qp_init_attr init_attr; + + iser_conn = cma_id->context; + if (iser_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); + iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); + + if (private_data) { + u8 flags = *(u8 *)private_data; + + iser_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP); + } + + iser_info("conn %p: negotiated %s invalidation\n", + iser_conn, iser_conn->snd_w_inv ? "remote" : "local"); + + iser_conn->state = ISER_CONN_UP; + complete(&iser_conn->up_completion); +} + +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *iser_conn = cma_id->context; + + if (iser_conn_terminate(iser_conn)) { + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + else + iser_err("iscsi_iser connection isn't bound\n"); + } +} + +static void iser_cleanup_handler(struct rdma_cm_id *cma_id, + bool destroy) +{ + struct iser_conn *iser_conn = cma_id->context; + + /* + * We are not guaranteed that we visited disconnected_handler + * by now, call it here to be safe that we handle CM drep + * and flush errors. + */ + iser_disconnected_handler(cma_id); + iser_free_ib_conn_res(iser_conn, destroy); + complete(&iser_conn->ib_completion); +} + +static int iser_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct iser_conn *iser_conn; + int ret = 0; + + iser_conn = cma_id->context; + iser_info("%s (%d): status %d conn %p id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cma_id->context, cma_id); + + mutex_lock(&iser_conn->state_mutex); + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + iser_addr_handler(cma_id); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + iser_route_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + iser_connected_handler(cma_id, event->param.conn.private_data); + break; + case RDMA_CM_EVENT_REJECTED: + iser_info("Connection rejected: %s\n", + rdma_reject_msg(cma_id, event->status)); + fallthrough; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + iser_connect_error(cma_id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + iser_cleanup_handler(cma_id, false); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* + * we *must* destroy the device as we cannot rely + * on iscsid to be around to initiate error handling. + * also if we are not in state DOWN implicitly destroy + * the cma_id. 
+ */ + iser_cleanup_handler(cma_id, true); + if (iser_conn->state != ISER_CONN_DOWN) { + iser_conn->ib_conn.cma_id = NULL; + ret = 1; + } + break; + default: + iser_err("Unexpected RDMA CM event: %s (%d)\n", + rdma_event_msg(event->event), event->event); + break; + } + mutex_unlock(&iser_conn->state_mutex); + + return ret; +} + +void iser_conn_init(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + + iser_conn->state = ISER_CONN_INIT; + init_completion(&iser_conn->stop_completion); + init_completion(&iser_conn->ib_completion); + init_completion(&iser_conn->up_completion); + INIT_LIST_HEAD(&iser_conn->conn_list); + mutex_init(&iser_conn->state_mutex); + + ib_conn->reg_cqe.done = iser_reg_comp; +} + +/* + * starts the process of connecting to the target + * sleeps until the connection is established or rejected + */ +int iser_connect(struct iser_conn *iser_conn, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int non_blocking) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + int err = 0; + + mutex_lock(&iser_conn->state_mutex); + + sprintf(iser_conn->name, "%pISp", dst_addr); + + iser_info("connecting to: %s\n", iser_conn->name); + + /* the device is known only --after-- address resolution */ + ib_conn->device = NULL; + + iser_conn->state = ISER_CONN_PENDING; + + ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler, + iser_conn, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ib_conn->cma_id)) { + err = PTR_ERR(ib_conn->cma_id); + iser_err("rdma_create_id failed: %d\n", err); + goto id_failure; + } + + err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000); + if (err) { + iser_err("rdma_resolve_addr failed: %d\n", err); + goto addr_failure; + } + + if (!non_blocking) { + wait_for_completion_interruptible(&iser_conn->up_completion); + + if (iser_conn->state != ISER_CONN_UP) { + err = -EIO; + goto connect_failure; + } + } + mutex_unlock(&iser_conn->state_mutex); + + mutex_lock(&ig.connlist_mutex); + list_add(&iser_conn->conn_list, &ig.connlist); + mutex_unlock(&ig.connlist_mutex); + return 0; + +id_failure: + ib_conn->cma_id = NULL; +addr_failure: + iser_conn->state = ISER_CONN_DOWN; +connect_failure: + mutex_unlock(&iser_conn->state_mutex); + iser_conn_release(iser_conn); + return err; +} + +int iser_post_recvl(struct iser_conn *iser_conn) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct iser_login_desc *desc = &iser_conn->login_desc; + struct ib_recv_wr wr; + int ret; + + desc->sge.addr = desc->rsp_dma; + desc->sge.length = ISER_RX_LOGIN_SIZE; + desc->sge.lkey = ib_conn->device->pd->local_dma_lkey; + + desc->cqe.done = iser_login_rsp; + wr.wr_cqe = &desc->cqe; + wr.sg_list = &desc->sge; + wr.num_sge = 1; + wr.next = NULL; + + ret = ib_post_recv(ib_conn->qp, &wr, NULL); + if (unlikely(ret)) + iser_err("ib_post_recv login failed ret=%d\n", ret); + + return ret; +} + +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc) +{ + struct ib_conn *ib_conn = &iser_conn->ib_conn; + struct ib_recv_wr wr; + int ret; + + rx_desc->cqe.done = iser_task_rsp; + wr.wr_cqe = &rx_desc->cqe; + wr.sg_list = &rx_desc->rx_sg; + wr.num_sge = 1; + wr.next = NULL; + + ret = ib_post_recv(ib_conn->qp, &wr, NULL); + if (unlikely(ret)) + iser_err("ib_post_recv failed ret=%d\n", ret); + + return ret; +} + + +/** + * iser_post_send - Initiate a Send DTO operation + * @ib_conn: connection RDMA resources + * @tx_desc: iSER TX descriptor + * + * Return: 0 on success, -1 on failure + */ +int iser_post_send(struct ib_conn *ib_conn, struct 
iser_tx_desc *tx_desc) +{ + struct ib_send_wr *wr = &tx_desc->send_wr; + struct ib_send_wr *first_wr; + int ret; + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + tx_desc->dma_addr, ISER_HEADERS_LEN, + DMA_TO_DEVICE); + + wr->next = NULL; + wr->wr_cqe = &tx_desc->cqe; + wr->sg_list = tx_desc->tx_sg; + wr->num_sge = tx_desc->num_sge; + wr->opcode = IB_WR_SEND; + wr->send_flags = IB_SEND_SIGNALED; + + if (tx_desc->inv_wr.next) + first_wr = &tx_desc->inv_wr; + else if (tx_desc->reg_wr.wr.next) + first_wr = &tx_desc->reg_wr.wr; + else + first_wr = wr; + + ret = ib_post_send(ib_conn->qp, first_wr, NULL); + if (unlikely(ret)) + iser_err("ib_post_send failed, ret:%d opcode:%d\n", + ret, wr->opcode); + + return ret; +} + +u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir, sector_t *sector) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + struct iser_fr_desc *desc = reg->mem_h; + unsigned long sector_size = iser_task->sc->device->sector_size; + struct ib_mr_status mr_status; + int ret; + + if (desc && desc->sig_protected) { + desc->sig_protected = false; + ret = ib_check_mr_status(desc->rsc.sig_mr, + IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + iser_err("ib_check_mr_status failed, ret %d\n", ret); + /* Not a lot we can do, return ambiguous guard error */ + *sector = 0; + return 0x1; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + sector_t sector_off = mr_status.sig_err.sig_err_offset; + + sector_div(sector_off, sector_size + 8); + *sector = scsi_get_sector(iser_task->sc) + sector_off; + + iser_err("PI error found type %d at sector %llx " + "expected %x vs actual %x\n", + mr_status.sig_err.err_type, + (unsigned long long)*sector, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + return 0x1; + case IB_SIG_BAD_REFTAG: + return 0x3; + case IB_SIG_BAD_APPTAG: + return 0x2; + } + } + } + + return 0; +} + +void iser_err_comp(struct ib_wc *wc, const char *type) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) { + struct iser_conn *iser_conn = to_iser_conn(wc->qp->qp_context); + + iser_err("%s failure: %s (%d) vend_err %#x\n", type, + ib_wc_status_msg(wc->status), wc->status, + wc->vendor_err); + + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + } else { + iser_dbg("%s failure: %s (%d)\n", type, + ib_wc_status_msg(wc->status), wc->status); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/tools/sign-modules b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/tools/sign-modules new file mode 100755 index 0000000..b790769 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/iser/tools/sign-modules @@ -0,0 +1,58 @@ +#! /bin/bash + +moddir=$1; shift +KBUILD=$1; shift + +SOURCES_DIR= +case "$KBUILD" in + *linux-obj*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + */usr/src/linux-*-obj/*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + *) + SOURCES_DIR=$(readlink -f ${KBUILD/build/source}) + ;; +esac +if [ ! -e "$SOURCES_DIR" ]; then + SOURCES_DIR=$KBUILD +fi + +SIGN_FILE= +if [ -e "${KBUILD}/scripts/sign-file" ]; then + SIGN_FILE="${KBUILD}/scripts/sign-file" +elif [ -e "${SOURCES_DIR}/scripts/sign-file" ]; then + SIGN_FILE="${SOURCES_DIR}/scripts/sign-file" +else + echo "Error: Sign tool does not exist at '$KBUILD' or '$SOURCES_DIR' !" 
>&2 + exit 1 +fi +echo "Found Sign tool at: '${SIGN_FILE}'" + +if [ ! -e "${MODULE_SIGN_PRIV_KEY}" ]; then + echo "Error: MODULE_SIGN_PRIV_KEY is not set to valid path!" >&2 + exit 1 +fi +if [ ! -e "${MODULE_SIGN_PUB_KEY}" ]; then + echo "Error: MODULE_SIGN_PUB_KEY is not set to valid path!" >&2 + exit 1 +fi + +modules=`find $moddir -name '*.ko' -o -name '*.ko.gz'` +for mod in $modules +do + dir=`dirname $mod` + file=`basename $mod` + + ${SIGN_FILE} sha256 ${MODULE_SIGN_PRIV_KEY} ${MODULE_SIGN_PUB_KEY} ${dir}/${file} + rm -f ${dir}/${file}.{sig,dig} +done + +RANDOMMOD=$(find $moddir -type f -name '*.ko' -o -name '*.ko.gz' | sort -R | tail -n 1) +if [ "~Module signature appended~" != "$(tail -c 28 $RANDOMMOD)" ]; then + echo "*** Modules are unsigned! ***" + exit 1 +fi + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Kconfig new file mode 100644 index 0000000..798147a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Kconfig @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_ISERT + tristate "iSCSI Extensions for RDMA (iSER) target support" + depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET + help + Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Makefile new file mode 100644 index 0000000..73ba5e9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/Makefile @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0-only +ccflags-y := -Idrivers/target -Idrivers/target/iscsi +obj-$(CONFIG_INFINIBAND_ISERT) += ib_isert.o +obj-$(CONFIG_INFINIBAND_ISERT_DUMMY) += ib_isert.o + +ifeq ($(CONFIG_INFINIBAND_ISERT_DUMMY),m) +ib_isert-y := ib_isert_dummy.o +else ifeq ($(CONFIG_ISCSI_TARGET),) +ib_isert-y := ib_isert_dummy.o +endif + +# Retpoline support: check if this is the right architecture and that +# the kernel does not support it already. +# Alternatively, if we are called from the main mlnx-ofa build system, +# CONFIG_RETPOLINE will be set by the configure script, however +# subdir-ccflags-y will be set by the toplevel Makefile. +ifneq (,$(findstring $(ARCH),i386 x86_64)) + ifndef CONFIG_RETPOLINE + ifneq (,$(shell awk 'BEGIN {if ($(VERSION).$(PATCHLEVEL) < 4.15) {print 1}}' /dev/null | head -1) +kconfig_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/kconfig.h 2> /dev/null | head -1) + +ifneq ($(kconfig_h),) +KCONFIG_H = -include $(kconfig_h) +endif + +V ?= 0 + +# GCC earlier than 4.6.0 will build modules which require 'mcount', +# and this symbol will not be available in the kernel if the kernel was +# compiled with GCC 4.6.0 and above. +# therefore, to prevent unknown symbol issues we disable function tracing. +# +CC = $(CROSS_COMPILE)gcc +CPP = $(CC) -E + +CPP_MAJOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f1) +CPP_MINOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f2) +CPP_PATCH := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f3) +# Assumes that major, minor, and patch cannot exceed 999 +CPP_VERS := $(shell expr 0$(CPP_MAJOR) \* 1000000 + 0$(CPP_MINOR) \* 1000 + 0$(CPP_PATCH)) +compile_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/compile.h 2> /dev/null | head -1) +ifneq ($(compile_h),) +KERNEL_GCC_MAJOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' 
-f1) +KERNEL_GCC_MINOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f2) +KERNEL_GCC_PATCH := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f3) +KERNEL_GCC_VER := $(shell expr 0$(KERNEL_GCC_MAJOR) \* 1000000 + 0$(KERNEL_GCC_MINOR) \* 1000 + 0$(KERNEL_GCC_PATCH)) +ifneq ($(shell if [ $(CPP_VERS) -lt 4006000 ] && [ $(KERNEL_GCC_VER) -ge 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC newer than 4.6.0, while the current GCC is older than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) +override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +ifneq ($(shell if [ $(CPP_VERS) -ge 4006000 ] && [ $(KERNEL_GCC_VER) -lt 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC older than 4.6.0, while the current GCC is newer than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) +override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +endif + +ifneq ($(shell if (echo $(KVER) | grep -qE 'uek'); then \ + echo "YES"; else echo ""; fi),) +override WITH_MAKE_PARAMS += ctf-dir=$(CWD)/.ctf +endif + +name := isert +VERSION=$(shell grep "define _version" $(name).spec | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +RELEASE=$(shell grep "define _release" $(name).spec | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') +PACKAGE := $(name)-$(VERSION) +SHELL = /bin/bash +rpmspec := $(name).spec +rpmroot = $(PWD)/rpm-dist/ +rpmopts = --nodeps --buildroot='$(rpmroot)/_rpm' --define '_source_filedigest_algorithm md5' --define '_binary_filedigest_algorithm md5' +rpmmacros =\ + --define='_topdir $(rpmroot)'\ + --define='_rpmdir $(rpmroot)'\ + --define='_srcrpmdir $(rpmroot)'\ + --define='_sourcedir $(rpmroot)'\ + --define='_specdir $(PWD)' +override WITH_MAKE_PARAMS += KBUILD_EXTRA_SYMBOLS=$(OFA)/Module.symvers + +LINUXINCLUDE=\ + $(EXTRA_CFLAGS) \ + -include $(autoconf_h) \ + $(KCONFIG_H) \ + -include $(OFA)/include/linux/compat-2.6.h \ + -I$(OFA)/include \ + -I$(OFA)/include/uapi \ + $(BACKPORT_INCLUDES) \ + $$(if $$(CONFIG_XEN),-D__XEN_INTERFACE_VERSION__=$$(CONFIG_XEN_INTERFACE_VERSION)) \ + $$(if $$(CONFIG_XEN),-I$$(srctree)/arch/x86/include/mach-xen) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + -Iinclude \ + -I$$(srctree)/arch/$$(SRCARCH)/include/uapi \ + -Iarch/$$(SRCARCH)/include/generated/uapi \ + -I$$(srctree)/include \ + -I$$(srctree)/include/uapi \ + -Iinclude/generated/uapi \ + $$(if $$(KBUILD_SRC),-Iinclude2 -I$$(srctree)/include) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + # + +default: +ifneq ($(shell test -d $(OFA) && echo "true" || echo "" ),) +# compile with ofed driver + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) $(WITH_MAKE_PARAMS) \ + CONFIG_INFINIBAND_ISERT=m \ + CONFIG_INFINIBAND_ISERT_DUMMY= \ + CONFIG_DTRACE= \ + CONFIG_CTF= \ + LINUXINCLUDE='$(LINUXINCLUDE)' \ + modules +else +# compile with inbox driver + make EXTRA_CFLAGS="$(EXTRA_CFLAGS)" -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) $(WITH_MAKE_PARAMS) \ + modules +endif + +install: + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) $(WITH_MAKE_PARAMS) modules_install + if [ ! 
-n "$(INSTALL_MOD_PATH)" ]; then /sbin/depmod $(KVER);fi; + +rpmcheck: + @which rpmbuild &> /dev/null; \ + if [ $$? -ne 0 ]; then \ + echo "*** This make target requires an rpm-based linux distribution."; \ + (exit 1); exit 1; \ + fi + -mkdir -p $(rpmroot)/BUILD + +srcrpm: dist rpmcheck $(rpmspec) + -rpmbuild -bs --define 'src_release $(RELEASE)' $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +binrpm: rpmcheck $(rpmspec) + -rpmbuild -bb $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +dist: + mkdir -p $(rpmroot)/$(PACKAGE)/ + cp {$(rpmspec),Kconfig,makefile,Makefile,dkms.conf,Module.supported} $(rpmroot)/$(PACKAGE)/ + cp common.postinst $(rpmroot)/$(PACKAGE)/ + cp *.c $(rpmroot)/$(PACKAGE)/ + cp *.h $(rpmroot)/$(PACKAGE)/ + cp -r debian $(rpmroot)/$(PACKAGE)/ + cp -r tools $(rpmroot)/$(PACKAGE)/ + cd $(rpmroot) && tar czf $(PACKAGE).tgz $(PACKAGE) + cd $(rpmroot) && tar czf $(name)_$(VERSION).orig.tar.gz $(PACKAGE) + +clean: + rm -f *.o + rm -f *.ko *.ko.gz + rm -f *.mod.c + rm -f Module*.symvers modules*.order + +distclean: clean + @rm -rf $(PWD)/rpm-dist + rm -f makefile *.spec + +all: clean distclean dist srcrpm binrpm diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/autogen.sh b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/autogen.sh new file mode 100755 index 0000000..11a1077 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/autogen.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +name=isert +version=$(grep "define _version" ${name}_spec_ | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +release=$(grep "define _release" ${name}_spec_ | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') +/bin/cp -f ${name}_spec_ ${name}.spec +/bin/cp -f _makefile_ makefile +/bin/sed -i -r "s/^$name \(([0-9.-]+)\) (.*)/$name \($version-$release\) \2/" debian/changelog diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/common.postinst b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/common.postinst new file mode 100755 index 0000000..bbf9aad --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/common.postinst @@ -0,0 +1,296 @@ +#!/bin/sh +# Copyright (C) 2002-2005 Flavio Stanchina +# Copyright (C) 2005-2006 Aric Cyr +# Copyright (C) 2007 Mario Limonciello +# Copyright (C) 2009 Alberto Milone + +set -e + +uname_s=$(uname -s) + +_get_kernel_dir() { + KVER=$1 + case ${uname_s} in + Linux) DIR="/lib/modules/$KVER/build" ;; + GNU/kFreeBSD) DIR="/usr/src/kfreebsd-headers-$KVER/sys" ;; + esac + echo $DIR +} + +_check_kernel_dir() { + DIR=$(_get_kernel_dir $1) + case ${uname_s} in + Linux) test -e $DIR/include ;; + GNU/kFreeBSD) test -e $DIR/kern && test -e $DIR/conf/kmod.mk ;; + *) return 1 ;; + esac + return $? +} + +# Check the existence of a kernel named as $1 +_is_kernel_name_correct() { + CORRECT="no" + KERNEL_NAME=$1 + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + if [ "${KERNEL}" = "${KERNEL_NAME}" ]; then + CORRECT="yes" + break + fi + done + + echo $CORRECT +} + + +# Get the most recent kernel on Debian based systems. This keeps +# into account both the version and the ABI. If the current kernel +# is the most recent kernel then the function will print a null string. 
+_get_newest_kernel_debian() { + NEWEST_KERNEL= + NEWEST_VERSION= + NEWEST_ABI= + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + KERNEL_VERSION=${KERNEL%%-*} + ABI=${KERNEL#*-} + ABI=${ABI%%-*} + + if [ -z "$NEWEST_KERNEL" ]; then + # The 1st time get a version which is bigger than $1 + COMPARE_TO=$1 + else + # Get the biggest version + COMPARE_TO="$NEWEST_VERSION-$NEWEST_ABI" + fi + + # if $kernel is greater than $COMPARE_TO + if [ `dpkg --compare-versions "$KERNEL_VERSION-$ABI" gt "$COMPARE_TO" && echo "yes" || \ + echo "no"` = "yes" ]; then + NEWEST_KERNEL=$KERNEL + NEWEST_VERSION=$KERNEL_VERSION + NEWEST_ABI=$ABI + fi + done + + echo "$NEWEST_KERNEL" +} + +# Get the most recent kernel in Rhel based systems. If the current kernel +# is the most recent kernel then the function will print a null string. +_get_newest_kernel_rhel() { + NEWEST_KERNEL= + + LAST_INSTALLED_KERNEL=$(rpm -q --whatprovides kernel --last | grep kernel -m1 | cut -f1 -d' ') + + LIK_FORMATTED_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{VERSION}-%{RELEASE}.%{ARCH}\n") + + if [ `echo $LIK_FORMATTED_NAME | grep 2.6 >/dev/null` ]; then + # Fedora and Suse + NEWEST_KERNEL=$LIK_FORMATTED_NAME + else + # Hack for Mandriva where $LIK_FORMATTED_NAME is broken + LIK_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{NAME}\n") + LIK_TYPE=${LIK_NAME#kernel-} + LIK_TYPE=${LIK_TYPE%%-*} + LIK_STRIPPED=${LIK_NAME#kernel-} + LIK_STRIPPED=${LIK_STRIPPED#$LIK_TYPE-} + LIK_STRIPPED_BASE=${LIK_STRIPPED%%-*} + LIK_STRIPPED_END=${LIK_STRIPPED#$LIK_STRIPPED_BASE-} + LIK_FINAL=$LIK_STRIPPED_BASE-$LIK_TYPE-$LIK_STRIPPED_END + + NEWEST_KERNEL=$LIK_FINAL + fi + + echo $NEWEST_KERNEL +} + +# Get the newest kernel on Debian and Rhel based systems. +get_newest_kernel() { + NEWEST_KERNEL= + # Try Debian first as rpm can be installed in Debian based distros + if [ -e /usr/bin/dpkg ]; then + # If DEB based + CURRENT_KERNEL=$1 + CURRENT_VERSION=${CURRENT_KERNEL%%-*} + CURRENT_ABI=${CURRENT_KERNEL#*-} + CURRENT_FLAVOUR=${CURRENT_ABI#*-} + CURRENT_ABI=${CURRENT_ABI%%-*} + NEWEST_KERNEL=$(_get_newest_kernel_debian "$CURRENT_VERSION-$CURRENT_ABI") + + elif [ `which rpm >/dev/null` ]; then + # If RPM based + NEWEST_KERNEL=$(_get_newest_kernel_rhel) + fi + + # Make sure that kernel name that we extracted corresponds to an installed + # kernel + if [ -n "$NEWEST_KERNEL" ] && [ `_is_kernel_name_correct $NEWEST_KERNEL` = "no" ]; then + NEWEST_KERNEL= + fi + + echo $NEWEST_KERNEL +} + +NAME=$1 +VERSION=$2 +TARBALL_ROOT=$3 +ARCH=$4 +UPGRADE=$5 + +if [ -z "$NAME" ] || [ -z "$VERSION" ]; then + echo "Need NAME, and VERSION defined" + echo "ARCH is optional" + exit 1 +fi + +KERNELS=$(ls /lib/modules/ 2>/dev/null || true) +CURRENT_KERNEL=$(uname -r) + +#We never want to keep an older version side by side to prevent conflicts +if [ -e "/var/lib/dkms/$NAME/$VERSION" ]; then + echo "Removing old $NAME-$VERSION DKMS files..." + dkms remove -m $NAME -v $VERSION --all +fi + +#Load new files, by source package and by tarball +if [ -f "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz" ]; then + if ! dkms ldtarball --archive "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz"; then + echo "" + echo "" + echo "Unable to load DKMS tarball $TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz." + echo "Common causes include: " + echo " - You must be using DKMS 2.1.0.0 or later to support binaries only" + echo " distribution specific archives." 
+ echo " - Corrupt distribution specific archive" + echo "" + echo "" + exit 2 + fi +elif [ -d "/usr/src/$NAME-$VERSION" ]; then + echo "Loading new $NAME-$VERSION DKMS files..." + dkms add -m $NAME -v $VERSION > /dev/null +fi + +# On 1st installation, let us look for a directory +# in /lib/modules which matches `uname -r`. If none +# is found it is possible that buildd is being used +# and that uname -r is giving us the name of the +# kernel used by the buildd machine. +# +# If this is the case we try to build the kernel +# module for each kernel which has a directory in +# /lib/modules. Furthermore we will have to tell +# DKMS which architecture it should build the module +# for (e.g. if the buildd machine is using a +# 2.6.24-23-xen 64bit kernel). +# +# NOTE: if the headers are not installed then the +# module won't be built, as usual +if [ -z "$UPGRADE" ]; then + echo "First Installation: checking all kernels..." + for KERNEL in $KERNELS; do + if [ ${KERNEL} = ${CURRENT_KERNEL} ]; then + # Kernel found + KERNELS=$CURRENT_KERNEL + break + fi + done +else + KERNELS=$CURRENT_KERNEL +fi + +# Here we look for the most recent kernel so that we can +# build the module for it (in addition to doing it for the +# current kernel. +NEWEST_KERNEL=$(get_newest_kernel "$KERNELS") + +# If the current kernel doesn't come from the host of a chroot +if [ `_is_kernel_name_correct $CURRENT_KERNEL` = "yes" ]; then + # See if it's worth building the module for both the newest kernel + # and for the current kernel + if [ -n "$NEWEST_KERNEL" ] && [ ${CURRENT_KERNEL} != ${NEWEST_KERNEL} ]; then + echo "Building for $CURRENT_KERNEL and $NEWEST_KERNEL" + KERNELS="$CURRENT_KERNEL $NEWEST_KERNEL" + else + echo "Building only for $CURRENT_KERNEL" + fi +# The current kernel is not useful as it's the host's +else + echo "It is likely that $CURRENT_KERNEL belongs to a chroot's host" + + # Let's use only the newest kernel + if [ -n "$NEWEST_KERNEL" ]; then + KERNELS="$NEWEST_KERNEL" + echo "Building only for $NEWEST_KERNEL" + fi +fi + +if [ -n "$ARCH" ]; then + if which lsb_release >/dev/null && [ $(lsb_release -s -i) = "Ubuntu" ]; then + case $ARCH in + amd64) + ARCH="x86_64" + ;; + lpia|i?86) + ARCH="i686" + ;; + esac + fi + echo "Building for architecture $ARCH" + ARCH="-a $ARCH" +fi + +for KERNEL in $KERNELS; do + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + if [ `echo $KERNEL | grep -c "BOOT"` -gt 0 ]; then + echo "" + echo "Module build and install for $KERNEL was skipped as " + echo "it is a BOOT variant" + continue + fi + + + #if the module isn't yet built, try to build it + if [ `echo $dkms_status | grep -c ": built"` -eq 0 ]; then + if [ ! -L /var/lib/dkms/$NAME/$VERSION/source ]; then + echo "This package appears to be a binaries-only package" + echo " you will not be able to build against kernel $KERNEL" + echo " since the package source was not provided" + continue + fi + if _check_kernel_dir $KERNEL; then + echo "Building initial module for $KERNEL" + set +e + dkms build -m $NAME -v $VERSION -k $KERNEL $ARCH > /dev/null + rc=$? + case $rc in + 9) + set -e + echo "Skipped." + continue + ;; + 0) + set -e + echo "Done." + ;; + *) + exit $rc + ;; + esac + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + else + echo "Module build for the currently running kernel was skipped since the" + echo "kernel source for this kernel does not seem to be installed." 
+ fi + fi + + #if the module is built (either pre-built or just now), install it + if [ `echo $dkms_status | grep -c ": built"` -eq 1 ] && + [ `echo $dkms_status | grep -c ": installed"` -eq 0 ]; then + dkms install -m $NAME -v $VERSION -k $KERNEL $ARCH --force + fi +done + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/changelog b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/changelog new file mode 100644 index 0000000..be6a390 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/changelog @@ -0,0 +1,5 @@ +isert (0.0-0) unstable; urgency=low + + * Initial release. + + -- Alaa Hleihel Sun, 16 Feb 2014 17:30:53 +0200 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/compat b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/compat new file mode 100644 index 0000000..45a4fb7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/compat @@ -0,0 +1 @@ +8 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control new file mode 100644 index 0000000..d075b61 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control @@ -0,0 +1,17 @@ +Source: isert +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, dkms +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: isert-dkms +Section: kernel +Architecture: all +Depends: dkms, make, mlnx-ofed-kernel-dkms, ${misc:Depends} +Recommends: linux-headers-arm64 | linux-headers-powerpc | linux-headers-ppc64 | linux-headers-ppc64le | linux-headers-amd64 | linux-headers | linux-headers-generic +Description: DKMS support fo isert kernel modules + This package provides integration with the DKMS infrastructure for automatically building out of tree kernel modules. + . + This package contains the source to be built with dkms. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control.no_dkms b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control.no_dkms new file mode 100644 index 0000000..4774408 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/control.no_dkms @@ -0,0 +1,14 @@ +Source: isert +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, make +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: isert-modules +Section: kernel +Architecture: any +Depends: mlnx-ofed-kernel-modules, ${misc:Depends} +Description: isert kernel modules + This package provides the binary code for the isert kernel modules. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/copyright b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/copyright new file mode 100644 index 0000000..53aa878 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/copyright @@ -0,0 +1,19 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ + +Files: * +Copyright: Copyright 2017 Mellanox Technologies +License: GPL-2 + Mellanox OFED (MLNX_OFED) Software distributed under the terms of the GNU General Public License ("GPL") version 2 as published by the Free Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.postinst b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.postinst new file mode 100755 index 0000000..29085fd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.postinst @@ -0,0 +1,43 @@ +#!/bin/sh +set -e + +# Get the package version +NAME=isert +PACKAGE_NAME=$NAME-dkms +CVERSION=`dpkg-query -W -f='${Version}' $PACKAGE_NAME | awk -F "-" '{print $1}' | cut -d\: -f2` +ARCH=`uname -m` + +dkms_configure () { + POSTINST="/usr/src/$NAME-$CVERSION/common.postinst" + if [ -f "$POSTINST" ]; then + "$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2" + return $? + fi + echo "WARNING: $POSTINST does not exist." >&2 + echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2 + echo "built with legacy DKMS support." >&2 + echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2 + echo "support or upgrade DKMS to a more current version." >&2 + return 1 +} + +case "$1" in + configure) + dkms_configure + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.prerm b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.prerm new file mode 100755 index 0000000..72df0e6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/isert-dkms.prerm @@ -0,0 +1,13 @@ +#!/bin/sh +set -e + +# Get the package version +package=isert +version=`dpkg-query -W -f='${Version}' "$package-dkms" \ + | sed -e 's/[+-].*//'` + +dkms remove -m "$package" -v "$version" --all || true + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/rules b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/rules new file mode 100755 index 0000000..be257b9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/rules @@ -0,0 +1,109 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. 
+# +# This version is for a hypothetical package that can build a kernel modules +# architecture-dependant package via make-kpkg, as well as an +# architecture-independent module source package, and other packages +# either dep/indep for things like common files or userspace components +# needed for the kernel modules. + +# Uncomment this to turn on verbose mode. +#export DH_VERBOSE=1 + +WITH_DKMS ?= 1 +WITH_MOD_SIGN ?= 0 +MLXNUMC = $(shell grep ^processor /proc/cpuinfo | wc -l) +NJOBS ?= $(shell if [ $(MLXNUMC) -lt 16 ]; then echo $(MLXNUMC); else echo 16; fi) + +pname:=isert +psource:=$(pname)-source +ifeq ($(WITH_DKMS),1) +pdkms:=$(pname)-dkms +else +pdkms:=$(pname)-modules +endif + +pversion := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-.\+/\1/p') +prel := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-\(.\+\)/\2/p') + +export INSTALL_MOD_DIR:=updates +export INSTALL_MOD_PATH:=$(CURDIR)/debian/$(pdkms) + +DIST_NAME := $(shell lsb_release -si) +DIST_RELEASE := $(DIST_NAME)/$(shell lsb_release -sc) + + +KVER ?= $(shell uname -r) +KVER1 = $(shell echo $(KVER) | sed -e 's/_/-/g') +K_BUILD ?= "/lib/modules/$(KVER)/build" + +%: +ifeq ($(WITH_DKMS),1) + dh $@ --with dkms +else + dh $@ +endif + +override_dh_auto_clean: + +override_dh_auto_configure: + +override_dh_auto_build: +ifneq ($(WITH_DKMS),1) + @echo Building for $(KVER) + make clean || true + make -j$(NJOBS) KVER=$(KVER) K_BUILD=$(K_BUILD) +endif + +override_dh_auto_test: + +override_dh_auto_install: +ifneq ($(WITH_DKMS),1) + make install INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) KERNELRELEASE=$(KVER) KVER=$(KVER) K_BUILD=$(K_BUILD) + find $(INSTALL_MOD_PATH) \( -type f -a -name "modules.*" \) -delete +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif + + # For dkms +ifeq ($(WITH_DKMS),1) + dh_installdirs -p$(pdkms) usr/src/$(pname)-$(pversion) + cp Kconfig debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp makefile debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp Makefile debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp dkms.conf debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp common.postinst debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp *.c debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp *.h debian/$(pdkms)/usr/src/$(pname)-$(pversion)/ + + # Force DKMS to install our modules. + # This is mostly needed for modules that do not have a version number info, as DKMS + # will compare their srcversion field, which does not really say which module is newer. 
+ dh_installdirs -p$(pdkms) usr/share/dkms/modules_to_force_install/ + echo "$(pname)" > debian/$(pdkms)/usr/share/dkms/modules_to_force_install/$(pname).force +endif + +override_dh_installinit: + + +ifneq ($(WITH_DKMS),1) +override_dh_gencontrol: + dh_gencontrol -- -v$(pversion)-$(prel).kver.$(KVER1) +endif + +ifneq ($(MLNX_KO_NO_STRIP),1) +ifneq ($(WITH_DKMS),1) +override_dh_strip: + dh_strip + find debian -name '*.ko' | xargs strip -g +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif +endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/source/format b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/dkms.conf b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/dkms.conf new file mode 100644 index 0000000..aeaeb10 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/dkms.conf @@ -0,0 +1,19 @@ +# DKMS module name and version +PACKAGE_NAME="isert" +PACKAGE_VERSION="4.0" + +kernelver=${kernelver:-$(uname -r)} +kernel_source_dir=${kernel_source_dir:-/lib/modules/$kernelver/build} + +# Module name, source and destination directories, and build command-line +BUILT_MODULE_NAME[0]="ib_isert" +BUILT_MODULE_LOCATION[0]="./" +DEST_MODULE_LOCATION[0]="/kernel/../updates/" +MAKE="make -j`MLXNUMC=$(grep ^processor /proc/cpuinfo | wc -l) && echo $(($MLXNUMC<16?$MLXNUMC:16))` KVER=$kernelver K_BUILD=$kernel_source_dir" + +# Cleanup command-line +CLEAN="make clean" + +# disable autoinstall since this module depends on mlnx-ofed-kernel-dkms +# mlnx-ofed-kernel-dkms will build this module on POST_INSTALL +AUTOINSTALL= diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.c new file mode 100644 index 0000000..06f1037 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.c @@ -0,0 +1,2703 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/******************************************************************************* + * This file contains iSCSI extentions for RDMA (iSER) Verbs + * + * (c) Copyright 2013 Datera, Inc. + * + * Nicholas A. 
Bellinger + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ib_isert.h" + +static int isert_debug_level; +module_param_named(debug_level, isert_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)"); + +static int isert_sg_tablesize_set(const char *val, + const struct kernel_param *kp); +static const struct kernel_param_ops sg_tablesize_ops = { + .set = isert_sg_tablesize_set, + .get = param_get_int, +}; + +static int isert_sg_tablesize = ISCSI_ISER_MIN_SG_TABLESIZE; +module_param_cb(sg_tablesize, &sg_tablesize_ops, &isert_sg_tablesize, 0644); +MODULE_PARM_DESC(sg_tablesize, + "Number of gather/scatter entries in a single scsi command, should >= 128 (default: 128, max: 4096)"); + +static DEFINE_MUTEX(device_list_mutex); +static LIST_HEAD(device_list); +static struct workqueue_struct *isert_comp_wq; +static struct workqueue_struct *isert_release_wq; + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); +static int +isert_login_post_recv(struct isert_conn *isert_conn); +static int +isert_rdma_accept(struct isert_conn *isert_conn); +struct rdma_cm_id *isert_setup_id(struct isert_np *isert_np); + +static void isert_release_work(struct work_struct *work); +static void isert_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void isert_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void isert_login_send_done(struct ib_cq *cq, struct ib_wc *wc); + +static int isert_sg_tablesize_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (ret != 0 || n < ISCSI_ISER_MIN_SG_TABLESIZE || + n > ISCSI_ISER_MAX_SG_TABLESIZE) + return -EINVAL; + + return param_set_int(val, kp); +} + +static inline bool +isert_prot_cmd(struct isert_conn *conn, struct se_cmd *cmd) +{ + return (conn->pi_support && + cmd->prot_op != TARGET_PROT_NORMAL); +} + +static void +isert_qp_event_callback(struct ib_event *e, void *context) +{ + struct isert_conn *isert_conn = context; + + isert_err("%s (%d): conn %p\n", + ib_event_msg(e->event), e->event, isert_conn); + + switch (e->event) { + case IB_EVENT_COMM_EST: + rdma_notify(isert_conn->cm_id, IB_EVENT_COMM_EST); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + isert_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED\n"); + break; + default: + break; + } +} + +static struct ib_qp * +isert_create_qp(struct isert_conn *isert_conn, + struct rdma_cm_id *cma_id) +{ + u32 cq_size = ISERT_QP_MAX_REQ_DTOS + ISERT_QP_MAX_RECV_DTOS + 2; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct ib_qp_init_attr attr; + int ret, factor; + + isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_WORKQUEUE); + if (IS_ERR(isert_conn->cq)) { + isert_err("Unable to allocate cq\n"); + ret = PTR_ERR(isert_conn->cq); + return ERR_PTR(ret); + } + isert_conn->cq_size = cq_size; + + memset(&attr, 0, sizeof(struct ib_qp_init_attr)); + attr.event_handler = isert_qp_event_callback; + attr.qp_context = isert_conn; + attr.send_cq = isert_conn->cq; + attr.recv_cq = isert_conn->cq; + attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1; + attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; + factor = rdma_rw_mr_factor(device->ib_device, cma_id->port_num, + isert_sg_tablesize); + 
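+	/*
+	 * rdma_rw_mr_factor() reports roughly how many MRs a single RDMA
+	 * READ/WRITE context needs to cover a scatterlist of up to
+	 * isert_sg_tablesize entries on this port, so the max_rdma_ctxs
+	 * value below reserves enough R/W contexts for
+	 * ISCSI_DEF_XMIT_CMDS_MAX commands in flight at once.
+	 */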
attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX * factor; + attr.cap.max_send_sge = device->ib_device->attrs.max_send_sge; + attr.cap.max_recv_sge = 1; + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + attr.qp_type = IB_QPT_RC; + if (device->pi_capable) + attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + + if (device->sig_pipeline) + attr.create_flags |= IB_QP_CREATE_SIGNATURE_PIPELINE; + + ret = rdma_create_qp(cma_id, device->pd, &attr); + if (ret) { + isert_err("rdma_create_qp failed for cma_id %d\n", ret); + ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size); + + return ERR_PTR(ret); + } + + return cma_id->qp; +} + +static int +isert_alloc_rx_descriptors(struct isert_conn *isert_conn) +{ + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct iser_rx_desc *rx_desc; + struct ib_sge *rx_sg; + u64 dma_addr; + int i, j; + + isert_conn->rx_descs = kcalloc(ISERT_QP_MAX_RECV_DTOS, + sizeof(struct iser_rx_desc), + GFP_KERNEL); + if (!isert_conn->rx_descs) + return -ENOMEM; + + rx_desc = isert_conn->rx_descs; + + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + dma_addr = ib_dma_map_single(ib_dev, rx_desc->buf, + ISER_RX_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) + goto dma_map_fail; + + rx_desc->dma_addr = dma_addr; + + rx_sg = &rx_desc->rx_sg; + rx_sg->addr = rx_desc->dma_addr + isert_get_hdr_offset(rx_desc); + rx_sg->length = ISER_RX_PAYLOAD_SIZE; + rx_sg->lkey = device->pd->local_dma_lkey; + rx_desc->rx_cqe.done = isert_recv_done; + } + + return 0; + +dma_map_fail: + rx_desc = isert_conn->rx_descs; + for (j = 0; j < i; j++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); + } + kfree(isert_conn->rx_descs); + isert_conn->rx_descs = NULL; + isert_err("conn %p failed to allocate rx descriptors\n", isert_conn); + return -ENOMEM; +} + +static void +isert_free_rx_descriptors(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->device->ib_device; + struct iser_rx_desc *rx_desc; + int i; + + if (!isert_conn->rx_descs) + return; + + rx_desc = isert_conn->rx_descs; + for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { + ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); + } + + kfree(isert_conn->rx_descs); + isert_conn->rx_descs = NULL; +} + +static int +isert_create_device_ib_res(struct isert_device *device) +{ + struct ib_device *ib_dev = device->ib_device; + int ret; + + isert_dbg("devattr->max_send_sge: %d devattr->max_recv_sge %d\n", + ib_dev->attrs.max_send_sge, ib_dev->attrs.max_recv_sge); + isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); + + device->pd = ib_alloc_pd(ib_dev, 0); + if (IS_ERR(device->pd)) { + ret = PTR_ERR(device->pd); + isert_err("failed to allocate pd, device %p, ret=%d\n", + device, ret); + return ret; + } + + /* Check signature cap */ + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER) + device->pi_capable = true; + else + device->pi_capable = false; + + device->sig_pipeline = ib_dev->attrs.device_cap_flags & + IB_DEVICE_SIGNATURE_PIPELINE ? 
true : false; + + + return 0; +} + +static void +isert_free_device_ib_res(struct isert_device *device) +{ + isert_info("device %p\n", device); + + ib_dealloc_pd(device->pd); +} + +static void +isert_device_put(struct isert_device *device) +{ + mutex_lock(&device_list_mutex); + device->refcount--; + isert_info("device %p refcount %d\n", device, device->refcount); + if (!device->refcount) { + isert_free_device_ib_res(device); + list_del(&device->dev_node); + kfree(device); + } + mutex_unlock(&device_list_mutex); +} + +static struct isert_device * +isert_device_get(struct rdma_cm_id *cma_id) +{ + struct isert_device *device; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(device, &device_list, dev_node) { + if (device->ib_device->node_guid == cma_id->device->node_guid) { + device->refcount++; + isert_info("Found iser device %p refcount %d\n", + device, device->refcount); + mutex_unlock(&device_list_mutex); + return device; + } + } + + device = kzalloc(sizeof(struct isert_device), GFP_KERNEL); + if (!device) { + mutex_unlock(&device_list_mutex); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&device->dev_node); + + device->ib_device = cma_id->device; + ret = isert_create_device_ib_res(device); + if (ret) { + kfree(device); + mutex_unlock(&device_list_mutex); + return ERR_PTR(ret); + } + + device->refcount++; + list_add_tail(&device->dev_node, &device_list); + isert_info("Created a new iser device %p refcount %d\n", + device, device->refcount); + mutex_unlock(&device_list_mutex); + + return device; +} + +static void +isert_init_conn(struct isert_conn *isert_conn) +{ + isert_conn->state = ISER_CONN_INIT; + INIT_LIST_HEAD(&isert_conn->node); + init_completion(&isert_conn->login_comp); + init_completion(&isert_conn->login_req_comp); + init_waitqueue_head(&isert_conn->rem_wait); + kref_init(&isert_conn->kref); + mutex_init(&isert_conn->mutex); + INIT_WORK(&isert_conn->release_work, isert_release_work); +} + +static void +isert_free_login_buf(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->device->ib_device; + + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_PAYLOAD_SIZE, DMA_TO_DEVICE); + kfree(isert_conn->login_rsp_buf); + + ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); + kfree(isert_conn->login_desc); +} + +static int +isert_alloc_login_buf(struct isert_conn *isert_conn, + struct ib_device *ib_dev) +{ + int ret; + + isert_conn->login_desc = kzalloc(sizeof(*isert_conn->login_desc), + GFP_KERNEL); + if (!isert_conn->login_desc) + return -ENOMEM; + + isert_conn->login_desc->dma_addr = ib_dma_map_single(ib_dev, + isert_conn->login_desc->buf, + ISER_RX_SIZE, DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_desc->dma_addr); + if (ret) { + isert_err("login_desc dma mapping error: %d\n", ret); + isert_conn->login_desc->dma_addr = 0; + goto out_free_login_desc; + } + + isert_conn->login_rsp_buf = kzalloc(ISER_RX_PAYLOAD_SIZE, GFP_KERNEL); + if (!isert_conn->login_rsp_buf) { + ret = -ENOMEM; + goto out_unmap_login_desc; + } + + isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev, + isert_conn->login_rsp_buf, + ISER_RX_PAYLOAD_SIZE, DMA_TO_DEVICE); + ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma); + if (ret) { + isert_err("login_rsp_dma mapping error: %d\n", ret); + isert_conn->login_rsp_dma = 0; + goto out_free_login_rsp_buf; + } + + return 0; + +out_free_login_rsp_buf: + kfree(isert_conn->login_rsp_buf); +out_unmap_login_desc: + 
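+	/*
+	 * Undo the ib_dma_map_single() of login_desc->buf performed above,
+	 * using the same size (ISER_RX_SIZE) and direction
+	 * (DMA_FROM_DEVICE), before freeing the descriptor itself.
+	 */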
ib_dma_unmap_single(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); +out_free_login_desc: + kfree(isert_conn->login_desc); + return ret; +} + +static void +isert_set_nego_params(struct isert_conn *isert_conn, + struct rdma_conn_param *param) +{ + struct ib_device_attr *attr = &isert_conn->device->ib_device->attrs; + + /* Set max inflight RDMA READ requests */ + isert_conn->initiator_depth = min_t(u8, param->initiator_depth, + attr->max_qp_init_rd_atom); + isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth); + + if (param->private_data) { + u8 flags = *(u8 *)param->private_data; + + /* + * use remote invalidation if the both initiator + * and the HCA support it + */ + isert_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP) && + (attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (isert_conn->snd_w_inv) + isert_info("Using remote invalidation\n"); + } +} + +static void +isert_destroy_qp(struct isert_conn *isert_conn) +{ + ib_destroy_qp(isert_conn->qp); + ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size); +} + +static int +isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct isert_np *isert_np = cma_id->context; + struct iscsi_np *np = isert_np->np; + struct isert_conn *isert_conn; + struct isert_device *device; + int ret = 0; + + spin_lock_bh(&np->np_thread_lock); + if (!np->enabled) { + spin_unlock_bh(&np->np_thread_lock); + isert_dbg("iscsi_np is not enabled, reject connect request\n"); + return rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); + } + spin_unlock_bh(&np->np_thread_lock); + + isert_dbg("cma_id: %p, portal: %p\n", + cma_id, cma_id->context); + + isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); + if (!isert_conn) + return -ENOMEM; + + isert_init_conn(isert_conn); + isert_conn->cm_id = cma_id; + + device = isert_device_get(cma_id); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto out; + } + isert_conn->device = device; + + ret = isert_alloc_login_buf(isert_conn, cma_id->device); + if (ret) + goto out_conn_dev; + + isert_set_nego_params(isert_conn, &event->param.conn); + + isert_conn->qp = isert_create_qp(isert_conn, cma_id); + if (IS_ERR(isert_conn->qp)) { + ret = PTR_ERR(isert_conn->qp); + goto out_rsp_dma_map; + } + + ret = isert_login_post_recv(isert_conn); + if (ret) + goto out_destroy_qp; + + ret = isert_rdma_accept(isert_conn); + if (ret) + goto out_destroy_qp; + + mutex_lock(&isert_np->mutex); + list_add_tail(&isert_conn->node, &isert_np->accepted); + mutex_unlock(&isert_np->mutex); + + return 0; + +out_destroy_qp: + isert_destroy_qp(isert_conn); +out_rsp_dma_map: + isert_free_login_buf(isert_conn); +out_conn_dev: + isert_device_put(device); +out: + kfree(isert_conn); + rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); + return ret; +} + +static void +isert_connect_release(struct isert_conn *isert_conn) +{ + struct isert_device *device = isert_conn->device; + + isert_dbg("conn %p\n", isert_conn); + + BUG_ON(!device); + + isert_free_rx_descriptors(isert_conn); + if (isert_conn->cm_id && + !isert_conn->dev_removed) + rdma_destroy_id(isert_conn->cm_id); + + if (isert_conn->qp) + isert_destroy_qp(isert_conn); + + if (isert_conn->login_desc) + isert_free_login_buf(isert_conn); + + isert_device_put(device); + + if (isert_conn->dev_removed) + wake_up_interruptible(&isert_conn->rem_wait); + else + kfree(isert_conn); +} + +static void +isert_connected_handler(struct rdma_cm_id *cma_id) +{ + struct isert_conn *isert_conn = cma_id->qp->qp_context; + 
struct isert_np *isert_np = cma_id->context; + + isert_info("conn %p\n", isert_conn); + + mutex_lock(&isert_conn->mutex); + isert_conn->state = ISER_CONN_UP; + kref_get(&isert_conn->kref); + mutex_unlock(&isert_conn->mutex); + + mutex_lock(&isert_np->mutex); + list_move_tail(&isert_conn->node, &isert_np->pending); + mutex_unlock(&isert_np->mutex); + + isert_info("np %p: Allow accept_np to continue\n", isert_np); + up(&isert_np->sem); +} + +static void +isert_release_kref(struct kref *kref) +{ + struct isert_conn *isert_conn = container_of(kref, + struct isert_conn, kref); + + isert_info("conn %p final kref %s/%d\n", isert_conn, current->comm, + current->pid); + + isert_connect_release(isert_conn); +} + +static void +isert_put_conn(struct isert_conn *isert_conn) +{ + kref_put(&isert_conn->kref, isert_release_kref); +} + +static void +isert_handle_unbound_conn(struct isert_conn *isert_conn) +{ + struct isert_np *isert_np = isert_conn->cm_id->context; + + mutex_lock(&isert_np->mutex); + if (!list_empty(&isert_conn->node)) { + /* + * This means iscsi doesn't know this connection + * so schedule a cleanup ourselves + */ + list_del_init(&isert_conn->node); + isert_put_conn(isert_conn); + queue_work(isert_release_wq, &isert_conn->release_work); + } + mutex_unlock(&isert_np->mutex); +} + +/** + * isert_conn_terminate() - Initiate connection termination + * @isert_conn: isert connection struct + * + * Notes: + * In case the connection state is BOUND, move state + * to TEMINATING and start teardown sequence (rdma_disconnect). + * In case the connection state is UP, complete flush as well. + * + * This routine must be called with mutex held. Thus it is + * safe to call multiple times. + */ +static void +isert_conn_terminate(struct isert_conn *isert_conn) +{ + int err; + + if (isert_conn->state >= ISER_CONN_TERMINATING) + return; + + isert_info("Terminating conn %p state %d\n", + isert_conn, isert_conn->state); + isert_conn->state = ISER_CONN_TERMINATING; + err = rdma_disconnect(isert_conn->cm_id); + if (err) + isert_warn("Failed rdma_disconnect isert_conn %p\n", + isert_conn); +} + +static int +isert_np_cma_handler(struct isert_np *isert_np, + enum rdma_cm_event_type event) +{ + isert_dbg("%s (%d): isert np %p\n", + rdma_event_msg(event), event, isert_np); + + switch (event) { + case RDMA_CM_EVENT_DEVICE_REMOVAL: + isert_np->cm_id = NULL; + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + isert_np->cm_id = isert_setup_id(isert_np); + if (IS_ERR(isert_np->cm_id)) { + isert_err("isert np %p setup id failed: %ld\n", + isert_np, PTR_ERR(isert_np->cm_id)); + isert_np->cm_id = NULL; + } + break; + default: + isert_err("isert np %p Unexpected event %d\n", + isert_np, event); + } + + return -1; +} + +static int +isert_disconnected_handler(struct rdma_cm_id *cma_id, + enum rdma_cm_event_type event) +{ + struct isert_conn *isert_conn = cma_id->qp->qp_context; + + mutex_lock(&isert_conn->mutex); + switch (isert_conn->state) { + case ISER_CONN_TERMINATING: + break; + case ISER_CONN_UP: + isert_conn_terminate(isert_conn); + ib_drain_qp(isert_conn->qp); + isert_handle_unbound_conn(isert_conn); + break; + case ISER_CONN_BOUND: + case ISER_CONN_FULL_FEATURE: /* FALLTHRU */ + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + break; + default: + isert_warn("conn %p terminating in state %d\n", + isert_conn, isert_conn->state); + } + mutex_unlock(&isert_conn->mutex); + + return 0; +} + +static int +isert_connect_error(struct rdma_cm_id *cma_id) +{ + struct isert_conn *isert_conn = cma_id->qp->qp_context; + + 
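+	/*
+	 * A connect error means the connection never reached iscsit, so
+	 * flush the QP, unlink the conn and drop the initial kref here.
+	 * Clearing cm_id appears intended to keep isert_connect_release()
+	 * from calling rdma_destroy_id() on an id that the RDMA CM core
+	 * will destroy itself once this handler returns non-zero.
+	 */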
ib_drain_qp(isert_conn->qp); + list_del_init(&isert_conn->node); + isert_conn->cm_id = NULL; + isert_put_conn(isert_conn); + + return -1; +} + +static int +isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct isert_np *isert_np = cma_id->context; + struct isert_conn *isert_conn; + int ret = 0; + + isert_info("%s (%d): status %d id %p np %p\n", + rdma_event_msg(event->event), event->event, + event->status, cma_id, cma_id->context); + + if (isert_np->cm_id == cma_id) + return isert_np_cma_handler(cma_id->context, event->event); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = isert_connect_request(cma_id, event); + if (ret) + isert_err("failed handle connect request %d\n", ret); + break; + case RDMA_CM_EVENT_ESTABLISHED: + isert_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */ + ret = isert_disconnected_handler(cma_id, event->event); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + isert_conn = cma_id->qp->qp_context; + isert_conn->dev_removed = true; + isert_disconnected_handler(cma_id, event->event); + wait_event_interruptible(isert_conn->rem_wait, + isert_conn->state == ISER_CONN_DOWN); + kfree(isert_conn); + /* + * return non-zero from the callback to destroy + * the rdma cm id + */ + return 1; + case RDMA_CM_EVENT_REJECTED: + isert_info("Connection rejected: %s\n", + rdma_reject_msg(cma_id, event->status)); + fallthrough; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + ret = isert_connect_error(cma_id); + break; + default: + isert_err("Unhandled RDMA CMA event: %d\n", event->event); + break; + } + + return ret; +} + +static int +isert_post_recvm(struct isert_conn *isert_conn, u32 count) +{ + struct ib_recv_wr *rx_wr; + int i, ret; + struct iser_rx_desc *rx_desc; + + for (rx_wr = isert_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &isert_conn->rx_descs[i]; + + rx_wr->wr_cqe = &rx_desc->rx_cqe; + rx_wr->sg_list = &rx_desc->rx_sg; + rx_wr->num_sge = 1; + rx_wr->next = rx_wr + 1; + rx_desc->in_use = false; + } + rx_wr--; + rx_wr->next = NULL; /* mark end of work requests list */ + + ret = ib_post_recv(isert_conn->qp, isert_conn->rx_wr, NULL); + if (ret) + isert_err("ib_post_recv() failed with ret: %d\n", ret); + + return ret; +} + +static int +isert_post_recv(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc) +{ + struct ib_recv_wr rx_wr; + int ret; + + if (!rx_desc->in_use) { + /* + * if the descriptor is not in-use we already reposted it + * for recv, so just silently return + */ + return 0; + } + + rx_desc->in_use = false; + rx_wr.wr_cqe = &rx_desc->rx_cqe; + rx_wr.sg_list = &rx_desc->rx_sg; + rx_wr.num_sge = 1; + rx_wr.next = NULL; + + ret = ib_post_recv(isert_conn->qp, &rx_wr, NULL); + if (ret) + isert_err("ib_post_recv() failed with ret: %d\n", ret); + + return ret; +} + +static int +isert_login_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) +{ + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct ib_send_wr send_wr; + int ret; + + ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + tx_desc->tx_cqe.done = isert_login_send_done; + + send_wr.next = NULL; + send_wr.wr_cqe = &tx_desc->tx_cqe; + send_wr.sg_list = tx_desc->tx_sg; + send_wr.num_sge = tx_desc->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(isert_conn->qp, &send_wr, NULL); + if (ret) + 
isert_err("ib_post_send() failed, ret: %d\n", ret); + + return ret; +} + +static void +__isert_create_send_desc(struct isert_device *device, + struct iser_tx_desc *tx_desc) +{ + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); + tx_desc->iser_header.flags = ISCSI_CTRL; + + tx_desc->num_sge = 1; + + if (tx_desc->tx_sg[0].lkey != device->pd->local_dma_lkey) { + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; + isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc); + } +} + +static void +isert_create_send_desc(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, + struct iser_tx_desc *tx_desc) +{ + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + + ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + __isert_create_send_desc(device, tx_desc); +} + +static int +isert_init_tx_hdrs(struct isert_conn *isert_conn, + struct iser_tx_desc *tx_desc) +{ + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + u64 dma_addr; + + dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, dma_addr)) { + isert_err("ib_dma_mapping_error() failed\n"); + return -ENOMEM; + } + + tx_desc->dma_addr = dma_addr; + tx_desc->tx_sg[0].addr = tx_desc->dma_addr; + tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; + tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey; + + isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n", + tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length, + tx_desc->tx_sg[0].lkey); + + return 0; +} + +static void +isert_init_send_wr_flags(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr, + int send_flags) +{ + struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; + + tx_desc->tx_cqe.done = isert_send_done; + send_wr->wr_cqe = &tx_desc->tx_cqe; + + if (isert_conn->snd_w_inv && isert_cmd->inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = isert_cmd->inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + + send_wr->sg_list = &tx_desc->tx_sg[0]; + send_wr->num_sge = isert_cmd->tx_desc.num_sge; + send_wr->send_flags = send_flags; +} + +static inline void +isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct ib_send_wr *send_wr) +{ + isert_init_send_wr_flags(isert_conn, isert_cmd, send_wr, + IB_SEND_SIGNALED); +} + +static int +isert_login_post_recv(struct isert_conn *isert_conn) +{ + struct ib_recv_wr rx_wr; + struct ib_sge sge; + int ret; + + memset(&sge, 0, sizeof(struct ib_sge)); + sge.addr = isert_conn->login_desc->dma_addr + + isert_get_hdr_offset(isert_conn->login_desc); + sge.length = ISER_RX_PAYLOAD_SIZE; + sge.lkey = isert_conn->device->pd->local_dma_lkey; + + isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n", + sge.addr, sge.length, sge.lkey); + + isert_conn->login_desc->rx_cqe.done = isert_login_recv_done; + + memset(&rx_wr, 0, sizeof(struct ib_recv_wr)); + rx_wr.wr_cqe = &isert_conn->login_desc->rx_cqe; + rx_wr.sg_list = &sge; + rx_wr.num_sge = 1; + + ret = ib_post_recv(isert_conn->qp, &rx_wr, NULL); + if (ret) + isert_err("ib_post_recv() failed: %d\n", ret); + + return ret; +} + +static int +isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, + u32 length) +{ + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct 
iser_tx_desc *tx_desc = &isert_conn->login_tx_desc; + int ret; + + __isert_create_send_desc(device, tx_desc); + + memcpy(&tx_desc->iscsi_header, &login->rsp[0], + sizeof(struct iscsi_hdr)); + + isert_init_tx_hdrs(isert_conn, tx_desc); + + if (length > 0) { + struct ib_sge *tx_dsg = &tx_desc->tx_sg[1]; + + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length); + + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma, + length, DMA_TO_DEVICE); + + tx_dsg->addr = isert_conn->login_rsp_dma; + tx_dsg->length = length; + tx_dsg->lkey = isert_conn->device->pd->local_dma_lkey; + tx_desc->num_sge = 2; + } + if (!login->login_failed) { + if (login->login_complete) { + ret = isert_alloc_rx_descriptors(isert_conn); + if (ret) + return ret; + + ret = isert_post_recvm(isert_conn, + ISERT_QP_MAX_RECV_DTOS); + if (ret) + return ret; + + /* Now we are in FULL_FEATURE phase */ + mutex_lock(&isert_conn->mutex); + isert_conn->state = ISER_CONN_FULL_FEATURE; + mutex_unlock(&isert_conn->mutex); + goto post_send; + } + + ret = isert_login_post_recv(isert_conn); + if (ret) + return ret; + } +post_send: + ret = isert_login_post_send(isert_conn, tx_desc); + if (ret) + return ret; + + return 0; +} + +static void +isert_rx_login_req(struct isert_conn *isert_conn) +{ + struct iser_rx_desc *rx_desc = isert_conn->login_desc; + int rx_buflen = isert_conn->login_req_len; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_login *login = conn->conn_login; + int size; + + isert_info("conn %p\n", isert_conn); + + WARN_ON_ONCE(!login); + + if (login->first_request) { + struct iscsi_login_req *login_req = + (struct iscsi_login_req *)isert_get_iscsi_hdr(rx_desc); + /* + * Setup the initial iscsi_login values from the leading + * login request PDU. + */ + login->leading_connection = (!login_req->tsih) ? 
1 : 0; + login->current_stage = + (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK) + >> 2; + login->version_min = login_req->min_version; + login->version_max = login_req->max_version; + memcpy(login->isid, login_req->isid, 6); + login->cmd_sn = be32_to_cpu(login_req->cmdsn); + login->init_task_tag = login_req->itt; + login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn); + login->cid = be16_to_cpu(login_req->cid); + login->tsih = be16_to_cpu(login_req->tsih); + } + + memcpy(&login->req[0], isert_get_iscsi_hdr(rx_desc), ISCSI_HDR_LEN); + + size = min(rx_buflen, MAX_KEY_VALUE_PAIRS); + isert_dbg("Using login payload size: %d, rx_buflen: %d " + "MAX_KEY_VALUE_PAIRS: %d\n", size, rx_buflen, + MAX_KEY_VALUE_PAIRS); + memcpy(login->req_buf, isert_get_data(rx_desc), size); + + if (login->first_request) { + complete(&isert_conn->login_comp); + return; + } + schedule_delayed_work(&conn->login_work, 0); +} + +static struct iscsi_cmd +*isert_allocate_cmd(struct iscsi_conn *conn, struct iser_rx_desc *rx_desc) +{ + struct isert_conn *isert_conn = conn->context; + struct isert_cmd *isert_cmd; + struct iscsi_cmd *cmd; + + cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE); + if (!cmd) { + isert_err("Unable to allocate iscsi_cmd + isert_cmd\n"); + return NULL; + } + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->conn = isert_conn; + isert_cmd->iscsi_cmd = cmd; + isert_cmd->rx_desc = rx_desc; + + return cmd; +} + +static int +isert_handle_scsi_cmd(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf; + int imm_data, imm_data_len, unsol_data, sg_nents, rc; + bool dump_payload = false; + unsigned int data_len; + + rc = iscsit_setup_scsi_cmd(conn, cmd, buf); + if (rc < 0) + return rc; + + imm_data = cmd->immediate_data; + imm_data_len = cmd->first_burst_len; + unsol_data = cmd->unsolicited_data; + data_len = cmd->se_cmd.data_length; + + if (imm_data && imm_data_len == data_len) + cmd->se_cmd.se_cmd_flags |= SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC; + rc = iscsit_process_scsi_cmd(conn, cmd, hdr); + if (rc < 0) { + return 0; + } else if (rc > 0) { + dump_payload = true; + goto sequence_cmd; + } + + if (!imm_data) + return 0; + + if (imm_data_len != data_len) { + sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE)); + sg_copy_from_buffer(cmd->se_cmd.t_data_sg, sg_nents, + isert_get_data(rx_desc), imm_data_len); + isert_dbg("Copy Immediate sg_nents: %u imm_data_len: %d\n", + sg_nents, imm_data_len); + } else { + sg_init_table(&isert_cmd->sg, 1); + cmd->se_cmd.t_data_sg = &isert_cmd->sg; + cmd->se_cmd.t_data_nents = 1; + sg_set_buf(&isert_cmd->sg, isert_get_data(rx_desc), + imm_data_len); + isert_dbg("Transfer Immediate imm_data_len: %d\n", + imm_data_len); + } + + cmd->write_data_done += imm_data_len; + + if (cmd->write_data_done == cmd->se_cmd.data_length) { + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + } + +sequence_cmd: + rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); + + if (!rc && !dump_payload && unsol_data) + iscsit_set_unsolicited_dataout(cmd); + else if (dump_payload && imm_data) + target_put_sess_cmd(&cmd->se_cmd); + + return 0; +} + +static int +isert_handle_iscsi_dataout(struct isert_conn *isert_conn, + struct iser_rx_desc *rx_desc, unsigned char *buf) +{ + struct 
scatterlist *sg_start; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_cmd *cmd = NULL; + struct iscsi_data *hdr = (struct iscsi_data *)buf; + u32 unsol_data_len = ntoh24(hdr->dlength); + int rc, sg_nents, sg_off, page_off; + + rc = iscsit_check_dataout_hdr(conn, buf, &cmd); + if (rc < 0) + return rc; + else if (!cmd) + return 0; + /* + * FIXME: Unexpected unsolicited_data out + */ + if (!cmd->unsolicited_data) { + isert_err("Received unexpected solicited data payload\n"); + dump_stack(); + return -1; + } + + isert_dbg("Unsolicited DataOut unsol_data_len: %u, " + "write_data_done: %u, data_length: %u\n", + unsol_data_len, cmd->write_data_done, + cmd->se_cmd.data_length); + + sg_off = cmd->write_data_done / PAGE_SIZE; + sg_start = &cmd->se_cmd.t_data_sg[sg_off]; + sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE)); + page_off = cmd->write_data_done % PAGE_SIZE; + /* + * FIXME: Non page-aligned unsolicited_data out + */ + if (page_off) { + isert_err("unexpected non-page aligned data payload\n"); + dump_stack(); + return -1; + } + isert_dbg("Copying DataOut: sg_start: %p, sg_off: %u " + "sg_nents: %u from %p %u\n", sg_start, sg_off, + sg_nents, isert_get_data(rx_desc), unsol_data_len); + + sg_copy_from_buffer(sg_start, sg_nents, isert_get_data(rx_desc), + unsol_data_len); + + rc = iscsit_check_dataout_payload(cmd, hdr, false); + if (rc < 0) + return rc; + + /* + * multiple data-outs on the same command can arrive - + * so post the buffer before hand + */ + return isert_post_recv(isert_conn, rx_desc); +} + +static int +isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + unsigned char *buf) +{ + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf; + int rc; + + rc = iscsit_setup_nop_out(conn, cmd, hdr); + if (rc < 0) + return rc; + /* + * FIXME: Add support for NOPOUT payload using unsolicited RDMA payload + */ + + return iscsit_process_nop_out(conn, cmd, hdr); +} + +static int +isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, + struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc, + struct iscsi_text *hdr) +{ + struct iscsi_conn *conn = isert_conn->conn; + u32 payload_length = ntoh24(hdr->dlength); + int rc; + unsigned char *text_in = NULL; + + rc = iscsit_setup_text_cmd(conn, cmd, hdr); + if (rc < 0) + return rc; + + if (payload_length) { + text_in = kzalloc(payload_length, GFP_KERNEL); + if (!text_in) + return -ENOMEM; + } + cmd->text_in_ptr = text_in; + + memcpy(cmd->text_in_ptr, isert_get_data(rx_desc), payload_length); + + return iscsit_process_text_cmd(conn, cmd, hdr); +} + +static int +isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, + uint32_t read_stag, uint64_t read_va, + uint32_t write_stag, uint64_t write_va) +{ + struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc); + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_cmd *cmd; + struct isert_cmd *isert_cmd; + int ret = -EINVAL; + u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK); + + if (conn->sess->sess_ops->SessionType && + (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) { + isert_err("Got illegal opcode: 0x%02x in SessionType=Discovery," + " ignoring\n", opcode); + return 0; + } + + switch (opcode) { + case ISCSI_OP_SCSI_CMD: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + isert_cmd->read_stag = read_stag; + isert_cmd->read_va = read_va; + 
isert_cmd->write_stag = write_stag; + isert_cmd->write_va = write_va; + isert_cmd->inv_rkey = read_stag ? read_stag : write_stag; + + ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_NOOP_OUT: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd, + rx_desc, (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_DATA_OUT: + ret = isert_handle_iscsi_dataout(isert_conn, rx_desc, + (unsigned char *)hdr); + break; + case ISCSI_OP_SCSI_TMFUNC: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + ret = iscsit_handle_task_mgt_cmd(conn, cmd, + (unsigned char *)hdr); + break; + case ISCSI_OP_LOGOUT: + cmd = isert_allocate_cmd(conn, rx_desc); + if (!cmd) + break; + + ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr); + break; + case ISCSI_OP_TEXT: + if (be32_to_cpu(hdr->ttt) != 0xFFFFFFFF) + cmd = iscsit_find_cmd_from_itt(conn, hdr->itt); + else + cmd = isert_allocate_cmd(conn, rx_desc); + + if (!cmd) + break; + + isert_cmd = iscsit_priv_cmd(cmd); + ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd, + rx_desc, (struct iscsi_text *)hdr); + break; + default: + isert_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode); + dump_stack(); + break; + } + + return ret; +} + +static void +isert_print_wc(struct ib_wc *wc, const char *type) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) + isert_err("%s failure: %s (%d) vend_err %x\n", type, + ib_wc_status_msg(wc->status), wc->status, + wc->vendor_err); + else + isert_dbg("%s failure: %s (%d)\n", type, + ib_wc_status_msg(wc->status), wc->status); +} + +static void +isert_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iser_rx_desc *rx_desc = cqe_to_rx_desc(wc->wr_cqe); + struct iscsi_hdr *hdr = isert_get_iscsi_hdr(rx_desc); + struct iser_ctrl *iser_ctrl = isert_get_iser_hdr(rx_desc); + uint64_t read_va = 0, write_va = 0; + uint32_t read_stag = 0, write_stag = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "recv"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + return; + } + + rx_desc->in_use = true; + + ib_dma_sync_single_for_cpu(ib_dev, rx_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); + + isert_dbg("DMA: 0x%llx, iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n", + rx_desc->dma_addr, hdr->opcode, hdr->itt, hdr->flags, + (int)(wc->byte_len - ISER_HEADERS_LEN)); + + switch (iser_ctrl->flags & 0xF0) { + case ISCSI_CTRL: + if (iser_ctrl->flags & ISER_RSV) { + read_stag = be32_to_cpu(iser_ctrl->read_stag); + read_va = be64_to_cpu(iser_ctrl->read_va); + isert_dbg("ISER_RSV: read_stag: 0x%x read_va: 0x%llx\n", + read_stag, (unsigned long long)read_va); + } + if (iser_ctrl->flags & ISER_WSV) { + write_stag = be32_to_cpu(iser_ctrl->write_stag); + write_va = be64_to_cpu(iser_ctrl->write_va); + isert_dbg("ISER_WSV: write_stag: 0x%x write_va: 0x%llx\n", + write_stag, (unsigned long long)write_va); + } + + isert_dbg("ISER ISCSI_CTRL PDU\n"); + break; + case ISER_HELLO: + isert_err("iSER Hello message\n"); + break; + default: + isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_ctrl->flags); + break; + } + + isert_rx_opcode(isert_conn, rx_desc, + read_stag, read_va, write_stag, write_va); + + ib_dma_sync_single_for_device(ib_dev, rx_desc->dma_addr, + ISER_RX_SIZE, 
DMA_FROM_DEVICE); +} + +static void +isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->device->ib_device; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "login recv"); + return; + } + + ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); + + isert_conn->login_req_len = wc->byte_len - ISER_HEADERS_LEN; + + if (isert_conn->conn) { + struct iscsi_login *login = isert_conn->conn->conn_login; + + if (login && !login->first_request) + isert_rx_login_req(isert_conn); + } + + mutex_lock(&isert_conn->mutex); + complete(&isert_conn->login_req_comp); + mutex_unlock(&isert_conn->mutex); + + ib_dma_sync_single_for_device(ib_dev, isert_conn->login_desc->dma_addr, + ISER_RX_SIZE, DMA_FROM_DEVICE); +} + +static void +isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn) +{ + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); + + if (!cmd->rw.nr_ops) + return; + + if (isert_prot_cmd(conn, se_cmd)) { + rdma_rw_ctx_destroy_signature(&cmd->rw, conn->qp, + conn->cm_id->port_num, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->t_prot_sg, + se_cmd->t_prot_nents, dir); + } else { + rdma_rw_ctx_destroy(&cmd->rw, conn->qp, conn->cm_id->port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, dir); + } + + cmd->rw.nr_ops = 0; +} + +static void +isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) +{ + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct isert_conn *isert_conn = isert_cmd->conn; + struct iscsi_conn *conn = isert_conn->conn; + struct iscsi_text_rsp *hdr; + + isert_dbg("Cmd %p\n", isert_cmd); + + switch (cmd->iscsi_opcode) { + case ISCSI_OP_SCSI_CMD: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) { + iscsit_stop_dataout_timer(cmd); + /* + * Check for special case during comp_err where + * WRITE_PENDING has been handed off from core, + * but requires an extra target_put_sess_cmd() + * before transport_generic_free_cmd() below. + */ + if (comp_err && + cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) { + struct se_cmd *se_cmd = &cmd->se_cmd; + + target_put_sess_cmd(se_cmd); + } + } + + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_SCSI_TMFUNC: + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + case ISCSI_OP_REJECT: + case ISCSI_OP_NOOP_OUT: + case ISCSI_OP_TEXT: + hdr = (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; + /* If the continue bit is on, keep the command alive */ + if (hdr->flags & ISCSI_FLAG_TEXT_CONTINUE) + break; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + /* + * Handle special case for REJECT when iscsi_add_reject*() has + * overwritten the original iscsi_opcode assignment, and the + * associated cmd->se_cmd needs to be released. 
+ */ + if (cmd->se_cmd.se_tfo != NULL) { + isert_dbg("Calling transport_generic_free_cmd for 0x%02x\n", + cmd->iscsi_opcode); + transport_generic_free_cmd(&cmd->se_cmd, 0); + break; + } + fallthrough; + default: + iscsit_release_cmd(cmd); + break; + } +} + +static void +isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev) +{ + if (tx_desc->dma_addr != 0) { + isert_dbg("unmap single for tx_desc->dma_addr\n"); + ib_dma_unmap_single(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + tx_desc->dma_addr = 0; + } +} + +static void +isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd, + struct ib_device *ib_dev, bool comp_err) +{ + if (isert_cmd->pdu_buf_dma != 0) { + isert_dbg("unmap single for isert_cmd->pdu_buf_dma\n"); + ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma, + isert_cmd->pdu_buf_len, DMA_TO_DEVICE); + isert_cmd->pdu_buf_dma = 0; + } + + isert_unmap_tx_desc(tx_desc, ib_dev); + isert_put_cmd(isert_cmd, comp_err); +} + +static int +isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + isert_err("ib_check_mr_status failed, ret %d\n", ret); + goto fail_mr_status; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + u64 sec_offset_err; + u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8; + + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case IB_SIG_BAD_REFTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case IB_SIG_BAD_APPTAG: + se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + } + sec_offset_err = mr_status.sig_err.sig_err_offset; + do_div(sec_offset_err, block_size); + se_cmd->sense_info = sec_offset_err + se_cmd->t_task_lba; + + isert_err("PI error found type %d at sector 0x%llx " + "expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + (unsigned long long)se_cmd->sense_info, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + ret = 1; + } + +fail_mr_status: + return ret; +} + +static void +isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct isert_device *device = isert_conn->device; + struct iser_tx_desc *desc = cqe_to_tx_desc(wc->wr_cqe); + struct isert_cmd *isert_cmd = tx_desc_to_cmd(desc); + struct se_cmd *cmd = &isert_cmd->iscsi_cmd->se_cmd; + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "rdma write"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + isert_completion_put(desc, isert_cmd, device->ib_device, true); + return; + } + + isert_dbg("Cmd %p\n", isert_cmd); + + ret = isert_check_pi_status(cmd, isert_cmd->rw.reg->mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + + if (ret) { + /* + * transport_generic_request_failure() expects to have + * plus two references to handle queue-full, so re-add + * one here as target-core will have already dropped + * it after the first isert_put_datain() callback. + */ + kref_get(&cmd->cmd_kref); + transport_generic_request_failure(cmd, cmd->pi_err); + } else { + /* + * XXX: isert_put_response() failure is not retried. 
+ */ + ret = isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd); + if (ret) + pr_warn_ratelimited("isert_put_response() ret: %d\n", ret); + } +} + +static void +isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct isert_device *device = isert_conn->device; + struct iser_tx_desc *desc = cqe_to_tx_desc(wc->wr_cqe); + struct isert_cmd *isert_cmd = tx_desc_to_cmd(desc); + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + struct se_cmd *se_cmd = &cmd->se_cmd; + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "rdma read"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + isert_completion_put(desc, isert_cmd, device->ib_device, true); + return; + } + + isert_dbg("Cmd %p\n", isert_cmd); + + iscsit_stop_dataout_timer(cmd); + + if (isert_prot_cmd(isert_conn, se_cmd)) + ret = isert_check_pi_status(se_cmd, isert_cmd->rw.reg->mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + cmd->write_data_done = 0; + + isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); + spin_lock_bh(&cmd->istate_lock); + cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT; + cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; + spin_unlock_bh(&cmd->istate_lock); + + /* + * transport_generic_request_failure() will drop the extra + * se_cmd->cmd_kref reference after T10-PI error, and handle + * any non-zero ->queue_status() callback error retries. + */ + if (ret) + transport_generic_request_failure(se_cmd, se_cmd->pi_err); + else + target_execute_cmd(se_cmd); +} + +static void +isert_do_control_comp(struct work_struct *work) +{ + struct isert_cmd *isert_cmd = container_of(work, + struct isert_cmd, comp_work); + struct isert_conn *isert_conn = isert_cmd->conn; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; + + isert_dbg("Cmd %p i_state %d\n", isert_cmd, cmd->i_state); + + switch (cmd->i_state) { + case ISTATE_SEND_TASKMGTRSP: + iscsit_tmr_post_handler(cmd, cmd->conn); + fallthrough; + case ISTATE_SEND_REJECT: + case ISTATE_SEND_TEXTRSP: + cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(&isert_cmd->tx_desc, isert_cmd, + ib_dev, false); + break; + case ISTATE_SEND_LOGOUTRSP: + iscsit_logout_post_handler(cmd, cmd->conn); + break; + default: + isert_err("Unknown i_state %d\n", cmd->i_state); + dump_stack(); + break; + } +} + +static void +isert_login_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iser_tx_desc *tx_desc = cqe_to_tx_desc(wc->wr_cqe); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "login send"); + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement(isert_conn->conn, 0); + } + + isert_unmap_tx_desc(tx_desc, ib_dev); +} + +static void +isert_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct isert_conn *isert_conn = wc->qp->qp_context; + struct ib_device *ib_dev = isert_conn->cm_id->device; + struct iser_tx_desc *tx_desc = cqe_to_tx_desc(wc->wr_cqe); + struct isert_cmd *isert_cmd = tx_desc_to_cmd(tx_desc); + struct se_cmd *cmd = &isert_cmd->iscsi_cmd->se_cmd; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + isert_print_wc(wc, "send"); + if (wc->status == IB_WC_SIG_PIPELINE_CANCELED) { + isert_check_pi_status(cmd, isert_cmd->rw.reg->mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + /* + * 
transport_generic_request_failure() expects to have + * plus two references to handle queue-full, so re-add + * one here as target-core will have already dropped + * it after the first isert_put_datain() callback. + */ + kref_get(&cmd->cmd_kref); + transport_generic_request_failure(cmd, cmd->pi_err); + } else { + if (wc->status != IB_WC_WR_FLUSH_ERR) + iscsit_cause_connection_reinstatement( + isert_conn->conn, 0); + isert_completion_put(tx_desc, isert_cmd, ib_dev, true); + } + return; + } + + isert_dbg("Cmd %p\n", isert_cmd); + + /* To reuse the signature MR later, we need to mark it as checked. */ + if (isert_cmd->send_sig_pipelined) + isert_check_pi_status(cmd, isert_cmd->rw.reg->mr); + + switch (isert_cmd->iscsi_cmd->i_state) { + case ISTATE_SEND_TASKMGTRSP: + case ISTATE_SEND_LOGOUTRSP: + case ISTATE_SEND_REJECT: + case ISTATE_SEND_TEXTRSP: + isert_unmap_tx_desc(tx_desc, ib_dev); + + INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp); + queue_work(isert_comp_wq, &isert_cmd->comp_work); + return; + default: + isert_cmd->iscsi_cmd->i_state = ISTATE_SENT_STATUS; + isert_completion_put(tx_desc, isert_cmd, ib_dev, false); + break; + } +} + +static int +isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) +{ + int ret; + + ret = isert_post_recv(isert_conn, isert_cmd->rx_desc); + if (ret) + return ret; + + ret = ib_post_send(isert_conn->qp, &isert_cmd->tx_desc.send_wr, NULL); + if (ret) { + isert_err("ib_post_send failed with %d\n", ret); + return ret; + } + return ret; +} + +static int +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + /* + * Attach SENSE DATA payload to iSCSI Response PDU + */ + if (cmd->se_cmd.sense_buffer && + ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) { + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + u32 padding, pdu_len; + + put_unaligned_be16(cmd->se_cmd.scsi_sense_length, + cmd->sense_buffer); + cmd->se_cmd.scsi_sense_length += sizeof(__be16); + + padding = -(cmd->se_cmd.scsi_sense_length) & 3; + hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length); + pdu_len = cmd->se_cmd.scsi_sense_length + padding; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void *)cmd->sense_buffer, pdu_len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, isert_cmd->pdu_buf_dma)) + return -ENOMEM; + + isert_cmd->pdu_buf_len = pdu_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = pdu_len; + tx_dsg->lkey = device->pd->local_dma_lkey; + isert_cmd->tx_desc.num_sge = 2; + } + + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("Posting SCSI Response\n"); + + return isert_post_response(isert_conn, isert_cmd); +} + +static void +isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + + spin_lock_bh(&conn->cmd_lock); + if (!list_empty(&cmd->i_conn_node)) + 
list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + + if (cmd->data_direction == DMA_TO_DEVICE) + iscsit_stop_dataout_timer(cmd); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); +} + +static enum target_prot_op +isert_get_sup_prot_ops(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; + + if (conn->tpg->tpg_attrib.t10_pi) { + if (device->pi_capable) { + isert_info("conn %p PI offload enabled\n", isert_conn); + isert_conn->pi_support = true; + isert_conn->sig_pipeline = device->sig_pipeline; + return TARGET_PROT_ALL; + } + } + + isert_info("conn %p PI offload disabled\n", isert_conn); + isert_conn->pi_support = false; + isert_conn->sig_pipeline = false; + + return TARGET_PROT_NORMAL; +} + +static int +isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, + bool nopout_response) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *) + &isert_cmd->tx_desc.iscsi_header, + nopout_response); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting NOPIN Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting Logout Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting Task Management Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + struct iscsi_reject *hdr = + (struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + iscsit_build_reject(cmd, conn, hdr); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + hton24(hdr->dlength, ISCSI_HDR_LEN); + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + (void 
*)cmd->buf_ptr, ISCSI_HDR_LEN, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, isert_cmd->pdu_buf_dma)) + return -ENOMEM; + isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = ISCSI_HDR_LEN; + tx_dsg->lkey = device->pd->local_dma_lkey; + isert_cmd->tx_desc.num_sge = 2; + + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Posting Reject\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static int +isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; + struct iscsi_text_rsp *hdr = + (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; + u32 txt_rsp_len; + int rc; + + isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); + rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND); + if (rc < 0) + return rc; + + txt_rsp_len = rc; + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + + if (txt_rsp_len) { + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; + void *txt_rsp_buf = cmd->buf_ptr; + + isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev, + txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ib_dev, isert_cmd->pdu_buf_dma)) + return -ENOMEM; + + isert_cmd->pdu_buf_len = txt_rsp_len; + tx_dsg->addr = isert_cmd->pdu_buf_dma; + tx_dsg->length = txt_rsp_len; + tx_dsg->lkey = device->pd->local_dma_lkey; + isert_cmd->tx_desc.num_sge = 2; + } + isert_init_send_wr(isert_conn, isert_cmd, send_wr); + + isert_dbg("conn %p Text Response\n", isert_conn); + + return isert_post_response(isert_conn, isert_cmd); +} + +static inline void +isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_domain *domain) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = se_cmd->se_dev->dev_attrib.block_size; + domain->sig.dif.ref_tag = se_cmd->reftag_seed; + /* + * At the moment we hard code those, but if in the future + * the target core would like to use it, we will take it + * from se_cmd. 
+ */ + domain->sig.dif.apptag_check_mask = 0xffff; + domain->sig.dif.app_escape = true; + domain->sig.dif.ref_escape = true; + if (se_cmd->prot_type == TARGET_DIF_TYPE1_PROT || + se_cmd->prot_type == TARGET_DIF_TYPE2_PROT) + domain->sig.dif.ref_remap = true; +} + +static int +isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) +{ + memset(sig_attrs, 0, sizeof(*sig_attrs)); + + switch (se_cmd->prot_op) { + case TARGET_PROT_DIN_INSERT: + case TARGET_PROT_DOUT_STRIP: + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, &sig_attrs->wire); + break; + case TARGET_PROT_DOUT_INSERT: + case TARGET_PROT_DIN_STRIP: + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + isert_set_dif_domain(se_cmd, &sig_attrs->mem); + break; + case TARGET_PROT_DIN_PASS: + case TARGET_PROT_DOUT_PASS: + isert_set_dif_domain(se_cmd, &sig_attrs->wire); + isert_set_dif_domain(se_cmd, &sig_attrs->mem); + break; + default: + isert_err("Unsupported PI operation %d\n", se_cmd->prot_op); + return -EINVAL; + } + + if (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD) + sig_attrs->check_mask |= IB_SIG_CHECK_GUARD; + if (se_cmd->prot_checks & TARGET_DIF_CHECK_APPTAG) + sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG; + if (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG) + sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG; + + return 0; +} + +static int +isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); + u8 port_num = conn->cm_id->port_num; + u64 addr; + u32 rkey, offset; + int ret; + + if (cmd->ctx_init_done) + goto rdma_ctx_post; + + if (dir == DMA_FROM_DEVICE) { + addr = cmd->write_va; + rkey = cmd->write_stag; + offset = cmd->iscsi_cmd->write_data_done; + } else { + addr = cmd->read_va; + rkey = cmd->read_stag; + offset = 0; + } + + if (isert_prot_cmd(conn, se_cmd)) { + struct ib_sig_attrs sig_attrs; + + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + return ret; + + WARN_ON_ONCE(offset); + ret = rdma_rw_ctx_signature_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + se_cmd->t_prot_sg, se_cmd->t_prot_nents, + &sig_attrs, addr, rkey, dir); + } else { + ret = rdma_rw_ctx_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + offset, addr, rkey, dir); + } + + if (ret < 0) { + isert_err("Cmd: %p failed to prepare RDMA res\n", cmd); + return ret; + } + + cmd->ctx_init_done = true; + +rdma_ctx_post: + ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr); + if (ret < 0) + isert_err("Cmd: %p failed to post RDMA res\n", cmd); + return ret; +} + +static int +isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) +{ + struct se_cmd *se_cmd = &cmd->se_cmd; + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + struct isert_conn *isert_conn = conn->context; + struct ib_cqe *cqe = NULL; + struct ib_send_wr *chain_wr = NULL; + int rc; + + isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n", + isert_cmd, se_cmd->data_length); + + if (!isert_conn->sig_pipeline && isert_prot_cmd(isert_conn, se_cmd)) { + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + cqe = &isert_cmd->tx_desc.tx_cqe; + } else { + int send_flags = IB_SEND_SIGNALED; + + if (isert_prot_cmd(isert_conn, se_cmd)) { + send_flags |= IB_SEND_SIG_PIPELINED; + isert_cmd->send_sig_pipelined = true; + } + + /* + * Build isert_conn->tx_desc for iSCSI response PDU and attach 
+ */ + isert_create_send_desc(isert_conn, isert_cmd, + &isert_cmd->tx_desc); + iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *) + &isert_cmd->tx_desc.iscsi_header); + isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); + isert_init_send_wr_flags(isert_conn, isert_cmd, + &isert_cmd->tx_desc.send_wr, + send_flags); + + rc = isert_post_recv(isert_conn, isert_cmd->rx_desc); + if (rc) + goto err; + + chain_wr = &isert_cmd->tx_desc.send_wr; + } + + rc = isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr); + isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ rc: %d\n", + isert_cmd, rc); + if (rc) + goto err; + + return 0; +err: + isert_cmd->send_sig_pipelined = false; + return rc; +} + +static int +isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + int ret; + + isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", + isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done); + + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; + ret = isert_rdma_rw_ctx_post(isert_cmd, conn->context, + &isert_cmd->tx_desc.tx_cqe, NULL); + + isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE rc: %d\n", + isert_cmd, ret); + return ret; +} + +static int +isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + int ret = 0; + + switch (state) { + case ISTATE_REMOVE: + spin_lock_bh(&conn->cmd_lock); + list_del_init(&cmd->i_conn_node); + spin_unlock_bh(&conn->cmd_lock); + isert_put_cmd(isert_cmd, true); + break; + case ISTATE_SEND_NOPIN_WANT_RESPONSE: + ret = isert_put_nopin(cmd, conn, false); + break; + default: + isert_err("Unknown immediate state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +static int +isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state) +{ + struct isert_conn *isert_conn = conn->context; + int ret; + + switch (state) { + case ISTATE_SEND_LOGOUTRSP: + ret = isert_put_logout_rsp(cmd, conn); + if (!ret) + isert_conn->logout_posted = true; + break; + case ISTATE_SEND_NOPIN: + ret = isert_put_nopin(cmd, conn, true); + break; + case ISTATE_SEND_TASKMGTRSP: + ret = isert_put_tm_rsp(cmd, conn); + break; + case ISTATE_SEND_REJECT: + ret = isert_put_reject(cmd, conn); + break; + case ISTATE_SEND_TEXTRSP: + ret = isert_put_text_rsp(cmd, conn); + break; + case ISTATE_SEND_STATUS: + /* + * Special case for sending non GOOD SCSI status from TX thread + * context during pre se_cmd execution failure. + */ + ret = isert_put_response(conn, cmd); + break; + default: + isert_err("Unknown response state: 0x%02x\n", state); + ret = -EINVAL; + break; + } + + return ret; +} + +struct rdma_cm_id * +isert_setup_id(struct isert_np *isert_np) +{ + struct iscsi_np *np = isert_np->np; + struct rdma_cm_id *id; + struct sockaddr *sa; + int ret; + + sa = (struct sockaddr *)&np->np_sockaddr; + isert_dbg("ksockaddr: %p, sa: %p\n", &np->np_sockaddr, sa); + + id = rdma_create_id(&init_net, isert_cma_handler, isert_np, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(id)) { + isert_err("rdma_create_id() failed: %ld\n", PTR_ERR(id)); + ret = PTR_ERR(id); + goto out; + } + isert_dbg("id %p context %p\n", id, id->context); + + /* + * Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time.
+ */ + ret = rdma_set_afonly(id, 1); + if (ret) { + isert_err("rdma_set_afonly() failed: %d\n", ret); + goto out_id; + } + + ret = rdma_bind_addr(id, sa); + if (ret) { + isert_err("rdma_bind_addr() failed: %d\n", ret); + goto out_id; + } + + ret = rdma_listen(id, 0); + if (ret) { + isert_err("rdma_listen() failed: %d\n", ret); + goto out_id; + } + + return id; +out_id: + rdma_destroy_id(id); +out: + return ERR_PTR(ret); +} + +static int +isert_setup_np(struct iscsi_np *np, + struct sockaddr_storage *ksockaddr) +{ + struct isert_np *isert_np; + struct rdma_cm_id *isert_lid; + int ret; + + isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL); + if (!isert_np) + return -ENOMEM; + + sema_init(&isert_np->sem, 0); + mutex_init(&isert_np->mutex); + INIT_LIST_HEAD(&isert_np->accepted); + INIT_LIST_HEAD(&isert_np->pending); + isert_np->np = np; + + /* + * Setup the np->np_sockaddr from the passed sockaddr setup + * in iscsi_target_configfs.c code.. + */ + memcpy(&np->np_sockaddr, ksockaddr, + sizeof(struct sockaddr_storage)); + + isert_lid = isert_setup_id(isert_np); + if (IS_ERR(isert_lid)) { + ret = PTR_ERR(isert_lid); + goto out; + } + + isert_np->cm_id = isert_lid; + np->np_context = isert_np; + + return 0; + +out: + kfree(isert_np); + + return ret; +} + +static int +isert_rdma_accept(struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->cm_id; + struct rdma_conn_param cp; + int ret; + struct iser_cm_hdr rsp_hdr; + + memset(&cp, 0, sizeof(struct rdma_conn_param)); + cp.initiator_depth = isert_conn->initiator_depth; + cp.retry_count = 7; + cp.rnr_retry_count = 7; + + memset(&rsp_hdr, 0, sizeof(rsp_hdr)); + rsp_hdr.flags = ISERT_ZBVA_NOT_USED; + if (!isert_conn->snd_w_inv) + rsp_hdr.flags = rsp_hdr.flags | ISERT_SEND_W_INV_NOT_USED; + cp.private_data = (void *)&rsp_hdr; + cp.private_data_len = sizeof(rsp_hdr); + + ret = rdma_accept(cm_id, &cp); + if (ret) { + isert_err("rdma_accept() failed with: %d\n", ret); + return ret; + } + + return 0; +} + +static int +isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) +{ + struct isert_conn *isert_conn = conn->context; + int ret; + + isert_info("before login_req comp conn: %p\n", isert_conn); + ret = wait_for_completion_interruptible(&isert_conn->login_req_comp); + if (ret) { + isert_err("isert_conn %p interrupted before got login req\n", + isert_conn); + return ret; + } + reinit_completion(&isert_conn->login_req_comp); + + /* + * For login requests after the first PDU, isert_rx_login_req() will + * kick schedule_delayed_work(&conn->login_work) as the packet is + * received, which turns this callback from iscsi_target_do_login_rx() + * into a NOP. 
+ */ + if (!login->first_request) + return 0; + + isert_rx_login_req(isert_conn); + + isert_info("before login_comp conn: %p\n", conn); + ret = wait_for_completion_interruptible(&isert_conn->login_comp); + if (ret) + return ret; + + isert_info("processing login->req: %p\n", login->req); + + return 0; +} + +static void +isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, + struct isert_conn *isert_conn) +{ + struct rdma_cm_id *cm_id = isert_conn->cm_id; + struct rdma_route *cm_route = &cm_id->route; + + conn->login_family = np->np_sockaddr.ss_family; + + conn->login_sockaddr = cm_route->addr.dst_addr; + conn->local_sockaddr = cm_route->addr.src_addr; +} + +static int +isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) +{ + struct isert_np *isert_np = np->np_context; + struct isert_conn *isert_conn; + int ret; + +accept_wait: + ret = down_interruptible(&isert_np->sem); + if (ret) + return -ENODEV; + + spin_lock_bh(&np->np_thread_lock); + if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) { + spin_unlock_bh(&np->np_thread_lock); + isert_dbg("np_thread_state %d\n", + np->np_thread_state); + /* + * No point in stalling here when np_thread + * is in state RESET/SHUTDOWN/EXIT - bail + */ + return -ENODEV; + } + spin_unlock_bh(&np->np_thread_lock); + + mutex_lock(&isert_np->mutex); + if (list_empty(&isert_np->pending)) { + mutex_unlock(&isert_np->mutex); + goto accept_wait; + } + isert_conn = list_first_entry(&isert_np->pending, + struct isert_conn, node); + list_del_init(&isert_conn->node); + mutex_unlock(&isert_np->mutex); + + conn->context = isert_conn; + isert_conn->conn = conn; + isert_conn->state = ISER_CONN_BOUND; + + isert_set_conn_info(np, conn, isert_conn); + + isert_dbg("Processing isert_conn: %p\n", isert_conn); + + return 0; +} + +static void +isert_free_np(struct iscsi_np *np) +{ + struct isert_np *isert_np = np->np_context; + struct isert_conn *isert_conn, *n; + + if (isert_np->cm_id) + rdma_destroy_id(isert_np->cm_id); + + /* + * FIXME: At this point we don't have a good way to ensure + * that we don't have hanging connections that completed + * RDMA establishment but didn't start the iscsi login + * process. So work around this by cleaning up whatever piled + * up in the accepted and pending lists.
+ */ + mutex_lock(&isert_np->mutex); + if (!list_empty(&isert_np->pending)) { + isert_info("Still have isert pending connections\n"); + list_for_each_entry_safe(isert_conn, n, + &isert_np->pending, + node) { + isert_info("cleaning isert_conn %p state (%d)\n", + isert_conn, isert_conn->state); + isert_connect_release(isert_conn); + } + } + + if (!list_empty(&isert_np->accepted)) { + isert_info("Still have isert accepted connections\n"); + list_for_each_entry_safe(isert_conn, n, + &isert_np->accepted, + node) { + isert_info("cleaning isert_conn %p state (%d)\n", + isert_conn, isert_conn->state); + isert_connect_release(isert_conn); + } + } + mutex_unlock(&isert_np->mutex); + + np->np_context = NULL; + kfree(isert_np); +} + +static void isert_release_work(struct work_struct *work) +{ + struct isert_conn *isert_conn = container_of(work, + struct isert_conn, + release_work); + + isert_info("Starting release conn %p\n", isert_conn); + + mutex_lock(&isert_conn->mutex); + isert_conn->state = ISER_CONN_DOWN; + mutex_unlock(&isert_conn->mutex); + + isert_info("Destroying conn %p\n", isert_conn); + isert_put_conn(isert_conn); +} + +static void +isert_wait4logout(struct isert_conn *isert_conn) +{ + struct iscsi_conn *conn = isert_conn->conn; + + isert_info("conn %p\n", isert_conn); + + if (isert_conn->logout_posted) { + isert_info("conn %p wait for conn_logout_comp\n", isert_conn); + wait_for_completion_timeout(&conn->conn_logout_comp, + SECONDS_FOR_LOGOUT_COMP * HZ); + } +} + +static void +isert_wait4cmds(struct iscsi_conn *conn) +{ + isert_info("iscsi_conn %p\n", conn); + + if (conn->sess) { + target_stop_session(conn->sess->se_sess); + target_wait_for_sess_cmds(conn->sess->se_sess); + } +} + +/** + * isert_put_unsol_pending_cmds() - Drop commands waiting for + * unsolicited dataout + * @conn: iscsi connection + * + * We might still have commands that are waiting for unsolicited + * dataout messages.
We must put the extra reference on those + * before blocking on the target_wait_for_session_cmds + */ +static void +isert_put_unsol_pending_cmds(struct iscsi_conn *conn) +{ + struct iscsi_cmd *cmd, *tmp; + static LIST_HEAD(drop_cmd_list); + + spin_lock_bh(&conn->cmd_lock); + list_for_each_entry_safe(cmd, tmp, &conn->conn_cmd_list, i_conn_node) { + if ((cmd->cmd_flags & ICF_NON_IMMEDIATE_UNSOLICITED_DATA) && + (cmd->write_data_done < conn->sess->sess_ops->FirstBurstLength) && + (cmd->write_data_done < cmd->se_cmd.data_length)) + list_move_tail(&cmd->i_conn_node, &drop_cmd_list); + } + spin_unlock_bh(&conn->cmd_lock); + + list_for_each_entry_safe(cmd, tmp, &drop_cmd_list, i_conn_node) { + list_del_init(&cmd->i_conn_node); + if (cmd->i_state != ISTATE_REMOVE) { + struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); + + isert_info("conn %p dropping cmd %p\n", conn, cmd); + isert_put_cmd(isert_cmd, true); + } + } +} + +static void isert_wait_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + isert_info("Starting conn %p\n", isert_conn); + + mutex_lock(&isert_conn->mutex); + isert_conn_terminate(isert_conn); + mutex_unlock(&isert_conn->mutex); + + ib_drain_qp(isert_conn->qp); + isert_put_unsol_pending_cmds(conn); + isert_wait4cmds(conn); + isert_wait4logout(isert_conn); + + queue_work(isert_release_wq, &isert_conn->release_work); +} + +static void isert_free_conn(struct iscsi_conn *conn) +{ + struct isert_conn *isert_conn = conn->context; + + ib_drain_qp(isert_conn->qp); + isert_put_conn(isert_conn); +} + +static void isert_get_rx_pdu(struct iscsi_conn *conn) +{ + struct completion comp; + + init_completion(&comp); + + wait_for_completion_interruptible(&comp); +} + +static struct iscsit_transport iser_target_transport = { + .name = "IB/iSER", + .transport_type = ISCSI_INFINIBAND, + .rdma_shutdown = true, + .priv_size = sizeof(struct isert_cmd), + .owner = THIS_MODULE, + .iscsit_setup_np = isert_setup_np, + .iscsit_accept_np = isert_accept_np, + .iscsit_free_np = isert_free_np, + .iscsit_wait_conn = isert_wait_conn, + .iscsit_free_conn = isert_free_conn, + .iscsit_get_login_rx = isert_get_login_rx, + .iscsit_put_login_tx = isert_put_login_tx, + .iscsit_immediate_queue = isert_immediate_queue, + .iscsit_response_queue = isert_response_queue, + .iscsit_get_dataout = isert_get_dataout, + .iscsit_queue_data_in = isert_put_datain, + .iscsit_queue_status = isert_put_response, + .iscsit_aborted_task = isert_aborted_task, + .iscsit_get_rx_pdu = isert_get_rx_pdu, + .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, +}; + +static int __init isert_init(void) +{ + int ret; + + isert_comp_wq = alloc_workqueue("isert_comp_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!isert_comp_wq) { + isert_err("Unable to allocate isert_comp_wq\n"); + return -ENOMEM; + } + + isert_release_wq = alloc_workqueue("isert_release_wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!isert_release_wq) { + isert_err("Unable to allocate isert_release_wq\n"); + ret = -ENOMEM; + goto destroy_comp_wq; + } + + iscsit_register_transport(&iser_target_transport); + isert_info("iSER_TARGET[0] - Loaded iser_target_transport\n"); + + return 0; + +destroy_comp_wq: + destroy_workqueue(isert_comp_wq); + + return ret; +} + +static void __exit isert_exit(void) +{ + flush_scheduled_work(); + destroy_workqueue(isert_release_wq); + destroy_workqueue(isert_comp_wq); + iscsit_unregister_transport(&iser_target_transport); + isert_info("iSER_TARGET[0] - Released iser_target_transport\n"); +} + +MODULE_DESCRIPTION("iSER-Target for 
mainline target infrastructure"); +MODULE_AUTHOR("nab@Linux-iSCSI.org"); +MODULE_LICENSE("GPL"); + +module_init(isert_init); +module_exit(isert_exit); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.h new file mode 100644 index 0000000..5b2f5c3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert.h @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include + + +#define DRV_NAME "isert" +#define PFX DRV_NAME ": " + +#define isert_dbg(fmt, arg...) \ + do { \ + if (unlikely(isert_debug_level > 2)) \ + printk(KERN_DEBUG PFX "%s: " fmt,\ + __func__ , ## arg); \ + } while (0) + +#define isert_warn(fmt, arg...) \ + do { \ + if (unlikely(isert_debug_level > 0)) \ + pr_warn(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define isert_info(fmt, arg...) \ + do { \ + if (unlikely(isert_debug_level > 1)) \ + pr_info(PFX "%s: " fmt, \ + __func__ , ## arg); \ + } while (0) + +#define isert_err(fmt, arg...) \ + pr_err(PFX "%s: " fmt, __func__ , ## arg) + +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + \ + sizeof(struct iscsi_hdr)) +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISERT_MAX_RX_MISC_PDUS 6 /* + * NOOP_OUT(2), TEXT(1), + * SCSI_TMFUNC(2), LOGOUT(1) + */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \ + ISERT_MAX_TX_MISC_PDUS + \ + ISERT_MAX_RX_MISC_PDUS) + +/* + * RX size is default of 8k plus headers, but data needs to align to + * 512 boundary, so use 1024 to have the extra space for alignment. 
+ */ +#define ISER_RX_SIZE (ISCSI_DEF_MAX_RECV_SEG_LEN + 1024) + +/* Minimum I/O size is 512KB */ +#define ISCSI_ISER_MIN_SG_TABLESIZE 128 + +/* Maximum support is 16MB I/O size */ +#define ISCSI_ISER_MAX_SG_TABLESIZE 4096 + +enum isert_desc_type { + ISCSI_TX_CONTROL, + ISCSI_TX_DATAIN +}; + +enum iser_conn_state { + ISER_CONN_INIT, + ISER_CONN_UP, + ISER_CONN_BOUND, + ISER_CONN_FULL_FEATURE, + ISER_CONN_TERMINATING, + ISER_CONN_DOWN, +}; + +struct iser_rx_desc { + char buf[ISER_RX_SIZE]; + u64 dma_addr; + struct ib_sge rx_sg; + struct ib_cqe rx_cqe; + bool in_use; +}; + +static inline struct iser_rx_desc *cqe_to_rx_desc(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_rx_desc, rx_cqe); +} + +static void *isert_get_iser_hdr(struct iser_rx_desc *desc) +{ + return PTR_ALIGN(desc->buf + ISER_HEADERS_LEN, 512) - ISER_HEADERS_LEN; +} + +static size_t isert_get_hdr_offset(struct iser_rx_desc *desc) +{ + return isert_get_iser_hdr(desc) - (void *)desc->buf; +} + +static void *isert_get_iscsi_hdr(struct iser_rx_desc *desc) +{ + return isert_get_iser_hdr(desc) + sizeof(struct iser_ctrl); +} + +static void *isert_get_data(struct iser_rx_desc *desc) +{ + void *data = isert_get_iser_hdr(desc) + ISER_HEADERS_LEN; + + WARN_ON((uintptr_t)data & 511); + return data; +} + +struct iser_tx_desc { + struct iser_ctrl iser_header; + struct iscsi_hdr iscsi_header; + enum isert_desc_type type; + u64 dma_addr; + struct ib_sge tx_sg[2]; + struct ib_cqe tx_cqe; + int num_sge; + struct ib_send_wr send_wr; +} __packed; + +static inline struct iser_tx_desc *cqe_to_tx_desc(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_tx_desc, tx_cqe); +} + +struct isert_cmd { + uint32_t read_stag; + uint32_t write_stag; + uint64_t read_va; + uint64_t write_va; + uint32_t inv_rkey; + u64 pdu_buf_dma; + u32 pdu_buf_len; + struct isert_conn *conn; + struct iscsi_cmd *iscsi_cmd; + struct iser_tx_desc tx_desc; + struct iser_rx_desc *rx_desc; + struct rdma_rw_ctx rw; + struct work_struct comp_work; + struct scatterlist sg; + bool ctx_init_done; + bool send_sig_pipelined; +}; + +static inline struct isert_cmd *tx_desc_to_cmd(struct iser_tx_desc *desc) +{ + return container_of(desc, struct isert_cmd, tx_desc); +} + +struct isert_device; + +struct isert_conn { + enum iser_conn_state state; + u32 responder_resources; + u32 initiator_depth; + bool pi_support; + bool sig_pipeline; + struct iser_rx_desc *login_desc; + char *login_rsp_buf; + int login_req_len; + u64 login_rsp_dma; + struct iser_rx_desc *rx_descs; + struct ib_recv_wr rx_wr[ISERT_QP_MAX_RECV_DTOS]; + struct iscsi_conn *conn; + struct list_head node; + struct completion login_comp; + struct completion login_req_comp; + struct iser_tx_desc login_tx_desc; + struct rdma_cm_id *cm_id; + struct ib_qp *qp; + struct ib_cq *cq; + u32 cq_size; + struct isert_device *device; + struct mutex mutex; + struct kref kref; + struct work_struct release_work; + bool logout_posted; + bool snd_w_inv; + wait_queue_head_t rem_wait; + bool dev_removed; +}; + +struct isert_device { + bool pi_capable; + bool sig_pipeline; + int refcount; + struct ib_device *ib_device; + struct ib_pd *pd; + struct isert_comp *comps; + int comps_used; + struct list_head dev_node; +}; + +struct isert_np { + struct iscsi_np *np; + struct semaphore sem; + struct rdma_cm_id *cm_id; + struct mutex mutex; + struct list_head accepted; + struct list_head pending; +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert_dummy.c 
b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert_dummy.c new file mode 100644 index 0000000..3761289 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/ib_isert_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "ib_isert" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_isert dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init isert_init(void) +{ + return 0; +} + +static void __exit isert_cleanup(void) +{ +} + +module_init(isert_init); +module_exit(isert_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/isert_spec_ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/isert_spec_ new file mode 100644 index 0000000..894da68 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/isert_spec_ @@ -0,0 +1,216 @@ +# +# Copyright (c) 2014 Mellanox Technologies. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. 
+# +# + +%{!?_name: %define _name isert} +%{!?_version: %define _version 4.0} +%{!?_release: %define _release 0} + +# KMP is disabled by default +%{!?KMP: %global KMP 0} + +# take kernel version or default to uname -r +%{!?KVERSION: %global KVERSION %(uname -r)} +%global kernel_version %{KVERSION} +%global krelver %(echo -n %{KVERSION} | sed -e 's/-/_/g') +# take path to kernel sources if provided, otherwise look in default location (for non KMP rpms). +%{!?K_SRC: %global K_SRC /lib/modules/%{KVERSION}/build} + +# define release version +%{!?src_release: %global src_release %{_release}_%{krelver}} +%if "%{KMP}" != "1" +%global _release1 %{src_release} +%else +%global _release1 %{_release} +%endif +%global _kmp_rel %{_release1}%{?_kmp_build_num}%{?_dist} + +Summary: %{_name} Driver +Name: %{_name} +Version: %{_version} +Release: %{_release1}%{?_dist} +License: GPLv2 +Url: http://www.mellanox.com +Group: System Environment/Base +Source: %{_name}-%{_version}.tgz +BuildRoot: %{?build_root:%{build_root}}%{!?build_root:/var/tmp/OFED} +Vendor: Mellanox Technologies +%description +%{name} kernel modules + +# build KMP rpms? +%if "%{KMP}" == "1" +%global kernel_release() $(make -s -C %{1} kernelrelease M=$PWD) +BuildRequires: %kernel_module_package_buildreqs +%(mkdir -p %{buildroot}) +%(echo '%defattr (-,root,root)' > %{buildroot}/file_list) +%(echo '/lib/modules/%2-%1' >> %{buildroot}/file_list) +%(echo '%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*-%1.conf' >> %{buildroot}/file_list) +%{kernel_module_package -f %{buildroot}/file_list -x xen -r %{_kmp_rel} } +%else +%global kernel_source() %{K_SRC} +%global kernel_release() %{KVERSION} +%global flavors_to_build default +%endif + +# +# setup module sign scripts if paths to the keys are given +# +%global WITH_MOD_SIGN %(if ( test -f "$MODULE_SIGN_PRIV_KEY" && test -f "$MODULE_SIGN_PUB_KEY" ); \ + then \ + echo -n '1'; \ + else \ + echo -n '0'; fi) + +%if "%{WITH_MOD_SIGN}" == "1" +# call module sign script +%global __modsign_install_post \ + %{_builddir}/%{name}-%{version}/source/tools/sign-modules %{buildroot}/lib/modules/ %{kernel_source default} || exit 1 \ +%{nil} + +%global __debug_package 1 +%global buildsubdir %{name}-%{version} +# Disgusting hack alert! We need to ensure we sign modules *after* all +# invocations of strip occur, which is in __debug_install_post if +# find-debuginfo.sh runs, and __os_install_post if not. 
+# +%global __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + %{__modsign_install_post} \ +%{nil} + +%endif # end of setup module sign scripts +# + +%if "%{_vendor}" == "suse" +%debug_package +%endif + +# set modules dir +%if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") +%if 0%{?fedora} +%global install_mod_dir updates/%{name} +%else +%global install_mod_dir extra/%{name} +%endif +%endif + +%if "%{_vendor}" == "suse" +%global install_mod_dir updates/%{name} +%endif + +%{!?install_mod_dir: %global install_mod_dir updates/%{name}} + +%prep +%setup +set -- * +mkdir source +mv "$@" source/ +mkdir obj + +%build +export EXTRA_CFLAGS='-DVERSION=\"%version\"' +export INSTALL_MOD_DIR=%{install_mod_dir} +export CONF_OPTIONS="%{configure_options}" +for flavor in %{flavors_to_build}; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + export LIB_MOD_DIR=/lib/modules/$KVER/$INSTALL_MOD_DIR + rm -rf obj/$flavor + cp -r source obj/$flavor + cd $PWD/obj/$flavor + make + cd - +done + +%install +export INSTALL_MOD_PATH=%{buildroot} +export INSTALL_MOD_DIR=%{install_mod_dir} +export PREFIX=%{_prefix} +for flavor in %flavors_to_build; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + cd $PWD/obj/$flavor + make install KERNELRELEASE=$KVER + # Cleanup unnecessary kernel-generated module dependency files. + find $INSTALL_MOD_PATH/lib/modules -iname 'modules.*' -exec rm {} \; + cd - +done + +# Set the module(s) to be executable, so that they will be stripped when packaged. +find %{buildroot} \( -type f -name '*.ko' -o -name '*ko.gz' \) -exec %{__chmod} u+x \{\} \; + +%{__install} -d %{buildroot}%{_sysconfdir}/depmod.d/ +for module in `find %{buildroot}/ -name '*.ko' -o -name '*.ko.gz' | sort` +do +ko_name=${module##*/} +mod_name=${ko_name/.ko*/} +mod_path=${module/*\/%{name}} +mod_path=${mod_path/\/${ko_name}} +%if "%{_vendor}" == "suse" + for flavor in %{flavors_to_build}; do + if [[ $module =~ $flavor ]] || [ "X%{KMP}" != "X1" ];then + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}-$flavor.conf + fi + done +%else + %if 0%{?fedora} + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %else + %if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") + echo "override ${mod_name} * weak-updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif + echo "override ${mod_name} * extra/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif +%endif +done + + +%clean +rm -rf %{buildroot} + +%post +if [ $1 -ge 1 ]; then # 1 : This package is being installed or reinstalled + /sbin/depmod %{KVERSION} +fi # 1 : closed +# END of post + +%postun +/sbin/depmod %{KVERSION} + +%if "%{KMP}" != "1" +%files +%defattr(-,root,root,-) +/lib/modules/%{KVERSION}/%{install_mod_dir}/ +%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*.conf +%endif + +%changelog +* Thu Feb 20 2014 Alaa Hleihel +- Initial packaging diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/tools/sign-modules b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/tools/sign-modules new file mode 100755 index 0000000..b790769 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/isert/tools/sign-modules @@ -0,0 +1,58 @@ +#! 
/bin/bash + +moddir=$1; shift +KBUILD=$1; shift + +SOURCES_DIR= +case "$KBUILD" in + *linux-obj*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + */usr/src/linux-*-obj/*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + *) + SOURCES_DIR=$(readlink -f ${KBUILD/build/source}) + ;; +esac +if [ ! -e "$SOURCES_DIR" ]; then + SOURCES_DIR=$KBUILD +fi + +SIGN_FILE= +if [ -e "${KBUILD}/scripts/sign-file" ]; then + SIGN_FILE="${KBUILD}/scripts/sign-file" +elif [ -e "${SOURCES_DIR}/scripts/sign-file" ]; then + SIGN_FILE="${SOURCES_DIR}/scripts/sign-file" +else + echo "Error: Sign tool does not exist at '$KBUILD' or '$SOURCES_DIR' !" >&2 + exit 1 +fi +echo "Found Sign tool at: '${SIGN_FILE}'" + +if [ ! -e "${MODULE_SIGN_PRIV_KEY}" ]; then + echo "Error: MODULE_SIGN_PRIV_KEY is not set to valid path!" >&2 + exit 1 +fi +if [ ! -e "${MODULE_SIGN_PUB_KEY}" ]; then + echo "Error: MODULE_SIGN_PUB_KEY is not set to valid path!" >&2 + exit 1 +fi + +modules=`find $moddir -name '*.ko' -o -name '*.ko.gz'` +for mod in $modules +do + dir=`dirname $mod` + file=`basename $mod` + + ${SIGN_FILE} sha256 ${MODULE_SIGN_PRIV_KEY} ${MODULE_SIGN_PUB_KEY} ${dir}/${file} + rm -f ${dir}/${file}.{sig,dig} +done + +RANDOMMOD=$(find $moddir -type f -name '*.ko' -o -name '*.ko.gz' | sort -R | tail -n 1) +if [ "~Module signature appended~" != "$(tail -c 28 $RANDOMMOD)" ]; then + echo "*** Modules are unsigned! ***" + exit 1 +fi + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/Makefile new file mode 100644 index 0000000..9808fd2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic.o + +opa_vnic-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/main.c new file mode 100644 index 0000000..e618977 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/opa_vnic/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "opa_vnic" +#define DRV_VERSION "4.2" +#define DRV_RELDATE "July 25, 2017" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("opa_vnic dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init opa_vnic_init(void) +{ + return 0; +} + +static void __exit opa_vnic_cleanup(void) +{ +} + +module_init(opa_vnic_init); +module_exit(opa_vnic_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/Makefile new file mode 100644 index 0000000..470306d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/Makefile @@ -0,0 +1,8 @@ +obj-m += rtrs-core.o +obj-m += rtrs-client.o +obj-m += rtrs-server.o + +rtrs-client-y := rtrs-clt_dummy.o +rtrs-server-y := rtrs-srv_dummy.o +rtrs-core-y := rtrs-core_dummy.o + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-clt_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-clt_dummy.c new file mode 100644 index 0000000..9f706cf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-clt_dummy.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "rtrs-clt" +#define DRV_VERSION "5.2" +#define DRV_RELDATE "Aug 25, 2020" + +MODULE_AUTHOR("Valentine Fatiev"); +MODULE_DESCRIPTION("rtrs-_clt dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init rtrs_clt_init(void) +{ + return 0; +} + +static void __exit rtrs_clt_cleanup(void) +{ +} + +module_init(rtrs_clt_init); +module_exit(rtrs_clt_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-core_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-core_dummy.c new file mode 100644 index 0000000..2c3e96c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-core_dummy.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "rtrs-core" +#define DRV_VERSION "5.2" +#define DRV_RELDATE "Aug 25, 2020" + +MODULE_AUTHOR("Valentine Fatiev"); +MODULE_DESCRIPTION("rtrs-core dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init rtrs_core_init(void) +{ + return 0; +} + +static void __exit rtrs_core_cleanup(void) +{ +} + +module_init(rtrs_core_init); +module_exit(rtrs_core_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-srv_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-srv_dummy.c new file mode 100644 index 0000000..06df1c9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/rtrs/rtrs-srv_dummy.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "rtrs-srv" +#define DRV_VERSION "5.2" +#define DRV_RELDATE "Aug 25, 2020" + +MODULE_AUTHOR("Valentine Fatiev"); +MODULE_DESCRIPTION("rtrs_srv dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init rtrs_srv_init(void) +{ + return 0; +} + +static void __exit rtrs_srv_cleanup(void) +{ +} + +module_init(rtrs_srv_init); +module_exit(rtrs_srv_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Kbuild b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Kbuild new file mode 100644 index 0000000..fe81196 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Kbuild @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0 +# Retpoline support: check if this is the right architecture and that +# the kernel does not support it already. +# Alternatively, if we are called from the main mlnx-ofa build system, +# CONFIG_RETPOLINE will be set by the configure script, however +# subdir-ccflags-y will be set by the toplevel Makefile. +ifneq (,$(findstring $(ARCH),i386 x86_64)) + ifndef CONFIG_RETPOLINE + ifneq (,$(shell awk 'BEGIN {if ($(VERSION).$(PATCHLEVEL) < 4.15) {print 1}}' . + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Module.supported b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Module.supported new file mode 100644 index 0000000..fc35064 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/Module.supported @@ -0,0 +1,2 @@ +ib_srp.ko external +scsi_transport_srp.ko external diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/_makefile_ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/_makefile_ new file mode 100755 index 0000000..69ff71b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/_makefile_ @@ -0,0 +1,187 @@ +KVER ?= $(shell uname -r) +SRC_DIR ?= $(shell pwd) +MLNX_ARCH = $(shell uname -m) +OFA_DIR ?= /usr/src/ofa_kernel/$(MLNX_ARCH) +OFA = $(shell ( test -d $(OFA_DIR)/$(KVER) && echo $(OFA_DIR)/$(KVER) ) || ( test -d /var/lib/dkms/mlnx-ofed-kernel/ && ls -d /var/lib/dkms/mlnx-ofed-kernel/*/build ) || ( echo $(OFA_DIR) )) + +ifneq ($(shell test -d $(OFA) && echo "true" || echo "" ),) +include $(OFA)/configure.mk.kernel +endif + +export KERNELRELEASE=$(KVER) +K_BUILD ?= /lib/modules/$(KVER)/build +K_OBJ ?= $(K_BUILD) +K_SRC ?= $(shell test -d /lib/modules/$(KVER)/source && echo /lib/modules/$(KVER)/source || echo $(K_BUILD)) + +EXTRA_CFLAGS += $(shell echo $(BACKPORT_INCLUDES) | sed -e 's@/var/tmp/OFED_topdir/BUILD@/usr/src@') + +autoconf_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/autoconf.h 2> /dev/null | head -1) +kconfig_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/kconfig.h 2> /dev/null | head -1) + +ifneq ($(kconfig_h),) +KCONFIG_H = -include $(kconfig_h) +endif + +V ?= 0 + +# GCC earlier than 4.6.0 will build modules which require 'mcount', +# and this symbol will not be available in the kernel if the kernel was +# compiled with GCC 4.6.0 and above. +# therefore, to prevent unknown symbol issues we disable function tracing. +# +CC = $(CROSS_COMPILE)gcc +CPP = $(CC) -E + +CPP_MAJOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' 
-f1) +CPP_MINOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f2) +CPP_PATCH := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f3) +# Assumes that major, minor, and patch cannot exceed 999 +CPP_VERS := $(shell expr 0$(CPP_MAJOR) \* 1000000 + 0$(CPP_MINOR) \* 1000 + 0$(CPP_PATCH)) +compile_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/compile.h 2> /dev/null | head -1) +ifneq ($(compile_h),) +KERNEL_GCC_MAJOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f1) +KERNEL_GCC_MINOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f2) +KERNEL_GCC_PATCH := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f3) +KERNEL_GCC_VER := $(shell expr 0$(KERNEL_GCC_MAJOR) \* 1000000 + 0$(KERNEL_GCC_MINOR) \* 1000 + 0$(KERNEL_GCC_PATCH)) +ifneq ($(shell if [ $(CPP_VERS) -lt 4006000 ] && [ $(KERNEL_GCC_VER) -ge 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC newer than 4.6.0, while the current GCC is older than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) +override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +ifneq ($(shell if [ $(CPP_VERS) -ge 4006000 ] && [ $(KERNEL_GCC_VER) -lt 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC older than 4.6.0, while the current GCC is newer than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) +override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +endif + +ifneq ($(shell if (echo $(KVER) | grep -qE 'uek'); then \ + echo "YES"; else echo ""; fi),) +override WITH_MAKE_PARAMS += ctf-dir=$(CWD)/.ctf +endif + +# This is an UGLY compat for MOFED ib_query_gid with attrs +ifneq ($(shell test -d $(OFA) && echo "true" || echo "" ),) +EXTRA_CFLAGS += -DHAVE_IB_QUERY_GID_ATTRS +endif + +name := srp +VERSION=$(shell grep "define _version" $(name).spec | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +RELEASE=$(shell grep "define _release" $(name).spec | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') +PACKAGE := $(name)-$(VERSION) +SHELL = /bin/bash +rpmspec := $(name).spec +rpmroot = $(PWD)/rpm-dist/ +rpmopts = --nodeps --buildroot='$(rpmroot)/_rpm' --define '_source_filedigest_algorithm md5' --define '_binary_filedigest_algorithm md5' +rpmmacros =\ + --define='_topdir $(rpmroot)'\ + --define='_rpmdir $(rpmroot)'\ + --define='_srcrpmdir $(rpmroot)'\ + --define='_sourcedir $(rpmroot)'\ + --define='_specdir $(PWD)' +override WITH_MAKE_PARAMS += KBUILD_EXTRA_SYMBOLS=$(OFA)/Module.symvers + +LINUXINCLUDE=\ + $(EXTRA_CFLAGS) \ + -include $(autoconf_h) \ + $(KCONFIG_H) \ + -include $(OFA)/include/linux/compat-2.6.h \ + -I$(PWD) \ + -I$(OFA)/include \ + -I$(OFA)/include/uapi \ + $(BACKPORT_INCLUDES) \ + $$(if $$(CONFIG_XEN),-D__XEN_INTERFACE_VERSION__=$$(CONFIG_XEN_INTERFACE_VERSION)) \ + $$(if $$(CONFIG_XEN),-I$$(srctree)/arch/x86/include/mach-xen) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + -Iinclude \ + -I$$(srctree)/arch/$$(SRCARCH)/include/uapi \ + -Iarch/$$(SRCARCH)/include/generated/uapi \ + -I$$(srctree)/include \ + -I$$(srctree)/include/uapi \ + -Iinclude/generated/uapi \ + $$(if $$(KBUILD_SRC),-Iinclude2 -I$$(srctree)/include) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + # + +default: prep 
+ifneq ($(shell test -d $(OFA) && echo "true" || echo "" ),) +# compile with ofed driver + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) $(WITH_MAKE_PARAMS) \ + CONFIG_SCSI_SRP_ATTRS_STANDALONE=m \ + CONFIG_SCSI_SRP_ATTRS=m \ + CONFIG_INFINIBAND_SRP=m \ + CONFIG_INFINIBAND_SRP_DUMMY= \ + CONFIG_DTRACE= \ + CONFIG_CTF= \ + LINUXINCLUDE='$(LINUXINCLUDE)' \ + modules +else +# compile with inbox driver + make EXTRA_CFLAGS="$(EXTRA_CFLAGS)" -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) \ + CONFIG_SCSI_SRP_ATTRS_STANDALONE=m \ + CONFIG_SCSI_SRP_ATTRS=m \ + CONFIG_INFINIBAND_SRP=m \ + CONFIG_INFINIBAND_SRP_DUMMY= \ + modules +endif + +install: + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) $(WITH_MAKE_PARAMS) modules_install + if [ ! -n "$(INSTALL_MOD_PATH)" ]; then /sbin/depmod $(KVER);fi; + +rpmcheck: + @which rpmbuild &> /dev/null; \ + if [ $$? -ne 0 ]; then \ + echo "*** This make target requires an rpm-based linux distribution."; \ + (exit 1); exit 1; \ + fi + -mkdir -p $(rpmroot)/BUILD + +srcrpm: dist rpmcheck $(rpmspec) + -rpmbuild -bs --define 'src_release $(RELEASE)' $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +binrpm: rpmcheck $(rpmspec) + -rpmbuild -bb $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +dist: prep + mkdir -p $(rpmroot)/$(PACKAGE)/ + cp {$(rpmspec),Kconfig,makefile,Kbuild,dkms.conf,Module.supported} $(rpmroot)/$(PACKAGE)/ + cp common.postinst $(rpmroot)/$(PACKAGE)/ + cp *.c $(rpmroot)/$(PACKAGE)/ + cp *.h $(rpmroot)/$(PACKAGE)/ + cp -r debian $(rpmroot)/$(PACKAGE)/ + cp -r tools $(rpmroot)/$(PACKAGE)/ + cp -r $(PWD)/scsi $(rpmroot)/$(PACKAGE)/ + cd $(rpmroot) && tar czf $(PACKAGE).tgz $(PACKAGE) + cd $(rpmroot) && tar czf $(name)_$(VERSION).orig.tar.gz $(PACKAGE) + +prep: +ifeq ($(shell test -d $(PWD)/scsi && echo "true" || echo ""),) + -cp -r $(PWD)/../../../scsi $(PWD) + -cp -r $(PWD)/../../../../include/scsi/* $(PWD)/scsi + -rm -rf $(PWD)/scsi/scsi.h + -rm -rf $(PWD)/scsi/scsi_device.h + -rm -rf $(PWD)/scsi/cxgbi + -sed -i 's@@"scsi/scsi_transport_srp.h"@g' $(PWD)/ib_srp.c +endif + +clean: + rm -f *.o + rm -f *.ko *.ko.gz + rm -f *.mod.c + rm -f Module*.symvers modules*.order + +distclean: clean + @rm -rf $(PWD)/rpm-dist + rm -f makefile Makefile *.spec + rm -rf $(PWD)/scsi + +all: clean distclean dist srcrpm binrpm diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/autogen.sh b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/autogen.sh new file mode 100755 index 0000000..1a1752c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/autogen.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +name=srp +version=$(grep "define _version" ${name}_spec_ | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +release=$(grep "define _release" ${name}_spec_ | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') + +/bin/cp -f ${name}_spec_ ${name}.spec +/bin/cp -f _makefile_ makefile +/bin/sed -i -r "s/^$name \(([0-9.-]+)\) (.*)/$name \($version-$release\) \2/" debian/changelog + +if ! 
(grep -q "CONFIG_SCSI_SRP_ATTRS_STANDALONE" Kbuild 2>/dev/null); then + echo "obj-\$(CONFIG_SCSI_SRP_ATTRS_STANDALONE) += scsi/" >> Kbuild +fi diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/common.postinst b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/common.postinst new file mode 100755 index 0000000..bbf9aad --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/common.postinst @@ -0,0 +1,296 @@ +#!/bin/sh +# Copyright (C) 2002-2005 Flavio Stanchina +# Copyright (C) 2005-2006 Aric Cyr +# Copyright (C) 2007 Mario Limonciello +# Copyright (C) 2009 Alberto Milone + +set -e + +uname_s=$(uname -s) + +_get_kernel_dir() { + KVER=$1 + case ${uname_s} in + Linux) DIR="/lib/modules/$KVER/build" ;; + GNU/kFreeBSD) DIR="/usr/src/kfreebsd-headers-$KVER/sys" ;; + esac + echo $DIR +} + +_check_kernel_dir() { + DIR=$(_get_kernel_dir $1) + case ${uname_s} in + Linux) test -e $DIR/include ;; + GNU/kFreeBSD) test -e $DIR/kern && test -e $DIR/conf/kmod.mk ;; + *) return 1 ;; + esac + return $? +} + +# Check the existence of a kernel named as $1 +_is_kernel_name_correct() { + CORRECT="no" + KERNEL_NAME=$1 + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + if [ "${KERNEL}" = "${KERNEL_NAME}" ]; then + CORRECT="yes" + break + fi + done + + echo $CORRECT +} + + +# Get the most recent kernel on Debian based systems. This keeps +# into account both the version and the ABI. If the current kernel +# is the most recent kernel then the function will print a null string. +_get_newest_kernel_debian() { + NEWEST_KERNEL= + NEWEST_VERSION= + NEWEST_ABI= + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + KERNEL_VERSION=${KERNEL%%-*} + ABI=${KERNEL#*-} + ABI=${ABI%%-*} + + if [ -z "$NEWEST_KERNEL" ]; then + # The 1st time get a version which is bigger than $1 + COMPARE_TO=$1 + else + # Get the biggest version + COMPARE_TO="$NEWEST_VERSION-$NEWEST_ABI" + fi + + # if $kernel is greater than $COMPARE_TO + if [ `dpkg --compare-versions "$KERNEL_VERSION-$ABI" gt "$COMPARE_TO" && echo "yes" || \ + echo "no"` = "yes" ]; then + NEWEST_KERNEL=$KERNEL + NEWEST_VERSION=$KERNEL_VERSION + NEWEST_ABI=$ABI + fi + done + + echo "$NEWEST_KERNEL" +} + +# Get the most recent kernel in Rhel based systems. If the current kernel +# is the most recent kernel then the function will print a null string. +_get_newest_kernel_rhel() { + NEWEST_KERNEL= + + LAST_INSTALLED_KERNEL=$(rpm -q --whatprovides kernel --last | grep kernel -m1 | cut -f1 -d' ') + + LIK_FORMATTED_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{VERSION}-%{RELEASE}.%{ARCH}\n") + + if [ `echo $LIK_FORMATTED_NAME | grep 2.6 >/dev/null` ]; then + # Fedora and Suse + NEWEST_KERNEL=$LIK_FORMATTED_NAME + else + # Hack for Mandriva where $LIK_FORMATTED_NAME is broken + LIK_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{NAME}\n") + LIK_TYPE=${LIK_NAME#kernel-} + LIK_TYPE=${LIK_TYPE%%-*} + LIK_STRIPPED=${LIK_NAME#kernel-} + LIK_STRIPPED=${LIK_STRIPPED#$LIK_TYPE-} + LIK_STRIPPED_BASE=${LIK_STRIPPED%%-*} + LIK_STRIPPED_END=${LIK_STRIPPED#$LIK_STRIPPED_BASE-} + LIK_FINAL=$LIK_STRIPPED_BASE-$LIK_TYPE-$LIK_STRIPPED_END + + NEWEST_KERNEL=$LIK_FINAL + fi + + echo $NEWEST_KERNEL +} + +# Get the newest kernel on Debian and Rhel based systems. 
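+# dpkg-based detection is tried first (rpm may also be installed on Debian
+# based distros), then rpm; the result is discarded (an empty string is
+# printed) unless it matches a kernel that is actually installed.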
+get_newest_kernel() { + NEWEST_KERNEL= + # Try Debian first as rpm can be installed in Debian based distros + if [ -e /usr/bin/dpkg ]; then + # If DEB based + CURRENT_KERNEL=$1 + CURRENT_VERSION=${CURRENT_KERNEL%%-*} + CURRENT_ABI=${CURRENT_KERNEL#*-} + CURRENT_FLAVOUR=${CURRENT_ABI#*-} + CURRENT_ABI=${CURRENT_ABI%%-*} + NEWEST_KERNEL=$(_get_newest_kernel_debian "$CURRENT_VERSION-$CURRENT_ABI") + + elif [ `which rpm >/dev/null` ]; then + # If RPM based + NEWEST_KERNEL=$(_get_newest_kernel_rhel) + fi + + # Make sure that kernel name that we extracted corresponds to an installed + # kernel + if [ -n "$NEWEST_KERNEL" ] && [ `_is_kernel_name_correct $NEWEST_KERNEL` = "no" ]; then + NEWEST_KERNEL= + fi + + echo $NEWEST_KERNEL +} + +NAME=$1 +VERSION=$2 +TARBALL_ROOT=$3 +ARCH=$4 +UPGRADE=$5 + +if [ -z "$NAME" ] || [ -z "$VERSION" ]; then + echo "Need NAME, and VERSION defined" + echo "ARCH is optional" + exit 1 +fi + +KERNELS=$(ls /lib/modules/ 2>/dev/null || true) +CURRENT_KERNEL=$(uname -r) + +#We never want to keep an older version side by side to prevent conflicts +if [ -e "/var/lib/dkms/$NAME/$VERSION" ]; then + echo "Removing old $NAME-$VERSION DKMS files..." + dkms remove -m $NAME -v $VERSION --all +fi + +#Load new files, by source package and by tarball +if [ -f "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz" ]; then + if ! dkms ldtarball --archive "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz"; then + echo "" + echo "" + echo "Unable to load DKMS tarball $TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz." + echo "Common causes include: " + echo " - You must be using DKMS 2.1.0.0 or later to support binaries only" + echo " distribution specific archives." + echo " - Corrupt distribution specific archive" + echo "" + echo "" + exit 2 + fi +elif [ -d "/usr/src/$NAME-$VERSION" ]; then + echo "Loading new $NAME-$VERSION DKMS files..." + dkms add -m $NAME -v $VERSION > /dev/null +fi + +# On 1st installation, let us look for a directory +# in /lib/modules which matches `uname -r`. If none +# is found it is possible that buildd is being used +# and that uname -r is giving us the name of the +# kernel used by the buildd machine. +# +# If this is the case we try to build the kernel +# module for each kernel which has a directory in +# /lib/modules. Furthermore we will have to tell +# DKMS which architecture it should build the module +# for (e.g. if the buildd machine is using a +# 2.6.24-23-xen 64bit kernel). +# +# NOTE: if the headers are not installed then the +# module won't be built, as usual +if [ -z "$UPGRADE" ]; then + echo "First Installation: checking all kernels..." + for KERNEL in $KERNELS; do + if [ ${KERNEL} = ${CURRENT_KERNEL} ]; then + # Kernel found + KERNELS=$CURRENT_KERNEL + break + fi + done +else + KERNELS=$CURRENT_KERNEL +fi + +# Here we look for the most recent kernel so that we can +# build the module for it (in addition to doing it for the +# current kernel. 
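+# get_newest_kernel prints an empty string when no newer installed kernel is
+# found, in which case only the current kernel is built below.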
+NEWEST_KERNEL=$(get_newest_kernel "$KERNELS") + +# If the current kernel doesn't come from the host of a chroot +if [ `_is_kernel_name_correct $CURRENT_KERNEL` = "yes" ]; then + # See if it's worth building the module for both the newest kernel + # and for the current kernel + if [ -n "$NEWEST_KERNEL" ] && [ ${CURRENT_KERNEL} != ${NEWEST_KERNEL} ]; then + echo "Building for $CURRENT_KERNEL and $NEWEST_KERNEL" + KERNELS="$CURRENT_KERNEL $NEWEST_KERNEL" + else + echo "Building only for $CURRENT_KERNEL" + fi +# The current kernel is not useful as it's the host's +else + echo "It is likely that $CURRENT_KERNEL belongs to a chroot's host" + + # Let's use only the newest kernel + if [ -n "$NEWEST_KERNEL" ]; then + KERNELS="$NEWEST_KERNEL" + echo "Building only for $NEWEST_KERNEL" + fi +fi + +if [ -n "$ARCH" ]; then + if which lsb_release >/dev/null && [ $(lsb_release -s -i) = "Ubuntu" ]; then + case $ARCH in + amd64) + ARCH="x86_64" + ;; + lpia|i?86) + ARCH="i686" + ;; + esac + fi + echo "Building for architecture $ARCH" + ARCH="-a $ARCH" +fi + +for KERNEL in $KERNELS; do + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + if [ `echo $KERNEL | grep -c "BOOT"` -gt 0 ]; then + echo "" + echo "Module build and install for $KERNEL was skipped as " + echo "it is a BOOT variant" + continue + fi + + + #if the module isn't yet built, try to build it + if [ `echo $dkms_status | grep -c ": built"` -eq 0 ]; then + if [ ! -L /var/lib/dkms/$NAME/$VERSION/source ]; then + echo "This package appears to be a binaries-only package" + echo " you will not be able to build against kernel $KERNEL" + echo " since the package source was not provided" + continue + fi + if _check_kernel_dir $KERNEL; then + echo "Building initial module for $KERNEL" + set +e + dkms build -m $NAME -v $VERSION -k $KERNEL $ARCH > /dev/null + rc=$? + case $rc in + 9) + set -e + echo "Skipped." + continue + ;; + 0) + set -e + echo "Done." + ;; + *) + exit $rc + ;; + esac + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + else + echo "Module build for the currently running kernel was skipped since the" + echo "kernel source for this kernel does not seem to be installed." + fi + fi + + #if the module is built (either pre-built or just now), install it + if [ `echo $dkms_status | grep -c ": built"` -eq 1 ] && + [ `echo $dkms_status | grep -c ": installed"` -eq 0 ]; then + dkms install -m $NAME -v $VERSION -k $KERNEL $ARCH --force + fi +done + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/changelog b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/changelog new file mode 100644 index 0000000..9c3896c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/changelog @@ -0,0 +1,5 @@ +srp (0.0-0) unstable; urgency=low + + * Initial release. 
+ + -- Alaa Hleihel Sun, 16 Feb 2014 17:30:53 +0200 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/compat b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/compat new file mode 100644 index 0000000..45a4fb7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/compat @@ -0,0 +1 @@ +8 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control new file mode 100644 index 0000000..bd22fe2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control @@ -0,0 +1,17 @@ +Source: srp +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, dkms +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: srp-dkms +Section: kernel +Architecture: all +Depends: dkms, make, mlnx-ofed-kernel-dkms, ${misc:Depends} +Recommends: linux-headers-arm64 | linux-headers-powerpc | linux-headers-ppc64 | linux-headers-ppc64le | linux-headers-amd64 | linux-headers | linux-headers-generic +Description: DKMS support fo srp kernel modules + This package provides integration with the DKMS infrastructure for automatically building out of tree kernel modules. + . + This package contains the source to be built with dkms. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control.no_dkms b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control.no_dkms new file mode 100644 index 0000000..9ecfd14 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/control.no_dkms @@ -0,0 +1,14 @@ +Source: srp +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, make +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: srp-modules +Section: kernel +Architecture: any +Depends: mlnx-ofed-kernel-modules, ${misc:Depends} +Description: srp kernel modules + This package provides the binary code for the srp kernel modules. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/copyright b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/copyright new file mode 100644 index 0000000..53aa878 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/copyright @@ -0,0 +1,19 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ + +Files: * +Copyright: Copyright 2017 Mellanox Technologies +License: GPL-2 + Mellanox OFED (MLNX_OFED) Software distributed under the terms of the GNU General Public License ("GPL") version 2 as published by the Free Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/rules b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/rules new file mode 100755 index 0000000..1586a11 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/rules @@ -0,0 +1,110 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. +# +# This version is for a hypothetical package that can build a kernel modules +# architecture-dependant package via make-kpkg, as well as an +# architecture-independent module source package, and other packages +# either dep/indep for things like common files or userspace components +# needed for the kernel modules. + +# Uncomment this to turn on verbose mode. +#export DH_VERBOSE=1 + +WITH_DKMS ?= 1 +WITH_MOD_SIGN ?= 0 +MLXNUMC = $(shell grep ^processor /proc/cpuinfo | wc -l) +NJOBS ?= $(shell if [ $(MLXNUMC) -lt 16 ]; then echo $(MLXNUMC); else echo 16; fi) + +pname:=srp +psource:=$(pname)-source +ifeq ($(WITH_DKMS),1) +pdkms:=$(pname)-dkms +else +pdkms:=$(pname)-modules +endif + +pversion := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-.\+/\1/p') +prel := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-\(.\+\)/\2/p') + +export INSTALL_MOD_DIR:=updates +export INSTALL_MOD_PATH:=$(CURDIR)/debian/$(pdkms) + +DIST_NAME := $(shell lsb_release -si) +DIST_RELEASE := $(DIST_NAME)/$(shell lsb_release -sc) + + +KVER ?= $(shell uname -r) +KVER1 = $(shell echo $(KVER) | sed -e 's/_/-/g') +K_BUILD ?= "/lib/modules/$(KVER)/build" + +%: +ifeq ($(WITH_DKMS),1) + dh $@ --with dkms +else + dh $@ +endif + +override_dh_auto_clean: + +override_dh_auto_configure: + +override_dh_auto_build: +ifneq ($(WITH_DKMS),1) + @echo Building for $(KVER) + make clean || true + make -j$(NJOBS) KVER=$(KVER) K_BUILD=$(K_BUILD) +endif + +override_dh_auto_test: + +override_dh_auto_install: +ifneq ($(WITH_DKMS),1) + make install INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) KERNELRELEASE=$(KVER) KVER=$(KVER) K_BUILD=$(K_BUILD) + find $(INSTALL_MOD_PATH) \( -type f -a -name "modules.*" \) -delete +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif + + # For dkms +ifeq ($(WITH_DKMS),1) + dh_installdirs -p$(pdkms) usr/src/$(pname)-$(pversion) + cp Kconfig debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp makefile debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp Kbuild debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp dkms.conf debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp common.postinst debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp *.c debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp *.h debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -r scsi debian/$(pdkms)/usr/src/$(pname)-$(pversion) + + # Force DKMS to install our modules. + # This is mostly needed for modules that do not have a version number info, as DKMS + # will compare their srcversion field, which does not really say which module is newer. 
+ dh_installdirs -p$(pdkms) usr/share/dkms/modules_to_force_install/ + echo "$(pname)" > debian/$(pdkms)/usr/share/dkms/modules_to_force_install/$(pname).force +endif + +override_dh_installinit: + + +ifneq ($(WITH_DKMS),1) +override_dh_gencontrol: + dh_gencontrol -- -v$(pversion)-$(prel).kver.$(KVER1) +endif + +ifneq ($(MLNX_KO_NO_STRIP),1) +ifneq ($(WITH_DKMS),1) +override_dh_strip: + dh_strip + find debian -name '*.ko' | xargs strip -g +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif +endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/source/format b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.postinst b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.postinst new file mode 100755 index 0000000..1de51be --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.postinst @@ -0,0 +1,53 @@ +#!/bin/sh +set -e + +# Get the package version +NAME=srp +PACKAGE_NAME=$NAME-dkms +CVERSION=`dpkg-query -W -f='${Version}' $PACKAGE_NAME | awk -F "-" '{print $1}' | cut -d\: -f2` +ARCH=`uname -m` + +dkms_configure () { + POSTINST="/usr/src/$NAME-$CVERSION/common.postinst" + if [ -f "$POSTINST" ]; then + "$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2" + return $? + fi + echo "WARNING: $POSTINST does not exist." >&2 + echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2 + echo "built with legacy DKMS support." >&2 + echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2 + echo "support or upgrade DKMS to a more current version." >&2 + return 1 +} + +case "$1" in + configure) + dkms_configure + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# update openib.conf +init_conf_file=/etc/infiniband/openib.conf +if [ -e $init_conf_file ] && + ! grep -q 'SRP_LOAD=' $init_conf_file; then + cat <>$init_conf_file +# Load SRP module +SRP_LOAD=no +EOF +fi + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. 
+ +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.prerm b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.prerm new file mode 100755 index 0000000..4869d13 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-dkms.prerm @@ -0,0 +1,21 @@ +#!/bin/sh +set -e + +# Get the package version +package=srp +version=`dpkg-query -W -f='${Version}' "$package-dkms" \ + | sed -e 's/[+-].*//'` + +dkms remove -m "$package" -v "$version" --all || true + +# update openib.conf +init_conf_file=/etc/infiniband/openib.conf +if [ -e $init_conf_file ] && + grep -q 'SRP_LOAD=' $init_conf_file; then + sed -i '/\/d' $init_conf_file + sed -i '/\/d' $init_conf_file +fi + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.postinst b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.postinst new file mode 100755 index 0000000..652e933 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.postinst @@ -0,0 +1,16 @@ +#!/bin/sh +set -e + +# update openib.conf +init_conf_file=/etc/infiniband/openib.conf +if [ -e $init_conf_file ] && + ! grep -q 'SRP_LOAD=' $init_conf_file; then + cat <>$init_conf_file +# Load SRP module +SRP_LOAD=no +EOF +fi + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.prerm b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.prerm new file mode 100755 index 0000000..0f1f14a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/debian/srp-modules.prerm @@ -0,0 +1,15 @@ +#!/bin/sh +set -e + + +# update openib.conf +init_conf_file=/etc/infiniband/openib.conf +if [ -e $init_conf_file ] && + grep -q 'SRP_LOAD=' $init_conf_file; then + sed -i '/\/d' $init_conf_file + sed -i '/\/d' $init_conf_file +fi + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/dkms.conf b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/dkms.conf new file mode 100644 index 0000000..a24b718 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/dkms.conf @@ -0,0 +1,22 @@ +# DKMS module name and version +PACKAGE_NAME="srp" +PACKAGE_VERSION="4.0" + +kernelver=${kernelver:-$(uname -r)} +kernel_source_dir=${kernel_source_dir:-/lib/modules/$kernelver/build} + +# Module name, source and destination directories, and build command-line +BUILT_MODULE_NAME[0]="ib_srp" +BUILT_MODULE_LOCATION[0]="./" +DEST_MODULE_LOCATION[0]="/kernel/../updates/" +BUILT_MODULE_NAME[1]="scsi_transport_srp" +BUILT_MODULE_LOCATION[1]="./scsi/" +DEST_MODULE_LOCATION[1]="/kernel/../updates/" +MAKE="make -j`MLXNUMC=$(grep ^processor /proc/cpuinfo | wc -l) && echo $(($MLXNUMC<16?$MLXNUMC:16))` KVER=$kernelver K_BUILD=$kernel_source_dir" + +# Cleanup command-line +CLEAN="make clean" + +# disable autoinstall since this module depends on mlnx-ofed-kernel-dkms +# mlnx-ofed-kernel-dkms will build this module on POST_INSTALL +AUTOINSTALL= diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.c new file mode 100644 index 0000000..2b84fd1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.c @@ -0,0 +1,4157 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "ib_srp.h" + +#define DRV_NAME "ib_srp" +#define PFX DRV_NAME ": " + +MODULE_AUTHOR("Roland Dreier"); +MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); +MODULE_LICENSE("Dual BSD/GPL"); + +#if !defined(CONFIG_DYNAMIC_DEBUG) +#define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) +#define DYNAMIC_DEBUG_BRANCH(descriptor) false +#endif + +static unsigned int srp_sg_tablesize; +static unsigned int cmd_sg_entries; +static unsigned int indirect_sg_entries; +static bool allow_ext_sg; +static bool register_always = true; +static bool never_register; +static int topspin_workarounds = 1; + +module_param(srp_sg_tablesize, uint, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); + +module_param(cmd_sg_entries, uint, 0444); +MODULE_PARM_DESC(cmd_sg_entries, + "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); + +module_param(indirect_sg_entries, uint, 0444); +MODULE_PARM_DESC(indirect_sg_entries, + "Default max number of gather/scatter entries (default is 12, max is " __stringify(SG_MAX_SEGMENTS) ")"); + +module_param(allow_ext_sg, bool, 0444); +MODULE_PARM_DESC(allow_ext_sg, + "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)"); + +module_param(topspin_workarounds, int, 0444); +MODULE_PARM_DESC(topspin_workarounds, + "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); + +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +module_param(never_register, bool, 0444); +MODULE_PARM_DESC(never_register, "Never register memory"); + +static const struct kernel_param_ops srp_tmo_ops; + +static int srp_reconnect_delay = 20; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + +static int srp_fast_io_fail_tmo = 15; 
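+/*
+ * Like reconnect_delay above and dev_loss_tmo below, this parameter is
+ * handled by srp_tmo_ops: srp_tmo_set() revalidates the three timeouts as a
+ * group whenever one of them is written, and srp_tmo_get() reports a
+ * negative value as "off".
+ */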
+module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 600; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. \"off\" means that" + " this functionality is disabled."); + +static bool srp_use_imm_data = true; +module_param_named(use_imm_data, srp_use_imm_data, bool, 0644); +MODULE_PARM_DESC(use_imm_data, + "Whether or not to request permission to use immediate data during SRP login."); + +static unsigned int srp_max_imm_data = 8 * 1024; +module_param_named(max_imm_data, srp_max_imm_data, uint, 0644); +MODULE_PARM_DESC(max_imm_data, "Maximum immediate data size."); + +static unsigned ch_count; +module_param(ch_count, uint, 0444); +MODULE_PARM_DESC(ch_count, + "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA."); + +static int srp_add_one(struct ib_device *device); +static void srp_remove_one(struct ib_device *device, void *client_data); +static void srp_rename_dev(struct ib_device *device, void *client_data); +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname); +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event); +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); + +static struct scsi_transport_template *ib_srp_transport_template; +static struct workqueue_struct *srp_remove_wq; + +static struct ib_client srp_client = { + .name = "srp", + .add = srp_add_one, + .remove = srp_remove_one, + .rename = srp_rename_dev +}; + +static struct ib_sa_client srp_sa_client; + +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sysfs_emit(buffer, "%d\n", tmo); + else + return sysfs_emit(buffer, "off\n"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + res = srp_parse_tmo(&tmo, val); + if (res) + goto out; + + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static const struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + +static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) +{ + return (struct srp_target_port *) host->hostdata; +} + +static const char *srp_target_info(struct Scsi_Host *host) +{ + return host_to_target(host)->target_name; +} + +static int 
srp_target_is_topspin(struct srp_target_port *target) +{ + static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad }; + static const u8 cisco_oui[3] = { 0x00, 0x1b, 0x0d }; + + return topspin_workarounds && + (!memcmp(&target->ioc_guid, topspin_oui, sizeof topspin_oui) || + !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui)); +} + +static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, + gfp_t gfp_mask, + enum dma_data_direction direction) +{ + struct srp_iu *iu; + + iu = kmalloc(sizeof *iu, gfp_mask); + if (!iu) + goto out; + + iu->buf = kzalloc(size, gfp_mask); + if (!iu->buf) + goto out_free_iu; + + iu->dma = ib_dma_map_single(host->srp_dev->dev, iu->buf, size, + direction); + if (ib_dma_mapping_error(host->srp_dev->dev, iu->dma)) + goto out_free_buf; + + iu->size = size; + iu->direction = direction; + + return iu; + +out_free_buf: + kfree(iu->buf); +out_free_iu: + kfree(iu); +out: + return NULL; +} + +static void srp_free_iu(struct srp_host *host, struct srp_iu *iu) +{ + if (!iu) + return; + + ib_dma_unmap_single(host->srp_dev->dev, iu->dma, iu->size, + iu->direction); + kfree(iu->buf); + kfree(iu); +} + +static void srp_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %s (%d)\n", + ib_event_msg(event->event), event->event); +} + +static int srp_init_ib_qp(struct srp_target_port *target, + struct ib_qp *qp) +{ + struct ib_qp_attr *attr; + int ret; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev, + target->srp_host->port, + be16_to_cpu(target->ib_cm.pkey), + &attr->pkey_index); + if (ret) + goto out; + + attr->qp_state = IB_QPS_INIT; + attr->qp_access_flags = (IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + attr->port_num = target->srp_host->port; + + ret = ib_modify_qp(qp, attr, + IB_QP_STATE | + IB_QP_PKEY_INDEX | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT); + +out: + kfree(attr); + return ret; +} + +static int srp_new_ib_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct ib_cm_id *new_cm_id; + + new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev, + srp_ib_cm_handler, ch); + if (IS_ERR(new_cm_id)) + return PTR_ERR(new_cm_id); + + if (ch->ib_cm.cm_id) + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = new_cm_id; + if (rdma_cap_opa_ah(target->srp_host->srp_dev->dev, + target->srp_host->port)) + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_OPA; + else + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_IB; + ch->ib_cm.path.sgid = target->sgid; + ch->ib_cm.path.dgid = target->ib_cm.orig_dgid; + ch->ib_cm.path.pkey = target->ib_cm.pkey; + ch->ib_cm.path.service_id = target->ib_cm.service_id; + + return 0; +} + +static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct rdma_cm_id *new_cm_id; + int ret; + + new_cm_id = rdma_create_id(target->net, srp_rdma_cm_handler, ch, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(new_cm_id)) { + ret = PTR_ERR(new_cm_id); + new_cm_id = NULL; + goto out; + } + + init_completion(&ch->done); + ret = rdma_resolve_addr(new_cm_id, target->rdma_cm.src_specified ? 
+ &target->rdma_cm.src.sa : NULL, + &target->rdma_cm.dst.sa, + SRP_PATH_REC_TIMEOUT_MS); + if (ret) { + pr_err("No route available from %pISpsc to %pISpsc (%d)\n", + &target->rdma_cm.src, &target->rdma_cm.dst, ret); + goto out; + } + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + goto out; + + ret = ch->status; + if (ret) { + pr_err("Resolving address %pISpsc failed (%d)\n", + &target->rdma_cm.dst, ret); + goto out; + } + + swap(ch->rdma_cm.cm_id, new_cm_id); + +out: + if (new_cm_id) + rdma_destroy_id(new_cm_id); + + return ret; +} + +static int srp_new_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? srp_new_rdma_cm_id(ch) : + srp_new_ib_cm_id(ch); +} + +/** + * srp_destroy_fr_pool() - free the resources owned by a pool + * @pool: Fast registration pool to be destroyed. + */ +static void srp_destroy_fr_pool(struct srp_fr_pool *pool) +{ + int i; + struct srp_fr_desc *d; + + if (!pool) + return; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + if (d->mr) + ib_dereg_mr(d->mr); + } + kfree(pool); +} + +/** + * srp_create_fr_pool() - allocate and initialize a pool for fast registration + * @device: IB device to allocate fast registration descriptors for. + * @pd: Protection domain associated with the FR descriptors. + * @pool_size: Number of descriptors to allocate. + * @max_page_list_len: Maximum fast registration work request page list length. + */ +static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, + struct ib_pd *pd, int pool_size, + int max_page_list_len) +{ + struct srp_fr_pool *pool; + struct srp_fr_desc *d; + struct ib_mr *mr; + int i, ret = -EINVAL; + enum ib_mr_type mr_type; + + if (pool_size <= 0) + goto err; + ret = -ENOMEM; + pool = kzalloc(struct_size(pool, desc, pool_size), GFP_KERNEL); + if (!pool) + goto err; + pool->size = pool_size; + pool->max_page_list_len = max_page_list_len; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->free_list); + + mr_type = IB_MR_TYPE_MEM_REG; + + for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) { + mr = ib_alloc_mr(pd, mr_type, max_page_list_len); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + if (ret == -ENOMEM) + pr_info("%s: ib_alloc_mr() failed. Try to reduce max_cmd_per_lun, max_sect or ch_count\n", + dev_name(&device->dev)); + goto destroy_pool; + } + d->mr = mr; + list_add_tail(&d->entry, &pool->free_list); + } + +out: + return pool; + +destroy_pool: + srp_destroy_fr_pool(pool); + +err: + pool = ERR_PTR(ret); + goto out; +} + +/** + * srp_fr_pool_get() - obtain a descriptor suitable for fast registration + * @pool: Pool to obtain descriptor from. + */ +static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool) +{ + struct srp_fr_desc *d = NULL; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (!list_empty(&pool->free_list)) { + d = list_first_entry(&pool->free_list, typeof(*d), entry); + list_del(&d->entry); + } + spin_unlock_irqrestore(&pool->lock, flags); + + return d; +} + +/** + * srp_fr_pool_put() - put an FR descriptor back in the free list + * @pool: Pool the descriptor was allocated from. + * @desc: Pointer to an array of fast registration descriptor pointers. + * @n: Number of descriptors to put back. + * + * Note: The caller must already have queued an invalidation request for + * desc->mr->rkey before calling this function. 
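+ * In this driver that invalidation is posted by srp_unmap_data(), which
+ * sends an IB_WR_LOCAL_INV work request through srp_inv_rkey() for every
+ * descriptor it is about to return, before calling srp_fr_pool_put().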
+ */ +static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc, + int n) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&pool->lock, flags); + for (i = 0; i < n; i++) + list_add(&desc[i]->entry, &pool->free_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) +{ + struct srp_device *dev = target->srp_host->srp_dev; + + return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size, + dev->max_pages_per_mr); +} + +/** + * srp_destroy_qp() - destroy an RDMA queue pair + * @ch: SRP RDMA channel. + * + * Drain the qp before destroying it. This avoids that the receive + * completion handler can access the queue pair while it is + * being destroyed. + */ +static void srp_destroy_qp(struct srp_rdma_ch *ch) +{ + spin_lock_irq(&ch->lock); + ib_process_cq_direct(ch->send_cq, -1); + spin_unlock_irq(&ch->lock); + + ib_drain_qp(ch->qp); + ib_destroy_qp(ch->qp); +} + +static int srp_create_ch_ib(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + const struct ib_device_attr *attr = &dev->dev->attrs; + struct ib_qp_init_attr *init_attr; + struct ib_cq *recv_cq, *send_cq; + struct ib_qp *qp; + struct srp_fr_pool *fr_pool = NULL; + const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2; + int ret; + + init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); + if (!init_attr) + return -ENOMEM; + + /* queue_size + 1 for ib_drain_rq() */ + recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, + ch->comp_vector, IB_POLL_SOFTIRQ); + if (IS_ERR(recv_cq)) { + ret = PTR_ERR(recv_cq); + goto err; + } + + send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size, + ch->comp_vector, IB_POLL_DIRECT); + if (IS_ERR(send_cq)) { + ret = PTR_ERR(send_cq); + goto err_recv_cq; + } + + init_attr->event_handler = srp_qp_event; + init_attr->cap.max_send_wr = m * target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size + 1; + init_attr->cap.max_recv_sge = 1; + init_attr->cap.max_send_sge = min(SRP_MAX_SGE, attr->max_send_sge); + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr->qp_type = IB_QPT_RC; + init_attr->send_cq = send_cq; + init_attr->recv_cq = recv_cq; + + ch->max_imm_sge = min(init_attr->cap.max_send_sge - 1U, 255U); + + if (target->using_rdma_cm) { + ret = rdma_create_qp(ch->rdma_cm.cm_id, dev->pd, init_attr); + qp = ch->rdma_cm.cm_id->qp; + } else { + qp = ib_create_qp(dev->pd, init_attr); + if (!IS_ERR(qp)) { + ret = srp_init_ib_qp(target, qp); + if (ret) + ib_destroy_qp(qp); + } else { + ret = PTR_ERR(qp); + } + } + if (ret) { + pr_err("QP creation failed for dev %s: %d\n", + dev_name(&dev->dev->dev), ret); + goto err_send_cq; + } + + if (dev->use_fast_reg) { + fr_pool = srp_alloc_fr_pool(target); + if (IS_ERR(fr_pool)) { + ret = PTR_ERR(fr_pool); + shost_printk(KERN_WARNING, target->scsi_host, PFX + "FR pool allocation failed (%d)\n", ret); + goto err_qp; + } + } + + if (ch->qp) + srp_destroy_qp(ch); + if (ch->recv_cq) + ib_free_cq(ch->recv_cq); + if (ch->send_cq) + ib_free_cq(ch->send_cq); + + ch->qp = qp; + ch->recv_cq = recv_cq; + ch->send_cq = send_cq; + + if (dev->use_fast_reg) { + if (ch->fr_pool) + srp_destroy_fr_pool(ch->fr_pool); + ch->fr_pool = fr_pool; + } + + kfree(init_attr); + return 0; + +err_qp: + if (target->using_rdma_cm) + rdma_destroy_qp(ch->rdma_cm.cm_id); + else + ib_destroy_qp(qp); + +err_send_cq: + ib_free_cq(send_cq); + +err_recv_cq: + ib_free_cq(recv_cq); + +err: + 
kfree(init_attr); + return ret; +} + +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the ch->[rt]x_ring checks. + */ +static void srp_free_ch_ib(struct srp_target_port *target, + struct srp_rdma_ch *ch) +{ + struct srp_device *dev = target->srp_host->srp_dev; + int i; + + if (!ch->target) + return; + + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) { + rdma_destroy_id(ch->rdma_cm.cm_id); + ch->rdma_cm.cm_id = NULL; + } + } else { + if (ch->ib_cm.cm_id) { + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = NULL; + } + } + + /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */ + if (!ch->qp) + return; + + if (dev->use_fast_reg) { + if (ch->fr_pool) + srp_destroy_fr_pool(ch->fr_pool); + } + + srp_destroy_qp(ch); + ib_free_cq(ch->send_cq); + ib_free_cq(ch->recv_cq); + + /* + * Avoid that the SCSI error handler tries to use this channel after + * it has been freed. The SCSI error handler can namely continue + * trying to perform recovery actions after scsi_remove_host() + * returned. + */ + ch->target = NULL; + + ch->qp = NULL; + ch->send_cq = ch->recv_cq = NULL; + + if (ch->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, ch->rx_ring[i]); + kfree(ch->rx_ring); + ch->rx_ring = NULL; + } + if (ch->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, ch->tx_ring[i]); + kfree(ch->tx_ring); + ch->tx_ring = NULL; + } +} + +static void srp_path_rec_completion(int status, + struct sa_path_rec *pathrec, + int num_paths, void *ch_ptr) +{ + struct srp_rdma_ch *ch = ch_ptr; + struct srp_target_port *target = ch->target; + + ch->status = status; + if (status) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Got failed path rec status %d\n", status); + else + ch->ib_cm.path = *pathrec; + complete(&ch->done); +} + +static int srp_ib_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int ret; + + ch->ib_cm.path.numb_path = 1; + + init_completion(&ch->done); + + ch->ib_cm.path_query_id = ib_sa_path_rec_get(&srp_sa_client, + target->srp_host->srp_dev->dev, + target->srp_host->port, + &ch->ib_cm.path, + IB_SA_PATH_REC_SERVICE_ID | + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_PKEY, + SRP_PATH_REC_TIMEOUT_MS, 0, + GFP_KERNEL, + srp_path_rec_completion, + ch, &ch->ib_cm.path_query); + if (ch->ib_cm.path_query_id < 0) + return ch->ib_cm.path_query_id; + + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + return ret; + + if (ch->status < 0) + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n", + ch->ib_cm.path.sgid.raw, ch->ib_cm.path.dgid.raw, + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id)); + + return ch->status; +} + +static int srp_rdma_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int ret; + + init_completion(&ch->done); + + ret = rdma_resolve_route(ch->rdma_cm.cm_id, SRP_PATH_REC_TIMEOUT_MS); + if (ret) + return ret; + + wait_for_completion_interruptible(&ch->done); + + if (ch->status != 0) + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path resolution failed\n"); + + return ch->status; +} + +static int srp_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? 
srp_rdma_lookup_path(ch) : + srp_ib_lookup_path(ch); +} + +static u8 srp_get_subnet_timeout(struct srp_host *host) +{ + struct ib_port_attr attr; + int ret; + u8 subnet_timeout = 18; + + ret = ib_query_port(host->srp_dev->dev, host->port, &attr); + if (ret == 0) + subnet_timeout = attr.subnet_timeout; + + if (unlikely(subnet_timeout < 15)) + pr_warn("%s: subnet timeout %d may cause SRP login to fail.\n", + dev_name(&host->srp_dev->dev->dev), subnet_timeout); + + return subnet_timeout; +} + +static int srp_send_req(struct srp_rdma_ch *ch, uint32_t max_iu_len, + bool multich) +{ + struct srp_target_port *target = ch->target; + struct { + struct rdma_conn_param rdma_param; + struct srp_login_req_rdma rdma_req; + struct ib_cm_req_param ib_param; + struct srp_login_req ib_req; + } *req = NULL; + char *ipi, *tpi; + int status; + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->ib_param.flow_control = 1; + req->ib_param.retry_count = target->tl_retry_count; + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req->ib_param.responder_resources = 4; + req->ib_param.rnr_retry_count = 7; + req->ib_param.max_cm_retries = 15; + + req->ib_req.opcode = SRP_LOGIN_REQ; + req->ib_req.tag = 0; + req->ib_req.req_it_iu_len = cpu_to_be32(max_iu_len); + req->ib_req.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); + req->ib_req.req_flags = (multich ? SRP_MULTICHAN_MULTI : + SRP_MULTICHAN_SINGLE); + if (srp_use_imm_data) { + req->ib_req.req_flags |= SRP_IMMED_REQUESTED; + req->ib_req.imm_data_offset = cpu_to_be16(SRP_IMM_DATA_OFFSET); + } + + if (target->using_rdma_cm) { + req->rdma_param.flow_control = req->ib_param.flow_control; + req->rdma_param.responder_resources = + req->ib_param.responder_resources; + req->rdma_param.initiator_depth = req->ib_param.initiator_depth; + req->rdma_param.retry_count = req->ib_param.retry_count; + req->rdma_param.rnr_retry_count = req->ib_param.rnr_retry_count; + req->rdma_param.private_data = &req->rdma_req; + req->rdma_param.private_data_len = sizeof(req->rdma_req); + + req->rdma_req.opcode = req->ib_req.opcode; + req->rdma_req.tag = req->ib_req.tag; + req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len; + req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt; + req->rdma_req.req_flags = req->ib_req.req_flags; + req->rdma_req.imm_data_offset = req->ib_req.imm_data_offset; + + ipi = req->rdma_req.initiator_port_id; + tpi = req->rdma_req.target_port_id; + } else { + u8 subnet_timeout; + + subnet_timeout = srp_get_subnet_timeout(target->srp_host); + + req->ib_param.primary_path = &ch->ib_cm.path; + req->ib_param.alternate_path = NULL; + req->ib_param.service_id = target->ib_cm.service_id; + get_random_bytes(&req->ib_param.starting_psn, 4); + req->ib_param.starting_psn &= 0xffffff; + req->ib_param.qp_num = ch->qp->qp_num; + req->ib_param.qp_type = ch->qp->qp_type; + req->ib_param.local_cm_response_timeout = subnet_timeout + 2; + req->ib_param.remote_cm_response_timeout = subnet_timeout + 2; + req->ib_param.private_data = &req->ib_req; + req->ib_param.private_data_len = sizeof(req->ib_req); + + ipi = req->ib_req.initiator_port_id; + tpi = req->ib_req.target_port_id; + } + + /* + * In the published SRP specification (draft rev. 16a), the + * port identifier format is 8 bytes of ID extension followed + * by 8 bytes of GUID. Older drafts put the two halves in the + * opposite order, so that the GUID comes first. 
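+ * With the current format the initiator port ID is therefore initiator_ext
+ * followed by the port GID's interface_id, and the target port ID is id_ext
+ * followed by ioc_guid; rev. 10 targets expect each half in the opposite
+ * order, which is what the two memcpy() branches below implement.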
+ * + * Targets conforming to these obsolete drafts can be + * recognized by the I/O Class they report. + */ + if (target->io_class == SRP_REV10_IB_IO_CLASS) { + memcpy(ipi, &target->sgid.global.interface_id, 8); + memcpy(ipi + 8, &target->initiator_ext, 8); + memcpy(tpi, &target->ioc_guid, 8); + memcpy(tpi + 8, &target->id_ext, 8); + } else { + memcpy(ipi, &target->initiator_ext, 8); + memcpy(ipi + 8, &target->sgid.global.interface_id, 8); + memcpy(tpi, &target->id_ext, 8); + memcpy(tpi + 8, &target->ioc_guid, 8); + } + + /* + * Topspin/Cisco SRP targets will reject our login unless we + * zero out the first 8 bytes of our initiator port ID and set + * the second 8 bytes to the local node GUID. + */ + if (srp_target_is_topspin(target)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Topspin/Cisco initiator port ID workaround " + "activated for target GUID %016llx\n", + be64_to_cpu(target->ioc_guid)); + memset(ipi, 0, 8); + memcpy(ipi + 8, &target->srp_host->srp_dev->dev->node_guid, 8); + } + + if (target->using_rdma_cm) + status = rdma_connect(ch->rdma_cm.cm_id, &req->rdma_param); + else + status = ib_send_cm_req(ch->ib_cm.cm_id, &req->ib_param); + + kfree(req); + + return status; +} + +static bool srp_queue_remove_work(struct srp_target_port *target) +{ + bool changed = false; + + spin_lock_irq(&target->lock); + if (target->state != SRP_TARGET_REMOVED) { + target->state = SRP_TARGET_REMOVED; + changed = true; + } + spin_unlock_irq(&target->lock); + + if (changed) + queue_work(srp_remove_wq, &target->remove_work); + + return changed; +} + +static void srp_disconnect_target(struct srp_target_port *target) +{ + struct srp_rdma_ch *ch; + int i, ret; + + /* XXX should send SRP_I_LOGOUT request */ + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + ch->connected = false; + ret = 0; + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) + rdma_disconnect(ch->rdma_cm.cm_id); + } else { + if (ch->ib_cm.cm_id) + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, + NULL, 0); + } + if (ret < 0) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); + } + } +} + +static int srp_exit_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd) +{ + struct srp_target_port *target = host_to_target(shost); + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct srp_request *req = scsi_cmd_priv(cmd); + + kfree(req->fr_list); + if (req->indirect_dma_addr) { + ib_dma_unmap_single(ibdev, req->indirect_dma_addr, + target->indirect_size, + DMA_TO_DEVICE); + } + kfree(req->indirect_desc); + + return 0; +} + +static int srp_init_cmd_priv(struct Scsi_Host *shost, struct scsi_cmnd *cmd) +{ + struct srp_target_port *target = host_to_target(shost); + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req = scsi_cmd_priv(cmd); + dma_addr_t dma_addr; + int ret = -ENOMEM; + + if (srp_dev->use_fast_reg) { + req->fr_list = kmalloc_array(target->mr_per_cmd, sizeof(void *), + GFP_KERNEL); + if (!req->fr_list) + goto out; + } + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) { + srp_exit_cmd_priv(shost, cmd); + goto out; + } + + req->indirect_dma_addr = dma_addr; + ret = 0; + +out: + return ret; +} + +/** + * srp_del_scsi_host_attr() - Remove attributes defined in the host 
template. + * @shost: SCSI host whose attributes to remove from sysfs. + * + * Note: Any attributes defined in the host template and that did not exist + * before invocation of this function will be ignored. + */ +static void srp_del_scsi_host_attr(struct Scsi_Host *shost) +{ + const struct attribute_group **g; + struct attribute **attr; + + for (g = shost->hostt->shost_groups; *g; ++g) { + for (attr = (*g)->attrs; *attr; ++attr) { + struct device_attribute *dev_attr = + container_of(*attr, typeof(*dev_attr), attr); + + device_remove_file(&shost->shost_dev, dev_attr); + } + } +} + +static void srp_remove_target(struct srp_target_port *target) +{ + struct srp_rdma_ch *ch; + int i; + + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); + srp_remove_host(target->scsi_host); + scsi_remove_host(target->scsi_host); + srp_stop_rport_timers(target->rport); + srp_disconnect_target(target); + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); + } + cancel_work_sync(&target->tl_err_work); + srp_rport_put(target->rport); + kfree(target->ch); + target->ch = NULL; + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + + scsi_host_put(target->scsi_host); +} + +static void srp_remove_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, remove_work); + + WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); + + srp_remove_target(target); +} + +static void srp_rport_delete(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + + srp_queue_remove_work(target); +} + +/** + * srp_connected_ch() - number of connected channels + * @target: SRP target port. + */ +static int srp_connected_ch(struct srp_target_port *target) +{ + int i, c = 0; + + for (i = 0; i < target->ch_count; i++) + c += target->ch[i].connected; + + return c; +} + +static int srp_connect_ch(struct srp_rdma_ch *ch, uint32_t max_iu_len, + bool multich) +{ + struct srp_target_port *target = ch->target; + int ret; + + WARN_ON_ONCE(!multich && srp_connected_ch(target) > 0); + + ret = srp_lookup_path(ch); + if (ret) + goto out; + + while (1) { + init_completion(&ch->done); + ret = srp_send_req(ch, max_iu_len, multich); + if (ret) + goto out; + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + goto out; + + /* + * The CM event handling code will set status to + * SRP_PORT_REDIRECT if we get a port redirect REJ + * back, or SRP_DLID_REDIRECT if we get a lid/qp + * redirect REJ back. + */ + ret = ch->status; + switch (ret) { + case 0: + ch->connected = true; + goto out; + + case SRP_PORT_REDIRECT: + ret = srp_lookup_path(ch); + if (ret) + goto out; + break; + + case SRP_DLID_REDIRECT: + break; + + case SRP_STALE_CONN: + shost_printk(KERN_ERR, target->scsi_host, PFX + "giving up on stale connection\n"); + ret = -ECONNRESET; + goto out; + + default: + goto out; + } + } + +out: + return ret <= 0 ? 
ret : -ENODEV; +} + +static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "INV RKEY"); +} + +static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch, + u32 rkey) +{ + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = rkey, + }; + + wr.wr_cqe = &req->reg_cqe; + req->reg_cqe.done = srp_inv_rkey_err_done; + return ib_post_send(ch->qp, &wr, NULL); +} + +static void srp_unmap_data(struct scsi_cmnd *scmnd, + struct srp_rdma_ch *ch, + struct srp_request *req) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + int i, res; + + if (!scsi_sglist(scmnd) || + (scmnd->sc_data_direction != DMA_TO_DEVICE && + scmnd->sc_data_direction != DMA_FROM_DEVICE)) + return; + + if (dev->use_fast_reg) { + struct srp_fr_desc **pfr; + + for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { + res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey); + if (res < 0) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "Queueing INV WR for rkey %#x failed (%d)\n", + (*pfr)->mr->rkey, res); + queue_work(system_long_wq, + &target->tl_err_work); + } + } + if (req->nmdesc) + srp_fr_pool_put(ch->fr_pool, req->fr_list, + req->nmdesc); + } + + ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), + scmnd->sc_data_direction); +} + +/** + * srp_claim_req - Take ownership of the scmnd associated with a request. + * @ch: SRP RDMA channel. + * @req: SRP request. + * @sdev: If not NULL, only take ownership for this SCSI device. + * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take + * ownership of @req->scmnd if it equals @scmnd. + * + * Return value: + * Either NULL or a pointer to the SCSI command the caller became owner of. + */ +static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch, + struct srp_request *req, + struct scsi_device *sdev, + struct scsi_cmnd *scmnd) +{ + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + if (req->scmnd && + (!sdev || req->scmnd->device == sdev) && + (!scmnd || req->scmnd == scmnd)) { + scmnd = req->scmnd; + req->scmnd = NULL; + } else { + scmnd = NULL; + } + spin_unlock_irqrestore(&ch->lock, flags); + + return scmnd; +} + +/** + * srp_free_req() - Unmap data and adjust ch->req_lim. + * @ch: SRP RDMA channel. + * @req: Request to be freed. + * @scmnd: SCSI command associated with @req. + * @req_lim_delta: Amount to be added to @target->req_lim. 
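+ *
+ * The delta is applied to ch->req_lim under ch->lock, after the data
+ * buffers associated with @req have been unmapped.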
+ */ +static void srp_free_req(struct srp_rdma_ch *ch, struct srp_request *req, + struct scsi_cmnd *scmnd, s32 req_lim_delta) +{ + unsigned long flags; + + srp_unmap_data(scmnd, ch, req); + + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += req_lim_delta; + spin_unlock_irqrestore(&ch->lock, flags); +} + +static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req, + struct scsi_device *sdev, int result) +{ + struct scsi_cmnd *scmnd = srp_claim_req(ch, req, sdev, NULL); + + if (scmnd) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = result; + scsi_done(scmnd); + } +} + +struct srp_terminate_context { + struct srp_target_port *srp_target; + int scsi_result; +}; + +static bool srp_terminate_cmd(struct scsi_cmnd *scmnd, void *context_ptr, + bool reserved) +{ + struct srp_terminate_context *context = context_ptr; + struct srp_target_port *target = context->srp_target; + u32 tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd)); + struct srp_rdma_ch *ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)]; + struct srp_request *req = scsi_cmd_priv(scmnd); + + srp_finish_req(ch, req, NULL, context->scsi_result); + + return true; +} + +static void srp_terminate_io(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + struct srp_terminate_context context = { .srp_target = target, + .scsi_result = DID_TRANSPORT_FAILFAST << 16 }; + + scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd, &context); +} + +/* Calculate maximum initiator to target information unit length. */ +static uint32_t srp_max_it_iu_len(int cmd_sg_cnt, bool use_imm_data, + uint32_t max_it_iu_size) +{ + uint32_t max_iu_len = sizeof(struct srp_cmd) + SRP_MAX_ADD_CDB_LEN + + sizeof(struct srp_indirect_buf) + + cmd_sg_cnt * sizeof(struct srp_direct_buf); + + if (use_imm_data) + max_iu_len = max(max_iu_len, SRP_IMM_DATA_OFFSET + + srp_max_imm_data); + + if (max_it_iu_size) + max_iu_len = min(max_iu_len, max_it_iu_size); + + pr_debug("max_iu_len = %d\n", max_iu_len); + + return max_iu_len; +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + struct srp_rdma_ch *ch; + uint32_t max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, + srp_use_imm_data, + target->max_it_iu_size); + int i, j, ret = 0; + bool multich = false; + + srp_disconnect_target(target); + + if (target->state == SRP_TARGET_SCANNING) + return -ENODEV; + + /* + * Now get a new local CM ID so that we avoid confusing the target in + * case things are really fouled up. Doing so also ensures that all CM + * callbacks will have finished before a new QP is allocated. 
+ */ + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + ret += srp_new_cm_id(ch); + } + { + struct srp_terminate_context context = { + .srp_target = target, .scsi_result = DID_RESET << 16}; + + scsi_host_busy_iter(target->scsi_host, srp_terminate_cmd, + &context); + } + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all completion callback function + * invocations have finished before request resetting starts. + */ + ret += srp_create_ch_ib(ch); + + INIT_LIST_HEAD(&ch->free_tx); + for (j = 0; j < target->queue_size; ++j) + list_add(&ch->tx_ring[j]->list, &ch->free_tx); + } + + target->qp_in_error = false; + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + if (ret) + break; + ret = srp_connect_ch(ch, max_iu_len, multich); + multich = true; + } + + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); + + return ret; +} + +static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, + unsigned int dma_len, u32 rkey) +{ + struct srp_direct_buf *desc = state->desc; + + WARN_ON_ONCE(!dma_len); + + desc->va = cpu_to_be64(dma_addr); + desc->key = cpu_to_be32(rkey); + desc->len = cpu_to_be32(dma_len); + + state->total_len += dma_len; + state->desc++; + state->ndesc++; +} + +static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "FAST REG"); +} + +/* + * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset + * where to start in the first element. If sg_offset_p != NULL then + * *sg_offset_p is updated to the offset in state->sg[retval] of the first + * byte that has not yet been mapped. + */ +static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_request *req, + struct srp_rdma_ch *ch, int sg_nents, + unsigned int *sg_offset_p) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_reg_wr wr; + struct srp_fr_desc *desc; + u32 rkey; + int n, err; + + if (state->fr.next >= state->fr.end) { + shost_printk(KERN_ERR, ch->target->scsi_host, + PFX "Out of MRs (mr_per_cmd = %d)\n", + ch->target->mr_per_cmd); + return -ENOMEM; + } + + WARN_ON_ONCE(!dev->use_fast_reg); + + if (sg_nents == 1 && target->global_rkey) { + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + + srp_map_desc(state, sg_dma_address(state->sg) + sg_offset, + sg_dma_len(state->sg) - sg_offset, + target->global_rkey); + if (sg_offset_p) + *sg_offset_p = 0; + return 1; + } + + desc = srp_fr_pool_get(ch->fr_pool); + if (!desc) + return -ENOMEM; + + rkey = ib_inc_rkey(desc->mr->rkey); + ib_update_fast_reg_key(desc->mr, rkey); + + n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p, + dev->mr_page_size); + if (unlikely(n < 0)) { + srp_fr_pool_put(ch->fr_pool, &desc, 1); + pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n", + dev_name(&req->scmnd->device->sdev_gendev), sg_nents, + sg_offset_p ? 
*sg_offset_p : -1, n); + return n; + } + + WARN_ON_ONCE(desc->mr->length == 0); + + req->reg_cqe.done = srp_reg_mr_err_done; + + wr.wr.next = NULL; + wr.wr.opcode = IB_WR_REG_MR; + wr.wr.wr_cqe = &req->reg_cqe; + wr.wr.num_sge = 0; + wr.wr.send_flags = 0; + wr.mr = desc->mr; + wr.key = desc->mr->rkey; + wr.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + + *state->fr.next++ = desc; + state->nmdesc++; + + srp_map_desc(state, desc->mr->iova, + desc->mr->length, desc->mr->rkey); + + err = ib_post_send(ch->qp, &wr.wr, NULL); + if (unlikely(err)) { + WARN_ON_ONCE(err == -ENOMEM); + return err; + } + + return n; +} + +static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, + struct srp_request *req, struct scatterlist *scat, + int count) +{ + unsigned int sg_offset = 0; + + state->fr.next = req->fr_list; + state->fr.end = req->fr_list + ch->target->mr_per_cmd; + state->sg = scat; + + if (count == 0) + return 0; + + while (count) { + int i, n; + + n = srp_map_finish_fr(state, req, ch, count, &sg_offset); + if (unlikely(n < 0)) + return n; + + count -= n; + for (i = 0; i < n; i++) + state->sg = sg_next(state->sg); + } + + return 0; +} + +static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch, + struct srp_request *req, struct scatterlist *scat, + int count) +{ + struct srp_target_port *target = ch->target; + struct scatterlist *sg; + int i; + + for_each_sg(scat, sg, count, i) { + srp_map_desc(state, sg_dma_address(sg), sg_dma_len(sg), + target->global_rkey); + } + + return 0; +} + +/* + * Register the indirect data buffer descriptor with the HCA. + * + * Note: since the indirect data buffer descriptor has been allocated with + * kmalloc() it is guaranteed that this buffer is a physically contiguous + * memory buffer. + */ +static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, + void **next_mr, void **end_mr, u32 idb_len, + __be32 *idb_rkey) +{ + struct srp_target_port *target = ch->target; + struct srp_device *dev = target->srp_host->srp_dev; + struct srp_map_state state; + struct srp_direct_buf idb_desc; + struct scatterlist idb_sg[1]; + int ret; + + memset(&state, 0, sizeof(state)); + memset(&idb_desc, 0, sizeof(idb_desc)); + state.gen.next = next_mr; + state.gen.end = end_mr; + state.desc = &idb_desc; + state.base_dma_addr = req->indirect_dma_addr; + state.dma_len = idb_len; + + if (dev->use_fast_reg) { + state.sg = idb_sg; + sg_init_one(idb_sg, req->indirect_desc, idb_len); + idb_sg->dma_address = req->indirect_dma_addr; /* hack! 
*/ +#ifdef CONFIG_NEED_SG_DMA_LENGTH + idb_sg->dma_length = idb_sg->length; /* hack^2 */ +#endif + ret = srp_map_finish_fr(&state, req, ch, 1, NULL); + if (ret < 0) + return ret; + WARN_ON_ONCE(ret < 1); + } else { + return -EINVAL; + } + + *idb_rkey = idb_desc.key; + + return 0; +} + +static void srp_check_mapping(struct srp_map_state *state, + struct srp_rdma_ch *ch, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = ch->target->srp_host->srp_dev; + struct srp_fr_desc **pfr; + u64 desc_len = 0, mr_len = 0; + int i; + + for (i = 0; i < state->ndesc; i++) + desc_len += be32_to_cpu(req->indirect_desc[i].len); + if (dev->use_fast_reg) + for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++) + mr_len += (*pfr)->mr->length; + if (desc_len != scsi_bufflen(req->scmnd) || + mr_len > scsi_bufflen(req->scmnd)) + pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n", + scsi_bufflen(req->scmnd), desc_len, mr_len, + state->ndesc, state->nmdesc); +} + +/** + * srp_map_data() - map SCSI data buffer onto an SRP request + * @scmnd: SCSI command to map + * @ch: SRP RDMA channel + * @req: SRP request + * + * Returns the length in bytes of the SRP_CMD IU or a negative value if + * mapping failed. The size of any immediate data is not included in the + * return value. + */ +static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, + struct srp_request *req) +{ + struct srp_target_port *target = ch->target; + struct scatterlist *scat, *sg; + struct srp_cmd *cmd = req->cmd->buf; + int i, len, nents, count, ret; + struct srp_device *dev; + struct ib_device *ibdev; + struct srp_map_state state; + struct srp_indirect_buf *indirect_hdr; + u64 data_len; + u32 idb_len, table_len; + __be32 idb_rkey; + u8 fmt; + + req->cmd->num_sge = 1; + + if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) + return sizeof(struct srp_cmd) + cmd->add_cdb_len; + + if (scmnd->sc_data_direction != DMA_FROM_DEVICE && + scmnd->sc_data_direction != DMA_TO_DEVICE) { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled data direction %d\n", + scmnd->sc_data_direction); + return -EINVAL; + } + + nents = scsi_sg_count(scmnd); + scat = scsi_sglist(scmnd); + data_len = scsi_bufflen(scmnd); + + dev = target->srp_host->srp_dev; + ibdev = dev->dev; + + count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction); + if (unlikely(count == 0)) + return -EIO; + + if (ch->use_imm_data && + count <= ch->max_imm_sge && + SRP_IMM_DATA_OFFSET + data_len <= ch->max_it_iu_len && + scmnd->sc_data_direction == DMA_TO_DEVICE) { + struct srp_imm_buf *buf; + struct ib_sge *sge = &req->cmd->sge[1]; + + fmt = SRP_DATA_DESC_IMM; + len = SRP_IMM_DATA_OFFSET; + req->nmdesc = 0; + buf = (void *)cmd->add_data + cmd->add_cdb_len; + buf->len = cpu_to_be32(data_len); + WARN_ON_ONCE((void *)(buf + 1) > (void *)cmd + len); + for_each_sg(scat, sg, count, i) { + sge[i].addr = sg_dma_address(sg); + sge[i].length = sg_dma_len(sg); + sge[i].lkey = target->lkey; + } + req->cmd->num_sge += count; + goto map_complete; + } + + fmt = SRP_DATA_DESC_DIRECT; + len = sizeof(struct srp_cmd) + cmd->add_cdb_len + + sizeof(struct srp_direct_buf); + + if (count == 1 && target->global_rkey) { + /* + * The midlayer only generated a single gather/scatter + * entry, or DMA mapping coalesced everything to a + * single entry. So a direct descriptor along with + * the DMA MR suffices. 
+ */ + struct srp_direct_buf *buf; + + buf = (void *)cmd->add_data + cmd->add_cdb_len; + buf->va = cpu_to_be64(sg_dma_address(scat)); + buf->key = cpu_to_be32(target->global_rkey); + buf->len = cpu_to_be32(sg_dma_len(scat)); + + req->nmdesc = 0; + goto map_complete; + } + + /* + * We have more than one scatter/gather entry, so build our indirect + * descriptor table, trying to merge as many entries as we can. + */ + indirect_hdr = (void *)cmd->add_data + cmd->add_cdb_len; + + ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr, + target->indirect_size, DMA_TO_DEVICE); + + memset(&state, 0, sizeof(state)); + state.desc = req->indirect_desc; + if (dev->use_fast_reg) + ret = srp_map_sg_fr(&state, ch, req, scat, count); + else + ret = srp_map_sg_dma(&state, ch, req, scat, count); + req->nmdesc = state.nmdesc; + if (ret < 0) + goto unmap; + + { + DEFINE_DYNAMIC_DEBUG_METADATA(ddm, + "Memory mapping consistency check"); + if (DYNAMIC_DEBUG_BRANCH(ddm)) + srp_check_mapping(&state, ch, req, scat, count); + } + + /* We've mapped the request, now pull as much of the indirect + * descriptor table as we can into the command buffer. If this + * target is not using an external indirect table, we are + * guaranteed to fit into the command, as the SCSI layer won't + * give us more S/G entries than we allow. + */ + if (state.ndesc == 1) { + /* + * Memory registration collapsed the sg-list into one entry, + * so use a direct descriptor. + */ + struct srp_direct_buf *buf; + + buf = (void *)cmd->add_data + cmd->add_cdb_len; + *buf = req->indirect_desc[0]; + goto map_complete; + } + + if (unlikely(target->cmd_sg_cnt < state.ndesc && + !target->allow_ext_sg)) { + shost_printk(KERN_ERR, target->scsi_host, + "Could not fit S/G list into SRP_CMD\n"); + ret = -EIO; + goto unmap; + } + + count = min(state.ndesc, target->cmd_sg_cnt); + table_len = state.ndesc * sizeof (struct srp_direct_buf); + idb_len = sizeof(struct srp_indirect_buf) + table_len; + + fmt = SRP_DATA_DESC_INDIRECT; + len = sizeof(struct srp_cmd) + cmd->add_cdb_len + + sizeof(struct srp_indirect_buf); + len += count * sizeof (struct srp_direct_buf); + + memcpy(indirect_hdr->desc_list, req->indirect_desc, + count * sizeof (struct srp_direct_buf)); + + if (!target->global_rkey) { + ret = srp_map_idb(ch, req, state.gen.next, state.gen.end, + idb_len, &idb_rkey); + if (ret < 0) + goto unmap; + req->nmdesc++; + } else { + idb_rkey = cpu_to_be32(target->global_rkey); + } + + indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); + indirect_hdr->table_desc.key = idb_rkey; + indirect_hdr->table_desc.len = cpu_to_be32(table_len); + indirect_hdr->len = cpu_to_be32(state.total_len); + + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->data_out_desc_cnt = count; + else + cmd->data_in_desc_cnt = count; + + ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len, + DMA_TO_DEVICE); + +map_complete: + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->buf_fmt = fmt << 4; + else + cmd->buf_fmt = fmt; + + return len; + +unmap: + srp_unmap_data(scmnd, ch, req); + if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size) + ret = -E2BIG; + return ret; +} + +/* + * Return an IU and possible credit to the free pool + */ +static void srp_put_tx_iu(struct srp_rdma_ch *ch, struct srp_iu *iu, + enum srp_iu_type iu_type) +{ + unsigned long flags; + + spin_lock_irqsave(&ch->lock, flags); + list_add(&iu->list, &ch->free_tx); + if (iu_type != SRP_IU_RSP) + ++ch->req_lim; + spin_unlock_irqrestore(&ch->lock, flags); +} + +/* + * Must be called 
with ch->lock held to protect req_lim and free_tx. + * If IU is not sent, it must be returned using srp_put_tx_iu(). + * + * Note: + * An upper limit for the number of allocated information units for each + * request type is: + * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues + * more than Scsi_Host.can_queue requests. + * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. + * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than + * one unanswered SRP request to an initiator. + */ +static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, + enum srp_iu_type iu_type) +{ + struct srp_target_port *target = ch->target; + s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; + struct srp_iu *iu; + + lockdep_assert_held(&ch->lock); + + ib_process_cq_direct(ch->send_cq, -1); + + if (list_empty(&ch->free_tx)) + return NULL; + + /* Initiator responses to target requests do not consume credits */ + if (iu_type != SRP_IU_RSP) { + if (ch->req_lim <= rsv) { + ++target->zero_req_lim; + return NULL; + } + + --ch->req_lim; + } + + iu = list_first_entry(&ch->free_tx, struct srp_iu, list); + list_del(&iu->list); + return iu; +} + +/* + * Note: if this function is called from inside ib_drain_sq() then it will + * be called without ch->lock being held. If ib_drain_sq() dequeues a WQE + * with status IB_WC_SUCCESS then that's a bug. + */ +static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "SEND"); + return; + } + + lockdep_assert_held(&ch->lock); + + list_add(&iu->list, &ch->free_tx); +} + +/** + * srp_post_send() - send an SRP information unit + * @ch: RDMA channel over which to send the information unit. + * @iu: Information unit to send. + * @len: Length of the information unit excluding immediate data. 
+ */ +static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) +{ + struct srp_target_port *target = ch->target; + struct ib_send_wr wr; + + if (WARN_ON_ONCE(iu->num_sge > SRP_MAX_SGE)) + return -EINVAL; + + iu->sge[0].addr = iu->dma; + iu->sge[0].length = len; + iu->sge[0].lkey = target->lkey; + + iu->cqe.done = srp_send_done; + + wr.next = NULL; + wr.wr_cqe = &iu->cqe; + wr.sg_list = &iu->sge[0]; + wr.num_sge = iu->num_sge; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + return ib_post_send(ch->qp, &wr, NULL); +} + +static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu) +{ + struct srp_target_port *target = ch->target; + struct ib_recv_wr wr; + struct ib_sge list; + + list.addr = iu->dma; + list.length = iu->size; + list.lkey = target->lkey; + + iu->cqe.done = srp_recv_done; + + wr.next = NULL; + wr.wr_cqe = &iu->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + + return ib_post_recv(ch->qp, &wr, NULL); +} + +static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) +{ + struct srp_target_port *target = ch->target; + struct srp_request *req; + struct scsi_cmnd *scmnd; + unsigned long flags; + + if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += be32_to_cpu(rsp->req_lim_delta); + if (rsp->tag == ch->tsk_mgmt_tag) { + ch->tsk_mgmt_status = -1; + if (be32_to_cpu(rsp->resp_data_len) >= 4) + ch->tsk_mgmt_status = rsp->data[3]; + complete(&ch->tsk_mgmt_done); + } else { + shost_printk(KERN_ERR, target->scsi_host, + "Received tsk mgmt response too late for tag %#llx\n", + rsp->tag); + } + spin_unlock_irqrestore(&ch->lock, flags); + } else { + scmnd = scsi_host_find_tag(target->scsi_host, rsp->tag); + if (scmnd) { + req = scsi_cmd_priv(scmnd); + scmnd = srp_claim_req(ch, req, NULL, scmnd); + } else { + shost_printk(KERN_ERR, target->scsi_host, + "Null scmnd for RSP w/tag %#016llx received on ch %td / QP %#x\n", + rsp->tag, ch - target->ch, ch->qp->qp_num); + + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += be32_to_cpu(rsp->req_lim_delta); + spin_unlock_irqrestore(&ch->lock, flags); + + return; + } + scmnd->result = rsp->status; + + if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { + memcpy(scmnd->sense_buffer, rsp->data + + be32_to_cpu(rsp->resp_data_len), + min_t(int, be32_to_cpu(rsp->sense_data_len), + SCSI_SENSE_BUFFERSIZE)); + } + + if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt)); + + srp_free_req(ch, req, scmnd, + be32_to_cpu(rsp->req_lim_delta)); + + scsi_done(scmnd); + } +} + +static int srp_response_common(struct srp_rdma_ch *ch, s32 req_delta, + void *rsp, int len) +{ + struct srp_target_port *target = ch->target; + struct ib_device *dev = target->srp_host->srp_dev->dev; + unsigned long flags; + struct srp_iu *iu; + int err; + + spin_lock_irqsave(&ch->lock, flags); + ch->req_lim += req_delta; + iu = __srp_get_tx_iu(ch, SRP_IU_RSP); + spin_unlock_irqrestore(&ch->lock, flags); + + if (!iu) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "no IU available to send response\n"); + return 1; + } + + iu->num_sge = 1; + ib_dma_sync_single_for_cpu(dev, iu->dma, len, 
DMA_TO_DEVICE); + memcpy(iu->buf, rsp, len); + ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); + + err = srp_post_send(ch, iu, len); + if (err) { + shost_printk(KERN_ERR, target->scsi_host, PFX + "unable to post response: %d\n", err); + srp_put_tx_iu(ch, iu, SRP_IU_RSP); + } + + return err; +} + +static void srp_process_cred_req(struct srp_rdma_ch *ch, + struct srp_cred_req *req) +{ + struct srp_cred_rsp rsp = { + .opcode = SRP_CRED_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + if (srp_response_common(ch, delta, &rsp, sizeof(rsp))) + shost_printk(KERN_ERR, ch->target->scsi_host, PFX + "problems processing SRP_CRED_REQ\n"); +} + +static void srp_process_aer_req(struct srp_rdma_ch *ch, + struct srp_aer_req *req) +{ + struct srp_target_port *target = ch->target; + struct srp_aer_rsp rsp = { + .opcode = SRP_AER_RSP, + .tag = req->tag, + }; + s32 delta = be32_to_cpu(req->req_lim_delta); + + shost_printk(KERN_ERR, target->scsi_host, PFX + "ignoring AER for LUN %llu\n", scsilun_to_int(&req->lun)); + + if (srp_response_common(ch, delta, &rsp, sizeof(rsp))) + shost_printk(KERN_ERR, target->scsi_host, PFX + "problems processing SRP_AER_REQ\n"); +} + +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; + struct srp_target_port *target = ch->target; + struct ib_device *dev = target->srp_host->srp_dev->dev; + int res; + u8 opcode; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "RECV"); + return; + } + + ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len, + DMA_FROM_DEVICE); + + opcode = *(u8 *) iu->buf; + + if (0) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "recv completion, opcode 0x%02x\n", opcode); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1, + iu->buf, wc->byte_len, true); + } + + switch (opcode) { + case SRP_RSP: + srp_process_rsp(ch, iu->buf); + break; + + case SRP_CRED_REQ: + srp_process_cred_req(ch, iu->buf); + break; + + case SRP_AER_REQ: + srp_process_aer_req(ch, iu->buf); + break; + + case SRP_T_LOGOUT: + /* XXX Handle target logout */ + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Got target logout request\n"); + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled SRP opcode 0x%02x\n", opcode); + break; + } + + ib_dma_sync_single_for_device(dev, iu->dma, ch->max_ti_iu_len, + DMA_FROM_DEVICE); + + res = srp_post_recv(ch, iu); + if (res != 0) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Recv failed with error code %d\n", res); +} + +/** + * srp_tl_err_work() - handle a transport layer error + * @work: Work structure embedded in an SRP target port. + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. 
+ */ +static void srp_tl_err_work(struct work_struct *work) +{ + struct srp_target_port *target; + + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); +} + +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname) +{ + struct srp_rdma_ch *ch = cq->cq_context; + struct srp_target_port *target = ch->target; + + if (ch->connected && !target->qp_in_error) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %s (%d) for CQE %p\n", + opname, ib_wc_status_msg(wc->status), wc->status, + wc->wr_cqe); + queue_work(system_long_wq, &target->tl_err_work); + } + target->qp_in_error = true; +} + +static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) +{ + struct request *rq = scsi_cmd_to_rq(scmnd); + struct srp_target_port *target = host_to_target(shost); + struct srp_rdma_ch *ch; + struct srp_request *req = scsi_cmd_priv(scmnd); + struct srp_iu *iu; + struct srp_cmd *cmd; + struct ib_device *dev; + unsigned long flags; + u32 tag; + int len, ret; + + scmnd->result = srp_chkready(target->rport); + if (unlikely(scmnd->result)) + goto err; + + WARN_ON_ONCE(rq->tag < 0); + tag = blk_mq_unique_tag(rq); + ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)]; + + spin_lock_irqsave(&ch->lock, flags); + iu = __srp_get_tx_iu(ch, SRP_IU_CMD); + spin_unlock_irqrestore(&ch->lock, flags); + + if (!iu) + goto err; + + dev = target->srp_host->srp_dev->dev; + ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_it_iu_len, + DMA_TO_DEVICE); + + cmd = iu->buf; + memset(cmd, 0, sizeof *cmd); + + cmd->opcode = SRP_CMD; + int_to_scsilun(scmnd->device->lun, &cmd->lun); + cmd->tag = tag; + memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len); + if (unlikely(scmnd->cmd_len > sizeof(cmd->cdb))) { + cmd->add_cdb_len = round_up(scmnd->cmd_len - sizeof(cmd->cdb), + 4); + if (WARN_ON_ONCE(cmd->add_cdb_len > SRP_MAX_ADD_CDB_LEN)) + goto err_iu; + } + + req->scmnd = scmnd; + req->cmd = iu; + + len = srp_map_data(scmnd, ch, req); + if (len < 0) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "Failed to map data (%d)\n", len); + /* + * If we ran out of memory descriptors (-ENOMEM) because an + * application is queuing many requests with more than + * max_pages_per_mr sg-list elements, tell the SCSI mid-layer + * to reduce queue depth temporarily. + */ + scmnd->result = len == -ENOMEM ? + DID_OK << 16 | SAM_STAT_TASK_SET_FULL : DID_ERROR << 16; + goto err_iu; + } + + ib_dma_sync_single_for_device(dev, iu->dma, ch->max_it_iu_len, + DMA_TO_DEVICE); + + if (srp_post_send(ch, iu, len)) { + shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n"); + scmnd->result = DID_ERROR << 16; + goto err_unmap; + } + + return 0; + +err_unmap: + srp_unmap_data(scmnd, ch, req); + +err_iu: + srp_put_tx_iu(ch, iu, SRP_IU_CMD); + + /* + * Avoid that the loops that iterate over the request ring can + * encounter a dangling SCSI command pointer. + */ + req->scmnd = NULL; + +err: + if (scmnd->result) { + scsi_done(scmnd); + ret = 0; + } else { + ret = SCSI_MLQUEUE_HOST_BUSY; + } + + return ret; +} + +/* + * Note: the resources allocated in this function are freed in + * srp_free_ch_ib(). 
+ */ +static int srp_alloc_iu_bufs(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int i; + + ch->rx_ring = kcalloc(target->queue_size, sizeof(*ch->rx_ring), + GFP_KERNEL); + if (!ch->rx_ring) + goto err_no_ring; + ch->tx_ring = kcalloc(target->queue_size, sizeof(*ch->tx_ring), + GFP_KERNEL); + if (!ch->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { + ch->rx_ring[i] = srp_alloc_iu(target->srp_host, + ch->max_ti_iu_len, + GFP_KERNEL, DMA_FROM_DEVICE); + if (!ch->rx_ring[i]) + goto err; + } + + for (i = 0; i < target->queue_size; ++i) { + ch->tx_ring[i] = srp_alloc_iu(target->srp_host, + ch->max_it_iu_len, + GFP_KERNEL, DMA_TO_DEVICE); + if (!ch->tx_ring[i]) + goto err; + + list_add(&ch->tx_ring[i]->list, &ch->free_tx); + } + + return 0; + +err: + for (i = 0; i < target->queue_size; ++i) { + srp_free_iu(target->srp_host, ch->rx_ring[i]); + srp_free_iu(target->srp_host, ch->tx_ring[i]); + } + + +err_no_ring: + kfree(ch->tx_ring); + ch->tx_ring = NULL; + kfree(ch->rx_ring); + ch->rx_ring = NULL; + + return -ENOMEM; +} + +static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask) +{ + uint64_t T_tr_ns, max_compl_time_ms; + uint32_t rq_tmo_jiffies; + + /* + * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair, + * table 91), both the QP timeout and the retry count have to be set + * for RC QP's during the RTR to RTS transition. + */ + WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) != + (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)); + + /* + * Set target->rq_tmo_jiffies to one second more than the largest time + * it can take before an error completion is generated. See also + * C9-140..142 in the IBTA spec for more information about how to + * convert the QP Local ACK Timeout value to nanoseconds. + */ + T_tr_ns = 4096 * (1ULL << qp_attr->timeout); + max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns; + do_div(max_compl_time_ms, NSEC_PER_MSEC); + rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000); + + return rq_tmo_jiffies; +} + +static void srp_cm_rep_handler(struct ib_cm_id *cm_id, + const struct srp_login_rsp *lrsp, + struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct ib_qp_attr *qp_attr = NULL; + int attr_mask = 0; + int ret = 0; + int i; + + if (lrsp->opcode == SRP_LOGIN_RSP) { + ch->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); + ch->req_lim = be32_to_cpu(lrsp->req_lim_delta); + ch->use_imm_data = srp_use_imm_data && + (lrsp->rsp_flags & SRP_LOGIN_RSP_IMMED_SUPP); + ch->max_it_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, + ch->use_imm_data, + target->max_it_iu_size); + WARN_ON_ONCE(ch->max_it_iu_len > + be32_to_cpu(lrsp->max_it_iu_len)); + + if (ch->use_imm_data) + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "using immediate data\n"); + + /* + * Reserve credits for task management so we don't + * bounce requests back to the SCSI mid-layer. 
+ */ + target->scsi_host->can_queue + = min(ch->req_lim - SRP_TSK_MGMT_SQ_SIZE, + target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); + } else { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); + ret = -ECONNRESET; + goto error; + } + + if (!ch->rx_ring) { + ret = srp_alloc_iu_bufs(ch); + if (ret) + goto error; + } + + for (i = 0; i < target->queue_size; i++) { + struct srp_iu *iu = ch->rx_ring[i]; + + ret = srp_post_recv(ch, iu); + if (ret) + goto error; + } + + if (!target->using_rdma_cm) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL); + if (!qp_attr) + goto error; + + qp_attr->qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + qp_attr->qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + } + +error_free: + kfree(qp_attr); + +error: + ch->status = ret; +} + +static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event, + struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct Scsi_Host *shost = target->scsi_host; + struct ib_class_port_info *cpi; + int opcode; + u16 dlid; + + switch (event->param.rej_rcvd.reason) { + case IB_CM_REJ_PORT_CM_REDIRECT: + cpi = event->param.rej_rcvd.ari; + dlid = be16_to_cpu(cpi->redirect_lid); + sa_path_set_dlid(&ch->ib_cm.path, dlid); + ch->ib_cm.path.pkey = cpi->redirect_pkey; + cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff; + memcpy(ch->ib_cm.path.dgid.raw, cpi->redirect_gid, 16); + + ch->status = dlid ? SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; + break; + + case IB_CM_REJ_PORT_REDIRECT: + if (srp_target_is_topspin(target)) { + union ib_gid *dgid = &ch->ib_cm.path.dgid; + + /* + * Topspin/Cisco SRP gateways incorrectly send + * reject reason code 25 when they mean 24 + * (port redirect). 
+ */ + memcpy(dgid->raw, event->param.rej_rcvd.ari, 16); + + shost_printk(KERN_DEBUG, shost, + PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", + be64_to_cpu(dgid->global.subnet_prefix), + be64_to_cpu(dgid->global.interface_id)); + + ch->status = SRP_PORT_REDIRECT; + } else { + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_PORT_REDIRECT\n"); + ch->status = -ECONNRESET; + } + break; + + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + opcode = *(u8 *) event->private_data; + if (opcode == SRP_LOGIN_REJ) { + struct srp_login_rej *rej = event->private_data; + u32 reason = be32_to_cpu(rej->reason); + + if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + else + shost_printk(KERN_WARNING, shost, PFX + "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", + target->sgid.raw, + target->ib_cm.orig_dgid.raw, + reason); + } else + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," + " opcode 0x%02x\n", opcode); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n"); + ch->status = SRP_STALE_CONN; + break; + + default: + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->param.rej_rcvd.reason); + ch->status = -ECONNRESET; + } +} + +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) +{ + struct srp_rdma_ch *ch = cm_id->context; + struct srp_target_port *target = ch->target; + int comp = 0; + + switch (event->event) { + case IB_CM_REQ_ERROR: + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); + comp = 1; + ch->status = -ECONNRESET; + break; + + case IB_CM_REP_RECEIVED: + comp = 1; + srp_cm_rep_handler(cm_id, event->private_data, ch); + break; + + case IB_CM_REJ_RECEIVED: + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); + comp = 1; + + srp_ib_cm_rej_handler(cm_id, event, ch); + break; + + case IB_CM_DREQ_RECEIVED: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "DREQ received - connection closed\n"); + ch->connected = false; + if (ib_send_cm_drep(cm_id, NULL, 0)) + shost_printk(KERN_ERR, target->scsi_host, + PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); + break; + + case IB_CM_TIMEWAIT_EXIT: + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); + comp = 1; + + ch->status = 0; + break; + + case IB_CM_MRA_RECEIVED: + case IB_CM_DREQ_ERROR: + case IB_CM_DREP_RECEIVED: + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); + break; + } + + if (comp) + complete(&ch->done); + + return 0; +} + +static void srp_rdma_cm_rej_handler(struct srp_rdma_ch *ch, + struct rdma_cm_event *event) +{ + struct srp_target_port *target = ch->target; + struct Scsi_Host *shost = target->scsi_host; + int opcode; + + switch (event->status) { + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + opcode = *(u8 *) event->param.conn.private_data; + if (opcode == SRP_LOGIN_REJ) { + struct srp_login_rej *rej = + (struct srp_login_rej *) + 
event->param.conn.private_data; + u32 reason = be32_to_cpu(rej->reason); + + if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + else + shost_printk(KERN_WARNING, shost, + PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + } else { + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED, opcode 0x%02x\n", + opcode); + } + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, + " REJ reason: stale connection\n"); + ch->status = SRP_STALE_CONN; + break; + + default: + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->status); + ch->status = -ECONNRESET; + break; + } +} + +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srp_rdma_ch *ch = cm_id->context; + struct srp_target_port *target = ch->target; + int comp = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + ch->status = -ENXIO; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + ch->status = -EHOSTUNREACH; + comp = 1; + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); + comp = 1; + ch->status = -ECONNRESET; + break; + + case RDMA_CM_EVENT_ESTABLISHED: + comp = 1; + srp_cm_rep_handler(NULL, event->param.conn.private_data, ch); + break; + + case RDMA_CM_EVENT_REJECTED: + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); + comp = 1; + + srp_rdma_cm_rej_handler(ch, event); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + if (ch->connected) { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "received DREQ\n"); + rdma_disconnect(ch->rdma_cm.cm_id); + comp = 1; + ch->status = 0; + queue_work(system_long_wq, &target->tl_err_work); + } + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); + + comp = 1; + ch->status = 0; + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); + break; + } + + if (comp) + complete(&ch->done); + + return 0; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth) +{ + if (!sdev->tagged_supported) + qdepth = 1; + return scsi_change_queue_depth(sdev, qdepth); +} + +static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag, u64 lun, + u8 func, u8 *status) +{ + struct srp_target_port *target = ch->target; + struct srp_rport *rport = target->rport; + struct ib_device *dev = target->srp_host->srp_dev->dev; + struct srp_iu *iu; + struct srp_tsk_mgmt *tsk_mgmt; + int res; + + if (!ch->connected || target->qp_in_error) + return -1; + + /* + * Lock the rport mutex to avoid that srp_create_ch_ib() is + * invoked while a task management function is being sent. 
+ */ + mutex_lock(&rport->mutex); + spin_lock_irq(&ch->lock); + iu = __srp_get_tx_iu(ch, SRP_IU_TSK_MGMT); + spin_unlock_irq(&ch->lock); + + if (!iu) { + mutex_unlock(&rport->mutex); + + return -1; + } + + iu->num_sge = 1; + + ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); + tsk_mgmt = iu->buf; + memset(tsk_mgmt, 0, sizeof *tsk_mgmt); + + tsk_mgmt->opcode = SRP_TSK_MGMT; + int_to_scsilun(lun, &tsk_mgmt->lun); + tsk_mgmt->tsk_mgmt_func = func; + tsk_mgmt->task_tag = req_tag; + + spin_lock_irq(&ch->lock); + ch->tsk_mgmt_tag = (ch->tsk_mgmt_tag + 1) | SRP_TAG_TSK_MGMT; + tsk_mgmt->tag = ch->tsk_mgmt_tag; + spin_unlock_irq(&ch->lock); + + init_completion(&ch->tsk_mgmt_done); + + ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, + DMA_TO_DEVICE); + if (srp_post_send(ch, iu, sizeof(*tsk_mgmt))) { + srp_put_tx_iu(ch, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); + + return -1; + } + res = wait_for_completion_timeout(&ch->tsk_mgmt_done, + msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)); + if (res > 0 && status) + *status = ch->tsk_mgmt_status; + mutex_unlock(&rport->mutex); + + WARN_ON_ONCE(res < 0); + + return res > 0 ? 0 : -1; +} + +static int srp_abort(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req = (struct srp_request *) scmnd->host_scribble; + u32 tag; + u16 ch_idx; + struct srp_rdma_ch *ch; + int ret; + + shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); + + if (!req) + return SUCCESS; + tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmnd)); + ch_idx = blk_mq_unique_tag_to_hwq(tag); + if (WARN_ON_ONCE(ch_idx >= target->ch_count)) + return SUCCESS; + ch = &target->ch[ch_idx]; + if (!srp_claim_req(ch, req, NULL, scmnd)) + return SUCCESS; + shost_printk(KERN_ERR, target->scsi_host, + "Sending SRP abort for tag %#x\n", tag); + if (srp_send_tsk_mgmt(ch, tag, scmnd->device->lun, + SRP_TSK_ABORT_TASK, NULL) == 0) + ret = SUCCESS; + else if (target->rport->state == SRP_RPORT_LOST) + ret = FAST_IO_FAIL; + else + ret = FAILED; + if (ret == SUCCESS) { + srp_free_req(ch, req, scmnd, 0); + scmnd->result = DID_ABORT << 16; + scsi_done(scmnd); + } + + return ret; +} + +static int srp_reset_device(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_rdma_ch *ch; + u8 status; + + shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); + + ch = &target->ch[0]; + if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, scmnd->device->lun, + SRP_TSK_LUN_RESET, &status)) + return FAILED; + if (status) + return FAILED; + + return SUCCESS; +} + +static int srp_reset_host(struct scsi_cmnd *scmnd) +{ + struct srp_target_port *target = host_to_target(scmnd->device->host); + + shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); + + return srp_reconnect_rport(target->rport) == 0 ? 
SUCCESS : FAILED; +} + +static int srp_target_alloc(struct scsi_target *starget) +{ + struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); + struct srp_target_port *target = host_to_target(shost); + + if (target->target_can_queue) + starget->can_queue = target->target_can_queue; + return 0; +} + +static int srp_slave_configure(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct request_queue *q = sdev->request_queue; + unsigned long timeout; + + if (sdev->type == TYPE_DISK) { + timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies); + blk_queue_rq_timeout(q, timeout); + } + + return 0; +} + +static ssize_t id_ext_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "0x%016llx\n", be64_to_cpu(target->id_ext)); +} + +static DEVICE_ATTR_RO(id_ext); + +static ssize_t ioc_guid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "0x%016llx\n", be64_to_cpu(target->ioc_guid)); +} + +static DEVICE_ATTR_RO(ioc_guid); + +static ssize_t service_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + if (target->using_rdma_cm) + return -ENOENT; + return sysfs_emit(buf, "0x%016llx\n", + be64_to_cpu(target->ib_cm.service_id)); +} + +static DEVICE_ATTR_RO(service_id); + +static ssize_t pkey_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + if (target->using_rdma_cm) + return -ENOENT; + + return sysfs_emit(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey)); +} + +static DEVICE_ATTR_RO(pkey); + +static ssize_t sgid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%pI6\n", target->sgid.raw); +} + +static DEVICE_ATTR_RO(sgid); + +static ssize_t dgid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + struct srp_rdma_ch *ch = &target->ch[0]; + + if (target->using_rdma_cm) + return -ENOENT; + + return sysfs_emit(buf, "%pI6\n", ch->ib_cm.path.dgid.raw); +} + +static DEVICE_ATTR_RO(dgid); + +static ssize_t orig_dgid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + if (target->using_rdma_cm) + return -ENOENT; + + return sysfs_emit(buf, "%pI6\n", target->ib_cm.orig_dgid.raw); +} + +static DEVICE_ATTR_RO(orig_dgid); + +static ssize_t req_lim_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + struct srp_rdma_ch *ch; + int i, req_lim = INT_MAX; + + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + req_lim = min(req_lim, ch->req_lim); + } + + return sysfs_emit(buf, "%d\n", req_lim); +} + +static DEVICE_ATTR_RO(req_lim); + +static ssize_t zero_req_lim_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%d\n", target->zero_req_lim); +} + +static 
DEVICE_ATTR_RO(zero_req_lim); + +static ssize_t local_ib_port_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%d\n", target->srp_host->port); +} + +static DEVICE_ATTR_RO(local_ib_port); + +static ssize_t local_ib_device_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%s\n", + dev_name(&target->srp_host->srp_dev->dev->dev)); +} + +static DEVICE_ATTR_RO(local_ib_device); + +static ssize_t ch_count_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%d\n", target->ch_count); +} + +static DEVICE_ATTR_RO(ch_count); + +static ssize_t comp_vector_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%d\n", target->comp_vector); +} + +static DEVICE_ATTR_RO(comp_vector); + +static ssize_t tl_retry_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%d\n", target->tl_retry_count); +} + +static DEVICE_ATTR_RO(tl_retry_count); + +static ssize_t cmd_sg_entries_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%u\n", target->cmd_sg_cnt); +} + +static DEVICE_ATTR_RO(cmd_sg_entries); + +static ssize_t allow_ext_sg_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sysfs_emit(buf, "%s\n", target->allow_ext_sg ? 
"true" : "false"); +} + +static DEVICE_ATTR_RO(allow_ext_sg); + +static struct attribute *srp_host_attrs[] = { + &dev_attr_id_ext.attr, + &dev_attr_ioc_guid.attr, + &dev_attr_service_id.attr, + &dev_attr_pkey.attr, + &dev_attr_sgid.attr, + &dev_attr_dgid.attr, + &dev_attr_orig_dgid.attr, + &dev_attr_req_lim.attr, + &dev_attr_zero_req_lim.attr, + &dev_attr_local_ib_port.attr, + &dev_attr_local_ib_device.attr, + &dev_attr_ch_count.attr, + &dev_attr_comp_vector.attr, + &dev_attr_tl_retry_count.attr, + &dev_attr_cmd_sg_entries.attr, + &dev_attr_allow_ext_sg.attr, + NULL +}; + +ATTRIBUTE_GROUPS(srp_host); + +static struct scsi_host_template srp_template = { + .module = THIS_MODULE, + .name = "InfiniBand SRP initiator", + .proc_name = DRV_NAME, + .target_alloc = srp_target_alloc, + .slave_configure = srp_slave_configure, + .info = srp_target_info, + .init_cmd_priv = srp_init_cmd_priv, + .exit_cmd_priv = srp_exit_cmd_priv, + .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, + .eh_timed_out = srp_timed_out, + .eh_abort_handler = srp_abort, + .eh_device_reset_handler = srp_reset_device, + .eh_host_reset_handler = srp_reset_host, + .skip_settle_delay = true, + .sg_tablesize = SRP_DEF_SG_TABLESIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, + .this_id = -1, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, + .shost_groups = srp_host_groups, + .track_queue_depth = 1, + .cmd_size = sizeof(struct srp_request), +}; + +static int srp_sdev_count(struct Scsi_Host *host) +{ + struct scsi_device *sdev; + int c = 0; + + shost_for_each_device(sdev, host) + c++; + + return c; +} + +/* + * Return values: + * < 0 upon failure. Caller is responsible for SRP target port cleanup. + * 0 and target->state == SRP_TARGET_REMOVED if asynchronous target port + * removal has been scheduled. + * 0 and target->state != SRP_TARGET_REMOVED upon success. 
+ */ +static int srp_add_target(struct srp_host *host, struct srp_target_port *target) +{ + struct srp_rport_identifiers ids; + struct srp_rport *rport; + + target->state = SRP_TARGET_SCANNING; + sprintf(target->target_name, "SRP.T10:%016llX", + be64_to_cpu(target->id_ext)); + + if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dev.parent)) + return -ENODEV; + + memcpy(ids.port_id, &target->id_ext, 8); + memcpy(ids.port_id + 8, &target->ioc_guid, 8); + ids.roles = SRP_RPORT_ROLE_TARGET; + rport = srp_rport_add(target->scsi_host, &ids); + if (IS_ERR(rport)) { + scsi_remove_host(target->scsi_host); + return PTR_ERR(rport); + } + + rport->lld_data = target; + target->rport = rport; + + spin_lock(&host->target_lock); + list_add_tail(&target->list, &host->target_list); + spin_unlock(&host->target_lock); + + scsi_scan_target(&target->scsi_host->shost_gendev, + 0, target->scsi_id, SCAN_WILD_CARD, SCSI_SCAN_INITIAL); + + if (srp_connected_ch(target) < target->ch_count || + target->qp_in_error) { + shost_printk(KERN_INFO, target->scsi_host, + PFX "SCSI scan failed - removing SCSI host\n"); + srp_queue_remove_work(target); + goto out; + } + + pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n", + dev_name(&target->scsi_host->shost_gendev), + srp_sdev_count(target->scsi_host)); + + spin_lock_irq(&target->lock); + if (target->state == SRP_TARGET_SCANNING) + target->state = SRP_TARGET_LIVE; + spin_unlock_irq(&target->lock); + +out: + return 0; +} + +static void srp_release_dev(struct device *dev) +{ + struct srp_host *host = + container_of(dev, struct srp_host, dev); + + complete(&host->released); +} + +static struct class srp_class = { + .name = "infiniband_srp", + .dev_release = srp_release_dev +}; + +/** + * srp_conn_unique() - check whether the connection to a target is unique + * @host: SRP host. + * @target: SRP target port. + */ +static bool srp_conn_unique(struct srp_host *host, + struct srp_target_port *target) +{ + struct srp_target_port *t; + bool ret = false; + + if (target->state == SRP_TARGET_REMOVED) + goto out; + + ret = true; + + spin_lock(&host->target_lock); + list_for_each_entry(t, &host->target_list, list) { + if (t != target && + target->id_ext == t->id_ext && + target->ioc_guid == t->ioc_guid && + target->initiator_ext == t->initiator_ext) { + ret = false; + break; + } + } + spin_unlock(&host->target_lock); + +out: + return ret; +} + +/* + * Target ports are added by writing + * + * id_ext=<SRP identifier extension>,ioc_guid=<EUI-64>,dgid=<dest GID>, + * pkey=<P_Key>,service_id=<service ID> + * or + * id_ext=<SRP identifier extension>,ioc_guid=<EUI-64>, + * [src=<IPv4 address>,]dest=<IPv4 address>:<port number> + * + * to the add_target sysfs attribute.
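+ * + * A purely illustrative (hypothetical) example of such a write, with made-up identifiers and a made-up host entry name (the real entry under /sys/class/infiniband_srp depends on the local HCA and port): + * + * echo id_ext=200100e08b123456,ioc_guid=0002c90300a1b2c3,dgid=fe800000000000000002c90300a1b2c4,pkey=ffff,service_id=0002c90300a1b2c3 > /sys/class/infiniband_srp/srp-mlx5_0-1/add_target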
+ */ +enum { + SRP_OPT_ERR = 0, + SRP_OPT_ID_EXT = 1 << 0, + SRP_OPT_IOC_GUID = 1 << 1, + SRP_OPT_DGID = 1 << 2, + SRP_OPT_PKEY = 1 << 3, + SRP_OPT_SERVICE_ID = 1 << 4, + SRP_OPT_MAX_SECT = 1 << 5, + SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, + SRP_OPT_IO_CLASS = 1 << 7, + SRP_OPT_INITIATOR_EXT = 1 << 8, + SRP_OPT_CMD_SG_ENTRIES = 1 << 9, + SRP_OPT_ALLOW_EXT_SG = 1 << 10, + SRP_OPT_SG_TABLESIZE = 1 << 11, + SRP_OPT_COMP_VECTOR = 1 << 12, + SRP_OPT_TL_RETRY_COUNT = 1 << 13, + SRP_OPT_QUEUE_SIZE = 1 << 14, + SRP_OPT_IP_SRC = 1 << 15, + SRP_OPT_IP_DEST = 1 << 16, + SRP_OPT_TARGET_CAN_QUEUE= 1 << 17, + SRP_OPT_MAX_IT_IU_SIZE = 1 << 18, + SRP_OPT_CH_COUNT = 1 << 19, +}; + +static unsigned int srp_opt_mandatory[] = { + SRP_OPT_ID_EXT | + SRP_OPT_IOC_GUID | + SRP_OPT_DGID | + SRP_OPT_PKEY | + SRP_OPT_SERVICE_ID, + SRP_OPT_ID_EXT | + SRP_OPT_IOC_GUID | + SRP_OPT_IP_DEST, +}; + +static const match_table_t srp_opt_tokens = { + { SRP_OPT_ID_EXT, "id_ext=%s" }, + { SRP_OPT_IOC_GUID, "ioc_guid=%s" }, + { SRP_OPT_DGID, "dgid=%s" }, + { SRP_OPT_PKEY, "pkey=%x" }, + { SRP_OPT_SERVICE_ID, "service_id=%s" }, + { SRP_OPT_MAX_SECT, "max_sect=%d" }, + { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, + { SRP_OPT_TARGET_CAN_QUEUE, "target_can_queue=%d" }, + { SRP_OPT_IO_CLASS, "io_class=%x" }, + { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, + { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, + { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, + { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, + { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, + { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, + { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, + { SRP_OPT_IP_SRC, "src=%s" }, + { SRP_OPT_IP_DEST, "dest=%s" }, + { SRP_OPT_MAX_IT_IU_SIZE, "max_it_iu_size=%d" }, + { SRP_OPT_CH_COUNT, "ch_count=%u", }, + { SRP_OPT_ERR, NULL } +}; + +/** + * srp_parse_in - parse an IP address and port number combination + * @net: [in] Network namespace. + * @sa: [out] Address family, IP address and port number. + * @addr_port_str: [in] IP address and port number. + * @has_port: [out] Whether or not @addr_port_str includes a port number. + * + * Parse the following address formats: + * - IPv4: <ip_address>:<port>, e.g. 1.2.3.4:5. + * - IPv6: \[<ipv6_address>\]:<port>, e.g. [1::2:3%4]:5.
+ */ +static int srp_parse_in(struct net *net, struct sockaddr_storage *sa, + const char *addr_port_str, bool *has_port) +{ + char *addr_end, *addr = kstrdup(addr_port_str, GFP_KERNEL); + char *port_str; + int ret; + + if (!addr) + return -ENOMEM; + port_str = strrchr(addr, ':'); + if (port_str && strchr(port_str, ']')) + port_str = NULL; + if (port_str) + *port_str++ = '\0'; + if (has_port) + *has_port = port_str != NULL; + ret = inet_pton_with_scope(net, AF_INET, addr, port_str, sa); + if (ret && addr[0]) { + addr_end = addr + strlen(addr) - 1; + if (addr[0] == '[' && *addr_end == ']') { + *addr_end = '\0'; + ret = inet_pton_with_scope(net, AF_INET6, addr + 1, + port_str, sa); + } + } + kfree(addr); + pr_debug("%s -> %pISpfsc\n", addr_port_str, sa); + return ret; +} + +static int srp_parse_options(struct net *net, const char *buf, + struct srp_target_port *target) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + unsigned long long ull; + bool has_port; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = options; + while ((p = strsep(&sep_opt, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, srp_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case SRP_OPT_ID_EXT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid id_ext parameter '%s'\n", p); + kfree(p); + goto out; + } + target->id_ext = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_IOC_GUID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid ioc_guid parameter '%s'\n", p); + kfree(p); + goto out; + } + target->ioc_guid = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_DGID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) != 32) { + pr_warn("bad dest GID parameter '%s'\n", p); + kfree(p); + goto out; + } + + ret = hex2bin(target->ib_cm.orig_dgid.raw, p, 16); + kfree(p); + if (ret < 0) + goto out; + break; + + case SRP_OPT_PKEY: + if (match_hex(args, &token)) { + pr_warn("bad P_Key parameter '%s'\n", p); + goto out; + } + target->ib_cm.pkey = cpu_to_be16(token); + break; + + case SRP_OPT_SERVICE_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad service_id parameter '%s'\n", p); + kfree(p); + goto out; + } + target->ib_cm.service_id = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_IP_SRC: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.src.ss, p, + NULL); + if (ret < 0) { + pr_warn("bad source parameter '%s'\n", p); + kfree(p); + goto out; + } + target->rdma_cm.src_specified = true; + kfree(p); + break; + + case SRP_OPT_IP_DEST: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p, + &has_port); + if (!has_port) + ret = -EINVAL; + if (ret < 0) { + pr_warn("bad dest parameter '%s'\n", p); + kfree(p); + goto out; + } + target->using_rdma_cm = true; + kfree(p); + break; + + case SRP_OPT_MAX_SECT: + if (match_int(args, &token)) { + pr_warn("bad max sect parameter '%s'\n", p); + goto out; + } + target->scsi_host->max_sectors = token; + break; + + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad 
queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_MAX_CMD_PER_LUN: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max cmd_per_lun parameter '%s'\n", + p); + goto out; + } + target->scsi_host->cmd_per_lun = token; + break; + + case SRP_OPT_TARGET_CAN_QUEUE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max target_can_queue parameter '%s'\n", + p); + goto out; + } + target->target_can_queue = token; + break; + + case SRP_OPT_IO_CLASS: + if (match_hex(args, &token)) { + pr_warn("bad IO class parameter '%s'\n", p); + goto out; + } + if (token != SRP_REV10_IB_IO_CLASS && + token != SRP_REV16A_IB_IO_CLASS) { + pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n", + token, SRP_REV10_IB_IO_CLASS, + SRP_REV16A_IB_IO_CLASS); + goto out; + } + target->io_class = token; + break; + + case SRP_OPT_INITIATOR_EXT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad initiator_ext value '%s'\n", p); + kfree(p); + goto out; + } + target->initiator_ext = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_CMD_SG_ENTRIES: + if (match_int(args, &token) || token < 1 || token > 255) { + pr_warn("bad max cmd_sg_entries parameter '%s'\n", + p); + goto out; + } + target->cmd_sg_cnt = token; + break; + + case SRP_OPT_ALLOW_EXT_SG: + if (match_int(args, &token)) { + pr_warn("bad allow_ext_sg parameter '%s'\n", p); + goto out; + } + target->allow_ext_sg = !!token; + break; + + case SRP_OPT_SG_TABLESIZE: + if (match_int(args, &token) || token < 1 || + token > SG_MAX_SEGMENTS) { + pr_warn("bad max sg_tablesize parameter '%s'\n", + p); + goto out; + } + target->sg_tablesize = token; + break; + + case SRP_OPT_COMP_VECTOR: + if (match_int(args, &token) || token < 0) { + pr_warn("bad comp_vector parameter '%s'\n", p); + goto out; + } + target->comp_vector = token; + break; + + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + + case SRP_OPT_MAX_IT_IU_SIZE: + if (match_int(args, &token) || token < 0) { + pr_warn("bad maximum initiator to target IU size '%s'\n", p); + goto out; + } + target->max_it_iu_size = token; + break; + + case SRP_OPT_CH_COUNT: + if (match_int(args, &token) || token < 1) { + pr_warn("bad channel count %s\n", p); + goto out; + } + target->ch_count = token; + break; + + default: + pr_warn("unknown parameter or missing value '%s' in target creation request\n", + p); + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(srp_opt_mandatory); i++) { + if ((opt_mask & srp_opt_mandatory[i]) == srp_opt_mandatory[i]) { + ret = 0; + break; + } + } + if (ret) + pr_warn("target creation request is missing one or more parameters\n"); + + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); + +out: + kfree(options); + return ret; +} + +static ssize_t add_target_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct srp_host *host = + container_of(dev, struct srp_host, 
dev); + struct Scsi_Host *target_host; + struct srp_target_port *target; + struct srp_rdma_ch *ch; + struct srp_device *srp_dev = host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + int ret, i, ch_idx; + unsigned int max_sectors_per_mr, mr_per_cmd = 0; + bool multich = false; + uint32_t max_iu_len; + + target_host = scsi_host_alloc(&srp_template, + sizeof (struct srp_target_port)); + if (!target_host) + return -ENOMEM; + + target_host->transportt = ib_srp_transport_template; + target_host->max_channel = 0; + target_host->max_id = 1; + target_host->max_lun = -1LL; + target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; + target_host->max_segment_size = ib_dma_max_seg_size(ibdev); + target_host->virt_boundary_mask = ~srp_dev->mr_page_mask; + + target = host_to_target(target_host); + + target->net = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + target->io_class = SRP_REV16A_IB_IO_CLASS; + target->scsi_host = target_host; + target->srp_host = host; + target->lkey = host->srp_dev->pd->local_dma_lkey; + target->global_rkey = host->srp_dev->global_rkey; + target->cmd_sg_cnt = cmd_sg_entries; + target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; + target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; + + /* + * Avoid that the SCSI host can be removed by srp_remove_target() + * before this function returns. + */ + scsi_host_get(target->scsi_host); + + ret = mutex_lock_interruptible(&host->add_target_mutex); + if (ret < 0) + goto put; + + ret = srp_parse_options(target->net, buf, target); + if (ret) + goto out; + + if (!srp_conn_unique(target->srp_host, target)) { + if (target->using_rdma_cm) { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%pIS\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + &target->rdma_cm.dst); + } else { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be64_to_cpu(target->initiator_ext)); + } + ret = -EEXIST; + goto out; + } + + if (!srp_dev->has_fr && !target->allow_ext_sg && + target->cmd_sg_cnt < target->sg_tablesize) { + pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); + target->sg_tablesize = target->cmd_sg_cnt; + } + + if (srp_dev->use_fast_reg) { + max_sectors_per_mr = srp_dev->max_pages_per_mr << + (ilog2(srp_dev->mr_page_size) - 9); + + /* + * FR and FMR can only map one HCA page per entry. If + * the start address is not aligned on a HCA page + * boundary two entries will be used for the head and + * the tail although these two entries combined + * contain at most one HCA page of data. Hence the "+ + * 1" in the calculation below. + * + * The indirect data buffer descriptor is contiguous + * so the memory for that buffer will only be + * registered if register_always is true. Hence add + * one to mr_per_cmd if register_always has been set. 
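+		 *
+		 * Illustrative numbers (editorial addition, not in the original
+		 * source): with mr_page_size = 4096 and max_pages_per_mr = 512,
+		 * max_sectors_per_mr = 512 << (12 - 9) = 4096; for
+		 * max_sectors = 1024 and register_always = 1 this gives
+		 * mr_per_cmd = 1 + (1024 + 1 + 4095) / 4096 = 2.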
+ */ + mr_per_cmd = register_always + + (target->scsi_host->max_sectors + 1 + + max_sectors_per_mr - 1) / max_sectors_per_mr; + + pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", + target->scsi_host->max_sectors, srp_dev->max_pages_per_mr, srp_dev->mr_page_size, + max_sectors_per_mr, mr_per_cmd); + } + + target_host->sg_tablesize = target->sg_tablesize; + target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd; + target->mr_per_cmd = mr_per_cmd; + target->indirect_size = target->sg_tablesize * + sizeof (struct srp_direct_buf); + max_iu_len = srp_max_it_iu_len(target->cmd_sg_cnt, + srp_use_imm_data, + target->max_it_iu_size); + + INIT_WORK(&target->tl_err_work, srp_tl_err_work); + INIT_WORK(&target->remove_work, srp_remove_work); + spin_lock_init(&target->lock); + ret = rdma_query_gid(ibdev, host->port, 0, &target->sgid); + if (ret) + goto out; + + ret = -ENOMEM; + if (target->ch_count == 0) { + target->ch_count = + min(ch_count ?: + max(4 * num_online_nodes(), + ibdev->num_comp_vectors), + num_online_cpus()); + } + + target->ch = kcalloc(target->ch_count, sizeof(*target->ch), + GFP_KERNEL); + if (!target->ch) + goto out; + + for (ch_idx = 0; ch_idx < target->ch_count; ++ch_idx) { + ch = &target->ch[ch_idx]; + ch->target = target; + ch->comp_vector = ch_idx % ibdev->num_comp_vectors; + spin_lock_init(&ch->lock); + INIT_LIST_HEAD(&ch->free_tx); + ret = srp_new_cm_id(ch); + if (ret) + goto err_disconnect; + + ret = srp_create_ch_ib(ch); + if (ret) + goto err_disconnect; + + ret = srp_connect_ch(ch, max_iu_len, multich); + if (ret) { + char dst[64]; + + if (target->using_rdma_cm) + snprintf(dst, sizeof(dst), "%pIS", + &target->rdma_cm.dst); + else + snprintf(dst, sizeof(dst), "%pI6", + target->ib_cm.orig_dgid.raw); + shost_printk(KERN_ERR, target->scsi_host, + PFX "Connection %d/%d to %s failed\n", + ch_idx, + target->ch_count, dst); + if (ch_idx == 0) { + goto free_ch; + } else { + srp_free_ch_ib(target, ch); + target->ch_count = ch - target->ch; + goto connected; + } + } + multich = true; + } + +connected: + target->scsi_host->nr_hw_queues = target->ch_count; + + ret = srp_add_target(host, target); + if (ret) + goto err_disconnect; + + if (target->state != SRP_TARGET_REMOVED) { + if (target->using_rdma_cm) { + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %pIS\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + target->sgid.raw, &target->rdma_cm.dst); + } else { + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id), + target->sgid.raw, + target->ib_cm.orig_dgid.raw); + } + } + + ret = count; + +out: + mutex_unlock(&host->add_target_mutex); + +put: + scsi_host_put(target->scsi_host); + if (ret < 0) { + /* + * If a call to srp_remove_target() has not been scheduled, + * drop the network namespace reference now that was obtained + * earlier in this function. 
+ */ + if (target->state != SRP_TARGET_REMOVED) + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); + scsi_host_put(target->scsi_host); + } + + return ret; + +err_disconnect: + srp_disconnect_target(target); + +free_ch: + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); + } + + kfree(target->ch); + goto out; +} + +static DEVICE_ATTR_WO(add_target); + +static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_host *host = container_of(dev, struct srp_host, dev); + + return sysfs_emit(buf, "%s\n", dev_name(&host->srp_dev->dev->dev)); +} + +static DEVICE_ATTR_RO(ibdev); + +static ssize_t port_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_host *host = container_of(dev, struct srp_host, dev); + + return sysfs_emit(buf, "%d\n", host->port); +} + +static DEVICE_ATTR_RO(port); + +static struct srp_host *srp_add_port(struct srp_device *device, u8 port) +{ + struct srp_host *host; + + host = kzalloc(sizeof *host, GFP_KERNEL); + if (!host) + return NULL; + + INIT_LIST_HEAD(&host->target_list); + spin_lock_init(&host->target_lock); + init_completion(&host->released); + mutex_init(&host->add_target_mutex); + host->srp_dev = device; + host->port = port; + + host->dev.class = &srp_class; + host->dev.parent = device->dev->dev.parent; + dev_set_name(&host->dev, "srp-%s-%d", dev_name(&device->dev->dev), + port); + + if (device_register(&host->dev)) + goto free_host; + if (device_create_file(&host->dev, &dev_attr_add_target)) + goto err_class; + if (device_create_file(&host->dev, &dev_attr_ibdev)) + goto err_class; + if (device_create_file(&host->dev, &dev_attr_port)) + goto err_class; + + return host; + +err_class: + device_unregister(&host->dev); + +free_host: + kfree(host); + + return NULL; +} + +static void srp_rename_dev(struct ib_device *device, void *client_data) +{ + struct srp_device *srp_dev = client_data; + struct srp_host *host, *tmp_host; + + list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { + char name[IB_DEVICE_NAME_MAX + 8]; + + snprintf(name, sizeof(name), "srp-%s-%d", + dev_name(&device->dev), host->port); + device_rename(&host->dev, name); + } +} + +static int srp_add_one(struct ib_device *device) +{ + struct srp_device *srp_dev; + struct ib_device_attr *attr = &device->attrs; + struct srp_host *host; + int mr_page_shift; + unsigned int p; + u64 max_pages_per_mr; + unsigned int flags = 0; + + srp_dev = kzalloc(sizeof(*srp_dev), GFP_KERNEL); + if (!srp_dev) + return -ENOMEM; + + /* + * Use the smallest page size supported by the HCA, down to a + * minimum of 4096 bytes. We're unlikely to build large sglists + * out of smaller entries. 
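+	 *
+	 * Illustrative numbers (editorial addition, not in the original
+	 * source): a device whose page_size_cap advertises 4 KiB pages and
+	 * larger, e.g. 0xfffff000, yields ffs() - 1 = 12, so
+	 * mr_page_size = 1 << 12 = 4096 bytes.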
+ */ + mr_page_shift = max(12, ffs(attr->page_size_cap) - 1); + srp_dev->mr_page_size = 1 << mr_page_shift; + srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); + max_pages_per_mr = attr->max_mr_size; + do_div(max_pages_per_mr, srp_dev->mr_page_size); + pr_debug("%s: %llu / %u = %llu <> %u\n", __func__, + attr->max_mr_size, srp_dev->mr_page_size, + max_pages_per_mr, SRP_MAX_PAGES_PER_MR); + srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, + max_pages_per_mr); + + srp_dev->has_fr = (attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!never_register && !srp_dev->has_fr) + dev_warn(&device->dev, "FR is not supported\n"); + else if (!never_register && + attr->max_mr_size >= 2 * srp_dev->mr_page_size) + srp_dev->use_fast_reg = srp_dev->has_fr; + + if (never_register || !register_always || !srp_dev->has_fr) + flags |= IB_PD_UNSAFE_GLOBAL_RKEY; + + if (srp_dev->use_fast_reg) { + srp_dev->max_pages_per_mr = + min_t(u32, srp_dev->max_pages_per_mr, + attr->max_fast_reg_page_list_len); + } + srp_dev->mr_max_size = srp_dev->mr_page_size * + srp_dev->max_pages_per_mr; + pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + dev_name(&device->dev), mr_page_shift, attr->max_mr_size, + attr->max_fast_reg_page_list_len, + srp_dev->max_pages_per_mr, srp_dev->mr_max_size); + + INIT_LIST_HEAD(&srp_dev->dev_list); + + srp_dev->dev = device; + srp_dev->pd = ib_alloc_pd(device, flags); + if (IS_ERR(srp_dev->pd)) { + int ret = PTR_ERR(srp_dev->pd); + + kfree(srp_dev); + return ret; + } + + if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + srp_dev->global_rkey = srp_dev->pd->unsafe_global_rkey; + WARN_ON_ONCE(srp_dev->global_rkey == 0); + } + + rdma_for_each_port (device, p) { + host = srp_add_port(srp_dev, p); + if (host) + list_add_tail(&host->list, &srp_dev->dev_list); + } + + ib_set_client_data(device, &srp_client, srp_dev); + return 0; +} + +static void srp_remove_one(struct ib_device *device, void *client_data) +{ + struct srp_device *srp_dev; + struct srp_host *host, *tmp_host; + struct srp_target_port *target; + + srp_dev = client_data; + + list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { + device_unregister(&host->dev); + /* + * Wait for the sysfs entry to go away, so that no new + * target ports can be created. + */ + wait_for_completion(&host->released); + + /* + * Remove all target ports. + */ + spin_lock(&host->target_lock); + list_for_each_entry(target, &host->target_list, list) + srp_queue_remove_work(target); + spin_unlock(&host->target_lock); + + /* + * srp_queue_remove_work() queues a call to + * srp_remove_target(). The latter function cancels + * target->tl_err_work so waiting for the remove works to + * finish is sufficient. 
+ */ + flush_workqueue(srp_remove_wq); + + kfree(host); + } + + ib_dealloc_pd(srp_dev->pd); + + kfree(srp_dev); +} + +static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, + .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, +}; + +static int __init srp_init_module(void) +{ + int ret; + + BUILD_BUG_ON(sizeof(struct srp_aer_req) != 36); + BUILD_BUG_ON(sizeof(struct srp_cmd) != 48); + BUILD_BUG_ON(sizeof(struct srp_imm_buf) != 4); + BUILD_BUG_ON(sizeof(struct srp_indirect_buf) != 20); + BUILD_BUG_ON(sizeof(struct srp_login_req) != 64); + BUILD_BUG_ON(sizeof(struct srp_login_req_rdma) != 56); + BUILD_BUG_ON(sizeof(struct srp_rsp) != 36); + + if (srp_sg_tablesize) { + pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); + if (!cmd_sg_entries) + cmd_sg_entries = srp_sg_tablesize; + } + + if (!cmd_sg_entries) + cmd_sg_entries = SRP_DEF_SG_TABLESIZE; + + if (cmd_sg_entries > 255) { + pr_warn("Clamping cmd_sg_entries to 255\n"); + cmd_sg_entries = 255; + } + + if (!indirect_sg_entries) + indirect_sg_entries = cmd_sg_entries; + else if (indirect_sg_entries < cmd_sg_entries) { + pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", + cmd_sg_entries); + indirect_sg_entries = cmd_sg_entries; + } + + if (indirect_sg_entries > SG_MAX_SEGMENTS) { + pr_warn("Clamping indirect_sg_entries to %u\n", + SG_MAX_SEGMENTS); + indirect_sg_entries = SG_MAX_SEGMENTS; + } + + srp_remove_wq = create_workqueue("srp_remove"); + if (!srp_remove_wq) { + ret = -ENOMEM; + goto out; + } + + ret = -ENOMEM; + ib_srp_transport_template = + srp_attach_transport(&ib_srp_transport_functions); + if (!ib_srp_transport_template) + goto destroy_wq; + + ret = class_register(&srp_class); + if (ret) { + pr_err("couldn't register class infiniband_srp\n"); + goto release_tr; + } + + ib_sa_register_client(&srp_sa_client); + + ret = ib_register_client(&srp_client); + if (ret) { + pr_err("couldn't register IB client\n"); + goto unreg_sa; + } + +out: + return ret; + +unreg_sa: + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + +release_tr: + srp_release_transport(ib_srp_transport_template); + +destroy_wq: + destroy_workqueue(srp_remove_wq); + goto out; +} + +static void __exit srp_cleanup_module(void) +{ + ib_unregister_client(&srp_client); + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + srp_release_transport(ib_srp_transport_template); + destroy_workqueue(srp_remove_wq); +} + +module_init(srp_init_module); +module_exit(srp_cleanup_module); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.h b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.h new file mode 100644 index 0000000..abccdde --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp.h @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_SRP_H +#define IB_SRP_H + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +enum { + SRP_PATH_REC_TIMEOUT_MS = 1000, + SRP_ABORT_TIMEOUT_MS = 5000, + + SRP_PORT_REDIRECT = 1, + SRP_DLID_REDIRECT = 2, + SRP_STALE_CONN = 3, + + SRP_DEF_SG_TABLESIZE = 12, + + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, + SRP_RSP_SQ_SIZE = 1, + SRP_TSK_MGMT_SQ_SIZE = 1, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, + + SRP_TAG_NO_REQ = ~0U, + SRP_TAG_TSK_MGMT = 1U << 31, + + SRP_MAX_PAGES_PER_MR = 512, + + SRP_MAX_ADD_CDB_LEN = 16, + + SRP_MAX_IMM_SGE = 2, + SRP_MAX_SGE = SRP_MAX_IMM_SGE + 1, + /* + * Choose the immediate data offset such that a 32 byte CDB still fits. + */ + SRP_IMM_DATA_OFFSET = sizeof(struct srp_cmd) + + SRP_MAX_ADD_CDB_LEN + + sizeof(struct srp_imm_buf), +}; + +enum srp_target_state { + SRP_TARGET_SCANNING, + SRP_TARGET_LIVE, + SRP_TARGET_REMOVED, +}; + +enum srp_iu_type { + SRP_IU_CMD, + SRP_IU_TSK_MGMT, + SRP_IU_RSP, +}; + +/* + * @mr_page_mask: HCA memory registration page mask. + * @mr_page_size: HCA memory registration page size. + * @mr_max_size: Maximum size in bytes of a single FR registration request. + */ +struct srp_device { + struct list_head dev_list; + struct ib_device *dev; + struct ib_pd *pd; + u32 global_rkey; + u64 mr_page_mask; + int mr_page_size; + int mr_max_size; + int max_pages_per_mr; + bool has_fr; + bool use_fast_reg; +}; + +struct srp_host { + struct srp_device *srp_dev; + u8 port; + struct device dev; + struct list_head target_list; + spinlock_t target_lock; + struct completion released; + struct list_head list; + struct mutex add_target_mutex; +}; + +struct srp_request { + struct scsi_cmnd *scmnd; + struct srp_iu *cmd; + struct srp_fr_desc **fr_list; + struct srp_direct_buf *indirect_desc; + dma_addr_t indirect_dma_addr; + short nmdesc; + struct ib_cqe reg_cqe; +}; + +/** + * struct srp_rdma_ch + * @comp_vector: Completion vector used by this RDMA channel. + * @max_it_iu_len: Maximum initiator-to-target information unit length. + * @max_ti_iu_len: Maximum target-to-initiator information unit length. 
+ */ +struct srp_rdma_ch { + /* These are RW in the hot path, and commonly used together */ + struct list_head free_tx; + spinlock_t lock; + s32 req_lim; + + /* These are read-only in the hot path */ + struct srp_target_port *target ____cacheline_aligned_in_smp; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_qp *qp; + struct srp_fr_pool *fr_pool; + uint32_t max_it_iu_len; + uint32_t max_ti_iu_len; + u8 max_imm_sge; + bool use_imm_data; + + /* Everything above this point is used in the hot path of + * command processing. Try to keep them packed into cachelines. + */ + + struct completion done; + int status; + + union { + struct ib_cm { + struct sa_path_rec path; + struct ib_sa_query *path_query; + int path_query_id; + struct ib_cm_id *cm_id; + } ib_cm; + struct rdma_cm { + struct rdma_cm_id *cm_id; + } rdma_cm; + }; + + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + int comp_vector; + + u64 tsk_mgmt_tag; + struct completion tsk_mgmt_done; + u8 tsk_mgmt_status; + bool connected; +}; + +/** + * struct srp_target_port + * @comp_vector: Completion vector used by the first RDMA channel created for + * this target port. + */ +struct srp_target_port { + /* read and written in the hot path */ + spinlock_t lock; + + /* read only in the hot path */ + u32 global_rkey; + struct srp_rdma_ch *ch; + struct net *net; + u32 ch_count; + u32 lkey; + enum srp_target_state state; + uint32_t max_it_iu_size; + unsigned int cmd_sg_cnt; + unsigned int indirect_size; + bool allow_ext_sg; + + /* other member variables */ + union ib_gid sgid; + __be64 id_ext; + __be64 ioc_guid; + __be64 initiator_ext; + u16 io_class; + struct srp_host *srp_host; + struct Scsi_Host *scsi_host; + struct srp_rport *rport; + char target_name[32]; + unsigned int scsi_id; + unsigned int sg_tablesize; + unsigned int target_can_queue; + int mr_pool_size; + int mr_per_cmd; + int queue_size; + int comp_vector; + int tl_retry_count; + + bool using_rdma_cm; + + union { + struct { + __be64 service_id; + union ib_gid orig_dgid; + __be16 pkey; + } ib_cm; + struct { + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr sa; + struct sockaddr_storage ss; + } src; + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr sa; + struct sockaddr_storage ss; + } dst; + bool src_specified; + } rdma_cm; + }; + + u32 rq_tmo_jiffies; + + int zero_req_lim; + + struct work_struct tl_err_work; + struct work_struct remove_work; + + struct list_head list; + bool qp_in_error; +}; + +struct srp_iu { + struct list_head list; + u64 dma; + void *buf; + size_t size; + enum dma_data_direction direction; + u32 num_sge; + struct ib_sge sge[SRP_MAX_SGE]; + struct ib_cqe cqe; +}; + +/** + * struct srp_fr_desc - fast registration work request arguments + * @entry: Entry in srp_fr_pool.free_list. + * @mr: Memory region. + * @frpl: Fast registration page list. + */ +struct srp_fr_desc { + struct list_head entry; + struct ib_mr *mr; +}; + +/** + * struct srp_fr_pool - pool of fast registration descriptors + * + * An entry is available for allocation if and only if it occurs in @free_list. + * + * @size: Number of descriptors in this pool. + * @max_page_list_len: Maximum fast registration work request page list length. + * @lock: Protects free_list. + * @free_list: List of free descriptors. + * @desc: Fast registration descriptor pool. 
+ */ +struct srp_fr_pool { + int size; + int max_page_list_len; + spinlock_t lock; + struct list_head free_list; + struct srp_fr_desc desc[]; +}; + +/** + * struct srp_map_state - per-request DMA memory mapping state + * @desc: Pointer to the element of the SRP buffer descriptor array + * that is being filled in. + * @pages: Array with DMA addresses of pages being considered for + * memory registration. + * @base_dma_addr: DMA address of the first page that has not yet been mapped. + * @dma_len: Number of bytes that will be registered with the next FR + * memory registration call. + * @total_len: Total number of bytes in the sg-list being mapped. + * @npages: Number of page addresses in the pages[] array. + * @nmdesc: Number of FR memory descriptors used for mapping. + * @ndesc: Number of SRP buffer descriptors that have been filled in. + */ +struct srp_map_state { + union { + struct { + struct srp_fr_desc **next; + struct srp_fr_desc **end; + } fr; + struct { + void **next; + void **end; + } gen; + }; + struct srp_direct_buf *desc; + union { + u64 *pages; + struct scatterlist *sg; + }; + dma_addr_t base_dma_addr; + u32 dma_len; + u32 total_len; + unsigned int npages; + unsigned int nmdesc; + unsigned int ndesc; +}; + +#endif /* IB_SRP_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp_dummy.c new file mode 100644 index 0000000..a5cf3b3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/ib_srp_dummy.c @@ -0,0 +1,61 @@ + +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ib_srp" +#define PFX DRV_NAME ": " +#define DRV_VERSION "5.8-1.1.2" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_srp dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init srp_init(void) +{ + return 0; +} + +static void __exit srp_cleanup(void) +{ +} + +module_init(srp_init); +module_exit(srp_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/srp_spec_ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/srp_spec_ new file mode 100644 index 0000000..aee942f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/srp_spec_ @@ -0,0 +1,221 @@ +# +# Copyright (c) 2014 Mellanox Technologies. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. +# +# + +%{!?_name: %define _name srp} +%{!?_version: %define _version 4.0} +%{!?_release: %define _release 0} + +# KMP is disabled by default +%{!?KMP: %global KMP 0} + +# take kernel version or default to uname -r +%{!?KVERSION: %global KVERSION %(uname -r)} +%global kernel_version %{KVERSION} +%global krelver %(echo -n %{KVERSION} | sed -e 's/-/_/g') +# take path to kernel sources if provided, otherwise look in default location (for non KMP rpms). +%{!?K_SRC: %global K_SRC /lib/modules/%{KVERSION}/build} + +# define release version +%{!?src_release: %global src_release %{_release}_%{krelver}} +%if "%{KMP}" != "1" +%global _release1 %{src_release} +%else +%global _release1 %{_release} +%endif +%global _kmp_rel %{_release1}%{?_kmp_build_num}%{?_dist} + +Summary: %{_name} Driver +Name: %{_name} +Version: %{_version} +Release: %{_release1}%{?_dist} +License: GPLv2 +Url: http://www.mellanox.com +Group: System Environment/Base +Source: %{_name}-%{_version}.tgz +BuildRoot: %{?build_root:%{build_root}}%{!?build_root:/var/tmp/OFED} +Vendor: Mellanox Technologies +%description +%{name} kernel modules + +# build KMP rpms? 
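(Editorial aside, not part of the spec file: the macros declared above are normally overridden on the rpmbuild command line. A hedged sketch of such an invocation, with the spec file name assumed purely for illustration:

    rpmbuild -ba srp.spec \
        --define "KVERSION $(uname -r)" \
        --define "K_SRC /lib/modules/$(uname -r)/build" \
        --define "KMP 0"

KVERSION and K_SRC default to the running kernel and its build directory if left unset, as the macro definitions above show.)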
+%if "%{KMP}" == "1" +%global kernel_release() $(make -s -C %{1} kernelrelease M=$PWD) +BuildRequires: %kernel_module_package_buildreqs +%(mkdir -p %{buildroot}) +%(echo '%defattr (-,root,root)' > %{buildroot}/file_list) +%(echo '/lib/modules/%2-%1' >> %{buildroot}/file_list) +%(echo '%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*-%1.conf' >> %{buildroot}/file_list) +%{kernel_module_package -f %{buildroot}/file_list -x xen -r %{_kmp_rel} } +%else +%global kernel_source() %{K_SRC} +%global kernel_release() %{KVERSION} +%global flavors_to_build default +%endif + +# +# setup module sign scripts if paths to the keys are given +# +%global WITH_MOD_SIGN %(if ( test -f "$MODULE_SIGN_PRIV_KEY" && test -f "$MODULE_SIGN_PUB_KEY" ); \ + then \ + echo -n '1'; \ + else \ + echo -n '0'; fi) + +%if "%{WITH_MOD_SIGN}" == "1" +# call module sign script +%global __modsign_install_post \ + %{_builddir}/%{name}-%{version}/source/tools/sign-modules %{buildroot}/lib/modules/ %{kernel_source default} || exit 1 \ +%{nil} + +%global __debug_package 1 +%global buildsubdir %{name}-%{version} +# Disgusting hack alert! We need to ensure we sign modules *after* all +# invocations of strip occur, which is in __debug_install_post if +# find-debuginfo.sh runs, and __os_install_post if not. +# +%global __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + %{__modsign_install_post} \ +%{nil} + +%endif # end of setup module sign scripts +# + +%if "%{_vendor}" == "suse" +%debug_package +%endif + +# set modules dir +%if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") +%if 0%{?fedora} +%global install_mod_dir updates/%{name} +%else +%global install_mod_dir extra/%{name} +%endif +%endif + +%if "%{_vendor}" == "suse" +%global install_mod_dir updates/%{name} +%endif + +%{!?install_mod_dir: %global install_mod_dir updates/%{name}} + +%prep +%setup +set -- * +mkdir source +mv "$@" source/ +mkdir obj + +%build +export EXTRA_CFLAGS='-DVERSION=\"%version\"' +export INSTALL_MOD_DIR=%{install_mod_dir} +export CONF_OPTIONS="%{configure_options}" +for flavor in %{flavors_to_build}; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + export LIB_MOD_DIR=/lib/modules/$KVER/$INSTALL_MOD_DIR + rm -rf obj/$flavor + cp -r source obj/$flavor + cd $PWD/obj/$flavor + make + cd - +done + +%install +export INSTALL_MOD_PATH=%{buildroot} +export INSTALL_MOD_DIR=%{install_mod_dir} +export PREFIX=%{_prefix} +for flavor in %flavors_to_build; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + cd $PWD/obj/$flavor + make install KERNELRELEASE=$KVER + # Cleanup unnecessary kernel-generated module dependency files. + find $INSTALL_MOD_PATH/lib/modules -iname 'modules.*' -exec rm {} \; + cd - +done + +# Set the module(s) to be executable, so that they will be stripped when packaged. 
+find %{buildroot} \( -type f -name '*.ko' -o -name '*ko.gz' \) -exec %{__chmod} u+x \{\} \; + +%{__install} -d %{buildroot}%{_sysconfdir}/depmod.d/ +for module in `find %{buildroot}/ -name '*.ko' -o -name '*.ko.gz' | sort` +do +ko_name=${module##*/} +mod_name=${ko_name/.ko*/} +mod_path=${module/*\/%{name}} +mod_path=${mod_path/\/${ko_name}} +%if "%{_vendor}" == "suse" + for flavor in %{flavors_to_build}; do + if [[ $module =~ $flavor ]] || [ "X%{KMP}" != "X1" ];then + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}-$flavor.conf + fi + done +%else + %if 0%{?fedora} + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %else + %if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") + echo "override ${mod_name} * weak-updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif + echo "override ${mod_name} * extra/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif +%endif +done + + +%clean +rm -rf %{buildroot} + +%post +if [ $1 -ge 1 ]; then # 1 : This package is being installed or reinstalled + /sbin/depmod %{KVERSION} +fi # 1 : closed +# add SRP_LOAD=no to openib.conf +if [ -f "/etc/infiniband/openib.conf" ] && ! (grep -q SRP_LOAD /etc/infiniband/openib.conf > /dev/null 2>&1) ; then + echo "# Load SRP module" >> /etc/infiniband/openib.conf + echo "SRP_LOAD=no" >> /etc/infiniband/openib.conf +fi +# END of post + +%postun +/sbin/depmod %{KVERSION} + +%if "%{KMP}" != "1" +%files +%defattr(-,root,root,-) +/lib/modules/%{KVERSION}/%{install_mod_dir}/ +%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*.conf +%endif + +%changelog +* Thu Feb 20 2014 Alaa Hleihel +- Initial packaging diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/tools/sign-modules b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/tools/sign-modules new file mode 100755 index 0000000..b790769 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srp/tools/sign-modules @@ -0,0 +1,58 @@ +#! /bin/bash + +moddir=$1; shift +KBUILD=$1; shift + +SOURCES_DIR= +case "$KBUILD" in + *linux-obj*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + */usr/src/linux-*-obj/*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + *) + SOURCES_DIR=$(readlink -f ${KBUILD/build/source}) + ;; +esac +if [ ! -e "$SOURCES_DIR" ]; then + SOURCES_DIR=$KBUILD +fi + +SIGN_FILE= +if [ -e "${KBUILD}/scripts/sign-file" ]; then + SIGN_FILE="${KBUILD}/scripts/sign-file" +elif [ -e "${SOURCES_DIR}/scripts/sign-file" ]; then + SIGN_FILE="${SOURCES_DIR}/scripts/sign-file" +else + echo "Error: Sign tool does not exist at '$KBUILD' or '$SOURCES_DIR' !" >&2 + exit 1 +fi +echo "Found Sign tool at: '${SIGN_FILE}'" + +if [ ! -e "${MODULE_SIGN_PRIV_KEY}" ]; then + echo "Error: MODULE_SIGN_PRIV_KEY is not set to valid path!" >&2 + exit 1 +fi +if [ ! -e "${MODULE_SIGN_PUB_KEY}" ]; then + echo "Error: MODULE_SIGN_PUB_KEY is not set to valid path!" 
>&2 + exit 1 +fi + +modules=`find $moddir -name '*.ko' -o -name '*.ko.gz'` +for mod in $modules +do + dir=`dirname $mod` + file=`basename $mod` + + ${SIGN_FILE} sha256 ${MODULE_SIGN_PRIV_KEY} ${MODULE_SIGN_PUB_KEY} ${dir}/${file} + rm -f ${dir}/${file}.{sig,dig} +done + +RANDOMMOD=$(find $moddir -type f -name '*.ko' -o -name '*.ko.gz' | sort -R | tail -n 1) +if [ "~Module signature appended~" != "$(tail -c 28 $RANDOMMOD)" ]; then + echo "*** Modules are unsigned! ***" + exit 1 +fi + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/Makefile new file mode 100644 index 0000000..3df52d7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_INFINIBAND_SRPT) += ib_srpt.o + +ib_srpt-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/main.c new file mode 100644 index 0000000..7c90852 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/srpt/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "ib_srpt" +#define DRV_VERSION "0.1" +#define DRV_RELDATE "27 Nov 2022" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("ib_srpt dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init ib_srpt_init(void) +{ + return 0; +} + +static void __exit ib_srpt_cleanup(void) +{ +} + +module_init(ib_srpt_init); +module_exit(ib_srpt_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/Makefile new file mode 100644 index 0000000..053e4ef --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_INFINIBAND_XSCORE) += xscore/ +obj-$(CONFIG_INFINIBAND_XSVNIC) += xsvnic/ +obj-$(CONFIG_INFINIBAND_XSVHBA) += xsvhba/ +obj-$(CONFIG_INFINIBAND_XVE) += xve/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/Makefile new file mode 100644 index 0000000..faa168a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INFINIBAND_XSCORE) := xscore.o +xscore-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/main.c new file mode 100644 index 0000000..29d7550 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xscore/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "xscore" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("xscore dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init xscore_init(void) +{ + return 0; +} + +static void __exit xscore_cleanup(void) +{ +} + +module_init(xscore_init); +module_exit(xscore_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/Makefile new file mode 100644 index 0000000..a41c77f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INFINIBAND_XSVHBA) := xsvhba.o +xsvhba-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/main.c new file mode 100644 index 0000000..76f4bcd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvhba/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "xsvhba" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("xsvhba dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init xsvhba_init(void) +{ + return 0; +} + +static void __exit xsvhba_cleanup(void) +{ +} + +module_init(xsvhba_init); +module_exit(xsvhba_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/Makefile new file mode 100644 index 0000000..0b742d0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INFINIBAND_XSVNIC) := xsvnic.o +xsvnic-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/main.c new file mode 100644 index 0000000..ad3a8bf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xsvnic/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "xsvnic" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("xsvnic dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init xsvnic_init(void) +{ + return 0; +} + +static void __exit xsvnic_cleanup(void) +{ +} + +module_init(xsvnic_init); +module_exit(xsvnic_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/Makefile new file mode 100644 index 0000000..cfa25ae --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INFINIBAND_XVE) := xve.o +xve-y := main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/main.c b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/main.c new file mode 100644 index 0000000..605c3ee --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/infiniband/ulp/xsigo/xve/main.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "xve" +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "August 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("xve dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init xve_init(void) +{ + return 0; +} + +static void __exit xve_cleanup(void) +{ +} + +module_init(xve_init); +module_exit(xve_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Kconfig new file mode 100644 index 0000000..b4f66eb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Kconfig @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Mellanox driver configuration +# + +config NET_VENDOR_MELLANOX + bool "Mellanox devices" + default y + depends on PCI || I2C + help + If you have a network (Ethernet or RDMA) device belonging to this + class, say Y. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about Mellanox cards. If you say Y, you will be asked + for your specific card in the following questions. + +if NET_VENDOR_MELLANOX + +source "drivers/net/ethernet/mellanox/mlx4/Kconfig" +source "drivers/net/ethernet/mellanox/mlx5/core/Kconfig" +source "drivers/net/ethernet/mellanox/mlxsw/Kconfig" +source "drivers/net/ethernet/mellanox/mlxfw/Kconfig" +source "drivers/net/ethernet/mellanox/mlxbf_gige/Kconfig" + +endif # NET_VENDOR_MELLANOX diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Makefile new file mode 100644 index 0000000..d4b5f54 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the Mellanox device drivers. +# + +obj-$(CONFIG_MLX4_CORE) += mlx4/ +obj-$(CONFIG_MLX5_CORE) += mlx5/core/ +obj-$(CONFIG_MLXSW_CORE) += mlxsw/ +obj-$(CONFIG_MLXFW) += mlxfw/ +obj-$(CONFIG_MLXBF_GIGE) += mlxbf_gige/ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Kconfig new file mode 100644 index 0000000..f658f83 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Mellanox driver configuration +# + +config MLX5_CORE + tristate "Mellanox 5th generation network adapters (ConnectX series) core driver" + depends on PCI + select AUXILIARY_BUS + select NET_DEVLINK + depends on VXLAN || !VXLAN + depends on MLXFW || !MLXFW + depends on PTP_1588_CLOCK_OPTIONAL + depends on PCI_HYPERV_INTERFACE || !PCI_HYPERV_INTERFACE + help + Core driver for low level functionality of the ConnectX-4 and + Connect-IB cards by Mellanox Technologies. + +config MLX5_ACCEL + bool + +config MLX5_FPGA + bool "Mellanox Technologies Innova support" + depends on MLX5_CORE + select MLX5_ACCEL + help + Build support for the Innova family of network cards by Mellanox + Technologies. Innova network cards are comprised of a ConnectX chip + and an FPGA chip on one board. If you select this option, the + mlx5_core driver will include the Innova FPGA core and allow building + sandbox-specific client drivers. 
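(Editorial aside, not part of the Kconfig file: as a hedged illustration of how the options in this file compose, a minimal configuration fragment for the mlx5 Ethernet path is shown below; prerequisite kernel options such as NET_SWITCHDEV and RFS_ACCEL still have to be satisfied separately, and the exact selection depends on the rest of the kernel configuration.

    CONFIG_NET_VENDOR_MELLANOX=y
    CONFIG_MLX5_CORE=m
    CONFIG_MLX5_CORE_EN=y
    CONFIG_MLX5_EN_ARFS=y
    CONFIG_MLX5_EN_RXNFC=y
    CONFIG_MLX5_ESWITCH=y
    CONFIG_MLX5_CORE_EN_DCB=y

Here the core driver is built as a module while the boolean Ethernet features are compiled into it, which mirrors the tristate/bool split of the entries in this file.)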
+ +config MLX5_CORE_EN + bool "Mellanox 5th generation network adapters (ConnectX series) Ethernet support" + depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE + select PAGE_POOL + select DIMLIB + help + Ethernet support in Mellanox Technologies ConnectX-4 NIC. + +config MLX5_EN_ARFS + bool "Mellanox MLX5 ethernet accelerated receive flow steering (ARFS) support" + depends on MLX5_CORE_EN && RFS_ACCEL + default y + help + Mellanox MLX5 ethernet hardware-accelerated receive flow steering support, + Enables ethernet netdevice arfs support and ntuple filtering. + +config MLX5_EN_RXNFC + bool "Mellanox MLX5 ethernet rx nfc flow steering support" + depends on MLX5_CORE_EN + default y + help + Mellanox MLX5 ethernet rx nfc flow steering support + Enables ethtool receive network flow classification, which allows user defined + flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc + API. + +config MLX5_MPFS + bool "Mellanox Technologies MLX5 MPFS support" + depends on MLX5_CORE_EN + default y + help + Mellanox Technologies Ethernet Multi-Physical Function Switch (MPFS) + support in ConnectX NIC. MPFs is required for when multi-PF configuration + is enabled to allow passing user configured unicast MAC addresses to the + requesting PF. + +config MLX5_ESWITCH + bool "Mellanox Technologies MLX5 SRIOV E-Switch support" + depends on MLX5_CORE_EN && NET_SWITCHDEV + default y + help + Mellanox Technologies Ethernet SRIOV E-Switch support in ConnectX NIC. + E-Switch provides internal SRIOV packet steering and switching for the + enabled VFs and PF in two available modes: + Legacy SRIOV mode (L2 mac vlan steering based). + Switchdev mode (eswitch offloads). + +config MLX5_BRIDGE + bool + depends on MLX5_ESWITCH && BRIDGE + default y + help + mlx5 ConnectX offloads support for Ethernet Bridging (BRIDGE). + Enable adding representors of mlx5 uplink and VF ports to Bridge and + offloading rules for traffic between such ports. Supports VLANs (trunk and + access modes). + +config MLX5_CLS_ACT + bool "MLX5 TC classifier action support" + depends on MLX5_ESWITCH && NET_CLS_ACT + default y + help + mlx5 ConnectX offloads support for TC classifier action (NET_CLS_ACT), + works in both native NIC mode and Switchdev SRIOV mode. + Actions get attached to a Hardware offloaded classifiers and are + invoked after a successful classification. Actions are used to + overwrite the classification result, instantly drop or redirect and/or + reformat packets in wire speeds without involving the host cpu. + + If set to N, TC offloads in both NIC and switchdev modes will be disabled. + If unsure, set to Y + +config MLX5_TC_CT + bool "MLX5 TC connection tracking offload support" + depends on MLX5_CLS_ACT && NF_FLOW_TABLE && NET_ACT_CT && NET_TC_SKB_EXT + default y + help + Say Y here if you want to support offloading connection tracking rules + via tc ct action. + + If unsure, set to Y + +config MLX5_TC_SAMPLE + bool "MLX5 TC sample offload support" + depends on MLX5_CLS_ACT + depends on PSAMPLE=y || PSAMPLE=n || MLX5_CORE=m + default y + help + Say Y here if you want to support offloading sample rules via tc + sample action. + If set to N, will not be able to configure tc rules with sample + action. + + If unsure, set to Y + +config MLX5_CORE_EN_DCB + bool "Data Center Bridging (DCB) Support" + default y + depends on MLX5_CORE_EN && DCB + help + Say Y here if you want to use Data Center Bridging (DCB) in the + driver. + If set to N, will not be able to configure QoS and ratelimit attributes. 
+ This flag is depended on the kernel's DCB support. + + If unsure, set to Y + +config MLX5_CORE_IPOIB + bool "Mellanox 5th generation network adapters (connectX series) IPoIB offloads support" + depends on MLX5_CORE_EN + help + MLX5 IPoIB offloads & acceleration support. + +config MLX5_FPGA_IPSEC + bool "Mellanox Technologies IPsec Innova support" + depends on MLX5_CORE + depends on MLX5_FPGA + help + Build IPsec support for the Innova family of network cards by Mellanox + Technologies. Innova network cards are comprised of a ConnectX chip + and an FPGA chip on one board. If you select this option, the + mlx5_core driver will include the Innova FPGA core and allow building + sandbox-specific client drivers. + +config MLX5_IPSEC + bool "Mellanox Technologies IPsec Connect-X support" + depends on MLX5_CORE_EN + depends on XFRM_OFFLOAD + depends on INET_ESP_OFFLOAD || INET6_ESP_OFFLOAD + select MLX5_ACCEL + help + Build IPsec support for the Connect-X family of network cards by Mellanox + Technologies. + Note: If you select this option, the mlx5_core driver will include + IPsec support for the Connect-X family. + +config MLX5_EN_IPSEC + bool "IPSec XFRM cryptography-offload acceleration" + depends on MLX5_CORE_EN + depends on XFRM_OFFLOAD + depends on INET_ESP_OFFLOAD || INET6_ESP_OFFLOAD + depends on MLX5_FPGA_IPSEC || MLX5_IPSEC + help + Build support for IPsec cryptography-offload acceleration in the NIC. + Note: Support for hardware with this capability needs to be selected + for this option to become available. + +config MLX5_EN_MACSEC + bool "Connect-X support for MACSec offload" + depends on MLX5_CORE_EN + depends on MACSEC + default n + help + Build support for MACsec cryptography-offload acceleration in the NIC. + +config MLX5_FPGA_TLS + bool "Mellanox Technologies TLS Innova support" + depends on TLS_DEVICE + depends on TLS=y || MLX5_CORE=m + depends on MLX5_CORE_EN + depends on MLX5_FPGA + select MLX5_EN_TLS + help + Build TLS support for the Innova family of network cards by Mellanox + Technologies. Innova network cards are comprised of a ConnectX chip + and an FPGA chip on one board. If you select this option, the + mlx5_core driver will include the Innova FPGA core and allow building + sandbox-specific client drivers. + +config MLX5_TLS + bool "Mellanox Technologies TLS Connect-X support" + depends on TLS_DEVICE + depends on TLS=y || MLX5_CORE=m + depends on MLX5_CORE_EN + select MLX5_ACCEL + select MLX5_EN_TLS + help + Build TLS support for the Connect-X family of network cards by Mellanox + Technologies. + +config MLX5_EN_TLS + bool + help + Build support for TLS cryptography-offload acceleration in the NIC. + Note: Support for hardware with this capability needs to be selected + for this option to become available. + +config MLX5_SW_STEERING + bool "Mellanox Technologies software-managed steering" + depends on MLX5_CORE_EN && MLX5_ESWITCH + select CRC32 + default y + help + Build support for software-managed steering in the NIC. + +config MLX5_SF + bool "Mellanox Technologies subfunction device support using auxiliary device" + depends on MLX5_CORE && MLX5_CORE_EN + help + Build support for subfuction device in the NIC. A Mellanox subfunction + device can support RDMA, netdevice and vdpa device. + It is similar to a SRIOV VF but it doesn't require SRIOV support. + +config MLX5_SF_MANAGER + bool + depends on MLX5_SF && MLX5_ESWITCH + default y + help + Build support for subfuction port in the NIC. A Mellanox subfunction + port is managed through devlink. 
A subfunction supports RDMA, netdevice + and vdpa device. It is similar to a SRIOV VF but it doesn't require + SRIOV support. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Makefile new file mode 100644 index 0000000..38352aa --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Mellanox 5th generation network adapters +# (ConnectX series) core & netdev driver +# + +subdir-ccflags-y += -I$(src) + +obj-$(CONFIG_MLX5_CORE) += mlx5_core.o + +# +# mlx5 core basic +# +mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ + health.o mcg.o cq.o alloc.o port.o mr.o pd.o \ + transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \ + fs_counters.o fs_ft_pool.o rl.o lag/lag.o lag/debugfs.o dev.o events.o wq.o lib/gid.o \ + lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ + diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \ + fw_reset.o qos.o mst_dump.o en_diag.o sriov_sysfs.o crdump.o\ + diag/diag_cnt.o params.o fw_exp.o lib/tout.o eswitch_devlink_compat.o \ + ecpf.o lib/aso.o + +# +# Netdev basic +# +mlx5_core-$(CONFIG_MLX5_CORE_EN) += en/rqt.o en/tir.o en/rss.o en/rx_res.o \ + en/channels.o en_main.o en_common.o en_fs.o en_ethtool.o \ + en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o en_sysfs.o en_ecn.o\ + en_selftest.o en/port.o en/monitor_stats.o en/health.o \ + en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \ + en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \ + en/qos.o en/trap.o en/fs_tt_redirect.o en_debugfs.o en/aso.o \ + en/tc/meter.o + +# +# Netdev extra +# +mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o +mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o +mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o +mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o +mlx5_core-$(CONFIG_MLX5_ESWITCH) += lag/mp.o lag/port_sel.o lib/geneve.o lib/port_tun.o \ + en_rep.o en/rep/bond.o en/mod_hdr.o \ + en/mapping.o en/rep/meter.o en/rep/sysfs.o +mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en_tc.o en/rep/tc.o en/rep/neigh.o \ + lib/fs_chains.o en/tc_tun.o \ + esw/indir_table.o en/tc_tun_encap.o \ + en/tc_tun_vxlan.o en/tc_tun_gre.o en/tc_tun_geneve.o \ + en/tc_tun_mplsoudp.o diag/en_tc_tracepoint.o \ + en/tc/post_act.o en/tc/int_port.o \ + en/tc/post_meter.o + +mlx5_core-$(CONFIG_MLX5_CLS_ACT) += en/tc/act/act.o en/tc/act/drop.o en/tc/act/trap.o \ + en/tc/act/accept.o en/tc/act/mark.o en/tc/act/goto.o \ + en/tc/act/tun.o en/tc/act/csum.o en/tc/act/pedit.o \ + en/tc/act/vlan.o en/tc/act/vlan_mangle.o en/tc/act/mpls.o \ + en/tc/act/mirred.o en/tc/act/mirred_nic.o \ + en/tc/act/ct.o en/tc/act/sample.o en/tc/act/ptype.o \ + en/tc/act/redirect_ingress.o en/tc/act/prio.o en/tc/act/police.o + +mlx5_core-$(CONFIG_MLX5_TC_CT) += en/tc_ct.o en/tc/ct_fs_dmfs.o en/tc/ct_fs_smfs.o +mlx5_core-$(CONFIG_MLX5_TC_SAMPLE) += en/tc/sample.o + +# +# Core extra +# +mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o eswitch_offloads_termtbl.o \ + ecpf.o rdma.o esw/legacy.o esw/vf_meter.o \ + esw/debugfs.o esw/devlink_port.o esw/vporttbl.o esw/qos.o \ + esw/pet_offloads.o + +mlx5_core-$(CONFIG_MLX5_ESWITCH) += esw/acl/helper.o \ + esw/acl/egress_lgcy.o esw/acl/egress_ofld.o \ + esw/acl/ingress_lgcy.o esw/acl/ingress_ofld.o + +mlx5_core-$(CONFIG_MLX5_BRIDGE) += esw/bridge.o en/rep/bridge.o + 
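+# The conditional lines in this Makefile follow the usual kbuild
+# composite-object pattern: "mlx5_core-$(CONFIG_FOO) += bar.o" expands to
+# "mlx5_core-y += bar.o" when CONFIG_FOO=y, so bar.o is linked into the single
+# mlx5_core.ko; when CONFIG_FOO is not set it expands to the unused
+# "mlx5_core- += bar.o" and the object is simply not built. A minimal sketch
+# (CONFIG_MLX5_EXAMPLE and example.o are made up purely for illustration and
+# are not part of this driver):
+#
+#   mlx5_core-$(CONFIG_MLX5_EXAMPLE) += example.o  # =y    -> part of mlx5_core.ko
+#                                                  # unset -> object skipped
+#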
+mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o +mlx5_core-$(CONFIG_VXLAN) += lib/vxlan.o +mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o +mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o +mlx5_core-$(CONFIG_MLXDEVM) += mlx5_devm.o esw/devm_port.o + +# +# Ipoib netdev +# +mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib_vlan.o + +# +# Accelerations & FPGA +# +mlx5_core-$(CONFIG_MLX5_IPSEC) += accel/ipsec_offload.o +mlx5_core-$(CONFIG_MLX5_FPGA_IPSEC) += fpga/ipsec.o +mlx5_core-$(CONFIG_MLX5_FPGA_TLS) += fpga/tls.o +mlx5_core-$(CONFIG_MLX5_ACCEL) += lib/crypto.o accel/tls.o accel/ipsec.o + +mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o + +mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \ + en_accel/ipsec_stats.o en_accel/ipsec_fs.o esw/ipsec.o \ + en/ipsec_aso.o + +mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/tls.o en_accel/tls_rxtx.o en_accel/tls_stats.o \ + en_accel/fs_tcp.o en_accel/ktls.o en_accel/ktls_txrx.o \ + en_accel/ktls_tx.o en_accel/ktls_rx.o + +mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o \ + steering/dr_matcher.o steering/dr_rule.o \ + steering/dr_icm_pool.o steering/dr_buddy.o \ + steering/dr_ste.o steering/dr_send.o \ + steering/dr_ste_v0.o steering/dr_ste_v1.o \ + steering/dr_ste_v2.o \ + steering/dr_cmd.o steering/dr_fw.o \ + steering/dr_action.o steering/fs_dr.o \ + steering/dr_dbg.o lib/smfs.o +# +# SF device +# +mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_affinity.o + +# +# SF manager +# +mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o + +# +## SF cfg driver basic +# +mlx5_core-$(CONFIG_MLX5_SF_CFG) += sf/dev/cfg_driver.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/accel.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/accel.h new file mode 100644 index 0000000..82b1851 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/accel.h @@ -0,0 +1,36 @@ +#ifndef __MLX5E_ACCEL_H__ +#define __MLX5E_ACCEL_H__ + +#ifdef CONFIG_MLX5_ACCEL + +#include +#include + +static inline bool is_metadata_hdr_valid(struct sk_buff *skb) +{ + __be16 *ethtype; + + if (unlikely(skb->len < ETH_HLEN + MLX5E_METADATA_ETHER_LEN)) + return false; + ethtype = (__be16 *)(skb->data + ETH_ALEN * 2); + if (*ethtype != cpu_to_be16(MLX5E_METADATA_ETHER_TYPE)) + return false; + return true; +} + +static inline void remove_metadata_hdr(struct sk_buff *skb) +{ + struct ethhdr *old_eth; + struct ethhdr *new_eth; + + /* Remove the metadata from the buffer */ + old_eth = (struct ethhdr *)skb->data; + new_eth = (struct ethhdr *)(skb->data + MLX5E_METADATA_ETHER_LEN); + memmove(new_eth, old_eth, 2 * ETH_ALEN); + /* Ethertype is already in its new place */ + skb_pull_inline(skb, MLX5E_METADATA_ETHER_LEN); +} + +#endif /* CONFIG_MLX5_ACCEL */ + +#endif /* __MLX5E_EN_ACCEL_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c new file mode 100644 index 0000000..4a1bc7a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include + +#include "accel/ipsec.h" +#include "mlx5_core.h" +#include "fpga/ipsec.h" +#include "accel/ipsec_offload.h" + +void mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops; + int err = 0; + + ipsec_ops = (mlx5_ipsec_offload_ops(mdev)) ? + mlx5_ipsec_offload_ops(mdev) : + mlx5_fpga_ipsec_ops(mdev); + + if (!ipsec_ops || !ipsec_ops->init) { + mlx5_core_dbg(mdev, "IPsec ops is not supported\n"); + return; + } + + err = ipsec_ops->init(mdev); + if (err) { + mlx5_core_warn_once(mdev, "Failed to start IPsec device, err = %d\n", err); + return; + } + + mdev->ipsec_ops = ipsec_ops; +} + +void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = mdev->ipsec_ops; + + if (!ipsec_ops || !ipsec_ops->cleanup) + return; + + ipsec_ops->cleanup(mdev); +} + +u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = mdev->ipsec_ops; + + if (!ipsec_ops || !ipsec_ops->device_caps) + return 0; + + return ipsec_ops->device_caps(mdev); +} +EXPORT_SYMBOL_GPL(mlx5_accel_ipsec_device_caps); + +unsigned int mlx5_accel_ipsec_counters_count(struct mlx5_core_dev *mdev) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = mdev->ipsec_ops; + + if (!ipsec_ops || !ipsec_ops->counters_count) + return -EOPNOTSUPP; + + return ipsec_ops->counters_count(mdev); +} + +int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, + unsigned int count) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = mdev->ipsec_ops; + + if (!ipsec_ops || !ipsec_ops->counters_read) + return -EOPNOTSUPP; + + return ipsec_ops->counters_read(mdev, counters, count); +} + +void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *xfrm, + u32 pdn, u32 *sa_handle) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = mdev->ipsec_ops; + __be32 saddr[4] = {}, daddr[4] = {}; + + if (!ipsec_ops || !ipsec_ops->create_hw_context) + return ERR_PTR(-EOPNOTSUPP); + + if (!xfrm->attrs.is_ipv6) { + saddr[3] = xfrm->attrs.saddr.a4; + daddr[3] = xfrm->attrs.daddr.a4; + } else { + memcpy(saddr, xfrm->attrs.saddr.a6, sizeof(saddr)); + memcpy(daddr, xfrm->attrs.daddr.a6, sizeof(daddr)); + } + + return 
ipsec_ops->create_hw_context(mdev, xfrm, saddr, daddr, xfrm->attrs.spi, + xfrm->attrs.is_ipv6, pdn, sa_handle); +} + +void mlx5_accel_esp_free_hw_context(struct mlx5_core_dev *mdev, void *context) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = mdev->ipsec_ops; + + if (!ipsec_ops || !ipsec_ops->free_hw_context) + return; + + ipsec_ops->free_hw_context(context); +} + +struct mlx5_accel_esp_xfrm * +mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = mdev->ipsec_ops; + struct mlx5_accel_esp_xfrm *xfrm; + + if (!ipsec_ops || !ipsec_ops->esp_create_xfrm) + return ERR_PTR(-EOPNOTSUPP); + + xfrm = ipsec_ops->esp_create_xfrm(mdev, attrs, flags); + if (IS_ERR(xfrm)) + return xfrm; + + xfrm->mdev = mdev; + return xfrm; +} +EXPORT_SYMBOL_GPL(mlx5_accel_esp_create_xfrm); + +void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = xfrm->mdev->ipsec_ops; + + if (!ipsec_ops || !ipsec_ops->esp_destroy_xfrm) + return; + + ipsec_ops->esp_destroy_xfrm(xfrm); +} +EXPORT_SYMBOL_GPL(mlx5_accel_esp_destroy_xfrm); + +int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + const struct mlx5_accel_ipsec_ops *ipsec_ops = xfrm->mdev->ipsec_ops; + + if (!ipsec_ops || !ipsec_ops->esp_modify_xfrm) + return -EOPNOTSUPP; + + return ipsec_ops->esp_modify_xfrm(xfrm, attrs); +} +EXPORT_SYMBOL_GPL(mlx5_accel_esp_modify_xfrm); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h new file mode 100644 index 0000000..102c68c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __MLX5_ACCEL_IPSEC_H__ +#define __MLX5_ACCEL_IPSEC_H__ + +#include +#include + +#ifdef CONFIG_MLX5_ACCEL + +#define MLX5_IPSEC_DEV(mdev) (mlx5_accel_ipsec_device_caps(mdev) & \ + MLX5_ACCEL_IPSEC_CAP_DEVICE) + +unsigned int mlx5_accel_ipsec_counters_count(struct mlx5_core_dev *mdev); +int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, + unsigned int count); + +void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *xfrm, + u32 pdn, u32 *sa_handle); +void mlx5_accel_esp_free_hw_context(struct mlx5_core_dev *mdev, void *context); + +void mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev); +void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev); + +struct mlx5_accel_ipsec_ops { + u32 (*device_caps)(struct mlx5_core_dev *mdev); + unsigned int (*counters_count)(struct mlx5_core_dev *mdev); + int (*counters_read)(struct mlx5_core_dev *mdev, u64 *counters, unsigned int count); + void* (*create_hw_context)(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *xfrm, + const __be32 saddr[4], const __be32 daddr[4], + const __be32 spi, bool is_ipv6, u32 pdn, u32 *sa_handle); + void (*free_hw_context)(void *context); + int (*init)(struct mlx5_core_dev *mdev); + void (*cleanup)(struct mlx5_core_dev *mdev); + struct mlx5_accel_esp_xfrm* (*esp_create_xfrm)(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags); + int (*esp_modify_xfrm)(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs); + void (*esp_destroy_xfrm)(struct mlx5_accel_esp_xfrm *xfrm); +}; + +#else + +#define MLX5_IPSEC_DEV(mdev) false + +static inline void * +mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *xfrm, + u32 pdn, u32 *sa_handle) +{ + return NULL; +} + +static inline void mlx5_accel_esp_free_hw_context(struct mlx5_core_dev *mdev, void *context) {} + +static inline void mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev) {} + +static inline void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev) {} + +#endif /* CONFIG_MLX5_ACCEL */ + +#endif /* __MLX5_ACCEL_IPSEC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.c new file mode 100644 index 0000000..ab026d5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "ipsec_offload.h" +#include "lib/mlx5.h" +#include "en_accel/ipsec_fs.h" + +#define MLX5_IPSEC_DEV_BASIC_CAPS (MLX5_ACCEL_IPSEC_CAP_DEVICE | MLX5_ACCEL_IPSEC_CAP_IPV6 | \ + MLX5_ACCEL_IPSEC_CAP_LSO) + +struct mlx5_ipsec_sa_ctx { + struct rhash_head hash; + u32 enc_key_id; + u32 ipsec_obj_id; + __u64 soft_packet_limit; + __u64 hard_packet_limit; + + /* hw ctx */ + struct mlx5_core_dev *dev; + struct mlx5_ipsec_esp_xfrm *mxfrm; +}; + +struct mlx5_ipsec_esp_xfrm { + /* reference counter of SA ctx */ + struct mlx5_ipsec_sa_ctx *sa_ctx; + struct mutex lock; /* protects mlx5_ipsec_esp_xfrm */ + struct mlx5_accel_esp_xfrm accel_xfrm; +}; + +static u32 mlx5_ipsec_offload_device_caps(struct mlx5_core_dev *mdev) +{ + u32 caps = MLX5_IPSEC_DEV_BASIC_CAPS; + + if (!mlx5_is_ipsec_device(mdev)) + return 0; + + if (!MLX5_CAP_FLOWTABLE_NIC_TX(mdev, ipsec_encrypt) || + !MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ipsec_decrypt)) + return 0; + + if (MLX5_CAP_IPSEC(mdev, ipsec_crypto_esp_aes_gcm_128_encrypt) && + MLX5_CAP_IPSEC(mdev, ipsec_crypto_esp_aes_gcm_128_decrypt)) + caps |= MLX5_ACCEL_IPSEC_CAP_ESP; + + if (MLX5_CAP_IPSEC(mdev, ipsec_esn)) { + caps |= MLX5_ACCEL_IPSEC_CAP_ESN; + caps |= MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN; + } + + if (MLX5_CAP_IPSEC(mdev, ipsec_full_offload)) + caps |= MLX5_ACCEL_IPSEC_CAP_FULL_OFFLOAD; + + /* We can accommodate up to 2^24 different IPsec objects + * because we use up to 24 bit in flow table metadata + * to hold the IPsec Object unique handle. + */ + WARN_ON_ONCE(MLX5_CAP_IPSEC(mdev, log_max_ipsec_offload) > 24); + return caps; +} + +static int +mlx5_ipsec_offload_esp_validate_xfrm_attrs(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + if (attrs->replay_type != MLX5_ACCEL_ESP_REPLAY_NONE) { + mlx5_core_err(mdev, "Cannot offload xfrm states with anti replay (replay_type = %d)\n", + attrs->replay_type); + return -EOPNOTSUPP; + } + + if (attrs->keymat_type != MLX5_ACCEL_ESP_KEYMAT_AES_GCM) { + mlx5_core_err(mdev, "Only aes gcm keymat is supported (keymat_type = %d)\n", + attrs->keymat_type); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.iv_algo != + MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ) { + mlx5_core_err(mdev, "Only iv sequence algo is supported (iv_algo = %d)\n", + attrs->keymat.aes_gcm.iv_algo); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.key_len != 128 && + attrs->keymat.aes_gcm.key_len != 256) { + mlx5_core_err(mdev, "Cannot offload xfrm states with key length other than 128/256 bit (key length = %d)\n", + attrs->keymat.aes_gcm.key_len); + return -EOPNOTSUPP; + } + + if ((attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) && + !MLX5_CAP_IPSEC(mdev, ipsec_esn)) { + mlx5_core_err(mdev, "Cannot offload xfrm states with ESN triggered\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static struct mlx5_accel_esp_xfrm * +mlx5_ipsec_offload_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) +{ + struct mlx5_ipsec_esp_xfrm *mxfrm; + int err = 0; + + err = mlx5_ipsec_offload_esp_validate_xfrm_attrs(mdev, attrs); + if (err) + return ERR_PTR(err); + + mxfrm = kzalloc(sizeof(*mxfrm), GFP_KERNEL); + if (!mxfrm) + return ERR_PTR(-ENOMEM); + + mutex_init(&mxfrm->lock); + memcpy(&mxfrm->accel_xfrm.attrs, attrs, + sizeof(mxfrm->accel_xfrm.attrs)); + + return &mxfrm->accel_xfrm; +} + +static void mlx5_ipsec_offload_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) +{ + struct mlx5_ipsec_esp_xfrm *mxfrm = container_of(xfrm, struct mlx5_ipsec_esp_xfrm, 
+ accel_xfrm); + + /* assuming no sa_ctx are connected to this xfrm_ctx */ + WARN_ON(mxfrm->sa_ctx); + kfree(mxfrm); +} + +struct mlx5_ipsec_obj_attrs { + const struct aes_gcm_keymat *aes_gcm; + u32 accel_flags; + u32 esn_msb; + u32 enc_key_id; + bool is_tx; + __u64 soft_packet_limit; + __u64 hard_packet_limit; + __u32 replay_window; +}; + +static int mlx5_create_ipsec_obj(struct mlx5_core_dev *mdev, + struct mlx5_ipsec_obj_attrs *attrs, + u32 pdn, u32 *ipsec_id) +{ + const struct aes_gcm_keymat *aes_gcm = attrs->aes_gcm; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + u32 in[MLX5_ST_SZ_DW(create_ipsec_obj_in)] = {}; + void *obj, *salt_p, *salt_iv_p, *aso_ctx; + int err; + + obj = MLX5_ADDR_OF(create_ipsec_obj_in, in, ipsec_object); + aso_ctx = MLX5_ADDR_OF(ipsec_obj, obj, ipsec_aso); + + /* salt and seq_iv */ + salt_p = MLX5_ADDR_OF(ipsec_obj, obj, salt); + memcpy(salt_p, &aes_gcm->salt, sizeof(aes_gcm->salt)); + + switch (aes_gcm->icv_len) { + case 64: + MLX5_SET(ipsec_obj, obj, icv_length, + MLX5_IPSEC_OBJECT_ICV_LEN_8B); + break; + case 96: + MLX5_SET(ipsec_obj, obj, icv_length, + MLX5_IPSEC_OBJECT_ICV_LEN_12B); + break; + case 128: + MLX5_SET(ipsec_obj, obj, icv_length, + MLX5_IPSEC_OBJECT_ICV_LEN_16B); + break; + default: + return -EINVAL; + } + salt_iv_p = MLX5_ADDR_OF(ipsec_obj, obj, implicit_iv); + memcpy(salt_iv_p, &aes_gcm->seq_iv, sizeof(aes_gcm->seq_iv)); + + /* esn */ + if (attrs->accel_flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) { + MLX5_SET(ipsec_obj, obj, esn_en, 1); + MLX5_SET(ipsec_obj, obj, esn_msb, attrs->esn_msb); + if (attrs->accel_flags & MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) + MLX5_SET(ipsec_obj, obj, esn_overlap, 1); + + if (attrs->accel_flags & MLX5_ACCEL_ESP_FLAGS_FULL_OFFLOAD) { + MLX5_SET(ipsec_aso, aso_ctx, esn_event_arm, 1); + + if (!attrs->is_tx) { + u8 window_sz; + + switch (attrs->replay_window) { + case 256: + window_sz = MLX5_IPSEC_ASO_REPLAY_WIN_256_BIT; + break; + case 128: + window_sz = MLX5_IPSEC_ASO_REPLAY_WIN_128_BIT; + break; + case 64: + window_sz = MLX5_IPSEC_ASO_REPLAY_WIN_64_BIT; + break; + case 32: + window_sz = MLX5_IPSEC_ASO_REPLAY_WIN_32_BIT; + break; + default: + return -EINVAL; + } + + MLX5_SET(ipsec_aso, aso_ctx, window_sz, window_sz); + MLX5_SET(ipsec_aso, aso_ctx, mode, MLX5_IPSEC_ASO_REPLAY_PROTECTION); + } + } + } + + MLX5_SET(ipsec_obj, obj, dekn, attrs->enc_key_id); + + /* general object fields set */ + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_IPSEC); + + /* ASO context */ + if (attrs->accel_flags & MLX5_ACCEL_ESP_FLAGS_FULL_OFFLOAD) { + MLX5_SET(ipsec_obj, obj, full_offload, 1); + MLX5_SET(ipsec_obj, obj, ipsec_aso_access_pd, pdn); + MLX5_SET(ipsec_aso, aso_ctx, valid, 1); + MLX5_SET(ipsec_obj, obj, aso_return_reg, MLX5_IPSEC_ASO_REG_C_4_5); + if (attrs->is_tx) + MLX5_SET(ipsec_aso, aso_ctx, mode, MLX5_IPSEC_ASO_INC_SN); + + /* hard and soft packet limit */ + if (attrs->soft_packet_limit != IPSEC_NO_LIMIT) { + MLX5_SET(ipsec_aso, aso_ctx, remove_flow_soft_lft, (u32)attrs->soft_packet_limit); + MLX5_SET(ipsec_aso, aso_ctx, soft_lft_arm, 1); + MLX5_SET(ipsec_aso, aso_ctx, remove_flow_enable, 1); + } + + if (attrs->hard_packet_limit != IPSEC_NO_LIMIT) { + MLX5_SET(ipsec_aso, aso_ctx, remove_flow_pkt_cnt, (u32)attrs->hard_packet_limit); + MLX5_SET(ipsec_aso, aso_ctx, hard_lft_arm, 1); + MLX5_SET(ipsec_aso, aso_ctx, remove_flow_enable, 1); + } + } + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, 
sizeof(out)); + if (!err) + *ipsec_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + return err; +} + +static void mlx5_destroy_ipsec_obj(struct mlx5_core_dev *mdev, u32 ipsec_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_IPSEC); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, ipsec_id); + + mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static void *mlx5_ipsec_offload_create_sa_ctx(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *accel_xfrm, + const __be32 saddr[4], const __be32 daddr[4], + const __be32 spi, bool is_ipv6, u32 pdn, + u32 *hw_handle) +{ + struct mlx5_accel_esp_xfrm_attrs *xfrm_attrs = &accel_xfrm->attrs; + struct aes_gcm_keymat *aes_gcm = &xfrm_attrs->keymat.aes_gcm; + struct mlx5_ipsec_obj_attrs ipsec_attrs = {}; + struct mlx5_ipsec_esp_xfrm *mxfrm; + struct mlx5_ipsec_sa_ctx *sa_ctx; + int err; + + /* alloc SA context */ + sa_ctx = kzalloc(sizeof(*sa_ctx), GFP_KERNEL); + if (!sa_ctx) + return ERR_PTR(-ENOMEM); + + sa_ctx->dev = mdev; + + mxfrm = container_of(accel_xfrm, struct mlx5_ipsec_esp_xfrm, accel_xfrm); + mutex_lock(&mxfrm->lock); + sa_ctx->mxfrm = mxfrm; + + /* key */ + err = mlx5_create_encryption_key(mdev, aes_gcm->aes_key, + aes_gcm->key_len / BITS_PER_BYTE, + MLX5_ACCEL_OBJ_IPSEC_KEY, + &sa_ctx->enc_key_id); + if (err) { + mlx5_core_dbg(mdev, "Failed to create encryption key (err = %d)\n", err); + goto err_sa_ctx; + } + + ipsec_attrs.aes_gcm = aes_gcm; + ipsec_attrs.accel_flags = accel_xfrm->attrs.flags; + ipsec_attrs.esn_msb = accel_xfrm->attrs.esn; + ipsec_attrs.enc_key_id = sa_ctx->enc_key_id; + ipsec_attrs.is_tx = accel_xfrm->attrs.action & MLX5_ACCEL_ESP_ACTION_ENCRYPT; + ipsec_attrs.soft_packet_limit = accel_xfrm->attrs.soft_packet_limit; + ipsec_attrs.hard_packet_limit = accel_xfrm->attrs.hard_packet_limit; + ipsec_attrs.replay_window = accel_xfrm->attrs.replay_window; + + err = mlx5_create_ipsec_obj(mdev, &ipsec_attrs, + pdn, &sa_ctx->ipsec_obj_id); + if (err) { + mlx5_core_dbg(mdev, "Failed to create IPsec object (err = %d)\n", err); + goto err_enc_key; + } + + *hw_handle = sa_ctx->ipsec_obj_id; + mxfrm->sa_ctx = sa_ctx; + mutex_unlock(&mxfrm->lock); + + return sa_ctx; + +err_enc_key: + mlx5_destroy_encryption_key(mdev, sa_ctx->enc_key_id); +err_sa_ctx: + mutex_unlock(&mxfrm->lock); + kfree(sa_ctx); + return ERR_PTR(err); +} + +static void mlx5_ipsec_offload_delete_sa_ctx(void *context) +{ + struct mlx5_ipsec_sa_ctx *sa_ctx = (struct mlx5_ipsec_sa_ctx *)context; + struct mlx5_ipsec_esp_xfrm *mxfrm = sa_ctx->mxfrm; + + mutex_lock(&mxfrm->lock); + mlx5_destroy_ipsec_obj(sa_ctx->dev, sa_ctx->ipsec_obj_id); + mlx5_destroy_encryption_key(sa_ctx->dev, sa_ctx->enc_key_id); + kfree(sa_ctx); + mxfrm->sa_ctx = NULL; + mutex_unlock(&mxfrm->lock); +} + +static int mlx5_ipsec_offload_init(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static int mlx5_modify_ipsec_obj(struct mlx5_core_dev *mdev, + struct mlx5_ipsec_obj_attrs *attrs, + u32 ipsec_id) +{ + u32 in[MLX5_ST_SZ_DW(modify_ipsec_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(query_ipsec_obj_out)]; + u64 modify_field_select = 0; + u64 general_obj_types; + void *obj; + int err; + + if (!(attrs->accel_flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED)) + return 0; + + general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types); + if (!(general_obj_types & 
MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC)) + return -EINVAL; + + /* general object fields set */ + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_GENERAL_OBJECT_TYPES_IPSEC); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, ipsec_id); + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) { + mlx5_core_err(mdev, "Query IPsec object failed (Object id %d), err = %d\n", + ipsec_id, err); + return err; + } + + obj = MLX5_ADDR_OF(query_ipsec_obj_out, out, ipsec_object); + modify_field_select = MLX5_GET64(ipsec_obj, obj, modify_field_select); + + /* esn */ + if (!(modify_field_select & MLX5_MODIFY_IPSEC_BITMASK_ESN_OVERLAP) || + !(modify_field_select & MLX5_MODIFY_IPSEC_BITMASK_ESN_MSB)) + return -EOPNOTSUPP; + + obj = MLX5_ADDR_OF(modify_ipsec_obj_in, in, ipsec_object); + MLX5_SET64(ipsec_obj, obj, modify_field_select, + MLX5_MODIFY_IPSEC_BITMASK_ESN_OVERLAP | MLX5_MODIFY_IPSEC_BITMASK_ESN_MSB); + MLX5_SET(ipsec_obj, obj, esn_msb, attrs->esn_msb); + if (attrs->accel_flags & MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) + MLX5_SET(ipsec_obj, obj, esn_overlap, 1); + + /* general object fields set */ + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_ipsec_offload_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + struct mlx5_ipsec_obj_attrs ipsec_attrs = {}; + struct mlx5_core_dev *mdev = xfrm->mdev; + struct mlx5_ipsec_esp_xfrm *mxfrm; + + int err = 0; + + if (!memcmp(&xfrm->attrs, attrs, sizeof(xfrm->attrs))) + return 0; + + if (mlx5_ipsec_offload_esp_validate_xfrm_attrs(mdev, attrs)) + return -EOPNOTSUPP; + + mxfrm = container_of(xfrm, struct mlx5_ipsec_esp_xfrm, accel_xfrm); + + mutex_lock(&mxfrm->lock); + + if (!mxfrm->sa_ctx) + /* Not bound xfrm, change only sw attrs */ + goto change_sw_xfrm_attrs; + + /* need to add find and replace in ipsec_rhash_sa the sa_ctx */ + /* modify device with new hw_sa */ + ipsec_attrs.accel_flags = attrs->flags; + ipsec_attrs.esn_msb = attrs->esn; + err = mlx5_modify_ipsec_obj(mdev, + &ipsec_attrs, + mxfrm->sa_ctx->ipsec_obj_id); + +change_sw_xfrm_attrs: + if (!err) + memcpy(&xfrm->attrs, attrs, sizeof(xfrm->attrs)); + + mutex_unlock(&mxfrm->lock); + return err; +} + +static const struct mlx5_accel_ipsec_ops ipsec_offload_ops = { + .device_caps = mlx5_ipsec_offload_device_caps, + .create_hw_context = mlx5_ipsec_offload_create_sa_ctx, + .free_hw_context = mlx5_ipsec_offload_delete_sa_ctx, + .init = mlx5_ipsec_offload_init, + .esp_create_xfrm = mlx5_ipsec_offload_esp_create_xfrm, + .esp_destroy_xfrm = mlx5_ipsec_offload_esp_destroy_xfrm, + .esp_modify_xfrm = mlx5_ipsec_offload_esp_modify_xfrm, +}; + +const struct mlx5_accel_ipsec_ops *mlx5_ipsec_offload_ops(struct mlx5_core_dev *mdev) +{ + if (!mlx5_ipsec_offload_device_caps(mdev)) + return NULL; + + return &ipsec_offload_ops; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.h new file mode 100644 index 0000000..875e589 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec_offload.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#ifndef __MLX5_IPSEC_OFFLOAD_H__ +#define __MLX5_IPSEC_OFFLOAD_H__ + +#include +#include "accel/ipsec.h" + +#ifdef CONFIG_MLX5_IPSEC + +const struct mlx5_accel_ipsec_ops *mlx5_ipsec_offload_ops(struct mlx5_core_dev *mdev); +static inline bool mlx5_is_ipsec_device(struct mlx5_core_dev *mdev) +{ + if (!MLX5_CAP_GEN(mdev, ipsec_offload)) + return false; + + if (!MLX5_CAP_GEN(mdev, log_max_dek)) + return false; + + if (!(MLX5_CAP_GEN_64(mdev, general_obj_types) & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC)) + return false; + + return (MLX5_CAP_IPSEC(mdev, ipsec_crypto_offload) && MLX5_CAP_ETH(mdev, insert_trailer)) || + MLX5_CAP_IPSEC(mdev, ipsec_full_offload); +} + +#else +static inline const struct mlx5_accel_ipsec_ops * +mlx5_ipsec_offload_ops(struct mlx5_core_dev *mdev) { return NULL; } +static inline bool mlx5_is_ipsec_device(struct mlx5_core_dev *mdev) +{ + return false; +} + +#endif /* CONFIG_MLX5_IPSEC */ +#endif /* __MLX5_IPSEC_OFFLOAD_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c new file mode 100644 index 0000000..6c2b86a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include + +#include "accel/tls.h" +#include "mlx5_core.h" +#include "lib/mlx5.h" + +#ifdef CONFIG_MLX5_FPGA_TLS +#include "fpga/tls.h" + +int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx) +{ + return mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, + start_offload_tcp_sn, p_swid, + direction_sx); +} + +void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + bool direction_sx) +{ + mlx5_fpga_tls_del_flow(mdev, swid, GFP_KERNEL, direction_sx); +} + +int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn) +{ + return mlx5_fpga_tls_resync_rx(mdev, handle, seq, rcd_sn); +} + +bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) +{ + return mlx5_fpga_is_tls_device(mdev) || + mlx5_accel_is_ktls_device(mdev); +} + +u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) +{ + return mlx5_fpga_tls_device_caps(mdev); +} + +int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) +{ + return mlx5_fpga_tls_init(mdev); +} + +void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev) +{ + mlx5_fpga_tls_cleanup(mdev); +} +#endif + +#ifdef CONFIG_MLX5_TLS +int mlx5_ktls_create_key(struct mlx5_core_dev *mdev, + struct tls_crypto_info *crypto_info, + u32 *p_key_id) +{ + u32 sz_bytes; + void *key; + + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: { + struct tls12_crypto_info_aes_gcm_128 *info = + (struct tls12_crypto_info_aes_gcm_128 *)crypto_info; + + key = info->key; + sz_bytes = sizeof(info->key); + break; + } + case TLS_CIPHER_AES_GCM_256: { + struct tls12_crypto_info_aes_gcm_256 *info = + (struct tls12_crypto_info_aes_gcm_256 *)crypto_info; + + key = info->key; + sz_bytes = sizeof(info->key); + break; + } + default: + return -EINVAL; + } + + return mlx5_create_encryption_key(mdev, key, sz_bytes, + MLX5_ACCEL_OBJ_TLS_KEY, + p_key_id); +} + +void mlx5_ktls_destroy_key(struct mlx5_core_dev *mdev, u32 key_id) +{ + mlx5_destroy_encryption_key(mdev, key_id); +} +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h new file mode 100644 index 0000000..fd874f0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5_ACCEL_TLS_H__ +#define __MLX5_ACCEL_TLS_H__ + +#include +#include + +#ifdef CONFIG_MLX5_TLS +int mlx5_ktls_create_key(struct mlx5_core_dev *mdev, + struct tls_crypto_info *crypto_info, + u32 *p_key_id); +void mlx5_ktls_destroy_key(struct mlx5_core_dev *mdev, u32 key_id); + +static inline bool mlx5_accel_is_ktls_tx(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_GEN(mdev, tls_tx); +} + +static inline bool mlx5_accel_is_ktls_rx(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_GEN(mdev, tls_rx); +} + +static inline bool mlx5_accel_is_ktls_device(struct mlx5_core_dev *mdev) +{ + if (!mlx5_accel_is_ktls_tx(mdev) && + !mlx5_accel_is_ktls_rx(mdev)) + return false; + + if (!MLX5_CAP_GEN(mdev, log_max_dek)) + return false; + + return MLX5_CAP_TLS(mdev, tls_1_2_aes_gcm_128); +} + +static inline bool mlx5e_ktls_type_check(struct mlx5_core_dev *mdev, + struct tls_crypto_info *crypto_info) +{ + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: + if (crypto_info->version == TLS_1_2_VERSION) + return MLX5_CAP_TLS(mdev, tls_1_2_aes_gcm_128); + break; + } + + return false; +} +#else +static inline bool mlx5_accel_is_ktls_tx(struct mlx5_core_dev *mdev) +{ return false; } + +static inline bool mlx5_accel_is_ktls_rx(struct mlx5_core_dev *mdev) +{ return false; } + +static inline int +mlx5_ktls_create_key(struct mlx5_core_dev *mdev, + struct tls_crypto_info *crypto_info, + u32 *p_key_id) { return -ENOTSUPP; } +static inline void +mlx5_ktls_destroy_key(struct mlx5_core_dev *mdev, u32 key_id) {} + +static inline bool +mlx5_accel_is_ktls_device(struct mlx5_core_dev *mdev) { return false; } +static inline bool +mlx5e_ktls_type_check(struct mlx5_core_dev *mdev, + struct tls_crypto_info *crypto_info) { return false; } +#endif + +enum { + MLX5_ACCEL_TLS_TX = BIT(0), + MLX5_ACCEL_TLS_RX = BIT(1), + MLX5_ACCEL_TLS_V12 = BIT(2), + MLX5_ACCEL_TLS_V13 = BIT(3), + MLX5_ACCEL_TLS_LRO = BIT(4), + MLX5_ACCEL_TLS_IPV6 = BIT(5), + MLX5_ACCEL_TLS_AES_GCM128 = BIT(30), + MLX5_ACCEL_TLS_AES_GCM256 = BIT(31), +}; + +struct mlx5_ifc_tls_flow_bits { + u8 src_port[0x10]; + u8 dst_port[0x10]; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; + u8 ipv6[0x1]; + u8 direction_sx[0x1]; + u8 reserved_at_2[0x1e]; +}; + +#ifdef CONFIG_MLX5_FPGA_TLS +int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx); +void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + bool direction_sx); +int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn); +bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev); +u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev); +int mlx5_accel_tls_init(struct mlx5_core_dev *mdev); +void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev); + +#else + +static inline int +mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx) { return -ENOTSUPP; } +static inline void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + bool 
direction_sx) { } +static inline int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn) { return 0; } +static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) +{ + return mlx5_accel_is_ktls_device(mdev); +} +static inline u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) { return 0; } +static inline int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) { return 0; } +static inline void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev) { } +#endif + +#endif /* __MLX5_ACCEL_TLS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/alloc.c new file mode 100644 index 0000000..291e427 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +struct mlx5_db_pgdir { + struct list_head list; + unsigned long *bitmap; + __be32 *db_page; + dma_addr_t db_dma; +}; + +/* Handling for queue buffers -- we allocate a bunch of memory and + * register it in a memory region at HCA virtual address 0. 
+ */ + +static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev, + size_t size, dma_addr_t *dma_handle, + int node) +{ + struct device *device = mlx5_core_dma_dev(dev); + struct mlx5_priv *priv = &dev->priv; + int original_node; + void *cpu_handle; + + mutex_lock(&priv->alloc_mutex); + original_node = dev_to_node(device); + set_dev_node(device, node); + cpu_handle = dma_alloc_coherent(device, size, dma_handle, + GFP_KERNEL); + set_dev_node(device, original_node); + mutex_unlock(&priv->alloc_mutex); + return cpu_handle; +} + +static int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_frag_buf *buf, int node) +{ + dma_addr_t t; + + buf->size = size; + buf->npages = 1; + buf->page_shift = (u8)get_order(size) + PAGE_SHIFT; + + buf->frags = kzalloc(sizeof(*buf->frags), GFP_KERNEL); + if (!buf->frags) + return -ENOMEM; + + buf->frags->buf = mlx5_dma_zalloc_coherent_node(dev, size, + &t, node); + if (!buf->frags->buf) + goto err_out; + + buf->frags->map = t; + + while (t & ((1 << buf->page_shift) - 1)) { + --buf->page_shift; + buf->npages *= 2; + } + + return 0; +err_out: + kfree(buf->frags); + return -ENOMEM; +} + +int mlx5_buf_alloc(struct mlx5_core_dev *dev, + int size, struct mlx5_frag_buf *buf) +{ + return mlx5_buf_alloc_node(dev, size, buf, dev->priv.numa_node); +} +EXPORT_SYMBOL(mlx5_buf_alloc); + +void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf) +{ + dma_free_coherent(mlx5_core_dma_dev(dev), buf->size, buf->frags->buf, + buf->frags->map); + + kfree(buf->frags); +} +EXPORT_SYMBOL_GPL(mlx5_buf_free); + +int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_frag_buf *buf, int node) +{ + int i; + + buf->size = size; + buf->npages = DIV_ROUND_UP(size, PAGE_SIZE); + buf->page_shift = PAGE_SHIFT; + buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list), + GFP_KERNEL); + if (!buf->frags) + goto err_out; + + for (i = 0; i < buf->npages; i++) { + struct mlx5_buf_list *frag = &buf->frags[i]; + int frag_sz = min_t(int, size, PAGE_SIZE); + + frag->buf = mlx5_dma_zalloc_coherent_node(dev, frag_sz, + &frag->map, node); + if (!frag->buf) + goto err_free_buf; + if (frag->map & ((1 << buf->page_shift) - 1)) { + dma_free_coherent(mlx5_core_dma_dev(dev), frag_sz, + buf->frags[i].buf, buf->frags[i].map); + mlx5_core_warn(dev, "unexpected map alignment: %pad, page_shift=%d\n", + &frag->map, buf->page_shift); + goto err_free_buf; + } + size -= frag_sz; + } + + return 0; + +err_free_buf: + while (i--) + dma_free_coherent(mlx5_core_dma_dev(dev), PAGE_SIZE, buf->frags[i].buf, + buf->frags[i].map); + kfree(buf->frags); +err_out: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(mlx5_frag_buf_alloc_node); + +void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf) +{ + int size = buf->size; + int i; + + for (i = 0; i < buf->npages; i++) { + int frag_sz = min_t(int, size, PAGE_SIZE); + + dma_free_coherent(mlx5_core_dma_dev(dev), frag_sz, buf->frags[i].buf, + buf->frags[i].map); + size -= frag_sz; + } + kfree(buf->frags); +} +EXPORT_SYMBOL_GPL(mlx5_frag_buf_free); + +static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev, + int node) +{ + u32 db_per_page = PAGE_SIZE / cache_line_size(); + struct mlx5_db_pgdir *pgdir; + + pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL); + if (!pgdir) + return NULL; + + pgdir->bitmap = bitmap_zalloc(db_per_page, GFP_KERNEL); + if (!pgdir->bitmap) { + kfree(pgdir); + return NULL; + } + + bitmap_fill(pgdir->bitmap, db_per_page); + + pgdir->db_page = 
mlx5_dma_zalloc_coherent_node(dev, PAGE_SIZE, + &pgdir->db_dma, node); + if (!pgdir->db_page) { + bitmap_free(pgdir->bitmap); + kfree(pgdir); + return NULL; + } + + return pgdir; +} + +static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir, + struct mlx5_db *db) +{ + u32 db_per_page = PAGE_SIZE / cache_line_size(); + int offset; + int i; + + i = find_first_bit(pgdir->bitmap, db_per_page); + if (i >= db_per_page) + return -ENOMEM; + + __clear_bit(i, pgdir->bitmap); + + db->u.pgdir = pgdir; + db->index = i; + offset = db->index * cache_line_size(); + db->db = pgdir->db_page + offset / sizeof(*pgdir->db_page); + db->dma = pgdir->db_dma + offset; + + db->db[0] = 0; + db->db[1] = 0; + + return 0; +} + +int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node) +{ + struct mlx5_db_pgdir *pgdir; + int ret = 0; + + mutex_lock(&dev->priv.pgdir_mutex); + + list_for_each_entry(pgdir, &dev->priv.pgdir_list, list) + if (!mlx5_alloc_db_from_pgdir(pgdir, db)) + goto out; + + pgdir = mlx5_alloc_db_pgdir(dev, node); + if (!pgdir) { + ret = -ENOMEM; + goto out; + } + + list_add(&pgdir->list, &dev->priv.pgdir_list); + + /* This should never fail -- we just allocated an empty page: */ + WARN_ON(mlx5_alloc_db_from_pgdir(pgdir, db)); + +out: + mutex_unlock(&dev->priv.pgdir_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_db_alloc_node); + +int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + return mlx5_db_alloc_node(dev, db, dev->priv.numa_node); +} +EXPORT_SYMBOL_GPL(mlx5_db_alloc); + +void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + u32 db_per_page = PAGE_SIZE / cache_line_size(); + + mutex_lock(&dev->priv.pgdir_mutex); + + __set_bit(db->index, db->u.pgdir->bitmap); + + if (bitmap_full(db->u.pgdir->bitmap, db_per_page)) { + dma_free_coherent(mlx5_core_dma_dev(dev), PAGE_SIZE, + db->u.pgdir->db_page, db->u.pgdir->db_dma); + list_del(&db->u.pgdir->list); + bitmap_free(db->u.pgdir->bitmap); + kfree(db->u.pgdir); + } + + mutex_unlock(&dev->priv.pgdir_mutex); +} +EXPORT_SYMBOL_GPL(mlx5_db_free); + +void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas) +{ + u64 addr; + int i; + + for (i = 0; i < buf->npages; i++) { + addr = buf->frags->map + (i << buf->page_shift); + + pas[i] = cpu_to_be64(addr); + } +} +EXPORT_SYMBOL_GPL(mlx5_fill_page_array); + +void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm) +{ + int i; + + WARN_ON(perm & 0xfc); + for (i = 0; i < buf->npages; i++) + pas[i] = cpu_to_be64(buf->frags[i].map | perm); +} +EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array_perm); + +void mlx5_fill_page_frag_array(struct mlx5_frag_buf *buf, __be64 *pas) +{ + mlx5_fill_page_frag_array_perm(buf, pas, 0); +} +EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cmd.c new file mode 100644 index 0000000..e9734c5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -0,0 +1,2641 @@ +/* + * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mlx5_core.h" +#include "lib/eq.h" +#include "lib/tout.h" + +enum { + CMD_IF_REV = 5, +}; + +enum { + CMD_MODE_POLLING, + CMD_MODE_EVENTS +}; + +enum { + MLX5_CMD_DELIVERY_STAT_OK = 0x0, + MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR = 0x1, + MLX5_CMD_DELIVERY_STAT_TOK_ERR = 0x2, + MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR = 0x3, + MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR = 0x4, + MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR = 0x5, + MLX5_CMD_DELIVERY_STAT_FW_ERR = 0x6, + MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR = 0x7, + MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR = 0x8, + MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR = 0x9, + MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR = 0x10, +}; + +static int cmd_sysfs_init(struct mlx5_core_dev *dev); +static void cmd_sysfs_cleanup(struct mlx5_core_dev *dev); + +static struct mlx5_cmd_work_ent * +cmd_alloc_ent(struct mlx5_cmd *cmd, struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, void *uout, int uout_size, + mlx5_cmd_cbk_t cbk, void *context, int page_queue) +{ + gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL; + struct mlx5_cmd_work_ent *ent; + + ent = kzalloc(sizeof(*ent), alloc_flags); + if (!ent) + return ERR_PTR(-ENOMEM); + + ent->idx = -EINVAL; + ent->in = in; + ent->out = out; + ent->uout = uout; + ent->uout_size = uout_size; + ent->callback = cbk; + ent->context = context; + ent->cmd = cmd; + ent->page_queue = page_queue; + refcount_set(&ent->refcnt, 1); + + return ent; +} + +static void cmd_free_ent(struct mlx5_cmd_work_ent *ent) +{ + kfree(ent); +} + +static u8 alloc_token(struct mlx5_cmd *cmd) +{ + u8 token; + + spin_lock(&cmd->token_lock); + cmd->token++; + if (cmd->token == 0) + cmd->token++; + token = cmd->token; + spin_unlock(&cmd->token_lock); + + return token; +} + +static int cmd_alloc_index(struct mlx5_cmd *cmd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&cmd->alloc_lock, flags); + ret = find_first_bit(&cmd->bitmask, cmd->max_reg_cmds); + if (ret < cmd->max_reg_cmds) + clear_bit(ret, &cmd->bitmask); + spin_unlock_irqrestore(&cmd->alloc_lock, flags); + + return ret < cmd->max_reg_cmds ? 
ret : -ENOMEM; +} + +static void cmd_free_index(struct mlx5_cmd *cmd, int idx) +{ + lockdep_assert_held(&cmd->alloc_lock); + set_bit(idx, &cmd->bitmask); +} + +static void cmd_ent_get(struct mlx5_cmd_work_ent *ent) +{ + refcount_inc(&ent->refcnt); +} + +static void cmd_ent_put(struct mlx5_cmd_work_ent *ent) +{ + struct mlx5_cmd *cmd = ent->cmd; + unsigned long flags; + + spin_lock_irqsave(&cmd->alloc_lock, flags); + if (!refcount_dec_and_test(&ent->refcnt)) + goto out; + + if (ent->idx >= 0) { + cmd_free_index(cmd, ent->idx); + up(ent->page_queue ? &cmd->pages_sem : &cmd->sem); + } + + cmd_free_ent(ent); +out: + spin_unlock_irqrestore(&cmd->alloc_lock, flags); +} + +static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx) +{ + return cmd->cmd_buf + (idx << cmd->log_stride); +} + +static int mlx5_calc_cmd_blocks(struct mlx5_cmd_msg *msg) +{ + int size = msg->len; + int blen = size - min_t(int, sizeof(msg->first.data), size); + + return DIV_ROUND_UP(blen, MLX5_CMD_DATA_BLOCK_SIZE); +} + +static u8 xor8_buf(void *buf, size_t offset, int len) +{ + u8 *ptr = buf; + u8 sum = 0; + int i; + int end = len + offset; + + for (i = offset; i < end; i++) + sum ^= ptr[i]; + + return sum; +} + +static int verify_block_sig(struct mlx5_cmd_prot_block *block) +{ + size_t rsvd0_off = offsetof(struct mlx5_cmd_prot_block, rsvd0); + int xor_len = sizeof(*block) - sizeof(block->data) - 1; + + if (xor8_buf(block, rsvd0_off, xor_len) != 0xff) + return -EINVAL; + + if (xor8_buf(block, 0, sizeof(*block)) != 0xff) + return -EINVAL; + + return 0; +} + +static void calc_block_sig(struct mlx5_cmd_prot_block *block) +{ + int ctrl_xor_len = sizeof(*block) - sizeof(block->data) - 2; + size_t rsvd0_off = offsetof(struct mlx5_cmd_prot_block, rsvd0); + + block->ctrl_sig = ~xor8_buf(block, rsvd0_off, ctrl_xor_len); + block->sig = ~xor8_buf(block, 0, sizeof(*block) - 1); +} + +static void calc_chain_sig(struct mlx5_cmd_msg *msg) +{ + struct mlx5_cmd_mailbox *next = msg->next; + int n = mlx5_calc_cmd_blocks(msg); + int i = 0; + + for (i = 0; i < n && next; i++) { + calc_block_sig(next->buf); + next = next->next; + } +} + +static void set_signature(struct mlx5_cmd_work_ent *ent, int csum) +{ + ent->lay->sig = ~xor8_buf(ent->lay, 0, sizeof(*ent->lay)); + if (csum) { + calc_chain_sig(ent->in); + calc_chain_sig(ent->out); + } +} + +static void poll_timeout(struct mlx5_cmd_work_ent *ent) +{ + struct mlx5_core_dev *dev = container_of(ent->cmd, struct mlx5_core_dev, cmd); + u64 cmd_to_ms = mlx5_tout_ms(dev, CMD); + unsigned long poll_end; + u8 own; + + poll_end = jiffies + msecs_to_jiffies(cmd_to_ms + 1000); + + do { + own = READ_ONCE(ent->lay->status_own); + if (!(own & CMD_OWNER_HW)) { + ent->ret = 0; + return; + } + cond_resched(); + } while (time_before(jiffies, poll_end)); + + ent->ret = -ETIMEDOUT; +} + +static int verify_signature(struct mlx5_cmd_work_ent *ent) +{ + struct mlx5_cmd_mailbox *next = ent->out->next; + int n = mlx5_calc_cmd_blocks(ent->out); + int err; + u8 sig; + int i = 0; + + sig = xor8_buf(ent->lay, 0, sizeof(*ent->lay)); + if (sig != 0xff) + return -EINVAL; + + for (i = 0; i < n && next; i++) { + err = verify_block_sig(next->buf); + if (err) + return err; + + next = next->next; + } + + return 0; +} + +static void dump_buf(void *buf, int size, int data_only, int offset, int idx) +{ + __be32 *p = buf; + int i; + + for (i = 0; i < size; i += 16) { + pr_debug("cmd[%d]: %03x: %08x %08x %08x %08x\n", idx, offset, + be32_to_cpu(p[0]), be32_to_cpu(p[1]), + be32_to_cpu(p[2]), be32_to_cpu(p[3])); + p += 4; + 
offset += 16; + } + if (!data_only) + pr_debug("\n"); +} + +static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, + u32 *synd, u8 *status) +{ + *synd = 0; + *status = 0; + + switch (op) { + case MLX5_CMD_OP_TEARDOWN_HCA: + case MLX5_CMD_OP_DISABLE_HCA: + case MLX5_CMD_OP_MANAGE_PAGES: + case MLX5_CMD_OP_DESTROY_MKEY: + case MLX5_CMD_OP_DESTROY_EQ: + case MLX5_CMD_OP_DESTROY_CQ: + case MLX5_CMD_OP_DESTROY_QP: + case MLX5_CMD_OP_SET_DC_CNAK_TRACE: + case MLX5_CMD_OP_DESTROY_PSV: + case MLX5_CMD_OP_DESTROY_SRQ: + case MLX5_CMD_OP_DESTROY_XRC_SRQ: + case MLX5_CMD_OP_DESTROY_XRQ: + case MLX5_CMD_OP_DESTROY_DCT: + case MLX5_CMD_OP_DEALLOC_Q_COUNTER: + case MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_DESTROY_QOS_PARA_VPORT: + case MLX5_CMD_OP_DEALLOC_PD: + case MLX5_CMD_OP_DEALLOC_UAR: + case MLX5_CMD_OP_DETACH_FROM_MCG: + case MLX5_CMD_OP_DEALLOC_XRCD: + case MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN: + case MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY: + case MLX5_CMD_OP_DESTROY_LAG: + case MLX5_CMD_OP_DESTROY_VPORT_LAG: + case MLX5_CMD_OP_DESTROY_TIR: + case MLX5_CMD_OP_DESTROY_SQ: + case MLX5_CMD_OP_DESTROY_RQ: + case MLX5_CMD_OP_DESTROY_RMP: + case MLX5_CMD_OP_DESTROY_TIS: + case MLX5_CMD_OP_DESTROY_RQT: + case MLX5_CMD_OP_DESTROY_FLOW_TABLE: + case MLX5_CMD_OP_DESTROY_FLOW_GROUP: + case MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_DEALLOC_FLOW_COUNTER: + case MLX5_CMD_OP_DESTROY_NVMF_BACKEND_CTRL: + case MLX5_CMD_OP_DETACH_NVMF_NAMESPACE: + case MLX5_CMD_OP_2ERR_QP: + case MLX5_CMD_OP_2RST_QP: + case MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT: + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_SET_FLOW_TABLE_ROOT: + case MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT: + case MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_FPGA_DESTROY_QP: + case MLX5_CMD_OP_DESTROY_GENERAL_OBJECT: + case MLX5_CMD_OP_DEALLOC_MEMIC: + case MLX5_CMD_OP_PAGE_FAULT_RESUME: + case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS: + case MLX5_CMD_OP_DEALLOC_SF: + case MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY: + case MLX5_CMD_OP_ACCESS_REG: + case MLX5_CMD_OP_DESTROY_UCTX: + case MLX5_CMD_OP_DESTROY_UMEM: + case MLX5_CMD_OP_MODIFY_RQT: + return MLX5_CMD_STAT_OK; + + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_ADAPTER: + case MLX5_CMD_OP_INIT_HCA: + case MLX5_CMD_OP_ENABLE_HCA: + case MLX5_CMD_OP_QUERY_PAGES: + case MLX5_CMD_OP_SET_HCA_CAP: + case MLX5_CMD_OP_QUERY_ISSI: + case MLX5_CMD_OP_SET_ISSI: + case MLX5_CMD_OP_QUERY_OTHER_HCA_CAP: + case MLX5_CMD_OP_MODIFY_OTHER_HCA_CAP: + case MLX5_CMD_OP_CREATE_MKEY: + case MLX5_CMD_OP_QUERY_MKEY: + case MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS: + case MLX5_CMD_OP_CREATE_EQ: + case MLX5_CMD_OP_QUERY_EQ: + case MLX5_CMD_OP_GEN_EQE: + case MLX5_CMD_OP_CREATE_CQ: + case MLX5_CMD_OP_QUERY_CQ: + case MLX5_CMD_OP_MODIFY_CQ: + case MLX5_CMD_OP_CREATE_QP: + case MLX5_CMD_OP_RST2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + case MLX5_CMD_OP_SQERR2RTS_QP: + case MLX5_CMD_OP_QUERY_QP: + case MLX5_CMD_OP_SQD_RTS_QP: + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_CREATE_PSV: + case MLX5_CMD_OP_CREATE_SRQ: + case MLX5_CMD_OP_QUERY_SRQ: + case MLX5_CMD_OP_ARM_RQ: + case MLX5_CMD_OP_CREATE_XRC_SRQ: + case MLX5_CMD_OP_QUERY_XRC_SRQ: + case MLX5_CMD_OP_ARM_XRC_SRQ: + case MLX5_CMD_OP_CREATE_XRQ: + case MLX5_CMD_OP_QUERY_XRQ: + case MLX5_CMD_OP_ARM_XRQ: + case MLX5_CMD_OP_CREATE_DCT: + case MLX5_CMD_OP_DRAIN_DCT: + case 
MLX5_CMD_OP_QUERY_DCT: + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + case MLX5_CMD_OP_QUERY_VPORT_STATE: + case MLX5_CMD_OP_MODIFY_VPORT_STATE: + case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT: + case MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ROCE_ADDRESS: + case MLX5_CMD_OP_SET_ROCE_ADDRESS: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: + case MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_HCA_VPORT_GID: + case MLX5_CMD_OP_QUERY_VNIC_ENV: + case MLX5_CMD_OP_QUERY_VPORT_COUNTER: + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + case MLX5_CMD_OP_QUERY_Q_COUNTER: + case MLX5_CMD_OP_SET_MONITOR_COUNTER: + case MLX5_CMD_OP_ARM_MONITOR_COUNTER: + case MLX5_CMD_OP_SET_PP_RATE_LIMIT: + case MLX5_CMD_OP_QUERY_RATE_LIMIT: + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_CREATE_QOS_PARA_VPORT: + case MLX5_CMD_OP_ALLOC_PD: + case MLX5_CMD_OP_ALLOC_UAR: + case MLX5_CMD_OP_CONFIG_INT_MODERATION: + case MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS: + case MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS: + case MLX5_CMD_OP_QUERY_DIAGNOSTIC_COUNTERS: + case MLX5_CMD_OP_ATTACH_TO_MCG: + case MLX5_CMD_OP_GET_DROPPED_PACKET_LOG: + case MLX5_CMD_OP_MAD_IFC: + case MLX5_CMD_OP_QUERY_MAD_DEMUX: + case MLX5_CMD_OP_SET_MAD_DEMUX: + case MLX5_CMD_OP_NOP: + case MLX5_CMD_OP_ALLOC_XRCD: + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + case MLX5_CMD_OP_QUERY_CONG_STATUS: + case MLX5_CMD_OP_MODIFY_CONG_STATUS: + case MLX5_CMD_OP_QUERY_CONG_PARAMS: + case MLX5_CMD_OP_MODIFY_CONG_PARAMS: + case MLX5_CMD_OP_QUERY_CONG_STATISTICS: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + case MLX5_CMD_OP_CREATE_LAG: + case MLX5_CMD_OP_MODIFY_LAG: + case MLX5_CMD_OP_QUERY_LAG: + case MLX5_CMD_OP_CREATE_VPORT_LAG: + case MLX5_CMD_OP_CREATE_TIR: + case MLX5_CMD_OP_MODIFY_TIR: + case MLX5_CMD_OP_QUERY_TIR: + case MLX5_CMD_OP_CREATE_SQ: + case MLX5_CMD_OP_MODIFY_SQ: + case MLX5_CMD_OP_QUERY_SQ: + case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_MODIFY_RQ: + case MLX5_CMD_OP_QUERY_RQ: + case MLX5_CMD_OP_CREATE_RMP: + case MLX5_CMD_OP_MODIFY_RMP: + case MLX5_CMD_OP_QUERY_RMP: + case MLX5_CMD_OP_CREATE_TIS: + case MLX5_CMD_OP_MODIFY_TIS: + case MLX5_CMD_OP_QUERY_TIS: + case MLX5_CMD_OP_CREATE_RQT: + case MLX5_CMD_OP_QUERY_RQT: + case MLX5_CMD_OP_CREATE_NVMF_BACKEND_CTRL: + case MLX5_CMD_OP_QUERY_NVMF_BACKEND_CTRL: + case MLX5_CMD_OP_ATTACH_NVMF_NAMESPACE: + case MLX5_CMD_OP_QUERY_NVMF_NAMESPACE_CONTEXT: + + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_FPGA_CREATE_QP: + case MLX5_CMD_OP_FPGA_MODIFY_QP: + case MLX5_CMD_OP_FPGA_QUERY_QP: + case MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS: + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: + case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: + case MLX5_CMD_OP_CREATE_UCTX: + case MLX5_CMD_OP_CREATE_UMEM: + case MLX5_CMD_OP_ALLOC_MEMIC: + case MLX5_CMD_OP_SYNC_STEERING: + case MLX5_CMD_OP_MODIFY_XRQ: + case MLX5_CMD_OP_RELEASE_XRQ_ERROR: + case MLX5_CMD_OP_QUERY_VHCA_STATE: + case MLX5_CMD_OP_MODIFY_VHCA_STATE: + case 
MLX5_CMD_OP_ALLOC_SF: + *status = MLX5_DRIVER_STATUS_ABORTED; + *synd = MLX5_DRIVER_SYND; + return -EIO; + default: + mlx5_core_err(dev, "Unknown FW command (%d)\n", op); + return -EINVAL; + } +} + +const char *mlx5_command_str(int command) +{ +#define MLX5_COMMAND_STR_CASE(__cmd) case MLX5_CMD_OP_ ## __cmd: return #__cmd + + switch (command) { + MLX5_COMMAND_STR_CASE(QUERY_HCA_CAP); + MLX5_COMMAND_STR_CASE(QUERY_ADAPTER); + MLX5_COMMAND_STR_CASE(INIT_HCA); + MLX5_COMMAND_STR_CASE(TEARDOWN_HCA); + MLX5_COMMAND_STR_CASE(ENABLE_HCA); + MLX5_COMMAND_STR_CASE(DISABLE_HCA); + MLX5_COMMAND_STR_CASE(QUERY_PAGES); + MLX5_COMMAND_STR_CASE(MANAGE_PAGES); + MLX5_COMMAND_STR_CASE(SET_HCA_CAP); + MLX5_COMMAND_STR_CASE(QUERY_ISSI); + MLX5_COMMAND_STR_CASE(SET_ISSI); + MLX5_COMMAND_STR_CASE(SET_DRIVER_VERSION); + MLX5_COMMAND_STR_CASE(CREATE_MKEY); + MLX5_COMMAND_STR_CASE(QUERY_MKEY); + MLX5_COMMAND_STR_CASE(DESTROY_MKEY); + MLX5_COMMAND_STR_CASE(QUERY_SPECIAL_CONTEXTS); + MLX5_COMMAND_STR_CASE(PAGE_FAULT_RESUME); + MLX5_COMMAND_STR_CASE(CREATE_EQ); + MLX5_COMMAND_STR_CASE(DESTROY_EQ); + MLX5_COMMAND_STR_CASE(QUERY_EQ); + MLX5_COMMAND_STR_CASE(GEN_EQE); + MLX5_COMMAND_STR_CASE(CREATE_CQ); + MLX5_COMMAND_STR_CASE(DESTROY_CQ); + MLX5_COMMAND_STR_CASE(QUERY_CQ); + MLX5_COMMAND_STR_CASE(MODIFY_CQ); + MLX5_COMMAND_STR_CASE(CREATE_QP); + MLX5_COMMAND_STR_CASE(DESTROY_QP); + MLX5_COMMAND_STR_CASE(RST2INIT_QP); + MLX5_COMMAND_STR_CASE(INIT2RTR_QP); + MLX5_COMMAND_STR_CASE(RTR2RTS_QP); + MLX5_COMMAND_STR_CASE(RTS2RTS_QP); + MLX5_COMMAND_STR_CASE(SQERR2RTS_QP); + MLX5_COMMAND_STR_CASE(2ERR_QP); + MLX5_COMMAND_STR_CASE(2RST_QP); + MLX5_COMMAND_STR_CASE(QUERY_QP); + MLX5_COMMAND_STR_CASE(SQD_RTS_QP); + MLX5_COMMAND_STR_CASE(INIT2INIT_QP); + MLX5_COMMAND_STR_CASE(CREATE_PSV); + MLX5_COMMAND_STR_CASE(DESTROY_PSV); + MLX5_COMMAND_STR_CASE(CREATE_SRQ); + MLX5_COMMAND_STR_CASE(DESTROY_SRQ); + MLX5_COMMAND_STR_CASE(QUERY_SRQ); + MLX5_COMMAND_STR_CASE(ARM_RQ); + MLX5_COMMAND_STR_CASE(CREATE_XRC_SRQ); + MLX5_COMMAND_STR_CASE(DESTROY_XRC_SRQ); + MLX5_COMMAND_STR_CASE(QUERY_XRC_SRQ); + MLX5_COMMAND_STR_CASE(ARM_XRC_SRQ); + MLX5_COMMAND_STR_CASE(CREATE_DCT); + MLX5_COMMAND_STR_CASE(DESTROY_DCT); + MLX5_COMMAND_STR_CASE(DRAIN_DCT); + MLX5_COMMAND_STR_CASE(QUERY_DCT); + MLX5_COMMAND_STR_CASE(SET_DC_CNAK_TRACE); + MLX5_COMMAND_STR_CASE(ARM_DCT_FOR_KEY_VIOLATION); + MLX5_COMMAND_STR_CASE(QUERY_VPORT_STATE); + MLX5_COMMAND_STR_CASE(MODIFY_VPORT_STATE); + MLX5_COMMAND_STR_CASE(QUERY_ESW_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(MODIFY_ESW_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(QUERY_NIC_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(MODIFY_NIC_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(QUERY_ROCE_ADDRESS); + MLX5_COMMAND_STR_CASE(SET_ROCE_ADDRESS); + MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(MODIFY_HCA_VPORT_CONTEXT); + MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_GID); + MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_PKEY); + MLX5_COMMAND_STR_CASE(QUERY_VNIC_ENV); + MLX5_COMMAND_STR_CASE(QUERY_VPORT_COUNTER); + MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER); + MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER); + MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER); + MLX5_COMMAND_STR_CASE(SET_MONITOR_COUNTER); + MLX5_COMMAND_STR_CASE(ARM_MONITOR_COUNTER); + MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT); + MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT); + MLX5_COMMAND_STR_CASE(QUERY_SCHEDULING_ELEMENT); + MLX5_COMMAND_STR_CASE(MODIFY_SCHEDULING_ELEMENT); + 
MLX5_COMMAND_STR_CASE(CREATE_QOS_PARA_VPORT); + MLX5_COMMAND_STR_CASE(DESTROY_QOS_PARA_VPORT); + MLX5_COMMAND_STR_CASE(ALLOC_PD); + MLX5_COMMAND_STR_CASE(DEALLOC_PD); + MLX5_COMMAND_STR_CASE(ALLOC_UAR); + MLX5_COMMAND_STR_CASE(DEALLOC_UAR); + MLX5_COMMAND_STR_CASE(CONFIG_INT_MODERATION); + MLX5_COMMAND_STR_CASE(ACCESS_REG); + MLX5_COMMAND_STR_CASE(ATTACH_TO_MCG); + MLX5_COMMAND_STR_CASE(DETACH_FROM_MCG); + MLX5_COMMAND_STR_CASE(GET_DROPPED_PACKET_LOG); + MLX5_COMMAND_STR_CASE(MAD_IFC); + MLX5_COMMAND_STR_CASE(QUERY_MAD_DEMUX); + MLX5_COMMAND_STR_CASE(SET_MAD_DEMUX); + MLX5_COMMAND_STR_CASE(NOP); + MLX5_COMMAND_STR_CASE(ALLOC_XRCD); + MLX5_COMMAND_STR_CASE(DEALLOC_XRCD); + MLX5_COMMAND_STR_CASE(ALLOC_TRANSPORT_DOMAIN); + MLX5_COMMAND_STR_CASE(DEALLOC_TRANSPORT_DOMAIN); + MLX5_COMMAND_STR_CASE(QUERY_CONG_STATUS); + MLX5_COMMAND_STR_CASE(MODIFY_CONG_STATUS); + MLX5_COMMAND_STR_CASE(QUERY_CONG_PARAMS); + MLX5_COMMAND_STR_CASE(MODIFY_CONG_PARAMS); + MLX5_COMMAND_STR_CASE(QUERY_CONG_STATISTICS); + MLX5_COMMAND_STR_CASE(ADD_VXLAN_UDP_DPORT); + MLX5_COMMAND_STR_CASE(DELETE_VXLAN_UDP_DPORT); + MLX5_COMMAND_STR_CASE(SET_L2_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(QUERY_L2_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(DELETE_L2_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(SET_WOL_ROL); + MLX5_COMMAND_STR_CASE(QUERY_WOL_ROL); + MLX5_COMMAND_STR_CASE(CREATE_LAG); + MLX5_COMMAND_STR_CASE(MODIFY_LAG); + MLX5_COMMAND_STR_CASE(QUERY_LAG); + MLX5_COMMAND_STR_CASE(DESTROY_LAG); + MLX5_COMMAND_STR_CASE(CREATE_VPORT_LAG); + MLX5_COMMAND_STR_CASE(DESTROY_VPORT_LAG); + MLX5_COMMAND_STR_CASE(CREATE_TIR); + MLX5_COMMAND_STR_CASE(MODIFY_TIR); + MLX5_COMMAND_STR_CASE(DESTROY_TIR); + MLX5_COMMAND_STR_CASE(QUERY_TIR); + MLX5_COMMAND_STR_CASE(CREATE_SQ); + MLX5_COMMAND_STR_CASE(MODIFY_SQ); + MLX5_COMMAND_STR_CASE(DESTROY_SQ); + MLX5_COMMAND_STR_CASE(QUERY_SQ); + MLX5_COMMAND_STR_CASE(CREATE_RQ); + MLX5_COMMAND_STR_CASE(MODIFY_RQ); + MLX5_COMMAND_STR_CASE(DESTROY_RQ); + MLX5_COMMAND_STR_CASE(QUERY_RQ); + MLX5_COMMAND_STR_CASE(CREATE_RMP); + MLX5_COMMAND_STR_CASE(MODIFY_RMP); + MLX5_COMMAND_STR_CASE(DESTROY_RMP); + MLX5_COMMAND_STR_CASE(QUERY_RMP); + MLX5_COMMAND_STR_CASE(CREATE_TIS); + MLX5_COMMAND_STR_CASE(MODIFY_TIS); + MLX5_COMMAND_STR_CASE(DESTROY_TIS); + MLX5_COMMAND_STR_CASE(QUERY_TIS); + MLX5_COMMAND_STR_CASE(CREATE_RQT); + MLX5_COMMAND_STR_CASE(MODIFY_RQT); + MLX5_COMMAND_STR_CASE(DESTROY_RQT); + MLX5_COMMAND_STR_CASE(QUERY_RQT); + MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ROOT); + MLX5_COMMAND_STR_CASE(CREATE_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(DESTROY_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(CREATE_FLOW_GROUP); + MLX5_COMMAND_STR_CASE(DESTROY_FLOW_GROUP); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_GROUP); + MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(DELETE_FLOW_TABLE_ENTRY); + MLX5_COMMAND_STR_CASE(ALLOC_FLOW_COUNTER); + MLX5_COMMAND_STR_CASE(DEALLOC_FLOW_COUNTER); + MLX5_COMMAND_STR_CASE(QUERY_FLOW_COUNTER); + MLX5_COMMAND_STR_CASE(MODIFY_FLOW_TABLE); + MLX5_COMMAND_STR_CASE(ALLOC_PACKET_REFORMAT_CONTEXT); + MLX5_COMMAND_STR_CASE(DEALLOC_PACKET_REFORMAT_CONTEXT); + MLX5_COMMAND_STR_CASE(ALLOC_MODIFY_HEADER_CONTEXT); + MLX5_COMMAND_STR_CASE(DEALLOC_MODIFY_HEADER_CONTEXT); + MLX5_COMMAND_STR_CASE(FPGA_CREATE_QP); + MLX5_COMMAND_STR_CASE(FPGA_MODIFY_QP); + MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP); + MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP_COUNTERS); + MLX5_COMMAND_STR_CASE(FPGA_DESTROY_QP); + MLX5_COMMAND_STR_CASE(CREATE_XRQ); + 
MLX5_COMMAND_STR_CASE(DESTROY_XRQ); + MLX5_COMMAND_STR_CASE(QUERY_XRQ); + MLX5_COMMAND_STR_CASE(ARM_XRQ); + MLX5_COMMAND_STR_CASE(CREATE_NVMF_BACKEND_CTRL); + MLX5_COMMAND_STR_CASE(DESTROY_NVMF_BACKEND_CTRL); + MLX5_COMMAND_STR_CASE(QUERY_NVMF_BACKEND_CTRL); + MLX5_COMMAND_STR_CASE(ATTACH_NVMF_NAMESPACE); + MLX5_COMMAND_STR_CASE(DETACH_NVMF_NAMESPACE); + MLX5_COMMAND_STR_CASE(QUERY_NVMF_NAMESPACE_CONTEXT); + MLX5_COMMAND_STR_CASE(CREATE_GENERAL_OBJECT); + MLX5_COMMAND_STR_CASE(DESTROY_GENERAL_OBJECT); + MLX5_COMMAND_STR_CASE(MODIFY_GENERAL_OBJECT); + MLX5_COMMAND_STR_CASE(QUERY_GENERAL_OBJECT); + MLX5_COMMAND_STR_CASE(QUERY_MODIFY_HEADER_CONTEXT); + MLX5_COMMAND_STR_CASE(ALLOC_MEMIC); + MLX5_COMMAND_STR_CASE(DEALLOC_MEMIC); + MLX5_COMMAND_STR_CASE(QUERY_ESW_FUNCTIONS); + MLX5_COMMAND_STR_CASE(CREATE_UCTX); + MLX5_COMMAND_STR_CASE(DESTROY_UCTX); + MLX5_COMMAND_STR_CASE(CREATE_UMEM); + MLX5_COMMAND_STR_CASE(DESTROY_UMEM); + MLX5_COMMAND_STR_CASE(RELEASE_XRQ_ERROR); + MLX5_COMMAND_STR_CASE(MODIFY_XRQ); + MLX5_COMMAND_STR_CASE(QUERY_DIAGNOSTIC_PARAMS); + MLX5_COMMAND_STR_CASE(SET_DIAGNOSTIC_PARAMS); + MLX5_COMMAND_STR_CASE(QUERY_DIAGNOSTIC_COUNTERS); + MLX5_COMMAND_STR_CASE(QUERY_VHCA_STATE); + MLX5_COMMAND_STR_CASE(MODIFY_VHCA_STATE); + MLX5_COMMAND_STR_CASE(ALLOC_SF); + MLX5_COMMAND_STR_CASE(DEALLOC_SF); + MLX5_COMMAND_STR_CASE(SYNC_STEERING); + default: return "unknown command opcode"; + } +} + +static const char *cmd_status_str(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: + return "OK"; + case MLX5_CMD_STAT_INT_ERR: + return "internal error"; + case MLX5_CMD_STAT_BAD_OP_ERR: + return "bad operation"; + case MLX5_CMD_STAT_BAD_PARAM_ERR: + return "bad parameter"; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: + return "bad system state"; + case MLX5_CMD_STAT_BAD_RES_ERR: + return "bad resource"; + case MLX5_CMD_STAT_RES_BUSY: + return "resource busy"; + case MLX5_CMD_STAT_LIM_ERR: + return "limits exceeded"; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: + return "bad resource state"; + case MLX5_CMD_STAT_IX_ERR: + return "bad index"; + case MLX5_CMD_STAT_NO_RES_ERR: + return "no resources"; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: + return "bad input length"; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: + return "bad output length"; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: + return "bad QP state"; + case MLX5_CMD_STAT_BAD_PKT_ERR: + return "bad packet (discarded)"; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: + return "bad size too many outstanding CQEs"; + default: + return "unknown status"; + } +} + +static int cmd_status_to_err(u8 status) +{ + switch (status) { + case MLX5_CMD_STAT_OK: return 0; + case MLX5_CMD_STAT_INT_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OP_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PARAM_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_RES_ERR: return -EINVAL; + case MLX5_CMD_STAT_RES_BUSY: return -EBUSY; + case MLX5_CMD_STAT_LIM_ERR: return -ENOMEM; + case MLX5_CMD_STAT_BAD_RES_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_IX_ERR: return -EINVAL; + case MLX5_CMD_STAT_NO_RES_ERR: return -EAGAIN; + case MLX5_CMD_STAT_BAD_INP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: return -EIO; + case MLX5_CMD_STAT_BAD_QP_STATE_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_PKT_ERR: return -EINVAL; + case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: return -EINVAL; + default: return -EIO; + } +} + +struct mlx5_ifc_mbox_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; 
+}; + +struct mlx5_ifc_mbox_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +void mlx5_cmd_out_err(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod, void *out) +{ + u32 syndrome = MLX5_GET(mbox_out, out, syndrome); + u8 status = MLX5_GET(mbox_out, out, status); + + mlx5_core_err_rl(dev, + "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", + mlx5_command_str(opcode), opcode, op_mod, + cmd_status_str(status), status, syndrome, cmd_status_to_err(status)); +} +EXPORT_SYMBOL(mlx5_cmd_out_err); + +static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out) +{ + u16 opcode, op_mod; + u32 syndrome; + u8 status; + u16 uid; + int err; + + syndrome = MLX5_GET(mbox_out, out, syndrome); + status = MLX5_GET(mbox_out, out, status); + + opcode = MLX5_GET(mbox_in, in, opcode); + op_mod = MLX5_GET(mbox_in, in, op_mod); + uid = MLX5_GET(mbox_in, in, uid); + + err = cmd_status_to_err(status); + + if (!uid && + opcode != MLX5_CMD_OP_DESTROY_MKEY && + !(opcode == MLX5_CMD_OP_ALLOC_MEMIC && status == MLX5_CMD_STAT_NO_RES_ERR)) + mlx5_cmd_out_err(dev, opcode, op_mod, out); + else + mlx5_core_dbg(dev, + "%s(0x%x) op_mod(0x%x) uid(%d) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", + mlx5_command_str(opcode), opcode, op_mod, uid, + cmd_status_str(status), status, syndrome, err); +} + +int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out) +{ + /* aborted due to PCI error or via reset flow mlx5_cmd_trigger_completions() */ + if (err == -ENXIO) { + u16 opcode = MLX5_GET(mbox_in, in, opcode); + u32 syndrome; + u8 status; + + /* PCI Error, emulate command return status, for smooth reset */ + err = mlx5_internal_err_ret_value(dev, opcode, &syndrome, &status); + MLX5_SET(mbox_out, out, status, status); + MLX5_SET(mbox_out, out, syndrome, syndrome); + if (!err) + return 0; + } + + /* driver or FW delivery error */ + if (err != -EREMOTEIO && err) + return err; + + /* check outbox status */ + err = cmd_status_to_err(MLX5_GET(mbox_out, out, status)); + if (err) + cmd_status_print(dev, in, out); + + return err; +} +EXPORT_SYMBOL(mlx5_cmd_check); + +static void dump_command(struct mlx5_core_dev *dev, + struct mlx5_cmd_work_ent *ent, int input) +{ + struct mlx5_cmd_msg *msg = input ? ent->in : ent->out; + u16 op = MLX5_GET(mbox_in, ent->lay->in, opcode); + struct mlx5_cmd_mailbox *next = msg->next; + int n = mlx5_calc_cmd_blocks(msg); + int data_only; + u32 offset = 0; + int dump_len; + int i; + + mlx5_core_dbg(dev, "cmd[%d]: start dump\n", ent->idx); + data_only = !!(mlx5_core_debug_mask & (1 << MLX5_CMD_DATA)); + + if (data_only) + mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_DATA, + "cmd[%d]: dump command data %s(0x%x) %s\n", + ent->idx, mlx5_command_str(op), op, + input ? "INPUT" : "OUTPUT"); + else + mlx5_core_dbg(dev, "cmd[%d]: dump command %s(0x%x) %s\n", + ent->idx, mlx5_command_str(op), op, + input ? 
"INPUT" : "OUTPUT"); + + if (data_only) { + if (input) { + dump_buf(ent->lay->in, sizeof(ent->lay->in), 1, offset, ent->idx); + offset += sizeof(ent->lay->in); + } else { + dump_buf(ent->lay->out, sizeof(ent->lay->out), 1, offset, ent->idx); + offset += sizeof(ent->lay->out); + } + } else { + dump_buf(ent->lay, sizeof(*ent->lay), 0, offset, ent->idx); + offset += sizeof(*ent->lay); + } + + for (i = 0; i < n && next; i++) { + if (data_only) { + dump_len = min_t(int, MLX5_CMD_DATA_BLOCK_SIZE, msg->len - offset); + dump_buf(next->buf, dump_len, 1, offset, ent->idx); + offset += MLX5_CMD_DATA_BLOCK_SIZE; + } else { + mlx5_core_dbg(dev, "cmd[%d]: command block:\n", ent->idx); + dump_buf(next->buf, sizeof(struct mlx5_cmd_prot_block), 0, offset, + ent->idx); + offset += sizeof(struct mlx5_cmd_prot_block); + } + next = next->next; + } + + if (data_only) + pr_debug("\n"); + + mlx5_core_dbg(dev, "cmd[%d]: end dump\n", ent->idx); +} + +static u16 msg_to_opcode(struct mlx5_cmd_msg *in) +{ + return MLX5_GET(mbox_in, in->first.data, opcode); +} + +static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, enum mlx5_comp_t comp_type); + +static void cb_timeout_handler(struct work_struct *work) +{ + struct delayed_work *dwork = container_of(work, struct delayed_work, + work); + struct mlx5_cmd_work_ent *ent = container_of(dwork, + struct mlx5_cmd_work_ent, + cb_timeout_work); + struct mlx5_core_dev *dev = container_of(ent->cmd, struct mlx5_core_dev, + cmd); + + mlx5_cmd_eq_recover(dev); + + /* Maybe got handled by eq recover ? */ + if (!test_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state)) { + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) Async, recovered after timeout\n", ent->idx, + mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); + goto out; /* phew, already handled */ + } + + ent->ret = -ETIMEDOUT; + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) Async, timeout. Will cause a leak of a command resource\n", + ent->idx, mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, MLX5_CMD_COMP_TYPE_FORCED); + +out: + cmd_ent_put(ent); /* for the cmd_ent_get() took on schedule delayed work */ +} + +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); +static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, + struct mlx5_cmd_msg *msg); + +static bool opcode_allowed(struct mlx5_cmd *cmd, u16 opcode) +{ + if (cmd->allowed_opcode == CMD_ALLOWED_OPCODE_ALL) + return true; + + return cmd->allowed_opcode == opcode; +} + +bool mlx5_cmd_is_down(struct mlx5_core_dev *dev) +{ + return pci_channel_offline(dev->pdev) || + dev->cmd.state != MLX5_CMDIF_STATE_UP || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR; +} + +static void cmd_work_handler(struct work_struct *work) +{ + struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); + struct mlx5_cmd *cmd = ent->cmd; + bool poll_cmd = ent->polling; + struct mlx5_cmd_layout *lay; + struct mlx5_core_dev *dev; + unsigned long cb_timeout; + struct semaphore *sem; + unsigned long flags; + int alloc_ret; + int cmd_mode; + + dev = container_of(cmd, struct mlx5_core_dev, cmd); + cb_timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD)); + + complete(&ent->handling); + sem = ent->page_queue ? 
&cmd->pages_sem : &cmd->sem; + down(sem); + if (!ent->page_queue) { + alloc_ret = cmd_alloc_index(cmd); + if (alloc_ret < 0) { + mlx5_core_err_rl(dev, "failed to allocate command entry\n"); + if (ent->callback) { + ent->callback(-EAGAIN, ent->context); + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + cmd_ent_put(ent); + } else { + ent->ret = -EAGAIN; + complete(&ent->done); + } + up(sem); + return; + } + ent->idx = alloc_ret; + } else { + ent->idx = cmd->max_reg_cmds; + spin_lock_irqsave(&cmd->alloc_lock, flags); + clear_bit(ent->idx, &cmd->bitmask); + spin_unlock_irqrestore(&cmd->alloc_lock, flags); + } + + cmd->ent_arr[ent->idx] = ent; + lay = get_inst(cmd, ent->idx); + ent->lay = lay; + memset(lay, 0, sizeof(*lay)); + memcpy(lay->in, ent->in->first.data, sizeof(lay->in)); + ent->op = be32_to_cpu(lay->in[0]) >> 16; + if (ent->in->next) + lay->in_ptr = cpu_to_be64(ent->in->next->dma); + lay->inlen = cpu_to_be32(ent->in->len); + if (ent->out->next) + lay->out_ptr = cpu_to_be64(ent->out->next->dma); + lay->outlen = cpu_to_be32(ent->out->len); + lay->type = MLX5_PCI_CMD_XPORT; + lay->token = ent->token; + lay->status_own = CMD_OWNER_HW; + set_signature(ent, !cmd->checksum_disabled); + dump_command(dev, ent, 1); + ent->ts1 = ktime_get_ns(); + cmd_mode = cmd->mode; + + if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout)) + cmd_ent_get(ent); + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); + + cmd_ent_get(ent); /* for the _real_ FW event on completion */ + /* Skip sending command to fw if internal error */ + if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, ent->op)) { + ent->ret = -ENXIO; + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, MLX5_CMD_COMP_TYPE_FORCED); + return; + } + + /* ring doorbell after the descriptor is valid */ + mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); + wmb(); + iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell); + /* if not in polling don't use ent after this point */ + if (cmd_mode == CMD_MODE_POLLING || poll_cmd) { + poll_timeout(ent); + /* make sure we read the descriptor after ownership is SW */ + rmb(); + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, ent->ret == -ETIMEDOUT ? 
+ MLX5_CMD_COMP_TYPE_FORCED : MLX5_CMD_COMP_TYPE_POLLING); + } +} + +static int status_to_err(u8 status) +{ + switch (status) { + case MLX5_CMD_DELIVERY_STAT_OK: + case MLX5_DRIVER_STATUS_ABORTED: + return 0; + case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR: + case MLX5_CMD_DELIVERY_STAT_TOK_ERR: + return -EBADR; + case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR: + case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR: + case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR: + return -EFAULT; /* Bad address */ + case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR: + case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR: + case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR: + case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR: + return -ENOMSG; + case MLX5_CMD_DELIVERY_STAT_FW_ERR: + return -EIO; + default: + return -EINVAL; + } +} + +static const char *deliv_status_to_str(u8 status) +{ + switch (status) { + case MLX5_CMD_DELIVERY_STAT_OK: + return "no errors"; + case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR: + return "signature error"; + case MLX5_CMD_DELIVERY_STAT_TOK_ERR: + return "token error"; + case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR: + return "bad block number"; + case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR: + return "output pointer not aligned to block size"; + case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR: + return "input pointer not aligned to block size"; + case MLX5_CMD_DELIVERY_STAT_FW_ERR: + return "firmware internal error"; + case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR: + return "command input length error"; + case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR: + return "command output length error"; + case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR: + return "reserved fields not cleared"; + case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR: + return "bad command descriptor type"; + default: + return "unknown status code"; + } +} + +enum { + MLX5_CMD_TIMEOUT_RECOVER_MSEC = 5 * 1000, +}; + +static void wait_func_handle_exec_timeout(struct mlx5_core_dev *dev, + struct mlx5_cmd_work_ent *ent) +{ + unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_RECOVER_MSEC); + + mlx5_cmd_eq_recover(dev); + + /* Re-wait on the ent->done after executing the recovery flow. If the + * recovery flow (or any other recovery flow running simultaneously) + * has recovered an EQE, it should cause the entry to be completed by + * the command interface. + */ + if (wait_for_completion_timeout(&ent->done, timeout)) { + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) recovered after timeout\n", ent->idx, + mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); + return; + } + + mlx5_core_warn(dev, "cmd[%d]: %s(0x%x) No done completion\n", ent->idx, + mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); + + ent->ret = -ETIMEDOUT; + mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, true); +} + +static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) +{ + unsigned long timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD)); + struct mlx5_cmd *cmd = &dev->cmd; + int err; + + if (!wait_for_completion_timeout(&ent->handling, timeout) && + cancel_work_sync(&ent->work)) { + ent->ret = -ECANCELED; + goto out_err; + } + if (cmd->mode == CMD_MODE_POLLING || ent->polling) + wait_for_completion(&ent->done); + else if (!wait_for_completion_timeout(&ent->done, timeout)) + wait_func_handle_exec_timeout(dev, ent); + +out_err: + err = ent->ret; + + if (err == -ETIMEDOUT) { + mlx5_core_warn(dev, "%s(0x%x) timeout. 
Will cause a leak of a command resource\n", + mlx5_command_str(msg_to_opcode(ent->in)), + msg_to_opcode(ent->in)); + } else if (err == -ECANCELED) { + mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n", + mlx5_command_str(msg_to_opcode(ent->in)), + msg_to_opcode(ent->in)); + } + mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n", + err, deliv_status_to_str(ent->status), ent->status); + + return err; +} + +/* Notes: + * 1. Callback functions may not sleep + * 2. page queue commands do not support asynchrous completion + */ +static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, + struct mlx5_cmd_msg *out, void *uout, int uout_size, + mlx5_cmd_cbk_t callback, + void *context, int page_queue, u8 *status, + u8 token, bool force_polling) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + struct mlx5_cmd_stats *stats; + int err = 0; + s64 ds; + u16 op; + + if (callback && page_queue) + return -EINVAL; + + ent = cmd_alloc_ent(cmd, in, out, uout, uout_size, + callback, context, page_queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + /* put for this ent is when consumed, depending on the use case + * 1) (!callback) blocking flow: by caller after wait_func completes + * 2) (callback) flow: by mlx5_cmd_comp_handler() when ent is handled + */ + + ent->token = token; + ent->polling = force_polling; + + init_completion(&ent->handling); + if (!callback) + init_completion(&ent->done); + + INIT_DELAYED_WORK(&ent->cb_timeout_work, cb_timeout_handler); + INIT_WORK(&ent->work, cmd_work_handler); + if (page_queue) { + cmd_work_handler(&ent->work); + } else if (!queue_work(cmd->wq, &ent->work)) { + mlx5_core_warn(dev, "failed to queue work\n"); + err = -ENOMEM; + goto out_free; + } + + if (callback) + goto out; /* mlx5_cmd_comp_handler() will put(ent) */ + + err = wait_func(dev, ent); + if (err == -ETIMEDOUT || err == -ECANCELED) + goto out_free; + + ds = ent->ts2 - ent->ts1; + op = MLX5_GET(mbox_in, in->first.data, opcode); + if (op < MLX5_CMD_OP_MAX) { + stats = &cmd->stats[op]; + spin_lock_irq(&stats->lock); + stats->sum += ds; + ++stats->n; + spin_unlock_irq(&stats->lock); + } + mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME, + "fw exec time for %s is %lld nsec\n", + mlx5_command_str(op), ds); + *status = ent->status; + +out_free: + cmd_ent_put(ent); +out: + return err; +} + +static ssize_t dbg_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char lbuf[3]; + int err; + + if (!dbg->in_msg || !dbg->out_msg) + return -ENOMEM; + + if (count < sizeof(lbuf) - 1) + return -EINVAL; + + if (copy_from_user(lbuf, buf, sizeof(lbuf) - 1)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = 0; + + if (strcmp(lbuf, "go")) + return -EINVAL; + + err = mlx5_cmd_exec(dev, dbg->in_msg, dbg->inlen, dbg->out_msg, dbg->outlen); + + return err ? 
err : count; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = dbg_write, +}; + +static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size, + u8 token) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_mailbox *next; + int copy; + + if (!to || !from) + return -ENOMEM; + + copy = min_t(int, size, sizeof(to->first.data)); + memcpy(to->first.data, from, copy); + size -= copy; + from += copy; + + next = to->next; + while (size) { + if (!next) { + /* this is a BUG */ + return -ENOMEM; + } + + copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); + block = next->buf; + memcpy(block->data, from, copy); + from += copy; + size -= copy; + block->token = token; + next = next->next; + } + + return 0; +} + +static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_mailbox *next; + int copy; + + if (!to || !from) + return -ENOMEM; + + copy = min_t(int, size, sizeof(from->first.data)); + memcpy(to, from->first.data, copy); + size -= copy; + to += copy; + + next = from->next; + while (size) { + if (!next) { + /* this is a BUG */ + return -ENOMEM; + } + + copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); + block = next->buf; + + memcpy(to, block->data, copy); + to += copy; + size -= copy; + next = next->next; + } + + return 0; +} + +static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, + gfp_t flags) +{ + struct mlx5_cmd_mailbox *mailbox; + + mailbox = kmalloc(sizeof(*mailbox), flags); + if (!mailbox) + return ERR_PTR(-ENOMEM); + + mailbox->buf = dma_pool_zalloc(dev->cmd.pool, flags, + &mailbox->dma); + if (!mailbox->buf) { + mlx5_core_dbg(dev, "failed allocation\n"); + kfree(mailbox); + return ERR_PTR(-ENOMEM); + } + mailbox->next = NULL; + + return mailbox; +} + +static void free_cmd_box(struct mlx5_core_dev *dev, + struct mlx5_cmd_mailbox *mailbox) +{ + dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + kfree(mailbox); +} + +static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev, + gfp_t flags, int size, + u8 token) +{ + struct mlx5_cmd_mailbox *tmp, *head = NULL; + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_msg *msg; + int err; + int n; + int i; + + msg = kzalloc(sizeof(*msg), flags); + if (!msg) + return ERR_PTR(-ENOMEM); + + msg->len = size; + n = mlx5_calc_cmd_blocks(msg); + + for (i = 0; i < n; i++) { + tmp = alloc_cmd_box(dev, flags); + if (IS_ERR(tmp)) { + mlx5_core_warn(dev, "failed allocating block\n"); + err = PTR_ERR(tmp); + goto err_alloc; + } + + block = tmp->buf; + tmp->next = head; + block->next = cpu_to_be64(tmp->next ? 
tmp->next->dma : 0); + block->block_num = cpu_to_be32(n - i - 1); + block->token = token; + head = tmp; + } + msg->next = head; + return msg; + +err_alloc: + while (head) { + tmp = head->next; + free_cmd_box(dev, head); + head = tmp; + } + kfree(msg); + + return ERR_PTR(err); +} + +static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, + struct mlx5_cmd_msg *msg) +{ + struct mlx5_cmd_mailbox *head = msg->next; + struct mlx5_cmd_mailbox *next; + + while (head) { + next = head->next; + free_cmd_box(dev, head); + head = next; + } + kfree(msg); +} + +static ssize_t data_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + void *ptr; + + if (*pos != 0) + return -EINVAL; + + kfree(dbg->in_msg); + dbg->in_msg = NULL; + dbg->inlen = 0; + ptr = memdup_user(buf, count); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + dbg->in_msg = ptr; + dbg->inlen = count; + + *pos = count; + + return count; +} + +static ssize_t data_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + + if (!dbg->out_msg) + return -ENOMEM; + + return simple_read_from_buffer(buf, count, pos, dbg->out_msg, + dbg->outlen); +} + +static const struct file_operations dfops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = data_write, + .read = data_read, +}; + +static ssize_t outlen_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char outlen[8]; + int err; + + err = snprintf(outlen, sizeof(outlen), "%d", dbg->outlen); + if (err < 0) + return err; + + return simple_read_from_buffer(buf, count, pos, outlen, err); +} + +static ssize_t outlen_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + char outlen_str[8] = {0}; + int outlen; + void *ptr; + int err; + + if (*pos != 0 || count > 6) + return -EINVAL; + + kfree(dbg->out_msg); + dbg->out_msg = NULL; + dbg->outlen = 0; + + if (copy_from_user(outlen_str, buf, count)) + return -EFAULT; + + err = sscanf(outlen_str, "%d", &outlen); + if (err < 0) + return err; + + ptr = kzalloc(outlen, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + dbg->out_msg = ptr; + dbg->outlen = outlen; + + *pos = count; + + return count; +} + +static const struct file_operations olfops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = outlen_write, + .read = outlen_read, +}; + +static void set_wqname(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + + snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s", + dev_name(dev->device)); +} + +static void clean_debug_files(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + + if (!mlx5_debugfs_root) + return; + + mlx5_cmdif_debugfs_cleanup(dev); + debugfs_remove_recursive(dbg->dbg_root); +} + +static void create_debugfs_files(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; + + dbg->dbg_root = debugfs_create_dir("cmd", mlx5_debugfs_get_dev_root(dev)); + + debugfs_create_file("in", 0400, dbg->dbg_root, dev, &dfops); + debugfs_create_file("out", 0200, dbg->dbg_root, dev, &dfops); + debugfs_create_file("out_len", 0600, dbg->dbg_root, dev, &olfops); + debugfs_create_u8("status", 0600, 
dbg->dbg_root, &dbg->status); + debugfs_create_file("run", 0200, dbg->dbg_root, dev, &fops); + + mlx5_cmdif_debugfs_init(dev); +} + +void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) + down(&cmd->sem); + down(&cmd->pages_sem); + + cmd->allowed_opcode = opcode; + + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + +static void mlx5_cmd_change_mod(struct mlx5_core_dev *dev, int mode) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) + down(&cmd->sem); + down(&cmd->pages_sem); + + cmd->mode = mode; + + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + +static int cmd_comp_notifier(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_core_dev *dev; + struct mlx5_cmd *cmd; + struct mlx5_eqe *eqe; + + cmd = mlx5_nb_cof(nb, struct mlx5_cmd, nb); + dev = container_of(cmd, struct mlx5_core_dev, cmd); + eqe = data; + + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), + MLX5_CMD_COMP_TYPE_EVENT); + + return NOTIFY_OK; +} +void mlx5_cmd_use_events(struct mlx5_core_dev *dev) +{ + MLX5_NB_INIT(&dev->cmd.nb, cmd_comp_notifier, CMD); + mlx5_eq_notifier_register(dev, &dev->cmd.nb); + mlx5_cmd_change_mod(dev, CMD_MODE_EVENTS); +} + +void mlx5_cmd_use_polling(struct mlx5_core_dev *dev) +{ + mlx5_cmd_change_mod(dev, CMD_MODE_POLLING); + mlx5_eq_notifier_unregister(dev, &dev->cmd.nb); +} + +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) +{ + unsigned long flags; + + if (msg->parent) { + spin_lock_irqsave(&msg->parent->lock, flags); + list_add_tail(&msg->list, &msg->parent->head); + msg->parent->free++; + spin_unlock_irqrestore(&msg->parent->lock, flags); + } else { + mlx5_free_cmd_msg(dev, msg); + } +} + +static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, enum mlx5_comp_t comp_type) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct mlx5_cmd_work_ent *ent; + mlx5_cmd_cbk_t callback; + void *context; + int err; + int i; + s64 ds; + struct mlx5_cmd_stats *stats; + unsigned long flags; + unsigned long vector; + + /* there can be at most 32 command queues */ + vector = vec & 0xffffffff; + for (i = 0; i < (1 << cmd->log_sz); i++) { + if (test_bit(i, &vector)) { + ent = cmd->ent_arr[i]; + + /* if we already completed the command, ignore it */ + if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, + &ent->state)) { + /* only real completion can free the cmd slot */ + if (comp_type == MLX5_CMD_COMP_TYPE_EVENT && !ent->polling) { + mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", + ent->idx); + cmd_ent_put(ent); + } + if ((vec & MLX5_TRIGGERED_CMD_COMP) && ent->ret == -ETIMEDOUT) + cmd_ent_put(ent); + if (comp_type != MLX5_CMD_COMP_TYPE_POLLING) + continue; + } else if (ent->polling && comp_type == MLX5_CMD_COMP_TYPE_EVENT) { + u16 opcode; + + opcode = msg_to_opcode(ent->in); + mlx5_core_err(dev, "Command polling got Event as first completion (entry idx = %d) %s.\n", + ent->idx, mlx5_command_str(opcode)); + continue; + } + + if (ent->callback && cancel_delayed_work(&ent->cb_timeout_work)) + cmd_ent_put(ent); /* timeout work was canceled */ + + if (comp_type != MLX5_CMD_COMP_TYPE_FORCED || /* Real FW completion */ + mlx5_cmd_is_down(dev) || /* No real FW completion is expected */ + !opcode_allowed(cmd, ent->op)) + cmd_ent_put(ent); + + ent->ts2 = ktime_get_ns(); + 
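/*
 * Editor's note (illustrative sketch, not part of the imported source): the
 * command interface starts in CMD_MODE_POLLING; only the core init/teardown
 * path is expected to flip it, roughly:
 *
 *	mlx5_cmd_use_events(dev);	// once the completion EQ exists:
 *					// register the CMD notifier, use EQEs
 *	...
 *	mlx5_cmd_use_polling(dev);	// before EQ teardown or on recovery
 *
 * mlx5_cmd_change_mod() and mlx5_cmd_allowed_opcode() above quiesce the
 * interface by taking every regular slot semaphore plus pages_sem, so no
 * command is in flight while the mode or the allowed opcode changes.
 */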
memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out)); + dump_command(dev, ent, 0); + + if (vec & MLX5_TRIGGERED_CMD_COMP) + ent->ret = -ENXIO; + + if (!ent->ret) { /* Command completed by FW */ + if (!cmd->checksum_disabled) + ent->ret = verify_signature(ent); + + ent->status = ent->lay->status_own >> 1; + + mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", + ent->ret, deliv_status_to_str(ent->status), ent->status); + } + + if (ent->callback) { + ds = ent->ts2 - ent->ts1; + if (ent->op < MLX5_CMD_OP_MAX) { + stats = &cmd->stats[ent->op]; + spin_lock_irqsave(&stats->lock, flags); + stats->sum += ds; + ++stats->n; + spin_unlock_irqrestore(&stats->lock, flags); + } + + callback = ent->callback; + context = ent->context; + err = ent->ret ? : ent->status; + if (err > 0) /* Failed in FW, command didn't execute */ + err = status_to_err(err); + + if (!err) + err = mlx5_copy_from_msg(ent->uout, + ent->out, + ent->uout_size); + + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + + /* final consumer is done, release ent */ + cmd_ent_put(ent); + callback(err, context); + } else { + /* release wait_func() so mlx5_cmd_invoke() + * can make the final ent_put() + */ + complete(&ent->done); + } + } + } +} + +void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + unsigned long bitmask; + unsigned long flags; + u64 vector; + int i; + + /* wait for pending handlers to complete */ + mlx5_eq_synchronize_cmd_irq(dev); + spin_lock_irqsave(&dev->cmd.alloc_lock, flags); + vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); + if (!vector) + goto no_trig; + + bitmask = vector; + /* we must increment the allocated entries refcount before triggering the completions + * to guarantee pending commands will not get freed in the meanwhile. + * For that reason, it also has to be done inside the alloc_lock. 
+ */ + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) + cmd_ent_get(cmd->ent_arr[i]); + vector |= MLX5_TRIGGERED_CMD_COMP; + spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); + + mlx5_core_dbg(dev, "vector 0x%llx\n", vector); + mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_COMP_TYPE_FORCED); + for_each_set_bit(i, &bitmask, (1 << cmd->log_sz)) + cmd_ent_put(cmd->ent_arr[i]); + return; + +no_trig: + spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); +} + +void mlx5_cmd_flush(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + int i; + + for (i = 0; i < cmd->max_reg_cmds; i++) { + while (down_trylock(&cmd->sem)) { + mlx5_cmd_trigger_completions(dev); + cond_resched(); + } + } + + while (down_trylock(&cmd->pages_sem)) { + mlx5_cmd_trigger_completions(dev); + cond_resched(); + } + + /* Unlock cmdif */ + up(&cmd->pages_sem); + for (i = 0; i < cmd->max_reg_cmds; i++) + up(&cmd->sem); +} + +static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size, + gfp_t gfp) +{ + struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM); + struct cmd_msg_cache *ch = NULL; + struct mlx5_cmd *cmd = &dev->cmd; + bool total_accounted = false; + bool miss_accounted = false; + int i; + + if (in_size <= 16) + goto cache_miss; + + for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) { + ch = &cmd->cache[i]; + if (in_size > ch->max_inbox_size) + continue; + spin_lock_irq(&ch->lock); + if (!total_accounted) { + ch->total_commands++; + total_accounted = true; + } + if (list_empty(&ch->head)) { + if (!miss_accounted) { + ch->miss++; + miss_accounted = true; + } + spin_unlock_irq(&ch->lock); + continue; + } + msg = list_entry(ch->head.next, typeof(*msg), list); + /* For cached lists, we must explicitly state what is + * the real size + */ + msg->len = in_size; + list_del(&msg->list); + ch->free--; + spin_unlock_irq(&ch->lock); + break; + } + + if (!IS_ERR(msg)) + return msg; + +cache_miss: + if (in_size > 16) + atomic_inc(&cmd->real_miss); + msg = mlx5_alloc_cmd_msg(dev, gfp, in_size, 0); + return msg; +} + +static int is_manage_pages(void *in) +{ + return MLX5_GET(mbox_in, in, opcode) == MLX5_CMD_OP_MANAGE_PAGES; +} + +static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size, mlx5_cmd_cbk_t callback, void *context, + bool force_polling) +{ + struct mlx5_cmd_msg *inb; + struct mlx5_cmd_msg *outb; + int pages_queue; + gfp_t gfp; + int err; + u8 status = 0; + u16 opcode; + u8 token; + + opcode = MLX5_GET(mbox_in, in, opcode); + if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, opcode)) + return -ENXIO; + + pages_queue = is_manage_pages(in); + gfp = callback ? 
GFP_ATOMIC : GFP_KERNEL; + + inb = alloc_msg(dev, in_size, gfp); + if (IS_ERR(inb)) { + err = PTR_ERR(inb); + return err; + } + + token = alloc_token(&dev->cmd); + + err = mlx5_copy_to_msg(inb, in, in_size, token); + if (err) { + mlx5_core_warn(dev, "err %d\n", err); + goto out_in; + } + + outb = mlx5_alloc_cmd_msg(dev, gfp, out_size, token); + if (IS_ERR(outb)) { + err = PTR_ERR(outb); + goto out_in; + } + + err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context, + pages_queue, &status, token, force_polling); + if (err) + goto out_out; + + mlx5_core_dbg(dev, "err %d, status %d\n", err, status); + if (status) { + err = status_to_err(status); + goto out_out; + } + + if (!callback) + err = mlx5_copy_from_msg(out, outb, out_size); + +out_out: + if (!callback) + mlx5_free_cmd_msg(dev, outb); + +out_in: + if (!callback) + free_msg(dev, inb); + return err; +} + +static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, int err) +{ + struct mlx5_cmd_stats *stats; + + if (!err || mlx5_cmd_is_down(dev)) + return; + + stats = &dev->cmd.stats[opcode]; + spin_lock_irq(&stats->lock); + stats->failed++; + if (err < 0) + stats->last_failed_errno = -err; + if (err == -EREMOTEIO) { + stats->failed_mbox_status++; + stats->last_failed_mbox_status = status; + } + spin_unlock_irq(&stats->lock); +} + +/* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */ +static int cmd_status_err(struct mlx5_core_dev *dev, int err, u16 opcode, void *out) +{ + u8 status = MLX5_GET(mbox_out, out, status); + + if (err == -EREMOTEIO) /* -EREMOTEIO is preserved */ + err = -EIO; + + if (!err && status != MLX5_CMD_STAT_OK) + err = -EREMOTEIO; + + cmd_status_log(dev, opcode, status, err); + return err; +} + +/** + * mlx5_cmd_do - Executes a fw command, wait for completion. + * Unlike mlx5_cmd_exec, this function will not translate or intercept + * outbox.status and will return -EREMOTEIO when + * outbox.status != MLX5_CMD_STAT_OK + * + * @dev: mlx5 core device + * @in: inbox mlx5_ifc command buffer + * @in_size: inbox buffer size + * @out: outbox mlx5_ifc buffer + * @out_size: outbox size + * + * @return: + * -EREMOTEIO : Command executed by FW, outbox.status != MLX5_CMD_STAT_OK. + * Caller must check FW outbox status. + * 0 : Command execution successful, outbox.status == MLX5_CMD_STAT_OK. + * < 0 : Command execution couldn't be performed by firmware or driver + */ +int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) +{ + int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); + u16 opcode = MLX5_GET(mbox_in, in, opcode); + + err = cmd_status_err(dev, err, opcode, out); + return err; +} +EXPORT_SYMBOL(mlx5_cmd_do); + +/** + * mlx5_cmd_exec - Executes a fw command, wait for completion + * + * @dev: mlx5 core device + * @in: inbox mlx5_ifc command buffer + * @in_size: inbox buffer size + * @out: outbox mlx5_ifc buffer + * @out_size: outbox size + * + * @return: 0 if no error, FW command execution was successful + * and outbox status is ok. 
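 *
 * Illustrative call pattern (editor's sketch; ENABLE_HCA is only an example
 * opcode, and the mlx5_ifc accessors are assumed from the driver headers):
 *
 *	u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {};
 *	u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {};
 *	int err;
 *
 *	MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA);
 *	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));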
+ */ +int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size) +{ + int err = mlx5_cmd_do(dev, in, in_size, out, out_size); + + return mlx5_cmd_check(dev, err, in, out); +} +EXPORT_SYMBOL(mlx5_cmd_exec); + +/** + * mlx5_cmd_exec_polling - Executes a fw command, poll for completion + * Needed for driver force teardown, when command completion EQ + * will not be available to complete the command + * + * @dev: mlx5 core device + * @in: inbox mlx5_ifc command buffer + * @in_size: inbox buffer size + * @out: outbox mlx5_ifc buffer + * @out_size: outbox size + * + * @return: 0 if no error, FW command execution was successful + * and outbox status is ok. + */ +int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size) +{ + int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); + u16 opcode = MLX5_GET(mbox_in, in, opcode); + + err = cmd_status_err(dev, err, opcode, out); + return mlx5_cmd_check(dev, err, in, out); +} +EXPORT_SYMBOL(mlx5_cmd_exec_polling); + +void mlx5_cmd_init_async_ctx(struct mlx5_core_dev *dev, + struct mlx5_async_ctx *ctx) +{ + ctx->dev = dev; + /* Starts at 1 to avoid doing wake_up if we are not cleaning up */ + atomic_set(&ctx->num_inflight, 1); + init_waitqueue_head(&ctx->wait); +} +EXPORT_SYMBOL(mlx5_cmd_init_async_ctx); + +/** + * mlx5_cmd_cleanup_async_ctx - Clean up an async_ctx + * @ctx: The ctx to clean + * + * Upon return all callbacks given to mlx5_cmd_exec_cb() have been called. The + * caller must ensure that mlx5_cmd_exec_cb() is not called during or after + * the call mlx5_cleanup_async_ctx(). + */ +void mlx5_cmd_cleanup_async_ctx(struct mlx5_async_ctx *ctx) +{ + atomic_dec(&ctx->num_inflight); + wait_event(ctx->wait, atomic_read(&ctx->num_inflight) == 0); +} +EXPORT_SYMBOL(mlx5_cmd_cleanup_async_ctx); + +static void mlx5_cmd_exec_cb_handler(int status, void *_work) +{ + struct mlx5_async_work *work = _work; + struct mlx5_async_ctx *ctx; + + ctx = work->ctx; + status = cmd_status_err(ctx->dev, status, work->opcode, work->out); + work->user_callback(status, work); + if (atomic_dec_and_test(&ctx->num_inflight)) + wake_up(&ctx->wait); +} + +int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, + void *out, int out_size, mlx5_async_cbk_t callback, + struct mlx5_async_work *work) +{ + int ret; + + work->ctx = ctx; + work->user_callback = callback; + work->opcode = MLX5_GET(mbox_in, in, opcode); + work->out = out; + if (WARN_ON(!atomic_inc_not_zero(&ctx->num_inflight))) + return -EIO; + ret = cmd_exec(ctx->dev, in, in_size, out, out_size, + mlx5_cmd_exec_cb_handler, work, false); + if (ret && atomic_dec_and_test(&ctx->num_inflight)) + wake_up(&ctx->wait); + + return ret; +} +EXPORT_SYMBOL(mlx5_cmd_exec_cb); + +static void destroy_msg_cache(struct mlx5_core_dev *dev) +{ + struct cmd_msg_cache *ch; + struct mlx5_cmd_msg *msg; + struct mlx5_cmd_msg *n; + int i; + + if (mlx5_core_is_sf(dev)) + return; + + for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) { + ch = &dev->cmd.cache[i]; + list_for_each_entry_safe(msg, n, &ch->head, list) { + list_del(&msg->list); + ch->free--; + mlx5_free_cmd_msg(dev, msg); + } + } + + cmd_sysfs_cleanup(dev); +} + +static unsigned cmd_cache_num_ent[MLX5_NUM_COMMAND_CACHES] = { + 512, 32, 16, 8, 2 +}; + +static unsigned cmd_cache_ent_size[MLX5_NUM_COMMAND_CACHES] = { + 16 + MLX5_CMD_DATA_BLOCK_SIZE, + 16 + MLX5_CMD_DATA_BLOCK_SIZE * 2, + 16 + MLX5_CMD_DATA_BLOCK_SIZE * 16, + 16 + MLX5_CMD_DATA_BLOCK_SIZE * 256, + 16 + 
MLX5_CMD_DATA_BLOCK_SIZE * 512, +}; + +static void create_msg_cache(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct cmd_msg_cache *ch; + struct mlx5_cmd_msg *msg; + int i; + int k; + + if (mlx5_core_is_sf(dev)) + return; + + /* Initialize and fill the caches with initial entries */ + for (k = 0; k < MLX5_NUM_COMMAND_CACHES; k++) { + ch = &cmd->cache[k]; + spin_lock_init(&ch->lock); + INIT_LIST_HEAD(&ch->head); + ch->num_ent = cmd_cache_num_ent[k]; + ch->max_inbox_size = cmd_cache_ent_size[k]; + ch->miss = 0; + ch->total_commands = 0; + ch->free = 0; + for (i = 0; i < ch->num_ent; i++) { + msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL | __GFP_NOWARN, + ch->max_inbox_size, 0); + if (IS_ERR(msg)) + break; + msg->parent = ch; + ch->free++; + list_add_tail(&msg->list, &ch->head); + } + } + + cmd_sysfs_init(dev); +} + +static int alloc_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd) +{ + cmd->cmd_alloc_buf = dma_alloc_coherent(mlx5_core_dma_dev(dev), MLX5_ADAPTER_PAGE_SIZE, + &cmd->alloc_dma, GFP_KERNEL); + if (!cmd->cmd_alloc_buf) + return -ENOMEM; + + /* make sure it is aligned to 4K */ + if (!((uintptr_t)cmd->cmd_alloc_buf & (MLX5_ADAPTER_PAGE_SIZE - 1))) { + cmd->cmd_buf = cmd->cmd_alloc_buf; + cmd->dma = cmd->alloc_dma; + cmd->alloc_size = MLX5_ADAPTER_PAGE_SIZE; + return 0; + } + + dma_free_coherent(mlx5_core_dma_dev(dev), MLX5_ADAPTER_PAGE_SIZE, cmd->cmd_alloc_buf, + cmd->alloc_dma); + cmd->cmd_alloc_buf = dma_alloc_coherent(mlx5_core_dma_dev(dev), + 2 * MLX5_ADAPTER_PAGE_SIZE - 1, + &cmd->alloc_dma, GFP_KERNEL); + if (!cmd->cmd_alloc_buf) + return -ENOMEM; + + cmd->cmd_buf = PTR_ALIGN(cmd->cmd_alloc_buf, MLX5_ADAPTER_PAGE_SIZE); + cmd->dma = ALIGN(cmd->alloc_dma, MLX5_ADAPTER_PAGE_SIZE); + cmd->alloc_size = 2 * MLX5_ADAPTER_PAGE_SIZE - 1; + return 0; +} + +static void free_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd) +{ + dma_free_coherent(mlx5_core_dma_dev(dev), cmd->alloc_size, cmd->cmd_alloc_buf, + cmd->alloc_dma); +} + +static u16 cmdif_rev(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; +} + +int mlx5_cmd_init(struct mlx5_core_dev *dev) +{ + int size = sizeof(struct mlx5_cmd_prot_block); + int align = roundup_pow_of_two(size); + struct mlx5_cmd *cmd = &dev->cmd; + u32 cmd_h, cmd_l; + u16 cmd_if_rev; + int err; + int i; + + memset(cmd, 0, sizeof(*cmd)); + cmd_if_rev = cmdif_rev(dev); + if (cmd_if_rev != CMD_IF_REV) { + mlx5_core_err(dev, + "Driver cmdif rev(%d) differs from firmware's(%d)\n", + CMD_IF_REV, cmd_if_rev); + return -EINVAL; + } + + cmd->stats = kvcalloc(MLX5_CMD_OP_MAX, sizeof(*cmd->stats), GFP_KERNEL); + if (!cmd->stats) + return -ENOMEM; + + cmd->pool = dma_pool_create("mlx5_cmd", mlx5_core_dma_dev(dev), size, align, 0); + if (!cmd->pool) { + err = -ENOMEM; + goto dma_pool_err; + } + + err = alloc_cmd_page(dev, cmd); + if (err) + goto err_free_pool; + + cmd_l = ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0xff; + cmd->log_sz = cmd_l >> 4 & 0xf; + cmd->log_stride = cmd_l & 0xf; + if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) { + mlx5_core_err(dev, "firmware reports too many outstanding commands %d\n", + 1 << cmd->log_sz); + err = -EINVAL; + goto err_free_page; + } + + if (cmd->log_sz + cmd->log_stride > MLX5_ADAPTER_PAGE_SHIFT) { + mlx5_core_err(dev, "command queue size overflow\n"); + err = -EINVAL; + goto err_free_page; + } + + cmd->state = MLX5_CMDIF_STATE_DOWN; + cmd->checksum_disabled = 1; + cmd->max_reg_cmds = (1 << cmd->log_sz) - 1; + cmd->bitmask = (1UL << cmd->max_reg_cmds) - 1; + + 
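/*
 * Editor's note (worked example with assumed firmware values, not from the
 * source): if cmdq_addr_l_sz reported log_sz = 5 and log_stride = 6, the
 * checks above pass because 1 << 5 = 32 <= MLX5_MAX_COMMANDS and
 * 5 + 6 = 11 <= MLX5_ADAPTER_PAGE_SHIFT, and the geometry works out to:
 *
 *	entries      = 1 << log_sz;		// 32 descriptors
 *	entry_bytes  = 1 << log_stride;		// 64 bytes each, 2 KB total
 *	max_reg_cmds = entries - 1;		// 31 general slots in cmd->bitmask
 *
 * with the last slot (index max_reg_cmds) reserved for the MANAGE_PAGES
 * queue behind cmd->pages_sem, as cmd_work_handler() above shows.
 */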
cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16; + if (cmd->cmdif_rev > CMD_IF_REV) { + mlx5_core_err(dev, "driver does not support command interface version. driver %d, firmware %d\n", + CMD_IF_REV, cmd->cmdif_rev); + err = -EOPNOTSUPP; + goto err_free_page; + } + + spin_lock_init(&cmd->alloc_lock); + spin_lock_init(&cmd->token_lock); + for (i = 0; i < MLX5_CMD_OP_MAX; i++) + spin_lock_init(&cmd->stats[i].lock); + + sema_init(&cmd->sem, cmd->max_reg_cmds); + sema_init(&cmd->pages_sem, 1); + + cmd_h = (u32)((u64)(cmd->dma) >> 32); + cmd_l = (u32)(cmd->dma); + if (cmd_l & 0xfff) { + mlx5_core_err(dev, "invalid command queue address\n"); + err = -ENOMEM; + goto err_free_page; + } + + iowrite32be(cmd_h, &dev->iseg->cmdq_addr_h); + iowrite32be(cmd_l, &dev->iseg->cmdq_addr_l_sz); + + /* Make sure firmware sees the complete address before we proceed */ + wmb(); + + mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma)); + + cmd->mode = CMD_MODE_POLLING; + cmd->allowed_opcode = CMD_ALLOWED_OPCODE_ALL; + + create_msg_cache(dev); + + set_wqname(dev); + cmd->wq = create_singlethread_workqueue(cmd->wq_name); + if (!cmd->wq) { + mlx5_core_err(dev, "failed to create command workqueue\n"); + err = -ENOMEM; + goto err_cache; + } + + create_debugfs_files(dev); + + return 0; + +err_cache: + destroy_msg_cache(dev); + +err_free_page: + free_cmd_page(dev, cmd); + +err_free_pool: + dma_pool_destroy(cmd->pool); +dma_pool_err: + kvfree(cmd->stats); + return err; +} + +void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + + clean_debug_files(dev); + destroy_workqueue(cmd->wq); + destroy_msg_cache(dev); + free_cmd_page(dev, cmd); + dma_pool_destroy(cmd->pool); + kvfree(cmd->stats); +} + +void mlx5_cmd_set_state(struct mlx5_core_dev *dev, + enum mlx5_cmdif_state cmdif_state) +{ + dev->cmd.state = cmdif_state; +} + +struct cmd_cache_attribute { + struct attribute attr; + ssize_t (*show)(struct cmd_msg_cache *, + struct cmd_cache_attribute *, char *buf); + ssize_t (*store)(struct cmd_msg_cache *, struct cmd_cache_attribute *, + const char *buf, size_t count); +}; + +static ssize_t free_show(struct cmd_msg_cache *ch, + struct cmd_cache_attribute *ca, + char *buf) +{ + return snprintf(buf, 20, "%d\n", ch->free); +} + +static ssize_t num_ent_show(struct cmd_msg_cache *ch, + struct cmd_cache_attribute *ca, + char *buf) +{ + return snprintf(buf, 20, "%d\n", ch->num_ent); +} + +static ssize_t num_ent_store(struct cmd_msg_cache *ch, + struct cmd_cache_attribute *ca, + const char *buf, size_t count) +{ + struct mlx5_cmd_msg *msg; + struct mlx5_cmd_msg *n; + LIST_HEAD(remove_list); + LIST_HEAD(add_list); + unsigned long flags; + int err = count; + int add = 0; + int remove; + u32 var; + int i; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + spin_lock_irqsave(&ch->lock, flags); + if (var < ch->num_ent) { + remove = ch->num_ent - var; + for (i = 0; i < remove; i++) { + if (!list_empty(&ch->head)) { + msg = list_entry(ch->head.next, typeof(*msg), list); + list_del(&msg->list); + list_add(&msg->list, &remove_list); + ch->free--; + ch->num_ent--; + } else { + err = -EBUSY; + break; + } + } + } else if (var > ch->num_ent) { + add = var - ch->num_ent; + } + spin_unlock_irqrestore(&ch->lock, flags); + + list_for_each_entry_safe(msg, n, &remove_list, list) { + list_del(&msg->list); + mlx5_free_cmd_msg(ch->dev, msg); + } + + for (i = 0; i < add; i++) { + msg = mlx5_alloc_cmd_msg(ch->dev, GFP_KERNEL, ch->max_inbox_size, 0); + if (IS_ERR(msg)) { + err = 
PTR_ERR(msg); + if (i) + pr_warn("could add only %d entries\n", i); + break; + } + list_add(&msg->list, &add_list); + } + + spin_lock_irqsave(&ch->lock, flags); + list_for_each_entry_safe(msg, n, &add_list, list) { + list_del(&msg->list); + list_add_tail(&msg->list, &ch->head); + ch->num_ent++; + ch->free++; + msg->parent = ch; + } + spin_unlock_irqrestore(&ch->lock, flags); + + return err; +} + +static ssize_t miss_store(struct cmd_msg_cache *ch, + struct cmd_cache_attribute *ca, + const char *buf, size_t count) +{ + unsigned long flags; + u32 var; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var) { + pr_warn("you may only clear the miss value\n"); + return -EINVAL; + } + + spin_lock_irqsave(&ch->lock, flags); + ch->miss = 0; + spin_unlock_irqrestore(&ch->lock, flags); + + return count; +} + +static ssize_t miss_show(struct cmd_msg_cache *ch, + struct cmd_cache_attribute *ca, + char *buf) +{ + return snprintf(buf, 20, "%d\n", ch->miss); +} + +static ssize_t total_commands_show(struct cmd_msg_cache *ch, + struct cmd_cache_attribute *ca, + char *buf) +{ + return snprintf(buf, 20, "%d\n", ch->total_commands); +} + +static ssize_t total_commands_store(struct cmd_msg_cache *ch, + struct cmd_cache_attribute *ca, + const char *buf, size_t count) +{ + unsigned long flags; + u32 var; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var) { + pr_warn("you may only clear the total_commands value\n"); + return -EINVAL; + } + + spin_lock_irqsave(&ch->lock, flags); + ch->total_commands = 0; + spin_unlock_irqrestore(&ch->lock, flags); + + return count; +} + +static ssize_t cmd_cache_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct cmd_cache_attribute *ca = + container_of(attr, struct cmd_cache_attribute, attr); + struct cmd_msg_cache *ch = container_of(kobj, struct cmd_msg_cache, kobj); + + if (!ca->show) + return -EIO; + + return ca->show(ch, ca, buf); +} + +static ssize_t cmd_cache_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t size) +{ + struct cmd_cache_attribute *ca = + container_of(attr, struct cmd_cache_attribute, attr); + struct cmd_msg_cache *ch = container_of(kobj, struct cmd_msg_cache, kobj); + + if (!ca->store) + return -EIO; + + return ca->store(ch, ca, buf, size); +} + +static ssize_t real_miss_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); + struct mlx5_core_dev *cdev = pci_get_drvdata(pdev); + + return snprintf(buf, 20, "%d\n", atomic_read(&cdev->cmd.real_miss)); +} + +static ssize_t real_miss_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); + struct mlx5_core_dev *cdev = pci_get_drvdata(pdev); + u32 var; + + if (kstrtouint(buf, 0, &var)) + return -EINVAL; + + if (var) { + pr_warn("you may only clear this value\n"); + return -EINVAL; + } + + atomic_set(&cdev->cmd.real_miss, 0); + + return count; +} + +static const struct sysfs_ops cmd_cache_sysfs_ops = { + .show = cmd_cache_attr_show, + .store = cmd_cache_attr_store, +}; + +#define CMD_CACHE_ATTR(_name) struct cmd_cache_attribute cmd_cache_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) +#define CMD_CACHE_ATTR_RO(_name) struct cmd_cache_attribute cmd_cache_attr_##_name = \ + __ATTR(_name, 0444, _name##_show, NULL) + +static CMD_CACHE_ATTR_RO(free); +static CMD_CACHE_ATTR(num_ent); +static CMD_CACHE_ATTR(miss); +static 
CMD_CACHE_ATTR(total_commands); + +struct cache_pdev_attr { + struct attribute attr; + struct mlx5_core_dev *dev; + struct kobject kobj; +}; + +static struct attribute *cmd_cache_default_attrs[] = { + &cmd_cache_attr_free.attr, + &cmd_cache_attr_num_ent.attr, + &cmd_cache_attr_miss.attr, + &cmd_cache_attr_total_commands.attr, + NULL +}; + +static struct kobj_type cmd_cache_type = { + .sysfs_ops = &cmd_cache_sysfs_ops, + .default_attrs = cmd_cache_default_attrs +}; + +static DEVICE_ATTR(real_miss, 0600, real_miss_show, real_miss_store); + +static int cmd_sysfs_init(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd *cmd = &dev->cmd; + struct cmd_msg_cache *cache = cmd->cache; + struct cmd_msg_cache *ch; + struct device *class_dev = dev->device; + int err; + int i; + + cmd->ko = kobject_create_and_add("commands_cache", &class_dev->kobj); + if (!cmd->ko) + return -ENOMEM; + + err = device_create_file(class_dev, &dev_attr_real_miss); + if (err) + goto err_rm; + + for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) { + ch = &cache[i]; + err = kobject_init_and_add(&ch->kobj, &cmd_cache_type, + cmd->ko, "%d", cmd_cache_ent_size[i]); + if (err) + goto err_put; + ch->dev = dev; + kobject_uevent(&ch->kobj, KOBJ_ADD); + } + + return 0; + +err_put: + device_remove_file(class_dev, &dev_attr_real_miss); + for (; i >= 0; i--) { + ch = &cache[i]; + kobject_put(&ch->kobj); + } + +err_rm: + kobject_put(cmd->ko); + return err; +} + +static void cmd_sysfs_cleanup(struct mlx5_core_dev *dev) +{ + struct device *class_dev = dev->device; + struct cmd_msg_cache *ch; + int i; + + device_remove_file(class_dev, &dev_attr_real_miss); + for (i = MLX5_NUM_COMMAND_CACHES - 1; i >= 0; i--) { + ch = &dev->cmd.cache[i]; + if (ch->dev) + kobject_put(&ch->kobj); + } + if (dev->cmd.ko) { + kobject_put(dev->cmd.ko); + dev->cmd.ko = NULL; + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.c new file mode 100644 index 0000000..815fa04 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include +#include "devlink.h" +#include "eswitch.h" +#include "en.h" +#include "en_rep.h" +#include "en/rep/tc.h" + +#ifdef CONFIG_MLX5_ESWITCH +#if defined(HAVE_SWITCHDEV_OPS) || defined(HAVE_SWITCHDEV_H_COMPAT) +int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr) +{ + int err = 0; + + if (!netif_device_present(dev)) + return -EOPNOTSUPP; + + switch (attr->id) { +#ifndef HAVE_NDO_GET_PORT_PARENT_ID + case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: + err = mlx5e_rep_get_port_parent_id(dev, &attr->u.ppid); + break; +#endif + default: + return -EOPNOTSUPP; + } + + return err; +} +#endif + +#ifdef HAVE_SWITCHDEV_H_COMPAT +static inline int dev_isalive(const struct net_device *dev) +{ + return dev->reg_state <= NETREG_REGISTERED; +} + +static ssize_t phys_port_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + char name[IFNAMSIZ]; + + ret = mlx5e_rep_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sprintf(buf, "%s\n", name); + } + rtnl_unlock(); + + return ret; +} + +ssize_t phys_switch_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) { + struct switchdev_attr attr = { + .orig_dev = netdev, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + + ret = mlx5e_attr_get(netdev, &attr); + if (!ret) + ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len, + attr.u.ppid.id); + } + rtnl_unlock(); + + return ret; +} + +static DEVICE_ATTR(phys_port_name, S_IRUGO, phys_port_name_show, NULL); +static DEVICE_ATTR(phys_switch_id, S_IRUGO, phys_switch_id_show, NULL); + +static struct attribute *rep_sysfs_attrs[] = { + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, + NULL, +}; + +static struct attribute_group rep_sysfs_attr_group = { + .attrs = rep_sysfs_attrs, +}; +#endif /* HAVE_SWITCHDEV_H_COMPAT */ + +void mlx5e_rep_set_sysfs_attr(struct net_device *netdev) +{ + if (!netdev) + return; + +#ifdef HAVE_SWITCHDEV_H_COMPAT + if (!netdev->sysfs_groups[0]) + netdev->sysfs_groups[0] = &rep_sysfs_attr_group; +#endif +} + +int mlx5e_vport_rep_load_compat(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) && defined(HAVE_TC_SETUP_CB_EGDEV_REGISTER) + struct mlx5e_rep_priv *uplink_rpriv; +#ifdef HAVE_TC_BLOCK_OFFLOAD + struct mlx5e_priv *upriv; +#endif + int err; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch, + REP_ETH); +#ifdef HAVE_TC_BLOCK_OFFLOAD + upriv = netdev_priv(uplink_rpriv->netdev); + err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb_egdev, + upriv); +#else + err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb, + uplink_rpriv->netdev); +#endif + if (err) + return err; +#endif + + mlx5e_rep_set_sysfs_attr(netdev); + return 0; +} + +void mlx5e_vport_rep_unload_compat(struct mlx5e_priv *priv) +{ +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) && defined(HAVE_TC_SETUP_CB_EGDEV_REGISTER) + struct net_device *netdev = priv->netdev; + struct mlx5e_rep_priv *uplink_rpriv; +#ifdef HAVE_TC_BLOCK_OFFLOAD + struct mlx5e_priv *upriv; +#endif + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch, + REP_ETH); +#ifdef HAVE_TC_BLOCK_OFFLOAD + upriv = 
netdev_priv(uplink_rpriv->netdev); + tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev, + upriv); +#else + tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb, + uplink_rpriv->netdev); +#endif + +#endif +} +#endif /* CONFIG_MLX5_ESWITCH */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.h new file mode 100644 index 0000000..b43221d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/compat.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies */ + +#ifndef __MLX5_COMPAT__ +#define __MLX5_COMPAT__ + +#ifdef CONFIG_MLX5_ESWITCH +#if defined(HAVE_SWITCHDEV_OPS) || defined(HAVE_SWITCHDEV_H_COMPAT) +int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr); +#endif +void mlx5e_rep_set_sysfs_attr(struct net_device *netdev); +int mlx5e_vport_rep_load_compat(struct mlx5e_priv *priv); +int mlx5e_vport_rep_unload_compat(struct mlx5e_priv *priv); +#else +void mlx5e_rep_set_sysfs_attr(struct net_device *netdev) {} +#endif /* CONFIG_MLX5_ESWITCH */ + +#endif /* __MLX5_COMPAT__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cq.c new file mode 100644 index 0000000..0bf2dd5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "lib/eq.h" + +#define TASKLET_MAX_TIME 2 +#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME) + +void mlx5_cq_tasklet_cb(struct tasklet_struct *t) +{ + unsigned long flags; + unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES; + struct mlx5_eq_tasklet *ctx = from_tasklet(ctx, t, task); + struct mlx5_core_cq *mcq; + struct mlx5_core_cq *temp; + + spin_lock_irqsave(&ctx->lock, flags); + list_splice_tail_init(&ctx->list, &ctx->process_list); + spin_unlock_irqrestore(&ctx->lock, flags); + + list_for_each_entry_safe(mcq, temp, &ctx->process_list, + tasklet_ctx.list) { + list_del_init(&mcq->tasklet_ctx.list); + if (mcq->tasklet_ctx.comp) + mcq->tasklet_ctx.comp(mcq, NULL); + mlx5_cq_put(mcq); + if (time_after(jiffies, end)) + break; + } + + if (!list_empty(&ctx->process_list)) + tasklet_schedule(&ctx->task); +} + +static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, + struct mlx5_eqe *eqe) +{ + unsigned long flags; + struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; + + spin_lock_irqsave(&tasklet_ctx->lock, flags); + /* When migrating CQs between EQs will be implemented, please note + * that you need to sync this point. It is possible that + * while migrating a CQ, completions on the old EQs could + * still arrive. + */ + if (list_empty_careful(&cq->tasklet_ctx.list)) { + mlx5_cq_hold(cq); + list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list); + } + spin_unlock_irqrestore(&tasklet_ctx->lock, flags); +} + +/* Callers must verify outbox status in case of err */ +int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen, u32 *out, int outlen) +{ + int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), + c_eqn_or_apu_element); + u32 din[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; + struct mlx5_eq_comp *eq; + int err; + + eq = mlx5_eqn2comp_eq(dev, eqn); + if (IS_ERR(eq)) + return PTR_ERR(eq); + + memset(out, 0, outlen); + MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ); + err = mlx5_cmd_do(dev, in, inlen, out, outlen); + if (err) + return err; + + cq->cqn = MLX5_GET(create_cq_out, out, cqn); + cq->cons_index = 0; + cq->arm_sn = 0; + cq->eq = eq; + cq->uid = MLX5_GET(create_cq_in, in, uid); + refcount_set(&cq->refcount, 1); + init_completion(&cq->free); + if (!cq->comp) + cq->comp = mlx5_add_cq_to_tasklet; + /* assuming CQ will be deleted before the EQ */ + cq->tasklet_ctx.priv = &eq->tasklet_ctx; + INIT_LIST_HEAD(&cq->tasklet_ctx.list); + + /* Add to comp EQ CQ tree to recv comp events */ + err = mlx5_eq_add_cq(&eq->core, cq); + if (err) + goto err_cmd; + + /* Add to async EQ CQ tree to recv async events */ + err = mlx5_eq_add_cq(mlx5_get_async_eq(dev), cq); + if (err) + goto err_cq_add; + + cq->pid = current->pid; + err = mlx5_debug_cq_add(dev, cq); + if (err) + mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", + cq->cqn); + + cq->uar = dev->priv.uar; + cq->irqn = eq->core.irqn; + + return 0; + +err_cq_add: + mlx5_eq_del_cq(&eq->core, cq); +err_cmd: + MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, din, cqn, cq->cqn); + MLX5_SET(destroy_cq_in, din, uid, cq->uid); + mlx5_cmd_exec_in(dev, destroy_cq, din); + return err; +} +EXPORT_SYMBOL(mlx5_create_cq); + +/* oubox is checked and err val is normalized */ +int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen, u32 *out, int outlen) +{ + int err = 
mlx5_create_cq(dev, cq, in, inlen, out, outlen); + + return mlx5_cmd_check(dev, err, in, out); +} +EXPORT_SYMBOL(mlx5_core_create_cq); + +int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; + int err; + + mlx5_debug_cq_remove(dev, cq); + + mlx5_eq_del_cq(mlx5_get_async_eq(dev), cq); + mlx5_eq_del_cq(&cq->eq->core, cq); + + MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); + MLX5_SET(destroy_cq_in, in, uid, cq->uid); + err = mlx5_cmd_exec_in(dev, destroy_cq, in); + if (err) + return err; + + synchronize_irq(cq->irqn); + mlx5_cq_put(cq); + wait_for_completion(&cq->free); + + return 0; +} +EXPORT_SYMBOL(mlx5_core_destroy_cq); + +int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {}; + + MLX5_SET(query_cq_in, in, opcode, MLX5_CMD_OP_QUERY_CQ); + MLX5_SET(query_cq_in, in, cqn, cq->cqn); + return mlx5_cmd_exec_inout(dev, query_cq, in, out); +} +EXPORT_SYMBOL(mlx5_core_query_cq); + +int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {}; + + MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ); + MLX5_SET(modify_cq_in, in, uid, cq->uid); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_modify_cq); + +int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, + struct mlx5_core_cq *cq, + u16 cq_period, + u16 cq_max_count) +{ + u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {}; + void *cqc; + + MLX5_SET(modify_cq_in, in, cqn, cq->cqn); + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, cq_period, cq_period); + MLX5_SET(cqc, cqc, cq_max_count, cq_max_count); + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.modify_field_select.modify_field_select, + MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); + + return mlx5_core_modify_cq(dev, cq, in, sizeof(in)); +} +EXPORT_SYMBOL(mlx5_core_modify_cq_moderation); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/crdump.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/crdump.c new file mode 100644 index 0000000..9e32976 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/crdump.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "mlx5_core.h" +#include "lib/pci_vsc.h" + +#define MLX5_PROTECTED_CR_SPCAE_DOMAIN 0x6 +#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7 + +/* iter func */ +struct mlx5_crdump_iter { + struct mlx5_fw_crdump *dump; + u32 cur_index; + u32 cur_data; +}; + +int mlx5_crdump_iter_next(struct mlx5_crdump_iter *iter) +{ + int ret = -1; + + /* check if we are at the end */ + mutex_lock(&iter->dump->crspace_mutex); + if (iter->cur_index >= iter->dump->crspace_size) + goto unlock; + + /* if not, read the next data */ + iter->cur_data = swab32(readl(&iter->dump->crspace[iter->cur_index])); + iter->cur_index += 4; + ret = 0; + +unlock: + mutex_unlock(&iter->dump->crspace_mutex); + return ret; +} + +struct mlx5_crdump_iter *mlx5_crdump_iter_init(struct mlx5_fw_crdump *dump) +{ + struct mlx5_crdump_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dump = dump; + iter->cur_index = 0; + + if (mlx5_crdump_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +void mlx5_crdump_iter_read(struct mlx5_crdump_iter *iter, + u32 *data, u32 *offset) +{ + *data = iter->cur_data; + *offset = iter->cur_index - 4; +} + +/* seq func */ +static void *mlx5_crdump_seq_start(struct seq_file *file, loff_t *pos) +{ + struct mlx5_crdump_iter *iter; + loff_t n = *pos; + + iter = mlx5_crdump_iter_init(file->private); + if (!iter) + return NULL; + + while (n--) { + if (mlx5_crdump_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *mlx5_crdump_seq_next(struct seq_file *file, void *iter_ptr, + loff_t *pos) +{ + struct mlx5_crdump_iter *iter = iter_ptr; + + (*pos)++; + + if (mlx5_crdump_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void mlx5_crdump_seq_stop(struct seq_file *file, void *iter_ptr) +{ + /* nothing for now */ +} + +static int mlx5_crdump_seq_show(struct seq_file *file, void *iter_ptr) +{ + struct mlx5_crdump_iter *iter = iter_ptr; + u32 data; + u32 offset; + + if (!iter) + return 0; + + mlx5_crdump_iter_read(iter, &data, &offset); + + seq_printf(file, "0x%08x 0x%08x\n", offset, cpu_to_be32(data)); + + return 0; +} + +static const struct seq_operations mlx5_crdump_seq_ops = { + .start = mlx5_crdump_seq_start, + .next = mlx5_crdump_seq_next, + .stop = mlx5_crdump_seq_stop, + .show = mlx5_crdump_seq_show, +}; + +static int mlx5_crdump_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &mlx5_crdump_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = pde_data(inode); + + return 0; +} + +static const struct proc_ops mlx5_crdump_ops = { + .proc_open = mlx5_crdump_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release +}; + +int mlx5_cr_protected_capture(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + void *cr_data = NULL; + u32 total_len = 0; + int ret = 0; + + if (!priv->health.crdump->vsec_addr) + return -ENODEV; + + ret = mlx5_pciconf_cap9_sem(dev, LOCK); + if (ret) + return ret; + + /* Verify no other PF is running cr-dump or sw reset */ + ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, MLX5_VSC_LOCK); + if (ret) { + 
mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); + goto unlock; + } + + ret = mlx5_pciconf_set_protected_addr_space(dev, &total_len); + if (ret) + goto unlock_sw_reset_sem; + + cr_data = kcalloc(total_len, sizeof(u8), GFP_KERNEL); + if (!cr_data) { + ret = -ENOMEM; + goto unlock_sw_reset_sem; + } + if (priv->health.crdump->space == MLX5_PROTECTED_CR_SCAN_CRSPACE) + ret = mlx5_block_op_pciconf_fast(dev, (u32 *)cr_data, total_len); + else + ret = mlx5_block_op_pciconf(dev, 0, (u32 *)cr_data, total_len); + if (ret < 0) + goto free_mem; + + if (total_len != ret) { + pr_warn("crdump failed to read full dump, read %d out of %u\n", + ret, total_len); + ret = -EINVAL; + goto free_mem; + } + + priv->health.crdump->crspace = cr_data; + priv->health.crdump->crspace_size = total_len; + ret = 0; + +free_mem: + if (ret) + kfree(cr_data); +unlock_sw_reset_sem: + mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, MLX5_VSC_UNLOCK); +unlock: + mlx5_pciconf_cap9_sem(dev, UNLOCK); + return ret; +} + +int mlx5_fill_cr_dump(struct mlx5_core_dev *dev) +{ + int ret = 0; + + if (!mlx5_core_is_pf(dev)) + return 0; + + mutex_lock(&dev->priv.health.crdump->crspace_mutex); + if (dev->priv.health.crdump->crspace_size) { + /* reading only at the first time */ + pr_debug("crdump was already taken, returning\n"); + goto unlock; + } + + dev->priv.health.crdump->vsec_addr = pci_find_capability(dev->pdev, CAP_ID); + if (!dev->priv.health.crdump->vsec_addr) { + pr_warn("failed reading vsec_addr\n"); + ret = -1; + goto unlock; + } + + kfree(dev->priv.health.crdump->crspace); + dev->priv.health.crdump->crspace_size = 0; + + ret = mlx5_cr_protected_capture(dev); + if (ret) { + dev_err(&dev->pdev->dev, "failed capture crdump (err: %d)\n", ret); + goto unlock; + } + + pr_info("crdump: Crash snapshot collected to /proc/%s/%s/%s\n", + MLX5_CORE_PROC, MLX5_CORE_PROC_CRDUMP, + pci_name(dev->pdev)); + +unlock: + mutex_unlock(&dev->priv.health.crdump->crspace_mutex); + return ret; +} + +int mlx5_crdump_init(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_fw_crdump *crdump; + int ret = -1; + + if (!mlx5_core_is_pf(dev)) + return 0; + + priv->health.crdump = kzalloc(sizeof(*crdump), GFP_KERNEL); + if (!priv->health.crdump) + return -ENOMEM; + + crdump = priv->health.crdump; + + mutex_init(&crdump->crspace_mutex); + + if (mlx5_crdump_dir) + if (!proc_create_data(pci_name(dev->pdev), S_IRUGO, + mlx5_crdump_dir, &mlx5_crdump_ops, + crdump)) { + pr_warn("failed creating proc file\n"); + goto clean_mem; + } + + return 0; + +clean_mem: + kfree(crdump); + return ret; +} + +void mlx5_crdump_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct mlx5_fw_crdump *crdump = priv->health.crdump; + + if (!mlx5_core_is_pf(dev)) + return; + + if (mlx5_crdump_dir) + remove_proc_entry(pci_name(dev->pdev), mlx5_crdump_dir); + + if (crdump) { + kfree(crdump->crspace); + kfree(crdump); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c new file mode 100644 index 0000000..aaaa5fe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "lib/eq.h" + +enum { + QP_PID, + QP_STATE, + QP_XPORT, + QP_MTU, + QP_N_RECV, + QP_RECV_SZ, + QP_N_SEND, + QP_LOG_PG_SZ, + QP_RQPN, +}; + +static char *qp_fields[] = { + [QP_PID] = "pid", + [QP_STATE] = "state", + [QP_XPORT] = "transport", + [QP_MTU] = "mtu", + [QP_N_RECV] = "num_recv", + [QP_RECV_SZ] = "rcv_wqe_sz", + [QP_N_SEND] = "num_send", + [QP_LOG_PG_SZ] = "log2_page_sz", + [QP_RQPN] = "remote_qpn", +}; + +enum { + EQ_NUM_EQES, + EQ_INTR, + EQ_LOG_PG_SZ, +}; + +static char *eq_fields[] = { + [EQ_NUM_EQES] = "num_eqes", + [EQ_INTR] = "intr", + [EQ_LOG_PG_SZ] = "log_page_size", +}; + +enum { + CQ_PID, + CQ_NUM_CQES, + CQ_LOG_PG_SZ, +}; + +static char *cq_fields[] = { + [CQ_PID] = "pid", + [CQ_NUM_CQES] = "num_cqes", + [CQ_LOG_PG_SZ] = "log_page_size", +}; + +struct dentry *mlx5_debugfs_root; +EXPORT_SYMBOL(mlx5_debugfs_root); + +void mlx5_register_debugfs(void) +{ + mlx5_debugfs_root = debugfs_create_dir("mlx5", NULL); +} + +void mlx5_unregister_debugfs(void) +{ + debugfs_remove(mlx5_debugfs_root); +} + +struct dentry *mlx5_debugfs_get_dev_root(struct mlx5_core_dev *dev) +{ + return dev->priv.dbg.dbg_root; +} +EXPORT_SYMBOL(mlx5_debugfs_get_dev_root); + +void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev) +{ + dev->priv.dbg.qp_debugfs = debugfs_create_dir("QPs", dev->priv.dbg.dbg_root); +} +EXPORT_SYMBOL(mlx5_qp_debugfs_init); + +void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + debugfs_remove_recursive(dev->priv.dbg.qp_debugfs); +} +EXPORT_SYMBOL(mlx5_qp_debugfs_cleanup); + +void mlx5_eq_debugfs_init(struct mlx5_core_dev *dev) +{ + dev->priv.dbg.eq_debugfs = debugfs_create_dir("EQs", dev->priv.dbg.dbg_root); +} + +void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + debugfs_remove_recursive(dev->priv.dbg.eq_debugfs); +} + +static ssize_t average_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_cmd_stats *stats; + u64 field = 0; + int ret; + char tbuf[22]; + + stats = filp->private_data; + spin_lock_irq(&stats->lock); + if (stats->n) + field = div64_u64(stats->sum, stats->n); + spin_unlock_irq(&stats->lock); + ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field); + return simple_read_from_buffer(buf, count, pos, tbuf, 
ret); +} + +static ssize_t average_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cmd_stats *stats; + + stats = filp->private_data; + spin_lock_irq(&stats->lock); + stats->sum = 0; + stats->n = 0; + spin_unlock_irq(&stats->lock); + + *pos += count; + + return count; +} + +static const struct file_operations stats_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = average_read, + .write = average_write, +}; + +static void +mlx5_cmdif_debugfs_init_ct(struct mlx5_core_dev *dev) +{ +#if IS_ENABLED(CONFIG_MLX5_TC_CT) + struct mlx5_tc_ct_debugfs *ct_debugfs; + + ct_debugfs = kzalloc(sizeof(*ct_debugfs), GFP_KERNEL); + if (!ct_debugfs) + return; + + ct_debugfs->root = debugfs_create_dir("ct", dev->priv.dbg.dbg_root); + debugfs_create_atomic_t("offloaded", 0400, ct_debugfs->root, + &ct_debugfs->stats.offloaded); + debugfs_create_atomic_t("rx_dropped", 0400, ct_debugfs->root, + &ct_debugfs->stats.rx_dropped); + + dev->priv.ct_debugfs = ct_debugfs; +#endif +} + +static void +mlx5_cmdif_debugfs_cleanup_ct(struct mlx5_core_dev *dev) +{ +#if IS_ENABLED(CONFIG_MLX5_TC_CT) + struct mlx5_tc_ct_debugfs *ct_debugfs = dev->priv.ct_debugfs; + + debugfs_remove_recursive(ct_debugfs->root); + kfree(ct_debugfs); +#endif +} + +void mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) +{ + struct mlx5_cmd_stats *stats; + struct dentry **cmd; + const char *namep; + int i; + + cmd = &dev->priv.dbg.cmdif_debugfs; + *cmd = debugfs_create_dir("commands", dev->priv.dbg.dbg_root); + + for (i = 0; i < MLX5_CMD_OP_MAX; i++) { + stats = &dev->cmd.stats[i]; + namep = mlx5_command_str(i); + if (strcmp(namep, "unknown command opcode")) { + stats->root = debugfs_create_dir(namep, *cmd); + + debugfs_create_file("average", 0400, stats->root, stats, + &stats_fops); + debugfs_create_u64("n", 0400, stats->root, &stats->n); + debugfs_create_u64("failed", 0400, stats->root, &stats->failed); + debugfs_create_u64("failed_mbox_status", 0400, stats->root, + &stats->failed_mbox_status); + debugfs_create_u32("last_failed_errno", 0400, stats->root, + &stats->last_failed_errno); + debugfs_create_u8("last_failed_mbox_status", 0400, stats->root, + &stats->last_failed_mbox_status); + } + } + + mlx5_cmdif_debugfs_init_ct(dev); +} + +void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + mlx5_cmdif_debugfs_cleanup_ct(dev); + debugfs_remove_recursive(dev->priv.dbg.cmdif_debugfs); +} + +void mlx5_cq_debugfs_init(struct mlx5_core_dev *dev) +{ + dev->priv.dbg.cq_debugfs = debugfs_create_dir("CQs", dev->priv.dbg.dbg_root); +} + +void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + debugfs_remove_recursive(dev->priv.dbg.cq_debugfs); +} + +void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev) +{ + struct dentry *pages; + + dev->priv.dbg.pages_debugfs = debugfs_create_dir("pages", dev->priv.dbg.dbg_root); + pages = dev->priv.dbg.pages_debugfs; + + debugfs_create_u32("fw_pages_total", 0400, pages, &dev->priv.fw_pages); + debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.vfs_pages); + debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.host_pf_pages); + debugfs_create_u32("fw_pages_alloc_failed", 0400, pages, &dev->priv.fw_pages_alloc_failed); + debugfs_create_u32("fw_pages_give_dropped", 0400, pages, &dev->priv.give_pages_dropped); + debugfs_create_u32("fw_pages_reclaim_discard", 0400, pages, + &dev->priv.reclaim_pages_discard); +} + +void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + debugfs_remove_recursive(dev->priv.dbg.pages_debugfs); +} + 
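
The per-opcode statistics files created by mlx5_cmdif_debugfs_init() above ("average", "n", "failed", ...) are ordinary debugfs attributes, so they can be inspected from userspace once debugfs is mounted. The snippet below is a minimal sketch only: the mount point, the per-device directory name (a made-up PCI address) and the opcode directory name are assumptions for illustration, since the per-device debugfs root is created elsewhere in the driver; adjust the path to the actual layout on the target system.

/*
 * Hypothetical userspace reader for one command-latency counter exposed by
 * mlx5_cmdif_debugfs_init(). The path below is an assumption for illustration.
 */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/debug/mlx5/0000:03:00.0/commands/QUERY_HCA_CAP/average";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("average command latency: %s", buf); /* value produced by average_read() */
	fclose(f);
	return 0;
}

Writing any value to the same "average" file resets the running sum and count, as implemented in average_write() above.
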
+static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, + int index, int *is_str) +{ + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {}; + u64 param = 0; + u32 *out; + int state; + u32 *qpc; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return 0; + + MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP); + MLX5_SET(query_qp_in, in, qpn, qp->qpn); + err = mlx5_cmd_exec_inout(dev, query_qp, in, out); + if (err) + goto out; + + *is_str = 0; + + qpc = MLX5_ADDR_OF(query_qp_out, out, qpc); + switch (index) { + case QP_PID: + param = qp->pid; + break; + case QP_STATE: + state = MLX5_GET(qpc, qpc, state); + param = (unsigned long)mlx5_qp_state_str(state); + *is_str = 1; + break; + case QP_XPORT: + param = (unsigned long)mlx5_qp_type_str(MLX5_GET(qpc, qpc, st)); + *is_str = 1; + break; + case QP_MTU: + switch (MLX5_GET(qpc, qpc, mtu)) { + case IB_MTU_256: + param = 256; + break; + case IB_MTU_512: + param = 512; + break; + case IB_MTU_1024: + param = 1024; + break; + case IB_MTU_2048: + param = 2048; + break; + case IB_MTU_4096: + param = 4096; + break; + default: + param = 0; + } + break; + case QP_N_RECV: + param = 1 << MLX5_GET(qpc, qpc, log_rq_size); + break; + case QP_RECV_SZ: + param = 1 << (MLX5_GET(qpc, qpc, log_rq_stride) + 4); + break; + case QP_N_SEND: + if (!MLX5_GET(qpc, qpc, no_sq)) + param = 1 << MLX5_GET(qpc, qpc, log_sq_size); + break; + case QP_LOG_PG_SZ: + param = MLX5_GET(qpc, qpc, log_page_size) + 12; + break; + case QP_RQPN: + param = MLX5_GET(qpc, qpc, remote_qpn); + break; + } +out: + kfree(out); + return param; +} + +static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + int index) +{ + int outlen = MLX5_ST_SZ_BYTES(query_eq_out); + u32 in[MLX5_ST_SZ_DW(query_eq_in)] = {}; + u64 param = 0; + void *ctx; + u32 *out; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return param; + + MLX5_SET(query_eq_in, in, opcode, MLX5_CMD_OP_QUERY_EQ); + MLX5_SET(query_eq_in, in, eq_number, eq->eqn); + err = mlx5_cmd_exec_inout(dev, query_eq, in, out); + if (err) { + mlx5_core_warn(dev, "failed to query eq\n"); + goto out; + } + ctx = MLX5_ADDR_OF(query_eq_out, out, eq_context_entry); + + switch (index) { + case EQ_NUM_EQES: + param = 1 << MLX5_GET(eqc, ctx, log_eq_size); + break; + case EQ_INTR: + param = MLX5_GET(eqc, ctx, intr); + break; + case EQ_LOG_PG_SZ: + param = MLX5_GET(eqc, ctx, log_page_size) + 12; + break; + } + +out: + kfree(out); + return param; +} + +static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + int index) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cq_out); + u64 param = 0; + void *ctx; + u32 *out; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return param; + + err = mlx5_core_query_cq(dev, cq, out); + if (err) { + mlx5_core_warn(dev, "failed to query cq\n"); + goto out; + } + ctx = MLX5_ADDR_OF(query_cq_out, out, cq_context); + + switch (index) { + case CQ_PID: + param = cq->pid; + break; + case CQ_NUM_CQES: + param = 1 << MLX5_GET(cqc, ctx, log_cq_size); + break; + case CQ_LOG_PG_SZ: + param = MLX5_GET(cqc, ctx, log_page_size); + break; + } + +out: + kvfree(out); + return param; +} + +static ssize_t dbg_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_field_desc *desc; + struct mlx5_rsc_debug *d; + char tbuf[18]; + int is_str = 0; + u64 field; + int ret; + + desc = filp->private_data; + d = (void *)(desc - desc->i) - sizeof(*d); + switch (d->type) { + case MLX5_DBG_RSC_QP: 
+ field = qp_read_field(d->dev, d->object, desc->i, &is_str); + break; + + case MLX5_DBG_RSC_EQ: + field = eq_read_field(d->dev, d->object, desc->i); + break; + + case MLX5_DBG_RSC_CQ: + field = cq_read_field(d->dev, d->object, desc->i); + break; + + default: + mlx5_core_warn(d->dev, "invalid resource type %d\n", d->type); + return -EINVAL; + } + + if (is_str) + ret = snprintf(tbuf, sizeof(tbuf), "%s\n", (const char *)(unsigned long)field); + else + ret = snprintf(tbuf, sizeof(tbuf), "0x%llx\n", field); + + return simple_read_from_buffer(buf, count, pos, tbuf, ret); +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = dbg_read, +}; + +static int add_res_tree(struct mlx5_core_dev *dev, enum dbg_rsc_type type, + struct dentry *root, struct mlx5_rsc_debug **dbg, + int rsn, char **field, int nfile, void *data) +{ + struct mlx5_rsc_debug *d; + char resn[32]; + int i; + + d = kzalloc(struct_size(d, fields, nfile), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->dev = dev; + d->object = data; + d->type = type; + sprintf(resn, "0x%x", rsn); + d->root = debugfs_create_dir(resn, root); + + for (i = 0; i < nfile; i++) { + d->fields[i].i = i; + debugfs_create_file(field[i], 0400, d->root, &d->fields[i], + &fops); + } + *dbg = d; + + return 0; +} + +static void rem_res_tree(struct mlx5_rsc_debug *d) +{ + debugfs_remove_recursive(d->root); + kfree(d); +} + +int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.dbg.qp_debugfs, + &qp->dbg, qp->qpn, qp_fields, + ARRAY_SIZE(qp_fields), qp); + if (err) + qp->dbg = NULL; + + return err; +} +EXPORT_SYMBOL(mlx5_debug_qp_add); + +void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) +{ + if (!mlx5_debugfs_root || !qp->dbg) + return; + + rem_res_tree(qp->dbg); + qp->dbg = NULL; +} +EXPORT_SYMBOL(mlx5_debug_qp_remove); + +int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.dbg.eq_debugfs, + &eq->dbg, eq->eqn, eq_fields, + ARRAY_SIZE(eq_fields), eq); + if (err) + eq->dbg = NULL; + + return err; +} + +void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + if (!mlx5_debugfs_root || !eq->dbg) + return; + + rem_res_tree(eq->dbg); + eq->dbg = NULL; +} + +int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + int err; + + if (!mlx5_debugfs_root) + return 0; + + err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.dbg.cq_debugfs, + &cq->dbg, cq->cqn, cq_fields, + ARRAY_SIZE(cq_fields), cq); + if (err) + cq->dbg = NULL; + + return err; +} + +void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) +{ + if (!mlx5_debugfs_root) + return; + + if (cq->dbg) { + rem_res_tree(cq->dbg); + cq->dbg = NULL; + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/dev.c new file mode 100644 index 0000000..cb2aa3a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -0,0 +1,667 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" + +/* intf dev list mutex */ +static DEFINE_MUTEX(mlx5_intf_mutex); +static DEFINE_IDA(mlx5_adev_ida); + +static bool is_eth_rep_supported(struct mlx5_core_dev *dev) +{ + if (!IS_ENABLED(CONFIG_MLX5_ESWITCH)) + return false; + + if (!MLX5_ESWITCH_MANAGER(dev)) + return false; + + if (!is_mdev_switchdev_mode(dev)) + return false; + + return true; +} + +bool mlx5_eth_supported(struct mlx5_core_dev *dev) +{ + if (!IS_ENABLED(CONFIG_MLX5_CORE_EN)) + return false; + + if (mlx5_core_is_management_pf(dev)) + return false; + + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return false; + + if (!MLX5_CAP_GEN(dev, eth_net_offloads)) { + mlx5_core_warn(dev, "Missing eth_net_offloads capability\n"); + return false; + } + + if (!MLX5_CAP_GEN(dev, nic_flow_table)) { + mlx5_core_warn(dev, "Missing nic_flow_table capability\n"); + return false; + } + + if (!MLX5_CAP_ETH(dev, csum_cap)) { + mlx5_core_warn(dev, "Missing csum_cap capability\n"); + return false; + } + + if (!MLX5_CAP_ETH(dev, max_lso_cap)) { + mlx5_core_warn(dev, "Missing max_lso_cap capability\n"); + return false; + } + + if (!MLX5_CAP_ETH(dev, vlan_cap)) { + mlx5_core_warn(dev, "Missing vlan_cap capability\n"); + return false; + } + + if (!MLX5_CAP_ETH(dev, rss_ind_tbl_cap)) { + mlx5_core_warn(dev, "Missing rss_ind_tbl_cap capability\n"); + return false; + } + + if (MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive.max_ft_level) < 3) { + mlx5_core_warn(dev, "max_ft_level < 3\n"); + return false; + } + + if (!MLX5_CAP_ETH(dev, self_lb_en_modifiable)) + mlx5_core_warn(dev, "Self loop back prevention is not supported\n"); + if (!MLX5_CAP_GEN(dev, cq_moderation)) + mlx5_core_warn(dev, "CQ moderation is not supported\n"); + if (dev->disable_en) + return false; + + return true; +} + +static bool is_eth_enabled(struct mlx5_core_dev *dev) +{ + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(priv_to_devlink(dev), + DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + &val); + return err ? 
false : val.vbool; +} + +bool mlx5_vnet_supported(struct mlx5_core_dev *dev) +{ + if (!IS_ENABLED(CONFIG_MLX5_VDPA_NET)) + return false; + + if (mlx5_core_is_pf(dev)) + return false; + + if (!(MLX5_CAP_GEN_64(dev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q)) + return false; + + if (!(MLX5_CAP_DEV_VDPA_EMULATION(dev, event_mode) & + MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE)) + return false; + + if (!MLX5_CAP_DEV_VDPA_EMULATION(dev, eth_frame_offload_type)) + return false; + + return true; +} + +static bool is_vnet_enabled(struct mlx5_core_dev *dev) +{ + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(priv_to_devlink(dev), + DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + &val); + return err ? false : val.vbool; +} + +static bool is_ib_rep_supported(struct mlx5_core_dev *dev) +{ + if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) + return false; + + if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV) + return false; + + if (!is_eth_rep_supported(dev)) + return false; + + if (!MLX5_ESWITCH_MANAGER(dev)) + return false; + + if (!is_mdev_switchdev_mode(dev)) + return false; + + if (mlx5_core_mp_enabled(dev)) + return false; + + return true; +} + +static bool is_mp_supported(struct mlx5_core_dev *dev) +{ + if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) + return false; + + if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV) + return false; + + if (is_ib_rep_supported(dev)) + return false; + + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return false; + + if (!mlx5_core_is_mp_slave(dev)) + return false; + + return true; +} + +bool mlx5_rdma_supported(struct mlx5_core_dev *dev) +{ + if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) + return false; + + if (mlx5_core_is_management_pf(dev)) + return false; + + if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV) + return false; + + if (is_ib_rep_supported(dev)) + return false; + + if (is_mp_supported(dev)) + return false; + + return true; +} + +static bool is_ib_enabled(struct mlx5_core_dev *dev) +{ + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(priv_to_devlink(dev), + DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + &val); + return err ? 
false : val.vbool; +} + +enum { + MLX5_INTERFACE_PROTOCOL_ETH, + MLX5_INTERFACE_PROTOCOL_ETH_REP, + + MLX5_INTERFACE_PROTOCOL_IB, + MLX5_INTERFACE_PROTOCOL_IB_REP, + MLX5_INTERFACE_PROTOCOL_MPIB, + + MLX5_INTERFACE_PROTOCOL_VNET, +}; + +static const struct mlx5_adev_device { + const char *suffix; + bool (*is_supported)(struct mlx5_core_dev *dev); + bool (*is_enabled)(struct mlx5_core_dev *dev); +} mlx5_adev_devices[] = { + [MLX5_INTERFACE_PROTOCOL_VNET] = { .suffix = "vnet", + .is_supported = &mlx5_vnet_supported, + .is_enabled = &is_vnet_enabled }, + [MLX5_INTERFACE_PROTOCOL_IB] = { .suffix = "rdma", + .is_supported = &mlx5_rdma_supported, + .is_enabled = &is_ib_enabled }, + [MLX5_INTERFACE_PROTOCOL_ETH] = { .suffix = "eth", + .is_supported = &mlx5_eth_supported, + .is_enabled = &is_eth_enabled }, + [MLX5_INTERFACE_PROTOCOL_ETH_REP] = { .suffix = "eth-rep", + .is_supported = &is_eth_rep_supported }, + [MLX5_INTERFACE_PROTOCOL_IB_REP] = { .suffix = "rdma-rep", + .is_supported = &is_ib_rep_supported }, + [MLX5_INTERFACE_PROTOCOL_MPIB] = { .suffix = "multiport", + .is_supported = &is_mp_supported }, +}; + +int mlx5_adev_idx_alloc(void) +{ + return ida_alloc(&mlx5_adev_ida, GFP_KERNEL); +} + +void mlx5_adev_idx_free(int idx) +{ + ida_free(&mlx5_adev_ida, idx); +} + +int mlx5_adev_init(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + priv->adev = kcalloc(ARRAY_SIZE(mlx5_adev_devices), + sizeof(struct mlx5_adev *), GFP_KERNEL); + if (!priv->adev) + return -ENOMEM; + + return 0; +} + +void mlx5_adev_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + kfree(priv->adev); +} + +static void adev_release(struct device *dev) +{ + struct mlx5_adev *mlx5_adev = + container_of(dev, struct mlx5_adev, adev.dev); + struct mlx5_priv *priv = &mlx5_adev->mdev->priv; + int idx = mlx5_adev->idx; + + kfree(mlx5_adev); + priv->adev[idx] = NULL; +} + +static struct mlx5_adev *add_adev(struct mlx5_core_dev *dev, int idx) +{ + const char *suffix = mlx5_adev_devices[idx].suffix; + struct auxiliary_device *adev; + struct mlx5_adev *madev; + int ret; + + madev = kzalloc(sizeof(*madev), GFP_KERNEL); + if (!madev) + return ERR_PTR(-ENOMEM); + + adev = &madev->adev; + adev->id = dev->priv.adev_idx; + adev->name = suffix; + adev->dev.parent = dev->device; + adev->dev.release = adev_release; + madev->mdev = dev; + madev->idx = idx; + + ret = auxiliary_device_init(adev); + if (ret) { + kfree(madev); + return ERR_PTR(ret); + } + + ret = auxiliary_device_add(adev); + if (ret) { + auxiliary_device_uninit(adev); + return ERR_PTR(ret); + } + return madev; +} + +static void del_adev(struct auxiliary_device *adev) +{ + auxiliary_device_delete(adev); + auxiliary_device_uninit(adev); +} + +int mlx5_attach_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct auxiliary_device *adev; + struct auxiliary_driver *adrv; + int ret = 0, i; + + mutex_lock(&mlx5_intf_mutex); + priv->flags &= ~MLX5_PRIV_FLAGS_DETACH; + for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) { + if (!priv->adev[i]) { + bool is_supported = false; + + if (mlx5_adev_devices[i].is_enabled) { + bool enabled; + + enabled = mlx5_adev_devices[i].is_enabled(dev); + if (!enabled) + continue; + } + + if (mlx5_adev_devices[i].is_supported) + is_supported = mlx5_adev_devices[i].is_supported(dev); + + if (!is_supported) + continue; + + priv->adev[i] = add_adev(dev, i); + if (IS_ERR(priv->adev[i])) { + ret = PTR_ERR(priv->adev[i]); + priv->adev[i] = NULL; + } + } else { + adev = 
&priv->adev[i]->adev; + + /* Pay attention that this is not PCI driver that + * mlx5_core_dev is connected, but auxiliary driver. + * + * Here we can race of module unload with devlink + * reload, but we don't need to take extra lock because + * we are holding global mlx5_intf_mutex. + */ + if (!adev->dev.driver) + continue; + adrv = to_auxiliary_drv(adev->dev.driver); + + if (adrv->resume) + ret = adrv->resume(adev); + } + if (ret) { + mlx5_core_warn(dev, "Device[%d] (%s) failed to load\n", + i, mlx5_adev_devices[i].suffix); + + break; + } + } + mutex_unlock(&mlx5_intf_mutex); + return ret; +} + +void mlx5_detach_device(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + struct auxiliary_device *adev; + struct auxiliary_driver *adrv; + pm_message_t pm = {}; + int i; + + mutex_lock(&mlx5_intf_mutex); + for (i = ARRAY_SIZE(mlx5_adev_devices) - 1; i >= 0; i--) { + if (!priv->adev[i]) + continue; + + if (mlx5_adev_devices[i].is_enabled) { + bool enabled; + + enabled = mlx5_adev_devices[i].is_enabled(dev); + if (!enabled) + goto skip_suspend; + } + + adev = &priv->adev[i]->adev; + /* Auxiliary driver was unbind manually through sysfs */ + if (!adev->dev.driver) + goto skip_suspend; + + adrv = to_auxiliary_drv(adev->dev.driver); + + if (adrv->suspend) { + adrv->suspend(adev, pm); + continue; + } + +skip_suspend: + del_adev(&priv->adev[i]->adev); + priv->adev[i] = NULL; + } + priv->flags |= MLX5_PRIV_FLAGS_DETACH; + mutex_unlock(&mlx5_intf_mutex); +} + +int mlx5_register_device(struct mlx5_core_dev *dev) +{ + int ret; + + mutex_lock(&mlx5_intf_mutex); + dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + ret = mlx5_rescan_drivers_locked(dev); + mutex_unlock(&mlx5_intf_mutex); + if (ret) + mlx5_unregister_device(dev); + + return ret; +} + +void mlx5_unregister_device(struct mlx5_core_dev *dev) +{ + mutex_lock(&mlx5_intf_mutex); + dev->priv.flags = MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + dev->priv.flags &= ~MLX5_PRIV_FLAGS_DETACH; + mlx5_rescan_drivers_locked(dev); + mutex_unlock(&mlx5_intf_mutex); +} + +static int add_drivers(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + int i, ret = 0; + + for (i = 0; i < ARRAY_SIZE(mlx5_adev_devices); i++) { + bool is_supported = false; + + if (priv->adev[i]) + continue; + + if (mlx5_adev_devices[i].is_supported) + is_supported = mlx5_adev_devices[i].is_supported(dev); + + if (!is_supported) + continue; + + priv->adev[i] = add_adev(dev, i); + if (IS_ERR(priv->adev[i])) { + mlx5_core_warn(dev, "Device[%d] (%s) failed to load\n", + i, mlx5_adev_devices[i].suffix); + /* We continue to rescan drivers and leave to the caller + * to make decision if to release everything or continue. + */ + ret = PTR_ERR(priv->adev[i]); + priv->adev[i] = NULL; + } + } + return ret; +} + +static void delete_drivers(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + bool delete_all; + int i; + + delete_all = priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV; + + for (i = ARRAY_SIZE(mlx5_adev_devices) - 1; i >= 0; i--) { + bool is_supported = false; + + if (!priv->adev[i]) + continue; + + if (mlx5_adev_devices[i].is_enabled) { + bool enabled; + + enabled = mlx5_adev_devices[i].is_enabled(dev); + if (!enabled) + goto del_adev; + } + + if (mlx5_adev_devices[i].is_supported && !delete_all) + is_supported = mlx5_adev_devices[i].is_supported(dev); + + if (is_supported) + continue; + +del_adev: + del_adev(&priv->adev[i]->adev); + priv->adev[i] = NULL; + } +} + +/* This function is used after mlx5_core_dev is reconfigured. 
+ */ +int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + lockdep_assert_held(&mlx5_intf_mutex); + if (priv->flags & MLX5_PRIV_FLAGS_DETACH) + return 0; + + delete_drivers(dev); + if (priv->flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) + return 0; + + return add_drivers(dev); +} + +bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev) +{ + u64 fsystem_guid, psystem_guid; + + fsystem_guid = mlx5_query_nic_system_image_guid(dev); + psystem_guid = mlx5_query_nic_system_image_guid(peer_dev); + + return (fsystem_guid && psystem_guid && fsystem_guid == psystem_guid); +} + +static u32 mlx5_gen_pci_id(const struct mlx5_core_dev *dev) +{ + return (u32)((pci_domain_nr(dev->pdev->bus) << 16) | + (dev->pdev->bus->number << 8) | + PCI_SLOT(dev->pdev->devfn)); +} + +static int _next_phys_dev(struct mlx5_core_dev *mdev, + const struct mlx5_core_dev *curr) +{ + if (!mlx5_core_is_pf(mdev)) + return 0; + + if (mdev == curr) + return 0; + + if (!mlx5_same_hw_devs(mdev, (struct mlx5_core_dev *)curr) && + mlx5_gen_pci_id(mdev) != mlx5_gen_pci_id(curr)) + return 0; + + return 1; +} + +static struct mlx5_core_dev *is_mlx5_core_dev(struct device *dev, struct mlx5_core_dev *curr) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + if (dev->driver != curr->device->driver) + return NULL; + + return (struct mlx5_core_dev *)pci_get_drvdata(pdev); +} + +static int next_phys_dev(struct device *dev, const void *data) +{ + struct mlx5_core_dev *mdev = is_mlx5_core_dev(dev, (struct mlx5_core_dev *)data); + + if (!mdev) + return 0; + + return _next_phys_dev(mdev, data); +} + +static int next_phys_dev_lag(struct device *dev, const void *data) +{ + struct mlx5_core_dev *mdev = is_mlx5_core_dev(dev, (struct mlx5_core_dev *)data); + + if (!mdev) + return 0; + + if (!MLX5_CAP_GEN(mdev, vport_group_manager) || + !MLX5_CAP_GEN(mdev, lag_master) || + (MLX5_CAP_GEN(mdev, num_lag_ports) > MLX5_MAX_PORTS || + MLX5_CAP_GEN(mdev, num_lag_ports) <= 1)) + return 0; + + return _next_phys_dev(mdev, data); +} + +static struct device *pci_find_dev(void *data, + int (*match)(struct device *dev, const void *data)) +{ + return bus_find_device(&pci_bus_type, NULL, data, match); +} + +struct mlx5_core_dev *mlx5_get_next_dev(struct mlx5_core_dev *dev, + int (*match)(struct device *dev, const void *data)) +{ + struct device *next; + if (!mlx5_core_is_pf(dev)) + return NULL; + + next = pci_find_dev(dev, match); + if (!next) + return NULL; + + put_device(next); + return (struct mlx5_core_dev *)pci_get_drvdata(to_pci_dev(next)); +} + +/* Must be called with intf_mutex held */ +struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev) +{ + lockdep_assert_held(&mlx5_intf_mutex); + return mlx5_get_next_dev(dev, &next_phys_dev); +} + +/* Must be called with intf_mutex held */ +struct mlx5_core_dev *mlx5_get_next_phys_dev_lag(struct mlx5_core_dev *dev) +{ + lockdep_assert_held(&mlx5_intf_mutex); + return mlx5_get_next_dev(dev, &next_phys_dev_lag); +} + +void mlx5_dev_list_lock(void) +{ + mutex_lock(&mlx5_intf_mutex); +} +void mlx5_dev_list_unlock(void) +{ + mutex_unlock(&mlx5_intf_mutex); +} + +int mlx5_dev_list_trylock(void) +{ + return mutex_trylock(&mlx5_intf_mutex); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.c new file mode 100644 index 0000000..3577503 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ 
-0,0 +1,1070 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include + +#include "mlx5_core.h" +#include "fw_reset.h" +#include "fs_core.h" +#include "eswitch.h" +#include "mlx5_devm.h" +#include "esw/qos.h" +#include "sf/dev/dev.h" +#include "sf/sf.h" +#include "en/tc_ct.h" + +static int mlx5_devlink_flash_update(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + return mlx5_firmware_flash(dev, params->fw, extack); +} + +static u8 mlx5_fw_ver_major(u32 version) +{ + return (version >> 24) & 0xff; +} + +static u8 mlx5_fw_ver_minor(u32 version) +{ + return (version >> 16) & 0xff; +} + +static u16 mlx5_fw_ver_subminor(u32 version) +{ + return version & 0xffff; +} + +#define DEVLINK_FW_STRING_LEN 32 + +static int +mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + char version_str[DEVLINK_FW_STRING_LEN]; + u32 running_fw, stored_fw; + int err; + + err = devlink_info_driver_name_put(req, KBUILD_MODNAME); + if (err) + return err; + + err = devlink_info_version_fixed_put(req, "fw.psid", dev->board_id); + if (err) + return err; + + err = mlx5_fw_version_query(dev, &running_fw, &stored_fw); + if (err) + return err; + + snprintf(version_str, sizeof(version_str), "%d.%d.%04d", + mlx5_fw_ver_major(running_fw), mlx5_fw_ver_minor(running_fw), + mlx5_fw_ver_subminor(running_fw)); + err = devlink_info_version_running_put(req, "fw.version", version_str); + if (err) + return err; + err = devlink_info_version_running_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW, + version_str); + if (err) + return err; + + /* no pending version, return running (stored) version */ + if (stored_fw == 0) + stored_fw = running_fw; + + snprintf(version_str, sizeof(version_str), "%d.%d.%04d", + mlx5_fw_ver_major(stored_fw), mlx5_fw_ver_minor(stored_fw), + mlx5_fw_ver_subminor(stored_fw)); + err = devlink_info_version_stored_put(req, "fw.version", version_str); + if (err) + return err; + return devlink_info_version_stored_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW, + version_str); +} + +static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u8 reset_level, reset_type, net_port_alive; + int err; + + err = mlx5_fw_reset_query(dev, &reset_level, &reset_type); + if (err) + return err; + if (!(reset_level & MLX5_MFRL_REG_RESET_LEVEL3)) { + NL_SET_ERR_MSG_MOD(extack, "FW activate requires reboot"); + return -EINVAL; + } + + net_port_alive = !!(reset_type & MLX5_MFRL_REG_RESET_TYPE_NET_PORT_ALIVE); + err = mlx5_fw_reset_set_reset_sync(dev, net_port_alive, extack); + if (err) + return err; + + return mlx5_fw_reset_wait_reset_done(dev); +} + +static int mlx5_devlink_trigger_fw_live_patch(struct devlink *devlink, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u8 reset_level; + int err; + + err = mlx5_fw_reset_query(dev, &reset_level, NULL); + if (err) + return err; + if (!(reset_level & MLX5_MFRL_REG_RESET_LEVEL0)) { + NL_SET_ERR_MSG_MOD(extack, + "FW upgrade to the stored FW can't be done by FW live patching"); + return -EINVAL; + } + + return mlx5_fw_reset_set_live_patch(dev); +} + +static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, + enum devlink_reload_action action, + enum 
devlink_reload_limit limit, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct pci_dev *pdev = dev->pdev; + bool sf_dev_allocated; +#ifdef CONFIG_MLX5_ESWITCH + u16 mode = 0; + + if (!mlx5_devlink_eswitch_mode_get(devlink, &mode)) { + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { + NL_SET_ERR_MSG_MOD(extack, "Reload not supported in switchdev mode"); + return -EOPNOTSUPP; + } + } +#endif + + sf_dev_allocated = mlx5_sf_dev_allocated(dev); + if (sf_dev_allocated) { + /* Reload results in deleting SF device which further results in + * unregistering devlink instance while holding devlink_mutext. + * Hence, do not support reload. + */ + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported when SFs are allocated"); + return -EOPNOTSUPP; + } + + if (mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "reload is unsupported in Lag mode"); + return -EOPNOTSUPP; + } + + if (pci_num_vf(pdev)) { + NL_SET_ERR_MSG_MOD(extack, "reload while VFs are present is unfavorable"); + } + + switch (action) { + case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: + mlx5_unload_one(dev); + return 0; + case DEVLINK_RELOAD_ACTION_FW_ACTIVATE: + if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET) + return mlx5_devlink_trigger_fw_live_patch(devlink, extack); + return mlx5_devlink_reload_fw_activate(devlink, extack); + default: + /* Unsupported action should not get to this function */ + WARN_ON(1); + return -EOPNOTSUPP; + } +} + +static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + *actions_performed = BIT(action); + switch (action) { + case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: + return mlx5_load_one(dev, false); + case DEVLINK_RELOAD_ACTION_FW_ACTIVATE: + if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET) + break; + /* On fw_activate action, also driver is reloaded and reinit performed */ + *actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); + return mlx5_load_one(dev, false); + default: + /* Unsupported action should not get to this function */ + WARN_ON(1); + return -EOPNOTSUPP; + } + + return 0; +} + +static struct mlx5_devlink_trap *mlx5_find_trap_by_id(struct mlx5_core_dev *dev, int trap_id) +{ + struct mlx5_devlink_trap *dl_trap; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.id == trap_id) + return dl_trap; + + return NULL; +} + +static int mlx5_devlink_trap_init(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = kzalloc(sizeof(*dl_trap), GFP_KERNEL); + if (!dl_trap) + return -ENOMEM; + + dl_trap->trap.id = trap->id; + dl_trap->trap.action = DEVLINK_TRAP_ACTION_DROP; + dl_trap->item = trap_ctx; + + if (mlx5_find_trap_by_id(dev, trap->id)) { + kfree(dl_trap); + mlx5_core_err(dev, "Devlink trap: Trap 0x%x already found", trap->id); + return -EEXIST; + } + + list_add_tail(&dl_trap->list, &dev->priv.traps); + return 0; +} + +static void mlx5_devlink_trap_fini(struct devlink *devlink, const struct devlink_trap *trap, + void *trap_ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Missing trap id 0x%x", trap->id); + return; + } + list_del(&dl_trap->list); + kfree(dl_trap); +} + 
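[Sketch, not part of the patch] The trap glue above only does bookkeeping: mlx5_devlink_trap_init()/mlx5_devlink_trap_fini() maintain the per-device trap list, and the mlx5_devlink_trap_action_set() callback that follows merely records the requested action and publishes it on the device's blocking notifier chain as an MLX5_DRIVER_EVENT_TYPE_TRAP event (the payload is the struct mlx5_trap_ctx embedded in struct mlx5_devlink_trap). Reprogramming steering is left to a consumer, in practice the mlx5e netdev code. The minimal consumer sketch below is illustrative only: the my_* names are hypothetical and it assumes the mlx5_blocking_notifier_register()/unregister() helpers are available in this tree, as the call-chain counterpart here suggests.

/*
 * Hypothetical consumer of the trap-action event published by
 * mlx5_devlink_trap_action_set() via mlx5_blocking_notifier_call_chain().
 */
#include <linux/notifier.h>
#include <linux/mlx5/device.h>	/* MLX5_DRIVER_EVENT_TYPE_TRAP */
#include <net/devlink.h>	/* DEVLINK_TRAP_ACTION_* */
#include "mlx5_core.h"
#include "devlink.h"		/* struct mlx5_trap_ctx */

struct my_trap_listener {		/* hypothetical consumer state */
	struct mlx5_core_dev *mdev;
	struct notifier_block nb;
};

static int my_trap_event(struct notifier_block *nb, unsigned long event, void *data)
{
	struct my_trap_listener *l = container_of(nb, struct my_trap_listener, nb);
	struct mlx5_trap_ctx *trap = data;	/* what trap_action_set() passes */

	if (event != MLX5_DRIVER_EVENT_TYPE_TRAP)
		return NOTIFY_DONE;

	mlx5_core_dbg(l->mdev, "trap 0x%x action is now %s\n", trap->id,
		      trap->action == DEVLINK_TRAP_ACTION_TRAP ? "trap" : "drop");
	/* reconfigure steering/RQs here according to trap->action */
	return NOTIFY_OK;
}

static int my_trap_listener_start(struct my_trap_listener *l,
				  struct mlx5_core_dev *mdev)
{
	l->mdev = mdev;
	l->nb.notifier_call = my_trap_event;
	/* assumed helper; mirrors the call-chain used by trap_action_set() */
	return mlx5_blocking_notifier_register(mdev, &l->nb);
}

static void my_trap_listener_stop(struct my_trap_listener *l)
{
	mlx5_blocking_notifier_unregister(l->mdev, &l->nb);
}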
+static int mlx5_devlink_trap_action_set(struct devlink *devlink, + const struct devlink_trap *trap, + enum devlink_trap_action action, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + enum devlink_trap_action action_orig; + struct mlx5_devlink_trap *dl_trap; + int err = 0; + + if (is_mdev_switchdev_mode(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Devlink traps can't be set in switchdev mode"); + return -EOPNOTSUPP; + } + + dl_trap = mlx5_find_trap_by_id(dev, trap->id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Set action on invalid trap id 0x%x", trap->id); + err = -EINVAL; + goto out; + } + + if (action != DEVLINK_TRAP_ACTION_DROP && action != DEVLINK_TRAP_ACTION_TRAP) { + err = -EOPNOTSUPP; + goto out; + } + + if (action == dl_trap->trap.action) + goto out; + + action_orig = dl_trap->trap.action; + dl_trap->trap.action = action; + err = mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_TYPE_TRAP, + &dl_trap->trap); + if (err) + dl_trap->trap.action = action_orig; +out: + return err; +} + +static const struct devlink_ops mlx5_devlink_ops = { +#ifdef CONFIG_MLX5_ESWITCH + .eswitch_mode_set = mlx5_devlink_eswitch_mode_set, + .eswitch_mode_get = mlx5_devlink_eswitch_mode_get, + .eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set, + .eswitch_inline_mode_get = mlx5_devlink_eswitch_inline_mode_get, + .eswitch_encap_mode_set = mlx5_devlink_eswitch_encap_mode_set, + .eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get, + .port_function_hw_addr_get = mlx5_devlink_port_function_hw_addr_get, + .port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set, + .rate_leaf_tx_share_set = mlx5_esw_devlink_rate_leaf_tx_share_set, + .rate_leaf_tx_max_set = mlx5_esw_devlink_rate_leaf_tx_max_set, + .rate_node_tx_share_set = mlx5_esw_devlink_rate_node_tx_share_set, + .rate_node_tx_max_set = mlx5_esw_devlink_rate_node_tx_max_set, + .rate_node_new = mlx5_esw_devlink_rate_node_new, + .rate_node_del = mlx5_esw_devlink_rate_node_del, + .rate_leaf_parent_set = mlx5_esw_devlink_rate_parent_set, +#endif +#if defined(CONFIG_MLX5_SF_MANAGER) && \ + (defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_4_PARAMS) || defined(HAVE_DEVLINK_PORT_ATTRS_PCI_SF_SET_GET_5_PARAMS)) + .port_new = mlx5_devlink_sf_port_new, + .port_del = mlx5_devlink_sf_port_del, +#endif +#if defined(CONFIG_MLX5_SF_MANAGER) && \ + defined(HAVE_DEVLINK_HAS_PORT_FUNCTION_STATE_GET) + .port_fn_state_get = mlx5_devlink_sf_port_fn_state_get, + .port_fn_state_set = mlx5_devlink_sf_port_fn_state_set, +#endif + .flash_update = mlx5_devlink_flash_update, + .info_get = mlx5_devlink_info_get, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE), + .reload_limits = BIT(DEVLINK_RELOAD_LIMIT_NO_RESET), + .reload_down = mlx5_devlink_reload_down, + .reload_up = mlx5_devlink_reload_up, + .trap_init = mlx5_devlink_trap_init, + .trap_fini = mlx5_devlink_trap_fini, + .trap_action_set = mlx5_devlink_trap_action_set, +}; + +void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port) +{ + struct devlink *devlink = priv_to_devlink(dev); + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap_id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Report on invalid trap id 0x%x", trap_id); + return; + } + + if (dl_trap->trap.action != DEVLINK_TRAP_ACTION_TRAP) { + mlx5_core_dbg(dev, "Devlink trap: Trap id %d has action %d", trap_id, + dl_trap->trap.action); 
+ return; + } + devlink_trap_report(devlink, skb, dl_trap->item, dl_port, NULL); +} + +int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev) +{ + struct mlx5_devlink_trap *dl_trap; + int count = 0; + + list_for_each_entry(dl_trap, &dev->priv.traps, list) + if (dl_trap->trap.action == DEVLINK_TRAP_ACTION_TRAP) + count++; + + return count; +} + +int mlx5_devlink_traps_get_action(struct mlx5_core_dev *dev, int trap_id, + enum devlink_trap_action *action) +{ + struct mlx5_devlink_trap *dl_trap; + + dl_trap = mlx5_find_trap_by_id(dev, trap_id); + if (!dl_trap) { + mlx5_core_err(dev, "Devlink trap: Get action on invalid trap id 0x%x", + trap_id); + return -EINVAL; + } + + *action = dl_trap->trap.action; + return 0; +} + +struct devlink *mlx5_devlink_alloc(struct device *dev) +{ + return devlink_alloc(&mlx5_devlink_ops, sizeof(struct mlx5_core_dev), + dev); +} + +void mlx5_devlink_free(struct devlink *devlink) +{ + devlink_free(devlink); +} + +static int mlx5_devlink_fs_mode_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + char *value = val.vstr; + int err = 0; + + if (!strcmp(value, "dmfs")) { + return 0; + } else if (!strcmp(value, "smfs")) { + u8 eswitch_mode; + bool smfs_cap; + + eswitch_mode = mlx5_eswitch_mode(dev); + smfs_cap = mlx5_fs_dr_is_supported(dev); + + if (!smfs_cap) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG_MOD(extack, + "Software managed steering is not supported by current device"); + } + + else if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_MOD(extack, + "Software managed steering is not supported when eswitch offloads enabled."); + err = -EOPNOTSUPP; + } + } else { + NL_SET_ERR_MSG_MOD(extack, + "Bad parameter: supported values are [\"dmfs\", \"smfs\"]"); + err = -EINVAL; + } + + return err; +} + +static int mlx5_devlink_fs_mode_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + enum mlx5_flow_steering_mode mode; + + if (!strcmp(ctx->val.vstr, "smfs")) + mode = MLX5_FLOW_STEERING_MODE_SMFS; + else + mode = MLX5_FLOW_STEERING_MODE_DMFS; + dev->priv.steering->mode = mode; + + return 0; +} + +static int mlx5_devlink_fs_mode_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_SMFS) + strcpy(ctx->val.vstr, "smfs"); + else + strcpy(ctx->val.vstr, "dmfs"); + return 0; +} + +static int mlx5_devlink_enable_roce_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + bool new_state = val.vbool; + + if (new_state && !MLX5_CAP_GEN(dev, roce) && + !MLX5_CAP_GEN(dev, roce_rw_supported)) { + NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE"); + return -EOPNOTSUPP; + } + if (mlx5_core_is_mp_slave(dev) || mlx5_lag_is_active(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi port slave/Lag device can't configure RoCE"); + return -EOPNOTSUPP; + } + + return 0; +} + +#ifdef CONFIG_MLX5_ESWITCH +static int mlx5_devlink_large_group_num_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + int group_num = val.vu32; + + if (group_num < 1 || group_num > 1024) { + NL_SET_ERR_MSG_MOD(extack, + "Unsupported group number, supported range is 1-1024"); + return -EOPNOTSUPP; + } + + 
return 0; +} + +static int mlx5_devlink_esw_port_metadata_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!MLX5_ESWITCH_MANAGER(dev)) + return -EOPNOTSUPP; + + return mlx5_esw_offloads_vport_metadata_set(dev->priv.eswitch, ctx->val.vbool); +} + +static int mlx5_devlink_esw_port_metadata_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!MLX5_ESWITCH_MANAGER(dev)) + return -EOPNOTSUPP; + + ctx->val.vbool = mlx5_eswitch_vport_match_metadata_enabled(dev->priv.eswitch); + return 0; +} +#endif /* CONFIG_MLX5_ESWITCH */ + +static int mlx5_devlink_ct_max_offloaded_conns_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + mlx5_tc_ct_max_offloaded_conns_set(dev, ctx->val.vu32); + return 0; +} + +static int mlx5_devlink_ct_max_offloaded_conns_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + ctx->val.vu32 = mlx5_tc_ct_max_offloaded_conns_get(dev); + return 0; +} + +static int mlx5_devlink_esw_port_metadata_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u8 esw_mode; + + if (!MLX5_ESWITCH_MANAGER(dev)) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch is unsupported"); + return -EOPNOTSUPP; + } + esw_mode = mlx5_eswitch_mode(dev); + if (esw_mode == MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch must either disabled or non switchdev mode"); + return -EBUSY; + } + return 0; +} + +static int mlx5_devlink_esw_pet_insert_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!MLX5_ESWITCH_MANAGER(dev)) + return -EOPNOTSUPP; + + return mlx5_esw_offloads_pet_insert_set(dev->priv.eswitch, ctx->val.vbool); +} + +static int mlx5_devlink_esw_pet_insert_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!MLX5_ESWITCH_MANAGER(dev)) + return -EOPNOTSUPP; + + ctx->val.vbool = mlx5_eswitch_pet_insert_allowed(dev->priv.eswitch); + return 0; +} + +static int mlx5_devlink_esw_pet_insert_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u8 esw_mode; + + if (!MLX5_ESWITCH_MANAGER(dev)) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch is unsupported"); + return -EOPNOTSUPP; + } + + esw_mode = mlx5_eswitch_mode(dev); + if (esw_mode == MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_MOD(extack, + "E-Switch must either disabled or non switchdev mode"); + return -EBUSY; + } + + if (!mlx5e_esw_offloads_pet_supported(dev->priv.eswitch)) + return -EOPNOTSUPP; + + if (!mlx5_core_is_ecpf(dev)) + return -EOPNOTSUPP; + + return 0; +} + +static int mlx5_devlink_enable_remote_dev_reset_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + mlx5_fw_reset_enable_remote_dev_reset_set(dev, ctx->val.vbool); + return 0; +} + +static int mlx5_devlink_enable_remote_dev_reset_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + ctx->val.vbool = 
mlx5_fw_reset_enable_remote_dev_reset_get(dev); + return 0; +} + +static int mlx5_devlink_eq_depth_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + return (val.vu16 >= 64 && val.vu16 <= 4096) ? 0 : -EINVAL; +} + +static const struct devlink_param mlx5_devlink_params[] = { + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_CT_ACTION_ON_NAT_CONNS, + "ct_action_on_nat_conns", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlx5_devlink_ct_action_on_nat_conns_get, + mlx5_devlink_ct_action_on_nat_conns_set, + NULL), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE, + "flow_steering_mode", DEVLINK_PARAM_TYPE_STRING, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlx5_devlink_fs_mode_get, mlx5_devlink_fs_mode_set, + mlx5_devlink_fs_mode_validate), + DEVLINK_PARAM_GENERIC(ENABLE_ROCE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_enable_roce_validate), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_CT_MAX_OFFLOADED_CONNS, + "ct_max_offloaded_conns", DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlx5_devlink_ct_max_offloaded_conns_get, + mlx5_devlink_ct_max_offloaded_conns_set, + NULL), +#ifdef CONFIG_MLX5_ESWITCH + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM, + "fdb_large_groups", DEVLINK_PARAM_TYPE_U32, + BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, + mlx5_devlink_large_group_num_validate), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_ESW_PORT_METADATA, + "esw_port_metadata", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlx5_devlink_esw_port_metadata_get, + mlx5_devlink_esw_port_metadata_set, + mlx5_devlink_esw_port_metadata_validate), + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_ESW_PET_INSERT, + "esw_pet_insert", DEVLINK_PARAM_TYPE_BOOL, + BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlx5_devlink_esw_pet_insert_get, + mlx5_devlink_esw_pet_insert_set, + mlx5_devlink_esw_pet_insert_validate), +#endif + DEVLINK_PARAM_GENERIC(ENABLE_REMOTE_DEV_RESET, BIT(DEVLINK_PARAM_CMODE_RUNTIME), + mlx5_devlink_enable_remote_dev_reset_get, + mlx5_devlink_enable_remote_dev_reset_set, NULL), + DEVLINK_PARAM_GENERIC(IO_EQ_SIZE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_eq_depth_validate), + DEVLINK_PARAM_GENERIC(EVENT_EQ_SIZE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_eq_depth_validate), +}; + +static void mlx5_devlink_set_params_init_values(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + union devlink_param_value value; + + if (dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_DMFS) + strcpy(value.vstr, "dmfs"); + else + strcpy(value.vstr, "smfs"); + devlink_param_driverinit_value_set(devlink, + MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE, + value); + + value.vbool = MLX5_CAP_GEN(dev, roce); + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + value); + +#ifdef CONFIG_MLX5_ESWITCH + value.vu32 = ESW_OFFLOADS_DEFAULT_NUM_GROUPS; + devlink_param_driverinit_value_set(devlink, + MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM, + value); + + if (MLX5_ESWITCH_MANAGER(dev)) { + value.vbool = false; + devlink_param_driverinit_value_set(devlink, + MLX5_DEVLINK_PARAM_ID_ESW_PET_INSERT, + value); + } + + if (MLX5_ESWITCH_MANAGER(dev)) { + if (mlx5_esw_vport_match_metadata_supported(dev->priv.eswitch)) { + dev->priv.eswitch->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; + value.vbool = true; + } else { + value.vbool = false; + } + devlink_param_driverinit_value_set(devlink, + 
MLX5_DEVLINK_PARAM_ID_ESW_PORT_METADATA, + value); + } +#endif + + value.vu32 = MLX5_COMP_EQ_SIZE; + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, + value); + + value.vu32 = MLX5_NUM_ASYNC_EQE; + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + value); +} + +static const struct devlink_param enable_eth_param = + DEVLINK_PARAM_GENERIC(ENABLE_ETH, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, NULL); + +static int mlx5_devlink_eth_param_register(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + union devlink_param_value value; + int err; + + if (!mlx5_eth_supported(dev)) + return 0; + + err = devlink_param_register(devlink, &enable_eth_param); + if (err) + return err; + + value.vbool = true; + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + value); + return 0; +} + +static void mlx5_devlink_eth_param_unregister(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!mlx5_eth_supported(dev)) + return; + + devlink_param_unregister(devlink, &enable_eth_param); +} + +static int mlx5_devlink_enable_rdma_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + bool new_state = val.vbool; + + if (new_state && !mlx5_rdma_supported(dev)) + return -EOPNOTSUPP; + return 0; +} + +static const struct devlink_param enable_rdma_param = + DEVLINK_PARAM_GENERIC(ENABLE_RDMA, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_enable_rdma_validate); + +static int mlx5_devlink_rdma_param_register(struct devlink *devlink) +{ + union devlink_param_value value; + int err; + + if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) + return 0; + + err = devlink_param_register(devlink, &enable_rdma_param); + if (err) + return err; + + value.vbool = true; + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + value); + return 0; +} + +static void mlx5_devlink_rdma_param_unregister(struct devlink *devlink) +{ + if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND)) + return; + + devlink_param_unregister(devlink, &enable_rdma_param); +} + +static const struct devlink_param enable_vnet_param = + DEVLINK_PARAM_GENERIC(ENABLE_VNET, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, NULL); + +static int mlx5_devlink_vnet_param_register(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + union devlink_param_value value; + int err; + + if (!mlx5_vnet_supported(dev)) + return 0; + + err = devlink_param_register(devlink, &enable_vnet_param); + if (err) + return err; + + value.vbool = true; + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + value); + return 0; +} + +static void mlx5_devlink_vnet_param_unregister(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!mlx5_vnet_supported(dev)) + return; + + devlink_param_unregister(devlink, &enable_vnet_param); +} + +static int mlx5_devlink_auxdev_params_register(struct devlink *devlink) +{ + int err; + + err = mlx5_devlink_eth_param_register(devlink); + if (err) + return err; + + err = mlx5_devlink_rdma_param_register(devlink); + if (err) + goto rdma_err; + + err = mlx5_devlink_vnet_param_register(devlink); + if (err) + goto vnet_err; + return 0; + +vnet_err: + mlx5_devlink_rdma_param_unregister(devlink); +rdma_err: + mlx5_devlink_eth_param_unregister(devlink); + return 
err; +} + +static void mlx5_devlink_auxdev_params_unregister(struct devlink *devlink) +{ + mlx5_devlink_vnet_param_unregister(devlink); + mlx5_devlink_rdma_param_unregister(devlink); + mlx5_devlink_eth_param_unregister(devlink); +} + +static int mlx5_devlink_max_uc_list_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (val.vu32 == 0) { + NL_SET_ERR_MSG_MOD(extack, "max_macs value must be greater than 0"); + return -EINVAL; + } + + if (!is_power_of_2(val.vu32)) { + NL_SET_ERR_MSG_MOD(extack, "Only power of 2 values are supported for max_macs"); + return -EINVAL; + } + + if (ilog2(val.vu32) > + MLX5_CAP_GEN_MAX(dev, log_max_current_uc_list)) { + NL_SET_ERR_MSG_MOD(extack, "max_macs value is out of the supported range"); + return -EINVAL; + } + + return 0; +} + +static const struct devlink_param max_uc_list_param = + DEVLINK_PARAM_GENERIC(MAX_MACS, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), + NULL, NULL, mlx5_devlink_max_uc_list_validate); + +static int mlx5_devlink_max_uc_list_param_register(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + union devlink_param_value value; + int err; + + if (!MLX5_CAP_GEN_MAX(dev, log_max_current_uc_list_wr_supported)) + return 0; + + err = devlink_param_register(devlink, &max_uc_list_param); + if (err) + return err; + + value.vu32 = 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list); + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + value); + return 0; +} + +static void +mlx5_devlink_max_uc_list_param_unregister(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (!MLX5_CAP_GEN_MAX(dev, log_max_current_uc_list_wr_supported)) + return; + + devlink_param_unregister(devlink, &max_uc_list_param); +} + +#define MLX5_TRAP_DROP(_id, _group_id) \ + DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ + DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ + DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) + +static const struct devlink_trap mlx5_traps_arr[] = { + MLX5_TRAP_DROP(INGRESS_VLAN_FILTER, L2_DROPS), + MLX5_TRAP_DROP(DMAC_FILTER, L2_DROPS), +}; + +static const struct devlink_trap_group mlx5_trap_groups_arr[] = { + DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), +}; + +static int mlx5_devlink_traps_register(struct devlink *devlink) +{ + struct mlx5_core_dev *core_dev = devlink_priv(devlink); + int err; + + err = devlink_trap_groups_register(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); + if (err) + return err; + + err = devlink_traps_register(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr), + &core_dev->priv); + if (err) + goto err_trap_group; + return 0; + +err_trap_group: + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); + return err; +} + +static void mlx5_devlink_traps_unregister(struct devlink *devlink) +{ + devlink_traps_unregister(devlink, mlx5_traps_arr, ARRAY_SIZE(mlx5_traps_arr)); + devlink_trap_groups_unregister(devlink, mlx5_trap_groups_arr, + ARRAY_SIZE(mlx5_trap_groups_arr)); +} + +int mlx5_devlink_register(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + int err; + + err = devlink_params_register(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); + if (err) + return err; + + mlx5_devlink_set_params_init_values(devlink); + + err = mlx5_devlink_auxdev_params_register(devlink); + if (err) + goto auxdev_reg_err; + + err = 
mlx5_devlink_max_uc_list_param_register(devlink); + if (err) + goto max_uc_list_err; + + err = mlx5_devlink_traps_register(devlink); + if (err) + goto traps_reg_err; + + if (!mlx5_core_is_mp_slave(dev)) + devlink_set_features(devlink, DEVLINK_F_RELOAD); + + return 0; + +traps_reg_err: + mlx5_devlink_max_uc_list_param_unregister(devlink); +max_uc_list_err: + mlx5_devlink_auxdev_params_unregister(devlink); +auxdev_reg_err: + devlink_params_unregister(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); + return err; +} + +void mlx5_devlink_unregister(struct devlink *devlink) +{ + mlx5_devlink_traps_unregister(devlink); + mlx5_devlink_max_uc_list_param_unregister(devlink); + mlx5_devlink_auxdev_params_unregister(devlink); + devlink_params_unregister(devlink, mlx5_devlink_params, + ARRAY_SIZE(mlx5_devlink_params)); +} + +int +mlx5_devlink_ct_action_on_nat_conns_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + dev->mlx5e_res.ct.ct_action_on_nat_conns = ctx->val.vbool; + return 0; +} + +int +mlx5_devlink_ct_action_on_nat_conns_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + ctx->val.vbool = dev->mlx5e_res.ct.ct_action_on_nat_conns; + return 0; +} + +int +mlx5_devlink_ct_labels_mapping_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + dev->mlx5e_res.ct.ct_labels_mapping = ctx->val.vbool; + return 0; +} + +int +mlx5_devlink_ct_labels_mapping_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + ctx->val.vbool = dev->mlx5e_res.ct.ct_labels_mapping; + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.h new file mode 100644 index 0000000..242cd60 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/devlink.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019, Mellanox Technologies */ + +#ifndef __MLX5_DEVLINK_H__ +#define __MLX5_DEVLINK_H__ + +#include + +enum mlx5_devlink_param_id { + MLX5_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE, + MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM, + /* Non upstream devlink params */ + MLX5_DEVLINK_PARAM_ID_COMPAT_BASE = MLX5_DEVLINK_PARAM_ID_BASE + 0xFF, + MLX5_DEVLINK_PARAM_ID_CT_ACTION_ON_NAT_CONNS, + MLX5_DEVLINK_PARAM_ID_CT_MAX_OFFLOADED_CONNS, + MLX5_DEVLINK_PARAM_ID_ESW_PET_INSERT, + MLX5_DEVLINK_PARAM_ID_ESW_PORT_METADATA, +}; + +struct mlx5_trap_ctx { + int id; + int action; +}; + +struct mlx5_devlink_trap { + struct mlx5_trap_ctx trap; + void *item; + struct list_head list; +}; + +struct mlx5_core_dev; +void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, + struct devlink_port *dl_port); +int mlx5_devlink_trap_get_num_active(struct mlx5_core_dev *dev); +int mlx5_devlink_traps_get_action(struct mlx5_core_dev *dev, int trap_id, + enum devlink_trap_action *action); + +struct devlink *mlx5_devlink_alloc(struct device *dev); +void mlx5_devlink_free(struct devlink *devlink); +int mlx5_devlink_register(struct devlink *devlink); +void mlx5_devlink_unregister(struct devlink *devlink); + +int +mlx5_devlink_ct_action_on_nat_conns_set(struct 
devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); +int +mlx5_devlink_ct_action_on_nat_conns_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); + +int +mlx5_devlink_ct_labels_mapping_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); +int +mlx5_devlink_ct_labels_mapping_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); +#endif /* __MLX5_DEVLINK_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c new file mode 100644 index 0000000..28d0274 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include +#include "mlx5_core.h" +#include "lib/pci_vsc.h" +#include "lib/mlx5.h" + +#define BAD_ACCESS 0xBADACCE5 +#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7 + +static bool mlx5_crdump_enabled(struct mlx5_core_dev *dev) +{ + return !!dev->priv.health.crdump_size; +} + +static int mlx5_crdump_fill(struct mlx5_core_dev *dev, u32 *cr_data) +{ + u32 crdump_size = dev->priv.health.crdump_size; + int i, ret; + + for (i = 0; i < (crdump_size / 4); i++) + cr_data[i] = BAD_ACCESS; + + ret = mlx5_vsc_gw_read_block_fast(dev, cr_data, crdump_size); + if (ret <= 0) { + if (ret == 0) + return -EIO; + return ret; + } + + if (crdump_size != ret) { + mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n", + ret, crdump_size); + return -EINVAL; + } + + return 0; +} + +int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data) +{ + int ret; + + if (!mlx5_crdump_enabled(dev)) + return -ENODEV; + + ret = mlx5_vsc_gw_lock(dev); + if (ret) { + mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n", + ret); + return ret; + } + /* Verify no other PF is running cr-dump or sw reset */ + ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, + MLX5_VSC_LOCK); + if (ret) { + mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); + goto unlock_gw; + } + + ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL); + if (ret) + goto unlock_sem; + + ret = mlx5_crdump_fill(dev, cr_data); + +unlock_sem: + mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, MLX5_VSC_UNLOCK); +unlock_gw: + mlx5_vsc_gw_unlock(dev); + return ret; +} + +int mlx5_crdump_enable(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + u32 space_size; + int ret; + + if (!mlx5_core_is_pf(dev) || !mlx5_vsc_accessible(dev) || + mlx5_crdump_enabled(dev)) + return 0; + + ret = mlx5_vsc_gw_lock(dev); + if (ret) + return ret; + + /* Check if space is supported and get space size */ + ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, + &space_size); + if (ret) { + /* Unlock and mask error since space is not supported */ + mlx5_vsc_gw_unlock(dev); + return 0; + } + + if (!space_size) { + mlx5_core_warn(dev, "Invalid Crspace size, zero\n"); + mlx5_vsc_gw_unlock(dev); + return -EINVAL; + } + + ret = mlx5_vsc_gw_unlock(dev); + if (ret) + return ret; + + priv->health.crdump_size = space_size; + return 0; +} + +void mlx5_crdump_disable(struct mlx5_core_dev *dev) +{ + dev->priv.health.crdump_size = 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.c new file mode 100644 index 0000000..177ab2e --- 
/dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.c @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "diag_cnt.h" +#include + +static int get_supported_cnt_ids(struct mlx5_core_dev *dev); +static int enable_cnt_id(struct mlx5_core_dev *dev, u16 id); +static void reset_cnt_id(struct mlx5_core_dev *dev); +static void reset_params(struct mlx5_diag_cnt *diag_cnt); + +static ssize_t counter_id_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_diag_cnt *diag_cnt; + unsigned int temp; + char *options; + char *kbuf; + char *p; + int err; + int i; + + if (*pos) + return 0; + + diag_cnt = &dev->diag_cnt; + reset_cnt_id(dev); + + /* Collect cnt_id input. 
Quit if cnt_id does not exist */ + kbuf = kzalloc(count + 1, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, buf, count)) { + err = -EFAULT; + goto out_copy_from_usr_err; + } + + i = 0; + options = kbuf; + + while ((p = strsep(&options, ",")) != NULL && + i < MLX5_CAP_GEN(dev, num_of_diagnostic_counters)) { + if (sscanf(p, "%x", &temp) != 1) + continue; + err = enable_cnt_id(dev, temp); + if (err) + goto out_err; + i++; + } + + diag_cnt->num_cnt_id = i; + *pos = count; + + kfree(kbuf); + return count; + +out_err: + reset_cnt_id(dev); +out_copy_from_usr_err: + kfree(kbuf); + return err; +} + +static ssize_t counter_id_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_diag_cnt *diag_cnt; + char *kbuf; + int len = 0; + int i; + + diag_cnt = &dev->diag_cnt; + if (*pos || !diag_cnt->num_cnt_id) + return -EPERM; + + kbuf = kzalloc(5 * diag_cnt->num_cnt_id + 2, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + for (i = 0; i < MLX5_CAP_GEN(dev, num_of_diagnostic_counters); i++) + if (diag_cnt->cnt_id[i].enabled) + len += sprintf(kbuf + len, "%04x,", diag_cnt->cnt_id[i].id); + + if (len) { + len += sprintf(kbuf + len, "\n"); + len = min_t(int, len, count); + if (copy_to_user(buf, kbuf, len)) { + len = 0; + goto out; + } + } + +out: + kfree(kbuf); + *pos = len; + return len; +} + +static const struct file_operations counter_id_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = counter_id_write, + .read = counter_id_read, +}; + +#define NUM_OF_DIAG_PARAMS 5 +static ssize_t params_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_diag_cnt *diag_cnt; + unsigned int temp; + char *options; + char *kbuf; + char *p; + int err; + int i; + + diag_cnt = &dev->diag_cnt; + if (*pos || !diag_cnt->num_cnt_id) + return -EPERM; + + kbuf = kzalloc(count + 1, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, buf, count)) { + err = -EFAULT; + goto out_copy_from_usr_err; + } + + /* Five parameters + * log_num_of_samples (dec) + * logSamplePeriod (dec) + * flags (hex) + * num_of_samples (dec) + * sample_index (dec) + */ + i = 0; + err = -EINVAL; + options = kbuf; + reset_params(diag_cnt); + + while ((p = strsep(&options, ",")) != NULL && i < NUM_OF_DIAG_PARAMS) { + if (i == 0) { + if (sscanf(p, "%d", &temp) != 1) + goto out_err; + + if ((1 << (MLX5_CAP_DEBUG(dev, log_max_samples) - temp)) < + diag_cnt->num_cnt_id) { + mlx5_core_warn(dev, "log_num_of_samples is too big for num_cnt_id=%d\n", + diag_cnt->num_cnt_id); + goto out_err; + } else { + diag_cnt->log_num_of_samples = temp; + } + } + + if (i == 1) { + if (sscanf(p, "%d", &temp) != 1) + goto out_err; + + if (temp < MLX5_CAP_DEBUG(dev, log_min_sample_period)) { + mlx5_core_warn(dev, "log_sample_period smaller than log_min_sample_period\n"); + goto out_err; + } else { + diag_cnt->log_sample_period = temp; + } + } + + if (i == 2) { + if (sscanf(p, "%x", &temp) != 1) + goto out_err; + + if (temp > 0xFF) + goto out_err; + else + diag_cnt->flag = temp; + } + + if (i == 3) { + if (sscanf(p, "%d", &temp) != 1) + goto out_err; + + if (temp > (1 << diag_cnt->log_num_of_samples)) { + mlx5_core_warn(dev, "num_of_samples bigger than log_num_of_samples\n"); + goto out_err; + } else { + diag_cnt->num_of_samples = temp; + } + } + + if (i == 4) { + if (sscanf(p, "%d", &temp) != 1) + goto out_err; + if (temp > (1 << 
diag_cnt->log_num_of_samples)) + goto out_err; + else + diag_cnt->sample_index = temp; + } + + i++; + } + + if (i < NUM_OF_DIAG_PARAMS) + goto out_err; + + *pos = count; + kfree(kbuf); + return count; + +out_err: + reset_params(diag_cnt); +out_copy_from_usr_err: + kfree(kbuf); + return err; +} + +#define PARAM_PRINT_SZ 104 +static ssize_t params_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + char kbuf[PARAM_PRINT_SZ] = {0}; + struct mlx5_diag_cnt *diag_cnt; + int len = 0; + + if (*pos) + return 0; + + diag_cnt = &dev->diag_cnt; + + len += sprintf(kbuf + len, "log_num_of_samples=%d\n", + diag_cnt->log_num_of_samples); + len += sprintf(kbuf + len, "log_sample_period=%d\n", + diag_cnt->log_sample_period); + len += sprintf(kbuf + len, "flag=0x%02x\n", diag_cnt->flag); + len += sprintf(kbuf + len, "num_of_samples=%d\n", + diag_cnt->num_of_samples); + len += sprintf(kbuf + len, "sample_index=%d\n", + diag_cnt->sample_index); + + if (len) { + len = min_t(int, len, count); + if (copy_to_user(buf, kbuf, len)) { + len = 0; + goto out; + } + } + +out: + *pos = len; + return len; +} + +static const struct file_operations params_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = params_write, + .read = params_read, +}; + +#define DUMP_WRITE_BUF_LEN 4 +static ssize_t dump_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + char kbuf[DUMP_WRITE_BUF_LEN] = {0}; + int err; + + if (*pos || count > DUMP_WRITE_BUF_LEN) + return -EINVAL; + + if (copy_from_user(kbuf, buf, count)) + return -EFAULT; + + if (strncmp(kbuf, "set", DUMP_WRITE_BUF_LEN - 1)) + return -EINVAL; + + err = mlx5_diag_set_params(dev); + if (err) + return err; + + return count; +} + +#define SAMPLE_PRINT_SZ 36 +static int decode_cnt_buffer(u16 num_of_samples, u8 *out, size_t count, char **out_str) +{ + u16 num_samples; + char *kbuf; + void *cnt; + u64 temp; + int len; + int i; + + len = num_of_samples * SAMPLE_PRINT_SZ; + len = min_t(int, len, count); + + kbuf = kzalloc(len, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + num_samples = len / SAMPLE_PRINT_SZ; + len = 0; + for (i = 0; i < num_samples; i++) { + cnt = MLX5_ADDR_OF(query_diagnostic_cntrs_out, + out, diag_counter[i]); + temp = MLX5_GET(diagnostic_cntr_struct, cnt, counter_value_h); + temp = (temp << 32) | + MLX5_GET(diagnostic_cntr_struct, cnt, counter_value_l); + + len += sprintf(kbuf + len, + "%04x,%04x,%08x,%016llx\n", + MLX5_GET(diagnostic_cntr_struct, cnt, counter_id), + MLX5_GET(diagnostic_cntr_struct, cnt, sample_id), + MLX5_GET(diagnostic_cntr_struct, cnt, time_stamp_31_0), + temp); + } + + *out_str = kbuf; + return 0; +} + +static ssize_t dump_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + char *out_str; + u8 *out; + int err; + int len; + + if (*pos || !dev->diag_cnt.num_of_samples) + return -EPERM; + + err = mlx5_diag_query_counters(dev, &out); + if (err) + return 0; + + err = decode_cnt_buffer(dev->diag_cnt.num_of_samples * + dev->diag_cnt.num_cnt_id, + out, count, &out_str); + if (err) { + kfree(out); + return 0; + } + + len = min_t(int, strlen(out_str), count); + if (copy_to_user(buf, out_str, len)) + len = 0; + + kfree(out_str); + kfree(out); + *pos = len; + return len; +} + +static const struct file_operations dump_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = dump_write, + .read = dump_read, +}; 
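[Sketch, not part of the patch] The counter_id, params and dump debugfs files above (plus the cap file that follows) form a small how-to: read cap for the supported counter IDs and limits, write a comma-separated hex ID list to counter_id, write the five sampling parameters to params, arm the firmware by writing "set" to dump, then read dump back for the decoded samples. The user-space sketch below illustrates that flow; the debugfs path, counter IDs and parameter values are assumptions for illustration (the real IDs and limits come from the cap file, and the directory name follows the device's PCI address under the mlx5 debugfs root). Run as root with debugfs mounted.

/* Hypothetical user-space walk-through of the diag_cnt debugfs interface. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/* assumed location of this device's diag_cnt directory */
#define DIAG_DIR "/sys/kernel/debug/mlx5/0000:08:00.0/diag_cnt"

static int write_str(const char *file, const char *val)
{
	char path[256];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "%s/%s", DIAG_DIR, file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* comma-separated hex IDs, as parsed by counter_id_write(); placeholder IDs */
	if (write_str("counter_id", "0401,0402"))
		return 1;
	/* log_num_of_samples,log_sample_period,flags(hex),num_of_samples,sample_index;
	 * example values only, subject to the limits reported by the cap file */
	if (write_str("params", "10,13,84,16,0"))
		return 1;
	/* "set" pushes the configuration to FW via mlx5_diag_set_params() */
	if (write_str("dump", "set"))
		return 1;
	/* dump returns "counter_id,sample_id,time_stamp,counter_value" lines */
	fd = open(DIAG_DIR "/dump", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	return 0;
}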
+ +#define CAPABILITY_PRINT_SZ 130 +#define COUNTER_ID_PRINT_SZ 5 +static ssize_t cap_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_core_dev *dev = filp->private_data; + struct mlx5_diag_cnt *diag_cnt; + int len = 0; + char *kbuf; + int i; + + if (*pos) + return 0; + + kbuf = kzalloc(CAPABILITY_PRINT_SZ + + MLX5_CAP_GEN(dev, num_of_diagnostic_counters) * + COUNTER_ID_PRINT_SZ, + GFP_KERNEL); + if (!kbuf) + return 0; + + diag_cnt = &dev->diag_cnt; + + /* print cap */ + len += sprintf(kbuf + len, "log_max_samples=%d\n", + MLX5_CAP_DEBUG(dev, log_max_samples)); + len += sprintf(kbuf + len, "log_min_sample_period=%d\n", + MLX5_CAP_DEBUG(dev, log_min_sample_period)); + len += sprintf(kbuf + len, "repetitive=%d\n", + MLX5_CAP_DEBUG(dev, repetitive)); + len += sprintf(kbuf + len, "single=%d\n", + MLX5_CAP_DEBUG(dev, single)); + len += sprintf(kbuf + len, "num_of_diagnostic_counters=%d\n", + MLX5_CAP_GEN(dev, num_of_diagnostic_counters)); + + /* print list of supported counter */ + len += sprintf(kbuf + len, "supported counter id:\n"); + for (i = 0; i < MLX5_CAP_GEN(dev, num_of_diagnostic_counters); i++) + len += sprintf(kbuf + len, "%04x,", diag_cnt->cnt_id[i].id); + len += sprintf(kbuf + len, "\n"); + + len = min_t(int, len, count); + if (copy_to_user(buf, kbuf, len)) { + len = 0; + goto out; + } + +out: + kfree(kbuf); + *pos = len; + return len; +} + +static const struct file_operations cap_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = cap_read, +}; + +static int diag_cnt_debugfs_init(struct mlx5_core_dev *dev) +{ + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + struct dentry *entry = NULL; + + diag_cnt->debugfs = debugfs_create_dir("diag_cnt", dev->priv.dbg.dbg_root); + + if (!diag_cnt->debugfs) + return -ENOMEM; + + entry = debugfs_create_file("counter_id", 0400, diag_cnt->debugfs, + dev, &counter_id_fops); + if (!entry) + goto out_err; + + entry = debugfs_create_file("params", 0400, diag_cnt->debugfs, + dev, ¶ms_fops); + if (!entry) + goto out_err; + + entry = debugfs_create_file("dump", 0400, diag_cnt->debugfs, + dev, &dump_fops); + if (!entry) + goto out_err; + + entry = debugfs_create_file("cap", 0400, diag_cnt->debugfs, + dev, &cap_fops); + if (!entry) + goto out_err; + + return 0; + +out_err: + mlx5_diag_cnt_cleanup(dev); + return -ENOMEM; +} + +static int get_supported_cnt_ids(struct mlx5_core_dev *dev) +{ + int num_counters = MLX5_CAP_GEN(dev, num_of_diagnostic_counters); + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + int i; + + diag_cnt->cnt_id = kzalloc(sizeof(*diag_cnt->cnt_id) * num_counters, + GFP_KERNEL); + if (!diag_cnt->cnt_id) + return -ENOMEM; + + for (i = 0; i < num_counters; i++) + diag_cnt->cnt_id[i].id = + MLX5_CAP_DEBUG(dev, diagnostic_counter[i].counter_id); + + return 0; +} + +static void reset_cnt_id(struct mlx5_core_dev *dev) +{ + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + int i; + + diag_cnt->num_cnt_id = 0; + for (i = 0; i < MLX5_CAP_GEN(dev, num_of_diagnostic_counters); i++) + diag_cnt->cnt_id[i].enabled = false; +} + +static int enable_cnt_id(struct mlx5_core_dev *dev, u16 id) +{ + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + int i; + + for (i = 0; i < MLX5_CAP_GEN(dev, num_of_diagnostic_counters); i++) + if (diag_cnt->cnt_id[i].id == id) { + if (diag_cnt->cnt_id[i].enabled) + return -EINVAL; + + diag_cnt->cnt_id[i].enabled = true; + break; + } + + if (i == MLX5_CAP_GEN(dev, num_of_diagnostic_counters)) + return -ENOENT; + else + return 0; +} + +static void reset_params(struct 
mlx5_diag_cnt *diag_cnt) +{ + diag_cnt->log_num_of_samples = 0; + diag_cnt->log_sample_period = 0; + diag_cnt->flag = 0; + diag_cnt->num_of_samples = 0; + diag_cnt->sample_index = 0; +} + +int mlx5_diag_set_params(struct mlx5_core_dev *dev) +{ + u8 out[MLX5_ST_SZ_BYTES(set_diagnostic_params_out)] = {0}; + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + void *cnt_id; + void *ctx; + u16 in_sz; + int err; + u8 *in; + int i; + int j; + + if (!diag_cnt->num_cnt_id) + return -EINVAL; + + in_sz = MLX5_ST_SZ_BYTES(set_diagnostic_params_in) + + diag_cnt->num_cnt_id * MLX5_ST_SZ_BYTES(counter_id); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(set_diagnostic_params_in, in, opcode, + MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS); + + ctx = MLX5_ADDR_OF(set_diagnostic_params_in, in, + diagnostic_params_context); + MLX5_SET(diagnostic_params_context, ctx, num_of_counters, + diag_cnt->num_cnt_id); + MLX5_SET(diagnostic_params_context, ctx, log_num_of_samples, + diag_cnt->log_num_of_samples); + + MLX5_SET(diagnostic_params_context, ctx, single, + (diag_cnt->flag >> 7) & 1); + MLX5_SET(diagnostic_params_context, ctx, repetitive, + (diag_cnt->flag >> 6) & 1); + MLX5_SET(diagnostic_params_context, ctx, sync, + (diag_cnt->flag >> 5) & 1); + MLX5_SET(diagnostic_params_context, ctx, clear, + (diag_cnt->flag >> 4) & 1); + MLX5_SET(diagnostic_params_context, ctx, on_demand, + (diag_cnt->flag >> 3) & 1); + MLX5_SET(diagnostic_params_context, ctx, enable, + (diag_cnt->flag >> 2) & 1); + MLX5_SET(diagnostic_params_context, ctx, log_sample_period, + diag_cnt->log_sample_period); + + j = 0; + for (i = 0; i < MLX5_CAP_GEN(dev, num_of_diagnostic_counters); i++) { + if (diag_cnt->cnt_id[i].enabled) { + cnt_id = MLX5_ADDR_OF(diagnostic_params_context, + ctx, counter_id[j]); + MLX5_SET(counter_id, cnt_id, counter_id, + diag_cnt->cnt_id[i].id); + j++; + } + } + + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + + kfree(in); + return err; +} + +/* This function is for debug purpose */ +int mlx5_diag_query_params(struct mlx5_core_dev *dev) +{ + u8 in[MLX5_ST_SZ_BYTES(query_diagnostic_params_in)] = {0}; + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + void *cnt_id; + u16 out_sz; + void *ctx; + int err; + u8 *out; + int i; + + out_sz = MLX5_ST_SZ_BYTES(query_diagnostic_params_out) + + diag_cnt->num_cnt_id * MLX5_ST_SZ_BYTES(counter_id); + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_diagnostic_params_in, in, opcode, + MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + ctx = MLX5_ADDR_OF(query_diagnostic_params_out, out, + diagnostic_params_context); + mlx5_core_dbg(dev, "single=%x\n", + MLX5_GET(diagnostic_params_context, ctx, single)); + mlx5_core_dbg(dev, "repetitive=%x\n", + MLX5_GET(diagnostic_params_context, ctx, repetitive)); + mlx5_core_dbg(dev, "sync=%x\n", + MLX5_GET(diagnostic_params_context, ctx, sync)); + mlx5_core_dbg(dev, "clear=%x\n", + MLX5_GET(diagnostic_params_context, ctx, clear)); + mlx5_core_dbg(dev, "on_demand=%x\n", + MLX5_GET(diagnostic_params_context, ctx, on_demand)); + mlx5_core_dbg(dev, "enable=%x\n", + MLX5_GET(diagnostic_params_context, ctx, enable)); + mlx5_core_dbg(dev, "log_sample_period=%x\n", + MLX5_GET(diagnostic_params_context, ctx, + log_sample_period)); + + for (i = 0; i < diag_cnt->num_cnt_id; i++) { + cnt_id = MLX5_ADDR_OF(diagnostic_params_context, + ctx, counter_id[i]); + mlx5_core_dbg(dev, "counter_id[%d]=%x\n", i, + MLX5_GET(counter_id, 
cnt_id, counter_id)); + } +out: + kfree(out); + return err; +} + +int mlx5_diag_query_counters(struct mlx5_core_dev *dev, u8 **out_buffer) +{ + u8 in[MLX5_ST_SZ_BYTES(query_diagnostic_cntrs_in)] = {0}; + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + u16 out_sz; + u8 *out; + int err; + + out_sz = MLX5_ST_SZ_BYTES(query_diagnostic_cntrs_out) + + diag_cnt->num_of_samples * diag_cnt->num_cnt_id * + MLX5_ST_SZ_BYTES(diagnostic_cntr_struct); + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_diagnostic_cntrs_in, in, opcode, + MLX5_CMD_OP_QUERY_DIAGNOSTIC_COUNTERS); + MLX5_SET(query_diagnostic_cntrs_in, in, num_of_samples, + diag_cnt->num_of_samples); + MLX5_SET(query_diagnostic_cntrs_in, in, sample_index, + diag_cnt->sample_index); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + + if (!err) + *out_buffer = out; + else + kfree(out); + + return err; +} + +void mlx5_diag_cnt_init(struct mlx5_core_dev *dev) +{ + int err; + + if (!MLX5_DIAG_CNT_SUPPORTED(dev)) + return; + + /* Build private data */ + err = get_supported_cnt_ids(dev); + if (err) + return; + + /* Create debugfs */ + if (!dev->priv.dbg.dbg_root) + return; + + err = diag_cnt_debugfs_init(dev); + if (err) + return; +} + +void mlx5_diag_cnt_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_diag_cnt *diag_cnt = &dev->diag_cnt; + + if (!MLX5_DIAG_CNT_SUPPORTED(dev)) + return; + + if (diag_cnt->debugfs) { + debugfs_remove_recursive(diag_cnt->debugfs); + diag_cnt->debugfs = NULL; + } + + kfree(diag_cnt->cnt_id); + diag_cnt->cnt_id = NULL; + reset_params(diag_cnt); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.h new file mode 100644 index 0000000..217fd08 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/diag_cnt.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_DIAG_CNT_H__ +#define __MLX5_DIAG_CNT_H__ + +#include +#include "mlx5_core.h" + +#define MLX5_DIAG_CNT_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, debug) && \ + MLX5_CAP_GEN(mdev, num_of_diagnostic_counters)) + +void mlx5_diag_cnt_init(struct mlx5_core_dev *dev); +void mlx5_diag_cnt_cleanup(struct mlx5_core_dev *dev); + +int mlx5_diag_query_params(struct mlx5_core_dev *dev); +int mlx5_diag_set_params(struct mlx5_core_dev *dev); +int mlx5_diag_query_counters(struct mlx5_core_dev *dev, u8 **out_buffer); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h new file mode 100644 index 0000000..f15718d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_EN_REP_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_EN_REP_TP_ + +#include +#include +#include "en_rep.h" + +TRACE_EVENT(mlx5e_rep_neigh_update, + TP_PROTO(const struct mlx5e_neigh_hash_entry *nhe, const u8 *ha, + bool neigh_connected), + TP_ARGS(nhe, ha, neigh_connected), + TP_STRUCT__entry(__string(devname, nhe->neigh_dev->name) + __array(u8, ha, ETH_ALEN) + __array(u8, v4, 4) + __array(u8, v6, 16) + __field(bool, neigh_connected) + ), + TP_fast_assign(const struct mlx5e_neigh *mn = &nhe->m_neigh; + struct in6_addr *pin6; + __be32 *p32; + + __assign_str(devname, nhe->neigh_dev->name); + __entry->neigh_connected = neigh_connected; + memcpy(__entry->ha, ha, ETH_ALEN); + + p32 = (__be32 *)__entry->v4; + pin6 = (struct in6_addr *)__entry->v6; + if (mn->family == AF_INET) { + *p32 = mn->dst_ip.v4; + ipv6_addr_set_v4mapped(*p32, pin6); + } else if (mn->family == AF_INET6) { + *pin6 = mn->dst_ip.v6; + } + ), + TP_printk("netdev: %s MAC: %pM IPv4: %pI4 IPv6: %pI6c neigh_connected=%d\n", + __get_str(devname), __entry->ha, + __entry->v4, __entry->v6, __entry->neigh_connected + ) +); + +#endif /* _MLX5_EN_REP_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ./diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE en_rep_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c new file mode 100644 index 0000000..c5dc6c5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#define CREATE_TRACE_POINTS +#include "en_tc_tracepoint.h" + +void put_ids_to_array(int *ids, + const struct flow_action_entry *entries, + unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + ids[i] = entries[i].id; +} + +#define NAME_SIZE 16 + +static const char FLOWACT2STR[NUM_FLOW_ACTIONS][NAME_SIZE] = { + [FLOW_ACTION_ACCEPT] = "ACCEPT", + [FLOW_ACTION_DROP] = "DROP", + [FLOW_ACTION_TRAP] = "TRAP", + [FLOW_ACTION_GOTO] = "GOTO", + [FLOW_ACTION_REDIRECT] = "REDIRECT", + [FLOW_ACTION_MIRRED] = "MIRRED", + [FLOW_ACTION_VLAN_PUSH] = "VLAN_PUSH", + [FLOW_ACTION_VLAN_POP] = "VLAN_POP", + [FLOW_ACTION_VLAN_MANGLE] = "VLAN_MANGLE", + [FLOW_ACTION_TUNNEL_ENCAP] = "TUNNEL_ENCAP", + [FLOW_ACTION_TUNNEL_DECAP] = "TUNNEL_DECAP", + [FLOW_ACTION_MANGLE] = "MANGLE", + [FLOW_ACTION_ADD] = "ADD", + [FLOW_ACTION_CSUM] = "CSUM", + [FLOW_ACTION_MARK] = "MARK", + [FLOW_ACTION_WAKE] = "WAKE", + [FLOW_ACTION_QUEUE] = "QUEUE", + [FLOW_ACTION_SAMPLE] = "SAMPLE", + [FLOW_ACTION_POLICE] = "POLICE", + [FLOW_ACTION_CT] = "CT", +}; + +const char *parse_action(struct trace_seq *p, + int *ids, + unsigned int num) +{ + const char *ret = trace_seq_buffer_ptr(p); + unsigned int i; + + for (i = 0; i < num; i++) { + if (ids[i] < NUM_FLOW_ACTIONS) + trace_seq_printf(p, "%s ", FLOWACT2STR[ids[i]]); + else + trace_seq_printf(p, "UNKNOWN "); + } + + trace_seq_putc(p, 0); + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h new file mode 100644 index 0000000..ac52ef3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_TC_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_TC_TP_ + +#include +#include +#include +#include "en_rep.h" + +#define __parse_action(ids, num) parse_action(p, ids, num) + +void put_ids_to_array(int *ids, + const struct flow_action_entry *entries, + unsigned int num); + +const char *parse_action(struct trace_seq *p, + int *ids, + unsigned int num); + +DECLARE_EVENT_CLASS(mlx5e_flower_template, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f), + TP_STRUCT__entry(__field(void *, cookie) + __field(unsigned int, num) + __dynamic_array(int, ids, f->rule ? + f->rule->action.num_entries : 0) + ), + TP_fast_assign(__entry->cookie = (void *)f->cookie; + __entry->num = (f->rule ? + f->rule->action.num_entries : 0); + if (__entry->num) + put_ids_to_array(__get_dynamic_array(ids), + f->rule->action.entries, + f->rule->action.num_entries); + ), + TP_printk("cookie=%p actions= %s\n", + __entry->cookie, __entry->num ? 
+ __parse_action(__get_dynamic_array(ids), + __entry->num) : "NULL" + ) +); + +DEFINE_EVENT(mlx5e_flower_template, mlx5e_configure_flower, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f) + ); + +DEFINE_EVENT(mlx5e_flower_template, mlx5e_delete_flower, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f) + ); + +TRACE_EVENT(mlx5e_stats_flower, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f), + TP_STRUCT__entry(__field(void *, cookie) + __field(u64, bytes) + __field(u64, packets) + __field(u64, lastused) + ), + TP_fast_assign(__entry->cookie = (void *)f->cookie; + __entry->bytes = f->stats.bytes; + __entry->packets = f->stats.pkts; + __entry->lastused = f->stats.lastused; + ), + TP_printk("cookie=%p bytes=%llu packets=%llu lastused=%llu\n", + __entry->cookie, __entry->bytes, + __entry->packets, __entry->lastused + ) +); + +TRACE_EVENT(mlx5e_tc_update_neigh_used_value, + TP_PROTO(const struct mlx5e_neigh_hash_entry *nhe, bool neigh_used), + TP_ARGS(nhe, neigh_used), + TP_STRUCT__entry(__string(devname, nhe->neigh_dev->name) + __array(u8, v4, 4) + __array(u8, v6, 16) + __field(bool, neigh_used) + ), + TP_fast_assign(const struct mlx5e_neigh *mn = &nhe->m_neigh; + struct in6_addr *pin6; + __be32 *p32; + + __assign_str(devname, nhe->neigh_dev->name); + __entry->neigh_used = neigh_used; + + p32 = (__be32 *)__entry->v4; + pin6 = (struct in6_addr *)__entry->v6; + if (mn->family == AF_INET) { + *p32 = mn->dst_ip.v4; + ipv6_addr_set_v4mapped(*p32, pin6); + } else if (mn->family == AF_INET6) { + *pin6 = mn->dst_ip.v6; + } + ), + TP_printk("netdev: %s IPv4: %pI4 IPv6: %pI6c neigh_used=%d\n", + __get_str(devname), __entry->v4, __entry->v6, + __entry->neigh_used + ) +); + +#endif /* _MLX5_TC_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ./diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE en_tc_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c new file mode 100644 index 0000000..fec2a2f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
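Editor's note: the flower trace events declared above expand to trace_mlx5e_configure_flower(), trace_mlx5e_delete_flower() and trace_mlx5e_stats_flower(), all taking the flow_cls_offload descriptor. The helper below is hypothetical and only illustrates the calling convention.

/* Hypothetical sketch: record a flower offload request via the event above. */
static void example_note_flower_config(struct flow_cls_offload *f)
{
	trace_mlx5e_configure_flower(f);	/* logs cookie and action ids */
}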
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define CREATE_TRACE_POINTS + +#include "fs_tracepoint.h" +#include + +#define DECLARE_MASK_VAL(type, name) struct {type m; type v; } name +#define MASK_VAL(type, spec, name, mask, val, fld) \ + DECLARE_MASK_VAL(type, name) = \ + {.m = MLX5_GET(spec, mask, fld),\ + .v = MLX5_GET(spec, val, fld)} +#define MASK_VAL_BE(type, spec, name, mask, val, fld) \ + DECLARE_MASK_VAL(type, name) = \ + {.m = MLX5_GET_BE(type, spec, mask, fld),\ + .v = MLX5_GET_BE(type, spec, val, fld)} +#define GET_MASKED_VAL(name) (name.m & name.v) + +#define GET_MASK_VAL(name, type, mask, val, fld) \ + (name.m = MLX5_GET(type, mask, fld), \ + name.v = MLX5_GET(type, val, fld), \ + name.m & name.v) +#define PRINT_MASKED_VAL(name, p, format) { \ + if (name.m) \ + trace_seq_printf(p, __stringify(name) "=" format " ", name.v); \ + } +#define PRINT_MASKED_VALP(name, cast, p, format) { \ + if (name.m) \ + trace_seq_printf(p, __stringify(name) "=" format " ", \ + (cast)&name.v);\ + } + +static void print_lyr_2_4_hdrs(struct trace_seq *p, + const u32 *mask, const u32 *value) +{ +#define MASK_VAL_L2(type, name, fld) \ + MASK_VAL(type, fte_match_set_lyr_2_4, name, mask, value, fld) + DECLARE_MASK_VAL(u64, smac) = { + .m = MLX5_GET(fte_match_set_lyr_2_4, mask, smac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, mask, smac_15_0), + .v = MLX5_GET(fte_match_set_lyr_2_4, value, smac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, value, smac_15_0)}; + DECLARE_MASK_VAL(u64, dmac) = { + .m = MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_15_0), + .v = MLX5_GET(fte_match_set_lyr_2_4, value, dmac_47_16) << 16 | + MLX5_GET(fte_match_set_lyr_2_4, value, dmac_15_0)}; + MASK_VAL_L2(u16, ethertype, ethertype); + MASK_VAL_L2(u8, ip_version, ip_version); + + PRINT_MASKED_VALP(smac, u8 *, p, "%pM"); + PRINT_MASKED_VALP(dmac, u8 *, p, "%pM"); + PRINT_MASKED_VAL(ethertype, p, "%04x"); + + if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IP) || + (ip_version.m == 0xf && ip_version.v == 4)) { +#define MASK_VAL_L2_BE(type, name, fld) \ + MASK_VAL_BE(type, fte_match_set_lyr_2_4, name, mask, value, fld) + MASK_VAL_L2_BE(u32, src_ipv4, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MASK_VAL_L2_BE(u32, dst_ipv4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p, + "%pI4"); + PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p, + "%pI4"); + } else if ((ethertype.m == 0xffff && ethertype.v == ETH_P_IPV6) || + (ip_version.m == 0xf && ip_version.v == 6)) { + static const struct in6_addr full_ones = { + .in6_u.u6_addr32 = {__constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff), + __constant_htonl(0xffffffff)}, + }; + DECLARE_MASK_VAL(struct in6_addr, src_ipv6); + DECLARE_MASK_VAL(struct in6_addr, dst_ipv6); + + memcpy(src_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(src_ipv6.m)); + memcpy(dst_ipv6.m.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.m)); + memcpy(src_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(src_ipv6.v)); + 
memcpy(dst_ipv6.v.in6_u.u6_addr8, + MLX5_ADDR_OF(fte_match_set_lyr_2_4, value, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(dst_ipv6.v)); + + if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "src_ipv6=%pI6 ", + &src_ipv6.v); + if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones))) + trace_seq_printf(p, "dst_ipv6=%pI6 ", + &dst_ipv6.v); + } + +#define PRINT_MASKED_VAL_L2(type, name, fld, p, format) {\ + MASK_VAL_L2(type, name, fld); \ + PRINT_MASKED_VAL(name, p, format); \ +} + + PRINT_MASKED_VAL_L2(u8, ip_protocol, ip_protocol, p, "%02x"); + PRINT_MASKED_VAL_L2(u16, tcp_flags, tcp_flags, p, "%x"); + PRINT_MASKED_VAL_L2(u16, tcp_sport, tcp_sport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, tcp_dport, tcp_dport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, udp_sport, udp_sport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, udp_dport, udp_dport, p, "%u"); + PRINT_MASKED_VAL_L2(u16, first_vid, first_vid, p, "%04x"); + PRINT_MASKED_VAL_L2(u8, first_prio, first_prio, p, "%x"); + PRINT_MASKED_VAL_L2(u8, first_cfi, first_cfi, p, "%d"); + PRINT_MASKED_VAL_L2(u8, ip_dscp, ip_dscp, p, "%02x"); + PRINT_MASKED_VAL_L2(u8, ip_ecn, ip_ecn, p, "%x"); + PRINT_MASKED_VAL_L2(u8, cvlan_tag, cvlan_tag, p, "%d"); + PRINT_MASKED_VAL_L2(u8, svlan_tag, svlan_tag, p, "%d"); + PRINT_MASKED_VAL_L2(u8, frag, frag, p, "%d"); +} + +static void print_misc_parameters_hdrs(struct trace_seq *p, + const u32 *mask, const u32 *value) +{ +#define MASK_VAL_MISC(type, name, fld) \ + MASK_VAL(type, fte_match_set_misc, name, mask, value, fld) +#define PRINT_MASKED_VAL_MISC(type, name, fld, p, format) {\ + MASK_VAL_MISC(type, name, fld); \ + PRINT_MASKED_VAL(name, p, format); \ +} + DECLARE_MASK_VAL(u64, gre_key) = { + .m = MLX5_GET(fte_match_set_misc, mask, gre_key.nvgre.hi) << 8 | + MLX5_GET(fte_match_set_misc, mask, gre_key.nvgre.lo), + .v = MLX5_GET(fte_match_set_misc, value, gre_key.nvgre.hi) << 8 | + MLX5_GET(fte_match_set_misc, value, gre_key.nvgre.lo)}; + + PRINT_MASKED_VAL(gre_key, p, "%llu"); + PRINT_MASKED_VAL_MISC(u32, source_sqn, source_sqn, p, "%u"); + PRINT_MASKED_VAL_MISC(u16, source_port, source_port, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, outer_second_prio, outer_second_prio, + p, "%u"); + PRINT_MASKED_VAL_MISC(u8, outer_second_cfi, outer_second_cfi, p, "%u"); + PRINT_MASKED_VAL_MISC(u16, outer_second_vid, outer_second_vid, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_prio, inner_second_prio, + p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_cfi, inner_second_cfi, p, "%u"); + PRINT_MASKED_VAL_MISC(u16, inner_second_vid, inner_second_vid, p, "%u"); + + PRINT_MASKED_VAL_MISC(u8, outer_second_cvlan_tag, + outer_second_cvlan_tag, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_cvlan_tag, + inner_second_cvlan_tag, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, outer_second_svlan_tag, + outer_second_svlan_tag, p, "%u"); + PRINT_MASKED_VAL_MISC(u8, inner_second_svlan_tag, + inner_second_svlan_tag, p, "%u"); + + PRINT_MASKED_VAL_MISC(u8, gre_protocol, gre_protocol, p, "%u"); + + PRINT_MASKED_VAL_MISC(u32, vxlan_vni, vxlan_vni, p, "%u"); + PRINT_MASKED_VAL_MISC(u32, outer_ipv6_flow_label, outer_ipv6_flow_label, + p, "%x"); + PRINT_MASKED_VAL_MISC(u32, inner_ipv6_flow_label, inner_ipv6_flow_label, + p, "%x"); +} + +const char *parse_fs_hdrs(struct trace_seq *p, + u8 match_criteria_enable, + const u32 *mask_outer, + const u32 *mask_misc, + const u32 *mask_inner, + const u32 *value_outer, + const u32 *value_misc, + const u32 *value_inner) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (match_criteria_enable 
& + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS) { + trace_seq_printf(p, "[outer] "); + print_lyr_2_4_hdrs(p, mask_outer, value_outer); + } + + if (match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS) { + trace_seq_printf(p, "[misc] "); + print_misc_parameters_hdrs(p, mask_misc, value_misc); + } + if (match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS) { + trace_seq_printf(p, "[inner] "); + print_lyr_2_4_hdrs(p, mask_inner, value_inner); + } + trace_seq_putc(p, 0); + return ret; +} + +const char *parse_fs_dst(struct trace_seq *p, + const struct mlx5_flow_destination *dst, + u32 counter_id) +{ + const char *ret = trace_seq_buffer_ptr(p); + + switch (dst->type) { + case MLX5_FLOW_DESTINATION_TYPE_UPLINK: + trace_seq_printf(p, "uplink\n"); + break; + case MLX5_FLOW_DESTINATION_TYPE_VPORT: + trace_seq_printf(p, "vport=%u\n", dst->vport.num); + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + trace_seq_printf(p, "ft=%p\n", dst->ft); + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM: + trace_seq_printf(p, "ft_num=%u\n", dst->ft_num); + break; + case MLX5_FLOW_DESTINATION_TYPE_TIR: + trace_seq_printf(p, "tir=%u\n", dst->tir_num); + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + trace_seq_printf(p, "sampler_id=%u\n", dst->sampler_id); + break; + case MLX5_FLOW_DESTINATION_TYPE_COUNTER: + trace_seq_printf(p, "counter_id=%u\n", counter_id); + break; + case MLX5_FLOW_DESTINATION_TYPE_PORT: + trace_seq_printf(p, "port\n"); + break; + case MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE: + trace_seq_printf(p, "flow_table_type=%u id:%u\n", dst->ft->type, + dst->ft->id); + break; + } + + trace_seq_putc(p, 0); + return ret; +} + +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_ft); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_ft); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_fg); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fg); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_set_fte); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fte); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_rule); +EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_rule); + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h new file mode 100644 index 0000000..ddf1b87 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
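Editor's note: the PRINT_MASKED_VAL helpers above follow one convention: a match field is printed only when its mask is non-zero, so unspecified fields never clutter the trace line. The standalone userspace sketch below (not driver code, values made up) illustrates that convention.

/* Standalone illustration of the mask/value convention used by
 * PRINT_MASKED_VAL: print a field only when its mask is set.
 */
#include <stdio.h>
#include <stdint.h>

struct masked_u16 { uint16_t m, v; };

static void print_masked(const char *name, struct masked_u16 f)
{
	if (f.m)
		printf("%s=%04x ", name, f.v);
}

int main(void)
{
	struct masked_u16 ethertype = { .m = 0xffff, .v = 0x0800 };	/* matched */
	struct masked_u16 tcp_dport = { .m = 0x0000, .v = 0x0050 };	/* not matched */

	print_masked("ethertype", ethertype);	/* printed */
	print_masked("tcp_dport", tcp_dport);	/* skipped: mask is zero */
	printf("\n");
	return 0;
}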
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(_MLX5_FS_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_FS_TP_ + +#include +#include +#include "../fs_core.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#define __parse_fs_hdrs(match_criteria_enable, mouter, mmisc, minner, vouter, \ + vinner, vmisc) \ + parse_fs_hdrs(p, match_criteria_enable, mouter, mmisc, minner, vouter,\ + vinner, vmisc) + +const char *parse_fs_hdrs(struct trace_seq *p, + u8 match_criteria_enable, + const u32 *mask_outer, + const u32 *mask_misc, + const u32 *mask_inner, + const u32 *value_outer, + const u32 *value_misc, + const u32 *value_inner); + +#define __parse_fs_dst(dst, counter_id) \ + parse_fs_dst(p, (const struct mlx5_flow_destination *)dst, counter_id) + +const char *parse_fs_dst(struct trace_seq *p, + const struct mlx5_flow_destination *dst, + u32 counter_id); + +TRACE_EVENT(mlx5_fs_add_ft, + TP_PROTO(const struct mlx5_flow_table *ft), + TP_ARGS(ft), + TP_STRUCT__entry( + __field(const struct mlx5_flow_table *, ft) + __field(u32, id) + __field(u32, level) + __field(u32, type) + ), + TP_fast_assign( + __entry->ft = ft; + __entry->id = ft->id; + __entry->level = ft->level; + __entry->type = ft->type; + ), + TP_printk("ft=%p id=%u level=%u type=%u \n", + __entry->ft, __entry->id, __entry->level, __entry->type) + ); + +TRACE_EVENT(mlx5_fs_del_ft, + TP_PROTO(const struct mlx5_flow_table *ft), + TP_ARGS(ft), + TP_STRUCT__entry( + __field(const struct mlx5_flow_table *, ft) + __field(u32, id) + ), + TP_fast_assign( + __entry->ft = ft; + __entry->id = ft->id; + + ), + TP_printk("ft=%p id=%u\n", + __entry->ft, __entry->id) + ); + +TRACE_EVENT(mlx5_fs_add_fg, + TP_PROTO(const struct mlx5_flow_group *fg), + TP_ARGS(fg), + TP_STRUCT__entry( + __field(const struct mlx5_flow_group *, fg) + __field(const struct mlx5_flow_table *, ft) + __field(u32, start_index) + __field(u32, end_index) + __field(u32, id) + __field(u8, mask_enable) + __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) + ), + TP_fast_assign( + __entry->fg = fg; + fs_get_obj(__entry->ft, fg->node.parent); + __entry->start_index = fg->start_index; + __entry->end_index = fg->start_index + fg->max_ftes; + __entry->id = fg->id; + __entry->mask_enable = fg->mask.match_criteria_enable; + memcpy(__entry->mask_outer, + MLX5_ADDR_OF(fte_match_param, + &fg->mask.match_criteria, + outer_headers), + sizeof(__entry->mask_outer)); + memcpy(__entry->mask_inner, + MLX5_ADDR_OF(fte_match_param, + &fg->mask.match_criteria, + inner_headers), + sizeof(__entry->mask_inner)); + memcpy(__entry->mask_misc, + MLX5_ADDR_OF(fte_match_param, + &fg->mask.match_criteria, + misc_parameters), + sizeof(__entry->mask_misc)); + + ), + TP_printk("fg=%p ft=%p id=%u start=%u end=%u bit_mask=%02x %s\n", + __entry->fg, __entry->ft, __entry->id, + __entry->start_index, __entry->end_index, + __entry->mask_enable, + __parse_fs_hdrs(__entry->mask_enable, + __entry->mask_outer, + 
__entry->mask_misc, + __entry->mask_inner, + __entry->mask_outer, + __entry->mask_misc, + __entry->mask_inner)) + ); + +TRACE_EVENT(mlx5_fs_del_fg, + TP_PROTO(const struct mlx5_flow_group *fg), + TP_ARGS(fg), + TP_STRUCT__entry( + __field(const struct mlx5_flow_group *, fg) + __field(u32, id) + ), + TP_fast_assign( + __entry->fg = fg; + __entry->id = fg->id; + + ), + TP_printk("fg=%p id=%u\n", + __entry->fg, __entry->id) + ); + +#define ACTION_FLAGS \ + {MLX5_FLOW_CONTEXT_ACTION_ALLOW, "ALLOW"},\ + {MLX5_FLOW_CONTEXT_ACTION_DROP, "DROP"},\ + {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, "FWD"},\ + {MLX5_FLOW_CONTEXT_ACTION_COUNT, "CNT"},\ + {MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT, "REFORMAT"},\ + {MLX5_FLOW_CONTEXT_ACTION_DECAP, "DECAP"},\ + {MLX5_FLOW_CONTEXT_ACTION_MOD_HDR, "MOD_HDR"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH, "VLAN_PUSH"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_POP, "VLAN_POP"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2, "VLAN_PUSH_2"},\ + {MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2, "VLAN_POP_2"},\ + {MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO, "NEXT_PRIO"} + +TRACE_EVENT(mlx5_fs_set_fte, + TP_PROTO(const struct fs_fte *fte, int new_fte), + TP_ARGS(fte, new_fte), + TP_STRUCT__entry( + __field(const struct fs_fte *, fte) + __field(const struct mlx5_flow_group *, fg) + __field(u32, group_index) + __field(u32, index) + __field(u32, action) + __field(u32, flow_tag) + __field(u32, flow_source) + __field(u8, mask_enable) + __field(int, new_fte) + __array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) + __array(u32, value_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, value_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4)) + __array(u32, value_misc, MLX5_ST_SZ_DW(fte_match_set_misc)) + ), + TP_fast_assign( + __entry->fte = fte; + __entry->new_fte = new_fte; + fs_get_obj(__entry->fg, fte->node.parent); + __entry->group_index = __entry->fg->id; + __entry->index = fte->index; + __entry->action = fte->action.action; + __entry->mask_enable = __entry->fg->mask.match_criteria_enable; + __entry->flow_tag = fte->flow_context.flow_tag; + __entry->flow_source = fte->flow_context.flow_source; + memcpy(__entry->mask_outer, + MLX5_ADDR_OF(fte_match_param, + &__entry->fg->mask.match_criteria, + outer_headers), + sizeof(__entry->mask_outer)); + memcpy(__entry->mask_inner, + MLX5_ADDR_OF(fte_match_param, + &__entry->fg->mask.match_criteria, + inner_headers), + sizeof(__entry->mask_inner)); + memcpy(__entry->mask_misc, + MLX5_ADDR_OF(fte_match_param, + &__entry->fg->mask.match_criteria, + misc_parameters), + sizeof(__entry->mask_misc)); + memcpy(__entry->value_outer, + MLX5_ADDR_OF(fte_match_param, + &fte->val, + outer_headers), + sizeof(__entry->value_outer)); + memcpy(__entry->value_inner, + MLX5_ADDR_OF(fte_match_param, + &fte->val, + inner_headers), + sizeof(__entry->value_inner)); + memcpy(__entry->value_misc, + MLX5_ADDR_OF(fte_match_param, + &fte->val, + misc_parameters), + sizeof(__entry->value_misc)); + + ), + TP_printk("op=%s fte=%p fg=%p index=%u group_index=%u action=<%s> flow_tag=%x %s\n", + __entry->new_fte ? 
"add" : "set", + __entry->fte, __entry->fg, __entry->index, + __entry->group_index, __print_flags(__entry->action, "|", + ACTION_FLAGS), + __entry->flow_tag, + __parse_fs_hdrs(__entry->mask_enable, + __entry->mask_outer, + __entry->mask_misc, + __entry->mask_inner, + __entry->value_outer, + __entry->value_misc, + __entry->value_inner)) + ); + +TRACE_EVENT(mlx5_fs_del_fte, + TP_PROTO(const struct fs_fte *fte), + TP_ARGS(fte), + TP_STRUCT__entry( + __field(const struct fs_fte *, fte) + __field(u32, index) + ), + TP_fast_assign( + __entry->fte = fte; + __entry->index = fte->index; + + ), + TP_printk("fte=%p index=%u\n", + __entry->fte, __entry->index) + ); + +TRACE_EVENT(mlx5_fs_add_rule, + TP_PROTO(const struct mlx5_flow_rule *rule), + TP_ARGS(rule), + TP_STRUCT__entry( + __field(const struct mlx5_flow_rule *, rule) + __field(const struct fs_fte *, fte) + __field(u32, sw_action) + __field(u32, index) + __field(u32, counter_id) + __array(u8, destination, sizeof(struct mlx5_flow_destination)) + ), + TP_fast_assign( + __entry->rule = rule; + fs_get_obj(__entry->fte, rule->node.parent); + __entry->index = __entry->fte->dests_size - 1; + __entry->sw_action = rule->sw_action; + memcpy(__entry->destination, + &rule->dest_attr, + sizeof(__entry->destination)); + if (rule->dest_attr.type & + MLX5_FLOW_DESTINATION_TYPE_COUNTER) + __entry->counter_id = + rule->dest_attr.counter_id; + ), + TP_printk("rule=%p fte=%p index=%u sw_action=<%s> [dst] %s\n", + __entry->rule, __entry->fte, __entry->index, + __print_flags(__entry->sw_action, "|", ACTION_FLAGS), + __parse_fs_dst(__entry->destination, __entry->counter_id)) + ); + +TRACE_EVENT(mlx5_fs_del_rule, + TP_PROTO(const struct mlx5_flow_rule *rule), + TP_ARGS(rule), + TP_STRUCT__entry( + __field(const struct mlx5_flow_rule *, rule) + __field(const struct fs_fte *, fte) + ), + TP_fast_assign( + __entry->rule = rule; + fs_get_obj(__entry->fte, rule->node.parent); + ), + TP_printk("rule=%p fte=%p\n", + __entry->rule, __entry->fte) + ); +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ./diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE fs_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c new file mode 100644 index 0000000..25b1ae5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -0,0 +1,1153 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define CREATE_TRACE_POINTS +#include "lib/eq.h" +#include "fw_tracer.h" +#include "fw_tracer_tracepoint.h" + +static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer) +{ + u32 *string_db_base_address_out = tracer->str_db.base_address_out; + u32 *string_db_size_out = tracer->str_db.size_out; + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + void *mtrc_cap_sp; + int err, i; + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CAP, 0, 0); + if (err) { + mlx5_core_warn(dev, "FWTracer: Error reading tracer caps %d\n", + err); + return err; + } + + if (!MLX5_GET(mtrc_cap, out, trace_to_memory)) { + mlx5_core_dbg(dev, "FWTracer: Device does not support logging traces to memory\n"); + return -ENOTSUPP; + } + + tracer->trc_ver = MLX5_GET(mtrc_cap, out, trc_ver); + tracer->str_db.first_string_trace = + MLX5_GET(mtrc_cap, out, first_string_trace); + tracer->str_db.num_string_trace = + MLX5_GET(mtrc_cap, out, num_string_trace); + tracer->str_db.num_string_db = MLX5_GET(mtrc_cap, out, num_string_db); + tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner); + + for (i = 0; i < tracer->str_db.num_string_db; i++) { + mtrc_cap_sp = MLX5_ADDR_OF(mtrc_cap, out, string_db_param[i]); + string_db_base_address_out[i] = MLX5_GET(mtrc_string_db_param, + mtrc_cap_sp, + string_db_base_address); + string_db_size_out[i] = MLX5_GET(mtrc_string_db_param, + mtrc_cap_sp, string_db_size); + } + + return err; +} + +static int mlx5_set_mtrc_caps_trace_owner(struct mlx5_fw_tracer *tracer, + u32 *out, u32 out_size, + u8 trace_owner) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + + MLX5_SET(mtrc_cap, in, trace_owner, trace_owner); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, out_size, + MLX5_REG_MTRC_CAP, 0, 1); +} + +static int mlx5_fw_tracer_ownership_acquire(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + int err; + + err = mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out), + MLX5_FW_TRACER_ACQUIRE_OWNERSHIP); + if (err) { + mlx5_core_warn(dev, "FWTracer: Acquire tracer ownership failed %d\n", + err); + return err; + } + + tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner); + + if (!tracer->owner) + return -EBUSY; + + return 0; +} + +static void mlx5_fw_tracer_ownership_release(struct mlx5_fw_tracer *tracer) +{ + u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + + mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out), + MLX5_FW_TRACER_RELEASE_OWNERSHIP); + tracer->owner = false; +} + +static int mlx5_fw_tracer_create_log_buf(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + struct device *ddev; + dma_addr_t dma; + void *buff; + gfp_t gfp; + int err; + + tracer->buff.size = TRACE_BUFFER_SIZE_BYTE; + + gfp = GFP_KERNEL | __GFP_ZERO; + buff = (void *)__get_free_pages(gfp, + get_order(tracer->buff.size)); + if (!buff) { + err = -ENOMEM; + mlx5_core_warn(dev, 
"FWTracer: Failed to allocate pages, %d\n", err); + return err; + } + tracer->buff.log_buf = buff; + + ddev = mlx5_core_dma_dev(dev); + dma = dma_map_single(ddev, buff, tracer->buff.size, DMA_FROM_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_core_warn(dev, "FWTracer: Unable to map DMA: %d\n", + dma_mapping_error(ddev, dma)); + err = -ENOMEM; + goto free_pages; + } + tracer->buff.dma = dma; + + return 0; + +free_pages: + free_pages((unsigned long)tracer->buff.log_buf, get_order(tracer->buff.size)); + + return err; +} + +static void mlx5_fw_tracer_destroy_log_buf(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + struct device *ddev; + + if (!tracer->buff.log_buf) + return; + + ddev = mlx5_core_dma_dev(dev); + dma_unmap_single(ddev, tracer->buff.dma, tracer->buff.size, DMA_FROM_DEVICE); + free_pages((unsigned long)tracer->buff.log_buf, get_order(tracer->buff.size)); +} + +static int mlx5_fw_tracer_create_mkey(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + int err, inlen, i; + __be64 *mtt; + void *mkc; + u32 *in; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(*mtt) * round_up(TRACER_BUFFER_PAGE_NUM, 2); + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + DIV_ROUND_UP(TRACER_BUFFER_PAGE_NUM, 2)); + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + for (i = 0 ; i < TRACER_BUFFER_PAGE_NUM ; i++) + mtt[i] = cpu_to_be64(tracer->buff.dma + i * PAGE_SIZE); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, pd, tracer->buff.pdn); + MLX5_SET(mkc, mkc, bsf_octword_size, 0); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + MLX5_SET(mkc, mkc, translations_octword_size, + DIV_ROUND_UP(TRACER_BUFFER_PAGE_NUM, 2)); + MLX5_SET64(mkc, mkc, start_addr, tracer->buff.dma); + MLX5_SET64(mkc, mkc, len, tracer->buff.size); + err = mlx5_core_create_mkey(dev, &tracer->buff.mkey, in, inlen); + if (err) + mlx5_core_warn(dev, "FWTracer: Failed to create mkey, %d\n", err); + + kvfree(in); + + return err; +} + +static void mlx5_fw_tracer_free_strings_db(struct mlx5_fw_tracer *tracer) +{ + u32 num_string_db = tracer->str_db.num_string_db; + int i; + + for (i = 0; i < num_string_db; i++) { + kfree(tracer->str_db.buffer[i]); + tracer->str_db.buffer[i] = NULL; + } +} + +static int mlx5_fw_tracer_allocate_strings_db(struct mlx5_fw_tracer *tracer) +{ + u32 *string_db_size_out = tracer->str_db.size_out; + u32 num_string_db = tracer->str_db.num_string_db; + int i; + + for (i = 0; i < num_string_db; i++) { + tracer->str_db.buffer[i] = kzalloc(string_db_size_out[i], GFP_KERNEL); + if (!tracer->str_db.buffer[i]) + goto free_strings_db; + } + + return 0; + +free_strings_db: + mlx5_fw_tracer_free_strings_db(tracer); + return -ENOMEM; +} + +static void +mlx5_fw_tracer_init_saved_traces_array(struct mlx5_fw_tracer *tracer) +{ + tracer->st_arr.saved_traces_index = 0; + mutex_init(&tracer->st_arr.lock); +} + +static void +mlx5_fw_tracer_clean_saved_traces_array(struct mlx5_fw_tracer *tracer) +{ + mutex_destroy(&tracer->st_arr.lock); +} + +static void mlx5_tracer_read_strings_db(struct work_struct *work) +{ + struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer, + read_fw_strings_work); + u32 num_of_reads, num_string_db = tracer->str_db.num_string_db; 
+ struct mlx5_core_dev *dev = tracer->dev; + u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0}; + u32 leftovers, offset; + int err = 0, i, j; + u32 *out, outlen; + void *out_value; + + outlen = MLX5_ST_SZ_BYTES(mtrc_stdb) + STRINGS_DB_READ_SIZE_BYTES; + out = kzalloc(outlen, GFP_KERNEL); + if (!out) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < num_string_db; i++) { + offset = 0; + MLX5_SET(mtrc_stdb, in, string_db_index, i); + num_of_reads = tracer->str_db.size_out[i] / + STRINGS_DB_READ_SIZE_BYTES; + leftovers = (tracer->str_db.size_out[i] % + STRINGS_DB_READ_SIZE_BYTES) / + STRINGS_DB_LEFTOVER_SIZE_BYTES; + + MLX5_SET(mtrc_stdb, in, read_size, STRINGS_DB_READ_SIZE_BYTES); + for (j = 0; j < num_of_reads; j++) { + MLX5_SET(mtrc_stdb, in, start_offset, offset); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + outlen, MLX5_REG_MTRC_STDB, + 0, 1); + if (err) { + mlx5_core_dbg(dev, "FWTracer: Failed to read strings DB %d\n", + err); + goto out_free; + } + + out_value = MLX5_ADDR_OF(mtrc_stdb, out, string_db_data); + memcpy(tracer->str_db.buffer[i] + offset, out_value, + STRINGS_DB_READ_SIZE_BYTES); + offset += STRINGS_DB_READ_SIZE_BYTES; + } + + /* Strings database is aligned to 64, need to read leftovers*/ + MLX5_SET(mtrc_stdb, in, read_size, + STRINGS_DB_LEFTOVER_SIZE_BYTES); + for (j = 0; j < leftovers; j++) { + MLX5_SET(mtrc_stdb, in, start_offset, offset); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + outlen, MLX5_REG_MTRC_STDB, + 0, 1); + if (err) { + mlx5_core_dbg(dev, "FWTracer: Failed to read strings DB %d\n", + err); + goto out_free; + } + + out_value = MLX5_ADDR_OF(mtrc_stdb, out, string_db_data); + memcpy(tracer->str_db.buffer[i] + offset, out_value, + STRINGS_DB_LEFTOVER_SIZE_BYTES); + offset += STRINGS_DB_LEFTOVER_SIZE_BYTES; + } + } + + tracer->str_db.loaded = true; + +out_free: + kfree(out); +out: + return; +} + +static void mlx5_fw_tracer_arm(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; + int err; + + MLX5_SET(mtrc_ctrl, in, arm_event, 1); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CTRL, 0, 1); + if (err) + mlx5_core_warn(dev, "FWTracer: Failed to arm tracer event %d\n", err); +} + +static const char *VAL_PARM = "%llx"; +static const char *REPLACE_64_VAL_PARM = "%x%x"; +static const char *PARAM_CHAR = "%"; + +static int mlx5_tracer_message_hash(u32 message_id) +{ + return jhash_1word(message_id, 0) & (MESSAGE_HASH_SIZE - 1); +} + +static struct tracer_string_format *mlx5_tracer_message_insert(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event) +{ + struct hlist_head *head = + &tracer->hash[mlx5_tracer_message_hash(tracer_event->string_event.tmsn)]; + struct tracer_string_format *cur_string; + + cur_string = kzalloc(sizeof(*cur_string), GFP_KERNEL); + if (!cur_string) + return NULL; + + hlist_add_head(&cur_string->hlist, head); + + return cur_string; +} + +static struct tracer_string_format *mlx5_tracer_get_string(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event) +{ + struct tracer_string_format *cur_string; + u32 str_ptr, offset; + int i; + + str_ptr = tracer_event->string_event.string_param; + + for (i = 0; i < tracer->str_db.num_string_db; i++) { + if (str_ptr > tracer->str_db.base_address_out[i] && + str_ptr < tracer->str_db.base_address_out[i] + + tracer->str_db.size_out[i]) { + offset = str_ptr - tracer->str_db.base_address_out[i]; + /* add it to the hash */ + cur_string = mlx5_tracer_message_insert(tracer, 
tracer_event); + if (!cur_string) + return NULL; + cur_string->string = (char *)(tracer->str_db.buffer[i] + + offset); + return cur_string; + } + } + + return NULL; +} + +static void mlx5_tracer_clean_message(struct tracer_string_format *str_frmt) +{ + hlist_del(&str_frmt->hlist); + kfree(str_frmt); +} + +static int mlx5_tracer_get_num_of_params(char *str) +{ + char *substr, *pstr = str; + int num_of_params = 0; + + /* replace %llx with %x%x */ + substr = strstr(pstr, VAL_PARM); + while (substr) { + memcpy(substr, REPLACE_64_VAL_PARM, 4); + pstr = substr; + substr = strstr(pstr, VAL_PARM); + } + + /* count all the % characters */ + substr = strstr(str, PARAM_CHAR); + while (substr) { + num_of_params += 1; + str = substr + 1; + substr = strstr(str, PARAM_CHAR); + } + + return num_of_params; +} + +static struct tracer_string_format *mlx5_tracer_message_find(struct hlist_head *head, + u8 event_id, u32 tmsn) +{ + struct tracer_string_format *message; + + hlist_for_each_entry(message, head, hlist) + if (message->event_id == event_id && message->tmsn == tmsn) + return message; + + return NULL; +} + +static struct tracer_string_format *mlx5_tracer_message_get(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event) +{ + struct hlist_head *head = + &tracer->hash[mlx5_tracer_message_hash(tracer_event->string_event.tmsn)]; + + return mlx5_tracer_message_find(head, tracer_event->event_id, tracer_event->string_event.tmsn); +} + +static void poll_trace(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event, u64 *trace) +{ + u32 timestamp_low, timestamp_mid, timestamp_high, urts; + + tracer_event->event_id = MLX5_GET(tracer_event, trace, event_id); + tracer_event->lost_event = MLX5_GET(tracer_event, trace, lost); + + switch (tracer_event->event_id) { + case TRACER_EVENT_TYPE_TIMESTAMP: + tracer_event->type = TRACER_EVENT_TYPE_TIMESTAMP; + urts = MLX5_GET(tracer_timestamp_event, trace, urts); + if (tracer->trc_ver == 0) + tracer_event->timestamp_event.unreliable = !!(urts >> 2); + else + tracer_event->timestamp_event.unreliable = !!(urts & 1); + + timestamp_low = MLX5_GET(tracer_timestamp_event, + trace, timestamp7_0); + timestamp_mid = MLX5_GET(tracer_timestamp_event, + trace, timestamp39_8); + timestamp_high = MLX5_GET(tracer_timestamp_event, + trace, timestamp52_40); + + tracer_event->timestamp_event.timestamp = + ((u64)timestamp_high << 40) | + ((u64)timestamp_mid << 8) | + (u64)timestamp_low; + break; + default: + if (tracer_event->event_id >= tracer->str_db.first_string_trace || + tracer_event->event_id <= tracer->str_db.first_string_trace + + tracer->str_db.num_string_trace) { + tracer_event->type = TRACER_EVENT_TYPE_STRING; + tracer_event->string_event.timestamp = + MLX5_GET(tracer_string_event, trace, timestamp); + tracer_event->string_event.string_param = + MLX5_GET(tracer_string_event, trace, string_param); + tracer_event->string_event.tmsn = + MLX5_GET(tracer_string_event, trace, tmsn); + tracer_event->string_event.tdsn = + MLX5_GET(tracer_string_event, trace, tdsn); + } else { + tracer_event->type = TRACER_EVENT_TYPE_UNRECOGNIZED; + } + break; + } +} + +static u64 get_block_timestamp(struct mlx5_fw_tracer *tracer, u64 *ts_event) +{ + struct tracer_event tracer_event; + u8 event_id; + + event_id = MLX5_GET(tracer_event, ts_event, event_id); + + if (event_id == TRACER_EVENT_TYPE_TIMESTAMP) + poll_trace(tracer, &tracer_event, ts_event); + else + tracer_event.timestamp_event.timestamp = 0; + + return tracer_event.timestamp_event.timestamp; +} + +static void 
mlx5_fw_tracer_clean_print_hash(struct mlx5_fw_tracer *tracer) +{ + struct tracer_string_format *str_frmt; + struct hlist_node *n; + int i; + + for (i = 0; i < MESSAGE_HASH_SIZE; i++) { + hlist_for_each_entry_safe(str_frmt, n, &tracer->hash[i], hlist) + mlx5_tracer_clean_message(str_frmt); + } +} + +static void mlx5_fw_tracer_clean_ready_list(struct mlx5_fw_tracer *tracer) +{ + struct tracer_string_format *str_frmt, *tmp_str; + + list_for_each_entry_safe(str_frmt, tmp_str, &tracer->ready_strings_list, + list) + list_del(&str_frmt->list); +} + +static void mlx5_fw_tracer_save_trace(struct mlx5_fw_tracer *tracer, + u64 timestamp, bool lost, + u8 event_id, char *msg) +{ + struct mlx5_fw_trace_data *trace_data; + + mutex_lock(&tracer->st_arr.lock); + trace_data = &tracer->st_arr.straces[tracer->st_arr.saved_traces_index]; + trace_data->timestamp = timestamp; + trace_data->lost = lost; + trace_data->event_id = event_id; + strscpy_pad(trace_data->msg, msg, TRACE_STR_MSG); + + tracer->st_arr.saved_traces_index = + (tracer->st_arr.saved_traces_index + 1) & (SAVED_TRACES_NUM - 1); + mutex_unlock(&tracer->st_arr.lock); +} + +static noinline +void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt, + struct mlx5_core_dev *dev, + u64 trace_timestamp) +{ + char tmp[512]; + + snprintf(tmp, sizeof(tmp), str_frmt->string, + str_frmt->params[0], + str_frmt->params[1], + str_frmt->params[2], + str_frmt->params[3], + str_frmt->params[4], + str_frmt->params[5], + str_frmt->params[6]); + + trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost, + str_frmt->event_id, tmp); + + mlx5_fw_tracer_save_trace(dev->tracer, trace_timestamp, + str_frmt->lost, str_frmt->event_id, tmp); + + /* remove it from hash */ + mlx5_tracer_clean_message(str_frmt); +} + +static int mlx5_tracer_handle_string_trace(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event) +{ + struct tracer_string_format *cur_string; + + if (tracer_event->string_event.tdsn == 0) { + cur_string = mlx5_tracer_get_string(tracer, tracer_event); + if (!cur_string) + return -1; + + cur_string->num_of_params = mlx5_tracer_get_num_of_params(cur_string->string); + cur_string->last_param_num = 0; + cur_string->event_id = tracer_event->event_id; + cur_string->tmsn = tracer_event->string_event.tmsn; + cur_string->timestamp = tracer_event->string_event.timestamp; + cur_string->lost = tracer_event->lost_event; + if (cur_string->num_of_params == 0) /* trace with no params */ + list_add_tail(&cur_string->list, &tracer->ready_strings_list); + } else { + cur_string = mlx5_tracer_message_get(tracer, tracer_event); + if (!cur_string) { + pr_debug("%s Got string event for unknown string tdsm: %d\n", + __func__, tracer_event->string_event.tmsn); + return -1; + } + cur_string->last_param_num += 1; + if (cur_string->last_param_num > TRACER_MAX_PARAMS) { + pr_debug("%s Number of params exceeds the max (%d)\n", + __func__, TRACER_MAX_PARAMS); + list_add_tail(&cur_string->list, &tracer->ready_strings_list); + return 0; + } + /* keep the new parameter */ + cur_string->params[cur_string->last_param_num - 1] = + tracer_event->string_event.string_param; + if (cur_string->last_param_num == cur_string->num_of_params) + list_add_tail(&cur_string->list, &tracer->ready_strings_list); + } + + return 0; +} + +static void mlx5_tracer_handle_timestamp_trace(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event) +{ + struct tracer_timestamp_event timestamp_event = + tracer_event->timestamp_event; + struct tracer_string_format *str_frmt, *tmp_str; + struct 
mlx5_core_dev *dev = tracer->dev; + u64 trace_timestamp; + + list_for_each_entry_safe(str_frmt, tmp_str, &tracer->ready_strings_list, list) { + list_del(&str_frmt->list); + if (str_frmt->timestamp < (timestamp_event.timestamp & MASK_6_0)) + trace_timestamp = (timestamp_event.timestamp & MASK_52_7) | + (str_frmt->timestamp & MASK_6_0); + else + trace_timestamp = ((timestamp_event.timestamp & MASK_52_7) - 1) | + (str_frmt->timestamp & MASK_6_0); + + mlx5_tracer_print_trace(str_frmt, dev, trace_timestamp); + } +} + +static int mlx5_tracer_handle_trace(struct mlx5_fw_tracer *tracer, + struct tracer_event *tracer_event) +{ + if (tracer_event->type == TRACER_EVENT_TYPE_STRING) { + mlx5_tracer_handle_string_trace(tracer, tracer_event); + } else if (tracer_event->type == TRACER_EVENT_TYPE_TIMESTAMP) { + if (!tracer_event->timestamp_event.unreliable) + mlx5_tracer_handle_timestamp_trace(tracer, tracer_event); + } else { + pr_debug("%s Got unrecognised type %d for parsing, exiting..\n", + __func__, tracer_event->type); + } + return 0; +} + +static void mlx5_fw_tracer_handle_traces(struct work_struct *work) +{ + struct mlx5_fw_tracer *tracer = + container_of(work, struct mlx5_fw_tracer, handle_traces_work); + u64 block_timestamp, last_block_timestamp, tmp_trace_block[TRACES_PER_BLOCK]; + u32 block_count, start_offset, prev_start_offset, prev_consumer_index; + u32 trace_event_size = MLX5_ST_SZ_BYTES(tracer_event); + struct mlx5_core_dev *dev = tracer->dev; + struct tracer_event tracer_event; + int i; + + mlx5_core_dbg(dev, "FWTracer: Handle Trace event, owner=(%d)\n", tracer->owner); + if (!tracer->owner) + return; + + if (unlikely(!tracer->str_db.loaded)) + goto arm; + + block_count = tracer->buff.size / TRACER_BLOCK_SIZE_BYTE; + start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE; + + /* Copy the block to local buffer to avoid HW override while being processed */ + memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset, + TRACER_BLOCK_SIZE_BYTE); + + block_timestamp = + get_block_timestamp(tracer, &tmp_trace_block[TRACES_PER_BLOCK - 1]); + + while (block_timestamp > tracer->last_timestamp) { + /* Check block override if it's not the first block */ + if (!tracer->last_timestamp) { + u64 *ts_event; + /* To avoid block override be the HW in case of buffer + * wraparound, the time stamp of the previous block + * should be compared to the last timestamp handled + * by the driver. + */ + prev_consumer_index = + (tracer->buff.consumer_index - 1) & (block_count - 1); + prev_start_offset = prev_consumer_index * TRACER_BLOCK_SIZE_BYTE; + + ts_event = tracer->buff.log_buf + prev_start_offset + + (TRACES_PER_BLOCK - 1) * trace_event_size; + last_block_timestamp = get_block_timestamp(tracer, ts_event); + /* If previous timestamp different from last stored + * timestamp then there is a good chance that the + * current buffer is overwritten and therefore should + * not be parsed. 
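Editor's note: in the timestamp handling above, a string event carries only the low seven timestamp bits, while bits 52..7 come from the surrounding timestamp event via MASK_52_7 and MASK_6_0. The standalone sketch below (not driver code, values assumed) reproduces that stitching.

/* Standalone illustration of combining a block timestamp with the low bits
 * carried by a string event, mirroring the branches in the driver above.
 */
#include <stdio.h>
#include <stdint.h>

#define MASK_52_7	0x1FFFFFFFFFFF80ULL
#define MASK_6_0	0x7FULL

int main(void)
{
	uint64_t block_ts = 0x12345E9;	/* low 7 bits are 0x69 */
	uint64_t str_low  = 0x30;	/* low 7 bits from the string event */
	uint64_t full;

	if (str_low < (block_ts & MASK_6_0))
		full = (block_ts & MASK_52_7) | (str_low & MASK_6_0);
	else	/* low bits wrapped; mirrors the driver's else branch */
		full = ((block_ts & MASK_52_7) - 1) | (str_low & MASK_6_0);

	printf("full timestamp = 0x%llx\n", (unsigned long long)full); /* 0x12345b0 */
	return 0;
}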
+ */ + if (tracer->last_timestamp != last_block_timestamp) { + mlx5_core_warn(dev, "FWTracer: Events were lost\n"); + tracer->last_timestamp = block_timestamp; + tracer->buff.consumer_index = + (tracer->buff.consumer_index + 1) & (block_count - 1); + break; + } + } + + /* Parse events */ + for (i = 0; i < TRACES_PER_BLOCK ; i++) { + poll_trace(tracer, &tracer_event, &tmp_trace_block[i]); + mlx5_tracer_handle_trace(tracer, &tracer_event); + } + + tracer->buff.consumer_index = + (tracer->buff.consumer_index + 1) & (block_count - 1); + + tracer->last_timestamp = block_timestamp; + start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE; + memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset, + TRACER_BLOCK_SIZE_BYTE); + block_timestamp = get_block_timestamp(tracer, + &tmp_trace_block[TRACES_PER_BLOCK - 1]); + } + +arm: + mlx5_fw_tracer_arm(dev); +} + +static int mlx5_fw_tracer_set_mtrc_conf(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_conf)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_conf)] = {0}; + int err; + + MLX5_SET(mtrc_conf, in, trace_mode, TRACE_TO_MEMORY); + MLX5_SET(mtrc_conf, in, log_trace_buffer_size, + ilog2(TRACER_BUFFER_PAGE_NUM)); + MLX5_SET(mtrc_conf, in, trace_mkey, tracer->buff.mkey); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CONF, 0, 1); + if (err) + mlx5_core_warn(dev, "FWTracer: Failed to set tracer configurations %d\n", err); + + return err; +} + +static int mlx5_fw_tracer_set_mtrc_ctrl(struct mlx5_fw_tracer *tracer, u8 status, u8 arm) +{ + struct mlx5_core_dev *dev = tracer->dev; + u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0}; + int err; + + MLX5_SET(mtrc_ctrl, in, modify_field_select, TRACE_STATUS); + MLX5_SET(mtrc_ctrl, in, trace_status, status); + MLX5_SET(mtrc_ctrl, in, arm_event, arm); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MTRC_CTRL, 0, 1); + + if (!err && status) + tracer->last_timestamp = 0; + + return err; +} + +static int mlx5_fw_tracer_start(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev = tracer->dev; + int err; + + err = mlx5_fw_tracer_ownership_acquire(tracer); + if (err) { + mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err); + /* Don't fail since ownership can be acquired on a later FW event */ + return 0; + } + + err = mlx5_fw_tracer_set_mtrc_conf(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to set tracer configuration %d\n", err); + goto release_ownership; + } + + /* enable tracer & trace events */ + err = mlx5_fw_tracer_set_mtrc_ctrl(tracer, 1, 1); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to enable tracer %d\n", err); + goto release_ownership; + } + + mlx5_core_warn(dev, "FWTracer: Ownership granted and active\n"); + return 0; + +release_ownership: + mlx5_fw_tracer_ownership_release(tracer); + return err; +} + +static void mlx5_fw_tracer_ownership_change(struct work_struct *work) +{ + struct mlx5_fw_tracer *tracer = + container_of(work, struct mlx5_fw_tracer, ownership_change_work); + + mlx5_core_dbg(tracer->dev, "FWTracer: ownership changed, current=(%d)\n", tracer->owner); + if (tracer->owner) { + tracer->owner = false; + tracer->buff.consumer_index = 0; + return; + } + + mlx5_fw_tracer_start(tracer); +} + +static int mlx5_fw_tracer_set_core_dump_reg(struct mlx5_core_dev *dev, + u32 *in, int size_in) +{ + u32 out[MLX5_ST_SZ_DW(core_dump_reg)] = {}; + + if (!MLX5_CAP_DEBUG(dev, core_dump_general) 
&& + !MLX5_CAP_DEBUG(dev, core_dump_qp)) + return -EOPNOTSUPP; + + return mlx5_core_access_reg(dev, in, size_in, out, sizeof(out), + MLX5_REG_CORE_DUMP, 0, 1); +} + +int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_tracer *tracer = dev->tracer; + u32 in[MLX5_ST_SZ_DW(core_dump_reg)] = {}; + int err; + + if (!MLX5_CAP_DEBUG(dev, core_dump_general) || !tracer) + return -EOPNOTSUPP; + if (!tracer->owner) + return -EPERM; + + MLX5_SET(core_dump_reg, in, core_dump_type, 0x0); + + err = mlx5_fw_tracer_set_core_dump_reg(dev, in, sizeof(in)); + if (err) + return err; + queue_work(tracer->work_queue, &tracer->handle_traces_work); + flush_workqueue(tracer->work_queue); + return 0; +} + +static int +mlx5_devlink_fmsg_fill_trace(struct devlink_fmsg *fmsg, + struct mlx5_fw_trace_data *trace_data) +{ + int err; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + err = devlink_fmsg_u64_pair_put(fmsg, "timestamp", trace_data->timestamp); + if (err) + return err; + + err = devlink_fmsg_bool_pair_put(fmsg, "lost", trace_data->lost); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "event_id", trace_data->event_id); + if (err) + return err; + + err = devlink_fmsg_string_pair_put(fmsg, "msg", trace_data->msg); + if (err) + return err; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + return 0; +} + +int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer, + struct devlink_fmsg *fmsg) +{ + struct mlx5_fw_trace_data *straces = tracer->st_arr.straces; + u32 index, start_index, end_index; + u32 saved_traces_index; + int err; + + if (!straces[0].timestamp) + return -ENOMSG; + + mutex_lock(&tracer->st_arr.lock); + saved_traces_index = tracer->st_arr.saved_traces_index; + if (straces[saved_traces_index].timestamp) + start_index = saved_traces_index; + else + start_index = 0; + end_index = (saved_traces_index - 1) & (SAVED_TRACES_NUM - 1); + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "dump fw traces"); + if (err) + goto unlock; + index = start_index; + while (index != end_index) { + err = mlx5_devlink_fmsg_fill_trace(fmsg, &straces[index]); + if (err) + goto unlock; + + index = (index + 1) & (SAVED_TRACES_NUM - 1); + } + + err = devlink_fmsg_arr_pair_nest_end(fmsg); +unlock: + mutex_unlock(&tracer->st_arr.lock); + return err; +} + +/* Create software resources (Buffers, etc ..) 
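Editor's note: the saved-traces array walked above is a power-of-two ring buffer, so the next index is computed with a mask rather than a modulo. A standalone sketch of that wrap-around, using the SAVED_TRACES_NUM value declared later in fw_tracer.h:

/* Standalone illustration of the ring-buffer index wrap used above. */
#include <stdio.h>

#define SAVED_TRACES_NUM 8192	/* must stay a power of two for the mask trick */

int main(void)
{
	unsigned int index = SAVED_TRACES_NUM - 1;	/* last slot */

	index = (index + 1) & (SAVED_TRACES_NUM - 1);	/* wraps back to 0 */
	printf("next index = %u\n", index);
	return 0;
}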
*/ +struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_tracer *tracer = NULL; + int err; + + if (!MLX5_CAP_MCAM_REG(dev, tracer_registers)) { + mlx5_core_dbg(dev, "FWTracer: Tracer capability not present\n"); + return NULL; + } + + tracer = kvzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + return ERR_PTR(-ENOMEM); + + tracer->work_queue = create_singlethread_workqueue("mlx5_fw_tracer"); + if (!tracer->work_queue) { + err = -ENOMEM; + goto free_tracer; + } + + tracer->dev = dev; + + INIT_LIST_HEAD(&tracer->ready_strings_list); + INIT_WORK(&tracer->ownership_change_work, mlx5_fw_tracer_ownership_change); + INIT_WORK(&tracer->read_fw_strings_work, mlx5_tracer_read_strings_db); + INIT_WORK(&tracer->handle_traces_work, mlx5_fw_tracer_handle_traces); + + + err = mlx5_query_mtrc_caps(tracer); + if (err) { + mlx5_core_dbg(dev, "FWTracer: Failed to query capabilities %d\n", err); + goto destroy_workqueue; + } + + err = mlx5_fw_tracer_create_log_buf(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Create log buffer failed %d\n", err); + goto destroy_workqueue; + } + + err = mlx5_fw_tracer_allocate_strings_db(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Allocate strings database failed %d\n", err); + goto free_log_buf; + } + + mlx5_fw_tracer_init_saved_traces_array(tracer); + mlx5_core_dbg(dev, "FWTracer: Tracer created\n"); + + return tracer; + +free_log_buf: + mlx5_fw_tracer_destroy_log_buf(tracer); +destroy_workqueue: + tracer->dev = NULL; + destroy_workqueue(tracer->work_queue); +free_tracer: + kvfree(tracer); + return ERR_PTR(err); +} + +static int fw_tracer_event(struct notifier_block *nb, unsigned long action, void *data); + +/* Create HW resources + start tracer */ +int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev; + int err; + + if (IS_ERR_OR_NULL(tracer)) + return 0; + + dev = tracer->dev; + + if (!tracer->str_db.loaded) + queue_work(tracer->work_queue, &tracer->read_fw_strings_work); + + err = mlx5_core_alloc_pd(dev, &tracer->buff.pdn); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to allocate PD %d\n", err); + goto err_cancel_work; + } + + err = mlx5_fw_tracer_create_mkey(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to create mkey %d\n", err); + goto err_dealloc_pd; + } + + MLX5_NB_INIT(&tracer->nb, fw_tracer_event, DEVICE_TRACER); + mlx5_eq_notifier_register(dev, &tracer->nb); + + err = mlx5_fw_tracer_start(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to start tracer %d\n", err); + goto err_notifier_unregister; + } + return 0; + +err_notifier_unregister: + mlx5_eq_notifier_unregister(dev, &tracer->nb); + mlx5_core_destroy_mkey(dev, tracer->buff.mkey); +err_dealloc_pd: + mlx5_core_dealloc_pd(dev, tracer->buff.pdn); +err_cancel_work: + cancel_work_sync(&tracer->read_fw_strings_work); + return err; +} + +/* Stop tracer + Cleanup HW resources */ +void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer) +{ + if (IS_ERR_OR_NULL(tracer)) + return; + + mlx5_core_dbg(tracer->dev, "FWTracer: Cleanup, is owner ? (%d)\n", + tracer->owner); + mlx5_eq_notifier_unregister(tracer->dev, &tracer->nb); + cancel_work_sync(&tracer->ownership_change_work); + cancel_work_sync(&tracer->handle_traces_work); + + if (tracer->owner) + mlx5_fw_tracer_ownership_release(tracer); + + mlx5_core_destroy_mkey(tracer->dev, tracer->buff.mkey); + mlx5_core_dealloc_pd(tracer->dev, tracer->buff.pdn); +} + +/* Free software resources (Buffers, etc ..) 
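Editor's note: the tracer API above splits software-resource and hardware-resource handling, as its function comments state. The sketch below shows the intended call order from a hypothetical driver load/unload path; example_tracer_lifecycle() is illustrative only, most return-value checks are elided, and mlx5_fw_tracer_destroy() is defined just below.

/* Hypothetical lifecycle sketch for the FW tracer API (error handling trimmed). */
static int example_tracer_lifecycle(struct mlx5_core_dev *dev)
{
	struct mlx5_fw_tracer *tracer;

	tracer = mlx5_fw_tracer_create(dev);	/* SW: buffers, workqueue, strings DB */
	if (IS_ERR_OR_NULL(tracer))
		return tracer ? PTR_ERR(tracer) : 0;	/* NULL: tracer not supported */

	mlx5_fw_tracer_init(tracer);		/* HW: PD, mkey, MTRC config, start */

	/* ... traces are handled from the DEVICE_TRACER event notifier ... */

	mlx5_fw_tracer_cleanup(tracer);		/* stop tracer, release HW resources */
	mlx5_fw_tracer_destroy(tracer);		/* free SW resources */
	return 0;
}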
*/ +void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer) +{ + if (IS_ERR_OR_NULL(tracer)) + return; + + mlx5_core_dbg(tracer->dev, "FWTracer: Destroy\n"); + + cancel_work_sync(&tracer->read_fw_strings_work); + mlx5_fw_tracer_clean_ready_list(tracer); + mlx5_fw_tracer_clean_print_hash(tracer); + mlx5_fw_tracer_clean_saved_traces_array(tracer); + mlx5_fw_tracer_free_strings_db(tracer); + mlx5_fw_tracer_destroy_log_buf(tracer); + destroy_workqueue(tracer->work_queue); + kvfree(tracer); +} + +static int mlx5_fw_tracer_recreate_strings_db(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev; + int err; + + cancel_work_sync(&tracer->read_fw_strings_work); + mlx5_fw_tracer_clean_ready_list(tracer); + mlx5_fw_tracer_clean_print_hash(tracer); + mlx5_fw_tracer_clean_saved_traces_array(tracer); + mlx5_fw_tracer_free_strings_db(tracer); + + dev = tracer->dev; + err = mlx5_query_mtrc_caps(tracer); + if (err) { + mlx5_core_dbg(dev, "FWTracer: Failed to query capabilities %d\n", err); + return err; + } + + err = mlx5_fw_tracer_allocate_strings_db(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Allocate strings DB failed %d\n", err); + return err; + } + mlx5_fw_tracer_init_saved_traces_array(tracer); + + return 0; +} + +int mlx5_fw_tracer_reload(struct mlx5_fw_tracer *tracer) +{ + struct mlx5_core_dev *dev; + int err; + + if (IS_ERR_OR_NULL(tracer)) + return 0; + + dev = tracer->dev; + mlx5_fw_tracer_cleanup(tracer); + err = mlx5_fw_tracer_recreate_strings_db(tracer); + if (err) { + mlx5_core_warn(dev, "Failed to recreate FW tracer strings DB\n"); + return err; + } + err = mlx5_fw_tracer_init(tracer); + if (err) { + mlx5_core_warn(dev, "Failed to re-initialize FW tracer\n"); + return err; + } + + return 0; +} + +static int fw_tracer_event(struct notifier_block *nb, unsigned long action, void *data) +{ + struct mlx5_fw_tracer *tracer = mlx5_nb_cof(nb, struct mlx5_fw_tracer, nb); + struct mlx5_core_dev *dev = tracer->dev; + struct mlx5_eqe *eqe = data; + + switch (eqe->sub_type) { + case MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE: + queue_work(tracer->work_queue, &tracer->ownership_change_work); + break; + case MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE: + queue_work(tracer->work_queue, &tracer->handle_traces_work); + break; + default: + mlx5_core_dbg(dev, "FWTracer: Event with unrecognized subtype: sub_type %d\n", + eqe->sub_type); + } + + return NOTIFY_OK; +} + +EXPORT_TRACEPOINT_SYMBOL(mlx5_fw); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h new file mode 100644 index 0000000..4762b55 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_TRACER_H__ +#define __LIB_TRACER_H__ + +#include +#include "mlx5_core.h" + +#define STRINGS_DB_SECTIONS_NUM 8 +#define STRINGS_DB_READ_SIZE_BYTES 256 +#define STRINGS_DB_LEFTOVER_SIZE_BYTES 64 +#define TRACER_BUFFER_PAGE_NUM 64 +#define TRACER_BUFFER_CHUNK 4096 +#define TRACE_BUFFER_SIZE_BYTE (TRACER_BUFFER_PAGE_NUM * TRACER_BUFFER_CHUNK) + +#define TRACER_BLOCK_SIZE_BYTE 256 +#define TRACES_PER_BLOCK 32 + +#define TRACE_STR_MSG 256 +#define SAVED_TRACES_NUM 8192 + +#define TRACER_MAX_PARAMS 7 +#define MESSAGE_HASH_BITS 6 +#define MESSAGE_HASH_SIZE BIT(MESSAGE_HASH_BITS) + +#define MASK_52_7 (0x1FFFFFFFFFFF80) +#define MASK_6_0 (0x7F) + +struct mlx5_fw_trace_data { + u64 timestamp; + bool lost; + u8 event_id; + char msg[TRACE_STR_MSG]; +}; + +struct mlx5_fw_tracer { + struct mlx5_core_dev *dev; + struct mlx5_nb nb; + bool owner; + u8 trc_ver; + struct workqueue_struct *work_queue; + struct work_struct ownership_change_work; + struct work_struct read_fw_strings_work; + + /* Strings DB */ + struct { + u8 first_string_trace; + u8 num_string_trace; + u32 num_string_db; + u32 base_address_out[STRINGS_DB_SECTIONS_NUM]; + u32 size_out[STRINGS_DB_SECTIONS_NUM]; + void *buffer[STRINGS_DB_SECTIONS_NUM]; + bool loaded; + } str_db; + + /* Log Buffer */ + struct { + u32 pdn; + void *log_buf; + dma_addr_t dma; + u32 size; + u32 mkey; + u32 consumer_index; + } buff; + + /* Saved Traces Array */ + struct { + struct mlx5_fw_trace_data straces[SAVED_TRACES_NUM]; + u32 saved_traces_index; + struct mutex lock; /* Protect st_arr access */ + } st_arr; + + u64 last_timestamp; + struct work_struct handle_traces_work; + struct hlist_head hash[MESSAGE_HASH_SIZE]; + struct list_head ready_strings_list; +}; + +struct tracer_string_format { + char *string; + int params[TRACER_MAX_PARAMS]; + int num_of_params; + int last_param_num; + u8 event_id; + u32 tmsn; + struct hlist_node hlist; + struct list_head list; + u32 timestamp; + bool lost; +}; + +enum mlx5_fw_tracer_ownership_state { + MLX5_FW_TRACER_RELEASE_OWNERSHIP, + MLX5_FW_TRACER_ACQUIRE_OWNERSHIP, +}; + +enum tracer_ctrl_fields_select { + TRACE_STATUS = 1 << 0, +}; + +enum tracer_event_type { + TRACER_EVENT_TYPE_STRING, + TRACER_EVENT_TYPE_TIMESTAMP = 0xFF, + TRACER_EVENT_TYPE_UNRECOGNIZED, +}; + +enum tracing_mode { + TRACE_TO_MEMORY = 1 << 0, +}; + +struct tracer_timestamp_event { + u64 timestamp; + u8 unreliable; +}; + +struct tracer_string_event { + u32 timestamp; + u32 tmsn; + u32 tdsn; + u32 string_param; +}; + +struct tracer_event { + bool lost_event; + u32 type; + u8 event_id; + union { + struct tracer_string_event string_event; + struct tracer_timestamp_event timestamp_event; + }; +}; + +struct mlx5_ifc_tracer_event_bits { + u8 lost[0x1]; + u8 timestamp[0x7]; + u8 event_id[0x8]; + u8 event_data[0x30]; +}; + +struct 
mlx5_ifc_tracer_string_event_bits { + u8 lost[0x1]; + u8 timestamp[0x7]; + u8 event_id[0x8]; + u8 tmsn[0xd]; + u8 tdsn[0x3]; + u8 string_param[0x20]; +}; + +struct mlx5_ifc_tracer_timestamp_event_bits { + u8 timestamp7_0[0x8]; + u8 event_id[0x8]; + u8 urts[0x3]; + u8 timestamp52_40[0xd]; + u8 timestamp39_8[0x20]; +}; + +struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev); +int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer); +void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer); +void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer); +int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev); +int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer, + struct devlink_fmsg *fmsg); +int mlx5_fw_tracer_reload(struct mlx5_fw_tracer *tracer); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h new file mode 100644 index 0000000..3038be5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(__LIB_TRACER_TRACEPOINT_H__) || defined(TRACE_HEADER_MULTI_READ) +#define __LIB_TRACER_TRACEPOINT_H__ + +#include +#include "fw_tracer.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +/* Tracepoint for FWTracer messages: */ +TRACE_EVENT(mlx5_fw, + TP_PROTO(const struct mlx5_fw_tracer *tracer, u64 trace_timestamp, + bool lost, u8 event_id, const char *msg), + + TP_ARGS(tracer, trace_timestamp, lost, event_id, msg), + + TP_STRUCT__entry( + __string(dev_name, dev_name(tracer->dev->device)) + __field(u64, trace_timestamp) + __field(bool, lost) + __field(u8, event_id) + __string(msg, msg) + ), + + TP_fast_assign( + __assign_str(dev_name, + dev_name(tracer->dev->device)); + __entry->trace_timestamp = trace_timestamp; + __entry->lost = lost; + __entry->event_id = event_id; + __assign_str(msg, msg); + ), + + TP_printk("%s [0x%llx] %d [0x%x] %s", + __get_str(dev_name), + __entry->trace_timestamp, + __entry->lost, __entry->event_id, + __get_str(msg)) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH ./diag +#define TRACE_INCLUDE_FILE fw_tracer_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c new file mode 100644 index 0000000..c5b560a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "rsc_dump.h" +#include "lib/mlx5.h" + +#define MLX5_SGMT_TYPE(SGMT) MLX5_SGMT_TYPE_##SGMT +#define MLX5_SGMT_STR_ASSING(SGMT)[MLX5_SGMT_TYPE(SGMT)] = #SGMT +static const char *const mlx5_rsc_sgmt_name[] = { + MLX5_SGMT_STR_ASSING(HW_CQPC), + MLX5_SGMT_STR_ASSING(HW_SQPC), + MLX5_SGMT_STR_ASSING(HW_RQPC), + MLX5_SGMT_STR_ASSING(FULL_SRQC), + MLX5_SGMT_STR_ASSING(FULL_CQC), + MLX5_SGMT_STR_ASSING(FULL_EQC), + MLX5_SGMT_STR_ASSING(FULL_QPC), + MLX5_SGMT_STR_ASSING(SND_BUFF), + MLX5_SGMT_STR_ASSING(RCV_BUFF), + MLX5_SGMT_STR_ASSING(SRQ_BUFF), + MLX5_SGMT_STR_ASSING(CQ_BUFF), + MLX5_SGMT_STR_ASSING(EQ_BUFF), + MLX5_SGMT_STR_ASSING(SX_SLICE), + MLX5_SGMT_STR_ASSING(SX_SLICE_ALL), + MLX5_SGMT_STR_ASSING(RDB), + MLX5_SGMT_STR_ASSING(RX_SLICE_ALL), + MLX5_SGMT_STR_ASSING(PRM_QUERY_QP), + MLX5_SGMT_STR_ASSING(PRM_QUERY_CQ), + MLX5_SGMT_STR_ASSING(PRM_QUERY_MKEY), +}; + +struct mlx5_rsc_dump { + u32 pdn; + u32 mkey; + u32 number_of_menu_items; + u16 fw_segment_type[MLX5_SGMT_TYPE_NUM]; +}; + +struct mlx5_rsc_dump_cmd { + u64 mem_size; + u8 cmd[MLX5_ST_SZ_BYTES(resource_dump)]; +}; + +static int mlx5_rsc_dump_sgmt_get_by_name(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlx5_rsc_sgmt_name); i++) + if (!strcmp(name, mlx5_rsc_sgmt_name[i])) + return i; + + return -EINVAL; +} + +#define MLX5_RSC_DUMP_MENU_HEADER_SIZE (MLX5_ST_SZ_BYTES(resource_dump_info_segment) + \ + MLX5_ST_SZ_BYTES(resource_dump_command_segment) + \ + MLX5_ST_SZ_BYTES(resource_dump_menu_segment)) + +static int mlx5_rsc_dump_read_menu_sgmt(struct mlx5_rsc_dump *rsc_dump, struct page *page, + int read_size, int start_idx) +{ + void *data = page_address(page); + enum mlx5_sgmt_type sgmt_idx; + int num_of_items; + char *sgmt_name; + void *member; + int size = 0; + void *menu; + int i; + + if (!start_idx) { + menu = MLX5_ADDR_OF(menu_resource_dump_response, data, menu); + rsc_dump->number_of_menu_items = MLX5_GET(resource_dump_menu_segment, menu, + 
num_of_records); + size = MLX5_RSC_DUMP_MENU_HEADER_SIZE; + data += size; + } + num_of_items = rsc_dump->number_of_menu_items; + + for (i = 0; start_idx + i < num_of_items; i++) { + size += MLX5_ST_SZ_BYTES(resource_dump_menu_record); + if (size >= read_size) + return start_idx + i; + + member = data + MLX5_ST_SZ_BYTES(resource_dump_menu_record) * i; + sgmt_name = MLX5_ADDR_OF(resource_dump_menu_record, member, segment_name); + sgmt_idx = mlx5_rsc_dump_sgmt_get_by_name(sgmt_name); + if (sgmt_idx == -EINVAL) + continue; + rsc_dump->fw_segment_type[sgmt_idx] = MLX5_GET(resource_dump_menu_record, + member, segment_type); + } + return 0; +} + +static int mlx5_rsc_dump_trigger(struct mlx5_core_dev *dev, struct mlx5_rsc_dump_cmd *cmd, + struct page *page) +{ + struct mlx5_rsc_dump *rsc_dump = dev->rsc_dump; + struct device *ddev = mlx5_core_dma_dev(dev); + u32 out_seq_num; + u32 in_seq_num; + dma_addr_t dma; + int err; + + dma = dma_map_page(ddev, page, 0, cmd->mem_size, DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(ddev, dma))) + return -ENOMEM; + + in_seq_num = MLX5_GET(resource_dump, cmd->cmd, seq_num); + MLX5_SET(resource_dump, cmd->cmd, mkey, rsc_dump->mkey); + MLX5_SET64(resource_dump, cmd->cmd, address, dma); + + err = mlx5_core_access_reg(dev, cmd->cmd, sizeof(cmd->cmd), cmd->cmd, + sizeof(cmd->cmd), MLX5_REG_RESOURCE_DUMP, 0, 1); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to access err %d\n", err); + goto out; + } + out_seq_num = MLX5_GET(resource_dump, cmd->cmd, seq_num); + if (out_seq_num && (in_seq_num + 1 != out_seq_num)) + err = -EIO; +out: + dma_unmap_page(ddev, dma, cmd->mem_size, DMA_FROM_DEVICE); + return err; +} + +struct mlx5_rsc_dump_cmd *mlx5_rsc_dump_cmd_create(struct mlx5_core_dev *dev, + struct mlx5_rsc_key *key) +{ + struct mlx5_rsc_dump_cmd *cmd; + int sgmt_type; + + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return ERR_PTR(-EOPNOTSUPP); + + sgmt_type = dev->rsc_dump->fw_segment_type[key->rsc]; + if (!sgmt_type && key->rsc != MLX5_SGMT_TYPE_MENU) + return ERR_PTR(-EOPNOTSUPP); + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) { + mlx5_core_err(dev, "Resource dump: Failed to allocate command\n"); + return ERR_PTR(-ENOMEM); + } + MLX5_SET(resource_dump, cmd->cmd, segment_type, sgmt_type); + MLX5_SET(resource_dump, cmd->cmd, index1, key->index1); + MLX5_SET(resource_dump, cmd->cmd, index2, key->index2); + MLX5_SET(resource_dump, cmd->cmd, num_of_obj1, key->num_of_obj1); + MLX5_SET(resource_dump, cmd->cmd, num_of_obj2, key->num_of_obj2); + MLX5_SET(resource_dump, cmd->cmd, size, key->size); + cmd->mem_size = key->size; + return cmd; +} +EXPORT_SYMBOL(mlx5_rsc_dump_cmd_create); + +void mlx5_rsc_dump_cmd_destroy(struct mlx5_rsc_dump_cmd *cmd) +{ + kfree(cmd); +} +EXPORT_SYMBOL(mlx5_rsc_dump_cmd_destroy); + +int mlx5_rsc_dump_next(struct mlx5_core_dev *dev, struct mlx5_rsc_dump_cmd *cmd, + struct page *page, int *size) +{ + bool more_dump; + int err; + + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return -EOPNOTSUPP; + + err = mlx5_rsc_dump_trigger(dev, cmd, page); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to trigger dump, %d\n", err); + return err; + } + *size = MLX5_GET(resource_dump, cmd->cmd, size); + more_dump = MLX5_GET(resource_dump, cmd->cmd, more_dump); + + return more_dump; +} +EXPORT_SYMBOL(mlx5_rsc_dump_next); + +#define MLX5_RSC_DUMP_MENU_SEGMENT 0xffff +static int mlx5_rsc_dump_menu(struct mlx5_core_dev *dev) +{ + struct mlx5_rsc_dump_cmd *cmd = NULL; + struct mlx5_rsc_key key = {}; + struct page *page; + int start_idx = 0; + int 
size; + int err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + key.rsc = MLX5_SGMT_TYPE_MENU; + key.size = PAGE_SIZE; + cmd = mlx5_rsc_dump_cmd_create(dev, &key); + if (IS_ERR(cmd)) { + err = PTR_ERR(cmd); + goto free_page; + } + MLX5_SET(resource_dump, cmd->cmd, segment_type, MLX5_RSC_DUMP_MENU_SEGMENT); + + do { + err = mlx5_rsc_dump_next(dev, cmd, page, &size); + if (err < 0) + goto destroy_cmd; + + start_idx = mlx5_rsc_dump_read_menu_sgmt(dev->rsc_dump, page, size, start_idx); + + } while (err > 0); + +destroy_cmd: + mlx5_rsc_dump_cmd_destroy(cmd); +free_page: + __free_page(page); + + return err; +} + +static int mlx5_rsc_dump_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + u32 *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + + kvfree(in); + return err; +} + +struct mlx5_rsc_dump *mlx5_rsc_dump_create(struct mlx5_core_dev *dev) +{ + struct mlx5_rsc_dump *rsc_dump; + + if (!MLX5_CAP_DEBUG(dev, resource_dump)) { + mlx5_core_dbg(dev, "Resource dump: capability not present\n"); + return NULL; + } + rsc_dump = kzalloc(sizeof(*rsc_dump), GFP_KERNEL); + if (!rsc_dump) + return ERR_PTR(-ENOMEM); + + return rsc_dump; +} + +void mlx5_rsc_dump_destroy(struct mlx5_core_dev *dev) +{ + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return; + kfree(dev->rsc_dump); +} + +int mlx5_rsc_dump_init(struct mlx5_core_dev *dev) +{ + struct mlx5_rsc_dump *rsc_dump = dev->rsc_dump; + int err; + + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return 0; + + err = mlx5_core_alloc_pd(dev, &rsc_dump->pdn); + if (err) { + mlx5_core_warn(dev, "Resource dump: Failed to allocate PD %d\n", err); + return err; + } + err = mlx5_rsc_dump_create_mkey(dev, rsc_dump->pdn, &rsc_dump->mkey); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to create mkey, %d\n", err); + goto free_pd; + } + err = mlx5_rsc_dump_menu(dev); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to read menu, %d\n", err); + goto destroy_mkey; + } + return err; + +destroy_mkey: + mlx5_core_destroy_mkey(dev, rsc_dump->mkey); +free_pd: + mlx5_core_dealloc_pd(dev, rsc_dump->pdn); + return err; +} + +void mlx5_rsc_dump_cleanup(struct mlx5_core_dev *dev) +{ + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return; + + mlx5_core_destroy_mkey(dev, dev->rsc_dump->mkey); + mlx5_core_dealloc_pd(dev, dev->rsc_dump->pdn); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h new file mode 100644 index 0000000..64c4956 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __MLX5_RSC_DUMP_H +#define __MLX5_RSC_DUMP_H + +#include +#include +#include "mlx5_core.h" + +#define MLX5_RSC_DUMP_ALL 0xFFFF +struct mlx5_rsc_dump_cmd; +struct mlx5_rsc_dump; + +struct mlx5_rsc_dump *mlx5_rsc_dump_create(struct mlx5_core_dev *dev); +void mlx5_rsc_dump_destroy(struct mlx5_core_dev *dev); + +int mlx5_rsc_dump_init(struct mlx5_core_dev *dev); +void mlx5_rsc_dump_cleanup(struct mlx5_core_dev *dev); + +struct mlx5_rsc_dump_cmd *mlx5_rsc_dump_cmd_create(struct mlx5_core_dev *dev, + struct mlx5_rsc_key *key); +void mlx5_rsc_dump_cmd_destroy(struct mlx5_rsc_dump_cmd *cmd); + +int mlx5_rsc_dump_next(struct mlx5_core_dev *dev, struct mlx5_rsc_dump_cmd *cmd, + struct page *page, int *size); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c new file mode 100644 index 0000000..6fac186 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.c @@ -0,0 +1,741 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "ecpf.h" +#include +#include "mlx5_core.h" +#include "eswitch.h" +#include "en.h" + +bool mlx5_read_embedded_cpu(struct mlx5_core_dev *dev) +{ + return (ioread32be(&dev->iseg->initializing) >> MLX5_ECPU_BIT_NUM) & 1; +} + +static bool mlx5_ecpf_esw_admins_host_pf(const struct mlx5_core_dev *dev) +{ + /* In separate host mode, PF enables itself. + * When ECPF is eswitch manager, eswitch enables host PF after + * eswitch is setup. + */ + return mlx5_core_is_ecpf_esw_manager(dev); +} + +int mlx5_cmd_host_pf_enable_hca(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; + + MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); + MLX5_SET(enable_hca_in, in, function_id, 0); + MLX5_SET(enable_hca_in, in, embedded_cpu_function, 0); + return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); +} + +int mlx5_cmd_host_pf_disable_hca(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; + + MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); + MLX5_SET(disable_hca_in, in, function_id, 0); + MLX5_SET(disable_hca_in, in, embedded_cpu_function, 0); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_host_pf_init(struct mlx5_core_dev *dev) +{ + int err; + + if (mlx5_ecpf_esw_admins_host_pf(dev)) + return 0; + + /* ECPF shall enable HCA for host PF in the same way a PF + * does this for its VFs when ECPF is not a eswitch manager. 
+ */ + err = mlx5_cmd_host_pf_enable_hca(dev); + if (err) + mlx5_core_err(dev, "Failed to enable external host PF HCA err(%d)\n", err); + + return err; +} + +static void mlx5_host_pf_cleanup(struct mlx5_core_dev *dev) +{ + int err; + + if (mlx5_ecpf_esw_admins_host_pf(dev)) + return; + + err = mlx5_cmd_host_pf_disable_hca(dev); + if (err) { + mlx5_core_err(dev, "Failed to disable external host PF HCA err(%d)\n", err); + return; + } +} + +int mlx5_ec_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_ecpf(dev)) + return 0; + + /* Management PF don't have a peer PF */ + if (mlx5_core_is_management_pf(dev)) + return 0; + + return mlx5_host_pf_init(dev); +} + +void mlx5_ec_cleanup(struct mlx5_core_dev *dev) +{ + int err; + + if (!mlx5_core_is_ecpf(dev)) + return; + + /* Management PF don't have a peer PF */ + if (mlx5_core_is_management_pf(dev)) + return; + + mlx5_host_pf_cleanup(dev); + + err = mlx5_wait_for_pages(dev, &dev->priv.host_pf_pages); + if (err) + mlx5_core_warn(dev, "Timeout reclaiming external host PF pages err(%d)\n", err); +} + +static int mlx5_regex_enable(struct mlx5_core_dev *dev, int vport, bool en) +{ + u32 out_set[MLX5_ST_SZ_BYTES(set_hca_cap_out)] = {}; + u32 in_query[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; + void *set_hca_cap, *query_hca_cap; + u32 *out_query, *in_set; + int err = 0; + + out_query = kzalloc(MLX5_ST_SZ_BYTES(query_hca_cap_out), GFP_KERNEL); + if (!out_query) + return -ENOMEM; + + in_set = kzalloc(MLX5_ST_SZ_BYTES(set_hca_cap_in), GFP_KERNEL); + if (!in_set) { + kfree(out_query); + return -ENOMEM; + } + + MLX5_SET(query_hca_cap_in, in_query, opcode, + MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in_query, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_CUR); + MLX5_SET(query_hca_cap_in, in_query, other_function, 1); + MLX5_SET(query_hca_cap_in, in_query, function_id, vport); + + err = mlx5_cmd_exec(dev, in_query, MLX5_ST_SZ_BYTES(query_hca_cap_in), + out_query, MLX5_ST_SZ_BYTES(query_hca_cap_out)); + if (err) + goto out; + + query_hca_cap = MLX5_ADDR_OF(query_hca_cap_out, out_query, capability); + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, in_set, capability); + memcpy(set_hca_cap, query_hca_cap, MLX5_ST_SZ_BYTES(cmd_hca_cap)); + + MLX5_SET(set_hca_cap_in, in_set, opcode, + MLX5_CMD_OP_SET_HCA_CAP); + MLX5_SET(set_hca_cap_in, in_set, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_MAX); + MLX5_SET(set_hca_cap_in, in_set, other_function, 1); + MLX5_SET(set_hca_cap_in, in_set, function_id, vport); + MLX5_SET(set_hca_cap_in, in_set, + capability.cmd_hca_cap.regexp_mmo_qp, en); + if (en) { + MLX5_SET(set_hca_cap_in, in_set, + capability.cmd_hca_cap.regexp_num_of_engines, + MLX5_CAP_GEN_MAX(dev, regexp_num_of_engines)); + MLX5_SET(set_hca_cap_in, in_set, + capability.cmd_hca_cap.regexp_params, + MLX5_CAP_GEN_MAX(dev, regexp_params)); + } + err = mlx5_cmd_exec(dev, in_set, MLX5_ST_SZ_BYTES(set_hca_cap_in), + out_set, MLX5_ST_SZ_BYTES(set_hca_cap_out)); + if (err) + goto out; + +out: + kfree(out_query); + kfree(in_set); + return err; +} + +static ssize_t max_tx_rate_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_smart_nic_vport *tmp = + container_of(kobj, struct mlx5_smart_nic_vport, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, tmp->vport); + u32 max_tx_rate; + u32 min_tx_rate; + int err; + + mutex_lock(&esw->state_lock); + min_tx_rate = evport->qos.min_rate; + 
mutex_unlock(&esw->state_lock); + + err = kstrtou32(buf, 0, &max_tx_rate); + if (err) + return err; + + if (max_tx_rate && max_tx_rate <= min_tx_rate) + return -EINVAL; + + err = mlx5_eswitch_set_vport_rate(esw, tmp->vport, + max_tx_rate, min_tx_rate); + + return err ? err : count; +} + +static ssize_t max_tx_rate_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, + "usage: write to set max transmit rate\n"); +} + +static ssize_t min_tx_rate_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_smart_nic_vport *tmp = + container_of(kobj, struct mlx5_smart_nic_vport, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, tmp->vport); + u32 max_tx_rate; + u32 min_tx_rate; + int err; + + mutex_lock(&esw->state_lock); + max_tx_rate = evport->qos.max_rate; + mutex_unlock(&esw->state_lock); + + err = kstrtou32(buf, 0, &min_tx_rate); + if (err) + return err; + + if (max_tx_rate && max_tx_rate <= min_tx_rate) + return -EINVAL; + + err = mlx5_eswitch_set_vport_rate(esw, tmp->vport, + max_tx_rate, min_tx_rate); + + return err ? err : count; +} + +static ssize_t min_tx_rate_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, + "usage: write to set min transmit rate\n"); +} + +static ssize_t mac_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_smart_nic_vport *tmp = + container_of(kobj, struct mlx5_smart_nic_vport, kobj); + struct mlx5_eswitch *esw = tmp->esw; + u8 mac[ETH_ALEN]; + int err; + + err = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]); + if (err == 6) + goto set_mac; + + if (sysfs_streq(buf, "Random")) + eth_random_addr(mac); + else + return -EINVAL; + +set_mac: + err = mlx5_eswitch_set_vport_mac(esw, tmp->vport, mac); + return err ? err : count; +} + +static ssize_t mac_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, + "usage: write to set Mac Address\n"); +} + +static ssize_t regex_en_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_smart_nic_vport *tmp = + container_of(kobj, struct mlx5_smart_nic_vport, kobj); + struct mlx5_eswitch *esw = tmp->esw; + int err; + + if (!MLX5_CAP_GEN_MAX(esw->dev, regexp_mmo_qp)) + return -EOPNOTSUPP; + if (sysfs_streq(buf, "1")) + err = mlx5_regex_enable(esw->dev, tmp->vport, 1); + else if (sysfs_streq(buf, "0")) + err = mlx5_regex_enable(esw->dev, tmp->vport, 0); + else + err = -EINVAL; + + return err ? 
err : count; +} + +static ssize_t regex_en_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "Usage: write 1/0 to enable/disable regex\n"); +} + +static int strpolicy(const char *buf, enum port_state_policy *policy) +{ + if (sysfs_streq(buf, "Down")) { + *policy = MLX5_POLICY_DOWN; + return 0; + } + + if (sysfs_streq(buf, "Up")) { + *policy = MLX5_POLICY_UP; + return 0; + } + + if (sysfs_streq(buf, "Follow")) { + *policy = MLX5_POLICY_FOLLOW; + return 0; + } + return -EINVAL; +} + +static ssize_t vport_state_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_smart_nic_vport *tmp = + container_of(kobj, struct mlx5_smart_nic_vport, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, tmp->vport); + int opmod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + enum port_state_policy policy; + int err; + + err = strpolicy(buf, &policy); + if (err) + return err; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + if (IS_ERR(evport)) + return PTR_ERR(evport); + + mutex_lock(&esw->state_lock); + + err = mlx5_modify_vport_admin_state(esw->dev, opmod, + tmp->vport, 1, policy); + if (err) { + mlx5_core_warn(esw->dev, "Failed to set vport %d link state, opmod = %d, err = %d", + tmp->vport, opmod, err); + goto unlock; + } + + evport->info.link_state = policy; + +unlock: + mutex_unlock(&esw->state_lock); + return err ? err : count; +} + +static ssize_t vport_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "usage: write to set VF State\n"); +} + +static const char *policy_str(enum port_state_policy policy) +{ + switch (policy) { + case MLX5_POLICY_DOWN: return "Down\n"; + case MLX5_POLICY_UP: return "Up\n"; + case MLX5_POLICY_FOLLOW: return "Follow\n"; + default: return "Invalid policy\n"; + } +} + +#define _sprintf(p, buf, format, arg...) \ + ((PAGE_SIZE - (int)(p - buf)) <= 0 ? 
0 : \ + scnprintf(p, PAGE_SIZE - (int)(p - buf), format, ## arg)) + +static u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, + u8 opmod, + u16 vport, u8 other_vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + MLX5_SET(query_vport_state_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return 0; + + return MLX5_GET(query_vport_state_out, out, admin_state); +} + +static ssize_t config_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5_smart_nic_vport *tmp = + container_of(kobj, struct mlx5_smart_nic_vport, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, tmp->vport); + int opmod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + struct mlx5_vport_info *ivi; + int other_vport = 1; + char *p = buf; + u8 port_state; + + mutex_lock(&esw->state_lock); + ivi = &evport->info; + p += _sprintf(p, buf, "MAC : %pM\n", ivi->mac); + p += _sprintf(p, buf, "MaxTxRate : %d\n", evport->qos.max_rate); + p += _sprintf(p, buf, "MinTxRate : %d\n", evport->qos.min_rate); + port_state = mlx5_query_vport_admin_state(esw->dev, opmod, + tmp->vport, other_vport); + p += _sprintf(p, buf, "State : %s\n", policy_str(port_state)); + mutex_unlock(&esw->state_lock); + + return (ssize_t)(p - buf); +} + +static ssize_t smart_nic_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + return ret; +} + +static ssize_t smart_nic_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + return ret; +} + +static struct kobj_attribute attr_max_tx_rate = { + .attr = {.name = "max_tx_rate", + .mode = 0644 }, + .show = max_tx_rate_show, + .store = max_tx_rate_store, +}; + +static struct kobj_attribute attr_min_tx_rate = { + .attr = {.name = "min_tx_rate", + .mode = 0644 }, + .show = min_tx_rate_show, + .store = min_tx_rate_store, +}; + +static struct kobj_attribute attr_mac = { + .attr = {.name = "mac", + .mode = 0644 }, + .show = mac_show, + .store = mac_store, +}; + +static struct kobj_attribute attr_vport_state = { + .attr = {.name = "vport_state", + .mode = 0644 }, + .show = vport_state_show, + .store = vport_state_store, +}; + +static struct kobj_attribute attr_regex_en = { + .attr = {.name = "regex_en", + .mode = 0644 }, + .show = regex_en_show, + .store = regex_en_store, +}; + +static struct kobj_attribute attr_config = { + .attr = {.name = "config", + .mode = 0444 }, + .show = config_show, +}; + +static struct attribute *smart_nic_attrs[] = { + &attr_config.attr, + &attr_max_tx_rate.attr, + &attr_min_tx_rate.attr, + &attr_mac.attr, + &attr_vport_state.attr, + &attr_regex_en.attr, + NULL, +}; + +static const struct sysfs_ops smart_nic_sysfs_ops = { + .show = smart_nic_attr_show, + .store = smart_nic_attr_store +}; + +static struct kobj_type smart_nic_type = { + .sysfs_ops = &smart_nic_sysfs_ops, + 
.default_attrs = smart_nic_attrs +}; + +void mlx5_smartnic_sysfs_init(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_smart_nic_vport *tmp; + struct mlx5_eswitch *esw; + int num_vports; + int err; + int i; + + if (!mlx5_core_is_ecpf(mdev)) + return; + + esw = mdev->priv.eswitch; + esw->smart_nic_sysfs.kobj = + kobject_create_and_add("smart_nic", &dev->dev.kobj); + if (!esw->smart_nic_sysfs.kobj) + return; + + num_vports = mlx5_core_max_vfs(mdev) + 1; + esw->smart_nic_sysfs.vport = + kcalloc(num_vports, sizeof(struct mlx5_smart_nic_vport), + GFP_KERNEL); + if (!esw->smart_nic_sysfs.vport) + goto err_attr_mem; + + for (i = 0; i < num_vports; i++) { + tmp = &esw->smart_nic_sysfs.vport[i]; + tmp->esw = esw; + tmp->vport = i; + if (i == 0) + err = kobject_init_and_add(&tmp->kobj, &smart_nic_type, + esw->smart_nic_sysfs.kobj, + "pf"); + else + err = kobject_init_and_add(&tmp->kobj, &smart_nic_type, + esw->smart_nic_sysfs.kobj, + "vf%d", i - 1); + if (err) + goto err_attr; + } + + return; + +err_attr: + for (; i >= 0; i--) { + kobject_put(&esw->smart_nic_sysfs.vport[i].kobj); + esw->smart_nic_sysfs.vport[i].esw = NULL; + } + kfree(esw->smart_nic_sysfs.vport); + esw->smart_nic_sysfs.vport = NULL; + +err_attr_mem: + kobject_put(esw->smart_nic_sysfs.kobj); + esw->smart_nic_sysfs.kobj = NULL; +} + +void mlx5_smartnic_sysfs_cleanup(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_smart_nic_vport *tmp; + struct mlx5_eswitch *esw; + int i; + + if (!mlx5_core_is_ecpf(mdev)) + return; + + esw = mdev->priv.eswitch; + + if (!esw->smart_nic_sysfs.kobj || !esw->smart_nic_sysfs.vport) + return; + + for (i = 0; i < mlx5_core_max_vfs(mdev) + 1; i++) { + tmp = &esw->smart_nic_sysfs.vport[i]; + if (!tmp->esw) + continue; + kobject_put(&tmp->kobj); + } + + kfree(esw->smart_nic_sysfs.vport); + esw->smart_nic_sysfs.vport = NULL; + + kobject_put(esw->smart_nic_sysfs.kobj); + esw->smart_nic_sysfs.kobj = NULL; +} + +static ssize_t regex_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5_regex_vport *regex = + container_of(kobj, struct mlx5_regex_vport, kobj); + int err; + + if (!MLX5_CAP_GEN_MAX(regex->dev, regexp_mmo_qp)) + return -EOPNOTSUPP; + if (sysfs_streq(buf, "1")) + err = mlx5_regex_enable(regex->dev, regex->vport, 1); + else if (sysfs_streq(buf, "0")) + err = mlx5_regex_enable(regex->dev, regex->vport, 0); + else + err = -EINVAL; + + return err ? 
err : count; +} + +static ssize_t regex_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "Usage: write 1/0 to enable/disable regex\n"); +} + +static struct kobj_attribute attr_regex = { + .attr = {.name = "regex_en", + .mode = 0644 }, + .show = regex_show, + .store = regex_store, +}; + +static struct attribute *regex_attrs[] = { + &attr_regex.attr, + NULL, +}; + +static ssize_t regex_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return smart_nic_attr_show(kobj, attr, buf); +} + +static ssize_t regex_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + return smart_nic_attr_store(kobj, attr, buf, count); +} + +static const struct sysfs_ops regex_sysfs_ops = { + .show = regex_attr_show, + .store = regex_attr_store +}; + +static struct kobj_type regex_type = { + .sysfs_ops = ®ex_sysfs_ops, + .default_attrs = regex_attrs +}; + +int mlx5_regex_sysfs_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_regex *regex = &dev->priv.regex; + struct device *device = &dev->pdev->dev; + struct mlx5_regex_vport *vport; + u16 num_vports; + int i, ret = 0; + + if (!mlx5_core_is_ecpf(dev)) + return 0; + + regex->kobj = kobject_create_and_add("regex", &device->kobj); + if (!regex->kobj) + return -ENOMEM; + + num_vports = mlx5_core_max_vfs(dev) + 1; + regex->vport = kcalloc(num_vports, sizeof(struct mlx5_regex_vport), + GFP_KERNEL); + if (!regex->vport) { + ret = -ENOMEM; + goto err_vport; + } + + for (i = 0; i < num_vports; i++) { + vport = ®ex->vport[i]; + vport->dev = dev; + vport->vport = i; + if (i == 0) + ret = kobject_init_and_add(&vport->kobj, ®ex_type, + regex->kobj, "pf"); + else + ret = kobject_init_and_add(&vport->kobj, ®ex_type, + regex->kobj, "vf%d", + i - 1); + if (ret) + goto err_attr; + } + + return 0; + +err_attr: + for (--i; i >= 0; i--) + kobject_put(®ex->vport[i].kobj); + kfree(regex->vport); + regex->vport = NULL; +err_vport: + kobject_put(regex->kobj); + regex->kobj = NULL; + return ret; +} + +void mlx5_regex_sysfs_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_core_regex *regex = &dev->priv.regex; + struct mlx5_regex_vport *vport; + u16 num_vports, i; + + if (!mlx5_core_is_ecpf(dev)) + return; + + num_vports = mlx5_core_max_vfs(dev) + 1; + + for (i = 0; i < num_vports; i++) { + vport = ®ex->vport[i]; + kobject_put(&vport->kobj); + } + + kfree(regex->vport); + regex->vport = NULL; + + kobject_put(regex->kobj); + regex->kobj = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h new file mode 100644 index 0000000..3296d78 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ecpf.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __MLX5_ECPF_H__ +#define __MLX5_ECPF_H__ + +#include +#include "mlx5_core.h" + +enum { + MLX5_ECPU_BIT_NUM = 23, +}; + +bool mlx5_read_embedded_cpu(struct mlx5_core_dev *dev); +int mlx5_ec_init(struct mlx5_core_dev *dev); +void mlx5_ec_cleanup(struct mlx5_core_dev *dev); + +int mlx5_cmd_host_pf_enable_hca(struct mlx5_core_dev *dev); +int mlx5_cmd_host_pf_disable_hca(struct mlx5_core_dev *dev); +void mlx5_smartnic_sysfs_init(struct net_device *dev); +void mlx5_smartnic_sysfs_cleanup(struct net_device *dev); + +int mlx5_regex_sysfs_init(struct mlx5_core_dev *dev); +void mlx5_regex_sysfs_cleanup(struct mlx5_core_dev *dev); + +#endif /* __MLX5_ECPF_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en.h new file mode 100644 index 0000000..b7d66e7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -0,0 +1,1376 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __MLX5_EN_H__ +#define __MLX5_EN_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "wq.h" +#include "mlx5_core.h" +#include "en_stats.h" +#include "en/dcbnl.h" +#include "en/fs.h" +#include "en/qos.h" +#include "lib/hv_vhca.h" +#include "lib/clock.h" +#include "en/rx_res.h" + +extern const struct net_device_ops mlx5e_netdev_ops; +struct page_pool; + +#define MLX5E_METADATA_ETHER_TYPE (0x8CE4) +#define MLX5E_METADATA_ETHER_LEN 8 + +#define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN) + +#define MLX5E_HW2SW_MTU(params, hwmtu) ((hwmtu) - ((params)->hard_mtu)) +#define MLX5E_SW2HW_MTU(params, swmtu) ((swmtu) + ((params)->hard_mtu)) + +#define MLX5E_MAX_NUM_TC 8 +#define MLX5E_MIN_NUM_TC 0 +#define MLX5E_MAX_NUM_MQPRIO_CH_TC TC_QOPT_MAX_QUEUE + +#define MLX5_RX_HEADROOM NET_SKB_PAD +#define MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) + \ + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + +#define MLX5E_RX_MAX_HEAD (256) +#define MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE (9) +#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE (PAGE_SIZE >> MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE) +#define MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE (64) +#define MLX5E_SHAMPO_WQ_RESRV_SIZE (64 * 1024) +#define MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE (4096) + +#define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \ + (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */ +#define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \ + max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req) +#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev) \ + MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, order_base_2(MLX5E_RX_MAX_HEAD)) + +#define MLX5_MPWRQ_LOG_WQE_SZ 18 +#define MLX5_MPWRQ_WQE_PAGE_ORDER (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \ + MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0) +#define MLX5_MPWRQ_PAGES_PER_WQE BIT(MLX5_MPWRQ_WQE_PAGE_ORDER) + +#define MLX5_ALIGN_MTTS(mtts) (ALIGN(mtts, 8)) +#define MLX5_ALIGNED_MTTS_OCTW(mtts) ((mtts) / 2) +#define MLX5_MTT_OCTW(mtts) (MLX5_ALIGNED_MTTS_OCTW(MLX5_ALIGN_MTTS(mtts))) +/* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between + * WQEs, This page will absorb write overflow by the hardware, when + * receiving packets larger than MTU. These oversize packets are + * dropped by the driver at a later stage. 
+ */ +#define MLX5E_REQUIRED_WQE_MTTS (MLX5_ALIGN_MTTS(MLX5_MPWRQ_PAGES_PER_WQE + 1)) +#define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS) +#define MLX5E_MAX_RQ_NUM_MTTS \ + (ALIGN_DOWN(U16_MAX, 4) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */ +#define MLX5E_ORDER2_MAX_PACKET_MTU (order_base_2(10 * 1024)) +#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW \ + (ilog2(MLX5E_MAX_RQ_NUM_MTTS / MLX5E_REQUIRED_WQE_MTTS)) +#define MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW \ + (MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \ + (MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU)) + +#define MLX5E_MIN_SKB_FRAG_SZ (MLX5_SKB_FRAG_SZ(MLX5_RX_HEADROOM)) +#define MLX5E_LOG_MAX_RX_WQE_BULK \ + (ilog2(PAGE_SIZE / roundup_pow_of_two(MLX5E_MIN_SKB_FRAG_SZ))) + +#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE 0x6 +#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE 0xa +#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE 0xd + +#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE (1 + MLX5E_LOG_MAX_RX_WQE_BULK) +#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE 0xa +#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd, \ + MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW) + +#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW 0x2 + +#ifdef CONFIG_PPC +#define MLX5E_DEFAULT_LRO_TIMEOUT 1024 +#else +#define MLX5E_DEFAULT_LRO_TIMEOUT 32 +#endif +#define MLX5E_LRO_TIMEOUT_ARR_SIZE 4 + +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC 0x10 +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3 +#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS 0x20 +#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC 0x10 +#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE 0x10 +#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS 0x20 +#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES 0x80 +#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW 0x2 + +#define MLX5E_MIN_NUM_CHANNELS 0x1 +#define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE / 2) +#define MLX5E_TX_CQ_POLL_BUDGET 128 +#define MLX5E_TX_XSK_POLL_BUDGET 64 +#define MLX5E_SQ_RECOVER_MIN_INTERVAL 500 /* msecs */ + +#define MLX5E_UMR_WQE_INLINE_SZ \ + (sizeof(struct mlx5e_umr_wqe) + \ + ALIGN(MLX5_MPWRQ_PAGES_PER_WQE * sizeof(struct mlx5_mtt), \ + MLX5_UMR_MTT_ALIGNMENT)) +#define MLX5E_UMR_WQEBBS \ + (DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_BB)) + +#define MLX5E_KLM_UMR_WQE_SZ(sgl_len)\ + (sizeof(struct mlx5e_umr_wqe) +\ + (sizeof(struct mlx5_klm) * (sgl_len))) + +#define MLX5E_KLM_UMR_WQEBBS(klm_entries) \ + (DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_BB)) + +#define MLX5E_KLM_UMR_DS_CNT(klm_entries)\ + (DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_DS)) + +#define MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size)\ + (((wqe_size) - sizeof(struct mlx5e_umr_wqe)) / sizeof(struct mlx5_klm)) + +#define MLX5E_KLM_ENTRIES_PER_WQE(wqe_size)\ + ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), MLX5_UMR_KLM_ALIGNMENT) + +#define MLX5E_MAX_KLM_PER_WQE(mdev) \ + MLX5E_KLM_ENTRIES_PER_WQE(MLX5_SEND_WQE_BB * mlx5e_get_max_sq_aligned_wqebbs(mdev)) + +#define MLX5E_MSG_LEVEL NETIF_MSG_LINK + +#define mlx5e_dbg(mlevel, priv, format, ...) 
\ +do { \ + if (NETIF_MSG_##mlevel & (priv)->msglevel) \ + netdev_warn(priv->netdev, format, \ + ##__VA_ARGS__); \ +} while (0) + +#define mlx5e_state_dereference(priv, p) \ + rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock)) + +enum mlx5e_rq_group { + MLX5E_RQ_GROUP_REGULAR, + MLX5E_RQ_GROUP_XSK, +#define MLX5E_NUM_RQ_GROUPS(g) (1 + MLX5E_RQ_GROUP_##g) +}; + +static inline u8 mlx5e_get_num_lag_ports(struct mlx5_core_dev *mdev) +{ + if (mlx5_lag_is_lacp_owner(mdev)) + return 1; + + return clamp_t(u8, MLX5_CAP_GEN(mdev, num_lag_ports), 1, MLX5_MAX_PORTS); +} + +static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size) +{ + switch (wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW, + wq_size / 2); + default: + return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES, + wq_size / 2); + } +} + +/* Use this function to get max num channels (rxqs/txqs) only to create netdev */ +static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev) +{ + return is_kdump_kernel() ? + MLX5E_MIN_NUM_CHANNELS : + min_t(int, mlx5_comp_vectors_count(mdev), MLX5E_MAX_NUM_CHANNELS); +} + +/* The maximum WQE size can be retrieved by max_wqe_sz_sq in + * bytes units. Driver hardens the limitation to 1KB (16 + * WQEBBs), unless firmware capability is stricter. + */ +static inline u8 mlx5e_get_max_sq_wqebbs(struct mlx5_core_dev *mdev) +{ + BUILD_BUG_ON(MLX5_SEND_WQE_MAX_WQEBBS > U8_MAX); + + return (u8)min_t(u16, MLX5_SEND_WQE_MAX_WQEBBS, + MLX5_CAP_GEN(mdev, max_wqe_sz_sq) / MLX5_SEND_WQE_BB); +} + +static inline u8 mlx5e_get_max_sq_aligned_wqebbs(struct mlx5_core_dev *mdev) +{ +/* The return value will be multiplied by MLX5_SEND_WQEBB_NUM_DS. + * Since max_sq_wqebbs may be up to MLX5_SEND_WQE_MAX_WQEBBS == 16, + * see mlx5e_get_max_sq_wqebbs(), the multiplication (16 * 4 == 64) + * overflows the 6-bit DS field of Ctrl Segment. Use a bound lower + * than MLX5_SEND_WQE_MAX_WQEBBS to let a full-session WQE be + * cache-aligned. 
+ */ + u8 wqebbs = mlx5e_get_max_sq_wqebbs(mdev); + + wqebbs = min_t(u8, wqebbs, MLX5_SEND_WQE_MAX_WQEBBS - 1); +#if L1_CACHE_BYTES >= 128 + wqebbs = ALIGN_DOWN(wqebbs, 2); +#endif + return wqebbs; +} + +struct mlx5e_tx_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_eth_seg eth; + struct mlx5_wqe_data_seg data[]; +}; + +struct mlx5e_rx_wqe_ll { + struct mlx5_wqe_srq_next_seg next; + struct mlx5_wqe_data_seg data[]; +}; + +struct mlx5e_rx_wqe_cyc { + struct mlx5_wqe_data_seg data[0]; +}; + +struct mlx5e_umr_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_umr_ctrl_seg uctrl; + struct mlx5_mkey_seg mkc; + union { + DECLARE_FLEX_ARRAY(struct mlx5_mtt, inline_mtts); + DECLARE_FLEX_ARRAY(struct mlx5_klm, inline_klms); + }; +}; + +enum mlx5e_priv_flag { + MLX5E_PFLAG_RX_CQE_BASED_MODER, + MLX5E_PFLAG_TX_CQE_BASED_MODER, + MLX5E_PFLAG_RX_CQE_COMPRESS, + MLX5E_PFLAG_RX_STRIDING_RQ, + MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, + MLX5E_PFLAG_XDP_TX_MPWQE, + MLX5E_PFLAG_SKB_TX_MPWQE, + MLX5E_PFLAG_TX_PORT_TS, + MLX5E_PFLAG_DROPLESS_RQ, + MLX5E_PFLAG_PER_CH_STATS, + MLX5E_PFLAG_TX_XDP_CSUM, + MLX5E_NUM_PFLAGS, /* Keep last */ +}; + +#define MLX5E_SET_PFLAG(params, pflag, enable) \ + do { \ + if (enable) \ + (params)->pflags |= BIT(pflag); \ + else \ + (params)->pflags &= ~(BIT(pflag)); \ + } while (0) + +#define MLX5E_GET_PFLAG(params, pflag) (!!((params)->pflags & (BIT(pflag)))) + +enum packet_merge { + MLX5E_PACKET_MERGE_NONE, + MLX5E_PACKET_MERGE_LRO, + MLX5E_PACKET_MERGE_SHAMPO, +}; + +struct mlx5e_packet_merge_param { + enum packet_merge type; + u32 timeout; + struct { + u8 match_criteria_type; + u8 alignment_granularity; + } shampo; +}; + +struct mlx5e_params { + u8 log_sq_size; + u8 rq_wq_type; + u8 log_rq_mtu_frames; + u8 log_rx_page_cache_mult; + u16 num_channels; + struct { + u16 mode; + u8 num_tc; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; + struct { + u64 max_rate[TC_MAX_QUEUE]; + u32 hw_id[TC_MAX_QUEUE]; + } channel; + } mqprio; + bool rx_cqe_compress_def; + bool tunneled_offload_en; + struct dim_cq_moder rx_cq_moderation; + struct dim_cq_moder tx_cq_moderation; + struct mlx5e_packet_merge_param packet_merge; + u8 tx_min_inline_mode; + bool vlan_strip_disable; + bool scatter_fcs_en; + bool rx_dim_enabled; + bool tx_dim_enabled; + u32 pflags; + struct bpf_prog *xdp_prog; + struct mlx5e_xsk *xsk; + unsigned int sw_mtu; + int hard_mtu; + bool ptp_rx; + struct { + __u32 flag; + u32 mst_size; + } dump; +}; + +static inline u8 mlx5e_get_dcb_num_tc(struct mlx5e_params *params) +{ + return params->mqprio.mode == TC_MQPRIO_MODE_DCB ? 
+ params->mqprio.num_tc : 1; +} + +enum { + MLX5E_RQ_STATE_ENABLED, + MLX5E_RQ_STATE_RECOVERING, + MLX5E_RQ_STATE_AM, + MLX5E_RQ_STATE_NO_CSUM_COMPLETE, + MLX5E_RQ_STATE_CSUM_FULL, /* cqe_csum_full hw bit is set */ + MLX5E_RQ_STATE_FPGA_TLS, /* FPGA TLS enabled */ + MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, /* set when mini_cqe_resp_stride_index cap is used */ + MLX5E_RQ_STATE_SHAMPO, /* set when SHAMPO cap is used */ + MLX5E_RQ_STATE_CACHE_REDUCE_PENDING +}; + +struct mlx5e_cq { + /* data path - accessed per cqe */ + struct mlx5_cqwq wq; + + /* data path - accessed per napi poll */ + u16 event_ctr; + struct napi_struct *napi; + struct mlx5_core_cq mcq; + struct mlx5e_ch_stats *ch_stats; + + /* control */ + struct net_device *netdev; + struct mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + struct mlx5_wq_ctrl wq_ctrl; + bool no_arm; +} ____cacheline_aligned_in_smp; + +struct mlx5e_cq_decomp { + /* cqe decompression */ + struct mlx5_cqe64 title; + struct mlx5_mini_cqe8 mini_arr[MLX5_MINI_CQE_ARRAY_SIZE]; + u8 mini_arr_idx; + u16 left; + u16 wqe_counter; +} ____cacheline_aligned_in_smp; + +enum mlx5e_dma_map_type { + MLX5E_DMA_MAP_SINGLE, + MLX5E_DMA_MAP_PAGE +}; + +struct mlx5e_sq_dma { + dma_addr_t addr; + u32 size; + enum mlx5e_dma_map_type type; +}; + +enum { + MLX5E_SQ_STATE_ENABLED, + MLX5E_SQ_STATE_MPWQE, + MLX5E_SQ_STATE_RECOVERING, + MLX5E_SQ_STATE_IPSEC, + MLX5E_SQ_STATE_AM, + MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, + MLX5E_SQ_STATE_PENDING_XSK_TX, + MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, + MLX5E_SQ_STATE_TX_XDP_CSUM, +}; + +struct mlx5e_tx_mpwqe { + /* Current MPWQE session */ + struct mlx5e_tx_wqe *wqe; + u32 bytes_count; + u8 ds_count; + u8 pkt_count; + u8 inline_on; +}; + +struct mlx5e_skb_fifo { + struct sk_buff **fifo; + u16 *pc; + u16 *cc; + u16 mask; +}; + +struct mlx5e_ptpsq; + +struct mlx5e_dim { + struct dim dim; + struct dim_sample sample; +}; + +struct mlx5e_txqsq { + /* data path */ + + /* dirtied @completion */ + u16 cc; + u16 skb_fifo_cc; + u32 dma_fifo_cc; + struct mlx5e_dim dim_obj; /* Adaptive Moderation */ + + /* dirtied @xmit */ + u16 pc ____cacheline_aligned_in_smp; + u16 skb_fifo_pc; + u32 dma_fifo_pc; + struct mlx5e_tx_mpwqe mpwqe; + + struct mlx5e_cq cq; + + /* read only */ + struct mlx5_wq_cyc wq; + u32 dma_fifo_mask; + struct mlx5e_sq_stats *stats; + struct { + struct mlx5e_sq_dma *dma_fifo; + struct mlx5e_skb_fifo skb_fifo; + struct mlx5e_tx_wqe_info *wqe_info; + } db; + void __iomem *uar_map; + struct netdev_queue *txq; + u32 sqn; + u16 stop_room; + u8 max_sq_mpw_wqebbs; + u8 min_inline_mode; + struct device *pdev; + __be32 mkey_be; + unsigned long state; + unsigned int hw_mtu; + struct mlx5_clock *clock; + struct net_device *netdev; + struct mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + int ch_ix; + int txq_ix; + u32 rate_limit; + struct work_struct recover_work; + struct mlx5e_ptpsq *ptpsq; + cqe_ts_to_ns ptp_cyc2time; +} ____cacheline_aligned_in_smp; + +struct mlx5e_dma_info { + dma_addr_t addr; + u32 refcnt_bias; + union { + struct page *page; + struct xdp_buff *xsk; + }; +}; + +/* XDP packets can be transmitted in different ways. On completion, we need to + * distinguish between them to clean up things in a proper way. + */ +enum mlx5e_xdp_xmit_mode { + /* An xdp_frame was transmitted due to either XDP_REDIRECT from another + * device or XDP_TX from an XSK RQ. The frame has to be unmapped and + * returned. 
+ */ + MLX5E_XDP_XMIT_MODE_FRAME, + + /* The xdp_frame was created in place as a result of XDP_TX from a + * regular RQ. No DMA remapping happened, and the page belongs to us. + */ + MLX5E_XDP_XMIT_MODE_PAGE, + + /* No xdp_frame was created at all, the transmit happened from a UMEM + * page. The UMEM Completion Ring producer pointer has to be increased. + */ + MLX5E_XDP_XMIT_MODE_XSK, +}; + +struct mlx5e_xdp_info { + enum mlx5e_xdp_xmit_mode mode; + union { + struct { + struct xdp_frame *xdpf; + dma_addr_t dma_addr; + } frame; + struct { + struct mlx5e_rq *rq; + struct mlx5e_dma_info di; + } page; + }; +}; + +struct mlx5e_xmit_data { + dma_addr_t dma_addr; + void *data; + u32 len; +}; + +struct mlx5e_xdp_info_fifo { + struct mlx5e_xdp_info *xi; + u32 *cc; + u32 *pc; + u32 mask; +}; + +struct mlx5e_xdpsq; +typedef int (*mlx5e_fp_xmit_xdp_frame_check)(struct mlx5e_xdpsq *); +typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq *, + struct mlx5e_xmit_data *, + struct mlx5e_xdp_info *, + int); + +struct mlx5e_xdpsq { + /* data path */ + + /* dirtied @completion */ + u32 xdpi_fifo_cc; + u16 cc; + + /* dirtied @xmit */ + u32 xdpi_fifo_pc ____cacheline_aligned_in_smp; + u16 pc; + struct mlx5_wqe_ctrl_seg *doorbell_cseg; + struct mlx5e_tx_mpwqe mpwqe; + + struct mlx5e_cq cq; + + /* read only */ + struct xsk_buff_pool *xsk_pool; + struct mlx5_wq_cyc wq; + struct mlx5e_xdpsq_stats *stats; + mlx5e_fp_xmit_xdp_frame_check xmit_xdp_frame_check; + mlx5e_fp_xmit_xdp_frame xmit_xdp_frame; + struct { + struct mlx5e_xdp_wqe_info *wqe_info; + struct mlx5e_xdp_info_fifo xdpi_fifo; + } db; + void __iomem *uar_map; + u32 sqn; + struct device *pdev; + __be32 mkey_be; + u16 stop_room; + u8 max_sq_mpw_wqebbs; + u8 min_inline_mode; + unsigned long state; + unsigned int hw_mtu; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5e_channel *channel; +} ____cacheline_aligned_in_smp; + +struct mlx5e_ktls_resync_resp; + +struct mlx5e_icosq { + /* data path */ + u16 cc; + u16 pc; + + struct mlx5_wqe_ctrl_seg *doorbell_cseg; + struct mlx5e_cq cq; + + /* write@xmit, read@completion */ + struct { + struct mlx5e_icosq_wqe_info *wqe_info; + } db; + + /* read only */ + struct mlx5_wq_cyc wq; + void __iomem *uar_map; + u32 sqn; + u16 reserved_room; + unsigned long state; + struct mlx5e_ktls_resync_resp *ktls_resync; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5e_channel *channel; + + struct work_struct recover_work; +} ____cacheline_aligned_in_smp; + +struct mlx5e_wqe_frag_info { + struct mlx5e_dma_info *di; + u32 offset; + bool last_in_page; +}; + +struct mlx5e_umr_dma_info { + struct mlx5e_dma_info dma_info[MLX5_MPWRQ_PAGES_PER_WQE]; +}; + +struct mlx5e_mpw_info { + struct mlx5e_umr_dma_info umr; + u16 consumed_strides; + DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); +}; + +#define MLX5E_MAX_RX_FRAGS 4 + +#define MLX5E_PAGE_CACHE_LOG_MAX_RQ_MULT 4 +#define MLX5E_PAGE_CACHE_REDUCE_WORK_INTERVAL 200 /* msecs */ +#define MLX5E_PAGE_CACHE_REDUCE_GRACE_PERIOD 1000 /* msecs */ +#define MLX5E_PAGE_CACHE_REDUCE_SUCCESSIVE_CNT 5 + +struct mlx5e_page_cache_reduce { + struct delayed_work reduce_work; + u32 successive; + unsigned long next_ts; + unsigned long graceful_period; + unsigned long delay; + + struct mlx5e_dma_info *pending; + u32 npages; +}; + +struct mlx5e_page_cache { + struct mlx5e_dma_info *page_cache; + int head; + u32 sz; + u32 lrs; /* least recently sampled */ + u8 log_min_sz; + u8 log_max_sz; + struct mlx5e_page_cache_reduce reduce; +}; + +static inline void 
mlx5e_put_page(struct mlx5e_dma_info *dma_info) +{ + page_ref_sub(dma_info->page, dma_info->refcnt_bias); + put_page(dma_info->page); +} + +struct mlx5e_rq; +typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*); +typedef struct sk_buff * +(*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); +typedef struct sk_buff * +(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, + struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt); +typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq); +typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16); +typedef void (*mlx5e_fp_shampo_dealloc_hd)(struct mlx5e_rq*, u16, u16, bool); + +int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool xsk); +void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params); +void mlx5e_rq_init_handler(struct mlx5e_rq *rq); + +enum mlx5e_rq_flag { + MLX5E_RQ_FLAG_XDP_XMIT, + MLX5E_RQ_FLAG_XDP_REDIRECT, +}; + +struct mlx5e_rq_frag_info { + int frag_size; + int frag_stride; +}; + +struct mlx5e_rq_frags_info { + struct mlx5e_rq_frag_info arr[MLX5E_MAX_RX_FRAGS]; + u8 num_frags; + u8 log_num_frags; + u8 wqe_bulk; +}; + +struct mlx5e_shampo_hd { + u32 mkey; + struct mlx5e_dma_info *info; + struct page *last_page; + u16 hd_per_wq; + u16 hd_per_wqe; + unsigned long *bitmap; + u16 pi; + u16 ci; + __be32 key; + u64 last_addr; +}; + +struct mlx5e_hw_gro_data { + struct sk_buff *skb; + struct flow_keys fk; + int second_ip_id; +}; + +struct mlx5e_rq { + /* data path */ + union { + struct { + struct mlx5_wq_cyc wq; + struct mlx5e_wqe_frag_info *frags; + struct mlx5e_dma_info *di; + struct mlx5e_rq_frags_info info; + mlx5e_fp_skb_from_cqe skb_from_cqe; + } wqe; + struct { + struct mlx5_wq_ll wq; + struct mlx5e_umr_wqe umr_wqe; + struct mlx5e_mpw_info *info; + mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq; + u16 num_strides; + u16 actual_wq_head; + u8 log_stride_sz; + u8 umr_in_progress; + u8 umr_last_bulk; + u8 umr_completed; + struct mlx5e_shampo_hd *shampo; + } mpwqe; + + }; + struct { + u16 headroom; + u32 frame0_sz; + u8 map_dir; /* dma map direction */ + } buff; + + struct device *pdev; + struct net_device *netdev; + struct mlx5e_rq_stats *stats; + struct mlx5e_cq cq; + struct mlx5e_cq_decomp cqd; + struct mlx5e_page_cache page_cache; + struct hwtstamp_config *tstamp; + struct mlx5_clock *clock; + struct mlx5e_icosq *icosq; + struct mlx5e_priv *priv; + + struct mlx5e_hw_gro_data *hw_gro_data; + + mlx5e_fp_handle_rx_cqe handle_rx_cqe; + mlx5e_fp_post_rx_wqes post_wqes; + mlx5e_fp_dealloc_wqe dealloc_wqe; + + unsigned long state; + int ix; + unsigned int hw_mtu; + unsigned int pet_hdr_size; + + struct mlx5e_dim dim_obj; /* Adaptive Moderation */ + + /* XDP */ + struct bpf_prog __rcu *xdp_prog; + struct mlx5e_xdpsq *xdpsq; + DECLARE_BITMAP(flags, 8); + struct page_pool *page_pool; + + /* AF_XDP zero-copy */ + struct xsk_buff_pool *xsk_pool; + + struct work_struct recover_work; + + /* control */ + struct mlx5_wq_ctrl wq_ctrl; + __be32 mkey_be; + u8 wq_type; + u32 rqn; + struct mlx5_core_dev *mdev; + struct mlx5e_channel *channel; + u32 umr_mkey; + struct mlx5e_dma_info wqe_overflow; + + /* XDP read-mostly */ + struct xdp_rxq_info xdp_rxq; + cqe_ts_to_ns ptp_cyc2time; +} ____cacheline_aligned_in_smp; + +enum mlx5e_channel_state { + MLX5E_CHANNEL_STATE_XSK, + MLX5E_CHANNEL_NUM_STATES +}; + +struct mlx5e_channel { + /* data path */ + struct mlx5e_rq rq; + struct mlx5e_xdpsq 
rq_xdpsq; + struct mlx5e_txqsq sq[MLX5E_MAX_NUM_TC]; + struct mlx5e_icosq icosq; /* internal control operations */ + struct mlx5e_txqsq __rcu * __rcu *qos_sqs; + bool xdp; + struct napi_struct napi; + struct device *pdev; + struct net_device *netdev; + __be32 mkey_be; + u16 qos_sqs_size; + u8 num_tc; + u8 lag_port; + + /* XDP_REDIRECT */ + struct mlx5e_xdpsq xdpsq; + + /* AF_XDP zero-copy */ + struct mlx5e_rq xskrq; + struct mlx5e_xdpsq xsksq; + + /* Async ICOSQ */ + struct mlx5e_icosq async_icosq; + /* async_icosq can be accessed from any CPU - the spinlock protects it. */ + spinlock_t async_icosq_lock; + + /* data path - accessed per napi poll */ + const struct cpumask *aff_mask; + struct mlx5e_ch_stats *stats; + + /* control */ + struct dentry *dfs_root; + struct mlx5e_priv *priv; + struct mlx5_core_dev *mdev; + struct hwtstamp_config *tstamp; + DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES); + int ix; + int cpu; + /* Sync between icosq recovery and XSK enable/disable. */ + struct mutex icosq_recovery_lock; +}; + +struct mlx5e_ptp; + +struct mlx5e_channels { + struct mlx5e_channel **c; + struct mlx5e_ptp *ptp; + unsigned int num; + struct mlx5e_params params; +}; + +struct mlx5e_channel_stats { + struct mlx5e_ch_stats ch; + struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC]; + struct mlx5e_rq_stats rq; + struct mlx5e_rq_stats xskrq; + struct mlx5e_xdpsq_stats rq_xdpsq; + struct mlx5e_xdpsq_stats xdpsq; + struct mlx5e_xdpsq_stats xsksq; +} ____cacheline_aligned_in_smp; + +struct mlx5e_ptp_stats { + struct mlx5e_ch_stats ch; + struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC]; + struct mlx5e_ptp_cq_stats cq[MLX5E_MAX_NUM_TC]; + struct mlx5e_rq_stats rq; +} ____cacheline_aligned_in_smp; + +enum { + MLX5E_STATE_OPENED, + MLX5E_STATE_DESTROYING, + MLX5E_STATE_XDP_TX_ENABLED, + MLX5E_STATE_XDP_ACTIVE, +}; + +enum { + MLX5E_TC_PRIO = 0, + MLX5E_NIC_PRIO +}; + +struct mlx5e_modify_sq_param { + int curr_state; + int next_state; + int rl_update; + int rl_index; + bool qos_update; + u16 qos_queue_group_id; +}; + +#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) +struct mlx5e_hv_vhca_stats_agent { + struct mlx5_hv_vhca_agent *agent; + struct delayed_work work; + u16 delay; + void *buf; +}; +#endif + +struct mlx5e_xsk { + /* XSK buffer pools are stored separately from channels, + * because we don't want to lose them when channels are + * recreated. The kernel also stores buffer pool, but it doesn't + * distinguish between zero-copy and non-zero-copy UMEMs, so + * rely on our mechanism. + */ + struct xsk_buff_pool **pools; + u16 refcnt; + bool ever_used; +}; + +/* Temporary storage for variables that are allocated when struct mlx5e_priv is + * initialized, and used where we can't allocate them because that functions + * must not fail. Use with care and make sure the same variable is not used + * simultaneously by multiple users. 
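+ * (At the moment the only such variable is the pre-allocated cpumask in
+ * struct mlx5e_scratchpad below.)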
+ */ +struct mlx5e_scratchpad { + cpumask_var_t cpumask; +}; + +struct mlx5e_select_queue_params { + unsigned int num_regular_queues; + unsigned int num_channels; + unsigned int num_tcs; + bool is_htb; + bool is_ptp; +}; + +struct mlx5e_htb { + DECLARE_HASHTABLE(qos_tc2node, order_base_2(MLX5E_QOS_MAX_LEAF_NODES)); + DECLARE_BITMAP(qos_used_qids, MLX5E_QOS_MAX_LEAF_NODES); + struct mlx5e_sq_stats **qos_sq_stats; + u16 max_qos_sqs; + u16 maj_id; + u16 defcls; + struct mlx5e_select_queue_params *final_selq; +}; + +struct mlx5e_delay_drop { + struct work_struct work; + /* serialize setting of delay drop */ + struct mutex lock; + u32 usec_timeout; + bool activate; +}; + +struct mlx5e_trap; + +struct mlx5e_priv { + /* priv data path fields - start */ + struct mlx5e_select_queue_params __rcu *selq; + struct mlx5e_txqsq **txq2sq; +#ifdef CONFIG_MLX5_CORE_EN_DCB + struct mlx5e_dcbx_dp dcbx_dp; +#endif + int pcp_tc_num; + /* priv data path fields - end */ + + u32 msglevel; + unsigned long state; + struct mutex state_lock; /* Protects Interface state */ + struct mlx5e_rq drop_rq; + + struct mlx5e_channels channels; + u32 tisn[MLX5_MAX_PORTS][MLX5E_MAX_NUM_TC]; + struct mlx5e_rx_res *rx_res; + u32 *tx_rates; + + struct mlx5e_flow_steering fs; + + struct workqueue_struct *wq; + struct work_struct update_carrier_work; + struct work_struct set_rx_mode_work; + struct work_struct tx_timeout_work; + struct work_struct update_stats_work; + struct work_struct monitor_counters_work; + struct mlx5_nb monitor_counters_nb; + + struct mlx5_core_dev *mdev; + struct net_device *netdev; + struct mlx5e_trap *en_trap; + struct mlx5e_stats stats; + struct mlx5e_channel_stats **channel_stats; + struct mlx5e_channel_stats trap_stats; + struct mlx5e_ptp_stats ptp_stats; + u16 stats_nch; + u16 max_nch; + u8 max_opened_tc; + u8 shared_rq:1; + bool tx_ptp_opened; + bool rx_ptp_opened; + struct hwtstamp_config tstamp; + u16 q_counter; + u16 drop_rq_q_counter; + struct notifier_block events_nb; + struct notifier_block blocking_events_nb; + + struct udp_tunnel_nic_info nic_info; +#ifdef CONFIG_MLX5_CORE_EN_DCB + struct mlx5e_dcbx dcbx; +#endif + + const struct mlx5e_profile *profile; + void *ppriv; +#ifdef CONFIG_MLX5_EN_MACSEC + struct mlx5e_macsec *macsec; +#endif +#ifdef CONFIG_MLX5_EN_IPSEC + struct mlx5e_ipsec *ipsec; +#endif +#ifdef CONFIG_MLX5_EN_TLS + struct mlx5e_tls *tls; +#endif + struct dentry *dfs_root; + struct mlx5e_delay_drop delay_drop; + struct devlink_health_reporter *tx_reporter; + struct devlink_health_reporter *rx_reporter; + struct mlx5e_xsk xsk; +#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) + struct mlx5e_hv_vhca_stats_agent stats_agent; +#endif + struct mlx5e_scratchpad scratchpad; + struct mlx5e_htb htb; + struct mlx5e_mqprio_rl *mqprio_rl; + + struct mutex aso_lock; /* Protects aso data and operations */ + struct mlx5e_aso *aso; +}; + +struct mlx5e_rx_handlers { + mlx5e_fp_handle_rx_cqe handle_rx_cqe; + mlx5e_fp_handle_rx_cqe handle_rx_cqe_mpwqe; + mlx5e_fp_handle_rx_cqe handle_rx_cqe_mpwqe_shampo; +}; + +extern const struct mlx5e_rx_handlers mlx5e_rx_handlers_nic; + +enum mlx5e_profile_feature { + MLX5E_PROFILE_FEATURE_PTP_RX, + MLX5E_PROFILE_FEATURE_PTP_TX, + MLX5E_PROFILE_FEATURE_QOS_HTB, +}; + +struct mlx5e_profile { + int (*init)(struct mlx5_core_dev *mdev, + struct net_device *netdev); + void (*cleanup)(struct mlx5e_priv *priv); + int (*init_rx)(struct mlx5e_priv *priv); + void (*cleanup_rx)(struct mlx5e_priv *priv); + int (*init_tx)(struct mlx5e_priv *priv); + void (*cleanup_tx)(struct 
mlx5e_priv *priv); + void (*enable)(struct mlx5e_priv *priv); + void (*disable)(struct mlx5e_priv *priv); + int (*update_rx)(struct mlx5e_priv *priv); + void (*update_stats)(struct mlx5e_priv *priv); + void (*update_carrier)(struct mlx5e_priv *priv); + int (*max_nch_limit)(struct mlx5_core_dev *mdev); + unsigned int (*stats_grps_num)(struct mlx5e_priv *priv); + mlx5e_stats_grp_t *stats_grps; + const struct mlx5e_rx_handlers *rx_handlers; + int max_tc; + u8 rq_groups; + u32 features; +}; + +#define mlx5e_profile_feature_cap(profile, feature) \ + ((profile)->features & BIT(MLX5E_PROFILE_FEATURE_##feature)) + +void mlx5e_create_debugfs(struct mlx5e_priv *priv); +void mlx5e_destroy_debugfs(struct mlx5e_priv *priv); + +void mlx5e_build_ptys2ethtool_map(void); + +bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev); + +void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq, u16 len, u16 start, bool close); +int mlx5e_sysfs_create(struct net_device *dev); +void mlx5e_sysfs_remove(struct net_device *dev); + +int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, + struct tc_mqprio_qopt_offload *mqprio); + +void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); +void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s); + +void mlx5e_init_l2_addr(struct mlx5e_priv *priv); +int mlx5e_self_test_num(struct mlx5e_priv *priv); +int mlx5e_self_test_fill_strings(struct mlx5e_priv *priv, u8 *data); +void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf); +void mlx5e_set_rx_mode_work(struct work_struct *work); + +int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr); +int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr); +int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val, bool rx_filter); + +int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto, + u16 vid); +int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto, + u16 vid); +void mlx5e_timestamp_init(struct mlx5e_priv *priv); + +struct mlx5e_xsk_param; + +struct mlx5e_rq_param; +struct mlx5e_create_cq_param { + struct napi_struct *napi; + struct mlx5e_ch_stats *ch_stats; + int node; + int ix; +}; +int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time); +void mlx5e_close_rq(struct mlx5e_priv *priv, struct mlx5e_rq *rq); +int mlx5e_open_rq(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_rq_param *param, struct mlx5e_xsk_param *xsk, + struct mlx5e_create_cq_param *ccp, struct dim_cq_moder moder, + int node, struct mlx5e_rq *rq); +int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param); +void mlx5e_destroy_rq(struct mlx5e_rq *rq); + +struct mlx5e_sq_param; +int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool, + struct mlx5e_xdpsq *sq, bool is_redirect); +void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq); +void mlx5e_activate_xdpsq(struct mlx5e_xdpsq *sq); +void mlx5e_deactivate_xdpsq(struct mlx5e_xdpsq *sq); + +struct mlx5e_cq_param; +int mlx5e_open_cq(struct mlx5e_priv *priv, struct dim_cq_moder moder, + struct mlx5e_cq_param *param, struct mlx5e_create_cq_param *ccp, + struct mlx5e_cq *cq); +void mlx5e_close_cq(struct mlx5e_cq *cq); +int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param); +int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq); +void mlx5e_free_cq(struct 
mlx5e_cq *cq); +int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + u32 *mkey); + + +int mlx5e_open_locked(struct net_device *netdev); +int mlx5e_close_locked(struct net_device *netdev); + +void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c); +void mlx5e_trigger_napi_sched(struct napi_struct *napi); + +int mlx5e_open_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *chs); +void mlx5e_close_channels(struct mlx5e_channels *chs); + +/* Function pointer to be used to modify HW or kernel settings while + * switching channels + */ +typedef int (*mlx5e_fp_preactivate)(struct mlx5e_priv *priv, void *context); +#define MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(fn) \ +int fn##_ctx(struct mlx5e_priv *priv, void *context) \ +{ \ + return fn(priv); \ +} +int mlx5e_safe_reopen_channels(struct mlx5e_priv *priv); +int mlx5e_safe_switch_params(struct mlx5e_priv *priv, + struct mlx5e_params *new_params, + mlx5e_fp_preactivate preactivate, + void *context, bool reset); +int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv); +int mlx5e_num_channels_changed_ctx(struct mlx5e_priv *priv, void *context); +void mlx5e_activate_priv_channels(struct mlx5e_priv *priv); +void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv); +int mlx5e_ptp_rx_manage_fs_ctx(struct mlx5e_priv *priv, void *ctx); + +int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state); +void mlx5e_activate_rq(struct mlx5e_rq *rq); +void mlx5e_deactivate_rq(struct mlx5e_rq *rq); +void mlx5e_activate_icosq(struct mlx5e_icosq *icosq); +void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq); + +int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, + struct mlx5e_modify_sq_param *p); +int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix, + struct mlx5e_params *params, struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, + struct mlx5e_sq_stats *sq_stats); +void mlx5e_enable_txqsq(struct mlx5e_txqsq *sq); +void mlx5e_start_txqsq(struct mlx5e_txqsq *sq); +void mlx5e_disable_txqsq(struct mlx5e_txqsq *sq); +void mlx5e_stop_txqsq(struct mlx5e_txqsq *sq); +void mlx5e_free_txqsq(struct mlx5e_txqsq *sq); +void mlx5e_tx_disable_queue(struct netdev_queue *txq); +int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa); +void mlx5e_free_txqsq_db(struct mlx5e_txqsq *sq); +struct mlx5e_create_sq_param; +int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, + struct mlx5e_sq_param *param, + struct mlx5e_create_sq_param *csp, + u16 qos_queue_group_id, + u32 *sqn); +void mlx5e_tx_err_cqe_work(struct work_struct *recover_work); +void mlx5e_close_txqsq(struct mlx5e_txqsq *sq); + +static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_ETH(mdev, swp) && + MLX5_CAP_ETH(mdev, swp_csum) && MLX5_CAP_ETH(mdev, swp_lso); +} + +extern const struct ethtool_ops mlx5e_ethtool_ops; + +int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey); +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev); +void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev); +int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, + bool enable_mc_lb); +int mlx5e_modify_tirs_packet_merge(struct mlx5e_priv *priv); +void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc); + +/* common netdev helpers */ +void mlx5e_create_q_counters(struct mlx5e_priv *priv); +void mlx5e_destroy_q_counters(struct mlx5e_priv *priv); +int mlx5e_open_drop_rq(struct mlx5e_priv *priv, + struct mlx5e_rq *drop_rq); +void mlx5e_close_drop_rq(struct 
mlx5e_rq *drop_rq); +int mlx5e_init_di_list(struct mlx5e_rq *rq, int wq_sz, int node); +void mlx5e_free_di_list(struct mlx5e_rq *rq); + +int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn); +void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn); + +int mlx5e_create_tises(struct mlx5e_priv *priv); +void mlx5e_destroy_tises(struct mlx5e_priv *priv); +int mlx5e_update_nic_rx(struct mlx5e_priv *priv); +void mlx5e_update_carrier(struct mlx5e_priv *priv); +int mlx5e_close(struct net_device *netdev); +int mlx5e_open(struct net_device *netdev); +u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout); + +void mlx5e_queue_update_stats(struct mlx5e_priv *priv); + +int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv); +int mlx5e_set_dev_port_mtu_ctx(struct mlx5e_priv *priv, void *context); +int mlx5e_change_mtu(struct net_device *netdev, int new_mtu, + mlx5e_fp_preactivate preactivate); +void mlx5e_vxlan_set_netdev_info(struct mlx5e_priv *priv); + +/* ethtool helpers */ +void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, + struct ethtool_drvinfo *drvinfo); +void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, + uint32_t stringset, uint8_t *data); +int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset); +void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv, + struct ethtool_stats *stats, u64 *data); +void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param); +int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param); +void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch); +int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch); +int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal); +int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack); +int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv, + struct ethtool_link_ksettings *link_ksettings); +int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv, + const struct ethtool_link_ksettings *link_ksettings); +int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc); +int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, + const u8 hfunc); +int mlx5e_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, + u32 *rule_locs); +int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd); +u32 mlx5e_ethtool_get_rxfh_key_size(struct mlx5e_priv *priv); +u32 mlx5e_ethtool_get_rxfh_indir_size(struct mlx5e_priv *priv); +int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, + struct ethtool_ts_info *info); +int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, + struct ethtool_flash *flash); +void mlx5e_ethtool_get_pauseparam(struct mlx5e_priv *priv, + struct ethtool_pauseparam *pauseparam); +int mlx5e_ethtool_set_pauseparam(struct mlx5e_priv *priv, + struct ethtool_pauseparam *pauseparam); + +/* mlx5e generic netdev management API */ +static inline bool +mlx5e_tx_mpwqe_supported(struct mlx5_core_dev *mdev) +{ + return !is_kdump_kernel() && + MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe); +} + +int mlx5e_get_pf_num_tirs(struct mlx5_core_dev *mdev); +int mlx5e_priv_init(struct mlx5e_priv *priv, + const struct mlx5e_profile *profile, + struct net_device *netdev, + struct mlx5_core_dev *mdev); 
+void mlx5e_priv_cleanup(struct mlx5e_priv *priv); +struct net_device * +mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile); +int mlx5e_attach_netdev(struct mlx5e_priv *priv); +void mlx5e_detach_netdev(struct mlx5e_priv *priv); +void mlx5e_destroy_netdev(struct mlx5e_priv *priv); +int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, + const struct mlx5e_profile *new_profile, void *new_ppriv); +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv); +void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv); +void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu); +void mlx5e_build_txq_maps(struct mlx5e_priv *priv); + +int mlx5e_get_dump_flag(struct net_device *netdev, struct ethtool_dump *dump); +int mlx5e_get_dump_data(struct net_device *netdev, struct ethtool_dump *dump, + void *buffer); +int mlx5e_set_dump(struct net_device *dev, struct ethtool_dump *dump); + +static inline bool mlx5e_dropless_rq_supported(struct mlx5_core_dev *mdev) +{ + return (MLX5_CAP_GEN(mdev, rq_delay_drop) && + MLX5_CAP_GEN(mdev, general_notification_event)); +} + +void mlx5e_rx_dim_work(struct work_struct *work); +void mlx5e_tx_dim_work(struct work_struct *work); + +int mlx5e_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings); +int mlx5e_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *link_ksettings); +netdev_features_t mlx5e_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features); +int mlx5e_set_features(struct net_device *netdev, netdev_features_t features); +#ifdef CONFIG_MLX5_ESWITCH +int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac); +int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); +int mlx5e_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); +int mlx5e_get_vf_stats(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats); +bool mlx5e_is_rep_shared_rq(const struct mlx5e_priv *priv); +#endif + +void mlx5e_build_selq(struct mlx5e_select_queue_params *selq, + struct mlx5e_params *params, bool htb); +void mlx5e_replace_selq(struct mlx5e_priv *priv, struct mlx5e_select_queue_params *selq); +void mlx5e_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn); +#endif /* __MLX5_EN_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.c new file mode 100644 index 0000000..45a8a38 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.c @@ -0,0 +1,449 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// // Copyright (c) 2020 Mellanox Technologies. 
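+
+/* ASO (Advanced Steering Operation) support: a dedicated SQ/CQ pair used to
+ * post MLX5_OPCODE_ACCESS_ASO work requests to the device, e.g. for the
+ * IPsec/MACsec/flow-meter opcode modifiers declared in aso.h.
+ *
+ * Rough lifecycle, summarizing the code in this file:
+ *
+ *	aso = mlx5e_aso_get(priv);        (lazily runs mlx5e_aso_setup():
+ *	                                   reg MR -> build params ->
+ *	                                   open CQ -> open ASO SQ)
+ *	mlx5e_build_aso_wqe(...);         (fill ctrl + aso_ctrl segments)
+ *	mlx5e_poll_aso_cq(&aso->sq.cq);   (busy-poll for the completion)
+ *	mlx5e_aso_put(priv);              (refcounted teardown)
+ */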
+ +#include "aso.h" + +static int mlx5e_aso_reg_mr(struct mlx5e_priv *priv, struct mlx5e_aso *aso) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct device *dma_device; + dma_addr_t dma_addr; + int err; + + err = mlx5_core_alloc_pd(mdev, &aso->pdn); + if (err) { + mlx5_core_err(mdev, "alloc pd failed, %d\n", err); + return err; + } + + if (aso->size == 0) + return 0; + + aso->ctx = kzalloc(aso->size, GFP_KERNEL); + if (!aso->ctx) { + err = -ENOMEM; + goto out_mem; + } + + dma_device = &mdev->pdev->dev; + dma_addr = dma_map_single(dma_device, aso->ctx, aso->size, DMA_BIDIRECTIONAL); + err = dma_mapping_error(dma_device, dma_addr); + if (err) { + mlx5_core_warn(mdev, "Can't dma aso\n"); + goto out_dma; + } + + err = mlx5e_create_mkey(mdev, aso->pdn, &aso->mkey); + if (err) { + mlx5_core_warn(mdev, "Can't create mkey\n"); + goto out_mkey; + } + + aso->dma_addr = dma_addr; + + return 0; + +out_mkey: + dma_unmap_single(dma_device, dma_addr, aso->size, DMA_BIDIRECTIONAL); + +out_dma: + kfree(aso->ctx); + aso->ctx = NULL; +out_mem: + mlx5_core_dealloc_pd(mdev, aso->pdn); + return err; +} + +static void mlx5e_aso_dereg_mr(struct mlx5e_priv *priv, struct mlx5e_aso *aso) +{ + mlx5_core_dealloc_pd(priv->mdev, aso->pdn); + + if (!aso->ctx) + return; + + mlx5_core_destroy_mkey(priv->mdev, aso->mkey); + dma_unmap_single(&priv->mdev->pdev->dev, aso->dma_addr, aso->size, DMA_BIDIRECTIONAL); + kfree(aso->ctx); + aso->ctx = NULL; +} + +void mlx5e_build_aso_wqe(struct mlx5e_aso *aso, struct mlx5e_asosq *sq, + u8 ds_cnt, struct mlx5_wqe_ctrl_seg *cseg, + struct mlx5_wqe_aso_ctrl_seg *aso_ctrl, + u32 obj_id, u32 opc_mode, + struct mlx5e_aso_ctrl_param *param) +{ + cseg->opmod_idx_opcode = cpu_to_be32((opc_mode << MLX5_WQE_CTRL_WQE_OPC_MOD_SHIFT) | + (sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_ACCESS_ASO); + cseg->qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt); + cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + cseg->general_id = cpu_to_be32(obj_id); + + memset(aso_ctrl, 0, sizeof(*aso_ctrl)); + if (aso->dma_addr) { + aso_ctrl->va_l = cpu_to_be32(aso->dma_addr | ASO_CTRL_READ_EN); + aso_ctrl->va_h = cpu_to_be32(aso->dma_addr >> 32); + aso_ctrl->l_key = cpu_to_be32(aso->mkey); + } + + if (param) { + aso_ctrl->data_mask_mode = param->data_mask_mode << 6; + aso_ctrl->condition_1_0_operand = param->condition_1_operand | param->condition_0_operand << 4; + aso_ctrl->condition_1_0_offset = param->condition_1_offset | param->condition_0_offset << 4; + aso_ctrl->data_offset_condition_operand = param->data_offset | param->condition_operand << 6; + aso_ctrl->condition_0_data = cpu_to_be32(param->condition_0_data); + aso_ctrl->condition_0_mask = cpu_to_be32(param->condition_0_mask); + aso_ctrl->condition_1_data = cpu_to_be32(param->condition_1_data); + aso_ctrl->condition_1_mask = cpu_to_be32(param->condition_1_mask); + aso_ctrl->bitwise_data = cpu_to_be64(param->bitwise_data); + aso_ctrl->data_mask = cpu_to_be64(param->data_mask); + } +} + +int mlx5e_poll_aso_cq(struct mlx5e_cq *cq) +{ + struct mlx5e_asosq *sq = container_of(cq, struct mlx5e_asosq, cq); + struct mlx5_cqe64 *cqe; + unsigned long expires; + int i, err; + u16 sqcc; + + err = 0; + + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) + return -EIO; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + + if (likely(!cqe)) { + /* Per Chip Design, if context is not in ICM cache, it will take 0.5us to read the context. + * We measure the total time in FW from doorbell ring until cqe update is 980us. + * So put 2us is sufficient. 
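+		 * Note that the loop below is far more generous than that: it
+		 * re-polls in 20-50us sleeps for up to ~10ms (msecs_to_jiffies(10))
+		 * before giving up and returning -EIO.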
+ */ + expires = jiffies + msecs_to_jiffies(10); + while (!cqe && time_is_after_jiffies(expires)) { + usleep_range(20, 50); /* WA for RM 2323775 */ + cqe = mlx5_cqwq_get_cqe(&cq->wq); + } + if (!cqe) { + mlx5_core_err(cq->mdev, "No ASO completion\n"); + return -EIO; + } + } + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + sqcc = sq->cc; + + i = 0; + do { + u16 wqe_counter; + bool last_wqe; + + mlx5_cqwq_pop(&cq->wq); + + wqe_counter = be16_to_cpu(cqe->wqe_counter); + + do { + struct mlx5e_aso_wqe_info *wi; + u16 ci; + + last_wqe = (sqcc == wqe_counter); + + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + wi = &sq->db.aso_wqe[ci]; + + if (last_wqe && unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { + struct mlx5_err_cqe *err_cqe; + + mlx5_core_err(cq->mdev, "Bad OP in ASOSQ CQE: 0x%x\n", + get_cqe_opcode(cqe)); + + err_cqe = (struct mlx5_err_cqe *)cqe; + mlx5_core_err(cq->mdev, "vendor_err_synd=%x\n", err_cqe->vendor_err_synd); + mlx5_core_err(cq->mdev, "syndrome=%x\n", err_cqe->syndrome); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, + sizeof(*err_cqe), false); + err = -EIO; + break; + } + + if (likely(wi->opcode == MLX5_OPCODE_NOP)) { + sqcc++; + } else if (likely(wi->opcode == MLX5_OPCODE_ACCESS_ASO)) { + if (wi->with_data) + sqcc += MLX5E_ASO_WQEBBS_DATA; + else + sqcc += MLX5E_ASO_WQEBBS; + } else { + mlx5_core_err(cq->mdev, + "Bad OPCODE in ASOSQ WQE info: 0x%x\n", + wi->opcode); + err = -EIO; + break; + } + } while (!last_wqe); + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + + sq->cc = sqcc; + + mlx5_cqwq_update_db_record(&cq->wq); + return err; +} + +void mlx5e_fill_asosq_frag_edge(struct mlx5e_asosq *sq, struct mlx5_wq_cyc *wq, + u16 pi, u16 nnops) +{ + struct mlx5e_aso_wqe_info *edge_wi, *wi = &sq->db.aso_wqe[pi]; + + edge_wi = wi + nnops; + + /* fill sq frag edge with nops to avoid wqe wrapping two pages */ + for (; wi < edge_wi; wi++) { + wi->opcode = MLX5_OPCODE_NOP; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } +} + +static void mlx5e_build_sq_param_common_aso(struct mlx5e_priv *priv, + struct mlx5e_aso *aso, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + + MLX5_SET(wq, wq, pd, aso->pdn); + param->wq.buf_numa_node = dev_to_node(priv->mdev->device); +} + +static void mlx5e_build_asosq_param(struct mlx5e_priv *priv, + struct mlx5e_aso *aso, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + mlx5e_build_sq_param_common_aso(priv, aso, param); + MLX5_SET(wq, wq, log_wq_sz, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); +} + +static int mlx5e_alloc_asosq_db(struct mlx5e_asosq *sq, int numa) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + + sq->db.aso_wqe = kvzalloc_node(array_size(wq_sz, + sizeof(*sq->db.aso_wqe)), + GFP_KERNEL, numa); + if (!sq->db.aso_wqe) + return -ENOMEM; + + return 0; +} + +static int mlx5e_alloc_asosq(struct mlx5e_priv *priv, struct mlx5e_aso *aso) +{ + struct mlx5e_sq_param *param = &aso->sq_param; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_asosq *sq = &aso->sq; + struct mlx5_wq_cyc *wq = &sq->wq; + void *sqc_wq; + int err; + + sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq); + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + + param->wq.db_numa_node = cpu_to_node(aso->cpu); + err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); + if (err) + return err; + 
wq->db = &wq->db[MLX5_SND_DBR]; + + err = mlx5e_alloc_asosq_db(sq, cpu_to_node(aso->cpu)); + if (err) + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +static void mlx5e_free_asosq_db(struct mlx5e_asosq *sq) +{ + kvfree(sq->db.aso_wqe); +} + +static void mlx5e_free_asosq(struct mlx5e_asosq *sq) +{ + mlx5e_free_asosq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static int mlx5e_open_asosq(struct mlx5e_priv *priv, struct mlx5e_aso *aso) +{ + struct mlx5e_sq_param *param = &aso->sq_param; + struct mlx5e_create_sq_param csp = {}; + struct mlx5e_asosq *sq = &aso->sq; + int err; + + err = mlx5e_alloc_asosq(priv, aso); + if (err) + return err; + + csp.cqn = sq->cq.mcq.cqn; + csp.wq_ctrl = &sq->wq_ctrl; + csp.min_inline_mode = MLX5_INLINE_MODE_NONE; + err = mlx5e_create_sq_rdy(priv->mdev, param, &csp, 0, &sq->sqn); + if (err) { + mlx5_core_err(priv->mdev, "fail to open aso sq err=%d\n", err); + goto err_free_asosq; + } + mlx5_core_dbg(priv->mdev, "sq->sqn = 0x%x\n", sq->sqn); + + set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + + return 0; + +err_free_asosq: + mlx5e_free_asosq(sq); + + return err; +} + +static void mlx5e_close_asosq(struct mlx5e_aso *aso) +{ + struct mlx5e_asosq *sq = &aso->sq; + + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + mlx5e_destroy_sq(aso->priv->mdev, sq->sqn); + mlx5e_free_asosq(sq); +} + +static int mlx5e_aso_alloc_cq(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq, int cpu) +{ + int err; + + param->wq.buf_numa_node = cpu_to_node(cpu); + param->wq.db_numa_node = cpu_to_node(cpu); + param->eq_ix = 0; /* Use first completion vector */ + + err = mlx5e_alloc_cq_common(priv, param, cq); + + /* no interrupt for aso cq */ + cq->napi = NULL; + + return err; +} + +static +int mlx5e_aso_open_cq(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq, int cpu) +{ + int err; + + err = mlx5e_aso_alloc_cq(priv, param, cq, cpu); + if (err) { + mlx5_core_err(priv->mdev, "fail to allocate aso cq err=%d\n", err); + return err; + } + + cq->no_arm = true; + err = mlx5e_create_cq(cq, param); + if (err) { + mlx5_core_err(priv->mdev, "fail to create aso cq err=%d\n", err); + goto err_free_cq; + } + + return 0; + +err_free_cq: + mlx5e_free_cq(cq); + return err; +} + +static void mlx5e_aso_build_param(struct mlx5e_priv *priv, struct mlx5e_aso *aso) +{ + mlx5e_build_aso_cq_param(priv->mdev, &aso->cq_param); + + aso->cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, 0)); + aso->sq_param.pdn = aso->pdn; + mlx5e_build_asosq_param(priv, aso, &aso->sq_param); +} + +struct mlx5e_aso * +mlx5e_aso_setup(struct mlx5e_priv *priv, int size) +{ + struct mlx5e_aso *aso; + int err; + + aso = kzalloc(sizeof(*aso), GFP_KERNEL); + if (!aso) + return NULL; + + aso->size = size; + err = mlx5e_aso_reg_mr(priv, aso); + if (err) + goto err_mr; + + mlx5e_aso_build_param(priv, aso); + err = mlx5e_aso_open_cq(priv, &aso->cq_param, &aso->sq.cq, aso->cpu); + if (err) + goto err_cq; + + err = mlx5e_open_asosq(priv, aso); + if (err) + goto err_sq; + + aso->priv = priv; + + return aso; + +err_sq: + mlx5e_close_cq(&aso->sq.cq); +err_cq: + mlx5e_aso_dereg_mr(priv, aso); +err_mr: + kfree(aso); + return NULL; +} + +void mlx5e_aso_cleanup(struct mlx5e_priv *priv, struct mlx5e_aso *aso) +{ + if (!aso) + return; + + mlx5e_close_asosq(aso); + mlx5e_close_cq(&aso->sq.cq); + mlx5e_aso_dereg_mr(priv, aso); + kfree(aso); +} + +struct mlx5e_aso * +mlx5e_aso_get(struct mlx5e_priv *priv) +{ + mutex_lock(&priv->aso_lock); + if (!priv->aso) + priv->aso = 
mlx5e_aso_setup(priv, 0); + if (priv->aso) + priv->aso->refcnt++; + mutex_unlock(&priv->aso_lock); + + return priv->aso; +} + +void mlx5e_aso_put(struct mlx5e_priv *priv) +{ + mutex_lock(&priv->aso_lock); + if (priv->aso && --priv->aso->refcnt == 0) { + mlx5e_close_asosq(priv->aso); + mlx5e_close_cq(&priv->aso->sq.cq); + mlx5_core_dealloc_pd(priv->mdev, priv->aso->pdn); + kfree(priv->aso); + priv->aso = NULL; + } + mutex_unlock(&priv->aso_lock); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.h new file mode 100644 index 0000000..6482fcc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/aso.h @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// // Copyright (c) 2020 Mellanox Technologies. + +#include "en.h" +#include "linux/dma-mapping.h" +#include "en/txrx.h" +#include "en/params.h" +#include "lib/aso.h" + +#ifndef __MLX5_EN_ASO_H__ +#define __MLX5_EN_ASO_H__ + +#define ASO_CTRL_READ_EN BIT(0) + +#define MLX5E_ASO_WQEBBS \ + (DIV_ROUND_UP(sizeof(struct mlx5e_aso_wqe), MLX5_SEND_WQE_BB)) +#define MLX5E_ASO_WQEBBS_DATA \ + (DIV_ROUND_UP(sizeof(struct mlx5e_aso_wqe_data), MLX5_SEND_WQE_BB)) +#define ASO_CTRL_READ_EN BIT(0) +#define MLX5E_MACSEC_ASO_DS_CNT \ + (DIV_ROUND_UP(sizeof(struct mlx5e_aso_wqe), MLX5_SEND_WQE_DS)) + +enum { + MLX5_ASO_SOFT_ARM = BIT(0), + MLX5_ASO_HARD_ARM = BIT(1), + MLX5_ASO_REMOVE_FLOW_ENABLE = BIT(2), + MLX5_ASO_ESN_ARM = BIT(3), +}; + +struct mlx5e_aso_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_aso_ctrl_seg aso_ctrl; +}; + +struct mlx5e_aso_wqe_data { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_aso_ctrl_seg aso_ctrl; + struct mlx5_wqe_aso_data_seg aso_data; +}; + +struct mlx5e_aso_wqe_info { + u8 opcode; + bool with_data; +}; + +struct mlx5e_asosq { + /* data path */ + u16 cc; + u16 pc; + + struct mlx5_wqe_ctrl_seg *doorbell_cseg; + struct mlx5e_cq cq; + + /* write@xmit, read@completion */ + struct { + struct mlx5e_aso_wqe_info *aso_wqe; + } db; + + /* read only */ + struct mlx5_wq_cyc wq; + void __iomem *uar_map; + u32 sqn; + unsigned long state; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; +} ____cacheline_aligned_in_smp; + +struct mlx5e_aso { + u32 mkey; + dma_addr_t dma_addr; + void *ctx; + size_t size; + u32 pdn; + int refcnt; + struct mlx5e_cq_param cq_param; + int cpu; + struct mlx5e_priv *priv; + struct mlx5e_asosq sq; + struct mlx5e_sq_param sq_param; +}; + +enum { + LOGICAL_AND, + LOGICAL_OR, +}; + +enum { + ALWAYS_FALSE, + ALWAYS_TRUE, + EQUAL, + NOT_EQUAL, + GREATER_OR_EQUAL, + LESSER_OR_EQUAL, + LESSER, + GREATER, + CYCLIC_GREATER, + CYCLIC_LESSER, +}; + +enum { + ASO_DATA_MASK_MODE_BITWISE_64BIT, + ASO_DATA_MASK_MODE_BYTEWISE_64BYTE, + ASO_DATA_MASK_MODE_CALCULATED_64BYTE, +}; + +struct mlx5e_aso_ctrl_param { + u8 data_mask_mode; + u8 condition_0_operand; + u8 condition_1_operand; + u8 condition_0_offset; + u8 condition_1_offset; + u8 data_offset; + u8 condition_operand; + u32 condition_0_data; + u32 condition_0_mask; + u32 condition_1_data; + u32 condition_1_mask; + u64 bitwise_data; + u64 data_mask; +}; + +enum { + ARM_SOFT = BIT(0), + SET_SOFT = BIT(1), + SET_CNT_BIT31 = BIT(3), + CLEAR_SOFT = BIT(4), + ARM_ESN_EVENT = BIT(5), +}; + +enum { + MLX5_ACCESS_ASO_OPC_MOD_IPSEC, +}; + +enum { + MLX5_ACCESS_ASO_OPC_MOD_FLOW_METER = 0x2, + MLX5_ACCESS_ASO_OPC_MOD_MACSEC = 0x5, +}; + +void mlx5e_build_aso_wqe(struct mlx5e_aso *aso, struct mlx5e_asosq *sq, 
+ u8 ds_cnt, struct mlx5_wqe_ctrl_seg *cseg, + struct mlx5_wqe_aso_ctrl_seg *aso_ctrl, + u32 obj_id, u32 opc_mode, + struct mlx5e_aso_ctrl_param *param); +int mlx5e_poll_aso_cq(struct mlx5e_cq *cq); +void mlx5e_fill_asosq_frag_edge(struct mlx5e_asosq *sq, struct mlx5_wq_cyc *wq, + u16 pi, u16 nnops); +struct mlx5e_aso *mlx5e_aso_setup(struct mlx5e_priv *priv, int size); +void mlx5e_aso_cleanup(struct mlx5e_priv *priv, struct mlx5e_aso *aso); +struct mlx5e_aso *mlx5e_aso_get(struct mlx5e_priv *priv); +void mlx5e_aso_put(struct mlx5e_priv *priv); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c new file mode 100644 index 0000000..e7c14c0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. */ + +#include "channels.h" +#include "en.h" +#include "en/ptp.h" + +unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs) +{ + return chs->num; +} + +void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn) +{ + struct mlx5e_channel *c; + + WARN_ON(ix >= mlx5e_channels_get_num(chs)); + c = chs->c[ix]; + + *rqn = c->rq.rqn; +} + +bool mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn) +{ + struct mlx5e_channel *c; + + WARN_ON(ix >= mlx5e_channels_get_num(chs)); + c = chs->c[ix]; + + if (!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + return false; + + *rqn = c->xskrq.rqn; + return true; +} + +bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn) +{ + struct mlx5e_ptp *c = chs->ptp; + + if (!c || !test_bit(MLX5E_PTP_STATE_RX, c->state)) + return false; + + *rqn = c->rq.rqn; + return true; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h new file mode 100644 index 0000000..ca00cbc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5_EN_CHANNELS_H__ +#define __MLX5_EN_CHANNELS_H__ + +#include + +struct mlx5e_channels; + +unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs); +void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn); +bool mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn); +bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn); + +#endif /* __MLX5_EN_CHANNELS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h new file mode 100644 index 0000000..9976de8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. 
*/ + +#ifndef __MLX5E_DCBNL_H__ +#define __MLX5E_DCBNL_H__ + +#ifdef CONFIG_MLX5_CORE_EN_DCB + +#define MLX5E_MAX_PRIORITY (8) + +struct mlx5e_cee_config { + /* bw pct for priority group */ + u8 pg_bw_pct[CEE_DCBX_MAX_PGS]; + u8 prio_to_pg_map[CEE_DCBX_MAX_PRIO]; + bool pfc_setting[CEE_DCBX_MAX_PRIO]; + bool pfc_enable; +}; + +struct mlx5e_dcbx { + enum mlx5_dcbx_oper_mode mode; + struct mlx5e_cee_config cee_cfg; /* pending configuration */ + u8 dscp_app_cnt; + + /* The only setting that cannot be read from FW */ + u8 tc_tsa[IEEE_8021QAZ_MAX_TCS]; + u8 cap; + + /* Buffer configuration */ + bool manual_buffer; + u32 cable_len; + u32 xoff; + u16 port_buff_cell_sz; +}; + +#define MLX5E_MAX_DSCP (64) + +struct mlx5e_dcbx_dp { + u8 dscp2prio[MLX5E_MAX_DSCP]; + u8 trust_state; +}; + +void mlx5e_dcbnl_build_netdev(struct net_device *netdev); +void mlx5e_dcbnl_build_rep_netdev(struct net_device *netdev); +void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv); +void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv); +void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv); +#else +static inline void mlx5e_dcbnl_build_netdev(struct net_device *netdev) {} +static inline void mlx5e_dcbnl_build_rep_netdev(struct net_device *netdev) {} +static inline void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) {} +static inline void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv) {} +static inline void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv) {} +#endif + +#endif /* __MLX5E_DCBNL_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c new file mode 100644 index 0000000..ae52e7f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#include "en/devlink.h" +#include "eswitch.h" + +static void +mlx5e_devlink_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) +{ + u64 parent_id; + + parent_id = mlx5_query_nic_system_image_guid(dev); + ppid->id_len = sizeof(parent_id); + memcpy(ppid->id, &parent_id, sizeof(parent_id)); +} + +int mlx5e_devlink_port_register(struct mlx5e_priv *priv) +{ + struct devlink *devlink = priv_to_devlink(priv->mdev); + struct devlink_port_attrs attrs = {}; + struct netdev_phys_item_id ppid = {}; + struct devlink_port *dl_port; + unsigned int dl_port_index; + + if (mlx5_core_is_pf(priv->mdev)) { + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + attrs.phys.port_number = mlx5_get_dev_index(priv->mdev); + if (MLX5_ESWITCH_MANAGER(priv->mdev)) { + mlx5e_devlink_get_port_parent_id(priv->mdev, &ppid); + memcpy(attrs.switch_id.id, ppid.id, ppid.id_len); + attrs.switch_id.id_len = ppid.id_len; + } + dl_port_index = mlx5_esw_vport_to_devlink_port_index(priv->mdev, + MLX5_VPORT_UPLINK); + } else { + attrs.flavour = DEVLINK_PORT_FLAVOUR_VIRTUAL; + dl_port_index = mlx5_esw_vport_to_devlink_port_index(priv->mdev, 0); + } + + dl_port = mlx5e_devlink_get_dl_port(priv); + memset(dl_port, 0, sizeof(*dl_port)); + devlink_port_attrs_set(dl_port, &attrs); + + return devlink_port_register(devlink, dl_port, dl_port_index); +} + +void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv) +{ + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); + + devlink_port_type_eth_set(dl_port, priv->netdev); +} + +void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv) +{ + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); + + devlink_port_unregister(dl_port); +} + +struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (!netif_device_present(dev)) + return NULL; + + return mlx5e_devlink_get_dl_port(priv); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h new file mode 100644 index 0000000..10b50fe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5E_EN_DEVLINK_H +#define __MLX5E_EN_DEVLINK_H + +#include +#include "en.h" + +int mlx5e_devlink_port_register(struct mlx5e_priv *priv); +void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv); +void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv); +struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev); + +static inline struct devlink_port * +mlx5e_devlink_get_dl_port(struct mlx5e_priv *priv) +{ + return &priv->mdev->mlx5e_res.dl_port; +} + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h new file mode 100644 index 0000000..6f465c9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018 Mellanox Technologies. 
*/ + +#ifndef __MLX5E_FLOW_STEER_H__ +#define __MLX5E_FLOW_STEER_H__ + +#include "mod_hdr.h" +#include "lib/fs_ttc.h" + +struct mlx5e_post_act; + +enum { + MLX5E_TC_FT_LEVEL = 0, + MLX5E_TC_HP_OOB_CNT_LEVEL, + MLX5E_TC_TTC_FT_LEVEL, + MLX5E_TC_MISS_LEVEL, +}; + +struct mlx5_prio_hp { + u32 rate; + struct kobject kobj; + struct mlx5e_priv *priv; + u32 prio; +}; + +#define HAIRPIN_OOB_NUM_CNT_PER_SET 2 + +struct mlx5e_hp_oob_cnt { + struct mlx5e_priv *priv; + union { + struct { + struct mlx5_fc *curr_cnt; + struct mlx5_fc *standby_cnt; + struct mlx5_fc *curr_peer_cnt; + struct mlx5_fc *standby_peer_cnt; + }; + struct mlx5_fc *cntrs[HAIRPIN_OOB_NUM_CNT_PER_SET * 2]; + }; + struct mlx5_core_dev *peer_dev; + struct mutex cnt_lock; /* Protect read/write of drop_cnt */ + u64 drop_cnt; + struct mlx5_flow_table *ft; + struct mlx5_flow_table *tx_ft; + struct mlx5_modify_hdr *curr_mod_hdr; + struct mlx5_modify_hdr *standby_mod_hdr; + struct mlx5_flow_handle *tx_red_rule; + struct mlx5_flow_handle *tx_blue_rule; + struct mlx5_flow_handle *rx_rule; + struct delayed_work hp_oob_work; + struct mlx5_flow_destination rx_dest; + bool dest_valid; +}; + +#define MLX5E_MAX_HP_PRIO 1000 + +struct mlx5e_tc_table { + /* Protects the dynamic assignment of the t parameter + * which is the nic tc root table. + */ + struct mutex t_lock; + struct mlx5_flow_table *t; + struct mlx5_flow_table *miss_t; + struct mlx5_fs_chains *chains; + struct mlx5e_post_act *post_act; + + struct rhashtable ht; + + struct mod_hdr_tbl mod_hdr; + struct mutex hairpin_tbl_lock; /* protects hairpin_tbl */ + DECLARE_HASHTABLE(hairpin_tbl, 16); + struct kobject *hp_config; + struct mlx5_prio_hp *prio_hp; + struct mlx5e_priv *prio_hp_ppriv; + int num_prio_hp; + atomic_t hp_fwd_ref_cnt; + struct mlx5_flow_table *hp_fwd; + struct mlx5_flow_group *hp_fwd_g; + u32 max_pp_burst_size; + struct mlx5e_hp_oob_cnt *hp_oob; + + struct notifier_block netdevice_nb; + struct netdev_net_notifier netdevice_nn; + + struct mlx5_tc_ct_priv *ct; + struct mapping_ctx *mapping; +}; + +struct mlx5e_flow_table { + int num_groups; + struct mlx5_flow_table *t; + struct mlx5_flow_group **g; +}; + +struct mlx5e_l2_rule { + u8 addr[ETH_ALEN + 2]; + struct mlx5_flow_handle *rule; +}; + +#define MLX5E_L2_ADDR_HASH_SIZE BIT(BITS_PER_BYTE) + +struct mlx5e_promisc_table { + struct mlx5e_flow_table ft; + struct mlx5_flow_handle *rule; +}; + +/* Forward declaration and APIs to get private fields of vlan_table */ +struct mlx5e_vlan_table; +unsigned long *mlx5e_vlan_get_active_svlans(struct mlx5e_vlan_table *vlan); +struct mlx5_flow_table *mlx5e_vlan_get_flowtable(struct mlx5e_vlan_table *vlan); + +struct mlx5e_l2_table { + struct mlx5e_flow_table ft; + struct hlist_head netdev_uc[MLX5E_L2_ADDR_HASH_SIZE]; + struct hlist_head netdev_mc[MLX5E_L2_ADDR_HASH_SIZE]; + struct mlx5e_l2_rule broadcast; + struct mlx5e_l2_rule allmulti; + struct mlx5_flow_handle *trap_rule; + bool broadcast_enabled; + bool allmulti_enabled; + bool promisc_enabled; +}; + +#define MLX5E_NUM_INDIR_TIRS (MLX5_NUM_TT - 1) + +#define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP) +#define MLX5_HASH_IP_L4PORTS (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP |\ + MLX5_HASH_FIELD_SEL_L4_SPORT |\ + MLX5_HASH_FIELD_SEL_L4_DPORT) +#define MLX5_HASH_IP_IPSEC_SPI (MLX5_HASH_FIELD_SEL_SRC_IP |\ + MLX5_HASH_FIELD_SEL_DST_IP |\ + MLX5_HASH_FIELD_SEL_IPSEC_SPI) + +/* NIC prio FTS */ +enum { + MLX5E_PROMISC_FT_LEVEL, + MLX5E_VLAN_FT_LEVEL, + MLX5E_L2_FT_LEVEL, + MLX5E_TTC_FT_LEVEL, + 
MLX5E_INNER_TTC_FT_LEVEL, + MLX5E_FS_TT_UDP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, + MLX5E_FS_TT_ANY_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, +#ifdef CONFIG_MLX5_EN_TLS + MLX5E_ACCEL_FS_TCP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, +#endif +#ifdef CONFIG_MLX5_EN_ARFS + MLX5E_ARFS_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, +#endif +#ifdef CONFIG_MLX5_EN_IPSEC + MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, + MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, +#endif +}; + +struct mlx5e_priv; + +#ifdef CONFIG_MLX5_EN_RXNFC + +struct mlx5e_ethtool_table { + struct mlx5_flow_table *ft; + int num_rules; +}; + +#define ETHTOOL_NUM_L3_L4_FTS 7 +#define ETHTOOL_NUM_L2_FTS 4 + +struct mlx5e_ethtool_steering { + struct mlx5e_ethtool_table l3_l4_ft[ETHTOOL_NUM_L3_L4_FTS]; + struct mlx5e_ethtool_table l2_ft[ETHTOOL_NUM_L2_FTS]; + struct list_head rules; + int tot_num_rules; +}; + +void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv); +void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv); +int mlx5e_ethtool_set_rxnfc(struct mlx5e_priv *priv, struct ethtool_rxnfc *cmd); +int mlx5e_ethtool_get_rxnfc(struct mlx5e_priv *priv, + struct ethtool_rxnfc *info, u32 *rule_locs); +#else +static inline void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv) { } +static inline void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv) { } +static inline int mlx5e_ethtool_set_rxnfc(struct mlx5e_priv *priv, struct ethtool_rxnfc *cmd) +{ return -EOPNOTSUPP; } +static inline int mlx5e_ethtool_get_rxnfc(struct mlx5e_priv *priv, + struct ethtool_rxnfc *info, u32 *rule_locs) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_MLX5_EN_RXNFC */ + +#ifdef CONFIG_MLX5_EN_ARFS +struct mlx5e_arfs_tables; + +int mlx5e_arfs_create_tables(struct mlx5e_priv *priv); +void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv); +int mlx5e_arfs_enable(struct mlx5e_priv *priv); +int mlx5e_arfs_disable(struct mlx5e_priv *priv); +int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id); +#else +static inline int mlx5e_arfs_create_tables(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv) {} +static inline int mlx5e_arfs_enable(struct mlx5e_priv *priv) { return -EOPNOTSUPP; } +static inline int mlx5e_arfs_disable(struct mlx5e_priv *priv) { return -EOPNOTSUPP; } +#endif + +#ifdef CONFIG_MLX5_EN_TLS +struct mlx5e_accel_fs_tcp; +#endif + +struct mlx5e_fs_udp; +struct mlx5e_fs_any; +struct mlx5e_ptp_fs; + +struct mlx5e_flow_steering { + struct mlx5_flow_namespace *ns; + struct mlx5_flow_namespace *egress_ns; +#ifdef CONFIG_MLX5_EN_RXNFC + struct mlx5e_ethtool_steering ethtool; +#endif + struct mlx5e_tc_table tc; + struct mlx5e_promisc_table promisc; + struct mlx5e_vlan_table *vlan; + struct mlx5e_l2_table l2; + struct mlx5_ttc_table *ttc; + struct mlx5_ttc_table *inner_ttc; +#ifdef CONFIG_MLX5_EN_ARFS + struct mlx5e_arfs_tables *arfs; +#endif +#ifdef CONFIG_MLX5_EN_TLS + struct mlx5e_accel_fs_tcp *accel_tcp; +#endif + struct mlx5e_fs_udp *udp; + struct mlx5e_fs_any *any; + struct mlx5e_ptp_fs *ptp_fs; +}; + +void mlx5e_set_ttc_params(struct mlx5e_priv *priv, + struct ttc_params *ttc_params, bool tunnel); + +void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv); +int mlx5e_create_ttc_table(struct mlx5e_priv *priv); + +void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft); + +void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv); +void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv); + +int mlx5e_create_flow_steering(struct 
mlx5e_priv *priv); +void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv); + +int mlx5e_fs_init(struct mlx5e_priv *priv); +void mlx5e_fs_cleanup(struct mlx5e_priv *priv); + +int mlx5e_add_vlan_trap(struct mlx5e_priv *priv, int trap_id, int tir_num); +void mlx5e_remove_vlan_trap(struct mlx5e_priv *priv); +int mlx5e_add_mac_trap(struct mlx5e_priv *priv, int trap_id, int tir_num); +void mlx5e_remove_mac_trap(struct mlx5e_priv *priv); + +#endif /* __MLX5E_FLOW_STEER_H__ */ + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c new file mode 100644 index 0000000..7aa25a5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. */ + +#include +#include "en/fs_tt_redirect.h" +#include "fs_core.h" + +enum fs_udp_type { + FS_IPV4_UDP, + FS_IPV6_UDP, + FS_UDP_NUM_TYPES, +}; + +struct mlx5e_fs_udp { + struct mlx5e_flow_table tables[FS_UDP_NUM_TYPES]; + struct mlx5_flow_handle *default_rules[FS_UDP_NUM_TYPES]; + int ref_cnt; +}; + +struct mlx5e_fs_any { + struct mlx5e_flow_table table; + struct mlx5_flow_handle *default_rule; + int ref_cnt; +}; + +static char *fs_udp_type2str(enum fs_udp_type i) +{ + switch (i) { + case FS_IPV4_UDP: + return "UDP v4"; + default: /* FS_IPV6_UDP */ + return "UDP v6"; + } +} + +static enum mlx5_traffic_types fs_udp2tt(enum fs_udp_type i) +{ + switch (i) { + case FS_IPV4_UDP: + return MLX5_TT_IPV4_UDP; + default: /* FS_IPV6_UDP */ + return MLX5_TT_IPV6_UDP; + } +} + +static enum fs_udp_type tt2fs_udp(enum mlx5_traffic_types i) +{ + switch (i) { + case MLX5_TT_IPV4_UDP: + return FS_IPV4_UDP; + case MLX5_TT_IPV6_UDP: + return FS_IPV6_UDP; + default: + return FS_UDP_NUM_TYPES; + } +} + +void mlx5e_fs_tt_redirect_del_rule(struct mlx5_flow_handle *rule) +{ + mlx5_del_flow_rules(rule); +} + +static void fs_udp_set_dport_flow(struct mlx5_flow_spec *spec, enum fs_udp_type type, + u16 udp_dport) +{ + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_UDP); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, + type == FS_IPV4_UDP ? 
4 : 6); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.udp_dport); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_dport, udp_dport); +} + +struct mlx5_flow_handle * +mlx5e_fs_tt_redirect_udp_add_rule(struct mlx5e_priv *priv, + enum mlx5_traffic_types ttc_type, + u32 tir_num, u16 d_port) +{ + enum fs_udp_type type = tt2fs_udp(ttc_type); + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_table *ft = NULL; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + struct mlx5e_fs_udp *fs_udp; + int err; + + if (type == FS_UDP_NUM_TYPES) + return ERR_PTR(-EINVAL); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + fs_udp = priv->fs.udp; + ft = fs_udp->tables[type].t; + + fs_udp_set_dport_flow(spec, type, d_port); + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = tir_num; + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + kvfree(spec); + + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "%s: add %s rule failed, err %d\n", + __func__, fs_udp_type2str(type), err); + } + return rule; +} + +static int fs_udp_add_default_rule(struct mlx5e_priv *priv, enum fs_udp_type type) +{ + struct mlx5e_flow_table *fs_udp_t; + struct mlx5_flow_destination dest; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5e_fs_udp *fs_udp; + int err; + + fs_udp = priv->fs.udp; + fs_udp_t = &fs_udp->tables[type]; + + dest = mlx5_ttc_get_default_dest(priv->fs.ttc, fs_udp2tt(type)); + rule = mlx5_add_flow_rules(fs_udp_t->t, NULL, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, + "%s: add default rule failed, fs type=%d, err %d\n", + __func__, type, err); + return err; + } + + fs_udp->default_rules[type] = rule; + return 0; +} + +#define MLX5E_FS_UDP_NUM_GROUPS (2) +#define MLX5E_FS_UDP_GROUP1_SIZE (BIT(16)) +#define MLX5E_FS_UDP_GROUP2_SIZE (BIT(0)) +#define MLX5E_FS_UDP_TABLE_SIZE (MLX5E_FS_UDP_GROUP1_SIZE +\ + MLX5E_FS_UDP_GROUP2_SIZE) +static int fs_udp_create_groups(struct mlx5e_flow_table *ft, enum fs_udp_type type) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *outer_headers_c; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_FS_UDP_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in || !ft->g) { + kfree(ft->g); + kvfree(in); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_version); + + switch (type) { + case FS_IPV4_UDP: + case FS_IPV6_UDP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_dport); + break; + default: + err = -EINVAL; + goto out; + } + /* Match on udp protocol, Ipv4/6 and dport */ + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_FS_UDP_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Default Flow Group */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_FS_UDP_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, 
in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; +out: + kvfree(in); + + return err; +} + +static int fs_udp_create_table(struct mlx5e_priv *priv, enum fs_udp_type type) +{ + struct mlx5e_flow_table *ft = &priv->fs.udp->tables[type]; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_FS_UDP_TABLE_SIZE; + ft_attr.level = MLX5E_FS_TT_UDP_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + netdev_dbg(priv->netdev, "Created fs %s table id %u level %u\n", + fs_udp_type2str(type), ft->t->id, ft->t->level); + + err = fs_udp_create_groups(ft, type); + if (err) + goto err; + + err = fs_udp_add_default_rule(priv, type); + if (err) + goto err; + + return 0; + +err: + mlx5e_destroy_flow_table(ft); + return err; +} + +static void fs_udp_destroy_table(struct mlx5e_fs_udp *fs_udp, int i) +{ + if (IS_ERR_OR_NULL(fs_udp->tables[i].t)) + return; + + mlx5_del_flow_rules(fs_udp->default_rules[i]); + mlx5e_destroy_flow_table(&fs_udp->tables[i]); + fs_udp->tables[i].t = NULL; +} + +static int fs_udp_disable(struct mlx5e_priv *priv) +{ + int err, i; + + for (i = 0; i < FS_UDP_NUM_TYPES; i++) { + /* Modify ttc rules destination to point back to the indir TIRs */ + err = mlx5_ttc_fwd_default_dest(priv->fs.ttc, fs_udp2tt(i)); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] default destination failed, err(%d)\n", + __func__, fs_udp2tt(i), err); + return err; + } + } + + return 0; +} + +static int fs_udp_enable(struct mlx5e_priv *priv) +{ + struct mlx5_flow_destination dest = {}; + int err, i; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + for (i = 0; i < FS_UDP_NUM_TYPES; i++) { + dest.ft = priv->fs.udp->tables[i].t; + + /* Modify ttc rules destination to point on the accel_fs FTs */ + err = mlx5_ttc_fwd_dest(priv->fs.ttc, fs_udp2tt(i), &dest); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] destination to accel failed, err(%d)\n", + __func__, fs_udp2tt(i), err); + return err; + } + } + return 0; +} + +void mlx5e_fs_tt_redirect_udp_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_fs_udp *fs_udp = priv->fs.udp; + int i; + + if (!fs_udp) + return; + + if (--fs_udp->ref_cnt) + return; + + fs_udp_disable(priv); + + for (i = 0; i < FS_UDP_NUM_TYPES; i++) + fs_udp_destroy_table(fs_udp, i); + + kfree(fs_udp); + priv->fs.udp = NULL; +} + +int mlx5e_fs_tt_redirect_udp_create(struct mlx5e_priv *priv) +{ + int i, err; + + if (priv->fs.udp) { + priv->fs.udp->ref_cnt++; + return 0; + } + + priv->fs.udp = kzalloc(sizeof(*priv->fs.udp), GFP_KERNEL); + if (!priv->fs.udp) + return -ENOMEM; + + for (i = 0; i < FS_UDP_NUM_TYPES; i++) { + err = fs_udp_create_table(priv, i); + if (err) + goto err_destroy_tables; + } + + err = fs_udp_enable(priv); + if (err) + goto err_destroy_tables; + + priv->fs.udp->ref_cnt = 1; + + return 0; + +err_destroy_tables: + while (--i >= 0) + fs_udp_destroy_table(priv->fs.udp, i); + + kfree(priv->fs.udp); + priv->fs.udp = NULL; + return err; +} + +static void fs_any_set_ethertype_flow(struct mlx5_flow_spec *spec, u16 ether_type) +{ + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, 
ether_type); +} + +struct mlx5_flow_handle * +mlx5e_fs_tt_redirect_any_add_rule(struct mlx5e_priv *priv, + u32 tir_num, u16 ether_type) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_table *ft = NULL; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + struct mlx5e_fs_any *fs_any; + int err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + fs_any = priv->fs.any; + ft = fs_any->table.t; + + fs_any_set_ethertype_flow(spec, ether_type); + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = tir_num; + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + kvfree(spec); + + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "%s: add ANY rule failed, err %d\n", + __func__, err); + } + return rule; +} + +static int fs_any_add_default_rule(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *fs_any_t; + struct mlx5_flow_destination dest; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5e_fs_any *fs_any; + int err; + + fs_any = priv->fs.any; + fs_any_t = &fs_any->table; + + dest = mlx5_ttc_get_default_dest(priv->fs.ttc, MLX5_TT_ANY); + rule = mlx5_add_flow_rules(fs_any_t->t, NULL, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, + "%s: add default rule failed, fs type=ANY, err %d\n", + __func__, err); + return err; + } + + fs_any->default_rule = rule; + return 0; +} + +#define MLX5E_FS_ANY_NUM_GROUPS (2) +#define MLX5E_FS_ANY_GROUP1_SIZE (BIT(16)) +#define MLX5E_FS_ANY_GROUP2_SIZE (BIT(0)) +#define MLX5E_FS_ANY_TABLE_SIZE (MLX5E_FS_ANY_GROUP1_SIZE +\ + MLX5E_FS_ANY_GROUP2_SIZE) + +static int fs_any_create_groups(struct mlx5e_flow_table *ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *outer_headers_c; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_FS_UDP_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in || !ft->g) { + kfree(ft->g); + kvfree(in); + return -ENOMEM; + } + + /* Match on ethertype */ + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ethertype); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_FS_ANY_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Default Flow Group */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_FS_ANY_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + kvfree(in); + + return err; +} + +static int fs_any_create_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fs.any->table; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_FS_UDP_TABLE_SIZE; + ft_attr.level = MLX5E_FS_TT_ANY_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + 
netdev_dbg(priv->netdev, "Created fs ANY table id %u level %u\n", + ft->t->id, ft->t->level); + + err = fs_any_create_groups(ft); + if (err) + goto err; + + err = fs_any_add_default_rule(priv); + if (err) + goto err; + + return 0; + +err: + mlx5e_destroy_flow_table(ft); + return err; +} + +static int fs_any_disable(struct mlx5e_priv *priv) +{ + int err; + + /* Modify ttc rules destination to point back to the indir TIRs */ + err = mlx5_ttc_fwd_default_dest(priv->fs.ttc, MLX5_TT_ANY); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] default destination failed, err(%d)\n", + __func__, MLX5_TT_ANY, err); + return err; + } + return 0; +} + +static int fs_any_enable(struct mlx5e_priv *priv) +{ + struct mlx5_flow_destination dest = {}; + int err; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = priv->fs.any->table.t; + + /* Modify ttc rules destination to point on the accel_fs FTs */ + err = mlx5_ttc_fwd_dest(priv->fs.ttc, MLX5_TT_ANY, &dest); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] destination to accel failed, err(%d)\n", + __func__, MLX5_TT_ANY, err); + return err; + } + return 0; +} + +static void fs_any_destroy_table(struct mlx5e_fs_any *fs_any) +{ + if (IS_ERR_OR_NULL(fs_any->table.t)) + return; + + mlx5_del_flow_rules(fs_any->default_rule); + mlx5e_destroy_flow_table(&fs_any->table); + fs_any->table.t = NULL; +} + +void mlx5e_fs_tt_redirect_any_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_fs_any *fs_any = priv->fs.any; + + if (!fs_any) + return; + + if (--fs_any->ref_cnt) + return; + + fs_any_disable(priv); + + fs_any_destroy_table(fs_any); + + kfree(fs_any); + priv->fs.any = NULL; +} + +int mlx5e_fs_tt_redirect_any_create(struct mlx5e_priv *priv) +{ + int err; + + if (priv->fs.any) { + priv->fs.any->ref_cnt++; + return 0; + } + + priv->fs.any = kzalloc(sizeof(*priv->fs.any), GFP_KERNEL); + if (!priv->fs.any) + return -ENOMEM; + + err = fs_any_create_table(priv); + if (err) + return err; + + err = fs_any_enable(priv); + if (err) + goto err_destroy_table; + + priv->fs.any->ref_cnt = 1; + + return 0; + +err_destroy_table: + fs_any_destroy_table(priv->fs.any); + + kfree(priv->fs.any); + priv->fs.any = NULL; + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.h new file mode 100644 index 0000000..7a70c4f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#ifndef __MLX5E_FS_TT_REDIRECT_H__ +#define __MLX5E_FS_TT_REDIRECT_H__ + +#include "en.h" +#include "en/fs.h" + +void mlx5e_fs_tt_redirect_del_rule(struct mlx5_flow_handle *rule); + +/* UDP traffic type redirect */ +struct mlx5_flow_handle * +mlx5e_fs_tt_redirect_udp_add_rule(struct mlx5e_priv *priv, + enum mlx5_traffic_types ttc_type, + u32 tir_num, u16 d_port); +void mlx5e_fs_tt_redirect_udp_destroy(struct mlx5e_priv *priv); +int mlx5e_fs_tt_redirect_udp_create(struct mlx5e_priv *priv); + +/* ANY traffic type redirect*/ +struct mlx5_flow_handle * +mlx5e_fs_tt_redirect_any_add_rule(struct mlx5e_priv *priv, + u32 tir_num, u16 ether_type); +void mlx5e_fs_tt_redirect_any_destroy(struct mlx5e_priv *priv); +int mlx5e_fs_tt_redirect_any_create(struct mlx5e_priv *priv); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.c new file mode 100644 index 0000000..6f4e6c3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.c @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Mellanox Technologies. + +#include "health.h" +#include "lib/eq.h" +#include "lib/mlx5.h" + +int mlx5e_health_fmsg_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + return 0; +} + +int mlx5e_health_fmsg_named_obj_nest_end(struct devlink_fmsg *fmsg) +{ + int err; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} + +int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg) +{ + u32 out[MLX5_ST_SZ_DW(query_cq_out)] = {}; + u8 hw_status; + void *cqc; + int err; + + err = mlx5_core_query_cq(cq->mdev, &cq->mcq, out); + if (err) + return err; + + cqc = MLX5_ADDR_OF(query_cq_out, out, cq_context); + hw_status = MLX5_GET(cqc, cqc, status); + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ"); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "cqn", cq->mcq.cqn); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "HW status", hw_status); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "ci", mlx5_cqwq_get_ci(&cq->wq)); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "size", mlx5_cqwq_get_size(&cq->wq)); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + return 0; +} + +int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg) +{ + u8 cq_log_stride; + u32 cq_sz; + int err; + + cq_sz = mlx5_cqwq_get_size(&cq->wq); + cq_log_stride = mlx5_cqwq_get_log_stride_size(&cq->wq); + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ"); + if (err) + return err; + + err = devlink_fmsg_u64_pair_put(fmsg, "stride size", BIT(cq_log_stride)); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "size", cq_sz); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + return 0; +} + +int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg) +{ + int err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "EQ"); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "eqn", eq->core.eqn); + if (err) + return 
err; + + err = devlink_fmsg_u32_pair_put(fmsg, "irqn", eq->core.irqn); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "vecidx", eq->core.vecidx); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "ci", eq->core.cons_index); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "size", eq_get_size(&eq->core)); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +void mlx5e_health_create_reporters(struct mlx5e_priv *priv) +{ + mlx5e_reporter_tx_create(priv); + mlx5e_reporter_rx_create(priv); +} + +void mlx5e_health_destroy_reporters(struct mlx5e_priv *priv) +{ + mlx5e_reporter_rx_destroy(priv); + mlx5e_reporter_tx_destroy(priv); +} + +void mlx5e_health_channels_update(struct mlx5e_priv *priv) +{ + if (priv->tx_reporter) + devlink_health_reporter_state_update(priv->tx_reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); + if (priv->rx_reporter) + devlink_health_reporter_state_update(priv->rx_reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); +} + +int mlx5e_health_sq_to_ready(struct mlx5_core_dev *mdev, struct net_device *dev, u32 sqn) +{ + struct mlx5e_modify_sq_param msp = {}; + int err; + + msp.curr_state = MLX5_SQC_STATE_ERR; + msp.next_state = MLX5_SQC_STATE_RST; + + err = mlx5e_modify_sq(mdev, sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to reset\n", sqn); + return err; + } + + memset(&msp, 0, sizeof(msp)); + msp.curr_state = MLX5_SQC_STATE_RST; + msp.next_state = MLX5_SQC_STATE_RDY; + + err = mlx5e_modify_sq(mdev, sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to ready\n", sqn); + return err; + } + + return 0; +} + +int mlx5e_health_recover_channels(struct mlx5e_priv *priv) +{ + int err = 0; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto out; + + err = mlx5e_safe_reopen_channels(priv); + +out: + mutex_unlock(&priv->state_lock); + rtnl_unlock(); + + return err; +} + +int mlx5e_health_channel_eq_recover(struct net_device *dev, struct mlx5_eq_comp *eq, + struct mlx5e_ch_stats *stats) +{ + u32 eqe_count; + + netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", + eq->core.eqn, eq->core.cons_index, eq->core.irqn); + + eqe_count = mlx5_eq_poll_irq_disabled(eq); + if (!eqe_count) + return -EIO; + + netdev_err(dev, "Recovered %d eqes on EQ 0x%x\n", + eqe_count, eq->core.eqn); + + stats->eq_rearm++; + return 0; +} + +int mlx5e_health_report(struct mlx5e_priv *priv, + struct devlink_health_reporter *reporter, char *err_str, + struct mlx5e_err_ctx *err_ctx) +{ + netdev_err(priv->netdev, "%s\n", err_str); + + if (!reporter) + return err_ctx->recover(err_ctx->ctx); + + return devlink_health_report(reporter, err_str, err_ctx); +} + +#define MLX5_HEALTH_DEVLINK_MAX_SIZE 1024 +static int mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg *fmsg, + const void *value, u32 value_len) + +{ + u32 data_size; + int err = 0; + u32 offset; + + for (offset = 0; offset < value_len; offset += data_size) { + data_size = value_len - offset; + if (data_size > MLX5_HEALTH_DEVLINK_MAX_SIZE) + data_size = MLX5_HEALTH_DEVLINK_MAX_SIZE; + err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); + if (err) + break; + } + return err; +} + +int mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv *priv, struct mlx5_rsc_key *key, + struct devlink_fmsg *fmsg) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_rsc_dump_cmd *cmd; + struct page *page; + int cmd_err, err; + int end_err; + int size; + + if (IS_ERR_OR_NULL(mdev->rsc_dump)) + 
return -EOPNOTSUPP; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + err = devlink_fmsg_binary_pair_nest_start(fmsg, "data"); + if (err) + goto free_page; + + cmd = mlx5_rsc_dump_cmd_create(mdev, key); + if (IS_ERR(cmd)) { + err = PTR_ERR(cmd); + goto free_page; + } + + do { + cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size); + if (cmd_err < 0) { + err = cmd_err; + goto destroy_cmd; + } + + err = mlx5e_health_rsc_fmsg_binary(fmsg, page_address(page), size); + if (err) + goto destroy_cmd; + + } while (cmd_err > 0); + +destroy_cmd: + mlx5_rsc_dump_cmd_destroy(cmd); + end_err = devlink_fmsg_binary_pair_nest_end(fmsg); + if (end_err) + err = end_err; +free_page: + __free_page(page); + return err; +} + +int mlx5e_health_queue_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, + int queue_idx, char *lbl) +{ + struct mlx5_rsc_key key = {}; + int err; + + key.rsc = MLX5_SGMT_TYPE_FULL_QPC; + key.index1 = queue_idx; + key.size = PAGE_SIZE; + key.num_of_obj1 = 1; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, lbl); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "index", queue_idx); + if (err) + return err; + + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + return devlink_fmsg_obj_nest_end(fmsg); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.h new file mode 100644 index 0000000..0107e4e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/health.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __MLX5E_EN_HEALTH_H +#define __MLX5E_EN_HEALTH_H + +#include "en.h" +#include "diag/rsc_dump.h" + +static inline bool cqe_syndrome_needs_recover(u8 syndrome) +{ + return syndrome == MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR || + syndrome == MLX5_CQE_SYNDROME_LOCAL_PROT_ERR || + syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR; +} + +void mlx5e_reporter_tx_create(struct mlx5e_priv *priv); +void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv); +void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq); +int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq); + +int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg); +int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg); +int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg); +int mlx5e_health_fmsg_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name); +int mlx5e_health_fmsg_named_obj_nest_end(struct devlink_fmsg *fmsg); + +void mlx5e_reporter_rx_create(struct mlx5e_priv *priv); +void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv); +void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq); +void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq); +void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq); +void mlx5e_reporter_icosq_suspend_recovery(struct mlx5e_channel *c); +void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c); + +#define MLX5E_REPORTER_PER_Q_MAX_LEN 256 + +struct mlx5e_err_ctx { + int (*recover)(void *ctx); + int (*dump)(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, void *ctx); + void *ctx; +}; + +int mlx5e_health_sq_to_ready(struct mlx5_core_dev *mdev, struct net_device *dev, u32 sqn); +int mlx5e_health_channel_eq_recover(struct net_device *dev, struct mlx5_eq_comp *eq, + struct mlx5e_ch_stats *stats); +int mlx5e_health_recover_channels(struct mlx5e_priv *priv); +int mlx5e_health_report(struct mlx5e_priv *priv, + struct devlink_health_reporter *reporter, char *err_str, + struct mlx5e_err_ctx *err_ctx); +void mlx5e_health_create_reporters(struct mlx5e_priv *priv); +void mlx5e_health_destroy_reporters(struct mlx5e_priv *priv); +void mlx5e_health_channels_update(struct mlx5e_priv *priv); +int mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv *priv, struct mlx5_rsc_key *key, + struct devlink_fmsg *fmsg); +int mlx5e_health_queue_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, + int queue_idx, char *lbl); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c new file mode 100644 index 0000000..b4f3bd7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2018 Mellanox Technologies + +#include "en.h" +#include "en/hv_vhca_stats.h" +#include "lib/hv_vhca.h" +#include "lib/hv.h" + +struct mlx5e_hv_vhca_per_ring_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; +}; + +static void +mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch, + struct mlx5e_hv_vhca_per_ring_stats *data) +{ + struct mlx5e_channel_stats *stats; + int tc; + + stats = priv->channel_stats[ch]; + data->rx_packets = stats->rq.packets; + data->rx_bytes = stats->rq.bytes; + + for (tc = 0; tc < priv->max_opened_tc; tc++) { + data->tx_packets += stats->sq[tc].packets; + data->tx_bytes += stats->sq[tc].bytes; + } +} + +static void 
mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, void *data, + int buf_len) +{ + int ch, i = 0; + + for (ch = 0; ch < priv->stats_nch; ch++) { + void *buf = data + i; + + if (WARN_ON_ONCE(buf + + sizeof(struct mlx5e_hv_vhca_per_ring_stats) > + data + buf_len)) + return; + + mlx5e_hv_vhca_fill_ring_stats(priv, ch, buf); + i += sizeof(struct mlx5e_hv_vhca_per_ring_stats); + } +} + +static int mlx5e_hv_vhca_stats_buf_size(struct mlx5e_priv *priv) +{ + return (sizeof(struct mlx5e_hv_vhca_per_ring_stats) * + priv->stats_nch); +} + +static void mlx5e_hv_vhca_stats_work(struct work_struct *work) +{ + struct mlx5e_hv_vhca_stats_agent *sagent; + struct mlx5_hv_vhca_agent *agent; + struct delayed_work *dwork; + struct mlx5e_priv *priv; + int buf_len, rc; + void *buf; + + dwork = to_delayed_work(work); + sagent = container_of(dwork, struct mlx5e_hv_vhca_stats_agent, work); + priv = container_of(sagent, struct mlx5e_priv, stats_agent); + buf_len = mlx5e_hv_vhca_stats_buf_size(priv); + agent = sagent->agent; + buf = sagent->buf; + + memset(buf, 0, buf_len); + mlx5e_hv_vhca_fill_stats(priv, buf, buf_len); + + rc = mlx5_hv_vhca_agent_write(agent, buf, buf_len); + if (rc) { + mlx5_core_err(priv->mdev, + "%s: Failed to write stats, err = %d\n", + __func__, rc); + return; + } + + if (sagent->delay) + queue_delayed_work(priv->wq, &sagent->work, sagent->delay); +} + +enum { + MLX5_HV_VHCA_STATS_VERSION = 1, + MLX5_HV_VHCA_STATS_UPDATE_ONCE = 0xFFFF, +}; + +static void mlx5e_hv_vhca_stats_control(struct mlx5_hv_vhca_agent *agent, + struct mlx5_hv_vhca_control_block *block) +{ + struct mlx5e_hv_vhca_stats_agent *sagent; + struct mlx5e_priv *priv; + + priv = mlx5_hv_vhca_agent_priv(agent); + sagent = &priv->stats_agent; + + block->version = MLX5_HV_VHCA_STATS_VERSION; + block->rings = priv->stats_nch; + + if (!block->command) { + cancel_delayed_work_sync(&priv->stats_agent.work); + return; + } + + sagent->delay = block->command == MLX5_HV_VHCA_STATS_UPDATE_ONCE ? 
0 : + msecs_to_jiffies(block->command * 100); + + queue_delayed_work(priv->wq, &sagent->work, sagent->delay); +} + +static void mlx5e_hv_vhca_stats_cleanup(struct mlx5_hv_vhca_agent *agent) +{ + struct mlx5e_priv *priv = mlx5_hv_vhca_agent_priv(agent); + + cancel_delayed_work_sync(&priv->stats_agent.work); +} + +void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv) +{ + int buf_len = mlx5e_hv_vhca_stats_buf_size(priv); + struct mlx5_hv_vhca_agent *agent; + + priv->stats_agent.buf = kvzalloc(buf_len, GFP_KERNEL); + if (!priv->stats_agent.buf) + return; + + agent = mlx5_hv_vhca_agent_create(priv->mdev->hv_vhca, + MLX5_HV_VHCA_AGENT_STATS, + mlx5e_hv_vhca_stats_control, NULL, + mlx5e_hv_vhca_stats_cleanup, + priv); + + if (IS_ERR_OR_NULL(agent)) { + if (IS_ERR(agent)) + netdev_warn(priv->netdev, + "Failed to create hv vhca stats agent, err = %ld\n", + PTR_ERR(agent)); + + kvfree(priv->stats_agent.buf); + return; + } + + priv->stats_agent.agent = agent; + INIT_DELAYED_WORK(&priv->stats_agent.work, mlx5e_hv_vhca_stats_work); +} + +void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv) +{ + if (IS_ERR_OR_NULL(priv->stats_agent.agent)) + return; + + mlx5_hv_vhca_agent_destroy(priv->stats_agent.agent); + kvfree(priv->stats_agent.buf); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h new file mode 100644 index 0000000..29c8c6d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_EN_STATS_VHCA_H__ +#define __MLX5_EN_STATS_VHCA_H__ +#include "en.h" + +#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) + +void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv); +void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv); + +#else +static inline void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv) {} +static inline void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv) {} +#endif + +#endif /* __MLX5_EN_STATS_VHCA_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.c new file mode 100644 index 0000000..2bb6a8c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// // Copyright (c) 2021 Mellanox Technologies. 
+ +#include "aso.h" +#include "ipsec_aso.h" + +static int mlx5e_aso_send_ipsec_aso(struct mlx5e_priv *priv, u32 ipsec_obj_id, + struct mlx5e_aso_ctrl_param *param, + u32 *hard_cnt, u32 *soft_cnt, + u8 *event_arm, u32 *mode_param) +{ + struct mlx5e_aso *aso = priv->ipsec->aso; + struct mlx5e_asosq *sq = &aso->sq; + struct mlx5_wq_cyc *wq = &sq->wq; + struct mlx5e_aso_wqe *aso_wqe; + u16 pi, contig_wqebbs_room; + int err = 0; + + memset(aso->ctx, 0, aso->size); + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + + if (unlikely(contig_wqebbs_room < MLX5E_ASO_WQEBBS)) { + mlx5e_fill_asosq_frag_edge(sq, wq, pi, contig_wqebbs_room); + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + aso_wqe = mlx5_wq_cyc_get_wqe(wq, pi); + + /* read enable always set */ + mlx5e_build_aso_wqe(aso, sq, + DIV_ROUND_UP(sizeof(*aso_wqe), MLX5_SEND_WQE_DS), + &aso_wqe->ctrl, &aso_wqe->aso_ctrl, ipsec_obj_id, + MLX5_ACCESS_ASO_OPC_MOD_IPSEC, param); + + sq->db.aso_wqe[pi].opcode = MLX5_OPCODE_ACCESS_ASO; + sq->db.aso_wqe[pi].with_data = false; + sq->pc += MLX5E_ASO_WQEBBS; + sq->doorbell_cseg = &aso_wqe->ctrl; + + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg); + + /* Ensure doorbell is written on uar_page before poll_cq */ + WRITE_ONCE(sq->doorbell_cseg, NULL); + + err = mlx5e_poll_aso_cq(&sq->cq); + if (err) + return err; + + if (hard_cnt) + *hard_cnt = MLX5_GET(ipsec_aso, aso->ctx, remove_flow_pkt_cnt); + if (soft_cnt) + *soft_cnt = MLX5_GET(ipsec_aso, aso->ctx, remove_flow_soft_lft); + + if (event_arm) { + *event_arm = 0; + if (MLX5_GET(ipsec_aso, aso->ctx, esn_event_arm)) + *event_arm |= MLX5_ASO_ESN_ARM; + if (MLX5_GET(ipsec_aso, aso->ctx, soft_lft_arm)) + *event_arm |= MLX5_ASO_SOFT_ARM; + if (MLX5_GET(ipsec_aso, aso->ctx, hard_lft_arm)) + *event_arm |= MLX5_ASO_HARD_ARM; + if (MLX5_GET(ipsec_aso, aso->ctx, remove_flow_enable)) + *event_arm |= MLX5_ASO_REMOVE_FLOW_ENABLE; + } + + if (mode_param) + *mode_param = MLX5_GET(ipsec_aso, aso->ctx, mode_parameter); + + return err; +} + +#define UPPER32_MASK GENMASK_ULL(63, 32) + +int mlx5e_ipsec_aso_query(struct mlx5e_priv *priv, u32 obj_id, + u32 *hard_cnt, u32 *soft_cnt, + u8 *event_arm, u32 *mode_param) +{ + return mlx5e_aso_send_ipsec_aso(priv, obj_id, NULL, hard_cnt, soft_cnt, + event_arm, mode_param); +} + +int mlx5e_ipsec_aso_set(struct mlx5e_priv *priv, u32 obj_id, u8 flags, + u32 comparator, u32 *hard_cnt, u32 *soft_cnt, + u8 *event_arm, u32 *mode_param) +{ + struct mlx5e_aso_ctrl_param param = {}; + int err = 0; + + if (!flags) + return -EINVAL; + + param.data_mask_mode = ASO_DATA_MASK_MODE_BITWISE_64BIT; + param.condition_0_operand = ALWAYS_TRUE; + param.condition_1_operand = ALWAYS_TRUE; + + if (flags & ARM_ESN_EVENT) { + param.data_offset = MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET; + param.bitwise_data = BIT(22) << 32; + param.data_mask = param.bitwise_data; + return mlx5e_aso_send_ipsec_aso(priv, obj_id, &param, NULL, NULL, NULL, NULL); + } + + if (flags & SET_SOFT) { + param.data_offset = MLX5_IPSEC_ASO_REMOVE_FLOW_SOFT_LFT_OFFSET; + param.bitwise_data = (u64)(comparator) << 32; + param.data_mask = UPPER32_MASK; + err = mlx5e_aso_send_ipsec_aso(priv, obj_id, &param, hard_cnt, soft_cnt, + NULL, NULL); + if (flags == SET_SOFT) + return err; + } + + /* For ASO_WQE big Endian format, + * ARM_SOFT is BIT(25 + 32) + * SET COUNTER BIT 31 is BIT(31) + */ + param.data_offset = MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET; + + if (flags & SET_CNT_BIT31) + param.bitwise_data = IPSEC_SW_LIMIT; + if (flags &
ARM_SOFT) + param.bitwise_data |= BIT(25 + 32); + if (flags & CLEAR_SOFT) + param.bitwise_data &= ~(BIT(25 + 32)); + + param.data_mask = param.bitwise_data; + return mlx5e_aso_send_ipsec_aso(priv, obj_id, &param, hard_cnt, soft_cnt, NULL, NULL); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.h new file mode 100644 index 0000000..5f6737f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ipsec_aso.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. */ + +#include "en.h" +#include "en_accel/ipsec.h" +#include "aso.h" + +#ifndef __MLX5_EN_IPSEC_ASO_H__ +#define __MLX5_EN_IPSEC_ASO_H__ + +enum { + MLX5_IPSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET, + MLX5_IPSEC_ASO_REMOVE_FLOW_SOFT_LFT_OFFSET, +}; + +int mlx5e_ipsec_aso_query(struct mlx5e_priv *priv, u32 obj_id, + u32 *hard_cnt, u32 *soft_cnt, + u8 *event_arm, u32 *mode_param); +int mlx5e_ipsec_aso_set(struct mlx5e_priv *priv, u32 obj_id, u8 flags, + u32 comparator, u32 *hard_cnt, u32 *soft_cnt, + u8 *event_arm, u32 *mode_param); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c new file mode 100644 index 0000000..4e72ca8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies */ + +#include +#include +#include +#include +#include + +#include "mapping.h" + +#define MAPPING_GRACE_PERIOD 2000 + +static LIST_HEAD(shared_ctx_list); +static DEFINE_MUTEX(shared_ctx_lock); + +struct mapping_ctx { + struct xarray xarray; + DECLARE_HASHTABLE(ht, 8); + struct mutex lock; /* Guards hashtable and xarray */ + unsigned long max_id; + size_t data_size; + bool delayed_removal; + struct delayed_work dwork; + struct list_head pending_list; + spinlock_t pending_list_lock; /* Guards pending list */ + u64 id; + u8 type; + struct list_head list; + refcount_t refcount; +}; + +struct mapping_item { + struct rcu_head rcu; + struct list_head list; + unsigned long timeout; + struct hlist_node node; + int cnt; + u32 id; + char data[]; +}; + +int mapping_add(struct mapping_ctx *ctx, void *data, u32 *id) +{ + struct mapping_item *mi; + int err = -ENOMEM; + u32 hash_key; + + mutex_lock(&ctx->lock); + + hash_key = jhash(data, ctx->data_size, 0); + hash_for_each_possible(ctx->ht, mi, node, hash_key) { + if (!memcmp(data, mi->data, ctx->data_size)) + goto attach; + } + + mi = kzalloc(sizeof(*mi) + ctx->data_size, GFP_KERNEL); + if (!mi) + goto err_alloc; + + memcpy(mi->data, data, ctx->data_size); + hash_add(ctx->ht, &mi->node, hash_key); + + err = xa_alloc(&ctx->xarray, &mi->id, mi, XA_LIMIT(1, ctx->max_id), + GFP_KERNEL); + if (err) + goto err_assign; +attach: + ++mi->cnt; + *id = mi->id; + + mutex_unlock(&ctx->lock); + + return 0; + +err_assign: + hash_del(&mi->node); + kfree(mi); +err_alloc: + mutex_unlock(&ctx->lock); + + return err; +} + +static void mapping_remove_and_free(struct mapping_ctx *ctx, + struct mapping_item *mi) +{ + xa_erase(&ctx->xarray, mi->id); + kfree_rcu(mi, rcu); +} + +static void mapping_free_item(struct mapping_ctx *ctx, + struct mapping_item *mi) +{ + if (!ctx->delayed_removal) { + mapping_remove_and_free(ctx, mi); + return; + } + + mi->timeout = jiffies
+ msecs_to_jiffies(MAPPING_GRACE_PERIOD); + + spin_lock(&ctx->pending_list_lock); + list_add_tail(&mi->list, &ctx->pending_list); + spin_unlock(&ctx->pending_list_lock); + + schedule_delayed_work(&ctx->dwork, MAPPING_GRACE_PERIOD); +} + +int mapping_remove(struct mapping_ctx *ctx, u32 id) +{ + unsigned long index = id; + struct mapping_item *mi; + int err = -ENOENT; + + mutex_lock(&ctx->lock); + mi = xa_load(&ctx->xarray, index); + if (!mi) + goto out; + err = 0; + + if (--mi->cnt > 0) + goto out; + + hash_del(&mi->node); + mapping_free_item(ctx, mi); +out: + mutex_unlock(&ctx->lock); + + return err; +} + +int mapping_find(struct mapping_ctx *ctx, u32 id, void *data) +{ + unsigned long index = id; + struct mapping_item *mi; + int err = -ENOENT; + + rcu_read_lock(); + mi = xa_load(&ctx->xarray, index); + if (!mi) + goto err_find; + + memcpy(data, mi->data, ctx->data_size); + err = 0; + +err_find: + rcu_read_unlock(); + return err; +} + +static void +mapping_remove_and_free_list(struct mapping_ctx *ctx, struct list_head *list) +{ + struct mapping_item *mi; + + list_for_each_entry(mi, list, list) + mapping_remove_and_free(ctx, mi); +} + +static void mapping_work_handler(struct work_struct *work) +{ + unsigned long min_timeout = 0, now = jiffies; + struct mapping_item *mi, *next; + LIST_HEAD(pending_items); + struct mapping_ctx *ctx; + + ctx = container_of(work, struct mapping_ctx, dwork.work); + + spin_lock(&ctx->pending_list_lock); + list_for_each_entry_safe(mi, next, &ctx->pending_list, list) { + if (time_after(now, mi->timeout)) + list_move(&mi->list, &pending_items); + else if (!min_timeout || + time_before(mi->timeout, min_timeout)) + min_timeout = mi->timeout; + } + spin_unlock(&ctx->pending_list_lock); + + mapping_remove_and_free_list(ctx, &pending_items); + + if (min_timeout) + schedule_delayed_work(&ctx->dwork, abs(min_timeout - now)); +} + +static void mapping_flush_work(struct mapping_ctx *ctx) +{ + if (!ctx->delayed_removal) + return; + + cancel_delayed_work_sync(&ctx->dwork); + mapping_remove_and_free_list(ctx, &ctx->pending_list); +} + +struct mapping_ctx * +mapping_create(size_t data_size, u32 max_id, bool delayed_removal) +{ + struct mapping_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + ctx->max_id = max_id ? 
max_id : UINT_MAX; + ctx->data_size = data_size; + + if (delayed_removal) { + INIT_DELAYED_WORK(&ctx->dwork, mapping_work_handler); + INIT_LIST_HEAD(&ctx->pending_list); + spin_lock_init(&ctx->pending_list_lock); + ctx->delayed_removal = true; + } + + mutex_init(&ctx->lock); + xa_init_flags(&ctx->xarray, XA_FLAGS_ALLOC1); + + refcount_set(&ctx->refcount, 1); + INIT_LIST_HEAD(&ctx->list); + + return ctx; +} + +struct mapping_ctx * +mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal) +{ + struct mapping_ctx *ctx; + + mutex_lock(&shared_ctx_lock); + list_for_each_entry(ctx, &shared_ctx_list, list) { + if (ctx->id == id && ctx->type == type) { + if (refcount_inc_not_zero(&ctx->refcount)) + goto unlock; + break; + } + } + + ctx = mapping_create(data_size, max_id, delayed_removal); + if (IS_ERR(ctx)) + goto unlock; + + ctx->id = id; + ctx->type = type; + list_add(&ctx->list, &shared_ctx_list); + +unlock: + mutex_unlock(&shared_ctx_lock); + return ctx; +} + +void mapping_destroy(struct mapping_ctx *ctx) +{ + if (!refcount_dec_and_test(&ctx->refcount)) + return; + + mutex_lock(&shared_ctx_lock); + list_del(&ctx->list); + mutex_unlock(&shared_ctx_lock); + + mapping_flush_work(ctx); + xa_destroy(&ctx->xarray); + mutex_destroy(&ctx->lock); + + kfree(ctx); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h new file mode 100644 index 0000000..4e2119f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mapping.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies */ + +#ifndef __MLX5_MAPPING_H__ +#define __MLX5_MAPPING_H__ + +struct mapping_ctx; + +int mapping_add(struct mapping_ctx *ctx, void *data, u32 *id); +int mapping_remove(struct mapping_ctx *ctx, u32 id); +int mapping_find(struct mapping_ctx *ctx, u32 id, void *data); + +/* mapping uses an xarray to map data to ids in add(), and for find(). + * For locking, it uses a internal xarray spin lock for add()/remove(), + * find() uses rcu_read_lock(). + * Choosing delayed_removal postpones the removal of a previously mapped + * id by MAPPING_GRACE_PERIOD milliseconds. + * This is to avoid races against hardware, where we mark the packet in + * hardware with a previous id, and quick remove() and add() reusing the same + * previous id. Then find() will get the new mapping instead of the old + * which was used to mark the packet. 
+ */ +struct mapping_ctx *mapping_create(size_t data_size, u32 max_id, + bool delayed_removal); +void mapping_destroy(struct mapping_ctx *ctx); + +/* adds mapping with an id or get an existing mapping with the same id + */ +struct mapping_ctx * +mapping_create_for_id(u64 id, u8 type, size_t data_size, u32 max_id, bool delayed_removal); + +#endif /* __MLX5_MAPPING_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.c new file mode 100644 index 0000000..17325c5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2020 Mellanox Technologies + +#include +#include "mod_hdr.h" + +#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) + +struct mod_hdr_key { + int num_actions; + void *actions; +}; + +struct mlx5e_mod_hdr_handle { + /* a node of a hash table which keeps all the mod_hdr entries */ + struct hlist_node mod_hdr_hlist; + + struct mod_hdr_key key; + + struct mlx5_modify_hdr *modify_hdr; + + refcount_t refcnt; + struct completion res_ready; + int compl_result; +}; + +static u32 hash_mod_hdr_info(struct mod_hdr_key *key) +{ + return jhash(key->actions, + key->num_actions * MLX5_MH_ACT_SZ, 0); +} + +static int cmp_mod_hdr_info(struct mod_hdr_key *a, struct mod_hdr_key *b) +{ + if (a->num_actions != b->num_actions) + return 1; + + return memcmp(a->actions, b->actions, + a->num_actions * MLX5_MH_ACT_SZ); +} + +void mlx5e_mod_hdr_tbl_init(struct mod_hdr_tbl *tbl) +{ + mutex_init(&tbl->lock); + hash_init(tbl->hlist); +} + +void mlx5e_mod_hdr_tbl_destroy(struct mod_hdr_tbl *tbl) +{ + mutex_destroy(&tbl->lock); +} + +static struct mlx5e_mod_hdr_handle *mod_hdr_get(struct mod_hdr_tbl *tbl, + struct mod_hdr_key *key, + u32 hash_key) +{ + struct mlx5e_mod_hdr_handle *mh, *found = NULL; + + hash_for_each_possible(tbl->hlist, mh, mod_hdr_hlist, hash_key) { + if (!cmp_mod_hdr_info(&mh->key, key)) { + refcount_inc(&mh->refcnt); + found = mh; + break; + } + } + + return found; +} + +struct mlx5e_mod_hdr_handle * +mlx5e_mod_hdr_attach(struct mlx5_core_dev *mdev, + struct mod_hdr_tbl *tbl, + enum mlx5_flow_namespace_type namespace, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts) +{ + int num_actions, actions_size, err; + struct mlx5e_mod_hdr_handle *mh; + struct mod_hdr_key key; + u32 hash_key; + + num_actions = mod_hdr_acts->num_actions; + actions_size = MLX5_MH_ACT_SZ * num_actions; + + key.actions = mod_hdr_acts->actions; + key.num_actions = num_actions; + + hash_key = hash_mod_hdr_info(&key); + + mutex_lock(&tbl->lock); + mh = mod_hdr_get(tbl, &key, hash_key); + if (mh) { + mutex_unlock(&tbl->lock); + wait_for_completion(&mh->res_ready); + + if (mh->compl_result < 0) { + err = -EREMOTEIO; + goto attach_header_err; + } + goto attach_header; + } + + mh = kzalloc(sizeof(*mh) + actions_size, GFP_KERNEL); + if (!mh) { + mutex_unlock(&tbl->lock); + return ERR_PTR(-ENOMEM); + } + + mh->key.actions = (void *)mh + sizeof(*mh); + memcpy(mh->key.actions, key.actions, actions_size); + mh->key.num_actions = num_actions; + refcount_set(&mh->refcnt, 1); + init_completion(&mh->res_ready); + + hash_add(tbl->hlist, &mh->mod_hdr_hlist, hash_key); + mutex_unlock(&tbl->lock); + + mh->modify_hdr = mlx5_modify_header_alloc(mdev, namespace, + mh->key.num_actions, + mh->key.actions); + if (IS_ERR(mh->modify_hdr)) { + err = PTR_ERR(mh->modify_hdr); + 
mh->compl_result = err; + goto alloc_header_err; + } + mh->compl_result = 1; + complete_all(&mh->res_ready); + +attach_header: + return mh; + +alloc_header_err: + complete_all(&mh->res_ready); +attach_header_err: + mlx5e_mod_hdr_detach(mdev, tbl, mh); + return ERR_PTR(err); +} + +void mlx5e_mod_hdr_detach(struct mlx5_core_dev *mdev, + struct mod_hdr_tbl *tbl, + struct mlx5e_mod_hdr_handle *mh) +{ + if (!refcount_dec_and_mutex_lock(&mh->refcnt, &tbl->lock)) + return; + hash_del(&mh->mod_hdr_hlist); + mutex_unlock(&tbl->lock); + + if (mh->compl_result > 0) + mlx5_modify_header_dealloc(mdev, mh->modify_hdr); + + kfree(mh); +} + +struct mlx5_modify_hdr *mlx5e_mod_hdr_get(struct mlx5e_mod_hdr_handle *mh) +{ + return mh->modify_hdr; +} + +char * +mlx5e_mod_hdr_alloc(struct mlx5_core_dev *mdev, int namespace, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts) +{ + int new_num_actions, max_hw_actions; + size_t new_sz, old_sz; + void *ret; + + if (mod_hdr_acts->num_actions < mod_hdr_acts->max_actions) + goto out; + + max_hw_actions = mlx5e_mod_hdr_max_actions(mdev, namespace); + new_num_actions = min(max_hw_actions, + mod_hdr_acts->actions ? + mod_hdr_acts->max_actions * 2 : 1); + if (mod_hdr_acts->max_actions == new_num_actions) + return ERR_PTR(-ENOSPC); + + new_sz = MLX5_MH_ACT_SZ * new_num_actions; + old_sz = mod_hdr_acts->max_actions * MLX5_MH_ACT_SZ; + + if (mod_hdr_acts->is_static) { + ret = kzalloc(new_sz, GFP_KERNEL); + if (ret) { + memcpy(ret, mod_hdr_acts->actions, old_sz); + mod_hdr_acts->is_static = false; + } + } else { + ret = krealloc(mod_hdr_acts->actions, new_sz, GFP_KERNEL); + if (ret) + memset(ret + old_sz, 0, new_sz - old_sz); + } + if (!ret) + return ERR_PTR(-ENOMEM); + + mod_hdr_acts->actions = ret; + mod_hdr_acts->max_actions = new_num_actions; + +out: + return mod_hdr_acts->actions + (mod_hdr_acts->num_actions * MLX5_MH_ACT_SZ); +} + +void +mlx5e_mod_hdr_dealloc(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts) +{ + if (!mod_hdr_acts->is_static) + kfree(mod_hdr_acts->actions); + + mod_hdr_acts->actions = NULL; + mod_hdr_acts->num_actions = 0; + mod_hdr_acts->max_actions = 0; +} + +char * +mlx5e_mod_hdr_get_item(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, int pos) +{ + return mod_hdr_acts->actions + (pos * MLX5_MH_ACT_SZ); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.h new file mode 100644 index 0000000..b8dac41 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/mod_hdr.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies */ + +#ifndef __MLX5E_EN_MOD_HDR_H__ +#define __MLX5E_EN_MOD_HDR_H__ + +#include +#include + +#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) + +struct mlx5e_mod_hdr_handle; + +struct mlx5e_tc_mod_hdr_acts { + int num_actions; + int max_actions; + bool is_static; + void *actions; +}; + +#define DECLARE_MOD_HDR_ACTS_ACTIONS(name, len) \ + u8 name[len][MLX5_MH_ACT_SZ] = {} + +#define DECLARE_MOD_HDR_ACTS(name, acts_arr) \ + struct mlx5e_tc_mod_hdr_acts name = { \ + .max_actions = ARRAY_SIZE(acts_arr), \ + .is_static = true, \ + .actions = acts_arr, \ + } + +char *mlx5e_mod_hdr_alloc(struct mlx5_core_dev *mdev, int namespace, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); +void mlx5e_mod_hdr_dealloc(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); +char *mlx5e_mod_hdr_get_item(struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, int pos); 
+ +struct mlx5e_mod_hdr_handle * +mlx5e_mod_hdr_attach(struct mlx5_core_dev *mdev, + struct mod_hdr_tbl *tbl, + enum mlx5_flow_namespace_type namespace, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); +void mlx5e_mod_hdr_detach(struct mlx5_core_dev *mdev, + struct mod_hdr_tbl *tbl, + struct mlx5e_mod_hdr_handle *mh); +struct mlx5_modify_hdr *mlx5e_mod_hdr_get(struct mlx5e_mod_hdr_handle *mh); + +void mlx5e_mod_hdr_tbl_init(struct mod_hdr_tbl *tbl); +void mlx5e_mod_hdr_tbl_destroy(struct mod_hdr_tbl *tbl); + +static inline int mlx5e_mod_hdr_max_actions(struct mlx5_core_dev *mdev, int namespace) +{ + if (namespace == MLX5_FLOW_NAMESPACE_FDB) /* FDB offloading */ + return MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, max_modify_header_actions); + else /* namespace is MLX5_FLOW_NAMESPACE_KERNEL - NIC offloading */ + return MLX5_CAP_FLOWTABLE_NIC_RX(mdev, max_modify_header_actions); +} + +#endif /* __MLX5E_EN_MOD_HDR_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c new file mode 100644 index 0000000..254c847 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.c @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Mellanox Technologies. */ + +#include "en.h" +#include "monitor_stats.h" +#include "lib/eq.h" + +/* Driver will set the following watch counters list: + * Ppcnt.802_3: + * a_in_range_length_errors Type: 0x0, Counter: 0x0, group_id = N/A + * a_out_of_range_length_field Type: 0x0, Counter: 0x1, group_id = N/A + * a_frame_too_long_errors Type: 0x0, Counter: 0x2, group_id = N/A + * a_frame_check_sequence_errors Type: 0x0, Counter: 0x3, group_id = N/A + * a_alignment_errors Type: 0x0, Counter: 0x4, group_id = N/A + * if_out_discards Type: 0x0, Counter: 0x5, group_id = N/A + * Q_Counters: + * Q[index].rx_out_of_buffer Type: 0x1, Counter: 0x4, group_id = counter_ix + */ + +#define NUM_REQ_PPCNT_COUNTER_S1 MLX5_CMD_SET_MONITOR_NUM_PPCNT_COUNTER_SET1 +#define NUM_REQ_Q_COUNTERS_S1 MLX5_CMD_SET_MONITOR_NUM_Q_COUNTERS_SET1 + +int mlx5e_monitor_counter_supported(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(mdev, max_num_of_monitor_counters)) + return false; + if (MLX5_CAP_PCAM_REG(mdev, ppcnt) && + MLX5_CAP_GEN(mdev, num_ppcnt_monitor_counters) < + NUM_REQ_PPCNT_COUNTER_S1) + return false; + if (MLX5_CAP_GEN(mdev, num_q_monitor_counters) < + NUM_REQ_Q_COUNTERS_S1) + return false; + return true; +} + +void mlx5e_monitor_counter_arm(struct mlx5e_priv *priv) +{ + u32 in[MLX5_ST_SZ_DW(arm_monitor_counter_in)] = {}; + + MLX5_SET(arm_monitor_counter_in, in, opcode, + MLX5_CMD_OP_ARM_MONITOR_COUNTER); + mlx5_cmd_exec_in(priv->mdev, arm_monitor_counter, in); +} + +static void mlx5e_monitor_counters_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + monitor_counters_work); + + mutex_lock(&priv->state_lock); + mlx5e_stats_update_ndo_stats(priv); + mutex_unlock(&priv->state_lock); + mlx5e_monitor_counter_arm(priv); +} + +static int mlx5e_monitor_event_handler(struct notifier_block *nb, + unsigned long event, void *eqe) +{ + struct mlx5e_priv *priv = mlx5_nb_cof(nb, struct mlx5e_priv, + monitor_counters_nb); + queue_work(priv->wq, &priv->monitor_counters_work); + return NOTIFY_OK; +} + +static int fill_monitor_counter_ppcnt_set1(int cnt, u32 *in) +{ + enum mlx5_monitor_counter_ppcnt ppcnt_cnt; + + for (ppcnt_cnt = 
0; + ppcnt_cnt < NUM_REQ_PPCNT_COUNTER_S1; + ppcnt_cnt++, cnt++) { + MLX5_SET(set_monitor_counter_in, in, + monitor_counter[cnt].type, + MLX5_QUERY_MONITOR_CNT_TYPE_PPCNT); + MLX5_SET(set_monitor_counter_in, in, + monitor_counter[cnt].counter, + ppcnt_cnt); + } + return ppcnt_cnt; +} + +static int fill_monitor_counter_q_counter_set1(int cnt, int q_counter, u32 *in) +{ + MLX5_SET(set_monitor_counter_in, in, + monitor_counter[cnt].type, + MLX5_QUERY_MONITOR_CNT_TYPE_Q_COUNTER); + MLX5_SET(set_monitor_counter_in, in, + monitor_counter[cnt].counter, + MLX5_QUERY_MONITOR_Q_COUNTER_RX_OUT_OF_BUFFER); + MLX5_SET(set_monitor_counter_in, in, + monitor_counter[cnt].counter_group_id, + q_counter); + return 1; +} + +/* check if mlx5e_monitor_counter_supported before calling this function*/ +static void mlx5e_set_monitor_counter(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int max_num_of_counters = MLX5_CAP_GEN(mdev, max_num_of_monitor_counters); + int num_q_counters = MLX5_CAP_GEN(mdev, num_q_monitor_counters); + int num_ppcnt_counters = !MLX5_CAP_PCAM_REG(mdev, ppcnt) ? 0 : + MLX5_CAP_GEN(mdev, num_ppcnt_monitor_counters); + u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {}; + int q_counter = priv->q_counter; + int cnt = 0; + + if (num_ppcnt_counters >= NUM_REQ_PPCNT_COUNTER_S1 && + max_num_of_counters >= (NUM_REQ_PPCNT_COUNTER_S1 + cnt)) + cnt += fill_monitor_counter_ppcnt_set1(cnt, in); + + if (num_q_counters >= NUM_REQ_Q_COUNTERS_S1 && + max_num_of_counters >= (NUM_REQ_Q_COUNTERS_S1 + cnt) && + q_counter) + cnt += fill_monitor_counter_q_counter_set1(cnt, q_counter, in); + + MLX5_SET(set_monitor_counter_in, in, num_of_counters, cnt); + MLX5_SET(set_monitor_counter_in, in, opcode, + MLX5_CMD_OP_SET_MONITOR_COUNTER); + + mlx5_cmd_exec_in(mdev, set_monitor_counter, in); +} + +/* check if mlx5e_monitor_counter_supported before calling this function*/ +void mlx5e_monitor_counter_init(struct mlx5e_priv *priv) +{ + INIT_WORK(&priv->monitor_counters_work, mlx5e_monitor_counters_work); + MLX5_NB_INIT(&priv->monitor_counters_nb, mlx5e_monitor_event_handler, + MONITOR_COUNTER); + mlx5_eq_notifier_register(priv->mdev, &priv->monitor_counters_nb); + + mlx5e_set_monitor_counter(priv); + mlx5e_monitor_counter_arm(priv); + queue_work(priv->wq, &priv->update_stats_work); +} + +/* check if mlx5e_monitor_counter_supported before calling this function*/ +void mlx5e_monitor_counter_cleanup(struct mlx5e_priv *priv) +{ + u32 in[MLX5_ST_SZ_DW(set_monitor_counter_in)] = {}; + + MLX5_SET(set_monitor_counter_in, in, opcode, + MLX5_CMD_OP_SET_MONITOR_COUNTER); + + mlx5_cmd_exec_in(priv->mdev, set_monitor_counter, in); + mlx5_eq_notifier_unregister(priv->mdev, &priv->monitor_counters_nb); + cancel_work_sync(&priv->monitor_counters_work); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.h new file mode 100644 index 0000000..e1ac4b3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/monitor_stats.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018 Mellanox Technologies. 
*/ + +#ifndef __MLX5_MONITOR_H__ +#define __MLX5_MONITOR_H__ + +int mlx5e_monitor_counter_supported(struct mlx5e_priv *priv); +void mlx5e_monitor_counter_init(struct mlx5e_priv *priv); +void mlx5e_monitor_counter_cleanup(struct mlx5e_priv *priv); +void mlx5e_monitor_counter_arm(struct mlx5e_priv *priv); + +#endif /* __MLX5_MONITOR_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.c new file mode 100644 index 0000000..8708186 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -0,0 +1,857 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "en/params.h" +#include "en/txrx.h" +#include "en/port.h" +#include "en_accel/en_accel.h" +#include "accel/ipsec.h" +#include "fpga/ipsec.h" + +static bool mlx5e_rx_is_xdp(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + return params->xdp_prog || xsk; +} + +u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + u16 headroom; + + if (xsk) + return xsk->headroom; + + headroom = NET_IP_ALIGN; + if (mlx5e_rx_is_xdp(params, xsk)) + headroom += XDP_PACKET_HEADROOM; + else + headroom += MLX5_RX_HEADROOM; + + return headroom; +} + +u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + u32 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + u16 linear_rq_headroom = mlx5e_get_linear_rq_headroom(params, xsk); + + return linear_rq_headroom + hw_mtu; +} + +static u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + u32 frag_sz = mlx5e_rx_get_min_frag_sz(params, xsk); + + /* AF_XDP doesn't build SKBs in place. */ + if (!xsk) + frag_sz = MLX5_SKB_FRAG_SZ(frag_sz); + + /* XDP in mlx5e doesn't support multiple packets per page. AF_XDP is a + * special case. It can run with frames smaller than a page, as it + * doesn't allocate pages dynamically. However, here we pretend that + * fragments are page-sized: it allows to treat XSK frames like pages + * by redirecting alloc and free operations to XSK rings and by using + * the fact there are no multiple packets per "page" (which is a frame). + * The latter is important, because frames may come in a random order, + * and we will have trouble assemblying a real page of multiple frames. + */ + if (mlx5e_rx_is_xdp(params, xsk)) + frag_sz = max_t(u32, frag_sz, PAGE_SIZE); + + /* Even if we can go with a smaller fragment size, we must not put + * multiple packets into a single frame. + */ + if (xsk) + frag_sz = max_t(u32, frag_sz, xsk->chunk_size); + + return frag_sz; +} + +u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk); + + return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz); +} + +bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + /* AF_XDP allocates SKBs on XDP_PASS - ensure they don't occupy more + * than one page. For this, check both with and without xsk. 
+ */ + u32 linear_frag_sz = max(mlx5e_rx_get_linear_frag_sz(params, xsk), + mlx5e_rx_get_linear_frag_sz(params, NULL)); + + return params->packet_merge.type == MLX5E_PACKET_MERGE_NONE && + linear_frag_sz <= PAGE_SIZE; +} + +static bool mlx5e_verify_rx_mpwqe_strides(struct mlx5_core_dev *mdev, + u8 log_stride_sz, u8 log_num_strides) +{ + if (log_stride_sz + log_num_strides != MLX5_MPWRQ_LOG_WQE_SZ) + return false; + + if (log_stride_sz < MLX5_MPWQE_LOG_STRIDE_SZ_BASE || + log_stride_sz > MLX5_MPWQE_LOG_STRIDE_SZ_MAX) + return false; + + if (log_num_strides > MLX5_MPWQE_LOG_NUM_STRIDES_MAX) + return false; + + if (MLX5_CAP_GEN(mdev, ext_stride_num_range)) + return log_num_strides >= MLX5_MPWQE_LOG_NUM_STRIDES_EXT_BASE; + + return log_num_strides >= MLX5_MPWQE_LOG_NUM_STRIDES_BASE; +} + +bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + s8 log_num_strides; + u8 log_stride_sz; + + if (!mlx5e_rx_is_linear_skb(params, xsk)) + return false; + + log_stride_sz = order_base_2(mlx5e_rx_get_linear_frag_sz(params, xsk)); + log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - log_stride_sz; + + return mlx5e_verify_rx_mpwqe_strides(mdev, log_stride_sz, log_num_strides); +} + +u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + u8 log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(params, xsk); + + /* Numbers are unsigned, don't subtract to avoid underflow. */ + if (params->log_rq_mtu_frames < + log_pkts_per_wqe + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW) + return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW; + + return params->log_rq_mtu_frames - log_pkts_per_wqe; +} + +u8 mlx5e_shampo_get_log_hd_entry_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + return order_base_2(DIV_ROUND_UP(MLX5E_RX_MAX_HEAD, MLX5E_SHAMPO_WQ_BASE_HEAD_ENTRY_SIZE)); +} + +u8 mlx5e_shampo_get_log_rsrv_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + return order_base_2(MLX5E_SHAMPO_WQ_RESRV_SIZE / MLX5E_SHAMPO_WQ_BASE_RESRV_SIZE); +} + +u8 mlx5e_shampo_get_log_pkt_per_rsrv(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + u32 resrv_size = BIT(mlx5e_shampo_get_log_rsrv_size(mdev, params)) * + PAGE_SIZE; + + return order_base_2(DIV_ROUND_UP(resrv_size, params->sw_mtu)); +} + +u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk)) + return order_base_2(mlx5e_rx_get_linear_frag_sz(params, xsk)); + + return MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev); +} + +u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + return MLX5_MPWRQ_LOG_WQE_SZ - + mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk); +} + +u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + bool is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ? + mlx5e_rx_is_linear_skb(params, xsk) : + mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk); + + return is_linear_skb || params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO ? 
+ mlx5e_get_linear_rq_headroom(params, xsk) : 0; +} + +u16 mlx5e_calc_sq_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + bool is_mpwqe = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE); + u16 stop_room; + + stop_room = mlx5e_tls_get_stop_room(mdev, params); + stop_room += mlx5e_stop_room_for_max_wqe(mdev); + if (is_mpwqe) + /* A MPWQE can take up to the maximum cacheline-aligned WQE + + * all the normal stop room can be taken if a new packet breaks + * the active MPWQE session and allocates its WQEs right away. + */ + stop_room += mlx5e_stop_room_for_mpwqe(mdev); + + return stop_room; +} + +int mlx5e_validate_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + size_t sq_size = 1 << params->log_sq_size; + u16 stop_room; + + stop_room = mlx5e_calc_sq_stop_room(mdev, params); + if (stop_room >= sq_size) { + mlx5_core_err(mdev, "Stop room %u is bigger than the SQ size %zu\n", + stop_room, sq_size); + return -EINVAL; + } + + return 0; +} + +static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode) +{ + struct dim_cq_moder moder = {}; + + moder.cq_period_mode = cq_period_mode; + moder.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS; + moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC; + if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE) + moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE; + + return moder; +} + +static struct dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode) +{ + struct dim_cq_moder moder = {}; + + moder.cq_period_mode = cq_period_mode; + moder.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS; + moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC; + if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE) + moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE; + + return moder; +} + +static u8 mlx5_to_net_dim_cq_period_mode(u8 cq_period_mode) +{ + return cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE ? 
+ DIM_CQ_PERIOD_MODE_START_FROM_CQE : + DIM_CQ_PERIOD_MODE_START_FROM_EQE; +} + +void mlx5e_reset_tx_moderation(struct mlx5e_params *params, u8 cq_period_mode) +{ + if (params->tx_dim_enabled) { + u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode); + + params->tx_cq_moderation = net_dim_get_def_tx_moderation(dim_period_mode); + } else { + params->tx_cq_moderation = mlx5e_get_def_tx_moderation(cq_period_mode); + } +} + +#define MLX5E_DEF_RX_DIM_PROFILE_IX 3 +void mlx5e_reset_rx_moderation(struct mlx5e_params *params, u8 cq_period_mode) +{ + if (params->rx_dim_enabled) { + u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode); + + params->rx_cq_moderation = + net_dim_get_rx_moderation(dim_period_mode, + MLX5E_DEF_RX_DIM_PROFILE_IX); + } else { + params->rx_cq_moderation = mlx5e_get_def_rx_moderation(cq_period_mode); + } +} + +void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +{ + mlx5e_reset_tx_moderation(params, cq_period_mode); + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER, + params->tx_cq_moderation.cq_period_mode == + MLX5_CQ_PERIOD_MODE_START_FROM_CQE); +} + +void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode) +{ + mlx5e_reset_rx_moderation(params, cq_period_mode); + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_BASED_MODER, + params->rx_cq_moderation.cq_period_mode == + MLX5_CQ_PERIOD_MODE_START_FROM_CQE); +} + +bool slow_pci_heuristic(struct mlx5_core_dev *mdev) +{ + u32 link_speed = 0; + u32 pci_bw = 0; + + mlx5e_port_max_linkspeed(mdev, &link_speed); + pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL); + mlx5_core_dbg_once(mdev, "Max link speed = %d, PCI BW = %d\n", + link_speed, pci_bw); + +#define MLX5E_SLOW_PCI_RATIO (2) + + return link_speed && pci_bw && + link_speed > MLX5E_SLOW_PCI_RATIO * pci_bw; +} + +int mlx5e_mpwrq_validate_regular(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + if (!mlx5e_check_fragmented_striding_rq_cap(mdev)) + return -EOPNOTSUPP; + + if (mlx5_fpga_is_ipsec_device(mdev)) + return -EOPNOTSUPP; + + if (params->xdp_prog && !mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL)) + return -EINVAL; + + return 0; +} + +int mlx5e_mpwrq_validate_xsk(struct mlx5_core_dev *mdev, struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + if (!mlx5e_check_fragmented_striding_rq_cap(mdev)) + return -EOPNOTSUPP; + + if (mlx5_fpga_is_ipsec_device(mdev)) + return -EOPNOTSUPP; + + if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk)) + return -EINVAL; + + return 0; +} + +void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + params->log_rq_mtu_frames = is_kdump_kernel() ? + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE : + MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; + + mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n", + params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ, + params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ? + BIT(mlx5e_mpwqe_get_log_rq_size(params, NULL)) : + BIT(params->log_rq_mtu_frames), + BIT(mlx5e_mpwqe_get_log_stride_size(mdev, params, NULL)), + MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)); +} + +void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + params->rq_wq_type = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) ? 
+ MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ : + MLX5_WQ_TYPE_CYCLIC; +} + +void mlx5e_build_rq_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + /* Prefer Striding RQ, unless any of the following holds: + * - Striding RQ configuration is not possible/supported. + * - Slow PCI heuristic. + * - Legacy RQ would use linear SKB while Striding RQ would use non-linear. + * + * No XSK params: checking the availability of striding RQ in general. + */ + if (!slow_pci_heuristic(mdev) && + !mlx5e_mpwrq_validate_regular(mdev, params) && + (mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL) || + !mlx5e_rx_is_linear_skb(params, NULL))) + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, true); + mlx5e_set_rq_type(mdev, params); + mlx5e_init_rq_type_params(mdev, params); +} + +/* Build queue parameters */ + +void mlx5e_build_create_cq_param(struct mlx5e_create_cq_param *ccp, struct mlx5e_channel *c) +{ + *ccp = (struct mlx5e_create_cq_param) { + .napi = &c->napi, + .ch_stats = c->stats, + .node = cpu_to_node(c->cpu), + .ix = c->ix, + }; +} + +#define DEFAULT_FRAG_SIZE (2048) + +static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + struct mlx5e_rq_frags_info *info) +{ + u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu); + int frag_size_max = DEFAULT_FRAG_SIZE; + u32 buf_size = 0; + int i; + + if (mlx5_fpga_is_ipsec_device(mdev)) + byte_count += MLX5E_METADATA_ETHER_LEN; + + if (mlx5e_rx_is_linear_skb(params, xsk)) { + int frag_stride; + + frag_stride = mlx5e_rx_get_linear_frag_sz(params, xsk); + frag_stride = roundup_pow_of_two(frag_stride); + + info->arr[0].frag_size = byte_count; + info->arr[0].frag_stride = frag_stride; + info->num_frags = 1; + info->wqe_bulk = PAGE_SIZE / frag_stride; + goto out; + } + + if (byte_count > PAGE_SIZE + + (MLX5E_MAX_RX_FRAGS - 1) * frag_size_max) + frag_size_max = PAGE_SIZE; + + i = 0; + while (buf_size < byte_count) { + int frag_size = byte_count - buf_size; + + if (i < MLX5E_MAX_RX_FRAGS - 1) + frag_size = min(frag_size, frag_size_max); + + info->arr[i].frag_size = frag_size; + info->arr[i].frag_stride = roundup_pow_of_two(frag_size); + + buf_size += frag_size; + i++; + } + info->num_frags = i; + /* number of different wqes sharing a page */ + info->wqe_bulk = 1 + (info->num_frags % 2); + +out: + info->wqe_bulk = max_t(u8, info->wqe_bulk, 8); + info->log_num_frags = order_base_2(info->num_frags); +} + +static u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs) +{ + int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs; + + switch (wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + sz += sizeof(struct mlx5e_rx_wqe_ll); + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ + sz += sizeof(struct mlx5e_rx_wqe_cyc); + } + + return order_base_2(sz); +} + +void mlx5e_build_common_cq_param(struct mlx5_core_dev *mdev, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) + MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); +} + +static u32 mlx5e_shampo_get_log_cq_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + int rsrv_size = BIT(mlx5e_shampo_get_log_rsrv_size(mdev, params)) * PAGE_SIZE; + u16 num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk)); + int pkt_per_rsrv = BIT(mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params)); + u8 log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params, 
xsk); + int wq_size = BIT(mlx5e_mpwqe_get_log_rq_size(params, xsk)); + int wqe_size = BIT(log_stride_sz) * num_strides; + + /* +1 is for the case that the pkt_per_rsrv dont consume the reservation + * so we get a filler cqe for the rest of the reservation. + */ + return order_base_2((wqe_size / rsrv_size) * wq_size * (pkt_per_rsrv + 1)); +} + +static void mlx5e_build_rx_cq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + struct mlx5e_cq_param *param) +{ + bool hw_stridx = false; + void *cqc = param->cqc; + u8 log_cq_size; + + switch (params->rq_wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + hw_stridx = MLX5_CAP_GEN(mdev, mini_cqe_resp_stride_index); + if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) + log_cq_size = mlx5e_shampo_get_log_cq_size(mdev, params, xsk); + else + log_cq_size = mlx5e_mpwqe_get_log_rq_size(params, xsk) + + mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk); + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ + log_cq_size = params->log_rq_mtu_frames; + } + + MLX5_SET(cqc, cqc, log_cq_size, log_cq_size); + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) { + MLX5_SET(cqc, cqc, mini_cqe_res_format, hw_stridx ? + MLX5_CQE_FORMAT_CSUM_STRIDX : MLX5_CQE_FORMAT_CSUM); + MLX5_SET(cqc, cqc, cqe_comp_en, 1); + } + + mlx5e_build_common_cq_param(mdev, param); + param->cq_period_mode = params->rx_cq_moderation.cq_period_mode; +} + +void mlx5e_build_aso_cq_param(struct mlx5_core_dev *mdev, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, log_cq_size, 1); + + mlx5e_build_common_cq_param(mdev, param); + param->cq_period_mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; +} + +static u8 rq_end_pad_mode(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + bool lro_en = params->packet_merge.type == MLX5E_PACKET_MERGE_LRO; + bool ro = pcie_relaxed_ordering_enabled(mdev->pdev) && + MLX5_CAP_GEN(mdev, relaxed_ordering_write); + + return ro && lro_en ? 
+ MLX5_WQ_END_PAD_MODE_NONE : MLX5_WQ_END_PAD_MODE_ALIGN; +} + +int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + u16 q_counter, + struct mlx5e_rq_param *param) +{ + void *rqc = param->rqc; + void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + int ndsegs = 1; + + switch (params->rq_wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: { + u8 log_wqe_num_of_strides = mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk); + u8 log_wqe_stride_size = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk); + + if (!mlx5e_verify_rx_mpwqe_strides(mdev, log_wqe_stride_size, + log_wqe_num_of_strides)) { + mlx5_core_err(mdev, + "Bad RX MPWQE params: log_stride_size %u, log_num_strides %u\n", + log_wqe_stride_size, log_wqe_num_of_strides); + return -EINVAL; + } + + MLX5_SET(wq, wq, log_wqe_num_of_strides, + log_wqe_num_of_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE); + MLX5_SET(wq, wq, log_wqe_stride_size, + log_wqe_stride_size - MLX5_MPWQE_LOG_STRIDE_SZ_BASE); + MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params, xsk)); + if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) { + MLX5_SET(wq, wq, shampo_enable, true); + MLX5_SET(wq, wq, log_reservation_size, + mlx5e_shampo_get_log_rsrv_size(mdev, params)); + MLX5_SET(wq, wq, + log_max_num_of_packets_per_reservation, + mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params)); + MLX5_SET(wq, wq, log_headers_entry_size, + mlx5e_shampo_get_log_hd_entry_size(mdev, params)); + MLX5_SET(rqc, rqc, reservation_timeout, + params->packet_merge.timeout); + MLX5_SET(rqc, rqc, shampo_match_criteria_type, + params->packet_merge.shampo.match_criteria_type); + MLX5_SET(rqc, rqc, shampo_no_match_alignment_granularity, + params->packet_merge.shampo.alignment_granularity); + } + break; + } + default: /* MLX5_WQ_TYPE_CYCLIC */ + MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames); + mlx5e_build_rq_frags_info(mdev, params, xsk, ¶m->frags_info); + ndsegs = param->frags_info.num_frags; + } + + MLX5_SET(wq, wq, wq_type, params->rq_wq_type); + MLX5_SET(wq, wq, end_padding_mode, rq_end_pad_mode(mdev, params)); + MLX5_SET(wq, wq, log_wq_stride, + mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs)); + MLX5_SET(wq, wq, pd, mdev->mlx5e_res.hw_objs.pdn); + MLX5_SET(rqc, rqc, counter_set_id, q_counter); + MLX5_SET(rqc, rqc, vsd, params->vlan_strip_disable); + MLX5_SET(rqc, rqc, scatter_fcs, params->scatter_fcs_en); + + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_DROPLESS_RQ)) + MLX5_SET(rqc, rqc, delay_drop_en, 1); + + param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); + mlx5e_build_rx_cq_param(mdev, params, xsk, ¶m->cqp); + + return 0; +} + +void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev, + u16 q_counter, + struct mlx5e_rq_param *param) +{ + void *rqc = param->rqc; + void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, log_wq_stride, + mlx5e_get_rqwq_log_stride(MLX5_WQ_TYPE_CYCLIC, 1)); + MLX5_SET(rqc, rqc, counter_set_id, q_counter); + + param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); +} + +void mlx5e_build_tx_cq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, log_cq_size, params->log_sq_size); + + mlx5e_build_common_cq_param(mdev, param); + param->cq_period_mode = params->tx_cq_moderation.cq_period_mode; +} + +void mlx5e_build_sq_param_common(struct mlx5_core_dev *mdev, + struct mlx5e_sq_param *param) +{ + void *sqc = 
param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, pd, + param->pdn ? param->pdn : mdev->mlx5e_res.hw_objs.pdn); + + param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); +} + +void mlx5e_build_sq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + bool allow_swp; + + allow_swp = mlx5_geneve_tx_allowed(mdev) || + !!MLX5_IPSEC_DEV(mdev); + mlx5e_build_sq_param_common(mdev, param); + MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); + MLX5_SET(sqc, sqc, allow_swp, allow_swp); + param->is_mpw = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE); + param->stop_room = mlx5e_calc_sq_stop_room(mdev, params); + mlx5e_build_tx_cq_param(mdev, params, ¶m->cqp); +} + +static void mlx5e_build_ico_cq_param(struct mlx5_core_dev *mdev, + u8 log_wq_size, + struct mlx5e_cq_param *param) +{ + void *cqc = param->cqc; + + MLX5_SET(cqc, cqc, log_cq_size, log_wq_size); + + mlx5e_build_common_cq_param(mdev, param); + + param->cq_period_mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; +} + +static u8 mlx5e_get_rq_log_wq_sz(void *rqc) +{ + void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + + return MLX5_GET(wq, wq, log_wq_sz); +} + +/* This function calculates the maximum number of headers entries that are needed + * per WQE, the formula is based on the size of the reservations and the + * restriction we have about max packets for reservation that is equal to max + * headers per reservation. + */ +u32 mlx5e_shampo_hd_per_wqe(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rq_param) +{ + int resv_size = BIT(mlx5e_shampo_get_log_rsrv_size(mdev, params)) * PAGE_SIZE; + u16 num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, NULL)); + int pkt_per_resv = BIT(mlx5e_shampo_get_log_pkt_per_rsrv(mdev, params)); + u8 log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params, NULL); + int wqe_size = BIT(log_stride_sz) * num_strides; + u32 hd_per_wqe; + + /* Assumption: hd_per_wqe % 8 == 0. */ + hd_per_wqe = (wqe_size / resv_size) * pkt_per_resv; + mlx5_core_dbg(mdev, "%s hd_per_wqe = %d rsrv_size = %d wqe_size = %d pkt_per_resv = %d\n", + __func__, hd_per_wqe, resv_size, wqe_size, pkt_per_resv); + return hd_per_wqe; +} + +/* This function calculates the maximum number of headers entries that are needed + * for the WQ, this value is uesed to allocate the header buffer in HW, thus + * must be a pow of 2. 
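+ * The value is obtained below by scaling the per-WQE header count by the
+ * WQ size and rounding the product up with roundup_pow_of_two().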
+ */ +u32 mlx5e_shampo_hd_per_wq(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rq_param) +{ + void *wqc = MLX5_ADDR_OF(rqc, rq_param->rqc, wq); + int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz)); + u32 hd_per_wqe, hd_per_wq; + + hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param); + hd_per_wq = roundup_pow_of_two(hd_per_wqe * wq_size); + return hd_per_wq; +} + +static u32 mlx5e_shampo_icosq_sz(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rq_param) +{ + int max_num_of_umr_per_wqe, max_hd_per_wqe, max_klm_per_umr, rest; + void *wqc = MLX5_ADDR_OF(rqc, rq_param->rqc, wq); + int wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz)); + u32 wqebbs; + + max_klm_per_umr = MLX5E_MAX_KLM_PER_WQE(mdev); + max_hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rq_param); + max_num_of_umr_per_wqe = max_hd_per_wqe / max_klm_per_umr; + rest = max_hd_per_wqe % max_klm_per_umr; + wqebbs = MLX5E_KLM_UMR_WQEBBS(max_klm_per_umr) * max_num_of_umr_per_wqe; + if (rest) + wqebbs += MLX5E_KLM_UMR_WQEBBS(rest); + wqebbs *= wq_size; + return wqebbs; +} + +static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rqp) +{ + u32 wqebbs; + + /* MLX5_WQ_TYPE_CYCLIC */ + if (params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE; + + wqebbs = MLX5E_UMR_WQEBBS * BIT(mlx5e_get_rq_log_wq_sz(rqp->rqc)); + + /* If XDP program is attached, XSK may be turned on at any time without + * restarting the channel. ICOSQ must be big enough to fit UMR WQEs of + * both regular RQ and XSK RQ. + * Although mlx5e_mpwqe_get_log_rq_size accepts mlx5e_xsk_param, it + * doesn't affect its return value, as long as params->xdp_prog != NULL, + * so we can just multiply by 2. 
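+	 * Any SHAMPO ICOSQ budget is added on top below, and the total WQEBB
+	 * count is rounded up to a power-of-two log size via order_base_2().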
+ */ + if (params->xdp_prog) + wqebbs *= 2; + + if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) + wqebbs += mlx5e_shampo_icosq_sz(mdev, params, rqp); + + return max_t(u8, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE, order_base_2(wqebbs)); +} + +static u8 mlx5e_build_async_icosq_log_wq_sz(struct mlx5_core_dev *mdev) +{ + if (mlx5e_accel_is_ktls_rx(mdev)) + return MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; + + return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE; +} + +static void mlx5e_build_icosq_param(struct mlx5_core_dev *mdev, + u8 log_wq_size, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + mlx5e_build_sq_param_common(mdev, param); + + MLX5_SET(wq, wq, log_wq_sz, log_wq_size); + MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(mdev, reg_umr_sq)); + mlx5e_build_ico_cq_param(mdev, log_wq_size, ¶m->cqp); +} + +static void mlx5e_build_async_icosq_param(struct mlx5_core_dev *mdev, + u8 log_wq_size, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + mlx5e_build_sq_param_common(mdev, param); + param->stop_room = mlx5e_stop_room_for_wqe(mdev, 1); /* for XSK NOP */ + param->is_tls = mlx5e_accel_is_ktls_rx(mdev); + if (param->is_tls) + param->stop_room += mlx5e_stop_room_for_wqe(mdev, 1); /* for TLS RX resync NOP */ + MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(mdev, reg_umr_sq)); + MLX5_SET(wq, wq, log_wq_sz, log_wq_size); + mlx5e_build_ico_cq_param(mdev, log_wq_size, ¶m->cqp); +} + +void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + + mlx5e_build_sq_param_common(mdev, param); + MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); + param->is_mpw = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_XDP_TX_MPWQE); + mlx5e_build_tx_cq_param(mdev, params, ¶m->cqp); +} + +int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + u16 q_counter, + struct mlx5e_channel_param *cparam) +{ + u8 icosq_log_wq_sz, async_icosq_log_wq_sz; + int err; + + err = mlx5e_build_rq_param(mdev, params, NULL, q_counter, &cparam->rq); + if (err) + return err; + + icosq_log_wq_sz = mlx5e_build_icosq_log_wq_sz(mdev, params, &cparam->rq); + async_icosq_log_wq_sz = mlx5e_build_async_icosq_log_wq_sz(mdev); + + mlx5e_build_sq_param(mdev, params, &cparam->txq_sq); + mlx5e_build_xdpsq_param(mdev, params, &cparam->xdp_sq); + mlx5e_build_icosq_param(mdev, icosq_log_wq_sz, &cparam->icosq); + mlx5e_build_async_icosq_param(mdev, async_icosq_log_wq_sz, &cparam->async_icosq); + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.h new file mode 100644 index 0000000..45fc69f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __MLX5_EN_PARAMS_H__ +#define __MLX5_EN_PARAMS_H__ + +#include "en.h" + +struct mlx5e_xsk_param { + u16 headroom; + u16 chunk_size; +}; + +struct mlx5e_cq_param { + u32 cqc[MLX5_ST_SZ_DW(cqc)]; + struct mlx5_wq_param wq; + u16 eq_ix; + u8 cq_period_mode; +}; + +struct mlx5e_rq_param { + struct mlx5e_cq_param cqp; + u32 rqc[MLX5_ST_SZ_DW(rqc)]; + struct mlx5_wq_param wq; + struct mlx5e_rq_frags_info frags_info; +}; + +struct mlx5e_sq_param { + struct mlx5e_cq_param cqp; + u32 sqc[MLX5_ST_SZ_DW(sqc)]; + struct mlx5_wq_param wq; + bool is_mpw; + bool is_tls; + u16 stop_room; + u32 pdn; +}; + +struct mlx5e_channel_param { + struct mlx5e_rq_param rq; + struct mlx5e_sq_param txq_sq; + struct mlx5e_sq_param xdp_sq; + struct mlx5e_sq_param icosq; + struct mlx5e_sq_param async_icosq; +}; + +struct mlx5e_create_sq_param { + struct mlx5_wq_ctrl *wq_ctrl; + u32 cqn; + u32 ts_cqe_to_dest_cqn; + u32 tisn; + u8 tis_lst_sz; + u8 min_inline_mode; +}; + +static inline bool mlx5e_qid_get_ch_if_in_group(struct mlx5e_params *params, + u16 qid, + enum mlx5e_rq_group group, + u16 *ix) +{ + int nch = params->num_channels; + int ch = qid - nch * group; + + if (ch < 0 || ch >= nch) + return false; + + *ix = ch; + return true; +} + +static inline void mlx5e_qid_get_ch_and_group(struct mlx5e_params *params, + u16 qid, + u16 *ix, + enum mlx5e_rq_group *group) +{ + u16 nch = params->num_channels; + + *ix = qid % nch; + *group = qid / nch; +} + +static inline bool mlx5e_qid_validate(const struct mlx5e_profile *profile, + struct mlx5e_params *params, u64 qid) +{ + return qid < params->num_channels * profile->rq_groups; +} + +/* Parameter calculations */ + +void mlx5e_reset_tx_moderation(struct mlx5e_params *params, u8 cq_period_mode); +void mlx5e_reset_rx_moderation(struct mlx5e_params *params, u8 cq_period_mode); +void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode); +void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode); + +bool slow_pci_heuristic(struct mlx5_core_dev *mdev); +int mlx5e_mpwrq_validate_regular(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +int mlx5e_mpwrq_validate_xsk(struct mlx5_core_dev *mdev, struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +void mlx5e_build_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); + +u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +u8 mlx5e_shampo_get_log_hd_entry_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params); +u8 mlx5e_shampo_get_log_rsrv_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params); +u8 mlx5e_shampo_get_log_pkt_per_rsrv(struct mlx5_core_dev *mdev, + struct mlx5e_params *params); +u32 mlx5e_shampo_hd_per_wqe(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rq_param); +u32 
mlx5e_shampo_hd_per_wq(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rq_param); +u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); +u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk); + +/* Build queue parameters */ + +void mlx5e_build_create_cq_param(struct mlx5e_create_cq_param *ccp, struct mlx5e_channel *c); +int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + u16 q_counter, + struct mlx5e_rq_param *param); +void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev, + u16 q_counter, + struct mlx5e_rq_param *param); +void mlx5e_build_sq_param_common(struct mlx5_core_dev *mdev, + struct mlx5e_sq_param *param); +void mlx5e_build_sq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_sq_param *param); +void mlx5e_build_tx_cq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_cq_param *param); +void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_sq_param *param); +void mlx5e_build_aso_cq_param(struct mlx5_core_dev *mdev, + struct mlx5e_cq_param *param); +int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + u16 q_counter, + struct mlx5e_channel_param *cparam); + +u16 mlx5e_calc_sq_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +int mlx5e_validate_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); + +void mlx5e_build_common_cq_param(struct mlx5_core_dev *mdev, + struct mlx5e_cq_param *param); +#endif /* __MLX5_EN_PARAMS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.c new file mode 100644 index 0000000..02d2f9a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.c @@ -0,0 +1,594 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "port.h" + +/* speed in units of 1Mb */ +static const u32 mlx5e_link_speed[MLX5E_LINK_MODES_NUMBER] = { + [MLX5E_1000BASE_CX_SGMII] = 1000, + [MLX5E_1000BASE_KX] = 1000, + [MLX5E_10GBASE_CX4] = 10000, + [MLX5E_10GBASE_KX4] = 10000, + [MLX5E_10GBASE_KR] = 10000, + [MLX5E_20GBASE_KR2] = 20000, + [MLX5E_40GBASE_CR4] = 40000, + [MLX5E_40GBASE_KR4] = 40000, + [MLX5E_56GBASE_R4] = 56000, + [MLX5E_10GBASE_CR] = 10000, + [MLX5E_10GBASE_SR] = 10000, + [MLX5E_10GBASE_ER] = 10000, + [MLX5E_40GBASE_SR4] = 40000, + [MLX5E_40GBASE_LR4] = 40000, + [MLX5E_50GBASE_SR2] = 50000, + [MLX5E_100GBASE_CR4] = 100000, + [MLX5E_100GBASE_SR4] = 100000, + [MLX5E_100GBASE_KR4] = 100000, + [MLX5E_100GBASE_LR4] = 100000, + [MLX5E_100BASE_TX] = 100, + [MLX5E_1000BASE_T] = 1000, + [MLX5E_10GBASE_T] = 10000, + [MLX5E_25GBASE_CR] = 25000, + [MLX5E_25GBASE_KR] = 25000, + [MLX5E_25GBASE_SR] = 25000, + [MLX5E_50GBASE_CR2] = 50000, + [MLX5E_50GBASE_KR2] = 50000, +}; + +static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = { + [MLX5E_SGMII_100M] = 100, + [MLX5E_1000BASE_X_SGMII] = 1000, + [MLX5E_5GBASE_R] = 5000, + [MLX5E_10GBASE_XFI_XAUI_1] = 10000, + [MLX5E_40GBASE_XLAUI_4_XLPPI_4] = 40000, + [MLX5E_25GAUI_1_25GBASE_CR_KR] = 25000, + [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2] = 50000, + [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR] = 50000, + [MLX5E_CAUI_4_100GBASE_CR4_KR4] = 100000, + [MLX5E_100GAUI_2_100GBASE_CR2_KR2] = 100000, + [MLX5E_200GAUI_4_200GBASE_CR4_KR4] = 200000, + [MLX5E_400GAUI_8] = 400000, + [MLX5E_100GAUI_1_100GBASE_CR_KR] = 100000, + [MLX5E_200GAUI_2_200GBASE_CR2_KR2] = 200000, + [MLX5E_400GAUI_4_400GBASE_CR4_KR4] = 400000, +}; + +bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev) +{ + struct mlx5e_port_eth_proto eproto; + int err; + + if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet)) + return true; + + err = mlx5_port_query_eth_proto(mdev, 1, true, &eproto); + if (err) + return false; + + return !!eproto.cap; +} + +static void mlx5e_port_get_speed_arr(struct mlx5_core_dev *mdev, + const u32 **arr, u32 *size, + bool force_legacy) +{ + bool ext = force_legacy ? false : mlx5e_ptys_ext_supported(mdev); + + *size = ext ? ARRAY_SIZE(mlx5e_ext_link_speed) : + ARRAY_SIZE(mlx5e_link_speed); + *arr = ext ? 
mlx5e_ext_link_speed : mlx5e_link_speed; +} + +int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext, + struct mlx5e_port_eth_proto *eproto) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + if (!eproto) + return -EINVAL; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, port); + if (err) + return err; + + eproto->cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, + eth_proto_capability); + eproto->admin = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_admin); + eproto->oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); + return 0; +} + +void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status, + u8 *an_disable_cap, u8 *an_disable_admin) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + + *an_status = 0; + *an_disable_cap = 0; + *an_disable_admin = 0; + + if (mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, 1)) + return; + + *an_status = MLX5_GET(ptys_reg, out, an_status); + *an_disable_cap = MLX5_GET(ptys_reg, out, an_disable_cap); + *an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin); +} + +int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable, + u32 proto_admin, bool ext) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + u32 in[MLX5_ST_SZ_DW(ptys_reg)]; + u8 an_disable_admin; + u8 an_disable_cap; + u8 an_status; + + mlx5_port_query_eth_autoneg(dev, &an_status, &an_disable_cap, + &an_disable_admin); + if (!an_disable_cap && an_disable) + return -EPERM; + + memset(in, 0, sizeof(in)); + + MLX5_SET(ptys_reg, in, local_port, 1); + MLX5_SET(ptys_reg, in, an_disable_admin, an_disable); + MLX5_SET(ptys_reg, in, proto_mask, MLX5_PTYS_EN); + if (ext) + MLX5_SET(ptys_reg, in, ext_eth_proto_admin, proto_admin); + else + MLX5_SET(ptys_reg, in, eth_proto_admin, proto_admin); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PTYS, 0, 1); +} + +u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper, + bool force_legacy) +{ + unsigned long temp = eth_proto_oper; + const u32 *table; + u32 speed = 0; + u32 max_size; + int i; + + mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy); + i = find_first_bit(&temp, max_size); + if (i < max_size) + speed = table[i]; + return speed; +} + +int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) +{ + struct mlx5e_port_eth_proto eproto; + bool force_legacy = false; + bool ext; + int err; + + ext = mlx5e_ptys_ext_supported(mdev); + err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); + if (err) + goto out; + if (ext && !eproto.admin) { + force_legacy = true; + err = mlx5_port_query_eth_proto(mdev, 1, false, &eproto); + if (err) + goto out; + } + *speed = mlx5e_port_ptys2speed(mdev, eproto.oper, force_legacy); + if (!(*speed)) + err = -EINVAL; + +out: + return err; +} + +int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed) +{ + struct mlx5e_port_eth_proto eproto; + u32 max_speed = 0; + const u32 *table; + u32 max_size; + bool ext; + int err; + int i; + + ext = mlx5e_ptys_ext_supported(mdev); + err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); + if (err) + return err; + + mlx5e_port_get_speed_arr(mdev, &table, &max_size, false); + for (i = 0; i < max_size; ++i) + if (eproto.cap & MLX5E_PROT_MASK(i)) + max_speed = max(max_speed, table[i]); + + *speed = max_speed; + return 0; +} + +u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed, + bool force_legacy) +{ + u32 link_modes = 0; + const u32 *table; + u32 max_size; + int i; + + mlx5e_port_get_speed_arr(mdev, &table, &max_size, 
force_legacy); + for (i = 0; i < max_size; ++i) { + if (table[i] == speed) + link_modes |= MLX5E_PROT_MASK(i); + } + return link_modes; +} + +int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out) +{ + int sz = MLX5_ST_SZ_BYTES(pbmc_reg); + void *in; + int err; + + in = kzalloc(sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(pbmc_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PBMC, 0, 0); + + kfree(in); + return err; +} + +int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in) +{ + int sz = MLX5_ST_SZ_BYTES(pbmc_reg); + void *out; + int err; + + out = kzalloc(sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(pbmc_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PBMC, 0, 1); + + kfree(out); + return err; +} + +/* buffer[i]: buffer that priority i mapped to */ +int mlx5e_port_query_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer) +{ + int sz = MLX5_ST_SZ_BYTES(pptb_reg); + u32 prio_x_buff; + void *out; + void *in; + int prio; + int err; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(pptb_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 0); + if (err) + goto out; + + prio_x_buff = MLX5_GET(pptb_reg, out, prio_x_buff); + for (prio = 0; prio < 8; prio++) { + buffer[prio] = (u8)(prio_x_buff >> (4 * prio)) & 0xF; + mlx5_core_dbg(mdev, "prio %d, buffer %d\n", prio, buffer[prio]); + } +out: + kfree(in); + kfree(out); + return err; +} + +int mlx5e_port_set_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer) +{ + int sz = MLX5_ST_SZ_BYTES(pptb_reg); + u32 prio_x_buff; + void *out; + void *in; + int prio; + int err; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + /* First query the pptb register */ + MLX5_SET(pptb_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 0); + if (err) + goto out; + + memcpy(in, out, sz); + MLX5_SET(pptb_reg, in, local_port, 1); + + /* Update the pm and prio_x_buff */ + MLX5_SET(pptb_reg, in, pm, 0xFF); + + prio_x_buff = 0; + for (prio = 0; prio < 8; prio++) + prio_x_buff |= (buffer[prio] << (4 * prio)); + MLX5_SET(pptb_reg, in, prio_x_buff, prio_x_buff); + + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 1); + +out: + kfree(in); + kfree(out); + return err; +} + +enum mlx5e_fec_supported_link_mode { + MLX5E_FEC_SUPPORTED_LINK_MODES_10G_40G, + MLX5E_FEC_SUPPORTED_LINK_MODES_25G, + MLX5E_FEC_SUPPORTED_LINK_MODES_50G, + MLX5E_FEC_SUPPORTED_LINK_MODES_56G, + MLX5E_FEC_SUPPORTED_LINK_MODES_100G, + MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X, + MLX5E_FEC_SUPPORTED_LINK_MODE_100G_2X, + MLX5E_FEC_SUPPORTED_LINK_MODE_200G_4X, + MLX5E_FEC_SUPPORTED_LINK_MODE_400G_8X, + MLX5E_MAX_FEC_SUPPORTED_LINK_MODE, +}; + +#define MLX5E_FEC_FIRST_50G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X + +#define MLX5E_FEC_OVERRIDE_ADMIN_POLICY(buf, policy, write, link) \ + do { \ + u16 *_policy = &(policy); \ + u32 *_buf = buf; \ + \ + if (write) \ + MLX5_SET(pplm_reg, _buf, fec_override_admin_##link, *_policy); \ + else \ + *_policy = MLX5_GET(pplm_reg, _buf, fec_override_admin_##link); \ + } while (0) + +/* get/set FEC admin field for a given speed */ +static int mlx5e_fec_admin_field(u32 *pplm, u16 *fec_policy, bool write, + enum mlx5e_fec_supported_link_mode link_mode) +{ + switch (link_mode) { + case 
MLX5E_FEC_SUPPORTED_LINK_MODES_10G_40G: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 10g_40g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_25G: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 25g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_50G: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 50g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_56G: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 56g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_100G: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 100g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 50g_1x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_100G_2X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 100g_2x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_4X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 200g_4x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_8X: + MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 400g_8x); + break; + default: + return -EINVAL; + } + return 0; +} + +#define MLX5E_GET_FEC_OVERRIDE_CAP(buf, link) \ + MLX5_GET(pplm_reg, buf, fec_override_cap_##link) + +/* returns FEC capabilities for a given speed */ +static int mlx5e_get_fec_cap_field(u32 *pplm, u16 *fec_cap, + enum mlx5e_fec_supported_link_mode link_mode) +{ + switch (link_mode) { + case MLX5E_FEC_SUPPORTED_LINK_MODES_10G_40G: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 10g_40g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_25G: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 25g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_50G: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 50g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_56G: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 56g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODES_100G: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 100g); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 50g_1x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_100G_2X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 100g_2x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_4X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 200g_4x); + break; + case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_8X: + *fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 400g_8x); + break; + default: + return -EINVAL; + } + return 0; +} + +bool mlx5e_fec_in_caps(struct mlx5_core_dev *dev, int fec_policy) +{ + bool fec_50g_per_lane = MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm); + u32 out[MLX5_ST_SZ_DW(pplm_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(pplm_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(pplm_reg); + int err; + int i; + + if (!MLX5_CAP_GEN(dev, pcam_reg) || !MLX5_CAP_PCAM_REG(dev, pplm)) + return false; + + MLX5_SET(pplm_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPLM, 0, 0); + if (err) + return false; + + for (i = 0; i < MLX5E_MAX_FEC_SUPPORTED_LINK_MODE; i++) { + u16 fec_caps; + + if (i >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE && !fec_50g_per_lane) + break; + + mlx5e_get_fec_cap_field(out, &fec_caps, i); + if (fec_caps & fec_policy) + return true; + } + return false; +} + +int mlx5e_get_fec_mode(struct mlx5_core_dev *dev, u32 *fec_mode_active, + u16 *fec_configured_mode) +{ + bool fec_50g_per_lane = MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm); + u32 out[MLX5_ST_SZ_DW(pplm_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(pplm_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(pplm_reg); + int 
err; + int i; + + if (!MLX5_CAP_GEN(dev, pcam_reg)) + return false; + + if (!MLX5_CAP_PCAM_REG(dev, pplm)) + return false; + + MLX5_SET(pplm_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPLM, 0, 0); + if (err) + return err; + + *fec_mode_active = MLX5_GET(pplm_reg, out, fec_mode_active); + + if (!fec_configured_mode) + goto out; + + *fec_configured_mode = 0; + for (i = 0; i < MLX5E_MAX_FEC_SUPPORTED_LINK_MODE; i++) { + if (i >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE && !fec_50g_per_lane) + break; + + mlx5e_fec_admin_field(out, fec_configured_mode, 0, i); + if (*fec_configured_mode != 0) + goto out; + } +out: + return 0; +} + +int mlx5e_set_fec_mode(struct mlx5_core_dev *dev, u16 fec_policy) +{ + bool fec_50g_per_lane = MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm); + u32 out[MLX5_ST_SZ_DW(pplm_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(pplm_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(pplm_reg); + u16 fec_policy_auto = 0; + int err; + int i; + + if (!MLX5_CAP_GEN(dev, pcam_reg)) + return -EOPNOTSUPP; + + if (!MLX5_CAP_PCAM_REG(dev, pplm)) + return -EOPNOTSUPP; + + if (fec_policy >= (1 << MLX5E_FEC_LLRS_272_257_1) && !fec_50g_per_lane) + return -EOPNOTSUPP; + + if (fec_policy && !mlx5e_fec_in_caps(dev, fec_policy)) + return -EOPNOTSUPP; + + MLX5_SET(pplm_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPLM, 0, 0); + if (err) + return err; + + MLX5_SET(pplm_reg, out, local_port, 1); + + for (i = 0; i < MLX5E_MAX_FEC_SUPPORTED_LINK_MODE; i++) { + u16 conf_fec = fec_policy; + u16 fec_caps = 0; + + if (i >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE && !fec_50g_per_lane) + break; + + /* RS fec in ethtool is mapped to MLX5E_FEC_RS_528_514 + * to link modes up to 25G per lane and to + * MLX5E_FEC_RS_544_514 in the new link modes based on + * 50 G per lane + */ + if (conf_fec == (1 << MLX5E_FEC_RS_528_514) && + i >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE) + conf_fec = (1 << MLX5E_FEC_RS_544_514); + + mlx5e_get_fec_cap_field(out, &fec_caps, i); + + /* policy supported for link speed */ + if (fec_caps & conf_fec) + mlx5e_fec_admin_field(out, &conf_fec, 1, i); + else + /* set FEC to auto*/ + mlx5e_fec_admin_field(out, &fec_policy_auto, 1, i); + } + + return mlx5_core_access_reg(dev, out, sz, out, sz, MLX5_REG_PPLM, 0, 1); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.h new file mode 100644 index 0000000..efbafd7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5E_EN_PORT_H +#define __MLX5E_EN_PORT_H + +#include +#include "en.h" + +struct mlx5e_port_eth_proto { + u32 cap; + u32 admin; + u32 oper; +}; + +int mlx5_query_port_status(struct mlx5_core_dev *mdev, u32 *status_opcode, + u16 *monitor_opcode, char *status_message); +int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext, + struct mlx5e_port_eth_proto *eproto); +void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status, + u8 *an_disable_cap, u8 *an_disable_admin); +int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable, + u32 proto_admin, bool ext); +u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper, + bool force_legacy); +int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); +int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); +u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed, + bool force_legacy); +bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev); +int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out); +int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in); +int mlx5e_port_query_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer); +int mlx5e_port_set_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer); + +bool mlx5e_fec_in_caps(struct mlx5_core_dev *dev, int fec_policy); +int mlx5e_get_fec_mode(struct mlx5_core_dev *dev, u32 *fec_mode_active, + u16 *fec_configured_mode); +int mlx5e_set_fec_mode(struct mlx5_core_dev *dev, u16 fec_policy); + +enum { + MLX5E_FEC_NOFEC, + MLX5E_FEC_FIRECODE, + MLX5E_FEC_RS_528_514, + MLX5E_FEC_RS_544_514 = 7, + MLX5E_FEC_LLRS_272_257_1 = 9, +}; + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c new file mode 100644 index 0000000..c9d5d8d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "port_buffer.h" + +int mlx5e_port_query_buffer(struct mlx5e_priv *priv, + struct mlx5e_port_buffer *port_buffer) +{ + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; + struct mlx5_core_dev *mdev = priv->mdev; + int sz = MLX5_ST_SZ_BYTES(pbmc_reg); + u32 total_used = 0; + void *buffer; + void *out; + int err; + int i; + + out = kzalloc(sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5e_port_query_pbmc(mdev, out); + if (err) + goto out; + + for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + buffer = MLX5_ADDR_OF(pbmc_reg, out, buffer[i]); + port_buffer->buffer[i].lossy = + MLX5_GET(bufferx_reg, buffer, lossy); + port_buffer->buffer[i].epsb = + MLX5_GET(bufferx_reg, buffer, epsb); + port_buffer->buffer[i].size = + MLX5_GET(bufferx_reg, buffer, size) * port_buff_cell_sz; + port_buffer->buffer[i].xon = + MLX5_GET(bufferx_reg, buffer, xon_threshold) * port_buff_cell_sz; + port_buffer->buffer[i].xoff = + MLX5_GET(bufferx_reg, buffer, xoff_threshold) * port_buff_cell_sz; + total_used += port_buffer->buffer[i].size; + + mlx5e_dbg(HW, priv, "buffer %d: size=%d, xon=%d, xoff=%d, epsb=%d, lossy=%d\n", i, + port_buffer->buffer[i].size, + port_buffer->buffer[i].xon, + port_buffer->buffer[i].xoff, + port_buffer->buffer[i].epsb, + port_buffer->buffer[i].lossy); + } + + port_buffer->port_buffer_size = + MLX5_GET(pbmc_reg, out, port_buffer_size) * port_buff_cell_sz; + port_buffer->spare_buffer_size = + port_buffer->port_buffer_size - total_used; + + mlx5e_dbg(HW, priv, "total buffer size=%d, spare buffer size=%d\n", + port_buffer->port_buffer_size, + port_buffer->spare_buffer_size); +out: + kfree(out); + return err; +} + +static int port_set_buffer(struct mlx5e_priv *priv, + struct mlx5e_port_buffer *port_buffer) +{ + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; + struct mlx5_core_dev *mdev = priv->mdev; + int sz = MLX5_ST_SZ_BYTES(pbmc_reg); + void *in; + int err; + int i; + + in = kzalloc(sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + err = mlx5e_port_query_pbmc(mdev, in); + if (err) + goto out; + + for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + u64 size = port_buffer->buffer[i].size; + u64 xoff = port_buffer->buffer[i].xoff; + u64 xon = port_buffer->buffer[i].xon; + + do_div(size, port_buff_cell_sz); + do_div(xoff, port_buff_cell_sz); + do_div(xon, port_buff_cell_sz); + MLX5_SET(bufferx_reg, buffer, size, size); + MLX5_SET(bufferx_reg, buffer, lossy, port_buffer->buffer[i].lossy); + MLX5_SET(bufferx_reg, buffer, xoff_threshold, xoff); + MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); + } + + err = mlx5e_port_set_pbmc(mdev, in); +out: + kfree(in); + return err; +} + +/* xoff = ((301+2.16 * len [m]) * speed [Gbps] + 2.72 MTU [B]) + * minimum speed value is 40Gbps + */ +static u32 calculate_xoff(struct mlx5e_priv *priv, unsigned int mtu) +{ + u32 speed; + u32 xoff; + int err; + + err = mlx5e_port_linkspeed(priv->mdev, &speed); + if (err) + speed = SPEED_40000; + speed = max_t(u32, speed, SPEED_40000); + + xoff = (301 + 216 * 
priv->dcbx.cable_len / 100) * speed / 1000 + 272 * mtu / 100; + + mlx5e_dbg(HW, priv, "%s: xoff=%d\n", __func__, xoff); + return xoff; +} + +static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, + u32 xoff, unsigned int max_mtu, u16 port_buff_cell_sz) +{ + int i; + + for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + if (port_buffer->buffer[i].lossy) { + port_buffer->buffer[i].xoff = 0; + port_buffer->buffer[i].xon = 0; + continue; + } + + if (port_buffer->buffer[i].size < + (xoff + max_mtu + port_buff_cell_sz)) { + pr_err("buffer_size[%d]=%d is not enough for lossless buffer\n", + i, port_buffer->buffer[i].size); + return -ENOMEM; + } + + port_buffer->buffer[i].xoff = port_buffer->buffer[i].size - xoff; + port_buffer->buffer[i].xon = + port_buffer->buffer[i].xoff - max_mtu; + } + + return 0; +} + +/** + * update_buffer_lossy - Update buffer configuration based on pfc + * @max_mtu: netdev's max_mtu + * @pfc_en: current pfc configuration + * @buffer: current prio to buffer mapping + * @xoff: xoff value + * @port_buff_cell_sz: port buffer cell_size + * @port_buffer: port receive buffer configuration + * @change: + * + * Update buffer configuration based on pfc configuration and + * priority to buffer mapping. + * Buffer's lossy bit is changed to: + * lossless if there is at least one PFC enabled priority + * mapped to this buffer lossy if all priorities mapped to + * this buffer are PFC disabled + * + * @return: 0 if no error, + * sets change to true if buffer configuration was modified. + */ +static int update_buffer_lossy(unsigned int max_mtu, + u8 pfc_en, u8 *buffer, u32 xoff, u16 port_buff_cell_sz, + struct mlx5e_port_buffer *port_buffer, + bool *change) +{ + bool changed = false; + u8 lossy_count; + u8 prio_count; + u8 lossy; + int prio; + int err; + int i; + + for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + prio_count = 0; + lossy_count = 0; + + for (prio = 0; prio < MLX5E_MAX_PRIORITY; prio++) { + if (buffer[prio] != i) + continue; + + prio_count++; + lossy_count += !(pfc_en & (1 << prio)); + } + + if (lossy_count == prio_count) + lossy = 1; + else /* lossy_count < prio_count */ + lossy = 0; + + if (lossy != port_buffer->buffer[i].lossy) { + port_buffer->buffer[i].lossy = lossy; + changed = true; + } + } + + if (changed) { + err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); + if (err) + return err; + + *change = true; + } + + return 0; +} + +static int fill_pfc_en(struct mlx5_core_dev *mdev, u8 *pfc_en) +{ + u32 g_rx_pause, g_tx_pause; + int err; + + err = mlx5_query_port_pause(mdev, &g_rx_pause, &g_tx_pause); + if (err) + return err; + + /* If global pause enabled, set all active buffers to lossless. + * Otherwise, check PFC setting. 
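+	 * pfc_en is a per-priority bitmap, so 0xff treats every priority as
+	 * PFC-enabled and keeps all mapped buffers lossless.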
+ */ + if (g_rx_pause || g_tx_pause) + *pfc_en = 0xff; + else + err = mlx5_query_port_pfc(mdev, pfc_en, NULL); + + return err; +} + +#define MINIMUM_MAX_MTU 9216 +int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, + u32 change, unsigned int mtu, + struct ieee_pfc *pfc, + u32 *buffer_size, + u8 *prio2buffer) +{ + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; + struct mlx5e_port_buffer port_buffer; + u32 xoff = calculate_xoff(priv, mtu); + bool update_prio2buffer = false; + u8 buffer[MLX5E_MAX_PRIORITY]; + bool update_buffer = false; + unsigned int max_mtu; + u32 total_used = 0; + u8 curr_pfc_en; + int err; + int i; + + mlx5e_dbg(HW, priv, "%s: change=%x\n", __func__, change); + max_mtu = max_t(unsigned int, priv->netdev->max_mtu, MINIMUM_MAX_MTU); + + err = mlx5e_port_query_buffer(priv, &port_buffer); + if (err) + return err; + + if (change & MLX5E_PORT_BUFFER_CABLE_LEN) { + update_buffer = true; + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); + if (err) + return err; + } + + if (change & MLX5E_PORT_BUFFER_PFC) { + err = mlx5e_port_query_priority2buffer(priv->mdev, buffer); + if (err) + return err; + + err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff, port_buff_cell_sz, + &port_buffer, &update_buffer); + if (err) + return err; + } + + if (change & MLX5E_PORT_BUFFER_PRIO2BUFFER) { + update_prio2buffer = true; + err = fill_pfc_en(priv->mdev, &curr_pfc_en); + if (err) + return err; + + err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer, xoff, + port_buff_cell_sz, &port_buffer, &update_buffer); + if (err) + return err; + } + + if (change & MLX5E_PORT_BUFFER_SIZE) { + for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + mlx5e_dbg(HW, priv, "%s: buffer[%d]=%d\n", __func__, i, buffer_size[i]); + if (!port_buffer.buffer[i].lossy && !buffer_size[i]) { + mlx5e_dbg(HW, priv, "%s: lossless buffer[%d] size cannot be zero\n", + __func__, i); + return -EINVAL; + } + + port_buffer.buffer[i].size = buffer_size[i]; + total_used += buffer_size[i]; + } + + mlx5e_dbg(HW, priv, "%s: total buffer requested=%d\n", __func__, total_used); + + if (total_used > port_buffer.port_buffer_size) + return -EINVAL; + + update_buffer = true; + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); + if (err) + return err; + } + + /* Need to update buffer configuration if xoff value is changed */ + if (!update_buffer && xoff != priv->dcbx.xoff) { + update_buffer = true; + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); + if (err) + return err; + } + priv->dcbx.xoff = xoff; + + /* Apply the settings */ + if (update_buffer) { + err = port_set_buffer(priv, &port_buffer); + if (err) + return err; + } + + if (update_prio2buffer) + err = mlx5e_port_set_priority2buffer(priv->mdev, prio2buffer); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h new file mode 100644 index 0000000..80af7a5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __MLX5_EN_PORT_BUFFER_H__ +#define __MLX5_EN_PORT_BUFFER_H__ + +#include "en.h" +#include "port.h" + +#define MLX5E_MAX_BUFFER 8 +#define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */ + +#define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \ + MLX5_CAP_PCAM_REG(mdev, pbmc) && \ + MLX5_CAP_PCAM_REG(mdev, pptb)) + +enum { + MLX5E_PORT_BUFFER_CABLE_LEN = BIT(0), + MLX5E_PORT_BUFFER_PFC = BIT(1), + MLX5E_PORT_BUFFER_PRIO2BUFFER = BIT(2), + MLX5E_PORT_BUFFER_SIZE = BIT(3), +}; + +struct mlx5e_bufferx_reg { + u8 lossy; + u8 epsb; + u32 size; + u32 xoff; + u32 xon; +}; + +struct mlx5e_port_buffer { + u32 port_buffer_size; + u32 spare_buffer_size; + struct mlx5e_bufferx_reg buffer[MLX5E_MAX_BUFFER]; +}; + +int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, + u32 change, unsigned int mtu, + struct ieee_pfc *pfc, + u32 *buffer_size, + u8 *prio2buffer); + +int mlx5e_port_query_buffer(struct mlx5e_priv *priv, + struct mlx5e_port_buffer *port_buffer); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c new file mode 100644 index 0000000..3da3356 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -0,0 +1,837 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2020 Mellanox Technologies + +#include "en/ptp.h" +#include "en/txrx.h" +#include "en/params.h" +#include "en/fs_tt_redirect.h" + +struct mlx5e_ptp_fs { + struct mlx5_flow_handle *l2_rule; + struct mlx5_flow_handle *udp_v4_rule; + struct mlx5_flow_handle *udp_v6_rule; + bool valid; +}; + +struct mlx5e_ptp_params { + struct mlx5e_params params; + struct mlx5e_sq_param txq_sq_param; + struct mlx5e_rq_param rq_param; +}; + +struct mlx5e_skb_cb_hwtstamp { + ktime_t cqe_hwtstamp; + ktime_t port_hwtstamp; +}; + +void mlx5e_skb_cb_hwtstamp_init(struct sk_buff *skb) +{ + memset(skb->cb, 0, sizeof(struct mlx5e_skb_cb_hwtstamp)); +} + +static struct mlx5e_skb_cb_hwtstamp *mlx5e_skb_cb_get_hwts(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct mlx5e_skb_cb_hwtstamp) > sizeof(skb->cb)); + return (struct mlx5e_skb_cb_hwtstamp *)skb->cb; +} + +static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb, + struct 
mlx5e_ptp_cq_stats *cq_stats) +{ + struct skb_shared_hwtstamps hwts = {}; + ktime_t diff; + + diff = abs(mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp - + mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp); + + /* Maximal allowed diff is 1 / 128 second */ + if (diff > (NSEC_PER_SEC >> 7)) { + cq_stats->abort++; + cq_stats->abort_abs_diff_ns += diff; + return; + } + + hwts.hwtstamp = mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp; + skb_tstamp_tx(skb, &hwts); +} + +void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type, + ktime_t hwtstamp, + struct mlx5e_ptp_cq_stats *cq_stats) +{ + switch (hwtstamp_type) { + case (MLX5E_SKB_CB_CQE_HWTSTAMP): + mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp = hwtstamp; + break; + case (MLX5E_SKB_CB_PORT_HWTSTAMP): + mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp = hwtstamp; + break; + } + + /* If both CQEs arrive, check and report the port tstamp, and clear skb cb as + * skb soon to be released. + */ + if (!mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp || + !mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp) + return; + + mlx5e_skb_cb_hwtstamp_tx(skb, cq_stats); + memset(skb->cb, 0, sizeof(struct mlx5e_skb_cb_hwtstamp)); +} + +#define MLX5E_PTP_WQE_CTR_MASK(val) ((val) & ptpsq->ts_cqe_ctr_mask) +static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, + struct mlx5_cqe64 *cqe, + int budget) +{ + struct mlx5e_txqsq *sq; + struct sk_buff *skb; + ktime_t hwtstamp; + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo); + ptpsq->cq_stats->err_cqe++; + goto out; + } + + sq = &ptpsq->txqsq; + if (MLX5_CAP_GEN_2(sq->mdev, ts_cqe_metadata_size2wqe_counter)) { + u16 ts_cqe_counter = MLX5E_PTP_WQE_CTR_MASK(be16_to_cpu(cqe->wqe_counter)); + u16 skb_fifo_cc_masked = MLX5E_PTP_WQE_CTR_MASK(ptpsq->skb_fifo_cc); + + if (skb_fifo_cc_masked != ts_cqe_counter) { + ptpsq->cq_stats->resync_event++; + do { + struct skb_shared_hwtstamps hwts = {}; + + skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo); + hwts.hwtstamp = mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp; + skb_tstamp_tx(skb, &hwts); + ptpsq->cq_stats->resync_cqe++; + skb_fifo_cc_masked = MLX5E_PTP_WQE_CTR_MASK(ptpsq->skb_fifo_cc); + } while (skb_fifo_cc_masked != ts_cqe_counter); + } + } + skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo); + hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe)); + mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_PORT_HWTSTAMP, + hwtstamp, ptpsq->cq_stats); + ptpsq->cq_stats->cqe++; + +out: + napi_consume_skb(skb, budget); +} + +static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) +{ + struct mlx5e_ptpsq *ptpsq = container_of(cq, struct mlx5e_ptpsq, ts_cq); + struct mlx5_cqwq *cqwq = &cq->wq; + struct mlx5_cqe64 *cqe; + int work_done = 0; + + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))) + return false; + + cqe = mlx5_cqwq_get_cqe(cqwq); + if (!cqe) + return false; + + do { + mlx5_cqwq_pop(cqwq); + + mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, budget); + } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); + + mlx5_cqwq_update_db_record(cqwq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + return work_done == budget; +} + +static int mlx5e_ptp_napi_poll(struct napi_struct *napi, int budget) +{ + struct mlx5e_ptp *c = container_of(napi, struct mlx5e_ptp, napi); + struct mlx5e_ch_stats *ch_stats = c->stats; + struct mlx5e_rq *rq = &c->rq; + bool busy = false; + int work_done = 0; + int i; + + rcu_read_lock(); + + ch_stats->poll++; + + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) { + for (i = 0; i < 
c->num_tc; i++) { + busy |= mlx5e_poll_tx_cq(&c->ptpsq[i].txqsq.cq, budget); + busy |= mlx5e_ptp_poll_ts_cq(&c->ptpsq[i].ts_cq, budget); + } + } + if (test_bit(MLX5E_PTP_STATE_RX, c->state) && likely(budget)) { + work_done = mlx5e_poll_rx_cq(&rq->cq, budget); + busy |= work_done == budget; + busy |= INDIRECT_CALL_2(rq->post_wqes, + mlx5e_post_rx_mpwqes, + mlx5e_post_rx_wqes, + rq); + } + + if (busy) { + work_done = budget; + goto out; + } + + if (unlikely(!napi_complete_done(napi, work_done))) + goto out; + + ch_stats->arm++; + + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) { + for (i = 0; i < c->num_tc; i++) { + mlx5e_cq_arm(&c->ptpsq[i].txqsq.cq); + mlx5e_cq_arm(&c->ptpsq[i].ts_cq); + } + } + if (test_bit(MLX5E_PTP_STATE_RX, c->state)) + mlx5e_cq_arm(&rq->cq); + +out: + rcu_read_unlock(); + + return work_done; +} + +static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix, + struct mlx5e_params *params, + struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, int tc, + struct mlx5e_ptpsq *ptpsq) +{ + void *sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq); + struct mlx5_core_dev *mdev = c->mdev; + struct mlx5_wq_cyc *wq = &sq->wq; + int err; + int node; + + sq->pdev = c->pdev; + sq->clock = &mdev->clock; + sq->mkey_be = c->mkey_be; + sq->netdev = c->netdev; + sq->priv = c->priv; + sq->mdev = mdev; + sq->ch_ix = MLX5E_PTP_CHANNEL_IX; + sq->txq_ix = txq_ix; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->min_inline_mode = params->tx_min_inline_mode; + sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + sq->stats = &c->priv->ptp_stats.sq[tc]; + sq->ptpsq = ptpsq; + INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); + if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) + set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); + sq->stop_room = param->stop_room; + sq->ptp_cyc2time = mlx5_sq_ts_translator(mdev); + + if (MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter)) + ptpsq->ts_cqe_ctr_mask = + (1 << MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter)) - 1; + + node = dev_to_node(mlx5_core_dma_dev(mdev)); + + param->wq.db_numa_node = node; + err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl); + if (err) + return err; + wq->db = &wq->db[MLX5_SND_DBR]; + + err = mlx5e_alloc_txqsq_db(sq, node); + if (err) + goto err_sq_wq_destroy; + + return 0; + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +static void mlx5e_ptp_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn) +{ + mlx5_core_destroy_sq(mdev, sqn); +} + +static int mlx5e_ptp_alloc_traffic_db(struct mlx5e_ptpsq *ptpsq, int numa) +{ + int wq_sz = mlx5_wq_cyc_get_size(&ptpsq->txqsq.wq); + + ptpsq->skb_fifo.fifo = kvzalloc_node(array_size(wq_sz, sizeof(*ptpsq->skb_fifo.fifo)), + GFP_KERNEL, numa); + if (!ptpsq->skb_fifo.fifo) + return -ENOMEM; + + ptpsq->skb_fifo.pc = &ptpsq->skb_fifo_pc; + ptpsq->skb_fifo.cc = &ptpsq->skb_fifo_cc; + ptpsq->skb_fifo.mask = wq_sz - 1; + + return 0; +} + +static void mlx5e_ptp_drain_skb_fifo(struct mlx5e_skb_fifo *skb_fifo) +{ + while (*skb_fifo->pc != *skb_fifo->cc) { + struct sk_buff *skb = mlx5e_skb_fifo_pop(skb_fifo); + + dev_kfree_skb_any(skb); + } +} + +static void mlx5e_ptp_free_traffic_db(struct mlx5e_skb_fifo *skb_fifo) +{ + mlx5e_ptp_drain_skb_fifo(skb_fifo); + kvfree(skb_fifo->fifo); +} + +static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn, + int txq_ix, struct mlx5e_ptp_params *cparams, + int tc, struct mlx5e_ptpsq *ptpsq) +{ + struct mlx5e_sq_param *sqp = &cparams->txq_sq_param; + struct mlx5e_txqsq *txqsq = &ptpsq->txqsq; + struct 
mlx5e_create_sq_param csp = {}; + int err; + + err = mlx5e_ptp_alloc_txqsq(c, txq_ix, &cparams->params, sqp, + txqsq, tc, ptpsq); + if (err) + return err; + + csp.tisn = tisn; + csp.tis_lst_sz = 1; + csp.cqn = txqsq->cq.mcq.cqn; + csp.wq_ctrl = &txqsq->wq_ctrl; + csp.min_inline_mode = txqsq->min_inline_mode; + csp.ts_cqe_to_dest_cqn = ptpsq->ts_cq.mcq.cqn; + + err = mlx5e_create_sq_rdy(c->mdev, sqp, &csp, 0, &txqsq->sqn); + if (err) + goto err_free_txqsq; + + err = mlx5e_ptp_alloc_traffic_db(ptpsq, + dev_to_node(mlx5_core_dma_dev(c->mdev))); + if (err) + goto err_free_txqsq; + + return 0; + +err_free_txqsq: + mlx5e_free_txqsq(txqsq); + + return err; +} + +static void mlx5e_ptp_close_txqsq(struct mlx5e_ptpsq *ptpsq) +{ + struct mlx5e_txqsq *sq = &ptpsq->txqsq; + struct mlx5_core_dev *mdev = sq->mdev; + + mlx5e_ptp_free_traffic_db(&ptpsq->skb_fifo); + cancel_work_sync(&sq->recover_work); + mlx5e_ptp_destroy_sq(mdev, sq->sqn); + mlx5e_free_txqsq_descs(sq); + mlx5e_free_txqsq(sq); +} + +static int mlx5e_ptp_open_txqsqs(struct mlx5e_ptp *c, + struct mlx5e_ptp_params *cparams) +{ + struct mlx5e_params *params = &cparams->params; + u8 num_tc = mlx5e_get_dcb_num_tc(params); + int ix_base; + int err; + int tc; + + ix_base = num_tc * params->num_channels; + + for (tc = 0; tc < num_tc; tc++) { + int txq_ix = ix_base + tc; + + err = mlx5e_ptp_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix, + cparams, tc, &c->ptpsq[tc]); + if (err) + goto close_txqsq; + } + + return 0; + +close_txqsq: + for (--tc; tc >= 0; tc--) + mlx5e_ptp_close_txqsq(&c->ptpsq[tc]); + + return err; +} + +static void mlx5e_ptp_close_txqsqs(struct mlx5e_ptp *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_ptp_close_txqsq(&c->ptpsq[tc]); +} + +static int mlx5e_ptp_open_tx_cqs(struct mlx5e_ptp *c, + struct mlx5e_ptp_params *cparams) +{ + struct mlx5e_params *params = &cparams->params; + struct mlx5e_create_cq_param ccp = {}; + struct dim_cq_moder ptp_moder = {}; + struct mlx5e_cq_param *cq_param; + u8 num_tc; + int err; + int tc; + + num_tc = mlx5e_get_dcb_num_tc(params); + + ccp.node = dev_to_node(mlx5_core_dma_dev(c->mdev)); + ccp.ch_stats = c->stats; + ccp.napi = &c->napi; + ccp.ix = MLX5E_PTP_CHANNEL_IX; + + cq_param = &cparams->txq_sq_param.cqp; + + for (tc = 0; tc < num_tc; tc++) { + struct mlx5e_cq *cq = &c->ptpsq[tc].txqsq.cq; + + err = mlx5e_open_cq(c->priv, ptp_moder, cq_param, &ccp, cq); + if (err) + goto out_err_txqsq_cq; + } + + for (tc = 0; tc < num_tc; tc++) { + struct mlx5e_cq *cq = &c->ptpsq[tc].ts_cq; + struct mlx5e_ptpsq *ptpsq = &c->ptpsq[tc]; + + err = mlx5e_open_cq(c->priv, ptp_moder, cq_param, &ccp, cq); + if (err) + goto out_err_ts_cq; + + ptpsq->cq_stats = &c->priv->ptp_stats.cq[tc]; + } + + return 0; + +out_err_ts_cq: + for (--tc; tc >= 0; tc--) + mlx5e_close_cq(&c->ptpsq[tc].ts_cq); + tc = num_tc; +out_err_txqsq_cq: + for (--tc; tc >= 0; tc--) + mlx5e_close_cq(&c->ptpsq[tc].txqsq.cq); + + return err; +} + +static int mlx5e_init_ptp_rq(struct mlx5e_ptp *c, struct mlx5e_params *params, + struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = c->mdev; + struct mlx5e_priv *priv = c->priv; + int err; + + rq->wq_type = params->rq_wq_type; + rq->pdev = c->pdev; + rq->netdev = priv->netdev; + rq->priv = priv; + rq->clock = &mdev->clock; + rq->tstamp = &priv->tstamp; + rq->mdev = mdev; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->stats = &c->priv->ptp_stats.rq; + rq->ix = MLX5E_PTP_CHANNEL_IX; + rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); + err = mlx5e_rq_set_handlers(rq, 
params, false); + if (err) + return err; + + return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, 0); +} + +static int mlx5e_ptp_open_rq(struct mlx5e_ptp *c, + struct mlx5e_ptp_params *cparams) +{ + struct mlx5e_rq_param *rq_param = &cparams->rq_param; + struct mlx5e_params *params = &cparams->params; + struct mlx5e_create_cq_param ccp = {}; + struct dim_cq_moder moder = {}; + int err; + + ccp.node = dev_to_node(mlx5_core_dma_dev(c->mdev)); + ccp.ch_stats = c->stats; + ccp.napi = &c->napi; + ccp.ix = MLX5E_PTP_CHANNEL_IX; + + err = mlx5e_init_ptp_rq(c, params, &c->rq); + if (err) + return err; + + return mlx5e_open_rq(c->priv, params, rq_param, NULL, &ccp, moder, ccp.node, &c->rq); + +} + +static void mlx5e_ptp_close_tx_cqs(struct mlx5e_ptp *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_cq(&c->ptpsq[tc].ts_cq); + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_cq(&c->ptpsq[tc].txqsq.cq); +} + +static void mlx5e_ptp_build_sq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_sq_param *param) +{ + void *sqc = param->sqc; + void *wq; + + mlx5e_build_sq_param_common(mdev, param); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); + param->stop_room = mlx5e_stop_room_for_max_wqe(mdev); + mlx5e_build_tx_cq_param(mdev, params, ¶m->cqp); +} + +static void mlx5e_ptp_build_rq_param(struct mlx5_core_dev *mdev, + struct net_device *netdev, + u16 q_counter, + struct mlx5e_ptp_params *ptp_params) +{ + struct mlx5e_rq_param *rq_params = &ptp_params->rq_param; + struct mlx5e_params *params = &ptp_params->params; + + params->rq_wq_type = MLX5_WQ_TYPE_CYCLIC; + mlx5e_init_rq_type_params(mdev, params); + params->sw_mtu = netdev->max_mtu; + mlx5e_build_rq_param(mdev, params, NULL, q_counter, rq_params); +} + +static void mlx5e_ptp_build_params(struct mlx5e_ptp *c, + struct mlx5e_ptp_params *cparams, + struct mlx5e_params *orig) +{ + struct mlx5e_params *params = &cparams->params; + + params->tx_min_inline_mode = orig->tx_min_inline_mode; + params->num_channels = orig->num_channels; + params->hard_mtu = orig->hard_mtu; + params->sw_mtu = orig->sw_mtu; + params->mqprio = orig->mqprio; + + /* SQ */ + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) { + params->log_sq_size = orig->log_sq_size; + mlx5e_ptp_build_sq_param(c->mdev, params, &cparams->txq_sq_param); + } + /* RQ */ + if (test_bit(MLX5E_PTP_STATE_RX, c->state)) { + params->vlan_strip_disable = orig->vlan_strip_disable; + mlx5e_ptp_build_rq_param(c->mdev, c->netdev, c->priv->q_counter, cparams); + } +} + +static int mlx5e_ptp_open_queues(struct mlx5e_ptp *c, + struct mlx5e_ptp_params *cparams) +{ + int err; + + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) { + err = mlx5e_ptp_open_tx_cqs(c, cparams); + if (err) + return err; + + err = mlx5e_ptp_open_txqsqs(c, cparams); + if (err) + goto close_tx_cqs; + } + if (test_bit(MLX5E_PTP_STATE_RX, c->state)) { + err = mlx5e_ptp_open_rq(c, cparams); + if (err) + goto close_txqsq; + } + return 0; + +close_txqsq: + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) + mlx5e_ptp_close_txqsqs(c); +close_tx_cqs: + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) + mlx5e_ptp_close_tx_cqs(c); + + return err; +} + +static void mlx5e_ptp_close_queues(struct mlx5e_ptp *c) +{ + if (test_bit(MLX5E_PTP_STATE_RX, c->state)) + mlx5e_close_rq(c->priv, &c->rq); + + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) { + mlx5e_ptp_close_txqsqs(c); + mlx5e_ptp_close_tx_cqs(c); + } +} + +static int mlx5e_ptp_set_state(struct mlx5e_ptp *c, struct mlx5e_params 
*params) +{ + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_TX_PORT_TS)) + __set_bit(MLX5E_PTP_STATE_TX, c->state); + + if (params->ptp_rx) + __set_bit(MLX5E_PTP_STATE_RX, c->state); + + return bitmap_empty(c->state, MLX5E_PTP_STATE_NUM_STATES) ? -EINVAL : 0; +} + +static void mlx5e_ptp_rx_unset_fs(struct mlx5e_priv *priv) +{ + struct mlx5e_ptp_fs *ptp_fs = priv->fs.ptp_fs; + + if (!ptp_fs->valid) + return; + + mlx5e_fs_tt_redirect_del_rule(ptp_fs->l2_rule); + mlx5e_fs_tt_redirect_any_destroy(priv); + + mlx5e_fs_tt_redirect_del_rule(ptp_fs->udp_v6_rule); + mlx5e_fs_tt_redirect_del_rule(ptp_fs->udp_v4_rule); + mlx5e_fs_tt_redirect_udp_destroy(priv); + ptp_fs->valid = false; +} + +static int mlx5e_ptp_rx_set_fs(struct mlx5e_priv *priv) +{ + u32 tirn = mlx5e_rx_res_get_tirn_ptp(priv->rx_res); + struct mlx5e_ptp_fs *ptp_fs = priv->fs.ptp_fs; + struct mlx5_flow_handle *rule; + int err; + + if (ptp_fs->valid) + return 0; + + err = mlx5e_fs_tt_redirect_udp_create(priv); + if (err) + goto out_free; + + rule = mlx5e_fs_tt_redirect_udp_add_rule(priv, MLX5_TT_IPV4_UDP, + tirn, PTP_EV_PORT); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out_destroy_fs_udp; + } + ptp_fs->udp_v4_rule = rule; + + rule = mlx5e_fs_tt_redirect_udp_add_rule(priv, MLX5_TT_IPV6_UDP, + tirn, PTP_EV_PORT); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out_destroy_udp_v4_rule; + } + ptp_fs->udp_v6_rule = rule; + + err = mlx5e_fs_tt_redirect_any_create(priv); + if (err) + goto out_destroy_udp_v6_rule; + + rule = mlx5e_fs_tt_redirect_any_add_rule(priv, tirn, ETH_P_1588); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out_destroy_fs_any; + } + ptp_fs->l2_rule = rule; + ptp_fs->valid = true; + + return 0; + +out_destroy_fs_any: + mlx5e_fs_tt_redirect_any_destroy(priv); +out_destroy_udp_v6_rule: + mlx5e_fs_tt_redirect_del_rule(ptp_fs->udp_v6_rule); +out_destroy_udp_v4_rule: + mlx5e_fs_tt_redirect_del_rule(ptp_fs->udp_v4_rule); +out_destroy_fs_udp: + mlx5e_fs_tt_redirect_udp_destroy(priv); +out_free: + return err; +} + +int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params, + u8 lag_port, struct mlx5e_ptp **cp) +{ + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_ptp_params *cparams; + struct mlx5e_ptp *c; + int err; + + + c = kvzalloc_node(sizeof(*c), GFP_KERNEL, dev_to_node(mlx5_core_dma_dev(mdev))); + cparams = kvzalloc(sizeof(*cparams), GFP_KERNEL); + if (!c || !cparams) + return -ENOMEM; + + c->priv = priv; + c->mdev = priv->mdev; + c->tstamp = &priv->tstamp; + c->pdev = mlx5_core_dma_dev(priv->mdev); + c->netdev = priv->netdev; + c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey); + c->num_tc = mlx5e_get_dcb_num_tc(params); + c->stats = &priv->ptp_stats.ch; + c->lag_port = lag_port; + + err = mlx5e_ptp_set_state(c, params); + if (err) + goto err_free; + + netif_napi_add(netdev, &c->napi, mlx5e_ptp_napi_poll, 64); + + mlx5e_ptp_build_params(c, cparams, params); + + err = mlx5e_ptp_open_queues(c, cparams); + if (unlikely(err)) + goto err_napi_del; + + if (test_bit(MLX5E_PTP_STATE_RX, c->state)) + priv->rx_ptp_opened = true; + + *cp = c; + + kvfree(cparams); + + return 0; + +err_napi_del: + netif_napi_del(&c->napi); +err_free: + kvfree(cparams); + kvfree(c); + return err; +} + +void mlx5e_ptp_close(struct mlx5e_ptp *c) +{ + mlx5e_ptp_close_queues(c); + netif_napi_del(&c->napi); + + kvfree(c); +} + +void mlx5e_ptp_enable_channel(struct mlx5e_ptp *c) +{ + napi_enable(&c->napi); +} + +void mlx5e_ptp_start_channel(struct mlx5e_ptp *c) +{ + int tc; 
+ + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) { + for (tc = 0; tc < c->num_tc; tc++) { + mlx5e_enable_txqsq(&c->ptpsq[tc].txqsq); + mlx5e_start_txqsq(&c->ptpsq[tc].txqsq); + } + } + if (test_bit(MLX5E_PTP_STATE_RX, c->state)) { + mlx5e_ptp_rx_set_fs(c->priv); + mlx5e_activate_rq(&c->rq); + mlx5e_trigger_napi_sched(&c->napi); + } +} + +void mlx5e_ptp_disable_channel(struct mlx5e_ptp *c) +{ + int tc; + + if (test_bit(MLX5E_PTP_STATE_RX, c->state)) + mlx5e_deactivate_rq(&c->rq); + + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_disable_txqsq(&c->ptpsq[tc].txqsq); +} + +void mlx5e_ptp_stop_channel(struct mlx5e_ptp *c) +{ + int tc; + + if (test_bit(MLX5E_PTP_STATE_TX, c->state)) + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_stop_txqsq(&c->ptpsq[tc].txqsq); + napi_disable(&c->napi); +} + +int mlx5e_ptp_get_rqn(struct mlx5e_ptp *c, u32 *rqn) +{ + if (!c || !test_bit(MLX5E_PTP_STATE_RX, c->state)) + return -EINVAL; + + *rqn = c->rq.rqn; + return 0; +} + +int mlx5e_ptp_alloc_rx_fs(struct mlx5e_priv *priv) +{ + struct mlx5e_ptp_fs *ptp_fs; + + if (!mlx5e_profile_feature_cap(priv->profile, PTP_RX)) + return 0; + + ptp_fs = kzalloc(sizeof(*ptp_fs), GFP_KERNEL); + if (!ptp_fs) + return -ENOMEM; + + priv->fs.ptp_fs = ptp_fs; + return 0; +} + +void mlx5e_ptp_free_rx_fs(struct mlx5e_priv *priv) +{ + struct mlx5e_ptp_fs *ptp_fs = priv->fs.ptp_fs; + + if (!mlx5e_profile_feature_cap(priv->profile, PTP_RX)) + return; + + mlx5e_ptp_rx_unset_fs(priv); + kfree(ptp_fs); +} + +int mlx5e_ptp_rx_manage_fs(struct mlx5e_priv *priv, bool set) +{ + struct mlx5e_ptp *c = priv->channels.ptp; + + if (!mlx5e_profile_feature_cap(priv->profile, PTP_RX)) + return 0; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return 0; + + if (set) { + if (!c || !test_bit(MLX5E_PTP_STATE_RX, c->state)) { + netdev_WARN_ONCE(priv->netdev, "Don't try to add PTP RX-FS rules"); + return -EINVAL; + } + return mlx5e_ptp_rx_set_fs(priv); + } + /* set == false */ + if (c && test_bit(MLX5E_PTP_STATE_RX, c->state)) { + netdev_WARN_ONCE(priv->netdev, "Don't try to remove PTP RX-FS rules"); + return -EINVAL; + } + mlx5e_ptp_rx_unset_fs(priv); + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h new file mode 100644 index 0000000..0059a3d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. 
*/ + +#ifndef __MLX5_EN_PTP_H__ +#define __MLX5_EN_PTP_H__ + +#include "en.h" +#include "en_stats.h" +#include "en/txrx.h" +#include + +#define MLX5E_PTP_CHANNEL_IX 0 + +struct mlx5e_ptpsq { + struct mlx5e_txqsq txqsq; + struct mlx5e_cq ts_cq; + u16 skb_fifo_cc; + u16 skb_fifo_pc; + struct mlx5e_skb_fifo skb_fifo; + struct mlx5e_ptp_cq_stats *cq_stats; + u16 ts_cqe_ctr_mask; +}; + +enum { + MLX5E_PTP_STATE_TX, + MLX5E_PTP_STATE_RX, + MLX5E_PTP_STATE_NUM_STATES, +}; + +struct mlx5e_ptp { + /* data path */ + struct mlx5e_ptpsq ptpsq[MLX5E_MAX_NUM_TC]; + struct mlx5e_rq rq; + struct napi_struct napi; + struct device *pdev; + struct net_device *netdev; + __be32 mkey_be; + u8 num_tc; + u8 lag_port; + + /* data path - accessed per napi poll */ + struct mlx5e_ch_stats *stats; + + /* control */ + struct mlx5e_priv *priv; + struct mlx5_core_dev *mdev; + struct hwtstamp_config *tstamp; + DECLARE_BITMAP(state, MLX5E_PTP_STATE_NUM_STATES); +}; + +static inline bool mlx5e_use_ptpsq(struct sk_buff *skb) +{ + struct flow_keys fk; + + if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + return false; + + if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) + return false; + + if (fk.basic.n_proto == htons(ETH_P_1588)) + return true; + + if (fk.basic.n_proto != htons(ETH_P_IP) && + fk.basic.n_proto != htons(ETH_P_IPV6)) + return false; + + return (fk.basic.ip_proto == IPPROTO_UDP && + fk.ports.dst == htons(PTP_EV_PORT)); +} + +static inline bool mlx5e_ptpsq_fifo_has_room(struct mlx5e_txqsq *sq) +{ + if (!sq->ptpsq) + return true; + + return mlx5e_skb_fifo_has_room(&sq->ptpsq->skb_fifo); +} + +int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params, + u8 lag_port, struct mlx5e_ptp **cp); +void mlx5e_ptp_close(struct mlx5e_ptp *c); +void mlx5e_ptp_enable_channel(struct mlx5e_ptp *c); +void mlx5e_ptp_start_channel(struct mlx5e_ptp *c); +void mlx5e_ptp_disable_channel(struct mlx5e_ptp *c); +void mlx5e_ptp_stop_channel(struct mlx5e_ptp *c); +int mlx5e_ptp_get_rqn(struct mlx5e_ptp *c, u32 *rqn); +int mlx5e_ptp_alloc_rx_fs(struct mlx5e_priv *priv); +void mlx5e_ptp_free_rx_fs(struct mlx5e_priv *priv); +int mlx5e_ptp_rx_manage_fs(struct mlx5e_priv *priv, bool set); + +enum { + MLX5E_SKB_CB_CQE_HWTSTAMP = BIT(0), + MLX5E_SKB_CB_PORT_HWTSTAMP = BIT(1), +}; + +void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type, + ktime_t hwtstamp, + struct mlx5e_ptp_cq_stats *cq_stats); + +void mlx5e_skb_cb_hwtstamp_init(struct sk_buff *skb); +#endif /* __MLX5_EN_PTP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c new file mode 100644 index 0000000..a5ec67e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -0,0 +1,1140 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ +#include + +#include "en.h" +#include "params.h" +#include "../qos.h" + +#define BYTES_IN_MBIT 125000 + +int mlx5e_qos_bytes_rate_check(struct mlx5_core_dev *mdev, u64 nbytes) +{ + if (nbytes < BYTES_IN_MBIT) { + qos_warn(mdev, "Input rate (%llu Bytes/sec) below minimum supported (%u Bytes/sec)\n", + nbytes, BYTES_IN_MBIT); + return -EINVAL; + } + return 0; +} + +static u32 mlx5e_qos_bytes2mbits(struct mlx5_core_dev *mdev, u64 nbytes) +{ + return div_u64(nbytes, BYTES_IN_MBIT); +} + +int mlx5e_qos_max_leaf_nodes(struct mlx5_core_dev *mdev) +{ + return min(MLX5E_QOS_MAX_LEAF_NODES, mlx5_qos_max_leaf_nodes(mdev)); +} + +int mlx5e_qos_cur_leaf_nodes(struct mlx5e_priv *priv) +{ + int last = find_last_bit(priv->htb.qos_used_qids, mlx5e_qos_max_leaf_nodes(priv->mdev)); + + return last == mlx5e_qos_max_leaf_nodes(priv->mdev) ? 0 : last + 1; +} + +/* Software representation of the QoS tree (internal to this file) */ + +static int mlx5e_find_unused_qos_qid(struct mlx5e_priv *priv) +{ + int size = mlx5e_qos_max_leaf_nodes(priv->mdev); + int res; + + WARN_ONCE(!mutex_is_locked(&priv->state_lock), "%s: state_lock is not held\n", __func__); + res = find_first_zero_bit(priv->htb.qos_used_qids, size); + + return res == size ? -ENOSPC : res; +} + +struct mlx5e_qos_node { + struct hlist_node hnode; + struct mlx5e_qos_node *parent; + u64 rate; + u32 bw_share; + u32 max_average_bw; + u32 hw_id; + u32 classid; /* 16-bit, except root. */ + u16 qid; +}; + +#define MLX5E_QOS_QID_INNER 0xffff +#define MLX5E_HTB_CLASSID_ROOT 0xffffffff + +static struct mlx5e_qos_node * +mlx5e_sw_node_create_leaf(struct mlx5e_priv *priv, u16 classid, u16 qid, + struct mlx5e_qos_node *parent) +{ + struct mlx5e_qos_node *node; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return ERR_PTR(-ENOMEM); + + node->parent = parent; + + node->qid = qid; + __set_bit(qid, priv->htb.qos_used_qids); + + node->classid = classid; + hash_add_rcu(priv->htb.qos_tc2node, &node->hnode, classid); + + mlx5e_update_tx_netdev_queues(priv); + + return node; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_create_root(struct mlx5e_priv *priv) +{ + struct mlx5e_qos_node *node; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return ERR_PTR(-ENOMEM); + + node->qid = MLX5E_QOS_QID_INNER; + node->classid = MLX5E_HTB_CLASSID_ROOT; + hash_add_rcu(priv->htb.qos_tc2node, &node->hnode, node->classid); + + return node; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_find(struct mlx5e_priv *priv, u32 classid) +{ + struct mlx5e_qos_node *node = NULL; + + hash_for_each_possible(priv->htb.qos_tc2node, node, hnode, classid) { + if (node->classid == classid) + break; + } + + return node; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_find_rcu(struct mlx5e_priv *priv, u32 classid) +{ + struct mlx5e_qos_node *node = NULL; + + hash_for_each_possible_rcu(priv->htb.qos_tc2node, node, hnode, classid) { + if (node->classid == classid) + break; + } + + return node; +} + +static void mlx5e_sw_node_delete(struct mlx5e_priv *priv, struct mlx5e_qos_node *node) +{ + hash_del_rcu(&node->hnode); + if (node->qid != MLX5E_QOS_QID_INNER) { + __clear_bit(node->qid, priv->htb.qos_used_qids); + mlx5e_update_tx_netdev_queues(priv); + } + /* Make sure this qid is no longer selected by mlx5e_select_queue, so + * that mlx5e_reactivate_qos_sq can safely restart the netdev TX queue. 
+ */ + synchronize_net(); + kfree(node); +} + +/* TX datapath API */ + +static u16 mlx5e_qid_from_qos(struct mlx5e_channels *chs, u16 qid) +{ + /* These channel params are safe to access from the datapath, because: + * 1. This function is called only after checking priv->htb.maj_id != 0, + * and the number of queues can't change while HTB offload is active. + * 2. When priv->htb.maj_id becomes 0, synchronize_rcu waits for + * mlx5e_select_queue to finish while holding priv->state_lock, + * preventing other code from changing the number of queues. + */ + bool is_ptp = MLX5E_GET_PFLAG(&chs->params, MLX5E_PFLAG_TX_PORT_TS); + + return (chs->params.num_channels + is_ptp) * mlx5e_get_dcb_num_tc(&chs->params) + qid; +} + +int mlx5e_get_txq_by_classid(struct mlx5e_priv *priv, u16 classid) +{ + struct mlx5e_qos_node *node; + u16 qid; + int res; + + rcu_read_lock(); + + node = mlx5e_sw_node_find_rcu(priv, classid); + if (!node) { + res = -ENOENT; + goto out; + } + qid = READ_ONCE(node->qid); + if (qid == MLX5E_QOS_QID_INNER) { + res = -EINVAL; + goto out; + } + res = mlx5e_qid_from_qos(&priv->channels, qid); + +out: + rcu_read_unlock(); + return res; +} + +static struct mlx5e_txqsq *mlx5e_get_qos_sq(struct mlx5e_priv *priv, int qid) +{ + struct mlx5e_params *params = &priv->channels.params; + struct mlx5e_txqsq __rcu **qos_sqs; + struct mlx5e_channel *c; + int ix; + + ix = qid % params->num_channels; + qid /= params->num_channels; + c = priv->channels.c[ix]; + + qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs); + return mlx5e_state_dereference(priv, qos_sqs[qid]); +} + +/* SQ lifecycle */ + +static int mlx5e_open_qos_sq(struct mlx5e_priv *priv, struct mlx5e_channels *chs, + struct mlx5e_qos_node *node) +{ + struct mlx5e_create_cq_param ccp = {}; + struct mlx5e_txqsq __rcu **qos_sqs; + struct mlx5e_sq_param param_sq; + struct mlx5e_cq_param param_cq; + int txq_ix, ix, qid, err = 0; + struct mlx5e_params *params; + struct mlx5e_channel *c; + struct mlx5e_txqsq *sq; + + params = &chs->params; + + txq_ix = mlx5e_qid_from_qos(chs, node->qid); + + WARN_ON(node->qid > priv->htb.max_qos_sqs); + if (node->qid == priv->htb.max_qos_sqs) { + struct mlx5e_sq_stats *stats, **stats_list = NULL; + + if (priv->htb.max_qos_sqs == 0) { + stats_list = kvcalloc(mlx5e_qos_max_leaf_nodes(priv->mdev), + sizeof(*stats_list), + GFP_KERNEL); + if (!stats_list) + return -ENOMEM; + } + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) { + kvfree(stats_list); + return -ENOMEM; + } + if (stats_list) + WRITE_ONCE(priv->htb.qos_sq_stats, stats_list); + WRITE_ONCE(priv->htb.qos_sq_stats[node->qid], stats); + /* Order max_qos_sqs increment after writing the array pointer. + * Pairs with smp_load_acquire in en_stats.c. 
+ */ + smp_store_release(&priv->htb.max_qos_sqs, priv->htb.max_qos_sqs + 1); + } + + ix = node->qid % params->num_channels; + qid = node->qid / params->num_channels; + c = chs->c[ix]; + + qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs); + sq = kzalloc(sizeof(*sq), GFP_KERNEL); + + if (!sq) + return -ENOMEM; + + mlx5e_build_create_cq_param(&ccp, c); + + memset(¶m_sq, 0, sizeof(param_sq)); + memset(¶m_cq, 0, sizeof(param_cq)); + mlx5e_build_sq_param(priv->mdev, params, ¶m_sq); + mlx5e_build_tx_cq_param(priv->mdev, params, ¶m_cq); + err = mlx5e_open_cq(priv, params->tx_cq_moderation, ¶m_cq, &ccp, &sq->cq); + if (err) + goto err_free_sq; + err = mlx5e_open_txqsq(c, priv->tisn[c->lag_port][0], txq_ix, params, + ¶m_sq, sq, 0, node->hw_id, + priv->htb.qos_sq_stats[node->qid]); + if (err) + goto err_close_cq; + + rcu_assign_pointer(qos_sqs[qid], sq); + + return 0; + +err_close_cq: + mlx5e_close_cq(&sq->cq); +err_free_sq: + kfree(sq); + return err; +} + +static void mlx5e_activate_qos_sq(struct mlx5e_priv *priv, struct mlx5e_qos_node *node) +{ + struct mlx5e_txqsq *sq; + u16 qid; + + sq = mlx5e_get_qos_sq(priv, node->qid); + + qid = mlx5e_qid_from_qos(&priv->channels, node->qid); + + /* If it's a new queue, it will be marked as started at this point. + * Stop it before updating txq2sq. + */ + mlx5e_tx_disable_queue(netdev_get_tx_queue(priv->netdev, qid)); + + priv->txq2sq[qid] = sq; + + /* Make the change to txq2sq visible before the queue is started. + * As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE, + * which pairs with this barrier. + */ + smp_wmb(); + + qos_dbg(priv->mdev, "Activate QoS SQ qid %u\n", node->qid); + mlx5e_enable_txqsq(sq); + mlx5e_start_txqsq(sq); +} + +static void mlx5e_deactivate_qos_sq(struct mlx5e_priv *priv, u16 qid) +{ + struct mlx5e_txqsq *sq; + + sq = mlx5e_get_qos_sq(priv, qid); + if (!sq) /* Handle the case when the SQ failed to open. */ + return; + + qos_dbg(priv->mdev, "Deactivate QoS SQ qid %u\n", qid); + + mlx5e_disable_txqsq(sq); + synchronize_net(); + mlx5e_stop_txqsq(sq); + + priv->txq2sq[mlx5e_qid_from_qos(&priv->channels, qid)] = NULL; + + /* Make the change to txq2sq visible before the queue is started again. + * As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE, + * which pairs with this barrier. + */ + smp_wmb(); +} + +static void mlx5e_close_qos_sq(struct mlx5e_priv *priv, u16 qid) +{ + struct mlx5e_txqsq __rcu **qos_sqs; + struct mlx5e_params *params; + struct mlx5e_channel *c; + struct mlx5e_txqsq *sq; + int ix; + + params = &priv->channels.params; + + ix = qid % params->num_channels; + qid /= params->num_channels; + c = priv->channels.c[ix]; + qos_sqs = mlx5e_state_dereference(priv, c->qos_sqs); + sq = rcu_replace_pointer(qos_sqs[qid], NULL, lockdep_is_held(&priv->state_lock)); + if (!sq) /* Handle the case when the SQ failed to open. */ + return; + + synchronize_rcu(); /* Sync with NAPI. */ + + mlx5e_close_txqsq(sq); + mlx5e_close_cq(&sq->cq); + kfree(sq); +} + +void mlx5e_qos_close_queues(struct mlx5e_channel *c) +{ + struct mlx5e_txqsq __rcu **qos_sqs; + int i; + + qos_sqs = rcu_replace_pointer(c->qos_sqs, NULL, lockdep_is_held(&c->priv->state_lock)); + if (!qos_sqs) + return; + synchronize_rcu(); /* Sync with NAPI. */ + + for (i = 0; i < c->qos_sqs_size; i++) { + struct mlx5e_txqsq *sq; + + sq = mlx5e_state_dereference(c->priv, qos_sqs[i]); + if (!sq) /* Handle the case when the SQ failed to open. 
*/ + continue; + + mlx5e_close_txqsq(sq); + mlx5e_close_cq(&sq->cq); + kfree(sq); + } + + kvfree(qos_sqs); +} + +static void mlx5e_qos_close_all_queues(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) + mlx5e_qos_close_queues(chs->c[i]); +} + +static int mlx5e_qos_alloc_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs) +{ + u16 qos_sqs_size; + int i; + + qos_sqs_size = DIV_ROUND_UP(mlx5e_qos_max_leaf_nodes(priv->mdev), chs->num); + + for (i = 0; i < chs->num; i++) { + struct mlx5e_txqsq **sqs; + + sqs = kvcalloc(qos_sqs_size, sizeof(struct mlx5e_txqsq *), GFP_KERNEL); + if (!sqs) + goto err_free; + + WRITE_ONCE(chs->c[i]->qos_sqs_size, qos_sqs_size); + smp_wmb(); /* Pairs with mlx5e_napi_poll. */ + rcu_assign_pointer(chs->c[i]->qos_sqs, sqs); + } + + return 0; + +err_free: + while (--i >= 0) { + struct mlx5e_txqsq **sqs; + + sqs = rcu_replace_pointer(chs->c[i]->qos_sqs, NULL, + lockdep_is_held(&priv->state_lock)); + + synchronize_rcu(); /* Sync with NAPI. */ + kvfree(sqs); + } + return -ENOMEM; +} + +int mlx5e_qos_open_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs) +{ + struct mlx5e_qos_node *node = NULL; + int bkt, err; + + if (!priv->htb.maj_id) + return 0; + + err = mlx5e_qos_alloc_queues(priv, chs); + if (err) + return err; + + hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) { + if (node->qid == MLX5E_QOS_QID_INNER) + continue; + err = mlx5e_open_qos_sq(priv, chs, node); + if (err) { + mlx5e_qos_close_all_queues(chs); + return err; + } + } + + return 0; +} + +void mlx5e_qos_activate_queues(struct mlx5e_priv *priv) +{ + struct mlx5e_qos_node *node = NULL; + int bkt; + + hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) { + if (node->qid == MLX5E_QOS_QID_INNER) + continue; + mlx5e_activate_qos_sq(priv, node); + } +} + +void mlx5e_qos_deactivate_queues(struct mlx5e_channel *c, bool finalize) +{ + struct mlx5e_params *params = &c->priv->channels.params; + struct mlx5e_txqsq __rcu **qos_sqs; + int i; + + qos_sqs = mlx5e_state_dereference(c->priv, c->qos_sqs); + if (!qos_sqs) + return; + + for (i = 0; i < c->qos_sqs_size; i++) { + u16 qid = params->num_channels * i + c->ix; + struct mlx5e_txqsq *sq; + + sq = mlx5e_state_dereference(c->priv, qos_sqs[i]); + if (!sq) /* Handle the case when the SQ failed to open. */ + continue; + if (finalize) { + qos_dbg(c->mdev, "Finalize QoS SQ qid %u\n", qid); + mlx5e_stop_txqsq(sq); + /* The queue is disabled, no synchronization with datapath is needed. */ + c->priv->txq2sq[mlx5e_qid_from_qos(&c->priv->channels, qid)] = NULL; + } else { + qos_dbg(c->mdev, "Deactivate QoS SQ qid %u\n", qid); + mlx5e_disable_txqsq(sq); + } + } +} + +static void mlx5e_qos_deactivate_all_queues(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) + mlx5e_qos_deactivate_queues(chs->c[i], false); + synchronize_net(); + for (i = 0; i < chs->num; i++) + mlx5e_qos_deactivate_queues(chs->c[i], true); +} + +/* HTB API */ + +int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls, + struct netlink_ext_ack *extack) +{ + struct mlx5e_select_queue_params *selq; + struct mlx5e_qos_node *root; + bool opened; + int err; + + qos_dbg(priv->mdev, "TC_HTB_CREATE handle %04x:, default :%04x\n", htb_maj_id, htb_defcls); + + if (!mlx5_qos_is_supported(priv->mdev)) { + NL_SET_ERR_MSG_MOD(extack, + "Missing QoS capabilities. Try disabling SRIOV or use a supported device."); + return -EOPNOTSUPP; + } + + /* mlx5e_htb_root_del can't fail - alloc in advance. 
*/ + priv->htb.final_selq = kvzalloc(sizeof(*priv->htb.final_selq), GFP_KERNEL); + if (!priv->htb.final_selq) + return -ENOMEM; + + opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + if (opened) { + selq = kvzalloc(sizeof(*selq), GFP_KERNEL); + if (!selq) { + err = -ENOMEM; + goto err_free_final_selq; + } + mlx5e_build_selq(selq, &priv->channels.params, true); + + err = mlx5e_qos_alloc_queues(priv, &priv->channels); + if (err) + goto err_free_selq; + } + + root = mlx5e_sw_node_create_root(priv); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto err_free_queues; + } + + err = mlx5_qos_create_root_node(priv->mdev, &root->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error. Try upgrading firmware."); + goto err_sw_node_delete; + } + + WRITE_ONCE(priv->htb.defcls, htb_defcls); + /* Order maj_id after defcls - pairs with + * mlx5e_select_queue/mlx5e_select_htb_queues. + */ + smp_store_release(&priv->htb.maj_id, htb_maj_id); + + mlx5e_replace_selq(priv, selq); + + return 0; + +err_sw_node_delete: + mlx5e_sw_node_delete(priv, root); + +err_free_queues: + if (opened) + mlx5e_qos_close_all_queues(&priv->channels); +err_free_selq: + kvfree(selq); +err_free_final_selq: + kvfree(priv->htb.final_selq); + priv->htb.final_selq = NULL; + return err; +} + +int mlx5e_htb_root_del(struct mlx5e_priv *priv) +{ + struct mlx5e_select_queue_params *selq = priv->htb.final_selq; + struct mlx5e_qos_node *root; + int err; + + qos_dbg(priv->mdev, "TC_HTB_DESTROY\n"); + + if (!WARN_ON(!selq)) { + /* Wait until real_num_tx_queues is updated for mlx5e_select_queue, + * so that we can safely switch to its non-HTB non-PTP fastpath. + */ + synchronize_net(); + + mlx5e_build_selq(selq, &priv->channels.params, false); + mlx5e_replace_selq(priv, selq); + } + priv->htb.final_selq = NULL; + + WRITE_ONCE(priv->htb.maj_id, 0); + + root = mlx5e_sw_node_find(priv, MLX5E_HTB_CLASSID_ROOT); + if (!root) { + qos_err(priv->mdev, "Failed to find the root node in the QoS tree\n"); + return -ENOENT; + } + err = mlx5_qos_destroy_node(priv->mdev, root->hw_id); + if (err) + qos_err(priv->mdev, "Failed to destroy root node %u, err = %d\n", + root->hw_id, err); + mlx5e_sw_node_delete(priv, root); + + mlx5e_qos_deactivate_all_queues(&priv->channels); + mlx5e_qos_close_all_queues(&priv->channels); + + return err; +} + +static int mlx5e_htb_convert_rate(struct mlx5e_priv *priv, u64 rate, + struct mlx5e_qos_node *parent, u32 *bw_share) +{ + u64 share = 0; + + while (parent->classid != MLX5E_HTB_CLASSID_ROOT && !parent->max_average_bw) + parent = parent->parent; + + if (parent->max_average_bw) + share = div64_u64(div_u64(rate * 100, BYTES_IN_MBIT), + parent->max_average_bw); + else + share = 101; + + *bw_share = share == 0 ? 1 : share > 100 ? 0 : share; + + qos_dbg(priv->mdev, "Convert: rate %llu, parent ceil %llu -> bw_share %u\n", + rate, (u64)parent->max_average_bw * BYTES_IN_MBIT, *bw_share); + + return 0; +} + +static void mlx5e_htb_convert_ceil(struct mlx5e_priv *priv, u64 ceil, u32 *max_average_bw) +{ + /* Hardware treats 0 as "unlimited", set at least 1. 
*/ + *max_average_bw = max_t(u32, div_u64(ceil, BYTES_IN_MBIT), 1); + + qos_dbg(priv->mdev, "Convert: ceil %llu -> max_average_bw %u\n", + ceil, *max_average_bw); +} + +int mlx5e_htb_leaf_alloc_queue(struct mlx5e_priv *priv, u16 classid, + u32 parent_classid, u64 rate, u64 ceil, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node, *parent; + int qid; + int err; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_ALLOC_QUEUE classid %04x, parent %04x, rate %llu, ceil %llu\n", + classid, parent_classid, rate, ceil); + + qid = mlx5e_find_unused_qos_qid(priv); + if (qid < 0) { + NL_SET_ERR_MSG_MOD(extack, "Maximum amount of leaf classes is reached."); + return qid; + } + + parent = mlx5e_sw_node_find(priv, parent_classid); + if (!parent) + return -EINVAL; + + node = mlx5e_sw_node_create_leaf(priv, classid, qid, parent); + if (IS_ERR(node)) + return PTR_ERR(node); + + node->rate = rate; + mlx5e_htb_convert_rate(priv, rate, node->parent, &node->bw_share); + mlx5e_htb_convert_ceil(priv, ceil, &node->max_average_bw); + + err = mlx5_qos_create_leaf_node(priv->mdev, node->parent->hw_id, + node->bw_share, node->max_average_bw, + &node->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node."); + qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n", + classid, err); + mlx5e_sw_node_delete(priv, node); + return err; + } + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, node); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n", + classid, err); + } else { + mlx5e_activate_qos_sq(priv, node); + } + } + + return mlx5e_qid_from_qos(&priv->channels, node->qid); +} + +int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid, + u64 rate, u64 ceil, struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node, *child; + int err, tmp_err; + u32 new_hw_id; + u16 qid; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_TO_INNER classid %04x, upcoming child %04x, rate %llu, ceil %llu\n", + classid, child_classid, rate, ceil); + + node = mlx5e_sw_node_find(priv, classid); + if (!node) + return -ENOENT; + + err = mlx5_qos_create_inner_node(priv->mdev, node->parent->hw_id, + node->bw_share, node->max_average_bw, + &new_hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating an inner node."); + qos_err(priv->mdev, "Failed to create an inner node (class %04x), err = %d\n", + classid, err); + return err; + } + + /* Intentionally reuse the qid for the upcoming first child. */ + child = mlx5e_sw_node_create_leaf(priv, child_classid, node->qid, node); + if (IS_ERR(child)) { + err = PTR_ERR(child); + goto err_destroy_hw_node; + } + + child->rate = rate; + mlx5e_htb_convert_rate(priv, rate, node, &child->bw_share); + mlx5e_htb_convert_ceil(priv, ceil, &child->max_average_bw); + + err = mlx5_qos_create_leaf_node(priv->mdev, new_hw_id, child->bw_share, + child->max_average_bw, &child->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node."); + qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n", + classid, err); + goto err_delete_sw_node; + } + + /* No fail point. */ + + qid = node->qid; + /* Pairs with mlx5e_get_txq_by_classid. 
*/ + WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + mlx5e_deactivate_qos_sq(priv, qid); + mlx5e_close_qos_sq(priv, qid); + } + + err = mlx5_qos_destroy_node(priv->mdev, node->hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, classid, err); + + node->hw_id = new_hw_id; + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, child); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n", + classid, err); + } else { + mlx5e_activate_qos_sq(priv, child); + } + } + + return 0; + +err_delete_sw_node: + child->qid = MLX5E_QOS_QID_INNER; + mlx5e_sw_node_delete(priv, child); + +err_destroy_hw_node: + tmp_err = mlx5_qos_destroy_node(priv->mdev, new_hw_id); + if (tmp_err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to roll back creation of an inner node %u (class %04x), err = %d\n", + new_hw_id, classid, tmp_err); + return err; +} + +static struct mlx5e_qos_node *mlx5e_sw_node_find_by_qid(struct mlx5e_priv *priv, u16 qid) +{ + struct mlx5e_qos_node *node = NULL; + int bkt; + + hash_for_each(priv->htb.qos_tc2node, bkt, node, hnode) + if (node->qid == qid) + break; + + return node; +} + +static void mlx5e_reactivate_qos_sq(struct mlx5e_priv *priv, u16 qid, struct netdev_queue *txq) +{ + qos_dbg(priv->mdev, "Reactivate QoS SQ qid %u\n", qid); + netdev_tx_reset_queue(txq); + netif_tx_start_queue(txq); +} + +static void mlx5e_reset_qdisc(struct net_device *dev, u16 qid) +{ + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, qid); + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (!qdisc) + return; + + spin_lock_bh(qdisc_lock(qdisc)); + qdisc_reset(qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); +} + +int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 *classid, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node; + struct netdev_queue *txq; + u16 qid, moved_qid; + bool opened; + int err; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL classid %04x\n", *classid); + + node = mlx5e_sw_node_find(priv, *classid); + if (!node) + return -ENOENT; + + /* Store qid for reuse. */ + qid = node->qid; + + opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + if (opened) { + txq = netdev_get_tx_queue(priv->netdev, + mlx5e_qid_from_qos(&priv->channels, qid)); + mlx5e_deactivate_qos_sq(priv, qid); + mlx5e_close_qos_sq(priv, qid); + } + + err = mlx5_qos_destroy_node(priv->mdev, node->hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, *classid, err); + + mlx5e_sw_node_delete(priv, node); + + moved_qid = mlx5e_qos_cur_leaf_nodes(priv); + + if (moved_qid == 0) { + /* The last QoS SQ was just destroyed. */ + if (opened) + mlx5e_reactivate_qos_sq(priv, qid, txq); + return 0; + } + moved_qid--; + + if (moved_qid < qid) { + /* The highest QoS SQ was just destroyed. 
*/ + WARN(moved_qid != qid - 1, "Gaps in queue numeration: destroyed queue %u, the highest queue is %u", + qid, moved_qid); + if (opened) + mlx5e_reactivate_qos_sq(priv, qid, txq); + return 0; + } + + WARN(moved_qid == qid, "Can't move node with qid %u to itself", qid); + qos_dbg(priv->mdev, "Moving QoS SQ %u to %u\n", moved_qid, qid); + + node = mlx5e_sw_node_find_by_qid(priv, moved_qid); + WARN(!node, "Could not find a node with qid %u to move to queue %u", + moved_qid, qid); + + /* Stop traffic to the old queue. */ + WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER); + __clear_bit(moved_qid, priv->htb.qos_used_qids); + + if (opened) { + txq = netdev_get_tx_queue(priv->netdev, + mlx5e_qid_from_qos(&priv->channels, moved_qid)); + mlx5e_deactivate_qos_sq(priv, moved_qid); + mlx5e_close_qos_sq(priv, moved_qid); + } + + /* Prevent packets from the old class from getting into the new one. */ + mlx5e_reset_qdisc(priv->netdev, moved_qid); + + __set_bit(qid, priv->htb.qos_used_qids); + WRITE_ONCE(node->qid, qid); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, node); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x) while moving qid %u to %u, err = %d\n", + node->classid, moved_qid, qid, err); + } else { + mlx5e_activate_qos_sq(priv, node); + } + } + + mlx5e_update_tx_netdev_queues(priv); + if (opened) + mlx5e_reactivate_qos_sq(priv, moved_qid, txq); + + *classid = node->classid; + return 0; +} + +int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *node, *parent; + u32 old_hw_id, new_hw_id; + int err, saved_err = 0; + u16 qid; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_DEL_LAST%s classid %04x\n", + force ? "_FORCE" : "", classid); + + node = mlx5e_sw_node_find(priv, classid); + if (!node) + return -ENOENT; + + err = mlx5_qos_create_leaf_node(priv->mdev, node->parent->parent->hw_id, + node->parent->bw_share, + node->parent->max_average_bw, + &new_hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when creating a leaf node."); + qos_err(priv->mdev, "Failed to create a leaf node (class %04x), err = %d\n", + classid, err); + if (!force) + return err; + saved_err = err; + } + + /* Store qid for reuse and prevent clearing the bit. */ + qid = node->qid; + /* Pairs with mlx5e_get_txq_by_classid. */ + WRITE_ONCE(node->qid, MLX5E_QOS_QID_INNER); + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + mlx5e_deactivate_qos_sq(priv, qid); + mlx5e_close_qos_sq(priv, qid); + } + + /* Prevent packets from the old class from getting into the new one. */ + mlx5e_reset_qdisc(priv->netdev, qid); + + err = mlx5_qos_destroy_node(priv->mdev, node->hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, classid, err); + + parent = node->parent; + mlx5e_sw_node_delete(priv, node); + + node = parent; + WRITE_ONCE(node->qid, qid); + + /* Early return on error in force mode. Parent will still be an inner + * node to be deleted by a following delete operation. 
+ */ + if (saved_err) + return saved_err; + + old_hw_id = node->hw_id; + node->hw_id = new_hw_id; + + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_open_qos_sq(priv, &priv->channels, node); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error creating an SQ."); + qos_warn(priv->mdev, "Failed to create a QoS SQ (class %04x), err = %d\n", + classid, err); + } else { + mlx5e_activate_qos_sq(priv, node); + } + } + + err = mlx5_qos_destroy_node(priv->mdev, old_hw_id); + if (err) /* Not fatal. */ + qos_warn(priv->mdev, "Failed to destroy leaf node %u (class %04x), err = %d\n", + node->hw_id, classid, err); + + return 0; +} + +static int mlx5e_qos_update_children(struct mlx5e_priv *priv, struct mlx5e_qos_node *node, + struct netlink_ext_ack *extack) +{ + struct mlx5e_qos_node *child; + int err = 0; + int bkt; + + hash_for_each(priv->htb.qos_tc2node, bkt, child, hnode) { + u32 old_bw_share = child->bw_share; + int err_one; + + if (child->parent != node) + continue; + + mlx5e_htb_convert_rate(priv, child->rate, node, &child->bw_share); + if (child->bw_share == old_bw_share) + continue; + + err_one = mlx5_qos_update_node(priv->mdev, child->hw_id, child->bw_share, + child->max_average_bw, child->hw_id); + if (!err && err_one) { + err = err_one; + + NL_SET_ERR_MSG_MOD(extack, "Firmware error when modifying a child node."); + qos_err(priv->mdev, "Failed to modify a child node (class %04x), err = %d\n", + node->classid, err); + } + } + + return err; +} + +int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil, + struct netlink_ext_ack *extack) +{ + u32 bw_share, max_average_bw; + struct mlx5e_qos_node *node; + bool ceil_changed = false; + int err; + + qos_dbg(priv->mdev, "TC_HTB_LEAF_MODIFY classid %04x, rate %llu, ceil %llu\n", + classid, rate, ceil); + + node = mlx5e_sw_node_find(priv, classid); + if (!node) + return -ENOENT; + + node->rate = rate; + mlx5e_htb_convert_rate(priv, rate, node->parent, &bw_share); + mlx5e_htb_convert_ceil(priv, ceil, &max_average_bw); + + err = mlx5_qos_update_node(priv->mdev, node->parent->hw_id, bw_share, + max_average_bw, node->hw_id); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware error when modifying a node."); + qos_err(priv->mdev, "Failed to modify a node (class %04x), err = %d\n", + classid, err); + return err; + } + + if (max_average_bw != node->max_average_bw) + ceil_changed = true; + + node->bw_share = bw_share; + node->max_average_bw = max_average_bw; + + if (ceil_changed) + err = mlx5e_qos_update_children(priv, node, extack); + + return err; +} + +struct mlx5e_mqprio_rl { + struct mlx5_core_dev *mdev; + u32 root_id; + u32 *leaves_id; + u8 num_tc; +}; + +struct mlx5e_mqprio_rl *mlx5e_mqprio_rl_alloc(void) +{ + return kvzalloc(sizeof(struct mlx5e_mqprio_rl), GFP_KERNEL); +} + +void mlx5e_mqprio_rl_free(struct mlx5e_mqprio_rl *rl) +{ + kvfree(rl); +} + +int mlx5e_mqprio_rl_init(struct mlx5e_mqprio_rl *rl, struct mlx5_core_dev *mdev, u8 num_tc, + u64 max_rate[]) +{ + int err; + int tc; + + if (!mlx5_qos_is_supported(mdev)) { + qos_warn(mdev, "Missing QoS capabilities. 
Try disabling SRIOV or use a supported device.");
+		return -EOPNOTSUPP;
+	}
+	if (num_tc > mlx5e_qos_max_leaf_nodes(mdev))
+		return -EINVAL;
+
+	rl->mdev = mdev;
+	rl->num_tc = num_tc;
+	rl->leaves_id = kvcalloc(num_tc, sizeof(*rl->leaves_id), GFP_KERNEL);
+	if (!rl->leaves_id)
+		return -ENOMEM;
+
+	err = mlx5_qos_create_root_node(mdev, &rl->root_id);
+	if (err)
+		goto err_free_leaves;
+
+	qos_dbg(mdev, "Root created, id %#x\n", rl->root_id);
+
+	for (tc = 0; tc < num_tc; tc++) {
+		u32 max_average_bw;
+
+		max_average_bw = mlx5e_qos_bytes2mbits(mdev, max_rate[tc]);
+		err = mlx5_qos_create_leaf_node(mdev, rl->root_id, 0, max_average_bw,
+						&rl->leaves_id[tc]);
+		if (err)
+			goto err_destroy_leaves;
+
+		qos_dbg(mdev, "Leaf[%d] created, id %#x, max average bw %u Mbits/sec\n",
+			tc, rl->leaves_id[tc], max_average_bw);
+	}
+	return 0;
+
+err_destroy_leaves:
+	while (--tc >= 0)
+		mlx5_qos_destroy_node(mdev, rl->leaves_id[tc]);
+	mlx5_qos_destroy_node(mdev, rl->root_id);
+err_free_leaves:
+	kvfree(rl->leaves_id);
+	return err;
+}
+
+void mlx5e_mqprio_rl_cleanup(struct mlx5e_mqprio_rl *rl)
+{
+	int tc;
+
+	for (tc = 0; tc < rl->num_tc; tc++)
+		mlx5_qos_destroy_node(rl->mdev, rl->leaves_id[tc]);
+	mlx5_qos_destroy_node(rl->mdev, rl->root_id);
+	kvfree(rl->leaves_id);
+}
+
+int mlx5e_mqprio_rl_get_node_hw_id(struct mlx5e_mqprio_rl *rl, int tc, u32 *hw_id)
+{
+	if (tc >= rl->num_tc)
+		return -EINVAL;
+
+	*hw_id = rl->leaves_id[tc];
+	return 0;
+}
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
new file mode 100644
index 0000000..bf29428
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/qos.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */
+
+#ifndef __MLX5E_EN_QOS_H
+#define __MLX5E_EN_QOS_H
+
+#include
+
+#define MLX5E_QOS_MAX_LEAF_NODES 256
+
+struct mlx5e_priv;
+struct mlx5e_channels;
+struct mlx5e_channel;
+
+int mlx5e_qos_bytes_rate_check(struct mlx5_core_dev *mdev, u64 nbytes);
+int mlx5e_qos_max_leaf_nodes(struct mlx5_core_dev *mdev);
+int mlx5e_qos_cur_leaf_nodes(struct mlx5e_priv *priv);
+
+/* TX datapath API */
+int mlx5e_get_txq_by_classid(struct mlx5e_priv *priv, u16 classid);
+struct mlx5e_txqsq *mlx5e_get_sq(struct mlx5e_priv *priv, int qid);
+
+/* SQ lifecycle */
+int mlx5e_qos_open_queues(struct mlx5e_priv *priv, struct mlx5e_channels *chs);
+void mlx5e_qos_activate_queues(struct mlx5e_priv *priv);
+void mlx5e_qos_deactivate_queues(struct mlx5e_channel *c, bool finalize);
+void mlx5e_qos_close_queues(struct mlx5e_channel *c);
+
+/* HTB API */
+int mlx5e_htb_root_add(struct mlx5e_priv *priv, u16 htb_maj_id, u16 htb_defcls,
+		       struct netlink_ext_ack *extack);
+int mlx5e_htb_root_del(struct mlx5e_priv *priv);
+int mlx5e_htb_leaf_alloc_queue(struct mlx5e_priv *priv, u16 classid,
+			       u32 parent_classid, u64 rate, u64 ceil,
+			       struct netlink_ext_ack *extack);
+int mlx5e_htb_leaf_to_inner(struct mlx5e_priv *priv, u16 classid, u16 child_classid,
+			    u64 rate, u64 ceil, struct netlink_ext_ack *extack);
+int mlx5e_htb_leaf_del(struct mlx5e_priv *priv, u16 *classid,
+		       struct netlink_ext_ack *extack);
+int mlx5e_htb_leaf_del_last(struct mlx5e_priv *priv, u16 classid, bool force,
+			    struct netlink_ext_ack *extack);
+int mlx5e_htb_node_modify(struct mlx5e_priv *priv, u16 classid, u64 rate, u64 ceil,
+			  struct netlink_ext_ack *extack);
+
+/* MQPRIO TX rate limit */
+struct mlx5e_mqprio_rl;
+struct mlx5e_mqprio_rl *mlx5e_mqprio_rl_alloc(void);
+void mlx5e_mqprio_rl_free(struct mlx5e_mqprio_rl *rl);
+int mlx5e_mqprio_rl_init(struct mlx5e_mqprio_rl *rl, struct mlx5_core_dev *mdev, u8 num_tc,
+			 u64 max_rate[]);
+void mlx5e_mqprio_rl_cleanup(struct mlx5e_mqprio_rl *rl);
+int mlx5e_mqprio_rl_get_node_hw_id(struct mlx5e_mqprio_rl *rl, int tc, u32 *hw_id);
+#endif
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c
new file mode 100644
index 0000000..b6f5c1b
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bond.c
@@ -0,0 +1,351 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include +#include +#include + +#include "mlx5_core.h" +#include "eswitch.h" +#include "esw/acl/ofld.h" +#include "en_rep.h" + +struct mlx5e_rep_bond { + struct notifier_block nb; + struct netdev_net_notifier nn; + struct list_head metadata_list; +}; + +struct mlx5e_rep_bond_slave_entry { + struct list_head list; + struct net_device *netdev; +}; + +struct mlx5e_rep_bond_metadata { + struct list_head list; /* link to global list of rep_bond_metadata */ + struct mlx5_eswitch *esw; + /* private of uplink holding rep bond metadata list */ + struct net_device *lag_dev; + u32 metadata_reg_c_0; + + struct list_head slaves_list; /* slaves list */ + int slaves; +}; + +static struct mlx5e_rep_bond_metadata * +mlx5e_lookup_rep_bond_metadata(struct mlx5_rep_uplink_priv *uplink_priv, + const struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_metadata *found = NULL; + struct mlx5e_rep_bond_metadata *cur; + + list_for_each_entry(cur, &uplink_priv->bond->metadata_list, list) { + if (cur->lag_dev == lag_dev) { + found = cur; + break; + } + } + + return found; +} + +static struct mlx5e_rep_bond_slave_entry * +mlx5e_lookup_rep_bond_slave_entry(struct mlx5e_rep_bond_metadata *mdata, + const struct net_device *netdev) +{ + struct mlx5e_rep_bond_slave_entry *found = NULL; + struct mlx5e_rep_bond_slave_entry *cur; + + list_for_each_entry(cur, &mdata->slaves_list, list) { + if (cur->netdev == netdev) { + found = cur; + break; + } + } + + return found; +} + +static void mlx5e_rep_bond_metadata_release(struct mlx5e_rep_bond_metadata *mdata) +{ + netdev_dbg(mdata->lag_dev, "destroy rep_bond_metadata(%d)\n", + mdata->metadata_reg_c_0); + list_del(&mdata->list); + mlx5_esw_match_metadata_free(mdata->esw, mdata->metadata_reg_c_0); + WARN_ON(!list_empty(&mdata->slaves_list)); + kfree(mdata); +} + +/* This must be called under rtnl_lock */ +int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, + struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_slave_entry *s_entry; + struct mlx5e_rep_bond_metadata *mdata; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + int err; + + ASSERT_RTNL(); + + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev); + if (!mdata) { + /* First netdev becomes slave, no metadata presents the lag_dev. 
Create one */ + mdata = kzalloc(sizeof(*mdata), GFP_KERNEL); + if (!mdata) + return -ENOMEM; + + mdata->lag_dev = lag_dev; + mdata->esw = esw; + INIT_LIST_HEAD(&mdata->slaves_list); + mdata->metadata_reg_c_0 = mlx5_esw_match_metadata_alloc(esw); + if (!mdata->metadata_reg_c_0) { + kfree(mdata); + return -ENOSPC; + } + list_add(&mdata->list, &rpriv->uplink_priv.bond->metadata_list); + + netdev_dbg(lag_dev, "create rep_bond_metadata(%d)\n", + mdata->metadata_reg_c_0); + } + + s_entry = kzalloc(sizeof(*s_entry), GFP_KERNEL); + if (!s_entry) { + err = -ENOMEM; + goto entry_alloc_err; + } + + s_entry->netdev = netdev; + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + + err = mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, + mdata->metadata_reg_c_0); + if (err) + goto ingress_err; + + mdata->slaves++; + list_add_tail(&s_entry->list, &mdata->slaves_list); + netdev_dbg(netdev, "enslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n", + rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0); + + return 0; + +ingress_err: + kfree(s_entry); +entry_alloc_err: + if (!mdata->slaves) + mlx5e_rep_bond_metadata_release(mdata); + return err; +} + +/* This must be called under rtnl_lock */ +void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, + const struct net_device *netdev, + const struct net_device *lag_dev) +{ + struct mlx5e_rep_bond_slave_entry *s_entry; + struct mlx5e_rep_bond_metadata *mdata; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + + ASSERT_RTNL(); + + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + mdata = mlx5e_lookup_rep_bond_metadata(&rpriv->uplink_priv, lag_dev); + if (!mdata) + return; + + s_entry = mlx5e_lookup_rep_bond_slave_entry(mdata, netdev); + if (!s_entry) + return; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + + /* Reset bond_metadata to zero first then reset all ingress/egress + * acls and rx rules of unslave representor's vport + */ + mlx5_esw_acl_ingress_vport_bond_update(esw, rpriv->rep->vport, 0); + mlx5_esw_acl_egress_vport_unbond(esw, rpriv->rep->vport); + mlx5e_rep_bond_update(priv, false); + + list_del(&s_entry->list); + + netdev_dbg(netdev, "unslave rep vport(%d) lag_dev(%s) metadata(0x%x)\n", + rpriv->rep->vport, lag_dev->name, mdata->metadata_reg_c_0); + + if (--mdata->slaves == 0) + mlx5e_rep_bond_metadata_release(mdata); + kfree(s_entry); +} + +static bool mlx5e_rep_is_lag_netdev(struct net_device *netdev) +{ + return netif_is_lag_port(netdev) && mlx5e_eswitch_vf_rep(netdev); +} + +static void mlx5e_rep_changelowerstate_event(struct net_device *netdev, void *ptr) +{ + struct netdev_notifier_changelowerstate_info *info; + struct netdev_lag_lower_state_info *lag_info; + struct mlx5e_rep_priv *rpriv; + struct net_device *lag_dev; + struct mlx5e_priv *priv; + struct list_head *iter; + struct net_device *dev; + u16 acl_vport_num; + u16 fwd_vport_num; + int err; + + info = ptr; + lag_info = info->lower_state_info; + /* This is not an event of a representor becoming active slave */ + if (!lag_info->tx_enabled) + return; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + fwd_vport_num = rpriv->rep->vport; + lag_dev = netdev_master_upper_dev_get(netdev); + if (!lag_dev) + return; + + netdev_dbg(netdev, "lag_dev(%s)'s slave vport(%d) is txable(%d)\n", + lag_dev->name, fwd_vport_num, net_lag_port_dev_txable(netdev)); + + /* Point everyone's egress acl to the vport of the active representor */ + netdev_for_each_lower_dev(lag_dev, dev, iter) { + priv = netdev_priv(dev); + rpriv = priv->ppriv; + acl_vport_num = 
rpriv->rep->vport; + if (acl_vport_num != fwd_vport_num) { + /* Only single rx_rule for unique bond_metadata should be + * present, delete it if it's saved as passive vport's + * rx_rule with destination as passive vport's root_ft + */ + mlx5e_rep_bond_update(priv, true); + err = mlx5_esw_acl_egress_vport_bond(priv->mdev->priv.eswitch, + fwd_vport_num, + acl_vport_num); + if (err) + netdev_warn(dev, + "configure slave vport(%d) egress fwd, err(%d)", + acl_vport_num, err); + } + } + + /* Insert new rx_rule for unique bond_metadata, save it as active vport's + * rx_rule with new destination as active vport's root_ft + */ + err = mlx5e_rep_bond_update(netdev_priv(netdev), false); + if (err) + netdev_warn(netdev, "configure active slave vport(%d) rx_rule, err(%d)", + fwd_vport_num, err); +} + +static void mlx5e_rep_changeupper_event(struct net_device *netdev, void *ptr) +{ + struct netdev_notifier_changeupper_info *info = ptr; + struct mlx5e_rep_priv *rpriv; + struct net_device *lag_dev; + struct mlx5e_priv *priv; + + priv = netdev_priv(netdev); + rpriv = priv->ppriv; + lag_dev = info->upper_dev; + + netdev_dbg(netdev, "%sslave vport(%d) lag(%s)\n", + info->linking ? "en" : "un", rpriv->rep->vport, lag_dev->name); + + if (info->linking) + mlx5e_rep_bond_enslave(priv->mdev->priv.eswitch, netdev, lag_dev); + else + mlx5e_rep_bond_unslave(priv->mdev->priv.eswitch, netdev, lag_dev); +} + +/* Bond device of representors and netdev events are used here in specific way + * to support eswitch vports bonding and to perform failover of eswitch vport + * by modifying the vport's egress acl of lower dev representors. Thus this + * also change the traditional behavior of lower dev under bond device. + * All non-representor netdevs or representors of other vendors as lower dev + * of bond device are not supported. + */ +static int mlx5e_rep_esw_bond_netevent(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct mlx5e_rep_priv *rpriv; + struct mlx5e_rep_bond *bond; + struct mlx5e_priv *priv; + + if (!mlx5e_rep_is_lag_netdev(netdev)) + return NOTIFY_DONE; + + bond = container_of(nb, struct mlx5e_rep_bond, nb); + priv = netdev_priv(netdev); + rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch, REP_ETH); + /* Verify VF representor is on the same device of the bond handling the netevent. 
*/ + if (rpriv->uplink_priv.bond != bond) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_CHANGELOWERSTATE: + mlx5e_rep_changelowerstate_event(netdev, ptr); + break; + case NETDEV_CHANGEUPPER: + mlx5e_rep_changeupper_event(netdev, ptr); + break; + } + return NOTIFY_DONE; +} + +/* If HW support eswitch vports bonding, register a specific notifier to + * handle it when two or more representors are bonded + */ +int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv; + int ret = 0; + + priv = netdev_priv(netdev); + if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch)) + goto out; + + uplink_priv->bond = kvzalloc(sizeof(*uplink_priv->bond), GFP_KERNEL); + if (!uplink_priv->bond) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&uplink_priv->bond->metadata_list); + uplink_priv->bond->nb.notifier_call = mlx5e_rep_esw_bond_netevent; + ret = register_netdevice_notifier_dev_net(netdev, + &uplink_priv->bond->nb, + &uplink_priv->bond->nn); + if (ret) { + netdev_err(netdev, "register bonding netevent notifier, err(%d)\n", ret); + kvfree(uplink_priv->bond); + uplink_priv->bond = NULL; + } + +out: + return ret; +} + +void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + if (!mlx5_esw_acl_egress_fwd2vport_supported(priv->mdev->priv.eswitch) || + !rpriv->uplink_priv.bond) + return; + + unregister_netdevice_notifier_dev_net(rpriv->netdev, + &rpriv->uplink_priv.bond->nb, + &rpriv->uplink_priv.bond->nn); + kvfree(rpriv->uplink_priv.bond); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c new file mode 100644 index 0000000..48dc121 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.c @@ -0,0 +1,536 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#include +#include +#include +#include +#include "bridge.h" +#include "esw/bridge.h" +#include "en_rep.h" + +#define MLX5_ESW_BRIDGE_UPDATE_INTERVAL 1000 + +struct mlx5_bridge_switchdev_fdb_work { + struct work_struct work; + struct switchdev_notifier_fdb_info fdb_info; + struct net_device *dev; + struct mlx5_esw_bridge_offloads *br_offloads; + bool add; +}; + +static bool mlx5_esw_bridge_dev_same_esw(struct net_device *dev, struct mlx5_eswitch *esw) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return esw == priv->mdev->priv.eswitch; +} + +static bool mlx5_esw_bridge_dev_same_hw(struct net_device *dev, struct mlx5_eswitch *esw) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev, *esw_mdev; + u64 system_guid, esw_system_guid; + + mdev = priv->mdev; + esw_mdev = esw->dev; + + system_guid = mlx5_query_nic_system_image_guid(mdev); + esw_system_guid = mlx5_query_nic_system_image_guid(esw_mdev); + + return system_guid == esw_system_guid; +} + +static struct net_device * +mlx5_esw_bridge_lag_rep_get(struct net_device *dev, struct mlx5_eswitch *esw) +{ + struct net_device *lower; + struct list_head *iter; + + netdev_for_each_lower_dev(dev, lower, iter) { + struct mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + + if (!mlx5e_eswitch_rep(lower)) + continue; + + priv = netdev_priv(lower); + mdev = priv->mdev; + if (mlx5_lag_is_shared_fdb(mdev) && mlx5_esw_bridge_dev_same_esw(lower, esw)) + return lower; + } + + return NULL; +} + +static struct net_device * +mlx5_esw_bridge_rep_vport_num_vhca_id_get(struct net_device *dev, struct mlx5_eswitch *esw, + u16 *vport_num, u16 *esw_owner_vhca_id) +{ + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + + if (netif_is_lag_master(dev)) + dev = mlx5_esw_bridge_lag_rep_get(dev, esw); + + if (!dev || !mlx5e_eswitch_rep(dev) || !mlx5_esw_bridge_dev_same_hw(dev, esw)) + return NULL; + + priv = netdev_priv(dev); + rpriv = priv->ppriv; + *vport_num = rpriv->rep->vport; + *esw_owner_vhca_id = MLX5_CAP_GEN(priv->mdev, vhca_id); + return dev; +} + +static struct net_device * +mlx5_esw_bridge_lower_rep_vport_num_vhca_id_get(struct net_device *dev, struct mlx5_eswitch *esw, + u16 *vport_num, u16 *esw_owner_vhca_id) +{ + struct net_device *lower_dev; + struct list_head *iter; + + if (netif_is_lag_master(dev) || mlx5e_eswitch_rep(dev)) + return mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, esw, vport_num, + esw_owner_vhca_id); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + struct net_device *rep; + + if (netif_is_bridge_master(lower_dev)) + continue; + + rep = mlx5_esw_bridge_lower_rep_vport_num_vhca_id_get(lower_dev, esw, vport_num, + esw_owner_vhca_id); + if (rep) + return rep; + } + + return NULL; +} + +static bool mlx5_esw_bridge_is_local(struct net_device *dev, struct net_device *rep, + struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + + if (!mlx5_esw_bridge_dev_same_esw(rep, esw)) + return false; + + priv = netdev_priv(rep); + mdev = priv->mdev; + if (netif_is_lag_master(dev)) + return mlx5_lag_is_shared_fdb(mdev) && mlx5_lag_is_master(mdev); + return true; +} + +static int mlx5_esw_bridge_port_changeupper(struct notifier_block *nb, void *ptr) +{ + struct mlx5_esw_bridge_offloads *br_offloads = container_of(nb, + struct mlx5_esw_bridge_offloads, + netdev_nb); + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_changeupper_info *info = ptr; + struct net_device *upper = info->upper_dev, *rep; + struct mlx5_eswitch *esw = br_offloads->esw; + 
u16 vport_num, esw_owner_vhca_id; + struct netlink_ext_ack *extack; + int ifindex = upper->ifindex; + int err = 0; + + if (!netif_is_bridge_master(upper)) + return 0; + + rep = mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, esw, &vport_num, &esw_owner_vhca_id); + if (!rep) + return 0; + + extack = netdev_notifier_info_to_extack(&info->info); + + if (mlx5_esw_bridge_is_local(dev, rep, esw)) + err = info->linking ? + mlx5_esw_bridge_vport_link(ifindex, vport_num, esw_owner_vhca_id, + br_offloads, extack) : + mlx5_esw_bridge_vport_unlink(ifindex, vport_num, esw_owner_vhca_id, + br_offloads, extack); + else if (mlx5_esw_bridge_dev_same_hw(rep, esw)) + err = info->linking ? + mlx5_esw_bridge_vport_peer_link(ifindex, vport_num, esw_owner_vhca_id, + br_offloads, extack) : + mlx5_esw_bridge_vport_peer_unlink(ifindex, vport_num, esw_owner_vhca_id, + br_offloads, extack); + + return err; +} + +static int mlx5_esw_bridge_switchdev_port_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + int err = 0; + + switch (event) { + case NETDEV_PRECHANGEUPPER: + break; + + case NETDEV_CHANGEUPPER: + err = mlx5_esw_bridge_port_changeupper(nb, ptr); + break; + } + + return notifier_from_errno(err); +} + +static int +mlx5_esw_bridge_port_obj_add(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct netlink_ext_ack *extack = switchdev_notifier_info_to_extack(&port_obj_info->info); + const struct switchdev_obj *obj = port_obj_info->obj; + const struct switchdev_obj_port_vlan *vlan; + u16 vport_num, esw_owner_vhca_id; + int err; + + if (!mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, br_offloads->esw, &vport_num, + &esw_owner_vhca_id)) + return 0; + + port_obj_info->handled = true; + + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); + err = mlx5_esw_bridge_port_vlan_add(vport_num, esw_owner_vhca_id, vlan->vid, + vlan->flags, br_offloads, extack); + break; + default: + return -EOPNOTSUPP; + } + return err; +} + +static int +mlx5_esw_bridge_port_obj_del(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + const struct switchdev_obj *obj = port_obj_info->obj; + const struct switchdev_obj_port_vlan *vlan; + u16 vport_num, esw_owner_vhca_id; + + if (!mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, br_offloads->esw, &vport_num, + &esw_owner_vhca_id)) + return 0; + + port_obj_info->handled = true; + + switch (obj->id) { + case SWITCHDEV_OBJ_ID_PORT_VLAN: + vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); + mlx5_esw_bridge_port_vlan_del(vport_num, esw_owner_vhca_id, vlan->vid, br_offloads); + break; + default: + return -EOPNOTSUPP; + } + return 0; +} + +static int +mlx5_esw_bridge_port_obj_attr_set(struct net_device *dev, + struct switchdev_notifier_port_attr_info *port_attr_info, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct netlink_ext_ack *extack = switchdev_notifier_info_to_extack(&port_attr_info->info); + const struct switchdev_attr *attr = port_attr_info->attr; + u16 vport_num, esw_owner_vhca_id; + int err = 0; + + if (!mlx5_esw_bridge_lower_rep_vport_num_vhca_id_get(dev, br_offloads->esw, &vport_num, + &esw_owner_vhca_id)) + return 0; + + port_attr_info->handled = true; + + switch (attr->id) { + case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: + if (attr->u.brport_flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD)) { + NL_SET_ERR_MSG_MOD(extack, "Flag is not supported"); + err = 
-EINVAL; + } + break; + case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: + break; + case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: + err = mlx5_esw_bridge_ageing_time_set(vport_num, esw_owner_vhca_id, + attr->u.ageing_time, br_offloads); + break; + case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: + err = mlx5_esw_bridge_vlan_filtering_set(vport_num, esw_owner_vhca_id, + attr->u.vlan_filtering, br_offloads); + break; + default: + err = -EOPNOTSUPP; + } + + return err; +} + +static int mlx5_esw_bridge_event_blocking(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlx5_esw_bridge_offloads *br_offloads = container_of(nb, + struct mlx5_esw_bridge_offloads, + nb_blk); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + int err; + + switch (event) { + case SWITCHDEV_PORT_OBJ_ADD: + err = mlx5_esw_bridge_port_obj_add(dev, ptr, br_offloads); + break; + case SWITCHDEV_PORT_OBJ_DEL: + err = mlx5_esw_bridge_port_obj_del(dev, ptr, br_offloads); + break; + case SWITCHDEV_PORT_ATTR_SET: + err = mlx5_esw_bridge_port_obj_attr_set(dev, ptr, br_offloads); + break; + default: + err = 0; + } + + return notifier_from_errno(err); +} + +static void +mlx5_esw_bridge_cleanup_switchdev_fdb_work(struct mlx5_bridge_switchdev_fdb_work *fdb_work) +{ + dev_put(fdb_work->dev); + kfree(fdb_work->fdb_info.addr); + kfree(fdb_work); +} + +static void mlx5_esw_bridge_switchdev_fdb_event_work(struct work_struct *work) +{ + struct mlx5_bridge_switchdev_fdb_work *fdb_work = + container_of(work, struct mlx5_bridge_switchdev_fdb_work, work); + struct switchdev_notifier_fdb_info *fdb_info = + &fdb_work->fdb_info; + struct mlx5_esw_bridge_offloads *br_offloads = + fdb_work->br_offloads; + struct net_device *dev = fdb_work->dev; + u16 vport_num, esw_owner_vhca_id; + + rtnl_lock(); + + if (!mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, br_offloads->esw, &vport_num, + &esw_owner_vhca_id)) + goto out; + + if (fdb_work->add) + mlx5_esw_bridge_fdb_create(dev, vport_num, esw_owner_vhca_id, br_offloads, + fdb_info); + else + mlx5_esw_bridge_fdb_remove(dev, vport_num, esw_owner_vhca_id, br_offloads, + fdb_info); + +out: + rtnl_unlock(); + mlx5_esw_bridge_cleanup_switchdev_fdb_work(fdb_work); +} + +static struct mlx5_bridge_switchdev_fdb_work * +mlx5_esw_bridge_init_switchdev_fdb_work(struct net_device *dev, bool add, + struct switchdev_notifier_fdb_info *fdb_info, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_bridge_switchdev_fdb_work *work; + u8 *addr; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return ERR_PTR(-ENOMEM); + + INIT_WORK(&work->work, mlx5_esw_bridge_switchdev_fdb_event_work); + memcpy(&work->fdb_info, fdb_info, sizeof(work->fdb_info)); + + addr = kzalloc(ETH_ALEN, GFP_ATOMIC); + if (!addr) { + kfree(work); + return ERR_PTR(-ENOMEM); + } + ether_addr_copy(addr, fdb_info->addr); + work->fdb_info.addr = addr; + + dev_hold(dev); + work->dev = dev; + work->br_offloads = br_offloads; + work->add = add; + return work; +} + +static int mlx5_esw_bridge_switchdev_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlx5_esw_bridge_offloads *br_offloads = container_of(nb, + struct mlx5_esw_bridge_offloads, + nb); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + struct switchdev_notifier_fdb_info *fdb_info; + struct mlx5_bridge_switchdev_fdb_work *work; + struct mlx5_eswitch *esw = br_offloads->esw; + struct switchdev_notifier_info *info = ptr; + u16 vport_num, esw_owner_vhca_id; + struct net_device *upper, *rep; + + if (event == 
SWITCHDEV_PORT_ATTR_SET) { + int err = mlx5_esw_bridge_port_obj_attr_set(dev, ptr, br_offloads); + + return notifier_from_errno(err); + } + + upper = netdev_master_upper_dev_get_rcu(dev); + if (!upper) + return NOTIFY_DONE; + if (!netif_is_bridge_master(upper)) + return NOTIFY_DONE; + + rep = mlx5_esw_bridge_rep_vport_num_vhca_id_get(dev, esw, &vport_num, &esw_owner_vhca_id); + if (!rep) + return NOTIFY_DONE; + + switch (event) { + case SWITCHDEV_FDB_ADD_TO_BRIDGE: + /* only handle the event on native eswtich of representor */ + if (!mlx5_esw_bridge_is_local(dev, rep, esw)) + break; + + fdb_info = container_of(info, + struct switchdev_notifier_fdb_info, + info); + mlx5_esw_bridge_fdb_update_used(dev, vport_num, esw_owner_vhca_id, br_offloads, + fdb_info); + break; + case SWITCHDEV_FDB_DEL_TO_BRIDGE: + /* only handle the event on peers */ + if (mlx5_esw_bridge_is_local(dev, rep, esw)) + break; + fallthrough; + case SWITCHDEV_FDB_ADD_TO_DEVICE: + case SWITCHDEV_FDB_DEL_TO_DEVICE: + fdb_info = container_of(info, + struct switchdev_notifier_fdb_info, + info); + + work = mlx5_esw_bridge_init_switchdev_fdb_work(dev, + event == SWITCHDEV_FDB_ADD_TO_DEVICE, + fdb_info, + br_offloads); + if (IS_ERR(work)) { + WARN_ONCE(1, "Failed to init switchdev work, err=%ld", + PTR_ERR(work)); + return notifier_from_errno(PTR_ERR(work)); + } + + queue_work(br_offloads->wq, &work->work); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static void mlx5_esw_bridge_update_work(struct work_struct *work) +{ + struct mlx5_esw_bridge_offloads *br_offloads = container_of(work, + struct mlx5_esw_bridge_offloads, + update_work.work); + + rtnl_lock(); + mlx5_esw_bridge_update(br_offloads); + rtnl_unlock(); + + queue_delayed_work(br_offloads->wq, &br_offloads->update_work, + msecs_to_jiffies(MLX5_ESW_BRIDGE_UPDATE_INTERVAL)); +} + +void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) +{ + struct mlx5_esw_bridge_offloads *br_offloads; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = + mdev->priv.eswitch; + int err; + + rtnl_lock(); + br_offloads = mlx5_esw_bridge_init(esw); + rtnl_unlock(); + if (IS_ERR(br_offloads)) { + esw_warn(mdev, "Failed to init esw bridge (err=%ld)\n", PTR_ERR(br_offloads)); + return; + } + + br_offloads->wq = alloc_ordered_workqueue("mlx5_bridge_wq", 0); + if (!br_offloads->wq) { + esw_warn(mdev, "Failed to allocate bridge offloads workqueue\n"); + goto err_alloc_wq; + } + + br_offloads->nb.notifier_call = mlx5_esw_bridge_switchdev_event; + err = register_switchdev_notifier(&br_offloads->nb); + if (err) { + esw_warn(mdev, "Failed to register switchdev notifier (err=%d)\n", err); + goto err_register_swdev; + } + + br_offloads->nb_blk.notifier_call = mlx5_esw_bridge_event_blocking; + err = register_switchdev_blocking_notifier(&br_offloads->nb_blk); + if (err) { + esw_warn(mdev, "Failed to register blocking switchdev notifier (err=%d)\n", err); + goto err_register_swdev_blk; + } + + br_offloads->netdev_nb.notifier_call = mlx5_esw_bridge_switchdev_port_event; + err = register_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb); + if (err) { + esw_warn(mdev, "Failed to register bridge offloads netdevice notifier (err=%d)\n", + err); + goto err_register_netdev; + } + INIT_DELAYED_WORK(&br_offloads->update_work, mlx5_esw_bridge_update_work); + queue_delayed_work(br_offloads->wq, &br_offloads->update_work, + msecs_to_jiffies(MLX5_ESW_BRIDGE_UPDATE_INTERVAL)); + return; + +err_register_netdev: + unregister_switchdev_blocking_notifier(&br_offloads->nb_blk); 
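+	/* Error unwind: each label below releases only the resources that were
+	 * successfully set up before the failing step, in reverse order.
+	 */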
+err_register_swdev_blk:
+	unregister_switchdev_notifier(&br_offloads->nb);
+err_register_swdev:
+	destroy_workqueue(br_offloads->wq);
+err_alloc_wq:
+	rtnl_lock();
+	mlx5_esw_bridge_cleanup(esw);
+	rtnl_unlock();
+}
+
+void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv)
+{
+	struct mlx5_esw_bridge_offloads *br_offloads;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw =
+		mdev->priv.eswitch;
+
+	br_offloads = esw->br_offloads;
+	if (!br_offloads)
+		return;
+
+	cancel_delayed_work_sync(&br_offloads->update_work);
+	unregister_netdevice_notifier_net(&init_net, &br_offloads->netdev_nb);
+	unregister_switchdev_blocking_notifier(&br_offloads->nb_blk);
+	unregister_switchdev_notifier(&br_offloads->nb);
+	destroy_workqueue(br_offloads->wq);
+	rtnl_lock();
+	mlx5_esw_bridge_cleanup(esw);
+	rtnl_unlock();
+}
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h
new file mode 100644
index 0000000..fbeb642
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/bridge.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_REP_BRIDGE__
+#define __MLX5_EN_REP_BRIDGE__
+
+#include "en.h"
+
+#if IS_ENABLED(CONFIG_MLX5_BRIDGE)
+
+void mlx5e_rep_bridge_init(struct mlx5e_priv *priv);
+void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv);
+
+#else /* CONFIG_MLX5_BRIDGE */
+
+static inline void mlx5e_rep_bridge_init(struct mlx5e_priv *priv) {}
+static inline void mlx5e_rep_bridge_cleanup(struct mlx5e_priv *priv) {}
+
+#endif /* CONFIG_MLX5_BRIDGE */
+
+#endif /* __MLX5_EN_REP_BRIDGE__ */
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.c
new file mode 100644
index 0000000..f562fd6
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2021 NVIDIA Corporation. 
*/ + +#include "en_rep.h" +#include "eswitch.h" +#include "en/tc/meter.h" + +void +mlx5_rep_destroy_miss_meter(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv) +{ + struct rep_meter *meter = &rep_priv->rep_meter; + u64 bytes, packets; + + if (meter->drop_red_rule) { + mlx5_del_flow_rules(meter->drop_red_rule); + meter->drop_red_rule = NULL; + } + + if (meter->drop_counter) { + mlx5_fc_query(dev, meter->drop_counter, &packets, &bytes); + meter->packets_dropped += packets; + meter->bytes_dropped += bytes; + mlx5_fc_destroy(dev, meter->drop_counter); + meter->drop_counter = NULL; + } + + if (meter->meter_rule) { + mlx5_del_flow_rules(meter->meter_rule); + meter->meter_rule = NULL; + } + + if (meter->meter_hndl) { + mlx5e_free_flow_meter(dev, meter->meter_hndl); + meter->meter_hndl = NULL; + } +} + +static int mlx5_rep_create_miss_meter_rules(struct mlx5_core_dev *dev, + struct mlx5e_rep_priv *rep_priv, + u16 vport) +{ + struct rep_meter *meter = &rep_priv->rep_meter; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_fc *drop_counter; + struct mlx5_flow_table *tbl; + struct mlx5_flow_spec *spec; + void *misc2; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + tbl = esw->fdb_table.offloads.miss_meter_fdb; + + mlx5_eswitch_set_rule_source_port(esw, spec, NULL, esw, vport); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO; + + flow_act.exe_aso.type = MLX5_EXE_ASO_FLOW_METER; + flow_act.exe_aso.object_id = meter->meter_hndl->obj_id; + flow_act.exe_aso.flow_meter.meter_idx = meter->meter_hndl->idx; + flow_act.exe_aso.return_reg_id = 5; /* use reg c5 */ + flow_act.exe_aso.flow_meter.init_color = MLX5_FLOW_METER_COLOR_GREEN; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = esw->fdb_table.offloads.post_miss_meter_fdb; + rule = mlx5_add_flow_rules(tbl, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out; + } + + meter->meter_rule = rule; + + /* Post meter rule - add matching on color and add counter*/ + tbl = esw->fdb_table.offloads.post_miss_meter_fdb; + + drop_counter = mlx5_fc_create(dev, false); + if (IS_ERR(drop_counter)) { + err = PTR_ERR(drop_counter); + goto out; + } + meter->drop_counter = drop_counter; + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, 0x3); + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, + MLX5_FLOW_METER_COLOR_RED); + + memset(&flow_act, 0, sizeof(flow_act)); + memset(&dest, 0, sizeof(dest)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest.counter_id = mlx5_fc_id(drop_counter); + + rule = mlx5_add_flow_rules(tbl, spec, &flow_act, + &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out; + } + + meter->drop_red_rule = rule; + +out: + + if (err) + mlx5_rep_destroy_miss_meter(dev, rep_priv); + + kvfree(spec); + + return err; +} + +int +mlx5_rep_set_miss_meter(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv, + u16 vport, u64 rate, u64 burst) +{ + struct rep_meter *meter = &rep_priv->rep_meter; + struct mlx5e_flow_meter_handle *meter_hndl; + 
struct mlx5e_flow_meter_params params; + int err; + + if (rate == meter->rate && burst == meter->burst) + return 0; + + if (rate == 0 || burst == 0) { + mlx5_rep_destroy_miss_meter(dev, rep_priv); + goto update; + } + + if (!meter->meter_hndl) { + meter_hndl = mlx5e_alloc_flow_meter(dev); + if (IS_ERR(meter_hndl)) + return PTR_ERR(meter_hndl); + meter->meter_hndl = meter_hndl; + } + + params.mode = MLX5_RATE_LIMIT_PPS; + params.rate = rate; + params.burst = burst; + err = mlx5e_aso_send_flow_meter_aso(dev, meter->meter_hndl, ¶ms); + if (err) + goto check_and_free_meter_aso; + + if (!meter->meter_rule) { + err = mlx5_rep_create_miss_meter_rules(dev, rep_priv, vport); + if (err) + return err; + } + +update: + meter->rate = rate; + meter->burst = burst; + + return 0; + +check_and_free_meter_aso: + if (!meter->meter_rule) { + mlx5e_free_flow_meter(dev, meter->meter_hndl); + meter->meter_hndl = NULL; + } + return err; +} + +int mlx5_rep_get_miss_meter_data(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv, + int data_type, u64 *data) +{ + struct rep_meter *meter = &rep_priv->rep_meter; + u64 bytes = 0, packets = 0; + + if (meter->drop_counter) + mlx5_fc_query(dev, meter->drop_counter, &packets, &bytes); + + if (data_type == MLX5_RATE_LIMIT_DATA_PACKETS_DROPPED) { + *data = packets; + *data += meter->packets_dropped; + } else if (data_type == MLX5_RATE_LIMIT_DATA_BYTES_DROPPED) { + *data = bytes; + *data += meter->bytes_dropped; + } else { + return -EINVAL; + } + + return 0; +} + +int mlx5_rep_clear_miss_meter_data(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv) +{ + struct rep_meter *meter = &rep_priv->rep_meter; + u64 bytes = 0, packets = 0; + + if (meter->drop_counter) + mlx5_fc_query_and_clear(dev, meter->drop_counter, &packets, &bytes); + + meter->packets_dropped = 0; + meter->bytes_dropped = 0; + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.h new file mode 100644 index 0000000..a073ab2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/meter.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 NVIDIA Corporation. */ + +#ifndef __MLX5_EN_REP_METER_H__ +#define __MLX5_EN_REP_METER_H__ + +#include "en_rep.h" + +void +mlx5_rep_destroy_miss_meter(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv); +int +mlx5_rep_set_miss_meter(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv, + u16 vport, u64 rate, u64 burst); +int mlx5_rep_get_miss_meter_data(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv, + int data_type, u64 *data); +int mlx5_rep_clear_miss_meter_data(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rep_priv); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c new file mode 100644 index 0000000..2e9bee4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "neigh.h" +#include "tc.h" +#include "en_rep.h" +#include "fs_core.h" +#include "diag/en_rep_tracepoint.h" + +static unsigned long mlx5e_rep_ipv6_interval(void) +{ + if (IS_ENABLED(CONFIG_IPV6) && ipv6_stub->nd_tbl) + return NEIGH_VAR(&ipv6_stub->nd_tbl->parms, DELAY_PROBE_TIME); + + return ~0UL; +} + +static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv) +{ + unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME); + unsigned long ipv6_interval = mlx5e_rep_ipv6_interval(); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + + rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval); + mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval); +} + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + + mlx5_fc_queue_stats_work(priv->mdev, + &neigh_update->neigh_stats_work, + neigh_update->min_interval); +} + +static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe) +{ + return refcount_inc_not_zero(&nhe->refcnt); +} + +static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe); + +void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe) +{ + if (refcount_dec_and_test(&nhe->refcnt)) { + mlx5e_rep_neigh_entry_remove(nhe); + kfree_rcu(nhe, rcu); + } +} + +static struct mlx5e_neigh_hash_entry * +mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv, + struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_neigh_hash_entry *next = NULL; + + rcu_read_lock(); + + for (next = nhe ? 
+ list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, + &nhe->neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list) : + list_first_or_null_rcu(&rpriv->neigh_update.neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list); + next; + next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, + &next->neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list)) + if (mlx5e_rep_neigh_entry_hold(next)) + break; + + rcu_read_unlock(); + + if (nhe) + mlx5e_rep_neigh_entry_release(nhe); + + return next; +} + +static void mlx5e_rep_neigh_stats_work(struct work_struct *work) +{ + struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv, + neigh_update.neigh_stats_work.work); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_neigh_hash_entry *nhe = NULL; + + rtnl_lock(); + if (!list_empty(&rpriv->neigh_update.neigh_list)) + mlx5e_rep_queue_neigh_stats_work(priv); + + while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL) + mlx5e_tc_update_neigh_used_value(nhe); + + rtnl_unlock(); +} + +struct neigh_update_work { + struct work_struct work; + struct neighbour *n; + struct mlx5e_neigh_hash_entry *nhe; +}; + +static void mlx5e_release_neigh_update_work(struct neigh_update_work *update_work) +{ + neigh_release(update_work->n); + mlx5e_rep_neigh_entry_release(update_work->nhe); + kfree(update_work); +} + +static void mlx5e_rep_neigh_update(struct work_struct *work) +{ + struct neigh_update_work *update_work = container_of(work, struct neigh_update_work, + work); + struct mlx5e_neigh_hash_entry *nhe = update_work->nhe; + struct neighbour *n = update_work->n; + struct mlx5e_encap_entry *e = NULL; + bool neigh_connected, same_dev; + unsigned char ha[ETH_ALEN]; + u8 nud_state, dead; + + rtnl_lock(); + + /* If these parameters are changed after we release the lock, + * we'll receive another event letting us know about it. + * We use this lock to avoid inconsistency between the neigh validity + * and it's hw address. + */ + read_lock_bh(&n->lock); + memcpy(ha, n->ha, ETH_ALEN); + nud_state = n->nud_state; + dead = n->dead; + same_dev = READ_ONCE(nhe->neigh_dev) == n->dev; + read_unlock_bh(&n->lock); + + neigh_connected = (nud_state & NUD_VALID) && !dead; + + trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected); + + if (!same_dev) + goto out; + + /* mlx5e_get_next_init_encap() releases previous encap before returning + * the next one. + */ + while ((e = mlx5e_get_next_init_encap(nhe, e)) != NULL) + mlx5e_rep_update_flows(netdev_priv(e->out_dev), e, neigh_connected, ha); + +out: + rtnl_unlock(); + mlx5e_release_neigh_update_work(update_work); +} + +static struct neigh_update_work *mlx5e_alloc_neigh_update_work(struct mlx5e_priv *priv, + struct neighbour *n) +{ + struct neigh_update_work *update_work; + struct mlx5e_neigh_hash_entry *nhe; + struct mlx5e_neigh m_neigh = {}; + + update_work = kzalloc(sizeof(*update_work), GFP_ATOMIC); + if (WARN_ON(!update_work)) + return NULL; + + m_neigh.family = n->ops->family; + memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); + + /* Obtain reference to nhe as last step in order not to release it in + * atomic context. 
+ */ + rcu_read_lock(); + nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh); + rcu_read_unlock(); + if (!nhe) { + kfree(update_work); + return NULL; + } + + INIT_WORK(&update_work->work, mlx5e_rep_neigh_update); + neigh_hold(n); + update_work->n = n; + update_work->nhe = nhe; + + return update_work; +} + +static int mlx5e_rep_netevent_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv, + neigh_update.netevent_nb); + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_neigh_hash_entry *nhe = NULL; + struct neigh_update_work *update_work; + struct neigh_parms *p; + struct neighbour *n; + bool found = false; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + n = ptr; +#if IS_ENABLED(CONFIG_IPV6) + if (n->tbl != ipv6_stub->nd_tbl && n->tbl != &arp_tbl) +#else + if (n->tbl != &arp_tbl) +#endif + return NOTIFY_DONE; + + update_work = mlx5e_alloc_neigh_update_work(priv, n); + if (!update_work) + return NOTIFY_DONE; + + queue_work(priv->wq, &update_work->work); + break; + + case NETEVENT_DELAY_PROBE_TIME_UPDATE: + p = ptr; + + /* We check the device is present since we don't care about + * changes in the default table, we only care about changes + * done per device delay prob time parameter. + */ +#if IS_ENABLED(CONFIG_IPV6) + if (!p->dev || (p->tbl != ipv6_stub->nd_tbl && p->tbl != &arp_tbl)) +#else + if (!p->dev || p->tbl != &arp_tbl) +#endif + return NOTIFY_DONE; + + rcu_read_lock(); + list_for_each_entry_rcu(nhe, &neigh_update->neigh_list, + neigh_list) { + if (p->dev == READ_ONCE(nhe->neigh_dev)) { + found = true; + break; + } + } + rcu_read_unlock(); + if (!found) + return NOTIFY_DONE; + + neigh_update->min_interval = min_t(unsigned long, + NEIGH_VAR(p, DELAY_PROBE_TIME), + neigh_update->min_interval); + mlx5_fc_update_sampling_interval(priv->mdev, + neigh_update->min_interval); + break; + } + return NOTIFY_DONE; +} + +static const struct rhashtable_params mlx5e_neigh_ht_params = { + .head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node), + .key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh), + .key_len = sizeof(struct mlx5e_neigh), + .automatic_shrinking = true, +}; + +int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + int err; + + err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params); + if (err) + goto out_err; + + INIT_LIST_HEAD(&neigh_update->neigh_list); + mutex_init(&neigh_update->encap_lock); + INIT_DELAYED_WORK(&neigh_update->neigh_stats_work, + mlx5e_rep_neigh_stats_work); + mlx5e_rep_neigh_update_init_interval(rpriv); + + neigh_update->netevent_nb.notifier_call = mlx5e_rep_netevent_event; + err = register_netevent_notifier(&neigh_update->netevent_nb); + if (err) + goto out_notifier; + return 0; + +out_notifier: + neigh_update->netevent_nb.notifier_call = NULL; + rhashtable_destroy(&neigh_update->neigh_ht); +out_err: + netdev_warn(rpriv->netdev, + "Failed to initialize neighbours handling for vport %d\n", + rpriv->rep->vport); + return err; +} + +void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + + if (!rpriv->neigh_update.netevent_nb.notifier_call) + return; + + 
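+	/* Unregister the netevent notifier first so no new neigh update work
+	 * can be queued while the workqueue is flushed below.
+	 */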
unregister_netevent_notifier(&neigh_update->netevent_nb); + + flush_workqueue(priv->wq); /* flush neigh update works */ + + cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work); + + mutex_destroy(&neigh_update->encap_lock); + rhashtable_destroy(&neigh_update->neigh_ht); +} + +static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv, + struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + int err; + + err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht, + &nhe->rhash_node, + mlx5e_neigh_ht_params); + if (err) + return err; + + list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list); + + return err; +} + +static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv; + + mutex_lock(&rpriv->neigh_update.encap_lock); + + list_del_rcu(&nhe->neigh_list); + + rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht, + &nhe->rhash_node, + mlx5e_neigh_ht_params); + mutex_unlock(&rpriv->neigh_update.encap_lock); +} + +/* This function must only be called under the representor's encap_lock or + * inside rcu read lock section. + */ +struct mlx5e_neigh_hash_entry * +mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct mlx5e_neigh_hash_entry *nhe; + + nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh, + mlx5e_neigh_ht_params); + return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? nhe : NULL; +} + +int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh, + struct net_device *neigh_dev, + struct mlx5e_neigh_hash_entry **nhe) +{ + int err; + + *nhe = kzalloc(sizeof(**nhe), GFP_KERNEL); + if (!*nhe) + return -ENOMEM; + + (*nhe)->priv = priv; + memcpy(&(*nhe)->m_neigh, m_neigh, sizeof(*m_neigh)); + spin_lock_init(&(*nhe)->encap_list_lock); + INIT_LIST_HEAD(&(*nhe)->encap_list); + refcount_set(&(*nhe)->refcnt, 1); + WRITE_ONCE((*nhe)->neigh_dev, neigh_dev); + + err = mlx5e_rep_neigh_entry_insert(priv, *nhe); + if (err) + goto out_free; + return 0; + +out_free: + kfree(*nhe); + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h new file mode 100644 index 0000000..6fe0ab9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. 
*/ + +#ifndef __MLX5_EN_REP_NEIGH__ +#define __MLX5_EN_REP_NEIGH__ + +#include "en.h" +#include "en_rep.h" + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + +int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv); + +struct mlx5e_neigh_hash_entry * +mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh); +int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, + struct mlx5e_neigh *m_neigh, + struct net_device *neigh_dev, + struct mlx5e_neigh_hash_entry **nhe); +void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe); + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv); + +#else /* CONFIG_MLX5_CLS_ACT */ + +static inline int +mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) { return 0; } +static inline void +mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) {} + +#endif /* CONFIG_MLX5_CLS_ACT */ + +#endif /* __MLX5_EN_REP_NEIGH__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c new file mode 100644 index 0000000..8c25252 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. */ + +#include "en/rep/sysfs.h" +#include "en/rep/meter.h" +#include "en_rep.h" +#include "eswitch.h" + +static ssize_t rep_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + return ret; +} + +static ssize_t rep_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + return ret; +} + +static ssize_t miss_rl_cfg_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_rep_sysfs *tmp = + container_of(kobj, struct mlx5_rep_sysfs, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5e_rep_priv *rep_priv; + struct mlx5_eswitch_rep *rep; + u64 rate, burst; + int err; + + err = sscanf(buf, "%llu %llu", &rate, &burst); + if (err != 2) + return -EINVAL; + + if (rate < 0 || burst < 0) + return -EINVAL; + + rep = mlx5_eswitch_vport_rep(esw, tmp->vport); + rep_priv = mlx5e_rep_to_rep_priv(rep); + + err = mlx5_rep_set_miss_meter(esw->dev, rep_priv, tmp->vport, + rate, burst); + + return err ? 
err : count; +} + +static ssize_t miss_rl_cfg_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5_rep_sysfs *tmp = + container_of(kobj, struct mlx5_rep_sysfs, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5e_rep_priv *rep_priv; + struct mlx5_eswitch_rep *rep; + + rep = mlx5_eswitch_vport_rep(esw, tmp->vport); + rep_priv = mlx5e_rep_to_rep_priv(rep); + + return sprintf(buf, + "rate: %llu[packes/s] burst: %llu[packets]\n", + rep_priv->rep_meter.rate, rep_priv->rep_meter.burst); +} + +static ssize_t miss_rl_dropped_show_common(struct kobject *kobj, + int drop_type, + char *buf) +{ + struct mlx5_rep_sysfs *tmp = + container_of(kobj, struct mlx5_rep_sysfs, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5e_rep_priv *rep_priv; + struct mlx5_eswitch_rep *rep; + u64 data; + int err; + + rep = mlx5_eswitch_vport_rep(esw, tmp->vport); + rep_priv = mlx5e_rep_to_rep_priv(rep); + + err = mlx5_rep_get_miss_meter_data(esw->dev, rep_priv, + drop_type, &data); + if (err) + return err; + + return sprintf(buf, "%llu\n", data); +} + +static ssize_t miss_rl_dropped_packets_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return miss_rl_dropped_show_common(kobj, MLX5_RATE_LIMIT_DATA_PACKETS_DROPPED, buf); +} + +static ssize_t miss_rl_dropped_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return miss_rl_dropped_show_common(kobj, MLX5_RATE_LIMIT_DATA_BYTES_DROPPED, buf); +} + +static ssize_t miss_rl_stats_clr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_rep_sysfs *tmp = + container_of(kobj, struct mlx5_rep_sysfs, kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5e_rep_priv *rep_priv; + struct mlx5_eswitch_rep *rep; + int clr_set; + int err; + + err = sscanf(buf, "%d", &clr_set); + if (err != 1 || clr_set != 1) + return -EINVAL; + + rep = mlx5_eswitch_vport_rep(esw, tmp->vport); + rep_priv = mlx5e_rep_to_rep_priv(rep); + + err = mlx5_rep_clear_miss_meter_data(esw->dev, rep_priv); + + return err ? 
err : count; +} + +static ssize_t page_limit_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5_rep_sysfs *tmp = + container_of(kobj, struct mlx5_rep_sysfs, paging_kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5_vport *evport; + u32 page_limit; + + evport = mlx5_eswitch_get_vport(esw, tmp->vport); + spin_lock(&evport->pg_counters_lock); + page_limit = evport->page_limit; + spin_unlock(&evport->pg_counters_lock); + return sprintf(buf, "limit: %u\n", page_limit); +} + +static ssize_t page_limit_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, + size_t count) +{ + struct mlx5_rep_sysfs *tmp = + container_of(kobj, struct mlx5_rep_sysfs, paging_kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5_vport *evport; + u32 limit; + int err; + + evport = mlx5_eswitch_get_vport(esw, tmp->vport); + err = sscanf(buf, "%u", &limit); + if (err != 1) + return -EINVAL; + spin_lock(&evport->pg_counters_lock); + evport->page_limit = limit; + spin_unlock(&evport->pg_counters_lock); + return count; +} + +static ssize_t num_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5_rep_sysfs *tmp = + container_of(kobj, struct mlx5_rep_sysfs, paging_kobj); + struct mlx5_eswitch *esw = tmp->esw; + struct mlx5_vport *evport; + u32 fw_pages; + + evport = mlx5_eswitch_get_vport(esw, tmp->vport); + spin_lock(&evport->pg_counters_lock); + fw_pages = evport->fw_pages; + spin_unlock(&evport->pg_counters_lock); + return sprintf(buf, "fw_pages: %u\n", fw_pages); +} + +static struct kobj_attribute attr_miss_rl_cfg = { + .attr = {.name = "miss_rl_cfg", + .mode = 0644 }, + .show = miss_rl_cfg_show, + .store = miss_rl_cfg_store, +}; + +static struct kobj_attribute attr_miss_rl_dropped_packets = { + .attr = {.name = "miss_rl_dropped_packets", + .mode = 0444 }, + .show = miss_rl_dropped_packets_show, +}; + +static struct kobj_attribute attr_miss_rl_dropped_bytes = { + .attr = {.name = "miss_rl_dropped_bytes", + .mode = 0444 }, + .show = miss_rl_dropped_bytes_show, +}; + +static struct kobj_attribute attr_miss_rl_stats_clr = { + .attr = {.name = "miss_rl_stats_clr", + .mode = 0200 }, + .store = miss_rl_stats_clr_store, +}; + +static struct kobj_attribute attr_page_limit = { + .attr = {.name = "page_limit", + .mode = 0644 }, + .show = page_limit_show, + .store = page_limit_store, +}; + +static struct kobj_attribute attr_num_pages = { + .attr = {.name = "num_pages", + .mode = 0644 }, + .show = num_pages_show, +}; + +static struct attribute *rep_attrs[] = { + &attr_miss_rl_cfg.attr, + &attr_miss_rl_dropped_packets.attr, + &attr_miss_rl_dropped_bytes.attr, + &attr_miss_rl_stats_clr.attr, + NULL, +}; + +static const struct sysfs_ops rep_sysfs_ops = { + .show = rep_attr_show, + .store = rep_attr_store +}; + +static struct kobj_type rep_type = { + .sysfs_ops = &rep_sysfs_ops, + .default_attrs = rep_attrs +}; + +static struct attribute *rep_paging_attrs[] = { + &attr_page_limit.attr, + &attr_num_pages.attr, + NULL, +}; + +static struct kobj_type rep_paging = { + .sysfs_ops = &rep_sysfs_ops, + .default_attrs = rep_paging_attrs +}; + +void mlx5_rep_sysfs_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + struct mlx5_core_dev *dev = priv->mdev; + struct mlx5_rep_sysfs *tmp; + struct mlx5_eswitch *esw; + int err; + + if (!(MLX5_CAP_GEN_64(dev, general_obj_types) & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO)) + return; + + esw = dev->priv.eswitch; + + tmp = 
&rpriv->rep_sysfs; + tmp->esw = esw; + tmp->vport = rpriv->rep->vport; + err = kobject_init_and_add(&tmp->kobj, &rep_type, + &rpriv->netdev->dev.kobj, "rep_config"); + + if (err) { + tmp->esw = NULL; + return; + } + + err = kobject_init_and_add(&tmp->paging_kobj, &rep_paging, + &tmp->kobj, "paging_control"); + if (err) { + kobject_put(&tmp->kobj); + tmp->esw = NULL; + } +} + +void mlx5_rep_sysfs_cleanup(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_sysfs *tmp; + + tmp = &rpriv->rep_sysfs; + if (!tmp->esw) + return; + + kobject_put(&tmp->paging_kobj); + kobject_put(&tmp->kobj); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.h new file mode 100644 index 0000000..b87613f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/sysfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 NVIDIA Corporation. */ + +#ifndef __MLX5_EN_REP_SYSFS_H__ +#define __MLX5_EN_REP_SYSFS_H__ + +#include "en_rep.h" + +void mlx5_rep_sysfs_init(struct mlx5e_rep_priv *rpriv); +void mlx5_rep_sysfs_cleanup(struct mlx5e_rep_priv *rpriv); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c new file mode 100644 index 0000000..ab10114 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -0,0 +1,902 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "tc.h" +#include "neigh.h" +#include "en_rep.h" +#include "eswitch.h" +#include "lib/fs_chains.h" +#include "en/tc_ct.h" +#include "en/mapping.h" +#include "en/tc_tun.h" +#include "lib/port_tun.h" +#include "en/tc/sample.h" +#include "en_accel/ipsec_rxtx.h" +#include "en/tc/int_port.h" +#include "en/tc/act/act.h" + +struct mlx5e_rep_indr_block_priv { + struct net_device *netdev; + struct mlx5e_rep_priv *rpriv; + enum flow_block_binder_type binder_type; + + struct list_head list; +}; + +int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct mlx5e_neigh *m_neigh, + struct net_device *neigh_dev) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy; + struct mlx5e_neigh_hash_entry *nhe; + int err; + + err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type); + if (err) + return err; + + mutex_lock(&rpriv->neigh_update.encap_lock); + nhe = mlx5e_rep_neigh_entry_lookup(priv, m_neigh); + if (!nhe) { + err = mlx5e_rep_neigh_entry_create(priv, m_neigh, neigh_dev, &nhe); + if (err) { + mutex_unlock(&rpriv->neigh_update.encap_lock); + mlx5_tun_entropy_refcount_dec(tun_entropy, + e->reformat_type); + return err; + } + } + + e->nhe = nhe; + spin_lock(&nhe->encap_list_lock); + list_add_rcu(&e->encap_list, &nhe->encap_list); + spin_unlock(&nhe->encap_list_lock); + + mutex_unlock(&rpriv->neigh_update.encap_lock); + + return 0; +} + +void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + struct mlx5_tun_entropy *tun_entropy = 
&uplink_priv->tun_entropy; + + if (!e->nhe) + return; + + spin_lock(&e->nhe->encap_list_lock); + list_del_rcu(&e->encap_list); + spin_unlock(&e->nhe->encap_list_lock); + + mlx5e_rep_neigh_entry_release(e->nhe); + e->nhe = NULL; + mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type); +} + +void mlx5e_rep_update_flows(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + bool neigh_connected, + unsigned char ha[ETH_ALEN]) +{ + struct ethhdr *eth = (struct ethhdr *)e->encap_header; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + bool encap_connected; + LIST_HEAD(flow_list); + + ASSERT_RTNL(); + + mutex_lock(&esw->offloads.encap_tbl_lock); + encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID); + if (encap_connected == neigh_connected && ether_addr_equal(e->h_dest, ha)) + goto unlock; + + mlx5e_take_all_encap_flows(e, &flow_list); + + if ((e->flags & MLX5_ENCAP_ENTRY_VALID) && + (!neigh_connected || !ether_addr_equal(e->h_dest, ha))) + mlx5e_tc_encap_flows_del(priv, e, &flow_list); + + if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) { + struct net_device *route_dev; + + ether_addr_copy(e->h_dest, ha); + ether_addr_copy(eth->h_dest, ha); + /* Update the encap source mac, in case that we delete + * the flows when encap source mac changed. + */ + route_dev = __dev_get_by_index(dev_net(priv->netdev), e->route_dev_ifindex); + if (route_dev) + ether_addr_copy(eth->h_source, route_dev->dev_addr); + + mlx5e_tc_encap_flows_add(priv, e, &flow_list); + } +unlock: + mutex_unlock(&esw->offloads.encap_tbl_lock); + mlx5e_put_flow_list(priv, &flow_list); +} + +static int +mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv, + struct flow_cls_offload *cls_flower, int flags) +{ + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return mlx5e_configure_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_DESTROY: + return mlx5e_delete_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_STATS: + return mlx5e_stats_flower(priv->netdev, priv, cls_flower, + flags); + default: + return -EOPNOTSUPP; + } +} + +static +int mlx5e_rep_setup_tc_cls_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *ma) +{ + switch (ma->command) { + case TC_CLSMATCHALL_REPLACE: + return mlx5e_tc_configure_matchall(priv, ma); + case TC_CLSMATCHALL_DESTROY: + return mlx5e_tc_delete_matchall(priv, ma); + case TC_CLSMATCHALL_STATS: + mlx5e_tc_stats_matchall(priv, ma); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(ESW_OFFLOAD); + struct mlx5e_priv *priv = cb_priv; + + if (!priv->netdev || !netif_device_present(priv->netdev)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_rep_setup_tc_cls_flower(priv, type_data, flags); + case TC_SETUP_CLSMATCHALL: + return mlx5e_rep_setup_tc_cls_matchall(priv, type_data); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_setup_ft_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct flow_cls_offload tmp, *f = type_data; + struct mlx5e_priv *priv = cb_priv; + struct mlx5_eswitch *esw; + unsigned long flags; + int err; + + flags = MLX5_TC_FLAG(INGRESS) | + MLX5_TC_FLAG(ESW_OFFLOAD) | + MLX5_TC_FLAG(FT_OFFLOAD); + esw = priv->mdev->priv.eswitch; + + switch (type) { + case TC_SETUP_CLSFLOWER: + memcpy(&tmp, f, sizeof(*f)); + + if (!mlx5_chains_prios_supported(esw_chains(esw))) + return 
-EOPNOTSUPP; + + /* Re-use tc offload path by moving the ft flow to the + * reserved ft chain. + * + * FT offload can use prio range [0, INT_MAX], so we normalize + * it to range [1, mlx5_esw_chains_get_prio_range(esw)] + * as with tc, where prio 0 isn't supported. + * + * We only support chain 0 of FT offload. + */ + if (tmp.common.prio >= mlx5_chains_get_prio_range(esw_chains(esw))) + return -EOPNOTSUPP; + if (tmp.common.chain_index != 0) + return -EOPNOTSUPP; + + tmp.common.chain_index = mlx5_chains_get_nf_ft_chain(esw_chains(esw)); + tmp.common.prio++; + err = mlx5e_rep_setup_tc_cls_flower(priv, &tmp, flags); + memcpy(&f->stats, &tmp.stats, sizeof(f->stats)); + return err; + default: + return -EOPNOTSUPP; + } +} + +static LIST_HEAD(mlx5e_rep_block_tc_cb_list); +static LIST_HEAD(mlx5e_rep_block_ft_cb_list); +int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct flow_block_offload *f = type_data; + + f->unlocked_driver_cb = true; + + switch (type) { + case TC_SETUP_BLOCK: + return flow_block_cb_setup_simple(type_data, + &mlx5e_rep_block_tc_cb_list, + mlx5e_rep_setup_tc_cb, + priv, priv, true); + case TC_SETUP_FT: + return flow_block_cb_setup_simple(type_data, + &mlx5e_rep_block_ft_cb_list, + mlx5e_rep_setup_ft_cb, + priv, priv, true); + default: + return -EOPNOTSUPP; + } +} + +int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + int err; + + mutex_init(&uplink_priv->unready_flows_lock); + INIT_LIST_HEAD(&uplink_priv->unready_flows); + + /* init shared tc flow table */ + err = mlx5e_tc_esw_init(uplink_priv); + return err; +} + +void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv) +{ + /* delete shared tc flow table */ + mlx5e_tc_esw_cleanup(&rpriv->uplink_priv); + mutex_destroy(&rpriv->uplink_priv.unready_flows_lock); +} + +void mlx5e_rep_tc_enable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work, + mlx5e_tc_reoffload_flows_work); +} + +void mlx5e_rep_tc_disable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work); +} + +int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work); + + return NOTIFY_OK; +} + +static struct mlx5e_rep_indr_block_priv * +mlx5e_rep_indr_block_priv_lookup(struct mlx5e_rep_priv *rpriv, + struct net_device *netdev, + enum flow_block_binder_type binder_type) +{ + struct mlx5e_rep_indr_block_priv *cb_priv; + + list_for_each_entry(cb_priv, + &rpriv->uplink_priv.tc_indr_block_priv_list, + list) + if (cb_priv->netdev == netdev && + cb_priv->binder_type == binder_type) + return cb_priv; + + return NULL; +} + +static int +mlx5e_rep_indr_offload(struct net_device *netdev, + struct flow_cls_offload *flower, + struct mlx5e_rep_indr_block_priv *indr_priv, + unsigned long flags) +{ + struct mlx5e_priv *priv = netdev_priv(indr_priv->rpriv->netdev); + int err = 0; + + if (!netif_device_present(indr_priv->rpriv->netdev)) + return -EOPNOTSUPP; + + switch (flower->command) { + case FLOW_CLS_REPLACE: + err = mlx5e_configure_flower(netdev, priv, flower, flags); + break; + case FLOW_CLS_DESTROY: + err = mlx5e_delete_flower(netdev, priv, flower, flags); + break; + case FLOW_CLS_STATS: + err = mlx5e_stats_flower(netdev, priv, flower, flags); + 
break; + default: + err = -EOPNOTSUPP; + } + + return err; +} + +static int mlx5e_rep_indr_setup_tc_cb(enum tc_setup_type type, + void *type_data, void *indr_priv) +{ + unsigned long flags = MLX5_TC_FLAG(ESW_OFFLOAD); + struct mlx5e_rep_indr_block_priv *priv = indr_priv; + + flags |= (priv->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) ? + MLX5_TC_FLAG(EGRESS) : + MLX5_TC_FLAG(INGRESS); + + switch (type) { + case TC_SETUP_CLSFLOWER: + return mlx5e_rep_indr_offload(priv->netdev, type_data, priv, + flags); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_rep_indr_setup_ft_cb(enum tc_setup_type type, + void *type_data, void *indr_priv) +{ + struct mlx5e_rep_indr_block_priv *priv = indr_priv; + struct flow_cls_offload *f = type_data; + struct flow_cls_offload tmp; + struct mlx5e_priv *mpriv; + struct mlx5_eswitch *esw; + unsigned long flags; + int err; + + mpriv = netdev_priv(priv->rpriv->netdev); + esw = mpriv->mdev->priv.eswitch; + + flags = MLX5_TC_FLAG(EGRESS) | + MLX5_TC_FLAG(ESW_OFFLOAD) | + MLX5_TC_FLAG(FT_OFFLOAD); + + switch (type) { + case TC_SETUP_CLSFLOWER: + memcpy(&tmp, f, sizeof(*f)); + + /* Re-use tc offload path by moving the ft flow to the + * reserved ft chain. + * + * FT offload can use prio range [0, INT_MAX], so we normalize + * it to range [1, mlx5_esw_chains_get_prio_range(esw)] + * as with tc, where prio 0 isn't supported. + * + * We only support chain 0 of FT offload. + */ + if (!mlx5_chains_prios_supported(esw_chains(esw)) || + tmp.common.prio >= mlx5_chains_get_prio_range(esw_chains(esw)) || + tmp.common.chain_index) + return -EOPNOTSUPP; + + tmp.common.chain_index = mlx5_chains_get_nf_ft_chain(esw_chains(esw)); + tmp.common.prio++; + err = mlx5e_rep_indr_offload(priv->netdev, &tmp, priv, flags); + memcpy(&f->stats, &tmp.stats, sizeof(f->stats)); + return err; + default: + return -EOPNOTSUPP; + } +} + +static void mlx5e_rep_indr_block_unbind(void *cb_priv) +{ + struct mlx5e_rep_indr_block_priv *indr_priv = cb_priv; + + list_del(&indr_priv->list); + kfree(indr_priv); +} + +static LIST_HEAD(mlx5e_block_cb_list); + +static bool mlx5e_rep_macvlan_mode_supported(const struct net_device *dev) +{ + struct macvlan_dev *macvlan = netdev_priv(dev); + + return macvlan->mode == MACVLAN_MODE_PASSTHRU; +} + +static int +mlx5e_rep_indr_setup_block(struct net_device *netdev, struct Qdisc *sch, + struct mlx5e_rep_priv *rpriv, + struct flow_block_offload *f, + flow_setup_cb_t *setup_cb, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + bool is_ovs_int_port = netif_is_ovs_master(netdev); + struct mlx5e_rep_indr_block_priv *indr_priv; + struct flow_block_cb *block_cb; + + if (!mlx5e_tc_tun_device_to_offload(priv, netdev) && + !(is_vlan_dev(netdev) && vlan_dev_real_dev(netdev) == rpriv->netdev) && + !is_ovs_int_port) { + if (!(netif_is_macvlan(netdev) && macvlan_dev_real_dev(netdev) == rpriv->netdev)) + return -EOPNOTSUPP; + if (!mlx5e_rep_macvlan_mode_supported(netdev)) { + netdev_warn(netdev, "Offloading ingress filter is supported only with macvlan passthru mode"); + return -EOPNOTSUPP; + } + } + + if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS && + f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) + return -EOPNOTSUPP; + + if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS && !is_ovs_int_port) + return -EOPNOTSUPP; + + if (is_ovs_int_port && !mlx5e_tc_int_port_supported(esw)) + return -EOPNOTSUPP; + + 
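+	/* The checks above restrict indirect block offload to tunnel devices,
+	 * VLAN/macvlan uppers of this representor and OVS internal ports; the
+	 * switch below then binds or unbinds the per-netdev block callback.
+	 */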
f->unlocked_driver_cb = true; + f->driver_block_list = &mlx5e_block_cb_list; + + switch (f->command) { + case FLOW_BLOCK_BIND: + indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev, f->binder_type); + if (indr_priv) + return -EEXIST; + + indr_priv = kmalloc(sizeof(*indr_priv), GFP_KERNEL); + if (!indr_priv) + return -ENOMEM; + + indr_priv->netdev = netdev; + indr_priv->rpriv = rpriv; + indr_priv->binder_type = f->binder_type; + list_add(&indr_priv->list, + &rpriv->uplink_priv.tc_indr_block_priv_list); + + block_cb = flow_indr_block_cb_alloc(setup_cb, indr_priv, indr_priv, + mlx5e_rep_indr_block_unbind, + f, netdev, sch, data, rpriv, + cleanup); + if (IS_ERR(block_cb)) { + list_del(&indr_priv->list); + kfree(indr_priv); + return PTR_ERR(block_cb); + } + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, &mlx5e_block_cb_list); + + return 0; + case FLOW_BLOCK_UNBIND: + indr_priv = mlx5e_rep_indr_block_priv_lookup(rpriv, netdev, f->binder_type); + if (!indr_priv) + return -ENOENT; + + block_cb = flow_block_cb_lookup(f->block, setup_cb, indr_priv); + if (!block_cb) + return -ENOENT; + + flow_indr_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); + return 0; + default: + return -EOPNOTSUPP; + } + return 0; +} + +static int +mlx5e_rep_indr_replace_act(struct mlx5e_rep_priv *rpriv, + struct flow_offload_action *fl_act) + +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + enum mlx5_flow_namespace_type ns_type; + struct flow_action_entry *action; + struct mlx5e_tc_act *act; + bool add = false; + int i; + + /* There is no use case currently for more than one action (e.g. pedit). + * when there will be, need to handle cleaning multiple actions on err. + */ + if (!flow_offload_has_one_action(&fl_act->action)) + return -EOPNOTSUPP; + + if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS) + ns_type = MLX5_FLOW_NAMESPACE_FDB; + else + ns_type = MLX5_FLOW_NAMESPACE_KERNEL; + + flow_action_for_each(i, action, &fl_act->action) { + act = mlx5e_tc_act_get(action->id, ns_type); + if (!act) + continue; + + if (!act->offload_action) + continue; + + if (!act->offload_action(priv, fl_act, action)) + add = true; + } + + return add ? 
0 : -EOPNOTSUPP; +} + +static int +mlx5e_rep_indr_destroy_act(struct mlx5e_rep_priv *rpriv, + struct flow_offload_action *fl_act) +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + enum mlx5_flow_namespace_type ns_type; + struct mlx5e_tc_act *act; + + if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS) + ns_type = MLX5_FLOW_NAMESPACE_FDB; + else + ns_type = MLX5_FLOW_NAMESPACE_KERNEL; + + act = mlx5e_tc_act_get(fl_act->id, ns_type); + if (!act || !act->destroy_action) + return -EOPNOTSUPP; + + return act->destroy_action(priv, fl_act); +} + +static int +mlx5e_rep_indr_stats_act(struct mlx5e_rep_priv *rpriv, + struct flow_offload_action *fl_act) + +{ + struct mlx5e_priv *priv = netdev_priv(rpriv->netdev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + enum mlx5_flow_namespace_type ns_type; + struct mlx5e_tc_act *act; + + if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS) + ns_type = MLX5_FLOW_NAMESPACE_FDB; + else + ns_type = MLX5_FLOW_NAMESPACE_KERNEL; + + act = mlx5e_tc_act_get(fl_act->id, ns_type); + if (!act || !act->stats_action) + return -EOPNOTSUPP; + + return act->stats_action(priv, fl_act); +} + +static int +mlx5e_rep_indr_setup_act(struct mlx5e_rep_priv *rpriv, + struct flow_offload_action *fl_act) +{ + switch (fl_act->command) { + case FLOW_ACT_REPLACE: + return mlx5e_rep_indr_replace_act(rpriv, fl_act); + case FLOW_ACT_DESTROY: + return mlx5e_rep_indr_destroy_act(rpriv, fl_act); + case FLOW_ACT_STATS: + return mlx5e_rep_indr_stats_act(rpriv, fl_act); + default: + return -EOPNOTSUPP; + } +} + +static int +mlx5e_rep_indr_no_dev_setup(struct mlx5e_rep_priv *rpriv, + enum tc_setup_type type, + void *data) +{ + if (!data) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_ACT: + return mlx5e_rep_indr_setup_act(rpriv, data); + default: + return -EOPNOTSUPP; + } +} + +static +int mlx5e_rep_indr_setup_cb(struct net_device *netdev, struct Qdisc *sch, void *cb_priv, + enum tc_setup_type type, void *type_data, + void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + if (!netdev) + return mlx5e_rep_indr_no_dev_setup(cb_priv, type, data); + + switch (type) { + case TC_SETUP_BLOCK: + return mlx5e_rep_indr_setup_block(netdev, sch, cb_priv, type_data, + mlx5e_rep_indr_setup_tc_cb, + data, cleanup); + case TC_SETUP_FT: + return mlx5e_rep_indr_setup_block(netdev, sch, cb_priv, type_data, + mlx5e_rep_indr_setup_ft_cb, + data, cleanup); + default: + return -EOPNOTSUPP; + } +} + +int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; + + /* init indirect block notifications */ + INIT_LIST_HEAD(&uplink_priv->tc_indr_block_priv_list); + + return flow_indr_dev_register(mlx5e_rep_indr_setup_cb, rpriv); +} + +void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) +{ + flow_indr_dev_unregister(mlx5e_rep_indr_setup_cb, rpriv, + mlx5e_rep_indr_block_unbind); +} + +static bool mlx5e_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5e_tc_update_priv *tc_priv, + u32 tunnel_id) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct tunnel_match_enc_opts enc_opts = {}; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + struct metadata_dst *tun_dst; + struct tunnel_match_key key; + u32 tun_id, enc_opts_id; + struct net_device *dev; + int err; + + enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK; + tun_id = tunnel_id >> ENC_OPTS_BITS; + + if (!tun_id) + return 
true; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + err = mapping_find(uplink_priv->tunnel_mapping, tun_id, &key); + if (err) { + netdev_dbg(priv->netdev, + "Couldn't find tunnel for tun_id: %d, err: %d\n", + tun_id, err); + return false; + } + + if (enc_opts_id) { + err = mapping_find(uplink_priv->tunnel_enc_opts_mapping, + enc_opts_id, &enc_opts); + if (err) { + netdev_dbg(priv->netdev, + "Couldn't find tunnel (opts) for tun_id: %d, err: %d\n", + enc_opts_id, err); + return false; + } + } + + if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + tun_dst = __ip_tun_set_dst(key.enc_ipv4.src, key.enc_ipv4.dst, + key.enc_ip.tos, key.enc_ip.ttl, + key.enc_tp.dst, TUNNEL_KEY, + key32_to_tunnel_id(key.enc_key_id.keyid), + enc_opts.key.len); + } else if (key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + tun_dst = __ipv6_tun_set_dst(&key.enc_ipv6.src, &key.enc_ipv6.dst, + key.enc_ip.tos, key.enc_ip.ttl, + key.enc_tp.dst, 0, TUNNEL_KEY, + key32_to_tunnel_id(key.enc_key_id.keyid), + enc_opts.key.len); + } else { + netdev_dbg(priv->netdev, + "Couldn't restore tunnel, unsupported addr_type: %d\n", + key.enc_control.addr_type); + return false; + } + + if (!tun_dst) { + netdev_dbg(priv->netdev, "Couldn't restore tunnel, no tun_dst\n"); + return false; + } + + tun_dst->u.tun_info.key.tp_src = key.enc_tp.src; + + if (enc_opts.key.len) + ip_tunnel_info_opts_set(&tun_dst->u.tun_info, + enc_opts.key.data, + enc_opts.key.len, + enc_opts.key.dst_opt_type); + + skb_dst_set(skb, (struct dst_entry *)tun_dst); + dev = dev_get_by_index(&init_net, key.filter_ifindex); + if (!dev) { + netdev_dbg(priv->netdev, + "Couldn't find tunnel device with ifindex: %d\n", + key.filter_ifindex); + return false; + } + + /* Set fwd_dev so we do dev_put() after datapath */ + tc_priv->fwd_dev = dev; + + skb->dev = dev; + + return true; +} + +static bool mlx5e_restore_skb_chain(struct sk_buff *skb, u32 chain, u32 reg_c1, + struct mlx5e_tc_update_priv *tc_priv) +{ + struct mlx5e_priv *priv = netdev_priv(skb->dev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + u32 tunnel_id = (reg_c1 >> ESW_TUN_OFFSET) & TUNNEL_ID_MASK; + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (chain) { + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + struct tc_skb_ext *tc_skb_ext; + u32 zone_restore_id; + + tc_skb_ext = tc_skb_ext_alloc(skb); + if (!tc_skb_ext) { + WARN_ON(1); + goto out_incr_rx_counter; + } + tc_skb_ext->chain = chain; + zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK; + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + if (!mlx5e_tc_ct_restore_flow(uplink_priv->ct_priv, skb, + zone_restore_id)) + goto out_incr_rx_counter; + } +#endif /* CONFIG_NET_TC_SKB_EXT */ + return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id); + +out_incr_rx_counter: + atomic_inc(&esw->dev->priv.ct_debugfs->stats.rx_dropped); + + return false; +} + +static void mlx5_rep_tc_post_napi_receive(struct mlx5e_tc_update_priv *tc_priv) +{ + if (tc_priv->fwd_dev) + dev_put(tc_priv->fwd_dev); +} + +static void mlx5e_restore_skb_sample(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5_mapped_obj *mapped_obj, + struct mlx5e_tc_update_priv *tc_priv) +{ + if (!mlx5e_restore_tunnel(priv, skb, tc_priv, mapped_obj->sample.tunnel_id)) { + netdev_dbg(priv->netdev, + "Failed to restore tunnel info for sampled packet\n"); + return; + } + mlx5e_tc_sample_skb(skb, mapped_obj); + 
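+	/* Drop the forwarding-device reference taken in mlx5e_restore_tunnel(). */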
mlx5_rep_tc_post_napi_receive(tc_priv); +} + +static bool mlx5e_restore_skb_int_port(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5_mapped_obj *mapped_obj, + struct mlx5e_tc_update_priv *tc_priv, + bool *forward_tx, + u32 reg_c1) +{ + u32 tunnel_id = (reg_c1 >> ESW_TUN_OFFSET) & TUNNEL_ID_MASK; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + + /* Tunnel restore takes precedence over int port restore */ + if (tunnel_id) + return mlx5e_restore_tunnel(priv, skb, tc_priv, tunnel_id); + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + if (mlx5e_tc_int_port_dev_fwd(uplink_priv->int_port_priv, skb, + mapped_obj->int_port_metadata, forward_tx)) { + /* Set fwd_dev for future dev_put */ + tc_priv->fwd_dev = skb->dev; + + return true; + } + + return false; +} + +void mlx5e_rep_tc_receive(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, + struct sk_buff *skb) +{ + u32 reg_c1 = be32_to_cpu(cqe->ft_metadata); + struct mlx5e_tc_update_priv tc_priv = {}; + struct mlx5_mapped_obj mapped_obj; + struct mlx5_eswitch *esw; + bool forward_tx = false; + struct mlx5e_priv *priv; + u32 reg_c0; + int err; + + reg_c0 = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK); + if (!reg_c0 || reg_c0 == MLX5_FS_DEFAULT_FLOW_TAG) + goto forward; + + /* If reg_c0 is not equal to the default flow tag then skb->mark + * is not supported and must be reset back to 0. + */ + skb->mark = 0; + + priv = netdev_priv(skb->dev); + esw = priv->mdev->priv.eswitch; + err = mapping_find(esw->offloads.reg_c0_obj_pool, reg_c0, &mapped_obj); + if (err) { + netdev_dbg(priv->netdev, + "Couldn't find mapped object for reg_c0: %d, err: %d\n", + reg_c0, err); + goto free_skb; + } + + if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN) { + if (!mlx5e_restore_skb_chain(skb, mapped_obj.chain, reg_c1, &tc_priv) && + !mlx5_ipsec_is_rx_flow(cqe)) + goto free_skb; + } else if (mapped_obj.type == MLX5_MAPPED_OBJ_SAMPLE) { + mlx5e_restore_skb_sample(priv, skb, &mapped_obj, &tc_priv); + goto free_skb; + } else if (mapped_obj.type == MLX5_MAPPED_OBJ_INT_PORT_METADATA) { + if (!mlx5e_restore_skb_int_port(priv, skb, &mapped_obj, &tc_priv, + &forward_tx, reg_c1)) + goto free_skb; + } else { + netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type); + goto free_skb; + } + +forward: + if (forward_tx) + dev_queue_xmit(skb); + else + napi_gro_receive(rq->cq.napi, skb); + + mlx5_rep_tc_post_napi_receive(&tc_priv); + + return; + +free_skb: + dev_kfree_skb_any(skb); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h new file mode 100644 index 0000000..7c9dd3a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. 
*/ + +#ifndef __MLX5_EN_REP_TC_H__ +#define __MLX5_EN_REP_TC_H__ + +#include +#include "en_tc.h" +#include "en_rep.h" + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + +int mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv); + +int mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv); + +void mlx5e_rep_tc_enable(struct mlx5e_priv *priv); +void mlx5e_rep_tc_disable(struct mlx5e_priv *priv); + +int mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv); + +void mlx5e_rep_update_flows(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + bool neigh_connected, + unsigned char ha[ETH_ALEN]); + +int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct mlx5e_neigh *m_neigh, + struct net_device *neigh_dev); +void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e); + +int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data); + +void mlx5e_rep_tc_receive(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, + struct sk_buff *skb); + +#else /* CONFIG_MLX5_CLS_ACT */ + +struct mlx5e_rep_priv; +static inline int +mlx5e_rep_tc_init(struct mlx5e_rep_priv *rpriv) { return 0; } +static inline void +mlx5e_rep_tc_cleanup(struct mlx5e_rep_priv *rpriv) {} + +static inline int +mlx5e_rep_tc_netdevice_event_register(struct mlx5e_rep_priv *rpriv) { return 0; } +static inline void +mlx5e_rep_tc_netdevice_event_unregister(struct mlx5e_rep_priv *rpriv) {} + +static inline void +mlx5e_rep_tc_enable(struct mlx5e_priv *priv) {} +static inline void +mlx5e_rep_tc_disable(struct mlx5e_priv *priv) {} + +static inline int +mlx5e_rep_tc_event_port_affinity(struct mlx5e_priv *priv) { return NOTIFY_DONE; } + +static inline int +mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) { return -EOPNOTSUPP; } + +static inline void +mlx5e_rep_tc_receive(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq, + struct sk_buff *skb) { napi_gro_receive(rq->cq.napi, skb); } + +#endif /* CONFIG_MLX5_CLS_ACT */ + +#endif /* __MLX5_EN_REP_TC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c new file mode 100644 index 0000000..3087c57 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -0,0 +1,782 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Mellanox Technologies. 
+ +#include "health.h" +#include "params.h" +#include "txrx.h" +#include "devlink.h" +#include "ptp.h" +#include "lib/tout.h" + +static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state) +{ + int outlen = MLX5_ST_SZ_BYTES(query_rq_out); + void *out; + void *rqc; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev, rqn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *state = MLX5_GET(rqc, rqc, state); + +out: + kvfree(out); + return err; +} + +static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq) +{ + struct mlx5_core_dev *dev = icosq->channel->mdev; + unsigned long exp_time; + + exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR)); + + while (time_before(jiffies, exp_time)) { + if (icosq->cc == icosq->pc) + return 0; + + msleep(20); + } + + netdev_err(icosq->channel->netdev, + "Wait for ICOSQ 0x%x flush timeout (cc = 0x%x, pc = 0x%x)\n", + icosq->sqn, icosq->cc, icosq->pc); + + return -ETIMEDOUT; +} + +static void mlx5e_reset_icosq_cc_pc(struct mlx5e_icosq *icosq) +{ + WARN_ONCE(icosq->cc != icosq->pc, "ICOSQ 0x%x: cc (0x%x) != pc (0x%x)\n", + icosq->sqn, icosq->cc, icosq->pc); + icosq->cc = 0; + icosq->pc = 0; +} + +static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx) +{ + struct mlx5e_rq *xskrq = NULL; + struct mlx5_core_dev *mdev; + struct mlx5e_icosq *icosq; + struct net_device *dev; + struct mlx5e_rq *rq; + u8 state; + int err; + + icosq = ctx; + + mutex_lock(&icosq->channel->icosq_recovery_lock); + + /* mlx5e_close_rq cancels this work before RQ and ICOSQ are killed. */ + rq = &icosq->channel->rq; + if (test_bit(MLX5E_RQ_STATE_ENABLED, &icosq->channel->xskrq.state)) + xskrq = &icosq->channel->xskrq; + mdev = icosq->channel->mdev; + dev = icosq->channel->netdev; + err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state); + if (err) { + netdev_err(dev, "Failed to query ICOSQ 0x%x state. err = %d\n", + icosq->sqn, err); + goto out; + } + + if (state != MLX5_SQC_STATE_ERR) + goto out; + + mlx5e_deactivate_rq(rq); + if (xskrq) + mlx5e_deactivate_rq(xskrq); + + synchronize_net(); /* Sync with NAPI. */ + err = mlx5e_wait_for_icosq_flush(icosq); + if (err) + goto out; + + mlx5e_deactivate_icosq(icosq); + synchronize_net(); /* Sync with NAPI. 
*/ + + /* At this point, both the rq and the icosq are disabled */ + + err = mlx5e_health_sq_to_ready(mdev, dev, icosq->sqn); + if (err) + goto out; + + mlx5e_reset_icosq_cc_pc(icosq); + + mlx5e_free_rx_in_progress_descs(rq); + if (xskrq) + mlx5e_free_rx_in_progress_descs(xskrq); + + clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); + mlx5e_activate_icosq(icosq); + + mlx5e_activate_rq(rq); + rq->stats->recover++; + + if (xskrq) { + mlx5e_activate_rq(xskrq); + xskrq->stats->recover++; + } + + mlx5e_trigger_napi_icosq(icosq->channel); + + mutex_unlock(&icosq->channel->icosq_recovery_lock); + + return 0; +out: + clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); + mutex_unlock(&icosq->channel->icosq_recovery_lock); + return err; +} + +static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state) +{ + struct net_device *dev = rq->netdev; + int err; + + err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST); + if (err) { + netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn); + return err; + } + err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) { + netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn); + return err; + } + + return 0; +} + +static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx) +{ + struct mlx5e_rq *rq = ctx; + int err; + + mlx5e_deactivate_rq(rq); + synchronize_net(); /* Sync with NAPI. */ + mlx5e_free_rx_descs(rq); + + err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR); + if (err) + goto out; + + clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); + mlx5e_activate_rq(rq); + rq->stats->recover++; + if (rq->channel) + mlx5e_trigger_napi_icosq(rq->channel); + else + mlx5e_trigger_napi_sched(rq->cq.napi); + return 0; +out: + clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); + return err; +} + +static int mlx5e_rx_reporter_timeout_recover(void *ctx) +{ + struct mlx5_eq_comp *eq; + struct mlx5e_rq *rq; + int err; + + rq = ctx; + eq = rq->cq.mcq.eq; + + err = mlx5e_health_channel_eq_recover(rq->netdev, eq, rq->cq.ch_stats); + if (err && rq->icosq) + clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state); + + return err; +} + +static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) +{ + return err_ctx->recover(err_ctx->ctx); +} + +static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter, + void *context, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_err_ctx *err_ctx = context; + + return err_ctx ? 
mlx5e_rx_reporter_recover_from_ctx(err_ctx) : + mlx5e_health_recover_channels(priv); +} + +static int mlx5e_reporter_icosq_diagnose(struct mlx5e_icosq *icosq, u8 hw_state, + struct devlink_fmsg *fmsg) +{ + int err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "ICOSQ"); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "sqn", icosq->sqn); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "HW state", hw_state); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "cc", icosq->cc); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "pc", icosq->pc); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "WQE size", + mlx5_wq_cyc_get_size(&icosq->wq)); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ"); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "cqn", icosq->cq.mcq.cqn); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "cc", icosq->cq.wq.cc); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "size", mlx5_cqwq_get_size(&icosq->cq.wq)); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int +mlx5e_rx_reporter_build_diagnose_output_rq_common(struct mlx5e_rq *rq, + struct devlink_fmsg *fmsg) +{ + u16 wqe_counter; + int wqes_sz; + u8 hw_state; + u16 wq_head; + int err; + + err = mlx5e_query_rq_state(rq->mdev, rq->rqn, &hw_state); + if (err) + return err; + + wqes_sz = mlx5e_rqwq_get_cur_sz(rq); + wq_head = mlx5e_rqwq_get_head(rq); + wqe_counter = mlx5e_rqwq_get_wqe_counter(rq); + + err = devlink_fmsg_u32_pair_put(fmsg, "rqn", rq->rqn); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "HW state", hw_state); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "SW state", rq->state); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "WQE counter", wqe_counter); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "posted WQEs", wqes_sz); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "cc", wq_head); + if (err) + return err; + + err = mlx5e_health_cq_diag_fmsg(&rq->cq, fmsg); + if (err) + return err; + + err = mlx5e_health_eq_diag_fmsg(rq->cq.mcq.eq, fmsg); + if (err) + return err; + + if (rq->icosq) { + struct mlx5e_icosq *icosq = rq->icosq; + u8 icosq_hw_state; + + err = mlx5_core_query_sq_state(rq->mdev, icosq->sqn, &icosq_hw_state); + if (err) + return err; + + err = mlx5e_reporter_icosq_diagnose(icosq, icosq_hw_state, fmsg); + if (err) + return err; + } + + return 0; +} + +static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq, + struct devlink_fmsg *fmsg) +{ + int err; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", rq->ix); + if (err) + return err; + + err = mlx5e_rx_reporter_build_diagnose_output_rq_common(rq, fmsg); + if (err) + return err; + + return devlink_fmsg_obj_nest_end(fmsg); +} + +static int mlx5e_rx_reporter_diagnose_generic_rq(struct mlx5e_rq *rq, + struct devlink_fmsg *fmsg) +{ + struct mlx5e_priv *priv = rq->priv; + struct mlx5e_params *params; + u32 rq_stride, rq_sz; + bool real_time; + int err; + + params = &priv->channels.params; + rq_sz = mlx5e_rqwq_get_size(rq); + real_time = mlx5_is_real_time_rq(priv->mdev); + rq_stride = BIT(mlx5e_mpwqe_get_log_stride_size(priv->mdev, params, NULL)); + + err = 
mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RQ"); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "type", params->rq_wq_type); + if (err) + return err; + + err = devlink_fmsg_u64_pair_put(fmsg, "stride size", rq_stride); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "size", rq_sz); + if (err) + return err; + + err = devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? "RT" : "FRC"); + if (err) + return err; + + err = mlx5e_health_cq_common_diag_fmsg(&rq->cq, fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int +mlx5e_rx_reporter_diagnose_common_ptp_config(struct mlx5e_priv *priv, struct mlx5e_ptp *ptp_ch, + struct devlink_fmsg *fmsg) +{ + int err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "filter_type", priv->tstamp.rx_filter); + if (err) + return err; + + err = mlx5e_rx_reporter_diagnose_generic_rq(&ptp_ch->rq, fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int +mlx5e_rx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_rq *generic_rq = &priv->channels.c[0]->rq; + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; + int err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common config"); + if (err) + return err; + + err = mlx5e_rx_reporter_diagnose_generic_rq(generic_rq, fmsg); + if (err) + return err; + + if (ptp_ch && test_bit(MLX5E_PTP_STATE_RX, ptp_ch->state)) { + err = mlx5e_rx_reporter_diagnose_common_ptp_config(priv, ptp_ch, fmsg); + if (err) + return err; + } + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int mlx5e_rx_reporter_build_diagnose_output_ptp_rq(struct mlx5e_rq *rq, + struct devlink_fmsg *fmsg) +{ + int err; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp"); + if (err) + return err; + + err = mlx5e_rx_reporter_build_diagnose_output_rq_common(rq, fmsg); + if (err) + return err; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + + return 0; +} + +static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; + int i, err = 0; + + mutex_lock(&priv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + err = mlx5e_rx_reporter_diagnose_common_config(reporter, fmsg); + if (err) + goto unlock; + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "RQs"); + if (err) + goto unlock; + + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_rq *rq = &priv->channels.c[i]->rq; + + err = mlx5e_rx_reporter_build_diagnose_output(rq, fmsg); + if (err) + goto unlock; + } + if (ptp_ch && test_bit(MLX5E_PTP_STATE_RX, ptp_ch->state)) { + err = mlx5e_rx_reporter_build_diagnose_output_ptp_rq(&ptp_ch->rq, fmsg); + if (err) + goto unlock; + } + err = devlink_fmsg_arr_pair_nest_end(fmsg); +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_rx_reporter_dump_icosq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, + void *ctx) +{ + struct mlx5e_txqsq *icosq = ctx; + struct mlx5_rsc_key key = {}; + int err; + + if (!test_bit(MLX5E_STATE_OPENED, 
&priv->state)) + return 0; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); + if (err) + return err; + + key.size = PAGE_SIZE; + key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "ICOSQ"); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); + if (err) + return err; + + key.rsc = MLX5_SGMT_TYPE_FULL_QPC; + key.index1 = icosq->sqn; + key.num_of_obj1 = 1; + + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); + if (err) + return err; + + key.rsc = MLX5_SGMT_TYPE_SND_BUFF; + key.num_of_obj2 = MLX5_RSC_DUMP_ALL; + + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int mlx5e_rx_reporter_dump_rq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, + void *ctx) +{ + struct mlx5_rsc_key key = {}; + struct mlx5e_rq *rq = ctx; + int err; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return 0; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RX Slice"); + if (err) + return err; + + key.size = PAGE_SIZE; + key.rsc = MLX5_SGMT_TYPE_RX_SLICE_ALL; + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RQ"); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); + if (err) + return err; + + key.rsc = MLX5_SGMT_TYPE_FULL_QPC; + key.index1 = rq->rqn; + key.num_of_obj1 = 1; + + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "receive_buff"); + if (err) + return err; + + key.rsc = MLX5_SGMT_TYPE_RCV_BUFF; + key.num_of_obj2 = MLX5_RSC_DUMP_ALL; + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int mlx5e_rx_reporter_dump_all_rqs(struct mlx5e_priv *priv, + struct devlink_fmsg *fmsg) +{ + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; + struct mlx5_rsc_key key = {}; + int i, err; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return 0; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "RX Slice"); + if (err) + return err; + + key.size = PAGE_SIZE; + key.rsc = MLX5_SGMT_TYPE_RX_SLICE_ALL; + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "RQs"); + if (err) + return err; + + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_rq *rq = &priv->channels.c[i]->rq; + + err = mlx5e_health_queue_dump(priv, fmsg, rq->rqn, "RQ"); + if (err) + return err; + } + + if (ptp_ch && test_bit(MLX5E_PTP_STATE_RX, ptp_ch->state)) { + err = mlx5e_health_queue_dump(priv, fmsg, ptp_ch->rq.rqn, "PTP RQ"); + if (err) + return err; + } + 
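+	/* Close the "RQs" array opened above before returning. */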
+ return devlink_fmsg_arr_pair_nest_end(fmsg); +} + +static int mlx5e_rx_reporter_dump_from_ctx(struct mlx5e_priv *priv, + struct mlx5e_err_ctx *err_ctx, + struct devlink_fmsg *fmsg) +{ + return err_ctx->dump(priv, fmsg, err_ctx->ctx); +} + +static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *context, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_err_ctx *err_ctx = context; + + return err_ctx ? mlx5e_rx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : + mlx5e_rx_reporter_dump_all_rqs(priv, fmsg); +} + +void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) +{ + char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {}; + char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; + struct mlx5e_icosq *icosq = rq->icosq; + struct mlx5e_priv *priv = rq->priv; + struct mlx5e_err_ctx err_ctx = {}; + + err_ctx.ctx = rq; + err_ctx.recover = mlx5e_rx_reporter_timeout_recover; + err_ctx.dump = mlx5e_rx_reporter_dump_rq; + + if (icosq) + snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn); + snprintf(err_str, sizeof(err_str), + "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", + rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); + + mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); +} + +void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq) +{ + char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; + struct mlx5e_priv *priv = rq->priv; + struct mlx5e_err_ctx err_ctx = {}; + + err_ctx.ctx = rq; + err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover; + err_ctx.dump = mlx5e_rx_reporter_dump_rq; + snprintf(err_str, sizeof(err_str), "ERR CQE on RQ: 0x%x", rq->rqn); + + mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); +} + +void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq) +{ + struct mlx5e_priv *priv = icosq->channel->priv; + char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; + struct mlx5e_err_ctx err_ctx = {}; + + err_ctx.ctx = icosq; + err_ctx.recover = mlx5e_rx_reporter_err_icosq_cqe_recover; + err_ctx.dump = mlx5e_rx_reporter_dump_icosq; + snprintf(err_str, sizeof(err_str), "ERR CQE on ICOSQ: 0x%x", icosq->sqn); + + mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); +} + +void mlx5e_reporter_icosq_suspend_recovery(struct mlx5e_channel *c) +{ + mutex_lock(&c->icosq_recovery_lock); +} + +void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c) +{ + mutex_unlock(&c->icosq_recovery_lock); +} + +static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { + .name = "rx", + .recover = mlx5e_rx_reporter_recover, + .diagnose = mlx5e_rx_reporter_diagnose, + .dump = mlx5e_rx_reporter_dump, +}; + +#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 + +void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) +{ + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); + struct devlink_health_reporter *reporter; + + reporter = devlink_port_health_reporter_create(dl_port, &mlx5_rx_reporter_ops, + MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); + if (IS_ERR(reporter)) { + netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", + PTR_ERR(reporter)); + return; + } + priv->rx_reporter = reporter; +} + +void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv) +{ + if (!priv->rx_reporter) + return; + + devlink_port_health_reporter_destroy(priv->rx_reporter); + priv->rx_reporter = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c new file mode 100644 index 0000000..9965e5c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -0,0 +1,615 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "health.h" +#include "en/ptp.h" +#include "en/devlink.h" +#include "lib/tout.h" + +static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) +{ + struct mlx5_core_dev *dev = sq->mdev; + unsigned long exp_time; + + exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR)); + + while (time_before(jiffies, exp_time)) { + if (sq->cc == sq->pc) + return 0; + + msleep(20); + } + + netdev_err(sq->netdev, + "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", + sq->sqn, sq->cc, sq->pc); + + return -ETIMEDOUT; +} + +static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) +{ + WARN_ONCE(sq->cc != sq->pc, + "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", + sq->sqn, sq->cc, sq->pc); + sq->cc = 0; + sq->dma_fifo_cc = 0; + sq->pc = 0; +} + +static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) +{ + struct mlx5_core_dev *mdev; + struct net_device *dev; + struct mlx5e_txqsq *sq; + u8 state; + int err; + + sq = ctx; + mdev = sq->mdev; + dev = sq->netdev; + + if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) + return 0; + + err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); + if (err) { + netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", + sq->sqn, err); + goto out; + } + + if (state != MLX5_SQC_STATE_ERR) + goto out; + + mlx5e_tx_disable_queue(sq->txq); + + err = mlx5e_wait_for_sq_flush(sq); + if (err) + goto out; + + /* At this point, no new packets will arrive from the stack as TXQ is + * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all + * pending WQEs. SQ can safely reset the SQ. + */ + + err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn); + if (err) + goto out; + + mlx5e_reset_txqsq_cc_pc(sq); + sq->stats->recover++; + clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + mlx5e_enable_txqsq(sq); + mlx5e_start_txqsq(sq); + + return 0; +out: + clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + return err; +} + +struct mlx5e_tx_timeout_ctx { + struct mlx5e_txqsq *sq; + signed int status; +}; + +static int mlx5e_tx_reporter_timeout_recover(void *ctx) +{ + struct mlx5e_tx_timeout_ctx *to_ctx; + struct mlx5e_priv *priv; + struct mlx5_eq_comp *eq; + struct mlx5e_txqsq *sq; + int err; + + to_ctx = ctx; + sq = to_ctx->sq; + eq = sq->cq.mcq.eq; + priv = sq->priv; + err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); + if (!err) { + to_ctx->status = 0; /* this sq recovered */ + return err; + } + + err = mlx5e_safe_reopen_channels(priv); + if (!err) { + to_ctx->status = 1; /* all channels recovered */ + return err; + } + + to_ctx->status = err; + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + netdev_err(priv->netdev, + "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", + err); + + return err; +} + +/* state lock cannot be grabbed within this function. + * It can cause a dead lock or a read-after-free. 
+ */ +static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) +{ + return err_ctx->recover(err_ctx->ctx); +} + +static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, + void *context, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_err_ctx *err_ctx = context; + + return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : + mlx5e_health_recover_channels(priv); +} + +static int +mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg, + struct mlx5e_txqsq *sq, int tc) +{ + bool stopped = netif_xmit_stopped(sq->txq); + struct mlx5e_priv *priv = sq->priv; + u8 state; + int err; + + err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); + if (err) + return err; + + err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); + if (err) + return err; + + err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); + if (err) + return err; + + err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg); + if (err) + return err; + + return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg); +} + +static int +mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, + struct mlx5e_txqsq *sq, int tc) +{ + int err; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); + if (err) + return err; + + err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc); + if (err) + return err; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + + return 0; +} + +static int +mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg, + struct mlx5e_ptpsq *ptpsq, int tc) +{ + int err; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + + err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp"); + if (err) + return err; + + err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); + if (err) + return err; + + err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + + return 0; +} + +static int +mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg, + struct mlx5e_txqsq *txqsq) +{ + u32 sq_stride, sq_sz; + bool real_time; + int err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); + if (err) + return err; + + real_time = mlx5_is_real_time_sq(txqsq->mdev); + sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq); + sq_stride = MLX5_SEND_WQE_BB; + + err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); + if (err) + return err; + + err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); + if (err) + return err; + + err = devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? 
"RT" : "FRC"); + if (err) + return err; + + err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int +mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg, + struct mlx5e_ptpsq *ptpsq) +{ + int err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS"); + if (err) + return err; + + err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int +mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; + struct mlx5e_ptpsq *generic_ptpsq; + int err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config"); + if (err) + return err; + + err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq); + if (err) + return err; + + if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) + goto out; + + generic_ptpsq = &ptp_ch->ptpsq[0]; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP"); + if (err) + return err; + + err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq); + if (err) + return err; + + err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + +out: + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; + + int i, tc, err = 0; + + mutex_lock(&priv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg); + if (err) + goto unlock; + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); + if (err) + goto unlock; + + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_channel *c = priv->channels.c[i]; + + for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { + struct mlx5e_txqsq *sq = &c->sq[tc]; + + err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); + if (err) + goto unlock; + } + } + + if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) + goto close_sqs_nest; + + for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { + err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg, + &ptp_ch->ptpsq[tc], + tc); + if (err) + goto unlock; + } + +close_sqs_nest: + err = devlink_fmsg_arr_pair_nest_end(fmsg); + if (err) + goto unlock; + +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, + void *ctx) +{ + struct mlx5_rsc_key key = {}; + struct mlx5e_txqsq *sq = ctx; + int err; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return 0; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); + if (err) + return err; + + key.size = PAGE_SIZE; + key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = 
mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ"); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC"); + if (err) + return err; + + key.rsc = MLX5_SGMT_TYPE_FULL_QPC; + key.index1 = sq->sqn; + key.num_of_obj1 = 1; + + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff"); + if (err) + return err; + + key.rsc = MLX5_SGMT_TYPE_SND_BUFF; + key.num_of_obj2 = MLX5_RSC_DUMP_ALL; + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + return mlx5e_health_fmsg_named_obj_nest_end(fmsg); +} + +static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg, + void *ctx) +{ + struct mlx5e_tx_timeout_ctx *to_ctx = ctx; + + return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq); +} + +static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, + struct devlink_fmsg *fmsg) +{ + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; + struct mlx5_rsc_key key = {}; + int i, tc, err; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return 0; + + err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice"); + if (err) + return err; + + key.size = PAGE_SIZE; + key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL; + err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg); + if (err) + return err; + + err = mlx5e_health_fmsg_named_obj_nest_end(fmsg); + if (err) + return err; + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); + if (err) + return err; + + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_channel *c = priv->channels.c[i]; + + for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { + struct mlx5e_txqsq *sq = &c->sq[tc]; + + err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ"); + if (err) + return err; + } + } + + if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) { + for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) { + struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq; + + err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ"); + if (err) + return err; + } + } + + return devlink_fmsg_arr_pair_nest_end(fmsg); +} + +static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv, + struct mlx5e_err_ctx *err_ctx, + struct devlink_fmsg *fmsg) +{ + return err_ctx->dump(priv, fmsg, err_ctx->ctx); +} + +static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *context, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); + struct mlx5e_err_ctx *err_ctx = context; + + return err_ctx ? 
mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg) : + mlx5e_tx_reporter_dump_all_sqs(priv, fmsg); +} + +void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) +{ + char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; + struct mlx5e_priv *priv = sq->priv; + struct mlx5e_err_ctx err_ctx = {}; + + err_ctx.ctx = sq; + err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; + err_ctx.dump = mlx5e_tx_reporter_dump_sq; + snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn); + + mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); +} + +int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) +{ + char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; + struct mlx5e_tx_timeout_ctx to_ctx = {}; + struct mlx5e_priv *priv = sq->priv; + struct mlx5e_err_ctx err_ctx = {}; + + to_ctx.sq = sq; + err_ctx.ctx = &to_ctx; + err_ctx.recover = mlx5e_tx_reporter_timeout_recover; + err_ctx.dump = mlx5e_tx_reporter_timeout_dump; + snprintf(err_str, sizeof(err_str), + "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", + sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, + jiffies_to_usecs(jiffies - READ_ONCE(sq->txq->trans_start))); + + mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); + return to_ctx.status; +} + +static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { + .name = "tx", + .recover = mlx5e_tx_reporter_recover, + .diagnose = mlx5e_tx_reporter_diagnose, + .dump = mlx5e_tx_reporter_dump, +}; + +#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 + +void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) +{ + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); + struct devlink_health_reporter *reporter; + + reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops, + MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); + if (IS_ERR(reporter)) { + netdev_warn(priv->netdev, + "Failed to create tx reporter, err = %ld\n", + PTR_ERR(reporter)); + return; + } + priv->tx_reporter = reporter; +} + +void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) +{ + if (!priv->tx_reporter) + return; + + devlink_port_health_reporter_destroy(priv->tx_reporter); + priv->tx_reporter = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c new file mode 100644 index 0000000..b915fb2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. 
*/ + +#include "rqt.h" +#include + +void mlx5e_rss_params_indir_init_uniform(struct mlx5e_rss_params_indir *indir, + unsigned int num_channels) +{ + unsigned int i; + + for (i = 0; i < MLX5E_INDIR_RQT_SIZE; i++) + indir->table[i] = i % num_channels; +} + +static int mlx5e_rqt_init(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev, + u16 max_size, u32 *init_rqns, u16 init_size) +{ + void *rqtc; + int inlen; + int err; + u32 *in; + int i; + + rqt->mdev = mdev; + rqt->size = max_size; + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * init_size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_max_size, rqt->size); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, init_size); + for (i = 0; i < init_size; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], init_rqns[i]); + + err = mlx5_core_create_rqt(rqt->mdev, in, inlen, &rqt->rqtn); + + kvfree(in); + return err; +} + +int mlx5e_rqt_init_direct(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev, + bool indir_enabled, u32 init_rqn) +{ + u16 max_size = indir_enabled ? MLX5E_INDIR_RQT_SIZE : 1; + + return mlx5e_rqt_init(rqt, mdev, max_size, &init_rqn, 1); +} + +static int mlx5e_bits_invert(unsigned long a, int size) +{ + int inv = 0; + int i; + + for (i = 0; i < size; i++) + inv |= (test_bit(size - i - 1, &a) ? 1 : 0) << i; + + return inv; +} + +static int mlx5e_calc_indir_rqns(u32 *rss_rqns, u32 *rqns, unsigned int num_rqns, + u8 hfunc, struct mlx5e_rss_params_indir *indir) +{ + unsigned int i; + + for (i = 0; i < MLX5E_INDIR_RQT_SIZE; i++) { + unsigned int ix = i; + + if (hfunc == ETH_RSS_HASH_XOR) + ix = mlx5e_bits_invert(ix, ilog2(MLX5E_INDIR_RQT_SIZE)); + + ix = indir->table[ix]; + + if (WARN_ON(ix >= num_rqns)) + /* Could be a bug in the driver or in the kernel part of + * ethtool: indir table refers to non-existent RQs. 
+ */ + return -EINVAL; + rss_rqns[i] = rqns[ix]; + } + + return 0; +} + +int mlx5e_rqt_init_indir(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev, + u32 *rqns, unsigned int num_rqns, + u8 hfunc, struct mlx5e_rss_params_indir *indir) +{ + u32 *rss_rqns; + int err; + + rss_rqns = kvmalloc_array(MLX5E_INDIR_RQT_SIZE, sizeof(*rss_rqns), GFP_KERNEL); + if (!rss_rqns) + return -ENOMEM; + + err = mlx5e_calc_indir_rqns(rss_rqns, rqns, num_rqns, hfunc, indir); + if (err) + goto out; + + err = mlx5e_rqt_init(rqt, mdev, MLX5E_INDIR_RQT_SIZE, rss_rqns, MLX5E_INDIR_RQT_SIZE); + +out: + kvfree(rss_rqns); + return err; +} + +void mlx5e_rqt_destroy(struct mlx5e_rqt *rqt) +{ + mlx5_core_destroy_rqt(rqt->mdev, rqt->rqtn); +} + +static int mlx5e_rqt_redirect(struct mlx5e_rqt *rqt, u32 *rqns, unsigned int size) +{ + unsigned int i; + void *rqtc; + int inlen; + u32 *in; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx); + + MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1); + MLX5_SET(rqtc, rqtc, rqt_actual_size, size); + for (i = 0; i < size; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], rqns[i]); + + err = mlx5_core_modify_rqt(rqt->mdev, rqt->rqtn, in, inlen); + + kvfree(in); + return err; +} + +int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn) +{ + return mlx5e_rqt_redirect(rqt, &rqn, 1); +} + +int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, unsigned int num_rqns, + u8 hfunc, struct mlx5e_rss_params_indir *indir) +{ + u32 *rss_rqns; + int err; + + if (WARN_ON(rqt->size != MLX5E_INDIR_RQT_SIZE)) + return -EINVAL; + + rss_rqns = kvmalloc_array(MLX5E_INDIR_RQT_SIZE, sizeof(*rss_rqns), GFP_KERNEL); + if (!rss_rqns) + return -ENOMEM; + + err = mlx5e_calc_indir_rqns(rss_rqns, rqns, num_rqns, hfunc, indir); + if (err) + goto out; + + err = mlx5e_rqt_redirect(rqt, rss_rqns, MLX5E_INDIR_RQT_SIZE); + +out: + kvfree(rss_rqns); + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h new file mode 100644 index 0000000..60c985a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rqt.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. 
*/ + +#ifndef __MLX5_EN_RQT_H__ +#define __MLX5_EN_RQT_H__ + +#include + +#define MLX5E_INDIR_RQT_SIZE (1 << 8) + +struct mlx5_core_dev; + +struct mlx5e_rss_params_indir { + u32 table[MLX5E_INDIR_RQT_SIZE]; +}; + +void mlx5e_rss_params_indir_init_uniform(struct mlx5e_rss_params_indir *indir, + unsigned int num_channels); + +struct mlx5e_rqt { + struct mlx5_core_dev *mdev; + u32 rqtn; + u16 size; +}; + +int mlx5e_rqt_init_direct(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev, + bool indir_enabled, u32 init_rqn); +int mlx5e_rqt_init_indir(struct mlx5e_rqt *rqt, struct mlx5_core_dev *mdev, + u32 *rqns, unsigned int num_rqns, + u8 hfunc, struct mlx5e_rss_params_indir *indir); +void mlx5e_rqt_destroy(struct mlx5e_rqt *rqt); + +static inline u32 mlx5e_rqt_get_rqtn(struct mlx5e_rqt *rqt) +{ + return rqt->rqtn; +} + +int mlx5e_rqt_redirect_direct(struct mlx5e_rqt *rqt, u32 rqn); +int mlx5e_rqt_redirect_indir(struct mlx5e_rqt *rqt, u32 *rqns, unsigned int num_rqns, + u8 hfunc, struct mlx5e_rss_params_indir *indir); + +#endif /* __MLX5_EN_RQT_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c new file mode 100644 index 0000000..c1cdd8c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.c @@ -0,0 +1,606 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. + +#include "rss.h" + +#define mlx5e_rss_warn(__dev, format, ...) \ + dev_warn((__dev)->device, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +static const struct mlx5e_rss_params_traffic_type rss_default_config[MLX5E_NUM_INDIR_TIRS] = { + [MLX5_TT_IPV4_TCP] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV4, + .l4_prot_type = MLX5_L4_PROT_TYPE_TCP, + .rx_hash_fields = MLX5_HASH_IP_L4PORTS, + }, + [MLX5_TT_IPV6_TCP] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV6, + .l4_prot_type = MLX5_L4_PROT_TYPE_TCP, + .rx_hash_fields = MLX5_HASH_IP_L4PORTS, + }, + [MLX5_TT_IPV4_UDP] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV4, + .l4_prot_type = MLX5_L4_PROT_TYPE_UDP, + .rx_hash_fields = MLX5_HASH_IP_L4PORTS, + }, + [MLX5_TT_IPV6_UDP] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV6, + .l4_prot_type = MLX5_L4_PROT_TYPE_UDP, + .rx_hash_fields = MLX5_HASH_IP_L4PORTS, + }, + [MLX5_TT_IPV4_IPSEC_AH] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV4, + .l4_prot_type = 0, + .rx_hash_fields = MLX5_HASH_IP_IPSEC_SPI, + }, + [MLX5_TT_IPV6_IPSEC_AH] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV6, + .l4_prot_type = 0, + .rx_hash_fields = MLX5_HASH_IP_IPSEC_SPI, + }, + [MLX5_TT_IPV4_IPSEC_ESP] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV4, + .l4_prot_type = 0, + .rx_hash_fields = MLX5_HASH_IP_IPSEC_SPI, + }, + [MLX5_TT_IPV6_IPSEC_ESP] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV6, + .l4_prot_type = 0, + .rx_hash_fields = MLX5_HASH_IP_IPSEC_SPI, + }, + [MLX5_TT_IPV4] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV4, + .l4_prot_type = 0, + .rx_hash_fields = MLX5_HASH_IP, + }, + [MLX5_TT_IPV6] = { + .l3_prot_type = MLX5_L3_PROT_TYPE_IPV6, + .l4_prot_type = 0, + .rx_hash_fields = MLX5_HASH_IP, + }, +}; + +struct mlx5e_rss_params_traffic_type +mlx5e_rss_get_default_tt_config(enum mlx5_traffic_types tt) +{ + return rss_default_config[tt]; +} + +struct mlx5e_rss { + struct mlx5e_rss_params_hash hash; + struct mlx5e_rss_params_indir indir; + u32 rx_hash_fields[MLX5E_NUM_INDIR_TIRS]; + struct mlx5e_tir *tir[MLX5E_NUM_INDIR_TIRS]; + struct mlx5e_tir 
*inner_tir[MLX5E_NUM_INDIR_TIRS]; + struct mlx5e_rqt rqt; + struct mlx5_core_dev *mdev; + u32 drop_rqn; + bool inner_ft_support; + bool enabled; + refcount_t refcnt; +}; + +struct mlx5e_rss *mlx5e_rss_alloc(void) +{ + return kvzalloc(sizeof(struct mlx5e_rss), GFP_KERNEL); +} + +void mlx5e_rss_free(struct mlx5e_rss *rss) +{ + kvfree(rss); +} + +static void mlx5e_rss_params_init(struct mlx5e_rss *rss) +{ + enum mlx5_traffic_types tt; + + rss->hash.hfunc = ETH_RSS_HASH_TOP; + netdev_rss_key_fill(rss->hash.toeplitz_hash_key, + sizeof(rss->hash.toeplitz_hash_key)); + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + rss->rx_hash_fields[tt] = + mlx5e_rss_get_default_tt_config(tt).rx_hash_fields; +} + +static struct mlx5e_tir **rss_get_tirp(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + bool inner) +{ + return inner ? &rss->inner_tir[tt] : &rss->tir[tt]; +} + +static struct mlx5e_tir *rss_get_tir(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + bool inner) +{ + return *rss_get_tirp(rss, tt, inner); +} + +static struct mlx5e_rss_params_traffic_type +mlx5e_rss_get_tt_config(struct mlx5e_rss *rss, enum mlx5_traffic_types tt) +{ + struct mlx5e_rss_params_traffic_type rss_tt; + + rss_tt = mlx5e_rss_get_default_tt_config(tt); + rss_tt.rx_hash_fields = rss->rx_hash_fields[tt]; + return rss_tt; +} + +static int mlx5e_rss_create_tir(struct mlx5e_rss *rss, + enum mlx5_traffic_types tt, + const struct mlx5e_packet_merge_param *init_pkt_merge_param, + bool inner) +{ + struct mlx5e_rss_params_traffic_type rss_tt; + struct mlx5e_tir_builder *builder; + struct mlx5e_tir **tir_p; + struct mlx5e_tir *tir; + u32 rqtn; + int err; + + if (inner && !rss->inner_ft_support) { + mlx5e_rss_warn(rss->mdev, + "Cannot create inner indirect TIR[%d], RSS inner FT is not supported.\n", + tt); + return -EINVAL; + } + + tir_p = rss_get_tirp(rss, tt, inner); + if (*tir_p) + return -EINVAL; + + tir = kvzalloc(sizeof(*tir), GFP_KERNEL); + if (!tir) + return -ENOMEM; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) { + err = -ENOMEM; + goto free_tir; + } + + rqtn = mlx5e_rqt_get_rqtn(&rss->rqt); + mlx5e_tir_builder_build_rqt(builder, rss->mdev->mlx5e_res.hw_objs.td.tdn, + rqtn, rss->inner_ft_support); + mlx5e_tir_builder_build_packet_merge(builder, init_pkt_merge_param); + rss_tt = mlx5e_rss_get_tt_config(rss, tt); + mlx5e_tir_builder_build_rss(builder, &rss->hash, &rss_tt, inner); + + err = mlx5e_tir_init(tir, builder, rss->mdev, true); + mlx5e_tir_builder_free(builder); + if (err) { + mlx5e_rss_warn(rss->mdev, "Failed to create %sindirect TIR: err = %d, tt = %d\n", + inner ? 
"inner " : "", err, tt); + goto free_tir; + } + + *tir_p = tir; + return 0; + +free_tir: + kvfree(tir); + return err; +} + +static void mlx5e_rss_destroy_tir(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + bool inner) +{ + struct mlx5e_tir **tir_p; + struct mlx5e_tir *tir; + + tir_p = rss_get_tirp(rss, tt, inner); + if (!*tir_p) + return; + + tir = *tir_p; + mlx5e_tir_destroy(tir); + kvfree(tir); + *tir_p = NULL; +} + +static int mlx5e_rss_create_tirs(struct mlx5e_rss *rss, + const struct mlx5e_packet_merge_param *init_pkt_merge_param, + bool inner) +{ + enum mlx5_traffic_types tt, max_tt; + int err; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + err = mlx5e_rss_create_tir(rss, tt, init_pkt_merge_param, inner); + if (err) + goto err_destroy_tirs; + } + + return 0; + +err_destroy_tirs: + max_tt = tt; + for (tt = 0; tt < max_tt; tt++) + mlx5e_rss_destroy_tir(rss, tt, inner); + return err; +} + +static void mlx5e_rss_destroy_tirs(struct mlx5e_rss *rss, bool inner) +{ + enum mlx5_traffic_types tt; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + mlx5e_rss_destroy_tir(rss, tt, inner); +} + +static int mlx5e_rss_update_tir(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + bool inner) +{ + struct mlx5e_rss_params_traffic_type rss_tt; + struct mlx5e_tir_builder *builder; + struct mlx5e_tir *tir; + int err; + + tir = rss_get_tir(rss, tt, inner); + if (!tir) + return 0; + + builder = mlx5e_tir_builder_alloc(true); + if (!builder) + return -ENOMEM; + + rss_tt = mlx5e_rss_get_tt_config(rss, tt); + + mlx5e_tir_builder_build_rss(builder, &rss->hash, &rss_tt, inner); + err = mlx5e_tir_modify(tir, builder); + + mlx5e_tir_builder_free(builder); + return err; +} + +static int mlx5e_rss_update_tirs(struct mlx5e_rss *rss) +{ + enum mlx5_traffic_types tt; + int err, retval; + + retval = 0; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + err = mlx5e_rss_update_tir(rss, tt, false); + if (err) { + retval = retval ? : err; + mlx5e_rss_warn(rss->mdev, + "Failed to update RSS hash of indirect TIR for traffic type %d: err = %d\n", + tt, err); + } + + if (!rss->inner_ft_support) + continue; + + err = mlx5e_rss_update_tir(rss, tt, true); + if (err) { + retval = retval ? 
: err; + mlx5e_rss_warn(rss->mdev, + "Failed to update RSS hash of inner indirect TIR for traffic type %d: err = %d\n", + tt, err); + } + } + return retval; +} + +int mlx5e_rss_init_no_tirs(struct mlx5e_rss *rss, struct mlx5_core_dev *mdev, + bool inner_ft_support, u32 drop_rqn) +{ + rss->mdev = mdev; + rss->inner_ft_support = inner_ft_support; + rss->drop_rqn = drop_rqn; + + mlx5e_rss_params_init(rss); + refcount_set(&rss->refcnt, 1); + + return mlx5e_rqt_init_direct(&rss->rqt, mdev, true, drop_rqn); +} + +int mlx5e_rss_init(struct mlx5e_rss *rss, struct mlx5_core_dev *mdev, + bool inner_ft_support, u32 drop_rqn, + const struct mlx5e_packet_merge_param *init_pkt_merge_param) +{ + int err; + + err = mlx5e_rss_init_no_tirs(rss, mdev, inner_ft_support, drop_rqn); + if (err) + goto err_out; + + err = mlx5e_rss_create_tirs(rss, init_pkt_merge_param, false); + if (err) + goto err_destroy_rqt; + + if (inner_ft_support) { + err = mlx5e_rss_create_tirs(rss, init_pkt_merge_param, true); + if (err) + goto err_destroy_tirs; + } + + return 0; + +err_destroy_tirs: + mlx5e_rss_destroy_tirs(rss, false); +err_destroy_rqt: + mlx5e_rqt_destroy(&rss->rqt); +err_out: + return err; +} + +int mlx5e_rss_cleanup(struct mlx5e_rss *rss) +{ + if (!refcount_dec_if_one(&rss->refcnt)) + return -EBUSY; + + mlx5e_rss_destroy_tirs(rss, false); + + if (rss->inner_ft_support) + mlx5e_rss_destroy_tirs(rss, true); + + mlx5e_rqt_destroy(&rss->rqt); + + return 0; +} + +void mlx5e_rss_refcnt_inc(struct mlx5e_rss *rss) +{ + refcount_inc(&rss->refcnt); +} + +void mlx5e_rss_refcnt_dec(struct mlx5e_rss *rss) +{ + refcount_dec(&rss->refcnt); +} + +unsigned int mlx5e_rss_refcnt_read(struct mlx5e_rss *rss) +{ + return refcount_read(&rss->refcnt); +} + +u32 mlx5e_rss_get_tirn(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + bool inner) +{ + struct mlx5e_tir *tir; + + WARN_ON(inner && !rss->inner_ft_support); + tir = rss_get_tir(rss, tt, inner); + WARN_ON(!tir); + + return mlx5e_tir_get_tirn(tir); +} + +/* Fill the "tirn" output parameter. + * Create the requested TIR if it's its first usage. 
+ */ +int mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss, + enum mlx5_traffic_types tt, + const struct mlx5e_packet_merge_param *init_pkt_merge_param, + bool inner, u32 *tirn) +{ + struct mlx5e_tir *tir; + + tir = rss_get_tir(rss, tt, inner); + if (!tir) { /* TIR doesn't exist, create one */ + int err; + + err = mlx5e_rss_create_tir(rss, tt, init_pkt_merge_param, inner); + if (err) + return err; + tir = rss_get_tir(rss, tt, inner); + } + + *tirn = mlx5e_tir_get_tirn(tir); + return 0; +} + +static int mlx5e_rss_apply(struct mlx5e_rss *rss, u32 *rqns, unsigned int num_rqns) +{ + int err; + + err = mlx5e_rqt_redirect_indir(&rss->rqt, rqns, num_rqns, rss->hash.hfunc, &rss->indir); + if (err) + mlx5e_rss_warn(rss->mdev, "Failed to redirect RQT %#x to channels: err = %d\n", + mlx5e_rqt_get_rqtn(&rss->rqt), err); + return err; +} + +void mlx5e_rss_enable(struct mlx5e_rss *rss, u32 *rqns, unsigned int num_rqns) +{ + rss->enabled = true; + mlx5e_rss_apply(rss, rqns, num_rqns); +} + +void mlx5e_rss_disable(struct mlx5e_rss *rss) +{ + int err; + + rss->enabled = false; + err = mlx5e_rqt_redirect_direct(&rss->rqt, rss->drop_rqn); + if (err) + mlx5e_rss_warn(rss->mdev, "Failed to redirect RQT %#x to drop RQ %#x: err = %d\n", + mlx5e_rqt_get_rqtn(&rss->rqt), rss->drop_rqn, err); +} + +int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss, + struct mlx5e_packet_merge_param *pkt_merge_param) +{ + struct mlx5e_tir_builder *builder; + enum mlx5_traffic_types tt; + int err, final_err; + + builder = mlx5e_tir_builder_alloc(true); + if (!builder) + return -ENOMEM; + + mlx5e_tir_builder_build_packet_merge(builder, pkt_merge_param); + + final_err = 0; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + struct mlx5e_tir *tir; + + tir = rss_get_tir(rss, tt, false); + if (!tir) + goto inner_tir; + err = mlx5e_tir_modify(tir, builder); + if (err) { + mlx5e_rss_warn(rss->mdev, "Failed to update LRO state of indirect TIR %#x for traffic type %d: err = %d\n", + mlx5e_tir_get_tirn(tir), tt, err); + if (!final_err) + final_err = err; + } + +inner_tir: + if (!rss->inner_ft_support) + continue; + + tir = rss_get_tir(rss, tt, true); + if (!tir) + continue; + err = mlx5e_tir_modify(tir, builder); + if (err) { + mlx5e_rss_warn(rss->mdev, "Failed to update LRO state of inner indirect TIR %#x for traffic type %d: err = %d\n", + mlx5e_tir_get_tirn(tir), tt, err); + if (!final_err) + final_err = err; + } + } + + mlx5e_tir_builder_free(builder); + return final_err; +} + +int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc) +{ + unsigned int i; + + if (indir) + for (i = 0; i < MLX5E_INDIR_RQT_SIZE; i++) + indir[i] = rss->indir.table[i]; + + if (key) + memcpy(key, rss->hash.toeplitz_hash_key, + sizeof(rss->hash.toeplitz_hash_key)); + + if (hfunc) + *hfunc = rss->hash.hfunc; + + return 0; +} + +int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir, + const u8 *key, const u8 *hfunc, + u32 *rqns, unsigned int num_rqns) +{ + bool changed_indir = false; + bool changed_hash = false; + struct mlx5e_rss *old_rss; + int err = 0; + + old_rss = mlx5e_rss_alloc(); + if (!old_rss) + return -ENOMEM; + + *old_rss = *rss; + + if (hfunc && *hfunc != rss->hash.hfunc) { + switch (*hfunc) { + case ETH_RSS_HASH_XOR: + case ETH_RSS_HASH_TOP: + break; + default: + err = -EINVAL; + goto out; + } + changed_hash = true; + changed_indir = true; + rss->hash.hfunc = *hfunc; + } + + if (key) { + if (rss->hash.hfunc == ETH_RSS_HASH_TOP) + changed_hash = true; + memcpy(rss->hash.toeplitz_hash_key, key, + 
sizeof(rss->hash.toeplitz_hash_key)); + } + + if (indir) { + unsigned int i; + + changed_indir = true; + + for (i = 0; i < MLX5E_INDIR_RQT_SIZE; i++) + rss->indir.table[i] = indir[i]; + } + + if (changed_indir && rss->enabled) { + err = mlx5e_rss_apply(rss, rqns, num_rqns); + if (err) { + *rss = *old_rss; + goto out; + } + } + + if (changed_hash) + mlx5e_rss_update_tirs(rss); + +out: + mlx5e_rss_free(old_rss); + return err; +} + +struct mlx5e_rss_params_hash mlx5e_rss_get_hash(struct mlx5e_rss *rss) +{ + return rss->hash; +} + +u8 mlx5e_rss_get_hash_fields(struct mlx5e_rss *rss, enum mlx5_traffic_types tt) +{ + return rss->rx_hash_fields[tt]; +} + +int mlx5e_rss_set_hash_fields(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + u8 rx_hash_fields) +{ + u8 old_rx_hash_fields; + int err; + + old_rx_hash_fields = rss->rx_hash_fields[tt]; + + if (old_rx_hash_fields == rx_hash_fields) + return 0; + + rss->rx_hash_fields[tt] = rx_hash_fields; + + err = mlx5e_rss_update_tir(rss, tt, false); + if (err) { + rss->rx_hash_fields[tt] = old_rx_hash_fields; + mlx5e_rss_warn(rss->mdev, + "Failed to update RSS hash fields of indirect TIR for traffic type %d: err = %d\n", + tt, err); + return err; + } + + if (!(rss->inner_ft_support)) + return 0; + + err = mlx5e_rss_update_tir(rss, tt, true); + if (err) { + /* Partial update happened. Try to revert - it may fail too, but + * there is nothing more we can do. + */ + rss->rx_hash_fields[tt] = old_rx_hash_fields; + mlx5e_rss_warn(rss->mdev, + "Failed to update RSS hash fields of inner indirect TIR for traffic type %d: err = %d\n", + tt, err); + if (mlx5e_rss_update_tir(rss, tt, false)) + mlx5e_rss_warn(rss->mdev, + "Partial update of RSS hash fields happened: failed to revert indirect TIR for traffic type %d to the old values\n", + tt); + } + + return err; +} + +void mlx5e_rss_set_indir_uniform(struct mlx5e_rss *rss, unsigned int nch) +{ + mlx5e_rss_params_indir_init_uniform(&rss->indir, nch); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h new file mode 100644 index 0000000..c6b2164 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rss.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
*/ + +#ifndef __MLX5_EN_RSS_H__ +#define __MLX5_EN_RSS_H__ + +#include "rqt.h" +#include "tir.h" +#include "fs.h" + +struct mlx5e_rss_params_traffic_type +mlx5e_rss_get_default_tt_config(enum mlx5_traffic_types tt); + +struct mlx5e_rss; + +struct mlx5e_rss *mlx5e_rss_alloc(void); +void mlx5e_rss_free(struct mlx5e_rss *rss); +int mlx5e_rss_init(struct mlx5e_rss *rss, struct mlx5_core_dev *mdev, + bool inner_ft_support, u32 drop_rqn, + const struct mlx5e_packet_merge_param *init_pkt_merge_param); +int mlx5e_rss_init_no_tirs(struct mlx5e_rss *rss, struct mlx5_core_dev *mdev, + bool inner_ft_support, u32 drop_rqn); +int mlx5e_rss_cleanup(struct mlx5e_rss *rss); + +void mlx5e_rss_refcnt_inc(struct mlx5e_rss *rss); +void mlx5e_rss_refcnt_dec(struct mlx5e_rss *rss); +unsigned int mlx5e_rss_refcnt_read(struct mlx5e_rss *rss); + +u32 mlx5e_rss_get_tirn(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + bool inner); +int mlx5e_rss_obtain_tirn(struct mlx5e_rss *rss, + enum mlx5_traffic_types tt, + const struct mlx5e_packet_merge_param *init_pkt_merge_param, + bool inner, u32 *tirn); + +void mlx5e_rss_enable(struct mlx5e_rss *rss, u32 *rqns, unsigned int num_rqns); +void mlx5e_rss_disable(struct mlx5e_rss *rss); + +int mlx5e_rss_packet_merge_set_param(struct mlx5e_rss *rss, + struct mlx5e_packet_merge_param *pkt_merge_param); +int mlx5e_rss_get_rxfh(struct mlx5e_rss *rss, u32 *indir, u8 *key, u8 *hfunc); +int mlx5e_rss_set_rxfh(struct mlx5e_rss *rss, const u32 *indir, + const u8 *key, const u8 *hfunc, + u32 *rqns, unsigned int num_rqns); +struct mlx5e_rss_params_hash mlx5e_rss_get_hash(struct mlx5e_rss *rss); +u8 mlx5e_rss_get_hash_fields(struct mlx5e_rss *rss, enum mlx5_traffic_types tt); +int mlx5e_rss_set_hash_fields(struct mlx5e_rss *rss, enum mlx5_traffic_types tt, + u8 rx_hash_fields); +void mlx5e_rss_set_indir_uniform(struct mlx5e_rss *rss, unsigned int nch); +#endif /* __MLX5_EN_RSS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c new file mode 100644 index 0000000..24c32f7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c @@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. 
*/ + +#include "rx_res.h" +#include "channels.h" +#include "params.h" + +#define MLX5E_MAX_NUM_RSS 16 + +struct mlx5e_rx_res { + struct mlx5_core_dev *mdev; + enum mlx5e_rx_res_features features; + unsigned int max_nch; + u32 drop_rqn; + + struct mlx5e_packet_merge_param pkt_merge_param; + struct rw_semaphore pkt_merge_param_sem; + + struct mlx5e_rss *rss[MLX5E_MAX_NUM_RSS]; + bool rss_active; + u32 rss_rqns[MLX5E_INDIR_RQT_SIZE]; + unsigned int rss_nch; + + struct { + struct mlx5e_rqt direct_rqt; + struct mlx5e_tir direct_tir; + struct mlx5e_rqt xsk_rqt; + struct mlx5e_tir xsk_tir; + } *channels; + + struct { + struct mlx5e_rqt rqt; + struct mlx5e_tir tir; + } ptp; +}; + +/* API for rx_res_rss_* */ + +static int mlx5e_rx_res_rss_init_def(struct mlx5e_rx_res *res, + unsigned int init_nch) +{ + bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_rss *rss; + int err; + + if (WARN_ON(res->rss[0])) + return -EINVAL; + + rss = mlx5e_rss_alloc(); + if (!rss) + return -ENOMEM; + + err = mlx5e_rss_init(rss, res->mdev, inner_ft_support, res->drop_rqn, + &res->pkt_merge_param); + if (err) + goto err_rss_free; + + mlx5e_rss_set_indir_uniform(rss, init_nch); + + res->rss[0] = rss; + + return 0; + +err_rss_free: + mlx5e_rss_free(rss); + return err; +} + +int mlx5e_rx_res_rss_init(struct mlx5e_rx_res *res, u32 *rss_idx, unsigned int init_nch) +{ + bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_rss *rss; + int err, i; + + for (i = 1; i < MLX5E_MAX_NUM_RSS; i++) + if (!res->rss[i]) + break; + + if (i == MLX5E_MAX_NUM_RSS) + return -ENOSPC; + + rss = mlx5e_rss_alloc(); + if (!rss) + return -ENOMEM; + + err = mlx5e_rss_init_no_tirs(rss, res->mdev, inner_ft_support, res->drop_rqn); + if (err) + goto err_rss_free; + + mlx5e_rss_set_indir_uniform(rss, init_nch); + if (res->rss_active) + mlx5e_rss_enable(rss, res->rss_rqns, res->rss_nch); + + res->rss[i] = rss; + *rss_idx = i; + + return 0; + +err_rss_free: + mlx5e_rss_free(rss); + return err; +} + +static int __mlx5e_rx_res_rss_destroy(struct mlx5e_rx_res *res, u32 rss_idx) +{ + struct mlx5e_rss *rss = res->rss[rss_idx]; + int err; + + err = mlx5e_rss_cleanup(rss); + if (err) + return err; + + mlx5e_rss_free(rss); + res->rss[rss_idx] = NULL; + + return 0; +} + +int mlx5e_rx_res_rss_destroy(struct mlx5e_rx_res *res, u32 rss_idx) +{ + struct mlx5e_rss *rss; + + if (rss_idx >= MLX5E_MAX_NUM_RSS) + return -EINVAL; + + rss = res->rss[rss_idx]; + if (!rss) + return -EINVAL; + + return __mlx5e_rx_res_rss_destroy(res, rss_idx); +} + +static void mlx5e_rx_res_rss_destroy_all(struct mlx5e_rx_res *res) +{ + int i; + + for (i = 0; i < MLX5E_MAX_NUM_RSS; i++) { + struct mlx5e_rss *rss = res->rss[i]; + int err; + + if (!rss) + continue; + + err = __mlx5e_rx_res_rss_destroy(res, i); + if (err) { + unsigned int refcount; + + refcount = mlx5e_rss_refcnt_read(rss); + mlx5_core_warn(res->mdev, + "Failed to destroy RSS context %d, refcount = %u, err = %d\n", + i, refcount, err); + } + } +} + +static void mlx5e_rx_res_rss_enable(struct mlx5e_rx_res *res) +{ + int i; + + res->rss_active = true; + + for (i = 0; i < MLX5E_MAX_NUM_RSS; i++) { + struct mlx5e_rss *rss = res->rss[i]; + + if (!rss) + continue; + mlx5e_rss_enable(rss, res->rss_rqns, res->rss_nch); + } +} + +static void mlx5e_rx_res_rss_disable(struct mlx5e_rx_res *res) +{ + int i; + + res->rss_active = false; + + for (i = 0; i < MLX5E_MAX_NUM_RSS; i++) { + struct mlx5e_rss *rss = res->rss[i]; + + if (!rss) + continue; + mlx5e_rss_disable(rss); + } +} 
+ +/* Updates the indirection table SW shadow, does not update the HW resources yet */ +void mlx5e_rx_res_rss_set_indir_uniform(struct mlx5e_rx_res *res, unsigned int nch) +{ + WARN_ON_ONCE(res->rss_active); + mlx5e_rss_set_indir_uniform(res->rss[0], nch); +} + +int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, + u32 *indir, u8 *key, u8 *hfunc) +{ + struct mlx5e_rss *rss; + + if (rss_idx >= MLX5E_MAX_NUM_RSS) + return -EINVAL; + + rss = res->rss[rss_idx]; + if (!rss) + return -ENOENT; + + return mlx5e_rss_get_rxfh(rss, indir, key, hfunc); +} + +int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, + const u32 *indir, const u8 *key, const u8 *hfunc) +{ + struct mlx5e_rss *rss; + + if (rss_idx >= MLX5E_MAX_NUM_RSS) + return -EINVAL; + + rss = res->rss[rss_idx]; + if (!rss) + return -ENOENT; + + return mlx5e_rss_set_rxfh(rss, indir, key, hfunc, res->rss_rqns, res->rss_nch); +} + +u8 mlx5e_rx_res_rss_get_hash_fields(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt) +{ + struct mlx5e_rss *rss = res->rss[0]; + + return mlx5e_rss_get_hash_fields(rss, tt); +} + +int mlx5e_rx_res_rss_set_hash_fields(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt, + u8 rx_hash_fields) +{ + struct mlx5e_rss *rss = res->rss[0]; + + return mlx5e_rss_set_hash_fields(rss, tt, rx_hash_fields); +} + +int mlx5e_rx_res_rss_cnt(struct mlx5e_rx_res *res) +{ + int i, cnt; + + cnt = 0; + for (i = 0; i < MLX5E_MAX_NUM_RSS; i++) + if (res->rss[i]) + cnt++; + + return cnt; +} + +int mlx5e_rx_res_rss_index(struct mlx5e_rx_res *res, struct mlx5e_rss *rss) +{ + int i; + + if (!rss) + return -EINVAL; + + for (i = 0; i < MLX5E_MAX_NUM_RSS; i++) + if (rss == res->rss[i]) + return i; + + return -ENOENT; +} + +struct mlx5e_rss *mlx5e_rx_res_rss_get(struct mlx5e_rx_res *res, u32 rss_idx) +{ + if (rss_idx >= MLX5E_MAX_NUM_RSS) + return NULL; + + return res->rss[rss_idx]; +} + +/* End of API rx_res_rss_* */ + +struct mlx5e_rx_res *mlx5e_rx_res_alloc(void) +{ + return kvzalloc(sizeof(struct mlx5e_rx_res), GFP_KERNEL); +} + +static int mlx5e_rx_res_channels_init(struct mlx5e_rx_res *res) +{ + bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_tir_builder *builder; + int err = 0; + int ix; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + res->channels = kvcalloc(res->max_nch, sizeof(*res->channels), GFP_KERNEL); + if (!res->channels) { + err = -ENOMEM; + goto out; + } + + for (ix = 0; ix < res->max_nch; ix++) { + err = mlx5e_rqt_init_direct(&res->channels[ix].direct_rqt, + res->mdev, false, res->drop_rqn); + if (err) { + mlx5_core_warn(res->mdev, "Failed to create a direct RQT: err = %d, ix = %u\n", + err, ix); + goto err_destroy_direct_rqts; + } + } + + for (ix = 0; ix < res->max_nch; ix++) { + mlx5e_tir_builder_build_rqt(builder, res->mdev->mlx5e_res.hw_objs.td.tdn, + mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt), + inner_ft_support); + mlx5e_tir_builder_build_packet_merge(builder, &res->pkt_merge_param); + mlx5e_tir_builder_build_direct(builder); + + err = mlx5e_tir_init(&res->channels[ix].direct_tir, builder, res->mdev, true); + if (err) { + mlx5_core_warn(res->mdev, "Failed to create a direct TIR: err = %d, ix = %u\n", + err, ix); + goto err_destroy_direct_tirs; + } + + mlx5e_tir_builder_clear(builder); + } + + if (!(res->features & MLX5E_RX_RES_FEATURE_XSK)) + goto out; + + for (ix = 0; ix < res->max_nch; ix++) { + err = mlx5e_rqt_init_direct(&res->channels[ix].xsk_rqt, + res->mdev, false, res->drop_rqn); + if (err) { + 
mlx5_core_warn(res->mdev, "Failed to create an XSK RQT: err = %d, ix = %u\n", + err, ix); + goto err_destroy_xsk_rqts; + } + } + + for (ix = 0; ix < res->max_nch; ix++) { + mlx5e_tir_builder_build_rqt(builder, res->mdev->mlx5e_res.hw_objs.td.tdn, + mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt), + inner_ft_support); + mlx5e_tir_builder_build_packet_merge(builder, &res->pkt_merge_param); + mlx5e_tir_builder_build_direct(builder); + + err = mlx5e_tir_init(&res->channels[ix].xsk_tir, builder, res->mdev, true); + if (err) { + mlx5_core_warn(res->mdev, "Failed to create an XSK TIR: err = %d, ix = %u\n", + err, ix); + goto err_destroy_xsk_tirs; + } + + mlx5e_tir_builder_clear(builder); + } + + goto out; + +err_destroy_xsk_tirs: + while (--ix >= 0) + mlx5e_tir_destroy(&res->channels[ix].xsk_tir); + + ix = res->max_nch; +err_destroy_xsk_rqts: + while (--ix >= 0) + mlx5e_rqt_destroy(&res->channels[ix].xsk_rqt); + + ix = res->max_nch; +err_destroy_direct_tirs: + while (--ix >= 0) + mlx5e_tir_destroy(&res->channels[ix].direct_tir); + + ix = res->max_nch; +err_destroy_direct_rqts: + while (--ix >= 0) + mlx5e_rqt_destroy(&res->channels[ix].direct_rqt); + + kvfree(res->channels); + +out: + mlx5e_tir_builder_free(builder); + + return err; +} + +static int mlx5e_rx_res_ptp_init(struct mlx5e_rx_res *res) +{ + bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_tir_builder *builder; + int err; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + err = mlx5e_rqt_init_direct(&res->ptp.rqt, res->mdev, false, res->drop_rqn); + if (err) + goto out; + + /* Separated from the channels RQs, does not share pkt_merge state with them */ + mlx5e_tir_builder_build_rqt(builder, res->mdev->mlx5e_res.hw_objs.td.tdn, + mlx5e_rqt_get_rqtn(&res->ptp.rqt), + inner_ft_support); + mlx5e_tir_builder_build_direct(builder); + + err = mlx5e_tir_init(&res->ptp.tir, builder, res->mdev, true); + if (err) + goto err_destroy_ptp_rqt; + + goto out; + +err_destroy_ptp_rqt: + mlx5e_rqt_destroy(&res->ptp.rqt); + +out: + mlx5e_tir_builder_free(builder); + return err; +} + +static void mlx5e_rx_res_channels_destroy(struct mlx5e_rx_res *res) +{ + unsigned int ix; + + for (ix = 0; ix < res->max_nch; ix++) { + mlx5e_tir_destroy(&res->channels[ix].direct_tir); + mlx5e_rqt_destroy(&res->channels[ix].direct_rqt); + + if (!(res->features & MLX5E_RX_RES_FEATURE_XSK)) + continue; + + mlx5e_tir_destroy(&res->channels[ix].xsk_tir); + mlx5e_rqt_destroy(&res->channels[ix].xsk_rqt); + } + + kvfree(res->channels); +} + +static void mlx5e_rx_res_ptp_destroy(struct mlx5e_rx_res *res) +{ + mlx5e_tir_destroy(&res->ptp.tir); + mlx5e_rqt_destroy(&res->ptp.rqt); +} + +int mlx5e_rx_res_init(struct mlx5e_rx_res *res, struct mlx5_core_dev *mdev, + enum mlx5e_rx_res_features features, unsigned int max_nch, + u32 drop_rqn, const struct mlx5e_packet_merge_param *init_pkt_merge_param, + unsigned int init_nch) +{ + int err; + + res->mdev = mdev; + res->features = features; + res->max_nch = max_nch; + res->drop_rqn = drop_rqn; + + res->pkt_merge_param = *init_pkt_merge_param; + init_rwsem(&res->pkt_merge_param_sem); + + err = mlx5e_rx_res_rss_init_def(res, init_nch); + if (err) + goto err_out; + + err = mlx5e_rx_res_channels_init(res); + if (err) + goto err_rss_destroy; + + err = mlx5e_rx_res_ptp_init(res); + if (err) + goto err_channels_destroy; + + return 0; + +err_channels_destroy: + mlx5e_rx_res_channels_destroy(res); +err_rss_destroy: + __mlx5e_rx_res_rss_destroy(res, 0); +err_out: + return err; +} + 
+void mlx5e_rx_res_destroy(struct mlx5e_rx_res *res) +{ + mlx5e_rx_res_ptp_destroy(res); + mlx5e_rx_res_channels_destroy(res); + mlx5e_rx_res_rss_destroy_all(res); +} + +void mlx5e_rx_res_free(struct mlx5e_rx_res *res) +{ + kvfree(res); +} + +u32 mlx5e_rx_res_get_tirn_direct(struct mlx5e_rx_res *res, unsigned int ix) +{ + return mlx5e_tir_get_tirn(&res->channels[ix].direct_tir); +} + +u32 mlx5e_rx_res_get_tirn_xsk(struct mlx5e_rx_res *res, unsigned int ix) +{ + WARN_ON(!(res->features & MLX5E_RX_RES_FEATURE_XSK)); + + return mlx5e_tir_get_tirn(&res->channels[ix].xsk_tir); +} + +u32 mlx5e_rx_res_get_tirn_rss(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt) +{ + struct mlx5e_rss *rss = res->rss[0]; + + return mlx5e_rss_get_tirn(rss, tt, false); +} + +u32 mlx5e_rx_res_get_tirn_rss_inner(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt) +{ + struct mlx5e_rss *rss = res->rss[0]; + + return mlx5e_rss_get_tirn(rss, tt, true); +} + +u32 mlx5e_rx_res_get_tirn_ptp(struct mlx5e_rx_res *res) +{ + WARN_ON(!(res->features & MLX5E_RX_RES_FEATURE_PTP)); + return mlx5e_tir_get_tirn(&res->ptp.tir); +} + +static u32 mlx5e_rx_res_get_rqtn_direct(struct mlx5e_rx_res *res, unsigned int ix) +{ + return mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt); +} + +void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs) +{ + unsigned int nch, ix; + int err; + + nch = mlx5e_channels_get_num(chs); + + for (ix = 0; ix < chs->num; ix++) + mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix]); + res->rss_nch = chs->num; + + mlx5e_rx_res_rss_enable(res); + + for (ix = 0; ix < nch; ix++) { + u32 rqn; + + mlx5e_channels_get_regular_rqn(chs, ix, &rqn); + err = mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt), + rqn, ix, err); + + if (!(res->features & MLX5E_RX_RES_FEATURE_XSK)) + continue; + + if (!mlx5e_channels_get_xsk_rqn(chs, ix, &rqn)) + rqn = res->drop_rqn; + err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt), + rqn, ix, err); + } + for (ix = nch; ix < res->max_nch; ix++) { + err = mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, res->drop_rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to drop RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt), + res->drop_rqn, ix, err); + + if (!(res->features & MLX5E_RX_RES_FEATURE_XSK)) + continue; + + err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, res->drop_rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to drop RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt), + res->drop_rqn, ix, err); + } + + if (res->features & MLX5E_RX_RES_FEATURE_PTP) { + u32 rqn; + + if (!mlx5e_channels_get_ptp_rqn(chs, &rqn)) + rqn = res->drop_rqn; + + err = mlx5e_rqt_redirect_direct(&res->ptp.rqt, rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to RQ %#x (PTP): err = %d\n", + mlx5e_rqt_get_rqtn(&res->ptp.rqt), + rqn, err); + } +} + +void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res) +{ + unsigned int ix; + int err; + + mlx5e_rx_res_rss_disable(res); + + for (ix = 0; ix < res->max_nch; ix++) { + err = 
mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, res->drop_rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to drop RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt), + res->drop_rqn, ix, err); + + if (!(res->features & MLX5E_RX_RES_FEATURE_XSK)) + continue; + + err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, res->drop_rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to drop RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt), + res->drop_rqn, ix, err); + } + + if (res->features & MLX5E_RX_RES_FEATURE_PTP) { + err = mlx5e_rqt_redirect_direct(&res->ptp.rqt, res->drop_rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to drop RQ %#x (PTP): err = %d\n", + mlx5e_rqt_get_rqtn(&res->ptp.rqt), + res->drop_rqn, err); + } +} + +int mlx5e_rx_res_xsk_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs, + unsigned int ix) +{ + u32 rqn; + int err; + + if (!mlx5e_channels_get_xsk_rqn(chs, ix, &rqn)) + return -EINVAL; + + err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to XSK RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt), + rqn, ix, err); + return err; +} + +int mlx5e_rx_res_xsk_deactivate(struct mlx5e_rx_res *res, unsigned int ix) +{ + int err; + + err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, res->drop_rqn); + if (err) + mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to drop RQ %#x (channel %u): err = %d\n", + mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt), + res->drop_rqn, ix, err); + return err; +} + +int mlx5e_rx_res_packet_merge_set_param(struct mlx5e_rx_res *res, + struct mlx5e_packet_merge_param *pkt_merge_param) +{ + struct mlx5e_tir_builder *builder; + int err, final_err; + unsigned int ix; + + builder = mlx5e_tir_builder_alloc(true); + if (!builder) + return -ENOMEM; + + down_write(&res->pkt_merge_param_sem); + res->pkt_merge_param = *pkt_merge_param; + + mlx5e_tir_builder_build_packet_merge(builder, pkt_merge_param); + + final_err = 0; + + for (ix = 0; ix < MLX5E_MAX_NUM_RSS; ix++) { + struct mlx5e_rss *rss = res->rss[ix]; + + if (!rss) + continue; + + err = mlx5e_rss_packet_merge_set_param(rss, pkt_merge_param); + if (err) + final_err = final_err ? 
: err; + } + + for (ix = 0; ix < res->max_nch; ix++) { + err = mlx5e_tir_modify(&res->channels[ix].direct_tir, builder); + if (err) { + mlx5_core_warn(res->mdev, "Failed to update packet merge state of direct TIR %#x for channel %u: err = %d\n", + mlx5e_tir_get_tirn(&res->channels[ix].direct_tir), ix, err); + if (!final_err) + final_err = err; + } + } + + up_write(&res->pkt_merge_param_sem); + mlx5e_tir_builder_free(builder); + return final_err; +} + +struct mlx5e_rss_params_hash mlx5e_rx_res_get_current_hash(struct mlx5e_rx_res *res) +{ + return mlx5e_rss_get_hash(res->rss[0]); +} + +int mlx5e_rx_res_tls_tir_create(struct mlx5e_rx_res *res, unsigned int rxq, + struct mlx5e_tir *tir) +{ + bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_tir_builder *builder; + u32 rqtn; + int err; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + rqtn = mlx5e_rx_res_get_rqtn_direct(res, rxq); + + mlx5e_tir_builder_build_rqt(builder, res->mdev->mlx5e_res.hw_objs.td.tdn, rqtn, + inner_ft_support); + mlx5e_tir_builder_build_direct(builder); + mlx5e_tir_builder_build_tls(builder); + down_read(&res->pkt_merge_param_sem); + mlx5e_tir_builder_build_packet_merge(builder, &res->pkt_merge_param); + err = mlx5e_tir_init(tir, builder, res->mdev, false); + up_read(&res->pkt_merge_param_sem); + + mlx5e_tir_builder_free(builder); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h new file mode 100644 index 0000000..b39b20a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. 
*/ + +#ifndef __MLX5_EN_RX_RES_H__ +#define __MLX5_EN_RX_RES_H__ + +#include +#include "rqt.h" +#include "tir.h" +#include "fs.h" +#include "rss.h" + +struct mlx5e_rx_res; + +struct mlx5e_channels; +struct mlx5e_rss_params_hash; + +enum mlx5e_rx_res_features { + MLX5E_RX_RES_FEATURE_INNER_FT = BIT(0), + MLX5E_RX_RES_FEATURE_XSK = BIT(1), + MLX5E_RX_RES_FEATURE_PTP = BIT(2), +}; + +/* Setup */ +struct mlx5e_rx_res *mlx5e_rx_res_alloc(void); +int mlx5e_rx_res_init(struct mlx5e_rx_res *res, struct mlx5_core_dev *mdev, + enum mlx5e_rx_res_features features, unsigned int max_nch, + u32 drop_rqn, const struct mlx5e_packet_merge_param *init_pkt_merge_param, + unsigned int init_nch); +void mlx5e_rx_res_destroy(struct mlx5e_rx_res *res); +void mlx5e_rx_res_free(struct mlx5e_rx_res *res); + +/* TIRN getters for flow steering */ +u32 mlx5e_rx_res_get_tirn_direct(struct mlx5e_rx_res *res, unsigned int ix); +u32 mlx5e_rx_res_get_tirn_xsk(struct mlx5e_rx_res *res, unsigned int ix); +u32 mlx5e_rx_res_get_tirn_rss(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt); +u32 mlx5e_rx_res_get_tirn_rss_inner(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt); +u32 mlx5e_rx_res_get_tirn_ptp(struct mlx5e_rx_res *res); + +/* Activate/deactivate API */ +void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs); +void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res); +int mlx5e_rx_res_xsk_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs, + unsigned int ix); +int mlx5e_rx_res_xsk_deactivate(struct mlx5e_rx_res *res, unsigned int ix); + +/* Configuration API */ +void mlx5e_rx_res_rss_set_indir_uniform(struct mlx5e_rx_res *res, unsigned int nch); +int mlx5e_rx_res_rss_get_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, + u32 *indir, u8 *key, u8 *hfunc); +int mlx5e_rx_res_rss_set_rxfh(struct mlx5e_rx_res *res, u32 rss_idx, + const u32 *indir, const u8 *key, const u8 *hfunc); + +u8 mlx5e_rx_res_rss_get_hash_fields(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt); +int mlx5e_rx_res_rss_set_hash_fields(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt, + u8 rx_hash_fields); +int mlx5e_rx_res_packet_merge_set_param(struct mlx5e_rx_res *res, + struct mlx5e_packet_merge_param *pkt_merge_param); + +int mlx5e_rx_res_rss_init(struct mlx5e_rx_res *res, u32 *rss_idx, unsigned int init_nch); +int mlx5e_rx_res_rss_destroy(struct mlx5e_rx_res *res, u32 rss_idx); +int mlx5e_rx_res_rss_cnt(struct mlx5e_rx_res *res); +int mlx5e_rx_res_rss_index(struct mlx5e_rx_res *res, struct mlx5e_rss *rss); +struct mlx5e_rss *mlx5e_rx_res_rss_get(struct mlx5e_rx_res *res, u32 rss_idx); + +/* Workaround for hairpin */ +struct mlx5e_rss_params_hash mlx5e_rx_res_get_current_hash(struct mlx5e_rx_res *res); + +/* Accel TIRs */ +int mlx5e_rx_res_tls_tir_create(struct mlx5e_rx_res *res, unsigned int rxq, + struct mlx5e_tir *tir); +#endif /* __MLX5_EN_RX_RES_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/accept.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/accept.c new file mode 100644 index 0000000..21aab96 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/accept.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_accept(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_parse_accept(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + attr->flags |= MLX5_ATTR_FLAG_ACCEPT; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_accept = { + .can_offload = tc_act_can_offload_accept, + .parse_action = tc_act_parse_accept, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c new file mode 100644 index 0000000..2486299 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en/tc/post_act.h" +#include "en/tc_priv.h" +#include "mlx5_core.h" + +/* Must be aligned with enum flow_action_id. */ +static struct mlx5e_tc_act *tc_acts_fdb[NUM_FLOW_ACTIONS] = { + &mlx5e_tc_act_accept, + &mlx5e_tc_act_drop, + &mlx5e_tc_act_trap, + &mlx5e_tc_act_goto, + &mlx5e_tc_act_mirred, + &mlx5e_tc_act_mirred, + &mlx5e_tc_act_redirect_ingress, + NULL, /* FLOW_ACTION_MIRRED_INGRESS, */ + &mlx5e_tc_act_vlan, + &mlx5e_tc_act_vlan, + &mlx5e_tc_act_vlan_mangle, + &mlx5e_tc_act_tun_encap, + &mlx5e_tc_act_tun_decap, + &mlx5e_tc_act_pedit, + &mlx5e_tc_act_pedit, + &mlx5e_tc_act_csum, + NULL, /* FLOW_ACTION_MARK, */ + &mlx5e_tc_act_ptype, + NULL, /* FLOW_ACTION_PRIORITY, */ + NULL, /* FLOW_ACTION_WAKE, */ + NULL, /* FLOW_ACTION_QUEUE, */ + &mlx5e_tc_act_sample, + &mlx5e_tc_act_police, + &mlx5e_tc_act_ct, + NULL, /* FLOW_ACTION_CT_METADATA, */ + &mlx5e_tc_act_mpls_push, + &mlx5e_tc_act_mpls_pop, +}; + +/* Must be aligned with enum flow_action_id. */ +static struct mlx5e_tc_act *tc_acts_nic[NUM_FLOW_ACTIONS] = { + &mlx5e_tc_act_accept, + &mlx5e_tc_act_drop, + NULL, /* FLOW_ACTION_TRAP, */ + &mlx5e_tc_act_goto, + &mlx5e_tc_act_mirred_nic, + NULL, /* FLOW_ACTION_MIRRED, */ + NULL, /* FLOW_ACTION_REDIRECT_INGRESS, */ + NULL, /* FLOW_ACTION_MIRRED_INGRESS, */ + NULL, /* FLOW_ACTION_VLAN_PUSH, */ + NULL, /* FLOW_ACTION_VLAN_POP, */ + NULL, /* FLOW_ACTION_VLAN_MANGLE, */ + NULL, /* FLOW_ACTION_TUNNEL_ENCAP, */ + NULL, /* FLOW_ACTION_TUNNEL_DECAP, */ + &mlx5e_tc_act_pedit, + &mlx5e_tc_act_pedit, + &mlx5e_tc_act_csum, + &mlx5e_tc_act_mark, + NULL, /* FLOW_ACTION_PTYPE, */ + &mlx5e_tc_act_prio, + NULL, /* FLOW_ACTION_WAKE, */ + NULL, /* FLOW_ACTION_QUEUE, */ + NULL, /* FLOW_ACTION_SAMPLE, */ + NULL, /* FLOW_ACTION_POLICE, */ + &mlx5e_tc_act_ct, +}; + +/** + * mlx5e_tc_act_get() - Get an action parser for an action id. + * @act_id: Flow action id. + * @ns_type: flow namespace type. + */ +struct mlx5e_tc_act * +mlx5e_tc_act_get(enum flow_action_id act_id, + enum mlx5_flow_namespace_type ns_type) +{ + struct mlx5e_tc_act **tc_acts; + + tc_acts = ns_type == MLX5_FLOW_NAMESPACE_FDB ? tc_acts_fdb : tc_acts_nic; + + return tc_acts[act_id]; +} + +/** + * mlx5e_tc_act_init_parse_state() - Init a new parse_state. + * @parse_state: Parsing state. + * @flow: mlx5e tc flow being handled. + * @flow_action: flow action to parse. + * @extack: to set an error msg. 
+ * + * The same parse_state should be passed to action parsers + * for tracking the current parsing state. + */ +void +mlx5e_tc_act_init_parse_state(struct mlx5e_tc_act_parse_state *parse_state, + struct mlx5e_tc_flow *flow, + struct flow_action *flow_action, + struct netlink_ext_ack *extack) +{ + memset(parse_state, 0, sizeof(*parse_state)); + parse_state->flow = flow; + parse_state->num_actions = flow_action->num_entries; + parse_state->extack = extack; +} + +void +mlx5e_tc_act_reorder_flow_actions(struct flow_action *flow_action, + struct mlx5e_tc_flow_action *flow_action_reorder) +{ + struct flow_action_entry *act; + int i, j = 0; + + flow_action_for_each(i, act, flow_action) { + /* Add CT action to be first. */ + if (act->id == FLOW_ACTION_CT) + flow_action_reorder->entries[j++] = act; + } + + flow_action_for_each(i, act, flow_action) { + if (act->id == FLOW_ACTION_CT) + continue; + flow_action_reorder->entries[j++] = act; + } +} + +int +mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, + struct flow_action *flow_action, + struct mlx5_flow_attr *attr, + enum mlx5_flow_namespace_type ns_type) +{ + struct flow_action_entry *act; + struct mlx5e_tc_act *tc_act; + struct mlx5e_priv *priv; + int err = 0, i; + + priv = parse_state->flow->priv; + + flow_action_for_each(i, act, flow_action) { + tc_act = mlx5e_tc_act_get(act->id, ns_type); + if (!tc_act || !tc_act->post_parse) + continue; + + err = tc_act->post_parse(parse_state, priv, attr); + if (err) + goto out; + } + +out: + return err; +} + +int +mlx5e_tc_act_set_next_post_act(struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct mlx5_flow_attr *next_attr) +{ + struct mlx5_core_dev *mdev = flow->priv->mdev; + struct mlx5e_tc_mod_hdr_acts *mod_acts; + int err; + + mod_acts = &attr->parse_attr->mod_hdr_acts; + + /* Set handle on current post act rule to next post act rule. */ + err = mlx5e_tc_post_act_set_handle(mdev, next_attr->post_act_handle, mod_acts); + if (err) { + mlx5_core_warn(mdev, "Failed setting post action handle"); + return err; + } + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h new file mode 100644 index 0000000..0c7c1be --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/act.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_EN_TC_ACT_H__ +#define __MLX5_EN_TC_ACT_H__ + +#include +#include +#include +#include "eswitch.h" +#include "pedit.h" + +struct mlx5_flow_attr; + +struct mlx5e_tc_act_parse_state { + unsigned int num_actions; + struct mlx5e_tc_flow *flow; + struct netlink_ext_ack *extack; + bool ct; + u32 actions; + bool ct_clear; + bool encap; + bool decap; + bool mpls_push; + bool ptype_host; + const struct ip_tunnel_info *tun_info; + struct mlx5e_mpls_info mpls_info; + int ifindexes[MLX5_MAX_FLOW_FWD_VPORTS]; + int if_count; + struct mlx5_tc_ct_priv *ct_priv; +}; + +struct mlx5e_tc_act { + bool (*can_offload)(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr); + + int (*parse_action)(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr); + + int (*post_parse)(struct mlx5e_tc_act_parse_state *parse_state, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr); + + bool (*is_multi_table_act)(struct mlx5e_priv *priv, + const struct flow_action_entry *act, + struct mlx5_flow_attr *attr); + + int (*offload_action)(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act, + struct flow_action_entry *act); + + int (*destroy_action)(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act); + + int (*stats_action)(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act); +}; + +struct mlx5e_tc_flow_action { + unsigned int num_entries; + struct flow_action_entry **entries; +}; + +extern struct mlx5e_tc_act mlx5e_tc_act_drop; +extern struct mlx5e_tc_act mlx5e_tc_act_trap; +extern struct mlx5e_tc_act mlx5e_tc_act_accept; +extern struct mlx5e_tc_act mlx5e_tc_act_mark; +extern struct mlx5e_tc_act mlx5e_tc_act_goto; +extern struct mlx5e_tc_act mlx5e_tc_act_tun_encap; +extern struct mlx5e_tc_act mlx5e_tc_act_tun_decap; +extern struct mlx5e_tc_act mlx5e_tc_act_csum; +extern struct mlx5e_tc_act mlx5e_tc_act_pedit; +extern struct mlx5e_tc_act mlx5e_tc_act_vlan; +extern struct mlx5e_tc_act mlx5e_tc_act_vlan_mangle; +extern struct mlx5e_tc_act mlx5e_tc_act_mpls_push; +extern struct mlx5e_tc_act mlx5e_tc_act_mpls_pop; +extern struct mlx5e_tc_act mlx5e_tc_act_mirred; +extern struct mlx5e_tc_act mlx5e_tc_act_mirred_nic; +extern struct mlx5e_tc_act mlx5e_tc_act_ct; +extern struct mlx5e_tc_act mlx5e_tc_act_sample; +extern struct mlx5e_tc_act mlx5e_tc_act_ptype; +extern struct mlx5e_tc_act mlx5e_tc_act_redirect_ingress; +extern struct mlx5e_tc_act mlx5e_tc_act_prio; +extern struct mlx5e_tc_act mlx5e_tc_act_police; + +struct mlx5e_tc_act * +mlx5e_tc_act_get(enum flow_action_id act_id, + enum mlx5_flow_namespace_type ns_type); + +void +mlx5e_tc_act_init_parse_state(struct mlx5e_tc_act_parse_state *parse_state, + struct mlx5e_tc_flow *flow, + struct flow_action *flow_action, + struct netlink_ext_ack *extack); + +void +mlx5e_tc_act_reorder_flow_actions(struct flow_action *flow_action, + struct mlx5e_tc_flow_action *flow_action_reorder); + +int +mlx5e_tc_act_post_parse(struct mlx5e_tc_act_parse_state *parse_state, + struct flow_action *flow_action, + struct mlx5_flow_attr *attr, + enum mlx5_flow_namespace_type ns_type); + +int +mlx5e_tc_act_set_next_post_act(struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct mlx5_flow_attr *next_attr); + +#endif /* __MLX5_EN_TC_ACT_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/csum.c 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/csum.c new file mode 100644 index 0000000..c0f08ae --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/csum.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include +#include "act.h" +#include "en/tc_priv.h" + +static bool +csum_offload_supported(struct mlx5e_priv *priv, + u32 action, + u32 update_flags, + struct netlink_ext_ack *extack) +{ + u32 prot_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR | TCA_CSUM_UPDATE_FLAG_TCP | + TCA_CSUM_UPDATE_FLAG_UDP; + + /* The HW recalcs checksums only if re-writing headers */ + if (!(action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)) { + NL_SET_ERR_MSG_MOD(extack, + "TC csum action is only offloaded with pedit"); + netdev_warn(priv->netdev, + "TC csum action is only offloaded with pedit\n"); + return false; + } + + if (update_flags & ~prot_flags) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload TC csum action for some header/s"); + netdev_warn(priv->netdev, + "can't offload TC csum action for some header/s - flags %#x\n", + update_flags); + return false; + } + + return true; +} + +static bool +tc_act_can_offload_csum(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_tc_flow *flow = parse_state->flow; + + return csum_offload_supported(flow->priv, attr->action, + act->csum_flags, parse_state->extack); +} + +static int +tc_act_parse_csum(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_csum = { + .can_offload = tc_act_can_offload_csum, + .parse_action = tc_act_parse_csum, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c new file mode 100644 index 0000000..685ec67 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ct.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en/tc_priv.h" +#include "en/tc_ct.h" + +static bool +tc_act_can_offload_ct(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR; + struct netlink_ext_ack *extack = parse_state->extack; + + if (flow_flag_test(parse_state->flow, SAMPLE)) { + NL_SET_ERR_MSG_MOD(extack, + "Sample action with connection tracking is not supported"); + return false; + } + + if (parse_state->ct && !clear_action) { + NL_SET_ERR_MSG_MOD(extack, "Multiple CT actions are not supoported"); + return false; + } + + return true; +} + +static int +tc_act_parse_ct(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR; + int err; + + /* It's redundant to do ct clear more than once. 
*/ + if (clear_action && parse_state->ct_clear) + return 0; + + err = mlx5_tc_ct_parse_action(parse_state->ct_priv, attr, + &attr->parse_attr->mod_hdr_acts, + act, parse_state->extack); + if (err) + return err; + + if (mlx5e_is_eswitch_flow(parse_state->flow)) + attr->esw_attr->split_count = attr->esw_attr->out_count; + + if (clear_action) { + parse_state->ct_clear = true; + } else { + attr->flags |= MLX5_ATTR_FLAG_CT; + flow_flag_set(parse_state->flow, CT); + parse_state->ct = true; + } + + return 0; +} + +static int +tc_act_post_parse_ct(struct mlx5e_tc_act_parse_state *parse_state, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_tc_mod_hdr_acts *mod_acts = &attr->parse_attr->mod_hdr_acts; + int err; + + /* If ct action exist, we can ignore previous ct_clear actions */ + if (parse_state->ct) + return 0; + + if (parse_state->ct_clear) { + err = mlx5_tc_ct_set_ct_clear_regs(parse_state->ct_priv, mod_acts); + if (err) { + NL_SET_ERR_MSG_MOD(parse_state->extack, + "Failed to set registers for ct clear"); + return err; + } + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + /* Prevent handling of additional, redundant clear actions */ + parse_state->ct_clear = false; + } + return 0; +} + +static bool +tc_act_is_multi_table_act_ct(struct mlx5e_priv *priv, + const struct flow_action_entry *act, + struct mlx5_flow_attr *attr) +{ + if (act->ct.action & TCA_CT_ACT_CLEAR) + return false; + + return true; +} + +struct mlx5e_tc_act mlx5e_tc_act_ct = { + .can_offload = tc_act_can_offload_ct, + .parse_action = tc_act_parse_ct, + .is_multi_table_act = tc_act_is_multi_table_act_ct, + .post_parse = tc_act_post_parse_ct, +}; + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/drop.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/drop.c new file mode 100644 index 0000000..dd025a9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/drop.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_drop(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_parse_drop(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + attr->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_drop = { + .can_offload = tc_act_can_offload_drop, + .parse_action = tc_act_parse_drop, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/goto.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/goto.c new file mode 100644 index 0000000..4726bcb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/goto.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
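The ops table declared in act.h above (can_offload, parse_action, post_parse and friends) is meant to be driven by a generic loop in the classifier offload path rather than by one large per-action switch. A minimal caller-side sketch of that loop, assuming the namespace type and flow attribute were already set up by the caller; the helper name parse_tc_actions_sketch and the exact error handling are illustrative, not part of this patch:

static int
parse_tc_actions_sketch(struct mlx5e_tc_flow *flow,
                        struct flow_action *flow_action,
                        struct mlx5_flow_attr *attr,
                        enum mlx5_flow_namespace_type ns_type,
                        struct netlink_ext_ack *extack)
{
        struct mlx5e_tc_act_parse_state parse_state;
        struct flow_action_entry *act;
        struct mlx5e_tc_act *tc_act;
        int err, i;

        mlx5e_tc_act_init_parse_state(&parse_state, flow, flow_action, extack);

        flow_action_for_each(i, act, flow_action) {
                tc_act = mlx5e_tc_act_get(act->id, ns_type);
                if (!tc_act) {
                        NL_SET_ERR_MSG_MOD(extack, "Not implemented offload action");
                        return -EOPNOTSUPP;
                }

                /* Per-action veto before any state is touched. */
                if (!tc_act->can_offload(&parse_state, act, i, attr))
                        return -EOPNOTSUPP;

                /* Translate the action into attr/parse_state updates. */
                err = tc_act->parse_action(&parse_state, act, flow->priv, attr);
                if (err)
                        return err;
        }

        /* Let actions such as ct, vlan and goto finalize what they parsed. */
        return mlx5e_tc_act_post_parse(&parse_state, flow_action, attr, ns_type);
}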
+ +#include "act.h" +#include "en/tc_priv.h" +#include "eswitch.h" + +static int +validate_goto_chain(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + const struct flow_action_entry *act, + struct netlink_ext_ack *extack) +{ + bool is_esw = mlx5e_is_eswitch_flow(flow); + bool ft_flow = mlx5e_is_ft_flow(flow); + u32 dest_chain = act->chain_index; + struct mlx5_fs_chains *chains; + struct mlx5_eswitch *esw; + u32 reformat_and_fwd; + u32 max_chain; + + esw = priv->mdev->priv.eswitch; + chains = is_esw ? esw_chains(esw) : mlx5e_nic_chains(priv); + max_chain = mlx5_chains_get_chain_range(chains); + reformat_and_fwd = is_esw ? + MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, reformat_and_fwd_to_table) : + MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, reformat_and_fwd_to_table); + + if (ft_flow) { + NL_SET_ERR_MSG_MOD(extack, "Goto action is not supported"); + return -EOPNOTSUPP; + } + + if (!mlx5_chains_backwards_supported(chains) && + dest_chain <= attr->chain) { + NL_SET_ERR_MSG_MOD(extack, "Goto lower numbered chain isn't supported"); + return -EOPNOTSUPP; + } + + if (dest_chain > max_chain) { + NL_SET_ERR_MSG_MOD(extack, + "Requested destination chain is out of supported range"); + return -EOPNOTSUPP; + } + + if (attr->action & (MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT | + MLX5_FLOW_CONTEXT_ACTION_DECAP) && + !reformat_and_fwd) { + NL_SET_ERR_MSG_MOD(extack, + "Goto chain is not allowed if action has reformat or decap"); + return -EOPNOTSUPP; + } + + return 0; +} + +static bool +tc_act_can_offload_goto(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_tc_flow *flow = parse_state->flow; + + if (validate_goto_chain(flow->priv, flow, attr, act, extack)) + return false; + + return true; +} + +static int +tc_act_parse_goto(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + attr->dest_chain = act->chain_index; + + return 0; +} + +static int +tc_act_post_parse_goto(struct mlx5e_tc_act_parse_state *parse_state, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr; + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_tc_flow *flow = parse_state->flow; + + if (!attr->dest_chain) + return 0; + + if (parse_state->decap) { + /* It can be supported if we'll create a mapping for + * the tunnel device only (without tunnel), and set + * this tunnel id with this decap flow. + * + * On restore (miss), we'll just set this saved tunnel + * device. 
+ */ + + NL_SET_ERR_MSG_MOD(extack, "Decap with goto isn't supported"); + netdev_warn(priv->netdev, "Decap with goto isn't supported"); + return -EOPNOTSUPP; + } + + if (!mlx5e_is_eswitch_flow(flow) && parse_attr->mirred_ifindex[0]) { + NL_SET_ERR_MSG_MOD(extack, "Mirroring goto chain rules isn't supported"); + return -EOPNOTSUPP; + } + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_goto = { + .can_offload = tc_act_can_offload_goto, + .parse_action = tc_act_parse_goto, + .post_parse = tc_act_post_parse_goto, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mark.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mark.c new file mode 100644 index 0000000..e8d2275 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mark.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en_tc.h" + +static bool +tc_act_can_offload_mark(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + if (act->mark & ~MLX5E_TC_FLOW_ID_MASK) { + NL_SET_ERR_MSG_MOD(parse_state->extack, "Bad flow mark, only 16 bit supported"); + return false; + } + + return true; +} + +static int +tc_act_parse_mark(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + attr->nic_attr->flow_tag = act->mark; + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_mark = { + .can_offload = tc_act_can_offload_mark, + .parse_action = tc_act_parse_mark, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c new file mode 100644 index 0000000..14b1946 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include +#include +#include +#include +#include "act.h" +#include "vlan.h" +#include "en/tc_tun_encap.h" +#include "en/tc_priv.h" +#include "en_rep.h" + +static bool +same_vf_reps(struct mlx5e_priv *priv, struct net_device *out_dev) +{ + return mlx5e_eswitch_vf_rep(priv->netdev) && + priv->netdev == out_dev; +} + +static int +verify_uplink_forwarding(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + struct net_device *out_dev, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rep_priv; + + /* Forwarding non encapsulated traffic between + * uplink ports is allowed only if + * termination_table_raw_traffic cap is set. + * + * Input vport was stored attr->in_rep. + * In LAG case, *priv* is the private data of + * uplink which may be not the input vport. 
+ */ + rep_priv = mlx5e_rep_to_rep_priv(attr->esw_attr->in_rep); + + if (!(mlx5e_eswitch_uplink_rep(rep_priv->netdev) && + mlx5e_eswitch_uplink_rep(out_dev))) + return 0; + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, + termination_table_raw_traffic)) { + NL_SET_ERR_MSG_MOD(extack, + "devices are both uplink, can't offload forwarding"); + return -EOPNOTSUPP; + } else if (out_dev != rep_priv->netdev) { + NL_SET_ERR_MSG_MOD(extack, + "devices are not the same uplink, can't offload forwarding"); + return -EOPNOTSUPP; + } + return 0; +} + +static bool +is_duplicated_output_device(struct net_device *dev, + struct net_device *out_dev, + int *ifindexes, int if_count, + struct netlink_ext_ack *extack) +{ + int i; + + for (i = 0; i < if_count; i++) { + if (ifindexes[i] == out_dev->ifindex) { + NL_SET_ERR_MSG_MOD(extack, "can't duplicate output to same device"); + netdev_err(dev, "can't duplicate output to same device: %s\n", + out_dev->name); + return true; + } + } + + return false; +} + +static struct net_device * +get_fdb_out_dev(struct net_device *uplink_dev, struct net_device *out_dev) +{ + struct net_device *fdb_out_dev = out_dev; + struct net_device *uplink_upper; + + rcu_read_lock(); + uplink_upper = netdev_master_upper_dev_get_rcu(uplink_dev); + if (uplink_upper && netif_is_lag_master(uplink_upper) && + uplink_upper == out_dev) { + fdb_out_dev = uplink_dev; + } else if (netif_is_lag_master(out_dev)) { + fdb_out_dev = bond_option_active_slave_get_rcu(netdev_priv(out_dev)); + if (fdb_out_dev && + (!mlx5e_eswitch_rep(fdb_out_dev) || + !netdev_port_same_parent_id(fdb_out_dev, uplink_dev))) + fdb_out_dev = NULL; + } + rcu_read_unlock(); + return fdb_out_dev; +} + +static bool +tc_act_can_offload_mirred(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_tc_flow *flow = parse_state->flow; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct net_device *out_dev = act->dev; + struct mlx5e_priv *priv = flow->priv; + struct mlx5_esw_flow_attr *esw_attr; + + parse_attr = attr->parse_attr; + esw_attr = attr->esw_attr; + + if (!out_dev) { + /* out_dev is NULL when filters with + * non-existing mirred device are replayed to + * the driver. + */ + return false; + } + + if (parse_state->mpls_push && !netif_is_bareudp(out_dev)) { + NL_SET_ERR_MSG_MOD(extack, "mpls is supported only through a bareudp device"); + return false; + } + + if (mlx5e_is_ft_flow(flow) && out_dev == priv->netdev) { + /* Ignore forward to self rules generated + * by adding both mlx5 devs to the flow table + * block on a normal nft offload setup. + */ + return false; + } + + if (esw_attr->out_count >= MLX5_MAX_FLOW_FWD_VPORTS) { + NL_SET_ERR_MSG_MOD(extack, + "can't support more output ports, can't offload forwarding"); + netdev_warn(priv->netdev, + "can't support more than %d output ports, can't offload forwarding\n", + esw_attr->out_count); + return false; + } + + if (parse_state->encap || + netdev_port_same_parent_id(priv->netdev, out_dev) || + netif_is_ovs_master(out_dev)) + return true; + + if (parse_attr->filter_dev != priv->netdev) { + /* All mlx5 devices are called to configure + * high level device filters. 
Therefore, the + * *attempt* to install a filter on invalid + * eswitch should not trigger an explicit error + */ + return false; + } + + NL_SET_ERR_MSG_MOD(extack, "devices are not on same switch HW, can't offload forwarding"); + pr_err_once("devices %s %s not on same switch HW, can't offload forwarding\n", + priv->netdev->name, out_dev->name); + pr_debug("devices %s %s not on same switch HW, can't offload forwarding\n", + priv->netdev->name, out_dev->name); + + return false; +} + +static int +parse_mirred_encap(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr; + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct net_device *out_dev = act->dev; + + parse_attr->mirred_ifindex[esw_attr->out_count] = out_dev->ifindex; + parse_attr->tun_info[esw_attr->out_count] = + mlx5e_dup_tun_info(parse_state->tun_info); + + if (!parse_attr->tun_info[esw_attr->out_count]) + return -ENOMEM; + + parse_state->encap = false; + + if (parse_state->mpls_push) { + memcpy(&parse_attr->mpls_info[esw_attr->out_count], + &parse_state->mpls_info, sizeof(parse_state->mpls_info)); + parse_state->mpls_push = false; + } + esw_attr->dests[esw_attr->out_count].flags |= MLX5_ESW_DEST_ENCAP; + esw_attr->out_count++; + /* attr->dests[].rep is resolved when we handle encap */ + + return 0; +} + +static int +parse_mirred(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr; + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct net_device *out_dev = act->dev; + struct net_device *uplink_dev; + struct mlx5e_priv *out_priv; + struct mlx5_eswitch *esw; + int *ifindexes; + int if_count; + int err; + + esw = priv->mdev->priv.eswitch; + uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); + ifindexes = parse_state->ifindexes; + if_count = parse_state->if_count; + + if (is_duplicated_output_device(priv->netdev, out_dev, ifindexes, if_count, extack)) + return -EOPNOTSUPP; + + parse_state->ifindexes[if_count] = out_dev->ifindex; + parse_state->if_count++; + + out_dev = get_fdb_out_dev(uplink_dev, out_dev); + if (!out_dev) + return -ENODEV; + + if (is_vlan_dev(out_dev)) { + err = mlx5e_tc_act_vlan_add_push_action(priv, attr, &out_dev, extack); + if (err) + return err; + } + + if (is_vlan_dev(parse_attr->filter_dev)) { + err = mlx5e_tc_act_vlan_add_pop_action(priv, attr, extack); + if (err) + return err; + } + + if (netif_is_macvlan(out_dev)) + out_dev = macvlan_dev_real_dev(out_dev); + + err = verify_uplink_forwarding(priv, attr, out_dev, extack); + if (err) + return err; + + if (!mlx5e_is_valid_eswitch_fwd_dev(priv, out_dev)) { + NL_SET_ERR_MSG_MOD(extack, + "devices are not on same switch HW, can't offload forwarding"); + return -EOPNOTSUPP; + } + + if (same_vf_reps(priv, out_dev)) { + NL_SET_ERR_MSG_MOD(extack, "can't forward from a VF to itself"); + return -EOPNOTSUPP; + } + + out_priv = netdev_priv(out_dev); + rpriv = out_priv->ppriv; + esw_attr->dests[esw_attr->out_count].rep = rpriv->rep; + esw_attr->dests[esw_attr->out_count].mdev = out_priv->mdev; + esw_attr->out_count++; + + return 0; +} + +static int +parse_mirred_ovs_master(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + 
struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct net_device *out_dev = act->dev; + int err; + + err = mlx5e_set_fwd_to_int_port_actions(priv, attr, out_dev->ifindex, + MLX5E_TC_INT_PORT_EGRESS, + &attr->action, esw_attr->out_count); + if (err) + return err; + + esw_attr->out_count++; + return 0; +} + +static int +tc_act_parse_mirred(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct net_device *out_dev = act->dev; + int err = -EOPNOTSUPP; + + if (parse_state->encap) + err = parse_mirred_encap(parse_state, act, attr); + else if (netdev_port_same_parent_id(priv->netdev, out_dev)) + err = parse_mirred(parse_state, act, priv, attr); + else if (netif_is_ovs_master(out_dev)) + err = parse_mirred_ovs_master(parse_state, act, priv, attr); + + if (err) + return err; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_mirred = { + .can_offload = tc_act_can_offload_mirred, + .parse_action = tc_act_parse_mirred, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred_nic.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred_nic.c new file mode 100644 index 0000000..90b4c1b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mirred_nic.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_mirred_nic(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_tc_flow *flow = parse_state->flow; + struct net_device *out_dev = act->dev; + struct mlx5e_priv *priv = flow->priv; + + if (act->id != FLOW_ACTION_REDIRECT) + return false; + + if (priv->netdev->netdev_ops != out_dev->netdev_ops || + !mlx5e_same_hw_devs(priv, netdev_priv(out_dev))) { + NL_SET_ERR_MSG_MOD(extack, + "devices are not on same switch HW, can't offload forwarding"); + netdev_warn(priv->netdev, + "devices %s %s not on same switch HW, can't offload forwarding\n", + netdev_name(priv->netdev), + out_dev->name); + return false; + } + + return true; +} + +static int +tc_act_parse_mirred_nic(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + attr->parse_attr->mirred_ifindex[0] = act->dev->ifindex; + flow_flag_set(parse_state->flow, HAIRPIN); + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_mirred_nic = { + .can_offload = tc_act_can_offload_mirred_nic, + .parse_action = tc_act_parse_mirred_nic, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c new file mode 100644 index 0000000..96a80e0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/mpls.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
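A pattern that recurs across this directory is the esw_attr out_count/split_count bookkeeping: every mirred destination parsed above appends an entry to esw_attr->dests[] and bumps out_count, while packet-modifying actions (ct above, pedit and vlan later in this patch) pin split_count to the current out_count. A split_count lower than the final out_count is what later tells the offload code to split the rule so the earlier destinations see the packet before the modification. A rough illustration for "mirred mirror dev vf0, pedit ..., mirred redirect dev vf1"; the helper name and scenario are made up for the sketch:

static void
sketch_mirror_then_rewrite(struct mlx5_esw_flow_attr *esw_attr)
{
        /* first mirred: parse_mirred() filled dests[0] */
        esw_attr->out_count = 1;

        /* pedit: its parse callback pins the pre-rewrite destinations */
        esw_attr->split_count = esw_attr->out_count;    /* = 1 */

        /* second mirred: parse_mirred() filled dests[1]; vf1 receives the
         * packet after the header rewrite, vf0 before it.
         */
        esw_attr->out_count = 2;
}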
+ +#include +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_mpls_push(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_priv *priv = parse_state->flow->priv; + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, reformat_l2_to_l3_tunnel) || + act->mpls_push.proto != htons(ETH_P_MPLS_UC)) { + NL_SET_ERR_MSG_MOD(extack, "mpls push is supported only for mpls_uc protocol"); + return false; + } + + return true; +} + +static void +copy_mpls_info(struct mlx5e_mpls_info *mpls_info, + const struct flow_action_entry *act) +{ + mpls_info->label = act->mpls_push.label; + mpls_info->tc = act->mpls_push.tc; + mpls_info->bos = act->mpls_push.bos; + mpls_info->ttl = act->mpls_push.ttl; +} + +static int +tc_act_parse_mpls_push(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + parse_state->mpls_push = true; + copy_mpls_info(&parse_state->mpls_info, act); + + return 0; +} + +static bool +tc_act_can_offload_mpls_pop(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + struct net_device *filter_dev; + + filter_dev = attr->parse_attr->filter_dev; + + /* we only support mpls pop if it is the first action + * and the filter net device is bareudp. Subsequent + * actions can be pedit and the last can be mirred + * egress redirect. + */ + if (act_index) { + NL_SET_ERR_MSG_MOD(extack, "mpls pop supported only as first action"); + return false; + } + + if (!netif_is_bareudp(filter_dev)) { + NL_SET_ERR_MSG_MOD(extack, "mpls pop supported only on bareudp devices"); + return false; + } + + return true; +} + +static int +tc_act_parse_mpls_pop(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + attr->parse_attr->eth.h_proto = act->mpls_pop.proto; + attr->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_flag_set(parse_state->flow, L3_TO_L2_DECAP); + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_mpls_push = { + .can_offload = tc_act_can_offload_mpls_push, + .parse_action = tc_act_parse_mpls_push, +}; + +struct mlx5e_tc_act mlx5e_tc_act_mpls_pop = { + .can_offload = tc_act_can_offload_mpls_pop, + .parse_action = tc_act_parse_mpls_pop, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.c new file mode 100644 index 0000000..39f8f71 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
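For MPLS-over-UDP encapsulation the push is only staged: tc_act_parse_mpls_push() above records the header fields in the parse state, and it is the subsequent redirect to a bareudp device (parse_mirred_encap() in mirred.c above) that copies them into parse_attr->mpls_info[] alongside the tunnel info. A small sketch of what gets stashed for "mpls push protocol mpls_uc label 555 ttl 64"; the helper name and the values are illustrative only:

static void
sketch_stash_mpls_push(struct mlx5e_tc_act_parse_state *parse_state)
{
        parse_state->mpls_push = true;
        parse_state->mpls_info.label = 555;
        parse_state->mpls_info.tc = 0;
        parse_state->mpls_info.bos = 1;
        parse_state->mpls_info.ttl = 64;
}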
+ +#include +#include "act.h" +#include "pedit.h" +#include "en/tc_priv.h" +#include "en/mod_hdr.h" + +static int pedit_header_offsets[] = { + [FLOW_ACT_MANGLE_HDR_TYPE_ETH] = offsetof(struct pedit_headers, eth), + [FLOW_ACT_MANGLE_HDR_TYPE_IP4] = offsetof(struct pedit_headers, ip4), + [FLOW_ACT_MANGLE_HDR_TYPE_IP6] = offsetof(struct pedit_headers, ip6), + [FLOW_ACT_MANGLE_HDR_TYPE_TCP] = offsetof(struct pedit_headers, tcp), + [FLOW_ACT_MANGLE_HDR_TYPE_UDP] = offsetof(struct pedit_headers, udp), +}; + +#define pedit_header(_ph, _htype) ((void *)(_ph) + pedit_header_offsets[_htype]) + +static int +set_pedit_val(u8 hdr_type, u32 mask, u32 val, u32 offset, + struct pedit_headers_action *hdrs, + struct netlink_ext_ack *extack) +{ + u32 *curr_pmask, *curr_pval; + + curr_pmask = (u32 *)(pedit_header(&hdrs->masks, hdr_type) + offset); + curr_pval = (u32 *)(pedit_header(&hdrs->vals, hdr_type) + offset); + + if (*curr_pmask & mask) { /* disallow acting twice on the same location */ + NL_SET_ERR_MSG_MOD(extack, + "curr_pmask and new mask same. Acting twice on same location"); + goto out_err; + } + + *curr_pmask |= mask; + *curr_pval |= (val & mask); + + return 0; + +out_err: + return -EOPNOTSUPP; +} + +static int +parse_pedit_to_modify_hdr(struct mlx5e_priv *priv, + const struct flow_action_entry *act, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct netlink_ext_ack *extack) +{ + struct pedit_headers_action *hdrs = parse_attr->hdrs; + u8 cmd = (act->id == FLOW_ACTION_MANGLE) ? 0 : 1; + u8 htype = act->mangle.htype; + int err = -EOPNOTSUPP; + u32 mask, val, offset; + + if (htype == FLOW_ACT_MANGLE_UNSPEC) { + NL_SET_ERR_MSG_MOD(extack, "legacy pedit isn't offloaded"); + goto out_err; + } + + if (!mlx5e_mod_hdr_max_actions(priv->mdev, namespace)) { + NL_SET_ERR_MSG_MOD(extack, "The pedit offload action is not supported"); + goto out_err; + } + + mask = act->mangle.mask; + val = act->mangle.val; + offset = act->mangle.offset; + + err = set_pedit_val(htype, ~mask, val, offset, &hdrs[cmd], extack); + if (err) + goto out_err; + + hdrs[cmd].pedits++; + + return 0; +out_err: + return err; +} + +static int +parse_pedit_to_reformat(const struct flow_action_entry *act, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct netlink_ext_ack *extack) +{ + u32 mask, val, offset; + u32 *p; + + if (act->id != FLOW_ACTION_MANGLE) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported action id"); + return -EOPNOTSUPP; + } + + if (act->mangle.htype != FLOW_ACT_MANGLE_HDR_TYPE_ETH) { + NL_SET_ERR_MSG_MOD(extack, "Only Ethernet modification is supported"); + return -EOPNOTSUPP; + } + + mask = ~act->mangle.mask; + val = act->mangle.val; + offset = act->mangle.offset; + p = (u32 *)&parse_attr->eth; + *(p + (offset >> 2)) |= (val & mask); + + return 0; +} + +int +mlx5e_tc_act_pedit_parse_action(struct mlx5e_priv *priv, + const struct flow_action_entry *act, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + if (flow && flow_flag_test(flow, L3_TO_L2_DECAP)) + return parse_pedit_to_reformat(act, parse_attr, extack); + + return parse_pedit_to_modify_hdr(priv, act, namespace, parse_attr, extack); +} + +static bool +tc_act_can_offload_pedit(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_parse_pedit(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + 
struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5e_tc_flow *flow = parse_state->flow; + enum mlx5_flow_namespace_type ns_type; + int err; + + ns_type = mlx5e_get_flow_namespace(flow); + + err = mlx5e_tc_act_pedit_parse_action(flow->priv, act, ns_type, attr->parse_attr, + flow, parse_state->extack); + if (err) + return err; + + if (flow_flag_test(flow, L3_TO_L2_DECAP)) + goto out; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + if (ns_type == MLX5_FLOW_NAMESPACE_FDB) + esw_attr->split_count = esw_attr->out_count; + +out: + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_pedit = { + .can_offload = tc_act_can_offload_pedit, + .parse_action = tc_act_parse_pedit, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.h new file mode 100644 index 0000000..258f030 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/pedit.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_EN_TC_ACT_PEDIT_H__ +#define __MLX5_EN_TC_ACT_PEDIT_H__ + +#include "en_tc.h" + +struct pedit_headers { + struct ethhdr eth; + struct vlan_hdr vlan; + struct iphdr ip4; + struct ipv6hdr ip6; + struct tcphdr tcp; + struct udphdr udp; +}; + +struct pedit_headers_action { + struct pedit_headers vals; + struct pedit_headers masks; + u32 pedits; +}; + +int +mlx5e_tc_act_pedit_parse_action(struct mlx5e_priv *priv, + const struct flow_action_entry *act, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack); + +#endif /* __MLX5_EN_TC_ACT_PEDIT_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c new file mode 100644 index 0000000..9d8220f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/police.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
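The mangle mask handed to the driver follows tc pedit's keep-mask convention (set bits are preserved, cleared bits are rewritten), which is why parse_pedit_to_modify_hdr() above passes ~mask into set_pedit_val(), and why parse_pedit_to_reformat() inverts it before OR-ing the value into parse_attr->eth for L3_TO_L2_DECAP flows. A minimal sketch of feeding one mangle key through the exported helper on the modify-header path, assuming an FDB flow; the helper name and the rewrite value are illustrative:

static int
sketch_parse_one_mangle(struct mlx5e_priv *priv,
                        struct mlx5e_tc_flow_parse_attr *parse_attr,
                        struct netlink_ext_ack *extack)
{
        struct flow_action_entry act = {
                .id = FLOW_ACTION_MANGLE,
                .mangle.htype = FLOW_ACT_MANGLE_HDR_TYPE_ETH,
                .mangle.offset = 0,     /* first 32-bit word of the dmac */
                .mangle.mask = 0,       /* keep nothing: rewrite all 4 bytes */
                .mangle.val = 0x11223344,
        };

        /* Passing a NULL flow keeps us off the L3_TO_L2_DECAP reformat path. */
        return mlx5e_tc_act_pedit_parse_action(priv, &act, MLX5_FLOW_NAMESPACE_FDB,
                                               parse_attr, NULL, extack);
}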
+ +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_police(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return !!mlx5e_get_flow_meters(parse_state->flow->priv->mdev); +} + +static int +fill_meter_params_from_act(const struct flow_action_entry *act, + struct mlx5e_flow_meter_params *params) +{ + params->index = act->hw_index; + if (act->police.rate_bytes_ps) { + params->mode = MLX5_RATE_LIMIT_BPS; + /* change rate to bits per second */ + params->rate = act->police.rate_bytes_ps << 3; + params->burst = act->police.burst; + } else if (act->police.rate_pkt_ps) { + params->mode = MLX5_RATE_LIMIT_PPS; + params->rate = act->police.rate_pkt_ps; + params->burst = act->police.burst_pkt; + } else { + return -EOPNOTSUPP; + } + + return 0; +} + +static int +tc_act_parse_police(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + int err; + + err = fill_meter_params_from_act(act, &attr->meter_attr.params); + if (err) + return err; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO; + attr->exe_aso_type = MLX5_EXE_ASO_FLOW_METER; + + return 0; +} + +static bool +tc_act_is_multi_table_act_police(struct mlx5e_priv *priv, + const struct flow_action_entry *act, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_police_offload(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act, + struct flow_action_entry *act) +{ + struct mlx5e_flow_meter_params params = {}; + struct mlx5e_flow_meter_handle *meter; + int err = 0; + + err = fill_meter_params_from_act(act, ¶ms); + if (err) + return err; + + meter = mlx5e_tc_meter_get(priv->mdev, ¶ms); + if (IS_ERR(meter) && PTR_ERR(meter) == -ENOENT) { + meter = mlx5e_tc_meter_replace(priv->mdev, ¶ms); + } else if (!IS_ERR(meter)) { + err = mlx5e_tc_meter_update(meter, ¶ms); + mlx5e_tc_meter_put(meter); + } + + if (IS_ERR(meter)) { + NL_SET_ERR_MSG_MOD(fl_act->extack, "Failed to get flow meter"); + mlx5_core_err(priv->mdev, "Failed to get flow meter %d\n", params.index); + err = PTR_ERR(meter); + } + + return err; +} + +static int +tc_act_police_destroy(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act) +{ + struct mlx5e_flow_meter_params params = {}; + struct mlx5e_flow_meter_handle *meter; + + params.index = fl_act->index; + meter = mlx5e_tc_meter_get(priv->mdev, ¶ms); + if (IS_ERR(meter)) { + NL_SET_ERR_MSG_MOD(fl_act->extack, "Failed to get flow meter"); + mlx5_core_err(priv->mdev, "Failed to get flow meter %d\n", params.index); + return PTR_ERR(meter); + } + /* first put for the get and second for cleanup */ + mlx5e_tc_meter_put(meter); + mlx5e_tc_meter_put(meter); + return 0; +} + +static int +tc_act_police_stats(struct mlx5e_priv *priv, + struct flow_offload_action *fl_act) +{ + struct mlx5e_flow_meter_params params = {}; + struct mlx5e_flow_meter_handle *meter; + u64 bytes, packets, drops, lastuse; + + params.index = fl_act->index; + meter = mlx5e_tc_meter_get(priv->mdev, ¶ms); + if (IS_ERR(meter)) { + NL_SET_ERR_MSG_MOD(fl_act->extack, "Failed to get flow meter"); + mlx5_core_err(priv->mdev, "Failed to get flow meter %d\n", params.index); + return PTR_ERR(meter); + } + + mlx5e_tc_meter_get_stats(meter, &bytes, &packets, &drops, &lastuse); + flow_stats_update(&fl_act->stats, bytes, packets, drops, lastuse, + FLOW_ACTION_HW_STATS_DELAYED); + mlx5e_tc_meter_put(meter); + return 0; +} + +struct mlx5e_tc_act 
mlx5e_tc_act_police = { + .can_offload = tc_act_can_offload_police, + .parse_action = tc_act_parse_police, + .is_multi_table_act = tc_act_is_multi_table_act_police, + .offload_action = tc_act_police_offload, + .destroy_action = tc_act_police_destroy, + .stats_action = tc_act_police_stats, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c new file mode 100644 index 0000000..27c6f17 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/prio.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en/tc_priv.h" + +struct pedit_headers_action; + +static bool +tc_act_can_offload_prio(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + if (act->priority > parse_state->flow->priv->fs.tc.num_prio_hp) { + NL_SET_ERR_MSG_MOD(parse_state->extack, "Skb priority value is out of range"); + return false; + } + + return true; +} + +static int +tc_act_parse_prio(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + int err; + + attr->nic_attr->user_prio = act->priority; + err = mlx5e_tc_match_to_reg_set(priv->mdev, &attr->parse_attr->mod_hdr_acts, + MLX5_FLOW_NAMESPACE_KERNEL, + USER_PRIO_TO_REG, attr->nic_attr->user_prio); + if (err) + return err; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_prio = { + .can_offload = tc_act_can_offload_prio, + .parse_action = tc_act_parse_prio, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c new file mode 100644 index 0000000..6454b03 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/ptype.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
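One detail worth noting in fill_meter_params_from_act() above: byte-mode policers are programmed in bits per second, hence the left shift by three of tc's rate_bytes_ps. A rough sketch of the resulting parameters for a 1 Gbit/s byte-based policer; the helper name and the burst value are illustrative:

static void
sketch_fill_byte_policer(struct mlx5e_flow_meter_params *params)
{
        u64 rate_bytes_ps = 125000000ULL;       /* 1 Gbit/s expressed in bytes/s */

        params->mode = MLX5_RATE_LIMIT_BPS;
        params->rate = rate_bytes_ps << 3;      /* 1000000000 bits/s */
        params->burst = 262144;
        /* packet-mode policers (rate_pkt_ps) use MLX5_RATE_LIMIT_PPS and
         * keep rate/burst in packets instead.
         */
}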
+ +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_ptype(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_parse_ptype(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + + if (act->ptype != PACKET_HOST) { + NL_SET_ERR_MSG_MOD(extack, "skbedit ptype is only supported with type host"); + return -EOPNOTSUPP; + } + + parse_state->ptype_host = true; + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_ptype = { + .can_offload = tc_act_can_offload_ptype, + .parse_action = tc_act_parse_ptype, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/redirect_ingress.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/redirect_ingress.c new file mode 100644 index 0000000..ad09a8a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/redirect_ingress.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_redirect_ingress(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct net_device *out_dev = act->dev; + struct mlx5_esw_flow_attr *esw_attr; + + parse_attr = attr->parse_attr; + esw_attr = attr->esw_attr; + + if (!out_dev) + return false; + + if (!netif_is_ovs_master(out_dev)) { + NL_SET_ERR_MSG_MOD(extack, + "redirect to ingress is supported only for OVS internal ports"); + return false; + } + + if (netif_is_ovs_master(parse_attr->filter_dev)) { + NL_SET_ERR_MSG_MOD(extack, + "redirect to ingress is not supported from internal port"); + return false; + } + + if (!parse_state->ptype_host) { + NL_SET_ERR_MSG_MOD(extack, + "redirect to int port ingress requires ptype=host action"); + return false; + } + + if (esw_attr->out_count) { + NL_SET_ERR_MSG_MOD(extack, + "redirect to int port ingress is supported only as single destination"); + return false; + } + + return true; +} + +static int +tc_act_parse_redirect_ingress(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct net_device *out_dev = act->dev; + int err; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + err = mlx5e_set_fwd_to_int_port_actions(priv, attr, out_dev->ifindex, + MLX5E_TC_INT_PORT_INGRESS, + &attr->action, esw_attr->out_count); + if (err) + return err; + + esw_attr->out_count++; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_redirect_ingress = { + .can_offload = tc_act_can_offload_redirect_ingress, + .parse_action = tc_act_parse_redirect_ingress, +}; + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c new file mode 100644 index 0000000..2c01964 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.c @@ -0,0 +1,71 @@ +// 
SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include +#include "act.h" +#include "en/tc_priv.h" +#include "en/tc/act/sample.h" + +static bool +tc_act_can_offload_sample(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + bool ct_nat; + + ct_nat = attr->ct_attr.ct_action & TCA_CT_ACT_NAT; + + if (flow_flag_test(parse_state->flow, CT) && ct_nat) { + NL_SET_ERR_MSG_MOD(extack, "Sample action with CT NAT is not supported"); + return false; + } + + return true; +} + +static int +tc_act_parse_sample(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_sample_attr *sample_attr = &attr->sample_attr; + + sample_attr->rate = act->sample.rate; + sample_attr->group_num = act->sample.psample_group->group_num; + + if (act->sample.truncate) + sample_attr->trunc_size = act->sample.trunc_size; + + attr->flags |= MLX5_ATTR_FLAG_SAMPLE; + flow_flag_set(parse_state->flow, SAMPLE); + + return 0; +} + +bool +mlx5e_tc_act_sample_is_multi_table(struct mlx5_core_dev *mdev, + struct mlx5_flow_attr *attr) +{ + if (MLX5_CAP_GEN(mdev, reg_c_preserve) || + attr->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) + return true; + + return false; +} + +static bool +tc_act_is_multi_table_act_sample(struct mlx5e_priv *priv, + const struct flow_action_entry *act, + struct mlx5_flow_attr *attr) +{ + return mlx5e_tc_act_sample_is_multi_table(priv->mdev, attr); +} + +struct mlx5e_tc_act mlx5e_tc_act_sample = { + .can_offload = tc_act_can_offload_sample, + .parse_action = tc_act_parse_sample, + .is_multi_table_act = tc_act_is_multi_table_act_sample, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.h new file mode 100644 index 0000000..3efb3a1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/sample.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_EN_TC_ACT_SAMPLE_H__ +#define __MLX5_EN_TC_ACT_SAMPLE_H__ + +#include +#include "en/tc_priv.h" + +bool +mlx5e_tc_act_sample_is_multi_table(struct mlx5_core_dev *mdev, + struct mlx5_flow_attr *attr); + +#endif /* __MLX5_EN_TC_ACT_SAMPLE_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/trap.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/trap.c new file mode 100644 index 0000000..a7d9eab --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/trap.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +#include "act.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_trap(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + struct netlink_ext_ack *extack = parse_state->extack; + + if (parse_state->num_actions != 1) { + NL_SET_ERR_MSG_MOD(extack, "action trap is supported as a sole action only"); + return false; + } + + return true; +} + +static int +tc_act_parse_trap(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + attr->flags |= MLX5_ATTR_FLAG_SLOW_PATH; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_trap = { + .can_offload = tc_act_can_offload_trap, + .parse_action = tc_act_parse_trap, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/tun.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/tun.c new file mode 100644 index 0000000..b4fa2de --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/tun.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include "act.h" +#include "en/tc_tun_encap.h" +#include "en/tc_priv.h" + +static bool +tc_act_can_offload_tun_encap(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + if (!act->tunnel) { + NL_SET_ERR_MSG_MOD(parse_state->extack, + "Zero tunnel attributes is not supported"); + return false; + } + + return true; +} + +static int +tc_act_parse_tun_encap(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + parse_state->tun_info = act->tunnel; + parse_state->encap = true; + + return 0; +} + +static bool +tc_act_can_offload_tun_decap(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_parse_tun_decap(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + parse_state->decap = true; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_tun_encap = { + .can_offload = tc_act_can_offload_tun_encap, + .parse_action = tc_act_parse_tun_encap, +}; + +struct mlx5e_tc_act mlx5e_tc_act_tun_decap = { + .can_offload = tc_act_can_offload_tun_decap, + .parse_action = tc_act_parse_tun_decap, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c new file mode 100644 index 0000000..82e8acc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
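tc_act_parse_tun_encap() and tc_act_parse_tun_decap() above only record state; nothing is programmed at that point. For encapsulation the pending tun_info is consumed by the next mirred action (parse_mirred_encap() in mirred.c above), and route/neighbour resolution is deferred further to tc_tun_encap.c. A caller-side sketch of that two-step parse, assuming both flow_action entries are already at hand; the helper name is illustrative:

static int
sketch_parse_encap_then_redirect(struct mlx5e_tc_act_parse_state *parse_state,
                                 const struct flow_action_entry *tunnel_key_act,
                                 const struct flow_action_entry *mirred_act,
                                 struct mlx5e_priv *priv,
                                 struct mlx5_flow_attr *attr)
{
        int err;

        /* Step 1: only stash the metadata; parse_state->encap becomes true
         * and parse_state->tun_info points at the tunnel key.
         */
        err = mlx5e_tc_act_tun_encap.parse_action(parse_state, tunnel_key_act,
                                                  priv, attr);
        if (err)
                return err;

        /* Step 2: mirred notices parse_state->encap, records the egress
         * ifindex plus a copy of the tunnel info, and marks the destination
         * MLX5_ESW_DEST_ENCAP; the encap header itself is built later.
         */
        return mlx5e_tc_act_mirred.parse_action(parse_state, mirred_act,
                                                priv, attr);
}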
+ +#include +#include "act.h" +#include "vlan.h" +#include "en/tc_priv.h" + +static int +add_vlan_prio_tag_rewrite_action(struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + u32 *action, struct netlink_ext_ack *extack) +{ + const struct flow_action_entry prio_tag_act = { + .vlan.vid = 0, + .vlan.prio = + MLX5_GET(fte_match_set_lyr_2_4, + mlx5e_get_match_headers_value(*action, + &parse_attr->spec), + first_prio) & + MLX5_GET(fte_match_set_lyr_2_4, + mlx5e_get_match_headers_criteria(*action, + &parse_attr->spec), + first_prio), + }; + + return mlx5e_tc_act_vlan_add_rewrite_action(priv, MLX5_FLOW_NAMESPACE_FDB, + &prio_tag_act, parse_attr, action, + extack); +} + +static int +parse_tc_vlan_action(struct mlx5e_priv *priv, + const struct flow_action_entry *act, + struct mlx5_esw_flow_attr *attr, + u32 *action, + struct netlink_ext_ack *extack) +{ + u8 vlan_idx = attr->total_vlan; + + if (vlan_idx >= MLX5_FS_VLAN_DEPTH) { + NL_SET_ERR_MSG_MOD(extack, "Total vlans used is greater than supported"); + return -EOPNOTSUPP; + } + + switch (act->id) { + case FLOW_ACTION_VLAN_POP: + if (*action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) { + if (!mlx5_eswitch_vlan_actions_supported(priv->mdev, + MLX5_FS_VLAN_DEPTH)) { + NL_SET_ERR_MSG_MOD(extack, "vlan pop action is not supported"); + return -EOPNOTSUPP; + } + + *action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2; + } else { + *action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + } + break; + case FLOW_ACTION_VLAN_PUSH: + attr->vlan_vid[vlan_idx] = act->vlan.vid; + attr->vlan_prio[vlan_idx] = act->vlan.prio; + attr->vlan_proto[vlan_idx] = act->vlan.proto; + if (!attr->vlan_proto[vlan_idx]) + attr->vlan_proto[vlan_idx] = htons(ETH_P_8021Q); + + if (*action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) { + if (!mlx5_eswitch_vlan_actions_supported(priv->mdev, + MLX5_FS_VLAN_DEPTH)) { + NL_SET_ERR_MSG_MOD(extack, + "vlan push action is not supported for vlan depth > 1"); + return -EOPNOTSUPP; + } + + *action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2; + } else { + if (!mlx5_eswitch_vlan_actions_supported(priv->mdev, 1) && + (act->vlan.proto != htons(ETH_P_8021Q) || + act->vlan.prio)) { + NL_SET_ERR_MSG_MOD(extack, "vlan push action is not supported"); + return -EOPNOTSUPP; + } + + *action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; + } + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unexpected action id for VLAN"); + return -EINVAL; + } + + attr->total_vlan = vlan_idx + 1; + + return 0; +} + +int +mlx5e_tc_act_vlan_add_push_action(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + struct net_device **out_dev, + struct netlink_ext_ack *extack) +{ + struct net_device *vlan_dev = *out_dev; + struct flow_action_entry vlan_act = { + .id = FLOW_ACTION_VLAN_PUSH, + .vlan.vid = vlan_dev_vlan_id(vlan_dev), + .vlan.proto = vlan_dev_vlan_proto(vlan_dev), + .vlan.prio = 0, + }; + int err; + + err = parse_tc_vlan_action(priv, &vlan_act, attr->esw_attr, &attr->action, extack); + if (err) + return err; + + rcu_read_lock(); + *out_dev = dev_get_by_index_rcu(dev_net(vlan_dev), dev_get_iflink(vlan_dev)); + rcu_read_unlock(); + if (!*out_dev) + return -ENODEV; + + if (is_vlan_dev(*out_dev)) + err = mlx5e_tc_act_vlan_add_push_action(priv, attr, out_dev, extack); + + return err; +} + +int +mlx5e_tc_act_vlan_add_pop_action(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack) +{ + struct flow_action_entry vlan_act = { + .id = FLOW_ACTION_VLAN_POP, + }; + int nest_level, err = 0; + + nest_level = attr->parse_attr->filter_dev->lower_level - + 
priv->netdev->lower_level; + while (nest_level--) { + err = parse_tc_vlan_action(priv, &vlan_act, attr->esw_attr, &attr->action, + extack); + if (err) + return err; + } + + return err; +} + +static bool +tc_act_can_offload_vlan(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_parse_vlan(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + int err; + + if (esw_attr->split_count != esw_attr->out_count) { + NL_SET_ERR_MSG_MOD(parse_state->extack, "Multiple output ports with different vlan action is not supported"); + return -EOPNOTSUPP; + } + + if (act->id == FLOW_ACTION_VLAN_PUSH && + (attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP)) { + /* Replace vlan pop+push with vlan modify */ + attr->action &= ~MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + err = mlx5e_tc_act_vlan_add_rewrite_action(priv, MLX5_FLOW_NAMESPACE_FDB, act, + attr->parse_attr, &attr->action, + parse_state->extack); + } else { + err = parse_tc_vlan_action(priv, act, esw_attr, &attr->action, + parse_state->extack); + } + + if (err) + return err; + + esw_attr->split_count = esw_attr->out_count; + + return 0; +} + +static int +tc_act_post_parse_vlan(struct mlx5e_tc_act_parse_state *parse_state, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr; + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + int err; + + if (MLX5_CAP_GEN(esw->dev, prio_tag_required) && + attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) { + /* For prio tag mode, replace vlan pop with rewrite vlan prio + * tag rewrite. + */ + attr->action &= ~MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + err = add_vlan_prio_tag_rewrite_action(priv, parse_attr, + &attr->action, extack); + if (err) + return err; + } + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_vlan = { + .can_offload = tc_act_can_offload_vlan, + .parse_action = tc_act_parse_vlan, + .post_parse = tc_act_post_parse_vlan, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.h new file mode 100644 index 0000000..2fa58c6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_EN_TC_ACT_VLAN_H__ +#define __MLX5_EN_TC_ACT_VLAN_H__ + +#include +#include "en/tc_priv.h" + +struct pedit_headers_action; + +int +mlx5e_tc_act_vlan_add_push_action(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + struct net_device **out_dev, + struct netlink_ext_ack *extack); + +int +mlx5e_tc_act_vlan_add_pop_action(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack); + +int +mlx5e_tc_act_vlan_add_rewrite_action(struct mlx5e_priv *priv, int namespace, + const struct flow_action_entry *act, + struct mlx5e_tc_flow_parse_attr *parse_attr, + u32 *action, struct netlink_ext_ack *extack); + +#endif /* __MLX5_EN_TC_ACT_VLAN_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan_mangle.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan_mangle.c new file mode 100644 index 0000000..28444d4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/act/vlan_mangle.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include +#include "act.h" +#include "vlan.h" +#include "en/tc_priv.h" + +struct pedit_headers_action; + +int +mlx5e_tc_act_vlan_add_rewrite_action(struct mlx5e_priv *priv, int namespace, + const struct flow_action_entry *act, + struct mlx5e_tc_flow_parse_attr *parse_attr, + u32 *action, struct netlink_ext_ack *extack) +{ + u16 mask16 = VLAN_VID_MASK; + u16 val16 = act->vlan.vid & VLAN_VID_MASK; + const struct flow_action_entry pedit_act = { + .id = FLOW_ACTION_MANGLE, + .mangle.htype = FLOW_ACT_MANGLE_HDR_TYPE_ETH, + .mangle.offset = offsetof(struct vlan_ethhdr, h_vlan_TCI), + .mangle.mask = ~(u32)be16_to_cpu(*(__be16 *)&mask16), + .mangle.val = (u32)be16_to_cpu(*(__be16 *)&val16), + }; + u8 match_prio_mask, match_prio_val; + void *headers_c, *headers_v; + int err; + + headers_c = mlx5e_get_match_headers_criteria(*action, &parse_attr->spec); + headers_v = mlx5e_get_match_headers_value(*action, &parse_attr->spec); + + if (!(MLX5_GET(fte_match_set_lyr_2_4, headers_c, cvlan_tag) && + MLX5_GET(fte_match_set_lyr_2_4, headers_v, cvlan_tag))) { + NL_SET_ERR_MSG_MOD(extack, "VLAN rewrite action must have VLAN protocol match"); + return -EOPNOTSUPP; + } + + match_prio_mask = MLX5_GET(fte_match_set_lyr_2_4, headers_c, first_prio); + match_prio_val = MLX5_GET(fte_match_set_lyr_2_4, headers_v, first_prio); + if (act->vlan.prio != (match_prio_val & match_prio_mask)) { + NL_SET_ERR_MSG_MOD(extack, "Changing VLAN prio is not supported"); + return -EOPNOTSUPP; + } + + err = mlx5e_tc_act_pedit_parse_action(priv, &pedit_act, namespace, parse_attr, + NULL, extack); + *action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + return err; +} + +static bool +tc_act_can_offload_vlan_mangle(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + int act_index, + struct mlx5_flow_attr *attr) +{ + return true; +} + +static int +tc_act_parse_vlan_mangle(struct mlx5e_tc_act_parse_state *parse_state, + const struct flow_action_entry *act, + struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + enum mlx5_flow_namespace_type ns_type; + int err; + + ns_type = mlx5e_get_flow_namespace(parse_state->flow); + err = mlx5e_tc_act_vlan_add_rewrite_action(priv, ns_type, act, attr->parse_attr, + &attr->action, parse_state->extack); + if (err) + return err; + + if (ns_type == MLX5_FLOW_NAMESPACE_FDB) + attr->esw_attr->split_count 
= attr->esw_attr->out_count; + + return 0; +} + +struct mlx5e_tc_act mlx5e_tc_act_vlan_mangle = { + .can_offload = tc_act_can_offload_vlan_mangle, + .parse_action = tc_act_parse_vlan_mangle, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs.h new file mode 100644 index 0000000..bb6b1a9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. */ + +#ifndef __MLX5_EN_TC_CT_FS_H__ +#define __MLX5_EN_TC_CT_FS_H__ + +struct mlx5_ct_fs { + const struct net_device *netdev; + struct mlx5_core_dev *dev; + + /* private data */ + void *priv_data[]; +}; + +struct mlx5_ct_fs_rule { +}; + +struct mlx5_ct_fs_ops { + int (*init)(struct mlx5_ct_fs *fs, struct mlx5_flow_table *ct, + struct mlx5_flow_table *ct_nat, struct mlx5_flow_table *post_ct); + void (*destroy)(struct mlx5_ct_fs *fs); + + struct mlx5_ct_fs_rule * (*ct_rule_add)(struct mlx5_ct_fs *fs, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, + struct flow_rule *flow_rule); + void (*ct_rule_del)(struct mlx5_ct_fs *fs, struct mlx5_ct_fs_rule *fs_rule); + + size_t priv_size; +}; + +static inline void *mlx5_ct_fs_priv(struct mlx5_ct_fs *fs) +{ + return &fs->priv_data; +} + +struct mlx5_ct_fs_ops *mlx5_ct_fs_dmfs_ops_get(void); + +#if IS_ENABLED(CONFIG_MLX5_SW_STEERING) +struct mlx5_ct_fs_ops *mlx5_ct_fs_smfs_ops_get(void); +#else +static inline struct mlx5_ct_fs_ops * +mlx5_ct_fs_smfs_ops_get(void) +{ + return NULL; +} +#endif /* IS_ENABLED(CONFIG_MLX5_SW_STEERING) */ + +#endif /* __MLX5_EN_TC_CT_FS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_dmfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_dmfs.c new file mode 100644 index 0000000..ae4f55b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_dmfs.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
*/ + +#include "en_tc.h" +#include "en/tc_ct.h" +#include "en/tc/ct_fs.h" + +#define ct_dbg(fmt, args...)\ + netdev_dbg(fs->netdev, "ct_fs_dmfs debug: " fmt "\n", ##args) + +struct mlx5_ct_fs_dmfs_rule { + struct mlx5_ct_fs_rule fs_rule; + struct mlx5_flow_handle *rule; + struct mlx5_flow_attr *attr; +}; + +static int +mlx5_ct_fs_dmfs_init(struct mlx5_ct_fs *fs, struct mlx5_flow_table *ct, + struct mlx5_flow_table *ct_nat, struct mlx5_flow_table *post_ct) +{ + return 0; +} + +static void +mlx5_ct_fs_dmfs_destroy(struct mlx5_ct_fs *fs) +{ +} + +static struct mlx5_ct_fs_rule * +mlx5_ct_fs_dmfs_ct_rule_add(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, struct flow_rule *flow_rule) +{ + struct mlx5e_priv *priv = netdev_priv(fs->netdev); + struct mlx5_ct_fs_dmfs_rule *dmfs_rule; + int err; + + dmfs_rule = kzalloc(sizeof(*dmfs_rule), GFP_KERNEL); + if (!dmfs_rule) + return ERR_PTR(-ENOMEM); + + dmfs_rule->rule = mlx5_tc_rule_insert(priv, spec, attr); + if (IS_ERR(dmfs_rule->rule)) { + err = PTR_ERR(dmfs_rule->rule); + ct_dbg("Failed to add ct entry fs rule"); + goto err_insert; + } + + dmfs_rule->attr = attr; + + return &dmfs_rule->fs_rule; + +err_insert: + kfree(dmfs_rule); + return ERR_PTR(err); +} + +static void +mlx5_ct_fs_dmfs_ct_rule_del(struct mlx5_ct_fs *fs, struct mlx5_ct_fs_rule *fs_rule) +{ + struct mlx5_ct_fs_dmfs_rule *dmfs_rule = container_of(fs_rule, + struct mlx5_ct_fs_dmfs_rule, + fs_rule); + + mlx5_tc_rule_delete(netdev_priv(fs->netdev), dmfs_rule->rule, dmfs_rule->attr); + kfree(dmfs_rule); +} + +static struct mlx5_ct_fs_ops dmfs_ops = { + .ct_rule_add = mlx5_ct_fs_dmfs_ct_rule_add, + .ct_rule_del = mlx5_ct_fs_dmfs_ct_rule_del, + + .init = mlx5_ct_fs_dmfs_init, + .destroy = mlx5_ct_fs_dmfs_destroy, +}; + +struct mlx5_ct_fs_ops *mlx5_ct_fs_dmfs_ops_get(void) +{ + return &dmfs_ops; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c new file mode 100644 index 0000000..bec9ed0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_smfs.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
*/ + +#include + +#include "en_tc.h" +#include "en/tc_priv.h" +#include "en/tc_ct.h" +#include "en/tc/ct_fs.h" + +#include "lib/smfs.h" + +#define INIT_ERR_PREFIX "ct_fs_smfs init failed" +#define ct_dbg(fmt, args...)\ + netdev_dbg(fs->netdev, "ct_fs_smfs debug: " fmt "\n", ##args) +#define MLX5_CT_TCP_FLAGS_MASK cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16) + +struct mlx5_ct_fs_smfs_matcher { + struct mlx5dr_matcher *dr_matcher; + struct list_head list; + int prio; + refcount_t ref; +}; + +struct mlx5_ct_fs_smfs_matchers { + struct mlx5_ct_fs_smfs_matcher smfs_matchers[6]; + struct list_head used; +}; + +struct mlx5_ct_fs_smfs { + struct mlx5dr_table *ct_tbl, *ct_nat_tbl; + struct mlx5_ct_fs_smfs_matchers matchers; + struct mlx5_ct_fs_smfs_matchers matchers_nat; + struct mlx5dr_action *fwd_action; + struct mlx5_flow_table *ct_nat; + struct mutex lock; /* Guards matchers */ +}; + +struct mlx5_ct_fs_smfs_rule { + struct mlx5_ct_fs_rule fs_rule; + struct mlx5dr_rule *rule; + struct mlx5dr_action *count_action; + struct mlx5_ct_fs_smfs_matcher *smfs_matcher; +}; + +static inline void +mlx5_ct_fs_smfs_fill_mask(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, bool ipv4, bool tcp, + bool gre) +{ + void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers); + + if (likely(MLX5_CAP_FLOWTABLE_NIC_RX(fs->dev, ft_field_support.outer_ip_version))) + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_version); + else + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol); + if (likely(ipv4)) { + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + } else { + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xFF, + MLX5_FLD_SZ_BYTES(fte_match_set_lyr_2_4, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6)); + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xFF, + MLX5_FLD_SZ_BYTES(fte_match_set_lyr_2_4, + src_ipv4_src_ipv6.ipv6_layout.ipv6)); + } + + if (likely(tcp)) { + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, tcp_sport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, tcp_dport); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_flags, + ntohs(MLX5_CT_TCP_FLAGS_MASK)); + } else if (!gre) { + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, udp_sport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, udp_dport); + } + + mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, 0, MLX5_CT_ZONE_MASK); +} + +static struct mlx5dr_matcher * +mlx5_ct_fs_smfs_matcher_create(struct mlx5_ct_fs *fs, struct mlx5dr_table *tbl, bool ipv4, + bool tcp, bool gre, u32 priority) +{ + struct mlx5dr_matcher *dr_matcher; + struct mlx5_flow_spec *spec; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + mlx5_ct_fs_smfs_fill_mask(fs, spec, ipv4, tcp, gre); + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2 | MLX5_MATCH_OUTER_HEADERS; + + dr_matcher = mlx5_smfs_matcher_create(tbl, priority, spec); + kfree(spec); + if (!dr_matcher) + return ERR_PTR(-EINVAL); + + return dr_matcher; +} + +static struct mlx5_ct_fs_smfs_matcher * +mlx5_ct_fs_smfs_matcher_get(struct mlx5_ct_fs *fs, bool nat, bool ipv4, bool tcp, bool gre) +{ + struct mlx5_ct_fs_smfs *fs_smfs = mlx5_ct_fs_priv(fs); + struct mlx5_ct_fs_smfs_matcher *m, *smfs_matcher; + struct 
mlx5_ct_fs_smfs_matchers *matchers; + struct mlx5dr_matcher *dr_matcher; + struct mlx5dr_table *tbl; + struct list_head *prev; + int prio; + + matchers = nat ? &fs_smfs->matchers_nat : &fs_smfs->matchers; + smfs_matcher = &matchers->smfs_matchers[ipv4 * 3 + tcp * 2 + gre]; + + if (refcount_inc_not_zero(&smfs_matcher->ref)) + return smfs_matcher; + + mutex_lock(&fs_smfs->lock); + + /* Retry with lock, as another thread might have already created the relevant matcher + * till we acquired the lock + */ + if (refcount_inc_not_zero(&smfs_matcher->ref)) + goto out_unlock; + + // Find next available priority in sorted used list + prio = 0; + prev = &matchers->used; + list_for_each_entry(m, &matchers->used, list) { + prev = &m->list; + + if (m->prio == prio) + prio = m->prio + 1; + else + break; + } + + tbl = nat ? fs_smfs->ct_nat_tbl : fs_smfs->ct_tbl; + dr_matcher = mlx5_ct_fs_smfs_matcher_create(fs, tbl, ipv4, tcp, gre, prio); + if (IS_ERR(dr_matcher)) { + netdev_warn(fs->netdev, + "ct_fs_smfs: failed to create matcher (nat %d, ipv4 %d, tcp %d, gre %d), err: %ld\n", + nat, ipv4, tcp, gre, PTR_ERR(dr_matcher)); + + smfs_matcher = ERR_CAST(dr_matcher); + goto out_unlock; + } + + smfs_matcher->dr_matcher = dr_matcher; + smfs_matcher->prio = prio; + list_add(&smfs_matcher->list, prev); + refcount_set(&smfs_matcher->ref, 1); + +out_unlock: + mutex_unlock(&fs_smfs->lock); + return smfs_matcher; +} + +static void +mlx5_ct_fs_smfs_matcher_put(struct mlx5_ct_fs *fs, struct mlx5_ct_fs_smfs_matcher *smfs_matcher) +{ + struct mlx5_ct_fs_smfs *fs_smfs = mlx5_ct_fs_priv(fs); + + if (!refcount_dec_and_mutex_lock(&smfs_matcher->ref, &fs_smfs->lock)) + return; + + mlx5_smfs_matcher_destroy(smfs_matcher->dr_matcher); + list_del(&smfs_matcher->list); + mutex_unlock(&fs_smfs->lock); +} + +static int +mlx5_ct_fs_smfs_init(struct mlx5_ct_fs *fs, struct mlx5_flow_table *ct, + struct mlx5_flow_table *ct_nat, struct mlx5_flow_table *post_ct) +{ + struct mlx5dr_table *ct_tbl, *ct_nat_tbl, *post_ct_tbl; + struct mlx5_ct_fs_smfs *fs_smfs = mlx5_ct_fs_priv(fs); + + post_ct_tbl = mlx5_smfs_table_get_from_fs_ft(post_ct); + ct_nat_tbl = mlx5_smfs_table_get_from_fs_ft(ct_nat); + ct_tbl = mlx5_smfs_table_get_from_fs_ft(ct); + fs_smfs->ct_nat = ct_nat; + + if (!ct_tbl || !ct_nat_tbl || !post_ct_tbl) { + netdev_warn(fs->netdev, "ct_fs_smfs: failed to init, missing backing dr tables"); + return -EOPNOTSUPP; + } + + ct_dbg("using smfs steering"); + + fs_smfs->fwd_action = mlx5_smfs_action_create_dest_table(post_ct_tbl); + if (!fs_smfs->fwd_action) { + return -EINVAL; + } + + fs_smfs->ct_tbl = ct_tbl; + fs_smfs->ct_nat_tbl = ct_nat_tbl; + mutex_init(&fs_smfs->lock); + INIT_LIST_HEAD(&fs_smfs->matchers.used); + INIT_LIST_HEAD(&fs_smfs->matchers_nat.used); + + return 0; +} + +static void +mlx5_ct_fs_smfs_destroy(struct mlx5_ct_fs *fs) +{ + struct mlx5_ct_fs_smfs *fs_smfs = mlx5_ct_fs_priv(fs); + + mlx5_smfs_action_destroy(fs_smfs->fwd_action); +} + +static inline bool +mlx5_tc_ct_valid_used_dissector_keys(const u32 used_keys) +{ +#define DISS_BIT(name) BIT(FLOW_DISSECTOR_KEY_ ## name) + const u32 basic_keys = DISS_BIT(BASIC) | DISS_BIT(CONTROL) | DISS_BIT(META); + const u32 ipv4_tcp = basic_keys | DISS_BIT(IPV4_ADDRS) | DISS_BIT(PORTS) | DISS_BIT(TCP); + const u32 ipv6_tcp = basic_keys | DISS_BIT(IPV6_ADDRS) | DISS_BIT(PORTS) | DISS_BIT(TCP); + const u32 ipv4_udp = basic_keys | DISS_BIT(IPV4_ADDRS) | DISS_BIT(PORTS); + const u32 ipv6_udp = basic_keys | DISS_BIT(IPV6_ADDRS) | DISS_BIT(PORTS); + const u32 ipv4_gre = basic_keys | 
DISS_BIT(IPV4_ADDRS); + const u32 ipv6_gre = basic_keys | DISS_BIT(IPV6_ADDRS); + + return (used_keys == ipv4_tcp || used_keys == ipv4_udp || used_keys == ipv6_tcp || + used_keys == ipv6_udp || used_keys == ipv4_gre || used_keys == ipv6_gre); +} + +static bool +mlx5_ct_fs_smfs_ct_validate_flow_rule(struct mlx5_ct_fs *fs, struct flow_rule *flow_rule) +{ + struct flow_match_ipv4_addrs ipv4_addrs; + struct flow_match_ipv6_addrs ipv6_addrs; + struct flow_match_control control; + struct flow_match_basic basic; + struct flow_match_ports ports; + struct flow_match_tcp tcp; + + if (!mlx5_tc_ct_valid_used_dissector_keys(flow_rule->match.dissector->used_keys)) { + ct_dbg("rule uses unexpected dissectors (0x%08x)", + flow_rule->match.dissector->used_keys); + return false; + } + + flow_rule_match_basic(flow_rule, &basic); + flow_rule_match_control(flow_rule, &control); + flow_rule_match_ipv4_addrs(flow_rule, &ipv4_addrs); + flow_rule_match_ipv6_addrs(flow_rule, &ipv6_addrs); + if (basic.key->ip_proto != IPPROTO_GRE) + flow_rule_match_ports(flow_rule, &ports); + if (basic.key->ip_proto == IPPROTO_TCP) + flow_rule_match_tcp(flow_rule, &tcp); + + if (basic.mask->n_proto != htons(0xFFFF) || + (basic.key->n_proto != htons(ETH_P_IP) && basic.key->n_proto != htons(ETH_P_IPV6)) || + basic.mask->ip_proto != 0xFF || + (basic.key->ip_proto != IPPROTO_UDP && basic.key->ip_proto != IPPROTO_TCP && + basic.key->ip_proto != IPPROTO_GRE)) { + ct_dbg("rule uses unexpected basic match (n_proto 0x%04x/0x%04x, ip_proto 0x%02x/0x%02x)", + ntohs(basic.key->n_proto), ntohs(basic.mask->n_proto), + basic.key->ip_proto, basic.mask->ip_proto); + return false; + } + + if (basic.key->ip_proto != IPPROTO_GRE && + (ports.mask->src != htons(0xFFFF) || ports.mask->dst != htons(0xFFFF))) { + ct_dbg("rule uses ports match (src 0x%04x, dst 0x%04x)", + ports.mask->src, ports.mask->dst); + return false; + } + + if (basic.key->ip_proto == IPPROTO_TCP && tcp.mask->flags != MLX5_CT_TCP_FLAGS_MASK) { + ct_dbg("rule uses unexpected tcp match (flags 0x%02x)", tcp.mask->flags); + return false; + } + + return true; +} + +static struct mlx5_ct_fs_rule * +mlx5_ct_fs_smfs_ct_rule_add(struct mlx5_ct_fs *fs, struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, struct flow_rule *flow_rule) +{ + struct mlx5_ct_fs_smfs *fs_smfs = mlx5_ct_fs_priv(fs); + struct mlx5_ct_fs_smfs_matcher *smfs_matcher; + struct mlx5_ct_fs_smfs_rule *smfs_rule; + struct mlx5dr_action *actions[5]; + struct mlx5dr_rule *rule; + int num_actions = 0, err; + bool nat, tcp, ipv4, gre; + + if (!mlx5_ct_fs_smfs_ct_validate_flow_rule(fs, flow_rule)) + return ERR_PTR(-EOPNOTSUPP); + + smfs_rule = kzalloc(sizeof(*smfs_rule), GFP_KERNEL); + if (!smfs_rule) + return ERR_PTR(-ENOMEM); + + smfs_rule->count_action = mlx5_smfs_action_create_flow_counter(mlx5_fc_id(attr->counter)); + if (!smfs_rule->count_action) { + err = -EINVAL; + goto err_count; + } + + actions[num_actions++] = smfs_rule->count_action; + actions[num_actions++] = attr->modify_hdr->action.dr_action; + actions[num_actions++] = fs_smfs->fwd_action; + + nat = (attr->ft == fs_smfs->ct_nat); + ipv4 = mlx5e_tc_get_ip_version(spec, true) == 4; + tcp = MLX5_GET(fte_match_param, spec->match_value, + outer_headers.ip_protocol) == IPPROTO_TCP; + gre = MLX5_GET(fte_match_param, spec->match_value, + outer_headers.ip_protocol) == IPPROTO_GRE; + + smfs_matcher = mlx5_ct_fs_smfs_matcher_get(fs, nat, ipv4, tcp, gre); + if (IS_ERR(smfs_matcher)) { + err = PTR_ERR(smfs_matcher); + goto err_matcher; + } + + rule = 
mlx5_smfs_rule_create(smfs_matcher->dr_matcher, spec, num_actions, actions, + spec->flow_context.flow_source); + if (!rule) { + err = -EINVAL; + goto err_create; + } + + smfs_rule->rule = rule; + smfs_rule->smfs_matcher = smfs_matcher; + + return &smfs_rule->fs_rule; + +err_create: + mlx5_ct_fs_smfs_matcher_put(fs, smfs_matcher); +err_matcher: + mlx5_smfs_action_destroy(smfs_rule->count_action); +err_count: + kfree(smfs_rule); + return ERR_PTR(err); +} + +static void +mlx5_ct_fs_smfs_ct_rule_del(struct mlx5_ct_fs *fs, struct mlx5_ct_fs_rule *fs_rule) +{ + struct mlx5_ct_fs_smfs_rule *smfs_rule = container_of(fs_rule, + struct mlx5_ct_fs_smfs_rule, + fs_rule); + + mlx5_smfs_rule_destroy(smfs_rule->rule); + mlx5_ct_fs_smfs_matcher_put(fs, smfs_rule->smfs_matcher); + mlx5_smfs_action_destroy(smfs_rule->count_action); + kfree(smfs_rule); +} + +static struct mlx5_ct_fs_ops fs_smfs_ops = { + .ct_rule_add = mlx5_ct_fs_smfs_ct_rule_add, + .ct_rule_del = mlx5_ct_fs_smfs_ct_rule_del, + + .init = mlx5_ct_fs_smfs_init, + .destroy = mlx5_ct_fs_smfs_destroy, + + .priv_size = sizeof(struct mlx5_ct_fs_smfs), +}; + +struct mlx5_ct_fs_ops * +mlx5_ct_fs_smfs_ops_get(void) +{ + return &fs_smfs_ops; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c new file mode 100644 index 0000000..6636ef6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include +#include "en/mapping.h" +#include "en/tc/int_port.h" +#include "en.h" +#include "en_rep.h" +#include "en_tc.h" + +struct mlx5e_tc_int_port { + enum mlx5e_tc_int_port_type type; + int ifindex; + u32 match_metadata; + u32 mapping; + struct list_head list; + struct mlx5_flow_handle *rx_rule; + refcount_t refcnt; + struct rcu_head rcu_head; +}; + +struct mlx5e_tc_int_port_priv { + struct mlx5_core_dev *dev; + struct mutex int_ports_lock; /* Protects int ports list */ + struct list_head int_ports; /* Uses int_ports_lock */ + u16 num_ports; + bool ul_rep_rx_ready; /* Set when uplink is performing teardown */ + struct mapping_ctx *metadata_mapping; /* Metadata for source port rewrite and matching */ +}; + +bool mlx5e_tc_int_port_supported(const struct mlx5_eswitch *esw) +{ + return mlx5_eswitch_vport_match_metadata_enabled(esw) && + MLX5_CAP_GEN(esw->dev, reg_c_preserve); +} + +u32 mlx5e_tc_int_port_get_metadata(struct mlx5e_tc_int_port *int_port) +{ + return int_port->match_metadata; +} + +int mlx5e_tc_int_port_get_flow_source(struct mlx5e_tc_int_port *int_port) +{ + /* For egress forwarding we can have the case + * where the packet came from a vport and redirected + * to int port or it came from the uplink, going + * via internal port and hairpinned back to uplink + * so we set the source to any port in this case. + */ + return int_port->type == MLX5E_TC_INT_PORT_EGRESS ? 
+ MLX5_FLOW_CONTEXT_FLOW_SOURCE_ANY_VPORT : + MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; +} + +u32 mlx5e_tc_int_port_get_metadata_for_match(struct mlx5e_tc_int_port *int_port) +{ + return int_port->match_metadata << (32 - ESW_SOURCE_PORT_METADATA_BITS); +} + +static int mlx5e_tc_int_port_rep_map_insert(struct mlx5_eswitch *esw, + struct mlx5e_tc_int_port *int_port) +{ + struct mlx5e_rep_priv *rpriv_uplink; + u32 metadata; + int err; + + if (!mlx5e_esw_offloads_pet_enabled(esw)) + return 0; + + rpriv_uplink = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + + metadata = mlx5e_tc_int_port_get_metadata_for_match(int_port); + err = xa_insert(&rpriv_uplink->vport_rep_map, + metadata, + rpriv_uplink->rep, GFP_KERNEL); + if (err) { + esw_warn(esw->dev, "Error %d inserting int_vport metadata\n", err); + goto err; + } + + return 0; + +err: + return err; +} + +static void mlx5e_tc_int_port_rep_map_remove(struct mlx5_eswitch *esw, + struct mlx5e_tc_int_port *int_port) +{ + struct mlx5e_rep_priv *rpriv_uplink; + u32 metadata; + + if (!mlx5e_esw_offloads_pet_enabled(esw)) + return; + + rpriv_uplink = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + metadata = mlx5e_tc_int_port_get_metadata_for_match(int_port); + xa_erase(&rpriv_uplink->vport_rep_map, metadata); +} + +static struct mlx5_flow_handle * +mlx5e_int_port_create_rx_rule(struct mlx5_eswitch *esw, + struct mlx5e_tc_int_port *int_port, + struct mlx5_flow_destination *dest) + +{ + struct mlx5_flow_context *flow_context; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + void *misc; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5e_tc_int_port_get_metadata_for_match(int_port)); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + + /* Overwrite flow tag with the int port metadata mapping + * instead of the chain mapping. 
+ */ + flow_context = &spec->flow_context; + flow_context->flags |= FLOW_CONTEXT_HAS_TAG; + flow_context->flow_tag = int_port->mapping; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec, + &flow_act, dest, 1); + if (IS_ERR(flow_rule)) + mlx5_core_warn(esw->dev, "ft offloads: Failed to add internal vport rx rule err %ld\n", + PTR_ERR(flow_rule)); + + kvfree(spec); + + return flow_rule; +} + +static struct mlx5e_tc_int_port * +mlx5e_int_port_lookup(struct mlx5e_tc_int_port_priv *priv, + int ifindex, + enum mlx5e_tc_int_port_type type) +{ + struct mlx5e_tc_int_port *int_port; + + if (!priv->ul_rep_rx_ready) + goto not_found; + + list_for_each_entry(int_port, &priv->int_ports, list) + if (int_port->ifindex == ifindex && int_port->type == type) { + refcount_inc(&int_port->refcnt); + return int_port; + } + +not_found: + return NULL; +} + +static int mlx5e_int_port_metadata_alloc(struct mlx5e_tc_int_port_priv *priv, + int ifindex, enum mlx5e_tc_int_port_type type, + u32 *id) +{ + u32 mapped_key[2] = {type, ifindex}; + int err; + + err = mapping_add(priv->metadata_mapping, mapped_key, id); + if (err) + return err; + + /* Fill upper 4 bits of PFNUM with reserved value */ + *id |= 0xf << ESW_VPORT_BITS; + + return 0; +} + +static void mlx5e_int_port_metadata_free(struct mlx5e_tc_int_port_priv *priv, + u32 id) +{ + id &= (1 << ESW_VPORT_BITS) - 1; + mapping_remove(priv->metadata_mapping, id); +} + +/* Must be called with priv->int_ports_lock held */ +static struct mlx5e_tc_int_port * +mlx5e_int_port_add(struct mlx5e_tc_int_port_priv *priv, + int ifindex, + enum mlx5e_tc_int_port_type type) +{ + struct mlx5_eswitch *esw = priv->dev->priv.eswitch; + struct mlx5_mapped_obj mapped_obj = {}; + struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5e_tc_int_port *int_port; + struct mlx5_flow_destination dest; + struct mapping_ctx *ctx; + u32 match_metadata; + u32 mapping; + int err; + + if (priv->num_ports == MLX5E_TC_MAX_INT_PORT_NUM) { + mlx5_core_dbg(priv->dev, "Cannot add a new int port, max supported %d", + MLX5E_TC_MAX_INT_PORT_NUM); + return ERR_PTR(-ENOSPC); + } + + int_port = kzalloc(sizeof(*int_port), GFP_KERNEL); + if (!int_port) + return ERR_PTR(-ENOMEM); + + err = mlx5e_int_port_metadata_alloc(priv, ifindex, type, &match_metadata); + if (err) { + mlx5_core_warn(esw->dev, "Cannot add a new internal port, metadata allocation failed for ifindex %d", + ifindex); + goto err_metadata; + } + + /* map metadata to reg_c0 object for miss handling */ + ctx = esw->offloads.reg_c0_obj_pool; + mapped_obj.type = MLX5_MAPPED_OBJ_INT_PORT_METADATA; + mapped_obj.int_port_metadata = match_metadata; + err = mapping_add(ctx, &mapped_obj, &mapping); + if (err) + goto err_map; + + int_port->type = type; + int_port->ifindex = ifindex; + int_port->match_metadata = match_metadata; + int_port->mapping = mapping; + + err = mlx5e_tc_int_port_rep_map_insert(esw, int_port); + if (err) + goto err_rep_map; + + /* Create a match on internal vport metadata in vport table */ + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = uplink_rpriv->root_ft; + + int_port->rx_rule = mlx5e_int_port_create_rx_rule(esw, int_port, &dest); + if (IS_ERR(int_port->rx_rule)) { + err = PTR_ERR(int_port->rx_rule); + mlx5_core_warn(esw->dev, "Can't add internal port rx rule, err %d", err); + goto err_rx_rule; + } + + refcount_set(&int_port->refcnt, 1); + list_add_rcu(&int_port->list, &priv->int_ports); + 
priv->num_ports++; + + return int_port; + +err_rx_rule: + mlx5e_tc_int_port_rep_map_remove(esw, int_port); + +err_rep_map: + mapping_remove(ctx, int_port->mapping); + +err_map: + mlx5e_int_port_metadata_free(priv, match_metadata); + +err_metadata: + kfree(int_port); + + return ERR_PTR(err); +} + +/* Must be called with priv->int_ports_lock held */ +static void +mlx5e_int_port_remove(struct mlx5e_tc_int_port_priv *priv, + struct mlx5e_tc_int_port *int_port) +{ + struct mlx5_eswitch *esw = priv->dev->priv.eswitch; + struct mapping_ctx *ctx; + + ctx = esw->offloads.reg_c0_obj_pool; + + list_del_rcu(&int_port->list); + mlx5e_tc_int_port_rep_map_remove(esw, int_port); + + /* The following parameters are not used by the + * rcu readers of this int_port object so it is + * safe to release them. + */ + if (int_port->rx_rule) + mlx5_del_flow_rules(int_port->rx_rule); + mapping_remove(ctx, int_port->mapping); + mlx5e_int_port_metadata_free(priv, int_port->match_metadata); + kfree_rcu(int_port); + priv->num_ports--; +} + +/* Must be called with rcu_read_lock held */ +static struct mlx5e_tc_int_port * +mlx5e_int_port_get_from_metadata(struct mlx5e_tc_int_port_priv *priv, + u32 metadata) +{ + struct mlx5e_tc_int_port *int_port; + + list_for_each_entry_rcu(int_port, &priv->int_ports, list) + if (int_port->match_metadata == metadata) + return int_port; + + return NULL; +} + +struct mlx5e_tc_int_port * +mlx5e_tc_int_port_get(struct mlx5e_tc_int_port_priv *priv, + int ifindex, + enum mlx5e_tc_int_port_type type) +{ + struct mlx5e_tc_int_port *int_port; + + if (!priv) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&priv->int_ports_lock); + + /* Reject request if ul rep not ready */ + if (!priv->ul_rep_rx_ready) { + int_port = ERR_PTR(-EOPNOTSUPP); + goto done; + } + + int_port = mlx5e_int_port_lookup(priv, ifindex, type); + if (int_port) + goto done; + + /* Alloc and add new int port to list */ + int_port = mlx5e_int_port_add(priv, ifindex, type); + +done: + mutex_unlock(&priv->int_ports_lock); + + return int_port; +} + +void +mlx5e_tc_int_port_put(struct mlx5e_tc_int_port_priv *priv, + struct mlx5e_tc_int_port *int_port) +{ + if (!refcount_dec_and_mutex_lock(&int_port->refcnt, &priv->int_ports_lock)) + return; + + mlx5e_int_port_remove(priv, int_port); + mutex_unlock(&priv->int_ports_lock); +} + +struct mlx5e_tc_int_port_priv * +mlx5e_tc_int_port_init(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_int_port_priv *int_port_priv; + u64 mapping_id; + + if (!mlx5e_tc_int_port_supported(esw)) + return NULL; + + int_port_priv = kzalloc(sizeof(*int_port_priv), GFP_KERNEL); + if (!int_port_priv) + return NULL; + + mapping_id = mlx5_query_nic_system_image_guid(priv->mdev); + + int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_INT_PORT, + sizeof(u32) * 2, + (1 << ESW_VPORT_BITS) - 1, true); + if (IS_ERR(int_port_priv->metadata_mapping)) { + mlx5_core_warn(priv->mdev, "Can't allocate metadata mapping of int port offload, err=%ld\n", + PTR_ERR(int_port_priv->metadata_mapping)); + goto err_mapping; + } + + int_port_priv->dev = priv->mdev; + mutex_init(&int_port_priv->int_ports_lock); + INIT_LIST_HEAD(&int_port_priv->int_ports); + + return int_port_priv; + +err_mapping: + kfree(int_port_priv); + + return NULL; +} + +void +mlx5e_tc_int_port_cleanup(struct mlx5e_tc_int_port_priv *priv) +{ + if (!priv) + return; + + mutex_destroy(&priv->int_ports_lock); + mapping_destroy(priv->metadata_mapping); + kfree(priv); +} + +/* Int port rx rules 
reside in ul rep rx tables. + * It is possible the ul rep will go down while there are + * still int port rules in its rx table so proper cleanup + * is required to free resources. + */ +void mlx5e_tc_int_port_init_rep_rx(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_tc_int_port_priv *ppriv; + struct mlx5e_rep_priv *uplink_rpriv; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + ppriv = uplink_priv->int_port_priv; + + if (!ppriv) + return; + + mutex_lock(&ppriv->int_ports_lock); + ppriv->ul_rep_rx_ready = true; + mutex_unlock(&ppriv->int_ports_lock); +} + +void mlx5e_tc_int_port_cleanup_rep_rx(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_tc_int_port_priv *ppriv; + struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5e_tc_int_port *int_port; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + ppriv = uplink_priv->int_port_priv; + + if (!ppriv) + return; + + mutex_lock(&ppriv->int_ports_lock); + + ppriv->ul_rep_rx_ready = false; + + list_for_each_entry(int_port, &ppriv->int_ports, list) { + if (!IS_ERR_OR_NULL(int_port->rx_rule)) + mlx5_del_flow_rules(int_port->rx_rule); + + int_port->rx_rule = NULL; + } + + mutex_unlock(&ppriv->int_ports_lock); +} + +bool +mlx5e_tc_int_port_dev_fwd(struct mlx5e_tc_int_port_priv *priv, + struct sk_buff *skb, u32 int_vport_metadata, + bool *forward_tx) +{ + enum mlx5e_tc_int_port_type fwd_type; + struct mlx5e_tc_int_port *int_port; + struct net_device *dev; + int ifindex; + + if (!priv) + return false; + + rcu_read_lock(); + int_port = mlx5e_int_port_get_from_metadata(priv, int_vport_metadata); + if (!int_port) { + rcu_read_unlock(); + mlx5_core_dbg(priv->dev, "Unable to find int port with metadata 0x%.8x\n", + int_vport_metadata); + return false; + } + + ifindex = int_port->ifindex; + fwd_type = int_port->type; + rcu_read_unlock(); + + dev = dev_get_by_index(&init_net, ifindex); + if (!dev) { + mlx5_core_dbg(priv->dev, + "Couldn't find internal port device with ifindex: %d\n", + ifindex); + return false; + } + + skb->skb_iif = dev->ifindex; + skb->dev = dev; + + if (fwd_type == MLX5E_TC_INT_PORT_INGRESS) { + skb->pkt_type = PACKET_HOST; + skb_set_redirected(skb, true); + *forward_tx = false; + } else { + skb_reset_network_header(skb); + skb_push_rcsum(skb, skb->mac_len); + skb_set_redirected(skb, false); + *forward_tx = true; + } + + return true; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.h new file mode 100644 index 0000000..e72c79d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/int_port.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_EN_TC_INT_PORT_H__ +#define __MLX5_EN_TC_INT_PORT_H__ + +#include "en.h" + +struct mlx5e_tc_int_port; +struct mlx5e_tc_int_port_priv; + +enum mlx5e_tc_int_port_type { + MLX5E_TC_INT_PORT_INGRESS, + MLX5E_TC_INT_PORT_EGRESS, +}; + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) +bool mlx5e_tc_int_port_supported(const struct mlx5_eswitch *esw); + +struct mlx5e_tc_int_port_priv * +mlx5e_tc_int_port_init(struct mlx5e_priv *priv); +void +mlx5e_tc_int_port_cleanup(struct mlx5e_tc_int_port_priv *priv); + +void mlx5e_tc_int_port_init_rep_rx(struct mlx5e_priv *priv); +void mlx5e_tc_int_port_cleanup_rep_rx(struct mlx5e_priv *priv); + +bool +mlx5e_tc_int_port_dev_fwd(struct mlx5e_tc_int_port_priv *priv, + struct sk_buff *skb, u32 int_vport_metadata, + bool *forward_tx); +struct mlx5e_tc_int_port * +mlx5e_tc_int_port_get(struct mlx5e_tc_int_port_priv *priv, + int ifindex, + enum mlx5e_tc_int_port_type type); +void +mlx5e_tc_int_port_put(struct mlx5e_tc_int_port_priv *priv, + struct mlx5e_tc_int_port *int_port); + +u32 mlx5e_tc_int_port_get_metadata(struct mlx5e_tc_int_port *int_port); +u32 mlx5e_tc_int_port_get_metadata_for_match(struct mlx5e_tc_int_port *int_port); +int mlx5e_tc_int_port_get_flow_source(struct mlx5e_tc_int_port *int_port); +#else /* CONFIG_MLX5_CLS_ACT */ +static inline u32 +mlx5e_tc_int_port_get_metadata_for_match(struct mlx5e_tc_int_port *int_port) +{ + return 0; +} + +static inline int +mlx5e_tc_int_port_get_flow_source(struct mlx5e_tc_int_port *int_port) +{ + return 0; +} + +static inline bool mlx5e_tc_int_port_supported(const struct mlx5_eswitch *esw) +{ + return false; +} + +static inline void mlx5e_tc_int_port_init_rep_rx(struct mlx5e_priv *priv) {} +static inline void mlx5e_tc_int_port_cleanup_rep_rx(struct mlx5e_priv *priv) {} + +#endif /* CONFIG_MLX5_CLS_ACT */ +#endif /* __MLX5_EN_TC_INT_PORT_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c new file mode 100644 index 0000000..4590926 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c @@ -0,0 +1,831 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
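
As an illustrative aside, the internal-port interface declared in en/tc/int_port.h above is reference-counted and keyed by (ifindex, type): the first mlx5e_tc_int_port_get() call for a pair allocates the reg_c_0 metadata mapping and the uplink rx rule, and mlx5e_tc_int_port_put() releases them when the last user goes away. A minimal, hypothetical caller sketch follows (the function name and surrounding context are assumptions; only helpers declared above are used):

/* Hypothetical caller sketch -- illustration only, not part of the patch.
 * Assumes int_port_priv was taken from the uplink rep private data
 * (uplink_priv->int_port_priv) and that the uplink rx tables are ready.
 */
#include "en/tc/int_port.h"

static int example_int_port_metadata(struct mlx5e_tc_int_port_priv *int_port_priv,
				     int ifindex, u32 *match_metadata,
				     struct mlx5e_tc_int_port **int_port_out)
{
	struct mlx5e_tc_int_port *int_port;

	/* Takes a reference; creates the metadata mapping and the uplink
	 * rx rule on first use of this (ifindex, type) pair.
	 */
	int_port = mlx5e_tc_int_port_get(int_port_priv, ifindex,
					 MLX5E_TC_INT_PORT_INGRESS);
	if (IS_ERR(int_port))
		return PTR_ERR(int_port);

	/* Metadata shifted into the reg_c_0 source-port field, usable as a
	 * match value or a source-port rewrite for the offloaded flow.
	 */
	*match_metadata = mlx5e_tc_int_port_get_metadata_for_match(int_port);
	*int_port_out = int_port;
	return 0;
}

/* Later, when the flow that used it is removed:
 *	mlx5e_tc_int_port_put(int_port_priv, int_port);
 */
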
+ +#include +#include "en/aso.h" +#include "meter.h" +#include "en/tc_priv.h" +#include "en/tc/post_act.h" + +#define START_COLOR_SHIFT 28 +#define METER_MODE_SHIFT 24 +#define CBS_EXP_SHIFT 24 +#define CBS_MAN_SHIFT 16 +#define CIR_EXP_SHIFT 8 + +/* cir = 8*(10^9)*cir_mantissa/(2^cir_exponent)) bits/s */ +#define CONST_CIR 8000000000ULL +#define CALC_CIR(m, e) ((CONST_CIR * (m)) >> (e)) +#define MAX_CIR ((CONST_CIR * 0x100) - 1) + +/* cbs = cbs_mantissa*2^cbs_exponent */ +#define CALC_CBS(m, e) ((m) << (e)) +#define MAX_CBS ((0x100ULL << 0x1F) - 1) +#define MAX_HW_CBS 0x7FFFFFFF + +struct mlx5e_flow_meters { + struct mlx5_core_dev *mdev; + enum mlx5_flow_namespace_type ns_type; + struct mlx5e_aso *aso; + int log_granularity; + + DECLARE_HASHTABLE(hashtbl, 8); + + struct mutex sync_lock; /* protect flow meter operations */ + struct list_head partial_list; + struct list_head full_list; + + struct mlx5e_post_act *post_act; +}; + +static void +mlx5e_flow_meter_cir_calc(u64 cir, u8 *man, u8 *exp) +{ + s64 _cir, _delta, delta = S64_MAX; + u8 e, _man = 0, _exp = 0; + u64 m; + + for (e = 0; e <= 0x1F; e++) { /* exp width 5bit */ + m = cir << e; + if ((s64)m < 0) /* overflow */ + break; + m = div64_u64(m, CONST_CIR); + if (m > 0xFF) /* man width 8 bit */ + continue; + _cir = CALC_CIR(m, e); + _delta = cir - _cir; + if (_delta < delta) { + _man = m; + _exp = e; + if (!_delta) + goto found; + delta = _delta; + } + } + +found: + *man = _man; + *exp = _exp; +} + +static void +mlx5e_flow_meter_cbs_calc(u64 cbs, u8 *man, u8 *exp) +{ + s64 _cbs, _delta, delta = S64_MAX; + u8 e, _man = 0, _exp = 0; + u64 m; + + for (e = 0; e <= 0x1F; e++) { /* exp width 5bit */ + m = cbs >> e; + if (m > 0xFF) /* man width 8 bit */ + continue; + _cbs = CALC_CBS(m, e); + _delta = cbs - _cbs; + if (_delta < delta) { + _man = m; + _exp = e; + if (!_delta) + goto found; + delta = _delta; + } + } + +found: + *man = _man; + *exp = _exp; +} + +int +mlx5e_flow_meter_send(struct mlx5_core_dev *mdev, + struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *meter_params) +{ + struct mlx5e_aso_ctrl_param param = {}; + struct mlx5_wqe_aso_data_seg *aso_data; + struct mlx5e_flow_meters *flow_meters; + u8 cir_man, cir_exp, cbs_man, cbs_exp; + struct mlx5e_aso_wqe_data *aso_wqe; + u16 pi, contig_wqebbs_room; + struct mlx5e_asosq *sq; + struct mlx5_wq_cyc *wq; + struct mlx5e_aso *aso; + u64 rate, burst; + int err = 0; + + flow_meters = meter->flow_meters; + aso = flow_meters->aso; + sq = &aso->sq; + wq = &sq->wq; + + rate = meter_params->rate; + burst = meter_params->burst; + /* HW treats each packet as 128 bytes in PPS mode */ + if (meter_params->mode == MLX5_RATE_LIMIT_PPS) { + rate <<= 10; + burst <<= 7; + } + + if (!rate || rate > MAX_CIR || !burst || burst > MAX_CBS) + return -EINVAL; + + /* HW has limitation of total 31 bits for cbs */ + if (burst > MAX_HW_CBS) { + mlx5_core_warn(mdev, + "burst(%lld) is too large, use HW allowed value(%d)\n", + burst, MAX_HW_CBS); + burst = MAX_HW_CBS; + } + + mlx5_core_dbg(mdev, "meter mode=%d\n", meter_params->mode); + mlx5e_flow_meter_cir_calc(rate, &cir_man, &cir_exp); + mlx5_core_dbg(mdev, "rate=%lld, cir=%lld, exp=%d, man=%d\n", + rate, CALC_CIR(cir_man, cir_exp), cir_exp, cir_man); + mlx5e_flow_meter_cbs_calc(burst, &cbs_man, &cbs_exp); + mlx5_core_dbg(mdev, "burst=%lld, cbs=%lld, exp=%d, man=%d\n", + burst, CALC_CBS((u64)cbs_man, cbs_exp), cbs_exp, cbs_man); + + if (!cir_man || !cbs_man) + return -EINVAL; + + mutex_lock(&aso->priv->aso_lock); + pi = mlx5_wq_cyc_ctr2ix(wq, 
sq->pc); + contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + + if (unlikely(contig_wqebbs_room < MLX5E_ASO_WQEBBS_DATA)) { + mlx5e_fill_asosq_frag_edge(sq, wq, pi, contig_wqebbs_room); + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + aso_wqe = mlx5_wq_cyc_get_wqe(wq, pi); + param.data_mask_mode = ASO_DATA_MASK_MODE_BYTEWISE_64BYTE; + param.condition_operand = LOGICAL_OR; + param.condition_0_operand = ALWAYS_TRUE; + param.condition_1_operand = ALWAYS_TRUE; + param.data_mask = 0x80FFFFFFULL << (meter->idx ? 0 : 32); + mlx5e_build_aso_wqe(aso, sq, + DIV_ROUND_UP(sizeof(*aso_wqe), MLX5_SEND_WQE_DS), + &aso_wqe->ctrl, &aso_wqe->aso_ctrl, meter->obj_id, + MLX5_ACCESS_ASO_OPC_MOD_FLOW_METER, ¶m); + + aso_data = &aso_wqe->aso_data; + memset(aso_data, 0, sizeof(*aso_data)); + aso_data->bytewise_data[meter->idx * 8] = cpu_to_be32((0x1 << 31) | /* valid */ + (MLX5_FLOW_METER_COLOR_GREEN << START_COLOR_SHIFT)); + if (meter_params->mode == MLX5_RATE_LIMIT_PPS) + aso_data->bytewise_data[meter->idx * 8] |= + cpu_to_be32(MLX5_FLOW_METER_MODE_NUM_PACKETS << METER_MODE_SHIFT); + else + aso_data->bytewise_data[meter->idx * 8] |= + cpu_to_be32(MLX5_FLOW_METER_MODE_BYTES_IP_LENGTH << METER_MODE_SHIFT); + + aso_data->bytewise_data[meter->idx * 8 + 2] = cpu_to_be32((cbs_exp << CBS_EXP_SHIFT) | + (cbs_man << CBS_MAN_SHIFT) | + (cir_exp << CIR_EXP_SHIFT) | + cir_man); + + sq->db.aso_wqe[pi].opcode = MLX5_OPCODE_ACCESS_ASO; + sq->db.aso_wqe[pi].with_data = true; + sq->pc += MLX5E_ASO_WQEBBS_DATA; + sq->doorbell_cseg = &aso_wqe->ctrl; + + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg); + + /* Ensure doorbell is written on uar_page before poll_cq */ + WRITE_ONCE(sq->doorbell_cseg, NULL); + + err = mlx5e_poll_aso_cq(&sq->cq); + mutex_unlock(&aso->priv->aso_lock); + + return err; +} + +static int +mlx5e_flow_meter_create_aso_obj(struct mlx5_core_dev *dev, + struct mlx5e_flow_meters *flow_meters, int *obj_id) +{ + u32 in[MLX5_ST_SZ_DW(create_flow_meter_aso_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + void *obj; + int err; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO); + MLX5_SET(general_obj_in_cmd_hdr, in, log_obj_range, flow_meters->log_granularity); + + obj = MLX5_ADDR_OF(create_flow_meter_aso_obj_in, in, flow_meter_aso_obj); + MLX5_SET(flow_meter_aso_obj, obj, meter_aso_access_pd, flow_meters->aso->pdn); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) { + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + mlx5_core_dbg(dev, "flow meter aso obj(0x%x) created\n", *obj_id); + } + + return err; +} + +static void +mlx5e_flow_meter_destroy_aso_obj(struct mlx5_core_dev *dev, u32 obj_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id); + + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_core_dbg(dev, "flow meter aso obj(0x%x) destroyed\n", obj_id); +} + +static struct mlx5e_flow_meter_handle * +__mlx5e_flow_meter_alloc(struct mlx5e_flow_meters *flow_meters) +{ + struct mlx5_core_dev *mdev = flow_meters->mdev; + struct mlx5e_flow_meter_aso_obj *meters_obj; + struct mlx5e_flow_meter_handle *meter; + struct 
mlx5_fc *counter; + int err, pos, total; + u32 id; + + meter = kzalloc(sizeof(*meter), GFP_KERNEL); + if (!meter) + return ERR_PTR(-ENOMEM); + + counter = mlx5_fc_create(mdev, true); + if (IS_ERR(counter)) { + err = PTR_ERR(counter); + goto err_red_counter; + } + meter->red_counter = counter; + + counter = mlx5_fc_create(mdev, true); + if (IS_ERR(counter)) { + err = PTR_ERR(counter); + goto err_green_counter; + } + meter->green_counter = counter; + + meters_obj = list_first_entry_or_null(&flow_meters->partial_list, + struct mlx5e_flow_meter_aso_obj, + entry); + /* 2 meters in one object */ + total = 1 << (flow_meters->log_granularity + 1); + if (!meters_obj) { + err = mlx5e_flow_meter_create_aso_obj(mdev, flow_meters, &id); + if (err) { + mlx5_core_err(mdev, "Failed to create flow meter ASO object\n"); + goto err_create; + } + + meters_obj = kzalloc(sizeof(*meters_obj) + BITS_TO_BYTES(total), + GFP_KERNEL); + if (!meters_obj) { + err = -ENOMEM; + goto err_mem; + } + + meters_obj->base_id = id; + meters_obj->total_meters = total; + list_add(&meters_obj->entry, &flow_meters->partial_list); + pos = 0; + } else { + pos = find_first_zero_bit(meters_obj->meters_map, total); + if (bitmap_weight(meters_obj->meters_map, total) == total - 1) { + list_del(&meters_obj->entry); + list_add(&meters_obj->entry, &flow_meters->full_list); + } + } + + bitmap_set(meters_obj->meters_map, pos, 1); + meter->flow_meters = flow_meters; + meter->meters_obj = meters_obj; + meter->obj_id = meters_obj->base_id + pos / 2; + meter->idx = pos % 2; + + mlx5_core_dbg(mdev, "flow meter allocated, obj_id=0x%x, index=%d\n", + meter->obj_id, meter->idx); + + return meter; + +err_mem: + mlx5e_flow_meter_destroy_aso_obj(mdev, id); +err_create: + mlx5_fc_destroy(mdev, meter->green_counter); +err_green_counter: + mlx5_fc_destroy(mdev, meter->red_counter); +err_red_counter: + kfree(meter); + return ERR_PTR(err); +} + +static void +__mlx5e_flow_meter_free(struct mlx5e_flow_meter_handle *meter) +{ + struct mlx5e_flow_meters *flow_meters = meter->flow_meters; + struct mlx5_core_dev *mdev = flow_meters->mdev; + struct mlx5e_flow_meter_aso_obj *meters_obj; + int n, pos; + + mlx5_fc_destroy(mdev, meter->green_counter); + mlx5_fc_destroy(mdev, meter->red_counter); + + meters_obj = meter->meters_obj; + pos = (meter->obj_id - meters_obj->base_id) * 2 + meter->idx; + bitmap_clear(meters_obj->meters_map, pos, 1); + n = bitmap_weight(meters_obj->meters_map, meters_obj->total_meters); + if (n == 0) { + list_del(&meters_obj->entry); + mlx5e_flow_meter_destroy_aso_obj(mdev, meters_obj->base_id); + kfree(meters_obj); + } else if (n == meters_obj->total_meters - 1) { + list_del(&meters_obj->entry); + list_add(&meters_obj->entry, &flow_meters->partial_list); + } + + mlx5_core_dbg(mdev, "flow meter freed, obj_id=0x%x, index=%d\n", + meter->obj_id, meter->idx); + kfree(meter); +} + +static struct mlx5e_flow_meter_handle * +__mlx5e_tc_meter_get(struct mlx5e_flow_meters *flow_meters, u32 index) +{ + struct mlx5e_flow_meter_handle *meter; + + hash_for_each_possible(flow_meters->hashtbl, meter, hlist, index) + if (meter->params.index == index) + goto add_ref; + + return ERR_PTR(-ENOENT); + +add_ref: + meter->refcnt++; + + return meter; +} + +struct mlx5e_flow_meter_handle * +mlx5e_tc_meter_get(struct mlx5_core_dev *mdev, struct mlx5e_flow_meter_params *params) +{ + struct mlx5e_flow_meters *flow_meters; + struct mlx5e_flow_meter_handle *meter; + + flow_meters = mlx5e_get_flow_meters(mdev); + if (!flow_meters) + return ERR_PTR(-EOPNOTSUPP); + + meter = 
__mlx5e_tc_meter_get(flow_meters, params->index); + mutex_unlock(&flow_meters->sync_lock); + + return meter; +} + +static void +__mlx5e_tc_meter_put(struct mlx5e_flow_meter_handle *meter) +{ + if (--meter->refcnt == 0) { + hash_del(&meter->hlist); + __mlx5e_flow_meter_free(meter); + } +} + +void +mlx5e_tc_meter_put(struct mlx5e_flow_meter_handle *meter) +{ + struct mlx5e_flow_meters *flow_meters = meter->flow_meters; + + mutex_lock(&flow_meters->sync_lock); + __mlx5e_tc_meter_put(meter); + mutex_unlock(&flow_meters->sync_lock); +} + +static struct mlx5e_flow_meter_handle * +mlx5e_tc_meter_alloc(struct mlx5e_flow_meters *flow_meters, + struct mlx5e_flow_meter_params *params) +{ + struct mlx5e_flow_meter_handle *meter; + + meter = __mlx5e_flow_meter_alloc(flow_meters); + if (IS_ERR(meter)) + return meter; + + hash_add(flow_meters->hashtbl, &meter->hlist, params->index); + meter->params.index = params->index; + meter->refcnt++; + + return meter; +} + +static int +__mlx5e_tc_meter_update(struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *params) +{ + struct mlx5_core_dev *mdev = meter->flow_meters->mdev; + int err = 0; + + if (meter->params.mode != params->mode || meter->params.rate != params->rate || + meter->params.burst != params->burst) { + err = mlx5e_flow_meter_send(mdev, meter, params); + if (err) + goto out; + + meter->params.mode = params->mode; + meter->params.rate = params->rate; + meter->params.burst = params->burst; + } + +out: + return err; +} + +int +mlx5e_tc_meter_update(struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *params) +{ + struct mlx5_core_dev *mdev = meter->flow_meters->mdev; + struct mlx5e_flow_meters *flow_meters; + int err; + + flow_meters = mlx5e_get_flow_meters(mdev); + if (!flow_meters) + return -EOPNOTSUPP; + + mutex_lock(&flow_meters->sync_lock); + err = __mlx5e_tc_meter_update(meter, params); + mutex_unlock(&flow_meters->sync_lock); + return err; +} + +struct mlx5e_flow_meter_handle * +mlx5e_tc_meter_replace(struct mlx5_core_dev *mdev, struct mlx5e_flow_meter_params *params) +{ + struct mlx5e_flow_meters *flow_meters; + struct mlx5e_flow_meter_handle *meter; + int err; + + flow_meters = mlx5e_get_flow_meters(mdev); + if (!flow_meters) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&flow_meters->sync_lock); + meter = __mlx5e_tc_meter_get(flow_meters, params->index); + if (IS_ERR(meter)) { + meter = mlx5e_tc_meter_alloc(flow_meters, params); + if (IS_ERR(meter)) { + err = PTR_ERR(meter); + goto err_get; + } + } + + err = __mlx5e_tc_meter_update(meter, params); + if (err) + goto err_update; + + mutex_unlock(&flow_meters->sync_lock); + return meter; + +err_update: + __mlx5e_tc_meter_put(meter); +err_get: + mutex_unlock(&flow_meters->sync_lock); + return ERR_PTR(err); +} + +enum mlx5_flow_namespace_type +mlx5e_tc_meter_get_namespace(struct mlx5e_flow_meters *flow_meters) +{ + return flow_meters->ns_type; +} + +struct mlx5e_flow_meters * +mlx5e_flow_meters_init(struct mlx5e_priv *priv, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_act) +{ + struct mlx5e_flow_meters *flow_meters; + + if (!(MLX5_CAP_GEN_64(priv->mdev, general_obj_types) & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO)) + return NULL; + + flow_meters = kzalloc(sizeof(*flow_meters), GFP_KERNEL); + if (!flow_meters) + return NULL; + + if (IS_ERR_OR_NULL(post_act)) { + netdev_dbg(priv->netdev, + "flow meter offload is not supported, post action is missing\n"); + goto errout; + } + + flow_meters->aso = mlx5e_aso_get(priv); + if 
(!flow_meters->aso) { + mlx5_core_warn(priv->mdev, "Failed to create aso wqe for flow meter\n"); + goto errout; + } + + flow_meters->ns_type = ns_type; + flow_meters->mdev = priv->mdev; + flow_meters->post_act = post_act; + flow_meters->log_granularity = min_t(int, 6, + MLX5_CAP_QOS(priv->mdev, log_meter_aso_max_alloc)); + mutex_init(&flow_meters->sync_lock); + INIT_LIST_HEAD(&flow_meters->partial_list); + INIT_LIST_HEAD(&flow_meters->full_list); + + return flow_meters; + +errout: + kfree(flow_meters); + return NULL; +} + +void +mlx5e_flow_meters_cleanup(struct mlx5e_flow_meters *flow_meters) +{ + if (!flow_meters) + return; + + mlx5e_aso_put(flow_meters->aso->priv); + kfree(flow_meters); +} + +int +mlx5e_aso_send_flow_meter_aso(struct mlx5_core_dev *mdev, + struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *meter_params) +{ + struct mlx5e_aso_ctrl_param param = {}; + struct mlx5_wqe_aso_data_seg *aso_data; + struct mlx5e_flow_meters *flow_meters; + u8 cir_man, cir_exp, cbs_man, cbs_exp; + struct mlx5e_aso_wqe_data *aso_wqe; + u16 pi, contig_wqebbs_room; + struct mlx5e_asosq *sq; + struct mlx5_wq_cyc *wq; + struct mlx5e_aso *aso; + u64 rate, burst; + int err = 0; + + flow_meters = meter->flow_meters; + aso = flow_meters->aso; + sq = &aso->sq; + wq = &sq->wq; + + rate = meter_params->rate; + burst = meter_params->burst; + /* HW treats each packet as 128 bytes in PPS mode */ + if (meter_params->mode == MLX5_RATE_LIMIT_PPS) { + rate <<= 10; + burst <<= 7; + } + + if (!rate || rate > MAX_CIR || !burst || burst > MAX_CBS) + return -EINVAL; + + /* HW has limitation of total 31 bits for cbs */ + if (burst > MAX_HW_CBS) { + mlx5_core_warn(mdev, + "burst(%lld) is too large, use HW allowed value(%d)\n", + burst, MAX_HW_CBS); + burst = MAX_HW_CBS; + } + + mlx5_core_dbg(mdev, "meter mode=%d\n", meter_params->mode); + mlx5e_flow_meter_cir_calc(rate, &cir_man, &cir_exp); + mlx5_core_dbg(mdev, "rate=%lld, cir=%lld, exp=%d, man=%d\n", + rate, CALC_CIR(cir_man, cir_exp), cir_exp, cir_man); + mlx5e_flow_meter_cbs_calc(burst, &cbs_man, &cbs_exp); + mlx5_core_dbg(mdev, "burst=%lld, cbs=%lld, exp=%d, man=%d\n", + burst, CALC_CBS((u64)cbs_man, cbs_exp), cbs_exp, cbs_man); + + if (!cir_man || !cbs_man) + return -EINVAL; + + mutex_lock(&aso->priv->aso_lock); + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + + if (unlikely(contig_wqebbs_room < MLX5E_ASO_WQEBBS_DATA)) { + mlx5e_fill_asosq_frag_edge(sq, wq, pi, contig_wqebbs_room); + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + aso_wqe = mlx5_wq_cyc_get_wqe(wq, pi); + param.data_mask_mode = ASO_DATA_MASK_MODE_BYTEWISE_64BYTE; + param.condition_operand = LOGICAL_OR; + param.condition_0_operand = ALWAYS_TRUE; + param.condition_1_operand = ALWAYS_TRUE; + param.data_mask = 0x80FFFFFFULL << (meter->idx ? 
0 : 32); + mlx5e_build_aso_wqe(aso, sq, + DIV_ROUND_UP(sizeof(*aso_wqe), MLX5_SEND_WQE_DS), + &aso_wqe->ctrl, &aso_wqe->aso_ctrl, meter->obj_id, + MLX5_ACCESS_ASO_OPC_MOD_FLOW_METER, ¶m); + + aso_data = &aso_wqe->aso_data; + memset(aso_data, 0, sizeof(*aso_data)); + aso_data->bytewise_data[meter->idx * 8] = cpu_to_be32((0x1 << 31) | /* valid */ + (MLX5_FLOW_METER_COLOR_GREEN << START_COLOR_SHIFT)); + if (meter_params->mode == MLX5_RATE_LIMIT_PPS) + aso_data->bytewise_data[meter->idx * 8] |= + cpu_to_be32(MLX5_FLOW_METER_MODE_NUM_PACKETS << METER_MODE_SHIFT); + else + aso_data->bytewise_data[meter->idx * 8] |= + cpu_to_be32(MLX5_FLOW_METER_MODE_BYTES_IP_LENGTH << METER_MODE_SHIFT); + + aso_data->bytewise_data[meter->idx * 8 + 2] = cpu_to_be32((cbs_exp << CBS_EXP_SHIFT) | + (cbs_man << CBS_MAN_SHIFT) | + (cir_exp << CIR_EXP_SHIFT) | + cir_man); + + sq->db.aso_wqe[pi].opcode = MLX5_OPCODE_ACCESS_ASO; + sq->db.aso_wqe[pi].with_data = true; + sq->pc += MLX5E_ASO_WQEBBS_DATA; + sq->doorbell_cseg = &aso_wqe->ctrl; + + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg); + + /* Ensure doorbell is written on uar_page before poll_cq */ + WRITE_ONCE(sq->doorbell_cseg, NULL); + + err = mlx5e_poll_aso_cq(&sq->cq); + mutex_unlock(&aso->priv->aso_lock); + + return err; +} + +static int +mlx5e_create_flow_meter_aso_obj(struct mlx5_core_dev *dev, + struct mlx5e_flow_meters *flow_meters, int *obj_id) +{ + u32 in[MLX5_ST_SZ_DW(create_flow_meter_aso_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + void *obj; + int err; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO); + MLX5_SET(general_obj_in_cmd_hdr, in, log_obj_range, flow_meters->log_granularity); + + obj = MLX5_ADDR_OF(create_flow_meter_aso_obj_in, in, flow_meter_aso_obj); + MLX5_SET(flow_meter_aso_obj, obj, meter_aso_access_pd, flow_meters->aso->pdn); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) { + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + mlx5_core_dbg(dev, "flow meter aso obj(0x%x) created\n", *obj_id); + } + + return err; +} + +static void +mlx5e_destroy_flow_meter_aso_obj(struct mlx5_core_dev *dev, u32 obj_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id); + + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_core_dbg(dev, "flow meter aso obj(0x%x) destroyed\n", obj_id); +} + +static struct mlx5e_flow_meter_handle * +__mlx5e_alloc_flow_meter(struct mlx5_core_dev *dev, + struct mlx5e_flow_meters *flow_meters) +{ + struct mlx5e_flow_meter_aso_obj *meters_obj; + struct mlx5e_flow_meter_handle *meter; + int err, pos, total; + u32 id; + + meter = kzalloc(sizeof(*meter), GFP_KERNEL); + if (!meter) + return ERR_PTR(-ENOMEM); + + meters_obj = list_first_entry_or_null(&flow_meters->partial_list, + struct mlx5e_flow_meter_aso_obj, + entry); + /* 2 meters in one object */ + total = 1 << (flow_meters->log_granularity + 1); + if (!meters_obj) { + err = mlx5e_create_flow_meter_aso_obj(dev, flow_meters, &id); + if (err) { + mlx5_core_err(dev, "Failed to create flow meter ASO object\n"); + goto err_create; + } + + meters_obj = kzalloc(sizeof(*meters_obj) + 
BITS_TO_BYTES(total), + GFP_KERNEL); + if (!meters_obj) { + err = -ENOMEM; + goto err_mem; + } + + meters_obj->base_id = id; + meters_obj->total_meters = total; + list_add(&meters_obj->entry, &flow_meters->partial_list); + pos = 0; + } else { + pos = find_first_zero_bit(meters_obj->meters_map, total); + if (bitmap_weight(meters_obj->meters_map, total) == total - 1) { + list_del(&meters_obj->entry); + list_add(&meters_obj->entry, &flow_meters->full_list); + } + } + + bitmap_set(meters_obj->meters_map, pos, 1); + meter->flow_meters = flow_meters; + meter->meters_obj = meters_obj; + meter->obj_id = meters_obj->base_id + pos / 2; + meter->idx = pos % 2; + + mlx5_core_dbg(dev, "flow meter allocated, obj_id=0x%x, index=%d\n", + meter->obj_id, meter->idx); + + return meter; + +err_mem: + mlx5e_destroy_flow_meter_aso_obj(dev, id); +err_create: + kfree(meter); + return ERR_PTR(err); +} + +static void +__mlx5e_free_flow_meter(struct mlx5_core_dev *dev, + struct mlx5e_flow_meters *flow_meters, + struct mlx5e_flow_meter_handle *meter) +{ + struct mlx5e_flow_meter_aso_obj *meters_obj; + int n, pos; + + meters_obj = meter->meters_obj; + pos = (meter->obj_id - meters_obj->base_id) * 2 + meter->idx; + bitmap_clear(meters_obj->meters_map, pos, 1); + n = bitmap_weight(meters_obj->meters_map, meters_obj->total_meters); + if (n == 0) { + list_del(&meters_obj->entry); + mlx5e_destroy_flow_meter_aso_obj(dev, meters_obj->base_id); + kfree(meters_obj); + } else if (n == meters_obj->total_meters - 1) { + list_del(&meters_obj->entry); + list_add(&meters_obj->entry, &flow_meters->partial_list); + } + + mlx5_core_dbg(dev, "flow meter freed, obj_id=0x%x, index=%d\n", + meter->obj_id, meter->idx); + kfree(meter); +} + +struct mlx5e_flow_meter_handle * +mlx5e_alloc_flow_meter(struct mlx5_core_dev *dev) +{ + struct mlx5e_flow_meters *flow_meters; + struct mlx5e_flow_meter_handle *meter; + + flow_meters = mlx5e_get_flow_meters(dev); + if (!flow_meters) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&flow_meters->sync_lock); + meter = __mlx5e_alloc_flow_meter(dev, flow_meters); + mutex_unlock(&flow_meters->sync_lock); + + return meter; +} + +void +mlx5e_free_flow_meter(struct mlx5_core_dev *dev, struct mlx5e_flow_meter_handle *meter) +{ + struct mlx5e_flow_meters *flow_meters; + + flow_meters = meter->flow_meters; + mutex_lock(&flow_meters->sync_lock); + __mlx5e_free_flow_meter(dev, flow_meters, meter); + mutex_unlock(&flow_meters->sync_lock); +} + +void +mlx5e_tc_meter_get_stats(struct mlx5e_flow_meter_handle *meter, + u64 *bytes, u64 *packets, u64 *drops, u64 *lastuse) +{ + u64 bytes1, packets1, lastuse1; + u64 bytes2, packets2, lastuse2; + + mlx5_fc_query_cached(meter->green_counter, &bytes1, &packets1, &lastuse1); + mlx5_fc_query_cached(meter->red_counter, &bytes2, &packets2, &lastuse2); + + *bytes = bytes1 + bytes2; + *packets = packets1 + packets2; + *drops = packets2; + *lastuse = max_t(u64, lastuse1, lastuse2); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.h new file mode 100644 index 0000000..108b5ae --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_EN_FLOW_METER_H__ +#define __MLX5_EN_FLOW_METER_H__ + +struct mlx5e_post_meter_priv; +struct mlx5e_flow_meters; +struct mlx5_flow_attr; + +struct mlx5e_flow_meter_aso_obj { + struct list_head entry; + int base_id; + int total_meters; + + unsigned long meters_map[0]; /* must be at the end of this struct */ +}; + +struct mlx5e_flow_meter_params { + enum mlx5e_flow_meter_mode mode; + /* police action index */ + u32 index; + u64 rate; + u64 burst; +}; + +struct mlx5e_flow_meter_handle { + struct mlx5e_flow_meters *flow_meters; + struct mlx5e_flow_meter_aso_obj *meters_obj; + u32 obj_id; + u8 idx; + + int refcnt; + struct hlist_node hlist; + struct mlx5e_flow_meter_params params; + + struct mlx5_fc *green_counter; + struct mlx5_fc *red_counter; +}; + +struct mlx5e_meter_attr { + struct mlx5e_flow_meter_params params; + struct mlx5e_flow_meter_handle *meter; + struct mlx5e_post_meter_priv *post_meter; +}; + +int mlx5e_aso_send_flow_meter_aso(struct mlx5_core_dev *mdev, + struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *meter_params); + +struct mlx5e_flow_meter_handle *mlx5e_alloc_flow_meter(struct mlx5_core_dev *dev); +void mlx5e_free_flow_meter(struct mlx5_core_dev *dev, + struct mlx5e_flow_meter_handle *meter); + +int +mlx5e_flow_meter_send(struct mlx5_core_dev *mdev, + struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *meter_params); + +struct mlx5e_flow_meter_handle * +mlx5e_tc_meter_get(struct mlx5_core_dev *mdev, struct mlx5e_flow_meter_params *params); +void +mlx5e_tc_meter_put(struct mlx5e_flow_meter_handle *meter); +int +mlx5e_tc_meter_update(struct mlx5e_flow_meter_handle *meter, + struct mlx5e_flow_meter_params *params); +struct mlx5e_flow_meter_handle * +mlx5e_tc_meter_replace(struct mlx5_core_dev *mdev, struct mlx5e_flow_meter_params *params); + +struct mlx5_flow_table * +mlx5e_tc_meter_get_post_meter_ft(struct mlx5e_flow_meters *flow_meters); + +enum mlx5_flow_namespace_type +mlx5e_tc_meter_get_namespace(struct mlx5e_flow_meters *flow_meters); + +struct mlx5e_flow_meters * +mlx5e_flow_meters_init(struct mlx5e_priv *priv, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_action); +void +mlx5e_flow_meters_cleanup(struct mlx5e_flow_meters *flow_meters); + +void +mlx5e_tc_meter_get_stats(struct mlx5e_flow_meter_handle *meter, + u64 *bytes, u64 *packets, u64 *drops, u64 *lastuse); + +#endif /* __MLX5_EN_FLOW_METER_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c new file mode 100644 index 0000000..d53f1e5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
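The struct mlx5e_flow_meter_aso_obj declaration above ends in a zero-length meters_map[] so the bookkeeping fields and the runtime-sized bitmap come from a single kzalloc(sizeof(*obj) + BITS_TO_BYTES(total)) in the allocator earlier. A small standalone sketch of the same trailing-bitmap idiom (illustrative only; uses the flexible-array form and libc allocation):

#include <stdio.h>
#include <stdlib.h>

struct model_meters_obj {
    unsigned int base_id;
    unsigned int total_meters;
    unsigned char meters_map[];      /* flexible array member, one bit per meter */
};

static struct model_meters_obj *model_meters_obj_alloc(unsigned int base_id,
                                                       unsigned int total)
{
    size_t map_bytes = (total + 7) / 8;            /* BITS_TO_BYTES(total) */
    struct model_meters_obj *obj = calloc(1, sizeof(*obj) + map_bytes);

    if (!obj)
        return NULL;
    obj->base_id = base_id;
    obj->total_meters = total;
    return obj;
}

int main(void)
{
    struct model_meters_obj *obj = model_meters_obj_alloc(0x10, 2);

    if (obj) {
        printf("obj base=0x%x meters=%u\n", obj->base_id, obj->total_meters);
        free(obj);
    }
    return 0;
}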
+ +#include "en/tc_priv.h" +#include "en_tc.h" +#include "post_act.h" +#include "mlx5_core.h" +#include "fs_core.h" + +struct mlx5e_post_act { + enum mlx5_flow_namespace_type ns_type; + struct mlx5_fs_chains *chains; + struct mlx5_flow_table *ft; + struct mlx5e_priv *priv; + struct xarray ids; +}; + +struct mlx5e_post_act_handle { + enum mlx5_flow_namespace_type ns_type; + struct mlx5_flow_attr *attr; + struct mlx5_flow_handle *rule; + u32 id; +}; + +#define MLX5_POST_ACTION_BITS MLX5_REG_MAPPING_MBITS(FTEID_TO_REG) +#define MLX5_POST_ACTION_MASK MLX5_REG_MAPPING_MASK(FTEID_TO_REG) +#define MLX5_POST_ACTION_MAX MLX5_POST_ACTION_MASK + +struct mlx5e_post_act * +mlx5e_tc_post_act_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, + enum mlx5_flow_namespace_type ns_type) +{ + enum fs_flow_table_type table_type = ns_type == MLX5_FLOW_NAMESPACE_FDB ? + FS_FT_FDB : FS_FT_NIC_RX; + struct mlx5e_post_act *post_act; + int err; + + if (!MLX5_CAP_FLOWTABLE_TYPE(priv->mdev, ignore_flow_level, table_type)) { + if (priv->mdev->coredev_type != MLX5_COREDEV_VF) + mlx5_core_warn(priv->mdev, "firmware level support is missing\n"); + err = -EOPNOTSUPP; + goto err_check; + } + + post_act = kzalloc(sizeof(*post_act), GFP_KERNEL); + if (!post_act) { + err = -ENOMEM; + goto err_check; + } + post_act->ft = mlx5_chains_create_global_table(chains); + if (IS_ERR(post_act->ft)) { + err = PTR_ERR(post_act->ft); + mlx5_core_warn(priv->mdev, "failed to create post action table, err: %d\n", err); + goto err_ft; + } + post_act->chains = chains; + post_act->ns_type = ns_type; + post_act->priv = priv; + xa_init_flags(&post_act->ids, XA_FLAGS_ALLOC1); + return post_act; + +err_ft: + kfree(post_act); +err_check: + return ERR_PTR(err); +} + +void +mlx5e_tc_post_act_destroy(struct mlx5e_post_act *post_act) +{ + if (IS_ERR_OR_NULL(post_act)) + return; + + xa_destroy(&post_act->ids); + mlx5_chains_destroy_global_table(post_act->chains, post_act->ft); + kfree(post_act); +} + +int +mlx5e_tc_post_act_offload(struct mlx5e_post_act *post_act, + struct mlx5e_post_act_handle *handle) +{ + struct mlx5_flow_spec *spec; + int err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + /* Post action rule matches on fte_id and executes original rule's tc rule action */ + mlx5e_tc_match_to_reg_match(spec, FTEID_TO_REG, handle->id, MLX5_POST_ACTION_MASK); + + handle->rule = mlx5e_tc_rule_offload(post_act->priv, spec, handle->attr); + if (IS_ERR(handle->rule)) { + err = PTR_ERR(handle->rule); + netdev_warn(post_act->priv->netdev, "Failed to add post action rule"); + goto err_rule; + } + + kvfree(spec); + return 0; + +err_rule: + kvfree(spec); + return err; +} + +struct mlx5e_post_act_handle * +mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *attr) +{ + u32 attr_sz = ns_to_attr_sz(post_act->ns_type); + struct mlx5e_post_act_handle *handle; + struct mlx5_flow_attr *post_attr; + int err; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + post_attr = mlx5_alloc_flow_attr(post_act->ns_type); + if (!handle || !post_attr) { + kfree(post_attr); + kfree(handle); + return ERR_PTR(-ENOMEM); + } + + memcpy(post_attr, attr, attr_sz); + post_attr->chain = 0; + post_attr->prio = 0; + post_attr->ft = post_act->ft; + post_attr->inner_match_level = MLX5_MATCH_NONE; + post_attr->outer_match_level = MLX5_MATCH_NONE; + post_attr->action &= ~MLX5_FLOW_CONTEXT_ACTION_DECAP; + post_attr->flags |= MLX5_ATTR_FLAG_NO_IN_PORT; + + handle->ns_type = post_act->ns_type; + /* Splits were handled before post 
action */ + if (handle->ns_type == MLX5_FLOW_NAMESPACE_FDB) + post_attr->esw_attr->split_count = 0; + + err = xa_alloc(&post_act->ids, &handle->id, post_attr, + XA_LIMIT(1, MLX5_POST_ACTION_MAX), GFP_KERNEL); + if (err) + goto err_xarray; + + handle->attr = post_attr; + + return handle; + +err_xarray: + kfree(post_attr); + kfree(handle); + return ERR_PTR(err); +} + +void +mlx5e_tc_post_act_unoffload(struct mlx5e_post_act *post_act, + struct mlx5e_post_act_handle *handle) +{ + mlx5e_tc_rule_unoffload(post_act->priv, handle->rule, handle->attr); + handle->rule = NULL; +} + +void +mlx5e_tc_post_act_del(struct mlx5e_post_act *post_act, struct mlx5e_post_act_handle *handle) +{ + if (!IS_ERR_OR_NULL(handle->rule)) + mlx5e_tc_post_act_unoffload(post_act, handle); + xa_erase(&post_act->ids, handle->id); + kfree(handle->attr); + kfree(handle); +} + +struct mlx5_flow_table * +mlx5e_tc_post_act_get_ft(struct mlx5e_post_act *post_act) +{ + return post_act->ft; +} + +/* Allocate a header modify action to write the post action handle fte id to a register. */ +int +mlx5e_tc_post_act_set_handle(struct mlx5_core_dev *dev, + struct mlx5e_post_act_handle *handle, + struct mlx5e_tc_mod_hdr_acts *acts) +{ + return mlx5e_tc_match_to_reg_set(dev, acts, handle->ns_type, FTEID_TO_REG, handle->id); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h new file mode 100644 index 0000000..f476774 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_act.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_POST_ACTION_H__ +#define __MLX5_POST_ACTION_H__ + +#include "en.h" +#include "lib/fs_chains.h" + +struct mlx5_flow_attr; +struct mlx5e_priv; +struct mlx5e_tc_mod_hdr_acts; + +struct mlx5e_post_act * +mlx5e_tc_post_act_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, + enum mlx5_flow_namespace_type ns_type); + +void +mlx5e_tc_post_act_destroy(struct mlx5e_post_act *post_act); + +struct mlx5e_post_act_handle * +mlx5e_tc_post_act_add(struct mlx5e_post_act *post_act, struct mlx5_flow_attr *attr); + +void +mlx5e_tc_post_act_del(struct mlx5e_post_act *post_act, struct mlx5e_post_act_handle *handle); + +int +mlx5e_tc_post_act_offload(struct mlx5e_post_act *post_act, + struct mlx5e_post_act_handle *handle); + +void +mlx5e_tc_post_act_unoffload(struct mlx5e_post_act *post_act, + struct mlx5e_post_act_handle *handle); + +struct mlx5_flow_table * +mlx5e_tc_post_act_get_ft(struct mlx5e_post_act *post_act); + +int +mlx5e_tc_post_act_set_handle(struct mlx5_core_dev *dev, + struct mlx5e_post_act_handle *handle, + struct mlx5e_tc_mod_hdr_acts *acts); + +#endif /* __MLX5_POST_ACTION_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c new file mode 100644 index 0000000..97c968d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
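To recap the post-action plumbing above: xa_alloc() hands each handle an fte_id in the range [1, MLX5_POST_ACTION_MAX], the original rule writes that id into a register via mlx5e_tc_post_act_set_handle(), and the rule installed in the post-action table matches on the same id to recover the stored attribute. A minimal sketch of that id-to-attribute indirection (userspace model; a flat table stands in for the xarray and the model_ names are invented):

#include <stddef.h>
#include <stdio.h>

#define MODEL_MAX_IDS 16                 /* stands in for MLX5_POST_ACTION_MAX */

/* id 0 is reserved as "no post action", mirroring XA_LIMIT(1, ...) above */
static void *id_table[MODEL_MAX_IDS + 1];

static int model_id_alloc(void *attr, unsigned int *id)
{
    for (unsigned int i = 1; i <= MODEL_MAX_IDS; i++) {
        if (!id_table[i]) {
            id_table[i] = attr;
            *id = i;                     /* value written into the register */
            return 0;
        }
    }
    return -1;                           /* id space exhausted */
}

static void *model_id_lookup(unsigned int id)   /* what the post-act match recovers */
{
    return (id >= 1 && id <= MODEL_MAX_IDS) ? id_table[id] : NULL;
}

static void model_id_erase(unsigned int id)
{
    if (id >= 1 && id <= MODEL_MAX_IDS)
        id_table[id] = NULL;
}

int main(void)
{
    int dummy_attr = 42;
    unsigned int id;

    if (!model_id_alloc(&dummy_attr, &id))
        printf("fte_id=%u -> attr=%p\n", id, model_id_lookup(id));
    model_id_erase(id);
    return 0;
}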
+ +#include "en/tc_priv.h" +#include "post_meter.h" +#include "en/tc/post_act.h" + +#define MLX5_PACKET_COLOR_BITS MLX5_REG_MAPPING_MBITS(PACKET_COLOR_TO_REG) +#define MLX5_PACKET_COLOR_MASK MLX5_REG_MAPPING_MASK(PACKET_COLOR_TO_REG) + +struct mlx5e_post_meter_priv { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct mlx5_flow_handle *fwd_green_rule; + struct mlx5_flow_handle *drop_red_rule; +}; + +struct mlx5_flow_table * +mlx5e_post_meter_get_ft(struct mlx5e_post_meter_priv *post_meter) +{ + return post_meter->ft; +} + +static int +mlx5e_post_meter_table_create(struct mlx5e_priv *priv, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_meter_priv *post_meter) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *root_ns; + + root_ns = mlx5_get_flow_namespace(priv->mdev, ns_type); + if (!root_ns) { + mlx5_core_warn(priv->mdev, "Failed to get namespace for flow meter\n"); + return -EOPNOTSUPP; + } + + ft_attr.flags = MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.prio = FDB_SLOW_PATH; + ft_attr.max_fte = 2; + ft_attr.level = 1; + + post_meter->ft = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(post_meter->ft)) { + mlx5_core_warn(priv->mdev, "Failed to create post_meter table\n"); + return PTR_ERR(post_meter->ft); + } + + return 0; +} + +static int +mlx5e_post_meter_fg_create(struct mlx5e_priv *priv, + struct mlx5e_post_meter_priv *post_meter) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *misc2, *match_criteria; + u32 *flow_group_in; + int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + misc2 = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, MLX5_PACKET_COLOR_MASK); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + post_meter->fg = mlx5_create_flow_group(post_meter->ft, flow_group_in); + if (IS_ERR(post_meter->fg)) { + mlx5_core_warn(priv->mdev, "Failed to create post_meter flow group\n"); + err = PTR_ERR(post_meter->fg); + } + + kvfree(flow_group_in); + return err; +} + +static int +mlx5e_post_meter_rules_create(struct mlx5e_priv *priv, + struct mlx5e_post_meter_priv *post_meter, + struct mlx5e_post_act *post_act, + struct mlx5_fc *green_counter, + struct mlx5_fc *red_counter) +{ + struct mlx5_flow_destination dest[2] = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + mlx5e_tc_match_to_reg_match(spec, PACKET_COLOR_TO_REG, + MLX5_FLOW_METER_COLOR_RED, MLX5_PACKET_COLOR_MASK); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[0].counter_id = mlx5_fc_id(red_counter); + + rule = mlx5_add_flow_rules(post_meter->ft, spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + mlx5_core_warn(priv->mdev, "Failed to create post_meter flow drop rule\n"); + err = PTR_ERR(rule); + goto err_red; + } + post_meter->drop_red_rule = rule; + + mlx5e_tc_match_to_reg_match(spec, PACKET_COLOR_TO_REG, + MLX5_FLOW_METER_COLOR_GREEN, MLX5_PACKET_COLOR_MASK); + 
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[0].ft = mlx5e_tc_post_act_get_ft(post_act); + dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[1].counter_id = mlx5_fc_id(green_counter); + + rule = mlx5_add_flow_rules(post_meter->ft, spec, &flow_act, dest, 2); + if (IS_ERR(rule)) { + mlx5_core_warn(priv->mdev, "Failed to create post_meter flow green rule\n"); + err = PTR_ERR(rule); + goto err_green; + } + post_meter->fwd_green_rule = rule; + + kvfree(spec); + return 0; + +err_green: + mlx5_del_flow_rules(post_meter->drop_red_rule); +err_red: + kvfree(spec); + return err; +} + +static void +mlx5e_post_meter_rules_destroy(struct mlx5e_post_meter_priv *post_meter) +{ + mlx5_del_flow_rules(post_meter->drop_red_rule); + mlx5_del_flow_rules(post_meter->fwd_green_rule); +} + +static void +mlx5e_post_meter_fg_destroy(struct mlx5e_post_meter_priv *post_meter) +{ + mlx5_destroy_flow_group(post_meter->fg); +} + +static void +mlx5e_post_meter_table_destroy(struct mlx5e_post_meter_priv *post_meter) +{ + mlx5_destroy_flow_table(post_meter->ft); +} + +struct mlx5e_post_meter_priv * +mlx5e_post_meter_init(struct mlx5e_priv *priv, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_act, + struct mlx5_fc *green_counter, + struct mlx5_fc *red_counter) +{ + struct mlx5e_post_meter_priv *post_meter; + int err; + + post_meter = kzalloc(sizeof(*post_meter), GFP_KERNEL); + if (!post_meter) + return ERR_PTR(-ENOMEM); + + err = mlx5e_post_meter_table_create(priv, ns_type, post_meter); + if (err) + goto err_ft; + + err = mlx5e_post_meter_fg_create(priv, post_meter); + if (err) + goto err_fg; + + err = mlx5e_post_meter_rules_create(priv, post_meter, post_act, green_counter, + red_counter); + if (err) + goto err_rules; + + return post_meter; + +err_rules: + mlx5e_post_meter_fg_destroy(post_meter); +err_fg: + mlx5e_post_meter_table_destroy(post_meter); +err_ft: + kfree(post_meter); + return ERR_PTR(err); +} + +void +mlx5e_post_meter_cleanup(struct mlx5e_post_meter_priv *post_meter) +{ + mlx5e_post_meter_rules_destroy(post_meter); + mlx5e_post_meter_fg_destroy(post_meter); + mlx5e_post_meter_table_destroy(post_meter); + kfree(post_meter); +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h new file mode 100644 index 0000000..34d0e4b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/post_meter.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
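The two rules built above split traffic on the packet color that the flow meter wrote into metadata register c_5: red packets hit the drop rule and the red counter, green packets hit the forward rule, the green counter, and then the post-action table. A small software model of that dispatch (illustrative only; the enum values and counters below are stand-ins, not the firmware interface):

#include <stdint.h>
#include <stdio.h>

enum model_color { MODEL_COLOR_GREEN = 0, MODEL_COLOR_RED = 1 };
enum model_verdict { MODEL_FWD_TO_POST_ACT, MODEL_DROP };

static uint64_t green_pkts, red_pkts;

/* Model of the post_meter table: dispatch on the color "register",
 * count, then drop (red) or forward (green).
 */
static enum model_verdict model_post_meter(enum model_color color)
{
    if (color == MODEL_COLOR_RED) {
        red_pkts++;
        return MODEL_DROP;
    }
    green_pkts++;
    return MODEL_FWD_TO_POST_ACT;
}

int main(void)
{
    model_post_meter(MODEL_COLOR_GREEN);
    model_post_meter(MODEL_COLOR_RED);
    /* drops reported to tc are simply the red packet count,
     * as in mlx5e_tc_meter_get_stats() earlier.
     */
    printf("pass=%llu drop=%llu\n",
           (unsigned long long)green_pkts, (unsigned long long)red_pkts);
    return 0;
}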
*/ + +#ifndef __MLX5_EN_POST_METER_H__ +#define __MLX5_EN_POST_METER_H__ + +#define packet_color_to_reg { \ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_5, \ + .moffset = 0, \ + .mlen = 8, \ + .soffset = MLX5_BYTE_OFF(fte_match_param, \ + misc_parameters_2.metadata_reg_c_5), \ +} + +struct mlx5e_post_meter_priv; + +struct mlx5_flow_table * +mlx5e_post_meter_get_ft(struct mlx5e_post_meter_priv *post_meter); + +struct mlx5e_post_meter_priv * +mlx5e_post_meter_init(struct mlx5e_priv *priv, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_act, + struct mlx5_fc *green_counter, + struct mlx5_fc *red_counter); +void +mlx5e_post_meter_cleanup(struct mlx5e_post_meter_priv *post_meter); + +#endif /* __MLX5_EN_POST_METER_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c new file mode 100644 index 0000000..fd45045 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. */ + +#include +#include +#include "en/mapping.h" +#include "en/tc/post_act.h" +#include "en/tc/act/sample.h" +#include "en/mod_hdr.h" +#include "sample.h" +#include "eswitch.h" +#include "en_tc.h" +#include "fs_core.h" + +#define MLX5_ESW_VPORT_TBL_SIZE_SAMPLE (64 * 1024) + +static const struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_sample_ns = { + .max_fte = MLX5_ESW_VPORT_TBL_SIZE_SAMPLE, + .max_num_groups = 0, /* default num of groups */ + .flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | MLX5_FLOW_TABLE_TUNNEL_EN_DECAP, +}; + +struct mlx5e_tc_psample { + struct mlx5_eswitch *esw; + struct mlx5_flow_table *termtbl; + struct mlx5_flow_handle *termtbl_rule; + DECLARE_HASHTABLE(hashtbl, 8); + struct mutex ht_lock; /* protect hashtbl */ + DECLARE_HASHTABLE(restore_hashtbl, 8); + struct mutex restore_lock; /* protect restore_hashtbl */ + struct mlx5e_post_act *post_act; +}; + +struct mlx5e_sampler { + struct hlist_node hlist; + u32 sampler_id; + u32 sample_ratio; + u32 sample_table_id; + u32 default_table_id; + int count; +}; + +struct mlx5e_sample_flow { + struct mlx5e_sampler *sampler; + struct mlx5e_sample_restore *restore; + struct mlx5_flow_attr *pre_attr; + struct mlx5_flow_handle *pre_rule; + struct mlx5_flow_attr *post_attr; + struct mlx5_flow_handle *post_rule; +}; + +struct mlx5e_sample_restore { + struct hlist_node hlist; + struct mlx5_modify_hdr *modify_hdr; + struct mlx5_flow_handle *rule; + u32 obj_id; + int count; +}; + +static int +sampler_termtbl_create(struct mlx5e_tc_psample *tc_psample) +{ + struct mlx5_eswitch *esw = tc_psample->esw; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_destination dest = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_act act = {}; + int err; + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, termination_table)) { + mlx5_core_warn(dev, "termination table is not supported\n"); + return -EOPNOTSUPP; + } + + root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) { + mlx5_core_warn(dev, "failed to get FDB flow namespace\n"); + return -EOPNOTSUPP; + } + + ft_attr.flags = MLX5_FLOW_TABLE_TERMINATION | MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.autogroup.max_num_groups = 1; + ft_attr.prio = FDB_SLOW_PATH; + ft_attr.max_fte = 1; + ft_attr.level = 1; + tc_psample->termtbl = 
mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr); + if (IS_ERR(tc_psample->termtbl)) { + err = PTR_ERR(tc_psample->termtbl); + mlx5_core_warn(dev, "failed to create termtbl, err: %d\n", err); + return err; + } + + act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.vport.num = esw->manager_vport; + tc_psample->termtbl_rule = mlx5_add_flow_rules(tc_psample->termtbl, NULL, &act, &dest, 1); + if (IS_ERR(tc_psample->termtbl_rule)) { + err = PTR_ERR(tc_psample->termtbl_rule); + mlx5_core_warn(dev, "failed to create termtbl rule, err: %d\n", err); + mlx5_destroy_flow_table(tc_psample->termtbl); + return err; + } + + return 0; +} + +static void +sampler_termtbl_destroy(struct mlx5e_tc_psample *tc_psample) +{ + mlx5_del_flow_rules(tc_psample->termtbl_rule); + mlx5_destroy_flow_table(tc_psample->termtbl); +} + +static int +sampler_obj_create(struct mlx5_core_dev *mdev, struct mlx5e_sampler *sampler) +{ + u32 in[MLX5_ST_SZ_DW(create_sampler_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + u64 general_obj_types; + void *obj; + int err; + + general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types); + if (!(general_obj_types & MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER)) + return -EOPNOTSUPP; + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, ignore_flow_level)) + return -EOPNOTSUPP; + + obj = MLX5_ADDR_OF(create_sampler_obj_in, in, sampler_object); + MLX5_SET(sampler_obj, obj, table_type, FS_FT_FDB); + MLX5_SET(sampler_obj, obj, ignore_flow_level, 1); + MLX5_SET(sampler_obj, obj, level, 1); + MLX5_SET(sampler_obj, obj, sample_ratio, sampler->sample_ratio); + MLX5_SET(sampler_obj, obj, sample_table_id, sampler->sample_table_id); + MLX5_SET(sampler_obj, obj, default_table_id, sampler->default_table_id); + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_GENERAL_OBJECT_TYPES_SAMPLER); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (!err) + sampler->sampler_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + return err; +} + +static void +sampler_obj_destroy(struct mlx5_core_dev *mdev, u32 sampler_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_GENERAL_OBJECT_TYPES_SAMPLER); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, sampler_id); + + mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static u32 +sampler_hash(u32 sample_ratio, u32 default_table_id) +{ + return jhash_2words(sample_ratio, default_table_id, 0); +} + +static int +sampler_cmp(u32 sample_ratio1, u32 default_table_id1, u32 sample_ratio2, u32 default_table_id2) +{ + return sample_ratio1 != sample_ratio2 || default_table_id1 != default_table_id2; +} + +static struct mlx5e_sampler * +sampler_get(struct mlx5e_tc_psample *tc_psample, u32 sample_ratio, u32 default_table_id) +{ + struct mlx5e_sampler *sampler; + u32 hash_key; + int err; + + mutex_lock(&tc_psample->ht_lock); + hash_key = sampler_hash(sample_ratio, default_table_id); + hash_for_each_possible(tc_psample->hashtbl, sampler, hlist, hash_key) + if (!sampler_cmp(sampler->sample_ratio, sampler->default_table_id, + sample_ratio, default_table_id)) + goto add_ref; + + sampler = kzalloc(sizeof(*sampler), GFP_KERNEL); + if (!sampler) { + err = -ENOMEM; + goto err_alloc; + } + + sampler->sample_table_id = tc_psample->termtbl->id; + 
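sampler_hash() and sampler_cmp() above key sampler objects by the (sample_ratio, default_table_id) pair so that sampler_get(), continued just below, can share one hardware sampler between flows with the same parameters and only create a new object on a cache miss. A sketch of that keyed, refcounted cache (userspace model with a flat array instead of the hashtable; the model_ names are invented):

#include <stdint.h>
#include <string.h>

struct model_sampler {
    uint32_t ratio, default_tbl, hw_id;
    int refcount;
};

#define MODEL_MAX_SAMPLERS 8
static struct model_sampler cache[MODEL_MAX_SAMPLERS];
static uint32_t next_hw_id = 1;

static struct model_sampler *model_sampler_get(uint32_t ratio, uint32_t default_tbl)
{
    struct model_sampler *free_slot = NULL;

    for (int i = 0; i < MODEL_MAX_SAMPLERS; i++) {
        struct model_sampler *s = &cache[i];

        if (s->refcount && s->ratio == ratio && s->default_tbl == default_tbl) {
            s->refcount++;                 /* share the existing HW sampler */
            return s;
        }
        if (!s->refcount && !free_slot)
            free_slot = s;
    }
    if (!free_slot)
        return NULL;
    free_slot->ratio = ratio;              /* would create the HW object here */
    free_slot->default_tbl = default_tbl;
    free_slot->hw_id = next_hw_id++;
    free_slot->refcount = 1;
    return free_slot;
}

static void model_sampler_put(struct model_sampler *s)
{
    if (--s->refcount == 0)
        memset(s, 0, sizeof(*s));          /* would destroy the HW object here */
}

int main(void)
{
    struct model_sampler *a = model_sampler_get(100, 7);
    struct model_sampler *b = model_sampler_get(100, 7);  /* same key: shared */

    model_sampler_put(b);
    model_sampler_put(a);
    return (a == b) ? 0 : 1;
}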
sampler->default_table_id = default_table_id; + sampler->sample_ratio = sample_ratio; + + err = sampler_obj_create(tc_psample->esw->dev, sampler); + if (err) + goto err_create; + + hash_add(tc_psample->hashtbl, &sampler->hlist, hash_key); + +add_ref: + sampler->count++; + mutex_unlock(&tc_psample->ht_lock); + return sampler; + +err_create: + kfree(sampler); +err_alloc: + mutex_unlock(&tc_psample->ht_lock); + return ERR_PTR(err); +} + +static void +sampler_put(struct mlx5e_tc_psample *tc_psample, struct mlx5e_sampler *sampler) +{ + mutex_lock(&tc_psample->ht_lock); + if (--sampler->count == 0) { + hash_del(&sampler->hlist); + sampler_obj_destroy(tc_psample->esw->dev, sampler->sampler_id); + kfree(sampler); + } + mutex_unlock(&tc_psample->ht_lock); +} + +/* obj_id is used to restore the sample parameters. + * Set fte_id in original flow table, then match it in the default table. + * Only set it for NICs can preserve reg_c or decap action. For other cases, + * use the same match in the default table. + * Use one header rewrite for both obj_id and fte_id. + */ +static struct mlx5_modify_hdr * +sample_modify_hdr_get(struct mlx5_core_dev *mdev, u32 obj_id, + struct mlx5e_tc_mod_hdr_acts *mod_acts) +{ + struct mlx5_modify_hdr *modify_hdr; + int err; + + err = mlx5e_tc_match_to_reg_set(mdev, mod_acts, MLX5_FLOW_NAMESPACE_FDB, + CHAIN_TO_REG, obj_id); + if (err) + goto err_set_regc0; + + modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_FDB, + mod_acts->num_actions, + mod_acts->actions); + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + goto err_modify_hdr; + } + + mlx5e_mod_hdr_dealloc(mod_acts); + return modify_hdr; + +err_modify_hdr: + mlx5e_mod_hdr_dealloc(mod_acts); +err_set_regc0: + return ERR_PTR(err); +} + +static struct mlx5e_sample_restore * +sample_restore_get(struct mlx5e_tc_psample *tc_psample, u32 obj_id, + struct mlx5e_tc_mod_hdr_acts *mod_acts) +{ + struct mlx5_eswitch *esw = tc_psample->esw; + struct mlx5_core_dev *mdev = esw->dev; + struct mlx5e_sample_restore *restore; + struct mlx5_modify_hdr *modify_hdr; + int err; + + mutex_lock(&tc_psample->restore_lock); + hash_for_each_possible(tc_psample->restore_hashtbl, restore, hlist, obj_id) + if (restore->obj_id == obj_id) + goto add_ref; + + restore = kzalloc(sizeof(*restore), GFP_KERNEL); + if (!restore) { + err = -ENOMEM; + goto err_alloc; + } + restore->obj_id = obj_id; + + modify_hdr = sample_modify_hdr_get(mdev, obj_id, mod_acts); + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + goto err_modify_hdr; + } + restore->modify_hdr = modify_hdr; + + restore->rule = esw_add_restore_rule(esw, obj_id); + if (IS_ERR(restore->rule)) { + err = PTR_ERR(restore->rule); + goto err_restore; + } + + hash_add(tc_psample->restore_hashtbl, &restore->hlist, obj_id); +add_ref: + restore->count++; + mutex_unlock(&tc_psample->restore_lock); + return restore; + +err_restore: + mlx5_modify_header_dealloc(mdev, restore->modify_hdr); +err_modify_hdr: + kfree(restore); +err_alloc: + mutex_unlock(&tc_psample->restore_lock); + return ERR_PTR(err); +} + +static void +sample_restore_put(struct mlx5e_tc_psample *tc_psample, struct mlx5e_sample_restore *restore) +{ + mutex_lock(&tc_psample->restore_lock); + if (--restore->count == 0) + hash_del(&restore->hlist); + mutex_unlock(&tc_psample->restore_lock); + + if (!restore->count) { + mlx5_del_flow_rules(restore->rule); + mlx5_modify_header_dealloc(tc_psample->esw->dev, restore->modify_hdr); + kfree(restore); + } +} + +void mlx5e_tc_sample_skb(struct sk_buff *skb, struct 
mlx5_mapped_obj *mapped_obj) +{ + u32 trunc_size = mapped_obj->sample.trunc_size; + struct psample_group psample_group = {}; + struct psample_metadata md = {}; + + md.trunc_size = trunc_size ? min(trunc_size, skb->len) : skb->len; + md.in_ifindex = skb->dev->ifindex; + psample_group.group_num = mapped_obj->sample.group_id; + psample_group.net = &init_net; + skb_push(skb, skb->mac_len); + + psample_sample_packet(&psample_group, skb, mapped_obj->sample.rate, &md); +} + +static int +add_post_rule(struct mlx5_eswitch *esw, struct mlx5e_sample_flow *sample_flow, + struct mlx5_flow_spec *spec, struct mlx5_flow_attr *attr, + u32 *default_tbl_id) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + u32 attr_sz = ns_to_attr_sz(MLX5_FLOW_NAMESPACE_FDB); + struct mlx5_vport_tbl_attr per_vport_tbl_attr; + struct mlx5_flow_table *default_tbl; + struct mlx5_flow_attr *post_attr; + int err; + + /* Allocate default table per vport, chain and prio. Otherwise, there is + * only one default table for the same sampler object. Rules with different + * prio and chain may overlap. For CT sample action, per vport default + * table is needed to restore the metadata. + */ + per_vport_tbl_attr.chain = attr->chain; + per_vport_tbl_attr.prio = attr->prio; + per_vport_tbl_attr.vport = esw_attr->in_rep->vport; + per_vport_tbl_attr.vport_ns = &mlx5_esw_vport_tbl_sample_ns; + default_tbl = mlx5_esw_vporttbl_get(esw, &per_vport_tbl_attr); + if (IS_ERR(default_tbl)) { + err = PTR_ERR(default_tbl); + goto err_default_tbl; + } + *default_tbl_id = default_tbl->id; + + post_attr = mlx5_alloc_flow_attr(MLX5_FLOW_NAMESPACE_FDB); + if (!post_attr) { + err = -ENOMEM; + goto err_attr; + } + sample_flow->post_attr = post_attr; + memcpy(post_attr, attr, attr_sz); + /* Perform the original matches on the default table. + * Offload all actions except the sample action. + */ + post_attr->chain = 0; + post_attr->prio = 0; + post_attr->ft = default_tbl; + post_attr->flags = MLX5_ATTR_FLAG_NO_IN_PORT; + + /* When offloading sample and encap action, if there is no valid + * neigh data struct, a slow path rule is offloaded first. Source + * port metadata match is set at that time. A per vport table is + * already allocated. No need to match it again. So clear the source + * port metadata match.
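mlx5e_tc_sample_skb() above hands the sampled packet to psample and truncates the reported length only when the tc action asked for truncation and the packet is long enough. That rule, isolated (hypothetical helper name, illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Sketch of the truncation rule used above: a trunc_size of 0 means
 * "no truncation", otherwise never report more than the packet holds.
 */
static uint32_t model_psample_trunc(uint32_t trunc_size, uint32_t skb_len)
{
    return trunc_size ? (trunc_size < skb_len ? trunc_size : skb_len) : skb_len;
}

int main(void)
{
    printf("%u\n", model_psample_trunc(0, 1500));    /* 1500: full packet */
    printf("%u\n", model_psample_trunc(128, 1500));  /* 128: truncated */
    printf("%u\n", model_psample_trunc(128, 64));    /* 64: packet is shorter */
    return 0;
}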
+ */ + mlx5_eswitch_clear_rule_source_port(esw, spec); + sample_flow->post_rule = mlx5_eswitch_add_offloaded_rule(esw, spec, post_attr); + if (IS_ERR(sample_flow->post_rule)) { + err = PTR_ERR(sample_flow->post_rule); + goto err_rule; + } + return 0; + +err_rule: + kfree(post_attr); +err_attr: + mlx5_esw_vporttbl_put(esw, &per_vport_tbl_attr); +err_default_tbl: + return err; +} + +static void +del_post_rule(struct mlx5_eswitch *esw, struct mlx5e_sample_flow *sample_flow, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_vport_tbl_attr tbl_attr; + + mlx5_eswitch_del_offloaded_rule(esw, sample_flow->post_rule, sample_flow->post_attr); + kfree(sample_flow->post_attr); + tbl_attr.chain = attr->chain; + tbl_attr.prio = attr->prio; + tbl_attr.vport = esw_attr->in_rep->vport; + tbl_attr.vport_ns = &mlx5_esw_vport_tbl_sample_ns; + mlx5_esw_vporttbl_put(esw, &tbl_attr); +} + +/* For the following typical flow table: + * + * +-------------------------------+ + * + original flow table + + * +-------------------------------+ + * + original match + + * +-------------------------------+ + * + sample action + other actions + + * +-------------------------------+ + * + * We translate the tc filter with sample action to the following HW model: + * + * +---------------------+ + * + original flow table + + * +---------------------+ + * + original match + + * +---------------------+ + * | set fte_id (if reg_c preserve cap) + * | do decap (if required) + * v + * +------------------------------------------------+ + * + Flow Sampler Object + + * +------------------------------------------------+ + * + sample ratio + + * +------------------------------------------------+ + * + sample table id | default table id + + * +------------------------------------------------+ + * | | + * v v + * +-----------------------------+ +-------------------+ + * + sample table + + default table + + * +-----------------------------+ +-------------------+ + * + forward to management vport + | + * +-----------------------------+ | + * +-------+------+ + * | |reg_c preserve cap + * | |or decap action + * v v + * +-----------------+ +-------------+ + * + per vport table + + post action + + * +-----------------+ +-------------+ + * + original match + + * +-----------------+ + * + other actions + + * +-----------------+ + */ +struct mlx5_flow_handle * +mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_esw_flow_attr *pre_esw_attr; + struct mlx5_mapped_obj restore_obj = {}; + struct mlx5e_tc_mod_hdr_acts *mod_acts; + struct mlx5e_sample_flow *sample_flow; + struct mlx5e_sample_attr *sample_attr; + struct mlx5_flow_attr *pre_attr; + u32 tunnel_id = attr->tunnel_id; + struct mlx5_eswitch *esw; + u32 default_tbl_id; + u32 obj_id; + int err; + + if (IS_ERR_OR_NULL(tc_psample)) + return ERR_PTR(-EOPNOTSUPP); + + sample_flow = kzalloc(sizeof(*sample_flow), GFP_KERNEL); + if (!sample_flow) + return ERR_PTR(-ENOMEM); + sample_attr = &attr->sample_attr; + sample_attr->sample_flow = sample_flow; + + /* For NICs with reg_c_preserve support or decap action, use + * post action instead of the per vport, chain and prio table. + * Only match the fte id instead of the same match in the + * original flow table. 
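Behaviourally, the sampler object in the diagram above duplicates roughly one packet in sample_ratio towards the sample (termination) table while every packet continues to the default table. A software model of that split (the hardware samples statistically; the modulo below and the names are only for illustration):

#include <stdint.h>
#include <stdio.h>

struct model_sampler_obj {
    uint32_t sample_ratio;       /* one sampled packet per 'ratio' packets */
    uint64_t seen;
    uint64_t sampled;
};

/* Every packet goes on to the default table; roughly one in sample_ratio
 * is also duplicated towards the sample table (the termination table that
 * forwards to the eswitch manager vport).
 */
static void model_sampler_process(struct model_sampler_obj *s)
{
    s->seen++;
    if (s->sample_ratio && (s->seen % s->sample_ratio) == 0)
        s->sampled++;            /* duplicate to sample table */
    /* fall through: continue to default table in all cases */
}

int main(void)
{
    struct model_sampler_obj s = { .sample_ratio = 100 };

    for (int i = 0; i < 1000; i++)
        model_sampler_process(&s);
    printf("seen=%llu sampled=%llu\n",
           (unsigned long long)s.seen, (unsigned long long)s.sampled);
    return 0;
}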
+ */ + esw = tc_psample->esw; + if (mlx5e_tc_act_sample_is_multi_table(esw->dev, attr)) { + struct mlx5_flow_table *ft; + + ft = mlx5e_tc_post_act_get_ft(tc_psample->post_act); + default_tbl_id = ft->id; + } else { + err = add_post_rule(esw, sample_flow, spec, attr, &default_tbl_id); + if (err) + goto err_post_rule; + } + + /* Create sampler object. */ + sample_flow->sampler = sampler_get(tc_psample, sample_attr->rate, default_tbl_id); + if (IS_ERR(sample_flow->sampler)) { + err = PTR_ERR(sample_flow->sampler); + goto err_sampler; + } + sample_attr->sampler_id = sample_flow->sampler->sampler_id; + + /* Create an id mapping reg_c0 value to sample object. */ + restore_obj.type = MLX5_MAPPED_OBJ_SAMPLE; + restore_obj.sample.group_id = sample_attr->group_num; + restore_obj.sample.rate = sample_attr->rate; + restore_obj.sample.trunc_size = sample_attr->trunc_size; + restore_obj.sample.tunnel_id = tunnel_id; + err = mapping_add(esw->offloads.reg_c0_obj_pool, &restore_obj, &obj_id); + if (err) + goto err_obj_id; + sample_attr->restore_obj_id = obj_id; + + /* Create sample restore context. */ + mod_acts = &attr->parse_attr->mod_hdr_acts; + sample_flow->restore = sample_restore_get(tc_psample, obj_id, mod_acts); + if (IS_ERR(sample_flow->restore)) { + err = PTR_ERR(sample_flow->restore); + goto err_sample_restore; + } + + /* Perform the original matches on the original table. Offload the + * sample action. The destination is the sampler object. + */ + pre_attr = mlx5_alloc_flow_attr(MLX5_FLOW_NAMESPACE_FDB); + if (!pre_attr) { + err = -ENOMEM; + goto err_alloc_pre_flow_attr; + } + pre_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + /* For decap action, do decap in the original flow table instead of the + * default flow table. + */ + if (tunnel_id) + pre_attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + pre_attr->modify_hdr = sample_flow->restore->modify_hdr; + pre_attr->flags = MLX5_ATTR_FLAG_SAMPLE; + pre_attr->inner_match_level = attr->inner_match_level; + pre_attr->outer_match_level = attr->outer_match_level; + pre_attr->chain = attr->chain; + pre_attr->prio = attr->prio; + pre_attr->ft = attr->ft; + pre_attr->sample_attr = *sample_attr; + pre_esw_attr = pre_attr->esw_attr; + pre_esw_attr->in_mdev = esw_attr->in_mdev; + pre_esw_attr->in_rep = esw_attr->in_rep; + sample_flow->pre_rule = mlx5_eswitch_add_offloaded_rule(esw, spec, pre_attr); + if (IS_ERR(sample_flow->pre_rule)) { + err = PTR_ERR(sample_flow->pre_rule); + goto err_pre_offload_rule; + } + sample_flow->pre_attr = pre_attr; + + return sample_flow->pre_rule; + +err_pre_offload_rule: + kfree(pre_attr); +err_alloc_pre_flow_attr: + sample_restore_put(tc_psample, sample_flow->restore); +err_sample_restore: + mapping_remove(esw->offloads.reg_c0_obj_pool, obj_id); +err_obj_id: + sampler_put(tc_psample, sample_flow->sampler); +err_sampler: + if (sample_flow->post_rule) + del_post_rule(esw, sample_flow, attr); +err_post_rule: + kfree(sample_flow); + return ERR_PTR(err); +} + +void +mlx5e_tc_sample_unoffload(struct mlx5e_tc_psample *tc_psample, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_sample_flow *sample_flow; + struct mlx5_eswitch *esw; + + if (IS_ERR_OR_NULL(tc_psample)) + return; + + /* The following delete order can't be changed, otherwise, + * will hit fw syndromes. 
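On the offload path above, the value placed in reg_c0 is only an index: mapping_add() stores the group/rate/trunc_size tuple under an obj_id, and mlx5e_tc_sample_skb() later looks that tuple up again from the mapped object when a sampled packet reaches software. A compact sketch of that id-to-metadata mapping (illustrative userspace model with invented names and a fixed-size table):

#include <stdint.h>
#include <stdio.h>

struct model_sample_restore {    /* stands in for mlx5_mapped_obj.sample */
    uint32_t group_id;
    uint32_t rate;
    uint32_t trunc_size;
    int used;
};

#define MODEL_MAX_OBJS 32
static struct model_sample_restore objs[MODEL_MAX_OBJS];

static int model_mapping_add(uint32_t group_id, uint32_t rate,
                             uint32_t trunc_size, uint32_t *obj_id)
{
    for (uint32_t i = 1; i < MODEL_MAX_OBJS; i++) {   /* 0 kept as "unmapped" */
        if (!objs[i].used) {
            objs[i] = (struct model_sample_restore){ group_id, rate, trunc_size, 1 };
            *obj_id = i;          /* this is what ends up in reg_c0 */
            return 0;
        }
    }
    return -1;
}

static const struct model_sample_restore *model_mapping_find(uint32_t obj_id)
{
    return (obj_id && obj_id < MODEL_MAX_OBJS && objs[obj_id].used) ?
           &objs[obj_id] : NULL;
}

int main(void)
{
    uint32_t id;

    if (!model_mapping_add(5, 100, 128, &id))
        printf("reg_c0=%u -> group=%u rate=%u\n", id,
               model_mapping_find(id)->group_id, model_mapping_find(id)->rate);
    return 0;
}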
+ */ + esw = tc_psample->esw; + sample_flow = attr->sample_attr.sample_flow; + mlx5_eswitch_del_offloaded_rule(esw, sample_flow->pre_rule, sample_flow->pre_attr); + + sample_restore_put(tc_psample, sample_flow->restore); + mapping_remove(esw->offloads.reg_c0_obj_pool, attr->sample_attr.restore_obj_id); + sampler_put(tc_psample, sample_flow->sampler); + if (sample_flow->post_rule) + del_post_rule(esw, sample_flow, attr); + + kfree(sample_flow->pre_attr); + kfree(sample_flow); +} + +struct mlx5e_tc_psample * +mlx5e_tc_sample_init(struct mlx5_eswitch *esw, struct mlx5e_post_act *post_act) +{ + struct mlx5e_tc_psample *tc_psample; + int err; + + tc_psample = kzalloc(sizeof(*tc_psample), GFP_KERNEL); + if (!tc_psample) + return ERR_PTR(-ENOMEM); + if (IS_ERR_OR_NULL(post_act)) { + err = PTR_ERR(post_act); + goto err_post_act; + } + tc_psample->post_act = post_act; + tc_psample->esw = esw; + err = sampler_termtbl_create(tc_psample); + if (err) + goto err_post_act; + + mutex_init(&tc_psample->ht_lock); + mutex_init(&tc_psample->restore_lock); + + return tc_psample; + +err_post_act: + kfree(tc_psample); + return ERR_PTR(err); +} + +void +mlx5e_tc_sample_cleanup(struct mlx5e_tc_psample *tc_psample) +{ + if (IS_ERR_OR_NULL(tc_psample)) + return; + + mutex_destroy(&tc_psample->restore_lock); + mutex_destroy(&tc_psample->ht_lock); + sampler_termtbl_destroy(tc_psample); + kfree(tc_psample); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h new file mode 100644 index 0000000..a569367 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc/sample.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#ifndef __MLX5_EN_TC_SAMPLE_H__ +#define __MLX5_EN_TC_SAMPLE_H__ + +#include "eswitch.h" + +struct mlx5_flow_attr; +struct mlx5e_tc_psample; +struct mlx5e_post_act; + +struct mlx5e_sample_attr { + u32 group_num; + u32 rate; + u32 trunc_size; + u32 restore_obj_id; + u32 sampler_id; + struct mlx5e_sample_flow *sample_flow; +}; + +#if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE) + +void mlx5e_tc_sample_skb(struct sk_buff *skb, struct mlx5_mapped_obj *mapped_obj); + +struct mlx5_flow_handle * +mlx5e_tc_sample_offload(struct mlx5e_tc_psample *sample_priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); + +void +mlx5e_tc_sample_unoffload(struct mlx5e_tc_psample *sample_priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr); + +struct mlx5e_tc_psample * +mlx5e_tc_sample_init(struct mlx5_eswitch *esw, struct mlx5e_post_act *post_act); + +void +mlx5e_tc_sample_cleanup(struct mlx5e_tc_psample *tc_psample); + +#else /* CONFIG_MLX5_TC_SAMPLE */ + +static inline struct mlx5_flow_handle * +mlx5e_tc_sample_offload(struct mlx5e_tc_psample *tc_psample, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ return ERR_PTR(-EOPNOTSUPP); } + +static inline void +mlx5e_tc_sample_unoffload(struct mlx5e_tc_psample *tc_psample, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr) {} + +static inline struct mlx5e_tc_psample * +mlx5e_tc_sample_init(struct mlx5_eswitch *esw, struct mlx5e_post_act *post_act) +{ return ERR_PTR(-EOPNOTSUPP); } + +static inline void +mlx5e_tc_sample_cleanup(struct mlx5e_tc_psample *tc_psample) {} + +static inline void +mlx5e_tc_sample_skb(struct sk_buff *skb, struct mlx5_mapped_obj *mapped_obj) {} + +#endif /* CONFIG_MLX5_TC_SAMPLE */ +#endif /* __MLX5_EN_TC_SAMPLE_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c new file mode 100644 index 0000000..611db4a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -0,0 +1,2394 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lib/fs_chains.h" +#include "en/tc_ct.h" +#include "en/tc_priv.h" +#include "en/tc/ct_fs.h" +#include "en/mod_hdr.h" +#include "en/mapping.h" +#include "en/tc/post_act.h" +#include "en.h" +#include "en_tc.h" +#include "en_rep.h" +#include "fs_core.h" + +#define MLX5_CT_STATE_ESTABLISHED_BIT BIT(1) +#define MLX5_CT_STATE_TRK_BIT BIT(2) +#define MLX5_CT_STATE_NAT_BIT BIT(3) +#define MLX5_CT_STATE_REPLY_BIT BIT(4) +#define MLX5_CT_STATE_RELATED_BIT BIT(5) +#define MLX5_CT_STATE_INVALID_BIT BIT(6) + +#define MLX5_CT_LABELS_BITS MLX5_REG_MAPPING_MBITS(LABELS_TO_REG) +#define MLX5_CT_LABELS_MASK MLX5_REG_MAPPING_MASK(LABELS_TO_REG) + +/* Statically allocate modify actions for + * ipv6 and port nat (5) + tuple fields (4) + nic mode zone restore (1) = 10. + * This will be increased dynamically if needed (for the ipv6 snat + dnat). 
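The MLX5_CT_STATE_* defines above encode the conntrack state (+trk, +est, +nat, +rpl, ...) as a small bitmask that is written to a register and matched on later. A short standalone illustration of composing and testing such a mask (the constants mirror the bit positions above; the match below is an assumed example, not taken from the driver):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the MLX5_CT_STATE_* bit layout defined above. */
#define MODEL_CT_ESTABLISHED (1u << 1)
#define MODEL_CT_TRK         (1u << 2)
#define MODEL_CT_NAT         (1u << 3)
#define MODEL_CT_REPLY       (1u << 4)

int main(void)
{
    /* ct_state written for an offloaded, NATed, reply-direction tuple,
     * as composed in mlx5_tc_ct_entry_create_mod_hdr() further below.
     */
    uint8_t ct_state = MODEL_CT_TRK | MODEL_CT_ESTABLISHED |
                       MODEL_CT_NAT | MODEL_CT_REPLY;

    /* a "+trk+est" match only cares about those two bits */
    uint8_t match_val  = MODEL_CT_TRK | MODEL_CT_ESTABLISHED;
    uint8_t match_mask = MODEL_CT_TRK | MODEL_CT_ESTABLISHED;

    printf("matches: %d\n", (ct_state & match_mask) == match_val);
    return 0;
}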
+ */ +#define MLX5_CT_MIN_MOD_ACTS 10 + +#define ct_dbg(fmt, args...)\ + netdev_dbg(ct_priv->netdev, "ct_debug: " fmt "\n", ##args) + +#define MLX5_CT_DEFAULT_MAX_OFFLOADED_CONNS UINT_MAX + +struct mlx5_tc_ct_priv { + struct mlx5_core_dev *dev; + const struct net_device *netdev; + struct mod_hdr_tbl *mod_hdr_tbl; + struct xarray tuple_ids; + struct rhashtable zone_ht; + struct rhashtable ct_tuples_ht; + struct rhashtable ct_tuples_nat_ht; + struct mlx5_flow_table *ct; + struct mlx5_flow_table *ct_nat; + struct mlx5_flow_group *ct_nat_miss_group; + struct mlx5_flow_handle *ct_nat_miss_rule; + struct mlx5e_post_act *post_act; + struct mutex control_lock; /* guards parallel adds/dels */ + struct mapping_ctx *zone_mapping; + struct mapping_ctx *labels_mapping; + enum mlx5_flow_namespace_type ns_type; + struct mlx5_fs_chains *chains; + struct mlx5_ct_fs *fs; + struct mlx5_ct_fs_ops *fs_ops; + spinlock_t ht_lock; /* protects ft entries */ + atomic_t num_offloaded_flows; + bool use_label_mapping; + struct workqueue_struct *wq; +}; + +struct mlx5_ct_flow { + struct mlx5_flow_attr *pre_ct_attr; + struct mlx5_flow_handle *pre_ct_rule; + struct mlx5_ct_ft *ft; + u32 chain_mapping; +}; + +struct mlx5_ct_zone_rule { + struct mlx5_ct_fs_rule *rule; + struct mlx5e_mod_hdr_handle *mh; + struct mlx5_flow_attr *attr; + bool nat; +}; + +struct mlx5_tc_ct_pre { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *flow_grp; + struct mlx5_flow_group *miss_grp; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_handle *miss_rule; + struct mlx5_modify_hdr *modify_hdr; +}; + +struct mlx5_ct_ft { + struct rhash_head node; + u16 zone; + u32 zone_restore_id; + refcount_t refcount; + struct nf_flowtable *nf_ft; + struct mlx5_tc_ct_priv *ct_priv; + struct rhashtable ct_entries_ht; + struct mlx5_tc_ct_pre pre_ct; + struct mlx5_tc_ct_pre pre_ct_nat; +}; + +struct mlx5_ct_tuple { + u16 addr_type; + __be16 n_proto; + u8 ip_proto; + struct { + union { + __be32 src_v4; + struct in6_addr src_v6; + }; + union { + __be32 dst_v4; + struct in6_addr dst_v6; + }; + } ip; + struct { + __be16 src; + __be16 dst; + } port; + + u16 zone; +}; + +struct mlx5_ct_counter { + struct mlx5_fc *counter; + refcount_t refcount; + bool is_shared; +}; + +enum { + MLX5_CT_ENTRY_FLAG_VALID, + MLX5_CT_ENTRY_IN_CT_TABLE, + MLX5_CT_ENTRY_IN_CT_NAT_TABLE, +}; + +struct mlx5_ct_entry { + struct rhash_head node; + struct rhash_head tuple_node; + struct rhash_head tuple_nat_node; + struct mlx5_ct_counter *counter; + unsigned long cookie; + unsigned long restore_cookie; + struct mlx5_ct_tuple tuple; + struct mlx5_ct_tuple tuple_nat; + struct mlx5_ct_zone_rule zone_rules[2]; + + struct mlx5_tc_ct_priv *ct_priv; + struct work_struct work; + + refcount_t refcnt; + unsigned long flags; +}; + +static void +mlx5_tc_ct_entry_destroy_mod_hdr(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_flow_attr *attr, + struct mlx5e_mod_hdr_handle *mh); + +static const struct rhashtable_params cts_ht_params = { + .head_offset = offsetof(struct mlx5_ct_entry, node), + .key_offset = offsetof(struct mlx5_ct_entry, cookie), + .key_len = sizeof(((struct mlx5_ct_entry *)0)->cookie), + .automatic_shrinking = true, + .min_size = 16 * 1024, +}; + +static const struct rhashtable_params zone_params = { + .head_offset = offsetof(struct mlx5_ct_ft, node), + .key_offset = offsetof(struct mlx5_ct_ft, zone), + .key_len = sizeof(((struct mlx5_ct_ft *)0)->zone), + .automatic_shrinking = true, +}; + +static const struct rhashtable_params tuples_ht_params = { + .head_offset = 
offsetof(struct mlx5_ct_entry, tuple_node), + .key_offset = offsetof(struct mlx5_ct_entry, tuple), + .key_len = sizeof(((struct mlx5_ct_entry *)0)->tuple), + .automatic_shrinking = true, + .min_size = 16 * 1024, +}; + +static const struct rhashtable_params tuples_nat_ht_params = { + .head_offset = offsetof(struct mlx5_ct_entry, tuple_nat_node), + .key_offset = offsetof(struct mlx5_ct_entry, tuple_nat), + .key_len = sizeof(((struct mlx5_ct_entry *)0)->tuple_nat), + .automatic_shrinking = true, + .min_size = 16 * 1024, +}; + +static bool +mlx5_tc_ct_entry_in_ct_table(struct mlx5_ct_entry *entry) +{ + return test_bit(MLX5_CT_ENTRY_IN_CT_TABLE, &entry->flags); +} + +static bool +mlx5_tc_ct_entry_in_ct_nat_table(struct mlx5_ct_entry *entry) +{ + return test_bit(MLX5_CT_ENTRY_IN_CT_NAT_TABLE, &entry->flags); +} + +static int +mlx5_get_label_mapping(struct mlx5_tc_ct_priv *ct_priv, + u32 *labels, u32 *id) +{ + if (!ct_priv->use_label_mapping) { + if (labels[1] || labels[2] || labels[3]) { + /* labels here are either from a tc flower match or those to be set for a + * ct tuple. + * + * We can support matches that match specificly on higher bits + * being zero or don't care about them as long as both set and match go + * through this validation. + */ + + ct_dbg("ct_labels high bits not zero 0x%08x 0x%08x 0x%08x 0x%08x", + labels[3], labels[2], labels[1], labels[0]); + return -EOPNOTSUPP; + } + + *id = labels[0]; + return 0; + } + + if (!memchr_inv(labels, 0, sizeof(u32) * 4)) { + *id = 0; + return 0; + } + + if (mapping_add(ct_priv->labels_mapping, labels, id)) + return -EOPNOTSUPP; + + return 0; +} + +static void +mlx5_put_label_mapping(struct mlx5_tc_ct_priv *ct_priv, u32 id) +{ + if (id && ct_priv->use_label_mapping) + mapping_remove(ct_priv->labels_mapping, id); +} + +u32 +mlx5_tc_ct_max_offloaded_conns_get(struct mlx5_core_dev *dev) +{ + return dev->mlx5e_res.ct.max_offloaded_conns; +} + +void +mlx5_tc_ct_max_offloaded_conns_set(struct mlx5_core_dev *dev, u32 max) +{ + dev->mlx5e_res.ct.max_offloaded_conns = max; +} + +static void +mlx5_tc_ct_init_ct_max_offloaded_conns(struct mlx5_core_dev *dev) +{ + dev->mlx5e_res.ct.max_offloaded_conns = MLX5_CT_DEFAULT_MAX_OFFLOADED_CONNS; +} + +static int +mlx5_tc_ct_rule_to_tuple(struct mlx5_ct_tuple *tuple, struct flow_rule *rule) +{ + struct flow_match_control control; + struct flow_match_basic basic; + + flow_rule_match_basic(rule, &basic); + flow_rule_match_control(rule, &control); + + tuple->n_proto = basic.key->n_proto; + tuple->ip_proto = basic.key->ip_proto; + tuple->addr_type = control.key->addr_type; + + if (tuple->addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_ipv4_addrs(rule, &match); + tuple->ip.src_v4 = match.key->src; + tuple->ip.dst_v4 = match.key->dst; + } else if (tuple->addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_ipv6_addrs(rule, &match); + tuple->ip.src_v6 = match.key->src; + tuple->ip.dst_v6 = match.key->dst; + } else { + return -EOPNOTSUPP; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_match_ports match; + + flow_rule_match_ports(rule, &match); + switch (tuple->ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + tuple->port.src = match.key->src; + tuple->port.dst = match.key->dst; + break; + default: + return -EOPNOTSUPP; + } + } else { + if (tuple->ip_proto != IPPROTO_GRE) + return -EOPNOTSUPP; + } + + return 0; +} + +static int +mlx5_tc_ct_rule_to_tuple_nat(struct mlx5_ct_tuple 
*tuple, + struct flow_rule *rule) +{ + struct flow_action *flow_action = &rule->action; + struct flow_action_entry *act; + u32 offset, val, ip6_offset; + int i; + + flow_action_for_each(i, act, flow_action) { + if (act->id != FLOW_ACTION_MANGLE) + continue; + + offset = act->mangle.offset; + val = act->mangle.val; + switch (act->mangle.htype) { + case FLOW_ACT_MANGLE_HDR_TYPE_IP4: + if (offset == offsetof(struct iphdr, saddr)) + tuple->ip.src_v4 = cpu_to_be32(val); + else if (offset == offsetof(struct iphdr, daddr)) + tuple->ip.dst_v4 = cpu_to_be32(val); + else + return -EOPNOTSUPP; + break; + + case FLOW_ACT_MANGLE_HDR_TYPE_IP6: + ip6_offset = (offset - offsetof(struct ipv6hdr, saddr)); + ip6_offset /= 4; + if (ip6_offset < 4) + tuple->ip.src_v6.s6_addr32[ip6_offset] = cpu_to_be32(val); + else if (ip6_offset < 8) + tuple->ip.dst_v6.s6_addr32[ip6_offset - 4] = cpu_to_be32(val); + else + return -EOPNOTSUPP; + break; + + case FLOW_ACT_MANGLE_HDR_TYPE_TCP: + if (offset == offsetof(struct tcphdr, source)) + tuple->port.src = cpu_to_be16(val); + else if (offset == offsetof(struct tcphdr, dest)) + tuple->port.dst = cpu_to_be16(val); + else + return -EOPNOTSUPP; + break; + + case FLOW_ACT_MANGLE_HDR_TYPE_UDP: + if (offset == offsetof(struct udphdr, source)) + tuple->port.src = cpu_to_be16(val); + else if (offset == offsetof(struct udphdr, dest)) + tuple->port.dst = cpu_to_be16(val); + else + return -EOPNOTSUPP; + break; + + default: + return -EOPNOTSUPP; + } + } + + return 0; +} + +static int +mlx5_tc_ct_get_flow_source_match(struct mlx5_tc_ct_priv *ct_priv, + struct net_device *ndev) +{ + struct mlx5e_priv *other_priv = netdev_priv(ndev); + struct mlx5_core_dev *mdev = ct_priv->dev; + bool vf_rep, uplink_rep; + + vf_rep = mlx5e_eswitch_vf_rep(ndev) && mlx5_same_hw_devs(mdev, other_priv->mdev); + uplink_rep = mlx5e_eswitch_uplink_rep(ndev) && mlx5_same_hw_devs(mdev, other_priv->mdev); + + if (vf_rep) + return MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT; + if (uplink_rep) + return MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; + if (is_vlan_dev(ndev)) + return mlx5_tc_ct_get_flow_source_match(ct_priv, vlan_dev_real_dev(ndev)); + if (netif_is_macvlan(ndev)) + return mlx5_tc_ct_get_flow_source_match(ct_priv, macvlan_dev_real_dev(ndev)); + if (mlx5e_get_tc_tun(ndev) || netif_is_lag_master(ndev)) + return MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; + + return MLX5_FLOW_CONTEXT_FLOW_SOURCE_ANY_VPORT; +} + +static int +mlx5_tc_ct_set_tuple_match(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_flow_spec *spec, + struct flow_rule *rule) +{ + void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + u16 addr_type = 0; + u8 ip_proto = 0; + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_match_basic match; + + flow_rule_match_basic(rule, &match); + + mlx5e_tc_set_ethertype(ct_priv->dev, &match, true, headers_c, headers_v); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + match.mask->ip_proto); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + match.key->ip_proto); + + ip_proto = match.key->ip_proto; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_match_control match; + + flow_rule_match_control(rule, &match); + addr_type = match.key->addr_type; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_ipv4_addrs(rule, &match); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, 
headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &match.mask->src, sizeof(match.mask->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &match.key->src, sizeof(match.key->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &match.mask->dst, sizeof(match.mask->dst)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &match.key->dst, sizeof(match.key->dst)); + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_ipv6_addrs(rule, &match); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.mask->src, sizeof(match.mask->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.key->src, sizeof(match.key->src)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.mask->dst, sizeof(match.mask->dst)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.key->dst, sizeof(match.key->dst)); + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_match_ports match; + + flow_rule_match_ports(rule, &match); + switch (ip_proto) { + case IPPROTO_TCP: + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + tcp_sport, ntohs(match.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + tcp_sport, ntohs(match.key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + tcp_dport, ntohs(match.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + tcp_dport, ntohs(match.key->dst)); + break; + + case IPPROTO_UDP: + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_sport, ntohs(match.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_sport, ntohs(match.key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_dport, ntohs(match.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_dport, ntohs(match.key->dst)); + break; + default: + break; + } + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_TCP)) { + struct flow_match_tcp match; + + flow_rule_match_tcp(rule, &match); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_flags, + ntohs(match.mask->flags)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_flags, + ntohs(match.key->flags)); + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_META)) { + struct flow_match_meta match; + + flow_rule_match_meta(rule, &match); + + if (match.key->ingress_ifindex & match.mask->ingress_ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(&init_net, match.key->ingress_ifindex); + if (dev && MLX5_CAP_ESW_FLOWTABLE(ct_priv->dev, flow_source)) + spec->flow_context.flow_source = + mlx5_tc_ct_get_flow_source_match(ct_priv, dev); + + if (dev) + dev_put(dev); + } + } + + return 0; +} + +static void +mlx5_tc_ct_counter_put(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_entry *entry) +{ + if (entry->counter->is_shared && + !refcount_dec_and_test(&entry->counter->refcount)) + return; + + mlx5_fc_destroy(ct_priv->dev, entry->counter->counter); + kfree(entry->counter); +} + +static void +mlx5_tc_ct_entry_del_rule(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_ct_entry *entry, + bool nat) +{ + struct mlx5_ct_zone_rule *zone_rule = &entry->zone_rules[nat]; + struct mlx5_flow_attr *attr = zone_rule->attr; + + ct_dbg("Deleting ct entry 0x%p rule in zone %d", entry, entry->tuple.zone); + + 
ct_priv->fs_ops->ct_rule_del(ct_priv->fs, zone_rule->rule); + mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, zone_rule->attr, zone_rule->mh); + mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id); + kfree(attr); +} + +static void +mlx5_tc_ct_entry_del_rules(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_ct_entry *entry) +{ + atomic_dec(&ct_priv->num_offloaded_flows); + + if (mlx5_tc_ct_entry_in_ct_nat_table(entry)) + mlx5_tc_ct_entry_del_rule(ct_priv, entry, true); + if (mlx5_tc_ct_entry_in_ct_table(entry)) + mlx5_tc_ct_entry_del_rule(ct_priv, entry, false); + + atomic_dec(&ct_priv->dev->priv.ct_debugfs->stats.offloaded); +} + +static struct flow_action_entry * +mlx5_tc_ct_get_ct_metadata_action(struct flow_rule *flow_rule) +{ + struct flow_action *flow_action = &flow_rule->action; + struct flow_action_entry *act; + int i; + + flow_action_for_each(i, act, flow_action) { + if (act->id == FLOW_ACTION_CT_METADATA) + return act; + } + + return NULL; +} + +static int +mlx5_tc_ct_entry_set_registers(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5e_tc_mod_hdr_acts *mod_acts, + u8 ct_state, + u32 mark, + u32 labels_id, + u8 zone_restore_id) +{ + enum mlx5_flow_namespace_type ns = ct_priv->ns_type; + struct mlx5_core_dev *dev = ct_priv->dev; + int err; + + err = mlx5e_tc_match_to_reg_set(dev, mod_acts, ns, + CTSTATE_TO_REG, ct_state); + if (err) + return err; + + err = mlx5e_tc_match_to_reg_set(dev, mod_acts, ns, + MARK_TO_REG, mark); + if (err) + return err; + + err = mlx5e_tc_match_to_reg_set(dev, mod_acts, ns, + LABELS_TO_REG, labels_id); + if (err) + return err; + + err = mlx5e_tc_match_to_reg_set(dev, mod_acts, ns, + ZONE_RESTORE_TO_REG, zone_restore_id); + if (err) + return err; + + /* Make another copy of zone id in reg_b for + * NIC rx flows since we don't copy reg_c1 to + * reg_b upon miss. 
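mlx5_tc_ct_entry_set_registers() above programs a header rewrite that stores ct_state, mark, the label-mapping id and the zone-restore id in metadata registers, with an extra zone copy into reg_b for NIC RX as the comment notes, so software can restore the conntrack context after a miss. A compact model of that pack/unpack round trip (the register layout below is invented purely for illustration and is not the driver's reg_c mapping):

#include <stdint.h>
#include <stdio.h>

/* Invented layout: 8-bit ct_state, 8-bit zone-restore id and 16-bit label id
 * packed into one 32-bit "register", plus a separate 32-bit mark register.
 */
struct model_regs {
    uint32_t reg_meta;
    uint32_t reg_mark;
};

static void model_set_registers(struct model_regs *r, uint8_t ct_state,
                                uint32_t mark, uint16_t labels_id,
                                uint8_t zone_restore)
{
    r->reg_meta = (uint32_t)ct_state |
                  ((uint32_t)zone_restore << 8) |
                  ((uint32_t)labels_id << 16);
    r->reg_mark = mark;
}

static void model_restore(const struct model_regs *r)
{
    printf("ct_state=0x%x zone=%u labels_id=%u mark=0x%x\n",
           r->reg_meta & 0xff, (r->reg_meta >> 8) & 0xff,
           (r->reg_meta >> 16) & 0xffff, r->reg_mark);
}

int main(void)
{
    struct model_regs r;

    model_set_registers(&r, 0x6 /* +trk+est */, 0xdead, 3, 7);
    model_restore(&r);
    return 0;
}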
+ */ + if (ns != MLX5_FLOW_NAMESPACE_FDB) { + err = mlx5e_tc_match_to_reg_set(dev, mod_acts, ns, + NIC_ZONE_RESTORE_TO_REG, zone_restore_id); + if (err) + return err; + } + return 0; +} + +int mlx5_tc_ct_set_ct_clear_regs(struct mlx5_tc_ct_priv *priv, + struct mlx5e_tc_mod_hdr_acts *mod_acts) +{ + return mlx5_tc_ct_entry_set_registers(priv, mod_acts, 0, 0, 0, 0); +} + +static int +mlx5_tc_ct_parse_mangle_to_mod_act(struct flow_action_entry *act, + char *modact) +{ + u32 offset = act->mangle.offset, field; + + switch (act->mangle.htype) { + case FLOW_ACT_MANGLE_HDR_TYPE_IP4: + MLX5_SET(set_action_in, modact, length, 0); + if (offset == offsetof(struct iphdr, saddr)) + field = MLX5_ACTION_IN_FIELD_OUT_SIPV4; + else if (offset == offsetof(struct iphdr, daddr)) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV4; + else + return -EOPNOTSUPP; + break; + + case FLOW_ACT_MANGLE_HDR_TYPE_IP6: + MLX5_SET(set_action_in, modact, length, 0); + if (offset == offsetof(struct ipv6hdr, saddr) + 12) + field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0; + else if (offset == offsetof(struct ipv6hdr, saddr) + 8) + field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32; + else if (offset == offsetof(struct ipv6hdr, saddr) + 4) + field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64; + else if (offset == offsetof(struct ipv6hdr, saddr)) + field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96; + else if (offset == offsetof(struct ipv6hdr, daddr) + 12) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0; + else if (offset == offsetof(struct ipv6hdr, daddr) + 8) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32; + else if (offset == offsetof(struct ipv6hdr, daddr) + 4) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64; + else if (offset == offsetof(struct ipv6hdr, daddr)) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96; + else + return -EOPNOTSUPP; + break; + + case FLOW_ACT_MANGLE_HDR_TYPE_TCP: + MLX5_SET(set_action_in, modact, length, 16); + if (offset == offsetof(struct tcphdr, source)) + field = MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT; + else if (offset == offsetof(struct tcphdr, dest)) + field = MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT; + else + return -EOPNOTSUPP; + break; + + case FLOW_ACT_MANGLE_HDR_TYPE_UDP: + MLX5_SET(set_action_in, modact, length, 16); + if (offset == offsetof(struct udphdr, source)) + field = MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT; + else if (offset == offsetof(struct udphdr, dest)) + field = MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT; + else + return -EOPNOTSUPP; + break; + + default: + return -EOPNOTSUPP; + } + + MLX5_SET(set_action_in, modact, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, modact, offset, 0); + MLX5_SET(set_action_in, modact, field, field); + MLX5_SET(set_action_in, modact, data, act->mangle.val); + + return 0; +} + +static int +mlx5_tc_ct_entry_create_nat(struct mlx5_tc_ct_priv *ct_priv, + struct flow_rule *flow_rule, + struct mlx5e_tc_mod_hdr_acts *mod_acts) +{ + struct flow_action *flow_action = &flow_rule->action; + struct mlx5_core_dev *mdev = ct_priv->dev; + struct flow_action_entry *act; + char *modact; + int err, i; + + flow_action_for_each(i, act, flow_action) { + switch (act->id) { + case FLOW_ACTION_MANGLE: { + modact = mlx5e_mod_hdr_alloc(mdev, ct_priv->ns_type, mod_acts); + if (IS_ERR(modact)) + return PTR_ERR(modact); + + err = mlx5_tc_ct_parse_mangle_to_mod_act(act, modact); + if (err) + return err; + + mod_acts->num_actions++; + } + break; + + case FLOW_ACTION_CT_METADATA: + /* Handled earlier */ + continue; + default: + return -EOPNOTSUPP; + } + } + + return 0; +} + +static int 
+mlx5_tc_ct_entry_create_mod_hdr(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_flow_attr *attr, + struct flow_rule *flow_rule, + struct mlx5e_mod_hdr_handle **mh, + u8 zone_restore_id, bool nat) +{ + DECLARE_MOD_HDR_ACTS_ACTIONS(actions_arr, MLX5_CT_MIN_MOD_ACTS); + DECLARE_MOD_HDR_ACTS(mod_acts, actions_arr); + struct flow_action_entry *meta; + u16 ct_state = 0; + int err; + + meta = mlx5_tc_ct_get_ct_metadata_action(flow_rule); + if (!meta) + return -EOPNOTSUPP; + + err = mlx5_get_label_mapping(ct_priv, meta->ct_metadata.labels, + &attr->ct_attr.ct_labels_id); + if (err) + return -EOPNOTSUPP; + if (nat) { + err = mlx5_tc_ct_entry_create_nat(ct_priv, flow_rule, + &mod_acts); + if (err) + goto err_mapping; + + ct_state |= MLX5_CT_STATE_NAT_BIT; + } + + ct_state |= MLX5_CT_STATE_ESTABLISHED_BIT | MLX5_CT_STATE_TRK_BIT; + ct_state |= meta->ct_metadata.orig_dir ? 0 : MLX5_CT_STATE_REPLY_BIT; + err = mlx5_tc_ct_entry_set_registers(ct_priv, &mod_acts, + ct_state, + meta->ct_metadata.mark, + attr->ct_attr.ct_labels_id, + zone_restore_id); + if (err) + goto err_mapping; + + if (nat) { + attr->modify_hdr = mlx5_modify_header_alloc(ct_priv->dev, ct_priv->ns_type, + mod_acts.num_actions, + mod_acts.actions); + if (IS_ERR(attr->modify_hdr)) { + err = PTR_ERR(attr->modify_hdr); + goto err_mapping; + } + + *mh = NULL; + } else { + *mh = mlx5e_mod_hdr_attach(ct_priv->dev, + ct_priv->mod_hdr_tbl, + ct_priv->ns_type, + &mod_acts); + if (IS_ERR(*mh)) { + err = PTR_ERR(*mh); + goto err_mapping; + } + attr->modify_hdr = mlx5e_mod_hdr_get(*mh); + } + + mlx5e_mod_hdr_dealloc(&mod_acts); + return 0; + +err_mapping: + mlx5e_mod_hdr_dealloc(&mod_acts); + mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id); + return err; +} + +static void +mlx5_tc_ct_entry_destroy_mod_hdr(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_flow_attr *attr, + struct mlx5e_mod_hdr_handle *mh) +{ + if (mh) + mlx5e_mod_hdr_detach(ct_priv->dev, ct_priv->mod_hdr_tbl, mh); + else + mlx5_modify_header_dealloc(ct_priv->dev, attr->modify_hdr); +} + +static int +mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv, + struct flow_rule *flow_rule, + struct mlx5_ct_entry *entry, + bool nat, u8 zone_restore_id) +{ + struct mlx5_ct_zone_rule *zone_rule = &entry->zone_rules[nat]; + struct mlx5e_priv *priv = netdev_priv(ct_priv->netdev); + struct mlx5_flow_spec *spec = NULL; + struct mlx5_flow_attr *attr; + int err; + + zone_rule->nat = nat; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + attr = mlx5_alloc_flow_attr(ct_priv->ns_type); + if (!attr) { + err = -ENOMEM; + goto err_attr; + } + + err = mlx5_tc_ct_entry_create_mod_hdr(ct_priv, attr, flow_rule, + &zone_rule->mh, + zone_restore_id, nat); + if (err) { + ct_dbg("Failed to create ct entry mod hdr"); + goto err_mod_hdr; + } + + attr->action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + attr->dest_chain = 0; + attr->dest_ft = mlx5e_tc_post_act_get_ft(ct_priv->post_act); + attr->ft = nat ? 
ct_priv->ct_nat : ct_priv->ct; + if (entry->tuple.ip_proto == IPPROTO_TCP || + entry->tuple.ip_proto == IPPROTO_UDP) + attr->outer_match_level = MLX5_MATCH_L4; + else + attr->outer_match_level = MLX5_MATCH_L3; + attr->counter = entry->counter->counter; + attr->flags |= MLX5_ATTR_FLAG_NO_IN_PORT; + if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB) + attr->esw_attr->in_mdev = priv->mdev; + + mlx5_tc_ct_set_tuple_match(ct_priv, spec, flow_rule); + mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, entry->tuple.zone, MLX5_CT_ZONE_MASK); + + zone_rule->rule = ct_priv->fs_ops->ct_rule_add(ct_priv->fs, spec, attr, flow_rule); + if (IS_ERR(zone_rule->rule)) { + err = PTR_ERR(zone_rule->rule); + ct_dbg("Failed to add ct entry rule, nat: %d", nat); + goto err_rule; + } + + zone_rule->attr = attr; + + kvfree(spec); + ct_dbg("Offloaded ct entry rule in zone %d", entry->tuple.zone); + + return 0; + +err_rule: + mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, zone_rule->attr, zone_rule->mh); + mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id); +err_mod_hdr: + kfree(attr); +err_attr: + kvfree(spec); + return err; +} + +static bool +mlx5_tc_ct_entry_valid(struct mlx5_ct_entry *entry) +{ + return test_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); +} + +static bool +mlx5_tc_ct_dup_nat_entries(struct mlx5_tc_ct_priv *ct_priv) +{ + return ct_priv->dev->mlx5e_res.ct.ct_action_on_nat_conns; +} + +static struct mlx5_ct_entry * +mlx5_tc_ct_entry_get(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_tuple *tuple) +{ + struct mlx5_ct_entry *entry; + + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, tuple, + tuples_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) { + return entry; + } else if (!entry) { + entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_nat_ht, + tuple, tuples_nat_ht_params); + if (entry && mlx5_tc_ct_entry_valid(entry) && + refcount_inc_not_zero(&entry->refcnt)) + return entry; + } + + return entry ? 
ERR_PTR(-EINVAL) : NULL; +} + +static void mlx5_tc_ct_entry_remove_from_tuples(struct mlx5_ct_entry *entry) +{ + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + if (mlx5_tc_ct_entry_in_ct_table(entry)) + rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, + tuples_ht_params); + if (mlx5_tc_ct_entry_in_ct_nat_table(entry)) + rhashtable_remove_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); +} + +static void mlx5_tc_ct_entry_del(struct mlx5_ct_entry *entry) +{ + struct mlx5_tc_ct_priv *ct_priv = entry->ct_priv; + + mlx5_tc_ct_entry_del_rules(ct_priv, entry); + + spin_lock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_remove_from_tuples(entry); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_counter_put(ct_priv, entry); + kfree(entry); +} + +static void +mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) +{ + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + mlx5_tc_ct_entry_del(entry); +} + +static void mlx5_tc_ct_entry_del_work(struct work_struct *work) +{ + struct mlx5_ct_entry *entry = container_of(work, struct mlx5_ct_entry, work); + + mlx5_tc_ct_entry_del(entry); +} + +static void +__mlx5_tc_ct_entry_put(struct mlx5_ct_entry *entry) +{ + if (!refcount_dec_and_test(&entry->refcnt)) + return; + + INIT_WORK(&entry->work, mlx5_tc_ct_entry_del_work); + queue_work(entry->ct_priv->wq, &entry->work); +} + +static struct mlx5_ct_counter * +mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv) +{ + struct mlx5_ct_counter *counter; + int ret; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + + counter->is_shared = false; + counter->counter = mlx5_fc_create(ct_priv->dev, true); + if (IS_ERR(counter->counter)) { + ct_dbg("Failed to create counter for ct entry"); + ret = PTR_ERR(counter->counter); + kfree(counter); + return ERR_PTR(ret); + } + + return counter; +} + +static struct mlx5_ct_counter * +mlx5_tc_ct_shared_counter_get(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_ct_entry *entry) +{ + struct mlx5_ct_tuple rev_tuple = entry->tuple; + struct mlx5_ct_counter *shared_counter; + struct mlx5_ct_entry *rev_entry; + + /* get the reversed tuple */ + swap(rev_tuple.port.src, rev_tuple.port.dst); + + if (rev_tuple.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + __be32 tmp_addr = rev_tuple.ip.src_v4; + + rev_tuple.ip.src_v4 = rev_tuple.ip.dst_v4; + rev_tuple.ip.dst_v4 = tmp_addr; + } else if (rev_tuple.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct in6_addr tmp_addr = rev_tuple.ip.src_v6; + + rev_tuple.ip.src_v6 = rev_tuple.ip.dst_v6; + rev_tuple.ip.dst_v6 = tmp_addr; + } else { + return ERR_PTR(-EOPNOTSUPP); + } + + /* Use the same counter as the reverse direction */ + spin_lock_bh(&ct_priv->ht_lock); + rev_entry = mlx5_tc_ct_entry_get(ct_priv, &rev_tuple); + + if (IS_ERR(rev_entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + goto create_counter; + } + + if (rev_entry && refcount_inc_not_zero(&rev_entry->counter->refcount)) { + ct_dbg("Using shared counter entry=0x%p rev=0x%p", entry, rev_entry); + shared_counter = rev_entry->counter; + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(rev_entry); + return shared_counter; + } + + spin_unlock_bh(&ct_priv->ht_lock); + +create_counter: + + shared_counter = mlx5_tc_ct_counter_create(ct_priv); + if (IS_ERR(shared_counter)) + return shared_counter; + + shared_counter->is_shared = true; + refcount_set(&shared_counter->refcount, 1); + return shared_counter; +} + +static int +mlx5_tc_ct_entry_add_rules(struct mlx5_tc_ct_priv 
*ct_priv, + struct flow_rule *flow_rule, + struct mlx5_ct_entry *entry, + u8 zone_restore_id) +{ + int err; + + if (nf_ct_acct_enabled(dev_net(ct_priv->netdev))) + entry->counter = mlx5_tc_ct_counter_create(ct_priv); + else + entry->counter = mlx5_tc_ct_shared_counter_get(ct_priv, entry); + + if (IS_ERR(entry->counter)) { + err = PTR_ERR(entry->counter); + return err; + } + + if (mlx5_tc_ct_entry_in_ct_table(entry)) { + err = mlx5_tc_ct_entry_add_rule(ct_priv, flow_rule, entry, false, + zone_restore_id); + if (err) + goto err_orig; + } + + if (mlx5_tc_ct_entry_in_ct_nat_table(entry)) { + err = mlx5_tc_ct_entry_add_rule(ct_priv, flow_rule, entry, true, + zone_restore_id); + if (err) + goto err_nat; + } + + atomic_inc(&ct_priv->dev->priv.ct_debugfs->stats.offloaded); + return 0; + +err_nat: + if (mlx5_tc_ct_entry_in_ct_table(entry)) + mlx5_tc_ct_entry_del_rule(ct_priv, entry, false); +err_orig: + mlx5_tc_ct_counter_put(ct_priv, entry); + return err; +} + +static int +mlx5_tc_ct_block_flow_offload_add(struct mlx5_ct_ft *ft, + struct flow_cls_offload *flow) +{ + struct flow_rule *flow_rule = flow_cls_offload_flow_rule(flow); + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; + struct flow_action_entry *meta_action; + unsigned long cookie = flow->cookie; + struct mlx5_ct_entry *entry; + int err; + + /* Two rules inserted per connection */ + if (atomic_read(&ct_priv->num_offloaded_flows) >= + mlx5_tc_ct_max_offloaded_conns_get(ct_priv->dev) * 2) + return -ENOSPC; + + meta_action = mlx5_tc_ct_get_ct_metadata_action(flow_rule); + if (!meta_action) + return -EOPNOTSUPP; + + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (entry && refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + mlx5_tc_ct_entry_put(entry); + return -EEXIST; + } + spin_unlock_bh(&ct_priv->ht_lock); + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->tuple.zone = ft->zone; + entry->cookie = flow->cookie; + entry->restore_cookie = meta_action->ct_metadata.cookie; + refcount_set(&entry->refcnt, 2); + entry->ct_priv = ct_priv; + + err = mlx5_tc_ct_rule_to_tuple(&entry->tuple, flow_rule); + if (err) + goto err_set; + + memcpy(&entry->tuple_nat, &entry->tuple, sizeof(entry->tuple)); + err = mlx5_tc_ct_rule_to_tuple_nat(&entry->tuple_nat, flow_rule); + if (err) + goto err_set; + + spin_lock_bh(&ct_priv->ht_lock); + + err = rhashtable_lookup_insert_fast(&ft->ct_entries_ht, &entry->node, + cts_ht_params); + if (err) + goto err_entries; + + if (memcmp(&entry->tuple, &entry->tuple_nat, sizeof(entry->tuple))) { + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_nat_ht, + &entry->tuple_nat_node, + tuples_nat_ht_params); + if (err) + goto err_tuple_nat; + + set_bit(MLX5_CT_ENTRY_IN_CT_NAT_TABLE, &entry->flags); + } + + if (!mlx5_tc_ct_entry_in_ct_nat_table(entry) || mlx5_tc_ct_dup_nat_entries(ct_priv)) { + err = rhashtable_lookup_insert_fast(&ct_priv->ct_tuples_ht, + &entry->tuple_node, + tuples_ht_params); + if (err) + goto err_tuple; + + set_bit(MLX5_CT_ENTRY_IN_CT_TABLE, &entry->flags); + } + spin_unlock_bh(&ct_priv->ht_lock); + + err = mlx5_tc_ct_entry_add_rules(ct_priv, flow_rule, entry, + ft->zone_restore_id); + if (err) + goto err_rules; + + set_bit(MLX5_CT_ENTRY_FLAG_VALID, &entry->flags); + mlx5_tc_ct_entry_put(entry); /* this function reference */ + + atomic_inc(&ct_priv->num_offloaded_flows); + + return 0; + +err_rules: + spin_lock_bh(&ct_priv->ht_lock); +err_tuple: + 
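+ /* Error unwind: drop the entry from the tuple tables and from the
+ * per-zone ct_entries_ht under ht_lock, then free it.
+ */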
mlx5_tc_ct_entry_remove_from_tuples(entry); +err_tuple_nat: + rhashtable_remove_fast(&ft->ct_entries_ht, &entry->node, cts_ht_params); +err_entries: + spin_unlock_bh(&ct_priv->ht_lock); +err_set: + kfree(entry); + if (err != -EEXIST) + netdev_warn(ct_priv->netdev, "Failed to offload ct entry, err: %d\n", err); + return err; +} + +static int +mlx5_tc_ct_block_flow_offload_del(struct mlx5_ct_ft *ft, + struct flow_cls_offload *flow) +{ + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; + unsigned long cookie = flow->cookie; + struct mlx5_ct_entry *entry; + + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); + return -ENOENT; + } + + if (!mlx5_tc_ct_entry_valid(entry)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + rhashtable_remove_fast(&ft->ct_entries_ht, &entry->node, cts_ht_params); + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_tc_ct_entry_put(entry); + + return 0; +} + +static int +mlx5_tc_ct_block_flow_offload_stats(struct mlx5_ct_ft *ft, + struct flow_cls_offload *f) +{ + struct mlx5_tc_ct_priv *ct_priv = ft->ct_priv; + unsigned long cookie = f->cookie; + struct mlx5_ct_entry *entry; + u64 lastuse, packets, bytes; + + spin_lock_bh(&ct_priv->ht_lock); + entry = rhashtable_lookup_fast(&ft->ct_entries_ht, &cookie, cts_ht_params); + if (!entry) { + spin_unlock_bh(&ct_priv->ht_lock); + return -ENOENT; + } + + if (!mlx5_tc_ct_entry_valid(entry) || !refcount_inc_not_zero(&entry->refcnt)) { + spin_unlock_bh(&ct_priv->ht_lock); + return -EINVAL; + } + + spin_unlock_bh(&ct_priv->ht_lock); + + mlx5_fc_query_cached(entry->counter->counter, &bytes, &packets, &lastuse); + flow_stats_update(&f->stats, bytes, packets, 0, lastuse, + FLOW_ACTION_HW_STATS_DELAYED); + + mlx5_tc_ct_entry_put(entry); + return 0; +} + +static int +mlx5_tc_ct_block_flow_offload(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct flow_cls_offload *f = type_data; + struct mlx5_ct_ft *ft = cb_priv; + + if (type != TC_SETUP_CLSFLOWER) + return -EOPNOTSUPP; + + switch (f->command) { + case FLOW_CLS_REPLACE: + return mlx5_tc_ct_block_flow_offload_add(ft, f); + case FLOW_CLS_DESTROY: + return mlx5_tc_ct_block_flow_offload_del(ft, f); + case FLOW_CLS_STATS: + return mlx5_tc_ct_block_flow_offload_stats(ft, f); + default: + break; + } + + return -EOPNOTSUPP; +} + +static bool +mlx5_tc_ct_skb_to_tuple(struct sk_buff *skb, struct mlx5_ct_tuple *tuple, + u16 zone) +{ + struct flow_keys flow_keys; + + skb_reset_network_header(skb); + skb_flow_dissect_flow_keys(skb, &flow_keys, FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP); + + tuple->zone = zone; + + if (flow_keys.basic.ip_proto != IPPROTO_TCP && + flow_keys.basic.ip_proto != IPPROTO_UDP && + flow_keys.basic.ip_proto != IPPROTO_GRE) + return false; + + if (flow_keys.basic.ip_proto == IPPROTO_TCP || + flow_keys.basic.ip_proto == IPPROTO_UDP) { + tuple->port.src = flow_keys.ports.src; + tuple->port.dst = flow_keys.ports.dst; + } + tuple->n_proto = flow_keys.basic.n_proto; + tuple->ip_proto = flow_keys.basic.ip_proto; + + switch (flow_keys.basic.n_proto) { + case htons(ETH_P_IP): + tuple->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + tuple->ip.src_v4 = flow_keys.addrs.v4addrs.src; + tuple->ip.dst_v4 = flow_keys.addrs.v4addrs.dst; + break; + + case htons(ETH_P_IPV6): + tuple->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + tuple->ip.src_v6 = flow_keys.addrs.v6addrs.src; + tuple->ip.dst_v6 = flow_keys.addrs.v6addrs.dst; + break; + default: + goto out; + } + + 
return true; + +out: + return false; +} + +int mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec) +{ + u32 ctstate = 0, ctstate_mask = 0; + + mlx5e_tc_match_to_reg_get_match(spec, CTSTATE_TO_REG, + &ctstate, &ctstate_mask); + + if ((ctstate & ctstate_mask) == MLX5_CT_STATE_TRK_BIT) + return -EOPNOTSUPP; + + ctstate_mask |= MLX5_CT_STATE_TRK_BIT; + mlx5e_tc_match_to_reg_match(spec, CTSTATE_TO_REG, + ctstate, ctstate_mask); + + return 0; +} + +void mlx5_tc_ct_match_del(struct mlx5_tc_ct_priv *priv, struct mlx5_ct_attr *ct_attr) +{ + if (!priv || !ct_attr->ct_labels_id) + return; + + mlx5_put_label_mapping(priv, ct_attr->ct_labels_id); +} + +int +mlx5_tc_ct_match_add(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + struct mlx5_ct_attr *ct_attr, + struct netlink_ext_ack *extack) +{ + bool trk, est, untrk, unest, new, rpl, unrpl, rel, unrel, inv, uninv; + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_dissector_key_ct *mask, *key; + u32 ctstate = 0, ctstate_mask = 0; + u16 ct_state_on, ct_state_off; + u16 ct_state, ct_state_mask; + struct flow_match_ct match; + u32 ct_labels[4]; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CT)) + return 0; + + if (!priv) { + NL_SET_ERR_MSG_MOD(extack, + "offload of ct matching isn't available"); + return -EOPNOTSUPP; + } + + flow_rule_match_ct(rule, &match); + + key = match.key; + mask = match.mask; + + ct_state = key->ct_state; + ct_state_mask = mask->ct_state; + + if (ct_state_mask & ~(TCA_FLOWER_KEY_CT_FLAGS_TRACKED | + TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED | + TCA_FLOWER_KEY_CT_FLAGS_NEW | + TCA_FLOWER_KEY_CT_FLAGS_REPLY | + TCA_FLOWER_KEY_CT_FLAGS_RELATED | + TCA_FLOWER_KEY_CT_FLAGS_INVALID)) { + NL_SET_ERR_MSG_MOD(extack, + "only ct_state trk, est, new and rpl are supported for offload"); + return -EOPNOTSUPP; + } + + ct_state_on = ct_state & ct_state_mask; + ct_state_off = (ct_state & ct_state_mask) ^ ct_state_mask; + trk = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_TRACKED; + new = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_NEW; + est = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED; + rpl = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_REPLY; + rel = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_RELATED; + inv = ct_state_on & TCA_FLOWER_KEY_CT_FLAGS_INVALID; + untrk = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_TRACKED; + unest = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED; + unrpl = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_REPLY; + unrel = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_RELATED; + uninv = ct_state_off & TCA_FLOWER_KEY_CT_FLAGS_INVALID; + + ctstate |= trk ? MLX5_CT_STATE_TRK_BIT : 0; + ctstate |= est ? MLX5_CT_STATE_ESTABLISHED_BIT : 0; + ctstate |= rpl ? MLX5_CT_STATE_REPLY_BIT : 0; + ctstate_mask |= (untrk || trk) ? MLX5_CT_STATE_TRK_BIT : 0; + ctstate_mask |= (unest || est) ? MLX5_CT_STATE_ESTABLISHED_BIT : 0; + ctstate_mask |= (unrpl || rpl) ? MLX5_CT_STATE_REPLY_BIT : 0; + ctstate_mask |= unrel ? MLX5_CT_STATE_RELATED_BIT : 0; + ctstate_mask |= uninv ? 
MLX5_CT_STATE_INVALID_BIT : 0; + + if (rel) { + NL_SET_ERR_MSG_MOD(extack, + "matching on ct_state +rel isn't supported"); + return -EOPNOTSUPP; + } + + if (inv) { + NL_SET_ERR_MSG_MOD(extack, + "matching on ct_state +inv isn't supported"); + return -EOPNOTSUPP; + } + + if (new) { + NL_SET_ERR_MSG_MOD(extack, + "matching on ct_state +new isn't supported"); + return -EOPNOTSUPP; + } + + if (mask->ct_zone) + mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, + key->ct_zone, MLX5_CT_ZONE_MASK); + if (ctstate_mask) + mlx5e_tc_match_to_reg_match(spec, CTSTATE_TO_REG, + ctstate, ctstate_mask); + if (mask->ct_mark) + mlx5e_tc_match_to_reg_match(spec, MARK_TO_REG, + key->ct_mark, mask->ct_mark); + if (mask->ct_labels[0] || mask->ct_labels[1] || mask->ct_labels[2] || + mask->ct_labels[3]) { + ct_labels[0] = key->ct_labels[0] & mask->ct_labels[0]; + ct_labels[1] = key->ct_labels[1] & mask->ct_labels[1]; + ct_labels[2] = key->ct_labels[2] & mask->ct_labels[2]; + ct_labels[3] = key->ct_labels[3] & mask->ct_labels[3]; + if (mlx5_get_label_mapping(priv, ct_labels, &ct_attr->ct_labels_id)) + return -EOPNOTSUPP; + mlx5e_tc_match_to_reg_match(spec, LABELS_TO_REG, ct_attr->ct_labels_id, + priv->use_label_mapping ? MLX5_CT_LABELS_MASK : + mask->ct_labels[0]); + } + + return 0; +} + +int +mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_acts, + const struct flow_action_entry *act, + struct netlink_ext_ack *extack) +{ + if (!priv) { + NL_SET_ERR_MSG_MOD(extack, + "offload of ct action isn't available"); + return -EOPNOTSUPP; + } + + attr->ct_attr.zone = act->ct.zone; + attr->ct_attr.ct_action = act->ct.action; + attr->ct_attr.nf_ft = act->ct.flow_table; + + return 0; +} + +static int tc_ct_pre_ct_add_rules(struct mlx5_ct_ft *ct_ft, + struct mlx5_tc_ct_pre *pre_ct, + bool nat) +{ + struct mlx5_tc_ct_priv *ct_priv = ct_ft->ct_priv; + struct mlx5e_tc_mod_hdr_acts pre_mod_acts = {}; + struct mlx5_core_dev *dev = ct_priv->dev; + struct mlx5_flow_table *ft = pre_ct->ft; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_modify_hdr *mod_hdr; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + u32 ctstate; + u16 zone; + int err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + zone = ct_ft->zone & MLX5_CT_ZONE_MASK; + err = mlx5e_tc_match_to_reg_set(dev, &pre_mod_acts, ct_priv->ns_type, + ZONE_TO_REG, zone); + if (err) { + ct_dbg("Failed to set zone register mapping"); + goto err_mapping; + } + + mod_hdr = mlx5_modify_header_alloc(dev, ct_priv->ns_type, + pre_mod_acts.num_actions, + pre_mod_acts.actions); + + if (IS_ERR(mod_hdr)) { + err = PTR_ERR(mod_hdr); + ct_dbg("Failed to create pre ct mod hdr"); + goto err_mapping; + } + pre_ct->modify_hdr = mod_hdr; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + flow_act.modify_hdr = mod_hdr; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + + /* add flow rule */ + mlx5e_tc_match_to_reg_match(spec, ZONE_TO_REG, + zone, MLX5_CT_ZONE_MASK); + ctstate = MLX5_CT_STATE_TRK_BIT; + if (nat) + ctstate |= MLX5_CT_STATE_NAT_BIT; + mlx5e_tc_match_to_reg_match(spec, CTSTATE_TO_REG, ctstate, ctstate); + + dest.ft = mlx5e_tc_post_act_get_ft(ct_priv->post_act); + rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + ct_dbg("Failed to add pre ct flow rule zone %d", zone); + goto 
err_flow_rule; + } + pre_ct->flow_rule = rule; + + /* add miss rule */ + dest.ft = nat ? ct_priv->ct_nat : ct_priv->ct; + rule = mlx5_add_flow_rules(ft, NULL, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + ct_dbg("Failed to add pre ct miss rule zone %d", zone); + goto err_miss_rule; + } + pre_ct->miss_rule = rule; + + mlx5e_mod_hdr_dealloc(&pre_mod_acts); + kvfree(spec); + return 0; + +err_miss_rule: + mlx5_del_flow_rules(pre_ct->flow_rule); +err_flow_rule: + mlx5_modify_header_dealloc(dev, pre_ct->modify_hdr); +err_mapping: + mlx5e_mod_hdr_dealloc(&pre_mod_acts); + kvfree(spec); + return err; +} + +static void +tc_ct_pre_ct_del_rules(struct mlx5_ct_ft *ct_ft, + struct mlx5_tc_ct_pre *pre_ct) +{ + struct mlx5_tc_ct_priv *ct_priv = ct_ft->ct_priv; + struct mlx5_core_dev *dev = ct_priv->dev; + + mlx5_del_flow_rules(pre_ct->flow_rule); + mlx5_del_flow_rules(pre_ct->miss_rule); + mlx5_modify_header_dealloc(dev, pre_ct->modify_hdr); +} + +static int +mlx5_tc_ct_alloc_pre_ct(struct mlx5_ct_ft *ct_ft, + struct mlx5_tc_ct_pre *pre_ct, + bool nat) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_tc_ct_priv *ct_priv = ct_ft->ct_priv; + struct mlx5_core_dev *dev = ct_priv->dev; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + u32 metadata_reg_c_2_mask; + u32 *flow_group_in; + void *misc; + int err; + + ns = mlx5_get_flow_namespace(dev, ct_priv->ns_type); + if (!ns) { + err = -EOPNOTSUPP; + ct_dbg("Failed to get flow namespace"); + return err; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + ft_attr.flags = MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.prio = ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB ? 
+ FDB_TC_OFFLOAD : MLX5E_TC_PRIO; + ft_attr.max_fte = 2; + ft_attr.level = 1; + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + ct_dbg("Failed to create pre ct table"); + goto out_free; + } + pre_ct->ft = ft; + + /* create flow group */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + + misc = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria.misc_parameters_2); + + metadata_reg_c_2_mask = MLX5_CT_ZONE_MASK; + metadata_reg_c_2_mask |= (MLX5_CT_STATE_TRK_BIT << 16); + if (nat) + metadata_reg_c_2_mask |= (MLX5_CT_STATE_NAT_BIT << 16); + + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_2, + metadata_reg_c_2_mask); + + g = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + ct_dbg("Failed to create pre ct group"); + goto err_flow_grp; + } + pre_ct->flow_grp = g; + + /* create miss group */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + g = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + ct_dbg("Failed to create pre ct miss group"); + goto err_miss_grp; + } + pre_ct->miss_grp = g; + + err = tc_ct_pre_ct_add_rules(ct_ft, pre_ct, nat); + if (err) + goto err_add_rules; + + kvfree(flow_group_in); + return 0; + +err_add_rules: + mlx5_destroy_flow_group(pre_ct->miss_grp); +err_miss_grp: + mlx5_destroy_flow_group(pre_ct->flow_grp); +err_flow_grp: + mlx5_destroy_flow_table(ft); +out_free: + kvfree(flow_group_in); + return err; +} + +static void +mlx5_tc_ct_free_pre_ct(struct mlx5_ct_ft *ct_ft, + struct mlx5_tc_ct_pre *pre_ct) +{ + tc_ct_pre_ct_del_rules(ct_ft, pre_ct); + mlx5_destroy_flow_group(pre_ct->miss_grp); + mlx5_destroy_flow_group(pre_ct->flow_grp); + mlx5_destroy_flow_table(pre_ct->ft); +} + +static int +mlx5_tc_ct_alloc_pre_ct_tables(struct mlx5_ct_ft *ft) +{ + int err; + + err = mlx5_tc_ct_alloc_pre_ct(ft, &ft->pre_ct, false); + if (err) + return err; + + err = mlx5_tc_ct_alloc_pre_ct(ft, &ft->pre_ct_nat, true); + if (err) + goto err_pre_ct_nat; + + return 0; + +err_pre_ct_nat: + mlx5_tc_ct_free_pre_ct(ft, &ft->pre_ct); + return err; +} + +static void +mlx5_tc_ct_free_pre_ct_tables(struct mlx5_ct_ft *ft) +{ + mlx5_tc_ct_free_pre_ct(ft, &ft->pre_ct_nat); + mlx5_tc_ct_free_pre_ct(ft, &ft->pre_ct); +} + +/* To avoid false lock dependency warning set the ct_entries_ht lock + * class different than the lock class of the ht being used when deleting + * last flow from a group and then deleting a group, we get into del_sw_flow_group() + * which call rhashtable_destroy on fg->ftes_hash which will take ht->mutex but + * it's different than the ht->mutex here. 
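+ * lockdep_set_class() below therefore gives ct_entries_ht.mutex its own
+ * key (ct_entries_ht_lock_key), so the two mutexes end up in different
+ * lock classes.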
+ */ +static struct lock_class_key ct_entries_ht_lock_key; + +static struct mlx5_ct_ft * +mlx5_tc_ct_add_ft_cb(struct mlx5_tc_ct_priv *ct_priv, u16 zone, + struct nf_flowtable *nf_ft) +{ + struct mlx5_ct_ft *ft; + int err; + + ft = rhashtable_lookup_fast(&ct_priv->zone_ht, &zone, zone_params); + if (ft) { + refcount_inc(&ft->refcount); + return ft; + } + + ft = kzalloc(sizeof(*ft), GFP_KERNEL); + if (!ft) + return ERR_PTR(-ENOMEM); + + err = mapping_add(ct_priv->zone_mapping, &zone, &ft->zone_restore_id); + if (err) + goto err_mapping; + + ft->zone = zone; + ft->nf_ft = nf_ft; + ft->ct_priv = ct_priv; + refcount_set(&ft->refcount, 1); + + err = mlx5_tc_ct_alloc_pre_ct_tables(ft); + if (err) + goto err_alloc_pre_ct; + + err = rhashtable_init(&ft->ct_entries_ht, &cts_ht_params); + if (err) + goto err_init; + + lockdep_set_class(&ft->ct_entries_ht.mutex, &ct_entries_ht_lock_key); + + err = rhashtable_insert_fast(&ct_priv->zone_ht, &ft->node, + zone_params); + if (err) + goto err_insert; + + err = nf_flow_table_offload_add_cb(ft->nf_ft, + mlx5_tc_ct_block_flow_offload, ft); + if (err) + goto err_add_cb; + + return ft; + +err_add_cb: + rhashtable_remove_fast(&ct_priv->zone_ht, &ft->node, zone_params); +err_insert: + rhashtable_destroy(&ft->ct_entries_ht); +err_init: + mlx5_tc_ct_free_pre_ct_tables(ft); +err_alloc_pre_ct: + mapping_remove(ct_priv->zone_mapping, ft->zone_restore_id); +err_mapping: + kfree(ft); + return ERR_PTR(err); +} + +static void +mlx5_tc_ct_flush_ft_entry(void *ptr, void *arg) +{ + struct mlx5_ct_entry *entry = ptr; + + mlx5_tc_ct_entry_put(entry); +} + +static void +mlx5_tc_ct_del_ft_cb(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_ft *ft) +{ + struct mlx5e_priv *priv; + + if (!refcount_dec_and_test(&ft->refcount)) + return; + + flush_workqueue(ct_priv->wq); + nf_flow_table_offload_del_cb(ft->nf_ft, + mlx5_tc_ct_block_flow_offload, ft); + rhashtable_remove_fast(&ct_priv->zone_ht, &ft->node, zone_params); + rhashtable_free_and_destroy(&ft->ct_entries_ht, + mlx5_tc_ct_flush_ft_entry, + ct_priv); + priv = netdev_priv(ct_priv->netdev); + mlx5_tc_ct_free_pre_ct_tables(ft); + mapping_remove(ct_priv->zone_mapping, ft->zone_restore_id); + kfree(ft); +} + +/* We translate the tc filter with CT action to the following HW model: + * + * +---------------------+ + * + ft prio (tc chain) + + * + original match + + * +---------------------+ + * | set chain miss mapping + * | set fte_id + * | set tunnel_id + * | do decap + * v + * +---------------------+ + * + pre_ct/pre_ct_nat + if matches +-------------------------+ + * + zone+nat match +---------------->+ post_act (see below) + + * +---------------------+ set zone +-------------------------+ + * | set zone + * v + * +--------------------+ + * + CT (nat or no nat) + + * + tuple + zone match + + * +--------------------+ + * | set mark + * | set labels_id + * | set established + * | set zone_restore + * | do nat (if needed) + * v + * +--------------+ + * + post_act + original filter actions + * + fte_id match +------------------------> + * +--------------+ + */ +static struct mlx5_flow_handle * +__mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_flow_spec *orig_spec, + struct mlx5_flow_attr *attr) +{ + bool nat = attr->ct_attr.ct_action & TCA_CT_ACT_NAT; + struct mlx5e_priv *priv = netdev_priv(ct_priv->netdev); + struct mlx5e_tc_mod_hdr_acts *pre_mod_acts; + u32 attr_sz = ns_to_attr_sz(ct_priv->ns_type); + struct mlx5_flow_attr *pre_ct_attr; + struct mlx5_modify_hdr *mod_hdr; + struct mlx5_ct_flow *ct_flow; + int 
chain_mapping = 0, err; + struct mlx5_ct_ft *ft; + + ct_flow = kzalloc(sizeof(*ct_flow), GFP_KERNEL); + if (!ct_flow) { + kfree(ct_flow); + return ERR_PTR(-ENOMEM); + } + + /* Register for CT established events */ + ft = mlx5_tc_ct_add_ft_cb(ct_priv, attr->ct_attr.zone, + attr->ct_attr.nf_ft); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + ct_dbg("Failed to register to ft callback"); + goto err_ft; + } + ct_flow->ft = ft; + + /* Base flow attributes of both rules on original rule attribute */ + ct_flow->pre_ct_attr = mlx5_alloc_flow_attr(ct_priv->ns_type); + if (!ct_flow->pre_ct_attr) { + err = -ENOMEM; + goto err_alloc_pre; + } + + pre_ct_attr = ct_flow->pre_ct_attr; + memcpy(pre_ct_attr, attr, attr_sz); + pre_mod_acts = &pre_ct_attr->parse_attr->mod_hdr_acts; + + /* Modify the original rule's action to fwd and modify, leave decap */ + pre_ct_attr->action = attr->action & MLX5_FLOW_CONTEXT_ACTION_DECAP; + pre_ct_attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + /* Write chain miss tag for miss in ct table as we + * don't go though all prios of this chain as normal tc rules + * miss. + */ + err = mlx5_chains_get_chain_mapping(ct_priv->chains, attr->chain, + &chain_mapping); + if (err) { + ct_dbg("Failed to get chain register mapping for chain"); + goto err_get_chain; + } + ct_flow->chain_mapping = chain_mapping; + + err = mlx5e_tc_match_to_reg_set(priv->mdev, pre_mod_acts, ct_priv->ns_type, + CHAIN_TO_REG, chain_mapping); + if (err) { + ct_dbg("Failed to set chain register mapping"); + goto err_mapping; + } + + /* If original flow is decap, we do it before going into ct table + * so add a rewrite for the tunnel match_id. + */ + if ((pre_ct_attr->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) && + attr->chain == 0) { + err = mlx5e_tc_match_to_reg_set(priv->mdev, pre_mod_acts, + ct_priv->ns_type, + TUNNEL_TO_REG, + attr->tunnel_id); + if (err) { + ct_dbg("Failed to set tunnel register mapping"); + goto err_mapping; + } + } + + mod_hdr = mlx5_modify_header_alloc(priv->mdev, ct_priv->ns_type, + pre_mod_acts->num_actions, + pre_mod_acts->actions); + if (IS_ERR(mod_hdr)) { + err = PTR_ERR(mod_hdr); + ct_dbg("Failed to create pre ct mod hdr"); + goto err_mapping; + } + pre_ct_attr->modify_hdr = mod_hdr; + + /* Change original rule point to ct table */ + pre_ct_attr->dest_chain = 0; + pre_ct_attr->dest_ft = nat ? 
ft->pre_ct_nat.ft : ft->pre_ct.ft; + ct_flow->pre_ct_rule = mlx5_tc_rule_insert(priv, orig_spec, + pre_ct_attr); + if (IS_ERR(ct_flow->pre_ct_rule)) { + err = PTR_ERR(ct_flow->pre_ct_rule); + ct_dbg("Failed to add pre ct rule"); + goto err_insert_orig; + } + + attr->ct_attr.ct_flow = ct_flow; + mlx5e_mod_hdr_dealloc(pre_mod_acts); + + return ct_flow->pre_ct_rule; + +err_insert_orig: + mlx5_modify_header_dealloc(priv->mdev, pre_ct_attr->modify_hdr); +err_mapping: + mlx5e_mod_hdr_dealloc(pre_mod_acts); + mlx5_chains_put_chain_mapping(ct_priv->chains, ct_flow->chain_mapping); +err_get_chain: + kfree(ct_flow->pre_ct_attr); +err_alloc_pre: + mlx5_tc_ct_del_ft_cb(ct_priv, ft); +err_ft: + kfree(ct_flow); + netdev_warn(priv->netdev, "Failed to offload ct flow, err %d\n", err); + return ERR_PTR(err); +} + +struct mlx5_flow_handle * +mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts) +{ + struct mlx5_flow_handle *rule; + + if (!priv) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&priv->control_lock); + rule = __mlx5_tc_ct_flow_offload(priv, spec, attr); + mutex_unlock(&priv->control_lock); + + return rule; +} + +static void +__mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *ct_priv, + struct mlx5_ct_flow *ct_flow, + struct mlx5_flow_attr *attr) +{ + struct mlx5_flow_attr *pre_ct_attr = ct_flow->pre_ct_attr; + struct mlx5e_priv *priv = netdev_priv(ct_priv->netdev); + + mlx5_tc_rule_delete(priv, ct_flow->pre_ct_rule, pre_ct_attr); + mlx5_modify_header_dealloc(priv->mdev, pre_ct_attr->modify_hdr); + + mlx5_chains_put_chain_mapping(ct_priv->chains, ct_flow->chain_mapping); + mlx5_tc_ct_del_ft_cb(ct_priv, ct_flow->ft); + + kfree(ct_flow->pre_ct_attr); + kfree(ct_flow); +} + +void +mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5_ct_flow *ct_flow = attr->ct_attr.ct_flow; + + /* We are called on error to clean up stuff from parsing + * but we don't have anything for now + */ + if (!ct_flow) + return; + + mutex_lock(&priv->control_lock); + __mlx5_tc_ct_delete_flow(priv, ct_flow, attr); + mutex_unlock(&priv->control_lock); +} + +static int +mlx5_tc_ct_fs_init(struct mlx5_tc_ct_priv *ct_priv) +{ + struct mlx5_flow_table *post_ct = mlx5e_tc_post_act_get_ft(ct_priv->post_act); + struct mlx5_ct_fs_ops *fs_ops = mlx5_ct_fs_dmfs_ops_get(); + int err; + + if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB && + ct_priv->dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_SMFS) { + ct_dbg("Using SMFS ct flow steering provider"); + fs_ops = mlx5_ct_fs_smfs_ops_get(); + } + + ct_priv->fs = kzalloc(sizeof(*ct_priv->fs) + fs_ops->priv_size, GFP_KERNEL); + if (!ct_priv->fs) + return -ENOMEM; + + ct_priv->fs->netdev = ct_priv->netdev; + ct_priv->fs->dev = ct_priv->dev; + ct_priv->fs_ops = fs_ops; + + err = ct_priv->fs_ops->init(ct_priv->fs, ct_priv->ct, ct_priv->ct_nat, post_ct); + if (err) + goto err_init; + + return 0; + +err_init: + kfree(ct_priv->fs); + return err; +} + +static int +mlx5_tc_ct_init_check_esw_support(struct mlx5_eswitch *esw, + const char **err_msg) +{ + if (!mlx5_eswitch_vlan_actions_supported(esw->dev, 1)) { + /* vlan workaround should be avoided for multi chain rules. 
+ * This is just a sanity check as pop vlan action should + * be supported by any FW that supports ignore_flow_level + */ + + *err_msg = "firmware vlan actions support is missing"; + return -EOPNOTSUPP; + } + + if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, + fdb_modify_header_fwd_to_table)) { + /* CT always writes to registers which are mod header actions. + * Therefore, mod header and goto is required + */ + + *err_msg = "firmware fwd and modify support is missing"; + return -EOPNOTSUPP; + } + + if (!mlx5_eswitch_reg_c1_loopback_enabled(esw)) { + *err_msg = "register loopback isn't supported"; + return -EOPNOTSUPP; + } + + return 0; +} + +static int +mlx5_tc_ct_init_check_support(struct mlx5e_priv *priv, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_act) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + const char *err_msg = NULL; + int err = 0; + +#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + /* cannot restore chain ID on HW miss */ + + err_msg = "tc skb extension missing"; + err = -EOPNOTSUPP; + goto out_err; +#endif + if (IS_ERR_OR_NULL(post_act)) { + /* Ignore_flow_level support isn't supported by default for VFs and so post_act + * won't be supported. Skip showing error msg. + */ + if (priv->mdev->coredev_type != MLX5_COREDEV_VF) + err_msg = "post action is missing"; + err = -EOPNOTSUPP; + goto out_err; + } + + if (ns_type == MLX5_FLOW_NAMESPACE_FDB) + err = mlx5_tc_ct_init_check_esw_support(esw, &err_msg); + +out_err: + if (err && err_msg) + netdev_dbg(priv->netdev, "tc ct offload not supported, %s\n", err_msg); + return err; +} + +static struct mlx5_flow_handle * +tc_ct_add_miss_rule(struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act act = {}; + + act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL | FLOW_ACT_NO_APPEND; + act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = next_ft; + + return mlx5_add_flow_rules(ft, NULL, &act, &dest, 1); +} + +static int +tc_ct_add_ct_table_miss_rule(struct mlx5_flow_table *from, + struct mlx5_flow_table *to, + struct mlx5_flow_group **miss_group, + struct mlx5_flow_handle **miss_rule) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *group; + struct mlx5_flow_handle *rule; + unsigned int max_fte = from->max_fte; + u32 *flow_group_in; + int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + /* create miss group */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, + max_fte - 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + max_fte - 1); + group = mlx5_create_flow_group(from, flow_group_in); + if (IS_ERR(group)) { + err = PTR_ERR(group); + goto err_miss_grp; + } + + /* add miss rule to next fdb */ + rule = tc_ct_add_miss_rule(from, to); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto err_miss_rule; + } + + *miss_group = group; + *miss_rule = rule; + kvfree(flow_group_in); + return 0; + +err_miss_rule: + mlx5_destroy_flow_group(group); +err_miss_grp: + kvfree(flow_group_in); + return err; +} + +static void +tc_ct_del_ct_table_miss_rule(struct mlx5_flow_group *miss_group, + struct mlx5_flow_handle *miss_rule) +{ + mlx5_del_flow_rules(miss_rule); + mlx5_destroy_flow_group(miss_group); +} + +#define INIT_ERR_PREFIX "tc ct offload init failed" + +struct mlx5_tc_ct_priv * +mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, + struct mod_hdr_tbl *mod_hdr, + enum 
mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_act) +{ + struct mlx5_tc_ct_priv *ct_priv; + struct mlx5_core_dev *dev; + u64 mapping_id; + int err; + + dev = priv->mdev; + err = mlx5_tc_ct_init_check_support(priv, ns_type, post_act); + if (err) + goto err_support; + + ct_priv = kzalloc(sizeof(*ct_priv), GFP_KERNEL); + if (!ct_priv) + goto err_alloc; + + mapping_id = mlx5_query_nic_system_image_guid(dev); + + ct_priv->zone_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_ZONE, + sizeof(u16), 0, true); + if (IS_ERR(ct_priv->zone_mapping)) { + err = PTR_ERR(ct_priv->zone_mapping); + goto err_mapping_zone; + } + + ct_priv->labels_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_LABELS, + sizeof(u32) * 4, 0, true); + if (IS_ERR(ct_priv->labels_mapping)) { + err = PTR_ERR(ct_priv->labels_mapping); + goto err_mapping_labels; + } + + spin_lock_init(&ct_priv->ht_lock); + ct_priv->ns_type = ns_type; + ct_priv->chains = chains; + ct_priv->netdev = priv->netdev; + ct_priv->dev = priv->mdev; + ct_priv->mod_hdr_tbl = mod_hdr; + ct_priv->ct = mlx5_chains_create_global_table(chains); + if (IS_ERR(ct_priv->ct)) { + err = PTR_ERR(ct_priv->ct); + mlx5_core_warn(dev, + "%s, failed to create ct table err: %d\n", + INIT_ERR_PREFIX, err); + goto err_ct_tbl; + } + + ct_priv->ct_nat = mlx5_chains_create_global_table(chains); + if (IS_ERR(ct_priv->ct_nat)) { + err = PTR_ERR(ct_priv->ct_nat); + mlx5_core_warn(dev, + "%s, failed to create ct nat table err: %d\n", + INIT_ERR_PREFIX, err); + goto err_ct_nat_tbl; + } + + err = tc_ct_add_ct_table_miss_rule(ct_priv->ct_nat, ct_priv->ct, + &ct_priv->ct_nat_miss_group, + &ct_priv->ct_nat_miss_rule); + if (err) + goto err_ct_zone_ht; + + ct_priv->post_act = post_act; + mutex_init(&ct_priv->control_lock); + if (rhashtable_init(&ct_priv->zone_ht, &zone_params)) + goto err_ct_zone_ht; + if (rhashtable_init(&ct_priv->ct_tuples_ht, &tuples_ht_params)) + goto err_ct_tuples_ht; + if (rhashtable_init(&ct_priv->ct_tuples_nat_ht, &tuples_nat_ht_params)) + goto err_ct_tuples_nat_ht; + + mlx5_tc_ct_init_ct_max_offloaded_conns(dev); + + ct_priv->use_label_mapping = dev->mlx5e_res.ct.ct_labels_mapping; + + ct_priv->wq = alloc_ordered_workqueue("mlx5e_ct_priv_wq", 0); + if (!ct_priv->wq) { + err = -ENOMEM; + goto err_wq; + } + + err = mlx5_tc_ct_fs_init(ct_priv); + if (err) + goto err_init_fs; + + return ct_priv; + +err_init_fs: + destroy_workqueue(ct_priv->wq); +err_wq: + rhashtable_destroy(&ct_priv->ct_tuples_nat_ht); +err_ct_tuples_nat_ht: + rhashtable_destroy(&ct_priv->ct_tuples_ht); +err_ct_tuples_ht: + rhashtable_destroy(&ct_priv->zone_ht); +err_ct_zone_ht: + mlx5_chains_destroy_global_table(chains, ct_priv->ct_nat); +err_ct_nat_tbl: + mlx5_chains_destroy_global_table(chains, ct_priv->ct); +err_ct_tbl: + mapping_destroy(ct_priv->labels_mapping); +err_mapping_labels: + mapping_destroy(ct_priv->zone_mapping); +err_mapping_zone: + kfree(ct_priv); +err_alloc: +err_support: + + return NULL; +} + +void +mlx5_tc_ct_clean(struct mlx5_tc_ct_priv *ct_priv) +{ + struct mlx5_fs_chains *chains; + + if (!ct_priv) + return; + + destroy_workqueue(ct_priv->wq); + chains = ct_priv->chains; + + ct_priv->fs_ops->destroy(ct_priv->fs); + kfree(ct_priv->fs); + + tc_ct_del_ct_table_miss_rule(ct_priv->ct_nat_miss_group, ct_priv->ct_nat_miss_rule); + mlx5_chains_destroy_global_table(chains, ct_priv->ct_nat); + mlx5_chains_destroy_global_table(chains, ct_priv->ct); + mapping_destroy(ct_priv->zone_mapping); + mapping_destroy(ct_priv->labels_mapping); + + 
rhashtable_destroy(&ct_priv->ct_tuples_ht); + rhashtable_destroy(&ct_priv->ct_tuples_nat_ht); + rhashtable_destroy(&ct_priv->zone_ht); + mutex_destroy(&ct_priv->control_lock); + kfree(ct_priv); +} + +bool +mlx5e_tc_ct_restore_flow(struct mlx5_tc_ct_priv *ct_priv, + struct sk_buff *skb, u8 zone_restore_id) +{ + struct mlx5_ct_tuple tuple = {}; + struct mlx5_ct_entry *entry; + u16 zone; + + if (!ct_priv || !zone_restore_id) + return true; + + if (mapping_find(ct_priv->zone_mapping, zone_restore_id, &zone)) + return false; + + if (!mlx5_tc_ct_skb_to_tuple(skb, &tuple, zone)) + return false; + + spin_lock(&ct_priv->ht_lock); + + entry = mlx5_tc_ct_entry_get(ct_priv, &tuple); + if (!entry) { + spin_unlock(&ct_priv->ht_lock); + return false; + } + + if (IS_ERR(entry)) { + spin_unlock(&ct_priv->ht_lock); + return false; + } + spin_unlock(&ct_priv->ht_lock); + + tcf_ct_flow_table_restore_skb(skb, entry->restore_cookie); + __mlx5_tc_ct_entry_put(entry); + + return true; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h new file mode 100644 index 0000000..5b84b93 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018 Mellanox Technologies. */ + +#ifndef __MLX5_EN_TC_CT_H__ +#define __MLX5_EN_TC_CT_H__ + +#include +#include +#include + +#include "en.h" + +struct mlx5_flow_attr; +struct mlx5e_tc_mod_hdr_acts; +struct mlx5_rep_uplink_priv; +struct mlx5e_tc_flow; +struct mlx5e_priv; + +struct mlx5_fs_chains; +struct mlx5_tc_ct_priv; +struct mlx5_ct_flow; + +struct nf_flowtable; +struct flow_action_entry; +struct netlink_ext_ack; + +struct mlx5_ct_attr { + u16 zone; + u16 ct_action; + struct mlx5_ct_flow *ct_flow; + struct nf_flowtable *nf_ft; + u32 ct_labels_id; + bool clear_mod_acts_set; +}; + +#define zone_to_reg_ct {\ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_2,\ + .moffset = 0,\ + .mlen = 16,\ + .soffset = MLX5_BYTE_OFF(fte_match_param,\ + misc_parameters_2.metadata_reg_c_2),\ +} + +#define ctstate_to_reg_ct {\ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_2,\ + .moffset = 16,\ + .mlen = 16,\ + .soffset = MLX5_BYTE_OFF(fte_match_param,\ + misc_parameters_2.metadata_reg_c_2),\ +} + +#define mark_to_reg_ct {\ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_3,\ + .moffset = 0,\ + .mlen = 32,\ + .soffset = MLX5_BYTE_OFF(fte_match_param,\ + misc_parameters_2.metadata_reg_c_3),\ +} + +#define labels_to_reg_ct {\ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_4,\ + .moffset = 0,\ + .mlen = 32,\ + .soffset = MLX5_BYTE_OFF(fte_match_param,\ + misc_parameters_2.metadata_reg_c_4),\ +} + +/* 8 LSB of metadata C5 are reserved for packet color */ +#define fteid_to_reg_ct {\ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_5,\ + .moffset = 8,\ + .mlen = 24,\ + .soffset = MLX5_BYTE_OFF(fte_match_param,\ + misc_parameters_2.metadata_reg_c_5),\ +} + +#define zone_restore_to_reg_ct {\ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1,\ + .moffset = 0,\ + .mlen = ESW_ZONE_ID_BITS,\ + .soffset = MLX5_BYTE_OFF(fte_match_param,\ + misc_parameters_2.metadata_reg_c_1),\ +} + +#define nic_zone_restore_to_reg_ct {\ + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_B,\ + .moffset = 16,\ + .mlen = ESW_ZONE_ID_BITS,\ +} + +#define MLX5_CT_ZONE_BITS (mlx5e_tc_attr_to_reg_mappings[ZONE_TO_REG].mlen) +#define MLX5_CT_ZONE_MASK GENMASK(MLX5_CT_ZONE_BITS - 1, 0) + +#if 
IS_ENABLED(CONFIG_MLX5_TC_CT) + +struct mlx5_tc_ct_priv * +mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, + struct mod_hdr_tbl *mod_hdr, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_act); +void +mlx5_tc_ct_clean(struct mlx5_tc_ct_priv *ct_priv); + +void +mlx5_tc_ct_match_del(struct mlx5_tc_ct_priv *priv, struct mlx5_ct_attr *ct_attr); + +int +mlx5_tc_ct_match_add(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + struct mlx5_ct_attr *ct_attr, + struct netlink_ext_ack *extack); +int mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec); +int +mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_acts, + const struct flow_action_entry *act, + struct netlink_ext_ack *extack); + +struct mlx5_flow_handle * +mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts); +void +mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_attr *attr); + +bool +mlx5e_tc_ct_restore_flow(struct mlx5_tc_ct_priv *ct_priv, + struct sk_buff *skb, u8 zone_restore_id); + +u32 +mlx5_tc_ct_max_offloaded_conns_get(struct mlx5_core_dev *dev); +void +mlx5_tc_ct_max_offloaded_conns_set(struct mlx5_core_dev *dev, u32 max); + +bool +mlx5_tc_ct_labels_mapping_get(struct mlx5_core_dev *dev); +void +mlx5_tc_ct_lables_mapping_set(struct mlx5_core_dev *dev, bool enable); + +int +mlx5_tc_ct_set_ct_clear_regs(struct mlx5_tc_ct_priv *priv, + struct mlx5e_tc_mod_hdr_acts *mod_acts); + +#else /* CONFIG_MLX5_TC_CT */ + +static inline struct mlx5_tc_ct_priv * +mlx5_tc_ct_init(struct mlx5e_priv *priv, struct mlx5_fs_chains *chains, + struct mod_hdr_tbl *mod_hdr, + enum mlx5_flow_namespace_type ns_type, + struct mlx5e_post_act *post_act) +{ + return NULL; +} + +static inline void +mlx5_tc_ct_clean(struct mlx5_tc_ct_priv *ct_priv) +{ +} + +static inline void +mlx5_tc_ct_match_del(struct mlx5_tc_ct_priv *priv, struct mlx5_ct_attr *ct_attr) {} + +static inline int +mlx5_tc_ct_match_add(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + struct mlx5_ct_attr *ct_attr, + struct netlink_ext_ack *extack) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CT)) + return 0; + + NL_SET_ERR_MSG_MOD(extack, "mlx5 tc ct offload isn't enabled."); + return -EOPNOTSUPP; +} + +static inline int +mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec) +{ + return 0; +} + +static inline int +mlx5_tc_ct_set_ct_clear_regs(struct mlx5_tc_ct_priv *priv, + struct mlx5e_tc_mod_hdr_acts *mod_acts) +{ + return -EOPNOTSUPP; +} + +static inline int +mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_acts, + const struct flow_action_entry *act, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG_MOD(extack, "mlx5 tc ct offload isn't enabled."); + return -EOPNOTSUPP; +} + +static inline struct mlx5_flow_handle * +mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void +mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv, + struct mlx5_flow_attr *attr) +{ +} + +static inline bool +mlx5e_tc_ct_restore_flow(struct mlx5_tc_ct_priv *ct_priv, + struct sk_buff 
*skb, u8 zone_restore_id) +{ + if (!zone_restore_id) + return true; + + return false; +} + +static inline u32 +mlx5_tc_ct_max_offloaded_conns_get(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void +mlx5_tc_ct_max_offloaded_conns_set(struct mlx5_core_dev *dev, u32 max) +{ +} + +static inline bool +mlx5_tc_ct_labels_mapping_get(struct mlx5_core_dev *dev) +{ + return false; +} + +static inline void +mlx5_tc_ct_lables_mapping_set(struct mlx5_core_dev *dev, bool enable) +{ +} + +#endif /* !IS_ENABLED(CONFIG_MLX5_TC_CT) */ +#endif /* __MLX5_EN_TC_CT_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h new file mode 100644 index 0000000..6dbb4b2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. */ + +#ifndef __MLX5_EN_TC_PRIV_H__ +#define __MLX5_EN_TC_PRIV_H__ + +#include "en_tc.h" +#include "en/tc/act/act.h" + +#define MLX5E_TC_FLOW_BASE (MLX5E_TC_FLAG_LAST_EXPORTED_BIT + 1) + +#define MLX5E_TC_MAX_SPLITS 1 + +#define mlx5e_nic_chains(priv) ((priv)->fs.tc.chains) + +enum { + MLX5E_TC_FLOW_FLAG_INGRESS = MLX5E_TC_FLAG_INGRESS_BIT, + MLX5E_TC_FLOW_FLAG_EGRESS = MLX5E_TC_FLAG_EGRESS_BIT, + MLX5E_TC_FLOW_FLAG_ESWITCH = MLX5E_TC_FLAG_ESW_OFFLOAD_BIT, + MLX5E_TC_FLOW_FLAG_FT = MLX5E_TC_FLAG_FT_OFFLOAD_BIT, + MLX5E_TC_FLOW_FLAG_NIC = MLX5E_TC_FLAG_NIC_OFFLOAD_BIT, + MLX5E_TC_FLOW_FLAG_OFFLOADED = MLX5E_TC_FLOW_BASE, + MLX5E_TC_FLOW_FLAG_HAIRPIN = MLX5E_TC_FLOW_BASE + 1, + MLX5E_TC_FLOW_FLAG_HAIRPIN_RSS = MLX5E_TC_FLOW_BASE + 2, + MLX5E_TC_FLOW_FLAG_SLOW = MLX5E_TC_FLOW_BASE + 3, + MLX5E_TC_FLOW_FLAG_DUP = MLX5E_TC_FLOW_BASE + 4, + MLX5E_TC_FLOW_FLAG_NOT_READY = MLX5E_TC_FLOW_BASE + 5, + MLX5E_TC_FLOW_FLAG_DELETED = MLX5E_TC_FLOW_BASE + 6, + MLX5E_TC_FLOW_FLAG_CT = MLX5E_TC_FLOW_BASE + 7, + MLX5E_TC_FLOW_FLAG_L3_TO_L2_DECAP = MLX5E_TC_FLOW_BASE + 8, + MLX5E_TC_FLOW_FLAG_TUN_RX = MLX5E_TC_FLOW_BASE + 9, + MLX5E_TC_FLOW_FLAG_FAILED = MLX5E_TC_FLOW_BASE + 10, + MLX5E_TC_FLOW_FLAG_SAMPLE = MLX5E_TC_FLOW_BASE + 11, +}; + +struct mlx5e_tc_flow_parse_attr { + const struct ip_tunnel_info *tun_info[MLX5_MAX_FLOW_FWD_VPORTS]; + struct mlx5e_mpls_info mpls_info[MLX5_MAX_FLOW_FWD_VPORTS]; + struct net_device *filter_dev; + struct mlx5_flow_spec spec; + struct pedit_headers_action hdrs[__PEDIT_CMD_MAX]; + struct mlx5e_tc_mod_hdr_acts mod_hdr_acts; + int mirred_ifindex[MLX5_MAX_FLOW_FWD_VPORTS]; + struct ethhdr eth; + struct mlx5e_tc_act_parse_state parse_state; +}; + +/* Helper struct for accessing a struct containing list_head array. + * Containing struct + * |- Helper array + * [0] Helper item 0 + * |- list_head item 0 + * |- index (0) + * [1] Helper item 1 + * |- list_head item 1 + * |- index (1) + * To access the containing struct from one of the list_head items: + * 1. Get the helper item from the list_head item using + * helper item = + * container_of(list_head item, helper struct type, list_head field) + * 2. 
Get the contining struct from the helper item and its index in the array: + * containing struct = + * container_of(helper item, containing struct type, helper field[index]) + */ +struct encap_flow_item { + struct mlx5e_encap_entry *e; /* attached encap instance */ + struct list_head list; + int index; +}; + +struct encap_route_flow_item { + struct mlx5e_route_entry *r; /* attached route instance */ + int index; +}; + +struct mlx5e_tc_flow { + struct rhash_head node; + struct mlx5e_priv *priv; + u64 cookie; + unsigned long flags; + struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1]; + + /* flows sharing the same reformat object - currently mpls decap */ + struct list_head l3_to_l2_reformat; + struct mlx5e_decap_entry *decap_reformat; + + /* flows sharing same route entry */ + struct list_head decap_routes; + struct mlx5e_route_entry *decap_route; + struct encap_route_flow_item encap_routes[MLX5_MAX_FLOW_FWD_VPORTS]; + + /* Flow can be associated with multiple encap IDs. + * The number of encaps is bounded by the number of supported + * destinations. + */ + struct encap_flow_item encaps[MLX5_MAX_FLOW_FWD_VPORTS]; + struct mlx5e_tc_flow *peer_flow; + struct mlx5e_mod_hdr_handle *mh; /* attached mod header instance */ + struct mlx5e_mod_hdr_handle *slow_mh; /* attached mod header instance for slow path */ + struct mlx5e_hairpin_entry *hpe; /* attached hairpin instance */ + struct list_head hairpin; /* flows sharing the same hairpin */ + struct list_head peer; /* flows with peer flow */ + struct list_head unready; /* flows not ready to be offloaded (e.g + * due to missing route) + */ + struct net_device *orig_dev; /* netdev adding flow first */ + int tmp_entry_index; + struct list_head tmp_list; /* temporary flow list used by neigh update */ + refcount_t refcnt; + struct rcu_head rcu_head; + struct completion init_done; + struct completion del_hw_done; + struct mlx5_flow_attr *attr; + struct list_head attrs; + u32 chain_mapping; +}; + +struct mlx5_flow_handle * +mlx5e_tc_rule_offload(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); + +void +mlx5e_tc_rule_unoffload(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr); + +u8 mlx5e_tc_get_ip_version(struct mlx5_flow_spec *spec, bool outer); + +struct mlx5_flow_handle * +mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); + +struct mlx5_flow_attr * +mlx5e_tc_get_encap_attr(struct mlx5e_tc_flow *flow); + +void mlx5e_tc_unoffload_flow_post_acts(struct mlx5e_tc_flow *flow); +int mlx5e_tc_offload_flow_post_acts(struct mlx5e_tc_flow *flow); + +bool mlx5e_is_eswitch_flow(struct mlx5e_tc_flow *flow); +bool mlx5e_is_ft_flow(struct mlx5e_tc_flow *flow); +bool mlx5e_is_offloaded_flow(struct mlx5e_tc_flow *flow); +int mlx5e_get_flow_namespace(struct mlx5e_tc_flow *flow); +bool mlx5e_same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv); + +static inline void __flow_flag_set(struct mlx5e_tc_flow *flow, unsigned long flag) +{ + /* Complete all memory stores before setting bit. 
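+ * Pairs with the smp_mb__after_atomic() in __flow_flag_test() below.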
*/ + smp_mb__before_atomic(); + set_bit(flag, &flow->flags); +} + +#define flow_flag_set(flow, flag) __flow_flag_set(flow, MLX5E_TC_FLOW_FLAG_##flag) + +static inline bool __flow_flag_test_and_set(struct mlx5e_tc_flow *flow, + unsigned long flag) +{ + /* test_and_set_bit() provides all necessary barriers */ + return test_and_set_bit(flag, &flow->flags); +} + +#define flow_flag_test_and_set(flow, flag) \ + __flow_flag_test_and_set(flow, \ + MLX5E_TC_FLOW_FLAG_##flag) + +static inline void __flow_flag_clear(struct mlx5e_tc_flow *flow, unsigned long flag) +{ + /* Complete all memory stores before clearing bit. */ + smp_mb__before_atomic(); + clear_bit(flag, &flow->flags); +} + +#define flow_flag_clear(flow, flag) __flow_flag_clear(flow, \ + MLX5E_TC_FLOW_FLAG_##flag) + +static inline bool __flow_flag_test(struct mlx5e_tc_flow *flow, unsigned long flag) +{ + bool ret = test_bit(flag, &flow->flags); + + /* Read fields of flow structure only after checking flags. */ + smp_mb__after_atomic(); + return ret; +} + +#define flow_flag_test(flow, flag) __flow_flag_test(flow, \ + MLX5E_TC_FLOW_FLAG_##flag) + +void mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow); +struct mlx5_flow_handle * +mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec); + +void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr); + +struct mlx5e_tc_flow *mlx5e_flow_get(struct mlx5e_tc_flow *flow); +void mlx5e_flow_put(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow); + +struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow); + +struct mlx5e_tc_int_port_priv * +mlx5e_get_int_port_priv(struct mlx5e_priv *priv); + +struct mlx5e_flow_meters *mlx5e_get_flow_meters(struct mlx5_core_dev *dev); + +void *mlx5e_get_match_headers_value(u32 flags, struct mlx5_flow_spec *spec); +void *mlx5e_get_match_headers_criteria(u32 flags, struct mlx5_flow_spec *spec); + +#endif /* __MLX5_EN_TC_PRIV_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c new file mode 100644 index 0000000..1ddd55c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -0,0 +1,995 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018 Mellanox Technologies. 
*/ + +#include +#include +#include +#include +#include +#include "en/tc_tun.h" +#include "en/tc_priv.h" +#include "en_tc.h" +#include "rep/tc.h" +#include "rep/neigh.h" +#include "lag/lag.h" +#include "lag/mp.h" + +struct mlx5e_tc_tun_route_attr { + struct net_device *out_dev; + struct net_device *route_dev; + union { + struct flowi4 fl4; + struct flowi6 fl6; + } fl; + struct neighbour *n; + u8 ttl; +}; + +#define TC_TUN_ROUTE_ATTR_INIT(name) struct mlx5e_tc_tun_route_attr name = {} + +static void mlx5e_tc_tun_route_attr_cleanup(struct mlx5e_tc_tun_route_attr *attr) +{ + if (attr->n) + neigh_release(attr->n); + if (attr->route_dev) + dev_put(attr->route_dev); +} + +struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev) +{ + if (netif_is_vxlan(tunnel_dev)) + return &vxlan_tunnel; + else if (netif_is_geneve(tunnel_dev)) + return &geneve_tunnel; + else if (netif_is_gretap(tunnel_dev) || + netif_is_ip6gretap(tunnel_dev)) + return &gre_tunnel; + else if (netif_is_bareudp(tunnel_dev)) + return &mplsoudp_tunnel; + else + return NULL; +} + +static int get_route_and_out_devs(struct mlx5e_priv *priv, + struct net_device *dev, + struct net_device **route_dev, + struct net_device **out_dev) +{ + struct net_device *uplink_dev, *uplink_upper, *real_dev; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + bool dst_is_lag_dev; + + real_dev = is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : dev; + uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); + if (!uplink_dev) + return -ENODEV; + + rcu_read_lock(); + uplink_upper = netdev_master_upper_dev_get_rcu(uplink_dev); + /* mlx5_lag_is_sriov() is a blocking function which can't be called + * while holding rcu read lock. Take the net_device for correctness + * sake. + */ + if (uplink_upper) + dev_hold(uplink_upper); + rcu_read_unlock(); + + dst_is_lag_dev = (uplink_upper && + netif_is_lag_master(uplink_upper) && + real_dev == uplink_upper && + mlx5_lag_is_sriov(priv->mdev)); + if (uplink_upper) + dev_put(uplink_upper); + + /* if the egress device isn't on the same HW e-switch or + * it's a LAG device, use the uplink + */ + *route_dev = dev; + if (!netdev_port_same_parent_id(priv->netdev, real_dev) || + dst_is_lag_dev || is_vlan_dev(*route_dev) || + netif_is_ovs_master(*route_dev)) + *out_dev = uplink_dev; + else if (mlx5e_eswitch_rep(dev) && + mlx5e_is_valid_eswitch_fwd_dev(priv, dev)) + *out_dev = *route_dev; + else + return -EOPNOTSUPP; + + if (!(mlx5e_eswitch_rep(*out_dev) && + mlx5e_is_uplink_rep(netdev_priv(*out_dev)))) + return -EOPNOTSUPP; + + if (mlx5e_eswitch_uplink_rep(priv->netdev) && *out_dev != priv->netdev) + return -EOPNOTSUPP; + + return 0; +} + +static int mlx5e_route_lookup_ipv4_get(struct mlx5e_priv *priv, + struct net_device *dev, + struct mlx5e_tc_tun_route_attr *attr) +{ + struct net_device *route_dev; + struct net_device *out_dev; + struct neighbour *n; + struct rtable *rt; + +#if IS_ENABLED(CONFIG_INET) + struct mlx5_core_dev *mdev = priv->mdev; + struct net_device *uplink_dev; + int ret; + + if (mlx5_lag_is_multipath(mdev)) { + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); + attr->fl.fl4.flowi4_oif = uplink_dev->ifindex; + } else { + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(dev); + + if (tunnel && tunnel->get_remote_ifindex) + attr->fl.fl4.flowi4_oif = tunnel->get_remote_ifindex(dev); + } + + rt = ip_route_output_key(dev_net(dev), &attr->fl.fl4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + if (rt->rt_type != RTN_UNICAST) { + ret = 
-ENETUNREACH; + goto err_rt_release; + } + + if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) { + ret = -ENETUNREACH; + goto err_rt_release; + } +#else + return -EOPNOTSUPP; +#endif + + ret = get_route_and_out_devs(priv, rt->dst.dev, &route_dev, &out_dev); + if (ret < 0) + goto err_rt_release; + dev_hold(route_dev); + + if (!attr->ttl) + attr->ttl = ip4_dst_hoplimit(&rt->dst); + n = dst_neigh_lookup(&rt->dst, &attr->fl.fl4.daddr); + if (!n) { + ret = -ENOMEM; + goto err_dev_release; + } + + ip_rt_put(rt); + attr->route_dev = route_dev; + attr->out_dev = out_dev; + attr->n = n; + return 0; + +err_dev_release: + dev_put(route_dev); +err_rt_release: + ip_rt_put(rt); + return ret; +} + +static void mlx5e_route_lookup_ipv4_put(struct mlx5e_tc_tun_route_attr *attr) +{ + mlx5e_tc_tun_route_attr_cleanup(attr); +} + +static const char *mlx5e_netdev_kind(struct net_device *dev) +{ + if (dev->rtnl_link_ops) + return dev->rtnl_link_ops->kind; + else + return "unknown"; +} + +static int mlx5e_gen_ip_tunnel_header(char buf[], __u8 *ip_proto, + struct mlx5e_encap_entry *e) +{ + if (!e->tunnel) { + pr_warn("mlx5: Cannot generate tunnel header for this tunnel\n"); + return -EOPNOTSUPP; + } + + return e->tunnel->generate_ip_tun_hdr(buf, ip_proto, e); +} + +static char *gen_eth_tnl_hdr(char *buf, struct net_device *dev, + struct mlx5e_encap_entry *e, + u16 proto) +{ + struct ethhdr *eth = (struct ethhdr *)buf; + char *ip; + + ether_addr_copy(eth->h_dest, e->h_dest); + ether_addr_copy(eth->h_source, dev->dev_addr); + if (is_vlan_dev(dev)) { + struct vlan_hdr *vlan = (struct vlan_hdr *) + ((char *)eth + ETH_HLEN); + ip = (char *)vlan + VLAN_HLEN; + eth->h_proto = vlan_dev_vlan_proto(dev); + vlan->h_vlan_TCI = htons(vlan_dev_vlan_id(dev)); + vlan->h_vlan_encapsulated_proto = htons(proto); + } else { + eth->h_proto = htons(proto); + ip = (char *)eth + ETH_HLEN; + } + + return ip; +} + +int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ + int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + const struct ip_tunnel_key *tun_key = &e->tun_info->key; + struct mlx5_pkt_reformat_params reformat_params; + struct mlx5e_neigh m_neigh = {}; + TC_TUN_ROUTE_ATTR_INIT(attr); + int ipv4_encap_size; + char *encap_header; + struct iphdr *ip; + u8 nud_state; + int err; + + /* add the IP fields */ + attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.daddr = tun_key->u.ipv4.dst; + attr.fl.fl4.saddr = tun_key->u.ipv4.src; + attr.ttl = tun_key->ttl; + + err = mlx5e_route_lookup_ipv4_get(priv, mirred_dev, &attr); + if (err) + return err; + + ipv4_encap_size = + (is_vlan_dev(attr.route_dev) ? VLAN_ETH_HLEN : ETH_HLEN) + + sizeof(struct iphdr) + + e->tunnel->calc_hlen(e); + + if (max_encap_size < ipv4_encap_size) { + mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n", + ipv4_encap_size, max_encap_size); + err = -EOPNOTSUPP; + goto release_neigh; + } + + encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL); + if (!encap_header) { + err = -ENOMEM; + goto release_neigh; + } + + m_neigh.family = attr.n->ops->family; + memcpy(&m_neigh.dst_ip, attr.n->primary_key, attr.n->tbl->key_len); + e->out_dev = attr.out_dev; + e->route_dev_ifindex = attr.route_dev->ifindex; + + /* It's important to add the neigh to the hash table before checking + * the neigh validity state. 
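/*
 * Illustrative sketch, not part of the patch: the ordering argued for in the
 * comment above.  Registering for neigh updates *before* sampling the state
 * means a change that races with the sample is still delivered through the
 * notification path instead of being lost.  All names below are toy
 * stand-ins, not driver API.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_neigh {
	bool valid;                    /* stands in for nud_state & NUD_VALID */
	void (*on_update)(bool valid); /* stands in for the encap entry hook  */
};

static void encap_update_cb(bool valid)
{
	printf("notified: neigh is now %s\n", valid ? "valid" : "invalid");
}

static void attach_then_sample(struct toy_neigh *n)
{
	n->on_update = encap_update_cb;         /* 1. attach to the hash first */
	printf("sampled valid=%d\n", n->valid); /* 2. only then read the state */
}

static void toy_neigh_update(struct toy_neigh *n, bool valid)
{
	n->valid = valid;
	if (n->on_update)
		n->on_update(valid);
}

int main(void)
{
	struct toy_neigh n = { .valid = false };

	attach_then_sample(&n);
	toy_neigh_update(&n, true);  /* would be missed with the reverse order */
	return 0;
}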
So if we'll get a notification, in case the + * neigh changes it's validity state, we would find the relevant neigh + * in the hash. + */ + err = mlx5e_rep_encap_entry_attach(netdev_priv(attr.out_dev), e, &m_neigh, attr.n->dev); + if (err) + goto free_encap; + + read_lock_bh(&attr.n->lock); + nud_state = attr.n->nud_state; + ether_addr_copy(e->h_dest, attr.n->ha); + read_unlock_bh(&attr.n->lock); + + /* add ethernet header */ + ip = (struct iphdr *)gen_eth_tnl_hdr(encap_header, attr.route_dev, e, + ETH_P_IP); + + /* add ip header */ + ip->tos = tun_key->tos; + ip->version = 0x4; + ip->ihl = 0x5; + ip->ttl = attr.ttl; + ip->daddr = attr.fl.fl4.daddr; + ip->saddr = attr.fl.fl4.saddr; + + /* add tunneling protocol header */ + err = mlx5e_gen_ip_tunnel_header((char *)ip + sizeof(struct iphdr), + &ip->protocol, e); + if (err) + goto destroy_neigh_entry; + + e->encap_size = ipv4_encap_size; + e->encap_header = encap_header; + + if (!(nud_state & NUD_VALID)) { + neigh_event_send(attr.n, NULL); + /* the encap entry will be made valid on neigh update event + * and not used before that. + */ + goto release_neigh; + } + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = e->reformat_type; + reformat_params.size = ipv4_encap_size; + reformat_params.data = encap_header; + e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(e->pkt_reformat)) { + err = PTR_ERR(e->pkt_reformat); + goto destroy_neigh_entry; + } + + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); + mlx5e_route_lookup_ipv4_put(&attr); + return err; + +destroy_neigh_entry: + mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); +free_encap: + kfree(encap_header); +release_neigh: + mlx5e_route_lookup_ipv4_put(&attr); + return err; +} + +int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ + int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + const struct ip_tunnel_key *tun_key = &e->tun_info->key; + struct mlx5_pkt_reformat_params reformat_params; + TC_TUN_ROUTE_ATTR_INIT(attr); + int ipv4_encap_size; + char *encap_header; + struct iphdr *ip; + u8 nud_state; + int err; + + /* add the IP fields */ + attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.daddr = tun_key->u.ipv4.dst; + attr.fl.fl4.saddr = tun_key->u.ipv4.src; + attr.ttl = tun_key->ttl; + + err = mlx5e_route_lookup_ipv4_get(priv, mirred_dev, &attr); + if (err) + return err; + + ipv4_encap_size = + (is_vlan_dev(attr.route_dev) ? 
VLAN_ETH_HLEN : ETH_HLEN) + + sizeof(struct iphdr) + + e->tunnel->calc_hlen(e); + + if (max_encap_size < ipv4_encap_size) { + mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n", + ipv4_encap_size, max_encap_size); + err = -EOPNOTSUPP; + goto release_neigh; + } + + encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL); + if (!encap_header) { + err = -ENOMEM; + goto release_neigh; + } + + e->route_dev_ifindex = attr.route_dev->ifindex; + + read_lock_bh(&attr.n->lock); + nud_state = attr.n->nud_state; + ether_addr_copy(e->h_dest, attr.n->ha); + WRITE_ONCE(e->nhe->neigh_dev, attr.n->dev); + read_unlock_bh(&attr.n->lock); + + /* add ethernet header */ + ip = (struct iphdr *)gen_eth_tnl_hdr(encap_header, attr.route_dev, e, + ETH_P_IP); + + /* add ip header */ + ip->tos = tun_key->tos; + ip->version = 0x4; + ip->ihl = 0x5; + ip->ttl = attr.ttl; + ip->daddr = attr.fl.fl4.daddr; + ip->saddr = attr.fl.fl4.saddr; + + /* add tunneling protocol header */ + err = mlx5e_gen_ip_tunnel_header((char *)ip + sizeof(struct iphdr), + &ip->protocol, e); + if (err) + goto free_encap; + + e->encap_size = ipv4_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + + if (!(nud_state & NUD_VALID)) { + neigh_event_send(attr.n, NULL); + /* the encap entry will be made valid on neigh update event + * and not used before that. + */ + goto release_neigh; + } + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = e->reformat_type; + reformat_params.size = ipv4_encap_size; + reformat_params.data = encap_header; + e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(e->pkt_reformat)) { + err = PTR_ERR(e->pkt_reformat); + goto free_encap; + } + + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); + mlx5e_route_lookup_ipv4_put(&attr); + return err; + +free_encap: + kfree(encap_header); +release_neigh: + mlx5e_route_lookup_ipv4_put(&attr); + return err; +} + +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) +static int mlx5e_route_lookup_ipv6_get(struct mlx5e_priv *priv, + struct net_device *dev, + struct mlx5e_tc_tun_route_attr *attr) +{ + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(dev); + struct net_device *route_dev; + struct net_device *out_dev; + struct dst_entry *dst; + struct neighbour *n; + int ret; + + if (tunnel && tunnel->get_remote_ifindex) + attr->fl.fl6.flowi6_oif = tunnel->get_remote_ifindex(dev); + dst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(dev), NULL, &attr->fl.fl6, + NULL); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + if (!attr->ttl) + attr->ttl = ip6_dst_hoplimit(dst); + + ret = get_route_and_out_devs(priv, dst->dev, &route_dev, &out_dev); + if (ret < 0) + goto err_dst_release; + + dev_hold(route_dev); + n = dst_neigh_lookup(dst, &attr->fl.fl6.daddr); + if (!n) { + ret = -ENOMEM; + goto err_dev_release; + } + + dst_release(dst); + attr->out_dev = out_dev; + attr->route_dev = route_dev; + attr->n = n; + return 0; + +err_dev_release: + dev_put(route_dev); +err_dst_release: + dst_release(dst); + return ret; +} + +static void mlx5e_route_lookup_ipv6_put(struct mlx5e_tc_tun_route_attr *attr) +{ + mlx5e_tc_tun_route_attr_cleanup(attr); +} + +int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ + int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + const struct ip_tunnel_key *tun_key = &e->tun_info->key; + struct 
mlx5_pkt_reformat_params reformat_params; + struct mlx5e_neigh m_neigh = {}; + TC_TUN_ROUTE_ATTR_INIT(attr); + struct ipv6hdr *ip6h; + int ipv6_encap_size; + char *encap_header; + u8 nud_state; + int err; + + attr.ttl = tun_key->ttl; + attr.fl.fl6.flowlabel = ip6_make_flowinfo(tun_key->tos, tun_key->label); + attr.fl.fl6.daddr = tun_key->u.ipv6.dst; + attr.fl.fl6.saddr = tun_key->u.ipv6.src; + + err = mlx5e_route_lookup_ipv6_get(priv, mirred_dev, &attr); + if (err) + return err; + + ipv6_encap_size = + (is_vlan_dev(attr.route_dev) ? VLAN_ETH_HLEN : ETH_HLEN) + + sizeof(struct ipv6hdr) + + e->tunnel->calc_hlen(e); + + if (max_encap_size < ipv6_encap_size) { + mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n", + ipv6_encap_size, max_encap_size); + err = -EOPNOTSUPP; + goto release_neigh; + } + + encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL); + if (!encap_header) { + err = -ENOMEM; + goto release_neigh; + } + + m_neigh.family = attr.n->ops->family; + memcpy(&m_neigh.dst_ip, attr.n->primary_key, attr.n->tbl->key_len); + e->out_dev = attr.out_dev; + e->route_dev_ifindex = attr.route_dev->ifindex; + + /* It's important to add the neigh to the hash table before checking + * the neigh validity state. So if we'll get a notification, in case the + * neigh changes it's validity state, we would find the relevant neigh + * in the hash. + */ + err = mlx5e_rep_encap_entry_attach(netdev_priv(attr.out_dev), e, &m_neigh, attr.n->dev); + if (err) + goto free_encap; + + read_lock_bh(&attr.n->lock); + nud_state = attr.n->nud_state; + ether_addr_copy(e->h_dest, attr.n->ha); + read_unlock_bh(&attr.n->lock); + + /* add ethernet header */ + ip6h = (struct ipv6hdr *)gen_eth_tnl_hdr(encap_header, attr.route_dev, e, + ETH_P_IPV6); + + /* add ip header */ + ip6_flow_hdr(ip6h, tun_key->tos, 0); + /* the HW fills up ipv6 payload len */ + ip6h->hop_limit = attr.ttl; + ip6h->daddr = attr.fl.fl6.daddr; + ip6h->saddr = attr.fl.fl6.saddr; + + /* add tunneling protocol header */ + err = mlx5e_gen_ip_tunnel_header((char *)ip6h + sizeof(struct ipv6hdr), + &ip6h->nexthdr, e); + if (err) + goto destroy_neigh_entry; + + e->encap_size = ipv6_encap_size; + e->encap_header = encap_header; + + if (!(nud_state & NUD_VALID)) { + neigh_event_send(attr.n, NULL); + /* the encap entry will be made valid on neigh update event + * and not used before that. 
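/*
 * Illustrative sketch, not part of the patch: the arithmetic behind the
 * ipv4/ipv6_encap_size checks in these create/update helpers.  Header sizes
 * are the standard ones; the 16-byte tunnel overhead (outer UDP + VXLAN) and
 * the 128-byte device limit are example values only, the real limit comes
 * from MLX5_CAP_ESW(mdev, max_encap_header_size).
 */
#include <stdio.h>

#define TOY_ETH_HLEN       14
#define TOY_VLAN_ETH_HLEN  18
#define TOY_IPV4_HLEN      20
#define TOY_IPV6_HLEN      40
#define TOY_VXLAN_OVERHEAD 16	/* example: outer UDP (8) + VXLAN (8) */

static int toy_encap_size(int is_vlan, int is_ipv6, int tun_hlen)
{
	return (is_vlan ? TOY_VLAN_ETH_HLEN : TOY_ETH_HLEN) +
	       (is_ipv6 ? TOY_IPV6_HLEN : TOY_IPV4_HLEN) + tun_hlen;
}

int main(void)
{
	int max_encap = 128;	/* example device limit */
	int need = toy_encap_size(0, 1, TOY_VXLAN_OVERHEAD);

	if (need > max_encap)
		printf("encap size %d too big, max supported is %d\n",
		       need, max_encap);
	else
		printf("ipv6 vxlan encap needs %d of %d bytes\n",
		       need, max_encap);
	return 0;
}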
+ */ + goto release_neigh; + } + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = e->reformat_type; + reformat_params.size = ipv6_encap_size; + reformat_params.data = encap_header; + e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(e->pkt_reformat)) { + err = PTR_ERR(e->pkt_reformat); + goto destroy_neigh_entry; + } + + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); + mlx5e_route_lookup_ipv6_put(&attr); + return err; + +destroy_neigh_entry: + mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); +free_encap: + kfree(encap_header); +release_neigh: + mlx5e_route_lookup_ipv6_put(&attr); + return err; +} + +int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ + int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size); + const struct ip_tunnel_key *tun_key = &e->tun_info->key; + struct mlx5_pkt_reformat_params reformat_params; + TC_TUN_ROUTE_ATTR_INIT(attr); + struct ipv6hdr *ip6h; + int ipv6_encap_size; + char *encap_header; + u8 nud_state; + int err; + + attr.ttl = tun_key->ttl; + + attr.fl.fl6.flowlabel = ip6_make_flowinfo(tun_key->tos, tun_key->label); + attr.fl.fl6.daddr = tun_key->u.ipv6.dst; + attr.fl.fl6.saddr = tun_key->u.ipv6.src; + + err = mlx5e_route_lookup_ipv6_get(priv, mirred_dev, &attr); + if (err) + return err; + + ipv6_encap_size = + (is_vlan_dev(attr.route_dev) ? VLAN_ETH_HLEN : ETH_HLEN) + + sizeof(struct ipv6hdr) + + e->tunnel->calc_hlen(e); + + if (max_encap_size < ipv6_encap_size) { + mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n", + ipv6_encap_size, max_encap_size); + err = -EOPNOTSUPP; + goto release_neigh; + } + + encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL); + if (!encap_header) { + err = -ENOMEM; + goto release_neigh; + } + + e->route_dev_ifindex = attr.route_dev->ifindex; + + read_lock_bh(&attr.n->lock); + nud_state = attr.n->nud_state; + ether_addr_copy(e->h_dest, attr.n->ha); + WRITE_ONCE(e->nhe->neigh_dev, attr.n->dev); + read_unlock_bh(&attr.n->lock); + + /* add ethernet header */ + ip6h = (struct ipv6hdr *)gen_eth_tnl_hdr(encap_header, attr.route_dev, e, + ETH_P_IPV6); + + /* add ip header */ + ip6_flow_hdr(ip6h, tun_key->tos, 0); + /* the HW fills up ipv6 payload len */ + ip6h->hop_limit = attr.ttl; + ip6h->daddr = attr.fl.fl6.daddr; + ip6h->saddr = attr.fl.fl6.saddr; + + /* add tunneling protocol header */ + err = mlx5e_gen_ip_tunnel_header((char *)ip6h + sizeof(struct ipv6hdr), + &ip6h->nexthdr, e); + if (err) + goto free_encap; + + e->encap_size = ipv6_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + + if (!(nud_state & NUD_VALID)) { + neigh_event_send(attr.n, NULL); + /* the encap entry will be made valid on neigh update event + * and not used before that. 
+ */ + goto release_neigh; + } + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = e->reformat_type; + reformat_params.size = ipv6_encap_size; + reformat_params.data = encap_header; + e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(e->pkt_reformat)) { + err = PTR_ERR(e->pkt_reformat); + goto free_encap; + } + + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); + mlx5e_route_lookup_ipv6_put(&attr); + return err; + +free_encap: + kfree(encap_header); +release_neigh: + mlx5e_route_lookup_ipv6_put(&attr); + return err; +} +#endif + +int mlx5e_tc_tun_route_lookup(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *flow_attr, + struct net_device *filter_dev) +{ + struct mlx5_esw_flow_attr *esw_attr = flow_attr->esw_attr; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_int_port *int_port; + TC_TUN_ROUTE_ATTR_INIT(attr); + u16 vport_num; + int err = 0; + + if (flow_attr->tun_ip_version == 4) { + /* Addresses are swapped for decap */ + attr.fl.fl4.saddr = esw_attr->rx_tun_attr->dst_ip.v4; + attr.fl.fl4.daddr = esw_attr->rx_tun_attr->src_ip.v4; + err = mlx5e_route_lookup_ipv4_get(priv, filter_dev, &attr); + } +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + else if (flow_attr->tun_ip_version == 6) { + /* Addresses are swapped for decap */ + attr.fl.fl6.saddr = esw_attr->rx_tun_attr->dst_ip.v6; + attr.fl.fl6.daddr = esw_attr->rx_tun_attr->src_ip.v6; + err = mlx5e_route_lookup_ipv6_get(priv, filter_dev, &attr); + } +#endif + else + return 0; + + if (err) + return err; + + if (attr.route_dev->netdev_ops == &mlx5e_netdev_ops && + mlx5e_tc_is_vf_tunnel(attr.out_dev, attr.route_dev)) { + err = mlx5e_tc_query_route_vport(attr.out_dev, attr.route_dev, &vport_num); + if (err) + goto out; + + esw_attr->rx_tun_attr->vni = MLX5_GET(fte_match_param, spec->match_value, + misc_parameters.vxlan_vni); + esw_attr->rx_tun_attr->decap_vport = vport_num; + } else if (netif_is_ovs_master(attr.route_dev) && mlx5e_tc_int_port_supported(esw)) { + int_port = mlx5e_tc_int_port_get(mlx5e_get_int_port_priv(priv), + attr.route_dev->ifindex, + MLX5E_TC_INT_PORT_INGRESS); + if (IS_ERR(int_port)) { + err = PTR_ERR(int_port); + goto out; + } + esw_attr->int_port = int_port; + } + +out: + if (flow_attr->tun_ip_version == 4) + mlx5e_route_lookup_ipv4_put(&attr); +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + else if (flow_attr->tun_ip_version == 6) + mlx5e_route_lookup_ipv6_put(&attr); +#endif + return err; +} + +bool mlx5e_tc_tun_device_to_offload(struct mlx5e_priv *priv, + struct net_device *netdev) +{ + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(netdev); + + if (tunnel && tunnel->can_offload(priv)) + return true; + else + return false; +} + +int mlx5e_tc_tun_init_encap_attr(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(tunnel_dev); + + if (!tunnel) { + e->reformat_type = -1; + return -EOPNOTSUPP; + } + + return tunnel->init_encap_attr(tunnel_dev, priv, e, extack); +} + +int mlx5e_tc_tun_parse(struct net_device *filter_dev, + struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + u8 *match_level) +{ + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(filter_dev); + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + void *headers_c = 
MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + struct netlink_ext_ack *extack = f->common.extack; + int err = 0; + + if (!tunnel) { + netdev_warn(priv->netdev, + "decapsulation offload is not supported for %s net device\n", + mlx5e_netdev_kind(filter_dev)); + err = -EOPNOTSUPP; + goto out; + } + + *match_level = tunnel->match_level; + + if (tunnel->parse_udp_ports) { + err = tunnel->parse_udp_ports(priv, spec, f, + headers_c, headers_v); + if (err) + goto out; + } + + if (tunnel->parse_tunnel) { + err = tunnel->parse_tunnel(priv, spec, f, + headers_c, headers_v); + if (err) + goto out; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_dissector_key_basic key_basic = {}; + struct flow_dissector_key_basic mask_basic = { + .n_proto = htons(0xFFFF), + }; + struct flow_match_basic match_basic = { + .key = &key_basic, .mask = &mask_basic, + }; + struct flow_match_control match; + u16 addr_type; + + flow_rule_match_enc_control(rule, &match); + addr_type = match.key->addr_type; + + /* For tunnel addr_type used same key id`s as for non-tunnel */ + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_enc_ipv4_addrs(rule, &match); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(match.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4, + ntohl(match.key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(match.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(match.key->dst)); + + key_basic.n_proto = htons(ETH_P_IP); + mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true, + headers_c, headers_v); + } else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_enc_ipv6_addrs(rule, &match); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)); + + key_basic.n_proto = htons(ETH_P_IPV6); + mlx5e_tc_set_ethertype(priv->mdev, &match_basic, true, + headers_c, headers_v); + } + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_match_ip match; + + flow_rule_match_enc_ip(rule, &match); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_ecn, + match.mask->tos & 0x3); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, + match.key->tos & 0x3); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_dscp, + match.mask->tos >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, + match.key->tos >> 2); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ttl_hoplimit, + match.mask->ttl); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ttl_hoplimit, + match.key->ttl); + + if (match.mask->ttl && + !MLX5_CAP_ESW_FLOWTABLE_FDB + (priv->mdev, + ft_field_support.outer_ipv4_ttl)) { + 
NL_SET_ERR_MSG_MOD(extack, + "Matching on TTL is not supported"); + err = -EOPNOTSUPP; + goto out; + } + } + + /* let software handle IP fragments */ + MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + + return 0; + +out: + return err; +} + +int mlx5e_tc_tun_parse_udp_ports(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct flow_match_ports enc_ports; + + /* Full udp dst port must be given */ + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + NL_SET_ERR_MSG_MOD(extack, + "UDP tunnel decap filter must include enc_dst_port condition"); + netdev_warn(priv->netdev, + "UDP tunnel decap filter must include enc_dst_port condition\n"); + return -EOPNOTSUPP; + } + + flow_rule_match_enc_ports(rule, &enc_ports); + + if (memchr_inv(&enc_ports.mask->dst, 0xff, + sizeof(enc_ports.mask->dst))) { + NL_SET_ERR_MSG_MOD(extack, + "UDP tunnel decap filter must match enc_dst_port fully"); + netdev_warn(priv->netdev, + "UDP tunnel decap filter must match enc_dst_port fully\n"); + return -EOPNOTSUPP; + } + + /* match on UDP protocol and dst port number */ + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport, + ntohs(enc_ports.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, + ntohs(enc_ports.key->dst)); + + /* UDP src port on outer header is generated by HW, + * so it is probably a bad idea to request matching it. + * Nonetheless, it is allowed. + */ + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport, + ntohs(enc_ports.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport, + ntohs(enc_ports.key->src)); + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h new file mode 100644 index 0000000..b38f693 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018 Mellanox Technologies. 
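/*
 * Illustrative sketch, not part of the patch: the dispatch pattern this
 * header sets up.  mlx5e_get_tc_tun() maps a tunnel net_device to one of the
 * extern ops tables declared below, and callers only go through the function
 * pointers.  The reduced struct, the "toy_*" names and the header-length
 * values are stand-ins, not the real definitions.
 */
#include <stdio.h>
#include <string.h>

struct toy_tunnel_ops {
	const char *kind;
	int (*calc_hlen)(void);	/* tunnel-header overhead in bytes */
};

static int toy_vxlan_hlen(void)  { return 16; } /* example: UDP + VXLAN  */
static int toy_geneve_hlen(void) { return 16; } /* example: UDP + GENEVE */

static const struct toy_tunnel_ops toy_vxlan  = { "vxlan",  toy_vxlan_hlen  };
static const struct toy_tunnel_ops toy_geneve = { "geneve", toy_geneve_hlen };

/* stands in for mlx5e_get_tc_tun(): pick an ops table from the netdev kind */
static const struct toy_tunnel_ops *toy_get_tc_tun(const char *kind)
{
	if (!strcmp(kind, "vxlan"))
		return &toy_vxlan;
	if (!strcmp(kind, "geneve"))
		return &toy_geneve;
	return NULL;	/* unsupported tunnel type */
}

int main(void)
{
	const struct toy_tunnel_ops *ops = toy_get_tc_tun("vxlan");

	if (ops)
		printf("%s adds %d bytes of encap\n", ops->kind, ops->calc_hlen());
	return 0;
}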
*/ + +#ifndef __MLX5_EN_TC_TUNNEL_H__ +#define __MLX5_EN_TC_TUNNEL_H__ + +#include +#include +#include +#include +#include "en.h" +#include "en_rep.h" + +#ifdef CONFIG_MLX5_ESWITCH + +enum { + MLX5E_TC_TUNNEL_TYPE_UNKNOWN, + MLX5E_TC_TUNNEL_TYPE_VXLAN, + MLX5E_TC_TUNNEL_TYPE_GENEVE, + MLX5E_TC_TUNNEL_TYPE_GRETAP, + MLX5E_TC_TUNNEL_TYPE_MPLSOUDP, +}; + +struct mlx5e_encap_key { + const struct ip_tunnel_key *ip_tun_key; + struct mlx5e_tc_tunnel *tc_tunnel; +}; + +struct mlx5e_tc_tunnel { + int tunnel_type; + enum mlx5_flow_match_level match_level; + + bool (*can_offload)(struct mlx5e_priv *priv); + int (*calc_hlen)(struct mlx5e_encap_entry *e); + int (*init_encap_attr)(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack); + int (*generate_ip_tun_hdr)(char buf[], + __u8 *ip_proto, + struct mlx5e_encap_entry *e); + int (*parse_udp_ports)(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v); + int (*parse_tunnel)(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v); + bool (*encap_info_equal)(struct mlx5e_encap_key *a, + struct mlx5e_encap_key *b); + int (*get_remote_ifindex)(struct net_device *mirred_dev); +}; + +extern struct mlx5e_tc_tunnel vxlan_tunnel; +extern struct mlx5e_tc_tunnel geneve_tunnel; +extern struct mlx5e_tc_tunnel gre_tunnel; +extern struct mlx5e_tc_tunnel mplsoudp_tunnel; + +struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev); + +int mlx5e_tc_tun_init_encap_attr(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack); + +int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e); +int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e); + +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) +int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e); +int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e); +#else +static inline int +mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ return -EOPNOTSUPP; } +static inline int +mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct mlx5e_encap_entry *e) +{ return -EOPNOTSUPP; } +#endif +int mlx5e_tc_tun_route_lookup(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, + struct net_device *filter_dev); + +bool mlx5e_tc_tun_device_to_offload(struct mlx5e_priv *priv, + struct net_device *netdev); + +int mlx5e_tc_tun_parse(struct net_device *filter_dev, + struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + u8 *match_level); + +int mlx5e_tc_tun_parse_udp_ports(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v); + +bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a, + struct mlx5e_encap_key *b); + +#endif /* CONFIG_MLX5_ESWITCH */ + +#endif //__MLX5_EN_TC_TUNNEL_H__ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c new file mode 100644 index 0000000..7bc5363 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -0,0 +1,1781 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. */ + +#include +#include +#include "tc_tun_encap.h" +#include "en_tc.h" +#include "tc_tun.h" +#include "rep/tc.h" +#include "diag/en_tc_tracepoint.h" + +enum { + MLX5E_ROUTE_ENTRY_VALID = BIT(0), +}; + +static int mlx5e_set_int_port_tunnel(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + struct mlx5e_encap_entry *e, + int out_index) +{ + struct net_device *route_dev; + int err = 0; + + route_dev = dev_get_by_index(dev_net(e->out_dev), e->route_dev_ifindex); + + if (!route_dev || !netif_is_ovs_master(route_dev)) + goto out; + + err = mlx5e_set_fwd_to_int_port_actions(priv, attr, e->route_dev_ifindex, + MLX5E_TC_INT_PORT_EGRESS, + &attr->action, out_index); + +out: + if (route_dev) + dev_put(route_dev); + + return err; +} + +struct mlx5e_route_key { + int ip_version; + union { + __be32 v4; + struct in6_addr v6; + } endpoint_ip; +}; + +struct mlx5e_route_entry { + struct mlx5e_route_key key; + struct list_head encap_entries; + struct list_head decap_flows; + u32 flags; + struct hlist_node hlist; + refcount_t refcnt; + int tunnel_dev_index; + struct rcu_head rcu; +}; + +struct mlx5e_tc_tun_encap { + struct mlx5e_priv *priv; + struct notifier_block fib_nb; + spinlock_t route_lock; /* protects route_tbl */ + unsigned long route_tbl_last_update; + DECLARE_HASHTABLE(route_tbl, 8); +}; + +static bool mlx5e_route_entry_valid(struct mlx5e_route_entry *r) +{ + return r->flags & MLX5E_ROUTE_ENTRY_VALID; +} + +int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec) +{ + struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr; + struct mlx5_rx_tun_attr *tun_attr; + void *daddr, *saddr; + u8 ip_version; + + tun_attr = kvzalloc(sizeof(*tun_attr), GFP_KERNEL); + if (!tun_attr) + return -ENOMEM; + + esw_attr->rx_tun_attr = tun_attr; + ip_version = mlx5e_tc_get_ip_version(spec, true); + + if (ip_version == 4) { + daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4); + tun_attr->dst_ip.v4 = *(__be32 *)daddr; + tun_attr->src_ip.v4 = *(__be32 *)saddr; + if (!tun_attr->dst_ip.v4 || !tun_attr->src_ip.v4) + return 0; + } +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) + else if (ip_version == 6) { + int ipv6_size = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6); + struct in6_addr zerov6 = {}; + + daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + saddr = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6); + memcpy(&tun_attr->dst_ip.v6, daddr, ipv6_size); + memcpy(&tun_attr->src_ip.v6, saddr, ipv6_size); + if (!memcmp(&tun_attr->dst_ip.v6, &zerov6, sizeof(zerov6)) || + !memcmp(&tun_attr->src_ip.v6, &zerov6, sizeof(zerov6))) + return 0; + } +#endif + /* Only set the flag if both src and dst ip addresses exist. They are + * required to establish routing. 
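/*
 * Illustrative sketch, not part of the patch: the "treat an all-zero address
 * as unset" test used just above for both address families before TUN_RX is
 * set.  Plain byte buffers stand in for the fte_match_param fields.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* true when every byte of the address is zero, i.e. not present in the match */
static bool addr_is_zero(const void *addr, size_t len)
{
	static const unsigned char zero[16];	/* large enough for IPv6 */

	return len <= sizeof(zero) && memcmp(addr, zero, len) == 0;
}

int main(void)
{
	unsigned char dst[4] = { 192, 168, 0, 1 };
	unsigned char src[4] = { 0 };		/* unset in the match value */

	/* mirror of the logic above: both addresses must be present */
	bool have_both = !addr_is_zero(dst, sizeof(dst)) &&
			 !addr_is_zero(src, sizeof(src));

	printf("decap route lookup possible: %s\n", have_both ? "yes" : "no");
	return 0;
}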
+ */ + flow_flag_set(flow, TUN_RX); + flow->attr->tun_ip_version = ip_version; + return 0; +} + +static bool mlx5e_tc_flow_all_encaps_valid(struct mlx5_esw_flow_attr *esw_attr) +{ + bool all_flow_encaps_valid = true; + int i; + + /* Flow can be associated with multiple encap entries. + * Before offloading the flow verify that all of them have + * a valid neighbour. + */ + for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) { + if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP)) + continue; + if (!(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID)) { + all_flow_encaps_valid = false; + break; + } + } + + return all_flow_encaps_valid; +} + +void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct list_head *flow_list) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_pkt_reformat_params reformat_params; + struct mlx5_esw_flow_attr *esw_attr; + struct mlx5_flow_handle *rule; + struct mlx5_flow_attr *attr; + struct mlx5_flow_spec *spec; + struct mlx5e_tc_flow *flow; + int err; + int count = 0, ok = 0, skip = 0; + + if (e->flags & MLX5_ENCAP_ENTRY_NO_ROUTE) { + list_for_each_entry(flow, flow_list, tmp_list) { + if (!mlx5e_is_offloaded_flow(flow) || !flow_flag_test(flow, SLOW)) + continue; + count++; + } + mlx5_core_warn(priv->mdev, "Skip %d flows for e %p dst %pI4, no route.\n", + count, e, &e->tun_info->key.u.ipv4.dst); + return; + } + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = e->reformat_type; + reformat_params.size = e->encap_size; + reformat_params.data = e->encap_header; + e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, + &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(e->pkt_reformat)) { + mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %lu\n", + PTR_ERR(e->pkt_reformat)); + return; + } + e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(priv); + + list_for_each_entry(flow, flow_list, tmp_list) { + count++; + if (!mlx5e_is_offloaded_flow(flow) || !flow_flag_test(flow, SLOW)) { + skip++; + continue; + } + + spec = &flow->attr->parse_attr->spec; + + attr = mlx5e_tc_get_encap_attr(flow); + esw_attr = attr->esw_attr; + esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat; + esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; + + /* Do not offload flows with unresolved neighbors */ + if (!mlx5e_tc_flow_all_encaps_valid(esw_attr)) { + mlx5_core_warn(priv->mdev, "Not all encaps are valid for flow %p e %p dst %pI4\n", + flow, e, &e->tun_info->key.u.ipv4.dst); + skip++; + continue; + } + + err = mlx5e_tc_offload_flow_post_acts(flow); + if (err) { + mlx5_core_warn(priv->mdev, "Failed to update flow post acts, %d\n", + err); + continue; + } + + /* update from slow path rule to encap rule */ + rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, flow->attr); + if (IS_ERR(rule)) { + mlx5e_tc_unoffload_flow_post_acts(flow); + err = PTR_ERR(rule); + mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n", + err); + skip++; + continue; + } + + mlx5e_tc_unoffload_from_slow_path(esw, flow); + flow->rule[0] = rule; + /* was unset when slow path rule removed */ + flow_flag_set(flow, OFFLOADED); + ok++; + } + + if (count != ok || skip > 0 || (count == 0 && count == ok)) + mlx5_core_warn(priv->mdev, "Skip stats for e %p dst %pI4, count %d, ok %d, skip %d\n", + e, &e->tun_info->key.u.ipv4.dst, count, ok, skip); +} + +void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, + struct 
mlx5e_encap_entry *e, + struct list_head *flow_list) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *esw_attr; + struct mlx5_flow_handle *rule; + struct mlx5_flow_attr *attr; + struct mlx5_flow_spec *spec; + struct mlx5e_tc_flow *flow; + int err; + + list_for_each_entry(flow, flow_list, tmp_list) { + if (!mlx5e_is_offloaded_flow(flow) || flow_flag_test(flow, SLOW)) + continue; + + attr = mlx5e_tc_get_encap_attr(flow); + esw_attr = attr->esw_attr; + /* mark the flow's encap dest as non-valid */ + esw_attr->dests[flow->tmp_entry_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID; + esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL; + + /* update from encap rule to slow path rule */ + spec = &flow->attr->parse_attr->spec; + rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec); + + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n", + err); + continue; + } + + mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr); + mlx5e_tc_unoffload_flow_post_acts(flow); + flow->rule[0] = rule; + /* was unset when fast path rule removed */ + flow_flag_set(flow, OFFLOADED); + } + + /* we know that the encap is valid */ + e->flags &= ~MLX5_ENCAP_ENTRY_VALID; + mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); + e->pkt_reformat = NULL; +} + +static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow, + struct list_head *flow_list, + int index) +{ + if (IS_ERR(mlx5e_flow_get(flow))) { + /* Flow is being deleted concurrently. Wait for it to be + * unoffloaded from hardware, otherwise deleting encap will + * fail. + */ + wait_for_completion(&flow->del_hw_done); + return; + } + wait_for_completion(&flow->init_done); + + flow->tmp_entry_index = index; + list_add(&flow->tmp_list, flow_list); +} + +/* Takes reference to all flows attached to encap and adds the flows to + * flow_list using 'tmp_list' list_head in mlx5e_tc_flow. + */ +void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list) +{ + struct encap_flow_item *efi; + struct mlx5e_tc_flow *flow; + + list_for_each_entry(efi, &e->flows, list) { + flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]); + mlx5e_take_tmp_flow(flow, flow_list, efi->index); + } +} + +/* Takes reference to all flows attached to route and adds the flows to + * flow_list using 'tmp_list' list_head in mlx5e_tc_flow. + */ +static void mlx5e_take_all_route_decap_flows(struct mlx5e_route_entry *r, + struct list_head *flow_list) +{ + struct mlx5e_tc_flow *flow; + + list_for_each_entry(flow, &r->decap_flows, decap_routes) + mlx5e_take_tmp_flow(flow, flow_list, 0); +} + +typedef bool (match_cb)(struct mlx5e_encap_entry *); + +static struct mlx5e_encap_entry * +mlx5e_get_next_matching_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e, + match_cb match) +{ + struct mlx5e_encap_entry *next = NULL; + +retry: + rcu_read_lock(); + + /* find encap with non-zero reference counter value */ + for (next = e ? 
+ list_next_or_null_rcu(&nhe->encap_list, + &e->encap_list, + struct mlx5e_encap_entry, + encap_list) : + list_first_or_null_rcu(&nhe->encap_list, + struct mlx5e_encap_entry, + encap_list); + next; + next = list_next_or_null_rcu(&nhe->encap_list, + &next->encap_list, + struct mlx5e_encap_entry, + encap_list)) + if (mlx5e_encap_take(next)) + break; + + rcu_read_unlock(); + + /* release starting encap */ + if (e) + mlx5e_encap_put(netdev_priv(e->out_dev), e); + if (!next) + return next; + + /* wait for encap to be fully initialized */ + wait_for_completion(&next->res_ready); + /* continue searching if encap entry is not in valid state after completion */ + if (!match(next)) { + e = next; + goto retry; + } + + return next; +} + +static bool mlx5e_encap_valid(struct mlx5e_encap_entry *e) +{ + return e->flags & MLX5_ENCAP_ENTRY_VALID; +} + +static struct mlx5e_encap_entry * +mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e) +{ + return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_valid); +} + +static bool mlx5e_encap_initialized(struct mlx5e_encap_entry *e) +{ + return e->compl_result >= 0; +} + +struct mlx5e_encap_entry * +mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e) +{ + return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_initialized); +} + +void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_neigh *m_neigh = &nhe->m_neigh; + struct mlx5e_encap_entry *e = NULL; + struct mlx5e_tc_flow *flow; + struct mlx5_fc *counter; + struct neigh_table *tbl; + bool neigh_used = false; + struct neighbour *n; + u64 lastuse; + + if (m_neigh->family == AF_INET) + tbl = &arp_tbl; +#if IS_ENABLED(CONFIG_IPV6) + else if (m_neigh->family == AF_INET6) + tbl = ipv6_stub->nd_tbl; +#endif + else + return; + + /* mlx5e_get_next_valid_encap() releases previous encap before returning + * next one. 
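/*
 * Illustrative sketch, not part of the patch: the iteration discipline used
 * here - take a reference on the next element before dropping the one we
 * hold, so the current list position cannot be freed under the walker.  Toy
 * refcounting and a plain singly linked list stand in for the RCU list and
 * mlx5e_encap_take()/mlx5e_encap_put().
 */
#include <stdio.h>

struct toy_encap {
	int id;
	int refcnt;
	struct toy_encap *next;
};

static struct toy_encap *toy_get(struct toy_encap *e)
{
	if (e)
		e->refcnt++;
	return e;
}

static void toy_put(struct toy_encap *e)
{
	if (e && --e->refcnt == 0)
		printf("encap %d freed\n", e->id);
}

/* take the next element's reference first, then release the current one */
static struct toy_encap *toy_next(struct toy_encap *cur)
{
	struct toy_encap *next = toy_get(cur ? cur->next : NULL);

	toy_put(cur);
	return next;
}

int main(void)
{
	struct toy_encap c = { 3, 1, NULL };	/* refcnt 1: held by the list */
	struct toy_encap b = { 2, 1, &c };
	struct toy_encap a = { 1, 1, &b };
	struct toy_encap *e;

	/* mirrors: while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) */
	for (e = toy_get(&a); e; e = toy_next(e))
		printf("visiting encap %d\n", e->id);
	return 0;
}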
+ */ + while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) { + struct mlx5e_priv *priv = netdev_priv(e->out_dev); + struct encap_flow_item *efi, *tmp; + struct mlx5_eswitch *esw; + LIST_HEAD(flow_list); + + esw = priv->mdev->priv.eswitch; + mutex_lock(&esw->offloads.encap_tbl_lock); + list_for_each_entry_safe(efi, tmp, &e->flows, list) { + flow = container_of(efi, struct mlx5e_tc_flow, + encaps[efi->index]); + if (IS_ERR(mlx5e_flow_get(flow))) + continue; + list_add(&flow->tmp_list, &flow_list); + + if (mlx5e_is_offloaded_flow(flow)) { + counter = mlx5e_tc_get_counter(flow); + lastuse = mlx5_fc_query_lastuse(counter); + if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { + neigh_used = true; + break; + } + } + } + mutex_unlock(&esw->offloads.encap_tbl_lock); + + mlx5e_put_flow_list(priv, &flow_list); + if (neigh_used) { + /* release current encap before breaking the loop */ + mlx5e_encap_put(priv, e); + break; + } + } + + trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used); + + if (neigh_used) { + nhe->reported_lastuse = jiffies; + + /* find the relevant neigh according to the cached device and + * dst ip pair + */ + n = neigh_lookup(tbl, &m_neigh->dst_ip, READ_ONCE(nhe->neigh_dev)); + if (!n) + return; + + neigh_event_send(n, NULL); + neigh_release(n); + } +} + +static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) +{ + WARN_ON(!list_empty(&e->flows)); + + if (e->compl_result > 0) { + mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e); + + if (e->flags & MLX5_ENCAP_ENTRY_VALID) + mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); + } + + kfree(e->tun_info); + kfree(e->encap_header); + kfree_rcu(e, rcu); +} + +static void mlx5e_decap_dealloc(struct mlx5e_priv *priv, + struct mlx5e_decap_entry *d) +{ + WARN_ON(!list_empty(&d->flows)); + + if (!d->compl_result) + mlx5_packet_reformat_dealloc(priv->mdev, d->pkt_reformat); + + kfree_rcu(d, rcu); +} + +void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (!refcount_dec_and_mutex_lock(&e->refcnt, &esw->offloads.encap_tbl_lock)) + return; + list_del(&e->route_list); + hash_del_rcu(&e->encap_hlist); + mutex_unlock(&esw->offloads.encap_tbl_lock); + + mlx5e_encap_dealloc(priv, e); +} + +static void mlx5e_decap_put(struct mlx5e_priv *priv, struct mlx5e_decap_entry *d) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (!refcount_dec_and_mutex_lock(&d->refcnt, &esw->offloads.decap_tbl_lock)) + return; + hash_del_rcu(&d->hlist); + mutex_unlock(&esw->offloads.decap_tbl_lock); + + mlx5e_decap_dealloc(priv, d); +} + +static void mlx5e_detach_encap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + int out_index); + +void mlx5e_detach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + int out_index) +{ + struct mlx5e_encap_entry *e = flow->encaps[out_index].e; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (!mlx5e_is_eswitch_flow(flow)) + return; + + if (attr->esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + mlx5e_detach_encap_route(priv, flow, out_index); + + /* flow wasn't fully initialized */ + if (!e) + return; + + mutex_lock(&esw->offloads.encap_tbl_lock); + list_del(&flow->encaps[out_index].list); + flow->encaps[out_index].e = NULL; + if (!refcount_dec_and_test(&e->refcnt)) { + mutex_unlock(&esw->offloads.encap_tbl_lock); + return; + } + list_del(&e->route_list); + 
hash_del_rcu(&e->encap_hlist); + mutex_unlock(&esw->offloads.encap_tbl_lock); + + mlx5e_encap_dealloc(priv, e); +} + +void mlx5e_detach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_decap_entry *d = flow->decap_reformat; + + if (!d) + return; + + mutex_lock(&esw->offloads.decap_tbl_lock); + list_del(&flow->l3_to_l2_reformat); + flow->decap_reformat = NULL; + + if (!refcount_dec_and_test(&d->refcnt)) { + mutex_unlock(&esw->offloads.decap_tbl_lock); + return; + } + hash_del_rcu(&d->hlist); + mutex_unlock(&esw->offloads.decap_tbl_lock); + + mlx5e_decap_dealloc(priv, d); +} + +bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a, + struct mlx5e_encap_key *b) +{ + return memcmp(a->ip_tun_key, b->ip_tun_key, sizeof(*a->ip_tun_key)) == 0 && + a->tc_tunnel->tunnel_type == b->tc_tunnel->tunnel_type; +} + +static int cmp_decap_info(struct mlx5e_decap_key *a, + struct mlx5e_decap_key *b) +{ + return memcmp(&a->key, &b->key, sizeof(b->key)); +} + +static int hash_encap_info(struct mlx5e_encap_key *key) +{ + return jhash(key->ip_tun_key, sizeof(*key->ip_tun_key), + key->tc_tunnel->tunnel_type); +} + +static int hash_decap_info(struct mlx5e_decap_key *key) +{ + return jhash(&key->key, sizeof(key->key), 0); +} + +bool mlx5e_encap_take(struct mlx5e_encap_entry *e) +{ + return refcount_inc_not_zero(&e->refcnt); +} + +static bool mlx5e_decap_take(struct mlx5e_decap_entry *e) +{ + return refcount_inc_not_zero(&e->refcnt); +} + +static struct mlx5e_encap_entry * +mlx5e_encap_get(struct mlx5e_priv *priv, struct mlx5e_encap_key *key, + uintptr_t hash_key) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_encap_key e_key; + struct mlx5e_encap_entry *e; + + hash_for_each_possible_rcu(esw->offloads.encap_tbl, e, + encap_hlist, hash_key) { + e_key.ip_tun_key = &e->tun_info->key; + e_key.tc_tunnel = e->tunnel; + if (e->tunnel->encap_info_equal(&e_key, key) && + mlx5e_encap_take(e)) + return e; + } + + return NULL; +} + +static struct mlx5e_decap_entry * +mlx5e_decap_get(struct mlx5e_priv *priv, struct mlx5e_decap_key *key, + uintptr_t hash_key) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_decap_key r_key; + struct mlx5e_decap_entry *e; + + hash_for_each_possible_rcu(esw->offloads.decap_tbl, e, + hlist, hash_key) { + r_key = e->key; + if (!cmp_decap_info(&r_key, key) && + mlx5e_decap_take(e)) + return e; + } + return NULL; +} + +struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info) +{ + size_t tun_size = sizeof(*tun_info) + tun_info->options_len; + + return kmemdup(tun_info, tun_size, GFP_KERNEL); +} + +static bool is_duplicated_encap_entry(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + int out_index, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + int i; + + for (i = 0; i < out_index; i++) { + if (flow->encaps[i].e != e) + continue; + NL_SET_ERR_MSG_MOD(extack, "can't duplicate encap action"); + netdev_err(priv->netdev, "can't duplicate encap action\n"); + return true; + } + + return false; +} + +static int mlx5e_set_vf_tunnel(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + struct net_device *out_dev, + int route_dev_ifindex, + int out_index) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct net_device *route_dev; + u16 vport_num; + int err = 0; + u32 data; + + route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex); + + 
if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops || + !mlx5e_tc_is_vf_tunnel(out_dev, route_dev)) + goto out; + + err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num); + if (err) + goto out; + + attr->dest_chain = 0; + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE; + data = mlx5_eswitch_get_vport_metadata_for_set(esw_attr->in_mdev->priv.eswitch, + vport_num); + err = mlx5e_tc_match_to_reg_set_and_get_id(esw->dev, mod_hdr_acts, + MLX5_FLOW_NAMESPACE_FDB, + VPORT_TO_REG, data); + if (err >= 0) { + esw_attr->dests[out_index].src_port_rewrite_act_id = err; + err = 0; + } + +out: + if (route_dev) + dev_put(route_dev); + return err; +} + +static int mlx5e_update_vf_tunnel(struct mlx5_eswitch *esw, + struct mlx5_esw_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + struct net_device *out_dev, + int route_dev_ifindex, + int out_index) +{ + int act_id = attr->dests[out_index].src_port_rewrite_act_id; + struct net_device *route_dev; + u16 vport_num; + int err = 0; + u32 data; + + route_dev = dev_get_by_index(dev_net(out_dev), route_dev_ifindex); + + if (!route_dev || route_dev->netdev_ops != &mlx5e_netdev_ops || + !mlx5e_tc_is_vf_tunnel(out_dev, route_dev)) { + err = -ENODEV; + goto out; + } + + err = mlx5e_tc_query_route_vport(out_dev, route_dev, &vport_num); + if (err) + goto out; + + data = mlx5_eswitch_get_vport_metadata_for_set(attr->in_mdev->priv.eswitch, + vport_num); + mlx5e_tc_match_to_reg_mod_hdr_change(esw->dev, mod_hdr_acts, VPORT_TO_REG, act_id, data); + +out: + if (route_dev) + dev_put(route_dev); + return err; +} + +static unsigned int mlx5e_route_tbl_get_last_update(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5e_tc_tun_encap *encap; + unsigned int ret; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + encap = uplink_priv->encap; + + spin_lock_bh(&encap->route_lock); + ret = encap->route_tbl_last_update; + spin_unlock_bh(&encap->route_lock); + return ret; +} + +static int mlx5e_attach_encap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct mlx5e_encap_entry *e, + bool new_encap_entry, + unsigned long tbl_time_before, + int out_index); + +int mlx5e_attach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct net_device *mirred_dev, + int out_index, + struct netlink_ext_ack *extack, + struct net_device **encap_dev) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow_parse_attr *parse_attr; + const struct ip_tunnel_info *tun_info; + const struct mlx5e_mpls_info *mpls_info; + unsigned long tbl_time_before = 0; + struct mlx5e_encap_entry *e; + struct mlx5e_encap_key key; + bool entry_created = false; + unsigned short family; + uintptr_t hash_key; + int err = 0; + + parse_attr = attr->parse_attr; + tun_info = parse_attr->tun_info[out_index]; + mpls_info = &parse_attr->mpls_info[out_index]; + family = ip_tunnel_info_af(tun_info); + key.ip_tun_key = &tun_info->key; + key.tc_tunnel = mlx5e_get_tc_tun(mirred_dev); + if (!key.tc_tunnel) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel"); + return -EOPNOTSUPP; + } + + hash_key = hash_encap_info(&key); + + mutex_lock(&esw->offloads.encap_tbl_lock); + e = mlx5e_encap_get(priv, &key, hash_key); + + /* must verify 
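/*
 * Illustrative sketch, not part of the patch: the get-or-create shape of
 * mlx5e_encap_get()/mlx5e_attach_encap() - hash the tunnel key, reuse an
 * existing refcounted entry when the keys match, otherwise insert a fresh
 * one.  The res_ready completion, compl_result and the eswitch lock are
 * deliberately left out; the tiny open-addressed table is a stand-in for
 * esw->offloads.encap_tbl and jhash().
 */
#include <stdio.h>

struct toy_key { unsigned int dst_ip; unsigned short dst_port; };

struct toy_encap {
	struct toy_key key;
	int refcnt;
	int used;
};

static struct toy_encap table[8];

static unsigned int toy_hash(const struct toy_key *k)
{
	return (k->dst_ip ^ k->dst_port) % 8;
}

/* reuse a matching entry (taking a reference) or create a new one */
static struct toy_encap *toy_encap_get_create(const struct toy_key *k)
{
	struct toy_encap *e = &table[toy_hash(k)];

	if (e->used && e->key.dst_ip == k->dst_ip &&
	    e->key.dst_port == k->dst_port) {
		e->refcnt++;		/* mlx5e_encap_take()             */
		return e;
	}
	if (e->used)
		return NULL;		/* toy limitation: bucket is busy */
	e->key = *k;
	e->refcnt = 1;
	e->used = 1;			/* hash_add_rcu() + init          */
	return e;
}

int main(void)
{
	struct toy_key k = { .dst_ip = 0x0a000001, .dst_port = 4789 };
	struct toy_encap *a = toy_encap_get_create(&k);
	struct toy_encap *b = toy_encap_get_create(&k);	/* same key: shared */

	printf("shared=%d refcnt=%d\n", a == b, a->refcnt);
	return 0;
}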
if encap is valid or not */ + if (e) { + /* Check that entry was not already attached to this flow */ + if (is_duplicated_encap_entry(priv, flow, out_index, e, extack)) { + err = -EOPNOTSUPP; + goto out_err; + } + + mutex_unlock(&esw->offloads.encap_tbl_lock); + wait_for_completion(&e->res_ready); + + /* Protect against concurrent neigh update. */ + mutex_lock(&esw->offloads.encap_tbl_lock); + if (e->compl_result < 0) { + err = -EREMOTEIO; + goto out_err; + } + goto attach_flow; + } + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) { + err = -ENOMEM; + goto out_err; + } + + refcount_set(&e->refcnt, 1); + init_completion(&e->res_ready); + entry_created = true; + INIT_LIST_HEAD(&e->route_list); + + tun_info = mlx5e_dup_tun_info(tun_info); + if (!tun_info) { + err = -ENOMEM; + goto out_err_init; + } + e->tun_info = tun_info; + memcpy(&e->mpls_info, mpls_info, sizeof(*mpls_info)); + err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack); + if (err) + goto out_err_init; + + INIT_LIST_HEAD(&e->flows); + hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key); + tbl_time_before = mlx5e_route_tbl_get_last_update(priv); + mutex_unlock(&esw->offloads.encap_tbl_lock); + + if (family == AF_INET) + err = mlx5e_tc_tun_create_header_ipv4(priv, mirred_dev, e); + else if (family == AF_INET6) + err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e); + + /* Protect against concurrent neigh update. */ + mutex_lock(&esw->offloads.encap_tbl_lock); + complete_all(&e->res_ready); + if (err) { + e->compl_result = err; + goto out_err; + } + e->compl_result = 1; + +attach_flow: + err = mlx5e_attach_encap_route(priv, flow, attr, e, entry_created, + tbl_time_before, out_index); + if (err) + goto out_err; + + err = mlx5e_set_int_port_tunnel(priv, attr, e, out_index); + if (err == -EOPNOTSUPP) { + /* If device doesn't support int port offload, + * redirect to uplink vport. 
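/*
 * Illustrative sketch, not part of the patch: the error convention in the
 * block above.  -EOPNOTSUPP from the optional int-port setup is downgraded
 * to success (fall back to the uplink vport); any other error still fails
 * the attach.  The toy_* helpers are stand-ins, not driver functions.
 */
#include <errno.h>
#include <stdio.h>

/* stands in for mlx5e_set_int_port_tunnel(): the device may lack support */
static int toy_set_int_port(int supported)
{
	return supported ? 0 : -EOPNOTSUPP;
}

static int toy_attach(int int_port_supported)
{
	int err = toy_set_int_port(int_port_supported);

	if (err == -EOPNOTSUPP) {
		printf("int port not supported, using uplink\n");
		err = 0;		/* optional feature: keep going   */
	} else if (err) {
		return err;		/* real failure: abort the attach */
	}

	printf("flow attached\n");
	return 0;
}

int main(void)
{
	toy_attach(0);
	toy_attach(1);
	return 0;
}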
+ */ + mlx5_core_dbg(priv->mdev, "attaching int port as encap dev not supported, using uplink\n"); + err = 0; + } else if (err) { + goto out_err; + } + + flow->encaps[out_index].e = e; + list_add(&flow->encaps[out_index].list, &e->flows); + flow->encaps[out_index].index = out_index; + *encap_dev = e->out_dev; + if (e->flags & MLX5_ENCAP_ENTRY_VALID) { + attr->esw_attr->dests[out_index].pkt_reformat = e->pkt_reformat; + attr->esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; + } else { + flow_flag_set(flow, SLOW); + } + mutex_unlock(&esw->offloads.encap_tbl_lock); + + return err; + +out_err: + mutex_unlock(&esw->offloads.encap_tbl_lock); + if (e) + mlx5e_encap_put(priv, e); + return err; + +out_err_init: + mutex_unlock(&esw->offloads.encap_tbl_lock); + kfree(tun_info); + kfree(e); + return err; +} + +int mlx5e_attach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr; + struct mlx5_pkt_reformat_params reformat_params; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_decap_entry *d; + struct mlx5e_decap_key key; + uintptr_t hash_key; + int err = 0; + + parse_attr = flow->attr->parse_attr; + if (sizeof(parse_attr->eth) > MLX5_CAP_ESW(priv->mdev, max_encap_header_size)) { + NL_SET_ERR_MSG_MOD(extack, + "encap header larger than max supported"); + return -EOPNOTSUPP; + } + + key.key = parse_attr->eth; + hash_key = hash_decap_info(&key); + mutex_lock(&esw->offloads.decap_tbl_lock); + d = mlx5e_decap_get(priv, &key, hash_key); + if (d) { + mutex_unlock(&esw->offloads.decap_tbl_lock); + wait_for_completion(&d->res_ready); + mutex_lock(&esw->offloads.decap_tbl_lock); + if (d->compl_result) { + err = -EREMOTEIO; + goto out_free; + } + goto found; + } + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + err = -ENOMEM; + goto out_err; + } + + d->key = key; + refcount_set(&d->refcnt, 1); + init_completion(&d->res_ready); + INIT_LIST_HEAD(&d->flows); + hash_add_rcu(esw->offloads.decap_tbl, &d->hlist, hash_key); + mutex_unlock(&esw->offloads.decap_tbl_lock); + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + reformat_params.size = sizeof(parse_attr->eth); + reformat_params.data = &parse_attr->eth; + d->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev, + &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(d->pkt_reformat)) { + err = PTR_ERR(d->pkt_reformat); + d->compl_result = err; + } + mutex_lock(&esw->offloads.decap_tbl_lock); + complete_all(&d->res_ready); + if (err) + goto out_free; + +found: + flow->decap_reformat = d; + attr->decap_pkt_reformat = d->pkt_reformat; + list_add(&flow->l3_to_l2_reformat, &d->flows); + mutex_unlock(&esw->offloads.decap_tbl_lock); + return 0; + +out_free: + mutex_unlock(&esw->offloads.decap_tbl_lock); + mlx5e_decap_put(priv, d); + return err; + +out_err: + mutex_unlock(&esw->offloads.decap_tbl_lock); + return err; +} + +static int cmp_route_info(struct mlx5e_route_key *a, + struct mlx5e_route_key *b) +{ + if (a->ip_version == 4 && b->ip_version == 4) + return memcmp(&a->endpoint_ip.v4, &b->endpoint_ip.v4, + sizeof(a->endpoint_ip.v4)); + else if (a->ip_version == 6 && b->ip_version == 6) + return memcmp(&a->endpoint_ip.v6, &b->endpoint_ip.v6, + sizeof(a->endpoint_ip.v6)); + return 1; +} + +static u32 hash_route_info(struct mlx5e_route_key *key) +{ + if (key->ip_version == 4) + return 
jhash(&key->endpoint_ip.v4, sizeof(key->endpoint_ip.v4), 0); + return jhash(&key->endpoint_ip.v6, sizeof(key->endpoint_ip.v6), 0); +} + +static void mlx5e_route_dealloc(struct mlx5e_priv *priv, + struct mlx5e_route_entry *r) +{ + WARN_ON(!list_empty(&r->decap_flows)); + WARN_ON(!list_empty(&r->encap_entries)); + + kfree_rcu(r, rcu); +} + +static void mlx5e_route_put(struct mlx5e_priv *priv, struct mlx5e_route_entry *r) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (!refcount_dec_and_mutex_lock(&r->refcnt, &esw->offloads.encap_tbl_lock)) + return; + + hash_del_rcu(&r->hlist); + mutex_unlock(&esw->offloads.encap_tbl_lock); + + mlx5e_route_dealloc(priv, r); +} + +static void mlx5e_route_put_locked(struct mlx5e_priv *priv, struct mlx5e_route_entry *r) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + lockdep_assert_held(&esw->offloads.encap_tbl_lock); + + if (!refcount_dec_and_test(&r->refcnt)) + return; + hash_del_rcu(&r->hlist); + mlx5e_route_dealloc(priv, r); +} + +static struct mlx5e_route_entry * +mlx5e_route_get(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key, + u32 hash_key) +{ + struct mlx5e_route_key r_key; + struct mlx5e_route_entry *r; + + hash_for_each_possible(encap->route_tbl, r, hlist, hash_key) { + r_key = r->key; + if (!cmp_route_info(&r_key, key) && + refcount_inc_not_zero(&r->refcnt)) + return r; + } + return NULL; +} + +static struct mlx5e_route_entry * +mlx5e_route_get_create(struct mlx5e_priv *priv, + struct mlx5e_route_key *key, + int tunnel_dev_index, + unsigned long *route_tbl_change_time) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5e_tc_tun_encap *encap; + struct mlx5e_route_entry *r; + u32 hash_key; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + encap = uplink_priv->encap; + + hash_key = hash_route_info(key); + spin_lock_bh(&encap->route_lock); + r = mlx5e_route_get(encap, key, hash_key); + spin_unlock_bh(&encap->route_lock); + if (r) { + if (!mlx5e_route_entry_valid(r)) { + mlx5e_route_put_locked(priv, r); + return ERR_PTR(-EINVAL); + } + return r; + } + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return ERR_PTR(-ENOMEM); + + r->key = *key; + r->flags |= MLX5E_ROUTE_ENTRY_VALID; + r->tunnel_dev_index = tunnel_dev_index; + refcount_set(&r->refcnt, 1); + INIT_LIST_HEAD(&r->decap_flows); + INIT_LIST_HEAD(&r->encap_entries); + + spin_lock_bh(&encap->route_lock); + *route_tbl_change_time = encap->route_tbl_last_update; + hash_add(encap->route_tbl, &r->hlist, hash_key); + spin_unlock_bh(&encap->route_lock); + + return r; +} + +static struct mlx5e_route_entry * +mlx5e_route_lookup_for_update(struct mlx5e_tc_tun_encap *encap, struct mlx5e_route_key *key) +{ + u32 hash_key = hash_route_info(key); + struct mlx5e_route_entry *r; + + spin_lock_bh(&encap->route_lock); + encap->route_tbl_last_update = jiffies; + r = mlx5e_route_get(encap, key, hash_key); + spin_unlock_bh(&encap->route_lock); + + return r; +} + +struct mlx5e_tc_fib_event_data { + struct work_struct work; + unsigned long event; + struct mlx5e_route_entry *r; + struct net_device *ul_dev; +}; + +static void mlx5e_tc_fib_event_work(struct work_struct *work); +static struct mlx5e_tc_fib_event_data * +mlx5e_tc_init_fib_work(unsigned long event, struct net_device *ul_dev, gfp_t flags) +{ + struct mlx5e_tc_fib_event_data *fib_work; + + fib_work = kzalloc(sizeof(*fib_work), flags); + if 
(WARN_ON(!fib_work)) + return NULL; + + INIT_WORK(&fib_work->work, mlx5e_tc_fib_event_work); + fib_work->event = event; + fib_work->ul_dev = ul_dev; + + return fib_work; +} + +static int +mlx5e_route_enqueue_update(struct mlx5e_priv *priv, + struct mlx5e_route_entry *r, + unsigned long event) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_fib_event_data *fib_work; + struct mlx5e_rep_priv *uplink_rpriv; + struct net_device *ul_dev; + + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + ul_dev = uplink_rpriv->netdev; + + fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_KERNEL); + if (!fib_work) + return -ENOMEM; + + dev_hold(ul_dev); + refcount_inc(&r->refcnt); + fib_work->r = r; + queue_work(priv->wq, &fib_work->work); + + return 0; +} + +int mlx5e_attach_decap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + unsigned long tbl_time_before, tbl_time_after; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_esw_flow_attr *esw_attr; + struct mlx5e_route_entry *r; + struct mlx5e_route_key key; + int err = 0; + + esw_attr = attr->esw_attr; + parse_attr = attr->parse_attr; + mutex_lock(&esw->offloads.encap_tbl_lock); + if (!esw_attr->rx_tun_attr) + goto out; + + tbl_time_before = mlx5e_route_tbl_get_last_update(priv); + tbl_time_after = tbl_time_before; + err = mlx5e_tc_tun_route_lookup(priv, &parse_attr->spec, attr, parse_attr->filter_dev); + if (err || !esw_attr->rx_tun_attr->decap_vport) + goto out; + + key.ip_version = attr->tun_ip_version; + if (key.ip_version == 4) + key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4; + else + key.endpoint_ip.v6 = esw_attr->rx_tun_attr->dst_ip.v6; + + r = mlx5e_route_get_create(priv, &key, parse_attr->filter_dev->ifindex, + &tbl_time_after); + if (IS_ERR(r)) { + err = PTR_ERR(r); + goto out; + } + /* Routing changed concurrently. FIB event handler might have missed new + * entry, schedule update. 
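+ * mlx5e_route_enqueue_update() takes an extra reference to the route
+ * entry before queueing the work, so the entry stays valid until the
+ * scheduled update has run.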
+ */ + if (tbl_time_before != tbl_time_after) { + err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE); + if (err) { + mlx5e_route_put_locked(priv, r); + goto out; + } + } + + flow->decap_route = r; + list_add(&flow->decap_routes, &r->decap_flows); + mutex_unlock(&esw->offloads.encap_tbl_lock); + return 0; + +out: + mutex_unlock(&esw->offloads.encap_tbl_lock); + return err; +} + +static int mlx5e_attach_encap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct mlx5e_encap_entry *e, + bool new_encap_entry, + unsigned long tbl_time_before, + int out_index) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + unsigned long tbl_time_after = tbl_time_before; + struct mlx5e_tc_flow_parse_attr *parse_attr; + const struct ip_tunnel_info *tun_info; + struct mlx5_esw_flow_attr *esw_attr; + struct mlx5e_route_entry *r; + struct mlx5e_route_key key; + unsigned short family; + int err = 0; + + esw_attr = attr->esw_attr; + parse_attr = attr->parse_attr; + tun_info = parse_attr->tun_info[out_index]; + family = ip_tunnel_info_af(tun_info); + + if (family == AF_INET) { + key.endpoint_ip.v4 = tun_info->key.u.ipv4.src; + key.ip_version = 4; + } else if (family == AF_INET6) { + key.endpoint_ip.v6 = tun_info->key.u.ipv6.src; + key.ip_version = 6; + } + + err = mlx5e_set_vf_tunnel(esw, attr, &parse_attr->mod_hdr_acts, e->out_dev, + e->route_dev_ifindex, out_index); + if (err || !(esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE)) + return err; + + r = mlx5e_route_get_create(priv, &key, parse_attr->mirred_ifindex[out_index], + &tbl_time_after); + if (IS_ERR(r)) + return PTR_ERR(r); + /* Routing changed concurrently. FIB event handler might have missed new + * entry, schedule update. 
+ */ + if (tbl_time_before != tbl_time_after) { + err = mlx5e_route_enqueue_update(priv, r, FIB_EVENT_ENTRY_REPLACE); + if (err) { + mlx5e_route_put_locked(priv, r); + return err; + } + } + + flow->encap_routes[out_index].r = r; + if (new_encap_entry) + list_add(&e->route_list, &r->encap_entries); + flow->encap_routes[out_index].index = out_index; + return 0; +} + +void mlx5e_detach_decap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_route_entry *r = flow->decap_route; + + if (!r) + return; + + mutex_lock(&esw->offloads.encap_tbl_lock); + list_del(&flow->decap_routes); + flow->decap_route = NULL; + + if (!refcount_dec_and_test(&r->refcnt)) { + mutex_unlock(&esw->offloads.encap_tbl_lock); + return; + } + hash_del_rcu(&r->hlist); + mutex_unlock(&esw->offloads.encap_tbl_lock); + + mlx5e_route_dealloc(priv, r); +} + +static void mlx5e_detach_encap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + int out_index) +{ + struct mlx5e_route_entry *r = flow->encap_routes[out_index].r; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_encap_entry *e, *tmp; + + if (!r) + return; + + mutex_lock(&esw->offloads.encap_tbl_lock); + flow->encap_routes[out_index].r = NULL; + + if (!refcount_dec_and_test(&r->refcnt)) { + mutex_unlock(&esw->offloads.encap_tbl_lock); + return; + } + list_for_each_entry_safe(e, tmp, &r->encap_entries, route_list) + list_del_init(&e->route_list); + hash_del_rcu(&r->hlist); + mutex_unlock(&esw->offloads.encap_tbl_lock); + + mlx5e_route_dealloc(priv, r); +} + +static void mlx5e_invalidate_encap(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct list_head *encap_flows) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow *flow; + + list_for_each_entry(flow, encap_flows, tmp_list) { + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_esw_flow_attr *esw_attr; + + if (!mlx5e_is_offloaded_flow(flow)) + continue; + esw_attr = attr->esw_attr; + + if (flow_flag_test(flow, SLOW)) + mlx5e_tc_unoffload_from_slow_path(esw, flow); + else + mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr); + mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr); + attr->modify_hdr = NULL; + + esw_attr->dests[flow->tmp_entry_index].flags &= + ~MLX5_ESW_DEST_ENCAP_VALID; + esw_attr->dests[flow->tmp_entry_index].pkt_reformat = NULL; + } + + e->flags |= MLX5_ENCAP_ENTRY_NO_ROUTE; + if (e->flags & MLX5_ENCAP_ENTRY_VALID) { + e->flags &= ~MLX5_ENCAP_ENTRY_VALID; + mlx5_packet_reformat_dealloc(priv->mdev, e->pkt_reformat); + e->pkt_reformat = NULL; + } +} + +static void mlx5e_reoffload_encap(struct mlx5e_priv *priv, + struct net_device *tunnel_dev, + struct mlx5e_encap_entry *e, + struct list_head *encap_flows) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow *flow; + int err; + + err = ip_tunnel_info_af(e->tun_info) == AF_INET ? 
+ mlx5e_tc_tun_update_header_ipv4(priv, tunnel_dev, e) : + mlx5e_tc_tun_update_header_ipv6(priv, tunnel_dev, e); + if (err) + mlx5_core_warn(priv->mdev, "Failed to update encap header, %d", err); + e->flags &= ~MLX5_ENCAP_ENTRY_NO_ROUTE; + + list_for_each_entry(flow, encap_flows, tmp_list) { + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_esw_flow_attr *esw_attr; + struct mlx5_flow_handle *rule; + struct mlx5_flow_attr *attr; + struct mlx5_flow_spec *spec; + + if (flow_flag_test(flow, FAILED)) + continue; + + spec = &flow->attr->parse_attr->spec; + + attr = mlx5e_tc_get_encap_attr(flow); + esw_attr = attr->esw_attr; + parse_attr = attr->parse_attr; + + err = mlx5e_update_vf_tunnel(esw, esw_attr, &parse_attr->mod_hdr_acts, + e->out_dev, e->route_dev_ifindex, + flow->tmp_entry_index); + if (err) { + mlx5_core_warn(priv->mdev, "Failed to update VF tunnel err=%d", err); + continue; + } + + err = mlx5e_tc_add_flow_mod_hdr(priv, flow, attr); + if (err) { + mlx5_core_warn(priv->mdev, "Failed to update flow mod_hdr err=%d", + err); + continue; + } + + if (e->flags & MLX5_ENCAP_ENTRY_VALID) { + esw_attr->dests[flow->tmp_entry_index].pkt_reformat = e->pkt_reformat; + esw_attr->dests[flow->tmp_entry_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; + if (!mlx5e_tc_flow_all_encaps_valid(esw_attr)) + goto offload_to_slow_path; + + err = mlx5e_tc_offload_flow_post_acts(flow); + if (err) { + mlx5_core_warn(priv->mdev, "Failed to update flow post acts, %d\n", + err); + goto offload_to_slow_path; + } + + /* update from slow path rule to encap rule */ + rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, flow->attr); + if (IS_ERR(rule)) { + mlx5e_tc_unoffload_flow_post_acts(flow); + err = PTR_ERR(rule); + mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n", + err); + } else { + flow->rule[0] = rule; + } + } else { +offload_to_slow_path: + rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec); + /* mark the flow's encap dest as non-valid */ + esw_attr->dests[flow->tmp_entry_index].flags &= + ~MLX5_ESW_DEST_ENCAP_VALID; + + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n", + err); + } else { + flow->rule[0] = rule; + } + } + flow_flag_set(flow, OFFLOADED); + } +} + +static int mlx5e_update_route_encaps(struct mlx5e_priv *priv, + struct mlx5e_route_entry *r, + struct list_head *flow_list, + bool replace) +{ + struct net_device *tunnel_dev; + struct mlx5e_encap_entry *e; + + tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index); + if (!tunnel_dev) + return -ENODEV; + + list_for_each_entry(e, &r->encap_entries, route_list) { + LIST_HEAD(encap_flows); + + mlx5e_take_all_encap_flows(e, &encap_flows); + if (list_empty(&encap_flows)) + continue; + + if (mlx5e_route_entry_valid(r)) + mlx5e_invalidate_encap(priv, e, &encap_flows); + + if (!replace) { + list_splice(&encap_flows, flow_list); + continue; + } + + mlx5e_reoffload_encap(priv, tunnel_dev, e, &encap_flows); + list_splice(&encap_flows, flow_list); + } + + return 0; +} + +static void mlx5e_unoffload_flow_list(struct mlx5e_priv *priv, + struct list_head *flow_list) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow *flow; + + list_for_each_entry(flow, flow_list, tmp_list) + if (mlx5e_is_offloaded_flow(flow)) + mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->attr); +} + +static void mlx5e_reoffload_decap(struct mlx5e_priv *priv, + struct list_head *decap_flows) +{ + struct mlx5_eswitch *esw = 
priv->mdev->priv.eswitch; + struct mlx5e_tc_flow *flow; + + list_for_each_entry(flow, decap_flows, tmp_list) { + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err; + + if (flow_flag_test(flow, FAILED)) + continue; + + parse_attr = attr->parse_attr; + spec = &parse_attr->spec; + err = mlx5e_tc_tun_route_lookup(priv, spec, attr, parse_attr->filter_dev); + if (err) { + mlx5_core_warn(priv->mdev, "Failed to lookup route for flow, %d\n", + err); + continue; + } + + rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, attr); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_warn(priv->mdev, "Failed to update cached decap flow, %d\n", + err); + } else { + flow->rule[0] = rule; + flow_flag_set(flow, OFFLOADED); + } + } +} + +static int mlx5e_update_route_decap_flows(struct mlx5e_priv *priv, + struct mlx5e_route_entry *r, + struct list_head *flow_list, + bool replace) +{ + struct net_device *tunnel_dev; + LIST_HEAD(decap_flows); + + tunnel_dev = __dev_get_by_index(dev_net(priv->netdev), r->tunnel_dev_index); + if (!tunnel_dev) + return -ENODEV; + + mlx5e_take_all_route_decap_flows(r, &decap_flows); + if (mlx5e_route_entry_valid(r)) + mlx5e_unoffload_flow_list(priv, &decap_flows); + if (replace) + mlx5e_reoffload_decap(priv, &decap_flows); + + list_splice(&decap_flows, flow_list); + + return 0; +} + +static void mlx5e_tc_fib_event_work(struct work_struct *work) +{ + struct mlx5e_tc_fib_event_data *event_data = + container_of(work, struct mlx5e_tc_fib_event_data, work); + struct net_device *ul_dev = event_data->ul_dev; + struct mlx5e_priv *priv = netdev_priv(ul_dev); + struct mlx5e_route_entry *r = event_data->r; + struct mlx5_eswitch *esw; + LIST_HEAD(flow_list); + bool replace; + int err; + + /* sync with concurrent neigh updates */ + rtnl_lock(); + esw = priv->mdev->priv.eswitch; + mutex_lock(&esw->offloads.encap_tbl_lock); + replace = event_data->event == FIB_EVENT_ENTRY_REPLACE; + + if (!mlx5e_route_entry_valid(r) && !replace) + goto out; + + err = mlx5e_update_route_encaps(priv, r, &flow_list, replace); + if (err) + mlx5_core_warn(priv->mdev, "Failed to update route encaps, %d\n", + err); + + err = mlx5e_update_route_decap_flows(priv, r, &flow_list, replace); + if (err) + mlx5_core_warn(priv->mdev, "Failed to update route decap flows, %d\n", + err); + + if (replace) + r->flags |= MLX5E_ROUTE_ENTRY_VALID; +out: + mutex_unlock(&esw->offloads.encap_tbl_lock); + rtnl_unlock(); + + mlx5e_put_flow_list(priv, &flow_list); + mlx5e_route_put(priv, event_data->r); + dev_put(event_data->ul_dev); + kfree(event_data); +} + +static struct mlx5e_tc_fib_event_data * +mlx5e_init_fib_work_ipv4(struct mlx5e_priv *priv, + struct net_device *ul_dev, + struct mlx5e_tc_tun_encap *encap, + unsigned long event, + struct fib_notifier_info *info) +{ + struct fib_entry_notifier_info *fen_info; + struct mlx5e_tc_fib_event_data *fib_work; + struct mlx5e_route_entry *r; + struct mlx5e_route_key key; + struct net_device *fib_dev; + + fen_info = container_of(info, struct fib_entry_notifier_info, info); + if (fen_info->fi->nh) + return NULL; + fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev; + if (!fib_dev || fib_dev->netdev_ops != &mlx5e_netdev_ops || + fen_info->dst_len != 32) + return NULL; + + fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC); + if (!fib_work) + return ERR_PTR(-ENOMEM); + + key.endpoint_ip.v4 = htonl(fen_info->dst); + key.ip_version = 4; + + /* Can't fail after this point 
because releasing reference to r + * requires obtaining sleeping mutex which we can't do in atomic + * context. + */ + r = mlx5e_route_lookup_for_update(encap, &key); + if (!r) + goto out; + fib_work->r = r; + dev_hold(ul_dev); + + return fib_work; + +out: + kfree(fib_work); + return NULL; +} + +static struct mlx5e_tc_fib_event_data * +mlx5e_init_fib_work_ipv6(struct mlx5e_priv *priv, + struct net_device *ul_dev, + struct mlx5e_tc_tun_encap *encap, + unsigned long event, + struct fib_notifier_info *info) +{ + struct fib6_entry_notifier_info *fen_info; + struct mlx5e_tc_fib_event_data *fib_work; + struct mlx5e_route_entry *r; + struct mlx5e_route_key key; + struct net_device *fib_dev; + + fen_info = container_of(info, struct fib6_entry_notifier_info, info); + fib_dev = fib6_info_nh_dev(fen_info->rt); + if (fib_dev->netdev_ops != &mlx5e_netdev_ops || + fen_info->rt->fib6_dst.plen != 128) + return NULL; + + fib_work = mlx5e_tc_init_fib_work(event, ul_dev, GFP_ATOMIC); + if (!fib_work) + return ERR_PTR(-ENOMEM); + + memcpy(&key.endpoint_ip.v6, &fen_info->rt->fib6_dst.addr, + sizeof(fen_info->rt->fib6_dst.addr)); + key.ip_version = 6; + + /* Can't fail after this point because releasing reference to r + * requires obtaining sleeping mutex which we can't do in atomic + * context. + */ + r = mlx5e_route_lookup_for_update(encap, &key); + if (!r) + goto out; + fib_work->r = r; + dev_hold(ul_dev); + + return fib_work; + +out: + kfree(fib_work); + return NULL; +} + +static int mlx5e_tc_tun_fib_event(struct notifier_block *nb, unsigned long event, void *ptr) +{ + struct mlx5e_tc_fib_event_data *fib_work; + struct fib_notifier_info *info = ptr; + struct mlx5e_tc_tun_encap *encap; + struct net_device *ul_dev; + struct mlx5e_priv *priv; + + encap = container_of(nb, struct mlx5e_tc_tun_encap, fib_nb); + priv = encap->priv; + ul_dev = priv->netdev; + priv = netdev_priv(ul_dev); + + switch (event) { + case FIB_EVENT_ENTRY_REPLACE: + case FIB_EVENT_ENTRY_DEL: + if (info->family == AF_INET) + fib_work = mlx5e_init_fib_work_ipv4(priv, ul_dev, encap, event, info); + else if (info->family == AF_INET6) + fib_work = mlx5e_init_fib_work_ipv6(priv, ul_dev, encap, event, info); + else + return NOTIFY_DONE; + + if (!IS_ERR_OR_NULL(fib_work)) { + queue_work(priv->wq, &fib_work->work); + } else if (IS_ERR(fib_work)) { + NL_SET_ERR_MSG_MOD(info->extack, "Failed to init fib work"); + mlx5_core_warn(priv->mdev, "Failed to init fib work, %ld\n", + PTR_ERR(fib_work)); + } + + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_DONE; +} + +struct mlx5e_tc_tun_encap *mlx5e_tc_tun_init(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_tun_encap *encap; + int err; + + encap = kvzalloc(sizeof(*encap), GFP_KERNEL); + if (!encap) + return ERR_PTR(-ENOMEM); + + encap->priv = priv; + encap->fib_nb.notifier_call = mlx5e_tc_tun_fib_event; + spin_lock_init(&encap->route_lock); + hash_init(encap->route_tbl); + err = register_fib_notifier(dev_net(priv->netdev), &encap->fib_nb, + NULL, NULL); + if (err) { + kvfree(encap); + return ERR_PTR(err); + } + + return encap; +} + +void mlx5e_tc_tun_cleanup(struct mlx5e_tc_tun_encap *encap) +{ + if (!encap) + return; + + unregister_fib_notifier(dev_net(encap->priv->netdev), &encap->fib_nb); + flush_workqueue(encap->priv->wq); /* flush fib event works */ + kvfree(encap); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h new file mode 100644 index 0000000..8ad273d 
--- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. */ + +#ifndef __MLX5_EN_TC_TUN_ENCAP_H__ +#define __MLX5_EN_TC_TUN_ENCAP_H__ + +#include "tc_priv.h" + +void mlx5e_detach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + int out_index); + +int mlx5e_attach_encap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct net_device *mirred_dev, + int out_index, + struct netlink_ext_ack *extack, + struct net_device **encap_dev); + +int mlx5e_attach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack); +void mlx5e_detach_decap(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow); + +int mlx5e_attach_decap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow); +void mlx5e_detach_decap_route(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow); + +struct ip_tunnel_info *mlx5e_dup_tun_info(const struct ip_tunnel_info *tun_info); + +int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec); + +struct mlx5e_tc_tun_encap *mlx5e_tc_tun_init(struct mlx5e_priv *priv); +void mlx5e_tc_tun_cleanup(struct mlx5e_tc_tun_encap *encap); + +#endif /* __MLX5_EN_TC_TUN_ENCAP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c new file mode 100644 index 0000000..054d80c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies. */ + +#include +#include "lib/geneve.h" +#include "en/tc_tun.h" + +#define MLX5E_GENEVE_VER 0 + +static bool mlx5e_tc_tun_can_offload_geneve(struct mlx5e_priv *priv) +{ + return !!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & MLX5_FLEX_PROTO_GENEVE); +} + +static int mlx5e_tc_tun_calc_hlen_geneve(struct mlx5e_encap_entry *e) +{ + return sizeof(struct udphdr) + + sizeof(struct genevehdr) + + e->tun_info->options_len; +} + +static int mlx5e_tc_tun_check_udp_dport_geneve(struct mlx5e_priv *priv, + struct flow_cls_offload *f) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct flow_match_ports enc_ports; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) + return -EOPNOTSUPP; + + flow_rule_match_enc_ports(rule, &enc_ports); + + /* Currently we support only default GENEVE + * port, so udp dst port must match. 
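+ * The default is the IANA-assigned UDP port 6081 (GENEVE_UDP_PORT).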
+ */ + if (be16_to_cpu(enc_ports.key->dst) != GENEVE_UDP_PORT) { + NL_SET_ERR_MSG_MOD(extack, + "Matched UDP dst port is not registered as a GENEVE port"); + netdev_warn(priv->netdev, + "UDP port %d is not registered as a GENEVE port\n", + be16_to_cpu(enc_ports.key->dst)); + return -EOPNOTSUPP; + } + + return 0; +} + +static int mlx5e_tc_tun_parse_udp_ports_geneve(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + int err; + + err = mlx5e_tc_tun_parse_udp_ports(priv, spec, f, headers_c, headers_v); + if (err) + return err; + + return mlx5e_tc_tun_check_udp_dport_geneve(priv, f); +} + +static int mlx5e_tc_tun_init_encap_attr_geneve(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + e->tunnel = &geneve_tunnel; + + /* Reformat type for GENEVE encap is similar to VXLAN: + * in both cases the HW adds in the same place a + * defined encapsulation header that the SW provides. + */ + e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_VXLAN; + return 0; +} + +static void mlx5e_tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +static int mlx5e_gen_ip_tunnel_header_geneve(char buf[], + __u8 *ip_proto, + struct mlx5e_encap_entry *e) +{ + const struct ip_tunnel_info *tun_info = e->tun_info; + struct udphdr *udp = (struct udphdr *)(buf); + struct genevehdr *geneveh; + + geneveh = (struct genevehdr *)((char *)udp + sizeof(struct udphdr)); + + *ip_proto = IPPROTO_UDP; + + udp->dest = tun_info->key.tp_dst; + + memset(geneveh, 0, sizeof(*geneveh)); + geneveh->ver = MLX5E_GENEVE_VER; + geneveh->opt_len = tun_info->options_len / 4; + geneveh->oam = !!(tun_info->key.tun_flags & TUNNEL_OAM); + geneveh->critical = !!(tun_info->key.tun_flags & TUNNEL_CRIT_OPT); + mlx5e_tunnel_id_to_vni(tun_info->key.tun_id, geneveh->vni); + geneveh->proto_type = htons(ETH_P_TEB); + + if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) { + if (!geneveh->opt_len) + return -EOPNOTSUPP; + ip_tunnel_info_opts_get(geneveh->options, tun_info); + } + + return 0; +} + +static int mlx5e_tc_tun_parse_geneve_vni(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct flow_match_enc_keyid enc_keyid; + void *misc_c, *misc_v; + + misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) + return 0; + + flow_rule_match_enc_keyid(rule, &enc_keyid); + + if (!enc_keyid.mask->keyid) + return 0; + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, ft_field_support.outer_geneve_vni)) { + NL_SET_ERR_MSG_MOD(extack, "Matching on GENEVE VNI is not supported"); + netdev_warn(priv->netdev, "Matching on GENEVE VNI is not supported\n"); + return -EOPNOTSUPP; + } + + MLX5_SET(fte_match_set_misc, misc_c, geneve_vni, be32_to_cpu(enc_keyid.mask->keyid)); + MLX5_SET(fte_match_set_misc, misc_v, geneve_vni, be32_to_cpu(enc_keyid.key->keyid)); + + return 0; +} + +static int 
mlx5e_tc_tun_parse_geneve_options(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f) +{ + u8 max_tlv_option_data_len = MLX5_CAP_GEN(priv->mdev, max_geneve_tlv_option_data_len); + u8 max_tlv_options = MLX5_CAP_GEN(priv->mdev, max_geneve_tlv_options); + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + void *misc_c, *misc_v, *misc_3_c, *misc_3_v; + struct geneve_opt *option_key, *option_mask; + __be32 opt_data_key = 0, opt_data_mask = 0; + struct flow_match_enc_opts enc_opts; + int res = 0; + + misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + misc_3_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_3); + misc_3_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_3); + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_OPTS)) + return 0; + + flow_rule_match_enc_opts(rule, &enc_opts); + + if (memchr_inv(&enc_opts.mask->data, 0, sizeof(enc_opts.mask->data)) && + !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.geneve_tlv_option_0_data)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options is not supported"); + netdev_warn(priv->netdev, + "Matching on GENEVE options is not supported\n"); + return -EOPNOTSUPP; + } + + /* make sure that we're talking about GENEVE options */ + + if (enc_opts.key->dst_opt_type != TUNNEL_GENEVE_OPT) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: option type is not GENEVE"); + netdev_warn(priv->netdev, + "Matching on GENEVE options: option type is not GENEVE\n"); + return -EOPNOTSUPP; + } + + if (enc_opts.mask->len && + !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_geneve_opt_len)) { + NL_SET_ERR_MSG_MOD(extack, "Matching on GENEVE options len is not supported"); + netdev_warn(priv->netdev, + "Matching on GENEVE options len is not supported\n"); + return -EOPNOTSUPP; + } + + /* max_geneve_tlv_option_data_len comes in multiples of 4 bytes, and it + * doesn't include the TLV option header. 'geneve_opt_len' is a total + * len of all the options, including the headers, also multiples of 4 + * bytes. Len that comes from the dissector is in bytes. 
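+ * For example, with max_geneve_tlv_option_data_len == 1 and
+ * max_geneve_tlv_options == 8 the check below allows at most
+ * (1 + 1) * 8 = 16 words, i.e. 64 bytes of TLV options.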
+ */ + + if ((enc_opts.key->len / 4) > ((max_tlv_option_data_len + 1) * max_tlv_options)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: unsupported options len"); + netdev_warn(priv->netdev, + "Matching on GENEVE options: unsupported options len (len=%d)\n", + enc_opts.key->len); + return -EOPNOTSUPP; + } + + MLX5_SET(fte_match_set_misc, misc_c, geneve_opt_len, enc_opts.mask->len / 4); + MLX5_SET(fte_match_set_misc, misc_v, geneve_opt_len, enc_opts.key->len / 4); + + /* we support matching on one option only, so just get it */ + option_key = (struct geneve_opt *)&enc_opts.key->data[0]; + option_mask = (struct geneve_opt *)&enc_opts.mask->data[0]; + + if (option_mask->opt_class == 0 && option_mask->type == 0 && + !memchr_inv(option_mask->opt_data, 0, option_mask->length * 4)) + return 0; + + if (option_key->length > max_tlv_option_data_len) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: unsupported option len"); + netdev_warn(priv->netdev, + "Matching on GENEVE options: unsupported option len (key=%d, mask=%d)\n", + option_key->length, option_mask->length); + return -EOPNOTSUPP; + } + + /* data can't be all 0 - fail to offload such rule */ + if (!memchr_inv(option_key->opt_data, 0, option_key->length * 4)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: can't match on 0 data field"); + netdev_warn(priv->netdev, + "Matching on GENEVE options: can't match on 0 data field\n"); + return -EOPNOTSUPP; + } + + /* add new GENEVE TLV options object */ + res = mlx5_geneve_tlv_option_add(priv->mdev->geneve, option_key); + if (res) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on GENEVE options: failed creating TLV opt object"); + netdev_warn(priv->netdev, + "Matching on GENEVE options: failed creating TLV opt object (class:type:len = 0x%x:0x%x:%d)\n", + be16_to_cpu(option_key->opt_class), + option_key->type, option_key->length); + return res; + } + + /* In general, after creating the object, need to query it + * in order to check which option data to set in misc3. + * But we support only geneve_tlv_option_0_data, so no + * point querying at this stage. 
+ */ + + memcpy(&opt_data_key, option_key->opt_data, option_key->length * 4); + memcpy(&opt_data_mask, option_mask->opt_data, option_mask->length * 4); + MLX5_SET(fte_match_set_misc3, misc_3_v, + geneve_tlv_option_0_data, be32_to_cpu(opt_data_key)); + MLX5_SET(fte_match_set_misc3, misc_3_c, + geneve_tlv_option_0_data, be32_to_cpu(opt_data_mask)); + if (MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.geneve_tlv_option_0_exist)) { + MLX5_SET_TO_ONES(fte_match_set_misc, misc_c, geneve_tlv_option_0_exist); + MLX5_SET_TO_ONES(fte_match_set_misc, misc_v, geneve_tlv_option_0_exist); + } + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3; + + return 0; +} + +static int mlx5e_tc_tun_parse_geneve_params(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f) +{ + void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + struct netlink_ext_ack *extack = f->common.extack; + + /* match on OAM - packets with OAM bit on should NOT be offloaded */ + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, ft_field_support.outer_geneve_oam)) { + NL_SET_ERR_MSG_MOD(extack, "Matching on GENEVE OAM is not supported"); + netdev_warn(priv->netdev, "Matching on GENEVE OAM is not supported\n"); + return -EOPNOTSUPP; + } + MLX5_SET_TO_ONES(fte_match_set_misc, misc_c, geneve_oam); + MLX5_SET(fte_match_set_misc, misc_v, geneve_oam, 0); + + /* Match on GENEVE protocol. We support only Transparent Eth Bridge. */ + + if (MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_geneve_protocol_type)) { + MLX5_SET_TO_ONES(fte_match_set_misc, misc_c, geneve_protocol_type); + MLX5_SET(fte_match_set_misc, misc_v, geneve_protocol_type, ETH_P_TEB); + } + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + + return 0; +} + +static int mlx5e_tc_tun_parse_geneve(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + int err; + + err = mlx5e_tc_tun_parse_geneve_params(priv, spec, f); + if (err) + return err; + + err = mlx5e_tc_tun_parse_geneve_vni(priv, spec, f); + if (err) + return err; + + return mlx5e_tc_tun_parse_geneve_options(priv, spec, f); +} + +static bool mlx5e_tc_tun_encap_info_equal_geneve(struct mlx5e_encap_key *a, + struct mlx5e_encap_key *b) +{ + struct ip_tunnel_info *a_info; + struct ip_tunnel_info *b_info; + bool a_has_opts, b_has_opts; + + if (!mlx5e_tc_tun_encap_info_equal_generic(a, b)) + return false; + + a_has_opts = !!(a->ip_tun_key->tun_flags & TUNNEL_GENEVE_OPT); + b_has_opts = !!(b->ip_tun_key->tun_flags & TUNNEL_GENEVE_OPT); + + /* keys are equal when both don't have any options attached */ + if (!a_has_opts && !b_has_opts) + return true; + + if (a_has_opts != b_has_opts) + return false; + + /* geneve options stored in memory next to ip_tunnel_info struct */ + a_info = container_of(a->ip_tun_key, struct ip_tunnel_info, key); + b_info = container_of(b->ip_tun_key, struct ip_tunnel_info, key); + + return a_info->options_len == b_info->options_len && + memcmp(a_info + 1, b_info + 1, a_info->options_len) == 0; +} + +struct mlx5e_tc_tunnel geneve_tunnel = { + .tunnel_type = MLX5E_TC_TUNNEL_TYPE_GENEVE, + .match_level = MLX5_MATCH_L4, + .can_offload = mlx5e_tc_tun_can_offload_geneve, + .calc_hlen = mlx5e_tc_tun_calc_hlen_geneve, + .init_encap_attr = mlx5e_tc_tun_init_encap_attr_geneve, + .generate_ip_tun_hdr = mlx5e_gen_ip_tunnel_header_geneve, + .parse_udp_ports = 
mlx5e_tc_tun_parse_udp_ports_geneve, + .parse_tunnel = mlx5e_tc_tun_parse_geneve, + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_geneve, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c new file mode 100644 index 0000000..ada14f0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies. */ + +#include +#include "en/tc_tun.h" + +static bool mlx5e_tc_tun_can_offload_gretap(struct mlx5e_priv *priv) +{ + return !!MLX5_CAP_ESW(priv->mdev, nvgre_encap_decap); +} + +static int mlx5e_tc_tun_calc_hlen_gretap(struct mlx5e_encap_entry *e) +{ + return gre_calc_hlen(e->tun_info->key.tun_flags); +} + +static int mlx5e_tc_tun_init_encap_attr_gretap(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + e->tunnel = &gre_tunnel; + e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_NVGRE; + return 0; +} + +static int mlx5e_gen_ip_tunnel_header_gretap(char buf[], + __u8 *ip_proto, + struct mlx5e_encap_entry *e) +{ + const struct ip_tunnel_key *tun_key = &e->tun_info->key; + struct gre_base_hdr *greh = (struct gre_base_hdr *)(buf); + __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id); + int hdr_len; + + *ip_proto = IPPROTO_GRE; + + /* the HW does not calculate GRE csum or sequences */ + if (tun_key->tun_flags & (TUNNEL_CSUM | TUNNEL_SEQ)) + return -EOPNOTSUPP; + + greh->protocol = htons(ETH_P_TEB); + + /* GRE key */ + hdr_len = mlx5e_tc_tun_calc_hlen_gretap(e); + greh->flags = gre_tnl_flags_to_gre_flags(tun_key->tun_flags); + if (tun_key->tun_flags & TUNNEL_KEY) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + *ptr = tun_id; + } + + return 0; +} + +static int mlx5e_tc_tun_parse_gretap(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_GRE); + + /* gre protocol */ + MLX5_SET_TO_ONES(fte_match_set_misc, misc_c, gre_protocol); + MLX5_SET(fte_match_set_misc, misc_v, gre_protocol, ETH_P_TEB); + + /* gre key */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_match_enc_keyid enc_keyid; + + flow_rule_match_enc_keyid(rule, &enc_keyid); + MLX5_SET(fte_match_set_misc, misc_c, + gre_key.key, be32_to_cpu(enc_keyid.mask->keyid)); + MLX5_SET(fte_match_set_misc, misc_v, + gre_key.key, be32_to_cpu(enc_keyid.key->keyid)); + } + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + + return 0; +} + +struct mlx5e_tc_tunnel gre_tunnel = { + .tunnel_type = MLX5E_TC_TUNNEL_TYPE_GRETAP, + .match_level = MLX5_MATCH_L3, + .can_offload = mlx5e_tc_tun_can_offload_gretap, + .calc_hlen = mlx5e_tc_tun_calc_hlen_gretap, + .init_encap_attr = mlx5e_tc_tun_init_encap_attr_gretap, + .generate_ip_tun_hdr = mlx5e_gen_ip_tunnel_header_gretap, + .parse_udp_ports = NULL, + .parse_tunnel = mlx5e_tc_tun_parse_gretap, + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_generic, +}; diff --git 
a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c new file mode 100644 index 0000000..c5b1617 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies. */ + +#include +#include +#include "en/tc_tun.h" + +static bool can_offload(struct mlx5e_priv *priv) +{ + return MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, reformat_l3_tunnel_to_l2); +} + +static int calc_hlen(struct mlx5e_encap_entry *e) +{ + return sizeof(struct udphdr) + MPLS_HLEN; +} + +static int init_encap_attr(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + e->tunnel = &mplsoudp_tunnel; + e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + return 0; +} + +static int generate_ip_tun_hdr(char buf[], + __u8 *ip_proto, + struct mlx5e_encap_entry *r) +{ + const struct ip_tunnel_key *tun_key = &r->tun_info->key; + const struct mlx5e_mpls_info *mpls_info = &r->mpls_info; + struct udphdr *udp = (struct udphdr *)(buf); + struct mpls_shim_hdr *mpls; + + mpls = (struct mpls_shim_hdr *)(udp + 1); + *ip_proto = IPPROTO_UDP; + + udp->dest = tun_key->tp_dst; + *mpls = mpls_entry_encode(mpls_info->label, mpls_info->ttl, mpls_info->tc, mpls_info->bos); + + return 0; +} + +static int parse_udp_ports(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + return mlx5e_tc_tun_parse_udp_ports(priv, spec, f, headers_c, headers_v); +} + +static int parse_tunnel(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_match_mpls match; + void *misc2_c; + void *misc2_v; + + if (!MLX5_CAP_ETH(priv->mdev, tunnel_stateless_mpls_over_udp) && + !(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & MLX5_FLEX_PROTO_CW_MPLS_UDP)) + return -EOPNOTSUPP; + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) + return -EOPNOTSUPP; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS)) + return 0; + + flow_rule_match_mpls(rule, &match); + + /* Only support matching the first LSE */ + if (match.mask->used_lses != 1) + return -EOPNOTSUPP; + + misc2_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + misc2_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_label, + match.mask->ls[0].mpls_label); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_label, + match.key->ls[0].mpls_label); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_exp, + match.mask->ls[0].mpls_tc); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_exp, match.key->ls[0].mpls_tc); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_s_bos, + match.mask->ls[0].mpls_bos); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_s_bos, + match.key->ls[0].mpls_bos); + + MLX5_SET(fte_match_set_misc2, misc2_c, + outer_first_mpls_over_udp.mpls_ttl, + match.mask->ls[0].mpls_ttl); + MLX5_SET(fte_match_set_misc2, misc2_v, + outer_first_mpls_over_udp.mpls_ttl, + match.key->ls[0].mpls_ttl); + 
spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + + return 0; +} + +struct mlx5e_tc_tunnel mplsoudp_tunnel = { + .tunnel_type = MLX5E_TC_TUNNEL_TYPE_MPLSOUDP, + .match_level = MLX5_MATCH_L4, + .can_offload = can_offload, + .calc_hlen = calc_hlen, + .init_encap_attr = init_encap_attr, + .generate_ip_tun_hdr = generate_ip_tun_hdr, + .parse_udp_ports = parse_udp_ports, + .parse_tunnel = parse_tunnel, + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_generic, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c new file mode 100644 index 0000000..fd07c4c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies. */ + +#include +#include "lib/vxlan.h" +#include "en/tc_tun.h" + +static bool mlx5e_tc_tun_can_offload_vxlan(struct mlx5e_priv *priv) +{ + return !!MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap); +} + +static int mlx5e_tc_tun_calc_hlen_vxlan(struct mlx5e_encap_entry *e) +{ + return VXLAN_HLEN; +} + +static int mlx5e_tc_tun_check_udp_dport_vxlan(struct mlx5e_priv *priv, + struct flow_cls_offload *f) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct flow_match_ports enc_ports; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) + return -EOPNOTSUPP; + + flow_rule_match_enc_ports(rule, &enc_ports); + + /* check the UDP destination port validity */ + + if (!mlx5_vxlan_lookup_port(priv->mdev->vxlan, + be16_to_cpu(enc_ports.key->dst))) { + NL_SET_ERR_MSG_MOD(extack, + "Matched UDP dst port is not registered as a VXLAN port"); + netdev_warn(priv->netdev, + "UDP port %d is not registered as a VXLAN port\n", + be16_to_cpu(enc_ports.key->dst)); + return -EOPNOTSUPP; + } + + return 0; +} + +static int mlx5e_tc_tun_parse_udp_ports_vxlan(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + int err = 0; + + err = mlx5e_tc_tun_parse_udp_ports(priv, spec, f, headers_c, headers_v); + if (err) + return err; + + return mlx5e_tc_tun_check_udp_dport_vxlan(priv, f); +} + +static int mlx5e_tc_tun_init_encap_attr_vxlan(struct net_device *tunnel_dev, + struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct netlink_ext_ack *extack) +{ + int dst_port = be16_to_cpu(e->tun_info->key.tp_dst); + + e->tunnel = &vxlan_tunnel; + + if (!mlx5_vxlan_lookup_port(priv->mdev->vxlan, dst_port)) { + NL_SET_ERR_MSG_MOD(extack, + "vxlan udp dport was not registered with the HW"); + netdev_warn(priv->netdev, + "%d isn't an offloaded vxlan udp dport\n", + dst_port); + return -EOPNOTSUPP; + } + + e->reformat_type = MLX5_REFORMAT_TYPE_L2_TO_VXLAN; + return 0; +} + +static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[], + __u8 *ip_proto, + struct mlx5e_encap_entry *e) +{ + const struct ip_tunnel_key *tun_key = &e->tun_info->key; + __be32 tun_id = tunnel_id_to_key32(tun_key->tun_id); + struct udphdr *udp = (struct udphdr *)(buf); + struct vxlanhdr *vxh; + + vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr)); + *ip_proto = IPPROTO_UDP; + + udp->dest = tun_key->tp_dst; + vxh->vx_flags = VXLAN_HF_VNI; + vxh->vx_vni = vxlan_vni_field(tun_id); + + return 0; +} + +static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + 
struct flow_cls_offload *f, + void *headers_c, + void *headers_v) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct flow_match_enc_keyid enc_keyid; + void *misc_c, *misc_v; + + misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) + return 0; + + flow_rule_match_enc_keyid(rule, &enc_keyid); + + if (!enc_keyid.mask->keyid) + return 0; + + /* match on VNI is required */ + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_vxlan_vni)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on VXLAN VNI is not supported"); + netdev_warn(priv->netdev, + "Matching on VXLAN VNI is not supported\n"); + return -EOPNOTSUPP; + } + + MLX5_SET(fte_match_set_misc, misc_c, vxlan_vni, + be32_to_cpu(enc_keyid.mask->keyid)); + MLX5_SET(fte_match_set_misc, misc_v, vxlan_vni, + be32_to_cpu(enc_keyid.key->keyid)); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + + return 0; +} + +static int mlx5e_tc_tun_get_remote_ifindex(struct net_device *mirred_dev) +{ + const struct vxlan_dev *vxlan = netdev_priv(mirred_dev); + const struct vxlan_rdst *dst = &vxlan->default_dst; + + return dst->remote_ifindex; +} + +struct mlx5e_tc_tunnel vxlan_tunnel = { + .tunnel_type = MLX5E_TC_TUNNEL_TYPE_VXLAN, + .match_level = MLX5_MATCH_L4, + .can_offload = mlx5e_tc_tun_can_offload_vxlan, + .calc_hlen = mlx5e_tc_tun_calc_hlen_vxlan, + .init_encap_attr = mlx5e_tc_tun_init_encap_attr_vxlan, + .generate_ip_tun_hdr = mlx5e_gen_ip_tunnel_header_vxlan, + .parse_udp_ports = mlx5e_tc_tun_parse_udp_ports_vxlan, + .parse_tunnel = mlx5e_tc_tun_parse_vxlan, + .encap_info_equal = mlx5e_tc_tun_encap_info_equal_generic, + .get_remote_ifindex = mlx5e_tc_tun_get_remote_ifindex, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c new file mode 100644 index 0000000..580159d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. */ + +#include "tir.h" +#include "params.h" +#include + +#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ (64 * 1024) + +/* max() doesn't work inside square brackets. */ +#define MLX5E_TIR_CMD_IN_SZ_DW ( \ + MLX5_ST_SZ_DW(create_tir_in) > MLX5_ST_SZ_DW(modify_tir_in) ? 
\ + MLX5_ST_SZ_DW(create_tir_in) : MLX5_ST_SZ_DW(modify_tir_in) \ +) + +struct mlx5e_tir_builder { + u32 in[MLX5E_TIR_CMD_IN_SZ_DW]; + bool modify; +}; + +struct mlx5e_tir_builder *mlx5e_tir_builder_alloc(bool modify) +{ + struct mlx5e_tir_builder *builder; + + builder = kvzalloc(sizeof(*builder), GFP_KERNEL); + builder->modify = modify; + + return builder; +} + +void mlx5e_tir_builder_free(struct mlx5e_tir_builder *builder) +{ + kvfree(builder); +} + +void mlx5e_tir_builder_clear(struct mlx5e_tir_builder *builder) +{ + memset(builder->in, 0, sizeof(builder->in)); +} + +static void *mlx5e_tir_builder_get_tirc(struct mlx5e_tir_builder *builder) +{ + if (builder->modify) + return MLX5_ADDR_OF(modify_tir_in, builder->in, ctx); + return MLX5_ADDR_OF(create_tir_in, builder->in, ctx); +} + +void mlx5e_tir_builder_build_inline(struct mlx5e_tir_builder *builder, u32 tdn, u32 rqn) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + + WARN_ON(builder->modify); + + MLX5_SET(tirc, tirc, transport_domain, tdn); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_NONE); + MLX5_SET(tirc, tirc, inline_rqn, rqn); +} + +void mlx5e_tir_builder_build_rqt(struct mlx5e_tir_builder *builder, u32 tdn, + u32 rqtn, bool inner_ft_support) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + + WARN_ON(builder->modify); + + MLX5_SET(tirc, tirc, transport_domain, tdn); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, rqtn); + MLX5_SET(tirc, tirc, tunneled_offload_en, inner_ft_support); +} + +void mlx5e_tir_builder_build_packet_merge(struct mlx5e_tir_builder *builder, + const struct mlx5e_packet_merge_param *pkt_merge_param) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + const unsigned int rough_max_l2_l3_hdr_sz = 256; + + if (builder->modify) + MLX5_SET(modify_tir_in, builder->in, bitmask.packet_merge, 1); + + switch (pkt_merge_param->type) { + case MLX5E_PACKET_MERGE_LRO: + MLX5_SET(tirc, tirc, packet_merge_mask, + MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO | + MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO); + MLX5_SET(tirc, tirc, lro_max_ip_payload_size, + (MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ - rough_max_l2_l3_hdr_sz) >> 8); + MLX5_SET(tirc, tirc, lro_timeout_period_usecs, pkt_merge_param->timeout); + break; + case MLX5E_PACKET_MERGE_SHAMPO: + MLX5_SET(tirc, tirc, packet_merge_mask, MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO); + break; + default: + break; + } +} + +static int mlx5e_hfunc_to_hw(u8 hfunc) +{ + switch (hfunc) { + case ETH_RSS_HASH_TOP: + return MLX5_RX_HASH_FN_TOEPLITZ; + case ETH_RSS_HASH_XOR: + return MLX5_RX_HASH_FN_INVERTED_XOR8; + default: + return MLX5_RX_HASH_FN_NONE; + } +} + +void mlx5e_tir_builder_build_rss(struct mlx5e_tir_builder *builder, + const struct mlx5e_rss_params_hash *rss_hash, + const struct mlx5e_rss_params_traffic_type *rss_tt, + bool inner) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + void *hfso; + + if (builder->modify) + MLX5_SET(modify_tir_in, builder->in, bitmask.hash, 1); + + MLX5_SET(tirc, tirc, rx_hash_fn, mlx5e_hfunc_to_hw(rss_hash->hfunc)); + if (rss_hash->hfunc == ETH_RSS_HASH_TOP) { + const size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); + void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + memcpy(rss_key, rss_hash->toeplitz_hash_key, len); + } + + if (inner) + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); + else + hfso = MLX5_ADDR_OF(tirc, tirc, 
rx_hash_field_selector_outer); + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, rss_tt->l3_prot_type); + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, rss_tt->l4_prot_type); + MLX5_SET(rx_hash_field_select, hfso, selected_fields, rss_tt->rx_hash_fields); +} + +void mlx5e_tir_builder_build_direct(struct mlx5e_tir_builder *builder) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + + WARN_ON(builder->modify); + + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8); +} + +void mlx5e_tir_builder_build_tls(struct mlx5e_tir_builder *builder) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + + WARN_ON(builder->modify); + + MLX5_SET(tirc, tirc, tls_en, 1); + MLX5_SET(tirc, tirc, self_lb_block, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST | + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST); +} + +int mlx5e_tir_init(struct mlx5e_tir *tir, struct mlx5e_tir_builder *builder, + struct mlx5_core_dev *mdev, bool reg) +{ + int err; + + tir->mdev = mdev; + + err = mlx5_core_create_tir(tir->mdev, builder->in, &tir->tirn); + if (err) + return err; + + if (reg) { + struct mlx5e_hw_objs *res = &tir->mdev->mlx5e_res.hw_objs; + + mutex_lock(&res->td.list_lock); + list_add(&tir->list, &res->td.tirs_list); + mutex_unlock(&res->td.list_lock); + } else { + INIT_LIST_HEAD(&tir->list); + } + + return 0; +} + +void mlx5e_tir_destroy(struct mlx5e_tir *tir) +{ + struct mlx5e_hw_objs *res = &tir->mdev->mlx5e_res.hw_objs; + + /* Skip mutex if list_del is no-op (the TIR wasn't registered in the + * list). list_empty will never return true for an item of tirs_list, + * and READ_ONCE/WRITE_ONCE in list_empty/list_del guarantee consistency + * of the list->next value. + */ + if (!list_empty(&tir->list)) { + mutex_lock(&res->td.list_lock); + list_del(&tir->list); + mutex_unlock(&res->td.list_lock); + } + + mlx5_core_destroy_tir(tir->mdev, tir->tirn); + tir->tirn = 0; +} + +int mlx5e_tir_modify(struct mlx5e_tir *tir, struct mlx5e_tir_builder *builder) +{ + return mlx5_core_modify_tir(tir->mdev, tir->tirn, builder->in); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h new file mode 100644 index 0000000..857a84b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, Mellanox Technologies inc. All rights reserved. 
*/ + +#ifndef __MLX5_EN_TIR_H__ +#define __MLX5_EN_TIR_H__ + +#include + +struct mlx5e_rss_params_hash { + u8 hfunc; + u8 toeplitz_hash_key[40]; +}; + +struct mlx5e_rss_params_traffic_type { + u8 l3_prot_type; + u8 l4_prot_type; + u32 rx_hash_fields; +}; + +struct mlx5e_tir_builder; +struct mlx5e_packet_merge_param; + +struct mlx5e_tir_builder *mlx5e_tir_builder_alloc(bool modify); +void mlx5e_tir_builder_free(struct mlx5e_tir_builder *builder); +void mlx5e_tir_builder_clear(struct mlx5e_tir_builder *builder); + +void mlx5e_tir_builder_build_inline(struct mlx5e_tir_builder *builder, u32 tdn, u32 rqn); +void mlx5e_tir_builder_build_rqt(struct mlx5e_tir_builder *builder, u32 tdn, + u32 rqtn, bool inner_ft_support); +void mlx5e_tir_builder_build_packet_merge(struct mlx5e_tir_builder *builder, + const struct mlx5e_packet_merge_param *pkt_merge_param); +void mlx5e_tir_builder_build_rss(struct mlx5e_tir_builder *builder, + const struct mlx5e_rss_params_hash *rss_hash, + const struct mlx5e_rss_params_traffic_type *rss_tt, + bool inner); +void mlx5e_tir_builder_build_direct(struct mlx5e_tir_builder *builder); +void mlx5e_tir_builder_build_tls(struct mlx5e_tir_builder *builder); + +struct mlx5_core_dev; + +struct mlx5e_tir { + struct mlx5_core_dev *mdev; + u32 tirn; + struct list_head list; +}; + +int mlx5e_tir_init(struct mlx5e_tir *tir, struct mlx5e_tir_builder *builder, + struct mlx5_core_dev *mdev, bool reg); +void mlx5e_tir_destroy(struct mlx5e_tir *tir); + +static inline u32 mlx5e_tir_get_tirn(struct mlx5e_tir *tir) +{ + return tir->tirn; +} + +int mlx5e_tir_modify(struct mlx5e_tir *tir, struct mlx5e_tir_builder *builder); + +#endif /* __MLX5_EN_TIR_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c new file mode 100644 index 0000000..6ce34a7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies */ + +#include +#include "en/txrx.h" +#include "en/params.h" +#include "en/trap.h" + +static int mlx5e_trap_napi_poll(struct napi_struct *napi, int budget) +{ + struct mlx5e_trap *trap_ctx = container_of(napi, struct mlx5e_trap, napi); + struct mlx5e_ch_stats *ch_stats = trap_ctx->stats; + struct mlx5e_rq *rq = &trap_ctx->rq; + bool busy = false; + int work_done = 0; + + rcu_read_lock(); + + ch_stats->poll++; + + work_done = mlx5e_poll_rx_cq(&rq->cq, budget); + busy |= work_done == budget; + busy |= rq->post_wqes(rq); + + if (busy) { + work_done = budget; + goto out; + } + + if (unlikely(!napi_complete_done(napi, work_done))) + goto out; + + mlx5e_cq_arm(&rq->cq); + +out: + rcu_read_unlock(); + return work_done; +} + +static void mlx5e_init_trap_rq(struct mlx5e_trap *t, struct mlx5e_params *params, + struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = t->mdev; + struct mlx5e_priv *priv = t->priv; + + rq->wq_type = params->rq_wq_type; + rq->pdev = t->pdev; + rq->netdev = priv->netdev; + rq->priv = priv; + rq->clock = &mdev->clock; + rq->tstamp = &priv->tstamp; + rq->mdev = mdev; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->stats = &priv->trap_stats.rq; + rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); + xdp_rxq_info_unused(&rq->xdp_rxq); + mlx5e_rq_set_trap_handlers(rq, params); +} + +static int mlx5e_open_trap_rq(struct mlx5e_priv *priv, struct mlx5e_trap *t) +{ + struct mlx5e_rq_param *rq_param = &t->rq_param; 
+ struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_create_cq_param ccp = {}; + struct dim_cq_moder trap_moder = {}; + struct mlx5e_rq *rq = &t->rq; + int node; + int err; + + node = dev_to_node(mdev->device); + + ccp.node = node; + ccp.ch_stats = t->stats; + ccp.napi = &t->napi; + ccp.ix = 0; + + mlx5e_init_trap_rq(t, &t->params, rq); + err = mlx5e_open_rq(priv, &t->params, rq_param, NULL, &ccp, trap_moder, node, rq); + if (err) + return err; + + return 0; +} + +static void mlx5e_close_trap_rq(struct mlx5e_rq *rq) +{ + mlx5e_close_rq(rq->priv, rq); +} + +static int mlx5e_create_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, + u32 rqn) +{ + struct mlx5e_tir_builder *builder; + int err; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + mlx5e_tir_builder_build_inline(builder, mdev->mlx5e_res.hw_objs.td.tdn, rqn); + err = mlx5e_tir_init(tir, builder, mdev, true); + + mlx5e_tir_builder_free(builder); + + return err; +} + +static void mlx5e_build_trap_params(struct mlx5_core_dev *mdev, + int max_mtu, u16 q_counter, + struct mlx5e_trap *t) +{ + struct mlx5e_params *params = &t->params; + + params->rq_wq_type = MLX5_WQ_TYPE_CYCLIC; + mlx5e_init_rq_type_params(mdev, params); + params->sw_mtu = max_mtu; + mlx5e_build_rq_param(mdev, params, NULL, q_counter, &t->rq_param); +} + +static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv) +{ + int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, 0)); + struct net_device *netdev = priv->netdev; + struct mlx5e_trap *t; + int err; + + t = kvzalloc_node(sizeof(*t), GFP_KERNEL, cpu_to_node(cpu)); + if (!t) + return ERR_PTR(-ENOMEM); + + mlx5e_build_trap_params(priv->mdev, netdev->max_mtu, priv->q_counter, t); + + t->priv = priv; + t->mdev = priv->mdev; + t->tstamp = &priv->tstamp; + t->pdev = mlx5_core_dma_dev(priv->mdev); + t->netdev = priv->netdev; + t->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey); + t->stats = &priv->trap_stats.ch; + + netif_napi_add(netdev, &t->napi, mlx5e_trap_napi_poll, 64); + + err = mlx5e_open_trap_rq(priv, t); + if (unlikely(err)) + goto err_napi_del; + + err = mlx5e_create_trap_direct_rq_tir(t->mdev, &t->tir, t->rq.rqn); + if (err) + goto err_close_trap_rq; + + return t; + +err_close_trap_rq: + mlx5e_close_trap_rq(&t->rq); +err_napi_del: + netif_napi_del(&t->napi); + kvfree(t); + return ERR_PTR(err); +} + +void mlx5e_close_trap(struct mlx5e_trap *trap) +{ + mlx5e_tir_destroy(&trap->tir); + mlx5e_close_trap_rq(&trap->rq); + netif_napi_del(&trap->napi); + kvfree(trap); +} + +static void mlx5e_activate_trap(struct mlx5e_trap *trap) +{ + napi_enable(&trap->napi); + mlx5e_activate_rq(&trap->rq); + mlx5e_trigger_napi_sched(&trap->napi); +} + +void mlx5e_deactivate_trap(struct mlx5e_priv *priv) +{ + struct mlx5e_trap *trap = priv->en_trap; + + mlx5e_deactivate_rq(&trap->rq); + napi_disable(&trap->napi); +} + +static struct mlx5e_trap *mlx5e_add_trap_queue(struct mlx5e_priv *priv) +{ + struct mlx5e_trap *trap; + + trap = mlx5e_open_trap(priv); + if (IS_ERR(trap)) + goto out; + + mlx5e_activate_trap(trap); +out: + return trap; +} + +static void mlx5e_del_trap_queue(struct mlx5e_priv *priv) +{ + mlx5e_deactivate_trap(priv); + mlx5e_close_trap(priv->en_trap); + priv->en_trap = NULL; +} + +static int mlx5e_trap_get_tirn(struct mlx5e_trap *en_trap) +{ + return en_trap->tir.tirn; +} + +static int mlx5e_handle_action_trap(struct mlx5e_priv *priv, int trap_id) +{ + bool open_queue = !priv->en_trap; + struct mlx5e_trap *trap; + int err; + + if 
(open_queue) { + trap = mlx5e_add_trap_queue(priv); + if (IS_ERR(trap)) + return PTR_ERR(trap); + priv->en_trap = trap; + } + + switch (trap_id) { + case DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER: + err = mlx5e_add_vlan_trap(priv, trap_id, mlx5e_trap_get_tirn(priv->en_trap)); + if (err) + goto err_out; + break; + case DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER: + err = mlx5e_add_mac_trap(priv, trap_id, mlx5e_trap_get_tirn(priv->en_trap)); + if (err) + goto err_out; + break; + default: + netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id); + err = -EINVAL; + goto err_out; + } + return 0; + +err_out: + if (open_queue) + mlx5e_del_trap_queue(priv); + return err; +} + +static int mlx5e_handle_action_drop(struct mlx5e_priv *priv, int trap_id) +{ + switch (trap_id) { + case DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER: + mlx5e_remove_vlan_trap(priv); + break; + case DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER: + mlx5e_remove_mac_trap(priv); + break; + default: + netdev_warn(priv->netdev, "%s: Unknown trap id %d\n", __func__, trap_id); + return -EINVAL; + } + if (priv->en_trap && !mlx5_devlink_trap_get_num_active(priv->mdev)) + mlx5e_del_trap_queue(priv); + + return 0; +} + +int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx) +{ + int err = 0; + + /* Traps are unarmed when interface is down, no need to update + * them. The configuration is saved in the core driver, + * queried and applied upon interface up operation in + * mlx5e_open_locked(). + */ + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + return 0; + + switch (trap_ctx->action) { + case DEVLINK_TRAP_ACTION_TRAP: + err = mlx5e_handle_action_trap(priv, trap_ctx->id); + break; + case DEVLINK_TRAP_ACTION_DROP: + err = mlx5e_handle_action_drop(priv, trap_ctx->id); + break; + default: + netdev_warn(priv->netdev, "%s: Unsupported action %d\n", __func__, + trap_ctx->action); + err = -EINVAL; + } + return err; +} + +static int mlx5e_apply_trap(struct mlx5e_priv *priv, int trap_id, bool enable) +{ + enum devlink_trap_action action; + int err; + + err = mlx5_devlink_traps_get_action(priv->mdev, trap_id, &action); + if (err) + return err; + if (action == DEVLINK_TRAP_ACTION_TRAP) + err = enable ? 
mlx5e_handle_action_trap(priv, trap_id) : + mlx5e_handle_action_drop(priv, trap_id); + return err; +} + +static const int mlx5e_traps_arr[] = { + DEVLINK_TRAP_GENERIC_ID_INGRESS_VLAN_FILTER, + DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER, +}; + +int mlx5e_apply_traps(struct mlx5e_priv *priv, bool enable) +{ + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(mlx5e_traps_arr); i++) { + err = mlx5e_apply_trap(priv, mlx5e_traps_arr[i], enable); + if (err) + return err; + } + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h new file mode 100644 index 0000000..aa3f176 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/trap.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies */ + +#ifndef __MLX5E_TRAP_H__ +#define __MLX5E_TRAP_H__ + +#include "../en.h" +#include "../devlink.h" + +struct mlx5e_trap { + /* data path */ + struct mlx5e_rq rq; + struct mlx5e_tir tir; + struct napi_struct napi; + struct device *pdev; + struct net_device *netdev; + __be32 mkey_be; + + /* data path - accessed per napi poll */ + struct mlx5e_ch_stats *stats; + + /* control */ + struct mlx5e_priv *priv; + struct mlx5_core_dev *mdev; + struct hwtstamp_config *tstamp; + DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES); + + struct mlx5e_params params; + struct mlx5e_rq_param rq_param; +}; + +void mlx5e_close_trap(struct mlx5e_trap *trap); +void mlx5e_deactivate_trap(struct mlx5e_priv *priv); +int mlx5e_handle_trap_event(struct mlx5e_priv *priv, struct mlx5_trap_ctx *trap_ctx); +int mlx5e_apply_traps(struct mlx5e_priv *priv, bool enable); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h new file mode 100644 index 0000000..97a408b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -0,0 +1,479 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_EN_TXRX_H___ +#define __MLX5_EN_TXRX_H___ + +#include "en.h" +#include + +#define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + +/* The mult of MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS + * (16 * 4 == 64) does not fit in the 6-bit DS field of Ctrl Segment. + * We use a bound lower that MLX5_SEND_WQE_MAX_WQEBBS to let a + * full-session WQE be cache-aligned. 
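+ *
+ * In numbers: a 6-bit DS count holds at most 63 data segments, so a
+ * 16-WQEBB session (64 DS) is one too many. 15 WQEBBs (60 DS) fit and,
+ * at 64 bytes per WQEBB, keep the session a multiple of a 64-byte
+ * cacheline (15 * 64 == 960); with 128-byte cachelines one more WQEBB
+ * is dropped so that 14 * 64 == 896 stays a multiple of the cacheline.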
+ */ +#if L1_CACHE_BYTES < 128 +#define MLX5E_TX_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 1) +#else +#define MLX5E_TX_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 2) +#endif + +#define MLX5E_TX_MPW_MAX_NUM_DS (MLX5E_TX_MPW_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS) + +#define INL_HDR_START_SZ (sizeof(((struct mlx5_wqe_eth_seg *)NULL)->inline_hdr.start)) + +#define MLX5E_RX_ERR_CQE(cqe) (get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND) + +static inline +ktime_t mlx5e_cqe_ts_to_ns(cqe_ts_to_ns func, struct mlx5_clock *clock, u64 cqe_ts) +{ + return INDIRECT_CALL_2(func, mlx5_real_time_cyc2time, mlx5_timecounter_cyc2time, + clock, cqe_ts); +} + +enum mlx5e_icosq_wqe_type { + MLX5E_ICOSQ_WQE_NOP, + MLX5E_ICOSQ_WQE_UMR_RX, + MLX5E_ICOSQ_WQE_SHAMPO_HD_UMR, +#ifdef CONFIG_MLX5_EN_TLS + MLX5E_ICOSQ_WQE_UMR_TLS, + MLX5E_ICOSQ_WQE_SET_PSV_TLS, + MLX5E_ICOSQ_WQE_GET_PSV_TLS, +#endif +}; + +/* General */ +static inline bool mlx5e_skb_is_multicast(struct sk_buff *skb) +{ + return skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST; +} + +void mlx5e_trigger_irq(struct mlx5e_icosq *sq); +void mlx5e_completion_event(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe); +void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event); +int mlx5e_napi_poll(struct napi_struct *napi, int budget); +int mlx5e_poll_ico_cq(struct mlx5e_cq *cq); + +/* RX */ +void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info); +void mlx5e_page_release_dynamic(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info, + bool recycle); +INDIRECT_CALLABLE_DECLARE(bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)); +INDIRECT_CALLABLE_DECLARE(bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)); +INDIRECT_CALLABLE_DECLARE(bool mlx5e_post_rx_skip(struct mlx5e_rq *rq)); +int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget); +void mlx5e_free_rx_descs(struct mlx5e_rq *rq); +void mlx5e_free_rx_in_progress_descs(struct mlx5e_rq *rq); + +/* TX */ +u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev); +netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev); +bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget); +void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq); + +static inline bool +mlx5e_skb_fifo_has_room(struct mlx5e_skb_fifo *fifo) +{ + return (*fifo->pc - *fifo->cc) < fifo->mask; +} + +static inline bool +mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n) +{ + return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc); +} + +static inline void *mlx5e_fetch_wqe(struct mlx5_wq_cyc *wq, u16 pi, size_t wqe_size) +{ + void *wqe; + + wqe = mlx5_wq_cyc_get_wqe(wq, pi); + memset(wqe, 0, wqe_size); + + return wqe; +} + +#define MLX5E_TX_FETCH_WQE(sq, pi) \ + ((struct mlx5e_tx_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_tx_wqe))) + +static inline struct mlx5e_tx_wqe * +mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc) +{ + u16 pi = mlx5_wq_cyc_ctr2ix(wq, *pc); + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + + memset(cseg, 0, sizeof(*cseg)); + + cseg->opmod_idx_opcode = cpu_to_be32((*pc << 8) | MLX5_OPCODE_NOP); + cseg->qpn_ds = cpu_to_be32((sqn << 8) | 0x01); + + (*pc)++; + + return wqe; +} + +static inline struct mlx5e_tx_wqe * +mlx5e_post_nop_fence(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc) +{ + u16 pi = mlx5_wq_cyc_ctr2ix(wq, *pc); + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + + memset(cseg, 0, 
sizeof(*cseg)); + + cseg->opmod_idx_opcode = cpu_to_be32((*pc << 8) | MLX5_OPCODE_NOP); + cseg->qpn_ds = cpu_to_be32((sqn << 8) | 0x01); + cseg->fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL; + + (*pc)++; + + return wqe; +} + +struct mlx5e_tx_wqe_info { + struct sk_buff *skb; + u32 num_bytes; + u8 num_wqebbs; + u8 num_dma; + u8 num_fifo_pkts; +#ifdef CONFIG_MLX5_EN_TLS + struct page *resync_dump_frag_page; +#endif +}; + +static inline u16 mlx5e_txqsq_get_next_pi(struct mlx5e_txqsq *sq, u16 size) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi, contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs < size)) { + struct mlx5e_tx_wqe_info *wi, *edge_wi; + + wi = &sq->db.wqe_info[pi]; + edge_wi = wi + contig_wqebbs; + + /* Fill SQ frag edge with NOPs to avoid WQE wrapping two pages. */ + for (; wi < edge_wi; wi++) { + *wi = (struct mlx5e_tx_wqe_info) { + .num_wqebbs = 1, + }; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } + sq->stats->nop += contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + return pi; +} + +static inline u16 mlx5e_shampo_get_cqe_header_index(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + return be16_to_cpu(cqe->shampo.header_entry_index) & (rq->mpwqe.shampo->hd_per_wq - 1); +} + +struct mlx5e_shampo_umr { + u16 len; +}; + +struct mlx5e_icosq_wqe_info { + u8 wqe_type; + u8 num_wqebbs; + + /* Auxiliary data for different wqe types. */ + union { + struct { + struct mlx5e_rq *rq; + } umr; + struct mlx5e_shampo_umr shampo; +#ifdef CONFIG_MLX5_EN_TLS + struct { + struct mlx5e_ktls_offload_context_rx *priv_rx; + } tls_set_params; + struct { + struct mlx5e_ktls_rx_resync_buf *buf; + } tls_get_params; +#endif + }; +}; + +void mlx5e_free_icosq_descs(struct mlx5e_icosq *sq); + +static inline u16 mlx5e_icosq_get_next_pi(struct mlx5e_icosq *sq, u16 size) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi, contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs < size)) { + struct mlx5e_icosq_wqe_info *wi, *edge_wi; + + wi = &sq->db.wqe_info[pi]; + edge_wi = wi + contig_wqebbs; + + /* Fill SQ frag edge with NOPs to avoid WQE wrapping two pages. 
*/ + for (; wi < edge_wi; wi++) { + *wi = (struct mlx5e_icosq_wqe_info) { + .wqe_type = MLX5E_ICOSQ_WQE_NOP, + .num_wqebbs = 1, + }; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + return pi; +} + +static inline void +mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, + struct mlx5_wqe_ctrl_seg *ctrl) +{ + ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; + /* ensure wqe is visible to device before updating doorbell record */ + dma_wmb(); + + *wq->db = cpu_to_be32(pc); + + /* ensure doorbell record is visible to device before ringing the + * doorbell + */ + wmb(); + + mlx5_write64((__be32 *)ctrl, uar_map); +} + +static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) +{ + struct mlx5_core_cq *mcq; + + mcq = &cq->mcq; + mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc); +} + +static inline struct mlx5e_sq_dma * +mlx5e_dma_get(struct mlx5e_txqsq *sq, u32 i) +{ + return &sq->db.dma_fifo[i & sq->dma_fifo_mask]; +} + +static inline void +mlx5e_dma_push(struct mlx5e_txqsq *sq, dma_addr_t addr, u32 size, + enum mlx5e_dma_map_type map_type) +{ + struct mlx5e_sq_dma *dma = mlx5e_dma_get(sq, sq->dma_fifo_pc++); + + dma->addr = addr; + dma->size = size; + dma->type = map_type; +} + +static inline +struct sk_buff **mlx5e_skb_fifo_get(struct mlx5e_skb_fifo *fifo, u16 i) +{ + return &fifo->fifo[i & fifo->mask]; +} + +static inline +void mlx5e_skb_fifo_push(struct mlx5e_skb_fifo *fifo, struct sk_buff *skb) +{ + struct sk_buff **skb_item = mlx5e_skb_fifo_get(fifo, (*fifo->pc)++); + + *skb_item = skb; +} + +static inline +struct sk_buff *mlx5e_skb_fifo_pop(struct mlx5e_skb_fifo *fifo) +{ + return *mlx5e_skb_fifo_get(fifo, (*fifo->cc)++); +} + +static inline void +mlx5e_tx_dma_unmap(struct device *pdev, struct mlx5e_sq_dma *dma) +{ + switch (dma->type) { + case MLX5E_DMA_MAP_SINGLE: + dma_unmap_single(pdev, dma->addr, dma->size, DMA_TO_DEVICE); + break; + case MLX5E_DMA_MAP_PAGE: + dma_unmap_page(pdev, dma->addr, dma->size, DMA_TO_DEVICE); + break; + default: + WARN_ONCE(true, "mlx5e_tx_dma_unmap unknown DMA type!\n"); + } +} + +void mlx5e_sq_xmit_simple(struct mlx5e_txqsq *sq, struct sk_buff *skb, bool xmit_more); +void mlx5e_tx_mpwqe_ensure_complete(struct mlx5e_txqsq *sq); + +static inline bool mlx5e_tx_mpwqe_is_full(struct mlx5e_tx_mpwqe *session, u8 max_sq_mpw_wqebbs) +{ + return session->ds_count == max_sq_mpw_wqebbs * MLX5_SEND_WQEBB_NUM_DS; +} + +static inline void mlx5e_rqwq_reset(struct mlx5e_rq *rq) +{ + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { + mlx5_wq_ll_reset(&rq->mpwqe.wq); + rq->mpwqe.actual_wq_head = 0; + } else { + mlx5_wq_cyc_reset(&rq->wqe.wq); + } +} + +static inline void mlx5e_dump_error_cqe(struct mlx5e_cq *cq, u32 qn, + struct mlx5_err_cqe *err_cqe) +{ + struct mlx5_cqwq *wq = &cq->wq; + u32 ci; + + ci = mlx5_cqwq_ctr2ix(wq, wq->cc - 1); + + netdev_err(cq->netdev, + "Error cqe on cqn 0x%x, ci 0x%x, qn 0x%x, opcode 0x%x, syndrome 0x%x, vendor syndrome 0x%x\n", + cq->mcq.cqn, ci, qn, + get_cqe_opcode((struct mlx5_cqe64 *)err_cqe), + err_cqe->syndrome, err_cqe->vendor_err_synd); + mlx5_dump_err_cqe(cq->mdev, err_cqe); +} + +static inline u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq) +{ + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + return mlx5_wq_ll_get_size(&rq->mpwqe.wq); + default: + return mlx5_wq_cyc_get_size(&rq->wqe.wq); + } +} + +static inline u32 mlx5e_rqwq_get_cur_sz(struct mlx5e_rq *rq) +{ + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + return 
rq->mpwqe.wq.cur_sz; + default: + return rq->wqe.wq.cur_sz; + } +} + +static inline u16 mlx5e_rqwq_get_head(struct mlx5e_rq *rq) +{ + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + return mlx5_wq_ll_get_head(&rq->mpwqe.wq); + default: + return mlx5_wq_cyc_get_head(&rq->wqe.wq); + } +} + +static inline u16 mlx5e_rqwq_get_wqe_counter(struct mlx5e_rq *rq) +{ + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + return mlx5_wq_ll_get_counter(&rq->mpwqe.wq); + default: + return mlx5_wq_cyc_get_counter(&rq->wqe.wq); + } +} + +/* SW parser related functions */ + +struct mlx5e_swp_spec { + __be16 l3_proto; + u8 l4_proto; + u8 is_tun; + __be16 tun_l3_proto; + u8 tun_l4_proto; +}; + +static inline void mlx5e_eseg_swp_offsets_add_vlan(struct mlx5_wqe_eth_seg *eseg) +{ + /* SWP offsets are in 2-bytes words */ + eseg->swp_outer_l3_offset += VLAN_HLEN / 2; + eseg->swp_outer_l4_offset += VLAN_HLEN / 2; + eseg->swp_inner_l3_offset += VLAN_HLEN / 2; + eseg->swp_inner_l4_offset += VLAN_HLEN / 2; +} + +static inline void +mlx5e_set_eseg_swp(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg, + struct mlx5e_swp_spec *swp_spec) +{ + /* SWP offsets are in 2-bytes words */ + eseg->swp_outer_l3_offset = skb_network_offset(skb) / 2; + if (swp_spec->l3_proto == htons(ETH_P_IPV6)) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L3_IPV6; + if (swp_spec->l4_proto) { + eseg->swp_outer_l4_offset = skb_transport_offset(skb) / 2; + if (swp_spec->l4_proto == IPPROTO_UDP) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_UDP; + } + + if (swp_spec->is_tun) { + eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; + if (swp_spec->tun_l3_proto == htons(ETH_P_IPV6)) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + } else { /* typically for ipsec when xfrm mode != XFRM_MODE_TUNNEL */ + eseg->swp_inner_l3_offset = skb_network_offset(skb) / 2; + if (swp_spec->l3_proto == htons(ETH_P_IPV6)) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + } + switch (swp_spec->tun_l4_proto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; + case IPPROTO_TCP: + eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; + break; + } +} + +#define MLX5E_STOP_ROOM(wqebbs) ((wqebbs) * 2 - 1) + +static inline u16 mlx5e_stop_room_for_wqe(struct mlx5_core_dev *mdev, u16 wqe_size) +{ + /* A WQE must not cross the page boundary, hence two conditions: + * 1. Its size must not exceed the page size. + * 2. If the WQE size is X, and the space remaining in a page is less + * than X, this space needs to be padded with NOPs. So, one WQE of + * size X may require up to X-1 WQEBBs of padding, which makes the + * stop room of X-1 + X. + * WQE size is also limited by the hardware limit. 
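+ *
+ * For example, with X == 4 WQEBBs: up to 3 WQEBBs may be spent on NOP
+ * padding at the page edge before the real WQE is posted, so the queue
+ * must be stopped while fewer than 3 + 4 == 7 free WQEBBs remain, which
+ * is exactly MLX5E_STOP_ROOM(4) == 4 * 2 - 1.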
+ */ + WARN_ONCE(wqe_size > mlx5e_get_max_sq_wqebbs(mdev), + "wqe_size %u is greater than max SQ WQEBBs %u", + wqe_size, mlx5e_get_max_sq_wqebbs(mdev)); + + + return MLX5E_STOP_ROOM(wqe_size); +} + +static inline u16 mlx5e_stop_room_for_max_wqe(struct mlx5_core_dev *mdev) +{ + return MLX5E_STOP_ROOM(mlx5e_get_max_sq_wqebbs(mdev)); +} + +static inline u16 mlx5e_stop_room_for_mpwqe(struct mlx5_core_dev *mdev) +{ + u8 mpwqe_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev); + + return mlx5e_stop_room_for_wqe(mdev, mpwqe_wqebbs); +} + +static inline bool mlx5e_icosq_can_post_wqe(struct mlx5e_icosq *sq, u16 wqe_size) +{ + u16 room = sq->reserved_room + MLX5E_STOP_ROOM(wqe_size); + + return mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, room); +} +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c new file mode 100644 index 0000000..70c04bc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -0,0 +1,592 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "en/xdp.h" +#include "en/params.h" + +int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) +{ + int hr = mlx5e_get_linear_rq_headroom(params, xsk); + + /* Let S := SKB_DATA_ALIGN(sizeof(struct skb_shared_info)). + * The condition checked in mlx5e_rx_is_linear_skb is: + * SKB_DATA_ALIGN(sw_mtu + hard_mtu + hr) + S <= PAGE_SIZE (1) + * (Note that hw_mtu == sw_mtu + hard_mtu.) + * What is returned from this function is: + * max_mtu = PAGE_SIZE - S - hr - hard_mtu (2) + * After assigning sw_mtu := max_mtu, the left side of (1) turns to + * SKB_DATA_ALIGN(PAGE_SIZE - S) + S, which is equal to PAGE_SIZE, + * because both PAGE_SIZE and S are already aligned. Any number greater + * than max_mtu would make the left side of (1) greater than PAGE_SIZE, + * so max_mtu is the maximum MTU allowed. 
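+ *
+ * The return statement below is (2) spelled with kernel macros:
+ * SKB_MAX_HEAD(hr) expands to PAGE_SIZE - hr - S, and MLX5E_HW2SW_MTU
+ * then subtracts hard_mtu from that value.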
+ */ + + return MLX5E_HW2SW_MTU(params, SKB_MAX_HEAD(hr)); +} + +static inline bool +mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, + struct mlx5e_dma_info *di, struct xdp_buff *xdp) +{ + struct mlx5e_xmit_data xdptxd; + struct mlx5e_xdp_info xdpi; + struct xdp_frame *xdpf; + dma_addr_t dma_addr; + + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) + return false; + + xdptxd.data = xdpf->data; + xdptxd.len = xdpf->len; + + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { + /* The xdp_buff was in the UMEM and was copied into a newly + * allocated page. The UMEM page was returned via the ZCA, and + * this new page has to be mapped at this point and has to be + * unmapped and returned via xdp_return_frame on completion. + */ + + /* Prevent double recycling of the UMEM page. Even in case this + * function returns false, the xdp_buff shouldn't be recycled, + * as it was already done in xdp_convert_zc_to_xdp_frame. + */ + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ + + xdpi.mode = MLX5E_XDP_XMIT_MODE_FRAME; + + dma_addr = dma_map_single(sq->pdev, xdptxd.data, xdptxd.len, + DMA_TO_DEVICE); + if (dma_mapping_error(sq->pdev, dma_addr)) { + xdp_return_frame(xdpf); + return false; + } + + xdptxd.dma_addr = dma_addr; + xdpi.frame.xdpf = xdpf; + xdpi.frame.dma_addr = dma_addr; + } else { + /* Driver assumes that xdp_convert_buff_to_frame returns + * an xdp_frame that points to the same memory region as + * the original xdp_buff. It allows to map the memory only + * once and to use the DMA_BIDIRECTIONAL mode. + */ + + xdpi.mode = MLX5E_XDP_XMIT_MODE_PAGE; + + dma_addr = di->addr + (xdpf->data - (void *)xdpf); + dma_sync_single_for_device(sq->pdev, dma_addr, xdptxd.len, + DMA_TO_DEVICE); + + xdptxd.dma_addr = dma_addr; + xdpi.page.rq = rq; + xdpi.page.di = *di; + } + + return INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, + mlx5e_xmit_xdp_frame, sq, &xdptxd, &xdpi, 0); +} + +/* returns true if packet was consumed by xdp */ +bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, + u32 *len, struct xdp_buff *xdp) +{ + struct bpf_prog *prog = rcu_dereference(rq->xdp_prog); + u32 act; + int err; + + if (!prog) + return false; + + act = bpf_prog_run_xdp(prog, xdp); + switch (act) { + case XDP_PASS: + *len = xdp->data_end - xdp->data; + return false; + case XDP_TX: + if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp))) + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ + return true; + case XDP_REDIRECT: + if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL) { + page_ref_sub(di->page, di->refcnt_bias); + di->refcnt_bias = 0; + } + /* When XDP enabled then page-refcnt==1 here */ + err = xdp_do_redirect(rq->netdev, xdp, prog); + if (unlikely(err)) + goto xdp_abort; + __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); + __set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); + if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL) + mlx5e_page_dma_unmap(rq, di); + rq->stats->xdp_redirect++; + return true; + default: + bpf_warn_invalid_xdp_action(rq->netdev, prog, act); + fallthrough; + case XDP_ABORTED: +xdp_abort: + trace_xdp_exception(rq->netdev, prog, act); + fallthrough; + case XDP_DROP: + rq->stats->xdp_drop++; + return true; + } +} + +static u16 mlx5e_xdpsq_get_next_pi(struct mlx5e_xdpsq *sq, u16 size) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi, contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs < size)) { + 
struct mlx5e_xdp_wqe_info *wi, *edge_wi; + + wi = &sq->db.wqe_info[pi]; + edge_wi = wi + contig_wqebbs; + + /* Fill SQ frag edge with NOPs to avoid WQE wrapping two pages. */ + for (; wi < edge_wi; wi++) { + *wi = (struct mlx5e_xdp_wqe_info) { + .num_wqebbs = 1, + .num_pkts = 0, + }; + mlx5e_post_nop(wq, sq->sqn, &sq->pc); + } + sq->stats->nops += contig_wqebbs; + + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + } + + return pi; +} + +static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5e_xdpsq_stats *stats = sq->stats; + struct mlx5e_tx_wqe *wqe; + u16 pi; + + pi = mlx5e_xdpsq_get_next_pi(sq, sq->max_sq_mpw_wqebbs); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); + net_prefetchw(wqe->data); + + *session = (struct mlx5e_tx_mpwqe) { + .wqe = wqe, + .bytes_count = 0, + .ds_count = MLX5E_TX_WQE_EMPTY_DS_COUNT, + .pkt_count = 0, + .inline_on = mlx5e_xdp_get_inline_state(sq, session->inline_on), + }; + + if (test_bit(MLX5E_SQ_STATE_TX_XDP_CSUM, &sq->state)) { + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + } + stats->mpwqe++; +} + +void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5_wqe_ctrl_seg *cseg = &session->wqe->ctrl; + u16 ds_count = session->ds_count; + u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + struct mlx5e_xdp_wqe_info *wi = &sq->db.wqe_info[pi]; + + cseg->opmod_idx_opcode = + cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_ENHANCED_MPSW); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_count); + + wi->num_wqebbs = DIV_ROUND_UP(ds_count, MLX5_SEND_WQEBB_NUM_DS); + wi->num_pkts = session->pkt_count; + + sq->pc += wi->num_wqebbs; + + sq->doorbell_cseg = cseg; + + session->wqe = NULL; /* Close session */ +} + +enum { + MLX5E_XDP_CHECK_OK = 1, + MLX5E_XDP_CHECK_START_MPWQE = 2, +}; + +INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq) +{ + if (unlikely(!sq->mpwqe.wqe)) { + if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, + sq->stop_room))) { + /* SQ is full, ring doorbell */ + mlx5e_xmit_xdp_doorbell(sq); + sq->stats->full++; + return -EBUSY; + } + + return MLX5E_XDP_CHECK_START_MPWQE; + } + + return MLX5E_XDP_CHECK_OK; +} + +INDIRECT_CALLABLE_SCOPE bool +mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, + struct mlx5e_xdp_info *xdpi, int check_result) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5e_xdpsq_stats *stats = sq->stats; + + if (unlikely(xdptxd->len > sq->hw_mtu)) { + stats->err++; + return false; + } + + if (!check_result) + check_result = mlx5e_xmit_xdp_frame_check_mpwqe(sq); + if (unlikely(check_result < 0)) + return false; + + if (check_result == MLX5E_XDP_CHECK_START_MPWQE) { + /* Start the session when nothing can fail, so it's guaranteed + * that if there is an active session, it has at least one dseg, + * and it's safe to complete it at any time. 
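+ *
+ * In other words, the length and room checks above have already passed,
+ * so the mlx5e_xdp_mpwqe_add_dseg() call below cannot fail and the
+ * freshly opened session never has to be rolled back empty.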
+ */ + mlx5e_xdp_mpwqe_session_start(sq); + } + + mlx5e_xdp_mpwqe_add_dseg(sq, xdptxd, stats); + + if (unlikely(mlx5e_xdp_mpwqe_is_full(session, sq->max_sq_mpw_wqebbs))) + mlx5e_xdp_mpwqe_complete(sq); + + mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi); + stats->xmit++; + return true; +} + +INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq) +{ + if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 1))) { + /* SQ is full, ring doorbell */ + mlx5e_xmit_xdp_doorbell(sq); + sq->stats->full++; + return -EBUSY; + } + + return MLX5E_XDP_CHECK_OK; +} + +INDIRECT_CALLABLE_SCOPE bool +mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, + struct mlx5e_xdp_info *xdpi, int check_result) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + struct mlx5_wqe_data_seg *dseg = wqe->data; + + dma_addr_t dma_addr = xdptxd->dma_addr; + u32 dma_len = xdptxd->len; + + struct mlx5e_xdpsq_stats *stats = sq->stats; + + net_prefetchw(wqe); + + if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE || sq->hw_mtu < dma_len)) { + stats->err++; + return false; + } + + if (!check_result) + check_result = mlx5e_xmit_xdp_frame_check(sq); + if (unlikely(check_result < 0)) + return false; + + cseg->fm_ce_se = 0; + + /* copy the inline part if required */ + if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) { + memcpy(eseg->inline_hdr.start, xdptxd->data, sizeof(eseg->inline_hdr.start)); + eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE); + memcpy(dseg, xdptxd->data + sizeof(eseg->inline_hdr.start), + MLX5E_XDP_MIN_INLINE - sizeof(eseg->inline_hdr.start)); + dma_len -= MLX5E_XDP_MIN_INLINE; + dma_addr += MLX5E_XDP_MIN_INLINE; + dseg++; + } + + if (test_bit(MLX5E_SQ_STATE_TX_XDP_CSUM, &sq->state)) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + + /* write the dma part */ + dseg->addr = cpu_to_be64(dma_addr); + dseg->byte_count = cpu_to_be32(dma_len); + + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND); + + sq->pc++; + + sq->doorbell_cseg = cseg; + + mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi); + stats->xmit++; + return true; +} + +static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, + struct mlx5e_xdp_wqe_info *wi, + u32 *xsk_frames, + bool recycle, + struct xdp_frame_bulk *bq) +{ + struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; + u16 i; + + for (i = 0; i < wi->num_pkts; i++) { + struct mlx5e_xdp_info xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); + + switch (xdpi.mode) { + case MLX5E_XDP_XMIT_MODE_FRAME: + /* XDP_TX from the XSK RQ and XDP_REDIRECT */ + dma_unmap_single(sq->pdev, xdpi.frame.dma_addr, + xdpi.frame.xdpf->len, DMA_TO_DEVICE); + xdp_return_frame_bulk(xdpi.frame.xdpf, bq); + break; + case MLX5E_XDP_XMIT_MODE_PAGE: + /* XDP_TX from the regular RQ */ + mlx5e_page_release_dynamic(xdpi.page.rq, &xdpi.page.di, recycle); + break; + case MLX5E_XDP_XMIT_MODE_XSK: + /* AF_XDP send */ + (*xsk_frames)++; + break; + default: + WARN_ON_ONCE(true); + } + } +} + +bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) +{ + struct xdp_frame_bulk bq; + struct mlx5e_xdpsq *sq; + struct mlx5_cqe64 *cqe; + u32 xsk_frames = 0; + u16 sqcc; + int i; + + xdp_frame_bulk_init(&bq); + + sq = container_of(cq, struct mlx5e_xdpsq, cq); + + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) + return false; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (!cqe) + 
return false; + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + sqcc = sq->cc; + + i = 0; + do { + struct mlx5e_xdp_wqe_info *wi; + u16 wqe_counter, ci; + bool last_wqe; + + mlx5_cqwq_pop(&cq->wq); + + wqe_counter = be16_to_cpu(cqe->wqe_counter); + + do { + last_wqe = (sqcc == wqe_counter); + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + wi = &sq->db.wqe_info[ci]; + + sqcc += wi->num_wqebbs; + + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true, &bq); + } while (!last_wqe); + + if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { + netdev_WARN_ONCE(sq->channel->netdev, + "Bad OP in XDPSQ CQE: 0x%x\n", + get_cqe_opcode(cqe)); + mlx5e_dump_error_cqe(&sq->cq, sq->sqn, + (struct mlx5_err_cqe *)cqe); + mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs); + } + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + + xdp_flush_frame_bulk(&bq); + + if (xsk_frames) + xsk_tx_completed(sq->xsk_pool, xsk_frames); + + sq->stats->cqes += i; + + mlx5_cqwq_update_db_record(&cq->wq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + sq->cc = sqcc; + return (i == MLX5E_TX_CQ_POLL_BUDGET); +} + +void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) +{ + struct xdp_frame_bulk bq; + u32 xsk_frames = 0; + + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ + + while (sq->cc != sq->pc) { + struct mlx5e_xdp_wqe_info *wi; + u16 ci; + + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc); + wi = &sq->db.wqe_info[ci]; + + sq->cc += wi->num_wqebbs; + + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false, &bq); + } + + xdp_flush_frame_bulk(&bq); + rcu_read_unlock(); + + if (xsk_frames) + xsk_tx_completed(sq->xsk_pool, xsk_frames); +} + +int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_xdpsq *sq; + int nxmit = 0; + int sq_num; + int i; + + /* this flag is sufficient, no need to test internal sq state */ + if (unlikely(!mlx5e_xdp_tx_is_enabled(priv))) + return -ENETDOWN; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + sq_num = smp_processor_id(); + + if (unlikely(sq_num >= priv->channels.num)) + return -ENXIO; + + sq = &priv->channels.c[sq_num]->xdpsq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + struct mlx5e_xmit_data xdptxd; + struct mlx5e_xdp_info xdpi; + bool ret; + + xdptxd.data = xdpf->data; + xdptxd.len = xdpf->len; + xdptxd.dma_addr = dma_map_single(sq->pdev, xdptxd.data, + xdptxd.len, DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(sq->pdev, xdptxd.dma_addr))) + break; + + xdpi.mode = MLX5E_XDP_XMIT_MODE_FRAME; + xdpi.frame.xdpf = xdpf; + xdpi.frame.dma_addr = xdptxd.dma_addr; + + ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, + mlx5e_xmit_xdp_frame, sq, &xdptxd, &xdpi, 0); + if (unlikely(!ret)) { + dma_unmap_single(sq->pdev, xdptxd.dma_addr, + xdptxd.len, DMA_TO_DEVICE); + break; + } + nxmit++; + } + + if (flags & XDP_XMIT_FLUSH) { + if (sq->mpwqe.wqe) + mlx5e_xdp_mpwqe_complete(sq); + mlx5e_xmit_xdp_doorbell(sq); + } + + return nxmit; +} + +void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq) +{ + struct mlx5e_xdpsq *xdpsq = rq->xdpsq; + + if (xdpsq->mpwqe.wqe) + mlx5e_xdp_mpwqe_complete(xdpsq); + + mlx5e_xmit_xdp_doorbell(xdpsq); + + if (test_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags)) { + xdp_do_flush_map(); + __clear_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); + } +} + +void mlx5e_set_xmit_fp(struct 
mlx5e_xdpsq *sq, bool is_mpw) +{ + sq->xmit_xdp_frame_check = is_mpw ? + mlx5e_xmit_xdp_frame_check_mpwqe : mlx5e_xmit_xdp_frame_check; + sq->xmit_xdp_frame = is_mpw ? + mlx5e_xmit_xdp_frame_mpwqe : mlx5e_xmit_xdp_frame; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h new file mode 100644 index 0000000..b98e0e1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __MLX5_EN_XDP_H__ +#define __MLX5_EN_XDP_H__ + +#include + +#include "en.h" +#include "en/txrx.h" + +#define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) +#define MLX5E_XDP_TX_DS_COUNT (MLX5E_TX_WQE_EMPTY_DS_COUNT + 1 /* SG DS */) + +#define MLX5E_XDP_INLINE_WQE_MAX_DS_CNT 16 +#define MLX5E_XDP_INLINE_WQE_SZ_THRSD \ + (MLX5E_XDP_INLINE_WQE_MAX_DS_CNT * MLX5_SEND_WQE_DS - \ + sizeof(struct mlx5_wqe_inline_seg)) + +struct mlx5e_xsk_param; +int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); +bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, + u32 *len, struct xdp_buff *xdp); +void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq); +bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq); +void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq); +void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw); +void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq); +int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); + +INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, + struct mlx5e_xmit_data *xdptxd, + struct mlx5e_xdp_info *xdpi, + int check_result)); +INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, + struct mlx5e_xmit_data *xdptxd, + struct mlx5e_xdp_info *xdpi, + int check_result)); +INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq)); +INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq)); + +static inline void mlx5e_xdp_tx_enable(struct mlx5e_priv *priv) +{ + set_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); + + if (priv->channels.params.xdp_prog) + set_bit(MLX5E_STATE_XDP_ACTIVE, &priv->state); +} + +static inline void mlx5e_xdp_tx_disable(struct mlx5e_priv *priv) +{ + if (priv->channels.params.xdp_prog) + clear_bit(MLX5E_STATE_XDP_ACTIVE, &priv->state); + + clear_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); + /* Let other device's napi(s) and XSK wakeups see our new state. */ + synchronize_net(); +} + +static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv) +{ + return test_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); +} + +static inline bool mlx5e_xdp_is_active(struct mlx5e_priv *priv) +{ + return test_bit(MLX5E_STATE_XDP_ACTIVE, &priv->state); +} + +static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq) +{ + if (sq->doorbell_cseg) { + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg); + sq->doorbell_cseg = NULL; + } +} + +/* Enable inline WQEs to shift some load from a congested HCA (HW) to + * a less congested cpu (SW). 
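+ *
+ * The decision below uses hysteresis on the number of outstanding
+ * entries in the xdpi FIFO: inlining is switched off once the backlog
+ * drains to the low watermark (the HCA is keeping up) and switched back
+ * on once it grows past the high watermark, so the mode does not flap.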
+ */ +static inline bool mlx5e_xdp_get_inline_state(struct mlx5e_xdpsq *sq, bool cur) +{ + u16 outstanding = sq->xdpi_fifo_pc - sq->xdpi_fifo_cc; + +#define MLX5E_XDP_INLINE_WATERMARK_LOW 10 +#define MLX5E_XDP_INLINE_WATERMARK_HIGH 128 + + if (cur && outstanding <= MLX5E_XDP_INLINE_WATERMARK_LOW) + return false; + + if (!cur && outstanding >= MLX5E_XDP_INLINE_WATERMARK_HIGH) + return true; + + return cur; +} + +static inline bool mlx5e_xdp_mpwqe_is_full(struct mlx5e_tx_mpwqe *session, u8 max_sq_mpw_wqebbs) +{ + if (session->inline_on) + return session->ds_count + MLX5E_XDP_INLINE_WQE_MAX_DS_CNT > + max_sq_mpw_wqebbs * MLX5_SEND_WQEBB_NUM_DS; + + return mlx5e_tx_mpwqe_is_full(session, max_sq_mpw_wqebbs); +} + +struct mlx5e_xdp_wqe_info { + u8 num_wqebbs; + u8 num_pkts; +}; + +static inline void +mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, + struct mlx5e_xmit_data *xdptxd, + struct mlx5e_xdpsq_stats *stats) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5_wqe_data_seg *dseg = + (struct mlx5_wqe_data_seg *)session->wqe + session->ds_count; + u32 dma_len = xdptxd->len; + + session->pkt_count++; + session->bytes_count += dma_len; + + if (session->inline_on && dma_len <= MLX5E_XDP_INLINE_WQE_SZ_THRSD) { + struct mlx5_wqe_inline_seg *inline_dseg = + (struct mlx5_wqe_inline_seg *)dseg; + u16 ds_len = sizeof(*inline_dseg) + dma_len; + u16 ds_cnt = DIV_ROUND_UP(ds_len, MLX5_SEND_WQE_DS); + + inline_dseg->byte_count = cpu_to_be32(dma_len | MLX5_INLINE_SEG); + memcpy(inline_dseg->data, xdptxd->data, dma_len); + + session->ds_count += ds_cnt; + stats->inlnw++; + return; + } + + dseg->addr = cpu_to_be64(xdptxd->dma_addr); + dseg->byte_count = cpu_to_be32(dma_len); + dseg->lkey = sq->mkey_be; + session->ds_count++; +} + +static inline void +mlx5e_xdpi_fifo_push(struct mlx5e_xdp_info_fifo *fifo, + struct mlx5e_xdp_info *xi) +{ + u32 i = (*fifo->pc)++ & fifo->mask; + + fifo->xi[i] = *xi; +} + +static inline struct mlx5e_xdp_info +mlx5e_xdpi_fifo_pop(struct mlx5e_xdp_info_fifo *fifo) +{ + return fifo->xi[(*fifo->cc)++ & fifo->mask]; +} +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c new file mode 100644 index 0000000..e740268 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#include +#include "pool.h" +#include "setup.h" +#include "en/params.h" + +static int mlx5e_xsk_map_pool(struct mlx5e_priv *priv, + struct xsk_buff_pool *pool) +{ + struct device *dev = mlx5_core_dma_dev(priv->mdev); + + return xsk_pool_dma_map(pool, dev, DMA_ATTR_SKIP_CPU_SYNC); +} + +static void mlx5e_xsk_unmap_pool(struct mlx5e_priv *priv, + struct xsk_buff_pool *pool) +{ + return xsk_pool_dma_unmap(pool, DMA_ATTR_SKIP_CPU_SYNC); +} + +static int mlx5e_xsk_get_pools(struct mlx5e_xsk *xsk) +{ + if (!xsk->pools) { + xsk->pools = kcalloc(MLX5E_MAX_NUM_CHANNELS, + sizeof(*xsk->pools), GFP_KERNEL); + if (unlikely(!xsk->pools)) + return -ENOMEM; + } + + xsk->refcnt++; + xsk->ever_used = true; + + return 0; +} + +static void mlx5e_xsk_put_pools(struct mlx5e_xsk *xsk) +{ + if (!--xsk->refcnt) { + kfree(xsk->pools); + xsk->pools = NULL; + } +} + +static int mlx5e_xsk_add_pool(struct mlx5e_xsk *xsk, struct xsk_buff_pool *pool, u16 ix) +{ + int err; + + err = mlx5e_xsk_get_pools(xsk); + if (unlikely(err)) + return err; + + xsk->pools[ix] = pool; + return 0; +} + +static void mlx5e_xsk_remove_pool(struct mlx5e_xsk *xsk, u16 ix) +{ + xsk->pools[ix] = NULL; + + mlx5e_xsk_put_pools(xsk); +} + +static bool mlx5e_xsk_is_pool_sane(struct xsk_buff_pool *pool) +{ + return xsk_pool_get_headroom(pool) <= 0xffff && + xsk_pool_get_chunk_size(pool) <= 0xffff; +} + +void mlx5e_build_xsk_param(struct xsk_buff_pool *pool, struct mlx5e_xsk_param *xsk) +{ + xsk->headroom = xsk_pool_get_headroom(pool); + xsk->chunk_size = xsk_pool_get_chunk_size(pool); +} + +static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv, + struct xsk_buff_pool *pool, u16 ix) +{ + struct mlx5e_params *params = &priv->channels.params; + struct mlx5e_xsk_param xsk; + struct mlx5e_channel *c; + int err; + + if (unlikely(mlx5e_xsk_get_pool(&priv->channels.params, &priv->xsk, ix))) + return -EBUSY; + + if (unlikely(!mlx5e_xsk_is_pool_sane(pool))) + return -EINVAL; + + err = mlx5e_xsk_map_pool(priv, pool); + if (unlikely(err)) + return err; + + err = mlx5e_xsk_add_pool(&priv->xsk, pool, ix); + if (unlikely(err)) + goto err_unmap_pool; + + mlx5e_build_xsk_param(pool, &xsk); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + /* XSK objects will be created on open. */ + goto validate_closed; + } + + if (!params->xdp_prog) { + /* XSK objects will be created when an XDP program is set, + * and the channels are reopened. + */ + goto validate_closed; + } + + c = priv->channels.c[ix]; + + err = mlx5e_open_xsk(priv, params, &xsk, pool, c); + if (unlikely(err)) + goto err_remove_pool; + + mlx5e_activate_xsk(c); + mlx5e_trigger_napi_icosq(c); + + /* Don't wait for WQEs, because the newer xdpsock sample doesn't provide + * any Fill Ring entries at the setup stage. + */ + + err = mlx5e_rx_res_xsk_activate(priv->rx_res, &priv->channels, ix); + if (unlikely(err)) + goto err_deactivate; + + return 0; + +err_deactivate: + mlx5e_deactivate_xsk(c); + synchronize_net(); /* Sync with NAPI. */ + mlx5e_close_xsk(c); + +err_remove_pool: + mlx5e_xsk_remove_pool(&priv->xsk, ix); + +err_unmap_pool: + mlx5e_xsk_unmap_pool(priv, pool); + + return err; + +validate_closed: + /* Check the configuration in advance, rather than fail at a later stage + * (in mlx5e_xdp_set or on open) and end up with no channels. 
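+ *
+ * The pool itself stays mapped and registered in priv->xsk; the actual
+ * XSK RQ/SQ objects are only created later, when the channels are
+ * (re)opened with an XDP program attached.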
+ */ + if (!mlx5e_validate_xsk_param(params, &xsk, priv->mdev)) { + err = -EINVAL; + goto err_remove_pool; + } + + return 0; +} + +static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix) +{ + struct xsk_buff_pool *pool = mlx5e_xsk_get_pool(&priv->channels.params, + &priv->xsk, ix); + struct mlx5e_channel *c; + + if (unlikely(!pool)) + return -EINVAL; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto remove_pool; + + /* XSK RQ and SQ are only created if XDP program is set. */ + if (!priv->channels.params.xdp_prog) + goto remove_pool; + + c = priv->channels.c[ix]; + mlx5e_rx_res_xsk_deactivate(priv->rx_res, ix); + mlx5e_deactivate_xsk(c); + synchronize_net(); /* Sync with NAPI. */ + mlx5e_close_xsk(c); + +remove_pool: + mlx5e_xsk_remove_pool(&priv->xsk, ix); + mlx5e_xsk_unmap_pool(priv, pool); + + return 0; +} + +static int mlx5e_xsk_enable_pool(struct mlx5e_priv *priv, struct xsk_buff_pool *pool, + u16 ix) +{ + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_xsk_enable_locked(priv, pool, ix); + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_xsk_disable_pool(struct mlx5e_priv *priv, u16 ix) +{ + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_xsk_disable_locked(priv, ix); + mutex_unlock(&priv->state_lock); + + return err; +} + +int mlx5e_xsk_setup_pool(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_params *params = &priv->channels.params; + u16 ix; + + if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix))) + return -EINVAL; + + return pool ? mlx5e_xsk_enable_pool(priv, pool, ix) : + mlx5e_xsk_disable_pool(priv, ix); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h new file mode 100644 index 0000000..dca0010 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5_EN_XSK_POOL_H__ +#define __MLX5_EN_XSK_POOL_H__ + +#include "en.h" + +static inline struct xsk_buff_pool *mlx5e_xsk_get_pool(struct mlx5e_params *params, + struct mlx5e_xsk *xsk, u16 ix) +{ + if (!xsk || !xsk->pools) + return NULL; + + if (unlikely(ix >= params->num_channels)) + return NULL; + + return xsk->pools[ix]; +} + +struct mlx5e_xsk_param; +void mlx5e_build_xsk_param(struct xsk_buff_pool *pool, struct mlx5e_xsk_param *xsk); + +/* .ndo_bpf callback. */ +int mlx5e_xsk_setup_pool(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid); + +#endif /* __MLX5_EN_XSK_POOL_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c new file mode 100644 index 0000000..9dd73ba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include "rx.h" +#include "en/xdp.h" +#include + +/* RX data path */ + +static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data, + u32 cqe_bcnt) +{ + struct sk_buff *skb; + + skb = napi_alloc_skb(rq->cq.napi, cqe_bcnt); + if (unlikely(!skb)) { + rq->stats->buff_alloc_err++; + return NULL; + } + + skb_put_data(skb, data, cqe_bcnt); + + return skb; +} + +struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, + struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, + u32 head_offset, + u32 page_idx) +{ + struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk; + u32 cqe_bcnt32 = cqe_bcnt; + + /* Check packet size. Note LRO doesn't use linear SKB */ + if (unlikely(cqe_bcnt > rq->hw_mtu + rq->pet_hdr_size)) { + rq->stats->oversize_pkts_sw_drop++; + return NULL; + } + + /* head_offset is not used in this function, because xdp->data and the + * DMA address point directly to the necessary place. Furthermore, in + * the current implementation, UMR pages are mapped to XSK frames, so + * head_offset should always be 0. + */ + WARN_ON_ONCE(head_offset); + + xdp->data_end = xdp->data + cqe_bcnt32; + xdp_set_data_meta_invalid(xdp); + xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool); + net_prefetch(xdp->data); + + /* Possible flows: + * - XDP_REDIRECT to XSKMAP: + * The page is owned by the userspace from now. + * - XDP_TX and other XDP_REDIRECTs: + * The page was returned by ZCA and recycled. + * - XDP_DROP: + * Recycle the page. + * - XDP_PASS: + * Allocate an SKB, copy the data and recycle the page. + * + * Pages to be recycled go to the Reuse Ring on MPWQE deallocation. Its + * size is the same as the Driver RX Ring's size, and pages for WQEs are + * allocated first from the Reuse Ring, so it has enough space. + */ + + if (likely(mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp))) { + if (likely(__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))) + __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */ + return NULL; /* page/packet was consumed by XDP */ + } + + /* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the + * frame. On SKB allocation failure, NULL is returned. + */ + return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32); +} + +struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + struct mlx5e_wqe_frag_info *wi, + u32 cqe_bcnt) +{ + struct xdp_buff *xdp = wi->di->xsk; + + /* wi->offset is not used in this function, because xdp->data and the + * DMA address point directly to the necessary place. Furthermore, the + * XSK allocator allocates frames per packet, instead of pages, so + * wi->offset should always be 0. + */ + WARN_ON_ONCE(wi->offset); + + xdp->data_end = xdp->data + cqe_bcnt; + xdp_set_data_meta_invalid(xdp); + xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool); + net_prefetch(xdp->data); + + if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) { + rq->stats->wqe_err++; + return NULL; + } + + if (likely(mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp))) + return NULL; /* page/packet was consumed by XDP */ + + /* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse + * will be handled by mlx5e_put_rx_frag. + * On SKB allocation failure, NULL is returned. 
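+ *
+ * Copying is unavoidable here: the UMEM frame belongs to the XSK pool
+ * and must be returned to it, so it cannot be handed to the stack as
+ * the SKB's data buffer.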
+ */ + return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h new file mode 100644 index 0000000..7f88ccf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_EN_XSK_RX_H__ +#define __MLX5_EN_XSK_RX_H__ + +#include "en.h" +#include + +/* RX data path */ + +struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, + struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, + u32 head_offset, + u32 page_idx); +struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + struct mlx5e_wqe_frag_info *wi, + u32 cqe_bcnt); + +static inline int mlx5e_xsk_page_alloc_pool(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + dma_info->xsk = xsk_buff_alloc(rq->xsk_pool); + if (!dma_info->xsk) + return -ENOMEM; + + /* Store the DMA address without headroom. In striding RQ case, we just + * provide pages for UMR, and headroom is counted at the setup stage + * when creating a WQE. In non-striding RQ case, headroom is accounted + * in mlx5e_alloc_rx_wqe. + */ + dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk); + + return 0; +} + +static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err) +{ + if (!xsk_uses_need_wakeup(rq->xsk_pool)) + return alloc_err; + + if (unlikely(alloc_err)) + xsk_set_rx_need_wakeup(rq->xsk_pool); + else + xsk_clear_rx_need_wakeup(rq->xsk_pool); + + return false; +} + +#endif /* __MLX5_EN_XSK_RX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c new file mode 100644 index 0000000..d667c8c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "setup.h" +#include "en/params.h" +#include "en/txrx.h" +#include "en/health.h" + +/* It matches XDP_UMEM_MIN_CHUNK_SIZE, but as this constant is private and may + * change unexpectedly, and mlx5e has a minimum valid stride size for striding + * RQ, keep this check in the driver. + */ +#define MLX5E_MIN_XSK_CHUNK_SIZE 2048 + +bool mlx5e_validate_xsk_param(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + struct mlx5_core_dev *mdev) +{ + /* AF_XDP doesn't support frames larger than PAGE_SIZE. */ + if (xsk->chunk_size > PAGE_SIZE || + xsk->chunk_size < MLX5E_MIN_XSK_CHUNK_SIZE) + return false; + + /* Current MTU and XSK headroom don't allow packets to fit the frames. */ + if (mlx5e_rx_get_min_frag_sz(params, xsk) > xsk->chunk_size) + return false; + + /* frag_sz is different for regular and XSK RQs, so ensure that linear + * SKB mode is possible. 
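+ *
+ * Each received packet must fit into a single UMEM frame (the XSK data
+ * path has no multi-fragment fallback), so a configuration that would
+ * only work in non-linear mode cannot be used with an XSK pool.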
+ */ + switch (params->rq_wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + return !mlx5e_mpwrq_validate_xsk(mdev, params, xsk); + default: /* MLX5_WQ_TYPE_CYCLIC */ + return mlx5e_rx_is_linear_skb(params, xsk); + } +} + +static void mlx5e_build_xsk_cparam(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + u16 q_counter, + struct mlx5e_channel_param *cparam) +{ + mlx5e_build_rq_param(mdev, params, xsk, q_counter, &cparam->rq); + mlx5e_build_xdpsq_param(mdev, params, &cparam->xdp_sq); +} + +static int mlx5e_init_xsk_rq(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct xsk_buff_pool *pool, + struct mlx5e_xsk_param *xsk, + struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = c->mdev; + int rq_xdp_ix; + int err; + + rq->wq_type = params->rq_wq_type; + rq->pdev = c->pdev; + rq->netdev = c->netdev; + rq->priv = c->priv; + rq->tstamp = c->tstamp; + rq->clock = &mdev->clock; + rq->icosq = &c->icosq; + rq->ix = c->ix; + rq->channel = c; + rq->mdev = mdev; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->xdpsq = &c->rq_xdpsq; + rq->xsk_pool = pool; + rq->stats = &c->priv->channel_stats[c->ix]->xskrq; + rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); + rq_xdp_ix = c->ix + params->num_channels * MLX5E_RQ_GROUP_XSK; + err = mlx5e_rq_set_handlers(rq, params, xsk); + if (err) + return err; + + return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix, 0); +} + +static int mlx5e_open_xsk_rq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_rq_param *rq_params, struct xsk_buff_pool *pool, + struct mlx5e_create_cq_param *ccp, struct mlx5e_xsk_param *xsk) +{ + int err; + + err = mlx5e_init_xsk_rq(c, params, pool, xsk, &c->xskrq); + if (err) + return err; + + return mlx5e_open_rq(c->priv, params, rq_params, xsk, ccp, params->rx_cq_moderation, cpu_to_node(c->cpu), &c->xskrq); +} + +int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, struct xsk_buff_pool *pool, + struct mlx5e_channel *c) +{ + struct mlx5e_channel_param *cparam; + struct mlx5e_create_cq_param ccp; + int err; + + mlx5e_build_create_cq_param(&ccp, c); + + if (!mlx5e_validate_xsk_param(params, xsk, priv->mdev)) + return -EINVAL; + + cparam = kvzalloc(sizeof(*cparam), GFP_KERNEL); + if (!cparam) + return -ENOMEM; + + mlx5e_build_xsk_cparam(priv->mdev, params, xsk, priv->q_counter, cparam); + + err = mlx5e_open_xsk_rq(c, params, &cparam->rq, pool, &ccp, xsk); + if (unlikely(err)) + goto err_free_cparam; + + err = mlx5e_open_cq(c->priv, params->tx_cq_moderation, &cparam->xdp_sq.cqp, &ccp, + &c->xsksq.cq); + if (unlikely(err)) + goto err_close_rq; + + /* Create a separate SQ, so that when the buff pool is disabled, we could + * close this SQ safely and stop receiving CQEs. In other case, e.g., if + * the XDPSQ was used instead, we might run into trouble when the buff pool + * is disabled and then re-enabled, but the SQ continues receiving CQEs + * from the old buff pool. + */ + err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, pool, &c->xsksq, true); + if (unlikely(err)) + goto err_close_tx_cq; + + kvfree(cparam); + + set_bit(MLX5E_CHANNEL_STATE_XSK, c->state); + + return 0; + +err_close_tx_cq: + mlx5e_close_cq(&c->xsksq.cq); + +err_close_rq: + mlx5e_close_rq(c->priv, &c->xskrq); + +err_free_cparam: + kvfree(cparam); + + return err; +} + +void mlx5e_close_xsk(struct mlx5e_channel *c) +{ + clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state); + synchronize_net(); /* Sync with the XSK wakeup and with NAPI. 
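+ * Ordering note: the XSK bit is cleared and synchronize_net() runs before the RQ, XDPSQ and CQ below are torn down, so neither NAPI nor a late wakeup can touch the queues while they are being destroyed.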
*/ + + mlx5e_close_rq(c->priv, &c->xskrq); + mlx5e_close_xdpsq(&c->xsksq); + mlx5e_close_cq(&c->xsksq.cq); + + memset(&c->xsksq, 0, sizeof(c->xsksq)); +} + +void mlx5e_activate_xsk(struct mlx5e_channel *c) +{ + /* ICOSQ recovery deactivates RQs. Suspend the recovery to avoid + * activating XSKRQ in the middle of recovery. + */ + mlx5e_reporter_icosq_suspend_recovery(c); + set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state); + mlx5e_reporter_icosq_resume_recovery(c); + + mlx5e_activate_xdpsq(&c->xsksq); +} + +void mlx5e_deactivate_xsk(struct mlx5e_channel *c) +{ + mlx5e_deactivate_xdpsq(&c->xsksq); + /* ICOSQ recovery may reactivate XSKRQ if clear_bit is called in the + * middle of recovery. Suspend the recovery to avoid it. + */ + mlx5e_reporter_icosq_suspend_recovery(c); + clear_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state); + mlx5e_reporter_icosq_resume_recovery(c); + synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */ +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h new file mode 100644 index 0000000..50e111b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_EN_XSK_SETUP_H__ +#define __MLX5_EN_XSK_SETUP_H__ + +#include "en.h" + +struct mlx5e_xsk_param; + +bool mlx5e_validate_xsk_param(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + struct mlx5_core_dev *mdev); +int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, struct xsk_buff_pool *pool, + struct mlx5e_channel *c); +void mlx5e_close_xsk(struct mlx5e_channel *c); +void mlx5e_activate_xsk(struct mlx5e_channel *c); +void mlx5e_deactivate_xsk(struct mlx5e_channel *c); + +#endif /* __MLX5_EN_XSK_SETUP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c new file mode 100644 index 0000000..8e96260 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "tx.h" +#include "pool.h" +#include "en/xdp.h" +#include "en/params.h" +#include + +int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_params *params = &priv->channels.params; + struct mlx5e_channel *c; + u16 ix; + + if (unlikely(!mlx5e_xdp_is_active(priv))) + return -ENETDOWN; + + if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix))) + return -EINVAL; + + c = priv->channels.c[ix]; + + if (unlikely(!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))) + return -ENXIO; + + if (!napi_if_scheduled_mark_missed(&c->napi)) { + /* To avoid WQE overrun, don't post a NOP if async_icosq is not + * active and not polled by NAPI. Return 0, because the upcoming + * activate will trigger the IRQ for us. 
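+ * + * For context, an illustrative userspace-side sketch using the libbpf/libxdp AF_XDP helpers (the xsk struct fields below come from the common sample applications, not from this driver): an application typically reaches this callback via sendto() or poll() after observing the need_wakeup flag, e.g. + * + *	if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + *		sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0); + *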
+ */ + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &c->async_icosq.state))) + return 0; + + if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state)) + return 0; + + spin_lock_bh(&c->async_icosq_lock); + mlx5e_trigger_irq(&c->async_icosq); + spin_unlock_bh(&c->async_icosq_lock); + } + + return 0; +} + +/* When TX fails (because of the size of the packet), we need to get completions + * in order, so post a NOP to get a CQE. Since AF_XDP doesn't distinguish + * between successful TX and errors, handling in mlx5e_poll_xdpsq_cq is the + * same. + */ +static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq, + struct mlx5e_xdp_info *xdpi) +{ + u16 pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + struct mlx5e_xdp_wqe_info *wi = &sq->db.wqe_info[pi]; + struct mlx5e_tx_wqe *nopwqe; + + wi->num_wqebbs = 1; + wi->num_pkts = 1; + + nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc); + mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi); + sq->doorbell_cseg = &nopwqe->ctrl; +} + +bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) +{ + struct xsk_buff_pool *pool = sq->xsk_pool; + struct mlx5e_xmit_data xdptxd; + struct mlx5e_xdp_info xdpi; + bool work_done = true; + bool flush = false; + + xdpi.mode = MLX5E_XDP_XMIT_MODE_XSK; + + for (; budget; budget--) { + int check_result = INDIRECT_CALL_2(sq->xmit_xdp_frame_check, + mlx5e_xmit_xdp_frame_check_mpwqe, + mlx5e_xmit_xdp_frame_check, + sq); + struct xdp_desc desc; + bool ret; + + if (unlikely(check_result < 0)) { + work_done = false; + break; + } + + if (!xsk_tx_peek_desc(pool, &desc)) { + /* TX will get stuck until something wakes it up by + * triggering NAPI. Currently it's expected that the + * application calls sendto() if there are consumed, but + * not completed frames. + */ + break; + } + + xdptxd.dma_addr = xsk_buff_raw_get_dma(pool, desc.addr); + xdptxd.data = xsk_buff_raw_get_data(pool, desc.addr); + xdptxd.len = desc.len; + + xsk_buff_raw_dma_sync_for_device(pool, xdptxd.dma_addr, xdptxd.len); + + ret = INDIRECT_CALL_2(sq->xmit_xdp_frame, mlx5e_xmit_xdp_frame_mpwqe, + mlx5e_xmit_xdp_frame, sq, &xdptxd, &xdpi, check_result); + if (unlikely(!ret)) { + if (sq->mpwqe.wqe) + mlx5e_xdp_mpwqe_complete(sq); + + mlx5e_xsk_tx_post_err(sq, &xdpi); + } + + flush = true; + } + + if (flush) { + if (sq->mpwqe.wqe) + mlx5e_xdp_mpwqe_complete(sq); + mlx5e_xmit_xdp_doorbell(sq); + + xsk_tx_release(pool); + } + + return !(budget && work_done); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h new file mode 100644 index 0000000..a050850 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __MLX5_EN_XSK_TX_H__ +#define __MLX5_EN_XSK_TX_H__ + +#include "en.h" +#include + +/* TX data path */ + +int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags); + +bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget); + +static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq) +{ + if (!xsk_uses_need_wakeup(sq->xsk_pool)) + return; + + if (sq->pc != sq->cc) + xsk_clear_tx_need_wakeup(sq->xsk_pool); + else + xsk_set_tx_need_wakeup(sq->xsk_pool); +} + +#endif /* __MLX5_EN_XSK_TX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h new file mode 100644 index 0000000..abeb8e5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __MLX5E_EN_ACCEL_H__ +#define __MLX5E_EN_ACCEL_H__ + +#include +#include +#include "en_accel/ipsec_rxtx.h" +#include "en_accel/tls.h" +#include "en_accel/tls_rxtx.h" +#include +#include "en.h" +#include "en/txrx.h" + +#if IS_ENABLED(CONFIG_GENEVE) +#include + +static inline bool mlx5_geneve_tx_allowed(struct mlx5_core_dev *mdev) +{ + return mlx5_tx_swp_supported(mdev); +} + +static inline void +mlx5e_tx_tunnel_accel(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg, u16 ihs) +{ + struct mlx5e_swp_spec swp_spec = {}; + unsigned int offset = 0; + __be16 l3_proto; + u8 l4_proto; + + l3_proto = vlan_get_protocol(skb); + switch (l3_proto) { + case htons(ETH_P_IP): + l4_proto = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + l4_proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL); + break; + default: + return; + } + + if (l4_proto != IPPROTO_UDP || + udp_hdr(skb)->dest != cpu_to_be16(GENEVE_UDP_PORT)) + return; + swp_spec.l3_proto = l3_proto; + swp_spec.l4_proto = l4_proto; + swp_spec.is_tun = true; + if (inner_ip_hdr(skb)->version == 6) { + swp_spec.tun_l3_proto = htons(ETH_P_IPV6); + swp_spec.tun_l4_proto = inner_ipv6_hdr(skb)->nexthdr; + } else { + swp_spec.tun_l3_proto = htons(ETH_P_IP); + swp_spec.tun_l4_proto = inner_ip_hdr(skb)->protocol; + } + + mlx5e_set_eseg_swp(skb, eseg, &swp_spec); + if (skb_vlan_tag_present(skb) && ihs) + mlx5e_eseg_swp_offsets_add_vlan(eseg); +} + +#else +static inline bool mlx5_geneve_tx_allowed(struct mlx5_core_dev *mdev) +{ + return false; +} + +#endif /* CONFIG_GENEVE */ + +static inline void +mlx5e_udp_gso_handle_tx_skb(struct sk_buff *skb) +{ + int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr); + + udp_hdr(skb)->len = htons(payload_len); +} + +struct mlx5e_accel_tx_state { +#ifdef CONFIG_MLX5_EN_TLS + struct mlx5e_accel_tx_tls_state tls; +#endif +#ifdef CONFIG_MLX5_EN_IPSEC + struct mlx5e_accel_tx_ipsec_state ipsec; +#endif +}; + +static inline bool mlx5e_accel_tx_begin(struct net_device *dev, + struct mlx5e_txqsq *sq, + struct sk_buff *skb, + struct mlx5e_accel_tx_state *state) +{ + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + mlx5e_udp_gso_handle_tx_skb(skb); + +#ifdef CONFIG_MLX5_EN_TLS + /* May send SKBs and WQEs. 
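+ * Handler order in this function: the UDP GSO length fixup above runs first, then TLS here, then IPsec and MACsec below; a false return from any handler aborts transmission of this SKB rather than sending it without the offload.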
*/ + if (mlx5e_tls_skb_offloaded(skb)) + if (unlikely(!mlx5e_tls_handle_tx_skb(dev, sq, skb, &state->tls))) + return false; +#endif + +#ifdef CONFIG_MLX5_EN_IPSEC + if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state) && xfrm_offload(skb)) { + if (unlikely(!mlx5e_ipsec_handle_tx_skb(dev, skb, &state->ipsec))) + return false; + } +#endif + +#ifdef CONFIG_MLX5_EN_MACSEC + if (unlikely(mlx5e_macsec_skb_is_offload(skb))) { + struct mlx5e_priv *priv = netdev_priv(dev); + + if (unlikely(!mlx5e_macsec_handle_tx_skb(priv->macsec, skb))) + return false; + } +#endif + + return true; +} + +static inline bool mlx5e_accel_tx_is_ipsec_flow(struct mlx5e_accel_tx_state *state) +{ +#ifdef CONFIG_MLX5_EN_IPSEC + return mlx5e_ipsec_is_tx_flow(&state->ipsec); +#else + return false; +#endif +} + +static inline unsigned int mlx5e_accel_tx_ids_len(struct mlx5e_txqsq *sq, + struct mlx5e_accel_tx_state *state) +{ +#ifdef CONFIG_MLX5_EN_IPSEC + if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state)) + return mlx5e_ipsec_tx_ids_len(&state->ipsec); +#endif + + return 0; +} + +/* Part of the eseg touched by TX offloads */ +#define MLX5E_ACCEL_ESEG_LEN offsetof(struct mlx5_wqe_eth_seg, mss) + +static inline void mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, + struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg, u16 ihs) +{ +#ifdef CONFIG_MLX5_EN_IPSEC + if (xfrm_offload(skb)) + mlx5e_ipsec_tx_build_eseg(priv, skb, eseg); +#endif + +#ifdef CONFIG_MLX5_EN_MACSEC + if (unlikely(mlx5e_macsec_skb_is_offload(skb))) + mlx5e_macsec_tx_build_eseg(priv->macsec, skb, eseg); +#endif + +#if IS_ENABLED(CONFIG_GENEVE) + if (skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) + mlx5e_tx_tunnel_accel(skb, eseg, ihs); +#endif +} + +static inline void mlx5e_accel_tx_finish(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe *wqe, + struct mlx5e_accel_tx_state *state, + struct mlx5_wqe_inline_seg *inlseg) +{ +#ifdef CONFIG_MLX5_EN_TLS + mlx5e_tls_handle_tx_wqe(&wqe->ctrl, &state->tls); +#endif + +#ifdef CONFIG_MLX5_EN_IPSEC + if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state) && + state->ipsec.xo && state->ipsec.tailen) + mlx5e_ipsec_handle_tx_wqe(wqe, &state->ipsec, inlseg); +#endif +} + +static inline int mlx5e_accel_init_rx(struct mlx5e_priv *priv) +{ + return mlx5e_ktls_init_rx(priv); +} + +static inline void mlx5e_accel_cleanup_rx(struct mlx5e_priv *priv) +{ + mlx5e_ktls_cleanup_rx(priv); +} +#endif /* __MLX5E_EN_ACCEL_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c new file mode 100644 index 0000000..4c4ee52 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#include +#include "en_accel/fs_tcp.h" +#include "fs_core.h" + +enum accel_fs_tcp_type { + ACCEL_FS_IPV4_TCP, + ACCEL_FS_IPV6_TCP, + ACCEL_FS_TCP_NUM_TYPES, +}; + +struct mlx5e_accel_fs_tcp { + struct mlx5e_flow_table tables[ACCEL_FS_TCP_NUM_TYPES]; + struct mlx5_flow_handle *default_rules[ACCEL_FS_TCP_NUM_TYPES]; +}; + +static enum mlx5_traffic_types fs_accel2tt(enum accel_fs_tcp_type i) +{ + switch (i) { + case ACCEL_FS_IPV4_TCP: + return MLX5_TT_IPV4_TCP; + default: /* ACCEL_FS_IPV6_TCP */ + return MLX5_TT_IPV6_TCP; + } +} + +static void accel_fs_tcp_set_ipv4_flow(struct mlx5_flow_spec *spec, struct sock *sk) +{ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_TCP); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, 4); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4), + &inet_sk(sk)->inet_daddr, 4); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &inet_sk(sk)->inet_rcv_saddr, 4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void accel_fs_tcp_set_ipv6_flow(struct mlx5_flow_spec *spec, struct sock *sk) +{ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_TCP); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, 6); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + &sk->sk_v6_daddr, 16); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &inet6_sk(sk)->saddr, 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, 16); +} +#endif + +void mlx5e_accel_fs_del_sk(struct mlx5_flow_handle *rule) +{ + mlx5_del_flow_rules(rule); +} + +struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_priv *priv, + struct sock *sk, u32 tirn, + uint32_t flow_tag) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5e_flow_table *ft = NULL; + struct mlx5e_accel_fs_tcp *fs_tcp; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *flow; + struct mlx5_flow_spec *spec; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + fs_tcp = priv->fs.accel_tcp; + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + switch (sk->sk_family) { + case AF_INET: + accel_fs_tcp_set_ipv4_flow(spec, sk); + ft = &fs_tcp->tables[ACCEL_FS_IPV4_TCP]; + mlx5e_dbg(HW, priv, "%s flow is %pI4:%d -> %pI4:%d\n", __func__, + &inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_sport, + &inet_sk(sk)->inet_daddr, + inet_sk(sk)->inet_dport); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + if (!sk->sk_ipv6only && + ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED) { + 
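/* A v4-mapped peer (::ffff:a.b.c.d) on an AF_INET6 socket carries + * IPv4 headers on the wire, so steer it through the IPv4 TCP table + * rather than the IPv6 one. + */ +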
accel_fs_tcp_set_ipv4_flow(spec, sk); + ft = &fs_tcp->tables[ACCEL_FS_IPV4_TCP]; + } else { + accel_fs_tcp_set_ipv6_flow(spec, sk); + ft = &fs_tcp->tables[ACCEL_FS_IPV6_TCP]; + } + break; +#endif + default: + break; + } + + if (!ft) { + flow = ERR_PTR(-EINVAL); + goto out; + } + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.tcp_dport); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.tcp_sport); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_dport, + ntohs(inet_sk(sk)->inet_sport)); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_sport, + ntohs(inet_sk(sk)->inet_dport)); + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = tirn; + if (flow_tag != MLX5_FS_DEFAULT_FLOW_TAG) { + spec->flow_context.flow_tag = flow_tag; + spec->flow_context.flags = FLOW_CONTEXT_HAS_TAG; + } + + flow = mlx5_add_flow_rules(ft->t, spec, &flow_act, &dest, 1); + + if (IS_ERR(flow)) + netdev_err(priv->netdev, "mlx5_add_flow_rules() failed, flow is %ld\n", + PTR_ERR(flow)); + +out: + kvfree(spec); + return flow; +} + +static int accel_fs_tcp_add_default_rule(struct mlx5e_priv *priv, + enum accel_fs_tcp_type type) +{ + struct mlx5e_flow_table *accel_fs_t; + struct mlx5_flow_destination dest; + struct mlx5e_accel_fs_tcp *fs_tcp; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + int err = 0; + + fs_tcp = priv->fs.accel_tcp; + accel_fs_t = &fs_tcp->tables[type]; + + dest = mlx5_ttc_get_default_dest(priv->fs.ttc, fs_accel2tt(type)); + rule = mlx5_add_flow_rules(accel_fs_t->t, NULL, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, + "%s: add default rule failed, accel_fs type=%d, err %d\n", + __func__, type, err); + return err; + } + + fs_tcp->default_rules[type] = rule; + return 0; +} + +#define MLX5E_ACCEL_FS_TCP_NUM_GROUPS (2) +#define MLX5E_ACCEL_FS_TCP_GROUP1_SIZE (BIT(16) - 1) +#define MLX5E_ACCEL_FS_TCP_GROUP2_SIZE (BIT(0)) +#define MLX5E_ACCEL_FS_TCP_TABLE_SIZE (MLX5E_ACCEL_FS_TCP_GROUP1_SIZE +\ + MLX5E_ACCEL_FS_TCP_GROUP2_SIZE) +static int accel_fs_tcp_create_groups(struct mlx5e_flow_table *ft, + enum accel_fs_tcp_type type) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *outer_headers_c; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_ACCEL_FS_TCP_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in || !ft->g) { + kfree(ft->g); + kvfree(in); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_version); + + switch (type) { + case ACCEL_FS_IPV4_TCP: + case ACCEL_FS_IPV6_TCP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport); + break; + default: + err = -EINVAL; + goto out; + } + + switch (type) { + case ACCEL_FS_IPV4_TCP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + break; + case ACCEL_FS_IPV6_TCP: + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, 16); + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + 
dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, 16); + break; + default: + err = -EINVAL; + goto out; + } + + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_ACCEL_FS_TCP_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Default Flow Group */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_ACCEL_FS_TCP_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; +out: + kvfree(in); + + return err; +} + +static int accel_fs_tcp_create_table(struct mlx5e_priv *priv, enum accel_fs_tcp_type type) +{ + struct mlx5e_flow_table *ft = &priv->fs.accel_tcp->tables[type]; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_ACCEL_FS_TCP_TABLE_SIZE; + ft_attr.level = MLX5E_ACCEL_FS_TCP_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + netdev_dbg(priv->netdev, "Created fs accel table id %u level %u\n", + ft->t->id, ft->t->level); + + err = accel_fs_tcp_create_groups(ft, type); + if (err) + goto err; + + err = accel_fs_tcp_add_default_rule(priv, type); + if (err) + goto err; + + return 0; +err: + mlx5e_destroy_flow_table(ft); + return err; +} + +static int accel_fs_tcp_disable(struct mlx5e_priv *priv) +{ + int err, i; + + for (i = 0; i < ACCEL_FS_TCP_NUM_TYPES; i++) { + /* Modify ttc rules destination to point back to the indir TIRs */ + err = mlx5_ttc_fwd_default_dest(priv->fs.ttc, fs_accel2tt(i)); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] default destination failed, err(%d)\n", + __func__, fs_accel2tt(i), err); + return err; + } + } + + return 0; +} + +static int accel_fs_tcp_enable(struct mlx5e_priv *priv) +{ + struct mlx5_flow_destination dest = {}; + int err, i; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + for (i = 0; i < ACCEL_FS_TCP_NUM_TYPES; i++) { + dest.ft = priv->fs.accel_tcp->tables[i].t; + + /* Modify ttc rules destination to point on the accel_fs FTs */ + err = mlx5_ttc_fwd_dest(priv->fs.ttc, fs_accel2tt(i), &dest); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] destination to accel failed, err(%d)\n", + __func__, fs_accel2tt(i), err); + return err; + } + } + return 0; +} + +static void accel_fs_tcp_destroy_table(struct mlx5e_priv *priv, int i) +{ + struct mlx5e_accel_fs_tcp *fs_tcp; + + fs_tcp = priv->fs.accel_tcp; + if (IS_ERR_OR_NULL(fs_tcp->tables[i].t)) + return; + + mlx5_del_flow_rules(fs_tcp->default_rules[i]); + mlx5e_destroy_flow_table(&fs_tcp->tables[i]); + fs_tcp->tables[i].t = NULL; +} + +void mlx5e_accel_fs_tcp_destroy(struct mlx5e_priv *priv) +{ + int i; + + if (!priv->fs.accel_tcp) + return; + + accel_fs_tcp_disable(priv); + + for (i = 0; i < ACCEL_FS_TCP_NUM_TYPES; i++) + accel_fs_tcp_destroy_table(priv, i); + + kfree(priv->fs.accel_tcp); + priv->fs.accel_tcp = NULL; +} + +int mlx5e_accel_fs_tcp_create(struct mlx5e_priv *priv) +{ + int i, err; + + if (!MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ft_field_support.outer_ip_version)) + return -EOPNOTSUPP; + + 
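/* Both TCP tables (IPv4 and IPv6), each MLX5E_ACCEL_FS_TCP_TABLE_SIZE + * entries (a 2^16 - 1 entry socket group plus one default entry), are + * created together with their default rules before accel_fs_tcp_enable() + * repoints the TTC rules at them, so traffic is never steered to a + * half-initialized table. + */ +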
priv->fs.accel_tcp = kzalloc(sizeof(*priv->fs.accel_tcp), GFP_KERNEL); + if (!priv->fs.accel_tcp) + return -ENOMEM; + + for (i = 0; i < ACCEL_FS_TCP_NUM_TYPES; i++) { + err = accel_fs_tcp_create_table(priv, i); + if (err) + goto err_destroy_tables; + } + + err = accel_fs_tcp_enable(priv); + if (err) + goto err_destroy_tables; + + return 0; + +err_destroy_tables: + while (--i >= 0) + accel_fs_tcp_destroy_table(priv, i); + + kfree(priv->fs.accel_tcp); + priv->fs.accel_tcp = NULL; + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h new file mode 100644 index 0000000..5892358 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5E_ACCEL_FS_TCP_H__ +#define __MLX5E_ACCEL_FS_TCP_H__ + +#include "en.h" + +#ifdef CONFIG_MLX5_EN_TLS +int mlx5e_accel_fs_tcp_create(struct mlx5e_priv *priv); +void mlx5e_accel_fs_tcp_destroy(struct mlx5e_priv *priv); +struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_priv *priv, + struct sock *sk, u32 tirn, + uint32_t flow_tag); +void mlx5e_accel_fs_del_sk(struct mlx5_flow_handle *rule); +#else +static inline int mlx5e_accel_fs_tcp_create(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_accel_fs_tcp_destroy(struct mlx5e_priv *priv) {} +static inline struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_priv *priv, + struct sock *sk, u32 tirn, + uint32_t flow_tag) +{ return ERR_PTR(-EOPNOTSUPP); } +static inline void mlx5e_accel_fs_del_sk(struct mlx5_flow_handle *rule) {} +#endif + +#endif /* __MLX5E_ACCEL_FS_TCP_H__ */ + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c new file mode 100644 index 0000000..f595791 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -0,0 +1,1034 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +#include "en.h" +#include "en_accel/ipsec.h" +#include "en_accel/ipsec_rxtx.h" +#include "en_accel/ipsec_fs.h" +#include "eswitch.h" +#include "esw/ipsec.h" +#include +#include +#include "en/ipsec_aso.h" +#include "../esw/ipsec.h" + +#ifndef XFRM_OFFLOAD_FULL +#define XFRM_OFFLOAD_FULL 4 +#endif + +struct mlx5e_ipsec_async_work { + struct delayed_work dwork; + struct mlx5e_priv *priv; + u32 obj_id; +}; + +static void _mlx5e_ipsec_async_event(struct work_struct *work); + +static struct mlx5e_ipsec_sa_entry *to_ipsec_sa_entry(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa; + + if (!x) + return NULL; + + sa = (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle; + if (!sa) + return NULL; + + WARN_ON(sa->x != x); + return sa; +} + +#define ipv6_equal(a, b) (memcmp(&(a), &(b), sizeof(a)) == 0) +struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup_state(struct mlx5e_ipsec *ipsec, + struct sk_buff *skb, u8 ip_ver) +{ + struct mlx5e_ipsec_sa_entry *sa_entry, *sa; + struct ipv6hdr *v6_hdr; + struct iphdr *v4_hdr; + unsigned int temp; + u16 family; + + sa = NULL; + if (ip_ver == 4) { + v4_hdr = (struct iphdr *)(skb->data + ETH_HLEN);; + family = AF_INET; + } else { + v6_hdr = (struct ipv6hdr *)(skb->data + ETH_HLEN); + family = AF_INET6; + } + + hash_for_each_rcu(ipsec->sadb_rx, temp, sa_entry, hlist) { + if (sa_entry->x->props.family != family) + continue; + + if (ip_ver == 4) { + if ((sa_entry->x->props.saddr.a4 == v4_hdr->saddr) && + (sa_entry->x->id.daddr.a4 == v4_hdr->daddr)) { + sa = sa_entry; + break; + } + } else { + if (ipv6_equal(sa_entry->x->id.daddr.a6, v6_hdr->daddr.in6_u.u6_addr32) && + ipv6_equal(sa_entry->x->props.saddr.a6, v6_hdr->saddr.in6_u.u6_addr32)) { + sa = sa_entry; + break; + } + } + } + + if (sa) { + xfrm_state_hold(sa->x); + return sa->x; + } + + return NULL; +} + +struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *ipsec, + unsigned int handle) +{ + struct mlx5e_ipsec_sa_entry *sa_entry; + struct xfrm_state *ret = NULL; + + rcu_read_lock(); + hash_for_each_possible_rcu(ipsec->sadb_rx, sa_entry, hlist, handle) + if (sa_entry->handle == handle) { + ret = sa_entry->x; + xfrm_state_hold(ret); + break; + } + rcu_read_unlock(); + + return ret; +} + +#define ipv6_equal(a, b) (memcmp(&(a), &(b), sizeof(a)) == 0) + +static inline bool mlx5e_ipsec_sa_fs_equal(struct mlx5e_ipsec_sa_entry *sa_entry, + struct ethtool_rx_flow_spec *fs) +{ + if ((fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) == ESP_V4_FLOW) + return ((sa_entry->x->props.family == AF_INET) && + (fs->h_u.esp_ip4_spec.ip4dst == sa_entry->x->id.daddr.a4) && + (fs->h_u.esp_ip4_spec.ip4src == sa_entry->x->props.saddr.a4) && + (fs->h_u.esp_ip4_spec.spi == sa_entry->x->id.spi)); + + return ((sa_entry->x->props.family == AF_INET6) && + ipv6_equal(fs->h_u.esp_ip6_spec.ip6dst, sa_entry->x->id.daddr.a6) && + ipv6_equal(fs->h_u.esp_ip6_spec.ip6src, sa_entry->x->props.saddr.a6) && + (fs->h_u.esp_ip6_spec.spi == sa_entry->x->id.spi)); +} + +int mlx5e_ipsec_sadb_rx_lookup_rev(struct mlx5e_ipsec *ipsec, + struct ethtool_rx_flow_spec *fs, u32 *handle) +{ + struct mlx5e_ipsec_sa_entry *sa_entry; + int ret = -ENOENT; + unsigned int temp; + + if (((fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) 
!= ESP_V4_FLOW) && + ((fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) != ESP_V6_FLOW)) + return -ENOENT; + + rcu_read_lock(); + hash_for_each_rcu(ipsec->sadb_rx, temp, sa_entry, hlist) + if (mlx5e_ipsec_sa_fs_equal(sa_entry, fs)) { + *handle = sa_entry->handle; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + + +static int mlx5e_ipsec_sadb_rx_add(struct mlx5e_ipsec_sa_entry *sa_entry, + unsigned int handle) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + struct mlx5e_ipsec_sa_entry *_sa_entry; + unsigned long flags; + + rcu_read_lock(); + hash_for_each_possible_rcu(ipsec->sadb_rx, _sa_entry, hlist, handle) + if (_sa_entry->handle == handle) { + rcu_read_unlock(); + return -EEXIST; + } + rcu_read_unlock(); + + spin_lock_irqsave(&ipsec->sadb_rx_lock, flags); + sa_entry->handle = handle; + hash_add_rcu(ipsec->sadb_rx, &sa_entry->hlist, sa_entry->handle); + spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags); + + return 0; +} + +static void mlx5e_ipsec_sadb_rx_del(struct mlx5e_ipsec_sa_entry *sa_entry) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + unsigned long flags; + + spin_lock_irqsave(&ipsec->sadb_rx_lock, flags); + hash_del_rcu(&sa_entry->hlist); + spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags); +} + +struct xfrm_state *mlx5e_ipsec_sadb_tx_lookup(struct mlx5e_ipsec *ipsec, + unsigned int handle) +{ + struct mlx5e_ipsec_sa_entry *sa_entry; + struct xfrm_state *ret = NULL; + + rcu_read_lock(); + hash_for_each_possible_rcu(ipsec->sadb_tx, sa_entry, hlist, handle) + if (sa_entry->handle == handle) { + ret = sa_entry->x; + xfrm_state_hold(ret); + break; + } + rcu_read_unlock(); + + return ret; +} + +static int mlx5e_ipsec_sadb_tx_add(struct mlx5e_ipsec_sa_entry *sa_entry, + unsigned int handle) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + struct mlx5e_ipsec_sa_entry *_sa_entry; + unsigned long flags; + + rcu_read_lock(); + hash_for_each_possible_rcu(ipsec->sadb_tx, _sa_entry, hlist, handle) + if (_sa_entry->handle == handle) { + rcu_read_unlock(); + return -EEXIST; + } + rcu_read_unlock(); + + spin_lock_irqsave(&ipsec->sadb_tx_lock, flags); + sa_entry->handle = handle; + hash_add_rcu(ipsec->sadb_tx, &sa_entry->hlist, sa_entry->handle); + spin_unlock_irqrestore(&ipsec->sadb_tx_lock, flags); + + return 0; +} + +static void mlx5e_ipsec_sadb_tx_del(struct mlx5e_ipsec_sa_entry *sa_entry) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + unsigned long flags; + + spin_lock_irqsave(&ipsec->sadb_tx_lock, flags); + hash_del_rcu(&sa_entry->hlist); + spin_unlock_irqrestore(&ipsec->sadb_tx_lock, flags); +} + +static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry) +{ + struct xfrm_replay_state_esn *replay_esn; + u32 seq_bottom = 0; + u8 overlap; + u32 *esn; + + if (!(sa_entry->x->props.flags & XFRM_STATE_ESN)) { + sa_entry->esn_state.trigger = 0; + return false; + } + + replay_esn = sa_entry->x->replay_esn; + if (replay_esn->seq >= replay_esn->replay_window) + seq_bottom = replay_esn->seq - replay_esn->replay_window + 1; + + overlap = sa_entry->esn_state.overlap; + + sa_entry->esn_state.esn = xfrm_replay_seqhi(sa_entry->x, + htonl(seq_bottom)); + esn = &sa_entry->esn_state.esn; + + sa_entry->esn_state.trigger = 1; + if (unlikely(overlap && seq_bottom < MLX5E_IPSEC_ESN_SCOPE_MID)) { + ++(*esn); + sa_entry->esn_state.overlap = 0; + return true; + } else if (unlikely(!overlap && + (seq_bottom >= MLX5E_IPSEC_ESN_SCOPE_MID))) { + sa_entry->esn_state.overlap = 1; + return true; + } + + return false; +} + +static void +initialize_lifetime_limit(struct 
mlx5e_ipsec_sa_entry *sa_entry, + struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + struct mlx5e_ipsec_state_lft *lft = &sa_entry->lft; + struct xfrm_state *x = sa_entry->x; + u64 soft_limit, hard_limit; + struct net_device *netdev; + struct mlx5e_priv *priv; + + netdev = x->xso.dev; + priv = netdev_priv(netdev); + + if (MLX5_CAP_GEN(priv->mdev, fpga)) + return; + + hard_limit = x->lft.hard_packet_limit; + soft_limit = (x->lft.soft_packet_limit == IPSEC_NO_LIMIT) + ? 0 : x->lft.soft_packet_limit; + if (!(x->xso.flags & XFRM_OFFLOAD_FULL) || + (hard_limit <= soft_limit) || + (hard_limit == IPSEC_NO_LIMIT)) { + attrs->soft_packet_limit = IPSEC_NO_LIMIT; + attrs->hard_packet_limit = IPSEC_NO_LIMIT; + + if ((hard_limit <= soft_limit) && hard_limit) + netdev_warn(priv->netdev, + "hard limit=%lld must be bigger than soft limit=%lld\n", + hard_limit, soft_limit); + return; + } + + /* We have three possible scenarios: + * 1: soft and hard less than 32 bit + * 2: soft less than 32 bit, hard greater than 32 bit + * 3: soft and hard greater than 32 bit + */ + if (hard_limit < IPSEC_HW_LIMIT) { + /* Case 1: we have one round of hard and one round of soft */ + lft->round_hard = 1; + lft->round_soft = soft_limit ? 1 : 0; + lft->is_simulated = false; + + /* Example: if the xfrm user sets a soft limit of 2 and a hard limit of 9, + * the soft event must fire after 2 packets and the hard event after + * 9 packets. For the hard limit the counter is set to 9; for the soft + * limit the comparator has to be set to 7 so that the soft event fires + * after 2 packets. + */ + attrs->soft_packet_limit = soft_limit ? hard_limit - soft_limit : 0; + attrs->hard_packet_limit = hard_limit; + return; + } + + /* Case 2 and 3: + * Each interrupt (round) counts 2^31 packets. How it works: + * the soft limit (comparator) is set to 2^31. At a soft event, the counter + * is < 2^31 and the counter's bit(31) is set for another round of counting. + * If the hard limit is not divisible by 2^31, the first round counts the + * hard limit modulo 2^31. + */ + lft->is_simulated = true; + + /* To distinguish between no soft limit and a soft limit, + * soft is notified when round_soft == 1; therefore add 1 to the division + * result. + */ + lft->round_soft = (soft_limit) ?
(soft_limit >> IPSEC_SW_LIMIT_BIT) + 1 : 0; + lft->round_hard = hard_limit >> IPSEC_SW_LIMIT_BIT; + + attrs->hard_packet_limit = IPSEC_SW_LIMIT + (hard_limit & IPSEC_SW_MASK); + attrs->soft_packet_limit = IPSEC_SW_LIMIT; +} + +static void +mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, + struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + struct xfrm_state *x = sa_entry->x; + struct aes_gcm_keymat *aes_gcm = &attrs->keymat.aes_gcm; + struct aead_geniv_ctx *geniv_ctx; + struct crypto_aead *aead; + unsigned int crypto_data_len, key_len; + int ivsize; + + memset(attrs, 0, sizeof(*attrs)); + + /* key */ + crypto_data_len = (x->aead->alg_key_len + 7) / 8; + key_len = crypto_data_len - 4; /* 4 bytes salt at end */ + + memcpy(aes_gcm->aes_key, x->aead->alg_key, key_len); + aes_gcm->key_len = key_len * 8; + + /* salt and seq_iv */ + aead = x->data; + geniv_ctx = crypto_aead_ctx(aead); + ivsize = crypto_aead_ivsize(aead); + memcpy(&aes_gcm->seq_iv, &geniv_ctx->salt, ivsize); + memcpy(&aes_gcm->salt, x->aead->alg_key + key_len, + sizeof(aes_gcm->salt)); + + /* iv len */ + aes_gcm->icv_len = x->aead->alg_icv_len; + + /* esn */ + if (sa_entry->esn_state.trigger) { + attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED; + attrs->esn = sa_entry->esn_state.esn; + if (sa_entry->esn_state.overlap) + attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP; + attrs->replay_window = x->replay_esn->replay_window; + } + + /* rx handle */ + attrs->sa_handle = sa_entry->handle; + + /* algo type */ + attrs->keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM; + + /* action */ + attrs->action = (!(x->xso.flags & XFRM_OFFLOAD_INBOUND)) ? + MLX5_ACCEL_ESP_ACTION_ENCRYPT : + MLX5_ACCEL_ESP_ACTION_DECRYPT; + /* flags */ + attrs->flags |= (x->props.mode == XFRM_MODE_TRANSPORT) ? 
+ MLX5_ACCEL_ESP_FLAGS_TRANSPORT : + MLX5_ACCEL_ESP_FLAGS_TUNNEL; + +/* Valid till stack changes accepted */ +#define XFRM_OFFLOAD_FULL 4 + if (x->xso.flags & XFRM_OFFLOAD_FULL) + attrs->flags |= MLX5_ACCEL_ESP_FLAGS_FULL_OFFLOAD; + + /* spi */ + attrs->spi = x->id.spi; + + /* source , destination ips and udp dport */ + memcpy(&attrs->saddr, x->props.saddr.a6, sizeof(attrs->saddr)); + memcpy(&attrs->daddr, x->id.daddr.a6, sizeof(attrs->daddr)); + attrs->upspec.dport = ntohs(x->sel.dport); + attrs->upspec.dport_mask = ntohs(x->sel.dport_mask); + attrs->upspec.proto = x->sel.proto; + attrs->is_ipv6 = (x->props.family != AF_INET); + + /* authentication tag length */ + attrs->aulen = crypto_aead_authsize(aead); + + /* lifetime limit for full offload */ + initialize_lifetime_limit(sa_entry, attrs); +} + +static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x) +{ + struct net_device *netdev = x->xso.real_dev; + struct mlx5_core_dev *mdev; + struct mlx5_eswitch *esw; + struct mlx5e_priv *priv; + + priv = netdev_priv(netdev); + mdev = priv->mdev; + + if (x->props.aalgo != SADB_AALG_NONE) { + netdev_info(netdev, "Cannot offload authenticated xfrm states\n"); + return -EINVAL; + } + if (x->props.ealgo != SADB_X_EALG_AES_GCM_ICV16) { + netdev_info(netdev, "Only AES-GCM-ICV16 xfrm state may be offloaded\n"); + return -EINVAL; + } + if (x->props.calgo != SADB_X_CALG_NONE) { + netdev_info(netdev, "Cannot offload compressed xfrm states\n"); + return -EINVAL; + } + if (x->props.flags & XFRM_STATE_ESN && + !(mlx5_accel_ipsec_device_caps(mdev) & + MLX5_ACCEL_IPSEC_CAP_ESN)) { + netdev_info(netdev, "Cannot offload ESN xfrm states\n"); + return -EINVAL; + } + if (x->props.family != AF_INET && + x->props.family != AF_INET6) { + netdev_info(netdev, "Only IPv4/6 xfrm states may be offloaded\n"); + return -EINVAL; + } + if (x->props.mode != XFRM_MODE_TRANSPORT && + x->props.mode != XFRM_MODE_TUNNEL) { + dev_info(&netdev->dev, "Only transport and tunnel xfrm states may be offloaded\n"); + return -EINVAL; + } + if (x->id.proto != IPPROTO_ESP) { + netdev_info(netdev, "Only ESP xfrm state may be offloaded\n"); + return -EINVAL; + } + if (x->encap) { + netdev_info(netdev, "Encapsulated xfrm state may not be offloaded\n"); + return -EINVAL; + } + if (!x->aead) { + netdev_info(netdev, "Cannot offload xfrm states without aead\n"); + return -EINVAL; + } + if (x->aead->alg_icv_len != 128) { + netdev_info(netdev, "Cannot offload xfrm states with AEAD ICV length other than 128bit\n"); + return -EINVAL; + } + if ((x->aead->alg_key_len != 128 + 32) && + (x->aead->alg_key_len != 256 + 32)) { + netdev_info(netdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); + return -EINVAL; + } + if (x->tfcpad) { + netdev_info(netdev, "Cannot offload xfrm states with tfc padding\n"); + return -EINVAL; + } + if (!x->geniv) { + netdev_info(netdev, "Cannot offload xfrm states without geniv\n"); + return -EINVAL; + } + if (strcmp(x->geniv, "seqiv")) { + netdev_info(netdev, "Cannot offload xfrm states with geniv other than seqiv\n"); + return -EINVAL; + } + if (x->props.family == AF_INET6 && + !(mlx5_accel_ipsec_device_caps(mdev) & + MLX5_ACCEL_IPSEC_CAP_IPV6)) { + netdev_info(netdev, "IPv6 xfrm state offload is not supported by this device\n"); + return -EINVAL; + } + if (x->xso.flags & XFRM_OFFLOAD_FULL) { + if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_FULL_OFFLOAD)) { + netdev_info(netdev, "IPsec full offload is not supported by this device.\n"); + return -EINVAL; + } + esw = 
mdev->priv.eswitch; + if (!esw || esw->mode != MLX5_ESWITCH_OFFLOADS) { + netdev_info(netdev, "IPsec full offload allowed only in switchdev mode.\n"); + return -EINVAL; + } + if (esw->offloads.ipsec != DEVLINK_ESWITCH_IPSEC_MODE_FULL) { + netdev_info(netdev, + "IPsec full offload allowed only in when devlink full ipsec mode is set.\n"); + return -EINVAL; + } + } else { + esw = mdev->priv.eswitch; + if (esw && esw->offloads.ipsec == DEVLINK_ESWITCH_IPSEC_MODE_FULL) { + netdev_info(netdev, + "IPsec crypto only offload is not allowed when devlink ipsec mode is full.\n"); + return -EINVAL; + } + } + + if ((x->xso.flags & XFRM_OFFLOAD_FULL) && + ((x->lft.hard_byte_limit != XFRM_INF) || + (x->lft.soft_byte_limit != XFRM_INF))) { + netdev_info(netdev, "full offload state does not support:\n\ + x->lft.hard_byte_limit=0x%llx,\n\ + x->lft.soft_byte_limit=0x%llx,\n", + x->lft.hard_byte_limit, + x->lft.soft_byte_limit); + return -EINVAL; + } + + return 0; +} + +static int mlx5e_xfrm_fs_add_rule(struct mlx5e_priv *priv, + struct mlx5e_ipsec_sa_entry *sa_entry) +{ + if (!mlx5_is_ipsec_device(priv->mdev)) + return 0; + + return mlx5e_accel_ipsec_fs_add_rule(priv, &sa_entry->xfrm->attrs, + sa_entry->ipsec_obj_id, + &sa_entry->ipsec_rule); +} + +static void mlx5e_xfrm_fs_del_rule(struct mlx5e_priv *priv, + struct mlx5e_ipsec_sa_entry *sa_entry) +{ + if (!mlx5_is_ipsec_device(priv->mdev)) + return; + + mlx5e_accel_ipsec_fs_del_rule(priv, &sa_entry->xfrm->attrs, + &sa_entry->ipsec_rule); +} + +static int mlx5e_xfrm_add_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = NULL; + struct net_device *netdev = x->xso.real_dev; + struct mlx5_accel_esp_xfrm_attrs attrs; + struct mlx5e_priv *priv; + unsigned int sa_handle; + int pdn; + int err; + + priv = netdev_priv(netdev); + + err = mlx5e_xfrm_validate_state(x); + if (err) + return err; + + sa_entry = kzalloc(sizeof(*sa_entry), GFP_KERNEL); + if (!sa_entry) { + err = -ENOMEM; + goto out; + } + + sa_entry->x = x; + sa_entry->ipsec = priv->ipsec; + + /* check esn */ + mlx5e_ipsec_update_esn_state(sa_entry); + + /* create xfrm */ + mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs); + sa_entry->xfrm = + mlx5_accel_esp_create_xfrm(priv->mdev, &attrs, + MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA); + if (IS_ERR(sa_entry->xfrm)) { + err = PTR_ERR(sa_entry->xfrm); + goto err_sa_entry; + } + + /* create hw context */ + pdn = priv->ipsec->aso ? priv->ipsec->aso->pdn : 0; + sa_entry->hw_context = + mlx5_accel_esp_create_hw_context(priv->mdev, + sa_entry->xfrm, + pdn, + &sa_handle); + if (IS_ERR(sa_entry->hw_context)) { + err = PTR_ERR(sa_entry->hw_context); + goto err_xfrm; + } + + sa_entry->ipsec_obj_id = sa_handle; + err = mlx5e_xfrm_fs_add_rule(priv, sa_entry); + if (err) + goto err_hw_ctx; + + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) { + err = mlx5e_ipsec_sadb_rx_add(sa_entry, sa_handle); + if (err) + goto err_add_rule; + } else { + err = mlx5e_ipsec_sadb_tx_add(sa_entry, sa_handle); + if (err) + goto err_add_rule; + sa_entry->set_iv_op = (x->props.flags & XFRM_STATE_ESN) ? 
+ mlx5e_ipsec_set_iv_esn : mlx5e_ipsec_set_iv; + } + + x->xso.offload_handle = (unsigned long)sa_entry; + goto out; + +err_add_rule: + mlx5e_xfrm_fs_del_rule(priv, sa_entry); +err_hw_ctx: + mlx5_accel_esp_free_hw_context(priv->mdev, sa_entry->hw_context); +err_xfrm: + mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm); +err_sa_entry: + kfree(sa_entry); + +out: + return err; +} + +static void mlx5e_xfrm_del_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); + + if (!sa_entry || sa_entry->is_removed) + return; + + if (x->xso.flags & XFRM_OFFLOAD_INBOUND) + mlx5e_ipsec_sadb_rx_del(sa_entry); + else + mlx5e_ipsec_sadb_tx_del(sa_entry); +} + +static void clean_up_steering(struct mlx5e_ipsec_sa_entry *sa_entry, struct mlx5e_priv *priv) +{ + if (!sa_entry->hw_context) + return; + + flush_workqueue(sa_entry->ipsec->wq); + mlx5e_xfrm_fs_del_rule(priv, sa_entry); + mlx5_accel_esp_free_hw_context(sa_entry->xfrm->mdev, sa_entry->hw_context); + mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm); +} + +static void mlx5e_xfrm_free_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); + struct mlx5e_priv *priv = netdev_priv(x->xso.dev); + + if (!sa_entry) + return; + + if (!sa_entry->is_removed) + clean_up_steering(sa_entry, priv); + + kfree(sa_entry); +} + +void mlx5e_ipsec_ul_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec_sa_entry *sa_entry; + struct mlx5e_ipsec *ipsec = priv->ipsec; + unsigned int bucket; + + if (!ipsec) + return; + + /* Take rtnl lock to block XFRM Netlink command. + * Cannot take rcu. Therefore, cannot handle race situation + * with internal net/xfrm call back. + */ + rtnl_lock(); + hash_for_each_rcu(ipsec->sadb_rx, bucket, sa_entry, hlist) { + sa_entry->is_removed = true; + mlx5e_ipsec_sadb_rx_del(sa_entry); + clean_up_steering(sa_entry, priv); + } + + hash_for_each_rcu(ipsec->sadb_tx, bucket, sa_entry, hlist) { + sa_entry->is_removed = true; + mlx5e_ipsec_sadb_tx_del(sa_entry); + clean_up_steering(sa_entry, priv); + } + rtnl_unlock(); +} + +int mlx5e_ipsec_init(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec *ipsec = NULL; + + if (!MLX5_IPSEC_DEV(priv->mdev)) { + netdev_dbg(priv->netdev, "Not an IPSec offload device\n"); + return 0; + } + + ipsec = kzalloc(sizeof(*ipsec), GFP_KERNEL); + if (!ipsec) + return -ENOMEM; + + hash_init(ipsec->sadb_rx); + spin_lock_init(&ipsec->sadb_rx_lock); + ida_init(&ipsec->halloc); + hash_init(ipsec->sadb_tx); + spin_lock_init(&ipsec->sadb_tx_lock); + ipsec->en_priv = priv; + ipsec->no_trailer = !!(mlx5_accel_ipsec_device_caps(priv->mdev) & + MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER); + ipsec->wq = alloc_ordered_workqueue("mlx5e_ipsec: %s", 0, + priv->netdev->name); + if (!ipsec->wq) { + kfree(ipsec); + return -ENOMEM; + } + + priv->ipsec = ipsec; + if (mlx5_is_ipsec_full_offload(priv)) + priv->ipsec->aso = mlx5e_aso_setup(priv, MLX5_ST_SZ_BYTES(ipsec_aso)); + else + mlx5e_accel_ipsec_fs_init(priv); + netdev_dbg(priv->netdev, "IPSec attached to netdevice\n"); + return 0; +} + +void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec *ipsec = priv->ipsec; + + if (!ipsec) + return; + + if (priv->ipsec->aso) { + mlx5e_aso_cleanup(priv, priv->ipsec->aso); + priv->ipsec->aso = NULL; + } + + mlx5e_accel_ipsec_fs_cleanup(priv); + destroy_workqueue(ipsec->wq); + ida_destroy(&ipsec->halloc); + kfree(ipsec); + priv->ipsec = NULL; +} + +static bool mlx5e_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + if (x->props.family == AF_INET) { + /* Offload with 
IPv4 options is not supported yet */ + if (ip_hdr(skb)->ihl > 5) + return false; + } else { + /* Offload with IPv6 extension headers is not support yet */ + if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) + return false; + } + + return true; +} + +struct mlx5e_ipsec_modify_state_work { + struct work_struct work; + struct mlx5_accel_esp_xfrm_attrs attrs; + struct mlx5e_ipsec_sa_entry *sa_entry; +}; + +static void _update_xfrm_state(struct work_struct *work) +{ + int ret; + struct mlx5e_ipsec_modify_state_work *modify_work = + container_of(work, struct mlx5e_ipsec_modify_state_work, work); + struct mlx5e_ipsec_sa_entry *sa_entry = modify_work->sa_entry; + + ret = mlx5_accel_esp_modify_xfrm(sa_entry->xfrm, + &modify_work->attrs); + if (ret) + netdev_warn(sa_entry->ipsec->en_priv->netdev, + "Not an IPSec offload device\n"); + + kfree(modify_work); +} + +static void mlx5e_xfrm_advance_esn_state(struct xfrm_state *x) +{ + struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); + struct mlx5e_ipsec_modify_state_work *modify_work; + bool need_update; + + if (!sa_entry) + return; + + need_update = mlx5e_ipsec_update_esn_state(sa_entry); + if (!need_update) + return; + + modify_work = kzalloc(sizeof(*modify_work), GFP_ATOMIC); + if (!modify_work) + return; + + mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &modify_work->attrs); + modify_work->sa_entry = sa_entry; + + INIT_WORK(&modify_work->work, _update_xfrm_state); + WARN_ON(!queue_work(sa_entry->ipsec->wq, &modify_work->work)); +} + +static const struct xfrmdev_ops mlx5e_ipsec_xfrmdev_ops = { + .xdo_dev_state_add = mlx5e_xfrm_add_state, + .xdo_dev_state_delete = mlx5e_xfrm_del_state, + .xdo_dev_state_free = mlx5e_xfrm_free_state, + .xdo_dev_offload_ok = mlx5e_ipsec_offload_ok, + .xdo_dev_state_advance_esn = mlx5e_xfrm_advance_esn_state, +}; + +void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct net_device *netdev = priv->netdev; + + if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_ESP) || + !MLX5_CAP_ETH(mdev, swp)) { + mlx5_core_dbg(mdev, "mlx5e: ESP and SWP offload not supported\n"); + return; + } + + mlx5_core_info(mdev, "mlx5e: IPSec ESP acceleration enabled\n"); + netdev->xfrmdev_ops = &mlx5e_ipsec_xfrmdev_ops; + netdev->features |= NETIF_F_HW_ESP; + netdev->hw_enc_features |= NETIF_F_HW_ESP; + + if (!MLX5_CAP_ETH(mdev, swp_csum)) { + mlx5_core_dbg(mdev, "mlx5e: SWP checksum not supported\n"); + return; + } + + netdev->features |= NETIF_F_HW_ESP_TX_CSUM; + netdev->hw_enc_features |= NETIF_F_HW_ESP_TX_CSUM; + + if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_LSO) || + !MLX5_CAP_ETH(mdev, swp_lso)) { + mlx5_core_dbg(mdev, "mlx5e: ESP LSO not supported\n"); + return; + } + + if (mlx5_is_ipsec_device(mdev)) + netdev->gso_partial_features |= NETIF_F_GSO_ESP; + + mlx5_core_dbg(mdev, "mlx5e: ESP GSO capability turned on\n"); + netdev->features |= NETIF_F_GSO_ESP; + netdev->hw_features |= NETIF_F_GSO_ESP; + netdev->hw_enc_features |= NETIF_F_GSO_ESP; +} + +static void update_esn_full_offload(struct mlx5e_priv *priv, + struct mlx5e_ipsec_sa_entry *sa_entry, + u32 obj_id, u32 mode_param) +{ + struct mlx5_accel_esp_xfrm_attrs attrs = {}; + + if (mode_param < MLX5E_IPSEC_ESN_SCOPE_MID) { + sa_entry->esn_state.esn++; + sa_entry->esn_state.overlap = 0; + } else { + sa_entry->esn_state.overlap = 1; + } + + mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs); + mlx5_accel_esp_modify_xfrm(sa_entry->xfrm, &attrs); + mlx5e_ipsec_aso_set(priv, obj_id, ARM_ESN_EVENT, 0, NULL, 
NULL, NULL, NULL); +} + +static void _mlx5e_ipsec_async_event(struct work_struct *work) +{ + struct mlx5e_ipsec_async_work *async_work; + struct mlx5e_ipsec_sa_entry *sa_entry; + struct mlx5e_ipsec_state_lft *lft; + u32 hard_cnt, soft_cnt, old_cnt; + struct delayed_work *dwork; + struct mlx5e_priv *priv; + struct xfrm_state *xs; + u32 mode_param; + u8 event_arm; + u32 obj_id; + int err; + + /* Look up xfrm_state from obj_id */ + dwork = to_delayed_work(work); + async_work = container_of(dwork, struct mlx5e_ipsec_async_work, dwork); + priv = async_work->priv; + obj_id = async_work->obj_id; + + xs = mlx5e_ipsec_sadb_tx_lookup(priv->ipsec, obj_id); + if (!xs) { + xs = mlx5e_ipsec_sadb_rx_lookup(priv->ipsec, obj_id); + if (!xs) + goto out_async_work; + } + + sa_entry = to_ipsec_sa_entry(xs); + if (!sa_entry) + goto out_xs_state; + + lft = &sa_entry->lft; + + /* Query IPsec ASO context */ + if (mlx5e_ipsec_aso_query(priv, obj_id, &hard_cnt, &soft_cnt, &event_arm, &mode_param)) + goto out_xs_state; + + /* Check ESN event */ + if (sa_entry->esn_state.trigger && !(event_arm & MLX5_ASO_ESN_ARM)) + update_esn_full_offload(priv, sa_entry, obj_id, mode_param); + + /* Check life time event */ + if (hard_cnt > soft_cnt || + (!hard_cnt && !(event_arm & MLX5_ASO_REMOVE_FLOW_ENABLE))) + goto out_xs_state; + + /* Life time event */ + if (!hard_cnt) /* Notify hard lifetime to xfrm stack */ + goto out_xs_state; + + /* 0: no more soft + * 1: notify soft + */ + if (lft->round_soft) { + lft->round_soft--; + } + + if (!lft->is_simulated) /* hard_limit < IPSEC_HW_LIMIT */ + goto out_xs_state; + + /* Simulated case */ + if (hard_cnt < IPSEC_SW_LIMIT) { + lft->round_hard--; + if (!lft->round_hard) /* already in last round, no need to set bit(31) */ + goto out_xs_state; + } + + /* Update ASO context */ + old_cnt = hard_cnt; + + if (soft_cnt != IPSEC_SW_LIMIT) + err = mlx5e_ipsec_aso_set(priv, obj_id, + SET_SOFT | ARM_SOFT | SET_CNT_BIT31, + IPSEC_SW_LIMIT, &hard_cnt, &soft_cnt, NULL, NULL); + else + err = mlx5e_ipsec_aso_set(priv, obj_id, + ARM_SOFT | SET_CNT_BIT31, + 0, &hard_cnt, &soft_cnt, NULL, NULL); + + /* when soft_cnt == IPSEC_SW_LIMIT, soft event can happen + * case 1: hard_cnt goes down from IPSEC_SW_LIMIT to IPSEC_SW_LIMIT - 1. In this case, + * we need one extra round of soft event. + * case 2: hard_count goes down from (IPSEC_SW_LIMIT + a) to IPSEC_SW_LIMIT + */ + if (old_cnt == IPSEC_SW_LIMIT) { + if (hard_cnt > old_cnt) + lft->round_hard--; + else if (lft->round_soft) + lft->round_soft++; + } + +out_xs_state: + xfrm_state_put(xs); + +out_async_work: + kfree(async_work); +} + +int mlx5e_ipsec_async_event(struct mlx5e_priv *priv, u32 obj_id) +{ + struct mlx5e_ipsec_async_work *async_work; + + async_work = kzalloc(sizeof(*async_work), GFP_ATOMIC); + if (!async_work) + return NOTIFY_DONE; + + async_work->priv = priv; + async_work->obj_id = obj_id; + + INIT_DELAYED_WORK(&async_work->dwork, _mlx5e_ipsec_async_event); + + WARN_ON(!queue_delayed_work(priv->ipsec->wq, &async_work->dwork, 0)); + + return NOTIFY_OK; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h new file mode 100644 index 0000000..8073e4c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5E_IPSEC_H__ +#define __MLX5E_IPSEC_H__ + +#ifdef CONFIG_MLX5_EN_IPSEC + +#include +#include +#include + +#include "accel/ipsec.h" +#include "en/aso.h" + +#define MLX5E_IPSEC_SADB_RX_BITS 10 +#define MLX5E_IPSEC_SADB_TX_BITS 10 +#define MLX5E_IPSEC_ESN_SCOPE_MID 0x80000000L + +struct mlx5e_priv; + +struct mlx5e_ipsec_sw_stats { + atomic64_t ipsec_rx_drop_sp_alloc; + atomic64_t ipsec_rx_drop_sadb_miss; + atomic64_t ipsec_rx_drop_syndrome; + atomic64_t ipsec_tx_drop_bundle; + atomic64_t ipsec_tx_drop_no_state; + atomic64_t ipsec_tx_drop_not_ip; + atomic64_t ipsec_tx_drop_trailer; + atomic64_t ipsec_tx_drop_metadata; +}; + +struct mlx5e_ipsec_stats { + u64 ipsec_dec_in_packets; + u64 ipsec_dec_out_packets; + u64 ipsec_dec_bypass_packets; + u64 ipsec_enc_in_packets; + u64 ipsec_enc_out_packets; + u64 ipsec_enc_bypass_packets; + u64 ipsec_dec_drop_packets; + u64 ipsec_dec_auth_fail_packets; + u64 ipsec_enc_drop_packets; + u64 ipsec_add_sa_success; + u64 ipsec_add_sa_fail; + u64 ipsec_del_sa_success; + u64 ipsec_del_sa_fail; + u64 ipsec_cmd_drop; + + u64 ipsec_full_rx_pkts; + u64 ipsec_full_rx_bytes; + u64 ipsec_full_rx_pkts_drop; + u64 ipsec_full_rx_bytes_drop; + u64 ipsec_full_tx_pkts; + u64 ipsec_full_tx_bytes; + u64 ipsec_full_tx_pkts_drop; + u64 ipsec_full_tx_bytes_drop; +}; + +struct mlx5e_accel_fs_esp; +struct mlx5e_ipsec_tx; + +struct mlx5e_ipsec { + struct mlx5e_priv *en_priv; + DECLARE_HASHTABLE(sadb_rx, MLX5E_IPSEC_SADB_RX_BITS); + DECLARE_HASHTABLE(sadb_tx, MLX5E_IPSEC_SADB_TX_BITS); + bool no_trailer; + spinlock_t sadb_rx_lock; /* Protects sadb_rx and halloc */ + spinlock_t sadb_tx_lock; /* Protects sadb_tx and halloc */ + struct ida halloc; + struct mlx5e_ipsec_sw_stats sw_stats; + struct mlx5e_ipsec_stats stats; + struct workqueue_struct *wq; + struct mlx5e_accel_fs_esp *rx_fs; + struct mlx5e_ipsec_tx *tx_fs; + struct mlx5e_aso *aso; +}; + +struct mlx5e_ipsec_esn_state { + u32 esn; + u8 trigger: 1; + u8 overlap: 1; +}; + +struct mlx5e_ipsec_rule { + struct mlx5_flow_handle *rule; + struct mlx5_modify_hdr *set_modify_hdr; + struct mlx5_pkt_reformat *pkt_reformat; +}; + +#define IPSEC_NO_LIMIT GENMASK_ULL(63, 0) +#define IPSEC_SW_LIMIT_BIT 31 +#define IPSEC_HW_LIMIT BIT(IPSEC_SW_LIMIT_BIT + 1) +#define IPSEC_SW_LIMIT 
BIT(IPSEC_SW_LIMIT_BIT) +#define IPSEC_SW_MASK GENMASK(IPSEC_SW_LIMIT_BIT - 1, 0) +#define IPSEC_SAFE (IPSEC_SW_LIMIT / 16) + +struct mlx5e_ipsec_state_lft { + u64 round_soft; /* Number of interrupt before send soft event */ + u64 round_hard; /* Number of interrupt before send hard event */ + bool is_simulated; +}; + +struct mlx5e_ipsec_sa_entry { + struct hlist_node hlist; /* Item in SADB_RX hashtable */ + struct mlx5e_ipsec_esn_state esn_state; + unsigned int handle; /* Handle in SADB_RX */ + struct xfrm_state *x; + struct mlx5e_ipsec *ipsec; + struct mlx5_accel_esp_xfrm *xfrm; + void *hw_context; + void (*set_iv_op)(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo); + u32 ipsec_obj_id; + struct mlx5e_ipsec_rule ipsec_rule; + struct mlx5e_ipsec_state_lft lft; + bool is_removed; +}; + +void mlx5e_ipsec_build_inverse_table(void); +int mlx5e_ipsec_init(struct mlx5e_priv *priv); +void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv); +void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv); + +struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *dev, + unsigned int handle); +int mlx5e_ipsec_sadb_rx_lookup_rev(struct mlx5e_ipsec *ipsec, + struct ethtool_rx_flow_spec *fs, u32 *handle); +struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup_state(struct mlx5e_ipsec *ipsec, + struct sk_buff *skb, u8 ip_ver); +int mlx5e_ipsec_async_event(struct mlx5e_priv *priv, u32 obj_id); +void mlx5e_ipsec_ul_cleanup(struct mlx5e_priv *priv); +#else + +static inline void mlx5e_ipsec_build_inverse_table(void) +{ +} + +static inline int mlx5e_ipsec_init(struct mlx5e_priv *priv) +{ + return 0; +} + +static inline void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv) +{ +} + +static inline void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv) +{ +} + +static inline int mlx5e_ipsec_async_event(struct mlx5e_priv *priv, u32 obj_id) +{ + return NOTIFY_DONE; +} + +static inline void mlx5e_ipsec_ul_cleanup(struct mlx5e_priv *priv) {} +#endif + +#endif /* __MLX5E_IPSEC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c new file mode 100644 index 0000000..711c0a4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -0,0 +1,939 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
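+ *
+ * Flow-steering setup for IPsec offload: builds the RX/TX flow tables and
+ * rules used by both the crypto ("inline") datapath and the eswitch
+ * full-offload path.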
*/ + +#include +#include "accel/ipsec_offload.h" +#include "../esw/ipsec.h" +#include "ipsec_fs.h" +#include "fs_core.h" + +#define NUM_IPSEC_FTE BIT(15) + +enum accel_fs_esp_type { + ACCEL_FS_ESP4, + ACCEL_FS_ESP6, + ACCEL_FS_ESP_NUM_TYPES, +}; + +struct mlx5e_ipsec_rx_err { + struct mlx5_flow_table *ft; + struct mlx5_flow_handle *rule; + struct mlx5_modify_hdr *copy_modify_hdr; +}; + +struct mlx5e_accel_fs_esp_prot { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *miss_group; + struct mlx5_flow_handle *miss_rule; + struct mlx5_flow_destination default_dest; + struct mlx5e_ipsec_rx_err rx_err; + u32 refcnt; + struct mutex prot_mutex; /* protect ESP4/ESP6 protocol */ +}; + +struct mlx5e_accel_fs_esp { + struct mlx5e_accel_fs_esp_prot fs_prot[ACCEL_FS_ESP_NUM_TYPES]; +}; + +struct mlx5e_ipsec_tx { + struct mlx5_flow_table *ft; + struct mutex mutex; /* Protect IPsec TX steering */ + u32 refcnt; +}; + +/* IPsec RX flow steering */ +static enum mlx5_traffic_types fs_esp2tt(enum accel_fs_esp_type i) +{ + if (i == ACCEL_FS_ESP4) + return MLX5_TT_IPV4_IPSEC_ESP; + return MLX5_TT_IPV6_IPSEC_ESP; +} + +static int rx_err_add_rule(struct mlx5e_priv *priv, + struct mlx5e_accel_fs_esp_prot *fs_prot, + struct mlx5e_ipsec_rx_err *rx_err) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_flow_act flow_act = {}; + struct mlx5_modify_hdr *modify_hdr; + struct mlx5_flow_handle *fte; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + /* Action to copy 7 bit ipsec_syndrome to regB[24:30] */ + MLX5_SET(copy_action_in, action, action_type, MLX5_ACTION_TYPE_COPY); + MLX5_SET(copy_action_in, action, src_field, MLX5_ACTION_IN_FIELD_IPSEC_SYNDROME); + MLX5_SET(copy_action_in, action, src_offset, 0); + MLX5_SET(copy_action_in, action, length, 7); + MLX5_SET(copy_action_in, action, dst_field, MLX5_ACTION_IN_FIELD_METADATA_REG_B); + MLX5_SET(copy_action_in, action, dst_offset, 24); + + modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_KERNEL, + 1, action); + + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + netdev_err(priv->netdev, + "fail to alloc ipsec copy modify_header_id err=%d\n", err); + goto out_spec; + } + + /* create fte */ + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_act.modify_hdr = modify_hdr; + fte = mlx5_add_flow_rules(rx_err->ft, spec, &flow_act, + &fs_prot->default_dest, 1); + if (IS_ERR(fte)) { + err = PTR_ERR(fte); + netdev_err(priv->netdev, "fail to add ipsec rx err copy rule err=%d\n", err); + goto out; + } + + rx_err->rule = fte; + rx_err->copy_modify_hdr = modify_hdr; + +out: + if (err) + mlx5_modify_header_dealloc(mdev, modify_hdr); +out_spec: + kvfree(spec); + return err; +} + +static void rx_err_del_rule(struct mlx5e_priv *priv, + struct mlx5e_ipsec_rx_err *rx_err) +{ + if (rx_err->rule) { + mlx5_del_flow_rules(rx_err->rule); + rx_err->rule = NULL; + } + + if (rx_err->copy_modify_hdr) { + mlx5_modify_header_dealloc(priv->mdev, rx_err->copy_modify_hdr); + rx_err->copy_modify_hdr = NULL; + } +} + +static void rx_err_destroy_ft(struct mlx5e_priv *priv, struct mlx5e_ipsec_rx_err *rx_err) +{ + rx_err_del_rule(priv, rx_err); + + if (rx_err->ft) { + mlx5_destroy_flow_table(rx_err->ft); + rx_err->ft = NULL; + } +} + +static int rx_err_create_ft(struct mlx5e_priv *priv, + struct mlx5e_accel_fs_esp_prot *fs_prot, + struct mlx5e_ipsec_rx_err *rx_err) +{ + struct 
mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *ft; + int err; + + ft_attr.max_fte = 1; + ft_attr.autogroup.max_num_groups = 1; + ft_attr.level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + ft = mlx5_create_auto_grouped_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(priv->netdev, "fail to create ipsec rx inline ft err=%d\n", err); + return err; + } + + rx_err->ft = ft; + err = rx_err_add_rule(priv, fs_prot, rx_err); + if (err) + goto out_err; + + return 0; + +out_err: + mlx5_destroy_flow_table(ft); + rx_err->ft = NULL; + return err; +} + +static void rx_fs_destroy(struct mlx5e_accel_fs_esp_prot *fs_prot) +{ + if (fs_prot->miss_rule) { + mlx5_del_flow_rules(fs_prot->miss_rule); + fs_prot->miss_rule = NULL; + } + + if (fs_prot->miss_group) { + mlx5_destroy_flow_group(fs_prot->miss_group); + fs_prot->miss_group = NULL; + } + + if (fs_prot->ft) { + mlx5_destroy_flow_table(fs_prot->ft); + fs_prot->ft = NULL; + } +} + +static int rx_fs_create(struct mlx5e_priv *priv, + struct mlx5e_accel_fs_esp_prot *fs_prot) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_group *miss_group; + struct mlx5_flow_handle *miss_rule; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft; + u32 *flow_group_in; + int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!flow_group_in || !spec) { + err = -ENOMEM; + goto out; + } + + /* Create FT */ + ft_attr.max_fte = NUM_IPSEC_FTE; + ft_attr.level = MLX5E_ACCEL_FS_ESP_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + ft_attr.autogroup.num_reserved_entries = 1; + ft_attr.autogroup.max_num_groups = 1; + ft = mlx5_create_auto_grouped_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(priv->netdev, "fail to create ipsec rx ft err=%d\n", err); + goto out; + } + fs_prot->ft = ft; + + /* Create miss_group */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, ft->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ft->max_fte - 1); + miss_group = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(miss_group)) { + err = PTR_ERR(miss_group); + netdev_err(priv->netdev, "fail to create ipsec rx miss_group err=%d\n", err); + goto out; + } + fs_prot->miss_group = miss_group; + + /* Create miss rule */ + miss_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &fs_prot->default_dest, 1); + if (IS_ERR(miss_rule)) { + err = PTR_ERR(miss_rule); + netdev_err(priv->netdev, "fail to create ipsec rx miss_rule err=%d\n", err); + goto out; + } + fs_prot->miss_rule = miss_rule; + +out: + kvfree(flow_group_in); + kvfree(spec); + return err; +} + +static int rx_destroy(struct mlx5e_priv *priv, enum accel_fs_esp_type type) +{ + struct mlx5e_accel_fs_esp_prot *fs_prot; + struct mlx5e_accel_fs_esp *accel_esp; + + accel_esp = priv->ipsec->rx_fs; + + /* The netdev unreg already happened, so all offloaded rule are already removed */ + fs_prot = &accel_esp->fs_prot[type]; + + rx_fs_destroy(fs_prot); + + rx_err_destroy_ft(priv, &fs_prot->rx_err); + + return 0; +} + +static int rx_create(struct mlx5e_priv *priv, enum accel_fs_esp_type type) +{ + struct mlx5e_accel_fs_esp_prot *fs_prot; + struct mlx5e_accel_fs_esp *accel_esp; + int err; + + accel_esp = priv->ipsec->rx_fs; + fs_prot = &accel_esp->fs_prot[type]; + + fs_prot->default_dest = + mlx5_ttc_get_default_dest(priv->fs.ttc, 
fs_esp2tt(type)); + + err = rx_err_create_ft(priv, fs_prot, &fs_prot->rx_err); + if (err) + return err; + + err = rx_fs_create(priv, fs_prot); + if (err) + rx_destroy(priv, type); + + return err; +} + +static int rx_ft_get(struct mlx5e_priv *priv, enum accel_fs_esp_type type) +{ + struct mlx5e_accel_fs_esp_prot *fs_prot; + struct mlx5_flow_destination dest = {}; + struct mlx5e_accel_fs_esp *accel_esp; + int err = 0; + + accel_esp = priv->ipsec->rx_fs; + fs_prot = &accel_esp->fs_prot[type]; + mutex_lock(&fs_prot->prot_mutex); + if (fs_prot->refcnt++) + goto out; + + /* create FT */ + err = rx_create(priv, type); + if (err) { + fs_prot->refcnt--; + goto out; + } + + /* connect */ + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = fs_prot->ft; + mlx5_ttc_fwd_dest(priv->fs.ttc, fs_esp2tt(type), &dest); + +out: + mutex_unlock(&fs_prot->prot_mutex); + return err; +} + +static void rx_ft_put(struct mlx5e_priv *priv, enum accel_fs_esp_type type) +{ + struct mlx5e_accel_fs_esp_prot *fs_prot; + struct mlx5e_accel_fs_esp *accel_esp; + + accel_esp = priv->ipsec->rx_fs; + fs_prot = &accel_esp->fs_prot[type]; + mutex_lock(&fs_prot->prot_mutex); + if (--fs_prot->refcnt) + goto out; + + /* disconnect */ + mlx5_ttc_fwd_default_dest(priv->fs.ttc, fs_esp2tt(type)); + + /* remove FT */ + rx_destroy(priv, type); + +out: + mutex_unlock(&fs_prot->prot_mutex); +} + +/* IPsec TX flow steering */ +static int tx_create(struct mlx5e_priv *priv) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5e_ipsec *ipsec = priv->ipsec; + struct mlx5_flow_table *ft; + int err; + + priv->fs.egress_ns = + mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_EGRESS_IPSEC); + if (!priv->fs.egress_ns) + return -EOPNOTSUPP; + + ft_attr.max_fte = NUM_IPSEC_FTE; + ft_attr.prio = 1; + ft_attr.autogroup.max_num_groups = 1; + ft = mlx5_create_auto_grouped_flow_table(priv->fs.egress_ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(priv->netdev, "fail to create ipsec tx ft err=%d\n", err); + return err; + } + ipsec->tx_fs->ft = ft; + return 0; +} + +static void tx_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec *ipsec = priv->ipsec; + + if (IS_ERR_OR_NULL(ipsec->tx_fs->ft)) + return; + + mlx5_destroy_flow_table(ipsec->tx_fs->ft); + ipsec->tx_fs->ft = NULL; +} + +static int tx_ft_get(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec_tx *tx_fs = priv->ipsec->tx_fs; + int err = 0; + + mutex_lock(&tx_fs->mutex); + if (tx_fs->refcnt++) + goto out; + + err = tx_create(priv); + if (err) { + tx_fs->refcnt--; + goto out; + } + +out: + mutex_unlock(&tx_fs->mutex); + return err; +} + +static void tx_ft_put(struct mlx5e_priv *priv) +{ + struct mlx5e_ipsec_tx *tx_fs = priv->ipsec->tx_fs; + + mutex_lock(&tx_fs->mutex); + if (--tx_fs->refcnt) + goto out; + + tx_destroy(priv); + +out: + mutex_unlock(&tx_fs->mutex); +} + + +static void setup_udp_match(struct mlx5_accel_esp_xfrm_attrs *attrs, struct mlx5_flow_spec *spec) +{ + if (attrs->upspec.proto == IPPROTO_UDP) { + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + MLX5_SET(fte_match_set_lyr_2_4, spec->match_criteria, udp_dport, + attrs->upspec.dport_mask); + MLX5_SET(fte_match_set_lyr_2_4, spec->match_value, udp_dport, attrs->upspec.dport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, spec->match_criteria, ip_protocol); + MLX5_SET(fte_match_set_lyr_2_4, spec->match_value, ip_protocol, IPPROTO_UDP); + } +} + +static void setup_esp_match(struct mlx5_accel_esp_xfrm_attrs *attrs, struct mlx5_flow_spec *spec) +{ + spec->match_criteria_enable |= 
MLX5_MATCH_MISC_PARAMETERS; + + /* ESP protocol */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_ESP); + + /* SPI number */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters.outer_esp_spi); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters.outer_esp_spi, + be32_to_cpu(attrs->spi)); +} + +static void setup_fte_common(struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act) +{ + u8 ip_version = attrs->is_ipv6 ? 6 : 4; + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + /* ip_version */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ip_version); + + /* Non fragmented */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.frag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.frag, 0); + if (ip_version == 4) { + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4), + &attrs->saddr.a4, 4); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &attrs->daddr.a4, 4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + } else { + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + &attrs->saddr.a6, 16); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &attrs->daddr.a6, 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, 16); + } + + flow_act->crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC; + flow_act->crypto.obj_id = ipsec_obj_id; + flow_act->flags |= FLOW_ACT_NO_APPEND; +} + +static int rx_add_rule_full(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + struct mlx5_eswitch *esw; + int err = 0; + + esw = mdev->priv.eswitch; + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + err = mlx5_esw_ipsec_get_refcnt(esw); + if (err) + goto ipsec_ref_err; + + setup_fte_common(attrs, ipsec_obj_id, spec, &flow_act); + setup_esp_match(attrs, spec); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = mlx5_esw_ipsec_get_table(esw, MLX5_ESW_IPSEC_FT_RX_DECAP); + rule = mlx5_add_flow_rules(mlx5_esw_ipsec_get_table(esw, MLX5_ESW_IPSEC_FT_RX_CRYPTO), spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "Failed to add IPsec Rx crypto rule err=%d\n", err); + goto rule_err; + } + + ipsec_rule->rule = rule; + return 0; + +rule_err: + mlx5_esw_ipsec_put_refcnt(esw); +ipsec_ref_err: + 
kvfree(spec); + return err; +} + +static int rx_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5_modify_hdr *modify_hdr = NULL; + struct mlx5e_accel_fs_esp_prot *fs_prot; + struct mlx5_flow_destination dest = {}; + struct mlx5e_accel_fs_esp *accel_esp; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + enum accel_fs_esp_type type; + struct mlx5_flow_spec *spec; + int err = 0; + + accel_esp = priv->ipsec->rx_fs; + type = attrs->is_ipv6 ? ACCEL_FS_ESP6 : ACCEL_FS_ESP4; + fs_prot = &accel_esp->fs_prot[type]; + + err = rx_ft_get(priv, type); + if (err) + return err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out_err; + } + + setup_fte_common(attrs, ipsec_obj_id, spec, &flow_act); + setup_esp_match(attrs, spec); + + /* Set bit[31] ipsec marker */ + /* Set bit[23-0] ipsec_obj_id */ + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_B); + MLX5_SET(set_action_in, action, data, (ipsec_obj_id | BIT(31))); + MLX5_SET(set_action_in, action, offset, 0); + MLX5_SET(set_action_in, action, length, 32); + + modify_hdr = mlx5_modify_header_alloc(priv->mdev, MLX5_FLOW_NAMESPACE_KERNEL, + 1, action); + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + netdev_err(priv->netdev, + "fail to alloc ipsec set modify_header_id err=%d\n", err); + modify_hdr = NULL; + goto out_err; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + flow_act.modify_hdr = modify_hdr; + dest.ft = fs_prot->rx_err.ft; + rule = mlx5_add_flow_rules(fs_prot->ft, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "fail to add ipsec rule attrs->action=0x%x, err=%d\n", + attrs->action, err); + goto out_err; + } + + ipsec_rule->rule = rule; + ipsec_rule->set_modify_hdr = modify_hdr; + goto out; + +out_err: + if (modify_hdr) + mlx5_modify_header_dealloc(priv->mdev, modify_hdr); + rx_ft_put(priv, type); + +out: + kvfree(spec); + return err; +} + +static int tx_add_rule_full(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + struct mlx5_pkt_reformat_params reformat_params; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + struct mlx5_eswitch *esw; + char *reformatbf = NULL; + int err = 0; + + esw = mdev->priv.eswitch; + + if (esw->offloads.ipsec != DEVLINK_ESWITCH_IPSEC_MODE_FULL) + return -ENOTSUPP; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + reformatbf = kzalloc(16, GFP_KERNEL); + if (!reformatbf) { + err = -ENOMEM; + goto out; + } + + err = mlx5_esw_ipsec_get_refcnt(esw); + if (err) + goto ipsec_ref_err; + + setup_fte_common(attrs, ipsec_obj_id, spec, &flow_act); + setup_udp_match(attrs, spec); + + /* IPsec Tx table1 FW rule */ + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + + memcpy(reformatbf, &attrs->spi, 4); + memcpy(reformatbf + 4, &attrs->seq, 4); + + 
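+	/* reformatbf now holds the SPI at offset 0 and the starting sequence
+	 * number (attrs->seq) at offset 4; the remaining bytes stay zero from
+	 * kzalloc(). It is passed below as the header data of the
+	 * ADD_ESP_TRANSPORT packet reformat.
+	 */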
reformat_params.type = (attrs->is_ipv6) ? MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 : + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4; + reformat_params.param_0 = attrs->aulen / 4; + reformat_params.param_1 = 0; + reformat_params.size = 16; + reformat_params.data = reformatbf; + flow_act.pkt_reformat = mlx5_packet_reformat_alloc(mdev, &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(flow_act.pkt_reformat)) { + err = PTR_ERR(flow_act.pkt_reformat); + netdev_err(priv->netdev, "Failed to allocate IPsec Tx reformat context err=%d\n", err); + goto pkt_reformat_err; + } + ipsec_rule->pkt_reformat = flow_act.pkt_reformat; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = mlx5_esw_ipsec_get_table(mdev->priv.eswitch, MLX5_ESW_IPSEC_FT_TX_CHK); + flow_act.crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC; + flow_act.crypto.obj_id = ipsec_obj_id; + rule = mlx5_add_flow_rules(mlx5_esw_ipsec_get_table(esw, MLX5_ESW_IPSEC_FT_TX_CRYPTO), spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "Failed to add IPsec FULL crypto rule attrs->action=0x%x, err=%d\n", attrs->action, err); + goto add_rule_err; + } + ipsec_rule->rule = rule; + + goto ipsec_ref_err; + +add_rule_err: + mlx5_packet_reformat_dealloc(mdev, ipsec_rule->pkt_reformat); + ipsec_rule->pkt_reformat = NULL; +pkt_reformat_err: + mlx5_esw_ipsec_put_refcnt(esw); +ipsec_ref_err: + kfree(reformatbf); +out: + kvfree(spec); + return err; +} + +static int tx_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + + err = tx_ft_get(priv); + if (err) + return err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + setup_fte_common(attrs, ipsec_obj_id, spec, &flow_act); + setup_esp_match(attrs, spec); + + /* Add IPsec indicator in metadata_reg_a */ + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_a, + MLX5_ETH_WQE_FT_META_IPSEC); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_a, + MLX5_ETH_WQE_FT_META_IPSEC); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW | + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT; + rule = mlx5_add_flow_rules(priv->ipsec->tx_fs->ft, spec, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "fail to add ipsec rule attrs->action=0x%x, err=%d\n", + attrs->action, err); + goto out; + } + + ipsec_rule->rule = rule; + +out: + kvfree(spec); + if (err) + tx_ft_put(priv); + return err; +} + +static void rx_del_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + mlx5_del_flow_rules(ipsec_rule->rule); + ipsec_rule->rule = NULL; + + mlx5_modify_header_dealloc(priv->mdev, ipsec_rule->set_modify_hdr); + ipsec_rule->set_modify_hdr = NULL; + + rx_ft_put(priv, attrs->is_ipv6 ? 
ACCEL_FS_ESP6 : ACCEL_FS_ESP4); +} + +static void tx_del_rule(struct mlx5e_priv *priv, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + mlx5_del_flow_rules(ipsec_rule->rule); + ipsec_rule->rule = NULL; + tx_ft_put(priv); +} + +static void rx_del_rule_full(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + mlx5_del_flow_rules(ipsec_rule->rule); + ipsec_rule->rule = NULL; + mlx5_esw_ipsec_put_refcnt(priv->mdev->priv.eswitch); +} + +static void tx_del_rule_full(struct mlx5e_priv *priv, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + mlx5_del_flow_rules(ipsec_rule->rule); + ipsec_rule->rule = NULL; + + mlx5_packet_reformat_dealloc(priv->mdev, ipsec_rule->pkt_reformat); + ipsec_rule->pkt_reformat = NULL; + mlx5_esw_ipsec_put_refcnt(priv->mdev->priv.eswitch); +} + +static int +ipsec_full_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + if (!mlx5_esw_ipsec_is_full_initialized(priv->mdev->priv.eswitch)) + return -EINVAL; + + if (attrs->action == MLX5_ACCEL_ESP_ACTION_DECRYPT) + return rx_add_rule_full(priv, attrs, ipsec_obj_id, ipsec_rule); + else + return tx_add_rule_full(priv, attrs, ipsec_obj_id, ipsec_rule); +} + +static int +ipsec_inline_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + if (!priv->ipsec->rx_fs) + return -EOPNOTSUPP; + + if (attrs->action == MLX5_ACCEL_ESP_ACTION_DECRYPT) + return rx_add_rule(priv, attrs, ipsec_obj_id, ipsec_rule); + else + return tx_add_rule(priv, attrs, ipsec_obj_id, ipsec_rule); +} + +int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + if (attrs->flags & MLX5_ACCEL_ESP_FLAGS_FULL_OFFLOAD) + return ipsec_full_add_rule(priv, attrs, ipsec_obj_id, ipsec_rule); + else + return ipsec_inline_add_rule(priv, attrs, ipsec_obj_id, ipsec_rule); +} + +static void +ipsec_full_del_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + if (!mlx5_esw_ipsec_is_full_initialized(priv->mdev->priv.eswitch)) + return; + + if (attrs->action == MLX5_ACCEL_ESP_ACTION_DECRYPT) + return rx_del_rule_full(priv, attrs, ipsec_rule); + else + return tx_del_rule_full(priv, ipsec_rule); +} + +static void +ipsec_inline_del_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + if (!priv->ipsec->rx_fs) + return; + + if (attrs->action == MLX5_ACCEL_ESP_ACTION_DECRYPT) + return rx_del_rule(priv, attrs, ipsec_rule); + else + return tx_del_rule(priv, ipsec_rule); +} + +void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + struct mlx5e_ipsec_rule *ipsec_rule) +{ + if (attrs->flags & MLX5_ACCEL_ESP_FLAGS_FULL_OFFLOAD) + return ipsec_full_del_rule(priv, attrs, ipsec_rule); + else + return ipsec_inline_del_rule(priv, attrs, ipsec_rule); +} + +static void fs_cleanup_tx(struct mlx5e_priv *priv) +{ + mutex_destroy(&priv->ipsec->tx_fs->mutex); + WARN_ON(priv->ipsec->tx_fs->refcnt); + kfree(priv->ipsec->tx_fs); + priv->ipsec->tx_fs = NULL; +} + +static void fs_cleanup_rx(struct mlx5e_priv *priv) +{ + struct mlx5e_accel_fs_esp_prot *fs_prot; + struct mlx5e_accel_fs_esp *accel_esp; + enum accel_fs_esp_type i; + + accel_esp = priv->ipsec->rx_fs; + for (i = 0; i < ACCEL_FS_ESP_NUM_TYPES; i++) { + 
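+		/* One mutex/refcount pair per protocol (ESP over IPv4/IPv6);
+		 * by cleanup time every table user must have been released,
+		 * which the WARN_ON on the refcount below verifies.
+		 */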
fs_prot = &accel_esp->fs_prot[i]; + mutex_destroy(&fs_prot->prot_mutex); + WARN_ON(fs_prot->refcnt); + } + kfree(priv->ipsec->rx_fs); + priv->ipsec->rx_fs = NULL; +} + +static int fs_init_tx(struct mlx5e_priv *priv) +{ + priv->ipsec->tx_fs = + kzalloc(sizeof(struct mlx5e_ipsec_tx), GFP_KERNEL); + if (!priv->ipsec->tx_fs) + return -ENOMEM; + + mutex_init(&priv->ipsec->tx_fs->mutex); + return 0; +} + +static int fs_init_rx(struct mlx5e_priv *priv) +{ + struct mlx5e_accel_fs_esp_prot *fs_prot; + struct mlx5e_accel_fs_esp *accel_esp; + enum accel_fs_esp_type i; + + priv->ipsec->rx_fs = + kzalloc(sizeof(struct mlx5e_accel_fs_esp), GFP_KERNEL); + if (!priv->ipsec->rx_fs) + return -ENOMEM; + + accel_esp = priv->ipsec->rx_fs; + for (i = 0; i < ACCEL_FS_ESP_NUM_TYPES; i++) { + fs_prot = &accel_esp->fs_prot[i]; + mutex_init(&fs_prot->prot_mutex); + } + + return 0; +} + +void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_priv *priv) +{ + if (!priv->ipsec->rx_fs) + return; + + fs_cleanup_tx(priv); + fs_cleanup_rx(priv); +} + +int mlx5e_accel_ipsec_fs_init(struct mlx5e_priv *priv) +{ + int err; + + if (!mlx5_is_ipsec_device(priv->mdev) || !priv->ipsec) + return -EOPNOTSUPP; + + err = fs_init_tx(priv); + if (err) + return err; + + err = fs_init_rx(priv); + if (err) + fs_cleanup_tx(priv); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h new file mode 100644 index 0000000..3389b3b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5_IPSEC_STEERING_H__ +#define __MLX5_IPSEC_STEERING_H__ + +#include "en.h" +#include "ipsec.h" +#include "accel/ipsec_offload.h" +#include "en/fs.h" + +#ifdef CONFIG_MLX5_EN_IPSEC +void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_priv *priv); +int mlx5e_accel_ipsec_fs_init(struct mlx5e_priv *priv); +int mlx5e_accel_ipsec_fs_add_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 ipsec_obj_id, + struct mlx5e_ipsec_rule *ipsec_rule); +void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_priv *priv, + struct mlx5_accel_esp_xfrm_attrs *attrs, + struct mlx5e_ipsec_rule *ipsec_rule); +#else +static inline void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_priv *priv) {} +static inline int mlx5e_accel_ipsec_fs_init(struct mlx5e_priv *priv) { return 0; } +#endif +#endif /* __MLX5_IPSEC_STEERING_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c new file mode 100644 index 0000000..78333e2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include "accel/ipsec_offload.h" +#include "en_accel/ipsec_rxtx.h" +#include "en_accel/ipsec.h" +#include "accel/accel.h" +#include "en.h" +#include "../esw/ipsec.h" + +enum { + MLX5E_IPSEC_RX_SYNDROME_DECRYPTED = 0x11, + MLX5E_IPSEC_RX_SYNDROME_AUTH_FAILED = 0x12, + MLX5E_IPSEC_RX_SYNDROME_BAD_PROTO = 0x17, +}; + +struct mlx5e_ipsec_rx_metadata { + unsigned char nexthdr; + __be32 sa_handle; +} __packed; + +enum { + MLX5E_IPSEC_TX_SYNDROME_OFFLOAD = 0x8, + MLX5E_IPSEC_TX_SYNDROME_OFFLOAD_WITH_LSO_TCP = 0x9, +}; + +struct mlx5e_ipsec_tx_metadata { + __be16 mss_inv; /* 1/MSS in 16bit fixed point, only for LSO */ + __be16 seq; /* LSBs of the first TCP seq, only for LSO */ + u8 esp_next_proto; /* Next protocol of ESP */ +} __packed; + +struct mlx5e_ipsec_metadata { + unsigned char syndrome; + union { + unsigned char raw[5]; + /* from FPGA to host, on successful decrypt */ + struct mlx5e_ipsec_rx_metadata rx; + /* from host to FPGA */ + struct mlx5e_ipsec_tx_metadata tx; + } __packed content; + /* packet type ID field */ + __be16 ethertype; +} __packed; + +#define MAX_LSO_MSS 2048 + +/* Pre-calculated (Q0.16) fixed-point inverse 1/x function */ +static __be16 mlx5e_ipsec_inverse_table[MAX_LSO_MSS]; + +static inline __be16 mlx5e_ipsec_mss_inv(struct sk_buff *skb) +{ + return mlx5e_ipsec_inverse_table[skb_shinfo(skb)->gso_size]; +} + +static struct mlx5e_ipsec_metadata *mlx5e_ipsec_add_metadata(struct sk_buff *skb) +{ + struct mlx5e_ipsec_metadata *mdata; + struct ethhdr *eth; + + if (unlikely(skb_cow_head(skb, sizeof(*mdata)))) + return ERR_PTR(-ENOMEM); + + eth = (struct ethhdr *)skb_push(skb, sizeof(*mdata)); + skb->mac_header -= sizeof(*mdata); + mdata = (struct mlx5e_ipsec_metadata *)(eth + 1); + + memmove(skb->data, skb->data + sizeof(*mdata), + 2 * ETH_ALEN); + + eth->h_proto = cpu_to_be16(MLX5E_METADATA_ETHER_TYPE); + + memset(mdata->content.raw, 0, sizeof(mdata->content.raw)); + return mdata; +} + +static int mlx5e_ipsec_remove_trailer(struct sk_buff *skb, struct xfrm_state *x) +{ + unsigned int alen = crypto_aead_authsize(x->data); + struct ipv6hdr *ipv6hdr = ipv6_hdr(skb); + struct iphdr *ipv4hdr = ip_hdr(skb); + unsigned int trailer_len; + u8 plen; + int ret; + + ret = skb_copy_bits(skb, skb->len - alen - 2, &plen, 1); + 
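+	/* Per RFC 4303 the ESP trailer ends with the pad-length byte, the
+	 * next-header byte and then an ICV of alen bytes, so the pad length
+	 * read above sits at skb->len - alen - 2 and the whole trailer spans
+	 * alen + plen + 2 bytes, as computed below.
+	 */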
if (unlikely(ret)) + return ret; + + trailer_len = alen + plen + 2; + + pskb_trim(skb, skb->len - trailer_len); + if (skb->protocol == htons(ETH_P_IP)) { + ipv4hdr->tot_len = htons(ntohs(ipv4hdr->tot_len) - trailer_len); + ip_send_check(ipv4hdr); + } else { + ipv6hdr->payload_len = htons(ntohs(ipv6hdr->payload_len) - + trailer_len); + } + return 0; +} + +static void mlx5e_ipsec_set_swp(struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg, u8 mode, + struct xfrm_offload *xo) +{ + /* Tunnel Mode: + * SWP: OutL3 InL3 InL4 + * Pkt: MAC IP ESP IP L4 + * + * Transport Mode: + * SWP: OutL3 OutL4 + * Pkt: MAC IP ESP L4 + * + * Tunnel(VXLAN TCP/UDP) over Transport Mode + * SWP: OutL3 InL3 InL4 + * Pkt: MAC IP ESP UDP VXLAN IP L4 + */ + + /* Shared settings */ + eseg->swp_outer_l3_offset = skb_network_offset(skb) / 2; + if (skb->protocol == htons(ETH_P_IPV6)) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L3_IPV6; + + /* Tunnel mode */ + if (mode == XFRM_MODE_TUNNEL) { + eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; + if (xo->proto == IPPROTO_IPV6) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + + switch (xo->inner_ipproto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; + case IPPROTO_TCP: + /* IP | ESP | IP | [TCP | UDP] */ + eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2; + break; + default: + break; + } + return; + } + + /* Transport mode */ + if (mode != XFRM_MODE_TRANSPORT) + return; + + if (!xo->inner_ipproto) { + switch (xo->proto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_UDP; + fallthrough; + case IPPROTO_TCP: + /* IP | ESP | TCP */ + eseg->swp_outer_l4_offset = skb_inner_transport_offset(skb) / 2; + break; + default: + break; + } + } else { + /* Tunnel(VXLAN TCP/UDP) over Transport Mode */ + switch (xo->inner_ipproto) { + case IPPROTO_UDP: + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP; + fallthrough; + case IPPROTO_TCP: + eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; + eseg->swp_inner_l4_offset = + (skb->csum_start + skb->head - skb->data) / 2; + if (inner_ip_hdr(skb)->version == 6) + eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; + break; + default: + break; + } + } + +} + +void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo) +{ + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + __u32 oseq = replay_esn->oseq; + int iv_offset; + __be64 seqno; + u32 seq_hi; + + if (unlikely(skb_is_gso(skb) && oseq < MLX5E_IPSEC_ESN_SCOPE_MID && + MLX5E_IPSEC_ESN_SCOPE_MID < (oseq - skb_shinfo(skb)->gso_segs))) { + seq_hi = xo->seq.hi - 1; + } else { + seq_hi = xo->seq.hi; + } + + /* Place the SN in the IV field */ + seqno = cpu_to_be64(xo->seq.low + ((u64)seq_hi << 32)); + iv_offset = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr); + skb_store_bits(skb, iv_offset, &seqno, 8); +} + +void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo) +{ + int iv_offset; + __be64 seqno; + + /* Place the SN in the IV field */ + seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + iv_offset = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr); + skb_store_bits(skb, iv_offset, &seqno, 8); +} + +static void mlx5e_ipsec_set_metadata(struct sk_buff *skb, + struct mlx5e_ipsec_metadata *mdata, + struct xfrm_offload *xo) +{ + struct ip_esp_hdr *esph; + struct tcphdr *tcph; + + if (skb_is_gso(skb)) { + /* Add LSO metadata indication */ + esph = ip_esp_hdr(skb); + tcph = inner_tcp_hdr(skb); 
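+		/* For LSO, the TX metadata carries 1/MSS in Q0.16 fixed point
+		 * and the low 16 bits of the first inner TCP sequence number,
+		 * letting the device locate each segment and derive its ESP
+		 * sequence number and IV.
+		 */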
+ netdev_dbg(skb->dev, " Offloading GSO packet outer L3 %u; L4 %u; Inner L3 %u; L4 %u\n", + skb->network_header, + skb->transport_header, + skb->inner_network_header, + skb->inner_transport_header); + netdev_dbg(skb->dev, " Offloading GSO packet of len %u; mss %u; TCP sp %u dp %u seq 0x%x ESP seq 0x%x\n", + skb->len, skb_shinfo(skb)->gso_size, + ntohs(tcph->source), ntohs(tcph->dest), + ntohl(tcph->seq), ntohl(esph->seq_no)); + mdata->syndrome = MLX5E_IPSEC_TX_SYNDROME_OFFLOAD_WITH_LSO_TCP; + mdata->content.tx.mss_inv = mlx5e_ipsec_mss_inv(skb); + mdata->content.tx.seq = htons(ntohl(tcph->seq) & 0xFFFF); + } else { + mdata->syndrome = MLX5E_IPSEC_TX_SYNDROME_OFFLOAD; + } + mdata->content.tx.esp_next_proto = xo->proto; + + netdev_dbg(skb->dev, " TX metadata syndrome %u proto %u mss_inv %04x seq %04x\n", + mdata->syndrome, mdata->content.tx.esp_next_proto, + ntohs(mdata->content.tx.mss_inv), + ntohs(mdata->content.tx.seq)); +} + +void mlx5e_ipsec_handle_tx_wqe(struct mlx5e_tx_wqe *wqe, + struct mlx5e_accel_tx_ipsec_state *ipsec_st, + struct mlx5_wqe_inline_seg *inlseg) +{ + inlseg->byte_count = cpu_to_be32(ipsec_st->tailen | MLX5_INLINE_SEG); + esp_output_fill_trailer((u8 *)inlseg->data, 0, ipsec_st->plen, ipsec_st->xo->proto); +} + +static int mlx5e_ipsec_set_state(struct mlx5e_priv *priv, + struct sk_buff *skb, + struct xfrm_state *x, + struct xfrm_offload *xo, + struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + unsigned int blksize, clen, alen, plen; + struct crypto_aead *aead; + unsigned int tailen; + + ipsec_st->x = x; + ipsec_st->xo = xo; + if (mlx5_is_ipsec_device(priv->mdev)) { + aead = x->data; + alen = crypto_aead_authsize(aead); + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + clen = ALIGN(skb->len + 2, blksize); + plen = max_t(u32, clen - skb->len, 4); + tailen = plen + alen; + ipsec_st->plen = plen; + ipsec_st->tailen = tailen; + } + + return 0; +} + +void mlx5e_ipsec_tx_build_eseg(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + struct xfrm_encap_tmpl *encap; + struct xfrm_state *x; + struct sec_path *sp; + u8 l3_proto; + + sp = skb_sec_path(skb); + if (unlikely(sp->len != 1)) + return; + + x = xfrm_input_state(skb); + if (unlikely(!x)) + return; + + if (unlikely(!x->xso.offload_handle || + (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)))) + return; + + mlx5e_ipsec_set_swp(skb, eseg, x->props.mode, xo); + + l3_proto = (x->props.family == AF_INET) ? + ((struct iphdr *)skb_network_header(skb))->protocol : + ((struct ipv6hdr *)skb_network_header(skb))->nexthdr; + + if (mlx5_is_ipsec_device(priv->mdev)) { + eseg->flow_table_metadata |= cpu_to_be32(MLX5_ETH_WQE_FT_META_IPSEC); + eseg->trailer |= cpu_to_be32(MLX5_ETH_WQE_INSERT_TRAILER); + encap = x->encap; + if (!encap) { + eseg->trailer |= (l3_proto == IPPROTO_ESP) ? + cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC) : + cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC); + } else if (encap->encap_type == UDP_ENCAP_ESPINUDP) { + eseg->trailer |= (l3_proto == IPPROTO_ESP) ? 
+ cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_INNER_IP_ASSOC) : + cpu_to_be32(MLX5_ETH_WQE_TRAILER_HDR_INNER_L4_ASSOC); + } + } +} + +bool mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct xfrm_offload *xo = xfrm_offload(skb); + struct mlx5e_ipsec_sa_entry *sa_entry; + struct mlx5e_ipsec_metadata *mdata; + struct xfrm_state *x; + struct sec_path *sp; + + sp = skb_sec_path(skb); + if (unlikely(sp->len != 1)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_bundle); + goto drop; + } + + x = xfrm_input_state(skb); + if (unlikely(!x)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_no_state); + goto drop; + } + + if (unlikely(!x->xso.offload_handle || + (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)))) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_not_ip); + goto drop; + } + + if (!skb_is_gso(skb)) + if (unlikely(mlx5e_ipsec_remove_trailer(skb, x))) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_trailer); + goto drop; + } + + if (MLX5_CAP_GEN(priv->mdev, fpga)) { + mdata = mlx5e_ipsec_add_metadata(skb); + if (IS_ERR(mdata)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_metadata); + goto drop; + } + } + + sa_entry = (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle; + sa_entry->set_iv_op(skb, x, xo); + if (MLX5_CAP_GEN(priv->mdev, fpga)) + mlx5e_ipsec_set_metadata(skb, mdata, xo); + + mlx5e_ipsec_set_state(priv, skb, x, xo, ipsec_st); + + return true; + +drop: + kfree_skb(skb); + return false; +} + +static inline struct xfrm_state * +mlx5e_ipsec_build_sp(struct net_device *netdev, struct sk_buff *skb, + struct mlx5e_ipsec_metadata *mdata) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct xfrm_offload *xo; + struct xfrm_state *xs; + struct sec_path *sp; + u32 sa_handle; + + sp = secpath_set(skb); + if (unlikely(!sp)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc); + return NULL; + } + + sa_handle = be32_to_cpu(mdata->content.rx.sa_handle); + xs = mlx5e_ipsec_sadb_rx_lookup(priv->ipsec, sa_handle); + if (unlikely(!xs)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sadb_miss); + return NULL; + } + + sp = skb_sec_path(skb); + sp->xvec[sp->len++] = xs; + sp->olen++; + + xo = xfrm_offload(skb); + xo->flags = CRYPTO_DONE; + switch (mdata->syndrome) { + case MLX5E_IPSEC_RX_SYNDROME_DECRYPTED: + xo->status = CRYPTO_SUCCESS; + if (likely(priv->ipsec->no_trailer)) { + xo->flags |= XFRM_ESP_NO_TRAILER; + xo->proto = mdata->content.rx.nexthdr; + } + break; + case MLX5E_IPSEC_RX_SYNDROME_AUTH_FAILED: + xo->status = CRYPTO_TUNNEL_ESP_AUTH_FAILED; + break; + case MLX5E_IPSEC_RX_SYNDROME_BAD_PROTO: + xo->status = CRYPTO_INVALID_PROTOCOL; + break; + default: + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_syndrome); + return NULL; + } + return xs; +} + +struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb, u32 *cqe_bcnt) +{ + struct mlx5e_ipsec_metadata *mdata; + struct xfrm_state *xs; + + if (!is_metadata_hdr_valid(skb)) + return skb; + + /* Use the metadata */ + mdata = (struct mlx5e_ipsec_metadata *)(skb->data + ETH_HLEN); + xs = mlx5e_ipsec_build_sp(netdev, skb, mdata); + if (unlikely(!xs)) { + kfree_skb(skb); + return NULL; + } + + remove_metadata_hdr(skb); + *cqe_bcnt -= MLX5E_METADATA_ETHER_LEN; + + return skb; +} + +enum { + MLX5E_IPSEC_OFFLOAD_RX_SYNDROME_DECRYPTED, + MLX5E_IPSEC_OFFLOAD_RX_SYNDROME_AUTH_FAILED, + 
MLX5E_IPSEC_OFFLOAD_RX_SYNDROME_BAD_TRAILER, +}; + +static void +handle_rx_skb_full(struct mlx5e_priv *priv, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe) +{ + struct xfrm_state *xs; + struct sec_path *sp; + struct iphdr *v4_hdr; + u8 ip_ver; + + v4_hdr = (struct iphdr *)(skb->data + ETH_HLEN); + ip_ver = v4_hdr->version; + + if ((ip_ver != 4) && (ip_ver != 6)) + return; + + xs = mlx5e_ipsec_sadb_rx_lookup_state(priv->ipsec, skb, ip_ver); + if (!xs) + return; + + sp = secpath_set(skb); + if (unlikely(!sp)) + return; + + sp->xvec[sp->len++] = xs; + return; +} + +static void +handle_rx_skb_inline(struct mlx5e_priv *priv, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe) +{ + u32 ipsec_meta_data = be32_to_cpu(cqe->ft_metadata); + struct xfrm_offload *xo; + struct xfrm_state *xs; + struct sec_path *sp; + u32 sa_handle; + + sa_handle = MLX5_IPSEC_METADATA_HANDLE(ipsec_meta_data); + sp = secpath_set(skb); + if (unlikely(!sp)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc); + return; + } + + xs = mlx5e_ipsec_sadb_rx_lookup(priv->ipsec, sa_handle); + if (unlikely(!xs)) { + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sadb_miss); + return; + } + + sp = skb_sec_path(skb); + sp->xvec[sp->len++] = xs; + sp->olen++; + + xo = xfrm_offload(skb); + xo->flags = CRYPTO_DONE; + + switch (MLX5_IPSEC_METADATA_SYNDROM(ipsec_meta_data)) { + case MLX5E_IPSEC_OFFLOAD_RX_SYNDROME_DECRYPTED: + xo->status = CRYPTO_SUCCESS; + if (WARN_ON_ONCE(priv->ipsec->no_trailer)) + xo->flags |= XFRM_ESP_NO_TRAILER; + break; + case MLX5E_IPSEC_OFFLOAD_RX_SYNDROME_AUTH_FAILED: + xo->status = CRYPTO_TUNNEL_ESP_AUTH_FAILED; + break; + case MLX5E_IPSEC_OFFLOAD_RX_SYNDROME_BAD_TRAILER: + xo->status = CRYPTO_INVALID_PACKET_SYNTAX; + break; + default: + atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_syndrome); + } +} + +void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (mlx5_is_ipsec_full_offload(priv)) + handle_rx_skb_full(priv, skb, cqe); + else + handle_rx_skb_inline(priv, skb, cqe); +} + +void mlx5e_ipsec_build_inverse_table(void) +{ + u16 mss_inv; + u32 mss; + + /* Calculate 1/x inverse table for use in GSO data path. + * Using this table, we provide the IPSec accelerator with the value of + * 1/gso_size so that it can infer the position of each segment inside + * the GSO, and increment the ESP sequence number, and generate the IV. 
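+	 * For example, a gso_size of 1460 maps to
+	 * div_u64(1ULL << 32, 1460) >> 16 = 44, i.e. 44/65536, which
+	 * approximates 1/1460 from below because of the truncating shift.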
+ * The HW needs this value in Q0.16 fixed-point number format + */ + mlx5e_ipsec_inverse_table[1] = htons(0xFFFF); + for (mss = 2; mss < MAX_LSO_MSS; mss++) { + mss_inv = div_u64(1ULL << 32, mss) >> 16; + mlx5e_ipsec_inverse_table[mss] = htons(mss_inv); + } +} + +int mlx5e_ipsec_set_flow_attrs(struct mlx5e_priv *priv, u32 *match_c, u32 *match_v, + struct ethtool_rx_flow_spec *fs) +{ + void *misc_param_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); + void *misc_param_v = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters); + struct mlx5e_ipsec_metadata *mdata_c, *mdata_v; + u32 handle; + int err; + + err = mlx5e_ipsec_sadb_rx_lookup_rev(priv->ipsec, fs, &handle); + if (err) + return err; + + mdata_c = (void *)MLX5_ADDR_OF(fte_match_set_misc, misc_param_c, outer_emd_tag_data); + mdata_v = (void *)MLX5_ADDR_OF(fte_match_set_misc, misc_param_v, outer_emd_tag_data); + + MLX5_SET(fte_match_set_misc, misc_param_c, outer_emd_tag, 1); + mdata_c->content.rx.sa_handle = 0xFFFFFFFF; + mdata_v->content.rx.sa_handle = htonl(handle); + return 0; +} + +__wsum mlx5e_ipsec_offload_handle_rx_csum(struct sk_buff *skb, struct mlx5_cqe64 *cqe) +{ + unsigned int tr_len, alen; + struct xfrm_offload *xo; + struct ipv6hdr *ipv6hdr; + struct iphdr *ipv4hdr; + __wsum csum, hw_csum; + struct xfrm_state *x; + u8 plen, proto; + + xo = xfrm_offload(skb); + x = xfrm_input_state(skb); + alen = crypto_aead_authsize(x->data); + skb_copy_bits(skb, skb->len - alen - 2, &plen, 1); + skb_copy_bits(skb, skb->len - alen - 1, &proto, 1); + tr_len = alen + plen + 2; + csum = skb_checksum(skb, skb->len - tr_len, tr_len, 0); + hw_csum = csum_unfold((__force __sum16)cqe->check_sum); + csum = csum_block_sub(csum_unfold((__force __sum16)cqe->check_sum), csum, + skb->len - tr_len); + pskb_trim(skb, skb->len - tr_len); + xo->flags |= XFRM_ESP_NO_TRAILER; + xo->proto = proto; + if (skb->protocol == htons(ETH_P_IP)) { + ipv4hdr = ip_hdr(skb); + ipv4hdr->tot_len = htons(ntohs(ipv4hdr->tot_len) - tr_len); + } else { + ipv6hdr = ipv6_hdr(skb); + ipv6hdr->payload_len = htons(ntohs(ipv6hdr->payload_len) - tr_len); + } + + return csum; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h new file mode 100644 index 0000000..2a4a8dc --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5E_IPSEC_RXTX_H__ +#define __MLX5E_IPSEC_RXTX_H__ + +#include +#include +#include "en.h" +#include "en/txrx.h" + +/* Bit31: IPsec marker, Bit30: reserved, Bit29-24: IPsec syndrome, Bit23-0: IPsec obj id */ +#define MLX5_IPSEC_METADATA_MARKER(metadata) (((metadata) >> 31) & 0x1) +#define MLX5_IPSEC_METADATA_SYNDROM(metadata) (((metadata) >> 24) & GENMASK(5, 0)) +#define MLX5_IPSEC_METADATA_HANDLE(metadata) ((metadata) & GENMASK(23, 0)) + +struct mlx5e_accel_tx_ipsec_state { + struct xfrm_offload *xo; + struct xfrm_state *x; + u32 tailen; + u32 plen; +}; + +#ifdef CONFIG_MLX5_EN_IPSEC + +struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb, u32 *cqe_bcnt); + +void mlx5e_ipsec_inverse_table_init(void); +void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo); +void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x, + struct xfrm_offload *xo); +bool mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5e_accel_tx_ipsec_state *ipsec_st); +void mlx5e_ipsec_handle_tx_wqe(struct mlx5e_tx_wqe *wqe, + struct mlx5e_accel_tx_ipsec_state *ipsec_st, + struct mlx5_wqe_inline_seg *inlseg); +void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe); +static inline unsigned int mlx5e_ipsec_tx_ids_len(struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + return ipsec_st->tailen; +} + +static inline bool mlx5_ipsec_is_rx_flow(struct mlx5_cqe64 *cqe) +{ + return MLX5_IPSEC_METADATA_MARKER(be32_to_cpu(cqe->ft_metadata)); +} + +static inline bool mlx5e_ipsec_is_tx_flow(struct mlx5e_accel_tx_ipsec_state *ipsec_st) +{ + return ipsec_st->x; +} + +static inline bool mlx5e_ipsec_eseg_meta(struct mlx5_wqe_eth_seg *eseg) +{ + return eseg->flow_table_metadata & cpu_to_be32(MLX5_ETH_WQE_FT_META_IPSEC); +} + +void mlx5e_ipsec_tx_build_eseg(struct mlx5e_priv *priv, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg); + +static inline netdev_features_t +mlx5e_ipsec_feature_check(struct sk_buff *skb, netdev_features_t features) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp = skb_sec_path(skb); + + if (sp && sp->len && xo) { + struct xfrm_state *x = sp->xvec[0]; + + if (!x || !x->xso.offload_handle) + goto out_disable; + + if (xo->inner_ipproto) { + /* Cannot support tunnel packet over IPsec tunnel mode + * because we cannot offload three IP header csum + */ + if (x->props.mode == XFRM_MODE_TUNNEL) + goto out_disable; + + /* Only support UDP or TCP L4 checksum */ + if (xo->inner_ipproto != IPPROTO_UDP && + xo->inner_ipproto != IPPROTO_TCP) + goto out_disable; + } + + return features; + + } + + /* Disable CSUM and GSO for software IPsec */ +out_disable: + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); +} + +static inline bool +mlx5e_ipsec_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) +{ + u8 inner_ipproto; + + if (!mlx5e_ipsec_eseg_meta(eseg)) + 
return false; + + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; + inner_ipproto = xfrm_offload(skb)->inner_ipproto; + if (inner_ipproto) { + eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM; + if (inner_ipproto == IPPROTO_TCP || inner_ipproto == IPPROTO_UDP) + eseg->cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM; + } else if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; + sq->stats->csum_partial_inner++; + } + + return true; +} + +__wsum mlx5e_ipsec_offload_handle_rx_csum(struct sk_buff *skb, struct mlx5_cqe64 *cqe); +#else +static inline +void mlx5e_ipsec_offload_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe) +{} + +static inline bool mlx5e_ipsec_eseg_meta(struct mlx5_wqe_eth_seg *eseg) +{ + return false; +} + +static inline bool mlx5_ipsec_is_rx_flow(struct mlx5_cqe64 *cqe) { return false; } +static inline netdev_features_t +mlx5e_ipsec_feature_check(struct sk_buff *skb, netdev_features_t features) +{ return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } + +static inline bool +mlx5e_ipsec_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) +{ + return false; +} + +static inline __wsum mlx5e_ipsec_offload_handle_rx_csum(struct sk_buff *skb, + struct mlx5_cqe64 *cqe) +{ return 0; } + +#endif /* CONFIG_MLX5_EN_IPSEC */ + +int mlx5e_ipsec_set_flow_attrs(struct mlx5e_priv *priv, u32 *match_c, u32 *match_v, + struct ethtool_rx_flow_spec *fs); +#endif /* __MLX5E_IPSEC_RXTX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c new file mode 100644 index 0000000..6ae9e8e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include + +#include "en.h" +#include "accel/ipsec.h" +#include "fpga/sdk.h" +#include "en_accel/ipsec.h" +#include "fpga/ipsec.h" +#include "esw/ipsec.h" + +static const struct counter_desc mlx5e_ipsec_hw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_in_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_out_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_bypass_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_in_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_out_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_bypass_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_drop_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_auth_fail_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_drop_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_add_sa_success) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_add_sa_fail) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_del_sa_success) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_del_sa_fail) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_cmd_drop) }, +}; + +static const struct counter_desc mlx5e_ipsec_hw_stats_desc_full[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_rx_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_rx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_rx_pkts_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_rx_bytes_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_tx_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_tx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_tx_pkts_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_full_tx_bytes_drop) }, +}; + +static const struct counter_desc mlx5e_ipsec_sw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sp_alloc) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sadb_miss) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_syndrome) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_bundle) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_no_state) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_not_ip) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_trailer) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_metadata) }, +}; + +#define MLX5E_READ_CTR_ATOMIC64(ptr, dsc, i) \ + atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].offset)) + +#define NUM_IPSEC_HW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_hw_stats_desc) +#define NUM_IPSEC_HW_COUNTERS_FULL ARRAY_SIZE(mlx5e_ipsec_hw_stats_desc_full) +#define NUM_IPSEC_SW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_sw_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec_sw) +{ + return priv->ipsec ? 
NUM_IPSEC_SW_COUNTERS : 0; +} + +static inline MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ipsec_sw) {} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ipsec_sw) +{ + unsigned int i; + + if (priv->ipsec) + for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_ipsec_sw_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec_sw) +{ + int i; + + if (priv->ipsec) + for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR_ATOMIC64(&priv->ipsec->sw_stats, + mlx5e_ipsec_sw_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ipsec_hw) +{ + int num_stats = 0; + + if (!priv->ipsec) + return 0; + + if (mlx5_accel_ipsec_device_caps(priv->mdev) & MLX5_ACCEL_IPSEC_CAP_FULL_OFFLOAD) + num_stats = NUM_IPSEC_HW_COUNTERS_FULL; + else if (mlx5_fpga_ipsec_device_caps(priv->mdev)) + num_stats = NUM_IPSEC_HW_COUNTERS; + + return num_stats; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ipsec_hw) +{ + int ret = 0; + + if (priv->ipsec) + ret = mlx5_accel_ipsec_counters_read(priv->mdev, (u64 *)&priv->ipsec->stats, + NUM_IPSEC_HW_COUNTERS); + if (ret) + memset(&priv->ipsec->stats, 0, sizeof(priv->ipsec->stats)); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ipsec_hw) +{ + unsigned int i; + + if (!priv->ipsec) + return idx; + + if ((mlx5_accel_ipsec_device_caps(priv->mdev) & MLX5_ACCEL_IPSEC_CAP_FULL_OFFLOAD) && + (mlx5_is_ipsec_full_offload(priv))) + for (i = 0; i < NUM_IPSEC_HW_COUNTERS_FULL; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_ipsec_hw_stats_desc_full[i].format); + else if (mlx5_fpga_ipsec_device_caps(priv->mdev)) + for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_ipsec_hw_stats_desc[i].format); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ipsec_hw) +{ + int i; + + if (!priv->ipsec) + return idx; + + if ((mlx5_accel_ipsec_device_caps(priv->mdev) & MLX5_ACCEL_IPSEC_CAP_FULL_OFFLOAD) && + (mlx5_is_ipsec_full_offload(priv))) { + mlx5_esw_ipsec_full_offload_get_stats(priv->mdev->priv.eswitch, + &priv->ipsec->stats); + for (i = 0; i < NUM_IPSEC_HW_COUNTERS_FULL; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->ipsec->stats, + mlx5e_ipsec_hw_stats_desc_full, + i); + } else if (mlx5_fpga_ipsec_device_caps(priv->mdev)) { + for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->ipsec->stats, + mlx5e_ipsec_hw_stats_desc, + i); + } + + return idx; +} + +MLX5E_DEFINE_STATS_GRP(ipsec_sw, 0); +MLX5E_DEFINE_STATS_GRP(ipsec_hw, 0); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c new file mode 100644 index 0000000..c941be2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2019 Mellanox Technologies. 
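/*
 * Illustration added for context; it is not part of the original MLNX_OFED
 * sources. The tlsdev_ops callbacks defined below (mlx5e_ktls_add/del/resync)
 * are invoked by the kernel TLS core when userspace installs key material on
 * a socket whose netdev advertises NETIF_F_HW_TLS_TX/RX. A minimal userspace
 * sketch of that trigger path, assuming a uapi that provides <linux/tls.h>
 * and TCP_ULP (SOL_TLS is defined manually in case the libc headers lack it):
 */
#include <sys/socket.h>
#include <netinet/tcp.h>
#include <linux/tls.h>
#include <string.h>

#ifndef SOL_TLS
#define SOL_TLS 282
#endif

static int enable_ktls_tx(int fd, const unsigned char key[16],
			  const unsigned char iv[8],
			  const unsigned char salt[4],
			  const unsigned char rec_seq[8])
{
	struct tls12_crypto_info_aes_gcm_128 ci;

	memset(&ci, 0, sizeof(ci));
	ci.info.version = TLS_1_2_VERSION;
	ci.info.cipher_type = TLS_CIPHER_AES_GCM_128;
	memcpy(ci.key, key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
	memcpy(ci.iv, iv, TLS_CIPHER_AES_GCM_128_IV_SIZE);
	memcpy(ci.salt, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
	memcpy(ci.rec_seq, rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);

	/* Attach the TLS ULP, then install the TX crypto state; with
	 * NETIF_F_HW_TLS_TX set on the netdev, the TLS core reaches the
	 * driver's .tls_dev_add() callback (mlx5e_ktls_add() below).
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")))
		return -1;
	return setsockopt(fd, SOL_TLS, TLS_TX, &ci, sizeof(ci));
}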
+ +#include "en.h" +#include "en_accel/tls.h" +#include "en_accel/ktls.h" +#include "en_accel/ktls_utils.h" +#include "en_accel/fs_tcp.h" + +static int mlx5e_ktls_add(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + if (!mlx5e_ktls_type_check(mdev, crypto_info)) + return -EOPNOTSUPP; + + if (direction == TLS_OFFLOAD_CTX_DIR_TX) + err = mlx5e_ktls_add_tx(netdev, sk, crypto_info, start_offload_tcp_sn); + else + err = mlx5e_ktls_add_rx(netdev, sk, crypto_info, start_offload_tcp_sn); + + return err; +} + +static void mlx5e_ktls_del(struct net_device *netdev, + struct tls_context *tls_ctx, + enum tls_offload_ctx_dir direction) +{ + if (direction == TLS_OFFLOAD_CTX_DIR_TX) + mlx5e_ktls_del_tx(netdev, tls_ctx); + else + mlx5e_ktls_del_rx(netdev, tls_ctx); +} + +static int mlx5e_ktls_resync(struct net_device *netdev, + struct sock *sk, u32 seq, u8 *rcd_sn, + enum tls_offload_ctx_dir direction) +{ + if (unlikely(direction != TLS_OFFLOAD_CTX_DIR_RX)) + return -EOPNOTSUPP; + + mlx5e_ktls_rx_resync(netdev, sk, seq, rcd_sn); + return 0; +} + +static const struct tlsdev_ops mlx5e_ktls_ops = { + .tls_dev_add = mlx5e_ktls_add, + .tls_dev_del = mlx5e_ktls_del, + .tls_dev_resync = mlx5e_ktls_resync, +}; + +bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev) +{ + u8 max_sq_wqebbs = mlx5e_get_max_sq_wqebbs(mdev); + + if (is_kdump_kernel() || !mlx5_accel_is_ktls_rx(mdev)) + return false; + + /* Check the possibility to post the required ICOSQ WQEs. */ + if (WARN_ON_ONCE(max_sq_wqebbs < MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS)) + return false; + if (WARN_ON_ONCE(max_sq_wqebbs < MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS)) + return false; + if (WARN_ON_ONCE(max_sq_wqebbs < MLX5E_KTLS_GET_PROGRESS_WQEBBS)) + return false; + + return true; +} + +void mlx5e_ktls_build_netdev(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + + if (!mlx5e_accel_is_ktls_tx(mdev) && !mlx5e_accel_is_ktls_rx(mdev)) + return; + + if (mlx5e_accel_is_ktls_tx(mdev)) { + netdev->hw_features |= NETIF_F_HW_TLS_TX; + netdev->features |= NETIF_F_HW_TLS_TX; + } + + if (mlx5e_accel_is_ktls_rx(mdev)) + netdev->hw_features |= NETIF_F_HW_TLS_RX; + + netdev->tlsdev_ops = &mlx5e_ktls_ops; +} + +int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err = 0; + + mutex_lock(&priv->state_lock); + if (enable) + err = mlx5e_accel_fs_tcp_create(priv); + else + mlx5e_accel_fs_tcp_destroy(priv); + mutex_unlock(&priv->state_lock); + + return err; +} + +int mlx5e_ktls_init_rx(struct mlx5e_priv *priv) +{ + int err; + + if (!mlx5e_accel_is_ktls_rx(priv->mdev)) + return 0; + + priv->tls->rx_wq = create_singlethread_workqueue("mlx5e_tls_rx"); + if (!priv->tls->rx_wq) + return -ENOMEM; + + if (priv->netdev->features & NETIF_F_HW_TLS_RX) { + err = mlx5e_accel_fs_tcp_create(priv); + if (err) { + destroy_workqueue(priv->tls->rx_wq); + return err; + } + } + + return 0; +} + +void mlx5e_ktls_cleanup_rx(struct mlx5e_priv *priv) +{ + if (!mlx5e_accel_is_ktls_rx(priv->mdev)) + return; + + if (priv->netdev->features & NETIF_F_HW_TLS_RX) + mlx5e_accel_fs_tcp_destroy(priv); + + destroy_workqueue(priv->tls->rx_wq); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h new file mode 100644 index 0000000..6f22c71 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5E_KTLS_H__ +#define __MLX5E_KTLS_H__ + +#include "en.h" + +#ifdef CONFIG_MLX5_EN_TLS + +void mlx5e_ktls_build_netdev(struct mlx5e_priv *priv); +int mlx5e_ktls_init_rx(struct mlx5e_priv *priv); +void mlx5e_ktls_cleanup_rx(struct mlx5e_priv *priv); +int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable); +struct mlx5e_ktls_resync_resp * +mlx5e_ktls_rx_resync_create_resp_list(void); +void mlx5e_ktls_rx_resync_destroy_resp_list(struct mlx5e_ktls_resync_resp *resp_list); + +static inline bool mlx5e_accel_is_ktls_tx(struct mlx5_core_dev *mdev) +{ + return !is_kdump_kernel() && + mlx5_accel_is_ktls_tx(mdev); +} + +bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev); + +static inline bool mlx5e_accel_is_ktls_device(struct mlx5_core_dev *mdev) +{ + return !is_kdump_kernel() && + mlx5_accel_is_ktls_device(mdev); +} + +#else + +static inline void mlx5e_ktls_build_netdev(struct mlx5e_priv *priv) +{ +} + +static inline int mlx5e_ktls_init_rx(struct mlx5e_priv *priv) +{ + return 0; +} + +static inline void mlx5e_ktls_cleanup_rx(struct mlx5e_priv *priv) +{ +} + +static inline int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable) +{ + netdev_warn(netdev, "kTLS is not supported\n"); + return -EOPNOTSUPP; +} + +static inline struct mlx5e_ktls_resync_resp * +mlx5e_ktls_rx_resync_create_resp_list(void) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void +mlx5e_ktls_rx_resync_destroy_resp_list(struct mlx5e_ktls_resync_resp *resp_list) {} + +static inline bool mlx5e_accel_is_ktls_tx(struct mlx5_core_dev *mdev) { return false; } +static inline bool mlx5e_accel_is_ktls_rx(struct mlx5_core_dev *mdev) { return false; } +static inline bool mlx5e_accel_is_ktls_device(struct mlx5_core_dev *mdev) { return false; } + +#endif + +#endif /* __MLX5E_TLS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c new file mode 100644 index 0000000..96064a2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -0,0 +1,747 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2019 Mellanox Technologies. 
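/*
 * Annotation (not part of the original file): rough map of the RX kTLS flow
 * implemented below. mlx5e_ktls_add_rx() imports the key as a DEK, creates a
 * dedicated TIR plus a flow-steering rule for the connection's 5-tuple, and
 * posts static + progress params WQEs on the channel's async ICOSQ so the HW
 * decrypts in-order records. After loss or reordering the CQE reports
 * CQE_TLS_OFFLOAD_RESYNC; resync_update_sn() looks up the socket and queues a
 * GET_PSV query of the HW progress context, and once the SW record number
 * catches up with the HW TCP sequence, resync_handle_seq_match() queues the
 * context so the static params are re-posted from the async ICOSQ NAPI path
 * (mlx5e_ktls_rx_handle_resync_list()).
 */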
+ +#include +#include "en_accel/en_accel.h" +#include "en_accel/tls.h" +#include "en_accel/ktls_txrx.h" +#include "en_accel/ktls_utils.h" +#include "en_accel/fs_tcp.h" + +struct accel_rule { + struct work_struct work; + struct mlx5e_priv *priv; + struct mlx5_flow_handle *rule; +}; + +#define PROGRESS_PARAMS_WRITE_UNIT 64 +#define PROGRESS_PARAMS_PADDED_SIZE \ + (ALIGN(sizeof(struct mlx5_wqe_tls_progress_params_seg), \ + PROGRESS_PARAMS_WRITE_UNIT)) + +struct mlx5e_ktls_rx_resync_buf { + union { + struct mlx5_wqe_tls_progress_params_seg progress; + u8 pad[PROGRESS_PARAMS_PADDED_SIZE]; + } ____cacheline_aligned_in_smp; + dma_addr_t dma_addr; + struct mlx5e_ktls_offload_context_rx *priv_rx; +}; + +enum { + MLX5E_PRIV_RX_FLAG_DELETING, + MLX5E_NUM_PRIV_RX_FLAGS, +}; + +struct mlx5e_ktls_rx_resync_ctx { + struct tls_offload_resync_async core; + struct work_struct work; + struct mlx5e_priv *priv; + refcount_t refcnt; + __be64 sw_rcd_sn_be; + u32 seq; +}; + +struct mlx5e_ktls_offload_context_rx { + struct tls12_crypto_info_aes_gcm_128 crypto_info; + struct accel_rule rule; + struct sock *sk; + struct mlx5e_rq_stats *rq_stats; + struct mlx5e_tls_sw_stats *sw_stats; + struct completion add_ctx; + struct mlx5e_tir tir; + u32 key_id; + u32 rxq; + DECLARE_BITMAP(flags, MLX5E_NUM_PRIV_RX_FLAGS); + + /* resync */ + spinlock_t lock; /* protects resync fields */ + struct mlx5e_ktls_rx_resync_ctx resync; + struct list_head list; +}; + +static bool mlx5e_ktls_priv_rx_put(struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + if (!refcount_dec_and_test(&priv_rx->resync.refcnt)) + return false; + + kfree(priv_rx); + return true; +} + +static void mlx5e_ktls_priv_rx_get(struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + refcount_inc(&priv_rx->resync.refcnt); +} + +struct mlx5e_ktls_resync_resp { + /* protects list changes */ + spinlock_t lock; + struct list_head list; +}; + +void mlx5e_ktls_rx_resync_destroy_resp_list(struct mlx5e_ktls_resync_resp *resp_list) +{ + kvfree(resp_list); +} + +struct mlx5e_ktls_resync_resp * +mlx5e_ktls_rx_resync_create_resp_list(void) +{ + struct mlx5e_ktls_resync_resp *resp_list; + + resp_list = kvzalloc(sizeof(*resp_list), GFP_KERNEL); + if (!resp_list) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&resp_list->list); + spin_lock_init(&resp_list->lock); + + return resp_list; +} + +static void accel_rule_handle_work(struct work_struct *work) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx; + struct accel_rule *accel_rule; + struct mlx5_flow_handle *rule; + + accel_rule = container_of(work, struct accel_rule, work); + priv_rx = container_of(accel_rule, struct mlx5e_ktls_offload_context_rx, rule); + if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) + goto out; + + rule = mlx5e_accel_fs_add_sk(accel_rule->priv, priv_rx->sk, + mlx5e_tir_get_tirn(&priv_rx->tir), + MLX5_FS_DEFAULT_FLOW_TAG); + if (!IS_ERR_OR_NULL(rule)) + accel_rule->rule = rule; +out: + complete(&priv_rx->add_ctx); +} + +static void accel_rule_init(struct accel_rule *rule, struct mlx5e_priv *priv) +{ + INIT_WORK(&rule->work, accel_rule_handle_work); + rule->priv = priv; +} + +static void icosq_fill_wi(struct mlx5e_icosq *sq, u16 pi, + struct mlx5e_icosq_wqe_info *wi) +{ + sq->db.wqe_info[pi] = *wi; +} + +static struct mlx5_wqe_ctrl_seg * +post_static_params(struct mlx5e_icosq *sq, + struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + struct mlx5e_set_tls_static_params_wqe *wqe; + struct mlx5e_icosq_wqe_info wi; + u16 pi, num_wqebbs; + + num_wqebbs = MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS; + if 
(unlikely(!mlx5e_icosq_can_post_wqe(sq, num_wqebbs))) + return ERR_PTR(-ENOSPC); + + pi = mlx5e_icosq_get_next_pi(sq, num_wqebbs); + wqe = MLX5E_TLS_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); + mlx5e_ktls_build_static_params(wqe, sq->pc, sq->sqn, &priv_rx->crypto_info, + mlx5e_tir_get_tirn(&priv_rx->tir), + priv_rx->key_id, priv_rx->resync.seq, false, + TLS_OFFLOAD_CTX_DIR_RX); + wi = (struct mlx5e_icosq_wqe_info) { + .wqe_type = MLX5E_ICOSQ_WQE_UMR_TLS, + .num_wqebbs = num_wqebbs, + .tls_set_params.priv_rx = priv_rx, + }; + icosq_fill_wi(sq, pi, &wi); + sq->pc += num_wqebbs; + + return &wqe->ctrl; +} + +static struct mlx5_wqe_ctrl_seg * +post_progress_params(struct mlx5e_icosq *sq, + struct mlx5e_ktls_offload_context_rx *priv_rx, + u32 next_record_tcp_sn) +{ + struct mlx5e_set_tls_progress_params_wqe *wqe; + struct mlx5e_icosq_wqe_info wi; + u16 pi, num_wqebbs; + + num_wqebbs = MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS; + if (unlikely(!mlx5e_icosq_can_post_wqe(sq, num_wqebbs))) + return ERR_PTR(-ENOSPC); + + pi = mlx5e_icosq_get_next_pi(sq, num_wqebbs); + wqe = MLX5E_TLS_FETCH_SET_PROGRESS_PARAMS_WQE(sq, pi); + mlx5e_ktls_build_progress_params(wqe, sq->pc, sq->sqn, + mlx5e_tir_get_tirn(&priv_rx->tir), + false, next_record_tcp_sn, + TLS_OFFLOAD_CTX_DIR_RX); + wi = (struct mlx5e_icosq_wqe_info) { + .wqe_type = MLX5E_ICOSQ_WQE_SET_PSV_TLS, + .num_wqebbs = num_wqebbs, + .tls_set_params.priv_rx = priv_rx, + }; + + icosq_fill_wi(sq, pi, &wi); + sq->pc += num_wqebbs; + + return &wqe->ctrl; +} + +static int post_rx_param_wqes(struct mlx5e_channel *c, + struct mlx5e_ktls_offload_context_rx *priv_rx, + u32 next_record_tcp_sn) +{ + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5e_icosq *sq; + int err; + + err = 0; + sq = &c->async_icosq; + spin_lock_bh(&c->async_icosq_lock); + + cseg = post_static_params(sq, priv_rx); + if (IS_ERR(cseg)) + goto err_out; + cseg = post_progress_params(sq, priv_rx, next_record_tcp_sn); + if (IS_ERR(cseg)) + goto err_out; + + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); +unlock: + spin_unlock_bh(&c->async_icosq_lock); + + return err; + +err_out: + priv_rx->rq_stats->tls_resync_req_skip++; + err = PTR_ERR(cseg); + complete(&priv_rx->add_ctx); + goto unlock; +} + +static void +mlx5e_set_ktls_rx_priv_ctx(struct tls_context *tls_ctx, + struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + struct mlx5e_ktls_offload_context_rx **ctx = + __tls_driver_ctx(tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); + + BUILD_BUG_ON(sizeof(struct mlx5e_ktls_offload_context_rx *) > + TLS_OFFLOAD_CONTEXT_SIZE_RX); + + *ctx = priv_rx; +} + +static struct mlx5e_ktls_offload_context_rx * +mlx5e_get_ktls_rx_priv_ctx(struct tls_context *tls_ctx) +{ + struct mlx5e_ktls_offload_context_rx **ctx = + __tls_driver_ctx(tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); + + return *ctx; +} + +/* Re-sync */ +/* Runs in work context */ +static int +resync_post_get_progress_params(struct mlx5e_icosq *sq, + struct mlx5e_ktls_offload_context_rx *priv_rx) +{ + struct mlx5e_get_tls_progress_params_wqe *wqe; + struct mlx5e_ktls_rx_resync_buf *buf; + struct mlx5e_icosq_wqe_info wi; + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5_seg_get_psv *psv; + struct device *pdev; + int err; + u16 pi; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (unlikely(!buf)) { + err = -ENOMEM; + goto err_out; + } + + pdev = mlx5_core_dma_dev(sq->channel->priv->mdev); + buf->dma_addr = dma_map_single(pdev, &buf->progress, + PROGRESS_PARAMS_PADDED_SIZE, DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(pdev, buf->dma_addr))) { + err = -ENOMEM; + goto err_free; + } + + 
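	/* Build and post a GET_PSV WQE on the async ICOSQ: the HW DMA-writes
	 * the TLS progress context (record tracker/auth state and
	 * hw_resync_tcp_sn) into buf->progress, and the result is consumed in
	 * mlx5e_ktls_handle_get_psv_completion().
	 */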
buf->priv_rx = priv_rx; + + spin_lock_bh(&sq->channel->async_icosq_lock); + + if (unlikely(!mlx5e_icosq_can_post_wqe(sq, MLX5E_KTLS_GET_PROGRESS_WQEBBS))) { + spin_unlock_bh(&sq->channel->async_icosq_lock); + err = -ENOSPC; + goto err_dma_unmap; + } + + pi = mlx5e_icosq_get_next_pi(sq, MLX5E_KTLS_GET_PROGRESS_WQEBBS); + wqe = MLX5E_TLS_FETCH_GET_PROGRESS_PARAMS_WQE(sq, pi); + +#define GET_PSV_DS_CNT (DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS)) + + cseg = &wqe->ctrl; + cseg->opmod_idx_opcode = + cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_GET_PSV | + (MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS << 24)); + cseg->qpn_ds = + cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | GET_PSV_DS_CNT); + + psv = &wqe->psv; + psv->num_psv = 1 << 4; + psv->l_key = sq->channel->mkey_be; + psv->psv_index[0] = cpu_to_be32(mlx5e_tir_get_tirn(&priv_rx->tir)); + psv->va = cpu_to_be64(buf->dma_addr); + + wi = (struct mlx5e_icosq_wqe_info) { + .wqe_type = MLX5E_ICOSQ_WQE_GET_PSV_TLS, + .num_wqebbs = MLX5E_KTLS_GET_PROGRESS_WQEBBS, + .tls_get_params.buf = buf, + }; + icosq_fill_wi(sq, pi, &wi); + sq->pc++; + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); + spin_unlock_bh(&sq->channel->async_icosq_lock); + + return 0; + +err_dma_unmap: + dma_unmap_single(pdev, buf->dma_addr, PROGRESS_PARAMS_PADDED_SIZE, DMA_FROM_DEVICE); +err_free: + kfree(buf); +err_out: + priv_rx->rq_stats->tls_resync_req_skip++; + return err; +} + +/* Function is called with elevated refcount. + * It decreases it only if no WQE is posted. + */ +static void resync_handle_work(struct work_struct *work) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx; + struct mlx5e_ktls_rx_resync_ctx *resync; + struct mlx5e_channel *c; + struct mlx5e_icosq *sq; + + resync = container_of(work, struct mlx5e_ktls_rx_resync_ctx, work); + priv_rx = container_of(resync, struct mlx5e_ktls_offload_context_rx, resync); + + if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) { + mlx5e_ktls_priv_rx_put(priv_rx); + return; + } + + c = resync->priv->channels.c[priv_rx->rxq]; + sq = &c->async_icosq; + + if (resync_post_get_progress_params(sq, priv_rx)) + mlx5e_ktls_priv_rx_put(priv_rx); +} + +static void resync_init(struct mlx5e_ktls_rx_resync_ctx *resync, + struct mlx5e_priv *priv) +{ + INIT_WORK(&resync->work, resync_handle_work); + resync->priv = priv; + refcount_set(&resync->refcnt, 1); +} + +/* Function can be called with the refcount being either elevated or not. + * It does not affect the refcount. + */ +static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_rx, + struct mlx5e_channel *c) +{ + struct tls12_crypto_info_aes_gcm_128 *info = &priv_rx->crypto_info; + struct mlx5e_ktls_resync_resp *ktls_resync; + struct mlx5e_icosq *sq; + bool trigger_poll; + + sq = &c->async_icosq; + ktls_resync = sq->ktls_resync; + trigger_poll = false; + + spin_lock_bh(&ktls_resync->lock); + spin_lock_bh(&priv_rx->lock); + memcpy(info->rec_seq, &priv_rx->resync.sw_rcd_sn_be, sizeof(info->rec_seq)); + if (list_empty(&priv_rx->list)) { + list_add_tail(&priv_rx->list, &ktls_resync->list); + trigger_poll = !test_and_set_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); + } + spin_unlock_bh(&priv_rx->lock); + spin_unlock_bh(&ktls_resync->lock); + + if (!trigger_poll) + return; + + if (!napi_if_scheduled_mark_missed(&c->napi)) { + spin_lock_bh(&c->async_icosq_lock); + mlx5e_trigger_irq(sq); + spin_unlock_bh(&c->async_icosq_lock); + } +} + +/* Function can be called with the refcount being either elevated or not. 
+ * It decreases the refcount and may free the kTLS priv context. + * Refcount is not elevated only if tls_dev_del has been called, but GET_PSV was + * already in flight. + */ +void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, + struct mlx5e_icosq *sq) +{ + struct mlx5e_ktls_rx_resync_buf *buf = wi->tls_get_params.buf; + struct mlx5e_ktls_offload_context_rx *priv_rx; + struct mlx5e_ktls_rx_resync_ctx *resync; + u8 tracker_state, auth_state, *ctx; + struct device *dev; + u32 hw_seq; + + priv_rx = buf->priv_rx; + resync = &priv_rx->resync; + dev = mlx5_core_dma_dev(resync->priv->mdev); + if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) + goto out; + + dma_sync_single_for_cpu(dev, buf->dma_addr, PROGRESS_PARAMS_PADDED_SIZE, + DMA_FROM_DEVICE); + + ctx = buf->progress.ctx; + tracker_state = MLX5_GET(tls_progress_params, ctx, record_tracker_state); + auth_state = MLX5_GET(tls_progress_params, ctx, auth_state); + if (tracker_state != MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_TRACKING || + auth_state != MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD) { + priv_rx->rq_stats->tls_resync_req_skip++; + goto out; + } + + hw_seq = MLX5_GET(tls_progress_params, ctx, hw_resync_tcp_sn); + tls_offload_rx_resync_async_request_end(priv_rx->sk, cpu_to_be32(hw_seq)); + priv_rx->rq_stats->tls_resync_req_end++; +out: + mlx5e_ktls_priv_rx_put(priv_rx); + dma_unmap_single(dev, buf->dma_addr, PROGRESS_PARAMS_PADDED_SIZE, DMA_FROM_DEVICE); + kfree(buf); +} + +/* Runs in NAPI. + * Function elevates the refcount, unless no work is queued. + */ +static bool resync_queue_get_psv(struct sock *sk) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx; + struct mlx5e_ktls_rx_resync_ctx *resync; + + priv_rx = mlx5e_get_ktls_rx_priv_ctx(tls_get_ctx(sk)); + if (unlikely(!priv_rx)) + return false; + + if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) + return false; + + resync = &priv_rx->resync; + mlx5e_ktls_priv_rx_get(priv_rx); + if (unlikely(!queue_work(resync->priv->tls->rx_wq, &resync->work))) + mlx5e_ktls_priv_rx_put(priv_rx); + + return true; +} + +/* Runs in NAPI */ +static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)(skb->data); + struct net_device *netdev = rq->netdev; + struct sock *sk = NULL; + unsigned int datalen; + struct iphdr *iph; + struct tcphdr *th; + __be32 seq; + int depth = 0; + + __vlan_get_protocol(skb, eth->h_proto, &depth); + iph = (struct iphdr *)(skb->data + depth); + + if (iph->version == 4) { + depth += sizeof(struct iphdr); + th = (void *)iph + sizeof(struct iphdr); + + sk = inet_lookup_established(dev_net(netdev), &tcp_hashinfo, + iph->saddr, th->source, iph->daddr, + th->dest, netdev->ifindex); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)iph; + + depth += sizeof(struct ipv6hdr); + th = (void *)ipv6h + sizeof(struct ipv6hdr); + + sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo, + &ipv6h->saddr, th->source, + &ipv6h->daddr, ntohs(th->dest), + netdev->ifindex, 0); +#endif + } + + depth += sizeof(struct tcphdr); + + if (unlikely(!sk)) + return; + + if (unlikely(sk->sk_state == TCP_TIME_WAIT)) + goto unref; + + if (unlikely(!resync_queue_get_psv(sk))) + goto unref; + + seq = th->seq; + datalen = skb->len - depth; + tls_offload_rx_resync_async_request_start(sk, seq, datalen); + rq->stats->tls_resync_req_start++; + +unref: + sock_gen_put(sk); +} + +void mlx5e_ktls_rx_resync(struct net_device *netdev, struct sock *sk, + 
u32 seq, u8 *rcd_sn) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx; + struct mlx5e_ktls_rx_resync_ctx *resync; + struct mlx5e_priv *priv; + struct mlx5e_channel *c; + + priv_rx = mlx5e_get_ktls_rx_priv_ctx(tls_get_ctx(sk)); + if (unlikely(!priv_rx)) + return; + + resync = &priv_rx->resync; + resync->sw_rcd_sn_be = *(__be64 *)rcd_sn; + resync->seq = seq; + + priv = netdev_priv(netdev); + c = priv->channels.c[priv_rx->rxq]; + + resync_handle_seq_match(priv_rx, c); +} + +/* End of resync section */ + +void mlx5e_ktls_handle_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 *cqe_bcnt) +{ + struct mlx5e_rq_stats *stats = rq->stats; + + switch (get_cqe_tls_offload(cqe)) { + case CQE_TLS_OFFLOAD_DECRYPTED: + skb->decrypted = 1; + stats->tls_decrypted_packets++; + stats->tls_decrypted_bytes += *cqe_bcnt; + break; + case CQE_TLS_OFFLOAD_RESYNC: + stats->tls_resync_req_pkt++; + resync_update_sn(rq, skb); + break; + default: /* CQE_TLS_OFFLOAD_ERROR: */ + stats->tls_err++; + break; + } +} + +void mlx5e_ktls_handle_ctx_completion(struct mlx5e_icosq_wqe_info *wi) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx = wi->tls_set_params.priv_rx; + struct accel_rule *rule = &priv_rx->rule; + + if (unlikely(test_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags))) { + complete(&priv_rx->add_ctx); + return; + } + queue_work(rule->priv->tls->rx_wq, &rule->work); +} + +static int mlx5e_ktls_sk_get_rxq(struct sock *sk) +{ + int rxq = sk_rx_queue_get(sk); + + if (unlikely(rxq == -1)) + rxq = 0; + + return rxq; +} + +int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx; + struct mlx5e_ktls_rx_resync_ctx *resync; + struct tls_context *tls_ctx; + struct mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + int rxq, err; + + tls_ctx = tls_get_ctx(sk); + priv = netdev_priv(netdev); + mdev = priv->mdev; + priv_rx = kzalloc(sizeof(*priv_rx), GFP_KERNEL); + if (unlikely(!priv_rx)) + return -ENOMEM; + + err = mlx5_ktls_create_key(mdev, crypto_info, &priv_rx->key_id); + if (err) + goto err_create_key; + + INIT_LIST_HEAD(&priv_rx->list); + spin_lock_init(&priv_rx->lock); + priv_rx->crypto_info = + *(struct tls12_crypto_info_aes_gcm_128 *)crypto_info; + + rxq = mlx5e_ktls_sk_get_rxq(sk); + priv_rx->rxq = rxq; + priv_rx->sk = sk; + + priv_rx->rq_stats = &priv->channel_stats[rxq]->rq; + priv_rx->sw_stats = &priv->tls->sw_stats; + mlx5e_set_ktls_rx_priv_ctx(tls_ctx, priv_rx); + + err = mlx5e_rx_res_tls_tir_create(priv->rx_res, rxq, &priv_rx->tir); + if (err) + goto err_create_tir; + + init_completion(&priv_rx->add_ctx); + + accel_rule_init(&priv_rx->rule, priv); + resync = &priv_rx->resync; + resync_init(resync, priv); + tls_offload_ctx_rx(tls_ctx)->resync_async = &resync->core; + tls_offload_rx_resync_set_type(sk, TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC); + + err = post_rx_param_wqes(priv->channels.c[rxq], priv_rx, start_offload_tcp_sn); + if (err) + goto err_post_wqes; + + atomic64_inc(&priv_rx->sw_stats->rx_tls_ctx); + + return 0; + +err_post_wqes: + mlx5e_tir_destroy(&priv_rx->tir); +err_create_tir: + mlx5_ktls_destroy_key(mdev, priv_rx->key_id); +err_create_key: + kfree(priv_rx); + return err; +} + +void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx; + struct mlx5e_ktls_rx_resync_ctx *resync; + struct mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + + priv = netdev_priv(netdev); + mdev = 
priv->mdev; + + priv_rx = mlx5e_get_ktls_rx_priv_ctx(tls_ctx); + set_bit(MLX5E_PRIV_RX_FLAG_DELETING, priv_rx->flags); + mlx5e_set_ktls_rx_priv_ctx(tls_ctx, NULL); + synchronize_net(); /* Sync with NAPI */ + if (!cancel_work_sync(&priv_rx->rule.work)) + /* completion is needed, as the priv_rx in the add flow + * is maintained on the wqe info (wi), not on the socket. + */ + wait_for_completion(&priv_rx->add_ctx); + resync = &priv_rx->resync; + if (cancel_work_sync(&resync->work)) + mlx5e_ktls_priv_rx_put(priv_rx); + + atomic64_inc(&priv_rx->sw_stats->rx_tls_del); + if (priv_rx->rule.rule) + mlx5e_accel_fs_del_sk(priv_rx->rule.rule); + + mlx5e_tir_destroy(&priv_rx->tir); + mlx5_ktls_destroy_key(mdev, priv_rx->key_id); + /* priv_rx should normally be freed here, but if there is an outstanding + * GET_PSV, deallocation will be delayed until the CQE for GET_PSV is + * processed. + */ + mlx5e_ktls_priv_rx_put(priv_rx); +} + +bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) +{ + struct mlx5e_ktls_offload_context_rx *priv_rx, *tmp; + struct mlx5e_ktls_resync_resp *ktls_resync; + struct mlx5_wqe_ctrl_seg *db_cseg; + struct mlx5e_icosq *sq; + LIST_HEAD(local_list); + int i, j; + + sq = &c->async_icosq; + + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) + return false; + + ktls_resync = sq->ktls_resync; + db_cseg = NULL; + i = 0; + + spin_lock(&ktls_resync->lock); + list_for_each_entry_safe(priv_rx, tmp, &ktls_resync->list, list) { + list_move(&priv_rx->list, &local_list); + if (++i == budget) + break; + } + if (list_empty(&ktls_resync->list)) + clear_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); + spin_unlock(&ktls_resync->lock); + + spin_lock(&c->async_icosq_lock); + for (j = 0; j < i; j++) { + struct mlx5_wqe_ctrl_seg *cseg; + + priv_rx = list_first_entry(&local_list, + struct mlx5e_ktls_offload_context_rx, + list); + spin_lock(&priv_rx->lock); + cseg = post_static_params(sq, priv_rx); + if (IS_ERR(cseg)) { + spin_unlock(&priv_rx->lock); + break; + } + list_del_init(&priv_rx->list); + spin_unlock(&priv_rx->lock); + db_cseg = cseg; + } + if (db_cseg) + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, db_cseg); + spin_unlock(&c->async_icosq_lock); + + priv_rx->rq_stats->tls_resync_res_ok += j; + + if (!list_empty(&local_list)) { + /* This happens only if ICOSQ is full. + * There is no need to mark busy or explicitly ask for a NAPI cycle, + * it will be triggered by the outstanding ICOSQ completions. + */ + spin_lock(&ktls_resync->lock); + list_splice(&local_list, &ktls_resync->list); + set_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); + spin_unlock(&ktls_resync->lock); + priv_rx->rq_stats->tls_resync_res_retry++; + } + + return i == budget; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c new file mode 100644 index 0000000..aaf11c6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -0,0 +1,496 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2019 Mellanox Technologies. 
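/*
 * Annotation (not part of the original file): rough map of the TX kTLS flow
 * implemented below. mlx5e_ktls_add_tx() imports the key as a DEK and creates
 * a TLS-enabled TIS per connection. In the send path,
 * mlx5e_ktls_handle_tx_skb() posts the static/progress params once per
 * connection and tracks the next expected TCP sequence; an out-of-order skb
 * (typically a retransmission) goes through mlx5e_ktls_tx_handle_ooo(), which
 * re-posts the params with the record number of the affected TLS record and
 * replays the already-sent part of that record to the HW via DUMP WQEs, so
 * the AES-GCM state is rebuilt before the retransmitted payload is placed on
 * the wire.
 */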
+ +#include "en_accel/tls.h" +#include "en_accel/ktls_txrx.h" +#include "en_accel/ktls_utils.h" + +struct mlx5e_dump_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_data_seg data; +}; + +#define MLX5E_KTLS_DUMP_WQEBBS \ + (DIV_ROUND_UP(sizeof(struct mlx5e_dump_wqe), MLX5_SEND_WQE_BB)) + +static u8 +mlx5e_ktls_dumps_num_wqes(struct mlx5e_params *params, unsigned int nfrags, + unsigned int sync_len) +{ + /* Given the MTU and sync_len, calculates an upper bound for the + * number of DUMP WQEs needed for the TX resync of a record. + */ + return nfrags + DIV_ROUND_UP(sync_len, MLX5E_SW2HW_MTU(params, params->sw_mtu)); +} + +u16 mlx5e_ktls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + u16 num_dumps, stop_room = 0; + + if (!mlx5e_accel_is_ktls_tx(mdev)) + return 0; + + num_dumps = mlx5e_ktls_dumps_num_wqes(params, MAX_SKB_FRAGS, TLS_MAX_PAYLOAD_SIZE); + + stop_room += mlx5e_stop_room_for_wqe(mdev, MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS); + stop_room += mlx5e_stop_room_for_wqe(mdev, MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS); + stop_room += num_dumps * mlx5e_stop_room_for_wqe(mdev, MLX5E_KTLS_DUMP_WQEBBS); + + return stop_room; +} + +static int mlx5e_ktls_create_tis(struct mlx5_core_dev *mdev, u32 *tisn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; + void *tisc; + + tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + MLX5_SET(tisc, tisc, tls_en, 1); + + return mlx5e_create_tis(mdev, in, tisn); +} + +struct mlx5e_ktls_offload_context_tx { + struct tls_offload_context_tx *tx_ctx; + struct tls12_crypto_info_aes_gcm_128 crypto_info; + struct mlx5e_tls_sw_stats *sw_stats; + u32 expected_seq; + u32 tisn; + u32 key_id; + bool ctx_post_pending; +}; + +static void +mlx5e_set_ktls_tx_priv_ctx(struct tls_context *tls_ctx, + struct mlx5e_ktls_offload_context_tx *priv_tx) +{ + struct mlx5e_ktls_offload_context_tx **ctx = + __tls_driver_ctx(tls_ctx, TLS_OFFLOAD_CTX_DIR_TX); + + BUILD_BUG_ON(sizeof(struct mlx5e_ktls_offload_context_tx *) > + TLS_OFFLOAD_CONTEXT_SIZE_TX); + + *ctx = priv_tx; +} + +static struct mlx5e_ktls_offload_context_tx * +mlx5e_get_ktls_tx_priv_ctx(struct tls_context *tls_ctx) +{ + struct mlx5e_ktls_offload_context_tx **ctx = + __tls_driver_ctx(tls_ctx, TLS_OFFLOAD_CTX_DIR_TX); + + return *ctx; +} + +int mlx5e_ktls_add_tx(struct net_device *netdev, struct sock *sk, + struct tls_crypto_info *crypto_info, u32 start_offload_tcp_sn) +{ + struct mlx5e_ktls_offload_context_tx *priv_tx; + struct tls_context *tls_ctx; + struct mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + int err; + + tls_ctx = tls_get_ctx(sk); + priv = netdev_priv(netdev); + mdev = priv->mdev; + + priv_tx = kzalloc(sizeof(*priv_tx), GFP_KERNEL); + if (!priv_tx) + return -ENOMEM; + + err = mlx5_ktls_create_key(mdev, crypto_info, &priv_tx->key_id); + if (err) + goto err_create_key; + + priv_tx->sw_stats = &priv->tls->sw_stats; + priv_tx->expected_seq = start_offload_tcp_sn; + priv_tx->crypto_info = + *(struct tls12_crypto_info_aes_gcm_128 *)crypto_info; + priv_tx->tx_ctx = tls_offload_ctx_tx(tls_ctx); + + mlx5e_set_ktls_tx_priv_ctx(tls_ctx, priv_tx); + + err = mlx5e_ktls_create_tis(mdev, &priv_tx->tisn); + if (err) + goto err_create_tis; + + priv_tx->ctx_post_pending = true; + atomic64_inc(&priv_tx->sw_stats->tx_tls_ctx); + + return 0; + +err_create_tis: + mlx5_ktls_destroy_key(mdev, priv_tx->key_id); +err_create_key: + kfree(priv_tx); + return err; +} + +void mlx5e_ktls_del_tx(struct net_device *netdev, struct tls_context *tls_ctx) +{ + struct mlx5e_ktls_offload_context_tx *priv_tx; + struct 
mlx5_core_dev *mdev; + struct mlx5e_priv *priv; + + priv_tx = mlx5e_get_ktls_tx_priv_ctx(tls_ctx); + priv = netdev_priv(netdev); + mdev = priv->mdev; + + atomic64_inc(&priv_tx->sw_stats->tx_tls_del); + mlx5e_destroy_tis(mdev, priv_tx->tisn); + mlx5_ktls_destroy_key(mdev, priv_tx->key_id); + kfree(priv_tx); +} + +static void tx_fill_wi(struct mlx5e_txqsq *sq, + u16 pi, u8 num_wqebbs, u32 num_bytes, + struct page *page) +{ + struct mlx5e_tx_wqe_info *wi = &sq->db.wqe_info[pi]; + + *wi = (struct mlx5e_tx_wqe_info) { + .num_wqebbs = num_wqebbs, + .num_bytes = num_bytes, + .resync_dump_frag_page = page, + }; +} + +static bool +mlx5e_ktls_tx_offload_test_and_clear_pending(struct mlx5e_ktls_offload_context_tx *priv_tx) +{ + bool ret = priv_tx->ctx_post_pending; + + priv_tx->ctx_post_pending = false; + + return ret; +} + +static void +post_static_params(struct mlx5e_txqsq *sq, + struct mlx5e_ktls_offload_context_tx *priv_tx, + bool fence) +{ + struct mlx5e_set_tls_static_params_wqe *wqe; + u16 pi, num_wqebbs; + + num_wqebbs = MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS; + pi = mlx5e_txqsq_get_next_pi(sq, num_wqebbs); + wqe = MLX5E_TLS_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); + mlx5e_ktls_build_static_params(wqe, sq->pc, sq->sqn, &priv_tx->crypto_info, + priv_tx->tisn, priv_tx->key_id, 0, fence, + TLS_OFFLOAD_CTX_DIR_TX); + tx_fill_wi(sq, pi, num_wqebbs, 0, NULL); + sq->pc += num_wqebbs; +} + +static void +post_progress_params(struct mlx5e_txqsq *sq, + struct mlx5e_ktls_offload_context_tx *priv_tx, + bool fence) +{ + struct mlx5e_set_tls_progress_params_wqe *wqe; + u16 pi, num_wqebbs; + + num_wqebbs = MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS; + pi = mlx5e_txqsq_get_next_pi(sq, num_wqebbs); + wqe = MLX5E_TLS_FETCH_SET_PROGRESS_PARAMS_WQE(sq, pi); + mlx5e_ktls_build_progress_params(wqe, sq->pc, sq->sqn, priv_tx->tisn, fence, 0, + TLS_OFFLOAD_CTX_DIR_TX); + tx_fill_wi(sq, pi, num_wqebbs, 0, NULL); + sq->pc += num_wqebbs; +} + +static void +mlx5e_ktls_tx_post_param_wqes(struct mlx5e_txqsq *sq, + struct mlx5e_ktls_offload_context_tx *priv_tx, + bool skip_static_post, bool fence_first_post) +{ + bool progress_fence = skip_static_post || !fence_first_post; + + if (!skip_static_post) + post_static_params(sq, priv_tx, fence_first_post); + + post_progress_params(sq, priv_tx, progress_fence); +} + +struct tx_sync_info { + u64 rcd_sn; + u32 sync_len; + int nr_frags; + skb_frag_t frags[MAX_SKB_FRAGS]; +}; + +enum mlx5e_ktls_sync_retval { + MLX5E_KTLS_SYNC_DONE, + MLX5E_KTLS_SYNC_FAIL, + MLX5E_KTLS_SYNC_SKIP_NO_DATA, +}; + +static enum mlx5e_ktls_sync_retval +tx_sync_info_get(struct mlx5e_ktls_offload_context_tx *priv_tx, + u32 tcp_seq, int datalen, struct tx_sync_info *info) +{ + struct tls_offload_context_tx *tx_ctx = priv_tx->tx_ctx; + enum mlx5e_ktls_sync_retval ret = MLX5E_KTLS_SYNC_DONE; + struct tls_record_info *record; + int remaining, i = 0; + unsigned long flags; + bool ends_before; + + spin_lock_irqsave(&tx_ctx->lock, flags); + record = tls_get_record(tx_ctx, tcp_seq, &info->rcd_sn); + + if (unlikely(!record)) { + ret = MLX5E_KTLS_SYNC_FAIL; + goto out; + } + + /* There are the following cases: + * 1. packet ends before start marker: bypass offload. + * 2. packet starts before start marker and ends after it: drop, + * not supported, breaks contract with kernel. + * 3. packet ends before tls record info starts: drop, + * this packet was already acknowledged and its record info + * was released. 
+ */ + ends_before = before(tcp_seq + datalen - 1, tls_record_start_seq(record)); + + if (unlikely(tls_record_is_start_marker(record))) { + ret = ends_before ? MLX5E_KTLS_SYNC_SKIP_NO_DATA : MLX5E_KTLS_SYNC_FAIL; + goto out; + } else if (ends_before) { + ret = MLX5E_KTLS_SYNC_FAIL; + goto out; + } + + info->sync_len = tcp_seq - tls_record_start_seq(record); + remaining = info->sync_len; + while (remaining > 0) { + skb_frag_t *frag = &record->frags[i]; + + get_page(skb_frag_page(frag)); + remaining -= skb_frag_size(frag); + info->frags[i++] = *frag; + } + /* reduce the part which will be sent with the original SKB */ + if (remaining < 0) + skb_frag_size_add(&info->frags[i - 1], remaining); + info->nr_frags = i; +out: + spin_unlock_irqrestore(&tx_ctx->lock, flags); + return ret; +} + +static void +tx_post_resync_params(struct mlx5e_txqsq *sq, + struct mlx5e_ktls_offload_context_tx *priv_tx, + u64 rcd_sn) +{ + struct tls12_crypto_info_aes_gcm_128 *info = &priv_tx->crypto_info; + __be64 rn_be = cpu_to_be64(rcd_sn); + bool skip_static_post; + u16 rec_seq_sz; + char *rec_seq; + + rec_seq = info->rec_seq; + rec_seq_sz = sizeof(info->rec_seq); + + skip_static_post = !memcmp(rec_seq, &rn_be, rec_seq_sz); + if (!skip_static_post) + memcpy(rec_seq, &rn_be, rec_seq_sz); + + mlx5e_ktls_tx_post_param_wqes(sq, priv_tx, skip_static_post, true); +} + +static int +tx_post_resync_dump(struct mlx5e_txqsq *sq, skb_frag_t *frag, u32 tisn, bool first) +{ + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5_wqe_data_seg *dseg; + struct mlx5e_dump_wqe *wqe; + dma_addr_t dma_addr = 0; + u16 ds_cnt; + int fsz; + u16 pi; + + BUILD_BUG_ON(MLX5E_KTLS_DUMP_WQEBBS != 1); + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + wqe = MLX5E_TLS_FETCH_DUMP_WQE(sq, pi); + + ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; + + cseg = &wqe->ctrl; + dseg = &wqe->data; + + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_DUMP); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); + cseg->tis_tir_num = cpu_to_be32(tisn << 8); + cseg->fm_ce_se = first ? 
MLX5_FENCE_MODE_INITIATOR_SMALL : 0; + + fsz = skb_frag_size(frag); + dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) + return -ENOMEM; + + dseg->addr = cpu_to_be64(dma_addr); + dseg->lkey = sq->mkey_be; + dseg->byte_count = cpu_to_be32(fsz); + mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE); + + tx_fill_wi(sq, pi, MLX5E_KTLS_DUMP_WQEBBS, fsz, skb_frag_page(frag)); + sq->pc += MLX5E_KTLS_DUMP_WQEBBS; + + return 0; +} + +void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc) +{ + struct mlx5e_sq_stats *stats; + struct mlx5e_sq_dma *dma; + + dma = mlx5e_dma_get(sq, (*dma_fifo_cc)++); + stats = sq->stats; + + mlx5e_tx_dma_unmap(sq->pdev, dma); + put_page(wi->resync_dump_frag_page); + stats->tls_dump_packets++; + stats->tls_dump_bytes += wi->num_bytes; +} + +static void tx_post_fence_nop(struct mlx5e_txqsq *sq) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + + tx_fill_wi(sq, pi, 1, 0, NULL); + + mlx5e_post_nop_fence(wq, sq->sqn, &sq->pc); +} + +static enum mlx5e_ktls_sync_retval +mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, + struct mlx5e_txqsq *sq, + int datalen, + u32 seq) +{ + struct mlx5e_sq_stats *stats = sq->stats; + enum mlx5e_ktls_sync_retval ret; + struct tx_sync_info info = {}; + int i = 0; + + ret = tx_sync_info_get(priv_tx, seq, datalen, &info); + if (unlikely(ret != MLX5E_KTLS_SYNC_DONE)) { + if (ret == MLX5E_KTLS_SYNC_SKIP_NO_DATA) { + stats->tls_skip_no_sync_data++; + return MLX5E_KTLS_SYNC_SKIP_NO_DATA; + } + /* We might get here if a retransmission reaches the driver + * after the relevant record is acked. + * It should be safe to drop the packet in this case + */ + stats->tls_drop_no_sync_data++; + goto err_out; + } + + stats->tls_ooo++; + + tx_post_resync_params(sq, priv_tx, info.rcd_sn); + + /* If no dump WQE was sent, we need to have a fence NOP WQE before the + * actual data xmit. + */ + if (!info.nr_frags) { + tx_post_fence_nop(sq); + return MLX5E_KTLS_SYNC_DONE; + } + + for (; i < info.nr_frags; i++) { + unsigned int orig_fsz, frag_offset = 0, n = 0; + skb_frag_t *f = &info.frags[i]; + + orig_fsz = skb_frag_size(f); + + do { + bool fence = !(i || frag_offset); + unsigned int fsz; + + n++; + fsz = min_t(unsigned int, sq->hw_mtu, orig_fsz - frag_offset); + skb_frag_size_set(f, fsz); + if (tx_post_resync_dump(sq, f, priv_tx->tisn, fence)) { + page_ref_add(skb_frag_page(f), n - 1); + goto err_out; + } + + skb_frag_off_add(f, fsz); + frag_offset += fsz; + } while (frag_offset < orig_fsz); + + page_ref_add(skb_frag_page(f), n - 1); + } + + return MLX5E_KTLS_SYNC_DONE; + +err_out: + for (; i < info.nr_frags; i++) + /* The put_page() here undoes the page ref obtained in tx_sync_info_get(). + * Page refs obtained for the DUMP WQEs above (by page_ref_add) will be + * released only upon their completions (or in mlx5e_free_txqsq_descs, + * if channel closes). 
+ */ + put_page(skb_frag_page(&info.frags[i])); + + return MLX5E_KTLS_SYNC_FAIL; +} + +bool mlx5e_ktls_handle_tx_skb(struct tls_context *tls_ctx, struct mlx5e_txqsq *sq, + struct sk_buff *skb, int datalen, + struct mlx5e_accel_tx_tls_state *state) +{ + struct mlx5e_ktls_offload_context_tx *priv_tx; + struct mlx5e_sq_stats *stats = sq->stats; + u32 seq; + + priv_tx = mlx5e_get_ktls_tx_priv_ctx(tls_ctx); + + if (unlikely(mlx5e_ktls_tx_offload_test_and_clear_pending(priv_tx))) { + mlx5e_ktls_tx_post_param_wqes(sq, priv_tx, false, false); + } + + seq = ntohl(tcp_hdr(skb)->seq); + if (unlikely(priv_tx->expected_seq != seq)) { + enum mlx5e_ktls_sync_retval ret = + mlx5e_ktls_tx_handle_ooo(priv_tx, sq, datalen, seq); + + switch (ret) { + case MLX5E_KTLS_SYNC_DONE: + break; + case MLX5E_KTLS_SYNC_SKIP_NO_DATA: + if (likely(!skb->decrypted)) + goto out; + WARN_ON_ONCE(1); + fallthrough; + case MLX5E_KTLS_SYNC_FAIL: + goto err_out; + } + } + + priv_tx->expected_seq = seq + datalen; + + state->tls_tisn = priv_tx->tisn; + + stats->tls_encrypted_packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; + stats->tls_encrypted_bytes += datalen; + +out: + return true; + +err_out: + dev_kfree_skb_any(skb); + return false; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c new file mode 100644 index 0000000..ac29aeb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#include "en_accel/ktls_txrx.h" +#include "en_accel/ktls_utils.h" + +enum { + MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_2 = 0x2, +}; + +enum { + MLX5E_ENCRYPTION_STANDARD_TLS = 0x1, +}; + +#define EXTRACT_INFO_FIELDS do { \ + salt = info->salt; \ + rec_seq = info->rec_seq; \ + salt_sz = sizeof(info->salt); \ + rec_seq_sz = sizeof(info->rec_seq); \ +} while (0) + +static void +fill_static_params(struct mlx5_wqe_tls_static_params_seg *params, + struct tls12_crypto_info_aes_gcm_128 *info, + u32 key_id, u32 resync_tcp_sn) +{ + char *initial_rn, *gcm_iv; + u16 salt_sz, rec_seq_sz; + char *salt, *rec_seq; + u8 tls_version; + u8 *ctx; + + ctx = params->ctx; + + EXTRACT_INFO_FIELDS; + + gcm_iv = MLX5_ADDR_OF(tls_static_params, ctx, gcm_iv); + initial_rn = MLX5_ADDR_OF(tls_static_params, ctx, initial_record_number); + + memcpy(gcm_iv, salt, salt_sz); + memcpy(initial_rn, rec_seq, rec_seq_sz); + + tls_version = MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_2; + + MLX5_SET(tls_static_params, ctx, tls_version, tls_version); + MLX5_SET(tls_static_params, ctx, const_1, 1); + MLX5_SET(tls_static_params, ctx, const_2, 2); + MLX5_SET(tls_static_params, ctx, encryption_standard, + MLX5E_ENCRYPTION_STANDARD_TLS); + MLX5_SET(tls_static_params, ctx, resync_tcp_sn, resync_tcp_sn); + MLX5_SET(tls_static_params, ctx, dek_index, key_id); +} + +void +mlx5e_ktls_build_static_params(struct mlx5e_set_tls_static_params_wqe *wqe, + u16 pc, u32 sqn, + struct tls12_crypto_info_aes_gcm_128 *info, + u32 tis_tir_num, u32 key_id, u32 resync_tcp_sn, + bool fence, enum tls_offload_ctx_dir direction) +{ + struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + u8 opmod = direction == TLS_OFFLOAD_CTX_DIR_TX ? 
+ MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS : + MLX5_OPC_MOD_TLS_TIR_STATIC_PARAMS; + +#define STATIC_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) + + cseg->opmod_idx_opcode = cpu_to_be32((pc << 8) | MLX5_OPCODE_UMR | (opmod << 24)); + cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | + STATIC_PARAMS_DS_CNT); + cseg->fm_ce_se = fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; + cseg->tis_tir_num = cpu_to_be32(tis_tir_num << 8); + + ucseg->flags = MLX5_UMR_INLINE; + ucseg->bsf_octowords = cpu_to_be16(MLX5_ST_SZ_BYTES(tls_static_params) / 16); + + fill_static_params(&wqe->params, info, key_id, resync_tcp_sn); +} + +static void +fill_progress_params(struct mlx5_wqe_tls_progress_params_seg *params, u32 tis_tir_num, + u32 next_record_tcp_sn) +{ + u8 *ctx = params->ctx; + + params->tis_tir_num = cpu_to_be32(tis_tir_num); + + MLX5_SET(tls_progress_params, ctx, next_record_tcp_sn, + next_record_tcp_sn); + MLX5_SET(tls_progress_params, ctx, record_tracker_state, + MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START); + MLX5_SET(tls_progress_params, ctx, auth_state, + MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD); +} + +void +mlx5e_ktls_build_progress_params(struct mlx5e_set_tls_progress_params_wqe *wqe, + u16 pc, u32 sqn, + u32 tis_tir_num, bool fence, + u32 next_record_tcp_sn, + enum tls_offload_ctx_dir direction) +{ + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + u8 opmod = direction == TLS_OFFLOAD_CTX_DIR_TX ? + MLX5_OPC_MOD_TLS_TIS_PROGRESS_PARAMS : + MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS; + +#define PROGRESS_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) + + cseg->opmod_idx_opcode = + cpu_to_be32((pc << 8) | MLX5_OPCODE_SET_PSV | (opmod << 24)); + cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | + PROGRESS_PARAMS_DS_CNT); + cseg->fm_ce_se = fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; + + fill_progress_params(&wqe->params, tis_tir_num, next_record_tcp_sn); +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h new file mode 100644 index 0000000..08c9d51 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#ifndef __MLX5E_KTLS_TXRX_H__ +#define __MLX5E_KTLS_TXRX_H__ + +#ifdef CONFIG_MLX5_EN_TLS + +#include +#include "en.h" +#include "en/txrx.h" + +struct mlx5e_accel_tx_tls_state { + u32 tls_tisn; +}; + +u16 mlx5e_ktls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params); + +bool mlx5e_ktls_handle_tx_skb(struct tls_context *tls_ctx, struct mlx5e_txqsq *sq, + struct sk_buff *skb, int datalen, + struct mlx5e_accel_tx_tls_state *state); +void mlx5e_ktls_handle_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 *cqe_bcnt); + +void mlx5e_ktls_handle_ctx_completion(struct mlx5e_icosq_wqe_info *wi); +void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, + struct mlx5e_icosq *sq); + +void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc); +static inline bool +mlx5e_ktls_tx_try_handle_resync_dump_comp(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc) +{ + if (unlikely(wi->resync_dump_frag_page)) { + mlx5e_ktls_tx_handle_resync_dump_comp(sq, wi, dma_fifo_cc); + return true; + } + return false; +} + +bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget); + +static inline bool +mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) +{ + return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &c->async_icosq.state); +} +#else +static inline bool +mlx5e_ktls_tx_try_handle_resync_dump_comp(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc) +{ + return false; +} + +static inline bool +mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) +{ + return false; +} + +static inline bool +mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) +{ + return false; +} + +#endif /* CONFIG_MLX5_EN_TLS */ + +#endif /* __MLX5E_TLS_TXRX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h new file mode 100644 index 0000000..e5c180f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#ifndef __MLX5E_KTLS_UTILS_H__ +#define __MLX5E_KTLS_UTILS_H__ + +#include +#include "en.h" +#include "accel/tls.h" + +enum { + MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD = 0, + MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_OFFLOAD = 1, + MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_AUTHENTICATION = 2, +}; + +enum { + MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_START = 0, + MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_TRACKING = 1, + MLX5E_TLS_PROGRESS_PARAMS_RECORD_TRACKER_STATE_SEARCHING = 2, +}; + +int mlx5e_ktls_add_tx(struct net_device *netdev, struct sock *sk, + struct tls_crypto_info *crypto_info, u32 start_offload_tcp_sn); +void mlx5e_ktls_del_tx(struct net_device *netdev, struct tls_context *tls_ctx); +int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, + struct tls_crypto_info *crypto_info, u32 start_offload_tcp_sn); +void mlx5e_ktls_del_rx(struct net_device *netdev, struct tls_context *tls_ctx); +void mlx5e_ktls_rx_resync(struct net_device *netdev, struct sock *sk, u32 seq, u8 *rcd_sn); + +struct mlx5e_set_tls_static_params_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_umr_ctrl_seg uctrl; + struct mlx5_mkey_seg mkc; + struct mlx5_wqe_tls_static_params_seg params; +}; + +struct mlx5e_set_tls_progress_params_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_tls_progress_params_seg params; +}; + +struct mlx5e_get_tls_progress_params_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_seg_get_psv psv; +}; + +#define MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS \ + (DIV_ROUND_UP(sizeof(struct mlx5e_set_tls_static_params_wqe), MLX5_SEND_WQE_BB)) + +#define MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS \ + (DIV_ROUND_UP(sizeof(struct mlx5e_set_tls_progress_params_wqe), MLX5_SEND_WQE_BB)) + +#define MLX5E_KTLS_GET_PROGRESS_WQEBBS \ + (DIV_ROUND_UP(sizeof(struct mlx5e_get_tls_progress_params_wqe), MLX5_SEND_WQE_BB)) + +#define MLX5E_TLS_FETCH_SET_STATIC_PARAMS_WQE(sq, pi) \ + ((struct mlx5e_set_tls_static_params_wqe *)\ + mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_set_tls_static_params_wqe))) + +#define MLX5E_TLS_FETCH_SET_PROGRESS_PARAMS_WQE(sq, pi) \ + ((struct mlx5e_set_tls_progress_params_wqe *)\ + mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_set_tls_progress_params_wqe))) + +#define MLX5E_TLS_FETCH_GET_PROGRESS_PARAMS_WQE(sq, pi) \ + ((struct mlx5e_get_tls_progress_params_wqe *)\ + mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_get_tls_progress_params_wqe))) + +#define MLX5E_TLS_FETCH_DUMP_WQE(sq, pi) \ + ((struct mlx5e_dump_wqe *)\ + mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_dump_wqe))) + +void +mlx5e_ktls_build_static_params(struct mlx5e_set_tls_static_params_wqe *wqe, + u16 pc, u32 sqn, + struct tls12_crypto_info_aes_gcm_128 *info, + u32 tis_tir_num, u32 key_id, u32 resync_tcp_sn, + bool fence, enum tls_offload_ctx_dir direction); +void +mlx5e_ktls_build_progress_params(struct mlx5e_set_tls_progress_params_wqe *wqe, + u16 pc, u32 sqn, + u32 tis_tir_num, bool fence, + u32 next_record_tcp_sn, + enum tls_offload_ctx_dir direction); + +#endif /* __MLX5E_TLS_UTILS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c new file mode 100644 index 0000000..67547f5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -0,0 +1,1986 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include +#include +#include +#include + +#include "en.h" +#include "en/aso.h" +#include "lib/mlx5.h" +#include "en_accel/macsec.h" +#include "en_accel/macsec_fs.h" + +#define MLX5_MACSEC_EPN_SCOPE_MID 0x80000000L +#define MLX5E_MACSEC_ASO_CTX_SZ MLX5_ST_SZ_BYTES(macsec_aso) + +enum mlx5_macsec_aso_event_arm { + MLX5E_ASO_EPN_ARM = BIT(0), +}; + +enum { + MLX5_MACSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET, +}; + +struct mlx5e_macsec_handle { + struct mlx5e_macsec *macsec; + u32 obj_id; + u8 idx; +}; + +enum { + MLX5_MACSEC_EPN, +}; + +struct mlx5e_macsec_aso_out { + u8 event_arm; + u32 mode_param; +}; + +struct mlx5e_macsec_aso_in { + u8 mode; + u32 obj_id; +}; + +struct mlx5e_macsec_epn_state { + u32 epn_msb; + u8 epn_enabled; + u8 overlap; +}; + +struct mlx5e_macsec_async_work { + struct mlx5e_macsec *macsec; + struct mlx5_core_dev *mdev; + struct work_struct work; + u32 obj_id; +}; + +struct mlx5e_macsec_sa { + bool active; + u8 assoc_num; + u32 macsec_obj_id; + u32 enc_key_id; + u32 next_pn; + sci_t sci; + salt_t salt; + + struct rhash_head hash; + u32 fs_id; + union mlx5e_macsec_rule *macsec_rule; + struct mlx5e_macsec_epn_state epn_state; +}; + +struct mlx5e_macsec_rx_sc; +struct mlx5e_macsec_rx_sc { + bool active; + sci_t sci; + struct mlx5e_macsec_sa *rx_sa[MACSEC_NUM_AN]; + struct list_head rx_sc_list_element; + struct metadata_dst *md_dst; + u32 fs_id; +}; + +struct mlx5e_macsec_umr { + dma_addr_t dma_addr; + u8 ctx[MLX5_ST_SZ_BYTES(macsec_aso)]; + u32 mkey; +}; + +struct mlx5e_macsec_aso { + /* ASO */ + struct mlx5_aso *maso; + /* Protects macsec ASO */ + struct mutex aso_lock; + /* UMR */ + struct mlx5e_macsec_umr *umr; + + u32 pdn; +}; + +static const struct rhashtable_params rhash_sci = { + .key_len = sizeof_field(struct mlx5e_macsec_sa, sci), + .key_offset = offsetof(struct mlx5e_macsec_sa, sci), + .head_offset = offsetof(struct mlx5e_macsec_sa, hash), + .automatic_shrinking = true, + .min_size = 1, +}; + +struct mlx5e_macsec_device { + const struct net_device *netdev; + struct mlx5e_macsec_sa *tx_sa[MACSEC_NUM_AN]; + struct list_head macsec_rx_sc_list_head; + unsigned char *dev_addr; + struct list_head macsec_device_list_element; +}; + +struct mlx5e_macsec { + struct list_head macsec_device_list_head; + int num_of_devices; + struct mlx5e_macsec_fs *macsec_fs; + struct mutex lock; /* Protects mlx5e_macsec internal contexts */ + + /* Tx sci -> fs id mapping handling */ + struct rhashtable sci_hash; /* sci -> mlx5e_macsec_sa */ + + /* Rx fs_id -> rx_sc mapping */ + struct xarray sc_xarray; + + struct mlx5_core_dev *mdev; + + /* Stats manage */ + struct mlx5e_macsec_stats stats; + + /* ASO */ + struct mlx5e_macsec_aso aso; + + struct notifier_block nb; + struct workqueue_struct *wq; +}; + +struct mlx5_macsec_obj_attrs { + u32 aso_pdn; + u32 next_pn; + __be64 sci; + u32 enc_key_id; + bool encrypt; + struct mlx5e_macsec_epn_state epn_state; + salt_t salt; + __be32 ssci; + bool replay_protect; + u32 replay_window; +}; + +struct mlx5_aso_ctrl_param { + u8 data_mask_mode; + u8 condition_0_operand; + u8 condition_1_operand; + u8 condition_0_offset; + u8 condition_1_offset; + u8 data_offset; + u8 condition_operand; + u32 condition_0_data; + u32 condition_0_mask; + u32 condition_1_data; + u32 condition_1_mask; + u64 bitwise_data; + u64 data_mask; +}; + +static int mlx5e_macsec_aso_reg_mr(struct mlx5_core_dev *mdev, struct mlx5e_macsec_aso *aso) +{ + struct mlx5e_macsec_umr *umr; + struct device *dma_device; + dma_addr_t dma_addr; + int err; + + umr = kzalloc(sizeof(*umr), GFP_KERNEL); + 
if (!umr) { + err = -ENOMEM; + return err; + } + + dma_device = &mdev->pdev->dev; + dma_addr = dma_map_single(dma_device, umr->ctx, sizeof(umr->ctx), DMA_BIDIRECTIONAL); + err = dma_mapping_error(dma_device, dma_addr); + if (err) { + mlx5_core_err(mdev, "Can't map dma device, err=%d\n", err); + goto out_dma; + } + + err = mlx5e_create_mkey(mdev, aso->pdn, &umr->mkey); + if (err) { + mlx5_core_err(mdev, "Can't create mkey, err=%d\n", err); + goto out_mkey; + } + + umr->dma_addr = dma_addr; + + aso->umr = umr; + + return 0; + +out_mkey: + dma_unmap_single(dma_device, dma_addr, sizeof(umr->ctx), DMA_BIDIRECTIONAL); +out_dma: + kfree(umr); + return err; +} + +static void mlx5e_macsec_aso_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5e_macsec_aso *aso) +{ + struct mlx5e_macsec_umr *umr = aso->umr; + + mlx5_core_destroy_mkey(mdev, umr->mkey); + dma_unmap_single(&mdev->pdev->dev, umr->dma_addr, sizeof(umr->ctx), DMA_BIDIRECTIONAL); + kfree(umr); +} + +static int macsec_set_replay_protection(struct mlx5_macsec_obj_attrs *attrs, void *aso_ctx) +{ + u8 window_sz; + + if (!attrs->replay_protect) + return 0; + + /* map the replay window size (in packets) to the device window_size encoding */ + switch (attrs->replay_window) { + case 256: + window_sz = MLX5_MACSEC_ASO_REPLAY_WIN_256BIT; + break; + case 128: + window_sz = MLX5_MACSEC_ASO_REPLAY_WIN_128BIT; + break; + case 64: + window_sz = MLX5_MACSEC_ASO_REPLAY_WIN_64BIT; + break; + case 32: + window_sz = MLX5_MACSEC_ASO_REPLAY_WIN_32BIT; + break; + default: + return -EINVAL; + } + MLX5_SET(macsec_aso, aso_ctx, window_size, window_sz); + MLX5_SET(macsec_aso, aso_ctx, mode, MLX5_MACSEC_ASO_REPLAY_PROTECTION); + + return 0; +} + +static int mlx5e_macsec_create_object(struct mlx5_core_dev *mdev, + struct mlx5_macsec_obj_attrs *attrs, + bool is_tx, + u32 *macsec_obj_id) +{ + u32 in[MLX5_ST_SZ_DW(create_macsec_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + void *aso_ctx; + void *obj; + int err; + + obj = MLX5_ADDR_OF(create_macsec_obj_in, in, macsec_object); + aso_ctx = MLX5_ADDR_OF(macsec_offload_obj, obj, macsec_aso); + + MLX5_SET(macsec_offload_obj, obj, confidentiality_en, attrs->encrypt); + MLX5_SET(macsec_offload_obj, obj, dekn, attrs->enc_key_id); + MLX5_SET(macsec_offload_obj, obj, aso_return_reg, MLX5_MACSEC_ASO_REG_C_4_5); + MLX5_SET(macsec_offload_obj, obj, macsec_aso_access_pd, attrs->aso_pdn); + MLX5_SET(macsec_aso, aso_ctx, mode_parameter, attrs->next_pn); + + /* Epn */ + if (attrs->epn_state.epn_enabled) { + void *salt_p; + int i; + + MLX5_SET(macsec_aso, aso_ctx, epn_event_arm, 1); + MLX5_SET(macsec_offload_obj, obj, epn_en, 1); + MLX5_SET(macsec_offload_obj, obj, epn_msb, attrs->epn_state.epn_msb); + MLX5_SET(macsec_offload_obj, obj, epn_overlap, attrs->epn_state.overlap); + MLX5_SET64(macsec_offload_obj, obj, sci, (__force u64)attrs->ssci); + salt_p = MLX5_ADDR_OF(macsec_offload_obj, obj, salt); + for (i = 0; i < 3 ; i++) + memcpy((u32 *)salt_p + i, &attrs->salt.bytes[4 * (2 - i)], 4); + } else { + MLX5_SET64(macsec_offload_obj, obj, sci, (__force u64)(attrs->sci)); + } + + MLX5_SET(macsec_aso, aso_ctx, valid, 0x1); + if (is_tx) { + MLX5_SET(macsec_aso, aso_ctx, mode, MLX5_MACSEC_ASO_INC_SN); + } else { + err = macsec_set_replay_protection(attrs, aso_ctx); + if (err) + return err; + } + + /* general object fields set */ + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_GENERAL_OBJECT_TYPES_MACSEC); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) { + mlx5_core_err(mdev, + "MACsec offload: Failed to create MACsec object (err = %d)\n", + err); + return err; + } + + *macsec_obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + return err; +} + +static void mlx5e_macsec_destroy_object(struct mlx5_core_dev *mdev, u32 macsec_obj_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 
out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_GENERAL_OBJECT_TYPES_MACSEC); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, macsec_obj_id); + + mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static void mlx5e_macsec_cleanup_sa(struct mlx5e_macsec *macsec, + struct mlx5e_macsec_sa *sa, + bool is_tx) +{ + int action = (is_tx) ? MLX5_ACCEL_MACSEC_ACTION_ENCRYPT : + MLX5_ACCEL_MACSEC_ACTION_DECRYPT; + + if ((is_tx) && sa->fs_id) { + /* Make sure ongoing datapath readers sees a valid SA */ + rhashtable_remove_fast(&macsec->sci_hash, &sa->hash, rhash_sci); + sa->fs_id = 0; + } + + if (!sa->macsec_rule) + return; + + if (is_tx) + macsec_fs_del_roce_rule_tx(macsec->macsec_fs, sa->macsec_rule); + else + macsec_fs_del_roce_rule_rx(macsec->macsec_fs, sa->macsec_rule); + + mlx5e_macsec_fs_del_rule(macsec->macsec_fs, sa->macsec_rule, action); + mlx5e_macsec_destroy_object(macsec->mdev, sa->macsec_obj_id); + sa->macsec_rule = NULL; +} + +static int mlx5e_macsec_init_sa(struct macsec_context *ctx, + struct mlx5e_macsec_sa *sa, + bool encrypt, + bool is_tx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec *macsec = priv->macsec; + struct mlx5_macsec_rule_attrs rule_attrs; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_macsec_obj_attrs obj_attrs; + union mlx5e_macsec_rule *macsec_rule; + struct macsec_key *key; + int err; + + obj_attrs.next_pn = sa->next_pn; + obj_attrs.sci = cpu_to_be64((__force u64)sa->sci); + obj_attrs.enc_key_id = sa->enc_key_id; + obj_attrs.encrypt = encrypt; + obj_attrs.aso_pdn = macsec->aso.pdn; + obj_attrs.epn_state = sa->epn_state; + + key = (is_tx) ? &ctx->sa.tx_sa->key : &ctx->sa.rx_sa->key; + + if (sa->epn_state.epn_enabled) { + obj_attrs.ssci = (is_tx) ? cpu_to_be32((__force u32)ctx->sa.tx_sa->ssci) : + cpu_to_be32((__force u32)ctx->sa.rx_sa->ssci); + + memcpy(&obj_attrs.salt, &key->salt, sizeof(key->salt)); + } + + obj_attrs.replay_window = ctx->secy->replay_window; + obj_attrs.replay_protect = ctx->secy->replay_protect; + + err = mlx5e_macsec_create_object(mdev, &obj_attrs, is_tx, &sa->macsec_obj_id); + if (err) + return err; + + rule_attrs.macsec_obj_id = sa->macsec_obj_id; + rule_attrs.sci = sa->sci; + rule_attrs.assoc_num = sa->assoc_num; + rule_attrs.action = (is_tx) ? 
MLX5_ACCEL_MACSEC_ACTION_ENCRYPT : + MLX5_ACCEL_MACSEC_ACTION_DECRYPT; + + macsec_rule = mlx5e_macsec_fs_add_rule(macsec->macsec_fs, ctx, &rule_attrs, &sa->fs_id); + if (IS_ERR_OR_NULL(macsec_rule)) + goto destroy_macsec_object; + + sa->macsec_rule = macsec_rule; + + if (is_tx) { + err = rhashtable_insert_fast(&macsec->sci_hash, &sa->hash, rhash_sci); + if (err) + goto destroy_macsec_object_and_rule; + } + + return 0; + +destroy_macsec_object_and_rule: + mlx5e_macsec_cleanup_sa(macsec, sa, is_tx); +destroy_macsec_object: + mlx5e_macsec_destroy_object(mdev, sa->macsec_obj_id); + + return err; +} + +static struct mlx5e_macsec_rx_sc * +mlx5e_macsec_get_rx_sc_from_sc_list(const struct list_head *list, sci_t sci) +{ + struct mlx5e_macsec_rx_sc *iter; + + list_for_each_entry(iter, list, rx_sc_list_element) { + if (iter->sci == sci) + return iter; + } + + return NULL; +} + +static int macsec_rx_sa_active_update(struct macsec_context *ctx, + struct mlx5e_macsec_sa *rx_sa, + bool active) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec *macsec = priv->macsec; + int err = 0; + + if (rx_sa->active == active) + return 0; + + rx_sa->active = active; + if (!active) { + mlx5e_macsec_cleanup_sa(macsec, rx_sa, false); + return 0; + } + + err = mlx5e_macsec_init_sa(ctx, rx_sa, true, false); + if (err) + rx_sa->active = false; + + return err; +} + +static bool mlx5e_macsec_secy_features_validate(struct macsec_context *ctx) +{ + const struct net_device *netdev = ctx->netdev; + const struct macsec_secy *secy = ctx->secy; + + if (secy->validate_frames != MACSEC_VALIDATE_STRICT) { + netdev_err(netdev, + "MACsec offload is supported only when validate_frame is in strict mode\n"); + return false; + } + + if (secy->icv_len != MACSEC_DEFAULT_ICV_LEN) { + netdev_err(netdev, "MACsec offload is supported only when icv_len is %d\n", + MACSEC_DEFAULT_ICV_LEN); + return false; + } + + if (!secy->protect_frames) { + netdev_err(netdev, + "MACsec offload is supported only when protect_frames is set\n"); + return false; + } + + if (!ctx->secy->tx_sc.encrypt) { + netdev_err(netdev, "MACsec offload: encrypt off isn't supported\n"); + return false; + } + + return true; +} + +static struct mlx5e_macsec_device * +mlx5e_macsec_get_macsec_device_context(const struct mlx5e_macsec *macsec, + const struct macsec_context *ctx) +{ + struct mlx5e_macsec_device *iter; + const struct list_head *list; + + list = &macsec->macsec_device_list_head; + list_for_each_entry(iter, list, macsec_device_list_element) { + if (iter->netdev == ctx->secy->netdev) + return iter; + } + + return NULL; +} + +static void update_macsec_epn(struct mlx5e_macsec_sa *sa, const struct macsec_key *key, + const pn_t *next_pn_halves) +{ + struct mlx5e_macsec_epn_state *epn_state = &sa->epn_state; + + sa->salt = key->salt; + epn_state->epn_enabled = 1; + epn_state->epn_msb = next_pn_halves->upper; + epn_state->overlap = next_pn_halves->lower < MLX5_MACSEC_EPN_SCOPE_MID ? 
0 : 1; +} + +static struct mlx5e_macsec_device * +mlx5e_macsec_get_macsec_device_netdevice(const struct mlx5e_macsec *macsec, + const struct net_device *netdev) +{ + struct mlx5e_macsec_device *iter; + const struct list_head *list; + + list = &macsec->macsec_device_list_head; + list_for_each_entry_rcu(iter, list, macsec_device_list_element) { + if (iter->netdev == netdev) + return iter; + } + + return NULL; +} + +static void mlx5e_macsec_fs_del_roce_rule(struct net_device *ndev) +{ + struct mlx5e_priv *priv = netdev_priv(macsec_get_real_dev(ndev)); + struct mlx5e_macsec *macsec = priv->macsec; + struct mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec_rx_sc *rx_sc, *tmp; + struct mlx5e_macsec_sa *rx_sa, *tx_sa; + struct list_head *list; + int i; + + macsec_device = mlx5e_macsec_get_macsec_device_netdevice(macsec, ndev); + if (!macsec_device) { + netdev_err(ndev, "MACsec offload: Failed to find device from netdev\n"); + return; + } + + for (i = 0; i < MACSEC_NUM_AN; ++i) { + tx_sa = macsec_device->tx_sa[i]; + if (!tx_sa || !tx_sa->macsec_rule) + continue; + + macsec_fs_del_roce_rule_tx(macsec->macsec_fs, tx_sa->macsec_rule); + } + + list = &macsec_device->macsec_rx_sc_list_head; + list_for_each_entry_safe(rx_sc, tmp, list, rx_sc_list_element) { + for (i = 0; i < MACSEC_NUM_AN; ++i) { + rx_sa = rx_sc->rx_sa[i]; + if (!rx_sa || !rx_sa->macsec_rule) + continue; + + macsec_fs_del_roce_rule_rx(macsec->macsec_fs, rx_sa->macsec_rule); + } + } +} + +int mlx5e_macsec_fs_add_roce_rule(struct net_device *ndev, + const struct sockaddr *addr) +{ + struct mlx5e_priv *priv = netdev_priv(macsec_get_real_dev(ndev)); + struct mlx5e_macsec *macsec = priv->macsec; + struct mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec_rx_sc *rx_sc, *tmp; + struct mlx5e_macsec_sa *rx_sa, *tx_sa; + struct list_head *list; + int i, err; + + macsec_device = mlx5e_macsec_get_macsec_device_netdevice(macsec, ndev); + if (!macsec_device) { + netdev_err(ndev, "MACsec offload: Failed to find device from netdev\n"); + return -EINVAL; + } + + for (i = 0; i < MACSEC_NUM_AN; ++i) { + tx_sa = macsec_device->tx_sa[i]; + if (!tx_sa || !tx_sa->macsec_rule) + continue; + + err = macsec_fs_add_roce_rule_tx(macsec->macsec_fs, tx_sa->fs_id, addr, + tx_sa->macsec_rule); + if (err && err != -EEXIST) { + netdev_err(ndev, "MACsec offload: Failed to add roce TX rule\n"); + goto out; + } + } + + list = &macsec_device->macsec_rx_sc_list_head; + list_for_each_entry_safe(rx_sc, tmp, list, rx_sc_list_element) { + for (i = 0; i < MACSEC_NUM_AN; ++i) { + rx_sa = rx_sc->rx_sa[i]; + if (!rx_sa || !rx_sa->macsec_rule) + continue; + + err = macsec_fs_add_roce_rule_rx(macsec->macsec_fs, rx_sa->fs_id, addr, + rx_sa->macsec_rule); + if (err && err != -EEXIST) { + netdev_err(ndev, "MACsec offload: Failed to add roce RX rule\n"); + goto out; + } + } + } + + return 0; +out: + mlx5e_macsec_fs_del_roce_rule(ndev); + return err; +} +EXPORT_SYMBOL_GPL(mlx5e_macsec_fs_add_roce_rule); + +static int mlx5e_macsec_add_txsa(struct macsec_context *ctx) +{ + const struct macsec_tx_sc *tx_sc = &ctx->secy->tx_sc; + const struct macsec_tx_sa *ctx_tx_sa = ctx->sa.tx_sa; + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + const struct macsec_secy *secy = ctx->secy; + struct mlx5e_macsec_device *macsec_device; + struct mlx5_core_dev *mdev = priv->mdev; + u8 assoc_num = ctx->sa.assoc_num; + struct mlx5e_macsec_sa *tx_sa; + struct mlx5e_macsec *macsec; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + 
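/* the per-netdev offload context is created in mlx5e_macsec_add_secy(); TX SAs are stored on it */ +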
macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EEXIST; + goto out; + } + + if (macsec_device->tx_sa[assoc_num]) { + netdev_err(ctx->netdev, "MACsec offload tx_sa: %d already exist\n", assoc_num); + err = -EEXIST; + goto out; + } + + tx_sa = kzalloc(sizeof(*tx_sa), GFP_KERNEL); + if (!tx_sa) { + err = -ENOMEM; + goto out; + } + + tx_sa->active = ctx_tx_sa->active; + tx_sa->next_pn = ctx_tx_sa->next_pn_halves.lower; + tx_sa->sci = secy->sci; + tx_sa->assoc_num = assoc_num; + + if (secy->xpn) + update_macsec_epn(tx_sa, &ctx_tx_sa->key, &ctx_tx_sa->next_pn_halves); + + err = mlx5_create_encryption_key(mdev, ctx->sa.key, secy->key_len, + MLX5_ACCEL_OBJ_MACSEC_KEY, + &tx_sa->enc_key_id); + if (err) + goto destroy_sa; + + macsec_device->tx_sa[assoc_num] = tx_sa; + if (!secy->operational || + assoc_num != tx_sc->encoding_sa || + !tx_sa->active) + goto out; + + err = mlx5e_macsec_init_sa(ctx, tx_sa, tx_sc->encrypt, true); + if (err) + goto destroy_encryption_key; + + mutex_unlock(&macsec->lock); + + return 0; + +destroy_encryption_key: + macsec_device->tx_sa[assoc_num] = NULL; + mlx5_destroy_encryption_key(mdev, tx_sa->enc_key_id); +destroy_sa: + kfree(tx_sa); +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_upd_txsa(struct macsec_context *ctx) +{ + const struct macsec_tx_sc *tx_sc = &ctx->secy->tx_sc; + const struct macsec_tx_sa *ctx_tx_sa = ctx->sa.tx_sa; + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec_device *macsec_device; + u8 assoc_num = ctx->sa.assoc_num; + struct mlx5e_macsec_sa *tx_sa; + struct mlx5e_macsec *macsec; + struct net_device *netdev; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + netdev = ctx->netdev; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + tx_sa = macsec_device->tx_sa[assoc_num]; + if (!tx_sa) { + netdev_err(netdev, "MACsec offload: TX sa 0x%x doesn't exist\n", assoc_num); + err = -EEXIST; + goto out; + } + + if (tx_sa->next_pn != ctx_tx_sa->next_pn_halves.lower) { + netdev_err(netdev, "MACsec offload: update TX sa %d PN isn't supported\n", + assoc_num); + err = -EINVAL; + goto out; + } + + if (tx_sa->active == ctx_tx_sa->active) + goto out; + + tx_sa->active = ctx_tx_sa->active; + if (tx_sa->assoc_num != tx_sc->encoding_sa) + goto out; + + if (ctx_tx_sa->active) { + err = mlx5e_macsec_init_sa(ctx, tx_sa, tx_sc->encrypt, true); + if (err) + goto out; + } else { + if (!tx_sa->macsec_rule) { + err = -EINVAL; + goto out; + } + + mlx5e_macsec_cleanup_sa(macsec, tx_sa, true); + } +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_del_txsa(struct macsec_context *ctx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec_device *macsec_device; + u8 assoc_num = ctx->sa.assoc_num; + struct mlx5e_macsec_sa *tx_sa; + struct mlx5e_macsec *macsec; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + tx_sa = macsec_device->tx_sa[assoc_num]; + if 
(!tx_sa) { + netdev_err(ctx->netdev, "MACsec offload: TX sa 0x%x doesn't exist\n", assoc_num); + err = -EEXIST; + goto out; + } + + mlx5e_macsec_cleanup_sa(macsec, tx_sa, true); + mlx5_destroy_encryption_key(macsec->mdev, tx_sa->enc_key_id); + kfree(tx_sa); + macsec_device->tx_sa[assoc_num] = NULL; + +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static u32 mlx5e_macsec_get_sa_from_hashtable(struct rhashtable *sci_hash, sci_t *sci) +{ + struct mlx5e_macsec_sa *macsec_sa; + u32 fs_id = 0; + + rcu_read_lock(); + macsec_sa = rhashtable_lookup(sci_hash, sci, rhash_sci); + if (macsec_sa) + fs_id = macsec_sa->fs_id; + rcu_read_unlock(); + + return fs_id; +} + +static int mlx5e_macsec_add_rxsc(struct macsec_context *ctx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + const struct macsec_rx_sc *ctx_rx_sc = ctx->rx_sc; + struct mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec_rx_sc *rx_sc; + struct list_head *rx_sc_list; + struct mlx5e_macsec *macsec; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + rx_sc_list = &macsec_device->macsec_rx_sc_list_head; + rx_sc = mlx5e_macsec_get_rx_sc_from_sc_list(rx_sc_list, ctx_rx_sc->sci); + if (rx_sc) { + netdev_err(ctx->netdev, "MACsec offload: rx_sc (sci %lld) already exists\n", + ctx_rx_sc->sci); + err = -EEXIST; + goto out; + } + + rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); + if (!rx_sc) { + err = -ENOMEM; + goto out; + } + + err = xa_alloc(&macsec->sc_xarray, &rx_sc->fs_id, rx_sc, + XA_LIMIT(1, MLX5_MACEC_RX_FS_ID_MAX), GFP_KERNEL); + if (err) { + if (err == -EBUSY) + netdev_err(ctx->netdev, + "MACsec offload: unable to create entry for RX SC (2^16 - 1 Rx SCs already allocated)\n"); + goto destroy_rx_sc; + } + + rx_sc->md_dst = metadata_dst_alloc(0, METADATA_MACSEC, GFP_KERNEL); + if (!rx_sc->md_dst) { + err = -ENOMEM; + goto erase_xa_alloc; + } + + rx_sc->sci = ctx_rx_sc->sci; + rx_sc->active = ctx_rx_sc->active; + list_add(&rx_sc->rx_sc_list_element, rx_sc_list); + rx_sc->md_dst->u.macsec_info.sci = rx_sc->sci; + mutex_unlock(&macsec->lock); + + return 0; + +erase_xa_alloc: + xa_erase(&macsec->sc_xarray, rx_sc->fs_id); +destroy_rx_sc: + kfree(rx_sc); + +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_upd_rxsc(struct macsec_context *ctx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + const struct macsec_rx_sc *ctx_rx_sc = ctx->rx_sc; + struct mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec_rx_sc *rx_sc; + struct mlx5e_macsec_sa *rx_sa; + struct mlx5e_macsec *macsec; + struct list_head *list; + int i; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + list = &macsec_device->macsec_rx_sc_list_head; + rx_sc = mlx5e_macsec_get_rx_sc_from_sc_list(list, ctx_rx_sc->sci); + if (!rx_sc) { + err = -EINVAL; + goto out; + } + + if (rx_sc->active == ctx_rx_sc->active) + goto out; + + rx_sc->active = ctx_rx_sc->active; + for (i = 0; i < MACSEC_NUM_AN; ++i) { + rx_sa = rx_sc->rx_sa[i]; + if (!rx_sa) + continue; + + err = 
macsec_rx_sa_active_update(ctx, rx_sa, rx_sa->active && ctx_rx_sc->active); + if (err) + goto out; + } +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_del_rxsc(struct macsec_context *ctx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec_rx_sc *rx_sc; + struct mlx5e_macsec_sa *rx_sa; + struct mlx5e_macsec *macsec; + struct list_head *list; + int err = 0; + int i; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + list = &macsec_device->macsec_rx_sc_list_head; + rx_sc = mlx5e_macsec_get_rx_sc_from_sc_list(list, ctx->rx_sc->sci); + if (!rx_sc) { + netdev_err(ctx->netdev, + "MACsec offload rx_sc sci %lld doesn't exist\n", + ctx->sa.rx_sa->sc->sci); + err = -EINVAL; + goto out; + } + + for (i = 0; i < MACSEC_NUM_AN; ++i) { + rx_sa = rx_sc->rx_sa[i]; + if (!rx_sa) + continue; + + mlx5e_macsec_cleanup_sa(macsec, rx_sa, false); + mlx5_destroy_encryption_key(macsec->mdev, rx_sa->enc_key_id); + + kfree(rx_sa); + rx_sc->rx_sa[i] = NULL; + } + +/* + * At this point the relevant MACsec offload Rx rule already removed at + * mlx5e_macsec_cleanup_sa need to wait for datapath to finish current + * Rx related data propagating using xa_erase which uses rcu to sync, + * once fs_id is erased then this rx_sc is hidden from datapath. + */ + list_del(&rx_sc->rx_sc_list_element); + xa_erase(&macsec->sc_xarray, rx_sc->fs_id); + synchronize_rcu(); + metadata_dst_free(rx_sc->md_dst); + kfree(rx_sc); + +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_add_rxsa(struct macsec_context *ctx) +{ + const struct macsec_rx_sa *ctx_rx_sa = ctx->sa.rx_sa; + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec_device *macsec_device; + struct mlx5_core_dev *mdev = priv->mdev; + u8 assoc_num = ctx->sa.assoc_num; + struct mlx5e_macsec_rx_sc *rx_sc; + sci_t sci = ctx_rx_sa->sc->sci; + struct mlx5e_macsec_sa *rx_sa; + struct mlx5e_macsec *macsec; + struct list_head *list; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + list = &macsec_device->macsec_rx_sc_list_head; + rx_sc = mlx5e_macsec_get_rx_sc_from_sc_list(list, sci); + if (!rx_sc) { + netdev_err(ctx->netdev, + "MACsec offload rx_sc sci %lld doesn't exist\n", + ctx->sa.rx_sa->sc->sci); + err = -EINVAL; + goto out; + } + + if (rx_sc->rx_sa[assoc_num]) { + netdev_err(ctx->netdev, + "MACsec offload rx_sc sci %lld rx_sa %d already exist\n", + sci, assoc_num); + err = -EEXIST; + goto out; + } + + rx_sa = kzalloc(sizeof(*rx_sa), GFP_KERNEL); + if (!rx_sa) { + err = -ENOMEM; + goto out; + } + + rx_sa->active = ctx_rx_sa->active; + rx_sa->next_pn = ctx_rx_sa->next_pn; + rx_sa->sci = sci; + rx_sa->assoc_num = assoc_num; + rx_sa->fs_id = rx_sc->fs_id; + + if (ctx->secy->xpn) + update_macsec_epn(rx_sa, &ctx_rx_sa->key, &ctx_rx_sa->next_pn_halves); + + err = mlx5_create_encryption_key(mdev, ctx->sa.key, ctx->secy->key_len, + MLX5_ACCEL_OBJ_MACSEC_KEY, + &rx_sa->enc_key_id); + if (err) + goto destroy_sa; + + 
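/* publish the new RX SA on its SC; the hardware SA object and steering rule are only installed below when the SA is active */ +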
rx_sc->rx_sa[assoc_num] = rx_sa; + if (!rx_sa->active) + goto out; + + //TODO - add support for both authentication and encryption flows + err = mlx5e_macsec_init_sa(ctx, rx_sa, true, false); + if (err) + goto destroy_encryption_key; + + goto out; + +destroy_encryption_key: + rx_sc->rx_sa[assoc_num] = NULL; + mlx5_destroy_encryption_key(mdev, rx_sa->enc_key_id); +destroy_sa: + kfree(rx_sa); +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_upd_rxsa(struct macsec_context *ctx) +{ + const struct macsec_rx_sa *ctx_rx_sa = ctx->sa.rx_sa; + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec_device *macsec_device; + u8 assoc_num = ctx->sa.assoc_num; + struct mlx5e_macsec_rx_sc *rx_sc; + sci_t sci = ctx_rx_sa->sc->sci; + struct mlx5e_macsec_sa *rx_sa; + struct mlx5e_macsec *macsec; + struct list_head *list; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + list = &macsec_device->macsec_rx_sc_list_head; + rx_sc = mlx5e_macsec_get_rx_sc_from_sc_list(list, sci); + if (!rx_sc) { + netdev_err(ctx->netdev, + "MACsec offload rx_sc sci %lld doesn't exist\n", + ctx->sa.rx_sa->sc->sci); + err = -EINVAL; + goto out; + } + + rx_sa = rx_sc->rx_sa[assoc_num]; + if (!rx_sa) { + netdev_err(ctx->netdev, + "MACsec offload rx_sc sci %lld rx_sa %d doesn't exist\n", + sci, assoc_num); + err = -EINVAL; + goto out; + } + + if (rx_sa->next_pn != ctx_rx_sa->next_pn_halves.lower) { + netdev_err(ctx->netdev, + "MACsec offload update RX sa %d PN isn't supported\n", + assoc_num); + err = -EINVAL; + goto out; + } + + err = macsec_rx_sa_active_update(ctx, rx_sa, ctx_rx_sa->active); +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_del_rxsa(struct macsec_context *ctx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec_device *macsec_device; + sci_t sci = ctx->sa.rx_sa->sc->sci; + struct mlx5e_macsec_rx_sc *rx_sc; + u8 assoc_num = ctx->sa.assoc_num; + struct mlx5e_macsec_sa *rx_sa; + struct mlx5e_macsec *macsec; + struct list_head *list; + int err = 0; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + list = &macsec_device->macsec_rx_sc_list_head; + rx_sc = mlx5e_macsec_get_rx_sc_from_sc_list(list, sci); + if (!rx_sc) { + netdev_err(ctx->netdev, + "MACsec offload rx_sc sci %lld doesn't exist\n", + ctx->sa.rx_sa->sc->sci); + err = -EINVAL; + goto out; + } + + rx_sa = rx_sc->rx_sa[assoc_num]; + if (!rx_sa) { + netdev_err(ctx->netdev, + "MACsec offload rx_sc sci %lld rx_sa %d doesn't exist\n", + sci, assoc_num); + err = -EINVAL; + goto out; + } + + mlx5e_macsec_cleanup_sa(macsec, rx_sa, false); + mlx5_destroy_encryption_key(macsec->mdev, rx_sa->enc_key_id); + kfree(rx_sa); + rx_sc->rx_sa[assoc_num] = NULL; + +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_add_secy(struct macsec_context *ctx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + const struct net_device *dev = ctx->secy->netdev; + const struct net_device *netdev = ctx->netdev; + struct 
mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec *macsec; + int err = 0; + + if (ctx->prepare) { + if (!mlx5e_macsec_secy_features_validate(ctx)) + return -EINVAL; + return 0; + } + + mutex_lock(&priv->macsec->lock); + macsec = priv->macsec; + if (mlx5e_macsec_get_macsec_device_context(macsec, ctx)) { + netdev_err(netdev, "MACsec offload: MACsec net_device already exist\n"); + goto out; + } + + if (macsec->num_of_devices >= MLX5_MACSEC_NUM_OF_SUPPORTED_INTERFACES) { + netdev_err(netdev, "Currently, only %d MACsec offload devices can be set\n", + MLX5_MACSEC_NUM_OF_SUPPORTED_INTERFACES); + err = -EBUSY; + goto out; + } + + macsec_device = kzalloc(sizeof(*macsec_device), GFP_KERNEL); + if (!macsec_device) { + err = -ENOMEM; + goto out; + } + + macsec_device->dev_addr = kzalloc(dev->addr_len, GFP_KERNEL); + if (!macsec_device->dev_addr) { + kfree(macsec_device); + err = -ENOMEM; + goto out; + } + + memcpy(macsec_device->dev_addr, dev->dev_addr, dev->addr_len); + macsec_device->netdev = dev; + + INIT_LIST_HEAD(&macsec_device->macsec_rx_sc_list_head); + list_add(&macsec_device->macsec_device_list_element, &macsec->macsec_device_list_head); + + ++macsec->num_of_devices; +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int macsec_upd_secy_hw_address(struct macsec_context *ctx, + struct mlx5e_macsec_device *macsec_device) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + const struct net_device *dev = ctx->secy->netdev; + struct mlx5e_macsec *macsec = priv->macsec; + struct mlx5e_macsec_rx_sc *rx_sc, *tmp; + struct mlx5e_macsec_sa *rx_sa; + struct list_head *list; + int i, err = 0; + + + list = &macsec_device->macsec_rx_sc_list_head; + list_for_each_entry_safe(rx_sc, tmp, list, rx_sc_list_element) { + for (i = 0; i < MACSEC_NUM_AN; ++i) { + rx_sa = rx_sc->rx_sa[i]; + if (!rx_sa || !rx_sa->macsec_rule) + continue; + + mlx5e_macsec_cleanup_sa(macsec, rx_sa, false); + } + } + + list_for_each_entry_safe(rx_sc, tmp, list, rx_sc_list_element) { + for (i = 0; i < MACSEC_NUM_AN; ++i) { + rx_sa = rx_sc->rx_sa[i]; + if (!rx_sa) + continue; + + if (rx_sa->active) { + err = mlx5e_macsec_init_sa(ctx, rx_sa, true, false); + if (err) + goto out; + } + } + } + + memcpy(macsec_device->dev_addr, dev->dev_addr, dev->addr_len); +out: + return err; +} + +/* this function is called from 2 macsec ops functions: + * macsec_set_mac_address – MAC address was changed, therefore we need to destroy + * and create new Tx contexts(macsec object + steering). 
+ * macsec_changelink – in this case the tx SC or SecY may be changed, therefore need to + * destroy Tx and Rx contexts(macsec object + steering) + */ +static int mlx5e_macsec_upd_secy(struct macsec_context *ctx) +{ + const struct macsec_tx_sc *tx_sc = &ctx->secy->tx_sc; + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + const struct net_device *dev = ctx->secy->netdev; + struct mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec_sa *tx_sa; + struct mlx5e_macsec *macsec; + int i, err = 0; + + if (ctx->prepare) { + if (!mlx5e_macsec_secy_features_validate(ctx)) + return -EINVAL; + return 0; + } + + mutex_lock(&priv->macsec->lock); + + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + goto out; + } + + if (!tx_sc->encrypt) { + netdev_err(ctx->netdev, "MACsec offload: encrypt off isn't supported\n"); + err = -EINVAL; + goto out; + } + + /* if the dev_addr hasn't change, it mean the callback is from macsec_changelink */ + if (!memcmp(macsec_device->dev_addr, dev->dev_addr, dev->addr_len)) { + err = macsec_upd_secy_hw_address(ctx, macsec_device); + if (err) + goto out; + } + + for (i = 0; i < MACSEC_NUM_AN; ++i) { + tx_sa = macsec_device->tx_sa[i]; + if (!tx_sa) + continue; + + mlx5e_macsec_cleanup_sa(macsec, tx_sa, true); + } + + for (i = 0; i < MACSEC_NUM_AN; ++i) { + tx_sa = macsec_device->tx_sa[i]; + if (!tx_sa) + continue; + + if (tx_sa->assoc_num == tx_sc->encoding_sa && tx_sa->active) { + err = mlx5e_macsec_init_sa(ctx, tx_sa, tx_sc->encrypt, true); + if (err) + goto out; + } + } + +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static int mlx5e_macsec_del_secy(struct macsec_context *ctx) +{ + struct mlx5e_priv *priv = netdev_priv(ctx->netdev); + struct mlx5e_macsec_device *macsec_device; + struct mlx5e_macsec_rx_sc *rx_sc, *tmp; + struct mlx5e_macsec_sa *rx_sa; + struct mlx5e_macsec_sa *tx_sa; + struct mlx5e_macsec *macsec; + struct list_head *list; + int err = 0; + int i; + + if (ctx->prepare) + return 0; + + mutex_lock(&priv->macsec->lock); + macsec = priv->macsec; + macsec_device = mlx5e_macsec_get_macsec_device_context(macsec, ctx); + if (!macsec_device) { + netdev_err(ctx->netdev, "MACsec offload: Failed to find device context\n"); + err = -EINVAL; + + goto out; + } + + for (i = 0; i < MACSEC_NUM_AN; ++i) { + tx_sa = macsec_device->tx_sa[i]; + if (!tx_sa) + continue; + + mlx5e_macsec_cleanup_sa(macsec, tx_sa, true); + mlx5_destroy_encryption_key(macsec->mdev, tx_sa->enc_key_id); + kfree(tx_sa); + macsec_device->tx_sa[i] = NULL; + } + + list = &macsec_device->macsec_rx_sc_list_head; + list_for_each_entry_safe(rx_sc, tmp, list, rx_sc_list_element) { + for (i = 0; i < MACSEC_NUM_AN; ++i) { + rx_sa = rx_sc->rx_sa[i]; + if (!rx_sa) + continue; + + mlx5e_macsec_cleanup_sa(macsec, rx_sa, false); + mlx5_destroy_encryption_key(macsec->mdev, rx_sa->enc_key_id); + kfree(rx_sa); + rx_sc->rx_sa[i] = NULL; + } + + list_del(&rx_sc->rx_sc_list_element); + xa_erase(&macsec->sc_xarray, rx_sc->fs_id); + synchronize_rcu(); + metadata_dst_free(rx_sc->md_dst); + kfree(rx_sc); + } + + kfree(macsec_device->dev_addr); + macsec_device->dev_addr = NULL; + + list_del(&macsec_device->macsec_device_list_element); + --macsec->num_of_devices; + kfree(macsec_device); + +out: + mutex_unlock(&macsec->lock); + + return err; +} + +static void macsec_build_accel_attrs(struct mlx5e_macsec_sa *sa, + struct mlx5_macsec_obj_attrs *attrs) 
+{ + attrs->epn_state.epn_msb = sa->epn_state.epn_msb; + attrs->epn_state.overlap = sa->epn_state.overlap; +} + +static void macsec_aso_build_wqe_ctrl_seg(struct mlx5e_macsec_aso *macsec_aso, + struct mlx5_wqe_aso_ctrl_seg *aso_ctrl, + struct mlx5_aso_ctrl_param *param) +{ + memset(aso_ctrl, 0, sizeof(*aso_ctrl)); + if (macsec_aso->umr->dma_addr) { + aso_ctrl->va_l = cpu_to_be32(macsec_aso->umr->dma_addr | ASO_CTRL_READ_EN); + aso_ctrl->va_h = cpu_to_be32(macsec_aso->umr->dma_addr >> 32); + aso_ctrl->l_key = cpu_to_be32(macsec_aso->umr->mkey); + } + + if (!param) + return; + + aso_ctrl->data_mask_mode = param->data_mask_mode << 6; + aso_ctrl->condition_1_0_operand = param->condition_1_operand | + param->condition_0_operand << 4; + aso_ctrl->condition_1_0_offset = param->condition_1_offset | + param->condition_0_offset << 4; + aso_ctrl->data_offset_condition_operand = param->data_offset | + param->condition_operand << 6; + aso_ctrl->condition_0_data = cpu_to_be32(param->condition_0_data); + aso_ctrl->condition_0_mask = cpu_to_be32(param->condition_0_mask); + aso_ctrl->condition_1_data = cpu_to_be32(param->condition_1_data); + aso_ctrl->condition_1_mask = cpu_to_be32(param->condition_1_mask); + aso_ctrl->bitwise_data = cpu_to_be64(param->bitwise_data); + aso_ctrl->data_mask = cpu_to_be64(param->data_mask); +} + +static int mlx5e_macsec_modify_obj(struct mlx5_core_dev *mdev, struct mlx5_macsec_obj_attrs *attrs, + u32 macsec_id) +{ + u32 in[MLX5_ST_SZ_DW(modify_macsec_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(query_macsec_obj_out)]; + u64 modify_field_select = 0; + void *obj; + int err; + + /* General object fields set */ + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_GENERAL_OBJECT_TYPES_MACSEC); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, macsec_id); + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) { + mlx5_core_err(mdev, "Query MACsec object failed (Object id %d), err = %d\n", + macsec_id, err); + return err; + } + + obj = MLX5_ADDR_OF(query_macsec_obj_out, out, macsec_object); + modify_field_select = MLX5_GET64(macsec_offload_obj, obj, modify_field_select); + + /* EPN */ + if (!(modify_field_select & MLX5_MODIFY_MACSEC_BITMASK_EPN_OVERLAP) || + !(modify_field_select & MLX5_MODIFY_MACSEC_BITMASK_EPN_MSB)) { + mlx5_core_dbg(mdev, "MACsec object field is not modifiable (Object id %d)\n", + macsec_id); + return -EOPNOTSUPP; + } + + obj = MLX5_ADDR_OF(modify_macsec_obj_in, in, macsec_object); + MLX5_SET64(macsec_offload_obj, obj, modify_field_select, + MLX5_MODIFY_MACSEC_BITMASK_EPN_OVERLAP | MLX5_MODIFY_MACSEC_BITMASK_EPN_MSB); + MLX5_SET(macsec_offload_obj, obj, epn_msb, attrs->epn_state.epn_msb); + MLX5_SET(macsec_offload_obj, obj, epn_overlap, attrs->epn_state.overlap); + + /* General object fields set */ + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static void macsec_aso_build_ctrl(struct mlx5e_macsec_aso *aso, + struct mlx5_wqe_aso_ctrl_seg *aso_ctrl, + struct mlx5e_macsec_aso_in *in) +{ + struct mlx5_aso_ctrl_param param = {}; + + param.data_mask_mode = MLX5_ASO_DATA_MASK_MODE_BITWISE_64BIT; + param.condition_0_operand = MLX5_ASO_ALWAYS_TRUE; + param.condition_1_operand = MLX5_ASO_ALWAYS_TRUE; + if (in->mode == MLX5_MACSEC_EPN) { + param.data_offset = MLX5_MACSEC_ASO_REMOVE_FLOW_PKT_CNT_OFFSET; + param.bitwise_data = BIT(22) << 32; + param.data_mask = 
param.bitwise_data; + } + + macsec_aso_build_wqe_ctrl_seg(aso, aso_ctrl, ¶m); +} + +static int macsec_aso_set_arm_event(struct mlx5_core_dev *mdev, struct mlx5e_macsec *macsec, + struct mlx5e_macsec_aso_in *in) +{ + struct mlx5e_macsec_aso *aso; + struct mlx5_aso_wqe *aso_wqe; + struct mlx5_aso *maso; + int err; + + aso = &macsec->aso; + maso = aso->maso; + + mutex_lock(&aso->aso_lock); + aso_wqe = mlx5_aso_get_wqe(maso); + mlx5_aso_build_wqe(maso, MLX5E_MACSEC_ASO_DS_CNT, aso_wqe, in->obj_id, + MLX5_ACCESS_ASO_OPC_MOD_MACSEC); + macsec_aso_build_ctrl(aso, &aso_wqe->aso_ctrl, in); + mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl); + err = mlx5_aso_poll_cq(maso, false, 10); + mutex_unlock(&aso->aso_lock); + + return err; +} + +static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *macsec, + struct mlx5e_macsec_aso_in *in, struct mlx5e_macsec_aso_out *out) +{ + struct mlx5e_macsec_aso *aso; + struct mlx5_aso_wqe *aso_wqe; + struct mlx5_aso *maso; + int err; + + aso = &macsec->aso; + maso = aso->maso; + + mutex_lock(&aso->aso_lock); + + aso_wqe = mlx5_aso_get_wqe(maso); + mlx5_aso_build_wqe(maso, MLX5E_MACSEC_ASO_DS_CNT, aso_wqe, in->obj_id, + MLX5_ACCESS_ASO_OPC_MOD_MACSEC); + macsec_aso_build_wqe_ctrl_seg(aso, &aso_wqe->aso_ctrl, NULL); + + mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl); + err = mlx5_aso_poll_cq(maso, false, 10); + if (err) + goto err_out; + + if (MLX5_GET(macsec_aso, aso->umr->ctx, epn_event_arm)) + out->event_arm |= MLX5E_ASO_EPN_ARM; + + out->mode_param = MLX5_GET(macsec_aso, aso->umr->ctx, mode_parameter); + +err_out: + mutex_unlock(&aso->aso_lock); + return err; +} + +static struct mlx5e_macsec_sa *get_macsec_tx_sa_from_obj_id(const struct mlx5e_macsec *macsec, + const u32 obj_id) +{ + const struct list_head *device_list; + struct mlx5e_macsec_sa *macsec_sa; + struct mlx5e_macsec_device *iter; + int i; + + device_list = &macsec->macsec_device_list_head; + + list_for_each_entry(iter, device_list, macsec_device_list_element) { + for (i = 0; i < MACSEC_NUM_AN; ++i) { + macsec_sa = iter->tx_sa[i]; + if (!macsec_sa || !macsec_sa->active) + continue; + if (macsec_sa->macsec_obj_id == obj_id) + return macsec_sa; + } + } + + return NULL; +} + +static struct mlx5e_macsec_sa *get_macsec_rx_sa_from_obj_id(const struct mlx5e_macsec *macsec, + const u32 obj_id) +{ + const struct list_head *device_list, *sc_list; + struct mlx5e_macsec_rx_sc *mlx5e_rx_sc; + struct mlx5e_macsec_sa *macsec_sa; + struct mlx5e_macsec_device *iter; + int i; + + device_list = &macsec->macsec_device_list_head; + + list_for_each_entry(iter, device_list, macsec_device_list_element) { + sc_list = &iter->macsec_rx_sc_list_head; + list_for_each_entry(mlx5e_rx_sc, sc_list, rx_sc_list_element) { + for (i = 0; i < MACSEC_NUM_AN; ++i) { + macsec_sa = mlx5e_rx_sc->rx_sa[i]; + if (!macsec_sa || !macsec_sa->active) + continue; + if (macsec_sa->macsec_obj_id == obj_id) + return macsec_sa; + } + } + } + + return NULL; +} + +static void macsec_epn_update(struct mlx5e_macsec *macsec, struct mlx5_core_dev *mdev, + struct mlx5e_macsec_sa *sa, u32 obj_id, u32 mode_param) +{ + struct mlx5_macsec_obj_attrs attrs = {}; + struct mlx5e_macsec_aso_in in = {}; + + /* When the bottom of the replay protection window (mode_param) crosses 2^31 (half sequence + * number wraparound) hence mode_param > MLX5_MACSEC_EPN_SCOPE_MID the SW should update the + * esn_overlap to OLD (1). 
+ * When the bottom of the replay protection window (mode_param) crosses 2^32 (full sequence + * number wraparound) hence mode_param < MLX5_MACSEC_EPN_SCOPE_MID since it did a + * wraparound, the SW should update the esn_overlap to NEW (0), and increment the esn_msb. + */ + + if (mode_param < MLX5_MACSEC_EPN_SCOPE_MID) { + sa->epn_state.epn_msb++; + sa->epn_state.overlap = 0; + } else { + sa->epn_state.overlap = 1; + } + + macsec_build_accel_attrs(sa, &attrs); + mlx5e_macsec_modify_obj(mdev, &attrs, obj_id); + + /* Re-set EPN arm event */ + in.obj_id = obj_id; + in.mode = MLX5_MACSEC_EPN; + macsec_aso_set_arm_event(mdev, macsec, &in); +} + +static void macsec_async_event(struct work_struct *work) +{ + struct mlx5e_macsec_async_work *async_work; + struct mlx5e_macsec_aso_out out = {}; + struct mlx5e_macsec_aso_in in = {}; + struct mlx5e_macsec_sa *macsec_sa; + struct mlx5e_macsec *macsec; + struct mlx5_core_dev *mdev; + u32 obj_id; + + async_work = container_of(work, struct mlx5e_macsec_async_work, work); + macsec = async_work->macsec; + mutex_lock(&macsec->lock); + + mdev = async_work->mdev; + obj_id = async_work->obj_id; + macsec_sa = get_macsec_tx_sa_from_obj_id(macsec, obj_id); + if (!macsec_sa) { + macsec_sa = get_macsec_rx_sa_from_obj_id(macsec, obj_id); + if (!macsec_sa) { + mlx5_core_dbg(mdev, "MACsec SA is not found (SA object id %d)\n", obj_id); + goto out_async_work; + } + } + + /* Query MACsec ASO context */ + in.obj_id = obj_id; + macsec_aso_query(mdev, macsec, &in, &out); + + /* EPN case */ + if (macsec_sa->epn_state.epn_enabled && !(out.event_arm & MLX5E_ASO_EPN_ARM)) + macsec_epn_update(macsec, mdev, macsec_sa, obj_id, out.mode_param); + +out_async_work: + kfree(async_work); + mutex_unlock(&macsec->lock); +} + +static int macsec_obj_change_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5e_macsec *macsec = container_of(nb, struct mlx5e_macsec, nb); + struct mlx5e_macsec_async_work *async_work; + struct mlx5_eqe_obj_change *obj_change; + struct mlx5_eqe *eqe = data; + u16 obj_type; + u32 obj_id; + + if (event != MLX5_EVENT_TYPE_OBJECT_CHANGE_EVENT) + return NOTIFY_DONE; + + obj_change = &eqe->data.obj_change; + obj_type = be16_to_cpu(obj_change->obj_type); + obj_id = be32_to_cpu(obj_change->obj_id); + + if (obj_type != MLX5_GENERAL_OBJECT_TYPES_MACSEC) + return NOTIFY_DONE; + + async_work = kzalloc(sizeof(*async_work), GFP_ATOMIC); + if (!async_work) + return NOTIFY_DONE; + + async_work->macsec = macsec; + async_work->mdev = macsec->mdev; + async_work->obj_id = obj_id; + + INIT_WORK(&async_work->work, macsec_async_event); + + WARN_ON(!queue_work(macsec->wq, &async_work->work)); + + return NOTIFY_OK; +} + +static int mlx5e_macsec_aso_init(struct mlx5e_macsec_aso *aso, struct mlx5_core_dev *mdev) +{ + struct mlx5_aso *maso; + int err; + + err = mlx5_core_alloc_pd(mdev, &aso->pdn); + if (err) { + mlx5_core_err(mdev, + "MACsec offload: Failed to alloc pd for MACsec ASO, err=%d\n", + err); + return err; + } + + maso = mlx5_aso_create(mdev, aso->pdn); + if (IS_ERR(maso)) { + err = PTR_ERR(maso); + goto err_aso; + } + + err = mlx5e_macsec_aso_reg_mr(mdev, aso); + if (err) + goto err_aso_reg; + + mutex_init(&aso->aso_lock); + + aso->maso = maso; + + return 0; + +err_aso_reg: + mlx5_aso_destroy(maso); +err_aso: + mlx5_core_dealloc_pd(mdev, aso->pdn); + return err; +} + +static void mlx5e_macsec_aso_cleanup(struct mlx5e_macsec_aso *aso, struct mlx5_core_dev *mdev) +{ + if (!aso) + return; + + mlx5e_macsec_aso_dereg_mr(mdev, aso); + + 
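/* teardown mirrors mlx5e_macsec_aso_init() in reverse: deregister the MR, destroy the ASO instance, then release the PD */ +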
mlx5_aso_destroy(aso->maso); + + mlx5_core_dealloc_pd(mdev, aso->pdn); +} + +bool mlx5e_is_macsec_device(const struct mlx5_core_dev *mdev) +{ + if (!(MLX5_CAP_GEN_64(mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD)) + return false; + + if (!MLX5_CAP_GEN(mdev, log_max_dek)) + return false; + + if (!MLX5_CAP_MACSEC(mdev, log_max_macsec_offload)) + return false; + + if (!MLX5_CAP_FLOWTABLE_NIC_RX(mdev, macsec_decrypt) || + !MLX5_CAP_FLOWTABLE_NIC_RX(mdev, reformat_remove_macsec)) + return false; + + if (!MLX5_CAP_FLOWTABLE_NIC_TX(mdev, macsec_encrypt) || + !MLX5_CAP_FLOWTABLE_NIC_TX(mdev, reformat_add_macsec)) + return false; + + if (!MLX5_CAP_MACSEC(mdev, macsec_crypto_esp_aes_gcm_128_encrypt) && + !MLX5_CAP_MACSEC(mdev, macsec_crypto_esp_aes_gcm_256_encrypt)) + return false; + + if (!MLX5_CAP_MACSEC(mdev, macsec_crypto_esp_aes_gcm_128_decrypt) && + !MLX5_CAP_MACSEC(mdev, macsec_crypto_esp_aes_gcm_256_decrypt)) + return false; + + return true; +} + +void mlx5e_macsec_get_stats_fill(struct mlx5e_macsec *macsec, void *macsec_stats) +{ + mlx5e_macsec_fs_get_stats_fill(macsec->macsec_fs, macsec_stats); +} + +struct mlx5e_macsec_stats *mlx5e_macsec_get_stats(struct mlx5e_macsec *macsec) +{ + if (!macsec) + return NULL; + + return &macsec->stats; +} + +static const struct macsec_ops macsec_offload_ops = { + .mdo_add_txsa = mlx5e_macsec_add_txsa, + .mdo_upd_txsa = mlx5e_macsec_upd_txsa, + .mdo_del_txsa = mlx5e_macsec_del_txsa, + .mdo_add_rxsc = mlx5e_macsec_add_rxsc, + .mdo_upd_rxsc = mlx5e_macsec_upd_rxsc, + .mdo_del_rxsc = mlx5e_macsec_del_rxsc, + .mdo_add_rxsa = mlx5e_macsec_add_rxsa, + .mdo_upd_rxsa = mlx5e_macsec_upd_rxsa, + .mdo_del_rxsa = mlx5e_macsec_del_rxsa, + .mdo_add_secy = mlx5e_macsec_add_secy, + .mdo_upd_secy = mlx5e_macsec_upd_secy, + .mdo_del_secy = mlx5e_macsec_del_secy, +}; + +bool mlx5e_macsec_handle_tx_skb(struct mlx5e_macsec *macsec, struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + u32 fs_id; + + fs_id = mlx5e_macsec_get_sa_from_hashtable(&macsec->sci_hash, &md_dst->u.macsec_info.sci); + if (!fs_id) + goto err_out; + + return true; + +err_out: + dev_kfree_skb_any(skb); + return false; +} + +void mlx5e_macsec_tx_build_eseg(struct mlx5e_macsec *macsec, + struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + u32 fs_id; + + fs_id = mlx5e_macsec_get_sa_from_hashtable(&macsec->sci_hash, &md_dst->u.macsec_info.sci); + if (!fs_id) + return; + + eseg->flow_table_metadata = cpu_to_be32(MLX5_ETH_WQE_FT_META_MACSEC | fs_id << 2); +} + +void mlx5e_macsec_offload_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe) +{ + u32 macsec_meta_data = be32_to_cpu(cqe->ft_metadata); + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_macsec_rx_sc *rx_sc; + struct mlx5e_macsec *macsec; + u32 fs_id; + + macsec = priv->macsec; + if (!macsec) + return; + + fs_id = MLX5_MACSEC_RX_METADAT_HANDLE(macsec_meta_data); + + rcu_read_lock(); + rx_sc = xa_load(&macsec->sc_xarray, fs_id); + if (rx_sc) { + dst_hold(&rx_sc->md_dst->dst); + skb_dst_set(skb, &rx_sc->md_dst->dst); + } + + rcu_read_unlock(); +} + +void mlx5e_macsec_build_netdev(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + + if (!mlx5e_is_macsec_device(priv->mdev)) + return; + + /* Enable MACsec */ + mlx5_core_dbg(priv->mdev, "mlx5e: MACsec acceleration enabled\n"); + netdev->macsec_ops = &macsec_offload_ops; + netdev->features |= NETIF_F_HW_MACSEC; + 
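/* TX skbs are expected to carry a MACsec metadata_dst holding the SCI (see mlx5e_macsec_handle_tx_skb()), so keep dst entries on this netdev */ +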
netif_keep_dst(netdev); +} + +int mlx5e_macsec_init(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_macsec *macsec = NULL; + struct mlx5e_macsec_fs *macsec_fs; + int err; + + if (!mlx5e_is_macsec_device(priv->mdev)) { + mlx5_core_dbg(mdev, "Not a MACsec offload device\n"); + return 0; + } + + macsec = kzalloc(sizeof(*macsec), GFP_KERNEL); + if (!macsec) + return -ENOMEM; + + INIT_LIST_HEAD(&macsec->macsec_device_list_head); + mutex_init(&macsec->lock); + + err = rhashtable_init(&macsec->sci_hash, &rhash_sci); + if (err) { + mlx5_core_err(mdev, "MACsec offload: Failed to init SCI hash table, err=%d\n", + err); + goto err_hash; + } + + err = mlx5e_macsec_aso_init(&macsec->aso, priv->mdev); + if (err) { + mlx5_core_err(mdev, "MACsec offload: Failed to init aso, err=%d\n", err); + goto err_aso; + } + + macsec->wq = alloc_ordered_workqueue("mlx5e_macsec_%s", 0, priv->netdev->name); + if (!macsec->wq) { + err = -ENOMEM; + goto err_wq; + } + + xa_init_flags(&macsec->sc_xarray, XA_FLAGS_ALLOC1); + + priv->macsec = macsec; + + macsec->mdev = mdev; + + macsec_fs = mlx5e_macsec_fs_init(mdev, priv->netdev); + if (IS_ERR_OR_NULL(macsec_fs)) + goto err_out; + + macsec->nb.notifier_call = macsec_obj_change_event; + mlx5_notifier_register(mdev, &macsec->nb); + + macsec->macsec_fs = macsec_fs; + + mlx5_core_dbg(mdev, "MACsec attached to netdevice\n"); + + return 0; + +err_out: + destroy_workqueue(macsec->wq); +err_wq: + mlx5e_macsec_aso_cleanup(&macsec->aso, priv->mdev); +err_aso: + rhashtable_destroy(&macsec->sci_hash); +err_hash: + kfree(macsec); + priv->macsec = NULL; + return err; +} + +void mlx5e_macsec_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_macsec *macsec = priv->macsec; + struct mlx5_core_dev *mdev = priv->mdev; + + if (!macsec) + return; + + priv->macsec = NULL; + mlx5_notifier_unregister(mdev, &macsec->nb); + mlx5e_macsec_fs_cleanup(macsec->macsec_fs); + destroy_workqueue(macsec->wq); + mlx5e_macsec_aso_cleanup(&macsec->aso, mdev); + rhashtable_destroy(&macsec->sci_hash); + mutex_destroy(&macsec->lock); + kfree(macsec); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.h new file mode 100644 index 0000000..347380a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_EN_ACCEL_MACSEC_H__ +#define __MLX5_EN_ACCEL_MACSEC_H__ + +#ifdef CONFIG_MLX5_EN_MACSEC + +#include +#include +#include + +/* Bit31 - 30: MACsec marker, Bit15-0: MACsec id */ +#define MLX5_MACEC_RX_FS_ID_MAX USHRT_MAX /* Must be power of two */ +#define MLX5_MACSEC_RX_FS_ID_MASK MLX5_MACEC_RX_FS_ID_MAX +#define MLX5_MACSEC_METADATA_MARKER(metadata) ((((metadata) >> 30) & 0x3) == 0x1) +#define MLX5_MACSEC_RX_METADAT_HANDLE(metadata) ((metadata) & MLX5_MACSEC_RX_FS_ID_MASK) + +struct mlx5e_priv; +struct mlx5e_macsec; + +struct mlx5e_macsec_stats { + u64 macsec_rx_pkts; + u64 macsec_rx_bytes; + u64 macsec_rx_pkts_drop; + u64 macsec_rx_bytes_drop; + u64 macsec_tx_pkts; + u64 macsec_tx_bytes; + u64 macsec_tx_pkts_drop; + u64 macsec_tx_bytes_drop; +}; + +void mlx5e_macsec_build_netdev(struct mlx5e_priv *priv); +int mlx5e_macsec_init(struct mlx5e_priv *priv); +void mlx5e_macsec_cleanup(struct mlx5e_priv *priv); +bool mlx5e_macsec_handle_tx_skb(struct mlx5e_macsec *macsec, struct sk_buff *skb); +void mlx5e_macsec_tx_build_eseg(struct mlx5e_macsec *macsec, + struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg); + +static inline bool mlx5e_macsec_skb_is_offload(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + return md_dst && (md_dst->type == METADATA_MACSEC); +} + +static inline bool mlx5e_macsec_is_rx_flow(struct mlx5_cqe64 *cqe) +{ + return MLX5_MACSEC_METADATA_MARKER(be32_to_cpu(cqe->ft_metadata)); +} + +void mlx5e_macsec_offload_handle_rx_skb(struct net_device *netdev, struct sk_buff *skb, + struct mlx5_cqe64 *cqe); +bool mlx5e_is_macsec_device(const struct mlx5_core_dev *mdev); +void mlx5e_macsec_get_stats_fill(struct mlx5e_macsec *macsec, void *macsec_stats); +struct mlx5e_macsec_stats *mlx5e_macsec_get_stats(struct mlx5e_macsec *macsec); + +#else + +static inline void mlx5e_macsec_build_netdev(struct mlx5e_priv *priv) {} +static inline int mlx5e_macsec_init(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_macsec_cleanup(struct mlx5e_priv *priv) {} +static inline bool mlx5e_macsec_skb_is_offload(struct sk_buff *skb) { return false; } +static inline bool mlx5e_macsec_is_rx_flow(struct mlx5_cqe64 *cqe) { return false; } +static inline void mlx5e_macsec_offload_handle_rx_skb(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe) +{} +static inline bool mlx5e_is_macsec_device(const struct mlx5_core_dev *mdev) { return false; } +#endif /* CONFIG_MLX5_EN_MACSEC */ + +#endif /* __MLX5_ACCEL_EN_MACSEC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.c new file mode 100644 index 0000000..655133a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.c @@ -0,0 +1,2027 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include +#include +#include +#include +#include +#include "fs_core.h" +#include "en.h" +#include "en/fs.h" +#include "en_accel/macsec_fs.h" +#include "mlx5_core.h" + +/* MACsec TX flow steering */ +#define CRYPTO_NUM_MAXSEC_FTE BIT(15) +#define CRYPTO_TABLE_DEFAULT_RULE_GROUP_SIZE 1 + +#define TX_CRYPTO_TABLE_LEVEL 0 +#define TX_CRYPTO_TABLE_NUM_GROUPS 3 +#define TX_CRYPTO_TABLE_MKE_GROUP_SIZE 1 +#define TX_CRYPTO_TABLE_SA_GROUP_SIZE \ + (CRYPTO_NUM_MAXSEC_FTE - (TX_CRYPTO_TABLE_MKE_GROUP_SIZE + \ + CRYPTO_TABLE_DEFAULT_RULE_GROUP_SIZE)) +#define TX_CHECK_TABLE_LEVEL 1 +#define TX_CHECK_TABLE_NUM_FTE 2 +#define RX_CRYPTO_TABLE_LEVEL 0 +#define RX_CHECK_TABLE_LEVEL 1 +#define RX_ROCE_TABLE_LEVEL 2 +#define RX_CHECK_TABLE_NUM_FTE 3 +#define RX_ROCE_TABLE_NUM_FTE 2 +#define RX_CRYPTO_TABLE_NUM_GROUPS 3 +#define RX_CRYPTO_TABLE_SA_RULE_WITH_SCI_GROUP_SIZE \ + ((CRYPTO_NUM_MAXSEC_FTE - CRYPTO_TABLE_DEFAULT_RULE_GROUP_SIZE) / 2) +#define RX_CRYPTO_TABLE_SA_RULE_WITHOUT_SCI_GROUP_SIZE \ + (CRYPTO_NUM_MAXSEC_FTE - RX_CRYPTO_TABLE_SA_RULE_WITH_SCI_GROUP_SIZE) +#define RX_NUM_OF_RULES_PER_SA 2 + +#define RDMA_RX_ROCE_IP_TABLE_LEVEL 0 +#define RDMA_RX_ROCE_MACSEC_OP_TABLE_LEVEL 1 + +#define MLX5_MACSEC_TAG_LEN 8 /* SecTAG length with ethertype and without the optional SCI */ +#define MLX5_MACSEC_SECTAG_TCI_AN_FIELD_BITMASK 0x23 +#define MLX5_MACSEC_SECTAG_TCI_AN_FIELD_OFFSET 0x8 +#define MLX5_MACSEC_SECTAG_TCI_SC_FIELD_OFFSET 0x5 +#define MLX5_MACSEC_SECTAG_TCI_SC_FIELD_BIT (0x1 << MLX5_MACSEC_SECTAG_TCI_SC_FIELD_OFFSET) +#define MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI 0x8 +#define MLX5_SECTAG_HEADER_SIZE_WITH_SCI (MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI + MACSEC_SCI_LEN) + +/* MACsec RX flow steering */ +#define MLX5_ETH_WQE_FT_META_MACSEC_MASK 0x3E + +struct mlx5_sectag_header { + __be16 ethertype; + u8 tci_an; + u8 sl; + u32 pn; + u8 sci[MACSEC_SCI_LEN]; /* optional */ +} __packed; + +struct mlx5e_macsec_tx_rule { + struct mlx5_flow_handle *rule; + struct mlx5_pkt_reformat *pkt_reformat; + u32 fs_id; + + struct mlx5_flow_handle *roce_rule_tx; + struct mlx5_modify_hdr *roce_meta_modhdr; +}; + +struct mlx5e_macsec_tables { + struct mlx5e_flow_table ft_crypto; + struct mlx5_flow_handle *crypto_miss_rule; + + struct mlx5_flow_table *ft_check; + struct mlx5_flow_group *ft_check_group; + struct mlx5_fc *check_miss_rule_counter; + struct mlx5_flow_handle *check_miss_rule; + struct mlx5_fc *check_rule_counter; + + u32 refcnt; +}; + +struct mlx5e_macsec_tx { + struct mlx5_flow_handle *crypto_mke_rule; + struct mlx5_flow_handle *check_rule; + + struct ida tx_halloc; + + struct mlx5e_macsec_tables tables; + struct mlx5_flow_table *roce_ft_rdma_tx; + struct mutex roce_lock; /* serialize TX rules addition and deletion */ +}; + +struct mlx5e_macsec_rx_rule { + struct mlx5_flow_handle *rule[RX_NUM_OF_RULES_PER_SA]; + struct mlx5_modify_hdr *meta_modhdr; + + struct mlx5_flow_handle *roce_macsec_op_rule; + struct mlx5_flow_handle *roce_ip_rule; +}; + +struct mlx5e_macsec_miss { + struct mlx5_flow_group *group; + struct mlx5_flow_handle *rule; +}; + +struct mlx5e_macsec_rx_roce { + /* Flow table/rules in NIC domain, to check if it's a RoCE packet */ + struct mlx5_flow_group *nic_g; + struct mlx5_flow_table *nic_ft; + struct mlx5_flow_handle *nic_rule; + struct mlx5_modify_hdr *copy_modify_hdr; + struct mlx5e_macsec_miss roce_nic_miss; + + /* Flow table/rule in RDMA domain, to check dgid */ + struct mlx5_flow_table *ft_rdma_rx_ip_check; + struct mlx5_flow_table *ft_rdma_rx_macsec_op_check; + struct mlx5e_macsec_miss 
roce_rdma_macsec_op_miss; + + struct mutex lock; /* serialize RX rules addition and deletion */ +}; + +struct mlx5e_macsec_rx { + struct mlx5_flow_handle *check_rule[2]; + struct mlx5_pkt_reformat *check_rule_pkt_reformat[2]; + + struct mlx5e_macsec_tables tables; + struct mlx5e_macsec_rx_roce roce; +}; + +union mlx5e_macsec_rule { + struct mlx5e_macsec_tx_rule tx_rule; + struct mlx5e_macsec_rx_rule rx_rule; +}; + +struct mlx5e_macsec_fs { + struct mlx5_core_dev *mdev; + struct net_device *netdev; + struct mlx5e_macsec_tx *tx_fs; + struct mlx5e_macsec_rx *rx_fs; +}; + +static void macsec_fs_rdma_tx_destroy(struct mlx5e_macsec_tx *tx_fs) +{ + if (!tx_fs->roce_ft_rdma_tx) + return; + + mlx5_destroy_flow_table(tx_fs->roce_ft_rdma_tx); + tx_fs->roce_ft_rdma_tx = NULL; +} + +static void macsec_fs_tx_destroy(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + struct mlx5e_macsec_tables *tx_tables; + + macsec_fs_rdma_tx_destroy(tx_fs); + + tx_tables = &tx_fs->tables; + + /* Tx check table */ + if (tx_fs->check_rule) { + mlx5_del_flow_rules(tx_fs->check_rule); + tx_fs->check_rule = NULL; + } + + if (tx_tables->check_miss_rule) { + mlx5_del_flow_rules(tx_tables->check_miss_rule); + tx_tables->check_miss_rule = NULL; + } + + if (tx_tables->ft_check_group) { + mlx5_destroy_flow_group(tx_tables->ft_check_group); + tx_tables->ft_check_group = NULL; + } + + if (tx_tables->ft_check) { + mlx5_destroy_flow_table(tx_tables->ft_check); + tx_tables->ft_check = NULL; + } + + /* Tx crypto table */ + if (tx_fs->crypto_mke_rule) { + mlx5_del_flow_rules(tx_fs->crypto_mke_rule); + tx_fs->crypto_mke_rule = NULL; + } + + if (tx_tables->crypto_miss_rule) { + mlx5_del_flow_rules(tx_tables->crypto_miss_rule); + tx_tables->crypto_miss_rule = NULL; + } + + mlx5e_destroy_flow_table(&tx_tables->ft_crypto); +} + +static int macsec_fs_tx_create_crypto_table_groups(struct mlx5e_flow_table *ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int mclen = MLX5_ST_SZ_BYTES(fte_match_param); + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(TX_CRYPTO_TABLE_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) + return -ENOMEM; + in = kvzalloc(inlen, GFP_KERNEL); + + if (!in) { + kfree(ft->g); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + /* Flow Group for MKE match */ + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + + MLX5_SET_CFG(in, start_flow_index, ix); + ix += TX_CRYPTO_TABLE_MKE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Flow Group for SA rules */ + memset(in, 0, inlen); + memset(mc, 0, mclen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_MISC_PARAMETERS_2); + MLX5_SET(fte_match_param, mc, misc_parameters_2.metadata_reg_a, + MLX5_ETH_WQE_FT_META_MACSEC_MASK); + + MLX5_SET_CFG(in, start_flow_index, ix); + ix += TX_CRYPTO_TABLE_SA_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Flow Group for l2 traps */ + memset(in, 0, inlen); + memset(mc, 0, mclen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += CRYPTO_TABLE_DEFAULT_RULE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = 
mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + kvfree(in); + + return err; +} + +static struct mlx5_flow_table + *macsec_fs_auto_group_table_create(struct mlx5_flow_namespace *ns, int flags, + int level, int max_fte) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *fdb = NULL; + + /* reserve entry for the match all miss group and rule */ + ft_attr.autogroup.num_reserved_entries = 1; + ft_attr.autogroup.max_num_groups = 1; + ft_attr.prio = 0; + ft_attr.flags = flags; + ft_attr.level = level; + ft_attr.max_fte = max_fte; + + fdb = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + + return fdb; +} + +enum { + RDMA_TX_MACSEC_LEVEL = 0, +}; + +#define NIC_RDMA_BOTH_DIRS_CAPS (MLX5_FT_NIC_RX_2_NIC_RX_RDMA | MLX5_FT_NIC_TX_RDMA_2_NIC_TX) + +static bool is_macsec_roce_supported(struct mlx5_core_dev *mdev) +{ + if (((MLX5_CAP_GEN_2(mdev, flow_table_type_2_type) & + NIC_RDMA_BOTH_DIRS_CAPS) != NIC_RDMA_BOTH_DIRS_CAPS) || + !MLX5_CAP_FLOWTABLE_RDMA_TX(mdev, max_modify_header_actions)) + return false; + + return true; +} + +static int macsec_fs_tx_roce_create(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + int err; + + if (!is_macsec_roce_supported(macsec_fs->mdev)) { + netdev_err(macsec_fs->netdev, "Failed to init RoCE MACsec, capabilities not supported\n"); + return 0; + } + + ns = mlx5_get_flow_namespace(macsec_fs->mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC); + if (!ns) + return -ENOMEM; + + /* Tx RoCE crypto table */ + ft = macsec_fs_auto_group_table_create(ns, 0, RDMA_TX_MACSEC_LEVEL, + CRYPTO_NUM_MAXSEC_FTE); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(macsec_fs->netdev, "Failed to create MACsec RoCE Tx crypto table err(%d)\n", + err); + return err; + } + tx_fs->roce_ft_rdma_tx = ft; + + mutex_init(&tx_fs->roce_lock); + return 0; +} + +static int macsec_fs_tx_create(struct mlx5e_macsec_fs *macsec_fs) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + struct net_device *netdev = macsec_fs->netdev; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_destination dest = {}; + struct mlx5e_macsec_tables *tx_tables; + struct mlx5_flow_act flow_act = {}; + struct mlx5e_flow_table *ft_crypto; + struct mlx5_flow_table *flow_table; + struct mlx5_flow_group *flow_group; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + u32 *flow_group_in; + int err = 0; + + ns = mlx5_get_flow_namespace(macsec_fs->mdev, MLX5_FLOW_NAMESPACE_EGRESS_MACSEC); + if (!ns) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + goto out_spec; + + tx_tables = &tx_fs->tables; + ft_crypto = &tx_tables->ft_crypto; + + /* Tx crypto table */ + ft_attr.flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + ft_attr.level = TX_CRYPTO_TABLE_LEVEL; + ft_attr.max_fte = CRYPTO_NUM_MAXSEC_FTE; + + flow_table = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(flow_table)) { + err = PTR_ERR(flow_table); + netdev_err(netdev, "Failed to create MACsec Tx crypto table err(%d)\n", err); + goto out_flow_group; + } + ft_crypto->t = flow_table; + + /* Tx crypto table groups */ + err = 
macsec_fs_tx_create_crypto_table_groups(ft_crypto); + if (err) { + netdev_err(netdev, + "Failed to create default flow group for MACsec Tx crypto table err(%d)\n", + err); + goto err; + } + + /* Tx crypto table MKE rule - MKE packets shouldn't be offloaded */ + memset(&flow_act, 0, sizeof(flow_act)); + memset(spec, 0, sizeof(*spec)); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, ETH_P_PAE); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + rule = mlx5_add_flow_rules(ft_crypto->t, spec, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add MACsec TX MKE rule, err=%d\n", err); + goto err; + } + tx_fs->crypto_mke_rule = rule; + + /* Tx crypto table Default miss rule */ + memset(&flow_act, 0, sizeof(flow_act)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + rule = mlx5_add_flow_rules(ft_crypto->t, NULL, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add MACsec Tx table default miss rule %d\n", err); + goto err; + } + tx_tables->crypto_miss_rule = rule; + + /* Tx check table */ + flow_table = macsec_fs_auto_group_table_create(ns, 0, TX_CHECK_TABLE_LEVEL, + TX_CHECK_TABLE_NUM_FTE); + if (IS_ERR(flow_table)) { + err = PTR_ERR(flow_table); + netdev_err(netdev, "Failed to create MACsec TX check table, err(%d)\n", err); + goto err; + } + tx_tables->ft_check = flow_table; + + /* Tx check table Default miss group/rule */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_table->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_table->max_fte - 1); + flow_group = mlx5_create_flow_group(tx_tables->ft_check, flow_group_in); + if (IS_ERR(flow_group)) { + err = PTR_ERR(flow_group); + netdev_err(netdev, + "Failed to create default flow group for MACsec Tx check table err(%d)\n", + err); + goto err; + } + tx_tables->ft_check_group = flow_group; + + /* Tx check table default drop rule */ + memset(&dest, 0, sizeof(struct mlx5_flow_destination)); + memset(&flow_act, 0, sizeof(flow_act)); + dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest.counter_id = mlx5_fc_id(tx_tables->check_miss_rule_counter); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; + rule = mlx5_add_flow_rules(tx_tables->ft_check, NULL, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add MACsec tx check drop rule, err(%d)\n", err); + goto err; + } + tx_tables->check_miss_rule = rule; + + /* Tx check table rule */ + memset(spec, 0, sizeof(struct mlx5_flow_spec)); + memset(&dest, 0, sizeof(struct mlx5_flow_destination)); + memset(&flow_act, 0, sizeof(flow_act)); + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_c_4); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_4, 0); + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + + flow_act.flags = FLOW_ACT_NO_APPEND; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW | MLX5_FLOW_CONTEXT_ACTION_COUNT; + dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest.counter_id = mlx5_fc_id(tx_tables->check_rule_counter); + rule = mlx5_add_flow_rules(tx_tables->ft_check, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add
MACsec check rule, err=%d\n", err); + goto err; + } + tx_fs->check_rule = rule; + + err = macsec_fs_tx_roce_create(macsec_fs); + if (err) + goto err; + + goto out_flow_group; + +err: + macsec_fs_tx_destroy(macsec_fs); +out_flow_group: + kvfree(flow_group_in); +out_spec: + kvfree(spec); + return err; +} + +static int macsec_fs_tx_ft_get(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + struct mlx5e_macsec_tables *tx_tables; + int err = 0; + + tx_tables = &tx_fs->tables; + if (tx_tables->refcnt) + goto out; + + err = macsec_fs_tx_create(macsec_fs); + if (err) + return err; + +out: + tx_tables->refcnt++; + return err; +} + +static void macsec_fs_tx_ft_put(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_tables *tx_tables = &macsec_fs->tx_fs->tables; + + if (--tx_tables->refcnt) + return; + + macsec_fs_tx_destroy(macsec_fs); +} + +static int macsec_fs_tx_setup_fte(struct mlx5e_macsec_fs *macsec_fs, + struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + u32 macsec_obj_id, + u32 *fs_id) +{ + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + int err = 0; + u32 id; + + err = ida_alloc_range(&tx_fs->tx_halloc, 1, + MLX5_MACSEC_NUM_OF_SUPPORTED_INTERFACES, + GFP_KERNEL); + if (err < 0) + return err; + + id = err; + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + + /* Metadata match */ + MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_a, + MLX5_ETH_WQE_FT_META_MACSEC_MASK); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_a, + MLX5_ETH_WQE_FT_META_MACSEC | id << 2); + + *fs_id = id; + flow_act->crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC; + flow_act->crypto.obj_id = macsec_obj_id; + + mlx5_core_dbg(macsec_fs->mdev, "Tx fte: macsec obj_id %u, fs_id %u\n", macsec_obj_id, id); + return 0; +} + +static void macsec_fs_tx_create_sectag_header(const struct macsec_context *ctx, + char *reformatbf, + size_t *reformat_size) +{ + const struct macsec_secy *secy = ctx->secy; + bool sci_present = macsec_send_sci(secy); + struct mlx5_sectag_header sectag = {}; + const struct macsec_tx_sc *tx_sc; + + tx_sc = &secy->tx_sc; + sectag.ethertype = htons(ETH_P_MACSEC); + + if (sci_present) { + sectag.tci_an |= MACSEC_TCI_SC; + memcpy(&sectag.sci, &secy->sci, + sizeof(sectag.sci)); + } else { + if (tx_sc->end_station) + sectag.tci_an |= MACSEC_TCI_ES; + if (tx_sc->scb) + sectag.tci_an |= MACSEC_TCI_SCB; + } + + /* With GCM, C/E clear for !encrypt, both set for encrypt */ + if (tx_sc->encrypt) + sectag.tci_an |= MACSEC_TCI_CONFID; + else if (secy->icv_len != MACSEC_DEFAULT_ICV_LEN) + sectag.tci_an |= MACSEC_TCI_C; + + sectag.tci_an |= tx_sc->encoding_sa; + + *reformat_size = MLX5_MACSEC_TAG_LEN + (sci_present ? MACSEC_SCI_LEN : 0); + + memcpy(reformatbf, &sectag, *reformat_size); +} + +static void macsec_fs_tx_del_rule(struct mlx5e_macsec_fs *macsec_fs, + struct mlx5e_macsec_tx_rule *tx_rule) +{ + if (tx_rule->rule) { + mlx5_del_flow_rules(tx_rule->rule); + tx_rule->rule = NULL; + } + + if (tx_rule->pkt_reformat) { + mlx5_packet_reformat_dealloc(macsec_fs->mdev, tx_rule->pkt_reformat); + tx_rule->pkt_reformat = NULL; + } + + if (tx_rule->fs_id) { + ida_free(&macsec_fs->tx_fs->tx_halloc, tx_rule->fs_id); + tx_rule->fs_id = 0; + } + + kfree(tx_rule); + + macsec_fs_tx_ft_put(macsec_fs); +} + +static union mlx5e_macsec_rule * +macsec_fs_tx_add_rule(struct mlx5e_macsec_fs *macsec_fs, + const struct macsec_context *macsec_ctx, + struct mlx5_macsec_rule_attrs *attrs, + u32 *sa_fs_id) +{ + char reformatbf[MLX5_MACSEC_TAG_LEN + MACSEC_SCI_LEN]; + struct mlx5_pkt_reformat_params reformat_params = {}; + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + struct net_device *netdev = macsec_fs->netdev; + union mlx5e_macsec_rule *macsec_rule = NULL; + struct mlx5_flow_destination dest = {}; + struct mlx5e_macsec_tables *tx_tables; + struct mlx5e_macsec_tx_rule *tx_rule; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + size_t reformat_size; + int err = 0; + u32 fs_id; + + tx_tables = &tx_fs->tables; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return NULL; + + err = macsec_fs_tx_ft_get(macsec_fs); + if (err) + goto out_spec; + + macsec_rule = kzalloc(sizeof(*macsec_rule), GFP_KERNEL); + if (!macsec_rule) { + macsec_fs_tx_ft_put(macsec_fs); + goto out_spec; + } + + tx_rule = &macsec_rule->tx_rule; + + /* Tx crypto table crypto rule */ + macsec_fs_tx_create_sectag_header(macsec_ctx, reformatbf, &reformat_size); + + reformat_params.type = MLX5_REFORMAT_TYPE_ADD_MACSEC; + reformat_params.size = reformat_size; + reformat_params.data = reformatbf; + flow_act.pkt_reformat = mlx5_packet_reformat_alloc(macsec_fs->mdev, + &reformat_params, + MLX5_FLOW_NAMESPACE_EGRESS_MACSEC); + if (IS_ERR(flow_act.pkt_reformat)) { + err = PTR_ERR(flow_act.pkt_reformat); + netdev_err(netdev, "Failed to allocate MACsec Tx reformat context err=%d\n", err); + goto err; + } + tx_rule->pkt_reformat = flow_act.pkt_reformat; + + err = macsec_fs_tx_setup_fte(macsec_fs, spec, &flow_act, attrs->macsec_obj_id, &fs_id); + if (err) { + netdev_err(netdev, + "Failed to add packet reformat for MACsec TX crypto rule, err=%d\n", + err); + goto err; + } + + tx_rule->fs_id = fs_id; + *sa_fs_id = fs_id; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = tx_tables->ft_check; + rule = mlx5_add_flow_rules(tx_tables->ft_crypto.t, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add MACsec TX crypto rule, err=%d\n", err); + goto err; + } + tx_rule->rule = rule; + + goto out_spec; + +err: + macsec_fs_tx_del_rule(macsec_fs, tx_rule); + macsec_rule = NULL; +out_spec: + kvfree(spec); + + return macsec_rule; +} + +static void macsec_fs_tx_cleanup(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + struct mlx5_core_dev *mdev = macsec_fs->mdev; + struct mlx5e_macsec_tables *tx_tables; + + if (!tx_fs) + return; + + tx_tables = &tx_fs->tables; + if (tx_tables->refcnt) { + netdev_err(macsec_fs->netdev, + "Can't destroy MACsec offload tx_fs, refcnt(%u)
isn't 0\n", + tx_tables->refcnt); + return; + } + + ida_destroy(&tx_fs->tx_halloc); + + if (tx_tables->check_miss_rule_counter) { + mlx5_fc_destroy(mdev, tx_tables->check_miss_rule_counter); + tx_tables->check_miss_rule_counter = NULL; + } + + if (tx_tables->check_rule_counter) { + mlx5_fc_destroy(mdev, tx_tables->check_rule_counter); + tx_tables->check_rule_counter = NULL; + } + + kfree(tx_fs); + macsec_fs->tx_fs = NULL; +} + +static int macsec_fs_tx_init(struct mlx5e_macsec_fs *macsec_fs) +{ + struct net_device *netdev = macsec_fs->netdev; + struct mlx5_core_dev *mdev = macsec_fs->mdev; + struct mlx5e_macsec_tables *tx_tables; + struct mlx5e_macsec_tx *tx_fs; + struct mlx5_fc *flow_counter; + int err; + + tx_fs = kzalloc(sizeof(*tx_fs), GFP_KERNEL); + if (!tx_fs) + return -ENOMEM; + + tx_tables = &tx_fs->tables; + + flow_counter = mlx5_fc_create(mdev, false); + if (IS_ERR(flow_counter)) { + err = PTR_ERR(flow_counter); + netdev_err(netdev, + "Failed to create MACsec Tx encrypt flow counter, err(%d)\n", + err); + goto err_encrypt_counter; + } + tx_tables->check_rule_counter = flow_counter; + + flow_counter = mlx5_fc_create(mdev, false); + if (IS_ERR(flow_counter)) { + err = PTR_ERR(flow_counter); + netdev_err(netdev, + "Failed to create MACsec Tx drop flow counter, err(%d)\n", + err); + goto err_drop_counter; + } + tx_tables->check_miss_rule_counter = flow_counter; + + ida_init(&tx_fs->tx_halloc); + + macsec_fs->tx_fs = tx_fs; + + return 0; + +err_drop_counter: + mlx5_fc_destroy(mdev, tx_tables->check_rule_counter); + tx_tables->check_rule_counter = NULL; + +err_encrypt_counter: + kfree(tx_fs); + macsec_fs->tx_fs = NULL; + + return err; +} + +static void macsec_fs_rdma_rx_destroy(struct mlx5e_macsec_rx_roce *roce, struct mlx5_core_dev *mdev) +{ + if (!roce->nic_ft) + return; + + mlx5_del_flow_rules(roce->roce_nic_miss.rule); + mlx5_del_flow_rules(roce->nic_rule); + mlx5_modify_header_dealloc(mdev, roce->copy_modify_hdr); + roce->copy_modify_hdr = NULL; + mlx5_destroy_flow_group(roce->roce_nic_miss.group); + mlx5_destroy_flow_group(roce->nic_g); + mlx5_destroy_flow_table(roce->nic_ft); + roce->nic_ft = NULL; + + mlx5_del_flow_rules(roce->roce_rdma_macsec_op_miss.rule); + mlx5_destroy_flow_group(roce->roce_rdma_macsec_op_miss.group); + mlx5_destroy_flow_table(roce->ft_rdma_rx_macsec_op_check); + mlx5_destroy_flow_table(roce->ft_rdma_rx_ip_check); + roce->ft_rdma_rx_macsec_op_check = NULL; + roce->ft_rdma_rx_ip_check = NULL; +} + +static void macsec_fs_rx_destroy(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + struct mlx5e_macsec_tables *rx_tables; + int i; + + /* Rx check table */ + for (i = 1; i >= 0; --i) { + if (rx_fs->check_rule[i]) { + mlx5_del_flow_rules(rx_fs->check_rule[i]); + rx_fs->check_rule[i] = NULL; + } + + if (rx_fs->check_rule_pkt_reformat[i]) { + mlx5_packet_reformat_dealloc(macsec_fs->mdev, + rx_fs->check_rule_pkt_reformat[i]); + rx_fs->check_rule_pkt_reformat[i] = NULL; + } + } + + rx_tables = &rx_fs->tables; + + if (rx_tables->check_miss_rule) { + mlx5_del_flow_rules(rx_tables->check_miss_rule); + rx_tables->check_miss_rule = NULL; + } + + if (rx_tables->ft_check_group) { + mlx5_destroy_flow_group(rx_tables->ft_check_group); + rx_tables->ft_check_group = NULL; + } + + if (rx_tables->ft_check) { + mlx5_destroy_flow_table(rx_tables->ft_check); + rx_tables->ft_check = NULL; + } + + /* Rx crypto table */ + if (rx_tables->crypto_miss_rule) { + mlx5_del_flow_rules(rx_tables->crypto_miss_rule); + rx_tables->crypto_miss_rule = NULL; + 
} + + mlx5e_destroy_flow_table(&rx_tables->ft_crypto); + + macsec_fs_rdma_rx_destroy(&macsec_fs->rx_fs->roce, macsec_fs->mdev); +} + +static int macsec_fs_rx_create_crypto_table_groups(struct mlx5e_flow_table *ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int mclen = MLX5_ST_SZ_BYTES(fte_match_param); + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(RX_CRYPTO_TABLE_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) + return -ENOMEM; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + kfree(ft->g); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + /* Flow group for SA rule with SCI */ + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS | + MLX5_MATCH_MISC_PARAMETERS_5); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + + MLX5_SET(fte_match_param, mc, misc_parameters_5.macsec_tag_0, + MLX5_MACSEC_SECTAG_TCI_AN_FIELD_BITMASK << + MLX5_MACSEC_SECTAG_TCI_AN_FIELD_OFFSET); + MLX5_SET_TO_ONES(fte_match_param, mc, misc_parameters_5.macsec_tag_2); + MLX5_SET_TO_ONES(fte_match_param, mc, misc_parameters_5.macsec_tag_3); + + MLX5_SET_CFG(in, start_flow_index, ix); + ix += RX_CRYPTO_TABLE_SA_RULE_WITH_SCI_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Flow group for SA rule without SCI */ + memset(in, 0, inlen); + memset(mc, 0, mclen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS | + MLX5_MATCH_MISC_PARAMETERS_5); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.smac_15_0); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + + MLX5_SET(fte_match_param, mc, misc_parameters_5.macsec_tag_0, + MLX5_MACSEC_SECTAG_TCI_AN_FIELD_BITMASK << MLX5_MACSEC_SECTAG_TCI_AN_FIELD_OFFSET); + + MLX5_SET_CFG(in, start_flow_index, ix); + ix += RX_CRYPTO_TABLE_SA_RULE_WITHOUT_SCI_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + /* Flow Group for l2 traps */ + memset(in, 0, inlen); + memset(mc, 0, mclen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += CRYPTO_TABLE_DEFAULT_RULE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + kvfree(in); + + return err; +} + +static int macsec_fs_rx_create_check_decap_rule(struct mlx5e_macsec_fs *macsec_fs, + struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_spec *spec, + int reformat_param_size) +{ + int rule_index = (reformat_param_size == MLX5_SECTAG_HEADER_SIZE_WITH_SCI) ? 
0 : 1; + u8 mlx5_reformat_buf[MLX5_SECTAG_HEADER_SIZE_WITH_SCI]; + struct mlx5_pkt_reformat_params reformat_params = {}; + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + struct net_device *netdev = macsec_fs->netdev; + struct mlx5_flow_destination roce_dest[2]; + struct mlx5e_macsec_tables *rx_tables; + struct mlx5_flow_handle *rule; + int err = 0, dstn = 0; + + rx_tables = &rx_fs->tables; + + /* Rx check table decap 16B rule */ + memset(dest, 0, sizeof(*dest)); + memset(flow_act, 0, sizeof(*flow_act)); + memset(spec, 0, sizeof(*spec)); + + reformat_params.type = MLX5_REFORMAT_TYPE_DEL_MACSEC; + reformat_params.size = reformat_param_size; + reformat_params.data = mlx5_reformat_buf; + flow_act->pkt_reformat = mlx5_packet_reformat_alloc(macsec_fs->mdev, + &reformat_params, + MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC); + if (IS_ERR(flow_act->pkt_reformat)) { + err = PTR_ERR(flow_act->pkt_reformat); + netdev_err(netdev, "Failed to allocate MACsec Rx reformat context err=%d\n", err); + return err; + } + rx_fs->check_rule_pkt_reformat[rule_index] = flow_act->pkt_reformat; + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + /* MACsec syndrome match */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters_2.macsec_syndrome); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.macsec_syndrome, 0); + /* ASO return reg syndrome match */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_c_4); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_4, 0); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_5; + /* Sectag TCI SC present bit*/ + MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_5.macsec_tag_0, + MLX5_MACSEC_SECTAG_TCI_SC_FIELD_BIT << MLX5_MACSEC_SECTAG_TCI_AN_FIELD_OFFSET); + + if (reformat_param_size == MLX5_SECTAG_HEADER_SIZE_WITH_SCI) + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_5.macsec_tag_0, + MLX5_MACSEC_SECTAG_TCI_SC_FIELD_BIT << + MLX5_MACSEC_SECTAG_TCI_AN_FIELD_OFFSET); + + flow_act->flags = FLOW_ACT_NO_APPEND; + + if (rx_fs->roce.nic_ft) { + flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + roce_dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + roce_dest[dstn].ft = rx_fs->roce.nic_ft; + dstn++; + } else { + flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + } + + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + roce_dest[dstn].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + roce_dest[dstn].counter_id = mlx5_fc_id(rx_tables->check_rule_counter); + rule = mlx5_add_flow_rules(rx_tables->ft_check, spec, flow_act, roce_dest, dstn + 1); + + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add MACsec Rx check rule, err=%d\n", err); + return err; + } + + rx_fs->check_rule[rule_index] = rule; + + return 0; +} + +static int macsec_fs_rx_roce_miss_create(struct net_device *netdev, + struct mlx5e_macsec_rx_roce *roce) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_group *flow_group; + struct mlx5_flow_handle *rule; + u32 *flow_group_in; + int err; + + flow_group_in = kvzalloc(MLX5_ST_SZ_BYTES(create_flow_group_in), GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + /* IP check ft has no miss rule since we use default miss action which is go to next PRIO */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, + roce->ft_rdma_rx_macsec_op_check->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, 
end_flow_index, + roce->ft_rdma_rx_macsec_op_check->max_fte - 1); + flow_group = mlx5_create_flow_group(roce->ft_rdma_rx_macsec_op_check, flow_group_in); + if (IS_ERR(flow_group)) { + err = PTR_ERR(flow_group); + netdev_err(netdev, + "Failed to create miss flow group for MACsec RoCE MACSEC_OP_CHECK table err(%d)\n", + err); + goto macsec_op_miss_group_err; + } + roce->roce_rdma_macsec_op_miss.group = flow_group; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + rule = mlx5_add_flow_rules(roce->ft_rdma_rx_macsec_op_check, NULL, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add miss rule to MACsec RoCE MACSEC_OP_CHECK table err(%d)\n", + err); + goto macsec_op_rule_err; + } + roce->roce_rdma_macsec_op_miss.rule = rule; + + kvfree(flow_group_in); + return 0; + +macsec_op_rule_err: + mlx5_destroy_flow_group(roce->roce_rdma_macsec_op_miss.group); +macsec_op_miss_group_err: + kvfree(flow_group_in); + return err; +} + +#define MLX5E_RX_ROCE_GROUP_SIZE BIT(0) + +static int create_nic_ft_udp_group_and_rules(struct mlx5e_macsec_fs *macsec_fs, + struct mlx5e_macsec_rx_roce *roce) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct net_device *netdev = macsec_fs->netdev; + struct mlx5_flow_destination dst = {}; + struct mlx5_modify_hdr *modify_hdr; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + struct mlx5_flow_group *g; + void *outer_headers_c; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + in = kvzalloc(MLX5_ST_SZ_BYTES(create_flow_group_in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_dport); + + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_RX_ROCE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + g = mlx5_create_flow_group(roce->nic_ft, in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + netdev_err(netdev, "Failed to create main flow group for MACsec RoCE NIC UDP table err(%d)\n", + err); + goto err_udp_group; + } + roce->nic_g = g; + + memset(in, 0, MLX5_ST_SZ_BYTES(create_flow_group_in)); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_RX_ROCE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + g = mlx5_create_flow_group(roce->nic_ft, in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + netdev_err(netdev, "Failed to create miss flow group for MACsec RoCE NIC UDP table err(%d)\n", + err); + goto err_udp_miss_group; + } + roce->roce_nic_miss.group = g; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto fail_alloc; + } + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.ip_protocol, IPPROTO_UDP); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.udp_dport); + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.udp_dport, ROCE_V2_UDP_DPORT); + + MLX5_SET(copy_action_in, action, action_type, MLX5_ACTION_TYPE_COPY); + MLX5_SET(copy_action_in, action, src_field, MLX5_ACTION_IN_FIELD_METADATA_REG_B); + MLX5_SET(copy_action_in, action, src_offset, 0); + MLX5_SET(copy_action_in, action, length, 
32); + MLX5_SET(copy_action_in, action, dst_field, MLX5_ACTION_IN_FIELD_METADATA_REG_C_5); + MLX5_SET(copy_action_in, action, dst_offset, 0); + + modify_hdr = mlx5_modify_header_alloc(macsec_fs->mdev, MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC, + 1, action); + + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + netdev_err(netdev, + "Fail to alloc macsec copy modify_header_id err(%d)\n", err); + goto fail_alloc_hdr; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_act.modify_hdr = modify_hdr; + dst.type = MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; + dst.ft = roce->ft_rdma_rx_ip_check; + rule = mlx5_add_flow_rules(roce->nic_ft, spec, &flow_act, &dst, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add rule to MACsec RoCE NIC UDP table err(%d)\n", + err); + goto fail_add_rule; + } + roce->nic_rule = rule; + roce->copy_modify_hdr = modify_hdr; + + memset(&flow_act, 0, sizeof(flow_act)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + rule = mlx5_add_flow_rules(roce->nic_ft, NULL, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add miss rule to MACsec RoCE NIC UDP table err(%d)\n", + err); + goto fail_add_rule2; + } + roce->roce_nic_miss.rule = rule; + + kvfree(in); + kvfree(spec); + return 0; + +fail_add_rule2: + mlx5_del_flow_rules(roce->nic_rule); +fail_add_rule: + mlx5_modify_header_dealloc(macsec_fs->mdev, modify_hdr); +fail_alloc_hdr: + kvfree(spec); +fail_alloc: + mlx5_destroy_flow_group(roce->roce_nic_miss.group); +err_udp_miss_group: + mlx5_destroy_flow_group(roce->nic_g); +err_udp_group: + kvfree(in); + return err; +} + +static int macsec_fs_rx_roce_create(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + int err = 0; + + if (!is_macsec_roce_supported(macsec_fs->mdev)) { + netdev_err(macsec_fs->netdev, "Failed to init RoCE MACsec, capabilities not supported\n"); + return 0; + } + + ns = mlx5_get_flow_namespace(macsec_fs->mdev, + MLX5_FLOW_NAMESPACE_RDMA_RX_MACSEC); + if (!ns) + return -ENOMEM; + + ft = macsec_fs_auto_group_table_create(ns, 0, RDMA_RX_ROCE_IP_TABLE_LEVEL, + CRYPTO_NUM_MAXSEC_FTE); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(macsec_fs->netdev, + "Failed to create MACsec rdma_rx_ip_check roce table err(%d)\n", err); + return err; + } + rx_fs->roce.ft_rdma_rx_ip_check = ft; + + ft = macsec_fs_auto_group_table_create(ns, 0, RDMA_RX_ROCE_MACSEC_OP_TABLE_LEVEL, + CRYPTO_NUM_MAXSEC_FTE); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(macsec_fs->netdev, + "Failed to create MACsec rdma_rx_macsec_op_check roce table err(%d)\n", + err); + goto fail_macsec_op; + } + rx_fs->roce.ft_rdma_rx_macsec_op_check = ft; + + err = macsec_fs_rx_roce_miss_create(macsec_fs->netdev, &rx_fs->roce); + if (err) + goto fail_miss_create; + + ns = mlx5_get_flow_namespace(macsec_fs->mdev, + MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC); + if (!ns) { + err = -EOPNOTSUPP; + goto fail_ns; + } + + ft_attr.level = RX_ROCE_TABLE_LEVEL; + ft_attr.max_fte = RX_ROCE_TABLE_NUM_FTE; + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_err(macsec_fs->netdev, + "Failed to create MACsec Rx roce NIC table err(%d)\n", err); + goto fail_ns; + } + rx_fs->roce.nic_ft = ft; + + err = create_nic_ft_udp_group_and_rules(macsec_fs, &rx_fs->roce); + if (err) + goto udp_ft_fail; + + 
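+	/* The RoCE RX chain is now wired up: the NIC-domain UDP table matches
+	 * RoCE v2 traffic (IPPROTO_UDP, destination port ROCE_V2_UDP_DPORT),
+	 * copies the MACsec metadata from REG_B into REG_C_5 and forwards to
+	 * the RDMA-domain IP-check table. Per-SA rules installed later by
+	 * macsec_fs_add_roce_rule_rx() steer matching flows from there to the
+	 * MACsec-operation check table created above.
+	 */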
mutex_init(&rx_fs->roce.lock); + return 0; + +udp_ft_fail: + mlx5_destroy_flow_table(rx_fs->roce.nic_ft); + rx_fs->roce.nic_ft = NULL; +fail_ns: + mlx5_del_flow_rules(rx_fs->roce.roce_rdma_macsec_op_miss.rule); + mlx5_destroy_flow_group(rx_fs->roce.roce_rdma_macsec_op_miss.group); +fail_miss_create: + mlx5_destroy_flow_table(rx_fs->roce.ft_rdma_rx_macsec_op_check); + rx_fs->roce.ft_rdma_rx_macsec_op_check = NULL; +fail_macsec_op: + mlx5_destroy_flow_table(rx_fs->roce.ft_rdma_rx_ip_check); + rx_fs->roce.ft_rdma_rx_ip_check = NULL; + return err; +} + +static int macsec_fs_rx_create(struct mlx5e_macsec_fs *macsec_fs) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + struct net_device *netdev = macsec_fs->netdev; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_destination dest = {}; + struct mlx5e_macsec_tables *rx_tables; + struct mlx5e_flow_table *ft_crypto; + struct mlx5_flow_table *flow_table; + struct mlx5_flow_group *flow_group; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + u32 *flow_group_in; + int err = 0; + + ns = mlx5_get_flow_namespace(macsec_fs->mdev, MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC); + if (!ns) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + goto free_spec; + + rx_tables = &rx_fs->tables; + ft_crypto = &rx_tables->ft_crypto; + + /* Rx crypto table */ + ft_attr.level = RX_CRYPTO_TABLE_LEVEL; + ft_attr.max_fte = CRYPTO_NUM_MAXSEC_FTE; + + flow_table = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(flow_table)) { + err = PTR_ERR(flow_table); + netdev_err(netdev, "Failed to create MACsec Rx crypto table err(%d)\n", err); + goto out_flow_group; + } + ft_crypto->t = flow_table; + + /* Rx crypto table groups */ + err = macsec_fs_rx_create_crypto_table_groups(ft_crypto); + if (err) { + netdev_err(netdev, + "Failed to create default flow group for MACsec Rx crypto table err(%d)\n", + err); + goto err; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + rule = mlx5_add_flow_rules(ft_crypto->t, NULL, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, + "Failed to add MACsec Rx crypto table default miss rule %d\n", + err); + goto err; + } + rx_tables->crypto_miss_rule = rule; + + /* Rx check table */ + flow_table = macsec_fs_auto_group_table_create(ns, + MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT, + RX_CHECK_TABLE_LEVEL, + RX_CHECK_TABLE_NUM_FTE); + if (IS_ERR(flow_table)) { + err = PTR_ERR(flow_table); + netdev_err(netdev, "Failed to create MACsec RX check table, err(%d)\n", err); + goto err; + } + rx_tables->ft_check = flow_table; + + /* Rx check table Default miss group/rule */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_table->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_table->max_fte - 1); + flow_group = mlx5_create_flow_group(rx_tables->ft_check, flow_group_in); + if (IS_ERR(flow_group)) { + err = PTR_ERR(flow_group); + netdev_err(netdev, + "Failed to create default flow group for MACsec Rx check table err(%d)\n", + err); + goto err; + } + rx_tables->ft_check_group = flow_group; + + /* Rx check table default drop rule */ + memset(&flow_act, 0, sizeof(flow_act)); + + dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest.counter_id = mlx5_fc_id(rx_tables->check_miss_rule_counter); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; + rule = mlx5_add_flow_rules(rx_tables->ft_check, NULL, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, "Failed to add MACsec Rx check drop rule, err(%d)\n", err); + goto err; + } + rx_tables->check_miss_rule = rule; + + err = macsec_fs_rx_roce_create(macsec_fs); + if (err) + goto err; + + /* Rx check table decap rules */ + err = macsec_fs_rx_create_check_decap_rule(macsec_fs, &dest, &flow_act, spec, + MLX5_SECTAG_HEADER_SIZE_WITH_SCI); + if (err) + goto err_with_roce; + + err = macsec_fs_rx_create_check_decap_rule(macsec_fs, &dest, &flow_act, spec, + MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI); + if (err) + goto err_with_roce; + + goto out_flow_group; + +err_with_roce: + macsec_fs_rdma_rx_destroy(&macsec_fs->rx_fs->roce, macsec_fs->mdev); +err: + macsec_fs_rx_destroy(macsec_fs); +out_flow_group: + kvfree(flow_group_in); +free_spec: + kvfree(spec); + return err; +} + +static int macsec_fs_rx_ft_get(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_tables *rx_tables = &macsec_fs->rx_fs->tables; + int err = 0; + + if (rx_tables->refcnt) + goto out; + + err = macsec_fs_rx_create(macsec_fs); + if (err) + return err; + +out: + rx_tables->refcnt++; + return err; +} + +static void macsec_fs_rx_ft_put(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_tables *rx_tables = &macsec_fs->rx_fs->tables; + + if (--rx_tables->refcnt) + return; + + macsec_fs_rx_destroy(macsec_fs); +} + +static void macsec_fs_rx_del_rule(struct mlx5e_macsec_fs *macsec_fs, + struct mlx5e_macsec_rx_rule *rx_rule) +{ + int i; + + for (i = 0; i < RX_NUM_OF_RULES_PER_SA; ++i) { + if (rx_rule->rule[i]) { + mlx5_del_flow_rules(rx_rule->rule[i]); + rx_rule->rule[i] = NULL; + } + } + + if (rx_rule->meta_modhdr) { + mlx5_modify_header_dealloc(macsec_fs->mdev, rx_rule->meta_modhdr); + rx_rule->meta_modhdr = NULL; + } + + kfree(rx_rule); + + macsec_fs_rx_ft_put(macsec_fs); +} + +static void macsec_fs_rx_setup_fte(struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_macsec_rule_attrs *attrs, + bool sci_present) +{ + u8 tci_an = (sci_present << MLX5_MACSEC_SECTAG_TCI_SC_FIELD_OFFSET) | attrs->assoc_num; + struct mlx5_flow_act_crypto_params *crypto_params = &flow_act->crypto; + __be32 *sci_p = (__be32 *)(&attrs->sci); + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + /* MACsec ethertype */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, ETH_P_MACSEC); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_5; + + /* Sectag AN + TCI SC present bit*/ + MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_5.macsec_tag_0, + MLX5_MACSEC_SECTAG_TCI_AN_FIELD_BITMASK << MLX5_MACSEC_SECTAG_TCI_AN_FIELD_OFFSET); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_5.macsec_tag_0, + tci_an << MLX5_MACSEC_SECTAG_TCI_AN_FIELD_OFFSET); + + if (sci_present) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters_5.macsec_tag_2); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_5.macsec_tag_2, + be32_to_cpu(sci_p[0])); + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters_5.macsec_tag_3); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_5.macsec_tag_3, + be32_to_cpu(sci_p[1])); + } else { + /* When SCI isn't present in the Sectag, need to match the source */ + /* MAC address
only if the SCI contains the default MACsec PORT */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_15_0); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers.smac_47_16), + sci_p, ETH_ALEN); + } + + crypto_params->type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC; + crypto_params->obj_id = attrs->macsec_obj_id; +} + +static union mlx5e_macsec_rule * +macsec_fs_rx_add_rule(struct mlx5e_macsec_fs *macsec_fs, + const struct macsec_context *macsec_ctx, + struct mlx5_macsec_rule_attrs *attrs, + u32 fs_id) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + struct net_device *netdev = macsec_fs->netdev; + union mlx5e_macsec_rule *macsec_rule = NULL; + struct mlx5_modify_hdr *modify_hdr = NULL; + struct mlx5_flow_destination dest = {}; + struct mlx5e_macsec_tables *rx_tables; + struct mlx5e_macsec_rx_rule *rx_rule; + struct mlx5_flow_act flow_act = {}; + struct mlx5e_flow_table *ft_crypto; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return NULL; + + err = macsec_fs_rx_ft_get(macsec_fs); + if (err) + goto out_spec; + + macsec_rule = kzalloc(sizeof(*macsec_rule), GFP_KERNEL); + if (!macsec_rule) { + macsec_fs_rx_ft_put(macsec_fs); + goto out_spec; + } + + rx_rule = &macsec_rule->rx_rule; + rx_tables = &rx_fs->tables; + ft_crypto = &rx_tables->ft_crypto; + + /* Set bit[31 - 30] macsec marker - 0x01 */ + /* Set bit[15-0] fs id */ + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_B); + MLX5_SET(set_action_in, action, data, MLX5_MACSEC_RX_METADAT_HANDLE(fs_id) | BIT(30)); + MLX5_SET(set_action_in, action, offset, 0); + MLX5_SET(set_action_in, action, length, 32); + + modify_hdr = mlx5_modify_header_alloc(macsec_fs->mdev, MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC, + 1, action); + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + netdev_err(netdev, "fail to alloc MACsec set modify_header_id err=%d\n", err); + modify_hdr = NULL; + goto err; + } + rx_rule->meta_modhdr = modify_hdr; + + /* Rx crypto table with SCI rule */ + macsec_fs_rx_setup_fte(spec, &flow_act, attrs, true); + + flow_act.modify_hdr = modify_hdr; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = rx_tables->ft_check; + rule = mlx5_add_flow_rules(ft_crypto->t, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, + "Failed to add SA with SCI rule to Rx crypto rule, err=%d\n", + err); + goto err; + } + rx_rule->rule[0] = rule; + + /* Rx crypto table without SCI rule */ + if (cpu_to_be64((__force u64)attrs->sci) & ntohs(MACSEC_PORT_ES)) { + memset(spec, 0, sizeof(struct mlx5_flow_spec)); + memset(&dest, 0, sizeof(struct mlx5_flow_destination)); + memset(&flow_act, 0, sizeof(flow_act)); + + macsec_fs_rx_setup_fte(spec, &flow_act, attrs, false); + + flow_act.modify_hdr = modify_hdr; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = rx_tables->ft_check; + rule = mlx5_add_flow_rules(ft_crypto->t, spec, &flow_act, 
&dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(netdev, + "Failed to add SA without SCI rule to Rx crypto rule, err=%d\n", + err); + goto err; + } + rx_rule->rule[1] = rule; + } + + kvfree(spec); + return macsec_rule; + +err: + macsec_fs_rx_del_rule(macsec_fs, rx_rule); + macsec_rule = NULL; +out_spec: + kvfree(spec); + return macsec_rule; +} + +static int macsec_fs_rx_init(struct mlx5e_macsec_fs *macsec_fs) +{ + struct net_device *netdev = macsec_fs->netdev; + struct mlx5_core_dev *mdev = macsec_fs->mdev; + struct mlx5e_macsec_tables *rx_tables; + struct mlx5e_macsec_rx *rx_fs; + struct mlx5_fc *flow_counter; + int err; + + rx_fs = kzalloc(sizeof(*rx_fs), GFP_KERNEL); + if (!rx_fs) + return -ENOMEM; + + flow_counter = mlx5_fc_create(mdev, false); + if (IS_ERR(flow_counter)) { + err = PTR_ERR(flow_counter); + netdev_err(netdev, + "Failed to create MACsec Rx encrypt flow counter, err(%d)\n", + err); + goto err_encrypt_counter; + } + + rx_tables = &rx_fs->tables; + rx_tables->check_rule_counter = flow_counter; + + flow_counter = mlx5_fc_create(mdev, false); + if (IS_ERR(flow_counter)) { + err = PTR_ERR(flow_counter); + netdev_err(netdev, + "Failed to create MACsec Rx drop flow counter, err(%d)\n", + err); + goto err_drop_counter; + } + rx_tables->check_miss_rule_counter = flow_counter; + + macsec_fs->rx_fs = rx_fs; + + return 0; + +err_drop_counter: + mlx5_fc_destroy(mdev, rx_tables->check_rule_counter); + rx_tables->check_rule_counter = NULL; + +err_encrypt_counter: + kfree(rx_fs); + macsec_fs->rx_fs = NULL; + + return err; +} + +static void macsec_fs_rx_cleanup(struct mlx5e_macsec_fs *macsec_fs) +{ + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + struct mlx5_core_dev *mdev = macsec_fs->mdev; + struct mlx5e_macsec_tables *rx_tables; + + if (!rx_fs) + return; + + rx_tables = &rx_fs->tables; + + if (rx_tables->refcnt) { + netdev_err(macsec_fs->netdev, + "Can't destroy MACsec offload rx_fs, refcnt(%u) isn't 0\n", + rx_tables->refcnt); + return; + } + + if (rx_tables->check_miss_rule_counter) { + mlx5_fc_destroy(mdev, rx_tables->check_miss_rule_counter); + rx_tables->check_miss_rule_counter = NULL; + } + + if (rx_tables->check_rule_counter) { + mlx5_fc_destroy(mdev, rx_tables->check_rule_counter); + rx_tables->check_rule_counter = NULL; + } + + kfree(rx_fs); + macsec_fs->rx_fs = NULL; +} + +void mlx5e_macsec_fs_get_stats_fill(struct mlx5e_macsec_fs *macsec_fs, void *macsec_stats) +{ + struct mlx5e_macsec_stats *stats = (struct mlx5e_macsec_stats *)macsec_stats; + struct mlx5e_macsec_tables *tx_tables = &macsec_fs->tx_fs->tables; + struct mlx5e_macsec_tables *rx_tables = &macsec_fs->rx_fs->tables; + struct mlx5_core_dev *mdev = macsec_fs->mdev; + + if (tx_tables->check_rule_counter) + mlx5_fc_query(mdev, tx_tables->check_rule_counter, + &stats->macsec_tx_pkts, &stats->macsec_tx_bytes); + + if (tx_tables->check_miss_rule_counter) + mlx5_fc_query(mdev, tx_tables->check_miss_rule_counter, + &stats->macsec_tx_pkts_drop, &stats->macsec_tx_bytes_drop); + + if (rx_tables->check_rule_counter) + mlx5_fc_query(mdev, rx_tables->check_rule_counter, + &stats->macsec_rx_pkts, &stats->macsec_rx_bytes); + + if (rx_tables->check_miss_rule_counter) + mlx5_fc_query(mdev, rx_tables->check_miss_rule_counter, + &stats->macsec_rx_pkts_drop, &stats->macsec_rx_bytes_drop); +} + +union mlx5e_macsec_rule * +mlx5e_macsec_fs_add_rule(struct mlx5e_macsec_fs *macsec_fs, + const struct macsec_context *macsec_ctx, + struct mlx5_macsec_rule_attrs *attrs, + u32 *sa_fs_id) +{ + return (attrs->action == 
MLX5_ACCEL_MACSEC_ACTION_ENCRYPT) ? + macsec_fs_tx_add_rule(macsec_fs, macsec_ctx, attrs, sa_fs_id) : + macsec_fs_rx_add_rule(macsec_fs, macsec_ctx, attrs, *sa_fs_id); +} + +void mlx5e_macsec_fs_del_rule(struct mlx5e_macsec_fs *macsec_fs, + union mlx5e_macsec_rule *macsec_rule, + int action) +{ + (action == MLX5_ACCEL_MACSEC_ACTION_ENCRYPT) ? + macsec_fs_tx_del_rule(macsec_fs, &macsec_rule->tx_rule) : + macsec_fs_rx_del_rule(macsec_fs, &macsec_rule->rx_rule); +} + +static void set_ipaddr_spec(const struct sockaddr *addr, + struct mlx5_flow_spec *spec, bool is_dst_ip) +{ + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ip_version); + + if (addr->sa_family == AF_INET) { + struct sockaddr_in *in = (struct sockaddr_in *)addr; + + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.ip_version, MLX5_FS_IPV4_VERSION); + + if (is_dst_ip) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &in->sin_addr.s_addr, 4); + } else { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4), + &in->sin_addr.s_addr, 4); + } + + } else { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; + + MLX5_SET(fte_match_param, spec->match_value, + outer_headers.ip_version, MLX5_FS_IPV6_VERSION); + + if (is_dst_ip) { + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, 16); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &in6->sin6_addr.s6_addr, 16); + } else { + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, 16); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + &in6->sin6_addr.s6_addr, 16); + } + } +} + +int macsec_fs_add_roce_rule_rx(struct mlx5e_macsec_fs *macsec_fs, u32 fs_id, + const struct sockaddr *addr, union mlx5e_macsec_rule *rule) +{ + struct mlx5e_macsec_rx_rule *rx_rule = &rule->rx_rule; + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *new_rule; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + set_ipaddr_spec(addr, spec, true); + + mutex_lock(&rx_fs->roce.lock); + + if (!rx_fs->roce.nic_ft || !rx_fs->roce.ft_rdma_rx_ip_check) { + err = -EOPNOTSUPP; + goto out; + } + + if (rx_rule->roce_ip_rule) { /* Rules that were added for an outdated IP */ + mlx5_del_flow_rules(rx_rule->roce_macsec_op_rule); + mlx5_del_flow_rules(rx_rule->roce_ip_rule); + rx_rule->roce_macsec_op_rule = NULL; + rx_rule->roce_ip_rule = NULL; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.ft = rx_fs->roce.ft_rdma_rx_macsec_op_check; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + new_rule = mlx5_add_flow_rules(rx_fs->roce.ft_rdma_rx_ip_check, spec, &flow_act, + &dest, 1); + if (IS_ERR(new_rule)) { + err = PTR_ERR(new_rule); + goto out; + } + rx_rule->roce_ip_rule = new_rule; + + memset(&flow_act, 0, sizeof(flow_act)); + memset(spec, 
0, sizeof(*spec)); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_c_5); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_5, + fs_id | BIT(30)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + new_rule = mlx5_add_flow_rules(rx_fs->roce.ft_rdma_rx_macsec_op_check, spec, &flow_act, + NULL, 0); + if (IS_ERR(new_rule)) { + err = PTR_ERR(new_rule); + goto macsec_rule_err; + } + rx_rule->roce_macsec_op_rule = new_rule; + + goto out; + +macsec_rule_err: + mlx5_del_flow_rules(rx_rule->roce_ip_rule); + rx_rule->roce_ip_rule = NULL; +out: + mutex_unlock(&rx_fs->roce.lock); + kvfree(spec); + + return err; +} + +void macsec_fs_del_roce_rule_rx(struct mlx5e_macsec_fs *macsec_fs, + union mlx5e_macsec_rule *rule) +{ + struct mlx5e_macsec_rx_rule *rx_rule = &rule->rx_rule; + struct mlx5e_macsec_rx *rx_fs = macsec_fs->rx_fs; + + if (!is_macsec_roce_supported(macsec_fs->mdev)) + return; + + mutex_lock(&rx_fs->roce.lock); + + if (!rx_rule->roce_ip_rule) + goto out; + mlx5_del_flow_rules(rx_rule->roce_macsec_op_rule); + mlx5_del_flow_rules(rx_rule->roce_ip_rule); + rx_rule->roce_macsec_op_rule = NULL; + rx_rule->roce_ip_rule = NULL; +out: + mutex_unlock(&rx_fs->roce.lock); +} + +int macsec_fs_add_roce_rule_tx(struct mlx5e_macsec_fs *macsec_fs, u32 fs_id, + const struct sockaddr *addr, union mlx5e_macsec_rule *rule) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5e_macsec_tx_rule *tx_rule = &rule->tx_rule; + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + struct mlx5_modify_hdr *modify_hdr = NULL; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *new_rule; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + set_ipaddr_spec(addr, spec, false); + + mutex_lock(&tx_fs->roce_lock); + + if (!tx_fs->roce_ft_rdma_tx) { + err = -EOPNOTSUPP; + goto out; + } + + if (tx_rule->roce_rule_tx) { /* Rules that were added for an outdated IP */ + mlx5_del_flow_rules(tx_rule->roce_rule_tx); + tx_rule->roce_rule_tx = NULL; + mlx5_modify_header_dealloc(macsec_fs->mdev, tx_rule->roce_meta_modhdr); + tx_rule->roce_meta_modhdr = NULL; + } + + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_A); + MLX5_SET(set_action_in, action, data, MLX5_ETH_WQE_FT_META_MACSEC | fs_id << 2); + MLX5_SET(set_action_in, action, offset, 0); + MLX5_SET(set_action_in, action, length, 32); + + modify_hdr = mlx5_modify_header_alloc(macsec_fs->mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC, + 1, action); + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + netdev_err(macsec_fs->netdev, "fail to alloc ROCE MACsec set modify_header_id err=%d\n", + err); + modify_hdr = NULL; + goto out; + } + tx_rule->roce_meta_modhdr = modify_hdr; + + flow_act.modify_hdr = modify_hdr; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; + dest.ft = tx_fs->tables.ft_crypto.t; + new_rule = mlx5_add_flow_rules(tx_fs->roce_ft_rdma_tx, spec, &flow_act, &dest, 1); + if (IS_ERR(new_rule)) { + err = PTR_ERR(new_rule); + netdev_err(macsec_fs->netdev, "Failed to add ROCE TX rule, err=%d\n", err); + goto rule_err; + } + tx_rule->roce_rule_tx = new_rule; + + goto out; + +rule_err: + 
mlx5_modify_header_dealloc(macsec_fs->mdev, tx_rule->roce_meta_modhdr); +out: + mutex_unlock(&tx_fs->roce_lock); + kvfree(spec); + return err; +} + +void macsec_fs_del_roce_rule_tx(struct mlx5e_macsec_fs *macsec_fs, + union mlx5e_macsec_rule *rule) +{ + struct mlx5e_macsec_tx_rule *tx_rule = &rule->tx_rule; + struct mlx5e_macsec_tx *tx_fs = macsec_fs->tx_fs; + + if (!is_macsec_roce_supported(macsec_fs->mdev)) + return; + + mutex_lock(&tx_fs->roce_lock); + + if (!tx_rule->roce_rule_tx) + goto out; + + mlx5_del_flow_rules(tx_rule->roce_rule_tx); + tx_rule->roce_rule_tx = NULL; + mlx5_modify_header_dealloc(macsec_fs->mdev, tx_rule->roce_meta_modhdr); + tx_rule->roce_meta_modhdr = NULL; +out: + mutex_unlock(&tx_fs->roce_lock); +} + +void mlx5e_macsec_fs_cleanup(struct mlx5e_macsec_fs *macsec_fs) +{ + macsec_fs_rx_cleanup(macsec_fs); + macsec_fs_tx_cleanup(macsec_fs); + kfree(macsec_fs); +} + +struct mlx5e_macsec_fs * +mlx5e_macsec_fs_init(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + struct mlx5e_macsec_fs *macsec_fs; + int err; + + macsec_fs = kzalloc(sizeof(*macsec_fs), GFP_KERNEL); + if (!macsec_fs) + return NULL; + + macsec_fs->mdev = mdev; + macsec_fs->netdev = netdev; + + err = macsec_fs_tx_init(macsec_fs); + if (err) { + netdev_err(netdev, "MACsec offload: Failed to init tx_fs, err=%d\n", err); + goto err; + } + + err = macsec_fs_rx_init(macsec_fs); + if (err) { + netdev_err(netdev, "MACsec offload: Failed to init rx_fs, err=%d\n", err); + goto tx_cleanup; + } + + return macsec_fs; + +tx_cleanup: + macsec_fs_tx_cleanup(macsec_fs); +err: + kfree(macsec_fs); + return NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.h new file mode 100644 index 0000000..22ce40a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_fs.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/ + +#ifndef __MLX5_MACSEC_STEERING_H__ +#define __MLX5_MACSEC_STEERING_H__ + +#ifdef CONFIG_MLX5_EN_MACSEC + +#include "en_accel/macsec.h" + +#define MLX5_MACSEC_NUM_OF_SUPPORTED_INTERFACES 16 + +struct mlx5e_macsec_fs; +union mlx5e_macsec_rule; + +struct mlx5_macsec_rule_attrs { + sci_t sci; + u32 macsec_obj_id; + u8 assoc_num; + int action; +}; + +enum mlx5_macsec_action { + MLX5_ACCEL_MACSEC_ACTION_ENCRYPT, + MLX5_ACCEL_MACSEC_ACTION_DECRYPT, +}; + +void mlx5e_macsec_fs_cleanup(struct mlx5e_macsec_fs *macsec_fs); + +struct mlx5e_macsec_fs * +mlx5e_macsec_fs_init(struct mlx5_core_dev *mdev, struct net_device *netdev); + +union mlx5e_macsec_rule * +mlx5e_macsec_fs_add_rule(struct mlx5e_macsec_fs *macsec_fs, + const struct macsec_context *ctx, + struct mlx5_macsec_rule_attrs *attrs, + u32 *sa_fs_id); + +void mlx5e_macsec_fs_del_rule(struct mlx5e_macsec_fs *macsec_fs, + union mlx5e_macsec_rule *macsec_rule, + int action); + +void mlx5e_macsec_fs_get_stats_fill(struct mlx5e_macsec_fs *macsec_fs, void *macsec_stats); + +int macsec_fs_add_roce_rule_rx(struct mlx5e_macsec_fs *macsec_fs, u32 fs_id, + const struct sockaddr *addr, union mlx5e_macsec_rule *rule); + +void macsec_fs_del_roce_rule_rx(struct mlx5e_macsec_fs *macsec_fs, + union mlx5e_macsec_rule *rule); + +int macsec_fs_add_roce_rule_tx(struct mlx5e_macsec_fs *macsec_fs, u32 fs_id, + const struct sockaddr *addr, union mlx5e_macsec_rule *rule); + +void macsec_fs_del_roce_rule_tx(struct mlx5e_macsec_fs *macsec_fs, + union mlx5e_macsec_rule *rule); + +#endif + +#endif /* __MLX5_MACSEC_STEERING_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_stats.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_stats.c new file mode 100644 index 0000000..e50a2e3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec_stats.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include +#include + +#include "en.h" +#include "en_accel/macsec.h" + +static const struct counter_desc mlx5e_macsec_hw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_rx_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_rx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_rx_pkts_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_rx_bytes_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_tx_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_tx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_tx_pkts_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_macsec_stats, macsec_tx_bytes_drop) }, +}; + +#define NUM_MACSEC_HW_COUNTERS ARRAY_SIZE(mlx5e_macsec_hw_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(macsec_hw) +{ + if (!priv->macsec) + return 0; + + if (mlx5e_is_macsec_device(priv->mdev)) + return NUM_MACSEC_HW_COUNTERS; + + return 0; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(macsec_hw) {} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(macsec_hw) +{ + unsigned int i; + + if (!priv->macsec) + return idx; + + if (!mlx5e_is_macsec_device(priv->mdev)) + return idx; + + for (i = 0; i < NUM_MACSEC_HW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + mlx5e_macsec_hw_stats_desc[i].format); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(macsec_hw) +{ + int i; + + if (!priv->macsec) + return idx; + + if (!mlx5e_is_macsec_device(priv->mdev)) + return idx; + + mlx5e_macsec_get_stats_fill(priv->macsec, mlx5e_macsec_get_stats(priv->macsec)); + for (i = 0; i < NUM_MACSEC_HW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(mlx5e_macsec_get_stats(priv->macsec), + mlx5e_macsec_hw_stats_desc, + i); + + return idx; +} + +MLX5E_DEFINE_STATS_GRP(macsec_hw, 0); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c new file mode 100644 index 0000000..b8fc863 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include "en_accel/tls.h" +#include "accel/tls.h" + +static void mlx5e_tls_set_ipv4_flow(void *flow, struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + + MLX5_SET(tls_flow, flow, ipv6, 0); + memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &inet->inet_daddr, MLX5_FLD_SZ_BYTES(ipv4_layout, ipv4)); + memcpy(MLX5_ADDR_OF(tls_flow, flow, src_ipv4_src_ipv6.ipv4_layout.ipv4), + &inet->inet_rcv_saddr, MLX5_FLD_SZ_BYTES(ipv4_layout, ipv4)); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void mlx5e_tls_set_ipv6_flow(void *flow, struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + MLX5_SET(tls_flow, flow, ipv6, 1); + memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &sk->sk_v6_daddr, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); + memcpy(MLX5_ADDR_OF(tls_flow, flow, src_ipv4_src_ipv6.ipv6_layout.ipv6), + &np->saddr, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6)); +} +#endif + +static void mlx5e_tls_set_flow_tcp_ports(void *flow, struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + + memcpy(MLX5_ADDR_OF(tls_flow, flow, src_port), &inet->inet_sport, + MLX5_FLD_SZ_BYTES(tls_flow, src_port)); + memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_port), &inet->inet_dport, + MLX5_FLD_SZ_BYTES(tls_flow, dst_port)); +} + +static int mlx5e_tls_set_flow(void *flow, struct sock *sk, u32 caps) +{ + switch (sk->sk_family) { + case AF_INET: + mlx5e_tls_set_ipv4_flow(flow, sk); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + if (!sk->sk_ipv6only && + ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED) { + mlx5e_tls_set_ipv4_flow(flow, sk); + break; + } + if (!(caps & MLX5_ACCEL_TLS_IPV6)) + goto error_out; + + mlx5e_tls_set_ipv6_flow(flow, sk); + break; +#endif + default: + goto error_out; + } + + mlx5e_tls_set_flow_tcp_ports(flow, sk); + return 0; +error_out: + return -EINVAL; +} + +static int mlx5e_tls_add(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct mlx5_core_dev *mdev = priv->mdev; + u32 caps = mlx5_accel_tls_device_caps(mdev); + int ret = -ENOMEM; + void *flow; + u32 swid; + + flow = kzalloc(MLX5_ST_SZ_BYTES(tls_flow), GFP_KERNEL); + if (!flow) + return ret; + + ret = mlx5e_tls_set_flow(flow, sk, caps); + if (ret) + goto free_flow; + + ret = mlx5_accel_tls_add_flow(mdev, flow, crypto_info, + start_offload_tcp_sn, &swid, + direction == TLS_OFFLOAD_CTX_DIR_TX); + if (ret < 0) + goto free_flow; + + if (direction == TLS_OFFLOAD_CTX_DIR_TX) { + struct mlx5e_tls_offload_context_tx *tx_ctx = + mlx5e_get_tls_tx_context(tls_ctx); + + tx_ctx->swid = htonl(swid); + tx_ctx->expected_seq = start_offload_tcp_sn; + } else { + struct mlx5e_tls_offload_context_rx *rx_ctx = + mlx5e_get_tls_rx_context(tls_ctx); + + rx_ctx->handle = htonl(swid); + } + + return 0; +free_flow: + kfree(flow); + return ret; +} + +static void mlx5e_tls_del(struct net_device *netdev, + struct tls_context *tls_ctx, + enum tls_offload_ctx_dir direction) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + unsigned int handle; + + handle = ntohl((direction == TLS_OFFLOAD_CTX_DIR_TX) 
? + mlx5e_get_tls_tx_context(tls_ctx)->swid : + mlx5e_get_tls_rx_context(tls_ctx)->handle); + + mlx5_accel_tls_del_flow(priv->mdev, handle, + direction == TLS_OFFLOAD_CTX_DIR_TX); +} + +static int mlx5e_tls_resync(struct net_device *netdev, struct sock *sk, + u32 seq, u8 *rcd_sn_data, + enum tls_offload_ctx_dir direction) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_tls_offload_context_rx *rx_ctx; + __be64 rcd_sn = *(__be64 *)rcd_sn_data; + + if (WARN_ON_ONCE(direction != TLS_OFFLOAD_CTX_DIR_RX)) + return -EINVAL; + rx_ctx = mlx5e_get_tls_rx_context(tls_ctx); + + netdev_info(netdev, "resyncing seq %d rcd %lld\n", seq, + be64_to_cpu(rcd_sn)); + mlx5_accel_tls_resync_rx(priv->mdev, rx_ctx->handle, seq, rcd_sn); + atomic64_inc(&priv->tls->sw_stats.rx_tls_resync_reply); + + return 0; +} + +static const struct tlsdev_ops mlx5e_tls_ops = { + .tls_dev_add = mlx5e_tls_add, + .tls_dev_del = mlx5e_tls_del, + .tls_dev_resync = mlx5e_tls_resync, +}; + +void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + u32 caps; + + if (mlx5e_accel_is_ktls_device(priv->mdev)) { + mlx5e_ktls_build_netdev(priv); + return; + } + + /* FPGA */ + if (!mlx5e_accel_is_tls_device(priv->mdev)) + return; + + caps = mlx5_accel_tls_device_caps(priv->mdev); + if (caps & MLX5_ACCEL_TLS_TX) { + netdev->features |= NETIF_F_HW_TLS_TX; + netdev->hw_features |= NETIF_F_HW_TLS_TX; + } + + if (caps & MLX5_ACCEL_TLS_RX) { + netdev->features |= NETIF_F_HW_TLS_RX; + netdev->hw_features |= NETIF_F_HW_TLS_RX; + } + + if (!(caps & MLX5_ACCEL_TLS_LRO)) { + netdev->features &= ~NETIF_F_LRO; + netdev->hw_features &= ~NETIF_F_LRO; + } + + netdev->tlsdev_ops = &mlx5e_tls_ops; +} + +int mlx5e_tls_init(struct mlx5e_priv *priv) +{ + struct mlx5e_tls *tls; + + if (!mlx5e_accel_is_tls_device(priv->mdev)) + return 0; + + tls = kzalloc(sizeof(*tls), GFP_KERNEL); + if (!tls) + return -ENOMEM; + + priv->tls = tls; + return 0; +} + +void mlx5e_tls_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_tls *tls = priv->tls; + + if (!tls) + return; + + kfree(tls); + priv->tls = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h new file mode 100644 index 0000000..62ecf14 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#ifndef __MLX5E_TLS_H__ +#define __MLX5E_TLS_H__ + +#include "accel/tls.h" +#include "en_accel/ktls.h" + +#ifdef CONFIG_MLX5_EN_TLS +#include +#include "en.h" + +struct mlx5e_tls_sw_stats { + atomic64_t tx_tls_ctx; + atomic64_t tx_tls_del; + atomic64_t tx_tls_drop_metadata; + atomic64_t tx_tls_drop_resync_alloc; + atomic64_t tx_tls_drop_no_sync_data; + atomic64_t tx_tls_drop_bypass_required; + atomic64_t rx_tls_ctx; + atomic64_t rx_tls_del; + atomic64_t rx_tls_drop_resync_request; + atomic64_t rx_tls_resync_request; + atomic64_t rx_tls_resync_reply; + atomic64_t rx_tls_auth_fail; +}; + +struct mlx5e_tls { + struct mlx5e_tls_sw_stats sw_stats; + struct workqueue_struct *rx_wq; +}; + +struct mlx5e_tls_offload_context_tx { + struct tls_offload_context_tx base; + u32 expected_seq; + __be32 swid; +}; + +static inline struct mlx5e_tls_offload_context_tx * +mlx5e_get_tls_tx_context(struct tls_context *tls_ctx) +{ + BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context_tx) > + TLS_OFFLOAD_CONTEXT_SIZE_TX); + return container_of(tls_offload_ctx_tx(tls_ctx), + struct mlx5e_tls_offload_context_tx, + base); +} + +struct mlx5e_tls_offload_context_rx { + struct tls_offload_context_rx base; + __be32 handle; +}; + +static inline struct mlx5e_tls_offload_context_rx * +mlx5e_get_tls_rx_context(struct tls_context *tls_ctx) +{ + BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context_rx) > + TLS_OFFLOAD_CONTEXT_SIZE_RX); + return container_of(tls_offload_ctx_rx(tls_ctx), + struct mlx5e_tls_offload_context_rx, + base); +} + +static inline bool mlx5e_is_tls_on(struct mlx5e_priv *priv) +{ + return priv->tls; +} + +void mlx5e_tls_build_netdev(struct mlx5e_priv *priv); +int mlx5e_tls_init(struct mlx5e_priv *priv); +void mlx5e_tls_cleanup(struct mlx5e_priv *priv); + +int mlx5e_tls_get_count(struct mlx5e_priv *priv); +int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data); +int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data); + +static inline bool mlx5e_accel_is_tls_device(struct mlx5_core_dev *mdev) +{ + return !is_kdump_kernel() && + mlx5_accel_is_tls_device(mdev); +} + +#else + +static inline void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) +{ + if (!is_kdump_kernel() && + mlx5_accel_is_ktls_device(priv->mdev)) + mlx5e_ktls_build_netdev(priv); +} + +static inline bool mlx5e_is_tls_on(struct mlx5e_priv *priv) { return false; } +static inline int mlx5e_tls_init(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_tls_cleanup(struct mlx5e_priv *priv) { } +static inline int mlx5e_tls_get_count(struct mlx5e_priv *priv) { return 0; } +static inline int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data) { return 0; } +static inline int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data) { return 0; } +static inline bool mlx5e_accel_is_tls_device(struct mlx5_core_dev *mdev) { return false; } + +#endif + +#endif /* __MLX5E_TLS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c new file mode 100644 index 0000000..a05580c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "en_accel/tls.h" +#include "en_accel/tls_rxtx.h" +#include "accel/accel.h" + +#include +#include + +#define SYNDROM_DECRYPTED 0x30 +#define SYNDROM_RESYNC_REQUEST 0x31 +#define SYNDROM_AUTH_FAILED 0x32 + +#define SYNDROME_OFFLOAD_REQUIRED 32 +#define SYNDROME_SYNC 33 + +struct sync_info { + u64 rcd_sn; + s32 sync_len; + int nr_frags; + skb_frag_t frags[MAX_SKB_FRAGS]; +}; + +struct recv_metadata_content { + u8 syndrome; + u8 reserved; + __be32 sync_seq; +} __packed; + +struct send_metadata_content { + /* One byte of syndrome followed by 3 bytes of swid */ + __be32 syndrome_swid; + __be16 first_seq; +} __packed; + +struct mlx5e_tls_metadata { + union { + /* from fpga to host */ + struct recv_metadata_content recv; + /* from host to fpga */ + struct send_metadata_content send; + unsigned char raw[6]; + } __packed content; + /* packet type ID field */ + __be16 ethertype; +} __packed; + +static int mlx5e_tls_add_metadata(struct sk_buff *skb, __be32 swid) +{ + struct mlx5e_tls_metadata *pet; + struct ethhdr *eth; + + if (skb_cow_head(skb, sizeof(struct mlx5e_tls_metadata))) + return -ENOMEM; + + eth = (struct ethhdr *)skb_push(skb, sizeof(struct mlx5e_tls_metadata)); + skb->mac_header -= sizeof(struct mlx5e_tls_metadata); + pet = (struct mlx5e_tls_metadata *)(eth + 1); + + memmove(skb->data, skb->data + sizeof(struct mlx5e_tls_metadata), + 2 * ETH_ALEN); + + eth->h_proto = cpu_to_be16(MLX5E_METADATA_ETHER_TYPE); + pet->content.send.syndrome_swid = + htonl(SYNDROME_OFFLOAD_REQUIRED << 24) | swid; + + return 0; +} + +static int mlx5e_tls_get_sync_data(struct mlx5e_tls_offload_context_tx *context, + u32 tcp_seq, struct sync_info *info) +{ + int remaining, i = 0, ret = -EINVAL; + struct tls_record_info *record; + unsigned long flags; + s32 sync_size; + + spin_lock_irqsave(&context->base.lock, flags); + record = tls_get_record(&context->base, tcp_seq, 
&info->rcd_sn); + + if (unlikely(!record)) + goto out; + + sync_size = tcp_seq - tls_record_start_seq(record); + info->sync_len = sync_size; + if (unlikely(sync_size < 0)) { + if (tls_record_is_start_marker(record)) + goto done; + + goto out; + } + + remaining = sync_size; + while (remaining > 0) { + info->frags[i] = record->frags[i]; + __skb_frag_ref(&info->frags[i]); + remaining -= skb_frag_size(&info->frags[i]); + + if (remaining < 0) + skb_frag_size_add(&info->frags[i], remaining); + + i++; + } + info->nr_frags = i; +done: + ret = 0; +out: + spin_unlock_irqrestore(&context->base.lock, flags); + return ret; +} + +static void mlx5e_tls_complete_sync_skb(struct sk_buff *skb, + struct sk_buff *nskb, u32 tcp_seq, + int headln, __be64 rcd_sn) +{ + struct mlx5e_tls_metadata *pet; + u8 syndrome = SYNDROME_SYNC; + struct iphdr *iph; + struct tcphdr *th; + int data_len, mss; + + nskb->dev = skb->dev; + skb_reset_mac_header(nskb); + skb_set_network_header(nskb, skb_network_offset(skb)); + skb_set_transport_header(nskb, skb_transport_offset(skb)); + memcpy(nskb->data, skb->data, headln); + memcpy(nskb->data + headln, &rcd_sn, sizeof(rcd_sn)); + + iph = ip_hdr(nskb); + iph->tot_len = htons(nskb->len - skb_network_offset(nskb)); + th = tcp_hdr(nskb); + data_len = nskb->len - headln; + tcp_seq -= data_len; + th->seq = htonl(tcp_seq); + + mss = nskb->dev->mtu - (headln - skb_network_offset(nskb)); + skb_shinfo(nskb)->gso_size = 0; + if (data_len > mss) { + skb_shinfo(nskb)->gso_size = mss; + skb_shinfo(nskb)->gso_segs = DIV_ROUND_UP(data_len, mss); + } + skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type; + + pet = (struct mlx5e_tls_metadata *)(nskb->data + sizeof(struct ethhdr)); + memcpy(pet, &syndrome, sizeof(syndrome)); + pet->content.send.first_seq = htons(tcp_seq); + + /* MLX5 devices don't care about the checksum partial start, offset + * and pseudo header + */ + nskb->ip_summed = CHECKSUM_PARTIAL; + + nskb->queue_mapping = skb->queue_mapping; +} + +static bool mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context_tx *context, + struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5e_tls *tls) +{ + u32 tcp_seq = ntohl(tcp_hdr(skb)->seq); + struct sync_info info; + struct sk_buff *nskb; + int linear_len = 0; + int headln; + int i; + + sq->stats->tls_ooo++; + + if (mlx5e_tls_get_sync_data(context, tcp_seq, &info)) { + /* We might get here if a retransmission reaches the driver + * after the relevant record is acked. 
+ * It should be safe to drop the packet in this case + */ + atomic64_inc(&tls->sw_stats.tx_tls_drop_no_sync_data); + goto err_out; + } + + if (unlikely(info.sync_len < 0)) { + u32 payload; + + headln = skb_transport_offset(skb) + tcp_hdrlen(skb); + payload = skb->len - headln; + if (likely(payload <= -info.sync_len)) + /* SKB payload doesn't require offload + */ + return true; + + atomic64_inc(&tls->sw_stats.tx_tls_drop_bypass_required); + goto err_out; + } + + if (unlikely(mlx5e_tls_add_metadata(skb, context->swid))) { + atomic64_inc(&tls->sw_stats.tx_tls_drop_metadata); + goto err_out; + } + + headln = skb_transport_offset(skb) + tcp_hdrlen(skb); + linear_len += headln + sizeof(info.rcd_sn); + nskb = alloc_skb(linear_len, GFP_ATOMIC); + if (unlikely(!nskb)) { + atomic64_inc(&tls->sw_stats.tx_tls_drop_resync_alloc); + goto err_out; + } + + context->expected_seq = tcp_seq + skb->len - headln; + skb_put(nskb, linear_len); + for (i = 0; i < info.nr_frags; i++) + skb_shinfo(nskb)->frags[i] = info.frags[i]; + + skb_shinfo(nskb)->nr_frags = info.nr_frags; + nskb->data_len = info.sync_len; + nskb->len += info.sync_len; + sq->stats->tls_resync_bytes += nskb->len; + mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln, + cpu_to_be64(info.rcd_sn)); + mlx5e_sq_xmit_simple(sq, nskb, true); + + return true; + +err_out: + dev_kfree_skb_any(skb); + return false; +} + +bool mlx5e_tls_handle_tx_skb(struct net_device *netdev, struct mlx5e_txqsq *sq, + struct sk_buff *skb, struct mlx5e_accel_tx_tls_state *state) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_tls_offload_context_tx *context; + struct tls_context *tls_ctx; + u32 expected_seq; + int datalen; + u32 skb_seq; + + datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb)); + if (!datalen) + return true; + + mlx5e_tx_mpwqe_ensure_complete(sq); + + tls_ctx = tls_get_ctx(skb->sk); + if (WARN_ON_ONCE(tls_ctx->netdev != netdev)) + goto err_out; + + if (mlx5e_accel_is_ktls_tx(sq->mdev)) + return mlx5e_ktls_handle_tx_skb(tls_ctx, sq, skb, datalen, state); + + /* FPGA */ + skb_seq = ntohl(tcp_hdr(skb)->seq); + context = mlx5e_get_tls_tx_context(tls_ctx); + expected_seq = context->expected_seq; + + if (unlikely(expected_seq != skb_seq)) + return mlx5e_tls_handle_ooo(context, sq, skb, priv->tls); + + if (unlikely(mlx5e_tls_add_metadata(skb, context->swid))) { + atomic64_inc(&priv->tls->sw_stats.tx_tls_drop_metadata); + dev_kfree_skb_any(skb); + return false; + } + + context->expected_seq = skb_seq + datalen; + return true; + +err_out: + dev_kfree_skb_any(skb); + return false; +} + +static int tls_update_resync_sn(struct net_device *netdev, + struct sk_buff *skb, + struct mlx5e_tls_metadata *mdata) +{ + struct sock *sk = NULL; + struct iphdr *iph; + struct tcphdr *th; + __be32 seq; + + if (mdata->ethertype != htons(ETH_P_IP)) + return -EINVAL; + + iph = (struct iphdr *)(mdata + 1); + + th = ((void *)iph) + iph->ihl * 4; + + if (iph->version == 4) { + sk = inet_lookup_established(dev_net(netdev), &tcp_hashinfo, + iph->saddr, th->source, iph->daddr, + th->dest, netdev->ifindex); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)iph; + + sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo, + &ipv6h->saddr, th->source, + &ipv6h->daddr, ntohs(th->dest), + netdev->ifindex, 0); +#endif + } + if (!sk || sk->sk_state == TCP_TIME_WAIT) { + struct mlx5e_priv *priv = netdev_priv(netdev); + + atomic64_inc(&priv->tls->sw_stats.rx_tls_drop_resync_request); + goto out; + } + + skb->sk = sk; + 
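+	/* The established-socket lookup took a reference on the socket;
+	 * sock_edemux releases it once the skb is freed.
+	 */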
skb->destructor = sock_edemux; + + memcpy(&seq, &mdata->content.recv.sync_seq, sizeof(seq)); + tls_offload_rx_resync_request(sk, seq); +out: + return 0; +} + +/* FPGA tls rx handler */ +void mlx5e_tls_handle_rx_skb_metadata(struct mlx5e_rq *rq, struct sk_buff *skb, + u32 *cqe_bcnt) +{ + struct mlx5e_tls_metadata *mdata; + struct mlx5e_priv *priv; + + /* Use the metadata */ + mdata = (struct mlx5e_tls_metadata *)(skb->data + ETH_HLEN); + switch (mdata->content.recv.syndrome) { + case SYNDROM_DECRYPTED: + skb->decrypted = 1; + break; + case SYNDROM_RESYNC_REQUEST: + tls_update_resync_sn(rq->netdev, skb, mdata); + priv = netdev_priv(rq->netdev); + atomic64_inc(&priv->tls->sw_stats.rx_tls_resync_request); + break; + case SYNDROM_AUTH_FAILED: + /* Authentication failure will be observed and verified by kTLS */ + priv = netdev_priv(rq->netdev); + atomic64_inc(&priv->tls->sw_stats.rx_tls_auth_fail); + break; + default: + /* Bypass the metadata header to others */ + return; + } + + remove_metadata_hdr(skb); + *cqe_bcnt -= MLX5E_METADATA_ETHER_LEN; +} + +u16 mlx5e_tls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + if (!mlx5e_accel_is_tls_device(mdev)) + return 0; + + if (mlx5e_accel_is_ktls_device(mdev)) + return mlx5e_ktls_get_stop_room(mdev, params); + + /* FPGA */ + /* Resync SKB. */ + return mlx5e_stop_room_for_max_wqe(mdev); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h new file mode 100644 index 0000000..0ca0a02 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __MLX5E_TLS_RXTX_H__ +#define __MLX5E_TLS_RXTX_H__ + +#include "accel/accel.h" +#include "en_accel/ktls_txrx.h" + +#ifdef CONFIG_MLX5_EN_TLS + +#include +#include "en.h" +#include "en/txrx.h" + +u16 mlx5e_tls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params); + +bool mlx5e_tls_handle_tx_skb(struct net_device *netdev, struct mlx5e_txqsq *sq, + struct sk_buff *skb, struct mlx5e_accel_tx_tls_state *state); + +static inline bool mlx5e_tls_skb_offloaded(struct sk_buff *skb) +{ + return skb->sk && tls_is_sk_tx_device_offloaded(skb->sk); +} + +static inline void +mlx5e_tls_handle_tx_wqe(struct mlx5_wqe_ctrl_seg *cseg, + struct mlx5e_accel_tx_tls_state *state) +{ + cseg->tis_tir_num = cpu_to_be32(state->tls_tisn << 8); +} + +void mlx5e_tls_handle_rx_skb_metadata(struct mlx5e_rq *rq, struct sk_buff *skb, + u32 *cqe_bcnt); + +static inline void +mlx5e_tls_handle_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 *cqe_bcnt) +{ + if (unlikely(get_cqe_tls_offload(cqe))) /* cqe bit indicates a TLS device */ + return mlx5e_ktls_handle_rx_skb(rq, skb, cqe, cqe_bcnt); + + if (unlikely(test_bit(MLX5E_RQ_STATE_FPGA_TLS, &rq->state) && is_metadata_hdr_valid(skb))) + return mlx5e_tls_handle_rx_skb_metadata(rq, skb, cqe_bcnt); +} + +#else + +static inline bool +mlx5e_accel_is_tls(struct mlx5_cqe64 *cqe, struct sk_buff *skb) { return false; } +static inline void +mlx5e_tls_handle_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 *cqe_bcnt) {} +static inline u16 mlx5e_tls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params) +{ + return 0; +} + +#endif /* CONFIG_MLX5_EN_TLS */ + +#endif /* __MLX5E_TLS_RXTX_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c new file mode 100644 index 0000000..56e7b2a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include + +#include "en.h" +#include "fpga/sdk.h" +#include "en_accel/tls.h" + +static const struct counter_desc mlx5e_tls_sw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_metadata) }, + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_resync_alloc) }, + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_no_sync_data) }, + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_bypass_required) }, +}; + +static const struct counter_desc mlx5e_ktls_sw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_ctx) }, + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_del) }, + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, rx_tls_ctx) }, + { MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, rx_tls_del) }, +}; + +#define MLX5E_READ_CTR_ATOMIC64(ptr, dsc, i) \ + atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].offset)) + +static const struct counter_desc *get_tls_atomic_stats(struct mlx5e_priv *priv) +{ + if (!priv->tls) + return NULL; + if (mlx5e_accel_is_ktls_device(priv->mdev)) + return mlx5e_ktls_sw_stats_desc; + return mlx5e_tls_sw_stats_desc; +} + +int mlx5e_tls_get_count(struct mlx5e_priv *priv) +{ + if (!priv->tls) + return 0; + if (mlx5e_accel_is_ktls_device(priv->mdev)) + return ARRAY_SIZE(mlx5e_ktls_sw_stats_desc); + return ARRAY_SIZE(mlx5e_tls_sw_stats_desc); +} + +int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data) +{ + const struct counter_desc *stats_desc; + unsigned int i, n, idx = 0; + + stats_desc = get_tls_atomic_stats(priv); + n = mlx5e_tls_get_count(priv); + + for (i = 0; i < n; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + stats_desc[i].format); + + return n; +} + +int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data) +{ + const struct counter_desc *stats_desc; + unsigned int i, n, idx = 0; + + stats_desc = get_tls_atomic_stats(priv); + n = mlx5e_tls_get_count(priv); + + for (i = 0; i < n; i++) + data[idx++] = + MLX5E_READ_CTR_ATOMIC64(&priv->tls->sw_stats, + stats_desc, i); + + return n; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c new file mode 100644 index 0000000..49cca6b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "en.h" + +#define ARFS_HASH_SHIFT BITS_PER_BYTE +#define ARFS_HASH_SIZE BIT(BITS_PER_BYTE) + +struct arfs_table { + struct mlx5e_flow_table ft; + struct mlx5_flow_handle *default_rule; + struct hlist_head rules_hash[ARFS_HASH_SIZE]; +}; + +enum arfs_type { + ARFS_IPV4_TCP, + ARFS_IPV6_TCP, + ARFS_IPV4_UDP, + ARFS_IPV6_UDP, + ARFS_NUM_TYPES, +}; + +struct mlx5e_arfs_tables { + struct arfs_table arfs_tables[ARFS_NUM_TYPES]; + /* Protect aRFS rules list */ + spinlock_t arfs_lock; + struct list_head rules; + int last_filter_id; + struct workqueue_struct *wq; +}; + +struct arfs_tuple { + __be16 etype; + u8 ip_proto; + union { + __be32 src_ipv4; + struct in6_addr src_ipv6; + }; + union { + __be32 dst_ipv4; + struct in6_addr dst_ipv6; + }; + __be16 src_port; + __be16 dst_port; +}; + +struct arfs_rule { + struct mlx5e_priv *priv; + struct work_struct arfs_work; + struct mlx5_flow_handle *rule; + struct hlist_node hlist; + int rxq; + /* Flow ID passed to ndo_rx_flow_steer */ + int flow_id; + /* Filter ID returned by ndo_rx_flow_steer */ + int filter_id; + struct arfs_tuple tuple; +}; + +#define mlx5e_for_each_arfs_rule(hn, tmp, arfs_tables, i, j) \ + for (i = 0; i < ARFS_NUM_TYPES; i++) \ + mlx5e_for_each_hash_arfs_rule(hn, tmp, arfs_tables[i].rules_hash, j) + +#define mlx5e_for_each_hash_arfs_rule(hn, tmp, hash, j) \ + for (j = 0; j < ARFS_HASH_SIZE; j++) \ + hlist_for_each_entry_safe(hn, tmp, &hash[j], hlist) + +static enum mlx5_traffic_types arfs_get_tt(enum arfs_type type) +{ + switch (type) { + case ARFS_IPV4_TCP: + return MLX5_TT_IPV4_TCP; + case ARFS_IPV4_UDP: + return MLX5_TT_IPV4_UDP; + case ARFS_IPV6_TCP: + return MLX5_TT_IPV6_TCP; + case ARFS_IPV6_UDP: + return MLX5_TT_IPV6_UDP; + default: + return -EINVAL; + } +} + +static int arfs_disable(struct mlx5e_priv *priv) +{ + int err, i; + + for (i = 0; i < ARFS_NUM_TYPES; i++) { + /* Modify ttc rules destination back to their default */ + err = mlx5_ttc_fwd_default_dest(priv->fs.ttc, arfs_get_tt(i)); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] default destination failed, err(%d)\n", + __func__, arfs_get_tt(i), err); + return err; + } + } + return 0; +} + +static void arfs_del_rules(struct mlx5e_priv *priv); + +int mlx5e_arfs_disable(struct mlx5e_priv *priv) +{ + arfs_del_rules(priv); + + return arfs_disable(priv); +} + +int mlx5e_arfs_enable(struct mlx5e_priv *priv) +{ + struct mlx5_flow_destination dest = {}; + int err, i; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + for (i = 0; i < ARFS_NUM_TYPES; i++) { + dest.ft = priv->fs.arfs->arfs_tables[i].ft.t; + /* Modify ttc rules destination to point on the aRFS FTs */ + err = mlx5_ttc_fwd_dest(priv->fs.ttc, arfs_get_tt(i), &dest); + if (err) { + netdev_err(priv->netdev, + "%s: modify ttc[%d] dest to arfs, failed err(%d)\n", + __func__, arfs_get_tt(i), err); + arfs_disable(priv); + return err; + } + } + return 0; +} + +static void arfs_destroy_table(struct arfs_table *arfs_t) +{ + mlx5_del_flow_rules(arfs_t->default_rule); + mlx5e_destroy_flow_table(&arfs_t->ft); +} + +static void _mlx5e_cleanup_tables(struct mlx5e_priv *priv) +{ + int i; + + arfs_del_rules(priv); + destroy_workqueue(priv->fs.arfs->wq); + for (i = 0; i < ARFS_NUM_TYPES; i++) 
{ + if (!IS_ERR_OR_NULL(priv->fs.arfs->arfs_tables[i].ft.t)) + arfs_destroy_table(&priv->fs.arfs->arfs_tables[i]); + } +} + +void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv) +{ + if (!(priv->netdev->hw_features & NETIF_F_NTUPLE)) + return; + + _mlx5e_cleanup_tables(priv); + kvfree(priv->fs.arfs); +} + +static int arfs_add_default_rule(struct mlx5e_priv *priv, + enum arfs_type type) +{ + struct arfs_table *arfs_t = &priv->fs.arfs->arfs_tables[type]; + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + enum mlx5_traffic_types tt; + int err = 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + tt = arfs_get_tt(type); + if (tt == -EINVAL) { + netdev_err(priv->netdev, "%s: bad arfs_type: %d\n", + __func__, type); + return -EINVAL; + } + + /* FIXME: Must use mlx5_ttc_get_default_dest(), + * but can't since TTC default is not setup yet ! + */ + dest.tir_num = mlx5e_rx_res_get_tirn_rss(priv->rx_res, tt); + arfs_t->default_rule = mlx5_add_flow_rules(arfs_t->ft.t, NULL, + &flow_act, + &dest, 1); + if (IS_ERR(arfs_t->default_rule)) { + err = PTR_ERR(arfs_t->default_rule); + arfs_t->default_rule = NULL; + netdev_err(priv->netdev, "%s: add rule failed, arfs type=%d\n", + __func__, type); + } + + return err; +} + +#define MLX5E_ARFS_NUM_GROUPS 2 +#define MLX5E_ARFS_GROUP1_SIZE (BIT(16) - 1) +#define MLX5E_ARFS_GROUP2_SIZE BIT(0) +#define MLX5E_ARFS_TABLE_SIZE (MLX5E_ARFS_GROUP1_SIZE +\ + MLX5E_ARFS_GROUP2_SIZE) +static int arfs_create_groups(struct mlx5e_flow_table *ft, + enum arfs_type type) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *outer_headers_c; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS, + sizeof(*ft->g), GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in || !ft->g) { + kfree(ft->g); + kvfree(in); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, + outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ethertype); + switch (type) { + case ARFS_IPV4_TCP: + case ARFS_IPV6_TCP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport); + break; + case ARFS_IPV4_UDP: + case ARFS_IPV6_UDP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_dport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_sport); + break; + default: + err = -EINVAL; + goto out; + } + + switch (type) { + case ARFS_IPV4_TCP: + case ARFS_IPV4_UDP: + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + break; + case ARFS_IPV6_TCP: + case ARFS_IPV6_UDP: + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, 16); + memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, 16); + break; + default: + err = -EINVAL; + goto out; + } + + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_ARFS_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_ARFS_GROUP2_SIZE; + MLX5_SET_CFG(in, 
end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err; + ft->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; +out: + kvfree(in); + + return err; +} + +static int arfs_create_table(struct mlx5e_priv *priv, + enum arfs_type type) +{ + struct mlx5e_arfs_tables *arfs = priv->fs.arfs; + struct mlx5e_flow_table *ft = &arfs->arfs_tables[type].ft; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_ARFS_TABLE_SIZE; + ft_attr.level = MLX5E_ARFS_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + err = arfs_create_groups(ft, type); + if (err) + goto err; + + err = arfs_add_default_rule(priv, type); + if (err) + goto err; + + return 0; +err: + mlx5e_destroy_flow_table(ft); + return err; +} + +int mlx5e_arfs_create_tables(struct mlx5e_priv *priv) +{ + int err = -ENOMEM; + int i; + + if (!(priv->netdev->hw_features & NETIF_F_NTUPLE)) + return 0; + + priv->fs.arfs = kvzalloc(sizeof(*priv->fs.arfs), GFP_KERNEL); + if (!priv->fs.arfs) + return -ENOMEM; + + spin_lock_init(&priv->fs.arfs->arfs_lock); + INIT_LIST_HEAD(&priv->fs.arfs->rules); + priv->fs.arfs->wq = create_singlethread_workqueue("mlx5e_arfs"); + if (!priv->fs.arfs->wq) + goto err; + + for (i = 0; i < ARFS_NUM_TYPES; i++) { + err = arfs_create_table(priv, i); + if (err) + goto err_des; + } + return 0; + +err_des: + _mlx5e_cleanup_tables(priv); +err: + kvfree(priv->fs.arfs); + return err; +} + +#define MLX5E_ARFS_EXPIRY_QUOTA 60 + +static void arfs_may_expire_flow(struct mlx5e_priv *priv) +{ + struct arfs_rule *arfs_rule; + struct hlist_node *htmp; + HLIST_HEAD(del_list); + int quota = 0; + int i; + int j; + + spin_lock_bh(&priv->fs.arfs->arfs_lock); + mlx5e_for_each_arfs_rule(arfs_rule, htmp, priv->fs.arfs->arfs_tables, i, j) { + if (!work_pending(&arfs_rule->arfs_work) && + rps_may_expire_flow(priv->netdev, + arfs_rule->rxq, arfs_rule->flow_id, + arfs_rule->filter_id)) { + hlist_del_init(&arfs_rule->hlist); + hlist_add_head(&arfs_rule->hlist, &del_list); + if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA) + break; + } + } + spin_unlock_bh(&priv->fs.arfs->arfs_lock); + hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) { + if (arfs_rule->rule) + mlx5_del_flow_rules(arfs_rule->rule); + hlist_del(&arfs_rule->hlist); + kfree(arfs_rule); + } +} + +static void arfs_del_rules(struct mlx5e_priv *priv) +{ + struct hlist_node *htmp; + struct arfs_rule *rule; + HLIST_HEAD(del_list); + int i; + int j; + + spin_lock_bh(&priv->fs.arfs->arfs_lock); + mlx5e_for_each_arfs_rule(rule, htmp, priv->fs.arfs->arfs_tables, i, j) { + hlist_del_init(&rule->hlist); + hlist_add_head(&rule->hlist, &del_list); + } + spin_unlock_bh(&priv->fs.arfs->arfs_lock); + + hlist_for_each_entry_safe(rule, htmp, &del_list, hlist) { + cancel_work_sync(&rule->arfs_work); + if (rule->rule) + mlx5_del_flow_rules(rule->rule); + hlist_del(&rule->hlist); + kfree(rule); + } +} + +static struct hlist_head * +arfs_hash_bucket(struct arfs_table *arfs_t, __be16 src_port, + __be16 dst_port) +{ + unsigned long l; + int bucket_idx; + + l = (__force unsigned long)src_port | + ((__force unsigned long)dst_port << 2); + + bucket_idx = hash_long(l, ARFS_HASH_SHIFT); + + return &arfs_t->rules_hash[bucket_idx]; +} + +static struct arfs_table *arfs_get_table(struct 
mlx5e_arfs_tables *arfs, + u8 ip_proto, __be16 etype) +{ + if (etype == htons(ETH_P_IP) && ip_proto == IPPROTO_TCP) + return &arfs->arfs_tables[ARFS_IPV4_TCP]; + if (etype == htons(ETH_P_IP) && ip_proto == IPPROTO_UDP) + return &arfs->arfs_tables[ARFS_IPV4_UDP]; + if (etype == htons(ETH_P_IPV6) && ip_proto == IPPROTO_TCP) + return &arfs->arfs_tables[ARFS_IPV6_TCP]; + if (etype == htons(ETH_P_IPV6) && ip_proto == IPPROTO_UDP) + return &arfs->arfs_tables[ARFS_IPV6_UDP]; + + return NULL; +} + +static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv, + struct arfs_rule *arfs_rule) +{ + struct mlx5e_arfs_tables *arfs = priv->fs.arfs; + struct arfs_tuple *tuple = &arfs_rule->tuple; + struct mlx5_flow_handle *rule = NULL; + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct arfs_table *arfs_table; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, + ntohs(tuple->etype)); + arfs_table = arfs_get_table(arfs, tuple->ip_proto, tuple->etype); + if (!arfs_table) { + err = -EINVAL; + goto out; + } + + ft = arfs_table->ft.t; + if (tuple->ip_proto == IPPROTO_TCP) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.tcp_dport); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.tcp_sport); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_dport, + ntohs(tuple->dst_port)); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_sport, + ntohs(tuple->src_port)); + } else { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.udp_dport); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.udp_sport); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_dport, + ntohs(tuple->dst_port)); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_sport, + ntohs(tuple->src_port)); + } + if (tuple->etype == htons(ETH_P_IP)) { + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4), + &tuple->src_ipv4, + 4); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &tuple->dst_ipv4, + 4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + } else { + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + &tuple->src_ipv6, + 16); + memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &tuple->dst_ipv6, + 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + 0xff, + 16); + memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, + 16); + } + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = mlx5e_rx_res_get_tirn_direct(priv->rx_res, arfs_rule->rxq); + rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + 
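+		/* Account the failure on the target RQ's aRFS error counter
+		 * before logging it.
+		 */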
priv->channel_stats[arfs_rule->rxq]->rq.arfs_err++; + mlx5e_dbg(HW, priv, + "%s: add rule(filter id=%d, rq idx=%d, ip proto=0x%x) failed,err=%d\n", + __func__, arfs_rule->filter_id, arfs_rule->rxq, + tuple->ip_proto, err); + } + +out: + kvfree(spec); + return err ? ERR_PTR(err) : rule; +} + +static void arfs_modify_rule_rq(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, u16 rxq) +{ + struct mlx5_flow_destination dst = {}; + int err = 0; + + dst.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dst.tir_num = mlx5e_rx_res_get_tirn_direct(priv->rx_res, rxq); + err = mlx5_modify_rule_destination(rule, &dst, NULL); + if (err) + netdev_warn(priv->netdev, + "Failed to modify aRFS rule destination to rq=%d\n", rxq); +} + +static void arfs_handle_work(struct work_struct *work) +{ + struct arfs_rule *arfs_rule = container_of(work, + struct arfs_rule, + arfs_work); + struct mlx5e_priv *priv = arfs_rule->priv; + struct mlx5_flow_handle *rule; + + mutex_lock(&priv->state_lock); + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + spin_lock_bh(&priv->fs.arfs->arfs_lock); + hlist_del(&arfs_rule->hlist); + spin_unlock_bh(&priv->fs.arfs->arfs_lock); + + mutex_unlock(&priv->state_lock); + kfree(arfs_rule); + goto out; + } + mutex_unlock(&priv->state_lock); + + if (!arfs_rule->rule) { + rule = arfs_add_rule(priv, arfs_rule); + if (IS_ERR(rule)) + goto out; + arfs_rule->rule = rule; + } else { + arfs_modify_rule_rq(priv, arfs_rule->rule, + arfs_rule->rxq); + } +out: + arfs_may_expire_flow(priv); +} + +static struct arfs_rule *arfs_alloc_rule(struct mlx5e_priv *priv, + struct arfs_table *arfs_t, + const struct flow_keys *fk, + u16 rxq, u32 flow_id) +{ + struct arfs_rule *rule; + struct arfs_tuple *tuple; + + rule = kzalloc(sizeof(*rule), GFP_ATOMIC); + if (!rule) + return NULL; + + rule->priv = priv; + rule->rxq = rxq; + INIT_WORK(&rule->arfs_work, arfs_handle_work); + + tuple = &rule->tuple; + tuple->etype = fk->basic.n_proto; + tuple->ip_proto = fk->basic.ip_proto; + if (tuple->etype == htons(ETH_P_IP)) { + tuple->src_ipv4 = fk->addrs.v4addrs.src; + tuple->dst_ipv4 = fk->addrs.v4addrs.dst; + } else { + memcpy(&tuple->src_ipv6, &fk->addrs.v6addrs.src, + sizeof(struct in6_addr)); + memcpy(&tuple->dst_ipv6, &fk->addrs.v6addrs.dst, + sizeof(struct in6_addr)); + } + tuple->src_port = fk->ports.src; + tuple->dst_port = fk->ports.dst; + + rule->flow_id = flow_id; + rule->filter_id = priv->fs.arfs->last_filter_id++ % RPS_NO_FILTER; + + hlist_add_head(&rule->hlist, + arfs_hash_bucket(arfs_t, tuple->src_port, + tuple->dst_port)); + return rule; +} + +static bool arfs_cmp(const struct arfs_tuple *tuple, const struct flow_keys *fk) +{ + if (tuple->src_port != fk->ports.src || tuple->dst_port != fk->ports.dst) + return false; + if (tuple->etype != fk->basic.n_proto) + return false; + if (tuple->etype == htons(ETH_P_IP)) + return tuple->src_ipv4 == fk->addrs.v4addrs.src && + tuple->dst_ipv4 == fk->addrs.v4addrs.dst; + if (tuple->etype == htons(ETH_P_IPV6)) + return !memcmp(&tuple->src_ipv6, &fk->addrs.v6addrs.src, + sizeof(struct in6_addr)) && + !memcmp(&tuple->dst_ipv6, &fk->addrs.v6addrs.dst, + sizeof(struct in6_addr)); + return false; +} + +static struct arfs_rule *arfs_find_rule(struct arfs_table *arfs_t, + const struct flow_keys *fk) +{ + struct arfs_rule *arfs_rule; + struct hlist_head *head; + + head = arfs_hash_bucket(arfs_t, fk->ports.src, fk->ports.dst); + hlist_for_each_entry(arfs_rule, head, hlist) { + if (arfs_cmp(&arfs_rule->tuple, fk)) + return arfs_rule; + } + + return NULL; +} + +int 
mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, + u16 rxq_index, u32 flow_id) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_arfs_tables *arfs = priv->fs.arfs; + struct arfs_table *arfs_t; + struct arfs_rule *arfs_rule; + struct flow_keys fk; + + if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) + return -EPROTONOSUPPORT; + + if (fk.basic.n_proto != htons(ETH_P_IP) && + fk.basic.n_proto != htons(ETH_P_IPV6)) + return -EPROTONOSUPPORT; + + if (skb->encapsulation) + return -EPROTONOSUPPORT; + + arfs_t = arfs_get_table(arfs, fk.basic.ip_proto, fk.basic.n_proto); + if (!arfs_t) + return -EPROTONOSUPPORT; + + spin_lock_bh(&arfs->arfs_lock); + arfs_rule = arfs_find_rule(arfs_t, &fk); + if (arfs_rule) { + if (arfs_rule->rxq == rxq_index) { + spin_unlock_bh(&arfs->arfs_lock); + return arfs_rule->filter_id; + } + arfs_rule->rxq = rxq_index; + } else { + arfs_rule = arfs_alloc_rule(priv, arfs_t, &fk, rxq_index, flow_id); + if (!arfs_rule) { + spin_unlock_bh(&arfs->arfs_lock); + return -ENOMEM; + } + } + queue_work(priv->fs.arfs->wq, &arfs_rule->arfs_work); + spin_unlock_bh(&arfs->arfs_lock); + return arfs_rule->filter_id; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_common.c new file mode 100644 index 0000000..166326b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "en.h" + +/* mlx5e global resources should be placed in this file. + * Global resources are common to all the netdevices created on the same nic. 
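+ * They are set up once per mlx5_core_dev in mlx5e_create_mdev_resources():
+ * a PD, a transport domain, a PA mkey and a blue-flame register.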
+ */ + +void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc) +{ + bool ro_pci_enable = pcie_relaxed_ordering_enabled(mdev->pdev); + bool ro_write = MLX5_CAP_GEN(mdev, relaxed_ordering_write); + bool ro_read = MLX5_CAP_GEN(mdev, relaxed_ordering_read); + + MLX5_SET(mkc, mkc, relaxed_ordering_read, ro_pci_enable && ro_read); + MLX5_SET(mkc, mkc, relaxed_ordering_write, ro_pci_enable && ro_write); +} + +int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + mlx5e_mkey_set_relaxed_ordering(mdev, mkc); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + + kvfree(in); + return err; +} + +int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) +{ + struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; + int err; + + err = mlx5_core_alloc_pd(mdev, &res->pdn); + if (err) { + mlx5_core_err(mdev, "alloc pd failed, %d\n", err); + return err; + } + + err = mlx5_core_alloc_transport_domain(mdev, &res->td.tdn); + if (err) { + mlx5_core_err(mdev, "alloc td failed, %d\n", err); + goto err_dealloc_pd; + } + + err = mlx5e_create_mkey(mdev, res->pdn, &res->mkey); + if (err) { + mlx5_core_err(mdev, "create mkey failed, %d\n", err); + goto err_dealloc_transport_domain; + } + + err = mlx5_alloc_bfreg(mdev, &res->bfreg, false, false); + if (err) { + mlx5_core_err(mdev, "alloc bfreg failed, %d\n", err); + goto err_destroy_mkey; + } + + INIT_LIST_HEAD(&res->td.tirs_list); + mutex_init(&res->td.list_lock); + + return 0; + +err_destroy_mkey: + mlx5_core_destroy_mkey(mdev, res->mkey); +err_dealloc_transport_domain: + mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); +err_dealloc_pd: + mlx5_core_dealloc_pd(mdev, res->pdn); + return err; +} + +void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) +{ + struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; + + if (!res->bfreg.up) + return; + mlx5_free_bfreg(mdev, &res->bfreg); + mlx5_core_destroy_mkey(mdev, res->mkey); + mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); + mlx5_core_dealloc_pd(mdev, res->pdn); + memset(res, 0, sizeof(*res)); +} + +int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, + bool enable_mc_lb) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_tir *tir; + u8 lb_flags = 0; + int err = 0; + u32 tirn = 0; + int inlen; + void *in; + + inlen = MLX5_ST_SZ_BYTES(modify_tir_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto out; + } + + if (enable_uc_lb) + lb_flags = MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + + if (enable_mc_lb) + lb_flags |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + + if (lb_flags) + MLX5_SET(modify_tir_in, in, ctx.self_lb_block, lb_flags); + + MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1); + + mutex_lock(&mdev->mlx5e_res.hw_objs.td.list_lock); + list_for_each_entry(tir, &mdev->mlx5e_res.hw_objs.td.tirs_list, list) { + tirn = tir->tirn; + err = mlx5_core_modify_tir(mdev, tirn, in); + if (err) + goto out; + } + +out: + kvfree(in); + if (err) + netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err); + mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock); + + return 
err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c new file mode 100644 index 0000000..47334b8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -0,0 +1,1293 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include "en.h" +#include "en/port.h" +#include "en/port_buffer.h" + +#define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */ + +#define MLX5E_100MB (100000) +#define MLX5E_1GB (1000000) + +#define MLX5E_CEE_STATE_UP 1 +#define MLX5E_CEE_STATE_DOWN 0 + +/* Max supported cable length is 1000 meters */ +#define MLX5E_MAX_CABLE_LENGTH 1000 + +enum { + MLX5E_VENDOR_TC_GROUP_NUM = 7, + MLX5E_LOWEST_PRIO_GROUP = 0, +}; + +enum { + MLX5_DCB_CHG_RESET, + MLX5_DCB_NO_CHG, + MLX5_DCB_CHG_NO_RESET, +}; + +#define MLX5_DSCP_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, qcam_reg) && \ + MLX5_CAP_QCAM_REG(mdev, qpts) && \ + MLX5_CAP_QCAM_REG(mdev, qpdpm)) + +static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state); +static int mlx5e_set_dscp2prio(struct mlx5e_priv *priv, u8 dscp, u8 prio); + +/* If dcbx mode is non-host set the dcbx mode to host. 
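+ * When the requested mode is not host, the willing_admin bit is set as well.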
+ */ +static int mlx5e_dcbnl_set_dcbx_mode(struct mlx5e_priv *priv, + enum mlx5_dcbx_oper_mode mode) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 param[MLX5_ST_SZ_DW(dcbx_param)]; + int err; + + err = mlx5_query_port_dcbx_param(mdev, param); + if (err) + return err; + + MLX5_SET(dcbx_param, param, version_admin, mode); + if (mode != MLX5E_DCBX_PARAM_VER_OPER_HOST) + MLX5_SET(dcbx_param, param, willing_admin, 1); + + return mlx5_set_port_dcbx_param(mdev, param); +} + +static int mlx5e_dcbnl_switch_to_host_mode(struct mlx5e_priv *priv) +{ + struct mlx5e_dcbx *dcbx = &priv->dcbx; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, dcbx)) + return 0; + + if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) + return 0; + + err = mlx5e_dcbnl_set_dcbx_mode(priv, MLX5E_DCBX_PARAM_VER_OPER_HOST); + if (err) + return err; + + dcbx->mode = MLX5E_DCBX_PARAM_VER_OPER_HOST; + return 0; +} + +static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 tc_group[IEEE_8021QAZ_MAX_TCS]; + bool is_tc_group_6_exist = false; + bool is_zero_bw_ets_tc = false; + int err = 0; + int i; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) + return -EOPNOTSUPP; + + ets->ets_cap = mlx5_max_tc(priv->mdev) + 1; + for (i = 0; i < ets->ets_cap; i++) { + err = mlx5_query_port_prio_tc(mdev, i, &ets->prio_tc[i]); + if (err) + return err; + + err = mlx5_query_port_tc_group(mdev, i, &tc_group[i]); + if (err) + return err; + + err = mlx5_query_port_tc_bw_alloc(mdev, i, &ets->tc_tx_bw[i]); + if (err) + return err; + + if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC && + tc_group[i] == (MLX5E_LOWEST_PRIO_GROUP + 1)) + is_zero_bw_ets_tc = true; + + if (tc_group[i] == (MLX5E_VENDOR_TC_GROUP_NUM - 1)) + is_tc_group_6_exist = true; + } + + /* Report 0% ets tc if exits*/ + if (is_zero_bw_ets_tc) { + for (i = 0; i < ets->ets_cap; i++) + if (tc_group[i] == MLX5E_LOWEST_PRIO_GROUP) + ets->tc_tx_bw[i] = 0; + } + + /* Update tc_tsa based on fw setting*/ + for (i = 0; i < ets->ets_cap; i++) { + if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC) + priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_ETS; + else if (tc_group[i] == MLX5E_VENDOR_TC_GROUP_NUM && + !is_tc_group_6_exist) + priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_VENDOR; + } + memcpy(ets->tc_tsa, priv->dcbx.tc_tsa, sizeof(ets->tc_tsa)); + + return err; +} + +static void mlx5e_build_tc_group(struct ieee_ets *ets, u8 *tc_group, int max_tc) +{ + bool any_tc_mapped_to_ets = false; + bool ets_zero_bw = false; + int strict_group; + int i; + + for (i = 0; i <= max_tc; i++) { + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) { + any_tc_mapped_to_ets = true; + if (!ets->tc_tx_bw[i]) + ets_zero_bw = true; + } + } + + /* strict group has higher priority than ets group */ + strict_group = MLX5E_LOWEST_PRIO_GROUP; + if (any_tc_mapped_to_ets) + strict_group++; + if (ets_zero_bw) + strict_group++; + + for (i = 0; i <= max_tc; i++) { + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_VENDOR: + tc_group[i] = MLX5E_VENDOR_TC_GROUP_NUM; + break; + case IEEE_8021QAZ_TSA_STRICT: + tc_group[i] = strict_group++; + break; + case IEEE_8021QAZ_TSA_ETS: + tc_group[i] = MLX5E_LOWEST_PRIO_GROUP; + if (ets->tc_tx_bw[i] && ets_zero_bw) + tc_group[i] = MLX5E_LOWEST_PRIO_GROUP + 1; + break; + } + } +} + +static void mlx5e_build_tc_tx_bw(struct ieee_ets *ets, u8 *tc_tx_bw, + u8 *tc_group, int max_tc) +{ + int bw_for_ets_zero_bw_tc = 0; + int last_ets_zero_bw_tc = -1; + int num_ets_zero_bw = 0; + int i; + + for (i = 0; i <= max_tc; i++) 
{ + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS && + !ets->tc_tx_bw[i]) { + num_ets_zero_bw++; + last_ets_zero_bw_tc = i; + } + } + + if (num_ets_zero_bw) + bw_for_ets_zero_bw_tc = MLX5E_MAX_BW_ALLOC / num_ets_zero_bw; + + for (i = 0; i <= max_tc; i++) { + switch (ets->tc_tsa[i]) { + case IEEE_8021QAZ_TSA_VENDOR: + tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC; + break; + case IEEE_8021QAZ_TSA_STRICT: + tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC; + break; + case IEEE_8021QAZ_TSA_ETS: + tc_tx_bw[i] = ets->tc_tx_bw[i] ? + ets->tc_tx_bw[i] : + bw_for_ets_zero_bw_tc; + break; + } + } + + /* Make sure the total bw for ets zero bw group is 100% */ + if (last_ets_zero_bw_tc != -1) + tc_tx_bw[last_ets_zero_bw_tc] += + MLX5E_MAX_BW_ALLOC % num_ets_zero_bw; +} + +/* If there are ETS BW 0, + * Set ETS group # to 1 for all ETS non zero BW tcs. Their sum must be 100%. + * Set group #0 to all the ETS BW 0 tcs and + * equally splits the 100% BW between them + * Report both group #0 and #1 as ETS type. + * All the tcs in group #0 will be reported with 0% BW. + */ +static int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u8 tc_tx_bw[IEEE_8021QAZ_MAX_TCS]; + u8 tc_group[IEEE_8021QAZ_MAX_TCS]; + int max_tc = mlx5_max_tc(mdev); + int err, i; + + mlx5e_build_tc_group(ets, tc_group, max_tc); + mlx5e_build_tc_tx_bw(ets, tc_tx_bw, tc_group, max_tc); + + err = mlx5_set_port_prio_tc(mdev, ets->prio_tc); + if (err) + return err; + + err = mlx5_set_port_tc_group(mdev, tc_group); + if (err) + return err; + + err = mlx5_set_port_tc_bw_alloc(mdev, tc_tx_bw); + + if (err) + return err; + + memcpy(priv->dcbx.tc_tsa, ets->tc_tsa, sizeof(ets->tc_tsa)); + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + mlx5e_dbg(HW, priv, "%s: prio_%d <=> tc_%d\n", + __func__, i, ets->prio_tc[i]); + mlx5e_dbg(HW, priv, "%s: tc_%d <=> tx_bw_%d%%, group_%d\n", + __func__, i, tc_tx_bw[i], tc_group[i]); + } + + return err; +} + +static int mlx5e_dbcnl_validate_ets(struct net_device *netdev, + struct ieee_ets *ets, + bool zero_sum_allowed) +{ + bool have_ets_tc = false; + int bw_sum = 0; + int i; + + /* Validate Priority */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (ets->prio_tc[i] >= MLX5E_MAX_PRIORITY) { + netdev_err(netdev, + "Failed to validate ETS: priority value greater than max(%d)\n", + MLX5E_MAX_PRIORITY); + return -EINVAL; + } + } + + /* Validate Bandwidth Sum */ + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) { + have_ets_tc = true; + bw_sum += ets->tc_tx_bw[i]; + } + } + + if (have_ets_tc && bw_sum != 100) { + if (bw_sum || (!bw_sum && !zero_sum_allowed)) + netdev_err(netdev, + "Failed to validate ETS: BW sum is illegal\n"); + return -EINVAL; + } + return 0; +} + +static int mlx5e_dcbnl_ieee_setets(struct net_device *netdev, + struct ieee_ets *ets) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) + return -EOPNOTSUPP; + + err = mlx5e_dbcnl_validate_ets(netdev, ets, false); + if (err) + return err; + + err = mlx5e_dcbnl_ieee_setets_core(priv, ets); + if (err) + return err; + + return 0; +} + +static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + int i; + + pfc->pfc_cap = mlx5_max_tc(mdev) + 1; + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + pfc->requests[i] = PPORT_PER_PRIO_GET(pstats, i, 
tx_pause); + pfc->indications[i] = PPORT_PER_PRIO_GET(pstats, i, rx_pause); + } + + if (MLX5_BUFFER_SUPPORTED(mdev)) + pfc->delay = priv->dcbx.cable_len; + + return mlx5_query_port_pfc(mdev, &pfc->pfc_en, NULL); +} + +static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, + struct ieee_pfc *pfc) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + u32 old_cable_len = priv->dcbx.cable_len; + struct ieee_pfc pfc_new; + u32 changed = 0; + u8 curr_pfc_en; + int ret = 0; + + /* pfc_en */ + mlx5_query_port_pfc(mdev, &curr_pfc_en, NULL); + if (pfc->pfc_en != curr_pfc_en) { + ret = mlx5_set_port_pfc(mdev, pfc->pfc_en, pfc->pfc_en); + if (ret) + return ret; + mlx5_toggle_port_link(mdev); + changed |= MLX5E_PORT_BUFFER_PFC; + } + + if (pfc->delay && + pfc->delay < MLX5E_MAX_CABLE_LENGTH && + pfc->delay != priv->dcbx.cable_len) { + priv->dcbx.cable_len = pfc->delay; + changed |= MLX5E_PORT_BUFFER_CABLE_LEN; + } + + if (MLX5_BUFFER_SUPPORTED(mdev)) { + pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? pfc->pfc_en : curr_pfc_en; + if (priv->dcbx.manual_buffer) + ret = mlx5e_port_manual_buffer_config(priv, changed, + dev->mtu, &pfc_new, + NULL, NULL); + + if (ret && (changed & MLX5E_PORT_BUFFER_CABLE_LEN)) + priv->dcbx.cable_len = old_cable_len; + } + + if (!ret) { + mlx5e_dbg(HW, priv, + "%s: PFC per priority bit mask: 0x%x\n", + __func__, pfc->pfc_en); + } + return ret; +} + +static u8 mlx5e_dcbnl_getdcbx(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return priv->dcbx.cap; +} + +static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_dcbx *dcbx = &priv->dcbx; + + if (mode & DCB_CAP_DCBX_LLD_MANAGED) + return 1; + + if ((!mode) && MLX5_CAP_GEN(priv->mdev, dcbx)) { + if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_AUTO) + return 0; + + /* set dcbx to fw controlled */ + if (!mlx5e_dcbnl_set_dcbx_mode(priv, MLX5E_DCBX_PARAM_VER_OPER_AUTO)) { + dcbx->mode = MLX5E_DCBX_PARAM_VER_OPER_AUTO; + dcbx->cap &= ~DCB_CAP_DCBX_HOST; + return 0; + } + + return 1; + } + + if (!(mode & DCB_CAP_DCBX_HOST)) + return 1; + + if (mlx5e_dcbnl_switch_to_host_mode(netdev_priv(dev))) + return 1; + + dcbx->cap = mode; + + return 0; +} + +static int mlx5e_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct dcb_app temp; + bool is_new; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager) || + !MLX5_DSCP_SUPPORTED(priv->mdev)) + return -EOPNOTSUPP; + + if ((app->selector != IEEE_8021QAZ_APP_SEL_DSCP) || + (app->protocol >= MLX5E_MAX_DSCP)) + return -EINVAL; + + /* Save the old entry info */ + temp.selector = IEEE_8021QAZ_APP_SEL_DSCP; + temp.protocol = app->protocol; + temp.priority = priv->dcbx_dp.dscp2prio[app->protocol]; + + /* Check if need to switch to dscp trust state */ + if (!priv->dcbx.dscp_app_cnt) { + err = mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_DSCP); + if (err) + return err; + } + + /* Skip the fw command if new and old mapping are the same */ + if (app->priority != priv->dcbx_dp.dscp2prio[app->protocol]) { + err = mlx5e_set_dscp2prio(priv, app->protocol, app->priority); + if (err) + goto fw_err; + } + + /* Delete the old entry if exists */ + is_new = false; + err = dcb_ieee_delapp(dev, &temp); + if (err) + is_new = true; + + /* Add new entry and update counter */ + err = dcb_ieee_setapp(dev, app); + if (err) + return err; + + if (is_new) + priv->dcbx.dscp_app_cnt++; + + return err; + 
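+	/* Programming the DSCP-to-priority mapping failed: revert to PCP trust state */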
+fw_err: + mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP); + return err; +} + +static int mlx5e_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager) || + !MLX5_DSCP_SUPPORTED(priv->mdev)) + return -EOPNOTSUPP; + + if ((app->selector != IEEE_8021QAZ_APP_SEL_DSCP) || + (app->protocol >= MLX5E_MAX_DSCP)) + return -EINVAL; + + /* Skip if no dscp app entry */ + if (!priv->dcbx.dscp_app_cnt) + return -ENOENT; + + /* Check if the entry matches fw setting */ + if (app->priority != priv->dcbx_dp.dscp2prio[app->protocol]) + return -ENOENT; + + /* Delete the app entry */ + err = dcb_ieee_delapp(dev, app); + if (err) + return err; + + /* Reset the priority mapping back to zero */ + err = mlx5e_set_dscp2prio(priv, app->protocol, 0); + if (err) + goto fw_err; + + priv->dcbx.dscp_app_cnt--; + + /* Check if need to switch to pcp trust state */ + if (!priv->dcbx.dscp_app_cnt) + err = mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP); + + return err; + +fw_err: + mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP); + return err; +} + +static int mlx5e_dcbnl_ieee_getmaxrate(struct net_device *netdev, + struct ieee_maxrate *maxrate) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; + u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; + int err; + int i; + + err = mlx5_query_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit); + if (err) + return err; + + memset(maxrate->tc_maxrate, 0, sizeof(maxrate->tc_maxrate)); + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + switch (max_bw_unit[i]) { + case MLX5_100_MBPS_UNIT: + maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_100MB; + break; + case MLX5_GBPS_UNIT: + maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_1GB; + break; + case MLX5_BW_NO_LIMIT: + break; + default: + WARN(true, "non-supported BW unit"); + break; + } + } + + return 0; +} + +static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, + struct ieee_maxrate *maxrate) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; + u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; + __u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB); + int i; + + memset(max_bw_value, 0, sizeof(max_bw_value)); + memset(max_bw_unit, 0, sizeof(max_bw_unit)); + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + if (!maxrate->tc_maxrate[i]) { + max_bw_unit[i] = MLX5_BW_NO_LIMIT; + continue; + } + if (maxrate->tc_maxrate[i] < upper_limit_mbps) { + max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], + MLX5E_100MB); + max_bw_value[i] = max_bw_value[i] ? 
max_bw_value[i] : 1; + max_bw_unit[i] = MLX5_100_MBPS_UNIT; + } else { + max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], + MLX5E_1GB); + max_bw_unit[i] = MLX5_GBPS_UNIT; + } + } + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + mlx5e_dbg(HW, priv, "%s: tc_%d <=> max_bw %d Gbps\n", + __func__, i, max_bw_value[i]); + } + + return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit); +} + +static u8 mlx5e_dcbnl_setall(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + struct mlx5_core_dev *mdev = priv->mdev; + struct ieee_ets ets; + struct ieee_pfc pfc; + int err = -EOPNOTSUPP; + int i; + + if (!MLX5_CAP_GEN(mdev, ets)) + goto out; + + memset(&ets, 0, sizeof(ets)); + memset(&pfc, 0, sizeof(pfc)); + + ets.ets_cap = IEEE_8021QAZ_MAX_TCS; + for (i = 0; i < CEE_DCBX_MAX_PGS; i++) { + ets.tc_tx_bw[i] = cee_cfg->pg_bw_pct[i]; + ets.tc_rx_bw[i] = cee_cfg->pg_bw_pct[i]; + ets.tc_tsa[i] = IEEE_8021QAZ_TSA_ETS; + ets.prio_tc[i] = cee_cfg->prio_to_pg_map[i]; + mlx5e_dbg(HW, priv, + "%s: Priority group %d: tx_bw %d, rx_bw %d, prio_tc %d\n", + __func__, i, ets.tc_tx_bw[i], ets.tc_rx_bw[i], + ets.prio_tc[i]); + } + + err = mlx5e_dbcnl_validate_ets(netdev, &ets, true); + if (err) + goto out; + + err = mlx5e_dcbnl_ieee_setets_core(priv, &ets); + if (err) { + netdev_err(netdev, + "%s, Failed to set ETS: %d\n", __func__, err); + goto out; + } + + /* Set PFC */ + pfc.pfc_cap = mlx5_max_tc(mdev) + 1; + if (!cee_cfg->pfc_enable) + pfc.pfc_en = 0; + else + for (i = 0; i < CEE_DCBX_MAX_PRIO; i++) + pfc.pfc_en |= cee_cfg->pfc_setting[i] << i; + + err = mlx5e_dcbnl_ieee_setpfc(netdev, &pfc); + if (err) { + netdev_err(netdev, + "%s, Failed to set PFC: %d\n", __func__, err); + goto out; + } +out: + return err ? 
MLX5_DCB_NO_CHG : MLX5_DCB_CHG_RESET; +} + +static u8 mlx5e_dcbnl_getstate(struct net_device *netdev) +{ + return MLX5E_CEE_STATE_UP; +} + +static void mlx5e_dcbnl_getpermhwaddr(struct net_device *netdev, + u8 *perm_addr) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (!perm_addr) + return; + + memset(perm_addr, 0xff, MAX_ADDR_LEN); + + mlx5_query_mac_address(priv->mdev, perm_addr); +} + +static void mlx5e_dcbnl_setpgtccfgtx(struct net_device *netdev, + int priority, u8 prio_type, + u8 pgid, u8 bw_pct, u8 up_map) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + if (pgid >= CEE_DCBX_MAX_PGS) { + netdev_err(netdev, + "%s, priority group is out of range\n", __func__); + return; + } + + cee_cfg->prio_to_pg_map[priority] = pgid; +} + +static void mlx5e_dcbnl_setpgbwgcfgtx(struct net_device *netdev, + int pgid, u8 bw_pct) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if (pgid >= CEE_DCBX_MAX_PGS) { + netdev_err(netdev, + "%s, priority group is out of range\n", __func__); + return; + } + + cee_cfg->pg_bw_pct[pgid] = bw_pct; +} + +static void mlx5e_dcbnl_getpgtccfgtx(struct net_device *netdev, + int priority, u8 *prio_type, + u8 *pgid, u8 *bw_pct, u8 *up_map) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) { + netdev_err(netdev, "%s, ets is not supported\n", __func__); + return; + } + + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + *prio_type = 0; + *bw_pct = 0; + *up_map = 0; + + if (mlx5_query_port_prio_tc(mdev, priority, pgid)) + *pgid = 0; +} + +static void mlx5e_dcbnl_getpgbwgcfgtx(struct net_device *netdev, + int pgid, u8 *bw_pct) +{ + struct ieee_ets ets; + + if (pgid >= CEE_DCBX_MAX_PGS) { + netdev_err(netdev, + "%s, priority group is out of range\n", __func__); + return; + } + + mlx5e_dcbnl_ieee_getets(netdev, &ets); + *bw_pct = ets.tc_tx_bw[pgid]; +} + +static void mlx5e_dcbnl_setpfccfg(struct net_device *netdev, + int priority, u8 setting) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + if (setting > 1) + return; + + cee_cfg->pfc_setting[priority] = setting; +} + +static int +mlx5e_dcbnl_get_priority_pfc(struct net_device *netdev, + int priority, u8 *setting) +{ + struct ieee_pfc pfc; + int err; + + err = mlx5e_dcbnl_ieee_getpfc(netdev, &pfc); + + if (err) + *setting = 0; + else + *setting = (pfc.pfc_en >> priority) & 0x01; + + return err; +} + +static void mlx5e_dcbnl_getpfccfg(struct net_device *netdev, + int priority, u8 *setting) +{ + if (priority >= CEE_DCBX_MAX_PRIO) { + netdev_err(netdev, + "%s, priority is out of range\n", __func__); + return; + } + + if (!setting) + return; + + mlx5e_dcbnl_get_priority_pfc(netdev, priority, setting); +} + +static u8 mlx5e_dcbnl_getcap(struct net_device *netdev, + int capid, u8 *cap) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 rval = 0; + + switch (capid) { + case DCB_CAP_ATTR_PG: + *cap = true; + break; + case DCB_CAP_ATTR_PFC: + *cap = true; + break; + case DCB_CAP_ATTR_UP2TC: + 
*cap = false; + break; + case DCB_CAP_ATTR_PG_TCS: + *cap = 1 << mlx5_max_tc(mdev); + break; + case DCB_CAP_ATTR_PFC_TCS: + *cap = 1 << mlx5_max_tc(mdev); + break; + case DCB_CAP_ATTR_GSP: + *cap = false; + break; + case DCB_CAP_ATTR_BCN: + *cap = false; + break; + case DCB_CAP_ATTR_DCBX: + *cap = priv->dcbx.cap | + DCB_CAP_DCBX_VER_CEE | + DCB_CAP_DCBX_VER_IEEE; + break; + default: + *cap = 0; + rval = 1; + break; + } + + return rval; +} + +static int mlx5e_dcbnl_getnumtcs(struct net_device *netdev, + int tcs_id, u8 *num) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + switch (tcs_id) { + case DCB_NUMTCS_ATTR_PG: + case DCB_NUMTCS_ATTR_PFC: + *num = mlx5_max_tc(mdev) + 1; + break; + default: + return -EINVAL; + } + + return 0; +} + +static u8 mlx5e_dcbnl_getpfcstate(struct net_device *netdev) +{ + struct ieee_pfc pfc; + + if (mlx5e_dcbnl_ieee_getpfc(netdev, &pfc)) + return MLX5E_CEE_STATE_DOWN; + + return pfc.pfc_en ? MLX5E_CEE_STATE_UP : MLX5E_CEE_STATE_DOWN; +} + +static void mlx5e_dcbnl_setpfcstate(struct net_device *netdev, u8 state) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg; + + if ((state != MLX5E_CEE_STATE_UP) && (state != MLX5E_CEE_STATE_DOWN)) + return; + + cee_cfg->pfc_enable = state; +} + +static int mlx5e_dcbnl_getbuffer(struct net_device *dev, + struct dcbnl_buffer *dcb_buffer) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_port_buffer port_buffer; + u8 buffer[MLX5E_MAX_PRIORITY]; + int i, err; + + if (!MLX5_BUFFER_SUPPORTED(mdev)) + return -EOPNOTSUPP; + + err = mlx5e_port_query_priority2buffer(mdev, buffer); + if (err) + return err; + + for (i = 0; i < MLX5E_MAX_PRIORITY; i++) + dcb_buffer->prio2buffer[i] = buffer[i]; + + err = mlx5e_port_query_buffer(priv, &port_buffer); + if (err) + return err; + + for (i = 0; i < MLX5E_MAX_BUFFER; i++) + dcb_buffer->buffer_size[i] = port_buffer.buffer[i].size; + dcb_buffer->total_size = port_buffer.port_buffer_size; + + return 0; +} + +static int mlx5e_dcbnl_setbuffer(struct net_device *dev, + struct dcbnl_buffer *dcb_buffer) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_port_buffer port_buffer; + u8 old_prio2buffer[MLX5E_MAX_PRIORITY]; + u32 *buffer_size = NULL; + u8 *prio2buffer = NULL; + u32 changed = 0; + int i, err; + + if (!MLX5_BUFFER_SUPPORTED(mdev)) + return -EOPNOTSUPP; + + for (i = 0; i < DCBX_MAX_BUFFERS; i++) + mlx5_core_dbg(mdev, "buffer[%d]=%d\n", i, dcb_buffer->buffer_size[i]); + + for (i = 0; i < MLX5E_MAX_PRIORITY; i++) + mlx5_core_dbg(mdev, "priority %d buffer%d\n", i, dcb_buffer->prio2buffer[i]); + + err = mlx5e_port_query_priority2buffer(mdev, old_prio2buffer); + if (err) + return err; + + for (i = 0; i < MLX5E_MAX_PRIORITY; i++) { + if (dcb_buffer->prio2buffer[i] != old_prio2buffer[i]) { + changed |= MLX5E_PORT_BUFFER_PRIO2BUFFER; + prio2buffer = dcb_buffer->prio2buffer; + break; + } + } + + err = mlx5e_port_query_buffer(priv, &port_buffer); + if (err) + return err; + + for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + if (port_buffer.buffer[i].size != dcb_buffer->buffer_size[i]) { + changed |= MLX5E_PORT_BUFFER_SIZE; + buffer_size = dcb_buffer->buffer_size; + break; + } + } + + if (!changed) + return 0; + + priv->dcbx.manual_buffer = true; + err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL, + buffer_size, prio2buffer); + return err; +} + +static const struct 
dcbnl_rtnl_ops mlx5e_dcbnl_ops = { + .ieee_getets = mlx5e_dcbnl_ieee_getets, + .ieee_setets = mlx5e_dcbnl_ieee_setets, + .ieee_getmaxrate = mlx5e_dcbnl_ieee_getmaxrate, + .ieee_setmaxrate = mlx5e_dcbnl_ieee_setmaxrate, + .ieee_getpfc = mlx5e_dcbnl_ieee_getpfc, + .ieee_setpfc = mlx5e_dcbnl_ieee_setpfc, + .ieee_setapp = mlx5e_dcbnl_ieee_setapp, + .ieee_delapp = mlx5e_dcbnl_ieee_delapp, + .getdcbx = mlx5e_dcbnl_getdcbx, + .setdcbx = mlx5e_dcbnl_setdcbx, + .dcbnl_getbuffer = mlx5e_dcbnl_getbuffer, + .dcbnl_setbuffer = mlx5e_dcbnl_setbuffer, + +/* CEE interfaces */ + .setall = mlx5e_dcbnl_setall, + .getstate = mlx5e_dcbnl_getstate, + .getpermhwaddr = mlx5e_dcbnl_getpermhwaddr, + + .setpgtccfgtx = mlx5e_dcbnl_setpgtccfgtx, + .setpgbwgcfgtx = mlx5e_dcbnl_setpgbwgcfgtx, + .getpgtccfgtx = mlx5e_dcbnl_getpgtccfgtx, + .getpgbwgcfgtx = mlx5e_dcbnl_getpgbwgcfgtx, + + .setpfccfg = mlx5e_dcbnl_setpfccfg, + .getpfccfg = mlx5e_dcbnl_getpfccfg, + .getcap = mlx5e_dcbnl_getcap, + .getnumtcs = mlx5e_dcbnl_getnumtcs, + .getpfcstate = mlx5e_dcbnl_getpfcstate, + .setpfcstate = mlx5e_dcbnl_setpfcstate, +}; + +void mlx5e_dcbnl_build_netdev(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (MLX5_CAP_GEN(mdev, vport_group_manager) && MLX5_CAP_GEN(mdev, qos)) + netdev->dcbnl_ops = &mlx5e_dcbnl_ops; +} + +void mlx5e_dcbnl_build_rep_netdev(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (MLX5_CAP_GEN(mdev, qos)) + netdev->dcbnl_ops = &mlx5e_dcbnl_ops; +} + +static void mlx5e_dcbnl_query_dcbx_mode(struct mlx5e_priv *priv, + enum mlx5_dcbx_oper_mode *mode) +{ + u32 out[MLX5_ST_SZ_DW(dcbx_param)]; + + *mode = MLX5E_DCBX_PARAM_VER_OPER_HOST; + + if (!mlx5_query_port_dcbx_param(priv->mdev, out)) + *mode = MLX5_GET(dcbx_param, out, version_oper); + + /* From driver's point of view, we only care if the mode + * is host (HOST) or non-host (AUTO) + */ + if (*mode != MLX5E_DCBX_PARAM_VER_OPER_HOST) + *mode = MLX5E_DCBX_PARAM_VER_OPER_AUTO; +} + +static void mlx5e_ets_init(struct mlx5e_priv *priv) +{ + struct ieee_ets ets; + int err; + int i; + + if (!MLX5_CAP_GEN(priv->mdev, ets)) + return; + + memset(&ets, 0, sizeof(ets)); + ets.ets_cap = mlx5_max_tc(priv->mdev) + 1; + for (i = 0; i < ets.ets_cap; i++) { + ets.tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC; + ets.tc_tsa[i] = IEEE_8021QAZ_TSA_VENDOR; + ets.prio_tc[i] = i; + } + + if (ets.ets_cap > 1) { + /* tclass[prio=0]=1, tclass[prio=1]=0, tclass[prio=i]=i (for i>1) */ + ets.prio_tc[0] = 1; + ets.prio_tc[1] = 0; + } + + err = mlx5e_dcbnl_ieee_setets_core(priv, &ets); + if (err) + netdev_err(priv->netdev, + "%s, Failed to init ETS: %d\n", __func__, err); +} + +enum { + INIT, + DELETE, +}; + +static void mlx5e_dcbnl_dscp_app(struct mlx5e_priv *priv, int action) +{ + struct dcb_app temp; + int i; + + if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager)) + return; + + if (!MLX5_DSCP_SUPPORTED(priv->mdev)) + return; + + /* No SEL_DSCP entry in non DSCP state */ + if (priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_DSCP) + return; + + temp.selector = IEEE_8021QAZ_APP_SEL_DSCP; + for (i = 0; i < MLX5E_MAX_DSCP; i++) { + temp.protocol = i; + temp.priority = priv->dcbx_dp.dscp2prio[i]; + if (action == INIT) + dcb_ieee_setapp(priv->netdev, &temp); + else + dcb_ieee_delapp(priv->netdev, &temp); + } + + priv->dcbx.dscp_app_cnt = (action == INIT) ? 
MLX5E_MAX_DSCP : 0; +} + +void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv) +{ + mlx5e_dcbnl_dscp_app(priv, INIT); +} + +void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv) +{ + mlx5e_dcbnl_dscp_app(priv, DELETE); +} + +static void mlx5e_params_calc_trust_tx_min_inline_mode(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + u8 trust_state) +{ + mlx5_query_min_inline(mdev, ¶ms->tx_min_inline_mode); + if (trust_state == MLX5_QPTS_TRUST_DSCP && + params->tx_min_inline_mode == MLX5_INLINE_MODE_L2) + params->tx_min_inline_mode = MLX5_INLINE_MODE_IP; +} + +static int mlx5e_update_trust_state_hw(struct mlx5e_priv *priv, void *context) +{ + u8 *trust_state = context; + int err; + + err = mlx5_set_trust_state(priv->mdev, *trust_state); + if (err) + return err; + WRITE_ONCE(priv->dcbx_dp.trust_state, *trust_state); + + return 0; +} + +static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state) +{ + struct tc_mqprio_qopt_offload mqprio = {.qopt.num_tc = MLX5E_MAX_NUM_TC}; + struct mlx5e_params new_params; + bool reset = true; + int err; + + mutex_lock(&priv->state_lock); + mqprio.mode = priv->channels.params.mqprio.mode; + if (mqprio.mode != TC_MQPRIO_MODE_DCB) { + netdev_err(priv->netdev, "Can't change trust state while in channel mode.\n"); + err = -EINVAL; + goto unlock; + } + + new_params = priv->channels.params; + mlx5e_params_calc_trust_tx_min_inline_mode(priv->mdev, &new_params, + trust_state); + + /* Skip if tx_min_inline is the same */ + if (new_params.tx_min_inline_mode == priv->channels.params.tx_min_inline_mode) + reset = false; + + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_update_trust_state_hw, + &trust_state, reset); +unlock: + mutex_unlock(&priv->state_lock); + if (err) + return err; + + /* In DSCP trust state, we need 8 send queues per channel */ + if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP) { + mutex_lock(&priv->state_lock); + mlx5e_setup_tc_mqprio(priv, &mqprio); + mutex_unlock(&priv->state_lock); + } else if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_PCP) { + mutex_lock(&priv->state_lock); + mqprio.qopt.num_tc = priv->pcp_tc_num; + mlx5e_setup_tc_mqprio(priv, &mqprio); + mutex_unlock(&priv->state_lock); + } + + return 0; +} + +static int mlx5e_set_dscp2prio(struct mlx5e_priv *priv, u8 dscp, u8 prio) +{ + int err; + + err = mlx5_set_dscp2prio(priv->mdev, dscp, prio); + if (err) + return err; + + priv->dcbx_dp.dscp2prio[dscp] = prio; + return err; +} + +static int mlx5e_trust_initialize(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + u8 trust_state; + struct tc_mqprio_qopt_offload mqprio = {.qopt.num_tc = MLX5E_MAX_NUM_TC}; + const bool take_rtnl = priv->netdev->reg_state == NETREG_REGISTERED; + + if (!MLX5_DSCP_SUPPORTED(mdev)) { + WRITE_ONCE(priv->dcbx_dp.trust_state, MLX5_QPTS_TRUST_PCP); + return 0; + } + + err = mlx5_query_trust_state(priv->mdev, &trust_state); + if (err) + return err; + WRITE_ONCE(priv->dcbx_dp.trust_state, trust_state); + + if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_PCP && priv->dcbx.dscp_app_cnt) { + /* + * Align the driver state with the register state. + * Temporary state change is required to enable the app list reset. 
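+		 * mlx5e_dcbnl_delete_app() only walks the DSCP app list while the trust state is DSCP.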
+ */ + priv->dcbx_dp.trust_state = MLX5_QPTS_TRUST_DSCP; + mlx5e_dcbnl_delete_app(priv); + priv->dcbx_dp.trust_state = MLX5_QPTS_TRUST_PCP; + } + + mlx5e_params_calc_trust_tx_min_inline_mode(priv->mdev, &priv->channels.params, + priv->dcbx_dp.trust_state); + if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP) { + if (take_rtnl) + rtnl_lock(); + mlx5e_setup_tc_mqprio(priv, &mqprio); + if (take_rtnl) + rtnl_unlock(); + } + + err = mlx5_query_dscp2prio(priv->mdev, priv->dcbx_dp.dscp2prio); + if (err) + return err; + + return 0; +} + +#define MLX5E_BUFFER_CELL_SHIFT 7 + +static u16 mlx5e_query_port_buffers_cell_size(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + if (mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), + MLX5_REG_SBCAM, 0, 0)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + return MLX5_GET(sbcam_reg, out, cap_cell_size); +} + +void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) +{ + struct mlx5e_dcbx *dcbx = &priv->dcbx; + + mlx5e_trust_initialize(priv); + + if (!MLX5_CAP_GEN(priv->mdev, qos)) + return; + + if (MLX5_CAP_GEN(priv->mdev, dcbx)) + mlx5e_dcbnl_query_dcbx_mode(priv, &dcbx->mode); + + priv->dcbx.cap = DCB_CAP_DCBX_VER_CEE | + DCB_CAP_DCBX_VER_IEEE; + if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) + priv->dcbx.cap |= DCB_CAP_DCBX_HOST; + + priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); + priv->dcbx.manual_buffer = false; + priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; + + mlx5e_ets_init(priv); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_debugfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_debugfs.c new file mode 100644 index 0000000..c75ee9c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_debugfs.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "en.h" +#include "en/rx_res.h" +#include "en/rss.h" + +/* For non-default namespaces, add suffix in format "@" */ +/* PCI id format: "%04x:%02x:%02x.%d" pci_domain bus_num pci_slot pci_func */ +#define PCI_ID_LEN 16 +#define MLX5_MAX_DEBUGFS_ROOT_NAME_LEN (IFNAMSIZ + 1 + PCI_ID_LEN) +#define MLX5_MAX_DEBUGFS_NAME_LEN 16 + +static void mlx5e_create_channel_debugfs(struct mlx5e_priv *priv, + int channel_num) +{ + int i; + char name[MLX5_MAX_DEBUGFS_NAME_LEN]; + struct dentry *channel_root; + struct mlx5e_channel *channel; + u8 num_tc = mlx5e_get_dcb_num_tc(&priv->channels.params); + + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "channel-%d", channel_num); + channel_root = debugfs_create_dir(name, priv->dfs_root); + if (!channel_root) { + netdev_err(priv->netdev, + "Failed to create channel debugfs for %s\n", + priv->netdev->name); + return; + } + priv->channels.c[channel_num]->dfs_root = channel_root; + channel = priv->channels.c[channel_num]; + + for (i = 0; i < num_tc; i++) { + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "sqn-%d", i); + debugfs_create_u32(name, S_IRUSR, channel_root, + &channel->sq[i].sqn); + + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "sq-cqn-%d", i); + debugfs_create_u32(name, S_IRUSR, channel_root, + &channel->sq[i].cq.mcq.cqn); + } + + debugfs_create_u32("rqn", S_IRUSR, channel_root, + &channel->rq.rqn); + + debugfs_create_u32("rq-cqn", S_IRUSR, channel_root, + &channel->rq.cq.mcq.cqn); +} + +struct rx_res_debugfs { + struct mlx5e_rx_res *rx_res; + int i; +}; + +static int get_tir_dir(void *data, u64 *val) +{ + struct rx_res_debugfs *rx_res_dbg = (struct rx_res_debugfs *)data; + + *val = mlx5e_rx_res_get_tirn_direct(rx_res_dbg->rx_res, rx_res_dbg->i); + return 0; +} + +static int get_tir_indir(void *data, u64 *val) +{ + struct rx_res_debugfs *rx_res_dbg = (struct rx_res_debugfs *)data; + + *val = mlx5e_rx_res_get_tirn_rss(rx_res_dbg->rx_res, rx_res_dbg->i); + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_dir, get_tir_dir, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_indir, get_tir_indir, NULL, "%llu\n"); + +void mlx5e_create_debugfs(struct mlx5e_priv *priv) +{ + int i; + char ns_root_name[MLX5_MAX_DEBUGFS_ROOT_NAME_LEN]; + char name[MLX5_MAX_DEBUGFS_NAME_LEN]; + char *root_name; + u8 num_tc = mlx5e_get_dcb_num_tc(&priv->channels.params); + + struct net_device *dev = priv->netdev; + struct net *net = dev_net(dev); + + if (net_eq(net, &init_net)) { + root_name = dev->name; + } else { + snprintf(ns_root_name, MLX5_MAX_DEBUGFS_ROOT_NAME_LEN, + "%s@%s", dev->name, dev_name(priv->mdev->device)); + root_name = ns_root_name; + } + + priv->dfs_root = debugfs_create_dir(root_name, NULL); + if (!priv->dfs_root) { + netdev_err(priv->netdev, "Failed to init debugfs files for %s\n", + root_name); + return; + } + + debugfs_create_u8("num_tc", S_IRUSR, priv->dfs_root, + &num_tc); + + for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++) { + int tc; + + for (tc = 0; tc < num_tc; tc++) { + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "tisn-%d_%d", i, tc); + debugfs_create_u32(name, S_IRUSR, priv->dfs_root, + &priv->tisn[i][tc]); + } + } + + for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) { + struct rx_res_debugfs *rx_res_dbg = kvzalloc(sizeof(*rx_res_dbg), GFP_KERNEL); + + rx_res_dbg->i = i; + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "indir-tirn-%d", i); + debugfs_create_file_unsafe(name, 0400, priv->dfs_root, rx_res_dbg, &fops_indir); + } + + for (i = 0; i < priv->max_nch; i++) { + struct rx_res_debugfs *rx_res_dbg = kvzalloc(sizeof(*rx_res_dbg), 
GFP_KERNEL); + + rx_res_dbg->i = i; + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "dir-tirn-%d", i); + debugfs_create_file_unsafe(name, 0400, priv->dfs_root, rx_res_dbg, &fops_dir); + } + + for (i = 0; i < priv->channels.num; i++) + mlx5e_create_channel_debugfs(priv, i); +} + +void mlx5e_debugs_free_recursive_private_data(struct mlx5e_priv *priv) +{ + int i; + struct dentry *dent; + char name[MLX5_MAX_DEBUGFS_NAME_LEN]; + + for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) { + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "indir-tirn-%d", i); + + dent = debugfs_lookup(name, priv->dfs_root); + if (dent && dent->d_inode && dent->d_inode->i_private) + kvfree(dent->d_inode->i_private); + } + + for (i = 0; i < priv->max_nch; i++) { + snprintf(name, MLX5_MAX_DEBUGFS_NAME_LEN, "dir-tirn-%d", i); + + dent = debugfs_lookup(name, priv->dfs_root); + if (dent && dent->d_inode && dent->d_inode->i_private) + kvfree(dent->d_inode->i_private); + } +} + +void mlx5e_destroy_debugfs(struct mlx5e_priv *priv) +{ + mlx5e_debugs_free_recursive_private_data(priv); + debugfs_remove_recursive(priv->dfs_root); + priv->dfs_root = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_diag.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_diag.c new file mode 100644 index 0000000..d5d4f08 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_diag.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "en.h" +#include "lib/eq.h" + +#define MLX5_DRV_VER_SZ 64 +#define MLX5_DEV_NAME_SZ 64 + +#define DIAG_BLK_SZ(data_size) (sizeof(struct mlx5_diag_blk) + data_size) +#define DIAG_GET_NEXT_BLK(dump_hdr) \ + ((struct mlx5_diag_blk *)(dump_hdr->dump + dump_hdr->total_length)) + +static int mlx5e_diag_fill_device_name(struct mlx5e_priv *priv, void *buff) +{ + struct mlx5_core_dev *mdev = priv->mdev; + size_t pci_name_sz = strlen(pci_name(mdev->pdev)); + + memset(buff, 0, MLX5_DEV_NAME_SZ); + strncpy(buff, pci_name(mdev->pdev), MLX5_DEV_NAME_SZ); + if (pci_name_sz >= MLX5_DEV_NAME_SZ - 2) + goto out; + + /* there is at least 2 bytes left */ + buff += pci_name_sz; + strncpy(buff, ":", 1); + buff += 1; + + strncpy(buff, priv->netdev->name, MLX5_DEV_NAME_SZ - pci_name_sz - 1); +out: + return MLX5_DEV_NAME_SZ; +} + +static int mlx5e_diag_fill_driver_version(void *buff) +{ + memset(buff, 0, MLX5_DRV_VER_SZ); + return MLX5_DRV_VER_SZ; +} + +static int dump_rq_info(struct mlx5e_rq *rq, void *buffer) +{ + struct mlx5_diag_wq *rqd = (struct mlx5_diag_wq *)buffer; + + rqd->wq_type = MLX5_DIAG_RQ; + rqd->wqn = rq->rqn; + rqd->ci = 0; + rqd->pi = rq->wqe.wq.cur_sz; + rqd->wqe_stride = rq->wqe.wq.fbc.log_stride; + rqd->size = rq->wqe.wq.fbc.sz_m1 + 1; + rqd->wqe_num = ((rq->wqe.wq.fbc.sz_m1 + 1) << rq->wqe.wq.fbc.log_stride); + rqd->group_id = rq->ix; + + return sizeof(*rqd); +} + +static int dump_sq_info(struct mlx5e_txqsq *sq, void *buffer) +{ + struct mlx5_diag_wq *sqd = (struct mlx5_diag_wq *)buffer; + + sqd->wq_type = MLX5_DIAG_SQ; + sqd->wqn = sq->sqn; + sqd->ci = sq->cc; + sqd->pi = sq->pc; + sqd->wqe_stride = sq->wq.fbc.log_stride; + sqd->size = sq->wq.fbc.sz_m1 + 1; + sqd->wqe_num = ((sq->wq.fbc.sz_m1 + 1) << sq->wq.fbc.log_stride); + sqd->group_id = sq->ch_ix; + + return sizeof(*sqd); +} + +static int dump_cq_info(struct mlx5e_cq *cq, void *buffer, int ix) +{ + struct mlx5_diag_wq *cqd = (struct mlx5_diag_wq *)buffer; + struct mlx5_cqwq *wq = &cq->wq; + + cqd->wq_type = MLX5_DIAG_CQ; + cqd->wqn = cq->mcq.cqn; + cqd->ci = wq->cc & wq->fbc.sz_m1; + cqd->pi = 0; + cqd->wqe_stride = wq->fbc.log_stride; + cqd->size = wq->fbc.sz_m1 + 1; + cqd->wqe_num = cqd->size; + cqd->group_id = ix; + + return sizeof(*cqd); +} + +static int dump_eq_info(struct mlx5_eq_comp *eq, void *buffer) +{ + struct mlx5_diag_eq *eqd = (struct mlx5_diag_eq *)buffer; + + eqd->type = MLX5_DIAG_EQ; + eqd->ci = eq->core.cons_index; + eqd->irqn = eq->core.irqn; + eqd->eqn = eq->core.eqn; + eqd->nent = eq_get_size(&eq->core); + eqd->mask = 0; + + return sizeof(*eqd); +} + +static void dump_channel_info(struct mlx5e_channel *c, + struct mlx5_diag_dump *dump_hdr) +{ + struct mlx5_diag_blk *dump_blk; + struct mlx5_eq_comp eqc; + int i; + + for (i = 0; i < c->num_tc; i++) { + /* Dump SQ */ + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_SQ; + dump_blk->length = dump_sq_info(&c->sq[i], &dump_blk->data); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; + + /* Dump SQ CQ */ + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_CQ; + dump_blk->length = dump_cq_info(&c->sq[i].cq, &dump_blk->data, + c->sq[i].ch_ix); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; + } + + /* Dump RQ */ + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_RQ; + dump_blk->length = dump_rq_info(&c->rq, &dump_blk->data); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; + + /* Dump RQ CQ 
*/ + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_CQ; + dump_blk->length = dump_cq_info(&c->rq.cq, &dump_blk->data, c->rq.ix); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; + + /* Dump EQ */ + mlx5_vector2eq(c->priv->mdev, c->ix, &eqc); + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_EQ; + dump_blk->length = dump_eq_info(&eqc, &dump_blk->data); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; +} + +static void dump_channels_info(struct mlx5e_priv *priv, + struct mlx5_diag_dump *dump_hdr) +{ + u32 nch = priv->channels.num; + int i; + + for (i = 0; i < nch; i++) + dump_channel_info(priv->channels.c[i], dump_hdr); +} + +int mlx5e_set_dump(struct net_device *netdev, struct ethtool_dump *dump) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + priv->channels.params.dump.flag = dump->flag; + return 0; +} + +int mlx5e_get_dump_flag(struct net_device *netdev, struct ethtool_dump *dump) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + __u32 extra_len = 0; + + dump->version = MLX5_DIAG_DUMP_VERSION; + dump->flag = priv->channels.params.dump.flag; + + if (dump->flag & MLX5_DIAG_FLAG_MST) { + u32 mst_size = mlx5_mst_capture(priv->mdev); + + if (mst_size <= 0) { + dump->flag &= ~MLX5_DIAG_FLAG_MST; + netdev_warn(priv->netdev, + "Failed to get mst dump, err (%d)\n", + mst_size); + mst_size = 0; + } + priv->channels.params.dump.mst_size = mst_size; + extra_len += mst_size ? DIAG_BLK_SZ(mst_size) : 0; + } + + mutex_lock(&priv->state_lock); + if (dump->flag & MLX5_DIAG_FLAG_CHANNELS && + test_bit(MLX5E_STATE_OPENED, &priv->state)) { + u32 nch = priv->channels.num; + u32 ntc = priv->channels.params.mqprio.num_tc; + + extra_len += + nch * ntc * DIAG_BLK_SZ(sizeof(struct mlx5_diag_wq)) + /* SQs */ + nch * ntc * DIAG_BLK_SZ(sizeof(struct mlx5_diag_wq)) + /* SQs CQs */ + nch * DIAG_BLK_SZ(sizeof(struct mlx5_diag_wq)) + /* RQs */ + nch * DIAG_BLK_SZ(sizeof(struct mlx5_diag_wq)) + /* RQs CQs */ + nch * DIAG_BLK_SZ(sizeof(struct mlx5_diag_eq)); /* EQs */ + } + mutex_unlock(&priv->state_lock); + + dump->len = sizeof(struct mlx5_diag_dump) + + DIAG_BLK_SZ(MLX5_DRV_VER_SZ) + + DIAG_BLK_SZ(MLX5_DEV_NAME_SZ) + + extra_len; + return 0; +} + +int mlx5e_get_dump_data(struct net_device *netdev, struct ethtool_dump *dump, + void *buffer) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_diag_dump *dump_hdr = buffer; + struct mlx5_diag_blk *dump_blk; + struct mlx5_mcion_reg mcion = {}; + int module_num; + int err; + + err = mlx5_query_module_num(priv->mdev, &module_num); + + if (err) + return err; + + mcion.module = module_num; + dump_hdr->version = MLX5_DIAG_DUMP_VERSION; + dump_hdr->flag = 0; + dump_hdr->num_blocks = 0; + dump_hdr->total_length = 0; + mlx5_icmd_access_register(priv->mdev, + MLX5_ICMD_MCION, + MLX5_ICMD_QUERY, + &mcion, + sizeof(mcion) / 4); + dump_hdr->module_no = mcion.module; + dump_hdr->module_status = mcion.module_status; + + /* Dump driver version */ + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_DRV_VERSION; + dump_blk->length = mlx5e_diag_fill_driver_version(&dump_blk->data); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; + + /* Dump device name */ + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_DEVICE_NAME; + dump_blk->length = mlx5e_diag_fill_device_name(priv, &dump_blk->data); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; + + /* 
Dump channels info */ + mutex_lock(&priv->state_lock); + if (priv->channels.params.dump.flag & MLX5_DIAG_FLAG_CHANNELS && + test_bit(MLX5E_STATE_OPENED, &priv->state)) + dump_channels_info(priv, dump_hdr); + mutex_unlock(&priv->state_lock); + + if (priv->channels.params.dump.flag & MLX5_DIAG_FLAG_MST) { + /* Dump mst buffer */ + dump_blk = DIAG_GET_NEXT_BLK(dump_hdr); + dump_blk->type = MLX5_DIAG_MST; + dump_blk->length = mlx5_mst_dump(priv->mdev, &dump_blk->data, + priv->channels.params.dump.mst_size); + dump_hdr->total_length += DIAG_BLK_SZ(dump_blk->length); + dump_hdr->num_blocks++; + dump_hdr->flag |= MLX5_DIAG_FLAG_MST; + } + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c new file mode 100644 index 0000000..df123dd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "en.h" + +static void +mlx5e_complete_dim_work(struct dim *dim, struct dim_cq_moder moder, + struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq) +{ + mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts); + dim->state = DIM_START_MEASURE; +} + +void mlx5e_rx_dim_work(struct work_struct *work) +{ + struct dim *dim = container_of(work, struct dim, work); + struct mlx5e_dim *dim_obj = container_of(dim, struct mlx5e_dim, dim); + struct mlx5e_rq *rq = container_of(dim_obj, struct mlx5e_rq, dim_obj); + struct dim_cq_moder cur_moder = + net_dim_get_rx_moderation(dim->mode, dim->profile_ix); + + mlx5e_complete_dim_work(dim, cur_moder, rq->mdev, &rq->cq.mcq); +} + +void mlx5e_tx_dim_work(struct work_struct *work) +{ + struct dim *dim = container_of(work, struct dim, work); + struct mlx5e_dim *dim_obj = container_of(dim, struct mlx5e_dim, dim); + struct mlx5e_txqsq *sq = container_of(dim_obj, struct mlx5e_txqsq, dim_obj); + struct dim_cq_moder cur_moder = + net_dim_get_tx_moderation(dim->mode, dim->profile_ix); + + mlx5e_complete_dim_work(dim, cur_moder, sq->cq.mdev, &sq->cq.mcq); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.c new file mode 100644 index 0000000..966f676 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.c @@ -0,0 +1,1259 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include "en.h" +#include "en_ecn.h" + +ssize_t mlx5e_show_ecn_enable(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_enable_ctx *enable_attr = container_of(attr, + struct mlx5e_ecn_enable_ctx , + enable); + int is_enable; + int err; + + err = mlx5_query_port_cong_status(enable_attr->mdev, + enable_attr->cong_protocol, + enable_attr->priority, &is_enable); + if (!err) + return sprintf(buf, "%d\n", is_enable); + return 0; +} + +ssize_t mlx5e_store_ecn_enable(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_enable_ctx *enable_attr = container_of(attr, + struct mlx5e_ecn_enable_ctx , + enable); + int is_qcn_enable; + int enable; + int err; + + err = sscanf(buf, "%d", &enable); + if (err != 1) + return -EINVAL; + + err = mlx5_query_port_cong_status(enable_attr->mdev, + enable_attr->cong_protocol, + enable_attr->priority, + &is_qcn_enable); + if (err) + return err; + + enable &= 1; + if (enable == is_qcn_enable) + goto success; + + err = mlx5_modify_port_cong_status(enable_attr->mdev, + enable_attr->cong_protocol, + enable_attr->priority, enable); + if (err) + return err; + +success: + return count; +} + +ssize_t mlx5e_show_clamp_tgt_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + clamp_tgt_rate); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u8 clamp_tgt_rate = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + clamp_tgt_rate = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + clamp_tgt_rate); + return sprintf(buf, "%d\n", clamp_tgt_rate); + } + return err; +} + +ssize_t mlx5e_store_clamp_tgt_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + clamp_tgt_rate); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int clamp_tgt_rate; + void *field_select; + int err; + + err = sscanf(buf, "%d", &clamp_tgt_rate); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_CLAMP_TGT_RATE); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + clamp_tgt_rate, clamp_tgt_rate); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_clamp_tgt_rate_ati(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + clamp_tgt_rate_ati); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 clamp_tgt_rate_ati = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + 
congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + clamp_tgt_rate_ati = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + clamp_tgt_rate_after_time_inc); + return sprintf(buf, "%d\n", clamp_tgt_rate_ati); + } + return err; +} + +ssize_t mlx5e_store_clamp_tgt_rate_ati(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + clamp_tgt_rate_ati); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int clamp_tgt_rate_ati; + void *field_select; + int err; + + err = sscanf(buf, "%d", &clamp_tgt_rate_ati); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_CLAMP_TGT_RATE_AFTER_TIME_INC); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + clamp_tgt_rate_after_time_inc, clamp_tgt_rate_ati); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_time_reset(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_time_reset); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_time_reset = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_time_reset = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + rpg_time_reset); + return sprintf(buf, "%d\n", rpg_time_reset); + } + return err; +} + +ssize_t mlx5e_store_rpg_time_reset(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_time_reset); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_time_reset; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_time_reset); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_TIME_RESET); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_time_reset, rpg_time_reset); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_byte_reset(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_byte_reset); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void 
*congestion_parameters; + u32 rpg_byte_reset = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_byte_reset = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + rpg_byte_reset); + return sprintf(buf, "%d\n", rpg_byte_reset); + } + return err; +} + +ssize_t mlx5e_store_rpg_byte_reset(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_byte_reset); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_byte_reset; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_byte_reset); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_BYTE_RESET); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_byte_reset, rpg_byte_reset); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_threshold(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_threshold); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_threshold = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_threshold = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, rpg_threshold); + return sprintf(buf, "%d\n", rpg_threshold); + } + return err; +} + +ssize_t mlx5e_store_rpg_threshold(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_threshold); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_threshold; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_threshold); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_THRESHOLD); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_threshold, rpg_threshold); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_max_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct 
mlx5e_ecn_rp_attributes, + rpg_max_rate); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_max_rate = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_max_rate = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, rpg_max_rate); + return sprintf(buf, "%d\n", rpg_max_rate); + } + return err; +} + +ssize_t mlx5e_store_rpg_max_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_max_rate); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_max_rate; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_max_rate); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_MAX_RATE); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_max_rate, rpg_max_rate); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_ai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_ai_rate); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_ai_rate = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_ai_rate = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, rpg_ai_rate); + return sprintf(buf, "%d\n", rpg_ai_rate); + } + return err; +} + +ssize_t mlx5e_store_rpg_ai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_ai_rate); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_ai_rate; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_ai_rate); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_AI_RATE); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_ai_rate, rpg_ai_rate); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_hai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes 
*rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_hai_rate); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_hai_rate = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_hai_rate = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, rpg_hai_rate); + return sprintf(buf, "%d\n", rpg_hai_rate); + } + return err; +} + +ssize_t mlx5e_store_rpg_hai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_hai_rate); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_hai_rate; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_hai_rate); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_HAI_RATE); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_hai_rate, rpg_hai_rate); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_gd(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_gd); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_gd = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, congestion_parameters); + rpg_gd = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, rpg_gd); + return sprintf(buf, "%d\n", rpg_gd); + } + return err; +} + +ssize_t mlx5e_store_rpg_gd(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_gd); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_gd; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_gd); + + printk("rpg_gd value: %d\n", rpg_gd); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_GD); //TODO??? 
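+	/* The bit written into field_select_r_roce_rp selects which
+	 * congestion parameter the MODIFY_CONG_PARAMS command updates;
+	 * the indices (MLX5E_RP_RPG_GD == 16 here) come from the defines
+	 * in en_ecn.h, so only rpg_gd is expected to change while the
+	 * remaining parameters keep their current values.
+	 */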
+ + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_gd, rpg_gd); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + + return count; +} + +ssize_t mlx5e_show_rpg_min_dec_fac(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_min_dec_fac); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_min_dec_fac = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_min_dec_fac = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + rpg_min_dec_fac); + return sprintf(buf, "%d\n", rpg_min_dec_fac); + } + return err; +} + +ssize_t mlx5e_store_rpg_min_dec_fac(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_min_dec_fac); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_min_dec_fac; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_min_dec_fac); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_MIN_DEC_FAC); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_min_dec_fac, rpg_min_dec_fac); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rpg_min_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_min_rate); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rpg_min_rate = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rpg_min_rate = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, rpg_min_rate); + return sprintf(buf, "%d\n", rpg_min_rate); + } + return err; +} + +ssize_t mlx5e_store_rpg_min_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rpg_min_rate); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rpg_min_rate; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rpg_min_rate); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + 
MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RPG_MIN_RATE); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rpg_min_rate, rpg_min_rate); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rate2set_fcnp(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rate2set_fcnp); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rate_to_set_on_first_cnp = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rate_to_set_on_first_cnp = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + rate_to_set_on_first_cnp); + return sprintf(buf, "%d\n", rate_to_set_on_first_cnp); + } + return err; +} + +ssize_t mlx5e_store_rate2set_fcnp(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rate2set_fcnp); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rate_to_set_on_first_cnp; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rate_to_set_on_first_cnp); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RATE_TO_SET_ON_FIRST_CNP); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rate_to_set_on_first_cnp, rate_to_set_on_first_cnp); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_dce_tcp_g(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + dce_tcp_g); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 dce_tcp_g = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + dce_tcp_g = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, dce_tcp_g); + return sprintf(buf, "%d\n", dce_tcp_g); + } + return err; +} + +ssize_t mlx5e_store_dce_tcp_g(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + dce_tcp_g); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int dce_tcp_g; + void *field_select; + int err; + + err = sscanf(buf, "%d", &dce_tcp_g); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + 
MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_DCE_TCP_G); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + dce_tcp_g, dce_tcp_g); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_dce_tcp_rtt(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + dce_tcp_rtt); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 dce_tcp_rtt = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + dce_tcp_rtt = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, dce_tcp_rtt); + return sprintf(buf, "%d\n", dce_tcp_rtt); + } + return err; +} + +ssize_t mlx5e_store_dce_tcp_rtt(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + dce_tcp_rtt); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int dce_tcp_rtt; + void *field_select; + int err; + + err = sscanf(buf, "%d", &dce_tcp_rtt); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_DCE_TCP_RTT); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + dce_tcp_rtt, dce_tcp_rtt); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_rreduce_mperiod(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rreduce_mperiod); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 rate_reduce_mperiod = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + rate_reduce_mperiod = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + rate_reduce_monitor_period); + return sprintf(buf, "%d\n", rate_reduce_mperiod); + } + return err; +} + +ssize_t mlx5e_store_rreduce_mperiod(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + rreduce_mperiod); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int rate_reduce_monitor_period; + void *field_select; + int err; + + err = sscanf(buf, "%d", &rate_reduce_monitor_period); + + memset(in, 
0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_RATE_REDUCE_MONITOR_PERIOD); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + rate_reduce_monitor_period, rate_reduce_monitor_period); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_initial_alpha_value(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + initial_alpha_value); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 initial_alpha_value = -1; + int err; + + err = mlx5_query_port_cong_params(rp_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_RP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + initial_alpha_value = MLX5_GET(cong_control_r_roce_ecn_rp, + congestion_parameters, + initial_alpha_value); + return sprintf(buf, "%d\n", initial_alpha_value); + } + return err; +} + +ssize_t mlx5e_store_initial_alpha_value(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_rp_attributes *rp_attr = container_of(attr, + struct mlx5e_ecn_rp_attributes, + initial_alpha_value); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int initial_alpha_value; + void *field_select; + int err; + + err = sscanf(buf, "%d", &initial_alpha_value); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_RP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field_select, field_select_r_roce_rp, + 1 << MLX5E_RP_INITIAL_ALPHA_VALUE); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_rp, congestion_parameters, + initial_alpha_value, initial_alpha_value); + + err = mlx5_modify_port_cong_params(rp_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_min_time_between_cnps(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_np_attributes *np_attr = container_of(attr, + struct mlx5e_ecn_np_attributes, + min_time_between_cnps); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 min_time_between_cnps = -1; + int err; + + err = mlx5_query_port_cong_params(np_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_NP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + min_time_between_cnps = MLX5_GET(cong_control_r_roce_ecn_np, + congestion_parameters, + min_time_between_cnps); + return sprintf(buf, "%d\n", min_time_between_cnps); + } + return err; +} + +ssize_t mlx5e_store_min_time_between_cnps(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_np_attributes *np_attr = 
container_of(attr, + struct mlx5e_ecn_np_attributes, + min_time_between_cnps); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int min_time_between_cnps; + void *field_select; + int err; + + err = sscanf(buf, "%d", &min_time_between_cnps); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_NP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_np, field_select, field_select_r_roce_np, + 1 << MLX5E_NP_MIN_TIME_BETWEEN_CNPS); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_np, congestion_parameters, + min_time_between_cnps, min_time_between_cnps); + + err = mlx5_modify_port_cong_params(np_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_cnp_dscp(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_np_attributes *np_attr = container_of(attr, + struct mlx5e_ecn_np_attributes, + cnp_dscp); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 cnp_dscp = -1; + int err; + + err = mlx5_query_port_cong_params(np_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_NP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + cnp_dscp = MLX5_GET(cong_control_r_roce_ecn_np, + congestion_parameters, cnp_dscp); + return sprintf(buf, "%d\n", cnp_dscp); + } + return err; +} + +ssize_t mlx5e_store_cnp_dscp(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_ecn_np_attributes *np_attr = container_of(attr, + struct mlx5e_ecn_np_attributes, + cnp_dscp); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + void *field_select; + int cnp_dscp; + int err; + + err = sscanf(buf, "%d", &cnp_dscp); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_NP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_np, field_select, field_select_r_roce_np, + 1 << MLX5E_NP_CNP_DSCP); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_np, congestion_parameters, + cnp_dscp, cnp_dscp); + + err = mlx5_modify_port_cong_params(np_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} + +ssize_t mlx5e_show_cnp_802p_prio(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct mlx5e_ecn_np_attributes *np_attr = container_of(attr, + struct mlx5e_ecn_np_attributes, + cnp_802p_prio); + u32 out[MLX5_ST_SZ_DW(query_cong_params_out)]; + void *congestion_parameters; + u32 cnp_802p_prio = -1; + int err; + + err = mlx5_query_port_cong_params(np_attr->mdev, + MLX5E_CON_PROTOCOL_R_ROCE_NP, + out, sizeof(out)); + if (!err) { + congestion_parameters = MLX5_ADDR_OF(query_cong_params_out, + out, + congestion_parameters); + cnp_802p_prio = MLX5_GET(cong_control_r_roce_ecn_np, + congestion_parameters, + cnp_802p_prio); + return sprintf(buf, "%d\n", cnp_802p_prio); + } + return err; +} + +ssize_t mlx5e_store_cnp_802p_prio(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, 
size_t count) +{ + struct mlx5e_ecn_np_attributes *np_attr = container_of(attr, + struct mlx5e_ecn_np_attributes, + cnp_802p_prio); + u32 in[MLX5_ST_SZ_DW(modify_cong_params_in)]; + void *congestion_parameters; + int cnp_802p_prio; + void *field_select; + int err; + + err = sscanf(buf, "%d", &cnp_802p_prio); + + memset(in, 0, sizeof(in)); + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + MLX5_SET(modify_cong_params_in, in, cong_protocol, + MLX5E_CON_PROTOCOL_R_ROCE_NP); + field_select = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_np, field_select, field_select_r_roce_np, + 1 << MLX5E_NP_CNP_802P_PRIO); + congestion_parameters = MLX5_ADDR_OF(modify_cong_params_in, in, + congestion_parameters); + MLX5_SET(cong_control_r_roce_ecn_np, congestion_parameters, + cnp_802p_prio, cnp_802p_prio); + + err = mlx5_modify_port_cong_params(np_attr->mdev, in, sizeof(in)); + + if (err) + return err; + return count; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.h new file mode 100644 index 0000000..6d1ecd2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ecn.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_EN_ECN_H__ +#define __MLX5_EN_ECN_H__ + +#define MLX5E_RP_CLAMP_TGT_RATE 1 +#define MLX5E_RP_CLAMP_TGT_RATE_AFTER_TIME_INC 2 +#define MLX5E_RP_RPG_TIME_RESET 3 +#define MLX5E_RP_RPG_BYTE_RESET 4 +#define MLX5E_RP_RPG_THRESHOLD 5 +#define MLX5E_RP_RPG_MAX_RATE 6 +#define MLX5E_RP_RPG_AI_RATE 7 +#define MLX5E_RP_RPG_HAI_RATE 8 +#define MLX5E_RP_MIN_DEC_FAC 9 +#define MLX5E_RP_RPG_MIN_RATE 10 +#define MLX5E_RP_RATE_TO_SET_ON_FIRST_CNP 11 +#define MLX5E_RP_DCE_TCP_G 12 +#define MLX5E_RP_DCE_TCP_RTT 13 +#define MLX5E_RP_RATE_REDUCE_MONITOR_PERIOD 14 +#define MLX5E_RP_INITIAL_ALPHA_VALUE 15 +#define MLX5E_RP_RPG_GD 16 +#define MLX5E_NP_MIN_TIME_BETWEEN_CNPS 2 +#define MLX5E_NP_CNP_DSCP 3 +#define MLX5E_NP_CNP_802P_PRIO 4 + +ssize_t mlx5e_show_ecn_enable(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_ecn_enable(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_clamp_tgt_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_clamp_tgt_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_clamp_tgt_rate_ati(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_clamp_tgt_rate_ati(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_time_reset(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_time_reset(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_byte_reset(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_byte_reset(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_threshold(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_threshold(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_max_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_max_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_ai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_ai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_hai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_hai_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_gd(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_gd(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_min_dec_fac(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_min_dec_fac(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rpg_min_rate(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rpg_min_rate(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rate2set_fcnp(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t 
mlx5e_store_rate2set_fcnp(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_dce_tcp_g(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_dce_tcp_g(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_dce_tcp_rtt(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_dce_tcp_rtt(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_rreduce_mperiod(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_rreduce_mperiod(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_initial_alpha_value(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_initial_alpha_value(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_min_time_between_cnps(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_min_time_between_cnps(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_cnp_dscp(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_cnp_dscp(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +ssize_t mlx5e_show_cnp_802p_prio(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +ssize_t mlx5e_store_cnp_802p_prio(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +#endif /* __MLX5_ECN_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c new file mode 100644 index 0000000..dee5b8e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -0,0 +1,2580 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "en.h" +#include "en/port.h" +#include "en/params.h" +#include "en/xsk/pool.h" +#include "en/ptp.h" +#include "lib/clock.h" + +void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, + struct ethtool_drvinfo *drvinfo) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + strlcpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, DRIVER_VERSION, + sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", + fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), + mdev->board_id); + strlcpy(drvinfo->bus_info, dev_name(mdev->device), + sizeof(drvinfo->bus_info)); +} + +static void mlx5e_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_drvinfo(priv, drvinfo); +} + +struct ptys2ethtool_config { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertised); +}; + +static +struct ptys2ethtool_config ptys2legacy_ethtool_table[MLX5E_LINK_MODES_NUMBER]; +static +struct ptys2ethtool_config ptys2ext_ethtool_table[MLX5E_EXT_LINK_MODES_NUMBER]; + +#define MLX5_BUILD_PTYS2ETHTOOL_CONFIG(reg_, table, ...) \ + ({ \ + struct ptys2ethtool_config *cfg; \ + const unsigned int modes[] = { __VA_ARGS__ }; \ + unsigned int i, bit, idx; \ + cfg = &ptys2##table##_ethtool_table[reg_]; \ + bitmap_zero(cfg->supported, \ + __ETHTOOL_LINK_MODE_MASK_NBITS); \ + bitmap_zero(cfg->advertised, \ + __ETHTOOL_LINK_MODE_MASK_NBITS); \ + for (i = 0 ; i < ARRAY_SIZE(modes) ; ++i) { \ + bit = modes[i] % 64; \ + idx = modes[i] / 64; \ + __set_bit(bit, &cfg->supported[idx]); \ + __set_bit(bit, &cfg->advertised[idx]); \ + } \ + }) + +void mlx5e_build_ptys2ethtool_map(void) +{ + memset(ptys2legacy_ethtool_table, 0, sizeof(ptys2legacy_ethtool_table)); + memset(ptys2ext_ethtool_table, 0, sizeof(ptys2ext_ethtool_table)); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_CX_SGMII, legacy, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_KX, legacy, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CX4, legacy, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KX4, legacy, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KR, legacy, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_20GBASE_KR2, legacy, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_CR4, legacy, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_KR4, legacy, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_56GBASE_R4, legacy, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CR, legacy, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_SR, legacy, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_ER, legacy, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_SR4, legacy, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_LR4, legacy, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_SR2, legacy, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_CR4, legacy, + 
ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_SR4, legacy, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_KR4, legacy, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_LR4, legacy, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_T, legacy, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_CR, legacy, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_KR, legacy, + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_SR, legacy, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_CR2, legacy, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_KR2, legacy, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_SGMII_100M, ext, + ETHTOOL_LINK_MODE_100baseT_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_X_SGMII, ext, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_5GBASE_R, ext, + ETHTOOL_LINK_MODE_5000baseT_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_XFI_XAUI_1, ext, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_10000baseR_FEC_BIT, + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_10000baseLR_Full_BIT, + ETHTOOL_LINK_MODE_10000baseER_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_XLAUI_4_XLPPI_4, ext, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GAUI_1_25GBASE_CR_KR, ext, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2, + ext, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR, ext, + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_CAUI_4_100GBASE_CR4_KR4, ext, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GAUI_2_100GBASE_CR2_KR2, ext, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_200GAUI_4_200GBASE_CR4_KR4, ext, + ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT, + ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT); + 
MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GAUI_1_100GBASE_CR_KR, ext, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_200GAUI_2_200GBASE_CR2_KR2, ext, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT); + MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_400GAUI_4_400GBASE_CR4_KR4, ext, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT); +} + +static void mlx5e_ethtool_get_speed_arr(struct mlx5_core_dev *mdev, + struct ptys2ethtool_config **arr, + u32 *size) +{ + bool ext = mlx5e_ptys_ext_supported(mdev); + + *arr = ext ? ptys2ext_ethtool_table : ptys2legacy_ethtool_table; + *size = ext ? ARRAY_SIZE(ptys2ext_ethtool_table) : + ARRAY_SIZE(ptys2legacy_ethtool_table); +} + +typedef int (*mlx5e_pflag_handler)(struct net_device *netdev, bool enable); + +struct pflag_desc { + char name[ETH_GSTRING_LEN]; + mlx5e_pflag_handler handler; +}; + +static const struct pflag_desc mlx5e_priv_flags[MLX5E_NUM_PFLAGS]; + +int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return mlx5e_stats_total_num(priv); + case ETH_SS_PRIV_FLAGS: + return MLX5E_NUM_PFLAGS; + case ETH_SS_TEST: + return mlx5e_self_test_num(priv); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_get_sset_count(priv, sset); +} + +void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, u32 stringset, u8 *data) +{ + int i; + + switch (stringset) { + case ETH_SS_PRIV_FLAGS: + for (i = 0; i < MLX5E_NUM_PFLAGS; i++) + strcpy(data + i * ETH_GSTRING_LEN, + mlx5e_priv_flags[i].name); + break; + + case ETH_SS_TEST: + mlx5e_self_test_fill_strings(priv, data); + break; + + case ETH_SS_STATS: + mlx5e_stats_fill_strings(priv, data); + break; + } +} + +static void mlx5e_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_strings(priv, stringset, data); +} + +void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv, + struct ethtool_stats *stats, u64 *data) +{ + int idx = 0; + + mutex_lock(&priv->state_lock); + mlx5e_stats_update(priv); + mutex_unlock(&priv->state_lock); + + mlx5e_stats_fill(priv, data, idx); +} + +static void mlx5e_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + u64 *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_ethtool_stats(priv, stats, data); +} + +void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param) +{ + if (priv->shared_rq) { + param->rx_max_pending = 0; + param->rx_pending = 0; + } else { + param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE; + param->rx_pending = 1 << priv->channels.params.log_rq_mtu_frames; + } + param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE; + param->tx_pending = 1 << priv->channels.params.log_sq_size; +} + +static void mlx5e_get_ringparam(struct net_device *dev, + struct 
ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_ringparam(priv, param); +} + +int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv, + struct ethtool_ringparam *param) +{ + struct mlx5e_params new_params; + u8 log_rq_size; + u8 log_sq_size; + int err = 0; + + if (param->rx_jumbo_pending) { + netdev_info(priv->netdev, "%s: rx_jumbo_pending not supported\n", + __func__); + return -EINVAL; + } + if (param->rx_mini_pending) { + netdev_info(priv->netdev, "%s: rx_mini_pending not supported\n", + __func__); + return -EINVAL; + } + + if (!priv->shared_rq && + param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) { + netdev_info(priv->netdev, "%s: rx_pending (%d) < min (%d)\n", + __func__, param->rx_pending, + 1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE); + return -EINVAL; + } + + if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) { + netdev_info(priv->netdev, "%s: tx_pending (%d) < min (%d)\n", + __func__, param->tx_pending, + 1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); + return -EINVAL; + } + + log_rq_size = order_base_2(param->rx_pending); + log_sq_size = order_base_2(param->tx_pending); + + if (log_rq_size == priv->channels.params.log_rq_mtu_frames && + log_sq_size == priv->channels.params.log_sq_size) + return 0; + + mutex_lock(&priv->state_lock); + + new_params = priv->channels.params; + new_params.log_rq_mtu_frames = log_rq_size; + new_params.log_sq_size = log_sq_size; + + err = mlx5e_validate_params(priv->mdev, &new_params); + if (err) + goto unlock; + + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + +unlock: + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_set_ringparam(priv, param); +} + +void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch) +{ + mutex_lock(&priv->state_lock); + + ch->max_combined = priv->max_nch; + ch->combined_count = priv->channels.params.num_channels; + if (priv->xsk.refcnt) { + /* The upper half are XSK queues. 
*/ + ch->max_combined *= 2; + ch->combined_count *= 2; + } + + mutex_unlock(&priv->state_lock); +} + +static void mlx5e_get_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_channels(priv, ch); +} + +int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, + struct ethtool_channels *ch) +{ + struct mlx5e_params *cur_params = &priv->channels.params; + int ncv = mlx5e_get_max_num_channels(priv->mdev); + unsigned int count = ch->combined_count; + struct mlx5e_params new_params; + bool arfs_enabled; + int rss_cnt; + bool opened; + int err = 0; + + if (!count) { + netdev_info(priv->netdev, "%s: combined_count=0 not supported\n", + __func__); + return -EINVAL; + } + + if (ch->rx_count || ch->tx_count) { + netdev_info(priv->netdev, "%s: separate rx/tx count not supported\n", + __func__); + return -EINVAL; + } + + if (count > ncv) { + netdev_info(priv->netdev, "%s: count (%d) > max (%d)\n", + __func__, count, ncv); + return -EINVAL; + } + + if (cur_params->num_channels == count) + return 0; + + mutex_lock(&priv->state_lock); + + /* Don't allow changing the number of channels if there is an active + * XSK, because the numeration of the XSK and regular RQs will change. + */ + if (priv->xsk.refcnt) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: AF_XDP is active, cannot change the number of channels\n", + __func__); + goto out; + } + + /* Don't allow changing the number of channels if HTB offload is active, + * because the numeration of the QoS SQs will change, while per-queue + * qdiscs are attached. + */ + if (priv->htb.maj_id) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the number of channels\n", + __func__); + goto out; + } + + if (!priv->shared_rq) { + /* Don't allow changing the number of channels if non-default RSS contexts exist, + * the kernel doesn't protect against set_channels operations that break them. + */ + rss_cnt = mlx5e_rx_res_rss_cnt(priv->rx_res) - 1; + if (rss_cnt) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: Non-default RSS contexts exist (%d), cannot change the number of channels\n", + __func__, rss_cnt); + goto out; + } + } + + /* Don't allow changing the number of channels if MQPRIO mode channel offload is active, + * because it defines a partition over the channels queues. 
+ */ + if (cur_params->mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + err = -EINVAL; + netdev_err(priv->netdev, "%s: MQPRIO mode channel offload is active, cannot change the number of channels\n", + __func__); + goto out; + } + + new_params = *cur_params; + new_params.num_channels = count; + + opened = test_bit(MLX5E_STATE_OPENED, &priv->state); + + arfs_enabled = opened && (priv->netdev->features & NETIF_F_NTUPLE); + if (arfs_enabled) + mlx5e_arfs_disable(priv); + + /* Switch to new channels, set new parameters and close old ones */ + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_num_channels_changed_ctx, NULL, true); + + if (arfs_enabled) { + int err2 = mlx5e_arfs_enable(priv); + + if (err2) + netdev_err(priv->netdev, "%s: mlx5e_arfs_enable failed: %d\n", + __func__, err2); + } + +out: + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_set_channels(priv, ch); +} + +int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal) +{ + struct dim_cq_moder *rx_moder, *tx_moder; + + if (!MLX5_CAP_GEN(priv->mdev, cq_moderation)) + return -EOPNOTSUPP; + + if (priv->shared_rq) { + coal->rx_coalesce_usecs = 0; + coal->rx_max_coalesced_frames = 0; + coal->use_adaptive_rx_coalesce = 0; + } else { + rx_moder = &priv->channels.params.rx_cq_moderation; + coal->rx_coalesce_usecs = rx_moder->usec; + coal->rx_max_coalesced_frames = rx_moder->pkts; + coal->use_adaptive_rx_coalesce = priv->channels.params.rx_dim_enabled; + } + + tx_moder = &priv->channels.params.tx_cq_moderation; + coal->tx_coalesce_usecs = tx_moder->usec; + coal->tx_max_coalesced_frames = tx_moder->pkts; + coal->use_adaptive_tx_coalesce = priv->channels.params.tx_dim_enabled; + + kernel_coal->use_cqe_mode_rx = + MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_BASED_MODER); + kernel_coal->use_cqe_mode_tx = + MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_CQE_BASED_MODER); + + return 0; +} + +static int mlx5e_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_coalesce(priv, coal, kernel_coal); +} + +#define MLX5E_MAX_COAL_TIME MLX5_MAX_CQ_PERIOD +#define MLX5E_MAX_COAL_FRAMES MLX5_MAX_CQ_COUNT + +static void +mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int tc; + int i; + + for (i = 0; i < priv->channels.num; ++i) { + struct mlx5e_channel *c = priv->channels.c[i]; + + for (tc = 0; tc < c->num_tc; tc++) { + mlx5_core_modify_cq_moderation(mdev, + &c->sq[tc].cq.mcq, + coal->tx_coalesce_usecs, + coal->tx_max_coalesced_frames); + } + } +} + +static void +mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int i; + + for (i = 0; i < priv->channels.num; ++i) { + struct mlx5e_channel *c = priv->channels.c[i]; + + if (priv->shared_rq) + continue; + + mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq, + coal->rx_coalesce_usecs, + coal->rx_max_coalesced_frames); + } +} + +/* convert a boolean value of cq_mode to mlx5 period mode + * true : MLX5_CQ_PERIOD_MODE_START_FROM_CQE + * false : MLX5_CQ_PERIOD_MODE_START_FROM_EQE + */ 
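+/* Illustrative example (device name is hypothetical): with a recent
+ * ethtool, CQE-based moderation can be toggled from user space, e.g.
+ *   ethtool -C eth1 cqe-mode-rx on cqe-mode-tx off
+ * which reaches this conversion via mlx5e_ethtool_set_coalesce() below.
+ */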
+static int cqe_mode_to_period_mode(bool val) +{ + return val ? MLX5_CQ_PERIOD_MODE_START_FROM_CQE : MLX5_CQ_PERIOD_MODE_START_FROM_EQE; +} + +int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct dim_cq_moder *rx_moder, *tx_moder; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; + bool reset_rx, reset_tx; + bool reset = true; + u8 cq_period_mode; + int err = 0; + + if (!MLX5_CAP_GEN(mdev, cq_moderation)) + return -EOPNOTSUPP; + + if (coal->tx_coalesce_usecs > MLX5E_MAX_COAL_TIME || + coal->rx_coalesce_usecs > MLX5E_MAX_COAL_TIME) { + netdev_info(priv->netdev, "%s: maximum coalesce time supported is %lu usecs\n", + __func__, MLX5E_MAX_COAL_TIME); + return -ERANGE; + } + + if (coal->tx_max_coalesced_frames > MLX5E_MAX_COAL_FRAMES || + coal->rx_max_coalesced_frames > MLX5E_MAX_COAL_FRAMES) { + netdev_info(priv->netdev, "%s: maximum coalesced frames supported is %lu\n", + __func__, MLX5E_MAX_COAL_FRAMES); + return -ERANGE; + } + + if ((kernel_coal->use_cqe_mode_rx || kernel_coal->use_cqe_mode_tx) && + !MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) { + NL_SET_ERR_MSG_MOD(extack, "cqe_mode_rx/tx is not supported on this device"); + return -EOPNOTSUPP; + } + + mutex_lock(&priv->state_lock); + new_params = priv->channels.params; + + rx_moder = &new_params.rx_cq_moderation; + if (priv->shared_rq) { + if (coal->rx_coalesce_usecs || + coal->rx_max_coalesced_frames || + coal->use_adaptive_rx_coalesce) { + err = -EOPNOTSUPP; + goto out; + } + } + + rx_moder->usec = coal->rx_coalesce_usecs; + rx_moder->pkts = coal->rx_max_coalesced_frames; + new_params.rx_dim_enabled = !!coal->use_adaptive_rx_coalesce; + + tx_moder = &new_params.tx_cq_moderation; + tx_moder->usec = coal->tx_coalesce_usecs; + tx_moder->pkts = coal->tx_max_coalesced_frames; + new_params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce; + + reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled; + reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled; + + cq_period_mode = cqe_mode_to_period_mode(kernel_coal->use_cqe_mode_rx); + if (cq_period_mode != rx_moder->cq_period_mode) { + mlx5e_set_rx_cq_mode_params(&new_params, cq_period_mode); + reset_rx = true; + } + + cq_period_mode = cqe_mode_to_period_mode(kernel_coal->use_cqe_mode_tx); + if (cq_period_mode != tx_moder->cq_period_mode) { + mlx5e_set_tx_cq_mode_params(&new_params, cq_period_mode); + reset_tx = true; + } + + if (reset_rx) { + u8 mode = MLX5E_GET_PFLAG(&new_params, + MLX5E_PFLAG_RX_CQE_BASED_MODER); + + mlx5e_reset_rx_moderation(&new_params, mode); + } + if (reset_tx) { + u8 mode = MLX5E_GET_PFLAG(&new_params, + MLX5E_PFLAG_TX_CQE_BASED_MODER); + + mlx5e_reset_tx_moderation(&new_params, mode); + } + + /* If DIM state hasn't changed, it's possible to modify interrupt + * moderation parameters on the fly, even if the channels are open. 
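+ * In that case the mlx5e_set_priv_channels_{rx,tx}_coalesce() calls below
+ * (made only for the non-adaptive direction) push the new values straight to
+ * the hardware CQs through mlx5_core_modify_cq_moderation(), and "reset" is
+ * cleared so that mlx5e_safe_switch_params() applies the new parameters
+ * without recreating the channels.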
+ */ + if (!reset_rx && !reset_tx && test_bit(MLX5E_STATE_OPENED, &priv->state)) { + if (!coal->use_adaptive_rx_coalesce) + mlx5e_set_priv_channels_rx_coalesce(priv, coal); + if (!coal->use_adaptive_tx_coalesce) + mlx5e_set_priv_channels_tx_coalesce(priv, coal); + reset = false; + } + + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, reset); +out: + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_set_coalesce(priv, coal, kernel_coal, extack); +} + +static void ptys2ethtool_supported_link(struct mlx5_core_dev *mdev, + unsigned long *supported_modes, + u32 eth_proto_cap) +{ + unsigned long proto_cap = eth_proto_cap; + struct ptys2ethtool_config *table; + u32 max_size; + int proto; + + mlx5e_ethtool_get_speed_arr(mdev, &table, &max_size); + for_each_set_bit(proto, &proto_cap, max_size) + bitmap_or(supported_modes, supported_modes, + table[proto].supported, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static void ptys2ethtool_adver_link(unsigned long *advertising_modes, + u32 eth_proto_cap, bool ext) +{ + unsigned long proto_cap = eth_proto_cap; + struct ptys2ethtool_config *table; + u32 max_size; + int proto; + + table = ext ? ptys2ext_ethtool_table : ptys2legacy_ethtool_table; + max_size = ext ? ARRAY_SIZE(ptys2ext_ethtool_table) : + ARRAY_SIZE(ptys2legacy_ethtool_table); + + for_each_set_bit(proto, &proto_cap, max_size) + bitmap_or(advertising_modes, advertising_modes, + table[proto].advertised, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static const u32 pplm_fec_2_ethtool[] = { + [MLX5E_FEC_NOFEC] = ETHTOOL_FEC_OFF, + [MLX5E_FEC_FIRECODE] = ETHTOOL_FEC_BASER, + [MLX5E_FEC_RS_528_514] = ETHTOOL_FEC_RS, + [MLX5E_FEC_RS_544_514] = ETHTOOL_FEC_RS, + [MLX5E_FEC_LLRS_272_257_1] = ETHTOOL_FEC_LLRS, +}; + +static u32 pplm2ethtool_fec(u_long fec_mode, unsigned long size) +{ + int mode = 0; + + if (!fec_mode) + return ETHTOOL_FEC_AUTO; + + mode = find_first_bit(&fec_mode, size); + + if (mode < ARRAY_SIZE(pplm_fec_2_ethtool)) + return pplm_fec_2_ethtool[mode]; + + return 0; +} + +#define MLX5E_ADVERTISE_SUPPORTED_FEC(mlx5_fec, ethtool_fec) \ + do { \ + if (mlx5e_fec_in_caps(dev, 1 << (mlx5_fec))) \ + __set_bit(ethtool_fec, \ + link_ksettings->link_modes.supported);\ + } while (0) + +static const u32 pplm_fec_2_ethtool_linkmodes[] = { + [MLX5E_FEC_NOFEC] = ETHTOOL_LINK_MODE_FEC_NONE_BIT, + [MLX5E_FEC_FIRECODE] = ETHTOOL_LINK_MODE_FEC_BASER_BIT, + [MLX5E_FEC_RS_528_514] = ETHTOOL_LINK_MODE_FEC_RS_BIT, + [MLX5E_FEC_RS_544_514] = ETHTOOL_LINK_MODE_FEC_RS_BIT, + [MLX5E_FEC_LLRS_272_257_1] = ETHTOOL_LINK_MODE_FEC_LLRS_BIT, +}; + +static int get_fec_supported_advertised(struct mlx5_core_dev *dev, + struct ethtool_link_ksettings *link_ksettings) +{ + unsigned long active_fec_long; + u32 active_fec; + u32 bitn; + int err; + + err = mlx5e_get_fec_mode(dev, &active_fec, NULL); + if (err) + return (err == -EOPNOTSUPP) ? 
0 : err; + + MLX5E_ADVERTISE_SUPPORTED_FEC(MLX5E_FEC_NOFEC, + ETHTOOL_LINK_MODE_FEC_NONE_BIT); + MLX5E_ADVERTISE_SUPPORTED_FEC(MLX5E_FEC_FIRECODE, + ETHTOOL_LINK_MODE_FEC_BASER_BIT); + MLX5E_ADVERTISE_SUPPORTED_FEC(MLX5E_FEC_RS_528_514, + ETHTOOL_LINK_MODE_FEC_RS_BIT); + MLX5E_ADVERTISE_SUPPORTED_FEC(MLX5E_FEC_LLRS_272_257_1, + ETHTOOL_LINK_MODE_FEC_LLRS_BIT); + + active_fec_long = active_fec; + /* active fec is a bit set, find out which bit is set and + * advertise the corresponding ethtool bit + */ + bitn = find_first_bit(&active_fec_long, sizeof(active_fec_long) * BITS_PER_BYTE); + if (bitn < ARRAY_SIZE(pplm_fec_2_ethtool_linkmodes)) + __set_bit(pplm_fec_2_ethtool_linkmodes[bitn], + link_ksettings->link_modes.advertising); + + return 0; +} + +static void ptys2ethtool_supported_advertised_port(struct mlx5_core_dev *mdev, + struct ethtool_link_ksettings *link_ksettings, + u32 eth_proto_cap, u8 connector_type) +{ + if (!MLX5_CAP_PCAM_FEATURE(mdev, ptys_connector_type)) { + if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_10GBASE_CR) + | MLX5E_PROT_MASK(MLX5E_10GBASE_SR) + | MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) + | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4) + | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) + | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, + FIBRE); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, + FIBRE); + } + + if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_100GBASE_KR4) + | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) + | MLX5E_PROT_MASK(MLX5E_10GBASE_KR) + | MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) + | MLX5E_PROT_MASK(MLX5E_1000BASE_KX))) { + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, + Backplane); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, + Backplane); + } + return; + } + + switch (connector_type) { + case MLX5E_PORT_TP: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, TP); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, TP); + break; + case MLX5E_PORT_AUI: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, AUI); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, AUI); + break; + case MLX5E_PORT_BNC: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, BNC); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, BNC); + break; + case MLX5E_PORT_MII: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, MII); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, MII); + break; + case MLX5E_PORT_FIBRE: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, FIBRE); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, FIBRE); + break; + case MLX5E_PORT_DA: + ethtool_link_ksettings_add_link_mode(link_ksettings, + supported, Backplane); + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Backplane); + break; + case MLX5E_PORT_NONE: + case MLX5E_PORT_OTHER: + default: + break; + } +} + +static void get_speed_duplex(struct net_device *netdev, + u32 eth_proto_oper, bool force_legacy, + u16 data_rate_oper, + struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + u32 speed = SPEED_UNKNOWN; + u8 duplex = DUPLEX_UNKNOWN; + + if (!netif_carrier_ok(netdev)) + goto out; + + speed = mlx5e_port_ptys2speed(priv->mdev, eth_proto_oper, force_legacy); + if (!speed) { + if (data_rate_oper) + speed = 100 * data_rate_oper; + else + speed = SPEED_UNKNOWN; + goto out; + } + + 
duplex = DUPLEX_FULL; + +out: + link_ksettings->base.speed = speed; + link_ksettings->base.duplex = duplex; +} + +static void get_supported(struct mlx5_core_dev *mdev, u32 eth_proto_cap, + struct ethtool_link_ksettings *link_ksettings) +{ + unsigned long *supported = link_ksettings->link_modes.supported; + ptys2ethtool_supported_link(mdev, supported, eth_proto_cap); + + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, Pause); +} + +static void get_advertising(u32 eth_proto_cap, u8 tx_pause, u8 rx_pause, + struct ethtool_link_ksettings *link_ksettings, + bool ext) +{ + unsigned long *advertising = link_ksettings->link_modes.advertising; + ptys2ethtool_adver_link(advertising, eth_proto_cap, ext); + + if (rx_pause) + ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Pause); + if (tx_pause ^ rx_pause) + ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Asym_Pause); +} + +static int ptys2connector_type[MLX5E_CONNECTOR_TYPE_NUMBER] = { + [MLX5E_PORT_UNKNOWN] = PORT_OTHER, + [MLX5E_PORT_NONE] = PORT_NONE, + [MLX5E_PORT_TP] = PORT_TP, + [MLX5E_PORT_AUI] = PORT_AUI, + [MLX5E_PORT_BNC] = PORT_BNC, + [MLX5E_PORT_MII] = PORT_MII, + [MLX5E_PORT_FIBRE] = PORT_FIBRE, + [MLX5E_PORT_DA] = PORT_DA, + [MLX5E_PORT_OTHER] = PORT_OTHER, + }; + +static u8 get_connector_port(struct mlx5_core_dev *mdev, u32 eth_proto, u8 connector_type) +{ + if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_connector_type)) + return ptys2connector_type[connector_type]; + + if (eth_proto & + (MLX5E_PROT_MASK(MLX5E_10GBASE_SR) | + MLX5E_PROT_MASK(MLX5E_40GBASE_SR4) | + MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) | + MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) { + return PORT_FIBRE; + } + + if (eth_proto & + (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) | + MLX5E_PROT_MASK(MLX5E_10GBASE_CR) | + MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) { + return PORT_DA; + } + + if (eth_proto & + (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) | + MLX5E_PROT_MASK(MLX5E_10GBASE_KR) | + MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) | + MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) { + return PORT_NONE; + } + + return PORT_OTHER; +} + +static void get_lp_advertising(struct mlx5_core_dev *mdev, u32 eth_proto_lp, + struct ethtool_link_ksettings *link_ksettings) +{ + unsigned long *lp_advertising = link_ksettings->link_modes.lp_advertising; + bool ext = mlx5e_ptys_ext_supported(mdev); + + ptys2ethtool_adver_link(lp_advertising, eth_proto_lp, ext); +} + +int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv, + struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {}; + u32 eth_proto_admin; + u8 an_disable_admin; + u16 data_rate_oper; + u32 eth_proto_oper; + u32 eth_proto_cap; + u8 connector_type; + u32 rx_pause = 0; + u32 tx_pause = 0; + u32 eth_proto_lp; + bool admin_ext; + u8 an_status; + bool ext; + int err; + + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); + if (err) { + netdev_err(priv->netdev, "%s: query port ptys failed: %d\n", + __func__, err); + goto err_query_regs; + } + ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); + eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, + eth_proto_capability); + eth_proto_admin = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, + eth_proto_admin); + /* Fields: eth_proto_admin and ext_eth_proto_admin are + * mutually exclusive. Hence try reading legacy advertising + * when extended advertising is zero. + * admin_ext indicates which proto_admin (ext vs. 
legacy) + * should be read and interpreted + */ + admin_ext = ext; + if (ext && !eth_proto_admin) { + eth_proto_admin = MLX5_GET_ETH_PROTO(ptys_reg, out, false, + eth_proto_admin); + admin_ext = false; + } + + eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, admin_ext, + eth_proto_oper); + eth_proto_lp = MLX5_GET(ptys_reg, out, eth_proto_lp_advertise); + an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin); + an_status = MLX5_GET(ptys_reg, out, an_status); + connector_type = MLX5_GET(ptys_reg, out, connector_type); + data_rate_oper = MLX5_GET(ptys_reg, out, data_rate_oper); + + mlx5_query_port_pause(mdev, &rx_pause, &tx_pause); + + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); + ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); + + get_supported(mdev, eth_proto_cap, link_ksettings); + get_advertising(eth_proto_admin, tx_pause, rx_pause, link_ksettings, + admin_ext); + get_speed_duplex(priv->netdev, eth_proto_oper, !admin_ext, + data_rate_oper, link_ksettings); + + eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap; + connector_type = connector_type < MLX5E_CONNECTOR_TYPE_NUMBER ? + connector_type : MLX5E_PORT_UNKNOWN; + link_ksettings->base.port = get_connector_port(mdev, eth_proto_oper, connector_type); + ptys2ethtool_supported_advertised_port(mdev, link_ksettings, eth_proto_admin, + connector_type); + get_lp_advertising(mdev, eth_proto_lp, link_ksettings); + + if (an_status == MLX5_AN_COMPLETE) + ethtool_link_ksettings_add_link_mode(link_ksettings, + lp_advertising, Autoneg); + + link_ksettings->base.autoneg = an_disable_admin ? AUTONEG_DISABLE : + AUTONEG_ENABLE; + ethtool_link_ksettings_add_link_mode(link_ksettings, supported, + Autoneg); + + err = get_fec_supported_advertised(mdev, link_ksettings); + if (err) { + netdev_dbg(priv->netdev, "%s: FEC caps query failed: %d\n", + __func__, err); + err = 0; /* don't fail caps query because of FEC error */ + } + + if (!an_disable_admin) + ethtool_link_ksettings_add_link_mode(link_ksettings, + advertising, Autoneg); + +err_query_regs: + return err; +} + +int mlx5e_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_link_ksettings(priv, link_ksettings); +} + +static int mlx5e_speed_validate(struct net_device *netdev, bool ext, + const unsigned long link_modes, u8 autoneg) +{ + /* Extended link-mode has no speed limitations. 
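+ * For the legacy PTYS layout checked below, the one restriction is that a
+ * 56G link mode (MLX5E_56GBASE_R4) may only be requested together with
+ * autonegotiation enabled.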
*/ + if (ext) + return 0; + + if ((link_modes & MLX5E_PROT_MASK(MLX5E_56GBASE_R4)) && + autoneg != AUTONEG_ENABLE) { + netdev_err(netdev, "%s: 56G link speed requires autoneg enabled\n", + __func__); + return -EINVAL; + } + return 0; +} + +static u32 mlx5e_ethtool2ptys_adver_link(const unsigned long *link_modes) +{ + u32 i, ptys_modes = 0; + + for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) { + if (*ptys2legacy_ethtool_table[i].advertised == 0) + continue; + if (bitmap_intersects(ptys2legacy_ethtool_table[i].advertised, + link_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS)) + ptys_modes |= MLX5E_PROT_MASK(i); + } + + return ptys_modes; +} + +static u32 mlx5e_ethtool2ptys_ext_adver_link(const unsigned long *link_modes) +{ + u32 i, ptys_modes = 0; + unsigned long modes[2]; + + for (i = 0; i < MLX5E_EXT_LINK_MODES_NUMBER; ++i) { + if (ptys2ext_ethtool_table[i].advertised[0] == 0 && + ptys2ext_ethtool_table[i].advertised[1] == 0) + continue; + memset(modes, 0, sizeof(modes)); + bitmap_and(modes, ptys2ext_ethtool_table[i].advertised, + link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS); + + if (modes[0] == ptys2ext_ethtool_table[i].advertised[0] && + modes[1] == ptys2ext_ethtool_table[i].advertised[1]) + ptys_modes |= MLX5E_PROT_MASK(i); + } + return ptys_modes; +} + +static bool ext_link_mode_requested(const unsigned long *adver) +{ +#define MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT ETHTOOL_LINK_MODE_50000baseKR_Full_BIT + int size = __ETHTOOL_LINK_MODE_MASK_NBITS - MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT; + __ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = {0,}; + + /* bitmap_intersects returns true for empty modes, but we want false */ + if (size <= 0) + return false; + + bitmap_set(modes, MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT, size); + return bitmap_intersects(modes, adver, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static bool ext_requested(u8 autoneg, const unsigned long *adver, bool ext_supported) +{ + bool ext_link_mode = ext_link_mode_requested(adver); + + return autoneg == AUTONEG_ENABLE ? ext_link_mode : ext_supported; +} + +int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv, + const struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_port_eth_proto eproto; + const unsigned long *adver; + bool an_changes = false; + u8 an_disable_admin; + bool ext_supported; + u8 an_disable_cap; + bool an_disable; + u32 link_modes; + u8 an_status; + u8 autoneg; + u32 speed; + bool ext; + int err; + + u32 (*ethtool2ptys_adver_func)(const unsigned long *adver); + + adver = link_ksettings->link_modes.advertising; + autoneg = link_ksettings->base.autoneg; + speed = link_ksettings->base.speed; + + ext_supported = mlx5e_ptys_ext_supported(mdev); + ext = ext_requested(autoneg, adver, ext_supported); + if (!ext_supported && ext) + return -EOPNOTSUPP; + + ethtool2ptys_adver_func = ext ? mlx5e_ethtool2ptys_ext_adver_link : + mlx5e_ethtool2ptys_adver_link; + err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto); + if (err) { + netdev_err(priv->netdev, "%s: query port eth proto failed: %d\n", + __func__, err); + goto out; + } + link_modes = autoneg == AUTONEG_ENABLE ? 
ethtool2ptys_adver_func(adver) : + mlx5e_port_speed2linkmodes(mdev, speed, !ext); + + err = mlx5e_speed_validate(priv->netdev, ext, link_modes, autoneg); + if (err) + goto out; + + link_modes = link_modes & eproto.cap; + if (!link_modes) { + netdev_err(priv->netdev, "%s: Not supported link mode(s) requested", + __func__); + err = -EINVAL; + goto out; + } + + mlx5_port_query_eth_autoneg(mdev, &an_status, &an_disable_cap, + &an_disable_admin); + + an_disable = autoneg == AUTONEG_DISABLE; + an_changes = ((!an_disable && an_disable_admin) || + (an_disable && !an_disable_admin)); + + if (!an_changes && link_modes == eproto.admin) + goto out; + + mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext); + mlx5_toggle_port_link(mdev); + +out: + return err; +} + +int mlx5e_set_link_ksettings(struct net_device *netdev, + const struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_set_link_ksettings(priv, link_ksettings); +} + +u32 mlx5e_ethtool_get_rxfh_key_size(struct mlx5e_priv *priv) +{ + return sizeof_field(struct mlx5e_rss_params_hash, toeplitz_hash_key); +} + +static u32 mlx5e_get_rxfh_key_size(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_rxfh_key_size(priv); +} + +u32 mlx5e_ethtool_get_rxfh_indir_size(struct mlx5e_priv *priv) +{ + return MLX5E_INDIR_RQT_SIZE; +} + +static u32 mlx5e_get_rxfh_indir_size(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_rxfh_indir_size(priv); +} + +static int mlx5e_get_rxfh_context(struct net_device *dev, u32 *indir, + u8 *key, u8 *hfunc, u32 rss_context) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_rx_res_rss_get_rxfh(priv->rx_res, rss_context, indir, key, hfunc); + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_set_rxfh_context(struct net_device *dev, const u32 *indir, + const u8 *key, const u8 hfunc, + u32 *rss_context, bool delete) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + mutex_lock(&priv->state_lock); + if (delete) { + err = mlx5e_rx_res_rss_destroy(priv->rx_res, *rss_context); + goto unlock; + } + + if (*rss_context == ETH_RXFH_CONTEXT_ALLOC) { + unsigned int count = priv->channels.params.num_channels; + + err = mlx5e_rx_res_rss_init(priv->rx_res, rss_context, count); + if (err) + goto unlock; + } + + err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, *rss_context, indir, key, + hfunc == ETH_RSS_HASH_NO_CHANGE ? NULL : &hfunc); + +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +{ + return mlx5e_get_rxfh_context(netdev, indir, key, hfunc, 0); +} + +int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, + const u8 *key, const u8 hfunc) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, 0, indir, key, + hfunc == ETH_RSS_HASH_NO_CHANGE ? 
NULL : &hfunc); + mutex_unlock(&priv->state_lock); + return err; +} + +#define MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC 100 +#define MLX5E_PFC_PREVEN_TOUT_MAX_MSEC 8000 +#define MLX5E_PFC_PREVEN_MINOR_PRECENT 85 +#define MLX5E_PFC_PREVEN_TOUT_MIN_MSEC 80 +#define MLX5E_DEVICE_STALL_MINOR_WATERMARK(critical_tout) \ + max_t(u16, MLX5E_PFC_PREVEN_TOUT_MIN_MSEC, \ + (critical_tout * MLX5E_PFC_PREVEN_MINOR_PRECENT) / 100) + +static int mlx5e_get_pfc_prevention_tout(struct net_device *netdev, + u16 *pfc_prevention_tout) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) || + !MLX5_CAP_DEBUG((priv)->mdev, stall_detect)) + return -EOPNOTSUPP; + + return mlx5_query_port_stall_watermark(mdev, pfc_prevention_tout, NULL); +} + +static int mlx5e_set_pfc_prevention_tout(struct net_device *netdev, + u16 pfc_preven) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u16 critical_tout; + u16 minor; + + if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) || + !MLX5_CAP_DEBUG((priv)->mdev, stall_detect)) + return -EOPNOTSUPP; + + critical_tout = (pfc_preven == PFC_STORM_PREVENTION_AUTO) ? + MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC : + pfc_preven; + + if (critical_tout != PFC_STORM_PREVENTION_DISABLE && + (critical_tout > MLX5E_PFC_PREVEN_TOUT_MAX_MSEC || + critical_tout < MLX5E_PFC_PREVEN_TOUT_MIN_MSEC)) { + netdev_info(netdev, "%s: pfc prevention tout not in range (%d-%d)\n", + __func__, MLX5E_PFC_PREVEN_TOUT_MIN_MSEC, + MLX5E_PFC_PREVEN_TOUT_MAX_MSEC); + return -EINVAL; + } + + minor = MLX5E_DEVICE_STALL_MINOR_WATERMARK(critical_tout); + return mlx5_set_port_stall_watermark(mdev, critical_tout, + minor); +} + +static int mlx5e_get_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + void *data) +{ + int err; + + switch (tuna->id) { + case ETHTOOL_PFC_PREVENTION_TOUT: + err = mlx5e_get_pfc_prevention_tout(dev, data); + break; + default: + err = -EINVAL; + break; + } + + return err; +} + +static int mlx5e_set_tunable(struct net_device *dev, + const struct ethtool_tunable *tuna, + const void *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int err; + + mutex_lock(&priv->state_lock); + + switch (tuna->id) { + case ETHTOOL_PFC_PREVENTION_TOUT: + err = mlx5e_set_pfc_prevention_tout(dev, *(u16 *)data); + break; + default: + err = -EINVAL; + break; + } + + mutex_unlock(&priv->state_lock); + return err; +} + +static void mlx5e_get_pause_stats(struct net_device *netdev, + struct ethtool_pause_stats *pause_stats) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_stats_pause_get(priv, pause_stats); +} + +void mlx5e_ethtool_get_pauseparam(struct mlx5e_priv *priv, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5_query_port_pause(mdev, &pauseparam->rx_pause, + &pauseparam->tx_pause); + if (err) { + netdev_err(priv->netdev, "%s: mlx5_query_port_pause failed:0x%x\n", + __func__, err); + } +} + +static void mlx5e_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_ethtool_get_pauseparam(priv, pauseparam); +} + +int mlx5e_ethtool_set_pauseparam(struct mlx5e_priv *priv, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + if (!MLX5_CAP_GEN(mdev, vport_group_manager)) + return -EOPNOTSUPP; + + if (pauseparam->autoneg) + return -EINVAL; + + err = 
mlx5_set_port_pause(mdev, + pauseparam->rx_pause ? 1 : 0, + pauseparam->tx_pause ? 1 : 0); + if (err) { + netdev_err(priv->netdev, "%s: mlx5_set_port_pause failed:0x%x\n", + __func__, err); + } + + return err; +} + +static int mlx5e_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_set_pauseparam(priv, pauseparam); +} + +int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, + struct ethtool_ts_info *info) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + info->phc_index = mlx5_clock_get_ptp_index(mdev); + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz) || + info->phc_index == -1) + return 0; + + info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE; + + info->tx_types = BIT(HWTSTAMP_TX_OFF) | + BIT(HWTSTAMP_TX_ON); + + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | + BIT(HWTSTAMP_FILTER_ALL); + + return 0; +} + +static int mlx5e_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *info) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_get_ts_info(priv, info); +} + +static __u32 mlx5e_get_wol_supported(struct mlx5_core_dev *mdev) +{ + __u32 ret = 0; + + if (MLX5_CAP_GEN(mdev, wol_g)) + ret |= WAKE_MAGIC; + + if (MLX5_CAP_GEN(mdev, wol_s)) + ret |= WAKE_MAGICSECURE; + + if (MLX5_CAP_GEN(mdev, wol_a)) + ret |= WAKE_ARP; + + if (MLX5_CAP_GEN(mdev, wol_b)) + ret |= WAKE_BCAST; + + if (MLX5_CAP_GEN(mdev, wol_m)) + ret |= WAKE_MCAST; + + if (MLX5_CAP_GEN(mdev, wol_u)) + ret |= WAKE_UCAST; + + if (MLX5_CAP_GEN(mdev, wol_p)) + ret |= WAKE_PHY; + + return ret; +} + +static __u32 mlx5e_reformat_wol_mode_mlx5_to_linux(u8 mode) +{ + __u32 ret = 0; + + if (mode & MLX5_WOL_MAGIC) + ret |= WAKE_MAGIC; + + if (mode & MLX5_WOL_SECURED_MAGIC) + ret |= WAKE_MAGICSECURE; + + if (mode & MLX5_WOL_ARP) + ret |= WAKE_ARP; + + if (mode & MLX5_WOL_BROADCAST) + ret |= WAKE_BCAST; + + if (mode & MLX5_WOL_MULTICAST) + ret |= WAKE_MCAST; + + if (mode & MLX5_WOL_UNICAST) + ret |= WAKE_UCAST; + + if (mode & MLX5_WOL_PHY_ACTIVITY) + ret |= WAKE_PHY; + + return ret; +} + +static u8 mlx5e_reformat_wol_mode_linux_to_mlx5(__u32 mode) +{ + u8 ret = 0; + + if (mode & WAKE_MAGIC) + ret |= MLX5_WOL_MAGIC; + + if (mode & WAKE_MAGICSECURE) + ret |= MLX5_WOL_SECURED_MAGIC; + + if (mode & WAKE_ARP) + ret |= MLX5_WOL_ARP; + + if (mode & WAKE_BCAST) + ret |= MLX5_WOL_BROADCAST; + + if (mode & WAKE_MCAST) + ret |= MLX5_WOL_MULTICAST; + + if (mode & WAKE_UCAST) + ret |= MLX5_WOL_UNICAST; + + if (mode & WAKE_PHY) + ret |= MLX5_WOL_PHY_ACTIVITY; + + return ret; +} + +static void mlx5e_get_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 mlx5_wol_mode; + int err; + + memset(wol, 0, sizeof(*wol)); + + wol->supported = mlx5e_get_wol_supported(mdev); + if (!wol->supported) + return; + + err = mlx5_query_port_wol(mdev, &mlx5_wol_mode); + if (err) + return; + + wol->wolopts = mlx5e_reformat_wol_mode_mlx5_to_linux(mlx5_wol_mode); +} + +static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + __u32 wol_supported = mlx5e_get_wol_supported(mdev); + u32 mlx5_wol_mode; + + if (!wol_supported) + return -EOPNOTSUPP; + + if (wol->wolopts & ~wol_supported) + return -EINVAL; + + mlx5_wol_mode = 
mlx5e_reformat_wol_mode_linux_to_mlx5(wol->wolopts); + + return mlx5_set_port_wol(mdev, mlx5_wol_mode); +} + +static void mlx5e_get_fec_stats(struct net_device *netdev, + struct ethtool_fec_stats *fec_stats) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_stats_fec_get(priv, fec_stats); +} + +static int mlx5e_get_fecparam(struct net_device *netdev, + struct ethtool_fecparam *fecparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u16 fec_configured; + u32 fec_active; + int err; + + err = mlx5e_get_fec_mode(mdev, &fec_active, &fec_configured); + + if (err) + return err; + + fecparam->active_fec = pplm2ethtool_fec((unsigned long)fec_active, + sizeof(unsigned long) * BITS_PER_BYTE); + + if (!fecparam->active_fec) + return -EOPNOTSUPP; + + fecparam->fec = pplm2ethtool_fec((unsigned long)fec_configured, + sizeof(unsigned long) * BITS_PER_BYTE); + + return 0; +} + +static int mlx5e_set_fecparam(struct net_device *netdev, + struct ethtool_fecparam *fecparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + unsigned long fec_bitmap; + u16 fec_policy = 0; + int mode; + int err; + + bitmap_from_arr32(&fec_bitmap, &fecparam->fec, sizeof(fecparam->fec) * BITS_PER_BYTE); + if (bitmap_weight(&fec_bitmap, ETHTOOL_FEC_LLRS_BIT + 1) > 1) + return -EOPNOTSUPP; + + for (mode = 0; mode < ARRAY_SIZE(pplm_fec_2_ethtool); mode++) { + if (!(pplm_fec_2_ethtool[mode] & fecparam->fec)) + continue; + fec_policy |= (1 << mode); + break; + } + + err = mlx5e_set_fec_mode(mdev, fec_policy); + + if (err) + return err; + + mlx5_toggle_port_link(mdev); + + return 0; +} + +static u32 mlx5e_get_msglevel(struct net_device *dev) +{ + return ((struct mlx5e_priv *)netdev_priv(dev))->msglevel; +} + +static void mlx5e_set_msglevel(struct net_device *dev, u32 val) +{ + ((struct mlx5e_priv *)netdev_priv(dev))->msglevel = val; +} + +static int mlx5e_set_phys_id(struct net_device *dev, + enum ethtool_phys_id_state state) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + u16 beacon_duration; + + if (!MLX5_CAP_GEN(mdev, beacon_led)) + return -EOPNOTSUPP; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + beacon_duration = MLX5_BEACON_DURATION_INF; + break; + case ETHTOOL_ID_INACTIVE: + beacon_duration = MLX5_BEACON_DURATION_OFF; + break; + default: + return -EOPNOTSUPP; + } + + return mlx5_set_port_beacon(mdev, beacon_duration); +} + +static int mlx5e_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *dev = priv->mdev; + int size_read = 0; + u8 data[4] = {0}; + + size_read = mlx5_query_module_eeprom(dev, 0, 2, data); + if (size_read < 2) + return -EIO; + + /* data[0] = identifier byte */ + switch (data[0]) { + case MLX5_MODULE_ID_QSFP: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_MAX_LEN; + break; + case MLX5_MODULE_ID_QSFP_PLUS: + case MLX5_MODULE_ID_QSFP28: + /* data[1] = revision id */ + if (data[0] == MLX5_MODULE_ID_QSFP28 || data[1] >= 0x3) { + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_MAX_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_MAX_LEN; + } + break; + case MLX5_MODULE_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + default: + netdev_err(priv->netdev, "%s: cable type not 
recognized:0x%x\n", + __func__, data[0]); + return -EINVAL; + } + + return 0; +} + +static int mlx5e_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + int offset = ee->offset; + int size_read; + int i = 0; + + if (!ee->len) + return -EINVAL; + + memset(data, 0, ee->len); + + while (i < ee->len) { + size_read = mlx5_query_module_eeprom(mdev, offset, ee->len - i, + data + i); + + if (!size_read) + /* Done reading */ + return 0; + + if (size_read < 0) { + netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n", + __func__, size_read); + return size_read; + } + + i += size_read; + offset += size_read; + } + + return 0; +} + +static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_module_eeprom_query_params query; + struct mlx5_core_dev *mdev = priv->mdev; + u8 *data = page_data->data; + int size_read; + int i = 0; + + if (!page_data->length) + return -EINVAL; + + memset(data, 0, page_data->length); + + query.offset = page_data->offset; + query.i2c_address = page_data->i2c_address; + query.bank = page_data->bank; + query.page = page_data->page; + while (i < page_data->length) { + query.size = page_data->length - i; + size_read = mlx5_query_module_eeprom_by_page(mdev, &query, data + i); + + /* Done reading, return how many bytes was read */ + if (!size_read) + return i; + + if (size_read == -EINVAL) + return -EINVAL; + if (size_read < 0) { + netdev_err(priv->netdev, "%s: mlx5_query_module_eeprom_by_page failed:0x%x\n", + __func__, size_read); + return i; + } + + i += size_read; + query.offset += size_read; + } + + return i; +} + +int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, + struct ethtool_flash *flash) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct net_device *dev = priv->netdev; + const struct firmware *fw; + int err; + + if (flash->region != ETHTOOL_FLASH_ALL_REGIONS) + return -EOPNOTSUPP; + + err = request_firmware_direct(&fw, flash->data, &dev->dev); + if (err) + return err; + + dev_hold(dev); + rtnl_unlock(); + + err = mlx5_firmware_flash(mdev, fw, NULL); + release_firmware(fw); + + rtnl_lock(); + dev_put(dev); + return err; +} + +static int mlx5e_flash_device(struct net_device *dev, + struct ethtool_flash *flash) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_flash_device(priv, flash); +} + +static int set_pflag_cqe_based_moder(struct net_device *netdev, bool enable, + bool is_rx_cq) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + u8 cq_period_mode, current_cq_period_mode; + struct mlx5e_params new_params; + + if (enable && !MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) + return -EOPNOTSUPP; + + cq_period_mode = cqe_mode_to_period_mode(enable); + + current_cq_period_mode = is_rx_cq ? 
+ priv->channels.params.rx_cq_moderation.cq_period_mode : + priv->channels.params.tx_cq_moderation.cq_period_mode; + + if (cq_period_mode == current_cq_period_mode) + return 0; + + new_params = priv->channels.params; + if (is_rx_cq) + mlx5e_set_rx_cq_mode_params(&new_params, cq_period_mode); + else + mlx5e_set_tx_cq_mode_params(&new_params, cq_period_mode); + + return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); +} + +static int set_pflag_tx_cqe_based_moder(struct net_device *netdev, bool enable) +{ + return set_pflag_cqe_based_moder(netdev, enable, false); +} + +static int set_pflag_rx_cqe_based_moder(struct net_device *netdev, bool enable) +{ + return set_pflag_cqe_based_moder(netdev, enable, true); +} + +int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val, bool rx_filter) +{ + bool curr_val = MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS); + struct mlx5e_params new_params; + int err = 0; + + if (!MLX5_CAP_GEN(priv->mdev, cqe_compression)) + return new_val ? -EOPNOTSUPP : 0; + + if (curr_val == new_val) + return 0; + + if (new_val && !mlx5e_profile_feature_cap(priv->profile, PTP_RX) && rx_filter) { + netdev_err(priv->netdev, + "Profile doesn't support enabling of CQE compression while hardware time-stamping is enabled.\n"); + return -EINVAL; + } + + if (priv->channels.params.packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) { + netdev_warn(priv->netdev, "Can't set CQE compression with HW-GRO, disable it first.\n"); + return -EINVAL; + } + + new_params = priv->channels.params; + MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_RX_CQE_COMPRESS, new_val); + if (rx_filter) + new_params.ptp_rx = new_val; + + if (new_params.ptp_rx == priv->channels.params.ptp_rx) + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + else + err = mlx5e_safe_switch_params(priv, &new_params, mlx5e_ptp_rx_manage_fs_ctx, + &new_params.ptp_rx, true); + if (err) + return err; + + mlx5e_dbg(DRV, priv, "MLX5E: RxCqeCmprss was turned %s\n", + MLX5E_GET_PFLAG(&priv->channels.params, + MLX5E_PFLAG_RX_CQE_COMPRESS) ? "ON" : "OFF"); + + return 0; +} + +static int set_pflag_rx_cqe_compress(struct net_device *netdev, + bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + bool rx_filter; + int err; + + if (!MLX5_CAP_GEN(mdev, cqe_compression)) + return -EOPNOTSUPP; + + rx_filter = priv->tstamp.rx_filter != HWTSTAMP_FILTER_NONE; + err = mlx5e_modify_rx_cqe_compression_locked(priv, enable, rx_filter); + if (err) + return err; + + priv->channels.params.rx_cqe_compress_def = enable; + + return 0; +} + +static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; + + if (enable) { + /* Checking the regular RQ here; mlx5e_validate_xsk_param called + * from mlx5e_open_xsk will check for each XSK queue, and + * mlx5e_safe_switch_params will be reverted if any check fails. 
+ */ + int err = mlx5e_mpwrq_validate_regular(mdev, &priv->channels.params); + + if (err) + return err; + } else if (priv->channels.params.packet_merge.type != MLX5E_PACKET_MERGE_NONE) { + netdev_warn(netdev, "Can't set legacy RQ with HW-GRO/LRO, disable them first\n"); + return -EINVAL; + } + + new_params = priv->channels.params; + + MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_RX_STRIDING_RQ, enable); + mlx5e_set_rq_type(mdev, &new_params); + + return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); +} + +static int set_pflag_rx_no_csum_complete(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_channels *channels = &priv->channels; + struct mlx5e_channel *c; + int i; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || + priv->channels.params.xdp_prog) + return 0; + + for (i = 0; i < channels->num; i++) { + c = channels->c[i]; + if (enable) + __set_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &c->rq.state); + else + __clear_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &c->rq.state); + } + + return 0; +} + +static int set_pflag_tx_mpwqe_common(struct net_device *netdev, u32 flag, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; + + if (enable && !mlx5e_tx_mpwqe_supported(mdev)) + return -EOPNOTSUPP; + + new_params = priv->channels.params; + + MLX5E_SET_PFLAG(&new_params, flag, enable); + + return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); +} + +static int set_pflag_per_channel_stats(struct net_device *netdev, bool enable) +{ + /* This is empty on purpose. The per_channel_stats feature affects the + * "ethtool -S" output only. Allow to toggle it unconditionally. */ + return 0; +} + +static int set_pflag_xdp_tx_mpwqe(struct net_device *netdev, bool enable) +{ + return set_pflag_tx_mpwqe_common(netdev, MLX5E_PFLAG_XDP_TX_MPWQE, enable); +} + +static int set_pflag_skb_tx_mpwqe(struct net_device *netdev, bool enable) +{ + return set_pflag_tx_mpwqe_common(netdev, MLX5E_PFLAG_SKB_TX_MPWQE, enable); +} + +static int set_pflag_tx_port_ts(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; + int err; + + if (!MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) + return -EOPNOTSUPP; + + /* Don't allow changing the PTP state if HTB offload is active, because + * the numeration of the QoS SQs will change, while per-queue qdiscs are + * attached. + */ + if (priv->htb.maj_id) { + netdev_err(priv->netdev, "%s: HTB offload is active, cannot change the PTP state\n", + __func__); + return -EINVAL; + } + + new_params = priv->channels.params; + /* Don't allow enabling TX-port-TS if MQPRIO mode channel offload is + * active, since it defines explicitly which TC accepts the packet. + * This conflicts with TX-port-TS hijacking the PTP traffic to a specific + * HW TX-queue. + */ + if (enable && new_params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + netdev_err(priv->netdev, + "%s: MQPRIO mode channel offload is active, cannot set the TX-port-TS\n", + __func__); + return -EINVAL; + } + MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_PORT_TS, enable); + /* No need to verify SQ stop room as + * ptpsq.txqsq.stop_room <= generic_sq->stop_room, and both + * has the same log_sq_size. 
+ */ + + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_num_channels_changed_ctx, NULL, true); + if (!err) + priv->tx_ptp_opened = true; + + return err; +} + +static int set_pflag_dropless_rq(struct net_device *netdev, + bool new_val) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + bool curr_val = MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_DROPLESS_RQ); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params new_params; + int err = 0; + + if (!mlx5e_dropless_rq_supported(mdev)) + return -EOPNOTSUPP; + + if (curr_val == new_val) + return 0; + + new_params = priv->channels.params; + + MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_DROPLESS_RQ, new_val); + + mlx5e_set_rq_type(priv->mdev, &new_params); + mlx5e_init_rq_type_params(priv->mdev, &new_params); + + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_params; + return 0; + } + + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + + return err; +} + +static int set_pflag_tx_xdp_hw_checksum(struct net_device *netdev, bool new_val) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + bool curr_val = MLX5E_GET_PFLAG(&priv->channels.params, + MLX5E_PFLAG_TX_XDP_CSUM); + struct mlx5e_params new_params; + int err = 0; + + if (curr_val == new_val) + return 0; + + new_params = priv->channels.params; + MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_XDP_CSUM, new_val); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + priv->channels.params = new_params; + return 0; + } + + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + mlx5e_dbg(DRV, priv, "MLX5E: tx_xdp_hw_checksum was turned %s err=%d\n", + MLX5E_GET_PFLAG(&priv->channels.params, + MLX5E_PFLAG_TX_XDP_CSUM) ? "ON" : "OFF", + err); + + return err; +} + +static const struct pflag_desc mlx5e_priv_flags[MLX5E_NUM_PFLAGS] = { + { "rx_cqe_moder", set_pflag_rx_cqe_based_moder }, + { "tx_cqe_moder", set_pflag_tx_cqe_based_moder }, + { "rx_cqe_compress", set_pflag_rx_cqe_compress }, + { "rx_striding_rq", set_pflag_rx_striding_rq }, + { "rx_no_csum_complete", set_pflag_rx_no_csum_complete }, + { "xdp_tx_mpwqe", set_pflag_xdp_tx_mpwqe }, + { "skb_tx_mpwqe", set_pflag_skb_tx_mpwqe }, + { "tx_port_ts", set_pflag_tx_port_ts }, + { "dropless_rq", set_pflag_dropless_rq}, + { "per_channel_stats", set_pflag_per_channel_stats}, + { "tx_xdp_hw_checksum", set_pflag_tx_xdp_hw_checksum}, +}; + +static int mlx5e_handle_pflag(struct net_device *netdev, + u32 wanted_flags, + enum mlx5e_priv_flag flag) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + bool enable = !!(wanted_flags & BIT(flag)); + u32 changes = wanted_flags ^ priv->channels.params.pflags; + int err; + + if (!(changes & BIT(flag))) + return 0; + + err = mlx5e_priv_flags[flag].handler(netdev, enable); + if (err) { + netdev_err(netdev, "%s private flag '%s' failed err %d\n", + enable ? "Enable" : "Disable", mlx5e_priv_flags[flag].name, err); + return err; + } + + MLX5E_SET_PFLAG(&priv->channels.params, flag, enable); + return 0; +} + +static int mlx5e_set_priv_flags(struct net_device *netdev, u32 pflags) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + enum mlx5e_priv_flag pflag; + int err; + + mutex_lock(&priv->state_lock); + + for (pflag = 0; pflag < MLX5E_NUM_PFLAGS; pflag++) { + err = mlx5e_handle_pflag(netdev, pflags, pflag); + if (err) + break; + } + + mutex_unlock(&priv->state_lock); + + /* Need to fix some features.. 
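+ * Some netdev features depend on the private flags that may have just
+ * changed (for instance LRO/HW-GRO is only usable with a striding RQ),
+ * so re-run the feature fixups.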
*/ + netdev_update_features(netdev); + + return err; +} + +static u32 mlx5e_get_priv_flags(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return priv->channels.params.pflags; +} + +int mlx5e_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, + u32 *rule_locs) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + /* ETHTOOL_GRXRINGS is needed by ethtool -x which is not part + * of rxnfc. We keep this logic out of mlx5e_ethtool_get_rxnfc, + * to avoid breaking "ethtool -x" when mlx5e_ethtool_get_rxnfc + * is compiled out via CONFIG_MLX5_EN_RXNFC=n. + */ + if (info->cmd == ETHTOOL_GRXRINGS) { + info->data = priv->channels.params.num_channels; + return 0; + } + + return mlx5e_ethtool_get_rxnfc(priv, info, rule_locs); +} + +int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_set_rxnfc(priv, cmd); +} + +int mlx5_query_port_status(struct mlx5_core_dev *mdev, u32 *status_opcode, + u16 *monitor_opcode, char *status_message) +{ + struct mlx5_ifc_pddr_troubleshooting_page_bits *pddr_troubleshooting_page; + u32 in[MLX5_ST_SZ_DW(pddr_reg)] = {}; + u32 out[MLX5_ST_SZ_DW(pddr_reg)]; + int err; + + MLX5_SET(pddr_reg, in, local_port, 1); + MLX5_SET(pddr_reg, in, page_select, + MLX5_PDDR_REG_PAGE_SELECT_TROUBLESHOOTING_INFO_PAGE); + + pddr_troubleshooting_page = MLX5_ADDR_OF(pddr_reg, in, page_data); + MLX5_SET(pddr_troubleshooting_page, pddr_troubleshooting_page, + group_opcode, MLX5_PDDR_REG_TRBLSH_GROUP_OPCODE_MONITOR); + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PDDR, 0, 0); + if (err) + return err; + + pddr_troubleshooting_page = MLX5_ADDR_OF(pddr_reg, out, page_data); + if (status_opcode) + *status_opcode = MLX5_GET(pddr_troubleshooting_page, pddr_troubleshooting_page, + status_opcode); + if (monitor_opcode) + *monitor_opcode = MLX5_GET(pddr_troubleshooting_page, pddr_troubleshooting_page, + status_opcode.pddr_monitor_opcode); + if (status_message) + strncpy(status_message, + MLX5_ADDR_OF(pddr_troubleshooting_page, pddr_troubleshooting_page, + status_message), + MLX5_FLD_SZ_BYTES(pddr_troubleshooting_page, status_message)); + + return 0; +} + +struct mlx5e_ethtool_link_ext_state_opcode_mapping { + u32 status_opcode; + enum ethtool_link_ext_state link_ext_state; + u8 link_ext_substate; +}; + +static const struct mlx5e_ethtool_link_ext_state_opcode_mapping +mlx5e_link_ext_state_opcode_map[] = { + /* States relating to the autonegotiation or issues therein */ + {2, ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED}, + {3, ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED}, + {4, ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED}, + {36, ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE}, + {38, ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE}, + {39, ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD}, + + /* Failure during link training */ + {5, ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED}, + {6, ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT}, + {7, ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY}, + {8, 
ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, 0}, + {14, ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT}, + + /* Logical mismatch in physical coding sublayer or forward error correction sublayer */ + {9, ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK}, + {10, ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK}, + {11, ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS}, + {12, ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED}, + {13, ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED}, + + /* Signal integrity issues */ + {15, ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, 0}, + {17, ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS}, + {42, ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE}, + + /* No cable connected */ + {1024, ETHTOOL_LINK_EXT_STATE_NO_CABLE, 0}, + + /* Failure is related to cable, e.g., unsupported cable */ + {16, ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE}, + {20, ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE}, + {29, ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE}, + {1025, ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE}, + {1029, ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE}, + {1031, ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, 0}, + + /* Failure is related to EEPROM, e.g., failure during reading or parsing the data */ + {1027, ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, 0}, + + /* Failure during calibration algorithm */ + {23, ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, 0}, + + /* The hardware is not able to provide the power required from cable or module */ + {1032, ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, 0}, + + /* The module is overheated */ + {1030, ETHTOOL_LINK_EXT_STATE_OVERHEAT, 0}, +}; + +static void +mlx5e_set_link_ext_state(struct mlx5e_ethtool_link_ext_state_opcode_mapping + link_ext_state_mapping, + struct ethtool_link_ext_state_info *link_ext_state_info) +{ + switch (link_ext_state_mapping.link_ext_state) { + case ETHTOOL_LINK_EXT_STATE_AUTONEG: + link_ext_state_info->autoneg = + link_ext_state_mapping.link_ext_substate; + break; + case ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE: + link_ext_state_info->link_training = + link_ext_state_mapping.link_ext_substate; + break; + case ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH: + link_ext_state_info->link_logical_mismatch = + link_ext_state_mapping.link_ext_substate; + break; + case ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY: + link_ext_state_info->bad_signal_integrity = + link_ext_state_mapping.link_ext_substate; + break; + case ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE: + link_ext_state_info->cable_issue = + link_ext_state_mapping.link_ext_substate; + break; + default: + break; + } + + link_ext_state_info->link_ext_state = link_ext_state_mapping.link_ext_state; +} + +static int +mlx5e_get_link_ext_state(struct net_device *dev, + struct ethtool_link_ext_state_info *link_ext_state_info) +{ + struct mlx5e_ethtool_link_ext_state_opcode_mapping link_ext_state_mapping; + struct mlx5e_priv *priv = netdev_priv(dev); + u32 
status_opcode = 0; + int i; + + /* Exit without data if the interface state is OK, since no extended data is + * available in such case + */ + if (netif_carrier_ok(dev)) + return -ENODATA; + + if (mlx5_query_port_status(priv->mdev, &status_opcode, NULL, NULL) || + !status_opcode) + return -ENODATA; + + for (i = 0; i < ARRAY_SIZE(mlx5e_link_ext_state_opcode_map); i++) { + link_ext_state_mapping = mlx5e_link_ext_state_opcode_map[i]; + if (link_ext_state_mapping.status_opcode == status_opcode) { + mlx5e_set_link_ext_state(link_ext_state_mapping, + link_ext_state_info); + return 0; + } + } + + return -ENODATA; +} + +static void mlx5e_get_eth_phy_stats(struct net_device *netdev, + struct ethtool_eth_phy_stats *phy_stats) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_stats_eth_phy_get(priv, phy_stats); +} + +static void mlx5e_get_eth_mac_stats(struct net_device *netdev, + struct ethtool_eth_mac_stats *mac_stats) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_stats_eth_mac_get(priv, mac_stats); +} + +static void mlx5e_get_eth_ctrl_stats(struct net_device *netdev, + struct ethtool_eth_ctrl_stats *ctrl_stats) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_stats_eth_ctrl_get(priv, ctrl_stats); +} + +static void mlx5e_get_rmon_stats(struct net_device *netdev, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_stats_rmon_get(priv, rmon_stats, ranges); +} + +const struct ethtool_ops mlx5e_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USE_ADAPTIVE | + ETHTOOL_COALESCE_USE_CQE, + .get_drvinfo = mlx5e_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_link_ext_state = mlx5e_get_link_ext_state, + .get_strings = mlx5e_get_strings, + .get_sset_count = mlx5e_get_sset_count, + .get_ethtool_stats = mlx5e_get_ethtool_stats, + .get_ringparam = mlx5e_get_ringparam, + .set_ringparam = mlx5e_set_ringparam, + .get_channels = mlx5e_get_channels, + .set_channels = mlx5e_set_channels, + .get_coalesce = mlx5e_get_coalesce, + .set_coalesce = mlx5e_set_coalesce, + .get_link_ksettings = mlx5e_get_link_ksettings, + .set_link_ksettings = mlx5e_set_link_ksettings, + .get_rxfh_key_size = mlx5e_get_rxfh_key_size, + .get_rxfh_indir_size = mlx5e_get_rxfh_indir_size, + .get_rxfh = mlx5e_get_rxfh, + .set_rxfh = mlx5e_set_rxfh, + .get_rxfh_context = mlx5e_get_rxfh_context, + .set_rxfh_context = mlx5e_set_rxfh_context, + .get_rxnfc = mlx5e_get_rxnfc, + .set_rxnfc = mlx5e_set_rxnfc, + .get_tunable = mlx5e_get_tunable, + .set_tunable = mlx5e_set_tunable, + .get_pause_stats = mlx5e_get_pause_stats, + .get_pauseparam = mlx5e_get_pauseparam, + .set_pauseparam = mlx5e_set_pauseparam, + .get_ts_info = mlx5e_get_ts_info, + .set_phys_id = mlx5e_set_phys_id, + .get_wol = mlx5e_get_wol, + .set_wol = mlx5e_set_wol, + .get_module_info = mlx5e_get_module_info, + .get_module_eeprom = mlx5e_get_module_eeprom, + .get_module_eeprom_by_page = mlx5e_get_module_eeprom_by_page, + .flash_device = mlx5e_flash_device, + .get_priv_flags = mlx5e_get_priv_flags, + .set_priv_flags = mlx5e_set_priv_flags, + .self_test = mlx5e_self_test, + .get_msglevel = mlx5e_get_msglevel, + .set_msglevel = mlx5e_set_msglevel, + .get_fec_stats = mlx5e_get_fec_stats, + .set_priv_flags = mlx5e_set_priv_flags, + .get_dump_flag = mlx5e_get_dump_flag, + .get_dump_data = mlx5e_get_dump_data, + .set_dump = mlx5e_set_dump, + .get_fecparam = mlx5e_get_fecparam, + 
.set_fecparam = mlx5e_set_fecparam, + .get_eth_phy_stats = mlx5e_get_eth_phy_stats, + .get_eth_mac_stats = mlx5e_get_eth_mac_stats, + .get_eth_ctrl_stats = mlx5e_get_eth_ctrl_stats, + .get_rmon_stats = mlx5e_get_rmon_stats, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c new file mode 100644 index 0000000..aeff1d9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -0,0 +1,1357 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "en.h" +#include "en_rep.h" +#include "lib/mpfs.h" +#include "en/ptp.h" + +static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai, int type); +static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai); + +enum { + MLX5E_FULLMATCH = 0, + MLX5E_ALLMULTI = 1, +}; + +enum { + MLX5E_UC = 0, + MLX5E_MC_IPV4 = 1, + MLX5E_MC_IPV6 = 2, + MLX5E_MC_OTHER = 3, +}; + +enum { + MLX5E_ACTION_NONE = 0, + MLX5E_ACTION_ADD = 1, + MLX5E_ACTION_DEL = 2, +}; + +struct mlx5e_l2_hash_node { + struct hlist_node hlist; + u8 action; + struct mlx5e_l2_rule ai; + bool mpfs; +}; + +static inline int mlx5e_hash_l2(const u8 *addr) +{ + return addr[5]; +} + +static void mlx5e_add_l2_to_hash(struct hlist_head *hash, const u8 *addr) +{ + struct mlx5e_l2_hash_node *hn; + int ix = mlx5e_hash_l2(addr); + int found = 0; + + hlist_for_each_entry(hn, &hash[ix], hlist) + if (ether_addr_equal_64bits(hn->ai.addr, addr)) { + found = 1; + break; + } + + if (found) { + hn->action = MLX5E_ACTION_NONE; + return; + } + + hn = kzalloc(sizeof(*hn), GFP_ATOMIC); + if (!hn) + return; + + ether_addr_copy(hn->ai.addr, addr); + hn->action = MLX5E_ACTION_ADD; + + hlist_add_head(&hn->hlist, &hash[ix]); +} + +static void mlx5e_del_l2_from_hash(struct mlx5e_l2_hash_node *hn) +{ + hlist_del(&hn->hlist); + kfree(hn); +} + +struct mlx5e_vlan_table { + struct mlx5e_flow_table ft; + DECLARE_BITMAP(active_cvlans, VLAN_N_VID); + DECLARE_BITMAP(active_svlans, VLAN_N_VID); + struct mlx5_flow_handle *active_cvlans_rule[VLAN_N_VID]; + struct mlx5_flow_handle *active_svlans_rule[VLAN_N_VID]; + struct mlx5_flow_handle *untagged_rule; + struct mlx5_flow_handle *any_cvlan_rule; + struct mlx5_flow_handle *any_svlan_rule; + struct mlx5_flow_handle *trap_rule; + bool cvlan_filter_disabled; +}; + +unsigned long *mlx5e_vlan_get_active_svlans(struct mlx5e_vlan_table *vlan) +{ + return vlan->active_svlans; +} + +struct mlx5_flow_table *mlx5e_vlan_get_flowtable(struct mlx5e_vlan_table *vlan) +{ + return vlan->ft.t; +} + +static int mlx5e_vport_context_update_vlans(struct mlx5e_priv *priv) +{ + struct net_device *ndev = priv->netdev; + int max_list_size; + int list_size; + u16 *vlans; + int vlan; + int err; + int i; + + list_size = 0; + for_each_set_bit(vlan, priv->fs.vlan->active_cvlans, VLAN_N_VID) + list_size++; + + max_list_size = 1 << MLX5_CAP_GEN(priv->mdev, log_max_vlan_list); + + if (list_size > max_list_size) { + netdev_warn(ndev, + "netdev vlans list size (%d) > (%d) max vport list size, some vlans will be dropped\n", + list_size, max_list_size); + list_size = max_list_size; + } + + vlans = kcalloc(list_size, sizeof(*vlans), GFP_KERNEL); + if (!vlans) + return -ENOMEM; + + i = 0; + for_each_set_bit(vlan, priv->fs.vlan->active_cvlans, VLAN_N_VID) { + if (i >= list_size) + break; + vlans[i++] = vlan; + } + + err = mlx5_modify_nic_vport_vlans(priv->mdev, vlans, list_size); + if (err) + netdev_err(ndev, "Failed to modify vport vlans list err(%d)\n", + err); + + kfree(vlans); + return err; +} + +enum mlx5e_vlan_rule_type { + MLX5E_VLAN_RULE_TYPE_UNTAGGED, + MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, + MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, + MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, + MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, +}; + +static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, + u16 vid, struct mlx5_flow_spec *spec) +{ + struct mlx5_flow_table *ft = priv->fs.vlan->ft.t; + struct mlx5_flow_destination 
dest = {}; + struct mlx5_flow_handle **rule_p; + MLX5_DECLARE_FLOW_ACT(flow_act); + int err = 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = priv->fs.l2.ft.t; + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + switch (rule_type) { + case MLX5E_VLAN_RULE_TYPE_UNTAGGED: + /* cvlan_tag enabled in match criteria and + * disabled in match value means both S & C tags + * don't exist (untagged of both) + */ + rule_p = &priv->fs.vlan->untagged_rule; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + break; + case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: + rule_p = &priv->fs.vlan->any_cvlan_rule; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1); + break; + case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: + rule_p = &priv->fs.vlan->any_svlan_rule; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.svlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.svlan_tag, 1); + break; + case MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID: + rule_p = &priv->fs.vlan->active_svlans_rule[vid]; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.svlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.svlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, + vid); + break; + default: /* MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID */ + rule_p = &priv->fs.vlan->active_cvlans_rule[vid]; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, + vid); + break; + } + + if (WARN_ONCE(*rule_p, "VLAN rule already exists type %d", rule_type)) + return 0; + + *rule_p = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + + if (IS_ERR(*rule_p)) { + err = PTR_ERR(*rule_p); + *rule_p = NULL; + netdev_err(priv->netdev, "%s: add rule failed\n", __func__); + } + + return err; +} + +static int mlx5e_add_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid) +{ + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + if (rule_type == MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID) + mlx5e_vport_context_update_vlans(priv); + + err = __mlx5e_add_vlan_rule(priv, rule_type, vid, spec); + + kvfree(spec); + + return err; +} + +static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv, + enum mlx5e_vlan_rule_type rule_type, u16 vid) +{ + switch (rule_type) { + case MLX5E_VLAN_RULE_TYPE_UNTAGGED: + if (priv->fs.vlan->untagged_rule) { + mlx5_del_flow_rules(priv->fs.vlan->untagged_rule); + priv->fs.vlan->untagged_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID: + if (priv->fs.vlan->any_cvlan_rule) { + mlx5_del_flow_rules(priv->fs.vlan->any_cvlan_rule); + priv->fs.vlan->any_cvlan_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID: + if (priv->fs.vlan->any_svlan_rule) { + mlx5_del_flow_rules(priv->fs.vlan->any_svlan_rule); + priv->fs.vlan->any_svlan_rule = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID: + if (priv->fs.vlan->active_svlans_rule[vid]) { + 
mlx5_del_flow_rules(priv->fs.vlan->active_svlans_rule[vid]); + priv->fs.vlan->active_svlans_rule[vid] = NULL; + } + break; + case MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID: + if (priv->fs.vlan->active_cvlans_rule[vid]) { + mlx5_del_flow_rules(priv->fs.vlan->active_cvlans_rule[vid]); + priv->fs.vlan->active_cvlans_rule[vid] = NULL; + } + mlx5e_vport_context_update_vlans(priv); + break; + } +} + +static void mlx5e_del_any_vid_rules(struct mlx5e_priv *priv) +{ + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); +} + +static int mlx5e_add_any_vid_rules(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); + if (err) + return err; + + return mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0); +} + +static struct mlx5_flow_handle * +mlx5e_add_trap_rule(struct mlx5_flow_table *ft, int trap_id, int tir_num) +{ + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + spec->flow_context.flags |= FLOW_CONTEXT_HAS_TAG; + spec->flow_context.flow_tag = trap_id; + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = tir_num; + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + kvfree(spec); + return rule; +} + +int mlx5e_add_vlan_trap(struct mlx5e_priv *priv, int trap_id, int tir_num) +{ + struct mlx5_flow_table *ft = priv->fs.vlan->ft.t; + struct mlx5_flow_handle *rule; + int err; + + rule = mlx5e_add_trap_rule(ft, trap_id, tir_num); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + priv->fs.vlan->trap_rule = NULL; + netdev_err(priv->netdev, "%s: add VLAN trap rule failed, err %d\n", + __func__, err); + return err; + } + priv->fs.vlan->trap_rule = rule; + return 0; +} + +void mlx5e_remove_vlan_trap(struct mlx5e_priv *priv) +{ + if (priv->fs.vlan->trap_rule) { + mlx5_del_flow_rules(priv->fs.vlan->trap_rule); + priv->fs.vlan->trap_rule = NULL; + } +} + +int mlx5e_add_mac_trap(struct mlx5e_priv *priv, int trap_id, int tir_num) +{ + struct mlx5_flow_table *ft = priv->fs.l2.ft.t; + struct mlx5_flow_handle *rule; + int err; + + rule = mlx5e_add_trap_rule(ft, trap_id, tir_num); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + priv->fs.l2.trap_rule = NULL; + netdev_err(priv->netdev, "%s: add MAC trap rule failed, err %d\n", + __func__, err); + return err; + } + priv->fs.l2.trap_rule = rule; + return 0; +} + +void mlx5e_remove_mac_trap(struct mlx5e_priv *priv) +{ + if (priv->fs.l2.trap_rule) { + mlx5_del_flow_rules(priv->fs.l2.trap_rule); + priv->fs.l2.trap_rule = NULL; + } +} + +void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv) +{ + if (!priv->fs.vlan->cvlan_filter_disabled) + return; + + priv->fs.vlan->cvlan_filter_disabled = false; + if (priv->netdev->flags & IFF_PROMISC) + return; + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); +} + +void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv) +{ + if (priv->fs.vlan->cvlan_filter_disabled) + return; + + priv->fs.vlan->cvlan_filter_disabled = true; + if (priv->netdev->flags & IFF_PROMISC) + return; + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0); +} + +static int mlx5e_vlan_rx_add_cvid(struct mlx5e_priv *priv, u16 vid) +{ + int err; + + set_bit(vid, priv->fs.vlan->active_cvlans); + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, vid); + if (err) + clear_bit(vid, 
priv->fs.vlan->active_cvlans); + + return err; +} + +static int mlx5e_vlan_rx_add_svid(struct mlx5e_priv *priv, u16 vid) +{ + struct net_device *netdev = priv->netdev; + int err; + + set_bit(vid, priv->fs.vlan->active_svlans); + + err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, vid); + if (err) { + clear_bit(vid, priv->fs.vlan->active_svlans); + return err; + } + + /* Need to fix some features.. */ + netdev_update_features(netdev); + return err; +} + +int mlx5e_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (mlx5e_is_uplink_rep(priv)) + return 0; /* no vlan table for uplink rep */ + + if (be16_to_cpu(proto) == ETH_P_8021Q) + return mlx5e_vlan_rx_add_cvid(priv, vid); + else if (be16_to_cpu(proto) == ETH_P_8021AD) + return mlx5e_vlan_rx_add_svid(priv, vid); + + return -EOPNOTSUPP; +} + +int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (mlx5e_is_uplink_rep(priv)) + return 0; /* no vlan table for uplink rep */ + + if (be16_to_cpu(proto) == ETH_P_8021Q) { + clear_bit(vid, priv->fs.vlan->active_cvlans); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, vid); + } else if (be16_to_cpu(proto) == ETH_P_8021AD) { + clear_bit(vid, priv->fs.vlan->active_svlans); + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, vid); + netdev_update_features(dev); + } + + return 0; +} + +static void mlx5e_add_vlan_rules(struct mlx5e_priv *priv) +{ + int i; + + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); + + for_each_set_bit(i, priv->fs.vlan->active_cvlans, VLAN_N_VID) { + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, i); + } + + for_each_set_bit(i, priv->fs.vlan->active_svlans, VLAN_N_VID) + mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); + + if (priv->fs.vlan->cvlan_filter_disabled) + mlx5e_add_any_vid_rules(priv); +} + +static void mlx5e_del_vlan_rules(struct mlx5e_priv *priv) +{ + int i; + + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0); + + for_each_set_bit(i, priv->fs.vlan->active_cvlans, VLAN_N_VID) { + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, i); + } + + for_each_set_bit(i, priv->fs.vlan->active_svlans, VLAN_N_VID) + mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i); + + WARN_ON_ONCE(!(test_bit(MLX5E_STATE_DESTROYING, &priv->state))); + + mlx5e_remove_vlan_trap(priv); + + /* must be called after DESTROY bit is set and + * set_rx_mode is called and flushed + */ + if (priv->fs.vlan->cvlan_filter_disabled) + mlx5e_del_any_vid_rules(priv); +} + +#define mlx5e_for_each_hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5E_L2_ADDR_HASH_SIZE; i++) \ + hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist) + +static void mlx5e_execute_l2_action(struct mlx5e_priv *priv, + struct mlx5e_l2_hash_node *hn) +{ + u8 action = hn->action; + u8 mac_addr[ETH_ALEN]; + int l2_err = 0; + + ether_addr_copy(mac_addr, hn->ai.addr); + + switch (action) { + case MLX5E_ACTION_ADD: + mlx5e_add_l2_flow_rule(priv, &hn->ai, MLX5E_FULLMATCH); + if (!is_multicast_ether_addr(mac_addr)) { + l2_err = mlx5_mpfs_add_mac(priv->mdev, mac_addr); + hn->mpfs = !l2_err; + } + hn->action = MLX5E_ACTION_NONE; + break; + + case MLX5E_ACTION_DEL: + if (!is_multicast_ether_addr(mac_addr) && hn->mpfs) + l2_err = mlx5_mpfs_del_mac(priv->mdev, mac_addr); + mlx5e_del_l2_flow_rule(priv, &hn->ai); + mlx5e_del_l2_from_hash(hn); + break; + } + + if (l2_err) + 
netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, err(%d)\n", + action == MLX5E_ACTION_ADD ? "add" : "del", mac_addr, l2_err); +} + +static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + struct netdev_hw_addr *ha; + + netif_addr_lock_bh(netdev); + + mlx5e_add_l2_to_hash(priv->fs.l2.netdev_uc, + priv->netdev->dev_addr); + + netdev_for_each_uc_addr(ha, netdev) + mlx5e_add_l2_to_hash(priv->fs.l2.netdev_uc, ha->addr); + + netdev_for_each_mc_addr(ha, netdev) + mlx5e_add_l2_to_hash(priv->fs.l2.netdev_mc, ha->addr); + + netif_addr_unlock_bh(netdev); +} + +static void mlx5e_fill_addr_array(struct mlx5e_priv *priv, int list_type, + u8 addr_array[][ETH_ALEN], int size) +{ + bool is_uc = (list_type == MLX5_NVPRT_LIST_TYPE_UC); + struct net_device *ndev = priv->netdev; + struct mlx5e_l2_hash_node *hn; + struct hlist_head *addr_list; + struct hlist_node *tmp; + int i = 0; + int hi; + + addr_list = is_uc ? priv->fs.l2.netdev_uc : priv->fs.l2.netdev_mc; + + if (is_uc) /* Make sure our own address is pushed first */ + ether_addr_copy(addr_array[i++], ndev->dev_addr); + else if (priv->fs.l2.broadcast_enabled) + ether_addr_copy(addr_array[i++], ndev->broadcast); + + mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) { + if (ether_addr_equal(ndev->dev_addr, hn->ai.addr)) + continue; + if (i >= size) + break; + ether_addr_copy(addr_array[i++], hn->ai.addr); + } +} + +static void mlx5e_vport_context_update_addr_list(struct mlx5e_priv *priv, + int list_type) +{ + bool is_uc = (list_type == MLX5_NVPRT_LIST_TYPE_UC); + struct mlx5e_l2_hash_node *hn; + u8 (*addr_array)[ETH_ALEN] = NULL; + struct hlist_head *addr_list; + struct hlist_node *tmp; + int max_size; + int size; + int err; + int hi; + + size = is_uc ? 0 : (priv->fs.l2.broadcast_enabled ? 1 : 0); + max_size = is_uc ? + 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(priv->mdev, log_max_current_mc_list); + + addr_list = is_uc ? priv->fs.l2.netdev_uc : priv->fs.l2.netdev_mc; + mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) + size++; + + if (size > max_size) { + netdev_warn(priv->netdev, + "netdev %s list size (%d) > (%d) max vport list size, some addresses will be dropped\n", + is_uc ? "UC" : "MC", size, max_size); + size = max_size; + } + + if (size) { + addr_array = kcalloc(size, ETH_ALEN, GFP_KERNEL); + if (!addr_array) { + err = -ENOMEM; + goto out; + } + mlx5e_fill_addr_array(priv, list_type, addr_array, size); + } + + err = mlx5_modify_nic_vport_mac_list(priv->mdev, list_type, addr_array, size); +out: + if (err) + netdev_err(priv->netdev, + "Failed to modify vport %s list err(%d)\n", + is_uc ? 
"UC" : "MC", err); + kfree(addr_array); +} + +static void mlx5e_vport_context_update(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_table *ea = &priv->fs.l2; + + mlx5e_vport_context_update_addr_list(priv, MLX5_NVPRT_LIST_TYPE_UC); + mlx5e_vport_context_update_addr_list(priv, MLX5_NVPRT_LIST_TYPE_MC); + mlx5_modify_nic_vport_promisc(priv->mdev, 0, + ea->allmulti_enabled, + ea->promisc_enabled); +} + +static void mlx5e_apply_netdev_addr(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_hash_node *hn; + struct hlist_node *tmp; + int i; + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i) + mlx5e_execute_l2_action(priv, hn); + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_mc, i) + mlx5e_execute_l2_action(priv, hn); +} + +static void mlx5e_handle_netdev_addr(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_hash_node *hn; + struct hlist_node *tmp; + int i; + + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i) + hn->action = MLX5E_ACTION_DEL; + mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_mc, i) + hn->action = MLX5E_ACTION_DEL; + + if (!test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5e_sync_netdev_addr(priv); + + mlx5e_apply_netdev_addr(priv); +} + +#define MLX5E_PROMISC_GROUP0_SIZE BIT(0) +#define MLX5E_PROMISC_TABLE_SIZE MLX5E_PROMISC_GROUP0_SIZE + +static int mlx5e_add_promisc_rule(struct mlx5e_priv *priv) +{ + struct mlx5_flow_table *ft = priv->fs.promisc.ft.t; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle **rule_p; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = mlx5_get_ttc_flow_table(priv->fs.ttc); + + rule_p = &priv->fs.promisc.rule; + *rule_p = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + if (IS_ERR(*rule_p)) { + err = PTR_ERR(*rule_p); + *rule_p = NULL; + netdev_err(priv->netdev, "%s: add promiscuous rule failed\n", __func__); + } + kvfree(spec); + return err; +} + +static int mlx5e_create_promisc_table(struct mlx5e_priv *priv) +{ + struct mlx5e_flow_table *ft = &priv->fs.promisc.ft; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft_attr.max_fte = MLX5E_PROMISC_TABLE_SIZE; + ft_attr.autogroup.max_num_groups = 1; + ft_attr.level = MLX5E_PROMISC_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_auto_grouped_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + netdev_err(priv->netdev, "fail to create promisc table err=%d\n", err); + return err; + } + + err = mlx5e_add_promisc_rule(priv); + if (err) + goto err_destroy_promisc_table; + + return 0; + +err_destroy_promisc_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return err; +} + +static void mlx5e_del_promisc_rule(struct mlx5e_priv *priv) +{ + if (WARN(!priv->fs.promisc.rule, "Trying to remove non-existing promiscuous rule")) + return; + mlx5_del_flow_rules(priv->fs.promisc.rule); + priv->fs.promisc.rule = NULL; +} + +static void mlx5e_destroy_promisc_table(struct mlx5e_priv *priv) +{ + if (WARN(!priv->fs.promisc.ft.t, "Trying to remove non-existing promiscuous table")) + return; + mlx5e_del_promisc_rule(priv); + mlx5_destroy_flow_table(priv->fs.promisc.ft.t); + priv->fs.promisc.ft.t = NULL; +} + +void mlx5e_set_rx_mode_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + set_rx_mode_work); + + struct mlx5e_l2_table *ea = &priv->fs.l2; + struct net_device *ndev = priv->netdev; 
+ + bool rx_mode_enable = !test_bit(MLX5E_STATE_DESTROYING, &priv->state); + bool promisc_enabled = rx_mode_enable && (ndev->flags & IFF_PROMISC); + bool allmulti_enabled = rx_mode_enable && (ndev->flags & IFF_ALLMULTI); + bool broadcast_enabled = rx_mode_enable; + + bool enable_promisc = !ea->promisc_enabled && promisc_enabled; + bool disable_promisc = ea->promisc_enabled && !promisc_enabled; + bool enable_allmulti = !ea->allmulti_enabled && allmulti_enabled; + bool disable_allmulti = ea->allmulti_enabled && !allmulti_enabled; + bool enable_broadcast = !ea->broadcast_enabled && broadcast_enabled; + bool disable_broadcast = ea->broadcast_enabled && !broadcast_enabled; + int err; + + if (enable_promisc) { + err = mlx5e_create_promisc_table(priv); + if (err) + enable_promisc = false; + if (!priv->channels.params.vlan_strip_disable && !err) + netdev_warn_once(ndev, + "S-tagged traffic will be dropped while C-tag vlan stripping is enabled\n"); + } + if (enable_allmulti) + mlx5e_add_l2_flow_rule(priv, &ea->allmulti, MLX5E_ALLMULTI); + if (enable_broadcast) + mlx5e_add_l2_flow_rule(priv, &ea->broadcast, MLX5E_FULLMATCH); + + mlx5e_handle_netdev_addr(priv); + + if (disable_broadcast) + mlx5e_del_l2_flow_rule(priv, &ea->broadcast); + if (disable_allmulti) + mlx5e_del_l2_flow_rule(priv, &ea->allmulti); + if (disable_promisc) + mlx5e_destroy_promisc_table(priv); + + ea->promisc_enabled = promisc_enabled; + ea->allmulti_enabled = allmulti_enabled; + ea->broadcast_enabled = broadcast_enabled; + + mlx5e_vport_context_update(priv); +} + +static void mlx5e_destroy_groups(struct mlx5e_flow_table *ft) +{ + int i; + + for (i = ft->num_groups - 1; i >= 0; i--) { + if (!IS_ERR_OR_NULL(ft->g[i])) + mlx5_destroy_flow_group(ft->g[i]); + ft->g[i] = NULL; + } + ft->num_groups = 0; +} + +void mlx5e_init_l2_addr(struct mlx5e_priv *priv) +{ + ether_addr_copy(priv->fs.l2.broadcast.addr, priv->netdev->broadcast); +} + +void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft) +{ + mlx5e_destroy_groups(ft); + kfree(ft->g); + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; +} + +static void mlx5e_set_inner_ttc_params(struct mlx5e_priv *priv, + struct ttc_params *ttc_params) +{ + struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; + int tt; + + memset(ttc_params, 0, sizeof(*ttc_params)); + ttc_params->ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + ft_attr->level = MLX5E_INNER_TTC_FT_LEVEL; + ft_attr->prio = MLX5E_NIC_PRIO; + + for (tt = 0; tt < MLX5_NUM_TT; tt++) { + ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR; + ttc_params->dests[tt].tir_num = + tt == MLX5_TT_ANY ? + mlx5e_rx_res_get_tirn_direct(priv->rx_res, 0) : + mlx5e_rx_res_get_tirn_rss_inner(priv->rx_res, + tt); + } +} + +void mlx5e_set_ttc_params(struct mlx5e_priv *priv, + struct ttc_params *ttc_params, bool tunnel) + +{ + struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; + int tt; + + memset(ttc_params, 0, sizeof(*ttc_params)); + ttc_params->ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + ft_attr->level = MLX5E_TTC_FT_LEVEL; + ft_attr->prio = MLX5E_NIC_PRIO; + + for (tt = 0; tt < MLX5_NUM_TT; tt++) { + ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR; + ttc_params->dests[tt].tir_num = + tt == MLX5_TT_ANY ? 
+ mlx5e_rx_res_get_tirn_direct(priv->rx_res, 0) : + mlx5e_rx_res_get_tirn_rss(priv->rx_res, tt); + } + + ttc_params->inner_ttc = tunnel; + if (!tunnel || !mlx5_tunnel_inner_ft_supported(priv->mdev)) + return; + + for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) { + ttc_params->tunnel_dests[tt].type = + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + ttc_params->tunnel_dests[tt].ft = + mlx5_get_ttc_flow_table(priv->fs.inner_ttc); + } +} + +static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai) +{ + if (!IS_ERR_OR_NULL(ai->rule)) { + mlx5_del_flow_rules(ai->rule); + ai->rule = NULL; + } +} + +static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_l2_rule *ai, int type) +{ + struct mlx5_flow_table *ft = priv->fs.l2.ft.t; + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_spec *spec; + int err = 0; + u8 *mc_dmac; + u8 *mv_dmac; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + mc_dmac = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dmac_47_16); + mv_dmac = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dmac_47_16); + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = mlx5_get_ttc_flow_table(priv->fs.ttc); + + switch (type) { + case MLX5E_FULLMATCH: + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + eth_broadcast_addr(mc_dmac); + ether_addr_copy(mv_dmac, ai->addr); + break; + + case MLX5E_ALLMULTI: + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + mc_dmac[0] = 0x01; + mv_dmac[0] = 0x01; + break; + } + + ai->rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + if (IS_ERR(ai->rule)) { + netdev_err(priv->netdev, "%s: add l2 rule(mac:%pM) failed\n", + __func__, mv_dmac); + err = PTR_ERR(ai->rule); + ai->rule = NULL; + } + + kvfree(spec); + + return err; +} + +#define MLX5E_NUM_L2_GROUPS 3 +#define MLX5E_L2_GROUP1_SIZE BIT(15) +#define MLX5E_L2_GROUP2_SIZE BIT(0) +#define MLX5E_L2_GROUP_TRAP_SIZE BIT(0) /* must be last */ +#define MLX5E_L2_TABLE_SIZE (MLX5E_L2_GROUP1_SIZE +\ + MLX5E_L2_GROUP2_SIZE +\ + MLX5E_L2_GROUP_TRAP_SIZE) +static int mlx5e_create_l2_table_groups(struct mlx5e_l2_table *l2_table) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5e_flow_table *ft = &l2_table->ft; + int ix = 0; + u8 *mc_dmac; + u32 *in; + int err; + u8 *mc; + + ft->g = kcalloc(MLX5E_NUM_L2_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) + return -ENOMEM; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + kfree(ft->g); + return -ENOMEM; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + mc_dmac = MLX5_ADDR_OF(fte_match_param, mc, + outer_headers.dmac_47_16); + /* Flow Group for full match */ + eth_broadcast_addr(mc_dmac); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_L2_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + /* Flow Group for allmulti */ + eth_zero_addr(mc_dmac); + mc_dmac[0] = 0x01; + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_L2_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + /* Flow Group for l2 traps */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, 
ix); + ix += MLX5E_L2_GROUP_TRAP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + kvfree(in); + return 0; + +err_destroy_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + kvfree(in); + kfree(ft->g); + + return err; +} + +static void mlx5e_destroy_l2_table(struct mlx5e_priv *priv) +{ + mlx5e_destroy_flow_table(&priv->fs.l2.ft); +} + +static int mlx5e_create_l2_table(struct mlx5e_priv *priv) +{ + struct mlx5e_l2_table *l2_table = &priv->fs.l2; + struct mlx5e_flow_table *ft = &l2_table->ft; + struct mlx5_flow_table_attr ft_attr = {}; + int err; + + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_L2_TABLE_SIZE; + ft_attr.level = MLX5E_L2_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) { + err = PTR_ERR(ft->t); + ft->t = NULL; + return err; + } + + err = mlx5e_create_l2_table_groups(l2_table); + if (err) + goto err_destroy_flow_table; + + return 0; + +err_destroy_flow_table: + mlx5_destroy_flow_table(ft->t); + ft->t = NULL; + + return err; +} + +#define MLX5E_NUM_VLAN_GROUPS 5 +#define MLX5E_VLAN_GROUP0_SIZE BIT(12) +#define MLX5E_VLAN_GROUP1_SIZE BIT(12) +#define MLX5E_VLAN_GROUP2_SIZE BIT(1) +#define MLX5E_VLAN_GROUP3_SIZE BIT(0) +#define MLX5E_VLAN_GROUP_TRAP_SIZE BIT(0) /* must be last */ +#define MLX5E_VLAN_TABLE_SIZE (MLX5E_VLAN_GROUP0_SIZE +\ + MLX5E_VLAN_GROUP1_SIZE +\ + MLX5E_VLAN_GROUP2_SIZE +\ + MLX5E_VLAN_GROUP3_SIZE +\ + MLX5E_VLAN_GROUP_TRAP_SIZE) + +static int __mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft, u32 *in, + int inlen) +{ + int err; + int ix = 0; + u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP0_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + 
ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5E_VLAN_GROUP_TRAP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) + goto err_destroy_groups; + ft->num_groups++; + + return 0; + +err_destroy_groups: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; + mlx5e_destroy_groups(ft); + + return err; +} + +static int mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft) +{ + u32 *in; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + err = __mlx5e_create_vlan_table_groups(ft, in, inlen); + + kvfree(in); + return err; +} + +static int mlx5e_create_vlan_table(struct mlx5e_priv *priv) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5e_flow_table *ft; + int err; + + ft = &priv->fs.vlan->ft; + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_VLAN_TABLE_SIZE; + ft_attr.level = MLX5E_VLAN_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); + if (IS_ERR(ft->t)) + return PTR_ERR(ft->t); + + ft->g = kcalloc(MLX5E_NUM_VLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL); + if (!ft->g) { + err = -ENOMEM; + goto err_destroy_vlan_table; + } + + err = mlx5e_create_vlan_table_groups(ft); + if (err) + goto err_free_g; + + mlx5e_add_vlan_rules(priv); + + return 0; + +err_free_g: + kfree(ft->g); +err_destroy_vlan_table: + mlx5_destroy_flow_table(ft->t); + + return err; +} + +static void mlx5e_destroy_vlan_table(struct mlx5e_priv *priv) +{ + mlx5e_del_vlan_rules(priv); + mlx5e_destroy_flow_table(&priv->fs.vlan->ft); +} + +static void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv) +{ + if (!mlx5_tunnel_inner_ft_supported(priv->mdev)) + return; + mlx5_destroy_ttc_table(priv->fs.inner_ttc); +} + +void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv) +{ + mlx5_destroy_ttc_table(priv->fs.ttc); +} + +static int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv) +{ + struct ttc_params ttc_params = {}; + + if (!mlx5_tunnel_inner_ft_supported(priv->mdev)) + return 0; + + mlx5e_set_inner_ttc_params(priv, &ttc_params); + priv->fs.inner_ttc = mlx5_create_inner_ttc_table(priv->mdev, + &ttc_params); + if (IS_ERR(priv->fs.inner_ttc)) + return PTR_ERR(priv->fs.inner_ttc); + return 0; +} + +int mlx5e_create_ttc_table(struct mlx5e_priv *priv) +{ + struct ttc_params ttc_params = {}; + + mlx5e_set_ttc_params(priv, &ttc_params, true); + priv->fs.ttc = mlx5_create_ttc_table(priv->mdev, &ttc_params); + if (IS_ERR(priv->fs.ttc)) + return PTR_ERR(priv->fs.ttc); + return 0; +} + +int mlx5e_create_flow_steering(struct mlx5e_priv *priv) +{ + int err; + + priv->fs.ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + + if (!priv->fs.ns) + return -EOPNOTSUPP; + + err = mlx5e_arfs_create_tables(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n", + err); + priv->netdev->hw_features &= ~NETIF_F_NTUPLE; + } + + err = mlx5e_create_inner_ttc_table(priv); + if (err) { + netdev_err(priv->netdev, + "Failed to create inner ttc table, err=%d\n", + err); + goto err_destroy_arfs_tables; + } + + err = mlx5e_create_ttc_table(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n", + err); + goto err_destroy_inner_ttc_table; + } + + err = 
mlx5e_create_l2_table(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create l2 table, err=%d\n", + err); + goto err_destroy_ttc_table; + } + + err = mlx5e_create_vlan_table(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create vlan table, err=%d\n", + err); + goto err_destroy_l2_table; + } + + err = mlx5e_ptp_alloc_rx_fs(priv); + if (err) + goto err_destory_vlan_table; + + mlx5e_ethtool_init_steering(priv); + + return 0; + +err_destory_vlan_table: + mlx5e_destroy_vlan_table(priv); +err_destroy_l2_table: + mlx5e_destroy_l2_table(priv); +err_destroy_ttc_table: + mlx5e_destroy_ttc_table(priv); +err_destroy_inner_ttc_table: + mlx5e_destroy_inner_ttc_table(priv); +err_destroy_arfs_tables: + mlx5e_arfs_destroy_tables(priv); + + return err; +} + +void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv) +{ + mlx5e_ptp_free_rx_fs(priv); + mlx5e_destroy_vlan_table(priv); + mlx5e_destroy_l2_table(priv); + mlx5e_destroy_ttc_table(priv); + mlx5e_destroy_inner_ttc_table(priv); + mlx5e_arfs_destroy_tables(priv); + mlx5e_ethtool_cleanup_steering(priv); +} + +int mlx5e_fs_init(struct mlx5e_priv *priv) +{ + priv->fs.vlan = kvzalloc(sizeof(*priv->fs.vlan), GFP_KERNEL); + if (!priv->fs.vlan) + return -ENOMEM; + return 0; +} + +void mlx5e_fs_cleanup(struct mlx5e_priv *priv) +{ + kvfree(priv->fs.vlan); + priv->fs.vlan = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c new file mode 100644 index 0000000..311fcba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c @@ -0,0 +1,1035 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "en.h" +#include "en_accel/ipsec_rxtx.h" +#include "en/params.h" +#include "en/xsk/pool.h" + +static int flow_type_to_traffic_type(u32 flow_type); + +static u32 flow_type_mask(u32 flow_type) +{ + return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS); +} + +struct mlx5e_ethtool_rule { + struct list_head list; + struct ethtool_rx_flow_spec flow_spec; + struct mlx5_flow_handle *rule; + struct mlx5e_ethtool_table *eth_ft; + struct mlx5e_rss *rss; +}; + +static void put_flow_table(struct mlx5e_ethtool_table *eth_ft) +{ + if (!--eth_ft->num_rules) { + mlx5_destroy_flow_table(eth_ft->ft); + eth_ft->ft = NULL; + } +} + +#define MLX5E_ETHTOOL_L3_L4_PRIO 0 +#define MLX5E_ETHTOOL_L2_PRIO (MLX5E_ETHTOOL_L3_L4_PRIO + ETHTOOL_NUM_L3_L4_FTS) +#define MLX5E_ETHTOOL_NUM_ENTRIES 64000 +#define MLX5E_ETHTOOL_NUM_GROUPS 10 +static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs, + int num_tuples) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5e_ethtool_table *eth_ft; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + int max_tuples; + int table_size; + int prio; + + switch (flow_type_mask(fs->flow_type)) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: +#ifdef CONFIG_MLX5_EN_IPSEC + case ESP_V4_FLOW: + case ESP_V6_FLOW: +#endif + case TCP_V6_FLOW: + case UDP_V6_FLOW: + max_tuples = ETHTOOL_NUM_L3_L4_FTS; + prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); + eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; + break; + case IP_USER_FLOW: + case IPV6_USER_FLOW: + max_tuples = ETHTOOL_NUM_L3_L4_FTS; + prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples); + eth_ft = &priv->fs.ethtool.l3_l4_ft[prio]; + break; + case ETHER_FLOW: + max_tuples = ETHTOOL_NUM_L2_FTS; + prio = max_tuples - num_tuples; + eth_ft = &priv->fs.ethtool.l2_ft[prio]; + prio += MLX5E_ETHTOOL_L2_PRIO; + break; + default: + return ERR_PTR(-EINVAL); + } + + eth_ft->num_rules++; + if (eth_ft->ft) + return eth_ft; + + ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_ETHTOOL); + if (!ns) + return ERR_PTR(-EOPNOTSUPP); + + table_size = min_t(u32, BIT(MLX5_CAP_FLOWTABLE(priv->mdev, + flow_table_properties_nic_receive.log_max_ft_size)), + MLX5E_ETHTOOL_NUM_ENTRIES); + + ft_attr.prio = prio; + ft_attr.max_fte = table_size; + ft_attr.autogroup.max_num_groups = MLX5E_ETHTOOL_NUM_GROUPS; + ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) + return (void *)ft; + + eth_ft->ft = ft; + return eth_ft; +} + +static void mask_spec(u8 *mask, u8 *val, size_t size) +{ + unsigned int i; + + for (i = 0; i < size; i++, mask++, val++) + *((u8 *)val) = *((u8 *)mask) & *((u8 *)val); +} + +#define MLX5E_FTE_SET(header_p, fld, v) \ + MLX5_SET(fte_match_set_lyr_2_4, header_p, fld, v) + +#define MLX5E_FTE_ADDR_OF(header_p, fld) \ + MLX5_ADDR_OF(fte_match_set_lyr_2_4, header_p, fld) + +static void +set_ip4(void *headers_c, void *headers_v, __be32 ip4src_m, + __be32 ip4src_v, __be32 ip4dst_m, __be32 ip4dst_v) +{ + if (ip4src_m) { + memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ip4src_v, sizeof(ip4src_v)); + memcpy(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ip4src_m, sizeof(ip4src_m)); + } + if (ip4dst_m) { + memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ip4dst_v, sizeof(ip4dst_v)); + memcpy(MLX5E_FTE_ADDR_OF(headers_c, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ip4dst_m, sizeof(ip4dst_m)); + } + + MLX5E_FTE_SET(headers_c, ethertype, 0xffff); + 
MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IP); +} + +static void +set_ip6(void *headers_c, void *headers_v, __be32 ip6src_m[4], + __be32 ip6src_v[4], __be32 ip6dst_m[4], __be32 ip6dst_v[4]) +{ + u8 ip6_sz = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6); + + if (!ipv6_addr_any((struct in6_addr *)ip6src_m)) { + memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv6_layout.ipv6), + ip6src_v, ip6_sz); + memcpy(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6), + ip6src_m, ip6_sz); + } + if (!ipv6_addr_any((struct in6_addr *)ip6dst_m)) { + memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + ip6dst_v, ip6_sz); + memcpy(MLX5E_FTE_ADDR_OF(headers_c, dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + ip6dst_m, ip6_sz); + } + + MLX5E_FTE_SET(headers_c, ethertype, 0xffff); + MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IPV6); +} + +static void +set_tcp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v, + __be16 pdst_m, __be16 pdst_v) +{ + if (psrc_m) { + MLX5E_FTE_SET(headers_c, tcp_sport, ntohs(psrc_m)); + MLX5E_FTE_SET(headers_v, tcp_sport, ntohs(psrc_v)); + } + if (pdst_m) { + MLX5E_FTE_SET(headers_c, tcp_dport, ntohs(pdst_m)); + MLX5E_FTE_SET(headers_v, tcp_dport, ntohs(pdst_v)); + } + + MLX5E_FTE_SET(headers_c, ip_protocol, 0xffff); + MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_TCP); +} + +static void +set_udp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v, + __be16 pdst_m, __be16 pdst_v) +{ + if (psrc_m) { + MLX5E_FTE_SET(headers_c, udp_sport, ntohs(psrc_m)); + MLX5E_FTE_SET(headers_v, udp_sport, ntohs(psrc_v)); + } + + if (pdst_m) { + MLX5E_FTE_SET(headers_c, udp_dport, ntohs(pdst_m)); + MLX5E_FTE_SET(headers_v, udp_dport, ntohs(pdst_v)); + } + + MLX5E_FTE_SET(headers_c, ip_protocol, 0xffff); + MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_UDP); +} + +static void +parse_tcp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec; + struct ethtool_tcpip4_spec *l4_val = &fs->h_u.tcp_ip4_spec; + + set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src, + l4_mask->ip4dst, l4_val->ip4dst); + + set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + +static void +parse_udp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.udp_ip4_spec; + struct ethtool_tcpip4_spec *l4_val = &fs->h_u.udp_ip4_spec; + + set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src, + l4_mask->ip4dst, l4_val->ip4dst); + + set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + +static void +parse_ip4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec; + struct ethtool_usrip4_spec *l3_val = &fs->h_u.usr_ip4_spec; + + set_ip4(headers_c, headers_v, l3_mask->ip4src, l3_val->ip4src, + l3_mask->ip4dst, l3_val->ip4dst); + + if (l3_mask->proto) { + MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->proto); + MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->proto); + } +} + +static void +parse_ip6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec; + struct ethtool_usrip6_spec *l3_val = &fs->h_u.usr_ip6_spec; + + set_ip6(headers_c, headers_v, l3_mask->ip6src, + l3_val->ip6src, l3_mask->ip6dst, l3_val->ip6dst); + + if (l3_mask->l4_proto) { + MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->l4_proto); + 
MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->l4_proto); + } +} + +static void +parse_tcp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.tcp_ip6_spec; + struct ethtool_tcpip6_spec *l4_val = &fs->h_u.tcp_ip6_spec; + + set_ip6(headers_c, headers_v, l4_mask->ip6src, + l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst); + + set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + +static void +parse_udp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.udp_ip6_spec; + struct ethtool_tcpip6_spec *l4_val = &fs->h_u.udp_ip6_spec; + + set_ip6(headers_c, headers_v, l4_mask->ip6src, + l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst); + + set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc, + l4_mask->pdst, l4_val->pdst); +} + +static void +parse_ether(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs) +{ + struct ethhdr *eth_mask = &fs->m_u.ether_spec; + struct ethhdr *eth_val = &fs->h_u.ether_spec; + + mask_spec((u8 *)eth_mask, (u8 *)eth_val, sizeof(*eth_mask)); + ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, smac_47_16), eth_mask->h_source); + ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, smac_47_16), eth_val->h_source); + ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, dmac_47_16), eth_mask->h_dest); + ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, dmac_47_16), eth_val->h_dest); + MLX5E_FTE_SET(headers_c, ethertype, ntohs(eth_mask->h_proto)); + MLX5E_FTE_SET(headers_v, ethertype, ntohs(eth_val->h_proto)); +} + +static void +set_cvlan(void *headers_c, void *headers_v, __be16 vlan_tci) +{ + MLX5E_FTE_SET(headers_c, cvlan_tag, 1); + MLX5E_FTE_SET(headers_v, cvlan_tag, 1); + MLX5E_FTE_SET(headers_c, first_vid, 0xfff); + MLX5E_FTE_SET(headers_v, first_vid, ntohs(vlan_tci)); +} + +static void +set_dmac(void *headers_c, void *headers_v, + unsigned char m_dest[ETH_ALEN], unsigned char v_dest[ETH_ALEN]) +{ + ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, dmac_47_16), m_dest); + ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, dmac_47_16), v_dest); +} + +static int set_flow_attrs(u32 *match_c, u32 *match_v, + struct ethtool_rx_flow_spec *fs, struct mlx5e_priv *priv) +{ + void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + u32 flow_type = flow_type_mask(fs->flow_type); + +#ifdef CONFIG_MLX5_EN_IPSEC + int err; +#endif + switch (flow_type) { + case TCP_V4_FLOW: + parse_tcp4(outer_headers_c, outer_headers_v, fs); + break; + case UDP_V4_FLOW: + parse_udp4(outer_headers_c, outer_headers_v, fs); + break; +#ifdef CONFIG_MLX5_EN_IPSEC + case ESP_V4_FLOW: + case ESP_V6_FLOW: + err = mlx5e_ipsec_set_flow_attrs(priv, match_c, match_v, fs); + if (err) + return err; + break; +#endif + case IP_USER_FLOW: + parse_ip4(outer_headers_c, outer_headers_v, fs); + break; + case TCP_V6_FLOW: + parse_tcp6(outer_headers_c, outer_headers_v, fs); + break; + case UDP_V6_FLOW: + parse_udp6(outer_headers_c, outer_headers_v, fs); + break; + case IPV6_USER_FLOW: + parse_ip6(outer_headers_c, outer_headers_v, fs); + break; + case ETHER_FLOW: + parse_ether(outer_headers_c, outer_headers_v, fs); + break; + default: + return -EINVAL; + } + + if ((fs->flow_type & FLOW_EXT) && + (fs->m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK))) + set_cvlan(outer_headers_c, outer_headers_v, fs->h_ext.vlan_tci); + + if (fs->flow_type & FLOW_MAC_EXT && + 
!is_zero_ether_addr(fs->m_ext.h_dest)) { + mask_spec(fs->m_ext.h_dest, fs->h_ext.h_dest, ETH_ALEN); + set_dmac(outer_headers_c, outer_headers_v, fs->m_ext.h_dest, + fs->h_ext.h_dest); + } + + return 0; +} + +static void add_rule_to_list(struct mlx5e_priv *priv, + struct mlx5e_ethtool_rule *rule) +{ + struct mlx5e_ethtool_rule *iter; + struct list_head *head = &priv->fs.ethtool.rules; + + list_for_each_entry(iter, &priv->fs.ethtool.rules, list) { + if (iter->flow_spec.location > rule->flow_spec.location) + break; + head = &iter->list; + } + priv->fs.ethtool.tot_num_rules++; + list_add(&rule->list, head); +} + +static bool outer_header_zero(u32 *match_criteria) +{ + int size = MLX5_FLD_SZ_BYTES(fte_match_param, outer_headers); + char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria, + outer_headers); + + return outer_headers_c[0] == 0 && !memcmp(outer_headers_c, + outer_headers_c + 1, + size - 1); +} + +static int flow_get_tirn(struct mlx5e_priv *priv, + struct mlx5e_ethtool_rule *eth_rule, + struct ethtool_rx_flow_spec *fs, + u32 rss_context, u32 *tirn) +{ + if (fs->flow_type & FLOW_RSS) { + struct mlx5e_packet_merge_param pkt_merge_param; + struct mlx5e_rss *rss; + u32 flow_type; + int err; + int tt; + + rss = mlx5e_rx_res_rss_get(priv->rx_res, rss_context); + if (!rss) + return -ENOENT; + + flow_type = flow_type_mask(fs->flow_type); + tt = flow_type_to_traffic_type(flow_type); + if (tt < 0) + return -EINVAL; + + pkt_merge_param = priv->channels.params.packet_merge; + err = mlx5e_rss_obtain_tirn(rss, tt, &pkt_merge_param, false, tirn); + if (err) + return err; + eth_rule->rss = rss; + mlx5e_rss_refcnt_inc(eth_rule->rss); + } else { + struct mlx5e_params *params = &priv->channels.params; + enum mlx5e_rq_group group; + u16 ix; + + mlx5e_qid_get_ch_and_group(params, fs->ring_cookie, &ix, &group); + + *tirn = group == MLX5E_RQ_GROUP_XSK ? 
+ mlx5e_rx_res_get_tirn_xsk(priv->rx_res, ix) : + mlx5e_rx_res_get_tirn_direct(priv->rx_res, ix); + } + + return 0; +} + +static bool misc_param_zero(u32 *match_criteria) +{ + int size = MLX5_FLD_SZ_BYTES(fte_match_param, misc_parameters); + char *misc_param_c = MLX5_ADDR_OF(fte_match_param, match_criteria, + misc_parameters); + + return misc_param_c[0] == 0 && !memcmp(misc_param_c, misc_param_c + 1, size - 1); +} + +static struct mlx5_flow_handle * +add_ethtool_flow_rule(struct mlx5e_priv *priv, + struct mlx5e_ethtool_rule *eth_rule, + struct mlx5_flow_table *ft, + struct ethtool_rx_flow_spec *fs, u32 rss_context) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND }; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_flow_handle *rule = NULL; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + err = set_flow_attrs(spec->match_criteria, spec->match_value, + fs, priv); + if (err) + goto free; + + if (fs->ring_cookie == RX_CLS_FLOW_DISC) { + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + } else { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) { + err = -ENOMEM; + goto free; + } + + err = flow_get_tirn(priv, eth_rule, fs, rss_context, &dst->tir_num); + if (err) + goto free; + + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } + + spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria)); + if (!outer_header_zero(spec->match_criteria)) + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + if (!misc_param_zero(spec->match_criteria)) + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + spec->flow_context.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, dst ? 1 : 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n", + __func__, err); + goto free; + } +free: + kvfree(spec); + kfree(dst); + return err ? 
ERR_PTR(err) : rule; +} + +static void del_ethtool_rule(struct mlx5e_priv *priv, + struct mlx5e_ethtool_rule *eth_rule) +{ + if (eth_rule->rule) + mlx5_del_flow_rules(eth_rule->rule); + if (eth_rule->rss) + mlx5e_rss_refcnt_dec(eth_rule->rss); + list_del(ð_rule->list); + priv->fs.ethtool.tot_num_rules--; + put_flow_table(eth_rule->eth_ft); + kfree(eth_rule); +} + +static struct mlx5e_ethtool_rule *find_ethtool_rule(struct mlx5e_priv *priv, + int location) +{ + struct mlx5e_ethtool_rule *iter; + + list_for_each_entry(iter, &priv->fs.ethtool.rules, list) { + if (iter->flow_spec.location == location) + return iter; + } + return NULL; +} + +static struct mlx5e_ethtool_rule *get_ethtool_rule(struct mlx5e_priv *priv, + int location) +{ + struct mlx5e_ethtool_rule *eth_rule; + + eth_rule = find_ethtool_rule(priv, location); + if (eth_rule) + del_ethtool_rule(priv, eth_rule); + + eth_rule = kzalloc(sizeof(*eth_rule), GFP_KERNEL); + if (!eth_rule) + return ERR_PTR(-ENOMEM); + + add_rule_to_list(priv, eth_rule); + return eth_rule; +} + +#define MAX_NUM_OF_ETHTOOL_RULES BIT(10) + +#define all_ones(field) (field == (__force typeof(field))-1) +#define all_zeros_or_all_ones(field) \ + ((field) == 0 || (field) == (__force typeof(field))-1) + +static int validate_ethter(struct ethtool_rx_flow_spec *fs) +{ + struct ethhdr *eth_mask = &fs->m_u.ether_spec; + int ntuples = 0; + + if (!is_zero_ether_addr(eth_mask->h_dest)) + ntuples++; + if (!is_zero_ether_addr(eth_mask->h_source)) + ntuples++; + if (eth_mask->h_proto) + ntuples++; + return ntuples; +} + +static int validate_tcpudp4(struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec; + int ntuples = 0; + + if (l4_mask->tos) + return -EINVAL; + + if (l4_mask->ip4src) + ntuples++; + if (l4_mask->ip4dst) + ntuples++; + if (l4_mask->psrc) + ntuples++; + if (l4_mask->pdst) + ntuples++; + /* Flow is TCP/UDP */ + return ++ntuples; +} + +static int validate_ip4(struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec; + int ntuples = 0; + + if (l3_mask->l4_4_bytes || l3_mask->tos || + fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4) + return -EINVAL; + if (l3_mask->ip4src) + ntuples++; + if (l3_mask->ip4dst) + ntuples++; + if (l3_mask->proto) + ntuples++; + /* Flow is IPv4 */ + return ++ntuples; +} + +static int validate_ip6(struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec; + int ntuples = 0; + + if (l3_mask->l4_4_bytes || l3_mask->tclass) + return -EINVAL; + if (!ipv6_addr_any((struct in6_addr *)l3_mask->ip6src)) + ntuples++; + + if (!ipv6_addr_any((struct in6_addr *)l3_mask->ip6dst)) + ntuples++; + if (l3_mask->l4_proto) + ntuples++; + /* Flow is IPv6 */ + return ++ntuples; +} + +static int validate_tcpudp6(struct ethtool_rx_flow_spec *fs) +{ + struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.tcp_ip6_spec; + int ntuples = 0; + + if (l4_mask->tclass) + return -EINVAL; + + if (!ipv6_addr_any((struct in6_addr *)l4_mask->ip6src)) + ntuples++; + + if (!ipv6_addr_any((struct in6_addr *)l4_mask->ip6dst)) + ntuples++; + + if (l4_mask->psrc) + ntuples++; + if (l4_mask->pdst) + ntuples++; + /* Flow is TCP/UDP */ + return ++ntuples; +} + +static int validate_vlan(struct ethtool_rx_flow_spec *fs) +{ + if (fs->m_ext.vlan_etype || + fs->m_ext.vlan_tci != cpu_to_be16(VLAN_VID_MASK)) + return -EINVAL; + + if (fs->m_ext.vlan_tci && + (be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID)) + return -EINVAL; + + return 1; +} + +static int validate_flow(struct 
mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs) +{ +#ifdef CONFIG_MLX5_EN_IPSEC + struct ethtool_ah_espip4_spec *ipsec4_mask; +#endif + int num_tuples = 0; + int ret = 0; + + if (fs->location >= MAX_NUM_OF_ETHTOOL_RULES) + return -ENOSPC; + + if (fs->ring_cookie != RX_CLS_FLOW_DISC) + if (!mlx5e_qid_validate(priv->profile, &priv->channels.params, + fs->ring_cookie)) + return -EINVAL; + + switch (flow_type_mask(fs->flow_type)) { + case ETHER_FLOW: + num_tuples += validate_ethter(fs); + break; + case TCP_V4_FLOW: + case UDP_V4_FLOW: + ret = validate_tcpudp4(fs); + if (ret < 0) + return ret; + num_tuples += ret; + break; +#ifdef CONFIG_MLX5_EN_IPSEC + case ESP_V4_FLOW: + case ESP_V6_FLOW: + if (fs->m_u.esp_ip4_spec.tos) + return -EINVAL; + ipsec4_mask = &fs->m_u.esp_ip4_spec; + if (!all_ones(ipsec4_mask->ip4src)) + return -EINVAL; + if (!all_ones(ipsec4_mask->ip4dst)) + return -EINVAL; + if (!all_ones(ipsec4_mask->spi)) + return -EINVAL; + /* Flow is ESP, match only on PET offloaded traffic */ + num_tuples++; + break; +#endif + case IP_USER_FLOW: + ret = validate_ip4(fs); + if (ret < 0) + return ret; + num_tuples += ret; + break; + case TCP_V6_FLOW: + case UDP_V6_FLOW: + ret = validate_tcpudp6(fs); + if (ret < 0) + return ret; + num_tuples += ret; + break; + case IPV6_USER_FLOW: + ret = validate_ip6(fs); + if (ret < 0) + return ret; + num_tuples += ret; + break; + default: + return -ENOTSUPP; + } + if ((fs->flow_type & FLOW_EXT)) { + ret = validate_vlan(fs); + if (ret < 0) + return ret; + num_tuples += ret; + } + + if (fs->flow_type & FLOW_MAC_EXT && + !is_zero_ether_addr(fs->m_ext.h_dest)) + num_tuples++; + + return num_tuples; +} + +static int +mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv, + struct ethtool_rx_flow_spec *fs, u32 rss_context) +{ + struct mlx5e_ethtool_table *eth_ft; + struct mlx5e_ethtool_rule *eth_rule; + struct mlx5_flow_handle *rule; + int num_tuples; + int err; + + num_tuples = validate_flow(priv, fs); + if (num_tuples <= 0) { + netdev_warn(priv->netdev, "%s: flow is not valid %d\n", + __func__, num_tuples); + return num_tuples; + } + + eth_ft = get_flow_table(priv, fs, num_tuples); + if (IS_ERR(eth_ft)) + return PTR_ERR(eth_ft); + + eth_rule = get_ethtool_rule(priv, fs->location); + if (IS_ERR(eth_rule)) { + put_flow_table(eth_ft); + return PTR_ERR(eth_rule); + } + + eth_rule->flow_spec = *fs; + eth_rule->eth_ft = eth_ft; + if (!eth_ft->ft) { + err = -EINVAL; + goto del_ethtool_rule; + } + rule = add_ethtool_flow_rule(priv, eth_rule, eth_ft->ft, fs, rss_context); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto del_ethtool_rule; + } + + eth_rule->rule = rule; + + return 0; + +del_ethtool_rule: + del_ethtool_rule(priv, eth_rule); + + return err; +} + +static int +mlx5e_ethtool_flow_remove(struct mlx5e_priv *priv, int location) +{ + struct mlx5e_ethtool_rule *eth_rule; + int err = 0; + + if (location >= MAX_NUM_OF_ETHTOOL_RULES) + return -ENOSPC; + + eth_rule = find_ethtool_rule(priv, location); + if (!eth_rule) { + err = -ENOENT; + goto out; + } + + del_ethtool_rule(priv, eth_rule); +out: + return err; +} + +static int +mlx5e_ethtool_get_flow(struct mlx5e_priv *priv, + struct ethtool_rxnfc *info, int location) +{ + struct mlx5e_ethtool_rule *eth_rule; + + if (location < 0 || location >= MAX_NUM_OF_ETHTOOL_RULES) + return -EINVAL; + + list_for_each_entry(eth_rule, &priv->fs.ethtool.rules, list) { + int index; + + if (eth_rule->flow_spec.location != location) + continue; + if (!info) + return 0; + info->fs = eth_rule->flow_spec; + if (!eth_rule->rss) + return 
0; + index = mlx5e_rx_res_rss_index(priv->rx_res, eth_rule->rss); + if (index < 0) + return index; + info->rss_context = index; + return 0; + } + + return -ENOENT; +} + +static int +mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv, + struct ethtool_rxnfc *info, u32 *rule_locs) +{ + int location = 0; + int idx = 0; + int err = 0; + + info->data = MAX_NUM_OF_ETHTOOL_RULES; + while ((!err || err == -ENOENT) && idx < info->rule_cnt) { + err = mlx5e_ethtool_get_flow(priv, NULL, location); + if (!err) + rule_locs[idx++] = location; + location++; + } + return err; +} + +void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv) +{ + struct mlx5e_ethtool_rule *iter; + struct mlx5e_ethtool_rule *temp; + + list_for_each_entry_safe(iter, temp, &priv->fs.ethtool.rules, list) + del_ethtool_rule(priv, iter); +} + +void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv) +{ + INIT_LIST_HEAD(&priv->fs.ethtool.rules); +} + +static int flow_type_to_traffic_type(u32 flow_type) +{ + switch (flow_type) { + case TCP_V4_FLOW: + return MLX5_TT_IPV4_TCP; + case TCP_V6_FLOW: + return MLX5_TT_IPV6_TCP; + case UDP_V4_FLOW: + return MLX5_TT_IPV4_UDP; + case UDP_V6_FLOW: + return MLX5_TT_IPV6_UDP; + case AH_V4_FLOW: + return MLX5_TT_IPV4_IPSEC_AH; + case AH_V6_FLOW: + return MLX5_TT_IPV6_IPSEC_AH; + case ESP_V4_FLOW: + return MLX5_TT_IPV4_IPSEC_ESP; + case ESP_V6_FLOW: + return MLX5_TT_IPV6_IPSEC_ESP; + case IPV4_FLOW: + return MLX5_TT_IPV4; + case IPV6_FLOW: + return MLX5_TT_IPV6; + default: + return -EINVAL; + } +} + +static int mlx5e_set_rss_hash_opt(struct mlx5e_priv *priv, + struct ethtool_rxnfc *nfc) +{ + u8 rx_hash_field = 0; + int err; + int tt; + + tt = flow_type_to_traffic_type(nfc->flow_type); + if (tt < 0) + return tt; + + /* RSS does not support anything other than hashing to queues + * on src IP, dest IP, TCP/UDP src port and TCP/UDP dest + * port. 
+ */ + if (nfc->flow_type != TCP_V4_FLOW && + nfc->flow_type != TCP_V6_FLOW && + nfc->flow_type != UDP_V4_FLOW && + nfc->flow_type != UDP_V6_FLOW) + return -EOPNOTSUPP; + + if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) + return -EOPNOTSUPP; + + if (nfc->data & RXH_IP_SRC) + rx_hash_field |= MLX5_HASH_FIELD_SEL_SRC_IP; + if (nfc->data & RXH_IP_DST) + rx_hash_field |= MLX5_HASH_FIELD_SEL_DST_IP; + if (nfc->data & RXH_L4_B_0_1) + rx_hash_field |= MLX5_HASH_FIELD_SEL_L4_SPORT; + if (nfc->data & RXH_L4_B_2_3) + rx_hash_field |= MLX5_HASH_FIELD_SEL_L4_DPORT; + + mutex_lock(&priv->state_lock); + err = mlx5e_rx_res_rss_set_hash_fields(priv->rx_res, tt, rx_hash_field); + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_get_rss_hash_opt(struct mlx5e_priv *priv, + struct ethtool_rxnfc *nfc) +{ + u32 hash_field = 0; + int tt; + + tt = flow_type_to_traffic_type(nfc->flow_type); + if (tt < 0) + return tt; + + hash_field = mlx5e_rx_res_rss_get_hash_fields(priv->rx_res, tt); + nfc->data = 0; + + if (hash_field & MLX5_HASH_FIELD_SEL_SRC_IP) + nfc->data |= RXH_IP_SRC; + if (hash_field & MLX5_HASH_FIELD_SEL_DST_IP) + nfc->data |= RXH_IP_DST; + if (hash_field & MLX5_HASH_FIELD_SEL_L4_SPORT) + nfc->data |= RXH_L4_B_0_1; + if (hash_field & MLX5_HASH_FIELD_SEL_L4_DPORT) + nfc->data |= RXH_L4_B_2_3; + + return 0; +} + +int mlx5e_ethtool_set_rxnfc(struct mlx5e_priv *priv, struct ethtool_rxnfc *cmd) +{ + int err = 0; + + switch (cmd->cmd) { + case ETHTOOL_SRXCLSRLINS: + err = mlx5e_ethtool_flow_replace(priv, &cmd->fs, cmd->rss_context); + break; + case ETHTOOL_SRXCLSRLDEL: + err = mlx5e_ethtool_flow_remove(priv, cmd->fs.location); + break; + case ETHTOOL_SRXFH: + err = mlx5e_set_rss_hash_opt(priv, cmd); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +int mlx5e_ethtool_get_rxnfc(struct mlx5e_priv *priv, + struct ethtool_rxnfc *info, u32 *rule_locs) +{ + int err = 0; + + switch (info->cmd) { + case ETHTOOL_GRXCLSRLCNT: + info->rule_cnt = priv->fs.ethtool.tot_num_rules; + break; + case ETHTOOL_GRXCLSRULE: + err = mlx5e_ethtool_get_flow(priv, info, info->fs.location); + break; + case ETHTOOL_GRXCLSRLALL: + err = mlx5e_ethtool_get_all_flows(priv, info, rule_locs); + break; + case ETHTOOL_GRXFH: + err = mlx5e_get_rss_hash_opt(priv, info); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_main.c new file mode 100644 index 0000000..6ed073a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -0,0 +1,6269 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "eswitch.h" +#include "en.h" +#include "en/txrx.h" +#include "en_tc.h" +#include "en_rep.h" +#include "en_accel/ipsec.h" +#include "en_accel/macsec.h" +#include "en_accel/en_accel.h" +#include "en_accel/tls.h" +#include "accel/ipsec.h" +#include "accel/tls.h" +#include "lib/vxlan.h" +#include "lib/clock.h" +#include "en/port.h" +#include "en/xdp.h" +#include "lib/eq.h" +#include "en/monitor_stats.h" +#include "en/health.h" +#include "en/params.h" +#include "en/xsk/pool.h" +#include "en/xsk/setup.h" +#include "en/xsk/rx.h" +#include "en/xsk/tx.h" +#include "en/hv_vhca_stats.h" +#include "en/devlink.h" +#include "lib/mlx5.h" +#include "en/ptp.h" +#include "qos.h" +#include "en/trap.h" +#include "fpga/ipsec.h" + +bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) +{ + bool striding_rq_umr, inline_umr; + u16 max_wqe_sz_cap; + + striding_rq_umr = MLX5_CAP_GEN(mdev, striding_rq) && MLX5_CAP_GEN(mdev, umr_ptr_rlky) && + MLX5_CAP_ETH(mdev, reg_umr_sq); + max_wqe_sz_cap = mlx5e_get_max_sq_aligned_wqebbs(mdev) * MLX5_SEND_WQE_BB; + inline_umr = max_wqe_sz_cap >= MLX5E_UMR_WQE_INLINE_SZ; + if (!striding_rq_umr) + return false; + if (!inline_umr) { + mlx5_core_warn(mdev, "Cannot support Striding RQ: UMR WQE size (%d) exceeds maximum supported (%d).\n", + (int)MLX5E_UMR_WQE_INLINE_SZ, max_wqe_sz_cap); + return false; + } + return true; +} + +void mlx5e_update_carrier(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u8 port_state; + bool up; + + port_state = mlx5_query_vport_state(mdev, + MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, + 0); + + up = port_state == VPORT_STATE_UP; + if (up == netif_carrier_ok(priv->netdev)) + netif_carrier_event(priv->netdev); + if (up) { + netdev_info(priv->netdev, "Link up\n"); + netif_carrier_on(priv->netdev); + } else { + netdev_info(priv->netdev, "Link down\n"); + netif_carrier_off(priv->netdev); + } +} + +static void mlx5e_update_carrier_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + update_carrier_work); + + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + if (priv->profile->update_carrier) + priv->profile->update_carrier(priv); + mutex_unlock(&priv->state_lock); +} + +static void mlx5e_update_stats_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + update_stats_work); + + mutex_lock(&priv->state_lock); + priv->profile->update_stats(priv); + mutex_unlock(&priv->state_lock); +} + +static int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec) +{ + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {}; + + MLX5_SET(set_delay_drop_params_in, in, 
opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec_in(dev, set_delay_drop_params, in); +} + +static void mlx5e_delay_drop_handler(struct work_struct *work) +{ + struct mlx5e_delay_drop *delay_drop = + container_of(work, struct mlx5e_delay_drop, work); + struct mlx5e_priv *priv = container_of(delay_drop, struct mlx5e_priv, + delay_drop); + int err; + + mutex_lock(&delay_drop->lock); + err = mlx5_core_set_delay_drop(priv->mdev, + delay_drop->usec_timeout); + if (err) { + mlx5_core_warn(priv->mdev, "Failed to enable delay drop err=%d\n", + err); + delay_drop->activate = false; + } + mutex_unlock(&delay_drop->lock); +} + +void mlx5e_queue_update_stats(struct mlx5e_priv *priv) +{ + if (!priv->profile->update_stats) + return; + + if (unlikely(test_bit(MLX5E_STATE_DESTROYING, &priv->state))) + return; + + queue_work(priv->wq, &priv->update_stats_work); +} + +static int async_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, events_nb); + struct mlx5_eqe *eqe = data; + + if (event != MLX5_EVENT_TYPE_PORT_CHANGE && + event != MLX5_EVENT_TYPE_GENERAL_EVENT) + return NOTIFY_DONE; + + switch (event) { + case MLX5_EVENT_TYPE_PORT_CHANGE: + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + queue_work(priv->wq, &priv->update_carrier_work); + break; + default: + return NOTIFY_DONE; + } + break; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT: + if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_DROPLESS_RQ)) + queue_work(priv->wq, &priv->delay_drop.work); + break; + default: + return NOTIFY_DONE; + } + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static void mlx5e_enable_async_events(struct mlx5e_priv *priv) +{ + priv->events_nb.notifier_call = async_event; + mlx5_notifier_register(priv->mdev, &priv->events_nb); +} + +static void mlx5e_disable_async_events(struct mlx5e_priv *priv) +{ + mlx5_notifier_unregister(priv->mdev, &priv->events_nb); +} + +static int blocking_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, blocking_events_nb); + int err; + + switch (event) { + case MLX5_DRIVER_EVENT_TYPE_TRAP: + err = mlx5e_handle_trap_event(priv, data); + break; + default: + netdev_warn(priv->netdev, "Sync event: Unknown event %ld\n", event); + err = -EINVAL; + } + return err; +} + +static void mlx5e_enable_blocking_events(struct mlx5e_priv *priv) +{ + priv->blocking_events_nb.notifier_call = blocking_event; + mlx5_blocking_notifier_register(priv->mdev, &priv->blocking_events_nb); +} + +static void mlx5e_disable_blocking_events(struct mlx5e_priv *priv) +{ + mlx5_blocking_notifier_unregister(priv->mdev, &priv->blocking_events_nb); +} + +static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, + struct mlx5e_icosq *sq, + struct mlx5e_umr_wqe *wqe) +{ + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; + u8 ds_cnt = DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_DS); + + cseg->qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | + ds_cnt); + cseg->umr_mkey = rq->mkey_be; + + ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE; + ucseg->xlt_octowords = + cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE)); + ucseg->mkey_mask = 
cpu_to_be64(MLX5_MKEY_MASK_FREE); +} + +static int mlx5e_rq_shampo_hd_alloc(struct mlx5e_rq *rq, int node) +{ + rq->mpwqe.shampo = kvzalloc_node(sizeof(*rq->mpwqe.shampo), + GFP_KERNEL, node); + if (!rq->mpwqe.shampo) + return -ENOMEM; + return 0; +} + +static void mlx5e_rq_shampo_hd_free(struct mlx5e_rq *rq) +{ + kvfree(rq->mpwqe.shampo); +} + +static int mlx5e_rq_shampo_hd_info_alloc(struct mlx5e_rq *rq, int node) +{ + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + + shampo->bitmap = bitmap_zalloc_node(shampo->hd_per_wq, GFP_KERNEL, + node); + if (!shampo->bitmap) + return -ENOMEM; + + shampo->info = kvzalloc_node(array_size(shampo->hd_per_wq, + sizeof(*shampo->info)), + GFP_KERNEL, node); + if (!shampo->info) { + kvfree(shampo->bitmap); + return -ENOMEM; + } + return 0; +} + +static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq) +{ + kvfree(rq->mpwqe.shampo->bitmap); + kvfree(rq->mpwqe.shampo->info); +} + +static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node) +{ + int wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq); + + rq->mpwqe.info = kvzalloc_node(array_size(wq_sz, + sizeof(*rq->mpwqe.info)), + GFP_KERNEL, node); + if (!rq->mpwqe.info) + return -ENOMEM; + + mlx5e_build_umr_wqe(rq, rq->icosq, &rq->mpwqe.umr_wqe); + + return 0; +} + +static int mlx5e_create_umr_mtt_mkey(struct mlx5_core_dev *mdev, + u64 npages, u8 page_shift, u32 *umr_mkey, + dma_addr_t filler_addr) +{ + struct mlx5_mtt *mtt; + int inlen; + void *mkc; + u32 *in; + int err; + int i; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + sizeof(*mtt) * npages; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + mlx5e_mkey_set_relaxed_ordering(mdev, mkc); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn); + MLX5_SET64(mkc, mkc, len, npages << page_shift); + MLX5_SET(mkc, mkc, translations_octword_size, + MLX5_MTT_OCTW(npages)); + MLX5_SET(mkc, mkc, log_page_size, page_shift); + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + MLX5_MTT_OCTW(npages)); + + /* Initialize the mkey with all MTTs pointing to a default + * page (filler_addr). When the channels are activated, UMR + * WQEs will redirect the RX WQEs to the actual memory from + * the RQ's pool, while the gaps (wqe_overflow) remain mapped + * to the default page. 
+ */ + mtt = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + for (i = 0 ; i < npages ; i++) + mtt[i].ptag = cpu_to_be64(filler_addr); + + err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen); + + kvfree(in); + return err; +} + +static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev, + u64 nentries, + u32 *umr_mkey) +{ + int inlen; + void *mkc; + u32 *in; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); + mlx5e_mkey_set_relaxed_ordering(mdev, mkc); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn); + MLX5_SET(mkc, mkc, translations_octword_size, nentries); + MLX5_SET(mkc, mkc, length64, 1); + err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen); + + kvfree(in); + return err; +} + +static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq) +{ + u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->mpwqe.wq)); + + return mlx5e_create_umr_mtt_mkey(mdev, num_mtts, PAGE_SHIFT, + &rq->umr_mkey, rq->wqe_overflow.addr); +} + +static int mlx5e_create_rq_hd_umr_mkey(struct mlx5_core_dev *mdev, + struct mlx5e_rq *rq) +{ + u32 max_klm_size = BIT(MLX5_CAP_GEN(mdev, log_max_klm_list_size)); + + if (max_klm_size < rq->mpwqe.shampo->hd_per_wq) { + mlx5_core_err(mdev, "max klm list size 0x%x is smaller than shampo header buffer list size 0x%x\n", + max_klm_size, rq->mpwqe.shampo->hd_per_wq); + return -EINVAL; + } + return mlx5e_create_umr_klm_mkey(mdev, rq->mpwqe.shampo->hd_per_wq, + &rq->mpwqe.shampo->mkey); +} + +static u64 mlx5e_get_mpwqe_offset(u16 wqe_ix) +{ + return MLX5E_REQUIRED_MTTS(wqe_ix) << PAGE_SHIFT; +} + +static void mlx5e_init_frags_partition(struct mlx5e_rq *rq) +{ + struct mlx5e_wqe_frag_info next_frag = {}; + struct mlx5e_wqe_frag_info *prev = NULL; + int i; + + next_frag.di = &rq->wqe.di[0]; + + for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) { + struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0]; + struct mlx5e_wqe_frag_info *frag = + &rq->wqe.frags[i << rq->wqe.info.log_num_frags]; + int f; + + for (f = 0; f < rq->wqe.info.num_frags; f++, frag++) { + if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE) { + next_frag.di++; + next_frag.offset = 0; + if (prev) + prev->last_in_page = true; + } + *frag = next_frag; + + /* prepare next */ + next_frag.offset += frag_info[f].frag_stride; + prev = frag; + } + } + + if (prev) + prev->last_in_page = true; +} + +int mlx5e_init_di_list(struct mlx5e_rq *rq, int wq_sz, int node) +{ + int len = wq_sz << rq->wqe.info.log_num_frags; + + rq->wqe.di = kvzalloc_node(array_size(len, sizeof(*rq->wqe.di)), GFP_KERNEL, node); + if (!rq->wqe.di) + return -ENOMEM; + + mlx5e_init_frags_partition(rq); + + return 0; +} + +void mlx5e_free_di_list(struct mlx5e_rq *rq) +{ + kvfree(rq->wqe.di); +} + +static void mlx5e_rq_err_cqe_work(struct work_struct *recover_work) +{ + struct mlx5e_rq *rq = container_of(recover_work, struct mlx5e_rq, recover_work); + + mlx5e_reporter_rq_cqe_err(rq); +} + +static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq) +{ + rq->wqe_overflow.page = alloc_page(GFP_KERNEL); + if (!rq->wqe_overflow.page) + return -ENOMEM; + + rq->wqe_overflow.addr = dma_map_page(rq->pdev, rq->wqe_overflow.page, 0, + 
PAGE_SIZE, rq->buff.map_dir); + if (dma_mapping_error(rq->pdev, rq->wqe_overflow.addr)) { + __free_page(rq->wqe_overflow.page); + return -ENOMEM; + } + return 0; +} + +static void mlx5e_free_mpwqe_rq_drop_page(struct mlx5e_rq *rq) +{ + dma_unmap_page(rq->pdev, rq->wqe_overflow.addr, PAGE_SIZE, + rq->buff.map_dir); + __free_page(rq->wqe_overflow.page); +} + +static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = c->mdev; + int err; + + rq->wq_type = params->rq_wq_type; + rq->pdev = c->pdev; + rq->netdev = c->netdev; + rq->priv = c->priv; + rq->tstamp = c->tstamp; + rq->clock = &mdev->clock; + rq->icosq = &c->icosq; + rq->ix = c->ix; + rq->channel = c; + rq->mdev = mdev; + rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + rq->xdpsq = &c->rq_xdpsq; + rq->stats = &c->priv->channel_stats[c->ix]->rq; + rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev); + if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS && + mlx5e_esw_offloads_pet_enabled(mdev->priv.eswitch)) { + rq->pet_hdr_size = 8; + } + + err = mlx5e_rq_set_handlers(rq, params, NULL); + if (err) + return err; + + return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, 0); +} + +static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_rq_param *rqp, + struct mlx5e_rq *rq, + u32 *pool_size, + int node) +{ + void *wqc = MLX5_ADDR_OF(rqc, rqp->rqc, wq); + int wq_size; + int err; + + if (!test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) + return 0; + err = mlx5e_rq_shampo_hd_alloc(rq, node); + if (err) + goto out; + rq->mpwqe.shampo->hd_per_wq = + mlx5e_shampo_hd_per_wq(mdev, params, rqp); + err = mlx5e_create_rq_hd_umr_mkey(mdev, rq); + if (err) + goto err_shampo_hd; + err = mlx5e_rq_shampo_hd_info_alloc(rq, node); + if (err) + goto err_shampo_info; + rq->hw_gro_data = kvzalloc_node(sizeof(*rq->hw_gro_data), GFP_KERNEL, node); + if (!rq->hw_gro_data) { + err = -ENOMEM; + goto err_hw_gro_data; + } + rq->mpwqe.shampo->key = + cpu_to_be32(rq->mpwqe.shampo->mkey); + rq->mpwqe.shampo->hd_per_wqe = + mlx5e_shampo_hd_per_wqe(mdev, params, rqp); + wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz)); + *pool_size += (rq->mpwqe.shampo->hd_per_wqe * wq_size) / + MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; + return 0; + +err_hw_gro_data: + mlx5e_rq_shampo_hd_info_free(rq); +err_shampo_info: + mlx5_core_destroy_mkey(mdev, rq->mpwqe.shampo->mkey); +err_shampo_hd: + mlx5e_rq_shampo_hd_free(rq); +out: + return err; +} + +static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq) +{ + if (!test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) + return; + + kvfree(rq->hw_gro_data); + mlx5e_rq_shampo_hd_info_free(rq); + mlx5_core_destroy_mkey(rq->mdev, rq->mpwqe.shampo->mkey); + mlx5e_rq_shampo_hd_free(rq); +} + +static void mlx5e_rx_cache_reduce_clean_pending(struct mlx5e_rq *rq) +{ + struct mlx5e_page_cache_reduce *reduce = &rq->page_cache.reduce; + int i; + + if (!test_bit(MLX5E_RQ_STATE_CACHE_REDUCE_PENDING, &rq->state)) + return; + + for (i = 0; i < reduce->npages; i++) + mlx5e_page_release_dynamic(rq, &reduce->pending[i], false); + + clear_bit(MLX5E_RQ_STATE_CACHE_REDUCE_PENDING, &rq->state); +} + +static void mlx5e_rx_cache_reduce_work(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct mlx5e_page_cache_reduce *reduce = + container_of(dwork, struct mlx5e_page_cache_reduce, reduce_work); + struct mlx5e_page_cache *cache = + container_of(reduce, struct mlx5e_page_cache, reduce); + struct mlx5e_rq *rq = 
container_of(cache, struct mlx5e_rq, page_cache); + + local_bh_disable(); + napi_schedule(rq->cq.napi); + local_bh_enable(); + mlx5e_rx_cache_reduce_clean_pending(rq); + + if (ilog2(cache->sz) > cache->log_min_sz) + schedule_delayed_work_on(smp_processor_id(), + dwork, reduce->delay); +} + +static int mlx5e_rx_alloc_page_cache(struct mlx5e_rq *rq, + int node, u8 log_init_sz) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + struct mlx5e_page_cache_reduce *reduce = &cache->reduce; + u32 max_sz; + + cache->log_max_sz = log_init_sz + MLX5E_PAGE_CACHE_LOG_MAX_RQ_MULT; + cache->log_min_sz = log_init_sz; + max_sz = 1 << cache->log_max_sz; + + cache->page_cache = kvzalloc_node(max_sz * sizeof(*cache->page_cache), + GFP_KERNEL, node); + if (!cache->page_cache) + return -ENOMEM; + + reduce->pending = kvzalloc_node(max_sz * sizeof(*reduce->pending), + GFP_KERNEL, node); + if (!reduce->pending) + goto err_free_cache; + + cache->sz = 1 << cache->log_min_sz; + cache->head = -1; + INIT_DELAYED_WORK(&reduce->reduce_work, mlx5e_rx_cache_reduce_work); + reduce->delay = msecs_to_jiffies(MLX5E_PAGE_CACHE_REDUCE_WORK_INTERVAL); + reduce->graceful_period = msecs_to_jiffies(MLX5E_PAGE_CACHE_REDUCE_GRACE_PERIOD); + reduce->next_ts = MAX_JIFFY_OFFSET; /* in init, no reduce is needed */ + + return 0; + +err_free_cache: + kvfree(cache->page_cache); + + return -ENOMEM; +} + +static void mlx5e_rx_free_page_cache(struct mlx5e_rq *rq) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + struct mlx5e_page_cache_reduce *reduce = &cache->reduce; + int i; + + cancel_delayed_work_sync(&reduce->reduce_work); + mlx5e_rx_cache_reduce_clean_pending(rq); + kvfree(reduce->pending); + + for (i = 0; i <= cache->head; i++) { + struct mlx5e_dma_info *dma_info = &cache->page_cache[i]; + + mlx5e_page_release_dynamic(rq, dma_info, false); + } + kvfree(cache->page_cache); +} + +static int mlx5e_alloc_rq(struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + struct mlx5e_rq_param *rqp, + int node, struct mlx5e_rq *rq) +{ + struct page_pool_params pp_params = { 0 }; + struct mlx5_core_dev *mdev = rq->mdev; + void *rqc = rqp->rqc; + void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); + u32 pool_size; + u32 cache_init_sz; + int wq_sz; + int err; + int i; + + rqp->wq.db_numa_node = node; + INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work); + + if (params->xdp_prog) + bpf_prog_inc(params->xdp_prog); + RCU_INIT_POINTER(rq->xdp_prog, params->xdp_prog); + + rq->buff.map_dir = params->xdp_prog ? 
DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; + rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk); + pool_size = 1 << params->log_rq_mtu_frames; + + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->mpwqe.wq, + &rq->wq_ctrl); + if (err) + goto err_rq_xdp_prog; + + err = mlx5e_alloc_mpwqe_rq_drop_page(rq); + if (err) + goto err_rq_wq_destroy; + + rq->mpwqe.wq.db = &rq->mpwqe.wq.db[MLX5_RCV_DBR]; + + wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq); + + cache_init_sz = wq_sz * MLX5_MPWRQ_PAGES_PER_WQE; + + pool_size = MLX5_MPWRQ_PAGES_PER_WQE << + mlx5e_mpwqe_get_log_rq_size(params, xsk); + + rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk); + rq->mpwqe.num_strides = + BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk)); + + rq->buff.frame0_sz = (1 << rq->mpwqe.log_stride_sz); + + err = mlx5e_create_rq_umr_mkey(mdev, rq); + if (err) + goto err_rq_drop_page; + rq->mkey_be = cpu_to_be32(rq->umr_mkey); + + err = mlx5e_rq_alloc_mpwqe_info(rq, node); + if (err) + goto err_rq_mkey; + + err = mlx5_rq_shampo_alloc(mdev, params, rqp, rq, &pool_size, node); + if (err) + goto err_free_by_rq_type; + + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ + err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq, + &rq->wq_ctrl); + if (err) + goto err_rq_xdp_prog; + + rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR]; + + wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq); + + cache_init_sz = wq_sz; + rq->wqe.info = rqp->frags_info; + rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride; + + rq->wqe.frags = + kvzalloc_node(array_size(sizeof(*rq->wqe.frags), + (wq_sz << rq->wqe.info.log_num_frags)), + GFP_KERNEL, node); + if (!rq->wqe.frags) { + err = -ENOMEM; + goto err_rq_wq_destroy; + } + + err = mlx5e_init_di_list(rq, wq_sz, node); + if (err) + goto err_rq_frags; + + rq->mkey_be = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey); + } + + if (xsk) { + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, NULL); + xsk_pool_set_rxq_info(rq->xsk_pool, &rq->xdp_rxq); + } else { + err = mlx5e_rx_alloc_page_cache(rq, node, + ilog2(cache_init_sz)); + if (err) + goto err_free_by_rq_type; + + /* Create a page_pool and register it with rxq */ + pp_params.order = 0; + pp_params.flags = 0; /* No-internal DMA mapping in page_pool */ + pp_params.pool_size = pool_size; + pp_params.nid = node; + pp_params.dev = rq->pdev; + pp_params.dma_dir = rq->buff.map_dir; + + /* page_pool can be used even when there is no rq->xdp_prog, + * given page_pool does not handle DMA mapping there is no + * required state to clear. And page_pool gracefully handle + * elevated refcnt. + */ + rq->page_pool = page_pool_create(&pp_params); + if (IS_ERR(rq->page_pool)) { + err = PTR_ERR(rq->page_pool); + rq->page_pool = NULL; + goto err_free_shampo; + } + if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_PAGE_POOL, rq->page_pool); + } + if (err) + goto err_free_shampo; + + for (i = 0; i < wq_sz; i++) { + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { + struct mlx5e_rx_wqe_ll *wqe = + mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i); + u32 byte_count = + rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz; + u64 dma_offset = mlx5e_get_mpwqe_offset(i); + u16 headroom = test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state) ? 
+ 0 : rq->buff.headroom; + + wqe->data[0].addr = cpu_to_be64(dma_offset + headroom); + wqe->data[0].byte_count = cpu_to_be32(byte_count); + wqe->data[0].lkey = rq->mkey_be; + } else { + struct mlx5e_rx_wqe_cyc *wqe = + mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i); + int f; + + for (f = 0; f < rq->wqe.info.num_frags; f++) { + u32 frag_size = rq->wqe.info.arr[f].frag_size | + MLX5_HW_START_PADDING; + + wqe->data[f].byte_count = cpu_to_be32(frag_size); + wqe->data[f].lkey = rq->mkey_be; + } + /* check if num_frags is not a pow of two */ + if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) { + wqe->data[f].byte_count = 0; + wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + wqe->data[f].addr = 0; + } + } + } + + INIT_WORK(&rq->dim_obj.dim.work, mlx5e_rx_dim_work); + + switch (params->rx_cq_moderation.cq_period_mode) { + case MLX5_CQ_PERIOD_MODE_START_FROM_CQE: + rq->dim_obj.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE; + break; + case MLX5_CQ_PERIOD_MODE_START_FROM_EQE: + default: + rq->dim_obj.dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; + } + + return 0; + +err_free_shampo: + mlx5e_rq_free_shampo(rq); +err_free_by_rq_type: + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + kvfree(rq->mpwqe.info); +err_rq_mkey: + mlx5_core_destroy_mkey(mdev, rq->umr_mkey); +err_rq_drop_page: + mlx5e_free_mpwqe_rq_drop_page(rq); + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ + mlx5e_free_di_list(rq); +err_rq_frags: + kvfree(rq->wqe.frags); + } +err_rq_wq_destroy: + mlx5_wq_destroy(&rq->wq_ctrl); +err_rq_xdp_prog: + if (params->xdp_prog) + bpf_prog_put(params->xdp_prog); + + return err; +} + +static void mlx5e_free_rq(struct mlx5e_rq *rq) +{ + struct bpf_prog *old_prog; + + if (xdp_rxq_info_is_reg(&rq->xdp_rxq)) { + old_prog = rcu_dereference_protected(rq->xdp_prog, + lockdep_is_held(&rq->priv->state_lock)); + if (old_prog) + bpf_prog_put(old_prog); + } + + if (rq->page_cache.page_cache) + mlx5e_rx_free_page_cache(rq); + + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + kvfree(rq->mpwqe.info); + mlx5_core_destroy_mkey(rq->mdev, rq->umr_mkey); + mlx5e_free_mpwqe_rq_drop_page(rq); + mlx5e_rq_free_shampo(rq); + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ + kvfree(rq->wqe.frags); + mlx5e_free_di_list(rq); + } + + xdp_rxq_info_unreg(&rq->xdp_rxq); + page_pool_destroy(rq->page_pool); + mlx5_wq_destroy(&rq->wq_ctrl); +} + +static int mlx5e_set_delay_drop(struct mlx5e_priv *priv, + struct mlx5e_params *params) +{ + struct mlx5e_delay_drop *delay_drop = &priv->delay_drop; + int err = 0; + + if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_DROPLESS_RQ)) { + delay_drop->activate = false; + return 0; + } + + mutex_lock(&delay_drop->lock); + if (delay_drop->activate) + goto out; + + err = mlx5_core_set_delay_drop(priv->mdev, delay_drop->usec_timeout); + if (err) + goto out; + + delay_drop->activate = true; +out: + mutex_unlock(&delay_drop->lock); + return err; +} + +int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) +{ + struct mlx5_core_dev *mdev = rq->mdev; + u8 ts_format; + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + + sizeof(u64) * rq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + ts_format = mlx5_is_real_time_rq(mdev) ? 
+ MLX5_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + + memcpy(rqc, param->rqc, sizeof(param->rqc)); + + MLX5_SET(rqc, rqc, cqn, rq->cq.mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, ts_format, ts_format); + MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); + + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { + MLX5_SET(wq, wq, log_headers_buffer_entry_num, + order_base_2(rq->mpwqe.shampo->hd_per_wq)); + MLX5_SET(wq, wq, headers_mkey, rq->mpwqe.shampo->mkey); + } + + mlx5_fill_page_frag_array(&rq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn); + + kvfree(in); + + return err; +} + +int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state) +{ + struct mlx5_core_dev *mdev = rq->mdev; + + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + if (curr_state == MLX5_RQC_STATE_RST && next_state == MLX5_RQC_STATE_RDY) + mlx5e_rqwq_reset(rq); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rq_state, curr_state); + MLX5_SET(rqc, rqc, state, next_state); + + err = mlx5_core_modify_rq(mdev, rq->rqn, in); + + kvfree(in); + + return err; +} + +static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable) +{ + struct mlx5_core_dev *mdev = rq->mdev; + + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS); + MLX5_SET(rqc, rqc, scatter_fcs, enable); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); + + err = mlx5_core_modify_rq(mdev, rq->rqn, in); + + kvfree(in); + + return err; +} + +static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) +{ + struct mlx5_core_dev *mdev = rq->mdev; + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD); + MLX5_SET(rqc, rqc, vsd, vsd); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); + + err = mlx5_core_modify_rq(mdev, rq->rqn, in); + + kvfree(in); + + return err; +} + +void mlx5e_destroy_rq(struct mlx5e_rq *rq) +{ + mlx5_core_destroy_rq(rq->mdev, rq->rqn); +} + +int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time) +{ + unsigned long exp_time = jiffies + msecs_to_jiffies(wait_time); + + u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5e_rqwq_get_size(rq)); + + do { + if (mlx5e_rqwq_get_cur_sz(rq) >= min_wqes) + return 0; + + msleep(20); + } while (time_before(jiffies, exp_time)); + + netdev_warn(rq->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n", + rq->ix, rq->rqn, mlx5e_rqwq_get_cur_sz(rq), min_wqes); + + if (rq->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + mlx5e_reporter_rx_timeout(rq); + return -ETIMEDOUT; +} + +void mlx5e_free_rx_in_progress_descs(struct 
mlx5e_rq *rq) +{ + struct mlx5_wq_ll *wq; + u16 head; + int i; + + if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + return; + + wq = &rq->mpwqe.wq; + head = wq->head; + + /* Outstanding UMR WQEs (in progress) start at wq->head */ + for (i = 0; i < rq->mpwqe.umr_in_progress; i++) { + rq->dealloc_wqe(rq, head); + head = mlx5_wq_ll_get_wqe_next_ix(wq, head); + } + + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { + u16 len; + + len = (rq->mpwqe.shampo->pi - rq->mpwqe.shampo->ci) & + (rq->mpwqe.shampo->hd_per_wq - 1); + mlx5e_shampo_dealloc_hd(rq, len, rq->mpwqe.shampo->ci, false); + rq->mpwqe.shampo->pi = rq->mpwqe.shampo->ci; + } + + rq->mpwqe.actual_wq_head = wq->head; + rq->mpwqe.umr_in_progress = 0; + rq->mpwqe.umr_completed = 0; +} + +void mlx5e_free_rx_descs(struct mlx5e_rq *rq) +{ + __be16 wqe_ix_be; + u16 wqe_ix; + + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { + struct mlx5_wq_ll *wq = &rq->mpwqe.wq; + + mlx5e_free_rx_in_progress_descs(rq); + + while (!mlx5_wq_ll_is_empty(wq)) { + struct mlx5e_rx_wqe_ll *wqe; + + wqe_ix_be = *wq->tail_next; + wqe_ix = be16_to_cpu(wqe_ix_be); + wqe = mlx5_wq_ll_get_wqe(wq, wqe_ix); + rq->dealloc_wqe(rq, wqe_ix); + mlx5_wq_ll_pop(wq, wqe_ix_be, + &wqe->next.next_wqe_index); + } + + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) + mlx5e_shampo_dealloc_hd(rq, rq->mpwqe.shampo->hd_per_wq, + 0, true); + } else { + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + + while (!mlx5_wq_cyc_is_empty(wq)) { + wqe_ix = mlx5_wq_cyc_get_tail(wq); + rq->dealloc_wqe(rq, wqe_ix); + mlx5_wq_cyc_pop(wq); + } + } + +} + +int mlx5e_open_rq(struct mlx5e_priv *priv, struct mlx5e_params *params, + struct mlx5e_rq_param *param, struct mlx5e_xsk_param *xsk, + struct mlx5e_create_cq_param *ccp, struct dim_cq_moder moder, + int node, struct mlx5e_rq *rq) +{ + struct mlx5_core_dev *mdev = rq->mdev; + int err; + + if (priv->shared_rq) { + /* Init post_wqes function handler for non-existent + * RQ so we don't need extra checks in datapath + */ + mlx5e_rq_init_handler(rq); + return 0; + } + + if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) + __set_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state); + + err = mlx5e_open_cq(priv, moder, &param->cqp, ccp, &rq->cq); + if (err) + return err; + + err = mlx5e_alloc_rq(params, xsk, param, node, rq); + if (err) + goto err_dealloc_rq; + + err = mlx5e_create_rq(rq, param); + if (err) + goto err_free_rq; + + err = mlx5e_set_delay_drop(rq->priv, params); + if (err) + mlx5_core_warn(mdev, "Failed to enable delay drop err=%d\n", + err); + + err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); + if (err) + goto err_destroy_rq; + + if (mlx5e_is_tls_on(rq->priv) && !mlx5e_accel_is_ktls_device(mdev)) + __set_bit(MLX5E_RQ_STATE_FPGA_TLS, &rq->state); /* must be FPGA */ + + if (MLX5_CAP_ETH(mdev, cqe_checksum_full)) + __set_bit(MLX5E_RQ_STATE_CSUM_FULL, &rq->state); + + if (params->rx_dim_enabled) + __set_bit(MLX5E_RQ_STATE_AM, &rq->state); + + /* We disable csum_complete when XDP is enabled since + * XDP programs might manipulate packets which will render + * skb->checksum incorrect. + */ + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE) || params->xdp_prog) + __set_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &rq->state); + + /* For CQE compression on striding RQ, use stride index provided by + * HW if capability is supported.
+ */ + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) && + MLX5_CAP_GEN(mdev, mini_cqe_resp_stride_index)) + __set_bit(MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, &rq->state); + + return 0; + +err_destroy_rq: + mlx5e_destroy_rq(rq); +err_free_rq: + mlx5e_free_rq(rq); +err_dealloc_rq: + mlx5e_close_cq(&rq->cq); + return err; +} + +void mlx5e_activate_rq(struct mlx5e_rq *rq) +{ + set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); +} + +void mlx5e_deactivate_rq(struct mlx5e_rq *rq) +{ + clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state); +} + +void mlx5e_close_rq(struct mlx5e_priv *priv, struct mlx5e_rq *rq) +{ + if (priv->shared_rq) + return; + + cancel_work_sync(&rq->dim_obj.dim.work); + cancel_work_sync(&rq->recover_work); + mlx5e_destroy_rq(rq); + mlx5e_free_rx_descs(rq); + mlx5e_free_rq(rq); + mlx5e_close_cq(&rq->cq); + memset(rq, 0, sizeof(*rq)); +} + +static void mlx5e_free_xdpsq_db(struct mlx5e_xdpsq *sq) +{ + kvfree(sq->db.xdpi_fifo.xi); + kvfree(sq->db.wqe_info); +} + +static int mlx5e_alloc_xdpsq_fifo(struct mlx5e_xdpsq *sq, int numa) +{ + struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + int dsegs_per_wq = wq_sz * MLX5_SEND_WQEBB_NUM_DS; + size_t size; + + size = array_size(sizeof(*xdpi_fifo->xi), dsegs_per_wq); + xdpi_fifo->xi = kvzalloc_node(size, GFP_KERNEL, numa); + if (!xdpi_fifo->xi) + return -ENOMEM; + + xdpi_fifo->pc = &sq->xdpi_fifo_pc; + xdpi_fifo->cc = &sq->xdpi_fifo_cc; + xdpi_fifo->mask = dsegs_per_wq - 1; + + return 0; +} + +static int mlx5e_alloc_xdpsq_db(struct mlx5e_xdpsq *sq, int numa) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + size_t size; + int err; + + size = array_size(sizeof(*sq->db.wqe_info), wq_sz); + sq->db.wqe_info = kvzalloc_node(size, GFP_KERNEL, numa); + if (!sq->db.wqe_info) + return -ENOMEM; + + err = mlx5e_alloc_xdpsq_fifo(sq, numa); + if (err) { + mlx5e_free_xdpsq_db(sq); + return err; + } + + return 0; +} + +static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct xsk_buff_pool *xsk_pool, + struct mlx5e_sq_param *param, + struct mlx5e_xdpsq *sq, + bool is_redirect) +{ + void *sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq); + struct mlx5_core_dev *mdev = c->mdev; + struct mlx5_wq_cyc *wq = &sq->wq; + int err; + + sq->pdev = c->pdev; + sq->mkey_be = c->mkey_be; + sq->channel = c; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->min_inline_mode = params->tx_min_inline_mode; + sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + sq->xsk_pool = xsk_pool; + + sq->stats = sq->xsk_pool ? + &c->priv->channel_stats[c->ix]->xsksq : + is_redirect ? + &c->priv->channel_stats[c->ix]->xdpsq : + &c->priv->channel_stats[c->ix]->rq_xdpsq; + sq->stop_room = param->is_mpw ? 
mlx5e_stop_room_for_mpwqe(mdev) : + mlx5e_stop_room_for_max_wqe(mdev); + sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev); + + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl); + if (err) + return err; + wq->db = &wq->db[MLX5_SND_DBR]; + + err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); + if (err) + goto err_sq_wq_destroy; + + return 0; + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +static void mlx5e_free_xdpsq(struct mlx5e_xdpsq *sq) +{ + mlx5e_free_xdpsq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static void mlx5e_free_icosq_db(struct mlx5e_icosq *sq) +{ + kvfree(sq->db.wqe_info); +} + +static int mlx5e_alloc_icosq_db(struct mlx5e_icosq *sq, int numa) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + size_t size; + + size = array_size(wq_sz, sizeof(*sq->db.wqe_info)); + sq->db.wqe_info = kvzalloc_node(size, GFP_KERNEL, numa); + if (!sq->db.wqe_info) + return -ENOMEM; + + return 0; +} + +static void mlx5e_icosq_err_cqe_work(struct work_struct *recover_work) +{ + struct mlx5e_icosq *sq = container_of(recover_work, struct mlx5e_icosq, + recover_work); + + mlx5e_reporter_icosq_cqe_err(sq); +} + +static void mlx5e_async_icosq_err_cqe_work(struct work_struct *recover_work) +{ + struct mlx5e_icosq *sq = container_of(recover_work, struct mlx5e_icosq, + recover_work); + + /* Not implemented yet. */ + + netdev_warn(sq->channel->netdev, "async_icosq recovery is not implemented\n"); +} + +static int mlx5e_alloc_icosq(struct mlx5e_channel *c, + struct mlx5e_sq_param *param, + struct mlx5e_icosq *sq, + work_func_t recover_work_func) +{ + void *sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq); + struct mlx5_core_dev *mdev = c->mdev; + struct mlx5_wq_cyc *wq = &sq->wq; + int err; + + sq->channel = c; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->reserved_room = param->stop_room; + + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl); + if (err) + return err; + wq->db = &wq->db[MLX5_SND_DBR]; + + err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); + if (err) + goto err_sq_wq_destroy; + + INIT_WORK(&sq->recover_work, recover_work_func); + + return 0; + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +static void mlx5e_free_icosq(struct mlx5e_icosq *sq) +{ + mlx5e_free_icosq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +void mlx5e_free_txqsq_db(struct mlx5e_txqsq *sq) +{ + kvfree(sq->db.wqe_info); + kvfree(sq->db.skb_fifo.fifo); + kvfree(sq->db.dma_fifo); +} + +int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa) +{ + int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); + int df_sz = wq_sz * MLX5_SEND_WQEBB_NUM_DS; + + sq->db.dma_fifo = kvzalloc_node(array_size(df_sz, + sizeof(*sq->db.dma_fifo)), + GFP_KERNEL, numa); + sq->db.skb_fifo.fifo = kvzalloc_node(array_size(df_sz, + sizeof(*sq->db.skb_fifo.fifo)), + GFP_KERNEL, numa); + sq->db.wqe_info = kvzalloc_node(array_size(wq_sz, + sizeof(*sq->db.wqe_info)), + GFP_KERNEL, numa); + if (!sq->db.dma_fifo || !sq->db.skb_fifo.fifo || !sq->db.wqe_info) { + mlx5e_free_txqsq_db(sq); + return -ENOMEM; + } + + sq->dma_fifo_mask = df_sz - 1; + + sq->db.skb_fifo.pc = &sq->skb_fifo_pc; + sq->db.skb_fifo.cc = &sq->skb_fifo_cc; + sq->db.skb_fifo.mask = df_sz - 1; + + return 0; +} + +static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, + int txq_ix, + struct mlx5e_params *params, + struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, + int tc) +{ + void *sqc_wq = 
MLX5_ADDR_OF(sqc, param->sqc, wq); + struct mlx5_core_dev *mdev = c->mdev; + struct mlx5_wq_cyc *wq = &sq->wq; + int err; + + sq->pdev = c->pdev; + sq->clock = &mdev->clock; + sq->mkey_be = c->mkey_be; + sq->netdev = c->netdev; + sq->mdev = c->mdev; + sq->priv = c->priv; + sq->ch_ix = c->ix; + sq->txq_ix = txq_ix; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->min_inline_mode = params->tx_min_inline_mode; + sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); + sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev); + INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); + if (!MLX5_CAP_ETH(mdev, wqe_vlan_insert)) + set_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state); + if (MLX5_IPSEC_DEV(c->priv->mdev)) + set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); + if (param->is_mpw) + set_bit(MLX5E_SQ_STATE_MPWQE, &sq->state); + sq->stop_room = param->stop_room; + sq->ptp_cyc2time = mlx5_sq_ts_translator(mdev); + + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl); + if (err) + return err; + wq->db = &wq->db[MLX5_SND_DBR]; + + err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); + if (err) + goto err_sq_wq_destroy; + + INIT_WORK(&sq->dim_obj.dim.work, mlx5e_tx_dim_work); + sq->dim_obj.dim.mode = params->tx_cq_moderation.cq_period_mode; + + return 0; + +err_sq_wq_destroy: + mlx5_wq_destroy(&sq->wq_ctrl); + + return err; +} + +void mlx5e_free_txqsq(struct mlx5e_txqsq *sq) +{ + mlx5e_free_txqsq_db(sq); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static int mlx5e_create_sq(struct mlx5_core_dev *mdev, + struct mlx5e_sq_param *param, + struct mlx5e_create_sq_param *csp, + u32 *sqn) +{ + u8 ts_format; + void *in; + void *sqc; + void *wq; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + + sizeof(u64) * csp->wq_ctrl->buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + ts_format = mlx5_is_real_time_sq(mdev) ? 
+ MLX5_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + memcpy(sqc, param->sqc, sizeof(param->sqc)); + MLX5_SET(sqc, sqc, tis_lst_sz, csp->tis_lst_sz); + MLX5_SET(sqc, sqc, tis_num_0, csp->tisn); + MLX5_SET(sqc, sqc, cqn, csp->cqn); + MLX5_SET(sqc, sqc, ts_cqe_to_dest_cqn, csp->ts_cqe_to_dest_cqn); + MLX5_SET(sqc, sqc, ts_format, ts_format); + + + if (MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) + MLX5_SET(sqc, sqc, min_wqe_inline_mode, csp->min_inline_mode); + + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, log_wq_pg_sz, csp->wq_ctrl->buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, csp->wq_ctrl->db.dma); + + mlx5_fill_page_frag_array(&csp->wq_ctrl->buf, + (__be64 *)MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_sq(mdev, in, inlen, sqn); + + kvfree(in); + + return err; +} + +int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, + struct mlx5e_modify_sq_param *p) +{ + u64 bitmask = 0; + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + + MLX5_SET(modify_sq_in, in, sq_state, p->curr_state); + MLX5_SET(sqc, sqc, state, p->next_state); + if (p->rl_update && p->next_state == MLX5_SQC_STATE_RDY) { + bitmask |= 1; + MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, p->rl_index); + } + if (p->qos_update && p->next_state == MLX5_SQC_STATE_RDY) { + bitmask |= 1 << 2; + MLX5_SET(sqc, sqc, qos_queue_group_id, p->qos_queue_group_id); + } + MLX5_SET64(modify_sq_in, in, modify_bitmask, bitmask); + + err = mlx5_core_modify_sq(mdev, sqn, in); + + kvfree(in); + + return err; +} + +void mlx5e_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn) +{ + mlx5_core_destroy_sq(mdev, sqn); +} + +int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev, + struct mlx5e_sq_param *param, + struct mlx5e_create_sq_param *csp, + u16 qos_queue_group_id, + u32 *sqn) +{ + struct mlx5e_modify_sq_param msp = {0}; + int err; + + err = mlx5e_create_sq(mdev, param, csp, sqn); + if (err) + return err; + + msp.curr_state = MLX5_SQC_STATE_RST; + msp.next_state = MLX5_SQC_STATE_RDY; + if (qos_queue_group_id) { + msp.qos_update = true; + msp.qos_queue_group_id = qos_queue_group_id; + } + err = mlx5e_modify_sq(mdev, *sqn, &msp); + if (err) + mlx5e_destroy_sq(mdev, *sqn); + + return err; +} + +static int mlx5e_set_sq_maxrate(struct net_device *dev, + struct mlx5e_txqsq *sq, u32 rate); + +int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix, + struct mlx5e_params *params, struct mlx5e_sq_param *param, + struct mlx5e_txqsq *sq, int tc, u16 qos_queue_group_id, + struct mlx5e_sq_stats *sq_stats) +{ + struct mlx5e_create_sq_param csp = {}; + u32 tx_rate; + int err; + + err = mlx5e_alloc_txqsq(c, txq_ix, params, param, sq, tc); + if (err) + return err; + + sq->stats = sq_stats; + + csp.tisn = tisn; + csp.tis_lst_sz = 1; + csp.cqn = sq->cq.mcq.cqn; + csp.wq_ctrl = &sq->wq_ctrl; + csp.min_inline_mode = sq->min_inline_mode; + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, qos_queue_group_id, &sq->sqn); + if (err) + goto err_free_txqsq; + + tx_rate = c->priv->tx_rates[sq->txq_ix]; + if (tx_rate) + mlx5e_set_sq_maxrate(c->netdev, sq, tx_rate); 
+ + if (params->tx_dim_enabled) + sq->state |= BIT(MLX5E_SQ_STATE_AM); + + return 0; + +err_free_txqsq: + mlx5e_free_txqsq(sq); + + return err; +} + +void mlx5e_enable_txqsq(struct mlx5e_txqsq *sq) +{ + sq->txq = netdev_get_tx_queue(sq->netdev, sq->txq_ix); + set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); +} + +void mlx5e_start_txqsq(struct mlx5e_txqsq *sq) +{ + sq->txq = netdev_get_tx_queue(sq->netdev, sq->txq_ix); + netdev_tx_reset_queue(sq->txq); + netif_tx_start_queue(sq->txq); +} +void mlx5e_tx_disable_queue(struct netdev_queue *txq) +{ + __netif_tx_lock_bh(txq); + netif_tx_stop_queue(txq); + __netif_tx_unlock_bh(txq); +} + +void mlx5e_disable_txqsq(struct mlx5e_txqsq *sq) +{ + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); +} + +void mlx5e_stop_txqsq(struct mlx5e_txqsq *sq) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + + /* The SQ must be deactivated, and synchronize_net must be called before + * this function to prevent netif_tx_wake_queue from reenabling the SQ. + */ + mlx5e_tx_disable_queue(sq->txq); + + /* last doorbell out, godspeed .. */ + if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) { + u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + struct mlx5e_tx_wqe *nop; + + sq->db.wqe_info[pi] = (struct mlx5e_tx_wqe_info) { + .num_wqebbs = 1, + }; + + nop = mlx5e_post_nop(wq, sq->sqn, &sq->pc); + mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nop->ctrl); + } +} + +void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) +{ + struct mlx5_core_dev *mdev = sq->mdev; + struct mlx5_rate_limit rl = {0}; + + cancel_work_sync(&sq->dim_obj.dim.work); + cancel_work_sync(&sq->recover_work); + mlx5e_destroy_sq(mdev, sq->sqn); + if (sq->rate_limit) { + rl.rate = sq->rate_limit; + mlx5_rl_remove_rate(mdev, &rl); + } + mlx5e_free_txqsq_descs(sq); + mlx5e_free_txqsq(sq); +} + +void mlx5e_tx_err_cqe_work(struct work_struct *recover_work) +{ + struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq, + recover_work); + + mlx5e_reporter_tx_err_cqe(sq); +} + +static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_sq_param *param, struct mlx5e_icosq *sq, + work_func_t recover_work_func) +{ + struct mlx5e_create_sq_param csp = {}; + int err; + + err = mlx5e_alloc_icosq(c, param, sq, recover_work_func); + if (err) + return err; + + csp.cqn = sq->cq.mcq.cqn; + csp.wq_ctrl = &sq->wq_ctrl; + csp.min_inline_mode = params->tx_min_inline_mode; + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); + if (err) + goto err_free_icosq; + + if (param->is_tls) { + sq->ktls_resync = mlx5e_ktls_rx_resync_create_resp_list(); + if (IS_ERR(sq->ktls_resync)) { + err = PTR_ERR(sq->ktls_resync); + goto err_destroy_icosq; + } + } + return 0; + +err_destroy_icosq: + mlx5e_destroy_sq(c->mdev, sq->sqn); +err_free_icosq: + mlx5e_free_icosq(sq); + + return err; +} + +void mlx5e_activate_icosq(struct mlx5e_icosq *icosq) +{ + set_bit(MLX5E_SQ_STATE_ENABLED, &icosq->state); +} + +void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq) +{ + clear_bit(MLX5E_SQ_STATE_ENABLED, &icosq->state); +} + +static void mlx5e_close_icosq(struct mlx5e_icosq *sq) +{ + struct mlx5e_channel *c = sq->channel; + + if (sq->ktls_resync) + mlx5e_ktls_rx_resync_destroy_resp_list(sq->ktls_resync); + mlx5e_destroy_sq(c->mdev, sq->sqn); + mlx5e_free_icosq_descs(sq); + mlx5e_free_icosq(sq); +} + +int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool, + struct mlx5e_xdpsq *sq, bool is_redirect) +{ + struct mlx5e_create_sq_param csp = {}; + int 
err; + + err = mlx5e_alloc_xdpsq(c, params, xsk_pool, param, sq, is_redirect); + if (err) + return err; + + csp.tis_lst_sz = 1; + csp.tisn = c->priv->tisn[c->lag_port][0]; /* tc = 0 */ + csp.cqn = sq->cq.mcq.cqn; + csp.wq_ctrl = &sq->wq_ctrl; + csp.min_inline_mode = sq->min_inline_mode; + err = mlx5e_create_sq_rdy(c->mdev, param, &csp, 0, &sq->sqn); + if (err) + goto err_free_xdpsq; + + mlx5e_set_xmit_fp(sq, param->is_mpw); + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_TX_XDP_CSUM)) + set_bit(MLX5E_SQ_STATE_TX_XDP_CSUM, &sq->state); + + if (!param->is_mpw) { + unsigned int ds_cnt = MLX5E_XDP_TX_DS_COUNT; + unsigned int inline_hdr_sz = 0; + int i; + + if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) { + inline_hdr_sz = MLX5E_XDP_MIN_INLINE; + ds_cnt++; + } + + /* Pre initialize fixed WQE fields */ + for (i = 0; i < mlx5_wq_cyc_get_size(&sq->wq); i++) { + struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, i); + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + struct mlx5_wqe_data_seg *dseg; + + sq->db.wqe_info[i] = (struct mlx5e_xdp_wqe_info) { + .num_wqebbs = 1, + .num_pkts = 1, + }; + + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); + eseg->inline_hdr.sz = cpu_to_be16(inline_hdr_sz); + + dseg = (struct mlx5_wqe_data_seg *)cseg + (ds_cnt - 1); + dseg->lkey = sq->mkey_be; + } + } + + return 0; + +err_free_xdpsq: + mlx5e_free_xdpsq(sq); + + return err; +} + +void mlx5e_activate_xdpsq(struct mlx5e_xdpsq *sq) +{ + set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); +} + +void mlx5e_deactivate_xdpsq(struct mlx5e_xdpsq *sq) +{ + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); +} + +void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq) +{ + struct mlx5e_channel *c = sq->channel; + + mlx5e_destroy_sq(c->mdev, sq->sqn); + mlx5e_free_xdpsq_descs(sq); + mlx5e_free_xdpsq(sq); +} + +int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, + struct mlx5e_cq *cq) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_core_cq *mcq = &cq->mcq; + int err; + u32 i; + + err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq, + &cq->wq_ctrl); + if (err) + return err; + + mcq->cqe_sz = 64; + mcq->set_ci_db = cq->wq_ctrl.db.db; + mcq->arm_db = cq->wq_ctrl.db.db + 1; + *mcq->set_ci_db = 0; + *mcq->arm_db = 0; + mcq->vector = param->eq_ix; + mcq->comp = mlx5e_completion_event; + mcq->event = mlx5e_cq_error_event; + + for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); + + cqe->op_own = 0xf1; + } + + cq->mdev = mdev; + cq->netdev = priv->netdev; + cq->priv = priv; + + return 0; +} + +static int mlx5e_alloc_cq(struct mlx5e_priv *priv, + struct mlx5e_cq_param *param, + struct mlx5e_create_cq_param *ccp, + struct mlx5e_cq *cq) +{ + int err; + + param->wq.buf_numa_node = ccp->node; + param->wq.db_numa_node = ccp->node; + param->eq_ix = ccp->ix; + + err = mlx5e_alloc_cq_common(priv, param, cq); + + cq->napi = ccp->napi; + cq->ch_stats = ccp->ch_stats; + + return err; +} + +void mlx5e_free_cq(struct mlx5e_cq *cq) +{ + mlx5_wq_destroy(&cq->wq_ctrl); +} + +int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) +{ + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_core_dev *mdev = cq->mdev; + struct mlx5_core_cq *mcq = &cq->mcq; + + void *in; + void *cqc; + int inlen; + int eqn; + int err; + + err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * cq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, 
GFP_KERNEL); + if (!in) + return -ENOMEM; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + + memcpy(cqc, param->cqc, sizeof(param->cqc)); + + mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); + + MLX5_SET(cqc, cqc, cq_period_mode, param->cq_period_mode); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); + + err = mlx5_core_create_cq(mdev, mcq, in, inlen, out, sizeof(out)); + + kvfree(in); + + if (err) + return err; + + if (!cq->no_arm) + mlx5e_cq_arm(cq); + + return 0; +} + +static void mlx5e_destroy_cq(struct mlx5e_cq *cq) +{ + mlx5_core_destroy_cq(cq->mdev, &cq->mcq); +} + +int mlx5e_open_cq(struct mlx5e_priv *priv, struct dim_cq_moder moder, + struct mlx5e_cq_param *param, struct mlx5e_create_cq_param *ccp, + struct mlx5e_cq *cq) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5e_alloc_cq(priv, param, ccp, cq); + if (err) + return err; + + err = mlx5e_create_cq(cq, param); + if (err) + goto err_free_cq; + + if (MLX5_CAP_GEN(mdev, cq_moderation)) + mlx5_core_modify_cq_moderation(mdev, &cq->mcq, moder.usec, moder.pkts); + return 0; + +err_free_cq: + mlx5e_free_cq(cq); + + return err; +} + +void mlx5e_close_cq(struct mlx5e_cq *cq) +{ + mlx5e_destroy_cq(cq); + mlx5e_free_cq(cq); +} + +static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_create_cq_param *ccp, + struct mlx5e_channel_param *cparam) +{ + int err; + int tc; + + for (tc = 0; tc < c->num_tc; tc++) { + err = mlx5e_open_cq(c->priv, params->tx_cq_moderation, &cparam->txq_sq.cqp, + ccp, &c->sq[tc].cq); + if (err) + goto err_close_tx_cqs; + } + + return 0; + +err_close_tx_cqs: + for (tc--; tc >= 0; tc--) + mlx5e_close_cq(&c->sq[tc].cq); + + return err; +} + +static void mlx5e_close_tx_cqs(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_cq(&c->sq[tc].cq); +} + +static int mlx5e_mqprio_txq_to_tc(struct netdev_tc_txq *tc_to_txq, unsigned int txq) +{ + int tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) + if (txq - tc_to_txq[tc].offset < tc_to_txq[tc].count) + return tc; + + WARN(1, "Unexpected TCs configuration. No match found for txq %u", txq); + return -ENOENT; +} + +static int mlx5e_txq_get_qos_node_hw_id(struct mlx5e_params *params, int txq_ix, + u32 *hw_id) +{ + int tc; + + if (params->mqprio.mode != TC_MQPRIO_MODE_CHANNEL) { + *hw_id = 0; + return 0; + } + + tc = mlx5e_mqprio_txq_to_tc(params->mqprio.tc_to_txq, txq_ix); + if (tc < 0) + return tc; + + if (tc >= params->mqprio.num_tc) { + WARN(1, "Unexpected TCs configuration. 
tc %d is out of range of %u", + tc, params->mqprio.num_tc); + return -EINVAL; + } + + *hw_id = params->mqprio.channel.hw_id[tc]; + return 0; +} + +static int mlx5e_open_sqs(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam) +{ + int err, tc; + + for (tc = 0; tc < mlx5e_get_dcb_num_tc(params); tc++) { + int txq_ix = c->ix + tc * params->num_channels; + u32 qos_queue_group_id; + + err = mlx5e_txq_get_qos_node_hw_id(params, txq_ix, &qos_queue_group_id); + if (err) + goto err_close_sqs; + + err = mlx5e_open_txqsq(c, c->priv->tisn[c->lag_port][tc], txq_ix, + params, &cparam->txq_sq, &c->sq[tc], tc, + qos_queue_group_id, + &c->priv->channel_stats[c->ix]->sq[tc]); + if (err) + goto err_close_sqs; + } + + return 0; + +err_close_sqs: + for (tc--; tc >= 0; tc--) + mlx5e_close_txqsq(&c->sq[tc]); + + return err; +} + +static void mlx5e_close_sqs(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_close_txqsq(&c->sq[tc]); +} + +static int mlx5e_set_sq_maxrate(struct net_device *dev, + struct mlx5e_txqsq *sq, u32 rate) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_modify_sq_param msp = {0}; + struct mlx5_rate_limit rl = {0}; + u16 rl_index = 0; + int err; + + if (rate == sq->rate_limit) + /* nothing to do */ + return 0; + + if (sq->rate_limit) { + rl.rate = sq->rate_limit; + /* remove current rl index to free space to next ones */ + mlx5_rl_remove_rate(mdev, &rl); + } + + sq->rate_limit = 0; + + if (rate) { + rl.rate = rate; + err = mlx5_rl_add_rate(mdev, &rl_index, &rl); + if (err) { + netdev_err(dev, "Failed configuring rate %u: %d\n", + rate, err); + return err; + } + } + + msp.curr_state = MLX5_SQC_STATE_RDY; + msp.next_state = MLX5_SQC_STATE_RDY; + msp.rl_index = rl_index; + msp.rl_update = true; + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed configuring rate %u: %d\n", + rate, err); + /* remove the rate from the table */ + if (rate) + mlx5_rl_remove_rate(mdev, &rl); + return err; + } + + sq->rate_limit = rate; + return 0; +} + +static int mlx5e_set_tx_maxrate(struct net_device *dev, int index, u32 rate) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_txqsq *sq = priv->txq2sq[index]; + int err = 0; + + if (!mlx5_rl_is_supported(mdev)) { + netdev_err(dev, "Rate limiting is not supported on this device\n"); + return -EINVAL; + } + + /* rate is given in Mb/sec, HW config is in Kb/sec */ + rate = rate << 10; + + /* Check whether rate in valid range, 0 is always valid */ + if (rate && !mlx5_rl_is_in_range(mdev, rate)) { + netdev_err(dev, "TX rate %u, is not in range\n", rate); + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) + err = mlx5e_set_sq_maxrate(dev, sq, rate); + if (!err) + priv->tx_rates[index] = rate; + mutex_unlock(&priv->state_lock); + + return err; +} + +static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_create_cq_param *ccp, struct mlx5e_rq_param *rq_params) +{ + int err; + + err = mlx5e_init_rxq_rq(c, params, &c->rq); + if (err) + return err; + + return mlx5e_open_rq(c->priv, params, rq_params, NULL, ccp, params->rx_cq_moderation, cpu_to_node(c->cpu), &c->rq); +} + +static int mlx5e_open_queues(struct mlx5e_channel *c, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam) +{ + struct dim_cq_moder icocq_moder = {0, 0}; + struct 
mlx5e_create_cq_param ccp; + int err; + + mlx5e_build_create_cq_param(&ccp, c); + + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->async_icosq.cqp, &ccp, + &c->async_icosq.cq); + if (err) + return err; + + err = mlx5e_open_cq(c->priv, icocq_moder, &cparam->icosq.cqp, &ccp, + &c->icosq.cq); + if (err) + goto err_close_async_icosq_cq; + + err = mlx5e_open_tx_cqs(c, params, &ccp, cparam); + if (err) + goto err_close_icosq_cq; + + err = mlx5e_open_cq(c->priv, params->tx_cq_moderation, &cparam->xdp_sq.cqp, &ccp, + &c->xdpsq.cq); + if (err) + goto err_close_tx_cqs; + + err = c->xdp ? mlx5e_open_cq(c->priv, params->tx_cq_moderation, &cparam->xdp_sq.cqp, + &ccp, &c->rq_xdpsq.cq) : 0; + if (err) + goto err_close_xdp_tx_cqs; + + spin_lock_init(&c->async_icosq_lock); + + err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq, + mlx5e_async_icosq_err_cqe_work); + if (err) + goto err_close_xdpsq_cq; + + mutex_init(&c->icosq_recovery_lock); + + err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->icosq, + mlx5e_icosq_err_cqe_work); + if (err) + goto err_close_async_icosq; + + err = mlx5e_open_sqs(c, params, cparam); + if (err) + goto err_close_icosq; + + err = mlx5e_open_rxq_rq(c, params, &ccp, &cparam->rq); + if (err) + goto err_close_sqs; + + if (c->xdp) { + err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL, + &c->rq_xdpsq, false); + if (err) + goto err_close_rq; + } + + err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL, &c->xdpsq, true); + if (err) + goto err_close_xdp_sq; + + return 0; + +err_close_xdp_sq: + if (c->xdp) + mlx5e_close_xdpsq(&c->rq_xdpsq); + +err_close_rq: + mlx5e_close_rq(c->priv, &c->rq); + +err_close_sqs: + mlx5e_close_sqs(c); + +err_close_icosq: + mlx5e_close_icosq(&c->icosq); + +err_close_async_icosq: + mlx5e_close_icosq(&c->async_icosq); + +err_close_xdpsq_cq: + if (c->xdp) + mlx5e_close_cq(&c->rq_xdpsq.cq); + +err_close_xdp_tx_cqs: + mlx5e_close_cq(&c->xdpsq.cq); + +err_close_tx_cqs: + mlx5e_close_tx_cqs(c); + +err_close_icosq_cq: + mlx5e_close_cq(&c->icosq.cq); + +err_close_async_icosq_cq: + mlx5e_close_cq(&c->async_icosq.cq); + + return err; +} + +static void mlx5e_close_queues(struct mlx5e_channel *c) +{ + mlx5e_close_xdpsq(&c->xdpsq); + if (c->xdp) + mlx5e_close_xdpsq(&c->rq_xdpsq); + /* The same ICOSQ is used for UMRs for both RQ and XSKRQ. */ + cancel_work_sync(&c->icosq.recover_work); + mlx5e_close_rq(c->priv, &c->rq); + mlx5e_close_sqs(c); + mlx5e_close_icosq(&c->icosq); + mutex_destroy(&c->icosq_recovery_lock); + mlx5e_close_icosq(&c->async_icosq); + if (c->xdp) + mlx5e_close_cq(&c->rq_xdpsq.cq); + mlx5e_close_cq(&c->xdpsq.cq); + mlx5e_close_tx_cqs(c); + mlx5e_close_cq(&c->icosq.cq); + mlx5e_close_cq(&c->async_icosq.cq); +} + +static u8 mlx5e_enumerate_lag_port(struct mlx5_core_dev *mdev, int ix) +{ + u16 port_aff_bias = mlx5_core_is_pf(mdev) ? 0 : MLX5_CAP_GEN(mdev, vhca_id); + + return (ix + port_aff_bias) % mlx5e_get_num_lag_ports(mdev); +} + +static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu) +{ + if (ix > priv->stats_nch) { + netdev_warn(priv->netdev, "Unexpected channel stats index %d > %d\n", ix, + priv->stats_nch); + return -EINVAL; + } + + if (priv->channel_stats[ix]) + return 0; + + /* Asymmetric dynamic memory allocation. + * Freed in mlx5e_priv_arrays_free, not on channel closure. 
+ */ + mlx5e_dbg(DRV, priv, "Creating channel stats %d\n", ix); + priv->channel_stats[ix] = kvzalloc_node(sizeof(**priv->channel_stats), + GFP_KERNEL, cpu_to_node(cpu)); + if (!priv->channel_stats[ix]) + return -ENOMEM; + priv->stats_nch++; + + return 0; +} + +void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c) +{ + spin_lock_bh(&c->async_icosq_lock); + mlx5e_trigger_irq(&c->async_icosq); + spin_unlock_bh(&c->async_icosq_lock); +} + +void mlx5e_trigger_napi_sched(struct napi_struct *napi) +{ + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); +} + +static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, + struct mlx5e_params *params, + struct mlx5e_channel_param *cparam, + struct xsk_buff_pool *xsk_pool, + struct mlx5e_channel **cp) +{ + struct net_device *netdev = priv->netdev; + struct mlx5e_xsk_param xsk; + const struct cpumask *aff; + struct mlx5e_channel *c; + unsigned int irq; + int err; + int cpu; + + err = mlx5_vector2irqn(priv->mdev, ix, &irq); + if (err) + return err; + + aff = irq_get_effective_affinity_mask(irq); + cpu = cpumask_first(aff); + + err = mlx5e_channel_stats_alloc(priv, ix, cpu); + if (err) + return err; + + c = kvzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); + if (!c) + return -ENOMEM; + + c->priv = priv; + c->mdev = priv->mdev; + c->tstamp = &priv->tstamp; + c->ix = ix; + c->cpu = cpu; + c->pdev = mlx5_core_dma_dev(priv->mdev); + c->netdev = priv->netdev; + c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey); + c->num_tc = mlx5e_get_dcb_num_tc(params); + c->xdp = !!params->xdp_prog; + c->stats = &priv->channel_stats[ix]->ch; + c->aff_mask = irq_get_effective_affinity_mask(irq); + c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix); + + netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); + + err = mlx5e_open_queues(c, params, cparam); + if (unlikely(err)) + goto err_napi_del; + + if (xsk_pool) { + mlx5e_build_xsk_param(xsk_pool, &xsk); + err = mlx5e_open_xsk(priv, params, &xsk, xsk_pool, c); + if (unlikely(err)) + goto err_close_queues; + } + + *cp = c; + + return 0; + +err_close_queues: + mlx5e_close_queues(c); + +err_napi_del: + netif_napi_del(&c->napi); + + kvfree(c); + + return err; +} + +static void mlx5e_rq_channel_activate(struct mlx5e_channel *c) +{ + if (c->priv->shared_rq) + return; + + if (c->xdp) + mlx5e_activate_xdpsq(&c->rq_xdpsq); + mlx5e_activate_rq(&c->rq); + mlx5e_activate_xdpsq(&c->xdpsq); + + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + mlx5e_activate_xsk(c); + + mlx5e_trigger_napi_icosq(c); +} + +static void mlx5e_enable_channel(struct mlx5e_channel *c) +{ + napi_enable(&c->napi); +} + +static void mlx5e_start_channel(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) { + mlx5e_enable_txqsq(&c->sq[tc]); + mlx5e_start_txqsq(&c->sq[tc]); + } + mlx5e_activate_icosq(&c->icosq); + mlx5e_activate_icosq(&c->async_icosq); + mlx5e_rq_channel_activate(c); + + if (!mlx5_core_is_sf(c->priv->mdev)) + mlx5_rename_comp_eq(c->priv->mdev, c->ix, c->priv->netdev->name); + +} + +static void mlx5e_rq_channel_deactivate(struct mlx5e_channel *c) +{ + if (c->priv->shared_rq) + return; + + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + mlx5e_deactivate_xsk(c); + + mlx5e_deactivate_xdpsq(&c->xdpsq); + mlx5e_deactivate_rq(&c->rq); + if (c->xdp) + mlx5e_deactivate_xdpsq(&c->rq_xdpsq); +} + +static void mlx5e_disable_channel(struct mlx5e_channel *c) +{ + int tc; + + mlx5_rename_comp_eq(c->priv->mdev, c->ix, NULL); + mlx5e_rq_channel_deactivate(c); + mlx5e_deactivate_icosq(&c->async_icosq); 
+ mlx5e_deactivate_icosq(&c->icosq); + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_disable_txqsq(&c->sq[tc]); + mlx5e_qos_deactivate_queues(c, false); +} + +static void mlx5e_stop_channel(struct mlx5e_channel *c) +{ + int tc; + + for (tc = 0; tc < c->num_tc; tc++) + mlx5e_stop_txqsq(&c->sq[tc]); + mlx5e_qos_deactivate_queues(c, true); + napi_disable(&c->napi); +} + +static void mlx5e_close_channel(struct mlx5e_channel *c) +{ + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) + mlx5e_close_xsk(c); + mlx5e_close_queues(c); + mlx5e_qos_close_queues(c); + netif_napi_del(&c->napi); + + kvfree(c); +} + +int mlx5e_open_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *chs) +{ + struct mlx5e_channel_param *cparam; + int err = -ENOMEM; + int i; + + chs->num = chs->params.num_channels; + + chs->c = kcalloc(chs->num, sizeof(struct mlx5e_channel *), GFP_KERNEL); + cparam = kvzalloc(sizeof(struct mlx5e_channel_param), GFP_KERNEL); + if (!chs->c || !cparam) + goto err_free; + + err = mlx5e_build_channel_param(priv->mdev, &chs->params, priv->q_counter, cparam); + if (err) + goto err_free; + + for (i = 0; i < chs->num; i++) { + struct xsk_buff_pool *xsk_pool = NULL; + + if (chs->params.xdp_prog) + xsk_pool = mlx5e_xsk_get_pool(&chs->params, chs->params.xsk, i); + + err = mlx5e_open_channel(priv, i, &chs->params, cparam, xsk_pool, &chs->c[i]); + if (err) + goto err_close_channels; + } + + if (MLX5E_GET_PFLAG(&chs->params, MLX5E_PFLAG_TX_PORT_TS) || chs->params.ptp_rx) { + err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); + if (err) + goto err_close_channels; + } + + err = mlx5e_qos_open_queues(priv, chs); + if (err) + goto err_close_ptp; + + mlx5e_health_channels_update(priv); + kvfree(cparam); + return 0; + +err_close_ptp: + if (chs->ptp) + mlx5e_ptp_close(chs->ptp); + +err_close_channels: + for (i--; i >= 0; i--) + mlx5e_close_channel(chs->c[i]); + +err_free: + kfree(chs->c); + kvfree(cparam); + chs->num = 0; + return err; +} + +static void mlx5e_activate_channels(struct mlx5e_channels *chs) +{ + int i; + + for (i = 0; i < chs->num; i++) { + mlx5e_enable_channel(chs->c[i]); + mlx5e_start_channel(chs->c[i]); + } + + if (chs->ptp) { + mlx5e_ptp_enable_channel(chs->ptp); + mlx5e_ptp_start_channel(chs->ptp); + } +} + +#define MLX5E_RQ_WQES_TIMEOUT 20000 /* msecs */ + +static int mlx5e_wait_channels_min_rx_wqes(struct mlx5e_channels *chs) +{ + int err = 0; + int i; + + for (i = 0; i < chs->num; i++) { + int timeout = err ? 0 : MLX5E_RQ_WQES_TIMEOUT; + + err |= mlx5e_wait_for_min_rx_wqes(&chs->c[i]->rq, timeout); + + /* Don't wait on the XSK RQ, because the newer xdpsock sample + * doesn't provide any Fill Ring entries at the setup stage. + */ + } + + return err ? -ETIMEDOUT : 0; +} + +static void mlx5e_deactivate_channels(struct mlx5e_channels *chs) +{ + int i; + + if (chs->ptp) + mlx5e_ptp_disable_channel(chs->ptp); + + for (i = 0; i < chs->num; i++) + mlx5e_disable_channel(chs->c[i]); + + /* Sync with all NAPIs to wait until they stop using queues. 
*/ + synchronize_net(); + + if (chs->ptp) + mlx5e_ptp_stop_channel(chs->ptp); + + for (i = 0; i < chs->num; i++) + mlx5e_stop_channel(chs->c[i]); +} + +void mlx5e_close_channels(struct mlx5e_channels *chs) +{ + int i; + + if (chs->ptp) { + mlx5e_ptp_close(chs->ptp); + chs->ptp = NULL; + } + for (i = 0; i < chs->num; i++) + mlx5e_close_channel(chs->c[i]); + + kfree(chs->c); + chs->num = 0; +} + +int mlx5e_modify_tirs_packet_merge(struct mlx5e_priv *priv) +{ + struct mlx5e_rx_res *res = priv->rx_res; + + return mlx5e_rx_res_packet_merge_set_param(res, &priv->channels.params.packet_merge); +} + +static MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_modify_tirs_packet_merge); + +static int mlx5e_set_mtu(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, u16 mtu) +{ + u16 hw_mtu = MLX5E_SW2HW_MTU(params, mtu); + int err; + + err = mlx5_set_port_mtu(mdev, hw_mtu, 1); + if (err) + return err; + + /* Update vport context MTU */ + mlx5_modify_nic_vport_mtu(mdev, hw_mtu); + return 0; +} + +static void mlx5e_query_mtu(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, u16 *mtu) +{ + u16 hw_mtu = 0; + int err; + + err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu); + if (err || !hw_mtu) /* fallback to port oper mtu */ + mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1); + + *mtu = MLX5E_HW2SW_MTU(params, hw_mtu); +} + +int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv) +{ + struct mlx5e_params *params = &priv->channels.params; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + u16 mtu; + int err; + + err = mlx5e_set_mtu(mdev, params, params->sw_mtu); + if (err) + return err; + + mlx5e_query_mtu(mdev, params, &mtu); + if (mtu != params->sw_mtu) + netdev_warn(netdev, "%s: VPort MTU %d is different than netdev mtu %d\n", + __func__, mtu, params->sw_mtu); + + params->sw_mtu = mtu; + return 0; +} + +MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_set_dev_port_mtu); + +void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv) +{ + struct mlx5e_params *params = &priv->channels.params; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + u16 max_mtu; + + /* MTU range: 68 - hw-specific max */ + netdev->min_mtu = ETH_MIN_MTU; + + mlx5_query_port_max_mtu(mdev, &max_mtu, 1); + netdev->max_mtu = min_t(unsigned int, MLX5E_HW2SW_MTU(params, max_mtu), + ETH_MAX_MTU); +} + +static int mlx5e_netdev_set_tcs(struct net_device *netdev, u16 nch, u8 ntc, + struct netdev_tc_txq *tc_to_txq) +{ + int tc, err; + + netdev_reset_tc(netdev); + + if (ntc == 1) + return 0; + + err = netdev_set_num_tc(netdev, ntc); + if (err) { + netdev_WARN(netdev, "netdev_set_num_tc failed (%d), ntc = %d\n", err, ntc); + return err; + } + + for (tc = 0; tc < ntc; tc++) { + u16 count, offset; + + count = tc_to_txq[tc].count; + offset = tc_to_txq[tc].offset; + netdev_set_tc_queue(netdev, tc, count, offset); + } + + return 0; +} + +int mlx5e_update_tx_netdev_queues(struct mlx5e_priv *priv) +{ + int qos_queues, nch, ntc, num_txqs, err; + + qos_queues = mlx5e_qos_cur_leaf_nodes(priv); + + nch = priv->channels.params.num_channels; + ntc = mlx5e_get_dcb_num_tc(&priv->channels.params); + num_txqs = nch * ntc + qos_queues; + if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)) + num_txqs += ntc; + + mlx5e_dbg(DRV, priv, "Setting num_txqs %d\n", num_txqs); + err = netif_set_real_num_tx_queues(priv->netdev, num_txqs); + if (err) + netdev_warn(priv->netdev, "netif_set_real_num_tx_queues failed (%d > %d), %d\n", + num_txqs, priv->netdev->num_tx_queues, err); + + return 
err; +} + +static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv) +{ + struct netdev_tc_txq old_tc_to_txq[TC_MAX_QUEUE], *tc_to_txq; + struct net_device *netdev = priv->netdev; + int old_num_txqs, old_ntc; + int num_rxqs, nch, ntc; + int err; + int i; + + old_num_txqs = netdev->real_num_tx_queues; + old_ntc = netdev->num_tc ? : 1; + for (i = 0; i < ARRAY_SIZE(old_tc_to_txq); i++) + old_tc_to_txq[i] = netdev->tc_to_txq[i]; + + nch = priv->channels.params.num_channels; + ntc = priv->channels.params.mqprio.num_tc; + num_rxqs = nch * priv->profile->rq_groups; + tc_to_txq = priv->channels.params.mqprio.tc_to_txq; + + err = mlx5e_netdev_set_tcs(netdev, nch, ntc, tc_to_txq); + if (err) + goto err_out; + err = mlx5e_update_tx_netdev_queues(priv); + if (err) + goto err_tcs; + err = netif_set_real_num_rx_queues(netdev, num_rxqs); + if (err) { + netdev_warn(netdev, "netif_set_real_num_rx_queues failed, %d\n", err); + goto err_txqs; + } + + return 0; + +err_txqs: + /* netif_set_real_num_rx_queues could fail only when nch increased. Only + * one of nch and ntc is changed in this function. That means, the call + * to netif_set_real_num_tx_queues below should not fail, because it + * decreases the number of TX queues. + */ + WARN_ON_ONCE(netif_set_real_num_tx_queues(netdev, old_num_txqs)); + +err_tcs: + WARN_ON_ONCE(mlx5e_netdev_set_tcs(netdev, old_num_txqs / old_ntc, old_ntc, + old_tc_to_txq)); +err_out: + return err; +} + +static MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_update_netdev_queues); + +static void mlx5e_set_default_xps_cpumasks(struct mlx5e_priv *priv, + struct mlx5e_params *params) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int num_comp_vectors, ix, irq; + + num_comp_vectors = mlx5_comp_vectors_count(mdev); + + for (ix = 0; ix < params->num_channels; ix++) { + cpumask_clear(priv->scratchpad.cpumask); + + for (irq = ix; irq < num_comp_vectors; irq += params->num_channels) { + int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(mdev, irq)); + + cpumask_set_cpu(cpu, priv->scratchpad.cpumask); + } + + netif_set_xps_queue(priv->netdev, priv->scratchpad.cpumask, ix); + } +} + +static int mlx5e_num_channels_changed(struct mlx5e_priv *priv) +{ + u16 count = priv->channels.params.num_channels; + int err; + + err = mlx5e_update_netdev_queues(priv); + if (err) + return err; + + mlx5e_set_default_xps_cpumasks(priv, &priv->channels.params); + + /* This function may be called on attach, before priv->rx_res is created. */ + if (!netif_is_rxfh_configured(priv->netdev) && priv->rx_res) + mlx5e_rx_res_rss_set_indir_uniform(priv->rx_res, count); + + return 0; +} + +MLX5E_DEFINE_PREACTIVATE_WRAPPER_CTX(mlx5e_num_channels_changed); + +void mlx5e_build_txq_maps(struct mlx5e_priv *priv) +{ + int i, ch, tc, num_tc; + + ch = priv->channels.num; + num_tc = mlx5e_get_dcb_num_tc(&priv->channels.params); + + for (i = 0; i < ch; i++) { + for (tc = 0; tc < num_tc; tc++) { + struct mlx5e_channel *c = priv->channels.c[i]; + struct mlx5e_txqsq *sq = &c->sq[tc]; + + priv->txq2sq[sq->txq_ix] = sq; + } + } + + if (!priv->channels.ptp) + goto out; + + if (!test_bit(MLX5E_PTP_STATE_TX, priv->channels.ptp->state)) + goto out; + + for (tc = 0; tc < num_tc; tc++) { + struct mlx5e_ptp *c = priv->channels.ptp; + struct mlx5e_txqsq *sq = &c->ptpsq[tc].txqsq; + + priv->txq2sq[sq->txq_ix] = sq; + } + +out: + /* Make the change to txq2sq visible before the queue is started. + * As mlx5e_xmit runs under a spinlock, there is an implicit ACQUIRE, + * which pairs with this barrier. 
+ */ + smp_wmb(); +} + +void mlx5e_build_selq(struct mlx5e_select_queue_params *selq, + struct mlx5e_params *params, bool htb) +{ + selq->num_channels = params->num_channels; + selq->num_tcs = params->mqprio.num_tc; + selq->num_regular_queues = selq->num_channels * selq->num_tcs; + selq->is_htb = htb; + selq->is_ptp = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_TX_PORT_TS); +} + +void mlx5e_replace_selq(struct mlx5e_priv *priv, struct mlx5e_select_queue_params *selq) +{ + struct mlx5e_select_queue_params *old_selq; + + old_selq = rcu_replace_pointer(priv->selq, selq, lockdep_is_held(&priv->state_lock)); + synchronize_net(); /* Wait until ndo_select_queue starts emitting correct values. */ + kvfree(old_selq); +} + +static void mlx5e_priv_channels_activate_rx(struct mlx5e_priv *priv) +{ + if (priv->shared_rq) + return; + + mlx5e_wait_channels_min_rx_wqes(&priv->channels); + + if (priv->rx_res) + mlx5e_rx_res_channels_activate(priv->rx_res, &priv->channels); + +} + +void mlx5e_activate_priv_channels(struct mlx5e_priv *priv) +{ + mlx5e_build_txq_maps(priv); + mlx5e_activate_channels(&priv->channels); + mlx5e_qos_activate_queues(priv); + mlx5e_xdp_tx_enable(priv); + + /* dev_watchdog() wants all TX queues to be started when the carrier is + * OK, including the ones in range real_num_tx_queues..num_tx_queues-1. + * Make it happy to avoid TX timeout false alarms. + */ + netif_tx_start_all_queues(priv->netdev); + + if (mlx5e_is_vport_rep(priv)) + mlx5e_rep_activate_channels(priv); + + mlx5e_priv_channels_activate_rx(priv); +} + +static void mlx5e_deactivate_priv_channels_rx(struct mlx5e_priv *priv) +{ + if (priv->shared_rq) + return; + + if (priv->rx_res) + mlx5e_rx_res_channels_deactivate(priv->rx_res); +} + +void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv) +{ + mlx5e_deactivate_priv_channels_rx(priv); + + if (mlx5e_is_vport_rep(priv)) + mlx5e_rep_deactivate_channels(priv); + + /* The results of ndo_select_queue are unreliable, while netdev config + * is being changed (real_num_tx_queues, num_tc). Stop all queues to + * prevent ndo_start_xmit from being called, so that it can assume that + * the selected queue is always valid. + */ + netif_tx_disable(priv->netdev); + + mlx5e_xdp_tx_disable(priv); + mlx5e_deactivate_channels(&priv->channels); +} + +static int mlx5e_switch_priv_params(struct mlx5e_priv *priv, + struct mlx5e_params *new_params, + mlx5e_fp_preactivate preactivate, + void *context) +{ + struct mlx5e_params old_params; + + old_params = priv->channels.params; + priv->channels.params = *new_params; + + if (preactivate) { + int err; + + err = preactivate(priv, context); + if (err) { + priv->channels.params = old_params; + return err; + } + } + + return 0; +} + +static int mlx5e_switch_priv_channels(struct mlx5e_priv *priv, + struct mlx5e_channels *new_chs, + struct mlx5e_select_queue_params *selq, + mlx5e_fp_preactivate preactivate, + void *context) +{ + struct net_device *netdev = priv->netdev; + struct mlx5e_channels old_chs; + int carrier_ok; + int err = 0; + + carrier_ok = netif_carrier_ok(netdev); + netif_carrier_off(netdev); + + mlx5e_deactivate_priv_channels(priv); + + err = mlx5e_open_channels(priv, new_chs); + if (err) + goto out; + + old_chs = priv->channels; + priv->channels = *new_chs; + + /* New channels are ready to roll, call the preactivate hook if needed + * to modify HW settings or update kernel parameters. 
+ */ + if (preactivate) { + err = preactivate(priv, context); + if (err) { + mlx5e_close_channels(new_chs); + priv->channels = old_chs; + goto out; + } + } + + mlx5e_close_channels(&old_chs); + priv->profile->update_rx(priv); + mlx5e_replace_selq(priv, selq); +out: + mlx5e_activate_priv_channels(priv); + + /* return carrier back if needed */ + if (carrier_ok) + netif_carrier_on(netdev); + + return err; +} + +int mlx5e_safe_switch_params(struct mlx5e_priv *priv, + struct mlx5e_params *params, + mlx5e_fp_preactivate preactivate, + void *context, bool reset) +{ + struct mlx5e_select_queue_params *selq; + struct mlx5e_channels new_chs = {}; + int err; + + reset &= test_bit(MLX5E_STATE_OPENED, &priv->state); + if (!reset) + return mlx5e_switch_priv_params(priv, params, preactivate, context); + + new_chs.params = *params; + + selq = kvzalloc(sizeof(*selq), GFP_KERNEL); + if (!selq) + return -ENOMEM; + mlx5e_build_selq(selq, &new_chs.params, !!priv->htb.maj_id); + + err = mlx5e_switch_priv_channels(priv, &new_chs, selq, preactivate, context); + if (err) + goto err_close; + + return 0; + +err_close: + kvfree(selq); + return err; +} + +int mlx5e_safe_reopen_channels(struct mlx5e_priv *priv) +{ + return mlx5e_safe_switch_params(priv, &priv->channels.params, NULL, NULL, true); +} + +void mlx5e_timestamp_init(struct mlx5e_priv *priv) +{ + priv->tstamp.tx_type = HWTSTAMP_TX_OFF; + priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE; +} + +static void mlx5e_modify_admin_state(struct mlx5_core_dev *mdev, + enum mlx5_port_status state) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + int vport_admin_state; + + mlx5_set_port_admin_status(mdev, state); + + if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS || + !MLX5_CAP_GEN(mdev, uplink_follow)) + return; + + if (state == MLX5_PORT_UP) + vport_admin_state = MLX5_VPORT_ADMIN_STATE_AUTO; + else + vport_admin_state = MLX5_VPORT_ADMIN_STATE_DOWN; + + mlx5_eswitch_set_vport_state(esw, MLX5_VPORT_UPLINK, vport_admin_state); +} + +int mlx5e_open_locked(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_select_queue_params *selq; + int err; + + selq = kvzalloc(sizeof(*selq), GFP_KERNEL); + if (!selq) + return -ENOMEM; + mlx5e_build_selq(selq, &priv->channels.params, !!priv->htb.maj_id); + + set_bit(MLX5E_STATE_OPENED, &priv->state); + + err = mlx5e_open_channels(priv, &priv->channels); + if (err) + goto err_clear_state_opened_flag; + + priv->profile->update_rx(priv); + mlx5e_replace_selq(priv, selq); + mlx5e_activate_priv_channels(priv); + if (!mlx5e_is_uplink_rep(priv) && !mlx5e_is_vport_rep(priv)) + mlx5e_create_debugfs(priv); + mlx5e_apply_traps(priv, true); + if (priv->profile->update_carrier) + priv->profile->update_carrier(priv); + + mlx5e_queue_update_stats(priv); + return 0; + +err_clear_state_opened_flag: + clear_bit(MLX5E_STATE_OPENED, &priv->state); + kvfree(selq); + return err; +} + +int mlx5e_open(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_open_locked(netdev); + if (!err) + mlx5e_modify_admin_state(priv->mdev, MLX5_PORT_UP); + mutex_unlock(&priv->state_lock); + + return err; +} + +int mlx5e_close_locked(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + /* May already be CLOSED in case a previous configuration operation + * (e.g RX/TX queue size change) that involves close&open failed. 
+	 */
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return 0;
+
+	mlx5e_apply_traps(priv, false);
+	clear_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	netif_carrier_off(priv->netdev);
+	if (!mlx5e_is_uplink_rep(priv) && !mlx5e_is_vport_rep(priv))
+		mlx5e_destroy_debugfs(priv);
+	mlx5e_deactivate_priv_channels(priv);
+	mlx5e_close_channels(&priv->channels);
+
+	return 0;
+}
+
+int mlx5e_close(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	if (!netif_device_present(netdev))
+		return -ENODEV;
+
+	mutex_lock(&priv->state_lock);
+	mlx5e_modify_admin_state(priv->mdev, MLX5_PORT_DOWN);
+	err = mlx5e_close_locked(netdev);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static void mlx5e_free_drop_rq(struct mlx5e_rq *rq)
+{
+	mlx5_wq_destroy(&rq->wq_ctrl);
+}
+
+static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
+			       struct mlx5e_rq *rq,
+			       struct mlx5e_rq_param *param)
+{
+	void *rqc = param->rqc;
+	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	int err;
+
+	param->wq.db_numa_node = param->wq.buf_numa_node;
+
+	err = mlx5_wq_cyc_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
+				 &rq->wq_ctrl);
+	if (err)
+		return err;
+
+	/* Mark as unused given "Drop-RQ" packets never reach XDP */
+	xdp_rxq_info_unused(&rq->xdp_rxq);
+
+	rq->mdev = mdev;
+
+	return 0;
+}
+
+static int mlx5e_alloc_drop_cq(struct mlx5e_priv *priv,
+			       struct mlx5e_cq *cq,
+			       struct mlx5e_cq_param *param)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev));
+	param->wq.db_numa_node = dev_to_node(mlx5_core_dma_dev(mdev));
+
+	return mlx5e_alloc_cq_common(priv, param, cq);
+}
+
+int mlx5e_open_drop_rq(struct mlx5e_priv *priv,
+		       struct mlx5e_rq *drop_rq)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_cq_param cq_param = {};
+	struct mlx5e_rq_param rq_param = {};
+	struct mlx5e_cq *cq = &drop_rq->cq;
+	int err;
+
+	mlx5e_build_drop_rq_param(mdev, priv->drop_rq_q_counter, &rq_param);
+
+	err = mlx5e_alloc_drop_cq(priv, cq, &cq_param);
+	if (err)
+		return err;
+
+	err = mlx5e_create_cq(cq, &cq_param);
+	if (err)
+		goto err_free_cq;
+
+	err = mlx5e_alloc_drop_rq(mdev, drop_rq, &rq_param);
+	if (err)
+		goto err_destroy_cq;
+
+	err = mlx5e_create_rq(drop_rq, &rq_param);
+	if (err)
+		goto err_free_rq;
+
+	err = mlx5e_modify_rq_state(drop_rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err)
+		mlx5_core_warn(priv->mdev, "modify_rq_state failed, rx_if_down_packets won't be counted %d\n", err);
+
+	return 0;
+
+err_free_rq:
+	mlx5e_free_drop_rq(drop_rq);
+
+err_destroy_cq:
+	mlx5e_destroy_cq(cq);
+
+err_free_cq:
+	mlx5e_free_cq(cq);
+
+	return err;
+}
+
+void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq)
+{
+	mlx5e_destroy_rq(drop_rq);
+	mlx5e_free_drop_rq(drop_rq);
+	mlx5e_destroy_cq(&drop_rq->cq);
+	mlx5e_free_cq(&drop_rq->cq);
+}
+
+int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn)
+{
+	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+	MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
+
+	if (MLX5_GET(tisc, tisc, tls_en))
+		MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.hw_objs.pdn);
+
+	if (mlx5_lag_is_lacp_owner(mdev))
+		MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
+
+	return mlx5_core_create_tis(mdev, in, tisn);
+}
+
+void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn)
+{
+	mlx5_core_destroy_tis(mdev, tisn);
+}
+
+void mlx5e_destroy_tises(struct mlx5e_priv *priv)
+{
+	int tc, i;
+
+	for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++)
+		for (tc = 0;
tc < priv->profile->max_tc; tc++) + mlx5e_destroy_tis(priv->mdev, priv->tisn[i][tc]); +} + +static bool mlx5e_lag_should_assign_affinity(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_GEN(mdev, lag_tx_port_affinity) && mlx5e_get_num_lag_ports(mdev) > 1; +} + +int mlx5e_create_tises(struct mlx5e_priv *priv) +{ + int tc, i; + int err; + + for (i = 0; i < mlx5e_get_num_lag_ports(priv->mdev); i++) { + for (tc = 0; tc < priv->profile->max_tc; tc++) { + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; + void *tisc; + + tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + MLX5_SET(tisc, tisc, prio, tc << 1); + + if (mlx5e_lag_should_assign_affinity(priv->mdev)) + MLX5_SET(tisc, tisc, lag_tx_port_affinity, i + 1); + + err = mlx5e_create_tis(priv->mdev, in, &priv->tisn[i][tc]); + if (err) + goto err_close_tises; + } + } + + return 0; + +err_close_tises: + for (; i >= 0; i--) { + for (tc--; tc >= 0; tc--) + mlx5e_destroy_tis(priv->mdev, priv->tisn[i][tc]); + tc = priv->profile->max_tc; + } + + return err; +} + +static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv) +{ + if (priv->mqprio_rl) { + mlx5e_mqprio_rl_cleanup(priv->mqprio_rl); + mlx5e_mqprio_rl_free(priv->mqprio_rl); + priv->mqprio_rl = NULL; + } + mlx5e_destroy_tises(priv); +} + +static int mlx5e_modify_channels_scatter_fcs(struct mlx5e_channels *chs, bool enable) +{ + int err = 0; + int i; + + for (i = 0; i < chs->num; i++) { + err = mlx5e_modify_rq_scatter_fcs(&chs->c[i]->rq, enable); + if (err) + return err; + } + + return 0; +} + +static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) +{ + int err; + int i; + + for (i = 0; i < chs->num; i++) { + err = mlx5e_modify_rq_vsd(&chs->c[i]->rq, vsd); + if (err) + return err; + } + if (chs->ptp && test_bit(MLX5E_PTP_STATE_RX, chs->ptp->state)) + return mlx5e_modify_rq_vsd(&chs->ptp->rq, vsd); + + return 0; +} + +static void mlx5e_mqprio_build_default_tc_to_txq(struct netdev_tc_txq *tc_to_txq, + int ntc, int nch) +{ + int tc; + + memset(tc_to_txq, 0, sizeof(*tc_to_txq) * TC_MAX_QUEUE); + + /* Map netdev TCs to offset 0. 
+ * We have our own UP to TXQ mapping for DCB mode of QoS + */ + for (tc = 0; tc < ntc; tc++) { + tc_to_txq[tc] = (struct netdev_tc_txq) { + .count = nch, + .offset = 0, + }; + } +} + +static void mlx5e_mqprio_build_tc_to_txq(struct netdev_tc_txq *tc_to_txq, + struct tc_mqprio_qopt *qopt) +{ + int tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + tc_to_txq[tc] = (struct netdev_tc_txq) { + .count = qopt->count[tc], + .offset = qopt->offset[tc], + }; + } +} + +static void mlx5e_params_mqprio_dcb_set(struct mlx5e_params *params, u8 num_tc) +{ + params->mqprio.mode = TC_MQPRIO_MODE_DCB; + params->mqprio.num_tc = num_tc; + mlx5e_mqprio_build_default_tc_to_txq(params->mqprio.tc_to_txq, num_tc, + params->num_channels); +} + +static void mlx5e_mqprio_rl_update_params(struct mlx5e_params *params, + struct mlx5e_mqprio_rl *rl) +{ + int tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + u32 hw_id = 0; + + if (rl) + mlx5e_mqprio_rl_get_node_hw_id(rl, tc, &hw_id); + params->mqprio.channel.hw_id[tc] = hw_id; + } +} + +static void mlx5e_params_mqprio_channel_set(struct mlx5e_params *params, + struct tc_mqprio_qopt_offload *mqprio, + struct mlx5e_mqprio_rl *rl) +{ + int tc; + + params->mqprio.mode = TC_MQPRIO_MODE_CHANNEL; + params->mqprio.num_tc = mqprio->qopt.num_tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) + params->mqprio.channel.max_rate[tc] = mqprio->max_rate[tc]; + + mlx5e_mqprio_rl_update_params(params, rl); + mlx5e_mqprio_build_tc_to_txq(params->mqprio.tc_to_txq, &mqprio->qopt); +} + +static void mlx5e_params_mqprio_reset(struct mlx5e_params *params) +{ + mlx5e_params_mqprio_dcb_set(params, 1); +} + +static int mlx5e_setup_tc_mqprio_dcb(struct mlx5e_priv *priv, + struct tc_mqprio_qopt *mqprio) +{ + struct mlx5e_params new_params; + u8 tc = mqprio->num_tc; + int err; + + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + if (tc && tc != MLX5E_MAX_NUM_TC +#ifdef CONFIG_MLX5_CORE_EN_DCB + && priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_PCP +#endif + ) + return -EINVAL; + + new_params = priv->channels.params; + mlx5e_params_mqprio_dcb_set(&new_params, tc ? 
tc : 1); + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_PCP) + priv->pcp_tc_num = tc; +#endif + + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_num_channels_changed_ctx, NULL, true); + + if (!err && priv->mqprio_rl) { + mlx5e_mqprio_rl_cleanup(priv->mqprio_rl); + mlx5e_mqprio_rl_free(priv->mqprio_rl); + priv->mqprio_rl = NULL; + } + + priv->max_opened_tc = max_t(u8, priv->max_opened_tc, + mlx5e_get_dcb_num_tc(&priv->channels.params)); + return err; +} + +static int mlx5e_mqprio_channel_validate(struct mlx5e_priv *priv, + struct tc_mqprio_qopt_offload *mqprio) +{ + struct net_device *netdev = priv->netdev; + struct mlx5e_ptp *ptp_channel; + int agg_count = 0; + int i; + + ptp_channel = priv->channels.ptp; + if (ptp_channel && test_bit(MLX5E_PTP_STATE_TX, ptp_channel->state)) { + netdev_err(netdev, + "Cannot activate MQPRIO mode channel since it conflicts with TX port TS\n"); + return -EINVAL; + } + + if (mqprio->qopt.offset[0] != 0 || mqprio->qopt.num_tc < 1 || + mqprio->qopt.num_tc > MLX5E_MAX_NUM_MQPRIO_CH_TC) + return -EINVAL; + + for (i = 0; i < mqprio->qopt.num_tc; i++) { + if (!mqprio->qopt.count[i]) { + netdev_err(netdev, "Zero size for queue-group (%d) is not supported\n", i); + return -EINVAL; + } + if (mqprio->min_rate[i]) { + netdev_err(netdev, "Min tx rate is not supported\n"); + return -EINVAL; + } + + if (mqprio->max_rate[i]) { + int err; + + err = mlx5e_qos_bytes_rate_check(priv->mdev, mqprio->max_rate[i]); + if (err) + return err; + } + + if (mqprio->qopt.offset[i] != agg_count) { + netdev_err(netdev, "Discontinuous queues config is not supported\n"); + return -EINVAL; + } + agg_count += mqprio->qopt.count[i]; + } + + if (priv->channels.params.num_channels != agg_count) { + netdev_err(netdev, "Num of queues (%d) does not match available (%d)\n", + agg_count, priv->channels.params.num_channels); + return -EINVAL; + } + + return 0; +} + +static bool mlx5e_mqprio_rate_limit(u8 num_tc, u64 max_rate[]) +{ + int tc; + + for (tc = 0; tc < num_tc; tc++) + if (max_rate[tc]) + return true; + return false; +} + +static struct mlx5e_mqprio_rl *mlx5e_mqprio_rl_create(struct mlx5_core_dev *mdev, + u8 num_tc, u64 max_rate[]) +{ + struct mlx5e_mqprio_rl *rl; + int err; + + if (!mlx5e_mqprio_rate_limit(num_tc, max_rate)) + return NULL; + + rl = mlx5e_mqprio_rl_alloc(); + if (!rl) + return ERR_PTR(-ENOMEM); + + err = mlx5e_mqprio_rl_init(rl, mdev, num_tc, max_rate); + if (err) { + mlx5e_mqprio_rl_free(rl); + return ERR_PTR(err); + } + + return rl; +} + +static int mlx5e_setup_tc_mqprio_channel(struct mlx5e_priv *priv, + struct tc_mqprio_qopt_offload *mqprio) +{ + mlx5e_fp_preactivate preactivate; + struct mlx5e_params new_params; + struct mlx5e_mqprio_rl *rl; + bool nch_changed; + int err; + + err = mlx5e_mqprio_channel_validate(priv, mqprio); + if (err) + return err; + + rl = mlx5e_mqprio_rl_create(priv->mdev, mqprio->qopt.num_tc, mqprio->max_rate); + if (IS_ERR(rl)) + return PTR_ERR(rl); + + new_params = priv->channels.params; + mlx5e_params_mqprio_channel_set(&new_params, mqprio, rl); + + nch_changed = mlx5e_get_dcb_num_tc(&priv->channels.params) > 1; + preactivate = nch_changed ? 
mlx5e_num_channels_changed_ctx : + mlx5e_update_netdev_queues_ctx; + err = mlx5e_safe_switch_params(priv, &new_params, preactivate, NULL, true); + if (err) { + if (rl) { + mlx5e_mqprio_rl_cleanup(rl); + mlx5e_mqprio_rl_free(rl); + } + return err; + } + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (!err && priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_PCP) + priv->pcp_tc_num = mqprio->qopt.num_tc; +#endif + + if (priv->mqprio_rl) { + mlx5e_mqprio_rl_cleanup(priv->mqprio_rl); + mlx5e_mqprio_rl_free(priv->mqprio_rl); + } + priv->mqprio_rl = rl; + + return 0; +} + +int mlx5e_setup_tc_mqprio(struct mlx5e_priv *priv, + struct tc_mqprio_qopt_offload *mqprio) +{ + /* MQPRIO is another toplevel qdisc that can't be attached + * simultaneously with the offloaded HTB. + */ + if (WARN_ON(priv->htb.maj_id)) + return -EINVAL; + + switch (mqprio->mode) { + case TC_MQPRIO_MODE_DCB: + return mlx5e_setup_tc_mqprio_dcb(priv, &mqprio->qopt); + case TC_MQPRIO_MODE_CHANNEL: + return mlx5e_setup_tc_mqprio_channel(priv, mqprio); + default: + return -EOPNOTSUPP; + } +} + +static int mlx5e_setup_tc_htb(struct mlx5e_priv *priv, struct tc_htb_qopt_offload *htb) +{ + int res; + + switch (htb->command) { + case TC_HTB_CREATE: + return mlx5e_htb_root_add(priv, htb->parent_classid, htb->classid, + htb->extack); + case TC_HTB_DESTROY: + return mlx5e_htb_root_del(priv); + case TC_HTB_LEAF_ALLOC_QUEUE: + res = mlx5e_htb_leaf_alloc_queue(priv, htb->classid, htb->parent_classid, + htb->rate, htb->ceil, htb->extack); + if (res < 0) + return res; + htb->qid = res; + return 0; + case TC_HTB_LEAF_TO_INNER: + return mlx5e_htb_leaf_to_inner(priv, htb->parent_classid, htb->classid, + htb->rate, htb->ceil, htb->extack); + case TC_HTB_LEAF_DEL: + return mlx5e_htb_leaf_del(priv, &htb->classid, htb->extack); + case TC_HTB_LEAF_DEL_LAST: + case TC_HTB_LEAF_DEL_LAST_FORCE: + return mlx5e_htb_leaf_del_last(priv, htb->classid, + htb->command == TC_HTB_LEAF_DEL_LAST_FORCE, + htb->extack); + case TC_HTB_NODE_MODIFY: + return mlx5e_htb_node_modify(priv, htb->classid, htb->rate, htb->ceil, + htb->extack); + case TC_HTB_LEAF_QUERY_QUEUE: + res = mlx5e_get_txq_by_classid(priv, htb->classid); + if (res < 0) + return res; + htb->qid = res; + return 0; + default: + return -EOPNOTSUPP; + } +} + +static LIST_HEAD(mlx5e_block_cb_list); + +static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + bool tc_unbind = false; + int err; + + if (type == TC_SETUP_BLOCK && + ((struct flow_block_offload *)type_data)->command == FLOW_BLOCK_UNBIND) + tc_unbind = true; + + if (!netif_device_present(dev) && !tc_unbind) + return -ENODEV; + + switch (type) { + case TC_SETUP_BLOCK: { + struct flow_block_offload *f = type_data; + + f->unlocked_driver_cb = true; + return flow_block_cb_setup_simple(type_data, + &mlx5e_block_cb_list, + mlx5e_setup_tc_block_cb, + priv, priv, true); + } + case TC_SETUP_QDISC_MQPRIO: + mutex_lock(&priv->state_lock); + err = mlx5e_setup_tc_mqprio(priv, type_data); + mutex_unlock(&priv->state_lock); + return err; + case TC_SETUP_QDISC_HTB: + mutex_lock(&priv->state_lock); + err = mlx5e_setup_tc_htb(priv, type_data); + mutex_unlock(&priv->state_lock); + return err; + default: + return -EOPNOTSUPP; + } +} + +void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s) +{ + int i; + + for (i = 0; i < priv->stats_nch; i++) { + struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i]; + struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq; 
+ struct mlx5e_rq_stats *rq_stats = &channel_stats->rq; + int j; + + s->rx_packets += rq_stats->packets + xskrq_stats->packets; + s->rx_bytes += rq_stats->bytes + xskrq_stats->bytes; + s->multicast += rq_stats->mcast_packets + xskrq_stats->mcast_packets; + + for (j = 0; j < priv->max_opened_tc; j++) { + struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j]; + + s->tx_packets += sq_stats->packets; + s->tx_bytes += sq_stats->bytes; + s->tx_dropped += sq_stats->dropped; + } + } + if (priv->tx_ptp_opened) { + for (i = 0; i < priv->max_opened_tc; i++) { + struct mlx5e_sq_stats *sq_stats = &priv->ptp_stats.sq[i]; + + s->tx_packets += sq_stats->packets; + s->tx_bytes += sq_stats->bytes; + s->tx_dropped += sq_stats->dropped; + } + } + if (priv->rx_ptp_opened) { + struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq; + + s->rx_packets += rq_stats->packets; + s->rx_bytes += rq_stats->bytes; + s->multicast += rq_stats->mcast_packets; + } +} + +void +mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + + if (!netif_device_present(dev)) + return; + + /* In switchdev mode, monitor counters doesn't monitor + * rx/tx stats of 802_3. The update stats mechanism + * should keep the 802_3 layout counters updated + */ + if (!mlx5e_monitor_counter_supported(priv) || + mlx5e_is_uplink_rep(priv)) { + /* update HW stats in background for next time */ + mlx5e_queue_update_stats(priv); + } + + if (mlx5e_is_uplink_rep(priv)) { + struct mlx5e_vport_stats *vstats = &priv->stats.vport; + + stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok); + stats->rx_bytes = PPORT_802_3_GET(pstats, a_octets_received_ok); + stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok); + stats->tx_bytes = PPORT_802_3_GET(pstats, a_octets_transmitted_ok); + + /* vport multicast also counts packets that are dropped due to steering + * or rx out of buffer + */ + stats->multicast = VPORT_COUNTER_GET(vstats, received_eth_multicast.packets); + } else { + mlx5e_fold_sw_stats64(priv, stats); + } + + stats->rx_dropped = priv->stats.qcnt.rx_out_of_buffer; + + stats->rx_length_errors = + PPORT_802_3_GET(pstats, a_in_range_length_errors) + + PPORT_802_3_GET(pstats, a_out_of_range_length_field) + + PPORT_802_3_GET(pstats, a_frame_too_long_errors); + stats->rx_crc_errors = + PPORT_802_3_GET(pstats, a_frame_check_sequence_errors); + stats->rx_frame_errors = PPORT_802_3_GET(pstats, a_alignment_errors); + stats->tx_aborted_errors = PPORT_2863_GET(pstats, if_out_discards); + stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors + + stats->rx_frame_errors; + stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors; +} + +static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv) +{ + if (mlx5e_is_uplink_rep(priv)) + return; /* no rx mode for uplink rep */ + + queue_work(priv->wq, &priv->set_rx_mode_work); +} + +static void mlx5e_set_rx_mode(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_nic_set_rx_mode(priv); +} + +static int mlx5e_set_mac(struct net_device *netdev, void *addr) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct sockaddr *saddr = addr; + + if (!is_valid_ether_addr(saddr->sa_data)) + return -EADDRNOTAVAIL; + + netif_addr_lock_bh(netdev); + eth_hw_addr_set(netdev, saddr->sa_data); + netif_addr_unlock_bh(netdev); + + mlx5e_nic_set_rx_mode(priv); + + return 0; +} + +#define MLX5E_SET_FEATURE(features, feature, enable) \ + 
do { \ + if (enable) \ + *features |= feature; \ + else \ + *features &= ~feature; \ + } while (0) + +typedef int (*mlx5e_feature_handler)(struct net_device *netdev, bool enable); + +static int set_feature_lro(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params *cur_params; + struct mlx5e_params new_params; + bool reset = true; + int err = 0; + + mutex_lock(&priv->state_lock); + + if (enable && priv->xsk.refcnt) { + netdev_warn(netdev, "LRO is incompatible with AF_XDP (%u XSKs are active)\n", + priv->xsk.refcnt); + err = -EINVAL; + goto out; + } + + cur_params = &priv->channels.params; + if (enable && !MLX5E_GET_PFLAG(cur_params, MLX5E_PFLAG_RX_STRIDING_RQ)) { + netdev_warn(netdev, "can't set LRO with legacy RQ\n"); + err = -EINVAL; + goto out; + } + + new_params = *cur_params; + + if (enable) + new_params.packet_merge.type = MLX5E_PACKET_MERGE_LRO; + else if (new_params.packet_merge.type == MLX5E_PACKET_MERGE_LRO) + new_params.packet_merge.type = MLX5E_PACKET_MERGE_NONE; + else + goto out; + + if (!(cur_params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO && + new_params.packet_merge.type == MLX5E_PACKET_MERGE_LRO)) { + if (cur_params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { + if (mlx5e_rx_mpwqe_is_linear_skb(mdev, cur_params, NULL) == + mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_params, NULL)) + reset = false; + } + } + + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_modify_tirs_packet_merge_ctx, NULL, reset); +out: + mutex_unlock(&priv->state_lock); + return err; +} + +static int set_feature_hw_gro(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_params new_params; + bool reset = true; + int err = 0; + + mutex_lock(&priv->state_lock); + new_params = priv->channels.params; + + if (enable) { + new_params.packet_merge.type = MLX5E_PACKET_MERGE_SHAMPO; + new_params.packet_merge.shampo.match_criteria_type = + MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED; + new_params.packet_merge.shampo.alignment_granularity = + MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_STRIDE; + } else if (new_params.packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) { + new_params.packet_merge.type = MLX5E_PACKET_MERGE_NONE; + } else { + goto out; + } + + err = mlx5e_safe_switch_params(priv, &new_params, + mlx5e_modify_tirs_packet_merge_ctx, NULL, reset); +out: + mutex_unlock(&priv->state_lock); + return err; +} + +static int set_feature_cvlan_filter(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (enable) + mlx5e_enable_cvlan_filter(priv); + else + mlx5e_disable_cvlan_filter(priv); + + return 0; +} + +static int set_feature_hw_tc(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + int tc_flag = mlx5e_is_uplink_rep(priv) ? 
MLX5_TC_FLAG(ESW_OFFLOAD) : + MLX5_TC_FLAG(NIC_OFFLOAD); + if (!enable && mlx5e_tc_num_filters(priv, tc_flag)) { + netdev_err(netdev, + "Active offloaded tc filters, can't turn hw_tc_offload off\n"); + return -EINVAL; + } +#endif + + if (!enable && priv->htb.maj_id) { + netdev_err(netdev, "Active HTB offload, can't turn hw_tc_offload off\n"); + return -EINVAL; + } + + return 0; +} + +static int set_feature_rx_all(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_set_port_fcs(mdev, !enable); +} + +static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {}; + bool supported, curr_state; + int err; + + if (!MLX5_CAP_GEN(mdev, ports_check)) + return 0; + + err = mlx5_query_ports_check(mdev, in, sizeof(in)); + if (err) + return err; + + supported = MLX5_GET(pcmr_reg, in, rx_ts_over_crc_cap); + curr_state = MLX5_GET(pcmr_reg, in, rx_ts_over_crc); + + if (!supported || enable == curr_state) + return 0; + + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, rx_ts_over_crc, enable); + + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + +static int set_feature_rx_fcs(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_channels *chs = &priv->channels; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + mutex_lock(&priv->state_lock); + + if (enable) { + err = mlx5e_set_rx_port_ts(mdev, false); + if (err) + goto out; + + chs->params.scatter_fcs_en = true; + err = mlx5e_modify_channels_scatter_fcs(chs, true); + if (err) { + chs->params.scatter_fcs_en = false; + mlx5e_set_rx_port_ts(mdev, true); + } + } else { + chs->params.scatter_fcs_en = false; + err = mlx5e_modify_channels_scatter_fcs(chs, false); + if (err) { + chs->params.scatter_fcs_en = true; + goto out; + } + err = mlx5e_set_rx_port_ts(mdev, true); + if (err) { + mlx5_core_warn(mdev, "Failed to set RX port timestamp %d\n", err); + err = 0; + } + } + +out: + mutex_unlock(&priv->state_lock); + return err; +} + +static int set_feature_rx_vlan(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err = 0; + + mutex_lock(&priv->state_lock); + + priv->channels.params.vlan_strip_disable = !enable; + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + err = mlx5e_modify_channels_vsd(&priv->channels, !enable); + if (err) + priv->channels.params.vlan_strip_disable = enable; + +unlock: + mutex_unlock(&priv->state_lock); + + return err; +} + +#ifdef CONFIG_MLX5_EN_ARFS +static int set_feature_arfs(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + if (enable) + err = mlx5e_arfs_enable(priv); + else + err = mlx5e_arfs_disable(priv); + + return err; +} +#endif + +static int mlx5e_handle_feature(struct net_device *netdev, + netdev_features_t *features, + netdev_features_t feature, + mlx5e_feature_handler feature_handler) +{ + netdev_features_t changes = *features ^ netdev->features; + bool enable = !!(*features & feature); + int err; + + if (!(changes & feature)) + return 0; + + err = feature_handler(netdev, enable); + if (err) { + MLX5E_SET_FEATURE(features, feature, !enable); + netdev_err(netdev, "%s feature %pNF failed, err %d\n", + enable ? 
"Enable" : "Disable", &feature, err); + return err; + } + + return 0; +} + +int mlx5e_set_features(struct net_device *netdev, netdev_features_t features) +{ + netdev_features_t oper_features = features; + int err = 0; + +#define MLX5E_HANDLE_FEATURE(feature, handler) \ + mlx5e_handle_feature(netdev, &oper_features, feature, handler) + + err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_GRO_HW, set_feature_hw_gro); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER, + set_feature_cvlan_filter); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_hw_tc); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs); + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); +#ifdef CONFIG_MLX5_EN_ARFS + err |= MLX5E_HANDLE_FEATURE(NETIF_F_NTUPLE, set_feature_arfs); +#endif + err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TLS_RX, mlx5e_ktls_set_feature_rx); + + if (err) { + netdev->features = oper_features; + return -EINVAL; + } + + return 0; +} + +static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev, + netdev_features_t features) +{ + features &= ~NETIF_F_HW_TLS_RX; + if (netdev->features & NETIF_F_HW_TLS_RX) + netdev_warn(netdev, "Disabling hw_tls_rx, not supported in switchdev mode\n"); + + features &= ~NETIF_F_HW_TLS_TX; + if (netdev->features & NETIF_F_HW_TLS_TX) + netdev_warn(netdev, "Disabling hw_tls_tx, not supported in switchdev mode\n"); + + features &= ~NETIF_F_NTUPLE; + if (netdev->features & NETIF_F_NTUPLE) + netdev_warn(netdev, "Disabling ntuple, not supported in switchdev mode\n"); + + features &= ~NETIF_F_GRO_HW; + if (netdev->features & NETIF_F_GRO_HW) + netdev_warn(netdev, "Disabling HW_GRO, not supported in switchdev mode\n"); + + return features; +} + +static netdev_features_t mlx5e_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_params *params; + + mutex_lock(&priv->state_lock); + params = &priv->channels.params; + if (!priv->fs.vlan || + !bitmap_empty(mlx5e_vlan_get_active_svlans(priv->fs.vlan), VLAN_N_VID)) { + /* HW strips the outer C-tag header, this is a problem + * for S-tag traffic. 
+ */ + features &= ~NETIF_F_HW_VLAN_CTAG_RX; + if (!params->vlan_strip_disable) + netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n"); + } + + if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) { + if (features & NETIF_F_LRO) { + netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n"); + features &= ~NETIF_F_LRO; + } + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "Disabling HW-GRO, not supported in legacy RQ\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + if (params->xdp_prog) { + if (features & NETIF_F_LRO) { + netdev_warn(netdev, "LRO is incompatible with XDP\n"); + features &= ~NETIF_F_LRO; + } + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "HW GRO is incompatible with XDP\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + if (priv->xsk.refcnt) { + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "HW GRO is incompatible with AF_XDP (%u XSKs are active)\n", + priv->xsk.refcnt); + features &= ~NETIF_F_GRO_HW; + } + } + + if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) { + features &= ~NETIF_F_RXHASH; + if (netdev->features & NETIF_F_RXHASH) + netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n"); + + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "Disabling HW-GRO, not supported when CQE compress is active\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + /* LRO/HW-GRO features cannot be combined with RX-FCS */ + if (features & NETIF_F_RXFCS) { + if (features & NETIF_F_LRO) { + netdev_warn(netdev, "Dropping LRO feature since RX-FCS is requested\n"); + features &= ~NETIF_F_LRO; + } + if (features & NETIF_F_GRO_HW) { + netdev_warn(netdev, "Dropping HW-GRO feature since RX-FCS is requested\n"); + features &= ~NETIF_F_GRO_HW; + } + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_warn(netdev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + + if (mlx5e_is_uplink_rep(priv)) + features = mlx5e_fix_uplink_rep_features(netdev, features); + + mutex_unlock(&priv->state_lock); + + return features; +} + +static bool mlx5e_xsk_validate_mtu(struct net_device *netdev, + struct mlx5e_channels *chs, + struct mlx5e_params *new_params, + struct mlx5_core_dev *mdev) +{ + u16 ix; + + for (ix = 0; ix < chs->params.num_channels; ix++) { + struct xsk_buff_pool *xsk_pool = + mlx5e_xsk_get_pool(&chs->params, chs->params.xsk, ix); + struct mlx5e_xsk_param xsk; + + if (!xsk_pool) + continue; + + mlx5e_build_xsk_param(xsk_pool, &xsk); + + if (!mlx5e_validate_xsk_param(new_params, &xsk, mdev)) { + u32 hr = mlx5e_get_linear_rq_headroom(new_params, &xsk); + int max_mtu_frame, max_mtu_page, max_mtu; + + /* Two criteria must be met: + * 1. HW MTU + all headrooms <= XSK frame size. + * 2. Size of SKBs allocated on XDP_PASS <= PAGE_SIZE. + */ + max_mtu_frame = MLX5E_HW2SW_MTU(new_params, xsk.chunk_size - hr); + max_mtu_page = mlx5e_xdp_max_mtu(new_params, &xsk); + max_mtu = min(max_mtu_frame, max_mtu_page); + + netdev_err(netdev, "MTU %d is too big for an XSK running on channel %u. 
Try MTU <= %d\n", + new_params->sw_mtu, ix, max_mtu); + return false; + } + } + + return true; +} + +int mlx5e_change_mtu(struct net_device *netdev, int new_mtu, + mlx5e_fp_preactivate preactivate) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_params new_params; + struct mlx5e_params *params; + bool reset = true; + int err = 0; + + mutex_lock(&priv->state_lock); + + params = &priv->channels.params; + + new_params = *params; + new_params.sw_mtu = new_mtu; + err = mlx5e_validate_params(priv->mdev, &new_params); + if (err) + goto out; + + if (params->xdp_prog && + !mlx5e_rx_is_linear_skb(&new_params, NULL)) { + netdev_err(netdev, "MTU(%d) > %d is not allowed while XDP enabled\n", + new_mtu, mlx5e_xdp_max_mtu(params, NULL)); + err = -EINVAL; + goto out; + } + + if (priv->xsk.refcnt && + !mlx5e_xsk_validate_mtu(netdev, &priv->channels, + &new_params, priv->mdev)) { + err = -EINVAL; + goto out; + } + + if (params->packet_merge.type == MLX5E_PACKET_MERGE_LRO) + reset = false; + + if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { + bool is_linear_old = mlx5e_rx_mpwqe_is_linear_skb(priv->mdev, params, NULL); + bool is_linear_new = mlx5e_rx_mpwqe_is_linear_skb(priv->mdev, + &new_params, NULL); + u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params, NULL); + u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_params, NULL); + + /* Always reset in linear mode - hw_mtu is used in data path. + * Check that the mode was non-linear and didn't change. + * If XSK is active, XSK RQs are linear. + */ + if (!is_linear_old && !is_linear_new && !priv->xsk.refcnt && + ppw_old == ppw_new) + reset = false; + } + + err = mlx5e_safe_switch_params(priv, &new_params, preactivate, NULL, reset); + +out: + netdev->mtu = params->sw_mtu; + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_change_nic_mtu(struct net_device *netdev, int new_mtu) +{ + return mlx5e_change_mtu(netdev, new_mtu, mlx5e_set_dev_port_mtu_ctx); +} + +int mlx5e_ptp_rx_manage_fs_ctx(struct mlx5e_priv *priv, void *ctx) +{ + bool set = *(bool *)ctx; + + return mlx5e_ptp_rx_manage_fs(priv, set); +} + +static int mlx5e_hwstamp_config_no_ptp_rx(struct mlx5e_priv *priv, bool rx_filter) +{ + bool rx_cqe_compress_def = priv->channels.params.rx_cqe_compress_def; + int err; + + if (!rx_filter) + /* Reset CQE compression to Admin default */ + return mlx5e_modify_rx_cqe_compression_locked(priv, rx_cqe_compress_def, false); + + if (!MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS)) + return 0; + + /* Disable CQE compression */ + netdev_warn(priv->netdev, "Disabling RX cqe compression\n"); + err = mlx5e_modify_rx_cqe_compression_locked(priv, false, true); + if (err) + netdev_err(priv->netdev, "Failed disabling cqe compression err=%d\n", err); + + return err; +} + +static int mlx5e_hwstamp_config_ptp_rx(struct mlx5e_priv *priv, bool ptp_rx) +{ + struct mlx5e_params new_params; + + if (ptp_rx == priv->channels.params.ptp_rx) + return 0; + + new_params = priv->channels.params; + new_params.ptp_rx = ptp_rx; + return mlx5e_safe_switch_params(priv, &new_params, mlx5e_ptp_rx_manage_fs_ctx, + &new_params.ptp_rx, true); +} + +int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config config; + bool rx_cqe_compress_def; + bool ptp_rx; + int err; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz) || + (mlx5_clock_get_ptp_index(priv->mdev) == -1)) + return -EOPNOTSUPP; + + if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) + return -EFAULT; + + /* TX HW timestamp 
*/ + switch (config.tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + break; + default: + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + rx_cqe_compress_def = priv->channels.params.rx_cqe_compress_def; + + /* RX HW timestamp */ + switch (config.rx_filter) { + case HWTSTAMP_FILTER_NONE: + ptp_rx = false; + break; + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: + config.rx_filter = HWTSTAMP_FILTER_ALL; + /* ptp_rx is set if both HW TS is set and CQE + * compression is set + */ + ptp_rx = rx_cqe_compress_def; + break; + default: + err = -ERANGE; + goto err_unlock; + } + + if (!mlx5e_profile_feature_cap(priv->profile, PTP_RX)) + err = mlx5e_hwstamp_config_no_ptp_rx(priv, + config.rx_filter != HWTSTAMP_FILTER_NONE); + else + err = mlx5e_hwstamp_config_ptp_rx(priv, ptp_rx); + if (err) + goto err_unlock; + + memcpy(&priv->tstamp, &config, sizeof(config)); + mutex_unlock(&priv->state_lock); + + /* might need to fix some features */ + netdev_update_features(priv->netdev); + + return copy_to_user(ifr->ifr_data, &config, + sizeof(config)) ? -EFAULT : 0; +err_unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr) +{ + struct hwtstamp_config *cfg = &priv->tstamp; + + if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz)) + return -EOPNOTSUPP; + + return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? 
-EFAULT : 0; +} + +static int mlx5e_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + switch (cmd) { + case SIOCSHWTSTAMP: + return mlx5e_hwstamp_set(priv, ifr); + case SIOCGHWTSTAMP: + return mlx5e_hwstamp_get(priv, ifr); + default: + return -EOPNOTSUPP; + } +} + +#ifdef CONFIG_MLX5_ESWITCH +int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac); +} + +static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos, + __be16 vlan_proto) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1, + vlan, qos, vlan_proto); +} + +#ifdef HAVE_NETDEV_OPS_NDO_SET_VF_TRUNK_RANGE +static int mlx5e_add_vf_vlan_trunk_range(struct net_device *dev, int vf, + u16 start_vid, u16 end_vid, + __be16 vlan_proto) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + return mlx5_eswitch_add_vport_trunk_range(mdev->priv.eswitch, vf + 1, + start_vid, end_vid); +} + +static int mlx5e_del_vf_vlan_trunk_range(struct net_device *dev, int vf, + u16 start_vid, u16 end_vid, + __be16 vlan_proto) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (vlan_proto != htons(ETH_P_8021Q)) + return -EPROTONOSUPPORT; + + return mlx5_eswitch_del_vport_trunk_range(mdev->priv.eswitch, vf + 1, + start_vid, end_vid); +} +#endif + +static int mlx5e_set_vf_spoofchk(struct net_device *dev, int vf, bool setting) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_spoofchk(mdev->priv.eswitch, vf + 1, setting); +} + +static int mlx5e_set_vf_trust(struct net_device *dev, int vf, bool setting) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_trust(mdev->priv.eswitch, vf + 1, setting); +} + +int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vf + 1, + max_tx_rate, min_tx_rate); +} + +static int mlx5_vport_link2ifla(u8 esw_link) +{ + switch (esw_link) { + case MLX5_VPORT_ADMIN_STATE_DOWN: + return IFLA_VF_LINK_STATE_DISABLE; + case MLX5_VPORT_ADMIN_STATE_UP: + return IFLA_VF_LINK_STATE_ENABLE; + } + return IFLA_VF_LINK_STATE_AUTO; +} + +static int mlx5_ifla_link2vport(u8 ifla_link) +{ + switch (ifla_link) { + case IFLA_VF_LINK_STATE_DISABLE: + return MLX5_VPORT_ADMIN_STATE_DOWN; + case IFLA_VF_LINK_STATE_ENABLE: + return MLX5_VPORT_ADMIN_STATE_UP; + } + return MLX5_VPORT_ADMIN_STATE_AUTO; +} + +static int mlx5e_set_vf_link_state(struct net_device *dev, int vf, + int link_state) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (mlx5e_is_uplink_rep(priv)) + return -EOPNOTSUPP; + + return mlx5_eswitch_set_vport_state(mdev->priv.eswitch, vf + 1, + mlx5_ifla_link2vport(link_state)); +} + +int mlx5e_get_vf_config(struct net_device *dev, + int vf, struct ifla_vf_info *ivi) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + int err; 
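+ /* Read the vport config from the E-Switch and translate the vport admin state into IFLA_VF_LINK_STATE_* for user space. */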
+ + if (!netif_device_present(dev)) + return -EOPNOTSUPP; + + err = mlx5_eswitch_get_vport_config(mdev->priv.eswitch, vf + 1, ivi); + if (err) + return err; + ivi->linkstate = mlx5_vport_link2ifla(ivi->linkstate); + return 0; +} + +int mlx5e_get_vf_stats(struct net_device *dev, + int vf, struct ifla_vf_stats *vf_stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1, + vf_stats); +} + +static bool +mlx5e_has_offload_stats(const struct net_device *dev, int attr_id) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (!netif_device_present(dev)) + return false; + + if (!mlx5e_is_uplink_rep(priv)) + return false; + + return mlx5e_rep_has_offload_stats(dev, attr_id); +} + +static int +mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + if (!mlx5e_is_uplink_rep(priv)) + return -EOPNOTSUPP; + + return mlx5e_rep_get_offload_stats(attr_id, dev, sp); +} +#endif + +static bool mlx5e_tunnel_proto_supported_tx(struct mlx5_core_dev *mdev, u8 proto_type) +{ + switch (proto_type) { + case IPPROTO_GRE: + return MLX5_CAP_ETH(mdev, tunnel_stateless_gre); + case IPPROTO_IPIP: + case IPPROTO_IPV6: + return (MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip) || + MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip_tx)); + default: + return false; + } +} + +static bool mlx5e_gre_tunnel_inner_proto_offload_supported(struct mlx5_core_dev *mdev, + struct sk_buff *skb) +{ + switch (skb->inner_protocol) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + case htons(ETH_P_TEB): + return true; + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): + return MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre); + } + return false; +} + +static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, + struct sk_buff *skb, + netdev_features_t features) +{ + unsigned int offset = 0; + struct udphdr *udph; + u8 proto; + u16 port; + + switch (vlan_get_protocol(skb)) { + case htons(ETH_P_IP): + proto = ip_hdr(skb)->protocol; + break; + case htons(ETH_P_IPV6): + proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL); + break; + default: + goto out; + } + + switch (proto) { + case IPPROTO_GRE: + if (mlx5e_gre_tunnel_inner_proto_offload_supported(priv->mdev, skb)) + return features; + break; + case IPPROTO_IPIP: + case IPPROTO_IPV6: + if (mlx5e_tunnel_proto_supported_tx(priv->mdev, IPPROTO_IPIP)) + return features; + break; + case IPPROTO_UDP: + udph = udp_hdr(skb); + port = be16_to_cpu(udph->dest); + + /* Verify if UDP port is being offloaded by HW */ + if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port)) + return features; + +#if IS_ENABLED(CONFIG_GENEVE) + /* Support Geneve offload for default UDP port */ + if (port == GENEVE_UDP_PORT && mlx5_geneve_tx_allowed(priv->mdev)) + return features; +#endif + break; +#ifdef CONFIG_MLX5_EN_IPSEC + case IPPROTO_ESP: + return mlx5e_ipsec_feature_check(skb, features); +#endif + } + +out: + /* Disable CSUM and GSO if the udp dport is not offloaded by HW */ + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); +} + +netdev_features_t mlx5e_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + features = vlan_features_check(skb, features); + features = vxlan_features_check(skb, features); + + /* Validate if the tunneled packet is being offloaded by HW */ + if (skb->encapsulation && + 
(features & NETIF_F_CSUM_MASK || features & NETIF_F_GSO_MASK)) + return mlx5e_tunnel_features_check(priv, skb, features); + + return features; +} + +static void mlx5e_tx_timeout_work(struct work_struct *work) +{ + struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, + tx_timeout_work); + struct net_device *netdev = priv->netdev; + int i; + + if (priv->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + for (i = 0; i < netdev->real_num_tx_queues; i++) { + struct netdev_queue *dev_queue = + netdev_get_tx_queue(netdev, i); + struct mlx5e_txqsq *sq = priv->txq2sq[i]; + + if (!netif_xmit_stopped(dev_queue)) + continue; + + if (mlx5e_reporter_tx_timeout(sq)) + /* break if tried to reopened channels */ + break; + } + +unlock: + mutex_unlock(&priv->state_lock); + rtnl_unlock(); +} + +static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + netdev_err(dev, "TX timeout detected\n"); + queue_work(priv->wq, &priv->tx_timeout_work); +} + +static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) +{ + struct net_device *netdev = priv->netdev; + struct mlx5e_params new_params; + + if (priv->channels.params.packet_merge.type != MLX5E_PACKET_MERGE_NONE) { + netdev_warn(netdev, "can't set XDP while HW-GRO/LRO is on, disable them first\n"); + return -EINVAL; + } + + if (mlx5_fpga_is_ipsec_device(priv->mdev)) { + netdev_warn(netdev, + "XDP is not available on Innova cards with IPsec support\n"); + return -EINVAL; + } + + new_params = priv->channels.params; + new_params.xdp_prog = prog; + + /* No XSK params: AF_XDP can't be enabled yet at the point of setting + * the XDP program. + */ + if (!mlx5e_rx_is_linear_skb(&new_params, NULL)) { + netdev_warn(netdev, "XDP is not allowed with MTU(%d) > %d\n", + new_params.sw_mtu, + mlx5e_xdp_max_mtu(&new_params, NULL)); + return -EINVAL; + } + + return 0; +} + +static void mlx5e_rq_replace_xdp_prog(struct mlx5e_rq *rq, struct bpf_prog *prog) +{ + struct bpf_prog *old_prog; + + old_prog = rcu_replace_pointer(rq->xdp_prog, prog, + lockdep_is_held(&rq->priv->state_lock)); + if (old_prog) + bpf_prog_put(old_prog); +} + +static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_params new_params; + struct bpf_prog *old_prog; + int err = 0; + bool reset; + int i; + + mutex_lock(&priv->state_lock); + + if (prog) { + err = mlx5e_xdp_allowed(priv, prog); + if (err) + goto unlock; + } + + /* no need for full reset when exchanging programs */ + reset = (!priv->channels.params.xdp_prog || !prog); + + new_params = priv->channels.params; + new_params.xdp_prog = prog; + + /* XDP affects striding RQ parameters. Block XDP if striding RQ won't be + * supported with the new parameters: if PAGE_SIZE is bigger than + * MLX5_MPWQE_LOG_STRIDE_SZ_MAX, striding RQ can't be used, even though + * the MTU is small enough for the linear mode, because XDP uses strides + * of PAGE_SIZE on regular RQs. + */ + if (reset && MLX5E_GET_PFLAG(&new_params, MLX5E_PFLAG_RX_STRIDING_RQ)) { + /* Checking for regular RQs here; XSK RQs were checked on XSK bind. 
*/ + err = mlx5e_mpwrq_validate_regular(priv->mdev, &new_params); + if (err) + goto unlock; + } + + old_prog = priv->channels.params.xdp_prog; + + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, reset); + if (err) + goto unlock; + + if (old_prog) + bpf_prog_put(old_prog); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || reset) + goto unlock; + + /* exchanging programs w/o reset, we update ref counts on behalf + * of the channels RQs here. + */ + bpf_prog_add(prog, priv->channels.num); + for (i = 0; i < priv->channels.num; i++) { + struct mlx5e_channel *c = priv->channels.c[i]; + + mlx5e_rq_replace_xdp_prog(&c->rq, prog); + if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) { + bpf_prog_inc(prog); + mlx5e_rq_replace_xdp_prog(&c->xskrq, prog); + } + } + +unlock: + mutex_unlock(&priv->state_lock); + + /* Need to fix some features. */ + if (!err) + netdev_update_features(netdev); + + return err; +} + +static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return mlx5e_xdp_set(dev, xdp->prog); + case XDP_SETUP_XSK_POOL: + return mlx5e_xsk_setup_pool(dev, xdp->xsk.pool, + xdp->xsk.queue_id); + default: + return -EINVAL; + } +} + +#ifdef CONFIG_MLX5_ESWITCH +static int mlx5e_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u32 filter_mask, + int nlflags) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 mode, setting; + int err; + + err = mlx5_eswitch_get_vepa(mdev->priv.eswitch, &setting); + if (err) + return err; + mode = setting ? BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB; + return ndo_dflt_bridge_getlink(skb, pid, seq, dev, + mode, + 0, 0, nlflags, filter_mask, NULL); +} + +static int mlx5e_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags, struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + struct nlattr *attr, *br_spec; + u16 mode = BRIDGE_MODE_UNDEF; + u8 setting; + int rem; + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (!br_spec) + return -EINVAL; + + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) != IFLA_BRIDGE_MODE) + continue; + + if (nla_len(attr) < sizeof(mode)) + return -EINVAL; + + mode = nla_get_u16(attr); + if (mode > BRIDGE_MODE_VEPA) + return -EINVAL; + + break; + } + + if (mode == BRIDGE_MODE_UNDEF) + return -EINVAL; + + setting = (mode == BRIDGE_MODE_VEPA) ? 
1 : 0; + return mlx5_eswitch_set_vepa(mdev->priv.eswitch, setting); +} +#endif + +const struct net_device_ops mlx5e_netdev_ops = { + .ndo_open = mlx5e_open, + .ndo_stop = mlx5e_close, + .ndo_start_xmit = mlx5e_xmit, + .ndo_setup_tc = mlx5e_setup_tc, + .ndo_select_queue = mlx5e_select_queue, + .ndo_get_stats64 = mlx5e_get_stats, + .ndo_set_rx_mode = mlx5e_set_rx_mode, + .ndo_set_mac_address = mlx5e_set_mac, + .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx5e_vlan_rx_kill_vid, + .ndo_set_features = mlx5e_set_features, + .ndo_fix_features = mlx5e_fix_features, + .ndo_change_mtu = mlx5e_change_nic_mtu, + .ndo_eth_ioctl = mlx5e_ioctl, + .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, + .ndo_features_check = mlx5e_features_check, + .ndo_tx_timeout = mlx5e_tx_timeout, + .ndo_bpf = mlx5e_xdp, + .ndo_xdp_xmit = mlx5e_xdp_xmit, + .ndo_xsk_wakeup = mlx5e_xsk_wakeup, +#ifdef CONFIG_MLX5_EN_ARFS + .ndo_rx_flow_steer = mlx5e_rx_flow_steer, +#endif +#ifdef CONFIG_MLX5_ESWITCH + .ndo_bridge_setlink = mlx5e_bridge_setlink, + .ndo_bridge_getlink = mlx5e_bridge_getlink, + + /* SRIOV E-Switch NDOs */ + .ndo_set_vf_mac = mlx5e_set_vf_mac, + .ndo_set_vf_vlan = mlx5e_set_vf_vlan, + + /* these ndo's are not upstream yet */ +#ifdef HAVE_NETDEV_OPS_NDO_SET_VF_TRUNK_RANGE + .ndo_add_vf_vlan_trunk_range = mlx5e_add_vf_vlan_trunk_range, + .ndo_del_vf_vlan_trunk_range = mlx5e_del_vf_vlan_trunk_range, +#endif + + .ndo_set_vf_spoofchk = mlx5e_set_vf_spoofchk, + .ndo_set_vf_trust = mlx5e_set_vf_trust, + .ndo_set_vf_rate = mlx5e_set_vf_rate, + .ndo_get_vf_config = mlx5e_get_vf_config, + .ndo_set_vf_link_state = mlx5e_set_vf_link_state, + .ndo_get_vf_stats = mlx5e_get_vf_stats, + .ndo_has_offload_stats = mlx5e_has_offload_stats, + .ndo_get_offload_stats = mlx5e_get_offload_stats, +#endif + .ndo_get_devlink_port = mlx5e_get_devlink_port, +}; + +u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) +{ + int i; + + /* The supported periods are organized in ascending order */ + for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE - 1; i++) + if (MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]) >= wanted_timeout) + break; + + return MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]); +} + +static void mlx5e_init_delay_drop(struct mlx5e_priv *priv, + struct mlx5e_params *params) +{ + if (!mlx5e_dropless_rq_supported(priv->mdev)) + return; + + mutex_init(&priv->delay_drop.lock); + priv->delay_drop.activate = false; + priv->delay_drop.usec_timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&priv->delay_drop.work, mlx5e_delay_drop_handler); +} + +void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu) +{ + struct mlx5e_params *params = &priv->channels.params; + struct mlx5_core_dev *mdev = priv->mdev; + u8 rx_cq_period_mode; + + params->sw_mtu = mtu; + params->hard_mtu = MLX5E_ETH_HARD_MTU; + params->num_channels = min_t(unsigned int, MLX5E_MAX_NUM_CHANNELS / 2, + priv->max_nch); + params->log_rx_page_cache_mult = MLX5E_PAGE_CACHE_LOG_MAX_RQ_MULT; + mlx5e_params_mqprio_reset(params); + + /* SQ */ + params->log_sq_size = is_kdump_kernel() ? 
+ MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE : + MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE, mlx5e_tx_mpwqe_supported(mdev)); + + /* XDP SQ */ + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_XDP_TX_MPWQE, mlx5e_tx_mpwqe_supported(mdev)); + + /* set CQE compression */ + params->rx_cqe_compress_def = false; + if (MLX5_CAP_GEN(mdev, cqe_compression) && + MLX5_CAP_GEN(mdev, vport_group_manager)) + params->rx_cqe_compress_def = slow_pci_heuristic(mdev); + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def); + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, false); + + /* RQ */ + mlx5e_build_rq_params(mdev, params); + + params->packet_merge.timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT); + + /* CQ moderation params */ + rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? + MLX5_CQ_PERIOD_MODE_START_FROM_CQE : + MLX5_CQ_PERIOD_MODE_START_FROM_EQE; + params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); + params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); + mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode); + mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + + /* TX inline */ + mlx5_query_min_inline(mdev, ¶ms->tx_min_inline_mode); + + params->tunneled_offload_en = mlx5_tunnel_inner_ft_supported(mdev); + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_PER_CH_STATS, true); + /* AF_XDP */ + params->xsk = xsk; + + /* TX HW checksum offload for XDP is off by default */ + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_XDP_CSUM, 0); + /* Do not update netdev->features directly in here + * on mlx5e_attach_netdev() we will call mlx5e_update_features() + * To update netdev->features please modify mlx5e_fix_features() + */ +} + +static void mlx5e_set_netdev_dev_addr(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + u8 addr[ETH_ALEN]; + + mlx5_query_mac_address(priv->mdev, addr); + if (is_zero_ether_addr(addr) && + !MLX5_CAP_GEN(priv->mdev, vport_group_manager)) { + eth_hw_addr_random(netdev); + mlx5_core_info(priv->mdev, "Assigned random MAC address %pM\n", netdev->dev_addr); + return; + } + + eth_hw_addr_set(netdev, addr); +} + +static int mlx5e_vxlan_set_port(struct net_device *netdev, unsigned int table, + unsigned int entry, struct udp_tunnel_info *ti) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5_vxlan_add_port(priv->mdev->vxlan, ntohs(ti->port)); +} + +static int mlx5e_vxlan_unset_port(struct net_device *netdev, unsigned int table, + unsigned int entry, struct udp_tunnel_info *ti) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5_vxlan_del_port(priv->mdev->vxlan, ntohs(ti->port)); +} + +void mlx5e_vxlan_set_netdev_info(struct mlx5e_priv *priv) +{ + if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) + return; + + priv->nic_info.set_port = mlx5e_vxlan_set_port; + priv->nic_info.unset_port = mlx5e_vxlan_unset_port; + priv->nic_info.flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | + UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN; + priv->nic_info.tables[0].tunnel_types = UDP_TUNNEL_TYPE_VXLAN; + /* Don't count the space hard-coded to the IANA port */ + priv->nic_info.tables[0].n_entries = + mlx5_vxlan_max_udp_ports(priv->mdev) - 1; + + priv->netdev->udp_tunnel_nic_info = &priv->nic_info; +} + +static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev) +{ + int tt; + + for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) { + if (mlx5e_tunnel_proto_supported_tx(mdev, mlx5_get_proto_by_tunnel_type(tt))) + return true; + } + return 
(mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev)); +} + +static void mlx5e_build_nic_netdev(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + bool fcs_supported; + bool fcs_enabled; + + SET_NETDEV_DEV(netdev, mdev->device); + + netdev->netdev_ops = &mlx5e_netdev_ops; + + mlx5e_dcbnl_build_netdev(netdev); + + netdev->watchdog_timeo = 15 * HZ; + + netdev->ethtool_ops = &mlx5e_ethtool_ops; + + netdev->vlan_features |= NETIF_F_SG; + netdev->vlan_features |= NETIF_F_HW_CSUM; + netdev->vlan_features |= NETIF_F_GRO; + netdev->vlan_features |= NETIF_F_TSO; + netdev->vlan_features |= NETIF_F_TSO6; + netdev->vlan_features |= NETIF_F_RXCSUM; + netdev->vlan_features |= NETIF_F_RXHASH; + + netdev->mpls_features |= NETIF_F_SG; + netdev->mpls_features |= NETIF_F_HW_CSUM; + netdev->mpls_features |= NETIF_F_TSO; + netdev->mpls_features |= NETIF_F_TSO6; + + netdev->hw_enc_features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_enc_features |= NETIF_F_HW_VLAN_CTAG_RX; + + /* Tunneled LRO is not supported in the driver, and the same RQs are + * shared between inner and outer TIRs, so the driver can't disable LRO + * for inner TIRs while having it enabled for outer TIRs. Due to this, + * block LRO altogether if the firmware declares tunneled LRO support. + */ + if (!!MLX5_CAP_ETH(mdev, lro_cap) && + !MLX5_CAP_ETH(mdev, tunnel_lro_vxlan) && + !MLX5_CAP_ETH(mdev, tunnel_lro_gre) && + mlx5e_check_fragmented_striding_rq_cap(mdev)) + netdev->vlan_features |= NETIF_F_LRO; + + netdev->hw_features = netdev->vlan_features; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; + netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; + netdev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; + + if (mlx5e_tunnel_any_tx_proto_supported(mdev)) { + netdev->hw_enc_features |= NETIF_F_HW_CSUM; + netdev->hw_enc_features |= NETIF_F_TSO; + netdev->hw_enc_features |= NETIF_F_TSO6; + netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL; + } + + if (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev)) { + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->vlan_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + } + + if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_GRE)) { + netdev->hw_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->gso_partial_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + } + + if (mlx5e_tunnel_proto_supported_tx(mdev, IPPROTO_IPIP)) { + netdev->hw_features |= NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_IPXIP6; + netdev->hw_enc_features |= NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_IPXIP6; + netdev->gso_partial_features |= NETIF_F_GSO_IPXIP4 | + NETIF_F_GSO_IPXIP6; + } + + netdev->hw_features |= NETIF_F_GSO_PARTIAL; + netdev->gso_partial_features |= NETIF_F_GSO_UDP_L4; + netdev->hw_features |= NETIF_F_GSO_UDP_L4; + netdev->features |= NETIF_F_GSO_UDP_L4; + + mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled); + + if (fcs_supported) + netdev->hw_features |= NETIF_F_RXALL; + + if (MLX5_CAP_ETH(mdev, scatter_fcs)) + netdev->hw_features |= NETIF_F_RXFCS; + + if (mlx5_qos_is_supported(mdev)) + netdev->hw_features |= NETIF_F_HW_TC; + + netdev->features = netdev->hw_features; + + /* Defaults */ + if (fcs_enabled) + 
netdev->features &= ~NETIF_F_RXALL; + netdev->features &= ~NETIF_F_LRO; + netdev->features &= ~NETIF_F_GRO_HW; + netdev->features &= ~NETIF_F_RXFCS; + +#define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f) + if (FT_CAP(flow_modify_en) && + FT_CAP(modify_root) && + FT_CAP(identified_miss_table_mode) && + FT_CAP(flow_table_modify)) { +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + netdev->hw_features |= NETIF_F_HW_TC; +#endif +#ifdef CONFIG_MLX5_EN_ARFS + netdev->hw_features |= NETIF_F_NTUPLE; +#endif + } + + netdev->features |= NETIF_F_HIGHDMA; + netdev->features |= NETIF_F_HW_VLAN_STAG_FILTER; + + netdev->priv_flags |= IFF_UNICAST_FLT; + + mlx5e_set_netdev_dev_addr(netdev); + mlx5e_macsec_build_netdev(priv); + mlx5e_ipsec_build_netdev(priv); + mlx5e_tls_build_netdev(priv); +} + +void mlx5e_create_q_counters(struct mlx5e_priv *priv) +{ + u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); + err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out); + if (!err) + priv->q_counter = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); + + err = mlx5_cmd_exec_inout(mdev, alloc_q_counter, in, out); + if (!err) + priv->drop_rq_q_counter = + MLX5_GET(alloc_q_counter_out, out, counter_set_id); +} + +void mlx5e_destroy_q_counters(struct mlx5e_priv *priv) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; + + MLX5_SET(dealloc_q_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + if (priv->q_counter) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + priv->q_counter); + mlx5_cmd_exec_in(priv->mdev, dealloc_q_counter, in); + } + + if (priv->drop_rq_q_counter) { + MLX5_SET(dealloc_q_counter_in, in, counter_set_id, + priv->drop_rq_q_counter); + mlx5_cmd_exec_in(priv->mdev, dealloc_q_counter, in); + } +} + +static int mlx5e_nic_init(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + mlx5e_build_nic_params(priv, &priv->xsk, netdev->mtu); + mlx5e_vxlan_set_netdev_info(priv); + mutex_init(&priv->aso_lock); + + mlx5e_init_delay_drop(priv, &priv->channels.params); + + mlx5e_timestamp_init(priv); + + err = mlx5e_fs_init(priv); + if (err) { + mlx5_core_err(mdev, "FS initialization failed, %d\n", err); + return err; + } + + err = mlx5e_ipsec_init(priv); + if (err) + mlx5_core_err(mdev, "IPSec initialization failed, %d\n", err); + + err = mlx5e_tls_init(priv); + if (err) + mlx5_core_err(mdev, "TLS initialization failed, %d\n", err); + + mlx5e_health_create_reporters(priv); + return 0; +} + +static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) +{ + mlx5e_health_destroy_reporters(priv); + mlx5e_tls_cleanup(priv); + mlx5e_ipsec_cleanup(priv); + mlx5e_fs_cleanup(priv); +} + +static int mlx5e_init_nic_rx(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + enum mlx5e_rx_res_features features; + int err; + + priv->rx_res = mlx5e_rx_res_alloc(); + if (!priv->rx_res) + return -ENOMEM; + + /* Update tunneled offloads cap which might be updated during re-attach */ + priv->channels.params.tunneled_offload_en = mlx5_tunnel_inner_ft_supported(mdev); + + mlx5e_create_q_counters(priv); + + err = mlx5e_open_drop_rq(priv, &priv->drop_rq); + if (err) { + mlx5_core_err(mdev, "open drop rq failed, %d\n", err); + goto err_destroy_q_counters; + } + + features = MLX5E_RX_RES_FEATURE_XSK | MLX5E_RX_RES_FEATURE_PTP; + if 
(priv->channels.params.tunneled_offload_en) + features |= MLX5E_RX_RES_FEATURE_INNER_FT; + err = mlx5e_rx_res_init(priv->rx_res, priv->mdev, features, + priv->max_nch, priv->drop_rq.rqn, + &priv->channels.params.packet_merge, + priv->channels.params.num_channels); + if (err) + goto err_close_drop_rq; + + err = mlx5e_create_flow_steering(priv); + if (err) { + mlx5_core_warn(mdev, "create flow steering failed, %d\n", err); + goto err_destroy_rx_res; + } + + err = mlx5e_tc_nic_init(priv); + if (err) + goto err_destroy_flow_steering; + + err = mlx5e_accel_init_rx(priv); + if (err) + goto err_tc_nic_cleanup; + +#ifdef CONFIG_MLX5_EN_ARFS + priv->netdev->rx_cpu_rmap = mlx5_eq_table_get_rmap(priv->mdev); +#endif + + return 0; + +err_tc_nic_cleanup: + mlx5e_tc_nic_cleanup(priv); +err_destroy_flow_steering: + mlx5e_destroy_flow_steering(priv); +err_destroy_rx_res: + mlx5e_rx_res_destroy(priv->rx_res); +err_close_drop_rq: + mlx5e_close_drop_rq(&priv->drop_rq); +err_destroy_q_counters: + mlx5e_destroy_q_counters(priv); + mlx5e_rx_res_free(priv->rx_res); + priv->rx_res = NULL; + return err; +} + +static void mlx5e_cleanup_nic_rx(struct mlx5e_priv *priv) +{ + mlx5e_accel_cleanup_rx(priv); + mlx5e_tc_nic_cleanup(priv); + mlx5e_destroy_flow_steering(priv); + mlx5e_rx_res_destroy(priv->rx_res); + mlx5e_close_drop_rq(&priv->drop_rq); + mlx5e_destroy_q_counters(priv); + mlx5e_rx_res_free(priv->rx_res); + priv->rx_res = NULL; +} + +static void mlx5e_set_mqprio_rl(struct mlx5e_priv *priv) +{ + struct mlx5e_params *params; + struct mlx5e_mqprio_rl *rl; + + params = &priv->channels.params; + if (params->mqprio.mode != TC_MQPRIO_MODE_CHANNEL) + return; + + rl = mlx5e_mqprio_rl_create(priv->mdev, params->mqprio.num_tc, + params->mqprio.channel.max_rate); + if (IS_ERR(rl)) + rl = NULL; + priv->mqprio_rl = rl; + mlx5e_mqprio_rl_update_params(params, rl); +} + +static int mlx5e_init_nic_tx(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_create_tises(priv); + if (err) { + mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err); + return err; + } + + mlx5e_set_mqprio_rl(priv); + mlx5e_dcbnl_initialize(priv); + return 0; +} + +static void mlx5e_nic_enable(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + mlx5e_init_l2_addr(priv); + + err = mlx5e_macsec_init(priv); + if (err) + mlx5_core_err(mdev, "MACsec initialization failed, %d\n", err); + + /* Marking the link as currently not needed by the Driver */ + if (!netif_running(netdev)) + mlx5e_modify_admin_state(mdev, MLX5_PORT_DOWN); + + mlx5e_set_netdev_mtu_boundaries(priv); + mlx5e_set_dev_port_mtu(priv); + + mlx5_lag_add_netdev(mdev, netdev); + + mlx5e_enable_async_events(priv); + mlx5e_enable_blocking_events(priv); + if (mlx5e_monitor_counter_supported(priv)) + mlx5e_monitor_counter_init(priv); + + mlx5e_hv_vhca_stats_create(priv); + if (netdev->reg_state != NETREG_REGISTERED) + return; + mlx5e_dcbnl_init_app(priv); + + mlx5e_nic_set_rx_mode(priv); + + rtnl_lock(); + if (netif_running(netdev)) + mlx5e_open(netdev); + udp_tunnel_nic_reset_ntf(priv->netdev); + netif_device_attach(netdev); + rtnl_unlock(); +} + +static void mlx5e_nic_disable(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + if (priv->netdev->reg_state == NETREG_REGISTERED) + mlx5e_dcbnl_delete_app(priv); + + rtnl_lock(); + if (netif_running(priv->netdev)) + mlx5e_close(priv->netdev); + netif_device_detach(priv->netdev); + rtnl_unlock(); + + mlx5e_nic_set_rx_mode(priv); + + 
mlx5e_hv_vhca_stats_destroy(priv); + if (mlx5e_monitor_counter_supported(priv)) + mlx5e_monitor_counter_cleanup(priv); + + mlx5e_disable_blocking_events(priv); + if (priv->en_trap) { + mlx5e_deactivate_trap(priv); + mlx5e_close_trap(priv->en_trap); + priv->en_trap = NULL; + } + mlx5e_disable_async_events(priv); + mlx5_lag_remove_netdev(mdev, priv->netdev); + mlx5_vxlan_reset_to_default(mdev->vxlan); + mlx5e_macsec_cleanup(priv); +} + +int mlx5e_update_nic_rx(struct mlx5e_priv *priv) +{ + return mlx5e_refresh_tirs(priv, false, false); +} + +static const struct mlx5e_profile mlx5e_nic_profile = { + .init = mlx5e_nic_init, + .cleanup = mlx5e_nic_cleanup, + .init_rx = mlx5e_init_nic_rx, + .cleanup_rx = mlx5e_cleanup_nic_rx, + .init_tx = mlx5e_init_nic_tx, + .cleanup_tx = mlx5e_cleanup_nic_tx, + .enable = mlx5e_nic_enable, + .disable = mlx5e_nic_disable, + .update_rx = mlx5e_update_nic_rx, + .update_stats = mlx5e_stats_update_ndo_stats, + .update_carrier = mlx5e_update_carrier, + .rx_handlers = &mlx5e_rx_handlers_nic, + .max_tc = MLX5E_MAX_NUM_TC, + .rq_groups = MLX5E_NUM_RQ_GROUPS(XSK), + .stats_grps = mlx5e_nic_stats_grps, + .stats_grps_num = mlx5e_nic_stats_grps_num, + .features = BIT(MLX5E_PROFILE_FEATURE_PTP_RX) | + BIT(MLX5E_PROFILE_FEATURE_PTP_TX) | + BIT(MLX5E_PROFILE_FEATURE_QOS_HTB), +}; + +static int mlx5e_profile_max_num_channels(struct mlx5_core_dev *mdev, + const struct mlx5e_profile *profile) +{ + int nch; + + nch = mlx5e_get_max_num_channels(mdev); + + if (profile->max_nch_limit) + nch = min_t(int, nch, profile->max_nch_limit(mdev)); + return nch; +} + +static unsigned int +mlx5e_calc_max_nch(struct mlx5_core_dev *mdev, struct net_device *netdev, + const struct mlx5e_profile *profile) + +{ + unsigned int max_nch, tmp; + + /* core resources */ + max_nch = mlx5e_profile_max_num_channels(mdev, profile); + + /* netdev rx queues */ + tmp = netdev->num_rx_queues / max_t(u8, profile->rq_groups, 1); + max_nch = min_t(unsigned int, max_nch, tmp); + + /* netdev tx queues */ + tmp = netdev->num_tx_queues; + if (mlx5_qos_is_supported(mdev)) + tmp -= mlx5e_qos_max_leaf_nodes(mdev); + if (MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn)) + tmp -= profile->max_tc; + tmp = tmp / profile->max_tc; + max_nch = min_t(unsigned int, max_nch, tmp); + + return max_nch; +} + +int mlx5e_get_pf_num_tirs(struct mlx5_core_dev *mdev) +{ + /* Indirect TIRS: 2 sets of TTCs (inner + outer steering) + * and 1 set of direct TIRS + */ + return 2 * MLX5E_NUM_INDIR_TIRS + + mlx5e_profile_max_num_channels(mdev, &mlx5e_nic_profile); +} + +/* mlx5e generic netdev management API (move to en_common.c) */ +int mlx5e_priv_init(struct mlx5e_priv *priv, + const struct mlx5e_profile *profile, + struct net_device *netdev, + struct mlx5_core_dev *mdev) +{ + int nch, num_txqs, node; + + num_txqs = netdev->num_tx_queues; + nch = mlx5e_calc_max_nch(mdev, netdev, profile); + node = dev_to_node(mlx5_core_dma_dev(mdev)); + + /* priv init */ + priv->mdev = mdev; + priv->netdev = netdev; + priv->msglevel = MLX5E_MSG_LEVEL; + priv->max_nch = nch; + priv->max_opened_tc = 1; + priv->pcp_tc_num = 1; + + if (!alloc_cpumask_var(&priv->scratchpad.cpumask, GFP_KERNEL)) + return -ENOMEM; + + priv->selq = kvzalloc(sizeof(*priv->selq), GFP_KERNEL); + if (!priv->selq) + goto err_free_cpumask; + /* Assign dummy values, so that mlx5e_select_queue won't crash. 
*/ + *priv->selq = (struct mlx5e_select_queue_params) { + .num_regular_queues = 1, + .num_channels = 1, + .num_tcs = 1, + .is_htb = false, + .is_ptp = false, + }; + + mutex_init(&priv->state_lock); + hash_init(priv->htb.qos_tc2node); + INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); + INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); + INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work); + INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work); + + priv->wq = create_singlethread_workqueue("mlx5e"); + if (!priv->wq) + goto err_free_selq; + + priv->txq2sq = kcalloc_node(num_txqs, sizeof(*priv->txq2sq), GFP_KERNEL, node); + if (!priv->txq2sq) + goto err_destroy_workqueue; + + priv->tx_rates = kcalloc_node(num_txqs, sizeof(*priv->tx_rates), GFP_KERNEL, node); + if (!priv->tx_rates) + goto err_free_txq2sq; + + priv->channel_stats = + kcalloc_node(nch, sizeof(*priv->channel_stats), GFP_KERNEL, node); + if (!priv->channel_stats) + goto err_free_tx_rates; + + return 0; + +err_free_tx_rates: + kfree(priv->tx_rates); +err_free_txq2sq: + kfree(priv->txq2sq); +err_destroy_workqueue: + destroy_workqueue(priv->wq); +err_free_selq: + kvfree(priv->selq); + priv->selq = NULL; +err_free_cpumask: + free_cpumask_var(priv->scratchpad.cpumask); + return -ENOMEM; +} + +void mlx5e_priv_cleanup(struct mlx5e_priv *priv) +{ + int i; + + /* bail if change profile failed and also rollback failed */ + if (!priv->mdev) + return; + + for (i = 0; i < priv->stats_nch; i++) + kvfree(priv->channel_stats[i]); + kfree(priv->channel_stats); + kfree(priv->tx_rates); + kfree(priv->txq2sq); + destroy_workqueue(priv->wq); + mutex_lock(&priv->state_lock); + mlx5e_replace_selq(priv, NULL); + mutex_unlock(&priv->state_lock); + free_cpumask_var(priv->scratchpad.cpumask); + + for (i = 0; i < priv->htb.max_qos_sqs; i++) + kfree(priv->htb.qos_sq_stats[i]); + kvfree(priv->htb.qos_sq_stats); + + memset(priv, 0, sizeof(*priv)); +} + +static unsigned int mlx5e_get_max_num_txqs(struct mlx5_core_dev *mdev, + const struct mlx5e_profile *profile) +{ + unsigned int nch, ptp_txqs, qos_txqs; + + nch = mlx5e_profile_max_num_channels(mdev, profile); + + ptp_txqs = MLX5_CAP_GEN(mdev, ts_cqe_to_dest_cqn) && + mlx5e_profile_feature_cap(profile, PTP_TX) ? + profile->max_tc : 0; + + qos_txqs = mlx5_qos_is_supported(mdev) && + mlx5e_profile_feature_cap(profile, QOS_HTB) ? 
+ mlx5e_qos_max_leaf_nodes(mdev) : 0; + + return nch * profile->max_tc + ptp_txqs + qos_txqs; +} + +static unsigned int mlx5e_get_max_num_rxqs(struct mlx5_core_dev *mdev, + const struct mlx5e_profile *profile) +{ + unsigned int nch; + + nch = mlx5e_profile_max_num_channels(mdev, profile); + + return nch * profile->rq_groups; +} + +struct net_device * +mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile) +{ + struct net_device *netdev; + unsigned int txqs, rxqs; + int err; + + txqs = mlx5e_get_max_num_txqs(mdev, profile); + rxqs = mlx5e_get_max_num_rxqs(mdev, profile); + + netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv), txqs, rxqs); + if (!netdev) { + mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n"); + return NULL; + } + + err = mlx5e_priv_init(netdev_priv(netdev), profile, netdev, mdev); + if (err) { + mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); + goto err_free_netdev; + } + + netif_carrier_off(netdev); + netif_tx_disable(netdev); + dev_net_set(netdev, mlx5_core_net(mdev)); + + return netdev; + +err_free_netdev: + free_netdev(netdev); + + return NULL; +} + +static void mlx5e_update_features(struct net_device *netdev) +{ + if (netdev->reg_state != NETREG_REGISTERED) + return; /* features will be updated on netdev registration */ + + rtnl_lock(); + netdev_update_features(netdev); + rtnl_unlock(); +} + +static void mlx5e_reset_channels(struct net_device *netdev) +{ + netdev_reset_tc(netdev); +} + +int mlx5e_attach_netdev(struct mlx5e_priv *priv) +{ + const bool take_rtnl = priv->netdev->reg_state == NETREG_REGISTERED; + const struct mlx5e_profile *profile = priv->profile; + int max_nch; + int err; + + clear_bit(MLX5E_STATE_DESTROYING, &priv->state); + + /* max number of channels may have changed */ + max_nch = mlx5e_calc_max_nch(priv->mdev, priv->netdev, profile); + if (priv->channels.params.num_channels > max_nch) { + mlx5_core_warn(priv->mdev, "MLX5E: Reducing number of channels to %d\n", max_nch); + /* Reducing the number of channels - RXFH has to be reset, and + * mlx5e_num_channels_changed below will build the RQT. + */ + priv->netdev->priv_flags &= ~IFF_RXFH_CONFIGURED; + priv->channels.params.num_channels = max_nch; + if (priv->channels.params.mqprio.mode == TC_MQPRIO_MODE_CHANNEL) { + mlx5_core_warn(priv->mdev, "MLX5E: Disabling MQPRIO channel mode\n"); + mlx5e_params_mqprio_reset(&priv->channels.params); + } + } + if (max_nch != priv->max_nch) { + mlx5_core_warn(priv->mdev, + "MLX5E: Updating max number of channels from %u to %u\n", + priv->max_nch, max_nch); + priv->max_nch = max_nch; + } + + /* 1. Set the real number of queues in the kernel the first time. + * 2. Set our default XPS cpumask. + * 3. Build the RQT. + * + * rtnl_lock is required by netif_set_real_num_*_queues in case the + * netdev has been registered by this point (if this function was called + * in the reload or resume flow). 
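+ * (netif_set_real_num_tx_queues() and netif_set_real_num_rx_queues() assert
+ * RTNL once the netdev is registered.)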
+ */ + if (take_rtnl) + rtnl_lock(); + err = mlx5e_num_channels_changed(priv); + if (take_rtnl) + rtnl_unlock(); + if (err) + goto out; + + err = profile->init_tx(priv); + if (err) + goto out; + + err = profile->init_rx(priv); + if (err) + goto err_cleanup_tx; + + if (profile->enable) + profile->enable(priv); + + mlx5e_update_features(priv->netdev); + + return 0; + +err_cleanup_tx: + profile->cleanup_tx(priv); + +out: + mlx5e_reset_channels(priv->netdev); + set_bit(MLX5E_STATE_DESTROYING, &priv->state); + cancel_work_sync(&priv->update_stats_work); + return err; +} + +void mlx5e_detach_netdev(struct mlx5e_priv *priv) +{ + const struct mlx5e_profile *profile = priv->profile; + + set_bit(MLX5E_STATE_DESTROYING, &priv->state); + + if (profile->disable) + profile->disable(priv); + flush_workqueue(priv->wq); + + profile->cleanup_rx(priv); + profile->cleanup_tx(priv); + mlx5e_reset_channels(priv->netdev); + cancel_work_sync(&priv->update_stats_work); +} + +static int +mlx5e_netdev_init_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + err = mlx5e_priv_init(priv, new_profile, netdev, mdev); + if (err) { + mlx5_core_err(mdev, "mlx5e_priv_init failed, err=%d\n", err); + return err; + } + netif_carrier_off(netdev); + priv->profile = new_profile; + priv->ppriv = new_ppriv; + err = new_profile->init(priv->mdev, priv->netdev); + if (err) + goto priv_cleanup; + + return 0; + +priv_cleanup: + mlx5e_priv_cleanup(priv); + return err; +} + +static int +mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + err = mlx5e_netdev_init_profile(netdev, mdev, new_profile, new_ppriv); + if (err) + return err; + + err = mlx5e_attach_netdev(priv); + if (err) + goto profile_cleanup; + return err; + +profile_cleanup: + new_profile->cleanup(priv); + mlx5e_priv_cleanup(priv); + return err; +} + +int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, + const struct mlx5e_profile *new_profile, void *new_ppriv) +{ + const struct mlx5e_profile *orig_profile = priv->profile; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + void *orig_ppriv = priv->ppriv; + int err, rollback_err; + + /* cleanup old profile */ + mlx5e_detach_netdev(priv); + priv->profile->cleanup(priv); + mlx5e_priv_cleanup(priv); + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5e_netdev_init_profile(netdev, mdev, new_profile, new_ppriv); + set_bit(MLX5E_STATE_DESTROYING, &priv->state); + return -EIO; + } + + err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv); + if (err) { /* roll back to original profile */ + netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err); + goto rollback; + } + + return 0; + +rollback: + rollback_err = mlx5e_netdev_attach_profile(netdev, mdev, orig_profile, orig_ppriv); + if (rollback_err) + netdev_err(netdev, "%s: failed to rollback to orig profile, %d\n", + __func__, rollback_err); + return err; +} + +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv) +{ + mlx5e_netdev_change_profile(priv, &mlx5e_nic_profile, NULL); +} + +void mlx5e_destroy_netdev(struct mlx5e_priv *priv) +{ + struct net_device *netdev = priv->netdev; + + mlx5e_priv_cleanup(priv); + free_netdev(netdev); +} + +static int mlx5e_resume(struct auxiliary_device *adev) +{ + 
struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev); + struct mlx5e_priv *priv = auxiliary_get_drvdata(adev); + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = edev->mdev; + int err; + + if (netif_device_present(netdev)) + return 0; + + err = mlx5e_create_mdev_resources(mdev); + if (err) + return err; + + err = mlx5e_attach_netdev(priv); + if (err) { + mlx5e_destroy_mdev_resources(mdev); + return err; + } + + return 0; +} + +static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state) +{ + struct mlx5e_priv *priv = auxiliary_get_drvdata(adev); + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + + if (!netif_device_present(netdev)) { + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state) && + !test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &mdev->intf_state)) + mlx5e_destroy_mdev_resources(mdev); + return -ENODEV; + } + + mlx5e_detach_netdev(priv); + mlx5e_destroy_mdev_resources(mdev); + return 0; +} + +static int mlx5e_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev); + const struct mlx5e_profile *profile = &mlx5e_nic_profile; + struct mlx5_core_dev *mdev = edev->mdev; + struct net_device *netdev; + pm_message_t state = {}; + struct mlx5e_priv *priv; + int err; + + if (mdev->disable_en) + return -ENOMEM; + + netdev = mlx5e_create_netdev(mdev, profile); + if (!netdev) { + mlx5_core_err(mdev, "mlx5e_create_netdev failed\n"); + return -ENOMEM; + } + + mlx5e_build_nic_netdev(netdev); + + priv = netdev_priv(netdev); + auxiliary_set_drvdata(adev, priv); + + priv->profile = profile; + priv->ppriv = NULL; + + err = mlx5e_devlink_port_register(priv); + if (err) { + mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err); + goto err_destroy_netdev; + } + + err = profile->init(mdev, netdev); + if (err) { + mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err); + goto err_devlink_cleanup; + } + + err = mlx5e_resume(adev); + if (err) { + mlx5_core_err(mdev, "mlx5e_resume failed, %d\n", err); + goto err_profile_cleanup; + } + + err = register_netdev(netdev); + if (err) { + mlx5_core_err(mdev, "register_netdev failed, %d\n", err); + goto err_resume; + } + + mlx5e_devlink_port_type_eth_set(priv); + + err = mlx5e_sysfs_create(netdev); + if (err) + goto err_unregister_netdev; + + mlx5e_dcbnl_init_app(priv); + mlx5_uplink_netdev_set(mdev, netdev); + return 0; + +err_unregister_netdev: + unregister_netdev(netdev); +err_resume: + mlx5e_suspend(adev, state); +err_profile_cleanup: + profile->cleanup(priv); +err_devlink_cleanup: + mlx5e_devlink_port_unregister(priv); +err_destroy_netdev: + mlx5e_destroy_netdev(priv); + return err; +} + +static void mlx5e_remove(struct auxiliary_device *adev) +{ + struct mlx5e_priv *priv = auxiliary_get_drvdata(adev); + pm_message_t state = {}; + + mlx5e_dcbnl_delete_app(priv); + mlx5e_sysfs_remove(priv->netdev); + unregister_netdev(priv->netdev); + mlx5e_suspend(adev, state); + priv->profile->cleanup(priv); + mlx5e_devlink_port_unregister(priv); + mlx5e_destroy_netdev(priv); +} + +static const struct auxiliary_device_id mlx5e_id_table[] = { + { .name = MLX5_ADEV_NAME ".eth", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary_mlx5e_id_table, mlx5e_id_table); + +static struct auxiliary_driver mlx5e_driver = { + .name = "eth", + .probe = mlx5e_probe, + .remove = mlx5e_remove, + .suspend = mlx5e_suspend, + .resume = mlx5e_resume, + .id_table = mlx5e_id_table, +}; + +int 
mlx5e_init(void) +{ + int ret; + + mlx5e_ipsec_build_inverse_table(); + mlx5e_build_ptys2ethtool_map(); + ret = auxiliary_driver_register(&mlx5e_driver); + if (ret) + return ret; + + ret = mlx5e_rep_init(); + if (ret) + auxiliary_driver_unregister(&mlx5e_driver); + return ret; +} + +void mlx5e_cleanup(void) +{ + mlx5e_rep_cleanup(); + auxiliary_driver_unregister(&mlx5e_driver); +} + +bool mlx5e_is_rep_shared_rq(const struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + if (!mlx5e_is_vport_rep(priv)) + return false; + + if (mlx5e_is_uplink_rep(priv)) + return false; + + if (!mlx5e_esw_offloads_pet_enabled(mdev->priv.eswitch)) + return false; + + return true; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c new file mode 100644 index 0000000..3219d1f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -0,0 +1,1739 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "eswitch.h" +#include "mlx5_esw_devm.h" +#include "en.h" +#include "en_rep.h" +#include "en/params.h" +#include "en/txrx.h" +#include "en_tc.h" +#include "en/rep/tc.h" +#include "en/rep/neigh.h" +#include "en/rep/sysfs.h" +#include "en/rep/meter.h" +#include "en/rep/bridge.h" +#include "en/devlink.h" +#include "fs_core.h" +#include "ecpf.h" +#include "lib/mlx5.h" +#include "lib/devcom.h" +#include "lib/vxlan.h" +#define CREATE_TRACE_POINTS +#include "diag/en_rep_tracepoint.h" +#include "en_accel/ipsec.h" +#include "en/ptp.h" +#include "en/tc/int_port.h" +#include + +#define MLX5E_REP_PARAMS_DEF_NUM_CHANNELS 1 + +static const char mlx5e_rep_driver_name[] = "mlx5e_rep"; + +static void mlx5e_rep_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + + strlcpy(drvinfo->driver, mlx5e_rep_driver_name, + sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, UTS_RELEASE, sizeof(drvinfo->version)); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", + fw_rev_maj(mdev), fw_rev_min(mdev), + fw_rev_sub(mdev), mdev->board_id); +} + +static const struct counter_desc sw_rep_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) }, +}; + +struct vport_stats { + u64 vport_rx_packets; + u64 vport_tx_packets; + u64 vport_rx_bytes; + u64 vport_tx_bytes; +}; + +static const struct counter_desc vport_rep_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_packets) }, + { MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_bytes) }, + { MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_packets) }, + { MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_bytes) }, +}; + +#define NUM_VPORT_REP_SW_COUNTERS ARRAY_SIZE(sw_rep_stats_desc) +#define NUM_VPORT_REP_HW_COUNTERS ARRAY_SIZE(vport_rep_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(sw_rep) +{ + return NUM_VPORT_REP_SW_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(sw_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + sw_rep_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(sw_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.sw, + sw_rep_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw_rep) +{ + struct mlx5e_sw_stats *s = &priv->stats.sw; + struct rtnl_link_stats64 stats64 = {}; + + memset(s, 0, sizeof(*s)); + mlx5e_fold_sw_stats64(priv, &stats64); + + s->rx_packets = stats64.rx_packets; + s->rx_bytes = stats64.rx_bytes; + s->tx_packets = stats64.tx_packets; + s->tx_bytes = stats64.tx_bytes; + s->tx_queue_dropped = stats64.tx_dropped; +} + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(vport_rep) +{ + return NUM_VPORT_REP_HW_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vport_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_HW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, vport_rep_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vport_rep) +{ + int i; + + for (i = 0; i < NUM_VPORT_REP_HW_COUNTERS; i++) + data[idx++] = 
MLX5E_READ_CTR64_CPU(&priv->stats.vf_vport, + vport_rep_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vport_rep) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct rtnl_link_stats64 *vport_stats; + struct ifla_vf_stats vf_stats; + int err; + + err = mlx5_eswitch_get_vport_stats(esw, rep->vport, &vf_stats); + if (err) { + netdev_warn(priv->netdev, "vport %d error %d reading stats\n", + rep->vport, err); + return; + } + + vport_stats = &priv->stats.vf_vport; + /* flip tx/rx as we are reporting the counters for the switch vport */ + vport_stats->rx_packets = vf_stats.tx_packets; + vport_stats->rx_bytes = vf_stats.tx_bytes; + vport_stats->tx_packets = vf_stats.rx_packets; + vport_stats->tx_bytes = vf_stats.rx_bytes; +} + +static void mlx5e_rep_get_strings(struct net_device *dev, + u32 stringset, uint8_t *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + switch (stringset) { + case ETH_SS_STATS: + mlx5e_stats_fill_strings(priv, data); + break; + } +} + +static void mlx5e_rep_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_ethtool_stats(priv, stats, data); +} + +static int mlx5e_rep_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + switch (sset) { + case ETH_SS_STATS: + return mlx5e_stats_total_num(priv); + default: + return -EOPNOTSUPP; + } +} + +static void +mlx5e_rep_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_ringparam(priv, param); +} + +static int +mlx5e_rep_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_set_ringparam(priv, param); +} + +static void mlx5e_rep_get_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_ethtool_get_channels(priv, ch); +} + +static int mlx5e_rep_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_ethtool_set_channels(priv, ch); +} + +static int mlx5e_rep_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_coalesce(priv, coal, kernel_coal); +} + +static int mlx5e_rep_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_set_coalesce(priv, coal, kernel_coal, extack); +} + +static u32 mlx5e_rep_get_rxfh_key_size(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_rxfh_key_size(priv); +} + +static u32 mlx5e_rep_get_rxfh_indir_size(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + return mlx5e_ethtool_get_rxfh_indir_size(priv); +} + +static const struct ethtool_ops mlx5e_rep_ethtool_ops = { + 
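/* Most of the mlx5e_rep_* callbacks wired up in this table are thin
 * wrappers: they unwrap netdev_priv() and delegate to the shared
 * mlx5e_ethtool_*() / mlx5e_stats_*() helpers, so representors reuse
 * the NIC ethtool implementation for rings, channels, coalescing and
 * the RSS key/indirection-table sizes.
 */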
.supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USE_ADAPTIVE, + .get_drvinfo = mlx5e_rep_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = mlx5e_rep_get_strings, + .get_sset_count = mlx5e_rep_get_sset_count, + .get_ethtool_stats = mlx5e_rep_get_ethtool_stats, + .get_link_ksettings = mlx5e_get_link_ksettings, + .set_link_ksettings = mlx5e_set_link_ksettings, + .get_ringparam = mlx5e_rep_get_ringparam, + .set_ringparam = mlx5e_rep_set_ringparam, + .get_channels = mlx5e_rep_get_channels, + .set_channels = mlx5e_rep_set_channels, + .get_coalesce = mlx5e_rep_get_coalesce, + .set_coalesce = mlx5e_rep_set_coalesce, + .get_rxfh_key_size = mlx5e_rep_get_rxfh_key_size, + .get_rxfh_indir_size = mlx5e_rep_get_rxfh_indir_size, +}; + +static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_sq *rep_sq, *tmp; + struct mlx5e_rep_priv *rpriv; + + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return; + + rpriv = mlx5e_rep_to_rep_priv(rep); + list_for_each_entry_safe(rep_sq, tmp, &rpriv->vport_sqs_list, list) { + mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule); + if (rep_sq->send_to_vport_rule_peer) + mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule_peer); + list_del(&rep_sq->list); + kfree(rep_sq); + } +} + +static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + u32 *sqns_array, int sqns_num) +{ + struct mlx5_eswitch *peer_esw = NULL; + struct mlx5_flow_handle *flow_rule; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_rep_sq *rep_sq; + int err; + int i; + + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return 0; + + rpriv = mlx5e_rep_to_rep_priv(rep); + if (mlx5_devcom_is_paired(esw->dev->priv.devcom, MLX5_DEVCOM_ESW_OFFLOADS)) + peer_esw = mlx5_devcom_get_peer_data(esw->dev->priv.devcom, + MLX5_DEVCOM_ESW_OFFLOADS); + + for (i = 0; i < sqns_num; i++) { + rep_sq = kzalloc(sizeof(*rep_sq), GFP_KERNEL); + if (!rep_sq) { + err = -ENOMEM; + goto out_err; + } + + /* Add re-inject rule to the PF/representor sqs */ + flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, esw, rep, + sqns_array[i]); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + kfree(rep_sq); + goto out_err; + } + rep_sq->send_to_vport_rule = flow_rule; + rep_sq->sqn = sqns_array[i]; + + if (peer_esw) { + flow_rule = mlx5_eswitch_add_send_to_vport_rule(peer_esw, esw, + rep, sqns_array[i]); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule); + kfree(rep_sq); + goto out_err; + } + rep_sq->send_to_vport_rule_peer = flow_rule; + } + + list_add(&rep_sq->list, &rpriv->vport_sqs_list); + } + + if (peer_esw) + mlx5_devcom_release_peer_data(esw->dev->priv.devcom, MLX5_DEVCOM_ESW_OFFLOADS); + + return 0; + +out_err: + mlx5e_sqs2vport_stop(esw, rep); + + if (peer_esw) + mlx5_devcom_release_peer_data(esw->dev->priv.devcom, MLX5_DEVCOM_ESW_OFFLOADS); + + return err; +} + +static int +mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv) +{ + int sqs_per_channel = mlx5e_get_dcb_num_tc(&priv->channels.params); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + bool is_uplink_rep = mlx5e_is_uplink_rep(priv); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5e_channel *c; + int n, tc, num_sqs = 0; + int err = -ENOMEM; + bool ptp_sq; + u32 *sqs; + int num_txqs = priv->channels.params.num_channels * priv->channels.params.mqprio.num_tc; + + ptp_sq = 
!!(priv->channels.ptp && + MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_PORT_TS)); + num_txqs += ptp_sq ? priv->channels.ptp->num_tc : 0; + /* +2 for xdpsqs, they don't exist on the ptp channel but will not be + * counted for by num_sqs. + */ + if (is_uplink_rep) + sqs_per_channel += 2; + + sqs = kvcalloc(num_txqs * sqs_per_channel, sizeof(*sqs), GFP_KERNEL); + if (!sqs) + goto out; + + for (n = 0; n < priv->channels.num; n++) { + c = priv->channels.c[n]; + for (tc = 0; tc < c->num_tc; tc++) + sqs[num_sqs++] = c->sq[tc].sqn; + + if (is_uplink_rep) { + if (c->xdp) + sqs[num_sqs++] = c->rq_xdpsq.sqn; + + sqs[num_sqs++] = c->xdpsq.sqn; + } + } + + if (ptp_sq) { + struct mlx5e_ptp *ptp_ch = priv->channels.ptp; + + for (tc = 0; tc < ptp_ch->num_tc; tc++) + sqs[num_sqs++] = ptp_ch->ptpsq[tc].txqsq.sqn; + } + + err = mlx5e_sqs2vport_start(esw, rep, sqs, num_sqs); + kfree(sqs); + +out: + if (err) + netdev_warn(priv->netdev, "Failed to add SQs FWD rules %d\n", err); + return err; +} + +static void +mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + + mlx5e_sqs2vport_stop(esw, rep); +} + +static int +mlx5e_rep_add_meta_tunnel_rule(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_group *g; + + g = esw->fdb_table.offloads.send_to_vport_meta_grp; + if (!g) + return 0; + + flow_rule = mlx5_eswitch_add_send_to_vport_meta_rule(esw, rep->vport); + if (IS_ERR(flow_rule)) + return PTR_ERR(flow_rule); + + rpriv->send_to_vport_meta_rule = flow_rule; + + return 0; +} + +static void +mlx5e_rep_del_meta_tunnel_rule(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + if (rpriv->send_to_vport_meta_rule) + mlx5_eswitch_del_send_to_vport_meta_rule(rpriv->send_to_vport_meta_rule); +} + +void mlx5e_rep_activate_channels(struct mlx5e_priv *priv) +{ + mlx5e_add_sqs_fwd_rules(priv); + mlx5e_rep_add_meta_tunnel_rule(priv); +} + +void mlx5e_rep_deactivate_channels(struct mlx5e_priv *priv) +{ + mlx5e_rep_del_meta_tunnel_rule(priv); + mlx5e_remove_sqs_fwd_rules(priv); +} + +static int mlx5e_rep_open(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + int err; + + mutex_lock(&priv->state_lock); + err = mlx5e_open_locked(dev); + if (err) + goto unlock; + + if (!mlx5_modify_vport_admin_state(priv->mdev, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + rep->vport, 1, + MLX5_VPORT_ADMIN_STATE_UP)) + netif_carrier_on(dev); + +unlock: + mutex_unlock(&priv->state_lock); + return err; +} + +static int mlx5e_rep_close(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + int ret; + + mutex_lock(&priv->state_lock); + mlx5_modify_vport_admin_state(priv->mdev, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + rep->vport, 1, + MLX5_VPORT_ADMIN_STATE_DOWN); + ret = mlx5e_close_locked(dev); + mutex_unlock(&priv->state_lock); + return ret; +} + +bool mlx5e_is_uplink_rep(const struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep; + + if (!MLX5_ESWITCH_MANAGER(priv->mdev)) + return false; + + if (!rpriv) /* non vport rep mlx5e 
instances don't use this field */ + return false; + + rep = rpriv->rep; + return (rep->vport == MLX5_VPORT_UPLINK); +} + +bool mlx5e_rep_has_offload_stats(const struct net_device *dev, int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return true; + } + + return false; +} + +static void mlx5e_rep_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + struct mlx5e_priv *priv; + u64 parent_id; + + priv = netdev_priv(dev); + + parent_id = mlx5_query_nic_system_image_guid(priv->mdev); + ppid->id_len = sizeof(parent_id); + memcpy(ppid->id, &parent_id, sizeof(parent_id)); +} + +static int mlx5e_rep_sf_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + mlx5e_rep_get_port_parent_id(dev, ppid); + return 0; +} + +static int mlx5e_rep_sf_get_phys_port_name(struct net_device *dev, + char *buf, size_t len) +{ +#ifdef CONFIG_MLXDEVM + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_eswitch *esw; + unsigned int fn; + u32 controller; + int ret; + + fn = mlx5_get_dev_index(priv->mdev); + esw = priv->mdev->priv.eswitch; + + controller = mlx5_devm_sf_vport_to_controller(priv->mdev, rep->vport); + if (controller) + ret = snprintf(buf, len, "c%dpf%dsf%d", + controller, fn, + mlx5_devm_sf_vport_to_sfnum(priv->mdev, rep->vport)); + else + ret = snprintf(buf, len, "pf%dsf%d", fn, + mlx5_devm_sf_vport_to_sfnum(priv->mdev, rep->vport)); + if (ret >= len) + return -EOPNOTSUPP; +#endif + return 0; +} + +static int +mlx5e_get_sw_stats64(const struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + mlx5e_fold_sw_stats64(priv, stats); + return 0; +} + +int mlx5e_rep_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return mlx5e_get_sw_stats64(dev, sp); + } + + return -EINVAL; +} + +static void +mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + /* update HW stats in background for next time */ + mlx5e_queue_update_stats(priv); + memcpy(stats, &priv->stats.vf_vport, sizeof(*stats)); +} + +static int mlx5e_rep_change_mtu(struct net_device *netdev, int new_mtu) +{ + return mlx5e_change_mtu(netdev, new_mtu, NULL); +} + +static struct devlink_port *mlx5e_rep_get_devlink_port(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_core_dev *dev = priv->mdev; + + return mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); +} + +static int mlx5e_rep_change_carrier(struct net_device *dev, bool new_carrier) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + int err; + + if (new_carrier) { + err = mlx5_modify_vport_admin_state(priv->mdev, MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + rep->vport, 1, MLX5_VPORT_ADMIN_STATE_UP); + if (err) + return err; + netif_carrier_on(dev); + } else { + err = mlx5_modify_vport_admin_state(priv->mdev, MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + rep->vport, 1, MLX5_VPORT_ADMIN_STATE_DOWN); + if (err) + return err; + netif_carrier_off(dev); + } + return 0; +} + +static const struct net_device_ops mlx5e_netdev_ops_rep = { + .ndo_open = mlx5e_rep_open, + .ndo_stop = mlx5e_rep_close, + .ndo_start_xmit = mlx5e_xmit, + .ndo_setup_tc = 
mlx5e_rep_setup_tc, + .ndo_get_devlink_port = mlx5e_rep_get_devlink_port, + .ndo_get_stats64 = mlx5e_rep_get_stats, + .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, + .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, + .ndo_change_mtu = mlx5e_rep_change_mtu, + .ndo_change_carrier = mlx5e_rep_change_carrier, +}; + +static const struct net_device_ops mlx5e_netdev_ops_rep_sf = { + .ndo_open = mlx5e_rep_open, + .ndo_stop = mlx5e_rep_close, + .ndo_start_xmit = mlx5e_xmit, + .ndo_setup_tc = mlx5e_rep_setup_tc, + .ndo_get_devlink_port = mlx5e_rep_get_devlink_port, + .ndo_get_port_parent_id = mlx5e_rep_sf_port_parent_id, + .ndo_get_phys_port_name = mlx5e_rep_sf_get_phys_port_name, + .ndo_get_stats64 = mlx5e_rep_get_stats, + .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, + .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, + .ndo_change_mtu = mlx5e_rep_change_mtu, +}; + +bool mlx5e_eswitch_uplink_rep(const struct net_device *netdev) +{ + return netdev->netdev_ops == &mlx5e_netdev_ops && + mlx5e_is_uplink_rep(netdev_priv(netdev)); +} + +bool mlx5e_eswitch_vf_rep(const struct net_device *netdev) +{ + return netdev->netdev_ops == &mlx5e_netdev_ops_rep || + netdev->netdev_ops == &mlx5e_netdev_ops_rep_sf; +} + +/* One indirect TIR set for outer. Inner not supported in reps. */ +#define REP_NUM_INDIR_TIRS MLX5E_NUM_INDIR_TIRS + +static int mlx5e_rep_max_nch_limit(struct mlx5_core_dev *mdev) +{ + int max_tir_num = 1 << MLX5_CAP_GEN(mdev, log_max_tir); + int num_vports = mlx5_eswitch_get_total_vports(mdev); + + return (max_tir_num - mlx5e_get_pf_num_tirs(mdev) + - (num_vports * REP_NUM_INDIR_TIRS)) / num_vports; +} + +static void mlx5e_build_rep_params(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_params *params; + + u8 cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 
+ MLX5_CQ_PERIOD_MODE_START_FROM_CQE : + MLX5_CQ_PERIOD_MODE_START_FROM_EQE; + + params = &priv->channels.params; + + params->num_channels = MLX5E_REP_PARAMS_DEF_NUM_CHANNELS; + params->hard_mtu = MLX5E_ETH_HARD_MTU; + params->sw_mtu = netdev->mtu; + + /* SQ */ + params->log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; + + /* RQ */ + mlx5e_build_rq_params(mdev, params); + + /* CQ moderation params */ + params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); + mlx5e_set_rx_cq_mode_params(params, cq_period_mode); + + params->mqprio.num_tc = 1; + params->tunneled_offload_en = false; + if (rep->vport != MLX5_VPORT_UPLINK) + params->vlan_strip_disable = true; + + mlx5_query_min_inline(mdev, ¶ms->tx_min_inline_mode); + + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_PER_CH_STATS, true); +} + +static void mlx5e_build_rep_netdev(struct net_device *netdev, + struct mlx5_core_dev *mdev, + struct mlx5_eswitch_rep *rep) +{ + SET_NETDEV_DEV(netdev, mdev->device); + + if (mlx5_esw_is_sf_vport(mdev->priv.eswitch, rep->vport)) + netdev->netdev_ops = &mlx5e_netdev_ops_rep_sf; + else + netdev->netdev_ops = &mlx5e_netdev_ops_rep; + + eth_hw_addr_random(netdev); + netdev->ethtool_ops = &mlx5e_rep_ethtool_ops; + + netdev->watchdog_timeo = 15 * HZ; + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + netdev->hw_features |= NETIF_F_HW_TC; +#endif + netdev->hw_features |= NETIF_F_SG; + netdev->hw_features |= NETIF_F_IP_CSUM; + netdev->hw_features |= NETIF_F_IPV6_CSUM; + netdev->hw_features |= NETIF_F_GRO; + netdev->hw_features |= NETIF_F_TSO; + netdev->hw_features |= NETIF_F_TSO6; + netdev->hw_features |= NETIF_F_RXCSUM; + + netdev->features |= netdev->hw_features; + netdev->features |= NETIF_F_NETNS_LOCAL; +} + +static int mlx5e_init_rep(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + mlx5e_build_rep_params(netdev); + mlx5e_build_txq_maps(priv); + + mlx5e_timestamp_init(priv); + + return 0; +} + +static int mlx5e_init_ul_rep(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + err = mlx5e_ipsec_init(priv); + if (err) + mlx5_core_err(mdev, "Uplink rep IPsec initialization failed, %d\n", err); + + mlx5e_vxlan_set_netdev_info(priv); + mutex_init(&priv->aso_lock); + return mlx5e_init_rep(mdev, netdev); +} + +static void mlx5e_cleanup_rep(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + if (!rpriv) + return; + + if (rpriv->rep->vport == MLX5_VPORT_UPLINK) + mlx5e_ipsec_cleanup(priv); +} + +static int mlx5e_create_rep_ttc_table(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct ttc_params ttc_params = {}; + int err; + + priv->fs.ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + + /* The inner_ttc in the ttc params is intentionally not set */ + mlx5e_set_ttc_params(priv, &ttc_params, false); + + if (rep->vport != MLX5_VPORT_UPLINK) + /* To give uplik rep TTC a lower level for chaining from root ft */ + ttc_params.ft_attr.level = MLX5E_TTC_FT_LEVEL + 1; + + priv->fs.ttc = mlx5_create_ttc_table(priv->mdev, &ttc_params); + if (IS_ERR(priv->fs.ttc)) { + err = PTR_ERR(priv->fs.ttc); + netdev_err(priv->netdev, "Failed to create rep ttc table, err=%d\n", + err); + return err; + } + return 0; +} + +static int mlx5e_create_rep_root_ft(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_flow_table_attr 
ft_attr = {}; + struct mlx5_flow_namespace *ns; + int err = 0; + + if (rep->vport != MLX5_VPORT_UPLINK) { + /* non uplik reps will skip any bypass tables and go directly to + * their own ttc + */ + rpriv->root_ft = mlx5_get_ttc_flow_table(priv->fs.ttc); + return 0; + } + + /* uplink root ft will be used to auto chain, to ethtool or ttc tables */ + ns = mlx5_get_flow_namespace(priv->mdev, MLX5_FLOW_NAMESPACE_OFFLOADS); + if (!ns) { + netdev_err(priv->netdev, "Failed to get reps offloads namespace\n"); + return -EOPNOTSUPP; + } + + ft_attr.max_fte = 0; /* Empty table, miss rule will always point to next table */ + ft_attr.prio = 1; + ft_attr.level = 1; + + rpriv->root_ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(rpriv->root_ft)) { + err = PTR_ERR(rpriv->root_ft); + rpriv->root_ft = NULL; + } + + return err; +} + +static void mlx5e_destroy_rep_root_ft(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + + if (rep->vport != MLX5_VPORT_UPLINK) + return; + mlx5_destroy_flow_table(rpriv->root_ft); +} + +static struct mlx5_flow_table *mlx5e_get_root_ft(struct mlx5e_priv *priv, + struct mlx5_eswitch *esw) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_rep_priv *uplink_rpriv; + + if (priv->shared_rq) { + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + return uplink_rpriv->root_ft; + } + + return rpriv->root_ft; +} + +static int mlx5e_create_rep_vport_rx_rule(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_destination dest; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = mlx5e_get_root_ft(priv, esw); + + flow_rule = mlx5_eswitch_create_vport_rx_rule(esw, rep->vport, &dest); + if (IS_ERR(flow_rule)) + return PTR_ERR(flow_rule); + rpriv->vport_rx_rule = flow_rule; + return 0; +} + +static void rep_vport_rx_rule_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + if (!rpriv->vport_rx_rule) + return; + + mlx5_del_flow_rules(rpriv->vport_rx_rule); + rpriv->vport_rx_rule = NULL; +} + +int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup) +{ + rep_vport_rx_rule_destroy(priv); + + return cleanup ? 
0 : mlx5e_create_rep_vport_rx_rule(priv); +} + +static int mlx5e_init_rep_shared_rq(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5e_create_rep_vport_rx_rule(priv); + if (err) + mlx5_core_warn(priv->mdev, "create_rep_vport_rx_rule failed with err %d\n", err); + + return err; +} + +static int mlx5e_init_rep_dedicated_rq(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + priv->rx_res = mlx5e_rx_res_alloc(); + if (!priv->rx_res) + return -ENOMEM; + + mlx5e_init_l2_addr(priv); + + err = mlx5e_open_drop_rq(priv, &priv->drop_rq); + if (err) { + mlx5_core_err(mdev, "open drop rq failed, %d\n", err); + return err; + } + + err = mlx5e_rx_res_init(priv->rx_res, priv->mdev, 0, + priv->max_nch, priv->drop_rq.rqn, + &priv->channels.params.packet_merge, + priv->channels.params.num_channels); + if (err) + goto err_close_drop_rq; + + err = mlx5e_create_rep_ttc_table(priv); + if (err) + goto err_destroy_rx_res; + + err = mlx5e_esw_offloads_pet_setup(priv->mdev->priv.eswitch, + mlx5_get_ttc_flow_table(priv->fs.ttc)); + if (err) + goto err_destroy_ttc_table; + + err = mlx5e_create_rep_root_ft(priv); + if (err) + goto err_cleanup_pet_setup; + + err = mlx5e_create_rep_vport_rx_rule(priv); + if (err) + goto err_destroy_root_ft; + + mlx5e_ethtool_init_steering(priv); + + return 0; + +err_destroy_root_ft: + mlx5e_destroy_rep_root_ft(priv); +err_cleanup_pet_setup: + mlx5e_esw_offloads_pet_cleanup(priv->mdev->priv.eswitch); +err_destroy_ttc_table: + mlx5_destroy_ttc_table(priv->fs.ttc); +err_destroy_rx_res: + mlx5e_rx_res_destroy(priv->rx_res); +err_close_drop_rq: + mlx5e_close_drop_rq(&priv->drop_rq); + mlx5e_rx_res_free(priv->rx_res); + priv->rx_res = NULL; + return err; +} + +static int mlx5e_init_rep_rx(struct mlx5e_priv *priv) +{ + if (mlx5e_is_rep_shared_rq(priv)) + return mlx5e_init_rep_shared_rq(priv); + else + return mlx5e_init_rep_dedicated_rq(priv); +} + +static void mlx5e_cleanup_rep_shared_rq(struct mlx5e_priv *priv) +{ + rep_vport_rx_rule_destroy(priv); +} + +static void mlx5e_cleanup_rep_dedicated_rq(struct mlx5e_priv *priv) +{ + mlx5e_ethtool_cleanup_steering(priv); + rep_vport_rx_rule_destroy(priv); + mlx5e_destroy_rep_root_ft(priv); + mlx5e_esw_offloads_pet_cleanup(priv->mdev->priv.eswitch); + mlx5_destroy_ttc_table(priv->fs.ttc); + mlx5e_rx_res_destroy(priv->rx_res); + mlx5e_close_drop_rq(&priv->drop_rq); + mlx5e_rx_res_free(priv->rx_res); + priv->rx_res = NULL; +} + +static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv) +{ + if (mlx5e_is_rep_shared_rq(priv)) + return mlx5e_cleanup_rep_shared_rq(priv); + else + return mlx5e_cleanup_rep_dedicated_rq(priv); +} + +static int mlx5e_init_ul_rep_rx(struct mlx5e_priv *priv) +{ + int err; + + mlx5e_create_q_counters(priv); + err = mlx5e_init_rep_rx(priv); + if (err) + goto out; + + mlx5e_tc_int_port_init_rep_rx(priv); + +out: + return err; +} + +static void mlx5e_cleanup_ul_rep_rx(struct mlx5e_priv *priv) +{ + mlx5e_tc_int_port_cleanup_rep_rx(priv); + mlx5e_cleanup_rep_rx(priv); + mlx5e_destroy_q_counters(priv); +} + +static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) +{ + struct mlx5_rep_uplink_priv *uplink_priv; + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + + netdev = rpriv->netdev; + priv = netdev_priv(netdev); + uplink_priv = &rpriv->uplink_priv; + + err = mlx5e_rep_tc_init(rpriv); + if (err) + return err; + + mlx5_init_port_tun_entropy(&uplink_priv->tun_entropy, priv->mdev); + + mlx5e_rep_bond_init(rpriv); + err = mlx5e_rep_tc_netdevice_event_register(rpriv); + if 
(err) { + mlx5_core_err(priv->mdev, "Failed to register netdev notifier, err: %d\n", + err); + goto err_event_reg; + } + + return 0; + +err_event_reg: + mlx5e_rep_bond_cleanup(rpriv); + mlx5e_rep_tc_cleanup(rpriv); + return err; +} + +static void mlx5e_cleanup_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) +{ + mlx5e_rep_tc_netdevice_event_unregister(rpriv); + mlx5e_rep_bond_cleanup(rpriv); + mlx5e_rep_tc_cleanup(rpriv); +} + +static int mlx5e_init_rep_tx(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + int err; + + err = mlx5e_create_tises(priv); + if (err) { + mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err); + return err; + } + + if (rpriv->rep->vport == MLX5_VPORT_UPLINK) { + err = mlx5e_init_uplink_rep_tx(rpriv); + if (err) + goto err_init_tx; + } + + err = mlx5e_tc_ht_init(&rpriv->tc_ht); + if (err) + goto err_ht_init; + + return 0; + +err_ht_init: + if (rpriv->rep->vport == MLX5_VPORT_UPLINK) + mlx5e_cleanup_uplink_rep_tx(rpriv); +err_init_tx: + mlx5e_destroy_tises(priv); + return err; +} + +static void mlx5e_cleanup_rep_tx(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + mlx5e_tc_ht_cleanup(&rpriv->tc_ht); + + if (rpriv->rep->vport == MLX5_VPORT_UPLINK) + mlx5e_cleanup_uplink_rep_tx(rpriv); + + mlx5e_destroy_tises(priv); +} + +static void mlx5e_rep_enable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + mlx5e_set_netdev_mtu_boundaries(priv); + mlx5e_rep_neigh_init(rpriv); +} + +static void mlx5e_rep_disable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + + mlx5e_rep_neigh_cleanup(rpriv); +} + +static int mlx5e_update_rep_rx(struct mlx5e_priv *priv) +{ + return 0; +} + +static int mlx5e_obj_change_event(struct mlx5e_priv *priv, struct mlx5_eqe *eqe) +{ + struct mlx5_eqe_obj_change *obj_change = &eqe->data.obj_change; + u16 obj_type = be16_to_cpu(obj_change->obj_type); + u32 obj_id = be32_to_cpu(obj_change->obj_id); + + if (obj_type == MLX5_GENERAL_OBJECT_TYPES_IPSEC) + return mlx5e_ipsec_async_event(priv, obj_id); + + return NOTIFY_DONE; +} + +static int mlx5e_port_change_event(struct mlx5e_priv *priv, struct mlx5_eqe *eqe) +{ + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + queue_work(priv->wq, &priv->update_carrier_work); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +static int uplink_rep_async_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, events_nb); + struct mlx5_eqe *eqe = data; + + switch (event) { + case MLX5_EVENT_TYPE_OBJECT_CHANGE_EVENT: + return mlx5e_obj_change_event(priv, eqe); + case MLX5_EVENT_TYPE_PORT_CHANGE: + return mlx5e_port_change_event(priv, eqe); + case MLX5_DEV_EVENT_PORT_AFFINITY: + return mlx5e_rep_tc_event_port_affinity(priv); + } + + return NOTIFY_DONE; +} + +static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + u16 max_mtu; + + netdev->min_mtu = ETH_MIN_MTU; + mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1); + netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu); + mlx5e_set_dev_port_mtu(priv); + + mlx5e_rep_tc_enable(priv); + + if (MLX5_CAP_GEN(mdev, uplink_follow)) + mlx5_modify_vport_admin_state(mdev, MLX5_VPORT_STATE_OP_MOD_UPLINK, + 0, 0, MLX5_VPORT_ADMIN_STATE_AUTO); + priv->events_nb.notifier_call = 
uplink_rep_async_event; + mlx5_notifier_register(mdev, &priv->events_nb); + mlx5e_dcbnl_initialize(priv); + mlx5e_dcbnl_init_app(priv); + mlx5e_rep_neigh_init(rpriv); + mlx5e_rep_bridge_init(priv); + + netdev->wanted_features |= NETIF_F_HW_TC; + + rtnl_lock(); + if (netif_running(netdev)) + mlx5e_open(netdev); + udp_tunnel_nic_reset_ntf(priv->netdev); + netif_device_attach(netdev); + rtnl_unlock(); + + mlx5_lag_add_netdev(mdev, netdev); +} + +static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_core_dev *mdev = priv->mdev; + + rtnl_lock(); + if (netif_running(priv->netdev)) + mlx5e_close(priv->netdev); + netif_device_detach(priv->netdev); + rtnl_unlock(); + + mlx5e_rep_bridge_cleanup(priv); + mlx5e_rep_neigh_cleanup(rpriv); + mlx5e_dcbnl_delete_app(priv); + mlx5_notifier_unregister(mdev, &priv->events_nb); + mlx5e_rep_tc_disable(priv); + mlx5_lag_remove_netdev(mdev, priv->netdev); + mlx5_vxlan_reset_to_default(mdev->vxlan); +} + +static MLX5E_DEFINE_STATS_GRP(sw_rep, 0); +static MLX5E_DEFINE_STATS_GRP(vport_rep, MLX5E_NDO_UPDATE_STATS); + +/* The stats groups order is opposite to the update_stats() order calls */ +static mlx5e_stats_grp_t mlx5e_rep_stats_grps[] = { + &MLX5E_STATS_GRP(sw_rep), + &MLX5E_STATS_GRP(vport_rep), +}; + +static unsigned int mlx5e_rep_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5e_rep_stats_grps); +} + +/* The stats groups order is opposite to the update_stats() order calls */ +static mlx5e_stats_grp_t mlx5e_ul_rep_stats_grps[] = { + &MLX5E_STATS_GRP(sw), + &MLX5E_STATS_GRP(qcnt), + &MLX5E_STATS_GRP(vnic_env), + &MLX5E_STATS_GRP(vport), + &MLX5E_STATS_GRP(802_3), + &MLX5E_STATS_GRP(2863), + &MLX5E_STATS_GRP(2819), + &MLX5E_STATS_GRP(phy), + &MLX5E_STATS_GRP(eth_ext), + &MLX5E_STATS_GRP(pcie), + &MLX5E_STATS_GRP(per_prio), + &MLX5E_STATS_GRP(pme), + &MLX5E_STATS_GRP(channels), + &MLX5E_STATS_GRP(per_port_buff_congest), +#ifdef CONFIG_MLX5_EN_IPSEC + &MLX5E_STATS_GRP(ipsec_sw), + &MLX5E_STATS_GRP(ipsec_hw), +#endif + &MLX5E_STATS_GRP(ptp), +}; + +static unsigned int mlx5e_ul_rep_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5e_ul_rep_stats_grps); +} + +static const struct mlx5e_profile mlx5e_rep_profile = { + .init = mlx5e_init_rep, + .cleanup = mlx5e_cleanup_rep, + .init_rx = mlx5e_init_rep_rx, + .cleanup_rx = mlx5e_cleanup_rep_rx, + .init_tx = mlx5e_init_rep_tx, + .cleanup_tx = mlx5e_cleanup_rep_tx, + .enable = mlx5e_rep_enable, + .disable = mlx5e_rep_disable, + .update_rx = mlx5e_update_rep_rx, + .update_stats = mlx5e_stats_update_ndo_stats, + .rx_handlers = &mlx5e_rx_handlers_rep, + .max_tc = 1, + .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), + .stats_grps = mlx5e_rep_stats_grps, + .stats_grps_num = mlx5e_rep_stats_grps_num, + .max_nch_limit = mlx5e_rep_max_nch_limit, +}; + +static const struct mlx5e_profile mlx5e_uplink_rep_profile = { + .init = mlx5e_init_ul_rep, + .cleanup = mlx5e_cleanup_rep, + .init_rx = mlx5e_init_ul_rep_rx, + .cleanup_rx = mlx5e_cleanup_ul_rep_rx, + .init_tx = mlx5e_init_rep_tx, + .cleanup_tx = mlx5e_cleanup_rep_tx, + .enable = mlx5e_uplink_rep_enable, + .disable = mlx5e_uplink_rep_disable, + .update_rx = mlx5e_update_rep_rx, + .update_stats = mlx5e_stats_update_ndo_stats, + .update_carrier = mlx5e_update_carrier, + .rx_handlers = &mlx5e_rx_handlers_rep, + .max_tc = MLX5E_MAX_NUM_TC, + /* XSK is needed so we can replace profile with NIC netdev */ + .rq_groups = MLX5E_NUM_RQ_GROUPS(XSK), + .stats_grps = mlx5e_ul_rep_stats_grps, + 
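/* Unlike VF/SF representors, which expose only the minimal sw_rep and
 * vport_rep statistics groups defined above, the uplink representor
 * reuses the full NIC group set (vport, 802.3, PHY, PCIe, per-prio,
 * PTP and, when enabled, IPsec), since it represents the physical
 * port itself.
 */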
.stats_grps_num = mlx5e_ul_rep_stats_grps_num, +}; + +static int mlx5e_rep_metadata_insert(struct mlx5e_priv *priv, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv_uplink; + struct mlx5_eswitch *esw; + u32 vport_metadata; + int err; + + esw = priv->mdev->priv.eswitch; + if (!mlx5e_esw_offloads_pet_enabled(esw)) + return 0; + + rpriv_uplink = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + vport_metadata = mlx5_eswitch_get_vport_metadata_for_match(esw, rep->vport); + err = xa_insert(&rpriv_uplink->vport_rep_map, vport_metadata, rep, GFP_KERNEL); + if (err) { + esw_warn(esw->dev, "Error %d inserting metadata for vport %d\n", err, rep->vport); + goto err; + } + return 0; +err: + return err; +} + +static void mlx5e_rep_metadata_remove(struct mlx5e_priv *priv, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv_uplink; + struct mlx5_eswitch *esw; + u32 vport_metadata; + + esw = priv->mdev->priv.eswitch; + if (!mlx5e_esw_offloads_pet_enabled(esw)) + return; + + rpriv_uplink = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + vport_metadata = mlx5_eswitch_get_vport_metadata_for_match(esw, rep->vport); + xa_erase(&rpriv_uplink->vport_rep_map, vport_metadata); + synchronize_net(); +} + +/* e-Switch vport representors */ +static int +mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct devlink_port *dl_port; + int err; + + rpriv->netdev = priv->netdev; + + xa_init(&rpriv->vport_rep_map); + err = mlx5e_rep_metadata_insert(priv, rep); + if (err) + return err; + + mlx5e_ipsec_ul_cleanup(priv); + + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + if (err) + goto err_metadata_insert; + + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_eth_set(dl_port, rpriv->netdev); + + mlx5_smartnic_sysfs_init(rpriv->netdev); + mlx5_rep_sysfs_init(rpriv); + mlx5e_ipsec_build_netdev(priv); + + return 0; +err_metadata_insert: + xa_destroy(&rpriv->vport_rep_map); + + return err; +} + +static void +mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) +{ + struct net_device *netdev = rpriv->netdev; + struct devlink_port *dl_port; + struct mlx5_core_dev *dev; + struct mlx5e_priv *priv; + + priv = netdev_priv(netdev); + dev = priv->mdev; + + mlx5e_rep_metadata_remove(priv, rpriv->rep); + xa_destroy(&rpriv->vport_rep_map); + + mlx5e_ipsec_ul_cleanup(priv); + + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_clear(dl_port); + + mlx5_smartnic_sysfs_cleanup(netdev); + if (test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state)) { + mlx5e_detach_netdev(priv); + priv->profile->cleanup(priv); + mlx5e_destroy_mdev_resources(priv->mdev); + } else { + mlx5e_netdev_attach_nic_profile(priv); + } +} + +static int +mlx5e_vport_vf_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + const struct mlx5e_profile *profile; + struct devlink_port *dl_port; + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + + profile = &mlx5e_rep_profile; + netdev = mlx5e_create_netdev(dev, profile); + if (!netdev) { + mlx5_core_warn(dev, + "Failed to create representor netdev for vport %d\n", + rep->vport); + return -EINVAL; + } + + mlx5e_build_rep_netdev(netdev, dev, rep); + rpriv->netdev = netdev; + + 
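/* From this point the VF/SF representor is brought up in stages:
 * insert the vport-metadata -> rep mapping, run the rep profile's
 * init(), attach the netdev (RX/TX resources), register_netdev(), and
 * finally publish the port via sysfs and devlink. The goto ladder at
 * the end of this function unwinds the earlier stages (attach,
 * profile init, netdev creation) in reverse order on failure.
 */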
priv = netdev_priv(netdev); + priv->profile = profile; + priv->ppriv = rpriv; + priv->shared_rq = mlx5e_is_rep_shared_rq(priv); + err = mlx5e_rep_metadata_insert(netdev_priv(netdev), rep); + if (err) + return err; + + err = profile->init(dev, netdev); + if (err) { + netdev_warn(netdev, "rep profile init failed, %d\n", err); + goto err_destroy_netdev; + } + + err = mlx5e_attach_netdev(netdev_priv(netdev)); + if (err) { + netdev_warn(netdev, + "Failed to attach representor netdev for vport %d\n", + rep->vport); + goto err_cleanup_profile; + } + + err = register_netdev(netdev); + if (err) { + netdev_warn(netdev, + "Failed to register representor netdev for vport %d\n", + rep->vport); + goto err_detach_netdev; + } + + mlx5_rep_sysfs_init(rpriv); + + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_eth_set(dl_port, netdev); + mlx5_devm_sf_port_type_eth_set(dev, rpriv->rep->vport, netdev); + return 0; + +err_detach_netdev: + mlx5e_detach_netdev(netdev_priv(netdev)); + +err_cleanup_profile: + priv->profile->cleanup(priv); + +err_destroy_netdev: + mlx5e_destroy_netdev(netdev_priv(netdev)); + return err; +} + +static int +mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv; + int err; + + rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL); + if (!rpriv) + return -ENOMEM; + + /* rpriv->rep to be looked up when profile->init() is called */ + rpriv->rep = rep; + rep->rep_data[REP_ETH].priv = rpriv; + INIT_LIST_HEAD(&rpriv->vport_sqs_list); + + if (rep->vport == MLX5_VPORT_UPLINK) + err = mlx5e_vport_uplink_rep_load(dev, rep); + else + err = mlx5e_vport_vf_rep_load(dev, rep); + + if (err) + kfree(rpriv); + + return err; +} + +static void +mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev = rpriv->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *dev = priv->mdev; + struct devlink_port *dl_port; + void *ppriv = priv->ppriv; + + mlx5_rep_destroy_miss_meter(dev, rpriv); + mlx5_rep_sysfs_cleanup(rpriv); + + if (rep->vport == MLX5_VPORT_UPLINK) { + mlx5e_vport_uplink_rep_unload(rpriv); + kfree(ppriv); /* mlx5e_rep_priv */ + priv->ppriv = NULL; + return; + } + + mlx5e_rep_metadata_remove(priv, rep); + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_clear(dl_port); + unregister_netdev(netdev); + mlx5e_detach_netdev(priv); + priv->profile->cleanup(priv); + mlx5e_destroy_netdev(priv); + kfree(ppriv); /* mlx5e_rep_priv */ + priv->ppriv = NULL; +} + +static void *mlx5e_vport_rep_get_proto_dev(struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv; + + rpriv = mlx5e_rep_to_rep_priv(rep); + + return rpriv->netdev; +} + +static void mlx5e_vport_rep_event_unpair(struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv; + struct mlx5e_rep_sq *rep_sq; + + rpriv = mlx5e_rep_to_rep_priv(rep); + list_for_each_entry(rep_sq, &rpriv->vport_sqs_list, list) { + if (!rep_sq->send_to_vport_rule_peer) + continue; + mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule_peer); + rep_sq->send_to_vport_rule_peer = NULL; + } +} + +static int mlx5e_vport_rep_event_pair(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + struct mlx5_eswitch *peer_esw) +{ + struct mlx5_flow_handle *flow_rule; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_rep_sq *rep_sq; + + rpriv = mlx5e_rep_to_rep_priv(rep); + 
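/* Pairing walks every SQ this representor has registered and installs
 * a matching send-to-vport rule on the peer eswitch; if adding any rule
 * fails, the err_out path removes, via mlx5e_vport_rep_event_unpair(),
 * the peer rules added so far, so pairing is effectively all-or-nothing.
 *
 * Illustrative only (hypothetical rep with two registered SQs): the
 * switchdev events drive this as roughly
 *
 *   mlx5e_vport_rep_event(esw, rep, MLX5_SWITCHDEV_EVENT_PAIR, peer_esw);
 *       adds a send_to_vport_rule_peer for each of the two SQNs
 *   mlx5e_vport_rep_event(esw, rep, MLX5_SWITCHDEV_EVENT_UNPAIR, NULL);
 *       removes both peer rules
 */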
list_for_each_entry(rep_sq, &rpriv->vport_sqs_list, list) { + if (rep_sq->send_to_vport_rule_peer) + continue; + flow_rule = mlx5_eswitch_add_send_to_vport_rule(peer_esw, esw, rep, rep_sq->sqn); + if (IS_ERR(flow_rule)) + goto err_out; + rep_sq->send_to_vport_rule_peer = flow_rule; + } + + return 0; +err_out: + mlx5e_vport_rep_event_unpair(rep); + return PTR_ERR(flow_rule); +} + +static int mlx5e_vport_rep_event(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + enum mlx5_switchdev_event event, + void *data) +{ + int err = 0; + + if (event == MLX5_SWITCHDEV_EVENT_PAIR) + err = mlx5e_vport_rep_event_pair(esw, rep, data); + else if (event == MLX5_SWITCHDEV_EVENT_UNPAIR) + mlx5e_vport_rep_event_unpair(rep); + + return err; +} + +static const struct mlx5_eswitch_rep_ops rep_ops = { + .load = mlx5e_vport_rep_load, + .unload = mlx5e_vport_rep_unload, + .get_proto_dev = mlx5e_vport_rep_get_proto_dev, + .event = mlx5e_vport_rep_event, +}; + +static int mlx5e_rep_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = edev->mdev; + struct mlx5_eswitch *esw; + + esw = mdev->priv.eswitch; + mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_ETH); + return 0; +} + +static void mlx5e_rep_remove(struct auxiliary_device *adev) +{ + struct mlx5_adev *vdev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = vdev->mdev; + struct mlx5_eswitch *esw; + + esw = mdev->priv.eswitch; + mlx5_eswitch_unregister_vport_reps(esw, REP_ETH); +} + +static const struct auxiliary_device_id mlx5e_rep_id_table[] = { + { .name = MLX5_ADEV_NAME ".eth-rep", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary_mlx5e_rep_id_table, mlx5e_rep_id_table); + +static struct auxiliary_driver mlx5e_rep_driver = { + .name = "eth-rep", + .probe = mlx5e_rep_probe, + .remove = mlx5e_rep_remove, + .id_table = mlx5e_rep_id_table, +}; + +int mlx5e_rep_init(void) +{ + return auxiliary_driver_register(&mlx5e_rep_driver); +} + +void mlx5e_rep_cleanup(void) +{ + auxiliary_driver_unregister(&mlx5e_rep_driver); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h new file mode 100644 index 0000000..202021d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5E_REP_H__ +#define __MLX5E_REP_H__ + +#include +#include +#include +#include "eswitch.h" +#include "en.h" +#include "lib/port_tun.h" + +#ifdef CONFIG_MLX5_ESWITCH +extern const struct mlx5e_rx_handlers mlx5e_rx_handlers_rep; + +struct mlx5e_neigh_update_table { + struct rhashtable neigh_ht; + /* Save the neigh hash entries in a list in addition to the hash table + * (neigh_ht). In order to iterate easily over the neigh entries. + * Used for stats query. + */ + struct list_head neigh_list; + /* protect lookup/remove operations */ + struct mutex encap_lock; + struct notifier_block netevent_nb; + struct delayed_work neigh_stats_work; + unsigned long min_interval; /* jiffies */ +}; + +struct mlx5_tc_ct_priv; +struct mlx5_tc_int_port_priv; +struct mlx5e_rep_bond; +struct mlx5e_tc_tun_encap; +struct mlx5e_post_act; +struct mlx5e_flow_meters; + +struct mlx5_rep_uplink_priv { + /* indirect block callbacks are invoked on bind/unbind events + * on registered higher level devices (e.g. tunnel devices) + * + * tc_indr_block_cb_priv_list is used to lookup indirect callback + * private data + * + */ + struct list_head tc_indr_block_priv_list; + + struct mlx5_tun_entropy tun_entropy; + + /* protects unready_flows */ + struct mutex unready_flows_lock; + struct list_head unready_flows; + struct work_struct reoffload_flows_work; + + /* maps tun_info to a unique id*/ + struct mapping_ctx *tunnel_mapping; + /* maps tun_enc_opts to a unique id*/ + struct mapping_ctx *tunnel_enc_opts_mapping; + + struct mlx5e_post_act *post_act; + struct mlx5_tc_ct_priv *ct_priv; + struct mlx5e_tc_psample *tc_psample; + + /* support eswitch vports bonding */ + struct mlx5e_rep_bond *bond; + + /* tc tunneling encapsulation private data */ + struct mlx5e_tc_tun_encap *encap; + + /* OVS internal port support */ + struct mlx5e_tc_int_port_priv *int_port_priv; + + struct mlx5e_flow_meters *flow_meters; +}; + +struct mlx5_meter_handle; + +struct rep_meter { + u64 rate; + u64 burst; + struct mlx5e_flow_meter_handle *meter_hndl; + struct mlx5_flow_handle *meter_rule; + struct mlx5_flow_handle *drop_red_rule; + struct mlx5_fc *drop_counter; + u64 packets_dropped; + u64 bytes_dropped; +}; + +struct mlx5_rep_sysfs { + struct mlx5_eswitch *esw; + struct kobject kobj; + struct kobject paging_kobj; + int vport; +}; + +struct mlx5e_rep_priv { + struct mlx5_eswitch_rep *rep; + struct mlx5e_neigh_update_table neigh_update; + struct net_device *netdev; + struct mlx5_flow_table *root_ft; + struct mlx5_flow_handle *vport_rx_rule; + struct list_head vport_sqs_list; + struct mlx5_rep_uplink_priv uplink_priv; /* valid for uplink rep */ + struct rtnl_link_stats64 prev_vf_vport_stats; + struct xarray vport_rep_map; + struct rep_meter rep_meter; + struct mlx5_rep_sysfs rep_sysfs; + struct mlx5_flow_handle *send_to_vport_meta_rule; + struct rhashtable tc_ht; +}; + +static inline +struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep) +{ + return rep->rep_data[REP_ETH].priv; +} + +struct mlx5e_neigh { + union { + __be32 v4; + 
struct in6_addr v6; + } dst_ip; + int family; +}; + +struct mlx5e_neigh_hash_entry { + struct rhash_head rhash_node; + struct mlx5e_neigh m_neigh; + struct mlx5e_priv *priv; + struct net_device *neigh_dev; + + /* Save the neigh hash entry in a list on the representor in + * addition to the hash table. In order to iterate easily over the + * neighbour entries. Used for stats query. + */ + struct list_head neigh_list; + + /* protects encap list */ + spinlock_t encap_list_lock; + /* encap list sharing the same neigh */ + struct list_head encap_list; + + /* neigh hash entry can be deleted only when the refcount is zero. + * refcount is needed to avoid neigh hash entry removal by TC, while + * it's used by the neigh notification call. + */ + refcount_t refcnt; + + /* Save the last reported time offloaded traffic pass over one of the + * neigh hash entry flows. Use it to periodically update the neigh + * 'used' value and avoid neigh deleting by the kernel. + */ + unsigned long reported_lastuse; + + struct rcu_head rcu; +}; + +enum { + /* set when the encap entry is successfully offloaded into HW */ + MLX5_ENCAP_ENTRY_VALID = BIT(0), + MLX5_REFORMAT_DECAP = BIT(1), + MLX5_ENCAP_ENTRY_NO_ROUTE = BIT(2), +}; + +struct mlx5e_decap_key { + struct ethhdr key; +}; + +struct mlx5e_decap_entry { + struct mlx5e_decap_key key; + struct list_head flows; + struct hlist_node hlist; + refcount_t refcnt; + struct completion res_ready; + int compl_result; + struct mlx5_pkt_reformat *pkt_reformat; + struct rcu_head rcu; +}; + +struct mlx5e_mpls_info { + u32 label; + u8 tc; + u8 bos; + u8 ttl; +}; + +struct mlx5e_encap_entry { + /* attached neigh hash entry */ + struct mlx5e_neigh_hash_entry *nhe; + /* neigh hash entry list of encaps sharing the same neigh */ + struct list_head encap_list; + /* a node of the eswitch encap hash table which keeping all the encap + * entries + */ + struct hlist_node encap_hlist; + struct list_head flows; + struct list_head route_list; + struct mlx5_pkt_reformat *pkt_reformat; + const struct ip_tunnel_info *tun_info; + struct mlx5e_mpls_info mpls_info; + unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ + + struct net_device *out_dev; + int route_dev_ifindex; + struct mlx5e_tc_tunnel *tunnel; + int reformat_type; + u8 flags; + char *encap_header; + int encap_size; + refcount_t refcnt; + struct completion res_ready; + int compl_result; + struct rcu_head rcu; +}; + +struct mlx5e_rep_sq { + struct mlx5_flow_handle *send_to_vport_rule; + struct mlx5_flow_handle *send_to_vport_rule_peer; + u32 sqn; + struct list_head list; +}; + +int mlx5e_rep_init(void); +void mlx5e_rep_cleanup(void); +int mlx5e_rep_bond_init(struct mlx5e_rep_priv *rpriv); +void mlx5e_rep_bond_cleanup(struct mlx5e_rep_priv *rpriv); +int mlx5e_rep_bond_enslave(struct mlx5_eswitch *esw, struct net_device *netdev, + struct net_device *lag_dev); +void mlx5e_rep_bond_unslave(struct mlx5_eswitch *esw, + const struct net_device *netdev, + const struct net_device *lag_dev); +int mlx5e_rep_bond_update(struct mlx5e_priv *priv, bool cleanup); + +bool mlx5e_rep_has_offload_stats(const struct net_device *dev, int attr_id); +int mlx5e_rep_get_offload_stats(int attr_id, const struct net_device *dev, + void *sp); + +bool mlx5e_is_uplink_rep(const struct mlx5e_priv *priv); +void mlx5e_rep_activate_channels(struct mlx5e_priv *priv); +void mlx5e_rep_deactivate_channels(struct mlx5e_priv *priv); + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv); + +bool mlx5e_eswitch_vf_rep(const struct net_device *netdev); +bool 
mlx5e_eswitch_uplink_rep(const struct net_device *netdev); +static inline bool mlx5e_eswitch_rep(const struct net_device *netdev) +{ + return mlx5e_eswitch_vf_rep(netdev) || + mlx5e_eswitch_uplink_rep(netdev); +} + +#else /* CONFIG_MLX5_ESWITCH */ +static inline bool mlx5e_is_uplink_rep(const struct mlx5e_priv *priv) { return false; } +static inline void mlx5e_rep_activate_channels(struct mlx5e_priv *priv) {} +static inline void mlx5e_rep_deactivate_channels(struct mlx5e_priv *priv) {} +static inline int mlx5e_rep_init(void) { return 0; }; +static inline void mlx5e_rep_cleanup(void) {}; +static inline bool mlx5e_rep_has_offload_stats(const struct net_device *dev, + int attr_id) { return false; } +static inline int mlx5e_rep_get_offload_stats(int attr_id, + const struct net_device *dev, + void *sp) { return -EOPNOTSUPP; } +#endif + +static inline bool mlx5e_is_vport_rep(const struct mlx5e_priv *priv) +{ + return (MLX5_ESWITCH_MANAGER(priv->mdev) && priv->ppriv); +} +#endif /* __MLX5E_REP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c new file mode 100644 index 0000000..b224a70 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -0,0 +1,2706 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "en.h" +#include "en/txrx.h" +#include "en_tc.h" +#include "eswitch.h" +#include "en_rep.h" +#include "en/rep/tc.h" +#include "ipoib/ipoib.h" +#include "accel/ipsec.h" +#include "fpga/ipsec.h" +#include "en_accel/macsec.h" +#include "en_accel/ipsec_rxtx.h" +#include "en_accel/tls_rxtx.h" +#include "en/xdp.h" +#include "en/xsk/rx.h" +#include "en/health.h" +#include "en/params.h" +#include "devlink.h" +#include "en/devlink.h" +#include "esw/ipsec.h" + +static struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); +static struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx); +static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); +static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); +static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe); + +const struct mlx5e_rx_handlers mlx5e_rx_handlers_nic = { + .handle_rx_cqe = mlx5e_handle_rx_cqe, + .handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq, + .handle_rx_cqe_mpwqe_shampo = mlx5e_handle_rx_cqe_mpwrq_shampo, +}; + +static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) +{ + return config->rx_filter == HWTSTAMP_FILTER_ALL; +} + +static inline void mlx5e_read_cqe_slot(struct mlx5_cqwq *wq, + u32 cqcc, void *data) +{ + u32 ci = mlx5_cqwq_ctr2ix(wq, cqcc); + + memcpy(data, mlx5_cqwq_get_wqe(wq, ci), sizeof(struct mlx5_cqe64)); +} + +static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq, + struct mlx5_cqwq *wq, + u32 cqcc) +{ + struct mlx5e_cq_decomp *cqd = &rq->cqd; + struct mlx5_cqe64 *title = &cqd->title; + + mlx5e_read_cqe_slot(wq, cqcc, title); + cqd->left = be32_to_cpu(title->byte_cnt); + cqd->wqe_counter = be16_to_cpu(title->wqe_counter); + rq->stats->cqe_compress_blks++; +} + +static inline void mlx5e_read_mini_arr_slot(struct mlx5_cqwq *wq, + struct mlx5e_cq_decomp *cqd, + u32 cqcc) +{ + mlx5e_read_cqe_slot(wq, cqcc, cqd->mini_arr); + cqd->mini_arr_idx = 0; +} + +static inline void mlx5e_cqes_update_owner(struct mlx5_cqwq *wq, int n) +{ + u32 cqcc = wq->cc; + u8 op_own = mlx5_cqwq_get_ctr_wrap_cnt(wq, cqcc) & 1; + u32 ci = mlx5_cqwq_ctr2ix(wq, cqcc); + u32 wq_sz = mlx5_cqwq_get_size(wq); + u32 ci_top = min_t(u32, wq_sz, ci + n); + + for (; ci < ci_top; ci++, n--) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, ci); + + cqe->op_own = op_own; + } + + if (unlikely(ci == wq_sz)) { + op_own = !op_own; + for (ci = 0; ci < n; ci++) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, ci); + + cqe->op_own = op_own; + } + } +} + +static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq, + struct mlx5_cqwq *wq, + u32 cqcc) +{ + struct mlx5e_cq_decomp *cqd = &rq->cqd; + struct mlx5_mini_cqe8 *mini_cqe = &cqd->mini_arr[cqd->mini_arr_idx]; + struct mlx5_cqe64 *title = &cqd->title; + + title->byte_cnt = mini_cqe->byte_cnt; + title->check_sum = mini_cqe->checksum; + title->op_own &= 0xf0; + title->op_own |= 0x01 & (cqcc >> wq->fbc.log_sz); + + /* state bit set implies linked-list striding RQ wq type and + * HW stride index capability supported + */ + if (test_bit(MLX5E_RQ_STATE_MINI_CQE_HW_STRIDX, &rq->state)) { + title->wqe_counter = mini_cqe->stridx; + return; + } + + /* HW stride index capability not supported */ + title->wqe_counter = 
cpu_to_be16(cqd->wqe_counter); + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + cqd->wqe_counter += mpwrq_get_cqe_consumed_strides(title); + else + cqd->wqe_counter = + mlx5_wq_cyc_ctr2ix(&rq->wqe.wq, cqd->wqe_counter + 1); +} + +static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq, + struct mlx5_cqwq *wq, + u32 cqcc) +{ + struct mlx5e_cq_decomp *cqd = &rq->cqd; + + mlx5e_decompress_cqe(rq, wq, cqcc); + cqd->title.rss_hash_type = 0; + cqd->title.rss_hash_result = 0; +} + +static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq, + struct mlx5_cqwq *wq, + int update_owner_only, + int budget_rem) +{ + struct mlx5e_cq_decomp *cqd = &rq->cqd; + u32 cqcc = wq->cc + update_owner_only; + u32 cqe_count; + u32 i; + + cqe_count = min_t(u32, cqd->left, budget_rem); + + for (i = update_owner_only; i < cqe_count; + i++, cqd->mini_arr_idx++, cqcc++) { + if (cqd->mini_arr_idx == MLX5_MINI_CQE_ARRAY_SIZE) + mlx5e_read_mini_arr_slot(wq, cqd, cqcc); + + mlx5e_decompress_cqe_no_hash(rq, wq, cqcc); + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe_mpwrq_shampo, mlx5e_handle_rx_cqe, + rq, &cqd->title); + } + mlx5e_cqes_update_owner(wq, cqcc - wq->cc); + wq->cc = cqcc; + cqd->left -= cqe_count; + rq->stats->cqe_compress_pkts += cqe_count; + + return cqe_count; +} + +static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq, + struct mlx5_cqwq *wq, + int budget_rem) +{ + struct mlx5e_cq_decomp *cqd = &rq->cqd; + u32 cc = wq->cc; + + mlx5e_read_title_slot(rq, wq, cc); + mlx5e_read_mini_arr_slot(wq, cqd, cc + 1); + mlx5e_decompress_cqe(rq, wq, cc); + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe_mpwrq_shampo, mlx5e_handle_rx_cqe, + rq, &cqd->title); + cqd->mini_arr_idx++; + + return mlx5e_decompress_cqes_cont(rq, wq, 1, budget_rem) - 1; +} + +static inline void mlx5e_rx_cache_page_swap(struct mlx5e_page_cache *cache, + u32 a, u32 b) +{ + struct mlx5e_dma_info tmp; + + tmp = cache->page_cache[a]; + cache->page_cache[a] = cache->page_cache[b]; + cache->page_cache[b] = tmp; +} + +static inline void +mlx5e_rx_cache_reduce_reset_watch(struct mlx5e_page_cache *cache) +{ + struct mlx5e_page_cache_reduce *reduce = &cache->reduce; + + reduce->next_ts = ilog2(cache->sz) == cache->log_min_sz ? 
+ MAX_JIFFY_OFFSET : + jiffies + reduce->graceful_period; + reduce->successive = 0; +} + +static inline bool mlx5e_rx_cache_is_empty(struct mlx5e_page_cache *cache) +{ + return cache->head < 0; +} +static inline bool mlx5e_rx_cache_page_busy(struct mlx5e_page_cache *cache, + u32 i) +{ + struct mlx5e_dma_info *di = &cache->page_cache[i]; + + return (page_ref_count(di->page) - di->refcnt_bias) != 1; +} + +static inline bool mlx5e_rx_cache_check_reduce(struct mlx5e_rq *rq) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + + if (!cache->page_cache) + return false; + + if (unlikely(test_bit(MLX5E_RQ_STATE_CACHE_REDUCE_PENDING, &rq->state))) + return false; + + if (time_before(jiffies, cache->reduce.next_ts)) + return false; + + if (likely(!mlx5e_rx_cache_is_empty(cache)) && + mlx5e_rx_cache_page_busy(cache, cache->head)) + goto reset_watch; + + if (ilog2(cache->sz) == cache->log_min_sz) + goto reset_watch; + + /* would like to reduce */ + if (cache->reduce.successive < MLX5E_PAGE_CACHE_REDUCE_SUCCESSIVE_CNT) { + cache->reduce.successive++; + return false; + } + + return true; + +reset_watch: + mlx5e_rx_cache_reduce_reset_watch(cache); + return false; + +} + +static inline void mlx5e_rx_cache_may_reduce(struct mlx5e_rq *rq) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + struct mlx5e_page_cache_reduce *reduce = &cache->reduce; + int max_new_head; + + if (!mlx5e_rx_cache_check_reduce(rq)) + return; + + /* do reduce */ + rq->stats->cache_rdc++; + cache->sz >>= 1; + max_new_head = (cache->sz >> 1) - 1; + if (cache->head > max_new_head) { + u32 npages = cache->head - max_new_head; + + cache->head = max_new_head; + if (cache->lrs >= cache->head) + cache->lrs = 0; + + memcpy(reduce->pending, &cache->page_cache[cache->head + 1], + npages * sizeof(*reduce->pending)); + reduce->npages = npages; + set_bit(MLX5E_RQ_STATE_CACHE_REDUCE_PENDING, &rq->state); + } + + mlx5e_rx_cache_reduce_reset_watch(cache); +} + +static inline bool mlx5e_rx_cache_extend(struct mlx5e_rq *rq) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + struct mlx5e_page_cache_reduce *reduce = &cache->reduce; + struct mlx5e_params *params = &rq->priv->channels.params; + u8 log_limit_sz = cache->log_min_sz + params->log_rx_page_cache_mult; + + if (ilog2(cache->sz) >= log_limit_sz) + return false; + + rq->stats->cache_ext++; + cache->sz <<= 1; + + mlx5e_rx_cache_reduce_reset_watch(cache); + schedule_delayed_work_on(smp_processor_id(), &reduce->reduce_work, + reduce->delay); + return true; +} + +static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + struct mlx5e_rq_stats *stats = rq->stats; + + if (unlikely(cache->head == cache->sz - 1)) { + if (!mlx5e_rx_cache_extend(rq)) { + rq->stats->cache_full++; + return false; + } + } + + if (!dev_page_is_reusable(dma_info->page)) { + stats->cache_waive++; + return false; + } + + cache->page_cache[++cache->head] = *dma_info; + return true; +} + +/* Pageref elevation on page-alloc. + * Don't get too close to U32_MAX, keep room for other components + * to do further increments (SKB clones, forwarding, etc...) 
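+ *
+ * Editor's note -- illustrative example, not part of the original patch:
+ * the refcnt_bias scheme below trades one atomic page_ref_inc() per
+ * packet for a single bulk page_ref_add() per (re)elevation, roughly:
+ *
+ *   page_ref_elev(di);   // page refcount: 1 + 65535, refcnt_bias: 65535
+ *   di->refcnt_bias--;   // frag #1 handed to an SKB, no atomic op
+ *   di->refcnt_bias--;   // frag #2
+ *   // outstanding users = page_ref_count(page) - di->refcnt_bias
+ *   //                   = 65536 - 65533 = 3 (driver + two in-flight SKBs)
+ *
+ * Once the SKBs are freed, put_page() brings the difference back to 1,
+ * which mlx5e_rx_cache_page_busy() treats as "idle"; any unused
+ * pre-charged references are subtracted again when the page is released.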
+ */ +#define PAGE_REF_ELEV (U16_MAX) + +/* Upper bound on number of packets that share a single page */ +#define PAGE_REF_THRSD (PAGE_SIZE / 64) + +static inline void page_ref_elev(struct mlx5e_dma_info *dma_info) +{ + page_ref_add(dma_info->page, PAGE_REF_ELEV); + dma_info->refcnt_bias += PAGE_REF_ELEV; +} + +static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + struct mlx5e_page_cache *cache = &rq->page_cache; + struct mlx5e_rq_stats *stats = rq->stats; + + if (unlikely(mlx5e_rx_cache_is_empty(cache))) + goto err_no_page; + + mlx5e_rx_cache_page_swap(cache, cache->head, cache->lrs); + cache->lrs++; + if (cache->lrs >= cache->head) + cache->lrs = 0; + if (mlx5e_rx_cache_page_busy(cache, cache->head)) + goto err_no_page; + + stats->cache_reuse++; + *dma_info = cache->page_cache[cache->head--]; + + dma_sync_single_for_device(rq->pdev, dma_info->addr, + PAGE_SIZE, + DMA_FROM_DEVICE); + + if (unlikely(page_ref_count(dma_info->page) <= PAGE_REF_THRSD)) + page_ref_elev(dma_info); + + return true; + +err_no_page: + stats->cache_alloc++; + cache->reduce.successive = 0; + + return false; +} + +static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + if (mlx5e_rx_cache_get(rq, dma_info)) + return 0; + + dma_info->page = page_pool_dev_alloc_pages(rq->page_pool); + if (unlikely(!dma_info->page)) + return -ENOMEM; + + dma_info->refcnt_bias = 0; + page_ref_elev(dma_info); + + dma_info->addr = dma_map_page_attrs(rq->pdev, dma_info->page, 0, PAGE_SIZE, + rq->buff.map_dir, DMA_ATTR_SKIP_CPU_SYNC); + if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) { + page_pool_recycle_direct(rq->page_pool, dma_info->page); + page_ref_sub(dma_info->page, dma_info->refcnt_bias); + dma_info->page = NULL; + return -ENOMEM; + } + + return 0; +} + +static inline int mlx5e_page_alloc(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info) +{ + if (rq->xsk_pool) + return mlx5e_xsk_page_alloc_pool(rq, dma_info); + else + return mlx5e_page_alloc_pool(rq, dma_info); +} + +void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info) +{ + dma_unmap_page_attrs(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir, + DMA_ATTR_SKIP_CPU_SYNC); +} + +void mlx5e_page_release_dynamic(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info, + bool recycle) +{ + if (likely(recycle)) { + if (mlx5e_rx_cache_put(rq, dma_info)) + return; + + mlx5e_page_dma_unmap(rq, dma_info); + page_ref_sub(dma_info->page, dma_info->refcnt_bias); + page_pool_recycle_direct(rq->page_pool, dma_info->page); + } else { + mlx5e_page_dma_unmap(rq, dma_info); + page_pool_release_page(rq->page_pool, dma_info->page); + mlx5e_put_page(dma_info); + } +} + +static inline void mlx5e_page_release(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info, + bool recycle) +{ + if (rq->xsk_pool) + /* The `recycle` parameter is ignored, and the page is always + * put into the Reuse Ring, because there is no way to return + * the page to the userspace when the interface goes down. + */ + xsk_buff_free(dma_info->xsk); + else + mlx5e_page_release_dynamic(rq, dma_info, recycle); +} + +static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq, + struct mlx5e_wqe_frag_info *frag) +{ + int err = 0; + + if (!frag->offset) + /* On first frag (offset == 0), replenish page (dma_info actually). + * Other frags that point to the same dma_info (with a different + * offset) should just use the new one without replenishing again + * by themselves. 
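+ *
+ * Editor's illustrative trace (not part of the original comment),
+ * assuming two 2 KiB fragments carved out of one 4 KiB page:
+ *
+ *   mlx5e_get_rx_frag(frag[0])  offset 0     -> mlx5e_page_alloc() maps page
+ *   mlx5e_get_rx_frag(frag[1])  offset 2048  -> reuses frag[0]'s dma_info
+ *   mlx5e_put_rx_frag(frag[1])  last_in_page -> page released/recycled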
+ */ + err = mlx5e_page_alloc(rq, frag->di); + + return err; +} + +static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq, + struct mlx5e_wqe_frag_info *frag, + bool recycle) +{ + if (frag->last_in_page) + mlx5e_page_release(rq, frag->di, recycle); +} + +static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix) +{ + return &rq->wqe.frags[ix << rq->wqe.info.log_num_frags]; +} + +static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, + u16 ix) +{ + struct mlx5e_wqe_frag_info *frag = get_frag(rq, ix); + int err; + int i; + + for (i = 0; i < rq->wqe.info.num_frags; i++, frag++) { + err = mlx5e_get_rx_frag(rq, frag); + if (unlikely(err)) + goto free_frags; + + wqe->data[i].addr = cpu_to_be64(frag->di->addr + + frag->offset + rq->buff.headroom); + } + + return 0; + +free_frags: + while (--i >= 0) + mlx5e_put_rx_frag(rq, --frag, true); + + return err; +} + +static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq, + struct mlx5e_wqe_frag_info *wi, + bool recycle) +{ + int i; + + for (i = 0; i < rq->wqe.info.num_frags; i++, wi++) + mlx5e_put_rx_frag(rq, wi, recycle); +} + +static void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix) +{ + struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix); + + mlx5e_free_rx_wqe(rq, wi, false); +} + +static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk) +{ + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + int err; + int i; + + if (rq->xsk_pool) { + int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags; + + /* Check in advance that we have enough frames, instead of + * allocating one-by-one, failing and moving frames to the + * Reuse Ring. + */ + if (unlikely(!xsk_buff_can_alloc(rq->xsk_pool, pages_desired))) + return -ENOMEM; + } + + for (i = 0; i < wqe_bulk; i++) { + struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i); + + err = mlx5e_alloc_rx_wqe(rq, wqe, ix + i); + if (unlikely(err)) + goto free_wqes; + } + + return 0; + +free_wqes: + while (--i >= 0) + mlx5e_dealloc_rx_wqe(rq, ix + i); + + return err; +} + +static inline void +mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5e_dma_info *di, u32 frag_offset, u32 len, + unsigned int truesize) +{ + dma_sync_single_for_cpu(rq->pdev, + di->addr + frag_offset, + len, DMA_FROM_DEVICE); + di->refcnt_bias--; + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + di->page, frag_offset, len, truesize); +} + +static inline void +mlx5e_copy_skb_header(struct device *pdev, struct sk_buff *skb, + struct mlx5e_dma_info *dma_info, + int offset_from, int dma_offset, u32 headlen) +{ + const void *from = page_address(dma_info->page) + offset_from; + /* Aligning len to sizeof(long) optimizes memcpy performance */ + unsigned int len = ALIGN(headlen, sizeof(long)); + + dma_sync_single_for_cpu(pdev, dma_info->addr + dma_offset, len, + DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, from, len); +} + +static void mlx5e_mpwqe_page_release(struct mlx5e_rq *rq, + struct mlx5e_dma_info *dma_info, + bool recycle) +{ +#ifdef CONFIG_PPC + if (dma_info->page) { + mlx5e_page_release(rq, dma_info, recycle); + dma_info->page = NULL; + } +#else + mlx5e_page_release(rq, dma_info, recycle); +#endif +} + +static void +mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle) +{ + bool no_xdp_xmit; + struct mlx5e_dma_info *dma_info = wi->umr.dma_info; + int i; + + /* A common case for AF_XDP. 
*/ + if (bitmap_full(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE)) + return; + + no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap, + MLX5_MPWRQ_PAGES_PER_WQE); + + for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) + if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap)) + mlx5e_mpwqe_page_release(rq, &dma_info[i], recycle); +} + +static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq, u8 n) +{ + struct mlx5_wq_ll *wq = &rq->mpwqe.wq; + + do { + u16 next_wqe_index = mlx5_wq_ll_get_wqe_next_ix(wq, wq->head); + + mlx5_wq_ll_push(wq, next_wqe_index); + } while (--n); + + /* ensure wqes are visible to device before updating doorbell record */ + dma_wmb(); + + mlx5_wq_ll_update_db_record(wq); + + mlx5e_rx_cache_may_reduce(rq); +} + +/* This function returns the size of the continuous free space inside a bitmap + * that starts from first and no longer than len including circular ones. + */ +static int bitmap_find_window(unsigned long *bitmap, int len, + int bitmap_size, int first) +{ + int next_one, count; + + next_one = find_next_bit(bitmap, bitmap_size, first); + if (next_one == bitmap_size) { + if (bitmap_size - first >= len) + return len; + next_one = find_next_bit(bitmap, bitmap_size, 0); + count = next_one + bitmap_size - first; + } else { + count = next_one - first; + } + + return min(len, count); +} + +static void build_klm_umr(struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *umr_wqe, + __be32 key, u16 offset, u16 klm_len, u16 wqe_bbs) +{ + memset(umr_wqe, 0, offsetof(struct mlx5e_umr_wqe, inline_klms)); + umr_wqe->ctrl.opmod_idx_opcode = + cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_UMR); + umr_wqe->ctrl.umr_mkey = key; + umr_wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) + | MLX5E_KLM_UMR_DS_CNT(klm_len)); + umr_wqe->uctrl.flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE; + umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset); + umr_wqe->uctrl.xlt_octowords = cpu_to_be16(klm_len); + umr_wqe->uctrl.mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); +} + +static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, + struct mlx5e_icosq *sq, + u16 klm_entries, u16 index) +{ + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + u16 entries, pi, header_offset, err, wqe_bbs, new_entries; + u32 lkey = rq->mdev->mlx5e_res.hw_objs.mkey; + struct page *page = shampo->last_page; + u64 addr = shampo->last_addr; + struct mlx5e_dma_info *dma_info; + struct mlx5e_umr_wqe *umr_wqe; + int headroom, i; + + headroom = rq->buff.headroom; + new_entries = klm_entries - (shampo->pi & (MLX5_UMR_KLM_ALIGNMENT - 1)); + entries = ALIGN(klm_entries, MLX5_UMR_KLM_ALIGNMENT); + wqe_bbs = MLX5E_KLM_UMR_WQEBBS(entries); + pi = mlx5e_icosq_get_next_pi(sq, wqe_bbs); + umr_wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); + build_klm_umr(sq, umr_wqe, shampo->key, index, entries, wqe_bbs); + + for (i = 0; i < entries; i++, index++) { + dma_info = &shampo->info[index]; + if (i >= klm_entries || (index < shampo->pi && shampo->pi - index < + MLX5_UMR_KLM_ALIGNMENT)) + goto update_klm; + header_offset = (index & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) << + MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; + if (!(header_offset & (PAGE_SIZE - 1))) { + err = mlx5e_page_alloc(rq, dma_info); + if (unlikely(err)) + goto err_unmap; + addr = dma_info->addr; + page = dma_info->page; + } else { + dma_info->addr = addr + header_offset; + dma_info->page = page; + } + +update_klm: + umr_wqe->inline_klms[i].bcount = + cpu_to_be32(MLX5E_RX_MAX_HEAD); + umr_wqe->inline_klms[i].key = cpu_to_be32(lkey); + 
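+
+ /* Editor's note -- illustrative, not part of the original patch:
+  * header slots are packed into shared pages, so mlx5e_page_alloc()
+  * above runs only when header_offset wraps to a page boundary.
+  * Assuming 256-byte slots and 4 KiB pages (16 headers per page):
+  *
+  *   index 16: header_offset = 0    -> allocate a new page
+  *   index 17: header_offset = 256  -> reuse it, addr = base + 256
+  *   ...
+  *   index 31: header_offset = 3840 -> reuse it
+  *   index 32: header_offset = 0    -> allocate the next page
+  */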
umr_wqe->inline_klms[i].va = + cpu_to_be64(dma_info->addr + headroom); + } + + sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { + .wqe_type = MLX5E_ICOSQ_WQE_SHAMPO_HD_UMR, + .num_wqebbs = wqe_bbs, + .shampo.len = new_entries, + }; + + shampo->pi = (shampo->pi + new_entries) & (shampo->hd_per_wq - 1); + shampo->last_page = page; + shampo->last_addr = addr; + sq->pc += wqe_bbs; + sq->doorbell_cseg = &umr_wqe->ctrl; + + return 0; + +err_unmap: + while (--i >= 0) { + dma_info = &shampo->info[--index]; + if (!(i & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1))) { + dma_info->addr = ALIGN_DOWN(dma_info->addr, PAGE_SIZE); + mlx5e_page_release(rq, dma_info, true); + } + } + rq->stats->buff_alloc_err++; + return err; +} + +static int mlx5e_alloc_rx_hd_mpwqe(struct mlx5e_rq *rq) +{ + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + u16 klm_entries, num_wqe, index, entries_before; + struct mlx5e_icosq *sq = rq->icosq; + int i, err, max_klm_entries, len; + + max_klm_entries = MLX5E_MAX_KLM_PER_WQE(rq->mdev); + klm_entries = bitmap_find_window(shampo->bitmap, + shampo->hd_per_wqe, + shampo->hd_per_wq, shampo->pi); + if (!klm_entries) + return 0; + + klm_entries += (shampo->pi & (MLX5_UMR_KLM_ALIGNMENT - 1)); + index = ALIGN_DOWN(shampo->pi, MLX5_UMR_KLM_ALIGNMENT); + entries_before = shampo->hd_per_wq - index; + + if (unlikely(entries_before < klm_entries)) + num_wqe = DIV_ROUND_UP(entries_before, max_klm_entries) + + DIV_ROUND_UP(klm_entries - entries_before, max_klm_entries); + else + num_wqe = DIV_ROUND_UP(klm_entries, max_klm_entries); + + for (i = 0; i < num_wqe; i++) { + len = (klm_entries > max_klm_entries) ? max_klm_entries : + klm_entries; + if (unlikely(index + len > shampo->hd_per_wq)) + len = shampo->hd_per_wq - index; + err = mlx5e_build_shampo_hd_umr(rq, sq, len, index); + if (unlikely(err)) + return err; + index = (index + len) & (rq->mpwqe.shampo->hd_per_wq - 1); + klm_entries -= len; + } + + return 0; +} + +static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) +{ + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; + struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0]; + struct mlx5e_icosq *sq = rq->icosq; + struct mlx5_wq_cyc *wq = &sq->wq; + struct mlx5e_umr_wqe *umr_wqe; + u16 pi; + int err; + int i; + + /* Check in advance that we have enough frames, instead of allocating + * one-by-one, failing and moving frames to the Reuse Ring. 
+ */ + if (rq->xsk_pool && + unlikely(!xsk_buff_can_alloc(rq->xsk_pool, MLX5_MPWRQ_PAGES_PER_WQE))) { + err = -ENOMEM; + goto err; + } + + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) { + err = mlx5e_alloc_rx_hd_mpwqe(rq); + if (unlikely(err)) + goto err; + } + + pi = mlx5e_icosq_get_next_pi(sq, MLX5E_UMR_WQEBBS); + umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); + memcpy(umr_wqe, &rq->mpwqe.umr_wqe, offsetof(struct mlx5e_umr_wqe, inline_mtts)); + + for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) { + err = mlx5e_page_alloc(rq, dma_info); + if (unlikely(err)) + goto err_unmap; + umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR); + } + + bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE); + wi->consumed_strides = 0; + + umr_wqe->ctrl.opmod_idx_opcode = + cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_UMR); + umr_wqe->uctrl.xlt_offset = + cpu_to_be16(MLX5_ALIGNED_MTTS_OCTW(MLX5E_REQUIRED_MTTS(ix))); + + sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { + .wqe_type = MLX5E_ICOSQ_WQE_UMR_RX, + .num_wqebbs = MLX5E_UMR_WQEBBS, + .umr.rq = rq, + }; + + sq->pc += MLX5E_UMR_WQEBBS; + + sq->doorbell_cseg = &umr_wqe->ctrl; + + return 0; + +err_unmap: + while (--i >= 0) { + dma_info--; + mlx5e_mpwqe_page_release(rq, dma_info, true); + } + +err: + rq->stats->buff_alloc_err++; + + return err; +} + +/* This function is responsible to dealloc SHAMPO header buffer. + * close == true specifies that we are in the middle of closing RQ operation so + * we go over all the entries and if they are not in use we free them, + * otherwise we only go over a specific range inside the header buffer that are + * not in use. + */ +void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq, u16 len, u16 start, bool close) +{ + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + int hd_per_wq = shampo->hd_per_wq; + struct page *deleted_page = NULL; + struct mlx5e_dma_info *hd_info; + int i, index = start; + + for (i = 0; i < len; i++, index++) { + if (index == hd_per_wq) + index = 0; + + if (close && !test_bit(index, shampo->bitmap)) + continue; + + hd_info = &shampo->info[index]; + hd_info->addr = ALIGN_DOWN(hd_info->addr, PAGE_SIZE); + if (hd_info->page != deleted_page) { + deleted_page = hd_info->page; + mlx5e_page_release(rq, hd_info, false); + } + } + + if (start + len > hd_per_wq) { + len -= hd_per_wq - start; + bitmap_clear(shampo->bitmap, start, hd_per_wq - start); + start = 0; + } + + bitmap_clear(shampo->bitmap, start, len); +} + +static void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) +{ + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix]; + /* Don't recycle, this function is called on rq/netdev close */ + mlx5e_free_rx_mpwqe(rq, wi, false); +} + +INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq) +{ + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + u8 wqe_bulk; + int err; + + if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state))) + return false; + + wqe_bulk = rq->wqe.info.wqe_bulk; + + if (mlx5_wq_cyc_missing(wq) < wqe_bulk) + return false; + + if (rq->page_pool) + page_pool_nid_changed(rq->page_pool, numa_mem_id()); + + do { + u16 head = mlx5_wq_cyc_get_head(wq); + + err = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk); + if (unlikely(err)) { + rq->stats->buff_alloc_err++; + break; + } + + mlx5_wq_cyc_push_n(wq, wqe_bulk); + } while (mlx5_wq_cyc_missing(wq) >= wqe_bulk); + + /* ensure wqes are visible to device before updating doorbell record */ + dma_wmb(); + + mlx5_wq_cyc_update_db_record(wq); + + mlx5e_rx_cache_may_reduce(rq); + + return 
!!err; +} + +INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_skip(struct mlx5e_rq *rq) +{ + /* Empty function for representors in + * shared_rq mode. return false as if + * RQ (even though non-existent) is not + * ready. + */ + return false; +} + +void mlx5e_free_icosq_descs(struct mlx5e_icosq *sq) +{ + u16 sqcc; + + sqcc = sq->cc; + + while (sqcc != sq->pc) { + struct mlx5e_icosq_wqe_info *wi; + u16 ci; + + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + wi = &sq->db.wqe_info[ci]; + sqcc += wi->num_wqebbs; +#ifdef CONFIG_MLX5_EN_TLS + switch (wi->wqe_type) { + case MLX5E_ICOSQ_WQE_SET_PSV_TLS: + mlx5e_ktls_handle_ctx_completion(wi); + break; + case MLX5E_ICOSQ_WQE_GET_PSV_TLS: + mlx5e_ktls_handle_get_psv_completion(wi, sq); + break; + } +#endif + } + sq->cc = sqcc; +} + +static void mlx5e_handle_shampo_hd_umr(struct mlx5e_shampo_umr umr, + struct mlx5e_icosq *sq) +{ + struct mlx5e_channel *c = container_of(sq, struct mlx5e_channel, icosq); + struct mlx5e_shampo_hd *shampo; + /* assume 1:1 relationship between RQ and icosq */ + struct mlx5e_rq *rq = &c->rq; + int end, from, len = umr.len; + + shampo = rq->mpwqe.shampo; + end = shampo->hd_per_wq; + from = shampo->ci; + if (from + len > shampo->hd_per_wq) { + len -= end - from; + bitmap_set(shampo->bitmap, from, end - from); + from = 0; + } + + bitmap_set(shampo->bitmap, from, len); + shampo->ci = (shampo->ci + umr.len) & (shampo->hd_per_wq - 1); +} + +int mlx5e_poll_ico_cq(struct mlx5e_cq *cq) +{ + struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq); + struct mlx5_cqe64 *cqe; + u16 sqcc; + int i; + + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) + return 0; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (likely(!cqe)) + return 0; + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + sqcc = sq->cc; + + i = 0; + do { + u16 wqe_counter; + bool last_wqe; + + mlx5_cqwq_pop(&cq->wq); + + wqe_counter = be16_to_cpu(cqe->wqe_counter); + + do { + struct mlx5e_icosq_wqe_info *wi; + u16 ci; + + last_wqe = (sqcc == wqe_counter); + + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + wi = &sq->db.wqe_info[ci]; + sqcc += wi->num_wqebbs; + + if (last_wqe && unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { + netdev_WARN_ONCE(cq->netdev, + "Bad OP in ICOSQ CQE: 0x%x\n", + get_cqe_opcode(cqe)); + mlx5e_dump_error_cqe(&sq->cq, sq->sqn, + (struct mlx5_err_cqe *)cqe); + mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs); + if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) + queue_work(cq->priv->wq, &sq->recover_work); + break; + } + + switch (wi->wqe_type) { + case MLX5E_ICOSQ_WQE_UMR_RX: + wi->umr.rq->mpwqe.umr_completed++; + break; + case MLX5E_ICOSQ_WQE_NOP: + break; + case MLX5E_ICOSQ_WQE_SHAMPO_HD_UMR: + mlx5e_handle_shampo_hd_umr(wi->shampo, sq); + break; +#ifdef CONFIG_MLX5_EN_TLS + case MLX5E_ICOSQ_WQE_UMR_TLS: + break; + case MLX5E_ICOSQ_WQE_SET_PSV_TLS: + mlx5e_ktls_handle_ctx_completion(wi); + break; + case MLX5E_ICOSQ_WQE_GET_PSV_TLS: + mlx5e_ktls_handle_get_psv_completion(wi, sq); + break; +#endif + default: + netdev_WARN_ONCE(cq->netdev, + "Bad WQE type in ICOSQ WQE info: 0x%x\n", + wi->wqe_type); + } + } while (!last_wqe); + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + + sq->cc = sqcc; + + mlx5_cqwq_update_db_record(&cq->wq); + + return i; +} + +INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq) +{ + struct mlx5_wq_ll *wq = &rq->mpwqe.wq; + u8 umr_completed = rq->mpwqe.umr_completed; + struct mlx5e_icosq *sq = 
rq->icosq; + int alloc_err = 0; + u8 missing, i; + u16 head; + + if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state))) + return false; + + if (umr_completed) { + mlx5e_post_rx_mpwqe(rq, umr_completed); + rq->mpwqe.umr_in_progress -= umr_completed; + rq->mpwqe.umr_completed = 0; + } + + missing = mlx5_wq_ll_missing(wq) - rq->mpwqe.umr_in_progress; + + if (unlikely(rq->mpwqe.umr_in_progress > rq->mpwqe.umr_last_bulk)) + rq->stats->congst_umr++; + +#define UMR_WQE_BULK (2) + if (likely(missing < UMR_WQE_BULK)) + return false; + + if (rq->page_pool) + page_pool_nid_changed(rq->page_pool, numa_mem_id()); + + head = rq->mpwqe.actual_wq_head; + i = missing; + do { + alloc_err = mlx5e_alloc_rx_mpwqe(rq, head); + + if (unlikely(alloc_err)) + break; + head = mlx5_wq_ll_get_wqe_next_ix(wq, head); + } while (--i); + + rq->mpwqe.umr_last_bulk = missing - i; + if (sq->doorbell_cseg) { + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg); + sq->doorbell_cseg = NULL; + } + + rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk; + rq->mpwqe.actual_wq_head = head; + + /* If XSK Fill Ring doesn't have enough frames, report the error, so + * that one of the actions can be performed: + * 1. If need_wakeup is used, signal that the application has to kick + * the driver when it refills the Fill Ring. + * 2. Otherwise, busy poll by rescheduling the NAPI poll. + */ + if (unlikely(alloc_err == -ENOMEM && rq->xsk_pool)) + return true; + + return false; +} + +static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr *tcp) +{ + u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe); + u8 tcp_ack = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) || + (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA); + + tcp->check = 0; + tcp->psh = get_cqe_lro_tcppsh(cqe); + + if (tcp_ack) { + tcp->ack = 1; + tcp->ack_seq = cqe->lro.ack_seq_num; + tcp->window = cqe->lro.tcp_win; + } +} + +static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe, + u32 cqe_bcnt) +{ + struct ethhdr *eth = (struct ethhdr *)(skb->data); + struct tcphdr *tcp; + int network_depth = 0; + __wsum check; + __be16 proto; + u16 tot_len; + void *ip_p; + + proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth); + + tot_len = cqe_bcnt - network_depth; + ip_p = skb->data + network_depth; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *ipv4 = ip_p; + + tcp = ip_p + sizeof(struct iphdr); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + + ipv4->ttl = cqe->lro.min_ttl; + ipv4->tot_len = cpu_to_be16(tot_len); + ipv4->check = 0; + ipv4->check = ip_fast_csum((unsigned char *)ipv4, + ipv4->ihl); + + mlx5e_lro_update_tcp_hdr(cqe, tcp); + check = csum_partial(tcp, tcp->doff * 4, + csum_unfold((__force __sum16)cqe->check_sum)); + /* Almost done, don't forget the pseudo header */ + tcp->check = csum_tcpudp_magic(ipv4->saddr, ipv4->daddr, + tot_len - sizeof(struct iphdr), + IPPROTO_TCP, check); + } else { + u16 payload_len = tot_len - sizeof(struct ipv6hdr); + struct ipv6hdr *ipv6 = ip_p; + + tcp = ip_p + sizeof(struct ipv6hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + + ipv6->hop_limit = cqe->lro.min_ttl; + ipv6->payload_len = cpu_to_be16(payload_len); + + mlx5e_lro_update_tcp_hdr(cqe, tcp); + check = csum_partial(tcp, tcp->doff * 4, + csum_unfold((__force __sum16)cqe->check_sum)); + /* Almost done, don't forget the pseudo header */ + tcp->check = csum_ipv6_magic(&ipv6->saddr, &ipv6->daddr, payload_len, + IPPROTO_TCP, check); + } +} + +static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index) +{ + 
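+ /* Editor's note -- illustrative, not part of the original patch:
+  * the header slot's kernel VA is recovered from its DMA address by
+  * keeping only the in-page offset and re-adding the reserved headroom,
+  * e.g. for a slot at page offset 0x300 with 0x40 bytes of headroom:
+  *
+  *   page_address(info->page) + 0x300 + 0x40
+  *
+  * which points at the Ethernet header of the buffered packet header.
+  */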
struct mlx5e_dma_info *last_head = &rq->mpwqe.shampo->info[header_index]; + u16 head_offset = (last_head->addr & (PAGE_SIZE - 1)) + rq->buff.headroom; + + return page_address(last_head->page) + head_offset; +} + +static void mlx5e_shampo_update_ipv4_udp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4) +{ + int udp_off = rq->hw_gro_data->fk.control.thoff; + struct sk_buff *skb = rq->hw_gro_data->skb; + struct udphdr *uh; + + uh = (struct udphdr *)(skb->data + udp_off); + uh->len = htons(skb->len - udp_off); + + if (uh->check) + uh->check = ~udp_v4_check(skb->len - udp_off, ipv4->saddr, + ipv4->daddr, 0); + + skb->csum_start = (unsigned char *)uh - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; +} + +static void mlx5e_shampo_update_ipv6_udp_hdr(struct mlx5e_rq *rq, struct ipv6hdr *ipv6) +{ + int udp_off = rq->hw_gro_data->fk.control.thoff; + struct sk_buff *skb = rq->hw_gro_data->skb; + struct udphdr *uh; + + uh = (struct udphdr *)(skb->data + udp_off); + uh->len = htons(skb->len - udp_off); + + if (uh->check) + uh->check = ~udp_v6_check(skb->len - udp_off, &ipv6->saddr, + &ipv6->daddr, 0); + + skb->csum_start = (unsigned char *)uh - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; +} + +static void mlx5e_shampo_update_fin_psh_flags(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, + struct tcphdr *skb_tcp_hd) +{ + u16 header_index = mlx5e_shampo_get_cqe_header_index(rq, cqe); + struct tcphdr *last_tcp_hd; + void *last_hd_addr; + + last_hd_addr = mlx5e_shampo_get_packet_hd(rq, header_index); + last_tcp_hd = last_hd_addr + ETH_HLEN + rq->hw_gro_data->fk.control.thoff; + tcp_flag_word(skb_tcp_hd) |= tcp_flag_word(last_tcp_hd) & (TCP_FLAG_FIN | TCP_FLAG_PSH); +} + +static void mlx5e_shampo_update_ipv4_tcp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4, + struct mlx5_cqe64 *cqe, bool match) +{ + int tcp_off = rq->hw_gro_data->fk.control.thoff; + struct sk_buff *skb = rq->hw_gro_data->skb; + struct tcphdr *tcp; + + tcp = (struct tcphdr *)(skb->data + tcp_off); + if (match) + mlx5e_shampo_update_fin_psh_flags(rq, cqe, tcp); + + tcp->check = ~tcp_v4_check(skb->len - tcp_off, ipv4->saddr, + ipv4->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; + + skb->csum_start = (unsigned char *)tcp - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + + if (tcp->cwr) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; +} + +static void mlx5e_shampo_update_ipv6_tcp_hdr(struct mlx5e_rq *rq, struct ipv6hdr *ipv6, + struct mlx5_cqe64 *cqe, bool match) +{ + int tcp_off = rq->hw_gro_data->fk.control.thoff; + struct sk_buff *skb = rq->hw_gro_data->skb; + struct tcphdr *tcp; + + tcp = (struct tcphdr *)(skb->data + tcp_off); + if (match) + mlx5e_shampo_update_fin_psh_flags(rq, cqe, tcp); + + tcp->check = ~tcp_v6_check(skb->len - tcp_off, &ipv6->saddr, + &ipv6->daddr, 0); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + skb->csum_start = (unsigned char *)tcp - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + + if (tcp->cwr) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; +} + +static void mlx5e_shampo_update_hdr(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, bool match) +{ + bool is_ipv4 = (rq->hw_gro_data->fk.basic.n_proto == htons(ETH_P_IP)); + struct sk_buff *skb = rq->hw_gro_data->skb; + + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + skb->ip_summed = 
CHECKSUM_PARTIAL; + + if (is_ipv4) { + int nhoff = rq->hw_gro_data->fk.control.thoff - sizeof(struct iphdr); + struct iphdr *ipv4 = (struct iphdr *)(skb->data + nhoff); + __be16 newlen = htons(skb->len - nhoff); + + csum_replace2(&ipv4->check, ipv4->tot_len, newlen); + ipv4->tot_len = newlen; + + if (ipv4->protocol == IPPROTO_TCP) + mlx5e_shampo_update_ipv4_tcp_hdr(rq, ipv4, cqe, match); + else + mlx5e_shampo_update_ipv4_udp_hdr(rq, ipv4); + } else { + int nhoff = rq->hw_gro_data->fk.control.thoff - sizeof(struct ipv6hdr); + struct ipv6hdr *ipv6 = (struct ipv6hdr *)(skb->data + nhoff); + + ipv6->payload_len = htons(skb->len - nhoff - sizeof(*ipv6)); + + if (ipv6->nexthdr == IPPROTO_TCP) + mlx5e_shampo_update_ipv6_tcp_hdr(rq, ipv6, cqe, match); + else + mlx5e_shampo_update_ipv6_udp_hdr(rq, ipv6); + } +} + +static inline void mlx5e_skb_set_hash(struct mlx5_cqe64 *cqe, + struct sk_buff *skb) +{ + u8 cht = cqe->rss_hash_type; + int ht = (cht & CQE_RSS_HTYPE_L4) ? PKT_HASH_TYPE_L4 : + (cht & CQE_RSS_HTYPE_IP) ? PKT_HASH_TYPE_L3 : + PKT_HASH_TYPE_NONE; + skb_set_hash(skb, be32_to_cpu(cqe->rss_hash_result), ht); +} + +static inline bool is_last_ethertype_ip(struct sk_buff *skb, int *network_depth, + __be16 *proto) +{ + *proto = ((struct ethhdr *)skb->data)->h_proto; + *proto = __vlan_get_protocol(skb, *proto, network_depth); + + if (*proto == htons(ETH_P_IP)) + return pskb_may_pull(skb, *network_depth + sizeof(struct iphdr)); + + if (*proto == htons(ETH_P_IPV6)) + return pskb_may_pull(skb, *network_depth + sizeof(struct ipv6hdr)); + + return false; +} + +static inline void mlx5e_enable_ecn(struct mlx5e_rq *rq, struct sk_buff *skb) +{ + int network_depth = 0; + __be16 proto; + void *ip; + int rc; + + if (unlikely(!is_last_ethertype_ip(skb, &network_depth, &proto))) + return; + + ip = skb->data + network_depth; + rc = ((proto == htons(ETH_P_IP)) ? IP_ECN_set_ce((struct iphdr *)ip) : + IP6_ECN_set_ce(skb, (struct ipv6hdr *)ip)); + + rq->stats->ecn_mark += !!rc; +} + +static u8 get_ip_proto(struct sk_buff *skb, int network_depth, __be16 proto) +{ + void *ip_p = skb->data + network_depth; + + return (proto == htons(ETH_P_IP)) ? ((struct iphdr *)ip_p)->protocol : + ((struct ipv6hdr *)ip_p)->nexthdr; +} + +#define short_frame(size) ((size) <= ETH_ZLEN + ETH_FCS_LEN) + +#define MAX_PADDING 8 + +static void +tail_padding_csum_slow(struct sk_buff *skb, int offset, int len, + struct mlx5e_rq_stats *stats) +{ + stats->csum_complete_tail_slow++; + skb->csum = csum_block_add(skb->csum, + skb_checksum(skb, offset, len, 0), + offset); +} + +static void +tail_padding_csum(struct sk_buff *skb, int offset, + struct mlx5e_rq_stats *stats) +{ + u8 tail_padding[MAX_PADDING]; + int len = skb->len - offset; + void *tail; + + if (unlikely(len > MAX_PADDING)) { + tail_padding_csum_slow(skb, offset, len, stats); + return; + } + + tail = skb_header_pointer(skb, offset, len, tail_padding); + if (unlikely(!tail)) { + tail_padding_csum_slow(skb, offset, len, stats); + return; + } + + stats->csum_complete_tail++; + skb->csum = csum_block_add(skb->csum, csum_partial(tail, len, 0), offset); +} + +static void +mlx5e_skb_csum_fixup(struct sk_buff *skb, int network_depth, __be16 proto, + struct mlx5e_rq_stats *stats) +{ + struct ipv6hdr *ip6; + struct iphdr *ip4; + int pkt_len; + + /* Fixup vlan headers, if any */ + if (network_depth > ETH_HLEN) + /* CQE csum is calculated from the IP header and does + * not cover VLAN headers (if present). This will add + * the checksum manually. 
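+ *
+ * Editor's illustrative example (not part of the original comment):
+ * with a single 802.1Q tag, __vlan_get_protocol() reports
+ * network_depth = ETH_HLEN + VLAN_HLEN = 18, so the statement below
+ * folds the 4 tag bytes at offsets 14..17 into skb->csum, i.e.
+ *
+ *   skb->csum = csum_partial(skb->data + 14, 4, skb->csum);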
+ */ + skb->csum = csum_partial(skb->data + ETH_HLEN, + network_depth - ETH_HLEN, + skb->csum); + + /* Fixup tail padding, if any */ + switch (proto) { + case htons(ETH_P_IP): + ip4 = (struct iphdr *)(skb->data + network_depth); + pkt_len = network_depth + ntohs(ip4->tot_len); + break; + case htons(ETH_P_IPV6): + ip6 = (struct ipv6hdr *)(skb->data + network_depth); + pkt_len = network_depth + sizeof(*ip6) + ntohs(ip6->payload_len); + break; + default: + return; + } + + if (likely(pkt_len >= skb->len)) + return; + + tail_padding_csum(skb, pkt_len, stats); +} + +static inline void mlx5e_handle_csum(struct net_device *netdev, + struct mlx5_cqe64 *cqe, + struct mlx5e_rq *rq, + struct sk_buff *skb, + bool lro) +{ + struct mlx5e_rq_stats *stats = rq->stats; + int network_depth = 0; + __be16 proto; + + if (unlikely(!(netdev->features & NETIF_F_RXCSUM))) + goto csum_none; + + if (lro) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + stats->csum_unnecessary++; + return; + } + + /* True when explicitly set via priv flag, or XDP prog is loaded */ + if (test_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &rq->state) || + get_cqe_tls_offload(cqe)) + goto csum_unnecessary; + + /* CQE csum doesn't cover padding octets in short ethernet + * frames. And the pad field is appended prior to calculating + * and appending the FCS field. + * + * Detecting these padded frames requires to verify and parse + * IP headers, so we simply force all those small frames to be + * CHECKSUM_UNNECESSARY even if they are not padded. + */ + if (short_frame(skb->len)) + goto csum_unnecessary; + + if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) { + if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP)) + goto csum_unnecessary; + + stats->csum_complete++; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum = csum_unfold((__force __sum16)cqe->check_sum); + + if (test_bit(MLX5E_RQ_STATE_CSUM_FULL, &rq->state)) + return; /* CQE csum covers all received bytes */ + + /* csum might need some fixups ...*/ + mlx5e_skb_csum_fixup(skb, network_depth, proto, stats); + return; + } + +csum_unnecessary: + if (likely((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (cqe_is_tunneled(cqe)) { + skb->csum_level = 1; + skb->encapsulation = 1; + stats->csum_unnecessary_inner++; + return; + } + stats->csum_unnecessary++; + return; + } +csum_none: + skb->ip_summed = CHECKSUM_NONE; + stats->csum_none++; +} + +#define MLX5E_CE_BIT_MASK 0x80 + +static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, + struct mlx5e_rq *rq, + struct sk_buff *skb) +{ + u8 lro_num_seg = be32_to_cpu(cqe->srqn) >> 24; + struct mlx5e_rq_stats *stats = rq->stats; + struct net_device *netdev = rq->netdev; + + skb->mac_len = ETH_HLEN; + + mlx5e_tls_handle_rx_skb(rq, skb, cqe, &cqe_bcnt); + + if (unlikely(mlx5_ipsec_is_rx_flow(cqe))) + mlx5e_ipsec_offload_handle_rx_skb(netdev, skb, cqe); + + if (unlikely(mlx5e_macsec_is_rx_flow(cqe))) + mlx5e_macsec_offload_handle_rx_skb(netdev, skb, cqe); + + if (lro_num_seg > 1) { + mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt); + skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg); + /* Subtract one since we already counted this as one + * "regular" packet in mlx5e_complete_rx_cqe() + */ + stats->packets += lro_num_seg - 1; + stats->lro_packets++; + stats->lro_bytes += cqe_bcnt; + } + + if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp))) + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); + 
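+
+ /* Editor's note -- illustrative example, not part of the original patch:
+  * in the LRO block above the aggregate byte count is spread evenly over
+  * the coalesced segments, e.g. with hypothetical numbers:
+  *
+  *   cqe_bcnt = 45000, lro_num_seg = 31
+  *   gso_size = DIV_ROUND_UP(45000, 31) = 1452
+  *   stats->packets += 30   (the 31st segment was already counted as the
+  *                           "regular" packet for this CQE)
+  */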
skb_record_rx_queue(skb, rq->ix); + + if (likely(netdev->features & NETIF_F_RXHASH)) + mlx5e_skb_set_hash(cqe, skb); + + if (cqe_has_vlan(cqe)) { + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + be16_to_cpu(cqe->vlan_info)); + stats->removed_vlan_packets++; + } + + skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK; + + mlx5e_handle_csum(netdev, cqe, rq, skb, !!lro_num_seg); + /* checking CE bit in cqe - MSB in ml_path field */ + if (unlikely(cqe->ml_path & MLX5E_CE_BIT_MASK)) + mlx5e_enable_ecn(rq, skb); + + skb->protocol = eth_type_trans(skb, netdev); + + if (unlikely(mlx5e_skb_is_multicast(skb))) + stats->mcast_packets++; +} + +static void mlx5e_shampo_complete_rx_cqe(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, + struct sk_buff *skb) +{ + struct mlx5e_rq_stats *stats = rq->stats; + + stats->packets++; + stats->gro_packets++; + stats->bytes += cqe_bcnt; + stats->gro_bytes += cqe_bcnt; + if (NAPI_GRO_CB(skb)->count != 1) + return; + mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb); + skb_reset_network_header(skb); + if (!skb_flow_dissect_flow_keys(skb, &rq->hw_gro_data->fk, 0)) { + napi_gro_receive(rq->cq.napi, skb); + rq->hw_gro_data->skb = NULL; + } +} + +static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, + struct sk_buff *skb) +{ + struct mlx5e_rq_stats *stats = rq->stats; + u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe); + + stats->packets++; + stats->bytes += cqe_bcnt; + mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb); + + if (l4_hdr_type != CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) { + rq->dim_obj.sample.pkt_ctr = rq->stats->packets; + rq->dim_obj.sample.byte_ctr = rq->stats->bytes; + } +} + +static inline +struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va, + u32 frag_size, u16 headroom, + u32 cqe_bcnt) +{ + struct sk_buff *skb = build_skb(va, frag_size); + + if (unlikely(!skb)) { + rq->stats->buff_alloc_err++; + return NULL; + } + + skb_reserve(skb, headroom); + skb_put(skb, cqe_bcnt); + + return skb; +} + +static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom, + u32 len, struct xdp_buff *xdp) +{ + xdp_init_buff(xdp, rq->buff.frame0_sz, &rq->xdp_rxq); + xdp_prepare_buff(xdp, va, headroom, len, false); +} + +static struct sk_buff * +mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, + struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt) +{ + struct mlx5e_dma_info *di = wi->di; + u16 rx_headroom = rq->buff.headroom; + struct xdp_buff xdp; + struct sk_buff *skb; + void *va, *data; + u32 frag_size; + + va = page_address(di->page) + wi->offset; + data = va + rx_headroom; + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); + + dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset, + frag_size, DMA_FROM_DEVICE); + net_prefetchw(va); /* xdp_frame data area */ + net_prefetch(data); + + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp); + if (mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp)) + return NULL; /* page/packet was consumed by XDP */ + + rx_headroom = xdp.data - xdp.data_hard_start; + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt); + skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt); + if (unlikely(!skb)) + return NULL; + + /* queue up for recycling/reuse */ + di->refcnt_bias--; + + return skb; +} + +static struct sk_buff * +mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, + struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt) +{ + struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0]; + struct 
mlx5e_wqe_frag_info *head_wi = wi; + u16 headlen = min_t(u32, MLX5E_RX_MAX_HEAD, cqe_bcnt); + u16 frag_headlen = headlen; + u16 byte_cnt = cqe_bcnt - headlen; + struct sk_buff *skb; + + /* XDP is not supported in this configuration, as incoming packets + * might spread among multiple pages. + */ + skb = napi_alloc_skb(rq->cq.napi, + ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long))); + if (unlikely(!skb)) { + rq->stats->buff_alloc_err++; + return NULL; + } + + net_prefetchw(skb->data); + + while (byte_cnt) { + u16 frag_consumed_bytes = + min_t(u16, frag_info->frag_size - frag_headlen, byte_cnt); + + mlx5e_add_skb_frag(rq, skb, wi->di, wi->offset + frag_headlen, + frag_consumed_bytes, frag_info->frag_stride); + byte_cnt -= frag_consumed_bytes; + frag_headlen = 0; + frag_info++; + wi++; + } + + /* copy header */ + mlx5e_copy_skb_header(rq->pdev, skb, head_wi->di, head_wi->offset, head_wi->offset, + headlen); + /* skb linear part was allocated with headlen and aligned to long */ + skb->tail += headlen; + skb->len += headlen; + + return skb; +} + +static void trigger_report(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe; + struct mlx5e_priv *priv = rq->priv; + + if (cqe_syndrome_needs_recover(err_cqe->syndrome) && + !test_and_set_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state)) { + mlx5e_dump_error_cqe(&rq->cq, rq->rqn, err_cqe); + queue_work(priv->wq, &rq->recover_work); + } +} + +static void mlx5e_handle_rx_err_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + trigger_report(rq, cqe); + rq->stats->wqe_err++; +} + +static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 ci; + + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + mlx5e_handle_rx_err_cqe(rq, cqe); + goto free_wqe; + } + + skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe, + mlx5e_skb_from_cqe_linear, + mlx5e_skb_from_cqe_nonlinear, + rq, cqe, wi, cqe_bcnt); + if (!skb) { + /* probably for XDP */ + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { + /* do not return page to cache, + * it will be returned on XDP_TX completion. + */ + goto wq_cyc_pop; + } + goto free_wqe; + } + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + + if (mlx5e_cqe_regb_chain(cqe)) + if (!mlx5e_tc_update_skb(cqe, skb)) { + dev_kfree_skb_any(skb); + goto free_wqe; + } + + napi_gro_receive(rq->cq.napi, skb); + +free_wqe: + mlx5e_free_rx_wqe(rq, wi, true); +wq_cyc_pop: + mlx5_wq_cyc_pop(wq); +} + +#ifdef CONFIG_MLX5_ESWITCH +/* Metadata_0 is used for vport identification */ +struct mlx5_pet_hdr { + u16 pet_ether_type; + u16 metadata_0; + u16 metadata_1; + u16 metadata_2; + u16 orig_ether_type; +} __packed; + +static void *mlx5e_rep_netdev_get(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv; + + rpriv = rep->rep_data[REP_ETH].priv; + return rpriv->netdev; +} + +/* Hardware will insert 2 bytes of programmable ether type (0x8CE4) followed + * by 6 bytes of metadata. out of 6 bytes, we are only interested in upper 2 + * bytes that will contain unique metadata assigned by eswitch. + * Strip 8 bytes and update skb protocol and dev fields. 
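+ *
+ * Editor's illustrative sketch (not part of the original comment) of the
+ * received layout parsed below, offsets relative to the MAC header:
+ *
+ *    0..11  dst/src MAC addresses  (kept, shifted forward by 8 bytes)
+ *   12..13  PET ether type 0x8CE4  (checked against pet_info.ether_type)
+ *   14..15  metadata_0             (key for the vport_rep_map lookup)
+ *   16..19  metadata_1/metadata_2  (ignored)
+ *   20..21  original ether type    (restored into skb->protocol)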
+ */ +static bool mlx5e_rep_lookup_and_update(struct mlx5e_rq *rq, struct sk_buff *skb) +{ + struct mlx5e_priv *priv = netdev_priv(rq->netdev); + struct mlx5_eswitch *esw = rq->mdev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *vport_rep; + struct mlx5_pet_hdr pet_hdr = {}; + struct ethhdr *curr_eth_hdr; + struct ethhdr *new_eth_hdr; + + if (!esw->offloads.pet_info.enabled) + return true; + + /* Copy 10 bytes of PET header to local stack for parsing */ + skb_copy_bits(skb, -2, &pet_hdr, sizeof(pet_hdr)); + if (pet_hdr.pet_ether_type != esw->offloads.pet_info.ether_type) { + rq->stats->pet_hdr_lookup_drop++; + return false; + } + + vport_rep = xa_load(&rpriv->vport_rep_map, be32_to_cpu(pet_hdr.metadata_0)); + if (!vport_rep) { + rq->stats->pet_mdata_lookup_drop++; + return false; + } + + skb_push(skb, ETH_HLEN); + curr_eth_hdr = (struct ethhdr *)(skb->data); + new_eth_hdr = (struct ethhdr *)(skb->data + 8); + memmove(new_eth_hdr, curr_eth_hdr, 12); + skb_set_mac_header(skb, 8); + skb_pull_inline(skb, ETH_HLEN + 8); + skb->protocol = pet_hdr.orig_ether_type; + skb->dev = mlx5e_rep_netdev_get(esw, vport_rep); + return true; +} + +static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct net_device *netdev = rq->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep = rpriv->rep; + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 ci; + + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + mlx5e_handle_rx_err_cqe(rq, cqe); + goto free_wqe; + } + + skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe, + mlx5e_skb_from_cqe_linear, + mlx5e_skb_from_cqe_nonlinear, + rq, cqe, wi, cqe_bcnt); + if (!skb) { + /* probably for XDP */ + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { + /* do not return page to cache, + * it will be returned on XDP_TX completion. 
+ */ + goto wq_cyc_pop; + } + goto free_wqe; + } + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + + if (!mlx5e_rep_lookup_and_update(rq, skb)) + goto free_wqe; + + if (rep->vlan && skb_vlan_tag_present(skb)) + skb_vlan_pop(skb); + + mlx5e_rep_tc_receive(cqe, rq, skb); + +free_wqe: + mlx5e_free_rx_wqe(rq, wi, true); +wq_cyc_pop: + mlx5_wq_cyc_pop(wq); +} + +static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); + u16 wqe_id = be16_to_cpu(cqe->wqe_id); + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[wqe_id]; + u16 stride_ix = mpwrq_get_cqe_stride_index(cqe); + u32 wqe_offset = stride_ix << rq->mpwqe.log_stride_sz; + u32 head_offset = wqe_offset & (PAGE_SIZE - 1); + u32 page_idx = wqe_offset >> PAGE_SHIFT; + struct mlx5e_rx_wqe_ll *wqe; + struct mlx5_wq_ll *wq; + struct sk_buff *skb; + u16 cqe_bcnt; + + wi->consumed_strides += cstrides; + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + mlx5e_handle_rx_err_cqe(rq, cqe); + goto mpwrq_cqe_out; + } + + if (unlikely(mpwrq_is_filler_cqe(cqe))) { + struct mlx5e_rq_stats *stats = rq->stats; + + stats->mpwqe_filler_cqes++; + stats->mpwqe_filler_strides += cstrides; + goto mpwrq_cqe_out; + } + + cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe); + + skb = INDIRECT_CALL_2(rq->mpwqe.skb_from_cqe_mpwrq, + mlx5e_skb_from_cqe_mpwrq_linear, + mlx5e_skb_from_cqe_mpwrq_nonlinear, + rq, wi, cqe_bcnt, head_offset, page_idx); + if (!skb) + goto mpwrq_cqe_out; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + + if (!mlx5e_rep_lookup_and_update(rq, skb)) + goto mpwrq_cqe_out; + + mlx5e_rep_tc_receive(cqe, rq, skb); + +mpwrq_cqe_out: + if (likely(wi->consumed_strides < rq->mpwqe.num_strides)) + return; + + wq = &rq->mpwqe.wq; + wqe = mlx5_wq_ll_get_wqe(wq, wqe_id); + mlx5e_free_rx_mpwqe(rq, wi, true); + mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index); +} + +const struct mlx5e_rx_handlers mlx5e_rx_handlers_rep = { + .handle_rx_cqe = mlx5e_handle_rx_cqe_rep, + .handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq_rep, +}; +#endif + +static void +mlx5e_fill_skb_data(struct sk_buff *skb, struct mlx5e_rq *rq, struct mlx5e_dma_info *di, + u32 data_bcnt, u32 data_offset) +{ + net_prefetchw(skb->data); + + while (data_bcnt) { + u32 pg_consumed_bytes = min_t(u32, PAGE_SIZE - data_offset, data_bcnt); + unsigned int truesize; + + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) + truesize = pg_consumed_bytes; + else + truesize = ALIGN(pg_consumed_bytes, BIT(rq->mpwqe.log_stride_sz)); + + mlx5e_add_skb_frag(rq, skb, di, data_offset, + pg_consumed_bytes, truesize); + + data_bcnt -= pg_consumed_bytes; + data_offset = 0; + di++; + } +} + +static struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx) +{ + u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt); + struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; + u32 frag_offset = head_offset + headlen; + u32 byte_cnt = cqe_bcnt - headlen; + struct mlx5e_dma_info *head_di = di; + struct sk_buff *skb; + + skb = napi_alloc_skb(rq->cq.napi, + ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long))); + if (unlikely(!skb)) { + rq->stats->buff_alloc_err++; + return NULL; + } + + net_prefetchw(skb->data); + + if (unlikely(frag_offset >= PAGE_SIZE)) { + di++; + frag_offset -= PAGE_SIZE; + } + + mlx5e_fill_skb_data(skb, rq, di, byte_cnt, frag_offset); + /* copy header */ + mlx5e_copy_skb_header(rq->pdev, skb, head_di, head_offset, head_offset, headlen); + /* skb linear part was 
allocated with headlen and aligned to long */ + skb->tail += headlen; + skb->len += headlen; + + return skb; +} + +static struct sk_buff * +mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + u16 cqe_bcnt, u32 head_offset, u32 page_idx) +{ + struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx]; + u16 rx_headroom = rq->buff.headroom; + u32 cqe_bcnt32 = cqe_bcnt; + struct xdp_buff xdp; + struct sk_buff *skb; + void *va, *data; + u32 frag_size; + + /* Check packet size. Note LRO doesn't use linear SKB */ + if (unlikely(cqe_bcnt > rq->hw_mtu + rq->pet_hdr_size)) { + rq->stats->oversize_pkts_sw_drop++; + return NULL; + } + + va = page_address(di->page) + head_offset; + data = va + rx_headroom; + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); + + dma_sync_single_range_for_cpu(rq->pdev, di->addr, head_offset, + frag_size, DMA_FROM_DEVICE); + net_prefetchw(va); /* xdp_frame data area */ + net_prefetch(data); + + mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp); + if (mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp)) { + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) + __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */ + return NULL; /* page/packet was consumed by XDP */ + } + + rx_headroom = xdp.data - xdp.data_hard_start; + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32); + skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32); + if (unlikely(!skb)) + return NULL; + + /* queue up for recycling/reuse */ + di->refcnt_bias--; + + return skb; +} + +static struct sk_buff * +mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, + struct mlx5_cqe64 *cqe, u16 header_index) +{ + struct mlx5e_dma_info *head = &rq->mpwqe.shampo->info[header_index]; + u16 head_offset = head->addr & (PAGE_SIZE - 1); + u16 head_size = cqe->shampo.header_size; + u16 rx_headroom = rq->buff.headroom; + struct sk_buff *skb = NULL; + void *hdr, *data; + u32 frag_size; + + hdr = page_address(head->page) + head_offset; + data = hdr + rx_headroom; + frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + head_size); + + if (likely(frag_size <= BIT(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE))) { + /* build SKB around header */ + dma_sync_single_range_for_cpu(rq->pdev, head->addr, 0, frag_size, DMA_FROM_DEVICE); + prefetchw(hdr); + prefetch(data); + skb = mlx5e_build_linear_skb(rq, hdr, frag_size, rx_headroom, head_size); + + if (unlikely(!skb)) + return NULL; + + /* queue up for recycling/reuse */ + page_ref_inc(head->page); + + } else { + /* allocate SKB and copy header for large header */ + rq->stats->gro_large_hds++; + skb = napi_alloc_skb(rq->cq.napi, + ALIGN(head_size, sizeof(long))); + if (unlikely(!skb)) { + rq->stats->buff_alloc_err++; + return NULL; + } + + prefetchw(skb->data); + mlx5e_copy_skb_header(rq->pdev, skb, head, + head_offset + rx_headroom, + rx_headroom, head_size); + /* skb linear part was allocated with headlen and aligned to long */ + skb->tail += head_size; + skb->len += head_size; + } + return skb; +} + +static void +mlx5e_shampo_align_fragment(struct sk_buff *skb, u8 log_stride_sz) +{ + skb_frag_t *last_frag = &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags - 1]; + unsigned int frag_size = skb_frag_size(last_frag); + unsigned int frag_truesize; + + frag_truesize = ALIGN(frag_size, BIT(log_stride_sz)); + skb->truesize += frag_truesize - frag_size; +} + +static void +mlx5e_shampo_flush_skb(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, bool match) +{ + struct sk_buff *skb = rq->hw_gro_data->skb; + struct 
mlx5e_rq_stats *stats = rq->stats; + + stats->gro_skbs++; + if (likely(skb_shinfo(skb)->nr_frags)) + mlx5e_shampo_align_fragment(skb, rq->mpwqe.log_stride_sz); + if (NAPI_GRO_CB(skb)->count > 1) + mlx5e_shampo_update_hdr(rq, cqe, match); + napi_gro_receive(rq->cq.napi, skb); + rq->hw_gro_data->skb = NULL; +} + +static bool +mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) +{ + int nr_frags = skb_shinfo(skb)->nr_frags; + + return PAGE_SIZE * nr_frags + data_bcnt <= GSO_MAX_SIZE; +} + +static void +mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index) +{ + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + u64 addr = shampo->info[header_index].addr; + + if (((header_index + 1) & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) == 0) { + shampo->info[header_index].addr = ALIGN_DOWN(addr, PAGE_SIZE); + mlx5e_page_release(rq, &shampo->info[header_index], true); + } + bitmap_clear(shampo->bitmap, header_index, 1); +} + +static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + u16 data_bcnt = mpwrq_get_cqe_byte_cnt(cqe) - cqe->shampo.header_size; + u16 header_index = mlx5e_shampo_get_cqe_header_index(rq, cqe); + u32 wqe_offset = be32_to_cpu(cqe->shampo.data_offset); + u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); + u32 data_offset = wqe_offset & (PAGE_SIZE - 1); + u32 cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe); + u16 wqe_id = be16_to_cpu(cqe->wqe_id); + u32 page_idx = wqe_offset >> PAGE_SHIFT; + u16 head_size = cqe->shampo.header_size; + struct sk_buff **skb = &rq->hw_gro_data->skb; + bool flush = cqe->shampo.flush; + bool match = cqe->shampo.match; + struct mlx5e_rq_stats *stats = rq->stats; + struct mlx5e_rx_wqe_ll *wqe; + struct mlx5e_dma_info *di; + struct mlx5e_mpw_info *wi; + struct mlx5_wq_ll *wq; + + wi = &rq->mpwqe.info[wqe_id]; + wi->consumed_strides += cstrides; + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + mlx5e_handle_rx_err_cqe(rq, cqe); + goto mpwrq_cqe_out; + } + + if (unlikely(mpwrq_is_filler_cqe(cqe))) { + stats->mpwqe_filler_cqes++; + stats->mpwqe_filler_strides += cstrides; + goto mpwrq_cqe_out; + } + + stats->gro_match_packets += match; + + if (*skb && (!match || !(mlx5e_hw_gro_skb_has_enough_space(*skb, data_bcnt)))) { + match = false; + mlx5e_shampo_flush_skb(rq, cqe, match); + } + + if (!*skb) { + if (likely(head_size)) + *skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index); + else + *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe_bcnt, data_offset, + page_idx); + if (unlikely(!*skb)) + goto free_hd_entry; + + NAPI_GRO_CB(*skb)->count = 1; + skb_shinfo(*skb)->gso_size = cqe_bcnt - head_size; + } else { + NAPI_GRO_CB(*skb)->count++; + if (NAPI_GRO_CB(*skb)->count == 2 && + rq->hw_gro_data->fk.basic.n_proto == htons(ETH_P_IP)) { + void *hd_addr = mlx5e_shampo_get_packet_hd(rq, header_index); + int nhoff = ETH_HLEN + rq->hw_gro_data->fk.control.thoff - + sizeof(struct iphdr); + struct iphdr *iph = (struct iphdr *)(hd_addr + nhoff); + + rq->hw_gro_data->second_ip_id = ntohs(iph->id); + } + } + + if (likely(head_size)) { + di = &wi->umr.dma_info[page_idx]; + mlx5e_fill_skb_data(*skb, rq, di, data_bcnt, data_offset); + } + + mlx5e_shampo_complete_rx_cqe(rq, cqe, cqe_bcnt, *skb); + if (flush) + mlx5e_shampo_flush_skb(rq, cqe, match); +free_hd_entry: + mlx5e_free_rx_shampo_hd_entry(rq, header_index); +mpwrq_cqe_out: + if (likely(wi->consumed_strides < rq->mpwqe.num_strides)) + return; + + wq = &rq->mpwqe.wq; + wqe = mlx5_wq_ll_get_wqe(wq, wqe_id); + mlx5e_free_rx_mpwqe(rq, wi, true); + 
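+
+ /* Editor's note -- illustrative, not part of the original patch:
+  * a multi-packet WQE is recycled only once every stride it spans has
+  * been accounted for, e.g. assuming a WQE of 512 strides:
+  *
+  *   CQE #1: cstrides = 3  -> consumed_strides = 3    (WQE kept)
+  *   ...
+  *   CQE #N: cstrides = 5  -> consumed_strides = 512  (freed and popped)
+  *
+  * Filler CQEs seen earlier in this handler contribute strides as well,
+  * even though they carry no packet.
+  */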
mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index); +} + +static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + u16 cstrides = mpwrq_get_cqe_consumed_strides(cqe); + u16 wqe_id = be16_to_cpu(cqe->wqe_id); + struct mlx5e_mpw_info *wi = &rq->mpwqe.info[wqe_id]; + u16 stride_ix = mpwrq_get_cqe_stride_index(cqe); + u32 wqe_offset = stride_ix << rq->mpwqe.log_stride_sz; + u32 head_offset = wqe_offset & (PAGE_SIZE - 1); + u32 page_idx = wqe_offset >> PAGE_SHIFT; + struct mlx5e_rx_wqe_ll *wqe; + struct mlx5_wq_ll *wq; + struct sk_buff *skb; + u16 cqe_bcnt; + + wi->consumed_strides += cstrides; + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + mlx5e_handle_rx_err_cqe(rq, cqe); + goto mpwrq_cqe_out; + } + + if (unlikely(mpwrq_is_filler_cqe(cqe))) { + struct mlx5e_rq_stats *stats = rq->stats; + + stats->mpwqe_filler_cqes++; + stats->mpwqe_filler_strides += cstrides; + goto mpwrq_cqe_out; + } + + cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe); + + skb = INDIRECT_CALL_2(rq->mpwqe.skb_from_cqe_mpwrq, + mlx5e_skb_from_cqe_mpwrq_linear, + mlx5e_skb_from_cqe_mpwrq_nonlinear, + rq, wi, cqe_bcnt, head_offset, page_idx); + if (!skb) + goto mpwrq_cqe_out; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + + if (mlx5e_cqe_regb_chain(cqe)) + if (!mlx5e_tc_update_skb(cqe, skb)) { + dev_kfree_skb_any(skb); + goto mpwrq_cqe_out; + } + + napi_gro_receive(rq->cq.napi, skb); + +mpwrq_cqe_out: + if (likely(wi->consumed_strides < rq->mpwqe.num_strides)) + return; + + wq = &rq->mpwqe.wq; + wqe = mlx5_wq_ll_get_wqe(wq, wqe_id); + mlx5e_free_rx_mpwqe(rq, wi, true); + mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index); +} + +int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) +{ + struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq); + struct mlx5_cqwq *cqwq = &cq->wq; + struct mlx5_cqe64 *cqe; + int work_done = 0; + + if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state))) + return 0; + + if (rq->cqd.left) { + work_done += mlx5e_decompress_cqes_cont(rq, cqwq, 0, budget); + if (work_done >= budget) + goto out; + } + + cqe = mlx5_cqwq_get_cqe(cqwq); + if (!cqe) { + if (unlikely(work_done)) + goto out; + return 0; + } + + do { + if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) { + work_done += + mlx5e_decompress_cqes_start(rq, cqwq, + budget - work_done); + continue; + } + + mlx5_cqwq_pop(cqwq); + + INDIRECT_CALL_3(rq->handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq, + mlx5e_handle_rx_cqe, mlx5e_handle_rx_cqe_mpwrq_shampo, + rq, cqe); + } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); + +out: + if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state) && rq->hw_gro_data->skb) + mlx5e_shampo_flush_skb(rq, NULL, false); + + if (rcu_access_pointer(rq->xdp_prog)) + mlx5e_xdp_rx_poll_complete(rq); + + mlx5_cqwq_update_db_record(cqwq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + return work_done; +} + +#ifdef CONFIG_MLX5_CORE_IPOIB + +#define MLX5_IB_GRH_SGID_OFFSET 8 +#define MLX5_IB_GRH_DGID_OFFSET 24 +#define MLX5_GID_SIZE 16 + +static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt, + struct sk_buff *skb) +{ + struct hwtstamp_config *tstamp; + struct mlx5e_rq_stats *stats; + struct net_device *netdev; + struct mlx5e_priv *priv; + char *pseudo_header; + u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe); + u32 flags_rqpn; + u32 qpn; + u8 *dgid; + u8 g; + + qpn = be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff; + netdev = mlx5i_pkey_get_netdev(rq->netdev, qpn); + + /* No mapping present, cannot process SKB. 
This might happen if a child + * interface is going down while having unprocessed CQEs on parent RQ + */ + if (unlikely(!netdev)) { + /* TODO: add drop counters support */ + skb->dev = NULL; + pr_warn_once("Unable to map QPN %u to dev - dropping skb\n", qpn); + return; + } + + priv = mlx5i_epriv(netdev); + tstamp = &priv->tstamp; + stats = rq->stats; + + flags_rqpn = be32_to_cpu(cqe->flags_rqpn); + g = (flags_rqpn >> 28) & 3; + dgid = skb->data + MLX5_IB_GRH_DGID_OFFSET; + if ((!g) || dgid[0] != 0xff) + skb->pkt_type = PACKET_HOST; + else if (memcmp(dgid, netdev->broadcast + 4, MLX5_GID_SIZE) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + /* Drop packets that this interface sent, ie multicast packets + * that the HCA has replicated. + */ + if (g && (qpn == (flags_rqpn & 0xffffff)) && + (memcmp(netdev->dev_addr + 4, skb->data + MLX5_IB_GRH_SGID_OFFSET, + MLX5_GID_SIZE) == 0)) { + skb->dev = NULL; + return; + } + + skb_pull(skb, MLX5_IB_GRH_BYTES); + + skb->protocol = *((__be16 *)(skb->data)); + + if ((netdev->features & NETIF_F_RXCSUM) && + (likely((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + stats->csum_unnecessary++; + } else { + skb->ip_summed = CHECKSUM_NONE; + stats->csum_none++; + } + + if (unlikely(mlx5e_rx_hw_stamp(tstamp))) + skb_hwtstamps(skb)->hwtstamp = mlx5e_cqe_ts_to_ns(rq->ptp_cyc2time, + rq->clock, get_cqe_ts(cqe)); + skb_record_rx_queue(skb, rq->ix); + + if (likely(netdev->features & NETIF_F_RXHASH)) + mlx5e_skb_set_hash(cqe, skb); + + /* 20 bytes of ipoib header and 4 for encap existing */ + pseudo_header = skb_push(skb, MLX5_IPOIB_PSEUDO_LEN); + memset(pseudo_header, 0, MLX5_IPOIB_PSEUDO_LEN); + skb_reset_mac_header(skb); + skb_pull(skb, MLX5_IPOIB_HARD_LEN); + + skb->dev = netdev; + + stats->packets++; + stats->bytes += cqe_bcnt; + + if (l4_hdr_type != CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) { + rq->dim_obj.sample.pkt_ctr = rq->stats->packets; + rq->dim_obj.sample.byte_ctr = rq->stats->bytes; + } +} + +static void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 ci; + + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + rq->stats->wqe_err++; + goto wq_free_wqe; + } + + skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe, + mlx5e_skb_from_cqe_linear, + mlx5e_skb_from_cqe_nonlinear, + rq, cqe, wi, cqe_bcnt); + if (!skb) + goto wq_free_wqe; + + mlx5i_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + if (unlikely(!skb->dev)) { + dev_kfree_skb_any(skb); + goto wq_free_wqe; + } + napi_gro_receive(rq->cq.napi, skb); + +wq_free_wqe: + mlx5e_free_rx_wqe(rq, wi, true); + mlx5_wq_cyc_pop(wq); +} + +const struct mlx5e_rx_handlers mlx5i_rx_handlers = { + .handle_rx_cqe = mlx5i_handle_rx_cqe, + .handle_rx_cqe_mpwqe = NULL, /* Not supported */ +}; +#endif /* CONFIG_MLX5_CORE_IPOIB */ + +#ifdef CONFIG_MLX5_EN_IPSEC + +static void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 ci; + + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + rq->stats->wqe_err++; + goto wq_free_wqe; + } + + skb 
= INDIRECT_CALL_2(rq->wqe.skb_from_cqe, + mlx5e_skb_from_cqe_linear, + mlx5e_skb_from_cqe_nonlinear, + rq, cqe, wi, cqe_bcnt); + if (unlikely(!skb)) /* a DROP, save the page-reuse checks */ + goto wq_free_wqe; + + skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb, &cqe_bcnt); + if (unlikely(!skb)) + goto wq_free_wqe; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + napi_gro_receive(rq->cq.napi, skb); + +wq_free_wqe: + mlx5e_free_rx_wqe(rq, wi, true); + mlx5_wq_cyc_pop(wq); +} + +#endif /* CONFIG_MLX5_EN_IPSEC */ + +void mlx5e_rq_init_handler(struct mlx5e_rq *rq) +{ + rq->post_wqes = mlx5e_post_rx_skip; +} + +int mlx5e_rq_set_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params, bool xsk) +{ + struct net_device *netdev = rq->netdev; + struct mlx5_core_dev *mdev = rq->mdev; + struct mlx5e_priv *priv = rq->priv; + + switch (rq->wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + rq->mpwqe.skb_from_cqe_mpwrq = xsk ? + mlx5e_xsk_skb_from_cqe_mpwrq_linear : + mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL) ? + mlx5e_skb_from_cqe_mpwrq_linear : + mlx5e_skb_from_cqe_mpwrq_nonlinear; + rq->post_wqes = mlx5e_post_rx_mpwqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe; + + if (mlx5_fpga_is_ipsec_device(mdev)) { + netdev_err(netdev, "MPWQE RQ with Innova IPSec offload not supported\n"); + return -EINVAL; + } + if (params->packet_merge.type == MLX5E_PACKET_MERGE_SHAMPO) { + rq->handle_rx_cqe = priv->profile->rx_handlers->handle_rx_cqe_mpwqe_shampo; + if (!rq->handle_rx_cqe) { + netdev_err(netdev, "RX handler of SHAMPO MPWQE RQ is not set\n"); + return -EINVAL; + } + } else { + rq->handle_rx_cqe = priv->profile->rx_handlers->handle_rx_cqe_mpwqe; + if (!rq->handle_rx_cqe) { + netdev_err(netdev, "RX handler of MPWQE RQ is not set\n"); + return -EINVAL; + } + } + + break; + default: /* MLX5_WQ_TYPE_CYCLIC */ + rq->wqe.skb_from_cqe = xsk ? + mlx5e_xsk_skb_from_cqe_linear : + mlx5e_rx_is_linear_skb(params, NULL) ? + mlx5e_skb_from_cqe_linear : + mlx5e_skb_from_cqe_nonlinear; + rq->post_wqes = mlx5e_post_rx_wqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + +#ifdef CONFIG_MLX5_EN_IPSEC + if ((mlx5_fpga_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) && + priv->ipsec) + rq->handle_rx_cqe = mlx5e_ipsec_handle_rx_cqe; + else +#endif + rq->handle_rx_cqe = priv->profile->rx_handlers->handle_rx_cqe; + if (!rq->handle_rx_cqe) { + netdev_err(netdev, "RX handler of RQ is not set\n"); + return -EINVAL; + } + } + + return 0; +} + +static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) +{ + struct mlx5e_priv *priv = netdev_priv(rq->netdev); + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + struct mlx5e_wqe_frag_info *wi; + struct devlink_port *dl_port; + struct sk_buff *skb; + u32 cqe_bcnt; + u16 trap_id; + u16 ci; + + trap_id = get_cqe_flow_tag(cqe); + ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + wi = get_frag(rq, ci); + cqe_bcnt = be32_to_cpu(cqe->byte_cnt); + + if (unlikely(MLX5E_RX_ERR_CQE(cqe))) { + rq->stats->wqe_err++; + goto free_wqe; + } + + skb = mlx5e_skb_from_cqe_nonlinear(rq, cqe, wi, cqe_bcnt); + if (!skb) + goto free_wqe; + + mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + skb_push(skb, ETH_HLEN); + + dl_port = mlx5e_devlink_get_dl_port(priv); + mlx5_devlink_trap_report(rq->mdev, trap_id, skb, dl_port); + dev_kfree_skb_any(skb); + +free_wqe: + mlx5e_free_rx_wqe(rq, wi, false); + mlx5_wq_cyc_pop(wq); +} + +void mlx5e_rq_set_trap_handlers(struct mlx5e_rq *rq, struct mlx5e_params *params) +{ + rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(params, NULL) ? 
+ mlx5e_skb_from_cqe_linear : + mlx5e_skb_from_cqe_nonlinear; + rq->post_wqes = mlx5e_post_rx_wqes; + rq->dealloc_wqe = mlx5e_dealloc_rx_wqe; + rq->handle_rx_cqe = mlx5e_trap_handle_rx_cqe; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c new file mode 100644 index 0000000..08a7565 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "en.h" +#include "en/port.h" +#include "eswitch.h" + +static int mlx5e_test_health_info(struct mlx5e_priv *priv) +{ + struct mlx5_core_health *health = &priv->mdev->priv.health; + + return health->fatal_error ? 1 : 0; +} + +static int mlx5e_test_link_state(struct mlx5e_priv *priv) +{ + u8 port_state; + + if (!netif_carrier_ok(priv->netdev)) + return 1; + + port_state = mlx5_query_vport_state(priv->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0); + return port_state == VPORT_STATE_UP ? 
0 : 1; +} + +static int mlx5e_test_link_speed(struct mlx5e_priv *priv) +{ + u32 speed; + + if (!netif_carrier_ok(priv->netdev)) + return 1; + + return mlx5e_port_linkspeed(priv->mdev, &speed); +} + +struct mlx5ehdr { + __be32 version; + __be64 magic; +}; + +#ifdef CONFIG_INET +/* loopback test */ +#define MLX5E_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) +\ + sizeof(struct udphdr) + sizeof(struct mlx5ehdr)) +#define MLX5E_TEST_MAGIC 0x5AEED15C001ULL + +static struct sk_buff *mlx5e_test_get_udp_skb(struct mlx5e_priv *priv) +{ + struct sk_buff *skb = NULL; + struct mlx5ehdr *mlxh; + struct ethhdr *ethh; + struct udphdr *udph; + struct iphdr *iph; + int iplen; + + skb = netdev_alloc_skb(priv->netdev, MLX5E_TEST_PKT_SIZE); + if (!skb) { + netdev_err(priv->netdev, "\tFailed to alloc loopback skb\n"); + return NULL; + } + + net_prefetchw(skb->data); + skb_reserve(skb, NET_IP_ALIGN); + + /* Reserve for ethernet and IP header */ + ethh = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + iph = skb_put(skb, sizeof(struct iphdr)); + + skb_set_transport_header(skb, skb->len); + udph = skb_put(skb, sizeof(struct udphdr)); + + /* Fill ETH header */ + ether_addr_copy(ethh->h_dest, priv->netdev->dev_addr); + eth_zero_addr(ethh->h_source); + ethh->h_proto = htons(ETH_P_IP); + + /* Fill UDP header */ + udph->source = htons(9); + udph->dest = htons(9); /* Discard Protocol */ + udph->len = htons(sizeof(struct mlx5ehdr) + sizeof(struct udphdr)); + udph->check = 0; + + /* Fill IP header */ + iph->ihl = 5; + iph->ttl = 32; + iph->version = 4; + iph->protocol = IPPROTO_UDP; + iplen = sizeof(struct iphdr) + sizeof(struct udphdr) + + sizeof(struct mlx5ehdr); + iph->tot_len = htons(iplen); + iph->frag_off = 0; + iph->saddr = 0; + iph->daddr = 0; + iph->tos = 0; + iph->id = 0; + ip_send_check(iph); + + /* Fill test header and data */ + mlxh = skb_put(skb, sizeof(*mlxh)); + mlxh->version = 0; + mlxh->magic = cpu_to_be64(MLX5E_TEST_MAGIC); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + udp4_hwcsum(skb, iph->saddr, iph->daddr); + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = priv->netdev; + + return skb; +} + +struct mlx5e_lbt_priv { + struct packet_type pt; + struct completion comp; + bool loopback_ok; + bool local_lb; +}; + +static int +mlx5e_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct mlx5e_lbt_priv *lbtp = pt->af_packet_priv; + struct mlx5ehdr *mlxh; + struct ethhdr *ethh; + struct udphdr *udph; + struct iphdr *iph; + + /* We are only going to peek, no need to clone the SKB */ + if (MLX5E_TEST_PKT_SIZE - ETH_HLEN > skb_headlen(skb)) + goto out; + + ethh = (struct ethhdr *)skb_mac_header(skb); + if (!ether_addr_equal(ethh->h_dest, orig_ndev->dev_addr)) + goto out; + + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_UDP) + goto out; + + /* Don't assume skb_transport_header() was set */ + udph = (struct udphdr *)((u8 *)iph + 4 * iph->ihl); + if (udph->dest != htons(9)) + goto out; + + mlxh = (struct mlx5ehdr *)((char *)udph + sizeof(*udph)); + if (mlxh->magic != cpu_to_be64(MLX5E_TEST_MAGIC)) + goto out; /* so close ! 
*/ + + /* bingo */ + lbtp->loopback_ok = true; + complete(&lbtp->comp); +out: + kfree_skb(skb); + return 0; +} + +static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv, + struct mlx5e_lbt_priv *lbtp) +{ + int err = 0; + + /* Temporarily enable local_lb */ + err = mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb); + if (err) + return err; + + if (!lbtp->local_lb) { + err = mlx5_nic_vport_update_local_lb(priv->mdev, true); + if (err) + return err; + } + + err = mlx5e_refresh_tirs(priv, true, false); + if (err) + goto out; + + lbtp->loopback_ok = false; + init_completion(&lbtp->comp); + + lbtp->pt.type = htons(ETH_P_IP); + lbtp->pt.func = mlx5e_test_loopback_validate; + lbtp->pt.dev = priv->netdev; + lbtp->pt.af_packet_priv = lbtp; + dev_add_pack(&lbtp->pt); + + return 0; + +out: + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); + + return err; +} + +static void mlx5e_test_loopback_cleanup(struct mlx5e_priv *priv, + struct mlx5e_lbt_priv *lbtp) +{ + if (!lbtp->local_lb) + mlx5_nic_vport_update_local_lb(priv->mdev, false); + + dev_remove_pack(&lbtp->pt); + mlx5e_refresh_tirs(priv, false, false); +} + +static int mlx5e_cond_loopback(struct mlx5e_priv *priv) +{ + if (is_mdev_switchdev_mode(priv->mdev)) + return -EOPNOTSUPP; + + return 0; +} + +#define MLX5E_LB_VERIFY_TIMEOUT (msecs_to_jiffies(200)) +static int mlx5e_test_loopback(struct mlx5e_priv *priv) +{ + struct mlx5e_lbt_priv *lbtp; + struct sk_buff *skb = NULL; + int err; + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { + netdev_err(priv->netdev, + "\tCan't perform loopback test while device is down\n"); + return -ENODEV; + } + + lbtp = kzalloc(sizeof(*lbtp), GFP_KERNEL); + if (!lbtp) + return -ENOMEM; + lbtp->loopback_ok = false; + + err = mlx5e_test_loopback_setup(priv, lbtp); + if (err) + goto out; + + skb = mlx5e_test_get_udp_skb(priv); + if (!skb) { + err = -ENOMEM; + goto cleanup; + } + + skb_set_queue_mapping(skb, 0); + err = dev_queue_xmit(skb); + if (err) { + netdev_err(priv->netdev, + "\tFailed to xmit loopback packet err(%d)\n", + err); + goto cleanup; + } + + wait_for_completion_timeout(&lbtp->comp, MLX5E_LB_VERIFY_TIMEOUT); + err = !lbtp->loopback_ok; + +cleanup: + mlx5e_test_loopback_cleanup(priv, lbtp); +out: + kfree(lbtp); + return err; +} +#endif + +typedef int (*mlx5e_st_func)(struct mlx5e_priv *); + +struct mlx5e_st { + char name[ETH_GSTRING_LEN]; + mlx5e_st_func st_func; + mlx5e_st_func cond_func; +}; + +static struct mlx5e_st mlx5e_sts[] = { + { "Link Test", mlx5e_test_link_state }, + { "Speed Test", mlx5e_test_link_speed }, + { "Health Test", mlx5e_test_health_info }, +#ifdef CONFIG_INET + { "Loopback Test", mlx5e_test_loopback, mlx5e_cond_loopback }, +#endif +}; + +#define MLX5E_ST_NUM ARRAY_SIZE(mlx5e_sts) + +void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf) +{ + struct mlx5e_priv *priv = netdev_priv(ndev); + int i, count = 0; + + mutex_lock(&priv->state_lock); + netdev_info(ndev, "Self test begin..\n"); + + for (i = 0; i < MLX5E_ST_NUM; i++) { + struct mlx5e_st st = mlx5e_sts[i]; + + if (st.cond_func && st.cond_func(priv)) + continue; + netdev_info(ndev, "\t[%d] %s start..\n", i, st.name); + buf[count] = st.st_func(priv); + netdev_info(ndev, "\t[%d] %s end: result(%lld)\n", i, st.name, buf[count]); + count++; + } + + mutex_unlock(&priv->state_lock); + + for (i = 0; i < count; i++) { + if (buf[i]) { + etest->flags |= ETH_TEST_FL_FAILED; + break; + } + } + netdev_info(ndev, "Self test out: status flags(0x%x)\n", + etest->flags); +} + 
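+/*
+ * Illustrative sketch, not part of the original patch: each entry in
+ * mlx5e_sts[] pairs a test callback with an optional cond_func that skips
+ * the test when it returns non-zero (as mlx5e_cond_loopback() does in
+ * switchdev mode). A hypothetical extra test would follow the same shape:
+ *
+ *	static int mlx5e_test_example(struct mlx5e_priv *priv)
+ *	{
+ *		return 0;
+ *	}
+ *
+ * where returning 0 reports "pass" and any non-zero value is reported as a
+ * failure, with a matching table entry such as
+ * { "Example Test", mlx5e_test_example }. Results are written to buf[] in
+ * the order the tests run, and any non-zero result sets ETH_TEST_FL_FAILED,
+ * which is what `ethtool -t <ifname>` ultimately reports.
+ */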
+int mlx5e_self_test_fill_strings(struct mlx5e_priv *priv, u8 *data) +{ + int i, count = 0; + + for (i = 0; i < MLX5E_ST_NUM; i++) { + struct mlx5e_st st = mlx5e_sts[i]; + + if (st.cond_func && st.cond_func(priv)) + continue; + if (data) + strcpy(data + count * ETH_GSTRING_LEN, st.name); + count++; + } + return count; +} + +int mlx5e_self_test_num(struct mlx5e_priv *priv) +{ + return mlx5e_self_test_fill_strings(priv, NULL); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c new file mode 100644 index 0000000..b3692e4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -0,0 +1,2404 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "lib/mlx5.h" +#include "en.h" +#include "en_accel/tls.h" +#include "en_accel/en_accel.h" +#include "en/ptp.h" +#include "en/port.h" + +static unsigned int stats_grps_num(struct mlx5e_priv *priv) +{ + return !priv->profile->stats_grps_num ? 
0 : + priv->profile->stats_grps_num(priv); +} + +unsigned int mlx5e_stats_total_num(struct mlx5e_priv *priv) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + unsigned int total = 0; + int i; + + for (i = 0; i < num_stats_grps; i++) + total += stats_grps[i]->get_num_stats(priv); + + return total; +} + +void mlx5e_stats_update_ndo_stats(struct mlx5e_priv *priv) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i; + + for (i = num_stats_grps - 1; i >= 0; i--) + if (stats_grps[i]->update_stats && + stats_grps[i]->update_stats_mask & MLX5E_NDO_UPDATE_STATS) + stats_grps[i]->update_stats(priv); +} + +void mlx5e_stats_update(struct mlx5e_priv *priv) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i; + + for (i = num_stats_grps - 1; i >= 0; i--) + if (stats_grps[i]->update_stats) + stats_grps[i]->update_stats(priv); +} + +void mlx5e_stats_fill(struct mlx5e_priv *priv, u64 *data, int idx) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i; + + for (i = 0; i < num_stats_grps; i++) + idx = stats_grps[i]->fill_stats(priv, data, idx); +} + +void mlx5e_stats_fill_strings(struct mlx5e_priv *priv, u8 *data) +{ + mlx5e_stats_grp_t *stats_grps = priv->profile->stats_grps; + const unsigned int num_stats_grps = stats_grps_num(priv); + int i, idx = 0; + + for (i = 0; i < num_stats_grps; i++) + idx = stats_grps[i]->fill_strings(priv, data, idx); +} + +/* Concrete NIC Stats */ + +static const struct counter_desc sw_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_added_vlan_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_nop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_mpwqe_blks) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_mpwqe_pkts) }, + +#ifdef CONFIG_MLX5_EN_TLS + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_encrypted_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_encrypted_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_ooo) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_dump_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_dump_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_resync_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_skip_no_sync_data) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_drop_no_sync_data) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_drop_bypass_req) }, +#endif + + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_skbs) }, + { MLX5E_DECLARE_STAT(struct 
mlx5e_sw_stats, rx_gro_match_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_gro_large_hds) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_ecn_mark) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_removed_vlan_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete_tail) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete_tail_slow) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_redirect) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_xmit) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_mpwqe) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_inlnw) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_nops) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_none) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xmit_more) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_recover) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_xmit) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_mpwqe) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_inlnw) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_nops) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_cqes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler_cqes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler_strides) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_oversize_pkts_sw_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_reuse) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_empty) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_busy) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_ext) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_rdc) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_alloc) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_waive) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_congst_umr) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_recover) }, + { MLX5E_DECLARE_STAT(struct 
mlx5e_sw_stats, rx_pet_hdr_lookup_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_pet_mdata_lookup_drop) }, +#ifdef CONFIG_MLX5_EN_TLS + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_decrypted_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_decrypted_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_resync_req_pkt) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_resync_req_start) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_resync_req_end) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_resync_req_skip) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_resync_res_ok) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_resync_res_retry) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_resync_res_skip) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_tls_err) }, +#endif + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_events) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_poll) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_arm) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_aff_change) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_force_irq) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_eq_rearm) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_complete) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_unnecessary) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_unnecessary_inner) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_none) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_ecn_mark) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_removed_vlan_packets) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_xdp_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_xdp_redirect) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_wqe_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_mpwqe_filler_cqes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_mpwqe_filler_strides) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_oversize_pkts_sw_drop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_buff_alloc_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_blks) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_pkts) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_congst_umr) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_arfs_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_xmit) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_mpwqe) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_inlnw) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_full) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_err) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_cqes) }, +}; + +#define NUM_SW_COUNTERS ARRAY_SIZE(sw_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(sw) +{ + return NUM_SW_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(sw) +{ + int i; + + for (i = 0; i < NUM_SW_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, sw_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(sw) +{ + int i; + + for (i = 0; i < NUM_SW_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.sw, sw_stats_desc, i); + return idx; +} + +static void mlx5e_stats_grp_sw_update_stats_xdp_red(struct 
mlx5e_sw_stats *s, + struct mlx5e_xdpsq_stats *xdpsq_red_stats) +{ + s->tx_xdp_xmit += xdpsq_red_stats->xmit; + s->tx_xdp_mpwqe += xdpsq_red_stats->mpwqe; + s->tx_xdp_inlnw += xdpsq_red_stats->inlnw; + s->tx_xdp_nops += xdpsq_red_stats->nops; + s->tx_xdp_full += xdpsq_red_stats->full; + s->tx_xdp_err += xdpsq_red_stats->err; + s->tx_xdp_cqes += xdpsq_red_stats->cqes; +} + +static void mlx5e_stats_grp_sw_update_stats_xdpsq(struct mlx5e_sw_stats *s, + struct mlx5e_xdpsq_stats *xdpsq_stats) +{ + s->rx_xdp_tx_xmit += xdpsq_stats->xmit; + s->rx_xdp_tx_mpwqe += xdpsq_stats->mpwqe; + s->rx_xdp_tx_inlnw += xdpsq_stats->inlnw; + s->rx_xdp_tx_nops += xdpsq_stats->nops; + s->rx_xdp_tx_full += xdpsq_stats->full; + s->rx_xdp_tx_err += xdpsq_stats->err; + s->rx_xdp_tx_cqe += xdpsq_stats->cqes; +} + +static void mlx5e_stats_grp_sw_update_stats_xsksq(struct mlx5e_sw_stats *s, + struct mlx5e_xdpsq_stats *xsksq_stats) +{ + s->tx_xsk_xmit += xsksq_stats->xmit; + s->tx_xsk_mpwqe += xsksq_stats->mpwqe; + s->tx_xsk_inlnw += xsksq_stats->inlnw; + s->tx_xsk_full += xsksq_stats->full; + s->tx_xsk_err += xsksq_stats->err; + s->tx_xsk_cqes += xsksq_stats->cqes; +} + +static void mlx5e_stats_grp_sw_update_stats_xskrq(struct mlx5e_sw_stats *s, + struct mlx5e_rq_stats *xskrq_stats) +{ + s->rx_xsk_packets += xskrq_stats->packets; + s->rx_xsk_bytes += xskrq_stats->bytes; + s->rx_xsk_csum_complete += xskrq_stats->csum_complete; + s->rx_xsk_csum_unnecessary += xskrq_stats->csum_unnecessary; + s->rx_xsk_csum_unnecessary_inner += xskrq_stats->csum_unnecessary_inner; + s->rx_xsk_csum_none += xskrq_stats->csum_none; + s->rx_xsk_ecn_mark += xskrq_stats->ecn_mark; + s->rx_xsk_removed_vlan_packets += xskrq_stats->removed_vlan_packets; + s->rx_xsk_xdp_drop += xskrq_stats->xdp_drop; + s->rx_xsk_xdp_redirect += xskrq_stats->xdp_redirect; + s->rx_xsk_wqe_err += xskrq_stats->wqe_err; + s->rx_xsk_mpwqe_filler_cqes += xskrq_stats->mpwqe_filler_cqes; + s->rx_xsk_mpwqe_filler_strides += xskrq_stats->mpwqe_filler_strides; + s->rx_xsk_oversize_pkts_sw_drop += xskrq_stats->oversize_pkts_sw_drop; + s->rx_xsk_buff_alloc_err += xskrq_stats->buff_alloc_err; + s->rx_xsk_cqe_compress_blks += xskrq_stats->cqe_compress_blks; + s->rx_xsk_cqe_compress_pkts += xskrq_stats->cqe_compress_pkts; + s->rx_xsk_congst_umr += xskrq_stats->congst_umr; + s->rx_xsk_arfs_err += xskrq_stats->arfs_err; +} + +static void mlx5e_stats_grp_sw_update_stats_rq_stats(struct mlx5e_sw_stats *s, + struct mlx5e_rq_stats *rq_stats) +{ + s->rx_packets += rq_stats->packets; + s->rx_bytes += rq_stats->bytes; + s->rx_lro_packets += rq_stats->lro_packets; + s->rx_lro_bytes += rq_stats->lro_bytes; + s->rx_gro_packets += rq_stats->gro_packets; + s->rx_gro_bytes += rq_stats->gro_bytes; + s->rx_gro_skbs += rq_stats->gro_skbs; + s->rx_gro_match_packets += rq_stats->gro_match_packets; + s->rx_gro_large_hds += rq_stats->gro_large_hds; + s->rx_ecn_mark += rq_stats->ecn_mark; + s->rx_removed_vlan_packets += rq_stats->removed_vlan_packets; + s->rx_csum_none += rq_stats->csum_none; + s->rx_csum_complete += rq_stats->csum_complete; + s->rx_csum_complete_tail += rq_stats->csum_complete_tail; + s->rx_csum_complete_tail_slow += rq_stats->csum_complete_tail_slow; + s->rx_csum_unnecessary += rq_stats->csum_unnecessary; + s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner; + s->rx_xdp_drop += rq_stats->xdp_drop; + s->rx_xdp_redirect += rq_stats->xdp_redirect; + s->rx_wqe_err += rq_stats->wqe_err; + s->rx_mpwqe_filler_cqes += rq_stats->mpwqe_filler_cqes; + s->rx_mpwqe_filler_strides += 
rq_stats->mpwqe_filler_strides; + s->rx_oversize_pkts_sw_drop += rq_stats->oversize_pkts_sw_drop; + s->rx_buff_alloc_err += rq_stats->buff_alloc_err; + s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks; + s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts; + s->rx_cache_reuse += rq_stats->cache_reuse; + s->rx_cache_full += rq_stats->cache_full; + s->rx_cache_empty += rq_stats->cache_empty; + s->rx_cache_busy += rq_stats->cache_busy; + s->rx_cache_waive += rq_stats->cache_waive; + s->rx_cache_ext += rq_stats->cache_ext; + s->rx_cache_rdc += rq_stats->cache_rdc; + s->rx_cache_alloc += rq_stats->cache_alloc; + s->rx_congst_umr += rq_stats->congst_umr; + s->rx_arfs_err += rq_stats->arfs_err; + s->rx_recover += rq_stats->recover; + s->rx_pet_hdr_lookup_drop += rq_stats->pet_hdr_lookup_drop; + s->rx_pet_mdata_lookup_drop += rq_stats->pet_mdata_lookup_drop; +#ifdef CONFIG_MLX5_EN_TLS + s->rx_tls_decrypted_packets += rq_stats->tls_decrypted_packets; + s->rx_tls_decrypted_bytes += rq_stats->tls_decrypted_bytes; + s->rx_tls_resync_req_pkt += rq_stats->tls_resync_req_pkt; + s->rx_tls_resync_req_start += rq_stats->tls_resync_req_start; + s->rx_tls_resync_req_end += rq_stats->tls_resync_req_end; + s->rx_tls_resync_req_skip += rq_stats->tls_resync_req_skip; + s->rx_tls_resync_res_ok += rq_stats->tls_resync_res_ok; + s->rx_tls_resync_res_retry += rq_stats->tls_resync_res_retry; + s->rx_tls_resync_res_skip += rq_stats->tls_resync_res_skip; + s->rx_tls_err += rq_stats->tls_err; +#endif +} + +static void mlx5e_stats_grp_sw_update_stats_ch_stats(struct mlx5e_sw_stats *s, + struct mlx5e_ch_stats *ch_stats) +{ + s->ch_events += ch_stats->events; + s->ch_poll += ch_stats->poll; + s->ch_arm += ch_stats->arm; + s->ch_aff_change += ch_stats->aff_change; + s->ch_force_irq += ch_stats->force_irq; + s->ch_eq_rearm += ch_stats->eq_rearm; +} + +static void mlx5e_stats_grp_sw_update_stats_sq(struct mlx5e_sw_stats *s, + struct mlx5e_sq_stats *sq_stats) +{ + s->tx_packets += sq_stats->packets; + s->tx_bytes += sq_stats->bytes; + s->tx_tso_packets += sq_stats->tso_packets; + s->tx_tso_bytes += sq_stats->tso_bytes; + s->tx_tso_inner_packets += sq_stats->tso_inner_packets; + s->tx_tso_inner_bytes += sq_stats->tso_inner_bytes; + s->tx_added_vlan_packets += sq_stats->added_vlan_packets; + s->tx_nop += sq_stats->nop; + s->tx_mpwqe_blks += sq_stats->mpwqe_blks; + s->tx_mpwqe_pkts += sq_stats->mpwqe_pkts; + s->tx_queue_stopped += sq_stats->stopped; + s->tx_queue_wake += sq_stats->wake; + s->tx_queue_dropped += sq_stats->dropped; + s->tx_cqe_err += sq_stats->cqe_err; + s->tx_recover += sq_stats->recover; + s->tx_xmit_more += sq_stats->xmit_more; + s->tx_csum_partial_inner += sq_stats->csum_partial_inner; + s->tx_csum_none += sq_stats->csum_none; + s->tx_csum_partial += sq_stats->csum_partial; +#ifdef CONFIG_MLX5_EN_TLS + s->tx_tls_encrypted_packets += sq_stats->tls_encrypted_packets; + s->tx_tls_encrypted_bytes += sq_stats->tls_encrypted_bytes; + s->tx_tls_ooo += sq_stats->tls_ooo; + s->tx_tls_dump_bytes += sq_stats->tls_dump_bytes; + s->tx_tls_dump_packets += sq_stats->tls_dump_packets; + s->tx_tls_resync_bytes += sq_stats->tls_resync_bytes; + s->tx_tls_skip_no_sync_data += sq_stats->tls_skip_no_sync_data; + s->tx_tls_drop_no_sync_data += sq_stats->tls_drop_no_sync_data; + s->tx_tls_drop_bypass_req += sq_stats->tls_drop_bypass_req; +#endif + s->tx_cqes += sq_stats->cqes; +} + +static void mlx5e_stats_grp_sw_update_stats_ptp(struct mlx5e_priv *priv, + struct mlx5e_sw_stats *s) +{ + int i; + + if (!priv->tx_ptp_opened && 
!priv->rx_ptp_opened) + return; + + mlx5e_stats_grp_sw_update_stats_ch_stats(s, &priv->ptp_stats.ch); + + if (priv->tx_ptp_opened) { + for (i = 0; i < priv->max_opened_tc; i++) { + mlx5e_stats_grp_sw_update_stats_sq(s, &priv->ptp_stats.sq[i]); + + /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */ + barrier(); + } + } + if (priv->rx_ptp_opened) { + mlx5e_stats_grp_sw_update_stats_rq_stats(s, &priv->ptp_stats.rq); + + /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */ + barrier(); + } +} + +static void mlx5e_stats_grp_sw_update_stats_qos(struct mlx5e_priv *priv, + struct mlx5e_sw_stats *s) +{ + struct mlx5e_sq_stats **stats; + u16 max_qos_sqs; + int i; + + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs); + stats = READ_ONCE(priv->htb.qos_sq_stats); + + for (i = 0; i < max_qos_sqs; i++) { + mlx5e_stats_grp_sw_update_stats_sq(s, READ_ONCE(stats[i])); + + /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */ + barrier(); + } +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw) +{ + struct mlx5e_sw_stats *s = &priv->stats.sw; + int i; + + memset(s, 0, sizeof(*s)); + + for (i = 0; i < priv->stats_nch; i++) { + struct mlx5e_channel_stats *channel_stats = + priv->channel_stats[i]; + int j; + + mlx5e_stats_grp_sw_update_stats_rq_stats(s, &channel_stats->rq); + mlx5e_stats_grp_sw_update_stats_xdpsq(s, &channel_stats->rq_xdpsq); + mlx5e_stats_grp_sw_update_stats_ch_stats(s, &channel_stats->ch); + /* xdp redirect */ + mlx5e_stats_grp_sw_update_stats_xdp_red(s, &channel_stats->xdpsq); + /* AF_XDP zero-copy */ + mlx5e_stats_grp_sw_update_stats_xskrq(s, &channel_stats->xskrq); + mlx5e_stats_grp_sw_update_stats_xsksq(s, &channel_stats->xsksq); + + for (j = 0; j < priv->max_opened_tc; j++) { + mlx5e_stats_grp_sw_update_stats_sq(s, &channel_stats->sq[j]); + + /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */ + barrier(); + } + } + mlx5e_stats_grp_sw_update_stats_ptp(priv, s); + mlx5e_stats_grp_sw_update_stats_qos(priv, s); +} + +static const struct counter_desc q_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_out_of_buffer) }, +}; + +static const struct counter_desc drop_rq_stats_desc[] = { + { MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_if_down_packets) }, +}; + +#define NUM_Q_COUNTERS ARRAY_SIZE(q_stats_desc) +#define NUM_DROP_RQ_COUNTERS ARRAY_SIZE(drop_rq_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(qcnt) +{ + int num_stats = 0; + + if (priv->q_counter) + num_stats += NUM_Q_COUNTERS; + + if (priv->drop_rq_q_counter) + num_stats += NUM_DROP_RQ_COUNTERS; + + return num_stats; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(qcnt) +{ + int i; + + for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + q_stats_desc[i].format); + + for (i = 0; i < NUM_DROP_RQ_COUNTERS && priv->drop_rq_q_counter; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + drop_rq_stats_desc[i].format); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qcnt) +{ + int i; + + for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++) + data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt, + q_stats_desc, i); + for (i = 0; i < NUM_DROP_RQ_COUNTERS && priv->drop_rq_q_counter; i++) + data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt, + drop_rq_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qcnt) +{ + struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt; + u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; + 
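+	/* The same QUERY_Q_COUNTER command buffers are reused for both counter
+	 * sets below; only counter_set_id differs between the two queries. */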
u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; + int ret; + + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); + + if (priv->q_counter) { + MLX5_SET(query_q_counter_in, in, counter_set_id, + priv->q_counter); + ret = mlx5_cmd_exec_inout(priv->mdev, query_q_counter, in, out); + if (!ret) + qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out, + out, out_of_buffer); + } + + if (priv->drop_rq_q_counter) { + MLX5_SET(query_q_counter_in, in, counter_set_id, + priv->drop_rq_q_counter); + ret = mlx5_cmd_exec_inout(priv->mdev, query_q_counter, in, out); + if (!ret) + qcnt->rx_if_down_packets = MLX5_GET(query_q_counter_out, + out, out_of_buffer); + } +} + +#define VNIC_ENV_OFF(c) MLX5_BYTE_OFF(query_vnic_env_out, c) +static const struct counter_desc vnic_env_stats_steer_desc[] = { + { "rx_steer_missed_packets", + VNIC_ENV_OFF(vport_env.nic_receive_steering_discard) }, +}; + +static const struct counter_desc vnic_env_stats_dev_oob_desc[] = { + { "dev_internal_queue_oob", + VNIC_ENV_OFF(vport_env.internal_rq_out_of_buffer) }, +}; + +#define NUM_VNIC_ENV_STEER_COUNTERS(dev) \ + (MLX5_CAP_GEN(dev, nic_receive_steering_discard) ? \ + ARRAY_SIZE(vnic_env_stats_steer_desc) : 0) +#define NUM_VNIC_ENV_DEV_OOB_COUNTERS(dev) \ + (MLX5_CAP_GEN(dev, vnic_env_int_rq_oob) ? \ + ARRAY_SIZE(vnic_env_stats_dev_oob_desc) : 0) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(vnic_env) +{ + return NUM_VNIC_ENV_STEER_COUNTERS(priv->mdev) + + NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vnic_env) +{ + int i; + + for (i = 0; i < NUM_VNIC_ENV_STEER_COUNTERS(priv->mdev); i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + vnic_env_stats_steer_desc[i].format); + + for (i = 0; i < NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev); i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + vnic_env_stats_dev_oob_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vnic_env) +{ + int i; + + for (i = 0; i < NUM_VNIC_ENV_STEER_COUNTERS(priv->mdev); i++) + data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vnic.query_vnic_env_out, + vnic_env_stats_steer_desc, i); + + for (i = 0; i < NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev); i++) + data[idx++] = MLX5E_READ_CTR32_BE(priv->stats.vnic.query_vnic_env_out, + vnic_env_stats_dev_oob_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vnic_env) +{ + u32 *out = (u32 *)priv->stats.vnic.query_vnic_env_out; + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) + return; + + MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV); + mlx5_cmd_exec_inout(mdev, query_vnic_env, in, out); +} + +#define VPORT_COUNTER_OFF(c) MLX5_BYTE_OFF(query_vport_counter_out, c) +static const struct counter_desc vport_stats_desc[] = { + { "rx_vport_unicast_packets", + VPORT_COUNTER_OFF(received_eth_unicast.packets) }, + { "rx_vport_unicast_bytes", + VPORT_COUNTER_OFF(received_eth_unicast.octets) }, + { "tx_vport_unicast_packets", + VPORT_COUNTER_OFF(transmitted_eth_unicast.packets) }, + { "tx_vport_unicast_bytes", + VPORT_COUNTER_OFF(transmitted_eth_unicast.octets) }, + { "rx_vport_multicast_packets", + VPORT_COUNTER_OFF(received_eth_multicast.packets) }, + { "rx_vport_multicast_bytes", + VPORT_COUNTER_OFF(received_eth_multicast.octets) }, + { "tx_vport_multicast_packets", + VPORT_COUNTER_OFF(transmitted_eth_multicast.packets) }, + { "tx_vport_multicast_bytes", + 
VPORT_COUNTER_OFF(transmitted_eth_multicast.octets) }, + { "rx_vport_broadcast_packets", + VPORT_COUNTER_OFF(received_eth_broadcast.packets) }, + { "rx_vport_broadcast_bytes", + VPORT_COUNTER_OFF(received_eth_broadcast.octets) }, + { "tx_vport_broadcast_packets", + VPORT_COUNTER_OFF(transmitted_eth_broadcast.packets) }, + { "tx_vport_broadcast_bytes", + VPORT_COUNTER_OFF(transmitted_eth_broadcast.octets) }, + { "rx_vport_rdma_unicast_packets", + VPORT_COUNTER_OFF(received_ib_unicast.packets) }, + { "rx_vport_rdma_unicast_bytes", + VPORT_COUNTER_OFF(received_ib_unicast.octets) }, + { "tx_vport_rdma_unicast_packets", + VPORT_COUNTER_OFF(transmitted_ib_unicast.packets) }, + { "tx_vport_rdma_unicast_bytes", + VPORT_COUNTER_OFF(transmitted_ib_unicast.octets) }, + { "rx_vport_rdma_multicast_packets", + VPORT_COUNTER_OFF(received_ib_multicast.packets) }, + { "rx_vport_rdma_multicast_bytes", + VPORT_COUNTER_OFF(received_ib_multicast.octets) }, + { "tx_vport_rdma_multicast_packets", + VPORT_COUNTER_OFF(transmitted_ib_multicast.packets) }, + { "tx_vport_rdma_multicast_bytes", + VPORT_COUNTER_OFF(transmitted_ib_multicast.octets) }, +}; + +#define NUM_VPORT_COUNTERS ARRAY_SIZE(vport_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(vport) +{ + return NUM_VPORT_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vport) +{ + int i; + + for (i = 0; i < NUM_VPORT_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, vport_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vport) +{ + int i; + + for (i = 0; i < NUM_VPORT_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vport.query_vport_out, + vport_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(vport) +{ + u32 *out = (u32 *)priv->stats.vport.query_vport_out; + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {}; + struct mlx5_core_dev *mdev = priv->mdev; + + MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); + mlx5_cmd_exec_inout(mdev, query_vport_counter, in, out); +} + +#define PPORT_802_3_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_802_3_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_802_3_stats_desc[] = { + { "tx_packets_phy", PPORT_802_3_OFF(a_frames_transmitted_ok) }, + { "rx_packets_phy", PPORT_802_3_OFF(a_frames_received_ok) }, + { "rx_crc_errors_phy", PPORT_802_3_OFF(a_frame_check_sequence_errors) }, + { "tx_bytes_phy", PPORT_802_3_OFF(a_octets_transmitted_ok) }, + { "rx_bytes_phy", PPORT_802_3_OFF(a_octets_received_ok) }, + { "tx_multicast_phy", PPORT_802_3_OFF(a_multicast_frames_xmitted_ok) }, + { "tx_broadcast_phy", PPORT_802_3_OFF(a_broadcast_frames_xmitted_ok) }, + { "rx_multicast_phy", PPORT_802_3_OFF(a_multicast_frames_received_ok) }, + { "rx_broadcast_phy", PPORT_802_3_OFF(a_broadcast_frames_received_ok) }, + { "rx_in_range_len_errors_phy", PPORT_802_3_OFF(a_in_range_length_errors) }, + { "rx_out_of_range_len_phy", PPORT_802_3_OFF(a_out_of_range_length_field) }, + { "rx_oversize_pkts_phy", PPORT_802_3_OFF(a_frame_too_long_errors) }, + { "rx_symbol_err_phy", PPORT_802_3_OFF(a_symbol_error_during_carrier) }, + { "tx_mac_control_phy", PPORT_802_3_OFF(a_mac_control_frames_transmitted) }, + { "rx_mac_control_phy", PPORT_802_3_OFF(a_mac_control_frames_received) }, + { "rx_unsupported_op_phy", PPORT_802_3_OFF(a_unsupported_opcodes_received) }, + { "rx_pause_ctrl_phy", PPORT_802_3_OFF(a_pause_mac_ctrl_frames_received) }, + { "tx_pause_ctrl_phy", 
PPORT_802_3_OFF(a_pause_mac_ctrl_frames_transmitted) }, +}; + +#define NUM_PPORT_802_3_COUNTERS ARRAY_SIZE(pport_802_3_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(802_3) +{ + return NUM_PPORT_802_3_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(802_3) +{ + int i; + + for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_802_3_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(802_3) +{ + int i; + + for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.IEEE_802_3_counters, + pport_802_3_stats_desc, i); + return idx; +} + +#define MLX5_BASIC_PPCNT_SUPPORTED(mdev) \ + (MLX5_CAP_GEN(mdev, pcam_reg) ? MLX5_CAP_PCAM_REG(mdev, ppcnt) : 1) + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(802_3) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + if (!MLX5_BASIC_PPCNT_SUPPORTED(mdev)) + return; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->IEEE_802_3_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define MLX5E_READ_CTR64_BE_F(ptr, set, c) \ + be64_to_cpu(*(__be64 *)((char *)ptr + \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.set.c##_high))) + +static int mlx5e_stats_get_ieee(struct mlx5_core_dev *mdev, + u32 *ppcnt_ieee_802_3) +{ + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + if (!MLX5_BASIC_PPCNT_SUPPORTED(mdev)) + return -EOPNOTSUPP; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); + return mlx5_core_access_reg(mdev, in, sz, ppcnt_ieee_802_3, + sz, MLX5_REG_PPCNT, 0, 0); +} + +void mlx5e_stats_pause_get(struct mlx5e_priv *priv, + struct ethtool_pause_stats *pause_stats) +{ + u32 ppcnt_ieee_802_3[MLX5_ST_SZ_DW(ppcnt_reg)]; + struct mlx5_core_dev *mdev = priv->mdev; + + if (mlx5e_stats_get_ieee(mdev, ppcnt_ieee_802_3)) + return; + + pause_stats->tx_pause_frames = + MLX5E_READ_CTR64_BE_F(ppcnt_ieee_802_3, + eth_802_3_cntrs_grp_data_layout, + a_pause_mac_ctrl_frames_transmitted); + pause_stats->rx_pause_frames = + MLX5E_READ_CTR64_BE_F(ppcnt_ieee_802_3, + eth_802_3_cntrs_grp_data_layout, + a_pause_mac_ctrl_frames_received); +} + +void mlx5e_stats_eth_phy_get(struct mlx5e_priv *priv, + struct ethtool_eth_phy_stats *phy_stats) +{ + u32 ppcnt_ieee_802_3[MLX5_ST_SZ_DW(ppcnt_reg)]; + struct mlx5_core_dev *mdev = priv->mdev; + + if (mlx5e_stats_get_ieee(mdev, ppcnt_ieee_802_3)) + return; + + phy_stats->SymbolErrorDuringCarrier = + MLX5E_READ_CTR64_BE_F(ppcnt_ieee_802_3, + eth_802_3_cntrs_grp_data_layout, + a_symbol_error_during_carrier); +} + +void mlx5e_stats_eth_mac_get(struct mlx5e_priv *priv, + struct ethtool_eth_mac_stats *mac_stats) +{ + u32 ppcnt_ieee_802_3[MLX5_ST_SZ_DW(ppcnt_reg)]; + struct mlx5_core_dev *mdev = priv->mdev; + + if (mlx5e_stats_get_ieee(mdev, ppcnt_ieee_802_3)) + return; + +#define RD(name) \ + MLX5E_READ_CTR64_BE_F(ppcnt_ieee_802_3, \ + eth_802_3_cntrs_grp_data_layout, \ + name) + + mac_stats->FramesTransmittedOK = RD(a_frames_transmitted_ok); + mac_stats->FramesReceivedOK = RD(a_frames_received_ok); + mac_stats->FrameCheckSequenceErrors = RD(a_frame_check_sequence_errors); + mac_stats->OctetsTransmittedOK = RD(a_octets_transmitted_ok); + mac_stats->OctetsReceivedOK = 
RD(a_octets_received_ok); + mac_stats->MulticastFramesXmittedOK = RD(a_multicast_frames_xmitted_ok); + mac_stats->BroadcastFramesXmittedOK = RD(a_broadcast_frames_xmitted_ok); + mac_stats->MulticastFramesReceivedOK = RD(a_multicast_frames_received_ok); + mac_stats->BroadcastFramesReceivedOK = RD(a_broadcast_frames_received_ok); + mac_stats->InRangeLengthErrors = RD(a_in_range_length_errors); + mac_stats->OutOfRangeLengthField = RD(a_out_of_range_length_field); + mac_stats->FrameTooLongErrors = RD(a_frame_too_long_errors); +#undef RD +} + +void mlx5e_stats_eth_ctrl_get(struct mlx5e_priv *priv, + struct ethtool_eth_ctrl_stats *ctrl_stats) +{ + u32 ppcnt_ieee_802_3[MLX5_ST_SZ_DW(ppcnt_reg)]; + struct mlx5_core_dev *mdev = priv->mdev; + + if (mlx5e_stats_get_ieee(mdev, ppcnt_ieee_802_3)) + return; + + ctrl_stats->MACControlFramesTransmitted = + MLX5E_READ_CTR64_BE_F(ppcnt_ieee_802_3, + eth_802_3_cntrs_grp_data_layout, + a_mac_control_frames_transmitted); + ctrl_stats->MACControlFramesReceived = + MLX5E_READ_CTR64_BE_F(ppcnt_ieee_802_3, + eth_802_3_cntrs_grp_data_layout, + a_mac_control_frames_received); + ctrl_stats->UnsupportedOpcodesReceived = + MLX5E_READ_CTR64_BE_F(ppcnt_ieee_802_3, + eth_802_3_cntrs_grp_data_layout, + a_unsupported_opcodes_received); +} + +#define PPORT_2863_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_2863_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_2863_stats_desc[] = { + { "rx_discards_phy", PPORT_2863_OFF(if_in_discards) }, + { "tx_discards_phy", PPORT_2863_OFF(if_out_discards) }, + { "tx_errors_phy", PPORT_2863_OFF(if_out_errors) }, +}; + +#define NUM_PPORT_2863_COUNTERS ARRAY_SIZE(pport_2863_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(2863) +{ + return NUM_PPORT_2863_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(2863) +{ + int i; + + for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_2863_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(2863) +{ + int i; + + for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2863_counters, + pport_2863_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(2863) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->RFC_2863_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define PPORT_2819_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_2819_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_2819_stats_desc[] = { + { "rx_undersize_pkts_phy", PPORT_2819_OFF(ether_stats_undersize_pkts) }, + { "rx_fragments_phy", PPORT_2819_OFF(ether_stats_fragments) }, + { "rx_jabbers_phy", PPORT_2819_OFF(ether_stats_jabbers) }, + { "rx_64_bytes_phy", PPORT_2819_OFF(ether_stats_pkts64octets) }, + { "rx_65_to_127_bytes_phy", PPORT_2819_OFF(ether_stats_pkts65to127octets) }, + { "rx_128_to_255_bytes_phy", PPORT_2819_OFF(ether_stats_pkts128to255octets) }, + { "rx_256_to_511_bytes_phy", PPORT_2819_OFF(ether_stats_pkts256to511octets) }, + { "rx_512_to_1023_bytes_phy", PPORT_2819_OFF(ether_stats_pkts512to1023octets) }, + { "rx_1024_to_1518_bytes_phy", PPORT_2819_OFF(ether_stats_pkts1024to1518octets) }, + 
{ "rx_1519_to_2047_bytes_phy", PPORT_2819_OFF(ether_stats_pkts1519to2047octets) }, + { "rx_2048_to_4095_bytes_phy", PPORT_2819_OFF(ether_stats_pkts2048to4095octets) }, + { "rx_4096_to_8191_bytes_phy", PPORT_2819_OFF(ether_stats_pkts4096to8191octets) }, + { "rx_8192_to_10239_bytes_phy", PPORT_2819_OFF(ether_stats_pkts8192to10239octets) }, +}; + +#define NUM_PPORT_2819_COUNTERS ARRAY_SIZE(pport_2819_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(2819) +{ + return NUM_PPORT_2819_COUNTERS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(2819) +{ + int i; + + for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_2819_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(2819) +{ + int i; + + for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2819_counters, + pport_2819_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(2819) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + if (!MLX5_BASIC_PPCNT_SUPPORTED(mdev)) + return; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->RFC_2819_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +static const struct ethtool_rmon_hist_range mlx5e_rmon_ranges[] = { + { 0, 64 }, + { 65, 127 }, + { 128, 255 }, + { 256, 511 }, + { 512, 1023 }, + { 1024, 1518 }, + { 1519, 2047 }, + { 2048, 4095 }, + { 4096, 8191 }, + { 8192, 10239 }, + {} +}; + +void mlx5e_stats_rmon_get(struct mlx5e_priv *priv, + struct ethtool_rmon_stats *rmon, + const struct ethtool_rmon_hist_range **ranges) +{ + u32 ppcnt_RFC_2819_counters[MLX5_ST_SZ_DW(ppcnt_reg)]; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + MLX5_SET(ppcnt_reg, in, local_port, 1); + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); + if (mlx5_core_access_reg(mdev, in, sz, ppcnt_RFC_2819_counters, + sz, MLX5_REG_PPCNT, 0, 0)) + return; + +#define RD(name) \ + MLX5E_READ_CTR64_BE_F(ppcnt_RFC_2819_counters, \ + eth_2819_cntrs_grp_data_layout, \ + name) + + rmon->undersize_pkts = RD(ether_stats_undersize_pkts); + rmon->fragments = RD(ether_stats_fragments); + rmon->jabbers = RD(ether_stats_jabbers); + + rmon->hist[0] = RD(ether_stats_pkts64octets); + rmon->hist[1] = RD(ether_stats_pkts65to127octets); + rmon->hist[2] = RD(ether_stats_pkts128to255octets); + rmon->hist[3] = RD(ether_stats_pkts256to511octets); + rmon->hist[4] = RD(ether_stats_pkts512to1023octets); + rmon->hist[5] = RD(ether_stats_pkts1024to1518octets); + rmon->hist[6] = RD(ether_stats_pkts1519to2047octets); + rmon->hist[7] = RD(ether_stats_pkts2048to4095octets); + rmon->hist[8] = RD(ether_stats_pkts4096to8191octets); + rmon->hist[9] = RD(ether_stats_pkts8192to10239octets); +#undef RD + + *ranges = mlx5e_rmon_ranges; +} + +#define PPORT_PHY_STATISTICAL_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.phys_layer_statistical_cntrs.c##_high) +static const struct counter_desc pport_phy_statistical_stats_desc[] = { + { "rx_pcs_symbol_err_phy", PPORT_PHY_STATISTICAL_OFF(phy_symbol_errors) }, + { "rx_corrected_bits_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits) }, +}; + +static const struct counter_desc +pport_phy_statistical_err_lanes_stats_desc[] = { 
+ { "rx_err_lane_0_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits_lane0) }, + { "rx_err_lane_1_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits_lane1) }, + { "rx_err_lane_2_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits_lane2) }, + { "rx_err_lane_3_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits_lane3) }, +}; + +#define NUM_PPORT_PHY_STATISTICAL_COUNTERS \ + ARRAY_SIZE(pport_phy_statistical_stats_desc) +#define NUM_PPORT_PHY_STATISTICAL_PER_LANE_COUNTERS \ + ARRAY_SIZE(pport_phy_statistical_err_lanes_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(phy) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int num_stats; + + /* "1" for link_down_events special counter */ + num_stats = 1; + + num_stats += MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) ? + NUM_PPORT_PHY_STATISTICAL_COUNTERS : 0; + + num_stats += MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters) ? + NUM_PPORT_PHY_STATISTICAL_PER_LANE_COUNTERS : 0; + + return num_stats; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(phy) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int i; + + strcpy(data + (idx++) * ETH_GSTRING_LEN, "link_down_events_phy"); + + if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group)) + return idx; + + for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_phy_statistical_stats_desc[i].format); + + if (MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) + for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_PER_LANE_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_phy_statistical_err_lanes_stats_desc[i].format); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(phy) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int i; + + /* link_down_events_phy has special handling since it is not stored in __be64 format */ + data[idx++] = MLX5_GET(ppcnt_reg, priv->stats.pport.phy_counters, + counter_set.phys_layer_cntrs.link_down_events); + + if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group)) + return idx; + + for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters, + pport_phy_statistical_stats_desc, i); + + if (MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) + for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_PER_LANE_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters, + pport_phy_statistical_err_lanes_stats_desc, + i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(phy) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->phy_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + + if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group)) + return; + + out = pstats->phy_statistical_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +static int fec_num_lanes(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(pmlp_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {}; + int err; + + MLX5_SET(pmlp_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_PMLP, 0, 0); + if (err) + return 0; + + return MLX5_GET(pmlp_reg, 
out, width); +} + +static int fec_active_mode(struct mlx5_core_dev *mdev) +{ + unsigned long fec_active_long; + u32 fec_active; + + if (mlx5e_get_fec_mode(mdev, &fec_active, NULL)) + return MLX5E_FEC_NOFEC; + + fec_active_long = fec_active; + return find_first_bit(&fec_active_long, sizeof(unsigned long) * BITS_PER_BYTE); +} + +#define MLX5E_STATS_SET_FEC_BLOCK(idx) ({ \ + fec_stats->corrected_blocks.lanes[(idx)] = \ + MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, \ + fc_fec_corrected_blocks_lane##idx); \ + fec_stats->uncorrectable_blocks.lanes[(idx)] = \ + MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, \ + fc_fec_uncorrectable_blocks_lane##idx); \ +}) + +static void fec_set_fc_stats(struct ethtool_fec_stats *fec_stats, + u32 *ppcnt, u8 lanes) +{ + if (lanes > 3) { /* 4 lanes */ + MLX5E_STATS_SET_FEC_BLOCK(3); + MLX5E_STATS_SET_FEC_BLOCK(2); + } + if (lanes > 1) /* 2 lanes */ + MLX5E_STATS_SET_FEC_BLOCK(1); + if (lanes > 0) /* 1 lane */ + MLX5E_STATS_SET_FEC_BLOCK(0); +} + +static void fec_set_rs_stats(struct ethtool_fec_stats *fec_stats, u32 *ppcnt) +{ + fec_stats->corrected_blocks.total = + MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, + rs_fec_corrected_blocks); + fec_stats->uncorrectable_blocks.total = + MLX5E_READ_CTR64_BE_F(ppcnt, phys_layer_cntrs, + rs_fec_uncorrectable_blocks); +} + +static void fec_set_block_stats(struct mlx5e_priv *priv, + struct ethtool_fec_stats *fec_stats) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + int mode = fec_active_mode(mdev); + + if (mode == MLX5E_FEC_NOFEC) + return; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP); + if (mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0)) + return; + + switch (mode) { + case MLX5E_FEC_RS_528_514: + case MLX5E_FEC_RS_544_514: + case MLX5E_FEC_LLRS_272_257_1: + fec_set_rs_stats(fec_stats, out); + return; + case MLX5E_FEC_FIRECODE: + fec_set_fc_stats(fec_stats, out, fec_num_lanes(mdev)); + } +} + +static void fec_set_corrected_bits_total(struct mlx5e_priv *priv, + struct ethtool_fec_stats *fec_stats) +{ + u32 ppcnt_phy_statistical[MLX5_ST_SZ_DW(ppcnt_reg)]; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + MLX5_SET(ppcnt_reg, in, local_port, 1); + MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP); + if (mlx5_core_access_reg(mdev, in, sz, ppcnt_phy_statistical, + sz, MLX5_REG_PPCNT, 0, 0)) + return; + + fec_stats->corrected_bits.total = + MLX5E_READ_CTR64_BE_F(ppcnt_phy_statistical, + phys_layer_statistical_cntrs, + phy_corrected_bits); +} + +void mlx5e_stats_fec_get(struct mlx5e_priv *priv, + struct ethtool_fec_stats *fec_stats) +{ + if (!MLX5_CAP_PCAM_FEATURE(priv->mdev, ppcnt_statistical_group)) + return; + + fec_set_corrected_bits_total(priv, fec_stats); + fec_set_block_stats(priv, fec_stats); +} + +#define PPORT_ETH_EXT_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_extended_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pport_eth_ext_stats_desc[] = { + { "rx_buffer_passed_thres_phy", PPORT_ETH_EXT_OFF(rx_buffer_almost_full) }, +}; + +#define NUM_PPORT_ETH_EXT_COUNTERS ARRAY_SIZE(pport_eth_ext_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(eth_ext) +{ + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters)) + return NUM_PPORT_ETH_EXT_COUNTERS; + + return 0; +} + 
+static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(eth_ext) +{ + int i; + + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters)) + for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_eth_ext_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(eth_ext) +{ + int i; + + if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters)) + for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.eth_ext_counters, + pport_eth_ext_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(eth_ext) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, rx_buffer_fullness_counters)) + return; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + out = pstats->eth_ext_counters; + MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); +} + +#define PCIE_PERF_OFF(c) \ + MLX5_BYTE_OFF(mpcnt_reg, counter_set.pcie_perf_cntrs_grp_data_layout.c) +static const struct counter_desc pcie_perf_stats_desc[] = { + { "rx_pci_signal_integrity", PCIE_PERF_OFF(rx_errors) }, + { "tx_pci_signal_integrity", PCIE_PERF_OFF(tx_errors) }, +}; + +#define PCIE_PERF_OFF64(c) \ + MLX5_BYTE_OFF(mpcnt_reg, counter_set.pcie_perf_cntrs_grp_data_layout.c##_high) +static const struct counter_desc pcie_perf_stats_desc64[] = { + { "outbound_pci_buffer_overflow", PCIE_PERF_OFF64(tx_overflow_buffer_pkt) }, +}; + +static const struct counter_desc pcie_perf_stall_stats_desc[] = { + { "outbound_pci_stalled_rd", PCIE_PERF_OFF(outbound_stalled_reads) }, + { "outbound_pci_stalled_wr", PCIE_PERF_OFF(outbound_stalled_writes) }, + { "outbound_pci_stalled_rd_events", PCIE_PERF_OFF(outbound_stalled_reads_events) }, + { "outbound_pci_stalled_wr_events", PCIE_PERF_OFF(outbound_stalled_writes_events) }, +}; + +#define NUM_PCIE_PERF_COUNTERS ARRAY_SIZE(pcie_perf_stats_desc) +#define NUM_PCIE_PERF_COUNTERS64 ARRAY_SIZE(pcie_perf_stats_desc64) +#define NUM_PCIE_PERF_STALL_COUNTERS ARRAY_SIZE(pcie_perf_stall_stats_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pcie) +{ + int num_stats = 0; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group)) + num_stats += NUM_PCIE_PERF_COUNTERS; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt)) + num_stats += NUM_PCIE_PERF_COUNTERS64; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled)) + num_stats += NUM_PCIE_PERF_STALL_COUNTERS; + + return num_stats; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pcie) +{ + int i; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pcie_perf_stats_desc[i].format); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS64; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pcie_perf_stats_desc64[i].format); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled)) + for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pcie_perf_stall_stats_desc[i].format); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pcie) +{ + int i; + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, 
pcie_performance_group)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters, + pcie_perf_stats_desc, i); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt)) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS64; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pcie.pcie_perf_counters, + pcie_perf_stats_desc64, i); + + if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled)) + for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters, + pcie_perf_stall_stats_desc, i); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pcie) +{ + struct mlx5e_pcie_stats *pcie_stats = &priv->stats.pcie; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(mpcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(mpcnt_reg); + void *out; + + if (!MLX5_CAP_MCAM_FEATURE(mdev, pcie_performance_group)) + return; + + out = pcie_stats->pcie_perf_counters; + MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); +} + +#define PPORT_PER_TC_PRIO_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_per_tc_prio_grp_data_layout.c##_high) + +static const struct counter_desc pport_per_tc_prio_stats_desc[] = { + { "rx_prio%d_buf_discard", PPORT_PER_TC_PRIO_OFF(no_buffer_discard_uc) }, +}; + +#define NUM_PPORT_PER_TC_PRIO_COUNTERS ARRAY_SIZE(pport_per_tc_prio_stats_desc) + +#define PPORT_PER_TC_CONGEST_PRIO_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_per_tc_congest_prio_grp_data_layout.c##_high) + +static const struct counter_desc pport_per_tc_congest_prio_stats_desc[] = { + { "rx_prio%d_cong_discard", PPORT_PER_TC_CONGEST_PRIO_OFF(wred_discard) }, + { "rx_prio%d_marked", PPORT_PER_TC_CONGEST_PRIO_OFF(ecn_marked_tc) }, +}; + +#define NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS \ + ARRAY_SIZE(pport_per_tc_congest_prio_stats_desc) + +static int mlx5e_grp_per_tc_prio_get_num_stats(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return 0; + + return NUM_PPORT_PER_TC_PRIO_COUNTERS * NUM_PPORT_PRIO; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(per_port_buff_congest) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int i, prio; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return idx; + + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + for (i = 0; i < NUM_PPORT_PER_TC_PRIO_COUNTERS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_tc_prio_stats_desc[i].format, prio); + for (i = 0; i < NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_tc_congest_prio_stats_desc[i].format, prio); + } + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(per_port_buff_congest) +{ + struct mlx5e_pport_stats *pport = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + int i, prio; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return idx; + + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + for (i = 0; i < NUM_PPORT_PER_TC_PRIO_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&pport->per_tc_prio_counters[prio], + pport_per_tc_prio_stats_desc, i); + for (i = 0; i < NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS ; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&pport->per_tc_congest_prio_counters[prio], + pport_per_tc_congest_prio_stats_desc, i); + } + + return idx; +} + +static void mlx5e_grp_per_tc_prio_update_stats(struct mlx5e_priv *priv) +{ + 
struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + int prio; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return; + + MLX5_SET(ppcnt_reg, in, pnat, 2); + MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP); + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + out = pstats->per_tc_prio_counters[prio]; + MLX5_SET(ppcnt_reg, in, prio_tc, prio); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + } +} + +static int mlx5e_grp_per_tc_congest_prio_get_num_stats(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return 0; + + return NUM_PPORT_PER_TC_CONGEST_PRIO_COUNTERS * NUM_PPORT_PRIO; +} + +static void mlx5e_grp_per_tc_congest_prio_update_stats(struct mlx5e_priv *priv) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + void *out; + int prio; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return; + + MLX5_SET(ppcnt_reg, in, pnat, 2); + MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_TRAFFIC_CLASS_CONGESTION_GROUP); + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + out = pstats->per_tc_congest_prio_counters[prio]; + MLX5_SET(ppcnt_reg, in, prio_tc, prio); + mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); + } +} + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(per_port_buff_congest) +{ + return mlx5e_grp_per_tc_prio_get_num_stats(priv) + + mlx5e_grp_per_tc_congest_prio_get_num_stats(priv); +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(per_port_buff_congest) +{ + mlx5e_grp_per_tc_prio_update_stats(priv); + mlx5e_grp_per_tc_congest_prio_update_stats(priv); +} + +#define PPORT_PER_PRIO_OFF(c) \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_per_prio_grp_data_layout.c##_high) +static const struct counter_desc pport_per_prio_traffic_stats_desc[] = { + { "rx_prio%d_bytes", PPORT_PER_PRIO_OFF(rx_octets) }, + { "rx_prio%d_packets", PPORT_PER_PRIO_OFF(rx_frames) }, + { "rx_prio%d_discards", PPORT_PER_PRIO_OFF(rx_discards) }, + { "tx_prio%d_bytes", PPORT_PER_PRIO_OFF(tx_octets) }, + { "tx_prio%d_packets", PPORT_PER_PRIO_OFF(tx_frames) }, +}; + +#define NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS ARRAY_SIZE(pport_per_prio_traffic_stats_desc) + +static int mlx5e_grp_per_prio_traffic_get_num_stats(void) +{ + return NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS * NUM_PPORT_PRIO; +} + +static int mlx5e_grp_per_prio_traffic_fill_strings(struct mlx5e_priv *priv, + u8 *data, + int idx) +{ + int i, prio; + + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_prio_traffic_stats_desc[i].format, prio); + } + + return idx; +} + +static int mlx5e_grp_per_prio_traffic_fill_stats(struct mlx5e_priv *priv, + u64 *data, + int idx) +{ + int i, prio; + + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++) + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio], + pport_per_prio_traffic_stats_desc, i); + } + + return idx; +} + +static const struct counter_desc pport_per_prio_pfc_stats_desc[] = { + /* %s is "global" or "prio{i}" */ + { "rx_%s_pause", PPORT_PER_PRIO_OFF(rx_pause) }, + { "rx_%s_pause_duration", PPORT_PER_PRIO_OFF(rx_pause_duration) }, + { "tx_%s_pause", 
PPORT_PER_PRIO_OFF(tx_pause) }, + { "tx_%s_pause_duration", PPORT_PER_PRIO_OFF(tx_pause_duration) }, + { "rx_%s_pause_transition", PPORT_PER_PRIO_OFF(rx_pause_transition) }, +}; + +static const struct counter_desc pport_pfc_stall_stats_desc[] = { + { "tx_pause_storm_warning_events", PPORT_PER_PRIO_OFF(device_stall_minor_watermark_cnt) }, + { "tx_pause_storm_error_events", PPORT_PER_PRIO_OFF(device_stall_critical_watermark_cnt) }, +}; + +#define NUM_PPORT_PER_PRIO_PFC_COUNTERS ARRAY_SIZE(pport_per_prio_pfc_stats_desc) +#define NUM_PPORT_PFC_STALL_COUNTERS(priv) (ARRAY_SIZE(pport_pfc_stall_stats_desc) * \ + MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) * \ + MLX5_CAP_DEBUG((priv)->mdev, stall_detect)) + +static unsigned long mlx5e_query_pfc_combined(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u8 pfc_en_tx; + u8 pfc_en_rx; + int err; + + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return 0; + + err = mlx5_query_port_pfc(mdev, &pfc_en_tx, &pfc_en_rx); + + return err ? 0 : pfc_en_tx | pfc_en_rx; +} + +static bool mlx5e_query_global_pause_combined(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 rx_pause; + u32 tx_pause; + int err; + + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return false; + + err = mlx5_query_port_pause(mdev, &rx_pause, &tx_pause); + + return err ? false : rx_pause | tx_pause; +} + +static int mlx5e_grp_per_prio_pfc_get_num_stats(struct mlx5e_priv *priv) +{ + return (mlx5e_query_global_pause_combined(priv) + + hweight8(mlx5e_query_pfc_combined(priv))) * + NUM_PPORT_PER_PRIO_PFC_COUNTERS + + NUM_PPORT_PFC_STALL_COUNTERS(priv); +} + +static int mlx5e_grp_per_prio_pfc_fill_strings(struct mlx5e_priv *priv, + u8 *data, + int idx) +{ + unsigned long pfc_combined; + int i, prio; + + pfc_combined = mlx5e_query_pfc_combined(priv); + for_each_set_bit(prio, &pfc_combined, NUM_PPORT_PRIO) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + char pfc_string[ETH_GSTRING_LEN]; + + snprintf(pfc_string, sizeof(pfc_string), "prio%d", prio); + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_prio_pfc_stats_desc[i].format, pfc_string); + } + } + + if (mlx5e_query_global_pause_combined(priv)) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + sprintf(data + (idx++) * ETH_GSTRING_LEN, + pport_per_prio_pfc_stats_desc[i].format, "global"); + } + } + + for (i = 0; i < NUM_PPORT_PFC_STALL_COUNTERS(priv); i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_pfc_stall_stats_desc[i].format); + + return idx; +} + +static int mlx5e_grp_per_prio_pfc_fill_stats(struct mlx5e_priv *priv, + u64 *data, + int idx) +{ + unsigned long pfc_combined; + int i, prio; + + pfc_combined = mlx5e_query_pfc_combined(priv); + for_each_set_bit(prio, &pfc_combined, NUM_PPORT_PRIO) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio], + pport_per_prio_pfc_stats_desc, i); + } + } + + if (mlx5e_query_global_pause_combined(priv)) { + for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) { + data[idx++] = + MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[0], + pport_per_prio_pfc_stats_desc, i); + } + } + + for (i = 0; i < NUM_PPORT_PFC_STALL_COUNTERS(priv); i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[0], + pport_pfc_stall_stats_desc, i); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(per_prio) +{ + return mlx5e_grp_per_prio_traffic_get_num_stats() + + 
mlx5e_grp_per_prio_pfc_get_num_stats(priv); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(per_prio) +{ + idx = mlx5e_grp_per_prio_traffic_fill_strings(priv, data, idx); + idx = mlx5e_grp_per_prio_pfc_fill_strings(priv, data, idx); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(per_prio) +{ + idx = mlx5e_grp_per_prio_traffic_fill_stats(priv, data, idx); + idx = mlx5e_grp_per_prio_pfc_fill_stats(priv, data, idx); + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(per_prio) +{ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + int prio; + void *out; + + if (!MLX5_BASIC_PPCNT_SUPPORTED(mdev)) + return; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP); + for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { + out = pstats->per_prio_counters[prio]; + MLX5_SET(ppcnt_reg, in, prio_tc, prio); + mlx5_core_access_reg(mdev, in, sz, out, sz, + MLX5_REG_PPCNT, 0, 0); + } +} + +static const struct counter_desc mlx5e_pme_status_desc[] = { + { "module_unplug", sizeof(u64) * MLX5_MODULE_STATUS_UNPLUGGED }, +}; + +static const struct counter_desc mlx5e_pme_error_desc[] = { + { "module_bus_stuck", sizeof(u64) * MLX5_MODULE_EVENT_ERROR_BUS_STUCK }, + { "module_high_temp", sizeof(u64) * MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE }, + { "module_bad_shorted", sizeof(u64) * MLX5_MODULE_EVENT_ERROR_BAD_CABLE }, +}; + +#define NUM_PME_STATUS_STATS ARRAY_SIZE(mlx5e_pme_status_desc) +#define NUM_PME_ERR_STATS ARRAY_SIZE(mlx5e_pme_error_desc) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pme) +{ + return NUM_PME_STATUS_STATS + NUM_PME_ERR_STATS; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pme) +{ + int i; + + for (i = 0; i < NUM_PME_STATUS_STATS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, mlx5e_pme_status_desc[i].format); + + for (i = 0; i < NUM_PME_ERR_STATS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, mlx5e_pme_error_desc[i].format); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pme) +{ + struct mlx5_pme_stats pme_stats; + int i; + + mlx5_get_pme_stats(priv->mdev, &pme_stats); + + for (i = 0; i < NUM_PME_STATUS_STATS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(pme_stats.status_counters, + mlx5e_pme_status_desc, i); + + for (i = 0; i < NUM_PME_ERR_STATS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(pme_stats.error_counters, + mlx5e_pme_error_desc, i); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pme) { return; } + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(tls) +{ + return mlx5e_tls_get_count(priv); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(tls) +{ + return idx + mlx5e_tls_get_strings(priv, data + idx * ETH_GSTRING_LEN); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(tls) +{ + return idx + mlx5e_tls_get_stats(priv, data + idx); +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(tls) { return; } + +static const struct counter_desc rq_stats_desc[] = { + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, bytes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete_tail) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete_tail_slow) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, + { 
MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_redirect) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_bytes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_bytes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_skbs) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_match_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, gro_large_hds) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, ecn_mark) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, removed_vlan_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, wqe_err) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, mpwqe_filler_cqes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, mpwqe_filler_strides) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, oversize_pkts_sw_drop) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, buff_alloc_err) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_blks) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_reuse) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_full) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_empty) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_busy) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_waive) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_ext) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_rdc) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_alloc) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, congst_umr) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_err) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, recover) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, pet_hdr_lookup_drop) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, pet_mdata_lookup_drop) }, +#ifdef CONFIG_MLX5_EN_TLS + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_decrypted_packets) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_decrypted_bytes) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_resync_req_pkt) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_resync_req_start) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_resync_req_end) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_resync_req_skip) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_resync_res_ok) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_resync_res_retry) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_resync_res_skip) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, tls_err) }, +#endif +}; + +static const struct counter_desc sq_stats_desc[] = { + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) }, + { 
MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, mpwqe_blks) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, mpwqe_pkts) }, +#ifdef CONFIG_MLX5_EN_TLS + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_ooo) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_dump_packets) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_dump_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_resync_bytes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_skip_no_sync_data) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_drop_no_sync_data) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_drop_bypass_req) }, +#endif + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, stopped) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, dropped) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, xmit_more) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, recover) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqes) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, +}; + +static const struct counter_desc rq_xdpsq_stats_desc[] = { + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, xmit) }, + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, mpwqe) }, + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, inlnw) }, + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, nops) }, + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, full) }, + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, err) }, + { MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) }, +}; + +static const struct counter_desc xdpsq_stats_desc[] = { + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, xmit) }, + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, mpwqe) }, + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, inlnw) }, + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, nops) }, + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, full) }, + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, err) }, + { MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) }, +}; + +static const struct counter_desc xskrq_stats_desc[] = { + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, packets) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, bytes) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_complete) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_none) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, ecn_mark) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, removed_vlan_packets) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, xdp_redirect) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, wqe_err) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, mpwqe_filler_cqes) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, mpwqe_filler_strides) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, oversize_pkts_sw_drop) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, buff_alloc_err) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct 
mlx5e_rq_stats, cqe_compress_blks) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, congst_umr) }, + { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, arfs_err) }, +}; + +static const struct counter_desc xsksq_stats_desc[] = { + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, xmit) }, + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, mpwqe) }, + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, inlnw) }, + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, full) }, + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, err) }, + { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, cqes) }, +}; + +static const struct counter_desc ch_stats_desc[] = { + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, events) }, + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, poll) }, + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, arm) }, + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, aff_change) }, + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, force_irq) }, + { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, eq_rearm) }, +}; + +static const struct counter_desc ptp_sq_stats_desc[] = { + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, packets) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, bytes) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, csum_partial) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, nop) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, csum_none) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, stopped) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, dropped) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, xmit_more) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, recover) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, cqes) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, wake) }, + { MLX5E_DECLARE_PTP_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, +}; + +static const struct counter_desc ptp_ch_stats_desc[] = { + { MLX5E_DECLARE_PTP_CH_STAT(struct mlx5e_ch_stats, events) }, + { MLX5E_DECLARE_PTP_CH_STAT(struct mlx5e_ch_stats, poll) }, + { MLX5E_DECLARE_PTP_CH_STAT(struct mlx5e_ch_stats, arm) }, + { MLX5E_DECLARE_PTP_CH_STAT(struct mlx5e_ch_stats, eq_rearm) }, +}; + +static const struct counter_desc ptp_cq_stats_desc[] = { + { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, cqe) }, + { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, err_cqe) }, + { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, abort) }, + { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, abort_abs_diff_ns) }, + { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, resync_cqe) }, + { MLX5E_DECLARE_PTP_CQ_STAT(struct mlx5e_ptp_cq_stats, resync_event) }, +}; + +static const struct counter_desc ptp_rq_stats_desc[] = { + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, packets) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, bytes) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_complete) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_complete_tail) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_complete_tail_slow) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_unnecessary) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_none) 
}, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, xdp_drop) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, xdp_redirect) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, lro_packets) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, lro_bytes) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, ecn_mark) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, removed_vlan_packets) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, wqe_err) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, mpwqe_filler_cqes) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, mpwqe_filler_strides) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, oversize_pkts_sw_drop) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, buff_alloc_err) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cqe_compress_blks) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cache_reuse) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cache_full) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cache_empty) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cache_busy) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cache_waive) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, congst_umr) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, arfs_err) }, + { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, recover) }, +}; + +static const struct counter_desc qos_sq_stats_desc[] = { + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_partial) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, nop) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, mpwqe_blks) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, mpwqe_pkts) }, +#ifdef CONFIG_MLX5_EN_TLS + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_ooo) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_dump_packets) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_dump_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_resync_bytes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_skip_no_sync_data) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_drop_no_sync_data) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, tls_drop_bypass_req) }, +#endif + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, csum_none) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, stopped) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, dropped) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, xmit_more) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, recover) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqes) }, + { MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, wake) }, + { 
MLX5E_DECLARE_QOS_TX_STAT(struct mlx5e_sq_stats, cqe_err) }, +}; + +#define NUM_RQ_STATS ARRAY_SIZE(rq_stats_desc) +#define NUM_SQ_STATS ARRAY_SIZE(sq_stats_desc) +#define NUM_XDPSQ_STATS ARRAY_SIZE(xdpsq_stats_desc) +#define NUM_RQ_XDPSQ_STATS ARRAY_SIZE(rq_xdpsq_stats_desc) +#define NUM_XSKRQ_STATS ARRAY_SIZE(xskrq_stats_desc) +#define NUM_XSKSQ_STATS ARRAY_SIZE(xsksq_stats_desc) +#define NUM_CH_STATS ARRAY_SIZE(ch_stats_desc) +#define NUM_PTP_SQ_STATS ARRAY_SIZE(ptp_sq_stats_desc) +#define NUM_PTP_CH_STATS ARRAY_SIZE(ptp_ch_stats_desc) +#define NUM_PTP_CQ_STATS ARRAY_SIZE(ptp_cq_stats_desc) +#define NUM_PTP_RQ_STATS ARRAY_SIZE(ptp_rq_stats_desc) +#define NUM_QOS_SQ_STATS ARRAY_SIZE(qos_sq_stats_desc) +#define MLX5E_PER_CHANNEL_STATS(priv) \ + (priv->max_nch * MLX5E_GET_PFLAG(&(priv)->channels.params, MLX5E_PFLAG_PER_CH_STATS)) + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(qos) +{ + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + return NUM_QOS_SQ_STATS * smp_load_acquire(&priv->htb.max_qos_sqs); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(qos) +{ + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + u16 max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs); + int i, qid; + + for (qid = 0; qid < max_qos_sqs; qid++) + for (i = 0; i < NUM_QOS_SQ_STATS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + qos_sq_stats_desc[i].format, qid); + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(qos) +{ + struct mlx5e_sq_stats **stats; + u16 max_qos_sqs; + int i, qid; + + /* Pairs with smp_store_release in mlx5e_open_qos_sq. */ + max_qos_sqs = smp_load_acquire(&priv->htb.max_qos_sqs); + stats = READ_ONCE(priv->htb.qos_sq_stats); + + for (qid = 0; qid < max_qos_sqs; qid++) { + struct mlx5e_sq_stats *s = READ_ONCE(stats[qid]); + + for (i = 0; i < NUM_QOS_SQ_STATS; i++) + data[idx++] = MLX5E_READ_CTR64_CPU(s, qos_sq_stats_desc, i); + } + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(qos) { return; } + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(ptp) +{ + int num = NUM_PTP_CH_STATS; + + if (!priv->tx_ptp_opened && !priv->rx_ptp_opened) + return 0; + + if (priv->tx_ptp_opened) + num += (NUM_PTP_SQ_STATS + NUM_PTP_CQ_STATS) * priv->max_opened_tc; + if (priv->rx_ptp_opened) + num += NUM_PTP_RQ_STATS; + + return num; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ptp) +{ + int i, tc; + + if (!priv->tx_ptp_opened && !priv->rx_ptp_opened) + return idx; + + for (i = 0; i < NUM_PTP_CH_STATS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + "%s", ptp_ch_stats_desc[i].format); + + if (priv->tx_ptp_opened) { + for (tc = 0; tc < priv->max_opened_tc; tc++) + for (i = 0; i < NUM_PTP_SQ_STATS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + ptp_sq_stats_desc[i].format, tc); + + for (tc = 0; tc < priv->max_opened_tc; tc++) + for (i = 0; i < NUM_PTP_CQ_STATS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + ptp_cq_stats_desc[i].format, tc); + } + if (priv->rx_ptp_opened) { + for (i = 0; i < NUM_PTP_RQ_STATS; i++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + ptp_rq_stats_desc[i].format, MLX5E_PTP_CHANNEL_IX); + } + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(ptp) +{ + int i, tc; + + if (!priv->tx_ptp_opened && !priv->rx_ptp_opened) + return idx; + + for (i = 0; i < NUM_PTP_CH_STATS; i++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->ptp_stats.ch, + ptp_ch_stats_desc, i); + + if (priv->tx_ptp_opened) { + for (tc = 0; tc < priv->max_opened_tc; tc++) + for (i = 0; i < NUM_PTP_SQ_STATS; i++) + data[idx++] = + 
MLX5E_READ_CTR64_CPU(&priv->ptp_stats.sq[tc], + ptp_sq_stats_desc, i); + + for (tc = 0; tc < priv->max_opened_tc; tc++) + for (i = 0; i < NUM_PTP_CQ_STATS; i++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->ptp_stats.cq[tc], + ptp_cq_stats_desc, i); + } + if (priv->rx_ptp_opened) { + for (i = 0; i < NUM_PTP_RQ_STATS; i++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->ptp_stats.rq, + ptp_rq_stats_desc, i); + } + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ptp) { return; } + +static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels) +{ + int max_nch = priv->stats_nch; + + return (NUM_RQ_STATS * max_nch) + + (NUM_CH_STATS * max_nch) + + (NUM_SQ_STATS * max_nch * priv->max_opened_tc) + + (NUM_RQ_XDPSQ_STATS * max_nch) + + (NUM_XDPSQ_STATS * max_nch) + + (NUM_XSKRQ_STATS * max_nch * priv->xsk.ever_used) + + (NUM_XSKSQ_STATS * max_nch * priv->xsk.ever_used); +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels) +{ + bool is_xsk = priv->xsk.ever_used; + int max_nch = priv->stats_nch; + int i, j, tc; + + for (i = 0; i < max_nch; i++) + for (j = 0; j < NUM_CH_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + ch_stats_desc[j].format, i); + + for (i = 0; i < max_nch; i++) { + for (j = 0; j < NUM_RQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + rq_stats_desc[j].format, i); + for (j = 0; j < NUM_XSKRQ_STATS * is_xsk; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + xskrq_stats_desc[j].format, i); + for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + rq_xdpsq_stats_desc[j].format, i); + } + + for (tc = 0; tc < priv->max_opened_tc; tc++) + for (i = 0; i < max_nch; i++) + for (j = 0; j < NUM_SQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + sq_stats_desc[j].format, + i + tc * max_nch); + + for (i = 0; i < max_nch; i++) { + for (j = 0; j < NUM_XSKSQ_STATS * is_xsk; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + xsksq_stats_desc[j].format, i); + for (j = 0; j < NUM_XDPSQ_STATS; j++) + sprintf(data + (idx++) * ETH_GSTRING_LEN, + xdpsq_stats_desc[j].format, i); + } + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(channels) +{ + bool is_xsk = priv->xsk.ever_used; + int max_nch = priv->stats_nch; + int i, j, tc; + + for (i = 0; i < max_nch; i++) + for (j = 0; j < NUM_CH_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->ch, + ch_stats_desc, j); + + for (i = 0; i < max_nch; i++) { + for (j = 0; j < NUM_RQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->rq, + rq_stats_desc, j); + for (j = 0; j < NUM_XSKRQ_STATS * is_xsk; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->xskrq, + xskrq_stats_desc, j); + for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->rq_xdpsq, + rq_xdpsq_stats_desc, j); + } + + for (tc = 0; tc < priv->max_opened_tc; tc++) + for (i = 0; i < max_nch; i++) + for (j = 0; j < NUM_SQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->sq[tc], + sq_stats_desc, j); + + for (i = 0; i < max_nch; i++) { + for (j = 0; j < NUM_XSKSQ_STATS * is_xsk; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->xsksq, + xsksq_stats_desc, j); + for (j = 0; j < NUM_XDPSQ_STATS; j++) + data[idx++] = + MLX5E_READ_CTR64_CPU(&priv->channel_stats[i]->xdpsq, + xdpsq_stats_desc, j); + } + + return idx; +} + +static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(channels) { return; } + +MLX5E_DEFINE_STATS_GRP(sw, 0); +MLX5E_DEFINE_STATS_GRP(qcnt, 
MLX5E_NDO_UPDATE_STATS); +MLX5E_DEFINE_STATS_GRP(vnic_env, 0); +MLX5E_DEFINE_STATS_GRP(vport, MLX5E_NDO_UPDATE_STATS); +MLX5E_DEFINE_STATS_GRP(802_3, MLX5E_NDO_UPDATE_STATS); +MLX5E_DEFINE_STATS_GRP(2863, 0); +MLX5E_DEFINE_STATS_GRP(2819, 0); +MLX5E_DEFINE_STATS_GRP(phy, 0); +MLX5E_DEFINE_STATS_GRP(pcie, 0); +MLX5E_DEFINE_STATS_GRP(per_prio, 0); +MLX5E_DEFINE_STATS_GRP(pme, 0); +MLX5E_DEFINE_STATS_GRP(channels, 0); +MLX5E_DEFINE_STATS_GRP(per_port_buff_congest, 0); +MLX5E_DEFINE_STATS_GRP(eth_ext, 0); +static MLX5E_DEFINE_STATS_GRP(tls, 0); +MLX5E_DEFINE_STATS_GRP(ptp, 0); +static MLX5E_DEFINE_STATS_GRP(qos, 0); + +/* The stats groups order is opposite to the update_stats() order calls */ +mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = { + &MLX5E_STATS_GRP(sw), + &MLX5E_STATS_GRP(qcnt), + &MLX5E_STATS_GRP(vnic_env), + &MLX5E_STATS_GRP(vport), + &MLX5E_STATS_GRP(802_3), + &MLX5E_STATS_GRP(2863), + &MLX5E_STATS_GRP(2819), + &MLX5E_STATS_GRP(phy), + &MLX5E_STATS_GRP(eth_ext), + &MLX5E_STATS_GRP(pcie), + &MLX5E_STATS_GRP(per_prio), + &MLX5E_STATS_GRP(pme), +#ifdef CONFIG_MLX5_EN_IPSEC + &MLX5E_STATS_GRP(ipsec_sw), + &MLX5E_STATS_GRP(ipsec_hw), +#endif + &MLX5E_STATS_GRP(tls), + &MLX5E_STATS_GRP(channels), + &MLX5E_STATS_GRP(per_port_buff_congest), + &MLX5E_STATS_GRP(ptp), + &MLX5E_STATS_GRP(qos), +#ifdef CONFIG_MLX5_EN_MACSEC + &MLX5E_STATS_GRP(macsec_hw), +#endif +}; + +unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5e_nic_stats_grps); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h new file mode 100644 index 0000000..a76e92a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_EN_STATS_H__ +#define __MLX5_EN_STATS_H__ + +#define MLX5E_READ_CTR64_CPU(ptr, dsc, i) \ + (*(u64 *)((char *)ptr + dsc[i].offset)) +#define MLX5E_READ_CTR64_BE(ptr, dsc, i) \ + be64_to_cpu(*(__be64 *)((char *)ptr + dsc[i].offset)) +#define MLX5E_READ_CTR32_CPU(ptr, dsc, i) \ + (*(u32 *)((char *)ptr + dsc[i].offset)) +#define MLX5E_READ_CTR32_BE(ptr, dsc, i) \ + be32_to_cpu(*(__be32 *)((char *)ptr + dsc[i].offset)) + +#define MLX5E_DECLARE_STAT(type, fld) #fld, offsetof(type, fld) +#define MLX5E_DECLARE_RX_STAT(type, fld) "rx%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_TX_STAT(type, fld) "tx%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_XDPSQ_STAT(type, fld) "tx%d_xdp_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_RQ_XDPSQ_STAT(type, fld) "rx%d_xdp_tx_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_XSKRQ_STAT(type, fld) "rx%d_xsk_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_XSKSQ_STAT(type, fld) "tx%d_xsk_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_CH_STAT(type, fld) "ch%d_"#fld, offsetof(type, fld) + +#define MLX5E_DECLARE_PTP_TX_STAT(type, fld) "ptp_tx%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_PTP_CH_STAT(type, fld) "ptp_ch_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_PTP_CQ_STAT(type, fld) "ptp_cq%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld) + +#define MLX5E_DECLARE_QOS_TX_STAT(type, fld) "qos_tx%d_"#fld, offsetof(type, fld) + +struct counter_desc { + char format[ETH_GSTRING_LEN]; + size_t offset; /* Byte offset */ +}; + +enum { + MLX5E_NDO_UPDATE_STATS = BIT(0x1), +}; + +struct mlx5e_priv; +struct mlx5e_stats_grp { + u16 update_stats_mask; + int (*get_num_stats)(struct mlx5e_priv *priv); + int (*fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx); + int (*fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx); + void (*update_stats)(struct mlx5e_priv *priv); +}; + +typedef const struct mlx5e_stats_grp *const mlx5e_stats_grp_t; + +#define MLX5E_STATS_GRP_OP(grp, name) mlx5e_stats_grp_ ## grp ## _ ## name + +#define MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(grp) \ + int MLX5E_STATS_GRP_OP(grp, num_stats)(struct mlx5e_priv *priv) + +#define MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(grp) \ + void MLX5E_STATS_GRP_OP(grp, update_stats)(struct mlx5e_priv *priv) + +#define MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(grp) \ + int MLX5E_STATS_GRP_OP(grp, fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx) + +#define MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(grp) \ + int MLX5E_STATS_GRP_OP(grp, fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx) + +#define MLX5E_STATS_GRP(grp) mlx5e_stats_grp_ ## grp + +#define MLX5E_DECLARE_STATS_GRP(grp) \ + const struct mlx5e_stats_grp MLX5E_STATS_GRP(grp) + +#define MLX5E_DEFINE_STATS_GRP(grp, mask) \ +MLX5E_DECLARE_STATS_GRP(grp) = { \ + .get_num_stats = MLX5E_STATS_GRP_OP(grp, num_stats), \ + .fill_stats = MLX5E_STATS_GRP_OP(grp, fill_stats), \ + .fill_strings = MLX5E_STATS_GRP_OP(grp, fill_strings), \ + .update_stats = MLX5E_STATS_GRP_OP(grp, update_stats), \ + .update_stats_mask = mask, \ +} + +unsigned int mlx5e_stats_total_num(struct mlx5e_priv *priv); +void mlx5e_stats_update(struct mlx5e_priv *priv); +void mlx5e_stats_fill(struct mlx5e_priv *priv, u64 *data, int idx); +void mlx5e_stats_fill_strings(struct mlx5e_priv *priv, u8 *data); +void mlx5e_stats_update_ndo_stats(struct mlx5e_priv *priv); + +void mlx5e_stats_pause_get(struct mlx5e_priv *priv, + struct ethtool_pause_stats *pause_stats); +void 
mlx5e_stats_fec_get(struct mlx5e_priv *priv, + struct ethtool_fec_stats *fec_stats); + +void mlx5e_stats_eth_phy_get(struct mlx5e_priv *priv, + struct ethtool_eth_phy_stats *phy_stats); +void mlx5e_stats_eth_mac_get(struct mlx5e_priv *priv, + struct ethtool_eth_mac_stats *mac_stats); +void mlx5e_stats_eth_ctrl_get(struct mlx5e_priv *priv, + struct ethtool_eth_ctrl_stats *ctrl_stats); +void mlx5e_stats_rmon_get(struct mlx5e_priv *priv, + struct ethtool_rmon_stats *rmon, + const struct ethtool_rmon_hist_range **ranges); + +/* Concrete NIC Stats */ + +struct mlx5e_sw_stats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + u64 tx_tso_packets; + u64 tx_tso_bytes; + u64 tx_tso_inner_packets; + u64 tx_tso_inner_bytes; + u64 tx_added_vlan_packets; + u64 tx_nop; + u64 tx_mpwqe_blks; + u64 tx_mpwqe_pkts; + u64 rx_lro_packets; + u64 rx_lro_bytes; + u64 rx_gro_packets; + u64 rx_gro_bytes; + u64 rx_gro_skbs; + u64 rx_gro_match_packets; + u64 rx_gro_large_hds; + u64 rx_mcast_packets; + u64 rx_ecn_mark; + u64 rx_removed_vlan_packets; + u64 rx_csum_unnecessary; + u64 rx_csum_none; + u64 rx_csum_complete; + u64 rx_csum_complete_tail; + u64 rx_csum_complete_tail_slow; + u64 rx_csum_unnecessary_inner; + u64 rx_xdp_drop; + u64 rx_xdp_redirect; + u64 rx_xdp_tx_xmit; + u64 rx_xdp_tx_mpwqe; + u64 rx_xdp_tx_inlnw; + u64 rx_xdp_tx_nops; + u64 rx_xdp_tx_full; + u64 rx_xdp_tx_err; + u64 rx_xdp_tx_cqe; + u64 tx_csum_none; + u64 tx_csum_partial; + u64 tx_csum_partial_inner; + u64 tx_queue_stopped; + u64 tx_queue_dropped; + u64 tx_xmit_more; + u64 tx_recover; + u64 tx_cqes; + u64 tx_queue_wake; + u64 tx_cqe_err; + u64 tx_xdp_xmit; + u64 tx_xdp_mpwqe; + u64 tx_xdp_inlnw; + u64 tx_xdp_nops; + u64 tx_xdp_full; + u64 tx_xdp_err; + u64 tx_xdp_cqes; + u64 rx_wqe_err; + u64 rx_mpwqe_filler_cqes; + u64 rx_mpwqe_filler_strides; + u64 rx_oversize_pkts_sw_drop; + u64 rx_buff_alloc_err; + u64 rx_cqe_compress_blks; + u64 rx_cqe_compress_pkts; + u64 rx_cache_reuse; + u64 rx_cache_full; + u64 rx_cache_empty; + u64 rx_cache_busy; + u64 rx_cache_ext; + u64 rx_cache_rdc; + u64 rx_cache_alloc; + u64 rx_cache_waive; + u64 rx_congst_umr; + u64 rx_arfs_err; + u64 rx_recover; + u64 rx_pet_hdr_lookup_drop; + u64 rx_pet_mdata_lookup_drop; + u64 ch_events; + u64 ch_poll; + u64 ch_arm; + u64 ch_aff_change; + u64 ch_force_irq; + u64 ch_eq_rearm; + +#ifdef CONFIG_MLX5_EN_TLS + u64 tx_tls_encrypted_packets; + u64 tx_tls_encrypted_bytes; + u64 tx_tls_ooo; + u64 tx_tls_dump_packets; + u64 tx_tls_dump_bytes; + u64 tx_tls_resync_bytes; + u64 tx_tls_skip_no_sync_data; + u64 tx_tls_drop_no_sync_data; + u64 tx_tls_drop_bypass_req; + + u64 rx_tls_decrypted_packets; + u64 rx_tls_decrypted_bytes; + u64 rx_tls_resync_req_pkt; + u64 rx_tls_resync_req_start; + u64 rx_tls_resync_req_end; + u64 rx_tls_resync_req_skip; + u64 rx_tls_resync_res_ok; + u64 rx_tls_resync_res_retry; + u64 rx_tls_resync_res_skip; + u64 rx_tls_err; +#endif + + u64 rx_xsk_packets; + u64 rx_xsk_bytes; + u64 rx_xsk_csum_complete; + u64 rx_xsk_csum_unnecessary; + u64 rx_xsk_csum_unnecessary_inner; + u64 rx_xsk_csum_none; + u64 rx_xsk_ecn_mark; + u64 rx_xsk_removed_vlan_packets; + u64 rx_xsk_xdp_drop; + u64 rx_xsk_xdp_redirect; + u64 rx_xsk_wqe_err; + u64 rx_xsk_mpwqe_filler_cqes; + u64 rx_xsk_mpwqe_filler_strides; + u64 rx_xsk_oversize_pkts_sw_drop; + u64 rx_xsk_buff_alloc_err; + u64 rx_xsk_cqe_compress_blks; + u64 rx_xsk_cqe_compress_pkts; + u64 rx_xsk_congst_umr; + u64 rx_xsk_arfs_err; + u64 tx_xsk_xmit; + u64 tx_xsk_mpwqe; + u64 tx_xsk_inlnw; + u64 tx_xsk_full; + u64 
tx_xsk_err; + u64 tx_xsk_cqes; +}; + +struct mlx5e_qcounter_stats { + u32 rx_out_of_buffer; + u32 rx_if_down_packets; +}; + +struct mlx5e_vnic_env_stats { + __be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)]; +}; + +#define VPORT_COUNTER_GET(vstats, c) MLX5_GET64(query_vport_counter_out, \ + vstats->query_vport_out, c) + +struct mlx5e_vport_stats { + __be64 query_vport_out[MLX5_ST_SZ_QW(query_vport_counter_out)]; +}; + +#define PPORT_802_3_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, pstats->IEEE_802_3_counters, \ + counter_set.eth_802_3_cntrs_grp_data_layout.c##_high) +#define PPORT_2863_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, pstats->RFC_2863_counters, \ + counter_set.eth_2863_cntrs_grp_data_layout.c##_high) +#define PPORT_2819_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, pstats->RFC_2819_counters, \ + counter_set.eth_2819_cntrs_grp_data_layout.c##_high) +#define PPORT_PHY_STATISTICAL_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, (pstats)->phy_statistical_counters, \ + counter_set.phys_layer_statistical_cntrs.c##_high) +#define PPORT_PER_PRIO_GET(pstats, prio, c) \ + MLX5_GET64(ppcnt_reg, pstats->per_prio_counters[prio], \ + counter_set.eth_per_prio_grp_data_layout.c##_high) +#define NUM_PPORT_PRIO 8 +#define PPORT_ETH_EXT_GET(pstats, c) \ + MLX5_GET64(ppcnt_reg, (pstats)->eth_ext_counters, \ + counter_set.eth_extended_cntrs_grp_data_layout.c##_high) + +struct mlx5e_pport_stats { + __be64 IEEE_802_3_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 per_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 phy_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 phy_statistical_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 eth_ext_counters[MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 per_tc_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)]; + __be64 per_tc_congest_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)]; +}; + +#define PCIE_PERF_GET(pcie_stats, c) \ + MLX5_GET(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \ + counter_set.pcie_perf_cntrs_grp_data_layout.c) + +#define PCIE_PERF_GET64(pcie_stats, c) \ + MLX5_GET64(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \ + counter_set.pcie_perf_cntrs_grp_data_layout.c##_high) + +struct mlx5e_pcie_stats { + __be64 pcie_perf_counters[MLX5_ST_SZ_QW(mpcnt_reg)]; +}; + +struct mlx5e_rq_stats { + u64 packets; + u64 bytes; + u64 csum_complete; + u64 csum_complete_tail; + u64 csum_complete_tail_slow; + u64 csum_unnecessary; + u64 csum_unnecessary_inner; + u64 csum_none; + u64 lro_packets; + u64 lro_bytes; + u64 gro_packets; + u64 gro_bytes; + u64 gro_skbs; + u64 gro_match_packets; + u64 gro_large_hds; + u64 mcast_packets; + u64 ecn_mark; + u64 removed_vlan_packets; + u64 xdp_drop; + u64 xdp_redirect; + u64 wqe_err; + u64 mpwqe_filler_cqes; + u64 mpwqe_filler_strides; + u64 oversize_pkts_sw_drop; + u64 buff_alloc_err; + u64 cqe_compress_blks; + u64 cqe_compress_pkts; + u64 cache_reuse; + u64 cache_full; + u64 cache_empty; + u64 cache_busy; + u64 cache_ext; + u64 cache_rdc; + u64 cache_alloc; + u64 cache_waive; + u64 congst_umr; + u64 arfs_err; + u64 recover; + u64 pet_hdr_lookup_drop; + u64 pet_mdata_lookup_drop; +#ifdef CONFIG_MLX5_EN_TLS + u64 tls_decrypted_packets; + u64 tls_decrypted_bytes; + u64 tls_resync_req_pkt; + u64 tls_resync_req_start; + u64 tls_resync_req_end; + u64 tls_resync_req_skip; + u64 tls_resync_res_ok; + u64 tls_resync_res_retry; + u64 tls_resync_res_skip; + u64 tls_err; +#endif +}; + +struct mlx5e_sq_stats { + /* commonly 
accessed in data path */ + u64 packets; + u64 bytes; + u64 xmit_more; + u64 tso_packets; + u64 tso_bytes; + u64 tso_inner_packets; + u64 tso_inner_bytes; + u64 csum_partial; + u64 csum_partial_inner; + u64 added_vlan_packets; + u64 nop; + u64 mpwqe_blks; + u64 mpwqe_pkts; +#ifdef CONFIG_MLX5_EN_TLS + u64 tls_encrypted_packets; + u64 tls_encrypted_bytes; + u64 tls_ooo; + u64 tls_dump_packets; + u64 tls_dump_bytes; + u64 tls_resync_bytes; + u64 tls_skip_no_sync_data; + u64 tls_drop_no_sync_data; + u64 tls_drop_bypass_req; +#endif + /* less likely accessed in data path */ + u64 csum_none; + u64 stopped; + u64 dropped; + u64 recover; + /* dirtied @completion */ + u64 cqes ____cacheline_aligned_in_smp; + u64 wake; + u64 cqe_err; +}; + +struct mlx5e_xdpsq_stats { + u64 xmit; + u64 mpwqe; + u64 inlnw; + u64 nops; + u64 full; + u64 err; + /* dirtied @completion */ + u64 cqes ____cacheline_aligned_in_smp; +}; + +struct mlx5e_ch_stats { + u64 events; + u64 poll; + u64 arm; + u64 aff_change; + u64 force_irq; + u64 eq_rearm; +}; + +struct mlx5e_ptp_cq_stats { + u64 cqe; + u64 err_cqe; + u64 abort; + u64 abort_abs_diff_ns; + u64 resync_cqe; + u64 resync_event; +}; + +struct mlx5e_stats { + struct mlx5e_sw_stats sw; + struct mlx5e_qcounter_stats qcnt; + struct mlx5e_vnic_env_stats vnic; + struct mlx5e_vport_stats vport; + struct mlx5e_pport_stats pport; + struct rtnl_link_stats64 vf_vport; + struct mlx5e_pcie_stats pcie; +}; + +extern mlx5e_stats_grp_t mlx5e_nic_stats_grps[]; +unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv); + +extern MLX5E_DECLARE_STATS_GRP(sw); +extern MLX5E_DECLARE_STATS_GRP(qcnt); +extern MLX5E_DECLARE_STATS_GRP(vnic_env); +extern MLX5E_DECLARE_STATS_GRP(vport); +extern MLX5E_DECLARE_STATS_GRP(802_3); +extern MLX5E_DECLARE_STATS_GRP(2863); +extern MLX5E_DECLARE_STATS_GRP(2819); +extern MLX5E_DECLARE_STATS_GRP(phy); +extern MLX5E_DECLARE_STATS_GRP(eth_ext); +extern MLX5E_DECLARE_STATS_GRP(pcie); +extern MLX5E_DECLARE_STATS_GRP(per_prio); +extern MLX5E_DECLARE_STATS_GRP(pme); +extern MLX5E_DECLARE_STATS_GRP(channels); +extern MLX5E_DECLARE_STATS_GRP(per_port_buff_congest); +extern MLX5E_DECLARE_STATS_GRP(ipsec_hw); +extern MLX5E_DECLARE_STATS_GRP(ipsec_sw); +extern MLX5E_DECLARE_STATS_GRP(ptp); +extern MLX5E_DECLARE_STATS_GRP(macsec_hw); + +#endif /* __MLX5_EN_STATS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_sysfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_sysfs.c new file mode 100644 index 0000000..abe6d87 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_sysfs.c @@ -0,0 +1,1608 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "en.h" +#include "en/port.h" +#include "en_tc.h" +#include "eswitch.h" +#include "en_ecn.h" +#ifdef CONFIG_MLX5_CORE_EN_DCB +#include "en/port_buffer.h" +#endif + +#define MLX5E_SKPRIOS_NUM 16 +#define MLX5E_GBPS_TO_KBPS 1000000 +#define MLX5E_100MBPS_TO_KBPS 100000 +#define set_kobj_mode(mdev) mlx5_core_is_pf(mdev) ? S_IWUSR | S_IRUGO : S_IRUGO + +static ssize_t mlx5e_show_tc_num(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct net_device *netdev = priv->netdev; + int len = 0; + len += sprintf(buf + len, "%d\n", netdev_get_num_tc(netdev)); + + return len; +} + +static ssize_t mlx5e_store_tc_num(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct net_device *netdev = priv->netdev; + struct tc_mqprio_qopt_offload mqprio = { 0 }; + int tc_num; + int err = 0; + + err = sscanf(buf, "%d", &tc_num); + + if (err != 1) + return -EINVAL; + + if (tc_num != MLX5E_MAX_NUM_TC && tc_num != MLX5E_MIN_NUM_TC) + return -EINVAL; + + rtnl_lock(); + netdev_set_num_tc(netdev, tc_num); + mqprio.qopt.num_tc = tc_num; + mlx5e_setup_tc_mqprio(priv, &mqprio); + rtnl_unlock(); + return count; +} + +static ssize_t mlx5e_show_maxrate(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + u8 max_bw_value[MLX5E_MAX_NUM_TC]; + u8 max_bw_unit[MLX5E_MAX_NUM_TC]; + int len = 0; + int ret; + int i; + + ret = mlx5_query_port_ets_rate_limit(priv->mdev,max_bw_value, + max_bw_unit); + if (ret) { + netdev_err(priv->netdev, "Failed to query port ets rate limit, ret = %d\n", ret); + return ret; + } + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + u64 maxrate = 0; + if (max_bw_unit[i] == MLX5_100_MBPS_UNIT) + maxrate = max_bw_value[i] * MLX5E_100MBPS_TO_KBPS; + else if (max_bw_unit[i] == MLX5_GBPS_UNIT) + maxrate = max_bw_value[i] * MLX5E_GBPS_TO_KBPS; + len += sprintf(buf + len, "%lld ", maxrate); + } + len += sprintf(buf + len, "\n"); + + return len; +} + +static ssize_t mlx5e_store_maxrate(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + __u64 upper_limit_mbps = roundup(255 * MLX5E_100MBPS_TO_KBPS, + MLX5E_GBPS_TO_KBPS); + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + u8 max_bw_value[MLX5E_MAX_NUM_TC]; + u8 max_bw_unit[MLX5E_MAX_NUM_TC]; + u64 tc_maxrate[IEEE_8021QAZ_MAX_TCS]; + int i = 0; + char delimiter; + int ret; + + do { + int len; + u64 input_maxrate; + + if (i >= MLX5E_MAX_NUM_TC) + goto bad_elem_count; + + len = strcspn(buf, " "); + + /* nul-terminate and parse */ + delimiter = buf[len]; + ((char *)buf)[len] = '\0'; + + if (sscanf(buf, 
"%lld", &input_maxrate) != 1 + || input_maxrate < 0) { + netdev_err(priv->netdev, "bad maxrate value: '%s'\n", + buf); + goto out; + } + tc_maxrate[i] = input_maxrate; + + buf += len + 1; + i++; + } while (delimiter == ' '); + + if (i != MLX5E_MAX_NUM_TC) + goto bad_elem_count; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { + if (!tc_maxrate[i]) { + max_bw_unit[i] = MLX5_BW_NO_LIMIT; + continue; + } + if (tc_maxrate[i] < upper_limit_mbps) { + max_bw_value[i] = div_u64(tc_maxrate[i], + MLX5E_100MBPS_TO_KBPS); + max_bw_value[i] = max_bw_value[i] ? max_bw_value[i] : 1; + max_bw_unit[i] = MLX5_100_MBPS_UNIT; + } else { + max_bw_value[i] = div_u64(tc_maxrate[i], + MLX5E_GBPS_TO_KBPS); + max_bw_unit[i] = MLX5_GBPS_UNIT; + } + } + + ret = mlx5_modify_port_ets_rate_limit(priv->mdev, + max_bw_value, max_bw_unit); + if (ret) { + netdev_err(priv->netdev, "Failed to modify port ets rate limit, err = %d\n" + , ret); + return ret; + } + return count; + +bad_elem_count: + netdev_err(priv->netdev, "bad number of elemets in maxrate array\n"); +out: + return -EINVAL; +} + +static DEVICE_ATTR(maxrate, S_IRUGO | S_IWUSR, + mlx5e_show_maxrate, mlx5e_store_maxrate); +static DEVICE_ATTR(tc_num, S_IRUGO | S_IWUSR, + mlx5e_show_tc_num, mlx5e_store_tc_num); + +static ssize_t mlx5e_show_lro_timeout(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + int len = 0; + int i; + + rtnl_lock(); + len += sprintf(buf + len, "Actual timeout: %d\n", + priv->channels.params.packet_merge.timeout); + + len += sprintf(buf + len, "Supported timeout:"); + + for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE; i++) + len += sprintf(buf + len, " %d", + MLX5_CAP_ETH(priv->mdev, + lro_timer_supported_periods[i])); + + len += sprintf(buf + len, "\n"); + + rtnl_unlock(); + + return len; +} + +static ssize_t mlx5e_store_lro_timeout(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct net_device *netdev = priv->netdev; + u32 lro_timeout; + int err = 0; + + err = sscanf(buf, "%d", &lro_timeout); + + if (err != 1) + goto bad_input; + + rtnl_lock(); + if (lro_timeout > MLX5_CAP_ETH(priv->mdev, + lro_timer_supported_periods + [MLX5E_LRO_TIMEOUT_ARR_SIZE - 1])) + goto bad_input_unlock; + + lro_timeout = mlx5e_choose_lro_timeout(priv->mdev, lro_timeout); + + mutex_lock(&priv->state_lock); + + if (priv->channels.params.packet_merge.timeout == lro_timeout) { + err = 0; + goto unlock; + } + + priv->channels.params.packet_merge.timeout = lro_timeout; + err = mlx5e_modify_tirs_packet_merge(priv); + +unlock: + mutex_unlock(&priv->state_lock); + rtnl_unlock(); + + if (err) + return err; + + return count; + +bad_input_unlock: + rtnl_unlock(); +bad_input: + netdev_err(netdev, "Bad Input\n"); + return -EINVAL; +} + +static DEVICE_ATTR(lro_timeout, S_IRUGO | S_IWUSR, + mlx5e_show_lro_timeout, mlx5e_store_lro_timeout); + +#ifdef ETH_SS_RSS_HASH_FUNCS +#define MLX5E_HFUNC_TOP ETH_RSS_HASH_TOP +#define MLX5E_HFUNC_XOR ETH_RSS_HASH_XOR +#else +#define MLX5E_HFUNC_TOP 1 +#define MLX5E_HFUNC_XOR 2 +#endif + +static ssize_t mlx5e_show_hfunc(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + u8 hfunc; + int err, len = 0; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + err = mlx5e_rx_res_rss_get_rxfh(priv->rx_res, 0, NULL, NULL, &hfunc); + mutex_unlock(&priv->state_lock); + if (err) + goto out; + + 
len += sprintf(buf + len, "Operational hfunc: %s\n", + hfunc == MLX5E_HFUNC_XOR ? + "xor" : "toeplitz"); + len += sprintf(buf + len, "Supported hfuncs: xor toeplitz\n"); + +out: + rtnl_unlock(); + + return err ? err : len; +} + +static ssize_t mlx5e_store_hfunc(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct net_device *netdev = priv->netdev; + char hfunc[ETH_GSTRING_LEN]; + u8 ethtool_hfunc; + int err; + + err = sscanf(buf, "%31s", hfunc); + + if (err != 1) + goto bad_input; + + if (!strcmp(hfunc, "xor")) + ethtool_hfunc = MLX5E_HFUNC_XOR; + else if (!strcmp(hfunc, "toeplitz")) + ethtool_hfunc = MLX5E_HFUNC_TOP; + else + goto bad_input; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + err = mlx5e_rx_res_rss_set_rxfh(priv->rx_res, 0, NULL, NULL, + &ethtool_hfunc); + mutex_unlock(&priv->state_lock); + rtnl_unlock(); + + if (err) + return err; + + return count; + +bad_input: + netdev_err(netdev, "Bad Input\n"); + return -EINVAL; +} + +static DEVICE_ATTR(hfunc, S_IRUGO | S_IWUSR, + mlx5e_show_hfunc, mlx5e_store_hfunc); + +static ssize_t mlx5e_show_link_down_reason(struct device *device, + struct device_attribute *attr, + char *buf) +{ + char status_message[MLX5_FLD_SZ_BYTES(pddr_troubleshooting_page, + status_message)]; + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + u16 monitor_opcode = 0; + int len = 0; + + if (mlx5_query_port_status(priv->mdev, NULL, &monitor_opcode, status_message)) + return -ENODATA; + + len += sprintf(buf + len, "monitor_opcode: %#x\n", monitor_opcode); + len += sprintf(buf + len, "status_message: %s\n", status_message); + + return len; +} + +static DEVICE_ATTR(link_down_reason, S_IRUGO, + mlx5e_show_link_down_reason, NULL); +#define MLX5E_PFC_PREVEN_CRITICAL_AUTO_MSEC 100 +#define MLX5E_PFC_PREVEN_MINOR_AUTO_MSEC 85 +#define MLX5E_PFC_PREVEN_CRITICAL_DEFAULT_MSEC 8000 +#define MLX5E_PFC_PREVEN_MINOR_DEFAULT_MSEC 6800 + +static ssize_t mlx5e_get_pfc_prevention_mode(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5_core_dev *mdev = priv->mdev; + u16 pfc_prevention_critical; + char *str_critical; + int len = 0; + int err; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, pfcc_mask)) + return -EOPNOTSUPP; + + err = mlx5_query_port_pfc_prevention(mdev, &pfc_prevention_critical); + if (err) + return err; + + str_critical = (pfc_prevention_critical == + MLX5E_PFC_PREVEN_CRITICAL_DEFAULT_MSEC) ?
+ "default" : "auto"; + len += sprintf(buf, "%s\n", str_critical); + + return len; +} + +static ssize_t mlx5e_set_pfc_prevention_mode(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct net_device *netdev = priv->netdev; + struct mlx5_core_dev *mdev = priv->mdev; + char pfc_stall_prevention[ETH_GSTRING_LEN]; + u16 pfc_prevention_critical; + u16 pfc_prevention_minor; + int err; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, pfcc_mask)) + return -EOPNOTSUPP; + + err = sscanf(buf, "%31s", pfc_stall_prevention); + + if (!strcmp(pfc_stall_prevention, "default")) { + pfc_prevention_critical = MLX5E_PFC_PREVEN_CRITICAL_DEFAULT_MSEC; + pfc_prevention_minor = MLX5E_PFC_PREVEN_MINOR_DEFAULT_MSEC; + } else if (!strcmp(pfc_stall_prevention, "auto")) { + pfc_prevention_critical = MLX5E_PFC_PREVEN_CRITICAL_AUTO_MSEC; + pfc_prevention_minor = MLX5E_PFC_PREVEN_MINOR_AUTO_MSEC; + } else { + goto bad_input; + } + + rtnl_lock(); + + err = mlx5_set_port_pfc_prevention(mdev, pfc_prevention_critical, + pfc_prevention_minor); + + rtnl_unlock(); + if (err) + return err; + + return count; + +bad_input: + netdev_err(netdev, "Bad Input\n"); + return -EINVAL; +} + +static DEVICE_ATTR(pfc_stall_prevention, S_IRUGO | S_IWUSR, + mlx5e_get_pfc_prevention_mode, mlx5e_set_pfc_prevention_mode); + +static const char *mlx5e_get_cong_protocol(int protocol) +{ + switch (protocol) { + case MLX5E_CON_PROTOCOL_802_1_RP: + return "802.1.qau_rp"; + case MLX5E_CON_PROTOCOL_R_ROCE_RP: + return "roce_rp"; + case MLX5E_CON_PROTOCOL_R_ROCE_NP: + return "roce_np"; + } + return ""; +} + +static void mlx5e_fill_rp_attributes(struct kobject *kobj, + struct mlx5_core_dev *mdev, + struct mlx5e_ecn_rp_attributes *rp_attr) +{ + int err; + + rp_attr->mdev = mdev; + + sysfs_attr_init(&rp_attr->clamp_tgt_rate.attr); + rp_attr->clamp_tgt_rate.attr.name = "clamp_tgt_rate"; + rp_attr->clamp_tgt_rate.attr.mode = set_kobj_mode(mdev); + rp_attr->clamp_tgt_rate.show = mlx5e_show_clamp_tgt_rate; + rp_attr->clamp_tgt_rate.store = mlx5e_store_clamp_tgt_rate; + err = sysfs_create_file(kobj, &rp_attr->clamp_tgt_rate.attr); + + sysfs_attr_init(&rp_attr->clamp_tgt_rate_ati.attr); + rp_attr->clamp_tgt_rate_ati.attr.name = "clamp_tgt_rate_after_time_inc"; + rp_attr->clamp_tgt_rate_ati.attr.mode = set_kobj_mode(mdev); + rp_attr->clamp_tgt_rate_ati.show = mlx5e_show_clamp_tgt_rate_ati; + rp_attr->clamp_tgt_rate_ati.store = mlx5e_store_clamp_tgt_rate_ati; + err = sysfs_create_file(kobj, &rp_attr->clamp_tgt_rate_ati.attr); + + sysfs_attr_init(&rp_attr->rpg_time_reset.attr); + rp_attr->rpg_time_reset.attr.name = "rpg_time_reset"; + rp_attr->rpg_time_reset.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_time_reset.show = mlx5e_show_rpg_time_reset; + rp_attr->rpg_time_reset.store = mlx5e_store_rpg_time_reset; + err = sysfs_create_file(kobj, &rp_attr->rpg_time_reset.attr); + + sysfs_attr_init(&rp_attr->rpg_byte_reset.attr); + rp_attr->rpg_byte_reset.attr.name = "rpg_byte_reset"; + rp_attr->rpg_byte_reset.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_byte_reset.show = mlx5e_show_rpg_byte_reset; + rp_attr->rpg_byte_reset.store = mlx5e_store_rpg_byte_reset; + err = sysfs_create_file(kobj, &rp_attr->rpg_byte_reset.attr); + + sysfs_attr_init(&rp_attr->rpg_threshold.attr); + rp_attr->rpg_threshold.attr.name = "rpg_threshold"; + rp_attr->rpg_threshold.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_threshold.show = mlx5e_show_rpg_threshold; + rp_attr->rpg_threshold.store = 
mlx5e_store_rpg_threshold; + err = sysfs_create_file(kobj, &rp_attr->rpg_threshold.attr); + + sysfs_attr_init(&rp_attr->rpg_max_rate.attr); + rp_attr->rpg_max_rate.attr.name = "rpg_max_rate"; + rp_attr->rpg_max_rate.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_max_rate.show = mlx5e_show_rpg_max_rate; + rp_attr->rpg_max_rate.store = mlx5e_store_rpg_max_rate; + err = sysfs_create_file(kobj, &rp_attr->rpg_max_rate.attr); + + sysfs_attr_init(&rp_attr->rpg_ai_rate.attr); + rp_attr->rpg_ai_rate.attr.name = "rpg_ai_rate"; + rp_attr->rpg_ai_rate.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_ai_rate.show = mlx5e_show_rpg_ai_rate; + rp_attr->rpg_ai_rate.store = mlx5e_store_rpg_ai_rate; + err = sysfs_create_file(kobj, &rp_attr->rpg_ai_rate.attr); + + sysfs_attr_init(&rp_attr->rpg_hai_rate.attr); + rp_attr->rpg_hai_rate.attr.name = "rpg_hai_rate"; + rp_attr->rpg_hai_rate.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_hai_rate.show = mlx5e_show_rpg_hai_rate; + rp_attr->rpg_hai_rate.store = mlx5e_store_rpg_hai_rate; + err = sysfs_create_file(kobj, &rp_attr->rpg_hai_rate.attr); + + sysfs_attr_init(&rp_attr->rpg_gd.attr); + rp_attr->rpg_gd.attr.name = "rpg_gd"; + rp_attr->rpg_gd.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_gd.show = mlx5e_show_rpg_gd; + rp_attr->rpg_gd.store = mlx5e_store_rpg_gd; + + err = sysfs_create_file(kobj, &rp_attr->rpg_gd.attr); + + sysfs_attr_init(&rp_attr->rpg_min_dec_fac.attr); + rp_attr->rpg_min_dec_fac.attr.name = "rpg_min_dec_fac"; + rp_attr->rpg_min_dec_fac.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_min_dec_fac.show = mlx5e_show_rpg_min_dec_fac; + rp_attr->rpg_min_dec_fac.store = mlx5e_store_rpg_min_dec_fac; + err = sysfs_create_file(kobj, &rp_attr->rpg_min_dec_fac.attr); + + sysfs_attr_init(&rp_attr->rpg_min_rate.attr); + rp_attr->rpg_min_rate.attr.name = "rpg_min_rate"; + rp_attr->rpg_min_rate.attr.mode = set_kobj_mode(mdev); + rp_attr->rpg_min_rate.show = mlx5e_show_rpg_min_rate; + rp_attr->rpg_min_rate.store = mlx5e_store_rpg_min_rate; + err = sysfs_create_file(kobj, &rp_attr->rpg_min_rate.attr); + + sysfs_attr_init(&rp_attr->rate2set_fcnp.attr); + rp_attr->rate2set_fcnp.attr.name = "rate_to_set_on_first_cnp"; + rp_attr->rate2set_fcnp.attr.mode = set_kobj_mode(mdev); + rp_attr->rate2set_fcnp.show = mlx5e_show_rate2set_fcnp; + rp_attr->rate2set_fcnp.store = mlx5e_store_rate2set_fcnp; + err = sysfs_create_file(kobj, &rp_attr->rate2set_fcnp.attr); + + sysfs_attr_init(&rp_attr->dce_tcp_g.attr); + rp_attr->dce_tcp_g.attr.name = "dce_tcp_g"; + rp_attr->dce_tcp_g.attr.mode = set_kobj_mode(mdev); + rp_attr->dce_tcp_g.show = mlx5e_show_dce_tcp_g; + rp_attr->dce_tcp_g.store = mlx5e_store_dce_tcp_g; + err = sysfs_create_file(kobj, &rp_attr->dce_tcp_g.attr); + + sysfs_attr_init(&rp_attr->dce_tcp_rtt.attr); + rp_attr->dce_tcp_rtt.attr.name = "dce_tcp_rtt"; + rp_attr->dce_tcp_rtt.attr.mode = set_kobj_mode(mdev); + rp_attr->dce_tcp_rtt.show = mlx5e_show_dce_tcp_rtt; + rp_attr->dce_tcp_rtt.store = mlx5e_store_dce_tcp_rtt; + err = sysfs_create_file(kobj, &rp_attr->dce_tcp_rtt.attr); + + sysfs_attr_init(&rp_attr->rreduce_mperiod.attr); + rp_attr->rreduce_mperiod.attr.name = "rate_reduce_monitor_period"; + rp_attr->rreduce_mperiod.attr.mode = set_kobj_mode(mdev); + rp_attr->rreduce_mperiod.show = mlx5e_show_rreduce_mperiod; + rp_attr->rreduce_mperiod.store = mlx5e_store_rreduce_mperiod; + err = sysfs_create_file(kobj, &rp_attr->rreduce_mperiod.attr); + + sysfs_attr_init(&rp_attr->initial_alpha_value.attr); + rp_attr->initial_alpha_value.attr.name = "initial_alpha_value"; + 
rp_attr->initial_alpha_value.attr.mode = set_kobj_mode(mdev); + rp_attr->initial_alpha_value.show = mlx5e_show_initial_alpha_value; + rp_attr->initial_alpha_value.store = mlx5e_store_initial_alpha_value; + err = sysfs_create_file(kobj, &rp_attr->initial_alpha_value.attr); +} + +static void mlx5e_remove_rp_attributes(struct kobject *kobj, + struct mlx5e_ecn_rp_attributes *rp_attr) +{ + sysfs_remove_file(kobj, &rp_attr->clamp_tgt_rate.attr); + sysfs_remove_file(kobj, &rp_attr->clamp_tgt_rate_ati.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_time_reset.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_byte_reset.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_threshold.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_max_rate.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_ai_rate.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_hai_rate.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_gd.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_min_dec_fac.attr); + sysfs_remove_file(kobj, &rp_attr->rpg_min_rate.attr); + sysfs_remove_file(kobj, &rp_attr->rate2set_fcnp.attr); + sysfs_remove_file(kobj, &rp_attr->dce_tcp_g.attr); + sysfs_remove_file(kobj, &rp_attr->dce_tcp_rtt.attr); + sysfs_remove_file(kobj, &rp_attr->rreduce_mperiod.attr); + sysfs_remove_file(kobj, &rp_attr->initial_alpha_value.attr); +} + +static void mlx5e_fill_np_attributes(struct kobject *kobj, + struct mlx5_core_dev *mdev, + struct mlx5e_ecn_np_attributes *np_attr) +{ + int err; + + np_attr->mdev = mdev; + + sysfs_attr_init(&np_attr->min_time_between_cnps.attr); + np_attr->min_time_between_cnps.attr.name = "min_time_between_cnps"; + np_attr->min_time_between_cnps.attr.mode = set_kobj_mode(mdev); + np_attr->min_time_between_cnps.show = mlx5e_show_min_time_between_cnps; + np_attr->min_time_between_cnps.store = + mlx5e_store_min_time_between_cnps; + err = sysfs_create_file(kobj, &np_attr->min_time_between_cnps.attr); + + sysfs_attr_init(&np_attr->cnp_dscp.attr); + np_attr->cnp_dscp.attr.name = "cnp_dscp"; + np_attr->cnp_dscp.attr.mode = set_kobj_mode(mdev); + np_attr->cnp_dscp.show = mlx5e_show_cnp_dscp; + np_attr->cnp_dscp.store = mlx5e_store_cnp_dscp; + err = sysfs_create_file(kobj, &np_attr->cnp_dscp.attr); + + sysfs_attr_init(&np_attr->cnp_802p_prio.attr); + np_attr->cnp_802p_prio.attr.name = "cnp_802p_prio"; + np_attr->cnp_802p_prio.attr.mode = set_kobj_mode(mdev); + np_attr->cnp_802p_prio.show = mlx5e_show_cnp_802p_prio; + np_attr->cnp_802p_prio.store = mlx5e_store_cnp_802p_prio; + err = sysfs_create_file(kobj, &np_attr->cnp_802p_prio.attr); +} + +static void mlx5e_remove_np_attributes(struct kobject *kobj, + struct mlx5e_ecn_np_attributes *np_attr) +{ + sysfs_remove_file(kobj, &np_attr->min_time_between_cnps.attr); + sysfs_remove_file(kobj, &np_attr->cnp_dscp.attr); + sysfs_remove_file(kobj, &np_attr->cnp_802p_prio.attr); +} + +static void mlx5e_fill_attributes(struct mlx5e_priv *priv, + int proto) +{ + const char *priority_arr[8] = {"0", "1", "2", "3", "4", "5", "6", "7"}; + struct mlx5e_resources *res = &priv->mdev->mlx5e_res; + struct mlx5e_ecn_ctx *ecn_ctx = &res->compat.ecn_ctx[proto]; + struct mlx5e_ecn_enable_ctx *ecn_enable_ctx; + int i, err; + + ecn_ctx->ecn_enable_kobj = kobject_create_and_add("enable", + ecn_ctx->ecn_proto_kobj); + + for (i = 0; i < 8; i++) { + ecn_enable_ctx = &res->compat.ecn_enable_ctx[proto][i]; + ecn_enable_ctx->priority = i; + ecn_enable_ctx->cong_protocol = proto; + ecn_enable_ctx->mdev = priv->mdev; + sysfs_attr_init(&ecn_enable_ctx->enable.attr); + ecn_enable_ctx->enable.attr.name = priority_arr[i]; + 
ecn_enable_ctx->enable.attr.mode = set_kobj_mode(priv->mdev); + ecn_enable_ctx->enable.show = mlx5e_show_ecn_enable; + ecn_enable_ctx->enable.store = mlx5e_store_ecn_enable; + err = sysfs_create_file(ecn_ctx->ecn_enable_kobj, + &ecn_enable_ctx->enable.attr); + } + + switch (proto) { + case MLX5E_CON_PROTOCOL_802_1_RP: + return; + case MLX5E_CON_PROTOCOL_R_ROCE_RP: + return mlx5e_fill_rp_attributes(ecn_ctx->ecn_proto_kobj, + priv->mdev, + &ecn_ctx->ecn_attr.rp_attr); + case MLX5E_CON_PROTOCOL_R_ROCE_NP: + return mlx5e_fill_np_attributes(ecn_ctx->ecn_proto_kobj, + priv->mdev, + &ecn_ctx->ecn_attr.np_attr); + } +} + +#ifdef CONFIG_MLX5_ESWITCH +static ssize_t mlx5e_show_vf_roce(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5_vport *vport; + int len = 0; + int num_vfs; + bool mode; + int err = 0; + unsigned long i; + + /* This is a workaround for RM 2769801 */ + if (mlx5_core_is_ecpf_esw_manager(esw->dev)) + num_vfs = esw->esw_funcs.num_vfs; + else + num_vfs = pci_num_vf(mdev->pdev); + + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + err = mlx5_eswitch_vport_get_other_hca_cap_roce(esw, vport, &mode); + if (err) + break; + len += sprintf(buf + len, "vf_num %lu: %d\n", i - 1, mode); + } + + if (err) + return 0; + + return len; +} + +static ssize_t mlx5e_store_vf_roce(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5_eswitch *esw = mdev->priv.eswitch; + struct mlx5_vport *vport; + int vf_num, err; + int mode; + + err = sscanf(buf, "%d %d", &vf_num, &mode); + if (err != 2) + return -EINVAL; + + vport = mlx5_eswitch_get_vport(esw, vf_num + 1); + if (IS_ERR(vport)) + return PTR_ERR(vport); + + err = mlx5_eswitch_vport_modify_other_hca_cap_roce(esw, vport, (bool)mode); + if (err) + return err; + + return count; +} +#endif + +static void mlx5e_remove_attributes(struct mlx5e_priv *priv, + int proto) +{ + struct mlx5e_ecn_ctx *ecn_ctx = &priv->mdev->mlx5e_res.compat.ecn_ctx[proto]; + struct mlx5e_resources *res = &priv->mdev->mlx5e_res; + struct mlx5e_ecn_enable_ctx *ecn_enable_ctx; + int i; + + for (i = 0; i < 8; i++) { + ecn_enable_ctx = &res->compat.ecn_enable_ctx[proto][i]; + sysfs_remove_file(res->compat.ecn_ctx[proto].ecn_enable_kobj, + &ecn_enable_ctx->enable.attr); + } + + kobject_put(res->compat.ecn_ctx[proto].ecn_enable_kobj); + + switch (proto) { + case MLX5E_CON_PROTOCOL_802_1_RP: + return; + case MLX5E_CON_PROTOCOL_R_ROCE_RP: + mlx5e_remove_rp_attributes(res->compat.ecn_ctx[proto].ecn_proto_kobj, + &ecn_ctx->ecn_attr.rp_attr); + break; + case MLX5E_CON_PROTOCOL_R_ROCE_NP: + mlx5e_remove_np_attributes(res->compat.ecn_ctx[proto].ecn_proto_kobj, + &ecn_ctx->ecn_attr.np_attr); + break; + } +} + +#ifdef CONFIG_MLX5_CORE_EN_DCB +static ssize_t mlx5e_show_prio2buffer(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv *priv = netdev_priv(dev); + u8 prio2buffer[MLX5E_MAX_PRIORITY]; + int len = 0; + int err; + int i; + + err = mlx5e_port_query_priority2buffer(priv->mdev, prio2buffer); + if (err) + return err; + + len += sprintf(buf + len, "Priority\tBuffer\n"); + for (i = 0; i < MLX5E_MAX_PRIORITY; i++) + len += sprintf(buf + len, "%d\t\t%d\n", + i, 
prio2buffer[i]); + + return len; +} + +static ssize_t mlx5e_store_prio2buffer(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + u8 old_prio2buffer[MLX5E_MAX_PRIORITY]; + u8 prio2buffer[MLX5E_MAX_PRIORITY]; + unsigned int temp; + char *options; + char *p; + u32 changed = 0; + int i = 0; + int err; + + options = kstrdup(buf, GFP_KERNEL); + while ((p = strsep(&options, ",")) != NULL && i < MLX5E_MAX_PRIORITY) { + if (sscanf(p, "%u", &temp) != 1) + continue; + if (temp > 7) + return -EINVAL; + prio2buffer[i] = temp; + i++; + } + + if (i != MLX5E_MAX_PRIORITY) + return -EINVAL; + + err = mlx5e_port_query_priority2buffer(mdev, old_prio2buffer); + if (err) + return err; + + for (i = 0; i < MLX5E_MAX_PRIORITY; i++) { + if (prio2buffer[i] != old_prio2buffer[i]) { + changed = MLX5E_PORT_BUFFER_PRIO2BUFFER; + break; + } + } + + err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL, NULL, prio2buffer); + if (err) + return err; + + return count; +} + +static ssize_t mlx5e_show_buffer_size(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_port_buffer port_buffer; + int len = 0; + int err; + int i; + + err = mlx5e_port_query_buffer(priv, &port_buffer); + if (err) + return err; + + len += sprintf(buf + len, "Port buffer size = %d\n", port_buffer.port_buffer_size); + len += sprintf(buf + len, "Spare buffer size = %d\n", port_buffer.spare_buffer_size); + len += sprintf(buf + len, "Buffer\tSize\txoff_threshold\txon_threshold\n"); + for (i = 0; i < MLX5E_MAX_BUFFER; i++) + len += sprintf(buf + len, "%d\t%d\t%d\t\t%d\n", i, + port_buffer.buffer[i].size, + port_buffer.buffer[i].xoff, + port_buffer.buffer[i].xon); + + return len; +} + +static ssize_t mlx5e_store_buffer_size(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_port_buffer port_buffer; + u32 buffer_size[MLX5E_MAX_BUFFER]; + unsigned int temp; + char *options; + char *p; + u32 changed = 0; + int i = 0; + int err; + + options = kstrdup(buf, GFP_KERNEL); + while ((p = strsep(&options, ",")) != NULL && i < MLX5E_MAX_BUFFER) { + if (sscanf(p, "%u", &temp) != 1) + continue; + buffer_size[i] = temp; + i++; + } + + if (i != MLX5E_MAX_BUFFER) + return -EINVAL; + + err = mlx5e_port_query_buffer(priv, &port_buffer); + if (err) + return err; + + for (i = 0; i < MLX5E_MAX_BUFFER; i++) { + if (port_buffer.buffer[i].size != buffer_size[i]) { + changed = MLX5E_PORT_BUFFER_SIZE; + break; + } + } + + err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL, buffer_size, NULL); + if (err) + return err; + + return count; +} +#endif + +#ifdef CONFIG_MLX5_CORE_EN_DCB +static DEVICE_ATTR(buffer_size, S_IRUGO | S_IWUSR, + mlx5e_show_buffer_size, + mlx5e_store_buffer_size); + +static DEVICE_ATTR(prio2buffer, S_IRUGO | S_IWUSR, + mlx5e_show_prio2buffer, + mlx5e_store_prio2buffer); +#endif + +#ifdef CONFIG_MLX5_ESWITCH +static DEVICE_ATTR(vf_roce, S_IRUGO | S_IWUSR, + mlx5e_show_vf_roce, + mlx5e_store_vf_roce); +#endif + +static ssize_t mlx5e_show_force_local_lb(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv 
*priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + bool force_disable_lb = mdev->local_lb.user_force_disable; + int len = 0; + + len += sprintf(buf, "Force local loopback disable is %s\n", force_disable_lb ? "ON" : "OFF"); + + return len; +} + +static ssize_t mlx5e_store_force_local_lb(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5_core_dev *mdev = priv->mdev; + bool disable; + int err; + + err = kstrtobool(buf, &disable); + if (err) + return -EINVAL; + + if (mdev->local_lb.user_force_disable != disable) { + mdev->local_lb.user_force_disable = disable; + mlx5_nic_vport_update_local_lb(mdev, + mdev->local_lb.driver_state); + } + + return count; +} + +static DEVICE_ATTR(force_local_lb_disable, S_IRUGO | S_IWUSR, + mlx5e_show_force_local_lb, + mlx5e_store_force_local_lb); + +static ssize_t mlx5e_show_log_rx_page_cache_mult_limit(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv *priv = netdev_priv(dev); + int len; + + mutex_lock(&priv->state_lock); + len = sprintf(buf, "log rx page cache mult limit is %u\n", + priv->channels.params.log_rx_page_cache_mult); + mutex_unlock(&priv->state_lock); + + return len; +} + +static ssize_t mlx5e_store_log_rx_page_cache_mult_limit(struct device *device, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct net_device *dev = to_net_dev(device); + struct mlx5e_priv *priv = netdev_priv(dev); + int err, udata; + + err = kstrtoint(buf, 0, &udata); + if (err) + return -EINVAL; + + if (udata > MLX5E_PAGE_CACHE_LOG_MAX_RQ_MULT || udata < 0) { + netdev_err(priv->netdev, "log rx page cache mult limit cannot exceed above %d or below 0\n", + MLX5E_PAGE_CACHE_LOG_MAX_RQ_MULT); + return -EINVAL; + } + + mutex_lock(&priv->state_lock); + priv->channels.params.log_rx_page_cache_mult = (u8)udata; + mutex_unlock(&priv->state_lock); + + return count; +} + +static DEVICE_ATTR(log_mult_limit, S_IRUGO | S_IWUSR, + mlx5e_show_log_rx_page_cache_mult_limit, + mlx5e_store_log_rx_page_cache_mult_limit); + +static struct attribute *mlx5e_settings_attrs[] = { + &dev_attr_hfunc.attr, + &dev_attr_pfc_stall_prevention.attr, + NULL, +}; + +static struct attribute_group settings_group = { + .name = "settings", + .attrs = mlx5e_settings_attrs, +}; + +static struct attribute *mlx5e_debug_group_attrs[] = { + &dev_attr_lro_timeout.attr, + &dev_attr_link_down_reason.attr, + NULL, +}; + +static struct attribute *mlx5e_qos_attrs[] = { + &dev_attr_tc_num.attr, + &dev_attr_maxrate.attr, + NULL, +}; + +static struct attribute_group qos_group = { + .name = "qos", + .attrs = mlx5e_qos_attrs, +}; + +static struct attribute_group debug_group = { + .name = "debug", + .attrs = mlx5e_debug_group_attrs, +}; + +#define PHY_STAT_ENTRY(name, cnt) \ +static ssize_t name##_show(struct device *d, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct net_device *dev = to_net_dev(d); \ + struct mlx5e_priv *priv = netdev_priv(dev); \ + struct mlx5e_pport_stats *pstats = &priv->stats.pport; \ + \ + return sprintf(buf, "%llu\n", \ + PPORT_802_3_GET(pstats, cnt)); \ +} \ +static DEVICE_ATTR(name, S_IRUGO, name##_show, NULL) + +PHY_STAT_ENTRY(rx_packets, a_frames_received_ok); +PHY_STAT_ENTRY(tx_packets, a_frames_transmitted_ok); +PHY_STAT_ENTRY(rx_bytes, a_octets_received_ok); +PHY_STAT_ENTRY(tx_bytes, a_octets_transmitted_ok); + 
+static struct attribute *mlx5e_phy_stat_attrs[] = { + &dev_attr_rx_packets.attr, + &dev_attr_tx_packets.attr, + &dev_attr_rx_bytes.attr, + &dev_attr_tx_bytes.attr, + NULL, +}; + +static struct attribute_group phy_stat_group = { + .name = "phy_stats", + .attrs = mlx5e_phy_stat_attrs, +}; + +static struct attribute *mlx5e_log_rx_page_cache_attrs[] = { + &dev_attr_log_mult_limit.attr, + NULL, +}; + +static struct attribute_group rx_page_cache_group = { + .name = "rx_page_cache", + .attrs = mlx5e_log_rx_page_cache_attrs, +}; + +static int update_qos_sysfs(struct net_device *dev, + struct mlx5_core_dev *mdev) +{ + int err = 0; + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (MLX5_BUFFER_SUPPORTED(mdev)) { + err = sysfs_add_file_to_group(&dev->dev.kobj, + &dev_attr_prio2buffer.attr, + "qos"); + err = sysfs_add_file_to_group(&dev->dev.kobj, + &dev_attr_buffer_size.attr, + "qos"); + } +#endif + + return err; +} + +static int update_settings_sysfs(struct net_device *dev, + struct mlx5_core_dev *mdev) +{ + int err = 0; + +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_CAP_GEN(mdev, vport_group_manager) && + MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) { + err = sysfs_add_file_to_group(&dev->dev.kobj, + &dev_attr_vf_roce.attr, + "settings"); + } +#endif + + if (MLX5_CAP_GEN(mdev, disable_local_lb_mc) || + MLX5_CAP_GEN(mdev, disable_local_lb_uc)) { + err = sysfs_add_file_to_group(&dev->dev.kobj, + &dev_attr_force_local_lb_disable.attr, + "settings"); + } + + return err; +} + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) +struct prio_hp_attributes { + struct attribute attr; + ssize_t (*show)(struct mlx5_prio_hp *, struct prio_hp_attributes *, + char *buf); + ssize_t (*store)(struct mlx5_prio_hp *, struct prio_hp_attributes *, + const char *buf, size_t count); +}; + +static ssize_t prio_hp_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct prio_hp_attributes *ga = + container_of(attr, struct prio_hp_attributes, attr); + struct mlx5_prio_hp *g = container_of(kobj, struct mlx5_prio_hp, kobj); + + if (!ga->show) + return -EIO; + + return ga->show(g, ga, buf); +} + +static ssize_t prio_hp_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct prio_hp_attributes *ga = + container_of(attr, struct prio_hp_attributes, attr); + struct mlx5_prio_hp *g = container_of(kobj, struct mlx5_prio_hp, kobj); + + if (!ga->store) + return -EIO; + + return ga->store(g, ga, buf, size); +} + +static const struct sysfs_ops prio_hp_ops = { + .show = prio_hp_attr_show, + .store = prio_hp_attr_store, +}; + +static ssize_t rate_store(struct mlx5_prio_hp *g, + struct prio_hp_attributes *oa, + const char *buf, + size_t count) +{ + struct mlx5e_priv *priv = g->priv; + struct mlx5_core_dev *mdev = priv->mdev; + int user_rate, rate; + int err; + + if (sscanf(buf, "%d", &user_rate) != 1) + return -EINVAL; + + if (user_rate == g->rate) + /* nothing to do */ + return count; + + if (!mlx5_rl_is_supported(mdev)) { + netdev_err(priv->netdev, "Rate limiting is not supported on this device\n"); + return -EINVAL; + } + + /* rate is given in Mb/sec, HW config is in Kb/sec */ + rate = user_rate << 10; + + /* Check whether rate in valid range, 0 is always valid */ + if (rate && !mlx5_rl_is_in_range(mdev, rate)) { + netdev_err(priv->netdev, "TX rate %u, is not in range\n", rate); + return -ERANGE; + } + + mutex_lock(&priv->state_lock); + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + err = mlx5e_set_prio_hairpin_rate(priv, g->prio, rate); + if (err) { + 
mutex_unlock(&priv->state_lock); + + return err; + } + } + + g->rate = user_rate; + mutex_unlock(&priv->state_lock); + + return count; +} + +static ssize_t rate_show(struct mlx5_prio_hp *g, struct prio_hp_attributes *oa, + char *buf) +{ + return sprintf(buf, "%d\n", g->rate); +} + +#define PRIO_HP_ATTR(_name) struct prio_hp_attributes prio_hp_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) +PRIO_HP_ATTR(rate); + +static struct attribute *prio_hp_attrs[] = { + &prio_hp_attr_rate.attr, + NULL +}; + +static struct kobj_type prio_hp_sysfs = { + .sysfs_ops = &prio_hp_ops, + .default_attrs = prio_hp_attrs +}; + +int create_prio_hp_sysfs(struct mlx5e_priv *priv, int prio) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_prio_hp *prio_hp = tc->prio_hp; + int err; + + err = kobject_init_and_add(&prio_hp[prio].kobj, &prio_hp_sysfs, tc->hp_config, + "%d", prio); + if (err) { + netdev_err(priv->netdev, "can't create hp queues per q sysfs %d, err %d\n", + prio, err); + return err; + } + + kobject_uevent(&prio_hp[prio].kobj, KOBJ_ADD); + + return 0; +} + +static ssize_t prio_hp_num_store(struct device *device, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct net_device *peer_dev; + char ifname[IFNAMSIZ]; + int num_hp; + int err; + + err = sscanf(buf, "%d %15s", &num_hp, ifname); + if (err != 2) + return -EINVAL; + + if (num_hp < 0 || num_hp > MLX5E_MAX_HP_PRIO) + return -EINVAL; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + + peer_dev = __dev_get_by_name(dev_net(priv->netdev), ifname); + if (!peer_dev) + return -EINVAL; + + if (num_hp && !tc->num_prio_hp) { + err = mlx5e_prio_hairpin_mode_enable(priv, num_hp, peer_dev); + if (err) + goto err_config; + } else if (!num_hp && tc->num_prio_hp) { + err = mlx5e_prio_hairpin_mode_disable(priv); + if (err) + goto err_config; + } else { + err = -EINVAL; + goto err_config; + } + + mutex_unlock(&priv->state_lock); + rtnl_unlock(); + + return count; + +err_config: + mutex_unlock(&priv->state_lock); + rtnl_unlock(); + return err; +} + +static ssize_t prio_hp_num_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5e_tc_table *tc = &priv->fs.tc; + ssize_t result; + + mutex_lock(&priv->state_lock); + result = sprintf(buf, "%d\n", tc->num_prio_hp); + mutex_unlock(&priv->state_lock); + + return result; +} + +static ssize_t hp_oob_cnt_mode_store(struct device *device, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct net_device *peer_dev; + char ifname[IFNAMSIZ]; + char mode[5]; + int err; + + err = sscanf(buf, "%s %15s", mode, ifname); + if (err != 2) + return -EINVAL; + + if (strcmp(mode, "on") && strcmp(mode, "off")) + return -EINVAL; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + + peer_dev = __dev_get_by_name(dev_net(priv->netdev), ifname); + if (!peer_dev) + return -EINVAL; + + if (!strcmp(mode, "on") && !tc->hp_oob) { + err = mlx5e_hairpin_oob_cnt_enable(priv, peer_dev); + if (err) + goto err_config; + } else if (!strcmp(mode, "off") && tc->hp_oob) { + err = mlx5e_hairpin_oob_cnt_disable(priv); + if (err) + goto err_config; + } else { + err = -EINVAL; + goto err_config; + } + + mutex_unlock(&priv->state_lock); + rtnl_unlock(); + + return count; + +err_config: + 
mutex_unlock(&priv->state_lock); + rtnl_unlock(); + return err; +} + +static ssize_t hp_oob_cnt_mode_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5e_tc_table *tc = &priv->fs.tc; + ssize_t result; + + mutex_lock(&priv->state_lock); + result = sprintf(buf, "%s\n", tc->hp_oob ? "on" : "off"); + mutex_unlock(&priv->state_lock); + + return result; +} + +static ssize_t hp_oob_cnt_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + ssize_t result; + u64 oob_cnt; + + mutex_lock(&priv->state_lock); + mlx5e_hairpin_oob_cnt_get(priv, &oob_cnt); + result = sprintf(buf, "%llu\n", oob_cnt); + mutex_unlock(&priv->state_lock); + + return result; +} + +/* Limiting max packet pacing burst size configuration using + * a typical 1514 Byte MTU size. + */ +#define MLX5E_MAX_HP_PP_BURST_MTUS 30 +#define MLX5E_MAX_HP_PP_BURST_SIZE (MLX5E_MAX_HP_PP_BURST_MTUS * 1514) +static ssize_t pp_burst_size_store(struct device *device, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5e_tc_table *tc = &priv->fs.tc; + int burst_size; + int err; + + if (!MLX5_CAP_QOS(priv->mdev, packet_pacing_burst_bound)) { + netdev_warn(priv->netdev, "Packet pacing burst size config is not supported by the device\n"); + return -EOPNOTSUPP; + } + + err = sscanf(buf, "%d", &burst_size); + if (err != 1) + return -EINVAL; + + if (burst_size < 0 || burst_size > MLX5E_MAX_HP_PP_BURST_SIZE) + return -EINVAL; + + rtnl_lock(); + mutex_lock(&priv->state_lock); + + tc->max_pp_burst_size = burst_size; + + mutex_unlock(&priv->state_lock); + rtnl_unlock(); + + return count; +} + +static ssize_t pp_burst_size_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct mlx5e_priv *priv = netdev_priv(to_net_dev(device)); + struct mlx5e_tc_table *tc = &priv->fs.tc; + ssize_t result; + + mutex_lock(&priv->state_lock); + result = sprintf(buf, "%d\n", tc->max_pp_burst_size); + mutex_unlock(&priv->state_lock); + + return result; +} + +static DEVICE_ATTR(num_prio_hp, S_IRUGO | S_IWUSR, + prio_hp_num_show, prio_hp_num_store); +static DEVICE_ATTR(hp_pp_burst_size, S_IRUGO | S_IWUSR, + pp_burst_size_show, pp_burst_size_store); +static DEVICE_ATTR(hp_oob_cnt_mode, S_IRUGO | S_IWUSR, + hp_oob_cnt_mode_show, hp_oob_cnt_mode_store); +static DEVICE_ATTR_RO(hp_oob_cnt); + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_num_prio_hp, + &dev_attr_hp_pp_burst_size, + &dev_attr_hp_oob_cnt_mode, + &dev_attr_hp_oob_cnt, +}; + +int hp_sysfs_init(struct mlx5e_priv *priv) +{ + struct device *device = &priv->netdev->dev; + int i, err; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(device, mlx5_class_attributes[i]); + if (err) + return err; + } + + return 0; +} + +void hp_sysfs_cleanup(struct mlx5e_priv *priv) +{ + struct device *device = &priv->netdev->dev; + int i; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) + device_remove_file(device, mlx5_class_attributes[i]); +} + +#else + +int hp_sysfs_init(struct mlx5e_priv *priv) +{ return 0; } + +void hp_sysfs_cleanup(struct mlx5e_priv *priv) +{} + +#endif /*CONFIG_MLX5_CLS_ACT*/ + +int mlx5e_sysfs_create(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_resources *res = &priv->mdev->mlx5e_res; + int err = 0; + int i; + + if 
(mlx5_core_is_sf(priv->mdev)) + return 0; + + res->compat.ecn_root_kobj = kobject_create_and_add("ecn", &dev->dev.kobj); + + for (i = 1; i < MLX5E_CONG_PROTOCOL_NUM; i++) { + res->compat.ecn_ctx[i].ecn_proto_kobj = + kobject_create_and_add(mlx5e_get_cong_protocol(i), + res->compat.ecn_root_kobj); + mlx5e_fill_attributes(priv, i); + } + + err = sysfs_create_group(&dev->dev.kobj, &settings_group); + if (err) + goto remove_attributes; + + err = update_settings_sysfs(dev, priv->mdev); + if (err) + goto remove_settings_group; + + err = sysfs_create_group(&dev->dev.kobj, &qos_group); + if (err) + goto remove_settings_group; + + err = update_qos_sysfs(dev, priv->mdev); + if (err) + goto remove_qos_group; + + err = sysfs_create_group(&dev->dev.kobj, &debug_group); + if (err) + goto remove_qos_group; + + err = sysfs_create_group(&dev->dev.kobj, &phy_stat_group); + if (err) + goto remove_debug_group; + + err = sysfs_create_group(&dev->dev.kobj, &rx_page_cache_group); + + if (err) + goto remove_phy_stat_group; + + err = hp_sysfs_init(priv); + if (err) + goto remove_rx_page_cache_group; + + mlx5_eswitch_compat_sysfs_init(dev); + + return 0; + +remove_rx_page_cache_group: + sysfs_remove_group(&dev->dev.kobj, &rx_page_cache_group); +remove_phy_stat_group: + sysfs_remove_group(&dev->dev.kobj, &phy_stat_group); +remove_debug_group: + sysfs_remove_group(&dev->dev.kobj, &debug_group); +remove_qos_group: + sysfs_remove_group(&dev->dev.kobj, &qos_group); +remove_settings_group: + sysfs_remove_group(&dev->dev.kobj, &settings_group); +remove_attributes: + for (i = 1; i < MLX5E_CONG_PROTOCOL_NUM; i++) { + mlx5e_remove_attributes(priv, i); + kobject_put(res->compat.ecn_ctx[i].ecn_proto_kobj); + } + + kobject_put(res->compat.ecn_root_kobj); + res->compat.ecn_root_kobj = NULL; + + return err; +} + +void mlx5e_sysfs_remove(struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_resources *res = &priv->mdev->mlx5e_res; + int i; + + if (mlx5_core_is_sf(priv->mdev)) + return; + + if (!res->compat.ecn_root_kobj) + return; + + mlx5_eswitch_compat_sysfs_cleanup(dev); + + sysfs_remove_group(&dev->dev.kobj, &qos_group); + sysfs_remove_group(&dev->dev.kobj, &debug_group); + sysfs_remove_group(&dev->dev.kobj, &settings_group); + sysfs_remove_group(&dev->dev.kobj, &phy_stat_group); + sysfs_remove_group(&dev->dev.kobj, &rx_page_cache_group); + hp_sysfs_cleanup(priv); + + for (i = 1; i < MLX5E_CONG_PROTOCOL_NUM; i++) { + mlx5e_remove_attributes(priv, i); + kobject_put(res->compat.ecn_ctx[i].ecn_proto_kobj); + } + + kobject_put(res->compat.ecn_root_kobj); + res->compat.ecn_root_kobj = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c new file mode 100644 index 0000000..4e5bbbb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -0,0 +1,6112 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "en.h" +#include "en/tc/post_act.h" +#include "en_rep.h" +#include "en/rep/tc.h" +#include "en/rep/neigh.h" +#include "en_tc.h" +#include "eswitch.h" +#include "fs_core.h" +#include "en/port.h" +#include "en/tc_tun.h" +#include "en/mapping.h" +#include "en/tc_ct.h" +#include "en/mod_hdr.h" +#include "en/tc_tun_encap.h" +#include "en/tc/sample.h" +#include "en/tc/act/act.h" +#include "en/tc/post_meter.h" +#include "lib/devcom.h" +#include "lib/geneve.h" +#include "lib/fs_chains.h" +#include "diag/en_tc_tracepoint.h" +#include +#include "lag/lag.h" +#include "lag/mp.h" +#include "esw/vf_meter.h" + +#define MLX5E_TC_TABLE_NUM_GROUPS 4 +#define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(18) + +struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[] = { + [CHAIN_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0, + .moffset = 0, + .mlen = 16, + }, + [VPORT_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_0, + .moffset = 16, + .mlen = 16, + }, + [TUNNEL_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1, + .moffset = 8, + .mlen = ESW_TUN_OPTS_BITS + ESW_TUN_ID_BITS, + .soffset = MLX5_BYTE_OFF(fte_match_param, + misc_parameters_2.metadata_reg_c_1), + }, + [ZONE_TO_REG] = zone_to_reg_ct, + [ZONE_RESTORE_TO_REG] = zone_restore_to_reg_ct, + [CTSTATE_TO_REG] = ctstate_to_reg_ct, + [MARK_TO_REG] = mark_to_reg_ct, + [LABELS_TO_REG] = labels_to_reg_ct, + [FTEID_TO_REG] = fteid_to_reg_ct, + /* For NIC rules we store the restore metadata directly + * into reg_b that is passed to SW since we don't + * jump between steering domains. 
+ */ + [NIC_CHAIN_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_B, + .moffset = 0, + .mlen = 16, + }, + [NIC_ZONE_RESTORE_TO_REG] = nic_zone_restore_to_reg_ct, + /* reg_c1 is not used by tc nic rules so we can overwrite it */ + [USER_PRIO_TO_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_C_1, + .moffset = 0, + .mlen = 32, + .soffset = MLX5_BYTE_OFF(fte_match_param, + misc_parameters_2.metadata_reg_c_1), + }, + [HP_OOB_CNT_COLOR_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_B, + .moffset = 0, + .mlen = 4, + }, + [HP_OOB_TX_CNT_COLOR_REG] = { + .mfield = MLX5_ACTION_IN_FIELD_METADATA_REG_A, + .moffset = 0, + .mlen = 4, + .soffset = MLX5_BYTE_OFF(fte_match_param, + misc_parameters_2.metadata_reg_a), + }, + [PACKET_COLOR_TO_REG] = packet_color_to_reg, +}; + +/* To avoid false lock dependency warning set the tc_ht lock + * class different than the lock class of the ht being used when deleting + * last flow from a group and then deleting a group, we get into del_sw_flow_group() + * which call rhashtable_destroy on fg->ftes_hash which will take ht->mutex but + * it's different than the ht->mutex here. + */ +static struct lock_class_key tc_ht_lock_key; + +static void mlx5e_put_flow_tunnel_id(struct mlx5e_tc_flow *flow); +static void free_flow_post_acts(struct mlx5e_tc_flow *flow); + +void +mlx5e_tc_match_to_reg_match(struct mlx5_flow_spec *spec, + enum mlx5e_tc_attr_to_reg type, + u32 val, + u32 mask) +{ + void *headers_c = spec->match_criteria, *headers_v = spec->match_value, *fmask, *fval; + int soffset = mlx5e_tc_attr_to_reg_mappings[type].soffset; + int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; + int match_len = mlx5e_tc_attr_to_reg_mappings[type].mlen; + u32 max_mask = GENMASK(match_len - 1, 0); + __be32 curr_mask_be, curr_val_be; + u32 curr_mask, curr_val; + + fmask = headers_c + soffset; + fval = headers_v + soffset; + + memcpy(&curr_mask_be, fmask, 4); + memcpy(&curr_val_be, fval, 4); + + curr_mask = be32_to_cpu(curr_mask_be); + curr_val = be32_to_cpu(curr_val_be); + + //move to correct offset + WARN_ON(mask > max_mask); + mask <<= moffset; + val <<= moffset; + max_mask <<= moffset; + + //zero val and mask + curr_mask &= ~max_mask; + curr_val &= ~max_mask; + + //add current to mask + curr_mask |= mask; + curr_val |= val; + + //back to be32 and write + curr_mask_be = cpu_to_be32(curr_mask); + curr_val_be = cpu_to_be32(curr_val); + + memcpy(fmask, &curr_mask_be, 4); + memcpy(fval, &curr_val_be, 4); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; +} + +void +mlx5e_tc_match_to_reg_get_match(struct mlx5_flow_spec *spec, + enum mlx5e_tc_attr_to_reg type, + u32 *val, + u32 *mask) +{ + void *headers_c = spec->match_criteria, *headers_v = spec->match_value, *fmask, *fval; + int soffset = mlx5e_tc_attr_to_reg_mappings[type].soffset; + int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; + int match_len = mlx5e_tc_attr_to_reg_mappings[type].mlen; + u32 max_mask = GENMASK(match_len - 1, 0); + __be32 curr_mask_be, curr_val_be; + u32 curr_mask, curr_val; + + fmask = headers_c + soffset; + fval = headers_v + soffset; + + memcpy(&curr_mask_be, fmask, 4); + memcpy(&curr_val_be, fval, 4); + + curr_mask = be32_to_cpu(curr_mask_be); + curr_val = be32_to_cpu(curr_val_be); + + *mask = (curr_mask >> moffset) & max_mask; + *val = (curr_val >> moffset) & max_mask; +} + +int +mlx5e_tc_match_to_reg_set_and_get_id(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg 
type, + u32 data) +{ + int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; + int mfield = mlx5e_tc_attr_to_reg_mappings[type].mfield; + int mlen = mlx5e_tc_attr_to_reg_mappings[type].mlen; + char *modact; + int err; + + modact = mlx5e_mod_hdr_alloc(mdev, ns, mod_hdr_acts); + if (IS_ERR(modact)) + return PTR_ERR(modact); + + /* Firmware has 5bit length field and 0 means 32bits */ + if (mlen == 32) + mlen = 0; + + MLX5_SET(set_action_in, modact, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, modact, field, mfield); + MLX5_SET(set_action_in, modact, offset, moffset); + MLX5_SET(set_action_in, modact, length, mlen); + MLX5_SET(set_action_in, modact, data, data); + err = mod_hdr_acts->num_actions; + mod_hdr_acts->num_actions++; + + return err; +} + +struct mlx5e_tc_int_port_priv * +mlx5e_get_int_port_priv(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + + if (is_mdev_switchdev_mode(priv->mdev)) { + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + return uplink_priv->int_port_priv; + } + + return NULL; +} + +struct mlx5e_flow_meters * +mlx5e_get_flow_meters(struct mlx5_core_dev *dev) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + + if (is_mdev_switchdev_mode(dev)) { + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + return uplink_priv->flow_meters; + } + + return NULL; +} + +static struct mlx5_tc_ct_priv * +get_ct_priv(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + + if (is_mdev_switchdev_mode(priv->mdev)) { + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + return uplink_priv->ct_priv; + } + + return priv->fs.tc.ct; +} + +static struct mlx5e_tc_psample * +get_sample_priv(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + + if (is_mdev_switchdev_mode(priv->mdev)) { + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + return uplink_priv->tc_psample; + } + + return NULL; +} + +static struct mlx5e_post_act * +get_post_action(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + + if (is_mdev_switchdev_mode(priv->mdev)) { + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + return uplink_priv->post_act; + } + + return priv->fs.tc.post_act; +} + +struct mlx5_flow_handle * +mlx5_tc_rule_insert(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (is_mdev_switchdev_mode(priv->mdev)) + return mlx5_eswitch_add_offloaded_rule(esw, spec, attr); + + return mlx5e_add_offloaded_nic_rule(priv, spec, attr); +} + +void +mlx5_tc_rule_delete(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (is_mdev_switchdev_mode(priv->mdev)) { + 
mlx5_eswitch_del_offloaded_rule(esw, rule, attr); + return; + } + + mlx5e_del_offloaded_nic_rule(priv, rule, attr); +} + +static bool +is_flow_meter_action(struct mlx5_flow_attr *attr) +{ + return ((attr->action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) && + (attr->exe_aso_type == MLX5_EXE_ASO_FLOW_METER)); +} + +static int +mlx5e_tc_add_flow_meter(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_post_act *post_act = get_post_action(priv); + struct mlx5e_post_meter_priv *post_meter; + enum mlx5_flow_namespace_type ns_type; + struct mlx5e_flow_meter_handle *meter; + + meter = mlx5e_tc_meter_replace(priv->mdev, &attr->meter_attr.params); + if (IS_ERR(meter)) { + mlx5_core_err(priv->mdev, "Failed to get flow meter\n"); + return PTR_ERR(meter); + } + + ns_type = mlx5e_tc_meter_get_namespace(meter->flow_meters); + post_meter = mlx5e_post_meter_init(priv, ns_type, post_act, meter->green_counter, + meter->red_counter); + if (IS_ERR(post_meter)) { + mlx5_core_err(priv->mdev, "Failed to init post meter\n"); + goto err_meter_init; + } + + attr->meter_attr.meter = meter; + attr->meter_attr.post_meter = post_meter; + attr->dest_ft = mlx5e_post_meter_get_ft(post_meter); + attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + return 0; + +err_meter_init: + mlx5e_tc_meter_put(meter); + return PTR_ERR(post_meter); +} + +static void +mlx5e_tc_del_flow_meter(struct mlx5_flow_attr *attr) +{ + mlx5e_post_meter_cleanup(attr->meter_attr.post_meter); + mlx5e_tc_meter_put(attr->meter_attr.meter); +} + +struct mlx5_flow_handle * +mlx5e_tc_rule_offload(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + int err; + + if (attr->flags & MLX5_ATTR_FLAG_CT) { + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts = + &attr->parse_attr->mod_hdr_acts; + + return mlx5_tc_ct_flow_offload(get_ct_priv(priv), + spec, attr, + mod_hdr_acts); + } + + if (!is_mdev_switchdev_mode(priv->mdev)) + return mlx5e_add_offloaded_nic_rule(priv, spec, attr); + + if (attr->flags & MLX5_ATTR_FLAG_SAMPLE) + return mlx5e_tc_sample_offload(get_sample_priv(priv), spec, attr); + + if (is_flow_meter_action(attr)) { + err = mlx5e_tc_add_flow_meter(priv, attr); + if (err) + return ERR_PTR(err); + } + + return mlx5_eswitch_add_offloaded_rule(esw, spec, attr); +} + +void +mlx5e_tc_rule_unoffload(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + if (attr->flags & MLX5_ATTR_FLAG_CT) { + mlx5_tc_ct_delete_flow(get_ct_priv(priv), attr); + return; + } + + if (!is_mdev_switchdev_mode(priv->mdev)) { + mlx5e_del_offloaded_nic_rule(priv, rule, attr); + return; + } + + if (attr->flags & MLX5_ATTR_FLAG_SAMPLE) { + mlx5e_tc_sample_unoffload(get_sample_priv(priv), rule, attr); + return; + } + + mlx5_eswitch_del_offloaded_rule(esw, rule, attr); + + if (attr->meter_attr.meter) + mlx5e_tc_del_flow_meter(attr); +} + +int +mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg type, + u32 data) +{ + int ret = mlx5e_tc_match_to_reg_set_and_get_id(mdev, mod_hdr_acts, ns, type, data); + + return ret < 0 ? 
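/* Editor's note -- illustrative sketch, not part of the original patch.
 * mlx5e_tc_rule_offload() above picks the insertion path in a fixed order:
 *
 *   CT flag set          -> mlx5_tc_ct_flow_offload()        (conntrack chain)
 *   NIC (non-switchdev)  -> mlx5e_add_offloaded_nic_rule()
 *   SAMPLE flag set      -> mlx5e_tc_sample_offload()
 *   otherwise            -> optional ASO flow-meter setup via
 *                           mlx5e_tc_add_flow_meter(), then
 *                           mlx5_eswitch_add_offloaded_rule() on the FDB
 *
 * mlx5e_tc_rule_unoffload() walks the same order on teardown and also
 * releases the meter and post-meter objects when attr->meter_attr.meter was
 * set by the offload path.
 */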
ret : 0; +} + +void mlx5e_tc_match_to_reg_mod_hdr_change(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5e_tc_attr_to_reg type, + int act_id, u32 data) +{ + int moffset = mlx5e_tc_attr_to_reg_mappings[type].moffset; + int mfield = mlx5e_tc_attr_to_reg_mappings[type].mfield; + int mlen = mlx5e_tc_attr_to_reg_mappings[type].mlen; + char *modact; + + modact = mlx5e_mod_hdr_get_item(mod_hdr_acts, act_id); + + /* Firmware has 5bit length field and 0 means 32bits */ + if (mlen == 32) + mlen = 0; + + MLX5_SET(set_action_in, modact, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, modact, field, mfield); + MLX5_SET(set_action_in, modact, offset, moffset); + MLX5_SET(set_action_in, modact, length, mlen); + MLX5_SET(set_action_in, modact, data, data); +} + +struct mlx5e_hairpin { + struct mlx5_hairpin *pair; + + struct mlx5_core_dev *func_mdev; + struct mlx5e_priv *func_priv; + u32 tdn; + struct mlx5e_tir direct_tir; + + int num_channels; + struct mlx5e_rqt indir_rqt; + struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS]; + struct mlx5_ttc_table *ttc; + + u32 rate_limit; + u32 max_pp_burst_size; +}; + +struct mlx5e_hairpin_entry { + /* a node of a hash table which keeps all the hairpin entries */ + struct hlist_node hairpin_hlist; + + /* protects flows list */ + spinlock_t flows_lock; + /* flows sharing the same hairpin */ + struct list_head flows; + /* hpe's that were not fully initialized when dead peer update event + * function traversed them. + */ + struct list_head dead_peer_wait_list; + + u16 peer_vhca_id; + u16 prio; + struct mlx5e_hairpin *hp; + struct mlx5_flow_handle *fwd_rule; + refcount_t refcnt; + struct completion res_ready; +}; + +static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow); + +struct mlx5e_tc_flow *mlx5e_flow_get(struct mlx5e_tc_flow *flow) +{ + if (!flow || !refcount_inc_not_zero(&flow->refcnt)) + return ERR_PTR(-EINVAL); + return flow; +} + +void mlx5e_flow_put(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) +{ + if (refcount_dec_and_test(&flow->refcnt)) { + mlx5e_tc_del_flow(priv, flow); + kfree_rcu(flow, rcu_head); + } +} + +bool mlx5e_is_eswitch_flow(struct mlx5e_tc_flow *flow) +{ + return flow_flag_test(flow, ESWITCH); +} + +bool mlx5e_is_ft_flow(struct mlx5e_tc_flow *flow) +{ + return flow_flag_test(flow, FT); +} + +bool mlx5e_is_offloaded_flow(struct mlx5e_tc_flow *flow) +{ + return flow_flag_test(flow, OFFLOADED); +} + +int mlx5e_get_flow_namespace(struct mlx5e_tc_flow *flow) +{ + return mlx5e_is_eswitch_flow(flow) ? + MLX5_FLOW_NAMESPACE_FDB : MLX5_FLOW_NAMESPACE_KERNEL; +} + +static struct mod_hdr_tbl * +get_mod_hdr_table(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + return mlx5e_get_flow_namespace(flow) == MLX5_FLOW_NAMESPACE_FDB ? 
+ &esw->offloads.mod_hdr : + &priv->fs.tc.mod_hdr; +} + +static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5e_tc_flow_parse_attr *parse_attr) +{ + struct mlx5_modify_hdr *modify_hdr; + struct mlx5e_mod_hdr_handle *mh; + + mh = mlx5e_mod_hdr_attach(priv->mdev, get_mod_hdr_table(priv, flow), + mlx5e_get_flow_namespace(flow), + &parse_attr->mod_hdr_acts); + if (IS_ERR(mh)) + return PTR_ERR(mh); + + modify_hdr = mlx5e_mod_hdr_get(mh); + flow->attr->modify_hdr = modify_hdr; + flow->mh = mh; + + return 0; +} + +static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + /* flow wasn't fully initialized */ + if (!flow->mh) + return; + + mlx5e_mod_hdr_detach(priv->mdev, get_mod_hdr_table(priv, flow), + flow->mh); + flow->mh = NULL; +} + +static +struct mlx5_core_dev *mlx5e_hairpin_get_mdev(struct net *net, int ifindex) +{ + struct mlx5_core_dev *mdev; + struct net_device *netdev; + struct mlx5e_priv *priv; + + netdev = dev_get_by_index(net, ifindex); + if (!netdev) + return ERR_PTR(-ENODEV); + + priv = netdev_priv(netdev); + mdev = priv->mdev; + dev_put(netdev); + + /* Mirred tc action holds a refcount on the ifindex net_device (see + * net/sched/act_mirred.c:tcf_mirred_get_dev). So, it's okay to continue using mdev + * after dev_put(netdev), while we're in the context of adding a tc flow. + * + * The mdev pointer corresponds to the peer/out net_device of a hairpin. It is then + * stored in a hairpin object, which exists until all flows, that refer to it, get + * removed. + * + * On the other hand, after a hairpin object has been created, the peer net_device may + * be removed/unbound while there are still some hairpin flows that are using it. This + * case is handled by mlx5e_tc_hairpin_update_dead_peer, which is hooked to + * NETDEV_UNREGISTER event of the peer net_device. 
+ */ + return mdev; +} + +static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp) +{ + struct mlx5e_tir_builder *builder; + int err; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + err = mlx5_core_alloc_transport_domain(hp->func_mdev, &hp->tdn); + if (err) + goto out; + + mlx5e_tir_builder_build_inline(builder, hp->tdn, hp->pair->rqn[0]); + err = mlx5e_tir_init(&hp->direct_tir, builder, hp->func_mdev, false); + if (err) + goto create_tir_err; + +out: + mlx5e_tir_builder_free(builder); + return err; + +create_tir_err: + mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn); + + goto out; +} + +static void mlx5e_hairpin_destroy_transport(struct mlx5e_hairpin *hp) +{ + mlx5e_tir_destroy(&hp->direct_tir); + mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn); +} + +static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp) +{ + struct mlx5e_priv *priv = hp->func_priv; + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_rss_params_indir *indir; + int err; + + indir = kvmalloc(sizeof(*indir), GFP_KERNEL); + if (!indir) + return -ENOMEM; + + mlx5e_rss_params_indir_init_uniform(indir, hp->num_channels); + err = mlx5e_rqt_init_indir(&hp->indir_rqt, mdev, hp->pair->rqn, hp->num_channels, + mlx5e_rx_res_get_current_hash(priv->rx_res).hfunc, + indir); + + kvfree(indir); + return err; +} + +static int mlx5e_hairpin_create_indirect_tirs(struct mlx5e_hairpin *hp) +{ + struct mlx5e_priv *priv = hp->func_priv; + struct mlx5e_rss_params_hash rss_hash; + enum mlx5_traffic_types tt, max_tt; + struct mlx5e_tir_builder *builder; + int err = 0; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + rss_hash = mlx5e_rx_res_get_current_hash(priv->rx_res); + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) { + struct mlx5e_rss_params_traffic_type rss_tt; + + rss_tt = mlx5e_rss_get_default_tt_config(tt); + + mlx5e_tir_builder_build_rqt(builder, hp->tdn, + mlx5e_rqt_get_rqtn(&hp->indir_rqt), + false); + mlx5e_tir_builder_build_rss(builder, &rss_hash, &rss_tt, false); + + err = mlx5e_tir_init(&hp->indir_tir[tt], builder, hp->func_mdev, false); + if (err) { + mlx5_core_warn(hp->func_mdev, "create indirect tirs failed, %d\n", err); + goto err_destroy_tirs; + } + + mlx5e_tir_builder_clear(builder); + } + +out: + mlx5e_tir_builder_free(builder); + return err; + +err_destroy_tirs: + max_tt = tt; + for (tt = 0; tt < max_tt; tt++) + mlx5e_tir_destroy(&hp->indir_tir[tt]); + + goto out; +} + +static void mlx5e_hairpin_destroy_indirect_tirs(struct mlx5e_hairpin *hp) +{ + int tt; + + for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) + mlx5e_tir_destroy(&hp->indir_tir[tt]); +} + +static void mlx5e_hairpin_set_ttc_params(struct mlx5e_hairpin *hp, + struct ttc_params *ttc_params) +{ + struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr; + int tt; + + memset(ttc_params, 0, sizeof(*ttc_params)); + + ttc_params->ns = mlx5_get_flow_namespace(hp->func_mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + for (tt = 0; tt < MLX5_NUM_TT; tt++) { + ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR; + ttc_params->dests[tt].tir_num = + tt == MLX5_TT_ANY ? 
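/* Editor's note -- illustrative sketch, not part of the original patch.
 * Multi-channel hairpin mirrors the regular RSS datapath, only built from the
 * hairpin RQs:
 *
 *   hairpin RQs -> indirect RQT -> per-traffic-type TIRs -> TTC table
 *
 * mlx5e_hairpin_set_ttc_params() points every traffic type at its indirect
 * TIR, except MLX5_TT_ANY which uses the direct (single-queue) TIR; the
 * resulting TTC table then becomes the flow destination whenever
 * num_channels > 1 (see mlx5e_hairpin_flow_add() further down).
 */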
+ mlx5e_tir_get_tirn(&hp->direct_tir) : + mlx5e_tir_get_tirn(&hp->indir_tir[tt]); + } + + ft_attr->level = MLX5E_TC_TTC_FT_LEVEL; + ft_attr->prio = MLX5E_TC_PRIO; +} + +static int mlx5e_hairpin_rss_init(struct mlx5e_hairpin *hp) +{ + struct mlx5e_priv *priv = hp->func_priv; + struct ttc_params ttc_params; + int err; + + err = mlx5e_hairpin_create_indirect_rqt(hp); + if (err) + return err; + + err = mlx5e_hairpin_create_indirect_tirs(hp); + if (err) + goto err_create_indirect_tirs; + + mlx5e_hairpin_set_ttc_params(hp, &ttc_params); + hp->ttc = mlx5_create_ttc_table(priv->mdev, &ttc_params); + if (IS_ERR(hp->ttc)) { + err = PTR_ERR(hp->ttc); + goto err_create_ttc_table; + } + + netdev_dbg(priv->netdev, "add hairpin: using %d channels rss ttc table id %x\n", + hp->num_channels, + mlx5_get_ttc_flow_table(priv->fs.ttc)->id); + + return 0; + +err_create_ttc_table: + mlx5e_hairpin_destroy_indirect_tirs(hp); +err_create_indirect_tirs: + mlx5e_rqt_destroy(&hp->indir_rqt); + + return err; +} + +static void mlx5e_hairpin_rss_cleanup(struct mlx5e_hairpin *hp) +{ + mlx5_destroy_ttc_table(hp->ttc); + mlx5e_hairpin_destroy_indirect_tirs(hp); + mlx5e_rqt_destroy(&hp->indir_rqt); +} + +enum { + MLX5_HP_OOB_CNT_RED = 1, + MLX5_HP_OOB_CNT_BLUE, +}; + +static struct mlx5_flow_handle * +mlx5e_hairpin_create_oob_rx_flow(struct mlx5e_hp_oob_cnt *oob, + struct mlx5_modify_hdr *mod_hdr, + struct mlx5_fc *cnt) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, }; + struct mlx5_flow_destination dest[2]; + struct mlx5e_priv *priv = oob->priv; + struct mlx5_flow_handle *flow_rule; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_act.modify_hdr = mod_hdr; + + memcpy(&dest[0], &oob->rx_dest, sizeof(struct mlx5_flow_destination)); + + dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[1].counter_id = mlx5_fc_id(cnt); + + flow_rule = mlx5_add_flow_rules(oob->ft, NULL, &flow_act, dest, 2); + if (IS_ERR(flow_rule)) + mlx5_core_warn(priv->mdev, "Failed to create hairpin oob rx rule color: err(%d)\n", + (int)PTR_ERR(flow_rule)); + + return flow_rule; +} + +static void +mlx5e_hairpin_oob_cnt_dest_unset(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_hp_oob_cnt *oob = tc->hp_oob; + + if (!oob->dest_valid) + return; + + mlx5_del_flow_rules(oob->rx_rule); + oob->rx_rule = NULL; + oob->dest_valid = false; +} + +static void +mlx5e_hairpin_oob_cnt_dest_set(struct mlx5e_priv *priv, struct mlx5e_hairpin *hp) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_flow_destination *dest; + struct mlx5_flow_handle *rule; + struct mlx5e_hp_oob_cnt *oob; + + oob = tc->hp_oob; + dest = &oob->rx_dest; + + if (tc->num_prio_hp) { + dest->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest->ft = tc->hp_fwd; + } else if (hp->num_channels > 1) { + dest->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest->ft = mlx5_get_ttc_flow_table(hp->ttc); + } else { + dest->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest->tir_num = mlx5e_tir_get_tirn(&hp->direct_tir); + } + + rule = mlx5e_hairpin_create_oob_rx_flow(oob, oob->curr_mod_hdr, + oob->curr_cnt); + if (IS_ERR(rule)) { + oob->dest_valid = false; + + return; + } + + oob->rx_rule = rule; + oob->dest_valid = true; +} + +static void +mlx5e_hairpin_destroy_oob_cntrs(struct mlx5e_priv *priv, struct mlx5e_hp_oob_cnt *oob) +{ + int i; + + for (i = 0; i < HAIRPIN_OOB_NUM_CNT_PER_SET; i++) + if (!IS_ERR_OR_NULL(oob->cntrs[i])) + mlx5_fc_destroy(priv->mdev, 
oob->cntrs[i]); + + for (; i < HAIRPIN_OOB_NUM_CNT_PER_SET * 2; i++) + if (!IS_ERR_OR_NULL(oob->cntrs[i])) + mlx5_fc_destroy(oob->peer_dev, oob->cntrs[i]); +} + +static int +mlx5e_hairpin_create_oob_cntrs(struct mlx5e_priv *priv, struct mlx5e_hp_oob_cnt *oob) +{ + int err, i; + + /* Alloc pair of counter on local dev for RX rules */ + for (i = 0; i < HAIRPIN_OOB_NUM_CNT_PER_SET; i++) { + oob->cntrs[i] = mlx5_fc_create(priv->mdev, true); + if (IS_ERR(oob->cntrs[i])) { + err = PTR_ERR(oob->cntrs[i]); + goto err_cntrs; + } + } + + /* Alloc pair of counter on peer dev for TX rules */ + for (; i < HAIRPIN_OOB_NUM_CNT_PER_SET * 2; i++) { + oob->cntrs[i] = mlx5_fc_create(oob->peer_dev, true); + if (IS_ERR(oob->cntrs[i])) { + err = PTR_ERR(oob->cntrs[i]); + goto err_cntrs; + } + } + + return 0; + +err_cntrs: + mlx5e_hairpin_destroy_oob_cntrs(priv, oob); + + return err; +} + +static struct mlx5_modify_hdr * +mlx5e_hairpin_create_oob_cnt_mod_hdr(struct mlx5e_priv *priv, int color) +{ + struct mlx5e_tc_mod_hdr_acts mod_acts = {}; + struct mlx5_modify_hdr *mh; + int err; + + err = mlx5e_tc_match_to_reg_set(priv->mdev, &mod_acts, MLX5_FLOW_NAMESPACE_KERNEL, + HP_OOB_CNT_COLOR_REG, color); + if (err) + return ERR_PTR(err); + + mh = mlx5_modify_header_alloc(priv->mdev, MLX5_FLOW_NAMESPACE_KERNEL, + mod_acts.num_actions, + mod_acts.actions); + + mlx5e_mod_hdr_dealloc(&mod_acts); + + return mh; +} + +static struct mlx5_flow_handle * +mlx5e_hairpin_add_oob_tx_flow(struct mlx5e_priv *priv, struct mlx5_fc *cnt, + struct mlx5_flow_table *ft, + int color) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, }; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_destination dest; + struct mlx5_flow_spec *spec; + + spec = kzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + mlx5e_tc_match_to_reg_match(spec, HP_OOB_TX_CNT_COLOR_REG, + color, MLX5_REG_MAPPING_MASK(HP_OOB_TX_CNT_COLOR_REG)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest.counter_id = mlx5_fc_id(cnt); + flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + + if (IS_ERR(flow_rule)) + mlx5_core_warn(priv->mdev, "Failed to create hairpin oob tx rule, err %d\n", + (int)PTR_ERR(flow_rule)); + + kfree(spec); + + return flow_rule; +} + +static int +mlx5e_hairpin_create_oob_peer_steering(struct mlx5e_priv *priv, struct mlx5e_hp_oob_cnt *oob) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + int err = 0; + + ns = mlx5_get_flow_namespace(oob->peer_dev, MLX5_FLOW_NAMESPACE_EGRESS_KERNEL); + if (!ns) { + netdev_warn(priv->netdev, "Failed to get hairpin oob peer ft namespace type:%d\n", + MLX5_FLOW_NAMESPACE_EGRESS_KERNEL); + return -EOPNOTSUPP; + } + + ft_attr.max_fte = 2; + + ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_warn(priv->netdev, "Failed to create hairpin oob tx table, err %d\n", err); + + return err; + } + oob->tx_ft = ft; + + /* Starting with red color as active color */ + oob->tx_red_rule = mlx5e_hairpin_add_oob_tx_flow(priv, oob->curr_peer_cnt, + ft, MLX5_HP_OOB_CNT_RED); + if (IS_ERR(oob->tx_red_rule)) { + err = PTR_ERR(oob->tx_red_rule); + goto err_red; + } + + oob->tx_blue_rule = mlx5e_hairpin_add_oob_tx_flow(priv, oob->standby_peer_cnt, + ft, MLX5_HP_OOB_CNT_BLUE); + if (IS_ERR(oob->tx_blue_rule)) { + err = PTR_ERR(oob->tx_blue_rule); + goto err_blue; + } + + return 0; 
+ +err_blue: + mlx5_del_flow_rules(oob->tx_red_rule); + +err_red: + mlx5_destroy_flow_table(ft); + + return err; +} + +static int +mlx5e_hairpin_create_oob_steering(struct mlx5e_priv *priv, struct mlx5e_hp_oob_cnt *oob) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + int err = 0; + + ns = mlx5_get_flow_namespace(priv->mdev, MLX5_FLOW_NAMESPACE_KERNEL); + if (!ns) { + netdev_warn(priv->netdev, "Failed to get hp oob cnt ft namespace type:%d\n", + MLX5_FLOW_NAMESPACE_KERNEL); + return -EOPNOTSUPP; + } + + ft_attr.max_fte = 2; + ft_attr.autogroup.max_num_groups = 1; + ft_attr.level = MLX5E_TC_HP_OOB_CNT_LEVEL; + ft_attr.prio = MLX5E_TC_PRIO; + + ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_warn(priv->netdev, "Failed to create hp oob cnt table, err %d\n", err); + return err; + } + + oob->ft = ft; + + /* Create counter color mark modify headers - Start with red as active one*/ + oob->curr_mod_hdr = mlx5e_hairpin_create_oob_cnt_mod_hdr(priv, MLX5_HP_OOB_CNT_RED); + if (IS_ERR(oob->curr_mod_hdr)) { + err = PTR_ERR(oob->curr_mod_hdr); + goto err_mod_hdr; + } + + oob->standby_mod_hdr = mlx5e_hairpin_create_oob_cnt_mod_hdr(priv, MLX5_HP_OOB_CNT_BLUE); + if (IS_ERR(oob->standby_mod_hdr)) { + err = PTR_ERR(oob->standby_mod_hdr); + goto err_mod_hdr2; + } + + err = mlx5e_hairpin_create_oob_peer_steering(priv, oob); + if (err) + goto err_tx; + + return 0; + +err_tx: + mlx5_modify_header_dealloc(priv->mdev, oob->standby_mod_hdr); + +err_mod_hdr2: + mlx5_modify_header_dealloc(priv->mdev, oob->curr_mod_hdr); + +err_mod_hdr: + mlx5_destroy_flow_table(ft); + + return err; +} + +static void +mlx5e_hairpin_destroy_oob_steering(struct mlx5e_priv *priv, struct mlx5e_hp_oob_cnt *oob) +{ + /* Destroy peer TX resources */ + mlx5_del_flow_rules(oob->tx_red_rule); + mlx5_del_flow_rules(oob->tx_blue_rule); + mlx5_destroy_flow_table(oob->tx_ft); + + /* Destroy local RX resources */ + if (!IS_ERR_OR_NULL(oob->rx_rule)) + mlx5_del_flow_rules(oob->rx_rule); + + mlx5_destroy_flow_table(oob->ft); + mlx5_modify_header_dealloc(priv->mdev, oob->curr_mod_hdr); + mlx5_modify_header_dealloc(priv->mdev, oob->standby_mod_hdr); +} + +void mlx5e_hairpin_oob_cnt_get(struct mlx5e_priv *priv, u64 *cnt) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_hp_oob_cnt *oob; + + oob = tc->hp_oob; + + if (!oob) { + *cnt = 0; + return; + } + + mutex_lock(&oob->cnt_lock); + *cnt = oob->drop_cnt; + mutex_unlock(&oob->cnt_lock); +} + +static void mlx5e_queue_hp_oob_work(struct mlx5e_priv *priv, unsigned long delay) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + + queue_delayed_work(priv->wq, &tc->hp_oob->hp_oob_work, delay); +} + +static void mlx5_do_hp_oob_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct mlx5_fc *tmp_peer_cnt, *tmp_cnt; + struct mlx5_modify_hdr *tmp_mod_hdr; + struct mlx5_flow_handle *flow_rule; + struct mlx5e_hp_oob_cnt *oob; + struct mlx5e_priv *priv; + u64 tx_packets = 0; + u64 rx_packets = 0; + u64 bytes, lastuse; + + oob = container_of(delayed_work, struct mlx5e_hp_oob_cnt, + hp_oob_work); + + priv = oob->priv; + /* Grab stats from standby counters before the switch */ + mlx5_fc_query_cached(oob->standby_peer_cnt, &bytes, &tx_packets, &lastuse); + mlx5_fc_query_cached(oob->standby_cnt, &bytes, &rx_packets, &lastuse); + + mutex_lock(&oob->cnt_lock); + oob->drop_cnt += rx_packets - tx_packets; + mutex_unlock(&oob->cnt_lock); + + 
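/* Editor's note -- illustrative sketch, not part of the original patch.
 * The hairpin "oob" drop estimate relies on two counter sets per direction
 * that this work item alternates once a second (the red/blue colors above):
 * the local RX rule counts packets steered into the hairpin and the peer's
 * egress rule counts packets that actually left it, so for a quiesced
 * (standby) counter pair:
 *
 *   drops this interval = rx_packets - tx_packets
 *   e.g. 1,000,000 entered, 999,300 exited  ->  700 dropped
 *
 * Reading only the standby color gives in-flight packets of that color time
 * to drain, so both counters describe the same packet population. One thing
 * that looks worth double-checking: the IS_ERR(flow_rule) early return below
 * exits with hairpin_tbl_lock still held and without re-queuing the work.
 */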
mutex_lock(&priv->fs.tc.hairpin_tbl_lock); + if (oob->dest_valid) { + /* Switch packet color and counter on the RX side */ + flow_rule = mlx5e_hairpin_create_oob_rx_flow(oob, oob->standby_mod_hdr, + oob->standby_cnt); + if (IS_ERR(flow_rule)) + return; + + mlx5_del_flow_rules(oob->rx_rule); + + oob->rx_rule = flow_rule; + } + mutex_unlock(&priv->fs.tc.hairpin_tbl_lock); + + tmp_mod_hdr = oob->curr_mod_hdr; + tmp_cnt = oob->curr_cnt; + tmp_peer_cnt = oob->curr_peer_cnt; + + oob->curr_mod_hdr = oob->standby_mod_hdr; + oob->curr_cnt = oob->standby_cnt; + oob->curr_peer_cnt = oob->standby_peer_cnt; + + oob->standby_mod_hdr = tmp_mod_hdr; + oob->standby_cnt = tmp_cnt; + oob->standby_peer_cnt = tmp_peer_cnt; + + mlx5e_queue_hp_oob_work(oob->priv, HZ); +} + +int +mlx5e_hairpin_oob_cnt_disable(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_hp_oob_cnt *oob; + + oob = tc->hp_oob; + + if (!oob) + return 0; + + /* If not in prio hp mode check that no hp entries + * exist before proceeding. Otherwise, the entries will + * be removed as part of prio hp cleanup. + */ + if (!hash_empty(tc->hairpin_tbl) && !tc->num_prio_hp) { + netdev_warn(priv->netdev, "Can't disable hairpin oob counter while active hairpin rules exist"); + return -EBUSY; + } + + cancel_delayed_work_sync(&oob->hp_oob_work); + mlx5e_hairpin_destroy_oob_steering(priv, oob); + mlx5e_hairpin_destroy_oob_cntrs(priv, oob); + + kfree(oob); + + tc->hp_oob = NULL; + + return 0; +} + +int +mlx5e_hairpin_oob_cnt_enable(struct mlx5e_priv *priv, struct net_device *peer_dev) +{ + struct mlx5e_priv *peer_priv = netdev_priv(peer_dev); + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_core_dev *peer_mdev; + struct mlx5e_hp_oob_cnt *oob; + int err; + + peer_mdev = peer_priv->mdev; + + if (!(priv->netdev->netdev_ops == peer_dev->netdev_ops) || + !mlx5_same_hw_devs(peer_mdev, peer_priv->mdev)) { + netdev_err(priv->netdev, "Can't enable hp oob cnt, peer dev %s is not on the same HW\n", + peer_dev->name); + return -EINVAL; + } + + if (!hash_empty(tc->hairpin_tbl)) { + netdev_warn(priv->netdev, "Can't enable hairpin oob counter while active hairpin rules exist"); + return -EBUSY; + } + + if (!MLX5_CAP_GEN(priv->mdev, hairpin) || !MLX5_CAP_GEN(peer_mdev, hairpin)) { + netdev_warn(priv->netdev, "Can't enable hp oob cnt, hairpin is not supported"); + return -EOPNOTSUPP; + } + + oob = kzalloc(sizeof(*oob), GFP_KERNEL); + if (!oob) + return -ENOMEM; + + oob->peer_dev = peer_mdev; + oob->priv = priv; + + err = mlx5e_hairpin_create_oob_cntrs(priv, oob); + if (err) + goto err_cntrs; + + err = mlx5e_hairpin_create_oob_steering(priv, oob); + if (err) + goto err_steering; + + tc->hp_oob = oob; + + INIT_DELAYED_WORK(&tc->hp_oob->hp_oob_work, mlx5_do_hp_oob_work); + + mlx5e_queue_hp_oob_work(oob->priv, HZ); + + return 0; + +err_steering: + mlx5e_hairpin_destroy_oob_cntrs(priv, oob); + +err_cntrs: + kfree(oob); + + return err; +} + +static struct mlx5e_hairpin * +mlx5e_hairpin_create(struct mlx5e_priv *priv, struct mlx5_hairpin_params *params, + struct mlx5_core_dev *peer_mdev) +{ + struct mlx5_core_dev *func_mdev; + struct mlx5e_hairpin *hp; + struct mlx5_hairpin *pair; + int err; + + hp = kzalloc(sizeof(*hp), GFP_KERNEL); + if (!hp) + return ERR_PTR(-ENOMEM); + + func_mdev = priv->mdev; + if (IS_ERR(peer_mdev)) { + err = PTR_ERR(peer_mdev); + goto create_pair_err; + } + + pair = mlx5_core_hairpin_create(func_mdev, peer_mdev, params); + if (IS_ERR(pair)) { + err = PTR_ERR(pair); + goto create_pair_err; + } + hp->pair = pair; + 
hp->func_mdev = func_mdev; + hp->func_priv = priv; + hp->num_channels = params->num_channels; + + err = mlx5e_hairpin_create_transport(hp); + if (err) + goto create_transport_err; + + if (hp->num_channels > 1) { + err = mlx5e_hairpin_rss_init(hp); + if (err) + goto rss_init_err; + } + + return hp; + +rss_init_err: + mlx5e_hairpin_destroy_transport(hp); +create_transport_err: + mlx5_core_hairpin_destroy(hp->pair); +create_pair_err: + kfree(hp); + return ERR_PTR(err); +} + +static void mlx5e_hairpin_destroy(struct mlx5e_hairpin *hp) +{ + if (hp->num_channels > 1) + mlx5e_hairpin_rss_cleanup(hp); + mlx5e_hairpin_destroy_transport(hp); + mlx5_core_hairpin_destroy(hp->pair); + kvfree(hp); +} + +static inline u32 hash_hairpin_info(u16 peer_vhca_id, u16 prio) +{ + return (peer_vhca_id << 16 | prio); +} + +static struct mlx5e_hairpin_entry *mlx5e_hairpin_get(struct mlx5e_priv *priv, + u16 peer_vhca_id, u16 prio) +{ + struct mlx5e_hairpin_entry *hpe; + u32 hash_key = hash_hairpin_info(peer_vhca_id, prio); + + hash_for_each_possible(priv->fs.tc.hairpin_tbl, hpe, + hairpin_hlist, hash_key) { + if (hpe->peer_vhca_id == peer_vhca_id && hpe->prio == prio) { + refcount_inc(&hpe->refcnt); + return hpe; + } + } + + return NULL; +} + +static void mlx5e_hairpin_put(struct mlx5e_priv *priv, + struct mlx5e_hairpin_entry *hpe) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + + /* no more hairpin flows for us, release the hairpin pair */ + if (!refcount_dec_and_mutex_lock(&hpe->refcnt, &priv->fs.tc.hairpin_tbl_lock)) + return; + hash_del(&hpe->hairpin_hlist); + if (tc->hp_oob && !tc->num_prio_hp) + mlx5e_hairpin_oob_cnt_dest_unset(priv); + mutex_unlock(&priv->fs.tc.hairpin_tbl_lock); + + if (!IS_ERR_OR_NULL(hpe->hp)) { + netdev_dbg(priv->netdev, "del hairpin: peer %s\n", + dev_name(hpe->hp->pair->peer_mdev->device)); + + mlx5e_hairpin_destroy(hpe->hp); + } + + WARN_ON(!list_empty(&hpe->flows)); + kfree(hpe); +} + +#define UNKNOWN_MATCH_PRIO 8 + +static int mlx5e_hairpin_get_prio(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + u16 user_prio, u16 *match_prio, + struct netlink_ext_ack *extack) +{ + void *headers_c, *headers_v; + u8 prio_val, prio_mask = 0; + bool vlan_present; + + if (priv->fs.tc.num_prio_hp) { + *match_prio = user_prio + UNKNOWN_MATCH_PRIO; + + return 0; + } + +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_PCP) { + NL_SET_ERR_MSG_MOD(extack, + "only PCP trust state supported for hairpin"); + return -EOPNOTSUPP; + } +#endif + headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers); + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + + vlan_present = MLX5_GET(fte_match_set_lyr_2_4, headers_v, cvlan_tag); + if (vlan_present) { + prio_mask = MLX5_GET(fte_match_set_lyr_2_4, headers_c, first_prio); + prio_val = MLX5_GET(fte_match_set_lyr_2_4, headers_v, first_prio); + } + + if (!vlan_present || !prio_mask) { + prio_val = 0; + } else if (prio_mask != 0x7) { + NL_SET_ERR_MSG_MOD(extack, + "masked priority match not supported for hairpin"); + return -EOPNOTSUPP; + } + + *match_prio = prio_val; + return 0; +} + +static struct mlx5e_hairpin_entry * +mlx5e_get_hairpin_entry(struct mlx5e_priv *priv, struct mlx5_core_dev *peer_mdev, + u16 match_prio) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_hairpin_params params; + struct mlx5e_hairpin_entry *hpe; + struct mlx5e_hairpin *hp; + u64 link_speed64; + u32 link_speed; + u16 peer_id; + + peer_id = MLX5_CAP_GEN(peer_mdev, vhca_id); + 
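/* Editor's note -- illustrative sketch, not part of the original patch.
 * In the sizing logic just below, legacy (non prio-hp) hairpin gets one
 * channel per 50 Gbps share of the port speed, with log_data_size clamped to
 * the device's log_min/log_max_hairpin_wq_data_sz caps:
 *
 *   25 Gbps port:  max(25000, 50000) / 50000 = 1 channel
 *   100 Gbps port: 100000 / 50000            = 2 channels
 *   200 Gbps port: 200000 / 50000            = 4 channels
 *
 * Priority-hairpin mode instead uses a single channel with the largest
 * supported buffer size.
 */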
mutex_lock(&priv->fs.tc.hairpin_tbl_lock); + hpe = mlx5e_hairpin_get(priv, peer_id, match_prio); + if (hpe) + goto complete; + + hpe = kzalloc(sizeof(*hpe), GFP_KERNEL); + if (!hpe) { + hpe = ERR_PTR(-ENOMEM); + goto complete; + } + + INIT_LIST_HEAD(&hpe->flows); + spin_lock_init(&hpe->flows_lock); + INIT_LIST_HEAD(&hpe->dead_peer_wait_list); + hpe->peer_vhca_id = peer_id; + hpe->prio = match_prio; + refcount_set(&hpe->refcnt, 1); + init_completion(&hpe->res_ready); + + /* set hairpin pair per each 50Gbs share of the link + * unless we are in priority hairpin mode. + */ + if (!tc->num_prio_hp) { + params.log_data_size = 16; + params.log_data_size = min_t(u8, params.log_data_size, + MLX5_CAP_GEN(priv->mdev, log_max_hairpin_wq_data_sz)); + params.log_data_size = max_t(u8, params.log_data_size, + MLX5_CAP_GEN(priv->mdev, log_min_hairpin_wq_data_sz)); + + mlx5e_port_max_linkspeed(priv->mdev, &link_speed); + link_speed = max_t(u32, link_speed, 50000); + link_speed64 = link_speed; + do_div(link_speed64, 50000); + params.num_channels = link_speed64; + } else { /* prio hp uses max buffer with 1 channel */ + params.log_data_size = MLX5_CAP_GEN(priv->mdev, log_max_hairpin_wq_data_sz); + params.num_channels = 1; + } + + params.log_num_packets = params.log_data_size - + MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(priv->mdev); + params.log_num_packets = min_t(u8, params.log_num_packets, + MLX5_CAP_GEN(priv->mdev, log_max_hairpin_num_packets)); + + params.q_counter = priv->q_counter; + + hp = mlx5e_hairpin_create(priv, &params, peer_mdev); + complete_all(&hpe->res_ready); + if (IS_ERR(hp)) { + netdev_warn(priv->netdev, "failed to create hairpin: peer %s prio %d err %ld\n", + dev_name(hp->pair->peer_mdev->device), match_prio, + PTR_ERR(hp)); + kfree(hpe); + hpe = ERR_CAST(hp); + goto complete; + } + + netdev_dbg(priv->netdev, "add hairpin: tirn %x rqn %x peer %s sqn %x prio %d (log) data %d packets %d\n", + mlx5e_tir_get_tirn(&hp->direct_tir), hp->pair->rqn[0], + dev_name(hp->pair->peer_mdev->device), + hp->pair->sqn[0], match_prio, params.log_data_size, params.log_num_packets); + + hpe->hp = hp; + hash_add(priv->fs.tc.hairpin_tbl, &hpe->hairpin_hlist, + hash_hairpin_info(peer_id, match_prio)); + + /* Set the hp destination for the oob counter wa. + * Here it is for the legacy hairpin mode and only for + * priority 0.
+ */ + if (tc->hp_oob && peer_mdev == tc->hp_oob->peer_dev && !match_prio && !tc->num_prio_hp) + mlx5e_hairpin_oob_cnt_dest_set(priv, hp); + +complete: + mutex_unlock(&priv->fs.tc.hairpin_tbl_lock); + + return hpe; +} + +static struct mlx5_flow_handle * +mlx5e_add_prio_hp_flow(struct mlx5e_priv *priv, + struct mlx5e_hairpin_entry *hpe, + int prio) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, }; + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_destination dest; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft; + + ft = tc->hp_fwd; + + spec = kzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + mlx5e_tc_match_to_reg_match(spec, USER_PRIO_TO_REG, + prio, MLX5_REG_MAPPING_MASK(USER_PRIO_TO_REG)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest.tir_num = mlx5e_tir_get_tirn(&hpe->hp->direct_tir); + flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + + if (IS_ERR(flow_rule)) + mlx5_core_warn(priv->mdev, "Failed to create hp fwd rule for prio: %d, err(%d)\n", + prio, (int)PTR_ERR(flow_rule)); + + kfree(spec); + + return flow_rule; +} + +static int mlx5e_prio_hairpin_init_queues(struct mlx5e_priv *priv, u32 num_queues, + struct net_device *peer_dev) +{ + struct mlx5e_priv *peer_priv = netdev_priv(peer_dev); + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_hairpin_entry *hpe; + struct mlx5_core_dev *peer_mdev; + struct mlx5_flow_handle *rule; + struct hlist_node *tmp; + u16 match_prio; + int i, err; + + if (!num_queues) + return 0; + + if (!(priv->netdev->netdev_ops == peer_dev->netdev_ops) || + !mlx5e_same_hw_devs(priv, peer_priv)) { + netdev_err(priv->netdev, "can't alloc hp_queues, peer dev %s is not on the same HW\n", + peer_dev->name); + return -EINVAL; + } + + peer_mdev = peer_priv->mdev; + + if (!MLX5_CAP_GEN(priv->mdev, hairpin) || !MLX5_CAP_GEN(peer_mdev, hairpin)) { + netdev_warn(priv->netdev, "hairpin is not supported"); + return -EOPNOTSUPP; + } + + for (i = 0; i < num_queues; i++) { + err = mlx5e_hairpin_get_prio(priv, NULL, i, &match_prio, + NULL); + if (err) + goto err_queues; + + hpe = mlx5e_get_hairpin_entry(priv, peer_mdev, match_prio); + if (IS_ERR(hpe)) { + err = PTR_ERR(hpe); + goto err_queues; + } + + rule = mlx5e_add_prio_hp_flow(priv, hpe, i); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto err_queues; + } + hpe->fwd_rule = rule; + } + + tc->prio_hp_ppriv = peer_priv; + + return 0; + +err_queues: + hash_for_each_safe(priv->fs.tc.hairpin_tbl, i, tmp, + hpe, hairpin_hlist) { + if (hpe->fwd_rule) + mlx5_del_flow_rules(hpe->fwd_rule); + mlx5e_hairpin_put(priv, hpe); + } + + return err; +} + +static void mlx5e_prio_hairpin_destroy_queues(struct mlx5e_priv *priv) +{ + struct mlx5e_hairpin_entry *hpe; + struct mlx5_rate_limit rl = {0}; + struct mlx5_core_dev *peer_dev; + struct hlist_node *tmp; + int i; + + hash_for_each_safe(priv->fs.tc.hairpin_tbl, i, tmp, + hpe, hairpin_hlist) { + if (!IS_ERR_OR_NULL(hpe->fwd_rule)) + mlx5_del_flow_rules(hpe->fwd_rule); + rl.rate = hpe->hp->rate_limit; + rl.max_burst_sz = hpe->hp->max_pp_burst_size; + peer_dev = hpe->hp->pair->peer_mdev; + mlx5e_hairpin_put(priv, hpe); + + if (rl.rate) + mlx5_rl_remove_rate(peer_dev, &rl); + } +} + +int mlx5e_prio_hairpin_fwd_tbl_create(struct mlx5e_priv *priv, int num_hp) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5e_tc_table *tc = 
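/* Editor's note -- illustrative sketch, not part of the original patch.
 * The hp_fwd table created here holds one entry per configured hairpin
 * priority and matches on metadata_reg_c_1 (the USER_PRIO_TO_REG mapping).
 * TC rules in prio-hp mode point at this table (attr->hairpin_ft = tc->hp_fwd)
 * and are expected to write the user priority into reg_c_1 with a
 * modify-header action elsewhere in this patch, so the path is roughly:
 *
 *   packet -> TC rule (set reg_c_1 = prio) -> hp_fwd table
 *          -> entry matching reg_c_1 == prio -> TIR of hairpin queue 'prio'
 *
 * The per-priority entries themselves are installed by
 * mlx5e_add_prio_hp_flow() above.
 */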
&priv->fs.tc; + struct mlx5_core_dev *dev = priv->mdev; + struct mlx5_flow_namespace *ns; + void *match_criteria, *misc; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + int err = 0; + u32 *flow_group_in; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_KERNEL); + if (!ns) { + netdev_warn(priv->netdev, "Failed to get prio hp fwd ft namespace type:%d\n", + MLX5_FLOW_NAMESPACE_KERNEL); + return -EOPNOTSUPP; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + err = -ENOMEM; + goto out_free; + } + + ft_attr.max_fte = num_hp; + ft_attr.level = MLX5E_TC_TTC_FT_LEVEL; + ft_attr.prio = MLX5E_TC_PRIO; + + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + netdev_warn(priv->netdev, "Failed to create prio hp fwd table, err %d\n", err); + goto out_free; + } + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + misc = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_1, 0xFFFFFFFF); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ft_attr.max_fte - 1); + g = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + netdev_warn(priv->netdev, "Failed to create prio hp fwd group err(%d)\n", err); + goto err_group; + } + + tc->hp_fwd = ft; + tc->hp_fwd_g = g; + atomic_set(&tc->hp_fwd_ref_cnt, 0); + + kvfree(flow_group_in); + + return 0; + +err_group: + mlx5_destroy_flow_table(ft); +out_free: + kvfree(flow_group_in); + + return err; +} + +void mlx5e_prio_hairpin_fwd_tbl_destroy(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + + mlx5_destroy_flow_group(tc->hp_fwd_g); + mlx5_destroy_flow_table(tc->hp_fwd); + tc->hp_fwd = NULL; +} + +int mlx5e_set_prio_hairpin_rate(struct mlx5e_priv *priv, + u16 prio, int rate) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_modify_sq_param msp = {0}; + struct mlx5e_hairpin_entry *hpe; + struct mlx5_rate_limit rl = {0}; + struct mlx5_core_dev *peer_mdev; + struct mlx5e_hairpin *hp; + u16 rl_index = 0; + u16 match_prio; + int err; + + peer_mdev = tc->prio_hp_ppriv->mdev; + if (!peer_mdev) { + netdev_warn(priv->netdev, "Couldn't find peer dev to get hairpin queue entry\n"); + return -EINVAL; + } + + err = mlx5e_hairpin_get_prio(priv, NULL, prio, &match_prio, + NULL); + if (err) { + netdev_warn(priv->netdev, "Set prio hairpin rate: Invalid hairpin priority %d\n", + prio); + return -EINVAL; + } + + hpe = mlx5e_get_hairpin_entry(priv, peer_mdev, match_prio); + if (IS_ERR(hpe)) { + err = PTR_ERR(hpe); + netdev_warn(priv->netdev, "Set prio hairpin rate: Can't find hairpin entry, err %d\n", + err); + return err; + } + + hp = hpe->hp; + if (hp->rate_limit) { + rl.rate = hp->rate_limit; + rl.max_burst_sz = hp->max_pp_burst_size; + /* remove current rl index to free space to next ones */ + mlx5_rl_remove_rate(peer_mdev, &rl); + } + + hp->rate_limit = 0; + + if (rate) { + rl.rate = rate; + rl.max_burst_sz = tc->max_pp_burst_size; + err = mlx5_rl_add_rate(peer_mdev, &rl_index, &rl); + if (err) { + netdev_err(priv->netdev, "Failed configuring rate %u: %d\n", + rate, err); + goto finish; + } + } + + msp.curr_state = MLX5_SQC_STATE_RDY; + msp.next_state = MLX5_SQC_STATE_RDY; + msp.rl_index = rl_index; + msp.rl_update = 
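/* Editor's note -- illustrative sketch, not part of the original patch.
 * Per-priority hairpin rate limiting reuses the device packet-pacing table:
 * mlx5_rl_add_rate() above reserves an entry for (rate, max_burst_sz) and
 * returns its rl_index, and the hairpin SQ on the peer device is then moved
 * RDY -> RDY with rl_update set so it starts using that index. rl_index 0 is
 * the unlimited default, which is why a rate of 0 simply releases the
 * previously reserved entry without programming a new one.
 */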
true; + err = mlx5e_modify_sq(peer_mdev, hp->pair->sqn[0], &msp); + if (err) { + netdev_err(priv->netdev, "Failed configuring rate %u: %d\n", + rate, err); + /* remove the rate from the table */ + if (rate) + mlx5_rl_remove_rate(peer_mdev, &rl); + goto finish; + } + + hp->rate_limit = rate; + hp->max_pp_burst_size = rate ? tc->max_pp_burst_size : 0; + +finish: + mlx5e_hairpin_put(priv, hpe); + return err; +} + +int mlx5e_prio_hairpin_mode_enable(struct mlx5e_priv *priv, int num_hp, + struct net_device *peer_dev) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_prio_hp *prio_hp; + int err, i; + + tc->hp_config = kobject_create_and_add("hp_queues", &priv->netdev->dev.kobj); + if (!tc->hp_config) { + netdev_err(priv->netdev, "can't alloc hp_queues group\n"); + return -ENOMEM; + } + + prio_hp = kcalloc(num_hp, sizeof(*prio_hp), GFP_KERNEL); + if (!prio_hp) { + err = -ENOMEM; + goto err_array; + } + + tc->prio_hp = prio_hp; + /* Create sysfs entry per queue */ + for (i = 0; i < num_hp; i++) { + err = create_prio_hp_sysfs(priv, i); + if (err) + goto err_sysfs; + + prio_hp[i].prio = i; + prio_hp[i].priv = priv; + } + + /* Creating prio2hp forwarding table */ + err = mlx5e_prio_hairpin_fwd_tbl_create(priv, num_hp); + if (err) + goto err_tbl; + + tc->num_prio_hp = num_hp; + + /* Create the prio hp queues */ + err = mlx5e_prio_hairpin_init_queues(priv, num_hp, peer_dev); + if (err) { + netdev_err(priv->netdev, "Can't create %d prio hairpin queues, err %d\n", + num_hp, err); + goto err_queues; + } + + if (tc->hp_oob) + mlx5e_hairpin_oob_cnt_dest_set(priv, NULL); + + return 0; + +err_queues: + mlx5e_prio_hairpin_fwd_tbl_destroy(priv); + tc->num_prio_hp = 0; +err_tbl: +err_sysfs: + for (i--; i >= 0; i--) + kobject_put(&tc->prio_hp[i].kobj); + kfree(tc->prio_hp); +err_array: + kobject_put(tc->hp_config); + + return err; +} + +int mlx5e_prio_hairpin_mode_disable(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + int i = tc->num_prio_hp - 1; + + if (!tc->num_prio_hp) + return 0; + + if (atomic_read(&tc->hp_fwd_ref_cnt)) { + netdev_warn(priv->netdev, "Can't destroy hairpin fwd table, still have flows attached\n"); + return -EBUSY; + } + + mlx5e_prio_hairpin_destroy_queues(priv); + for (; i >= 0; i--) + kobject_put(&tc->prio_hp[i].kobj); + + if (tc->hp_oob) + mlx5e_hairpin_oob_cnt_dest_unset(priv); + + mlx5e_prio_hairpin_fwd_tbl_destroy(priv); + kfree(tc->prio_hp); + kobject_put(tc->hp_config); + + tc->num_prio_hp = 0; + + return 0; +} + +static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct netlink_ext_ack *extack) +{ + struct mlx5_nic_flow_attr *attr = flow->attr->nic_attr; + int peer_ifindex = parse_attr->mirred_ifindex[0]; + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_core_dev *peer_mdev; + struct mlx5e_hairpin_entry *hpe; + u16 match_prio; + int err; + + if (priv->fs.tc.num_prio_hp) { + if (tc->hp_oob && tc->hp_oob->dest_valid) + attr->hairpin_ft = tc->hp_oob->ft; + else + attr->hairpin_ft = tc->hp_fwd; + return 0; + } + + peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex); + if (IS_ERR(peer_mdev)) { + NL_SET_ERR_MSG_MOD(extack, "invalid ifindex of mirred device"); + return PTR_ERR(peer_mdev); + } + + if (!MLX5_CAP_GEN(priv->mdev, hairpin) || !MLX5_CAP_GEN(peer_mdev, hairpin)) { + NL_SET_ERR_MSG_MOD(extack, "hairpin is not supported"); + return -EOPNOTSUPP; + } + + err = mlx5e_hairpin_get_prio(priv, &parse_attr->spec, 0, &match_prio, + extack); + 
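/* Editor's note -- illustrative sketch, not part of the original patch.
 * mlx5e_hairpin_flow_add() resolves the hairpin destination that
 * mlx5e_add_offloaded_nic_rule() consumes later:
 *
 *   prio-hp mode              -> attr->hairpin_ft = hp_oob->ft or tc->hp_fwd
 *   oob counter active        -> attr->hairpin_ft = tc->hp_oob->ft
 *   hpe->hp->num_channels > 1 -> attr->hairpin_ft = hairpin TTC table (RSS)
 *   single channel            -> attr->hairpin_tirn = direct TIR
 *
 * mlx5e_add_offloaded_nic_rule() then turns hairpin_ft into a FLOW_TABLE
 * destination and hairpin_tirn into a TIR destination in dest[0].
 */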
if (err) + return err; + + hpe = mlx5e_get_hairpin_entry(priv, peer_mdev, match_prio); + if (IS_ERR(hpe)) + return PTR_ERR(hpe); + + if (tc->hp_oob && tc->hp_oob->dest_valid) { + attr->hairpin_ft = tc->hp_oob->ft; + } else if (hpe->hp->num_channels > 1) { + flow_flag_set(flow, HAIRPIN_RSS); + attr->hairpin_ft = mlx5_get_ttc_flow_table(hpe->hp->ttc); + } else { + attr->hairpin_tirn = mlx5e_tir_get_tirn(&hpe->hp->direct_tir); + } + + flow->hpe = hpe; + spin_lock(&hpe->flows_lock); + list_add(&flow->hairpin, &hpe->flows); + spin_unlock(&hpe->flows_lock); + + return 0; + +} + +static void mlx5e_hairpin_flow_del(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + /* hp prio mode - flows aren't attached directly + * to hpe. + */ + if (priv->fs.tc.num_prio_hp) + return; + + /* flow wasn't fully initialized */ + if (!flow->hpe) + return; + + spin_lock(&flow->hpe->flows_lock); + list_del(&flow->hairpin); + spin_unlock(&flow->hpe->flows_lock); + + mlx5e_hairpin_put(priv, flow->hpe); + flow->hpe = NULL; +} + +struct mlx5_flow_handle * +mlx5e_add_offloaded_nic_rule(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ + struct mlx5_flow_context *flow_context = &spec->flow_context; + struct mlx5_fs_chains *nic_chains = mlx5e_nic_chains(priv); + struct mlx5_nic_flow_attr *nic_attr = attr->nic_attr; + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_flow_destination dest[2] = {}; + struct mlx5_flow_act flow_act = { + .action = attr->action, + .flags = FLOW_ACT_NO_APPEND, + }; + struct mlx5_flow_handle *rule; + struct mlx5_flow_table *ft; + int dest_ix = 0; + + flow_context->flags |= FLOW_CONTEXT_HAS_TAG; + flow_context->flow_tag = nic_attr->flow_tag; + + if (attr->dest_ft) { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[dest_ix].ft = attr->dest_ft; + dest_ix++; + } else if (nic_attr->hairpin_ft) { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[dest_ix].ft = nic_attr->hairpin_ft; + dest_ix++; + } else if (nic_attr->hairpin_tirn) { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dest[dest_ix].tir_num = nic_attr->hairpin_tirn; + dest_ix++; + } else if (attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + if (attr->dest_chain) { + dest[dest_ix].ft = mlx5_chains_get_table(nic_chains, + attr->dest_chain, 1, + MLX5E_TC_FT_LEVEL); + if (IS_ERR(dest[dest_ix].ft)) + return ERR_CAST(dest[dest_ix].ft); + } else { + dest[dest_ix].ft = mlx5e_vlan_get_flowtable(priv->fs.vlan); + } + dest_ix++; + } + + if (dest[0].type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ignore_flow_level)) + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[dest_ix].counter_id = mlx5_fc_id(attr->counter); + dest_ix++; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + flow_act.modify_hdr = attr->modify_hdr; + + mutex_lock(&tc->t_lock); + if (IS_ERR_OR_NULL(tc->t)) { + /* Create the root table here if doesn't exist yet */ + tc->t = + mlx5_chains_get_table(nic_chains, 0, 1, MLX5E_TC_FT_LEVEL); + + if (IS_ERR(tc->t)) { + mutex_unlock(&tc->t_lock); + netdev_err(priv->netdev, + "Failed to create tc offload table\n"); + rule = ERR_CAST(priv->fs.tc.t); + goto err_ft_get; + } + } + mutex_unlock(&tc->t_lock); + + if (attr->chain || attr->prio) + ft = mlx5_chains_get_table(nic_chains, + attr->chain, attr->prio, + 
MLX5E_TC_FT_LEVEL); + else + ft = attr->ft; + + if (IS_ERR(ft)) { + rule = ERR_CAST(ft); + goto err_ft_get; + } + + if (attr->outer_match_level != MLX5_MATCH_NONE) + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + + rule = mlx5_add_flow_rules(ft, spec, + &flow_act, dest, dest_ix); + if (IS_ERR(rule)) + goto err_rule; + + if (dest[0].type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + dest[0].ft == tc->hp_fwd) + atomic_inc(&tc->hp_fwd_ref_cnt); + + return rule; + +err_rule: + if (attr->chain || attr->prio) + mlx5_chains_put_table(nic_chains, + attr->chain, attr->prio, + MLX5E_TC_FT_LEVEL); +err_ft_get: + if (attr->dest_chain) + mlx5_chains_put_table(nic_chains, + attr->dest_chain, 1, + MLX5E_TC_FT_LEVEL); + + return ERR_CAST(rule); +} + +static int +alloc_flow_attr_counter(struct mlx5_core_dev *counter_dev, + struct mlx5_flow_attr *attr) + +{ + struct mlx5_fc *counter; + + counter = mlx5_fc_create(counter_dev, true); + if (IS_ERR(counter)) + return PTR_ERR(counter); + + attr->counter = counter; + return 0; +} + +static int +mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_core_dev *dev = priv->mdev; + int err; + + parse_attr = attr->parse_attr; + + if (flow_flag_test(flow, HAIRPIN)) { + err = mlx5e_hairpin_flow_add(priv, flow, parse_attr, extack); + if (err) + return err; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + err = alloc_flow_attr_counter(dev, attr); + if (err) + return err; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); + if (err) + return err; + } + + if (attr->flags & MLX5_ATTR_FLAG_CT) + flow->rule[0] = mlx5_tc_ct_flow_offload(get_ct_priv(priv), &parse_attr->spec, + attr, &parse_attr->mod_hdr_acts); + else + flow->rule[0] = mlx5e_add_offloaded_nic_rule(priv, &parse_attr->spec, + attr); + + return PTR_ERR_OR_ZERO(flow->rule[0]); +} + +void mlx5e_del_offloaded_nic_rule(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr) +{ + struct mlx5_fs_chains *nic_chains = mlx5e_nic_chains(priv); + + mlx5_del_flow_rules(rule); + + if (attr->chain || attr->prio) + mlx5_chains_put_table(nic_chains, attr->chain, attr->prio, + MLX5E_TC_FT_LEVEL); + + if (attr->dest_chain) + mlx5_chains_put_table(nic_chains, attr->dest_chain, 1, + MLX5E_TC_FT_LEVEL); +} + +static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5e_tc_table *tc = &priv->fs.tc; + + flow_flag_clear(flow, OFFLOADED); + + if (attr->flags & MLX5_ATTR_FLAG_CT) + mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), attr); + else if (!IS_ERR_OR_NULL(flow->rule[0])) + mlx5e_del_offloaded_nic_rule(priv, flow->rule[0], attr); + + /* Remove root table if no rules are left to avoid + * extra steering hops. 
+ */ + mutex_lock(&priv->fs.tc.t_lock); + if (!mlx5e_tc_num_filters(priv, MLX5_TC_FLAG(NIC_OFFLOAD)) && + !IS_ERR_OR_NULL(tc->t)) { + mlx5_chains_put_table(mlx5e_nic_chains(priv), 0, 1, MLX5E_TC_FT_LEVEL); + priv->fs.tc.t = NULL; + } + mutex_unlock(&priv->fs.tc.t_lock); + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + mlx5e_mod_hdr_dealloc(&attr->parse_attr->mod_hdr_acts); + mlx5e_detach_mod_hdr(priv, flow); + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) + mlx5_fc_destroy(priv->mdev, attr->counter); + + if (flow_flag_test(flow, HAIRPIN)) { + if (tc->num_prio_hp) + atomic_dec(&tc->hp_fwd_ref_cnt); + mlx5e_hairpin_flow_del(priv, flow); + } + + free_flow_post_acts(flow); + + kvfree(attr->parse_attr); + kfree(flow->attr); +} + +struct mlx5_flow_handle * +mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ + struct mlx5_flow_handle *rule; + + if (attr->flags & MLX5_ATTR_FLAG_SLOW_PATH) + return mlx5_eswitch_add_offloaded_rule(esw, spec, attr); + + rule = mlx5e_tc_rule_offload(flow->priv, spec, attr); + + if (IS_ERR(rule)) + return rule; + + if (attr->esw_attr->split_count) { + flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, spec, attr); + if (IS_ERR(flow->rule[1])) + goto err_rule1; + } + + return rule; + +err_rule1: + mlx5e_tc_rule_unoffload(flow->priv, rule, attr); + return flow->rule[1]; +} + +void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr) +{ + flow_flag_clear(flow, OFFLOADED); + + if (attr->flags & MLX5_ATTR_FLAG_SLOW_PATH) + return mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr); + + if (attr->esw_attr->split_count) + mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr); + + mlx5e_tc_rule_unoffload(flow->priv, flow->rule[0], attr); +} + +struct mlx5_flow_handle * +mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec) +{ + struct mlx5e_tc_mod_hdr_acts mod_acts = {}; + struct mlx5e_mod_hdr_handle *mh = NULL; + struct mlx5_flow_attr *slow_attr; + struct mlx5_flow_handle *rule; + bool fwd_and_modify_cap; + u32 chain_mapping = 0; + int err; + + slow_attr = mlx5_alloc_flow_attr(MLX5_FLOW_NAMESPACE_FDB); + if (!slow_attr) + return ERR_PTR(-ENOMEM); + + memcpy(slow_attr, flow->attr, ESW_FLOW_ATTR_SZ); + slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + slow_attr->esw_attr->split_count = 0; + slow_attr->flags |= MLX5_ATTR_FLAG_SLOW_PATH; + + fwd_and_modify_cap = MLX5_CAP_ESW_FLOWTABLE((esw)->dev, fdb_modify_header_fwd_to_table); + if (!fwd_and_modify_cap) + goto skip_restore; + + err = mlx5_chains_get_chain_mapping(esw_chains(esw), flow->attr->chain, &chain_mapping); + if (err) + goto err_get_chain; + + err = mlx5e_tc_match_to_reg_set(esw->dev, &mod_acts, MLX5_FLOW_NAMESPACE_FDB, + CHAIN_TO_REG, chain_mapping); + if (err) + goto err_reg_set; + + mh = mlx5e_mod_hdr_attach(esw->dev, get_mod_hdr_table(flow->priv, flow), + MLX5_FLOW_NAMESPACE_FDB, &mod_acts); + if (IS_ERR(mh)) { + err = PTR_ERR(mh); + goto err_attach; + } + + slow_attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + slow_attr->modify_hdr = mlx5e_mod_hdr_get(mh); + +skip_restore: + rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, slow_attr); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto err_offload; + } + + flow->slow_mh = mh; + flow->chain_mapping = chain_mapping; + flow_flag_set(flow, SLOW); + + mlx5e_mod_hdr_dealloc(&mod_acts); + kfree(slow_attr); + + return 
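/* Editor's note -- illustrative sketch, not part of the original patch.
 * Flows that cannot be fully offloaded yet (for example while an encap
 * neighbour is unresolved) are parked on the slow path: the rule only
 * forwards to the slow-path table so packets surface to software. When the
 * device supports fdb_modify_header_fwd_to_table, the original chain is first
 * encoded into reg_c_0 through a CHAIN_TO_REG modify header, using the id
 * returned by mlx5_chains_get_chain_mapping(), so software can resume tc
 * processing from the chain the packet was on instead of restarting at
 * chain 0; mlx5e_tc_unoffload_from_slow_path() below releases that mapping.
 */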
rule; + +err_offload: + if (fwd_and_modify_cap) + mlx5e_mod_hdr_detach(esw->dev, get_mod_hdr_table(flow->priv, flow), mh); +err_attach: +err_reg_set: + if (fwd_and_modify_cap) + mlx5_chains_put_chain_mapping(esw_chains(esw), chain_mapping); +err_get_chain: + mlx5e_mod_hdr_dealloc(&mod_acts); + kfree(slow_attr); + return ERR_PTR(err); +} + +void mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_flow_attr *slow_attr; + + slow_attr = mlx5_alloc_flow_attr(MLX5_FLOW_NAMESPACE_FDB); + if (!slow_attr) { + mlx5_core_warn(flow->priv->mdev, "Unable to alloc attr to unoffload slow path rule\n"); + return; + } + + memcpy(slow_attr, flow->attr, ESW_FLOW_ATTR_SZ); + slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + slow_attr->esw_attr->split_count = 0; + slow_attr->flags |= MLX5_ATTR_FLAG_SLOW_PATH; + if (flow->slow_mh) { + slow_attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + slow_attr->modify_hdr = mlx5e_mod_hdr_get(flow->slow_mh); + } + mlx5e_tc_unoffload_fdb_rules(esw, flow, slow_attr); + if (flow->slow_mh) { + mlx5e_mod_hdr_detach(esw->dev, get_mod_hdr_table(flow->priv, flow), flow->slow_mh); + mlx5_chains_put_chain_mapping(esw_chains(esw), flow->chain_mapping); + flow->chain_mapping = 0; + flow->slow_mh = NULL; + } + flow_flag_clear(flow, SLOW); + kfree(slow_attr); +} + +/* Caller must obtain uplink_priv->unready_flows_lock mutex before calling this + * function. + */ +static void unready_flow_add(struct mlx5e_tc_flow *flow, + struct list_head *unready_flows) +{ + flow_flag_set(flow, NOT_READY); + list_add_tail(&flow->unready, unready_flows); +} + +/* Caller must obtain uplink_priv->unready_flows_lock mutex before calling this + * function. + */ +static void unready_flow_del(struct mlx5e_tc_flow *flow) +{ + list_del(&flow->unready); + flow_flag_clear(flow, NOT_READY); +} + +static void add_unready_flow(struct mlx5e_tc_flow *flow) +{ + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *rpriv; + struct mlx5_eswitch *esw; + + esw = flow->priv->mdev->priv.eswitch; + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &rpriv->uplink_priv; + + mutex_lock(&uplink_priv->unready_flows_lock); + unready_flow_add(flow, &uplink_priv->unready_flows); + mutex_unlock(&uplink_priv->unready_flows_lock); +} + +static void remove_unready_flow(struct mlx5e_tc_flow *flow) +{ + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *rpriv; + struct mlx5_eswitch *esw; + + esw = flow->priv->mdev->priv.eswitch; + rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &rpriv->uplink_priv; + + mutex_lock(&uplink_priv->unready_flows_lock); + unready_flow_del(flow); + mutex_unlock(&uplink_priv->unready_flows_lock); +} + +bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_dev) +{ + struct mlx5_core_dev *out_mdev, *route_mdev; + struct mlx5e_priv *out_priv, *route_priv; + + out_priv = netdev_priv(out_dev); + out_mdev = out_priv->mdev; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + if (out_mdev->coredev_type != MLX5_COREDEV_PF) + return false; + + if (route_mdev->coredev_type != MLX5_COREDEV_VF && + route_mdev->coredev_type != MLX5_COREDEV_SF) + return false; + + return mlx5e_same_hw_devs(out_priv, route_priv); +} + +int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, u16 *vport) +{ + struct mlx5e_priv *out_priv, *route_priv; + struct mlx5_devcom *devcom = NULL; + struct mlx5_core_dev *route_mdev; + struct 
mlx5_eswitch *esw; + u16 vhca_id; + int err; + + out_priv = netdev_priv(out_dev); + esw = out_priv->mdev->priv.eswitch; + route_priv = netdev_priv(route_dev); + route_mdev = route_priv->mdev; + + vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id); + if (mlx5_lag_is_active(out_priv->mdev)) { + /* In lag case we may get devices from different eswitch instances. + * If we failed to get vport num, it means, mostly, that we on the wrong + * eswitch. + */ + err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport); + if (err != -ENOENT) + return err; + + devcom = out_priv->mdev->priv.devcom; + esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + if (!esw) + return -ENODEV; + } + + err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport); + if (devcom) + mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + return err; +} + +int mlx5e_tc_add_flow_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr) +{ + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts = &attr->parse_attr->mod_hdr_acts; + struct mlx5_modify_hdr *mod_hdr; + + mod_hdr = mlx5_modify_header_alloc(priv->mdev, + mlx5e_get_flow_namespace(flow), + mod_hdr_acts->num_actions, + mod_hdr_acts->actions); + if (IS_ERR(mod_hdr)) + return PTR_ERR(mod_hdr); + + WARN_ON(attr->modify_hdr); + attr->modify_hdr = mod_hdr; + + return 0; +} + +static int +set_encap_dests(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack, + bool *vf_tun) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_esw_flow_attr *esw_attr; + struct net_device *encap_dev = NULL; + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *out_priv; + int out_index; + int err = 0; + + if (!mlx5e_is_eswitch_flow(flow)) + return 0; + + parse_attr = attr->parse_attr; + esw_attr = attr->esw_attr; + *vf_tun = false; + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + struct net_device *out_dev; + int mirred_ifindex; + + if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) + continue; + + mirred_ifindex = parse_attr->mirred_ifindex[out_index]; + out_dev = dev_get_by_index(dev_net(priv->netdev), mirred_ifindex); + if (!out_dev) { + NL_SET_ERR_MSG_MOD(extack, "Requested mirred device not found"); + err = -ENODEV; + goto out; + } + err = mlx5e_attach_encap(priv, flow, attr, out_dev, out_index, + extack, &encap_dev); + dev_put(out_dev); + if (err) + goto out; + + if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE && + !esw_attr->dest_int_port) + *vf_tun = true; + + out_priv = netdev_priv(encap_dev); + rpriv = out_priv->ppriv; + esw_attr->dests[out_index].rep = rpriv->rep; + esw_attr->dests[out_index].mdev = out_priv->mdev; + } + + if (*vf_tun && esw_attr->out_count > 1) { + NL_SET_ERR_MSG_MOD(extack, "VF tunnel encap with mirroring is not supported"); + err = -EOPNOTSUPP; + goto out; + } + +out: + return err; +} + +static void +clean_encap_dests(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + bool *vf_tun) +{ + struct mlx5_esw_flow_attr *esw_attr; + int out_index; + + if (!mlx5e_is_eswitch_flow(flow)) + return; + + esw_attr = attr->esw_attr; + *vf_tun = false; + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) { + if (!(esw_attr->dests[out_index].flags & MLX5_ESW_DEST_ENCAP)) + continue; + + if (esw_attr->dests[out_index].flags & + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE && + !esw_attr->dest_int_port) + *vf_tun = true; + + 
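+		/* Mirror of set_encap_dests(): detach the encap entry of this
+		 * destination and free the tunnel info saved at parse time.
+		 * vf_tun is recomputed the same way so the caller knows whether
+		 * the modify header was flow-private (vf_tun) or taken from the
+		 * shared mod_hdr table.
+		 */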
mlx5e_detach_encap(priv, flow, attr, out_index); + kfree(attr->parse_attr->tun_info[out_index]); + } +} + +static int +mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_esw_flow_attr *esw_attr; + u32 max_prio, max_chain; + bool vf_tun; + int err = 0; + + parse_attr = attr->parse_attr; + esw_attr = attr->esw_attr; + + /* We check chain range only for tc flows. + * For ft flows, we checked attr->chain was originally 0 and set it to + * FDB_FT_CHAIN which is outside tc range. + * See mlx5e_rep_setup_ft_cb(). + */ + max_chain = mlx5_chains_get_chain_range(esw_chains(esw)); + if (!mlx5e_is_ft_flow(flow) && attr->chain > max_chain) { + NL_SET_ERR_MSG_MOD(extack, + "Requested chain is out of supported range"); + err = -EOPNOTSUPP; + goto err_out; + } + + max_prio = mlx5_chains_get_prio_range(esw_chains(esw)); + if (attr->prio > max_prio) { + NL_SET_ERR_MSG_MOD(extack, + "Requested priority is out of supported range"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (flow_flag_test(flow, TUN_RX)) { + err = mlx5e_attach_decap_route(priv, flow); + if (err) + goto err_out; + + if (!attr->chain && esw_attr->int_port && + attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + /* If decap route device is internal port, change the + * source vport value in reg_c0 back to uplink just in + * case the rule performs goto chain > 0. If we have a miss + * on chain > 0 we want the metadata regs to hold the + * chain id so SW will resume handling of this packet + * from the proper chain. + */ + u32 metadata = mlx5_eswitch_get_vport_metadata_for_set(esw, + esw_attr->in_rep->vport); + + err = mlx5e_tc_match_to_reg_set(priv->mdev, &parse_attr->mod_hdr_acts, + MLX5_FLOW_NAMESPACE_KERNEL, VPORT_TO_REG, + metadata); + if (err) + goto err_out; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + } + } + + if (flow_flag_test(flow, L3_TO_L2_DECAP)) { + err = mlx5e_attach_decap(priv, flow, extack); + if (err) + goto err_out; + } + + if (netif_is_ovs_master(parse_attr->filter_dev)) { + struct mlx5e_tc_int_port *int_port; + + if (attr->chain) { + NL_SET_ERR_MSG_MOD(extack, + "Internal port rule is only supported on chain 0"); + err = -EOPNOTSUPP; + goto err_out; + } + + if (attr->dest_chain) { + NL_SET_ERR_MSG_MOD(extack, + "Internal port rule offload doesn't support goto action"); + err = -EOPNOTSUPP; + goto err_out; + } + + int_port = mlx5e_tc_int_port_get(mlx5e_get_int_port_priv(priv), + parse_attr->filter_dev->ifindex, + flow_flag_test(flow, EGRESS) ? 
+ MLX5E_TC_INT_PORT_EGRESS : + MLX5E_TC_INT_PORT_INGRESS); + if (IS_ERR(int_port)) { + err = PTR_ERR(int_port); + goto err_out; + } + + esw_attr->int_port = int_port; + } + + err = set_encap_dests(priv, flow, attr, extack, &vf_tun); + if (err) + goto err_out; + + err = mlx5_eswitch_add_vlan_action(esw, attr); + if (err) + goto err_out; + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + if (vf_tun) { + err = mlx5e_tc_add_flow_mod_hdr(priv, flow, attr); + if (err) + goto err_out; + } else { + err = mlx5e_attach_mod_hdr(priv, flow, parse_attr); + if (err) + goto err_out; + } + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + err = alloc_flow_attr_counter(esw_attr->counter_dev, attr); + if (err) + goto err_out; + } + + /* we get here if one of the following takes place: + * (1) there's no error + * (2) there's an encap action and we don't have valid neigh + */ + if (flow_flag_test(flow, SLOW)) + flow->rule[0] = mlx5e_tc_offload_to_slow_path(esw, flow, &parse_attr->spec); + else + flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr); + + if (IS_ERR(flow->rule[0])) { + err = PTR_ERR(flow->rule[0]); + goto err_out; + } + flow_flag_set(flow, OFFLOADED); + + return 0; + +err_out: + flow_flag_set(flow, FAILED); + return err; +} + +static bool mlx5_flow_has_geneve_opt(struct mlx5e_tc_flow *flow) +{ + struct mlx5_flow_spec *spec = &flow->attr->parse_attr->spec; + void *headers_v = MLX5_ADDR_OF(fte_match_param, + spec->match_value, + misc_parameters_3); + u32 geneve_tlv_opt_0_data = MLX5_GET(fte_match_set_misc3, + headers_v, + geneve_tlv_option_0_data); + + return !!geneve_tlv_opt_0_data; +} + +static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_esw_flow_attr *esw_attr; + bool vf_tun; + + esw_attr = attr->esw_attr; + mlx5e_put_flow_tunnel_id(flow); + + if (flow_flag_test(flow, NOT_READY)) + remove_unready_flow(flow); + + if (mlx5e_is_offloaded_flow(flow)) { + if (flow_flag_test(flow, SLOW)) + mlx5e_tc_unoffload_from_slow_path(esw, flow); + else + mlx5e_tc_unoffload_fdb_rules(esw, flow, attr); + } + complete_all(&flow->del_hw_done); + + if (mlx5_flow_has_geneve_opt(flow)) + mlx5_geneve_tlv_option_del(priv->mdev->geneve); + + mlx5_eswitch_del_vlan_action(esw, attr); + + if (flow->decap_route) + mlx5e_detach_decap_route(priv, flow); + + clean_encap_dests(priv, flow, attr, &vf_tun); + + mlx5_tc_ct_match_del(get_ct_priv(priv), &flow->attr->ct_attr); + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + mlx5e_mod_hdr_dealloc(&attr->parse_attr->mod_hdr_acts); + if (vf_tun && attr->modify_hdr) + mlx5_modify_header_dealloc(priv->mdev, attr->modify_hdr); + else + mlx5e_detach_mod_hdr(priv, flow); + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) + mlx5_fc_destroy(esw_attr->counter_dev, attr->counter); + + if (esw_attr->int_port) + mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->int_port); + + if (esw_attr->dest_int_port) + mlx5e_tc_int_port_put(mlx5e_get_int_port_priv(priv), esw_attr->dest_int_port); + + if (flow_flag_test(flow, L3_TO_L2_DECAP)) + mlx5e_detach_decap(priv, flow); + + free_flow_post_acts(flow); + + kvfree(attr->esw_attr->rx_tun_attr); + kvfree(attr->parse_attr); + kfree(flow->attr); +} + +struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) +{ + struct mlx5_flow_attr *attr; + + attr = list_first_entry(&flow->attrs, struct mlx5_flow_attr, list); + return 
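+	/* stats are read from the counter of the first attr on the flow's attr list */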
attr->counter; +} + +/* Iterate over tmp_list of flows attached to flow_list head. */ +void mlx5e_put_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list) +{ + struct mlx5e_tc_flow *flow, *tmp; + + list_for_each_entry_safe(flow, tmp, flow_list, tmp_list) + mlx5e_flow_put(priv, flow); +} + +static void __mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow) +{ + struct mlx5_eswitch *esw = flow->priv->mdev->priv.eswitch; + + if (!flow_flag_test(flow, ESWITCH) || + !flow_flag_test(flow, DUP)) + return; + + mutex_lock(&esw->offloads.peer_mutex); + list_del(&flow->peer); + mutex_unlock(&esw->offloads.peer_mutex); + + flow_flag_clear(flow, DUP); + + if (refcount_dec_and_test(&flow->peer_flow->refcnt)) { + mlx5e_tc_del_fdb_flow(flow->peer_flow->priv, flow->peer_flow); + kfree(flow->peer_flow); + } + + flow->peer_flow = NULL; +} + +static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow) +{ + struct mlx5_core_dev *dev = flow->priv->mdev; + struct mlx5_devcom *devcom = dev->priv.devcom; + struct mlx5_eswitch *peer_esw; + + peer_esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + if (!peer_esw) + return; + + __mlx5e_tc_del_fdb_peer_flow(flow); + mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); +} + +static void mlx5e_tc_del_flow(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow) +{ + if (mlx5e_is_eswitch_flow(flow)) { + mlx5e_tc_del_fdb_peer_flow(flow); + mlx5e_tc_del_fdb_flow(priv, flow); + } else { + mlx5e_tc_del_nic_flow(priv, flow); + } +} + +static bool flow_requires_tunnel_mapping(u32 chain, struct flow_cls_offload *f) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_action *flow_action = &rule->action; + const struct flow_action_entry *act; + int i; + + if (chain) + return false; + + flow_action_for_each(i, act, flow_action) { + switch (act->id) { + case FLOW_ACTION_GOTO: + return true; + case FLOW_ACTION_SAMPLE: + return true; + default: + continue; + } + } + + return false; +} + +static int +enc_opts_is_dont_care_or_full_match(struct mlx5e_priv *priv, + struct flow_dissector_key_enc_opts *opts, + struct netlink_ext_ack *extack, + bool *dont_care) +{ + struct geneve_opt *opt; + int off = 0; + + *dont_care = true; + + while (opts->len > off) { + opt = (struct geneve_opt *)&opts->data[off]; + + if (!(*dont_care) || opt->opt_class || opt->type || + memchr_inv(opt->opt_data, 0, opt->length * 4)) { + *dont_care = false; + + if (opt->opt_class != htons(U16_MAX) || + opt->type != U8_MAX) { + NL_SET_ERR_MSG_MOD(extack, + "Partial match of tunnel options in chain > 0 isn't supported"); + netdev_warn(priv->netdev, + "Partial match of tunnel options in chain > 0 isn't supported"); + return -EOPNOTSUPP; + } + } + + off += sizeof(struct geneve_opt) + opt->length * 4; + } + + return 0; +} + +#define COPY_DISSECTOR(rule, diss_key, dst)\ +({ \ + struct flow_rule *__rule = (rule);\ + typeof(dst) __dst = dst;\ +\ + memcpy(__dst,\ + skb_flow_dissector_target(__rule->match.dissector,\ + diss_key,\ + __rule->match.key),\ + sizeof(*__dst));\ +}) + +static int mlx5e_get_flow_tunnel_id(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct flow_cls_offload *f, + struct net_device *filter_dev) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts; + struct flow_match_enc_opts enc_opts_match; + struct tunnel_match_enc_opts tun_enc_opts; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5_flow_attr *attr = 
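+	/* The full tunnel match (outer addresses, ports, key id, enc opts and
+	 * the ingress ifindex) is stored in a mapping table and replaced by a
+	 * compact id:
+	 *   value = tun_id << ENC_OPTS_BITS | enc_opts_id
+	 * e.g. tun_id 3 with enc_opts_id 2 encodes as (3 << ENC_OPTS_BITS) | 2.
+	 * Chain 0 rules write this value to TUNNEL_TO_REG; chain > 0 rules
+	 * match on it instead of on the outer headers.
+	 */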
flow->attr; + struct mlx5e_rep_priv *uplink_rpriv; + struct tunnel_match_key tunnel_key; + bool enc_opts_is_dont_care = true; + u32 tun_id, enc_opts_id = 0; + struct mlx5_eswitch *esw; + u32 value, mask; + int err; + + esw = priv->mdev->priv.eswitch; + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + memset(&tunnel_key, 0, sizeof(tunnel_key)); + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, + &tunnel_key.enc_control); + if (tunnel_key.enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + &tunnel_key.enc_ipv4); + else + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + &tunnel_key.enc_ipv6); + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_IP, &tunnel_key.enc_ip); + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, + &tunnel_key.enc_tp); + COPY_DISSECTOR(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, + &tunnel_key.enc_key_id); + tunnel_key.filter_ifindex = filter_dev->ifindex; + + err = mapping_add(uplink_priv->tunnel_mapping, &tunnel_key, &tun_id); + if (err) + return err; + + flow_rule_match_enc_opts(rule, &enc_opts_match); + err = enc_opts_is_dont_care_or_full_match(priv, + enc_opts_match.mask, + extack, + &enc_opts_is_dont_care); + if (err) + goto err_enc_opts; + + if (!enc_opts_is_dont_care) { + memset(&tun_enc_opts, 0, sizeof(tun_enc_opts)); + memcpy(&tun_enc_opts.key, enc_opts_match.key, + sizeof(*enc_opts_match.key)); + memcpy(&tun_enc_opts.mask, enc_opts_match.mask, + sizeof(*enc_opts_match.mask)); + + err = mapping_add(uplink_priv->tunnel_enc_opts_mapping, + &tun_enc_opts, &enc_opts_id); + if (err) + goto err_enc_opts; + } + + value = tun_id << ENC_OPTS_BITS | enc_opts_id; + mask = enc_opts_id ? TUNNEL_ID_MASK : + (TUNNEL_ID_MASK & ~ENC_OPTS_BITS_MASK); + + if (attr->chain) { + mlx5e_tc_match_to_reg_match(&attr->parse_attr->spec, + TUNNEL_TO_REG, value, mask); + } else { + mod_hdr_acts = &attr->parse_attr->mod_hdr_acts; + err = mlx5e_tc_match_to_reg_set(priv->mdev, + mod_hdr_acts, MLX5_FLOW_NAMESPACE_FDB, + TUNNEL_TO_REG, value); + if (err) + goto err_set; + + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + } + + flow->attr->tunnel_id = value; + return 0; + +err_set: + if (enc_opts_id) + mapping_remove(uplink_priv->tunnel_enc_opts_mapping, + enc_opts_id); +err_enc_opts: + mapping_remove(uplink_priv->tunnel_mapping, tun_id); + return err; +} + +static void mlx5e_put_flow_tunnel_id(struct mlx5e_tc_flow *flow) +{ + u32 enc_opts_id = flow->attr->tunnel_id & ENC_OPTS_BITS_MASK; + u32 tun_id = flow->attr->tunnel_id >> ENC_OPTS_BITS; + struct mlx5_rep_uplink_priv *uplink_priv; + struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5_eswitch *esw; + + esw = flow->priv->mdev->priv.eswitch; + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + + if (tun_id) + mapping_remove(uplink_priv->tunnel_mapping, tun_id); + if (enc_opts_id) + mapping_remove(uplink_priv->tunnel_enc_opts_mapping, + enc_opts_id); +} + +void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev, + struct flow_match_basic *match, bool outer, + void *headers_c, void *headers_v) +{ + bool ip_version_cap; + + ip_version_cap = outer ? 
+ MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version) : + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version); + + if (ip_version_cap && match->mask->n_proto == htons(0xFFFF) && + (match->key->n_proto == htons(ETH_P_IP) || + match->key->n_proto == htons(ETH_P_IPV6))) { + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_version); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, + match->key->n_proto == htons(ETH_P_IP) ? 4 : 6); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, + ntohs(match->mask->n_proto)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, + ntohs(match->key->n_proto)); + } +} + +u8 mlx5e_tc_get_ip_version(struct mlx5_flow_spec *spec, bool outer) +{ + void *headers_v; + u16 ethertype; + u8 ip_version; + + if (outer) + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers); + else + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, inner_headers); + + ip_version = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_version); + /* Return ip_version converted from ethertype anyway */ + if (!ip_version) { + ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); + if (ethertype == ETH_P_IP || ethertype == ETH_P_ARP) + ip_version = 4; + else if (ethertype == ETH_P_IPV6) + ip_version = 6; + } + return ip_version; +} + +/* Tunnel device follows RFC 6040, see include/net/inet_ecn.h. + * And changes inner ip_ecn depending on inner and outer ip_ecn as follows: + * +---------+----------------------------------------+ + * |Arriving | Arriving Outer Header | + * | Inner +---------+---------+---------+----------+ + * | Header | Not-ECT | ECT(0) | ECT(1) | CE | + * +---------+---------+---------+---------+----------+ + * | Not-ECT | Not-ECT | Not-ECT | Not-ECT | | + * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE* | + * | ECT(1) | ECT(1) | ECT(1) | ECT(1)* | CE* | + * | CE | CE | CE | CE | CE | + * +---------+---------+---------+---------+----------+ + * + * Tc matches on inner after decapsulation on tunnel device, but hw offload matches + * the inner ip_ecn value before hardware decap action. + * + * Cells marked are changed from original inner packet ip_ecn value during decap, and + * so matching those values on inner ip_ecn before decap will fail. + * + * The following helper allows offload when inner ip_ecn won't be changed by outer ip_ecn, + * except for the outer ip_ecn = CE, where in all cases inner ip_ecn will be changed to CE, + * and such we can drop the inner ip_ecn=CE match. 
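+ * For example, a filter matching enc_tos ecn == ECT(1) cannot be offloaded:
+ * per the table an inner ECT(0) arriving under an outer ECT(1) is rewritten
+ * to ECT(1) by decap, so the pre-decap hardware match and the post-decap tc
+ * match would disagree (see mlx5e_tc_verify_tunnel_ecn() below).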
+ */ + +static int mlx5e_tc_verify_tunnel_ecn(struct mlx5e_priv *priv, + struct flow_cls_offload *f, + bool *match_inner_ecn) +{ + u8 outer_ecn_mask = 0, outer_ecn_key = 0, inner_ecn_mask = 0, inner_ecn_key = 0; + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct flow_match_ip match; + + *match_inner_ecn = true; + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) { + flow_rule_match_enc_ip(rule, &match); + outer_ecn_key = match.key->tos & INET_ECN_MASK; + outer_ecn_mask = match.mask->tos & INET_ECN_MASK; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IP)) { + flow_rule_match_ip(rule, &match); + inner_ecn_key = match.key->tos & INET_ECN_MASK; + inner_ecn_mask = match.mask->tos & INET_ECN_MASK; + } + + if (outer_ecn_mask != 0 && outer_ecn_mask != INET_ECN_MASK) { + NL_SET_ERR_MSG_MOD(extack, "Partial match on enc_tos ecn bits isn't supported"); + netdev_warn(priv->netdev, "Partial match on enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } + + if (!outer_ecn_mask) { + if (!inner_ecn_mask) + return 0; + + NL_SET_ERR_MSG_MOD(extack, + "Matching on tos ecn bits without also matching enc_tos ecn bits isn't supported"); + netdev_warn(priv->netdev, + "Matching on tos ecn bits without also matching enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } + + if (inner_ecn_mask && inner_ecn_mask != INET_ECN_MASK) { + NL_SET_ERR_MSG_MOD(extack, + "Partial match on tos ecn bits with match on enc_tos ecn bits isn't supported"); + netdev_warn(priv->netdev, + "Partial match on tos ecn bits with match on enc_tos ecn bits isn't supported"); + return -EOPNOTSUPP; + } + + if (!inner_ecn_mask) + return 0; + + /* Both inner and outer have full mask on ecn */ + + if (outer_ecn_key == INET_ECN_ECT_1) { + /* inner ecn might change by DECAP action */ + + NL_SET_ERR_MSG_MOD(extack, "Match on enc_tos ecn = ECT(1) isn't supported"); + netdev_warn(priv->netdev, "Match on enc_tos ecn = ECT(1) isn't supported"); + return -EOPNOTSUPP; + } + + if (outer_ecn_key != INET_ECN_CE) + return 0; + + if (inner_ecn_key != INET_ECN_CE) { + /* Can't happen in software, as packet ecn will be changed to CE after decap */ + NL_SET_ERR_MSG_MOD(extack, + "Match on tos enc_tos ecn = CE while match on tos ecn != CE isn't supported"); + netdev_warn(priv->netdev, + "Match on tos enc_tos ecn = CE while match on tos ecn != CE isn't supported"); + return -EOPNOTSUPP; + } + + /* outer ecn = CE, inner ecn = CE, as decap will change inner ecn to CE in anycase, + * drop match on inner ecn + */ + *match_inner_ecn = false; + + return 0; +} + +static int parse_tunnel_attr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + struct net_device *filter_dev, + u8 *match_level, + bool *match_inner) +{ + struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(filter_dev); + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct netlink_ext_ack *extack = f->common.extack; + bool needs_mapping, sets_mapping; + int err; + + if (!mlx5e_is_eswitch_flow(flow)) { + NL_SET_ERR_MSG_MOD(extack, "Match on tunnel is not supported"); + return -EOPNOTSUPP; + } + + needs_mapping = !!flow->attr->chain; + sets_mapping = flow_requires_tunnel_mapping(flow->attr->chain, f); + *match_inner = !needs_mapping; + + if ((needs_mapping || sets_mapping) && + !mlx5_eswitch_reg_c1_loopback_enabled(esw)) { + NL_SET_ERR_MSG_MOD(extack, + "Chains on tunnel devices isn't supported without register loopback support"); + 
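+		/* Both cases rely on the mapped tunnel id stored in metadata
+		 * register C1 surviving hardware loopback so it can be matched
+		 * or restored later, hence the capability check above.
+		 */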
netdev_warn(priv->netdev, + "Chains on tunnel devices isn't supported without register loopback support"); + return -EOPNOTSUPP; + } + + if (!flow->attr->chain) { + err = mlx5e_tc_tun_parse(filter_dev, priv, spec, f, + match_level); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to parse tunnel attributes"); + netdev_warn(priv->netdev, + "Failed to parse tunnel attributes"); + return err; + } + + /* With mpls over udp we decapsulate using packet reformat + * object + */ + if (!netif_is_bareudp(filter_dev)) + flow->attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + err = mlx5e_tc_set_attr_rx_tun(flow, spec); + if (err) + return err; + } else if (tunnel && tunnel->tunnel_type == MLX5E_TC_TUNNEL_TYPE_VXLAN) { + struct mlx5_flow_spec *tmp_spec; + + tmp_spec = kvzalloc(sizeof(*tmp_spec), GFP_KERNEL); + if (!tmp_spec) { + NL_SET_ERR_MSG_MOD(extack, "Failed to allocate memory for vxlan tmp spec"); + netdev_warn(priv->netdev, "Failed to allocate memory for vxlan tmp spec"); + return -ENOMEM; + } + memcpy(tmp_spec, spec, sizeof(*tmp_spec)); + + err = mlx5e_tc_tun_parse(filter_dev, priv, tmp_spec, f, match_level); + if (err) { + kvfree(tmp_spec); + NL_SET_ERR_MSG_MOD(extack, "Failed to parse tunnel attributes"); + netdev_warn(priv->netdev, "Failed to parse tunnel attributes"); + return err; + } + err = mlx5e_tc_set_attr_rx_tun(flow, tmp_spec); + kvfree(tmp_spec); + if (err) + return err; + } + + if (!needs_mapping && !sets_mapping) + return 0; + + return mlx5e_get_flow_tunnel_id(priv, flow, f, filter_dev); +} + +static void *get_match_inner_headers_criteria(struct mlx5_flow_spec *spec) +{ + return MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + inner_headers); +} + +static void *get_match_inner_headers_value(struct mlx5_flow_spec *spec) +{ + return MLX5_ADDR_OF(fte_match_param, spec->match_value, + inner_headers); +} + +static void *get_match_outer_headers_criteria(struct mlx5_flow_spec *spec) +{ + return MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); +} + +static void *get_match_outer_headers_value(struct mlx5_flow_spec *spec) +{ + return MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); +} + +void *mlx5e_get_match_headers_value(u32 flags, struct mlx5_flow_spec *spec) +{ + return (flags & MLX5_FLOW_CONTEXT_ACTION_DECAP) ? + get_match_inner_headers_value(spec) : + get_match_outer_headers_value(spec); +} + +void *mlx5e_get_match_headers_criteria(u32 flags, struct mlx5_flow_spec *spec) +{ + return (flags & MLX5_FLOW_CONTEXT_ACTION_DECAP) ? 
+ get_match_inner_headers_criteria(spec) : + get_match_outer_headers_criteria(spec); +} + +static int mlx5e_flower_parse_meta(struct net_device *filter_dev, + struct flow_cls_offload *f) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct net_device *ingress_dev; + struct flow_match_meta match; + + if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_META)) + return 0; + + flow_rule_match_meta(rule, &match); + if (!match.mask->ingress_ifindex) + return 0; + + if (match.mask->ingress_ifindex != 0xFFFFFFFF) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported ingress ifindex mask"); + return -EOPNOTSUPP; + } + + ingress_dev = __dev_get_by_index(dev_net(filter_dev), + match.key->ingress_ifindex); + if (!ingress_dev) { + NL_SET_ERR_MSG_MOD(extack, + "Can't find the ingress port to match on"); + return -ENOENT; + } + + if (ingress_dev != filter_dev) { + NL_SET_ERR_MSG_MOD(extack, + "Can't match on the ingress filter port"); + return -EOPNOTSUPP; + } + + return 0; +} + +static bool skip_key_basic(struct net_device *filter_dev, + struct flow_cls_offload *f) +{ + /* When doing mpls over udp decap, the user needs to provide + * MPLS_UC as the protocol in order to be able to match on mpls + * label fields. However, the actual ethertype is IP so we want to + * avoid matching on this, otherwise we'll fail the match. + */ + if (netif_is_bareudp(filter_dev) && f->common.chain_index == 0) + return true; + + return false; +} + +static int __parse_cls_flower(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + struct net_device *filter_dev, + u8 *inner_match_level, u8 *outer_match_level, + bool *is_tunnel_flow) +{ + struct netlink_ext_ack *extack = f->common.extack; + void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + void *misc_c_3 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_3); + void *misc_v_3 = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_3); + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct flow_dissector *dissector = rule->match.dissector; + enum fs_flow_table_type fs_type; + bool match_inner_ecn = true; + u16 addr_type = 0; + u8 ip_proto = 0; + u8 *match_level; + int err; + + fs_type = mlx5e_is_eswitch_flow(flow) ? 
FS_FT_FDB : FS_FT_NIC_RX; + match_level = outer_match_level; + + if (dissector->used_keys & + ~(BIT(FLOW_DISSECTOR_KEY_META) | + BIT(FLOW_DISSECTOR_KEY_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_BASIC) | + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_VLAN) | + BIT(FLOW_DISSECTOR_KEY_CVLAN) | + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | + BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | + BIT(FLOW_DISSECTOR_KEY_TCP) | + BIT(FLOW_DISSECTOR_KEY_IP) | + BIT(FLOW_DISSECTOR_KEY_CT) | + BIT(FLOW_DISSECTOR_KEY_ENC_IP) | + BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT(FLOW_DISSECTOR_KEY_ICMP) | + BIT(FLOW_DISSECTOR_KEY_MPLS))) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported key"); + netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n", + dissector->used_keys); + return -EOPNOTSUPP; + } + + if (mlx5e_get_tc_tun(filter_dev)) { + bool match_inner = false; + + err = parse_tunnel_attr(priv, flow, spec, f, filter_dev, + outer_match_level, &match_inner); + if (err) + return err; + + if (match_inner) { + /* header pointers should point to the inner headers + * if the packet was decapsulated already. + * outer headers are set by parse_tunnel_attr. + */ + match_level = inner_match_level; + headers_c = get_match_inner_headers_criteria(spec); + headers_v = get_match_inner_headers_value(spec); + } + *is_tunnel_flow = true; + + err = mlx5e_tc_verify_tunnel_ecn(priv, f, &match_inner_ecn); + if (err) + return err; + } + + err = mlx5e_flower_parse_meta(filter_dev, f); + if (err) + return err; + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC) && + !skip_key_basic(filter_dev, f)) { + struct flow_match_basic match; + + flow_rule_match_basic(rule, &match); + mlx5e_tc_set_ethertype(priv->mdev, &match, + match_level == outer_match_level, + headers_c, headers_v); + + if (match.mask->n_proto) + *match_level = MLX5_MATCH_L2; + } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_VLAN) || + is_vlan_dev(filter_dev)) { + struct flow_dissector_key_vlan filter_dev_mask; + struct flow_dissector_key_vlan filter_dev_key; + struct flow_match_vlan match; + + if (is_vlan_dev(filter_dev)) { + match.key = &filter_dev_key; + match.key->vlan_id = vlan_dev_vlan_id(filter_dev); + match.key->vlan_tpid = vlan_dev_vlan_proto(filter_dev); + match.key->vlan_priority = 0; + match.mask = &filter_dev_mask; + memset(match.mask, 0xff, sizeof(*match.mask)); + match.mask->vlan_priority = 0; + } else { + flow_rule_match_vlan(rule, &match); + } + if (match.mask->vlan_id || + match.mask->vlan_priority || + match.mask->vlan_tpid) { + if (match.key->vlan_tpid == htons(ETH_P_8021AD)) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + svlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + svlan_tag, 1); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + cvlan_tag, 1); + } + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, + match.mask->vlan_id); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, + match.key->vlan_id); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, + match.mask->vlan_priority); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, + match.key->vlan_priority); + + *match_level = MLX5_MATCH_L2; + } + } else if (*match_level != MLX5_MATCH_NONE) { + /* cvlan_tag enabled in match criteria and + * disabled in match value 
means both S & C tags + * don't exist (untagged of both) + */ + MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); + *match_level = MLX5_MATCH_L2; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CVLAN)) { + struct flow_match_vlan match; + + flow_rule_match_cvlan(rule, &match); + if (match.mask->vlan_id || + match.mask->vlan_priority || + match.mask->vlan_tpid) { + if (!MLX5_CAP_FLOWTABLE_TYPE(priv->mdev, ft_field_support.outer_second_vid, + fs_type)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on CVLAN is not supported"); + return -EOPNOTSUPP; + } + + if (match.key->vlan_tpid == htons(ETH_P_8021AD)) { + MLX5_SET(fte_match_set_misc, misc_c, + outer_second_svlan_tag, 1); + MLX5_SET(fte_match_set_misc, misc_v, + outer_second_svlan_tag, 1); + } else { + MLX5_SET(fte_match_set_misc, misc_c, + outer_second_cvlan_tag, 1); + MLX5_SET(fte_match_set_misc, misc_v, + outer_second_cvlan_tag, 1); + } + + MLX5_SET(fte_match_set_misc, misc_c, outer_second_vid, + match.mask->vlan_id); + MLX5_SET(fte_match_set_misc, misc_v, outer_second_vid, + match.key->vlan_id); + MLX5_SET(fte_match_set_misc, misc_c, outer_second_prio, + match.mask->vlan_priority); + MLX5_SET(fte_match_set_misc, misc_v, outer_second_prio, + match.key->vlan_priority); + + *match_level = MLX5_MATCH_L2; + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + } + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct flow_match_eth_addrs match; + + flow_rule_match_eth_addrs(rule, &match); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dmac_47_16), + match.mask->dst); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dmac_47_16), + match.key->dst); + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + smac_47_16), + match.mask->src); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + smac_47_16), + match.key->src); + + if (!is_zero_ether_addr(match.mask->src) || + !is_zero_ether_addr(match.mask->dst)) + *match_level = MLX5_MATCH_L2; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) { + struct flow_match_control match; + + flow_rule_match_control(rule, &match); + addr_type = match.key->addr_type; + + /* the HW doesn't support frag first/later */ + if (match.mask->flags & FLOW_DIS_FIRST_FRAG) { + NL_SET_ERR_MSG_MOD(extack, "Match on frag first/later is not supported"); + return -EOPNOTSUPP; + } + + if (match.mask->flags & FLOW_DIS_IS_FRAGMENT) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, + match.key->flags & FLOW_DIS_IS_FRAGMENT); + + /* the HW doesn't need L3 inline to match on frag=no */ + if (!(match.key->flags & FLOW_DIS_IS_FRAGMENT)) + *match_level = MLX5_MATCH_L2; + /* *** L2 attributes parsing up to here *** */ + else + *match_level = MLX5_MATCH_L3; + } + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { + struct flow_match_basic match; + + flow_rule_match_basic(rule, &match); + ip_proto = match.key->ip_proto; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + match.mask->ip_proto); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + match.key->ip_proto); + + if (match.mask->ip_proto) + *match_level = MLX5_MATCH_L3; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + struct flow_match_ipv4_addrs match; + + flow_rule_match_ipv4_addrs(rule, &match); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &match.mask->src, sizeof(match.mask->src)); + 
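+		/* headers_c receives the match criteria (mask) and headers_v
+		 * the match value (key); each dissector field is written to
+		 * both.
+		 */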
memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &match.key->src, sizeof(match.key->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &match.mask->dst, sizeof(match.mask->dst)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &match.key->dst, sizeof(match.key->dst)); + + if (match.mask->src || match.mask->dst) + *match_level = MLX5_MATCH_L3; + } + + if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + struct flow_match_ipv6_addrs match; + + flow_rule_match_ipv6_addrs(rule, &match); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.mask->src, sizeof(match.mask->src)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &match.key->src, sizeof(match.key->src)); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.mask->dst, sizeof(match.mask->dst)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &match.key->dst, sizeof(match.key->dst)); + + if (ipv6_addr_type(&match.mask->src) != IPV6_ADDR_ANY || + ipv6_addr_type(&match.mask->dst) != IPV6_ADDR_ANY) + *match_level = MLX5_MATCH_L3; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IP)) { + struct flow_match_ip match; + + flow_rule_match_ip(rule, &match); + if (match_inner_ecn) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_ecn, + match.mask->tos & 0x3); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, + match.key->tos & 0x3); + } + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_dscp, + match.mask->tos >> 2); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, + match.key->tos >> 2); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ttl_hoplimit, + match.mask->ttl); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ttl_hoplimit, + match.key->ttl); + + if (match.mask->ttl && + !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, + ft_field_support.outer_ipv4_ttl)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on TTL is not supported"); + return -EOPNOTSUPP; + } + + if (match.mask->tos || match.mask->ttl) + *match_level = MLX5_MATCH_L3; + } + + /* *** L3 attributes parsing up to here *** */ + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS)) { + struct flow_match_ports match; + + flow_rule_match_ports(rule, &match); + switch (ip_proto) { + case IPPROTO_TCP: + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + tcp_sport, ntohs(match.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + tcp_sport, ntohs(match.key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + tcp_dport, ntohs(match.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + tcp_dport, ntohs(match.key->dst)); + break; + + case IPPROTO_UDP: + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_sport, ntohs(match.mask->src)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_sport, ntohs(match.key->src)); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + udp_dport, ntohs(match.mask->dst)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + udp_dport, ntohs(match.key->dst)); + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Only UDP and TCP transports are supported for L4 matching"); + netdev_err(priv->netdev, + "Only UDP and TCP transport are supported\n"); + return -EINVAL; + } + + if (match.mask->src || match.mask->dst) + *match_level = MLX5_MATCH_L4; + } + + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_TCP)) { + struct 
flow_match_tcp match; + + flow_rule_match_tcp(rule, &match); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_flags, + ntohs(match.mask->flags)); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_flags, + ntohs(match.key->flags)); + + if (match.mask->flags) + *match_level = MLX5_MATCH_L4; + } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP)) { + struct flow_match_icmp match; + + flow_rule_match_icmp(rule, &match); + switch (ip_proto) { + case IPPROTO_ICMP: + if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_ICMP)) { + NL_SET_ERR_MSG_MOD(extack, + "Match on Flex protocols for ICMP is not supported"); + return -EOPNOTSUPP; + } + MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_type, + match.mask->type); + MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_type, + match.key->type); + MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_code, + match.mask->code); + MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_code, + match.key->code); + break; + case IPPROTO_ICMPV6: + if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_ICMPV6)) { + NL_SET_ERR_MSG_MOD(extack, + "Match on Flex protocols for ICMPV6 is not supported"); + return -EOPNOTSUPP; + } + MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_type, + match.mask->type); + MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_type, + match.key->type); + MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_code, + match.mask->code); + MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_code, + match.key->code); + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Code and type matching only with ICMP and ICMPv6"); + netdev_err(priv->netdev, + "Code and type matching only with ICMP and ICMPv6\n"); + return -EINVAL; + } + if (match.mask->code || match.mask->type) { + *match_level = MLX5_MATCH_L4; + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3; + } + } + /* Currently supported only for MPLS over UDP */ + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) && + !netif_is_bareudp(filter_dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Matching on MPLS is supported only for MPLS over UDP"); + netdev_err(priv->netdev, + "Matching on MPLS is supported only for MPLS over UDP\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int parse_cls_flower(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_spec *spec, + struct flow_cls_offload *f, + struct net_device *filter_dev) +{ + u8 inner_match_level, outer_match_level, non_tunnel_match_level; + struct netlink_ext_ack *extack = f->common.extack; + struct mlx5_core_dev *dev = priv->mdev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch_rep *rep; + bool is_eswitch_flow, is_tunnel_flow; + int err; + + inner_match_level = MLX5_MATCH_NONE; + outer_match_level = MLX5_MATCH_NONE; + is_tunnel_flow = false; + + err = __parse_cls_flower(priv, flow, spec, f, filter_dev, + &inner_match_level, &outer_match_level, + &is_tunnel_flow); + + non_tunnel_match_level = (inner_match_level == MLX5_MATCH_NONE) ? 
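+	/* For eswitch flows on non-uplink vports, the eswitch min inline mode
+	 * (unless it is NONE) must cover the deepest non-tunnel header the
+	 * rule matches on (L2/L3/L4); otherwise the check below rejects the
+	 * offload.
+	 */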
+ outer_match_level : inner_match_level; + + is_eswitch_flow = mlx5e_is_eswitch_flow(flow); + if (!err && is_eswitch_flow) { + rep = rpriv->rep; + if (rep->vport != MLX5_VPORT_UPLINK && + (esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE && + esw->offloads.inline_mode < non_tunnel_match_level)) { + NL_SET_ERR_MSG_MOD(extack, + "Flow is not offloaded due to min inline setting"); + netdev_warn(priv->netdev, + "Flow is not offloaded due to min inline setting, required %d actual %d\n", + non_tunnel_match_level, esw->offloads.inline_mode); + return -EOPNOTSUPP; + } + } + + flow->attr->inner_match_level = inner_match_level; + flow->attr->outer_match_level = outer_match_level; + + return err; +} + +struct mlx5_fields { + u8 field; + u8 field_bsize; + u32 field_mask; + u32 offset; + u32 match_offset; +}; + +#define OFFLOAD(fw_field, field_bsize, field_mask, field, off, match_field) \ + {MLX5_ACTION_IN_FIELD_OUT_ ## fw_field, field_bsize, field_mask, \ + offsetof(struct pedit_headers, field) + (off), \ + MLX5_BYTE_OFF(fte_match_set_lyr_2_4, match_field)} + +/* masked values are the same and there are no rewrites that do not have a + * match. + */ +#define SAME_VAL_MASK(type, valp, maskp, matchvalp, matchmaskp) ({ \ + type matchmaskx = *(type *)(matchmaskp); \ + type matchvalx = *(type *)(matchvalp); \ + type maskx = *(type *)(maskp); \ + type valx = *(type *)(valp); \ + \ + (valx & maskx) == (matchvalx & matchmaskx) && !(maskx & (maskx ^ \ + matchmaskx)); \ +}) + +static bool cmp_val_mask(void *valp, void *maskp, void *matchvalp, + void *matchmaskp, u8 bsize) +{ + bool same = false; + + switch (bsize) { + case 8: + same = SAME_VAL_MASK(u8, valp, maskp, matchvalp, matchmaskp); + break; + case 16: + same = SAME_VAL_MASK(u16, valp, maskp, matchvalp, matchmaskp); + break; + case 32: + same = SAME_VAL_MASK(u32, valp, maskp, matchvalp, matchmaskp); + break; + } + + return same; +} + +static struct mlx5_fields fields[] = { + OFFLOAD(DMAC_47_16, 32, U32_MAX, eth.h_dest[0], 0, dmac_47_16), + OFFLOAD(DMAC_15_0, 16, U16_MAX, eth.h_dest[4], 0, dmac_15_0), + OFFLOAD(SMAC_47_16, 32, U32_MAX, eth.h_source[0], 0, smac_47_16), + OFFLOAD(SMAC_15_0, 16, U16_MAX, eth.h_source[4], 0, smac_15_0), + OFFLOAD(ETHERTYPE, 16, U16_MAX, eth.h_proto, 0, ethertype), + OFFLOAD(FIRST_VID, 16, U16_MAX, vlan.h_vlan_TCI, 0, first_vid), + + OFFLOAD(IP_DSCP, 8, 0xfc, ip4.tos, 0, ip_dscp), + OFFLOAD(IP_TTL, 8, U8_MAX, ip4.ttl, 0, ttl_hoplimit), + OFFLOAD(SIPV4, 32, U32_MAX, ip4.saddr, 0, src_ipv4_src_ipv6.ipv4_layout.ipv4), + OFFLOAD(DIPV4, 32, U32_MAX, ip4.daddr, 0, dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + + OFFLOAD(SIPV6_127_96, 32, U32_MAX, ip6.saddr.s6_addr32[0], 0, + src_ipv4_src_ipv6.ipv6_layout.ipv6[0]), + OFFLOAD(SIPV6_95_64, 32, U32_MAX, ip6.saddr.s6_addr32[1], 0, + src_ipv4_src_ipv6.ipv6_layout.ipv6[4]), + OFFLOAD(SIPV6_63_32, 32, U32_MAX, ip6.saddr.s6_addr32[2], 0, + src_ipv4_src_ipv6.ipv6_layout.ipv6[8]), + OFFLOAD(SIPV6_31_0, 32, U32_MAX, ip6.saddr.s6_addr32[3], 0, + src_ipv4_src_ipv6.ipv6_layout.ipv6[12]), + OFFLOAD(DIPV6_127_96, 32, U32_MAX, ip6.daddr.s6_addr32[0], 0, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6[0]), + OFFLOAD(DIPV6_95_64, 32, U32_MAX, ip6.daddr.s6_addr32[1], 0, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6[4]), + OFFLOAD(DIPV6_63_32, 32, U32_MAX, ip6.daddr.s6_addr32[2], 0, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6[8]), + OFFLOAD(DIPV6_31_0, 32, U32_MAX, ip6.daddr.s6_addr32[3], 0, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6[12]), + OFFLOAD(IPV6_HOPLIMIT, 8, U8_MAX, ip6.hop_limit, 0, ttl_hoplimit), + OFFLOAD(IP_DSCP, 16, 0xc00f, ip6, 
0, ip_dscp), + + OFFLOAD(TCP_SPORT, 16, U16_MAX, tcp.source, 0, tcp_sport), + OFFLOAD(TCP_DPORT, 16, U16_MAX, tcp.dest, 0, tcp_dport), + /* in linux iphdr tcp_flags is 8 bits long */ + OFFLOAD(TCP_FLAGS, 8, U8_MAX, tcp.ack_seq, 5, tcp_flags), + + OFFLOAD(UDP_SPORT, 16, U16_MAX, udp.source, 0, udp_sport), + OFFLOAD(UDP_DPORT, 16, U16_MAX, udp.dest, 0, udp_dport), +}; + +static unsigned long mask_to_le(unsigned long mask, int size) +{ + __be32 mask_be32; + __be16 mask_be16; + + if (size == 32) { + mask_be32 = (__force __be32)(mask); + mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32)); + } else if (size == 16) { + mask_be32 = (__force __be32)(mask); + mask_be16 = *(__be16 *)&mask_be32; + mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16)); + } + + return mask; +} + +static int offload_pedit_fields(struct mlx5e_priv *priv, + int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr, + u32 *action_flags, + struct netlink_ext_ack *extack) +{ + struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; + struct pedit_headers_action *hdrs = parse_attr->hdrs; + void *headers_c, *headers_v, *action, *vals_p; + u32 *s_masks_p, *a_masks_p, s_mask, a_mask; + struct mlx5e_tc_mod_hdr_acts *mod_acts; + unsigned long mask, field_mask; + int i, first, last, next_z; + struct mlx5_fields *f; + u8 cmd; + + mod_acts = &parse_attr->mod_hdr_acts; + headers_c = mlx5e_get_match_headers_criteria(*action_flags, &parse_attr->spec); + headers_v = mlx5e_get_match_headers_value(*action_flags, &parse_attr->spec); + + set_masks = &hdrs[0].masks; + add_masks = &hdrs[1].masks; + set_vals = &hdrs[0].vals; + add_vals = &hdrs[1].vals; + + for (i = 0; i < ARRAY_SIZE(fields); i++) { + bool skip; + + f = &fields[i]; + /* avoid seeing bits set from previous iterations */ + s_mask = 0; + a_mask = 0; + + s_masks_p = (void *)set_masks + f->offset; + a_masks_p = (void *)add_masks + f->offset; + + s_mask = *s_masks_p & f->field_mask; + a_mask = *a_masks_p & f->field_mask; + + if (!s_mask && !a_mask) /* nothing to offload here */ + continue; + + if (s_mask && a_mask) { + NL_SET_ERR_MSG_MOD(extack, + "can't set and add to the same HW field"); + netdev_warn(priv->netdev, + "mlx5: can't set and add to the same HW field (%x)\n", + f->field); + return -EOPNOTSUPP; + } + + skip = false; + if (s_mask) { + void *match_mask = headers_c + f->match_offset; + void *match_val = headers_v + f->match_offset; + + cmd = MLX5_ACTION_TYPE_SET; + mask = s_mask; + vals_p = (void *)set_vals + f->offset; + /* don't rewrite if we have a match on the same value */ + if (cmp_val_mask(vals_p, s_masks_p, match_val, + match_mask, f->field_bsize)) + skip = true; + /* clear to denote we consumed this field */ + *s_masks_p &= ~f->field_mask; + } else { + cmd = MLX5_ACTION_TYPE_ADD; + mask = a_mask; + vals_p = (void *)add_vals + f->offset; + /* add 0 is no change */ + if ((*(u32 *)vals_p & f->field_mask) == 0) + skip = true; + /* clear to denote we consumed this field */ + *a_masks_p &= ~f->field_mask; + } + if (skip) + continue; + + mask = mask_to_le(mask, f->field_bsize); + + first = find_first_bit(&mask, f->field_bsize); + next_z = find_next_zero_bit(&mask, f->field_bsize, first); + last = find_last_bit(&mask, f->field_bsize); + if (first < next_z && next_z < last) { + NL_SET_ERR_MSG_MOD(extack, + "rewrite of few sub-fields isn't supported"); + netdev_warn(priv->netdev, + "mlx5: rewrite of few sub-fields (mask %lx) isn't offloaded\n", + mask); + return -EOPNOTSUPP; + } + + action = mlx5e_mod_hdr_alloc(priv->mdev, namespace, 
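+		/* Illustration of the contiguity check above: after mask_to_le()
+		 * a mask of 0x0ff0 gives first=4, next_z=12, last=11 and is
+		 * offloaded, while 0x0f0f gives first=0, next_z=4, last=11 and
+		 * is rejected as a rewrite of disjoint sub-fields.
+		 */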
mod_acts); + if (IS_ERR(action)) { + NL_SET_ERR_MSG_MOD(extack, + "too many pedit actions, can't offload"); + mlx5_core_warn(priv->mdev, + "mlx5: parsed %d pedit actions, can't do more\n", + mod_acts->num_actions); + return PTR_ERR(action); + } + + MLX5_SET(set_action_in, action, action_type, cmd); + MLX5_SET(set_action_in, action, field, f->field); + + if (cmd == MLX5_ACTION_TYPE_SET) { + int start; + + field_mask = mask_to_le(f->field_mask, f->field_bsize); + + /* if field is bit sized it can start not from first bit */ + start = find_first_bit(&field_mask, f->field_bsize); + + MLX5_SET(set_action_in, action, offset, first - start); + /* length is num of bits to be written, zero means length of 32 */ + MLX5_SET(set_action_in, action, length, (last - first + 1)); + } + + if (f->field_bsize == 32) + MLX5_SET(set_action_in, action, data, ntohl(*(__be32 *)vals_p) >> first); + else if (f->field_bsize == 16) + MLX5_SET(set_action_in, action, data, ntohs(*(__be16 *)vals_p) >> first); + else if (f->field_bsize == 8) + MLX5_SET(set_action_in, action, data, *(u8 *)vals_p >> first); + + ++mod_acts->num_actions; + } + + return 0; +} + +static const struct pedit_headers zero_masks = {}; + +static int verify_offload_pedit_fields(struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct netlink_ext_ack *extack) +{ + struct pedit_headers *cmd_masks; + u8 cmd; + + for (cmd = 0; cmd < __PEDIT_CMD_MAX; cmd++) { + cmd_masks = &parse_attr->hdrs[cmd].masks; + if (memcmp(cmd_masks, &zero_masks, sizeof(zero_masks))) { + NL_SET_ERR_MSG_MOD(extack, "attempt to offload an unsupported field"); + netdev_warn(priv->netdev, "attempt to offload an unsupported field (cmd %d)\n", cmd); + print_hex_dump(KERN_WARNING, "mask: ", DUMP_PREFIX_ADDRESS, + 16, 1, cmd_masks, sizeof(zero_masks), true); + return -EOPNOTSUPP; + } + } + + return 0; +} + +static int alloc_tc_pedit_action(struct mlx5e_priv *priv, int namespace, + struct mlx5e_tc_flow_parse_attr *parse_attr, + u32 *action_flags, + struct netlink_ext_ack *extack) +{ + int err; + + err = offload_pedit_fields(priv, namespace, parse_attr, action_flags, extack); + if (err) + goto out_dealloc_parsed_actions; + + err = verify_offload_pedit_fields(priv, parse_attr, extack); + if (err) + goto out_dealloc_parsed_actions; + + return 0; + +out_dealloc_parsed_actions: + mlx5e_mod_hdr_dealloc(&parse_attr->mod_hdr_acts); + return err; +} + +struct ip_ttl_word { + __u8 ttl; + __u8 protocol; + __sum16 check; +}; + +struct ipv6_hoplimit_word { + __be16 payload_len; + __u8 nexthdr; + __u8 hop_limit; +}; + +static bool +is_action_keys_supported(const struct flow_action_entry *act, bool ct_flow, + bool *modify_ip_header, bool *modify_tuple, + struct netlink_ext_ack *extack) +{ + u32 mask, offset; + u8 htype; + + htype = act->mangle.htype; + offset = act->mangle.offset; + mask = ~act->mangle.mask; + /* For IPv4 & IPv6 header check 4 byte word, + * to determine that modified fields + * are NOT ttl & hop_limit only. 
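+	 * In other words, a pedit that only rewrites ttl/hop_limit keeps
+	 * modify_ip_header false; any other modified IP field sets it.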
+ */ + if (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP4) { + struct ip_ttl_word *ttl_word = + (struct ip_ttl_word *)&mask; + + if (offset != offsetof(struct iphdr, ttl) || + ttl_word->protocol || + ttl_word->check) { + *modify_ip_header = true; + } + + if (offset >= offsetof(struct iphdr, saddr)) + *modify_tuple = true; + + if (ct_flow && *modify_tuple) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload re-write of ipv4 address with action ct"); + return false; + } + } else if (htype == FLOW_ACT_MANGLE_HDR_TYPE_IP6) { + struct ipv6_hoplimit_word *hoplimit_word = + (struct ipv6_hoplimit_word *)&mask; + + if (offset != offsetof(struct ipv6hdr, payload_len) || + hoplimit_word->payload_len || + hoplimit_word->nexthdr) { + *modify_ip_header = true; + } + + if (ct_flow && offset >= offsetof(struct ipv6hdr, saddr)) + *modify_tuple = true; + + if (ct_flow && *modify_tuple) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload re-write of ipv6 address with action ct"); + return false; + } + } else if (htype == FLOW_ACT_MANGLE_HDR_TYPE_TCP || + htype == FLOW_ACT_MANGLE_HDR_TYPE_UDP) { + *modify_tuple = true; + if (ct_flow) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload re-write of transport header ports with action ct"); + return false; + } + } + + return true; +} + +static bool modify_tuple_supported(bool modify_tuple, bool ct_clear, + bool ct_flow, struct netlink_ext_ack *extack, + struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec) +{ + if (!modify_tuple || ct_clear) + return true; + + if (ct_flow) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload tuple modification with non-clear ct()"); + netdev_info(priv->netdev, + "can't offload tuple modification with non-clear ct()"); + return false; + } + + /* Add ct_state=-trk match so it will be offloaded for non ct flows + * (or after clear action), as otherwise, since the tuple is changed, + * we can't restore ct state + */ + if (mlx5_tc_ct_add_no_trk_match(spec)) { + NL_SET_ERR_MSG_MOD(extack, + "can't offload tuple modification with ct matches and no ct(clear) action"); + netdev_info(priv->netdev, + "can't offload tuple modification with ct matches and no ct(clear) action"); + return false; + } + + return true; +} + +static bool modify_header_match_supported(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct flow_action *flow_action, + u32 actions, bool ct_flow, + bool ct_clear, + struct netlink_ext_ack *extack) +{ + const struct flow_action_entry *act; + bool modify_ip_header, modify_tuple; + void *headers_c; + void *headers_v; + u16 ethertype; + u8 ip_proto; + int i; + + headers_c = mlx5e_get_match_headers_criteria(actions, spec); + headers_v = mlx5e_get_match_headers_value(actions, spec); + ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype); + + /* for non-IP we only re-write MACs, so we're okay */ + if (MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_version) == 0 && + ethertype != ETH_P_IP && ethertype != ETH_P_IPV6) + goto out_ok; + + modify_ip_header = false; + modify_tuple = false; + flow_action_for_each(i, act, flow_action) { + if (act->id != FLOW_ACTION_MANGLE && + act->id != FLOW_ACTION_ADD) + continue; + + if (!is_action_keys_supported(act, ct_flow, + &modify_ip_header, + &modify_tuple, extack)) + return false; + } + + if (!modify_tuple_supported(modify_tuple, ct_clear, ct_flow, extack, + priv, spec)) + return false; + + ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol); + if (modify_ip_header && ip_proto != IPPROTO_TCP && + ip_proto != IPPROTO_UDP && ip_proto != IPPROTO_ICMP) { + NL_SET_ERR_MSG_MOD(extack, 
+ "can't offload re-write of non TCP/UDP"); + netdev_info(priv->netdev, "can't offload re-write of ip proto %d\n", + ip_proto); + return false; + } + +out_ok: + return true; +} + +static bool +actions_match_supported_fdb(struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr; + bool ct_flow, ct_clear; + + ct_clear = flow->attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR; + ct_flow = flow_flag_test(flow, CT) && !ct_clear; + + if (esw_attr->split_count && ct_flow && + !MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve)) { + /* All registers used by ct are cleared when using + * split rules. + */ + NL_SET_ERR_MSG_MOD(extack, "Can't offload mirroring with action ct"); + return false; + } + + if (esw_attr->split_count > 0 && !mlx5_esw_has_fwd_fdb(priv->mdev)) { + NL_SET_ERR_MSG_MOD(extack, + "current firmware doesn't support split rule for port mirroring"); + netdev_warn_once(priv->netdev, + "current firmware doesn't support split rule for port mirroring\n"); + return false; + } + + return true; +} + +static bool +actions_match_supported(struct mlx5e_priv *priv, + struct flow_action *flow_action, + u32 actions, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + bool ct_flow, ct_clear; + + ct_clear = flow->attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR; + ct_flow = flow_flag_test(flow, CT) && !ct_clear; + + if (!(actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { + NL_SET_ERR_MSG_MOD(extack, "Rule must have at least one forward/drop action"); + return false; + } + + if (!(~actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { + NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action"); + return false; + } + + if (!(~actions & + (MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_DROP))) { + NL_SET_ERR_MSG_MOD(extack, "Rule cannot support forward+drop action"); + return false; + } + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { + NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); + return false; + } + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + actions & MLX5_FLOW_CONTEXT_ACTION_DROP) { + NL_SET_ERR_MSG_MOD(extack, "Drop with modify header action is not supported"); + return false; + } + + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + !modify_header_match_supported(priv, &parse_attr->spec, flow_action, + actions, ct_flow, ct_clear, extack)) + return false; + + if (mlx5e_is_eswitch_flow(flow) && + !actions_match_supported_fdb(priv, parse_attr, flow, extack)) + return false; + + return true; +} + +static bool same_port_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv) +{ + return priv->mdev == peer_priv->mdev; +} + +bool mlx5e_same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv) +{ + struct mlx5_core_dev *fmdev, *pmdev; + u64 fsystem_guid, psystem_guid; + + fmdev = priv->mdev; + pmdev = peer_priv->mdev; + + fsystem_guid = mlx5_query_nic_system_image_guid(fmdev); + psystem_guid = mlx5_query_nic_system_image_guid(pmdev); + + return (fsystem_guid == psystem_guid); +} + +static int +actions_prepare_mod_hdr_actions(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr, + struct netlink_ext_ack *extack) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr = attr->parse_attr; + struct 
pedit_headers_action *hdrs = parse_attr->hdrs; + enum mlx5_flow_namespace_type ns_type; + int err; + + if (!hdrs[TCA_PEDIT_KEY_EX_CMD_SET].pedits && + !hdrs[TCA_PEDIT_KEY_EX_CMD_ADD].pedits) + return 0; + + ns_type = mlx5e_get_flow_namespace(flow); + + err = alloc_tc_pedit_action(priv, ns_type, parse_attr, &attr->action, extack); + if (err) + return err; + + if (parse_attr->mod_hdr_acts.num_actions > 0) + return 0; + + /* In case all pedit actions are skipped, remove the MOD_HDR flag. */ + attr->action &= ~MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + mlx5e_mod_hdr_dealloc(&parse_attr->mod_hdr_acts); + + if (ns_type != MLX5_FLOW_NAMESPACE_FDB) + return 0; + + if (!((attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) || + (attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH))) + attr->esw_attr->split_count = 0; + + return 0; +} + +static struct mlx5_flow_attr* +mlx5e_clone_flow_attr_for_post_act(struct mlx5_flow_attr *attr, + enum mlx5_flow_namespace_type ns_type) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr; + u32 attr_sz = ns_to_attr_sz(ns_type); + struct mlx5_flow_attr *attr2; + + attr2 = mlx5_alloc_flow_attr(ns_type); + parse_attr = kvzalloc(sizeof(*parse_attr), GFP_KERNEL); + if (!attr2 || !parse_attr) { + kvfree(parse_attr); + kfree(attr2); + return NULL; + } + + memcpy(attr2, attr, attr_sz); + INIT_LIST_HEAD(&attr2->list); + parse_attr->filter_dev = attr->parse_attr->filter_dev; + attr2->action = 0; + attr2->flags = 0; + attr2->parse_attr = parse_attr; + return attr2; +} + +static struct mlx5_core_dev * +get_flow_counter_dev(struct mlx5e_tc_flow *flow) +{ + return mlx5e_is_eswitch_flow(flow) ? flow->attr->esw_attr->counter_dev : flow->priv->mdev; +} + +struct mlx5_flow_attr * +mlx5e_tc_get_encap_attr(struct mlx5e_tc_flow *flow) +{ + struct mlx5_esw_flow_attr *esw_attr; + struct mlx5_flow_attr *attr; + int i; + + list_for_each_entry(attr, &flow->attrs, list) { + esw_attr = attr->esw_attr; + for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) { + if (esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP) + return attr; + } + } + + return NULL; +} + +void +mlx5e_tc_unoffload_flow_post_acts(struct mlx5e_tc_flow *flow) +{ + struct mlx5e_post_act *post_act = get_post_action(flow->priv); + struct mlx5_flow_attr *attr; + + list_for_each_entry(attr, &flow->attrs, list) { + if (list_is_last(&attr->list, &flow->attrs)) + break; + + mlx5e_tc_post_act_unoffload(post_act, attr->post_act_handle); + } +} + +static void +free_flow_post_acts(struct mlx5e_tc_flow *flow) +{ + struct mlx5_core_dev *counter_dev = get_flow_counter_dev(flow); + struct mlx5e_post_act *post_act = get_post_action(flow->priv); + struct mlx5_flow_attr *attr, *tmp; + bool vf_tun; + + list_for_each_entry_safe(attr, tmp, &flow->attrs, list) { + if (list_is_last(&attr->list, &flow->attrs)) + break; + + if (attr->post_act_handle) + mlx5e_tc_post_act_del(post_act, attr->post_act_handle); + + clean_encap_dests(flow->priv, flow, attr, &vf_tun); + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) + mlx5_fc_destroy(counter_dev, attr->counter); + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + mlx5e_mod_hdr_dealloc(&attr->parse_attr->mod_hdr_acts); + if (attr->modify_hdr) + mlx5_modify_header_dealloc(flow->priv->mdev, attr->modify_hdr); + } + + list_del(&attr->list); + kvfree(attr->parse_attr); + kfree(attr); + } +} + +int +mlx5e_tc_offload_flow_post_acts(struct mlx5e_tc_flow *flow) +{ + struct mlx5e_post_act *post_act = get_post_action(flow->priv); + struct mlx5_flow_attr *attr; + int err = 0; + + list_for_each_entry(attr, &flow->attrs, list) { 
+ if (list_is_last(&attr->list, &flow->attrs)) + break; + + err = mlx5e_tc_post_act_offload(post_act, attr->post_act_handle); + if (err) + break; + } + + return err; +} + +/* TC filter rule HW translation: + * + * +---------------------+ + * + ft prio (tc chain) + + * + original match + + * +---------------------+ + * | + * | if multi table action + * | + * v + * +---------------------+ + * + post act ft |<----. + * + match fte id | | split on multi table action + * + do actions |-----' + * +---------------------+ + * | + * | + * v + * Do rest of the actions after last multi table action. + */ +static int +alloc_flow_post_acts(struct mlx5e_tc_flow *flow, struct netlink_ext_ack *extack) +{ + struct mlx5e_post_act *post_act = get_post_action(flow->priv); + struct mlx5_flow_attr *attr, *next_attr = NULL; + struct mlx5e_post_act_handle *handle; + bool vf_tun; + int err; + + /* This is going in reverse order as needed. + * The first entry is the last attribute. + */ + list_for_each_entry(attr, &flow->attrs, list) { + if (!next_attr) { + /* Set counter action on last post act rule. */ + attr->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + } else { + err = mlx5e_tc_act_set_next_post_act(flow, attr, next_attr); + if (err) + goto out_free; + } + + /* Don't add post_act rule for first attr (last in the list). + * It's being handled by the caller. + */ + if (list_is_last(&attr->list, &flow->attrs)) + break; + + err = set_encap_dests(flow->priv, flow, attr, extack, &vf_tun); + if (err) + goto out_free; + + err = actions_prepare_mod_hdr_actions(flow->priv, flow, attr, extack); + if (err) + goto out_free; + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) { + err = mlx5e_tc_add_flow_mod_hdr(flow->priv, flow, attr); + if (err) + goto out_free; + } + + if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + err = alloc_flow_attr_counter(get_flow_counter_dev(flow), attr); + if (err) + goto out_free; + } + + handle = mlx5e_tc_post_act_add(post_act, attr); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_free; + } + + attr->post_act_handle = handle; + next_attr = attr; + } + + if (flow_flag_test(flow, SLOW)) + goto out; + + err = mlx5e_tc_offload_flow_post_acts(flow); + if (err) + goto out_free; + +out: + return 0; + +out_free: + free_flow_post_acts(flow); + return err; +} + +static int +parse_tc_actions(struct mlx5e_tc_act_parse_state *parse_state, + struct flow_action *flow_action) +{ + struct netlink_ext_ack *extack = parse_state->extack; + struct mlx5e_tc_flow_action flow_action_reorder; + struct mlx5e_tc_flow *flow = parse_state->flow; + struct mlx5_flow_attr *attr = flow->attr; + enum mlx5_flow_namespace_type ns_type; + struct mlx5e_priv *priv = flow->priv; + struct flow_action_entry *act, **_act; + struct mlx5e_tc_act *tc_act; + int err, i; + + flow_action_reorder.num_entries = flow_action->num_entries; + flow_action_reorder.entries = kcalloc(flow_action->num_entries, + sizeof(flow_action), GFP_KERNEL); + if (!flow_action_reorder.entries) + return -ENOMEM; + + mlx5e_tc_act_reorder_flow_actions(flow_action, &flow_action_reorder); + + ns_type = mlx5e_get_flow_namespace(flow); + list_add(&attr->list, &flow->attrs); + + flow_action_for_each(i, _act, &flow_action_reorder) { + act = *_act; + tc_act = mlx5e_tc_act_get(act->id, ns_type); + if (!tc_act) { + NL_SET_ERR_MSG_MOD(extack, "Not implemented offload action"); + err = -EOPNOTSUPP; + goto out_free; + } + + if (!tc_act->can_offload(parse_state, act, i, attr)) { + err = -EOPNOTSUPP; + goto out_free; + } + + err = 
tc_act->parse_action(parse_state, act, priv, attr); + if (err) + goto out_free; + + parse_state->actions |= attr->action; + + /* Split attr for multi table act if not the last act. */ + if (tc_act->is_multi_table_act && + tc_act->is_multi_table_act(priv, act, attr) && + i < flow_action_reorder.num_entries - 1) { + err = mlx5e_tc_act_post_parse(parse_state, flow_action, attr, ns_type); + if (err) + goto out_free; + + attr = mlx5e_clone_flow_attr_for_post_act(flow->attr, ns_type); + if (!attr) { + err = -ENOMEM; + goto out_free; + } + + list_add(&attr->list, &flow->attrs); + } + } + + kfree(flow_action_reorder.entries); + + err = mlx5e_tc_act_post_parse(parse_state, flow_action, attr, ns_type); + if (err) + goto out_free_post_acts; + + err = alloc_flow_post_acts(flow, extack); + if (err) + goto out_free_post_acts; + + return 0; + +out_free: + kfree(flow_action_reorder.entries); +out_free_post_acts: + free_flow_post_acts(flow); + + return err; +} + +static int +flow_action_supported(struct flow_action *flow_action, + struct netlink_ext_ack *extack) +{ + if (!flow_action_has_entries(flow_action)) { + NL_SET_ERR_MSG_MOD(extack, "Flow action doesn't have any entries"); + return -EINVAL; + } + + if (!flow_action_hw_stats_check(flow_action, extack, + FLOW_ACTION_HW_STATS_DELAYED_BIT)) { + NL_SET_ERR_MSG_MOD(extack, "Flow action HW stats type is not supported"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int +parse_tc_nic_actions(struct mlx5e_priv *priv, + struct flow_action *flow_action, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + struct mlx5e_tc_act_parse_state *parse_state; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_flow_attr *attr = flow->attr; + int err; + + err = flow_action_supported(flow_action, extack); + if (err) + return err; + + attr->nic_attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; + parse_attr = attr->parse_attr; + parse_state = &parse_attr->parse_state; + mlx5e_tc_act_init_parse_state(parse_state, flow, flow_action, extack); + parse_state->ct_priv = get_ct_priv(priv); + + err = parse_tc_actions(parse_state, flow_action); + if (err) + return err; + + err = actions_prepare_mod_hdr_actions(priv, flow, attr, extack); + if (err) + return err; + + if (!actions_match_supported(priv, flow_action, parse_state->actions, + parse_attr, flow, extack)) + return -EOPNOTSUPP; + + return 0; +} + +static bool is_merged_eswitch_vfs(struct mlx5e_priv *priv, + struct net_device *peer_netdev) +{ + struct mlx5e_priv *peer_priv; + + peer_priv = netdev_priv(peer_netdev); + + return (MLX5_CAP_ESW(priv->mdev, merged_eswitch) && + mlx5e_eswitch_vf_rep(priv->netdev) && + mlx5e_eswitch_vf_rep(peer_netdev) && + mlx5e_same_hw_devs(priv, peer_priv)); +} + +static bool same_hw_reps(struct mlx5e_priv *priv, + struct net_device *peer_netdev) +{ + struct mlx5e_priv *peer_priv; + + peer_priv = netdev_priv(peer_netdev); + + return mlx5e_eswitch_rep(priv->netdev) && + mlx5e_eswitch_rep(peer_netdev) && + mlx5e_same_hw_devs(priv, peer_priv); +} + +static bool is_lag_dev(struct mlx5e_priv *priv, + struct net_device *peer_netdev) +{ + return ((mlx5_lag_is_sriov(priv->mdev) || + mlx5_lag_is_mpesw(priv->mdev) || + mlx5_lag_is_multipath(priv->mdev)) && + same_hw_reps(priv, peer_netdev)); +} + +static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev) +{ + if (!mlx5_lag_mpesw_is_activated(priv->mdev->priv.eswitch)) + return false; + + if (!mlx5e_eswitch_uplink_rep(out_dev)) + return false; + + return true; +} + +bool 
mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv, + struct net_device *out_dev) +{ + if (is_merged_eswitch_vfs(priv, out_dev)) + return true; + + if (is_multiport_eligible(priv, out_dev)) + return true; + + if (is_lag_dev(priv, out_dev)) + return true; + + return mlx5e_eswitch_rep(out_dev) && + same_port_devs(priv, netdev_priv(out_dev)); +} + +int mlx5e_set_fwd_to_int_port_actions(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + int ifindex, + enum mlx5e_tc_int_port_type type, + u32 *action, + int out_index) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5e_tc_int_port_priv *int_port_priv; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_tc_int_port *dest_int_port; + int err; + + parse_attr = attr->parse_attr; + int_port_priv = mlx5e_get_int_port_priv(priv); + + dest_int_port = mlx5e_tc_int_port_get(int_port_priv, ifindex, type); + if (IS_ERR(dest_int_port)) + return PTR_ERR(dest_int_port); + + err = mlx5e_tc_match_to_reg_set(priv->mdev, &parse_attr->mod_hdr_acts, + MLX5_FLOW_NAMESPACE_FDB, VPORT_TO_REG, + mlx5e_tc_int_port_get_metadata(dest_int_port)); + if (err) { + mlx5e_tc_int_port_put(int_port_priv, dest_int_port); + return err; + } + + *action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + + esw_attr->dest_int_port = dest_int_port; + esw_attr->dests[out_index].flags |= MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE; + + /* Forward to root fdb for matching against the new source vport */ + attr->dest_chain = 0; + + return 0; +} + +static int +parse_tc_fdb_actions(struct mlx5e_priv *priv, + struct flow_action *flow_action, + struct mlx5e_tc_flow *flow, + struct netlink_ext_ack *extack) +{ + struct mlx5e_tc_act_parse_state *parse_state; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_flow_attr *attr = flow->attr; + struct mlx5_esw_flow_attr *esw_attr; + struct net_device *filter_dev; + int err; + + err = flow_action_supported(flow_action, extack); + if (err) + return err; + + esw_attr = attr->esw_attr; + parse_attr = attr->parse_attr; + filter_dev = parse_attr->filter_dev; + parse_state = &parse_attr->parse_state; + mlx5e_tc_act_init_parse_state(parse_state, flow, flow_action, extack); + parse_state->ct_priv = get_ct_priv(priv); + + err = parse_tc_actions(parse_state, flow_action); + if (err) + return err; + + /* Forward to/from internal port can only have 1 dest */ + if ((netif_is_ovs_master(filter_dev) || esw_attr->dest_int_port) && + esw_attr->out_count > 1) { + NL_SET_ERR_MSG_MOD(extack, + "Rules with internal port can have only one destination"); + return -EOPNOTSUPP; + } + + /* Forward from tunnel/internal port to internal port is not supported */ + if ((mlx5e_get_tc_tun(filter_dev) || netif_is_ovs_master(filter_dev)) && + esw_attr->dest_int_port) { + NL_SET_ERR_MSG_MOD(extack, + "Forwarding from tunnel/internal port to internal port is not supported"); + return -EOPNOTSUPP; + } + + err = actions_prepare_mod_hdr_actions(priv, flow, attr, extack); + if (err) + return err; + + if (!actions_match_supported(priv, flow_action, parse_state->actions, + parse_attr, flow, extack)) + return -EOPNOTSUPP; + + return 0; +} + +static void get_flags(int flags, unsigned long *flow_flags) +{ + unsigned long __flow_flags = 0; + + if (flags & MLX5_TC_FLAG(INGRESS)) + __flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_INGRESS); + if (flags & MLX5_TC_FLAG(EGRESS)) + __flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_EGRESS); + + if (flags & MLX5_TC_FLAG(ESW_OFFLOAD)) + __flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_ESWITCH); + if (flags & MLX5_TC_FLAG(NIC_OFFLOAD)) + __flow_flags |= 
BIT(MLX5E_TC_FLOW_FLAG_NIC); + if (flags & MLX5_TC_FLAG(FT_OFFLOAD)) + __flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_FT); + + *flow_flags = __flow_flags; +} + +static const struct rhashtable_params tc_ht_params = { + .head_offset = offsetof(struct mlx5e_tc_flow, node), + .key_offset = offsetof(struct mlx5e_tc_flow, cookie), + .key_len = sizeof(((struct mlx5e_tc_flow *)0)->cookie), + .automatic_shrinking = true, +}; + +static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv, + unsigned long flags) +{ + struct mlx5e_rep_priv *rpriv; + + if (flags & MLX5_TC_FLAG(ESW_OFFLOAD)) { + rpriv = priv->ppriv; + return &rpriv->tc_ht; + } else /* NIC offload */ + return &priv->fs.tc.ht; +} + +static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow) +{ + struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr; + struct mlx5_flow_attr *attr = flow->attr; + bool is_rep_ingress = esw_attr->in_rep->vport != MLX5_VPORT_UPLINK && + flow_flag_test(flow, INGRESS); + bool act_is_encap = !!(attr->action & + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT); + bool esw_paired = mlx5_devcom_is_paired(esw_attr->in_mdev->priv.devcom, + MLX5_DEVCOM_ESW_OFFLOADS); + + if (!esw_paired) + return false; + + if ((mlx5_lag_is_sriov(esw_attr->in_mdev) || + mlx5_lag_is_multipath(esw_attr->in_mdev)) && + (is_rep_ingress || act_is_encap)) + return true; + + return false; +} + +struct mlx5_flow_attr * +mlx5_alloc_flow_attr(enum mlx5_flow_namespace_type type) +{ + u32 ex_attr_size = (type == MLX5_FLOW_NAMESPACE_FDB) ? + sizeof(struct mlx5_esw_flow_attr) : + sizeof(struct mlx5_nic_flow_attr); + struct mlx5_flow_attr *attr; + + attr = kzalloc(sizeof(*attr) + ex_attr_size, GFP_KERNEL); + if (!attr) + return attr; + + INIT_LIST_HEAD(&attr->list); + return attr; +} + +static int +mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size, + struct flow_cls_offload *f, unsigned long flow_flags, + struct mlx5e_tc_flow_parse_attr **__parse_attr, + struct mlx5e_tc_flow **__flow) +{ + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5_flow_attr *attr; + struct mlx5e_tc_flow *flow; + int err = -ENOMEM; + int out_index; + + flow = kzalloc(sizeof(*flow), GFP_KERNEL); + parse_attr = kvzalloc(sizeof(*parse_attr), GFP_KERNEL); + if (!parse_attr || !flow) + goto err_free; + + flow->flags = flow_flags; + flow->cookie = f->cookie; + flow->priv = priv; + + attr = mlx5_alloc_flow_attr(mlx5e_get_flow_namespace(flow)); + if (!attr) + goto err_free; + + flow->attr = attr; + + for (out_index = 0; out_index < MLX5_MAX_FLOW_FWD_VPORTS; out_index++) + INIT_LIST_HEAD(&flow->encaps[out_index].list); + INIT_LIST_HEAD(&flow->hairpin); + INIT_LIST_HEAD(&flow->l3_to_l2_reformat); + INIT_LIST_HEAD(&flow->attrs); + refcount_set(&flow->refcnt, 1); + init_completion(&flow->init_done); + init_completion(&flow->del_hw_done); + + *__flow = flow; + *__parse_attr = parse_attr; + + return 0; + +err_free: + kfree(flow); + kvfree(parse_attr); + return err; +} + +static void +mlx5e_flow_attr_init(struct mlx5_flow_attr *attr, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct flow_cls_offload *f) +{ + attr->parse_attr = parse_attr; + attr->chain = f->common.chain_index; + attr->prio = f->common.prio; +} + +static void +mlx5e_flow_esw_attr_init(struct mlx5_flow_attr *attr, + struct mlx5e_priv *priv, + struct mlx5e_tc_flow_parse_attr *parse_attr, + struct flow_cls_offload *f, + struct mlx5_eswitch_rep *in_rep, + struct mlx5_core_dev *in_mdev) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + + 
mlx5e_flow_attr_init(attr, parse_attr, f); + + esw_attr->in_rep = in_rep; + esw_attr->in_mdev = in_mdev; + + if (MLX5_CAP_ESW(esw->dev, counter_eswitch_affinity) == + MLX5_COUNTER_SOURCE_ESWITCH) + esw_attr->counter_dev = in_mdev; + else + esw_attr->counter_dev = priv->mdev; +} + +static struct mlx5e_tc_flow * +__mlx5e_add_fdb_flow(struct mlx5e_priv *priv, + struct flow_cls_offload *f, + unsigned long flow_flags, + struct net_device *filter_dev, + struct mlx5_eswitch_rep *in_rep, + struct mlx5_core_dev *in_mdev) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_tc_flow *flow; + int attr_size, err; + + flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_ESWITCH); + attr_size = sizeof(struct mlx5_esw_flow_attr); + err = mlx5e_alloc_flow(priv, attr_size, f, flow_flags, + &parse_attr, &flow); + if (err) + goto out; + + parse_attr->filter_dev = filter_dev; + mlx5e_flow_esw_attr_init(flow->attr, + priv, parse_attr, + f, in_rep, in_mdev); + + err = parse_cls_flower(flow->priv, flow, &parse_attr->spec, + f, filter_dev); + if (err) + goto err_free; + + /* actions validation depends on parsing the ct matches first */ + err = mlx5_tc_ct_match_add(get_ct_priv(priv), &parse_attr->spec, f, + &flow->attr->ct_attr, extack); + if (err) + goto err_free; + + /* always set IP version for indirect table handling */ + flow->attr->ip_version = mlx5e_tc_get_ip_version(&parse_attr->spec, true); + + err = parse_tc_fdb_actions(priv, &rule->action, flow, extack); + if (err) + goto err_free; + + err = mlx5e_tc_add_fdb_flow(priv, flow, extack); + complete_all(&flow->init_done); + if (err) { + if (!(err == -ENETUNREACH && mlx5_lag_is_multipath(in_mdev))) + goto err_free; + add_unready_flow(flow); + } + + return flow; + +err_free: + mlx5e_flow_put(priv, flow); +out: + return ERR_PTR(err); +} + +static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f, + struct mlx5e_tc_flow *flow, + unsigned long flow_flags) +{ + struct mlx5e_priv *priv = flow->priv, *peer_priv; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch, *peer_esw; + struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr; + struct mlx5_devcom *devcom = priv->mdev->priv.devcom; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_rep_priv *peer_urpriv; + struct mlx5e_tc_flow *peer_flow; + struct mlx5_core_dev *in_mdev; + int err = 0; + + peer_esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + if (!peer_esw) + return -ENODEV; + + peer_urpriv = mlx5_eswitch_get_uplink_priv(peer_esw, REP_ETH); + peer_priv = netdev_priv(peer_urpriv->netdev); + + /* in_mdev is assigned of which the packet originated from. + * So packets redirected to uplink use the same mdev of the + * original flow and packets redirected from uplink use the + * peer mdev. 
+ */ + if (attr->in_rep->vport == MLX5_VPORT_UPLINK) + in_mdev = peer_priv->mdev; + else + in_mdev = priv->mdev; + + parse_attr = flow->attr->parse_attr; + peer_flow = __mlx5e_add_fdb_flow(peer_priv, f, flow_flags, + parse_attr->filter_dev, + attr->in_rep, in_mdev); + if (IS_ERR(peer_flow)) { + err = PTR_ERR(peer_flow); + goto out; + } + + flow->peer_flow = peer_flow; + flow_flag_set(flow, DUP); + mutex_lock(&esw->offloads.peer_mutex); + list_add_tail(&flow->peer, &esw->offloads.peer_flows); + mutex_unlock(&esw->offloads.peer_mutex); + +out: + mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + return err; +} + +static int +mlx5e_add_fdb_flow(struct mlx5e_priv *priv, + struct flow_cls_offload *f, + unsigned long flow_flags, + struct net_device *filter_dev, + struct mlx5e_tc_flow **__flow) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_core_dev *in_mdev = priv->mdev; + struct mlx5_eswitch_rep *in_rep; + struct mlx5e_tc_flow *flow; + int err; + + if (!rpriv) + return -EINVAL; + in_rep = rpriv->rep; + + flow = __mlx5e_add_fdb_flow(priv, f, flow_flags, filter_dev, in_rep, + in_mdev); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + if (is_peer_flow_needed(flow)) { + err = mlx5e_tc_add_fdb_peer_flow(f, flow, flow_flags); + if (err) { + mlx5e_tc_del_fdb_flow(priv, flow); + goto out; + } + } + + *__flow = flow; + + return 0; + +out: + return err; +} + +static int +mlx5e_add_nic_flow(struct mlx5e_priv *priv, + struct flow_cls_offload *f, + unsigned long flow_flags, + struct net_device *filter_dev, + struct mlx5e_tc_flow **__flow) +{ + struct flow_rule *rule = flow_cls_offload_flow_rule(f); + struct netlink_ext_ack *extack = f->common.extack; + struct mlx5e_tc_flow_parse_attr *parse_attr; + struct mlx5e_tc_flow *flow; + int attr_size, err; + + if (!MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ignore_flow_level)) { + if (!tc_cls_can_offload_and_chain0(priv->netdev, &f->common)) + return -EOPNOTSUPP; + } else if (!tc_can_offload_extack(priv->netdev, f->common.extack)) { + return -EOPNOTSUPP; + } + + flow_flags |= BIT(MLX5E_TC_FLOW_FLAG_NIC); + attr_size = sizeof(struct mlx5_nic_flow_attr); + err = mlx5e_alloc_flow(priv, attr_size, f, flow_flags, + &parse_attr, &flow); + if (err) + goto out; + + parse_attr->filter_dev = filter_dev; + mlx5e_flow_attr_init(flow->attr, parse_attr, f); + + err = parse_cls_flower(flow->priv, flow, &parse_attr->spec, + f, filter_dev); + if (err) + goto err_free; + + err = mlx5_tc_ct_match_add(get_ct_priv(priv), &parse_attr->spec, f, + &flow->attr->ct_attr, extack); + if (err) + goto err_free; + + err = parse_tc_nic_actions(priv, &rule->action, flow, extack); + if (err) + goto err_free; + + err = mlx5e_tc_add_nic_flow(priv, flow, extack); + if (err) + goto err_free; + + flow_flag_set(flow, OFFLOADED); + *__flow = flow; + + return 0; + +err_free: + flow_flag_set(flow, FAILED); + mlx5e_mod_hdr_dealloc(&parse_attr->mod_hdr_acts); + mlx5e_flow_put(priv, flow); +out: + return err; +} + +static int +mlx5e_tc_add_flow(struct mlx5e_priv *priv, + struct flow_cls_offload *f, + unsigned long flags, + struct net_device *filter_dev, + struct mlx5e_tc_flow **flow) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + unsigned long flow_flags; + int err; + + get_flags(flags, &flow_flags); + + if (!tc_can_offload_extack(priv->netdev, f->common.extack)) + return -EOPNOTSUPP; + + if (esw && esw->mode == MLX5_ESWITCH_OFFLOADS) + err = mlx5e_add_fdb_flow(priv, f, flow_flags, + filter_dev, flow); + else + err = mlx5e_add_nic_flow(priv, f, flow_flags, + filter_dev, 
flow); + + return err; +} + +static bool is_flow_rule_duplicate_allowed(struct net_device *dev, + struct mlx5e_rep_priv *rpriv) +{ + /* Offloaded flow rule is allowed to duplicate on non-uplink representor + * sharing tc block with other slaves of a lag device. Rpriv can be NULL if this + * function is called from NIC mode. + */ + return netif_is_lag_port(dev) && rpriv && rpriv->rep->vport != MLX5_VPORT_UPLINK; +} + +int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags) +{ + struct netlink_ext_ack *extack = f->common.extack; + struct rhashtable *tc_ht = get_tc_ht(priv, flags); + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_tc_flow *flow; + int err = 0; + + if (!mlx5_esw_hold(priv->mdev)) + return -EBUSY; + + mlx5_esw_get(priv->mdev); + + rcu_read_lock(); + flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params); + if (flow) { + /* Same flow rule offloaded to non-uplink representor sharing tc block, + * just return 0. + */ + if (is_flow_rule_duplicate_allowed(dev, rpriv) && flow->orig_dev != dev) + goto rcu_unlock; + + NL_SET_ERR_MSG_MOD(extack, + "flow cookie already exists, ignoring"); + netdev_warn_once(priv->netdev, + "flow cookie %lx already exists, ignoring\n", + f->cookie); + err = -EEXIST; + goto rcu_unlock; + } +rcu_unlock: + rcu_read_unlock(); + if (flow) + goto out; + + trace_mlx5e_configure_flower(f); + err = mlx5e_tc_add_flow(priv, f, flags, dev, &flow); + if (err) + goto out; + + /* Flow rule offloaded to non-uplink representor sharing tc block, + * set the flow's owner dev. + */ + if (is_flow_rule_duplicate_allowed(dev, rpriv)) + flow->orig_dev = dev; + + err = rhashtable_lookup_insert_fast(tc_ht, &flow->node, tc_ht_params); + if (err) + goto err_free; + + mlx5_esw_release(priv->mdev); + return 0; + +err_free: + mlx5e_flow_put(priv, flow); +out: + mlx5_esw_put(priv->mdev); + mlx5_esw_release(priv->mdev); + return err; +} + +static bool same_flow_direction(struct mlx5e_tc_flow *flow, int flags) +{ + bool dir_ingress = !!(flags & MLX5_TC_FLAG(INGRESS)); + bool dir_egress = !!(flags & MLX5_TC_FLAG(EGRESS)); + + return flow_flag_test(flow, INGRESS) == dir_ingress && + flow_flag_test(flow, EGRESS) == dir_egress; +} + +int mlx5e_delete_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags) +{ + struct rhashtable *tc_ht = get_tc_ht(priv, flags); + struct mlx5e_tc_flow *flow; + int err; + + rcu_read_lock(); + flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params); + if (!flow || !same_flow_direction(flow, flags)) { + err = -EINVAL; + goto errout; + } + + /* Only delete the flow if it doesn't have MLX5E_TC_FLOW_DELETED flag + * set. 
+ */ + if (flow_flag_test_and_set(flow, DELETED)) { + err = -EINVAL; + goto errout; + } + rhashtable_remove_fast(tc_ht, &flow->node, tc_ht_params); + rcu_read_unlock(); + + trace_mlx5e_delete_flower(f); + mlx5e_flow_put(priv, flow); + + mlx5_esw_put(priv->mdev); + return 0; + +errout: + rcu_read_unlock(); + return err; +} + +int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags) +{ + struct mlx5_devcom *devcom = priv->mdev->priv.devcom; + struct rhashtable *tc_ht = get_tc_ht(priv, flags); + struct mlx5_eswitch *peer_esw; + struct mlx5e_tc_flow *flow; + struct mlx5_fc *counter; + u64 lastuse = 0; + u64 packets = 0; + u64 bytes = 0; + int err = 0; + + rcu_read_lock(); + flow = mlx5e_flow_get(rhashtable_lookup(tc_ht, &f->cookie, + tc_ht_params)); + rcu_read_unlock(); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + if (!same_flow_direction(flow, flags)) { + err = -EINVAL; + goto errout; + } + + if (mlx5e_is_offloaded_flow(flow) || flow_flag_test(flow, CT)) { + counter = mlx5e_tc_get_counter(flow); + if (!counter) + goto errout; + + mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + } + + /* Under multipath it's possible for one rule to be currently + * un-offloaded while the other rule is offloaded. + */ + peer_esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + if (!peer_esw) + goto out; + + if (flow_flag_test(flow, DUP) && + flow_flag_test(flow->peer_flow, OFFLOADED)) { + u64 bytes2; + u64 packets2; + u64 lastuse2; + + counter = mlx5e_tc_get_counter(flow->peer_flow); + if (!counter) + goto no_peer_counter; + mlx5_fc_query_cached(counter, &bytes2, &packets2, &lastuse2); + + bytes += bytes2; + packets += packets2; + lastuse = max_t(u64, lastuse, lastuse2); + } + +no_peer_counter: + mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); +out: + flow_stats_update(&f->stats, bytes, packets, 0, lastuse, + FLOW_ACTION_HW_STATS_DELAYED); + trace_mlx5e_stats_flower(f); +errout: + mlx5e_flow_put(priv, flow); + return err; +} + +static int apply_police_params(struct mlx5e_priv *priv, u64 rate, + struct netlink_ext_ack *extack) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5_eswitch *esw; + u32 rate_mbps = 0; + u16 vport_num; + int err; + + vport_num = rpriv->rep->vport; + if (vport_num >= MLX5_VPORT_ECPF) { + NL_SET_ERR_MSG_MOD(extack, + "Ingress rate limit is supported only for Eswitch ports connected to VFs"); + return -EOPNOTSUPP; + } + + esw = priv->mdev->priv.eswitch; + /* rate is given in bytes/sec. + * First convert to bits/sec and then round to the nearest mbit/secs. + * mbit means million bits. + * Moreover, if rate is non zero we choose to configure to a minimum of + * 1 mbit/sec. 
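+ *
+ * For illustration (BITS_PER_BYTE is 8): a police rate of 1,250,000
+ * bytes/sec is 10,000,000 bits/sec; adding 500,000 and dividing by
+ * 1,000,000 rounds to the nearest mbit and gives rate_mbps = 10.
+ * A small rate such as 50,000 bytes/sec (400,000 bits/sec) rounds
+ * down to 0 and is then clamped to the 1 mbit/sec minimum by max_t().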
+ */ + if (rate) { + rate = (rate * BITS_PER_BYTE) + 500000; + do_div(rate, 1000000); + rate_mbps = max_t(u32, rate, 1); + } + + err = mlx5_esw_qos_modify_vport_rate(esw, vport_num, rate_mbps); + if (err) + NL_SET_ERR_MSG_MOD(extack, "failed applying action to hardware"); + + return err; +} + +static int scan_tc_matchall_fdb_actions(struct mlx5e_priv *priv, + struct flow_action *flow_action, + struct netlink_ext_ack *extack) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + const struct flow_action_entry *act; + int err; + int i; + + if (!flow_action_has_entries(flow_action)) { + NL_SET_ERR_MSG_MOD(extack, "matchall called with no action"); + return -EINVAL; + } + + if (!flow_offload_has_one_action(flow_action)) { + NL_SET_ERR_MSG_MOD(extack, "matchall policing support only a single action"); + return -EOPNOTSUPP; + } + + if (!flow_action_basic_hw_stats_check(flow_action, extack)) { + NL_SET_ERR_MSG_MOD(extack, "Flow action HW stats type is not supported"); + return -EOPNOTSUPP; + } + + flow_action_for_each(i, act, flow_action) { + switch (act->id) { + case FLOW_ACTION_POLICE: + if (act->police.rate_pkt_ps) { + NL_SET_ERR_MSG_MOD(extack, "QoS offload not support packets per second"); + return -EOPNOTSUPP; + } + err = apply_police_params(priv, act->police.rate_bytes_ps, extack); + if (err) + return err; + + rpriv->prev_vf_vport_stats = priv->stats.vf_vport; + break; + default: + NL_SET_ERR_MSG_MOD(extack, "mlx5 supports only police action for matchall"); + return -EOPNOTSUPP; + } + } + + return 0; +} + +int mlx5e_tc_configure_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *ma) +{ + struct netlink_ext_ack *extack = ma->common.extack; + + if (ma->common.prio != 1) { + NL_SET_ERR_MSG_MOD(extack, "only priority 1 is supported"); + return -EINVAL; + } + + return scan_tc_matchall_fdb_actions(priv, &ma->rule->action, extack); +} + +int mlx5e_tc_delete_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *ma) +{ + struct netlink_ext_ack *extack = ma->common.extack; + + return apply_police_params(priv, 0, extack); +} + +void mlx5e_tc_stats_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *ma) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct rtnl_link_stats64 cur_stats; + u64 dbytes; + u64 dpkts; + + cur_stats = priv->stats.vf_vport; + dpkts = cur_stats.rx_packets - rpriv->prev_vf_vport_stats.rx_packets; + dbytes = cur_stats.rx_bytes - rpriv->prev_vf_vport_stats.rx_bytes; + rpriv->prev_vf_vport_stats = cur_stats; + flow_stats_update(&ma->stats, dbytes, dpkts, 0, jiffies, + FLOW_ACTION_HW_STATS_DELAYED); +} + +static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv, + struct mlx5e_priv *peer_priv) +{ + struct mlx5_core_dev *peer_mdev = peer_priv->mdev; + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_hairpin_entry *hpe, *tmp; + LIST_HEAD(init_wait_list); + u16 peer_vhca_id; + int bkt; + + if (!mlx5e_same_hw_devs(priv, peer_priv)) + return; + + peer_vhca_id = MLX5_CAP_GEN(peer_mdev, vhca_id); + + mutex_lock(&priv->fs.tc.hairpin_tbl_lock); + hash_for_each(priv->fs.tc.hairpin_tbl, bkt, hpe, hairpin_hlist) + if (refcount_inc_not_zero(&hpe->refcnt)) + list_add(&hpe->dead_peer_wait_list, &init_wait_list); + mutex_unlock(&priv->fs.tc.hairpin_tbl_lock); + + list_for_each_entry_safe(hpe, tmp, &init_wait_list, dead_peer_wait_list) { + wait_for_completion(&hpe->res_ready); + if (!IS_ERR_OR_NULL(hpe->hp) && hpe->peer_vhca_id == peer_vhca_id) + mlx5_core_hairpin_clear_dead_peer(hpe->hp->pair); + + mlx5e_hairpin_put(priv, hpe); + 
} + + if (tc->hp_oob && peer_priv->mdev == tc->hp_oob->peer_dev) { + mlx5_core_warn(priv->mdev, "hp oob peer %s is going down, disabling oob count\n", + peer_priv->netdev->name); + mlx5e_hairpin_oob_cnt_disable(priv); + } +} + +static int mlx5e_tc_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct mlx5e_flow_steering *fs; + struct mlx5e_priv *peer_priv; + struct mlx5e_tc_table *tc; + struct mlx5e_priv *priv; + + if (ndev->netdev_ops != &mlx5e_netdev_ops || + event != NETDEV_UNREGISTER || + ndev->reg_state == NETREG_REGISTERED) + return NOTIFY_DONE; + + tc = container_of(this, struct mlx5e_tc_table, netdevice_nb); + fs = container_of(tc, struct mlx5e_flow_steering, tc); + priv = container_of(fs, struct mlx5e_priv, fs); + peer_priv = netdev_priv(ndev); + if (priv == peer_priv || + !(priv->netdev->features & NETIF_F_HW_TC)) + return NOTIFY_DONE; + + mlx5e_tc_hairpin_update_dead_peer(priv, peer_priv); + + return NOTIFY_DONE; +} + +static int mlx5e_tc_nic_get_ft_size(struct mlx5_core_dev *dev) +{ + int tc_grp_size, tc_tbl_size; + u32 max_flow_counter; + + max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | + MLX5_CAP_GEN(dev, max_flow_counter_15_0); + + tc_grp_size = min_t(int, max_flow_counter, MLX5E_TC_TABLE_MAX_GROUP_SIZE); + + tc_tbl_size = min_t(int, tc_grp_size * MLX5E_TC_TABLE_NUM_GROUPS, + BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev, log_max_ft_size))); + + return tc_tbl_size; +} + +static int mlx5e_tc_nic_create_miss_table(struct mlx5e_priv *priv) +{ + struct mlx5_flow_table **ft = &priv->fs.tc.miss_t; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + int err = 0; + + ft_attr.max_fte = 1; + ft_attr.autogroup.max_num_groups = 1; + ft_attr.level = MLX5E_TC_MISS_LEVEL; + ft_attr.prio = 0; + ns = mlx5_get_flow_namespace(priv->mdev, MLX5_FLOW_NAMESPACE_KERNEL); + + *ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(*ft)) { + err = PTR_ERR(*ft); + netdev_err(priv->netdev, "failed to create tc nic miss table err=%d\n", err); + } + + return err; +} + +static void mlx5e_tc_nic_destroy_miss_table(struct mlx5e_priv *priv) +{ + mlx5_destroy_flow_table(priv->fs.tc.miss_t); +} + +int mlx5e_tc_nic_init(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_core_dev *dev = priv->mdev; + struct mapping_ctx *chains_mapping; + struct mlx5_chains_attr attr = {}; + u64 mapping_id; + int err; + + mlx5e_mod_hdr_tbl_init(&tc->mod_hdr); + mutex_init(&tc->t_lock); + mutex_init(&tc->hairpin_tbl_lock); + hash_init(tc->hairpin_tbl); + + err = rhashtable_init(&tc->ht, &tc_ht_params); + if (err) + return err; + + lockdep_set_class(&tc->ht.mutex, &tc_ht_lock_key); + + mapping_id = mlx5_query_nic_system_image_guid(dev); + + chains_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN, + sizeof(struct mlx5_mapped_obj), + MLX5E_TC_TABLE_CHAIN_TAG_MASK, true); + + if (IS_ERR(chains_mapping)) { + err = PTR_ERR(chains_mapping); + goto err_mapping; + } + tc->mapping = chains_mapping; + + err = mlx5e_tc_nic_create_miss_table(priv); + if (err) + goto err_chains; + + if (MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ignore_flow_level)) + attr.flags = MLX5_CHAINS_AND_PRIOS_SUPPORTED | + MLX5_CHAINS_IGNORE_FLOW_LEVEL_SUPPORTED; + attr.ns = MLX5_FLOW_NAMESPACE_KERNEL; + attr.max_ft_sz = mlx5e_tc_nic_get_ft_size(dev); + attr.max_grp_num = MLX5E_TC_TABLE_NUM_GROUPS; + attr.default_ft = priv->fs.tc.miss_t; + attr.mapping = chains_mapping; + + 
tc->chains = mlx5_chains_create(dev, &attr); + if (IS_ERR(tc->chains)) { + err = PTR_ERR(tc->chains); + goto err_miss; + } + + tc->post_act = mlx5e_tc_post_act_init(priv, tc->chains, MLX5_FLOW_NAMESPACE_KERNEL); + tc->ct = mlx5_tc_ct_init(priv, tc->chains, &priv->fs.tc.mod_hdr, + MLX5_FLOW_NAMESPACE_KERNEL, tc->post_act); + + tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event; + err = register_netdevice_notifier_dev_net(priv->netdev, + &tc->netdevice_nb, + &tc->netdevice_nn); + if (err) { + tc->netdevice_nb.notifier_call = NULL; + mlx5_core_warn(priv->mdev, "Failed to register netdev notifier\n"); + goto err_reg; + } + + return 0; + +err_reg: + mlx5_tc_ct_clean(tc->ct); + mlx5e_tc_post_act_destroy(tc->post_act); + mlx5_chains_destroy(tc->chains); +err_miss: + mlx5e_tc_nic_destroy_miss_table(priv); +err_chains: + mapping_destroy(chains_mapping); +err_mapping: + rhashtable_destroy(&tc->ht); + return err; +} + +static void _mlx5e_tc_del_flow(void *ptr, void *arg) +{ + struct mlx5e_tc_flow *flow = ptr; + struct mlx5e_priv *priv = flow->priv; + + mlx5e_tc_del_flow(priv, flow); + kfree(flow); +} + +void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) +{ + struct mlx5e_tc_table *tc = &priv->fs.tc; + + if (tc->netdevice_nb.notifier_call) + unregister_netdevice_notifier_dev_net(priv->netdev, + &tc->netdevice_nb, + &tc->netdevice_nn); + + mlx5e_mod_hdr_tbl_destroy(&tc->mod_hdr); + mutex_destroy(&tc->hairpin_tbl_lock); + + rhashtable_free_and_destroy(&tc->ht, _mlx5e_tc_del_flow, NULL); + + if (!IS_ERR_OR_NULL(tc->t)) { + mlx5_chains_put_table(tc->chains, 0, 1, MLX5E_TC_FT_LEVEL); + tc->t = NULL; + } + mutex_destroy(&tc->t_lock); + + mlx5_tc_ct_clean(tc->ct); + mlx5e_tc_post_act_destroy(tc->post_act); + mlx5e_hairpin_oob_cnt_disable(priv); + mlx5e_prio_hairpin_mode_disable(priv); + mapping_destroy(tc->mapping); + mlx5_chains_destroy(tc->chains); + mlx5e_tc_nic_destroy_miss_table(priv); +} + +int mlx5e_tc_ht_init(struct rhashtable *tc_ht) +{ + int err; + + err = rhashtable_init(tc_ht, &tc_ht_params); + if (err) + return err; + + lockdep_set_class(&tc_ht->mutex, &tc_ht_lock_key); + + return 0; +} + +void mlx5e_tc_ht_cleanup(struct rhashtable *tc_ht) +{ + rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL); +} + +int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv) +{ + const size_t sz_enc_opts = sizeof(struct tunnel_match_enc_opts); + struct mlx5e_rep_priv *rpriv; + struct mapping_ctx *mapping; + struct mlx5_eswitch *esw; + struct mlx5e_priv *priv; + u64 mapping_id; + int err = 0; + + rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv); + priv = netdev_priv(rpriv->netdev); + esw = priv->mdev->priv.eswitch; + + uplink_priv->post_act = mlx5e_tc_post_act_init(priv, esw_chains(esw), + MLX5_FLOW_NAMESPACE_FDB); + uplink_priv->ct_priv = mlx5_tc_ct_init(netdev_priv(priv->netdev), + esw_chains(esw), + &esw->offloads.mod_hdr, + MLX5_FLOW_NAMESPACE_FDB, + uplink_priv->post_act); + + uplink_priv->int_port_priv = mlx5e_tc_int_port_init(netdev_priv(priv->netdev)); + + uplink_priv->tc_psample = mlx5e_tc_sample_init(esw, uplink_priv->post_act); + + mapping_id = mlx5_query_nic_system_image_guid(esw->dev); + + mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL, + sizeof(struct tunnel_match_key), + TUNNEL_INFO_BITS_MASK, true); + + if (IS_ERR(mapping)) { + err = PTR_ERR(mapping); + goto err_tun_mapping; + } + uplink_priv->tunnel_mapping = mapping; + + /* Two last values are reserved for stack devices slow path table mark + * and bridge ingress push mark. 
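+ * Hence the enc_opts mapping created below passes ENC_OPTS_BITS_MASK - 2,
+ * rather than the full mask, as its maximum mapped id.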
+ */ + mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL_ENC_OPTS, + sz_enc_opts, ENC_OPTS_BITS_MASK - 2, true); + if (IS_ERR(mapping)) { + err = PTR_ERR(mapping); + goto err_enc_opts_mapping; + } + uplink_priv->tunnel_enc_opts_mapping = mapping; + + uplink_priv->encap = mlx5e_tc_tun_init(priv); + if (IS_ERR(uplink_priv->encap)) { + err = PTR_ERR(uplink_priv->encap); + goto err_register_fib_notifier; + } + + esw_vf_meter_create_meters(priv->mdev->priv.eswitch); + uplink_priv->flow_meters = mlx5e_flow_meters_init(priv, MLX5_FLOW_NAMESPACE_FDB, + uplink_priv->post_act); + + return 0; + +err_register_fib_notifier: + mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); +err_enc_opts_mapping: + mapping_destroy(uplink_priv->tunnel_mapping); +err_tun_mapping: + mlx5e_tc_sample_cleanup(uplink_priv->tc_psample); + mlx5e_tc_int_port_cleanup(uplink_priv->int_port_priv); + mlx5_tc_ct_clean(uplink_priv->ct_priv); + netdev_warn(priv->netdev, + "Failed to initialize tc (eswitch), err: %d", err); + mlx5e_tc_post_act_destroy(uplink_priv->post_act); + return err; +} + +void mlx5e_tc_esw_cleanup(struct mlx5_rep_uplink_priv *uplink_priv) +{ + struct mlx5e_rep_priv *rpriv; + struct mlx5e_priv *priv; + + mlx5e_tc_tun_cleanup(uplink_priv->encap); + + mapping_destroy(uplink_priv->tunnel_enc_opts_mapping); + mapping_destroy(uplink_priv->tunnel_mapping); + + mlx5e_tc_sample_cleanup(uplink_priv->tc_psample); + mlx5e_tc_int_port_cleanup(uplink_priv->int_port_priv); + mlx5_tc_ct_clean(uplink_priv->ct_priv); + rpriv = container_of(uplink_priv, struct mlx5e_rep_priv, uplink_priv); + priv = netdev_priv(rpriv->netdev); + esw_vf_meter_destroy_all(priv->mdev->priv.eswitch); + mlx5e_flow_meters_cleanup(uplink_priv->flow_meters); + mlx5e_tc_post_act_destroy(uplink_priv->post_act); +} + +int mlx5e_tc_num_filters(struct mlx5e_priv *priv, unsigned long flags) +{ + struct rhashtable *tc_ht = get_tc_ht(priv, flags); + + return atomic_read(&tc_ht->nelems); +} + +void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw) +{ + struct mlx5e_tc_flow *flow, *tmp; + + list_for_each_entry_safe(flow, tmp, &esw->offloads.peer_flows, peer) + __mlx5e_tc_del_fdb_peer_flow(flow); +} + +void mlx5e_tc_reoffload_flows_work(struct work_struct *work) +{ + struct mlx5_rep_uplink_priv *rpriv = + container_of(work, struct mlx5_rep_uplink_priv, + reoffload_flows_work); + struct mlx5e_tc_flow *flow, *tmp; + + mutex_lock(&rpriv->unready_flows_lock); + list_for_each_entry_safe(flow, tmp, &rpriv->unready_flows, unready) { + if (!mlx5e_tc_add_fdb_flow(flow->priv, flow, NULL)) + unready_flow_del(flow); + } + mutex_unlock(&rpriv->unready_flows_lock); +} + +static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv, + struct flow_cls_offload *cls_flower, + unsigned long flags) +{ + switch (cls_flower->command) { + case FLOW_CLS_REPLACE: + return mlx5e_configure_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_DESTROY: + return mlx5e_delete_flower(priv->netdev, priv, cls_flower, + flags); + case FLOW_CLS_STATS: + return mlx5e_stats_flower(priv->netdev, priv, cls_flower, + flags); + default: + return -EOPNOTSUPP; + } +} + +int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + unsigned long flags = MLX5_TC_FLAG(INGRESS); + struct mlx5e_priv *priv = cb_priv; + + if (!priv->netdev || !netif_device_present(priv->netdev)) + return -EOPNOTSUPP; + + if (mlx5e_is_uplink_rep(priv)) + flags |= MLX5_TC_FLAG(ESW_OFFLOAD); + else + flags |= MLX5_TC_FLAG(NIC_OFFLOAD); + + switch (type) { + case 
TC_SETUP_CLSFLOWER: + return mlx5e_setup_tc_cls_flower(priv, type_data, flags); + default: + return -EOPNOTSUPP; + } +} + +bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, + struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + u32 chain = 0, chain_tag, reg_b, zone_restore_id; + struct mlx5e_priv *priv = netdev_priv(skb->dev); + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_mapped_obj mapped_obj; + struct tc_skb_ext *tc_skb_ext; + int err; + + reg_b = be32_to_cpu(cqe->ft_metadata); + + chain_tag = reg_b & MLX5E_TC_TABLE_CHAIN_TAG_MASK; + + err = mapping_find(tc->mapping, chain_tag, &mapped_obj); + if (err) { + netdev_dbg(priv->netdev, + "Couldn't find chain for chain tag: %d, err: %d\n", + chain_tag, err); + return false; + } + + if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN) { + chain = mapped_obj.chain; + tc_skb_ext = tc_skb_ext_alloc(skb); + if (WARN_ON(!tc_skb_ext)) + return false; + + tc_skb_ext->chain = chain; + + zone_restore_id = (reg_b >> MLX5_REG_MAPPING_MOFFSET(NIC_ZONE_RESTORE_TO_REG)) & + ESW_ZONE_ID_MASK; + + if (!mlx5e_tc_ct_restore_flow(tc->ct, skb, + zone_restore_id)) + return false; + } else { + netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type); + return false; + } +#endif /* CONFIG_NET_TC_SKB_EXT */ + + return true; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h new file mode 100644 index 0000000..12793c3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_EN_TC_H__ +#define __MLX5_EN_TC_H__ + +#include +#include "en.h" +#include "eswitch.h" +#include "en/tc_ct.h" +#include "en/tc_tun.h" +#include "en/tc/int_port.h" +#include "en/tc/meter.h" +#include "en_rep.h" + +#define MLX5E_TC_FLOW_ID_MASK 0x0000ffff + +#ifdef CONFIG_MLX5_ESWITCH + +#define NIC_FLOW_ATTR_SZ (sizeof(struct mlx5_flow_attr) +\ + sizeof(struct mlx5_nic_flow_attr)) +#define ESW_FLOW_ATTR_SZ (sizeof(struct mlx5_flow_attr) +\ + sizeof(struct mlx5_esw_flow_attr)) +#define ns_to_attr_sz(ns) (((ns) == MLX5_FLOW_NAMESPACE_FDB) ?\ + ESW_FLOW_ATTR_SZ :\ + NIC_FLOW_ATTR_SZ) + +struct mlx5_flow_attr; + +int mlx5e_tc_num_filters(struct mlx5e_priv *priv, unsigned long flags); + +struct mlx5e_tc_update_priv { + struct net_device *fwd_dev; +}; + +struct mlx5_nic_flow_attr { + u32 flow_tag; + u32 hairpin_tirn; + struct mlx5_flow_table *hairpin_ft; + u32 user_prio; +}; + +struct mlx5_flow_attr { + u32 action; + struct mlx5_fc *counter; + struct mlx5_modify_hdr *modify_hdr; + struct mlx5_ct_attr ct_attr; + struct mlx5e_sample_attr sample_attr; + struct mlx5e_meter_attr meter_attr; + struct mlx5e_tc_flow_parse_attr *parse_attr; + u32 chain; + u16 prio; + u32 dest_chain; + struct mlx5_flow_table *ft; + struct mlx5_flow_table *dest_ft; + u8 inner_match_level; + u8 outer_match_level; + u8 ip_version; + u8 tun_ip_version; + int tunnel_id; /* mapped tunnel id */ + u32 flags; + u32 exe_aso_type; + struct list_head list; + struct mlx5e_post_act_handle *post_act_handle; + union { + struct mlx5_esw_flow_attr esw_attr[0]; + struct mlx5_nic_flow_attr nic_attr[0]; + }; +}; + +enum { + MLX5_ATTR_FLAG_VLAN_HANDLED = BIT(0), + MLX5_ATTR_FLAG_SLOW_PATH = BIT(1), + MLX5_ATTR_FLAG_NO_IN_PORT = BIT(2), + MLX5_ATTR_FLAG_SRC_REWRITE = BIT(3), + MLX5_ATTR_FLAG_SAMPLE = BIT(4), + MLX5_ATTR_FLAG_ACCEPT = BIT(5), + MLX5_ATTR_FLAG_CT = BIT(6), +}; + +/* Returns true if any of the flags that require skipping further TC/NF processing are set. */ +static inline bool +mlx5e_tc_attr_flags_skip(u32 attr_flags) +{ + return attr_flags & (MLX5_ATTR_FLAG_SLOW_PATH | MLX5_ATTR_FLAG_ACCEPT); +} + +struct mlx5_rx_tun_attr { + u16 decap_vport; + union { + __be32 v4; + struct in6_addr v6; + } src_ip; /* Valid if decap_vport is not zero */ + union { + __be32 v4; + struct in6_addr v6; + } dst_ip; /* Valid if decap_vport is not zero */ + u32 vni; +}; + +#define MLX5E_TC_TABLE_CHAIN_TAG_BITS 16 +#define MLX5E_TC_TABLE_CHAIN_TAG_MASK GENMASK(MLX5E_TC_TABLE_CHAIN_TAG_BITS - 1, 0) + +#define MLX5E_TC_MAX_INT_PORT_NUM (8) + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + +struct tunnel_match_key { + struct flow_dissector_key_control enc_control; + struct flow_dissector_key_keyid enc_key_id; + struct flow_dissector_key_ports enc_tp; + struct flow_dissector_key_ip enc_ip; + union { + struct flow_dissector_key_ipv4_addrs enc_ipv4; + struct flow_dissector_key_ipv6_addrs enc_ipv6; + }; + + int filter_ifindex; +}; + +struct tunnel_match_enc_opts { + struct flow_dissector_key_enc_opts key; + struct flow_dissector_key_enc_opts mask; +}; + +/* Tunnel_id mapping is TUNNEL_INFO_BITS + ENC_OPTS_BITS. + * Upper TUNNEL_INFO_BITS for general tunnel info. + * Lower ENC_OPTS_BITS bits for enc_opts. 
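+ *
+ * For illustration: with TUNNEL_INFO_BITS = 12 and ENC_OPTS_BITS = 11
+ * (defined below), a mapped tunnel_id spans TUNNEL_ID_BITS = 23 bits,
+ * laid out as (tunnel_info_id << ENC_OPTS_BITS) | enc_opts_id, so the
+ * enc_opts mapping id occupies the low 11 bits and the tunnel info
+ * mapping id the 12 bits above it (field names here are illustrative).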
+ */ +#define TUNNEL_INFO_BITS 12 +#define TUNNEL_INFO_BITS_MASK GENMASK(TUNNEL_INFO_BITS - 1, 0) +#define ENC_OPTS_BITS 11 +#define ENC_OPTS_BITS_MASK GENMASK(ENC_OPTS_BITS - 1, 0) +#define TUNNEL_ID_BITS (TUNNEL_INFO_BITS + ENC_OPTS_BITS) +#define TUNNEL_ID_MASK GENMASK(TUNNEL_ID_BITS - 1, 0) + +enum { + MLX5E_TC_FLAG_INGRESS_BIT, + MLX5E_TC_FLAG_EGRESS_BIT, + MLX5E_TC_FLAG_NIC_OFFLOAD_BIT, + MLX5E_TC_FLAG_ESW_OFFLOAD_BIT, + MLX5E_TC_FLAG_FT_OFFLOAD_BIT, + MLX5E_TC_FLAG_LAST_EXPORTED_BIT = MLX5E_TC_FLAG_FT_OFFLOAD_BIT, +}; + +#define MLX5_TC_FLAG(flag) BIT(MLX5E_TC_FLAG_##flag##_BIT) + +int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv); +void mlx5e_tc_esw_cleanup(struct mlx5_rep_uplink_priv *uplink_priv); + +int mlx5e_tc_ht_init(struct rhashtable *tc_ht); +void mlx5e_tc_ht_cleanup(struct rhashtable *tc_ht); + +int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags); +int mlx5e_delete_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags); + +int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv, + struct flow_cls_offload *f, unsigned long flags); + +int mlx5e_tc_configure_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *f); +int mlx5e_tc_delete_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *f); +void mlx5e_tc_stats_matchall(struct mlx5e_priv *priv, + struct tc_cls_matchall_offload *ma); + +struct mlx5e_encap_entry; +void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct list_head *flow_list); +void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, + struct mlx5e_encap_entry *e, + struct list_head *flow_list); +bool mlx5e_encap_take(struct mlx5e_encap_entry *e); +void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e); + +void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list); +void mlx5e_put_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list); + +struct mlx5e_neigh_hash_entry; +struct mlx5e_encap_entry * +mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e); +void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe); + +void mlx5e_tc_reoffload_flows_work(struct work_struct *work); + +enum mlx5e_tc_attr_to_reg { + CHAIN_TO_REG, + VPORT_TO_REG, + TUNNEL_TO_REG, + CTSTATE_TO_REG, + ZONE_TO_REG, + ZONE_RESTORE_TO_REG, + MARK_TO_REG, + LABELS_TO_REG, + FTEID_TO_REG, + NIC_CHAIN_TO_REG, + NIC_ZONE_RESTORE_TO_REG, + USER_PRIO_TO_REG, + HP_OOB_CNT_COLOR_REG, + HP_OOB_TX_CNT_COLOR_REG, + PACKET_COLOR_TO_REG, +}; + +struct mlx5e_tc_attr_to_reg_mapping { + int mfield; /* rewrite field */ + int moffset; /* bit offset of mfield */ + int mlen; /* bits to rewrite/match */ + + int soffset; /* byte offset of spec for match */ +}; + +extern struct mlx5e_tc_attr_to_reg_mapping mlx5e_tc_attr_to_reg_mappings[]; + +#define MLX5_REG_MAPPING_MOFFSET(reg_id) (mlx5e_tc_attr_to_reg_mappings[reg_id].moffset) +#define MLX5_REG_MAPPING_MBITS(reg_id) (mlx5e_tc_attr_to_reg_mappings[reg_id].mlen) +#define MLX5_REG_MAPPING_MASK(reg_id) (GENMASK(mlx5e_tc_attr_to_reg_mappings[reg_id].mlen - 1, 0)) + +bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv, + struct net_device *out_dev); + +int mlx5e_tc_match_to_reg_set(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg type, + u32 
data); + +void mlx5e_tc_match_to_reg_mod_hdr_change(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5e_tc_attr_to_reg type, + int act_id, u32 data); + +void mlx5e_tc_match_to_reg_match(struct mlx5_flow_spec *spec, + enum mlx5e_tc_attr_to_reg type, + u32 data, + u32 mask); + +void mlx5e_tc_match_to_reg_get_match(struct mlx5_flow_spec *spec, + enum mlx5e_tc_attr_to_reg type, + u32 *data, + u32 *mask); + +int mlx5e_tc_match_to_reg_set_and_get_id(struct mlx5_core_dev *mdev, + struct mlx5e_tc_mod_hdr_acts *mod_hdr_acts, + enum mlx5_flow_namespace_type ns, + enum mlx5e_tc_attr_to_reg type, + u32 data); + +int mlx5e_tc_add_flow_mod_hdr(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct mlx5_flow_attr *attr); + +void mlx5e_tc_set_ethertype(struct mlx5_core_dev *mdev, + struct flow_match_basic *match, bool outer, + void *headers_c, void *headers_v); + +int mlx5e_tc_nic_init(struct mlx5e_priv *priv); +void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv); + +int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, + void *cb_priv); + +struct mlx5_flow_handle * +mlx5e_add_offloaded_nic_rule(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); +void mlx5e_del_offloaded_nic_rule(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr); + +struct mlx5_flow_handle * +mlx5_tc_rule_insert(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); +void +mlx5_tc_rule_delete(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr); + +bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_dev); +int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, + u16 *vport); + +int mlx5e_set_fwd_to_int_port_actions(struct mlx5e_priv *priv, + struct mlx5_flow_attr *attr, + int ifindex, + enum mlx5e_tc_int_port_type type, + u32 *action, + int out_index); +#else /* CONFIG_MLX5_CLS_ACT */ +static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {} +static inline int mlx5e_tc_ht_init(struct rhashtable *tc_ht) { return 0; } +static inline void mlx5e_tc_ht_cleanup(struct rhashtable *tc_ht) {} +static inline int +mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) +{ return -EOPNOTSUPP; } + +#endif /* CONFIG_MLX5_CLS_ACT */ + +struct mlx5_flow_attr *mlx5_alloc_flow_attr(enum mlx5_flow_namespace_type type); + +struct mlx5_flow_handle * +mlx5e_add_offloaded_nic_rule(struct mlx5e_priv *priv, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); +void mlx5e_del_offloaded_nic_rule(struct mlx5e_priv *priv, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr); + +#else /* CONFIG_MLX5_ESWITCH */ +static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; } +static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {} +static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv, + unsigned long flags) +{ + return 0; +} + +static inline int +mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) +{ return -EOPNOTSUPP; } +#endif + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) +static inline bool mlx5e_cqe_regb_chain(struct mlx5_cqe64 *cqe) +{ +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + u32 chain, reg_b; + + reg_b = be32_to_cpu(cqe->ft_metadata); + + if (reg_b >> (MLX5E_TC_TABLE_CHAIN_TAG_BITS + ESW_ZONE_ID_BITS)) + return 
false; + + chain = reg_b & MLX5E_TC_TABLE_CHAIN_TAG_MASK; + if (chain) + return true; +#endif + + return false; +} + +bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb); +int mlx5e_prio_hairpin_mode_enable(struct mlx5e_priv *priv, int num_hp, + struct net_device *peer_dev); +int mlx5e_prio_hairpin_mode_disable(struct mlx5e_priv *priv); +int create_prio_hp_sysfs(struct mlx5e_priv *priv, int prio); +int mlx5e_set_prio_hairpin_rate(struct mlx5e_priv *priv, + u16 prio, int rate); +int +mlx5e_hairpin_oob_cnt_enable(struct mlx5e_priv *priv, struct net_device *peer_dev); +int +mlx5e_hairpin_oob_cnt_disable(struct mlx5e_priv *priv); +void mlx5e_hairpin_oob_cnt_get(struct mlx5e_priv *priv, u64 *cnt); + +#else /* CONFIG_MLX5_CLS_ACT */ +static inline bool mlx5e_cqe_regb_chain(struct mlx5_cqe64 *cqe) +{ return false; } +static inline bool +mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, struct sk_buff *skb) +{ return true; } +#endif + +#endif /* __MLX5_EN_TC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c new file mode 100644 index 0000000..1fbb432 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -0,0 +1,1058 @@ +/* + * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "en.h" +#include "en/txrx.h" +#include "ipoib/ipoib.h" +#include "en_accel/en_accel.h" +#include "en_accel/ipsec_rxtx.h" +#include "en_accel/macsec.h" +#include "en/ptp.h" + +static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma) +{ + int i; + + for (i = 0; i < num_dma; i++) { + struct mlx5e_sq_dma *last_pushed_dma = + mlx5e_dma_get(sq, --sq->dma_fifo_pc); + + mlx5e_tx_dma_unmap(sq->pdev, last_pushed_dma); + } +} + +#ifdef CONFIG_MLX5_CORE_EN_DCB +static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb) +{ + int dscp_cp = 0; + + if (skb->protocol == htons(ETH_P_IP)) + dscp_cp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + else if (skb->protocol == htons(ETH_P_IPV6)) + dscp_cp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + + return priv->dcbx_dp.dscp2prio[dscp_cp]; +} +#endif + +static int mlx5e_get_up(struct mlx5e_priv *priv, struct sk_buff *skb) +{ +#ifdef CONFIG_MLX5_CORE_EN_DCB + if (READ_ONCE(priv->dcbx_dp.trust_state) == MLX5_QPTS_TRUST_DSCP) + return mlx5e_get_dscp_up(priv, skb); +#endif + if (skb_vlan_tag_present(skb)) + return skb_vlan_tag_get_prio(skb); + return 0; +} + +static u16 mlx5e_select_ptpsq(struct net_device *dev, struct sk_buff *skb, + struct mlx5e_select_queue_params *selq) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + int up; + + up = selq->num_tcs > 1 ? mlx5e_get_up(priv, skb) : 0; + + return selq->num_regular_queues + up; +} + +static int mlx5e_select_htb_queue(struct mlx5e_priv *priv, struct sk_buff *skb) +{ + u16 classid; + + /* Order maj_id before defcls - pairs with mlx5e_htb_root_add. */ + if ((TC_H_MAJ(skb->priority) >> 16) == smp_load_acquire(&priv->htb.maj_id)) + classid = TC_H_MIN(skb->priority); + else + classid = READ_ONCE(priv->htb.defcls); + + if (!classid) + return 0; + + return mlx5e_get_txq_by_classid(priv, classid); +} + +u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_select_queue_params *selq; + int txq_ix, up; + + selq = rcu_dereference_bh(priv->selq); + /* This is a workaround needed only for the mlx5e_netdev_change_profile + * flow that zeroes out the whole priv without unregistering the netdev + * and without preventing ndo_select_queue from being called. + */ + if (unlikely(!selq)) + return 0; + if (unlikely(selq->is_ptp || selq->is_htb)) { + if (unlikely(selq->is_htb)) { + txq_ix = mlx5e_select_htb_queue(priv, skb); + if (txq_ix > 0) + return txq_ix; + } + + if (unlikely(selq->is_ptp)) + if (unlikely(mlx5e_use_ptpsq(skb))) + return mlx5e_select_ptpsq(dev, skb, selq); + + txq_ix = netdev_pick_tx(dev, skb, NULL); + /* Fix netdev_pick_tx() not to choose ptp_channel and HTB txqs. + * If they are selected, switch to regular queues. + * Driver to select these queues only at mlx5e_select_ptpsq() + * and mlx5e_select_htb_queue(). + */ + if (unlikely(txq_ix >= selq->num_regular_queues)) + txq_ix %= selq->num_regular_queues; + } else { + txq_ix = netdev_pick_tx(dev, skb, NULL); + } + + if (selq->num_tcs <= 1) + return txq_ix; + + up = mlx5e_get_up(priv, skb); + + /* Normalize any picked txq_ix to [0, num_channels), + * So we can return a txq_ix that matches the channel and + * packet UP. 
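+	 * Regular TXQs are laid out as num_tcs consecutive groups of
+	 * num_channels queues, hence channel + up * num_channels below.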
+ */ + return txq_ix % selq->num_channels + up * selq->num_channels; +} + +static inline int mlx5e_skb_l2_header_offset(struct sk_buff *skb) +{ +#define MLX5E_MIN_INLINE (ETH_HLEN + VLAN_HLEN) + + return max(skb_network_offset(skb), MLX5E_MIN_INLINE); +} + +static inline int mlx5e_skb_l3_header_offset(struct sk_buff *skb) +{ + if (skb_transport_header_was_set(skb)) + return skb_transport_offset(skb); + else + return mlx5e_skb_l2_header_offset(skb); +} + +static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode, + struct sk_buff *skb) +{ + u16 hlen; + + switch (mode) { + case MLX5_INLINE_MODE_NONE: + return 0; + case MLX5_INLINE_MODE_TCP_UDP: + hlen = eth_get_headlen(skb->dev, skb->data, skb_headlen(skb)); + if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb)) + hlen += VLAN_HLEN; + break; + case MLX5_INLINE_MODE_IP: + hlen = mlx5e_skb_l3_header_offset(skb); + break; + case MLX5_INLINE_MODE_L2: + default: + hlen = mlx5e_skb_l2_header_offset(skb); + } + return min_t(u16, hlen, skb_headlen(skb)); +} + +static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs) +{ + struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start; + int cpy1_sz = 2 * ETH_ALEN; + int cpy2_sz = ihs - cpy1_sz; + + memcpy(&vhdr->addrs, skb->data, cpy1_sz); + vhdr->h_vlan_proto = skb->vlan_proto; + vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb)); + memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz); +} + +static inline void +mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5e_accel_tx_state *accel, + struct mlx5_wqe_eth_seg *eseg) +{ + if (unlikely(mlx5e_ipsec_txwqe_build_eseg_csum(sq, skb, eseg))) + return; + + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; + if (skb->encapsulation) { + eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM | + MLX5_ETH_WQE_L4_INNER_CSUM; + sq->stats->csum_partial_inner++; + } else { + eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; + sq->stats->csum_partial++; + } +#ifdef CONFIG_MLX5_EN_TLS + } else if (unlikely(accel && accel->tls.tls_tisn)) { + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; + sq->stats->csum_partial++; +#endif + } else + sq->stats->csum_none++; +} + +static inline u16 +mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb) +{ + struct mlx5e_sq_stats *stats = sq->stats; + u16 ihs; + + if (skb->encapsulation) { + ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); + stats->tso_inner_packets++; + stats->tso_inner_bytes += skb->len - ihs; + } else { + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + ihs = skb_transport_offset(skb) + sizeof(struct udphdr); + else + ihs = skb_transport_offset(skb) + tcp_hdrlen(skb); + stats->tso_packets++; + stats->tso_bytes += skb->len - ihs; + } + + return ihs; +} + +static inline int +mlx5e_txwqe_build_dsegs(struct mlx5e_txqsq *sq, struct sk_buff *skb, + unsigned char *skb_data, u16 headlen, + struct mlx5_wqe_data_seg *dseg) +{ + dma_addr_t dma_addr = 0; + u8 num_dma = 0; + int i; + + if (headlen) { + dma_addr = dma_map_single(sq->pdev, skb_data, headlen, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) + goto dma_unmap_wqe_err; + + dseg->addr = cpu_to_be64(dma_addr); + dseg->lkey = sq->mkey_be; + dseg->byte_count = cpu_to_be32(headlen); + + mlx5e_dma_push(sq, dma_addr, headlen, MLX5E_DMA_MAP_SINGLE); + num_dma++; + dseg++; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + int fsz = skb_frag_size(frag); + 
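+		/* Each frag gets its own data segment; the mapping is pushed to
+		 * the DMA fifo so it can be unwound on a mapping error.
+		 */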
+ dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) + goto dma_unmap_wqe_err; + + dseg->addr = cpu_to_be64(dma_addr); + dseg->lkey = sq->mkey_be; + dseg->byte_count = cpu_to_be32(fsz); + + mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE); + num_dma++; + dseg++; + } + + return num_dma; + +dma_unmap_wqe_err: + mlx5e_dma_unmap_wqe_err(sq, num_dma); + return -ENOMEM; +} + +struct mlx5e_tx_attr { + u32 num_bytes; + u16 headlen; + u16 ihs; + __be16 mss; + u16 insz; + u8 opcode; +}; + +struct mlx5e_tx_wqe_attr { + u16 ds_cnt; + u16 ds_cnt_inl; + u16 ds_cnt_ids; + u8 num_wqebbs; +}; + +static u8 +mlx5e_tx_wqe_inline_mode(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5e_accel_tx_state *accel) +{ + u8 mode; + +#ifdef CONFIG_MLX5_EN_TLS + if (accel && accel->tls.tls_tisn) + return MLX5_INLINE_MODE_TCP_UDP; +#endif + + mode = sq->min_inline_mode; + + if (skb_vlan_tag_present(skb) && + test_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state)) + mode = max_t(u8, MLX5_INLINE_MODE_L2, mode); + + return mode; +} + +static void mlx5e_sq_xmit_prepare(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5e_accel_tx_state *accel, + struct mlx5e_tx_attr *attr) +{ + struct mlx5e_sq_stats *stats = sq->stats; + + if (skb_is_gso(skb)) { + u16 ihs = mlx5e_tx_get_gso_ihs(sq, skb); + + *attr = (struct mlx5e_tx_attr) { + .opcode = MLX5_OPCODE_LSO, + .mss = cpu_to_be16(skb_shinfo(skb)->gso_size), + .ihs = ihs, + .num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs, + .headlen = skb_headlen(skb) - ihs, + }; + + stats->packets += skb_shinfo(skb)->gso_segs; + } else { + u8 mode = mlx5e_tx_wqe_inline_mode(sq, skb, accel); + u16 ihs = mlx5e_calc_min_inline(mode, skb); + + *attr = (struct mlx5e_tx_attr) { + .opcode = MLX5_OPCODE_SEND, + .mss = cpu_to_be16(0), + .ihs = ihs, + .num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN), + .headlen = skb_headlen(skb) - ihs, + }; + + stats->packets++; + } + + attr->insz = mlx5e_accel_tx_ids_len(sq, accel); + stats->bytes += attr->num_bytes; +} + +static void mlx5e_sq_calc_wqe_attr(struct sk_buff *skb, const struct mlx5e_tx_attr *attr, + struct mlx5e_tx_wqe_attr *wqe_attr) +{ + u16 ds_cnt = MLX5E_TX_WQE_EMPTY_DS_COUNT; + u16 ds_cnt_inl = 0; + u16 ds_cnt_ids = 0; + + if (attr->insz) + ds_cnt_ids = DIV_ROUND_UP(sizeof(struct mlx5_wqe_inline_seg) + attr->insz, + MLX5_SEND_WQE_DS); + + ds_cnt += !!attr->headlen + skb_shinfo(skb)->nr_frags + ds_cnt_ids; + if (attr->ihs) { + u16 inl = attr->ihs - INL_HDR_START_SZ; + + if (skb_vlan_tag_present(skb)) + inl += VLAN_HLEN; + + ds_cnt_inl = DIV_ROUND_UP(inl, MLX5_SEND_WQE_DS); + ds_cnt += ds_cnt_inl; + } + + *wqe_attr = (struct mlx5e_tx_wqe_attr) { + .ds_cnt = ds_cnt, + .ds_cnt_inl = ds_cnt_inl, + .ds_cnt_ids = ds_cnt_ids, + .num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS), + }; +} + +static void mlx5e_tx_skb_update_hwts_flags(struct sk_buff *skb) +{ + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; +} + +static void mlx5e_tx_check_stop(struct mlx5e_txqsq *sq) +{ + if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, sq->stop_room))) { + netif_tx_stop_queue(sq->txq); + sq->stats->stopped++; + } +} + +static inline void +mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, + const struct mlx5e_tx_attr *attr, + const struct mlx5e_tx_wqe_attr *wqe_attr, u8 num_dma, + struct mlx5e_tx_wqe_info *wi, struct mlx5_wqe_ctrl_seg *cseg, + bool xmit_more) +{ + struct 
mlx5_wq_cyc *wq = &sq->wq; + bool send_doorbell; + + *wi = (struct mlx5e_tx_wqe_info) { + .skb = skb, + .num_bytes = attr->num_bytes, + .num_dma = num_dma, + .num_wqebbs = wqe_attr->num_wqebbs, + .num_fifo_pkts = 0, + }; + + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | attr->opcode); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | wqe_attr->ds_cnt); + + mlx5e_tx_skb_update_hwts_flags(skb); + + sq->pc += wi->num_wqebbs; + + mlx5e_tx_check_stop(sq); + + if (unlikely(sq->ptpsq)) { + mlx5e_skb_cb_hwtstamp_init(skb); + mlx5e_skb_fifo_push(&sq->ptpsq->skb_fifo, skb); + if (!netif_tx_queue_stopped(sq->txq) && + !mlx5e_skb_fifo_has_room(&sq->ptpsq->skb_fifo)) { + netif_tx_stop_queue(sq->txq); + sq->stats->stopped++; + } + skb_get(skb); + } + + send_doorbell = __netdev_tx_sent_queue(sq->txq, attr->num_bytes, xmit_more); + if (send_doorbell) + mlx5e_notify_hw(wq, sq->pc, sq->uar_map, cseg); +} + +static void +mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, + const struct mlx5e_tx_attr *attr, const struct mlx5e_tx_wqe_attr *wqe_attr, + struct mlx5e_tx_wqe *wqe, u16 pi, bool xmit_more) +{ + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5_wqe_eth_seg *eseg; + struct mlx5_wqe_data_seg *dseg; + struct mlx5e_tx_wqe_info *wi; + + struct mlx5e_sq_stats *stats = sq->stats; + int num_dma; + + stats->xmit_more += xmit_more; + + /* fill wqe */ + wi = &sq->db.wqe_info[pi]; + cseg = &wqe->ctrl; + eseg = &wqe->eth; + dseg = wqe->data; + + eseg->mss = attr->mss; + + if (attr->ihs) { + if (skb_vlan_tag_present(skb)) { + eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs + VLAN_HLEN); + mlx5e_insert_vlan(eseg->inline_hdr.start, skb, attr->ihs); + stats->added_vlan_packets++; + } else { + eseg->inline_hdr.sz |= cpu_to_be16(attr->ihs); + memcpy(eseg->inline_hdr.start, skb->data, attr->ihs); + } + dseg += wqe_attr->ds_cnt_inl; + } else if (skb_vlan_tag_present(skb)) { + eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN); + if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD)) + eseg->insert.type |= cpu_to_be16(MLX5_ETH_WQE_SVLAN); + eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb)); + stats->added_vlan_packets++; + } + + dseg += wqe_attr->ds_cnt_ids; + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr->ihs, + attr->headlen, dseg); + if (unlikely(num_dma < 0)) + goto err_drop; + + mlx5e_txwqe_complete(sq, skb, attr, wqe_attr, num_dma, wi, cseg, xmit_more); + + sq->dim_obj.sample.pkt_ctr = sq->stats->packets; + sq->dim_obj.sample.byte_ctr = sq->stats->bytes; + + return; + +err_drop: + stats->dropped++; + dev_kfree_skb_any(skb); +} + +static bool mlx5e_tx_skb_supports_mpwqe(struct sk_buff *skb, struct mlx5e_tx_attr *attr) +{ + return !skb_is_nonlinear(skb) && !skb_vlan_tag_present(skb) && !attr->ihs && + !attr->insz && !mlx5e_macsec_skb_is_offload(skb); +} + +static bool mlx5e_tx_mpwqe_same_eseg(struct mlx5e_txqsq *sq, struct mlx5_wqe_eth_seg *eseg) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + + /* Assumes the session is already running and has at least one packet. 
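+	 * Packets can share an MPWQE only if their Ethernet segments are
+	 * byte-for-byte identical, hence the plain memcmp of the cached eseg.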
*/ + return !memcmp(&session->wqe->eth, eseg, MLX5E_ACCEL_ESEG_LEN); +} + +static void mlx5e_tx_mpwqe_session_start(struct mlx5e_txqsq *sq, + struct mlx5_wqe_eth_seg *eseg) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5e_tx_wqe *wqe; + u16 pi; + + pi = mlx5e_txqsq_get_next_pi(sq, sq->max_sq_mpw_wqebbs); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); + net_prefetchw(wqe->data); + + *session = (struct mlx5e_tx_mpwqe) { + .wqe = wqe, + .bytes_count = 0, + .ds_count = MLX5E_TX_WQE_EMPTY_DS_COUNT, + .pkt_count = 0, + .inline_on = 0, + }; + + memcpy(&session->wqe->eth, eseg, MLX5E_ACCEL_ESEG_LEN); + + sq->stats->mpwqe_blks++; +} + +static bool mlx5e_tx_mpwqe_session_is_active(struct mlx5e_txqsq *sq) +{ + return sq->mpwqe.wqe; +} + +static void mlx5e_tx_mpwqe_add_dseg(struct mlx5e_txqsq *sq, struct mlx5e_xmit_data *txd) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5_wqe_data_seg *dseg; + + dseg = (struct mlx5_wqe_data_seg *)session->wqe + session->ds_count; + + session->pkt_count++; + session->bytes_count += txd->len; + + dseg->addr = cpu_to_be64(txd->dma_addr); + dseg->byte_count = cpu_to_be32(txd->len); + dseg->lkey = sq->mkey_be; + session->ds_count++; + + sq->stats->mpwqe_pkts++; +} + +static struct mlx5_wqe_ctrl_seg *mlx5e_tx_mpwqe_session_complete(struct mlx5e_txqsq *sq) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + u8 ds_count = session->ds_count; + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5e_tx_wqe_info *wi; + u16 pi; + + cseg = &session->wqe->ctrl; + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_ENHANCED_MPSW); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_count); + + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + wi = &sq->db.wqe_info[pi]; + *wi = (struct mlx5e_tx_wqe_info) { + .skb = NULL, + .num_bytes = session->bytes_count, + .num_wqebbs = DIV_ROUND_UP(ds_count, MLX5_SEND_WQEBB_NUM_DS), + .num_dma = session->pkt_count, + .num_fifo_pkts = session->pkt_count, + }; + + sq->pc += wi->num_wqebbs; + + session->wqe = NULL; + + mlx5e_tx_check_stop(sq); + + return cseg; +} + +static void +mlx5e_sq_xmit_mpwqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg, bool xmit_more) +{ + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5e_xmit_data txd; + + if (!mlx5e_tx_mpwqe_session_is_active(sq)) { + mlx5e_tx_mpwqe_session_start(sq, eseg); + } else if (!mlx5e_tx_mpwqe_same_eseg(sq, eseg)) { + mlx5e_tx_mpwqe_session_complete(sq); + mlx5e_tx_mpwqe_session_start(sq, eseg); + } + + sq->stats->xmit_more += xmit_more; + + txd.data = skb->data; + txd.len = skb->len; + + txd.dma_addr = dma_map_single(sq->pdev, txd.data, txd.len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(sq->pdev, txd.dma_addr))) + goto err_unmap; + mlx5e_dma_push(sq, txd.dma_addr, txd.len, MLX5E_DMA_MAP_SINGLE); + + mlx5e_skb_fifo_push(&sq->db.skb_fifo, skb); + + mlx5e_tx_mpwqe_add_dseg(sq, &txd); + + mlx5e_tx_skb_update_hwts_flags(skb); + + if (unlikely(mlx5e_tx_mpwqe_is_full(&sq->mpwqe, sq->max_sq_mpw_wqebbs))) { + /* Might stop the queue and affect the retval of __netdev_tx_sent_queue. */ + cseg = mlx5e_tx_mpwqe_session_complete(sq); + + if (__netdev_tx_sent_queue(sq->txq, txd.len, xmit_more)) + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); + } else if (__netdev_tx_sent_queue(sq->txq, txd.len, xmit_more)) { + /* Might stop the queue, but we were asked to ring the doorbell anyway. 
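+		 * xmit_more is not set (or BQL stopped the queue), so a doorbell
+		 * is required: close the open session and notify HW now.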
*/ + cseg = mlx5e_tx_mpwqe_session_complete(sq); + + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); + } + + return; + +err_unmap: + mlx5e_dma_unmap_wqe_err(sq, 1); + sq->stats->dropped++; + dev_kfree_skb_any(skb); +} + +void mlx5e_tx_mpwqe_ensure_complete(struct mlx5e_txqsq *sq) +{ + /* Unlikely in non-MPWQE workloads; not important in MPWQE workloads. */ + if (unlikely(mlx5e_tx_mpwqe_session_is_active(sq))) + mlx5e_tx_mpwqe_session_complete(sq); +} + +static void mlx5e_cqe_ts_id_eseg(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) +{ + if (MLX5_CAP_GEN_2(sq->mdev, ts_cqe_metadata_size2wqe_counter) && + unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + eseg->flow_table_metadata = cpu_to_be32(sq->ptpsq->skb_fifo_pc & + sq->ptpsq->ts_cqe_ctr_mask); +} + +static void mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq, + struct sk_buff *skb, struct mlx5e_accel_tx_state *accel, + struct mlx5_wqe_eth_seg *eseg, u16 ihs) +{ + mlx5e_accel_tx_eseg(priv, skb, eseg, ihs); + mlx5e_txwqe_build_eseg_csum(sq, skb, accel, eseg); + if (unlikely(sq->ptpsq)) + mlx5e_cqe_ts_id_eseg(sq, skb, eseg); +} + +netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + struct mlx5e_accel_tx_state accel = {}; + struct mlx5e_tx_wqe_attr wqe_attr; + struct mlx5e_tx_attr attr; + struct mlx5e_tx_wqe *wqe; + struct mlx5e_txqsq *sq; + u16 pi; + + sq = priv->txq2sq[skb_get_queue_mapping(skb)]; + if (unlikely(!sq)) { + /* HTB queues are not guaranteed to be present in txq2sq. First, + * the HTB node is registered, which allows mlx5e_select_queue + * to select the corresponding queue ID. The SQ is created a bit + * later, which leaves a time frame where txq2sq is still NULL. + * Also, the SQ might fail to be created, which leaves txq2sq as + * NULL for indefinite time. + */ + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + /* May send SKBs and WQEs. */ + if (unlikely(!mlx5e_accel_tx_begin(dev, sq, skb, &accel))) + return NETDEV_TX_OK; + + mlx5e_sq_xmit_prepare(sq, skb, &accel, &attr); + + if (test_bit(MLX5E_SQ_STATE_MPWQE, &sq->state)) { + if (mlx5e_tx_skb_supports_mpwqe(skb, &attr)) { + struct mlx5_wqe_eth_seg eseg = {}; + + mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &eseg, attr.ihs); + mlx5e_sq_xmit_mpwqe(sq, skb, &eseg, netdev_xmit_more()); + return NETDEV_TX_OK; + } + + mlx5e_tx_mpwqe_ensure_complete(sq); + } + + mlx5e_sq_calc_wqe_attr(skb, &attr, &wqe_attr); + pi = mlx5e_txqsq_get_next_pi(sq, wqe_attr.num_wqebbs); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); + + /* May update the WQE, but may not post other WQEs. 
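+	 * mlx5e_accel_tx_finish() applies the offload state collected in
+	 * mlx5e_accel_tx_begin() (e.g. TLS/IPsec metadata) to this WQE.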
*/ + mlx5e_accel_tx_finish(sq, wqe, &accel, + (struct mlx5_wqe_inline_seg *)(wqe->data + wqe_attr.ds_cnt_inl)); + mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &wqe->eth, attr.ihs); + mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, netdev_xmit_more()); + + return NETDEV_TX_OK; +} + +void mlx5e_sq_xmit_simple(struct mlx5e_txqsq *sq, struct sk_buff *skb, bool xmit_more) +{ + struct mlx5e_tx_wqe_attr wqe_attr; + struct mlx5e_tx_attr attr; + struct mlx5e_tx_wqe *wqe; + u16 pi; + + mlx5e_sq_xmit_prepare(sq, skb, NULL, &attr); + mlx5e_sq_calc_wqe_attr(skb, &attr, &wqe_attr); + pi = mlx5e_txqsq_get_next_pi(sq, wqe_attr.num_wqebbs); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); + mlx5e_txwqe_build_eseg_csum(sq, skb, NULL, &wqe->eth); + mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, xmit_more); +} + +static void mlx5e_tx_wi_dma_unmap(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc) +{ + int i; + + for (i = 0; i < wi->num_dma; i++) { + struct mlx5e_sq_dma *dma = mlx5e_dma_get(sq, (*dma_fifo_cc)++); + + mlx5e_tx_dma_unmap(sq->pdev, dma); + } +} + +static void mlx5e_consume_skb(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, int napi_budget) +{ + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + struct skb_shared_hwtstamps hwts = {}; + u64 ts = get_cqe_ts(cqe); + + hwts.hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, ts); + if (sq->ptpsq) + mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_CQE_HWTSTAMP, + hwts.hwtstamp, sq->ptpsq->cq_stats); + else + skb_tstamp_tx(skb, &hwts); + } + + napi_consume_skb(skb, napi_budget); +} + +static void mlx5e_tx_wi_consume_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi, + struct mlx5_cqe64 *cqe, int napi_budget) +{ + int i; + + for (i = 0; i < wi->num_fifo_pkts; i++) { + struct sk_buff *skb = mlx5e_skb_fifo_pop(&sq->db.skb_fifo); + + mlx5e_consume_skb(sq, skb, cqe, napi_budget); + } +} + +bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) +{ + struct mlx5e_sq_stats *stats; + struct mlx5e_txqsq *sq; + struct mlx5_cqe64 *cqe; + u32 dma_fifo_cc; + u32 nbytes; + u16 npkts; + u16 sqcc; + int i; + + sq = container_of(cq, struct mlx5e_txqsq, cq); + + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) + return false; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (!cqe) + return false; + + stats = sq->stats; + + npkts = 0; + nbytes = 0; + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + sqcc = sq->cc; + + /* avoid dirtying sq cache line every cqe */ + dma_fifo_cc = sq->dma_fifo_cc; + + i = 0; + do { + struct mlx5e_tx_wqe_info *wi; + u16 wqe_counter; + bool last_wqe; + u16 ci; + + mlx5_cqwq_pop(&cq->wq); + + wqe_counter = be16_to_cpu(cqe->wqe_counter); + + do { + last_wqe = (sqcc == wqe_counter); + + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + wi = &sq->db.wqe_info[ci]; + + sqcc += wi->num_wqebbs; + + if (likely(wi->skb)) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + mlx5e_consume_skb(sq, wi->skb, cqe, napi_budget); + + npkts++; + nbytes += wi->num_bytes; + continue; + } + + if (unlikely(mlx5e_ktls_tx_try_handle_resync_dump_comp(sq, wi, + &dma_fifo_cc))) + continue; + + if (wi->num_fifo_pkts) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + mlx5e_tx_wi_consume_fifo_skbs(sq, wi, cqe, napi_budget); + + npkts += wi->num_fifo_pkts; + nbytes += wi->num_bytes; + } + } while (!last_wqe); + + if (unlikely(get_cqe_opcode(cqe) == MLX5_CQE_REQ_ERR)) { + if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, + &sq->state)) 
{ + mlx5e_dump_error_cqe(&sq->cq, sq->sqn, + (struct mlx5_err_cqe *)cqe); + mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs); + queue_work(cq->priv->wq, &sq->recover_work); + } + stats->cqe_err++; + } + + } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + + stats->cqes += i; + + mlx5_cqwq_update_db_record(&cq->wq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + sq->dma_fifo_cc = dma_fifo_cc; + sq->cc = sqcc; + + netdev_tx_completed_queue(sq->txq, npkts, nbytes); + + if (netif_tx_queue_stopped(sq->txq) && + mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, sq->stop_room) && + mlx5e_ptpsq_fifo_has_room(sq) && + !test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) { + netif_tx_wake_queue(sq->txq); + stats->wake++; + } + + return (i == MLX5E_TX_CQ_POLL_BUDGET); +} + +static void mlx5e_tx_wi_kfree_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi) +{ + int i; + + for (i = 0; i < wi->num_fifo_pkts; i++) + dev_kfree_skb_any(mlx5e_skb_fifo_pop(&sq->db.skb_fifo)); +} + +void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) +{ + struct mlx5e_tx_wqe_info *wi; + u32 dma_fifo_cc, nbytes = 0; + u16 ci, sqcc, npkts = 0; + + sqcc = sq->cc; + dma_fifo_cc = sq->dma_fifo_cc; + + while (sqcc != sq->pc) { + ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); + wi = &sq->db.wqe_info[ci]; + + sqcc += wi->num_wqebbs; + + if (likely(wi->skb)) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + dev_kfree_skb_any(wi->skb); + + npkts++; + nbytes += wi->num_bytes; + continue; + } + + if (unlikely(mlx5e_ktls_tx_try_handle_resync_dump_comp(sq, wi, &dma_fifo_cc))) + continue; + + if (wi->num_fifo_pkts) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + mlx5e_tx_wi_kfree_fifo_skbs(sq, wi); + + npkts += wi->num_fifo_pkts; + nbytes += wi->num_bytes; + } + } + + sq->dma_fifo_cc = dma_fifo_cc; + sq->cc = sqcc; + + netdev_tx_completed_queue(sq->txq, npkts, nbytes); +} + +#ifdef CONFIG_MLX5_CORE_IPOIB +static inline void +mlx5i_txwqe_build_datagram(struct mlx5_av *av, u32 dqpn, u32 dqkey, + struct mlx5_wqe_datagram_seg *dseg) +{ + memcpy(&dseg->av, av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(dqpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(dqkey); +} + +static void mlx5i_sq_calc_wqe_attr(struct sk_buff *skb, + const struct mlx5e_tx_attr *attr, + struct mlx5e_tx_wqe_attr *wqe_attr) +{ + u16 ds_cnt = sizeof(struct mlx5i_tx_wqe) / MLX5_SEND_WQE_DS; + u16 ds_cnt_inl = 0; + + ds_cnt += !!attr->headlen + skb_shinfo(skb)->nr_frags; + + if (attr->ihs) { + u16 inl = attr->ihs - INL_HDR_START_SZ; + + ds_cnt_inl = DIV_ROUND_UP(inl, MLX5_SEND_WQE_DS); + ds_cnt += ds_cnt_inl; + } + + *wqe_attr = (struct mlx5e_tx_wqe_attr) { + .ds_cnt = ds_cnt, + .ds_cnt_inl = ds_cnt_inl, + .num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS), + }; +} + +void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_av *av, u32 dqpn, u32 dqkey, bool xmit_more) +{ + struct mlx5e_tx_wqe_attr wqe_attr; + struct mlx5e_tx_attr attr; + struct mlx5i_tx_wqe *wqe; + + struct mlx5_wqe_datagram_seg *datagram; + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5_wqe_eth_seg *eseg; + struct mlx5_wqe_data_seg *dseg; + struct mlx5e_tx_wqe_info *wi; + + struct mlx5e_sq_stats *stats = sq->stats; + int num_dma; + u16 pi; + + mlx5e_sq_xmit_prepare(sq, skb, NULL, &attr); + mlx5i_sq_calc_wqe_attr(skb, &attr, &wqe_attr); + + pi = mlx5e_txqsq_get_next_pi(sq, wqe_attr.num_wqebbs); + wqe = MLX5I_SQ_FETCH_WQE(sq, pi); + + stats->xmit_more += xmit_more; + + /* fill wqe */ + wi = 
&sq->db.wqe_info[pi]; + cseg = &wqe->ctrl; + datagram = &wqe->datagram; + eseg = &wqe->eth; + dseg = wqe->data; + + mlx5i_txwqe_build_datagram(av, dqpn, dqkey, datagram); + + mlx5e_txwqe_build_eseg_csum(sq, skb, NULL, eseg); + + eseg->mss = attr.mss; + + if (attr.ihs) { + memcpy(eseg->inline_hdr.start, skb->data, attr.ihs); + eseg->inline_hdr.sz = cpu_to_be16(attr.ihs); + dseg += wqe_attr.ds_cnt_inl; + } + + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr.ihs, + attr.headlen, dseg); + if (unlikely(num_dma < 0)) + goto err_drop; + + mlx5e_txwqe_complete(sq, skb, &attr, &wqe_attr, num_dma, wi, cseg, xmit_more); + + sq->dim_obj.sample.pkt_ctr = sq->stats->packets; + sq->dim_obj.sample.byte_ctr = sq->stats->bytes; + + return; + +err_drop: + stats->dropped++; + dev_kfree_skb_any(skb); +} +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c new file mode 100644 index 0000000..f3cd608 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "en.h" +#include "en/txrx.h" +#include "en/xdp.h" +#include "en/xsk/rx.h" +#include "en/xsk/tx.h" +#include "en_accel/ktls_txrx.h" +#include "en/txrx.h" + +static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c) +{ + int current_cpu = smp_processor_id(); + + return cpumask_test_cpu(current_cpu, c->aff_mask); +} + +static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq) +{ + struct dim_sample *sample = &sq->dim_obj.sample; + + if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state))) + return; + + dim_update_sample(sq->cq.event_ctr, sample->pkt_ctr, sample->byte_ctr, sample); + net_dim(&sq->dim_obj.dim, *sample); +} + +static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq) +{ + struct dim_sample *sample = &rq->dim_obj.sample; + + if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state))) + return; + + dim_update_sample(rq->cq.event_ctr, sample->pkt_ctr, sample->byte_ctr, sample); + net_dim(&rq->dim_obj.dim, *sample); +} + +static void mlx5e_rx_dim_cq_rearm(struct mlx5e_priv *priv, struct mlx5e_rq *rq) +{ + mlx5e_handle_rx_dim(rq); + if (test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)) + mlx5e_cq_arm(&rq->cq); +} + +void mlx5e_trigger_irq(struct mlx5e_icosq *sq) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + struct mlx5e_tx_wqe *nopwqe; + u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + + sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { + .wqe_type = MLX5E_ICOSQ_WQE_NOP, + .num_wqebbs = 1, + }; + + nopwqe = mlx5e_post_nop(wq, sq->sqn, &sq->pc); + mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl); +} + +static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq) +{ + bool busy_xsk = false, xsk_rx_alloc_err; + + /* Handle the race between the application querying need_wakeup and the + * driver setting it: + * 1. Update need_wakeup both before and after the TX. If it goes to + * "yes", it can only happen with the first update. + * 2. If the application queried need_wakeup before we set it, the + * packets will be transmitted anyway, even w/o a wakeup. + * 3. Give a chance to clear need_wakeup after new packets were queued + * for TX. + */ + mlx5e_xsk_update_tx_wakeup(xsksq); + busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET); + mlx5e_xsk_update_tx_wakeup(xsksq); + + xsk_rx_alloc_err = INDIRECT_CALL_2(xskrq->post_wqes, + mlx5e_post_rx_mpwqes, + mlx5e_post_rx_wqes, + xskrq); + busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err); + + return busy_xsk; +} + +int mlx5e_napi_poll(struct napi_struct *napi, int budget) +{ + struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel, + napi); + struct mlx5e_ch_stats *ch_stats = c->stats; + struct mlx5e_xdpsq *xsksq = &c->xsksq; + struct mlx5e_txqsq __rcu **qos_sqs; + struct mlx5e_rq *xskrq = &c->xskrq; + struct mlx5e_rq *rq = &c->rq; + bool aff_change = false; + bool busy_xsk = false; + bool busy = false; + int work_done = 0; + u16 qos_sqs_size; + bool xsk_open; + int i; + + rcu_read_lock(); + + qos_sqs = rcu_dereference(c->qos_sqs); + + xsk_open = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state); + + ch_stats->poll++; + + for (i = 0; i < c->num_tc; i++) + busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget); + + if (unlikely(qos_sqs)) { + smp_rmb(); /* Pairs with mlx5e_qos_alloc_queues. 
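+		 * qos_sqs_size is written before the qos_sqs pointer is
+		 * published, so after the barrier it is a valid bound for the array.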
*/ + qos_sqs_size = READ_ONCE(c->qos_sqs_size); + + for (i = 0; i < qos_sqs_size; i++) { + struct mlx5e_txqsq *sq = rcu_dereference(qos_sqs[i]); + + if (sq) + busy |= mlx5e_poll_tx_cq(&sq->cq, budget); + } + } + + busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq); + + if (c->xdp) + busy |= mlx5e_poll_xdpsq_cq(&c->rq_xdpsq.cq); + + if (likely(budget)) { /* budget=0 means: don't poll rx rings */ + if (xsk_open) + work_done = mlx5e_poll_rx_cq(&xskrq->cq, budget); + + if (likely(budget - work_done)) + work_done += mlx5e_poll_rx_cq(&rq->cq, budget - work_done); + + busy |= work_done == budget; + } + + mlx5e_poll_ico_cq(&c->icosq.cq); + if (mlx5e_poll_ico_cq(&c->async_icosq.cq)) + /* Don't clear the flag if nothing was polled to prevent + * queueing more WQEs and overflowing the async ICOSQ. + */ + clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state); + + /* Keep after async ICOSQ CQ poll */ + if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget))) + busy |= mlx5e_ktls_rx_handle_resync_list(c, budget); + + busy |= INDIRECT_CALL_2(rq->post_wqes, + mlx5e_post_rx_mpwqes, + mlx5e_post_rx_wqes, + rq); + if (xsk_open) { + busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq); + busy_xsk |= mlx5e_napi_xsk_post(xsksq, xskrq); + } + + busy |= busy_xsk; + + if (busy) { + if (likely(mlx5e_channel_no_affinity_change(c))) { + work_done = budget; + goto out; + } + ch_stats->aff_change++; + aff_change = true; + if (budget && work_done == budget) + work_done--; + } + + if (unlikely(!napi_complete_done(napi, work_done))) + goto out; + + ch_stats->arm++; + + for (i = 0; i < c->num_tc; i++) { + mlx5e_handle_tx_dim(&c->sq[i]); + mlx5e_cq_arm(&c->sq[i].cq); + } + if (unlikely(qos_sqs)) { + for (i = 0; i < qos_sqs_size; i++) { + struct mlx5e_txqsq *sq = rcu_dereference(qos_sqs[i]); + + if (sq) { + mlx5e_handle_tx_dim(sq); + mlx5e_cq_arm(&sq->cq); + } + } + } + + mlx5e_rx_dim_cq_rearm(c->priv, rq); + mlx5e_cq_arm(&c->icosq.cq); + mlx5e_cq_arm(&c->async_icosq.cq); + mlx5e_cq_arm(&c->xdpsq.cq); + + if (xsk_open) { + mlx5e_rx_dim_cq_rearm(c->priv, xskrq); + mlx5e_cq_arm(&xsksq->cq); + } + + if (unlikely(aff_change && busy_xsk)) { + mlx5e_trigger_irq(&c->icosq); + ch_stats->force_irq++; + } + +out: + rcu_read_unlock(); + + return work_done; +} + +void mlx5e_completion_event(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) +{ + struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq); + + napi_schedule(cq->napi); + cq->event_ctr++; + cq->ch_stats->events++; +} + +void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event) +{ + struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq); + struct net_device *netdev = cq->netdev; + + netdev_err(netdev, "%s: cqn=0x%.6x event=0x%.2x\n", + __func__, mcq->cqn, event); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eq.c new file mode 100644 index 0000000..2ae701e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -0,0 +1,1256 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2013-2021, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_RFS_ACCEL +#include +#endif +#include "mlx5_core.h" +#include "lib/eq.h" +#include "fpga/core.h" +#include "eswitch.h" +#include "lib/clock.h" +#include "diag/fw_tracer.h" +#include "mlx5_irq.h" +#include "devlink.h" +#include "mlx5_devm.h" + +enum { + MLX5_EQE_OWNER_INIT_VAL = 0x1, +}; + +enum { + MLX5_EQ_STATE_ARMED = 0x9, + MLX5_EQ_STATE_FIRED = 0xa, + MLX5_EQ_STATE_ALWAYS_ARMED = 0xb, +}; + +enum { + MLX5_EQ_DOORBEL_OFFSET = 0x40, +}; + +/* budget must be smaller than MLX5_NUM_SPARE_EQE to guarantee that we update + * the ci before we polled all the entries in the EQ. MLX5_NUM_SPARE_EQE is + * used to set the EQ size, budget must be smaller than the EQ size. + */ +enum { + MLX5_EQ_POLLING_BUDGET = 128, +}; + +static_assert(MLX5_EQ_POLLING_BUDGET <= MLX5_NUM_SPARE_EQE); + +struct mlx5_eq_table { + struct list_head comp_eqs_list; + struct mlx5_eq_async pages_eq; + struct mlx5_eq_async cmd_eq; + struct mlx5_eq_async async_eq; + + struct atomic_notifier_head nh[MLX5_EVENT_TYPE_MAX]; + + /* Since CQ DB is stored in async_eq */ + struct mlx5_nb cq_err_nb; + + struct mutex lock; /* sync async eqs creations */ + int num_comp_eqs; + struct mlx5_irq_table *irq_table; + struct mlx5_irq **comp_irqs; + struct mlx5_irq *ctrl_irq; +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap; +#endif +}; + +#define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG) | \ + (1ull << MLX5_EVENT_TYPE_COMM_EST) | \ + (1ull << MLX5_EVENT_TYPE_SQ_DRAINED) | \ + (1ull << MLX5_EVENT_TYPE_CQ_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_PATH_MIG_FAILED) | \ + (1ull << MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_WQ_ACCESS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_PORT_CHANGE) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_CATAS_ERROR) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE) | \ + (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT)) + +static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_eq_in)] = {}; + + MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ); + MLX5_SET(destroy_eq_in, in, eq_number, eqn); + return mlx5_cmd_exec_in(dev, destroy_eq, in); +} + +/* caller must eventually call mlx5_cq_put on the returned cq */ +static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn) +{ + struct mlx5_cq_table *table = &eq->cq_table; + struct mlx5_core_cq *cq = NULL; + + rcu_read_lock(); + cq = radix_tree_lookup(&table->tree, cqn); + if (likely(cq)) + mlx5_cq_hold(cq); + rcu_read_unlock(); + + return cq; +} + +static int mlx5_eq_comp_int(struct notifier_block *nb, + __always_unused unsigned long action, + __always_unused void *data) +{ + struct mlx5_eq_comp *eq_comp = + container_of(nb, struct mlx5_eq_comp, irq_nb); + struct mlx5_eq *eq = &eq_comp->core; + struct mlx5_eqe *eqe; + int num_eqes = 0; + u32 cqn = -1; + + eqe = next_eqe_sw(eq); + if (!eqe) + goto out; + + do { + struct mlx5_core_cq *cq; + + /* Make sure we read EQ entry contents after we've + * checked the ownership bit. 
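+		 * The device flips the owner bit only after the rest of the EQE
+		 * is written, so the barrier prevents reading a stale payload.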
+ */ + dma_rmb(); + /* Assume (eqe->type) is always MLX5_EVENT_TYPE_COMP */ + cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff; + + cq = mlx5_eq_cq_get(eq, cqn); + if (likely(cq)) { + ++cq->arm_sn; + cq->comp(cq, eqe); + mlx5_cq_put(cq); + } else { + dev_dbg_ratelimited(eq->dev->device, + "Completion event for bogus CQ 0x%x\n", cqn); + } + + ++eq->cons_index; + + } while ((++num_eqes < MLX5_EQ_POLLING_BUDGET) && (eqe = next_eqe_sw(eq))); + +out: + eq_update_ci(eq, 1); + + if (cqn != -1) + tasklet_schedule(&eq_comp->tasklet_ctx.task); + + return 0; +} + +/* Some architectures don't latch interrupts when they are disabled, so using + * mlx5_eq_poll_irq_disabled could end up losing interrupts while trying to + * avoid losing them. It is not recommended to use it, unless this is the last + * resort. + */ +u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq) +{ + u32 count_eqe; + + disable_irq(eq->core.irqn); + count_eqe = eq->core.cons_index; + mlx5_eq_comp_int(&eq->irq_nb, 0, NULL); + count_eqe = eq->core.cons_index - count_eqe; + enable_irq(eq->core.irqn); + + return count_eqe; +} + +static void mlx5_eq_async_int_lock(struct mlx5_eq_async *eq, bool recovery, + unsigned long *flags) + __acquires(&eq->lock) +{ + if (!recovery) + spin_lock(&eq->lock); + else + spin_lock_irqsave(&eq->lock, *flags); +} + +static void mlx5_eq_async_int_unlock(struct mlx5_eq_async *eq, bool recovery, + unsigned long *flags) + __releases(&eq->lock) +{ + if (!recovery) + spin_unlock(&eq->lock); + else + spin_unlock_irqrestore(&eq->lock, *flags); +} + +enum async_eq_nb_action { + ASYNC_EQ_IRQ_HANDLER = 0, + ASYNC_EQ_RECOVER = 1, +}; + +static int mlx5_eq_async_int(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct mlx5_eq_async *eq_async = + container_of(nb, struct mlx5_eq_async, irq_nb); + struct mlx5_eq *eq = &eq_async->core; + struct mlx5_eq_table *eqt; + struct mlx5_core_dev *dev; + struct mlx5_eqe *eqe; + unsigned long flags; + int num_eqes = 0; + bool recovery; + + dev = eq->dev; + eqt = dev->priv.eq_table; + + recovery = action == ASYNC_EQ_RECOVER; + mlx5_eq_async_int_lock(eq_async, recovery, &flags); + + eqe = next_eqe_sw(eq); + if (!eqe) + goto out; + + do { + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ + dma_rmb(); + + atomic_notifier_call_chain(&eqt->nh[eqe->type], eqe->type, eqe); + atomic_notifier_call_chain(&eqt->nh[MLX5_EVENT_TYPE_NOTIFY_ANY], eqe->type, eqe); + + ++eq->cons_index; + + } while ((++num_eqes < MLX5_EQ_POLLING_BUDGET) && (eqe = next_eqe_sw(eq))); + +out: + eq_update_ci(eq, 1); + mlx5_eq_async_int_unlock(eq_async, recovery, &flags); + + return unlikely(recovery) ? 
num_eqes : 0; +} + +void mlx5_cmd_eq_recover(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_async *eq = &dev->priv.eq_table->cmd_eq; + int eqes; + + eqes = mlx5_eq_async_int(&eq->irq_nb, ASYNC_EQ_RECOVER, NULL); + if (eqes) + mlx5_core_warn(dev, "Recovered %d EQEs on cmd_eq\n", eqes); +} + +static void init_eq_buf(struct mlx5_eq *eq) +{ + struct mlx5_eqe *eqe; + int i; + + for (i = 0; i < eq_get_size(eq); i++) { + eqe = get_eqe(eq, i); + eqe->owner = MLX5_EQE_OWNER_INIT_VAL; + } +} + +static void clear_rmap(struct mlx5_core_dev *dev) +{ +#ifdef CONFIG_RFS_ACCEL + struct mlx5_eq_table *eq_table = dev->priv.eq_table; + + free_irq_cpu_rmap(eq_table->rmap); + eq_table->rmap = NULL; +#endif +} + +static int +create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + struct mlx5_eq_param *param) +{ + u8 log_eq_size = order_base_2(param->nent + MLX5_NUM_SPARE_EQE); + struct mlx5_cq_table *cq_table = &eq->cq_table; + u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0}; + u8 log_eq_stride = ilog2(MLX5_EQE_SIZE); + struct mlx5_priv *priv = &dev->priv; + __be64 *pas; + u16 vecidx; + void *eqc; + int inlen; + u32 *in; + int err; + int i; + + /* Init CQ table */ + memset(cq_table, 0, sizeof(*cq_table)); + spin_lock_init(&cq_table->lock); + INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); + + eq->cons_index = 0; + + err = mlx5_frag_buf_alloc_node(dev, wq_get_byte_sz(log_eq_size, log_eq_stride), + &eq->frag_buf, dev->priv.numa_node); + if (err) + return err; + + mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc); + init_eq_buf(eq); + + eq->irq = param->irq; + vecidx = mlx5_irq_get_index(eq->irq); + + inlen = MLX5_ST_SZ_BYTES(create_eq_in) + + MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->frag_buf.npages; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_buf; + } + + pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas); + mlx5_fill_page_frag_array(&eq->frag_buf, pas); + + MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ); + if (!param->mask[0] && MLX5_CAP_GEN(dev, log_max_uctx)) + MLX5_SET(create_eq_in, in, uid, MLX5_SHARED_RESOURCE_UID); + + for (i = 0; i < 4; i++) + MLX5_ARRAY_SET64(create_eq_in, in, event_bitmask, i, + param->mask[i]); + + eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); + MLX5_SET(eqc, eqc, log_eq_size, eq->fbc.log_sz); + MLX5_SET(eqc, eqc, uar_page, priv->uar->index); + MLX5_SET(eqc, eqc, intr, vecidx); + MLX5_SET(eqc, eqc, log_page_size, + eq->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (err) + goto err_in; + + eq->vecidx = vecidx; + eq->eqn = MLX5_GET(create_eq_out, out, eq_number); + eq->irqn = pci_irq_vector(dev->pdev, vecidx); + eq->dev = dev; + eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; + + err = mlx5_debug_eq_add(dev, eq); + if (err) + goto err_eq; + + kvfree(in); + return 0; + +err_eq: + mlx5_cmd_destroy_eq(dev, eq->eqn); + +err_in: + kvfree(in); + +err_buf: + mlx5_frag_buf_free(dev, &eq->frag_buf); + return err; +} + +/** + * mlx5_eq_enable - Enable EQ for receiving EQEs + * @dev : Device which owns the eq + * @eq : EQ to enable + * @nb : Notifier call block + * + * Must be called after EQ is created in device. 
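+ * EQEs are delivered only once the notifier block is attached to the
+ * EQ's IRQ and the EQ consumer index is re-armed.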
+ * + * @return: 0 if no error + */ +int mlx5_eq_enable(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + struct notifier_block *nb) +{ + int err; + + err = mlx5_irq_attach_nb(eq->irq, nb); + if (!err) + eq_update_ci(eq, 1); + + return err; +} +EXPORT_SYMBOL(mlx5_eq_enable); + +/** + * mlx5_eq_disable - Disable EQ for receiving EQEs + * @dev : Device which owns the eq + * @eq : EQ to disable + * @nb : Notifier call block + * + * Must be called before EQ is destroyed. + */ +void mlx5_eq_disable(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + struct notifier_block *nb) +{ + mlx5_irq_detach_nb(eq->irq, nb); +} +EXPORT_SYMBOL(mlx5_eq_disable); + +static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, + bool reentry) +{ + int err; + + mlx5_debug_eq_remove(dev, eq); + + err = mlx5_cmd_destroy_eq(dev, eq->eqn); + if (err) { + mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n", + eq->eqn); + if (!reentry) + return err; + } + + mlx5_frag_buf_free(dev, &eq->frag_buf); + return err; +} + +int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq) +{ + struct mlx5_cq_table *table = &eq->cq_table; + int err; + + spin_lock(&table->lock); + err = radix_tree_insert(&table->tree, cq->cqn, cq); + spin_unlock(&table->lock); + + return err; +} + +void mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq) +{ + struct mlx5_cq_table *table = &eq->cq_table; + struct mlx5_core_cq *tmp; + + spin_lock(&table->lock); + tmp = radix_tree_delete(&table->tree, cq->cqn); + spin_unlock(&table->lock); + + if (!tmp) { + mlx5_core_dbg(eq->dev, "cq 0x%x not found in eq 0x%x tree\n", + eq->eqn, cq->cqn); + return; + } + + if (tmp != cq) + mlx5_core_dbg(eq->dev, "corruption on cqn 0x%x in eq 0x%x\n", + eq->eqn, cq->cqn); +} + +int mlx5_eq_table_init(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *eq_table; + int i; + + eq_table = kvzalloc(sizeof(*eq_table), GFP_KERNEL); + if (!eq_table) + return -ENOMEM; + + dev->priv.eq_table = eq_table; + + mlx5_eq_debugfs_init(dev); + + mutex_init(&eq_table->lock); + for (i = 0; i < MLX5_EVENT_TYPE_MAX; i++) + ATOMIC_INIT_NOTIFIER_HEAD(&eq_table->nh[i]); + + eq_table->irq_table = mlx5_irq_table_get(dev); + return 0; +} + +void mlx5_eq_table_cleanup(struct mlx5_core_dev *dev) +{ + mlx5_eq_debugfs_cleanup(dev); + kvfree(dev->priv.eq_table); +} + +/* Async EQs */ + +static int create_async_eq(struct mlx5_core_dev *dev, + struct mlx5_eq *eq, struct mlx5_eq_param *param) +{ + struct mlx5_eq_table *eq_table = dev->priv.eq_table; + int err; + + mutex_lock(&eq_table->lock); + err = create_map_eq(dev, eq, param); + mutex_unlock(&eq_table->lock); + return err; +} + +int mlx5_vector2eq(struct mlx5_core_dev *dev, int vector, struct mlx5_eq_comp *eqc) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq, *n; + int err = -ENOENT; + int i = 0; + + mutex_lock(&table->lock); + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { + if (i++ == vector) { + *eqc = *eq; + err = 0; + break; + } + } + mutex_unlock(&table->lock); + + return err; +} + +void mlx5_rename_comp_eq(struct mlx5_core_dev *dev, unsigned int eq_ix, + char *name) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq, *n; + int err = -ENOENT; + int i = 0; + + if (mlx5_core_is_sf(dev)) + return; + + mutex_lock(&table->lock); + if (eq_ix >= table->num_comp_eqs) { + dev_err(&dev->pdev->dev, "%s: failed: %d\n", + __func__, err); + goto unlock; + } + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) + if (i++ == 
eq_ix) + break; + mlx5_irq_rename(dev, eq->core.irq, name); +unlock: + mutex_unlock(&table->lock); +} + +static int destroy_async_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + struct mlx5_eq_table *eq_table = dev->priv.eq_table; + int err; + + mutex_lock(&eq_table->lock); + err = destroy_unmap_eq(dev, eq, true); + mutex_unlock(&eq_table->lock); + return err; +} + +static int cq_err_event_notifier(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_eq_table *eqt; + struct mlx5_core_cq *cq; + struct mlx5_eqe *eqe; + struct mlx5_eq *eq; + u32 cqn; + + /* type == MLX5_EVENT_TYPE_CQ_ERROR */ + + eqt = mlx5_nb_cof(nb, struct mlx5_eq_table, cq_err_nb); + eq = &eqt->async_eq.core; + eqe = data; + + cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff; + mlx5_core_warn(eq->dev, "CQ error on CQN 0x%x, syndrome 0x%x\n", + cqn, eqe->data.cq_err.syndrome); + + cq = mlx5_eq_cq_get(eq, cqn); + if (unlikely(!cq)) { + mlx5_core_warn(eq->dev, "Async event for bogus CQ 0x%x\n", cqn); + return NOTIFY_OK; + } + + if (cq->event) + cq->event(cq, type); + + mlx5_cq_put(cq); + + return NOTIFY_OK; +} + +static void gather_user_async_events(struct mlx5_core_dev *dev, u64 mask[4]) +{ + __be64 *user_unaffiliated_events; + __be64 *user_affiliated_events; + int i; + + user_affiliated_events = + MLX5_CAP_DEV_EVENT(dev, user_affiliated_events); + user_unaffiliated_events = + MLX5_CAP_DEV_EVENT(dev, user_unaffiliated_events); + + for (i = 0; i < 4; i++) + mask[i] |= be64_to_cpu(user_affiliated_events[i] | + user_unaffiliated_events[i]); +} + +static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4]) +{ + u64 async_event_mask = MLX5_ASYNC_EVENT_MASK; + + if (MLX5_VPORT_MANAGER(dev)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE); + + if (MLX5_CAP_GEN(dev, general_notification_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_GENERAL_EVENT); + + if (MLX5_CAP_GEN(dev, port_module_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_PORT_MODULE_EVENT); + else + mlx5_core_dbg(dev, "port_module_event is not set\n"); + + if (MLX5_PPS_CAP(dev)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT); + + if (MLX5_CAP_GEN(dev, fpga)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR) | + (1ull << MLX5_EVENT_TYPE_FPGA_QP_ERROR); + + if (MLX5_CAP_GEN(dev, nvmf_target_offload)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_XRQ_ERROR); + + if (MLX5_CAP_GEN_MAX(dev, dct)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED); + + if (MLX5_CAP_GEN(dev, temp_warn_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT); + + if (MLX5_CAP_MCAM_REG(dev, tracer_registers)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_DEVICE_TRACER); + + if (MLX5_CAP_GEN(dev, max_num_of_monitor_counters)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_MONITOR_COUNTER); + + if (mlx5_eswitch_is_funcs_handler(dev)) + async_event_mask |= + (1ull << MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED); + + if (MLX5_CAP_GEN_MAX(dev, vhca_state)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_VHCA_STATE_CHANGE); + + if (MLX5_CAP_IPSEC(dev, ipsec_full_offload)) + async_event_mask |= + (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE_EVENT); + + mask[0] = async_event_mask; + + if (MLX5_CAP_GEN(dev, event_cap)) + gather_user_async_events(dev, mask); +} + +static int +setup_async_eq(struct mlx5_core_dev *dev, struct mlx5_eq_async *eq, + struct mlx5_eq_param *param, const char *name) +{ + int err; + + eq->irq_nb.notifier_call = mlx5_eq_async_int; + spin_lock_init(&eq->lock); + + err = 
create_async_eq(dev, &eq->core, param); + if (err) { + mlx5_core_warn(dev, "failed to create %s EQ %d\n", name, err); + return err; + } + err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb); + if (err) { + mlx5_core_warn(dev, "failed to enable %s EQ %d\n", name, err); + destroy_async_eq(dev, &eq->core); + } + return err; +} + +static void cleanup_async_eq(struct mlx5_core_dev *dev, + struct mlx5_eq_async *eq, const char *name) +{ + int err; + + mlx5_eq_disable(dev, &eq->core, &eq->irq_nb); + err = destroy_async_eq(dev, &eq->core); + if (err) + mlx5_core_err(dev, "failed to destroy %s eq, err(%d)\n", + name, err); +} + +static u16 async_eq_depth_devlink_param_get(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + &val); + if (!err) + return val.vu32; + mlx5_core_dbg(dev, "Failed to get param. using default. err = %d\n", err); + return MLX5_NUM_ASYNC_EQE; +} +static int create_async_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_param param = {}; + int err; + + /* All the async_eqs are using single IRQ, request one IRQ and share its + * index among all the async_eqs of this device. + */ + table->ctrl_irq = mlx5_ctrl_irq_request(dev); + if (IS_ERR(table->ctrl_irq)) + return PTR_ERR(table->ctrl_irq); + + MLX5_NB_INIT(&table->cq_err_nb, cq_err_event_notifier, CQ_ERROR); + mlx5_eq_notifier_register(dev, &table->cq_err_nb); + + param = (struct mlx5_eq_param) { + .irq = table->ctrl_irq, + .nent = MLX5_NUM_CMD_EQE, + .mask[0] = 1ull << MLX5_EVENT_TYPE_CMD, + }; + mlx5_cmd_allowed_opcode(dev, MLX5_CMD_OP_CREATE_EQ); + err = setup_async_eq(dev, &table->cmd_eq, ¶m, "cmd"); + if (err) + goto err1; + + mlx5_cmd_use_events(dev); + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); + + param = (struct mlx5_eq_param) { + .irq = table->ctrl_irq, + .nent = async_eq_depth_devlink_param_get(dev), + }; + + if (mlx5_core_is_sf(dev) && dev->async_eq_depth) + param.nent = dev->async_eq_depth; + + gather_async_events_mask(dev, param.mask); + err = setup_async_eq(dev, &table->async_eq, ¶m, "async"); + if (err) + goto err2; + + param = (struct mlx5_eq_param) { + .irq = table->ctrl_irq, + .nent = /* TODO: sriov max_vf + */ 1, + .mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST, + }; + + err = setup_async_eq(dev, &table->pages_eq, ¶m, "pages"); + if (err) + goto err3; + + return 0; + +err3: + cleanup_async_eq(dev, &table->async_eq, "async"); +err2: + mlx5_cmd_use_polling(dev); + cleanup_async_eq(dev, &table->cmd_eq, "cmd"); +err1: + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); + mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); + mlx5_ctrl_irq_release(table->ctrl_irq); + return err; +} + +static void destroy_async_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + + cleanup_async_eq(dev, &table->pages_eq, "pages"); + cleanup_async_eq(dev, &table->async_eq, "async"); + mlx5_cmd_allowed_opcode(dev, MLX5_CMD_OP_DESTROY_EQ); + mlx5_cmd_use_polling(dev); + cleanup_async_eq(dev, &table->cmd_eq, "cmd"); + mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); + mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); + mlx5_ctrl_irq_release(table->ctrl_irq); +} + +struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev) +{ + return &dev->priv.eq_table->async_eq.core; +} + +void mlx5_eq_synchronize_async_irq(struct mlx5_core_dev *dev) +{ + 
synchronize_irq(dev->priv.eq_table->async_eq.core.irqn); +} + +void mlx5_eq_synchronize_cmd_irq(struct mlx5_core_dev *dev) +{ + synchronize_irq(dev->priv.eq_table->cmd_eq.core.irqn); +} + +/* Generic EQ API for mlx5_core consumers + * Needed For RDMA ODP EQ for now + */ +struct mlx5_eq * +mlx5_eq_create_generic(struct mlx5_core_dev *dev, + struct mlx5_eq_param *param) +{ + struct mlx5_eq *eq = kvzalloc(sizeof(*eq), GFP_KERNEL); + int err; + + if (!eq) + return ERR_PTR(-ENOMEM); + + param->irq = dev->priv.eq_table->ctrl_irq; + err = create_async_eq(dev, eq, param); + if (err) { + kvfree(eq); + eq = ERR_PTR(err); + } + + return eq; +} +EXPORT_SYMBOL(mlx5_eq_create_generic); + +int mlx5_eq_destroy_generic(struct mlx5_core_dev *dev, struct mlx5_eq *eq) +{ + struct mlx5_eq_table *eq_table = dev->priv.eq_table; + int err; + + if (IS_ERR(eq)) + return -EINVAL; + + mutex_lock(&eq_table->lock); + err = destroy_unmap_eq(dev, eq, false); + mutex_unlock(&eq_table->lock); + if (err) + goto out; + + kvfree(eq); +out: + return err; +} +EXPORT_SYMBOL(mlx5_eq_destroy_generic); + +struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc) +{ + u32 ci = eq->cons_index + cc; + u32 nent = eq_get_size(eq); + struct mlx5_eqe *eqe; + + eqe = get_eqe(eq, ci & (nent - 1)); + eqe = ((eqe->owner & 1) ^ !!(ci & nent)) ? NULL : eqe; + /* Make sure we read EQ entry contents after we've + * checked the ownership bit. + */ + if (eqe) + dma_rmb(); + + return eqe; +} +EXPORT_SYMBOL(mlx5_eq_get_eqe); + +void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm) +{ + __be32 __iomem *addr = eq->doorbell + (arm ? 0 : 2); + u32 val; + + eq->cons_index += cc; + val = (eq->cons_index & 0xffffff) | (eq->eqn << 24); + + __raw_writel((__force u32)cpu_to_be32(val), addr); + /* We still want ordering, just not swabbing, so add a barrier */ + wmb(); +} +EXPORT_SYMBOL(mlx5_eq_update_ci); + +static int comp_irqs_request_by_cpu_affinity(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + cpumask_var_t user_mask; + int ret; + + if (!zalloc_cpumask_var(&user_mask, GFP_KERNEL)) + return -ENOMEM; + + ret = mlx5_devm_affinity_get_param(dev, user_mask); + if (ret) + goto out; + + ret = mlx5_irqs_request_mask(dev, table->comp_irqs, user_mask); +out: + free_cpumask_var(user_mask); + return ret; +} + +static void comp_irqs_release(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + + if (mlx5_core_is_sf(dev)) + mlx5_irq_affinity_irqs_release(dev, table->comp_irqs, table->num_comp_eqs); + else + mlx5_irqs_release_vectors(table->comp_irqs, table->num_comp_eqs); + kfree(table->comp_irqs); +} + +static int comp_irqs_request(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + int ncomp_eqs = table->num_comp_eqs; + u16 *cpus; + int ret; + int i; + + ncomp_eqs = table->num_comp_eqs; + table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL); + if (!table->comp_irqs) + return -ENOMEM; + + ret = comp_irqs_request_by_cpu_affinity(dev); + if (ret > 0) + return ret; + mlx5_core_dbg(dev, "failed to get param cpu_affinity. 
use default policy\n"); + if (mlx5_core_is_sf(dev)) { + ret = mlx5_irq_affinity_irqs_request_auto(dev, ncomp_eqs, table->comp_irqs); + if (ret < 0) + goto free_irqs; + return ret; + } + + cpus = kcalloc(ncomp_eqs, sizeof(*cpus), GFP_KERNEL); + if (!cpus) { + ret = -ENOMEM; + goto free_irqs; + } + for (i = 0; i < ncomp_eqs; i++) + cpus[i] = cpumask_local_spread(i, dev->priv.numa_node); + ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs); + kfree(cpus); + if (ret < 0) + goto free_irqs; + return ret; + +free_irqs: + kfree(table->comp_irqs); + return ret; +} + +static void destroy_comp_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq, *n; + + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { + list_del(&eq->list); + mlx5_eq_disable(dev, &eq->core, &eq->irq_nb); + if (destroy_unmap_eq(dev, &eq->core, true)) + mlx5_core_warn(dev, "failed to destroy comp EQ 0x%x\n", + eq->core.eqn); + tasklet_disable(&eq->tasklet_ctx.task); + kfree(eq); + } + comp_irqs_release(dev); +} + +static u16 comp_eq_depth_devlink_param_get(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, + &val); + if (!err) + return val.vu32; + mlx5_core_dbg(dev, "Failed to get param. using default. err = %d\n", err); + return MLX5_COMP_EQ_SIZE; +} + +static int create_comp_eqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq; + int ncomp_eqs; + int nent; + int err; + int i; + + ncomp_eqs = comp_irqs_request(dev); + if (ncomp_eqs < 0) + return ncomp_eqs; + INIT_LIST_HEAD(&table->comp_eqs_list); + nent = comp_eq_depth_devlink_param_get(dev); + + /* if user specified completion eq depth, honor that */ + if (mlx5_core_is_sf(dev) && dev->cmpl_eq_depth) + nent = dev->cmpl_eq_depth; + + for (i = 0; i < ncomp_eqs; i++) { + struct mlx5_eq_param param = {}; + + eq = kzalloc(sizeof(*eq), GFP_KERNEL); + if (!eq) { + err = -ENOMEM; + goto clean; + } + + INIT_LIST_HEAD(&eq->tasklet_ctx.list); + INIT_LIST_HEAD(&eq->tasklet_ctx.process_list); + spin_lock_init(&eq->tasklet_ctx.lock); + tasklet_setup(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb); + + eq->irq_nb.notifier_call = mlx5_eq_comp_int; + param = (struct mlx5_eq_param) { + .irq = table->comp_irqs[i], + .nent = nent, + }; + + err = create_map_eq(dev, &eq->core, ¶m); + if (err) + goto clean_eq; + err = mlx5_eq_enable(dev, &eq->core, &eq->irq_nb); + if (err) { + destroy_unmap_eq(dev, &eq->core, true); + goto clean_eq; + } + + mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->core.eqn); + /* add tail, to keep the list ordered, for mlx5_vector2eqn to work */ + list_add_tail(&eq->list, &table->comp_eqs_list); + } + + table->num_comp_eqs = ncomp_eqs; + return 0; + +clean_eq: + kfree(eq); +clean: + if (!mlx5_core_is_sf(dev)) + clear_rmap(dev); + + destroy_comp_eqs(dev); + return err; +} + +static int vector2eqnirqn(struct mlx5_core_dev *dev, int vector, int *eqn, + unsigned int *irqn) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq, *n; + int err = -ENOENT; + int i = 0; + + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { + if (i++ == vector) { + if (irqn) + *irqn = eq->core.irqn; + if (eqn) + *eqn = eq->core.eqn; + err = 0; + break; + } + } + + return err; +} + +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn) +{ + 
return vector2eqnirqn(dev, vector, eqn, NULL); +} +EXPORT_SYMBOL(mlx5_vector2eqn); + +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn) +{ + return vector2eqnirqn(dev, vector, NULL, irqn); +} + +unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev) +{ + return dev->priv.eq_table->num_comp_eqs; +} +EXPORT_SYMBOL(mlx5_comp_vectors_count); + +struct cpumask * +mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq, *n; + int i = 0; + + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { + if (i++ == vector) + break; + } + + return mlx5_irq_get_affinity_mask(eq->core.irq); +} +EXPORT_SYMBOL(mlx5_comp_irq_get_affinity_mask); + +void mlx5_core_affinity_get(struct mlx5_core_dev *dev, struct cpumask *dev_mask) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq, *n; + + list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) + cpumask_or(dev_mask, dev_mask, + mlx5_irq_get_affinity_mask(eq->core.irq)); +} + +#ifdef CONFIG_RFS_ACCEL +struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev) +{ + return dev->priv.eq_table->rmap; +} +#endif + +struct mlx5_eq_comp *mlx5_eqn2comp_eq(struct mlx5_core_dev *dev, int eqn) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_eq_comp *eq; + + list_for_each_entry(eq, &table->comp_eqs_list, list) { + if (eq->core.eqn == eqn) + return eq; + } + + return ERR_PTR(-ENOENT); +} + +static int set_rmap(struct mlx5_core_dev *mdev) +{ + int err = 0; +#ifdef CONFIG_RFS_ACCEL + struct mlx5_eq_table *eq_table = mdev->priv.eq_table; + int vecidx; + + eq_table->rmap = alloc_irq_cpu_rmap(eq_table->num_comp_eqs); + if (!eq_table->rmap) { + err = -ENOMEM; + mlx5_core_err(mdev, "Failed to allocate cpu_rmap. err %d", err); + goto err_out; + } + + for (vecidx = 0; vecidx < eq_table->num_comp_eqs; vecidx++) { + err = irq_cpu_rmap_add(eq_table->rmap, + pci_irq_vector(mdev->pdev, vecidx)); + if (err) { + mlx5_core_err(mdev, "irq_cpu_rmap_add failed. err %d", + err); + goto err_irq_cpu_rmap_add; + } + } + return 0; + +err_irq_cpu_rmap_add: + clear_rmap(mdev); +err_out: +#endif + return err; +} + +/* This function should only be called after mlx5_cmd_force_teardown_hca */ +void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *table = dev->priv.eq_table; + + mutex_lock(&table->lock); /* sync with create/destroy_async_eq */ + if (!mlx5_core_is_sf(dev)) + clear_rmap(dev); + mlx5_irq_table_destroy(dev); + mutex_unlock(&table->lock); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +#define MLX5_MAX_ASYNC_EQS 4 +#else +#define MLX5_MAX_ASYNC_EQS 3 +#endif + +int mlx5_eq_table_create(struct mlx5_core_dev *dev) +{ + struct mlx5_eq_table *eq_table = dev->priv.eq_table; + int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ? 
+ MLX5_CAP_GEN(dev, max_num_eqs) : + 1 << MLX5_CAP_GEN(dev, log_max_eq); + int max_eqs_sf; + int err; + + eq_table->num_comp_eqs = + min_t(int, + mlx5_irq_table_get_num_comp(eq_table->irq_table), + num_eqs - MLX5_MAX_ASYNC_EQS); + if (mlx5_core_is_sf(dev)) { + max_eqs_sf = min_t(int, MLX5_COMP_EQS_PER_SF, + mlx5_irq_table_get_sfs_vec(eq_table->irq_table)); + eq_table->num_comp_eqs = min_t(int, eq_table->num_comp_eqs, + max_eqs_sf); + /* If user has setup non zero max completion EQs, honor that */ + if (dev->max_cmpl_eq_count) + eq_table->num_comp_eqs = min_t(int, dev->max_cmpl_eq_count, + eq_table->num_comp_eqs); + } + + err = create_async_eqs(dev); + if (err) { + mlx5_core_err(dev, "Failed to create async EQs\n"); + goto err_async_eqs; + } + + if (!mlx5_core_is_sf(dev)) { + /* rmap is a mapping between irq number and queue number. + * each irq can be assign only to a single rmap. + * since SFs share IRQs, rmap mapping cannot function correctly + * for irqs that are shared for different core/netdev RX rings. + * Hence we don't allow netdev rmap for SFs + */ + err = set_rmap(dev); + if (err) + goto err_rmap; + } + + err = create_comp_eqs(dev); + if (err) { + mlx5_core_err(dev, "Failed to create completion EQs\n"); + goto err_comp_eqs; + } + + return 0; +err_comp_eqs: + if (!mlx5_core_is_sf(dev)) + clear_rmap(dev); +err_rmap: + destroy_async_eqs(dev); +err_async_eqs: + return err; +} + +void mlx5_eq_table_destroy(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_sf(dev)) + clear_rmap(dev); + destroy_comp_eqs(dev); + destroy_async_eqs(dev); +} + +int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb) +{ + struct mlx5_eq_table *eqt = dev->priv.eq_table; + + return atomic_notifier_chain_register(&eqt->nh[nb->event_type], &nb->nb); +} +EXPORT_SYMBOL(mlx5_eq_notifier_register); + +int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb) +{ + struct mlx5_eq_table *eqt = dev->priv.eq_table; + + return atomic_notifier_chain_unregister(&eqt->nh[nb->event_type], &nb->nb); +} +EXPORT_SYMBOL(mlx5_eq_notifier_unregister); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/Makefile new file mode 100644 index 0000000..c78512e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +subdir-ccflags-y += -I$(src)/.. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c new file mode 100644 index 0000000..d477084 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "lgcy.h" + +static void esw_acl_egress_lgcy_rules_destroy(struct mlx5_vport *vport) +{ + struct mlx5_acl_vlan *trunk_vlan_rule, *tmp; + + esw_acl_egress_vlan_destroy(vport); + + list_for_each_entry_safe(trunk_vlan_rule, tmp, + &vport->egress.legacy.allow_vlans_rules, list) { + mlx5_del_flow_rules(trunk_vlan_rule->acl_vlan_rule); + list_del(&trunk_vlan_rule->list); + kfree(trunk_vlan_rule); + } + + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_rule)) { + mlx5_del_flow_rules(vport->egress.legacy.drop_rule); + vport->egress.legacy.drop_rule = NULL; + } + + if (!IS_ERR_OR_NULL(vport->egress.legacy.allow_untagged_rule)) { + mlx5_del_flow_rules(vport->egress.legacy.allow_untagged_rule); + vport->egress.legacy.allow_untagged_rule = NULL; + } +} + +static int esw_acl_egress_lgcy_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_group *untagged_grp; + struct mlx5_flow_group *drop_grp; + void *match_criteria; + u32 *flow_group_in; + int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + /* Create flow group for allowed untagged flow rule */ + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.svlan_tag); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + untagged_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(untagged_grp)) { + err = PTR_ERR(untagged_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] egress untagged flow group, err(%d)\n", + vport->vport, err); + goto untagged_grp_err; + } + + /* Create flow group for allowed tagged flow rules */ + err = esw_acl_egress_vlan_grp_create(esw, vport, 1, VLAN_N_VID); + if (err) { + esw_warn(dev, "Failed to create E-Switch vport[%d] egress tagged flow group, err(%d)\n", + vport->vport, err); + goto tagged_grp_err; + } + + /* Create flow group for drop rule */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, VLAN_N_VID + 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, VLAN_N_VID + 1); + drop_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(drop_grp)) { + err = PTR_ERR(drop_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] egress drop flow group, err(%d)\n", + vport->vport, err); + goto drop_grp_err; + } + + vport->egress.legacy.allow_untagged_grp = untagged_grp; + vport->egress.legacy.drop_grp = drop_grp; + kvfree(flow_group_in); + return 0; + +drop_grp_err: + esw_acl_egress_vlan_grp_destroy(vport); +tagged_grp_err: + if (!IS_ERR_OR_NULL(untagged_grp)) + mlx5_destroy_flow_group(untagged_grp); +untagged_grp_err: + kvfree(flow_group_in); + return err; +} + +static void esw_acl_egress_lgcy_groups_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_grp)) { + mlx5_destroy_flow_group(vport->egress.legacy.drop_grp); + vport->egress.legacy.drop_grp = NULL; + } + esw_acl_egress_vlan_grp_destroy(vport); + + if 
(!IS_ERR_OR_NULL(vport->egress.legacy.allow_untagged_grp)) { + mlx5_destroy_flow_group(vport->egress.legacy.allow_untagged_grp); + vport->egress.legacy.allow_untagged_grp = NULL; + } +} + +int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap, + VLAN_N_VID); + bool need_acl_table = vport->info.vlan || vport->info.qos || + need_vlan_filter; + enum esw_vst_mode vst_mode = esw_get_vst_mode(esw); + struct mlx5_acl_vlan *trunk_vlan_rule; + struct mlx5_flow_destination drop_ctr_dst = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_fc *drop_counter = NULL; + struct mlx5_flow_act flow_act = {}; + /* The egress acl table contains 3 groups: + * 1)Allow untagged traffic + * 2)Allow tagged traffic with vlan_tag=vst_vlan_id/vgt+_vlan_id + * 3)Drop all other traffic + */ + int table_size = VLAN_N_VID + 2; + struct mlx5_flow_spec *spec; + int dest_num = 0; + u16 vlan_id = 0; + int err = 0; + + esw_acl_egress_lgcy_rules_destroy(vport); + + esw_acl_egress_lgcy_cleanup(esw, vport); + if (!need_acl_table) + return 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + if (vport->egress.legacy.drop_counter) { + drop_counter = vport->egress.legacy.drop_counter; + } else if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, flow_counter)) { + drop_counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(drop_counter)) { + esw_warn(esw->dev, + "vport[%d] configure egress drop rule counter err(%ld)\n", + vport->vport, PTR_ERR(drop_counter)); + drop_counter = NULL; + } + vport->egress.legacy.drop_counter = drop_counter; + } + + if (!vport->egress.acl) { + vport->egress.acl = esw_acl_table_create(esw, vport, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, + 0, table_size); + + if (IS_ERR(vport->egress.acl)) { + err = PTR_ERR(vport->egress.acl); + vport->egress.acl = NULL; + goto out; + } + + err = esw_acl_egress_lgcy_groups_create(esw, vport); + if (err) + goto out; + } + + esw_debug(esw->dev, + "vport[%d] configure egress rules, vlan(%d) qos(%d)\n", + vport->vport, vport->info.vlan, vport->info.qos); + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.svlan_tag); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + /* Allow untagged */ + if (need_vlan_filter && test_bit(0, vport->info.vlan_trunk_8021q_bitmap)) { + vport->egress.legacy.allow_untagged_rule = + mlx5_add_flow_rules(vport->egress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->egress.legacy.allow_untagged_rule)) { + err = PTR_ERR(vport->egress.legacy.allow_untagged_rule); + esw_warn(esw->dev, + "vport[%d] configure egress allow rule, err(%d)\n", + vport->vport, err); + vport->egress.legacy.allow_untagged_rule = NULL; + } + } + + /* VST rule */ + if (vport->info.vlan || vport->info.qos) { + int actions_flag = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + if (vst_mode == ESW_VST_MODE_STEERING) + actions_flag |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + err = esw_egress_acl_vlan_create(esw, vport, NULL, vport->info.vlan_proto, + vport->info.vlan, actions_flag); + if (err) + goto out; + } + + /* VGT+ rules */ + if (vport->info.vlan_proto == htons(ETH_P_8021Q)) + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); + else + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.svlan_tag); + MLX5_SET_TO_ONES(fte_match_param, 
spec->match_criteria, outer_headers.first_vid); + for_each_set_bit(vlan_id, vport->acl_vlan_8021q_bitmap, VLAN_N_VID) { + trunk_vlan_rule = kzalloc(sizeof(*trunk_vlan_rule), GFP_KERNEL); + if (!trunk_vlan_rule) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, + vlan_id); + trunk_vlan_rule->acl_vlan_rule = + mlx5_add_flow_rules(vport->egress.acl, spec, &flow_act, NULL, 0); + if (IS_ERR(trunk_vlan_rule->acl_vlan_rule)) { + err = PTR_ERR(trunk_vlan_rule->acl_vlan_rule); + esw_warn(esw->dev, + "vport[%d] configure egress allowed vlan rule failed, err(%d)\n", + vport->vport, err); + trunk_vlan_rule->acl_vlan_rule = NULL; + goto out; + } + list_add(&trunk_vlan_rule->list, &vport->egress.legacy.allow_vlans_rules); + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach egress drop flow counter */ + if (drop_counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter_id = mlx5_fc_id(drop_counter); + dst = &drop_ctr_dst; + dest_num++; + } + vport->egress.legacy.drop_rule = + mlx5_add_flow_rules(vport->egress.acl, NULL, + &flow_act, dst, dest_num); + if (IS_ERR(vport->egress.legacy.drop_rule)) { + err = PTR_ERR(vport->egress.legacy.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure egress drop rule failed, err(%d)\n", + vport->vport, err); + vport->egress.legacy.drop_rule = NULL; + goto out; + } + + kvfree(spec); + return err; + +out: + esw_acl_egress_lgcy_cleanup(esw, vport); + kvfree(spec); + return err; +} + +void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->egress.acl)) + goto clean_drop_counter; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport); + + esw_acl_egress_lgcy_rules_destroy(vport); + esw_acl_egress_lgcy_groups_destroy(vport); + esw_acl_egress_table_destroy(vport); + +clean_drop_counter: + if (vport->egress.legacy.drop_counter) { + mlx5_fc_destroy(esw->dev, vport->egress.legacy.drop_counter); + vport->egress.legacy.drop_counter = NULL; + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c new file mode 100644 index 0000000..ab18a8f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_ofld.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "ofld.h" +#include "esw/vf_meter.h" + +#define MLX5_ESW_EGRESS_ACL_DEFAULT_PRIO 4 + +static void esw_acl_egress_ofld_fwd2vport_destroy(struct mlx5_vport *vport) +{ + if (!vport->egress.offloads.fwd_rule) + return; + + mlx5_del_flow_rules(vport->egress.offloads.fwd_rule); + vport->egress.offloads.fwd_rule = NULL; +} + +static void esw_acl_egress_ofld_bounce_rule_destroy(struct mlx5_vport *vport) +{ + if (!vport->egress.offloads.bounce_rule) + return; + + mlx5_del_flow_rules(vport->egress.offloads.bounce_rule); + vport->egress.offloads.bounce_rule = NULL; +} + +static int esw_acl_egress_ofld_fwd2vport_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest) +{ + struct mlx5_flow_act flow_act = {}; + int err = 0; + + esw_debug(esw->dev, "vport(%d) configure egress acl rule fwd2vport(%d)\n", + vport->vport, fwd_dest->vport.num); + + /* Delete the old egress forward-to-vport rule if any */ + esw_acl_egress_ofld_fwd2vport_destroy(vport); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + vport->egress.offloads.fwd_rule = + mlx5_add_flow_rules(vport->egress.acl, NULL, + &flow_act, fwd_dest, 1); + if (IS_ERR(vport->egress.offloads.fwd_rule)) { + err = PTR_ERR(vport->egress.offloads.fwd_rule); + esw_warn(esw->dev, + "vport(%d) failed to add fwd2vport acl rule err(%d)\n", + vport->vport, err); + vport->egress.offloads.fwd_rule = NULL; + } + + return err; +} + +static int esw_acl_egress_ofld_rules_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest) +{ + int err = 0; + int action; + + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) { + /* For prio tag mode, there is only 1 FTEs: + * 1) prio tag packets - pop the prio tag VLAN, allow + * Unmatched traffic is allowed by default + */ + esw_debug(esw->dev, + "vport[%d] configure prio tag egress rules\n", vport->vport); + + action = MLX5_FLOW_CONTEXT_ACTION_VLAN_POP; + action |= fwd_dest ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_ALLOW; + + /* prio tag vlan rule - pop it so vport receives untagged packets */ + err = esw_egress_acl_vlan_create(esw, vport, fwd_dest, vport->info.vlan_proto, 0, action); + if (err) + goto prio_err; + } + + if (fwd_dest) { + err = esw_acl_egress_ofld_fwd2vport_create(esw, vport, fwd_dest); + if (err) + goto fwd_err; + } + + return 0; + +fwd_err: + esw_acl_egress_vlan_destroy(vport); +prio_err: + return err; +} + +static void esw_acl_egress_ofld_rules_destroy(struct mlx5_vport *vport) +{ + esw_acl_egress_vlan_destroy(vport); + esw_acl_egress_ofld_fwd2vport_destroy(vport); + esw_acl_egress_ofld_bounce_rule_destroy(vport); +} + +static int esw_acl_egress_ofld_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fwd_grp; + u32 *flow_group_in; + u32 flow_index = 0; + int ret = 0; + + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) { + ret = esw_acl_egress_vlan_grp_create(esw, vport, 0, 0); + if (ret) + return ret; + + flow_index++; + } + + if (!mlx5_esw_acl_egress_fwd2vport_supported(esw)) + goto out; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + ret = -ENOMEM; + goto fwd_grp_err; + } + + /* This group holds 1 FTE to forward all packets to other vport + * when bond vports is supported. 
+ */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + fwd_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(fwd_grp)) { + ret = PTR_ERR(fwd_grp); + esw_warn(esw->dev, + "Failed to create vport[%d] egress fwd2vport flow group, err(%d)\n", + vport->vport, ret); + kvfree(flow_group_in); + goto fwd_grp_err; + } + vport->egress.offloads.fwd_grp = fwd_grp; + kvfree(flow_group_in); + return 0; + +fwd_grp_err: + esw_acl_egress_vlan_grp_destroy(vport); +out: + return ret; +} + +static void esw_acl_egress_ofld_groups_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.offloads.fwd_grp)) { + mlx5_destroy_flow_group(vport->egress.offloads.fwd_grp); + vport->egress.offloads.fwd_grp = NULL; + } + + if (!IS_ERR_OR_NULL(vport->egress.offloads.bounce_grp)) { + mlx5_destroy_flow_group(vport->egress.offloads.bounce_grp); + vport->egress.offloads.bounce_grp = NULL; + } + + esw_acl_egress_vlan_grp_destroy(vport); +} + +static bool esw_acl_egress_needed(struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_eswitch_is_vf_vport(esw, vport_num) || mlx5_esw_is_sf_vport(esw, vport_num); +} + +int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + int table_size = 0; + int err; + + if (!mlx5_esw_acl_egress_fwd2vport_supported(esw) && + !MLX5_CAP_GEN(esw->dev, prio_tag_required)) + return 0; + + if (!esw_acl_egress_needed(esw, vport->vport)) + return 0; + + esw_acl_egress_ofld_rules_destroy(vport); + + if (mlx5_esw_acl_egress_fwd2vport_supported(esw)) + table_size++; + if (MLX5_CAP_GEN(esw->dev, prio_tag_required)) + table_size++; + + vport->egress.acl = esw_acl_table_create(esw, vport, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, + MLX5_ESW_EGRESS_ACL_DEFAULT_PRIO, + table_size); + if (IS_ERR(vport->egress.acl)) { + err = PTR_ERR(vport->egress.acl); + vport->egress.acl = NULL; + return err; + } + + err = esw_acl_egress_ofld_groups_create(esw, vport); + if (err) + goto group_err; + + esw_debug(esw->dev, "vport[%d] configure egress rules\n", vport->vport); + + err = esw_acl_egress_ofld_rules_create(esw, vport, NULL); + if (err) + goto rules_err; + + return 0; + +rules_err: + esw_acl_egress_ofld_groups_destroy(vport); +group_err: + esw_acl_egress_table_destroy(vport); + return err; +} + +void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport) +{ + esw_vf_meter_egress_destroy(vport); + esw_acl_egress_ofld_rules_destroy(vport); + esw_acl_egress_ofld_groups_destroy(vport); + esw_acl_egress_table_destroy(vport); +} + +int mlx5_esw_acl_egress_vport_bond(struct mlx5_eswitch *esw, u16 active_vport_num, + u16 passive_vport_num) +{ + struct mlx5_vport *passive_vport = mlx5_eswitch_get_vport(esw, passive_vport_num); + struct mlx5_vport *active_vport = mlx5_eswitch_get_vport(esw, active_vport_num); + struct mlx5_flow_destination fwd_dest = {}; + + if (IS_ERR(active_vport)) + return PTR_ERR(active_vport); + if (IS_ERR(passive_vport)) + return PTR_ERR(passive_vport); + + /* Cleanup and recreate rules WITHOUT fwd2vport of active vport */ + esw_acl_egress_ofld_rules_destroy(active_vport); + esw_acl_egress_ofld_rules_create(esw, active_vport, NULL); + + /* Cleanup and recreate all rules + fwd2vport rule of passive vport to forward */ + esw_acl_egress_ofld_rules_destroy(passive_vport); + fwd_dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + fwd_dest.vport.num = active_vport_num; + fwd_dest.vport.vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + 
fwd_dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + + return esw_acl_egress_ofld_rules_create(esw, passive_vport, &fwd_dest); +} + +int mlx5_esw_acl_egress_vport_unbond(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (IS_ERR(vport)) + return PTR_ERR(vport); + + esw_acl_egress_ofld_rules_destroy(vport); + return esw_acl_egress_ofld_rules_create(esw, vport, NULL); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c new file mode 100644 index 0000000..b08957c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" + +struct mlx5_flow_table * +esw_acl_table_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int ns, + int prio, int size) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *acl; + int acl_supported; + u16 vport_num; + int err; + + acl_supported = (ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS) ? + MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support) : + MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support); + + if (!acl_supported) + return ERR_PTR(-EOPNOTSUPP); + + vport_num = vport->vport; + esw_debug(dev, "Create vport[%d] %s ACL table\n", vport_num, + ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? "ingress" : "egress"); + + root_ns = mlx5_get_flow_vport_acl_namespace(dev, ns, vport->index); + if (!root_ns) { + esw_warn(dev, "Failed to get E-Switch root namespace for vport (%d)\n", + vport_num); + return ERR_PTR(-EOPNOTSUPP); + } + + ft_attr.max_fte = size; + ft_attr.prio = prio; + ft_attr.flags = MLX5_FLOW_TABLE_OTHER_VPORT; + acl = mlx5_create_vport_flow_table(root_ns, &ft_attr, vport_num); + if (IS_ERR(acl)) { + err = PTR_ERR(acl); + esw_warn(dev, "vport[%d] create %s ACL table, err(%d)\n", vport_num, + ns == MLX5_FLOW_NAMESPACE_ESW_INGRESS ? 
"ingress" : "egress", err); + } + return acl; +} + +int esw_egress_acl_vlan_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest, + __be16 vlan_proto, u16 vlan_id, u32 flow_action) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + int err = 0; + + if (vport->egress.allowed_vlan) + return -EEXIST; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.svlan_tag); + if (vlan_proto == htons(ETH_P_8021Q)) + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); + else + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.svlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vlan_id); + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = flow_action; + vport->egress.allowed_vlan = + mlx5_add_flow_rules(vport->egress.acl, spec, + &flow_act, fwd_dest, 0); + if (IS_ERR(vport->egress.allowed_vlan)) { + err = PTR_ERR(vport->egress.allowed_vlan); + esw_warn(esw->dev, + "vport[%d] configure egress vlan rule failed, err(%d)\n", + vport->vport, err); + vport->egress.allowed_vlan = NULL; + } + + kvfree(spec); + return err; +} + +void esw_acl_egress_vlan_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan)) { + mlx5_del_flow_rules(vport->egress.allowed_vlan); + vport->egress.allowed_vlan = NULL; + } +} + +int esw_acl_egress_vlan_grp_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + u32 start_index, u32 end_index) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *vlan_grp; + void *match_criteria; + u32 *flow_group_in; + int ret = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, match_criteria); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.svlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, start_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, end_index); + + vlan_grp = mlx5_create_flow_group(vport->egress.acl, flow_group_in); + if (IS_ERR(vlan_grp)) { + ret = PTR_ERR(vlan_grp); + esw_warn(esw->dev, + "Failed to create E-Switch vport[%d] egress pop vlans flow group, err(%d)\n", + vport->vport, ret); + goto out; + } + vport->egress.vlan_grp = vlan_grp; + +out: + kvfree(flow_group_in); + return ret; +} + +void esw_acl_egress_vlan_grp_destroy(struct mlx5_vport *vport) +{ + if (!IS_ERR_OR_NULL(vport->egress.vlan_grp)) { + mlx5_destroy_flow_group(vport->egress.vlan_grp); + vport->egress.vlan_grp = NULL; + } +} + +void esw_acl_egress_table_destroy(struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->egress.acl)) + return; + + mlx5_destroy_flow_table(vport->egress.acl); + vport->egress.acl = NULL; +} + +void esw_acl_ingress_table_destroy(struct mlx5_vport *vport) +{ + if (!vport->ingress.acl) + return; + + mlx5_destroy_flow_table(vport->ingress.acl); + vport->ingress.acl = 
NULL; +} + +void esw_acl_ingress_allow_rule_destroy(struct mlx5_vport *vport) +{ + if (!vport->ingress.allow_rule) + return; + + mlx5_del_flow_rules(vport->ingress.allow_rule); + vport->ingress.allow_rule = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h new file mode 100644 index 0000000..d2ff587 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/helper.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_HELPER_H__ +#define __MLX5_ESWITCH_ACL_HELPER_H__ + +#include "eswitch.h" + +/* General acl helper functions */ +struct mlx5_flow_table * +esw_acl_table_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, int ns, + int prio, int size); + +/* Egress acl helper functions */ +void esw_acl_egress_table_destroy(struct mlx5_vport *vport); +int esw_egress_acl_vlan_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + struct mlx5_flow_destination *fwd_dest, + __be16 vlan_proto, u16 vlan_id, u32 flow_action); +void esw_acl_egress_vlan_destroy(struct mlx5_vport *vport); +int esw_acl_egress_vlan_grp_create(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + u32 start_index, u32 end_index); +void esw_acl_egress_vlan_grp_destroy(struct mlx5_vport *vport); + +/* Ingress acl helper functions */ +void esw_acl_ingress_table_destroy(struct mlx5_vport *vport); +void esw_acl_ingress_allow_rule_destroy(struct mlx5_vport *vport); + +#endif /* __MLX5_ESWITCH_ACL_HELPER_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c new file mode 100644 index 0000000..f43f934 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "lgcy.h" + +static void esw_acl_ingress_lgcy_rules_destroy(struct mlx5_vport *vport) +{ + struct mlx5_acl_vlan *trunk_vlan_rule, *tmp; + + if (vport->ingress.legacy.drop_rule) { + mlx5_del_flow_rules(vport->ingress.legacy.drop_rule); + vport->ingress.legacy.drop_rule = NULL; + } + + list_for_each_entry_safe(trunk_vlan_rule, tmp, + &vport->ingress.legacy.allow_vlans_rules, + list) { + mlx5_del_flow_rules(trunk_vlan_rule->acl_vlan_rule); + list_del(&trunk_vlan_rule->list); + kfree(trunk_vlan_rule); + } + + if (vport->ingress.legacy.allow_untagged_rule) { + mlx5_del_flow_rules(vport->ingress.legacy.allow_untagged_rule); + vport->ingress.legacy.allow_untagged_rule = NULL; + } +} + +static int esw_acl_ingress_lgcy_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap, + VLAN_N_VID); + enum esw_vst_mode vst_mode = esw_get_vst_mode(esw); + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *untagged_spoof_grp = NULL; + struct mlx5_flow_group *tagged_spoof_grp = NULL; + struct mlx5_flow_table *acl = vport->ingress.acl; + struct mlx5_flow_group *drop_grp = NULL; + struct mlx5_core_dev *dev = esw->dev; + void *match_criteria; + bool push_on_any_pkt; + int allow_grp_sz = 1; + u32 *flow_group_in; + int err; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + push_on_any_pkt = (vst_mode != ESW_VST_MODE_BASIC) && + !vport->info.spoofchk && !need_vlan_filter; + if (!push_on_any_pkt) + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + + if (need_vlan_filter || (vst_mode == ESW_VST_MODE_BASIC && + (vport->info.vlan || vport->info.qos))) + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + + if (vport->info.spoofchk) { + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + } + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + untagged_spoof_grp = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(untagged_spoof_grp)) { + err = PTR_ERR(untagged_spoof_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] ingress untagged spoofchk flow group, err(%d)\n", + vport->vport, err); + goto spoof_err; + } + + if (push_on_any_pkt) + goto set_grp; + + if (!need_vlan_filter) + goto drop_grp; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + if (vport->info.spoofchk) { + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0); + } + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, VLAN_N_VID); + allow_grp_sz = VLAN_N_VID + 1; + + tagged_spoof_grp = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(tagged_spoof_grp)) { + err = PTR_ERR(tagged_spoof_grp); + esw_warn(dev, "Failed to create E-Switch 
vport[%d] ingress spoofchk flow group, err(%d)\n", + vport->vport, err); + goto allow_spoof_err; + } + +drop_grp: + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, allow_grp_sz); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, allow_grp_sz); + + drop_grp = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(drop_grp)) { + err = PTR_ERR(drop_grp); + esw_warn(dev, "Failed to create E-Switch vport[%d] ingress drop flow group, err(%d)\n", + vport->vport, err); + goto drop_err; + } + +set_grp: + vport->ingress.legacy.allow_untagged_spoofchk_grp = untagged_spoof_grp; + vport->ingress.legacy.allow_tagged_spoofchk_grp = tagged_spoof_grp; + vport->ingress.legacy.drop_grp = drop_grp; + kvfree(flow_group_in); + return 0; + +drop_err: + if (!IS_ERR_OR_NULL(tagged_spoof_grp)) + mlx5_destroy_flow_group(tagged_spoof_grp); +allow_spoof_err: + if (!IS_ERR_OR_NULL(untagged_spoof_grp)) + mlx5_destroy_flow_group(untagged_spoof_grp); +spoof_err: + kvfree(flow_group_in); + return err; +} + +static void esw_acl_ingress_lgcy_groups_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.legacy.allow_tagged_spoofchk_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_tagged_spoofchk_grp); + vport->ingress.legacy.allow_tagged_spoofchk_grp = NULL; + } + if (vport->ingress.legacy.allow_untagged_spoofchk_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.allow_untagged_spoofchk_grp); + vport->ingress.legacy.allow_untagged_spoofchk_grp = NULL; + } + if (vport->ingress.legacy.drop_grp) { + mlx5_destroy_flow_group(vport->ingress.legacy.drop_grp); + vport->ingress.legacy.drop_grp = NULL; + } +} + +int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + bool need_vlan_filter = !!bitmap_weight(vport->info.vlan_trunk_8021q_bitmap, + VLAN_N_VID); + enum esw_vst_mode vst_mode = esw_get_vst_mode(esw); + struct mlx5_flow_destination drop_ctr_dst = {}; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec = NULL; + struct mlx5_acl_vlan *trunk_vlan_rule; + struct mlx5_fc *counter = NULL; + bool need_acl_table = true; + bool push_on_any_pkt; + /* The ingress acl table contains 4 groups + * (2 active rules at the same time - + * 1 allow rule from one of the first 3 groups. + * 1 drop rule from the last group): + * 1)Allow untagged traffic with smac=original mac. + * 2)Allow untagged traffic. + * 3)Allow tagged traffic with smac=original mac. + * 4)Drop all other traffic. + */ + int table_size = need_vlan_filter ? 
8192 : 4; + int dest_num = 0; + u16 vlan_id = 0; + int err = 0; + u8 *smac_v; + + if ((vport->info.vlan || vport->info.qos) && need_vlan_filter) { + mlx5_core_warn(esw->dev, + "vport[%d] configure ingress rules failed, Cannot enable both VGT+ and VST\n", + vport->vport); + return -EPERM; + } + + need_acl_table = vport->info.vlan || vport->info.qos || + vport->info.spoofchk || need_vlan_filter; + + esw_acl_ingress_lgcy_rules_destroy(vport); + + esw_acl_ingress_lgcy_cleanup(esw, vport); + if (!need_acl_table) + return 0; + + if (vport->ingress.legacy.drop_counter) { + counter = vport->ingress.legacy.drop_counter; + } else if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { + counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(counter)) { + esw_warn(esw->dev, + "vport[%d] configure ingress drop rule counter failed\n", + vport->vport); + counter = NULL; + } + vport->ingress.legacy.drop_counter = counter; + } + + vport->ingress.acl = esw_acl_table_create(esw, vport, + MLX5_FLOW_NAMESPACE_ESW_INGRESS, 0, table_size); + + if (IS_ERR_OR_NULL(vport->ingress.acl)) { + err = PTR_ERR(vport->ingress.acl); + vport->ingress.acl = NULL; + return err; + } + + err = esw_acl_ingress_lgcy_groups_create(esw, vport); + if (err) + goto out; + + esw_debug(esw->dev, + "vport[%d] configure ingress rules, vlan(%d) qos(%d) vst_mode (%d)\n", + vport->vport, vport->info.vlan, vport->info.qos, vst_mode); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + push_on_any_pkt = (vst_mode != ESW_VST_MODE_BASIC) && + !vport->info.spoofchk && !need_vlan_filter; + if (!push_on_any_pkt) + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + if (vst_mode == ESW_VST_MODE_STEERING && + (vport->info.vlan || vport->info.qos)) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; + flow_act.vlan[0].prio = vport->info.qos; + flow_act.vlan[0].vid = vport->info.vlan; + flow_act.vlan[0].ethtype = ntohs(vport->info.vlan_proto); + } + + if (need_vlan_filter || (vst_mode == ESW_VST_MODE_BASIC && + (vport->info.vlan || vport->info.qos))) + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + + if (vport->info.spoofchk) { + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_15_0); + smac_v = MLX5_ADDR_OF(fte_match_param, + spec->match_value, + outer_headers.smac_47_16); + ether_addr_copy(smac_v, vport->info.mac); + } + + /* Allow untagged */ + if (!need_vlan_filter || + (need_vlan_filter && + test_bit(0, vport->info.vlan_trunk_8021q_bitmap))) { + vport->ingress.legacy.allow_untagged_rule = + mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.legacy.allow_untagged_rule)) { + err = PTR_ERR(vport->ingress.legacy.allow_untagged_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress allow rule, err(%d)\n", + vport->vport, err); + vport->ingress.legacy.allow_untagged_rule = NULL; + goto out; + } + } + + if (push_on_any_pkt) + goto out; + + if (!need_vlan_filter) + goto drop_rule; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid); + + /* VGT+ rules */ + for_each_set_bit(vlan_id, vport->acl_vlan_8021q_bitmap, VLAN_N_VID) { + trunk_vlan_rule = 
kzalloc(sizeof(*trunk_vlan_rule), GFP_KERNEL); + if (!trunk_vlan_rule) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(fte_match_param, + spec->match_value, outer_headers.first_vid, vlan_id); + trunk_vlan_rule->acl_vlan_rule = + mlx5_add_flow_rules(vport->ingress.acl, + spec, &flow_act, NULL, 0); + if (IS_ERR(trunk_vlan_rule->acl_vlan_rule)) { + err = PTR_ERR(trunk_vlan_rule->acl_vlan_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress allowed vlan rule failed, err(%d)\n", + vport->vport, err); + trunk_vlan_rule->acl_vlan_rule = NULL; + goto out; + } + list_add(&trunk_vlan_rule->list, + &vport->ingress.legacy.allow_vlans_rules); + } + +drop_rule: + memset(spec, 0, sizeof(*spec)); + memset(&flow_act, 0, sizeof(flow_act)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + /* Attach drop flow counter */ + if (counter) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter_id = mlx5_fc_id(counter); + dst = &drop_ctr_dst; + dest_num++; + } + vport->ingress.legacy.drop_rule = + mlx5_add_flow_rules(vport->ingress.acl, NULL, + &flow_act, dst, dest_num); + if (IS_ERR(vport->ingress.legacy.drop_rule)) { + err = PTR_ERR(vport->ingress.legacy.drop_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress drop rule, err(%d)\n", + vport->vport, err); + vport->ingress.legacy.drop_rule = NULL; + goto out; + } + kvfree(spec); + return 0; + +out: + if (err) + esw_acl_ingress_lgcy_cleanup(esw, vport); + kvfree(spec); + return err; +} + +void esw_acl_ingress_lgcy_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (IS_ERR_OR_NULL(vport->ingress.acl)) + goto clean_drop_counter; + + esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport); + + esw_acl_ingress_lgcy_rules_destroy(vport); + esw_acl_ingress_lgcy_groups_destroy(vport); + esw_acl_ingress_table_destroy(vport); + +clean_drop_counter: + if (!IS_ERR_OR_NULL(vport->ingress.legacy.drop_counter)) { + mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); + vport->ingress.legacy.drop_counter = NULL; + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c new file mode 100644 index 0000000..b8d216a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_ofld.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "eswitch.h" +#include "helper.h" +#include "ofld.h" +#include "esw/vf_meter.h" + +#define MLX5_ESW_INGRESS_ACL_DEFAULT_PRIO 4 + +static bool +esw_acl_ingress_prio_tag_enabled(struct mlx5_eswitch *esw, + const struct mlx5_vport *vport) +{ + return (MLX5_CAP_GEN(esw->dev, prio_tag_required) && + mlx5_eswitch_is_vf_vport(esw, vport->vport)); +} + +static int esw_acl_ingress_prio_tag_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + int err = 0; + + /* For prio tag mode, there is only 1 FTEs: + * 1) Untagged packets - push prio tag VLAN and modify metadata if + * required, allow + * Unmatched traffic is allowed by default + */ + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + /* Untagged packets - push prio tag VLAN, allow */ + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 0); + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | + MLX5_FLOW_CONTEXT_ACTION_ALLOW; + flow_act.vlan[0].ethtype = ETH_P_8021Q; + flow_act.vlan[0].vid = 0; + flow_act.vlan[0].prio = 0; + + if (vport->ingress.offloads.modify_metadata_rule) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; + } + + vport->ingress.allow_rule = mlx5_add_flow_rules(vport->ingress.acl, spec, + &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.allow_rule)) { + err = PTR_ERR(vport->ingress.allow_rule); + esw_warn(esw->dev, + "vport[%d] configure ingress untagged allow rule, err(%d)\n", + vport->vport, err); + vport->ingress.allow_rule = NULL; + } + + kvfree(spec); + return err; +} + +static int esw_acl_ingress_mod_metadata_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5_flow_act flow_act = {}; + int err = 0; + u32 key; + + key = mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport); + key >>= ESW_SOURCE_PORT_METADATA_OFFSET; + + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, + MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); + MLX5_SET(set_action_in, action, data, key); + MLX5_SET(set_action_in, action, offset, + ESW_SOURCE_PORT_METADATA_OFFSET); + MLX5_SET(set_action_in, action, length, + ESW_SOURCE_PORT_METADATA_BITS); + + vport->ingress.offloads.modify_metadata = + mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, + 1, action); + if (IS_ERR(vport->ingress.offloads.modify_metadata)) { + err = PTR_ERR(vport->ingress.offloads.modify_metadata); + esw_warn(esw->dev, + "failed to alloc modify header for vport %d ingress acl (%d)\n", + vport->vport, err); + return err; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_ALLOW; + flow_act.modify_hdr = vport->ingress.offloads.modify_metadata; + flow_act.fg = vport->ingress.offloads.metadata_allmatch_grp; + vport->ingress.offloads.modify_metadata_rule = + mlx5_add_flow_rules(vport->ingress.acl, + NULL, &flow_act, NULL, 0); + if (IS_ERR(vport->ingress.offloads.modify_metadata_rule)) { + err = PTR_ERR(vport->ingress.offloads.modify_metadata_rule); + esw_warn(esw->dev, + "failed to add setting metadata rule for vport %d ingress acl, err(%d)\n", + vport->vport, err); + mlx5_modify_header_dealloc(esw->dev, 
vport->ingress.offloads.modify_metadata); + vport->ingress.offloads.modify_metadata_rule = NULL; + } + return err; +} + +static void esw_acl_ingress_mod_metadata_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!vport->ingress.offloads.modify_metadata_rule) + return; + + mlx5_del_flow_rules(vport->ingress.offloads.modify_metadata_rule); + mlx5_modify_header_dealloc(esw->dev, vport->ingress.offloads.modify_metadata); + vport->ingress.offloads.modify_metadata_rule = NULL; +} + +static int esw_acl_ingress_src_port_drop_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *flow_rule; + int err = 0; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + flow_act.fg = vport->ingress.offloads.drop_grp; + flow_rule = mlx5_add_flow_rules(vport->ingress.acl, NULL, &flow_act, NULL, 0); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + goto out; + } + + vport->ingress.offloads.drop_rule = flow_rule; +out: + return err; +} + +static void esw_acl_ingress_src_port_drop_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!vport->ingress.offloads.drop_rule) + return; + + mlx5_del_flow_rules(vport->ingress.offloads.drop_rule); + vport->ingress.offloads.drop_rule = NULL; +} + +static int esw_acl_ingress_ofld_rules_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int err; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + err = esw_acl_ingress_mod_metadata_create(esw, vport); + if (err) { + esw_warn(esw->dev, + "vport(%d) create ingress modify metadata, err(%d)\n", + vport->vport, err); + return err; + } + } + + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) { + err = esw_acl_ingress_prio_tag_create(esw, vport); + if (err) { + esw_warn(esw->dev, + "vport(%d) create ingress prio tag rule, err(%d)\n", + vport->vport, err); + goto prio_tag_err; + } + } + + return 0; + +prio_tag_err: + esw_acl_ingress_mod_metadata_destroy(esw, vport); + return err; +} + +static void esw_acl_ingress_ofld_rules_destroy(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + esw_acl_ingress_allow_rule_destroy(vport); + esw_acl_ingress_mod_metadata_destroy(esw, vport); + esw_acl_ingress_src_port_drop_destroy(esw, vport); +} + +static int esw_acl_ingress_ofld_groups_create(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + void *match_criteria; + u32 *flow_group_in; + u32 flow_index = 0; + int ret = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + if (vport->vport == MLX5_VPORT_UPLINK) { + /* This group can hold an FTE to drop all traffic. + * Need in case LAG is enabled. + */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + ret = PTR_ERR(g); + esw_warn(esw->dev, "vport[%d] ingress create drop flow group, err(%d)\n", + vport->vport, ret); + goto drop_err; + } + vport->ingress.offloads.drop_grp = g; + flow_index++; + } + + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) { + /* This group is to hold FTE to match untagged packets when prio_tag + * is enabled. 
+ */ + memset(flow_group_in, 0, inlen); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, match_criteria); + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + ret = PTR_ERR(g); + esw_warn(esw->dev, "vport[%d] ingress create untagged flow group, err(%d)\n", + vport->vport, ret); + goto prio_tag_err; + } + vport->ingress.offloads.metadata_prio_tag_grp = g; + flow_index++; + } + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + /* This group holds an FTE with no match to add metadata for + * tagged packets if prio-tag is enabled, or for all untagged + * traffic in case prio-tag is disabled. + */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, flow_index); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, flow_index); + + g = mlx5_create_flow_group(vport->ingress.acl, flow_group_in); + if (IS_ERR(g)) { + ret = PTR_ERR(g); + esw_warn(esw->dev, "vport[%d] ingress create drop flow group, err(%d)\n", + vport->vport, ret); + goto metadata_err; + } + vport->ingress.offloads.metadata_allmatch_grp = g; + } + + kvfree(flow_group_in); + return 0; + +metadata_err: + if (!IS_ERR_OR_NULL(vport->ingress.offloads.metadata_prio_tag_grp)) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); + vport->ingress.offloads.metadata_prio_tag_grp = NULL; + } +prio_tag_err: + if (!IS_ERR_OR_NULL(vport->ingress.offloads.drop_grp)) { + mlx5_destroy_flow_group(vport->ingress.offloads.drop_grp); + vport->ingress.offloads.drop_grp = NULL; + } +drop_err: + kvfree(flow_group_in); + return ret; +} + +static void esw_acl_ingress_ofld_groups_destroy(struct mlx5_vport *vport) +{ + if (vport->ingress.offloads.metadata_allmatch_grp) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_allmatch_grp); + vport->ingress.offloads.metadata_allmatch_grp = NULL; + } + + if (vport->ingress.offloads.metadata_prio_tag_grp) { + mlx5_destroy_flow_group(vport->ingress.offloads.metadata_prio_tag_grp); + vport->ingress.offloads.metadata_prio_tag_grp = NULL; + } + + if (vport->ingress.offloads.drop_grp) { + mlx5_destroy_flow_group(vport->ingress.offloads.drop_grp); + vport->ingress.offloads.drop_grp = NULL; + } +} + +int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int num_ftes = 0; + int err; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw) && + !esw_acl_ingress_prio_tag_enabled(esw, vport)) + return 0; + + esw_acl_ingress_allow_rule_destroy(vport); + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) + num_ftes++; + if (vport->vport == MLX5_VPORT_UPLINK) + num_ftes++; + if (esw_acl_ingress_prio_tag_enabled(esw, vport)) + num_ftes++; + + vport->ingress.acl = esw_acl_table_create(esw, vport, + MLX5_FLOW_NAMESPACE_ESW_INGRESS, + MLX5_ESW_INGRESS_ACL_DEFAULT_PRIO, + num_ftes); + if (IS_ERR(vport->ingress.acl)) { + err = PTR_ERR(vport->ingress.acl); + vport->ingress.acl = NULL; + return err; + } + + err = esw_acl_ingress_ofld_groups_create(esw, vport); + if (err) + goto group_err; + + esw_debug(esw->dev, + "vport[%d] configure ingress rules\n", vport->vport); + + err = esw_acl_ingress_ofld_rules_create(esw, vport); 
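+ /* On failure the labels below unwind in reverse order: destroy the flow groups, then the ingress ACL table */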
+ if (err) + goto rules_err; + + return 0; + +rules_err: + esw_acl_ingress_ofld_groups_destroy(vport); +group_err: + esw_acl_ingress_table_destroy(vport); + return err; +} + +void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + esw_vf_meter_ingress_destroy(vport); + esw_acl_ingress_ofld_rules_destroy(esw, vport); + esw_acl_ingress_ofld_groups_destroy(vport); + esw_acl_ingress_table_destroy(vport); +} + +/* Caller must hold rtnl_lock */ +int mlx5_esw_acl_ingress_vport_bond_update(struct mlx5_eswitch *esw, u16 vport_num, + u32 metadata) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + int err; + + if (WARN_ON_ONCE(IS_ERR(vport))) { + esw_warn(esw->dev, "vport(%d) invalid!\n", vport_num); + err = PTR_ERR(vport); + goto out; + } + + esw_acl_ingress_ofld_rules_destroy(esw, vport); + + vport->metadata = metadata ? metadata : vport->default_metadata; + + /* Recreate ingress acl rules with vport->metadata */ + err = esw_acl_ingress_ofld_rules_create(esw, vport); + if (err) + goto out; + + return 0; + +out: + vport->metadata = vport->default_metadata; + return err; +} + +int mlx5_esw_acl_ingress_vport_drop_rule_create(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (IS_ERR(vport)) { + esw_warn(esw->dev, "vport(%d) invalid!\n", vport_num); + return PTR_ERR(vport); + } + + return esw_acl_ingress_src_port_drop_create(esw, vport); +} + +void mlx5_esw_acl_ingress_vport_drop_rule_destroy(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (WARN_ON_ONCE(IS_ERR(vport))) { + esw_warn(esw->dev, "vport(%d) invalid!\n", vport_num); + return; + } + + esw_acl_ingress_src_port_drop_destroy(esw, vport); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h new file mode 100644 index 0000000..44c152d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/lgcy.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. */ + +#ifndef __MLX5_ESWITCH_ACL_LGCY_H__ +#define __MLX5_ESWITCH_ACL_LGCY_H__ + +#include "eswitch.h" + +/* Eswitch acl egress external APIs */ +int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_lgcy_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + +/* Eswitch acl ingress external APIs */ +int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_ingress_lgcy_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + +#endif /* __MLX5_ESWITCH_ACL_LGCY_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h new file mode 100644 index 0000000..11d3d39 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ofld.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Inc. All rights reserved. 
*/ + +#ifndef __MLX5_ESWITCH_ACL_OFLD_H__ +#define __MLX5_ESWITCH_ACL_OFLD_H__ + +#include "eswitch.h" + +#ifdef CONFIG_MLX5_ESWITCH +/* Eswitch acl egress external APIs */ +int esw_acl_egress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_egress_ofld_cleanup(struct mlx5_vport *vport); +int mlx5_esw_acl_egress_vport_bond(struct mlx5_eswitch *esw, u16 active_vport_num, + u16 passive_vport_num); +int mlx5_esw_acl_egress_vport_unbond(struct mlx5_eswitch *esw, u16 vport_num); + +static inline bool mlx5_esw_acl_egress_fwd2vport_supported(struct mlx5_eswitch *esw) +{ + return esw && esw->mode == MLX5_ESWITCH_OFFLOADS && + mlx5_eswitch_vport_match_metadata_enabled(esw) && + MLX5_CAP_ESW_FLOWTABLE(esw->dev, egress_acl_forward_to_vport); +} + +/* Eswitch acl ingress external APIs */ +int esw_acl_ingress_ofld_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_acl_ingress_ofld_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +int mlx5_esw_acl_ingress_vport_bond_update(struct mlx5_eswitch *esw, u16 vport_num, + u32 metadata); +void mlx5_esw_acl_ingress_vport_drop_rule_destroy(struct mlx5_eswitch *esw, u16 vport_num); +int mlx5_esw_acl_ingress_vport_drop_rule_create(struct mlx5_eswitch *esw, u16 vport_num); + +#else /* CONFIG_MLX5_ESWITCH */ +static void +mlx5_esw_acl_ingress_vport_drop_rule_destroy(struct mlx5_eswitch *esw, + u16 vport_num) +{} + +static int mlx5_esw_acl_ingress_vport_drop_rule_create(struct mlx5_eswitch *esw, + u16 vport_num) +{ + return 0; +} +#endif /* CONFIG_MLX5_ESWITCH */ +#endif /* __MLX5_ESWITCH_ACL_OFLD_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c new file mode 100644 index 0000000..05e08ce --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -0,0 +1,1605 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#include +#include +#include +#include +#include "lib/devcom.h" +#include "bridge.h" +#include "eswitch.h" +#include "bridge_priv.h" +#define CREATE_TRACE_POINTS +#include "diag/bridge_tracepoint.h" + +#define MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE 64000 +#define MLX5_ESW_BRIDGE_INGRESS_TABLE_VLAN_GRP_IDX_FROM 0 +#define MLX5_ESW_BRIDGE_INGRESS_TABLE_VLAN_GRP_IDX_TO (MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE / 4 - 1) +#define MLX5_ESW_BRIDGE_INGRESS_TABLE_FILTER_GRP_IDX_FROM \ + (MLX5_ESW_BRIDGE_INGRESS_TABLE_VLAN_GRP_IDX_TO + 1) +#define MLX5_ESW_BRIDGE_INGRESS_TABLE_FILTER_GRP_IDX_TO \ + (MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE / 2 - 1) +#define MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_FROM \ + (MLX5_ESW_BRIDGE_INGRESS_TABLE_FILTER_GRP_IDX_TO + 1) +#define MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_TO (MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE - 1) + +#define MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE 64000 +#define MLX5_ESW_BRIDGE_EGRESS_TABLE_VLAN_GRP_IDX_FROM 0 +#define MLX5_ESW_BRIDGE_EGRESS_TABLE_VLAN_GRP_IDX_TO (MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE / 2 - 1) +#define MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_FROM \ + (MLX5_ESW_BRIDGE_EGRESS_TABLE_VLAN_GRP_IDX_TO + 1) +#define MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_TO (MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE - 2) +#define MLX5_ESW_BRIDGE_EGRESS_TABLE_MISS_GRP_IDX_FROM \ + (MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_TO + 1) +#define MLX5_ESW_BRIDGE_EGRESS_TABLE_MISS_GRP_IDX_TO (MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE - 1) + +#define MLX5_ESW_BRIDGE_SKIP_TABLE_SIZE 0 + +enum { + MLX5_ESW_BRIDGE_LEVEL_INGRESS_TABLE, + MLX5_ESW_BRIDGE_LEVEL_EGRESS_TABLE, + MLX5_ESW_BRIDGE_LEVEL_SKIP_TABLE, +}; + +static const struct rhashtable_params fdb_ht_params = { + .key_offset = offsetof(struct mlx5_esw_bridge_fdb_entry, key), + .key_len = sizeof(struct mlx5_esw_bridge_fdb_key), + .head_offset = offsetof(struct mlx5_esw_bridge_fdb_entry, ht_node), + .automatic_shrinking = true, +}; + +enum { + MLX5_ESW_BRIDGE_VLAN_FILTERING_FLAG = BIT(0), +}; + +struct mlx5_esw_bridge { + int ifindex; + int refcnt; + struct list_head list; + struct mlx5_esw_bridge_offloads *br_offloads; + + struct list_head fdb_list; + struct rhashtable fdb_ht; + + struct mlx5_flow_table *egress_ft; + struct mlx5_flow_group *egress_vlan_fg; + struct mlx5_flow_group *egress_mac_fg; + struct mlx5_flow_group *egress_miss_fg; + struct mlx5_pkt_reformat *egress_miss_pkt_reformat; + struct mlx5_flow_handle *egress_miss_handle; + unsigned long ageing_time; + u32 flags; +}; + +static void +mlx5_esw_bridge_fdb_offload_notify(struct net_device *dev, const unsigned char *addr, u16 vid, + unsigned long val) +{ + struct switchdev_notifier_fdb_info send_info = {}; + + send_info.addr = addr; + send_info.vid = vid; + send_info.offloaded = true; + call_switchdev_notifiers(val, dev, &send_info.info, NULL); +} + +static void +mlx5_esw_bridge_fdb_del_notify(struct mlx5_esw_bridge_fdb_entry *entry) +{ + if (!(entry->flags & (MLX5_ESW_BRIDGE_FLAG_ADDED_BY_USER | MLX5_ESW_BRIDGE_FLAG_PEER))) + mlx5_esw_bridge_fdb_offload_notify(entry->dev, entry->key.addr, + entry->key.vid, + SWITCHDEV_FDB_DEL_TO_BRIDGE); +} + +static bool mlx5_esw_bridge_pkt_reformat_vlan_pop_supported(struct mlx5_eswitch *esw) +{ + return BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat_remove)) && + MLX5_CAP_GEN_2(esw->dev, max_reformat_remove_size) >= sizeof(struct vlan_hdr) && + MLX5_CAP_GEN_2(esw->dev, max_reformat_remove_offset) >= + offsetof(struct vlan_ethhdr, h_vlan_proto); +} + +static struct mlx5_pkt_reformat * +mlx5_esw_bridge_pkt_reformat_vlan_pop_create(struct 
mlx5_eswitch *esw) +{ + struct mlx5_pkt_reformat_params reformat_params = {}; + + reformat_params.type = MLX5_REFORMAT_TYPE_REMOVE_HDR; + reformat_params.param_0 = MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START; + reformat_params.param_1 = offsetof(struct vlan_ethhdr, h_vlan_proto); + reformat_params.size = sizeof(struct vlan_hdr); + return mlx5_packet_reformat_alloc(esw->dev, &reformat_params, MLX5_FLOW_NAMESPACE_FDB); +} + +static struct mlx5_flow_table * +mlx5_esw_bridge_table_create(int max_fte, u32 level, struct mlx5_eswitch *esw) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *fdb; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!ns) { + esw_warn(dev, "Failed to get FDB namespace\n"); + return ERR_PTR(-ENOENT); + } + + ft_attr.flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + ft_attr.max_fte = max_fte; + ft_attr.level = level; + ft_attr.prio = FDB_BR_OFFLOAD; + fdb = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(fdb)) + esw_warn(dev, "Failed to create bridge FDB Table (err=%ld)\n", PTR_ERR(fdb)); + + return fdb; +} + +static struct mlx5_flow_group * +mlx5_esw_bridge_ingress_vlan_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *ingress_ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + u32 *in, *match; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS_2); + match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_15_0); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.first_vid); + + MLX5_SET(fte_match_param, match, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + MLX5_SET(create_flow_group_in, in, start_flow_index, + MLX5_ESW_BRIDGE_INGRESS_TABLE_VLAN_GRP_IDX_FROM); + MLX5_SET(create_flow_group_in, in, end_flow_index, + MLX5_ESW_BRIDGE_INGRESS_TABLE_VLAN_GRP_IDX_TO); + + fg = mlx5_create_flow_group(ingress_ft, in); + kvfree(in); + if (IS_ERR(fg)) + esw_warn(esw->dev, + "Failed to create VLAN flow group for bridge ingress table (err=%ld)\n", + PTR_ERR(fg)); + + return fg; +} + +static struct mlx5_flow_group * +mlx5_esw_bridge_ingress_filter_fg_create(struct mlx5_eswitch *esw, + struct mlx5_flow_table *ingress_ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + u32 *in, *match; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS_2); + match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_15_0); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.cvlan_tag); + + MLX5_SET(fte_match_param, match, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + MLX5_SET(create_flow_group_in, in, start_flow_index, + MLX5_ESW_BRIDGE_INGRESS_TABLE_FILTER_GRP_IDX_FROM); + MLX5_SET(create_flow_group_in, in, end_flow_index, + MLX5_ESW_BRIDGE_INGRESS_TABLE_FILTER_GRP_IDX_TO); + + fg = 
mlx5_create_flow_group(ingress_ft, in); + if (IS_ERR(fg)) + esw_warn(esw->dev, + "Failed to create bridge ingress table VLAN filter flow group (err=%ld)\n", + PTR_ERR(fg)); + + kvfree(in); + return fg; +} + +static struct mlx5_flow_group * +mlx5_esw_bridge_ingress_mac_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *ingress_ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + u32 *in, *match; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS_2); + match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.smac_15_0); + + MLX5_SET(fte_match_param, match, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + MLX5_SET(create_flow_group_in, in, start_flow_index, + MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_FROM); + MLX5_SET(create_flow_group_in, in, end_flow_index, + MLX5_ESW_BRIDGE_INGRESS_TABLE_MAC_GRP_IDX_TO); + + fg = mlx5_create_flow_group(ingress_ft, in); + if (IS_ERR(fg)) + esw_warn(esw->dev, + "Failed to create MAC flow group for bridge ingress table (err=%ld)\n", + PTR_ERR(fg)); + + kvfree(in); + return fg; +} + +static struct mlx5_flow_group * +mlx5_esw_bridge_egress_vlan_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *egress_ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + u32 *in, *match; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.dmac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.dmac_15_0); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.first_vid); + + MLX5_SET(create_flow_group_in, in, start_flow_index, + MLX5_ESW_BRIDGE_EGRESS_TABLE_VLAN_GRP_IDX_FROM); + MLX5_SET(create_flow_group_in, in, end_flow_index, + MLX5_ESW_BRIDGE_EGRESS_TABLE_VLAN_GRP_IDX_TO); + + fg = mlx5_create_flow_group(egress_ft, in); + if (IS_ERR(fg)) + esw_warn(esw->dev, + "Failed to create VLAN flow group for bridge egress table (err=%ld)\n", + PTR_ERR(fg)); + kvfree(in); + return fg; +} + +static struct mlx5_flow_group * +mlx5_esw_bridge_egress_mac_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *egress_ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + u32 *in, *match; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.dmac_47_16); + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.dmac_15_0); + + MLX5_SET(create_flow_group_in, in, start_flow_index, + MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_FROM); + MLX5_SET(create_flow_group_in, in, end_flow_index, + MLX5_ESW_BRIDGE_EGRESS_TABLE_MAC_GRP_IDX_TO); + + fg = mlx5_create_flow_group(egress_ft, in); + if (IS_ERR(fg)) + esw_warn(esw->dev, + "Failed to create bridge egress table MAC flow group (err=%ld)\n", + PTR_ERR(fg)); + 
kvfree(in); + return fg; +} + +static struct mlx5_flow_group * +mlx5_esw_bridge_egress_miss_fg_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *egress_ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + u32 *in, *match; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, MLX5_MATCH_MISC_PARAMETERS_2); + match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + MLX5_SET(fte_match_param, match, misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK); + + MLX5_SET(create_flow_group_in, in, start_flow_index, + MLX5_ESW_BRIDGE_EGRESS_TABLE_MISS_GRP_IDX_FROM); + MLX5_SET(create_flow_group_in, in, end_flow_index, + MLX5_ESW_BRIDGE_EGRESS_TABLE_MISS_GRP_IDX_TO); + + fg = mlx5_create_flow_group(egress_ft, in); + if (IS_ERR(fg)) + esw_warn(esw->dev, + "Failed to create bridge egress table miss flow group (err=%ld)\n", + PTR_ERR(fg)); + kvfree(in); + return fg; +} + +static int +mlx5_esw_bridge_ingress_table_init(struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_flow_group *mac_fg, *filter_fg, *vlan_fg; + struct mlx5_flow_table *ingress_ft, *skip_ft; + struct mlx5_eswitch *esw = br_offloads->esw; + int err; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) + return -EOPNOTSUPP; + + ingress_ft = mlx5_esw_bridge_table_create(MLX5_ESW_BRIDGE_INGRESS_TABLE_SIZE, + MLX5_ESW_BRIDGE_LEVEL_INGRESS_TABLE, + esw); + if (IS_ERR(ingress_ft)) + return PTR_ERR(ingress_ft); + + skip_ft = mlx5_esw_bridge_table_create(MLX5_ESW_BRIDGE_SKIP_TABLE_SIZE, + MLX5_ESW_BRIDGE_LEVEL_SKIP_TABLE, + esw); + if (IS_ERR(skip_ft)) { + err = PTR_ERR(skip_ft); + goto err_skip_tbl; + } + + vlan_fg = mlx5_esw_bridge_ingress_vlan_fg_create(esw, ingress_ft); + if (IS_ERR(vlan_fg)) { + err = PTR_ERR(vlan_fg); + goto err_vlan_fg; + } + + filter_fg = mlx5_esw_bridge_ingress_filter_fg_create(esw, ingress_ft); + if (IS_ERR(filter_fg)) { + err = PTR_ERR(filter_fg); + goto err_filter_fg; + } + + mac_fg = mlx5_esw_bridge_ingress_mac_fg_create(esw, ingress_ft); + if (IS_ERR(mac_fg)) { + err = PTR_ERR(mac_fg); + goto err_mac_fg; + } + + br_offloads->ingress_ft = ingress_ft; + br_offloads->skip_ft = skip_ft; + br_offloads->ingress_vlan_fg = vlan_fg; + br_offloads->ingress_filter_fg = filter_fg; + br_offloads->ingress_mac_fg = mac_fg; + return 0; + +err_mac_fg: + mlx5_destroy_flow_group(filter_fg); +err_filter_fg: + mlx5_destroy_flow_group(vlan_fg); +err_vlan_fg: + mlx5_destroy_flow_table(skip_ft); +err_skip_tbl: + mlx5_destroy_flow_table(ingress_ft); + return err; +} + +static void +mlx5_esw_bridge_ingress_table_cleanup(struct mlx5_esw_bridge_offloads *br_offloads) +{ + mlx5_destroy_flow_group(br_offloads->ingress_mac_fg); + br_offloads->ingress_mac_fg = NULL; + mlx5_destroy_flow_group(br_offloads->ingress_filter_fg); + br_offloads->ingress_filter_fg = NULL; + mlx5_destroy_flow_group(br_offloads->ingress_vlan_fg); + br_offloads->ingress_vlan_fg = NULL; + mlx5_destroy_flow_table(br_offloads->skip_ft); + br_offloads->skip_ft = NULL; + mlx5_destroy_flow_table(br_offloads->ingress_ft); + br_offloads->ingress_ft = NULL; +} + +static struct mlx5_flow_handle * +mlx5_esw_bridge_egress_miss_flow_create(struct mlx5_flow_table *egress_ft, + struct mlx5_flow_table *skip_ft, + struct mlx5_pkt_reformat *pkt_reformat); + +static int +mlx5_esw_bridge_egress_table_init(struct mlx5_esw_bridge_offloads *br_offloads, + struct mlx5_esw_bridge *bridge) +{ + struct mlx5_flow_group *miss_fg = NULL, *mac_fg, 
*vlan_fg; + struct mlx5_pkt_reformat *miss_pkt_reformat = NULL; + struct mlx5_flow_handle *miss_handle = NULL; + struct mlx5_eswitch *esw = br_offloads->esw; + struct mlx5_flow_table *egress_ft; + int err; + + egress_ft = mlx5_esw_bridge_table_create(MLX5_ESW_BRIDGE_EGRESS_TABLE_SIZE, + MLX5_ESW_BRIDGE_LEVEL_EGRESS_TABLE, + esw); + if (IS_ERR(egress_ft)) + return PTR_ERR(egress_ft); + + vlan_fg = mlx5_esw_bridge_egress_vlan_fg_create(esw, egress_ft); + if (IS_ERR(vlan_fg)) { + err = PTR_ERR(vlan_fg); + goto err_vlan_fg; + } + + mac_fg = mlx5_esw_bridge_egress_mac_fg_create(esw, egress_ft); + if (IS_ERR(mac_fg)) { + err = PTR_ERR(mac_fg); + goto err_mac_fg; + } + + if (mlx5_esw_bridge_pkt_reformat_vlan_pop_supported(esw)) { + miss_fg = mlx5_esw_bridge_egress_miss_fg_create(esw, egress_ft); + if (IS_ERR(miss_fg)) { + esw_warn(esw->dev, "Failed to create miss flow group (err=%ld)\n", + PTR_ERR(miss_fg)); + miss_fg = NULL; + goto skip_miss_flow; + } + + miss_pkt_reformat = mlx5_esw_bridge_pkt_reformat_vlan_pop_create(esw); + if (IS_ERR(miss_pkt_reformat)) { + esw_warn(esw->dev, + "Failed to alloc packet reformat REMOVE_HEADER (err=%ld)\n", + PTR_ERR(miss_pkt_reformat)); + miss_pkt_reformat = NULL; + mlx5_destroy_flow_group(miss_fg); + miss_fg = NULL; + goto skip_miss_flow; + } + + miss_handle = mlx5_esw_bridge_egress_miss_flow_create(egress_ft, + br_offloads->skip_ft, + miss_pkt_reformat); + if (IS_ERR(miss_handle)) { + esw_warn(esw->dev, "Failed to create miss flow (err=%ld)\n", + PTR_ERR(miss_handle)); + miss_handle = NULL; + mlx5_packet_reformat_dealloc(esw->dev, miss_pkt_reformat); + miss_pkt_reformat = NULL; + mlx5_destroy_flow_group(miss_fg); + miss_fg = NULL; + goto skip_miss_flow; + } + } +skip_miss_flow: + + bridge->egress_ft = egress_ft; + bridge->egress_vlan_fg = vlan_fg; + bridge->egress_mac_fg = mac_fg; + bridge->egress_miss_fg = miss_fg; + bridge->egress_miss_pkt_reformat = miss_pkt_reformat; + bridge->egress_miss_handle = miss_handle; + return 0; + +err_mac_fg: + mlx5_destroy_flow_group(vlan_fg); +err_vlan_fg: + mlx5_destroy_flow_table(egress_ft); + return err; +} + +static void +mlx5_esw_bridge_egress_table_cleanup(struct mlx5_esw_bridge *bridge) +{ + if (bridge->egress_miss_handle) + mlx5_del_flow_rules(bridge->egress_miss_handle); + if (bridge->egress_miss_pkt_reformat) + mlx5_packet_reformat_dealloc(bridge->br_offloads->esw->dev, + bridge->egress_miss_pkt_reformat); + if (bridge->egress_miss_fg) + mlx5_destroy_flow_group(bridge->egress_miss_fg); + mlx5_destroy_flow_group(bridge->egress_mac_fg); + mlx5_destroy_flow_group(bridge->egress_vlan_fg); + mlx5_destroy_flow_table(bridge->egress_ft); +} + +static struct mlx5_flow_handle * +mlx5_esw_bridge_ingress_flow_with_esw_create(u16 vport_num, const unsigned char *addr, + struct mlx5_esw_bridge_vlan *vlan, u32 counter_id, + struct mlx5_esw_bridge *bridge, + struct mlx5_eswitch *esw) +{ + struct mlx5_esw_bridge_offloads *br_offloads = bridge->br_offloads; + struct mlx5_flow_act flow_act = { + .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT, + .flags = FLOW_ACT_NO_APPEND, + }; + struct mlx5_flow_destination dests[2] = {}; + struct mlx5_flow_spec *rule_spec; + struct mlx5_flow_handle *handle; + u8 *smac_v, *smac_c; + + rule_spec = kvzalloc(sizeof(*rule_spec), GFP_KERNEL); + if (!rule_spec) + return ERR_PTR(-ENOMEM); + + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS_2; + + smac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, + outer_headers.smac_47_16); 
+ ether_addr_copy(smac_v, addr); + smac_c = MLX5_ADDR_OF(fte_match_param, rule_spec->match_criteria, + outer_headers.smac_47_16); + eth_broadcast_addr(smac_c); + + MLX5_SET(fte_match_param, rule_spec->match_criteria, + misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(fte_match_param, rule_spec->match_value, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num)); + + if (vlan && vlan->pkt_reformat_push) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + flow_act.pkt_reformat = vlan->pkt_reformat_push; + flow_act.modify_hdr = vlan->pkt_mod_hdr_push_mark; + } else if (vlan) { + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_value, + outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + outer_headers.first_vid); + MLX5_SET(fte_match_param, rule_spec->match_value, outer_headers.first_vid, + vlan->vid); + } + + dests[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dests[0].ft = bridge->egress_ft; + dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dests[1].counter_id = counter_id; + + handle = mlx5_add_flow_rules(br_offloads->ingress_ft, rule_spec, &flow_act, dests, + ARRAY_SIZE(dests)); + + kvfree(rule_spec); + return handle; +} + +static struct mlx5_flow_handle * +mlx5_esw_bridge_ingress_flow_create(u16 vport_num, const unsigned char *addr, + struct mlx5_esw_bridge_vlan *vlan, u32 counter_id, + struct mlx5_esw_bridge *bridge) +{ + return mlx5_esw_bridge_ingress_flow_with_esw_create(vport_num, addr, vlan, counter_id, + bridge, bridge->br_offloads->esw); +} + +static struct mlx5_flow_handle * +mlx5_esw_bridge_ingress_flow_peer_create(u16 vport_num, const unsigned char *addr, + struct mlx5_esw_bridge_vlan *vlan, u32 counter_id, + struct mlx5_esw_bridge *bridge) +{ + struct mlx5_devcom *devcom = bridge->br_offloads->esw->dev->priv.devcom; + static struct mlx5_flow_handle *handle; + struct mlx5_eswitch *peer_esw; + + peer_esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + if (!peer_esw) + return ERR_PTR(-ENODEV); + + handle = mlx5_esw_bridge_ingress_flow_with_esw_create(vport_num, addr, vlan, counter_id, + bridge, peer_esw); + + mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); + return handle; +} + +static struct mlx5_flow_handle * +mlx5_esw_bridge_ingress_filter_flow_create(u16 vport_num, const unsigned char *addr, + struct mlx5_esw_bridge *bridge) +{ + struct mlx5_esw_bridge_offloads *br_offloads = bridge->br_offloads; + struct mlx5_flow_destination dest = { + .type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE, + .ft = br_offloads->skip_ft, + }; + struct mlx5_flow_act flow_act = { + .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + .flags = FLOW_ACT_NO_APPEND, + }; + struct mlx5_flow_spec *rule_spec; + struct mlx5_flow_handle *handle; + u8 *smac_v, *smac_c; + + rule_spec = kvzalloc(sizeof(*rule_spec), GFP_KERNEL); + if (!rule_spec) + return ERR_PTR(-ENOMEM); + + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS_2; + + smac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, + outer_headers.smac_47_16); + ether_addr_copy(smac_v, addr); + smac_c = MLX5_ADDR_OF(fte_match_param, rule_spec->match_criteria, + outer_headers.smac_47_16); + eth_broadcast_addr(smac_c); + + MLX5_SET(fte_match_param, rule_spec->match_criteria, + misc_parameters_2.metadata_reg_c_0, 
mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(fte_match_param, rule_spec->match_value, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(br_offloads->esw, vport_num)); + + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_value, + outer_headers.cvlan_tag); + + handle = mlx5_add_flow_rules(br_offloads->ingress_ft, rule_spec, &flow_act, &dest, 1); + + kvfree(rule_spec); + return handle; +} + +static struct mlx5_flow_handle * +mlx5_esw_bridge_egress_flow_create(u16 vport_num, u16 esw_owner_vhca_id, const unsigned char *addr, + struct mlx5_esw_bridge_vlan *vlan, + struct mlx5_esw_bridge *bridge) +{ + struct mlx5_flow_destination dest = { + .type = MLX5_FLOW_DESTINATION_TYPE_VPORT, + .vport.num = vport_num, + }; + struct mlx5_flow_act flow_act = { + .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + .flags = FLOW_ACT_NO_APPEND, + }; + struct mlx5_flow_spec *rule_spec; + struct mlx5_flow_handle *handle; + u8 *dmac_v, *dmac_c; + + rule_spec = kvzalloc(sizeof(*rule_spec), GFP_KERNEL); + if (!rule_spec) + return ERR_PTR(-ENOMEM); + + if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) && + vport_num == MLX5_VPORT_UPLINK) + rule_spec->flow_context.flow_source = + MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT; + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, + outer_headers.dmac_47_16); + ether_addr_copy(dmac_v, addr); + dmac_c = MLX5_ADDR_OF(fte_match_param, rule_spec->match_criteria, + outer_headers.dmac_47_16); + eth_broadcast_addr(dmac_c); + + if (vlan) { + if (vlan->pkt_reformat_pop) { + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_act.pkt_reformat = vlan->pkt_reformat_pop; + } + + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_value, + outer_headers.cvlan_tag); + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + outer_headers.first_vid); + MLX5_SET(fte_match_param, rule_spec->match_value, outer_headers.first_vid, + vlan->vid); + } + + if (MLX5_CAP_ESW(bridge->br_offloads->esw->dev, merged_eswitch)) { + dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + dest.vport.vhca_id = esw_owner_vhca_id; + } + handle = mlx5_add_flow_rules(bridge->egress_ft, rule_spec, &flow_act, &dest, 1); + + kvfree(rule_spec); + return handle; +} + +static struct mlx5_flow_handle * +mlx5_esw_bridge_egress_miss_flow_create(struct mlx5_flow_table *egress_ft, + struct mlx5_flow_table *skip_ft, + struct mlx5_pkt_reformat *pkt_reformat) +{ + struct mlx5_flow_destination dest = { + .type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE, + .ft = skip_ft, + }; + struct mlx5_flow_act flow_act = { + .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT, + .flags = FLOW_ACT_NO_APPEND, + .pkt_reformat = pkt_reformat, + }; + struct mlx5_flow_spec *rule_spec; + struct mlx5_flow_handle *handle; + + rule_spec = kvzalloc(sizeof(*rule_spec), GFP_KERNEL); + if (!rule_spec) + return ERR_PTR(-ENOMEM); + + rule_spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + + MLX5_SET(fte_match_param, rule_spec->match_criteria, + misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK); + MLX5_SET(fte_match_param, rule_spec->match_value, misc_parameters_2.metadata_reg_c_1, + ESW_TUN_BRIDGE_INGRESS_PUSH_VLAN_MARK); + + handle = mlx5_add_flow_rules(egress_ft, rule_spec, &flow_act, 
&dest, 1); + + kvfree(rule_spec); + return handle; +} + +static struct mlx5_esw_bridge *mlx5_esw_bridge_create(int ifindex, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_esw_bridge *bridge; + int err; + + bridge = kvzalloc(sizeof(*bridge), GFP_KERNEL); + if (!bridge) + return ERR_PTR(-ENOMEM); + + bridge->br_offloads = br_offloads; + err = mlx5_esw_bridge_egress_table_init(br_offloads, bridge); + if (err) + goto err_egress_tbl; + + err = rhashtable_init(&bridge->fdb_ht, &fdb_ht_params); + if (err) + goto err_fdb_ht; + + INIT_LIST_HEAD(&bridge->fdb_list); + bridge->ifindex = ifindex; + bridge->refcnt = 1; + bridge->ageing_time = clock_t_to_jiffies(BR_DEFAULT_AGEING_TIME); + list_add(&bridge->list, &br_offloads->bridges); + + return bridge; + +err_fdb_ht: + mlx5_esw_bridge_egress_table_cleanup(bridge); +err_egress_tbl: + kvfree(bridge); + return ERR_PTR(err); +} + +static void mlx5_esw_bridge_get(struct mlx5_esw_bridge *bridge) +{ + bridge->refcnt++; +} + +static void mlx5_esw_bridge_put(struct mlx5_esw_bridge_offloads *br_offloads, + struct mlx5_esw_bridge *bridge) +{ + if (--bridge->refcnt) + return; + + mlx5_esw_bridge_egress_table_cleanup(bridge); + list_del(&bridge->list); + rhashtable_destroy(&bridge->fdb_ht); + kvfree(bridge); + + if (list_empty(&br_offloads->bridges)) + mlx5_esw_bridge_ingress_table_cleanup(br_offloads); +} + +static struct mlx5_esw_bridge * +mlx5_esw_bridge_lookup(int ifindex, struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_esw_bridge *bridge; + + ASSERT_RTNL(); + + list_for_each_entry(bridge, &br_offloads->bridges, list) { + if (bridge->ifindex == ifindex) { + mlx5_esw_bridge_get(bridge); + return bridge; + } + } + + if (!br_offloads->ingress_ft) { + int err = mlx5_esw_bridge_ingress_table_init(br_offloads); + + if (err) + return ERR_PTR(err); + } + + bridge = mlx5_esw_bridge_create(ifindex, br_offloads); + if (IS_ERR(bridge) && list_empty(&br_offloads->bridges)) + mlx5_esw_bridge_ingress_table_cleanup(br_offloads); + return bridge; +} + +static unsigned long mlx5_esw_bridge_port_key_from_data(u16 vport_num, u16 esw_owner_vhca_id) +{ + return vport_num | (unsigned long)esw_owner_vhca_id << sizeof(vport_num) * BITS_PER_BYTE; +} + +static unsigned long mlx5_esw_bridge_port_key(struct mlx5_esw_bridge_port *port) +{ + return mlx5_esw_bridge_port_key_from_data(port->vport_num, port->esw_owner_vhca_id); +} + +static int mlx5_esw_bridge_port_insert(struct mlx5_esw_bridge_port *port, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + return xa_insert(&br_offloads->ports, mlx5_esw_bridge_port_key(port), port, GFP_KERNEL); +} + +static struct mlx5_esw_bridge_port * +mlx5_esw_bridge_port_lookup(u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + return xa_load(&br_offloads->ports, mlx5_esw_bridge_port_key_from_data(vport_num, + esw_owner_vhca_id)); +} + +static void mlx5_esw_bridge_port_erase(struct mlx5_esw_bridge_port *port, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + xa_erase(&br_offloads->ports, mlx5_esw_bridge_port_key(port)); +} + +static void mlx5_esw_bridge_fdb_entry_refresh(struct mlx5_esw_bridge_fdb_entry *entry) +{ + trace_mlx5_esw_bridge_fdb_entry_refresh(entry); + + mlx5_esw_bridge_fdb_offload_notify(entry->dev, entry->key.addr, + entry->key.vid, + SWITCHDEV_FDB_ADD_TO_BRIDGE); +} + +static void +mlx5_esw_bridge_fdb_entry_cleanup(struct mlx5_esw_bridge_fdb_entry *entry, + struct mlx5_esw_bridge *bridge) +{ + trace_mlx5_esw_bridge_fdb_entry_cleanup(entry); + + 
rhashtable_remove_fast(&bridge->fdb_ht, &entry->ht_node, fdb_ht_params); + mlx5_del_flow_rules(entry->egress_handle); + if (entry->filter_handle) + mlx5_del_flow_rules(entry->filter_handle); + mlx5_del_flow_rules(entry->ingress_handle); + mlx5_fc_destroy(bridge->br_offloads->esw->dev, entry->ingress_counter); + list_del(&entry->vlan_list); + list_del(&entry->list); + kvfree(entry); +} + +static void +mlx5_esw_bridge_fdb_entry_notify_and_cleanup(struct mlx5_esw_bridge_fdb_entry *entry, + struct mlx5_esw_bridge *bridge) +{ + mlx5_esw_bridge_fdb_del_notify(entry); + mlx5_esw_bridge_fdb_entry_cleanup(entry, bridge); +} + +static void mlx5_esw_bridge_fdb_flush(struct mlx5_esw_bridge *bridge) +{ + struct mlx5_esw_bridge_fdb_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &bridge->fdb_list, list) + mlx5_esw_bridge_fdb_entry_notify_and_cleanup(entry, bridge); +} + +static struct mlx5_esw_bridge_vlan * +mlx5_esw_bridge_vlan_lookup(u16 vid, struct mlx5_esw_bridge_port *port) +{ + return xa_load(&port->vlans, vid); +} + +static int +mlx5_esw_bridge_vlan_push_create(struct mlx5_esw_bridge_vlan *vlan, struct mlx5_eswitch *esw) +{ + struct { + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + } vlan_hdr = { htons(ETH_P_8021Q), htons(vlan->vid) }; + struct mlx5_pkt_reformat_params reformat_params = {}; + struct mlx5_pkt_reformat *pkt_reformat; + + if (!BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat_insert)) || + MLX5_CAP_GEN_2(esw->dev, max_reformat_insert_size) < sizeof(vlan_hdr) || + MLX5_CAP_GEN_2(esw->dev, max_reformat_insert_offset) < + offsetof(struct vlan_ethhdr, h_vlan_proto)) { + esw_warn(esw->dev, "Packet reformat INSERT_HEADER is not supported\n"); + return -EOPNOTSUPP; + } + + reformat_params.type = MLX5_REFORMAT_TYPE_INSERT_HDR; + reformat_params.param_0 = MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START; + reformat_params.param_1 = offsetof(struct vlan_ethhdr, h_vlan_proto); + reformat_params.size = sizeof(vlan_hdr); + reformat_params.data = &vlan_hdr; + pkt_reformat = mlx5_packet_reformat_alloc(esw->dev, + &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(pkt_reformat)) { + esw_warn(esw->dev, "Failed to alloc packet reformat INSERT_HEADER (err=%ld)\n", + PTR_ERR(pkt_reformat)); + return PTR_ERR(pkt_reformat); + } + + vlan->pkt_reformat_push = pkt_reformat; + return 0; +} + +static void +mlx5_esw_bridge_vlan_push_cleanup(struct mlx5_esw_bridge_vlan *vlan, struct mlx5_eswitch *esw) +{ + mlx5_packet_reformat_dealloc(esw->dev, vlan->pkt_reformat_push); + vlan->pkt_reformat_push = NULL; +} + +static int +mlx5_esw_bridge_vlan_pop_create(struct mlx5_esw_bridge_vlan *vlan, struct mlx5_eswitch *esw) +{ + struct mlx5_pkt_reformat *pkt_reformat; + + if (!mlx5_esw_bridge_pkt_reformat_vlan_pop_supported(esw)) { + esw_warn(esw->dev, "Packet reformat REMOVE_HEADER is not supported\n"); + return -EOPNOTSUPP; + } + + pkt_reformat = mlx5_esw_bridge_pkt_reformat_vlan_pop_create(esw); + if (IS_ERR(pkt_reformat)) { + esw_warn(esw->dev, "Failed to alloc packet reformat REMOVE_HEADER (err=%ld)\n", + PTR_ERR(pkt_reformat)); + return PTR_ERR(pkt_reformat); + } + + vlan->pkt_reformat_pop = pkt_reformat; + return 0; +} + +static void +mlx5_esw_bridge_vlan_pop_cleanup(struct mlx5_esw_bridge_vlan *vlan, struct mlx5_eswitch *esw) +{ + mlx5_packet_reformat_dealloc(esw->dev, vlan->pkt_reformat_pop); + vlan->pkt_reformat_pop = NULL; +} + +static int +mlx5_esw_bridge_vlan_push_mark_create(struct mlx5_esw_bridge_vlan *vlan, struct mlx5_eswitch *esw) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = 
{}; + struct mlx5_modify_hdr *pkt_mod_hdr; + + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_C_1); + MLX5_SET(set_action_in, action, offset, 8); + MLX5_SET(set_action_in, action, length, ESW_TUN_OPTS_BITS + ESW_TUN_ID_BITS); + MLX5_SET(set_action_in, action, data, ESW_TUN_BRIDGE_INGRESS_PUSH_VLAN); + + pkt_mod_hdr = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_FDB, 1, action); + if (IS_ERR(pkt_mod_hdr)) + return PTR_ERR(pkt_mod_hdr); + + vlan->pkt_mod_hdr_push_mark = pkt_mod_hdr; + return 0; +} + +static void +mlx5_esw_bridge_vlan_push_mark_cleanup(struct mlx5_esw_bridge_vlan *vlan, struct mlx5_eswitch *esw) +{ + mlx5_modify_header_dealloc(esw->dev, vlan->pkt_mod_hdr_push_mark); + vlan->pkt_mod_hdr_push_mark = NULL; +} + +static struct mlx5_esw_bridge_vlan * +mlx5_esw_bridge_vlan_create(u16 vid, u16 flags, struct mlx5_esw_bridge_port *port, + struct mlx5_eswitch *esw) +{ + struct mlx5_esw_bridge_vlan *vlan; + int err; + + vlan = kvzalloc(sizeof(*vlan), GFP_KERNEL); + if (!vlan) + return ERR_PTR(-ENOMEM); + + vlan->vid = vid; + vlan->flags = flags; + INIT_LIST_HEAD(&vlan->fdb_list); + + if (flags & BRIDGE_VLAN_INFO_PVID) { + err = mlx5_esw_bridge_vlan_push_create(vlan, esw); + if (err) + goto err_vlan_push; + + err = mlx5_esw_bridge_vlan_push_mark_create(vlan, esw); + if (err) + goto err_vlan_push_mark; + } + if (flags & BRIDGE_VLAN_INFO_UNTAGGED) { + err = mlx5_esw_bridge_vlan_pop_create(vlan, esw); + if (err) + goto err_vlan_pop; + } + + err = xa_insert(&port->vlans, vid, vlan, GFP_KERNEL); + if (err) + goto err_xa_insert; + + trace_mlx5_esw_bridge_vlan_create(vlan); + return vlan; + +err_xa_insert: + if (vlan->pkt_reformat_pop) + mlx5_esw_bridge_vlan_pop_cleanup(vlan, esw); +err_vlan_pop: + if (vlan->pkt_mod_hdr_push_mark) + mlx5_esw_bridge_vlan_push_mark_cleanup(vlan, esw); +err_vlan_push_mark: + if (vlan->pkt_reformat_push) + mlx5_esw_bridge_vlan_push_cleanup(vlan, esw); +err_vlan_push: + kvfree(vlan); + return ERR_PTR(err); +} + +static void mlx5_esw_bridge_vlan_erase(struct mlx5_esw_bridge_port *port, + struct mlx5_esw_bridge_vlan *vlan) +{ + xa_erase(&port->vlans, vlan->vid); +} + +static void mlx5_esw_bridge_vlan_flush(struct mlx5_esw_bridge_vlan *vlan, + struct mlx5_esw_bridge *bridge) +{ + struct mlx5_eswitch *esw = bridge->br_offloads->esw; + struct mlx5_esw_bridge_fdb_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &vlan->fdb_list, vlan_list) + mlx5_esw_bridge_fdb_entry_notify_and_cleanup(entry, bridge); + + if (vlan->pkt_reformat_pop) + mlx5_esw_bridge_vlan_pop_cleanup(vlan, esw); + if (vlan->pkt_mod_hdr_push_mark) + mlx5_esw_bridge_vlan_push_mark_cleanup(vlan, esw); + if (vlan->pkt_reformat_push) + mlx5_esw_bridge_vlan_push_cleanup(vlan, esw); +} + +static void mlx5_esw_bridge_vlan_cleanup(struct mlx5_esw_bridge_port *port, + struct mlx5_esw_bridge_vlan *vlan, + struct mlx5_esw_bridge *bridge) +{ + trace_mlx5_esw_bridge_vlan_cleanup(vlan); + mlx5_esw_bridge_vlan_flush(vlan, bridge); + mlx5_esw_bridge_vlan_erase(port, vlan); + kvfree(vlan); +} + +static void mlx5_esw_bridge_port_vlans_flush(struct mlx5_esw_bridge_port *port, + struct mlx5_esw_bridge *bridge) +{ + struct mlx5_esw_bridge_vlan *vlan; + unsigned long index; + + xa_for_each(&port->vlans, index, vlan) + mlx5_esw_bridge_vlan_cleanup(port, vlan, bridge); +} + +static struct mlx5_esw_bridge_vlan * +mlx5_esw_bridge_port_vlan_lookup(u16 vid, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge 
*bridge, struct mlx5_eswitch *esw) +{ + struct mlx5_esw_bridge_port *port; + struct mlx5_esw_bridge_vlan *vlan; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, bridge->br_offloads); + if (!port) { + /* FDB is added asynchronously on wq while port might have been deleted + * concurrently. Report on 'info' logging level and skip the FDB offload. + */ + esw_info(esw->dev, "Failed to lookup bridge port (vport=%u)\n", vport_num); + return ERR_PTR(-EINVAL); + } + + vlan = mlx5_esw_bridge_vlan_lookup(vid, port); + if (!vlan) { + /* FDB is added asynchronously on wq while vlan might have been deleted + * concurrently. Report on 'info' logging level and skip the FDB offload. + */ + esw_info(esw->dev, "Failed to lookup bridge port vlan metadata (vport=%u)\n", + vport_num); + return ERR_PTR(-EINVAL); + } + + return vlan; +} + +static struct mlx5_esw_bridge_fdb_entry * +mlx5_esw_bridge_fdb_lookup(struct mlx5_esw_bridge *bridge, + const unsigned char *addr, u16 vid) +{ + struct mlx5_esw_bridge_fdb_key key = {}; + + ether_addr_copy(key.addr, addr); + key.vid = vid; + return rhashtable_lookup_fast(&bridge->fdb_ht, &key, fdb_ht_params); +} + +static struct mlx5_esw_bridge_fdb_entry * +mlx5_esw_bridge_fdb_entry_init(struct net_device *dev, u16 vport_num, u16 esw_owner_vhca_id, + const unsigned char *addr, u16 vid, bool added_by_user, bool peer, + struct mlx5_eswitch *esw, struct mlx5_esw_bridge *bridge) +{ + struct mlx5_esw_bridge_vlan *vlan = NULL; + struct mlx5_esw_bridge_fdb_entry *entry; + struct mlx5_flow_handle *handle; + struct mlx5_fc *counter; + int err; + + if (bridge->flags & MLX5_ESW_BRIDGE_VLAN_FILTERING_FLAG && vid) { + vlan = mlx5_esw_bridge_port_vlan_lookup(vid, vport_num, esw_owner_vhca_id, bridge, + esw); + if (IS_ERR(vlan)) + return ERR_CAST(vlan); + } + + entry = mlx5_esw_bridge_fdb_lookup(bridge, addr, vid); + if (entry) + mlx5_esw_bridge_fdb_entry_notify_and_cleanup(entry, bridge); + + entry = kvzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + ether_addr_copy(entry->key.addr, addr); + entry->key.vid = vid; + entry->dev = dev; + entry->vport_num = vport_num; + entry->esw_owner_vhca_id = esw_owner_vhca_id; + entry->lastuse = jiffies; + if (added_by_user) + entry->flags |= MLX5_ESW_BRIDGE_FLAG_ADDED_BY_USER; + if (peer) + entry->flags |= MLX5_ESW_BRIDGE_FLAG_PEER; + + counter = mlx5_fc_create(esw->dev, true); + if (IS_ERR(counter)) { + err = PTR_ERR(counter); + goto err_ingress_fc_create; + } + entry->ingress_counter = counter; + + handle = peer ? 
+ mlx5_esw_bridge_ingress_flow_peer_create(vport_num, addr, vlan, + mlx5_fc_id(counter), bridge) : + mlx5_esw_bridge_ingress_flow_create(vport_num, addr, vlan, + mlx5_fc_id(counter), bridge); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + esw_warn(esw->dev, "Failed to create ingress flow(vport=%u,err=%d)\n", + vport_num, err); + goto err_ingress_flow_create; + } + entry->ingress_handle = handle; + + if (bridge->flags & MLX5_ESW_BRIDGE_VLAN_FILTERING_FLAG) { + handle = mlx5_esw_bridge_ingress_filter_flow_create(vport_num, addr, bridge); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + esw_warn(esw->dev, "Failed to create ingress filter(vport=%u,err=%d)\n", + vport_num, err); + goto err_ingress_filter_flow_create; + } + entry->filter_handle = handle; + } + + handle = mlx5_esw_bridge_egress_flow_create(vport_num, esw_owner_vhca_id, addr, vlan, + bridge); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + esw_warn(esw->dev, "Failed to create egress flow(vport=%u,err=%d)\n", + vport_num, err); + goto err_egress_flow_create; + } + entry->egress_handle = handle; + + err = rhashtable_insert_fast(&bridge->fdb_ht, &entry->ht_node, fdb_ht_params); + if (err) { + esw_warn(esw->dev, "Failed to insert FDB flow(vport=%u,err=%d)\n", vport_num, err); + goto err_ht_init; + } + + if (vlan) + list_add(&entry->vlan_list, &vlan->fdb_list); + else + INIT_LIST_HEAD(&entry->vlan_list); + list_add(&entry->list, &bridge->fdb_list); + + trace_mlx5_esw_bridge_fdb_entry_init(entry); + return entry; + +err_ht_init: + mlx5_del_flow_rules(entry->egress_handle); +err_egress_flow_create: + if (entry->filter_handle) + mlx5_del_flow_rules(entry->filter_handle); +err_ingress_filter_flow_create: + mlx5_del_flow_rules(entry->ingress_handle); +err_ingress_flow_create: + mlx5_fc_destroy(esw->dev, entry->ingress_counter); +err_ingress_fc_create: + kvfree(entry); + return ERR_PTR(err); +} + +int mlx5_esw_bridge_ageing_time_set(u16 vport_num, u16 esw_owner_vhca_id, unsigned long ageing_time, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_esw_bridge_port *port; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port) + return -EINVAL; + + port->bridge->ageing_time = clock_t_to_jiffies(ageing_time); + return 0; +} + +int mlx5_esw_bridge_vlan_filtering_set(u16 vport_num, u16 esw_owner_vhca_id, bool enable, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_esw_bridge_port *port; + struct mlx5_esw_bridge *bridge; + bool filtering; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port) + return -EINVAL; + + bridge = port->bridge; + filtering = bridge->flags & MLX5_ESW_BRIDGE_VLAN_FILTERING_FLAG; + if (filtering == enable) + return 0; + + mlx5_esw_bridge_fdb_flush(bridge); + if (enable) + bridge->flags |= MLX5_ESW_BRIDGE_VLAN_FILTERING_FLAG; + else + bridge->flags &= ~MLX5_ESW_BRIDGE_VLAN_FILTERING_FLAG; + + return 0; +} + +static int mlx5_esw_bridge_vport_init(u16 vport_num, u16 esw_owner_vhca_id, u16 flags, + struct mlx5_esw_bridge_offloads *br_offloads, + struct mlx5_esw_bridge *bridge) +{ + struct mlx5_eswitch *esw = br_offloads->esw; + struct mlx5_esw_bridge_port *port; + int err; + + port = kvzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + port->vport_num = vport_num; + port->esw_owner_vhca_id = esw_owner_vhca_id; + port->bridge = bridge; + port->flags |= flags; + xa_init(&port->vlans); + err = mlx5_esw_bridge_port_insert(port, br_offloads); + if (err) { + esw_warn(esw->dev, + "Failed to insert 
port metadata (vport=%u,esw_owner_vhca_id=%u,err=%d)\n", + port->vport_num, port->esw_owner_vhca_id, err); + goto err_port_insert; + } + trace_mlx5_esw_bridge_vport_init(port); + + return 0; + +err_port_insert: + kvfree(port); + return err; +} + +static int mlx5_esw_bridge_vport_cleanup(struct mlx5_esw_bridge_offloads *br_offloads, + struct mlx5_esw_bridge_port *port) +{ + u16 vport_num = port->vport_num, esw_owner_vhca_id = port->esw_owner_vhca_id; + struct mlx5_esw_bridge *bridge = port->bridge; + struct mlx5_esw_bridge_fdb_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &bridge->fdb_list, list) + if (entry->vport_num == vport_num && entry->esw_owner_vhca_id == esw_owner_vhca_id) + mlx5_esw_bridge_fdb_entry_cleanup(entry, bridge); + + trace_mlx5_esw_bridge_vport_cleanup(port); + mlx5_esw_bridge_port_vlans_flush(port, bridge); + mlx5_esw_bridge_port_erase(port, br_offloads); + kvfree(port); + mlx5_esw_bridge_put(br_offloads, bridge); + return 0; +} + +static int mlx5_esw_bridge_vport_link_with_flags(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + u16 flags, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_bridge *bridge; + int err; + + bridge = mlx5_esw_bridge_lookup(ifindex, br_offloads); + if (IS_ERR(bridge)) { + NL_SET_ERR_MSG_MOD(extack, "Error checking for existing bridge with same ifindex"); + return PTR_ERR(bridge); + } + + err = mlx5_esw_bridge_vport_init(vport_num, esw_owner_vhca_id, flags, br_offloads, bridge); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Error initializing port"); + goto err_vport; + } + return 0; + +err_vport: + mlx5_esw_bridge_put(br_offloads, bridge); + return err; +} + +int mlx5_esw_bridge_vport_link(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack) +{ + return mlx5_esw_bridge_vport_link_with_flags(ifindex, vport_num, esw_owner_vhca_id, 0, + br_offloads, extack); +} + +int mlx5_esw_bridge_vport_unlink(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_bridge_port *port; + int err; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port) { + NL_SET_ERR_MSG_MOD(extack, "Port is not attached to any bridge"); + return -EINVAL; + } + if (port->bridge->ifindex != ifindex) { + NL_SET_ERR_MSG_MOD(extack, "Port is attached to another bridge"); + return -EINVAL; + } + + err = mlx5_esw_bridge_vport_cleanup(br_offloads, port); + if (err) + NL_SET_ERR_MSG_MOD(extack, "Port cleanup failed"); + return err; +} + +int mlx5_esw_bridge_vport_peer_link(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack) +{ + if (!MLX5_CAP_ESW(br_offloads->esw->dev, merged_eswitch)) + return 0; + + return mlx5_esw_bridge_vport_link_with_flags(ifindex, vport_num, esw_owner_vhca_id, + MLX5_ESW_BRIDGE_PORT_FLAG_PEER, + br_offloads, extack); +} + +int mlx5_esw_bridge_vport_peer_unlink(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack) +{ + return mlx5_esw_bridge_vport_unlink(ifindex, vport_num, esw_owner_vhca_id, br_offloads, + extack); +} + +int mlx5_esw_bridge_port_vlan_add(u16 vport_num, u16 esw_owner_vhca_id, u16 vid, u16 flags, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_bridge_port 
*port; + struct mlx5_esw_bridge_vlan *vlan; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port) + return -EINVAL; + + vlan = mlx5_esw_bridge_vlan_lookup(vid, port); + if (vlan) { + if (vlan->flags == flags) + return 0; + mlx5_esw_bridge_vlan_cleanup(port, vlan, port->bridge); + } + + vlan = mlx5_esw_bridge_vlan_create(vid, flags, port, br_offloads->esw); + if (IS_ERR(vlan)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to create VLAN entry"); + return PTR_ERR(vlan); + } + return 0; +} + +void mlx5_esw_bridge_port_vlan_del(u16 vport_num, u16 esw_owner_vhca_id, u16 vid, + struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_esw_bridge_port *port; + struct mlx5_esw_bridge_vlan *vlan; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port) + return; + + vlan = mlx5_esw_bridge_vlan_lookup(vid, port); + if (!vlan) + return; + mlx5_esw_bridge_vlan_cleanup(port, vlan, port->bridge); +} + +void mlx5_esw_bridge_fdb_update_used(struct net_device *dev, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct switchdev_notifier_fdb_info *fdb_info) +{ + struct mlx5_esw_bridge_fdb_entry *entry; + struct mlx5_esw_bridge_port *port; + struct mlx5_esw_bridge *bridge; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port || port->flags & MLX5_ESW_BRIDGE_PORT_FLAG_PEER) + return; + + bridge = port->bridge; + entry = mlx5_esw_bridge_fdb_lookup(bridge, fdb_info->addr, fdb_info->vid); + if (!entry) { + esw_debug(br_offloads->esw->dev, + "FDB entry with specified key not found (MAC=%pM,vid=%u,vport=%u)\n", + fdb_info->addr, fdb_info->vid, vport_num); + return; + } + + entry->lastuse = jiffies; +} + +void mlx5_esw_bridge_fdb_create(struct net_device *dev, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct switchdev_notifier_fdb_info *fdb_info) +{ + struct mlx5_esw_bridge_fdb_entry *entry; + struct mlx5_esw_bridge_port *port; + struct mlx5_esw_bridge *bridge; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port) + return; + + bridge = port->bridge; + entry = mlx5_esw_bridge_fdb_entry_init(dev, vport_num, esw_owner_vhca_id, fdb_info->addr, + fdb_info->vid, fdb_info->added_by_user, + port->flags & MLX5_ESW_BRIDGE_PORT_FLAG_PEER, + br_offloads->esw, bridge); + if (IS_ERR(entry)) + return; + + if (entry->flags & MLX5_ESW_BRIDGE_FLAG_ADDED_BY_USER) + mlx5_esw_bridge_fdb_offload_notify(dev, entry->key.addr, entry->key.vid, + SWITCHDEV_FDB_OFFLOADED); + else if (!(entry->flags & MLX5_ESW_BRIDGE_FLAG_PEER)) + /* Take over dynamic entries to prevent kernel bridge from aging them out. 
*/ + mlx5_esw_bridge_fdb_offload_notify(dev, entry->key.addr, entry->key.vid, + SWITCHDEV_FDB_ADD_TO_BRIDGE); +} + +void mlx5_esw_bridge_fdb_remove(struct net_device *dev, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct switchdev_notifier_fdb_info *fdb_info) +{ + struct mlx5_eswitch *esw = br_offloads->esw; + struct mlx5_esw_bridge_fdb_entry *entry; + struct mlx5_esw_bridge_port *port; + struct mlx5_esw_bridge *bridge; + + port = mlx5_esw_bridge_port_lookup(vport_num, esw_owner_vhca_id, br_offloads); + if (!port) + return; + + bridge = port->bridge; + entry = mlx5_esw_bridge_fdb_lookup(bridge, fdb_info->addr, fdb_info->vid); + if (!entry) { + esw_warn(esw->dev, + "FDB entry with specified key not found (MAC=%pM,vid=%u,vport=%u)\n", + fdb_info->addr, fdb_info->vid, vport_num); + return; + } + + mlx5_esw_bridge_fdb_entry_notify_and_cleanup(entry, bridge); +} + +void mlx5_esw_bridge_update(struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_esw_bridge_fdb_entry *entry, *tmp; + struct mlx5_esw_bridge *bridge; + + list_for_each_entry(bridge, &br_offloads->bridges, list) { + list_for_each_entry_safe(entry, tmp, &bridge->fdb_list, list) { + unsigned long lastuse = + (unsigned long)mlx5_fc_query_lastuse(entry->ingress_counter); + + if (entry->flags & MLX5_ESW_BRIDGE_FLAG_ADDED_BY_USER) + continue; + + if (time_after(lastuse, entry->lastuse)) + mlx5_esw_bridge_fdb_entry_refresh(entry); + else if (!(entry->flags & MLX5_ESW_BRIDGE_FLAG_PEER) && + time_is_before_jiffies(entry->lastuse + bridge->ageing_time)) + mlx5_esw_bridge_fdb_entry_notify_and_cleanup(entry, bridge); + } + } +} + +static void mlx5_esw_bridge_flush(struct mlx5_esw_bridge_offloads *br_offloads) +{ + struct mlx5_esw_bridge_port *port; + unsigned long i; + + xa_for_each(&br_offloads->ports, i, port) + mlx5_esw_bridge_vport_cleanup(br_offloads, port); + + WARN_ONCE(!list_empty(&br_offloads->bridges), + "Cleaning up bridge offloads while still having bridges attached\n"); +} + +struct mlx5_esw_bridge_offloads *mlx5_esw_bridge_init(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_bridge_offloads *br_offloads; + + ASSERT_RTNL(); + + br_offloads = kvzalloc(sizeof(*br_offloads), GFP_KERNEL); + if (!br_offloads) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&br_offloads->bridges); + xa_init(&br_offloads->ports); + br_offloads->esw = esw; + esw->br_offloads = br_offloads; + + return br_offloads; +} + +void mlx5_esw_bridge_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_bridge_offloads *br_offloads = esw->br_offloads; + + ASSERT_RTNL(); + + if (!br_offloads) + return; + + mlx5_esw_bridge_flush(br_offloads); + WARN_ON(!xa_empty(&br_offloads->ports)); + + esw->br_offloads = NULL; + kvfree(br_offloads); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h new file mode 100644 index 0000000..efc3997 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#ifndef __MLX5_ESW_BRIDGE_H__ +#define __MLX5_ESW_BRIDGE_H__ + +#include +#include +#include +#include +#include "eswitch.h" + +struct mlx5_flow_table; +struct mlx5_flow_group; + +struct mlx5_esw_bridge_offloads { + struct mlx5_eswitch *esw; + struct list_head bridges; + struct xarray ports; + + struct notifier_block netdev_nb; + struct notifier_block nb_blk; + struct notifier_block nb; + struct workqueue_struct *wq; + struct delayed_work update_work; + + struct mlx5_flow_table *ingress_ft; + struct mlx5_flow_group *ingress_vlan_fg; + struct mlx5_flow_group *ingress_filter_fg; + struct mlx5_flow_group *ingress_mac_fg; + + struct mlx5_flow_table *skip_ft; +}; + +struct mlx5_esw_bridge_offloads *mlx5_esw_bridge_init(struct mlx5_eswitch *esw); +void mlx5_esw_bridge_cleanup(struct mlx5_eswitch *esw); +int mlx5_esw_bridge_vport_link(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack); +int mlx5_esw_bridge_vport_unlink(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack); +int mlx5_esw_bridge_vport_peer_link(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack); +int mlx5_esw_bridge_vport_peer_unlink(int ifindex, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack); +void mlx5_esw_bridge_fdb_update_used(struct net_device *dev, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct switchdev_notifier_fdb_info *fdb_info); +void mlx5_esw_bridge_fdb_create(struct net_device *dev, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct switchdev_notifier_fdb_info *fdb_info); +void mlx5_esw_bridge_fdb_remove(struct net_device *dev, u16 vport_num, u16 esw_owner_vhca_id, + struct mlx5_esw_bridge_offloads *br_offloads, + struct switchdev_notifier_fdb_info *fdb_info); +void mlx5_esw_bridge_update(struct mlx5_esw_bridge_offloads *br_offloads); +int mlx5_esw_bridge_ageing_time_set(u16 vport_num, u16 esw_owner_vhca_id, unsigned long ageing_time, + struct mlx5_esw_bridge_offloads *br_offloads); +int mlx5_esw_bridge_vlan_filtering_set(u16 vport_num, u16 esw_owner_vhca_id, bool enable, + struct mlx5_esw_bridge_offloads *br_offloads); +int mlx5_esw_bridge_port_vlan_add(u16 vport_num, u16 esw_owner_vhca_id, u16 vid, u16 flags, + struct mlx5_esw_bridge_offloads *br_offloads, + struct netlink_ext_ack *extack); +void mlx5_esw_bridge_port_vlan_del(u16 vport_num, u16 esw_owner_vhca_id, u16 vid, + struct mlx5_esw_bridge_offloads *br_offloads); + +#endif /* __MLX5_ESW_BRIDGE_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_priv.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_priv.h new file mode 100644 index 0000000..878311f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_priv.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#ifndef _MLX5_ESW_BRIDGE_PRIVATE_ +#define _MLX5_ESW_BRIDGE_PRIVATE_ + +#include +#include +#include +#include +#include +#include +#include "fs_core.h" + +struct mlx5_esw_bridge_fdb_key { + unsigned char addr[ETH_ALEN]; + u16 vid; +}; + +enum { + MLX5_ESW_BRIDGE_FLAG_ADDED_BY_USER = BIT(0), + MLX5_ESW_BRIDGE_FLAG_PEER = BIT(1), +}; + +enum { + MLX5_ESW_BRIDGE_PORT_FLAG_PEER = BIT(0), +}; + +struct mlx5_esw_bridge_fdb_entry { + struct mlx5_esw_bridge_fdb_key key; + struct rhash_head ht_node; + struct net_device *dev; + struct list_head list; + struct list_head vlan_list; + u16 vport_num; + u16 esw_owner_vhca_id; + u16 flags; + + struct mlx5_flow_handle *ingress_handle; + struct mlx5_fc *ingress_counter; + unsigned long lastuse; + struct mlx5_flow_handle *egress_handle; + struct mlx5_flow_handle *filter_handle; +}; + +struct mlx5_esw_bridge_vlan { + u16 vid; + u16 flags; + struct list_head fdb_list; + struct mlx5_pkt_reformat *pkt_reformat_push; + struct mlx5_pkt_reformat *pkt_reformat_pop; + struct mlx5_modify_hdr *pkt_mod_hdr_push_mark; +}; + +struct mlx5_esw_bridge_port { + u16 vport_num; + u16 esw_owner_vhca_id; + u16 flags; + struct mlx5_esw_bridge *bridge; + struct xarray vlans; +}; + +#endif /* _MLX5_ESW_BRIDGE_PRIVATE_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/debugfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/debugfs.c new file mode 100644 index 0000000..2db13c7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/debugfs.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include +#include "eswitch.h" + +enum vnic_diag_counter { + MLX5_VNIC_DIAG_TOTAL_Q_UNDER_PROCESSOR_HANDLE, + MLX5_VNIC_DIAG_SEND_QUEUE_PRIORITY_UPDATE_FLOW, + MLX5_VNIC_DIAG_COMP_EQ_OVERRUN, + MLX5_VNIC_DIAG_ASYNC_EQ_OVERRUN, + MLX5_VNIC_DIAG_CQ_OVERRUN, + MLX5_VNIC_DIAG_INVALID_COMMAND, + MLX5_VNIC_DIAG_QOUTA_EXCEEDED_COMMAND, +}; + +static int mlx5_esw_query_vnic_diag(struct mlx5_vport *vport, enum vnic_diag_counter counter, + u32 *val) +{ + u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; + struct mlx5_core_dev *dev = vport->dev; + u16 vport_num = vport->vport; + void *vnic_diag_out; + int err; + + MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV); + MLX5_SET(query_vnic_env_in, in, vport_number, vport_num); + if (!mlx5_esw_is_manager_vport(dev->priv.eswitch, vport_num)) + MLX5_SET(query_vnic_env_in, in, other_vport, 1); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + vnic_diag_out = MLX5_ADDR_OF(query_vnic_env_out, out, vport_env); + switch (counter) { + case MLX5_VNIC_DIAG_TOTAL_Q_UNDER_PROCESSOR_HANDLE: + *val = MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, total_error_queues); + break; + case MLX5_VNIC_DIAG_SEND_QUEUE_PRIORITY_UPDATE_FLOW: + *val = MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, + send_queue_priority_update_flow); + break; + case MLX5_VNIC_DIAG_COMP_EQ_OVERRUN: + *val = MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, comp_eq_overrun); + break; + case MLX5_VNIC_DIAG_ASYNC_EQ_OVERRUN: + *val = MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, async_eq_overrun); + break; + case MLX5_VNIC_DIAG_CQ_OVERRUN: + *val = MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, cq_overrun); + break; + case MLX5_VNIC_DIAG_INVALID_COMMAND: + *val = 
MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, invalid_command); + break; + case MLX5_VNIC_DIAG_QOUTA_EXCEEDED_COMMAND: + *val = MLX5_GET(vnic_diagnostic_statistics, vnic_diag_out, quota_exceeded_command); + break; + } + + return 0; +} + +static int __show_vnic_diag(struct seq_file *file, struct mlx5_vport *vport, + enum vnic_diag_counter type) +{ + u32 val = 0; + int ret; + + ret = mlx5_esw_query_vnic_diag(vport, type, &val); + if (ret) + return ret; + + seq_printf(file, "%d\n", val); + return 0; +} + +static int total_q_under_processor_handle_show(struct seq_file *file, void *priv) +{ + return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_TOTAL_Q_UNDER_PROCESSOR_HANDLE); +} + +static int send_queue_priority_update_flow_show(struct seq_file *file, void *priv) +{ + return __show_vnic_diag(file, file->private, + MLX5_VNIC_DIAG_SEND_QUEUE_PRIORITY_UPDATE_FLOW); +} + +static int comp_eq_overrun_show(struct seq_file *file, void *priv) +{ + return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_COMP_EQ_OVERRUN); +} + +static int async_eq_overrun_show(struct seq_file *file, void *priv) +{ + return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_ASYNC_EQ_OVERRUN); +} + +static int cq_overrun_show(struct seq_file *file, void *priv) +{ + return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_CQ_OVERRUN); +} + +static int invalid_command_show(struct seq_file *file, void *priv) +{ + return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_INVALID_COMMAND); +} + +static int quota_exceeded_command_show(struct seq_file *file, void *priv) +{ + return __show_vnic_diag(file, file->private, MLX5_VNIC_DIAG_QOUTA_EXCEEDED_COMMAND); +} + +DEFINE_SHOW_ATTRIBUTE(total_q_under_processor_handle); +DEFINE_SHOW_ATTRIBUTE(send_queue_priority_update_flow); +DEFINE_SHOW_ATTRIBUTE(comp_eq_overrun); +DEFINE_SHOW_ATTRIBUTE(async_eq_overrun); +DEFINE_SHOW_ATTRIBUTE(cq_overrun); +DEFINE_SHOW_ATTRIBUTE(invalid_command); +DEFINE_SHOW_ATTRIBUTE(quota_exceeded_command); + +void mlx5_esw_vport_debugfs_destroy(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + debugfs_remove_recursive(vport->dbgfs); + vport->dbgfs = NULL; +} + +/* vnic diag dir name is "pf", "ecpf" or "{vf/sf}_xxxx" */ +#define VNIC_DIAG_DIR_NAME_MAX_LEN 8 + +void mlx5_esw_vport_debugfs_create(struct mlx5_eswitch *esw, u16 vport_num, bool is_sf, u16 sf_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + struct dentry *vnic_diag; + char dir_name[VNIC_DIAG_DIR_NAME_MAX_LEN]; + int err; + + if (!MLX5_CAP_GEN(esw->dev, vport_group_manager)) + return; + + if (vport_num == MLX5_VPORT_PF) { + strcpy(dir_name, "pf"); + } else if (vport_num == MLX5_VPORT_ECPF) { + strcpy(dir_name, "ecpf"); + } else { + err = snprintf(dir_name, VNIC_DIAG_DIR_NAME_MAX_LEN, "%s_%d", is_sf ? "sf" : "vf", + is_sf ? 
sf_num : vport_num - MLX5_VPORT_FIRST_VF); + if (WARN_ON(err < 0)) + return; + } + + vport->dbgfs = debugfs_create_dir(dir_name, esw->dbgfs); + vnic_diag = debugfs_create_dir("vnic_diag", vport->dbgfs); + + if (MLX5_CAP_GEN(esw->dev, vnic_env_queue_counters)) { + debugfs_create_file("total_q_under_processor_handle", 0444, vnic_diag, vport, + &total_q_under_processor_handle_fops); + debugfs_create_file("send_queue_priority_update_flow", 0444, vnic_diag, vport, + &send_queue_priority_update_flow_fops); + } + + if (MLX5_CAP_GEN(esw->dev, eq_overrun_count)) { + debugfs_create_file("comp_eq_overrun", 0444, vnic_diag, vport, + &comp_eq_overrun_fops); + debugfs_create_file("async_eq_overrun", 0444, vnic_diag, vport, + &async_eq_overrun_fops); + } + + if (MLX5_CAP_GEN(esw->dev, vnic_env_cq_overrun)) + debugfs_create_file("cq_overrun", 0444, vnic_diag, vport, &cq_overrun_fops); + + if (MLX5_CAP_GEN(esw->dev, invalid_command_count)) + debugfs_create_file("invalid_command", 0444, vnic_diag, vport, + &invalid_command_fops); + + if (MLX5_CAP_GEN(esw->dev, quota_exceeded_count)) + debugfs_create_file("quota_exceeded_command", 0444, vnic_diag, vport, + &quota_exceeded_command_fops); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c new file mode 100644 index 0000000..045d451 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd. */ + +#include +#include "eswitch.h" +#include "mlx5_esw_devm.h" + +static void +mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_id *ppid) +{ + u64 parent_id; + + parent_id = mlx5_query_nic_system_image_guid(dev); + ppid->id_len = sizeof(parent_id); + memcpy(ppid->id, &parent_id, sizeof(parent_id)); +} + +static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num) +{ + return vport_num == MLX5_VPORT_UPLINK || + (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) || + mlx5_eswitch_is_vf_vport(esw, vport_num); +} + +static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_core_dev *dev = esw->dev; + struct devlink_port_attrs attrs = {}; + struct netdev_phys_item_id ppid = {}; + struct devlink_port *dl_port; + u32 controller_num = 0; + bool external; + u16 pfnum; + + dl_port = kzalloc(sizeof(*dl_port), GFP_KERNEL); + if (!dl_port) + return NULL; + + mlx5_esw_get_port_parent_id(dev, &ppid); + pfnum = mlx5_get_dev_index(dev); + external = mlx5_core_is_ecpf_esw_manager(dev); + if (external) + controller_num = dev->priv.eswitch->offloads.host_number + 1; + + if (vport_num == MLX5_VPORT_UPLINK) { + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + attrs.phys.port_number = pfnum; + memcpy(attrs.switch_id.id, ppid.id, ppid.id_len); + attrs.switch_id.id_len = ppid.id_len; + devlink_port_attrs_set(dl_port, &attrs); + } else if (vport_num == MLX5_VPORT_PF) { + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); + dl_port->attrs.switch_id.id_len = ppid.id_len; + devlink_port_attrs_pci_pf_set(dl_port, controller_num, pfnum, external); + } else if (mlx5_eswitch_is_vf_vport(esw, vport_num)) { + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); + dl_port->attrs.switch_id.id_len = ppid.id_len; + devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum, + vport_num - 1, 
external); + } + return dl_port; +} + +static void mlx5_esw_dl_port_free(struct devlink_port *dl_port) +{ + kfree(dl_port); +} + +int mlx5_esw_offloads_devlink_port_register(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_core_dev *dev = esw->dev; + struct devlink_port *dl_port; + unsigned int dl_port_index; + struct mlx5_vport *vport; + struct devlink *devlink; + int err; + + if (!mlx5_esw_devlink_port_supported(esw, vport_num)) + return 0; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return PTR_ERR(vport); + + dl_port = mlx5_esw_dl_port_alloc(esw, vport_num); + if (!dl_port) + return -ENOMEM; + + devlink = priv_to_devlink(dev); + dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); + err = devlink_port_register(devlink, dl_port, dl_port_index); + if (err) + goto reg_err; + + err = devlink_rate_leaf_create(dl_port, vport); + if (err) + goto rate_err; + + vport->dl_port = dl_port; + return 0; + +rate_err: + devlink_port_unregister(dl_port); +reg_err: + mlx5_esw_dl_port_free(dl_port); + return err; +} + +void mlx5_esw_offloads_devlink_port_unregister(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport; + + if (!mlx5_esw_devlink_port_supported(esw, vport_num)) + return; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return; + + if (vport->dl_port->devlink_rate) { + if (!test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &esw->dev->intf_state)) + mlx5_esw_qos_vport_update_group(esw, vport, NULL, NULL); + devlink_rate_leaf_destroy(vport->dl_port); + } + + devlink_port_unregister(vport->dl_port); + mlx5_esw_dl_port_free(vport->dl_port); + vport->dl_port = NULL; +} + +struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + return IS_ERR(vport) ? 
ERR_CAST(vport) : vport->dl_port; +} + +int _mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_port *dl_port, + u16 vport_num, u32 controller, u32 sfnum) +{ + struct mlx5_core_dev *dev = esw->dev; + struct netdev_phys_item_id ppid = {}; + unsigned int dl_port_index; + struct mlx5_vport *vport; + struct devlink *devlink; + u16 pfnum; + int err; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return PTR_ERR(vport); + + pfnum = mlx5_get_dev_index(dev); + mlx5_esw_get_port_parent_id(dev, &ppid); + memcpy(dl_port->attrs.switch_id.id, &ppid.id[0], ppid.id_len); + dl_port->attrs.switch_id.id_len = ppid.id_len; + devlink_port_attrs_pci_sf_set(dl_port, controller, pfnum, sfnum, !!controller); + devlink = priv_to_devlink(dev); + dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); + err = devlink_port_register(devlink, dl_port, dl_port_index); + if (err) + return err; + + err = devlink_rate_leaf_create(dl_port, vport); + if (err) + goto rate_err; + + vport->dl_port = dl_port; + return 0; + +rate_err: + devlink_port_unregister(dl_port); + return err; +} + +int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, + struct devlink_port *dl_port, + u16 vport_num, u32 controller, + u32 sfnum) +{ + int err = 0; + + err = _mlx5_esw_devlink_sf_port_register(esw, dl_port, vport_num, + controller, sfnum); + if (err) + return err; + +#if IS_ENABLED(CONFIG_MLXDEVM) + err = mlx5_devm_sf_port_register(esw->dev, vport_num, controller, sfnum, dl_port); +#endif + return err; +} + +void _mlx5_esw_devlink_sf_port_unregister(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return; + + if (vport->dl_port->devlink_rate) { + mlx5_esw_qos_vport_update_group(esw, vport, NULL, NULL); + devlink_rate_leaf_destroy(vport->dl_port); + } + + devlink_port_unregister(vport->dl_port); + vport->dl_port = NULL; +} + +void mlx5_esw_devlink_sf_port_unregister(struct mlx5_eswitch *esw, + u16 vport_num) +{ + _mlx5_esw_devlink_sf_port_unregister(esw, vport_num); +#if IS_ENABLED(CONFIG_MLXDEVM) + mlx5_devm_sf_port_unregister(esw->dev, vport_num); +#endif +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devm_port.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devm_port.c new file mode 100644 index 0000000..11dab62 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/devm_port.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies Ltd. 
*/ + +#include +#include +#include "eswitch.h" +#include "mlx5_esw_devm.h" + +int mlx5_devm_sf_port_register(struct mlx5_core_dev *dev, u16 vport_num, + u32 controller, u32 sfnum, + struct devlink_port *dl_port) +{ + struct mlx5_devm_device *devm_dev; + struct mlxdevm_port_attrs attrs; + struct mlx5_devm_port *port; + unsigned int dl_port_index; + u16 pfnum; + int ret; + + devm_dev = mlx5_devm_device_get(dev); + if (!devm_dev) + return -ENODEV; + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + pfnum = mlx5_get_dev_index(dev); + dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); + port->sfnum = sfnum; + port->port_index = dl_port_index; + port->vport_num = vport_num; + + attrs.flavour = MLXDEVM_PORT_FLAVOUR_PCI_SF; + attrs.pci_sf.controller = controller; + attrs.pci_sf.sf = sfnum; + attrs.pci_sf.pf = pfnum; + mlxdevm_port_attr_set(&port->port, &attrs); + + ret = mlxdevm_port_register(&devm_dev->device, &port->port, dl_port_index); + if (ret) + goto port_err; + + port->port.dl_port = dl_port; + down_write(&devm_dev->port_list_rwsem); + list_add_tail(&port->list, &devm_dev->port_list); + up_write(&devm_dev->port_list_rwsem); + + return 0; + +port_err: + kfree(port); + return ret; +} + +void mlx5_devm_sf_port_unregister(struct mlx5_core_dev *dev, u16 vport_num) +{ + struct mlx5_devm_device *devm_dev; + struct mlx5_devm_port *port; + bool found = false; + + devm_dev = mlx5_devm_device_get(dev); + if (!devm_dev) + return; + + down_write(&devm_dev->port_list_rwsem); + list_for_each_entry(port, &devm_dev->port_list, list) { + if (port->vport_num != vport_num) + continue; + /* found the port */ + list_del(&port->list); + found = true; + break; + } + up_write(&devm_dev->port_list_rwsem); + + WARN_ON(!found); + mlxdevm_port_unregister(&port->port); + kfree(port); +} + +void mlx5_devm_sf_port_type_eth_set(struct mlx5_core_dev *dev, u16 vport_num, + struct net_device *ndev) +{ + struct mlx5_devm_device *devm_dev; + struct mlx5_devm_port *port; + + devm_dev = mlx5_devm_device_get(dev); + if (!devm_dev) + return; + + down_read(&devm_dev->port_list_rwsem); + list_for_each_entry(port, &devm_dev->port_list, list) { + if (port->vport_num != vport_num) + continue; + /* found the port */ + mlxdevm_port_type_eth_set(&port->port, ndev); + up_read(&devm_dev->port_list_rwsem); + return; + } + up_read(&devm_dev->port_list_rwsem); +} + +u32 mlx5_devm_sf_vport_to_sfnum(struct mlx5_core_dev *dev, u16 vport_num) +{ + struct mlx5_devm_device *devm_dev; + struct mlx5_devm_port *port; + u32 sfnum = 0; + + devm_dev = mlx5_devm_device_get(dev); + if (!devm_dev) + return -EOPNOTSUPP; + + down_read(&devm_dev->port_list_rwsem); + list_for_each_entry(port, &devm_dev->port_list, list) { + if (port->vport_num == vport_num) { + /* found the port */ + sfnum = port->sfnum; + break; + } + } + up_read(&devm_dev->port_list_rwsem); + return sfnum; +} + +u32 mlx5_devm_sf_vport_to_controller(struct mlx5_core_dev *dev, u16 vport_num) +{ + struct mlx5_devm_device *devm_dev; + struct mlx5_devm_port *port; + u32 controller = 0; + + devm_dev = mlx5_devm_device_get(dev); + if (!devm_dev) + return 0; + + down_read(&devm_dev->port_list_rwsem); + list_for_each_entry(port, &devm_dev->port_list, list) { + if (port->vport_num == vport_num) { + /* found the port */ + controller = port->port.attrs.pci_sf.controller; + break; + } + } + up_read(&devm_dev->port_list_rwsem); + return controller; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h new file mode 100644 index 0000000..51ac24e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/bridge_tracepoint.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_ESW_BRIDGE_TRACEPOINT_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_ESW_BRIDGE_TRACEPOINT_ + +#include +#include "../bridge_priv.h" + +DECLARE_EVENT_CLASS(mlx5_esw_bridge_fdb_template, + TP_PROTO(const struct mlx5_esw_bridge_fdb_entry *fdb), + TP_ARGS(fdb), + TP_STRUCT__entry( + __array(char, dev_name, IFNAMSIZ) + __array(unsigned char, addr, ETH_ALEN) + __field(u16, vid) + __field(u16, flags) + __field(unsigned int, used) + ), + TP_fast_assign( + strscpy(__entry->dev_name, + netdev_name(fdb->dev), + IFNAMSIZ); + memcpy(__entry->addr, fdb->key.addr, ETH_ALEN); + __entry->vid = fdb->key.vid; + __entry->flags = fdb->flags; + __entry->used = jiffies_to_msecs(jiffies - fdb->lastuse) + ), + TP_printk("net_device=%s addr=%pM vid=%hu flags=%hx used=%u", + __entry->dev_name, + __entry->addr, + __entry->vid, + __entry->flags, + __entry->used / 1000) + ); + +DEFINE_EVENT(mlx5_esw_bridge_fdb_template, + mlx5_esw_bridge_fdb_entry_init, + TP_PROTO(const struct mlx5_esw_bridge_fdb_entry *fdb), + TP_ARGS(fdb) + ); +DEFINE_EVENT(mlx5_esw_bridge_fdb_template, + mlx5_esw_bridge_fdb_entry_refresh, + TP_PROTO(const struct mlx5_esw_bridge_fdb_entry *fdb), + TP_ARGS(fdb) + ); +DEFINE_EVENT(mlx5_esw_bridge_fdb_template, + mlx5_esw_bridge_fdb_entry_cleanup, + TP_PROTO(const struct mlx5_esw_bridge_fdb_entry *fdb), + TP_ARGS(fdb) + ); + +DECLARE_EVENT_CLASS(mlx5_esw_bridge_vlan_template, + TP_PROTO(const struct mlx5_esw_bridge_vlan *vlan), + TP_ARGS(vlan), + TP_STRUCT__entry( + __field(u16, vid) + __field(u16, flags) + ), + TP_fast_assign( + __entry->vid = vlan->vid; + __entry->flags = vlan->flags; + ), + TP_printk("vid=%hu flags=%hx", + __entry->vid, + __entry->flags) + ); + +DEFINE_EVENT(mlx5_esw_bridge_vlan_template, + mlx5_esw_bridge_vlan_create, + TP_PROTO(const struct mlx5_esw_bridge_vlan *vlan), + TP_ARGS(vlan) + ); +DEFINE_EVENT(mlx5_esw_bridge_vlan_template, + mlx5_esw_bridge_vlan_cleanup, + TP_PROTO(const struct mlx5_esw_bridge_vlan *vlan), + TP_ARGS(vlan) + ); + +DECLARE_EVENT_CLASS(mlx5_esw_bridge_port_template, + TP_PROTO(const struct mlx5_esw_bridge_port *port), + TP_ARGS(port), + TP_STRUCT__entry( + __field(u16, vport_num) + __field(u16, esw_owner_vhca_id) + __field(u16, flags) + ), + TP_fast_assign( + __entry->vport_num = port->vport_num; + __entry->esw_owner_vhca_id = port->esw_owner_vhca_id; + __entry->flags = port->flags; + ), + TP_printk("vport_num=%hu esw_owner_vhca_id=%hu flags=%hx", + __entry->vport_num, + __entry->esw_owner_vhca_id, + __entry->flags) + ); + +DEFINE_EVENT(mlx5_esw_bridge_port_template, + mlx5_esw_bridge_vport_init, + TP_PROTO(const struct mlx5_esw_bridge_port *port), + TP_ARGS(port) + ); +DEFINE_EVENT(mlx5_esw_bridge_port_template, + mlx5_esw_bridge_vport_cleanup, + TP_PROTO(const struct mlx5_esw_bridge_port *port), + TP_ARGS(port) + ); + +#endif + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH esw/diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE bridge_tracepoint +#include diff --git 
a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h new file mode 100644 index 0000000..458baf0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_ESW_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_ESW_TP_ + +#include +#include "eswitch.h" + +TRACE_EVENT(mlx5_esw_vport_qos_destroy, + TP_PROTO(const struct mlx5_vport *vport), + TP_ARGS(vport), + TP_STRUCT__entry(__string(devname, dev_name(vport->dev->device)) + __field(unsigned short, vport_id) + __field(unsigned int, tsar_ix) + ), + TP_fast_assign(__assign_str(devname, dev_name(vport->dev->device)); + __entry->vport_id = vport->vport; + __entry->tsar_ix = vport->qos.esw_tsar_ix; + ), + TP_printk("(%s) vport=%hu tsar_ix=%u\n", + __get_str(devname), __entry->vport_id, __entry->tsar_ix + ) +); + +DECLARE_EVENT_CLASS(mlx5_esw_vport_qos_template, + TP_PROTO(const struct mlx5_vport *vport, u32 bw_share, u32 max_rate), + TP_ARGS(vport, bw_share, max_rate), + TP_STRUCT__entry(__string(devname, dev_name(vport->dev->device)) + __field(unsigned short, vport_id) + __field(unsigned int, tsar_ix) + __field(unsigned int, bw_share) + __field(unsigned int, max_rate) + __field(void *, group) + ), + TP_fast_assign(__assign_str(devname, dev_name(vport->dev->device)); + __entry->vport_id = vport->vport; + __entry->tsar_ix = vport->qos.esw_tsar_ix; + __entry->bw_share = bw_share; + __entry->max_rate = max_rate; + __entry->group = vport->qos.group; + ), + TP_printk("(%s) vport=%hu tsar_ix=%u bw_share=%u, max_rate=%u group=%p\n", + __get_str(devname), __entry->vport_id, __entry->tsar_ix, + __entry->bw_share, __entry->max_rate, __entry->group + ) +); + +DEFINE_EVENT(mlx5_esw_vport_qos_template, mlx5_esw_vport_qos_create, + TP_PROTO(const struct mlx5_vport *vport, u32 bw_share, u32 max_rate), + TP_ARGS(vport, bw_share, max_rate) + ); + +DEFINE_EVENT(mlx5_esw_vport_qos_template, mlx5_esw_vport_qos_config, + TP_PROTO(const struct mlx5_vport *vport, u32 bw_share, u32 max_rate), + TP_ARGS(vport, bw_share, max_rate) + ); + +DECLARE_EVENT_CLASS(mlx5_esw_group_qos_template, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_esw_rate_group *group, + unsigned int tsar_ix), + TP_ARGS(dev, group, tsar_ix), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(const void *, group) + __field(unsigned int, tsar_ix) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->group = group; + __entry->tsar_ix = tsar_ix; + ), + TP_printk("(%s) group=%p tsar_ix=%u\n", + __get_str(devname), __entry->group, __entry->tsar_ix + ) +); + +DEFINE_EVENT(mlx5_esw_group_qos_template, mlx5_esw_group_qos_create, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_esw_rate_group *group, + unsigned int tsar_ix), + TP_ARGS(dev, group, tsar_ix) + ); + +DEFINE_EVENT(mlx5_esw_group_qos_template, mlx5_esw_group_qos_destroy, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_esw_rate_group *group, + unsigned int tsar_ix), + TP_ARGS(dev, group, tsar_ix) + ); + +TRACE_EVENT(mlx5_esw_group_qos_config, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_esw_rate_group *group, + unsigned int tsar_ix, u32 
bw_share, u32 max_rate), + TP_ARGS(dev, group, tsar_ix, bw_share, max_rate), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(const void *, group) + __field(unsigned int, tsar_ix) + __field(unsigned int, bw_share) + __field(unsigned int, max_rate) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->group = group; + __entry->tsar_ix = tsar_ix; + __entry->bw_share = bw_share; + __entry->max_rate = max_rate; + ), + TP_printk("(%s) group=%p tsar_ix=%u bw_share=%u max_rate=%u\n", + __get_str(devname), __entry->group, __entry->tsar_ix, + __entry->bw_share, __entry->max_rate + ) +); +#endif /* _MLX5_ESW_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH esw/diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE qos_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c new file mode 100644 index 0000000..c9a9115 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.c @@ -0,0 +1,523 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "eswitch.h" +#include "en.h" +#include "en_tc.h" +#include "fs_core.h" +#include "esw/indir_table.h" +#include "lib/fs_chains.h" +#include "en/mod_hdr.h" + +#define MLX5_ESW_INDIR_TABLE_SIZE 128 +#define MLX5_ESW_INDIR_TABLE_RECIRC_IDX_MAX (MLX5_ESW_INDIR_TABLE_SIZE - 2) +#define MLX5_ESW_INDIR_TABLE_FWD_IDX (MLX5_ESW_INDIR_TABLE_SIZE - 1) + +struct mlx5_esw_indir_table_rule { + struct list_head list; + struct mlx5_flow_handle *handle; + union { + __be32 v4; + struct in6_addr v6; + } dst_ip; + u32 vni; + struct mlx5_modify_hdr *mh; + refcount_t refcnt; +}; + +struct mlx5_esw_indir_table_entry { + struct hlist_node hlist; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *recirc_grp; + struct mlx5_flow_group *fwd_grp; + struct mlx5_flow_handle *fwd_rule; + struct list_head recirc_rules; + int recirc_cnt; + int fwd_ref; + + u16 vport; + u8 ip_version; +}; + +struct mlx5_esw_indir_table { + struct mutex lock; /* protects table */ + DECLARE_HASHTABLE(table, 8); +}; + +struct mlx5_esw_indir_table * +mlx5_esw_indir_table_init(void) +{ + struct mlx5_esw_indir_table *indir = kvzalloc(sizeof(*indir), GFP_KERNEL); + + if (!indir) + return ERR_PTR(-ENOMEM); + + mutex_init(&indir->lock); + hash_init(indir->table); + return indir; +} + +void +mlx5_esw_indir_table_destroy(struct mlx5_esw_indir_table *indir) +{ + mutex_destroy(&indir->lock); + kvfree(indir); +} + +bool +mlx5_esw_indir_table_needed(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + u16 vport_num, + struct mlx5_core_dev *dest_mdev) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + bool vf_sf_vport; + + vf_sf_vport = mlx5_eswitch_is_vf_vport(esw, vport_num) || + mlx5_esw_is_sf_vport(esw, vport_num); + + /* Use indirect table for all IP traffic from UL to VF with vport + * destination when source rewrite flag is set. + */ + return esw_attr->in_rep->vport == MLX5_VPORT_UPLINK && + vf_sf_vport && + esw->dev == dest_mdev && + attr->ip_version && + attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE; +} + +u16 +mlx5_esw_indir_table_decap_vport(struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + + return esw_attr->rx_tun_attr ? 
esw_attr->rx_tun_attr->decap_vport : 0; +} + +static struct mlx5_esw_indir_table_rule * +mlx5_esw_indir_table_rule_lookup(struct mlx5_esw_indir_table_entry *e, + struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_esw_indir_table_rule *rule; + + list_for_each_entry(rule, &e->recirc_rules, list) + if (rule->vni == attr->rx_tun_attr->vni && + !memcmp(&rule->dst_ip, &attr->rx_tun_attr->dst_ip, + sizeof(attr->rx_tun_attr->dst_ip))) + goto found; + return NULL; + +found: + refcount_inc(&rule->refcnt); + return rule; +} + +static int mlx5_esw_indir_table_rule_get(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + struct mlx5_esw_indir_table_entry *e) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + struct mlx5e_tc_mod_hdr_acts mod_acts = {}; + struct mlx5_flow_destination dest = {}; + struct mlx5_esw_indir_table_rule *rule; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *rule_spec; + struct mlx5_flow_handle *handle; + int err = 0; + u32 data; + + rule = mlx5_esw_indir_table_rule_lookup(e, esw_attr); + if (rule) + return 0; + + if (e->recirc_cnt == MLX5_ESW_INDIR_TABLE_RECIRC_IDX_MAX) + return -EINVAL; + + rule_spec = kvzalloc(sizeof(*rule_spec), GFP_KERNEL); + if (!rule_spec) + return -ENOMEM; + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) { + err = -ENOMEM; + goto out; + } + + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS | + MLX5_MATCH_MISC_PARAMETERS | + MLX5_MATCH_MISC_PARAMETERS_2; + if (MLX5_CAP_FLOWTABLE_NIC_RX(esw->dev, ft_field_support.outer_ip_version)) { + MLX5_SET(fte_match_param, rule_spec->match_criteria, + outer_headers.ip_version, 0xf); + MLX5_SET(fte_match_param, rule_spec->match_value, outer_headers.ip_version, + attr->ip_version); + } else if (attr->ip_version) { + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + outer_headers.ethertype); + MLX5_SET(fte_match_param, rule_spec->match_value, outer_headers.ethertype, + (attr->ip_version == 4 ? 
ETH_P_IP : ETH_P_IPV6)); + } else { + err = -EOPNOTSUPP; + goto err_ethertype; + } + + if (attr->ip_version == 4) { + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + MLX5_SET(fte_match_param, rule_spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4, + ntohl(esw_attr->rx_tun_attr->dst_ip.v4)); + } else if (attr->ip_version == 6) { + int len = sizeof(struct in6_addr); + + memset(MLX5_ADDR_OF(fte_match_param, rule_spec->match_criteria, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, len); + memcpy(MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &esw_attr->rx_tun_attr->dst_ip.v6, len); + } + + MLX5_SET_TO_ONES(fte_match_param, rule_spec->match_criteria, + misc_parameters.vxlan_vni); + MLX5_SET(fte_match_param, rule_spec->match_value, misc_parameters.vxlan_vni, + MLX5_GET(fte_match_param, spec->match_value, misc_parameters.vxlan_vni)); + + MLX5_SET(fte_match_param, rule_spec->match_criteria, + misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(fte_match_param, rule_spec->match_value, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw_attr->in_mdev->priv.eswitch, + MLX5_VPORT_UPLINK)); + + /* Modify flow source to recirculate packet */ + data = mlx5_eswitch_get_vport_metadata_for_set(esw, esw_attr->rx_tun_attr->decap_vport); + err = mlx5e_tc_match_to_reg_set(esw->dev, &mod_acts, MLX5_FLOW_NAMESPACE_FDB, + VPORT_TO_REG, data); + if (err) + goto err_mod_hdr_regc0; + + err = mlx5e_tc_match_to_reg_set(esw->dev, &mod_acts, MLX5_FLOW_NAMESPACE_FDB, + TUNNEL_TO_REG, ESW_TUN_SLOW_TABLE_GOTO_VPORT); + if (err) + goto err_mod_hdr_regc1; + + flow_act.modify_hdr = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_FDB, + mod_acts.num_actions, mod_acts.actions); + if (IS_ERR(flow_act.modify_hdr)) { + err = PTR_ERR(flow_act.modify_hdr); + goto err_mod_hdr_alloc; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + flow_act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL | FLOW_ACT_NO_APPEND; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = mlx5_chains_get_table(chains, 0, 1, 0); + if (IS_ERR(dest.ft)) { + err = PTR_ERR(dest.ft); + goto err_table; + } + handle = mlx5_add_flow_rules(e->ft, rule_spec, &flow_act, &dest, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_handle; + } + + mlx5e_mod_hdr_dealloc(&mod_acts); + rule->handle = handle; + rule->vni = esw_attr->rx_tun_attr->vni; + rule->mh = flow_act.modify_hdr; + memcpy(&rule->dst_ip, &esw_attr->rx_tun_attr->dst_ip, + sizeof(esw_attr->rx_tun_attr->dst_ip)); + refcount_set(&rule->refcnt, 1); + list_add(&rule->list, &e->recirc_rules); + e->recirc_cnt++; + goto out; + +err_handle: + mlx5_chains_put_table(chains, 0, 1, 0); +err_table: + mlx5_modify_header_dealloc(esw->dev, flow_act.modify_hdr); +err_mod_hdr_alloc: +err_mod_hdr_regc1: + mlx5e_mod_hdr_dealloc(&mod_acts); +err_mod_hdr_regc0: +err_ethertype: + kfree(rule); +out: + kvfree(rule_spec); + return err; +} + +static void mlx5_esw_indir_table_rule_put(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_esw_indir_table_entry *e) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + struct mlx5_esw_indir_table_rule *rule; + + list_for_each_entry(rule, &e->recirc_rules, list) + if (rule->vni == esw_attr->rx_tun_attr->vni && + 
!memcmp(&rule->dst_ip, &esw_attr->rx_tun_attr->dst_ip, + sizeof(esw_attr->rx_tun_attr->dst_ip))) + goto found; + + return; + +found: + if (!refcount_dec_and_test(&rule->refcnt)) + return; + + mlx5_del_flow_rules(rule->handle); + mlx5_chains_put_table(chains, 0, 1, 0); + mlx5_modify_header_dealloc(esw->dev, rule->mh); + list_del(&rule->list); + kfree(rule); + e->recirc_cnt--; +} + +static int mlx5_create_indir_recirc_group(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + struct mlx5_esw_indir_table_entry *e) +{ + int err = 0, inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + u32 *in, *match; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS | + MLX5_MATCH_MISC_PARAMETERS | MLX5_MATCH_MISC_PARAMETERS_2); + match = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + + if (MLX5_CAP_FLOWTABLE_NIC_RX(esw->dev, ft_field_support.outer_ip_version)) + MLX5_SET(fte_match_param, match, outer_headers.ip_version, 0xf); + else + MLX5_SET_TO_ONES(fte_match_param, match, outer_headers.ethertype); + + if (attr->ip_version == 4) { + MLX5_SET_TO_ONES(fte_match_param, match, + outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + } else if (attr->ip_version == 6) { + memset(MLX5_ADDR_OF(fte_match_param, match, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + 0xff, sizeof(struct in6_addr)); + } else { + err = -EOPNOTSUPP; + goto out; + } + + MLX5_SET_TO_ONES(fte_match_param, match, misc_parameters.vxlan_vni); + MLX5_SET(fte_match_param, match, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(create_flow_group_in, in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, in, end_flow_index, MLX5_ESW_INDIR_TABLE_RECIRC_IDX_MAX); + e->recirc_grp = mlx5_create_flow_group(e->ft, in); + if (IS_ERR(e->recirc_grp)) { + err = PTR_ERR(e->recirc_grp); + goto out; + } + + INIT_LIST_HEAD(&e->recirc_rules); + e->recirc_cnt = 0; + +out: + kvfree(in); + return err; +} + +static int mlx5_create_indir_fwd_group(struct mlx5_eswitch *esw, + struct mlx5_esw_indir_table_entry *e) +{ + int err = 0, inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + u32 *in; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + kvfree(in); + return -ENOMEM; + } + + /* Hold one entry */ + MLX5_SET(create_flow_group_in, in, start_flow_index, MLX5_ESW_INDIR_TABLE_FWD_IDX); + MLX5_SET(create_flow_group_in, in, end_flow_index, MLX5_ESW_INDIR_TABLE_FWD_IDX); + e->fwd_grp = mlx5_create_flow_group(e->ft, in); + if (IS_ERR(e->fwd_grp)) { + err = PTR_ERR(e->fwd_grp); + goto err_out; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport.num = e->vport; + dest.vport.vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + e->fwd_rule = mlx5_add_flow_rules(e->ft, spec, &flow_act, &dest, 1); + if (IS_ERR(e->fwd_rule)) { + mlx5_destroy_flow_group(e->fwd_grp); + err = PTR_ERR(e->fwd_rule); + } + +err_out: + kvfree(spec); + kvfree(in); + return err; +} + +static struct mlx5_esw_indir_table_entry * +mlx5_esw_indir_table_entry_create(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, u16 vport, bool decap) +{ + struct mlx5_flow_table_attr ft_attr = {}; 
+ struct mlx5_flow_namespace *root_ns; + struct mlx5_esw_indir_table_entry *e; + struct mlx5_flow_table *ft; + int err = 0; + + root_ns = mlx5_get_flow_namespace(esw->dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) + return ERR_PTR(-ENOENT); + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + + ft_attr.prio = FDB_TC_OFFLOAD; + ft_attr.max_fte = MLX5_ESW_INDIR_TABLE_SIZE; + ft_attr.flags = MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.level = 1; + + ft = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto tbl_err; + } + e->ft = ft; + e->vport = vport; + e->ip_version = attr->ip_version; + e->fwd_ref = !decap; + + err = mlx5_create_indir_recirc_group(esw, attr, spec, e); + if (err) + goto recirc_grp_err; + + if (decap) { + err = mlx5_esw_indir_table_rule_get(esw, attr, spec, e); + if (err) + goto recirc_rule_err; + } + + err = mlx5_create_indir_fwd_group(esw, e); + if (err) + goto fwd_grp_err; + + hash_add(esw->fdb_table.offloads.indir->table, &e->hlist, + vport << 16 | attr->ip_version); + + return e; + +fwd_grp_err: + if (decap) + mlx5_esw_indir_table_rule_put(esw, attr, e); +recirc_rule_err: + mlx5_destroy_flow_group(e->recirc_grp); +recirc_grp_err: + mlx5_destroy_flow_table(e->ft); +tbl_err: + kfree(e); + return ERR_PTR(err); +} + +static struct mlx5_esw_indir_table_entry * +mlx5_esw_indir_table_entry_lookup(struct mlx5_eswitch *esw, u16 vport, u8 ip_version) +{ + struct mlx5_esw_indir_table_entry *e; + u32 key = vport << 16 | ip_version; + + hash_for_each_possible(esw->fdb_table.offloads.indir->table, e, hlist, key) + if (e->vport == vport && e->ip_version == ip_version) + return e; + + return NULL; +} + +struct mlx5_flow_table *mlx5_esw_indir_table_get(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + u16 vport, bool decap) +{ + struct mlx5_esw_indir_table_entry *e; + int err; + + mutex_lock(&esw->fdb_table.offloads.indir->lock); + e = mlx5_esw_indir_table_entry_lookup(esw, vport, attr->ip_version); + if (e) { + if (!decap) { + e->fwd_ref++; + } else { + err = mlx5_esw_indir_table_rule_get(esw, attr, spec, e); + if (err) + goto out_err; + } + } else { + e = mlx5_esw_indir_table_entry_create(esw, attr, spec, vport, decap); + if (IS_ERR(e)) { + err = PTR_ERR(e); + esw_warn(esw->dev, "Failed to create indirection table, err %d.\n", err); + goto out_err; + } + } + mutex_unlock(&esw->fdb_table.offloads.indir->lock); + return e->ft; + +out_err: + mutex_unlock(&esw->fdb_table.offloads.indir->lock); + return ERR_PTR(err); +} + +void mlx5_esw_indir_table_put(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + u16 vport, bool decap) +{ + struct mlx5_esw_indir_table_entry *e; + + mutex_lock(&esw->fdb_table.offloads.indir->lock); + e = mlx5_esw_indir_table_entry_lookup(esw, vport, attr->ip_version); + if (!e) + goto out; + + if (!decap) + e->fwd_ref--; + else + mlx5_esw_indir_table_rule_put(esw, attr, e); + + if (e->fwd_ref || e->recirc_cnt) + goto out; + + hash_del(&e->hlist); + mlx5_destroy_flow_group(e->recirc_grp); + mlx5_del_flow_rules(e->fwd_rule); + mlx5_destroy_flow_group(e->fwd_grp); + mlx5_destroy_flow_table(e->ft); + kfree(e); +out: + mutex_unlock(&esw->fdb_table.offloads.indir->lock); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.h new file mode 100644 index 0000000..f66554b --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/indir_table.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. */ + +#ifndef __MLX5_ESW_FT_H__ +#define __MLX5_ESW_FT_H__ + +#include "en/tc_ct.h" +#ifdef CONFIG_MLX5_CLS_ACT + +struct mlx5_esw_indir_table * +mlx5_esw_indir_table_init(void); +void +mlx5_esw_indir_table_destroy(struct mlx5_esw_indir_table *indir); + +struct mlx5_flow_table *mlx5_esw_indir_table_get(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + u16 vport, bool decap); +void mlx5_esw_indir_table_put(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + u16 vport, bool decap); + +bool +mlx5_esw_indir_table_needed(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + u16 vport_num, + struct mlx5_core_dev *dest_mdev); + +u16 +mlx5_esw_indir_table_decap_vport(struct mlx5_flow_attr *attr); + +#else +/* indir API stubs */ +static inline struct mlx5_esw_indir_table * +mlx5_esw_indir_table_init(void) +{ + return NULL; +} + +static inline void +mlx5_esw_indir_table_destroy(struct mlx5_esw_indir_table *indir) +{ +} + +static inline struct mlx5_flow_table * +mlx5_esw_indir_table_get(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + u16 vport, bool decap) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void +mlx5_esw_indir_table_put(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + u16 vport, bool decap) +{ +} + +static inline bool +mlx5_esw_indir_table_needed(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + u16 vport_num, + struct mlx5_core_dev *dest_mdev) +{ + return false; +} + +static inline u16 +mlx5_esw_indir_table_decap_vport(struct mlx5_flow_attr *attr) +{ + return 0; +} +#endif + +#endif /* __MLX5_ESW_FT_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c new file mode 100644 index 0000000..dbb69d0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.c @@ -0,0 +1,790 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2020 Mellanox Technologies. 
+ +#include +#include +#include +#include "lib/fs_chains.h" +#include "esw/ipsec.h" +#include "mlx5_core.h" +#include "accel/ipsec_offload.h" +#include "../fs_core.h" + +#define esw_ipsec_priv(esw) ((esw)->fdb_table.offloads.esw_ipsec_priv) +#define esw_ipsec_ft_crypto_rx(esw) (esw_ipsec_priv(esw)->ipsec_fdb_crypto_rx) +#define esw_ipsec_ft_crypto_rx_miss_grp(esw) (esw_ipsec_priv(esw)->ipsec_fdb_crypto_rx_miss_grp) +#define esw_ipsec_ft_crypto_rx_fwd_rule(esw) (esw_ipsec_priv(esw)->ipsec_fdb_crypto_rx_fwd_rule) + +#define esw_ipsec_ft_decap_rx(esw) (esw_ipsec_priv(esw)->ipsec_fdb_decap_rx) +#define esw_ipsec_decap_miss_grp(esw) (esw_ipsec_priv(esw)->ipsec_fdb_decap_miss_grp) +#define esw_ipsec_decap_miss_rule(esw) (esw_ipsec_priv(esw)->ipsec_fdb_decap_miss_rule) +#define esw_ipsec_decap_rule(esw) (esw_ipsec_priv(esw)->ipsec_fdb_decap_rule) +#define esw_ipsec_pkt_reformat(esw) (esw_ipsec_priv(esw)->pkt_reformat) +#define esw_ipsec_decap_modify_hdr(esw) (esw_ipsec_priv(esw)->modify_hdr) +#define esw_ipsec_decap_rule_counter(esw) (esw_ipsec_priv(esw)->decap_rule_counter) +#define esw_ipsec_decap_miss_rule_counter(esw) (esw_ipsec_priv(esw)->decap_miss_rule_counter) + +#define esw_ipsec_ft_ike_tx(esw) (esw_ipsec_priv(esw)->ipsec_fdb_ike_tx) +#define esw_ipsec_ft_ike_tx_miss_grp(esw) (esw_ipsec_priv(esw)->ipsec_fdb_ike_tx_miss_grp) +#define esw_ipsec_ft_ike_tx_miss_rule(esw) (esw_ipsec_priv(esw)->ipsec_fdb_ike_tx_miss_rule) +#define esw_ipsec_ft_ike_tx_grp(esw) (esw_ipsec_priv(esw)->ipsec_fdb_ike_tx_grp) +#define esw_ipsec_ft_ike_tx_rule(esw) (esw_ipsec_priv(esw)->ipsec_fdb_ike_tx_rule) +#define esw_ipsec_ft_crypto_tx(esw) (esw_ipsec_priv(esw)->ipsec_fdb_crypto_tx) +#define esw_ipsec_ft_crypto_tx_grp(esw) (esw_ipsec_priv(esw)->ipsec_fdb_crypto_tx_grp) +#define esw_ipsec_ft_crypto_tx_miss_rule(esw) (esw_ipsec_priv(esw)->ipsec_fdb_crypto_tx_miss_rule) +#define esw_ipsec_ft_tx_chk(esw) (esw_ipsec_priv(esw)->ipsec_fdb_tx_chk) +#define esw_ipsec_ft_tx_chk_grp(esw) (esw_ipsec_priv(esw)->ipsec_fdb_tx_chk_grp) +#define esw_ipsec_ft_tx_chk_rule(esw) (esw_ipsec_priv(esw)->ipsec_fdb_tx_chk_rule) +#define esw_ipsec_ft_tx_chk_rule_drop(esw) (esw_ipsec_priv(esw)->ipsec_fdb_tx_chk_rule_drop) +#define esw_ipsec_tx_chk_counter(esw) (esw_ipsec_priv(esw)->tx_chk_rule_counter) +#define esw_ipsec_tx_chk_drop_counter(esw) (esw_ipsec_priv(esw)->tx_chk_drop_rule_counter) + +#define esw_ipsec_refcnt(esw) (esw_ipsec_priv(esw)->refcnt) +#define NUM_IPSEC_FTE BIT(18) + +struct mlx5_esw_ipsec_priv { + /* Rx tables, groups and miss rules */ + struct mlx5_flow_table *ipsec_fdb_crypto_rx; + struct mlx5_flow_group *ipsec_fdb_crypto_rx_miss_grp; + struct mlx5_flow_handle *ipsec_fdb_crypto_rx_fwd_rule; + + struct mlx5_flow_table *ipsec_fdb_decap_rx; + struct mlx5_flow_group *ipsec_fdb_decap_miss_grp; + struct mlx5_flow_handle *ipsec_fdb_decap_miss_rule; + struct mlx5_flow_handle *ipsec_fdb_decap_rule; + struct mlx5_pkt_reformat *pkt_reformat; + struct mlx5_modify_hdr *modify_hdr; + struct mlx5_fc *decap_rule_counter; + struct mlx5_fc *decap_miss_rule_counter; + + /* Tx tables, groups and default rules */ + struct mlx5_flow_table *ipsec_fdb_ike_tx; + struct mlx5_flow_group *ipsec_fdb_ike_tx_miss_grp; + struct mlx5_flow_handle *ipsec_fdb_ike_tx_miss_rule; + struct mlx5_flow_group *ipsec_fdb_ike_tx_grp; + struct mlx5_flow_handle *ipsec_fdb_ike_tx_rule; + + struct mlx5_flow_table *ipsec_fdb_crypto_tx; + struct mlx5_flow_group *ipsec_fdb_crypto_tx_grp; + struct mlx5_flow_handle *ipsec_fdb_crypto_tx_miss_rule; + struct mlx5_flow_table 
*ipsec_fdb_tx_chk; + struct mlx5_flow_group *ipsec_fdb_tx_chk_grp; + struct mlx5_flow_handle *ipsec_fdb_tx_chk_rule; + struct mlx5_flow_handle *ipsec_fdb_tx_chk_rule_drop; + struct mlx5_fc *tx_chk_rule_counter; + struct mlx5_fc *tx_chk_drop_rule_counter; + + /* Flow tables refcount */ + atomic_t refcnt; +}; + +static struct mlx5_flow_table *esw_ipsec_table_create(struct mlx5_flow_namespace *ns, + struct mlx5_eswitch *esw, int prio, + int level, int num_res, + int max_num_groups, int max_fte) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *fdb = NULL; + + /* reserve entry for the match all miss group and rule */ + ft_attr.autogroup.num_reserved_entries = num_res; + ft_attr.autogroup.max_num_groups = max_num_groups; + ft_attr.flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + ft_attr.level = level; + ft_attr.max_fte = max_fte; + ft_attr.prio = prio; + fdb = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(fdb)) { + esw_warn(esw->dev, "Failed to create IPsec Crypto FDB Table, prio %d err %ld\n", + prio, PTR_ERR(fdb)); + return fdb; + } + + return fdb; +} + +static void esw_offloads_ipsec_tables_rx_destroy(struct mlx5_eswitch *esw) +{ + if (esw_ipsec_decap_rule(esw)) { + mlx5_del_flow_rules(esw_ipsec_decap_rule(esw)); + esw_ipsec_decap_rule(esw) = NULL; + } + + if (esw_ipsec_pkt_reformat(esw)) { + mlx5_packet_reformat_dealloc(esw->dev, esw_ipsec_pkt_reformat(esw)); + esw_ipsec_pkt_reformat(esw) = NULL; + mlx5_chains_put_table(esw_chains(esw), 0, 1, 0); + } + + if (esw_ipsec_decap_modify_hdr(esw)) { + mlx5_modify_header_dealloc(esw->dev, esw_ipsec_decap_modify_hdr(esw)); + esw_ipsec_decap_modify_hdr(esw) = NULL; + } + + if (esw_ipsec_decap_rule_counter(esw)) { + mlx5_fc_destroy(esw->dev, esw_ipsec_decap_rule_counter(esw)); + esw_ipsec_decap_rule_counter(esw) = NULL; + } + + if (esw_ipsec_decap_miss_rule(esw)) { + mlx5_del_flow_rules(esw_ipsec_decap_miss_rule(esw)); + esw_ipsec_decap_miss_rule(esw) = NULL; + } + + if (esw_ipsec_decap_miss_rule_counter(esw)) { + mlx5_fc_destroy(esw->dev, esw_ipsec_decap_miss_rule_counter(esw)); + esw_ipsec_decap_miss_rule_counter(esw) = NULL; + } + + if (esw_ipsec_decap_miss_grp(esw)) { + mlx5_destroy_flow_group(esw_ipsec_decap_miss_grp(esw)); + esw_ipsec_decap_miss_grp(esw) = NULL; + } + + if (esw_ipsec_ft_decap_rx(esw)) { + mlx5_destroy_flow_table(esw_ipsec_ft_decap_rx(esw)); + esw_ipsec_ft_decap_rx(esw) = NULL; + } + + if (esw_ipsec_ft_crypto_rx_fwd_rule(esw)) { + mlx5_del_flow_rules(esw_ipsec_ft_crypto_rx_fwd_rule(esw)); + esw_ipsec_ft_crypto_rx_fwd_rule(esw) = NULL; + } + + if (esw_ipsec_ft_crypto_rx_miss_grp(esw)) { + mlx5_destroy_flow_group(esw_ipsec_ft_crypto_rx_miss_grp(esw)); + esw_ipsec_ft_crypto_rx_miss_grp(esw) = NULL; + mlx5_chains_put_table(esw_chains(esw), 0, 1, 0); + } + + if (esw_ipsec_ft_crypto_rx(esw)) { + mlx5_destroy_flow_table(esw_ipsec_ft_crypto_rx(esw)); + esw_ipsec_ft_crypto_rx(esw) = NULL; + } +} + +static int esw_offloads_ipsec_tables_rx_create(struct mlx5_flow_namespace *ns, struct mlx5_eswitch *esw) +{ + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_pkt_reformat_params reformat_params; + struct mlx5_core_dev *mdev = esw->dev; + struct mlx5_flow_destination dest[2]; + struct mlx5_modify_hdr *modify_hdr; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_handle *rule; + struct mlx5_fc *flow_counter; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + u32 *flow_group_in; + 
int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + kvfree(flow_group_in); + return -ENOMEM; + } + + /* Rx Table 1 */ +#define RX_TABLE_LEVEL_1 0 + ft = esw_ipsec_table_create(ns, esw, FDB_CRYPTO_INGRESS, RX_TABLE_LEVEL_1, 1, 2, NUM_IPSEC_FTE); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + esw_warn(esw->dev, "Failed to create Rx table 1 err(%d)\n", err); + goto out_err; + } + esw_ipsec_ft_crypto_rx(esw) = ft; + + /* Rx Table 1 - match all group create */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, esw_ipsec_ft_crypto_rx(esw)->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, esw_ipsec_ft_crypto_rx(esw)->max_fte - 1); + g = mlx5_create_flow_group(esw_ipsec_ft_crypto_rx(esw), flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to create Rx table1 default forward flow group err(%d)\n", err); + goto out_err; + } + esw_ipsec_ft_crypto_rx_miss_grp(esw) = g; + + /* Rx Table 1 - default forward rule */ + memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination)); + memset(spec, 0, sizeof(*spec)); + memset(&flow_act, 0, sizeof(flow_act)); + flow_act.flags = FLOW_ACT_NO_APPEND; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[0].ft = mlx5_chains_get_table(esw_chains(esw), 0, 1, 0); + rule = mlx5_add_flow_rules(esw_ipsec_ft_crypto_rx(esw), spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + esw_warn(esw->dev, "Failed to add IPsec Rx crypto forward rule err=%d\n", err); + goto out_err; + } + esw_ipsec_ft_crypto_rx_fwd_rule(esw) = rule; + + /* Rx Table 2 */ +#define RX_TABLE_LEVEL_2 1 + ft = esw_ipsec_table_create(ns, esw, FDB_CRYPTO_INGRESS, RX_TABLE_LEVEL_2, 1, 1, NUM_IPSEC_FTE); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + esw_warn(esw->dev, "Failed to create Rx table 2 err(%d)\n", err); + goto out_err; + } + esw_ipsec_ft_decap_rx(esw) = ft; + + /* Rx Table 2 - match all group create */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, esw_ipsec_ft_decap_rx(esw)->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, esw_ipsec_ft_decap_rx(esw)->max_fte - 1); + g = mlx5_create_flow_group(esw_ipsec_ft_decap_rx(esw), flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to create Rx table2 default drop flow group err(%d)\n", err); + goto out_err; + } + esw_ipsec_decap_miss_grp(esw) = g; + + /* Rx Table 2 - add default drop rule */ + flow_counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(flow_counter)) { + esw_warn(esw->dev, "fail to create decap miss rule flow counter err(%ld)\n", PTR_ERR(flow_counter)); + err = PTR_ERR(flow_counter); + goto out_err; + } + esw_ipsec_decap_miss_rule_counter(esw) = flow_counter; + + memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT; + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[0].counter_id = mlx5_fc_id(esw_ipsec_decap_miss_rule_counter(esw)); + spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; + rule = mlx5_add_flow_rules(esw_ipsec_ft_decap_rx(esw), spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + esw_warn(esw->dev, "fs offloads: Failed to add ipsec_fdb_decap_rx default drop rule %d\n", err); + goto 
out_err; + } + esw_ipsec_decap_miss_rule(esw) = rule; + + /* Rx Table 2 - add decap rule */ + flow_counter = mlx5_fc_create(esw->dev, false); + if (IS_ERR(flow_counter)) { + esw_warn(esw->dev, "fail to create decap rule flow counter err(%ld)\n", PTR_ERR(flow_counter)); + err = PTR_ERR(flow_counter); + goto out_err; + } + esw_ipsec_decap_rule_counter(esw) = flow_counter; + + MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_C_1); + MLX5_SET(set_action_in, action, data, 1); + MLX5_SET(set_action_in, action, offset, 31); + MLX5_SET(set_action_in, action, length, 1); + modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_FDB, 1, action); + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + esw_warn(esw->dev, "fail to alloc ipsec decap set modify_header_id err=%d\n", err); + goto out_err; + } + esw_ipsec_decap_modify_hdr(esw) = modify_hdr; + + /* Rx Table 2 - check ipsec_syndrome and aso_return_reg (set to REG_C_5) */ + memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination)); + memset(spec, 0, sizeof(*spec)); + memset(&flow_act, 0, sizeof(flow_act)); + + memset(&reformat_params, 0, sizeof(reformat_params)); + reformat_params.type = MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT; + flow_act.pkt_reformat = mlx5_packet_reformat_alloc(mdev, &reformat_params, + MLX5_FLOW_NAMESPACE_FDB); + if (IS_ERR(flow_act.pkt_reformat)) { + err = PTR_ERR(flow_act.pkt_reformat); + esw_warn(esw->dev, "Failed to allocate delete esp reformat, err=%d\n", err); + goto out_err; + } + esw_ipsec_pkt_reformat(esw) = flow_act.pkt_reformat; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters_2.ipsec_syndrome); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.ipsec_syndrome, 0); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_c_4); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_4, 0); + spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + flow_act.modify_hdr = modify_hdr; + flow_act.flags = FLOW_ACT_NO_APPEND; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[0].ft = mlx5_chains_get_table(esw_chains(esw), 0, 1, 0); + dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[1].counter_id = mlx5_fc_id(esw_ipsec_decap_rule_counter(esw)); + rule = mlx5_add_flow_rules(esw_ipsec_ft_decap_rx(esw), spec, &flow_act, dest, 2); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + esw_warn(esw->dev, "Failed to add IPsec Rx decap rule err=%d\n", err); + goto out_err; + } + esw_ipsec_decap_rule(esw) = rule; + + goto out; + +out_err: + esw_offloads_ipsec_tables_rx_destroy(esw); +out: + kvfree(spec); + kvfree(flow_group_in); + return err; +} + +static void esw_offloads_ipsec_tables_tx_destroy(struct mlx5_eswitch *esw) +{ + /* Tx table 3 */ + if (esw_ipsec_ft_tx_chk_rule(esw)) { + mlx5_del_flow_rules(esw_ipsec_ft_tx_chk_rule(esw)); + esw_ipsec_ft_tx_chk_rule(esw) = NULL; + } + + if (esw_ipsec_tx_chk_counter(esw)) { + mlx5_fc_destroy(esw->dev, esw_ipsec_tx_chk_counter(esw)); + esw_ipsec_tx_chk_counter(esw) = NULL; + } + + if (esw_ipsec_ft_tx_chk_rule_drop(esw)) { + mlx5_del_flow_rules(esw_ipsec_ft_tx_chk_rule_drop(esw)); + esw_ipsec_ft_tx_chk_rule_drop(esw) = NULL; + } + + 
if (esw_ipsec_tx_chk_drop_counter(esw)) { + mlx5_fc_destroy(esw->dev, esw_ipsec_tx_chk_drop_counter(esw)); + esw_ipsec_tx_chk_drop_counter(esw) = NULL; + } + + if (esw_ipsec_ft_tx_chk_grp(esw)) { + mlx5_destroy_flow_group(esw_ipsec_ft_tx_chk_grp(esw)); + esw_ipsec_ft_tx_chk_grp(esw) = NULL; + } + + if (esw_ipsec_ft_tx_chk(esw)) { + mlx5_destroy_flow_table(esw_ipsec_ft_tx_chk(esw)); + esw_ipsec_ft_tx_chk(esw) = NULL; + } + + /* Tx table2 */ + if (esw_ipsec_ft_ike_tx_miss_rule(esw)) { + mlx5_del_flow_rules(esw_ipsec_ft_ike_tx_miss_rule(esw)); + esw_ipsec_ft_ike_tx_miss_rule(esw) = NULL; + } + + if (esw_ipsec_ft_crypto_tx_miss_rule(esw)) { + mlx5_del_flow_rules(esw_ipsec_ft_crypto_tx_miss_rule(esw)); + esw_ipsec_ft_crypto_tx_miss_rule(esw) = NULL; + } + + if (esw_ipsec_ft_crypto_tx_grp(esw)) { + mlx5_destroy_flow_group(esw_ipsec_ft_crypto_tx_grp(esw)); + esw_ipsec_ft_crypto_tx_grp(esw) = NULL; + } + + if (esw_ipsec_ft_crypto_tx(esw)) { + mlx5_destroy_flow_table(esw_ipsec_ft_crypto_tx(esw)); + esw_ipsec_ft_crypto_tx(esw) = NULL; + } + + /* Tx table 1 */ + if (esw_ipsec_ft_ike_tx_miss_grp(esw)) { + mlx5_destroy_flow_group(esw_ipsec_ft_ike_tx_miss_grp(esw)); + esw_ipsec_ft_ike_tx_miss_grp(esw) = NULL; + } + + if (esw_ipsec_ft_ike_tx_rule(esw)) { + mlx5_del_flow_rules(esw_ipsec_ft_ike_tx_rule(esw)); + esw_ipsec_ft_ike_tx_rule(esw) = NULL; + } + + if (esw_ipsec_ft_ike_tx_grp(esw)) { + mlx5_destroy_flow_group(esw_ipsec_ft_ike_tx_grp(esw)); + esw_ipsec_ft_ike_tx_grp(esw) = NULL; + } + + if (esw_ipsec_ft_ike_tx(esw)) { + mlx5_destroy_flow_table(esw_ipsec_ft_ike_tx(esw)); + esw_ipsec_ft_ike_tx(esw) = NULL; + } + +} + +#define IKE_UDP_PORT 500 +static int esw_offloads_ipsec_tables_tx_create(struct mlx5_flow_namespace *ns, struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_destination dest[2]; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_fc *flow_counter; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + void *match_criteria; + u32 *flow_group_in; + int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + kvfree(flow_group_in); + return -ENOMEM; + } + + /* Tx table 1 */ +#define TX_TABLE_LEVEL_1 0 + ft_attr.flags = MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + ft_attr.level = TX_TABLE_LEVEL_1; + ft_attr.max_fte = 2; + ft_attr.prio = FDB_CRYPTO_EGRESS; + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + esw_warn(esw->dev, "Failed to create IPsec IKE Tx table err(%d)\n", err); + goto out_err; + } + esw_ipsec_ft_ike_tx(esw) = ft; + + /* IKE table Exclusion ike group */ + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria.outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, match_criteria, udp_dport); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, match_criteria, ip_protocol); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + g = mlx5_create_flow_group(esw_ipsec_ft_ike_tx(esw), flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to IPsec IKE Tx table default flow group err(%d)\n", err); + goto out_err; + } + 
esw_ipsec_ft_ike_tx_grp(esw) = g; + + /* IKE table Exclusion ike rule */ + memset(spec, 0, sizeof(*spec)); + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, spec->match_criteria, udp_dport); + MLX5_SET(fte_match_set_lyr_2_4, spec->match_value, udp_dport, IKE_UDP_PORT); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, spec->match_criteria, ip_protocol); + MLX5_SET(fte_match_set_lyr_2_4, spec->match_value, ip_protocol, IPPROTO_UDP); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination)); + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest[0].vport.num = MLX5_VPORT_UPLINK; + rule = mlx5_add_flow_rules(esw_ipsec_ft_ike_tx(esw), spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + esw_warn(esw->dev, "Failed to add IPsec Tx table exclusion ike rule %d\n", err); + goto out_err; + } + esw_ipsec_ft_ike_tx_rule(esw) = rule; + + /* IKE table default miss group */ + memset(flow_group_in, 0, inlen); + memset(spec, 0, sizeof(*spec)); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + g = mlx5_create_flow_group(esw_ipsec_ft_ike_tx(esw), flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to IPsec IKE Tx table default flow miss group err(%d)\n", err); + goto out_err; + } + esw_ipsec_ft_ike_tx_miss_grp(esw) = g; + + /* Tx table 2 */ +#define TX_TABLE_LEVEL_2 1 + ft = esw_ipsec_table_create(ns, esw, FDB_CRYPTO_EGRESS, TX_TABLE_LEVEL_2, 1, 4, NUM_IPSEC_FTE); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + esw_warn(esw->dev, "Failed to create IPsec Tx table err(%d)\n", err); + goto out_err; + } + esw_ipsec_ft_crypto_tx(esw) = ft; + + /* default miss group/rule */ + memset(flow_group_in, 0, inlen); + memset(spec, 0, sizeof(*spec)); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, esw_ipsec_ft_crypto_tx(esw)->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, esw_ipsec_ft_crypto_tx(esw)->max_fte - 1); + g = mlx5_create_flow_group(esw_ipsec_ft_crypto_tx(esw), flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to IPsec Tx table default flow group err(%d)\n", err); + goto out_err; + } + esw_ipsec_ft_crypto_tx_grp(esw) = g; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination)); + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest[0].vport.num = MLX5_VPORT_UPLINK; + rule = mlx5_add_flow_rules(esw_ipsec_ft_crypto_tx(esw), spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + esw_warn(esw->dev, "Failed to add IPsec Tx table default miss rule %d\n", err); + goto out_err; + } + esw_ipsec_ft_crypto_tx_miss_rule(esw) = rule; + + /* IKE table default miss rule */ + memset(spec, 0, sizeof(*spec)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination)); + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[0].ft = esw_ipsec_ft_crypto_tx(esw); + rule = mlx5_add_flow_rules(esw_ipsec_ft_ike_tx(esw), spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + esw_warn(esw->dev, "Failed to add IPsec IKE Tx table default miss rule %d\n", err); + goto out_err; + } + esw_ipsec_ft_ike_tx_miss_rule(esw) = rule; + + /* Tx Table 3 */ +#define TX_TABLE_LEVEL_3 2 + ft = esw_ipsec_table_create(ns, esw, FDB_CRYPTO_EGRESS, TX_TABLE_LEVEL_3, 1, 1, 2); + if 
(IS_ERR(ft)) {
+		err = PTR_ERR(ft);
+		esw_warn(esw->dev, "Failed to create Tx table 3 err(%d)\n", err);
+		goto out_err;
+	}
+	esw_ipsec_ft_tx_chk(esw) = ft;
+
+	/* Tx Table 3 - match all group create */
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, esw_ipsec_ft_tx_chk(esw)->max_fte - 1);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, esw_ipsec_ft_tx_chk(esw)->max_fte - 1);
+	g = mlx5_create_flow_group(esw_ipsec_ft_tx_chk(esw), flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(esw->dev, "Failed to create Tx table 3 default drop flow group err(%d)\n", err);
+		goto out_err;
+	}
+	esw_ipsec_ft_tx_chk_grp(esw) = g;
+
+	/* Tx Table 3 - add default drop rule */
+	flow_counter = mlx5_fc_create(esw->dev, false);
+	if (IS_ERR(flow_counter)) {
+		esw_warn(esw->dev, "Failed to create tx chk drop flow counter err(%ld)\n", PTR_ERR(flow_counter));
+		err = PTR_ERR(flow_counter);
+		goto out_err;
+	}
+	esw_ipsec_tx_chk_drop_counter(esw) = flow_counter;
+
+	memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination));
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT;
+	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+	dest[0].counter_id = mlx5_fc_id(esw_ipsec_tx_chk_drop_counter(esw));
+	spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
+	rule = mlx5_add_flow_rules(esw_ipsec_ft_tx_chk(esw), spec, &flow_act, dest, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		esw_warn(esw->dev, "fs offloads: Failed to add tx chk drop rule %d\n", err);
+		goto out_err;
+	}
+	esw_ipsec_ft_tx_chk_rule_drop(esw) = rule;
+
+	/* Tx Table 3 - add tx chk rule */
+	flow_counter = mlx5_fc_create(esw->dev, false);
+	if (IS_ERR(flow_counter)) {
+		esw_warn(esw->dev, "Failed to create tx chk rule flow counter err(%ld)\n", PTR_ERR(flow_counter));
+		err = PTR_ERR(flow_counter);
+		goto out_err;
+	}
+	esw_ipsec_tx_chk_counter(esw) = flow_counter;
+
+	/* Tx Table 3 - check aso_return_reg (set to REG_C_5) */
+	memset(dest, 0, 2 * sizeof(struct mlx5_flow_destination));
+	memset(spec, 0, sizeof(*spec));
+	memset(&flow_act, 0, sizeof(flow_act));
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_c_4);
+	MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_4, 0);
+	spec->flow_context.flow_source = MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
+	spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2;
+	flow_act.flags = FLOW_ACT_NO_APPEND;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
+
+	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+	dest[0].vport.num = MLX5_VPORT_UPLINK;
+	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+	dest[1].counter_id = mlx5_fc_id(esw_ipsec_tx_chk_counter(esw));
+
+	rule = mlx5_add_flow_rules(esw_ipsec_ft_tx_chk(esw), spec, &flow_act, dest, 2);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		esw_warn(esw->dev, "Failed to add IPsec Tx chk rule err=%d\n", err);
+		goto out_err;
+	}
+	esw_ipsec_ft_tx_chk_rule(esw) = rule;
+
+	goto out;
+
+out_err:
+	esw_offloads_ipsec_tables_tx_destroy(esw);
+out:
+	kvfree(spec);
+	kvfree(flow_group_in);
+	return err;
+}
+
+int mlx5_esw_ipsec_get_refcnt(struct mlx5_eswitch *esw)
+{
+	if (esw && esw_ipsec_priv(esw) && !atomic_inc_unless_negative(&esw_ipsec_refcnt(esw)))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+void mlx5_esw_ipsec_put_refcnt(struct mlx5_eswitch *esw)
+{
+	if (esw && esw_ipsec_priv(esw))
+		atomic_dec(&esw_ipsec_refcnt(esw));
+}
+
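+/* Editor's note -- illustrative sketch only, not part of the original OFED
+ * sources: the two helpers above, together with mlx5_esw_ipsec_try_hold()
+ * and mlx5_esw_ipsec_release() further down, form a small hold/teardown
+ * protocol around the IPsec FDB tables.  A caller touching the offload
+ * tables would typically bracket its work roughly like this:
+ *
+ *	if (mlx5_esw_ipsec_get_refcnt(esw))
+ *		return -EOPNOTSUPP;	(tables absent or being torn down)
+ *	ft = mlx5_esw_ipsec_get_table(esw, MLX5_ESW_IPSEC_FT_RX_DECAP);
+ *	... install or remove rules on ft ...
+ *	mlx5_esw_ipsec_put_refcnt(esw);
+ *
+ * mlx5_esw_ipsec_try_hold() only succeeds while no such references are held;
+ * it drives the counter negative so that further get_refcnt() calls fail
+ * until mlx5_esw_ipsec_release() resets it to zero.
+ */
+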
+struct mlx5_flow_table *mlx5_esw_ipsec_get_table(struct mlx5_eswitch *esw, enum mlx5_esw_ipsec_table_type type) +{ + switch (type) { + case MLX5_ESW_IPSEC_FT_RX_CRYPTO: + return esw_ipsec_ft_crypto_rx(esw); + case MLX5_ESW_IPSEC_FT_RX_DECAP: + return esw_ipsec_ft_decap_rx(esw); + case MLX5_ESW_IPSEC_FT_TX_IKE: + return esw_ipsec_ft_ike_tx(esw); + case MLX5_ESW_IPSEC_FT_TX_CRYPTO: + return esw_ipsec_ft_crypto_tx(esw); + case MLX5_ESW_IPSEC_FT_TX_CHK: + return esw_ipsec_ft_tx_chk(esw); + default: return NULL; + } +} + +bool mlx5_esw_ipsec_try_hold(struct mlx5_eswitch *esw) +{ + if (!esw || !esw_ipsec_priv(esw)) + return true; + + return atomic_dec_unless_positive(&esw_ipsec_refcnt(esw)); +} + +void mlx5_esw_ipsec_release(struct mlx5_eswitch *esw) +{ + if (esw && esw_ipsec_priv(esw)) + atomic_set(&esw_ipsec_refcnt(esw), 0); +} + +int mlx5_esw_ipsec_create(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_ipsec_priv *ipsec_priv; + struct mlx5_flow_namespace *ns; + int err; + + if (!mlx5_is_ipsec_device(esw->dev)) + return 0; + + if (esw->offloads.ipsec != DEVLINK_ESWITCH_IPSEC_MODE_FULL) + return 0; + + ipsec_priv = kzalloc(sizeof(*ipsec_priv), GFP_KERNEL); + if (!ipsec_priv) + return -ENOMEM; + + esw_ipsec_priv(esw) = ipsec_priv; + ns = mlx5_get_flow_namespace(esw->dev, MLX5_FLOW_NAMESPACE_FDB); + err = esw_offloads_ipsec_tables_rx_create(ns, esw); + if (err) { + esw_warn(esw->dev, "Failed to create IPsec Rx offloads FDB Tables err %d\n", err); + goto err_rx_create; + } + + err = esw_offloads_ipsec_tables_tx_create(ns, esw); + if (err) { + esw_warn(esw->dev, "Failed to create IPsec Tx offloads FDB Tables err %d\n", err); + goto err_tx_create; + } + + atomic_set(&esw_ipsec_refcnt(esw), 0); + return 0; + +err_tx_create: + esw_offloads_ipsec_tables_rx_destroy(esw); +err_rx_create: + kfree(ipsec_priv); + esw_ipsec_priv(esw) = NULL; + return err; +} + +void mlx5_esw_ipsec_destroy(struct mlx5_eswitch *esw) +{ + if (!mlx5_is_ipsec_device(esw->dev)) + return; + + if (esw->offloads.ipsec != DEVLINK_ESWITCH_IPSEC_MODE_FULL) + return; + + esw_offloads_ipsec_tables_tx_destroy(esw); + esw_offloads_ipsec_tables_rx_destroy(esw); + kfree(esw_ipsec_priv(esw)); + esw_ipsec_priv(esw) = NULL; +} + +bool mlx5_esw_ipsec_is_full_initialized (struct mlx5_eswitch *esw) +{ + return esw && esw_ipsec_priv(esw); +} + +void mlx5_esw_ipsec_full_offload_get_stats(struct mlx5_eswitch *esw, void *ipsec_stats) +{ + struct mlx5e_ipsec_stats *stats; + + stats = (struct mlx5e_ipsec_stats *)ipsec_stats; + + stats->ipsec_full_rx_pkts = 0; + stats->ipsec_full_rx_bytes = 0; + stats->ipsec_full_rx_pkts_drop = 0; + stats->ipsec_full_rx_bytes_drop = 0; + stats->ipsec_full_tx_pkts = 0; + stats->ipsec_full_tx_bytes = 0; + stats->ipsec_full_tx_pkts_drop = 0; + stats->ipsec_full_tx_bytes_drop = 0; + + if (!esw || !esw_ipsec_priv(esw)) + return; + + if (!esw_ipsec_decap_rule_counter(esw) || + !esw_ipsec_decap_miss_rule_counter(esw) || + !esw_ipsec_tx_chk_drop_counter(esw) || + !esw_ipsec_tx_chk_counter(esw)) + return; + + mlx5_fc_query(esw->dev, esw_ipsec_decap_rule_counter(esw), + &stats->ipsec_full_rx_pkts, &stats->ipsec_full_rx_bytes); + + mlx5_fc_query(esw->dev, esw_ipsec_decap_miss_rule_counter(esw), + &stats->ipsec_full_rx_pkts_drop, &stats->ipsec_full_rx_bytes_drop); + + mlx5_fc_query(esw->dev, esw_ipsec_tx_chk_counter(esw), + &stats->ipsec_full_tx_pkts, &stats->ipsec_full_tx_bytes); + + mlx5_fc_query(esw->dev, esw_ipsec_tx_chk_drop_counter(esw), + &stats->ipsec_full_tx_pkts_drop, &stats->ipsec_full_tx_bytes_drop); +} diff --git 
a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h new file mode 100644 index 0000000..3147322 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. */ + +#ifndef __ML5_ESW_IPSEC_H__ +#define __ML5_ESW_IPSEC_H__ + +#include "eswitch.h" +#include "en_accel/ipsec.h" + +enum mlx5_esw_ipsec_table_type { + MLX5_ESW_IPSEC_FT_RX_CRYPTO, + MLX5_ESW_IPSEC_FT_RX_DECAP, + MLX5_ESW_IPSEC_FT_TX_IKE, + MLX5_ESW_IPSEC_FT_TX_CRYPTO, + MLX5_ESW_IPSEC_FT_TX_CHK, +}; + +#if IS_ENABLED(CONFIG_MLX5_EN_IPSEC) +int mlx5_esw_ipsec_create(struct mlx5_eswitch *esw); +void mlx5_esw_ipsec_destroy(struct mlx5_eswitch *esw); +struct mlx5_flow_table *mlx5_esw_ipsec_get_table(struct mlx5_eswitch *esw, enum mlx5_esw_ipsec_table_type type); +bool mlx5_esw_ipsec_is_full_initialized (struct mlx5_eswitch *esw); +int mlx5_esw_ipsec_get_refcnt(struct mlx5_eswitch *esw); +void mlx5_esw_ipsec_put_refcnt(struct mlx5_eswitch *esw); +bool mlx5_esw_ipsec_try_hold(struct mlx5_eswitch *esw); +void mlx5_esw_ipsec_release(struct mlx5_eswitch *esw); +void mlx5_esw_ipsec_full_offload_get_stats(struct mlx5_eswitch *esw, void *ipsec_stats); +static inline int mlx5_is_ipsec_full_offload(struct mlx5e_priv *priv) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + + return esw && (mlx5_eswitch_mode(priv->mdev) == MLX5_ESWITCH_OFFLOADS) && + (esw->offloads.ipsec == DEVLINK_ESWITCH_IPSEC_MODE_FULL); +} + +#else /* CONFIG_MLX5_EN_IPSEC */ + +static inline struct mlx5_flow_table *mlx5_esw_ipsec_get_table(struct mlx5_eswitch *esw, + enum mlx5_esw_ipsec_table_type type) +{ + return NULL; +} +static inline int mlx5_esw_ipsec_create(struct mlx5_eswitch *esw) { return 0; } +static inline void mlx5_esw_ipsec_destroy(struct mlx5_eswitch *esw) {} +static inline bool mlx5_esw_ipsec_try_hold(struct mlx5_eswitch *esw) { return true; } +static inline void mlx5_esw_ipsec_release(struct mlx5_eswitch *esw) { return; } +static inline void +mlx5_esw_ipsec_full_offload_get_stats(struct mlx5_eswitch *esw, void *ipsec_stats) {} + +static inline int mlx5_is_ipsec_full_offload(struct mlx5e_priv *priv) +{ + return 0; +} +#endif /* CONFIG_MLX5_EN_IPSEC */ + +#endif /* __ML5_ESW_IPSEC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c new file mode 100644 index 0000000..64bdf87 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies Ltd */ + +#include +#include +#include +#include +#include +#include "esw/acl/lgcy.h" +#include "esw/legacy.h" +#include "mlx5_core.h" +#include "eswitch.h" +#include "fs_core.h" +#include "fs_ft_pool.h" +#include "esw/qos.h" + +enum { + LEGACY_VEPA_PRIO = 0, + LEGACY_FDB_PRIO, +}; + +static int esw_create_legacy_vepa_table(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *fdb; + int err; + + root_ns = mlx5_get_fdb_sub_ns(dev, 0); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + return -EOPNOTSUPP; + } + + /* num FTE 2, num FG 2 */ + ft_attr.prio = 
LEGACY_VEPA_PRIO; + ft_attr.max_fte = 2; + ft_attr.autogroup.max_num_groups = 2; + fdb = mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create VEPA FDB err %d\n", err); + return err; + } + esw->fdb_table.legacy.vepa_fdb = fdb; + + return 0; +} + +static void esw_destroy_legacy_fdb_table(struct mlx5_eswitch *esw) +{ + esw_debug(esw->dev, "Destroy FDB Table\n"); + if (!esw->fdb_table.legacy.fdb) + return; + + if (esw->fdb_table.legacy.promisc_grp) + mlx5_destroy_flow_group(esw->fdb_table.legacy.promisc_grp); + if (esw->fdb_table.legacy.allmulti_grp) + mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp); + if (esw->fdb_table.legacy.addr_grp) + mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp); + mlx5_destroy_flow_table(esw->fdb_table.legacy.fdb); + + esw->fdb_table.legacy.fdb = NULL; + esw->fdb_table.legacy.addr_grp = NULL; + esw->fdb_table.legacy.allmulti_grp = NULL; + esw->fdb_table.legacy.promisc_grp = NULL; + atomic64_set(&esw->user_count, 0); +} + +static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *fdb; + struct mlx5_flow_group *g; + void *match_criteria; + int table_size; + u32 *flow_group_in; + u8 *dmac; + int err = 0; + + esw_debug(dev, "Create FDB log_max_size(%d)\n", + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); + + root_ns = mlx5_get_fdb_sub_ns(dev, 0); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + return -EOPNOTSUPP; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + ft_attr.max_fte = POOL_NEXT_SIZE; + ft_attr.prio = LEGACY_FDB_PRIO; + fdb = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create FDB Table err %d\n", err); + goto out; + } + esw->fdb_table.legacy.fdb = fdb; + table_size = fdb->max_fte; + + /* Addresses group : Full match unicast/multicast addresses */ + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + dmac = MLX5_ADDR_OF(fte_match_param, match_criteria, outer_headers.dmac_47_16); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + /* Preserve 2 entries for allmulti and promisc rules*/ + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 3); + eth_broadcast_addr(dmac); + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.legacy.addr_grp = g; + + /* Allmulti group : One rule that forwards any mcast traffic */ + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, table_size - 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 2); + eth_zero_addr(dmac); + dmac[0] = 0x01; + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create allmulti flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.legacy.allmulti_grp = g; + + /* Promiscuous group : + * One rule that forward all unmatched traffic from previous 
groups + */ + eth_zero_addr(dmac); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_port); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, table_size - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 1); + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create promisc flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.legacy.promisc_grp = g; + +out: + if (err) + esw_destroy_legacy_fdb_table(esw); + + kvfree(flow_group_in); + return err; +} + +static void esw_destroy_legacy_vepa_table(struct mlx5_eswitch *esw) +{ + esw_debug(esw->dev, "Destroy VEPA Table\n"); + if (!esw->fdb_table.legacy.vepa_fdb) + return; + + mlx5_destroy_flow_table(esw->fdb_table.legacy.vepa_fdb); + esw->fdb_table.legacy.vepa_fdb = NULL; +} + +static int esw_create_legacy_table(struct mlx5_eswitch *esw) +{ + int err; + + memset(&esw->fdb_table.legacy, 0, sizeof(struct legacy_fdb)); + atomic64_set(&esw->user_count, 0); + + err = esw_create_legacy_vepa_table(esw); + if (err) + return err; + + err = esw_create_legacy_fdb_table(esw); + if (err) + esw_destroy_legacy_vepa_table(esw); + + return err; +} + +static void esw_cleanup_vepa_rules(struct mlx5_eswitch *esw) +{ + if (esw->fdb_table.legacy.vepa_uplink_rule) + mlx5_del_flow_rules(esw->fdb_table.legacy.vepa_uplink_rule); + + if (esw->fdb_table.legacy.vepa_star_rule) + mlx5_del_flow_rules(esw->fdb_table.legacy.vepa_star_rule); + + esw->fdb_table.legacy.vepa_uplink_rule = NULL; + esw->fdb_table.legacy.vepa_star_rule = NULL; +} + +static void esw_destroy_legacy_table(struct mlx5_eswitch *esw) +{ + esw_cleanup_vepa_rules(esw); + esw_destroy_legacy_fdb_table(esw); + esw_destroy_legacy_vepa_table(esw); +} + +#define MLX5_LEGACY_SRIOV_VPORT_EVENTS (MLX5_VPORT_UC_ADDR_CHANGE | \ + MLX5_VPORT_MC_ADDR_CHANGE | \ + MLX5_VPORT_PROMISC_CHANGE) + +int esw_legacy_enable(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + int ret; + + ret = esw_create_legacy_table(esw); + if (ret) + return ret; + + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + + ret = mlx5_eswitch_enable_pf_vf_vports(esw, MLX5_LEGACY_SRIOV_VPORT_EVENTS); + if (ret) + esw_destroy_legacy_table(esw); + return ret; +} + +void esw_legacy_disable(struct mlx5_eswitch *esw) +{ + struct esw_mc_addr *mc_promisc; + + mlx5_eswitch_disable_pf_vf_vports(esw); + + mc_promisc = &esw->mc_promisc; + if (mc_promisc->uplink_rule) + mlx5_del_flow_rules(mc_promisc->uplink_rule); + + esw_destroy_legacy_table(esw); +} + +static int _mlx5_eswitch_set_vepa_locked(struct mlx5_eswitch *esw, + u8 setting) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + int err = 0; + void *misc; + + if (!setting) { + esw_cleanup_vepa_rules(esw); + return 0; + } + + if (esw->fdb_table.legacy.vepa_uplink_rule) + return 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + /* Uplink rule forward uplink traffic to FDB */ + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_UPLINK); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, 
misc, source_port); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = esw->fdb_table.legacy.fdb; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_rule = mlx5_add_flow_rules(esw->fdb_table.legacy.vepa_fdb, spec, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + goto out; + } else { + esw->fdb_table.legacy.vepa_uplink_rule = flow_rule; + } + + /* Star rule to forward all traffic to uplink vport */ + memset(&dest, 0, sizeof(dest)); + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport.num = MLX5_VPORT_UPLINK; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_rule = mlx5_add_flow_rules(esw->fdb_table.legacy.vepa_fdb, NULL, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + goto out; + } else { + esw->fdb_table.legacy.vepa_star_rule = flow_rule; + } + +out: + kvfree(spec); + if (err) + esw_cleanup_vepa_rules(esw); + return err; +} + +int mlx5_eswitch_set_vepa(struct mlx5_eswitch *esw, u8 setting) +{ + int err = 0; + + if (!esw) + return -EOPNOTSUPP; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + mutex_lock(&esw->state_lock); + if (esw->mode != MLX5_ESWITCH_LEGACY) { + err = -EOPNOTSUPP; + goto out; + } + + err = _mlx5_eswitch_set_vepa_locked(esw, setting); + +out: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_get_vepa(struct mlx5_eswitch *esw, u8 *setting) +{ + if (!esw) + return -EOPNOTSUPP; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + if (esw->mode != MLX5_ESWITCH_LEGACY) + return -EOPNOTSUPP; + + *setting = esw->fdb_table.legacy.vepa_uplink_rule ? 1 : 0; + return 0; +} + +int esw_legacy_vport_acl_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + int ret; + + /* Only non manager vports need ACL in legacy mode */ + if (mlx5_esw_is_manager_vport(esw, vport->vport)) + return 0; + + ret = esw_acl_ingress_lgcy_setup(esw, vport); + if (ret) + goto ingress_err; + + ret = esw_acl_egress_lgcy_setup(esw, vport); + if (ret) + goto egress_err; + + return 0; + +egress_err: + esw_acl_ingress_lgcy_cleanup(esw, vport); +ingress_err: + return ret; +} + +void esw_legacy_vport_acl_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + if (mlx5_esw_is_manager_vport(esw, vport->vport)) + return; + + esw_acl_egress_lgcy_cleanup(esw, vport); + esw_acl_ingress_lgcy_cleanup(esw, vport); +} + +int mlx5_esw_query_vport_drop_stats(struct mlx5_core_dev *dev, + struct mlx5_vport *vport, + struct mlx5_vport_drop_stats *stats) +{ + u64 rx_discard_vport_down, tx_discard_vport_down; + struct mlx5_eswitch *esw = dev->priv.eswitch; + u64 bytes = 0; + int err = 0; + + if (esw->mode != MLX5_ESWITCH_LEGACY) + return 0; + + mutex_lock(&esw->state_lock); + if (!vport->enabled) + goto unlock; + + if (!IS_ERR_OR_NULL(vport->egress.legacy.drop_counter)) + mlx5_fc_query(dev, vport->egress.legacy.drop_counter, + &stats->rx_dropped, &bytes); + + if (vport->ingress.legacy.drop_counter) + mlx5_fc_query(dev, vport->ingress.legacy.drop_counter, + &stats->tx_dropped, &bytes); + + if (!MLX5_CAP_GEN(dev, receive_discard_vport_down) && + !MLX5_CAP_GEN(dev, transmit_discard_vport_down)) + goto unlock; + + err = mlx5_query_vport_down_stats(dev, vport->vport, 1, + &rx_discard_vport_down, + &tx_discard_vport_down); + if (err) + goto unlock; + + if (MLX5_CAP_GEN(dev, receive_discard_vport_down)) + stats->rx_dropped += rx_discard_vport_down; + if (MLX5_CAP_GEN(dev, transmit_discard_vport_down)) + stats->tx_dropped += 
tx_discard_vport_down; + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, int vport, + u16 vlan, u8 qos, __be16 vlan_proto) +{ + u8 set_flags = 0; + int err = 0; + + if (!mlx5_esw_allowed(esw)) + return vlan ? -EPERM : 0; + + if (vlan || qos) + set_flags = SET_VLAN_STRIP | SET_VLAN_INSERT; + + mutex_lock(&esw->state_lock); + if (esw->mode != MLX5_ESWITCH_LEGACY) { + if (!vlan) + goto unlock; /* compatibility with libvirt */ + + err = -EOPNOTSUPP; + goto unlock; + } + + err = __mlx5_eswitch_set_vport_vlan(esw, vport, vlan, qos, vlan_proto, set_flags); + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw, + u16 vport, bool spoofchk) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + bool pschk; + int err = 0; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + if (IS_ERR(evport)) + return PTR_ERR(evport); + + mutex_lock(&esw->state_lock); + if (esw->mode != MLX5_ESWITCH_LEGACY) { + err = -EOPNOTSUPP; + goto unlock; + } + pschk = evport->info.spoofchk; + evport->info.spoofchk = spoofchk; + if (pschk && !is_valid_ether_addr(evport->info.mac)) + mlx5_core_warn(esw->dev, + "Spoofchk in set while MAC is invalid, vport(%d)\n", + evport->vport); + if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) + err = esw_acl_ingress_lgcy_setup(esw, evport); + if (err) + evport->info.spoofchk = pschk; + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw, + u16 vport, bool setting) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + int err = 0; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + if (IS_ERR(evport)) + return PTR_ERR(evport); + + mutex_lock(&esw->state_lock); + if (esw->mode != MLX5_ESWITCH_LEGACY) { + err = -EOPNOTSUPP; + goto unlock; + } + evport->info.trusted = setting; + if (evport->enabled) + esw_vport_change_handle_locked(evport); + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, u16 vport, + u32 max_rate, u32 min_rate) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + int err; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + if (IS_ERR(evport)) + return PTR_ERR(evport); + + mutex_lock(&esw->state_lock); + err = mlx5_esw_qos_set_vport_rate(esw, evport, max_rate, min_rate); + mutex_unlock(&esw->state_lock); + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.h new file mode 100644 index 0000000..e0820bb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/legacy.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies Ltd */ + +#ifndef __MLX5_ESW_LEGACY_H__ +#define __MLX5_ESW_LEGACY_H__ + +#define MLX5_LEGACY_SRIOV_VPORT_EVENTS (MLX5_VPORT_UC_ADDR_CHANGE | \ + MLX5_VPORT_MC_ADDR_CHANGE | \ + MLX5_VPORT_PROMISC_CHANGE) + +struct mlx5_eswitch; + +int esw_legacy_enable(struct mlx5_eswitch *esw); +void esw_legacy_disable(struct mlx5_eswitch *esw); + +int esw_legacy_vport_acl_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +void esw_legacy_vport_acl_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + +int mlx5_esw_query_vport_drop_stats(struct mlx5_core_dev *dev, + struct mlx5_vport *vport, + struct 
mlx5_vport_drop_stats *stats); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/pet_offloads.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/pet_offloads.c new file mode 100644 index 0000000..1fd0ff2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/pet_offloads.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021 Mellanox Technologies +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include +#include "esw/acl/ofld.h" +#include "accel/ipsec_offload.h" +#include "esw/ipsec.h" +#include "esw/indir_table.h" +#include "rdma.h" +#include "en.h" +#include "fs_core.h" +#include "lib/devcom.h" +#include "lib/eq.h" +#include "lib/fs_chains.h" +#include "en_tc.h" +#include "en_rep.h" + +#ifdef CONFIG_MLX5_ESWITCH + +bool mlx5_eswitch_pet_insert_allowed(const struct mlx5_eswitch *esw) +{ + return !!(esw->flags & MLX5_ESWITCH_PET_INSERT); +} + +bool mlx5e_esw_offloads_pet_supported(const struct mlx5_eswitch *esw) +{ + if (MLX5_CAP_GEN_2(esw->dev, max_reformat_insert_size) && + MLX5_CAP_GEN_2(esw->dev, non_tunnel_reformat)) + return true; + + return false; +} + +bool mlx5e_esw_offloads_pet_enabled(const struct mlx5_eswitch *esw) +{ + if (!mlx5_eswitch_pet_insert_allowed(esw)) + return false; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) + return false; + + return true; +} + +static int mlx5_pet_create_ft(struct mlx5_eswitch *esw, struct mlx5_flow_table **ft, int size) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + int err; + + ns = mlx5_get_flow_namespace(esw->dev, MLX5_FLOW_NAMESPACE_KERNEL); + if (!ns) { + esw_warn(esw->dev, "Failed to get FDB flow namespace\n"); + return -EOPNOTSUPP; + } + + ft_attr.max_fte = size; + ft_attr.prio = 1; + + *ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(*ft)) { + err = PTR_ERR(*ft); + *ft = NULL; + esw_warn(esw->dev, "Failed to create flow table - err %d\n", err); + return err; + } + + return 0; +} + +static void mlx5_pet_destroy_ft(struct mlx5_eswitch *esw, struct mlx5_flow_table *ft) +{ + if (!ft) + return; + + mlx5_destroy_flow_table(ft); +} + +static int mlx5_pet_create_fg(struct mlx5_eswitch *esw, + struct mlx5_flow_table *ft, + struct mlx5_flow_group **fg) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + u32 *flow_group_in; + int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + memset(flow_group_in, 0, inlen); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + *fg = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(*fg)) { + err = PTR_ERR(*fg); + mlx5_core_warn(esw->dev, "Failed to create flowgroup with err %d\n", err); + *fg = NULL; + goto out; + } + +out: + kvfree(flow_group_in); + return err; +} + +static void mlx5_pet_destroy_fg(struct mlx5_eswitch *esw, struct mlx5_flow_group *fg) +{ + if (!fg) + return; + mlx5_destroy_flow_group(fg); +} + +int mlx5_pet_push_hdr_ft(struct mlx5_eswitch *esw) +{ + int err; + + err = mlx5_pet_create_ft(esw, + &esw->offloads.pet_vport_action.push_pet_hdr.ft, 2); + if (err) { + mlx5_core_warn(esw->dev, "failed with err %d\n", err); + return err; + } + + return 0; +} + +void mlx5_pet_push_hdr_ft_cleanup(struct mlx5_eswitch *esw) +{ + mlx5_pet_destroy_ft(esw, esw->offloads.pet_vport_action.push_pet_hdr.ft); +} + +int 
mlx5_pet_push_hdr_rule(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_table *ft = esw->offloads.pet_vport_action.push_pet_hdr.ft; + int mlnx_ether = htons(MLX5_CAP_GEN(esw->dev, mlnx_tag_ethertype)); + struct mlx5_pkt_reformat_params reformat_params; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + struct mlx5_flow_group *fg; + int reformat_type; + char *reformat_buf; + int buf_offset = 12; + int buf_size = 8; + int err; + + reformat_buf = kzalloc(buf_size, GFP_KERNEL); + if (!reformat_buf) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto err_alloc; + } + + err = mlx5_pet_create_fg(esw, ft, &fg); + if (err) { + mlx5_core_warn(esw->dev, "failed with err %d\n", err); + goto err_create_group; + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = esw->offloads.pet_vport_action.copy_data_to_pet_hdr.ft; + + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + reformat_type = MLX5_REFORMAT_TYPE_INSERT_HDR; + + memcpy(reformat_buf, &mlnx_ether, 2); + + reformat_params.type = reformat_type; + reformat_params.param_0 = MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START; + reformat_params.param_1 = buf_offset; + reformat_params.size = buf_size; + reformat_params.data = reformat_buf; + flow_act.pkt_reformat = mlx5_packet_reformat_alloc(esw->dev, &reformat_params, + MLX5_FLOW_NAMESPACE_KERNEL); + if (IS_ERR(flow_act.pkt_reformat)) { + err = PTR_ERR(flow_act.pkt_reformat); + mlx5_core_err(esw->dev, "packet reformat alloc err %d\n", err); + goto err_pkt_reformat; + } + + flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + mlx5_core_err(esw->dev, "Failed to add flow rule for insert header, err %d\n", err); + goto err_flow_rule; + } + + esw->offloads.pet_vport_action.push_pet_hdr.fg = fg; + esw->offloads.pet_vport_action.push_pet_hdr.rule = flow_rule; + esw->offloads.pet_vport_action.push_pet_hdr.pkt_reformat = flow_act.pkt_reformat; + kvfree(spec); + kvfree(reformat_buf); + return 0; + +err_flow_rule: + mlx5_packet_reformat_dealloc(esw->dev, flow_act.pkt_reformat); +err_pkt_reformat: + mlx5_pet_destroy_fg(esw, fg); +err_create_group: + kvfree(spec); +err_alloc: + kvfree(reformat_buf); + return err; +} + +void mlx5_pet_push_hdr_rule_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_pet_actions pet_action = esw->offloads.pet_vport_action.push_pet_hdr; + + if (!pet_action.rule) + return; + + mlx5_del_flow_rules(pet_action.rule); + mlx5_packet_reformat_dealloc(esw->dev, pet_action.pkt_reformat); + mlx5_pet_destroy_fg(esw, pet_action.fg); +} + +int mlx5_pet_copy_data_ft(struct mlx5_eswitch *esw) +{ + int err; + + err = mlx5_pet_create_ft(esw, &esw->offloads.pet_vport_action.copy_data_to_pet_hdr.ft, 2); + if (err) { + mlx5_core_warn(esw->dev, "failed with err %d\n", err); + return err; + } + + return 0; +} + +void mlx5_pet_copy_data_ft_cleanup(struct mlx5_eswitch *esw) +{ + mlx5_pet_destroy_ft(esw, esw->offloads.pet_vport_action.copy_data_to_pet_hdr.ft); +} + +int mlx5_pet_copy_data_rule(struct mlx5_eswitch *esw, struct mlx5_flow_table *dest_ft) +{ + struct mlx5_flow_table *ft = esw->offloads.pet_vport_action.copy_data_to_pet_hdr.ft; + u8 action[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct 
mlx5_flow_handle *flow_rule; + struct mlx5_modify_hdr *modify_hdr; + struct mlx5_flow_spec *spec; + struct mlx5_flow_group *fg; + int err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + err = mlx5_pet_create_fg(esw, ft, &fg); + if (err) { + mlx5_core_warn(esw->dev, "failed with err %d\n", err); + goto err_create_group; + } + + MLX5_SET(copy_action_in, action, action_type, MLX5_ACTION_TYPE_COPY); + MLX5_SET(copy_action_in, action, src_field, MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); + MLX5_SET(copy_action_in, action, src_offset, 16); + MLX5_SET(copy_action_in, action, dst_field, MLX5_ACTION_IN_FIELD_OUT_EMD_47_32); + MLX5_SET(copy_action_in, action, dst_offset, 0); + MLX5_SET(copy_action_in, action, length, 16); + + modify_hdr = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_KERNEL, + 1, action); + if (IS_ERR(modify_hdr)) { + err = PTR_ERR(modify_hdr); + mlx5_core_warn(esw->dev, "modify header alloc failed with err %d\n", err); + modify_hdr = NULL; + goto header_alloc_fail; + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = dest_ft; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_act.modify_hdr = modify_hdr; + + flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + mlx5_core_warn(esw->dev, "add rule failed with err %d\n", err); + flow_rule = NULL; + goto add_flow_rule_fail; + } + + esw->offloads.pet_vport_action.copy_data_to_pet_hdr.fg = fg; + esw->offloads.pet_vport_action.copy_data_to_pet_hdr.hdr = modify_hdr; + esw->offloads.pet_vport_action.copy_data_to_pet_hdr.rule = flow_rule; + kvfree(spec); + return 0; + +add_flow_rule_fail: + mlx5_modify_header_dealloc(esw->dev, modify_hdr); +header_alloc_fail: + mlx5_pet_destroy_fg(esw, fg); +err_create_group: + kvfree(spec); + return err; +} + +void mlx5_pet_copy_data_rule_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_pet_actions pet_action = esw->offloads.pet_vport_action.copy_data_to_pet_hdr; + + if (!pet_action.rule) + return; + + mlx5_del_flow_rules(pet_action.rule); + mlx5_modify_header_dealloc(esw->dev, pet_action.hdr); + pet_action.rule = NULL; + + mlx5_pet_destroy_fg(esw, pet_action.fg); +} + +/* Setup 2 flowtables - One to insert PET header. this will + * be a 8 byte buffer with first 2 bytes containg + * FW provided ethertype as part of it. Second flowtable + * to copy vport id from reg_c_0 right after the FW + * provided ethertype. All packets going thru FDB slow path + * will be tagged with this header. 
+ */ +int mlx5e_esw_offloads_pet_setup(struct mlx5_eswitch *esw, struct mlx5_flow_table *ft) +{ + int err; + + if (!mlx5e_esw_offloads_pet_enabled(esw)) + return 0; + + err = mlx5_pet_push_hdr_ft(esw); + if (err) + return err; + + err = mlx5_pet_copy_data_ft(esw); + if (err) + goto err_copy_data_ft; + + err = mlx5_pet_push_hdr_rule(esw); + if (err) + goto err_push_hdr_rule; + + err = mlx5_pet_copy_data_rule(esw, ft); + if (err) + goto err_copy_data_rule; + + return 0; + +err_copy_data_rule: + mlx5_pet_push_hdr_rule_cleanup(esw); +err_push_hdr_rule: + mlx5_pet_copy_data_ft_cleanup(esw); +err_copy_data_ft: + mlx5_pet_push_hdr_ft_cleanup(esw); + return err; +} + +void mlx5e_esw_offloads_pet_cleanup(struct mlx5_eswitch *esw) +{ + if (!mlx5e_esw_offloads_pet_enabled(esw)) + return; + + mlx5_pet_copy_data_rule_cleanup(esw); + mlx5_pet_push_hdr_rule_cleanup(esw); + + mlx5_pet_copy_data_ft_cleanup(esw); + mlx5_pet_push_hdr_ft_cleanup(esw); +} + +int mlx5_esw_offloads_pet_insert_set(struct mlx5_eswitch *esw, bool enable) +{ + int err = 0; + + down_write(&esw->mode_lock); + if (esw->mode >= MLX5_ESWITCH_OFFLOADS) { + err = -EOPNOTSUPP; + goto done; + } + if (!mlx5e_esw_offloads_pet_supported(esw)) { + err = -EOPNOTSUPP; + goto done; + } + if (enable) + esw->flags |= MLX5_ESWITCH_PET_INSERT; + else + esw->flags &= ~MLX5_ESWITCH_PET_INSERT; + +done: + up_write(&esw->mode_lock); + return err; +} +#endif /* CONFIG_MLX5_ESWITCH */ + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c new file mode 100644 index 0000000..9d1dcca --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -0,0 +1,1136 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include "eswitch.h" +#include "esw/qos.h" +#include "en/port.h" +#define CREATE_TRACE_POINTS +#include "diag/qos_tracepoint.h" + +/* Minimum supported BW share value by the HW is 1 Mbit/sec */ +#define MLX5_MIN_BW_SHARE 1 + +#define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \ + min_t(u32, max_t(u32, DIV_ROUND_UP(rate, divider), MLX5_MIN_BW_SHARE), limit) + +static int esw_qos_tsar_config(struct mlx5_core_dev *dev, u32 *sched_ctx, + u32 parent_ix, u32 tsar_ix, + u32 max_rate, u32 bw_share) +{ + u32 bitmask = 0; + + if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling)) + return -EOPNOTSUPP; + + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_ix); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW; + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE; + + return mlx5_modify_scheduling_element_cmd(dev, + SCHEDULING_HIERARCHY_E_SWITCH, + sched_ctx, + tsar_ix, + bitmask); +} + +static int esw_qos_group_config(struct mlx5_eswitch *esw, struct mlx5_esw_rate_group *group, + u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_core_dev *dev = esw->dev; + int err; + + err = esw_qos_tsar_config(dev, sched_ctx, + esw->qos.root_tsar_ix, group->tsar_ix, + max_rate, bw_share); + if (err) + NL_SET_ERR_MSG_MOD(extack, "E-Switch modify group TSAR element failed"); + + trace_mlx5_esw_group_qos_config(dev, group, group->tsar_ix, bw_share, max_rate); + + return err; +} + +int +mlx5_esw_get_esw_and_vport(struct devlink *devlink, struct devlink_port *port, + struct mlx5_eswitch **esw, struct mlx5_vport **vport, + struct netlink_ext_ack *extack) +{ + u16 vport_num; + + *esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(*esw)) { + NL_SET_ERR_MSG_MOD(extack, "Esw not found"); + return PTR_ERR(*esw); + } + + vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index); + *vport = mlx5_eswitch_get_vport(*esw, vport_num); + if (IS_ERR(*vport)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to get vport"); + return PTR_ERR(*vport); + } + + return 0; +} + +static int esw_qos_vport_config(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + u32 max_rate, u32 bw_share, + struct netlink_ext_ack *extack) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_esw_rate_group *group = vport->qos.group; + struct mlx5_core_dev *dev = esw->dev; + u32 parent_tsar_ix; + void *vport_elem; + int err; + + if (!vport->qos.enabled) + return -EIO; + + parent_tsar_ix = group ? 
group->tsar_ix : esw->qos.root_tsar_ix; + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT); + vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx, + element_attributes); + MLX5_SET(vport_element, vport_elem, vport_number, vport->vport); + + err = esw_qos_tsar_config(dev, sched_ctx, parent_tsar_ix, vport->qos.esw_tsar_ix, + max_rate, bw_share); + if (err) { + esw_warn(esw->dev, + "E-Switch modify TSAR vport element failed (vport=%d,err=%d)\n", + vport->vport, err); + NL_SET_ERR_MSG_MOD(extack, "E-Switch modify TSAR vport element failed"); + return err; + } + + trace_mlx5_esw_vport_qos_config(vport, bw_share, max_rate); + + return 0; +} + +static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + bool group_level) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + struct mlx5_vport *evport; + u32 max_guarantee = 0; + unsigned long i; + + if (group_level) { + struct mlx5_esw_rate_group *group; + + list_for_each_entry(group, &esw->qos.groups, list) { + if (group->min_rate < max_guarantee) + continue; + max_guarantee = group->min_rate; + } + } else { + mlx5_esw_for_each_vport(esw, i, evport) { + if (!evport->enabled || !evport->qos.enabled || + evport->qos.group != group || evport->qos.min_rate < max_guarantee) + continue; + max_guarantee = evport->qos.min_rate; + } + } + + if (max_guarantee) + return max_t(u32, max_guarantee / fw_max_bw_share, 1); + + /* If vports min rate divider is 0 but their group has bw_share configured, then + * need to set bw_share for vports to minimal value. + */ + if (!group_level && !max_guarantee && group && group->bw_share) + return 1; + return 0; +} + +static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max) +{ + if (divider) + return MLX5_RATE_TO_BW_SHARE(min_rate, divider, fw_max); + + return 0; +} + +static int esw_qos_normalize_vports_min_rate(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + struct netlink_ext_ack *extack) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + u32 divider = esw_qos_calculate_min_rate_divider(esw, group, false); + struct mlx5_vport *evport; + unsigned long i; + u32 bw_share; + int err; + + mlx5_esw_for_each_vport(esw, i, evport) { + if (!evport->enabled || !evport->qos.enabled || evport->qos.group != group) + continue; + bw_share = esw_qos_calc_bw_share(evport->qos.min_rate, divider, fw_max_bw_share); + + if (bw_share == evport->qos.bw_share) + continue; + + err = esw_qos_vport_config(esw, evport, evport->qos.max_rate, bw_share, extack); + if (err) + return err; + + evport->qos.bw_share = bw_share; + } + + return 0; +} + +static int esw_qos_normalize_groups_min_rate(struct mlx5_eswitch *esw, u32 divider, + struct netlink_ext_ack *extack) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + struct mlx5_esw_rate_group *group; + u32 bw_share; + int err; + + list_for_each_entry(group, &esw->qos.groups, list) { + bw_share = esw_qos_calc_bw_share(group->min_rate, divider, fw_max_bw_share); + + if (bw_share == group->bw_share) + continue; + + err = esw_qos_group_config(esw, group, group->max_rate, bw_share, extack); + if (err) + return err; + + group->bw_share = bw_share; + + /* All the group's vports need to be set with default bw_share + * to enable them with QOS + */ + err = esw_qos_normalize_vports_min_rate(esw, group, extack); + + if (err) + return err; + } + + return 0; +} + +int esw_qos_set_vport_min_rate(struct mlx5_eswitch *esw, struct 
mlx5_vport *evport, + u32 min_rate, struct netlink_ext_ack *extack) +{ + u32 fw_max_bw_share, previous_min_rate; + bool min_rate_supported; + int err; + + lockdep_assert_held(&esw->state_lock); + fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + min_rate_supported = MLX5_CAP_QOS(esw->dev, esw_bw_share) && + fw_max_bw_share >= MLX5_MIN_BW_SHARE; + if (min_rate && !min_rate_supported) + return -EOPNOTSUPP; + if (min_rate == evport->qos.min_rate) + return 0; + if (!evport->enabled) { + return -EPERM; + } + + previous_min_rate = evport->qos.min_rate; + evport->qos.min_rate = min_rate; + err = esw_qos_normalize_vports_min_rate(esw, evport->qos.group, extack); + if (err) + evport->qos.min_rate = previous_min_rate; + + return err; +} + +int esw_qos_set_vport_max_rate(struct mlx5_eswitch *esw, struct mlx5_vport *evport, + u32 max_rate, struct netlink_ext_ack *extack) +{ + u32 act_max_rate = max_rate; + bool max_rate_supported; + int err; + + lockdep_assert_held(&esw->state_lock); + max_rate_supported = MLX5_CAP_QOS(esw->dev, esw_rate_limit); + + if (max_rate && !max_rate_supported) + return -EOPNOTSUPP; + if (max_rate == evport->qos.max_rate) + return 0; + + /* If parent group has rate limit need to set to group + * value when new max rate is 0. + */ + if (evport->qos.group && !max_rate) + act_max_rate = evport->qos.group->max_rate; + + err = esw_qos_vport_config(esw, evport, act_max_rate, evport->qos.bw_share, extack); + + if (!err) + evport->qos.max_rate = max_rate; + + return err; +} + +int esw_qos_set_group_min_rate(struct mlx5_eswitch *esw, struct mlx5_esw_rate_group *group, + u32 min_rate, struct netlink_ext_ack *extack) +{ + u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share); + struct mlx5_core_dev *dev = esw->dev; + u32 previous_min_rate, divider; + int err; + + if (!(MLX5_CAP_QOS(dev, esw_bw_share) && fw_max_bw_share >= MLX5_MIN_BW_SHARE)) + return -EOPNOTSUPP; + + if (min_rate == group->min_rate) + return 0; + + previous_min_rate = group->min_rate; + group->min_rate = min_rate; + divider = esw_qos_calculate_min_rate_divider(esw, group, true); + err = esw_qos_normalize_groups_min_rate(esw, divider, extack); + if (err) { + group->min_rate = previous_min_rate; + NL_SET_ERR_MSG_MOD(extack, "E-Switch group min rate setting failed"); + + /* Attempt restoring previous configuration */ + divider = esw_qos_calculate_min_rate_divider(esw, group, true); + if (esw_qos_normalize_groups_min_rate(esw, divider, extack)) + NL_SET_ERR_MSG_MOD(extack, "E-Switch BW share restore failed"); + } + + return err; +} + +int esw_qos_set_group_max_rate(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + u32 max_rate, struct netlink_ext_ack *extack) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + if (group->max_rate == max_rate) + return 0; + + err = esw_qos_group_config(esw, group, max_rate, group->bw_share, extack); + if (err) + return err; + + group->max_rate = max_rate; + + /* Any unlimited vports in the group should be set + * with the value of the group. 
+ */ + mlx5_esw_for_each_vport(esw, i, vport) { + if (!vport->enabled || !vport->qos.enabled || + vport->qos.group != group || vport->qos.max_rate) + continue; + + err = esw_qos_vport_config(esw, vport, max_rate, vport->qos.bw_share, extack); + if (err) + NL_SET_ERR_MSG_MOD(extack, + "E-Switch vport implicit rate limit setting failed"); + } + + return err; +} + +static int esw_qos_vport_create_sched_element(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + u32 max_rate, u32 bw_share) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_esw_rate_group *group = vport->qos.group; + struct mlx5_core_dev *dev = esw->dev; + u32 parent_tsar_ix; + void *vport_elem; + int err; + + parent_tsar_ix = group ? group->tsar_ix : esw->qos.root_tsar_ix; + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT); + vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); + MLX5_SET(vport_element, vport_elem, vport_number, vport->vport); + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_tsar_ix); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + + err = mlx5_create_scheduling_element_cmd(dev, + SCHEDULING_HIERARCHY_E_SWITCH, + sched_ctx, + &vport->qos.esw_tsar_ix); + if (err) { + esw_warn(esw->dev, "E-Switch create TSAR vport element failed (vport=%d,err=%d)\n", + vport->vport, err); + return err; + } + + return 0; +} + +static void esw_qos_destroy_sysfs_rate_group(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_esw_rate_group *group) +{ + if (!group) + return; + if (group == esw->qos.group0) + return; + + if (vport->vport != MLX5_VPORT_PF) + group->num_vports--; + if (group->group_id && !group->num_vports && + !mlx5_esw_is_sf_vport(esw, vport->vport)) + esw_qos_destroy_rate_group(esw, group, NULL); +} + +static int esw_qos_update_group_scheduling_element(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_esw_rate_group *curr_group, + struct mlx5_esw_rate_group *new_group, + struct netlink_ext_ack *extack) +{ + u32 max_rate; + int err; + + err = mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + vport->qos.esw_tsar_ix); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR vport element failed"); + return err; + } + + vport->qos.group = new_group; + max_rate = vport->qos.max_rate ? vport->qos.max_rate : new_group->max_rate; + + /* If vport is unlimited, we set the group's value. + * Therefore, if the group is limited it will apply to + * the vport as well and if not, vport will remain unlimited. + */ + err = esw_qos_vport_create_sched_element(esw, vport, max_rate, vport->qos.bw_share); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch vport group set failed."); + goto err_sched; + } + + return 0; + +err_sched: + vport->qos.group = curr_group; + max_rate = vport->qos.max_rate ? 
vport->qos.max_rate : curr_group->max_rate; + if (esw_qos_vport_create_sched_element(esw, vport, max_rate, vport->qos.bw_share)) + esw_warn(esw->dev, "E-Switch vport group restore failed (vport=%d)\n", + vport->vport); + + return err; +} + +static int esw_qos_vport_update_group(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_esw_rate_group *group, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *new_group, *curr_group; + int err; + + if (!vport->enabled) + return -EINVAL; + + curr_group = vport->qos.group; + new_group = group ?: esw->qos.group0; + if (curr_group == new_group) + return 0; + + err = esw_qos_update_group_scheduling_element(esw, vport, curr_group, new_group, extack); + if (err) + return err; + + /* Recalculate bw share weights of old and new groups */ + if (vport->qos.bw_share || new_group->bw_share) { + esw_qos_normalize_vports_min_rate(esw, curr_group, extack); + esw_qos_normalize_vports_min_rate(esw, new_group, extack); + } + + return 0; +} + +static struct mlx5_esw_rate_group * +__esw_qos_create_rate_group(struct mlx5_eswitch *esw, u32 group_id, + struct netlink_ext_ack *extack) +{ + u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_esw_rate_group *group; + u32 divider; + int err; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + group->group_id = group_id; + MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, + esw->qos.root_tsar_ix); + err = mlx5_create_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + tsar_ctx, + &group->tsar_ix); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch create TSAR for group failed"); + goto err_sched_elem; + } + + if (group_id != MLX5_ESW_QOS_NON_SYSFS_GROUP) { + group->dev = esw->dev; + err = mlx5_create_vf_group_sysfs(esw->dev, group_id, &group->kobj); + if (err) + goto err_sched_elem; + } + + list_add_tail(&group->list, &esw->qos.groups); + + divider = esw_qos_calculate_min_rate_divider(esw, group, true); + if (divider) { + err = esw_qos_normalize_groups_min_rate(esw, divider, extack); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "E-Switch groups normalization failed"); + goto err_min_rate; + } + } + trace_mlx5_esw_group_qos_create(esw->dev, group, group->tsar_ix); + + return group; + +err_min_rate: + list_del(&group->list); + if (mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + group->tsar_ix)) + NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR for group failed"); + if (group_id != MLX5_ESW_QOS_NON_SYSFS_GROUP) + mlx5_destroy_vf_group_sysfs(esw->dev, &group->kobj); +err_sched_elem: + kfree(group); + return ERR_PTR(err); +} + +static int esw_qos_get(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack); +static void esw_qos_put(struct mlx5_eswitch *esw); + +struct mlx5_esw_rate_group * +esw_qos_create_rate_group(struct mlx5_eswitch *esw, u32 group_id, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group; + int err; + + if (!MLX5_CAP_QOS(esw->dev, log_esw_max_sched_depth)) + return ERR_PTR(-EOPNOTSUPP); + + err = esw_qos_get(esw, extack); + if (err) + return ERR_PTR(err); + + group = __esw_qos_create_rate_group(esw, group_id, extack); + if (IS_ERR(group)) + esw_qos_put(esw); + + return group; +} + +static int __esw_qos_destroy_rate_group(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + struct netlink_ext_ack *extack) +{ + u32 divider; + int err; + + if (group->group_id != MLX5_ESW_QOS_NON_SYSFS_GROUP) + mlx5_destroy_vf_group_sysfs(esw->dev, 
&group->kobj); + list_del(&group->list); + + divider = esw_qos_calculate_min_rate_divider(esw, NULL, true); + err = esw_qos_normalize_groups_min_rate(esw, divider, extack); + if (err) + NL_SET_ERR_MSG_MOD(extack, "E-Switch groups' normalization failed"); + + err = mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + group->tsar_ix); + if (err) + NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR_ID failed"); + + trace_mlx5_esw_group_qos_destroy(esw->dev, group, group->tsar_ix); + + kfree(group); + + return err; +} + +int esw_qos_destroy_rate_group(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + struct netlink_ext_ack *extack) +{ + int err; + + err = __esw_qos_destroy_rate_group(esw, group, extack); + esw_qos_put(esw); + + return err; +} + +static bool esw_qos_element_type_supported(struct mlx5_core_dev *dev, int type) +{ + switch (type) { + case SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_TASR; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_VPORT; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_VPORT_TC; + case SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC: + return MLX5_CAP_QOS(dev, esw_element_type) & + ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC; + } + return false; +} + +static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack) +{ + u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_core_dev *dev = esw->dev; + __be32 *attr; + int err; + + if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling)) + return -EOPNOTSUPP; + + if (!esw_qos_element_type_supported(dev, SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR)) + return -EOPNOTSUPP; + + MLX5_SET(scheduling_context, tsar_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + + attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes); + *attr = cpu_to_be32(TSAR_ELEMENT_TSAR_TYPE_DWRR << 16); + + err = mlx5_create_scheduling_element_cmd(dev, + SCHEDULING_HIERARCHY_E_SWITCH, + tsar_ctx, + &esw->qos.root_tsar_ix); + if (err) { + esw_warn(dev, "E-Switch create root TSAR failed (%d)\n", err); + return err; + } + + INIT_LIST_HEAD(&esw->qos.groups); + if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) { + esw->qos.group0 = __esw_qos_create_rate_group(esw, 0, extack); + if (IS_ERR(esw->qos.group0)) { + esw_warn(dev, "E-Switch create rate group 0 failed (%ld)\n", + PTR_ERR(esw->qos.group0)); + err = PTR_ERR(esw->qos.group0); + goto err_group0; + } + } + refcount_set(&esw->qos.refcnt, 1); + + return 0; + +err_group0: + if (mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, + esw->qos.root_tsar_ix)) + esw_warn(esw->dev, "E-Switch destroy root TSAR failed.\n"); + + return err; +} + +static void esw_qos_destroy(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_rate_group *group, *tmp; + int err; + + /* destroy all sysfs groups and group0 */ + list_for_each_entry_safe(group, tmp, &esw->qos.groups, list) { + if (group->group_id == MLX5_ESW_QOS_NON_SYSFS_GROUP) + continue; + __esw_qos_destroy_rate_group(esw, group, NULL); + } + esw->qos.group0 = NULL; + + err = mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + esw->qos.root_tsar_ix); + if (err) + esw_warn(esw->dev, "E-Switch destroy root TSAR failed (%d)\n", err); +} + +static int esw_qos_get(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack) +{ + 
int err = 0; + + lockdep_assert_held(&esw->state_lock); + + if (!refcount_inc_not_zero(&esw->qos.refcnt)) { + /* esw_qos_create() set refcount to 1 only on success. + * No need to decrement on failure. + */ + err = esw_qos_create(esw, extack); + } + + return err; +} + +static void esw_qos_put(struct mlx5_eswitch *esw) +{ + lockdep_assert_held(&esw->state_lock); + if (refcount_dec_and_test(&esw->qos.refcnt)) + esw_qos_destroy(esw); +} + +int esw_qos_vport_enable(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack) +{ + int err; + + lockdep_assert_held(&esw->state_lock); + if (vport->qos.enabled) + return 0; + + err = esw_qos_get(esw, extack); + if (err) + return err; + + vport->qos.group = esw->qos.group0; + + err = esw_qos_vport_create_sched_element(esw, vport, max_rate, bw_share); + if (err) + goto err_out; + + vport->qos.enabled = true; + trace_mlx5_esw_vport_qos_create(vport, bw_share, max_rate); + + return 0; + +err_out: + esw_qos_put(esw); + + return err; +} + +void mlx5_esw_qos_vport_disable(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + struct mlx5_esw_rate_group *group = vport->qos.group; + int err; + + lockdep_assert_held(&esw->state_lock); + if (!vport->qos.enabled) + return; + esw_qos_destroy_sysfs_rate_group(esw, vport, group); + err = mlx5_destroy_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + vport->qos.esw_tsar_ix); + if (err) + esw_warn(esw->dev, "E-Switch destroy TSAR vport element failed (vport=%d,err=%d)\n", + vport->vport, err); + + memset(&vport->qos, 0, sizeof(vport->qos)); + trace_mlx5_esw_vport_qos_destroy(vport); + + esw_qos_put(esw); +} + +int mlx5_esw_qos_set_vport_rate(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + u32 max_rate, u32 min_rate) +{ + int err; + + lockdep_assert_held(&esw->state_lock); + err = esw_qos_vport_enable(esw, vport, 0, 0, NULL); + if (err) + return err; + + err = esw_qos_set_vport_min_rate(esw, vport, min_rate, NULL); + if (!err) + err = esw_qos_set_vport_max_rate(esw, vport, max_rate, NULL); + + return err; +} + +int mlx5_esw_qos_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps) +{ + u32 ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_vport *vport; + u32 bitmask; + int err; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return PTR_ERR(vport); + + mutex_lock(&esw->state_lock); + if (!vport->qos.enabled) { + /* Eswitch QoS wasn't enabled yet. Enable it and vport QoS. */ + err = esw_qos_vport_enable(esw, vport, rate_mbps, vport->qos.bw_share, NULL); + } else { + MLX5_SET(scheduling_context, ctx, max_average_bw, rate_mbps); + + bitmask = MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW; + err = mlx5_modify_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + ctx, + vport->qos.esw_tsar_ix, + bitmask); + } + mutex_unlock(&esw->state_lock); + + return err; +} + +#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS +#define MLX5_LINKSPEED_UNIT 125000 /* 1Mbps in Bps */ + +/* Converts bytes per second value passed in a pointer into megabits per + * second, rewriting last. If converted rate exceed link speed or is not a + * fraction of Mbps - returns error. 
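+ * For example, with MLX5_LINKSPEED_UNIT of 125000 Bps per Mbps, a requested + * 125000000 Bps converts to 1000 Mbps (and is then checked against the port's + * maximum link speed), while 125001 Bps is rejected because it is not a whole + * number of Mbps.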
+ */ +static +int esw_qos_devlink_rate_to_mbps(struct mlx5_core_dev *mdev, const char *name, + u64 *rate, struct netlink_ext_ack *extack) +{ + u32 link_speed_max, reminder; + u64 value; + int err; + + err = mlx5e_port_max_linkspeed(mdev, &link_speed_max); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to get link maximum speed"); + return err; + } + + value = div_u64_rem(*rate, MLX5_LINKSPEED_UNIT, &reminder); + if (reminder) { + pr_err("%s rate value %lluBps not in link speed units of 1Mbps.\n", + name, *rate); + NL_SET_ERR_MSG_MOD(extack, "TX rate value not in link speed units of 1Mbps"); + return -EINVAL; + } + + if (value > link_speed_max) { + pr_err("%s rate value %lluMbps exceed link maximum speed %u.\n", + name, value, link_speed_max); + NL_SET_ERR_MSG_MOD(extack, "TX rate value exceed link maximum speed"); + return -EINVAL; + } + + *rate = value; + return 0; +} +#endif + +static bool esw_qos_groups_are_supported(struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, qos) && + MLX5_CAP_QOS(dev, log_esw_max_sched_depth) && + MLX5_CAP_QOS(dev, esw_scheduling); +} + +static struct mlx5_esw_rate_group * +esw_qos_find_sysfs_group(struct mlx5_eswitch *esw, u32 group_id) +{ + struct mlx5_esw_rate_group *tmp; + + list_for_each_entry(tmp, &esw->qos.groups, list) { + if (tmp->group_id == MLX5_ESW_QOS_NON_SYSFS_GROUP) + continue; + if (tmp->group_id == group_id) + return tmp; + } + + return NULL; +} + +int mlx5_esw_qos_vport_update_sysfs_group(struct mlx5_eswitch *esw, int vport_num, u32 group_id) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + struct mlx5_esw_rate_group *curr_group, *new_group; + int err = 0; + + if (IS_ERR(vport)) + return PTR_ERR(vport); + + if (!esw_qos_groups_are_supported(esw->dev)) + return -EOPNOTSUPP; + + mutex_lock(&esw->state_lock); + err = esw_qos_vport_enable(esw, vport, 0, 0, NULL); + if (err) + goto out; + + curr_group = vport->qos.group; + if (curr_group && curr_group->group_id == group_id) + goto out; + + new_group = esw_qos_find_sysfs_group(esw, group_id); + if (!new_group) { + new_group = esw_qos_create_rate_group(esw, group_id, NULL); + if (IS_ERR(new_group)) { + err = PTR_ERR(new_group); + esw_warn(esw->dev, "E-Switch couldn't create new sysfs group %d (%d)\n", + group_id, err); + goto out; + } + } + + err = esw_qos_update_group_scheduling_element(esw, vport, curr_group, new_group, NULL); + if (err) + goto err_update; + + if (curr_group != esw->qos.group0) + curr_group->num_vports--; + if (curr_group != esw->qos.group0 && !curr_group->num_vports) + esw_qos_destroy_rate_group(esw, curr_group, NULL); + + if (new_group != esw->qos.group0) + new_group->num_vports++; + goto out; + +err_update: + if (new_group != esw->qos.group0 && !new_group->num_vports) + esw_qos_destroy_rate_group(esw, new_group, NULL); +out: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_esw_qos_set_sysfs_group_max_rate(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + u32 max_rate) +{ + int err; + + if (!esw_qos_groups_are_supported(esw->dev) || + !MLX5_CAP_QOS(esw->dev, esw_rate_limit)) + return -EOPNOTSUPP; + + mutex_lock(&esw->state_lock); + if (!esw_qos_find_sysfs_group(esw, group->group_id)) { + err = -EINVAL; + goto unlock; + } + + err = esw_qos_set_group_max_rate(esw, group, max_rate, NULL); +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_esw_qos_set_sysfs_group_min_rate(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + u32 min_rate) +{ + int err = 0; + + if (!MLX5_CAP_GEN(esw->dev, 
qos) || + !MLX5_CAP_QOS(esw->dev, log_esw_max_sched_depth)) + return -EOPNOTSUPP; + + mutex_lock(&esw->state_lock); + if (!esw_qos_find_sysfs_group(esw, group->group_id)) { + err = -EINVAL; + goto unlock; + } + + err = esw_qos_set_group_min_rate(esw, group, min_rate, NULL); +unlock: + mutex_unlock(&esw->state_lock); + + return err; +} + +#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + +/* Eswitch devlink rate API */ + +int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void *priv, + u64 tx_share, struct netlink_ext_ack *extack) +{ + struct mlx5_vport *vport = priv; + struct mlx5_eswitch *esw; + int err; + + esw = vport->dev->priv.eswitch; + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + err = esw_qos_devlink_rate_to_mbps(vport->dev, "tx_share", &tx_share, extack); + if (err) + return err; + + mutex_lock(&esw->state_lock); + err = esw_qos_vport_enable(esw, vport, 0, 0, extack); + if (err) + goto unlock; + + err = esw_qos_set_vport_min_rate(esw, vport, tx_share, extack); +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv, + u64 tx_max, struct netlink_ext_ack *extack) +{ + struct mlx5_vport *vport = priv; + struct mlx5_eswitch *esw; + int err; + + esw = vport->dev->priv.eswitch; + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + err = esw_qos_devlink_rate_to_mbps(vport->dev, "tx_max", &tx_max, extack); + if (err) + return err; + + mutex_lock(&esw->state_lock); + err = esw_qos_vport_enable(esw, vport, 0, 0, extack); + if (err) + goto unlock; + + err = esw_qos_set_vport_max_rate(esw, vport, tx_max, extack); +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv, + u64 tx_share, struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(rate_node->devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_esw_rate_group *group = priv; + int err; + + err = esw_qos_devlink_rate_to_mbps(dev, "tx_share", &tx_share, extack); + if (err) + return err; + + mutex_lock(&esw->state_lock); + err = esw_qos_set_group_min_rate(esw, group, tx_share, extack); + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv, + u64 tx_max, struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(rate_node->devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_esw_rate_group *group = priv; + int err; + + err = esw_qos_devlink_rate_to_mbps(dev, "tx_max", &tx_max, extack); + if (err) + return err; + + mutex_lock(&esw->state_lock); + err = esw_qos_set_group_max_rate(esw, group, tx_max, extack); + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group; + struct mlx5_eswitch *esw; + int err = 0; + + esw = mlx5_devlink_eswitch_get(rate_node->devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + mutex_lock(&esw->state_lock); + if (esw->mode != MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_MOD(extack, + "Rate node creation supported only in switchdev mode"); + err = -EOPNOTSUPP; + goto unlock; + } + + group = esw_qos_create_rate_group(esw, MLX5_ESW_QOS_NON_SYSFS_GROUP, extack); + if (IS_ERR(group)) { + err = PTR_ERR(group); + goto unlock; + } + + *priv = group; +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int 
mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group = priv; + struct mlx5_eswitch *esw; + int err; + + esw = mlx5_devlink_eswitch_get(rate_node->devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + mutex_lock(&esw->state_lock); + err = esw_qos_destroy_rate_group(esw, group, extack); + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_esw_devlink_rate_parent_set(struct devlink_rate *devlink_rate, + struct devlink_rate *parent, + void *priv, void *parent_priv, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group; + struct mlx5_vport *vport = priv; + + if (!parent) + return mlx5_esw_qos_vport_update_group(vport->dev->priv.eswitch, + vport, NULL, extack); + + group = parent_priv; + return mlx5_esw_qos_vport_update_group(vport->dev->priv.eswitch, vport, group, extack); +} + +#endif + +int mlx5_esw_qos_vport_update_group(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_esw_rate_group *group, + struct netlink_ext_ack *extack) +{ + int err = 0; + + mutex_lock(&esw->state_lock); + if (!vport->qos.enabled && !group) + goto unlock; + + err = esw_qos_vport_enable(esw, vport, 0, 0, extack); + if (!err) + err = esw_qos_vport_update_group(esw, vport, group, extack); +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h new file mode 100644 index 0000000..81c16e2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_ESW_QOS_H__ +#define __MLX5_ESW_QOS_H__ + +#ifdef CONFIG_MLX5_ESWITCH + +#define MLX5_ESW_QOS_SYSFS_GROUP_MAX_ID 255 +#define MLX5_ESW_QOS_NON_SYSFS_GROUP (MLX5_ESW_QOS_SYSFS_GROUP_MAX_ID + 1) +#include "net/mlxdevm.h" + +struct mlx5_esw_rate_group { + struct mlx5_core_dev *dev; + struct mlxdevm_rate_group devm; + u32 tsar_ix; + u32 max_rate; + u32 min_rate; + u32 bw_share; + struct list_head list; + + /* sysfs group related fields */ + struct kobject kobj; + u32 group_id; + u32 num_vports; +}; + +int mlx5_esw_qos_set_vport_rate(struct mlx5_eswitch *esw, struct mlx5_vport *evport, + u32 max_rate, u32 min_rate); +void mlx5_esw_qos_vport_disable(struct mlx5_eswitch *esw, struct mlx5_vport *vport); + +int mlx5_esw_qos_vport_update_sysfs_group(struct mlx5_eswitch *esw, int vport_num, + u32 group_id); +int mlx5_esw_qos_set_sysfs_group_max_rate(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + u32 max_rate); +int mlx5_esw_qos_set_sysfs_group_min_rate(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + u32 min_rate); +struct mlx5_esw_rate_group * +esw_qos_create_rate_group(struct mlx5_eswitch *esw, u32 group_id, + struct netlink_ext_ack *extack); +int esw_qos_destroy_rate_group(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + struct netlink_ext_ack *extack); +int esw_qos_set_group_max_rate(struct mlx5_eswitch *esw, + struct mlx5_esw_rate_group *group, + u32 max_rate, struct netlink_ext_ack *extack); +int esw_qos_set_group_min_rate(struct mlx5_eswitch *esw, struct mlx5_esw_rate_group *group, + u32 min_rate, struct netlink_ext_ack *extack); +int +mlx5_esw_get_esw_and_vport(struct devlink *devlink, struct devlink_port *port, + struct mlx5_eswitch **esw, struct mlx5_vport **vport, + struct netlink_ext_ack *extack); +int esw_qos_vport_enable(struct mlx5_eswitch *esw, struct mlx5_vport *vport, + u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack); +int esw_qos_set_vport_min_rate(struct mlx5_eswitch *esw, struct mlx5_vport *evport, + u32 min_rate, struct netlink_ext_ack *extack); +int esw_qos_set_vport_max_rate(struct mlx5_eswitch *esw, struct mlx5_vport *evport, + u32 max_rate, struct netlink_ext_ack *extack); +#ifdef HAVE_DEVLINK_HAS_RATE_FUNCTIONS + +int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void *priv, + u64 tx_share, struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv, + u64 tx_max, struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv, + u64 tx_share, struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv, + u64 tx_max, struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv, + struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv, + struct netlink_ext_ack *extack); +int mlx5_esw_devlink_rate_parent_set(struct devlink_rate *devlink_rate, + struct devlink_rate *parent, + void *priv, void *parent_priv, + struct netlink_ext_ack *extack); + +#endif /* HAVE_DEVLINK_HAS_RATE_FUNCTIONS */ + +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.c new file mode 100644 index 0000000..9225d37 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. */ + +#include "eswitch.h" +#include "en/tc/meter.h" +#include "esw/acl/helper.h" + +enum { + MLX5_FLOW_METER_BPS_PRIO = 0, + MLX5_FLOW_METER_PPS_PRIO = 2, +}; + +static void +esw_acl_destrory_meter(struct mlx5_vport *vport, struct vport_meter *meter) +{ + u64 bytes, packets; + + if (meter->drop_red_rule) { + mlx5_del_flow_rules(meter->drop_red_rule); + meter->drop_red_rule = NULL; + } + + if (meter->fwd_green_rule) { + mlx5_del_flow_rules(meter->fwd_green_rule); + meter->fwd_green_rule = NULL; + } + + if (meter->color_grp) { + mlx5_destroy_flow_group(meter->color_grp); + meter->color_grp = NULL; + } + + if (meter->drop_counter) { + mlx5_fc_query(vport->dev, meter->drop_counter, &packets, &bytes); + meter->packets_dropped += packets; + meter->bytes_dropped += bytes; + mlx5_fc_destroy(vport->dev, meter->drop_counter); + meter->drop_counter = NULL; + } + + if (meter->color_tbl) { + mlx5_destroy_flow_table(meter->color_tbl); + meter->color_tbl = NULL; + } + + if (meter->meter_rule) { + mlx5_del_flow_rules(meter->meter_rule); + meter->meter_rule = NULL; + } + + if (meter->meter_grp) { + mlx5_destroy_flow_group(meter->meter_grp); + meter->meter_grp = NULL; + } + + if (meter->meter_hndl) { + mlx5e_free_flow_meter(vport->dev, meter->meter_hndl); + meter->meter_hndl = NULL; + } + + if (meter->meter_tbl) { + mlx5_destroy_flow_table(meter->meter_tbl); + meter->meter_tbl = NULL; + } +} + +int +esw_acl_create_meter(struct mlx5_vport *vport, struct vport_meter *meter, + int ns, int prio) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_destination drop_ctr_dst = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + void *misc2, *match_criteria; + struct mlx5_fc *drop_counter; + struct mlx5_flow_table *tbl; + struct mlx5_flow_group *grp; + struct mlx5_flow_spec *spec; + u32 *flow_group_in; + int err = 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + kfree(flow_group_in); + return -ENOMEM; + } + + tbl = esw_acl_table_create(vport->dev->priv.eswitch, vport, + ns, prio, 1); + if (IS_ERR(tbl)) { + err = PTR_ERR(tbl); + goto out; + } + meter->meter_tbl = tbl; + + /* only one FTE in this group */ + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + grp = mlx5_create_flow_group(meter->meter_tbl, flow_group_in); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto out; + } + meter->meter_grp = grp; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO | + MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO; + flow_act.exe_aso.type = MLX5_EXE_ASO_FLOW_METER; + flow_act.exe_aso.object_id = meter->meter_hndl->obj_id; + flow_act.exe_aso.flow_meter.meter_idx = meter->meter_hndl->idx; + flow_act.exe_aso.flow_meter.init_color = MLX5_FLOW_METER_COLOR_GREEN; + flow_act.exe_aso.return_reg_id = 5; /* use reg c5 */ + rule = mlx5_add_flow_rules(meter->meter_tbl, NULL, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out; + } + meter->meter_rule = rule; + + tbl = esw_acl_table_create(vport->dev->priv.eswitch, vport, + ns, prio + 1, 2); + if (IS_ERR(tbl)) { + err = PTR_ERR(tbl); + goto out; + } + meter->color_tbl = tbl; + + MLX5_SET(create_flow_group_in, flow_group_in, 
match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + misc2 = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, 0x3); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + grp = mlx5_create_flow_group(meter->color_tbl, flow_group_in); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto out; + } + meter->color_grp = grp; + + memset(&flow_act, 0, sizeof(flow_act)); + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, 0x3); + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, + MLX5_FLOW_METER_COLOR_GREEN); + + rule = mlx5_add_flow_rules(meter->color_tbl, spec, &flow_act, NULL, 0); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out; + } + meter->fwd_green_rule = rule; + + drop_counter = mlx5_fc_create(vport->dev, false); + if (IS_ERR(drop_counter)) { + err = PTR_ERR(drop_counter); + goto out; + } + meter->drop_counter = drop_counter; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, + MLX5_FLOW_METER_COLOR_RED); + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + drop_ctr_dst.counter_id = mlx5_fc_id(drop_counter); + + rule = mlx5_add_flow_rules(meter->color_tbl, spec, &flow_act, + &drop_ctr_dst, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + goto out; + } + meter->drop_red_rule = rule; + +out: + if (err) + esw_acl_destrory_meter(vport, meter); + kfree(flow_group_in); + kfree(spec); + return err; +} + +static struct vport_meter * +esw_acl_get_meter(struct mlx5_vport *vport, int rx_tx, int xps) +{ + if (rx_tx == MLX5_RATE_LIMIT_TX) + return vport->ingress.offloads.meter_xps[xps]; + else + return vport->egress.offloads.meter_xps[xps]; +} + +int +esw_vf_meter_set_rate_limit(struct mlx5_vport *vport, struct vport_meter *meter, + int rx_tx, int xps, u64 rate, u64 burst) +{ + struct mlx5e_flow_meter_handle *meter_hndl; + struct mlx5e_flow_meter_params params; + int ns, prio; + int err; + + if (rate == meter->rate && burst == meter->burst) + return 0; + + if (rate == 0 || burst == 0) { + esw_acl_destrory_meter(vport, meter); + goto update; + } + + if (!meter->meter_hndl) { + meter_hndl = mlx5e_alloc_flow_meter(vport->dev); + if (IS_ERR(meter_hndl)) + return PTR_ERR(meter_hndl); + meter->meter_hndl = meter_hndl; + } + + params.mode = xps; + params.rate = rate; + params.burst = burst; + err = mlx5e_aso_send_flow_meter_aso(vport->dev, meter->meter_hndl, ¶ms); + if (err) + goto check_and_free_meter_aso; + + if (!meter->meter_tbl) { + if (rx_tx == MLX5_RATE_LIMIT_TX) + ns = MLX5_FLOW_NAMESPACE_ESW_INGRESS; + else + ns = MLX5_FLOW_NAMESPACE_ESW_EGRESS; + + if (xps == MLX5_RATE_LIMIT_PPS) + prio = MLX5_FLOW_METER_PPS_PRIO; + else + prio = MLX5_FLOW_METER_BPS_PRIO; + + err = esw_acl_create_meter(vport, meter, ns, prio); + if (err) + return err; + } + +update: + meter->rate = rate; + meter->burst = burst; + + return 0; + +check_and_free_meter_aso: + if (!meter->meter_tbl) { + mlx5e_free_flow_meter(vport->dev, meter->meter_hndl); + meter->meter_hndl 
= NULL; + } + return err; +} + +void +esw_vf_meter_destroy_meters(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + int j; + unsigned long i; + + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + for (j = MLX5_RATE_LIMIT_BPS; j <= MLX5_RATE_LIMIT_PPS; j++) { + kfree(vport->ingress.offloads.meter_xps[j]); + vport->ingress.offloads.meter_xps[j] = NULL; + kfree(vport->egress.offloads.meter_xps[j]); + vport->egress.offloads.meter_xps[j] = NULL; + } + } +} + +int +esw_vf_meter_create_meters(struct mlx5_eswitch *esw) +{ + struct vport_meter *meter; + struct mlx5_vport *vport; + int j; + unsigned long i; + + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + for (j = MLX5_RATE_LIMIT_BPS; j <= MLX5_RATE_LIMIT_PPS; j++) { + meter = kzalloc(sizeof(*meter), GFP_KERNEL); + if (!meter) + goto err_out; + vport->egress.offloads.meter_xps[j] = meter; + } + + for (j = MLX5_RATE_LIMIT_BPS; j <= MLX5_RATE_LIMIT_PPS; j++) { + meter = kzalloc(sizeof(*meter), GFP_KERNEL); + if (!meter) + goto err_out; + vport->ingress.offloads.meter_xps[j] = meter; + } + } + + return 0; + +err_out: + esw_vf_meter_destroy_meters(esw); + + return -ENOMEM; +} + +void +esw_vf_meter_ingress_destroy(struct mlx5_vport *vport) +{ + struct vport_meter *meter; + int i; + + mutex_lock(&vport->ingress.offloads.vf_meter_lock); + for (i = MLX5_RATE_LIMIT_BPS; i <= MLX5_RATE_LIMIT_PPS; i++) { + meter = vport->ingress.offloads.meter_xps[i]; + if (meter) { + esw_acl_destrory_meter(vport, meter); + vport->ingress.offloads.meter_xps[i] = NULL; + kfree(meter); + } + } + mutex_unlock(&vport->ingress.offloads.vf_meter_lock); +} + +void +esw_vf_meter_egress_destroy(struct mlx5_vport *vport) +{ + struct vport_meter *meter; + int i; + + mutex_lock(&vport->egress.offloads.vf_meter_lock); + for (i = MLX5_RATE_LIMIT_BPS; i <= MLX5_RATE_LIMIT_PPS; i++) { + meter = vport->egress.offloads.meter_xps[i]; + if (meter) { + esw_acl_destrory_meter(vport, meter); + vport->egress.offloads.meter_xps[i] = NULL; + kfree(meter); + } + } + mutex_unlock(&vport->egress.offloads.vf_meter_lock); +} + +void +esw_vf_meter_destroy_all(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + esw_vf_meter_egress_destroy(vport); + esw_vf_meter_ingress_destroy(vport); + } +} + +int +mlx5_eswitch_set_vf_meter_data(struct mlx5_eswitch *esw, int vport_num, + int data_type, int rx_tx, int xps, u64 data) +{ + struct vport_meter *meter; + struct mlx5_vport *vport; + int err; + + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return -EOPNOTSUPP; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR_OR_NULL(vport)) + return -EINVAL; + + if (rx_tx == MLX5_RATE_LIMIT_TX) + mutex_lock(&vport->ingress.offloads.vf_meter_lock); + else + mutex_lock(&vport->egress.offloads.vf_meter_lock); + + meter = esw_acl_get_meter(vport, rx_tx, xps); + if (!meter) { + err = -EOPNOTSUPP; + goto unlock; + } + + switch (data_type) { + case MLX5_RATE_LIMIT_DATA_RATE: + err = esw_vf_meter_set_rate_limit(vport, meter, rx_tx, xps, + data, meter->burst); + break; + case MLX5_RATE_LIMIT_DATA_BURST: + err = esw_vf_meter_set_rate_limit(vport, meter, rx_tx, xps, + meter->rate, data); + break; + default: + err = -EINVAL; + } + +unlock: + if (rx_tx == MLX5_RATE_LIMIT_TX) + mutex_unlock(&vport->ingress.offloads.vf_meter_lock); + else + mutex_unlock(&vport->egress.offloads.vf_meter_lock); + + return err; +} + +int +mlx5_eswitch_get_vf_meter_data(struct mlx5_eswitch *esw, int 
vport_num, + int data_type, int rx_tx, int xps, u64 *data) +{ + struct vport_meter *meter; + struct mlx5_vport *vport; + u64 bytes, packets; + int err = 0; + + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return -EOPNOTSUPP; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR_OR_NULL(vport)) + return -EINVAL; + + if (rx_tx == MLX5_RATE_LIMIT_TX) + mutex_lock(&vport->ingress.offloads.vf_meter_lock); + else + mutex_lock(&vport->egress.offloads.vf_meter_lock); + + meter = esw_acl_get_meter(vport, rx_tx, xps); + if (!meter) { + err = -EOPNOTSUPP; + goto unlock; + } + + switch (data_type) { + case MLX5_RATE_LIMIT_DATA_RATE: + *data = meter->rate; + break; + case MLX5_RATE_LIMIT_DATA_BURST: + *data = meter->burst; + break; + case MLX5_RATE_LIMIT_DATA_PACKETS_DROPPED: + if (meter->drop_counter) { + mlx5_fc_query(vport->dev, meter->drop_counter, + &packets, &bytes); + *data = packets; + } else { + *data = 0; + } + *data += meter->packets_dropped; + break; + case MLX5_RATE_LIMIT_DATA_BYTES_DROPPED: + if (meter->drop_counter) { + mlx5_fc_query(vport->dev, meter->drop_counter, + &packets, &bytes); + *data = bytes; + } else { + *data = 0; + } + *data += meter->bytes_dropped; + break; + default: + err = -EINVAL; + } + +unlock: + if (rx_tx == MLX5_RATE_LIMIT_TX) + mutex_unlock(&vport->ingress.offloads.vf_meter_lock); + else + mutex_unlock(&vport->egress.offloads.vf_meter_lock); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.h new file mode 100644 index 0000000..d2380bb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vf_meter.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. */ + +#ifndef __MLX5_ESW_VF_METERS_H__ +#define __MLX5_ESW_VF_METERS_H__ + +int esw_vf_meter_create_meters(struct mlx5_eswitch *esw); +void esw_vf_meter_destroy_meters(struct mlx5_eswitch *esw); +void esw_vf_meter_ingress_destroy(struct mlx5_vport *vport); +void esw_vf_meter_egress_destroy(struct mlx5_vport *vport); +void esw_vf_meter_destroy_all(struct mlx5_eswitch *esw); + +int mlx5_eswitch_set_vf_meter_data(struct mlx5_eswitch *esw, int vport_num, + int data_type, int rx_tx, int xps, u64 data); +int mlx5_eswitch_get_vf_meter_data(struct mlx5_eswitch *esw, int vport_num, + int data_type, int rx_tx, int xps, u64 *data); + +#endif /* __MLX5_ESW_VF_METERS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c new file mode 100644 index 0000000..9e72118 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/esw/vporttbl.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021 Mellanox Technologies. 
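+// Per-vport FDB tables are cached in a hash table keyed by {chain, prio, vport, +// vhca_id, vport namespace} and shared between callers by reference counting +// num_rules; the table is destroyed when the last reference is put.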
+ +#include "eswitch.h" + +/* This struct is used as a key to the hash table and we need it to be packed + * so hash result is consistent + */ +struct mlx5_vport_key { + u32 chain; + u16 prio; + u16 vport; + u16 vhca_id; + const struct esw_vport_tbl_namespace *vport_ns; +} __packed; + +struct mlx5_vport_table { + struct hlist_node hlist; + struct mlx5_flow_table *fdb; + u32 num_rules; + struct mlx5_vport_key key; +}; + +static struct mlx5_flow_table * +esw_vport_tbl_create(struct mlx5_eswitch *esw, struct mlx5_flow_namespace *ns, + const struct esw_vport_tbl_namespace *vport_ns) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *fdb; + + if (vport_ns->max_num_groups) + ft_attr.autogroup.max_num_groups = vport_ns->max_num_groups; + else + ft_attr.autogroup.max_num_groups = esw->params.large_group_num; + ft_attr.max_fte = vport_ns->max_fte; + ft_attr.prio = FDB_PER_VPORT; + ft_attr.flags = vport_ns->flags; + fdb = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(fdb)) { + esw_warn(esw->dev, "Failed to create per vport FDB Table err %ld\n", + PTR_ERR(fdb)); + } + + return fdb; +} + +static u32 flow_attr_to_vport_key(struct mlx5_eswitch *esw, + struct mlx5_vport_tbl_attr *attr, + struct mlx5_vport_key *key) +{ + key->vport = attr->vport; + key->chain = attr->chain; + key->prio = attr->prio; + key->vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + key->vport_ns = attr->vport_ns; + return jhash(key, sizeof(*key), 0); +} + +/* caller must hold vports.lock */ +static struct mlx5_vport_table * +esw_vport_tbl_lookup(struct mlx5_eswitch *esw, struct mlx5_vport_key *skey, u32 key) +{ + struct mlx5_vport_table *e; + + hash_for_each_possible(esw->fdb_table.offloads.vports.table, e, hlist, key) + if (!memcmp(&e->key, skey, sizeof(*skey))) + return e; + + return NULL; +} + +struct mlx5_flow_table * +mlx5_esw_vporttbl_get(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *fdb; + struct mlx5_vport_table *e; + struct mlx5_vport_key skey; + u32 hkey; + + mutex_lock(&esw->fdb_table.offloads.vports.lock); + hkey = flow_attr_to_vport_key(esw, attr, &skey); + e = esw_vport_tbl_lookup(esw, &skey, hkey); + if (e) { + e->num_rules++; + goto out; + } + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) { + fdb = ERR_PTR(-ENOMEM); + goto err_alloc; + } + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!ns) { + esw_warn(dev, "Failed to get FDB namespace\n"); + fdb = ERR_PTR(-ENOENT); + goto err_ns; + } + + fdb = esw_vport_tbl_create(esw, ns, attr->vport_ns); + if (IS_ERR(fdb)) + goto err_ns; + + e->fdb = fdb; + e->num_rules = 1; + e->key = skey; + hash_add(esw->fdb_table.offloads.vports.table, &e->hlist, hkey); +out: + mutex_unlock(&esw->fdb_table.offloads.vports.lock); + return e->fdb; + +err_ns: + kfree(e); +err_alloc: + mutex_unlock(&esw->fdb_table.offloads.vports.lock); + return fdb; +} + +void +mlx5_esw_vporttbl_put(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr) +{ + struct mlx5_vport_table *e; + struct mlx5_vport_key key; + u32 hkey; + + mutex_lock(&esw->fdb_table.offloads.vports.lock); + hkey = flow_attr_to_vport_key(esw, attr, &key); + e = esw_vport_tbl_lookup(esw, &key, hkey); + if (!e || --e->num_rules) + goto out; + + hash_del(&e->hlist); + mlx5_destroy_flow_table(e->fdb); + kfree(e); +out: + mutex_unlock(&esw->fdb_table.offloads.vports.lock); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c new file mode 100644 index 0000000..8c560d4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -0,0 +1,2731 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "esw/acl/lgcy.h" +#include "esw/vf_meter.h" +#include "esw/legacy.h" +#include "esw/qos.h" +#include "mlx5_core.h" +#include "lib/eq.h" +#include "eswitch.h" +#include "fs_core.h" +#include "devlink.h" +#include "ecpf.h" +#include "en/mod_hdr.h" +#include "mlx5_devm.h" + +enum { + MLX5_ACTION_NONE = 0, + MLX5_ACTION_ADD = 1, + MLX5_ACTION_DEL = 2, +}; + +/* Vport UC/MC hash node */ +struct vport_addr { + struct l2addr_node node; + u8 action; + u16 vport; + struct mlx5_flow_handle *flow_rule; + bool mpfs; /* UC MAC was added to MPFs */ + /* A flag indicating that mac was added due to mc promiscuous vport */ + bool mc_promisc; +}; + +DEFINE_IDA(mlx5e_vport_match_ida); +DEFINE_MUTEX(mlx5e_vport_match_ida_mutex); + +/* Vport context events */ +#define SRIOV_VPORT_EVENTS (MLX5_VPORT_UC_ADDR_CHANGE | \ + MLX5_VPORT_MC_ADDR_CHANGE | \ + MLX5_VPORT_VLAN_CHANGE | \ + MLX5_VPORT_PROMISC_CHANGE) + +static int mlx5_eswitch_check(const struct mlx5_core_dev *dev) +{ + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return -EOPNOTSUPP; + + if (!MLX5_ESWITCH_MANAGER(dev)) + return -EOPNOTSUPP; + + return 0; +} + +struct mlx5_eswitch *mlx5_devlink_eswitch_get(struct devlink *devlink) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + int err; + + err = mlx5_eswitch_check(dev); + if (err) + return ERR_PTR(err); + + return dev->priv.eswitch; +} + +struct mlx5_vport *__must_check +mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport; + + if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager)) + return ERR_PTR(-EPERM); + + vport = xa_load(&esw->vports, vport_num); + if (!vport) { + esw_debug(esw->dev, "vport out of range: num(0x%x)\n", vport_num); + return ERR_PTR(-EINVAL); + } + return vport; +} + +bool mlx5_esw_host_functions_enabled(const struct 
mlx5_core_dev *dev) +{ + if (!dev->priv.eswitch) + return true; + + return !dev->priv.eswitch->esw_funcs.host_funcs_disabled; +} + +static bool is_esw_manager_vport(const struct mlx5_eswitch *esw, u16 vport_num) +{ + return esw->manager_vport == vport_num; +} + +static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport, + u32 events_mask) +{ + u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {}; + void *nic_vport_ctx; + + MLX5_SET(modify_nic_vport_context_in, in, + opcode, MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, field_select.change_event, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, arm_change_event, 1); + + if (events_mask & MLX5_VPORT_UC_ADDR_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_uc_address_change, 1); + if (events_mask & MLX5_VPORT_MC_ADDR_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_mc_address_change, 1); + if (events_mask & MLX5_VPORT_PROMISC_CHANGE) + MLX5_SET(nic_vport_context, nic_vport_ctx, + event_on_promisc_change, 1); + + return mlx5_cmd_exec_in(dev, modify_nic_vport_context, in); +} + +/* E-Switch vport context HW commands */ +int mlx5_eswitch_modify_esw_vport_context(struct mlx5_core_dev *dev, u16 vport, + bool other_vport, void *in) +{ + MLX5_SET(modify_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT); + MLX5_SET(modify_esw_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_esw_vport_context_in, in, other_vport, other_vport); + return mlx5_cmd_exec_in(dev, modify_esw_vport_context, in); +} + +static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u16 vport, + u16 vlan, u8 qos, u8 set_flags, + enum esw_vst_mode vst_mode) +{ + u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {}; + + if (!MLX5_CAP_ESW(dev, vport_cvlan_strip) || + !MLX5_CAP_ESW(dev, vport_cvlan_insert_if_not_exist)) + return -EOPNOTSUPP; + + esw_debug(dev, "Set Vport[%d] VLAN %d qos %d set=%x\n", + vport, vlan, qos, set_flags); + + if (set_flags & SET_VLAN_STRIP) + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.vport_cvlan_strip, 1); + + if (set_flags & SET_VLAN_INSERT) { + if (vst_mode == ESW_VST_MODE_INSERT_ALWAYS) { + /* insert either if vlan exist in packet or not */ + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.vport_cvlan_insert, + MLX5_VPORT_CVLAN_INSERT_ALWAYS); + } else { + /* insert only if no vlan in packet */ + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.vport_cvlan_insert, + MLX5_VPORT_CVLAN_INSERT_WHEN_NO_CVLAN); + } + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.cvlan_pcp, qos); + MLX5_SET(modify_esw_vport_context_in, in, + esw_vport_context.cvlan_id, vlan); + } + + MLX5_SET(modify_esw_vport_context_in, in, + field_select.vport_cvlan_strip, 1); + MLX5_SET(modify_esw_vport_context_in, in, + field_select.vport_cvlan_insert, 1); + + return mlx5_eswitch_modify_esw_vport_context(dev, vport, true, in); +} + +/* E-Switch FDB */ +static struct mlx5_flow_handle * +__esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u16 vport, bool rx_rule, + u8 mac_c[ETH_ALEN], u8 mac_v[ETH_ALEN]) +{ + int match_header = (is_zero_ether_addr(mac_c) ? 
0 : + MLX5_MATCH_OUTER_HEADERS); + struct mlx5_flow_handle *flow_rule = NULL; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_spec *spec; + void *mv_misc = NULL; + void *mc_misc = NULL; + u8 *dmac_v = NULL; + u8 *dmac_c = NULL; + + if (rx_rule) + match_header |= MLX5_MATCH_MISC_PARAMETERS; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return NULL; + + dmac_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers.dmac_47_16); + dmac_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers.dmac_47_16); + + if (match_header & MLX5_MATCH_OUTER_HEADERS) { + ether_addr_copy(dmac_v, mac_v); + ether_addr_copy(dmac_c, mac_c); + } + + if (match_header & MLX5_MATCH_MISC_PARAMETERS) { + mv_misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + mc_misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + MLX5_SET(fte_match_set_misc, mv_misc, source_port, MLX5_VPORT_UPLINK); + MLX5_SET_TO_ONES(fte_match_set_misc, mc_misc, source_port); + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport.num = vport; + + esw_debug(esw->dev, + "\tFDB add rule dmac_v(%pM) dmac_c(%pM) -> vport(%d)\n", + dmac_v, dmac_c, vport); + spec->match_criteria_enable = match_header; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_rule = + mlx5_add_flow_rules(esw->fdb_table.legacy.fdb, spec, + &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + esw_warn(esw->dev, + "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%ld)\n", + dmac_v, dmac_c, vport, PTR_ERR(flow_rule)); + flow_rule = NULL; + } + + kvfree(spec); + return flow_rule; +} + +static struct mlx5_flow_handle * +esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u16 vport) +{ + u8 mac_c[ETH_ALEN]; + + eth_broadcast_addr(mac_c); + return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac); +} + +static struct mlx5_flow_handle * +esw_fdb_set_vport_allmulti_rule(struct mlx5_eswitch *esw, u16 vport) +{ + u8 mac_c[ETH_ALEN]; + u8 mac_v[ETH_ALEN]; + + eth_zero_addr(mac_c); + eth_zero_addr(mac_v); + mac_c[0] = 0x01; + mac_v[0] = 0x01; + return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac_v); +} + +static struct mlx5_flow_handle * +esw_fdb_set_vport_promisc_rule(struct mlx5_eswitch *esw, u16 vport) +{ + u8 mac_c[ETH_ALEN]; + u8 mac_v[ETH_ALEN]; + + eth_zero_addr(mac_c); + eth_zero_addr(mac_v); + return __esw_fdb_set_vport_rule(esw, vport, true, mac_c, mac_v); +} + +/* E-Switch vport UC/MC lists management */ +typedef int (*vport_addr_action)(struct mlx5_eswitch *esw, + struct vport_addr *vaddr); + +static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + u8 *mac = vaddr->node.addr; + u16 vport = vaddr->vport; + int err; + + /* Skip mlx5_mpfs_add_mac for eswitch_managers, + * it is already done by its netdev in mlx5e_execute_l2_action + */ + if (mlx5_esw_is_manager_vport(esw, vport)) + goto fdb_add; + + err = mlx5_mpfs_add_mac(esw->dev, mac); + if (err) { + esw_warn(esw->dev, + "Failed to add L2 table mac(%pM) for vport(0x%x), err(%d)\n", + mac, vport, err); + return err; + } + vaddr->mpfs = true; + +fdb_add: + /* SRIOV is enabled: Forward UC MAC to vport */ + if (esw->fdb_table.legacy.fdb && esw->mode == MLX5_ESWITCH_LEGACY) + vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport); + + esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n", + vport, mac, vaddr->flow_rule); + + return 0; +} + +static int esw_del_uc_addr(struct 
mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + u8 *mac = vaddr->node.addr; + u16 vport = vaddr->vport; + int err = 0; + + /* Skip mlx5_mpfs_del_mac for eswitch managers, + * it is already done by its netdev in mlx5e_execute_l2_action + */ + if (!vaddr->mpfs || mlx5_esw_is_manager_vport(esw, vport)) + goto fdb_del; + + err = mlx5_mpfs_del_mac(esw->dev, mac); + if (err) + esw_warn(esw->dev, + "Failed to del L2 table mac(%pM) for vport(%d), err(%d)\n", + mac, vport, err); + vaddr->mpfs = false; + +fdb_del: + if (vaddr->flow_rule) + mlx5_del_flow_rules(vaddr->flow_rule); + vaddr->flow_rule = NULL; + + return 0; +} + +static void update_allmulti_vports(struct mlx5_eswitch *esw, + struct vport_addr *vaddr, + struct esw_mc_addr *esw_mc) +{ + u8 *mac = vaddr->node.addr; + struct mlx5_vport *vport; + unsigned long i; + u16 vport_num; + + mlx5_esw_for_each_vport(esw, i, vport) { + struct hlist_head *vport_hash = vport->mc_list; + struct vport_addr *iter_vaddr = + l2addr_hash_find(vport_hash, + mac, + struct vport_addr); + vport_num = vport->vport; + if (IS_ERR_OR_NULL(vport->allmulti_rule) || + vaddr->vport == vport_num) + continue; + switch (vaddr->action) { + case MLX5_ACTION_ADD: + if (iter_vaddr) + continue; + iter_vaddr = l2addr_hash_add(vport_hash, mac, + struct vport_addr, + GFP_KERNEL); + if (!iter_vaddr) { + esw_warn(esw->dev, + "ALL-MULTI: Failed to add MAC(%pM) to vport[%d] DB\n", + mac, vport_num); + continue; + } + iter_vaddr->vport = vport_num; + iter_vaddr->flow_rule = + esw_fdb_set_vport_rule(esw, + mac, + vport_num); + iter_vaddr->mc_promisc = true; + break; + case MLX5_ACTION_DEL: + if (!iter_vaddr) + continue; + mlx5_del_flow_rules(iter_vaddr->flow_rule); + l2addr_hash_del(iter_vaddr); + break; + } + } +} + +static int esw_add_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + struct hlist_head *hash = esw->mc_table; + struct esw_mc_addr *esw_mc; + u8 *mac = vaddr->node.addr; + u16 vport = vaddr->vport; + + if (!esw->fdb_table.legacy.fdb) + return 0; + + esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr); + if (esw_mc) + goto add; + + esw_mc = l2addr_hash_add(hash, mac, struct esw_mc_addr, GFP_KERNEL); + if (!esw_mc) + return -ENOMEM; + + esw_mc->uplink_rule = /* Forward MC MAC to Uplink */ + esw_fdb_set_vport_rule(esw, mac, MLX5_VPORT_UPLINK); + + /* Add this multicast mac to all the mc promiscuous vports */ + update_allmulti_vports(esw, vaddr, esw_mc); + +add: + /* If the multicast mac is added as a result of mc promiscuous vport, + * don't increment the multicast ref count + */ + if (!vaddr->mc_promisc) + esw_mc->refcnt++; + + /* Forward MC MAC to vport */ + vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport); + esw_debug(esw->dev, + "\tADDED MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n", + vport, mac, vaddr->flow_rule, + esw_mc->refcnt, esw_mc->uplink_rule); + return 0; +} + +static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) +{ + struct hlist_head *hash = esw->mc_table; + struct esw_mc_addr *esw_mc; + u8 *mac = vaddr->node.addr; + u16 vport = vaddr->vport; + + if (!esw->fdb_table.legacy.fdb) + return 0; + + esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr); + if (!esw_mc) { + esw_warn(esw->dev, + "Failed to find eswitch MC addr for MAC(%pM) vport(%d)", + mac, vport); + return -EINVAL; + } + esw_debug(esw->dev, + "\tDELETE MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n", + vport, mac, vaddr->flow_rule, esw_mc->refcnt, + esw_mc->uplink_rule); + + if (vaddr->flow_rule) + 
mlx5_del_flow_rules(vaddr->flow_rule); + vaddr->flow_rule = NULL; + + /* If the multicast mac is added as a result of mc promiscuous vport, + * don't decrement the multicast ref count. + */ + if (vaddr->mc_promisc || (--esw_mc->refcnt > 0)) + return 0; + + /* Remove this multicast mac from all the mc promiscuous vports */ + update_allmulti_vports(esw, vaddr, esw_mc); + + if (esw_mc->uplink_rule) { + mlx5_del_flow_rules(esw_mc->uplink_rule); + esw_mc->uplink_rule = NULL; + } + + l2addr_hash_del(esw_mc); + return 0; +} + +/* Apply vport UC/MC list to HW l2 table and FDB table */ +static void esw_apply_vport_addr_list(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, int list_type) +{ + bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC; + vport_addr_action vport_addr_add; + vport_addr_action vport_addr_del; + struct vport_addr *addr; + struct l2addr_node *node; + struct hlist_head *hash; + struct hlist_node *tmp; + int hi; + + vport_addr_add = is_uc ? esw_add_uc_addr : + esw_add_mc_addr; + vport_addr_del = is_uc ? esw_del_uc_addr : + esw_del_mc_addr; + + hash = is_uc ? vport->uc_list : vport->mc_list; + for_each_l2hash_node(node, tmp, hash, hi) { + addr = container_of(node, struct vport_addr, node); + switch (addr->action) { + case MLX5_ACTION_ADD: + vport_addr_add(esw, addr); + addr->action = MLX5_ACTION_NONE; + break; + case MLX5_ACTION_DEL: + vport_addr_del(esw, addr); + l2addr_hash_del(addr); + break; + } + } +} + +/* Sync vport UC/MC list from vport context */ +static void esw_update_vport_addr_list(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, int list_type) +{ + bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC; + u8 (*mac_list)[ETH_ALEN]; + struct l2addr_node *node; + struct vport_addr *addr; + struct hlist_head *hash; + struct hlist_node *tmp; + int size; + int err; + int hi; + int i; + + size = is_uc ? MLX5_MAX_UC_PER_VPORT(esw->dev) : + MLX5_MAX_MC_PER_VPORT(esw->dev); + + mac_list = kcalloc(size, ETH_ALEN, GFP_KERNEL); + if (!mac_list) + return; + + hash = is_uc ? vport->uc_list : vport->mc_list; + + for_each_l2hash_node(node, tmp, hash, hi) { + addr = container_of(node, struct vport_addr, node); + addr->action = MLX5_ACTION_DEL; + } + + if (!vport->enabled) + goto out; + + err = mlx5_query_nic_vport_mac_list(esw->dev, vport->vport, list_type, + mac_list, &size); + if (err) + goto out; + esw_debug(esw->dev, "vport[%d] context update %s list size (%d)\n", + vport->vport, is_uc ? "UC" : "MC", size); + + for (i = 0; i < size; i++) { + if (is_uc && !is_valid_ether_addr(mac_list[i])) + continue; + + if (!is_uc && !is_multicast_ether_addr(mac_list[i])) + continue; + + addr = l2addr_hash_find(hash, mac_list[i], struct vport_addr); + if (addr) { + addr->action = MLX5_ACTION_NONE; + /* If this mac was previously added because of allmulti + * promiscuous rx mode, its now converted to be original + * vport mac. 
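+ * The eswitch-wide mc_table entry then takes a regular reference + * (esw_mc->refcnt is incremented below) and the mc_promisc flag is cleared.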
+ */ + if (addr->mc_promisc) { + struct esw_mc_addr *esw_mc = + l2addr_hash_find(esw->mc_table, + mac_list[i], + struct esw_mc_addr); + if (!esw_mc) { + esw_warn(esw->dev, + "Failed to MAC(%pM) in mcast DB\n", + mac_list[i]); + continue; + } + esw_mc->refcnt++; + addr->mc_promisc = false; + } + continue; + } + + addr = l2addr_hash_add(hash, mac_list[i], struct vport_addr, + GFP_KERNEL); + if (!addr) { + esw_warn(esw->dev, + "Failed to add MAC(%pM) to vport[%d] DB\n", + mac_list[i], vport->vport); + continue; + } + addr->vport = vport->vport; + addr->action = MLX5_ACTION_ADD; + } +out: + kfree(mac_list); +} + +static void esw_update_acl_trunk_bitmap(struct mlx5_eswitch *esw, u32 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if(IS_ERR(vport)) + return; + + bitmap_and(vport->acl_vlan_8021q_bitmap, vport->req_vlan_bitmap, + vport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID); +} + +/* Sync vport vlan list from vport context */ +static void esw_update_vport_vlan_list(struct mlx5_eswitch *esw, u32 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + DECLARE_BITMAP(tmp_vlans_bitmap, VLAN_N_VID); + int err; + + if(IS_ERR(vport)) + return; + + if (!vport->enabled) + return; + + bitmap_copy(tmp_vlans_bitmap, vport->req_vlan_bitmap, VLAN_N_VID); + bitmap_zero(vport->req_vlan_bitmap, VLAN_N_VID); + + err = mlx5_query_nic_vport_vlans(esw->dev, vport_num, + vport->req_vlan_bitmap); + if (err) + return; + + bitmap_xor(tmp_vlans_bitmap, tmp_vlans_bitmap, vport->req_vlan_bitmap, + VLAN_N_VID); + if (!bitmap_weight(tmp_vlans_bitmap, VLAN_N_VID)) + return; + + esw_update_acl_trunk_bitmap(esw, vport_num); + esw_acl_egress_lgcy_setup(esw, vport); + esw_acl_ingress_lgcy_setup(esw, vport); +} + +/* Sync vport UC/MC list from vport context + * Must be called after esw_update_vport_addr_list + */ +static void esw_update_vport_mc_promisc(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct l2addr_node *node; + struct vport_addr *addr; + struct hlist_head *hash; + struct hlist_node *tmp; + int hi; + + hash = vport->mc_list; + + for_each_l2hash_node(node, tmp, esw->mc_table, hi) { + u8 *mac = node->addr; + + addr = l2addr_hash_find(hash, mac, struct vport_addr); + if (addr) { + if (addr->action == MLX5_ACTION_DEL) + addr->action = MLX5_ACTION_NONE; + continue; + } + addr = l2addr_hash_add(hash, mac, struct vport_addr, + GFP_KERNEL); + if (!addr) { + esw_warn(esw->dev, + "Failed to add allmulti MAC(%pM) to vport[%d] DB\n", + mac, vport->vport); + continue; + } + addr->vport = vport->vport; + addr->action = MLX5_ACTION_ADD; + addr->mc_promisc = true; + } +} + +/* Apply vport rx mode to HW FDB table */ +static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + bool promisc, bool mc_promisc) +{ + struct esw_mc_addr *allmulti_addr = &esw->mc_promisc; + + if (IS_ERR_OR_NULL(vport->allmulti_rule) != mc_promisc) + goto promisc; + + if (mc_promisc) { + vport->allmulti_rule = + esw_fdb_set_vport_allmulti_rule(esw, vport->vport); + if (!allmulti_addr->uplink_rule) + allmulti_addr->uplink_rule = + esw_fdb_set_vport_allmulti_rule(esw, + MLX5_VPORT_UPLINK); + allmulti_addr->refcnt++; + } else if (vport->allmulti_rule) { + mlx5_del_flow_rules(vport->allmulti_rule); + vport->allmulti_rule = NULL; + + if (--allmulti_addr->refcnt > 0) + goto promisc; + + if (allmulti_addr->uplink_rule) + mlx5_del_flow_rules(allmulti_addr->uplink_rule); + allmulti_addr->uplink_rule = NULL; + } + +promisc: + if 
(IS_ERR_OR_NULL(vport->promisc_rule) != promisc) + return; + + if (promisc) { + vport->promisc_rule = + esw_fdb_set_vport_promisc_rule(esw, vport->vport); + } else if (vport->promisc_rule) { + mlx5_del_flow_rules(vport->promisc_rule); + vport->promisc_rule = NULL; + } +} + +/* Sync vport rx mode from vport context */ +static void esw_update_vport_rx_mode(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + struct esw_mc_addr *allmulti_addr = &esw->mc_promisc; + struct mlx5_core_dev *dev = vport->dev; + int promisc_all = 0; + int promisc_uc = 0; + int promisc_mc = 0; + int err; + + err = mlx5_query_nic_vport_promisc(esw->dev, + vport->vport, + &promisc_uc, + &promisc_mc, + &promisc_all); + if (err) { + if (!pci_channel_offline(dev->pdev) && + dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + return; + + /* EEH or PCI error. Delete promisc, multi and uplink multi rules */ + if (vport->allmulti_rule) { + mlx5_del_flow_rules(vport->allmulti_rule); + vport->allmulti_rule = NULL; + + allmulti_addr->refcnt --; + if (!allmulti_addr->refcnt && allmulti_addr->uplink_rule) { + mlx5_del_flow_rules(allmulti_addr->uplink_rule); + allmulti_addr->uplink_rule = NULL; + } + } + + if (vport->promisc_rule) { + mlx5_del_flow_rules(vport->promisc_rule); + vport->promisc_rule = NULL; + } + + return; + } + esw_debug(esw->dev, "vport[%d] context update rx mode promisc_all=%d, all_multi=%d\n", + vport->vport, promisc_all, promisc_mc); + + if (!vport->info.trusted || !vport->enabled) { + promisc_uc = 0; + promisc_mc = 0; + promisc_all = 0; + } + + esw_apply_vport_rx_mode(esw, vport, promisc_all, + (promisc_all || promisc_mc)); +} + +void esw_vport_change_handle_locked(struct mlx5_vport *vport) +{ + struct mlx5_core_dev *dev = vport->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + u8 mac[ETH_ALEN]; + + mlx5_query_nic_vport_mac_address(dev, vport->vport, true, mac); + esw_debug(dev, "vport[%d] Context Changed: perm mac: %pM\n", + vport->vport, mac); + + if (vport->enabled_events & MLX5_VPORT_UC_ADDR_CHANGE) { + esw_update_vport_addr_list(esw, vport, MLX5_NVPRT_LIST_TYPE_UC); + esw_apply_vport_addr_list(esw, vport, MLX5_NVPRT_LIST_TYPE_UC); + } + + if (vport->enabled_events & MLX5_VPORT_MC_ADDR_CHANGE) + esw_update_vport_addr_list(esw, vport, MLX5_NVPRT_LIST_TYPE_MC); + + if (vport->enabled_events & MLX5_VPORT_VLAN_CHANGE) + esw_update_vport_vlan_list(esw, vport->vport); + + if (vport->enabled_events & MLX5_VPORT_PROMISC_CHANGE) { + esw_update_vport_rx_mode(esw, vport); + if (!IS_ERR_OR_NULL(vport->allmulti_rule)) + esw_update_vport_mc_promisc(esw, vport); + } + + if (vport->enabled_events & (MLX5_VPORT_PROMISC_CHANGE | MLX5_VPORT_MC_ADDR_CHANGE)) + esw_apply_vport_addr_list(esw, vport, MLX5_NVPRT_LIST_TYPE_MC); + + esw_debug(esw->dev, "vport[%d] Context Changed: Done\n", vport->vport); + if (vport->enabled) + arm_vport_context_events_cmd(dev, vport->vport, + vport->enabled_events); +} + +static void esw_vport_change_handler(struct work_struct *work) +{ + struct mlx5_vport *vport = + container_of(work, struct mlx5_vport, vport_change_handler); + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + mutex_lock(&esw->state_lock); + esw_vport_change_handle_locked(vport); + mutex_unlock(&esw->state_lock); +} + +int mlx5_esw_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, + u32 rate_mbps) +{ + u32 ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return PTR_ERR(vport); + + if (!vport->qos.enabled) + 
return -EOPNOTSUPP; + + MLX5_SET(scheduling_context, ctx, max_average_bw, rate_mbps); + + return mlx5_modify_scheduling_element_cmd(esw->dev, + SCHEDULING_HIERARCHY_E_SWITCH, + ctx, + vport->qos.esw_tsar_ix, + MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW); +} + +static void node_guid_gen_from_mac(u64 *node_guid, const u8 *mac) +{ + ((u8 *)node_guid)[7] = mac[0]; + ((u8 *)node_guid)[6] = mac[1]; + ((u8 *)node_guid)[5] = mac[2]; + ((u8 *)node_guid)[4] = 0xff; + ((u8 *)node_guid)[3] = 0xfe; + ((u8 *)node_guid)[2] = mac[3]; + ((u8 *)node_guid)[1] = mac[4]; + ((u8 *)node_guid)[0] = mac[5]; +} + +static int esw_vport_setup_acl(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (esw->mode == MLX5_ESWITCH_LEGACY) + return esw_legacy_vport_acl_setup(esw, vport); + else + return esw_vport_create_offloads_acl_tables(esw, vport); +} + +static void esw_vport_cleanup_acl(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (esw->mode == MLX5_ESWITCH_LEGACY) + esw_legacy_vport_acl_cleanup(esw, vport); + else + esw_vport_destroy_offloads_acl_tables(esw, vport); +} + +static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + enum esw_vst_mode vst_mode = esw_get_vst_mode(esw); + u16 vport_num = vport->vport; + int flags; + int err; + + err = esw_vport_setup_acl(esw, vport); + if (err) + return err; + + if (mlx5_esw_is_manager_vport(esw, vport_num)) + return 0; + + mlx5_modify_vport_admin_state(esw->dev, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + vport_num, 1, + vport->info.link_state); + + /* Host PF has its own mac/guid. */ + if (vport_num) { + mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, + vport->info.mac); + mlx5_modify_nic_vport_node_guid(esw->dev, vport_num, + vport->info.node_guid); + } + + flags = (vport->info.vlan || vport->info.qos) ? 
+ SET_VLAN_STRIP | SET_VLAN_INSERT : 0; + if (vst_mode != ESW_VST_MODE_STEERING) + modify_esw_vport_cvlan(esw->dev, vport_num, vport->info.vlan, + vport->info.qos, flags, vst_mode); + + return 0; +} + +static void esw_vport_query(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + u16 vport_num = vport->vport; + + /* Update vport info with host PF mac */ + if ((vport_num == MLX5_VPORT_PF) && mlx5_core_is_ecpf(esw->dev)) + mlx5_query_nic_vport_mac_address(esw->dev, 0, 1, + vport->info.mac); +} + +/* Don't cleanup vport->info, it's needed to restore vport configuration */ +static void esw_vport_cleanup(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + u16 vport_num = vport->vport; + + if (!mlx5_esw_is_manager_vport(esw, vport_num)) + mlx5_modify_vport_admin_state(esw->dev, + MLX5_VPORT_STATE_OP_MOD_ESW_VPORT, + vport_num, 1, + MLX5_VPORT_ADMIN_STATE_DOWN); + + mlx5_esw_qos_vport_disable(esw, vport); + esw_vport_cleanup_acl(esw, vport); +} + +static int mlx5_esw_query_hca_trusted(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + bool trusted; + int err; + + err = mlx5_esw_get_hca_trusted(esw, vport->vport, &trusted); + if (err == -EOPNOTSUPP) + return 0; + + if (err) + return err; + + vport->info.offloads_trusted = trusted; + + return 0; +} + +int mlx5_esw_vport_enable(struct mlx5_eswitch *esw, u16 vport_num, + enum mlx5_eswitch_vport_event enabled_events) +{ + struct mlx5_vport *vport; + int ret; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return PTR_ERR(vport); + + mutex_lock(&esw->state_lock); + if (vport->enabled) + goto done; + + esw_debug(esw->dev, "Enabling VPORT(%d)\n", vport_num); + spin_lock_init(&vport->pg_counters_lock); + + bitmap_zero(vport->req_vlan_bitmap, VLAN_N_VID); + bitmap_zero(vport->acl_vlan_8021q_bitmap, VLAN_N_VID); + bitmap_zero(vport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID); + + esw_vport_query(esw, vport); + ret = esw_vport_setup(esw, vport); + if (ret) + goto done; + + /* Sync with current vport context */ + vport->enabled_events = enabled_events; + vport->enabled = true; + + /* Esw manager is trusted by default. Host PF (vport 0) is trusted as well + * in smartNIC as it's a vport group manager. + */ + if (mlx5_esw_is_manager_vport(esw, vport_num) || + (!vport_num && mlx5_core_is_ecpf(esw->dev))) + vport->info.trusted = true; + + if (!mlx5_esw_is_manager_vport(esw, vport->vport) && + MLX5_CAP_GEN(esw->dev, vhca_resource_manager)) { + ret = mlx5_esw_vport_vhca_id_set(esw, vport_num); + if (ret) + goto err_vhca_mapping; + } + + if (!mlx5_esw_is_manager_vport(esw, vport->vport)) { + ret = mlx5_esw_query_hca_trusted(esw, vport); + if (ret) + goto err_vhca_mapping; + } + + /* External controller host PF has factory programmed MAC. + * Read it from the device. 
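+	 * The same query is also done in esw_vport_query(), which runs
+	 * earlier in this function.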
+ */ + if (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) + mlx5_query_nic_vport_mac_address(esw->dev, vport_num, true, vport->info.mac); + + esw_vport_change_handle_locked(vport); + + esw->enabled_vports++; + esw_debug(esw->dev, "Enabled VPORT(%d)\n", vport_num); +done: + mutex_unlock(&esw->state_lock); + return ret; + +err_vhca_mapping: + esw_vport_cleanup(esw, vport); + mutex_unlock(&esw->state_lock); + return ret; +} + +void mlx5_esw_vport_disable(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return; + + mutex_lock(&esw->state_lock); + if (!vport->enabled) + goto done; + + esw_debug(esw->dev, "Disabling vport(%d)\n", vport_num); + /* Mark this vport as disabled to discard new events */ + vport->enabled = false; + + /* Disable events from this vport */ + arm_vport_context_events_cmd(esw->dev, vport->vport, 0); + + if (!mlx5_esw_is_manager_vport(esw, vport->vport) && + MLX5_CAP_GEN(esw->dev, vhca_resource_manager)) + mlx5_esw_vport_vhca_id_clear(esw, vport_num); + + /* We don't assume VFs will cleanup after themselves. + * Calling vport change handler while vport is disabled will cleanup + * the vport resources. + */ + esw_vport_change_handle_locked(vport); + vport->enabled_events = 0; + esw_vport_cleanup(esw, vport); + esw->enabled_vports--; + +done: + mutex_unlock(&esw->state_lock); +} + +static int eswitch_vport_event(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_eswitch *esw = mlx5_nb_cof(nb, struct mlx5_eswitch, nb); + struct mlx5_eqe *eqe = data; + struct mlx5_vport *vport; + u16 vport_num; + + vport_num = be16_to_cpu(eqe->data.vport_change.vport_num); + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (!IS_ERR(vport)) + queue_work(esw->work_queue, &vport->vport_change_handler); + return NOTIFY_OK; +} + +/** + * mlx5_esw_query_functions - Returns raw output about functions state + * @dev: Pointer to device to query + * + * mlx5_esw_query_functions() allocates and returns functions changed + * raw output memory pointer from device on success. Otherwise returns ERR_PTR. + * Caller must free the memory using kvfree() when valid pointer is returned. 
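+ *
+ * An illustrative caller sketch (it mirrors mlx5_eswitch_update_num_of_vfs()
+ * later in this file):
+ *
+ *	out = mlx5_esw_query_functions(esw->dev);
+ *	if (IS_ERR(out))
+ *		return;
+ *	esw->esw_funcs.num_vfs = MLX5_GET(query_esw_functions_out, out,
+ *					  host_params_context.host_num_of_vfs);
+ *	kvfree(out);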
+ */ +const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev) +{ + int outlen = MLX5_ST_SZ_BYTES(query_esw_functions_out); + u32 in[MLX5_ST_SZ_DW(query_esw_functions_in)] = {}; + u32 *out; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return ERR_PTR(-ENOMEM); + + MLX5_SET(query_esw_functions_in, in, opcode, + MLX5_CMD_OP_QUERY_ESW_FUNCTIONS); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); + if (!err) + return out; + + kvfree(out); + return ERR_PTR(err); +} + +static int mlx5_esw_host_functions_enabled_query(struct mlx5_eswitch *esw) +{ + const u32 *query_host_out; + + if (!mlx5_core_is_ecpf_esw_manager(esw->dev)) + return 0; + + query_host_out = mlx5_esw_query_functions(esw->dev); + if (IS_ERR(query_host_out)) + return PTR_ERR(query_host_out); + + esw->esw_funcs.host_funcs_disabled = MLX5_GET(query_esw_functions_out, query_host_out, + host_params_context.host_pf_not_exist); + kvfree(query_host_out); + return 0; +} + +static void mlx5_eswitch_event_handlers_register(struct mlx5_eswitch *esw) +{ + MLX5_NB_INIT(&esw->nb, eswitch_vport_event, NIC_VPORT_CHANGE); + mlx5_eq_notifier_register(esw->dev, &esw->nb); + + if (esw->mode == MLX5_ESWITCH_OFFLOADS && mlx5_eswitch_is_funcs_handler(esw->dev)) { + MLX5_NB_INIT(&esw->esw_funcs.nb, mlx5_esw_funcs_changed_handler, + ESW_FUNCTIONS_CHANGED); + mlx5_eq_notifier_register(esw->dev, &esw->esw_funcs.nb); + } +} + +static void mlx5_eswitch_event_handlers_unregister(struct mlx5_eswitch *esw) +{ + if (esw->mode == MLX5_ESWITCH_OFFLOADS && mlx5_eswitch_is_funcs_handler(esw->dev)) + mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb); + + mlx5_eq_notifier_unregister(esw->dev, &esw->nb); + + flush_workqueue(esw->work_queue); +} + +static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + memset(&vport->qos, 0, sizeof(vport->qos)); + memset(&vport->info, 0, sizeof(vport->info)); + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + vport->info.vlan_proto = htons(ETH_P_8021Q); + } +} + +/* Public E-Switch API */ +int mlx5_eswitch_load_vport(struct mlx5_eswitch *esw, u16 vport_num, + enum mlx5_eswitch_vport_event enabled_events) +{ + int err; + + err = mlx5_esw_vport_enable(esw, vport_num, enabled_events); + if (err) + return err; + + mlx5_esw_vport_debugfs_create(esw, vport_num, false, 0); + err = esw_offloads_load_rep(esw, vport_num); + if (err) + goto err_rep; + + return err; + +err_rep: + mlx5_esw_vport_debugfs_destroy(esw, vport_num); + mlx5_esw_vport_disable(esw, vport_num); + return err; +} + +void mlx5_eswitch_unload_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + esw_offloads_unload_rep(esw, vport_num); + mlx5_esw_vport_debugfs_destroy(esw, vport_num); + mlx5_esw_vport_disable(esw, vport_num); +} + +void mlx5_eswitch_unload_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + if (!vport->enabled) + continue; + mlx5_eswitch_unload_vport(esw, vport->vport); + } +} + +int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs, + enum mlx5_eswitch_vport_event enabled_events) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + mlx5_esw_for_each_vf_vport(esw, i, vport, num_vfs) { + err = mlx5_eswitch_load_vport(esw, vport->vport, enabled_events); + if (err) + goto vf_err; + } + + return 0; + +vf_err: + mlx5_eswitch_unload_vf_vports(esw, 
num_vfs); + return err; +} + +static int host_pf_enable_hca(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_ecpf(dev)) + return 0; + + /* Once vport and representor are ready, take out the external host PF + * out of initializing state. Enabling HCA clears the iser->initializing + * bit and host PF driver loading can progress. + */ + return mlx5_cmd_host_pf_enable_hca(dev); +} + +static void host_pf_disable_hca(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_ecpf(dev)) + return; + + mlx5_cmd_host_pf_disable_hca(dev); +} + +/* mlx5_eswitch_enable_pf_vf_vports() enables vports of PF, ECPF and VFs + * whichever are present on the eswitch. + */ +int +mlx5_eswitch_enable_pf_vf_vports(struct mlx5_eswitch *esw, + enum mlx5_eswitch_vport_event enabled_events) +{ + int ret; + + /* Enable PF vport, if it exists */ + if (mlx5_esw_host_functions_enabled(esw->dev)) { + ret = mlx5_eswitch_load_vport(esw, MLX5_VPORT_PF, enabled_events); + if (ret) + return ret; + + /* Enable external host PF HCA */ + ret = host_pf_enable_hca(esw->dev); + if (ret) + goto pf_hca_err; + } + + /* Enable ECPF vport */ + if (mlx5_ecpf_vport_exists(esw->dev)) { + ret = mlx5_eswitch_load_vport(esw, MLX5_VPORT_ECPF, enabled_events); + if (ret) + goto ecpf_err; + } + + /* Enable VF vports */ + ret = mlx5_eswitch_load_vf_vports(esw, esw->esw_funcs.num_vfs, + enabled_events); + if (ret) + goto vf_err; + return 0; + +vf_err: + if (mlx5_ecpf_vport_exists(esw->dev)) + mlx5_eswitch_unload_vport(esw, MLX5_VPORT_ECPF); +ecpf_err: + if (mlx5_esw_host_functions_enabled(esw->dev)) + host_pf_disable_hca(esw->dev); +pf_hca_err: + if (mlx5_esw_host_functions_enabled(esw->dev)) + mlx5_eswitch_unload_vport(esw, MLX5_VPORT_PF); + return ret; +} + +/* mlx5_eswitch_disable_pf_vf_vports() disables vports of PF, ECPF and VFs + * whichever are previously enabled on the eswitch. 
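+ * Teardown runs in the reverse order of mlx5_eswitch_enable_pf_vf_vports()
+ * above: VF vports first, then the ECPF vport, and finally the external
+ * host PF HCA and PF vport.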
+ */ +void mlx5_eswitch_disable_pf_vf_vports(struct mlx5_eswitch *esw) +{ + mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs); + + if (mlx5_ecpf_vport_exists(esw->dev)) + mlx5_eswitch_unload_vport(esw, MLX5_VPORT_ECPF); + + if (mlx5_esw_host_functions_enabled(esw->dev)) { + host_pf_disable_hca(esw->dev); + mlx5_eswitch_unload_vport(esw, MLX5_VPORT_PF); + } +} + +static void mlx5_eswitch_get_devlink_param(struct mlx5_eswitch *esw) +{ + struct devlink *devlink = priv_to_devlink(esw->dev); + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(devlink, + MLX5_DEVLINK_PARAM_ID_ESW_LARGE_GROUP_NUM, + &val); + if (!err) { + esw->params.large_group_num = val.vu32; + } else { + esw_warn(esw->dev, + "Devlink can't get param fdb_large_groups, uses default (%d).\n", + ESW_OFFLOADS_DEFAULT_NUM_GROUPS); + esw->params.large_group_num = ESW_OFFLOADS_DEFAULT_NUM_GROUPS; + } +} + +static void +mlx5_eswitch_update_num_of_vfs(struct mlx5_eswitch *esw, int num_vfs) +{ + const u32 *out; + + if (num_vfs < 0) + return; + + if (!mlx5_core_is_ecpf_esw_manager(esw->dev)) { + esw->esw_funcs.num_vfs = num_vfs; + return; + } + + out = mlx5_esw_query_functions(esw->dev); + if (IS_ERR(out)) + return; + + esw->esw_funcs.num_vfs = MLX5_GET(query_esw_functions_out, out, + host_params_context.host_num_of_vfs); + kvfree(out); +} + +static void mlx5_esw_mode_change_notify(struct mlx5_eswitch *esw, u16 mode) +{ + struct mlx5_esw_event_info info = {}; + + info.new_mode = mode; + + blocking_notifier_call_chain(&esw->n_head, 0, &info); +} + +static int mlx5_esw_acls_ns_init(struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *dev = esw->dev; + int total_vports; + int err; + + if (esw->flags & MLX5_ESWITCH_VPORT_ACL_NS_CREATED) + return 0; + + total_vports = mlx5_eswitch_get_total_vports(dev); + + if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) { + err = mlx5_fs_egress_acls_init(dev, total_vports); + if (err) + return err; + } else { + esw_warn(dev, "engress ACL is not supported by FW\n"); + } + + if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) { + err = mlx5_fs_ingress_acls_init(dev, total_vports); + if (err) + goto err; + } else { + esw_warn(dev, "ingress ACL is not supported by FW\n"); + } + esw->flags |= MLX5_ESWITCH_VPORT_ACL_NS_CREATED; + return 0; + +err: + if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) + mlx5_fs_egress_acls_cleanup(dev); + return err; +} + +static void mlx5_esw_acls_ns_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *dev = esw->dev; + + esw->flags &= ~MLX5_ESWITCH_VPORT_ACL_NS_CREATED; + if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) + mlx5_fs_ingress_acls_cleanup(dev); + if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) + mlx5_fs_egress_acls_cleanup(dev); +} + +/** + * mlx5_eswitch_enable_locked - Enable eswitch + * @esw: Pointer to eswitch + * @num_vfs: Enable eswitch for given number of VFs. This is optional. + * Valid value are 0, > 0 and MLX5_ESWITCH_IGNORE_NUM_VFS. + * Caller should pass num_vfs > 0 when enabling eswitch for + * vf vports. Caller should pass num_vfs = 0, when eswitch + * is enabled without sriov VFs or when caller + * is unaware of the sriov state of the host PF on ECPF based + * eswitch. Caller should pass < 0 when num_vfs should be + * completely ignored. This is typically the case when eswitch + * is enabled without sriov regardless of PF/ECPF system. + * mlx5_eswitch_enable_locked() Enables eswitch in either legacy or offloads + * mode. If num_vfs >=0 is provided, it setup VF related eswitch vports. 
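+ * The caller must already hold esw->mode_lock (see the lockdep assertion
+ * in the body); mlx5_eswitch_enable() below takes the lock (and toggles
+ * LAG) before calling it.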
+ * It returns 0 on success or error code on failure. + */ +int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int num_vfs) +{ + int err; + + lockdep_assert_held(&esw->mode_lock); + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) { + esw_warn(esw->dev, "FDB is not supported, aborting ...\n"); + return -EOPNOTSUPP; + } + + mlx5_eswitch_get_devlink_param(esw); + + err = mlx5_esw_acls_ns_init(esw); + if (err) + return err; + + mlx5_eswitch_update_num_of_vfs(esw, num_vfs); + + if (num_vfs > 0 && esw->mode == MLX5_ESWITCH_OFFLOADS && + mlx5_core_is_ecpf_esw_manager(esw->dev)) { + esw_warn(esw->dev, + "Failed to enable switchdev mode. Must destroy VFs from host PF.\n"); + return -EOPNOTSUPP; + } + + if (esw->mode == MLX5_ESWITCH_LEGACY) { + err = esw_legacy_enable(esw); + } else { + mlx5_rescan_drivers(esw->dev); + err = esw_offloads_enable(esw); + } + + if (err) + goto abort; + + err = mlx5_activate_mpesw_lag(esw); + if (err) + goto esw_disable; + + esw->fdb_table.flags |= MLX5_ESW_FDB_CREATED; + + mlx5_eswitch_event_handlers_register(esw); + + esw_info(esw->dev, "Enable: mode(%s), nvfs(%d), active vports(%d)\n", + esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS", + esw->esw_funcs.num_vfs, esw->enabled_vports); + + mlx5_esw_mode_change_notify(esw, esw->mode); + + return 0; + +esw_disable: + if (esw->mode == MLX5_ESWITCH_LEGACY) + esw_legacy_disable(esw); + else + esw_offloads_disable(esw); +abort: + mlx5_esw_acls_ns_cleanup(esw); + return err; +} + +/** + * mlx5_eswitch_enable - Enable eswitch + * @esw: Pointer to eswitch + * @num_vfs: Enable eswitch switch for given number of VFs. + * Caller must pass num_vfs > 0 when enabling eswitch for + * vf vports. + * mlx5_eswitch_enable() returns 0 on success or error code on failure. + */ +int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) +{ + bool toggle_lag; + int ret; + + if (!mlx5_esw_allowed(esw)) + return 0; + + toggle_lag = !mlx5_sriov_is_enabled(esw->dev) && !is_mdev_switchdev_mode(esw->dev); + + if (toggle_lag) + mlx5_lag_disable_change(esw->dev); + + down_write(&esw->mode_lock); + if (!mlx5_esw_is_fdb_created(esw)) { + ret = mlx5_eswitch_enable_locked(esw, num_vfs); + } else { + enum mlx5_eswitch_vport_event vport_events; + + vport_events = (esw->mode == MLX5_ESWITCH_LEGACY) ? + MLX5_LEGACY_SRIOV_VPORT_EVENTS : MLX5_VPORT_UC_ADDR_CHANGE; + ret = mlx5_eswitch_load_vf_vports(esw, num_vfs, vport_events); + if (!ret) + esw->esw_funcs.num_vfs = num_vfs; + } + up_write(&esw->mode_lock); + + if (toggle_lag) + mlx5_lag_enable_change(esw->dev); + + return ret; +} + +/* When disabling sriov, free driver level resources. */ +void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf) +{ + if (!mlx5_esw_allowed(esw)) + return; + + down_write(&esw->mode_lock); + /* If driver is unloaded, this function is called twice by remove_one() + * and mlx5_unload(). Prevent the second call. + */ + if (!esw->esw_funcs.num_vfs && !clear_vf) + goto unlock; + + esw_info(esw->dev, "Unload vfs: mode(%s), nvfs(%d), active vports(%d)\n", + esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS", + esw->esw_funcs.num_vfs, esw->enabled_vports); + + mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs); + if (clear_vf) + mlx5_eswitch_clear_vf_vports_info(esw); + /* If disabling sriov in switchdev mode, free meta rules here + * because it depends on num_vfs. 
+ */ + if (esw->mode == MLX5_ESWITCH_OFFLOADS) { + struct devlink *devlink = priv_to_devlink(esw->dev); + + devlink_rate_nodes_destroy(devlink); +#if IS_ENABLED(CONFIG_MLXDEVM) + mlx5_devm_rate_nodes_destroy(esw->dev); +#endif + } + + esw->esw_funcs.num_vfs = 0; + atomic64_set(&esw->user_count, 0); + +unlock: + up_write(&esw->mode_lock); +} + +/* Free resources for corresponding eswitch mode. It is called by devlink + * when changing eswitch mode or modprobe when unloading driver. + */ +void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw) +{ + struct devlink *devlink = priv_to_devlink(esw->dev); + + lockdep_assert_held_write(&esw->mode_lock); + + /* Notify eswitch users that it is exiting from current mode. + * So that it can do necessary cleanup before the eswitch is disabled. + */ + mlx5_esw_mode_change_notify(esw, MLX5_ESWITCH_LEGACY); + + mlx5_eswitch_event_handlers_unregister(esw); + mlx5_deactivate_mpesw_lag(esw); + + esw_info(esw->dev, "Disable: mode(%s), nvfs(%d), active vports(%d)\n", + esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS", + esw->esw_funcs.num_vfs, esw->enabled_vports); + + if (esw->fdb_table.flags & MLX5_ESW_FDB_CREATED) { + esw->fdb_table.flags &= ~MLX5_ESW_FDB_CREATED; + if (esw->mode == MLX5_ESWITCH_OFFLOADS) + esw_offloads_disable(esw); + else if (esw->mode == MLX5_ESWITCH_LEGACY) + esw_legacy_disable(esw); + mlx5_esw_acls_ns_cleanup(esw); + } + + if (esw->mode == MLX5_ESWITCH_OFFLOADS) { + devlink_rate_nodes_destroy(devlink); +#if IS_ENABLED(CONFIG_MLXDEVM) + mlx5_devm_rate_nodes_destroy(esw->dev); +#endif + } +} + +void mlx5_eswitch_disable(struct mlx5_eswitch *esw) +{ + if (!mlx5_esw_allowed(esw)) + return; + + mlx5_lag_disable_change(esw->dev); + down_write(&esw->mode_lock); + mlx5_eswitch_disable_locked(esw); + up_write(&esw->mode_lock); + mlx5_lag_enable_change(esw->dev); +} + +static int mlx5_query_hca_cap_host_pf(struct mlx5_core_dev *dev, void *out) +{ + u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {}; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + MLX5_SET(query_hca_cap_in, in, function_id, MLX5_VPORT_PF); + MLX5_SET(query_hca_cap_in, in, other_function, true); + return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); +} + +int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *sf_base_id) + +{ + int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *query_ctx; + void *hca_caps; + int err; + + if (!mlx5_core_is_ecpf(dev) || !mlx5_esw_host_functions_enabled(dev)) { + *max_sfs = 0; + return 0; + } + + query_ctx = kzalloc(query_out_sz, GFP_KERNEL); + if (!query_ctx) + return -ENOMEM; + + err = mlx5_query_hca_cap_host_pf(dev, query_ctx); + if (err) + goto out_free; + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); + *max_sfs = MLX5_GET(cmd_hca_cap, hca_caps, max_num_sf); + *sf_base_id = MLX5_GET(cmd_hca_cap, hca_caps, sf_base_id); + +out_free: + kfree(query_ctx); + return err; +} + +static void mlx5_esw_vport_matchid_free(struct mlx5_vport *vport) +{ + mutex_lock(&mlx5e_vport_match_ida_mutex); + ida_simple_remove(&mlx5e_vport_match_ida, vport->match_id); + mutex_unlock(&mlx5e_vport_match_ida_mutex); +} + +static void mlx5_esw_vports_matchid_free(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vport(esw, i, vport) + mlx5_esw_vport_matchid_free(vport); +} + +static int 
mlx5_esw_vport_matchid_alloc(struct mlx5_eswitch *esw, struct mlx5_core_dev *dev, struct mlx5_vport *vport) +{ + bool access_other_hca_roce; + int err = 0; + bool roce; + + vport->info.vlan_proto = htons(ETH_P_8021Q); + vport->info.roce = true; + mutex_lock(&mlx5e_vport_match_ida_mutex); + vport->match_id = ida_simple_get(&mlx5e_vport_match_ida, + 0, VHCA_VPORT_MATCH_ID_SIZE, + GFP_KERNEL); + mutex_unlock(&mlx5e_vport_match_ida_mutex); + if (vport->match_id < 0) { + err = -ENOSPC; + goto abort; + } + + access_other_hca_roce = MLX5_CAP_GEN(dev, vhca_group_manager) && + MLX5_CAP_GEN(dev, access_other_hca_roce); + mutex_init(&vport->ingress.offloads.vf_meter_lock); + mutex_init(&vport->egress.offloads.vf_meter_lock); + if (access_other_hca_roce && + vport->vport != MLX5_VPORT_UPLINK && + !mlx5_esw_is_sf_vport(esw, vport->vport)) { + mlx5_get_other_hca_cap_roce(dev, vport->vport, + &roce); + vport->info.roce = roce; + } + INIT_LIST_HEAD(&vport->egress.legacy.allow_vlans_rules); + INIT_LIST_HEAD(&vport->ingress.legacy.allow_vlans_rules); + +abort: + return err; +} + +static int mlx5_esw_vports_matchid_alloc(struct mlx5_eswitch *esw, struct mlx5_core_dev *dev) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + mlx5_esw_for_each_vport(esw, i, vport) { + err = mlx5_esw_vport_matchid_alloc(esw, dev, vport); + if (err) + goto special_err; + } + + return 0; + +special_err: + mlx5_esw_for_each_vport(esw, i, vport) { + if (vport->match_id < 0) + break; + mlx5_esw_vport_matchid_free(vport); + } + + return err; +} + +static int mlx5_esw_vport_alloc(struct mlx5_eswitch *esw, struct mlx5_core_dev *dev, + int index, u16 vport_num) +{ + struct mlx5_vport *vport; + int err; + + vport = kzalloc(sizeof(*vport), GFP_KERNEL); + if (!vport) + return -ENOMEM; + + vport->dev = esw->dev; + vport->vport = vport_num; + vport->index = index; + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + INIT_WORK(&vport->vport_change_handler, esw_vport_change_handler); + err = xa_insert(&esw->vports, vport_num, vport, GFP_KERNEL); + if (err) + goto insert_err; + + esw->total_vports++; + return 0; + +insert_err: + kfree(vport); + return err; +} + +static void mlx5_esw_vport_free(struct mlx5_eswitch *esw, struct mlx5_vport *vport) +{ + xa_erase(&esw->vports, vport->vport); + kfree(vport); +} + +static void mlx5_esw_vports_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + mlx5_esw_for_each_vport(esw, i, vport) + mlx5_esw_vport_free(esw, vport); + xa_destroy(&esw->vports); +} + +static int mlx5_esw_vports_init(struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *dev = esw->dev; + u16 max_host_pf_sfs; + u16 base_sf_num; + int idx = 0; + int err; + int i; + + xa_init(&esw->vports); + + if (mlx5_esw_host_functions_enabled(esw->dev)) { + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_PF); + if (err) + goto err; + if (esw->first_host_vport == MLX5_VPORT_PF) + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + idx++; + + for (i = 0; i < mlx5_core_max_vfs(dev); i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, idx); + if (err) + goto err; + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_VF); + xa_set_mark(&esw->vports, idx, MLX5_ESW_VPT_HOST_FN); + idx++; + } + } + + base_sf_num = mlx5_sf_start_function_id(dev); + for (i = 0; i < mlx5_sf_max_functions(dev); i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, base_sf_num + i); + if (err) + goto err; + xa_set_mark(&esw->vports, base_sf_num + i, MLX5_ESW_VPT_SF); + idx++; + } + + err = mlx5_esw_sf_max_hpf_functions(dev, 
&max_host_pf_sfs, &base_sf_num); + if (err) + goto err; + for (i = 0; i < max_host_pf_sfs; i++) { + err = mlx5_esw_vport_alloc(esw, dev, idx, base_sf_num + i); + if (err) + goto err; + xa_set_mark(&esw->vports, base_sf_num + i, MLX5_ESW_VPT_SF); + idx++; + } + + /* Once the code is restructured to do eswitch manager port allocation + * and initialization, have the check for ecpf, because on ECPF, ACL + * enablement is needed. + */ + if (mlx5_ecpf_vport_exists(dev) || mlx5_core_is_ecpf(esw->dev)) { + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_ECPF); + if (err) + goto err; + idx++; + } + err = mlx5_esw_vport_alloc(esw, dev, idx, MLX5_VPORT_UPLINK); + if (err) + goto err; + return 0; + +err: + mlx5_esw_vports_cleanup(esw); + return err; +} + +static int mlx5_esw_ib_init(struct mlx5_core_dev *dev) +{ + struct mlx5_hca_vport_context vpc = {}; + int err; + + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB && + mlx5_core_is_ecpf(dev)) { + err = mlx5_query_hca_vport_context(dev, 1, 0, 0, &vpc); + if (err) + return err; + + vpc.policy = MLX5_POLICY_FOLLOW; + vpc.field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; + + return mlx5_core_modify_hca_vport_context(dev, 1, 0, 0, &vpc); + } + + return 0; +} + +int mlx5_eswitch_init(struct mlx5_core_dev *dev) +{ + struct mlx5_eswitch *esw; + int err; + + if (!MLX5_VPORT_MANAGER(dev)) + return mlx5_esw_ib_init(dev); + + esw = kzalloc(sizeof(*esw), GFP_KERNEL); + if (!esw) + return -ENOMEM; + + esw->dev = dev; + esw->manager_vport = mlx5_eswitch_manager_vport(dev); + esw->first_host_vport = mlx5_eswitch_first_host_vport_num(dev); + dev->priv.eswitch = esw; + + esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq"); + if (!esw->work_queue) { + err = -ENOMEM; + goto abort; + } + + err = mlx5_esw_host_functions_enabled_query(esw); + if (err) + goto abort; + + err = mlx5_esw_vports_init(esw); + if (err) + goto abort; + + err = mlx5_esw_vports_matchid_alloc(esw, dev); + if (err) + goto matchid_err; + + err = esw_offloads_init_reps(esw); + if (err) + goto reps_err; + + mutex_init(&esw->offloads.encap_tbl_lock); + hash_init(esw->offloads.encap_tbl); + mutex_init(&esw->offloads.decap_tbl_lock); + hash_init(esw->offloads.decap_tbl); + mlx5e_mod_hdr_tbl_init(&esw->offloads.mod_hdr); + atomic64_set(&esw->offloads.num_flows, 0); + ida_init(&esw->offloads.vport_metadata_ida); + xa_init_flags(&esw->offloads.vhca_map, XA_FLAGS_ALLOC); + mutex_init(&esw->state_lock); + lockdep_register_key(&esw->mode_lock_key); + init_rwsem(&esw->mode_lock); + lockdep_set_class(&esw->mode_lock, &esw->mode_lock_key); + refcount_set(&esw->qos.refcnt, 0); + + esw->enabled_vports = 0; + esw->mode = MLX5_ESWITCH_LEGACY; + esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE; + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) && + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap)) + esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; + else + esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; + + BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); + + esw->dbgfs = debugfs_create_dir("esw", mlx5_debugfs_get_dev_root(esw->dev)); + esw_info(dev, + "Total vports %d, per vport: max uc(%d) max mc(%d)\n", + esw->total_vports, + MLX5_MAX_UC_PER_VPORT(dev), + MLX5_MAX_MC_PER_VPORT(dev)); + return 0; + +reps_err: + mlx5_esw_vports_matchid_free(esw); +matchid_err: + mlx5_esw_vports_cleanup(esw); +abort: + if (esw->work_queue) + destroy_workqueue(esw->work_queue); + kfree(esw); + return err; +} + +int mlx5_eswitch_vport_modify_other_hca_cap_roce(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, bool 
value) +{ + int err = 0; + + if (!(MLX5_CAP_GEN(esw->dev, vhca_group_manager) && + MLX5_CAP_GEN(esw->dev, access_other_hca_roce))) + return -EOPNOTSUPP; + + mutex_lock(&esw->state_lock); + + if (vport->info.roce == value) + goto out; + + err = mlx5_modify_other_hca_cap_roce(esw->dev, vport->vport, value); + if (!err) + vport->info.roce = value; + +out: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_vport_get_other_hca_cap_roce(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, bool *value) +{ + if (!(MLX5_CAP_GEN(esw->dev, vhca_group_manager) && + MLX5_CAP_GEN(esw->dev, access_other_hca_roce))) + return -EOPNOTSUPP; + + mutex_lock(&esw->state_lock); + *value = vport->info.roce; + mutex_unlock(&esw->state_lock); + + return 0; +} + +void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) +{ + if (!esw || !MLX5_VPORT_MANAGER(esw->dev)) + return; + + esw_info(esw->dev, "cleanup\n"); + + debugfs_remove_recursive(esw->dbgfs); + esw->dev->priv.eswitch = NULL; + destroy_workqueue(esw->work_queue); + WARN_ON(refcount_read(&esw->qos.refcnt)); + lockdep_unregister_key(&esw->mode_lock_key); + mutex_destroy(&esw->state_lock); + WARN_ON(!xa_empty(&esw->offloads.vhca_map)); + xa_destroy(&esw->offloads.vhca_map); + ida_destroy(&esw->offloads.vport_metadata_ida); + mlx5e_mod_hdr_tbl_destroy(&esw->offloads.mod_hdr); + mutex_destroy(&esw->offloads.encap_tbl_lock); + mutex_destroy(&esw->offloads.decap_tbl_lock); + esw_offloads_cleanup_reps(esw); + mlx5_esw_vports_matchid_free(esw); + mlx5_esw_vports_cleanup(esw); + kfree(esw); +} + +/* Vport Administration */ +static int +mlx5_esw_set_vport_mac_locked(struct mlx5_eswitch *esw, + struct mlx5_vport *evport, const u8 *mac) +{ + u16 vport_num = evport->vport; + u64 node_guid; + int err = 0; + + if (is_multicast_ether_addr(mac)) + return -EINVAL; + + if (evport->info.spoofchk && !is_valid_ether_addr(mac)) + mlx5_core_warn(esw->dev, + "Set invalid MAC while spoofchk is on, vport(%d)\n", + vport_num); + + err = mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, mac); + if (err) { + mlx5_core_warn(esw->dev, + "Failed to mlx5_modify_nic_vport_mac vport(%d) err=(%d)\n", + vport_num, err); + return err; + } + + node_guid_gen_from_mac(&node_guid, mac); + err = mlx5_modify_nic_vport_node_guid(esw->dev, vport_num, node_guid); + if (err) + mlx5_core_warn(esw->dev, + "Failed to set vport %d node guid, err = %d. 
RDMA_CM will not function properly for this VF.\n", + vport_num, err); + + ether_addr_copy(evport->info.mac, mac); + evport->info.node_guid = node_guid; + if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) + err = esw_acl_ingress_lgcy_setup(esw, evport); + + return err; +} + +int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, + u16 vport, const u8 *mac) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + int err = 0; + + if (IS_ERR(evport)) + return PTR_ERR(evport); + + mutex_lock(&esw->state_lock); + err = mlx5_esw_set_vport_mac_locked(esw, evport, mac); + mutex_unlock(&esw->state_lock); + return err; +} + +static bool mlx5_esw_check_port_type(struct mlx5_eswitch *esw, u16 vport_num, xa_mark_t mark) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) + return false; + + return xa_get_mark(&esw->vports, vport_num, mark); +} + +bool mlx5_eswitch_is_vf_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_esw_check_port_type(esw, vport_num, MLX5_ESW_VPT_VF); +} + +bool mlx5_esw_is_sf_vport(struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_esw_check_port_type(esw, vport_num, MLX5_ESW_VPT_SF); +} + +static int +mlx5_esw_set_hca_trusted(struct mlx5_eswitch *esw, u16 vport_num, bool trusted) +{ + u32 out[MLX5_ST_SZ_DW(vhca_trust_level)] = {0}; + u32 in[MLX5_ST_SZ_DW(vhca_trust_level)] = {0}; + int sz = MLX5_ST_SZ_BYTES(vhca_trust_level); + u16 vhca_id; + int err; + + if (!MLX5_CAP_GEN(esw->dev, vhca_trust_level_reg)) + return -EOPNOTSUPP; + + err = mlx5_esw_query_vport_vhca_id(esw, vport_num, &vhca_id); + if (err) { + esw_warn(esw->dev, "Getting vhca_id for vport failed (vport=%u,err=%d)\n", + vport_num, err); + return err; + } + + MLX5_SET(vhca_trust_level, in, vhca_id, vhca_id); + MLX5_SET(vhca_trust_level, in, trust_level, trusted); + return mlx5_core_access_reg(esw->dev, in, sz, out, sz, MLX5_REG_TRUST_LEVEL, 0, 1); +} + +int mlx5_eswitch_get_vport_mac(struct mlx5_eswitch *esw, + u16 vport, u8 *mac) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + + if (IS_ERR(evport)) + return PTR_ERR(evport); + + mutex_lock(&esw->state_lock); + ether_addr_copy(mac, evport->info.mac); + mutex_unlock(&esw->state_lock); + return 0; +} + +static int mlx5_eswitch_update_vport_trunk(struct mlx5_eswitch *esw, + struct mlx5_vport *evport, + unsigned long *old_trunk) { + DECLARE_BITMAP(diff_vlan_bm, VLAN_N_VID); + int err = 0; + + bitmap_xor(diff_vlan_bm, old_trunk, + evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID); + if (!bitmap_weight(diff_vlan_bm, VLAN_N_VID)) + return err; + + esw_update_acl_trunk_bitmap(esw, evport->vport); + if (evport->enabled && esw->mode == MLX5_ESWITCH_OFFLOADS) { + err = esw_acl_egress_lgcy_setup(esw, evport); + if (!err) + err = esw_acl_ingress_lgcy_setup(esw, evport); + } + if (err) { + bitmap_copy(evport->info.vlan_trunk_8021q_bitmap, old_trunk, + VLAN_N_VID); + esw_update_acl_trunk_bitmap(esw, evport->vport); + esw_acl_egress_lgcy_setup(esw, evport); + esw_acl_ingress_lgcy_setup(esw, evport); + } + + return err; +} + +int mlx5_eswitch_add_vport_trunk_range(struct mlx5_eswitch *esw, + int vport, u16 start_vlan, u16 end_vlan) +{ + DECLARE_BITMAP(prev_vport_bitmap, VLAN_N_VID); + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + int err = 0; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + if (IS_ERR(evport)) + return PTR_ERR(evport); + + if (end_vlan > VLAN_N_VID || start_vlan > end_vlan) + return -EINVAL; + + mutex_lock(&esw->state_lock); + + 
if (evport->info.vlan || evport->info.qos) { + err = -EPERM; + mlx5_core_warn(esw->dev, + "VGT+ is not allowed when operating in VST mode vport(%d)\n", + vport); + goto unlock; + } + + bitmap_copy(prev_vport_bitmap, evport->info.vlan_trunk_8021q_bitmap, + VLAN_N_VID); + bitmap_set(evport->info.vlan_trunk_8021q_bitmap, start_vlan, + end_vlan - start_vlan + 1); + err = mlx5_eswitch_update_vport_trunk(esw, evport, prev_vport_bitmap); + +unlock: + mutex_unlock(&esw->state_lock); + + return err; +} + +int mlx5_eswitch_del_vport_trunk_range(struct mlx5_eswitch *esw, + int vport, u16 start_vlan, u16 end_vlan) +{ + DECLARE_BITMAP(prev_vport_bitmap, VLAN_N_VID); + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + int err = 0; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + if (IS_ERR(evport)) + return PTR_ERR(evport); + + if (end_vlan > VLAN_N_VID || start_vlan > end_vlan) + return -EINVAL; + + mutex_lock(&esw->state_lock); + bitmap_copy(prev_vport_bitmap, evport->info.vlan_trunk_8021q_bitmap, + VLAN_N_VID); + bitmap_clear(evport->info.vlan_trunk_8021q_bitmap, start_vlan, + end_vlan - start_vlan + 1); + err = mlx5_eswitch_update_vport_trunk(esw, evport, prev_vport_bitmap); + mutex_unlock(&esw->state_lock); + + return err; +} + +static bool +is_port_function_supported(struct mlx5_eswitch *esw, u16 vport_num) +{ + return vport_num == MLX5_VPORT_PF || + mlx5_eswitch_is_vf_vport(esw, vport_num) || + mlx5_esw_is_sf_vport(esw, vport_num); +} + +int mlx5_devlink_port_function_trust_set(struct devlink *devlink, + struct devlink_port *port, + bool trust, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw; + struct mlx5_vport *vport; + u16 vport_num; + int err; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + if (!mlx5_core_is_ecpf(esw->dev)) + return -EOPNOTSUPP; + + vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index); + if (!is_port_function_supported(esw, vport_num)) + return -EOPNOTSUPP; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid port"); + return PTR_ERR(vport); + } + + err = mlx5_esw_set_hca_trusted(esw, vport_num, trust); + if (!err) + vport->info.offloads_trusted = trust; + + return err; +} + +int +mlx5_esw_get_hca_trusted(struct mlx5_eswitch *esw, + u16 vport_num, + bool *trusted) +{ + u32 out[MLX5_ST_SZ_DW(vhca_trust_level)] = {0}; + u32 in[MLX5_ST_SZ_DW(vhca_trust_level)] = {0}; + int sz = MLX5_ST_SZ_BYTES(vhca_trust_level); + int trust_level; + u16 vhca_id; + int err; + + if (!MLX5_CAP_GEN(esw->dev, vhca_trust_level_reg)) + return -EOPNOTSUPP; + + err = mlx5_esw_query_vport_vhca_id(esw, vport_num, &vhca_id); + if (err) { + esw_warn(esw->dev, "Query of vhca_id for vport %d failed, err %d\n", + vport_num, err); + return err; + } + + MLX5_SET(vhca_trust_level, in, vhca_id, vhca_id); + mlx5_core_access_reg(esw->dev, in, sz, out, sz, MLX5_REG_TRUST_LEVEL, 0, 0); + trust_level = MLX5_GET(vhca_trust_level, out, trust_level); + *trusted = trust_level & 0x1; + + return 0; +} + +int mlx5_devlink_port_function_trust_get(struct devlink *devlink, + struct devlink_port *port, + bool *trusted, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw; + struct mlx5_vport *vport; + u16 vport_num; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + if (!mlx5_core_is_ecpf(esw->dev)) + return -EOPNOTSUPP; + + vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index); + if (!is_port_function_supported(esw, 
vport_num)) + return -EOPNOTSUPP; + + vport = mlx5_eswitch_get_vport(esw, vport_num); + if (IS_ERR(vport)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid port"); + return PTR_ERR(vport); + } + + *trusted = vport->info.offloads_trusted; + + return 0; +} + +int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, + u16 vport, int link_state) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + int opmod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; + int other_vport = 1; + int err = 0; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + if (IS_ERR(evport)) + return PTR_ERR(evport); + + if (vport == MLX5_VPORT_UPLINK) { + opmod = MLX5_VPORT_STATE_OP_MOD_UPLINK; + other_vport = 0; + vport = 0; + } + mutex_lock(&esw->state_lock); + if (esw->mode != MLX5_ESWITCH_LEGACY) { + err = -EOPNOTSUPP; + goto unlock; + } + + err = mlx5_modify_vport_admin_state(esw->dev, opmod, vport, other_vport, link_state); + if (err) { + mlx5_core_warn(esw->dev, "Failed to set vport %d link state, opmod = %d, err = %d", + vport, opmod, err); + goto unlock; + } + + evport->info.link_state = link_state; + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw, + u16 vport, struct ifla_vf_info *ivi) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + + if (IS_ERR(evport)) + return PTR_ERR(evport); + + memset(ivi, 0, sizeof(*ivi)); + ivi->vf = vport - 1; + + mutex_lock(&esw->state_lock); + ether_addr_copy(ivi->mac, evport->info.mac); + ivi->linkstate = evport->info.link_state; + ivi->vlan = evport->info.vlan; + ivi->qos = evport->info.qos; + ivi->vlan_proto = evport->info.vlan_proto; + ivi->spoofchk = evport->info.spoofchk; + ivi->trusted = evport->info.trusted; + if (evport->qos.enabled) { + ivi->min_tx_rate = evport->qos.min_rate; + ivi->max_tx_rate = evport->qos.max_rate; + } + mutex_unlock(&esw->state_lock); + + return 0; +} + +int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, int vport, u16 vlan, + u8 qos, __be16 proto, u8 set_flags) +{ + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, vport); + enum esw_vst_mode vst_mode; + int err = 0; + + if (IS_ERR(evport)) + return PTR_ERR(evport); + if (vlan > 4095 || qos > 7) + return -EINVAL; + if (proto != htons(ETH_P_8021Q) && proto != htons(ETH_P_8021AD)) + return -EINVAL; + + vst_mode = esw_get_vst_mode(esw); + if (proto == htons(ETH_P_8021AD) && (vst_mode != ESW_VST_MODE_STEERING)) + return -EPROTONOSUPPORT; + + if (bitmap_weight(evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID)) { + err = -EPERM; + mlx5_core_warn(esw->dev, + "VST is not allowed when operating in VGT+ mode vport(%d)\n", + vport); + return err; + } + + if (vst_mode != ESW_VST_MODE_STEERING) { + err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set_flags, vst_mode); + if (err) + return err; + } + + evport->info.vlan = vlan; + evport->info.qos = qos; + evport->info.vlan_proto = proto; + if (evport->enabled && esw->mode == MLX5_ESWITCH_LEGACY) { + err = esw_acl_ingress_lgcy_setup(esw, evport); + if (err) + return err; + err = esw_acl_egress_lgcy_setup(esw, evport); + } + + return err; +} + +int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, + u16 vport_num, + struct ifla_vf_stats *vf_stats) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {}; + struct mlx5_vport_drop_stats stats = {}; + int err = 0; + u32 *out; + + if (IS_ERR(vport)) + return PTR_ERR(vport); + 
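+	/* QUERY_VPORT_COUNTER returns separate Ethernet and IB
+	 * unicast/multicast/broadcast counters; they are summed below into
+	 * the generic ifla_vf_stats fields, and per-vport drop statistics
+	 * are appended via mlx5_esw_query_vport_drop_stats().
+	 */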
+ out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + MLX5_SET(query_vport_counter_in, in, op_mod, 0); + MLX5_SET(query_vport_counter_in, in, vport_number, vport->vport); + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + + err = mlx5_cmd_exec_inout(esw->dev, query_vport_counter, in, out); + if (err) + goto free_out; + + #define MLX5_GET_CTR(p, x) \ + MLX5_GET64(query_vport_counter_out, p, x) + + memset(vf_stats, 0, sizeof(*vf_stats)); + vf_stats->rx_packets = + MLX5_GET_CTR(out, received_eth_unicast.packets) + + MLX5_GET_CTR(out, received_ib_unicast.packets) + + MLX5_GET_CTR(out, received_eth_multicast.packets) + + MLX5_GET_CTR(out, received_ib_multicast.packets) + + MLX5_GET_CTR(out, received_eth_broadcast.packets); + + vf_stats->rx_bytes = + MLX5_GET_CTR(out, received_eth_unicast.octets) + + MLX5_GET_CTR(out, received_ib_unicast.octets) + + MLX5_GET_CTR(out, received_eth_multicast.octets) + + MLX5_GET_CTR(out, received_ib_multicast.octets) + + MLX5_GET_CTR(out, received_eth_broadcast.octets); + + vf_stats->tx_packets = + MLX5_GET_CTR(out, transmitted_eth_unicast.packets) + + MLX5_GET_CTR(out, transmitted_ib_unicast.packets) + + MLX5_GET_CTR(out, transmitted_eth_multicast.packets) + + MLX5_GET_CTR(out, transmitted_ib_multicast.packets) + + MLX5_GET_CTR(out, transmitted_eth_broadcast.packets); + + vf_stats->tx_bytes = + MLX5_GET_CTR(out, transmitted_eth_unicast.octets) + + MLX5_GET_CTR(out, transmitted_ib_unicast.octets) + + MLX5_GET_CTR(out, transmitted_eth_multicast.octets) + + MLX5_GET_CTR(out, transmitted_ib_multicast.octets) + + MLX5_GET_CTR(out, transmitted_eth_broadcast.octets); + + vf_stats->multicast = + MLX5_GET_CTR(out, received_eth_multicast.packets) + + MLX5_GET_CTR(out, received_ib_multicast.packets); + + vf_stats->broadcast = + MLX5_GET_CTR(out, received_eth_broadcast.packets); + + err = mlx5_esw_query_vport_drop_stats(esw->dev, vport, &stats); + if (err) + goto free_out; + vf_stats->rx_dropped = stats.rx_dropped; + vf_stats->tx_dropped = stats.tx_dropped; + +free_out: + kvfree(out); + return err; +} + +int mlx5_eswitch_get_vport_stats_backport(struct mlx5_eswitch *esw, + int vport, + struct ifla_vf_stats_backport *vf_stats_backport) +{ + int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); + u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0}; + int err = 0; + u32 *out; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + MLX5_SET(query_vport_counter_in, in, op_mod, 0); + MLX5_SET(query_vport_counter_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + + memset(out, 0, outlen); + err = mlx5_cmd_exec(esw->dev, in, sizeof(in), out, outlen); + if (err) + goto free_out; + + #define MLX5_GET_CTR(p, x) \ + MLX5_GET64(query_vport_counter_out, p, x) + + memset(vf_stats_backport, 0, sizeof(*vf_stats_backport)); + vf_stats_backport->tx_multicast = + MLX5_GET_CTR(out, transmitted_eth_multicast.packets) + + MLX5_GET_CTR(out, transmitted_ib_multicast.packets); + + vf_stats_backport->tx_broadcast = + MLX5_GET_CTR(out, transmitted_eth_broadcast.packets); + +free_out: + kvfree(out); + return err; +} + +u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + + return mlx5_esw_allowed(esw) ? 
esw->mode : MLX5_ESWITCH_LEGACY; +} +EXPORT_SYMBOL_GPL(mlx5_eswitch_mode); + +enum devlink_eswitch_encap_mode +mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev) +{ + struct mlx5_eswitch *esw; + + esw = dev->priv.eswitch; + return (mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS) ? esw->offloads.encap : + DEVLINK_ESWITCH_ENCAP_MODE_NONE; +} +EXPORT_SYMBOL(mlx5_eswitch_get_encap_mode); + +bool mlx5_eswitch_is_manager_vport(const struct mlx5_eswitch *esw, u16 vport_num) +{ + return mlx5_esw_allowed(esw) ? is_esw_manager_vport(esw, vport_num) : false; +} +EXPORT_SYMBOL_GPL(mlx5_eswitch_is_manager_vport); + +bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0, + struct mlx5_core_dev *dev1) +{ + return (dev0->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS && + dev1->priv.eswitch->mode == MLX5_ESWITCH_OFFLOADS); +} + +int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&esw->n_head, nb); +} + +void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&esw->n_head, nb); +} + +/** + * mlx5_esw_hold() - Try to take a read lock on esw mode lock. + * @mdev: mlx5 core device. + * + * Should be called by esw resources callers. + * + * Return: true on success or false. + */ +bool mlx5_esw_hold(struct mlx5_core_dev *mdev) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + /* e.g. VF doesn't have eswitch so nothing to do */ + if (!mlx5_esw_allowed(esw)) + return true; + + if (down_read_trylock(&esw->mode_lock) != 0) + return true; + + return false; +} + +/** + * mlx5_esw_release() - Release a read lock on esw mode lock. + * @mdev: mlx5 core device. + */ +void mlx5_esw_release(struct mlx5_core_dev *mdev) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + if (mlx5_esw_allowed(esw)) + up_read(&esw->mode_lock); +} + +/** + * mlx5_esw_get() - Increase esw user count. + * @mdev: mlx5 core device. + */ +void mlx5_esw_get(struct mlx5_core_dev *mdev) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + if (mlx5_esw_allowed(esw)) + atomic64_inc(&esw->user_count); +} + +/** + * mlx5_esw_put() - Decrease esw user count. + * @mdev: mlx5 core device. + */ +void mlx5_esw_put(struct mlx5_core_dev *mdev) +{ + struct mlx5_eswitch *esw = mdev->priv.eswitch; + + if (mlx5_esw_allowed(esw)) + atomic64_dec_if_positive(&esw->user_count); +} + +/** + * mlx5_esw_try_lock() - Take a write lock on esw mode lock. + * @esw: eswitch device. + * + * Should be called by esw mode change routine. + * + * Return: + * * 0 - esw mode if successfully locked and refcount is 0. + * * -EBUSY - refcount is not 0. + * * -EINVAL - In the middle of switching mode or lock is already held. + */ +int mlx5_esw_try_lock(struct mlx5_eswitch *esw) +{ + if (down_write_trylock(&esw->mode_lock) == 0) + return -EINVAL; + + if (atomic64_read(&esw->user_count) > 0) { + up_write(&esw->mode_lock); + return -EBUSY; + } + + return esw->mode; +} + +/** + * mlx5_esw_unlock() - Release write lock on esw mode lock + * @esw: eswitch device. + */ +void mlx5_esw_unlock(struct mlx5_eswitch *esw) +{ + if (!mlx5_esw_allowed(esw)) + return; + up_write(&esw->mode_lock); +} + +/** + * mlx5_eswitch_get_total_vports - Get total vports of the eswitch + * + * @dev: Pointer to core device + * + * mlx5_eswitch_get_total_vports returns total number of eswitch vports. 
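+ * It returns 0 when the device has no managed eswitch (for example on a
+ * VF, where mlx5_esw_allowed() is false).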
+ */ +u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) +{ + struct mlx5_eswitch *esw; + + esw = dev->priv.eswitch; + return mlx5_esw_allowed(esw) ? esw->total_vports : 0; +} +EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); + +/** + * mlx5_eswitch_get_core_dev - Get the mdev device + * @esw : eswitch device. + * + * Return the mellanox core device which manages the eswitch. + */ +struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw) +{ + return mlx5_esw_allowed(esw) ? esw->dev : NULL; +} +EXPORT_SYMBOL(mlx5_eswitch_get_core_dev); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h new file mode 100644 index 0000000..db76977 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -0,0 +1,1016 @@ +/* + * + * + * Copyright (c) 2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_ESWITCH_H__ +#define __MLX5_ESWITCH_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lib/mpfs.h" +#include "mlx5_core.h" +#include "lib/fs_chains.h" +#include "sf/sf.h" +#include "en/tc_ct.h" +#include "en/tc/sample.h" + +enum mlx5_mapped_obj_type { + MLX5_MAPPED_OBJ_CHAIN, + MLX5_MAPPED_OBJ_SAMPLE, + MLX5_MAPPED_OBJ_INT_PORT_METADATA, +}; + +struct mlx5_mapped_obj { + enum mlx5_mapped_obj_type type; + union { + u32 chain; + struct { + u32 group_id; + u32 rate; + u32 trunc_size; + u32 tunnel_id; + } sample; + u32 int_port_metadata; + }; +}; + +#ifdef CONFIG_MLX5_ESWITCH + +#define ESW_OFFLOADS_DEFAULT_NUM_GROUPS 15 + +#define MLX5_MAX_UC_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_current_uc_list)) + +#define MLX5_MAX_MC_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_current_mc_list)) + +#define MLX5_MAX_VLAN_PER_VPORT(dev) \ + (1 << MLX5_CAP_GEN(dev, log_max_vlan_list)) + +#define mlx5_esw_has_fwd_fdb(dev) \ + MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table) + +#define esw_chains(esw) \ + ((esw)->fdb_table.offloads.esw_chains_priv) + +#define VHCA_VPORT_MATCH_ID_BITS 16 +#define VHCA_VPORT_MATCH_ID_SIZE BIT(VHCA_VPORT_MATCH_ID_BITS) + +enum { + MAPPING_TYPE_CHAIN, + MAPPING_TYPE_TUNNEL, + MAPPING_TYPE_TUNNEL_ENC_OPTS, + MAPPING_TYPE_LABELS, + MAPPING_TYPE_ZONE, + MAPPING_TYPE_INT_PORT, +}; + +struct vport_meter { + u64 rate; + u64 burst; + struct mlx5e_flow_meter_handle *meter_hndl; + struct mlx5_flow_table *meter_tbl; + struct mlx5_flow_group *meter_grp; + struct mlx5_flow_handle *meter_rule; + struct mlx5_flow_table *color_tbl; + struct mlx5_flow_group *color_grp; + struct mlx5_flow_handle *fwd_green_rule; + struct mlx5_flow_handle *drop_red_rule; + struct mlx5_fc *drop_counter; + u64 packets_dropped; + u64 bytes_dropped; +}; + +struct mlx5_esw_ipsec_priv; + +struct vport_ingress { + struct mlx5_flow_table *acl; + struct mlx5_flow_handle *allow_rule; + struct { + struct mlx5_flow_group *allow_tagged_spoofchk_grp; + struct mlx5_flow_group *allow_untagged_spoofchk_grp; + struct mlx5_flow_group *drop_grp; + struct mlx5_flow_handle *drop_rule; + struct mlx5_flow_handle *allow_untagged_rule; + struct list_head allow_vlans_rules; + struct mlx5_fc *drop_counter; + } legacy; + struct { + struct mutex vf_meter_lock; /* protect vf meter operations */ + struct vport_meter *meter_xps[2]; /* bps: 0, pps: 1 */ + /* Optional group to add an FTE to do internal priority + * tagging on ingress packets. + */ + struct mlx5_flow_group *metadata_prio_tag_grp; + /* Group to add default match-all FTE entry to tag ingress + * packet with metadata. 
+ */ + struct mlx5_flow_group *metadata_allmatch_grp; + /* Optional group to add a drop all rule */ + struct mlx5_flow_group *drop_grp; + struct mlx5_modify_hdr *modify_metadata; + struct mlx5_flow_handle *modify_metadata_rule; + struct mlx5_esw_ipsec_priv *esw_ipsec_priv; + struct mlx5_flow_handle *drop_rule; + } offloads; +}; + +struct vport_egress { + struct mlx5_flow_table *acl; + struct mlx5_flow_handle *allowed_vlan; + struct mlx5_flow_group *vlan_grp; + struct { + struct mlx5_flow_group *allow_untagged_grp; + struct mlx5_flow_group *drop_grp; + struct mlx5_flow_handle *drop_rule; + struct mlx5_flow_handle *allow_untagged_rule; + struct list_head allow_vlans_rules; + struct mlx5_fc *drop_counter; + } legacy; + struct { + struct mutex vf_meter_lock; /* protect vf meter operations */ + struct vport_meter *meter_xps[2]; /* bps: 0, pps: 1 */ + struct mlx5_flow_handle *fwd_rule; + struct mlx5_flow_group *fwd_grp; + struct mlx5_flow_handle *bounce_rule; + struct mlx5_flow_group *bounce_grp; + } offloads; +}; + +struct mlx5_vport_drop_stats { + u64 rx_dropped; + u64 tx_dropped; +}; + +struct mlx5_vport_info { + u8 mac[ETH_ALEN]; + u16 vlan; + u64 node_guid; + int link_state; + u8 qos; + __be16 vlan_proto; + u8 spoofchk: 1; + u8 trusted: 1; + u8 offloads_trusted: 1; + u8 roce: 1; + /* the admin approved vlan list */ + DECLARE_BITMAP(vlan_trunk_8021q_bitmap, VLAN_N_VID); + u32 group; +}; + +/* Vport context events */ +enum mlx5_eswitch_vport_event { + MLX5_VPORT_UC_ADDR_CHANGE = BIT(0), + MLX5_VPORT_MC_ADDR_CHANGE = BIT(1), + MLX5_VPORT_VLAN_CHANGE = BIT(1), + MLX5_VPORT_PROMISC_CHANGE = BIT(3), +}; + +struct mlx5_vport { + struct mlx5_core_dev *dev; + struct hlist_head uc_list[MLX5_L2_ADDR_HASH_SIZE]; + struct hlist_head mc_list[MLX5_L2_ADDR_HASH_SIZE]; + /* The requested vlan list from the vport side */ + DECLARE_BITMAP(req_vlan_bitmap, VLAN_N_VID); + /* Actual accepted vlans on the acl tables */ + DECLARE_BITMAP(acl_vlan_8021q_bitmap, VLAN_N_VID); + struct mlx5_flow_handle *promisc_rule; + struct mlx5_flow_handle *allmulti_rule; + struct work_struct vport_change_handler; + + struct vport_ingress ingress; + struct vport_egress egress; + u32 default_metadata; + u32 metadata; + + struct mlx5_vport_info info; + + struct { + bool enabled; + u32 esw_tsar_ix; + u32 bw_share; + u32 min_rate; + u32 max_rate; + struct mlx5_esw_rate_group *group; + } qos; + + u16 vport; + bool enabled; + enum mlx5_eswitch_vport_event enabled_events; + int index; + struct devlink_port *dl_port; + struct dentry *dbgfs; + u16 match_id; + u32 fw_pages; + u32 page_limit; + spinlock_t pg_counters_lock; /* protects page counter and limit */ +}; + +struct mlx5_esw_indir_table; + +struct mlx5_eswitch_fdb { + union { + struct legacy_fdb { + struct mlx5_flow_table *fdb; + struct mlx5_flow_group *addr_grp; + struct mlx5_flow_group *allmulti_grp; + struct mlx5_flow_group *promisc_grp; + struct mlx5_flow_table *vepa_fdb; + struct mlx5_flow_handle *vepa_uplink_rule; + struct mlx5_flow_handle *vepa_star_rule; + } legacy; + + struct offloads_fdb { + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *tc_miss_table; + struct mlx5_flow_table *slow_fdb; + struct mlx5_flow_group *send_to_vport_grp; + struct mlx5_flow_group *send_to_vport_meta_grp; + struct mlx5_flow_group *peer_miss_grp; + struct mlx5_flow_handle **peer_miss_rules; + struct mlx5_flow_group *miss_grp; + struct mlx5_flow_handle **send_to_vport_meta_rules; + struct mlx5_flow_handle *miss_rule_uni; + struct mlx5_flow_handle *miss_rule_multi; + struct mlx5_flow_table 
*miss_meter_fdb; + struct mlx5_flow_group *miss_meter_grp; + struct mlx5_flow_table *post_miss_meter_fdb; + struct mlx5_flow_group *post_miss_meter_grp; + int vlan_push_pop_refcount; + + struct mlx5_fs_chains *esw_chains_priv; + struct { + DECLARE_HASHTABLE(table, 8); + /* Protects vports.table */ + struct mutex lock; + } vports; + struct mlx5_esw_ipsec_priv *esw_ipsec_priv; + + struct mlx5_esw_indir_table *indir; + + } offloads; + }; + u32 flags; +}; + +struct mlx5_acl_vlan { + struct mlx5_flow_handle *acl_vlan_rule; + struct list_head list; +}; + +struct mlx5_pet_actions { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct mlx5_modify_hdr *hdr; + struct mlx5_flow_handle *rule; + struct mlx5_pkt_reformat *pkt_reformat; +}; + +struct mlx5_vport_match_actions { + struct mlx5_pet_actions push_pet_hdr; + struct mlx5_pet_actions copy_data_to_pet_hdr; +}; + +struct mlx5_pet_info { + u16 ether_type; + bool enabled; +}; + +struct mlx5_esw_offload { + struct mlx5_flow_table *ft_offloads_restore; + struct mlx5_flow_group *restore_group; + struct mlx5_modify_hdr *restore_copy_hdr_id; + struct mapping_ctx *reg_c0_obj_pool; + + struct mlx5_flow_table *ft_offloads; + struct mlx5_flow_group *vport_rx_group; + struct xarray vport_reps; + struct mlx5_flow_group *vport_rx_drop_group; + struct mlx5_flow_handle *vport_rx_drop_rule; + struct list_head peer_flows; + struct mutex peer_mutex; + struct mutex encap_tbl_lock; /* protects encap_tbl */ + DECLARE_HASHTABLE(encap_tbl, 8); + struct mutex decap_tbl_lock; /* protects decap_tbl */ + DECLARE_HASHTABLE(decap_tbl, 8); + struct mod_hdr_tbl mod_hdr; + DECLARE_HASHTABLE(termtbl_tbl, 8); + struct mutex termtbl_mutex; /* protects termtbl hash */ + struct xarray vhca_map; + const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES]; + struct mlx5_vport_match_actions pet_vport_action; + struct mlx5_pet_info pet_info; + u8 inline_mode; + atomic64_t num_flows; + enum devlink_eswitch_encap_mode encap; + enum devlink_eswitch_ipsec_mode ipsec; + struct ida vport_metadata_ida; + unsigned int host_number; /* ECPF supports one external host */ +}; + +/* E-Switch MC FDB table hash node */ +struct esw_mc_addr { /* SRIOV only */ + struct l2addr_node node; + struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */ + u32 refcnt; +}; + +struct mlx5_host_work { + struct work_struct work; + struct mlx5_eswitch *esw; +}; + +struct mlx5_esw_functions { + struct mlx5_nb nb; + u16 num_vfs; + bool host_funcs_disabled; +}; + +enum { + MLX5_ESWITCH_VPORT_MATCH_METADATA = BIT(0), + MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED = BIT(1), + MLX5_ESWITCH_VPORT_ACL_NS_CREATED = BIT(2), + MLX5_ESWITCH_PET_INSERT = BIT(3), +}; + +struct mlx5_esw_bridge_offloads; + +struct mlx5_smart_nic_vport { + struct mlx5_eswitch *esw; + struct kobject kobj; + int vport; +}; + +struct mlx5_smart_nic_sysfs { + struct kobject *kobj; + struct mlx5_smart_nic_vport *vport; +}; + +struct mlx5_esw_bridge_offloads; + +enum { + MLX5_ESW_FDB_CREATED = BIT(0), +}; + +struct mlx5_eswitch { + struct mlx5_core_dev *dev; + struct mlx5_nb nb; + struct mlx5_eswitch_fdb fdb_table; + /* legacy data structures */ + struct hlist_head mc_table[MLX5_L2_ADDR_HASH_SIZE]; + struct esw_mc_addr mc_promisc; + /* end of legacy */ + struct workqueue_struct *work_queue; + struct xarray vports; + u32 flags; + int total_vports; + int enabled_vports; + /* Synchronize between vport change events + * and async SRIOV admin state changes + */ + struct mutex state_lock; + + /* Protects eswitch mode change that occurs via one or 
more + * user commands, i.e. sriov state change, devlink commands. + */ + struct rw_semaphore mode_lock; + atomic64_t user_count; + + struct { + u32 root_tsar_ix; + struct mlx5_esw_rate_group *group0; + struct list_head groups; /* Protected by esw->state_lock */ + + /* Protected by esw->state_lock. + * Initially 0, meaning no QoS users and QoS is disabled. + */ + refcount_t refcnt; + bool enabled; + } qos; + + struct mlx5_esw_bridge_offloads *br_offloads; + struct mlx5_esw_offload offloads; + int mode; + u16 manager_vport; + u16 first_host_vport; + struct { + u32 large_group_num; + } params; + struct mlx5_esw_functions esw_funcs; + struct mlx5_smart_nic_sysfs smart_nic_sysfs; + struct blocking_notifier_head n_head; + struct dentry *dbgfs; + struct lock_class_key mode_lock_key; +}; + +void esw_offloads_disable(struct mlx5_eswitch *esw); +int esw_offloads_enable(struct mlx5_eswitch *esw); +void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw); +int esw_offloads_init_reps(struct mlx5_eswitch *esw); + +struct mlx5_flow_handle * +mlx5_eswitch_add_send_to_vport_meta_rule(struct mlx5_eswitch *esw, u16 vport_num); +void mlx5_eswitch_del_send_to_vport_meta_rule(struct mlx5_flow_handle *rule); + +int mlx5_esw_offloads_pet_insert_set(struct mlx5_eswitch *esw, bool enable); +bool mlx5e_esw_offloads_pet_supported(const struct mlx5_eswitch *esw); + +bool mlx5_esw_vport_match_metadata_supported(const struct mlx5_eswitch *esw); +int mlx5_esw_offloads_vport_metadata_set(struct mlx5_eswitch *esw, bool enable); +u32 mlx5_esw_match_metadata_alloc(struct mlx5_eswitch *esw); +void mlx5_esw_match_metadata_free(struct mlx5_eswitch *esw, u32 metadata); + +int mlx5_esw_qos_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps); + +/* E-Switch API */ +int mlx5_eswitch_init(struct mlx5_core_dev *dev); +void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw); + +#define MLX5_ESWITCH_IGNORE_NUM_VFS (-1) +int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int num_vfs); +int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs); +void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf); +void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw); +void mlx5_eswitch_disable(struct mlx5_eswitch *esw); +int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw, + u16 vport, const u8 *mac); +int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, + u16 vport, int link_state); +int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, + int vport, u16 vlan, u8 qos, __be16 vlan_proto); +int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw, + u16 vport, bool spoofchk); +int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw, + u16 vport_num, bool setting); +int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, u16 vport, + u32 max_rate, u32 min_rate); +int mlx5_esw_qos_vport_update_group(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, + struct mlx5_esw_rate_group *group, + struct netlink_ext_ack *extack); +int mlx5_eswitch_set_vepa(struct mlx5_eswitch *esw, u8 setting); +int mlx5_eswitch_get_vepa(struct mlx5_eswitch *esw, u8 *setting); +int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw, + u16 vport, struct ifla_vf_info *ivi); +int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw, + u16 vport, + struct ifla_vf_stats *vf_stats); +int mlx5_eswitch_get_vport_mac(struct mlx5_eswitch *esw, + u16 vport, u8 *mac); +int mlx5_eswitch_vport_update_group(struct mlx5_eswitch *esw, int vport_num, + u32 group_id, const char *group_name); + +struct 
ifla_vf_stats_backport { + __u64 tx_broadcast; + __u64 tx_multicast; +}; + +int mlx5_eswitch_get_vport_stats_backport(struct mlx5_eswitch *esw, + int vport, + struct ifla_vf_stats_backport *vf_stats_backport); +void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule); +int mlx5_eswitch_add_vport_trunk_range(struct mlx5_eswitch *esw, + int vport, u16 start_vlan, u16 end_vlan); +int mlx5_eswitch_del_vport_trunk_range(struct mlx5_eswitch *esw, + int vport, u16 start_vlan, u16 end_vlan); + +int mlx5_eswitch_modify_esw_vport_context(struct mlx5_core_dev *dev, u16 vport, + bool other_vport, void *in); + +struct mlx5_flow_spec; +struct mlx5_esw_flow_attr; +struct mlx5_termtbl_handle; + +bool +mlx5_eswitch_termtbl_required(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_spec *spec); + +struct mlx5_flow_handle * +mlx5_eswitch_add_termtbl_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec, + struct mlx5_esw_flow_attr *attr, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int num_dest); + +void +mlx5_eswitch_termtbl_put(struct mlx5_eswitch *esw, + struct mlx5_termtbl_handle *tt); + +void +mlx5_eswitch_set_rule_source_port(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, + struct mlx5_eswitch *src_esw, + u16 vport); + +void +mlx5_eswitch_clear_rule_source_port(struct mlx5_eswitch *esw, struct mlx5_flow_spec *spec); + +struct mlx5_flow_handle * +mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); +struct mlx5_flow_handle * +mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr); +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr); +void +mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr); + +struct mlx5_flow_handle * +mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, u16 vport, + struct mlx5_flow_destination *dest); + +enum { + SET_VLAN_STRIP = BIT(0), + SET_VLAN_INSERT = BIT(1) +}; + +enum mlx5_flow_match_level { + MLX5_MATCH_NONE = MLX5_INLINE_MODE_NONE, + MLX5_MATCH_L2 = MLX5_INLINE_MODE_L2, + MLX5_MATCH_L3 = MLX5_INLINE_MODE_IP, + MLX5_MATCH_L4 = MLX5_INLINE_MODE_TCP_UDP, +}; + +/* current maximum for flow based vport multicasting */ +#define MLX5_MAX_FLOW_FWD_VPORTS 32 + +enum { + MLX5_ESW_DEST_ENCAP = BIT(0), + MLX5_ESW_DEST_ENCAP_VALID = BIT(1), + MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE = BIT(2), +}; + +struct mlx5_esw_flow_attr { + struct mlx5_eswitch_rep *in_rep; + struct mlx5_core_dev *in_mdev; + struct mlx5_core_dev *counter_dev; + struct mlx5e_tc_int_port *dest_int_port; + struct mlx5e_tc_int_port *int_port; + + int split_count; + int out_count; + + __be16 vlan_proto[MLX5_FS_VLAN_DEPTH]; + u16 vlan_vid[MLX5_FS_VLAN_DEPTH]; + u8 vlan_prio[MLX5_FS_VLAN_DEPTH]; + u8 total_vlan; + struct { + u32 flags; + struct mlx5_eswitch_rep *rep; + struct mlx5_pkt_reformat *pkt_reformat; + struct mlx5_core_dev *mdev; + struct mlx5_termtbl_handle *termtbl; + int src_port_rewrite_act_id; + } dests[MLX5_MAX_FLOW_FWD_VPORTS]; + struct mlx5_rx_tun_attr *rx_tun_attr; + bool is_tunnel_flow; + struct mlx5_pkt_reformat *decap_pkt_reformat; +}; + +int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack); +int 
mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode); +int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, + struct netlink_ext_ack *extack); +int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode); + +int mlx5_eswitch_vport_modify_other_hca_cap_roce(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, bool value); +int mlx5_eswitch_vport_get_other_hca_cap_roce(struct mlx5_eswitch *esw, + struct mlx5_vport *vport, bool *value); + +int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, + enum devlink_eswitch_encap_mode encap, + struct netlink_ext_ack *extack); +int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, + enum devlink_eswitch_encap_mode *encap); +int mlx5_devlink_eswitch_ipsec_mode_set(struct devlink *devlink, + enum devlink_eswitch_ipsec_mode ipsec, + struct netlink_ext_ack *extack); +int mlx5_devlink_eswitch_ipsec_mode_get(struct devlink *devlink, + enum devlink_eswitch_ipsec_mode *ipsec); +int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port, + u8 *hw_addr, int *hw_addr_len, + struct netlink_ext_ack *extack); +int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port, + const u8 *hw_addr, int hw_addr_len, + struct netlink_ext_ack *extack); +int mlx5_devlink_rate_node_tx_max_set(struct devlink *devlink, + const char *group, u64 tx_max, + struct netlink_ext_ack *extack); +int mlx5_devlink_rate_node_tx_share_set(struct devlink *devlink, + const char *group, u64 tx_share, + struct netlink_ext_ack *extack); +int mlx5_devlink_rate_node_new(struct devlink *devlink, const char *group, + struct netlink_ext_ack *extack); +int mlx5_devlink_rate_node_del(struct devlink *devlink, const char *group, + struct netlink_ext_ack *extack); +void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type); +int mlx5_esw_query_vport_vhca_id(struct mlx5_eswitch *esw, + u16 vport_num, + u16 *vhca_id); + +int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr); +int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr); +int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw, int vport, + u16 vlan, u8 qos, __be16 proto, u8 set_flags); +int mlx5_devlink_port_function_trust_get(struct devlink *devlink, + struct devlink_port *port, + bool *trusted, + struct netlink_ext_ack *extack); +int mlx5_devlink_port_function_trust_set(struct devlink *devlink, + struct devlink_port *port, + bool trusted, + struct netlink_ext_ack *extack); +int mlx5_esw_get_hca_trusted(struct mlx5_eswitch *esw, + u16 vport_num, + bool *trusted); + +static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev, + u8 vlan_depth) +{ + bool ret = MLX5_CAP_ESW_FLOWTABLE_FDB(dev, pop_vlan) && + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, push_vlan); + + if (vlan_depth == 1) + return ret; + + return ret && MLX5_CAP_ESW_FLOWTABLE_FDB(dev, pop_vlan_2) && + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, push_vlan_2); +} + +enum esw_vst_mode { + ESW_VST_MODE_BASIC, + ESW_VST_MODE_STEERING, + ESW_VST_MODE_INSERT_ALWAYS, +}; + +static inline enum esw_vst_mode esw_get_vst_mode(struct mlx5_eswitch *esw) +{ + /* vst mode precedence: + * if vst steering mode is supported use it + * if not, look for vst vport insert always support + * if both not supported, we use basic vst, can't support QinQ + */ + if (MLX5_CAP_ESW_EGRESS_ACL(esw->dev, pop_vlan) && + MLX5_CAP_ESW_INGRESS_ACL(esw->dev, push_vlan)) + return ESW_VST_MODE_STEERING; + else if (MLX5_CAP_ESW(esw->dev, 
vport_cvlan_insert_always)) + return ESW_VST_MODE_INSERT_ALWAYS; + else + return ESW_VST_MODE_BASIC; +} + +bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0, + struct mlx5_core_dev *dev1); + +const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev); + +#define MLX5_DEBUG_ESWITCH_MASK BIT(3) + +#define esw_info(__dev, format, ...) \ + dev_info((__dev)->device, "E-Switch: " format, ##__VA_ARGS__) + +#define esw_warn(__dev, format, ...) \ + dev_warn((__dev)->device, "E-Switch: " format, ##__VA_ARGS__) + +#define esw_debug(dev, format, ...) \ + mlx5_core_dbg_mask(dev, MLX5_DEBUG_ESWITCH_MASK, format, ##__VA_ARGS__) + +static inline bool mlx5_esw_allowed(const struct mlx5_eswitch *esw) +{ + return esw && MLX5_ESWITCH_MANAGER(esw->dev); +} + +/* The returned number is valid only when the dev is eswitch manager. */ +static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev) +{ + return mlx5_core_is_ecpf_esw_manager(dev) ? + MLX5_VPORT_ECPF : MLX5_VPORT_PF; +} + +static inline bool +mlx5_esw_is_manager_vport(const struct mlx5_eswitch *esw, u16 vport_num) +{ + return MLX5_VPORT_MANAGER(esw->dev) && + esw->manager_vport == vport_num; +} + +static inline u16 mlx5_eswitch_first_host_vport_num(struct mlx5_core_dev *dev) +{ + return mlx5_core_is_ecpf_esw_manager(dev) ? + MLX5_VPORT_PF : MLX5_VPORT_FIRST_VF; +} + +static inline bool mlx5_eswitch_is_funcs_handler(const struct mlx5_core_dev *dev) +{ + return mlx5_core_is_ecpf_esw_manager(dev); +} + +/* SF vport numbers in device range from the esw_sf_base_id and log_max_esw_sf. + * Below helpers perform conversion from SF vport index in software array + * to vport number and vice versa. + */ +static inline u16 mlx5_eswitch_sf_vport_base_id(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, sf_base_id); +} + +static inline unsigned int +mlx5_esw_vport_to_devlink_port_index(const struct mlx5_core_dev *dev, + u16 vport_num) +{ + return (MLX5_CAP_GEN(dev, vhca_id) << 16) | vport_num; +} + +static inline u16 +mlx5_esw_devlink_port_index_to_vport_num(unsigned int dl_port_index) +{ + return dl_port_index & 0xffff; +} + +static inline bool mlx5_esw_is_fdb_created(struct mlx5_eswitch *esw) +{ + return esw->fdb_table.flags & MLX5_ESW_FDB_CREATED; +} + +/* TODO: This mlx5e_tc function shouldn't be called by eswitch */ +void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw); + +/* Each mark identifies eswitch vport type. + * MLX5_ESW_VPT_HOST_FN is used to identify both PF and VF ports using + * a single mark. + * MLX5_ESW_VPT_VF identifies a SRIOV VF vport. + * MLX5_ESW_VPT_SF identifies SF vport. + */ +#define MLX5_ESW_VPT_HOST_FN XA_MARK_0 +#define MLX5_ESW_VPT_VF XA_MARK_1 +#define MLX5_ESW_VPT_SF XA_MARK_2 + +/* The vport iterator is valid only after vport are initialized in mlx5_eswitch_init. + * Borrowed the idea from xa_for_each_marked() but with support for desired last element. 
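+ * The 'last' argument bounds the iteration, e.g.
+ * mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs)
+ * visits only the currently enabled VF vports.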
+ */ + +#define mlx5_esw_for_each_vport(esw, index, vport) \ + xa_for_each(&((esw)->vports), index, vport) + +#define mlx5_esw_for_each_entry_marked(xa, index, entry, last, filter) \ + for (index = 0, entry = xa_find(xa, &index, last, filter); \ + entry; entry = xa_find_after(xa, &index, last, filter)) + +#define mlx5_esw_for_each_vport_marked(esw, index, vport, last, filter) \ + mlx5_esw_for_each_entry_marked(&((esw)->vports), index, vport, last, filter) + +#define mlx5_esw_for_each_vf_vport(esw, index, vport, last) \ + mlx5_esw_for_each_vport_marked(esw, index, vport, last, MLX5_ESW_VPT_VF) + +#define mlx5_esw_for_each_host_func_vport(esw, index, vport, last) \ + mlx5_esw_for_each_vport_marked(esw, index, vport, last, MLX5_ESW_VPT_HOST_FN) + +struct mlx5_eswitch *mlx5_devlink_eswitch_get(struct devlink *devlink); +struct mlx5_vport *__must_check +mlx5_eswitch_get_vport(struct mlx5_eswitch *esw, u16 vport_num); + +bool mlx5_eswitch_is_vf_vport(struct mlx5_eswitch *esw, u16 vport_num); +bool mlx5_esw_is_sf_vport(struct mlx5_eswitch *esw, u16 vport_num); + +int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type, void *data); + +int +mlx5_eswitch_enable_pf_vf_vports(struct mlx5_eswitch *esw, + enum mlx5_eswitch_vport_event enabled_events); +void mlx5_eswitch_disable_pf_vf_vports(struct mlx5_eswitch *esw); + +int mlx5_esw_vport_enable(struct mlx5_eswitch *esw, u16 vport_num, + enum mlx5_eswitch_vport_event enabled_events); +void mlx5_esw_vport_disable(struct mlx5_eswitch *esw, u16 vport_num); + +int +esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw, + struct mlx5_vport *vport); +void +esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, + struct mlx5_vport *vport); + +struct esw_vport_tbl_namespace { + int max_fte; + int max_num_groups; + u32 flags; +}; + +struct mlx5_vport_tbl_attr { + u32 chain; + u16 prio; + u16 vport; + const struct esw_vport_tbl_namespace *vport_ns; +}; + +struct mlx5_flow_table * +mlx5_esw_vporttbl_get(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr); +void +mlx5_esw_vporttbl_put(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr); + +struct mlx5_flow_handle * +esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag); + +int esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num); +void esw_offloads_unload_rep(struct mlx5_eswitch *esw, u16 vport_num); + +int mlx5_esw_offloads_rep_load(struct mlx5_eswitch *esw, u16 vport_num); +void mlx5_esw_offloads_rep_unload(struct mlx5_eswitch *esw, u16 vport_num); + +int mlx5_eswitch_load_vport(struct mlx5_eswitch *esw, u16 vport_num, + enum mlx5_eswitch_vport_event enabled_events); +void mlx5_eswitch_unload_vport(struct mlx5_eswitch *esw, u16 vport_num); + +int mlx5_eswitch_load_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs, + enum mlx5_eswitch_vport_event enabled_events); +void mlx5_eswitch_unload_vf_vports(struct mlx5_eswitch *esw, u16 num_vfs); + +int mlx5_esw_offloads_devlink_port_register(struct mlx5_eswitch *esw, u16 vport_num); +void mlx5_esw_offloads_devlink_port_unregister(struct mlx5_eswitch *esw, u16 vport_num); +struct devlink_port *mlx5_esw_offloads_devlink_port(struct mlx5_eswitch *esw, u16 vport_num); + +void mlx5_esw_vport_debugfs_create(struct mlx5_eswitch *esw, u16 vport_num, bool is_sf, u16 sf_num); +void mlx5_esw_vport_debugfs_destroy(struct mlx5_eswitch *esw, u16 vport_num); + +int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_port *dl_port, + u16 vport_num, u32 controller, u32 sfnum); +void 
mlx5_esw_devlink_sf_port_unregister(struct mlx5_eswitch *esw, u16 vport_num); + +int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_port *dl_port, + u16 vport_num, u32 controller, u32 sfnum); +void mlx5_esw_offloads_sf_vport_disable(struct mlx5_eswitch *esw, u16 vport_num); +int mlx5_esw_sf_max_hpf_functions(struct mlx5_core_dev *dev, u16 *max_sfs, u16 *sf_base_id); + +int mlx5_esw_vport_vhca_id_set(struct mlx5_eswitch *esw, u16 vport_num); +void mlx5_esw_vport_vhca_id_clear(struct mlx5_eswitch *esw, u16 vport_num); +int mlx5_eswitch_vhca_id_to_vport(struct mlx5_eswitch *esw, u16 vhca_id, u16 *vport_num); + +/** + * mlx5_esw_event_info - Indicates eswitch mode changed/changing. + * + * @new_mode: New mode of eswitch. + */ +struct mlx5_esw_event_info { + u16 new_mode; +}; + +int mlx5_esw_event_notifier_register(struct mlx5_eswitch *esw, struct notifier_block *n); +void mlx5_esw_event_notifier_unregister(struct mlx5_eswitch *esw, struct notifier_block *n); +bool mlx5e_esw_offloads_pet_enabled(const struct mlx5_eswitch *esw); +int mlx5e_esw_offloads_pet_setup(struct mlx5_eswitch *esw, struct mlx5_flow_table *ft); +void mlx5e_esw_offloads_pet_cleanup(struct mlx5_eswitch *esw); + +bool mlx5_esw_hold(struct mlx5_core_dev *dev); +void mlx5_esw_release(struct mlx5_core_dev *dev); +void mlx5_esw_get(struct mlx5_core_dev *dev); +void mlx5_esw_put(struct mlx5_core_dev *dev); +int mlx5_esw_try_lock(struct mlx5_eswitch *esw); +void mlx5_esw_unlock(struct mlx5_eswitch *esw); + +void esw_vport_change_handle_locked(struct mlx5_vport *vport); + +bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 controller); +int mlx5_devlink_eswitch_steering_mode_set(struct devlink *devlink, + enum devlink_eswitch_steering_mode mode); +int mlx5_devlink_eswitch_steering_mode_get(struct devlink *devlink, + enum devlink_eswitch_steering_mode *mode); +int mlx5_devlink_eswitch_vport_match_mode_set(struct devlink *devlink, + enum devlink_eswitch_vport_match_mode mode); +int mlx5_devlink_eswitch_vport_match_mode_get(struct devlink *devlink, + enum devlink_eswitch_vport_match_mode *mode); +int mlx5_devlink_eswitch_lag_port_select_mode_get(struct devlink *devlink, + enum devlink_eswitch_lag_port_select_mode *mode); +int +mlx5_devlink_eswitch_lag_port_select_mode_set(struct devlink *devlink, + enum devlink_eswitch_lag_port_select_mode mode); +int mlx5_eswitch_offloads_config_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw); +void mlx5_eswitch_offloads_destroy_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw); +int mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw); +bool mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev); + +static inline int mlx5_eswitch_num_vfs(struct mlx5_eswitch *esw) +{ + if (mlx5_esw_allowed(esw)) + return esw->esw_funcs.num_vfs; + + return 0; +} + +#else /* CONFIG_MLX5_ESWITCH */ +/* eswitch API stubs */ +static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {} +static inline int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) { return 0; } +static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf) {} +static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw) {} +static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; } +static inline +int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, u16 vport, int 
link_state) { return 0; } +static inline const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void mlx5_esw_unlock(struct mlx5_eswitch *esw) { return; } +static inline void mlx5_esw_lock(struct mlx5_eswitch *esw) { return; } + +static inline struct mlx5_flow_handle * +esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline bool +mlx5_esw_is_manager_vport(const struct mlx5_eswitch *esw, u16 vport_num) +{ + return vport_num ? true : false; +} + +static inline unsigned int +mlx5_esw_vport_to_devlink_port_index(const struct mlx5_core_dev *dev, + u16 vport_num) +{ + return vport_num; +} + +static inline int +mlx5_eswitch_offloads_config_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw) +{ + return 0; +} + +static inline void +mlx5_eswitch_offloads_destroy_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw) {} + +static inline int +mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw) +{ + return 0; +} + +static inline bool mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev) +{ + return true; +} +#endif /* CONFIG_MLX5_ESWITCH */ + +int mlx5_eswitch_compat_sysfs_init(struct net_device *netdev); +void mlx5_eswitch_compat_sysfs_cleanup(struct net_device *netdev); +#endif /* __MLX5_ESWITCH_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_devlink_compat.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_devlink_compat.c new file mode 100644 index 0000000..d53cbe4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_devlink_compat.c @@ -0,0 +1,451 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "eswitch.h" +#include "devlink.h" +#include "en.h" + +#ifdef CONFIG_MLX5_ESWITCH + +static char *mode_to_str[] = { + [DEVLINK_ESWITCH_MODE_LEGACY] = "legacy", + [DEVLINK_ESWITCH_MODE_SWITCHDEV] = "switchdev", +}; + +static char *inline_to_str[] = { + [DEVLINK_ESWITCH_INLINE_MODE_NONE] = "none", + [DEVLINK_ESWITCH_INLINE_MODE_LINK] = "link", + [DEVLINK_ESWITCH_INLINE_MODE_NETWORK] = "network", + [DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT] = "transport", +}; + +static char *encap_to_str[] = { + [DEVLINK_ESWITCH_ENCAP_MODE_NONE] = "none", + [DEVLINK_ESWITCH_ENCAP_MODE_BASIC] = "basic", +}; + +static char *steering_mode_to_str[] = { + [DEVLINK_ESWITCH_STEERING_MODE_DMFS] = "dmfs", + [DEVLINK_ESWITCH_STEERING_MODE_SMFS] = "smfs", +}; + +#ifdef HAVE_XFRM_OFFLOAD_FULL +static char *ipsec_to_str[] = { + [DEVLINK_ESWITCH_IPSEC_MODE_NONE] = "none", + [DEVLINK_ESWITCH_IPSEC_MODE_FULL] = "full", +}; +#endif + +static char *vport_match_to_str[] = { + [DEVLINK_ESWITCH_VPORT_MATCH_MODE_METADATA] = "metadata", + [DEVLINK_ESWITCH_VPORT_MATCH_MODE_LEGACY] = "legacy", +}; + +static char *devlink_param_bool_to_str[] = { + [0] = "disable", + [1] = "enable", +}; + +static char *lag_port_select_mode_to_str[] = { + [DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY] = + "queue_affinity", + [DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_HASH] = "hash", + [DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_MULTIPORT_ESW] = "multiport_esw", +}; + +struct devlink_compat_op { +#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK + int (*write_enum)(struct devlink *devlink, enum devlink_eswitch_encap_mode set, struct netlink_ext_ack *extack); + int (*write_enum_ipsec)(struct devlink *devlink, enum 
devlink_eswitch_ipsec_mode ipsec, struct netlink_ext_ack *extack); + int (*write_u8)(struct devlink *devlink, u8 set, struct netlink_ext_ack *extack); + int (*write_u16)(struct devlink *devlink, u16 set, struct netlink_ext_ack *extack); +#else + int (*write_enum_ipsec)(struct devlink *devlink, enum devlink_eswitch_ipsec_mode ipsec); + int (*write_enum)(struct devlink *devlink, enum devlink_eswitch_encap_mode set); + int (*write_u8)(struct devlink *devlink, u8 set); + int (*write_u16)(struct devlink *devlink, u16 set); +#endif + int (*read_enum)(struct devlink *devlink, enum devlink_eswitch_encap_mode *read); + int (*read_enum_ipsec)(struct devlink *devlink, enum devlink_eswitch_ipsec_mode *ipsec); + int (*read_u8)(struct devlink *devlink, u8 *read); + int (*read_u16)(struct devlink *devlink, u16 *read); + + int (*read_steering_mode)(struct devlink *devlink, enum devlink_eswitch_steering_mode *read); + int (*write_steering_mode)(struct devlink *devlink, enum devlink_eswitch_steering_mode set); + + int (*read_vport_match_mode)(struct devlink *devlink, enum devlink_eswitch_vport_match_mode *read); + int (*write_vport_match_mode)(struct devlink *devlink, enum devlink_eswitch_vport_match_mode set); + + int (*read_lag_port_select_mode)(struct devlink *devlink, + enum devlink_eswitch_lag_port_select_mode *read); + int (*write_lag_port_select_mode)(struct devlink *devlink, + enum devlink_eswitch_lag_port_select_mode set); + + int (*read_ct_action_on_nat_conns)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); + int (*write_ct_action_on_nat_conns)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); + + int (*read_param_bool)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); + int (*write_param_bool)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); + + char **map; + int map_size; + char *compat_name; +}; + +static struct devlink_compat_op devlink_compat_ops[] = { + { + .read_u16 = mlx5_devlink_eswitch_mode_get, + .write_u16 = mlx5_devlink_eswitch_mode_set, + .map = mode_to_str, + .map_size = ARRAY_SIZE(mode_to_str), + .compat_name = "mode", + }, + { + .read_u8 = mlx5_devlink_eswitch_inline_mode_get, + .write_u8 = mlx5_devlink_eswitch_inline_mode_set, + .map = inline_to_str, + .map_size = ARRAY_SIZE(inline_to_str), + .compat_name = "inline", + }, + { +#ifdef HAVE_DEVLINK_HAS_ESWITCH_ENCAP_MODE_SET_GET_WITH_ENUM + .read_enum = mlx5_devlink_eswitch_encap_mode_get, + .write_enum = mlx5_devlink_eswitch_encap_mode_set, +#else + .read_u8 = mlx5_devlink_eswitch_encap_mode_get, + .write_u8 = mlx5_devlink_eswitch_encap_mode_set, +#endif + .map = encap_to_str, + .map_size = ARRAY_SIZE(encap_to_str), + .compat_name = "encap", + }, + { + .read_steering_mode = mlx5_devlink_eswitch_steering_mode_get, + .write_steering_mode = mlx5_devlink_eswitch_steering_mode_set, + .map = steering_mode_to_str, + .map_size = ARRAY_SIZE(steering_mode_to_str), + .compat_name = "steering_mode", + }, +#ifdef HAVE_XFRM_OFFLOAD_FULL + { + .read_enum_ipsec = mlx5_devlink_eswitch_ipsec_mode_get, + .write_enum_ipsec = mlx5_devlink_eswitch_ipsec_mode_set, + .map = ipsec_to_str, + .map_size = ARRAY_SIZE(ipsec_to_str), + .compat_name = "ipsec_mode", + }, +#endif + { + .read_vport_match_mode = mlx5_devlink_eswitch_vport_match_mode_get, + .write_vport_match_mode = mlx5_devlink_eswitch_vport_match_mode_set, + .map = vport_match_to_str, + .map_size = ARRAY_SIZE(vport_match_to_str), + .compat_name = "vport_match_mode", + }, + { + .read_param_bool = 
mlx5_devlink_ct_action_on_nat_conns_get, + .write_param_bool = mlx5_devlink_ct_action_on_nat_conns_set, + .compat_name = "ct_action_on_nat_conns", + }, + { + .read_lag_port_select_mode = + mlx5_devlink_eswitch_lag_port_select_mode_get, + .write_lag_port_select_mode = + mlx5_devlink_eswitch_lag_port_select_mode_set, + .map = lag_port_select_mode_to_str, + .map_size = ARRAY_SIZE(lag_port_select_mode_to_str), + .compat_name = "lag_port_select_mode", + }, + { + .read_param_bool = mlx5_devlink_ct_labels_mapping_get, + .write_param_bool = mlx5_devlink_ct_labels_mapping_set, + .compat_name = "ct_labels_mapping", + }, +}; + +struct compat_devlink { + struct mlx5_core_dev *mdev; + struct kobj_attribute devlink_kobj; +}; + +static ssize_t esw_compat_read(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct compat_devlink *cdevlink = container_of(attr, + struct compat_devlink, + devlink_kobj); + struct mlx5_core_dev *dev = cdevlink->mdev; + const char *entname = attr->attr.name; + int i = 0, ret, len = 0, map_size; + struct devlink_compat_op *op = 0; + struct devlink *devlink; + char **map; + u8 read8; + u16 read; + + for (i = 0; i < ARRAY_SIZE(devlink_compat_ops); i++) { + if (!strcmp(devlink_compat_ops[i].compat_name, entname)) + op = &devlink_compat_ops[i]; + } + + if (!op) + return -ENOENT; + + devlink = priv_to_devlink(dev); + map_size = op->map_size; + map = op->map; + + if (op->read_u16) { + ret = op->read_u16(devlink, &read); + } else if (op->read_u8) { + ret = op->read_u8(devlink, &read8); + read = read8; + } else if (op->read_enum) { + enum devlink_eswitch_encap_mode read_enum; + + ret = op->read_enum(devlink, &read_enum); + read = read_enum; + } else if (op->read_steering_mode) { + enum devlink_eswitch_steering_mode read_steering_mode; + + ret = op->read_steering_mode(devlink, &read_steering_mode); + read = read_steering_mode; + } else if (op->read_lag_port_select_mode) { + enum devlink_eswitch_lag_port_select_mode lag_port_select_mode; + + ret = op->read_lag_port_select_mode(devlink, + &lag_port_select_mode); + read = lag_port_select_mode; + } else if (op->read_enum_ipsec) { + enum devlink_eswitch_ipsec_mode read_enum_ipsec; + + ret = op->read_enum_ipsec(devlink, &read_enum_ipsec); + read = read_enum_ipsec; + } else if (op->read_vport_match_mode) { + enum devlink_eswitch_vport_match_mode read_vport_match_mode; + + ret = op->read_vport_match_mode(devlink, &read_vport_match_mode); + read = read_vport_match_mode; + } else if (op->read_param_bool) { + struct devlink_param_gset_ctx ctx; + + ret = op->read_param_bool(devlink, 0, &ctx); + read = ctx.val.vbool; + map = devlink_param_bool_to_str; + map_size = ARRAY_SIZE(devlink_param_bool_to_str); + } else + ret = -ENOENT; + + if (ret < 0) + return ret; + + if (read < map_size && map[read]) + len = sprintf(buf, "%s\n", map[read]); + else + len = sprintf(buf, "return: %d\n", read); + + return len; +} + +static ssize_t esw_compat_write(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct compat_devlink *cdevlink = container_of(attr, + struct compat_devlink, + devlink_kobj); + struct mlx5_core_dev *dev = cdevlink->mdev; +#ifdef HAVE_NETLINK_EXT_ACK + static struct netlink_ext_ack ack = { ._msg = NULL }; +#endif + const char *entname = attr->attr.name; + struct devlink_compat_op *op = 0; + int ret = 0, i = 0, map_size; + struct devlink *devlink; + u16 set = 0; + char **map; + + for (i = 0; i < ARRAY_SIZE(devlink_compat_ops); i++) { + if (!strcmp(devlink_compat_ops[i].compat_name, 
entname)) { + op = &devlink_compat_ops[i]; + break; + } + } + + if (!op) + return -ENOENT; + + devlink = priv_to_devlink(dev); + map = op->map; + map_size = op->map_size; + + if (op->write_param_bool) { + map = devlink_param_bool_to_str; + map_size = ARRAY_SIZE(devlink_param_bool_to_str); + } + + for (i = 0; i < map_size; i++) { + if (map[i] && sysfs_streq(map[i], buf)) { + set = i; + break; + } + } + + if (i >= map_size) { + mlx5_core_warn(dev, "devlink op %s doesn't support %s argument\n", + op->compat_name, buf); + return -EINVAL; + } + + if (op->write_u16) + ret = op->write_u16(devlink, set +#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK + , &ack +#endif + ); + else if (op->write_u8) + ret = op->write_u8(devlink, set +#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK + , &ack +#endif + ); + else if (op->write_enum) + ret = op->write_enum(devlink, set +#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK + , &ack +#endif + ); + else if (op->write_steering_mode) + ret = op->write_steering_mode(devlink, set); + else if (op->write_lag_port_select_mode) + ret = op->write_lag_port_select_mode(devlink, set); + else if (op->write_param_bool) { + struct devlink_param_gset_ctx ctx; + + ctx.val.vbool = set; + ret = op->write_param_bool(devlink, 0, &ctx); + } else if (op->write_vport_match_mode) + ret = op->write_vport_match_mode(devlink, set); + else if (op->write_enum_ipsec) + ret = op->write_enum_ipsec(devlink, set +#ifdef HAVE_DEVLINK_ESWITCH_MODE_SET_EXTACK + , &ack +#endif + ); + else + ret = -EINVAL; + +#ifdef HAVE_NETLINK_EXT_ACK + if (ack._msg) + mlx5_core_warn(dev, "%s\n", ack._msg); +#endif + if (ret < 0) + return ret; + + return count; +} + +int mlx5_eswitch_compat_sysfs_init(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct kobj_attribute *kobj; + struct compat_devlink *cdevlink; + struct mlx5_core_dev *mdev; + int i; + int err; + + mdev = priv->mdev; + mdev->mlx5e_res.compat.compat_kobj = kobject_create_and_add("compat", + &netdev->dev.kobj); + if (!mdev->mlx5e_res.compat.compat_kobj) + return -ENOMEM; + + mdev->mlx5e_res.compat.devlink_kobj = + kobject_create_and_add("devlink", + mdev->mlx5e_res.compat.compat_kobj); + if (!mdev->mlx5e_res.compat.devlink_kobj) { + err = -ENOMEM; + goto cleanup_compat; + } + + cdevlink = kzalloc(sizeof(*cdevlink) * ARRAY_SIZE(devlink_compat_ops), + GFP_KERNEL); + if (!cdevlink) { + err = -ENOMEM; + goto cleanup_devlink; + } + mdev->mlx5e_res.compat.devlink_attributes = cdevlink; + + for (i = 0; i < ARRAY_SIZE(devlink_compat_ops); i++) { + cdevlink->mdev = priv->mdev; + kobj = &cdevlink->devlink_kobj; + sysfs_attr_init(&kobj->attr); + kobj->attr.mode = 0644; + kobj->attr.name = devlink_compat_ops[i].compat_name; + kobj->show = esw_compat_read; + kobj->store = esw_compat_write; + WARN_ON_ONCE(sysfs_create_file(mdev->mlx5e_res.compat.devlink_kobj, + &kobj->attr)); + cdevlink++; + } + + return 0; + +cleanup_devlink: + kobject_put(mdev->mlx5e_res.compat.devlink_kobj); +cleanup_compat: + kobject_put(mdev->mlx5e_res.compat.compat_kobj); + mdev->mlx5e_res.compat.devlink_kobj = NULL; + return err; +} + +void mlx5_eswitch_compat_sysfs_cleanup(struct net_device *netdev) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct compat_devlink *cdevlink; + struct kobj_attribute *kobj; + struct mlx5_core_dev *mdev; + int i; + + mdev = priv->mdev; + if (!mdev->mlx5e_res.compat.devlink_kobj) + return; + + cdevlink = mdev->mlx5e_res.compat.devlink_attributes; + + for (i = 0; i < ARRAY_SIZE(devlink_compat_ops); i++) { + kobj = 
&cdevlink->devlink_kobj; + + sysfs_remove_file(mdev->mlx5e_res.compat.devlink_kobj, &kobj->attr); + cdevlink++; + } + kfree(mdev->mlx5e_res.compat.devlink_attributes); + kobject_put(mdev->mlx5e_res.compat.devlink_kobj); + kobject_put(mdev->mlx5e_res.compat.compat_kobj); + + mdev->mlx5e_res.compat.devlink_kobj = NULL; +} + +#else + +int mlx5_eswitch_compat_sysfs_init(struct net_device *netdev) +{ + return 0; +} + +void mlx5_eswitch_compat_sysfs_cleanup(struct net_device *netdev) +{ +} + +#endif /* CONFIG_MLX5_ESWITCH */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c new file mode 100644 index 0000000..4f855f4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -0,0 +1,4471 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include +#include "esw/indir_table.h" +#include "esw/acl/ofld.h" +#include "accel/ipsec_offload.h" +#include "rdma.h" +#include "en.h" +#include "fs_core.h" +#include "lib/devcom.h" +#include "lib/eq.h" +#include "lib/fs_chains.h" +#include "en_tc.h" +#include "en/mapping.h" +#include "esw/ipsec.h" +#include "lag/lag.h" +#include "devlink.h" + +#define mlx5_esw_for_each_rep(esw, i, rep) \ + xa_for_each(&((esw)->offloads.vport_reps), i, rep) + +#define mlx5_esw_for_each_sf_rep(esw, i, rep) \ + xa_for_each_marked(&((esw)->offloads.vport_reps), i, rep, MLX5_ESW_VPT_SF) + +#define mlx5_esw_for_each_vf_rep(esw, index, rep) \ + mlx5_esw_for_each_entry_marked(&((esw)->offloads.vport_reps), index, \ + rep, (esw)->esw_funcs.num_vfs, MLX5_ESW_VPT_VF) + +/* There are two match-all miss flows, one for unicast dst mac and + * one for multicast. 
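+ * Their handles live in fdb_table.offloads as miss_rule_uni and
+ * miss_rule_multi; the MLX5_ESW_MISS_FLOWS define below counts them.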
+ */ +#define MLX5_ESW_MISS_FLOWS (2) +#define UPLINK_REP_INDEX 0 + +#define MLX5_ESW_VPORT_TBL_SIZE 128 +#define MLX5_ESW_VPORT_TBL_NUM_GROUPS 4 + +static const struct esw_vport_tbl_namespace mlx5_esw_vport_tbl_mirror_ns = { + .max_fte = MLX5_ESW_VPORT_TBL_SIZE, + .max_num_groups = MLX5_ESW_VPORT_TBL_NUM_GROUPS, + .flags = 0, +}; + +static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, + u16 vport_num) +{ + return xa_load(&esw->offloads.vport_reps, vport_num); +} + +static void +mlx5_eswitch_set_rule_flow_source(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_esw_flow_attr *attr) +{ + if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, flow_source) || !attr || !attr->in_rep) + return; + + if (attr->int_port) { + spec->flow_context.flow_source = mlx5e_tc_int_port_get_flow_source(attr->int_port); + + return; + } + + spec->flow_context.flow_source = (attr->in_rep->vport == MLX5_VPORT_UPLINK) ? + MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK : + MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT; +} + +/* Actually only the upper 16 bits of reg c0 need to be cleared, but the lower 16 bits + * are not needed as well in the following process. So clear them all for simplicity. + */ +void +mlx5_eswitch_clear_rule_source_port(struct mlx5_eswitch *esw, struct mlx5_flow_spec *spec) +{ + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + void *misc2; + + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_0, 0); + + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_0, 0); + + if (!memchr_inv(misc2, 0, MLX5_ST_SZ_BYTES(fte_match_set_misc2))) + spec->match_criteria_enable &= ~MLX5_MATCH_MISC_PARAMETERS_2; + } +} + +void mlx5_eswitch_set_rule_source_port(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr, + struct mlx5_eswitch *src_esw, + u16 vport) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + u32 metadata; + void *misc2; + void *misc; + + /* Use metadata matching because vport is not represented by single + * VHCA in dual-port RoCE mode, and matching on source vport may fail. 
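+ * When metadata matching is enabled the source vport is encoded in
+ * metadata_reg_c_0; otherwise fall back to matching on source_port (and
+ * on source_eswitch_owner_vhca_id for merged eswitches).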
+ */ + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + if (attr && mlx5_esw_indir_table_decap_vport(attr)) + vport = mlx5_esw_indir_table_decap_vport(attr); + + if (attr && !attr->chain && esw_attr->int_port) + metadata = + mlx5e_tc_int_port_get_metadata_for_match(esw_attr->int_port); + else + metadata = + mlx5_eswitch_get_vport_metadata_for_match(src_esw, vport); + + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_0, metadata); + + misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; + } else { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, vport); + + if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + MLX5_SET(fte_match_set_misc, misc, + source_eswitch_owner_vhca_id, + MLX5_CAP_GEN(src_esw->dev, vhca_id)); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + MLX5_SET_TO_ONES(fte_match_set_misc, misc, + source_eswitch_owner_vhca_id); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS; + } +} + +static int +esw_setup_decap_indir(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec) +{ + struct mlx5_flow_table *ft; + + if (!(attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE)) + return -EOPNOTSUPP; + + ft = mlx5_esw_indir_table_get(esw, attr, spec, + mlx5_esw_indir_table_decap_vport(attr), true); + return PTR_ERR_OR_ZERO(ft); +} + +static void +esw_cleanup_decap_indir(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr) +{ + if (mlx5_esw_indir_table_decap_vport(attr)) + mlx5_esw_indir_table_put(esw, attr, + mlx5_esw_indir_table_decap_vport(attr), + true); +} + +static int +esw_setup_sampler_dest(struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + u32 sampler_id, + int i) +{ + flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER; + dest[i].sampler_id = sampler_id; + + return 0; +} + +static int +esw_setup_ft_dest(struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + int i) +{ + flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[i].ft = attr->dest_ft; + + if (mlx5_esw_indir_table_decap_vport(attr)) + return esw_setup_decap_indir(esw, attr, spec); + return 0; +} + +static void +esw_setup_accept_dest(struct mlx5_flow_destination *dest, struct mlx5_flow_act *flow_act, + struct mlx5_fs_chains *chains, int i) +{ + if (mlx5_chains_ignore_flow_level_supported(chains)) + flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[i].ft = mlx5_chains_get_tc_end_ft(chains); +} + +static void +esw_setup_slow_path_dest(struct mlx5_flow_destination *dest, struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, int i) +{ + if (MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level)) + flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[i].ft = esw->fdb_table.offloads.slow_fdb; +} + +static int +esw_setup_chain_dest(struct 
mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_fs_chains *chains, + u32 chain, u32 prio, u32 level, + int i) +{ + struct mlx5_flow_table *ft; + + flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + ft = mlx5_chains_get_table(chains, chain, prio, level); + if (IS_ERR(ft)) + return PTR_ERR(ft); + + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[i].ft = ft; + return 0; +} + +static void esw_put_dest_tables_loop(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr, + int from, int to) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + int i; + + for (i = from; i < to; i++) + if (esw_attr->dests[i].flags & MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + mlx5_chains_put_table(chains, 0, 1, 0); + else if (mlx5_esw_indir_table_needed(esw, attr, esw_attr->dests[i].rep->vport, + esw_attr->dests[i].mdev)) + mlx5_esw_indir_table_put(esw, attr, esw_attr->dests[i].rep->vport, + false); +} + +static bool +esw_is_chain_src_port_rewrite(struct mlx5_eswitch *esw, struct mlx5_esw_flow_attr *esw_attr) +{ + int i; + + for (i = esw_attr->split_count; i < esw_attr->out_count; i++) + if (esw_attr->dests[i].flags & MLX5_ESW_DEST_CHAIN_WITH_SRC_PORT_CHANGE) + return true; + return false; +} + +static int +esw_setup_chain_src_port_rewrite(struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, + struct mlx5_fs_chains *chains, + struct mlx5_flow_attr *attr, + int *i) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + int err; + + if (!(attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE)) + return -EOPNOTSUPP; + + /* flow steering cannot handle more than one dest with the same ft + * in a single flow + */ + if (esw_attr->out_count - esw_attr->split_count > 1) + return -EOPNOTSUPP; + + err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain, 1, 0, *i); + if (err) + return err; + + if (esw_attr->dests[esw_attr->split_count].pkt_reformat) { + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_act->pkt_reformat = esw_attr->dests[esw_attr->split_count].pkt_reformat; + } + (*i)++; + + return 0; +} + +static void esw_cleanup_chain_src_port_rewrite(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + + esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, esw_attr->out_count); +} + +static bool +esw_is_indir_table(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + bool result = false; + int i; + + /* Indirect table is supported only for flows with in_port uplink + * and the destination is vport on the same eswitch as the uplink, + * return false in case at least one of destinations doesn't meet + * this criteria. 
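+ * Only the destinations in the split_count..out_count range are
+ * inspected, matching the range that esw_setup_indir_table() programs.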
+ */ + for (i = esw_attr->split_count; i < esw_attr->out_count; i++) { + if (esw_attr->dests[i].rep && + mlx5_esw_indir_table_needed(esw, attr, esw_attr->dests[i].rep->vport, + esw_attr->dests[i].mdev)) { + result = true; + } else { + result = false; + break; + } + } + return result; +} + +static int +esw_setup_indir_table(struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + bool ignore_flow_lvl, + int *i) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + int j, err; + + if (!(attr->flags & MLX5_ATTR_FLAG_SRC_REWRITE)) + return -EOPNOTSUPP; + + for (j = esw_attr->split_count; j < esw_attr->out_count; j++, (*i)++) { + if (ignore_flow_lvl) + flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[*i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + + dest[*i].ft = mlx5_esw_indir_table_get(esw, attr, spec, + esw_attr->dests[j].rep->vport, false); + if (IS_ERR(dest[*i].ft)) { + err = PTR_ERR(dest[*i].ft); + goto err_indir_tbl_get; + } + } + + if (mlx5_esw_indir_table_decap_vport(attr)) { + err = esw_setup_decap_indir(esw, attr, spec); + if (err) + goto err_indir_tbl_get; + } + + return 0; + +err_indir_tbl_get: + esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, j); + return err; +} + +static void esw_cleanup_indir_table(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + + esw_put_dest_tables_loop(esw, attr, esw_attr->split_count, esw_attr->out_count); + esw_cleanup_decap_indir(esw, attr); +} + +static void +esw_cleanup_chain_dest(struct mlx5_fs_chains *chains, u32 chain, u32 prio, u32 level) +{ + mlx5_chains_put_table(chains, chain, prio, level); +} + +static bool esw_same_vhca_id(struct mlx5_core_dev *mdev1, struct mlx5_core_dev *mdev2) +{ + return (MLX5_CAP_GEN(mdev1, vhca_id) == MLX5_CAP_GEN(mdev2, vhca_id)); +} + +static void +esw_setup_vport_dest(struct mlx5_flow_destination *dest, struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, struct mlx5_esw_flow_attr *esw_attr, + int attr_idx, int dest_idx, bool pkt_reformat) +{ + if (esw->offloads.ipsec == DEVLINK_ESWITCH_IPSEC_MODE_FULL && + esw_attr->dests[attr_idx].rep && + esw_attr->dests[attr_idx].rep->vport == MLX5_VPORT_UPLINK && + esw_attr->dests[attr_idx].rep != esw_attr->in_rep && + esw_same_vhca_id(esw_attr->dests[attr_idx].mdev, esw->dev)) { + /* No vhca_id support with IPsec */ + dest[dest_idx].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[dest_idx].ft = mlx5_esw_ipsec_get_table(esw, MLX5_ESW_IPSEC_FT_TX_IKE); + } else { + if (esw_attr->dests[attr_idx].rep->vport == MLX5_VPORT_UPLINK && + mlx5_lag_mpesw_is_activated(esw)) + dest[dest_idx].type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; + else + dest[dest_idx].type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest[dest_idx].vport.num = esw_attr->dests[attr_idx].rep->vport; + if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) { + dest[dest_idx].vport.vhca_id = + MLX5_CAP_GEN(esw_attr->dests[attr_idx].mdev, vhca_id); + dest[dest_idx].vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; + } + } + + if (esw_attr->dests[attr_idx].flags & MLX5_ESW_DEST_ENCAP_VALID) { + if (pkt_reformat) { + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_act->pkt_reformat = esw_attr->dests[attr_idx].pkt_reformat; + } + dest[dest_idx].vport.flags |= MLX5_FLOW_DEST_VPORT_REFORMAT_ID; + dest[dest_idx].vport.pkt_reformat = esw_attr->dests[attr_idx].pkt_reformat; + } +} + + +static int +esw_setup_vport_dests(struct 
mlx5_flow_destination *dest, struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, struct mlx5_esw_flow_attr *esw_attr, + int i) +{ + int j; + + for (j = esw_attr->split_count; j < esw_attr->out_count; j++, i++) + esw_setup_vport_dest(dest, flow_act, esw, esw_attr, j, i, true); + return i; +} + +static bool +esw_src_port_rewrite_supported(struct mlx5_eswitch *esw) +{ + return MLX5_CAP_GEN(esw->dev, reg_c_preserve) && + mlx5_eswitch_vport_match_metadata_enabled(esw) && + MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level); +} + +static int +esw_setup_dests(struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_spec *spec, + int *i) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + int err = 0; + + if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) && + esw_src_port_rewrite_supported(esw)) + attr->flags |= MLX5_ATTR_FLAG_SRC_REWRITE; + + if (attr->flags & MLX5_ATTR_FLAG_SAMPLE && + !(attr->flags & MLX5_ATTR_FLAG_SLOW_PATH)) { + esw_setup_sampler_dest(dest, flow_act, attr->sample_attr.sampler_id, *i); + (*i)++; + } else if (attr->dest_ft) { + esw_setup_ft_dest(dest, flow_act, esw, attr, spec, *i); + (*i)++; + } else if (attr->flags & MLX5_ATTR_FLAG_SLOW_PATH) { + esw_setup_slow_path_dest(dest, flow_act, esw, *i); + (*i)++; + } else if (attr->flags & MLX5_ATTR_FLAG_ACCEPT) { + esw_setup_accept_dest(dest, flow_act, chains, *i); + (*i)++; + } else if (attr->dest_chain) { + err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain, + 1, 0, *i); + (*i)++; + } else if (esw_is_indir_table(esw, attr)) { + err = esw_setup_indir_table(dest, flow_act, esw, attr, spec, true, i); + } else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) { + err = esw_setup_chain_src_port_rewrite(dest, flow_act, esw, chains, attr, i); + } else { + *i = esw_setup_vport_dests(dest, flow_act, esw, esw_attr, *i); + } + + return err; +} + +static void +esw_cleanup_dests(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + + if (attr->dest_ft) { + esw_cleanup_decap_indir(esw, attr); + } else if (!mlx5e_tc_attr_flags_skip(attr->flags)) { + if (attr->dest_chain) + esw_cleanup_chain_dest(chains, attr->dest_chain, 1, 0); + else if (esw_is_indir_table(esw, attr)) + esw_cleanup_indir_table(esw, attr); + else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) + esw_cleanup_chain_src_port_rewrite(esw, attr); + } +} + +static void +esw_setup_meter(struct mlx5_flow_attr *attr, struct mlx5_flow_act *flow_act) +{ + struct mlx5e_flow_meter_handle *meter; + + meter = attr->meter_attr.meter; + flow_act->exe_aso.type = attr->exe_aso_type; + flow_act->exe_aso.object_id = meter->obj_id; + flow_act->exe_aso.flow_meter.meter_idx = meter->idx; + flow_act->exe_aso.flow_meter.init_color = MLX5_FLOW_METER_COLOR_GREEN; + /* use metadata reg 5 for packet color */ + flow_act->exe_aso.return_reg_id = 5; +} + +struct mlx5_flow_handle * +mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, }; + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + bool split = !!(esw_attr->split_count); + struct mlx5_vport_tbl_attr fwd_attr; + struct mlx5_flow_destination *dest; + struct 
mlx5_flow_handle *rule; + struct mlx5_flow_table *fdb; + int i = 0; + + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return ERR_PTR(-EOPNOTSUPP); + + dest = kcalloc(MLX5_MAX_FLOW_FWD_VPORTS + 1, sizeof(*dest), GFP_KERNEL); + if (!dest) + return ERR_PTR(-ENOMEM); + + flow_act.action = attr->action; + /* if per flow vlan pop/push is emulated, don't set that into the firmware */ + if (!mlx5_eswitch_vlan_actions_supported(esw->dev, 1)) + flow_act.action &= ~(MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); + else if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) { + flow_act.vlan[0].ethtype = ntohs(esw_attr->vlan_proto[0]); + flow_act.vlan[0].vid = esw_attr->vlan_vid[0]; + flow_act.vlan[0].prio = esw_attr->vlan_prio[0]; + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2) { + flow_act.vlan[1].ethtype = ntohs(esw_attr->vlan_proto[1]); + flow_act.vlan[1].vid = esw_attr->vlan_vid[1]; + flow_act.vlan[1].prio = esw_attr->vlan_prio[1]; + } + } + + mlx5_eswitch_set_rule_flow_source(esw, spec, esw_attr); + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + int err; + + err = esw_setup_dests(dest, &flow_act, esw, attr, spec, &i); + if (err) { + rule = ERR_PTR(err); + goto err_create_goto_table; + } + } + + if (esw_attr->decap_pkt_reformat) + flow_act.pkt_reformat = esw_attr->decap_pkt_reformat; + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[i].counter_id = mlx5_fc_id(attr->counter); + i++; + } + + if (attr->outer_match_level != MLX5_MATCH_NONE) + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + if (attr->inner_match_level != MLX5_MATCH_NONE) + spec->match_criteria_enable |= MLX5_MATCH_INNER_HEADERS; + + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + flow_act.modify_hdr = attr->modify_hdr; + + if ((flow_act.action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) && + attr->exe_aso_type == MLX5_EXE_ASO_FLOW_METER) + esw_setup_meter(attr, &flow_act); + + if (split) { + fwd_attr.chain = attr->chain; + fwd_attr.prio = attr->prio; + fwd_attr.vport = esw_attr->in_rep->vport; + fwd_attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; + + fdb = mlx5_esw_vporttbl_get(esw, &fwd_attr); + } else { + if (attr->chain || attr->prio) + fdb = mlx5_chains_get_table(chains, attr->chain, + attr->prio, 0); + else + fdb = attr->ft; + + if (!(attr->flags & MLX5_ATTR_FLAG_NO_IN_PORT)) + mlx5_eswitch_set_rule_source_port(esw, spec, attr, + esw_attr->in_mdev->priv.eswitch, + esw_attr->in_rep->vport); + } + if (IS_ERR(fdb)) { + rule = ERR_CAST(fdb); + goto err_esw_get; + } + + if (mlx5_eswitch_termtbl_required(esw, attr, &flow_act, spec)) + rule = mlx5_eswitch_add_termtbl_rule(esw, fdb, spec, esw_attr, + &flow_act, dest, i); + else + rule = mlx5_add_flow_rules(fdb, spec, &flow_act, dest, i); + if (IS_ERR(rule)) + goto err_add_rule; + else + atomic64_inc(&esw->offloads.num_flows); + + kfree(dest); + return rule; + +err_add_rule: + if (split) + mlx5_esw_vporttbl_put(esw, &fwd_attr); + else if (attr->chain || attr->prio) + mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); +err_esw_get: + esw_cleanup_dests(esw, attr); +err_create_goto_table: + kfree(dest); + return rule; +} + +struct mlx5_flow_handle * +mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_spec *spec, + struct mlx5_flow_attr *attr) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, }; + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + 
struct mlx5_vport_tbl_attr fwd_attr; + struct mlx5_flow_destination *dest; + struct mlx5_flow_table *fast_fdb; + struct mlx5_flow_table *fwd_fdb; + struct mlx5_flow_handle *rule; + int i, err = 0; + + dest = kcalloc(MLX5_MAX_FLOW_FWD_VPORTS + 1, sizeof(*dest), GFP_KERNEL); + if (!dest) + return ERR_PTR(-ENOMEM); + + fast_fdb = mlx5_chains_get_table(chains, attr->chain, attr->prio, 0); + if (IS_ERR(fast_fdb)) { + rule = ERR_CAST(fast_fdb); + goto err_get_fast; + } + + fwd_attr.chain = attr->chain; + fwd_attr.prio = attr->prio; + fwd_attr.vport = esw_attr->in_rep->vport; + fwd_attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; + fwd_fdb = mlx5_esw_vporttbl_get(esw, &fwd_attr); + if (IS_ERR(fwd_fdb)) { + rule = ERR_CAST(fwd_fdb); + goto err_get_fwd; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + for (i = 0; i < esw_attr->split_count; i++) { + if (esw_is_indir_table(esw, attr)) + err = esw_setup_indir_table(dest, &flow_act, esw, attr, spec, false, &i); + else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) + err = esw_setup_chain_src_port_rewrite(dest, &flow_act, esw, chains, attr, + &i); + else + esw_setup_vport_dest(dest, &flow_act, esw, esw_attr, i, i, false); + + if (err) { + rule = ERR_PTR(err); + goto err_chain_src_rewrite; + } + } + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[i].ft = fwd_fdb; + i++; + + mlx5_eswitch_set_rule_source_port(esw, spec, attr, + esw_attr->in_mdev->priv.eswitch, + esw_attr->in_rep->vport); + + if (attr->outer_match_level != MLX5_MATCH_NONE) + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i); + + if (IS_ERR(rule)) { + i = esw_attr->split_count; + goto err_chain_src_rewrite; + } + + atomic64_inc(&esw->offloads.num_flows); + + kfree(dest); + return rule; +err_chain_src_rewrite: + esw_put_dest_tables_loop(esw, attr, 0, i); + mlx5_esw_vporttbl_put(esw, &fwd_attr); +err_get_fwd: + mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); +err_get_fast: + kfree(dest); + return rule; +} + +static void +__mlx5_eswitch_del_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr, + bool fwd_rule) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_fs_chains *chains = esw_chains(esw); + bool split = (esw_attr->split_count > 0); + struct mlx5_vport_tbl_attr fwd_attr; + int i; + + mlx5_del_flow_rules(rule); + + if (!mlx5e_tc_attr_flags_skip(attr->flags)) { + /* unref the term table */ + for (i = 0; i < MLX5_MAX_FLOW_FWD_VPORTS; i++) { + if (esw_attr->dests[i].termtbl) + mlx5_eswitch_termtbl_put(esw, esw_attr->dests[i].termtbl); + } + } + + atomic64_dec(&esw->offloads.num_flows); + + if (fwd_rule || split) { + fwd_attr.chain = attr->chain; + fwd_attr.prio = attr->prio; + fwd_attr.vport = esw_attr->in_rep->vport; + fwd_attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; + } + + if (fwd_rule) { + mlx5_esw_vporttbl_put(esw, &fwd_attr); + mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); + esw_put_dest_tables_loop(esw, attr, 0, esw_attr->split_count); + } else { + if (split) + mlx5_esw_vporttbl_put(esw, &fwd_attr); + else if (attr->chain || attr->prio) + mlx5_chains_put_table(chains, attr->chain, attr->prio, 0); + esw_cleanup_dests(esw, attr); + } +} + +void +mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr) +{ + __mlx5_eswitch_del_rule(esw, rule, attr, false); +} + +void 
+mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_handle *rule, + struct mlx5_flow_attr *attr) +{ + __mlx5_eswitch_del_rule(esw, rule, attr, true); +} + +static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + int err = 0; + + esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none"); + mlx5_esw_for_each_host_func_vport(esw, i, rep, esw->esw_funcs.num_vfs) { + if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) + continue; + + err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, + htons(ETH_P_8021Q), val); + if (err) + goto out; + } + +out: + return err; +} + +static struct mlx5_eswitch_rep * +esw_vlan_action_get_vport(struct mlx5_esw_flow_attr *attr, bool push, bool pop) +{ + struct mlx5_eswitch_rep *in_rep, *out_rep, *vport = NULL; + + in_rep = attr->in_rep; + out_rep = attr->dests[0].rep; + + if (push) + vport = in_rep; + else if (pop) + vport = out_rep; + else + vport = in_rep; + + return vport; +} + +static int esw_add_vlan_action_check(struct mlx5_esw_flow_attr *attr, + bool push, bool pop, bool fwd) +{ + struct mlx5_eswitch_rep *in_rep, *out_rep; + + if ((push || pop) && !fwd) + goto out_notsupp; + + in_rep = attr->in_rep; + out_rep = attr->dests[0].rep; + + if (push && in_rep->vport == MLX5_VPORT_UPLINK) + goto out_notsupp; + + if (pop && out_rep->vport == MLX5_VPORT_UPLINK) + goto out_notsupp; + + /* vport has vlan push configured, can't offload VF --> wire rules w.o it */ + if (!push && !pop && fwd) + if (in_rep->vlan && out_rep->vport == MLX5_VPORT_UPLINK) + goto out_notsupp; + + /* protects against (1) setting rules with different vlans to push and + * (2) setting rules w.o vlans (attr->vlan = 0) && w. vlans to push (!= 0) + */ + if (push && in_rep->vlan_refcount && (in_rep->vlan != attr->vlan_vid[0])) + goto out_notsupp; + + return 0; + +out_notsupp: + return -EOPNOTSUPP; +} + +int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr) +{ + struct offloads_fdb *offloads = &esw->fdb_table.offloads; + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_eswitch_rep *vport = NULL; + bool push, pop, fwd; + int err = 0; + + /* nop if we're on the vlan push/pop non emulation mode */ + if (mlx5_eswitch_vlan_actions_supported(esw->dev, 1)) + return 0; + + push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH); + pop = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); + fwd = !!((attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) && + !attr->dest_chain); + + mutex_lock(&esw->state_lock); + + err = esw_add_vlan_action_check(esw_attr, push, pop, fwd); + if (err) + goto unlock; + + attr->flags &= ~MLX5_ATTR_FLAG_VLAN_HANDLED; + + vport = esw_vlan_action_get_vport(esw_attr, push, pop); + + if (!push && !pop && fwd) { + /* tracks VF --> wire rules without vlan push action */ + if (esw_attr->dests[0].rep->vport == MLX5_VPORT_UPLINK) { + vport->vlan_refcount++; + attr->flags |= MLX5_ATTR_FLAG_VLAN_HANDLED; + } + + goto unlock; + } + + if (!push && !pop) + goto unlock; + + if (!(offloads->vlan_push_pop_refcount)) { + /* it's the 1st vlan rule, apply global vlan pop policy */ + err = esw_set_global_vlan_pop(esw, SET_VLAN_STRIP); + if (err) + goto out; + } + offloads->vlan_push_pop_refcount++; + + if (push) { + if (vport->vlan_refcount) + goto skip_set_push; + + err = __mlx5_eswitch_set_vport_vlan(esw, vport->vport, esw_attr->vlan_vid[0], 0, + htons(ETH_P_8021Q), + SET_VLAN_INSERT | SET_VLAN_STRIP); + if 
(err) + goto out; + vport->vlan = esw_attr->vlan_vid[0]; +skip_set_push: + vport->vlan_refcount++; + } +out: + if (!err) + attr->flags |= MLX5_ATTR_FLAG_VLAN_HANDLED; +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr) +{ + struct offloads_fdb *offloads = &esw->fdb_table.offloads; + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + struct mlx5_eswitch_rep *vport = NULL; + bool push, pop, fwd; + int err = 0; + + /* nop if we're on the vlan push/pop non emulation mode */ + if (mlx5_eswitch_vlan_actions_supported(esw->dev, 1)) + return 0; + + if (!(attr->flags & MLX5_ATTR_FLAG_VLAN_HANDLED)) + return 0; + + push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH); + pop = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); + fwd = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST); + + mutex_lock(&esw->state_lock); + + vport = esw_vlan_action_get_vport(esw_attr, push, pop); + + if (!push && !pop && fwd) { + /* tracks VF --> wire rules without vlan push action */ + if (esw_attr->dests[0].rep->vport == MLX5_VPORT_UPLINK) + vport->vlan_refcount--; + + goto out; + } + + if (push) { + vport->vlan_refcount--; + if (vport->vlan_refcount) + goto skip_unset_push; + + vport->vlan = 0; + err = __mlx5_eswitch_set_vport_vlan(esw, vport->vport, 0, 0, + htons(ETH_P_8021Q), + SET_VLAN_STRIP); + if (err) + goto out; + } + +skip_unset_push: + offloads->vlan_push_pop_refcount--; + if (offloads->vlan_push_pop_refcount) + goto out; + + /* no more vlan rules, stop global vlan pop policy */ + err = esw_set_global_vlan_pop(esw, 0); + +out: + mutex_unlock(&esw->state_lock); + return err; +} + +struct mlx5_flow_handle * +mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, + struct mlx5_eswitch *from_esw, + struct mlx5_eswitch_rep *rep, + u32 sqn) +{ + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + void *misc; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + flow_rule = ERR_PTR(-ENOMEM); + goto out; + } + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn); + /* source vport is the esw manager */ + MLX5_SET(fte_match_set_misc, misc, source_port, from_esw->manager_vport); + if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch)) + MLX5_SET(fte_match_set_misc, misc, source_eswitch_owner_vhca_id, + MLX5_CAP_GEN(from_esw->dev, vhca_id)); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch)) + MLX5_SET_TO_ONES(fte_match_set_misc, misc, + source_eswitch_owner_vhca_id); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + if (on_esw->offloads.ipsec == DEVLINK_ESWITCH_IPSEC_MODE_FULL && + rep->vport == MLX5_VPORT_UPLINK) { + /* no vhca_id support for IPsec */ + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = mlx5_esw_ipsec_get_table(on_esw, MLX5_ESW_IPSEC_FT_TX_IKE); + } else { + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport.num = rep->vport; + dest.vport.vhca_id = MLX5_CAP_GEN(rep->esw->dev, vhca_id); + dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; + } + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + if (rep->vport == MLX5_VPORT_UPLINK) + spec->flow_context.flow_source = 
MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT; + + flow_rule = mlx5_add_flow_rules(on_esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) + esw_warn(on_esw->dev, "FDB: Failed to add send to vport rule err %ld\n", + PTR_ERR(flow_rule)); +out: + kvfree(spec); + return flow_rule; +} +EXPORT_SYMBOL(mlx5_eswitch_add_send_to_vport_rule); + +void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule) +{ + mlx5_del_flow_rules(rule); +} + +void mlx5_eswitch_del_send_to_vport_meta_rule(struct mlx5_flow_handle *rule) +{ + if (rule) + mlx5_del_flow_rules(rule); +} + +struct mlx5_flow_handle * +mlx5_eswitch_add_send_to_vport_meta_rule(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + MLX5_SET(fte_match_param, spec->match_criteria, + misc_parameters_2.metadata_reg_c_0, mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(fte_match_param, spec->match_criteria, + misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK); + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_1, + ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num)); + dest.vport.num = vport_num; + + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) + esw_warn(esw->dev, "FDB: Failed to add send to vport meta rule vport %d, err %ld\n", + vport_num, PTR_ERR(flow_rule)); + + kvfree(spec); + return flow_rule; +} + +static bool mlx5_eswitch_reg_c1_loopback_supported(struct mlx5_eswitch *esw) +{ + return MLX5_CAP_ESW_FLOWTABLE(esw->dev, fdb_to_vport_reg_c_id) & + MLX5_FDB_TO_VPORT_REG_C_1; +} + +static int esw_set_passing_vport_metadata(struct mlx5_eswitch *esw, bool enable) +{ + u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {}; + u32 min[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {}; + u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {}; + u8 curr, wanted; + int err; + + if (!mlx5_eswitch_reg_c1_loopback_supported(esw) && + !mlx5_eswitch_vport_match_metadata_enabled(esw)) + return 0; + + MLX5_SET(query_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); + err = mlx5_cmd_exec_inout(esw->dev, query_esw_vport_context, in, out); + if (err) + return err; + + curr = MLX5_GET(query_esw_vport_context_out, out, + esw_vport_context.fdb_to_vport_reg_c_id); + wanted = MLX5_FDB_TO_VPORT_REG_C_0; + if (mlx5_eswitch_reg_c1_loopback_supported(esw)) + wanted |= MLX5_FDB_TO_VPORT_REG_C_1; + + if (enable) + curr |= wanted; + else + curr &= ~wanted; + + MLX5_SET(modify_esw_vport_context_in, min, + esw_vport_context.fdb_to_vport_reg_c_id, curr); + MLX5_SET(modify_esw_vport_context_in, min, + field_select.fdb_to_vport_reg_c_id, 1); + + err = mlx5_eswitch_modify_esw_vport_context(esw->dev, 0, false, min); + if (!err) { + if (enable && (curr & MLX5_FDB_TO_VPORT_REG_C_1)) + esw->flags |= MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED; + else + esw->flags &= ~MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED; + } + + return err; +} + +static void peer_miss_rules_setup(struct mlx5_eswitch *esw, + struct 
mlx5_core_dev *peer_dev, + struct mlx5_flow_spec *spec, + struct mlx5_flow_destination *dest) +{ + void *misc; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + } else { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + MLX5_SET(fte_match_set_misc, misc, source_eswitch_owner_vhca_id, + MLX5_CAP_GEN(peer_dev, vhca_id)); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, + source_eswitch_owner_vhca_id); + } + + dest->type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest->vport.num = peer_dev->priv.eswitch->manager_vport; + dest->vport.vhca_id = MLX5_CAP_GEN(peer_dev, vhca_id); + dest->vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; +} + +static void esw_set_peer_miss_rule_source_port(struct mlx5_eswitch *esw, + struct mlx5_eswitch *peer_esw, + struct mlx5_flow_spec *spec, + u16 vport) +{ + void *misc; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(peer_esw, + vport)); + } else { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, vport); + } +} + +static int esw_add_fdb_peer_miss_rules(struct mlx5_eswitch *esw, + struct mlx5_core_dev *peer_dev) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_handle **flows; + /* total vports is the same for both e-switches */ + int nvports = esw->total_vports; + struct mlx5_flow_handle *flow; + struct mlx5_flow_spec *spec; + struct mlx5_vport *vport; + unsigned long i; + void *misc; + int err; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + peer_miss_rules_setup(esw, peer_dev, spec, &dest); + + flows = kvcalloc(nvports, sizeof(*flows), GFP_KERNEL); + if (!flows) { + err = -ENOMEM; + goto alloc_flows_err; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + if (mlx5_core_is_ecpf_esw_manager(esw->dev) && + mlx5_esw_host_functions_enabled(esw->dev) && + mlx5_esw_host_functions_enabled(peer_dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); + esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, + spec, MLX5_VPORT_PF); + + flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto add_pf_flow_err; + } + flows[vport->index] = flow; + } + + if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_ECPF); + flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto add_ecpf_flow_err; + } + flows[vport->index] = flow; + } + + if (mlx5_esw_host_functions_enabled(esw->dev) && + mlx5_esw_host_functions_enabled(peer_dev)) { + mlx5_esw_for_each_vf_vport(esw, i, vport, 
mlx5_core_max_vfs(esw->dev)) { + esw_set_peer_miss_rule_source_port(esw, peer_dev->priv.eswitch, + spec, vport->vport); + + flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto add_vf_flow_err; + } + flows[vport->index] = flow; + } + } + + esw->fdb_table.offloads.peer_miss_rules = flows; + + kvfree(spec); + return 0; + +add_vf_flow_err: + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { + if (!flows[vport->index]) + continue; + mlx5_del_flow_rules(flows[vport->index]); + } + if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[vport->index]); + } +add_ecpf_flow_err: + if (mlx5_core_is_ecpf_esw_manager(esw->dev) && + mlx5_esw_host_functions_enabled(esw->dev) && + mlx5_esw_host_functions_enabled(peer_dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); + mlx5_del_flow_rules(flows[vport->index]); + } +add_pf_flow_err: + esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err); + kvfree(flows); +alloc_flows_err: + kvfree(spec); + return err; +} + +static void esw_del_fdb_peer_miss_rules(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_handle **flows; + struct mlx5_vport *vport; + unsigned long i; + + flows = esw->fdb_table.offloads.peer_miss_rules; + + mlx5_esw_for_each_vf_vport(esw, i, vport, mlx5_core_max_vfs(esw->dev)) { + if (!flows[vport->index]) + continue; + mlx5_del_flow_rules(flows[vport->index]); + } + + if (mlx5_ecpf_vport_exists(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + mlx5_del_flow_rules(flows[vport->index]); + } + + if (mlx5_core_is_ecpf_esw_manager(esw->dev) && + mlx5_esw_host_functions_enabled(esw->dev)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_PF); + if (flows[vport->index]) + mlx5_del_flow_rules(flows[vport->index]); + } + kvfree(flows); +} + +static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_handle *flow_rule = NULL; + struct mlx5_flow_spec *spec; + void *headers_c; + void *headers_v; + int err = 0; + u8 *dmac_c; + u8 *dmac_v; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + err = -ENOMEM; + goto out; + } + + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + outer_headers); + dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, + outer_headers.dmac_47_16); + dmac_c[0] = 0x01; + + if (mlx5e_esw_offloads_pet_enabled(esw)) { + esw->offloads.pet_info.enabled = true; + esw->offloads.pet_info.ether_type = + htons(MLX5_CAP_GEN(esw->dev, mlnx_tag_ethertype)); + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport.num = esw->manager_vport; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + esw_warn(esw->dev, "FDB: Failed to add unicast miss flow rule err %d\n", err); + goto out; + } + + esw->fdb_table.offloads.miss_rule_uni = flow_rule; + + headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + outer_headers); + dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, + outer_headers.dmac_47_16); + dmac_v[0] = 0x01; + flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + spec, &flow_act, &dest, 1); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + 
esw_warn(esw->dev, "FDB: Failed to add multicast miss flow rule err %d\n", err); + mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni); + goto out; + } + + esw->fdb_table.offloads.miss_rule_multi = flow_rule; + +out: + kvfree(spec); + return err; +} + +struct mlx5_flow_handle * +esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, }; + struct mlx5_flow_table *ft = esw->offloads.ft_offloads_restore; + struct mlx5_flow_context *flow_context; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_destination dest; + struct mlx5_flow_spec *spec; + void *misc; + + if (!mlx5_eswitch_reg_c1_loopback_supported(esw)) + return ERR_PTR(-EOPNOTSUPP); + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + ESW_REG_C0_USER_DATA_METADATA_MASK); + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, tag); + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + flow_act.modify_hdr = esw->offloads.restore_copy_hdr_id; + + flow_context = &spec->flow_context; + flow_context->flags |= FLOW_CONTEXT_HAS_TAG; + flow_context->flow_tag = tag; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = esw->offloads.ft_offloads; + + flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + kvfree(spec); + + if (IS_ERR(flow_rule)) + esw_warn(esw->dev, + "Failed to create restore rule for tag: %d, err(%d)\n", + tag, (int)PTR_ERR(flow_rule)); + + return flow_rule; +} + +#define MAX_PF_SQ 256 +#define MAX_SQ_NVPORTS 32 + +static void esw_set_flow_group_source_port(struct mlx5_eswitch *esw, + u32 *flow_group_in) +{ + void *match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, + match_criteria); + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + + MLX5_SET(fte_match_param, match_criteria, + misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + } else { + MLX5_SET(create_flow_group_in, flow_group_in, + match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS); + + MLX5_SET_TO_ONES(fte_match_param, match_criteria, + misc_parameters.source_port); + } +} + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) +static void esw_vport_tbl_put(struct mlx5_eswitch *esw) +{ + struct mlx5_vport_tbl_attr attr; + struct mlx5_vport *vport; + unsigned long i; + + attr.chain = 0; + attr.prio = 1; + mlx5_esw_for_each_vport(esw, i, vport) { + attr.vport = vport->vport; + attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; + mlx5_esw_vporttbl_put(esw, &attr); + } +} + +static int esw_vport_tbl_get(struct mlx5_eswitch *esw) +{ + struct mlx5_vport_tbl_attr attr; + struct mlx5_flow_table *fdb; + struct mlx5_vport *vport; + unsigned long i; + + attr.chain = 0; + attr.prio = 1; + mlx5_esw_for_each_vport(esw, i, vport) { + attr.vport = vport->vport; + attr.vport_ns = &mlx5_esw_vport_tbl_mirror_ns; + fdb = mlx5_esw_vporttbl_get(esw, &attr); + if (IS_ERR(fdb)) + goto out; + } + return 0; + +out: + esw_vport_tbl_put(esw); + return PTR_ERR(fdb); +} + +#define fdb_modify_header_fwd_to_table_supported(esw) \ + (MLX5_CAP_ESW_FLOWTABLE((esw)->dev, fdb_modify_header_fwd_to_table)) 
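+/* Descriptive comment (added by editor, derived from the function body below):
+ * select the fs_chains capability flags (ignore_flow_level, chains/prios,
+ * tunnel tables) that the device caps and the current encap mode allow.
+ */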
+static void esw_init_chains_offload_flags(struct mlx5_eswitch *esw, u32 *flags) +{ + struct mlx5_core_dev *dev = esw->dev; + + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, ignore_flow_level)) + *flags |= MLX5_CHAINS_IGNORE_FLOW_LEVEL_SUPPORTED; + + if (!MLX5_CAP_ESW_FLOWTABLE(dev, multi_fdb_encap) && + esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) { + *flags &= ~MLX5_CHAINS_AND_PRIOS_SUPPORTED; + esw_warn(dev, "Tc chains and priorities offload aren't supported, update firmware if needed\n"); + } else if (!mlx5_eswitch_reg_c1_loopback_enabled(esw)) { + *flags &= ~MLX5_CHAINS_AND_PRIOS_SUPPORTED; + esw_warn(dev, "Tc chains and priorities offload aren't supported\n"); + } else if (!fdb_modify_header_fwd_to_table_supported(esw)) { + /* Disabled when ttl workaround is needed, e.g + * when ESWITCH_IPV4_TTL_MODIFY_ENABLE = true in mlxconfig + */ + esw_warn(dev, + "Tc chains and priorities offload aren't supported, check firmware version, or mlxconfig settings\n"); + *flags &= ~MLX5_CHAINS_AND_PRIOS_SUPPORTED; + } else { + *flags |= MLX5_CHAINS_AND_PRIOS_SUPPORTED; + esw_info(dev, "Supported tc chains and prios offload\n"); + } + + if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) + *flags |= MLX5_CHAINS_FT_TUNNEL_SUPPORTED; +} + +static int +esw_chains_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *miss_fdb) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_table *nf_ft, *ft; + struct mlx5_chains_attr attr = {}; + struct mlx5_fs_chains *chains; + u32 fdb_max; + int err; + + fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size); + + esw_init_chains_offload_flags(esw, &attr.flags); + attr.ns = MLX5_FLOW_NAMESPACE_FDB; + attr.max_ft_sz = fdb_max; + attr.max_grp_num = esw->params.large_group_num; + attr.default_ft = miss_fdb; + attr.mapping = esw->offloads.reg_c0_obj_pool; + + chains = mlx5_chains_create(dev, &attr); + if (IS_ERR(chains)) { + err = PTR_ERR(chains); + esw_warn(dev, "Failed to create fdb chains err(%d)\n", err); + return err; + } + + esw->fdb_table.offloads.esw_chains_priv = chains; + + /* Create tc_end_ft which is the always created ft chain */ + nf_ft = mlx5_chains_get_table(chains, mlx5_chains_get_nf_ft_chain(chains), + 1, 0); + if (IS_ERR(nf_ft)) { + err = PTR_ERR(nf_ft); + goto nf_ft_err; + } + + /* Always open the root for fast path */ + ft = mlx5_chains_get_table(chains, 0, 1, 0); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto level_0_err; + } + + /* Open level 1 for split fdb rules now if prios isn't supported */ + if (!mlx5_chains_prios_supported(chains)) { + err = esw_vport_tbl_get(esw); + if (err) + goto level_1_err; + } + + mlx5_chains_set_end_ft(chains, nf_ft); + + return 0; + +level_1_err: + mlx5_chains_put_table(chains, 0, 1, 0); +level_0_err: + mlx5_chains_put_table(chains, mlx5_chains_get_nf_ft_chain(chains), 1, 0); +nf_ft_err: + mlx5_chains_destroy(chains); + esw->fdb_table.offloads.esw_chains_priv = NULL; + + return err; +} + +static void +esw_chains_destroy(struct mlx5_eswitch *esw, struct mlx5_fs_chains *chains) +{ + if (!mlx5_chains_prios_supported(chains)) + esw_vport_tbl_put(esw); + mlx5_chains_put_table(chains, 0, 1, 0); + mlx5_chains_put_table(chains, mlx5_chains_get_nf_ft_chain(chains), 1, 0); + mlx5_chains_destroy(chains); +} + +#else /* CONFIG_MLX5_CLS_ACT */ + +static int +esw_chains_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *miss_fdb) +{ return 0; } + +static void +esw_chains_destroy(struct mlx5_eswitch *esw, struct mlx5_fs_chains *chains) +{} + +#endif + +static int 
+esw_create_send_to_vport_group(struct mlx5_eswitch *esw, + struct mlx5_flow_table *fdb, + u32 *flow_group_in, + int *ix) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + void *match_criteria; + int count, err = 0; + + memset(flow_group_in, 0, inlen); + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_sqn); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_port); + if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) { + MLX5_SET_TO_ONES(fte_match_param, match_criteria, + misc_parameters.source_eswitch_owner_vhca_id); + MLX5_SET(create_flow_group_in, flow_group_in, + source_eswitch_owner_vhca_id_valid, 1); + } + + /* See comment at table_size calculation */ + count = MLX5_MAX_PORTS * (esw->total_vports * MAX_SQ_NVPORTS + MAX_PF_SQ); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, *ix + count - 1); + *ix += count; + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to create send-to-vport flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.offloads.send_to_vport_grp = g; + +out: + return err; +} + +static int +esw_create_meta_send_to_vport_group(struct mlx5_eswitch *esw, + struct mlx5_flow_table *fdb, + u32 *flow_group_in, + int *ix) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + void *match_criteria; + int err = 0; + + if (!esw_src_port_rewrite_supported(esw)) + return 0; + + memset(flow_group_in, 0, inlen); + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + + MLX5_SET(fte_match_param, match_criteria, + misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + MLX5_SET(fte_match_param, match_criteria, + misc_parameters_2.metadata_reg_c_1, ESW_TUN_MASK); + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, *ix); + MLX5_SET(create_flow_group_in, flow_group_in, + end_flow_index, *ix + esw->total_vports - 1); + *ix += esw->total_vports; + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, + "Failed to create send-to-vport meta flow group err(%d)\n", err); + goto send_vport_meta_err; + } + esw->fdb_table.offloads.send_to_vport_meta_grp = g; + + return 0; + +send_vport_meta_err: + return err; +} + +static int +esw_create_peer_esw_miss_group(struct mlx5_eswitch *esw, + struct mlx5_flow_table *fdb, + u32 *flow_group_in, + int *ix) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + void *match_criteria; + int err = 0; + + if (!MLX5_CAP_ESW(esw->dev, merged_eswitch)) + return 0; + + memset(flow_group_in, 0, inlen); + + esw_set_flow_group_source_port(esw, flow_group_in); + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) { + match_criteria = MLX5_ADDR_OF(create_flow_group_in, + flow_group_in, + match_criteria); + + MLX5_SET_TO_ONES(fte_match_param, match_criteria, + misc_parameters.source_eswitch_owner_vhca_id); + + MLX5_SET(create_flow_group_in, flow_group_in, + source_eswitch_owner_vhca_id_valid, 1); + } + + MLX5_SET(create_flow_group_in, 
flow_group_in, start_flow_index, *ix); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + *ix + esw->total_vports - 1); + *ix += esw->total_vports; + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to create peer miss flow group err(%d)\n", err); + goto out; + } + esw->fdb_table.offloads.peer_miss_grp = g; + +out: + return err; +} + +static int +esw_create_miss_group(struct mlx5_eswitch *esw, + struct mlx5_flow_table *fdb, + u32 *flow_group_in, + int *ix) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + void *match_criteria; + int err = 0; + u8 *dmac; + + memset(flow_group_in, 0, inlen); + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_OUTER_HEADERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + dmac = MLX5_ADDR_OF(fte_match_param, match_criteria, + outer_headers.dmac_47_16); + dmac[0] = 0x01; + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, *ix); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + *ix + MLX5_ESW_MISS_FLOWS); + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(esw->dev, "Failed to create miss flow group err(%d)\n", err); + goto miss_err; + } + esw->fdb_table.offloads.miss_grp = g; + + err = esw_add_fdb_miss_rule(esw); + if (err) + goto miss_rule_err; + + return 0; + +miss_rule_err: + mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp); +miss_err: + return err; +} + +static int esw_create_miss_meter_fdb_tables(struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + int num_vfs, table_size, err = 0; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *fdb = NULL; + struct mlx5_flow_group *g; + void *match_criteria; + void *misc2; + u32 *flow_group_in; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + num_vfs = mlx5_core_max_vfs(esw->dev); + root_ns = esw->fdb_table.offloads.ns; + + /* Create miss meter table and group */ + table_size = num_vfs + 1; + if (mlx5_core_is_ecpf(dev)) + table_size++; + + table_size += mlx5_sf_max_functions(esw->dev); + + ft_attr.max_fte = table_size; + ft_attr.prio = FDB_MISS_METER; + + fdb = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create miss meter FDB Table err %d\n", err); + goto meter_fdb_err; + } + + esw->fdb_table.offloads.miss_meter_fdb = fdb; + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria); + esw_set_flow_group_source_port(esw, flow_group_in); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 1); + + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create miss meter flow group err(%d)\n", err); + goto meter_g_err; + } + esw->fdb_table.offloads.miss_meter_grp = g; + + /* Create post meter table and group - we only + * need 1 rule per rep to match on red color since + * green will continue to slow_fdb via miss on this + * table. 
+ */ + + ft_attr.level = 1; + + fdb = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create post miss meter FDB Table err %d\n", err); + goto post_meter_fdb_err; + } + esw->fdb_table.offloads.post_miss_meter_fdb = fdb; + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_GET(create_flow_group_in, flow_group_in, + match_criteria_enable) | + MLX5_MATCH_MISC_PARAMETERS_2); + misc2 = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_5, 0x3); + + /* Use the already masked source vport and add the + * meter color to the match criteria. + */ + g = mlx5_create_flow_group(fdb, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create post miss meter flow group err(%d)\n", err); + goto post_meter_g_err; + } + esw->fdb_table.offloads.post_miss_meter_grp = g; + + kvfree(flow_group_in); + return 0; + +post_meter_g_err: + mlx5_destroy_flow_table(esw->fdb_table.offloads.post_miss_meter_fdb); + +post_meter_fdb_err: + mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_meter_grp); + +meter_g_err: + mlx5_destroy_flow_table(esw->fdb_table.offloads.miss_meter_fdb); + +meter_fdb_err: + kvfree(flow_group_in); + + return err; +} + +static void esw_destroy_miss_meter_fdb_tables(struct mlx5_eswitch *esw) +{ + mlx5_destroy_flow_group(esw->fdb_table.offloads.post_miss_meter_grp); + mlx5_destroy_flow_table(esw->fdb_table.offloads.post_miss_meter_fdb); + + mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_meter_grp); + mlx5_destroy_flow_table(esw->fdb_table.offloads.miss_meter_fdb); +} + +static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *root_ns; + struct mlx5_flow_table *fdb = NULL; + int table_size, ix = 0, err = 0; + struct mlx5_flow_table *miss_fdb; + u32 flags = 0, *flow_group_in; + bool miss_meter_supp; + + esw_debug(esw->dev, "Create offloads FDB Tables\n"); + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + err = -EOPNOTSUPP; + goto ns_err; + } + esw->fdb_table.offloads.ns = root_ns; + err = mlx5_flow_namespace_set_mode(root_ns, + esw->dev->priv.steering->mode); + if (err) { + esw_warn(dev, "Failed to set FDB namespace steering mode\n"); + goto ns_err; + } + + /* To be strictly correct: + * MLX5_MAX_PORTS * (esw->total_vports * MAX_SQ_NVPORTS + MAX_PF_SQ) + * should be: + * esw->total_vports * MAX_SQ_NVPORTS + MAX_PF_SQ + + * peer_esw->total_vports * MAX_SQ_NVPORTS + MAX_PF_SQ + * but as the peer device might not be in switchdev mode it's not + * possible. We use the fact that by default FW sets max vfs and max sfs + * to the same value on both devices. If it needs to be changed in the future note + * the peer miss group should also be created based on the number of + * total vports of the peer (currently is also uses esw->total_vports). + */ + table_size = 2 * esw->total_vports * MAX_SQ_NVPORTS + + 2 * MLX5E_MAX_NUM_TC * MAX_PF_SQ + + MLX5_ESW_MISS_FLOWS + esw->total_vports * 2; + + /* create the slow path fdb with encap set, so further table instances + * can be created at run time while VFs are probed if the FW allows that. 
+ */ + if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) + flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); + + ft_attr.flags = flags; + ft_attr.max_fte = table_size; + ft_attr.prio = FDB_SLOW_PATH; + + fdb = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(fdb)) { + err = PTR_ERR(fdb); + esw_warn(dev, "Failed to create slow path FDB Table err %d\n", err); + goto slow_fdb_err; + } + esw->fdb_table.offloads.slow_fdb = fdb; + miss_fdb = esw->fdb_table.offloads.slow_fdb; + + miss_meter_supp = !!(MLX5_CAP_GEN_64(esw->dev, general_obj_types) & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO); + if (miss_meter_supp) { + err = esw_create_miss_meter_fdb_tables(esw); + if (err) { + esw_warn(dev, "Failed to open miss meter fdb err(%d)\n", err); + goto miss_meter_fdb_err; + } + + miss_fdb = esw->fdb_table.offloads.miss_meter_fdb; + } + + /* Create empty TC-miss managed table. This allows plugging in following + * priorities without directly exposing their level 0 table to + * eswitch_offloads and passing it as miss_fdb to following call to + * esw_chains_create(). + */ + memset(&ft_attr, 0, sizeof(ft_attr)); + ft_attr.prio = FDB_TC_MISS; + esw->fdb_table.offloads.tc_miss_table = mlx5_create_flow_table(root_ns, &ft_attr); + if (IS_ERR(esw->fdb_table.offloads.tc_miss_table)) { + err = PTR_ERR(esw->fdb_table.offloads.tc_miss_table); + esw_warn(dev, "Failed to create TC miss FDB Table err %d\n", err); + goto tc_miss_table_err; + } + + err = esw_chains_create(esw, esw->fdb_table.offloads.tc_miss_table); + if (err) { + esw_warn(dev, "Failed to open fdb chains err(%d)\n", err); + goto fdb_chains_err; + } + + err = mlx5_esw_ipsec_create(esw); + if (err) { + esw_warn(esw->dev, "Failed to create IPsec offloads FDB Tables err %d\n", err); + goto fdb_ipsec_rx_err; + } + + err = esw_create_send_to_vport_group(esw, fdb, flow_group_in, &ix); + if (err) + goto send_vport_err; + + err = esw_create_meta_send_to_vport_group(esw, fdb, flow_group_in, &ix); + if (err) + goto send_vport_meta_err; + + err = esw_create_peer_esw_miss_group(esw, fdb, flow_group_in, &ix); + if (err) + goto peer_miss_err; + + err = esw_create_miss_group(esw, fdb, flow_group_in, &ix); + if (err) + goto miss_err; + + kvfree(flow_group_in); + return 0; + +miss_err: + if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + mlx5_destroy_flow_group(esw->fdb_table.offloads.peer_miss_grp); +peer_miss_err: + if (esw->fdb_table.offloads.send_to_vport_meta_grp) + mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_meta_grp); +send_vport_meta_err: + mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp); +send_vport_err: + mlx5_esw_ipsec_destroy(esw); +fdb_ipsec_rx_err: + esw_chains_destroy(esw, esw_chains(esw)); +fdb_chains_err: + mlx5_destroy_flow_table(esw->fdb_table.offloads.tc_miss_table); +tc_miss_table_err: + if (miss_meter_supp) + esw_destroy_miss_meter_fdb_tables(esw); +miss_meter_fdb_err: + mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb); +slow_fdb_err: + /* Holds true only as long as DMFS is the default */ + mlx5_flow_namespace_set_mode(root_ns, MLX5_FLOW_STEERING_MODE_DMFS); +ns_err: + kvfree(flow_group_in); + return err; +} + +static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw) +{ + if (!esw->fdb_table.offloads.slow_fdb) + return; + + esw_debug(esw->dev, "Destroy offloads FDB Tables\n"); + mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_multi); + mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni); + 
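+	/* Drop the flow groups before destroying the tables that contain them */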
mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp); + if (esw->fdb_table.offloads.send_to_vport_meta_grp) + mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_meta_grp); + if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + mlx5_destroy_flow_group(esw->fdb_table.offloads.peer_miss_grp); + mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp); + + mlx5_esw_ipsec_destroy(esw); + esw_chains_destroy(esw, esw_chains(esw)); + + if (MLX5_CAP_GEN_64(esw->dev, general_obj_types) & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO) + esw_destroy_miss_meter_fdb_tables(esw); + + mlx5_destroy_flow_table(esw->fdb_table.offloads.tc_miss_table); + mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb); + /* Holds true only as long as DMFS is the default */ + mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns, + MLX5_FLOW_STEERING_MODE_DMFS); + atomic64_set(&esw->user_count, 0); +} + +static int esw_get_offloads_ft_size(struct mlx5_eswitch *esw) +{ + int nvports; + + nvports = esw->total_vports + MLX5_ESW_MISS_FLOWS; + if (mlx5e_tc_int_port_supported(esw)) + nvports += MLX5E_TC_MAX_INT_PORT_NUM; + + return nvports; +} + +static int esw_create_offloads_table(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_table *ft_offloads; + struct mlx5_flow_namespace *ns; + int err = 0; +#define ESW_NIC_RX_DROP_RULE (1) + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_OFFLOADS); + if (!ns) { + esw_warn(esw->dev, "Failed to get offloads flow namespace\n"); + return -EOPNOTSUPP; + } + + ft_attr.max_fte = esw_get_offloads_ft_size(esw) + ESW_NIC_RX_DROP_RULE; + ft_attr.prio = 1; + + ft_offloads = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft_offloads)) { + err = PTR_ERR(ft_offloads); + esw_warn(esw->dev, "Failed to create offloads table, err %d\n", err); + return err; + } + + esw->offloads.ft_offloads = ft_offloads; + return 0; +} + +static void esw_destroy_offloads_table(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_offload *offloads = &esw->offloads; + + mlx5_destroy_flow_table(offloads->ft_offloads); +} + +static int esw_create_vport_rx_group(struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + u32 *flow_group_in; + int nvports; + int err = 0; + + nvports = esw_get_offloads_ft_size(esw); + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + /* create vport rx group */ + esw_set_flow_group_source_port(esw, flow_group_in); + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, nvports - 1); + + g = mlx5_create_flow_group(esw->offloads.ft_offloads, flow_group_in); + + if (IS_ERR(g)) { + err = PTR_ERR(g); + mlx5_core_warn(esw->dev, "Failed to create vport rx group err %d\n", err); + goto out; + } + + + esw->offloads.vport_rx_group = g; + +out: + kvfree(flow_group_in); + return err; +} + +static void esw_destroy_vport_rx_group(struct mlx5_eswitch *esw) +{ + mlx5_destroy_flow_group(esw->offloads.vport_rx_group); +} + +static int esw_create_vport_rx_drop_group(struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *g; + u32 *flow_group_in; + int nvports; + int err = 0; + + nvports = esw_get_offloads_ft_size(esw); + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 
nvports); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, nvports + 1); + + g = mlx5_create_flow_group(esw->offloads.ft_offloads, flow_group_in); + + if (IS_ERR(g)) { + err = PTR_ERR(g); + mlx5_core_warn(esw->dev, "Failed to create vport rx drop group err %d\n", err); + goto out; + } + + esw->offloads.vport_rx_drop_group = g; +out: + kvfree(flow_group_in); + return err; +} + +static void esw_destroy_vport_rx_drop_group(struct mlx5_eswitch *esw) +{ + if (esw->offloads.vport_rx_drop_group) + mlx5_destroy_flow_group(esw->offloads.vport_rx_drop_group); +} + +struct mlx5_flow_handle * +mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, u16 vport, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_spec *spec; + void *misc; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + flow_rule = ERR_PTR(-ENOMEM); + goto out; + } + + if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw, vport)); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + } else { + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, vport); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + } + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec, + &flow_act, dest, 1); + if (IS_ERR(flow_rule)) { + esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule)); + goto out; + } + +out: + kvfree(spec); + return flow_rule; +} + +static int mlx5_eswitch_create_vport_rx_drop_rule(struct mlx5_eswitch *esw) +{ + struct mlx5_flow_act flow_act = {0}; + struct mlx5_flow_handle *flow_rule; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, NULL, + &flow_act, NULL, 0); + if (IS_ERR(flow_rule)) { + esw_warn(esw->dev, + "fs offloads: Failed to add vport rx drop rule err %ld\n", + PTR_ERR(flow_rule)); + return PTR_ERR(flow_rule); + } + + esw->offloads.vport_rx_drop_rule = flow_rule; + + return 0; +} + +static void mlx5_eswitch_destroy_vport_rx_drop_rule(struct mlx5_eswitch *esw) +{ + if (esw->offloads.vport_rx_drop_rule) + mlx5_del_flow_rules(esw->offloads.vport_rx_drop_rule); +} + +static int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, u8 *mode) +{ + u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_vport *vport; + unsigned long i; + + if (!MLX5_CAP_GEN(dev, vport_group_manager)) + return -EOPNOTSUPP; + + if (!mlx5_sriov_is_enabled(esw->dev) && !is_mdev_switchdev_mode(esw->dev)) + return -EOPNOTSUPP; + + switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) { + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + mlx5_mode = MLX5_INLINE_MODE_NONE; + goto out; + case MLX5_CAP_INLINE_MODE_L2: + mlx5_mode = MLX5_INLINE_MODE_L2; + goto out; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + goto query_vports; + } + +query_vports: + 
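+	/* All host function vports must report the same min inline mode */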
mlx5_query_nic_vport_min_inline(dev, esw->first_host_vport, &prev_mlx5_mode); + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + mlx5_query_nic_vport_min_inline(dev, vport->vport, &mlx5_mode); + if (prev_mlx5_mode != mlx5_mode) + return -EINVAL; + prev_mlx5_mode = mlx5_mode; + } + +out: + *mode = mlx5_mode; + return 0; +} + +static void esw_destroy_restore_table(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_offload *offloads = &esw->offloads; + + if (!mlx5_eswitch_reg_c1_loopback_supported(esw)) + return; + + mlx5_modify_header_dealloc(esw->dev, offloads->restore_copy_hdr_id); + mlx5_destroy_flow_group(offloads->restore_group); + mlx5_destroy_flow_table(offloads->ft_offloads_restore); +} + +static int esw_create_restore_table(struct mlx5_eswitch *esw) +{ + u8 modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *ns; + struct mlx5_modify_hdr *mod_hdr; + void *match_criteria, *misc; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + u32 *flow_group_in; + int err = 0; + + if (!mlx5_eswitch_reg_c1_loopback_supported(esw)) + return 0; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_OFFLOADS); + if (!ns) { + esw_warn(esw->dev, "Failed to get offloads flow namespace\n"); + return -EOPNOTSUPP; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + err = -ENOMEM; + goto out_free; + } + + ft_attr.max_fte = 1 << ESW_REG_C0_USER_DATA_METADATA_BITS; + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + esw_warn(esw->dev, "Failed to create restore table, err %d\n", + err); + goto out_free; + } + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + misc = MLX5_ADDR_OF(fte_match_param, match_criteria, + misc_parameters_2); + + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + ESW_REG_C0_USER_DATA_METADATA_MASK); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + ft_attr.max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + g = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create restore flow group, err: %d\n", + err); + goto err_group; + } + + MLX5_SET(copy_action_in, modact, action_type, MLX5_ACTION_TYPE_COPY); + MLX5_SET(copy_action_in, modact, src_field, + MLX5_ACTION_IN_FIELD_METADATA_REG_C_1); + MLX5_SET(copy_action_in, modact, dst_field, + MLX5_ACTION_IN_FIELD_METADATA_REG_B); + mod_hdr = mlx5_modify_header_alloc(esw->dev, + MLX5_FLOW_NAMESPACE_KERNEL, 1, + modact); + if (IS_ERR(mod_hdr)) { + err = PTR_ERR(mod_hdr); + esw_warn(dev, "Failed to create restore mod header, err: %d\n", + err); + goto err_mod_hdr; + } + + esw->offloads.ft_offloads_restore = ft; + esw->offloads.restore_group = g; + esw->offloads.restore_copy_hdr_id = mod_hdr; + + kvfree(flow_group_in); + + return 0; + +err_mod_hdr: + mlx5_destroy_flow_group(g); +err_group: + mlx5_destroy_flow_table(ft); +out_free: + kvfree(flow_group_in); + + return err; +} + +static int esw_offloads_start(struct mlx5_eswitch *esw, + struct netlink_ext_ack *extack) +{ + int err; + + esw->mode = MLX5_ESWITCH_OFFLOADS; + err = mlx5_eswitch_enable_locked(esw, esw->dev->priv.sriov.num_vfs); + if (err) { + NL_SET_ERR_MSG_MOD(extack, 
"Failed setting eswitch to offloads"); + esw->mode = MLX5_ESWITCH_LEGACY; + esw->fdb_table.flags &= ~MLX5_ESW_FDB_CREATED; + mlx5_rescan_drivers(esw->dev); + } + if (esw->offloads.inline_mode == MLX5_INLINE_MODE_NONE) { + if (mlx5_eswitch_inline_mode_get(esw, + &esw->offloads.inline_mode)) { + esw->offloads.inline_mode = MLX5_INLINE_MODE_L2; + NL_SET_ERR_MSG_MOD(extack, + "Inline mode is different between vports"); + } + } + + return err; +} + +static void mlx5_esw_offloads_rep_mark_set(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, + xa_mark_t mark) +{ + bool mark_set; + + /* Copy the mark from vport to its rep */ + mark_set = xa_get_mark(&esw->vports, rep->vport, mark); + if (mark_set) + xa_set_mark(&esw->offloads.vport_reps, rep->vport, mark); +} + +static int mlx5_esw_offloads_rep_init(struct mlx5_eswitch *esw, const struct mlx5_vport *vport) +{ + struct mlx5_eswitch_rep *rep; + int rep_type; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + rep->vport = vport->vport; + rep->vport_index = vport->index; + for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) + atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED); + + err = xa_insert(&esw->offloads.vport_reps, rep->vport, rep, GFP_KERNEL); + if (err) + goto insert_err; + + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_HOST_FN); + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_VF); + mlx5_esw_offloads_rep_mark_set(esw, rep, MLX5_ESW_VPT_SF); + return 0; + +insert_err: + kfree(rep); + return err; +} + +static void mlx5_esw_offloads_rep_cleanup(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep) +{ + xa_erase(&esw->offloads.vport_reps, rep->vport); + kfree(rep); +} + +void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + + mlx5_esw_for_each_rep(esw, i, rep) + mlx5_esw_offloads_rep_cleanup(esw, rep); + xa_destroy(&esw->offloads.vport_reps); +} + +int esw_offloads_init_reps(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + xa_init(&esw->offloads.vport_reps); + + mlx5_esw_for_each_vport(esw, i, vport) { + err = mlx5_esw_offloads_rep_init(esw, vport); + if (err) + goto err; + } + return 0; + +err: + esw_offloads_cleanup_reps(esw); + return err; +} + +static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw, + struct mlx5_eswitch_rep *rep, u8 rep_type) +{ + if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED) { + atomic_set(&rep->rep_data[rep_type].state, REP_REGISTERED); + esw->offloads.rep_ops[rep_type]->unload(rep); + } +} + +static void __unload_reps_sf_vport(struct mlx5_eswitch *esw, u8 rep_type) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + + mlx5_esw_for_each_sf_rep(esw, i, rep) + __esw_offloads_unload_rep(esw, rep, rep_type); +} + +static void __unload_reps_all_vport(struct mlx5_eswitch *esw, u8 rep_type) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + + __unload_reps_sf_vport(esw, rep_type); + + mlx5_esw_for_each_vf_rep(esw, i, rep) + __esw_offloads_unload_rep(esw, rep, rep_type); + + if (mlx5_ecpf_vport_exists(esw->dev)) { + rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_ECPF); + __esw_offloads_unload_rep(esw, rep, rep_type); + } + + if (mlx5_core_is_ecpf_esw_manager(esw->dev) && + mlx5_esw_host_functions_enabled(esw->dev)) { + rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_PF); + __esw_offloads_unload_rep(esw, rep, rep_type); + } + + rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK); + down_write(&esw->mode_lock); + 
__esw_offloads_unload_rep(esw, rep, rep_type); + up_write(&esw->mode_lock); +} + +int mlx5_esw_offloads_rep_load(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_eswitch_rep *rep; + int rep_type; + int err; + + rep = mlx5_eswitch_get_rep(esw, vport_num); + for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) + if (atomic_read(&rep->rep_data[rep_type].state) == REP_REGISTERED) { + err = esw->offloads.rep_ops[rep_type]->load(esw->dev, rep); + if (err) + goto err_reps; + atomic_set(&rep->rep_data[rep_type].state, REP_LOADED); + } + + return 0; + +err_reps: + for (--rep_type; rep_type >= 0; rep_type--) + __esw_offloads_unload_rep(esw, rep, rep_type); + return err; +} + +void mlx5_esw_offloads_rep_unload(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_eswitch_rep *rep; + int rep_type; + + rep = mlx5_eswitch_get_rep(esw, vport_num); + for (rep_type = NUM_REP_TYPES - 1; rep_type >= 0; rep_type--) + __esw_offloads_unload_rep(esw, rep, rep_type); +} + +int esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num) +{ + int err; + + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return 0; + + if (vport_num != MLX5_VPORT_UPLINK) { + err = mlx5_esw_offloads_devlink_port_register(esw, vport_num); + if (err) + return err; + } + + err = mlx5_esw_offloads_rep_load(esw, vport_num); + if (err) + goto load_err; + return err; + +load_err: + if (vport_num != MLX5_VPORT_UPLINK) + mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); + return err; +} + +void esw_offloads_unload_rep(struct mlx5_eswitch *esw, u16 vport_num) +{ + if (esw->mode != MLX5_ESWITCH_OFFLOADS) + return; + + mlx5_esw_offloads_rep_unload(esw, vport_num); + + if (vport_num != MLX5_VPORT_UPLINK) + mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); +} + +static int esw_set_slave_root_fdb(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave) +{ + u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {}; + u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {}; + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_namespace *ns; + int err; + + MLX5_SET(set_flow_table_root_in, in, opcode, + MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); + MLX5_SET(set_flow_table_root_in, in, table_type, + FS_FT_FDB); + + if (master) { + ns = mlx5_get_flow_namespace(master, + MLX5_FLOW_NAMESPACE_FDB); + root = find_root(&ns->node); + mutex_lock(&root->chain_lock); + MLX5_SET(set_flow_table_root_in, in, + table_eswitch_owner_vhca_id_valid, 1); + MLX5_SET(set_flow_table_root_in, in, + table_eswitch_owner_vhca_id, + MLX5_CAP_GEN(master, vhca_id)); + MLX5_SET(set_flow_table_root_in, in, table_id, + root->root_ft->id); + } else { + ns = mlx5_get_flow_namespace(slave, + MLX5_FLOW_NAMESPACE_FDB); + root = find_root(&ns->node); + mutex_lock(&root->chain_lock); + MLX5_SET(set_flow_table_root_in, in, table_id, + root->root_ft->id); + } + + err = mlx5_cmd_exec(slave, in, sizeof(in), out, sizeof(out)); + mutex_unlock(&root->chain_lock); + + return err; +} + +static int __esw_set_master_egress_rule(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave, + struct mlx5_vport *vport, + struct mlx5_flow_table *acl) +{ + struct mlx5_flow_handle *flow_rule = NULL; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + int err = 0; + void *misc; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + MLX5_SET(fte_match_set_misc, misc, 
source_port, MLX5_VPORT_UPLINK); + MLX5_SET(fte_match_set_misc, misc, source_eswitch_owner_vhca_id, + MLX5_CAP_GEN(slave, vhca_id)); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, + source_eswitch_owner_vhca_id); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest.vport.num = slave->priv.eswitch->manager_vport; + dest.vport.vhca_id = MLX5_CAP_GEN(slave, vhca_id); + dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; + + flow_rule = mlx5_add_flow_rules(acl, spec, &flow_act, + &dest, 1); + if (IS_ERR(flow_rule)) + err = PTR_ERR(flow_rule); + else + vport->egress.offloads.bounce_rule = flow_rule; + + kvfree(spec); + return err; +} + +static int esw_set_master_egress_rule(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_eswitch *esw = master->priv.eswitch; + struct mlx5_flow_table_attr ft_attr = { + .max_fte = 1, .prio = 0, .level = 0, + .flags = MLX5_FLOW_TABLE_OTHER_VPORT, + }; + struct mlx5_flow_namespace *egress_ns; + struct mlx5_flow_table *acl; + struct mlx5_flow_group *g; + struct mlx5_vport *vport; + void *match_criteria; + u32 *flow_group_in; + int err; + + vport = mlx5_eswitch_get_vport(esw, esw->manager_vport); + if (IS_ERR(vport)) + return PTR_ERR(vport); + + egress_ns = mlx5_get_flow_vport_acl_namespace(master, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, + vport->index); + if (!egress_ns) + return -EINVAL; + + if (vport->egress.acl) + return -EINVAL; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + acl = mlx5_create_vport_flow_table(egress_ns, &ft_attr, vport->vport); + if (IS_ERR(acl)) { + err = PTR_ERR(acl); + goto out; + } + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, + misc_parameters.source_port); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, + misc_parameters.source_eswitch_owner_vhca_id); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS); + + MLX5_SET(create_flow_group_in, flow_group_in, + source_eswitch_owner_vhca_id_valid, 1); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0); + + g = mlx5_create_flow_group(acl, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + goto err_group; + } + + err = __esw_set_master_egress_rule(master, slave, vport, acl); + if (err) + goto err_rule; + + vport->egress.acl = acl; + vport->egress.offloads.bounce_grp = g; + + kvfree(flow_group_in); + + return 0; + +err_rule: + mlx5_destroy_flow_group(g); +err_group: + mlx5_destroy_flow_table(acl); +out: + kvfree(flow_group_in); + return err; +} + +static void esw_unset_master_egress_rule(struct mlx5_core_dev *dev) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(dev->priv.eswitch, + dev->priv.eswitch->manager_vport); + + esw_acl_egress_ofld_cleanup(vport); +} + +int mlx5_eswitch_offloads_config_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw) +{ + int err; + + err = esw_set_slave_root_fdb(master_esw->dev, + slave_esw->dev); + if (err) + return err; + + err = esw_set_master_egress_rule(master_esw->dev, + slave_esw->dev); + if (err) + goto err_acl; + + return err; + +err_acl: + esw_set_slave_root_fdb(NULL, 
slave_esw->dev); + + return err; +} + +void mlx5_eswitch_offloads_destroy_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw) +{ + esw_unset_master_egress_rule(master_esw->dev); + esw_set_slave_root_fdb(NULL, slave_esw->dev); +} + +#define ESW_OFFLOADS_DEVCOM_PAIR (0) +#define ESW_OFFLOADS_DEVCOM_UNPAIR (1) + +static void mlx5_esw_offloads_rep_event_unpair(struct mlx5_eswitch *esw) +{ + const struct mlx5_eswitch_rep_ops *ops; + struct mlx5_eswitch_rep *rep; + unsigned long i; + u8 rep_type; + + mlx5_esw_for_each_rep(esw, i, rep) { + rep_type = NUM_REP_TYPES; + while (rep_type--) { + ops = esw->offloads.rep_ops[rep_type]; + if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED && + ops->event) + ops->event(esw, rep, MLX5_SWITCHDEV_EVENT_UNPAIR, NULL); + } + } +} + +static void mlx5_esw_offloads_unpair(struct mlx5_eswitch *esw) +{ +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + mlx5e_tc_clean_fdb_peer_flows(esw); +#endif + mlx5_esw_offloads_rep_event_unpair(esw); + esw_del_fdb_peer_miss_rules(esw); +} + +static int mlx5_esw_offloads_pair(struct mlx5_eswitch *esw, + struct mlx5_eswitch *peer_esw) +{ + const struct mlx5_eswitch_rep_ops *ops; + struct mlx5_eswitch_rep *rep; + unsigned long i; + u8 rep_type; + int err; + + err = esw_add_fdb_peer_miss_rules(esw, peer_esw->dev); + if (err) + return err; + + mlx5_esw_for_each_rep(esw, i, rep) { + for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) { + ops = esw->offloads.rep_ops[rep_type]; + if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED && + ops->event) { + err = ops->event(esw, rep, MLX5_SWITCHDEV_EVENT_PAIR, peer_esw); + if (err) + goto err_out; + } + } + } + + return 0; + +err_out: + mlx5_esw_offloads_unpair(esw); + return err; +} + +static int mlx5_esw_offloads_set_ns_peer(struct mlx5_eswitch *esw, + struct mlx5_eswitch *peer_esw, + bool pair) +{ + struct mlx5_flow_root_namespace *peer_ns; + struct mlx5_flow_root_namespace *ns; + int err; + + peer_ns = peer_esw->dev->priv.steering->fdb_root_ns; + ns = esw->dev->priv.steering->fdb_root_ns; + + if (pair) { + err = mlx5_flow_namespace_set_peer(ns, peer_ns); + if (err) + return err; + + err = mlx5_flow_namespace_set_peer(peer_ns, ns); + if (err) { + mlx5_flow_namespace_set_peer(ns, NULL); + return err; + } + } else { + mlx5_flow_namespace_set_peer(ns, NULL); + mlx5_flow_namespace_set_peer(peer_ns, NULL); + } + + return 0; +} + +static int mlx5_esw_offloads_devcom_event(int event, + void *my_data, + void *event_data) +{ + struct mlx5_eswitch *esw = my_data; + struct mlx5_devcom *devcom = esw->dev->priv.devcom; + struct mlx5_eswitch *peer_esw = event_data; + int err; + + switch (event) { + case ESW_OFFLOADS_DEVCOM_PAIR: + if (mlx5_get_next_phys_dev_lag(esw->dev) != peer_esw->dev) + break; + + if (mlx5_eswitch_vport_match_metadata_enabled(esw) != + mlx5_eswitch_vport_match_metadata_enabled(peer_esw)) + break; + + err = mlx5_esw_offloads_set_ns_peer(esw, peer_esw, true); + if (err) + goto err_out; + err = mlx5_esw_offloads_pair(esw, peer_esw); + if (err) + goto err_peer; + + err = mlx5_esw_offloads_pair(peer_esw, esw); + if (err) + goto err_pair; + + mlx5_devcom_set_paired(devcom, MLX5_DEVCOM_ESW_OFFLOADS, true); + break; + + case ESW_OFFLOADS_DEVCOM_UNPAIR: + if (!mlx5_devcom_is_paired(devcom, MLX5_DEVCOM_ESW_OFFLOADS)) + break; + + mlx5_devcom_set_paired(devcom, MLX5_DEVCOM_ESW_OFFLOADS, false); + mlx5_esw_offloads_unpair(peer_esw); + mlx5_esw_offloads_unpair(esw); + mlx5_esw_offloads_set_ns_peer(esw, peer_esw, false); + break; + } + + return 0; + 
+err_pair: + mlx5_esw_offloads_unpair(esw); +err_peer: + mlx5_esw_offloads_set_ns_peer(esw, peer_esw, false); +err_out: + mlx5_core_err(esw->dev, "esw offloads devcom event failure, event %u err %d", + event, err); + return err; +} + +static void esw_offloads_devcom_init(struct mlx5_eswitch *esw) +{ + struct mlx5_devcom *devcom = esw->dev->priv.devcom; + + INIT_LIST_HEAD(&esw->offloads.peer_flows); + mutex_init(&esw->offloads.peer_mutex); + + if (!MLX5_CAP_ESW(esw->dev, merged_eswitch)) + return; + + if (!mlx5_lag_is_supported(esw->dev)) + return; + + mlx5_devcom_register_component(devcom, + MLX5_DEVCOM_ESW_OFFLOADS, + mlx5_esw_offloads_devcom_event, + esw); + + mlx5_devcom_send_event(devcom, + MLX5_DEVCOM_ESW_OFFLOADS, + ESW_OFFLOADS_DEVCOM_PAIR, esw); +} + +static void esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw) +{ + struct mlx5_devcom *devcom = esw->dev->priv.devcom; + + if (!MLX5_CAP_ESW(esw->dev, merged_eswitch)) + return; + + if (!mlx5_lag_is_supported(esw->dev)) + return; + + mlx5_devcom_send_event(devcom, MLX5_DEVCOM_ESW_OFFLOADS, + ESW_OFFLOADS_DEVCOM_UNPAIR, esw); + + mlx5_devcom_unregister_component(devcom, MLX5_DEVCOM_ESW_OFFLOADS); +} + +bool mlx5_esw_vport_match_metadata_supported(const struct mlx5_eswitch *esw) +{ + if (!MLX5_CAP_ESW(esw->dev, esw_uplink_ingress_acl)) + return false; + + if (!(MLX5_CAP_ESW_FLOWTABLE(esw->dev, fdb_to_vport_reg_c_id) & + MLX5_FDB_TO_VPORT_REG_C_0)) + return false; + + if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, flow_source)) + return false; + + return true; +} + +#define MLX5_ESW_METADATA_RSVD_UPLINK 1 + +/* Share the same metadata for uplink's. This is fine because: + * (a) In shared FDB mode (LAG) both uplink's are treated the + * same and tagged with the same metadata. + * (b) In non shared FDB mode, packets from physical port0 + * cannot hit eswitch of PF1 and vice versa. + */ +static u32 mlx5_esw_match_metadata_reserved(struct mlx5_eswitch *esw) +{ + return MLX5_ESW_METADATA_RSVD_UPLINK; +} + +u32 mlx5_esw_match_metadata_alloc(struct mlx5_eswitch *esw) +{ + u32 vport_end_ida = (1 << ESW_VPORT_BITS) - 1; + /* Reserve 0xf for internal port offload */ + u32 max_pf_num = (1 << ESW_PFNUM_BITS) - 2; + u32 pf_num; + int id; + + /* Only 4 bits of pf_num */ + pf_num = mlx5_get_dev_index(esw->dev); + if (pf_num > max_pf_num) + return 0; + + /* Metadata is 4 bits of PFNUM and 12 bits of unique id */ + /* Use only non-zero vport_id (2-4095) for all PF's */ + id = ida_alloc_range(&esw->offloads.vport_metadata_ida, + MLX5_ESW_METADATA_RSVD_UPLINK + 1, + vport_end_ida, GFP_KERNEL); + if (id < 0) + return 0; + id = (pf_num << ESW_VPORT_BITS) | id; + return id; +} + +void mlx5_esw_match_metadata_free(struct mlx5_eswitch *esw, u32 metadata) +{ + u32 vport_bit_mask = (1 << ESW_VPORT_BITS) - 1; + + /* Metadata contains only 12 bits of actual ida id */ + ida_free(&esw->offloads.vport_metadata_ida, metadata & vport_bit_mask); +} + +static int esw_offloads_vport_metadata_setup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (vport->vport == MLX5_VPORT_UPLINK) + vport->default_metadata = mlx5_esw_match_metadata_reserved(esw); + else + vport->default_metadata = mlx5_esw_match_metadata_alloc(esw); + + vport->metadata = vport->default_metadata; + return vport->metadata ? 
0 : -ENOSPC; +} + +static void esw_offloads_vport_metadata_cleanup(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + if (!vport->default_metadata) + return; + + if (vport->vport == MLX5_VPORT_UPLINK) + return; + + WARN_ON(vport->metadata != vport->default_metadata); + mlx5_esw_match_metadata_free(esw, vport->default_metadata); +} + +static void esw_offloads_metadata_uninit(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) + return; + + mlx5_esw_for_each_vport(esw, i, vport) + esw_offloads_vport_metadata_cleanup(esw, vport); +} + +static int esw_offloads_metadata_init(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + unsigned long i; + int err; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) + return 0; + + mlx5_esw_for_each_vport(esw, i, vport) { + err = esw_offloads_vport_metadata_setup(esw, vport); + if (err) + goto metadata_err; + } + + return 0; + +metadata_err: + esw_offloads_metadata_uninit(esw); + return err; +} + +int mlx5_esw_offloads_vport_metadata_set(struct mlx5_eswitch *esw, bool enable) +{ + int err = 0; + + down_write(&esw->mode_lock); + if (mlx5_sriov_is_enabled(esw->dev) || is_mdev_switchdev_mode(esw->dev)) { + err = -EBUSY; + goto done; + } + if (!mlx5_esw_vport_match_metadata_supported(esw)) { + err = -EOPNOTSUPP; + goto done; + } + if (enable) + esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; + else + esw->flags &= ~MLX5_ESWITCH_VPORT_MATCH_METADATA; +done: + up_write(&esw->mode_lock); + return err; +} + +int +esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + int err; + + err = esw_acl_ingress_ofld_setup(esw, vport); + if (err) + return err; + + err = esw_acl_egress_ofld_setup(esw, vport); + if (err) + goto egress_err; + + return 0; + +egress_err: + esw_acl_ingress_ofld_cleanup(esw, vport); + return err; +} + +void +esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, + struct mlx5_vport *vport) +{ + esw_acl_egress_ofld_cleanup(vport); + esw_acl_ingress_ofld_cleanup(esw, vport); +} + +static int esw_create_default_offloads_acl_tables(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + int err; + + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK); + err = esw_vport_create_offloads_acl_tables(esw, vport); + if (err) + goto uplink_err; + + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + if (!IS_ERR(vport)) { + err = esw_vport_create_offloads_acl_tables(esw, vport); + if (err) + goto ecpf_err; + } + + return 0; + +ecpf_err: + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK); + esw_vport_destroy_offloads_acl_tables(esw, vport); +uplink_err: + return err; +} + +static void esw_destroy_default_offloads_acl_tables(struct mlx5_eswitch *esw) +{ + struct mlx5_vport *vport; + + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + if (!IS_ERR(vport)) { + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_ECPF); + esw_vport_destroy_offloads_acl_tables(esw, vport); + } + + vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK); + if (IS_ERR(vport)) + return; + + esw_vport_destroy_offloads_acl_tables(esw, vport); +} + +int mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + int ret; + + if (!esw || esw->mode != MLX5_ESWITCH_OFFLOADS) + return 0; + + rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK); + if (atomic_read(&rep->rep_data[REP_ETH].state) != REP_LOADED) + return 0; + + ret = mlx5_esw_offloads_rep_load(esw, MLX5_VPORT_UPLINK); + if 
(ret) + return ret; + + mlx5_esw_for_each_rep(esw, i, rep) { + if (atomic_read(&rep->rep_data[REP_ETH].state) == REP_LOADED) + mlx5_esw_offloads_rep_load(esw, rep->vport); + } + + return 0; +} + +static int esw_offloads_steering_init(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_indir_table *indir; + int err; + + memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb)); + mutex_init(&esw->fdb_table.offloads.vports.lock); + hash_init(esw->fdb_table.offloads.vports.table); + atomic64_set(&esw->user_count, 0); + + indir = mlx5_esw_indir_table_init(); + if (IS_ERR(indir)) { + err = PTR_ERR(indir); + goto create_indir_err; + } + esw->fdb_table.offloads.indir = indir; + + err = esw_create_default_offloads_acl_tables(esw); + if (err) + goto create_acl_err; + + err = esw_create_offloads_table(esw); + if (err) + goto create_offloads_err; + + err = esw_create_restore_table(esw); + if (err) + goto create_restore_err; + + err = esw_create_offloads_fdb_tables(esw); + if (err) + goto create_fdb_err; + + err = esw_create_vport_rx_group(esw); + if (err) + goto create_fg_err; + + err = esw_create_vport_rx_drop_group(esw); + if (err) + goto create_rx_fg_err; + + err = mlx5_eswitch_create_vport_rx_drop_rule(esw); + if (err) + goto create_rx_drop_fg_err; + + return 0; + +create_rx_drop_fg_err: + esw_destroy_vport_rx_drop_group(esw); +create_rx_fg_err: + esw_destroy_vport_rx_group(esw); +create_fg_err: + esw_destroy_offloads_fdb_tables(esw); +create_fdb_err: + esw_destroy_restore_table(esw); +create_restore_err: + esw_destroy_offloads_table(esw); +create_offloads_err: + esw_destroy_default_offloads_acl_tables(esw); +create_acl_err: + mlx5_esw_indir_table_destroy(esw->fdb_table.offloads.indir); +create_indir_err: + mutex_destroy(&esw->fdb_table.offloads.vports.lock); + return err; +} + +static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw) +{ + mlx5_eswitch_destroy_vport_rx_drop_rule(esw); + esw_destroy_vport_rx_drop_group(esw); + esw_destroy_vport_rx_group(esw); + esw_destroy_offloads_fdb_tables(esw); + esw_destroy_restore_table(esw); + esw_destroy_offloads_table(esw); + esw_destroy_default_offloads_acl_tables(esw); + mlx5_esw_indir_table_destroy(esw->fdb_table.offloads.indir); + mutex_destroy(&esw->fdb_table.offloads.vports.lock); +} + +static void +esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, const u32 *out) +{ + bool host_pf_disabled; + u16 new_num_vfs; + + new_num_vfs = MLX5_GET(query_esw_functions_out, out, + host_params_context.host_num_of_vfs); + host_pf_disabled = MLX5_GET(query_esw_functions_out, out, + host_params_context.host_pf_disabled); + + if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled) + return; + + /* Number of VFs can only change from "0 to x" or "x to 0". 
*/ + if (esw->esw_funcs.num_vfs > 0) { + mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs); + } else { + int err; + + err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs, + MLX5_VPORT_UC_ADDR_CHANGE); + if (err) + return; + } + esw->esw_funcs.num_vfs = new_num_vfs; +} + +static void esw_functions_changed_event_handler(struct work_struct *work) +{ + struct mlx5_host_work *host_work; + struct mlx5_eswitch *esw; + const u32 *out; + + host_work = container_of(work, struct mlx5_host_work, work); + esw = host_work->esw; + + out = mlx5_esw_query_functions(esw->dev); + if (IS_ERR(out)) + goto out; + + esw_vfs_changed_event_handler(esw, out); + kvfree(out); +out: + kfree(host_work); +} + +int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type, void *data) +{ + struct mlx5_esw_functions *esw_funcs; + struct mlx5_host_work *host_work; + struct mlx5_eswitch *esw; + + host_work = kzalloc(sizeof(*host_work), GFP_ATOMIC); + if (!host_work) + return NOTIFY_DONE; + + esw_funcs = mlx5_nb_cof(nb, struct mlx5_esw_functions, nb); + esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs); + + host_work->esw = esw; + + INIT_WORK(&host_work->work, esw_functions_changed_event_handler); + queue_work(esw->work_queue, &host_work->work); + + return NOTIFY_OK; +} + +static int mlx5_esw_host_number_init(struct mlx5_eswitch *esw) +{ + const u32 *query_host_out; + + if (!mlx5_core_is_ecpf_esw_manager(esw->dev)) + return 0; + + query_host_out = mlx5_esw_query_functions(esw->dev); + if (IS_ERR(query_host_out)) + return PTR_ERR(query_host_out); + + /* Mark non local controller with non zero controller number. */ + esw->offloads.host_number = MLX5_GET(query_esw_functions_out, query_host_out, + host_params_context.host_number); + kvfree(query_host_out); + return 0; +} + +bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 controller) +{ + /* Local controller is always valid */ + if (controller == 0) + return true; + + if (!mlx5_core_is_ecpf_esw_manager(esw->dev)) + return false; + + /* External host number starts with zero in device */ + return (controller == esw->offloads.host_number + 1); +} + +int esw_offloads_enable(struct mlx5_eswitch *esw) +{ + struct mapping_ctx *reg_c0_obj_pool; + struct mlx5_vport *vport; + unsigned long i; + u64 mapping_id; + int err; + + mutex_init(&esw->offloads.termtbl_mutex); + mlx5_rdma_enable_roce(esw->dev); + + err = mlx5_esw_host_number_init(esw); + if (err) + goto err_metadata; + + err = esw_offloads_metadata_init(esw); + if (err) + goto err_metadata; + + err = esw_set_passing_vport_metadata(esw, true); + if (err) + goto err_vport_metadata; + + mapping_id = mlx5_query_nic_system_image_guid(esw->dev); + + reg_c0_obj_pool = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN, + sizeof(struct mlx5_mapped_obj), + ESW_REG_C0_USER_DATA_METADATA_MASK, + true); + + if (IS_ERR(reg_c0_obj_pool)) { + err = PTR_ERR(reg_c0_obj_pool); + goto err_pool; + } + esw->offloads.reg_c0_obj_pool = reg_c0_obj_pool; + + err = esw_offloads_steering_init(esw); + if (err) + goto err_steering_init; + + /* Representor will control the vport link state */ + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_DOWN; + + /* Uplink vport rep must load first. 
*/ + err = esw_offloads_load_rep(esw, MLX5_VPORT_UPLINK); + if (err) + goto err_uplink; + + err = mlx5_eswitch_enable_pf_vf_vports(esw, MLX5_VPORT_UC_ADDR_CHANGE); + if (err) + goto err_vports; + + esw_offloads_devcom_init(esw); + + return 0; + +err_vports: + esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK); +err_uplink: + esw_offloads_steering_cleanup(esw); +err_steering_init: + mapping_destroy(reg_c0_obj_pool); +err_pool: + esw_set_passing_vport_metadata(esw, false); +err_vport_metadata: + esw_offloads_metadata_uninit(esw); +err_metadata: + mlx5_rdma_disable_roce(esw->dev); + mutex_destroy(&esw->offloads.termtbl_mutex); + return err; +} + +static int esw_offloads_stop(struct mlx5_eswitch *esw, + struct netlink_ext_ack *extack) +{ + int err; + + esw->mode = MLX5_ESWITCH_LEGACY; + err = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_IGNORE_NUM_VFS); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed setting eswitch to legacy"); + esw->fdb_table.flags &= ~MLX5_ESW_FDB_CREATED; + } + return err; +} + +void esw_offloads_disable(struct mlx5_eswitch *esw) +{ + esw_offloads_devcom_cleanup(esw); + mlx5_eswitch_disable_pf_vf_vports(esw); + esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK); + esw_set_passing_vport_metadata(esw, false); + esw_offloads_steering_cleanup(esw); + mapping_destroy(esw->offloads.reg_c0_obj_pool); + esw_offloads_metadata_uninit(esw); + mlx5_rdma_disable_roce(esw->dev); + mutex_destroy(&esw->offloads.termtbl_mutex); +} + +static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode) +{ + switch (mode) { + case DEVLINK_ESWITCH_MODE_LEGACY: + *mlx5_mode = MLX5_ESWITCH_LEGACY; + break; + case DEVLINK_ESWITCH_MODE_SWITCHDEV: + *mlx5_mode = MLX5_ESWITCH_OFFLOADS; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode) +{ + switch (mlx5_mode) { + case MLX5_ESWITCH_LEGACY: + *mode = DEVLINK_ESWITCH_MODE_LEGACY; + break; + case MLX5_ESWITCH_OFFLOADS: + *mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int esw_inline_mode_from_devlink(u8 mode, u8 *mlx5_mode) +{ + switch (mode) { + case DEVLINK_ESWITCH_INLINE_MODE_NONE: + *mlx5_mode = MLX5_INLINE_MODE_NONE; + break; + case DEVLINK_ESWITCH_INLINE_MODE_LINK: + *mlx5_mode = MLX5_INLINE_MODE_L2; + break; + case DEVLINK_ESWITCH_INLINE_MODE_NETWORK: + *mlx5_mode = MLX5_INLINE_MODE_IP; + break; + case DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT: + *mlx5_mode = MLX5_INLINE_MODE_TCP_UDP; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int esw_inline_mode_to_devlink(u8 mlx5_mode, u8 *mode) +{ + switch (mlx5_mode) { + case MLX5_INLINE_MODE_NONE: + *mode = DEVLINK_ESWITCH_INLINE_MODE_NONE; + break; + case MLX5_INLINE_MODE_L2: + *mode = DEVLINK_ESWITCH_INLINE_MODE_LINK; + break; + case MLX5_INLINE_MODE_IP: + *mode = DEVLINK_ESWITCH_INLINE_MODE_NETWORK; + break; + case MLX5_INLINE_MODE_TCP_UDP: + *mode = DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT; + break; + default: + return -EINVAL; + } + + return 0; +} + +int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, + struct netlink_ext_ack *extack) +{ + u16 cur_mlx5_mode, mlx5_mode = 0; + struct mlx5_eswitch *esw; + + int err = 0; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + if (esw_mode_from_devlink(mode, &mlx5_mode)) + return -EINVAL; + + mlx5_lag_disable_change(esw->dev); + err = mlx5_esw_try_lock(esw); + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy"); + goto enable_lag; + } + 
cur_mlx5_mode = err; + err = 0; + + if (cur_mlx5_mode == mlx5_mode) + goto unlock; + + if (atomic64_read(&esw->offloads.num_flows) > 0) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change mode when flows are configured"); + err = -EOPNOTSUPP; + goto unlock; + } + + if (!mlx5_esw_ipsec_try_hold(esw)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change mode when IPsec flows are configured"); + err = -EOPNOTSUPP; + goto unlock; + } + + mlx5_eswitch_disable_locked(esw); + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { + if (mlx5_devlink_trap_get_num_active(esw->dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change mode while devlink traps are active"); + err = -EOPNOTSUPP; + goto unlock; + } + err = esw_offloads_start(esw, extack); + } else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) { + err = esw_offloads_stop(esw, extack); + mlx5_rescan_drivers(esw->dev); + } else { + err = -EINVAL; + } + + mlx5_esw_ipsec_release(esw); + +unlock: + mlx5_esw_unlock(esw); +enable_lag: + mlx5_lag_enable_change(esw->dev); + return err; +} + +int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) +{ + struct mlx5_eswitch *esw; + int err; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + if (!down_write_trylock(&esw->mode_lock)) + return -EBUSY; + + err = esw_mode_to_devlink(esw->mode, mode); + up_write(&esw->mode_lock); + return err; +} + +static int mlx5_esw_vports_inline_set(struct mlx5_eswitch *esw, u8 mlx5_mode, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_vport *vport; + u16 err_vport_num = 0; + unsigned long i; + int err = 0; + + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + err = mlx5_modify_nic_vport_min_inline(dev, vport->vport, mlx5_mode); + if (err) { + err_vport_num = vport->vport; + NL_SET_ERR_MSG_MOD(extack, + "Failed to set min inline on vport"); + goto revert_inline_mode; + } + } + return 0; + +revert_inline_mode: + mlx5_esw_for_each_host_func_vport(esw, i, vport, esw->esw_funcs.num_vfs) { + if (vport->vport == err_vport_num) + break; + mlx5_modify_nic_vport_min_inline(dev, + vport->vport, + esw->offloads.inline_mode); + } + return err; +} + +int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw; + u8 mlx5_mode; + int err; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + down_write(&esw->mode_lock); + + switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) { + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + if (mode == DEVLINK_ESWITCH_INLINE_MODE_NONE) { + err = 0; + goto out; + } + + fallthrough; + case MLX5_CAP_INLINE_MODE_L2: + NL_SET_ERR_MSG_MOD(extack, "Inline mode can't be set"); + err = -EOPNOTSUPP; + goto out; + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + break; + } + + if (atomic64_read(&esw->offloads.num_flows) > 0) { + NL_SET_ERR_MSG_MOD(extack, + "Can't set inline mode when flows are configured"); + err = -EOPNOTSUPP; + goto out; + } + + err = esw_inline_mode_from_devlink(mode, &mlx5_mode); + if (err) + goto out; + + err = mlx5_esw_vports_inline_set(esw, mlx5_mode, extack); + if (err) + goto out; + + esw->offloads.inline_mode = mlx5_mode; + up_write(&esw->mode_lock); + return 0; + +out: + up_write(&esw->mode_lock); + return err; +} + +int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode) +{ + struct mlx5_eswitch *esw; + int err; + + esw = mlx5_devlink_eswitch_get(devlink); + if 
(IS_ERR(esw)) + return PTR_ERR(esw); + + if (!down_write_trylock(&esw->mode_lock)) + return -EBUSY; + + err = esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode); + up_write(&esw->mode_lock); + return err; +} + +int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, + enum devlink_eswitch_encap_mode encap, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw; + int err = 0; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + down_write(&esw->mode_lock); + + if (encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE && + (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) || + !MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))) { + err = -EOPNOTSUPP; + goto unlock; + } + + if (encap && encap != DEVLINK_ESWITCH_ENCAP_MODE_BASIC) { + err = -EOPNOTSUPP; + goto unlock; + } + + if (esw->mode == MLX5_ESWITCH_LEGACY) { + esw->offloads.encap = encap; + goto unlock; + } + + if (esw->offloads.encap == encap) + goto unlock; + + if (atomic64_read(&esw->offloads.num_flows) > 0) { + NL_SET_ERR_MSG_MOD(extack, + "Can't set encapsulation when flows are configured"); + err = -EOPNOTSUPP; + goto unlock; + } + + esw_destroy_offloads_fdb_tables(esw); + + esw->offloads.encap = encap; + + err = esw_create_offloads_fdb_tables(esw); + + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed re-creating fast FDB table"); + esw->offloads.encap = !encap; + (void)esw_create_offloads_fdb_tables(esw); + } + +unlock: + up_write(&esw->mode_lock); + return err; +} + +int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, + enum devlink_eswitch_encap_mode *encap) +{ + struct mlx5_eswitch *esw; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + if (!down_write_trylock(&esw->mode_lock)) + return -EBUSY; + + *encap = esw->offloads.encap; + up_write(&esw->mode_lock); + return 0; +} + +static bool +mlx5_eswitch_vport_has_rep(const struct mlx5_eswitch *esw, u16 vport_num) +{ + /* Currently, only ECPF based device has representor for host PF. 
*/ + if (vport_num == MLX5_VPORT_PF && + (!mlx5_core_is_ecpf_esw_manager(esw->dev) || + !mlx5_esw_host_functions_enabled(esw->dev))) + return false; + + if (vport_num == MLX5_VPORT_ECPF && + !mlx5_ecpf_vport_exists(esw->dev)) + return false; + + return true; +} + +int mlx5_devlink_eswitch_ipsec_mode_set(struct devlink *devlink, + enum devlink_eswitch_ipsec_mode ipsec, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + int err = 0; + + memset(extack, 0, sizeof(*extack)); + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + err = mlx5_esw_try_lock(esw); + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, + "Can't set ipsec mode, E-Switch is busy"); + return err; + } + + if (!mlx5_is_ipsec_device(dev)) { + err = -EOPNOTSUPP; + goto unlock; + } + + if (ipsec > DEVLINK_ESWITCH_IPSEC_MODE_FULL) { + err = -EOPNOTSUPP; + goto unlock; + } + + if (esw->mode == MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change IPsec mode while in switchdev mode"); + err = -EOPNOTSUPP; + goto unlock; + } + + if (esw->offloads.ipsec == ipsec) + goto unlock; + + esw->offloads.ipsec = ipsec; +unlock: + mlx5_esw_unlock(esw); + return err; +} + +int mlx5_devlink_eswitch_ipsec_mode_get(struct devlink *devlink, + enum devlink_eswitch_ipsec_mode *ipsec) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + down_write(&esw->mode_lock); + *ipsec = esw->offloads.ipsec; + up_write(&esw->mode_lock); + return 0; +} + +int mlx5_devlink_eswitch_steering_mode_set(struct devlink *devlink, + enum devlink_eswitch_steering_mode mode) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + + if (mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS) { + esw_warn(dev, "Cannot switch to DMFS/SMFS while switchdev enabled\n"); + return -EOPNOTSUPP; + } + + if (mode == DEVLINK_ESWITCH_STEERING_MODE_DMFS) { + dev->priv.steering->mode = MLX5_FLOW_STEERING_MODE_DMFS; + } else if (mode == DEVLINK_ESWITCH_STEERING_MODE_SMFS) { + if (!mlx5_fs_dr_is_supported(dev)) { + esw_warn(dev, + "Software managed steering is not supported by current device\n"); + return -EOPNOTSUPP; + } + dev->priv.steering->mode = MLX5_FLOW_STEERING_MODE_SMFS; + } else { + return -EINVAL; + } + + return 0; +} + +int mlx5_devlink_eswitch_steering_mode_get(struct devlink *devlink, + enum devlink_eswitch_steering_mode *mode) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + *mode = dev->priv.steering->mode; + return 0; +} + +int +mlx5_devlink_eswitch_lag_port_select_mode_set(struct devlink *devlink, + enum devlink_eswitch_lag_port_select_mode mode) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u8 eswitch_mode = mlx5_eswitch_mode(dev); + + if (!mlx5_lag_is_supported(dev)) + return -EOPNOTSUPP; + + if (mode == DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_HASH && + !MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table)) { + mlx5_core_err(dev, + "hash based LAG is not supported by current device"); + return -EOPNOTSUPP; + } + + if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) { + mlx5_core_err(dev, + "Configure lag port selection mode is not supported when eswitch offloads enabled."); + return -EOPNOTSUPP; + } + + switch (mode) { + case DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_HASH: + mlx5_lag_set_user_mode(dev, MLX5_LAG_USER_PREF_MODE_HASH); + break; + case DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: 
+ mlx5_lag_set_user_mode(dev, MLX5_LAG_USER_PREF_MODE_QUEUE_AFFINITY); + break; + case DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_MULTIPORT_ESW: + if (!MLX5_CAP_GEN(dev, create_lag_when_not_master_up)) { + mlx5_core_err(dev, "multiport eswitch is supported if LAG_RESOURCE_ALLOCATION is set"); + return -EOPNOTSUPP; + } + mlx5_lag_set_user_mode(dev, MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW); + break; + default: + return -EINVAL; + } + + return 0; +} + +int +mlx5_devlink_eswitch_lag_port_select_mode_get(struct devlink *devlink, + enum devlink_eswitch_lag_port_select_mode *mode) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + enum mlx5_lag_user_pref md; + + if (!mlx5_lag_is_supported(dev)) + return -EOPNOTSUPP; + + md = mlx5_lag_get_user_mode(dev); + + switch (md) { + case MLX5_LAG_USER_PREF_MODE_HASH: + *mode = DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_HASH; + break; + case MLX5_LAG_USER_PREF_MODE_QUEUE_AFFINITY: + *mode = DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY; + break; + case MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW: + *mode = DEVLINK_ESWITCH_LAG_PORT_SELECT_MODE_MULTIPORT_ESW; + break; + default: + return -EINVAL; + } + + return 0; +} + +int mlx5_devlink_eswitch_vport_match_mode_set(struct devlink *devlink, + enum devlink_eswitch_vport_match_mode mode) +{ + struct mlx5_eswitch *esw; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + if (is_mdev_switchdev_mode(esw->dev)) { + esw_warn(esw->dev, "Cannot set vport match mode while switchdev enabled\n"); + return -EOPNOTSUPP; + } + + if (mode == DEVLINK_ESWITCH_VPORT_MATCH_MODE_LEGACY) { + esw->flags &= ~MLX5_ESWITCH_VPORT_MATCH_METADATA; + } else if (mode == DEVLINK_ESWITCH_VPORT_MATCH_MODE_METADATA) { + if (!mlx5_esw_vport_match_metadata_supported(esw)) + return -EOPNOTSUPP; + esw->flags |= MLX5_ESWITCH_VPORT_MATCH_METADATA; + } else { + return -EINVAL; + } + + return 0; +} + +int mlx5_devlink_eswitch_vport_match_mode_get(struct devlink *devlink, + enum devlink_eswitch_vport_match_mode *mode) +{ + struct mlx5_eswitch *esw; + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + *mode = (esw->flags & MLX5_ESWITCH_VPORT_MATCH_METADATA) ? 
+ DEVLINK_ESWITCH_VPORT_MATCH_MODE_METADATA : + DEVLINK_ESWITCH_VPORT_MATCH_MODE_LEGACY; + + return 0; +} + +void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw, + const struct mlx5_eswitch_rep_ops *ops, + u8 rep_type) +{ + struct mlx5_eswitch_rep_data *rep_data; + struct mlx5_eswitch_rep *rep; + unsigned long i; + + esw->offloads.rep_ops[rep_type] = ops; + mlx5_esw_for_each_rep(esw, i, rep) { + if (likely(mlx5_eswitch_vport_has_rep(esw, rep->vport))) { + rep->esw = esw; + rep_data = &rep->rep_data[rep_type]; + atomic_set(&rep_data->state, REP_REGISTERED); + } + } +} +EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps); + +void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type) +{ + struct mlx5_eswitch_rep *rep; + unsigned long i; + + if (esw->mode == MLX5_ESWITCH_OFFLOADS) + __unload_reps_all_vport(esw, rep_type); + + mlx5_esw_for_each_rep(esw, i, rep) + atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED); +} +EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps); + +void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type) +{ + struct mlx5_eswitch_rep *rep; + + rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK); + return rep->rep_data[rep_type].priv; +} + +void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw, + u16 vport, + u8 rep_type) +{ + struct mlx5_eswitch_rep *rep; + + rep = mlx5_eswitch_get_rep(esw, vport); + + if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED && + esw->offloads.rep_ops[rep_type]->get_proto_dev) + return esw->offloads.rep_ops[rep_type]->get_proto_dev(rep); + return NULL; +} +EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev); + +void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type) +{ + return mlx5_eswitch_get_proto_dev(esw, MLX5_VPORT_UPLINK, rep_type); +} +EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev); + +struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, + u16 vport) +{ + return mlx5_eswitch_get_rep(esw, vport); +} +EXPORT_SYMBOL(mlx5_eswitch_vport_rep); + +bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw) +{ + return !!(esw->flags & MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED); +} +EXPORT_SYMBOL(mlx5_eswitch_reg_c1_loopback_enabled); + +bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw) +{ + return !!(esw->flags & MLX5_ESWITCH_VPORT_MATCH_METADATA); +} +EXPORT_SYMBOL(mlx5_eswitch_vport_match_metadata_enabled); + +u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, + u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (WARN_ON_ONCE(IS_ERR(vport))) + return 0; + + return vport->metadata << (32 - ESW_SOURCE_PORT_METADATA_BITS); +} +EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match); + +int mlx5_esw_offloads_sf_vport_enable(struct mlx5_eswitch *esw, struct devlink_port *dl_port, + u16 vport_num, u32 controller, u32 sfnum) +{ + int err; + + err = mlx5_esw_vport_enable(esw, vport_num, MLX5_VPORT_UC_ADDR_CHANGE); + if (err) + return err; + + err = mlx5_esw_devlink_sf_port_register(esw, dl_port, vport_num, controller, sfnum); + if (err) + goto devlink_err; + + mlx5_esw_vport_debugfs_create(esw, vport_num, true, sfnum); + err = mlx5_esw_offloads_rep_load(esw, vport_num); + if (err) + goto rep_err; + return 0; + +rep_err: + mlx5_esw_vport_debugfs_destroy(esw, vport_num); + mlx5_esw_devlink_sf_port_unregister(esw, vport_num); +devlink_err: + mlx5_esw_vport_disable(esw, vport_num); + return err; +} + +void mlx5_esw_offloads_sf_vport_disable(struct 
mlx5_eswitch *esw, u16 vport_num) +{ + mlx5_esw_offloads_rep_unload(esw, vport_num); + mlx5_esw_vport_debugfs_destroy(esw, vport_num); + mlx5_esw_devlink_sf_port_unregister(esw, vport_num); + mlx5_esw_vport_disable(esw, vport_num); +} + +int mlx5_esw_query_vport_vhca_id(struct mlx5_eswitch *esw, u16 vport_num, u16 *vhca_id) +{ + int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *query_ctx; + void *hca_caps; + int err; + + *vhca_id = 0; + if (mlx5_esw_is_manager_vport(esw, vport_num) || + !MLX5_CAP_GEN(esw->dev, vhca_resource_manager)) + return -EPERM; + + query_ctx = kzalloc(query_out_sz, GFP_KERNEL); + if (!query_ctx) + return -ENOMEM; + + err = mlx5_vport_get_other_func_cap(esw->dev, vport_num, query_ctx); + if (err) + goto out_free; + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); + *vhca_id = MLX5_GET(cmd_hca_cap, hca_caps, vhca_id); + +out_free: + kfree(query_ctx); + return err; +} + +struct mlx5_vhca_map_entry { + u16 vhca_id; + u16 vport_num; +}; + +int mlx5_esw_vport_vhca_id_set(struct mlx5_eswitch *esw, u16 vport_num) +{ + struct mlx5_vhca_map_entry *old_entry, *new_entry; + u16 vhca_id; + int err; + + err = mlx5_esw_query_vport_vhca_id(esw, vport_num, &vhca_id); + if (err) { + esw_warn(esw->dev, "Getting vhca_id for vport failed (vport=%u,err=%d)\n", + vport_num, err); + return err; + } + + new_entry = kmalloc(sizeof(*new_entry), GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + + new_entry->vport_num = vport_num; + new_entry->vhca_id = vhca_id; + old_entry = xa_store(&esw->offloads.vhca_map, vhca_id, new_entry, GFP_KERNEL); + if (xa_is_err(old_entry)) { + kfree(new_entry); + return xa_err(old_entry); + } + kfree(old_entry); + return 0; +} + +void mlx5_esw_vport_vhca_id_clear(struct mlx5_eswitch *esw, u16 vport_num) +{ + XA_STATE(xas, &esw->offloads.vhca_map, 0); + struct mlx5_vhca_map_entry *entry; + u16 vhca_id; + int err; + + err = mlx5_esw_query_vport_vhca_id(esw, vport_num, &vhca_id); + if (err) { + esw_debug(esw->dev, "Getting vhca_id for vport failed (vport=%hu,err=%d)\n", + vport_num, err); + + xas_for_each(&xas, entry, USHRT_MAX) { + if (entry->vport_num == vport_num) + vhca_id = entry->vhca_id; + } + } + + entry = xa_erase(&esw->offloads.vhca_map, vhca_id); + kfree(entry); +} + +int mlx5_eswitch_vhca_id_to_vport(struct mlx5_eswitch *esw, u16 vhca_id, u16 *vport_num) +{ + struct mlx5_vhca_map_entry *entry; + + entry = xa_load(&esw->offloads.vhca_map, vhca_id); + + if (!entry) + return -ENOENT; + + *vport_num = entry->vport_num; + return 0; +} + +u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, + u16 vport_num) +{ + struct mlx5_vport *vport = mlx5_eswitch_get_vport(esw, vport_num); + + if (WARN_ON_ONCE(IS_ERR(vport))) + return 0; + + return vport->metadata; +} +EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_set); + +static bool +is_port_function_supported(struct mlx5_eswitch *esw, u16 vport_num) +{ + return vport_num == MLX5_VPORT_PF || + mlx5_eswitch_is_vf_vport(esw, vport_num) || + mlx5_esw_is_sf_vport(esw, vport_num); +} + +int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port, + u8 *hw_addr, int *hw_addr_len, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw; + struct mlx5_vport *vport; + u16 vport_num; + + esw = mlx5_devlink_eswitch_get(port->devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index); + if (!is_port_function_supported(esw, vport_num)) + return -EOPNOTSUPP; + + vport = mlx5_eswitch_get_vport(esw, 
vport_num); + if (IS_ERR(vport)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid port"); + return PTR_ERR(vport); + } + + mutex_lock(&esw->state_lock); + ether_addr_copy(hw_addr, vport->info.mac); + *hw_addr_len = ETH_ALEN; + mutex_unlock(&esw->state_lock); + return 0; +} + +int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port, + const u8 *hw_addr, int hw_addr_len, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw; + u16 vport_num; + + esw = mlx5_devlink_eswitch_get(port->devlink); + if (IS_ERR(esw)) { + NL_SET_ERR_MSG_MOD(extack, "Eswitch doesn't support set hw_addr"); + return PTR_ERR(esw); + } + + vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index); + if (!is_port_function_supported(esw, vport_num)) { + NL_SET_ERR_MSG_MOD(extack, "Port doesn't support set hw_addr"); + return -EINVAL; + } + + return mlx5_eswitch_set_vport_mac(esw, vport_num, hw_addr); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c new file mode 100644 index 0000000..4080323 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_termtbl.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2019 Mellanox Technologies. + +#include +#include +#include "eswitch.h" +#include "en_tc.h" +#include "fs_core.h" + +struct mlx5_termtbl_handle { + struct hlist_node termtbl_hlist; + + struct mlx5_flow_table *termtbl; + struct mlx5_flow_act flow_act; + struct mlx5_flow_destination dest; + + struct mlx5_flow_handle *rule; + int ref_count; +}; + +static u32 +mlx5_eswitch_termtbl_hash(struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest) +{ + u32 hash; + + hash = jhash_1word(flow_act->action, 0); + hash = jhash((const void *)&flow_act->vlan, + sizeof(flow_act->vlan), hash); + hash = jhash((const void *)&dest->vport.num, + sizeof(dest->vport.num), hash); + hash = jhash((const void *)&dest->vport.vhca_id, + sizeof(dest->vport.num), hash); + if (flow_act->pkt_reformat) + hash = jhash(flow_act->pkt_reformat, + sizeof(*flow_act->pkt_reformat), + hash); + return hash; +} + +static int +mlx5_eswitch_termtbl_cmp(struct mlx5_flow_act *flow_act1, + struct mlx5_flow_destination *dest1, + struct mlx5_flow_act *flow_act2, + struct mlx5_flow_destination *dest2) +{ + int ret; + + ret = flow_act1->action != flow_act2->action || + dest1->vport.num != dest2->vport.num || + dest1->vport.vhca_id != dest2->vport.vhca_id || + memcmp(&flow_act1->vlan, &flow_act2->vlan, + sizeof(flow_act1->vlan)); + if (ret) + return ret; + + if (flow_act1->pkt_reformat && flow_act2->pkt_reformat) + return memcmp(flow_act1->pkt_reformat, flow_act2->pkt_reformat, + sizeof(*flow_act1->pkt_reformat)); + + return flow_act1->pkt_reformat || flow_act2->pkt_reformat; +} + +static int +mlx5_eswitch_termtbl_create(struct mlx5_core_dev *dev, + struct mlx5_termtbl_handle *tt, + struct mlx5_flow_act *flow_act) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *root_ns; + int err, err2; + + root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!root_ns) { + esw_warn(dev, "Failed to get FDB flow namespace\n"); + return -EOPNOTSUPP; + } + + /* As this is the terminating action then the termination table is the + * same prio as the slow path + */ + ft_attr.flags = MLX5_FLOW_TABLE_TERMINATION | MLX5_FLOW_TABLE_UNMANAGED | + MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + 
ft_attr.prio = FDB_TC_OFFLOAD; + ft_attr.max_fte = 1; + ft_attr.level = 1; + ft_attr.autogroup.max_num_groups = 1; + tt->termtbl = mlx5_create_auto_grouped_flow_table(root_ns, &ft_attr); + if (IS_ERR(tt->termtbl)) { + err = PTR_ERR(tt->termtbl); + esw_warn(dev, "Failed to create termination table, err %pe\n", tt->termtbl); + return err; + } + + tt->rule = mlx5_add_flow_rules(tt->termtbl, NULL, flow_act, + &tt->dest, 1); + if (IS_ERR(tt->rule)) { + err = PTR_ERR(tt->rule); + esw_warn(dev, "Failed to create termination table rule, err %pe\n", tt->rule); + goto add_flow_err; + } + return 0; + +add_flow_err: + err2 = mlx5_destroy_flow_table(tt->termtbl); + if (err2) + esw_warn(dev, "Failed to destroy termination table, err %d\n", err2); + + return err; +} + +static struct mlx5_termtbl_handle * +mlx5_eswitch_termtbl_get_create(struct mlx5_eswitch *esw, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_termtbl_handle *tt; + bool found = false; + u32 hash_key; + int err; + + mutex_lock(&esw->offloads.termtbl_mutex); + hash_key = mlx5_eswitch_termtbl_hash(flow_act, dest); + hash_for_each_possible(esw->offloads.termtbl_tbl, tt, + termtbl_hlist, hash_key) { + if (!mlx5_eswitch_termtbl_cmp(&tt->flow_act, &tt->dest, + flow_act, dest)) { + found = true; + break; + } + } + if (found) + goto tt_add_ref; + + tt = kzalloc(sizeof(*tt), GFP_KERNEL); + if (!tt) { + err = -ENOMEM; + goto tt_create_err; + } + + tt->dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + tt->dest.vport.num = dest->vport.num; + tt->dest.vport.vhca_id = dest->vport.vhca_id; + tt->dest.vport.flags = dest->vport.flags; + memcpy(&tt->flow_act, flow_act, sizeof(*flow_act)); + + err = mlx5_eswitch_termtbl_create(esw->dev, tt, flow_act); + if (err) + goto tt_create_err; + + hash_add(esw->offloads.termtbl_tbl, &tt->termtbl_hlist, hash_key); +tt_add_ref: + tt->ref_count++; + mutex_unlock(&esw->offloads.termtbl_mutex); + return tt; +tt_create_err: + kfree(tt); + mutex_unlock(&esw->offloads.termtbl_mutex); + return ERR_PTR(err); +} + +void +mlx5_eswitch_termtbl_put(struct mlx5_eswitch *esw, + struct mlx5_termtbl_handle *tt) +{ + mutex_lock(&esw->offloads.termtbl_mutex); + if (--tt->ref_count == 0) + hash_del(&tt->termtbl_hlist); + mutex_unlock(&esw->offloads.termtbl_mutex); + + if (!tt->ref_count) { + mlx5_del_flow_rules(tt->rule); + mlx5_destroy_flow_table(tt->termtbl); + kfree(tt); + } +} + +static void +mlx5_eswitch_termtbl_actions_move(struct mlx5_flow_act *src, + struct mlx5_flow_act *dst) +{ + if (src->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) { + src->action &= ~MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; + dst->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH; + memcpy(&dst->vlan[0], &src->vlan[0], sizeof(src->vlan[0])); + memset(&src->vlan[0], 0, sizeof(src->vlan[0])); + + if (src->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2) { + src->action &= ~MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2; + dst->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2; + memcpy(&dst->vlan[1], &src->vlan[1], sizeof(src->vlan[1])); + memset(&src->vlan[1], 0, sizeof(src->vlan[1])); + } + } +} + +static bool mlx5_eswitch_offload_is_uplink_port(const struct mlx5_eswitch *esw, + const struct mlx5_flow_spec *spec) +{ + u16 port_mask, port_value; + + if (MLX5_CAP_ESW_FLOWTABLE(esw->dev, flow_source)) + return spec->flow_context.flow_source == + MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK; + + port_mask = MLX5_GET(fte_match_param, spec->match_criteria, + misc_parameters.source_port); + port_value = 
MLX5_GET(fte_match_param, spec->match_value, + misc_parameters.source_port); + return (port_mask & port_value) == MLX5_VPORT_UPLINK; +} + +bool +mlx5_eswitch_termtbl_required(struct mlx5_eswitch *esw, + struct mlx5_flow_attr *attr, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_spec *spec) +{ + struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + int i; + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, termination_table) || + !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level) || + mlx5e_tc_attr_flags_skip(attr->flags) || + (!mlx5_eswitch_offload_is_uplink_port(esw, spec) && !esw_attr->int_port)) + return false; + + /* push vlan on RX */ + if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH && + !(esw->dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_SMFS && + MLX5_CAP_GEN(esw->dev, steering_format_version) >= + MLX5_STEERING_FORMAT_CONNECTX_6DX)) + return true; + + /* hairpin */ + for (i = esw_attr->split_count; i < esw_attr->out_count; i++) + if (!esw_attr->dest_int_port && esw_attr->dests[i].rep && + esw_attr->dests[i].rep->vport == MLX5_VPORT_UPLINK) + return true; + + return false; +} + +struct mlx5_flow_handle * +mlx5_eswitch_add_termtbl_rule(struct mlx5_eswitch *esw, + struct mlx5_flow_table *fdb, + struct mlx5_flow_spec *spec, + struct mlx5_esw_flow_attr *attr, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int num_dest) +{ + struct mlx5_flow_act term_tbl_act = {}; + struct mlx5_flow_handle *rule = NULL; + bool term_table_created = false; + int num_vport_dests = 0; + int i, curr_dest; + + mlx5_eswitch_termtbl_actions_move(flow_act, &term_tbl_act); + term_tbl_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + for (i = 0; i < num_dest; i++) { + struct mlx5_termtbl_handle *tt; + + /* only vport destinations can be terminated */ + if (dest[i].type != MLX5_FLOW_DESTINATION_TYPE_VPORT) + continue; + + if (attr->dests[num_vport_dests].flags & MLX5_ESW_DEST_ENCAP) { + term_tbl_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + term_tbl_act.pkt_reformat = attr->dests[num_vport_dests].pkt_reformat; + } else { + term_tbl_act.action &= ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + term_tbl_act.pkt_reformat = NULL; + } + + /* get the terminating table for the action list */ + tt = mlx5_eswitch_termtbl_get_create(esw, &term_tbl_act, + &dest[i], attr); + if (IS_ERR(tt)) { + esw_warn(esw->dev, "Failed to get termination table, err %pe\n", tt); + goto revert_changes; + } + attr->dests[num_vport_dests].termtbl = tt; + num_vport_dests++; + + /* link the destination with the termination table */ + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[i].ft = tt->termtbl; + term_table_created = true; + } + + /* at least one destination should reference a termination table */ + if (!term_table_created) + goto revert_changes; + + /* create the FTE */ + flow_act->action &= ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + flow_act->pkt_reformat = NULL; + flow_act->flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + rule = mlx5_add_flow_rules(fdb, spec, flow_act, dest, num_dest); + if (IS_ERR(rule)) + goto revert_changes; + + goto out; + +revert_changes: + /* revert the changes that were made to the original flow_act + * and fall-back to the original rule actions + */ + mlx5_eswitch_termtbl_actions_move(&term_tbl_act, flow_act); + + for (curr_dest = 0; curr_dest < num_vport_dests; curr_dest++) { + struct mlx5_termtbl_handle *tt = attr->dests[curr_dest].termtbl; + + /* search for the destination associated with the + * current term table + */ + for (i = 0; i < 
num_dest; i++) { + if (dest[i].ft != tt->termtbl) + continue; + + memset(&dest[i], 0, sizeof(dest[i])); + dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + dest[i].vport.num = tt->dest.vport.num; + dest[i].vport.vhca_id = tt->dest.vport.vhca_id; + mlx5_eswitch_termtbl_put(esw, tt); + break; + } + } + rule = mlx5_add_flow_rules(fdb, spec, flow_act, dest, num_dest); +out: + return rule; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/events.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/events.c new file mode 100644 index 0000000..8704a91 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/events.c @@ -0,0 +1,449 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2018 Mellanox Technologies + +#include + +#include "mlx5_core.h" +#include "lib/eq.h" +#include "lib/mlx5.h" + +struct mlx5_event_nb { + struct mlx5_nb nb; + void *ctx; +}; + +/* General events handlers for the low level mlx5_core driver + * + * Other Major feature specific events such as + * clock/eswitch/fpga/FW trace and many others, are handled elsewhere, with + * separate notifiers callbacks, specifically by those mlx5 components. + */ +static int any_notifier(struct notifier_block *, unsigned long, void *); +static int temp_warn(struct notifier_block *, unsigned long, void *); +static int port_module(struct notifier_block *, unsigned long, void *); +static int pcie_core(struct notifier_block *, unsigned long, void *); + +/* handler which forwards the event to events->fw_nh, driver notifiers */ +static int forward_event(struct notifier_block *, unsigned long, void *); + +static struct mlx5_nb events_nbs_ref[] = { + /* Events to be processed by mlx5_core */ + {.nb.notifier_call = any_notifier, .event_type = MLX5_EVENT_TYPE_NOTIFY_ANY }, + {.nb.notifier_call = temp_warn, .event_type = MLX5_EVENT_TYPE_TEMP_WARN_EVENT }, + {.nb.notifier_call = port_module, .event_type = MLX5_EVENT_TYPE_PORT_MODULE_EVENT }, + {.nb.notifier_call = pcie_core, .event_type = MLX5_EVENT_TYPE_GENERAL_EVENT }, + + /* Events to be forwarded (as is) to mlx5 core interfaces (mlx5e/mlx5_ib) */ + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PORT_CHANGE }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_GENERAL_EVENT }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_OBJECT_CHANGE_EVENT }, + /* QP/WQ resource events to forward */ + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_DCT_DRAINED }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PATH_MIG }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_COMM_EST }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SQ_DRAINED }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_LAST_WQE }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_CATAS_ERROR }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_PATH_MIG_FAILED }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_WQ_ACCESS_ERROR }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_XRQ_ERROR }, + /* SRQ events */ + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_CATAS_ERROR }, + {.nb.notifier_call = forward_event, .event_type = MLX5_EVENT_TYPE_SRQ_RQ_LIMIT }, +}; + +struct mlx5_events { + struct mlx5_core_dev *dev; 
+ struct workqueue_struct *wq; + struct mlx5_event_nb notifiers[ARRAY_SIZE(events_nbs_ref)]; + /* driver notifier chain for fw events */ + struct atomic_notifier_head fw_nh; + /* port module events stats */ + struct mlx5_pme_stats pme_stats; + /*pcie_core*/ + struct work_struct pcie_core_work; + /* driver notifier chain for sw events */ + struct blocking_notifier_head sw_nh; +}; + +static const char *eqe_type_str(u8 type) +{ + switch (type) { + case MLX5_EVENT_TYPE_COMP: + return "MLX5_EVENT_TYPE_COMP"; + case MLX5_EVENT_TYPE_PATH_MIG: + return "MLX5_EVENT_TYPE_PATH_MIG"; + case MLX5_EVENT_TYPE_COMM_EST: + return "MLX5_EVENT_TYPE_COMM_EST"; + case MLX5_EVENT_TYPE_SQ_DRAINED: + return "MLX5_EVENT_TYPE_SQ_DRAINED"; + case MLX5_EVENT_TYPE_SRQ_LAST_WQE: + return "MLX5_EVENT_TYPE_SRQ_LAST_WQE"; + case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT: + return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT"; + case MLX5_EVENT_TYPE_CQ_ERROR: + return "MLX5_EVENT_TYPE_CQ_ERROR"; + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR"; + case MLX5_EVENT_TYPE_PATH_MIG_FAILED: + return "MLX5_EVENT_TYPE_PATH_MIG_FAILED"; + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR"; + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR"; + case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR: + return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR"; + case MLX5_EVENT_TYPE_INTERNAL_ERROR: + return "MLX5_EVENT_TYPE_INTERNAL_ERROR"; + case MLX5_EVENT_TYPE_PORT_CHANGE: + return "MLX5_EVENT_TYPE_PORT_CHANGE"; + case MLX5_EVENT_TYPE_GPIO_EVENT: + return "MLX5_EVENT_TYPE_GPIO_EVENT"; + case MLX5_EVENT_TYPE_PORT_MODULE_EVENT: + return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT"; + case MLX5_EVENT_TYPE_TEMP_WARN_EVENT: + return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT"; + case MLX5_EVENT_TYPE_REMOTE_CONFIG: + return "MLX5_EVENT_TYPE_REMOTE_CONFIG"; + case MLX5_EVENT_TYPE_DB_BF_CONGESTION: + return "MLX5_EVENT_TYPE_DB_BF_CONGESTION"; + case MLX5_EVENT_TYPE_STALL_EVENT: + return "MLX5_EVENT_TYPE_STALL_EVENT"; + case MLX5_EVENT_TYPE_CMD: + return "MLX5_EVENT_TYPE_CMD"; + case MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED: + return "MLX5_EVENT_TYPE_ESW_FUNCTIONS_CHANGED"; + case MLX5_EVENT_TYPE_VHCA_STATE_CHANGE: + return "MLX5_EVENT_TYPE_VHCA_STATE_CHANGE"; + case MLX5_EVENT_TYPE_PAGE_REQUEST: + return "MLX5_EVENT_TYPE_PAGE_REQUEST"; + case MLX5_EVENT_TYPE_PAGE_FAULT: + return "MLX5_EVENT_TYPE_PAGE_FAULT"; + case MLX5_EVENT_TYPE_PPS_EVENT: + return "MLX5_EVENT_TYPE_PPS_EVENT"; + case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE: + return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE"; + case MLX5_EVENT_TYPE_FPGA_ERROR: + return "MLX5_EVENT_TYPE_FPGA_ERROR"; + case MLX5_EVENT_TYPE_FPGA_QP_ERROR: + return "MLX5_EVENT_TYPE_FPGA_QP_ERROR"; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + return "MLX5_EVENT_TYPE_GENERAL_EVENT"; + case MLX5_EVENT_TYPE_MONITOR_COUNTER: + return "MLX5_EVENT_TYPE_MONITOR_COUNTER"; + case MLX5_EVENT_TYPE_DEVICE_TRACER: + return "MLX5_EVENT_TYPE_DEVICE_TRACER"; + case MLX5_EVENT_TYPE_XRQ_ERROR: + return "MLX5_EVENT_TYPE_XRQ_ERROR"; + case MLX5_EVENT_TYPE_OBJECT_CHANGE_EVENT: + return "MLX5_EVENT_TYPE_OBJECT_CHANGE_EVENT"; + default: + return "Unrecognized event"; + } +} + +/* handles all FW events, type == eqe->type */ +static int any_notifier(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); + struct mlx5_events *events = event_nb->ctx; + struct mlx5_eqe *eqe = data; + + mlx5_core_dbg(events->dev, "Async eqe type %s, subtype (%d)\n", + 
eqe_type_str(eqe->type), eqe->sub_type); + return NOTIFY_OK; +} + +/* type == MLX5_EVENT_TYPE_TEMP_WARN_EVENT */ +static int temp_warn(struct notifier_block *nb, unsigned long type, void *data) +{ + struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); + struct mlx5_events *events = event_nb->ctx; + struct mlx5_eqe *eqe = data; + u64 value_lsb; + u64 value_msb; + + value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb); + value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb); + + mlx5_core_warn(events->dev, + "High temperature on sensors with bit set %llx %llx", + value_msb, value_lsb); + + return NOTIFY_OK; +} + +/* MLX5_EVENT_TYPE_PORT_MODULE_EVENT */ +static const char *mlx5_pme_status_to_string(enum port_module_event_status_type status) +{ + switch (status) { + case MLX5_MODULE_STATUS_PLUGGED: + return "Cable plugged"; + case MLX5_MODULE_STATUS_UNPLUGGED: + return "Cable unplugged"; + case MLX5_MODULE_STATUS_ERROR: + return "Cable error"; + case MLX5_MODULE_STATUS_DISABLED: + return "Cable disabled"; + default: + return "Unknown status"; + } +} + +static const char *mlx5_pme_error_to_string(enum port_module_event_error_type error) +{ + switch (error) { + case MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED: + return "Power budget exceeded"; + case MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX: + return "Long Range for non MLNX cable"; + case MLX5_MODULE_EVENT_ERROR_BUS_STUCK: + return "Bus stuck (I2C or data shorted)"; + case MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT: + return "No EEPROM/retry timeout"; + case MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST: + return "Enforce part number list"; + case MLX5_MODULE_EVENT_ERROR_UNKNOWN_IDENTIFIER: + return "Unknown identifier"; + case MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE: + return "High Temperature"; + case MLX5_MODULE_EVENT_ERROR_BAD_CABLE: + return "Bad or shorted cable/module"; + case MLX5_MODULE_EVENT_ERROR_PCIE_POWER_SLOT_EXCEEDED: + return "One or more network ports have been powered down due to insufficient/unadvertised power on the PCIe slot"; + default: + return "Unknown error"; + } +} + +/* type == MLX5_EVENT_TYPE_PORT_MODULE_EVENT */ +static int port_module(struct notifier_block *nb, unsigned long type, void *data) +{ + struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); + struct mlx5_events *events = event_nb->ctx; + struct mlx5_eqe *eqe = data; + + enum port_module_event_status_type module_status; + enum port_module_event_error_type error_type; + struct mlx5_eqe_port_module *module_event_eqe; + const char *status_str; + u8 module_num; + + module_event_eqe = &eqe->data.port_module; + module_status = module_event_eqe->module_status & + PORT_MODULE_EVENT_MODULE_STATUS_MASK; + error_type = module_event_eqe->error_type & + PORT_MODULE_EVENT_ERROR_TYPE_MASK; + + if (module_status < MLX5_MODULE_STATUS_NUM) + events->pme_stats.status_counters[module_status]++; + + if (module_status == MLX5_MODULE_STATUS_ERROR) + if (error_type < MLX5_MODULE_EVENT_ERROR_NUM) + events->pme_stats.error_counters[error_type]++; + + if (!printk_ratelimit()) + return NOTIFY_OK; + + module_num = module_event_eqe->module; + status_str = mlx5_pme_status_to_string(module_status); + if (module_status == MLX5_MODULE_STATUS_ERROR) { + const char *error_str = mlx5_pme_error_to_string(error_type); + + mlx5_core_err(events->dev, + "Port module event[error]: module %u, %s, %s\n", + module_num, status_str, error_str); + } else { + mlx5_core_info(events->dev, + "Port module event: module 
%u, %s\n", + module_num, status_str); + } + + return NOTIFY_OK; +} + +enum { + MLX5_PCI_POWER_COULD_NOT_BE_READ = 0x0, + MLX5_PCI_POWER_SUFFICIENT_REPORTED = 0x1, + MLX5_PCI_POWER_INSUFFICIENT_REPORTED = 0x2, +}; + +static void mlx5_pcie_event(struct work_struct *work) +{ + u32 out[MLX5_ST_SZ_DW(mpein_reg)] = {0}; + u32 in[MLX5_ST_SZ_DW(mpein_reg)] = {0}; + struct mlx5_events *events; + struct mlx5_core_dev *dev; + u8 power_status; + u16 pci_power; + + events = container_of(work, struct mlx5_events, pcie_core_work); + dev = events->dev; + + if (!MLX5_CAP_MCAM_FEATURE(dev, pci_status_and_power)) + return; + + mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MPEIN, 0, 0); + power_status = MLX5_GET(mpein_reg, out, pwr_status); + pci_power = MLX5_GET(mpein_reg, out, pci_power); + + switch (power_status) { + case MLX5_PCI_POWER_COULD_NOT_BE_READ: + mlx5_core_info_rl(dev, + "PCIe slot power capability was not advertised.\n"); + break; + case MLX5_PCI_POWER_INSUFFICIENT_REPORTED: + mlx5_core_warn_rl(dev, + "Detected insufficient power on the PCIe slot (%uW).\n", + pci_power); + break; + case MLX5_PCI_POWER_SUFFICIENT_REPORTED: + mlx5_core_info_rl(dev, + "PCIe slot advertised sufficient power (%uW).\n", + pci_power); + break; + } +} + +static int pcie_core(struct notifier_block *nb, unsigned long type, void *data) +{ + struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, + struct mlx5_event_nb, + nb); + struct mlx5_events *events = event_nb->ctx; + struct mlx5_eqe *eqe = data; + + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT: + queue_work(events->wq, &events->pcie_core_work); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +void mlx5_get_pme_stats(struct mlx5_core_dev *dev, struct mlx5_pme_stats *stats) +{ + *stats = dev->priv.events->pme_stats; +} + +/* forward event as is to registered interfaces (mlx5e/mlx5_ib) */ +static int forward_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_event_nb *event_nb = mlx5_nb_cof(nb, struct mlx5_event_nb, nb); + struct mlx5_events *events = event_nb->ctx; + struct mlx5_eqe *eqe = data; + + mlx5_core_dbg(events->dev, "Async eqe type %s, subtype (%d) forward to interfaces\n", + eqe_type_str(eqe->type), eqe->sub_type); + atomic_notifier_call_chain(&events->fw_nh, event, data); + return NOTIFY_OK; +} + +int mlx5_events_init(struct mlx5_core_dev *dev) +{ + struct mlx5_events *events = kzalloc(sizeof(*events), GFP_KERNEL); + + if (!events) + return -ENOMEM; + + ATOMIC_INIT_NOTIFIER_HEAD(&events->fw_nh); + events->dev = dev; + dev->priv.events = events; + events->wq = create_singlethread_workqueue("mlx5_events"); + if (!events->wq) { + kfree(events); + return -ENOMEM; + } + INIT_WORK(&events->pcie_core_work, mlx5_pcie_event); + BLOCKING_INIT_NOTIFIER_HEAD(&events->sw_nh); + + return 0; +} + +void mlx5_events_cleanup(struct mlx5_core_dev *dev) +{ + destroy_workqueue(dev->priv.events->wq); + kvfree(dev->priv.events); +} + +void mlx5_events_start(struct mlx5_core_dev *dev) +{ + struct mlx5_events *events = dev->priv.events; + int i; + + for (i = 0; i < ARRAY_SIZE(events_nbs_ref); i++) { + events->notifiers[i].nb = events_nbs_ref[i]; + events->notifiers[i].ctx = events; + mlx5_eq_notifier_register(dev, &events->notifiers[i].nb); + } +} + +void mlx5_events_stop(struct mlx5_core_dev *dev) +{ + struct mlx5_events *events = dev->priv.events; + int i; + + for (i = ARRAY_SIZE(events_nbs_ref) - 1; i >= 0 ; i--) + mlx5_eq_notifier_unregister(dev, &events->notifiers[i].nb); 
+ flush_workqueue(events->wq); +} + +/* This API is used only for processing and forwarding firmware + * events to mlx5 consumer. + */ +int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + struct mlx5_events *events = dev->priv.events; + + return atomic_notifier_chain_register(&events->fw_nh, nb); +} +EXPORT_SYMBOL(mlx5_notifier_register); + +int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + struct mlx5_events *events = dev->priv.events; + + return atomic_notifier_chain_unregister(&events->fw_nh, nb); +} +EXPORT_SYMBOL(mlx5_notifier_unregister); + +int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, void *data) +{ + return atomic_notifier_call_chain(&events->fw_nh, event, data); +} + +/* This API is used only for processing and forwarding driver-specific + * events to mlx5 consumers. + */ +int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + struct mlx5_events *events = dev->priv.events; + + return blocking_notifier_chain_register(&events->sw_nh, nb); +} + +int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + struct mlx5_events *events = dev->priv.events; + + return blocking_notifier_chain_unregister(&events->sw_nh, nb); +} + +int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event, + void *data) +{ + struct mlx5_events *events = dev->priv.events; + + return blocking_notifier_call_chain(&events->sw_nh, event, data); +} + +void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work) +{ + queue_work(dev->priv.events->wq, work); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c new file mode 100644 index 0000000..9a37077 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "mlx5_core.h" +#include "fpga/cmd.h" + +#define MLX5_FPGA_ACCESS_REG_SZ (MLX5_ST_SZ_DW(fpga_access_reg) + \ + MLX5_FPGA_ACCESS_REG_SIZE_MAX) + +int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, + void *buf, bool write) +{ + u32 in[MLX5_FPGA_ACCESS_REG_SZ] = {0}; + u32 out[MLX5_FPGA_ACCESS_REG_SZ]; + int err; + + if (size & 3) + return -EINVAL; + if (addr & 3) + return -EINVAL; + if (size > MLX5_FPGA_ACCESS_REG_SIZE_MAX) + return -EINVAL; + + MLX5_SET(fpga_access_reg, in, size, size); + MLX5_SET64(fpga_access_reg, in, address, addr); + if (write) + memcpy(MLX5_ADDR_OF(fpga_access_reg, in, data), buf, size); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_FPGA_ACCESS_REG, 0, write); + if (err) + return err; + + if (!write) + memcpy(buf, MLX5_ADDR_OF(fpga_access_reg, out, data), size); + + return 0; +} + +int mlx5_fpga_caps(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(fpga_cap)] = {0}; + + return mlx5_core_access_reg(dev, in, sizeof(in), dev->caps.fpga, + MLX5_ST_SZ_BYTES(fpga_cap), + MLX5_REG_FPGA_CAP, 0, 0); +} + +int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op) +{ + u32 in[MLX5_ST_SZ_DW(fpga_ctrl)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_ctrl)]; + + MLX5_SET(fpga_ctrl, in, operation, op); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_FPGA_CTRL, 0, true); +} + +int mlx5_fpga_sbu_caps(struct mlx5_core_dev *dev, void *caps, int size) +{ + unsigned int cap_size = MLX5_CAP_FPGA(dev, sandbox_extended_caps_len); + u64 addr = MLX5_CAP64_FPGA(dev, sandbox_extended_caps_addr); + unsigned int read; + int ret = 0; + + if (cap_size > size) { + mlx5_core_warn(dev, "Not enough buffer %u for FPGA SBU caps %u", + size, cap_size); + return -EINVAL; + } + + while (cap_size > 0) { + read = min_t(unsigned int, cap_size, + MLX5_FPGA_ACCESS_REG_SIZE_MAX); + + ret = mlx5_fpga_access_reg(dev, read, addr, caps, false); + if (ret) { + mlx5_core_warn(dev, "Error reading FPGA SBU caps %u bytes at address 0x%llx: %d", + read, addr, ret); + return ret; + } + + cap_size -= read; + addr += read; + caps += read; + } + + return ret; +} + +int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query) +{ + u32 in[MLX5_ST_SZ_DW(fpga_ctrl)] = {0}; + u32 out[MLX5_ST_SZ_DW(fpga_ctrl)]; + int err; + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_FPGA_CTRL, 0, false); + if (err) + return err; + + query->status = MLX5_GET(fpga_ctrl, out, status); + query->admin_image = MLX5_GET(fpga_ctrl, out, flash_select_admin); + query->oper_image = MLX5_GET(fpga_ctrl, out, flash_select_oper); + return 0; +} + +int mlx5_fpga_create_qp(struct mlx5_core_dev *dev, void *fpga_qpc, + u32 *fpga_qpn) +{ + u32 out[MLX5_ST_SZ_DW(fpga_create_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(fpga_create_qp_in)] = {}; + int ret; + + MLX5_SET(fpga_create_qp_in, in, opcode, MLX5_CMD_OP_FPGA_CREATE_QP); + memcpy(MLX5_ADDR_OF(fpga_create_qp_in, in, fpga_qpc), fpga_qpc, + MLX5_FLD_SZ_BYTES(fpga_create_qp_in, fpga_qpc)); + + ret = mlx5_cmd_exec_inout(dev, fpga_create_qp, in, out); + if (ret) + return ret; + + memcpy(fpga_qpc, MLX5_ADDR_OF(fpga_create_qp_out, out, fpga_qpc), + MLX5_FLD_SZ_BYTES(fpga_create_qp_out, fpga_qpc)); + *fpga_qpn = MLX5_GET(fpga_create_qp_out, out, fpga_qpn); + return ret; +} + +int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, + enum mlx5_fpga_qpc_field_select fields, + void *fpga_qpc) +{ + u32 in[MLX5_ST_SZ_DW(fpga_modify_qp_in)] = {}; + + 
MLX5_SET(fpga_modify_qp_in, in, opcode, MLX5_CMD_OP_FPGA_MODIFY_QP); + MLX5_SET(fpga_modify_qp_in, in, field_select, fields); + MLX5_SET(fpga_modify_qp_in, in, fpga_qpn, fpga_qpn); + memcpy(MLX5_ADDR_OF(fpga_modify_qp_in, in, fpga_qpc), fpga_qpc, + MLX5_FLD_SZ_BYTES(fpga_modify_qp_in, fpga_qpc)); + + return mlx5_cmd_exec_in(dev, fpga_modify_qp, in); +} + +int mlx5_fpga_query_qp(struct mlx5_core_dev *dev, + u32 fpga_qpn, void *fpga_qpc) +{ + u32 out[MLX5_ST_SZ_DW(fpga_query_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(fpga_query_qp_in)] = {}; + int ret; + + MLX5_SET(fpga_query_qp_in, in, opcode, MLX5_CMD_OP_FPGA_QUERY_QP); + MLX5_SET(fpga_query_qp_in, in, fpga_qpn, fpga_qpn); + + ret = mlx5_cmd_exec_inout(dev, fpga_query_qp, in, out); + if (ret) + return ret; + + memcpy(fpga_qpc, MLX5_ADDR_OF(fpga_query_qp_out, out, fpga_qpc), + MLX5_FLD_SZ_BYTES(fpga_query_qp_out, fpga_qpc)); + return ret; +} + +int mlx5_fpga_destroy_qp(struct mlx5_core_dev *dev, u32 fpga_qpn) +{ + u32 in[MLX5_ST_SZ_DW(fpga_destroy_qp_in)] = {}; + + MLX5_SET(fpga_destroy_qp_in, in, opcode, MLX5_CMD_OP_FPGA_DESTROY_QP); + MLX5_SET(fpga_destroy_qp_in, in, fpga_qpn, fpga_qpn); + + return mlx5_cmd_exec_in(dev, fpga_destroy_qp, in); +} + +int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn, + bool clear, struct mlx5_fpga_qp_counters *data) +{ + u32 out[MLX5_ST_SZ_DW(fpga_query_qp_counters_out)] = {}; + u32 in[MLX5_ST_SZ_DW(fpga_query_qp_counters_in)] = {}; + int ret; + + MLX5_SET(fpga_query_qp_counters_in, in, opcode, + MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS); + MLX5_SET(fpga_query_qp_counters_in, in, clear, clear); + MLX5_SET(fpga_query_qp_counters_in, in, fpga_qpn, fpga_qpn); + + ret = mlx5_cmd_exec_inout(dev, fpga_query_qp_counters, in, out); + if (ret) + return ret; + + data->rx_ack_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + rx_ack_packets); + data->rx_send_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + rx_send_packets); + data->tx_ack_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + tx_ack_packets); + data->tx_send_packets = MLX5_GET64(fpga_query_qp_counters_out, out, + tx_send_packets); + data->rx_total_drop = MLX5_GET64(fpga_query_qp_counters_out, out, + rx_total_drop); + + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h new file mode 100644 index 0000000..11621d2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_FPGA_H__ +#define __MLX5_FPGA_H__ + +#include + +enum mlx5_fpga_id { + MLX5_FPGA_NEWTON = 0, + MLX5_FPGA_EDISON = 1, + MLX5_FPGA_MORSE = 2, + MLX5_FPGA_MORSEQ = 3, +}; + +enum mlx5_fpga_image { + MLX5_FPGA_IMAGE_USER = 0, + MLX5_FPGA_IMAGE_FACTORY, +}; + +enum mlx5_fpga_status { + MLX5_FPGA_STATUS_SUCCESS = 0, + MLX5_FPGA_STATUS_FAILURE = 1, + MLX5_FPGA_STATUS_IN_PROGRESS = 2, + MLX5_FPGA_STATUS_NONE = 0xFFFF, +}; + +struct mlx5_fpga_query { + enum mlx5_fpga_image admin_image; + enum mlx5_fpga_image oper_image; + enum mlx5_fpga_status status; +}; + +enum mlx5_fpga_qpc_field_select { + MLX5_FPGA_QPC_STATE = BIT(0), +}; + +struct mlx5_fpga_qp_counters { + u64 rx_ack_packets; + u64 rx_send_packets; + u64 tx_ack_packets; + u64 tx_send_packets; + u64 rx_total_drop; +}; + +int mlx5_fpga_caps(struct mlx5_core_dev *dev); +int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query); +int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op); +int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr, + void *buf, bool write); +int mlx5_fpga_sbu_caps(struct mlx5_core_dev *dev, void *caps, int size); + +int mlx5_fpga_create_qp(struct mlx5_core_dev *dev, void *fpga_qpc, + u32 *fpga_qpn); +int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, + enum mlx5_fpga_qpc_field_select fields, void *fpga_qpc); +int mlx5_fpga_query_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, void *fpga_qpc); +int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn, + bool clear, struct mlx5_fpga_qp_counters *data); +int mlx5_fpga_destroy_qp(struct mlx5_core_dev *dev, u32 fpga_qpn); + +#endif /* __MLX5_FPGA_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c new file mode 100644 index 0000000..12abe99 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -0,0 +1,1001 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include "mlx5_core.h" +#include "lib/mlx5.h" +#include "fpga/conn.h" + +#define MLX5_FPGA_PKEY 0xFFFF +#define MLX5_FPGA_PKEY_INDEX 0 /* RoCE PKEY 0xFFFF is always at index 0 */ +#define MLX5_FPGA_RECV_SIZE 2048 +#define MLX5_FPGA_PORT_NUM 1 +#define MLX5_FPGA_CQ_BUDGET 64 + +static int mlx5_fpga_conn_map_buf(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct device *dma_device; + int err = 0; + + if (unlikely(!buf->sg[0].data)) + goto out; + + dma_device = mlx5_core_dma_dev(conn->fdev->mdev); + buf->sg[0].dma_addr = dma_map_single(dma_device, buf->sg[0].data, + buf->sg[0].size, buf->dma_dir); + err = dma_mapping_error(dma_device, buf->sg[0].dma_addr); + if (unlikely(err)) { + mlx5_fpga_warn(conn->fdev, "DMA error on sg 0: %d\n", err); + err = -ENOMEM; + goto out; + } + + if (!buf->sg[1].data) + goto out; + + buf->sg[1].dma_addr = dma_map_single(dma_device, buf->sg[1].data, + buf->sg[1].size, buf->dma_dir); + err = dma_mapping_error(dma_device, buf->sg[1].dma_addr); + if (unlikely(err)) { + mlx5_fpga_warn(conn->fdev, "DMA error on sg 1: %d\n", err); + dma_unmap_single(dma_device, buf->sg[0].dma_addr, + buf->sg[0].size, buf->dma_dir); + err = -ENOMEM; + } + +out: + return err; +} + +static void mlx5_fpga_conn_unmap_buf(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct device *dma_device; + + dma_device = mlx5_core_dma_dev(conn->fdev->mdev); + if (buf->sg[1].data) + dma_unmap_single(dma_device, buf->sg[1].dma_addr, + buf->sg[1].size, buf->dma_dir); + + if (likely(buf->sg[0].data)) + dma_unmap_single(dma_device, buf->sg[0].dma_addr, + buf->sg[0].size, buf->dma_dir); +} + +static int mlx5_fpga_conn_post_recv(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct mlx5_wqe_data_seg *data; + unsigned int ix; + int err = 0; + + err = mlx5_fpga_conn_map_buf(conn, buf); + if (unlikely(err)) + goto out; + + if (unlikely(conn->qp.rq.pc - conn->qp.rq.cc >= conn->qp.rq.size)) { + mlx5_fpga_conn_unmap_buf(conn, buf); + return -EBUSY; + } + + ix = conn->qp.rq.pc & (conn->qp.rq.size - 1); + data = mlx5_wq_cyc_get_wqe(&conn->qp.wq.rq, ix); + data->byte_count = cpu_to_be32(buf->sg[0].size); + data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey); + data->addr = cpu_to_be64(buf->sg[0].dma_addr); + + conn->qp.rq.pc++; + conn->qp.rq.bufs[ix] = buf; + + /* Make sure that descriptors are written before doorbell record. 
*/ + dma_wmb(); + *conn->qp.wq.rq.db = cpu_to_be32(conn->qp.rq.pc & 0xffff); +out: + return err; +} + +static void mlx5_fpga_conn_notify_hw(struct mlx5_fpga_conn *conn, void *wqe) +{ + /* ensure wqe is visible to device before updating doorbell record */ + dma_wmb(); + *conn->qp.wq.sq.db = cpu_to_be32(conn->qp.sq.pc); + /* Make sure that doorbell record is visible before ringing */ + wmb(); + mlx5_write64(wqe, conn->fdev->conn_res.uar->map + MLX5_BF_OFFSET); +} + +static void mlx5_fpga_conn_post_send(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + struct mlx5_wqe_ctrl_seg *ctrl; + struct mlx5_wqe_data_seg *data; + unsigned int ix, sgi; + int size = 1; + + ix = conn->qp.sq.pc & (conn->qp.sq.size - 1); + + ctrl = mlx5_wq_cyc_get_wqe(&conn->qp.wq.sq, ix); + data = (void *)(ctrl + 1); + + for (sgi = 0; sgi < ARRAY_SIZE(buf->sg); sgi++) { + if (!buf->sg[sgi].data) + break; + data->byte_count = cpu_to_be32(buf->sg[sgi].size); + data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey); + data->addr = cpu_to_be64(buf->sg[sgi].dma_addr); + data++; + size++; + } + + ctrl->imm = 0; + ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + ctrl->opmod_idx_opcode = cpu_to_be32(((conn->qp.sq.pc & 0xffff) << 8) | + MLX5_OPCODE_SEND); + ctrl->qpn_ds = cpu_to_be32(size | (conn->qp.qpn << 8)); + + conn->qp.sq.pc++; + conn->qp.sq.bufs[ix] = buf; + mlx5_fpga_conn_notify_hw(conn, ctrl); +} + +int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + unsigned long flags; + int err; + + if (!conn->qp.active) + return -ENOTCONN; + + buf->dma_dir = DMA_TO_DEVICE; + err = mlx5_fpga_conn_map_buf(conn, buf); + if (err) + return err; + + spin_lock_irqsave(&conn->qp.sq.lock, flags); + + if (conn->qp.sq.pc - conn->qp.sq.cc >= conn->qp.sq.size) { + list_add_tail(&buf->list, &conn->qp.sq.backlog); + goto out_unlock; + } + + mlx5_fpga_conn_post_send(conn, buf); + +out_unlock: + spin_unlock_irqrestore(&conn->qp.sq.lock, flags); + return err; +} + +static int mlx5_fpga_conn_post_recv_buf(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_dma_buf *buf; + int err; + + buf = kzalloc(sizeof(*buf) + MLX5_FPGA_RECV_SIZE, 0); + if (!buf) + return -ENOMEM; + + buf->sg[0].data = (void *)(buf + 1); + buf->sg[0].size = MLX5_FPGA_RECV_SIZE; + buf->dma_dir = DMA_FROM_DEVICE; + + err = mlx5_fpga_conn_post_recv(conn, buf); + if (err) + kfree(buf); + + return err; +} + +static int mlx5_fpga_conn_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + u32 *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + + kvfree(in); + return err; +} + +static void mlx5_fpga_conn_rq_cqe(struct mlx5_fpga_conn *conn, + struct mlx5_cqe64 *cqe, u8 status) +{ + struct mlx5_fpga_dma_buf *buf; + int ix, err; + + ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.rq.size - 1); + buf = conn->qp.rq.bufs[ix]; + conn->qp.rq.bufs[ix] = NULL; + conn->qp.rq.cc++; + + if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR))) + mlx5_fpga_warn(conn->fdev, "RQ buf %p on FPGA QP %u completion status %d\n", + buf, conn->fpga_qpn, status); + else + mlx5_fpga_dbg(conn->fdev, "RQ buf %p on 
FPGA QP %u completion status %d\n", + buf, conn->fpga_qpn, status); + + mlx5_fpga_conn_unmap_buf(conn, buf); + + if (unlikely(status || !conn->qp.active)) { + conn->qp.active = false; + kfree(buf); + return; + } + + buf->sg[0].size = be32_to_cpu(cqe->byte_cnt); + mlx5_fpga_dbg(conn->fdev, "Message with %u bytes received successfully\n", + buf->sg[0].size); + conn->recv_cb(conn->cb_arg, buf); + + buf->sg[0].size = MLX5_FPGA_RECV_SIZE; + err = mlx5_fpga_conn_post_recv(conn, buf); + if (unlikely(err)) { + mlx5_fpga_warn(conn->fdev, + "Failed to re-post recv buf: %d\n", err); + kfree(buf); + } +} + +static void mlx5_fpga_conn_sq_cqe(struct mlx5_fpga_conn *conn, + struct mlx5_cqe64 *cqe, u8 status) +{ + struct mlx5_fpga_dma_buf *buf, *nextbuf; + unsigned long flags; + int ix; + + spin_lock_irqsave(&conn->qp.sq.lock, flags); + + ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.sq.size - 1); + buf = conn->qp.sq.bufs[ix]; + conn->qp.sq.bufs[ix] = NULL; + conn->qp.sq.cc++; + + /* Handle backlog still under the spinlock to ensure message post order */ + if (unlikely(!list_empty(&conn->qp.sq.backlog))) { + if (likely(conn->qp.active)) { + nextbuf = list_first_entry(&conn->qp.sq.backlog, + struct mlx5_fpga_dma_buf, list); + list_del(&nextbuf->list); + mlx5_fpga_conn_post_send(conn, nextbuf); + } + } + + spin_unlock_irqrestore(&conn->qp.sq.lock, flags); + + if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR))) + mlx5_fpga_warn(conn->fdev, "SQ buf %p on FPGA QP %u completion status %d\n", + buf, conn->fpga_qpn, status); + else + mlx5_fpga_dbg(conn->fdev, "SQ buf %p on FPGA QP %u completion status %d\n", + buf, conn->fpga_qpn, status); + + mlx5_fpga_conn_unmap_buf(conn, buf); + + if (likely(buf->complete)) + buf->complete(conn, conn->fdev, buf, status); + + if (unlikely(status)) + conn->qp.active = false; +} + +static void mlx5_fpga_conn_handle_cqe(struct mlx5_fpga_conn *conn, + struct mlx5_cqe64 *cqe) +{ + u8 opcode, status = 0; + + opcode = get_cqe_opcode(cqe); + + switch (opcode) { + case MLX5_CQE_REQ_ERR: + status = ((struct mlx5_err_cqe *)cqe)->syndrome; + fallthrough; + case MLX5_CQE_REQ: + mlx5_fpga_conn_sq_cqe(conn, cqe, status); + break; + + case MLX5_CQE_RESP_ERR: + status = ((struct mlx5_err_cqe *)cqe)->syndrome; + fallthrough; + case MLX5_CQE_RESP_SEND: + mlx5_fpga_conn_rq_cqe(conn, cqe, status); + break; + default: + mlx5_fpga_warn(conn->fdev, "Unexpected cqe opcode %u\n", + opcode); + } +} + +static void mlx5_fpga_conn_arm_cq(struct mlx5_fpga_conn *conn) +{ + mlx5_cq_arm(&conn->cq.mcq, MLX5_CQ_DB_REQ_NOT, + conn->fdev->conn_res.uar->map, conn->cq.wq.cc); +} + +static inline void mlx5_fpga_conn_cqes(struct mlx5_fpga_conn *conn, + unsigned int budget) +{ + struct mlx5_cqe64 *cqe; + + while (budget) { + cqe = mlx5_cqwq_get_cqe(&conn->cq.wq); + if (!cqe) + break; + + budget--; + mlx5_cqwq_pop(&conn->cq.wq); + mlx5_fpga_conn_handle_cqe(conn, cqe); + mlx5_cqwq_update_db_record(&conn->cq.wq); + } + if (!budget) { + tasklet_schedule(&conn->cq.tasklet); + return; + } + + mlx5_fpga_dbg(conn->fdev, "Re-arming CQ with cc# %u\n", conn->cq.wq.cc); + /* ensure cq space is freed before enabling more cqes */ + wmb(); + mlx5_fpga_conn_arm_cq(conn); +} + +static void mlx5_fpga_conn_cq_tasklet(struct tasklet_struct *t) +{ + struct mlx5_fpga_conn *conn = from_tasklet(conn, t, cq.tasklet); + + if (unlikely(!conn->qp.active)) + return; + mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET); +} + +static void mlx5_fpga_conn_cq_complete(struct mlx5_core_cq *mcq, + struct mlx5_eqe *eqe) +{ + struct 
mlx5_fpga_conn *conn; + + conn = container_of(mcq, struct mlx5_fpga_conn, cq.mcq); + if (unlikely(!conn->qp.active)) + return; + mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET); +} + +static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {0}; + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_wq_param wqp; + struct mlx5_cqe64 *cqe; + int inlen, err, eqn; + void *cqc, *in; + __be64 *pas; + u32 i; + + cq_size = roundup_pow_of_two(cq_size); + MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size)); + + wqp.buf_numa_node = mdev->priv.numa_node; + wqp.db_numa_node = mdev->priv.numa_node; + + err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &conn->cq.wq, + &conn->cq.wq_ctrl); + if (err) + return err; + + for (i = 0; i < mlx5_cqwq_get_size(&conn->cq.wq); i++) { + cqe = mlx5_cqwq_get_wqe(&conn->cq.wq, i); + cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK; + } + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * conn->cq.wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_cqwq; + } + + err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn); + if (err) { + kvfree(in); + goto err_cqwq; + } + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(cq_size)); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, fdev->conn_res.uar->index); + MLX5_SET(cqc, cqc, log_page_size, conn->cq.wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, conn->cq.wq_ctrl.db.dma); + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); + mlx5_fill_page_frag_array(&conn->cq.wq_ctrl.buf, pas); + + err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen, out, sizeof(out)); + kvfree(in); + + if (err) + goto err_cqwq; + + conn->cq.mcq.cqe_sz = 64; + conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; + conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; + *conn->cq.mcq.set_ci_db = 0; + *conn->cq.mcq.arm_db = 0; + conn->cq.mcq.vector = 0; + conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; + conn->cq.mcq.uar = fdev->conn_res.uar; + tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); + + mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); + + goto out; + +err_cqwq: + mlx5_wq_destroy(&conn->cq.wq_ctrl); +out: + return err; +} + +static void mlx5_fpga_conn_destroy_cq(struct mlx5_fpga_conn *conn) +{ + tasklet_disable(&conn->cq.tasklet); + tasklet_kill(&conn->cq.tasklet); + mlx5_core_destroy_cq(conn->fdev->mdev, &conn->cq.mcq); + mlx5_wq_destroy(&conn->cq.wq_ctrl); +} + +static int mlx5_fpga_conn_create_wq(struct mlx5_fpga_conn *conn, void *qpc) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + struct mlx5_wq_param wqp; + + wqp.buf_numa_node = mdev->priv.numa_node; + wqp.db_numa_node = mdev->priv.numa_node; + + return mlx5_wq_qp_create(mdev, &wqp, qpc, &conn->qp.wq, + &conn->qp.wq_ctrl); +} + +static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn, + unsigned int tx_size, unsigned int rx_size) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; + void *in = NULL, *qpc; + int err, inlen; + + conn->qp.rq.pc = 0; + conn->qp.rq.cc = 0; + conn->qp.rq.size = roundup_pow_of_two(rx_size); + conn->qp.sq.pc = 0; + conn->qp.sq.cc = 0; + conn->qp.sq.size 
= roundup_pow_of_two(tx_size); + + MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4); + MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(conn->qp.rq.size)); + MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(conn->qp.sq.size)); + err = mlx5_fpga_conn_create_wq(conn, temp_qpc); + if (err) + goto out; + + conn->qp.rq.bufs = kvcalloc(conn->qp.rq.size, + sizeof(conn->qp.rq.bufs[0]), + GFP_KERNEL); + if (!conn->qp.rq.bufs) { + err = -ENOMEM; + goto err_wq; + } + + conn->qp.sq.bufs = kvcalloc(conn->qp.sq.size, + sizeof(conn->qp.sq.bufs[0]), + GFP_KERNEL); + if (!conn->qp.sq.bufs) { + err = -ENOMEM; + goto err_rq_bufs; + } + + inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * + conn->qp.wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_sq_bufs; + } + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, uar_page, fdev->conn_res.uar->index); + MLX5_SET(qpc, qpc, log_page_size, + conn->qp.wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, fdev->conn_res.pdn); + MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(conn->qp.rq.size)); + MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); + MLX5_SET(qpc, qpc, log_sq_size, ilog2(conn->qp.sq.size)); + MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn); + MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); + MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma); + if (MLX5_CAP_GEN(mdev, cqe_version) == 1) + MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); + + mlx5_fill_page_frag_array(&conn->qp.wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas)); + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + if (err) + goto err_sq_bufs; + + conn->qp.qpn = MLX5_GET(create_qp_out, out, qpn); + mlx5_fpga_dbg(fdev, "Created QP #0x%x\n", conn->qp.qpn); + + goto out; + +err_sq_bufs: + kvfree(conn->qp.sq.bufs); +err_rq_bufs: + kvfree(conn->qp.rq.bufs); +err_wq: + mlx5_wq_destroy(&conn->qp.wq_ctrl); +out: + kvfree(in); + return err; +} + +static void mlx5_fpga_conn_free_recv_bufs(struct mlx5_fpga_conn *conn) +{ + int ix; + + for (ix = 0; ix < conn->qp.rq.size; ix++) { + if (!conn->qp.rq.bufs[ix]) + continue; + mlx5_fpga_conn_unmap_buf(conn, conn->qp.rq.bufs[ix]); + kfree(conn->qp.rq.bufs[ix]); + conn->qp.rq.bufs[ix] = NULL; + } +} + +static void mlx5_fpga_conn_flush_send_bufs(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_dma_buf *buf, *temp; + int ix; + + for (ix = 0; ix < conn->qp.sq.size; ix++) { + buf = conn->qp.sq.bufs[ix]; + if (!buf) + continue; + conn->qp.sq.bufs[ix] = NULL; + mlx5_fpga_conn_unmap_buf(conn, buf); + if (!buf->complete) + continue; + buf->complete(conn, conn->fdev, buf, MLX5_CQE_SYNDROME_WR_FLUSH_ERR); + } + list_for_each_entry_safe(buf, temp, &conn->qp.sq.backlog, list) { + mlx5_fpga_conn_unmap_buf(conn, buf); + if (!buf->complete) + continue; + buf->complete(conn, conn->fdev, buf, MLX5_CQE_SYNDROME_WR_FLUSH_ERR); + } +} + +static void mlx5_fpga_conn_destroy_qp(struct mlx5_fpga_conn *conn) +{ + struct mlx5_core_dev *dev = conn->fdev->mdev; + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, conn->qp.qpn); + 
mlx5_cmd_exec_in(dev, destroy_qp, in); + + mlx5_fpga_conn_free_recv_bufs(conn); + mlx5_fpga_conn_flush_send_bufs(conn); + kvfree(conn->qp.sq.bufs); + kvfree(conn->qp.rq.bufs); + mlx5_wq_destroy(&conn->qp.wq_ctrl); +} + +static int mlx5_fpga_conn_reset_qp(struct mlx5_fpga_conn *conn) +{ + struct mlx5_core_dev *mdev = conn->fdev->mdev; + u32 in[MLX5_ST_SZ_DW(qp_2rst_in)] = {}; + + mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to RST\n", conn->qp.qpn); + + MLX5_SET(qp_2rst_in, in, opcode, MLX5_CMD_OP_2RST_QP); + MLX5_SET(qp_2rst_in, in, qpn, conn->qp.qpn); + + return mlx5_cmd_exec_in(mdev, qp_2rst, in); +} + +static int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) +{ + u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 *qpc; + + mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to INIT\n", conn->qp.qpn); + + qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); + MLX5_SET(qpc, qpc, pd, conn->fdev->conn_res.pdn); + MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn); + MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); + MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma); + + MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, in, qpn, conn->qp.qpn); + + return mlx5_cmd_exec_in(mdev, rst2init_qp, in); +} + +static int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) +{ + u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; + struct mlx5_fpga_device *fdev = conn->fdev; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 *qpc; + + mlx5_fpga_dbg(conn->fdev, "QP RTR\n"); + + qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_1K_BYTES); + MLX5_SET(qpc, qpc, log_msg_max, (u8)MLX5_CAP_GEN(mdev, log_max_msg)); + MLX5_SET(qpc, qpc, remote_qpn, conn->fpga_qpn); + MLX5_SET(qpc, qpc, next_rcv_psn, + MLX5_GET(fpga_qpc, conn->fpga_qpc, next_send_psn)); + MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); + ether_addr_copy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), + MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_mac_47_32)); + MLX5_SET(qpc, qpc, primary_address_path.udp_sport, + MLX5_CAP_ROCE(mdev, r_roce_min_src_udp_port)); + MLX5_SET(qpc, qpc, primary_address_path.src_addr_index, + conn->qp.sgid_index); + MLX5_SET(qpc, qpc, primary_address_path.hop_limit, 0); + memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_ip), + MLX5_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)); + + MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, in, qpn, conn->qp.qpn); + + return mlx5_cmd_exec_in(mdev, init2rtr_qp, in); +} + +static int mlx5_fpga_conn_rts_qp(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; + struct mlx5_core_dev *mdev = fdev->mdev; + u32 *qpc; + + mlx5_fpga_dbg(conn->fdev, "QP RTS\n"); + + qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, log_ack_req_freq, 8); + MLX5_SET(qpc, qpc, min_rnr_nak, 0x12); + MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x12); /* ~1.07s */ + MLX5_SET(qpc, qpc, next_send_psn, + MLX5_GET(fpga_qpc, 
conn->fpga_qpc, next_rcv_psn)); + MLX5_SET(qpc, qpc, retry_count, 7); + MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */ + + MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, in, qpn, conn->qp.qpn); + MLX5_SET(rtr2rts_qp_in, in, opt_param_mask, MLX5_QP_OPTPAR_RNR_TIMEOUT); + + return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); +} + +static int mlx5_fpga_conn_connect(struct mlx5_fpga_conn *conn) +{ + struct mlx5_fpga_device *fdev = conn->fdev; + int err; + + MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_ACTIVE); + err = mlx5_fpga_modify_qp(conn->fdev->mdev, conn->fpga_qpn, + MLX5_FPGA_QPC_STATE, &conn->fpga_qpc); + if (err) { + mlx5_fpga_err(fdev, "Failed to activate FPGA RC QP: %d\n", err); + goto out; + } + + err = mlx5_fpga_conn_reset_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to change QP state to reset\n"); + goto err_fpga_qp; + } + + err = mlx5_fpga_conn_init_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to modify QP from RESET to INIT\n"); + goto err_fpga_qp; + } + conn->qp.active = true; + + while (!mlx5_fpga_conn_post_recv_buf(conn)) + ; + + err = mlx5_fpga_conn_rtr_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to change QP state from INIT to RTR\n"); + goto err_recv_bufs; + } + + err = mlx5_fpga_conn_rts_qp(conn); + if (err) { + mlx5_fpga_err(fdev, "Failed to change QP state from RTR to RTS\n"); + goto err_recv_bufs; + } + goto out; + +err_recv_bufs: + mlx5_fpga_conn_free_recv_bufs(conn); +err_fpga_qp: + MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_INIT); + if (mlx5_fpga_modify_qp(conn->fdev->mdev, conn->fpga_qpn, + MLX5_FPGA_QPC_STATE, &conn->fpga_qpc)) + mlx5_fpga_err(fdev, "Failed to revert FPGA QP to INIT\n"); +out: + return err; +} + +struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr, + enum mlx5_ifc_fpga_qp_type qp_type) +{ + struct mlx5_fpga_conn *ret, *conn; + u8 *remote_mac, *remote_ip; + int err; + + if (!attr->recv_cb) + return ERR_PTR(-EINVAL); + + conn = kzalloc(sizeof(*conn), GFP_KERNEL); + if (!conn) + return ERR_PTR(-ENOMEM); + + conn->fdev = fdev; + INIT_LIST_HEAD(&conn->qp.sq.backlog); + + spin_lock_init(&conn->qp.sq.lock); + + conn->recv_cb = attr->recv_cb; + conn->cb_arg = attr->cb_arg; + + remote_mac = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_mac_47_32); + err = mlx5_query_mac_address(fdev->mdev, remote_mac); + if (err) { + mlx5_fpga_err(fdev, "Failed to query local MAC: %d\n", err); + ret = ERR_PTR(err); + goto err; + } + + /* Build Modified EUI-64 IPv6 address from the MAC address */ + remote_ip = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_ip); + remote_ip[0] = 0xfe; + remote_ip[1] = 0x80; + addrconf_addr_eui48(&remote_ip[8], remote_mac); + + err = mlx5_core_reserved_gid_alloc(fdev->mdev, &conn->qp.sgid_index); + if (err) { + mlx5_fpga_err(fdev, "Failed to allocate SGID: %d\n", err); + ret = ERR_PTR(err); + goto err; + } + + err = mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, + MLX5_ROCE_VERSION_2, + MLX5_ROCE_L3_TYPE_IPV6, + remote_ip, remote_mac, true, 0, + MLX5_FPGA_PORT_NUM); + if (err) { + mlx5_fpga_err(fdev, "Failed to set SGID: %d\n", err); + ret = ERR_PTR(err); + goto err_rsvd_gid; + } + mlx5_fpga_dbg(fdev, "Reserved SGID index %u\n", conn->qp.sgid_index); + + /* Allow for one cqe per rx/tx wqe, plus one cqe for the next wqe, + * created during processing of the cqe + */ + err = mlx5_fpga_conn_create_cq(conn, + (attr->tx_size + attr->rx_size) * 2); + if (err) { + 
mlx5_fpga_err(fdev, "Failed to create CQ: %d\n", err); + ret = ERR_PTR(err); + goto err_gid; + } + + mlx5_fpga_conn_arm_cq(conn); + + err = mlx5_fpga_conn_create_qp(conn, attr->tx_size, attr->rx_size); + if (err) { + mlx5_fpga_err(fdev, "Failed to create QP: %d\n", err); + ret = ERR_PTR(err); + goto err_cq; + } + + MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_INIT); + MLX5_SET(fpga_qpc, conn->fpga_qpc, qp_type, qp_type); + MLX5_SET(fpga_qpc, conn->fpga_qpc, st, MLX5_FPGA_QPC_ST_RC); + MLX5_SET(fpga_qpc, conn->fpga_qpc, ether_type, ETH_P_8021Q); + MLX5_SET(fpga_qpc, conn->fpga_qpc, vid, 0); + MLX5_SET(fpga_qpc, conn->fpga_qpc, next_rcv_psn, 1); + MLX5_SET(fpga_qpc, conn->fpga_qpc, next_send_psn, 0); + MLX5_SET(fpga_qpc, conn->fpga_qpc, pkey, MLX5_FPGA_PKEY); + MLX5_SET(fpga_qpc, conn->fpga_qpc, remote_qpn, conn->qp.qpn); + MLX5_SET(fpga_qpc, conn->fpga_qpc, rnr_retry, 7); + MLX5_SET(fpga_qpc, conn->fpga_qpc, retry_count, 7); + + err = mlx5_fpga_create_qp(fdev->mdev, &conn->fpga_qpc, + &conn->fpga_qpn); + if (err) { + mlx5_fpga_err(fdev, "Failed to create FPGA RC QP: %d\n", err); + ret = ERR_PTR(err); + goto err_qp; + } + + err = mlx5_fpga_conn_connect(conn); + if (err) { + ret = ERR_PTR(err); + goto err_conn; + } + + mlx5_fpga_dbg(fdev, "FPGA QPN is %u\n", conn->fpga_qpn); + ret = conn; + goto out; + +err_conn: + mlx5_fpga_destroy_qp(conn->fdev->mdev, conn->fpga_qpn); +err_qp: + mlx5_fpga_conn_destroy_qp(conn); +err_cq: + mlx5_fpga_conn_destroy_cq(conn); +err_gid: + mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, 0, 0, NULL, + NULL, false, 0, MLX5_FPGA_PORT_NUM); +err_rsvd_gid: + mlx5_core_reserved_gid_free(fdev->mdev, conn->qp.sgid_index); +err: + kfree(conn); +out: + return ret; +} + +void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn) +{ + conn->qp.active = false; + tasklet_disable(&conn->cq.tasklet); + synchronize_irq(conn->cq.mcq.irqn); + + mlx5_fpga_destroy_qp(conn->fdev->mdev, conn->fpga_qpn); + mlx5_fpga_conn_destroy_qp(conn); + mlx5_fpga_conn_destroy_cq(conn); + + mlx5_core_roce_gid_set(conn->fdev->mdev, conn->qp.sgid_index, 0, 0, + NULL, NULL, false, 0, MLX5_FPGA_PORT_NUM); + mlx5_core_reserved_gid_free(conn->fdev->mdev, conn->qp.sgid_index); + kfree(conn); +} + +int mlx5_fpga_conn_device_init(struct mlx5_fpga_device *fdev) +{ + int err; + + err = mlx5_nic_vport_enable_roce(fdev->mdev); + if (err) { + mlx5_fpga_err(fdev, "Failed to enable RoCE: %d\n", err); + goto out; + } + + fdev->conn_res.uar = mlx5_get_uars_page(fdev->mdev); + if (IS_ERR(fdev->conn_res.uar)) { + err = PTR_ERR(fdev->conn_res.uar); + mlx5_fpga_err(fdev, "get_uars_page failed, %d\n", err); + goto err_roce; + } + mlx5_fpga_dbg(fdev, "Allocated UAR index %u\n", + fdev->conn_res.uar->index); + + err = mlx5_core_alloc_pd(fdev->mdev, &fdev->conn_res.pdn); + if (err) { + mlx5_fpga_err(fdev, "alloc pd failed, %d\n", err); + goto err_uar; + } + mlx5_fpga_dbg(fdev, "Allocated PD %u\n", fdev->conn_res.pdn); + + err = mlx5_fpga_conn_create_mkey(fdev->mdev, fdev->conn_res.pdn, + &fdev->conn_res.mkey); + if (err) { + mlx5_fpga_err(fdev, "create mkey failed, %d\n", err); + goto err_dealloc_pd; + } + mlx5_fpga_dbg(fdev, "Created mkey 0x%x\n", fdev->conn_res.mkey); + + return 0; + +err_dealloc_pd: + mlx5_core_dealloc_pd(fdev->mdev, fdev->conn_res.pdn); +err_uar: + mlx5_put_uars_page(fdev->mdev, fdev->conn_res.uar); +err_roce: + mlx5_nic_vport_disable_roce(fdev->mdev); +out: + return err; +} + +void mlx5_fpga_conn_device_cleanup(struct mlx5_fpga_device *fdev) +{ + mlx5_core_destroy_mkey(fdev->mdev, 
fdev->conn_res.mkey); + mlx5_core_dealloc_pd(fdev->mdev, fdev->conn_res.pdn); + mlx5_put_uars_page(fdev->mdev, fdev->conn_res.uar); + mlx5_nic_vport_disable_roce(fdev->mdev); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h new file mode 100644 index 0000000..5116e86 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __MLX5_FPGA_CONN_H__ +#define __MLX5_FPGA_CONN_H__ + +#include +#include + +#include "fpga/core.h" +#include "fpga/sdk.h" +#include "wq.h" + +struct mlx5_fpga_conn { + struct mlx5_fpga_device *fdev; + + void (*recv_cb)(void *cb_arg, struct mlx5_fpga_dma_buf *buf); + void *cb_arg; + + /* FPGA QP */ + u32 fpga_qpc[MLX5_ST_SZ_DW(fpga_qpc)]; + u32 fpga_qpn; + + /* CQ */ + struct { + struct mlx5_cqwq wq; + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5_core_cq mcq; + struct tasklet_struct tasklet; + } cq; + + /* QP */ + struct { + bool active; + int sgid_index; + struct mlx5_wq_qp wq; + struct mlx5_wq_ctrl wq_ctrl; + u32 qpn; + struct { + spinlock_t lock; /* Protects all SQ state */ + unsigned int pc; + unsigned int cc; + unsigned int size; + struct mlx5_fpga_dma_buf **bufs; + struct list_head backlog; + } sq; + struct { + unsigned int pc; + unsigned int cc; + unsigned int size; + struct mlx5_fpga_dma_buf **bufs; + } rq; + } qp; +}; + +int mlx5_fpga_conn_device_init(struct mlx5_fpga_device *fdev); +void mlx5_fpga_conn_device_cleanup(struct mlx5_fpga_device *fdev); +struct mlx5_fpga_conn * +mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr, + enum mlx5_ifc_fpga_qp_type qp_type); +void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn); +int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf); + +#endif /* __MLX5_FPGA_CONN_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c new file mode 100644 index 0000000..2ce4241 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#include "mlx5_core.h" +#include "lib/mlx5.h" +#include "lib/eq.h" +#include "fpga/core.h" +#include "fpga/conn.h" + +static const char *const mlx5_fpga_error_strings[] = { + "Null Syndrome", + "Corrupted DDR", + "Flash Timeout", + "Internal Link Error", + "Watchdog HW Failure", + "I2C Failure", + "Image Changed", + "Temperature Critical", +}; + +static const char * const mlx5_fpga_qp_error_strings[] = { + "Null Syndrome", + "Retry Counter Expired", + "RNR Expired", +}; +static struct mlx5_fpga_device *mlx5_fpga_device_alloc(void) +{ + struct mlx5_fpga_device *fdev = NULL; + + fdev = kzalloc(sizeof(*fdev), GFP_KERNEL); + if (!fdev) + return NULL; + + spin_lock_init(&fdev->state_lock); + fdev->state = MLX5_FPGA_STATUS_NONE; + return fdev; +} + +static const char *mlx5_fpga_image_name(enum mlx5_fpga_image image) +{ + switch (image) { + case MLX5_FPGA_IMAGE_USER: + return "user"; + case MLX5_FPGA_IMAGE_FACTORY: + return "factory"; + default: + return "unknown"; + } +} + +static const char *mlx5_fpga_name(u32 fpga_id) +{ + static char ret[32]; + + switch (fpga_id) { + case MLX5_FPGA_NEWTON: + return "Newton"; + case MLX5_FPGA_EDISON: + return "Edison"; + case MLX5_FPGA_MORSE: + return "Morse"; + case MLX5_FPGA_MORSEQ: + return "MorseQ"; + } + + snprintf(ret, sizeof(ret), "Unknown %d", fpga_id); + return ret; +} + +static int mlx5_is_fpga_lookaside(u32 fpga_id) +{ + return fpga_id != MLX5_FPGA_NEWTON && fpga_id != MLX5_FPGA_EDISON; +} + +static int mlx5_fpga_device_load_check(struct mlx5_fpga_device *fdev) +{ + struct mlx5_fpga_query query; + int err; + + err = mlx5_fpga_query(fdev->mdev, &query); + if (err) { + mlx5_fpga_err(fdev, "Failed to query status: %d\n", err); + return err; + } + + fdev->last_admin_image = query.admin_image; + fdev->last_oper_image = query.oper_image; + + mlx5_fpga_info(fdev, "Status %u; Admin image %u; Oper image %u\n", + query.status, query.admin_image, query.oper_image); + + /* for FPGA lookaside projects FPGA load status is not important */ + if (mlx5_is_fpga_lookaside(MLX5_CAP_FPGA(fdev->mdev, fpga_id))) + return 0; + + if (query.status != MLX5_FPGA_STATUS_SUCCESS) { + mlx5_fpga_err(fdev, "%s image failed to load; status %u\n", + mlx5_fpga_image_name(fdev->last_oper_image), + query.status); + return -EIO; + } + + return 0; +} + +static int mlx5_fpga_device_brb(struct mlx5_fpga_device *fdev) +{ + int err; + struct mlx5_core_dev *mdev = fdev->mdev; + + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON); + if (err) { + mlx5_fpga_err(fdev, "Failed to set bypass on: %d\n", err); + return err; + } + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_RESET_SANDBOX); + if (err) { + mlx5_fpga_err(fdev, "Failed to reset SBU: %d\n", err); + return err; + } + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_OFF); + if (err) { + mlx5_fpga_err(fdev, "Failed to set bypass off: %d\n", err); + return err; + } + return 0; +} + +static int mlx5_fpga_event(struct mlx5_fpga_device *, unsigned long, void *); + +static int fpga_err_event(struct notifier_block *nb, unsigned long event, void *eqe) +{ + struct mlx5_fpga_device *fdev = mlx5_nb_cof(nb, struct mlx5_fpga_device, fpga_err_nb); + + return mlx5_fpga_event(fdev, event, eqe); +} + +static int fpga_qp_err_event(struct notifier_block *nb, unsigned long event, void *eqe) +{ + struct mlx5_fpga_device *fdev = mlx5_nb_cof(nb, struct mlx5_fpga_device, fpga_qp_err_nb); + + return mlx5_fpga_event(fdev, event, eqe); +} + +int mlx5_fpga_device_start(struct 
mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned int max_num_qps; + unsigned long flags; + u32 fpga_id; + int err; + + if (!fdev) + return 0; + + err = mlx5_fpga_caps(fdev->mdev); + if (err) + goto out; + + err = mlx5_fpga_device_load_check(fdev); + if (err) + goto out; + + fpga_id = MLX5_CAP_FPGA(fdev->mdev, fpga_id); + mlx5_fpga_info(fdev, "FPGA card %s:%u\n", mlx5_fpga_name(fpga_id), fpga_id); + + /* No QPs if FPGA does not participate in net processing */ + if (mlx5_is_fpga_lookaside(fpga_id)) + goto out; + + mlx5_fpga_info(fdev, "%s(%d): image, version %u; SBU %06x:%04x version %d\n", + mlx5_fpga_image_name(fdev->last_oper_image), + fdev->last_oper_image, + MLX5_CAP_FPGA(fdev->mdev, image_version), + MLX5_CAP_FPGA(fdev->mdev, ieee_vendor_id), + MLX5_CAP_FPGA(fdev->mdev, sandbox_product_id), + MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version)); + + max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps); + if (!max_num_qps) { + mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n"); + err = -ENOTSUPP; + goto out; + } + + err = mlx5_core_reserve_gids(mdev, max_num_qps); + if (err) + goto out; + + MLX5_NB_INIT(&fdev->fpga_err_nb, fpga_err_event, FPGA_ERROR); + MLX5_NB_INIT(&fdev->fpga_qp_err_nb, fpga_qp_err_event, FPGA_QP_ERROR); + mlx5_eq_notifier_register(fdev->mdev, &fdev->fpga_err_nb); + mlx5_eq_notifier_register(fdev->mdev, &fdev->fpga_qp_err_nb); + + err = mlx5_fpga_conn_device_init(fdev); + if (err) + goto err_rsvd_gid; + + if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) { + err = mlx5_fpga_device_brb(fdev); + if (err) + goto err_conn_init; + } + + goto out; + +err_conn_init: + mlx5_fpga_conn_device_cleanup(fdev); + +err_rsvd_gid: + mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_err_nb); + mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_qp_err_nb); + mlx5_core_unreserve_gids(mdev, max_num_qps); +out: + spin_lock_irqsave(&fdev->state_lock, flags); + fdev->state = err ? 
MLX5_FPGA_STATUS_FAILURE : MLX5_FPGA_STATUS_SUCCESS; + spin_unlock_irqrestore(&fdev->state_lock, flags); + return err; +} + +int mlx5_fpga_init(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = NULL; + + if (!MLX5_CAP_GEN(mdev, fpga)) { + mlx5_core_dbg(mdev, "FPGA capability not present\n"); + return 0; + } + + mlx5_core_dbg(mdev, "Initializing FPGA\n"); + + fdev = mlx5_fpga_device_alloc(); + if (!fdev) + return -ENOMEM; + + fdev->mdev = mdev; + mdev->fpga = fdev; + + return 0; +} + +void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned int max_num_qps; + unsigned long flags; + int err; + + if (!fdev) + return; + + if (mlx5_is_fpga_lookaside(MLX5_CAP_FPGA(fdev->mdev, fpga_id))) + return; + + spin_lock_irqsave(&fdev->state_lock, flags); + if (fdev->state != MLX5_FPGA_STATUS_SUCCESS) { + spin_unlock_irqrestore(&fdev->state_lock, flags); + return; + } + fdev->state = MLX5_FPGA_STATUS_NONE; + spin_unlock_irqrestore(&fdev->state_lock, flags); + + if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) { + err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON); + if (err) + mlx5_fpga_err(fdev, "Failed to re-set SBU bypass on: %d\n", + err); + } + + mlx5_fpga_conn_device_cleanup(fdev); + mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_err_nb); + mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_qp_err_nb); + + max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps); + mlx5_core_unreserve_gids(mdev, max_num_qps); +} + +void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + + mlx5_fpga_device_stop(mdev); + kfree(fdev); + mdev->fpga = NULL; +} + +static const char *mlx5_fpga_syndrome_to_string(u8 syndrome) +{ + if (syndrome < ARRAY_SIZE(mlx5_fpga_error_strings)) + return mlx5_fpga_error_strings[syndrome]; + return "Unknown"; +} + +static const char *mlx5_fpga_qp_syndrome_to_string(u8 syndrome) +{ + if (syndrome < ARRAY_SIZE(mlx5_fpga_qp_error_strings)) + return mlx5_fpga_qp_error_strings[syndrome]; + return "Unknown"; +} + +static int mlx5_fpga_event(struct mlx5_fpga_device *fdev, + unsigned long event, void *eqe) +{ + void *data = ((struct mlx5_eqe *)eqe)->data.raw; + const char *event_name; + bool teardown = false; + unsigned long flags; + u8 syndrome; + + switch (event) { + case MLX5_EVENT_TYPE_FPGA_ERROR: + syndrome = MLX5_GET(fpga_error_event, data, syndrome); + event_name = mlx5_fpga_syndrome_to_string(syndrome); + break; + case MLX5_EVENT_TYPE_FPGA_QP_ERROR: + syndrome = MLX5_GET(fpga_qp_error_event, data, syndrome); + event_name = mlx5_fpga_qp_syndrome_to_string(syndrome); + break; + default: + return NOTIFY_DONE; + } + + spin_lock_irqsave(&fdev->state_lock, flags); + switch (fdev->state) { + case MLX5_FPGA_STATUS_SUCCESS: + mlx5_fpga_warn(fdev, "Error %u: %s\n", syndrome, event_name); + teardown = true; + break; + default: + mlx5_fpga_warn_ratelimited(fdev, "Unexpected error event %u: %s\n", + syndrome, event_name); + } + spin_unlock_irqrestore(&fdev->state_lock, flags); + /* We tear-down the card's interfaces and functionality because + * the FPGA bump-on-the-wire is misbehaving and we lose ability + * to communicate with the network. 
User may still be able to + * recover by re-programming or debugging the FPGA + */ + if (teardown) + mlx5_trigger_health_work(fdev->mdev); + + return NOTIFY_OK; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h new file mode 100644 index 0000000..2a984e8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_FPGA_CORE_H__ +#define __MLX5_FPGA_CORE_H__ + +#ifdef CONFIG_MLX5_FPGA + +#include + +#include "mlx5_core.h" +#include "lib/eq.h" +#include "fpga/cmd.h" + +/* Represents an Innova device */ +struct mlx5_fpga_device { + struct mlx5_core_dev *mdev; + struct mlx5_nb fpga_err_nb; + struct mlx5_nb fpga_qp_err_nb; + spinlock_t state_lock; /* Protects state transitions */ + enum mlx5_fpga_status state; + enum mlx5_fpga_image last_admin_image; + enum mlx5_fpga_image last_oper_image; + + /* QP Connection resources */ + struct { + u32 pdn; + u32 mkey; + struct mlx5_uars_page *uar; + } conn_res; + + struct mlx5_fpga_ipsec *ipsec; + struct mlx5_fpga_tls *tls; +}; + +#define mlx5_fpga_dbg(__adev, format, ...) \ + mlx5_core_dbg((__adev)->mdev, "FPGA: %s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##__VA_ARGS__) + +#define mlx5_fpga_err(__adev, format, ...) \ + mlx5_core_err((__adev)->mdev, "FPGA: %s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##__VA_ARGS__) + +#define mlx5_fpga_warn(__adev, format, ...) \ + mlx5_core_warn((__adev)->mdev, "FPGA: %s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, ##__VA_ARGS__) + +#define mlx5_fpga_warn_ratelimited(__adev, format, ...) \ + mlx5_core_err_rl((__adev)->mdev, "FPGA: %s:%d: " \ + format, __func__, __LINE__, ##__VA_ARGS__) + +#define mlx5_fpga_notice(__adev, format, ...) \ + mlx5_core_info((__adev)->mdev, "FPGA: " format, ##__VA_ARGS__) + +#define mlx5_fpga_info(__adev, format, ...) 
\ + mlx5_core_info((__adev)->mdev, "FPGA: " format, ##__VA_ARGS__) + +int mlx5_fpga_init(struct mlx5_core_dev *mdev); +void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev); +int mlx5_fpga_device_start(struct mlx5_core_dev *mdev); +void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev); + +#else + +static inline int mlx5_fpga_init(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev) +{ +} + +static inline int mlx5_fpga_device_start(struct mlx5_core_dev *mdev) +{ + return 0; +} + +static inline void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev) +{ +} + +#endif + +#endif /* __MLX5_FPGA_CORE_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c new file mode 100644 index 0000000..9b1fb84 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -0,0 +1,1583 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include +#include + +#include "mlx5_core.h" +#include "fs_cmd.h" +#include "fpga/ipsec.h" +#include "fpga/sdk.h" +#include "fpga/core.h" + +enum mlx5_fpga_ipsec_cmd_status { + MLX5_FPGA_IPSEC_CMD_PENDING, + MLX5_FPGA_IPSEC_CMD_SEND_FAIL, + MLX5_FPGA_IPSEC_CMD_COMPLETE, +}; + +struct mlx5_fpga_ipsec_cmd_context { + struct mlx5_fpga_dma_buf buf; + enum mlx5_fpga_ipsec_cmd_status status; + struct mlx5_ifc_fpga_ipsec_cmd_resp resp; + int status_code; + struct completion complete; + struct mlx5_fpga_device *dev; + struct list_head list; /* Item in pending_cmds */ + u8 command[]; +}; + +struct mlx5_fpga_esp_xfrm; + +struct mlx5_fpga_ipsec_sa_ctx { + struct rhash_head hash; + struct mlx5_ifc_fpga_ipsec_sa hw_sa; + u32 sa_handle; + struct mlx5_core_dev *dev; + struct mlx5_fpga_esp_xfrm *fpga_xfrm; +}; + +struct mlx5_fpga_esp_xfrm { + unsigned int num_rules; + struct mlx5_fpga_ipsec_sa_ctx *sa_ctx; + struct mutex lock; /* xfrm lock */ + struct mlx5_accel_esp_xfrm accel_xfrm; +}; + +struct mlx5_fpga_ipsec_rule { + struct rb_node node; + struct fs_fte *fte; + struct mlx5_fpga_ipsec_sa_ctx *ctx; +}; + +static const struct rhashtable_params rhash_sa = { + /* Keep out "cmd" field from the key as it's + * value is not constant during the lifetime + * of the key object. + */ + .key_len = sizeof_field(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) - + sizeof_field(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), + .key_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) + + sizeof_field(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd), + .head_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hash), + .automatic_shrinking = true, + .min_size = 1, +}; + +struct mlx5_fpga_ipsec { + struct mlx5_fpga_device *fdev; + struct list_head pending_cmds; + spinlock_t pending_cmds_lock; /* Protects pending_cmds */ + u32 caps[MLX5_ST_SZ_DW(ipsec_extended_cap)]; + struct mlx5_fpga_conn *conn; + + struct notifier_block fs_notifier_ingress_bypass; + struct notifier_block fs_notifier_egress; + + /* Map hardware SA --> SA context + * (mlx5_fpga_ipsec_sa) (mlx5_fpga_ipsec_sa_ctx) + * We will use this hash to avoid SAs duplication in fpga which + * aren't allowed + */ + struct rhashtable sa_hash; /* hw_sa -> mlx5_fpga_ipsec_sa_ctx */ + struct mutex sa_hash_lock; + + /* Tree holding all rules for this fpga device + * Key for searching a rule (mlx5_fpga_ipsec_rule) is (ft, id) + */ + struct rb_root rules_rb; + struct mutex rules_rb_lock; /* rules lock */ + + struct ida halloc; +}; + +bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) +{ + if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga)) + return false; + + if (MLX5_CAP_FPGA(mdev, ieee_vendor_id) != + MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX) + return false; + + if (MLX5_CAP_FPGA(mdev, sandbox_product_id) != + MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC) + return false; + + return true; +} + +static void mlx5_fpga_ipsec_send_complete(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *buf, + u8 status) +{ + struct mlx5_fpga_ipsec_cmd_context *context; + + if (status) { + context = container_of(buf, struct mlx5_fpga_ipsec_cmd_context, + buf); + mlx5_fpga_warn(fdev, "IPSec command send failed with status %u\n", + status); + context->status = MLX5_FPGA_IPSEC_CMD_SEND_FAIL; + complete(&context->complete); + } +} + +static inline +int syndrome_to_errno(enum mlx5_ifc_fpga_ipsec_response_syndrome syndrome) +{ + switch (syndrome) { + case MLX5_FPGA_IPSEC_RESPONSE_SUCCESS: + return 0; + case MLX5_FPGA_IPSEC_RESPONSE_SADB_ISSUE: + return 
-EEXIST; + case MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST: + return -EINVAL; + case MLX5_FPGA_IPSEC_RESPONSE_WRITE_RESPONSE_ISSUE: + return -EIO; + } + return -EIO; +} + +static void mlx5_fpga_ipsec_recv(void *cb_arg, struct mlx5_fpga_dma_buf *buf) +{ + struct mlx5_ifc_fpga_ipsec_cmd_resp *resp = buf->sg[0].data; + struct mlx5_fpga_ipsec_cmd_context *context; + enum mlx5_ifc_fpga_ipsec_response_syndrome syndrome; + struct mlx5_fpga_device *fdev = cb_arg; + unsigned long flags; + + if (buf->sg[0].size < sizeof(*resp)) { + mlx5_fpga_warn(fdev, "Short receive from FPGA IPSec: %u < %zu bytes\n", + buf->sg[0].size, sizeof(*resp)); + return; + } + + mlx5_fpga_dbg(fdev, "mlx5_ipsec recv_cb syndrome %08x\n", + ntohl(resp->syndrome)); + + spin_lock_irqsave(&fdev->ipsec->pending_cmds_lock, flags); + context = list_first_entry_or_null(&fdev->ipsec->pending_cmds, + struct mlx5_fpga_ipsec_cmd_context, + list); + if (context) + list_del(&context->list); + spin_unlock_irqrestore(&fdev->ipsec->pending_cmds_lock, flags); + + if (!context) { + mlx5_fpga_warn(fdev, "Received IPSec offload response without pending command request\n"); + return; + } + mlx5_fpga_dbg(fdev, "Handling response for %p\n", context); + + syndrome = ntohl(resp->syndrome); + context->status_code = syndrome_to_errno(syndrome); + context->status = MLX5_FPGA_IPSEC_CMD_COMPLETE; + memcpy(&context->resp, resp, sizeof(*resp)); + + if (context->status_code) + mlx5_fpga_warn(fdev, "IPSec command failed with syndrome %08x\n", + syndrome); + + complete(&context->complete); +} + +static void *mlx5_fpga_ipsec_cmd_exec(struct mlx5_core_dev *mdev, + const void *cmd, int cmd_size) +{ + struct mlx5_fpga_ipsec_cmd_context *context; + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned long flags; + int res; + + if (!fdev || !fdev->ipsec) + return ERR_PTR(-EOPNOTSUPP); + + if (cmd_size & 3) + return ERR_PTR(-EINVAL); + + context = kzalloc(sizeof(*context) + cmd_size, GFP_ATOMIC); + if (!context) + return ERR_PTR(-ENOMEM); + + context->status = MLX5_FPGA_IPSEC_CMD_PENDING; + context->dev = fdev; + context->buf.complete = mlx5_fpga_ipsec_send_complete; + init_completion(&context->complete); + memcpy(&context->command, cmd, cmd_size); + context->buf.sg[0].size = cmd_size; + context->buf.sg[0].data = &context->command; + + spin_lock_irqsave(&fdev->ipsec->pending_cmds_lock, flags); + res = mlx5_fpga_sbu_conn_sendmsg(fdev->ipsec->conn, &context->buf); + if (!res) + list_add_tail(&context->list, &fdev->ipsec->pending_cmds); + spin_unlock_irqrestore(&fdev->ipsec->pending_cmds_lock, flags); + + if (res) { + mlx5_fpga_warn(fdev, "Failed to send IPSec command: %d\n", res); + kfree(context); + return ERR_PTR(res); + } + + /* Context should be freed by the caller after completion. 
*/ + return context; +} + +static int mlx5_fpga_ipsec_cmd_wait(void *ctx) +{ + struct mlx5_fpga_ipsec_cmd_context *context = ctx; + unsigned long timeout = + msecs_to_jiffies(MLX5_FPGA_CMD_TIMEOUT_MSEC); + int res; + + res = wait_for_completion_timeout(&context->complete, timeout); + if (!res) { + mlx5_fpga_warn(context->dev, "Failure waiting for IPSec command response\n"); + return -ETIMEDOUT; + } + + if (context->status == MLX5_FPGA_IPSEC_CMD_COMPLETE) + res = context->status_code; + else + res = -EIO; + + return res; +} + +static inline bool is_v2_sadb_supported(struct mlx5_fpga_ipsec *fipsec) +{ + if (MLX5_GET(ipsec_extended_cap, fipsec->caps, v2_command)) + return true; + return false; +} + +static int mlx5_fpga_ipsec_update_hw_sa(struct mlx5_fpga_device *fdev, + struct mlx5_ifc_fpga_ipsec_sa *hw_sa, + int opcode) +{ + struct mlx5_core_dev *dev = fdev->mdev; + struct mlx5_ifc_fpga_ipsec_sa *sa; + struct mlx5_fpga_ipsec_cmd_context *cmd_context; + size_t sa_cmd_size; + int err; + + hw_sa->ipsec_sa_v1.cmd = htonl(opcode); + if (is_v2_sadb_supported(fdev->ipsec)) + sa_cmd_size = sizeof(*hw_sa); + else + sa_cmd_size = sizeof(hw_sa->ipsec_sa_v1); + + cmd_context = (struct mlx5_fpga_ipsec_cmd_context *) + mlx5_fpga_ipsec_cmd_exec(dev, hw_sa, sa_cmd_size); + if (IS_ERR(cmd_context)) + return PTR_ERR(cmd_context); + + err = mlx5_fpga_ipsec_cmd_wait(cmd_context); + if (err) + goto out; + + sa = (struct mlx5_ifc_fpga_ipsec_sa *)&cmd_context->command; + if (sa->ipsec_sa_v1.sw_sa_handle != cmd_context->resp.sw_sa_handle) { + mlx5_fpga_err(fdev, "mismatch SA handle. cmd 0x%08x vs resp 0x%08x\n", + ntohl(sa->ipsec_sa_v1.sw_sa_handle), + ntohl(cmd_context->resp.sw_sa_handle)); + err = -EIO; + } + +out: + kfree(cmd_context); + return err; +} + +u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + u32 ret = 0; + + if (mlx5_fpga_is_ipsec_device(mdev)) { + ret |= MLX5_ACCEL_IPSEC_CAP_DEVICE; + ret |= MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA; + } else { + return ret; + } + + if (!fdev->ipsec) + return ret; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, esp)) + ret |= MLX5_ACCEL_IPSEC_CAP_ESP; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, ipv6)) + ret |= MLX5_ACCEL_IPSEC_CAP_IPV6; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, lso)) + ret |= MLX5_ACCEL_IPSEC_CAP_LSO; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, rx_no_trailer)) + ret |= MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER; + + if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, esn)) { + ret |= MLX5_ACCEL_IPSEC_CAP_ESN; + ret |= MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN; + } + + return ret; +} + +static unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + + if (!fdev || !fdev->ipsec) + return 0; + + return MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, + number_of_ipsec_counters); +} + +static int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters, + unsigned int counters_count) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + unsigned int i; + __be32 *data; + u32 count; + u64 addr; + int ret; + + if (!fdev || !fdev->ipsec) + return 0; + + addr = (u64)MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, + ipsec_counters_addr_low) + + ((u64)MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, + ipsec_counters_addr_high) << 32); + + count = mlx5_fpga_ipsec_counters_count(mdev); + + data = kzalloc(array3_size(sizeof(*data), count, 2), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto out; + 
} + + ret = mlx5_fpga_mem_read(fdev, count * sizeof(u64), addr, data, + MLX5_FPGA_ACCESS_TYPE_DONTCARE); + if (ret < 0) { + mlx5_fpga_err(fdev, "Failed to read IPSec counters from HW: %d\n", + ret); + goto out; + } + ret = 0; + + if (count > counters_count) + count = counters_count; + + /* Each counter is low word, then high. But each word is big-endian */ + for (i = 0; i < count; i++) + counters[i] = (u64)ntohl(data[i * 2]) | + ((u64)ntohl(data[i * 2 + 1]) << 32); + +out: + kfree(data); + return ret; +} + +static int mlx5_fpga_ipsec_set_caps(struct mlx5_core_dev *mdev, u32 flags) +{ + struct mlx5_fpga_ipsec_cmd_context *context; + struct mlx5_ifc_fpga_ipsec_cmd_cap cmd = {0}; + int err; + + cmd.cmd = htonl(MLX5_FPGA_IPSEC_CMD_OP_SET_CAP); + cmd.flags = htonl(flags); + context = mlx5_fpga_ipsec_cmd_exec(mdev, &cmd, sizeof(cmd)); + if (IS_ERR(context)) + return PTR_ERR(context); + + err = mlx5_fpga_ipsec_cmd_wait(context); + if (err) + goto out; + + if ((context->resp.flags & cmd.flags) != cmd.flags) { + mlx5_fpga_err(context->dev, "Failed to set capabilities. cmd 0x%08x vs resp 0x%08x\n", + cmd.flags, + context->resp.flags); + err = -EIO; + } + +out: + kfree(context); + return err; +} + +static int mlx5_fpga_ipsec_enable_supported_caps(struct mlx5_core_dev *mdev) +{ + u32 dev_caps = mlx5_fpga_ipsec_device_caps(mdev); + u32 flags = 0; + + if (dev_caps & MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER) + flags |= MLX5_FPGA_IPSEC_CAP_NO_TRAILER; + + return mlx5_fpga_ipsec_set_caps(mdev, flags); +} + +static void +mlx5_fpga_ipsec_build_hw_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *xfrm_attrs, + struct mlx5_ifc_fpga_ipsec_sa *hw_sa) +{ + const struct aes_gcm_keymat *aes_gcm = &xfrm_attrs->keymat.aes_gcm; + + /* key */ + memcpy(&hw_sa->ipsec_sa_v1.key_enc, aes_gcm->aes_key, + aes_gcm->key_len / 8); + /* Duplicate 128 bit key twice according to HW layout */ + if (aes_gcm->key_len == 128) + memcpy(&hw_sa->ipsec_sa_v1.key_enc[16], + aes_gcm->aes_key, aes_gcm->key_len / 8); + + /* salt and seq_iv */ + memcpy(&hw_sa->ipsec_sa_v1.gcm.salt_iv, &aes_gcm->seq_iv, + sizeof(aes_gcm->seq_iv)); + memcpy(&hw_sa->ipsec_sa_v1.gcm.salt, &aes_gcm->salt, + sizeof(aes_gcm->salt)); + + /* esn */ + if (xfrm_attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) { + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_ESN_EN; + hw_sa->ipsec_sa_v1.flags |= + (xfrm_attrs->flags & + MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) ? + MLX5_FPGA_IPSEC_SA_ESN_OVERLAP : 0; + hw_sa->esn = htonl(xfrm_attrs->esn); + } else { + hw_sa->ipsec_sa_v1.flags &= ~MLX5_FPGA_IPSEC_SA_ESN_EN; + hw_sa->ipsec_sa_v1.flags &= + ~(xfrm_attrs->flags & + MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) ? 
+ MLX5_FPGA_IPSEC_SA_ESN_OVERLAP : 0; + hw_sa->esn = 0; + } + + /* rx handle */ + hw_sa->ipsec_sa_v1.sw_sa_handle = htonl(xfrm_attrs->sa_handle); + + /* enc mode */ + switch (aes_gcm->key_len) { + case 128: + hw_sa->ipsec_sa_v1.enc_mode = + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_128_AUTH_128; + break; + case 256: + hw_sa->ipsec_sa_v1.enc_mode = + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_256_AUTH_128; + break; + } + + /* flags */ + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_SA_VALID | + MLX5_FPGA_IPSEC_SA_SPI_EN | + MLX5_FPGA_IPSEC_SA_IP_ESP; + + if (xfrm_attrs->action & MLX5_ACCEL_ESP_ACTION_ENCRYPT) + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_DIR_SX; + else + hw_sa->ipsec_sa_v1.flags &= ~MLX5_FPGA_IPSEC_SA_DIR_SX; +} + +static void +mlx5_fpga_ipsec_build_hw_sa(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm_attrs *xfrm_attrs, + const __be32 saddr[4], + const __be32 daddr[4], + const __be32 spi, bool is_ipv6, + struct mlx5_ifc_fpga_ipsec_sa *hw_sa) +{ + mlx5_fpga_ipsec_build_hw_xfrm(mdev, xfrm_attrs, hw_sa); + + /* IPs */ + memcpy(hw_sa->ipsec_sa_v1.sip, saddr, sizeof(hw_sa->ipsec_sa_v1.sip)); + memcpy(hw_sa->ipsec_sa_v1.dip, daddr, sizeof(hw_sa->ipsec_sa_v1.dip)); + + /* SPI */ + hw_sa->ipsec_sa_v1.spi = spi; + + /* flags */ + if (is_ipv6) + hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_IPV6; +} + +static bool is_full_mask(const void *p, size_t len) +{ + WARN_ON(len % 4); + + return !memchr_inv(p, 0xff, len); +} + +static bool validate_fpga_full_mask(struct mlx5_core_dev *dev, + const u32 *match_c, + const u32 *match_v) +{ + const void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + match_c, + misc_parameters); + const void *headers_c = MLX5_ADDR_OF(fte_match_param, + match_c, + outer_headers); + const void *headers_v = MLX5_ADDR_OF(fte_match_param, + match_v, + outer_headers); + + if (mlx5_fs_is_outer_ipv4_flow(dev, headers_c, headers_v)) { + const void *s_ipv4_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4); + const void *d_ipv4_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + if (!is_full_mask(s_ipv4_c, MLX5_FLD_SZ_BYTES(ipv4_layout, + ipv4)) || + !is_full_mask(d_ipv4_c, MLX5_FLD_SZ_BYTES(ipv4_layout, + ipv4))) + return false; + } else { + const void *s_ipv6_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6); + const void *d_ipv6_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4, + headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6); + + if (!is_full_mask(s_ipv6_c, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6)) || + !is_full_mask(d_ipv6_c, MLX5_FLD_SZ_BYTES(ipv6_layout, + ipv6))) + return false; + } + + if (!is_full_mask(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c, + outer_esp_spi), + MLX5_FLD_SZ_BYTES(fte_match_set_misc, outer_esp_spi))) + return false; + + return true; +} + +static bool mlx5_is_fpga_ipsec_rule(struct mlx5_core_dev *dev, + u8 match_criteria_enable, + const u32 *match_c, + const u32 *match_v) +{ + u32 ipsec_dev_caps = mlx5_fpga_ipsec_device_caps(dev); + bool ipv6_flow; + + ipv6_flow = mlx5_fs_is_outer_ipv6_flow(dev, match_c, match_v); + + if (!(match_criteria_enable & MLX5_MATCH_OUTER_HEADERS) || + mlx5_fs_is_outer_udp_flow(match_c, match_v) || + mlx5_fs_is_outer_tcp_flow(match_c, match_v) || + mlx5_fs_is_vxlan_flow(match_c) || + !(mlx5_fs_is_outer_ipv4_flow(dev, match_c, match_v) || + ipv6_flow)) + return false; + + if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_DEVICE)) + return false; + + if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_ESP) && + 
mlx5_fs_is_outer_ipsec_flow(match_c)) + return false; + + if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_IPV6) && + ipv6_flow) + return false; + + if (!validate_fpga_full_mask(dev, match_c, match_v)) + return false; + + return true; +} + +static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev, + u8 match_criteria_enable, + const u32 *match_c, + const u32 *match_v, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_context *flow_context) +{ + const void *outer_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + bool is_dmac = MLX5_GET(fte_match_set_lyr_2_4, outer_c, dmac_47_16) || + MLX5_GET(fte_match_set_lyr_2_4, outer_c, dmac_15_0); + bool is_smac = MLX5_GET(fte_match_set_lyr_2_4, outer_c, smac_47_16) || + MLX5_GET(fte_match_set_lyr_2_4, outer_c, smac_15_0); + int ret; + + ret = mlx5_is_fpga_ipsec_rule(dev, match_criteria_enable, match_c, + match_v); + if (!ret) + return ret; + + if (is_dmac || is_smac || + (match_criteria_enable & + ~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) || + (flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) || + (flow_context->flags & FLOW_CONTEXT_HAS_TAG)) + return false; + + return true; +} + +static void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev, + struct mlx5_accel_esp_xfrm *accel_xfrm, + const __be32 saddr[4], const __be32 daddr[4], + const __be32 spi, bool is_ipv6, u32 pdn, + u32 *sa_handle) +{ + struct mlx5_fpga_ipsec_sa_ctx *sa_ctx; + struct mlx5_fpga_esp_xfrm *fpga_xfrm = + container_of(accel_xfrm, typeof(*fpga_xfrm), + accel_xfrm); + struct mlx5_fpga_device *fdev = mdev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + int opcode, err; + void *context; + + /* alloc SA */ + sa_ctx = kzalloc(sizeof(*sa_ctx), GFP_KERNEL); + if (!sa_ctx) + return ERR_PTR(-ENOMEM); + + sa_ctx->dev = mdev; + + /* build candidate SA */ + mlx5_fpga_ipsec_build_hw_sa(mdev, &accel_xfrm->attrs, + saddr, daddr, spi, is_ipv6, + &sa_ctx->hw_sa); + + mutex_lock(&fpga_xfrm->lock); + + if (fpga_xfrm->sa_ctx) { /* multiple rules for same accel_xfrm */ + /* all rules must be with same IPs and SPI */ + if (memcmp(&sa_ctx->hw_sa, &fpga_xfrm->sa_ctx->hw_sa, + sizeof(sa_ctx->hw_sa))) { + context = ERR_PTR(-EINVAL); + goto exists; + } + + ++fpga_xfrm->num_rules; + context = fpga_xfrm->sa_ctx; + goto exists; + } + + if (accel_xfrm->attrs.action == MLX5_ACCEL_ESP_ACTION_DECRYPT) { + err = ida_alloc_min(&fipsec->halloc, 1, GFP_KERNEL); + if (err < 0) { + context = ERR_PTR(err); + goto exists; + } + + sa_ctx->sa_handle = err; + if (sa_handle) + *sa_handle = sa_ctx->sa_handle; + } + /* This is unbounded fpga_xfrm, try to add to hash */ + mutex_lock(&fipsec->sa_hash_lock); + + err = rhashtable_lookup_insert_fast(&fipsec->sa_hash, &sa_ctx->hash, + rhash_sa); + if (err) { + /* Can't bound different accel_xfrm to already existing sa_ctx. + * This is because we can't support multiple ketmats for + * same IPs and SPI + */ + context = ERR_PTR(-EEXIST); + goto unlock_hash; + } + + /* Bound accel_xfrm to sa_ctx */ + opcode = is_v2_sadb_supported(fdev->ipsec) ? 
+ MLX5_FPGA_IPSEC_CMD_OP_ADD_SA_V2 : + MLX5_FPGA_IPSEC_CMD_OP_ADD_SA; + err = mlx5_fpga_ipsec_update_hw_sa(fdev, &sa_ctx->hw_sa, opcode); + sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0; + if (err) { + context = ERR_PTR(err); + goto delete_hash; + } + + mutex_unlock(&fipsec->sa_hash_lock); + + ++fpga_xfrm->num_rules; + fpga_xfrm->sa_ctx = sa_ctx; + sa_ctx->fpga_xfrm = fpga_xfrm; + + mutex_unlock(&fpga_xfrm->lock); + + return sa_ctx; + +delete_hash: + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, &sa_ctx->hash, + rhash_sa)); +unlock_hash: + mutex_unlock(&fipsec->sa_hash_lock); + if (accel_xfrm->attrs.action == MLX5_ACCEL_ESP_ACTION_DECRYPT) + ida_free(&fipsec->halloc, sa_ctx->sa_handle); +exists: + mutex_unlock(&fpga_xfrm->lock); + kfree(sa_ctx); + return context; +} + +static void * +mlx5_fpga_ipsec_fs_create_sa_ctx(struct mlx5_core_dev *mdev, + struct fs_fte *fte, + bool is_egress) +{ + struct mlx5_accel_esp_xfrm *accel_xfrm; + __be32 saddr[4], daddr[4], spi; + struct mlx5_flow_group *fg; + bool is_ipv6 = false; + + fs_get_obj(fg, fte->node.parent); + /* validate */ + if (is_egress && + !mlx5_is_fpga_egress_ipsec_rule(mdev, + fg->mask.match_criteria_enable, + fg->mask.match_criteria, + fte->val, + &fte->action, + &fte->flow_context)) + return ERR_PTR(-EINVAL); + else if (!mlx5_is_fpga_ipsec_rule(mdev, + fg->mask.match_criteria_enable, + fg->mask.match_criteria, + fte->val)) + return ERR_PTR(-EINVAL); + + /* get xfrm context */ + accel_xfrm = + (struct mlx5_accel_esp_xfrm *)fte->action.esp_id; + + /* IPs */ + if (mlx5_fs_is_outer_ipv4_flow(mdev, fg->mask.match_criteria, + fte->val)) { + memcpy(&saddr[3], + MLX5_ADDR_OF(fte_match_set_lyr_2_4, + fte->val, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + sizeof(saddr[3])); + memcpy(&daddr[3], + MLX5_ADDR_OF(fte_match_set_lyr_2_4, + fte->val, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + sizeof(daddr[3])); + } else { + memcpy(saddr, + MLX5_ADDR_OF(fte_match_param, + fte->val, + outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(saddr)); + memcpy(daddr, + MLX5_ADDR_OF(fte_match_param, + fte->val, + outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(daddr)); + is_ipv6 = true; + } + + /* SPI */ + spi = MLX5_GET_BE(typeof(spi), + fte_match_param, fte->val, + misc_parameters.outer_esp_spi); + + /* create */ + return mlx5_fpga_ipsec_create_sa_ctx(mdev, accel_xfrm, + saddr, daddr, + spi, is_ipv6, 0, NULL); +} + +static void +mlx5_fpga_ipsec_release_sa_ctx(struct mlx5_fpga_ipsec_sa_ctx *sa_ctx) +{ + struct mlx5_fpga_device *fdev = sa_ctx->dev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + int opcode = is_v2_sadb_supported(fdev->ipsec) ? 
+ MLX5_FPGA_IPSEC_CMD_OP_DEL_SA_V2 : + MLX5_FPGA_IPSEC_CMD_OP_DEL_SA; + int err; + + err = mlx5_fpga_ipsec_update_hw_sa(fdev, &sa_ctx->hw_sa, opcode); + sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0; + if (err) { + WARN_ON(err); + return; + } + + if (sa_ctx->fpga_xfrm->accel_xfrm.attrs.action == + MLX5_ACCEL_ESP_ACTION_DECRYPT) + ida_free(&fipsec->halloc, sa_ctx->sa_handle); + + mutex_lock(&fipsec->sa_hash_lock); + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, &sa_ctx->hash, + rhash_sa)); + mutex_unlock(&fipsec->sa_hash_lock); +} + +static void mlx5_fpga_ipsec_delete_sa_ctx(void *context) +{ + struct mlx5_fpga_esp_xfrm *fpga_xfrm = + ((struct mlx5_fpga_ipsec_sa_ctx *)context)->fpga_xfrm; + + mutex_lock(&fpga_xfrm->lock); + if (!--fpga_xfrm->num_rules) { + mlx5_fpga_ipsec_release_sa_ctx(fpga_xfrm->sa_ctx); + kfree(fpga_xfrm->sa_ctx); + fpga_xfrm->sa_ctx = NULL; + } + mutex_unlock(&fpga_xfrm->lock); +} + +static inline struct mlx5_fpga_ipsec_rule * +_rule_search(struct rb_root *root, struct fs_fte *fte) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct mlx5_fpga_ipsec_rule *rule = + container_of(node, struct mlx5_fpga_ipsec_rule, + node); + + if (rule->fte < fte) + node = node->rb_left; + else if (rule->fte > fte) + node = node->rb_right; + else + return rule; + } + return NULL; +} + +static struct mlx5_fpga_ipsec_rule * +rule_search(struct mlx5_fpga_ipsec *ipsec_dev, struct fs_fte *fte) +{ + struct mlx5_fpga_ipsec_rule *rule; + + mutex_lock(&ipsec_dev->rules_rb_lock); + rule = _rule_search(&ipsec_dev->rules_rb, fte); + mutex_unlock(&ipsec_dev->rules_rb_lock); + + return rule; +} + +static inline int _rule_insert(struct rb_root *root, + struct mlx5_fpga_ipsec_rule *rule) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct mlx5_fpga_ipsec_rule *this = + container_of(*new, struct mlx5_fpga_ipsec_rule, + node); + + parent = *new; + if (rule->fte < this->fte) + new = &((*new)->rb_left); + else if (rule->fte > this->fte) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. 
*/ + rb_link_node(&rule->node, parent, new); + rb_insert_color(&rule->node, root); + + return 0; +} + +static int rule_insert(struct mlx5_fpga_ipsec *ipsec_dev, + struct mlx5_fpga_ipsec_rule *rule) +{ + int ret; + + mutex_lock(&ipsec_dev->rules_rb_lock); + ret = _rule_insert(&ipsec_dev->rules_rb, rule); + mutex_unlock(&ipsec_dev->rules_rb_lock); + + return ret; +} + +static inline void _rule_delete(struct mlx5_fpga_ipsec *ipsec_dev, + struct mlx5_fpga_ipsec_rule *rule) +{ + struct rb_root *root = &ipsec_dev->rules_rb; + + mutex_lock(&ipsec_dev->rules_rb_lock); + rb_erase(&rule->node, root); + mutex_unlock(&ipsec_dev->rules_rb_lock); +} + +static void rule_delete(struct mlx5_fpga_ipsec *ipsec_dev, + struct mlx5_fpga_ipsec_rule *rule) +{ + _rule_delete(ipsec_dev, rule); + kfree(rule); +} + +struct mailbox_mod { + uintptr_t saved_esp_id; + u32 saved_action; + u32 saved_outer_esp_spi_value; +}; + +static void restore_spec_mailbox(struct fs_fte *fte, + struct mailbox_mod *mbox_mod) +{ + char *misc_params_v = MLX5_ADDR_OF(fte_match_param, + fte->val, + misc_parameters); + + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, + mbox_mod->saved_outer_esp_spi_value); + fte->action.action |= mbox_mod->saved_action; + fte->action.esp_id = (uintptr_t)mbox_mod->saved_esp_id; +} + +static void modify_spec_mailbox(struct mlx5_core_dev *mdev, + struct fs_fte *fte, + struct mailbox_mod *mbox_mod) +{ + char *misc_params_v = MLX5_ADDR_OF(fte_match_param, + fte->val, + misc_parameters); + + mbox_mod->saved_esp_id = fte->action.esp_id; + mbox_mod->saved_action = fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT); + mbox_mod->saved_outer_esp_spi_value = + MLX5_GET(fte_match_set_misc, misc_params_v, + outer_esp_spi); + + fte->action.esp_id = 0; + fte->action.action &= ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT); + if (!MLX5_CAP_FLOWTABLE(mdev, + flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, 0); +} + +static enum fs_flow_table_type egress_to_fs_ft(bool egress) +{ + return egress ? 
FS_FT_NIC_TX : FS_FT_NIC_RX; +} + +static int fpga_ipsec_fs_create_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 *in, + struct mlx5_flow_group *fg, + bool is_egress) +{ + int (*create_flow_group)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, u32 *in, + struct mlx5_flow_group *fg) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->create_flow_group; + char *misc_params_c = MLX5_ADDR_OF(create_flow_group_in, in, + match_criteria.misc_parameters); + struct mlx5_core_dev *dev = ns->dev; + u32 saved_outer_esp_spi_mask; + u8 match_criteria_enable; + int ret; + + if (MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive.ft_field_support.outer_esp_spi)) + return create_flow_group(ns, ft, in, fg); + + match_criteria_enable = + MLX5_GET(create_flow_group_in, in, match_criteria_enable); + saved_outer_esp_spi_mask = + MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); + if (!match_criteria_enable || !saved_outer_esp_spi_mask) + return create_flow_group(ns, ft, in, fg); + + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, 0); + + if (!(*misc_params_c) && + !memcmp(misc_params_c, misc_params_c + 1, MLX5_ST_SZ_BYTES(fte_match_set_misc) - 1)) + MLX5_SET(create_flow_group_in, in, match_criteria_enable, + match_criteria_enable & ~MLX5_MATCH_MISC_PARAMETERS); + + ret = create_flow_group(ns, ft, in, fg); + + MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, saved_outer_esp_spi_mask); + MLX5_SET(create_flow_group_in, in, match_criteria_enable, match_criteria_enable); + + return ret; +} + +static int fpga_ipsec_fs_create_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte, + bool is_egress) +{ + int (*create_fte)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->create_fte; + struct mlx5_core_dev *dev = ns->dev; + struct mlx5_fpga_device *fdev = dev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + struct mlx5_fpga_ipsec_rule *rule; + bool is_esp = fte->action.esp_id; + struct mailbox_mod mbox_mod; + int ret; + + if (!is_esp || + !(fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT))) + return create_fte(ns, ft, fg, fte); + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) + return -ENOMEM; + + rule->ctx = mlx5_fpga_ipsec_fs_create_sa_ctx(dev, fte, is_egress); + if (IS_ERR(rule->ctx)) { + int err = PTR_ERR(rule->ctx); + + kfree(rule); + return err; + } + + rule->fte = fte; + WARN_ON(rule_insert(fipsec, rule)); + + modify_spec_mailbox(dev, fte, &mbox_mod); + ret = create_fte(ns, ft, fg, fte); + restore_spec_mailbox(fte, &mbox_mod); + if (ret) { + _rule_delete(fipsec, rule); + mlx5_fpga_ipsec_delete_sa_ctx(rule->ctx); + kfree(rule); + } + + return ret; +} + +static int fpga_ipsec_fs_update_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + int modify_mask, + struct fs_fte *fte, + bool is_egress) +{ + int (*update_fte)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + int modify_mask, + struct fs_fte *fte) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->update_fte; + struct mlx5_core_dev *dev = ns->dev; + bool is_esp = fte->action.esp_id; + struct mailbox_mod mbox_mod; + int ret; + + if (!is_esp || + !(fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | 
+ MLX5_FLOW_CONTEXT_ACTION_DECRYPT))) + return update_fte(ns, ft, fg, modify_mask, fte); + + modify_spec_mailbox(dev, fte, &mbox_mod); + ret = update_fte(ns, ft, fg, modify_mask, fte); + restore_spec_mailbox(fte, &mbox_mod); + + return ret; +} + +static int fpga_ipsec_fs_delete_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte, + bool is_egress) +{ + int (*delete_fte)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte) = + mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->delete_fte; + struct mlx5_core_dev *dev = ns->dev; + struct mlx5_fpga_device *fdev = dev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + struct mlx5_fpga_ipsec_rule *rule; + bool is_esp = fte->action.esp_id; + struct mailbox_mod mbox_mod; + int ret; + + if (!is_esp || + !(fte->action.action & + (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT))) + return delete_fte(ns, ft, fte); + + rule = rule_search(fipsec, fte); + if (!rule) + return -ENOENT; + + mlx5_fpga_ipsec_delete_sa_ctx(rule->ctx); + rule_delete(fipsec, rule); + + modify_spec_mailbox(dev, fte, &mbox_mod); + ret = delete_fte(ns, ft, fte); + restore_spec_mailbox(fte, &mbox_mod); + + return ret; +} + +static int +mlx5_fpga_ipsec_fs_create_flow_group_egress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 *in, + struct mlx5_flow_group *fg) +{ + return fpga_ipsec_fs_create_flow_group(ns, ft, in, fg, true); +} + +static int +mlx5_fpga_ipsec_fs_create_fte_egress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_create_fte(ns, ft, fg, fte, true); +} + +static int +mlx5_fpga_ipsec_fs_update_fte_egress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + int modify_mask, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_update_fte(ns, ft, fg, modify_mask, fte, + true); +} + +static int +mlx5_fpga_ipsec_fs_delete_fte_egress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_delete_fte(ns, ft, fte, true); +} + +static int +mlx5_fpga_ipsec_fs_create_flow_group_ingress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 *in, + struct mlx5_flow_group *fg) +{ + return fpga_ipsec_fs_create_flow_group(ns, ft, in, fg, false); +} + +static int +mlx5_fpga_ipsec_fs_create_fte_ingress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_create_fte(ns, ft, fg, fte, false); +} + +static int +mlx5_fpga_ipsec_fs_update_fte_ingress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + int modify_mask, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_update_fte(ns, ft, fg, modify_mask, fte, + false); +} + +static int +mlx5_fpga_ipsec_fs_delete_fte_ingress(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + return fpga_ipsec_fs_delete_fte(ns, ft, fte, false); +} + +static struct mlx5_flow_cmds fpga_ipsec_ingress; +static struct mlx5_flow_cmds fpga_ipsec_egress; + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type) +{ + switch (type) { + case FS_FT_NIC_RX: + return &fpga_ipsec_ingress; + case FS_FT_NIC_TX: + return &fpga_ipsec_egress; + default: + WARN_ON(true); + return NULL; + } +} + +static int 
mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_conn_attr init_attr = {0}; + struct mlx5_fpga_device *fdev = mdev->fpga; + struct mlx5_fpga_conn *conn; + int err; + + if (!mlx5_fpga_is_ipsec_device(mdev)) + return 0; + + fdev->ipsec = kzalloc(sizeof(*fdev->ipsec), GFP_KERNEL); + if (!fdev->ipsec) + return -ENOMEM; + + fdev->ipsec->fdev = fdev; + + err = mlx5_fpga_get_sbu_caps(fdev, sizeof(fdev->ipsec->caps), + fdev->ipsec->caps); + if (err) { + mlx5_fpga_err(fdev, "Failed to retrieve IPSec extended capabilities: %d\n", + err); + goto error; + } + + INIT_LIST_HEAD(&fdev->ipsec->pending_cmds); + spin_lock_init(&fdev->ipsec->pending_cmds_lock); + + init_attr.rx_size = SBU_QP_QUEUE_SIZE; + init_attr.tx_size = SBU_QP_QUEUE_SIZE; + init_attr.recv_cb = mlx5_fpga_ipsec_recv; + init_attr.cb_arg = fdev; + conn = mlx5_fpga_sbu_conn_create(fdev, &init_attr); + if (IS_ERR(conn)) { + err = PTR_ERR(conn); + mlx5_fpga_err(fdev, "Error creating IPSec command connection %d\n", + err); + goto error; + } + fdev->ipsec->conn = conn; + + err = rhashtable_init(&fdev->ipsec->sa_hash, &rhash_sa); + if (err) + goto err_destroy_conn; + mutex_init(&fdev->ipsec->sa_hash_lock); + + fdev->ipsec->rules_rb = RB_ROOT; + mutex_init(&fdev->ipsec->rules_rb_lock); + + err = mlx5_fpga_ipsec_enable_supported_caps(mdev); + if (err) { + mlx5_fpga_err(fdev, "Failed to enable IPSec extended capabilities: %d\n", + err); + goto err_destroy_hash; + } + + ida_init(&fdev->ipsec->halloc); + + return 0; + +err_destroy_hash: + rhashtable_destroy(&fdev->ipsec->sa_hash); + +err_destroy_conn: + mlx5_fpga_sbu_conn_destroy(conn); + +error: + kfree(fdev->ipsec); + fdev->ipsec = NULL; + return err; +} + +static void destroy_rules_rb(struct rb_root *root) +{ + struct mlx5_fpga_ipsec_rule *r, *tmp; + + rbtree_postorder_for_each_entry_safe(r, tmp, root, node) { + rb_erase(&r->node, root); + mlx5_fpga_ipsec_delete_sa_ctx(r->ctx); + kfree(r); + } +} + +static void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + + if (!mlx5_fpga_is_ipsec_device(mdev)) + return; + + ida_destroy(&fdev->ipsec->halloc); + destroy_rules_rb(&fdev->ipsec->rules_rb); + rhashtable_destroy(&fdev->ipsec->sa_hash); + + mlx5_fpga_sbu_conn_destroy(fdev->ipsec->conn); + kfree(fdev->ipsec); + fdev->ipsec = NULL; +} + +void mlx5_fpga_ipsec_build_fs_cmds(void) +{ + /* ingress */ + fpga_ipsec_ingress.create_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->create_flow_table; + fpga_ipsec_ingress.destroy_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->destroy_flow_table; + fpga_ipsec_ingress.modify_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->modify_flow_table; + fpga_ipsec_ingress.create_flow_group = + mlx5_fpga_ipsec_fs_create_flow_group_ingress; + fpga_ipsec_ingress.destroy_flow_group = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->destroy_flow_group; + fpga_ipsec_ingress.create_fte = + mlx5_fpga_ipsec_fs_create_fte_ingress; + fpga_ipsec_ingress.update_fte = + mlx5_fpga_ipsec_fs_update_fte_ingress; + fpga_ipsec_ingress.delete_fte = + mlx5_fpga_ipsec_fs_delete_fte_ingress; + fpga_ipsec_ingress.update_root_ft = + mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->update_root_ft; + + /* egress */ + fpga_ipsec_egress.create_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->create_flow_table; + fpga_ipsec_egress.destroy_flow_table = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->destroy_flow_table; + fpga_ipsec_egress.modify_flow_table = + 
mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->modify_flow_table; + fpga_ipsec_egress.create_flow_group = + mlx5_fpga_ipsec_fs_create_flow_group_egress; + fpga_ipsec_egress.destroy_flow_group = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->destroy_flow_group; + fpga_ipsec_egress.create_fte = + mlx5_fpga_ipsec_fs_create_fte_egress; + fpga_ipsec_egress.update_fte = + mlx5_fpga_ipsec_fs_update_fte_egress; + fpga_ipsec_egress.delete_fte = + mlx5_fpga_ipsec_fs_delete_fte_egress; + fpga_ipsec_egress.update_root_ft = + mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->update_root_ft; +} + +static int +mlx5_fpga_esp_validate_xfrm_attrs(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + if (attrs->tfc_pad) { + mlx5_core_err(mdev, "Cannot offload xfrm states with tfc padding\n"); + return -EOPNOTSUPP; + } + + if (attrs->replay_type != MLX5_ACCEL_ESP_REPLAY_NONE) { + mlx5_core_err(mdev, "Cannot offload xfrm states with anti replay\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat_type != MLX5_ACCEL_ESP_KEYMAT_AES_GCM) { + mlx5_core_err(mdev, "Only aes gcm keymat is supported\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.iv_algo != + MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ) { + mlx5_core_err(mdev, "Only iv sequence algo is supported\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.icv_len != 128) { + mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD ICV length other than 128bit\n"); + return -EOPNOTSUPP; + } + + if (attrs->keymat.aes_gcm.key_len != 128 && + attrs->keymat.aes_gcm.key_len != 256) { + mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); + return -EOPNOTSUPP; + } + + if ((attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) && + (!MLX5_GET(ipsec_extended_cap, mdev->fpga->ipsec->caps, + v2_command))) { + mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static struct mlx5_accel_esp_xfrm * +mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) +{ + struct mlx5_fpga_esp_xfrm *fpga_xfrm; + + if (!(flags & MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA)) { + mlx5_core_warn(mdev, "Tried to create an esp action without metadata\n"); + return ERR_PTR(-EINVAL); + } + + if (mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) { + mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + fpga_xfrm = kzalloc(sizeof(*fpga_xfrm), GFP_KERNEL); + if (!fpga_xfrm) + return ERR_PTR(-ENOMEM); + + mutex_init(&fpga_xfrm->lock); + memcpy(&fpga_xfrm->accel_xfrm.attrs, attrs, + sizeof(fpga_xfrm->accel_xfrm.attrs)); + + return &fpga_xfrm->accel_xfrm; +} + +static void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) +{ + struct mlx5_fpga_esp_xfrm *fpga_xfrm = + container_of(xfrm, struct mlx5_fpga_esp_xfrm, + accel_xfrm); + /* assuming no sa_ctx are connected to this xfrm_ctx */ + kfree(fpga_xfrm); +} + +static int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) +{ + struct mlx5_core_dev *mdev = xfrm->mdev; + struct mlx5_fpga_device *fdev = mdev->fpga; + struct mlx5_fpga_ipsec *fipsec = fdev->ipsec; + struct mlx5_fpga_esp_xfrm *fpga_xfrm; + struct mlx5_ifc_fpga_ipsec_sa org_hw_sa; + + int err = 0; + + if (!memcmp(&xfrm->attrs, attrs, sizeof(xfrm->attrs))) + return 0; + + if (mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) { + 
mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n"); + return -EOPNOTSUPP; + } + + if (is_v2_sadb_supported(fipsec)) { + mlx5_core_warn(mdev, "Modify esp is not supported\n"); + return -EOPNOTSUPP; + } + + fpga_xfrm = container_of(xfrm, struct mlx5_fpga_esp_xfrm, accel_xfrm); + + mutex_lock(&fpga_xfrm->lock); + + if (!fpga_xfrm->sa_ctx) + /* Unbounded xfrm, change only sw attrs */ + goto change_sw_xfrm_attrs; + + /* copy original hw sa */ + memcpy(&org_hw_sa, &fpga_xfrm->sa_ctx->hw_sa, sizeof(org_hw_sa)); + mutex_lock(&fipsec->sa_hash_lock); + /* remove original hw sa from hash */ + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, rhash_sa)); + /* update hw_sa with new xfrm attrs*/ + mlx5_fpga_ipsec_build_hw_xfrm(xfrm->mdev, attrs, + &fpga_xfrm->sa_ctx->hw_sa); + /* try to insert new hw_sa to hash */ + err = rhashtable_insert_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, rhash_sa); + if (err) + goto rollback_sa; + + /* modify device with new hw_sa */ + err = mlx5_fpga_ipsec_update_hw_sa(fdev, &fpga_xfrm->sa_ctx->hw_sa, + MLX5_FPGA_IPSEC_CMD_OP_MOD_SA_V2); + fpga_xfrm->sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0; + if (err) + WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, + rhash_sa)); +rollback_sa: + if (err) { + /* return original hw_sa to hash */ + memcpy(&fpga_xfrm->sa_ctx->hw_sa, &org_hw_sa, + sizeof(org_hw_sa)); + WARN_ON(rhashtable_insert_fast(&fipsec->sa_hash, + &fpga_xfrm->sa_ctx->hash, + rhash_sa)); + } + mutex_unlock(&fipsec->sa_hash_lock); + +change_sw_xfrm_attrs: + if (!err) + memcpy(&xfrm->attrs, attrs, sizeof(xfrm->attrs)); + mutex_unlock(&fpga_xfrm->lock); + return err; +} + +static const struct mlx5_accel_ipsec_ops fpga_ipsec_ops = { + .device_caps = mlx5_fpga_ipsec_device_caps, + .counters_count = mlx5_fpga_ipsec_counters_count, + .counters_read = mlx5_fpga_ipsec_counters_read, + .create_hw_context = mlx5_fpga_ipsec_create_sa_ctx, + .free_hw_context = mlx5_fpga_ipsec_delete_sa_ctx, + .init = mlx5_fpga_ipsec_init, + .cleanup = mlx5_fpga_ipsec_cleanup, + .esp_create_xfrm = mlx5_fpga_esp_create_xfrm, + .esp_modify_xfrm = mlx5_fpga_esp_modify_xfrm, + .esp_destroy_xfrm = mlx5_fpga_esp_destroy_xfrm, +}; + +const struct mlx5_accel_ipsec_ops *mlx5_fpga_ipsec_ops(struct mlx5_core_dev *mdev) +{ + if (!mlx5_fpga_is_ipsec_device(mdev)) + return NULL; + + return &fpga_ipsec_ops; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h new file mode 100644 index 0000000..8931b55 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5_FPGA_IPSEC_H__ +#define __MLX5_FPGA_IPSEC_H__ + +#include "accel/ipsec.h" +#include "fs_cmd.h" + +#ifdef CONFIG_MLX5_FPGA_IPSEC +const struct mlx5_accel_ipsec_ops *mlx5_fpga_ipsec_ops(struct mlx5_core_dev *mdev); +u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev); +const struct mlx5_flow_cmds * +mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type); +void mlx5_fpga_ipsec_build_fs_cmds(void); +bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev); +#else +static inline +const struct mlx5_accel_ipsec_ops *mlx5_fpga_ipsec_ops(struct mlx5_core_dev *mdev) +{ return NULL; } +static inline u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev) { return 0; } +static inline const struct mlx5_flow_cmds * +mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type) +{ + return mlx5_fs_cmd_get_default(type); +} + +static inline void mlx5_fpga_ipsec_build_fs_cmds(void) {}; +static inline bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev) { return false; } + +#endif /* CONFIG_MLX5_FPGA_IPSEC */ +#endif /* __MLX5_FPGA_IPSEC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c new file mode 100644 index 0000000..1496296 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include + +#include "fpga/core.h" +#include "fpga/conn.h" +#include "fpga/sdk.h" + +struct mlx5_fpga_conn * +mlx5_fpga_sbu_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr) +{ + return mlx5_fpga_conn_create(fdev, attr, MLX5_FPGA_QPC_QP_TYPE_SANDBOX_QP); +} +EXPORT_SYMBOL(mlx5_fpga_sbu_conn_create); + +void mlx5_fpga_sbu_conn_destroy(struct mlx5_fpga_conn *conn) +{ + mlx5_fpga_conn_destroy(conn); +} +EXPORT_SYMBOL(mlx5_fpga_sbu_conn_destroy); + +int mlx5_fpga_sbu_conn_sendmsg(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf) +{ + return mlx5_fpga_conn_send(conn, buf); +} +EXPORT_SYMBOL(mlx5_fpga_sbu_conn_sendmsg); + +static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size, + u64 addr, u8 *buf) +{ + size_t max_size = MLX5_FPGA_ACCESS_REG_SIZE_MAX; + size_t bytes_done = 0; + u8 actual_size; + int err; + + if (!size) + return -EINVAL; + + if (!fdev->mdev) + return -ENOTCONN; + + while (bytes_done < size) { + actual_size = min(max_size, (size - bytes_done)); + + err = mlx5_fpga_access_reg(fdev->mdev, actual_size, + addr + bytes_done, + buf + bytes_done, false); + if (err) { + mlx5_fpga_err(fdev, "Failed to read over I2C: %d\n", + err); + break; + } + + bytes_done += actual_size; + } + + return err; +} + +static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size, + u64 addr, u8 *buf) +{ + size_t max_size = MLX5_FPGA_ACCESS_REG_SIZE_MAX; + size_t bytes_done = 0; + u8 actual_size; + int err; + + if (!size) + return -EINVAL; + + if (!fdev->mdev) + return -ENOTCONN; + + while (bytes_done < size) { + actual_size = min(max_size, (size - bytes_done)); + + err = mlx5_fpga_access_reg(fdev->mdev, actual_size, + addr + bytes_done, + buf + bytes_done, true); + if (err) { + mlx5_fpga_err(fdev, "Failed to write FPGA crspace\n"); + break; + } + + bytes_done += actual_size; + } + + return err; +} + +int mlx5_fpga_mem_read(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type) +{ + int ret; + + switch (access_type) { + case MLX5_FPGA_ACCESS_TYPE_I2C: + ret = mlx5_fpga_mem_read_i2c(fdev, size, addr, buf); + if (ret) + return ret; + break; + default: + mlx5_fpga_warn(fdev, "Unexpected read access_type %u\n", + access_type); + return -EACCES; + } + + return size; +} +EXPORT_SYMBOL(mlx5_fpga_mem_read); + +int mlx5_fpga_mem_write(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type) +{ + int ret; + + switch (access_type) { + case MLX5_FPGA_ACCESS_TYPE_I2C: + ret = mlx5_fpga_mem_write_i2c(fdev, size, addr, buf); + if (ret) + return ret; + break; + default: + mlx5_fpga_warn(fdev, "Unexpected write access_type %u\n", + access_type); + return -EACCES; + } + + return size; +} +EXPORT_SYMBOL(mlx5_fpga_mem_write); + +int mlx5_fpga_get_sbu_caps(struct mlx5_fpga_device *fdev, int size, void *buf) +{ + return mlx5_fpga_sbu_caps(fdev->mdev, buf, size); +} +EXPORT_SYMBOL(mlx5_fpga_get_sbu_caps); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h new file mode 100644 index 0000000..89ef592 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef MLX5_FPGA_SDK_H +#define MLX5_FPGA_SDK_H + +#include +#include + +/** + * DOC: Innova SDK + * This header defines the in-kernel API for Innova FPGA client drivers. + */ +#define SBU_QP_QUEUE_SIZE 8 +#define MLX5_FPGA_CMD_TIMEOUT_MSEC (60 * 1000) + +/** + * enum mlx5_fpga_access_type - Enumerated the different methods possible for + * accessing the device memory address space + * + * @MLX5_FPGA_ACCESS_TYPE_I2C: Use the slow CX-FPGA I2C bus + * @MLX5_FPGA_ACCESS_TYPE_DONTCARE: Use the fastest available method + */ +enum mlx5_fpga_access_type { + MLX5_FPGA_ACCESS_TYPE_I2C = 0x0, + MLX5_FPGA_ACCESS_TYPE_DONTCARE = 0x0, +}; + +struct mlx5_fpga_conn; +struct mlx5_fpga_device; + +/** + * struct mlx5_fpga_dma_entry - A scatter-gather DMA entry + */ +struct mlx5_fpga_dma_entry { + /** @data: Virtual address pointer to the data */ + void *data; + /** @size: Size in bytes of the data */ + unsigned int size; + /** @dma_addr: Private member. 
Physical DMA-mapped address of the data */ + dma_addr_t dma_addr; +}; + +/** + * struct mlx5_fpga_dma_buf - A packet buffer + * May contain up to 2 scatter-gather data entries + */ +struct mlx5_fpga_dma_buf { + /** @dma_dir: DMA direction */ + enum dma_data_direction dma_dir; + /** @sg: Scatter-gather entries pointing to the data in memory */ + struct mlx5_fpga_dma_entry sg[2]; + /** @list: Item in SQ backlog, for TX packets */ + struct list_head list; + /** + * @complete: Completion routine, for TX packets + * @conn: FPGA Connection this packet was sent to + * @fdev: FPGA device this packet was sent to + * @buf: The packet buffer + * @status: 0 if successful, or an error code otherwise + */ + void (*complete)(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *buf, u8 status); +}; + +/** + * struct mlx5_fpga_conn_attr - FPGA connection attributes + * Describes the attributes of a connection + */ +struct mlx5_fpga_conn_attr { + /** @tx_size: Size of connection TX queue, in packets */ + unsigned int tx_size; + /** @rx_size: Size of connection RX queue, in packets */ + unsigned int rx_size; + /** + * @recv_cb: Callback function which is called for received packets + * @cb_arg: The value provided in mlx5_fpga_conn_attr.cb_arg + * @buf: A buffer containing a received packet + * + * buf is guaranteed to only contain a single scatter-gather entry. + * The size of the actual packet received is specified in buf.sg[0].size + * When this callback returns, the packet buffer may be re-used for + * subsequent receives. + */ + void (*recv_cb)(void *cb_arg, struct mlx5_fpga_dma_buf *buf); + /** @cb_arg: A context to be passed to recv_cb callback */ + void *cb_arg; +}; + +/** + * mlx5_fpga_sbu_conn_create() - Initialize a new FPGA SBU connection + * @fdev: The FPGA device + * @attr: Attributes of the new connection + * + * Sets up a new FPGA SBU connection with the specified attributes. + * The receive callback function may be called for incoming messages even + * before this function returns. + * + * The caller must eventually destroy the connection by calling + * mlx5_fpga_sbu_conn_destroy. + * + * Return: A new connection, or ERR_PTR() error value otherwise. + */ +struct mlx5_fpga_conn * +mlx5_fpga_sbu_conn_create(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_conn_attr *attr); + +/** + * mlx5_fpga_sbu_conn_destroy() - Destroy an FPGA SBU connection + * @conn: The FPGA SBU connection to destroy + * + * Cleans up an FPGA SBU connection which was previously created with + * mlx5_fpga_sbu_conn_create. + */ +void mlx5_fpga_sbu_conn_destroy(struct mlx5_fpga_conn *conn); + +/** + * mlx5_fpga_sbu_conn_sendmsg() - Queue the transmission of a packet + * @conn: An FPGA SBU connection + * @buf: The packet buffer + * + * Queues a packet for transmission over an FPGA SBU connection. + * The buffer should not be modified or freed until completion. + * Upon completion, the buf's complete() callback is invoked, indicating the + * success or error status of the transmission. + * + * Return: 0 if successful, or an error value otherwise. + */ +int mlx5_fpga_sbu_conn_sendmsg(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_dma_buf *buf); + +/** + * mlx5_fpga_mem_read() - Read from FPGA memory address space + * @fdev: The FPGA device + * @size: Size of chunk to read, in bytes + * @addr: Starting address to read from, in FPGA address space + * @buf: Buffer to read into + * @access_type: Method for reading + * + * Reads from the specified address into the specified buffer. 
+ * The address may point to configuration space or to DDR. + * Large reads may be performed internally as several non-atomic operations. + * This function may sleep, so should not be called from atomic contexts. + * + * Return: 0 if successful, or an error value otherwise. + */ +int mlx5_fpga_mem_read(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type); + +/** + * mlx5_fpga_mem_write() - Write to FPGA memory address space + * @fdev: The FPGA device + * @size: Size of chunk to write, in bytes + * @addr: Starting address to write to, in FPGA address space + * @buf: Buffer which contains data to write + * @access_type: Method for writing + * + * Writes the specified buffer data to FPGA memory at the specified address. + * The address may point to configuration space or to DDR. + * Large writes may be performed internally as several non-atomic operations. + * This function may sleep, so should not be called from atomic contexts. + * + * Return: 0 if successful, or an error value otherwise. + */ +int mlx5_fpga_mem_write(struct mlx5_fpga_device *fdev, size_t size, u64 addr, + void *buf, enum mlx5_fpga_access_type access_type); + +/** + * mlx5_fpga_get_sbu_caps() - Read the SBU capabilities + * @fdev: The FPGA device + * @size: Size of the buffer to read into + * @buf: Buffer to read the capabilities into + * + * Reads the FPGA SBU capabilities into the specified buffer. + * The format of the capabilities buffer is SBU-dependent. + * + * Return: 0 if successful + * -EINVAL if the buffer is not large enough to contain SBU caps + * or any other error value otherwise. + */ +int mlx5_fpga_get_sbu_caps(struct mlx5_fpga_device *fdev, int size, void *buf); + +#endif /* MLX5_FPGA_SDK_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c new file mode 100644 index 0000000..29b7339 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c @@ -0,0 +1,622 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include "fpga/tls.h" +#include "fpga/cmd.h" +#include "fpga/sdk.h" +#include "fpga/core.h" +#include "accel/tls.h" + +struct mlx5_fpga_tls_command_context; + +typedef void (*mlx5_fpga_tls_command_complete) + (struct mlx5_fpga_conn *conn, struct mlx5_fpga_device *fdev, + struct mlx5_fpga_tls_command_context *ctx, + struct mlx5_fpga_dma_buf *resp); + +struct mlx5_fpga_tls_command_context { + struct list_head list; + /* There is no guarantee on the order between the TX completion + * and the command response. + * The TX completion is going to touch cmd->buf even in + * the case of successful transmission. + * So instead of requiring separate allocations for cmd + * and cmd->buf we've decided to use a reference counter + */ + refcount_t ref; + struct mlx5_fpga_dma_buf buf; + mlx5_fpga_tls_command_complete complete; +}; + +static void +mlx5_fpga_tls_put_command_ctx(struct mlx5_fpga_tls_command_context *ctx) +{ + if (refcount_dec_and_test(&ctx->ref)) + kfree(ctx); +} + +static void mlx5_fpga_tls_cmd_complete(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *resp) +{ + struct mlx5_fpga_conn *conn = fdev->tls->conn; + struct mlx5_fpga_tls_command_context *ctx; + struct mlx5_fpga_tls *tls = fdev->tls; + unsigned long flags; + + spin_lock_irqsave(&tls->pending_cmds_lock, flags); + ctx = list_first_entry(&tls->pending_cmds, + struct mlx5_fpga_tls_command_context, list); + list_del(&ctx->list); + spin_unlock_irqrestore(&tls->pending_cmds_lock, flags); + ctx->complete(conn, fdev, ctx, resp); +} + +static void mlx5_fpga_cmd_send_complete(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *buf, + u8 status) +{ + struct mlx5_fpga_tls_command_context *ctx = + container_of(buf, struct mlx5_fpga_tls_command_context, buf); + + mlx5_fpga_tls_put_command_ctx(ctx); + + if (unlikely(status)) + mlx5_fpga_tls_cmd_complete(fdev, NULL); +} + +static void mlx5_fpga_tls_cmd_send(struct mlx5_fpga_device *fdev, + struct mlx5_fpga_tls_command_context *cmd, + mlx5_fpga_tls_command_complete complete) +{ + struct mlx5_fpga_tls *tls = fdev->tls; + unsigned long flags; + int ret; + + refcount_set(&cmd->ref, 2); + cmd->complete = complete; + cmd->buf.complete = mlx5_fpga_cmd_send_complete; + + spin_lock_irqsave(&tls->pending_cmds_lock, flags); + /* mlx5_fpga_sbu_conn_sendmsg is called under pending_cmds_lock + * to make sure commands are inserted to the tls->pending_cmds list + * and the command QP in the same order. + */ + ret = mlx5_fpga_sbu_conn_sendmsg(tls->conn, &cmd->buf); + if (likely(!ret)) + list_add_tail(&cmd->list, &tls->pending_cmds); + else + complete(tls->conn, fdev, cmd, NULL); + spin_unlock_irqrestore(&tls->pending_cmds_lock, flags); +} + +/* Start of context identifiers range (inclusive) */ +#define SWID_START 0 +/* End of context identifiers range (exclusive) */ +#define SWID_END BIT(24) + +static int mlx5_fpga_tls_alloc_swid(struct idr *idr, spinlock_t *idr_spinlock, + void *ptr) +{ + unsigned long flags; + int ret; + + /* TLS metadata format is 1 byte for syndrome followed + * by 3 bytes of swid (software ID) + * swid must not exceed 3 bytes. 
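+ * The BUILD_BUG_ON below enforces this at compile time.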
+ * See tls_rxtx.c:insert_pet() for details + */ + BUILD_BUG_ON((SWID_END - 1) & 0xFF000000); + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(idr_spinlock, flags); + ret = idr_alloc(idr, ptr, SWID_START, SWID_END, GFP_ATOMIC); + spin_unlock_irqrestore(idr_spinlock, flags); + idr_preload_end(); + + return ret; +} + +static void *mlx5_fpga_tls_release_swid(struct idr *idr, + spinlock_t *idr_spinlock, u32 swid) +{ + unsigned long flags; + void *ptr; + + spin_lock_irqsave(idr_spinlock, flags); + ptr = idr_remove(idr, swid); + spin_unlock_irqrestore(idr_spinlock, flags); + return ptr; +} + +static void mlx_tls_kfree_complete(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_dma_buf *buf, u8 status) +{ + kfree(buf); +} + +static void +mlx5_fpga_tls_teardown_completion(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_tls_command_context *cmd, + struct mlx5_fpga_dma_buf *resp) +{ + if (resp) { + u32 syndrome = MLX5_GET(tls_resp, resp->sg[0].data, syndrome); + + if (syndrome) + mlx5_fpga_err(fdev, + "Teardown stream failed with syndrome = %d", + syndrome); + } + mlx5_fpga_tls_put_command_ctx(cmd); +} + +static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd) +{ + memcpy(MLX5_ADDR_OF(tls_cmd, cmd, src_port), flow, + MLX5_BYTE_OFF(tls_flow, ipv6)); + + MLX5_SET(tls_cmd, cmd, ipv6, MLX5_GET(tls_flow, flow, ipv6)); + MLX5_SET(tls_cmd, cmd, direction_sx, + MLX5_GET(tls_flow, flow, direction_sx)); +} + +int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn) +{ + struct mlx5_fpga_dma_buf *buf; + int size = sizeof(*buf) + MLX5_TLS_COMMAND_SIZE; + void *flow; + void *cmd; + int ret; + + buf = kzalloc(size, GFP_ATOMIC); + if (!buf) + return -ENOMEM; + + cmd = (buf + 1); + + rcu_read_lock(); + flow = idr_find(&mdev->fpga->tls->rx_idr, ntohl(handle)); + if (unlikely(!flow)) { + rcu_read_unlock(); + WARN_ONCE(1, "Received NULL pointer for handle\n"); + kfree(buf); + return -EINVAL; + } + mlx5_fpga_tls_flow_to_cmd(flow, cmd); + rcu_read_unlock(); + + MLX5_SET(tls_cmd, cmd, swid, ntohl(handle)); + MLX5_SET64(tls_cmd, cmd, tls_rcd_sn, be64_to_cpu(rcd_sn)); + MLX5_SET(tls_cmd, cmd, tcp_sn, seq); + MLX5_SET(tls_cmd, cmd, command_type, CMD_RESYNC_RX); + + buf->sg[0].data = cmd; + buf->sg[0].size = MLX5_TLS_COMMAND_SIZE; + buf->complete = mlx_tls_kfree_complete; + + ret = mlx5_fpga_sbu_conn_sendmsg(mdev->fpga->tls->conn, buf); + if (ret < 0) + kfree(buf); + + return ret; +} + +static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev, + void *flow, u32 swid, gfp_t flags) +{ + struct mlx5_fpga_tls_command_context *ctx; + struct mlx5_fpga_dma_buf *buf; + void *cmd; + + ctx = kzalloc(sizeof(*ctx) + MLX5_TLS_COMMAND_SIZE, flags); + if (!ctx) + return; + + buf = &ctx->buf; + cmd = (ctx + 1); + MLX5_SET(tls_cmd, cmd, command_type, CMD_TEARDOWN_STREAM); + MLX5_SET(tls_cmd, cmd, swid, swid); + + mlx5_fpga_tls_flow_to_cmd(flow, cmd); + kfree(flow); + + buf->sg[0].data = cmd; + buf->sg[0].size = MLX5_TLS_COMMAND_SIZE; + + mlx5_fpga_tls_cmd_send(mdev->fpga, ctx, + mlx5_fpga_tls_teardown_completion); +} + +void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + gfp_t flags, bool direction_sx) +{ + struct mlx5_fpga_tls *tls = mdev->fpga->tls; + void *flow; + + if (direction_sx) + flow = mlx5_fpga_tls_release_swid(&tls->tx_idr, + &tls->tx_idr_spinlock, + swid); + else + flow = mlx5_fpga_tls_release_swid(&tls->rx_idr, + &tls->rx_idr_spinlock, + swid); + + if (!flow) { + mlx5_fpga_err(mdev->fpga, "No 
flow information for swid %u\n", + swid); + return; + } + + synchronize_rcu(); /* before kfree(flow) */ + mlx5_fpga_tls_send_teardown_cmd(mdev, flow, swid, flags); +} + +enum mlx5_fpga_setup_stream_status { + MLX5_FPGA_CMD_PENDING, + MLX5_FPGA_CMD_SEND_FAILED, + MLX5_FPGA_CMD_RESPONSE_RECEIVED, + MLX5_FPGA_CMD_ABANDONED, +}; + +struct mlx5_setup_stream_context { + struct mlx5_fpga_tls_command_context cmd; + atomic_t status; + u32 syndrome; + struct completion comp; +}; + +static void +mlx5_fpga_tls_setup_completion(struct mlx5_fpga_conn *conn, + struct mlx5_fpga_device *fdev, + struct mlx5_fpga_tls_command_context *cmd, + struct mlx5_fpga_dma_buf *resp) +{ + struct mlx5_setup_stream_context *ctx = + container_of(cmd, struct mlx5_setup_stream_context, cmd); + int status = MLX5_FPGA_CMD_SEND_FAILED; + void *tls_cmd = ctx + 1; + + /* If we failed to send the command, resp == NULL */ + if (resp) { + ctx->syndrome = MLX5_GET(tls_resp, resp->sg[0].data, syndrome); + status = MLX5_FPGA_CMD_RESPONSE_RECEIVED; + } + + status = atomic_xchg_release(&ctx->status, status); + if (likely(status != MLX5_FPGA_CMD_ABANDONED)) { + complete(&ctx->comp); + return; + } + + mlx5_fpga_err(fdev, "Command was abandoned, syndrome = %u\n", + ctx->syndrome); + + if (!ctx->syndrome) { + /* The process was killed while waiting for the context to be + * added, and the add completed successfully. + * We need to destroy the HW context, and we can't reuse + * the command context because we might not have received + * the tx completion yet. + */ + mlx5_fpga_tls_del_flow(fdev->mdev, + MLX5_GET(tls_cmd, tls_cmd, swid), + GFP_ATOMIC, + MLX5_GET(tls_cmd, tls_cmd, + direction_sx)); + } + + mlx5_fpga_tls_put_command_ctx(cmd); +} + +static int mlx5_fpga_tls_setup_stream_cmd(struct mlx5_core_dev *mdev, + struct mlx5_setup_stream_context *ctx) +{ + struct mlx5_fpga_dma_buf *buf; + void *cmd = ctx + 1; + int status, ret = 0; + + buf = &ctx->cmd.buf; + buf->sg[0].data = cmd; + buf->sg[0].size = MLX5_TLS_COMMAND_SIZE; + MLX5_SET(tls_cmd, cmd, command_type, CMD_SETUP_STREAM); + + init_completion(&ctx->comp); + atomic_set(&ctx->status, MLX5_FPGA_CMD_PENDING); + ctx->syndrome = -1; + + mlx5_fpga_tls_cmd_send(mdev->fpga, &ctx->cmd, + mlx5_fpga_tls_setup_completion); + wait_for_completion_killable(&ctx->comp); + + status = atomic_xchg_acquire(&ctx->status, MLX5_FPGA_CMD_ABANDONED); + if (unlikely(status == MLX5_FPGA_CMD_PENDING)) + /* ctx is going to be released in mlx5_fpga_tls_setup_completion */ + return -EINTR; + + if (unlikely(ctx->syndrome)) + ret = -ENOMEM; + + mlx5_fpga_tls_put_command_ctx(&ctx->cmd); + return ret; +} + +static void mlx5_fpga_tls_hw_qp_recv_cb(void *cb_arg, + struct mlx5_fpga_dma_buf *buf) +{ + struct mlx5_fpga_device *fdev = (struct mlx5_fpga_device *)cb_arg; + + mlx5_fpga_tls_cmd_complete(fdev, buf); +} + +bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev) +{ + if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga)) + return false; + + if (MLX5_CAP_FPGA(mdev, ieee_vendor_id) != + MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX) + return false; + + if (MLX5_CAP_FPGA(mdev, sandbox_product_id) != + MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_TLS) + return false; + + if (MLX5_CAP_FPGA(mdev, sandbox_product_version) != 0) + return false; + + return true; +} + +static int mlx5_fpga_tls_get_caps(struct mlx5_fpga_device *fdev, + u32 *p_caps) +{ + int err, cap_size = MLX5_ST_SZ_BYTES(tls_extended_cap); + u32 caps = 0; + void *buf; + + buf = kzalloc(cap_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + err = mlx5_fpga_get_sbu_caps(fdev, 
cap_size, buf); + if (err) + goto out; + + if (MLX5_GET(tls_extended_cap, buf, tx)) + caps |= MLX5_ACCEL_TLS_TX; + if (MLX5_GET(tls_extended_cap, buf, rx)) + caps |= MLX5_ACCEL_TLS_RX; + if (MLX5_GET(tls_extended_cap, buf, tls_v12)) + caps |= MLX5_ACCEL_TLS_V12; + if (MLX5_GET(tls_extended_cap, buf, tls_v13)) + caps |= MLX5_ACCEL_TLS_V13; + if (MLX5_GET(tls_extended_cap, buf, lro)) + caps |= MLX5_ACCEL_TLS_LRO; + if (MLX5_GET(tls_extended_cap, buf, ipv6)) + caps |= MLX5_ACCEL_TLS_IPV6; + + if (MLX5_GET(tls_extended_cap, buf, aes_gcm_128)) + caps |= MLX5_ACCEL_TLS_AES_GCM128; + if (MLX5_GET(tls_extended_cap, buf, aes_gcm_256)) + caps |= MLX5_ACCEL_TLS_AES_GCM256; + + *p_caps = caps; + err = 0; +out: + kfree(buf); + return err; +} + +int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + struct mlx5_fpga_conn_attr init_attr = {0}; + struct mlx5_fpga_conn *conn; + struct mlx5_fpga_tls *tls; + int err = 0; + + if (!mlx5_fpga_is_tls_device(mdev) || !fdev) + return 0; + + tls = kzalloc(sizeof(*tls), GFP_KERNEL); + if (!tls) + return -ENOMEM; + + err = mlx5_fpga_tls_get_caps(fdev, &tls->caps); + if (err) + goto error; + + if (!(tls->caps & (MLX5_ACCEL_TLS_V12 | MLX5_ACCEL_TLS_AES_GCM128))) { + err = -ENOTSUPP; + goto error; + } + + init_attr.rx_size = SBU_QP_QUEUE_SIZE; + init_attr.tx_size = SBU_QP_QUEUE_SIZE; + init_attr.recv_cb = mlx5_fpga_tls_hw_qp_recv_cb; + init_attr.cb_arg = fdev; + conn = mlx5_fpga_sbu_conn_create(fdev, &init_attr); + if (IS_ERR(conn)) { + err = PTR_ERR(conn); + mlx5_fpga_err(fdev, "Error creating TLS command connection %d\n", + err); + goto error; + } + + tls->conn = conn; + spin_lock_init(&tls->pending_cmds_lock); + INIT_LIST_HEAD(&tls->pending_cmds); + + idr_init(&tls->tx_idr); + idr_init(&tls->rx_idr); + spin_lock_init(&tls->tx_idr_spinlock); + spin_lock_init(&tls->rx_idr_spinlock); + fdev->tls = tls; + return 0; + +error: + kfree(tls); + return err; +} + +void mlx5_fpga_tls_cleanup(struct mlx5_core_dev *mdev) +{ + struct mlx5_fpga_device *fdev = mdev->fpga; + + if (!fdev || !fdev->tls) + return; + + mlx5_fpga_sbu_conn_destroy(fdev->tls->conn); + kfree(fdev->tls); + fdev->tls = NULL; +} + +static void mlx5_fpga_tls_set_aes_gcm128_ctx(void *cmd, + struct tls_crypto_info *info, + __be64 *rcd_sn) +{ + struct tls12_crypto_info_aes_gcm_128 *crypto_info = + (struct tls12_crypto_info_aes_gcm_128 *)info; + + memcpy(MLX5_ADDR_OF(tls_cmd, cmd, tls_rcd_sn), crypto_info->rec_seq, + TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); + + memcpy(MLX5_ADDR_OF(tls_cmd, cmd, tls_implicit_iv), + crypto_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(MLX5_ADDR_OF(tls_cmd, cmd, encryption_key), + crypto_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); + + /* in AES-GCM 128 we need to write the key twice */ + memcpy(MLX5_ADDR_OF(tls_cmd, cmd, encryption_key) + + TLS_CIPHER_AES_GCM_128_KEY_SIZE, + crypto_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); + + MLX5_SET(tls_cmd, cmd, alg, MLX5_TLS_ALG_AES_GCM_128); +} + +static int mlx5_fpga_tls_set_key_material(void *cmd, u32 caps, + struct tls_crypto_info *crypto_info) +{ + __be64 rcd_sn; + + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: + if (!(caps & MLX5_ACCEL_TLS_AES_GCM128)) + return -EINVAL; + mlx5_fpga_tls_set_aes_gcm128_ctx(cmd, crypto_info, &rcd_sn); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int _mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 swid, u32 tcp_sn) +{ + u32 caps = 
mlx5_fpga_tls_device_caps(mdev); + struct mlx5_setup_stream_context *ctx; + int ret = -ENOMEM; + size_t cmd_size; + void *cmd; + + cmd_size = MLX5_TLS_COMMAND_SIZE + sizeof(*ctx); + ctx = kzalloc(cmd_size, GFP_KERNEL); + if (!ctx) + goto out; + + cmd = ctx + 1; + ret = mlx5_fpga_tls_set_key_material(cmd, caps, crypto_info); + if (ret) + goto free_ctx; + + mlx5_fpga_tls_flow_to_cmd(flow, cmd); + + MLX5_SET(tls_cmd, cmd, swid, swid); + MLX5_SET(tls_cmd, cmd, tcp_sn, tcp_sn); + + return mlx5_fpga_tls_setup_stream_cmd(mdev, ctx); + +free_ctx: + kfree(ctx); +out: + return ret; +} + +int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx) +{ + struct mlx5_fpga_tls *tls = mdev->fpga->tls; + int ret = -ENOMEM; + u32 swid; + + if (direction_sx) + ret = mlx5_fpga_tls_alloc_swid(&tls->tx_idr, + &tls->tx_idr_spinlock, flow); + else + ret = mlx5_fpga_tls_alloc_swid(&tls->rx_idr, + &tls->rx_idr_spinlock, flow); + + if (ret < 0) + return ret; + + swid = ret; + MLX5_SET(tls_flow, flow, direction_sx, direction_sx ? 1 : 0); + + ret = _mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, swid, + start_offload_tcp_sn); + if (ret && ret != -EINTR) + goto free_swid; + + *p_swid = swid; + return 0; +free_swid: + if (direction_sx) + mlx5_fpga_tls_release_swid(&tls->tx_idr, + &tls->tx_idr_spinlock, swid); + else + mlx5_fpga_tls_release_swid(&tls->rx_idr, + &tls->rx_idr_spinlock, swid); + + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h new file mode 100644 index 0000000..5714cf3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __MLX5_FPGA_TLS_H__ +#define __MLX5_FPGA_TLS_H__ + +#include + +#include +#include "fpga/core.h" + +struct mlx5_fpga_tls { + struct list_head pending_cmds; + spinlock_t pending_cmds_lock; /* Protects pending_cmds */ + u32 caps; + struct mlx5_fpga_conn *conn; + + struct idr tx_idr; + struct idr rx_idr; + spinlock_t tx_idr_spinlock; /* protects the IDR */ + spinlock_t rx_idr_spinlock; /* protects the IDR */ +}; + +int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn, u32 *p_swid, + bool direction_sx); + +void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid, + gfp_t flags, bool direction_sx); + +bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev); +int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev); +void mlx5_fpga_tls_cleanup(struct mlx5_core_dev *mdev); + +static inline u32 mlx5_fpga_tls_device_caps(struct mlx5_core_dev *mdev) +{ + return mdev->fpga->tls->caps; +} + +int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, __be32 handle, + u32 seq, __be64 rcd_sn); + +#endif /* __MLX5_FPGA_TLS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c new file mode 100644 index 0000000..d14361b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -0,0 +1,1114 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "fs_core.h" +#include "fs_cmd.h" +#include "fs_ft_pool.h" +#include "mlx5_core.h" +#include "eswitch.h" + +static int mlx5_cmd_stub_update_root_ft(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 underlay_qpn, + bool disconnect) +{ + return 0; +} + +static int mlx5_cmd_stub_create_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_flow_table *next_ft) +{ + int max_fte = ft_attr->max_fte; + + ft->max_fte = max_fte ? 
roundup_pow_of_two(max_fte) : 1; + + return 0; +} + +static int mlx5_cmd_stub_destroy_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft) +{ + return 0; +} + +static int mlx5_cmd_stub_modify_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + return 0; +} + +static int mlx5_cmd_stub_create_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 *in, + struct mlx5_flow_group *fg) +{ + return 0; +} + +static int mlx5_cmd_stub_destroy_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg) +{ + return 0; +} + +static int mlx5_cmd_stub_create_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + struct fs_fte *fte) +{ + return 0; +} + +static int mlx5_cmd_stub_update_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + int modify_mask, + struct fs_fte *fte) +{ + return -EOPNOTSUPP; +} + +static int mlx5_cmd_stub_delete_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + return 0; +} + +static int mlx5_cmd_stub_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat_params *params, + enum mlx5_flow_namespace_type namespace, + struct mlx5_pkt_reformat *pkt_reformat) +{ + return 0; +} + +static void mlx5_cmd_stub_packet_reformat_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat *pkt_reformat) +{ +} + +static int mlx5_cmd_stub_modify_header_alloc(struct mlx5_flow_root_namespace *ns, + u8 namespace, u8 num_actions, + void *modify_actions, + struct mlx5_modify_hdr *modify_hdr) +{ + return 0; +} + +static void mlx5_cmd_stub_modify_header_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_modify_hdr *modify_hdr) +{ +} + +static int mlx5_cmd_stub_set_peer(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_root_namespace *peer_ns) +{ + return 0; +} + +static int mlx5_cmd_stub_create_ns(struct mlx5_flow_root_namespace *ns) +{ + return 0; +} + +static int mlx5_cmd_stub_destroy_ns(struct mlx5_flow_root_namespace *ns) +{ + return 0; +} + +static int mlx5_cmd_set_slave_root_fdb(struct mlx5_core_dev *master, + struct mlx5_core_dev *slave, + bool ft_id_valid, + u32 ft_id) +{ + u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {}; + u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {}; + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_namespace *ns; + + MLX5_SET(set_flow_table_root_in, in, opcode, + MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); + MLX5_SET(set_flow_table_root_in, in, table_type, + FS_FT_FDB); + if (ft_id_valid) { + MLX5_SET(set_flow_table_root_in, in, + table_eswitch_owner_vhca_id_valid, 1); + MLX5_SET(set_flow_table_root_in, in, + table_eswitch_owner_vhca_id, + MLX5_CAP_GEN(master, vhca_id)); + MLX5_SET(set_flow_table_root_in, in, table_id, + ft_id); + } else { + ns = mlx5_get_flow_namespace(slave, + MLX5_FLOW_NAMESPACE_FDB); + root = find_root(&ns->node); + MLX5_SET(set_flow_table_root_in, in, table_id, + root->root_ft->id); + } + + return mlx5_cmd_exec(slave, in, sizeof(in), out, sizeof(out)); +} + +static int +mlx5_cmd_stub_destroy_match_definer(struct mlx5_flow_root_namespace *ns, + int definer_id) +{ + return 0; +} + +static int +mlx5_cmd_stub_create_match_definer(struct mlx5_flow_root_namespace *ns, + u16 format_id, u32 *match_mask) +{ + return 0; +} + +static int mlx5_cmd_update_root_ft(struct 
mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, u32 underlay_qpn, + bool disconnect) +{ + u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + int err; + + if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && + underlay_qpn == 0) + return 0; + + if (ft->type == FS_FT_FDB && + mlx5_lag_is_shared_fdb(dev) && + !mlx5_lag_is_master(dev)) + return 0; + + MLX5_SET(set_flow_table_root_in, in, opcode, + MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); + MLX5_SET(set_flow_table_root_in, in, table_type, ft->type); + + if (disconnect) + MLX5_SET(set_flow_table_root_in, in, op_mod, 1); + else + MLX5_SET(set_flow_table_root_in, in, table_id, ft->id); + + MLX5_SET(set_flow_table_root_in, in, underlay_qpn, underlay_qpn); + MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport); + MLX5_SET(set_flow_table_root_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + + err = mlx5_cmd_exec_in(dev, set_flow_table_root, in); + if (!err && + ft->type == FS_FT_FDB && + mlx5_lag_is_shared_fdb(dev) && + mlx5_lag_is_master(dev)) { + err = mlx5_cmd_set_slave_root_fdb(dev, + mlx5_lag_get_peer_mdev(dev), + !disconnect, (!disconnect) ? + ft->id : 0); + if (err && !disconnect) { + MLX5_SET(set_flow_table_root_in, in, op_mod, 0); + MLX5_SET(set_flow_table_root_in, in, table_id, + ns->root_ft->id); + mlx5_cmd_exec_in(dev, set_flow_table_root, in); + } + } + + return err; +} + +static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_flow_table *next_ft) +{ + int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); + int en_decap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); + int term = !!(ft->flags & MLX5_FLOW_TABLE_TERMINATION); + u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + unsigned int size; + int err; + + if (ft_attr->max_fte != POOL_NEXT_SIZE) + size = roundup_pow_of_two(ft_attr->max_fte); + size = mlx5_ft_pool_get_avail_sz(dev, ft->type, ft_attr->max_fte); + if (!size) + return -ENOSPC; + + MLX5_SET(create_flow_table_in, in, opcode, + MLX5_CMD_OP_CREATE_FLOW_TABLE); + + MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid); + MLX5_SET(create_flow_table_in, in, table_type, ft->type); + MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level); + MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, size ? 
ilog2(size) : 0); + MLX5_SET(create_flow_table_in, in, vport_number, ft->vport); + MLX5_SET(create_flow_table_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + + MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en, + en_decap); + MLX5_SET(create_flow_table_in, in, flow_table_context.reformat_en, + en_encap); + MLX5_SET(create_flow_table_in, in, flow_table_context.termination_table, + term); + + switch (ft->op_mod) { + case FS_FT_OP_MOD_NORMAL: + if (next_ft) { + MLX5_SET(create_flow_table_in, in, + flow_table_context.table_miss_action, + MLX5_FLOW_TABLE_MISS_ACTION_FWD); + MLX5_SET(create_flow_table_in, in, + flow_table_context.table_miss_id, next_ft->id); + } else { + MLX5_SET(create_flow_table_in, in, + flow_table_context.table_miss_action, + ft->def_miss_action); + } + break; + + case FS_FT_OP_MOD_LAG_DEMUX: + MLX5_SET(create_flow_table_in, in, op_mod, 0x1); + if (next_ft) + MLX5_SET(create_flow_table_in, in, + flow_table_context.lag_master_next_table_id, + next_ft->id); + break; + } + + err = mlx5_cmd_exec_inout(dev, create_flow_table, in, out); + if (!err) { + ft->id = MLX5_GET(create_flow_table_out, out, + table_id); + ft->max_fte = size; + } else { + mlx5_ft_pool_put_sz(ns->dev, size); + } + + return err; +} + +static int mlx5_cmd_destroy_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft) +{ + u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + int err; + + MLX5_SET(destroy_flow_table_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_TABLE); + MLX5_SET(destroy_flow_table_in, in, table_type, ft->type); + MLX5_SET(destroy_flow_table_in, in, table_id, ft->id); + MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport); + MLX5_SET(destroy_flow_table_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + + err = mlx5_cmd_exec_in(dev, destroy_flow_table, in); + if (!err) + mlx5_ft_pool_put_sz(ns->dev, ft->max_fte); + + return err; +} + +static int mlx5_cmd_modify_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + u32 in[MLX5_ST_SZ_DW(modify_flow_table_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + + MLX5_SET(modify_flow_table_in, in, opcode, + MLX5_CMD_OP_MODIFY_FLOW_TABLE); + MLX5_SET(modify_flow_table_in, in, table_type, ft->type); + MLX5_SET(modify_flow_table_in, in, table_id, ft->id); + + if (ft->op_mod == FS_FT_OP_MOD_LAG_DEMUX) { + MLX5_SET(modify_flow_table_in, in, modify_field_select, + MLX5_MODIFY_FLOW_TABLE_LAG_NEXT_TABLE_ID); + if (next_ft) { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.lag_master_next_table_id, next_ft->id); + } else { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.lag_master_next_table_id, 0); + } + } else { + MLX5_SET(modify_flow_table_in, in, vport_number, ft->vport); + MLX5_SET(modify_flow_table_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + MLX5_SET(modify_flow_table_in, in, modify_field_select, + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID); + if (next_ft) { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.table_miss_action, + MLX5_FLOW_TABLE_MISS_ACTION_FWD); + MLX5_SET(modify_flow_table_in, in, + flow_table_context.table_miss_id, + next_ft->id); + } else { + MLX5_SET(modify_flow_table_in, in, + flow_table_context.table_miss_action, + ft->def_miss_action); + } + } + + return mlx5_cmd_exec_in(dev, modify_flow_table, in); +} + +static int mlx5_cmd_create_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table 
*ft, + u32 *in, + struct mlx5_flow_group *fg) +{ + u32 out[MLX5_ST_SZ_DW(create_flow_group_out)] = {}; + struct mlx5_core_dev *dev = ns->dev; + int err; + + MLX5_SET(create_flow_group_in, in, opcode, + MLX5_CMD_OP_CREATE_FLOW_GROUP); + MLX5_SET(create_flow_group_in, in, table_type, ft->type); + MLX5_SET(create_flow_group_in, in, table_id, ft->id); + if (ft->vport) { + MLX5_SET(create_flow_group_in, in, vport_number, ft->vport); + MLX5_SET(create_flow_group_in, in, other_vport, 1); + } + + MLX5_SET(create_flow_group_in, in, vport_number, ft->vport); + MLX5_SET(create_flow_group_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + err = mlx5_cmd_exec_inout(dev, create_flow_group, in, out); + if (!err) + fg->id = MLX5_GET(create_flow_group_out, out, + group_id); + return err; +} + +static int mlx5_cmd_destroy_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg) +{ + u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + + MLX5_SET(destroy_flow_group_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); + MLX5_SET(destroy_flow_group_in, in, table_type, ft->type); + MLX5_SET(destroy_flow_group_in, in, table_id, ft->id); + MLX5_SET(destroy_flow_group_in, in, group_id, fg->id); + MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport); + MLX5_SET(destroy_flow_group_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + return mlx5_cmd_exec_in(dev, destroy_flow_group, in); +} + +static int mlx5_set_extended_dest(struct mlx5_core_dev *dev, + struct fs_fte *fte, bool *extended_dest) +{ + int fw_log_max_fdb_encap_uplink = + MLX5_CAP_ESW(dev, log_max_fdb_encap_uplink); + int num_fwd_destinations = 0; + struct mlx5_flow_rule *dst; + int num_encap = 0; + + *extended_dest = false; + if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return 0; + + list_for_each_entry(dst, &fte->node.children, node.list) { + if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + if ((dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_VPORT || + dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_UPLINK) && + dst->dest_attr.vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID) + num_encap++; + num_fwd_destinations++; + } + if (num_fwd_destinations > 1 && num_encap > 0) + *extended_dest = true; + + if (*extended_dest && !fw_log_max_fdb_encap_uplink) { + mlx5_core_warn(dev, "FW does not support extended destination"); + return -EOPNOTSUPP; + } + if (num_encap > (1 << fw_log_max_fdb_encap_uplink)) { + mlx5_core_warn(dev, "FW does not support more than %d encaps", + 1 << fw_log_max_fdb_encap_uplink); + return -EOPNOTSUPP; + } + + return 0; +} + +static void +mlx5_cmd_set_fte_flow_meter(struct fs_fte *fte, void *in_flow_context) +{ + void *execute_aso; + void *exe_aso_ctrl; + + execute_aso = MLX5_ADDR_OF(flow_context, in_flow_context, + execute_aso[0]); + MLX5_SET(execute_aso, execute_aso, valid, 1); + MLX5_SET(execute_aso, execute_aso, aso_object_id, + fte->action.exe_aso.object_id); + + exe_aso_ctrl = MLX5_ADDR_OF(execute_aso, execute_aso, exe_aso_ctrl); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, return_reg_id, + fte->action.exe_aso.return_reg_id); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, aso_type, + fte->action.exe_aso.type); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, init_color, + fte->action.exe_aso.flow_meter.init_color); + MLX5_SET(exe_aso_ctrl_flow_meter, exe_aso_ctrl, meter_id, + fte->action.exe_aso.flow_meter.meter_idx); +} + +static int 
mlx5_cmd_set_fte(struct mlx5_core_dev *dev, + int opmod, int modify_mask, + struct mlx5_flow_table *ft, + unsigned group_id, + struct fs_fte *fte) +{ + u32 out[MLX5_ST_SZ_DW(set_fte_out)] = {0}; + bool extended_dest = false; + struct mlx5_flow_rule *dst; + void *in_flow_context, *vlan; + void *in_match_value; + unsigned int inlen; + int dst_cnt_size; + void *in_dests; + u32 *in; + int err; + + if (mlx5_set_extended_dest(dev, fte, &extended_dest)) + return -EOPNOTSUPP; + + if (!extended_dest) + dst_cnt_size = MLX5_ST_SZ_BYTES(dest_format_struct); + else + dst_cnt_size = MLX5_ST_SZ_BYTES(extended_dest_format); + + inlen = MLX5_ST_SZ_BYTES(set_fte_in) + fte->dests_size * dst_cnt_size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); + MLX5_SET(set_fte_in, in, op_mod, opmod); + MLX5_SET(set_fte_in, in, modify_enable_mask, modify_mask); + MLX5_SET(set_fte_in, in, table_type, ft->type); + MLX5_SET(set_fte_in, in, table_id, ft->id); + MLX5_SET(set_fte_in, in, flow_index, fte->index); + MLX5_SET(set_fte_in, in, ignore_flow_level, + !!(fte->action.flags & FLOW_ACT_IGNORE_FLOW_LEVEL)); + + MLX5_SET(set_fte_in, in, vport_number, ft->vport); + MLX5_SET(set_fte_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + + in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); + MLX5_SET(flow_context, in_flow_context, group_id, group_id); + + MLX5_SET(flow_context, in_flow_context, flow_tag, + fte->flow_context.flow_tag); + MLX5_SET(flow_context, in_flow_context, flow_source, + fte->flow_context.flow_source); + + MLX5_SET(flow_context, in_flow_context, extended_destination, + extended_dest); + if (extended_dest) { + u32 action; + + action = fte->action.action & + ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + MLX5_SET(flow_context, in_flow_context, action, action); + } else { + u32 id; + MLX5_SET(flow_context, in_flow_context, action, + fte->action.action); + if (fte->action.pkt_reformat) { + if (fte->action.pkt_reformat->sw_owned) { + switch (fte->action.pkt_reformat->reformat_type) { + case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: + case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: + case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + id = mlx5_fs_dr_action_get_pkt_reformat_id(fte->action.pkt_reformat); + break; + default: + err = -ENOTSUPP; + goto err_out; + } + } else { + id = fte->action.pkt_reformat->id; + } + MLX5_SET(flow_context, in_flow_context, + packet_reformat_id, id); + } + } + if (fte->action.modify_hdr) { + if (fte->action.modify_hdr->sw_owned) { + err = -ENOTSUPP; + goto err_out; + } + MLX5_SET(flow_context, in_flow_context, modify_header_id, + fte->action.modify_hdr->id); + } + + MLX5_SET(flow_context, in_flow_context, encrypt_decrypt_type, + fte->action.crypto.type); + MLX5_SET(flow_context, in_flow_context, encrypt_decrypt_obj_id, + fte->action.crypto.obj_id); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[0].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[0].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[0].prio); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan_2); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[1].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[1].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[1].prio); + + in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, + match_value); + memcpy(in_match_value, &fte->val, sizeof(fte->val)); + 
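+ /* Destination entries (and flow-counter entries, if any) are written right after the flow context. */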
+ in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination); + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + int list_size = 0; + + list_for_each_entry(dst, &fte->node.children, node.list) { + unsigned int id, type = dst->dest_attr.type; + + if (type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + switch (type) { + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM: + id = dst->dest_attr.ft_num; + type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + id = dst->dest_attr.ft->id; + break; + case MLX5_FLOW_DESTINATION_TYPE_UPLINK: + case MLX5_FLOW_DESTINATION_TYPE_VPORT: + MLX5_SET(dest_format_struct, in_dests, + destination_eswitch_owner_vhca_id_valid, + !!(dst->dest_attr.vport.flags & + MLX5_FLOW_DEST_VPORT_VHCA_ID)); + MLX5_SET(dest_format_struct, in_dests, + destination_eswitch_owner_vhca_id, + dst->dest_attr.vport.vhca_id); + if (type == MLX5_FLOW_DESTINATION_TYPE_UPLINK) { + /* destination_id is reserved */ + id = 0; + break; + } + id = dst->dest_attr.vport.num; + if (extended_dest && + dst->dest_attr.vport.pkt_reformat) { + MLX5_SET(dest_format_struct, in_dests, + packet_reformat, + !!(dst->dest_attr.vport.flags & + MLX5_FLOW_DEST_VPORT_REFORMAT_ID)); + MLX5_SET(extended_dest_format, in_dests, + packet_reformat_id, + dst->dest_attr.vport.pkt_reformat->id); + } + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + id = dst->dest_attr.sampler_id; + break; + case MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE: + MLX5_SET(dest_format_struct, in_dests, + destination_table_type, dst->dest_attr.ft->type); + id = dst->dest_attr.ft->id; + break; + default: + id = dst->dest_attr.tir_num; + } + + MLX5_SET(dest_format_struct, in_dests, destination_type, + type); + MLX5_SET(dest_format_struct, in_dests, destination_id, id); + in_dests += dst_cnt_size; + list_size++; + } + + MLX5_SET(flow_context, in_flow_context, destination_list_size, + list_size); + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev, + log_max_flow_counter, + ft->type)); + int list_size = 0; + + list_for_each_entry(dst, &fte->node.children, node.list) { + if (dst->dest_attr.type != + MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + MLX5_SET(flow_counter_list, in_dests, flow_counter_id, + dst->dest_attr.counter_id); + in_dests += dst_cnt_size; + list_size++; + } + if (list_size > max_list_size) { + err = -EINVAL; + goto err_out; + } + + MLX5_SET(flow_context, in_flow_context, flow_counter_list_size, + list_size); + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) { + if (fte->action.exe_aso.type == MLX5_EXE_ASO_FLOW_METER) { + mlx5_cmd_set_fte_flow_meter(fte, in_flow_context); + } else { + err = -EOPNOTSUPP; + goto err_out; + } + } + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +err_out: + kvfree(in); + return err; +} + +static int mlx5_cmd_create_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + struct fs_fte *fte) +{ + struct mlx5_core_dev *dev = ns->dev; + unsigned int group_id = group->id; + + return mlx5_cmd_set_fte(dev, 0, 0, ft, group_id, fte); +} + +static int mlx5_cmd_update_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + int modify_mask, + struct fs_fte *fte) +{ + int opmod; + struct mlx5_core_dev *dev = ns->dev; + int atomic_mod_cap = MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive. 
+ flow_modify_en); + if (!atomic_mod_cap) + return -EOPNOTSUPP; + opmod = 1; + + return mlx5_cmd_set_fte(dev, opmod, modify_mask, ft, fg->id, fte); +} + +static int mlx5_cmd_delete_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + u32 in[MLX5_ST_SZ_DW(delete_fte_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + + MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); + MLX5_SET(delete_fte_in, in, table_type, ft->type); + MLX5_SET(delete_fte_in, in, table_id, ft->id); + MLX5_SET(delete_fte_in, in, flow_index, fte->index); + MLX5_SET(delete_fte_in, in, vport_number, ft->vport); + MLX5_SET(delete_fte_in, in, other_vport, + !!(ft->flags & MLX5_FLOW_TABLE_OTHER_VPORT)); + + return mlx5_cmd_exec_in(dev, delete_fte, in); +} + +int mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, + enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask, + u32 *id) +{ + u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)] = {}; + int err; + + MLX5_SET(alloc_flow_counter_in, in, opcode, + MLX5_CMD_OP_ALLOC_FLOW_COUNTER); + MLX5_SET(alloc_flow_counter_in, in, flow_counter_bulk, alloc_bitmask); + + err = mlx5_cmd_exec_inout(dev, alloc_flow_counter, in, out); + if (!err) + *id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id); + return err; +} + +int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id) +{ + return mlx5_cmd_fc_bulk_alloc(dev, 0, id); +} + +int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)] = {}; + + MLX5_SET(dealloc_flow_counter_in, in, opcode, + MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); + MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id); + return mlx5_cmd_exec_in(dev, dealloc_flow_counter, in); +} + +int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id, + u64 *packets, u64 *bytes, bool clear) +{ + u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) + + MLX5_ST_SZ_BYTES(traffic_counter)] = {}; + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {}; + void *stats; + int err = 0; + + MLX5_SET(query_flow_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_FLOW_COUNTER); + MLX5_SET(query_flow_counter_in, in, op_mod, 0); + MLX5_SET(query_flow_counter_in, in, flow_counter_id, id); + MLX5_SET(query_flow_counter_in, in, clear, clear ? 
1 : 0); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics); + *packets = MLX5_GET64(traffic_counter, stats, packets); + *bytes = MLX5_GET64(traffic_counter, stats, octets); + return 0; +} + +int mlx5_cmd_fc_get_bulk_query_out_len(int bulk_len) +{ + return MLX5_ST_SZ_BYTES(query_flow_counter_out) + + MLX5_ST_SZ_BYTES(traffic_counter) * bulk_len; +} + +int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, u32 base_id, int bulk_len, + u32 *out) +{ + int outlen = mlx5_cmd_fc_get_bulk_query_out_len(bulk_len); + u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {}; + + MLX5_SET(query_flow_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_FLOW_COUNTER); + MLX5_SET(query_flow_counter_in, in, flow_counter_id, base_id); + MLX5_SET(query_flow_counter_in, in, num_of_counters, bulk_len); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} + +static int mlx5_cmd_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat_params *params, + enum mlx5_flow_namespace_type namespace, + struct mlx5_pkt_reformat *pkt_reformat) +{ + u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)] = {}; + struct mlx5_core_dev *dev = ns->dev; + void *packet_reformat_context_in; + int max_encap_size; + void *reformat; + int inlen; + int err; + u32 *in; + + if (namespace == MLX5_FLOW_NAMESPACE_FDB || + namespace == MLX5_FLOW_NAMESPACE_FDB_BYPASS) + max_encap_size = MLX5_CAP_ESW(dev, max_encap_header_size); + else + max_encap_size = MLX5_CAP_FLOWTABLE(dev, max_encap_header_size); + + if (params->size > max_encap_size) { + mlx5_core_warn(dev, "encap size %zd too big, max supported is %d\n", + params->size, max_encap_size); + return -EINVAL; + } + + in = kzalloc(MLX5_ST_SZ_BYTES(alloc_packet_reformat_context_in) + + params->size, GFP_KERNEL); + if (!in) + return -ENOMEM; + + packet_reformat_context_in = MLX5_ADDR_OF(alloc_packet_reformat_context_in, + in, packet_reformat_context); + reformat = MLX5_ADDR_OF(packet_reformat_context_in, + packet_reformat_context_in, + reformat_data); + inlen = reformat - (void *)in + params->size; + + MLX5_SET(alloc_packet_reformat_context_in, in, opcode, + MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT); + MLX5_SET(packet_reformat_context_in, packet_reformat_context_in, + reformat_data_size, params->size); + MLX5_SET(packet_reformat_context_in, packet_reformat_context_in, + reformat_type, params->type); + MLX5_SET(packet_reformat_context_in, packet_reformat_context_in, + reformat_param_0, params->param_0); + MLX5_SET(packet_reformat_context_in, packet_reformat_context_in, + reformat_param_1, params->param_1); + if (params->data && params->size) + memcpy(reformat, params->data, params->size); + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + + pkt_reformat->id = MLX5_GET(alloc_packet_reformat_context_out, + out, packet_reformat_id); + kfree(in); + return err; +} + +static void mlx5_cmd_packet_reformat_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat *pkt_reformat) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + + MLX5_SET(dealloc_packet_reformat_context_in, in, opcode, + MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT); + MLX5_SET(dealloc_packet_reformat_context_in, in, packet_reformat_id, + pkt_reformat->id); + + mlx5_cmd_exec_in(dev, dealloc_packet_reformat_context, in); +} + +static int mlx5_cmd_modify_header_alloc(struct mlx5_flow_root_namespace *ns, + u8 namespace, u8 
num_actions, + void *modify_actions, + struct mlx5_modify_hdr *modify_hdr) +{ + u32 out[MLX5_ST_SZ_DW(alloc_modify_header_context_out)] = {}; + int max_actions, actions_size, inlen, err; + struct mlx5_core_dev *dev = ns->dev; + void *actions_in; + u8 table_type; + u32 *in; + + switch (namespace) { + case MLX5_FLOW_NAMESPACE_FDB: + case MLX5_FLOW_NAMESPACE_FDB_BYPASS: + max_actions = MLX5_CAP_ESW_FLOWTABLE_FDB(dev, max_modify_header_actions); + table_type = FS_FT_FDB; + break; + case MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC: + case MLX5_FLOW_NAMESPACE_KERNEL: + case MLX5_FLOW_NAMESPACE_BYPASS: + max_actions = MLX5_CAP_FLOWTABLE_NIC_RX(dev, max_modify_header_actions); + table_type = FS_FT_NIC_RX; + break; + case MLX5_FLOW_NAMESPACE_EGRESS: +#ifdef CONFIG_MLX5_IPSEC + case MLX5_FLOW_NAMESPACE_EGRESS_IPSEC: +#endif + case MLX5_FLOW_NAMESPACE_EGRESS_MACSEC: + max_actions = MLX5_CAP_FLOWTABLE_NIC_TX(dev, max_modify_header_actions); + table_type = FS_FT_NIC_TX; + break; + case MLX5_FLOW_NAMESPACE_ESW_INGRESS: + max_actions = MLX5_CAP_ESW_INGRESS_ACL(dev, max_modify_header_actions); + table_type = FS_FT_ESW_INGRESS_ACL; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC: + case MLX5_FLOW_NAMESPACE_RDMA_TX: + max_actions = MLX5_CAP_FLOWTABLE_RDMA_TX(dev, max_modify_header_actions); + table_type = FS_FT_RDMA_TX; + break; + default: + return -EOPNOTSUPP; + } + + if (num_actions > max_actions) { + mlx5_core_warn(dev, "too many modify header actions %d, max supported %d\n", + num_actions, max_actions); + return -EOPNOTSUPP; + } + + actions_size = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * num_actions; + inlen = MLX5_ST_SZ_BYTES(alloc_modify_header_context_in) + actions_size; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(alloc_modify_header_context_in, in, opcode, + MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT); + MLX5_SET(alloc_modify_header_context_in, in, table_type, table_type); + MLX5_SET(alloc_modify_header_context_in, in, num_of_actions, num_actions); + + actions_in = MLX5_ADDR_OF(alloc_modify_header_context_in, in, actions); + memcpy(actions_in, modify_actions, actions_size); + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + + modify_hdr->id = MLX5_GET(alloc_modify_header_context_out, out, modify_header_id); + kfree(in); + return err; +} + +static void mlx5_cmd_modify_header_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_modify_hdr *modify_hdr) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_modify_header_context_in)] = {}; + struct mlx5_core_dev *dev = ns->dev; + + MLX5_SET(dealloc_modify_header_context_in, in, opcode, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); + MLX5_SET(dealloc_modify_header_context_in, in, modify_header_id, + modify_hdr->id); + + mlx5_cmd_exec_in(dev, dealloc_modify_header_context, in); +} + +static int mlx5_cmd_destroy_match_definer(struct mlx5_flow_root_namespace *ns, + int definer_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_OBJ_TYPE_MATCH_DEFINER); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, definer_id); + + return mlx5_cmd_exec(ns->dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_cmd_create_match_definer(struct mlx5_flow_root_namespace *ns, + u16 format_id, u32 *match_mask) +{ + u32 out[MLX5_ST_SZ_DW(create_match_definer_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_match_definer_in)] = {}; + struct 
mlx5_core_dev *dev = ns->dev; + void *ptr; + int err; + + MLX5_SET(create_match_definer_in, in, general_obj_in_cmd_hdr.opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(create_match_definer_in, in, general_obj_in_cmd_hdr.obj_type, + MLX5_OBJ_TYPE_MATCH_DEFINER); + + ptr = MLX5_ADDR_OF(create_match_definer_in, in, obj_context); + MLX5_SET(match_definer, ptr, format_id, format_id); + + ptr = MLX5_ADDR_OF(match_definer, ptr, match_mask); + memcpy(ptr, match_mask, MLX5_FLD_SZ_BYTES(match_definer, match_mask)); + + err = mlx5_cmd_exec_inout(dev, create_match_definer, in, out); + return err ? err : MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); +} + +static const struct mlx5_flow_cmds mlx5_flow_cmds = { + .create_flow_table = mlx5_cmd_create_flow_table, + .destroy_flow_table = mlx5_cmd_destroy_flow_table, + .modify_flow_table = mlx5_cmd_modify_flow_table, + .create_flow_group = mlx5_cmd_create_flow_group, + .destroy_flow_group = mlx5_cmd_destroy_flow_group, + .create_fte = mlx5_cmd_create_fte, + .update_fte = mlx5_cmd_update_fte, + .delete_fte = mlx5_cmd_delete_fte, + .update_root_ft = mlx5_cmd_update_root_ft, + .packet_reformat_alloc = mlx5_cmd_packet_reformat_alloc, + .packet_reformat_dealloc = mlx5_cmd_packet_reformat_dealloc, + .modify_header_alloc = mlx5_cmd_modify_header_alloc, + .modify_header_dealloc = mlx5_cmd_modify_header_dealloc, + .create_match_definer = mlx5_cmd_create_match_definer, + .destroy_match_definer = mlx5_cmd_destroy_match_definer, + .set_peer = mlx5_cmd_stub_set_peer, + .create_ns = mlx5_cmd_stub_create_ns, + .destroy_ns = mlx5_cmd_stub_destroy_ns, +}; + +static const struct mlx5_flow_cmds mlx5_flow_cmd_stubs = { + .create_flow_table = mlx5_cmd_stub_create_flow_table, + .destroy_flow_table = mlx5_cmd_stub_destroy_flow_table, + .modify_flow_table = mlx5_cmd_stub_modify_flow_table, + .create_flow_group = mlx5_cmd_stub_create_flow_group, + .destroy_flow_group = mlx5_cmd_stub_destroy_flow_group, + .create_fte = mlx5_cmd_stub_create_fte, + .update_fte = mlx5_cmd_stub_update_fte, + .delete_fte = mlx5_cmd_stub_delete_fte, + .update_root_ft = mlx5_cmd_stub_update_root_ft, + .packet_reformat_alloc = mlx5_cmd_stub_packet_reformat_alloc, + .packet_reformat_dealloc = mlx5_cmd_stub_packet_reformat_dealloc, + .modify_header_alloc = mlx5_cmd_stub_modify_header_alloc, + .modify_header_dealloc = mlx5_cmd_stub_modify_header_dealloc, + .create_match_definer = mlx5_cmd_stub_create_match_definer, + .destroy_match_definer = mlx5_cmd_stub_destroy_match_definer, + .set_peer = mlx5_cmd_stub_set_peer, + .create_ns = mlx5_cmd_stub_create_ns, + .destroy_ns = mlx5_cmd_stub_destroy_ns, +}; + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_fw_cmds(void) +{ + return &mlx5_flow_cmds; +} + +static const struct mlx5_flow_cmds *mlx5_fs_cmd_get_stub_cmds(void) +{ + return &mlx5_flow_cmd_stubs; +} + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type type) +{ + switch (type) { + case FS_FT_NIC_RX: + case FS_FT_ESW_EGRESS_ACL: + case FS_FT_ESW_INGRESS_ACL: + case FS_FT_FDB: + case FS_FT_SNIFFER_RX: + case FS_FT_SNIFFER_TX: + case FS_FT_NIC_TX: + case FS_FT_RDMA_RX: + case FS_FT_RDMA_TX: + case FS_FT_PORT_SEL: + return mlx5_fs_cmd_get_fw_cmds(); + default: + return mlx5_fs_cmd_get_stub_cmds(); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h new file mode 100644 index 0000000..feebdcc --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_CMD_ +#define _MLX5_FS_CMD_ + +#include "fs_core.h" + +struct mlx5_flow_cmds { + int (*create_flow_table)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_flow_table *next_ft); + int (*destroy_flow_table)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft); + + int (*modify_flow_table)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft); + + int (*create_flow_group)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 *in, + struct mlx5_flow_group *fg); + + int (*destroy_flow_group)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg); + + int (*create_fte)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + struct fs_fte *fte); + + int (*update_fte)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg, + int modify_mask, + struct fs_fte *fte); + + int (*delete_fte)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte); + + int (*update_root_ft)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 underlay_qpn, + bool disconnect); + + int (*packet_reformat_alloc)(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat_params *params, + enum mlx5_flow_namespace_type namespace, + struct mlx5_pkt_reformat *pkt_reformat); + + void (*packet_reformat_dealloc)(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat *pkt_reformat); + + int (*modify_header_alloc)(struct mlx5_flow_root_namespace *ns, + u8 namespace, u8 num_actions, + void *modify_actions, + struct mlx5_modify_hdr *modify_hdr); + + void (*modify_header_dealloc)(struct mlx5_flow_root_namespace *ns, + struct mlx5_modify_hdr *modify_hdr); + + int (*set_peer)(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_root_namespace *peer_ns); + + int (*create_ns)(struct mlx5_flow_root_namespace 
*ns); + int (*destroy_ns)(struct mlx5_flow_root_namespace *ns); + int (*create_match_definer)(struct mlx5_flow_root_namespace *ns, + u16 format_id, u32 *match_mask); + int (*destroy_match_definer)(struct mlx5_flow_root_namespace *ns, + int definer_id); +}; + +int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id); +int mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, + enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask, + u32 *id); +int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id); +int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id, + u64 *packets, u64 *bytes, bool clear); + +int mlx5_cmd_fc_get_bulk_query_out_len(int bulk_len); +int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, u32 base_id, int bulk_len, + u32 *out); + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type type); +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_fw_cmds(void); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c new file mode 100644 index 0000000..2a88bea --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -0,0 +1,3624 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "mlx5_core.h" +#include "fs_core.h" +#include "fs_cmd.h" +#include "fs_ft_pool.h" +#include "diag/fs_tracepoint.h" +#include "accel/ipsec.h" +#include "fpga/ipsec.h" + +#define INIT_TREE_NODE_ARRAY_SIZE(...) (sizeof((struct init_tree_node[]){__VA_ARGS__}) /\ + sizeof(struct init_tree_node)) + +#define ADD_PRIO(num_prios_val, min_level_val, num_levels_val, caps_val,\ + ...) {.type = FS_TYPE_PRIO,\ + .min_ft_level = min_level_val,\ + .num_levels = num_levels_val,\ + .num_leaf_prios = num_prios_val,\ + .caps = caps_val,\ + .children = (struct init_tree_node[]) {__VA_ARGS__},\ + .ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \ +} + +#define ADD_MULTIPLE_PRIO(num_prios_val, num_levels_val, ...)\ + ADD_PRIO(num_prios_val, 0, num_levels_val, {},\ + __VA_ARGS__)\ + +#define ADD_NS(def_miss_act, ...) 
{.type = FS_TYPE_NAMESPACE, \ + .def_miss_action = def_miss_act,\ + .children = (struct init_tree_node[]) {__VA_ARGS__},\ + .ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \ +} + +#define INIT_CAPS_ARRAY_SIZE(...) (sizeof((long[]){__VA_ARGS__}) /\ + sizeof(long)) + +#define FS_CAP(cap) (__mlx5_bit_off(flow_table_nic_cap, cap)) + +#define FS_REQUIRED_CAPS(...) {.arr_sz = INIT_CAPS_ARRAY_SIZE(__VA_ARGS__), \ + .caps = (long[]) {__VA_ARGS__} } + +#define FS_CHAINING_CAPS FS_REQUIRED_CAPS(FS_CAP(flow_table_properties_nic_receive.flow_modify_en), \ + FS_CAP(flow_table_properties_nic_receive.modify_root), \ + FS_CAP(flow_table_properties_nic_receive.identified_miss_table_mode), \ + FS_CAP(flow_table_properties_nic_receive.flow_table_modify)) + +#define FS_CHAINING_CAPS_EGRESS \ + FS_REQUIRED_CAPS( \ + FS_CAP(flow_table_properties_nic_transmit.flow_modify_en), \ + FS_CAP(flow_table_properties_nic_transmit.modify_root), \ + FS_CAP(flow_table_properties_nic_transmit \ + .identified_miss_table_mode), \ + FS_CAP(flow_table_properties_nic_transmit.flow_table_modify)) + +#define FS_CHAINING_CAPS_RDMA_TX \ + FS_REQUIRED_CAPS( \ + FS_CAP(flow_table_properties_nic_transmit_rdma.flow_modify_en), \ + FS_CAP(flow_table_properties_nic_transmit_rdma.modify_root), \ + FS_CAP(flow_table_properties_nic_transmit_rdma \ + .identified_miss_table_mode), \ + FS_CAP(flow_table_properties_nic_transmit_rdma \ + .flow_table_modify)) + +#define LEFTOVERS_NUM_LEVELS 1 +#define LEFTOVERS_NUM_PRIOS 1 + +#define RDMA_RX_COUNTERS_PRIO_NUM_LEVELS 1 +#define RDMA_TX_COUNTERS_PRIO_NUM_LEVELS 1 + +#define BY_PASS_PRIO_NUM_LEVELS 1 +#define BY_PASS_MIN_LEVEL (ETHTOOL_MIN_LEVEL + MLX5_BY_PASS_NUM_PRIOS +\ + LEFTOVERS_NUM_PRIOS) + +#define KERNEL_RX_MACSEC_NUM_PRIOS 1 +#define KERNEL_RX_MACSEC_NUM_LEVELS 3 +#define KERNEL_RX_MACSEC_MIN_LEVEL (BY_PASS_MIN_LEVEL + KERNEL_RX_MACSEC_NUM_PRIOS) + +#define ETHTOOL_PRIO_NUM_LEVELS 1 +#define ETHTOOL_NUM_PRIOS 11 +#define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) +/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}} */ +#define KERNEL_NIC_PRIO_NUM_LEVELS 7 +#define KERNEL_NIC_NUM_PRIOS 1 +/* One more level for tc */ +#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1) + +#define KERNEL_NIC_TC_NUM_PRIOS 1 +#define KERNEL_NIC_TC_NUM_LEVELS 4 + +#define ANCHOR_NUM_LEVELS 1 +#define ANCHOR_NUM_PRIOS 1 +#define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1) + +#define OFFLOADS_MAX_FT 2 +#define OFFLOADS_NUM_PRIOS 2 +#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + OFFLOADS_NUM_PRIOS) + +#define LAG_PRIO_NUM_LEVELS 1 +#define LAG_NUM_PRIOS 1 +#define LAG_MIN_LEVEL (OFFLOADS_MIN_LEVEL + KERNEL_RX_MACSEC_MIN_LEVEL + 1) + +#define KERNEL_TX_HP_OOB_NUM_PRIOS 1 +#define KERNEL_TX_HP_OOB_NUM_LEVELS 1 + +#define KERNEL_TX_IPSEC_NUM_PRIOS 1 +#define KERNEL_TX_IPSEC_NUM_LEVELS 1 +#define KERNEL_TX_IPSEC_MIN_LEVEL (KERNEL_TX_IPSEC_NUM_LEVELS + 1) + +#define KERNEL_TX_MACSEC_NUM_PRIOS 1 +#define KERNEL_TX_MACSEC_NUM_LEVELS 2 +#define KERNEL_TX_MACSEC_MIN_LEVEL (KERNEL_TX_IPSEC_MIN_LEVEL + KERNEL_TX_MACSEC_NUM_PRIOS) + +#define MLX5_INGRESS_ACL_NUM_PRIOS 5 +#define MLX5_EGRESS_ACL_NUM_PRIOS 5 + +struct node_caps { + size_t arr_sz; + long *caps; +}; + +static struct init_tree_node { + enum fs_node_type type; + struct init_tree_node *children; + int ar_size; + struct node_caps caps; + int min_ft_level; + int num_leaf_prios; + int prio; + int num_levels; + enum mlx5_flow_table_miss_action def_miss_action; +} root_fs = { + .type = FS_TYPE_NAMESPACE, + .ar_size = 8, + 
.children = (struct init_tree_node[]){ + ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS, + BY_PASS_PRIO_NUM_LEVELS))), + ADD_PRIO(0, KERNEL_RX_MACSEC_MIN_LEVEL, 0, FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(KERNEL_RX_MACSEC_NUM_PRIOS, + KERNEL_RX_MACSEC_NUM_LEVELS))), + ADD_PRIO(0, LAG_MIN_LEVEL, 0, FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(LAG_NUM_PRIOS, + LAG_PRIO_NUM_LEVELS))), + ADD_PRIO(0, OFFLOADS_MIN_LEVEL, 0, FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(OFFLOADS_NUM_PRIOS, + OFFLOADS_MAX_FT))), + ADD_PRIO(0, ETHTOOL_MIN_LEVEL, 0, FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(ETHTOOL_NUM_PRIOS, + ETHTOOL_PRIO_NUM_LEVELS))), + ADD_PRIO(0, KERNEL_MIN_LEVEL, 0, {}, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(KERNEL_NIC_TC_NUM_PRIOS, + KERNEL_NIC_TC_NUM_LEVELS), + ADD_MULTIPLE_PRIO(KERNEL_NIC_NUM_PRIOS, + KERNEL_NIC_PRIO_NUM_LEVELS))), + ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0, FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(LEFTOVERS_NUM_PRIOS, + LEFTOVERS_NUM_LEVELS))), + ADD_PRIO(0, ANCHOR_MIN_LEVEL, 0, {}, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(ANCHOR_NUM_PRIOS, + ANCHOR_NUM_LEVELS))), + } +}; + +static struct init_tree_node egress_root_fs = { + .type = FS_TYPE_NAMESPACE, + .ar_size = 3, + .children = (struct init_tree_node[]) { + ADD_PRIO(0, MLX5_BY_PASS_NUM_PRIOS, 0, + FS_CHAINING_CAPS_EGRESS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS, + BY_PASS_PRIO_NUM_LEVELS))), + ADD_PRIO(0, KERNEL_TX_IPSEC_MIN_LEVEL, 0, + FS_CHAINING_CAPS_EGRESS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(KERNEL_TX_HP_OOB_NUM_PRIOS, + KERNEL_TX_HP_OOB_NUM_LEVELS), +#ifdef CONFIG_MLX5_IPSEC + ADD_MULTIPLE_PRIO(KERNEL_TX_IPSEC_NUM_PRIOS, + KERNEL_TX_IPSEC_NUM_LEVELS), +#endif + )), + ADD_PRIO(0, KERNEL_TX_MACSEC_MIN_LEVEL, 0, + FS_CHAINING_CAPS_EGRESS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(KERNEL_TX_MACSEC_NUM_PRIOS, + KERNEL_TX_MACSEC_NUM_LEVELS))), + } +}; + +enum { + RDMA_RX_MACSEC_PRIO, + RDMA_RX_COUNTERS_PRIO, + RDMA_RX_BYPASS_PRIO, + RDMA_RX_KERNEL_PRIO, +}; + +#define RDMA_RX_BYPASS_MIN_LEVEL MLX5_BY_PASS_NUM_REGULAR_PRIOS +#define RDMA_RX_KERNEL_MIN_LEVEL (RDMA_RX_BYPASS_MIN_LEVEL + 1) +#define RDMA_RX_COUNTERS_MIN_LEVEL (RDMA_RX_KERNEL_MIN_LEVEL + 2) + +#define RDMA_RX_MACSEC_NUM_PRIOS 1 +#define RDMA_RX_MACSEC_PRIO_NUM_LEVELS 2 +#define RDMA_RX_MACSEC_MIN_LEVEL (RDMA_RX_COUNTERS_MIN_LEVEL + RDMA_RX_MACSEC_NUM_PRIOS) + +static struct init_tree_node rdma_rx_root_fs = { + .type = FS_TYPE_NAMESPACE, + .ar_size = 4, + .children = (struct init_tree_node[]) { + [RDMA_RX_MACSEC_PRIO] = + ADD_PRIO(0, RDMA_RX_MACSEC_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(RDMA_RX_MACSEC_NUM_PRIOS, + RDMA_RX_MACSEC_PRIO_NUM_LEVELS))), + [RDMA_RX_COUNTERS_PRIO] = + ADD_PRIO(0, RDMA_RX_COUNTERS_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(MLX5_RDMA_RX_NUM_COUNTERS_PRIOS, + RDMA_RX_COUNTERS_PRIO_NUM_LEVELS))), + [RDMA_RX_BYPASS_PRIO] = + ADD_PRIO(0, RDMA_RX_BYPASS_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_REGULAR_PRIOS, + BY_PASS_PRIO_NUM_LEVELS))), + [RDMA_RX_KERNEL_PRIO] = + 
ADD_PRIO(0, RDMA_RX_KERNEL_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_SWITCH_DOMAIN, + ADD_MULTIPLE_PRIO(1, 1))), + } +}; + +enum { + RDMA_TX_COUNTERS_PRIO, + RDMA_TX_MACSEC_PRIO, + RDMA_TX_BYPASS_PRIO, +}; + +#define RDMA_TX_BYPASS_MIN_LEVEL MLX5_BY_PASS_NUM_PRIOS +#define RDMA_TX_COUNTERS_MIN_LEVEL (RDMA_TX_BYPASS_MIN_LEVEL + 1) + +#define RDMA_TX_MACSEC_NUM_PRIOS 1 +#define RDMA_TX_MACESC_PRIO_NUM_LEVELS 1 +#define RDMA_TX_MACSEC_MIN_LEVEL (RDMA_TX_COUNTERS_MIN_LEVEL + RDMA_TX_MACSEC_NUM_PRIOS) + +static struct init_tree_node rdma_tx_root_fs = { + .type = FS_TYPE_NAMESPACE, + .ar_size = 3, + .children = (struct init_tree_node[]) { + [RDMA_TX_COUNTERS_PRIO] = + ADD_PRIO(0, RDMA_TX_COUNTERS_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(MLX5_RDMA_TX_NUM_COUNTERS_PRIOS, + RDMA_TX_COUNTERS_PRIO_NUM_LEVELS))), + [RDMA_TX_MACSEC_PRIO] = + ADD_PRIO(0, RDMA_TX_MACSEC_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(RDMA_TX_MACSEC_NUM_PRIOS, + RDMA_TX_MACESC_PRIO_NUM_LEVELS))), + [RDMA_TX_BYPASS_PRIO] = + ADD_PRIO(0, RDMA_TX_BYPASS_MIN_LEVEL, 0, + FS_CHAINING_CAPS_RDMA_TX, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(RDMA_TX_BYPASS_MIN_LEVEL, + BY_PASS_PRIO_NUM_LEVELS))), + } +}; + +enum fs_i_lock_class { + FS_LOCK_GRANDPARENT, + FS_LOCK_PARENT, + FS_LOCK_CHILD +}; + +static const struct rhashtable_params rhash_fte = { + .key_len = sizeof_field(struct fs_fte, val), + .key_offset = offsetof(struct fs_fte, val), + .head_offset = offsetof(struct fs_fte, hash), + .automatic_shrinking = true, + .min_size = 1, +}; + +static const struct rhashtable_params rhash_fg = { + .key_len = sizeof_field(struct mlx5_flow_group, mask), + .key_offset = offsetof(struct mlx5_flow_group, mask), + .head_offset = offsetof(struct mlx5_flow_group, hash), + .automatic_shrinking = true, + .min_size = 1, + +}; + +static void del_hw_flow_table(struct fs_node *node); +static void del_hw_flow_group(struct fs_node *node); +static void del_hw_fte(struct fs_node *node); +static void del_sw_flow_table(struct fs_node *node); +static void del_sw_flow_group(struct fs_node *node); +static void del_sw_fte(struct fs_node *node); +static void del_sw_prio(struct fs_node *node); +static void del_sw_ns(struct fs_node *node); +/* Delete rule (destination) is special case that + * requires to lock the FTE for all the deletion process. 
+ */ +static void del_sw_hw_rule(struct fs_node *node); +static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, + struct mlx5_flow_destination *d2); +static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns); +static struct mlx5_flow_rule * +find_flow_rule(struct fs_fte *fte, + struct mlx5_flow_destination *dest); + +static void tree_init_node(struct fs_node *node, + void (*del_hw_func)(struct fs_node *), + void (*del_sw_func)(struct fs_node *)) +{ + refcount_set(&node->refcount, 1); + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->children); + init_rwsem(&node->lock); + node->del_hw_func = del_hw_func; + node->del_sw_func = del_sw_func; + node->active = false; +} + +static void tree_add_node(struct fs_node *node, struct fs_node *parent) +{ + if (parent) + refcount_inc(&parent->refcount); + node->parent = parent; + + /* Parent is the root */ + if (!parent) + node->root = node; + else + node->root = parent->root; +} + +static int tree_get_node(struct fs_node *node) +{ + return refcount_inc_not_zero(&node->refcount); +} + +static void nested_down_read_ref_node(struct fs_node *node, + enum fs_i_lock_class class) +{ + if (node) { + down_read_nested(&node->lock, class); + refcount_inc(&node->refcount); + } +} + +static void nested_down_write_ref_node(struct fs_node *node, + enum fs_i_lock_class class) +{ + if (node) { + down_write_nested(&node->lock, class); + refcount_inc(&node->refcount); + } +} + +static void down_write_ref_node(struct fs_node *node, bool locked) +{ + if (node) { + if (!locked) + down_write(&node->lock); + refcount_inc(&node->refcount); + } +} + +static void up_read_ref_node(struct fs_node *node) +{ + refcount_dec(&node->refcount); + up_read(&node->lock); +} + +static void up_write_ref_node(struct fs_node *node, bool locked) +{ + refcount_dec(&node->refcount); + if (!locked) + up_write(&node->lock); +} + +static void tree_put_node(struct fs_node *node, bool locked) +{ + struct fs_node *parent_node = node->parent; + + if (refcount_dec_and_test(&node->refcount)) { + if (node->del_hw_func) + node->del_hw_func(node); + if (parent_node) { + down_write_ref_node(parent_node, locked); + list_del_init(&node->list); + } + node->del_sw_func(node); + if (parent_node) + up_write_ref_node(parent_node, locked); + node = NULL; + } + if (!node && parent_node) + tree_put_node(parent_node, locked); +} + +static int tree_remove_node(struct fs_node *node, bool locked) +{ + if (refcount_read(&node->refcount) > 1) { + refcount_dec(&node->refcount); + return -EEXIST; + } + tree_put_node(node, locked); + return 0; +} + +static struct fs_prio *find_prio(struct mlx5_flow_namespace *ns, + unsigned int prio) +{ + struct fs_prio *iter_prio; + + fs_for_each_prio(iter_prio, ns) { + if (iter_prio->prio == prio) + return iter_prio; + } + + return NULL; +} + +static bool is_fwd_next_action(u32 action) +{ + return action & (MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO | + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS); +} + +static bool check_valid_spec(const struct mlx5_flow_spec *spec) +{ + int i; + + for (i = 0; i < MLX5_ST_SZ_DW_MATCH_PARAM; i++) + if (spec->match_value[i] & ~spec->match_criteria[i]) { + pr_warn("mlx5_core: match_value differs from match_criteria\n"); + return false; + } + + return true; +} + +struct mlx5_flow_root_namespace *find_root(struct fs_node *node) +{ + struct fs_node *root; + struct mlx5_flow_namespace *ns; + + root = node->root; + + if (WARN_ON(root->type != FS_TYPE_NAMESPACE)) { + pr_warn("mlx5: flow steering node is not in tree or garbaged\n"); + return NULL; + } 
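+	/* node->root points at the namespace node embedded in the root namespace. */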
+ + ns = container_of(root, struct mlx5_flow_namespace, node); + return container_of(ns, struct mlx5_flow_root_namespace, ns); +} + +static inline struct mlx5_flow_steering *get_steering(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root = find_root(node); + + if (root) + return root->dev->priv.steering; + return NULL; +} + +static inline struct mlx5_core_dev *get_dev(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root = find_root(node); + + if (root) + return root->dev; + return NULL; +} + +static void del_sw_ns(struct fs_node *node) +{ + kfree(node); +} + +static void del_sw_prio(struct fs_node *node) +{ + kfree(node); +} + +static void del_hw_flow_table(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_table *ft; + struct mlx5_core_dev *dev; + int err; + + fs_get_obj(ft, node); + dev = get_dev(&ft->node); + root = find_root(&ft->node); + trace_mlx5_fs_del_ft(ft); + + if (node->active) { + err = root->cmds->destroy_flow_table(root, ft); + if (err) + mlx5_core_warn(dev, "flow steering can't destroy ft\n"); + } +} + +static void del_sw_flow_table(struct fs_node *node) +{ + struct mlx5_flow_table *ft; + struct fs_prio *prio; + + fs_get_obj(ft, node); + + rhltable_destroy(&ft->fgs_hash); + if (ft->node.parent) { + fs_get_obj(prio, ft->node.parent); + prio->num_ft--; + } + kfree(ft); +} + +static void modify_fte(struct fs_fte *fte) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct mlx5_core_dev *dev; + int err; + + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + dev = get_dev(&fte->node); + + root = find_root(&ft->node); + err = root->cmds->update_fte(root, ft, fg, fte->modify_mask, fte); + if (err) + mlx5_core_warn(dev, + "%s can't del rule fg id=%d fte_index=%d\n", + __func__, fg->id, fte->index); + fte->modify_mask = 0; +} + +static void del_sw_hw_rule(struct fs_node *node) +{ + struct mlx5_flow_rule *rule; + struct fs_fte *fte; + + fs_get_obj(rule, node); + fs_get_obj(fte, rule->node.parent); + trace_mlx5_fs_del_rule(rule); + if (is_fwd_next_action(rule->sw_action)) { + mutex_lock(&rule->dest_attr.ft->lock); + list_del(&rule->next_ft); + mutex_unlock(&rule->dest_attr.ft->lock); + } + + if (rule->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER && + --fte->dests_size) { + fte->modify_mask |= + BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | + BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); + fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT; + goto out; + } + + if (rule->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_PORT && + --fte->dests_size) { + fte->modify_mask |= BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION); + fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_ALLOW; + goto out; + } + + if ((fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) && + --fte->dests_size) { + fte->modify_mask |= + BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST); + } +out: + kfree(rule); +} + +static void del_hw_fte(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct mlx5_core_dev *dev; + struct fs_fte *fte; + int err; + + fs_get_obj(fte, node); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + + trace_mlx5_fs_del_fte(fte); + dev = get_dev(&ft->node); + root = find_root(&ft->node); + if (node->active) { + err = root->cmds->delete_fte(root, ft, fte); + if (err) + mlx5_core_warn(dev, + "flow steering can't delete fte in index %d of flow group id 
%d\n", + fte->index, fg->id); + node->active = false; + } +} + +static void del_sw_fte(struct fs_node *node) +{ + struct mlx5_flow_steering *steering = get_steering(node); + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int err; + + fs_get_obj(fte, node); + fs_get_obj(fg, fte->node.parent); + + err = rhashtable_remove_fast(&fg->ftes_hash, + &fte->hash, + rhash_fte); + WARN_ON(err); + ida_free(&fg->fte_allocator, fte->index - fg->start_index); + kmem_cache_free(steering->ftes_cache, fte); +} + +static void del_hw_flow_group(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_group *fg; + struct mlx5_flow_table *ft; + struct mlx5_core_dev *dev; + + fs_get_obj(fg, node); + fs_get_obj(ft, fg->node.parent); + dev = get_dev(&ft->node); + trace_mlx5_fs_del_fg(fg); + + root = find_root(&ft->node); + if (fg->node.active && root->cmds->destroy_flow_group(root, ft, fg)) + mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n", + fg->id, ft->id); +} + +static void del_sw_flow_group(struct fs_node *node) +{ + struct mlx5_flow_steering *steering = get_steering(node); + struct mlx5_flow_group *fg; + struct mlx5_flow_table *ft; + int err; + + fs_get_obj(fg, node); + fs_get_obj(ft, fg->node.parent); + + rhashtable_destroy(&fg->ftes_hash); + ida_destroy(&fg->fte_allocator); + if (ft->autogroup.active && + fg->max_ftes == ft->autogroup.group_size && + fg->start_index < ft->autogroup.max_fte) + ft->autogroup.num_groups--; + err = rhltable_remove(&ft->fgs_hash, + &fg->hash, + rhash_fg); + WARN_ON(err); + kmem_cache_free(steering->fgs_cache, fg); +} + +static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte) +{ + int index; + int ret; + + index = ida_alloc_max(&fg->fte_allocator, fg->max_ftes - 1, GFP_KERNEL); + if (index < 0) + return index; + + fte->index = index + fg->start_index; + ret = rhashtable_insert_fast(&fg->ftes_hash, + &fte->hash, + rhash_fte); + if (ret) + goto err_ida_remove; + + tree_add_node(&fte->node, &fg->node); + list_add_tail(&fte->node.list, &fg->node.children); + return 0; + +err_ida_remove: + ida_free(&fg->fte_allocator, index); + return ret; +} + +static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft, + const struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct fs_fte *fte; + + fte = kmem_cache_zalloc(steering->ftes_cache, GFP_KERNEL); + if (!fte) + return ERR_PTR(-ENOMEM); + + memcpy(fte->val, &spec->match_value, sizeof(fte->val)); + fte->node.type = FS_TYPE_FLOW_ENTRY; + fte->action = *flow_act; + fte->flow_context = spec->flow_context; + + tree_init_node(&fte->node, del_hw_fte, del_sw_fte); + + return fte; +} + +static void dealloc_flow_group(struct mlx5_flow_steering *steering, + struct mlx5_flow_group *fg) +{ + rhashtable_destroy(&fg->ftes_hash); + kmem_cache_free(steering->fgs_cache, fg); +} + +static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering, + u8 match_criteria_enable, + const void *match_criteria, + int start_index, + int end_index) +{ + struct mlx5_flow_group *fg; + int ret; + + fg = kmem_cache_zalloc(steering->fgs_cache, GFP_KERNEL); + if (!fg) + return ERR_PTR(-ENOMEM); + + ret = rhashtable_init(&fg->ftes_hash, &rhash_fte); + if (ret) { + kmem_cache_free(steering->fgs_cache, fg); + return ERR_PTR(ret); + } + + ida_init(&fg->fte_allocator); + fg->mask.match_criteria_enable = match_criteria_enable; + memcpy(&fg->mask.match_criteria, match_criteria, + sizeof(fg->mask.match_criteria)); + 
fg->node.type = FS_TYPE_FLOW_GROUP; + fg->start_index = start_index; + fg->max_ftes = end_index - start_index + 1; + + return fg; +} + +static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + const void *match_criteria, + int start_index, + int end_index, + struct list_head *prev) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_group *fg; + int ret; + + fg = alloc_flow_group(steering, match_criteria_enable, match_criteria, + start_index, end_index); + if (IS_ERR(fg)) + return fg; + + /* initialize refcnt, add to parent list */ + ret = rhltable_insert(&ft->fgs_hash, + &fg->hash, + rhash_fg); + if (ret) { + dealloc_flow_group(steering, fg); + return ERR_PTR(ret); + } + + tree_init_node(&fg->node, del_hw_flow_group, del_sw_flow_group); + tree_add_node(&fg->node, &ft->node); + /* Add node to group list */ + list_add(&fg->node.list, prev); + atomic_inc(&ft->node.version); + + return fg; +} + +static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, + enum fs_flow_table_type table_type, + enum fs_flow_table_op_mod op_mod, + u32 flags) +{ + struct mlx5_flow_table *ft; + int ret; + + ft = kzalloc(sizeof(*ft), GFP_KERNEL); + if (!ft) + return ERR_PTR(-ENOMEM); + + ret = rhltable_init(&ft->fgs_hash, &rhash_fg); + if (ret) { + kfree(ft); + return ERR_PTR(ret); + } + + ft->level = level; + ft->node.type = FS_TYPE_FLOW_TABLE; + ft->op_mod = op_mod; + ft->type = table_type; + ft->vport = vport; + ft->flags = flags; + INIT_LIST_HEAD(&ft->fwd_rules); + mutex_init(&ft->lock); + + return ft; +} + +/* If reverse is false, then we search for the first flow table in the + * root sub-tree from start(closest from right), else we search for the + * last flow table in the root sub-tree till start(closest from left). + */ +static struct mlx5_flow_table *find_closest_ft_recursive(struct fs_node *root, + struct list_head *start, + bool reverse, + bool ignore_chains_attr) +{ +#define list_advance_entry(pos, reverse) \ + ((reverse) ? list_prev_entry(pos, list) : list_next_entry(pos, list)) + +#define list_for_each_advance_continue(pos, head, reverse) \ + for (pos = list_advance_entry(pos, reverse); \ + &pos->list != (head); \ + pos = list_advance_entry(pos, reverse)) + + struct fs_node *iter = list_entry(start, struct fs_node, list); + struct mlx5_flow_table *ft = NULL; + + if (!root || (root->type == FS_TYPE_PRIO_CHAINS && + !ignore_chains_attr)) + return NULL; + + list_for_each_advance_continue(iter, &root->children, reverse) { + if (iter->type == FS_TYPE_FLOW_TABLE) { + fs_get_obj(ft, iter); + return ft; + } + ft = find_closest_ft_recursive(iter, &iter->children, reverse, + ignore_chains_attr); + if (ft) + return ft; + } + + return ft; +} + +/* If reverse is false then return the first flow table in next priority of + * prio in the tree, else return the last flow table in the previous priority + * of prio in the tree. 
+ */ +static struct mlx5_flow_table *find_closest_ft(struct fs_prio *prio, bool reverse) +{ + struct mlx5_flow_table *ft = NULL; + struct fs_node *curr_node; + struct fs_node *parent; + + parent = prio->node.parent; + curr_node = &prio->node; + while (!ft && parent) { + ft = find_closest_ft_recursive(parent, &curr_node->list, + reverse, + parent->type != + FS_TYPE_PRIO_CHAINS); + curr_node = parent; + parent = curr_node->parent; + } + return ft; +} + +/* Assuming all the tree is locked by mutex chain lock */ +static struct mlx5_flow_table *find_next_chained_ft(struct fs_prio *prio) +{ + return find_closest_ft(prio, false); +} + +/* Assuming all the tree is locked by mutex chain lock */ +static struct mlx5_flow_table *find_prev_chained_ft(struct fs_prio *prio) +{ + return find_closest_ft(prio, true); +} + +static struct mlx5_flow_table *find_next_fwd_ft(struct mlx5_flow_table *ft, + struct mlx5_flow_act *flow_act) +{ + struct fs_prio *prio; + bool next_ns; + + next_ns = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS; + fs_get_obj(prio, next_ns ? ft->ns->node.parent : ft->node.parent); + + return find_next_chained_ft(prio); +} + +static int connect_fts_in_prio(struct mlx5_core_dev *dev, + struct fs_prio *prio, + struct mlx5_flow_table *ft) +{ + struct mlx5_flow_root_namespace *root = find_root(&prio->node); + struct mlx5_flow_table *iter; + int err; + + fs_for_each_ft(iter, prio) { + err = root->cmds->modify_flow_table(root, iter, ft); + if (err) { + mlx5_core_err(dev, + "Failed to modify flow table id %d, type %d, err %d\n", + iter->id, iter->type, err); + /* The driver is out of sync with the FW */ + return err; + } + } + return 0; +} + +/* Connect flow tables from previous priority of prio to ft */ +static int connect_prev_fts(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct fs_prio *prio) +{ + struct mlx5_flow_table *prev_ft; + + prev_ft = find_prev_chained_ft(prio); + if (prev_ft) { + struct fs_prio *prev_prio; + + fs_get_obj(prev_prio, prev_ft->node.parent); + return connect_fts_in_prio(dev, prev_prio, ft); + } + return 0; +} + +static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio + *prio) +{ + struct mlx5_flow_root_namespace *root = find_root(&prio->node); + struct mlx5_ft_underlay_qp *uqp; + int min_level = INT_MAX; + int err = 0; + u32 qpn; + + if (root->root_ft) + min_level = root->root_ft->level; + + if (ft->level >= min_level) + return 0; + + if (list_empty(&root->underlay_qpns)) { + /* Don't set any QPN (zero) in case QPN list is empty */ + qpn = 0; + err = root->cmds->update_root_ft(root, ft, qpn, false); + } else { + list_for_each_entry(uqp, &root->underlay_qpns, list) { + qpn = uqp->qpn; + err = root->cmds->update_root_ft(root, ft, + qpn, false); + if (err) + break; + } + } + + if (err) + mlx5_core_warn(root->dev, + "Update root flow table of id(%u) qpn(%d) failed\n", + ft->id, qpn); + else + root->root_ft = ft; + + return err; +} + +static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct fs_fte *fte; + int modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST); + int err = 0; + + fs_get_obj(fte, rule->node.parent); + if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return -EINVAL; + down_write_ref_node(&fte->node, false); + fs_get_obj(fg, fte->node.parent); + fs_get_obj(ft, fg->node.parent); + + memcpy(&rule->dest_attr, dest, 
sizeof(*dest)); + root = find_root(&ft->node); + err = root->cmds->update_fte(root, ft, fg, + modify_mask, fte); + up_write_ref_node(&fte->node, false); + + return err; +} + +int mlx5_modify_rule_destination(struct mlx5_flow_handle *handle, + struct mlx5_flow_destination *new_dest, + struct mlx5_flow_destination *old_dest) +{ + int i; + + if (!old_dest) { + if (handle->num_rules != 1) + return -EINVAL; + return _mlx5_modify_rule_destination(handle->rule[0], + new_dest); + } + + for (i = 0; i < handle->num_rules; i++) { + if (mlx5_flow_dests_cmp(new_dest, &handle->rule[i]->dest_attr)) + return _mlx5_modify_rule_destination(handle->rule[i], + new_dest); + } + + return -EINVAL; +} + +/* Modify/set FWD rules that point on old_next_ft to point on new_next_ft */ +static int connect_fwd_rules(struct mlx5_core_dev *dev, + struct mlx5_flow_table *new_next_ft, + struct mlx5_flow_table *old_next_ft) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_rule *iter; + int err = 0; + + /* new_next_ft and old_next_ft could be NULL only + * when we create/destroy the anchor flow table. + */ + if (!new_next_ft || !old_next_ft) + return 0; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = new_next_ft; + + mutex_lock(&old_next_ft->lock); + list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules); + mutex_unlock(&old_next_ft->lock); + list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) { + if ((iter->sw_action & MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS) && + iter->ft->ns == new_next_ft->ns) + continue; + + err = _mlx5_modify_rule_destination(iter, &dest); + if (err) + pr_err("mlx5_core: failed to modify rule to point on flow table %d\n", + new_next_ft->id); + } + return 0; +} + +static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, + struct fs_prio *prio) +{ + struct mlx5_flow_table *next_ft, *first_ft; + int err = 0; + + /* Connect_prev_fts and update_root_ft_create are mutually exclusive */ + + first_ft = list_first_entry_or_null(&prio->node.children, + struct mlx5_flow_table, node.list); + if (!first_ft || first_ft->level > ft->level) { + err = connect_prev_fts(dev, ft, prio); + if (err) + return err; + + next_ft = first_ft ? first_ft : find_next_chained_ft(prio); + err = connect_fwd_rules(dev, ft, next_ft); + if (err) + return err; + } + + if (MLX5_CAP_FLOWTABLE(dev, + flow_table_properties_nic_receive.modify_root)) + err = update_root_ft_create(ft, prio); + return err; +} + +static void list_add_flow_table(struct mlx5_flow_table *ft, + struct fs_prio *prio) +{ + struct list_head *prev = &prio->node.children; + struct mlx5_flow_table *iter; + + fs_for_each_ft(iter, prio) { + if (iter->level > ft->level) + break; + prev = &iter->node.list; + } + list_add(&ft->node.list, prev); +} + +static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr, + enum fs_flow_table_op_mod op_mod, + u16 vport) +{ + struct mlx5_flow_root_namespace *root = find_root(&ns->node); + bool unmanaged = ft_attr->flags & MLX5_FLOW_TABLE_UNMANAGED; + struct mlx5_flow_table *next_ft; + struct fs_prio *fs_prio = NULL; + struct mlx5_flow_table *ft; + int err; + + if (!root) { + pr_err("mlx5: flow steering failed to find root of namespace\n"); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&root->chain_lock); + fs_prio = find_prio(ns, ft_attr->prio); + if (!fs_prio) { + err = -EINVAL; + goto unlock_root; + } + if (!unmanaged) { + /* The level is related to the + * priority level range. 
+ */ + if (ft_attr->level >= fs_prio->num_levels) { + err = -ENOSPC; + goto unlock_root; + } + + ft_attr->level += fs_prio->start_level; + } + + /* The level is related to the + * priority level range. + */ + ft = alloc_flow_table(ft_attr->level, + vport, + root->table_type, + op_mod, ft_attr->flags); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto unlock_root; + } + + tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table); + next_ft = unmanaged ? ft_attr->next_ft : + find_next_chained_ft(fs_prio); + ft->def_miss_action = ns->def_miss_action; + ft->ns = ns; + err = root->cmds->create_flow_table(root, ft, ft_attr, next_ft); + if (err) + goto free_ft; + + if (!unmanaged) { + err = connect_flow_table(root->dev, ft, fs_prio); + if (err) + goto destroy_ft; + } + + ft->node.active = true; + down_write_ref_node(&fs_prio->node, false); + if (!unmanaged) { + tree_add_node(&ft->node, &fs_prio->node); + list_add_flow_table(ft, fs_prio); + } else { + ft->node.root = fs_prio->node.root; + } + fs_prio->num_ft++; + up_write_ref_node(&fs_prio->node, false); + mutex_unlock(&root->chain_lock); + trace_mlx5_fs_add_ft(ft); + return ft; +destroy_ft: + root->cmds->destroy_flow_table(root, ft); +free_ft: + rhltable_destroy(&ft->fgs_hash); + kfree(ft); +unlock_root: + mutex_unlock(&root->chain_lock); + return ERR_PTR(err); +} + +struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr) +{ + return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_NORMAL, 0); +} +EXPORT_SYMBOL(mlx5_create_flow_table); + +u32 mlx5_flow_table_id(struct mlx5_flow_table *ft) +{ + return ft->id; +} +EXPORT_SYMBOL(mlx5_flow_table_id); + +struct mlx5_flow_table * +mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr, u16 vport) +{ + return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_NORMAL, vport); +} + +struct mlx5_flow_table* +mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, + int prio, u32 level) +{ + struct mlx5_flow_table_attr ft_attr = {}; + + ft_attr.level = level; + ft_attr.prio = prio; + ft_attr.max_fte = 1; + + return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0); +} +EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table); + +#define MAX_FLOW_GROUP_SIZE BIT(24) +struct mlx5_flow_table* +mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr) +{ + int num_reserved_entries = ft_attr->autogroup.num_reserved_entries; + int max_num_groups = ft_attr->autogroup.max_num_groups; + struct mlx5_flow_table *ft; + int autogroups_max_fte; + + ft = mlx5_create_flow_table(ns, ft_attr); + if (IS_ERR(ft)) + return ft; + + autogroups_max_fte = ft->max_fte - num_reserved_entries; + if (max_num_groups > autogroups_max_fte) + goto err_validate; + if (num_reserved_entries > ft->max_fte) + goto err_validate; + + /* Align the number of groups according to the largest group size */ + if (autogroups_max_fte / (max_num_groups + 1) > MAX_FLOW_GROUP_SIZE) + max_num_groups = (autogroups_max_fte / MAX_FLOW_GROUP_SIZE) - 1; + + ft->autogroup.active = true; + ft->autogroup.required_groups = max_num_groups; + ft->autogroup.max_fte = autogroups_max_fte; + /* We save place for flow groups in addition to max types */ + ft->autogroup.group_size = autogroups_max_fte / (max_num_groups + 1); + + return ft; + +err_validate: + mlx5_destroy_flow_table(ft); + return ERR_PTR(-ENOSPC); +} +EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table); + +static bool 
is_valid_matcher(u32 *fg_in) +{ + u8 fg_type = MLX5_GET(create_flow_group_in, fg_in, group_type); + u8 match_criteria_enable = MLX5_GET(create_flow_group_in, + fg_in, + match_criteria_enable); + int definer_id = MLX5_GET(create_flow_group_in, fg_in, + match_definer_id); + + if (!definer_id) + return fg_type != + MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_HASH_SPLIT; + + if (match_criteria_enable || + fg_type != MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_HASH_SPLIT) + return false; + return true; +} + +struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, + u32 *fg_in) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + void *match_criteria = MLX5_ADDR_OF(create_flow_group_in, + fg_in, match_criteria); + u8 match_criteria_enable = MLX5_GET(create_flow_group_in, + fg_in, + match_criteria_enable); + int start_index = MLX5_GET(create_flow_group_in, fg_in, + start_flow_index); + int end_index = MLX5_GET(create_flow_group_in, fg_in, + end_flow_index); + struct mlx5_flow_group *fg; + int err; + + if (ft->autogroup.active && start_index < ft->autogroup.max_fte) + return ERR_PTR(-EPERM); + + if (!is_valid_matcher(fg_in)) { + mlx5_core_warn(root->dev, "Not a valid matcher\n"); + return ERR_PTR(-EINVAL); + } + + down_write_ref_node(&ft->node, false); + fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria, + start_index, end_index, + ft->node.children.prev); + up_write_ref_node(&ft->node, false); + if (IS_ERR(fg)) + return fg; + + err = root->cmds->create_flow_group(root, ft, fg_in, fg); + if (err) { + tree_put_node(&fg->node, false); + return ERR_PTR(err); + } + trace_mlx5_fs_add_fg(fg); + fg->node.active = true; + + return fg; +} +EXPORT_SYMBOL(mlx5_create_flow_group); + +static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_rule *rule; + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) + return NULL; + + INIT_LIST_HEAD(&rule->next_ft); + rule->node.type = FS_TYPE_FLOW_DEST; + if (dest) + memcpy(&rule->dest_attr, dest, sizeof(*dest)); + + return rule; +} + +static struct mlx5_flow_handle *alloc_handle(int num_rules) +{ + struct mlx5_flow_handle *handle; + + handle = kzalloc(struct_size(handle, rule, num_rules), GFP_KERNEL); + if (!handle) + return NULL; + + handle->num_rules = num_rules; + + return handle; +} + +static void destroy_flow_handle(struct fs_fte *fte, + struct mlx5_flow_handle *handle, + struct mlx5_flow_destination *dest, + int i) +{ + for (; --i >= 0;) { + if (refcount_dec_and_test(&handle->rule[i]->node.refcount)) { + fte->dests_size--; + list_del(&handle->rule[i]->node.list); + kfree(handle->rule[i]); + } + } + kfree(handle); +} + +static struct mlx5_flow_handle * +create_flow_handle(struct fs_fte *fte, + struct mlx5_flow_destination *dest, + int dest_num, + int *modify_mask, + bool *new_rule) +{ + struct mlx5_flow_handle *handle; + struct mlx5_flow_rule *rule = NULL; + static int count = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); + static int dst = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST); + int type; + int i = 0; + + handle = alloc_handle((dest_num) ? dest_num : 1); + if (!handle) + return ERR_PTR(-ENOMEM); + + do { + if (dest) { + rule = find_flow_rule(fte, dest + i); + if (rule) { + refcount_inc(&rule->node.refcount); + goto rule_found; + } + } + + *new_rule = true; + rule = alloc_rule(dest + i); + if (!rule) + goto free_rules; + + /* Add dest to dests list- we need flow tables to be in the + * end of the list for forward to next prio rules. 
+ */ + tree_init_node(&rule->node, NULL, del_sw_hw_rule); + if (dest && + dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + list_add(&rule->node.list, &fte->node.children); + else + list_add_tail(&rule->node.list, &fte->node.children); + if (dest) { + fte->dests_size++; + + type = dest[i].type == + MLX5_FLOW_DESTINATION_TYPE_COUNTER; + *modify_mask |= type ? count : dst; + } +rule_found: + handle->rule[i] = rule; + } while (++i < dest_num); + + return handle; + +free_rules: + destroy_flow_handle(fte, handle, dest, i); + return ERR_PTR(-ENOMEM); +} + +/* fte should not be deleted while calling this function */ +static struct mlx5_flow_handle * +add_rule_fte(struct fs_fte *fte, + struct mlx5_flow_group *fg, + struct mlx5_flow_destination *dest, + int dest_num, + bool update_action) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_handle *handle; + struct mlx5_flow_table *ft; + int modify_mask = 0; + int err; + bool new_rule = false; + + handle = create_flow_handle(fte, dest, dest_num, &modify_mask, + &new_rule); + if (IS_ERR(handle) || !new_rule) + goto out; + + if (update_action) + modify_mask |= BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION); + + fs_get_obj(ft, fg->node.parent); + root = find_root(&fg->node); + if (!(fte->status & FS_FTE_STATUS_EXISTING)) + err = root->cmds->create_fte(root, ft, fg, fte); + else + err = root->cmds->update_fte(root, ft, fg, modify_mask, fte); + if (err) + goto free_handle; + + fte->node.active = true; + fte->status |= FS_FTE_STATUS_EXISTING; + atomic_inc(&fg->node.version); + +out: + return handle; + +free_handle: + destroy_flow_handle(fte, handle, dest, handle->num_rules); + return ERR_PTR(err); +} + +static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table *ft, + const struct mlx5_flow_spec *spec) +{ + struct list_head *prev = &ft->node.children; + u32 max_fte = ft->autogroup.max_fte; + unsigned int candidate_index = 0; + unsigned int group_size = 0; + struct mlx5_flow_group *fg; + + if (!ft->autogroup.active) + return ERR_PTR(-ENOENT); + + if (ft->autogroup.num_groups < ft->autogroup.required_groups) + group_size = ft->autogroup.group_size; + + /* max_fte == ft->autogroup.max_types */ + if (group_size == 0) + group_size = 1; + + /* sorted by start_index */ + fs_for_each_fg(fg, ft) { + if (candidate_index + group_size > fg->start_index) + candidate_index = fg->start_index + fg->max_ftes; + else + break; + prev = &fg->node.list; + } + + if (candidate_index + group_size > max_fte) + return ERR_PTR(-ENOSPC); + + fg = alloc_insert_flow_group(ft, + spec->match_criteria_enable, + spec->match_criteria, + candidate_index, + candidate_index + group_size - 1, + prev); + if (IS_ERR(fg)) + goto out; + + if (group_size == ft->autogroup.group_size) + ft->autogroup.num_groups++; + +out: + return fg; +} + +static int create_auto_flow_group(struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *match_criteria_addr; + u8 src_esw_owner_mask_on; + void *misc; + int err; + u32 *in; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, in, match_criteria_enable, + fg->mask.match_criteria_enable); + MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index); + MLX5_SET(create_flow_group_in, in, end_flow_index, fg->start_index + + fg->max_ftes - 1); + + misc = MLX5_ADDR_OF(fte_match_param, fg->mask.match_criteria, + misc_parameters); + 
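+ /* Reflect in the create_flow_group command whether the group mask
+  * matches on the source eswitch owner vhca id.
+  */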
src_esw_owner_mask_on = !!MLX5_GET(fte_match_set_misc, misc, + source_eswitch_owner_vhca_id); + MLX5_SET(create_flow_group_in, in, + source_eswitch_owner_vhca_id_valid, src_esw_owner_mask_on); + + match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in, + in, match_criteria); + memcpy(match_criteria_addr, fg->mask.match_criteria, + sizeof(fg->mask.match_criteria)); + + err = root->cmds->create_flow_group(root, ft, in, fg); + if (!err) { + fg->node.active = true; + trace_mlx5_fs_add_fg(fg); + } + + kvfree(in); + return err; +} + +static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, + struct mlx5_flow_destination *d2) +{ + if (d1->type == d2->type) { + if (((d1->type == MLX5_FLOW_DESTINATION_TYPE_VPORT || + d1->type == MLX5_FLOW_DESTINATION_TYPE_UPLINK) && + d1->vport.num == d2->vport.num && + d1->vport.flags == d2->vport.flags && + ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_VHCA_ID) ? + (d1->vport.vhca_id == d2->vport.vhca_id) : true) && + ((d1->vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID) ? + (d1->vport.pkt_reformat->id == + d2->vport.pkt_reformat->id) : true)) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + d1->ft == d2->ft) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR && + d1->tir_num == d2->tir_num) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM && + d1->ft_num == d2->ft_num) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER && + d1->sampler_id == d2->sampler_id)) + return true; + } + + return false; +} + +static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte, + struct mlx5_flow_destination *dest) +{ + struct mlx5_flow_rule *rule; + + list_for_each_entry(rule, &fte->node.children, node.list) { + if (mlx5_flow_dests_cmp(&rule->dest_attr, dest)) + return rule; + } + return NULL; +} + +static bool check_conflicting_actions_vlan(const struct mlx5_fs_vlan *vlan0, + const struct mlx5_fs_vlan *vlan1) +{ + return vlan0->ethtype != vlan1->ethtype || + vlan0->vid != vlan1->vid || + vlan0->prio != vlan1->prio; +} + +static bool check_conflicting_actions(const struct mlx5_flow_act *act1, + const struct mlx5_flow_act *act2) +{ + u32 action1 = act1->action; + u32 action2 = act2->action; + u32 xored_actions; + + xored_actions = action1 ^ action2; + + /* if one rule only wants to count, it's ok */ + if (action1 == MLX5_FLOW_CONTEXT_ACTION_COUNT || + action2 == MLX5_FLOW_CONTEXT_ACTION_COUNT) + return false; + + if (xored_actions & (MLX5_FLOW_CONTEXT_ACTION_DROP | + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT | + MLX5_FLOW_CONTEXT_ACTION_DECAP | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR | + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP | + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 | + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2)) + return true; + + if (action1 & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT && + act1->pkt_reformat != act2->pkt_reformat) + return true; + + if (action1 & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR && + act1->modify_hdr != act2->modify_hdr) + return true; + + if (action1 & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH && + check_conflicting_actions_vlan(&act1->vlan[0], &act2->vlan[0])) + return true; + + if (action1 & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 && + check_conflicting_actions_vlan(&act1->vlan[1], &act2->vlan[1])) + return true; + + return false; +} + +static int check_conflicting_ftes(struct fs_fte *fte, + const struct mlx5_flow_context *flow_context, + const struct mlx5_flow_act *flow_act) +{ + if (check_conflicting_actions(flow_act, &fte->action)) { + mlx5_core_warn(get_dev(&fte->node), + "Found two FTEs with conflicting 
actions\n"); + return -EEXIST; + } + + if ((flow_context->flags & FLOW_CONTEXT_HAS_TAG) && + fte->flow_context.flow_tag != flow_context->flow_tag) { + mlx5_core_warn(get_dev(&fte->node), + "FTE flow tag %u already exists with different flow tag %u\n", + fte->flow_context.flow_tag, + flow_context->flow_tag); + return -EEXIST; + } + + return 0; +} + +static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg, + const struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num, + struct fs_fte *fte) +{ + struct mlx5_flow_handle *handle; + int old_action; + int i; + int ret; + + ret = check_conflicting_ftes(fte, &spec->flow_context, flow_act); + if (ret) + return ERR_PTR(ret); + + old_action = fte->action.action; + fte->action.action |= flow_act->action; + handle = add_rule_fte(fte, fg, dest, dest_num, + old_action != flow_act->action); + if (IS_ERR(handle)) { + fte->action.action = old_action; + return handle; + } + trace_mlx5_fs_set_fte(fte, false); + + for (i = 0; i < handle->num_rules; i++) { + if (refcount_read(&handle->rule[i]->node.refcount) == 1) { + tree_add_node(&handle->rule[i]->node, &fte->node); + trace_mlx5_fs_add_rule(handle->rule[i]); + } + } + return handle; +} + +static bool counter_is_valid(u32 action) +{ + return (action & (MLX5_FLOW_CONTEXT_ACTION_DROP | + MLX5_FLOW_CONTEXT_ACTION_ALLOW | + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)); +} + +static bool dest_is_valid(struct mlx5_flow_destination *dest, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_table *ft) +{ + bool ignore_level = flow_act->flags & FLOW_ACT_IGNORE_FLOW_LEVEL; + u32 action = flow_act->action; + + if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)) + return counter_is_valid(action); + + if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return true; + + if (ignore_level) { + if (ft->type != FS_FT_FDB && + ft->type != FS_FT_NIC_RX) + return false; + + if (dest->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE && + ft->type != dest->ft->type) + return false; + } + + if (!dest || ((dest->type == + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) && + (dest->ft->level <= ft->level && !ignore_level))) + return false; + return true; +} + +struct match_list { + struct list_head list; + struct mlx5_flow_group *g; +}; + +static void free_match_list(struct match_list *head, bool ft_locked) +{ + struct match_list *iter, *match_tmp; + + list_for_each_entry_safe(iter, match_tmp, &head->list, + list) { + tree_put_node(&iter->g->node, ft_locked); + list_del(&iter->list); + kfree(iter); + } +} + +static int build_match_list(struct match_list *match_head, + struct mlx5_flow_table *ft, + const struct mlx5_flow_spec *spec, + struct mlx5_flow_group *fg, + bool ft_locked) +{ + struct rhlist_head *tmp, *list; + struct mlx5_flow_group *g; + int err = 0; + + rcu_read_lock(); + INIT_LIST_HEAD(&match_head->list); + /* Collect all fgs which has a matching match_criteria */ + list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg); + /* RCU is atomic, we can't execute FW commands here */ + rhl_for_each_entry_rcu(g, tmp, list, hash) { + struct match_list *curr_match; + + if (fg && fg != g) + continue; + + if (unlikely(!tree_get_node(&g->node))) + continue; + + curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC); + if (!curr_match) { + rcu_read_unlock(); + free_match_list(match_head, ft_locked); + return -ENOMEM; + } + curr_match->g = g; + list_add_tail(&curr_match->list, &match_head->list); + } + rcu_read_unlock(); + return err; +} + +static u64 
matched_fgs_get_version(struct list_head *match_head) +{ + struct match_list *iter; + u64 version = 0; + + list_for_each_entry(iter, match_head, list) + version += (u64)atomic_read(&iter->g->node.version); + return version; +} + +static struct fs_fte * +lookup_fte_locked(struct mlx5_flow_group *g, + const u32 *match_value, + bool take_write) +{ + struct fs_fte *fte_tmp; + + if (take_write) + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + else + nested_down_read_ref_node(&g->node, FS_LOCK_PARENT); + fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, match_value, + rhash_fte); + if (!fte_tmp || !tree_get_node(&fte_tmp->node)) { + fte_tmp = NULL; + goto out; + } + if (!fte_tmp->node.active) { + tree_put_node(&fte_tmp->node, false); + fte_tmp = NULL; + goto out; + } + + nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD); +out: + if (take_write) + up_write_ref_node(&g->node, false); + else + up_read_ref_node(&g->node); + return fte_tmp; +} + +static struct mlx5_flow_handle * +try_add_to_existing_fg(struct mlx5_flow_table *ft, + struct list_head *match_head, + const struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num, + int ft_version) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_group *g; + struct mlx5_flow_handle *rule; + struct match_list *iter; + bool take_write = false; + struct fs_fte *fte; + u64 version = 0; + bool try_again = false; + int err; + + fte = alloc_fte(ft, spec, flow_act); + if (IS_ERR(fte)) + return ERR_PTR(-ENOMEM); + +search_again_locked: + if (flow_act->flags & FLOW_ACT_NO_APPEND) + goto skip_search; + version = matched_fgs_get_version(match_head); + /* Try to find an fte with identical match value and attempt update its + * action. + */ + list_for_each_entry(iter, match_head, list) { + struct fs_fte *fte_tmp; + + g = iter->g; + fte_tmp = lookup_fte_locked(g, spec->match_value, take_write); + if (!fte_tmp) + continue; + rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte_tmp); + /* No error check needed here, because insert_fte() is not called */ + up_write_ref_node(&fte_tmp->node, false); + tree_put_node(&fte_tmp->node, false); + kmem_cache_free(steering->ftes_cache, fte); + return rule; + } + +skip_search: + /* No group with matching fte found, or we skipped the search. + * Try to add a new fte to any matching fg. + */ + + /* Check the ft version, for case that new flow group + * was added while the fgs weren't locked + */ + if (atomic_read(&ft->node.version) != ft_version) { + rule = ERR_PTR(-EAGAIN); + goto out; + } + + /* Check the fgs version. If version have changed it could be that an + * FTE with the same match value was added while the fgs weren't + * locked. 
+ */ + if (!(flow_act->flags & FLOW_ACT_NO_APPEND) && + version != matched_fgs_get_version(match_head)) { + take_write = true; + goto search_again_locked; + } + + list_for_each_entry(iter, match_head, list) { + g = iter->g; + + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + + if (!g->node.active) { + try_again = true; + up_write_ref_node(&g->node, false); + continue; + } + + err = insert_fte(g, fte); + if (err) { + up_write_ref_node(&g->node, false); + if (err == -ENOSPC) + continue; + kmem_cache_free(steering->ftes_cache, fte); + return ERR_PTR(err); + } + + nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD); + up_write_ref_node(&g->node, false); + rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte); + up_write_ref_node(&fte->node, false); + if (IS_ERR(rule)) + tree_put_node(&fte->node, false); + return rule; + } + if (try_again) + err = -EAGAIN; + else + err = -ENOENT; + rule = ERR_PTR(err); +out: + kmem_cache_free(steering->ftes_cache, fte); + return rule; +} + +static struct mlx5_flow_handle * +_mlx5_add_flow_rules(struct mlx5_flow_table *ft, + const struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num) + +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_handle *rule; + struct match_list match_head; + struct mlx5_flow_group *g; + bool take_write = false; + struct fs_fte *fte; + int version; + int err; + int i; + + if (!check_valid_spec(spec)) + return ERR_PTR(-EINVAL); + + if (flow_act->fg && ft->autogroup.active) + return ERR_PTR(-EINVAL); + + for (i = 0; i < dest_num; i++) { + if (!dest_is_valid(&dest[i], flow_act, ft)) + return ERR_PTR(-EINVAL); + } + nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT); +search_again_locked: + version = atomic_read(&ft->node.version); + + /* Collect all fgs which has a matching match_criteria */ + err = build_match_list(&match_head, ft, spec, flow_act->fg, take_write); + if (!take_write) { + up_read_ref_node(&ft->node); + } else { + up_write_ref_node(&ft->node, false); + take_write = false; + } + + if (err) + return ERR_PTR(err); + + rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest, + dest_num, version); + free_match_list(&match_head, take_write); + if (!IS_ERR(rule) || + (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) { + if (take_write) + up_write_ref_node(&ft->node, false); + return rule; + } + + if (!take_write) { + nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT); + take_write = true; + } + + if (PTR_ERR(rule) == -EAGAIN || + version != atomic_read(&ft->node.version)) + goto search_again_locked; + + g = alloc_auto_flow_group(ft, spec); + if (IS_ERR(g)) { + rule = ERR_CAST(g); + up_write_ref_node(&ft->node, false); + return rule; + } + + fte = alloc_fte(ft, spec, flow_act); + if (IS_ERR(fte)) { + up_write_ref_node(&ft->node, false); + err = PTR_ERR(fte); + goto err_alloc_fte; + } + + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + up_write_ref_node(&ft->node, false); + + err = create_auto_flow_group(ft, g); + if (err) + goto err_release_fg; + + err = insert_fte(g, fte); + if (err) + goto err_release_fg; + + nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD); + up_write_ref_node(&g->node, false); + rule = add_rule_fg(g, spec, flow_act, dest, dest_num, fte); + up_write_ref_node(&fte->node, false); + if (IS_ERR(rule)) + tree_put_node(&fte->node, false); + tree_put_node(&g->node, false); + return rule; + +err_release_fg: + up_write_ref_node(&g->node, false); + 
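+ /* The fte was never inserted into the group, so it is safe to return
+  * it straight to the cache.
+  */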
kmem_cache_free(steering->ftes_cache, fte); +err_alloc_fte: + tree_put_node(&g->node, false); + return ERR_PTR(err); +} + +static bool fwd_next_prio_supported(struct mlx5_flow_table *ft) +{ + return ((ft->type == FS_FT_ESW_EGRESS_ACL || + ft->type == FS_FT_ESW_INGRESS_ACL) || + ((ft->type == FS_FT_NIC_RX) && + (MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs)))); +} + +struct mlx5_flow_handle * +mlx5_add_flow_rules(struct mlx5_flow_table *ft, + const struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int num_dest) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + static const struct mlx5_flow_spec zero_spec = {}; + struct mlx5_flow_destination *gen_dest = NULL; + struct mlx5_flow_table *next_ft = NULL; + struct mlx5_flow_handle *handle = NULL; + u32 sw_action = flow_act->action; + int i; + + if (!spec) + spec = &zero_spec; + + if (!is_fwd_next_action(sw_action)) + return _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest); + + if (!fwd_next_prio_supported(ft)) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&root->chain_lock); + next_ft = find_next_fwd_ft(ft, flow_act); + if (!next_ft) { + handle = ERR_PTR(-EOPNOTSUPP); + goto unlock; + } + + gen_dest = kcalloc(num_dest + 1, sizeof(*dest), + GFP_KERNEL); + if (!gen_dest) { + handle = ERR_PTR(-ENOMEM); + goto unlock; + } + for (i = 0; i < num_dest; i++) + gen_dest[i] = dest[i]; + gen_dest[i].type = + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + gen_dest[i].ft = next_ft; + dest = gen_dest; + num_dest++; + flow_act->action &= ~(MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO | + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS); + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest); + if (IS_ERR(handle)) + goto unlock; + + if (list_empty(&handle->rule[num_dest - 1]->next_ft)) { + mutex_lock(&next_ft->lock); + list_add(&handle->rule[num_dest - 1]->next_ft, + &next_ft->fwd_rules); + mutex_unlock(&next_ft->lock); + handle->rule[num_dest - 1]->sw_action = sw_action; + handle->rule[num_dest - 1]->ft = ft; + } +unlock: + mutex_unlock(&root->chain_lock); + kfree(gen_dest); + return handle; +} +EXPORT_SYMBOL(mlx5_add_flow_rules); + +void mlx5_del_flow_rules(struct mlx5_flow_handle *handle) +{ + struct fs_fte *fte; + int i; + + /* In order to consolidate the HW changes we lock the FTE for other + * changes, and increase its refcount, in order not to perform the + * "del" functions of the FTE. Will handle them here. + * The removal of the rules is done under locked FTE. + * After removing all the handle's rules, if there are remaining + * rules, it means we just need to modify the FTE in FW, and + * unlock/decrease the refcount we increased before. + * Otherwise, it means the FTE should be deleted. First delete the + * FTE in FW. Then, unlock the FTE, and proceed the tree_put_node of + * the FTE, which will handle the last decrease of the refcount, as + * well as required handling of its parent. 
+ */ + fs_get_obj(fte, handle->rule[0]->node.parent); + down_write_ref_node(&fte->node, false); + for (i = handle->num_rules - 1; i >= 0; i--) + tree_remove_node(&handle->rule[i]->node, true); + if (list_empty(&fte->node.children)) { + del_hw_fte(&fte->node); + /* Avoid double call to del_hw_fte */ + fte->node.del_hw_func = NULL; + up_write_ref_node(&fte->node, false); + tree_put_node(&fte->node, false); + } else if (fte->dests_size) { + if (fte->modify_mask) + modify_fte(fte); + up_write_ref_node(&fte->node, false); + } else { + up_write_ref_node(&fte->node, false); + } + kfree(handle); +} +EXPORT_SYMBOL(mlx5_del_flow_rules); + +/* Assuming prio->node.children(flow tables) is sorted by level */ +static struct mlx5_flow_table *find_next_ft(struct mlx5_flow_table *ft) +{ + struct fs_prio *prio; + + fs_get_obj(prio, ft->node.parent); + + if (!list_is_last(&ft->node.list, &prio->node.children)) + return list_next_entry(ft, node.list); + return find_next_chained_ft(prio); +} + +static int update_root_ft_destroy(struct mlx5_flow_table *ft) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + struct mlx5_ft_underlay_qp *uqp; + struct mlx5_flow_table *new_root_ft = NULL; + int err = 0; + u32 qpn; + + if (root->root_ft != ft) + return 0; + + new_root_ft = find_next_ft(ft); + if (!new_root_ft) { + root->root_ft = NULL; + return 0; + } + + if (list_empty(&root->underlay_qpns)) { + /* Don't set any QPN (zero) in case QPN list is empty */ + qpn = 0; + err = root->cmds->update_root_ft(root, new_root_ft, + qpn, false); + } else { + list_for_each_entry(uqp, &root->underlay_qpns, list) { + qpn = uqp->qpn; + err = root->cmds->update_root_ft(root, + new_root_ft, qpn, + false); + if (err) + break; + } + } + + if (err) + mlx5_core_warn(root->dev, + "Update root flow table of id(%u) qpn(%d) failed\n", + ft->id, qpn); + else + root->root_ft = new_root_ft; + + return 0; +} + +/* Connect flow table from previous priority to + * the next flow table. 
+ */ +static int disconnect_flow_table(struct mlx5_flow_table *ft) +{ + struct mlx5_core_dev *dev = get_dev(&ft->node); + struct mlx5_flow_table *next_ft; + struct fs_prio *prio; + int err = 0; + + err = update_root_ft_destroy(ft); + if (err) + return err; + + fs_get_obj(prio, ft->node.parent); + if (!(list_first_entry(&prio->node.children, + struct mlx5_flow_table, + node.list) == ft)) + return 0; + + next_ft = find_next_ft(ft); + err = connect_fwd_rules(dev, next_ft, ft); + if (err) + return err; + + err = connect_prev_fts(dev, next_ft, prio); + if (err) + mlx5_core_warn(dev, "Failed to disconnect flow table %d\n", + ft->id); + return err; +} + +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft) +{ + struct mlx5_flow_root_namespace *root = find_root(&ft->node); + int err = 0; + + mutex_lock(&root->chain_lock); + if (!(ft->flags & MLX5_FLOW_TABLE_UNMANAGED)) + err = disconnect_flow_table(ft); + if (err) { + mutex_unlock(&root->chain_lock); + return err; + } + if (tree_remove_node(&ft->node, false)) + mlx5_core_warn(get_dev(&ft->node), "Flow table %d wasn't destroyed, refcount > 1\n", + ft->id); + mutex_unlock(&root->chain_lock); + + return err; +} +EXPORT_SYMBOL(mlx5_destroy_flow_table); + +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg) +{ + if (tree_remove_node(&fg->node, false)) + mlx5_core_warn(get_dev(&fg->node), "Flow group %d wasn't destroyed, refcount > 1\n", + fg->id); +} +EXPORT_SYMBOL(mlx5_destroy_flow_group); + +struct mlx5_flow_namespace *mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev, + int n) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + + if (!steering || !steering->fdb_sub_ns) + return NULL; + + return steering->fdb_sub_ns[n]; +} +EXPORT_SYMBOL(mlx5_get_fdb_sub_ns); + +static bool is_nic_rx_ns(enum mlx5_flow_namespace_type type) +{ + switch (type) { + case MLX5_FLOW_NAMESPACE_BYPASS: + case MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC: + case MLX5_FLOW_NAMESPACE_LAG: + case MLX5_FLOW_NAMESPACE_OFFLOADS: + case MLX5_FLOW_NAMESPACE_ETHTOOL: + case MLX5_FLOW_NAMESPACE_KERNEL: + case MLX5_FLOW_NAMESPACE_LEFTOVERS: + case MLX5_FLOW_NAMESPACE_ANCHOR: + return true; + default: + return false; + } +} + +struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + struct mlx5_flow_root_namespace *root_ns; + int prio = 0; + struct fs_prio *fs_prio; + struct mlx5_flow_namespace *ns; + + if (!steering) + return NULL; + + switch (type) { + case MLX5_FLOW_NAMESPACE_FDB: + if (steering->fdb_root_ns) + return &steering->fdb_root_ns->ns; + return NULL; + case MLX5_FLOW_NAMESPACE_PORT_SEL: + if (steering->port_sel_root_ns) + return &steering->port_sel_root_ns->ns; + return NULL; + case MLX5_FLOW_NAMESPACE_SNIFFER_RX: + if (steering->sniffer_rx_root_ns) + return &steering->sniffer_rx_root_ns->ns; + return NULL; + case MLX5_FLOW_NAMESPACE_SNIFFER_TX: + if (steering->sniffer_tx_root_ns) + return &steering->sniffer_tx_root_ns->ns; + return NULL; + case MLX5_FLOW_NAMESPACE_FDB_BYPASS: + root_ns = steering->fdb_root_ns; + prio = FDB_BYPASS_PATH; + break; + case MLX5_FLOW_NAMESPACE_EGRESS: + case MLX5_FLOW_NAMESPACE_EGRESS_IPSEC: + case MLX5_FLOW_NAMESPACE_EGRESS_MACSEC: + root_ns = steering->egress_root_ns; + prio = type - MLX5_FLOW_NAMESPACE_EGRESS; + break; + case MLX5_FLOW_NAMESPACE_RDMA_RX: + root_ns = steering->rdma_rx_root_ns; + prio = RDMA_RX_BYPASS_PRIO; + break; + case MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL: + root_ns = steering->rdma_rx_root_ns; + 
prio = RDMA_RX_KERNEL_PRIO; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TX: + root_ns = steering->rdma_tx_root_ns; + break; + case MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS: + root_ns = steering->rdma_rx_root_ns; + prio = RDMA_RX_COUNTERS_PRIO; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS: + root_ns = steering->rdma_tx_root_ns; + prio = RDMA_TX_COUNTERS_PRIO; + break; + case MLX5_FLOW_NAMESPACE_RDMA_RX_MACSEC: + root_ns = steering->rdma_rx_root_ns; + prio = RDMA_RX_MACSEC_PRIO; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC: + root_ns = steering->rdma_tx_root_ns; + prio = RDMA_TX_MACSEC_PRIO; + break; + default: /* Must be NIC RX */ + WARN_ON(!is_nic_rx_ns(type)); + root_ns = steering->root_ns; + prio = type; + break; + } + + if (!root_ns) + return NULL; + + fs_prio = find_prio(&root_ns->ns, prio); + if (!fs_prio) + return NULL; + + ns = list_first_entry(&fs_prio->node.children, + typeof(*ns), + node.list); + + return ns; +} +EXPORT_SYMBOL(mlx5_get_flow_namespace); + +struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type, + int vport) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + + if (!steering) + return NULL; + + switch (type) { + case MLX5_FLOW_NAMESPACE_ESW_EGRESS: + if (vport >= steering->esw_egress_acl_vports) + return NULL; + if (steering->esw_egress_root_ns && + steering->esw_egress_root_ns[vport]) + return &steering->esw_egress_root_ns[vport]->ns; + else + return NULL; + case MLX5_FLOW_NAMESPACE_ESW_INGRESS: + if (vport >= steering->esw_ingress_acl_vports) + return NULL; + if (steering->esw_ingress_root_ns && + steering->esw_ingress_root_ns[vport]) + return &steering->esw_ingress_root_ns[vport]->ns; + else + return NULL; + default: + return NULL; + } +} + +static struct fs_prio *_fs_create_prio(struct mlx5_flow_namespace *ns, + unsigned int prio, + int num_levels, + enum fs_node_type type) +{ + struct fs_prio *fs_prio; + + fs_prio = kzalloc(sizeof(*fs_prio), GFP_KERNEL); + if (!fs_prio) + return ERR_PTR(-ENOMEM); + + fs_prio->node.type = type; + tree_init_node(&fs_prio->node, NULL, del_sw_prio); + tree_add_node(&fs_prio->node, &ns->node); + fs_prio->num_levels = num_levels; + fs_prio->prio = prio; + list_add_tail(&fs_prio->node.list, &ns->node.children); + + return fs_prio; +} + +static struct fs_prio *fs_create_prio_chained(struct mlx5_flow_namespace *ns, + unsigned int prio, + int num_levels) +{ + return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO_CHAINS); +} + +static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns, + unsigned int prio, int num_levels) +{ + return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO); +} + +static struct mlx5_flow_namespace *fs_init_namespace(struct mlx5_flow_namespace + *ns) +{ + ns->node.type = FS_TYPE_NAMESPACE; + + return ns; +} + +static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio, + int def_miss_act) +{ + struct mlx5_flow_namespace *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return ERR_PTR(-ENOMEM); + + fs_init_namespace(ns); + ns->def_miss_action = def_miss_act; + tree_init_node(&ns->node, NULL, del_sw_ns); + tree_add_node(&ns->node, &prio->node); + list_add_tail(&ns->node.list, &prio->node.children); + + return ns; +} + +static int create_leaf_prios(struct mlx5_flow_namespace *ns, int prio, + struct init_tree_node *prio_metadata) +{ + struct fs_prio *fs_prio; + int i; + + for (i = 0; i < prio_metadata->num_leaf_prios; i++) { + fs_prio = fs_create_prio(ns, prio++, 
prio_metadata->num_levels); + if (IS_ERR(fs_prio)) + return PTR_ERR(fs_prio); + } + return 0; +} + +#define FLOW_TABLE_BIT_SZ 1 +#define GET_FLOW_TABLE_CAP(dev, offset) \ + ((be32_to_cpu(*((__be32 *)(dev->caps.hca[MLX5_CAP_FLOW_TABLE]->cur) + \ + offset / 32)) >> \ + (32 - FLOW_TABLE_BIT_SZ - (offset & 0x1f))) & FLOW_TABLE_BIT_SZ) +static bool has_required_caps(struct mlx5_core_dev *dev, struct node_caps *caps) +{ + int i; + + for (i = 0; i < caps->arr_sz; i++) { + if (!GET_FLOW_TABLE_CAP(dev, caps->caps[i])) + return false; + } + return true; +} + +static int init_root_tree_recursive(struct mlx5_flow_steering *steering, + struct init_tree_node *init_node, + struct fs_node *fs_parent_node, + struct init_tree_node *init_parent_node, + int prio) +{ + int max_ft_level = MLX5_CAP_FLOWTABLE(steering->dev, + flow_table_properties_nic_receive. + max_ft_level); + struct mlx5_flow_namespace *fs_ns; + struct fs_prio *fs_prio; + struct fs_node *base; + int i; + int err; + + if (init_node->type == FS_TYPE_PRIO) { + if ((init_node->min_ft_level > max_ft_level) || + !has_required_caps(steering->dev, &init_node->caps)) + return 0; + + fs_get_obj(fs_ns, fs_parent_node); + if (init_node->num_leaf_prios) + return create_leaf_prios(fs_ns, prio, init_node); + fs_prio = fs_create_prio(fs_ns, prio, init_node->num_levels); + if (IS_ERR(fs_prio)) + return PTR_ERR(fs_prio); + base = &fs_prio->node; + } else if (init_node->type == FS_TYPE_NAMESPACE) { + fs_get_obj(fs_prio, fs_parent_node); + fs_ns = fs_create_namespace(fs_prio, init_node->def_miss_action); + if (IS_ERR(fs_ns)) + return PTR_ERR(fs_ns); + base = &fs_ns->node; + } else { + return -EINVAL; + } + prio = 0; + for (i = 0; i < init_node->ar_size; i++) { + err = init_root_tree_recursive(steering, &init_node->children[i], + base, init_node, prio); + if (err) + return err; + if (init_node->children[i].type == FS_TYPE_PRIO && + init_node->children[i].num_leaf_prios) { + prio += init_node->children[i].num_leaf_prios; + } + } + + return 0; +} + +static int init_root_tree(struct mlx5_flow_steering *steering, + struct init_tree_node *init_node, + struct fs_node *fs_parent_node) +{ + int err; + int i; + + for (i = 0; i < init_node->ar_size; i++) { + err = init_root_tree_recursive(steering, &init_node->children[i], + fs_parent_node, + init_node, i); + if (err) + return err; + } + return 0; +} + +static void del_sw_root_ns(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_namespace *ns; + + fs_get_obj(ns, node); + root_ns = container_of(ns, struct mlx5_flow_root_namespace, ns); + mutex_destroy(&root_ns->chain_lock); + kfree(node); +} + +static struct mlx5_flow_root_namespace +*create_root_ns(struct mlx5_flow_steering *steering, + enum fs_flow_table_type table_type) +{ + const struct mlx5_flow_cmds *cmds = mlx5_fs_cmd_get_default(table_type); + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_namespace *ns; + + if (mlx5_fpga_ipsec_device_caps(steering->dev) & MLX5_ACCEL_IPSEC_CAP_DEVICE && + (table_type == FS_FT_NIC_RX || table_type == FS_FT_NIC_TX)) + cmds = mlx5_fs_cmd_get_default_ipsec_fpga_cmds(table_type); + + /* Create the root namespace */ + root_ns = kzalloc(sizeof(*root_ns), GFP_KERNEL); + if (!root_ns) + return NULL; + + root_ns->dev = steering->dev; + root_ns->table_type = table_type; + root_ns->cmds = cmds; + + INIT_LIST_HEAD(&root_ns->underlay_qpns); + + ns = &root_ns->ns; + fs_init_namespace(ns); + mutex_init(&root_ns->chain_lock); + tree_init_node(&ns->node, NULL, del_sw_root_ns); + 
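+ /* Root namespaces have no parent node, hence the NULL parent below. */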
tree_add_node(&ns->node, NULL); + + return root_ns; +} + +static void set_prio_attrs_in_prio(struct fs_prio *prio, int acc_level); + +static int set_prio_attrs_in_ns(struct mlx5_flow_namespace *ns, int acc_level) +{ + struct fs_prio *prio; + + fs_for_each_prio(prio, ns) { + /* This updates prio start_level and num_levels */ + set_prio_attrs_in_prio(prio, acc_level); + acc_level += prio->num_levels; + } + return acc_level; +} + +static void set_prio_attrs_in_prio(struct fs_prio *prio, int acc_level) +{ + struct mlx5_flow_namespace *ns; + int acc_level_ns = acc_level; + + prio->start_level = acc_level; + fs_for_each_ns(ns, prio) { + /* This updates start_level and num_levels of ns's priority descendants */ + acc_level_ns = set_prio_attrs_in_ns(ns, acc_level); + + /* If this a prio with chains, and we can jump from one chain + * (namespace) to another, so we accumulate the levels + */ + if (prio->node.type == FS_TYPE_PRIO_CHAINS) + acc_level = acc_level_ns; + } + + if (!prio->num_levels) + prio->num_levels = acc_level_ns - prio->start_level; + WARN_ON(prio->num_levels < acc_level_ns - prio->start_level); +} + +static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns) +{ + struct mlx5_flow_namespace *ns = &root_ns->ns; + struct fs_prio *prio; + int start_level = 0; + + fs_for_each_prio(prio, ns) { + set_prio_attrs_in_prio(prio, start_level); + start_level += prio->num_levels; + } +} + +#define ANCHOR_PRIO 0 +#define ANCHOR_SIZE 1 +#define ANCHOR_LEVEL 0 +static int create_anchor_flow_table(struct mlx5_flow_steering *steering) +{ + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *ft; + + ns = mlx5_get_flow_namespace(steering->dev, MLX5_FLOW_NAMESPACE_ANCHOR); + if (WARN_ON(!ns)) + return -EINVAL; + + ft_attr.max_fte = ANCHOR_SIZE; + ft_attr.level = ANCHOR_LEVEL; + ft_attr.prio = ANCHOR_PRIO; + + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + mlx5_core_err(steering->dev, "Failed to create last anchor flow table"); + return PTR_ERR(ft); + } + return 0; +} + +static int init_root_ns(struct mlx5_flow_steering *steering) +{ + int err; + + steering->root_ns = create_root_ns(steering, FS_FT_NIC_RX); + if (!steering->root_ns) + return -ENOMEM; + + err = init_root_tree(steering, &root_fs, &steering->root_ns->ns.node); + if (err) + goto out_err; + + set_prio_attrs(steering->root_ns); + err = create_anchor_flow_table(steering); + if (err) + goto out_err; + + return 0; + +out_err: + cleanup_root_ns(steering->root_ns); + steering->root_ns = NULL; + return err; +} + +static void clean_tree(struct fs_node *node) +{ + if (node) { + struct fs_node *iter; + struct fs_node *temp; + + tree_get_node(node); + list_for_each_entry_safe(iter, temp, &node->children, list) + clean_tree(iter); + tree_put_node(node, false); + tree_remove_node(node, false); + } +} + +static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns) +{ + if (!root_ns) + return; + + clean_tree(&root_ns->ns.node); +} + +void mlx5_cleanup_fs(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + + cleanup_root_ns(steering->root_ns); + cleanup_root_ns(steering->fdb_root_ns); + steering->fdb_root_ns = NULL; + kfree(steering->fdb_sub_ns); + steering->fdb_sub_ns = NULL; + cleanup_root_ns(steering->port_sel_root_ns); + cleanup_root_ns(steering->sniffer_rx_root_ns); + cleanup_root_ns(steering->sniffer_tx_root_ns); + cleanup_root_ns(steering->rdma_rx_root_ns); + cleanup_root_ns(steering->rdma_tx_root_ns); + 
cleanup_root_ns(steering->egress_root_ns); + mlx5_cleanup_fc_stats(dev); + kmem_cache_destroy(steering->ftes_cache); + kmem_cache_destroy(steering->fgs_cache); + mlx5_ft_pool_destroy(dev); + kfree(steering); +} + +static int init_sniffer_tx_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->sniffer_tx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_TX); + if (!steering->sniffer_tx_root_ns) + return -ENOMEM; + + /* Create single prio */ + prio = fs_create_prio(&steering->sniffer_tx_root_ns->ns, 0, 1); + return PTR_ERR_OR_ZERO(prio); +} + +static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->sniffer_rx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_RX); + if (!steering->sniffer_rx_root_ns) + return -ENOMEM; + + /* Create single prio */ + prio = fs_create_prio(&steering->sniffer_rx_root_ns->ns, 0, 1); + return PTR_ERR_OR_ZERO(prio); +} + +#define PORT_SEL_NUM_LEVELS 3 +static int init_port_sel_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *prio; + + steering->port_sel_root_ns = create_root_ns(steering, FS_FT_PORT_SEL); + if (!steering->port_sel_root_ns) + return -ENOMEM; + + /* Create single prio */ + prio = fs_create_prio(&steering->port_sel_root_ns->ns, 0, + PORT_SEL_NUM_LEVELS); + return PTR_ERR_OR_ZERO(prio); +} + +static int init_rdma_rx_root_ns(struct mlx5_flow_steering *steering) +{ + int err; + + steering->rdma_rx_root_ns = create_root_ns(steering, FS_FT_RDMA_RX); + if (!steering->rdma_rx_root_ns) + return -ENOMEM; + + err = init_root_tree(steering, &rdma_rx_root_fs, + &steering->rdma_rx_root_ns->ns.node); + if (err) + goto out_err; + + set_prio_attrs(steering->rdma_rx_root_ns); + + return 0; + +out_err: + cleanup_root_ns(steering->rdma_rx_root_ns); + steering->rdma_rx_root_ns = NULL; + return err; +} + +static int init_rdma_tx_root_ns(struct mlx5_flow_steering *steering) +{ + int err; + + steering->rdma_tx_root_ns = create_root_ns(steering, FS_FT_RDMA_TX); + if (!steering->rdma_tx_root_ns) + return -ENOMEM; + + err = init_root_tree(steering, &rdma_tx_root_fs, + &steering->rdma_tx_root_ns->ns.node); + if (err) + goto out_err; + + set_prio_attrs(steering->rdma_tx_root_ns); + + return 0; + +out_err: + cleanup_root_ns(steering->rdma_tx_root_ns); + steering->rdma_tx_root_ns = NULL; + return err; +} + +/* FT and tc chains are stored in the same array so we can re-use the + * mlx5_get_fdb_sub_ns() and tc api for FT chains. + * When creating a new ns for each chain store it in the first available slot. + * Assume tc chains are created and stored first and only then the FT chain. 
+ */ +static void store_fdb_sub_ns_prio_chain(struct mlx5_flow_steering *steering, + struct mlx5_flow_namespace *ns) +{ + int chain = 0; + + while (steering->fdb_sub_ns[chain]) + ++chain; + + steering->fdb_sub_ns[chain] = ns; +} + +static int create_fdb_sub_ns_prio_chain(struct mlx5_flow_steering *steering, + struct fs_prio *maj_prio) +{ + struct mlx5_flow_namespace *ns; + struct fs_prio *min_prio; + int prio; + + ns = fs_create_namespace(maj_prio, MLX5_FLOW_TABLE_MISS_ACTION_DEF); + if (IS_ERR(ns)) + return PTR_ERR(ns); + + for (prio = 0; prio < FDB_TC_MAX_PRIO; prio++) { + min_prio = fs_create_prio(ns, prio, FDB_TC_LEVELS_PER_PRIO); + if (IS_ERR(min_prio)) + return PTR_ERR(min_prio); + } + + store_fdb_sub_ns_prio_chain(steering, ns); + + return 0; +} + +static int create_fdb_chains(struct mlx5_flow_steering *steering, + int fs_prio, + int chains) +{ + struct fs_prio *maj_prio; + int levels; + int chain; + int err; + + levels = FDB_TC_LEVELS_PER_PRIO * FDB_TC_MAX_PRIO * chains; + maj_prio = fs_create_prio_chained(&steering->fdb_root_ns->ns, + fs_prio, + levels); + if (IS_ERR(maj_prio)) + return PTR_ERR(maj_prio); + + for (chain = 0; chain < chains; chain++) { + err = create_fdb_sub_ns_prio_chain(steering, maj_prio); + if (err) + return err; + } + + return 0; +} + +static int create_fdb_fast_path(struct mlx5_flow_steering *steering) +{ + int err; + + steering->fdb_sub_ns = kcalloc(FDB_NUM_CHAINS, + sizeof(*steering->fdb_sub_ns), + GFP_KERNEL); + if (!steering->fdb_sub_ns) + return -ENOMEM; + + err = create_fdb_chains(steering, FDB_TC_OFFLOAD, FDB_TC_MAX_CHAIN + 1); + if (err) + return err; + + err = create_fdb_chains(steering, FDB_FT_OFFLOAD, 1); + if (err) + return err; + + return 0; +} + +static int create_fdb_bypass(struct mlx5_flow_steering *steering) +{ + struct mlx5_flow_namespace *ns; + struct fs_prio *prio; + int i; + + prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_BYPASS_PATH, 0); + if (IS_ERR(prio)) + return PTR_ERR(prio); + + ns = fs_create_namespace(prio, MLX5_FLOW_TABLE_MISS_ACTION_DEF); + if (IS_ERR(ns)) + return PTR_ERR(ns); + + for (i = 0; i < MLX5_BY_PASS_NUM_REGULAR_PRIOS; i++) { + prio = fs_create_prio(ns, i, 1); + if (IS_ERR(prio)) + return PTR_ERR(prio); + } + return 0; +} + +static int init_fdb_root_ns(struct mlx5_flow_steering *steering) +{ + struct fs_prio *maj_prio; + int err; + + steering->fdb_root_ns = create_root_ns(steering, FS_FT_FDB); + if (!steering->fdb_root_ns) + return -ENOMEM; + + err = create_fdb_bypass(steering); + if (err) + goto out_err; + + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_CRYPTO_INGRESS, 2); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + + err = create_fdb_fast_path(steering); + if (err) + goto out_err; + + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_TC_MISS, 1); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_MISS_METER, 2); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_BR_OFFLOAD, 3); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_SLOW_PATH, 1); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_CRYPTO_EGRESS, 3); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + + /* We put this priority last, knowing that 
nothing will get here + * unless explicitly forwarded to. This is possible because the + * slow path tables have catch all rules and nothing gets passed + * those tables. + */ + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_PER_VPORT, 1); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + + set_prio_attrs(steering->fdb_root_ns); + return 0; + +out_err: + cleanup_root_ns(steering->fdb_root_ns); + kfree(steering->fdb_sub_ns); + steering->fdb_sub_ns = NULL; + steering->fdb_root_ns = NULL; + return err; +} + +static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport) +{ + struct fs_prio *prio; + int i; + + steering->esw_egress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL); + if (!steering->esw_egress_root_ns[vport]) + return -ENOMEM; + + /* create 5 prios, and the first 4 are for vf metering */ + for (i = 0; i < MLX5_EGRESS_ACL_NUM_PRIOS; i++) { + prio = fs_create_prio(&steering->esw_egress_root_ns[vport]->ns, i, 1); + if (IS_ERR(prio)) + return PTR_ERR(prio); + } + set_prio_attrs(steering->esw_egress_root_ns[vport]); + + return 0; +} + +static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vport) +{ + struct fs_prio *prio; + int i; + + steering->esw_ingress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL); + if (!steering->esw_ingress_root_ns[vport]) + return -ENOMEM; + + /* create 5 prios, and the first 4 are for vf metering */ + for (i = 0; i < MLX5_INGRESS_ACL_NUM_PRIOS; i++) { + prio = fs_create_prio(&steering->esw_ingress_root_ns[vport]->ns, i, 1); + if (IS_ERR(prio)) + return PTR_ERR(prio); + } + set_prio_attrs(steering->esw_ingress_root_ns[vport]); + + return 0; +} + +int mlx5_fs_egress_acls_init(struct mlx5_core_dev *dev, int total_vports) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int err; + int i; + + steering->esw_egress_root_ns = + kcalloc(total_vports, + sizeof(*steering->esw_egress_root_ns), + GFP_KERNEL); + if (!steering->esw_egress_root_ns) + return -ENOMEM; + + for (i = 0; i < total_vports; i++) { + err = init_egress_acl_root_ns(steering, i); + if (err) + goto cleanup_root_ns; + } + steering->esw_egress_acl_vports = total_vports; + return 0; + +cleanup_root_ns: + for (i--; i >= 0; i--) + cleanup_root_ns(steering->esw_egress_root_ns[i]); + kfree(steering->esw_egress_root_ns); + steering->esw_egress_root_ns = NULL; + return err; +} + +void mlx5_fs_egress_acls_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int i; + + if (!steering->esw_egress_root_ns) + return; + + for (i = 0; i < steering->esw_egress_acl_vports; i++) + cleanup_root_ns(steering->esw_egress_root_ns[i]); + + kfree(steering->esw_egress_root_ns); + steering->esw_egress_root_ns = NULL; +} + +int mlx5_fs_ingress_acls_init(struct mlx5_core_dev *dev, int total_vports) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int err; + int i; + + steering->esw_ingress_root_ns = + kcalloc(total_vports, + sizeof(*steering->esw_ingress_root_ns), + GFP_KERNEL); + if (!steering->esw_ingress_root_ns) + return -ENOMEM; + + for (i = 0; i < total_vports; i++) { + err = init_ingress_acl_root_ns(steering, i); + if (err) + goto cleanup_root_ns; + } + steering->esw_ingress_acl_vports = total_vports; + return 0; + +cleanup_root_ns: + for (i--; i >= 0; i--) + cleanup_root_ns(steering->esw_ingress_root_ns[i]); + kfree(steering->esw_ingress_root_ns); + steering->esw_ingress_root_ns = NULL; + return err; +} + +void 
mlx5_fs_ingress_acls_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering = dev->priv.steering; + int i; + + if (!steering->esw_ingress_root_ns) + return; + + for (i = 0; i < steering->esw_ingress_acl_vports; i++) + cleanup_root_ns(steering->esw_ingress_root_ns[i]); + + kfree(steering->esw_ingress_root_ns); + steering->esw_ingress_root_ns = NULL; +} + +static int init_egress_root_ns(struct mlx5_flow_steering *steering) +{ + int err; + + steering->egress_root_ns = create_root_ns(steering, + FS_FT_NIC_TX); + if (!steering->egress_root_ns) + return -ENOMEM; + + err = init_root_tree(steering, &egress_root_fs, + &steering->egress_root_ns->ns.node); + if (err) + goto cleanup; + set_prio_attrs(steering->egress_root_ns); + return 0; +cleanup: + cleanup_root_ns(steering->egress_root_ns); + steering->egress_root_ns = NULL; + return err; +} + +#define CACHE_SIZE_NAME 30 +int mlx5_init_fs(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_steering *steering; + char *ftes_cache_name; + char *fgs_cache_name; + int err = 0; + + err = mlx5_init_fc_stats(dev); + if (err) + return err; + + err = mlx5_ft_pool_init(dev); + if (err) + return err; + + steering = kzalloc(sizeof(*steering), GFP_KERNEL); + if (!steering) { + err = -ENOMEM; + goto err; + } + + ftes_cache_name = kzalloc(sizeof(char) * CACHE_SIZE_NAME, GFP_KERNEL); + fgs_cache_name = kzalloc(sizeof(char) * CACHE_SIZE_NAME, GFP_KERNEL); + if (!ftes_cache_name || !fgs_cache_name) { + err = -ENOMEM; + goto err; + } + + steering->dev = dev; + dev->priv.steering = steering; + + if (mlx5_fs_dr_is_supported(dev)) + steering->mode = MLX5_FLOW_STEERING_MODE_SMFS; + else + steering->mode = MLX5_FLOW_STEERING_MODE_DMFS; + + snprintf(ftes_cache_name, CACHE_SIZE_NAME, "fs_ftes_%s", dev_name(dev->device)); + snprintf(fgs_cache_name, CACHE_SIZE_NAME, "fs_fgs_%s", dev_name(dev->device)); + steering->fgs_cache = kmem_cache_create(fgs_cache_name, + sizeof(struct mlx5_flow_group), 0, + 0, NULL); + steering->ftes_cache = kmem_cache_create(ftes_cache_name, + sizeof(struct fs_fte), 0, + 0, NULL); + if (!steering->ftes_cache || !steering->fgs_cache) { + err = -ENOMEM; + goto err; + } + + if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + (MLX5_CAP_GEN(dev, nic_flow_table))) || + ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && + MLX5_CAP_GEN(dev, ipoib_enhanced_offloads))) && + MLX5_CAP_FLOWTABLE_NIC_RX(dev, ft_support)) { + err = init_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_ESWITCH_MANAGER(dev)) { + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, ft_support)) { + err = init_fdb_root_ns(steering); + if (err) + goto err; + } + } + + if (MLX5_CAP_FLOWTABLE_SNIFFER_RX(dev, ft_support)) { + err = init_sniffer_rx_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_CAP_FLOWTABLE_SNIFFER_TX(dev, ft_support)) { + err = init_sniffer_tx_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_CAP_FLOWTABLE_PORT_SELECTION(dev, ft_support)) { + err = init_port_sel_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_CAP_FLOWTABLE_RDMA_RX(dev, ft_support) && + MLX5_CAP_FLOWTABLE_RDMA_RX(dev, table_miss_action_domain)) { + err = init_rdma_rx_root_ns(steering); + if (err) + goto err; + } + + if (MLX5_CAP_FLOWTABLE_RDMA_TX(dev, ft_support)) { + err = init_rdma_tx_root_ns(steering); + if (err) + goto err; + } + + if (mlx5_fpga_ipsec_device_caps(steering->dev) & MLX5_ACCEL_IPSEC_CAP_DEVICE || + MLX5_CAP_FLOWTABLE_NIC_TX(dev, ft_support)) { + err = init_egress_root_ns(steering); + if (err) + goto err; + } + + 
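+ /* All supported root namespaces are initialized; the temporary cache
+  * name buffers are no longer needed.
+  */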
kfree(ftes_cache_name); + kfree(fgs_cache_name); + return 0; +err: + kfree(ftes_cache_name); + kfree(fgs_cache_name); + mlx5_cleanup_fs(dev); + return err; +} + +int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn) +{ + struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns; + struct mlx5_ft_underlay_qp *new_uqp; + int err = 0; + + new_uqp = kzalloc(sizeof(*new_uqp), GFP_KERNEL); + if (!new_uqp) + return -ENOMEM; + + mutex_lock(&root->chain_lock); + + if (!root->root_ft) { + err = -EINVAL; + goto update_ft_fail; + } + + err = root->cmds->update_root_ft(root, root->root_ft, underlay_qpn, + false); + if (err) { + mlx5_core_warn(dev, "Failed adding underlay QPN (%u) to root FT err(%d)\n", + underlay_qpn, err); + goto update_ft_fail; + } + + new_uqp->qpn = underlay_qpn; + list_add_tail(&new_uqp->list, &root->underlay_qpns); + + mutex_unlock(&root->chain_lock); + + return 0; + +update_ft_fail: + mutex_unlock(&root->chain_lock); + kfree(new_uqp); + return err; +} +EXPORT_SYMBOL(mlx5_fs_add_rx_underlay_qpn); + +int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn) +{ + struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns; + struct mlx5_ft_underlay_qp *uqp; + bool found = false; + int err = 0; + + mutex_lock(&root->chain_lock); + list_for_each_entry(uqp, &root->underlay_qpns, list) { + if (uqp->qpn == underlay_qpn) { + found = true; + break; + } + } + + if (!found) { + mlx5_core_warn(dev, "Failed finding underlay qp (%u) in qpn list\n", + underlay_qpn); + err = -EINVAL; + goto out; + } + + err = root->cmds->update_root_ft(root, root->root_ft, underlay_qpn, + true); + if (err) + mlx5_core_warn(dev, "Failed removing underlay QPN (%u) from root FT err(%d)\n", + underlay_qpn, err); + + list_del(&uqp->list); + mutex_unlock(&root->chain_lock); + kfree(uqp); + + return 0; + +out: + mutex_unlock(&root->chain_lock); + return err; +} +EXPORT_SYMBOL(mlx5_fs_remove_rx_underlay_qpn); + +static struct mlx5_flow_root_namespace +*get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type ns_type) +{ + struct mlx5_flow_namespace *ns; + + if (ns_type == MLX5_FLOW_NAMESPACE_ESW_EGRESS || + ns_type == MLX5_FLOW_NAMESPACE_ESW_INGRESS) + ns = mlx5_get_flow_vport_acl_namespace(dev, ns_type, 0); + else + ns = mlx5_get_flow_namespace(dev, ns_type); + if (!ns) + return NULL; + + return find_root(&ns->node); +} + +struct mlx5_modify_hdr *mlx5_modify_header_alloc(struct mlx5_core_dev *dev, + u8 ns_type, u8 num_actions, + void *modify_actions) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_modify_hdr *modify_hdr; + int err; + + root = get_root_namespace(dev, ns_type); + if (!root) + return ERR_PTR(-EOPNOTSUPP); + + modify_hdr = kzalloc(sizeof(*modify_hdr), GFP_KERNEL); + if (!modify_hdr) + return ERR_PTR(-ENOMEM); + + modify_hdr->ns_type = ns_type; + err = root->cmds->modify_header_alloc(root, ns_type, num_actions, + modify_actions, modify_hdr); + if (err) { + kfree(modify_hdr); + return ERR_PTR(err); + } + + return modify_hdr; +} +EXPORT_SYMBOL(mlx5_modify_header_alloc); + +void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, + struct mlx5_modify_hdr *modify_hdr) +{ + struct mlx5_flow_root_namespace *root; + + root = get_root_namespace(dev, modify_hdr->ns_type); + if (WARN_ON(!root)) + return; + root->cmds->modify_header_dealloc(root, modify_hdr); + kfree(modify_hdr); +} +EXPORT_SYMBOL(mlx5_modify_header_dealloc); + +struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev, + struct 
mlx5_pkt_reformat_params *params, + enum mlx5_flow_namespace_type ns_type) +{ + struct mlx5_pkt_reformat *pkt_reformat; + struct mlx5_flow_root_namespace *root; + int err; + + root = get_root_namespace(dev, ns_type); + if (!root) + return ERR_PTR(-EOPNOTSUPP); + + pkt_reformat = kzalloc(sizeof(*pkt_reformat), GFP_KERNEL); + if (!pkt_reformat) + return ERR_PTR(-ENOMEM); + + pkt_reformat->ns_type = ns_type; + pkt_reformat->reformat_type = params->type; + err = root->cmds->packet_reformat_alloc(root, params, ns_type, + pkt_reformat); + if (err) { + kfree(pkt_reformat); + return ERR_PTR(err); + } + + return pkt_reformat; +} +EXPORT_SYMBOL(mlx5_packet_reformat_alloc); + +void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev, + struct mlx5_pkt_reformat *pkt_reformat) +{ + struct mlx5_flow_root_namespace *root; + + root = get_root_namespace(dev, pkt_reformat->ns_type); + if (WARN_ON(!root)) + return; + root->cmds->packet_reformat_dealloc(root, pkt_reformat); + kfree(pkt_reformat); +} +EXPORT_SYMBOL(mlx5_packet_reformat_dealloc); + +int mlx5_get_match_definer_id(struct mlx5_flow_definer *definer) +{ + return definer->id; +} + +struct mlx5_flow_definer * +mlx5_create_match_definer(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type ns_type, u16 format_id, + u32 *match_mask) +{ + struct mlx5_flow_root_namespace *root; + struct mlx5_flow_definer *definer; + int id; + + root = get_root_namespace(dev, ns_type); + if (!root) + return ERR_PTR(-EOPNOTSUPP); + + definer = kzalloc(sizeof(*definer), GFP_KERNEL); + if (!definer) + return ERR_PTR(-ENOMEM); + + definer->ns_type = ns_type; + id = root->cmds->create_match_definer(root, format_id, match_mask); + if (id < 0) { + mlx5_core_warn(root->dev, "Failed to create match definer (%d)\n", id); + kfree(definer); + return ERR_PTR(id); + } + definer->id = id; + return definer; +} + +void mlx5_destroy_match_definer(struct mlx5_core_dev *dev, + struct mlx5_flow_definer *definer) +{ + struct mlx5_flow_root_namespace *root; + + root = get_root_namespace(dev, definer->ns_type); + if (WARN_ON(!root)) + return; + + root->cmds->destroy_match_definer(root, definer->id); + kfree(definer); +} + +int mlx5_flow_namespace_set_peer(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_root_namespace *peer_ns) +{ + if (peer_ns && ns->mode != peer_ns->mode) { + mlx5_core_err(ns->dev, + "Can't peer namespace of different steering mode\n"); + return -EINVAL; + } + + return ns->cmds->set_peer(ns, peer_ns); +} + +/* This function should be called only at init stage of the namespace. + * It is not safe to call this function while steering operations + * are executed in the namespace. 
+ */ +int mlx5_flow_namespace_set_mode(struct mlx5_flow_namespace *ns, + enum mlx5_flow_steering_mode mode) +{ + struct mlx5_flow_root_namespace *root; + const struct mlx5_flow_cmds *cmds; + int err; + + root = find_root(&ns->node); + if (&root->ns != ns) + /* Can't set cmds to non root namespace */ + return -EINVAL; + + if (root->table_type != FS_FT_FDB) + return -EOPNOTSUPP; + + if (root->mode == mode) + return 0; + + if (mode == MLX5_FLOW_STEERING_MODE_SMFS) + cmds = mlx5_fs_cmd_get_dr_cmds(); + else + cmds = mlx5_fs_cmd_get_fw_cmds(); + if (!cmds) + return -EOPNOTSUPP; + + err = cmds->create_ns(root); + if (err) { + mlx5_core_err(root->dev, "Failed to create flow namespace (%d)\n", + err); + return err; + } + + root->cmds->destroy_ns(root); + root->cmds = cmds; + root->mode = mode; + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h new file mode 100644 index 0000000..d931f10 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_CORE_ +#define _MLX5_FS_CORE_ + +#include +#include +#include +#include +#include + +#define FDB_TC_MAX_CHAIN 3 +#define FDB_FT_CHAIN (FDB_TC_MAX_CHAIN + 1) +#define FDB_TC_SLOW_PATH_CHAIN (FDB_FT_CHAIN + 1) + +/* The index of the last real chain (FT) + 1 as chain zero is valid as well */ +#define FDB_NUM_CHAINS (FDB_FT_CHAIN + 1) + +#define FDB_TC_MAX_PRIO 16 +#define FDB_TC_LEVELS_PER_PRIO 2 + +struct mlx5_flow_definer { + enum mlx5_flow_namespace_type ns_type; + u32 id; +}; + +struct mlx5_modify_hdr { + enum mlx5_flow_namespace_type ns_type; + bool sw_owned; + union { + struct mlx5_fs_dr_action action; + u32 id; + }; +}; + +struct mlx5_pkt_reformat { + enum mlx5_flow_namespace_type ns_type; + int reformat_type; /* from mlx5_ifc */ + bool sw_owned; + union { + struct mlx5_fs_dr_action action; + u32 id; + }; +}; + +/* FS_TYPE_PRIO_CHAINS is a PRIO that will have namespaces only, + * and those are in parallel to one another when going over them to connect + * a new flow table. 
Meaning the last flow table in a TYPE_PRIO prio in one + * parallel namespace will not automatically connect to the first flow table + * found in any prio in any next namespace, but skip the entire containing + * TYPE_PRIO_CHAINS prio. + * + * This is used to implement tc chains, each chain of prios is a different + * namespace inside a containing TYPE_PRIO_CHAINS prio. + */ + +enum fs_node_type { + FS_TYPE_NAMESPACE, + FS_TYPE_PRIO, + FS_TYPE_PRIO_CHAINS, + FS_TYPE_FLOW_TABLE, + FS_TYPE_FLOW_GROUP, + FS_TYPE_FLOW_ENTRY, + FS_TYPE_FLOW_DEST +}; + +enum fs_flow_table_type { + FS_FT_NIC_RX = 0x0, + FS_FT_NIC_TX = 0x1, + FS_FT_ESW_EGRESS_ACL = 0x2, + FS_FT_ESW_INGRESS_ACL = 0x3, + FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, + FS_FT_RDMA_RX = 0X7, + FS_FT_RDMA_TX = 0X8, + FS_FT_PORT_SEL = 0X9, + FS_FT_MAX_TYPE = FS_FT_PORT_SEL, +}; + +enum fs_flow_table_op_mod { + FS_FT_OP_MOD_NORMAL, + FS_FT_OP_MOD_LAG_DEMUX, +}; + +enum fs_fte_status { + FS_FTE_STATUS_EXISTING = 1UL << 0, +}; + +enum mlx5_flow_steering_mode { + MLX5_FLOW_STEERING_MODE_DMFS, + MLX5_FLOW_STEERING_MODE_SMFS +}; + +struct mlx5_flow_steering { + struct mlx5_core_dev *dev; + enum mlx5_flow_steering_mode mode; + struct kmem_cache *fgs_cache; + struct kmem_cache *ftes_cache; + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_root_namespace *fdb_root_ns; + struct mlx5_flow_namespace **fdb_sub_ns; + struct mlx5_flow_root_namespace **esw_egress_root_ns; + struct mlx5_flow_root_namespace **esw_ingress_root_ns; + struct mlx5_flow_root_namespace *sniffer_tx_root_ns; + struct mlx5_flow_root_namespace *sniffer_rx_root_ns; + struct mlx5_flow_root_namespace *rdma_rx_root_ns; + struct mlx5_flow_root_namespace *rdma_tx_root_ns; + struct mlx5_flow_root_namespace *egress_root_ns; + struct mlx5_flow_root_namespace *port_sel_root_ns; + int esw_egress_acl_vports; + int esw_ingress_acl_vports; +}; + +struct fs_node { + struct list_head list; + struct list_head children; + enum fs_node_type type; + struct fs_node *parent; + struct fs_node *root; + /* lock the node for writing and traversing */ + struct rw_semaphore lock; + refcount_t refcount; + bool active; + void (*del_hw_func)(struct fs_node *); + void (*del_sw_func)(struct fs_node *); + atomic_t version; +}; + +struct mlx5_flow_rule { + struct fs_node node; + struct mlx5_flow_table *ft; + struct mlx5_flow_destination dest_attr; + /* next_ft should be accessed under chain_lock and only of + * destination type is FWD_NEXT_fT. + */ + struct list_head next_ft; + u32 sw_action; +}; + +struct mlx5_flow_handle { + int num_rules; + struct mlx5_flow_rule *rule[]; +}; + +/* Type of children is mlx5_flow_group */ +struct mlx5_flow_table { + struct fs_node node; + struct mlx5_fs_dr_table fs_dr_table; + u32 id; + u16 vport; + unsigned int max_fte; + unsigned int level; + enum fs_flow_table_type type; + enum fs_flow_table_op_mod op_mod; + struct { + bool active; + unsigned int required_groups; + unsigned int group_size; + unsigned int num_groups; + unsigned int max_fte; + } autogroup; + /* Protect fwd_rules */ + struct mutex lock; + /* FWD rules that point on this flow table */ + struct list_head fwd_rules; + u32 flags; + struct rhltable fgs_hash; + enum mlx5_flow_table_miss_action def_miss_action; + struct mlx5_flow_namespace *ns; +}; + +struct mlx5_ft_underlay_qp { + struct list_head list; + u32 qpn; +}; + +#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_e00 +/* Calculate the fte_match_param length and without the reserved length. 
+ * Make sure the reserved field is the last. + */ +#define MLX5_ST_SZ_DW_MATCH_PARAM \ + ((MLX5_BYTE_OFF(fte_match_param, MLX5_FTE_MATCH_PARAM_RESERVED) / sizeof(u32)) + \ + BUILD_BUG_ON_ZERO(MLX5_ST_SZ_BYTES(fte_match_param) != \ + MLX5_FLD_SZ_BYTES(fte_match_param, \ + MLX5_FTE_MATCH_PARAM_RESERVED) +\ + MLX5_BYTE_OFF(fte_match_param, \ + MLX5_FTE_MATCH_PARAM_RESERVED))) + +/* Type of children is mlx5_flow_rule */ +struct fs_fte { + struct fs_node node; + struct mlx5_fs_dr_rule fs_dr_rule; + u32 val[MLX5_ST_SZ_DW_MATCH_PARAM]; + u32 dests_size; + u32 index; + struct mlx5_flow_context flow_context; + struct mlx5_flow_act action; + enum fs_fte_status status; + struct mlx5_fc *counter; + struct rhash_head hash; + int modify_mask; +}; + +/* Type of children is mlx5_flow_table/namespace */ +struct fs_prio { + struct fs_node node; + unsigned int num_levels; + unsigned int start_level; + unsigned int prio; + unsigned int num_ft; +}; + +/* Type of children is fs_prio */ +struct mlx5_flow_namespace { + /* parent == NULL => root ns */ + struct fs_node node; + enum mlx5_flow_table_miss_action def_miss_action; +}; + +struct mlx5_flow_group_mask { + u8 match_criteria_enable; + u32 match_criteria[MLX5_ST_SZ_DW_MATCH_PARAM]; +}; + +/* Type of children is fs_fte */ +struct mlx5_flow_group { + struct fs_node node; + struct mlx5_fs_dr_matcher fs_dr_matcher; + struct mlx5_flow_group_mask mask; + u32 start_index; + u32 max_ftes; + struct ida fte_allocator; + u32 id; + struct rhashtable ftes_hash; + struct rhlist_head hash; +}; + +struct mlx5_flow_root_namespace { + struct mlx5_flow_namespace ns; + enum mlx5_flow_steering_mode mode; + struct mlx5_fs_dr_domain fs_dr_domain; + enum fs_flow_table_type table_type; + struct mlx5_core_dev *dev; + struct mlx5_flow_table *root_ft; + /* Should be held when chaining flow tables */ + struct mutex chain_lock; + struct list_head underlay_qpns; + const struct mlx5_flow_cmds *cmds; +}; + +int mlx5_init_fc_stats(struct mlx5_core_dev *dev); +void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev); +void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, + struct delayed_work *dwork, + unsigned long delay); +void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, + unsigned long interval); + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_fw_cmds(void); + +int mlx5_flow_namespace_set_peer(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_root_namespace *peer_ns); + +int mlx5_flow_namespace_set_mode(struct mlx5_flow_namespace *ns, + enum mlx5_flow_steering_mode mode); + +int mlx5_init_fs(struct mlx5_core_dev *dev); +void mlx5_cleanup_fs(struct mlx5_core_dev *dev); + +int mlx5_fs_egress_acls_init(struct mlx5_core_dev *dev, int total_vports); +void mlx5_fs_egress_acls_cleanup(struct mlx5_core_dev *dev); +int mlx5_fs_ingress_acls_init(struct mlx5_core_dev *dev, int total_vports); +void mlx5_fs_ingress_acls_cleanup(struct mlx5_core_dev *dev); + +struct mlx5_flow_root_namespace *find_root(struct fs_node *node); + +#define fs_get_obj(v, _node) {v = container_of((_node), typeof(*v), node); } + +#define fs_list_for_each_entry(pos, root) \ + list_for_each_entry(pos, root, node.list) + +#define fs_list_for_each_entry_safe(pos, tmp, root) \ + list_for_each_entry_safe(pos, tmp, root, node.list) + +#define fs_for_each_ns_or_ft_reverse(pos, prio) \ + list_for_each_entry_reverse(pos, &(prio)->node.children, list) + +#define fs_for_each_ns_or_ft(pos, prio) \ + list_for_each_entry(pos, (&(prio)->node.children), list) + +#define fs_for_each_prio(pos, ns) \ + 
fs_list_for_each_entry(pos, &(ns)->node.children) + +#define fs_for_each_ns(pos, prio) \ + fs_list_for_each_entry(pos, &(prio)->node.children) + +#define fs_for_each_ft(pos, prio) \ + fs_list_for_each_entry(pos, &(prio)->node.children) + +#define fs_for_each_ft_safe(pos, tmp, prio) \ + fs_list_for_each_entry_safe(pos, tmp, &(prio)->node.children) + +#define fs_for_each_fg(pos, ft) \ + fs_list_for_each_entry(pos, &(ft)->node.children) + +#define fs_for_each_fte(pos, fg) \ + fs_list_for_each_entry(pos, &(fg)->node.children) + +#define fs_for_each_dst(pos, fte) \ + fs_list_for_each_entry(pos, &(fte)->node.children) + +#define MLX5_CAP_FLOWTABLE_TYPE(mdev, cap, type) ( \ + (type == FS_FT_NIC_RX) ? MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) : \ + (type == FS_FT_NIC_TX) ? MLX5_CAP_FLOWTABLE_NIC_TX(mdev, cap) : \ + (type == FS_FT_ESW_EGRESS_ACL) ? MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) : \ + (type == FS_FT_ESW_INGRESS_ACL) ? MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) : \ + (type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) : \ + (type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) : \ + (type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) : \ + (type == FS_FT_RDMA_RX) ? MLX5_CAP_FLOWTABLE_RDMA_RX(mdev, cap) : \ + (type == FS_FT_RDMA_TX) ? MLX5_CAP_FLOWTABLE_RDMA_TX(mdev, cap) : \ + (type == FS_FT_PORT_SEL) ? MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) : \ + (BUILD_BUG_ON_ZERO(FS_FT_PORT_SEL != FS_FT_MAX_TYPE))\ + ) + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c new file mode 100644 index 0000000..478f9c6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -0,0 +1,807 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_core.h" +#include "fs_core.h" +#include "fs_cmd.h" + +#define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000) +#define MLX5_FC_BULK_QUERY_ALLOC_PERIOD msecs_to_jiffies(180 * 1000) +/* Max number of counters to query in bulk read is 32K */ +#define MLX5_SW_MAX_COUNTERS_BULK BIT(15) +#define MLX5_INIT_COUNTERS_BULK 8 +#define MLX5_FC_POOL_MAX_THRESHOLD BIT(18) +#define MLX5_FC_POOL_USED_BUFF_RATIO 10 + +struct mlx5_fc_cache { + u64 packets; + u64 bytes; + u64 lastuse; +}; + +struct mlx5_fc { + struct list_head list; + struct llist_node addlist; + struct llist_node dellist; + + /* last{packets,bytes} members are used when calculating the delta since + * last reading + */ + u64 lastpackets; + u64 lastbytes; + + struct mlx5_fc_bulk *bulk; + u32 id; + bool aging; + + struct mlx5_fc_cache cache ____cacheline_aligned_in_smp; +}; + +static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev); +static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool); +static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool); +static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct mlx5_fc *fc); + +/* locking scheme: + * + * It is the responsibility of the user to prevent concurrent calls or bad + * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference + * to struct mlx5_fc. + * e.g en_tc.c is protected by RTNL lock of its caller, and will never call a + * dump (access to struct mlx5_fc) after a counter is destroyed. + * + * access to counter list: + * - create (user context) + * - mlx5_fc_create() only adds to an addlist to be used by + * mlx5_fc_stats_work(). addlist is a lockless single linked list + * that doesn't require any additional synchronization when adding single + * node. + * - spawn thread to do the actual destroy + * + * - destroy (user context) + * - add a counter to lockless dellist + * - spawn thread to do the actual del + * + * - dump (user context) + * user should not call dump after destroy + * + * - query (single thread workqueue context) + * destroy/dump - no conflict (see destroy) + * query/dump - packets and bytes might be inconsistent (since update is not + * atomic) + * query/create - no conflict (see create) + * since every create/destroy spawn the work, only after necessary time has + * elapsed, the thread will actually query the hardware. + */ + +static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev, + u32 id) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + unsigned long next_id = (unsigned long)id + 1; + struct mlx5_fc *counter; + unsigned long tmp; + + rcu_read_lock(); + /* skip counters that are in idr, but not yet in counters list */ + idr_for_each_entry_continue_ul(&fc_stats->counters_idr, + counter, tmp, next_id) { + if (!list_empty(&counter->list)) + break; + } + rcu_read_unlock(); + + return counter ? 
&counter->list : &fc_stats->counters; +} + +static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev, + struct mlx5_fc *counter) +{ + struct list_head *next = mlx5_fc_counters_lookup_next(dev, counter->id); + + list_add_tail(&counter->list, next); +} + +static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev, + struct mlx5_fc *counter) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + list_del(&counter->list); + + spin_lock(&fc_stats->counters_idr_lock); + WARN_ON(!idr_remove(&fc_stats->counters_idr, counter->id)); + spin_unlock(&fc_stats->counters_idr_lock); +} + +static int get_init_bulk_query_len(struct mlx5_core_dev *dev) +{ + return min_t(int, MLX5_INIT_COUNTERS_BULK, + (1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk))); +} + +static int get_max_bulk_query_len(struct mlx5_core_dev *dev) +{ + return min_t(int, MLX5_SW_MAX_COUNTERS_BULK, + (1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk))); +} + +static void update_counter_cache(int index, u32 *bulk_raw_data, + struct mlx5_fc_cache *cache) +{ + void *stats = MLX5_ADDR_OF(query_flow_counter_out, bulk_raw_data, + flow_statistics[index]); + u64 packets = MLX5_GET64(traffic_counter, stats, packets); + u64 bytes = MLX5_GET64(traffic_counter, stats, octets); + + if (cache->packets == packets) + return; + + cache->packets = packets; + cache->bytes = bytes; + cache->lastuse = jiffies; +} + +static void mlx5_fc_stats_query_counter_range(struct mlx5_core_dev *dev, + struct mlx5_fc *first, + u32 last_id) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + bool query_more_counters = (first->id <= last_id); + int cur_bulk_len = fc_stats->bulk_query_len; + u32 *data = fc_stats->bulk_query_out; + struct mlx5_fc *counter = first; + u32 bulk_base_id; + int bulk_len; + int err; + + while (query_more_counters) { + /* first id must be aligned to 4 when using bulk query */ + bulk_base_id = counter->id & ~0x3; + + /* number of counters to query inc. 
the last counter */ + bulk_len = min_t(int, cur_bulk_len, + ALIGN(last_id - bulk_base_id + 1, 4)); + + err = mlx5_cmd_fc_bulk_query(dev, bulk_base_id, bulk_len, + data); + if (err) { + mlx5_core_err(dev, "Error doing bulk query: %d\n", err); + return; + } + query_more_counters = false; + + list_for_each_entry_from(counter, &fc_stats->counters, list) { + int counter_index = counter->id - bulk_base_id; + struct mlx5_fc_cache *cache = &counter->cache; + + if (counter->id >= bulk_base_id + bulk_len) { + query_more_counters = true; + break; + } + + update_counter_cache(counter_index, data, cache); + } + } +} + +static void mlx5_fc_free(struct mlx5_core_dev *dev, struct mlx5_fc *counter) +{ + mlx5_cmd_fc_free(dev, counter->id); + kfree(counter); +} + +static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + if (counter->bulk) + mlx5_fc_pool_release_counter(&fc_stats->fc_pool, counter); + else + mlx5_fc_free(dev, counter); +} + +static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + int max_bulk_len = get_max_bulk_query_len(dev); + unsigned long now = jiffies; + u32 *bulk_query_out_tmp; + int max_out_len; + + if (fc_stats->bulk_query_alloc_failed && + time_before(now, fc_stats->next_bulk_query_alloc)) + return; + + max_out_len = mlx5_cmd_fc_get_bulk_query_out_len(max_bulk_len); + bulk_query_out_tmp = kzalloc(max_out_len, GFP_KERNEL); + if (!bulk_query_out_tmp) { + mlx5_core_warn_once(dev, + "Can't increase flow counters bulk query buffer size, insufficient memory, bulk_size(%d)\n", + max_bulk_len); + fc_stats->bulk_query_alloc_failed = true; + fc_stats->next_bulk_query_alloc = + now + MLX5_FC_BULK_QUERY_ALLOC_PERIOD; + return; + } + + kfree(fc_stats->bulk_query_out); + fc_stats->bulk_query_out = bulk_query_out_tmp; + fc_stats->bulk_query_len = max_bulk_len; + if (fc_stats->bulk_query_alloc_failed) { + mlx5_core_info(dev, + "Flow counters bulk query buffer size increased, bulk_size(%d)\n", + max_bulk_len); + fc_stats->bulk_query_alloc_failed = false; + } +} + +static void mlx5_fc_stats_work(struct work_struct *work) +{ + struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev, + priv.fc_stats.work.work); + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + /* Take dellist first to ensure that counters cannot be deleted before + * they are inserted. 
+ */ + struct llist_node *dellist = llist_del_all(&fc_stats->dellist); + struct llist_node *addlist = llist_del_all(&fc_stats->addlist); + struct mlx5_fc *counter = NULL, *last = NULL, *tmp; + unsigned long now = jiffies; + + if (addlist || !list_empty(&fc_stats->counters)) + queue_delayed_work(fc_stats->wq, &fc_stats->work, + fc_stats->sampling_interval); + + llist_for_each_entry(counter, addlist, addlist) { + mlx5_fc_stats_insert(dev, counter); + fc_stats->num_counters++; + } + + llist_for_each_entry_safe(counter, tmp, dellist, dellist) { + mlx5_fc_stats_remove(dev, counter); + + mlx5_fc_release(dev, counter); + fc_stats->num_counters--; + } + + if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) && + fc_stats->num_counters > get_init_bulk_query_len(dev)) + mlx5_fc_stats_bulk_query_size_increase(dev); + + if (time_before(now, fc_stats->next_query) || + list_empty(&fc_stats->counters)) + return; + last = list_last_entry(&fc_stats->counters, struct mlx5_fc, list); + + counter = list_first_entry(&fc_stats->counters, struct mlx5_fc, + list); + if (counter) + mlx5_fc_stats_query_counter_range(dev, counter, last->id); + + fc_stats->next_query = now + fc_stats->sampling_interval; +} + +static struct mlx5_fc *mlx5_fc_single_alloc(struct mlx5_core_dev *dev) +{ + struct mlx5_fc *counter; + int err; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + + err = mlx5_cmd_fc_alloc(dev, &counter->id); + if (err) { + kfree(counter); + return ERR_PTR(err); + } + + return counter; +} + +static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc *counter; + + if (aging && MLX5_CAP_GEN(dev, flow_counter_bulk_alloc) != 0) { + counter = mlx5_fc_pool_acquire_counter(&fc_stats->fc_pool); + if (!IS_ERR(counter)) + return counter; + } + + return mlx5_fc_single_alloc(dev); +} + +struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging) +{ + struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging); + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + int err; + + if (IS_ERR(counter)) + return counter; + + INIT_LIST_HEAD(&counter->list); + counter->aging = aging; + + if (aging) { + u32 id = counter->id; + + counter->cache.lastuse = jiffies; + counter->lastbytes = counter->cache.bytes; + counter->lastpackets = counter->cache.packets; + + idr_preload(GFP_KERNEL); + spin_lock(&fc_stats->counters_idr_lock); + + err = idr_alloc_u32(&fc_stats->counters_idr, counter, &id, id, + GFP_NOWAIT); + + spin_unlock(&fc_stats->counters_idr_lock); + idr_preload_end(); + if (err) + goto err_out_alloc; + + llist_add(&counter->addlist, &fc_stats->addlist); + } + + return counter; + +err_out_alloc: + mlx5_fc_release(dev, counter); + return ERR_PTR(err); +} + +struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct mlx5_fc *counter; + int err; + + if (dev->disable_fc) + return ERR_PTR(-EOPNOTSUPP); + + counter = mlx5_fc_acquire(dev, aging); + if (IS_ERR(counter)) + return counter; + + INIT_LIST_HEAD(&counter->list); + counter->aging = aging; + + if (aging) { + u32 id = counter->id; + counter->cache.lastuse = jiffies; + counter->lastbytes = counter->cache.bytes; + counter->lastpackets = counter->cache.packets; + + idr_preload(GFP_KERNEL); + spin_lock(&fc_stats->counters_idr_lock); + + err = idr_alloc_u32(&fc_stats->counters_idr, counter, &id, id, + GFP_NOWAIT); + + 
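+		/* Descriptive note (added): the idr slot is keyed by the
+		 * counter's own hardware id (min == max == id), so
+		 * mlx5_fc_stats_insert() can use mlx5_fc_counters_lookup_next()
+		 * to keep the counters list sorted by id.
+		 */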
spin_unlock(&fc_stats->counters_idr_lock); + idr_preload_end(); + if (err) + goto err_out_alloc; + llist_add(&counter->addlist, &fc_stats->addlist); + + mod_delayed_work(fc_stats->wq, &fc_stats->work, 0); + } + + return counter; + +err_out_alloc: + mlx5_fc_release(dev, counter); + return ERR_PTR(err); +} +EXPORT_SYMBOL(mlx5_fc_create); + +u32 mlx5_fc_id(struct mlx5_fc *counter) +{ + return counter->id; +} +EXPORT_SYMBOL(mlx5_fc_id); + +void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + if (!counter) + return; + + if (counter->aging) { + llist_add(&counter->dellist, &fc_stats->dellist); + return; + } + + mlx5_fc_release(dev, counter); +} +EXPORT_SYMBOL(mlx5_fc_destroy); + +int mlx5_init_fc_stats(struct mlx5_core_dev *dev) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + int init_bulk_len; + int init_out_len; + + if (dev->disable_fc) + return 0; + + spin_lock_init(&fc_stats->counters_idr_lock); + idr_init(&fc_stats->counters_idr); + INIT_LIST_HEAD(&fc_stats->counters); + init_llist_head(&fc_stats->addlist); + init_llist_head(&fc_stats->dellist); + + init_bulk_len = get_init_bulk_query_len(dev); + init_out_len = mlx5_cmd_fc_get_bulk_query_out_len(init_bulk_len); + fc_stats->bulk_query_out = kzalloc(init_out_len, GFP_KERNEL); + if (!fc_stats->bulk_query_out) + return -ENOMEM; + fc_stats->bulk_query_len = init_bulk_len; + + fc_stats->wq = create_singlethread_workqueue("mlx5_fc"); + if (!fc_stats->wq) + goto err_wq_create; + + fc_stats->sampling_interval = MLX5_FC_STATS_PERIOD; + INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work); + + mlx5_fc_pool_init(&fc_stats->fc_pool, dev); + return 0; + +err_wq_create: + kfree(fc_stats->bulk_query_out); + return -ENOMEM; +} + +void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + struct llist_node *tmplist; + struct mlx5_fc *counter; + struct mlx5_fc *tmp; + + if (dev->disable_fc) + return; + + cancel_delayed_work_sync(&dev->priv.fc_stats.work); + destroy_workqueue(dev->priv.fc_stats.wq); + dev->priv.fc_stats.wq = NULL; + + tmplist = llist_del_all(&fc_stats->addlist); + llist_for_each_entry_safe(counter, tmp, tmplist, addlist) + mlx5_fc_release(dev, counter); + + list_for_each_entry_safe(counter, tmp, &fc_stats->counters, list) + mlx5_fc_release(dev, counter); + + mlx5_fc_pool_cleanup(&fc_stats->fc_pool); + idr_destroy(&fc_stats->counters_idr); + kfree(fc_stats->bulk_query_out); +} + +int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, + u64 *packets, u64 *bytes) +{ + return mlx5_cmd_fc_query(dev, counter->id, packets, bytes, false); +} +EXPORT_SYMBOL(mlx5_fc_query); + +int mlx5_fc_query_and_clear(struct mlx5_core_dev *dev, struct mlx5_fc *counter, + u64 *packets, u64 *bytes) +{ + return mlx5_cmd_fc_query(dev, counter->id, packets, bytes, true); +} + +u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter) +{ + return counter->cache.lastuse; +} + +void mlx5_fc_query_cached(struct mlx5_fc *counter, + u64 *bytes, u64 *packets, u64 *lastuse) +{ + struct mlx5_fc_cache c; + + c = counter->cache; + + *bytes = c.bytes - counter->lastbytes; + *packets = c.packets - counter->lastpackets; + *lastuse = c.lastuse; + + counter->lastbytes = c.bytes; + counter->lastpackets = c.packets; +} + +void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, + struct delayed_work *dwork, + unsigned long delay) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + queue_delayed_work(fc_stats->wq, dwork, 
delay); +} + +void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, + unsigned long interval) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + fc_stats->sampling_interval = min_t(unsigned long, interval, + fc_stats->sampling_interval); +} + +/* Flow counter bluks */ + +struct mlx5_fc_bulk { + struct list_head pool_list; + u32 base_id; + int bulk_len; + unsigned long *bitmask; + struct mlx5_fc fcs[]; +}; + +static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk, + u32 id) +{ + counter->bulk = bulk; + counter->id = id; +} + +static int mlx5_fc_bulk_get_free_fcs_amount(struct mlx5_fc_bulk *bulk) +{ + return bitmap_weight(bulk->bitmask, bulk->bulk_len); +} + +static struct mlx5_fc_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev) +{ + enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask; + struct mlx5_fc_bulk *bulk; + int err = -ENOMEM; + int bulk_len; + u32 base_id; + int i; + + alloc_bitmask = MLX5_CAP_GEN(dev, flow_counter_bulk_alloc); + bulk_len = alloc_bitmask > 0 ? MLX5_FC_BULK_NUM_FCS(alloc_bitmask) : 1; + + bulk = kvzalloc(struct_size(bulk, fcs, bulk_len), GFP_KERNEL); + if (!bulk) + goto err_alloc_bulk; + + bulk->bitmask = kvcalloc(BITS_TO_LONGS(bulk_len), sizeof(unsigned long), + GFP_KERNEL); + if (!bulk->bitmask) + goto err_alloc_bitmask; + + err = mlx5_cmd_fc_bulk_alloc(dev, alloc_bitmask, &base_id); + if (err) + goto err_mlx5_cmd_bulk_alloc; + + bulk->base_id = base_id; + bulk->bulk_len = bulk_len; + for (i = 0; i < bulk_len; i++) { + mlx5_fc_init(&bulk->fcs[i], bulk, base_id + i); + set_bit(i, bulk->bitmask); + } + + return bulk; + +err_mlx5_cmd_bulk_alloc: + kvfree(bulk->bitmask); +err_alloc_bitmask: + kvfree(bulk); +err_alloc_bulk: + return ERR_PTR(err); +} + +static int +mlx5_fc_bulk_destroy(struct mlx5_core_dev *dev, struct mlx5_fc_bulk *bulk) +{ + if (mlx5_fc_bulk_get_free_fcs_amount(bulk) < bulk->bulk_len) { + mlx5_core_err(dev, "Freeing bulk before all counters were released\n"); + return -EBUSY; + } + + mlx5_cmd_fc_free(dev, bulk->base_id); + kvfree(bulk->bitmask); + kvfree(bulk); + + return 0; +} + +static struct mlx5_fc *mlx5_fc_bulk_acquire_fc(struct mlx5_fc_bulk *bulk) +{ + int free_fc_index = find_first_bit(bulk->bitmask, bulk->bulk_len); + + if (free_fc_index >= bulk->bulk_len) + return ERR_PTR(-ENOSPC); + + clear_bit(free_fc_index, bulk->bitmask); + return &bulk->fcs[free_fc_index]; +} + +static int mlx5_fc_bulk_release_fc(struct mlx5_fc_bulk *bulk, struct mlx5_fc *fc) +{ + int fc_index = fc->id - bulk->base_id; + + if (test_bit(fc_index, bulk->bitmask)) + return -EINVAL; + + set_bit(fc_index, bulk->bitmask); + return 0; +} + +/* Flow counters pool API */ + +static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev) +{ + fc_pool->dev = dev; + mutex_init(&fc_pool->pool_lock); + INIT_LIST_HEAD(&fc_pool->fully_used); + INIT_LIST_HEAD(&fc_pool->partially_used); + INIT_LIST_HEAD(&fc_pool->unused); + fc_pool->available_fcs = 0; + fc_pool->used_fcs = 0; + fc_pool->threshold = 0; +} + +static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool) +{ + struct mlx5_core_dev *dev = fc_pool->dev; + struct mlx5_fc_bulk *bulk; + struct mlx5_fc_bulk *tmp; + + list_for_each_entry_safe(bulk, tmp, &fc_pool->fully_used, pool_list) + mlx5_fc_bulk_destroy(dev, bulk); + list_for_each_entry_safe(bulk, tmp, &fc_pool->partially_used, pool_list) + mlx5_fc_bulk_destroy(dev, bulk); + list_for_each_entry_safe(bulk, tmp, &fc_pool->unused, pool_list) + mlx5_fc_bulk_destroy(dev, bulk); +} + +static void 
mlx5_fc_pool_update_threshold(struct mlx5_fc_pool *fc_pool) +{ + fc_pool->threshold = min_t(int, MLX5_FC_POOL_MAX_THRESHOLD, + fc_pool->used_fcs / MLX5_FC_POOL_USED_BUFF_RATIO); +} + +static struct mlx5_fc_bulk * +mlx5_fc_pool_alloc_new_bulk(struct mlx5_fc_pool *fc_pool) +{ + struct mlx5_core_dev *dev = fc_pool->dev; + struct mlx5_fc_bulk *new_bulk; + + new_bulk = mlx5_fc_bulk_create(dev); + if (!IS_ERR(new_bulk)) + fc_pool->available_fcs += new_bulk->bulk_len; + mlx5_fc_pool_update_threshold(fc_pool); + return new_bulk; +} + +static void +mlx5_fc_pool_free_bulk(struct mlx5_fc_pool *fc_pool, struct mlx5_fc_bulk *bulk) +{ + struct mlx5_core_dev *dev = fc_pool->dev; + + fc_pool->available_fcs -= bulk->bulk_len; + mlx5_fc_bulk_destroy(dev, bulk); + mlx5_fc_pool_update_threshold(fc_pool); +} + +static struct mlx5_fc * +mlx5_fc_pool_acquire_from_list(struct list_head *src_list, + struct list_head *next_list, + bool move_non_full_bulk) +{ + struct mlx5_fc_bulk *bulk; + struct mlx5_fc *fc; + + if (list_empty(src_list)) + return ERR_PTR(-ENODATA); + + bulk = list_first_entry(src_list, struct mlx5_fc_bulk, pool_list); + fc = mlx5_fc_bulk_acquire_fc(bulk); + if (move_non_full_bulk || mlx5_fc_bulk_get_free_fcs_amount(bulk) == 0) + list_move(&bulk->pool_list, next_list); + return fc; +} + +static struct mlx5_fc * +mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool) +{ + struct mlx5_fc_bulk *new_bulk; + struct mlx5_fc *fc; + + mutex_lock(&fc_pool->pool_lock); + + fc = mlx5_fc_pool_acquire_from_list(&fc_pool->partially_used, + &fc_pool->fully_used, false); + if (IS_ERR(fc)) + fc = mlx5_fc_pool_acquire_from_list(&fc_pool->unused, + &fc_pool->partially_used, + true); + if (IS_ERR(fc)) { + new_bulk = mlx5_fc_pool_alloc_new_bulk(fc_pool); + if (IS_ERR(new_bulk)) { + fc = ERR_CAST(new_bulk); + goto out; + } + fc = mlx5_fc_bulk_acquire_fc(new_bulk); + list_add(&new_bulk->pool_list, &fc_pool->partially_used); + } + fc_pool->available_fcs--; + fc_pool->used_fcs++; + +out: + mutex_unlock(&fc_pool->pool_lock); + return fc; +} + +static void +mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct mlx5_fc *fc) +{ + struct mlx5_core_dev *dev = fc_pool->dev; + struct mlx5_fc_bulk *bulk = fc->bulk; + int bulk_free_fcs_amount; + + mutex_lock(&fc_pool->pool_lock); + + if (mlx5_fc_bulk_release_fc(bulk, fc)) { + mlx5_core_warn(dev, "Attempted to release a counter which is not acquired\n"); + goto unlock; + } + + fc_pool->available_fcs++; + fc_pool->used_fcs--; + + bulk_free_fcs_amount = mlx5_fc_bulk_get_free_fcs_amount(bulk); + if (bulk_free_fcs_amount == 1) + list_move_tail(&bulk->pool_list, &fc_pool->partially_used); + if (bulk_free_fcs_amount == bulk->bulk_len) { + list_del(&bulk->pool_list); + if (fc_pool->available_fcs > fc_pool->threshold) + mlx5_fc_pool_free_bulk(fc_pool, bulk); + else + list_add(&bulk->pool_list, &fc_pool->unused); + } + +unlock: + mutex_unlock(&fc_pool->pool_lock); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c new file mode 100644 index 0000000..c14590a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#include "fs_ft_pool.h" + +/* Firmware currently has 4 pool of 4 sizes that it supports (FT_POOLS), + * and a virtual memory region of 16M (MLX5_FT_SIZE), this region is duplicated + * for each flow table pool. We can allocate up to 16M of each pool, + * and we keep track of how much we used via mlx5_ft_pool_get_avail_sz. + * Firmware doesn't report any of this for now. + * ESW_POOL is expected to be sorted from large to small and match firmware + * pools. + */ +#define FT_SIZE (16 * 1024 * 1024) +static const unsigned int FT_POOLS[] = { 4 * 1024 * 1024, + 1 * 1024 * 1024, + 64 * 1024, + 128, + 1 /* size for termination tables */ }; +struct mlx5_ft_pool { + int ft_left[ARRAY_SIZE(FT_POOLS)]; +}; + +int mlx5_ft_pool_init(struct mlx5_core_dev *dev) +{ + struct mlx5_ft_pool *ft_pool; + int i; + + ft_pool = kzalloc(sizeof(*ft_pool), GFP_KERNEL); + if (!ft_pool) + return -ENOMEM; + + for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--) + ft_pool->ft_left[i] = FT_SIZE / FT_POOLS[i]; + + dev->priv.ft_pool = ft_pool; + return 0; +} + +void mlx5_ft_pool_destroy(struct mlx5_core_dev *dev) +{ + kfree(dev->priv.ft_pool); +} + +int +mlx5_ft_pool_get_avail_sz(struct mlx5_core_dev *dev, enum fs_flow_table_type table_type, + int desired_size) +{ + u32 max_ft_size = 1 << MLX5_CAP_FLOWTABLE_TYPE(dev, log_max_ft_size, table_type); + int i, found_i = -1; + + for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--) { + if (dev->priv.ft_pool->ft_left[i] && FT_POOLS[i] >= desired_size && + FT_POOLS[i] <= max_ft_size) { + found_i = i; + if (desired_size != POOL_NEXT_SIZE) + break; + } + } + + if (found_i != -1) { + --dev->priv.ft_pool->ft_left[found_i]; + return FT_POOLS[found_i]; + } + + return 0; +} + +void +mlx5_ft_pool_put_sz(struct mlx5_core_dev *dev, int sz) +{ + int i; + + if (!sz) + return; + + for (i = ARRAY_SIZE(FT_POOLS) - 1; i >= 0; i--) { + if (sz == FT_POOLS[i]) { + ++dev->priv.ft_pool->ft_left[i]; + return; + } + } + + WARN_ONCE(1, "Couldn't find size %d in flow table size pool", sz); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h new file mode 100644 index 0000000..25f4274 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fs_ft_pool.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. */ + +#ifndef __MLX5_FS_FT_POOL_H__ +#define __MLX5_FS_FT_POOL_H__ + +#include +#include "fs_core.h" + +#define POOL_NEXT_SIZE 0 + +int mlx5_ft_pool_init(struct mlx5_core_dev *dev); +void mlx5_ft_pool_destroy(struct mlx5_core_dev *dev); + +int +mlx5_ft_pool_get_avail_sz(struct mlx5_core_dev *dev, enum fs_flow_table_type table_type, + int desired_size); +void +mlx5_ft_pool_put_sz(struct mlx5_core_dev *dev, int sz); + +#endif /* __MLX5_FS_FT_POOL_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw.c new file mode 100644 index 0000000..a04ea4e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" +#include "../../mlxfw/mlxfw.h" +#include "lib/tout.h" +#include "accel/tls.h" + +enum { + MCQS_IDENTIFIER_BOOT_IMG = 0x1, + MCQS_IDENTIFIER_OEM_NVCONFIG = 0x4, + MCQS_IDENTIFIER_MLNX_NVCONFIG = 0x5, + MCQS_IDENTIFIER_CS_TOKEN = 0x6, + MCQS_IDENTIFIER_DBG_TOKEN = 0x7, + MCQS_IDENTIFIER_GEARBOX = 0xA, +}; + +enum { + MCQS_UPDATE_STATE_IDLE, + MCQS_UPDATE_STATE_IN_PROGRESS, + MCQS_UPDATE_STATE_APPLIED, + MCQS_UPDATE_STATE_ACTIVE, + MCQS_UPDATE_STATE_ACTIVE_PENDING_RESET, + MCQS_UPDATE_STATE_FAILED, + MCQS_UPDATE_STATE_CANCELED, + MCQS_UPDATE_STATE_BUSY, +}; + +enum { + MCQI_INFO_TYPE_CAPABILITIES = 0x0, + MCQI_INFO_TYPE_VERSION = 0x1, + MCQI_INFO_TYPE_ACTIVATION_METHOD = 0x5, +}; + +enum { + MCQI_FW_RUNNING_VERSION = 0, + MCQI_FW_STORED_VERSION = 1, +}; + +int mlx5_query_board_id(struct mlx5_core_dev *dev) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_adapter_out); + u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {}; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER); + err = mlx5_cmd_exec_inout(dev, query_adapter, in, out); + if (err) + goto out; + + memcpy(dev->board_id, + MLX5_ADDR_OF(query_adapter_out, out, + query_adapter_struct.vsd_contd_psid), + MLX5_FLD_SZ_BYTES(query_adapter_out, + query_adapter_struct.vsd_contd_psid)); + +out: + kfree(out); + return err; +} + +int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_adapter_out); + u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {}; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER); + err = mlx5_cmd_exec_inout(mdev, query_adapter, in, out); + if (err) + goto out; + + *vendor_id = MLX5_GET(query_adapter_out, out, + query_adapter_struct.ieee_vendor_id); +out: + kfree(out); + return err; +} +EXPORT_SYMBOL(mlx5_core_query_vendor_id); + +static int mlx5_get_pcam_reg(struct mlx5_core_dev *dev) +{ + return mlx5_query_pcam_reg(dev, dev->caps.pcam, + MLX5_PCAM_FEATURE_ENHANCED_FEATURES, + MLX5_PCAM_REGS_5000_TO_507F); +} + +static int mlx5_get_mcam_access_reg_group(struct mlx5_core_dev *dev, + enum 
mlx5_mcam_reg_groups group) +{ + return mlx5_query_mcam_reg(dev, dev->caps.mcam[group], + MLX5_MCAM_FEATURE_ENHANCED_FEATURES, group); +} + +static int mlx5_get_qcam_reg(struct mlx5_core_dev *dev) +{ + return mlx5_query_qcam_reg(dev, dev->caps.qcam, + MLX5_QCAM_FEATURE_ENHANCED_FEATURES, + MLX5_QCAM_REGS_FIRST_128); +} + +int mlx5_query_hca_caps(struct mlx5_core_dev *dev) +{ + int err; + + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); + if (err) + return err; + + if (MLX5_CAP_GEN(dev, port_selection_cap)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_PORT_SELECTION); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, hca_cap_2)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL_2); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, eth_net_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_IPOIB_ENHANCED_OFFLOADS); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, pg)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ODP); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, atomic)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, roce)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, nic_flow_table) || + MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, vport_group_manager) && + MLX5_ESWITCH_MANAGER(dev)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE); + if (err) + return err; + } + + if (MLX5_ESWITCH_MANAGER(dev)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, vector_calc)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_VECTOR_CALC); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, qos)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_QOS); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, debug)) + mlx5_core_get_caps(dev, MLX5_CAP_DEBUG); + + if (MLX5_CAP_GEN(dev, debug)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_DEBUG); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, nvmf_target_offload)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_NVMF); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, pcam_reg)) + mlx5_get_pcam_reg(dev); + + if (MLX5_CAP_GEN(dev, mcam_reg)) { + mlx5_get_mcam_access_reg_group(dev, MLX5_MCAM_REGS_FIRST_128); + mlx5_get_mcam_access_reg_group(dev, MLX5_MCAM_REGS_0x9080_0x90FF); + mlx5_get_mcam_access_reg_group(dev, MLX5_MCAM_REGS_0x9100_0x917F); + } + + if (MLX5_CAP_GEN(dev, qcam_reg)) + mlx5_get_qcam_reg(dev); + + if (MLX5_CAP_GEN(dev, device_memory)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_MEM); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, event_cap)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_EVENT); + if (err) + return err; + } + + if (mlx5_accel_is_ktls_tx(dev) || mlx5_accel_is_ktls_rx(dev)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_TLS); + if (err) + return err; + } + + if (MLX5_CAP_GEN_64(dev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) { + err = mlx5_core_get_caps(dev, MLX5_CAP_VDPA_EMULATION); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, ipsec_offload)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_IPSEC); + if (err) + return err; + } + + if (MLX5_CAP_GEN(dev, shampo)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_SHAMPO); + if (err) + return err; + } 
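+	/* Descriptive note (added): special contexts are queried and cached
+	 * unconditionally at this point, while the remaining capability
+	 * groups below stay gated on their respective capability bits.
+	 */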
+ + err = mlx5_core_query_special_contexts(dev); + if (err) + return err; + + if (MLX5_CAP_GEN_64(dev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD) { + err = mlx5_core_get_caps(dev, MLX5_CAP_MACSEC); + if (err) + return err; + } + + return 0; +} + +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id) +{ + u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {}; + int i; + + MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); + + if (MLX5_CAP_GEN(dev, sw_owner_id)) { + for (i = 0; i < 4; i++) + MLX5_ARRAY_SET(init_hca_in, in, sw_owner_id, i, + sw_owner_id[i]); + } + + return mlx5_cmd_exec_in(dev, init_hca, in); +} + +int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {}; + + MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); + return mlx5_cmd_exec_in(dev, teardown_hca, in); +} + +int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0}; + int force_state; + int ret; + + if (!MLX5_CAP_GEN(dev, force_teardown)) { + mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n"); + return -EOPNOTSUPP; + } + + MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); + MLX5_SET(teardown_hca_in, in, profile, MLX5_TEARDOWN_HCA_IN_PROFILE_FORCE_CLOSE); + + ret = mlx5_cmd_exec_polling(dev, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + force_state = MLX5_GET(teardown_hca_out, out, state); + if (force_state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) { + mlx5_core_dbg(dev, "teardown with force mode failed, doing normal teardown\n"); + return -EIO; + } + + return 0; +} + +int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev) +{ + unsigned long end, delay_ms = mlx5_tout_ms(dev, TEARDOWN); + u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {}; + int state; + int ret; + + if (!MLX5_CAP_GEN(dev, fast_teardown)) { + mlx5_core_dbg(dev, "fast teardown is not supported in the firmware\n"); + return -EOPNOTSUPP; + } + + MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); + MLX5_SET(teardown_hca_in, in, profile, + MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN); + + ret = mlx5_cmd_exec_inout(dev, teardown_hca, in, out); + if (ret) + return ret; + + state = MLX5_GET(teardown_hca_out, out, state); + if (state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) { + mlx5_core_dbg(dev, "teardown with fast mode failed\n"); + return -EIO; + } + + mlx5_set_nic_state(dev, MLX5_NIC_IFC_DISABLED); + + /* Loop until device state turns to disable */ + end = jiffies + msecs_to_jiffies(delay_ms); + do { + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) + break; + + cond_resched(); + } while (!time_after(jiffies, end)); + + if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) { + dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n", + mlx5_get_nic_state(dev), delay_ms); + return -EIO; + } + + return 0; +} + +enum mlxsw_reg_mcc_instruction { + MLX5_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE = 0x01, + MLX5_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE = 0x02, + MLX5_REG_MCC_INSTRUCTION_UPDATE_COMPONENT = 0x03, + MLX5_REG_MCC_INSTRUCTION_VERIFY_COMPONENT = 0x04, + MLX5_REG_MCC_INSTRUCTION_ACTIVATE = 0x06, + MLX5_REG_MCC_INSTRUCTION_CANCEL = 0x08, +}; + +static int mlx5_reg_mcc_set(struct mlx5_core_dev *dev, + enum mlxsw_reg_mcc_instruction instr, + u16 component_index, u32 update_handle, + u32 component_size) +{ + u32 out[MLX5_ST_SZ_DW(mcc_reg)]; + u32 
in[MLX5_ST_SZ_DW(mcc_reg)]; + + memset(in, 0, sizeof(in)); + + MLX5_SET(mcc_reg, in, instruction, instr); + MLX5_SET(mcc_reg, in, component_index, component_index); + MLX5_SET(mcc_reg, in, update_handle, update_handle); + MLX5_SET(mcc_reg, in, component_size, component_size); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCC, 0, 1); +} + +static int mlx5_reg_mcc_query(struct mlx5_core_dev *dev, + u32 *update_handle, u8 *error_code, + u8 *control_state) +{ + u32 out[MLX5_ST_SZ_DW(mcc_reg)]; + u32 in[MLX5_ST_SZ_DW(mcc_reg)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + MLX5_SET(mcc_reg, in, update_handle, *update_handle); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCC, 0, 0); + if (err) + goto out; + + *update_handle = MLX5_GET(mcc_reg, out, update_handle); + *error_code = MLX5_GET(mcc_reg, out, error_code); + *control_state = MLX5_GET(mcc_reg, out, control_state); + +out: + return err; +} + +static int mlx5_reg_mcda_set(struct mlx5_core_dev *dev, + u32 update_handle, + u32 offset, u16 size, + u8 *data) +{ + int err, in_size = MLX5_ST_SZ_BYTES(mcda_reg) + size; + u32 out[MLX5_ST_SZ_DW(mcda_reg)]; + int i, j, dw_size = size >> 2; + __be32 data_element; + u32 *in; + + in = kzalloc(in_size, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(mcda_reg, in, update_handle, update_handle); + MLX5_SET(mcda_reg, in, offset, offset); + MLX5_SET(mcda_reg, in, size, size); + + for (i = 0; i < dw_size; i++) { + j = i * 4; + data_element = htonl(*(u32 *)&data[j]); + memcpy(MLX5_ADDR_OF(mcda_reg, in, data) + j, &data_element, 4); + } + + err = mlx5_core_access_reg(dev, in, in_size, out, + sizeof(out), MLX5_REG_MCDA, 0, 1); + kfree(in); + return err; +} + +static int mlx5_reg_mcqi_query(struct mlx5_core_dev *dev, + u16 component_index, bool read_pending, + u8 info_type, u16 data_size, void *mcqi_data) +{ + u32 out[MLX5_ST_SZ_DW(mcqi_reg) + MLX5_UN_SZ_DW(mcqi_reg_data)] = {}; + u32 in[MLX5_ST_SZ_DW(mcqi_reg)] = {}; + void *data; + int err; + + MLX5_SET(mcqi_reg, in, component_index, component_index); + MLX5_SET(mcqi_reg, in, read_pending_component, read_pending); + MLX5_SET(mcqi_reg, in, info_type, info_type); + MLX5_SET(mcqi_reg, in, data_size, data_size); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + MLX5_ST_SZ_BYTES(mcqi_reg) + data_size, + MLX5_REG_MCQI, 0, 0); + if (err) + return err; + + data = MLX5_ADDR_OF(mcqi_reg, out, data); + memcpy(mcqi_data, data, data_size); + + return 0; +} + +static int mlx5_reg_mcqi_caps_query(struct mlx5_core_dev *dev, u16 component_index, + u32 *max_component_size, u8 *log_mcda_word_size, + u16 *mcda_max_write_size) +{ + u32 mcqi_reg[MLX5_ST_SZ_DW(mcqi_cap)] = {}; + int err; + + err = mlx5_reg_mcqi_query(dev, component_index, 0, + MCQI_INFO_TYPE_CAPABILITIES, + MLX5_ST_SZ_BYTES(mcqi_cap), mcqi_reg); + if (err) + return err; + + *max_component_size = MLX5_GET(mcqi_cap, mcqi_reg, max_component_size); + *log_mcda_word_size = MLX5_GET(mcqi_cap, mcqi_reg, log_mcda_word_size); + *mcda_max_write_size = MLX5_GET(mcqi_cap, mcqi_reg, mcda_max_write_size); + + return 0; +} + +struct mlx5_mlxfw_dev { + struct mlxfw_dev mlxfw_dev; + struct mlx5_core_dev *mlx5_core_dev; +}; + +static int mlx5_component_query(struct mlxfw_dev *mlxfw_dev, + u16 component_index, u32 *p_max_size, + u8 *p_align_bits, u16 *p_max_write_size) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = 
mlx5_mlxfw_dev->mlx5_core_dev; + + if (!MLX5_CAP_GEN(dev, mcam_reg) || !MLX5_CAP_MCAM_REG(dev, mcqi)) { + mlx5_core_warn(dev, "caps query isn't supported by running FW\n"); + return -EOPNOTSUPP; + } + + return mlx5_reg_mcqi_caps_query(dev, component_index, p_max_size, + p_align_bits, p_max_write_size); +} + +static int mlx5_fsm_lock(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + u8 control_state, error_code; + int err; + + *fwhandle = 0; + err = mlx5_reg_mcc_query(dev, fwhandle, &error_code, &control_state); + if (err) + return err; + + if (control_state != MLXFW_FSM_STATE_IDLE) + return -EBUSY; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE, + 0, *fwhandle, 0); +} + +static int mlx5_fsm_component_update(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index, u32 component_size) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_UPDATE_COMPONENT, + component_index, fwhandle, component_size); +} + +static int mlx5_fsm_block_download(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u8 *data, u16 size, u32 offset) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcda_set(dev, fwhandle, offset, size, data); +} + +static int mlx5_fsm_component_verify(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_VERIFY_COMPONENT, + component_index, fwhandle, 0); +} + +static int mlx5_fsm_activate(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_ACTIVATE, 0, + fwhandle, 0); +} + +static int mlx5_fsm_query_state(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + enum mlxfw_fsm_state *fsm_state, + enum mlxfw_fsm_state_err *fsm_state_err) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + u8 control_state, error_code; + int err; + + err = mlx5_reg_mcc_query(dev, &fwhandle, &error_code, &control_state); + if (err) + return err; + + *fsm_state = control_state; + *fsm_state_err = min_t(enum mlxfw_fsm_state_err, error_code, + MLXFW_FSM_STATE_ERR_MAX); + return 0; +} + +static void mlx5_fsm_cancel(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_CANCEL, 0, fwhandle, 0); +} + +static void mlx5_fsm_release(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + + mlx5_reg_mcc_set(dev, 
MLX5_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE, 0, + fwhandle, 0); +} + +static int mlx5_fsm_reactivate(struct mlxfw_dev *mlxfw_dev, u8 *status) +{ + struct mlx5_mlxfw_dev *mlx5_mlxfw_dev = + container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev); + struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev; + u32 out[MLX5_ST_SZ_DW(mirc_reg)]; + u32 in[MLX5_ST_SZ_DW(mirc_reg)]; + unsigned long exp_time; + int err; + + exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FSM_REACTIVATE)); + + if (!MLX5_CAP_MCAM_REG2(dev, mirc)) + return -EOPNOTSUPP; + + memset(in, 0, sizeof(in)); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MIRC, 0, 1); + if (err) + return err; + + do { + memset(out, 0, sizeof(out)); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MIRC, 0, 0); + if (err) + return err; + + *status = MLX5_GET(mirc_reg, out, status_code); + if (*status != MLXFW_FSM_REACTIVATE_STATUS_BUSY) + return 0; + + msleep(20); + } while (time_before(jiffies, exp_time)); + + return 0; +} + +static const struct mlxfw_dev_ops mlx5_mlxfw_dev_ops = { + .component_query = mlx5_component_query, + .fsm_lock = mlx5_fsm_lock, + .fsm_component_update = mlx5_fsm_component_update, + .fsm_block_download = mlx5_fsm_block_download, + .fsm_component_verify = mlx5_fsm_component_verify, + .fsm_activate = mlx5_fsm_activate, + .fsm_reactivate = mlx5_fsm_reactivate, + .fsm_query_state = mlx5_fsm_query_state, + .fsm_cancel = mlx5_fsm_cancel, + .fsm_release = mlx5_fsm_release +}; + +int mlx5_firmware_flash(struct mlx5_core_dev *dev, + const struct firmware *firmware, + struct netlink_ext_ack *extack) +{ + struct mlx5_mlxfw_dev mlx5_mlxfw_dev = { + .mlxfw_dev = { + .ops = &mlx5_mlxfw_dev_ops, + .psid = dev->board_id, + .psid_size = strlen(dev->board_id), + .devlink = priv_to_devlink(dev), + }, + .mlx5_core_dev = dev + }; + + if (!MLX5_CAP_GEN(dev, mcam_reg) || + !MLX5_CAP_MCAM_REG(dev, mcqi) || + !MLX5_CAP_MCAM_REG(dev, mcc) || + !MLX5_CAP_MCAM_REG(dev, mcda)) { + pr_info("%s flashing isn't supported by the running FW\n", __func__); + return -EOPNOTSUPP; + } + + return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev, + firmware, extack); +} + +static int mlx5_reg_mcqi_version_query(struct mlx5_core_dev *dev, + u16 component_index, bool read_pending, + u32 *mcqi_version_out) +{ + return mlx5_reg_mcqi_query(dev, component_index, read_pending, + MCQI_INFO_TYPE_VERSION, + MLX5_ST_SZ_BYTES(mcqi_version), + mcqi_version_out); +} + +static int mlx5_reg_mcqs_query(struct mlx5_core_dev *dev, u32 *out, + u16 component_index) +{ + u8 out_sz = MLX5_ST_SZ_BYTES(mcqs_reg); + u32 in[MLX5_ST_SZ_DW(mcqs_reg)] = {}; + int err; + + memset(out, 0, out_sz); + + MLX5_SET(mcqs_reg, in, component_index, component_index); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + out_sz, MLX5_REG_MCQS, 0, 0); + return err; +} + +/* scans component index sequentially, to find the boot img index */ +static int mlx5_get_boot_img_component_index(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(mcqs_reg)] = {}; + u16 identifier, component_idx = 0; + bool quit; + int err; + + do { + err = mlx5_reg_mcqs_query(dev, out, component_idx); + if (err) + return err; + + identifier = MLX5_GET(mcqs_reg, out, identifier); + quit = !!MLX5_GET(mcqs_reg, out, last_index_flag); + quit |= identifier == MCQS_IDENTIFIER_BOOT_IMG; + } while (!quit && ++component_idx); + + if (identifier != MCQS_IDENTIFIER_BOOT_IMG) { + mlx5_core_warn(dev, "mcqs: can't find boot_img component ix, last scanned idx %d\n", 
+ component_idx); + return -EOPNOTSUPP; + } + + return component_idx; +} + +static int +mlx5_fw_image_pending(struct mlx5_core_dev *dev, + int component_index, + bool *pending_version_exists) +{ + u32 out[MLX5_ST_SZ_DW(mcqs_reg)]; + u8 component_update_state; + int err; + + err = mlx5_reg_mcqs_query(dev, out, component_index); + if (err) + return err; + + component_update_state = MLX5_GET(mcqs_reg, out, component_update_state); + + if (component_update_state == MCQS_UPDATE_STATE_IDLE) { + *pending_version_exists = false; + } else if (component_update_state == MCQS_UPDATE_STATE_ACTIVE_PENDING_RESET) { + *pending_version_exists = true; + } else { + mlx5_core_warn(dev, + "mcqs: can't read pending fw version while fw state is %d\n", + component_update_state); + return -ENODATA; + } + return 0; +} + +int mlx5_fw_version_query(struct mlx5_core_dev *dev, + u32 *running_ver, u32 *pending_ver) +{ + u32 reg_mcqi_version[MLX5_ST_SZ_DW(mcqi_version)] = {}; + bool pending_version_exists; + int component_index; + int err; + + if (!MLX5_CAP_GEN(dev, mcam_reg) || !MLX5_CAP_MCAM_REG(dev, mcqi) || + !MLX5_CAP_MCAM_REG(dev, mcqs)) { + mlx5_core_warn(dev, "fw query isn't supported by the FW\n"); + return -EOPNOTSUPP; + } + + component_index = mlx5_get_boot_img_component_index(dev); + if (component_index < 0) + return component_index; + + err = mlx5_reg_mcqi_version_query(dev, component_index, + MCQI_FW_RUNNING_VERSION, + reg_mcqi_version); + if (err) + return err; + + *running_ver = MLX5_GET(mcqi_version, reg_mcqi_version, version); + + err = mlx5_fw_image_pending(dev, component_index, &pending_version_exists); + if (err) + return err; + + if (!pending_version_exists) { + *pending_ver = 0; + return 0; + } + + err = mlx5_reg_mcqi_version_query(dev, component_index, + MCQI_FW_STORED_VERSION, + reg_mcqi_version); + if (err) + return err; + + *pending_ver = MLX5_GET(mcqi_version, reg_mcqi_version, version); + + return 0; +} + +static int query_other_hca_cap(struct mlx5_core_dev *mdev, + u16 function_id, void *out) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_other_hca_cap_out); + int in_sz = MLX5_ST_SZ_BYTES(query_other_hca_cap_in); + void *in; + int err; + + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(query_other_hca_cap_in, in, opcode, + MLX5_CMD_OP_QUERY_OTHER_HCA_CAP); + MLX5_SET(query_other_hca_cap_in, in, function_id, function_id); + + err = mlx5_cmd_exec(mdev, in, in_sz, out, out_sz); + + kfree(in); + return err; +} + +static int modify_other_hca_cap(struct mlx5_core_dev *mdev, + u16 function_id, void *in) +{ + int out_sz = MLX5_ST_SZ_BYTES(modify_other_hca_cap_out); + int in_sz = MLX5_ST_SZ_BYTES(modify_other_hca_cap_in); + void *out; + int err; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(modify_other_hca_cap_in, in, opcode, + MLX5_CMD_OP_MODIFY_OTHER_HCA_CAP); + MLX5_SET(modify_other_hca_cap_in, in, function_id, function_id); + + err = mlx5_cmd_exec(mdev, in, in_sz, out, out_sz); + + kfree(out); + return err; +} + +int mlx5_get_other_hca_cap_roce(struct mlx5_core_dev *mdev, + u16 function_id, bool *value) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_other_hca_cap_out); + void *out; + void *other_capability; + int err; + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = query_other_hca_cap(mdev, function_id, out); + if (err) + goto out; + + other_capability = MLX5_ADDR_OF(query_other_hca_cap_out, + out, other_capability); + *value = MLX5_GET(other_hca_cap, other_capability, roce); + +out: + kfree(out); + return err; +} 
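+/* Illustrative sketch (added, not part of the driver): the helper above
+ * (mlx5_get_other_hca_cap_roce) and the one below
+ * (mlx5_modify_other_hca_cap_roce) are typically paired in SR-IOV
+ * configuration paths to read and toggle another function's RoCE cap, e.g.:
+ *
+ *	bool roce_en;
+ *	int err;
+ *
+ *	err = mlx5_get_other_hca_cap_roce(mdev, vf_function_id, &roce_en);
+ *	if (!err && !roce_en)
+ *		err = mlx5_modify_other_hca_cap_roce(mdev, vf_function_id, true);
+ *
+ * where vf_function_id is assumed to be the function id of the other
+ * (e.g. VF) HCA; the exact caller depends on the surrounding code.
+ */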
+ +int mlx5_modify_other_hca_cap_roce(struct mlx5_core_dev *mdev, + u16 function_id, bool value) +{ + int in_sz = MLX5_ST_SZ_BYTES(modify_other_hca_cap_in); + struct mlx5_ifc_other_hca_cap_bits *other_capability; + void *in; + int err; + + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_other_hca_cap_in, in, field_select, ROCE_SELECT); + other_capability = (struct mlx5_ifc_other_hca_cap_bits *) + MLX5_ADDR_OF(modify_other_hca_cap_in, + in, other_capability); + MLX5_SET(other_hca_cap, other_capability, roce, value); + + err = modify_other_hca_cap(mdev, function_id, in); + + kfree(in); + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_exp.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_exp.c new file mode 100644 index 0000000..301288c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_exp.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +int mlx5_core_set_dc_cnak_trace(struct mlx5_core_dev *dev, int enable_val, + u64 addr) +{ + u32 in[MLX5_ST_SZ_DW(set_dc_cnak_trace_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_dc_cnak_trace_out)] = {0}; + __be64 be_addr; + void *pas; + + MLX5_SET(set_dc_cnak_trace_in, in, opcode, + MLX5_CMD_OP_SET_DC_CNAK_TRACE); + MLX5_SET(set_dc_cnak_trace_in, in, enable, enable_val); + pas = MLX5_ADDR_OF(set_dc_cnak_trace_in, in, pas); + be_addr = cpu_to_be64(addr); + memcpy(MLX5_ADDR_OF(cmd_pas, pas, pa_h), &be_addr, sizeof(be_addr)); + + return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_set_dc_cnak_trace); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c new file mode 100644 index 0000000..e823804 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -0,0 +1,775 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. 
*/ + +#include "fw_reset.h" +#include "diag/fw_tracer.h" +#include "lib/tout.h" + +enum { + MLX5_FW_RESET_FLAGS_RESET_REQUESTED, + MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, + MLX5_FW_RESET_FLAGS_PENDING_COMP, + MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS +}; + +struct mlx5_fw_reset { + struct mlx5_core_dev *dev; + struct mlx5_nb nb; + struct workqueue_struct *wq; + struct work_struct fw_live_patch_work; + struct work_struct reset_request_work; + struct work_struct reset_reload_work; + struct work_struct reset_now_work; + struct work_struct reset_abort_work; + unsigned long reset_flags; + struct timer_list timer; + struct completion done; + int ret; +}; + +void mlx5_fw_reset_enable_remote_dev_reset_set(struct mlx5_core_dev *dev, bool enable) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + if (enable) + clear_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags); + else + set_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags); +} + +bool mlx5_fw_reset_enable_remote_dev_reset_get(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + return !test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags); +} + +static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level, + u8 reset_type_sel, u8 sync_resp, bool sync_start) +{ + u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; + + MLX5_SET(mfrl_reg, in, reset_level, reset_level); + MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel); + MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_resp, sync_resp); + MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, sync_start); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 1); +} + +static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level, + u8 *reset_type, u8 *reset_state) +{ + u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; + int err; + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 0); + if (err) + return err; + + if (reset_level) + *reset_level = MLX5_GET(mfrl_reg, out, reset_level); + if (reset_type) + *reset_type = MLX5_GET(mfrl_reg, out, reset_type); + if (reset_state) + *reset_state = MLX5_GET(mfrl_reg, out, reset_state); + + return 0; +} + +int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type) +{ + return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL); +} + +static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev, + struct netlink_ext_ack *extack) +{ + u8 reset_state; + + if (mlx5_reg_mfrl_query(dev, NULL, NULL, &reset_state)) + goto out; + + switch (reset_state) { + case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION: + case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS: + NL_SET_ERR_MSG_MOD(extack, "Sync reset was already triggered"); + return -EBUSY; + case MLX5_MFRL_REG_RESET_STATE_TIMEOUT: + NL_SET_ERR_MSG_MOD(extack, "Sync reset got timeout"); + return -ETIMEDOUT; + case MLX5_MFRL_REG_RESET_STATE_NACK: + NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset"); + return -EPERM; + } + +out: + NL_SET_ERR_MSG_MOD(extack, "Sync reset failed"); + return -EIO; +} + +int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel, + struct netlink_ext_ack *extack) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; + int err; + + set_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); + + 
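+ /* Request a PCI-synchronized firmware reset (MFRL reset level 3) and
+ * mark this driver as the initiator via PENDING_COMP, so the reset
+ * completion is reported back through fw_reset->done.
+ */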
MLX5_SET(mfrl_reg, in, reset_level, MLX5_MFRL_REG_RESET_LEVEL3); + MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel); + MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, 1); + err = mlx5_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MFRL, 0, 1, false); + if (!err) + return 0; + + clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); + if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state)) + return mlx5_fw_reset_get_reset_state_err(dev, extack); + + NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed"); + return mlx5_cmd_check(dev, err, in, out); +} + +int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) +{ + return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL0, 0, 0, false); +} + +static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + /* if this is the driver that initiated the fw reset, devlink completed the reload */ + if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) { + complete(&fw_reset->done); + } else { + mlx5_load_one(dev, false); + devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0, + BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE)); + } +} + +static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + del_timer_sync(&fw_reset->timer); +} + +static int mlx5_sync_reset_clear_reset_requested(struct mlx5_core_dev *dev, bool poll_health) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + if (!test_and_clear_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) { + mlx5_core_warn(dev, "Reset request was already cleared\n"); + return -EALREADY; + } + + mlx5_stop_sync_reset_poll(dev); + if (poll_health) + mlx5_start_health_poll(dev); + return 0; +} + +static void mlx5_sync_reset_reload_work(struct work_struct *work) +{ + struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, + reset_reload_work); + struct mlx5_core_dev *dev = fw_reset->dev; + int err; + + mlx5_sync_reset_clear_reset_requested(dev, false); + mlx5_enter_error_state(dev, true); + mlx5_unload_one(dev); + err = mlx5_health_wait_pci_up(dev); + if (err) + mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n"); + fw_reset->ret = err; + mlx5_fw_reset_complete_reload(dev); +} + +#define MLX5_RESET_POLL_INTERVAL (HZ / 10) +static void poll_sync_reset(struct timer_list *t) +{ + struct mlx5_fw_reset *fw_reset = from_timer(fw_reset, t, timer); + struct mlx5_core_dev *dev = fw_reset->dev; + u32 fatal_error; + + if (!test_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) + return; + + fatal_error = mlx5_health_check_fatal_sensors(dev); + + if (fatal_error) { + mlx5_core_warn(dev, "Got Device Reset\n"); + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags)) + queue_work(fw_reset->wq, &fw_reset->reset_reload_work); + else + mlx5_core_err(dev, "Device is being removed, Drop new reset work\n"); + return; + } + + mod_timer(&fw_reset->timer, round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL)); +} + +static void mlx5_start_sync_reset_poll(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + timer_setup(&fw_reset->timer, poll_sync_reset, 0); + fw_reset->timer.expires = round_jiffies(jiffies + MLX5_RESET_POLL_INTERVAL); + add_timer(&fw_reset->timer); +} + +static int 
mlx5_fw_reset_set_reset_sync_ack(struct mlx5_core_dev *dev) +{ + return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 1, false); +} + +static int mlx5_fw_reset_set_reset_sync_nack(struct mlx5_core_dev *dev) +{ + return mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, 0, 2, false); +} + +static int mlx5_sync_reset_set_reset_requested(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + if (test_and_set_bit(MLX5_FW_RESET_FLAGS_RESET_REQUESTED, &fw_reset->reset_flags)) { + mlx5_core_warn(dev, "Reset request was already set\n"); + return -EALREADY; + } + mlx5_stop_health_poll(dev, true); + mlx5_start_sync_reset_poll(dev); + return 0; +} + +static void mlx5_fw_live_patch_event(struct work_struct *work) +{ + struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, + fw_live_patch_work); + struct mlx5_core_dev *dev = fw_reset->dev; + + mlx5_core_info(dev, "Live patch updated firmware version: %d.%d.%d\n", fw_rev_maj(dev), + fw_rev_min(dev), fw_rev_sub(dev)); + + if (mlx5_fw_tracer_reload(dev->tracer)) + mlx5_core_err(dev, "Failed to reload FW tracer\n"); +} + +static void mlx5_sync_reset_request_event(struct work_struct *work) +{ + struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, + reset_request_work); + struct mlx5_core_dev *dev = fw_reset->dev; + int err; + + if (test_bit(MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST, &fw_reset->reset_flags)) { + err = mlx5_fw_reset_set_reset_sync_nack(dev); + mlx5_core_warn(dev, "PCI Sync FW Update Reset Nack %s", + err ? "Failed" : "Sent"); + return; + } + if (mlx5_sync_reset_set_reset_requested(dev)) + return; + + err = mlx5_fw_reset_set_reset_sync_ack(dev); + if (err) + mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack Failed. Error code: %d\n", err); + else + mlx5_core_warn(dev, "PCI Sync FW Update Reset Ack. 
Device reset is expected.\n"); +} + +static int mlx5_pci_config_hw_control(struct pci_dev *root_port, + bool new_val, bool *prev_val) +{ + u16 root_ctl, root_cap; + bool curr_val; + int ret; + + pcie_capability_read_word(root_port, PCI_EXP_RTCAP, &root_cap); + if (!(root_cap & PCI_EXP_RTCAP_CRSVIS)) + return 0; + + pcie_capability_read_word(root_port, PCI_EXP_RTCTL, &root_ctl); + curr_val = !(root_ctl & PCI_EXP_RTCTL_CRSSVE); + + if (prev_val) + *prev_val = curr_val; + + if (curr_val == new_val) + return 0; + + if (new_val) + ret = pcie_capability_clear_word(root_port, PCI_EXP_RTCTL, + PCI_EXP_RTCTL_CRSSVE); + else + ret = pcie_capability_set_word(root_port, PCI_EXP_RTCTL, + PCI_EXP_RTCTL_CRSSVE); + + if (ret) { + dev_err(&root_port->dev, "Failed to config CRSSVE bit, err(%d)\n", ret); + return ret; + } + + return 0; +} + +static void mlx5_pci_restore(struct pci_dev *dev) +{ + struct pci_dev *pdev; + + pci_restore_state(dev); + if (!dev->subordinate) + return; + list_for_each_entry(pdev, &dev->subordinate->devices, bus_list) + mlx5_pci_restore(pdev); +} + +static void mlx5_pci_unlock(struct pci_dev *dev) +{ + struct pci_dev *pdev; + + pci_cfg_access_unlock(dev); + if (!dev->subordinate) + return; + list_for_each_entry(pdev, &dev->subordinate->devices, bus_list) + mlx5_pci_unlock(pdev); +} + +static bool mlx5_pci_is_dev_same_id(struct pci_dev *pdev, u16 dev_id) +{ + u16 pdev_id; + + if (pci_read_config_word(pdev, PCI_DEVICE_ID, &pdev_id)) + return false; + if (pdev_id != dev_id) + return false; + return true; +} + +static bool mlx5_pci_are_all_devs_same_id(struct pci_dev *dev, u16 nic_dev_id) +{ + struct pci_dev *pdev; + int type; + + if (!dev->subordinate) { + type = pci_pcie_type(dev); + if (type != PCI_EXP_TYPE_ENDPOINT && + type != PCI_EXP_TYPE_LEG_END) + return true; + return mlx5_pci_is_dev_same_id(dev, nic_dev_id); + } + + list_for_each_entry(pdev, &dev->subordinate->devices, bus_list) + if (!mlx5_pci_are_all_devs_same_id(pdev, nic_dev_id)) + return false; + + return true; +} + +static void mlx5_pci_save_locked(struct pci_dev *dev) +{ + struct pci_dev *pdev; + + pci_cfg_access_lock(dev); + pci_save_state(dev); + if (!dev->subordinate) + return; + list_for_each_entry(pdev, &dev->subordinate->devices, bus_list) + mlx5_pci_save_locked(pdev); +} + +static int mlx5_check_reset_criteria(struct mlx5_core_dev *dev, + struct pci_dev *root_port) +{ + u16 dev_id; + int err; + + err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id); + if (err) + return err; + + if (!mlx5_pci_are_all_devs_same_id(root_port, dev_id)) + return -EPERM; + + return 0; +} + +static int mlx5_reset_pci_topology(struct mlx5_core_dev *dev, + struct pci_dev *root_port) +{ + struct pci_dev *bridge; + unsigned long timeout; + int err; + + bridge = dev->pdev->bus->self; + + /* Disable PCI link */ + err = pcie_capability_set_word(bridge, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_LD); + if (err) { + mlx5_core_err(dev, "Failed to disable pci link, err (%d)\n", err); + return err; + } + msleep(500); + + /* Wait for PCI link comes back with timeout */ + timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE)); + do { + if (pci_device_is_present(root_port)) + break; + msleep(20); + } while (!time_after(jiffies, timeout)); + + if (pci_device_is_present(root_port)) { + mlx5_core_info(dev, "PCIe topology is ready\n"); + } else { + mlx5_core_err(dev, "PCIe topology is not ready after %lu ms\n", + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE))); + return -ETIMEDOUT; + } + return 0; +} + +static int 
mlx5_pci_link_toggle_ecpf(struct mlx5_core_dev *dev) +{ + struct pci_dev *root_port; + bool prev; + int err; + + root_port = pcie_find_root_port(dev->pdev); + if (!root_port) { + mlx5_core_err(dev, "Failed to find root port\n"); + return -ENODEV; + } + + err = mlx5_check_reset_criteria(dev, root_port); + if (err) { + mlx5_core_err(dev, "Device does not match reset criteria, err(%d)\n", + err); + return err; + } + + err = mlx5_pci_config_hw_control(root_port, true, &prev); + if (err) { + mlx5_core_err(dev, "Failed to enable hardware retry\n"); + return -EIO; + } + + mlx5_pci_save_locked(root_port); + + err = mlx5_reset_pci_topology(dev, root_port); + if (err) + goto err_reset; + + mlx5_pci_restore(root_port); +err_reset: + mlx5_pci_unlock(root_port); + mlx5_pci_config_hw_control(root_port, prev, NULL); + return err; +} + +static const struct pci_device_id rshim_device_ids[] = { + { PCI_VDEVICE(MELLANOX, 0xc2d2) }, /* BlueField1 RShim device ID */ + { PCI_VDEVICE(MELLANOX, 0xc2d3) }, /* BlueField2 RShim device ID */ + { PCI_VDEVICE(MELLANOX, 0xc2d6) }, /* BlueField2 RShim device ID2 */ + { PCI_VDEVICE(MELLANOX, 0xc2d4) } /* BlueField3 RShim device ID */ +}; + +static bool mlx5_is_rshim_pci_device(struct mlx5_core_dev *dev, u16 dev_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rshim_device_ids); ++i) { + if (rshim_device_ids[i].device == dev_id) + return true; + } + + return false; +} + +static int mlx5_pci_link_toggle_pf(struct mlx5_core_dev *dev) +{ + struct pci_bus *bridge_bus = dev->pdev->bus; + struct pci_dev *bridge = bridge_bus->self; + u16 reg16, dev_id, sdev_id; + unsigned long timeout; + struct pci_dev *sdev; + int cap, err; + u32 reg32; + + /* Check that all functions under the pci bridge are PFs of + * this device otherwise fail this function. 
+ */ + err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, &dev_id); + if (err) + return err; + list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { + err = pci_read_config_word(sdev, PCI_DEVICE_ID, &sdev_id); + if (err) + return err; + if (sdev_id == dev_id) + continue; + + if (mlx5_is_rshim_pci_device(dev, sdev_id)) + continue; + + /* Non mlx5 device present on bus, abort */ + return -EPERM; + } + if (!bridge) + return -EOPNOTSUPP; + + cap = pci_find_capability(bridge, PCI_CAP_ID_EXP); + if (!cap) + return -EOPNOTSUPP; + + list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { + pci_save_state(sdev); + pci_cfg_access_lock(sdev); + } + /* PCI link toggle */ + err = pci_read_config_word(bridge, cap + PCI_EXP_LNKCTL, ®16); + if (err) + return err; + reg16 |= PCI_EXP_LNKCTL_LD; + err = pci_write_config_word(bridge, cap + PCI_EXP_LNKCTL, reg16); + if (err) + return err; + msleep(500); + reg16 &= ~PCI_EXP_LNKCTL_LD; + err = pci_write_config_word(bridge, cap + PCI_EXP_LNKCTL, reg16); + if (err) + return err; + + /* Check link */ + err = pci_read_config_dword(bridge, cap + PCI_EXP_LNKCAP, ®32); + if (err) + return err; + if (!(reg32 & PCI_EXP_LNKCAP_DLLLARC)) { + mlx5_core_warn(dev, "No PCI link reporting capability (0x%08x)\n", reg32); + msleep(1000); + goto restore; + } + + timeout = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, PCI_TOGGLE)); + do { + err = pci_read_config_word(bridge, cap + PCI_EXP_LNKSTA, ®16); + if (err) + return err; + if (reg16 & PCI_EXP_LNKSTA_DLLLA) + break; + msleep(20); + } while (!time_after(jiffies, timeout)); + + if (reg16 & PCI_EXP_LNKSTA_DLLLA) { + mlx5_core_info(dev, "PCI Link up\n"); + } else { + mlx5_core_err(dev, "PCI link not ready (0x%04x) after %llu ms\n", + reg16, mlx5_tout_ms(dev, PCI_TOGGLE)); + err = -ETIMEDOUT; + } + + do { + err = pci_read_config_word(dev->pdev, PCI_DEVICE_ID, ®16); + if (err) + return err; + if (reg16 == dev_id) + break; + msleep(20); + } while (!time_after(jiffies, timeout)); + + if (reg16 == dev_id) { + mlx5_core_info(dev, "Firmware responds to PCI config cycles again\n"); + } else { + mlx5_core_err(dev, "Firmware is not responsive (0x%04x) after %llu ms\n", + reg16, mlx5_tout_ms(dev, PCI_TOGGLE)); + err = -ETIMEDOUT; + } + +restore: + list_for_each_entry(sdev, &bridge_bus->devices, bus_list) { + pci_cfg_access_unlock(sdev); + pci_restore_state(sdev); + } + + return err; +} + +static int mlx5_pci_link_toggle(struct mlx5_core_dev *dev) +{ + if (mlx5_core_is_ecpf(dev)) + return mlx5_pci_link_toggle_ecpf(dev); + else + return mlx5_pci_link_toggle_pf(dev); +} + +static void mlx5_sync_reset_now_event(struct work_struct *work) +{ + struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, + reset_now_work); + struct mlx5_core_dev *dev = fw_reset->dev; + int err; + + if (mlx5_sync_reset_clear_reset_requested(dev, false)) + return; + + mlx5_core_warn(dev, "Sync Reset now. 
Device is going to reset.\n"); + + err = mlx5_cmd_fast_teardown_hca(dev); + if (err) { + mlx5_core_warn(dev, "Fast teardown failed, no reset done, err %d\n", err); + goto done; + } + + err = mlx5_pci_link_toggle(dev); + if (err) { + mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, no reset done, err %d\n", err); + goto done; + } + + mlx5_enter_error_state(dev, true); + mlx5_unload_one(dev); +done: + fw_reset->ret = err; + mlx5_fw_reset_complete_reload(dev); +} + +static void mlx5_sync_reset_abort_event(struct work_struct *work) +{ + struct mlx5_fw_reset *fw_reset = container_of(work, struct mlx5_fw_reset, + reset_abort_work); + struct mlx5_core_dev *dev = fw_reset->dev; + + if (mlx5_sync_reset_clear_reset_requested(dev, true)) + return; + mlx5_core_warn(dev, "PCI Sync FW Update Reset Aborted.\n"); +} + +static void mlx5_sync_reset_events_handle(struct mlx5_fw_reset *fw_reset, struct mlx5_eqe *eqe) +{ + struct mlx5_eqe_sync_fw_update *sync_fw_update_eqe; + u8 sync_event_rst_type; + + sync_fw_update_eqe = &eqe->data.sync_fw_update; + sync_event_rst_type = sync_fw_update_eqe->sync_rst_state & SYNC_RST_STATE_MASK; + switch (sync_event_rst_type) { + case MLX5_SYNC_RST_STATE_RESET_REQUEST: + queue_work(fw_reset->wq, &fw_reset->reset_request_work); + break; + case MLX5_SYNC_RST_STATE_RESET_NOW: + queue_work(fw_reset->wq, &fw_reset->reset_now_work); + break; + case MLX5_SYNC_RST_STATE_RESET_ABORT: + queue_work(fw_reset->wq, &fw_reset->reset_abort_work); + break; + } +} + +static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long action, void *data) +{ + struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb); + struct mlx5_eqe *eqe = data; + + if (test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags)) + return NOTIFY_DONE; + + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT: + queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work); + break; + case MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT: + mlx5_sync_reset_events_handle(fw_reset, eqe); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev) +{ + unsigned long pci_sync_update_timeout = mlx5_tout_ms(dev, PCI_SYNC_UPDATE); + unsigned long timeout = msecs_to_jiffies(pci_sync_update_timeout); + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + int err; + + if (!wait_for_completion_timeout(&fw_reset->done, timeout)) { + mlx5_core_warn(dev, "FW sync reset timeout after %lu seconds\n", + pci_sync_update_timeout / 1000); + err = -ETIMEDOUT; + goto out; + } + err = fw_reset->ret; +out: + clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); + return err; +} + +void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + MLX5_NB_INIT(&fw_reset->nb, fw_reset_event_notifier, GENERAL_EVENT); + mlx5_eq_notifier_register(dev, &fw_reset->nb); +} + +void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev) +{ + mlx5_eq_notifier_unregister(dev, &dev->priv.fw_reset->nb); +} + +void mlx5_drain_fw_reset(struct mlx5_core_dev *dev) +{ + struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + + set_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags); + cancel_work_sync(&fw_reset->fw_live_patch_work); + cancel_work_sync(&fw_reset->reset_request_work); + cancel_work_sync(&fw_reset->reset_reload_work); + cancel_work_sync(&fw_reset->reset_now_work); + cancel_work_sync(&fw_reset->reset_abort_work); +} + +int 
mlx5_fw_reset_init(struct mlx5_core_dev *dev)
+{
+ struct mlx5_fw_reset *fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL);
+
+ if (!fw_reset)
+ return -ENOMEM;
+ fw_reset->wq = create_singlethread_workqueue("mlx5_fw_reset_events");
+ if (!fw_reset->wq) {
+ kfree(fw_reset);
+ return -ENOMEM;
+ }
+
+ fw_reset->dev = dev;
+ dev->priv.fw_reset = fw_reset;
+
+ INIT_WORK(&fw_reset->fw_live_patch_work, mlx5_fw_live_patch_event);
+ INIT_WORK(&fw_reset->reset_request_work, mlx5_sync_reset_request_event);
+ INIT_WORK(&fw_reset->reset_reload_work, mlx5_sync_reset_reload_work);
+ INIT_WORK(&fw_reset->reset_now_work, mlx5_sync_reset_now_event);
+ INIT_WORK(&fw_reset->reset_abort_work, mlx5_sync_reset_abort_event);
+
+ init_completion(&fw_reset->done);
+ return 0;
+}
+
+void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev)
+{
+ struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
+
+ destroy_workqueue(fw_reset->wq);
+ kfree(dev->priv.fw_reset);
+}
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
new file mode 100644
index 0000000..dc141c7
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */
+
+#ifndef __MLX5_FW_RESET_H
+#define __MLX5_FW_RESET_H
+
+#include "mlx5_core.h"
+
+void mlx5_fw_reset_enable_remote_dev_reset_set(struct mlx5_core_dev *dev, bool enable);
+bool mlx5_fw_reset_enable_remote_dev_reset_get(struct mlx5_core_dev *dev);
+int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type);
+int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel,
+ struct netlink_ext_ack *extack);
+int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
+
+int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
+void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);
+void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev);
+void mlx5_drain_fw_reset(struct mlx5_core_dev *dev);
+int mlx5_fw_reset_init(struct mlx5_core_dev *dev);
+void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev);
+
+#endif
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/health.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/health.c
new file mode 100644
index 0000000..c0c2d30
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -0,0 +1,964 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "lib/eq.h" +#include "lib/mlx5.h" +#include "lib/pci_vsc.h" +#include "lib/tout.h" +#include "diag/fw_tracer.h" + +enum { + MAX_MISSES = 3, +}; + +enum { + MLX5_HEALTH_SYNDR_FW_ERR = 0x1, + MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7, + MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8, + MLX5_HEALTH_SYNDR_CRC_ERR = 0x9, + MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa, + MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb, + MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc, + MLX5_HEALTH_SYNDR_EQ_ERR = 0xd, + MLX5_HEALTH_SYNDR_EQ_INV = 0xe, + MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf, + MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10 +}; + +enum { + MLX5_DROP_NEW_HEALTH_WORK, +}; + +enum { + MLX5_SENSOR_NO_ERR = 0, + MLX5_SENSOR_PCI_COMM_ERR = 1, + MLX5_SENSOR_PCI_ERR = 2, + MLX5_SENSOR_NIC_DISABLED = 3, + MLX5_SENSOR_NIC_SW_RESET = 4, + MLX5_SENSOR_FW_SYND_RFR = 5, +}; + +enum { + MLX5_SEVERITY_MASK = 0x7, + MLX5_SEVERITY_VALID_MASK = 0x8, +}; + +u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) +{ + return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; +} + +void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) +{ + u32 cur_cmdq_addr_l_sz; + + cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz); + iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) | + state << MLX5_NIC_IFC_OFFSET, + &dev->iseg->cmdq_addr_l_sz); +} + +bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + + /* Offline PCI reads return 0xffffffff */ + return (ioread32be(&h->fw_ver) == 0xffffffff); +} + +static int mlx5_health_get_rfr(u8 rfr_severity) +{ + return rfr_severity >> MLX5_RFR_BIT_OFFSET; +} + +static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + u8 synd = ioread8(&h->synd); + u8 rfr; + + rfr = mlx5_health_get_rfr(ioread8(&h->rfr_severity)); + + if (rfr && synd) + mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); + return rfr && synd; +} + +u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev) +{ + if (mlx5_sensor_pci_not_working(dev)) + return MLX5_SENSOR_PCI_COMM_ERR; + if (pci_channel_offline(dev->pdev)) + return MLX5_SENSOR_PCI_ERR; + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) + return MLX5_SENSOR_NIC_DISABLED; + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET) + return MLX5_SENSOR_NIC_SW_RESET; + if (sensor_fw_synd_rfr(dev)) + return MLX5_SENSOR_FW_SYND_RFR; + + return MLX5_SENSOR_NO_ERR; +} + +static int lock_sem_sw_reset(struct mlx5_core_dev *dev, bool lock) +{ + enum mlx5_vsc_state state; + int ret; + + if (!mlx5_core_is_pf(dev)) + return -EBUSY; + + /* Try to lock GW access, this stage doesn't return + * EBUSY because locked GW does not mean that other PF + * already started the reset. 
+ */ + ret = mlx5_vsc_gw_lock(dev); + if (ret == -EBUSY) + return -EINVAL; + if (ret) + return ret; + + state = lock ? MLX5_VSC_LOCK : MLX5_VSC_UNLOCK; + /* At this stage, if the return status == EBUSY, then we know + * for sure that another PF started the reset, so don't allow + * another reset. + */ + ret = mlx5_vsc_sem_set_space(dev, MLX5_SEMAPHORE_SW_RESET, state); + if (ret) + mlx5_core_warn(dev, "Failed to lock SW reset semaphore\n"); + + /* Unlock GW access */ + mlx5_vsc_gw_unlock(dev); + + return ret; +} + +static bool reset_fw_if_needed(struct mlx5_core_dev *dev) +{ + bool supported = (ioread32be(&dev->iseg->initializing) >> + MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; + u32 fatal_error; + + if (!supported) + return false; + + /* The reset only needs to be issued by one PF. The health buffer is + * shared between all functions, and will be cleared during a reset. + * Check again to avoid a redundant 2nd reset. If the fatal errors was + * PCI related a reset won't help. + */ + fatal_error = mlx5_health_check_fatal_sensors(dev); + if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || + fatal_error == MLX5_SENSOR_NIC_DISABLED || + fatal_error == MLX5_SENSOR_NIC_SW_RESET) { + mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help."); + return false; + } + + mlx5_core_warn(dev, "Issuing FW Reset\n"); + /* Write the NIC interface field to initiate the reset, the command + * interface address also resides here, don't overwrite it. + */ + mlx5_set_nic_state(dev, MLX5_NIC_IFC_SW_RESET); + + return true; +} + +static void enter_error_state(struct mlx5_core_dev *dev, bool force) +{ + u32 fatal_error; + + fatal_error = mlx5_health_check_fatal_sensors(dev); + if (fatal_error || force) { /* protected state setting */ + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + mlx5_cmd_flush(dev); + } + + if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) { + if (mlx5_fill_cr_dump(dev)) + mlx5_core_err(dev, "Failed to collect crdump area\n"); + } + + mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1); +} + +void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) +{ + bool err_detected = false; + + /* Mark the device as fatal in order to abort FW commands */ + if ((mlx5_health_check_fatal_sensors(dev) || force) && + dev->state == MLX5_DEVICE_STATE_UP) { + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + err_detected = true; + } + mutex_lock(&dev->intf_state_mutex); + if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto unlock;/* a previous error is still being handled */ + + enter_error_state(dev, force); +unlock: + mutex_unlock(&dev->intf_state_mutex); +} + +void mlx5_error_sw_reset(struct mlx5_core_dev *dev) +{ + unsigned long end, delay_ms = mlx5_tout_ms(dev, PCI_TOGGLE); + int lock = -EBUSY; + + mutex_lock(&dev->intf_state_mutex); + if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto unlock; + + mlx5_core_err(dev, "start\n"); + + if (mlx5_health_check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) { + /* Get cr-dump and reset FW semaphore */ + lock = lock_sem_sw_reset(dev, true); + + if (lock == -EBUSY) { + delay_ms = mlx5_tout_ms(dev, FULL_CRDUMP); + goto recover_from_sw_reset; + } + /* Execute SW reset */ + reset_fw_if_needed(dev); + } + +recover_from_sw_reset: + /* Recover from SW reset */ + end = jiffies + msecs_to_jiffies(delay_ms); + do { + if (mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED) + break; + + msleep(20); + } while (!time_after(jiffies, end)); + + if (mlx5_get_nic_state(dev) != MLX5_NIC_IFC_DISABLED) { + 
dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n", + mlx5_get_nic_state(dev), delay_ms); + } + + /* Release FW semaphore if you are the lock owner */ + if (!lock) + lock_sem_sw_reset(dev, false); + + mlx5_core_err(dev, "end\n"); + +unlock: + mutex_unlock(&dev->intf_state_mutex); +} + +static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) +{ + u8 nic_interface = mlx5_get_nic_state(dev); + + switch (nic_interface) { + case MLX5_NIC_IFC_FULL: + mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n"); + break; + + case MLX5_NIC_IFC_DISABLED: + mlx5_core_warn(dev, "starting teardown\n"); + break; + + case MLX5_NIC_IFC_NO_DRAM_NIC: + mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); + break; + + case MLX5_NIC_IFC_SW_RESET: + /* The IFC mode field is 3 bits, so it will read 0x7 in 2 cases: + * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded + * and this is a VF), this is not recoverable by SW reset. + * Logging of this is handled elsewhere. + * 2. FW reset has been issued by another function, driver can + * be reloaded to recover after the mode switches to + * MLX5_NIC_IFC_DISABLED. + */ + if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) + mlx5_core_warn(dev, "NIC SW reset in progress\n"); + break; + + default: + mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", + nic_interface); + } + + mlx5_disable_device(dev); +} + +int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev) +{ + unsigned long end; + + end = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FW_RESET)); + while (mlx5_sensor_pci_not_working(dev)) { + if (time_after(jiffies, end)) + return -ETIMEDOUT; + if (test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state)) { + mlx5_core_warn(dev, "device is being removed, stop load\n"); + return -ENODEV; + } + msleep(100); + } + return 0; +} + +static int mlx5_health_try_recover(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + mlx5_core_warn(dev, "handling bad device here\n"); + mlx5_handle_bad_state(dev); + if (mlx5_health_wait_pci_up(dev)) { + mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n"); + goto err_eio; + } + mlx5_core_err(dev, "starting health recovery flow\n"); + if (mlx5_recover_device(dev) || mlx5_health_check_fatal_sensors(dev)) { + mlx5_core_err(dev, "health recovery failed\n"); + goto err_eio; + } + + health->failed_in_seq = 0; + mlx5_core_info(dev, "health recovery succeeded\n"); + return 0; + +err_eio: + health->failed_in_seq++; + return -EIO; +} + +static const char *hsynd_str(u8 synd) +{ + switch (synd) { + case MLX5_HEALTH_SYNDR_FW_ERR: + return "firmware internal error"; + case MLX5_HEALTH_SYNDR_IRISC_ERR: + return "irisc not responding"; + case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: + return "unrecoverable hardware error"; + case MLX5_HEALTH_SYNDR_CRC_ERR: + return "firmware CRC error"; + case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: + return "ICM fetch PCI error"; + case MLX5_HEALTH_SYNDR_HW_FTL_ERR: + return "HW fatal error\n"; + case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: + return "async EQ buffer overrun"; + case MLX5_HEALTH_SYNDR_EQ_ERR: + return "EQ error"; + case MLX5_HEALTH_SYNDR_EQ_INV: + return "Invalid EQ referenced"; + case MLX5_HEALTH_SYNDR_FFSER_ERR: + return "FFSER error"; + case MLX5_HEALTH_SYNDR_HIGH_TEMP: + return "High temperature"; + default: + return "unrecognized error"; + } +} + +static const char *mlx5_loglevel_str(int level) +{ + switch (level) { + case 
LOGLEVEL_EMERG: + return "EMERGENCY"; + case LOGLEVEL_ALERT: + return "ALERT"; + case LOGLEVEL_CRIT: + return "CRITICAL"; + case LOGLEVEL_ERR: + return "ERROR"; + case LOGLEVEL_WARNING: + return "WARNING"; + case LOGLEVEL_NOTICE: + return "NOTICE"; + case LOGLEVEL_INFO: + return "INFO"; + case LOGLEVEL_DEBUG: + return "DEBUG"; + } + return "Unknown log level"; +} + +static int mlx5_health_get_severity(u8 rfr_severity) +{ + return rfr_severity & MLX5_SEVERITY_VALID_MASK ? + rfr_severity & MLX5_SEVERITY_MASK : LOGLEVEL_ERR; +} + +static void print_health_info(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + u8 rfr_severity; + int severity; + int i; + + /* If the syndrome is 0, the device is OK and no need to print buffer */ + if (!ioread8(&h->synd)) + return; + + if (ioread32be(&h->fw_ver) == 0xFFFFFFFF) { + mlx5_log(dev, LOGLEVEL_ERR, "PCI slot is unavailable\n"); + return; + } + + rfr_severity = ioread8(&h->rfr_severity); + severity = mlx5_health_get_severity(rfr_severity); + mlx5_log(dev, severity, "Health issue observed, %s, severity(%d) %s:\n", + hsynd_str(ioread8(&h->synd)), severity, mlx5_loglevel_str(severity)); + + for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) + mlx5_log(dev, severity, "assert_var[%d] 0x%08x\n", i, + ioread32be(h->assert_var + i)); + + mlx5_log(dev, severity, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); + mlx5_log(dev, severity, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); + mlx5_log(dev, severity, "fw_ver %d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), + fw_rev_sub(dev)); + mlx5_log(dev, severity, "time %u\n", ioread32be(&h->time)); + mlx5_log(dev, severity, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); + mlx5_log(dev, severity, "rfr %d\n", mlx5_health_get_rfr(rfr_severity)); + mlx5_log(dev, severity, "severity %d (%s)\n", severity, mlx5_loglevel_str(severity)); + mlx5_log(dev, severity, "irisc_index %d\n", ioread8(&h->irisc_index)); + mlx5_log(dev, severity, "synd 0x%x: %s\n", ioread8(&h->synd), + hsynd_str(ioread8(&h->synd))); + mlx5_log(dev, severity, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); + mlx5_log(dev, severity, "raw fw_ver 0x%08x\n", ioread32be(&h->fw_ver)); +} + +static int +mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + u8 synd; + int err; + + synd = ioread8(&h->synd); + err = devlink_fmsg_u8_pair_put(fmsg, "Syndrome", synd); + if (err || !synd) + return err; + return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd)); +} + +struct mlx5_fw_reporter_ctx { + u8 err_synd; + int miss_counter; +}; + +static int +mlx5_fw_reporter_ctx_pairs_put(struct devlink_fmsg *fmsg, + struct mlx5_fw_reporter_ctx *fw_reporter_ctx) +{ + int err; + + err = devlink_fmsg_u8_pair_put(fmsg, "syndrome", + fw_reporter_ctx->err_synd); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "fw_miss_counter", + fw_reporter_ctx->miss_counter); + if (err) + return err; + return 0; +} + +static int +mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, + struct devlink_fmsg *fmsg) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + u8 rfr_severity; + int err; + int i; + + if (!ioread8(&h->synd)) + return 0; + + 
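+ /* Mirror the raw health buffer (assert info, hw_id, severity and
+ * syndromes) into the fmsg as a nested "health buffer" object.
+ */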
err = devlink_fmsg_pair_nest_start(fmsg, "health buffer"); + if (err) + return err; + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + return err; + err = devlink_fmsg_arr_pair_nest_start(fmsg, "assert_var"); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) { + err = devlink_fmsg_u32_put(fmsg, ioread32be(h->assert_var + i)); + if (err) + return err; + } + err = devlink_fmsg_arr_pair_nest_end(fmsg); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "assert_exit_ptr", + ioread32be(&h->assert_exit_ptr)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra", + ioread32be(&h->assert_callra)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "time", ioread32be(&h->time)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id)); + if (err) + return err; + rfr_severity = ioread8(&h->rfr_severity); + err = devlink_fmsg_u8_pair_put(fmsg, "rfr", mlx5_health_get_rfr(rfr_severity)); + if (err) + return err; + err = devlink_fmsg_u8_pair_put(fmsg, "severity", mlx5_health_get_severity(rfr_severity)); + if (err) + return err; + err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index", + ioread8(&h->irisc_index)); + if (err) + return err; + err = devlink_fmsg_u8_pair_put(fmsg, "synd", ioread8(&h->synd)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", + ioread16be(&h->ext_synd)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "raw_fw_ver", + ioread32be(&h->fw_ver)); + if (err) + return err; + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + return err; + return devlink_fmsg_pair_nest_end(fmsg); +} + +static int +mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + int err; + + err = mlx5_fw_tracer_trigger_core_dump_general(dev); + if (err) + return err; + + if (priv_ctx) { + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; + + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); + if (err) + return err; + } + + err = mlx5_fw_reporter_heath_buffer_data_put(dev, fmsg); + if (err) + return err; + return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg); +} + +static void mlx5_fw_reporter_err_work(struct work_struct *work) +{ + struct mlx5_fw_reporter_ctx fw_reporter_ctx; + struct mlx5_core_health *health; + + health = container_of(work, struct mlx5_core_health, report_work); + + if (IS_ERR_OR_NULL(health->fw_reporter)) + return; + + fw_reporter_ctx.err_synd = health->synd; + fw_reporter_ctx.miss_counter = health->miss_counter; + if (fw_reporter_ctx.err_synd) { + devlink_health_report(health->fw_reporter, + "FW syndrom reported", &fw_reporter_ctx); + return; + } + if (fw_reporter_ctx.miss_counter) + devlink_health_report(health->fw_reporter, + "FW miss counter reported", + &fw_reporter_ctx); +} + +static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { + .name = "fw", + .diagnose = mlx5_fw_reporter_diagnose, + .dump = mlx5_fw_reporter_dump, +}; + +static int +mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, + void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + + return mlx5_health_try_recover(dev); +} + +static int +mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct 
netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); + u32 crdump_size = dev->priv.health.crdump_size; + u32 *cr_data; + int err; + + if (!mlx5_core_is_pf(dev)) + return -EPERM; + + if (!crdump_size) + return -EOPNOTSUPP; + + cr_data = kvmalloc(crdump_size, GFP_KERNEL); + if (!cr_data) + return -ENOMEM; + err = mlx5_crdump_collect(dev, cr_data); + if (err) + goto free_data; + + if (priv_ctx) { + struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; + + err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx); + if (err) + goto free_data; + } + + err = devlink_fmsg_binary_pair_put(fmsg, "crdump_data", cr_data, crdump_size); + +free_data: + kvfree(cr_data); + return err; +} + +#define MLX5_MAX_FAILED_RECOVERIES_IN_SEQUENCE 3 +static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) +{ + struct mlx5_fw_reporter_ctx fw_reporter_ctx; + struct mlx5_core_health *health; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + + health = container_of(work, struct mlx5_core_health, fatal_report_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + enter_error_state(dev, false); + if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) { + if (mlx5_health_try_recover(dev)) + mlx5_core_err(dev, "health recovery failed\n"); + return; + } + fw_reporter_ctx.err_synd = health->synd; + fw_reporter_ctx.miss_counter = health->miss_counter; + if (health->failed_in_seq && + health->failed_in_seq < MLX5_MAX_FAILED_RECOVERIES_IN_SEQUENCE) + devlink_health_reporter_state_update(health->fw_fatal_reporter, + DEVLINK_HEALTH_REPORTER_STATE_HEALTHY); + if (devlink_health_report(health->fw_fatal_reporter, + "FW fatal error reported", &fw_reporter_ctx) == -ECANCELED) { + /* If recovery wasn't performed, due to grace period, + * unload the driver. This ensures that the driver + * closes all its resources and it is not subjected to + * requests from the kernel. + */ + mlx5_core_err(dev, "Driver is in error state. 
Unloading\n"); + mlx5_unload_one(dev); + } +} + +static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { + .name = "fw_fatal", + .recover = mlx5_fw_fatal_reporter_recover, + .dump = mlx5_fw_fatal_reporter_dump, +}; + +#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000 +#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000 +#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000 +#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD + +static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct devlink *devlink = priv_to_devlink(dev); + u64 grace_period; + + if (mlx5_core_is_ecpf(dev)) { + grace_period = MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD; + } else if (mlx5_core_is_pf(dev)) { + grace_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD; + } else { + /* VF or SF */ + grace_period = MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD; + } + + health->fw_reporter = + devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops, + 0, dev); + if (IS_ERR(health->fw_reporter)) + mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", + PTR_ERR(health->fw_reporter)); + + health->fw_fatal_reporter = + devlink_health_reporter_create(devlink, + &mlx5_fw_fatal_reporter_ops, + grace_period, dev); + if (IS_ERR(health->fw_fatal_reporter)) + mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", + PTR_ERR(health->fw_fatal_reporter)); +} + +static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + if (!IS_ERR_OR_NULL(health->fw_reporter)) + devlink_health_reporter_destroy(health->fw_reporter); + + if (!IS_ERR_OR_NULL(health->fw_fatal_reporter)) + devlink_health_reporter_destroy(health->fw_fatal_reporter); +} + +static unsigned long get_next_poll_jiffies(struct mlx5_core_dev *dev) +{ + unsigned long next; + + get_random_bytes(&next, sizeof(next)); + next %= HZ; + next += jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL)); + + return next; +} + +void mlx5_trigger_health_work(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + unsigned long flags; + + spin_lock_irqsave(&health->wq_lock, flags); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + queue_work(health->wq, &health->fatal_report_work); + else + mlx5_core_err(dev, "new health works are not permitted at this stage\n"); + spin_unlock_irqrestore(&health->wq_lock, flags); +} + +#define MLX5_MSEC_PER_HOUR (MSEC_PER_SEC * 60 * 60) +static void mlx5_health_log_ts_update(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + u32 out[MLX5_ST_SZ_DW(mrtc_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(mrtc_reg)] = {}; + struct mlx5_core_health *health; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + u64 now_us; + + health = container_of(dwork, struct mlx5_core_health, update_fw_log_ts_work); + priv = container_of(health, struct mlx5_priv, health); + dev = container_of(priv, struct mlx5_core_dev, priv); + + now_us = ktime_to_us(ktime_get_real()); + + MLX5_SET(mrtc_reg, in, time_h, now_us >> 32); + MLX5_SET(mrtc_reg, in, time_l, now_us & 0xFFFFFFFF); + mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MRTC, 0, 1); + + queue_delayed_work(health->wq, &health->update_fw_log_ts_work, + msecs_to_jiffies(MLX5_MSEC_PER_HOUR)); +} + +static void poll_health(struct timer_list *t) +{ + struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer); + struct 
mlx5_core_health *health = &dev->priv.health; + struct health_buffer __iomem *h = health->health; + u32 fatal_error; + u8 prev_synd; + u32 count; + + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto out; + + fatal_error = mlx5_health_check_fatal_sensors(dev); + + if (fatal_error && !health->fatal_error) { + mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); + dev->priv.health.fatal_error = fatal_error; + print_health_info(dev); + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + mlx5_trigger_health_work(dev); + return; + } + + count = ioread32be(health->health_counter); + if (count == health->prev) + ++health->miss_counter; + else + health->miss_counter = 0; + + health->prev = count; + if (health->miss_counter == MAX_MISSES) { + mlx5_core_err(dev, "device's health compromised - reached miss count\n"); + print_health_info(dev); + queue_work(health->wq, &health->report_work); + } + + prev_synd = health->synd; + health->synd = ioread8(&h->synd); + if (health->synd && health->synd != prev_synd) + queue_work(health->wq, &health->report_work); + +out: + mod_timer(&health->timer, get_next_poll_jiffies(dev)); +} + +void mlx5_start_health_poll(struct mlx5_core_dev *dev) +{ + u64 poll_interval_ms = mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL); + struct mlx5_core_health *health = &dev->priv.health; + + timer_setup(&health->timer, poll_health, 0); + health->fatal_error = MLX5_SENSOR_NO_ERR; + clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + health->health = &dev->iseg->health; + health->health_counter = &dev->iseg->health_counter; + + health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms); + add_timer(&health->timer); +} + +void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) +{ + struct mlx5_core_health *health = &dev->priv.health; + unsigned long flags; + + if (disable_health) { + spin_lock_irqsave(&health->wq_lock, flags); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + spin_unlock_irqrestore(&health->wq_lock, flags); + } + + del_timer_sync(&health->timer); +} + +void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc)) + queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0); +} + +void mlx5_drain_health_wq(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + unsigned long flags; + + spin_lock_irqsave(&health->wq_lock, flags); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + spin_unlock_irqrestore(&health->wq_lock, flags); + cancel_delayed_work_sync(&health->update_fw_log_ts_work); + cancel_work_sync(&health->report_work); + cancel_work_sync(&health->fatal_report_work); +} + +void mlx5_health_flush(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + flush_workqueue(health->wq); +} + +void mlx5_health_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + cancel_delayed_work_sync(&health->update_fw_log_ts_work); + destroy_workqueue(health->wq); + mlx5_fw_reporters_destroy(dev); +} + +int mlx5_health_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health; + char *name; + + mlx5_fw_reporters_create(dev); + + health = &dev->priv.health; + name = kmalloc(64, GFP_KERNEL); + if (!name) + goto out_err; + + strcpy(name, "mlx5_health"); + strcat(name, dev_name(dev->device)); + health->wq = create_singlethread_workqueue(name); + kfree(name); + if (!health->wq) + goto out_err; + 
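+ /* fatal_report_work drives recovery through the fw_fatal reporter,
+ * report_work reports non-fatal syndromes and miss counters, and
+ * update_fw_log_ts_work refreshes the FW log timestamp via MRTC.
+ */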
spin_lock_init(&health->wq_lock); + INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work); + INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); + INIT_DELAYED_WORK(&health->update_fw_log_ts_work, mlx5_health_log_ts_update); + + return 0; + +out_err: + mlx5_fw_reporters_destroy(dev); + return -ENOMEM; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c new file mode 100644 index 0000000..f4f7eaf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "en.h" +#include "ipoib.h" + +static void mlx5i_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_drvinfo(priv, drvinfo); + strlcpy(drvinfo->driver, KBUILD_MODNAME "[ib_ipoib]", + sizeof(drvinfo->driver)); +} + +static void mlx5i_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_strings(priv, stringset, data); +} + +static int mlx5i_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return mlx5e_ethtool_get_sset_count(priv, sset); +} + +static void mlx5i_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + u64 *data) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_ethtool_stats(priv, stats, data); +} + +static int mlx5i_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return mlx5e_ethtool_set_ringparam(priv, param); +} + +static void mlx5i_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kernel_param, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_ringparam(priv, param); +} + +static int mlx5i_set_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return mlx5e_ethtool_set_channels(priv, ch); +} + +static void mlx5i_get_channels(struct net_device *dev, + struct ethtool_channels *ch) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + mlx5e_ethtool_get_channels(priv, ch); +} + +static int mlx5i_set_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_set_coalesce(priv, coal, kernel_coal, extack); +} + +static int mlx5i_get_coalesce(struct net_device *netdev, + struct ethtool_coalesce *coal, + struct kernel_ethtool_coalesce *kernel_coal, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_get_coalesce(priv, coal, kernel_coal); +} + +static int mlx5i_get_ts_info(struct net_device *netdev, + struct ethtool_ts_info *info) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_get_ts_info(priv, info); +} + +static int mlx5i_flash_device(struct net_device *netdev, + struct ethtool_flash *flash) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + return mlx5e_ethtool_flash_device(priv, flash); +} + +static inline int mlx5_ptys_width_enum_to_int(enum mlx5_ptys_width width) +{ + switch (width) { + case MLX5_PTYS_WIDTH_1X: return 1; + case MLX5_PTYS_WIDTH_2X: return 2; + case MLX5_PTYS_WIDTH_4X: return 4; + case MLX5_PTYS_WIDTH_8X: return 8; + case MLX5_PTYS_WIDTH_12X: return 12; + default: return -1; + } +} + +enum mlx5_ptys_rate { + MLX5_PTYS_RATE_SDR = 1 << 0, + MLX5_PTYS_RATE_DDR = 1 << 1, + MLX5_PTYS_RATE_QDR = 1 << 2, + MLX5_PTYS_RATE_FDR10 = 1 << 3, + MLX5_PTYS_RATE_FDR = 1 << 4, + MLX5_PTYS_RATE_EDR = 1 << 5, + MLX5_PTYS_RATE_HDR = 1 << 6, + MLX5_PTYS_RATE_NDR = 1 << 7, +}; + +static inline int mlx5_ptys_rate_enum_to_int(enum mlx5_ptys_rate rate) +{ + switch (rate) { + case MLX5_PTYS_RATE_SDR: return 2500; + case 
MLX5_PTYS_RATE_DDR: return 5000; + case MLX5_PTYS_RATE_QDR: + case MLX5_PTYS_RATE_FDR10: return 10000; + case MLX5_PTYS_RATE_FDR: return 14000; + case MLX5_PTYS_RATE_EDR: return 25000; + case MLX5_PTYS_RATE_HDR: return 50000; + case MLX5_PTYS_RATE_NDR: return 100000; + default: return -1; + } +} + +static int mlx5i_get_speed_settings(u16 ib_link_width_oper, u16 ib_proto_oper) +{ + int rate, width; + + rate = mlx5_ptys_rate_enum_to_int(ib_proto_oper); + if (rate < 0) + return -EINVAL; + width = mlx5_ptys_width_enum_to_int(ib_link_width_oper); + if (width < 0) + return -EINVAL; + + return rate * width; +} + +static int mlx5i_get_link_ksettings(struct net_device *netdev, + struct ethtool_link_ksettings *link_ksettings) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + u16 ib_link_width_oper; + u16 ib_proto_oper; + int speed, ret; + + ret = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper, &ib_proto_oper, + 1); + if (ret) + return ret; + + ethtool_link_ksettings_zero_link_mode(link_ksettings, supported); + ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising); + + speed = mlx5i_get_speed_settings(ib_link_width_oper, ib_proto_oper); + if (speed < 0) + return -EINVAL; + + link_ksettings->base.duplex = DUPLEX_FULL; + link_ksettings->base.port = PORT_OTHER; + + link_ksettings->base.autoneg = AUTONEG_DISABLE; + + link_ksettings->base.speed = speed; + + return 0; +} + +#ifdef CONFIG_MLX5_EN_RXNFC +static u32 mlx5i_flow_type_mask(u32 flow_type) +{ + return flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS); +} + +static int mlx5i_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct ethtool_rx_flow_spec *fs = &cmd->fs; + + if (mlx5i_flow_type_mask(fs->flow_type) == ETHER_FLOW) + return -EINVAL; + + return mlx5e_ethtool_set_rxnfc(priv, cmd); +} + +static int mlx5i_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, + u32 *rule_locs) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + return mlx5e_ethtool_get_rxnfc(priv, info, rule_locs); +} +#endif + +const struct ethtool_ops mlx5i_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USE_ADAPTIVE, + .get_drvinfo = mlx5i_get_drvinfo, + .get_strings = mlx5i_get_strings, + .get_sset_count = mlx5i_get_sset_count, + .get_ethtool_stats = mlx5i_get_ethtool_stats, + .get_ringparam = mlx5i_get_ringparam, + .set_ringparam = mlx5i_set_ringparam, + .flash_device = mlx5i_flash_device, + .get_channels = mlx5i_get_channels, + .set_channels = mlx5i_set_channels, + .get_coalesce = mlx5i_get_coalesce, + .set_coalesce = mlx5i_set_coalesce, + .get_ts_info = mlx5i_get_ts_info, +#ifdef CONFIG_MLX5_EN_RXNFC + .get_rxnfc = mlx5i_get_rxnfc, + .set_rxnfc = mlx5i_set_rxnfc, +#endif + .get_link_ksettings = mlx5i_get_link_ksettings, + .get_link = ethtool_op_get_link, +}; + +const struct ethtool_ops mlx5i_pkey_ethtool_ops = { + .get_drvinfo = mlx5i_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = mlx5i_get_ts_info, +}; diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c new file mode 100644 index 0000000..0a99a02 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -0,0 +1,785 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "en.h" +#include "en/params.h" +#include "ipoib.h" + +#define IB_DEFAULT_Q_KEY 0xb1b +#define MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE 9 + +static int mlx5i_open(struct net_device *netdev); +static int mlx5i_close(struct net_device *netdev); +static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu); + +static const struct net_device_ops mlx5i_netdev_ops = { + .ndo_open = mlx5i_open, + .ndo_stop = mlx5i_close, + .ndo_get_stats64 = mlx5i_get_stats, + .ndo_init = mlx5i_dev_init, + .ndo_uninit = mlx5i_dev_cleanup, + .ndo_change_mtu = mlx5i_change_mtu, + .ndo_eth_ioctl = mlx5i_ioctl, +}; + +/* IPoIB mlx5 netdev profile */ +static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev, + struct mlx5e_params *params) +{ + /* Override RQ params as IPoIB supports only LINKED LIST RQ for now */ + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, false); + mlx5e_set_rq_type(mdev, params); + mlx5e_init_rq_type_params(mdev, params); + + /* RQ size in ipoib by default is 512 */ + params->log_rq_mtu_frames = is_kdump_kernel() ? 
+ MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE : + MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE; + + params->packet_merge.type = MLX5E_PACKET_MERGE_NONE; + params->hard_mtu = MLX5_IB_GRH_BYTES + MLX5_IPOIB_HARD_LEN; + params->tunneled_offload_en = false; +} + +/* Called directly after IPoIB netdevice was created to initialize SW structs */ +int mlx5i_init(struct mlx5_core_dev *mdev, struct net_device *netdev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + netif_carrier_off(netdev); + mlx5e_set_netdev_mtu_boundaries(priv); + netdev->mtu = netdev->max_mtu; + + mlx5e_build_nic_params(priv, NULL, netdev->mtu); + mlx5i_build_nic_params(mdev, &priv->channels.params); + + mlx5e_timestamp_init(priv); + + /* netdev init */ + netdev->hw_features |= NETIF_F_SG; + netdev->hw_features |= NETIF_F_IP_CSUM; + netdev->hw_features |= NETIF_F_IPV6_CSUM; + netdev->hw_features |= NETIF_F_GRO; + netdev->hw_features |= NETIF_F_TSO; + netdev->hw_features |= NETIF_F_TSO6; + netdev->hw_features |= NETIF_F_RXCSUM; + netdev->hw_features |= NETIF_F_RXHASH; + + netdev->netdev_ops = &mlx5i_netdev_ops; + netdev->ethtool_ops = &mlx5i_ethtool_ops; + + return 0; +} + +/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */ +void mlx5i_cleanup(struct mlx5e_priv *priv) +{ + mlx5e_priv_cleanup(priv); +} + +static void mlx5i_grp_sw_update_stats(struct mlx5e_priv *priv) +{ + struct rtnl_link_stats64 s = {}; + int i, j; + + for (i = 0; i < priv->stats_nch; i++) { + struct mlx5e_channel_stats *channel_stats; + struct mlx5e_rq_stats *rq_stats; + + channel_stats = priv->channel_stats[i]; + rq_stats = &channel_stats->rq; + + s.rx_packets += rq_stats->packets; + s.rx_bytes += rq_stats->bytes; + + for (j = 0; j < priv->max_opened_tc; j++) { + struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j]; + + s.tx_packets += sq_stats->packets; + s.tx_bytes += sq_stats->bytes; + s.tx_dropped += sq_stats->dropped; + } + } + + memset(&priv->stats.sw, 0, sizeof(s)); + + priv->stats.sw.rx_packets = s.rx_packets; + priv->stats.sw.rx_bytes = s.rx_bytes; + priv->stats.sw.tx_packets = s.tx_packets; + priv->stats.sw.tx_bytes = s.tx_bytes; + priv->stats.sw.tx_queue_dropped = s.tx_dropped; +} + +void mlx5i_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5e_sw_stats *sstats = &priv->stats.sw; + + mlx5i_grp_sw_update_stats(priv); + + stats->rx_packets = sstats->rx_packets; + stats->rx_bytes = sstats->rx_bytes; + stats->tx_packets = sstats->tx_packets; + stats->tx_bytes = sstats->tx_bytes; + stats->tx_dropped = sstats->tx_queue_dropped; +} + +int mlx5i_init_underlay_qp(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5i_priv *ipriv = priv->ppriv; + int ret; + + { + u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; + u32 *qpc; + + qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, primary_address_path.pkey_index, + ipriv->pkey_index); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, q_key, IB_DEFAULT_Q_KEY); + + MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, in, qpn, ipriv->qpn); + ret = mlx5_cmd_exec_in(mdev, rst2init_qp, in); + if (ret) + goto err_qp_modify_to_err; + } + { + u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; + + MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, in, qpn, ipriv->qpn); + ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, in); + if (ret) + goto 
err_qp_modify_to_err; + } + { + u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; + + MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, in, qpn, ipriv->qpn); + ret = mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); + if (ret) + goto err_qp_modify_to_err; + } + return 0; + +err_qp_modify_to_err: + { + u32 in[MLX5_ST_SZ_DW(qp_2err_in)] = {}; + + MLX5_SET(qp_2err_in, in, opcode, MLX5_CMD_OP_2ERR_QP); + MLX5_SET(qp_2err_in, in, qpn, ipriv->qpn); + mlx5_cmd_exec_in(mdev, qp_2err, in); + } + return ret; +} + +void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + struct mlx5_core_dev *mdev = priv->mdev; + u32 in[MLX5_ST_SZ_DW(qp_2rst_in)] = {}; + + MLX5_SET(qp_2rst_in, in, opcode, MLX5_CMD_OP_2RST_QP); + MLX5_SET(qp_2rst_in, in, qpn, ipriv->qpn); + mlx5_cmd_exec_in(mdev, qp_2rst, in); +} + +#define MLX5_QP_ENHANCED_ULP_STATELESS_MODE 2 + +int mlx5i_create_underlay_qp(struct mlx5e_priv *priv) +{ + const unsigned char *dev_addr = priv->netdev->dev_addr; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_qp_in)] = {}; + struct mlx5i_priv *ipriv = priv->ppriv; + void *addr_path; + int qpn = 0; + int ret = 0; + void *qpc; + + if (MLX5_CAP_GEN(priv->mdev, mkey_by_name)) { + qpn = (dev_addr[1] << 16) + (dev_addr[2] << 8) + dev_addr[3]; + MLX5_SET(create_qp_in, in, input_qpn, qpn); + } + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(priv->mdev)); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, + MLX5_QP_ENHANCED_ULP_STATELESS_MODE); + + addr_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + MLX5_SET(ads, addr_path, vhca_port_num, 1); + MLX5_SET(ads, addr_path, grh, 1); + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + ret = mlx5_cmd_exec_inout(priv->mdev, create_qp, in, out); + if (ret) + return ret; + + ipriv->qpn = MLX5_GET(create_qp_out, out, qpn); + + return 0; +} + +void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, u32 qpn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qpn); + mlx5_cmd_exec_in(mdev, destroy_qp, in); +} + +int mlx5i_update_nic_rx(struct mlx5e_priv *priv) +{ + return mlx5e_refresh_tirs(priv, true, true); +} + +int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {}; + void *tisc; + + tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + MLX5_SET(tisc, tisc, underlay_qpn, underlay_qpn); + + return mlx5e_create_tis(mdev, in, tisn); +} + +static int mlx5i_init_tx(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + int err; + + err = mlx5i_create_underlay_qp(priv); + if (err) { + mlx5_core_warn(priv->mdev, "create underlay QP failed, %d\n", err); + return err; + } + + err = mlx5i_create_tis(priv->mdev, ipriv->qpn, &priv->tisn[0][0]); + if (err) { + mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err); + goto err_destroy_underlay_qp; + } + + return 0; + +err_destroy_underlay_qp: + mlx5i_destroy_underlay_qp(priv->mdev, ipriv->qpn); + return err; +} + +static void mlx5i_cleanup_tx(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + + mlx5e_destroy_tis(priv->mdev, priv->tisn[0][0]); + mlx5i_destroy_underlay_qp(priv->mdev, ipriv->qpn); +} + +static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) +{ + int err; + 
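+ /* The aRFS and TTC steering tables created below live in the kernel RX
+  * flow namespace looked up first; without that namespace there is
+  * nothing to attach them to, so fail early. A failure to create the
+  * aRFS tables is not fatal and only clears NETIF_F_NTUPLE.
+  */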
+ priv->fs.ns = mlx5_get_flow_namespace(priv->mdev, + MLX5_FLOW_NAMESPACE_KERNEL); + + if (!priv->fs.ns) + return -EINVAL; + + err = mlx5e_arfs_create_tables(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n", + err); + priv->netdev->hw_features &= ~NETIF_F_NTUPLE; + } + + err = mlx5e_create_ttc_table(priv); + if (err) { + netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n", + err); + goto err_destroy_arfs_tables; + } + + mlx5e_ethtool_init_steering(priv); + + return 0; + +err_destroy_arfs_tables: + mlx5e_arfs_destroy_tables(priv); + + return err; +} + +static void mlx5i_destroy_flow_steering(struct mlx5e_priv *priv) +{ + mlx5e_destroy_ttc_table(priv); + mlx5e_arfs_destroy_tables(priv); + mlx5e_ethtool_cleanup_steering(priv); +} + +static int mlx5i_init_rx(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + priv->rx_res = mlx5e_rx_res_alloc(); + if (!priv->rx_res) + return -ENOMEM; + + mlx5e_create_q_counters(priv); + + err = mlx5e_open_drop_rq(priv, &priv->drop_rq); + if (err) { + mlx5_core_err(mdev, "open drop rq failed, %d\n", err); + goto err_destroy_q_counters; + } + + err = mlx5e_rx_res_init(priv->rx_res, priv->mdev, 0, + priv->max_nch, priv->drop_rq.rqn, + &priv->channels.params.packet_merge, + priv->channels.params.num_channels); + if (err) + goto err_close_drop_rq; + + err = mlx5i_create_flow_steering(priv); + if (err) + goto err_destroy_rx_res; + + return 0; + +err_destroy_rx_res: + mlx5e_rx_res_destroy(priv->rx_res); +err_close_drop_rq: + mlx5e_close_drop_rq(&priv->drop_rq); +err_destroy_q_counters: + mlx5e_destroy_q_counters(priv); + mlx5e_rx_res_free(priv->rx_res); + priv->rx_res = NULL; + return err; +} + +static void mlx5i_cleanup_rx(struct mlx5e_priv *priv) +{ + mlx5i_destroy_flow_steering(priv); + mlx5e_rx_res_destroy(priv->rx_res); + mlx5e_close_drop_rq(&priv->drop_rq); + mlx5e_destroy_q_counters(priv); + mlx5e_rx_res_free(priv->rx_res); + priv->rx_res = NULL; +} + +/* The stats groups order is opposite to the update_stats() order calls */ +static mlx5e_stats_grp_t mlx5i_stats_grps[] = { + &MLX5E_STATS_GRP(sw), + &MLX5E_STATS_GRP(qcnt), + &MLX5E_STATS_GRP(vnic_env), + &MLX5E_STATS_GRP(vport), + &MLX5E_STATS_GRP(802_3), + &MLX5E_STATS_GRP(2863), + &MLX5E_STATS_GRP(2819), + &MLX5E_STATS_GRP(phy), + &MLX5E_STATS_GRP(pcie), + &MLX5E_STATS_GRP(per_prio), + &MLX5E_STATS_GRP(pme), + &MLX5E_STATS_GRP(channels), + &MLX5E_STATS_GRP(per_port_buff_congest), +}; + +static unsigned int mlx5i_stats_grps_num(struct mlx5e_priv *priv) +{ + return ARRAY_SIZE(mlx5i_stats_grps); +} + +static const struct mlx5e_profile mlx5i_nic_profile = { + .init = mlx5i_init, + .cleanup = mlx5i_cleanup, + .init_tx = mlx5i_init_tx, + .cleanup_tx = mlx5i_cleanup_tx, + .init_rx = mlx5i_init_rx, + .cleanup_rx = mlx5i_cleanup_rx, + .enable = NULL, /* mlx5i_enable */ + .disable = NULL, /* mlx5i_disable */ + .update_rx = mlx5i_update_nic_rx, + .update_stats = NULL, /* mlx5i_update_stats */ + .update_carrier = NULL, /* no HW update in IB link */ + .rx_handlers = &mlx5i_rx_handlers, + .max_tc = MLX5I_MAX_NUM_TC, + .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), + .stats_grps = mlx5i_stats_grps, + .stats_grps_num = mlx5i_stats_grps_num, +}; + +/* mlx5i netdev NDos */ + +static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5e_params new_params; + int err = 0; + + mutex_lock(&priv->state_lock); + + new_params = priv->channels.params; + new_params.sw_mtu = new_mtu; 
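+ /* With the last argument (reset) set to true, mlx5e_safe_switch_params()
+  * rebuilds the channels so the new MTU takes effect immediately while
+  * the interface is up; when it is down only the stored parameters are
+  * updated and the MTU is applied on the next open.
+  */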
+ + err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true); + if (err) + goto out; + + netdev->mtu = new_params.sw_mtu; + +out: + mutex_unlock(&priv->state_lock); + return err; +} + +int mlx5i_dev_init(struct net_device *dev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5i_priv *ipriv = priv->ppriv; + u8 addr_mod[3]; + + /* Set dev address using underlay QP */ + addr_mod[0] = (ipriv->qpn >> 16) & 0xff; + addr_mod[1] = (ipriv->qpn >> 8) & 0xff; + addr_mod[2] = (ipriv->qpn) & 0xff; + dev_addr_mod(dev, 1, addr_mod, sizeof(addr_mod)); + + /* Add QPN to net-device mapping to HT */ + mlx5i_pkey_add_qpn(dev, ipriv->qpn); + + return 0; +} + +int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + + switch (cmd) { + case SIOCSHWTSTAMP: + return mlx5e_hwstamp_set(priv, ifr); + case SIOCGHWTSTAMP: + return mlx5e_hwstamp_get(priv, ifr); + default: + return -EOPNOTSUPP; + } +} + +void mlx5i_dev_cleanup(struct net_device *dev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5i_priv *ipriv = priv->ppriv; + + mlx5i_uninit_underlay_qp(priv); + + /* Delete QPN to net-device mapping from HT */ + mlx5i_pkey_del_qpn(dev, ipriv->qpn); +} + +static int mlx5i_open(struct net_device *netdev) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5_core_dev *mdev = epriv->mdev; + int err; + + mutex_lock(&epriv->state_lock); + + set_bit(MLX5E_STATE_OPENED, &epriv->state); + + err = mlx5i_init_underlay_qp(epriv); + if (err) { + mlx5_core_warn(mdev, "prepare underlay qp state failed, %d\n", err); + goto err_clear_state_opened_flag; + } + + err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qpn); + if (err) { + mlx5_core_warn(mdev, "attach underlay qp to ft failed, %d\n", err); + goto err_reset_qp; + } + + err = mlx5e_open_channels(epriv, &epriv->channels); + if (err) + goto err_remove_fs_underlay_qp; + + epriv->profile->update_rx(epriv); + mlx5e_activate_priv_channels(epriv); + + mutex_unlock(&epriv->state_lock); + return 0; + +err_remove_fs_underlay_qp: + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); +err_reset_qp: + mlx5i_uninit_underlay_qp(epriv); +err_clear_state_opened_flag: + clear_bit(MLX5E_STATE_OPENED, &epriv->state); + mutex_unlock(&epriv->state_lock); + return err; +} + +static int mlx5i_close(struct net_device *netdev) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5_core_dev *mdev = epriv->mdev; + + /* May already be CLOSED in case a previous configuration operation + * (e.g RX/TX queue size change) that involves close&open failed. 
+ */ + mutex_lock(&epriv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &epriv->state)) + goto unlock; + + clear_bit(MLX5E_STATE_OPENED, &epriv->state); + + netif_carrier_off(epriv->netdev); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); + mlx5e_deactivate_priv_channels(epriv); + mlx5e_close_channels(&epriv->channels); + mlx5i_uninit_underlay_qp(epriv); +unlock: + mutex_unlock(&epriv->state_lock); + return 0; +} + +/* IPoIB RDMA netdev callbacks */ +static int mlx5i_attach_mcast(struct net_device *netdev, struct ib_device *hca, + union ib_gid *gid, u16 lid, int set_qkey, + u32 qkey) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = epriv->mdev; + struct mlx5i_priv *ipriv = epriv->ppriv; + int err; + + mlx5_core_dbg(mdev, "attaching QPN 0x%x, MGID %pI6\n", ipriv->qpn, + gid->raw); + err = mlx5_core_attach_mcg(mdev, gid, ipriv->qpn); + if (err) + mlx5_core_warn(mdev, "failed attaching QPN 0x%x, MGID %pI6\n", + ipriv->qpn, gid->raw); + + if (set_qkey) { + mlx5_core_dbg(mdev, "%s setting qkey 0x%x\n", + netdev->name, qkey); + ipriv->qkey = qkey; + } + + return err; +} + +static int mlx5i_detach_mcast(struct net_device *netdev, struct ib_device *hca, + union ib_gid *gid, u16 lid) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = epriv->mdev; + struct mlx5i_priv *ipriv = epriv->ppriv; + int err; + + mlx5_core_dbg(mdev, "detaching QPN 0x%x, MGID %pI6\n", ipriv->qpn, + gid->raw); + + err = mlx5_core_detach_mcg(mdev, gid, ipriv->qpn); + if (err) + mlx5_core_dbg(mdev, "failed detaching QPN 0x%x, MGID %pI6\n", + ipriv->qpn, gid->raw); + + return err; +} + +static int mlx5i_xmit(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(dev); + struct mlx5e_txqsq *sq = epriv->txq2sq[skb_get_queue_mapping(skb)]; + struct mlx5_ib_ah *mah = to_mah(address); + struct mlx5i_priv *ipriv = epriv->ppriv; + + mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey, netdev_xmit_more()); + + return NETDEV_TX_OK; +} + +static void mlx5i_set_pkey_index(struct net_device *netdev, int id) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + + ipriv->pkey_index = (u16)id; +} + +static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev) +{ + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + return -EOPNOTSUPP; + + if (!MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) { + mlx5_core_warn(mdev, "IPoIB enhanced offloads are not supported\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static void mlx5_rdma_netdev_free(struct net_device *netdev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5i_priv *ipriv = priv->ppriv; + const struct mlx5e_profile *profile = priv->profile; + + mlx5e_detach_netdev(priv); + profile->cleanup(priv); + + if (!ipriv->sub_interface) { + mlx5i_pkey_qpn_ht_cleanup(netdev); + mlx5e_destroy_mdev_resources(mdev); + } +} + +static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev) +{ + return mdev->mlx5e_res.hw_objs.pdn != 0; +} + +static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev) +{ + if (mlx5_is_sub_interface(mdev)) + return mlx5i_pkey_get_profile(); + return &mlx5i_nic_profile; +} + +static int mlx5_rdma_setup_rn(struct ib_device *ibdev, u32 port_num, + struct net_device *netdev, void *param) +{ + struct mlx5_core_dev *mdev = (struct mlx5_core_dev *)param; + const struct mlx5e_profile *prof = mlx5_get_profile(mdev); + struct mlx5i_priv *ipriv; + 
struct mlx5e_priv *epriv; + struct rdma_netdev *rn; + int err; + + ipriv = netdev_priv(netdev); + epriv = mlx5i_epriv(netdev); + + ipriv->sub_interface = mlx5_is_sub_interface(mdev); + if (!ipriv->sub_interface) { + err = mlx5i_pkey_qpn_ht_init(netdev); + if (err) { + mlx5_core_warn(mdev, "allocate qpn_to_netdev ht failed\n"); + return err; + } + + /* This should only be called once per mdev */ + err = mlx5e_create_mdev_resources(mdev); + if (err) + goto destroy_ht; + } + + err = mlx5e_priv_init(epriv, prof, netdev, mdev); + if (err) + goto destroy_mdev_resources; + + epriv->profile = prof; + epriv->ppriv = ipriv; + + prof->init(mdev, netdev); + + err = mlx5e_attach_netdev(epriv); + if (err) + goto detach; + netif_carrier_off(netdev); + + /* set rdma_netdev func pointers */ + rn = &ipriv->rn; + rn->hca = ibdev; + rn->send = mlx5i_xmit; + rn->attach_mcast = mlx5i_attach_mcast; + rn->detach_mcast = mlx5i_detach_mcast; + rn->set_id = mlx5i_set_pkey_index; + + netdev->priv_destructor = mlx5_rdma_netdev_free; + netdev->needs_free_netdev = 1; + + return 0; + +detach: + prof->cleanup(epriv); + if (ipriv->sub_interface) + return err; +destroy_mdev_resources: + mlx5e_destroy_mdev_resources(mdev); +destroy_ht: + mlx5i_pkey_qpn_ht_cleanup(netdev); + return err; +} + +int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev, + struct ib_device *device, + struct rdma_netdev_alloc_params *params) +{ + int nch; + int rc; + + rc = mlx5i_check_required_hca_cap(mdev); + if (rc) + return rc; + + nch = mlx5e_get_max_num_channels(mdev); + + *params = (struct rdma_netdev_alloc_params){ + .sizeof_priv = sizeof(struct mlx5i_priv) + + sizeof(struct mlx5e_priv), + .txqs = nch * MLX5E_MAX_NUM_TC, + .rxqs = nch, + .param = mdev, + .initialize_rdma_netdev = mlx5_rdma_setup_rn, + }; + + return 0; +} +EXPORT_SYMBOL(mlx5_rdma_rn_get_params); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h new file mode 100644 index 0000000..99d46fd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5E_IPOB_H__ +#define __MLX5E_IPOB_H__ + +#ifdef CONFIG_MLX5_CORE_IPOIB + +#include +#include "en.h" + +#define MLX5I_MAX_NUM_TC 1 + +extern const struct ethtool_ops mlx5i_ethtool_ops; +extern const struct ethtool_ops mlx5i_pkey_ethtool_ops; +extern const struct mlx5e_rx_handlers mlx5i_rx_handlers; + +#define MLX5_IB_GRH_BYTES 40 +#define MLX5_IPOIB_ENCAP_LEN 4 +#define MLX5_IPOIB_PSEUDO_LEN 20 +#define MLX5_IPOIB_HARD_LEN (MLX5_IPOIB_PSEUDO_LEN + MLX5_IPOIB_ENCAP_LEN) + +/* ipoib rdma netdev's private data structure */ +struct mlx5i_priv { + struct rdma_netdev rn; /* keep this first */ + u32 qpn; + bool sub_interface; + u32 qkey; + u16 pkey_index; + struct mlx5i_pkey_qpn_ht *qpn_htbl; + char *mlx5e_priv[]; +}; + +int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn); + +/* Underlay QP create/destroy functions */ +int mlx5i_create_underlay_qp(struct mlx5e_priv *priv); +void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, u32 qpn); + +/* Underlay QP state modification init/uninit functions */ +int mlx5i_init_underlay_qp(struct mlx5e_priv *priv); +void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv); + +/* Allocate/Free underlay QPN to net-device hash table */ +int mlx5i_pkey_qpn_ht_init(struct net_device *netdev); +void mlx5i_pkey_qpn_ht_cleanup(struct net_device *netdev); + +/* Add/Remove an underlay QPN to net-device mapping to/from the hash table */ +int mlx5i_pkey_add_qpn(struct net_device *netdev, u32 qpn); +int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn); + +/* Get the net-device corresponding to the given underlay QPN */ +struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn); + +/* Shared ndo functions */ +int mlx5i_dev_init(struct net_device *dev); +void mlx5i_dev_cleanup(struct net_device *dev); +int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); + +/* Parent profile functions */ +int mlx5i_init(struct mlx5_core_dev *mdev, struct net_device *netdev); +void mlx5i_cleanup(struct mlx5e_priv *priv); + +int mlx5i_update_nic_rx(struct mlx5e_priv *priv); + +/* Get child interface nic profile */ +const struct mlx5e_profile *mlx5i_pkey_get_profile(void); + +/* Extract mlx5e_priv from IPoIB netdev */ +#define mlx5i_epriv(netdev) ((void *)(((struct mlx5i_priv *)netdev_priv(netdev))->mlx5e_priv)) + +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; + +struct mlx5i_tx_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_datagram_seg datagram; + struct mlx5_wqe_eth_pad pad; + struct mlx5_wqe_eth_seg eth; + struct mlx5_wqe_data_seg data[]; +}; + +#define MLX5I_SQ_FETCH_WQE(sq, pi) \ + ((struct mlx5i_tx_wqe *)mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5i_tx_wqe))) + +void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_av *av, u32 dqpn, u32 dqkey, bool xmit_more); +void mlx5i_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); + +#endif /* CONFIG_MLX5_CORE_IPOIB */ +#endif /* __MLX5E_IPOB_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c new file mode 100644 index 0000000..0c9c013 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "ipoib.h" + +#define MLX5I_MAX_LOG_PKEY_SUP 7 + +struct qpn_to_netdev { + struct net_device *netdev; + struct hlist_node hlist; + u32 underlay_qpn; +}; + +struct mlx5i_pkey_qpn_ht { + struct hlist_head buckets[1 << MLX5I_MAX_LOG_PKEY_SUP]; + spinlock_t ht_lock; /* Synchronise with NAPI */ +}; + +int mlx5i_pkey_qpn_ht_init(struct net_device *netdev) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + struct mlx5i_pkey_qpn_ht *qpn_htbl; + + qpn_htbl = kzalloc(sizeof(*qpn_htbl), GFP_KERNEL); + if (!qpn_htbl) + return -ENOMEM; + + ipriv->qpn_htbl = qpn_htbl; + spin_lock_init(&qpn_htbl->ht_lock); + + return 0; +} + +void mlx5i_pkey_qpn_ht_cleanup(struct net_device *netdev) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + + kfree(ipriv->qpn_htbl); +} + +static struct qpn_to_netdev *mlx5i_find_qpn_to_netdev_node(struct hlist_head *buckets, + u32 qpn) +{ + struct hlist_head *h = &buckets[hash_32(qpn, MLX5I_MAX_LOG_PKEY_SUP)]; + struct qpn_to_netdev *node; + + hlist_for_each_entry(node, h, hlist) { + if (node->underlay_qpn == qpn) + return node; + } + + return NULL; +} + +int mlx5i_pkey_add_qpn(struct net_device *netdev, u32 qpn) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + struct mlx5i_pkey_qpn_ht *ht = ipriv->qpn_htbl; + u8 key = hash_32(qpn, MLX5I_MAX_LOG_PKEY_SUP); + struct qpn_to_netdev *new_node; + + new_node = kzalloc(sizeof(*new_node), GFP_KERNEL); + if (!new_node) + return -ENOMEM; + + new_node->netdev = netdev; + new_node->underlay_qpn = qpn; + spin_lock_bh(&ht->ht_lock); + hlist_add_head(&new_node->hlist, &ht->buckets[key]); + spin_unlock_bh(&ht->ht_lock); + + return 0; +} + +int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5i_pkey_qpn_ht *ht = ipriv->qpn_htbl; + struct qpn_to_netdev *node; + + node = mlx5i_find_qpn_to_netdev_node(ht->buckets, qpn); + if (!node) { + mlx5_core_warn(epriv->mdev, "QPN to netdev delete from HT failed\n"); + return 
-EINVAL; + } + + spin_lock_bh(&ht->ht_lock); + hlist_del_init(&node->hlist); + spin_unlock_bh(&ht->ht_lock); + kfree(node); + + return 0; +} + +struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn) +{ + struct mlx5i_priv *ipriv = netdev_priv(netdev); + struct qpn_to_netdev *node; + + node = mlx5i_find_qpn_to_netdev_node(ipriv->qpn_htbl->buckets, qpn); + if (!node) + return NULL; + + return node->netdev; +} + +static int mlx5i_pkey_open(struct net_device *netdev); +static int mlx5i_pkey_close(struct net_device *netdev); +static int mlx5i_pkey_dev_init(struct net_device *dev); +static void mlx5i_pkey_dev_cleanup(struct net_device *netdev); +static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu); +static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); + +static const struct net_device_ops mlx5i_pkey_netdev_ops = { + .ndo_open = mlx5i_pkey_open, + .ndo_stop = mlx5i_pkey_close, + .ndo_init = mlx5i_pkey_dev_init, + .ndo_get_stats64 = mlx5i_get_stats, + .ndo_uninit = mlx5i_pkey_dev_cleanup, + .ndo_change_mtu = mlx5i_pkey_change_mtu, + .ndo_eth_ioctl = mlx5i_pkey_ioctl, +}; + +/* Child NDOs */ +static int mlx5i_pkey_dev_init(struct net_device *dev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(dev); + struct mlx5i_priv *ipriv, *parent_ipriv; + struct net_device *parent_dev; + int parent_ifindex; + + ipriv = priv->ppriv; + + /* Get QPN to netdevice hash table from parent */ + parent_ifindex = dev->netdev_ops->ndo_get_iflink(dev); + parent_dev = dev_get_by_index(dev_net(dev), parent_ifindex); + if (!parent_dev) { + mlx5_core_warn(priv->mdev, "failed to get parent device\n"); + return -EINVAL; + } + + parent_ipriv = netdev_priv(parent_dev); + ipriv->qpn_htbl = parent_ipriv->qpn_htbl; + dev_put(parent_dev); + + return mlx5i_dev_init(dev); +} + +static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + return mlx5i_ioctl(dev, ifr, cmd); +} + +static void mlx5i_pkey_dev_cleanup(struct net_device *netdev) +{ + return mlx5i_dev_cleanup(netdev); +} + +static int mlx5i_pkey_open(struct net_device *netdev) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = epriv->ppriv; + struct mlx5_core_dev *mdev = epriv->mdev; + int err; + + mutex_lock(&epriv->state_lock); + + set_bit(MLX5E_STATE_OPENED, &epriv->state); + + err = mlx5i_init_underlay_qp(epriv); + if (err) { + mlx5_core_warn(mdev, "prepare child underlay qp state failed, %d\n", err); + goto err_release_lock; + } + + err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qpn); + if (err) { + mlx5_core_warn(mdev, "attach child underlay qp to ft failed, %d\n", err); + goto err_unint_underlay_qp; + } + + err = mlx5i_create_tis(mdev, ipriv->qpn, &epriv->tisn[0][0]); + if (err) { + mlx5_core_warn(mdev, "create child tis failed, %d\n", err); + goto err_remove_rx_uderlay_qp; + } + + err = mlx5e_open_channels(epriv, &epriv->channels); + if (err) { + mlx5_core_warn(mdev, "opening child channels failed, %d\n", err); + goto err_clear_state_opened_flag; + } + epriv->profile->update_rx(epriv); + mlx5e_activate_priv_channels(epriv); + mutex_unlock(&epriv->state_lock); + + return 0; + +err_clear_state_opened_flag: + mlx5e_destroy_tis(mdev, epriv->tisn[0][0]); +err_remove_rx_uderlay_qp: + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); +err_unint_underlay_qp: + mlx5i_uninit_underlay_qp(epriv); +err_release_lock: + clear_bit(MLX5E_STATE_OPENED, &epriv->state); + mutex_unlock(&epriv->state_lock); + return err; +} + +static int mlx5i_pkey_close(struct net_device 
*netdev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + struct mlx5i_priv *ipriv = priv->ppriv; + struct mlx5_core_dev *mdev = priv->mdev; + + mutex_lock(&priv->state_lock); + + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) + goto unlock; + + clear_bit(MLX5E_STATE_OPENED, &priv->state); + + netif_carrier_off(priv->netdev); + mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qpn); + mlx5i_uninit_underlay_qp(priv); + mlx5e_deactivate_priv_channels(priv); + mlx5e_close_channels(&priv->channels); + mlx5e_destroy_tis(mdev, priv->tisn[0][0]); +unlock: + mutex_unlock(&priv->state_lock); + return 0; +} + +static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + + mutex_lock(&priv->state_lock); + netdev->mtu = new_mtu; + mutex_unlock(&priv->state_lock); + + return 0; +} + +/* Called directly after IPoIB netdevice was created to initialize SW structs */ +static int mlx5i_pkey_init(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + struct mlx5e_priv *priv = mlx5i_epriv(netdev); + int err; + + priv->profile = mlx5i_pkey_get_profile(); + err = mlx5i_init(mdev, netdev); + if (err) + return err; + + /* Override parent ndo */ + netdev->netdev_ops = &mlx5i_pkey_netdev_ops; + + /* Set child limited ethtool support */ + netdev->ethtool_ops = &mlx5i_pkey_ethtool_ops; + + /* Use dummy rqs */ + priv->channels.params.log_rq_mtu_frames = MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE; + + return 0; +} + +/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */ +static void mlx5i_pkey_cleanup(struct mlx5e_priv *priv) +{ + mlx5i_cleanup(priv); +} + +static int mlx5i_pkey_init_tx(struct mlx5e_priv *priv) +{ + int err; + + err = mlx5i_create_underlay_qp(priv); + if (err) + mlx5_core_warn(priv->mdev, "create child underlay QP failed, %d\n", err); + + return err; +} + +static void mlx5i_pkey_cleanup_tx(struct mlx5e_priv *priv) +{ + struct mlx5i_priv *ipriv = priv->ppriv; + + mlx5i_destroy_underlay_qp(priv->mdev, ipriv->qpn); +} + +static int mlx5i_pkey_init_rx(struct mlx5e_priv *priv) +{ + /* Since the rx resources are shared between child and parent, the + * parent interface is taking care of rx resource allocation and init + */ + return 0; +} + +static void mlx5i_pkey_cleanup_rx(struct mlx5e_priv *priv) +{ + /* Since the rx resources are shared between child and parent, the + * parent interface is taking care of rx resource free and de-init + */ +} + +static const struct mlx5e_profile mlx5i_pkey_nic_profile = { + .init = mlx5i_pkey_init, + .cleanup = mlx5i_pkey_cleanup, + .init_tx = mlx5i_pkey_init_tx, + .cleanup_tx = mlx5i_pkey_cleanup_tx, + .init_rx = mlx5i_pkey_init_rx, + .cleanup_rx = mlx5i_pkey_cleanup_rx, + .enable = NULL, + .disable = NULL, + .update_rx = mlx5i_update_nic_rx, + .update_stats = NULL, + .rx_handlers = &mlx5i_rx_handlers, + .max_tc = MLX5I_MAX_NUM_TC, + .rq_groups = MLX5E_NUM_RQ_GROUPS(REGULAR), +}; + +const struct mlx5e_profile *mlx5i_pkey_get_profile(void) +{ + return &mlx5i_pkey_nic_profile; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c new file mode 100644 index 0000000..d1491de --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include "mlx5_core.h" +#include "mlx5_irq.h" +#include "pci_irq.h" + +static void cpu_put(struct mlx5_irq_pool *pool, int cpu) +{ + pool->irqs_per_cpu[cpu]--; +} + +static void cpu_get(struct mlx5_irq_pool *pool, int cpu) +{ + pool->irqs_per_cpu[cpu]++; +} + +/* Gets the least loaded CPU, i.e. the CPU with the fewest IRQs bound to it */ +static int cpu_get_least_loaded(struct mlx5_irq_pool *pool, + const struct cpumask *req_mask) +{ + int best_cpu = -1; + int cpu; + + for_each_cpu_and(cpu, req_mask, cpu_online_mask) { + /* CPU has zero IRQs on it. No need to search any more CPUs. */ + if (!pool->irqs_per_cpu[cpu]) { + best_cpu = cpu; + break; + } + if (best_cpu < 0) + best_cpu = cpu; + if (pool->irqs_per_cpu[cpu] < pool->irqs_per_cpu[best_cpu]) + best_cpu = cpu; + } + if (best_cpu == -1) { + /* There are no online CPUs in req_mask */ + mlx5_core_err(pool->dev, "NO online CPUs in req_mask (%*pbl)\n", + cpumask_pr_args(req_mask)); + best_cpu = cpumask_first(cpu_online_mask); + } + pool->irqs_per_cpu[best_cpu]++; + return best_cpu; +} + +/* Creating an IRQ from irq_pool */ +static struct mlx5_irq * +irq_pool_request_irq(struct mlx5_irq_pool *pool, const struct cpumask *req_mask) +{ + cpumask_var_t auto_mask; + struct mlx5_irq *irq; + u32 irq_index; + int err; + + if (!zalloc_cpumask_var(&auto_mask, GFP_KERNEL)) + return ERR_PTR(-ENOMEM); + err = xa_alloc(&pool->irqs, &irq_index, NULL, pool->xa_num_irqs, GFP_KERNEL); + if (err) { + if (err == -EBUSY) + err = -EUSERS; + return ERR_PTR(err); + } + if (pool->irqs_per_cpu) { + if (cpumask_weight(req_mask) > 1) + /* if req_mask contains more than one CPU, set the least loaded CPU + * of req_mask + */ + cpumask_set_cpu(cpu_get_least_loaded(pool, req_mask), auto_mask); + else + cpu_get(pool, cpumask_first(req_mask)); + } + irq = mlx5_irq_alloc(pool, irq_index, cpumask_empty(auto_mask) ? req_mask : auto_mask); + free_cpumask_var(auto_mask); + return irq; +} + +/* Looking for the IRQ with the smallest refcount that fits req_mask. + * If pool is sf_comp_pool, then we are looking for an IRQ with any of the + * requested CPUs in req_mask. + * For example: req_mask = 0xf, irq0_mask = 0x10, irq1_mask = 0x1. irq0_mask + * isn't a subset of req_mask, so we will skip it. irq1_mask is a subset of req_mask, + * so we don't skip it. + * If pool is sf_ctrl_pool, then all IRQs have the same mask, so any IRQ will + * fit. And since a mask is a subset of itself, we will pass the first if below. + */ +static struct mlx5_irq * +irq_pool_find_least_loaded(struct mlx5_irq_pool *pool, const struct cpumask *req_mask) +{ + int start = pool->xa_num_irqs.min; + int end = pool->xa_num_irqs.max; + struct mlx5_irq *irq = NULL; + struct mlx5_irq *iter; + int irq_refcount = 0; + unsigned long index; + + lockdep_assert_held(&pool->lock); + xa_for_each_range(&pool->irqs, index, iter, start, end) { + struct cpumask *iter_mask = mlx5_irq_get_affinity_mask(iter); + int iter_refcount = mlx5_irq_read_locked(iter); + + if (!cpumask_subset(iter_mask, req_mask)) + /* skip IRQs with a mask which is not a subset of req_mask */ + continue; + if (iter_refcount < pool->min_threshold) + /* If we found an IRQ with less than min_thres, return it */ + return iter; + if (!irq || iter_refcount < irq_refcount) { + /* In case we won't find an IRQ with less than min_thres, + * keep a pointer to the least used IRQ + */ + irq_refcount = iter_refcount; + irq = iter; + } + } + return irq; +} + +/** + * mlx5_irq_affinity_request - request an IRQ according to the given mask. + * @pool: IRQ pool to request from.
+ * @req_mask: cpumask requested for this IRQ. + * + * This function returns a pointer to an IRQ, or an ERR_PTR in case of error. + */ +struct mlx5_irq * +mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask) +{ + struct mlx5_irq *least_loaded_irq, *new_irq; + + mutex_lock(&pool->lock); + least_loaded_irq = irq_pool_find_least_loaded(pool, req_mask); + if (least_loaded_irq && + mlx5_irq_read_locked(least_loaded_irq) < pool->min_threshold) + goto out; + /* We didn't find an IRQ with less than min_thres, try to allocate a new IRQ */ + new_irq = irq_pool_request_irq(pool, req_mask); + if (IS_ERR(new_irq)) { + if (!least_loaded_irq) { + /* We failed to create an IRQ and we didn't find an IRQ */ + mlx5_core_err(pool->dev, "Didn't find a matching IRQ. err = %ld\n", + PTR_ERR(new_irq)); + mutex_unlock(&pool->lock); + return new_irq; + } + /* We failed to create a new IRQ for the requested affinity, + * so share an existing IRQ. + */ + goto out; + } + least_loaded_irq = new_irq; + goto unlock; +out: + mlx5_irq_get_locked(least_loaded_irq); + if (mlx5_irq_read_locked(least_loaded_irq) > pool->max_threshold) + mlx5_core_dbg(pool->dev, "IRQ %u overloaded, pool_name: %s, %u EQs on this irq\n", + pci_irq_vector(pool->dev->pdev, + mlx5_irq_get_index(least_loaded_irq)), pool->name, + mlx5_irq_read_locked(least_loaded_irq) / MLX5_EQ_REFS_PER_IRQ); +unlock: + mutex_unlock(&pool->lock); + return least_loaded_irq; +} + +void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, struct mlx5_irq **irqs, + int num_irqs) +{ + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + int i; + + for (i = 0; i < num_irqs; i++) { + int cpu = cpumask_first(mlx5_irq_get_affinity_mask(irqs[i])); + + synchronize_irq(pci_irq_vector(pool->dev->pdev, + mlx5_irq_get_index(irqs[i]))); + if (mlx5_irq_put(irqs[i])) + if (pool->irqs_per_cpu) + cpu_put(pool, cpu); + } +} + +/** + * mlx5_irq_affinity_irqs_request_auto - request one or more IRQs for the mlx5 device. + * @dev: mlx5 device that is requesting the IRQs. + * @nirqs: number of IRQs to request. + * @irqs: an output array of IRQ pointers. + * + * Each IRQ is bound to at most 1 CPU. + * This function requests IRQs according to the default assignment. + * The default assignment policy is: + * - in each iteration, request the least loaded IRQ which is not bound to any + * CPU of the previous IRQs requested. + * + * This function returns the number of IRQs requested (which might be smaller than + * @nirqs) if successful, or a negative error code in case of an error. + */ +int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs, + struct mlx5_irq **irqs) +{ + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + cpumask_var_t req_mask; + struct mlx5_irq *irq; + int i = 0; + + if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_copy(req_mask, cpu_online_mask); + for (i = 0; i < nirqs; i++) { + if (mlx5_irq_pool_is_sf_pool(pool)) + irq = mlx5_irq_affinity_request(pool, req_mask); + else + /* In case the SF pool doesn't exist, fall back to the PF IRQs. + * The PF IRQs are already allocated and bound to CPUs + * at this point. Hence, only an index is needed.
+ */ + irq = mlx5_irq_request(dev, i, NULL); + if (IS_ERR(irq)) + break; + irqs[i] = irq; + cpumask_clear_cpu(cpumask_first(mlx5_irq_get_affinity_mask(irq)), req_mask); + mlx5_core_dbg(dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", + pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), + cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), + mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); + } + free_cpumask_var(req_mask); + if (!i) + return PTR_ERR(irq); + return i; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c new file mode 100644 index 0000000..a490ce0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/debugfs.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "lag.h" + +static char *get_str_mode_type(struct mlx5_lag *ldev) +{ + if (ldev->flags & MLX5_LAG_FLAG_ROCE) + return "roce"; + if (ldev->flags & MLX5_LAG_FLAG_SRIOV) + return "switchdev"; + if (ldev->flags & MLX5_LAG_FLAG_MULTIPATH) + return "multipath"; + if (ldev->flags & MLX5_LAG_FLAG_MULTI_PORT_ESW) + return "mpesw"; + + return NULL; +} + +static int type_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + char *mode = NULL; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + if (__mlx5_lag_is_active(ldev)) + mode = get_str_mode_type(ldev); + mutex_unlock(&ldev->lock); + if (!mode) + return -EINVAL; + seq_printf(file, "%s\n", mode); + + return 0; +} + +static int port_sel_mode_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + int ret = 0; + char *mode; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + if (__mlx5_lag_is_active(ldev)) + mode = get_str_port_sel_mode(ldev->flags); + else + ret = -EINVAL; + mutex_unlock(&ldev->lock); + if (ret || !mode) + return ret; + + seq_printf(file, "%s\n", mode); + return 0; +} + +static int state_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + bool active; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + active = __mlx5_lag_is_active(ldev); + mutex_unlock(&ldev->lock); + seq_printf(file, "%s\n", active ? "active" : "disabled"); + return 0; +} + +static int flags_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + bool shared_fdb; + bool lag_active; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + lag_active = __mlx5_lag_is_active(ldev); + if (lag_active) + shared_fdb = ldev->shared_fdb; + + mutex_unlock(&ldev->lock); + if (!lag_active) + return -EINVAL; + + seq_printf(file, "%s:%s\n", "shared_fdb", shared_fdb ? 
"on" : "off"); + return 0; +} + +static int mapping_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + u8 ports[MLX5_MAX_PORTS] = {}; + struct mlx5_lag *ldev; + bool hash = false; + bool lag_active; + int num_ports; + int i; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + lag_active = __mlx5_lag_is_active(ldev); + if (lag_active) { + if (ldev->flags & MLX5_LAG_FLAG_HASH_BASED) { + mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, ports, + &num_ports); + hash = true; + } else { + for (i = 0; i < ldev->ports; i++) + ports[i] = ldev->v2p_map[i]; + num_ports = ldev->ports; + } + } + mutex_unlock(&ldev->lock); + if (!lag_active) + return -EINVAL; + + for (i = 0; i < num_ports; i++) { + if (hash) + seq_printf(file, "%d\n", ports[i] + 1); + else + seq_printf(file, "%d:%d\n", i + 1, ports[i]); + } + + return 0; +} + +static int members_show(struct seq_file *file, void *priv) +{ + struct mlx5_core_dev *dev = file->private; + struct mlx5_lag *ldev; + int i; + + ldev = dev->priv.lag; + mutex_lock(&ldev->lock); + for (i = 0; i < ldev->ports; i++) { + if (!ldev->pf[i].dev) + continue; + seq_printf(file, "%s\n", dev_name(ldev->pf[i].dev->device)); + } + mutex_unlock(&ldev->lock); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(type); +DEFINE_SHOW_ATTRIBUTE(port_sel_mode); +DEFINE_SHOW_ATTRIBUTE(state); +DEFINE_SHOW_ATTRIBUTE(flags); +DEFINE_SHOW_ATTRIBUTE(mapping); +DEFINE_SHOW_ATTRIBUTE(members); + +void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev) +{ + struct dentry *dbg; + + dbg = debugfs_create_dir("lag", mlx5_debugfs_get_dev_root(dev)); + dev->priv.dbg.lag_debugfs = dbg; + + debugfs_create_file("type", 0444, dbg, dev, &type_fops); + debugfs_create_file("port_sel_mode", 0444, dbg, dev, &port_sel_mode_fops); + debugfs_create_file("state", 0444, dbg, dev, &state_fops); + debugfs_create_file("flags", 0444, dbg, dev, &flags_fops); + debugfs_create_file("mapping", 0444, dbg, dev, &mapping_fops); + debugfs_create_file("members", 0444, dbg, dev, &members_fops); +} + +void mlx5_ldev_remove_debugfs(struct dentry *dbg) +{ + debugfs_remove_recursive(dbg); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c new file mode 100644 index 0000000..9d5a30c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -0,0 +1,1758 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "lib/devcom.h" +#include "mlx5_core.h" +#include "eswitch.h" +#include "lag.h" +#include "mp.h" +#include "esw/acl/ofld.h" + +enum { + MLX5_LAG_EGRESS_PORT_1 = 1, + MLX5_LAG_EGRESS_PORT_2, +}; +/* General purpose, use for short periods of time. + * Beware of lock dependencies (preferably, no locks should be acquired + * under it). + */ +static DEFINE_SPINLOCK(lag_lock); + +static u8 lag_active_port_bits(struct mlx5_lag *ldev) +{ + u8 enabled_ports[MLX5_MAX_PORTS] = {}; + u8 active_port = 0; + int num_enabled; + int idx; + + mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, enabled_ports, + &num_enabled); + for (idx = 0; idx < num_enabled; idx++) + active_port |= BIT_MASK(enabled_ports[idx]); + + return active_port; +} + +static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, bool shared_fdb, u8 flags) +{ + u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {}; + void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx); + + WARN_ON(flags & MLX5_LAG_FLAG_MULTI_PORT_ESW && + flags & MLX5_LAG_FLAG_HASH_BASED); + MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG); + + MLX5_SET(lagc, lag_ctx, fdb_selection_mode, shared_fdb); + if (flags & MLX5_LAG_FLAG_MULTI_PORT_ESW) { + MLX5_SET(lagc, lag_ctx, port_select_mode, MLX5_LAG_PORT_MULTI_PORT_ESW); + } else if (!(flags & MLX5_LAG_FLAG_HASH_BASED)) { + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]); + } else { + MLX5_SET(lagc, lag_ctx, port_select_mode, + MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT); + + if (MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass)) + MLX5_SET(lagc, lag_ctx, active_port, + lag_active_port_bits(mlx5_lag_dev(dev))); + } + + return mlx5_cmd_exec_in(dev, create_lag, in); +} + +static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports, + u8 *ports) +{ + u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {}; + void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); + + MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); + MLX5_SET(modify_lag_in, in, field_select, 0x1); + + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]); + MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]); + + return mlx5_cmd_exec_in(dev, modify_lag, in); +} + +int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {}; + + MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG); + + return mlx5_cmd_exec_in(dev, create_vport_lag, in); +} +EXPORT_SYMBOL(mlx5_cmd_create_vport_lag); + +int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {}; + + MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG); + + return mlx5_cmd_exec_in(dev, destroy_vport_lag, in); +} +EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag); + +static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports, + u8 *ports, int *num_disabled) +{ + int i; + + *num_disabled = 0; + for (i = 0; i < num_ports; 
i++) { + if (!tracker->netdev_state[i].tx_enabled || + !tracker->netdev_state[i].link_up) + ports[(*num_disabled)++] = i; + } +} + +void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports, + u8 *ports, int *num_enabled) +{ + int i; + + *num_enabled = 0; + for (i = 0; i < num_ports; i++) { + if (tracker->netdev_state[i].tx_enabled && + tracker->netdev_state[i].link_up) + ports[(*num_enabled)++] = i; + } + + if (*num_enabled == 0) + mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled); +} + +static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev, + struct mlx5_lag *ldev, + struct lag_tracker *tracker, + u8 flags) +{ + char buf[MLX5_MAX_PORTS * 10 + 1] = {}; + u8 enabled_ports[MLX5_MAX_PORTS] = {}; + int written = 0; + int num_enabled; + int idx; + int err; + int i; + int j; + + if (flags & MLX5_LAG_FLAG_HASH_BASED) { + mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports, + &num_enabled); + for (i = 0; i < num_enabled; i++) { + err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1); + if (err != 3) + return; + written += err; + } + buf[written - 2] = 0; + mlx5_core_info(dev, "lag map active ports: %s\n", buf); + } else { + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + err = scnprintf(buf + written, 10, + " port %d:%d", i + 1, ldev->v2p_map[idx]); + if (err != 9) + return; + written += err; + } + } + mlx5_core_info(dev, "lag map:%s\n", buf); + } +} + +static int mlx5_lag_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr); +static void mlx5_do_bond_work(struct work_struct *work); + +static void mlx5_ldev_free(struct kref *ref) +{ + struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref); + + if (ldev->nb.notifier_call) + unregister_netdevice_notifier_net(&init_net, &ldev->nb); + mlx5_lag_mp_cleanup(ldev); + cancel_delayed_work_sync(&ldev->bond_work); + destroy_workqueue(ldev->wq); + mutex_destroy(&ldev->lock); + kfree(ldev); +} + +static void mlx5_ldev_put(struct mlx5_lag *ldev) +{ + kref_put(&ldev->ref, mlx5_ldev_free); +} + +static void mlx5_ldev_get(struct mlx5_lag *ldev) +{ + kref_get(&ldev->ref); +} + +static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + int err; + + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) + return NULL; + + ldev->wq = create_singlethread_workqueue("mlx5_lag"); + if (!ldev->wq) { + kfree(ldev); + return NULL; + } + + kref_init(&ldev->ref); + mutex_init(&ldev->lock); + INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work); + + ldev->nb.notifier_call = mlx5_lag_netdev_event; + if (register_netdevice_notifier_net(&init_net, &ldev->nb)) { + ldev->nb.notifier_call = NULL; + mlx5_core_err(dev, "Failed to register LAG netdev notifier\n"); + } + + err = mlx5_lag_mp_init(ldev); + if (err) + mlx5_core_err(dev, "Failed to init multipath lag err=%d\n", + err); + ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports); + ldev->buckets = 1; + + return ldev; +} + +int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, + struct net_device *ndev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) + if (ldev->pf[i].netdev == ndev) + return i; + + return -ENOENT; +} + +static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_FLAG_ROCE); +} + +static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_FLAG_SRIOV); +} + +/* Create a mapping between steering slots and active ports. 
+ * As we have ldev->buckets slots per port first assume the native + * mapping should be used. + * If there are ports that are disabled fill the relevant slots + * with mapping that points to active ports. + */ +void mlx5_lag_infer_tx_affinity_mapping(struct lag_tracker *tracker, + u8 num_ports, + u8 buckets, + u8 *ports) + +{ + int disabled[MLX5_MAX_PORTS] = {}; + int enabled[MLX5_MAX_PORTS] = {}; + int disabled_ports_num = 0; + int enabled_ports_num = 0; + int idx; + u32 rand; + int i; + int j; + + for (i = 0; i < num_ports; i++) { + if (tracker->netdev_state[i].tx_enabled && + tracker->netdev_state[i].link_up) + enabled[enabled_ports_num++] = i; + else + disabled[disabled_ports_num++] = i; + } + + /* Use native mapping by default where each port's buckets + * point the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc + */ + for (i = 0; i < num_ports; i++) + for (j = 0; j < buckets; j++) { + idx = i * buckets + j; + ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i; + } + + /* If all ports are disabled/enabled keep native mapping */ + if (enabled_ports_num == num_ports || + disabled_ports_num == num_ports) + return; + + /* Go over the disabled ports and for each assign a random active port */ + for (i = 0; i < disabled_ports_num; i++) { + for (j = 0; j < buckets; j++) { + get_random_bytes(&rand, 4); + ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1; + } + } +} + +static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) + if (ldev->pf[i].has_drop) + return true; + return false; +} + +static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) { + if (!ldev->pf[i].has_drop) + continue; + + mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch, + MLX5_VPORT_UPLINK); + ldev->pf[i].has_drop = false; + } +} + +static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev, + struct lag_tracker *tracker) +{ + u8 disabled_ports[MLX5_MAX_PORTS] = {}; + struct mlx5_core_dev *dev; + int disabled_index; + int num_disabled; + int err; + int i; + + /* First delete the current drop rule so there won't be any dropped + * packets + */ + mlx5_lag_drop_rule_cleanup(ldev); + + if (!ldev->tracker.has_inactive) + return; + + mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled); + + for (i = 0; i < num_disabled; i++) { + disabled_index = disabled_ports[i]; + dev = ldev->pf[disabled_index].dev; + err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch, + MLX5_VPORT_UPLINK); + if (!err) + ldev->pf[disabled_index].has_drop = true; + else + mlx5_core_err(dev, + "Failed to create lag drop rule, error: %d", err); + } +} + +static int mlx5_cmd_modify_active_port(struct mlx5_core_dev *dev, u8 ports) +{ + u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {}; + void *lag_ctx; + + lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx); + + MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG); + MLX5_SET(modify_lag_in, in, field_select, 0x2); + + MLX5_SET(lagc, lag_ctx, active_port, ports); + + return mlx5_cmd_exec_in(dev, modify_lag, in); +} + +static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports) +{ + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + u8 active_ports; + int ret; + + if (ldev->flags & MLX5_LAG_FLAG_HASH_BASED) { + ret = mlx5_lag_port_sel_modify(ldev, ports); + if (ret || + !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table_bypass)) + return ret; + + active_ports = lag_active_port_bits(ldev); + + return 
mlx5_cmd_modify_active_port(dev0, active_ports); + } + return mlx5_cmd_modify_lag(dev0, ldev->ports, ports); +} + +void mlx5_modify_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker) +{ + u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {}; + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int idx; + int err; + int i; + int j; + + mlx5_lag_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports); + + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + if (ports[idx] == ldev->v2p_map[idx]) + continue; + err = _mlx5_modify_lag(ldev, ports); + if (err) { + mlx5_core_err(dev0, + "Failed to modify LAG (%d)\n", + err); + return; + } + memcpy(ldev->v2p_map, ports, sizeof(ports)); + + mlx5_lag_print_mapping(dev0, ldev, tracker, + ldev->flags); + break; + } + } + + if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + !(ldev->flags & MLX5_LAG_FLAG_ROCE)) + mlx5_lag_drop_rule_setup(ldev, tracker); +} + +enum mlx5_lag_user_pref mlx5_lag_get_user_mode(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev = dev->priv.lag; + int i; + + for (i = 0; i < ldev->ports; i++) + if (ldev->pf[i].dev == dev) + break; + return ldev->pf[i].user_mode; +} + +void mlx5_lag_set_user_mode(struct mlx5_core_dev *dev, + enum mlx5_lag_user_pref mode) +{ + struct mlx5_lag *ldev = dev->priv.lag; + int i; + + for (i = 0; i < ldev->ports; i++) + if (ldev->pf[i].dev == dev) + break; + ldev->pf[i].user_mode = mode; +} + +#define MLX5_LAG_ROCE_HASH_PORTS_SUPPORTED 4 +static int mlx5_lag_set_port_sel_mode(struct mlx5_lag *ldev, + struct lag_tracker *tracker, u8 *flags) +{ + bool roce_lag = !!(*flags & MLX5_LAG_FLAG_ROCE); + struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1]; + struct lag_func *dev1 = &ldev->pf[MLX5_LAG_P2]; + + if (dev0->user_mode != dev1->user_mode) { + mlx5_core_err(dev0->dev, + "LAG port selection mode must be the same for both devices\n"); + return -EINVAL; + } + + if (dev0->user_mode == MLX5_LAG_USER_PREF_MODE_QUEUE_AFFINITY && + ldev->ports > 2) { + mlx5_core_err(dev0->dev, + "LAG on NICs with more than 2 ports does not support queue affinity\n"); + return -EINVAL; + } + + if (dev0->user_mode == MLX5_LAG_USER_PREF_MODE_HASH) { + if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) && + (!roce_lag || ldev->ports >= 2)) { + *flags |= MLX5_LAG_FLAG_HASH_BASED; + if (ldev->ports > 2) + ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS; + } else { + mlx5_core_dbg(dev0->dev, + "LAG port selection mode is not suported, using queue affinity\n"); + return 0; + } + } else if (dev0->user_mode == MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW) { + if (roce_lag || !MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_eswitch)) { + mlx5_core_dbg(dev0->dev, "Multi port eswitch is not supported, using queue affinity\n"); + return 0; + } + + mlx5_core_info(dev0->dev, "Multi port eswitch supported\n"); + *flags |= MLX5_LAG_FLAG_MULTI_PORT_ESW; + } + return 0; +} + +char *get_str_port_sel_mode(u8 flags) +{ + if (flags & MLX5_LAG_FLAG_HASH_BASED) + return "hash"; + return "queue_affinity"; +} + +static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; + + MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG); + return mlx5_cmd_exec_in(dev, destroy_lag, in); +} + +static int mlx5_create_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + bool shared_fdb, u8 flags) +{ + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; 
+ int err; + + mlx5_lag_print_mapping(dev0, ldev, tracker, flags); + mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n", + shared_fdb, get_str_port_sel_mode(flags)); + + err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, shared_fdb, flags); + if (err) { + mlx5_core_err(dev0, + "Failed to create LAG (%d)\n", + err); + return err; + } + + if (shared_fdb) { + err = mlx5_eswitch_offloads_config_single_fdb(dev0->priv.eswitch, + dev1->priv.eswitch); + if (err) + mlx5_core_err(dev0, "Can't enable single FDB mode\n"); + else + mlx5_core_info(dev0, "Operation mode is single FDB\n"); + } + + if (err) { + if (mlx5_cmd_destroy_lag(dev0)) + mlx5_core_err(dev0, + "Failed to deactivate RoCE LAG; driver restart required\n"); + } + + return err; +} + +int mlx5_activate_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + u8 flags, + bool shared_fdb) +{ + bool roce_lag = !!(flags & MLX5_LAG_FLAG_ROCE); + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + int err; + + err = mlx5_lag_set_port_sel_mode(ldev, tracker, &flags); + if (err) + return err; + + mlx5_lag_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map); + + if (flags & MLX5_LAG_FLAG_HASH_BASED) { + err = mlx5_lag_port_sel_create(ldev, tracker->hash_type, ldev->v2p_map); + if (err) { + mlx5_core_err(dev0, + "Failed to create LAG port selection(%d)\n", + err); + return err; + } + } + + err = mlx5_create_lag(ldev, tracker, shared_fdb, flags); + if (err) { + if (flags & MLX5_LAG_FLAG_HASH_BASED) + mlx5_lag_port_sel_destroy(ldev); + if (roce_lag) + mlx5_core_err(dev0, + "Failed to activate RoCE LAG\n"); + else + mlx5_core_err(dev0, + "Failed to activate VF LAG\n" + "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n"); + return err; + } + + if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + !roce_lag) + mlx5_lag_drop_rule_setup(ldev, tracker); + + ldev->flags |= flags; + ldev->shared_fdb = shared_fdb; + return 0; +} + +static int mlx5_deactivate_lag(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; + u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {}; + bool roce_lag = __mlx5_lag_is_roce(ldev); + u8 flags = ldev->flags; + int err; + + ldev->flags &= ~MLX5_LAG_MODE_FLAGS; + mlx5_lag_mp_reset(ldev); + + if (ldev->shared_fdb) { + mlx5_eswitch_offloads_destroy_single_fdb(dev0->priv.eswitch, + dev1->priv.eswitch); + ldev->shared_fdb = false; + } + + MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG); + err = mlx5_cmd_exec_in(dev0, destroy_lag, in); + if (err) { + if (roce_lag) { + mlx5_core_err(dev0, + "Failed to deactivate RoCE LAG; driver restart required\n"); + } else { + mlx5_core_err(dev0, + "Failed to deactivate VF LAG; driver restart required\n" + "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n"); + } + return err; + } + + if (flags & MLX5_LAG_FLAG_HASH_BASED) + mlx5_lag_port_sel_destroy(ldev); + if (mlx5_lag_has_drop_rule(ldev)) + mlx5_lag_drop_rule_cleanup(ldev); + + return 0; +} + +#define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2 +static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev) +{ +#ifdef CONFIG_MLX5_ESWITCH + struct mlx5_core_dev *dev; + u8 mode; +#endif + int i; + + for (i = 0; i < ldev->ports; i++) + if (!ldev->pf[i].dev) + return false; + +#ifdef CONFIG_MLX5_ESWITCH + dev = ldev->pf[MLX5_LAG_P1].dev; + if (mlx5_eswitch_num_vfs(dev->priv.eswitch) && !is_mdev_switchdev_mode(dev)) + return false; + + mode = mlx5_eswitch_mode(dev); + for (i = 0; i < 
ldev->ports; i++) + if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode) + return false; + + if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports != MLX5_LAG_OFFLOADS_SUPPORTED_PORTS) + return false; +#else + for (i = 0; i < ldev->ports; i++) + if (mlx5_sriov_is_enabled(ldev->pf[i].dev)) + return false; +#endif + return true; +} + +static void mlx5_lag_add_devices(struct mlx5_lag *ldev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) { + if (!ldev->pf[i].dev) + continue; + + if (ldev->pf[i].dev->priv.flags & + MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) + continue; + + ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; + mlx5_rescan_drivers_locked(ldev->pf[i].dev); + } +} + +static void mlx5_lag_remove_devices(struct mlx5_lag *ldev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) { + if (!ldev->pf[i].dev) + continue; + + if (ldev->pf[i].dev->priv.flags & + MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV) + continue; + + ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; + mlx5_rescan_drivers_locked(ldev->pf[i].dev); + } +} + +static void mlx5_disable_lag(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; + bool shared_fdb = ldev->shared_fdb; + bool roce_lag; + int err; + int i; + + roce_lag = __mlx5_lag_is_roce(ldev); + + if (shared_fdb) { + mlx5_lag_remove_devices(ldev); + } else if (roce_lag) { + if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) { + dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; + mlx5_rescan_drivers_locked(dev0); + } + for (i = 1; i < ldev->ports; i++) + mlx5_nic_vport_disable_roce(ldev->pf[i].dev); + } + + err = mlx5_deactivate_lag(ldev); + if (err) + return; + + if (shared_fdb || roce_lag) + mlx5_lag_add_devices(ldev); + + if (shared_fdb) { + if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) + mlx5_eswitch_reload_reps(dev0->priv.eswitch); + if (!(dev1->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) + mlx5_eswitch_reload_reps(dev1->priv.eswitch); + } +} + +static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; + + if (is_mdev_switchdev_mode(dev0) && + is_mdev_switchdev_mode(dev1) && + mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) && + mlx5_eswitch_vport_match_metadata_enabled(dev1->priv.eswitch) && + mlx5_devcom_is_paired(dev0->priv.devcom, + MLX5_DEVCOM_ESW_OFFLOADS) && + MLX5_CAP_GEN(dev1, lag_native_fdb_selection) && + MLX5_CAP_ESW(dev1, root_ft_on_other_esw) && + MLX5_CAP_ESW(dev0, esw_shared_ingress_acl)) + return true; + + return false; +} + +static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev) +{ + bool roce_lag = true; + int i; + + for (i = 0; i < ldev->ports; i++) + roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev); + +#ifdef CONFIG_MLX5_ESWITCH + for (i = 0; i < ldev->ports; i++) + roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev); +#endif + + return roce_lag; +} + +static void mlx5_do_bond(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; + struct lag_tracker tracker = { }; + bool do_bond, roce_lag; + int err; + int i; + + if (__mlx5_lag_is_active(ldev) && mlx5_lag_is_multipath(dev0)) + return; + + if (!mlx5_lag_is_ready(ldev)) { + do_bond = false; + } else { + /* VF LAG is in multipath mode, ignore bond change requests */ + if (mlx5_lag_is_multipath(dev0)) + return; + + tracker = 
ldev->tracker; + + do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev); + } + + if (do_bond && !__mlx5_lag_is_active(ldev)) { + bool shared_fdb = mlx5_shared_fdb_supported(ldev); + + roce_lag = mlx5_lag_is_roce_lag(ldev); + + if (shared_fdb || roce_lag) + mlx5_lag_remove_devices(ldev); + + err = mlx5_activate_lag(ldev, &tracker, + roce_lag ? MLX5_LAG_FLAG_ROCE : + MLX5_LAG_FLAG_SRIOV, + shared_fdb); + if (err) { + if (shared_fdb || roce_lag) + mlx5_lag_add_devices(ldev); + + return; + } else if (roce_lag) { + dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; + mlx5_rescan_drivers_locked(dev0); + for (i = 1; i < ldev->ports; i++) + mlx5_nic_vport_enable_roce(ldev->pf[i].dev); + } else if (shared_fdb) { + dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; + mlx5_rescan_drivers_locked(dev0); + + err = mlx5_eswitch_reload_reps(dev0->priv.eswitch); + if (!err) + err = mlx5_eswitch_reload_reps(dev1->priv.eswitch); + + if (err) { + dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV; + mlx5_rescan_drivers_locked(dev0); + mlx5_deactivate_lag(ldev); + mlx5_lag_add_devices(ldev); + mlx5_eswitch_reload_reps(dev0->priv.eswitch); + mlx5_eswitch_reload_reps(dev1->priv.eswitch); + mlx5_core_err(dev0, "Failed to enable lag\n"); + return; + } + } + } else if (do_bond && __mlx5_lag_is_active(ldev)) { + mlx5_modify_lag(ldev, &tracker); + } else if (!do_bond && __mlx5_lag_is_active(ldev)) { + mlx5_disable_lag(ldev); + } +} + +static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay) +{ + queue_delayed_work(ldev->wq, &ldev->bond_work, delay); +} + +static void mlx5_do_bond_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag, + bond_work); + int status; + + if (ldev->pf[0].user_mode == MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW || + ldev->pf[1].user_mode == MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW) + return; + + if (ldev->flags & MLX5_LAG_FLAG_MULTI_PORT_ESW) + return; + + status = mlx5_dev_list_trylock(); + if (!status) { + mlx5_queue_bond_work(ldev, HZ); + return; + } + + mutex_lock(&ldev->lock); + if (ldev->mode_changes_in_progress) { + mutex_unlock(&ldev->lock); + mlx5_dev_list_unlock(); + mlx5_queue_bond_work(ldev, HZ); + return; + } + + mlx5_do_bond(ldev); + mutex_unlock(&ldev->lock); + mlx5_dev_list_unlock(); +} + +static bool mlx5_lag_eval_bonding_conds(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *upper, + enum netdev_lag_tx_type tx_type, + struct netdev_notifier_changeupper_info *info) +{ + int num_slaves = 0, idx; + struct net_device *ndev_tmp; + bool is_bonded, is_in_lag, mode_supported; + bool has_inactive = 0; + u8 bond_status = 0; + struct slave *slave; + bool changed = false; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper, ndev_tmp) { + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); + if (idx >= 0) { + slave = bond_slave_get_rcu(ndev_tmp); + if (slave) + has_inactive |= bond_is_slave_inactive(slave); + bond_status |= (1 << idx); + } + + num_slaves++; + } + rcu_read_unlock(); + + /* None of this lagdev's netdevs are slaves of this master. */ + if (!(bond_status & GENMASK(ldev->ports - 1, 0))) + return false; + + tracker->tx_type = tx_type; + tracker->has_inactive = has_inactive; + /* Determine bonding status: + * A device is considered bonded if both its physical ports are slaves + * of the same lag master, and only them. 
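+ * (For a 2-port device this means num_slaves == 2 and bond_status == 0x3.)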
+ */ + is_in_lag = num_slaves == ldev->ports && + bond_status == GENMASK(ldev->ports - 1, 0); + + /* Lag mode must be activebackup or hash. */ + mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP || + tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH; + + is_bonded = is_in_lag && mode_supported; + if (tracker->is_bonded != is_bonded) { + tracker->is_bonded = is_bonded; + changed = true; + } + + if (!is_in_lag) + return changed; + + if (!mlx5_lag_is_ready(ldev)) { + if (info) + NL_SET_ERR_MSG_MOD(info->info.extack, + "Can't activate LAG offload, PF is configured with more than 64 VFs"); + } + else if (!mode_supported) { + if (info) + NL_SET_ERR_MSG_MOD(info->info.extack, + "Can't activate LAG offload, TX type isn't supported"); + } + + return changed; +} + +static bool mlx5_handle_changeupper_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev, + struct netdev_notifier_changeupper_info *info) +{ + enum netdev_lag_tx_type tx_type = NETDEV_LAG_TX_TYPE_UNKNOWN; + struct netdev_lag_upper_info *lag_upper_info; + struct net_device *upper = info->upper_dev; + + if (!netif_is_lag_master(upper)) + return false; + + if (info->linking) { + lag_upper_info = info->upper_info; + + if (lag_upper_info) { + tx_type = lag_upper_info->tx_type; + tracker->hash_type = lag_upper_info->hash_type; + } + } + + return mlx5_lag_eval_bonding_conds(ldev, tracker, upper, tx_type ,info); +} + +static bool mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev, + struct netdev_notifier_changelowerstate_info *info) +{ + struct netdev_lag_lower_state_info *lag_lower_info; + int idx; + + if (!netif_is_lag_port(ndev)) + return 0; + + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev); + if (idx < 0) + return 0; + + /* This information is used to determine virtual to physical + * port mapping. 
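+ * (The per-port state saved in tracker->netdev_state[] below is what
+ * mlx5_lag_infer_tx_affinity_mapping() later consumes.)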
+ */ + lag_lower_info = info->lower_state_info; + if (!lag_lower_info) + return 0; + + tracker->netdev_state[idx] = *lag_lower_info; + + return 1; +} + +static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + struct net_device *ndev) +{ + struct net_device *ndev_tmp; + struct slave *slave; + bool has_inactive = 0; + int idx; + + if (!netif_is_lag_master(ndev)) + return 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(ndev, ndev_tmp) { + idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp); + if (idx < 0) + continue; + + slave = bond_slave_get_rcu(ndev_tmp); + if (slave) + has_inactive |= bond_is_slave_inactive(slave); + } + rcu_read_unlock(); + + if (tracker->has_inactive == has_inactive) + return 0; + + tracker->has_inactive = has_inactive; + + return 1; +} + +static int mlx5_lag_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct lag_tracker tracker; + struct mlx5_lag *ldev; + bool changed = 0; + + if (event != NETDEV_CHANGEUPPER && + event != NETDEV_CHANGELOWERSTATE && + event != NETDEV_CHANGEINFODATA) + return NOTIFY_DONE; + + ldev = container_of(this, struct mlx5_lag, nb); + if (ldev->flags & MLX5_LAG_FLAG_MULTI_PORT_ESW) + return NOTIFY_DONE; + + tracker = ldev->tracker; + + switch (event) { + case NETDEV_CHANGEUPPER: + changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev, + ptr); + break; + case NETDEV_CHANGELOWERSTATE: + changed = mlx5_handle_changelowerstate_event(ldev, &tracker, + ndev, ptr); + break; + case NETDEV_CHANGEINFODATA: + changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev); + break; + } + + ldev->tracker = tracker; + + if (changed) + mlx5_queue_bond_work(ldev, 0); + + return NOTIFY_DONE; +} + +static void mlx5_lag_set_default_port_sel_mode(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev) +{ + unsigned int fn = mlx5_get_dev_index(dev); + + if (ldev->pf[fn].user_mode) + return; + + if (MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table)) { + ldev->pf[fn].user_mode = MLX5_LAG_USER_PREF_MODE_HASH; + } else { + ldev->pf[fn].user_mode = MLX5_LAG_USER_PREF_MODE_QUEUE_AFFINITY; + } +} + +static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev, + struct net_device *netdev) +{ + unsigned int fn = mlx5_get_dev_index(dev); + unsigned long flags; + + if (fn >= ldev->ports) + return; + + spin_lock_irqsave(&lag_lock, flags); + mlx5_lag_set_default_port_sel_mode(ldev, dev); + ldev->pf[fn].netdev = netdev; + ldev->tracker.netdev_state[fn].link_up = 0; + ldev->tracker.netdev_state[fn].tx_enabled = 0; + spin_unlock_irqrestore(&lag_lock, flags); +} + +static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev, + struct net_device *netdev) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&lag_lock, flags); + for (i = 0; i < ldev->ports; i++) { + if (ldev->pf[i].netdev == netdev) { + ldev->pf[i].netdev = NULL; + break; + } + } + spin_unlock_irqrestore(&lag_lock, flags); +} + +static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev) +{ + unsigned int fn = mlx5_get_dev_index(dev); + + if (fn >= ldev->ports) + return; + + ldev->pf[fn].dev = dev; + dev->priv.lag = ldev; +} + +static void mlx5_lag_update_trackers(struct mlx5_lag *ldev) +{ + enum netdev_lag_tx_type tx_type = NETDEV_LAG_TX_TYPE_UNKNOWN; + struct net_device *upper = NULL, *ndev; + struct lag_tracker *tracker; + struct bonding *bond; + struct slave *slave; + int i; + + rtnl_lock(); + tracker = 
&ldev->tracker; + + for (i = 0; i < ldev->ports; i++) { + ndev = ldev->pf[i].netdev; + if (!ndev) + continue; + + if (ndev->reg_state != NETREG_REGISTERED) + continue; + + if (!netif_is_bond_slave(ndev)) + continue; + + rcu_read_lock(); + slave = bond_slave_get_rcu(ndev); + rcu_read_unlock(); + bond = bond_get_bond_by_slave(slave); + + tracker->netdev_state[i].link_up = bond_slave_is_up(slave); + tracker->netdev_state[i].tx_enabled = bond_slave_can_tx(slave); + + if (bond_mode_uses_xmit_hash(bond)) + tx_type = NETDEV_LAG_TX_TYPE_HASH; + else if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) + tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; + + upper = bond->dev; + } + + if (!upper) + goto out; + + if (mlx5_lag_eval_bonding_conds(ldev, tracker, upper, tx_type, NULL)) + mlx5_queue_bond_work(ldev, 0); + +out: + rtnl_unlock(); +} + +static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, + struct mlx5_core_dev *dev) +{ + int i; + + for (i = 0; i < ldev->ports; i++) + if (ldev->pf[i].dev == dev) + break; + + if (i == ldev->ports) + return; + + ldev->pf[i].dev = NULL; + dev->priv.lag = NULL; +} + +/* Must be called with intf_mutex held */ +static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev = NULL; + struct mlx5_core_dev *tmp_dev; + + tmp_dev = mlx5_get_next_phys_dev_lag(dev); + if (tmp_dev) + ldev = tmp_dev->priv.lag; + + if (!ldev) { + ldev = mlx5_lag_dev_alloc(dev); + if (!ldev) { + mlx5_core_err(dev, "Failed to alloc lag dev\n"); + return 0; + } + mlx5_ldev_add_mdev(ldev, dev); + return 0; + } + + mutex_lock(&ldev->lock); + if (ldev->mode_changes_in_progress) { + mutex_unlock(&ldev->lock); + return -EAGAIN; + } + mlx5_ldev_get(ldev); + mlx5_ldev_add_mdev(ldev, dev); + mutex_unlock(&ldev->lock); + + return 0; +} + +void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return; + + /* mdev is being removed, might as well remove debugfs + * as early as possible. 
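+ * (If a mode change is in progress we wait and retry below before
+ * detaching the device from the lag.)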
+ */ + mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs); +recheck: + mutex_lock(&ldev->lock); + if (ldev->mode_changes_in_progress) { + mutex_unlock(&ldev->lock); + msleep(100); + goto recheck; + } + mlx5_ldev_remove_mdev(ldev, dev); + mutex_unlock(&ldev->lock); + mlx5_ldev_put(ldev); +} + +void mlx5_lag_add_mdev(struct mlx5_core_dev *dev) +{ + int err; + + if (!mlx5_lag_is_supported(dev)) + return; + +recheck: + mlx5_dev_list_lock(); + err = __mlx5_lag_dev_add_mdev(dev); + mlx5_dev_list_unlock(); + + if (err) { + msleep(100); + goto recheck; + } + mlx5_ldev_add_debugfs(dev); +} + +void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, + struct net_device *netdev) +{ + struct mlx5_lag *ldev; + bool lag_is_active; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return; + + mutex_lock(&ldev->lock); + mlx5_ldev_remove_netdev(ldev, netdev); + ldev->flags &= ~MLX5_LAG_FLAG_READY; + + if (mlx5_lag_is_multipath(dev)) + ldev->flags &= ~MLX5_LAG_FLAG_MULTIPATH; + + lag_is_active = __mlx5_lag_is_active(ldev); + mutex_unlock(&ldev->lock); + + if (lag_is_active) + mlx5_queue_bond_work(ldev, 0); +} + +/* Must be called with intf_mutex held */ +void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, + struct net_device *netdev) +{ + struct mlx5_lag *ldev; + int i; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return; + + mutex_lock(&ldev->lock); + mlx5_ldev_add_netdev(ldev, dev, netdev); + + for (i = 0; i < ldev->ports; i++) + if (!ldev->pf[i].netdev) + break; + + if (i >= ldev->ports) + ldev->flags |= MLX5_LAG_FLAG_READY; + + mutex_unlock(&ldev->lock); + mlx5_lag_update_trackers(ldev); +} + +bool mlx5_lag_is_roce(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + res = ldev && __mlx5_lag_is_roce(ldev); + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_roce); + +bool mlx5_lag_is_active(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + res = ldev && __mlx5_lag_is_active(ldev); + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_active); + +bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res = 0; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (ldev) + res = ldev->flags & MLX5_LAG_FLAG_HASH_BASED; + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_mode_is_hash); + +bool mlx5_lag_is_master(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + res = ldev && __mlx5_lag_is_active(ldev) && + dev == ldev->pf[MLX5_LAG_P1].dev; + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_master); + +bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + res = ldev && __mlx5_lag_is_sriov(ldev); + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_sriov); + +bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + res = ldev && ldev->flags & MLX5_LAG_FLAG_MULTI_PORT_ESW; + 
spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_mpesw); + +bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + unsigned long flags; + bool res; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + res = ldev && __mlx5_lag_is_sriov(ldev) && ldev->shared_fdb; + spin_unlock_irqrestore(&lag_lock, flags); + + return res; +} +EXPORT_SYMBOL(mlx5_lag_is_shared_fdb); + +void mlx5_lag_disable_change(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return; + + mlx5_dev_list_lock(); + mutex_lock(&ldev->lock); + + ldev->mode_changes_in_progress++; + if (__mlx5_lag_is_active(ldev)) + mlx5_disable_lag(ldev); + + mutex_unlock(&ldev->lock); + mlx5_dev_list_unlock(); +} + +void mlx5_lag_enable_change(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return; + + mutex_lock(&ldev->lock); + ldev->mode_changes_in_progress--; + mutex_unlock(&ldev->lock); + mlx5_queue_bond_work(ldev, 0); +} + +struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) +{ + struct net_device *ndev = NULL; + struct mlx5_lag *ldev; + unsigned long flags; + int i; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + + if (!(ldev && __mlx5_lag_is_roce(ldev))) + goto unlock; + + if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + for (i = 0; i < ldev->ports; i++) + if (ldev->tracker.netdev_state[i].tx_enabled) + ndev = ldev->pf[i].netdev; + if (!ndev) + ndev = ldev->pf[ldev->ports - 1].netdev; + } else { + ndev = ldev->pf[MLX5_LAG_P1].netdev; + } + if (ndev) + dev_hold(ndev); + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + + return ndev; +} +EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); + +u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, + struct net_device *slave) +{ + struct mlx5_lag *ldev; + unsigned long flags; + u8 port = 0; + int i; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (!(ldev && __mlx5_lag_is_roce(ldev))) + goto unlock; + + for (i = 0; i < ldev->ports; i++) { + if (ldev->pf[i].netdev == slave) { + port = i; + break; + } + } + + port = ldev->v2p_map[port * ldev->buckets]; + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + return port; +} +EXPORT_SYMBOL(mlx5_lag_get_slave_port); + +u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return 0; + + return ldev->ports; +} +EXPORT_SYMBOL(mlx5_lag_get_num_ports); + +struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev) +{ + struct mlx5_core_dev *peer_dev = NULL; + struct mlx5_lag *ldev; + unsigned long flags; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (!ldev) + goto unlock; + + peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ? 
+ ldev->pf[MLX5_LAG_P2].dev : + ldev->pf[MLX5_LAG_P1].dev; + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + return peer_dev; +} +EXPORT_SYMBOL(mlx5_lag_get_peer_mdev); + +int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, + u64 *values, + int num_counters, + size_t *offsets) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); + struct mlx5_core_dev **mdev; + struct mlx5_lag *ldev; + unsigned long flags; + int num_ports; + int ret = 0; + int i, j; + void *out; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL); + if (!mdev) { + ret = -ENOMEM; + goto free_out; + } + + memset(values, 0, sizeof(*values) * num_counters); + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (ldev && __mlx5_lag_is_active(ldev)) { + num_ports = ldev->ports; + for (i = 0; i < ldev->ports; i++) + mdev[i] = ldev->pf[i].dev; + } else { + num_ports = 1; + mdev[MLX5_LAG_P1] = dev; + } + spin_unlock_irqrestore(&lag_lock, flags); + + for (i = 0; i < num_ports; ++i) { + u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {}; + + MLX5_SET(query_cong_statistics_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_STATISTICS); + ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in, + out); + if (ret) + goto free_mdev; + + for (j = 0; j < num_counters; ++j) + values[j] += be64_to_cpup((__be64 *)(out + offsets[j])); + } + +free_mdev: + kvfree(mdev); +free_out: + kvfree(out); + return ret; +} +EXPORT_SYMBOL(mlx5_lag_query_cong_counters); + +static int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, + void *in, int in_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { }; + + return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); +} + +int mlx5_lag_modify_cong_params(struct mlx5_core_dev *dev, + void *in, int in_size) +{ + struct mlx5_core_dev **mdev; + struct mlx5_lag *ldev; + unsigned long flags; + int num_ports; + int ret = 0; + int i; + + mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL); + if (!mdev) + return -ENOMEM; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + if (ldev && __mlx5_lag_is_active(ldev)) { + num_ports = ldev->ports; + for (i = 0; i < ldev->ports; i++) + mdev[i] = ldev->pf[i].dev; + } else { + num_ports = 1; + mdev[0] = dev; + } + spin_unlock_irqrestore(&lag_lock, flags); + + for (i = 0; i < num_ports; i++) { + ret = mlx5_cmd_modify_cong_params(mdev[i], in, in_size); + if (ret) + goto unlock; + } + +unlock: + kfree(mdev); + return ret; +} +EXPORT_SYMBOL(mlx5_lag_modify_cong_params); + +int mlx5_activate_mpesw_lag(struct mlx5_eswitch *esw) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_lag *ldev = dev->priv.lag; + struct mlx5_core_dev *dev1, *dev2; + int err = 0; + + if (!ldev) + return 0; + + dev1 = ldev->pf[0].dev; + dev2 = ldev->pf[1].dev; + if (!dev1 || !dev2) + return 0; + + if (mlx5_lag_get_user_mode(dev1) != MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW || + mlx5_lag_get_user_mode(dev2) != MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW) + return 0; + + if (!MLX5_CAP_PORT_SELECTION(dev, port_select_eswitch) || + mlx5_lag_mpesw_is_activated(esw) || + __mlx5_lag_is_active(ldev)) + return 0; + + mlx5_lag_infer_tx_affinity_mapping(&ldev->tracker, ldev->ports, + ldev->buckets, ldev->v2p_map); + err = mlx5_cmd_create_lag(dev1, ldev->v2p_map, true, MLX5_LAG_FLAG_MULTI_PORT_ESW); + if (err) + return err; + + ldev->flags |= MLX5_LAG_FLAG_MULTI_PORT_ESW; + + return 0; +} + +void mlx5_deactivate_mpesw_lag(struct mlx5_eswitch *esw) +{ + 
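+	/* The LAG object was created on pf[0], so only that device issues the
+	 * destroy command; other callers leave the LAG state untouched.
+	 */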
struct mlx5_core_dev *dev = esw->dev; + struct mlx5_lag *ldev = dev->priv.lag; + + if (!ldev) + return; + + if (ldev->flags & MLX5_LAG_FLAG_MULTI_PORT_ESW && + mlx5_lag_mpesw_is_activated(esw) && + dev == ldev->pf[0].dev) { + mlx5_cmd_destroy_lag(dev); + ldev->flags &= ~MLX5_LAG_FLAG_MULTI_PORT_ESW; + } +} + +bool mlx5_lag_mpesw_is_activated(struct mlx5_eswitch *esw) +{ + return esw && esw->dev->priv.lag && + esw->dev->priv.lag->flags & MLX5_LAG_FLAG_MULTI_PORT_ESW; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h new file mode 100644 index 0000000..f575e19 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_LAG_H__ +#define __MLX5_LAG_H__ + +#include +#define MLX5_LAG_MAX_HASH_BUCKETS 16 +#include "mlx5_core.h" +#include "mp.h" +#include "port_sel.h" + +enum { + MLX5_LAG_P1, + MLX5_LAG_P2, +}; + +enum mlx5_lag_user_pref { + MLX5_LAG_USER_PREF_MODE_QUEUE_AFFINITY = 1, + MLX5_LAG_USER_PREF_MODE_HASH, + MLX5_LAG_USER_PREF_MODE_MULTI_PORT_ESW +}; + +enum { + MLX5_LAG_FLAG_ROCE = 1 << 0, + MLX5_LAG_FLAG_SRIOV = 1 << 1, + MLX5_LAG_FLAG_MULTIPATH = 1 << 2, + MLX5_LAG_FLAG_READY = 1 << 3, + MLX5_LAG_FLAG_HASH_BASED = 1 << 4, + MLX5_LAG_FLAG_MULTI_PORT_ESW = 1 << 5, +}; + +#define MLX5_LAG_MODE_FLAGS (MLX5_LAG_FLAG_ROCE | MLX5_LAG_FLAG_SRIOV |\ + MLX5_LAG_FLAG_MULTIPATH | \ + MLX5_LAG_FLAG_HASH_BASED | MLX5_LAG_FLAG_MULTI_PORT_ESW) + +struct lag_func { + struct mlx5_core_dev *dev; + struct net_device *netdev; + enum mlx5_lag_user_pref user_mode; + bool has_drop; +}; + +/* Used for collection of netdev event info. */ +struct lag_tracker { + enum netdev_lag_tx_type tx_type; + struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS]; + unsigned int is_bonded:1; + enum netdev_lag_hash hash_type; + unsigned int has_inactive:1; +}; + +/* LAG data of a ConnectX card. + * It serves both its phys functions. 
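+ * v2p_map[] holds ports * buckets entries; entry i * buckets + j is the
+ * one-based egress port used by bucket j of PF i.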
+ */ +struct mlx5_lag { + u8 flags; + u8 ports; + u8 buckets; + int mode_changes_in_progress; + bool shared_fdb; + u8 v2p_map[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS]; + struct kref ref; + struct lag_func pf[MLX5_MAX_PORTS]; + struct lag_tracker tracker; + struct workqueue_struct *wq; + struct delayed_work bond_work; + struct notifier_block nb; + struct lag_mp lag_mp; + struct mlx5_lag_port_sel port_sel; + /* Protect lag fields/state changes */ + struct mutex lock; +}; + +static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev) +{ + if (!MLX5_CAP_GEN(dev, vport_group_manager) || + !MLX5_CAP_GEN(dev, lag_master) || + MLX5_CAP_GEN(dev, num_lag_ports) < 2 || + MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS) + return false; + return true; +} + +static inline struct mlx5_lag * +mlx5_lag_dev(struct mlx5_core_dev *dev) +{ + return dev->priv.lag; +} + +static inline bool +__mlx5_lag_is_active(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_MODE_FLAGS); +} + +static inline bool +mlx5_lag_is_ready(struct mlx5_lag *ldev) +{ + return ldev->flags & MLX5_LAG_FLAG_READY; +} + +void mlx5_lag_infer_tx_affinity_mapping(struct lag_tracker *tracker, + u8 num_ports, u8 buckets, u8 *ports); +void mlx5_modify_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker); +int mlx5_activate_lag(struct mlx5_lag *ldev, + struct lag_tracker *tracker, + u8 flags, + bool shared_fdb); +int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev, + struct net_device *ndev); + +enum mlx5_lag_user_pref mlx5_lag_get_user_mode(struct mlx5_core_dev *dev); +void mlx5_lag_set_user_mode(struct mlx5_core_dev *dev, + enum mlx5_lag_user_pref mode); +bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev); + +char *get_str_port_sel_mode(u8 flags); +void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports, + u8 *ports, int *num_enabled); + +void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev); +void mlx5_ldev_remove_debugfs(struct dentry *dbg); + +#endif /* __MLX5_LAG_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c new file mode 100644 index 0000000..b650617 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include +#include +#include "lag/lag.h" +#include "lag/mp.h" +#include "mlx5_core.h" +#include "eswitch.h" +#include "lib/mlx5.h" + +static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev) +{ + return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH); +} + +static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev) +{ + if (!mlx5_lag_is_ready(ldev)) + return false; + + if (__mlx5_lag_is_active(ldev) && !__mlx5_lag_is_multipath(ldev)) + return false; + + return mlx5_esw_multipath_prereq(ldev->pf[MLX5_LAG_P1].dev, + ldev->pf[MLX5_LAG_P2].dev); +} + +bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev; + bool res; + + ldev = mlx5_lag_dev(dev); + res = ldev && __mlx5_lag_is_multipath(ldev); + + return res; +} + +/** + * mlx5_lag_set_port_affinity + * + * @ldev: lag device + * @port: + * 0 - set normal affinity. + * 1 - set affinity to port 1. + * 2 - set affinity to port 2. 
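+ * Only takes effect while the LAG is in multipath mode; otherwise the
+ * call is a no-op.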
+ * + **/ +static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, + enum mlx5_lag_port_affinity port) +{ + struct lag_tracker tracker; + + if (!__mlx5_lag_is_multipath(ldev)) + return; + + switch (port) { + case MLX5_LAG_NORMAL_AFFINITY: + tracker.netdev_state[MLX5_LAG_P1].tx_enabled = true; + tracker.netdev_state[MLX5_LAG_P2].tx_enabled = true; + tracker.netdev_state[MLX5_LAG_P1].link_up = true; + tracker.netdev_state[MLX5_LAG_P2].link_up = true; + break; + case MLX5_LAG_P1_AFFINITY: + tracker.netdev_state[MLX5_LAG_P1].tx_enabled = true; + tracker.netdev_state[MLX5_LAG_P1].link_up = true; + tracker.netdev_state[MLX5_LAG_P2].tx_enabled = false; + tracker.netdev_state[MLX5_LAG_P2].link_up = false; + break; + case MLX5_LAG_P2_AFFINITY: + tracker.netdev_state[MLX5_LAG_P1].tx_enabled = false; + tracker.netdev_state[MLX5_LAG_P1].link_up = false; + tracker.netdev_state[MLX5_LAG_P2].tx_enabled = true; + tracker.netdev_state[MLX5_LAG_P2].link_up = true; + break; + default: + mlx5_core_warn(ldev->pf[MLX5_LAG_P1].dev, + "Invalid affinity port %d", port); + return; + } + + if (tracker.netdev_state[MLX5_LAG_P1].tx_enabled) + mlx5_notifier_call_chain(ldev->pf[MLX5_LAG_P1].dev->priv.events, + MLX5_DEV_EVENT_PORT_AFFINITY, + (void *)0); + + if (tracker.netdev_state[MLX5_LAG_P2].tx_enabled) + mlx5_notifier_call_chain(ldev->pf[MLX5_LAG_P2].dev->priv.events, + MLX5_DEV_EVENT_PORT_AFFINITY, + (void *)0); + + mlx5_modify_lag(ldev, &tracker); +} + +static void mlx5_lag_fib_event_flush(struct notifier_block *nb) +{ + struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb); + + flush_workqueue(mp->wq); +} + +static void mlx5_lag_fib_set(struct lag_mp *mp, struct fib_info *fi, u32 dst, int dst_len) +{ + mp->fib.mfi = fi; + mp->fib.priority = fi->fib_priority; + mp->fib.dst = dst; + mp->fib.dst_len = dst_len; +} + +struct mlx5_fib_event_work { + struct work_struct work; + struct mlx5_lag *ldev; + unsigned long event; + union { + struct fib_entry_notifier_info fen_info; + struct fib_nh_notifier_info fnh_info; + }; +}; + +static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev, unsigned long event, + struct fib_entry_notifier_info *fen_info) +{ + struct fib_info *fi = fen_info->fi; + struct lag_mp *mp = &ldev->lag_mp; + struct fib_nh *fib_nh0, *fib_nh1; + unsigned int nhs; + + /* Handle delete event */ + if (event == FIB_EVENT_ENTRY_DEL) { + /* stop track */ + if (mp->fib.mfi == fi) + mp->fib.mfi = NULL; + return; + } + + /* Handle multipath entry with lower priority value */ + if (mp->fib.mfi && mp->fib.mfi != fi && + (mp->fib.dst != fen_info->dst || mp->fib.dst_len != fen_info->dst_len) && + fi->fib_priority >= mp->fib.priority) + return; + + /* Handle add/replace event */ + nhs = fib_info_num_path(fi); + if (nhs == 1) { + if (__mlx5_lag_is_active(ldev)) { + struct fib_nh *nh = fib_info_nh(fi, 0); + struct net_device *nh_dev = nh->fib_nh_dev; + int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev); + + if (i < 0) + return; + + i++; + mlx5_lag_set_port_affinity(ldev, i); + mlx5_lag_fib_set(mp, fi, fen_info->dst, fen_info->dst_len); + } + + return; + } + + if (nhs != 2) + return; + + /* Verify next hops are ports of the same hca */ + fib_nh0 = fib_info_nh(fi, 0); + fib_nh1 = fib_info_nh(fi, 1); + if (!(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev && + fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev) && + !(fib_nh0->fib_nh_dev == ldev->pf[MLX5_LAG_P2].netdev && + fib_nh1->fib_nh_dev == ldev->pf[MLX5_LAG_P1].netdev)) { + mlx5_core_warn(ldev->pf[MLX5_LAG_P1].dev, + "Multipath offload require two 
ports of the same HCA\n"); + return; + } + + if (__mlx5_lag_is_active(ldev) && !__mlx5_lag_is_multipath(ldev)) + return; + + /* First time we see multipath route */ + if (!mp->fib.mfi && !__mlx5_lag_is_active(ldev)) { + struct lag_tracker tracker; + + tracker = ldev->tracker; + mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH, false); + } + + mlx5_lag_set_port_affinity(ldev, MLX5_LAG_NORMAL_AFFINITY); + mlx5_lag_fib_set(mp, fi, fen_info->dst, fen_info->dst_len); +} + +static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev, + unsigned long event, + struct fib_nh *fib_nh, + struct fib_info *fi) +{ + struct lag_mp *mp = &ldev->lag_mp; + + /* Check the nh event is related to the route */ + if (!mp->fib.mfi || mp->fib.mfi != fi) + return; + + /* nh added/removed */ + if (event == FIB_EVENT_NH_DEL) { + int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->fib_nh_dev); + + if (i >= 0) { + i = (i + 1) % 2 + 1; /* peer port */ + mlx5_lag_set_port_affinity(ldev, i); + } + } else if (event == FIB_EVENT_NH_ADD && + fib_info_num_path(fi) == 2) { + mlx5_lag_set_port_affinity(ldev, MLX5_LAG_NORMAL_AFFINITY); + } +} + +static void mlx5_lag_fib_update(struct work_struct *work) +{ + struct mlx5_fib_event_work *fib_work = + container_of(work, struct mlx5_fib_event_work, work); + struct mlx5_lag *ldev = fib_work->ldev; + struct fib_nh *fib_nh; + + /* Protect internal structures from changes */ + rtnl_lock(); + switch (fib_work->event) { + case FIB_EVENT_ENTRY_REPLACE: + case FIB_EVENT_ENTRY_DEL: + mlx5_lag_fib_route_event(ldev, fib_work->event, + &fib_work->fen_info); + fib_info_put(fib_work->fen_info.fi); + break; + case FIB_EVENT_NH_ADD: + case FIB_EVENT_NH_DEL: + fib_nh = fib_work->fnh_info.fib_nh; + mlx5_lag_fib_nexthop_event(ldev, + fib_work->event, + fib_work->fnh_info.fib_nh, + fib_nh->nh_parent); + fib_info_put(fib_work->fnh_info.fib_nh->nh_parent); + break; + } + + rtnl_unlock(); + kfree(fib_work); +} + +static struct mlx5_fib_event_work * +mlx5_lag_init_fib_work(struct mlx5_lag *ldev, unsigned long event) +{ + struct mlx5_fib_event_work *fib_work; + + fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC); + if (WARN_ON(!fib_work)) + return NULL; + + INIT_WORK(&fib_work->work, mlx5_lag_fib_update); + fib_work->ldev = ldev; + fib_work->event = event; + + return fib_work; +} + +static int mlx5_lag_fib_event(struct notifier_block *nb, + unsigned long event, + void *ptr) +{ + struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb); + struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp); + struct fib_notifier_info *info = ptr; + struct mlx5_fib_event_work *fib_work; + struct fib_entry_notifier_info *fen_info; + struct fib_nh_notifier_info *fnh_info; + struct net_device *fib_dev; + struct fib_info *fi; + + if (info->family != AF_INET) + return NOTIFY_DONE; + + if (!mlx5_lag_multipath_check_prereq(ldev)) + return NOTIFY_DONE; + + switch (event) { + case FIB_EVENT_ENTRY_REPLACE: + case FIB_EVENT_ENTRY_DEL: + fen_info = container_of(info, struct fib_entry_notifier_info, + info); + fi = fen_info->fi; + if (fi->nh) + return NOTIFY_DONE; + fib_dev = fib_info_nh(fen_info->fi, 0)->fib_nh_dev; + if (fib_dev != ldev->pf[MLX5_LAG_P1].netdev && + fib_dev != ldev->pf[MLX5_LAG_P2].netdev) { + return NOTIFY_DONE; + } + fib_work = mlx5_lag_init_fib_work(ldev, event); + if (!fib_work) + return NOTIFY_DONE; + fib_work->fen_info = *fen_info; + /* Take reference on fib_info to prevent it from being + * freed while work is queued. Release it afterwards. 
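+ * (The matching fib_info_put() runs in mlx5_lag_fib_update().)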
+ */ + fib_info_hold(fib_work->fen_info.fi); + break; + case FIB_EVENT_NH_ADD: + case FIB_EVENT_NH_DEL: + fnh_info = container_of(info, struct fib_nh_notifier_info, + info); + fib_work = mlx5_lag_init_fib_work(ldev, event); + if (!fib_work) + return NOTIFY_DONE; + fib_work->fnh_info = *fnh_info; + fib_info_hold(fib_work->fnh_info.fib_nh->nh_parent); + break; + default: + return NOTIFY_DONE; + } + + queue_work(mp->wq, &fib_work->work); + + return NOTIFY_DONE; +} + +void mlx5_lag_mp_reset(struct mlx5_lag *ldev) +{ + /* Clear mfi, as it might become stale when a route delete event + * has been missed, see mlx5_lag_fib_route_event(). + */ + ldev->lag_mp.fib.mfi = NULL; +} + +int mlx5_lag_mp_init(struct mlx5_lag *ldev) +{ + struct lag_mp *mp = &ldev->lag_mp; + int err; + + /* always clear mfi, as it might become stale when a route delete event + * has been missed + */ + mp->fib.mfi = NULL; + + if (mp->fib_nb.notifier_call) + return 0; + + mp->wq = create_singlethread_workqueue("mlx5_lag_mp"); + if (!mp->wq) + return -ENOMEM; + + mp->fib_nb.notifier_call = mlx5_lag_fib_event; + err = register_fib_notifier(&init_net, &mp->fib_nb, + mlx5_lag_fib_event_flush, NULL); + if (err) { + destroy_workqueue(mp->wq); + mp->fib_nb.notifier_call = NULL; + } + + return err; +} + +void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) +{ + struct lag_mp *mp = &ldev->lag_mp; + + if (!mp->fib_nb.notifier_call) + return; + + unregister_fib_notifier(&init_net, &mp->fib_nb); + destroy_workqueue(mp->wq); + mp->fib_nb.notifier_call = NULL; + mp->fib.mfi = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h new file mode 100644 index 0000000..056a066 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/mp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_LAG_MP_H__ +#define __MLX5_LAG_MP_H__ + +#include "lag.h" +#include "mlx5_core.h" + +enum mlx5_lag_port_affinity { + MLX5_LAG_NORMAL_AFFINITY, + MLX5_LAG_P1_AFFINITY, + MLX5_LAG_P2_AFFINITY, +}; + +struct lag_mp { + struct notifier_block fib_nb; + struct { + const void *mfi; /* used in tracking fib events */ + u32 priority; + u32 dst; + int dst_len; + } fib; + struct workqueue_struct *wq; +}; + +#ifdef CONFIG_MLX5_ESWITCH + +void mlx5_lag_mp_reset(struct mlx5_lag *ldev); +int mlx5_lag_mp_init(struct mlx5_lag *ldev); +void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev); +bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev); + +#else /* CONFIG_MLX5_ESWITCH */ + +static inline void mlx5_lag_mp_reset(struct mlx5_lag *ldev) {}; +static inline int mlx5_lag_mp_init(struct mlx5_lag *ldev) { return 0; } +static inline void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) {} +static inline bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev) { return false; } + +#endif /* CONFIG_MLX5_ESWITCH */ +#endif /* __MLX5_LAG_MP_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c new file mode 100644 index 0000000..ca00fca --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -0,0 +1,641 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
*/ + +#include +#include "lag.h" + +enum { + MLX5_LAG_FT_LEVEL_TTC, + MLX5_LAG_FT_LEVEL_INNER_TTC, + MLX5_LAG_FT_LEVEL_DEFINER, +}; + +static struct mlx5_flow_group * +mlx5_create_hash_flow_group(struct mlx5_flow_table *ft, + struct mlx5_flow_definer *definer, + int num_entries) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + u32 *in; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + + MLX5_SET(create_flow_group_in, in, match_definer_id, + mlx5_get_match_definer_id(definer)); + MLX5_SET(create_flow_group_in, in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, in, end_flow_index, num_entries - 1); + MLX5_SET(create_flow_group_in, in, group_type, + MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_HASH_SPLIT); + + fg = mlx5_create_flow_group(ft, in); + kvfree(in); + return fg; +} + +static int mlx5_lag_create_port_sel_table(struct mlx5_lag *ldev, + struct mlx5_lag_definer *lag_definer, + u8 *ports) +{ + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_destination dest = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_namespace *ns; + int err, i; + int idx; + int j; + + ft_attr.max_fte = ldev->ports * ldev->buckets; + ft_attr.level = MLX5_LAG_FT_LEVEL_DEFINER; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_PORT_SEL); + if (!ns) { + mlx5_core_warn(dev, "Failed to get port selection namespace\n"); + return -EOPNOTSUPP; + } + + lag_definer->ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(lag_definer->ft)) { + mlx5_core_warn(dev, "Failed to create port selection table\n"); + return PTR_ERR(lag_definer->ft); + } + + lag_definer->fg = mlx5_create_hash_flow_group(lag_definer->ft, + lag_definer->definer, + ft_attr.max_fte); + if (IS_ERR(lag_definer->fg)) { + err = PTR_ERR(lag_definer->fg); + goto destroy_ft; + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; + dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; + flow_act.flags |= FLOW_ACT_NO_APPEND; + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + u8 affinity; + + idx = i * ldev->buckets + j; + affinity = ports[idx]; + + dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[affinity - 1].dev, + vhca_id); + lag_definer->rules[idx] = mlx5_add_flow_rules(lag_definer->ft, + NULL, &flow_act, + &dest, 1); + if (IS_ERR(lag_definer->rules[idx])) { + err = PTR_ERR(lag_definer->rules[idx]); + while (i--) + while (j--) + mlx5_del_flow_rules(lag_definer->rules[idx]); + goto destroy_fg; + } + } + } + + return 0; + +destroy_fg: + mlx5_destroy_flow_group(lag_definer->fg); +destroy_ft: + mlx5_destroy_flow_table(lag_definer->ft); + return err; +} + +static int mlx5_lag_set_definer_inner(u32 *match_definer_mask, + enum mlx5_traffic_types tt) +{ + int format_id; + u8 *ipv6; + + switch (tt) { + case MLX5_TT_IPV4_UDP: + case MLX5_TT_IPV4_TCP: + format_id = 23; + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_l4_sport); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_l4_dport); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_ip_src_addr); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_ip_dest_addr); + break; + case MLX5_TT_IPV4: + format_id = 23; + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_l3_type); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_dmac_47_16); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_dmac_15_0); + 
MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_smac_47_16); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_smac_15_0); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_ip_src_addr); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_ip_dest_addr); + break; + case MLX5_TT_IPV6_TCP: + case MLX5_TT_IPV6_UDP: + format_id = 31; + MLX5_SET_TO_ONES(match_definer_format_31, match_definer_mask, + inner_l4_sport); + MLX5_SET_TO_ONES(match_definer_format_31, match_definer_mask, + inner_l4_dport); + ipv6 = MLX5_ADDR_OF(match_definer_format_31, match_definer_mask, + inner_ip_dest_addr); + memset(ipv6, 0xff, 16); + ipv6 = MLX5_ADDR_OF(match_definer_format_31, match_definer_mask, + inner_ip_src_addr); + memset(ipv6, 0xff, 16); + break; + case MLX5_TT_IPV6: + format_id = 32; + ipv6 = MLX5_ADDR_OF(match_definer_format_32, match_definer_mask, + inner_ip_dest_addr); + memset(ipv6, 0xff, 16); + ipv6 = MLX5_ADDR_OF(match_definer_format_32, match_definer_mask, + inner_ip_src_addr); + memset(ipv6, 0xff, 16); + MLX5_SET_TO_ONES(match_definer_format_32, match_definer_mask, + inner_dmac_47_16); + MLX5_SET_TO_ONES(match_definer_format_32, match_definer_mask, + inner_dmac_15_0); + MLX5_SET_TO_ONES(match_definer_format_32, match_definer_mask, + inner_smac_47_16); + MLX5_SET_TO_ONES(match_definer_format_32, match_definer_mask, + inner_smac_15_0); + break; + default: + format_id = 23; + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_l3_type); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_dmac_47_16); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_dmac_15_0); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_smac_47_16); + MLX5_SET_TO_ONES(match_definer_format_23, match_definer_mask, + inner_smac_15_0); + break; + } + + return format_id; +} + +static int mlx5_lag_set_definer(u32 *match_definer_mask, + enum mlx5_traffic_types tt, bool tunnel, + enum netdev_lag_hash hash) +{ + int format_id; + u8 *ipv6; + + if (tunnel) + return mlx5_lag_set_definer_inner(match_definer_mask, tt); + + switch (tt) { + case MLX5_TT_IPV4_UDP: + case MLX5_TT_IPV4_TCP: + format_id = 22; + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_l4_sport); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_l4_dport); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_ip_src_addr); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_ip_dest_addr); + break; + case MLX5_TT_IPV4: + format_id = 22; + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_l3_type); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_dmac_47_16); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_dmac_15_0); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_smac_47_16); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_smac_15_0); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_ip_src_addr); + MLX5_SET_TO_ONES(match_definer_format_22, match_definer_mask, + outer_ip_dest_addr); + break; + case MLX5_TT_IPV6_TCP: + case MLX5_TT_IPV6_UDP: + format_id = 29; + MLX5_SET_TO_ONES(match_definer_format_29, match_definer_mask, + outer_l4_sport); + MLX5_SET_TO_ONES(match_definer_format_29, match_definer_mask, + outer_l4_dport); + ipv6 = MLX5_ADDR_OF(match_definer_format_29, match_definer_mask, + 
outer_ip_dest_addr); + memset(ipv6, 0xff, 16); + ipv6 = MLX5_ADDR_OF(match_definer_format_29, match_definer_mask, + outer_ip_src_addr); + memset(ipv6, 0xff, 16); + break; + case MLX5_TT_IPV6: + format_id = 30; + ipv6 = MLX5_ADDR_OF(match_definer_format_30, match_definer_mask, + outer_ip_dest_addr); + memset(ipv6, 0xff, 16); + ipv6 = MLX5_ADDR_OF(match_definer_format_30, match_definer_mask, + outer_ip_src_addr); + memset(ipv6, 0xff, 16); + MLX5_SET_TO_ONES(match_definer_format_30, match_definer_mask, + outer_dmac_47_16); + MLX5_SET_TO_ONES(match_definer_format_30, match_definer_mask, + outer_dmac_15_0); + MLX5_SET_TO_ONES(match_definer_format_30, match_definer_mask, + outer_smac_47_16); + MLX5_SET_TO_ONES(match_definer_format_30, match_definer_mask, + outer_smac_15_0); + break; + default: + format_id = 0; + MLX5_SET_TO_ONES(match_definer_format_0, match_definer_mask, + outer_smac_47_16); + MLX5_SET_TO_ONES(match_definer_format_0, match_definer_mask, + outer_smac_15_0); + + if (hash == NETDEV_LAG_HASH_VLAN_SRCMAC) { + MLX5_SET_TO_ONES(match_definer_format_0, + match_definer_mask, + outer_first_vlan_vid); + break; + } + + MLX5_SET_TO_ONES(match_definer_format_0, match_definer_mask, + outer_ethertype); + MLX5_SET_TO_ONES(match_definer_format_0, match_definer_mask, + outer_dmac_47_16); + MLX5_SET_TO_ONES(match_definer_format_0, match_definer_mask, + outer_dmac_15_0); + break; + } + + return format_id; +} + +static struct mlx5_lag_definer * +mlx5_lag_create_definer(struct mlx5_lag *ldev, enum netdev_lag_hash hash, + enum mlx5_traffic_types tt, bool tunnel, u8 *ports) +{ + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_lag_definer *lag_definer; + u32 *match_definer_mask; + int format_id, err; + + lag_definer = kzalloc(sizeof(*lag_definer), GFP_KERNEL); + if (!lag_definer) + return ERR_PTR(-ENOMEM); + + match_definer_mask = kvzalloc(MLX5_FLD_SZ_BYTES(match_definer, + match_mask), + GFP_KERNEL); + if (!match_definer_mask) { + err = -ENOMEM; + goto free_lag_definer; + } + + format_id = mlx5_lag_set_definer(match_definer_mask, tt, tunnel, hash); + lag_definer->definer = + mlx5_create_match_definer(dev, MLX5_FLOW_NAMESPACE_PORT_SEL, + format_id, match_definer_mask); + if (IS_ERR(lag_definer->definer)) { + err = PTR_ERR(lag_definer->definer); + goto free_mask; + } + + err = mlx5_lag_create_port_sel_table(ldev, lag_definer, ports); + if (err) + goto destroy_match_definer; + + kvfree(match_definer_mask); + + return lag_definer; + +destroy_match_definer: + mlx5_destroy_match_definer(dev, lag_definer->definer); +free_mask: + kvfree(match_definer_mask); +free_lag_definer: + kfree(lag_definer); + return ERR_PTR(err); +} + +static void mlx5_lag_destroy_definer(struct mlx5_lag *ldev, + struct mlx5_lag_definer *lag_definer) +{ + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + int idx; + int i; + int j; + + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + mlx5_del_flow_rules(lag_definer->rules[idx]); + } + } + mlx5_destroy_flow_group(lag_definer->fg); + mlx5_destroy_flow_table(lag_definer->ft); + mlx5_destroy_match_definer(dev, lag_definer->definer); + kfree(lag_definer); +} + +static void mlx5_lag_destroy_definers(struct mlx5_lag *ldev) +{ + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + int tt; + + for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { + if (port_sel->outer.definers[tt]) + mlx5_lag_destroy_definer(ldev, + port_sel->outer.definers[tt]); + if (port_sel->inner.definers[tt]) + 
mlx5_lag_destroy_definer(ldev, + port_sel->inner.definers[tt]); + } +} + +static int mlx5_lag_create_definers(struct mlx5_lag *ldev, + enum netdev_lag_hash hash_type, + u8 *ports) +{ + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + struct mlx5_lag_definer *lag_definer; + int tt, err; + + for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { + lag_definer = mlx5_lag_create_definer(ldev, hash_type, tt, + false, ports); + if (IS_ERR(lag_definer)) { + err = PTR_ERR(lag_definer); + goto destroy_definers; + } + port_sel->outer.definers[tt] = lag_definer; + + if (!port_sel->tunnel) + continue; + + lag_definer = + mlx5_lag_create_definer(ldev, hash_type, tt, + true, ports); + if (IS_ERR(lag_definer)) { + err = PTR_ERR(lag_definer); + goto destroy_definers; + } + port_sel->inner.definers[tt] = lag_definer; + } + + return 0; + +destroy_definers: + mlx5_lag_destroy_definers(ldev); + return err; +} + +static void set_tt_map(struct mlx5_lag_port_sel *port_sel, + enum netdev_lag_hash hash) +{ + port_sel->tunnel = false; + + switch (hash) { + case NETDEV_LAG_HASH_E34: + port_sel->tunnel = true; + fallthrough; + case NETDEV_LAG_HASH_L34: + set_bit(MLX5_TT_IPV4_TCP, port_sel->tt_map); + set_bit(MLX5_TT_IPV4_UDP, port_sel->tt_map); + set_bit(MLX5_TT_IPV6_TCP, port_sel->tt_map); + set_bit(MLX5_TT_IPV6_UDP, port_sel->tt_map); + set_bit(MLX5_TT_IPV4, port_sel->tt_map); + set_bit(MLX5_TT_IPV6, port_sel->tt_map); + set_bit(MLX5_TT_ANY, port_sel->tt_map); + break; + case NETDEV_LAG_HASH_E23: + port_sel->tunnel = true; + fallthrough; + case NETDEV_LAG_HASH_L23: + set_bit(MLX5_TT_IPV4, port_sel->tt_map); + set_bit(MLX5_TT_IPV6, port_sel->tt_map); + set_bit(MLX5_TT_ANY, port_sel->tt_map); + break; + default: + set_bit(MLX5_TT_ANY, port_sel->tt_map); + break; + } +} + +#define SET_IGNORE_DESTS_BITS(tt_map, dests) \ + do { \ + int idx; \ + \ + for_each_clear_bit(idx, tt_map, MLX5_NUM_TT) \ + set_bit(idx, dests); \ + } while (0) + +static void mlx5_lag_set_inner_ttc_params(struct mlx5_lag *ldev, + struct ttc_params *ttc_params) +{ + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + struct mlx5_flow_table_attr *ft_attr; + int tt; + + ttc_params->ns = mlx5_get_flow_namespace(dev, + MLX5_FLOW_NAMESPACE_PORT_SEL); + ft_attr = &ttc_params->ft_attr; + ft_attr->level = MLX5_LAG_FT_LEVEL_INNER_TTC; + + for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { + ttc_params->dests[tt].type = + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + ttc_params->dests[tt].ft = port_sel->inner.definers[tt]->ft; + } + SET_IGNORE_DESTS_BITS(port_sel->tt_map, ttc_params->ignore_dests); +} + +static void mlx5_lag_set_outer_ttc_params(struct mlx5_lag *ldev, + struct ttc_params *ttc_params) +{ + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + struct mlx5_flow_table_attr *ft_attr; + int tt; + + ttc_params->ns = mlx5_get_flow_namespace(dev, + MLX5_FLOW_NAMESPACE_PORT_SEL); + ft_attr = &ttc_params->ft_attr; + ft_attr->level = MLX5_LAG_FT_LEVEL_TTC; + + for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { + ttc_params->dests[tt].type = + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + ttc_params->dests[tt].ft = port_sel->outer.definers[tt]->ft; + } + SET_IGNORE_DESTS_BITS(port_sel->tt_map, ttc_params->ignore_dests); + + ttc_params->inner_ttc = port_sel->tunnel; + if (!port_sel->tunnel) + return; + + for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) { + ttc_params->tunnel_dests[tt].type = + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + 
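+		/* Every tunneled traffic type is steered to the single inner TTC
+		 * table, which re-classifies the flow on its inner headers before
+		 * it reaches the matching inner definer table.
+		 */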
ttc_params->tunnel_dests[tt].ft = + mlx5_get_ttc_flow_table(port_sel->inner.ttc); + } +} + +static int mlx5_lag_create_ttc_table(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + struct ttc_params ttc_params = {}; + + mlx5_lag_set_outer_ttc_params(ldev, &ttc_params); + port_sel->outer.ttc = mlx5_create_ttc_table(dev, &ttc_params); + if (IS_ERR(port_sel->outer.ttc)) + return PTR_ERR(port_sel->outer.ttc); + + return 0; +} + +static int mlx5_lag_create_inner_ttc_table(struct mlx5_lag *ldev) +{ + struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev; + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + struct ttc_params ttc_params = {}; + + mlx5_lag_set_inner_ttc_params(ldev, &ttc_params); + port_sel->inner.ttc = mlx5_create_inner_ttc_table(dev, &ttc_params); + if (IS_ERR(port_sel->inner.ttc)) + return PTR_ERR(port_sel->inner.ttc); + + return 0; +} + +int mlx5_lag_port_sel_create(struct mlx5_lag *ldev, + enum netdev_lag_hash hash_type, u8 *ports) +{ + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + int err; + + set_tt_map(port_sel, hash_type); + err = mlx5_lag_create_definers(ldev, hash_type, ports); + if (err) + return err; + + if (port_sel->tunnel) { + err = mlx5_lag_create_inner_ttc_table(ldev); + if (err) + goto destroy_definers; + } + + err = mlx5_lag_create_ttc_table(ldev); + if (err) + goto destroy_inner; + + return 0; + +destroy_inner: + if (port_sel->tunnel) + mlx5_destroy_ttc_table(port_sel->inner.ttc); +destroy_definers: + mlx5_lag_destroy_definers(ldev); + return err; +} + +static int __mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev, + struct mlx5_lag_definer *def, + u8 *ports) +{ + struct mlx5_flow_destination dest = {}; + int idx; + int err; + int i; + int j; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; + dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; + + for (i = 0; i < ldev->ports; i++) { + for (j = 0; j < ldev->buckets; j++) { + idx = i * ldev->buckets + j; + if (ldev->v2p_map[i] == ports[i]) + continue; + + dest.vport.vhca_id = MLX5_CAP_GEN(ldev->pf[ports[idx] - 1].dev, + vhca_id); + err = mlx5_modify_rule_destination(def->rules[idx], &dest, NULL); + if (err) + return err; + } + } + + return 0; +} + +static int +mlx5_lag_modify_definers_destinations(struct mlx5_lag *ldev, + struct mlx5_lag_definer **definers, + u8 *ports) +{ + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + struct mlx5_flow_destination dest = {}; + int err; + int tt; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_UPLINK; + dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; + + for_each_set_bit(tt, port_sel->tt_map, MLX5_NUM_TT) { + err = __mlx5_lag_modify_definers_destinations(ldev, definers[tt], ports); + if (err) + return err; + } + + return 0; +} + +int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 *ports) +{ + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + int err; + + err = mlx5_lag_modify_definers_destinations(ldev, + port_sel->outer.definers, + ports); + if (err) + return err; + + if (!port_sel->tunnel) + return 0; + + return mlx5_lag_modify_definers_destinations(ldev, + port_sel->inner.definers, + ports); +} + +void mlx5_lag_port_sel_destroy(struct mlx5_lag *ldev) +{ + struct mlx5_lag_port_sel *port_sel = &ldev->port_sel; + + mlx5_destroy_ttc_table(port_sel->outer.ttc); + if (port_sel->tunnel) + mlx5_destroy_ttc_table(port_sel->inner.ttc); + mlx5_lag_destroy_definers(ldev); + memset(port_sel, 0, sizeof(*port_sel)); +} diff --git 
a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h
new file mode 100644
index 0000000..b46a172
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. */
+
+#ifndef __MLX5_LAG_FS_H__
+#define __MLX5_LAG_FS_H__
+
+#include "lib/fs_ttc.h"
+
+struct mlx5_lag_definer {
+	struct mlx5_flow_definer *definer;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_group *fg;
+	struct mlx5_flow_handle *rules[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS];
+};
+
+struct mlx5_lag_ttc {
+	struct mlx5_ttc_table *ttc;
+	struct mlx5_lag_definer *definers[MLX5_NUM_TT];
+};
+
+struct mlx5_lag_port_sel {
+	DECLARE_BITMAP(tt_map, MLX5_NUM_TT);
+	bool tunnel;
+	struct mlx5_lag_ttc outer;
+	struct mlx5_lag_ttc inner;
+};
+
+#ifdef CONFIG_MLX5_ESWITCH
+
+int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 *ports);
+void mlx5_lag_port_sel_destroy(struct mlx5_lag *ldev);
+int mlx5_lag_port_sel_create(struct mlx5_lag *ldev,
+			     enum netdev_lag_hash hash_type, u8 *ports);
+
+#else /* CONFIG_MLX5_ESWITCH */
+static inline int mlx5_lag_port_sel_create(struct mlx5_lag *ldev,
+					   enum netdev_lag_hash hash_type,
+					   u8 *ports)
+{
+	return 0;
+}
+
+static inline int mlx5_lag_port_sel_modify(struct mlx5_lag *ldev, u8 *ports)
+{
+	return 0;
+}
+
+static inline void mlx5_lag_port_sel_destroy(struct mlx5_lag *ldev) {}
+#endif /* CONFIG_MLX5_ESWITCH */
+#endif /* __MLX5_LAG_FS_H__ */
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
new file mode 100644
index 0000000..21e1450
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
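For context, the port-selection API declared in port_sel.h above is meant to be driven from the bond/LAG control flow. The sketch below is illustrative only: it assumes a hypothetical caller that already owns the struct mlx5_lag and its port-to-bucket (v2p) map, and it is not the actual call site in lag.c.

/* Hypothetical callers -- illustrate the intended call order only. */
static int example_port_sel_enable(struct mlx5_lag *ldev,
				   enum netdev_lag_hash hash, u8 *v2p_map)
{
	/* Builds tt_map, the per-traffic-type definers/flow tables and the
	 * outer (plus, for encapsulated hash types, inner) TTC tables.
	 */
	return mlx5_lag_port_sel_create(ldev, hash, v2p_map);
}

static int example_port_sel_remap(struct mlx5_lag *ldev, u8 *new_v2p_map)
{
	/* Only rule destinations are rewritten; tables and definers stay. */
	return mlx5_lag_port_sel_modify(ldev, new_v2p_map);
}

static void example_port_sel_disable(struct mlx5_lag *ldev)
{
	mlx5_lag_port_sel_destroy(ldev);
}

Keeping the modify path limited to rewriting rule destinations is what makes a port remap cheap relative to a full destroy/create cycle.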
+ +#include +#include +#include "aso.h" +#include "wq.h" + +struct mlx5_aso_cq { + /* data path - accessed per cqe */ + struct mlx5_cqwq wq; + + /* data path - accessed per napi poll */ + struct mlx5_core_cq mcq; + + /* control */ + struct mlx5_core_dev *mdev; + struct mlx5_wq_ctrl wq_ctrl; +} ____cacheline_aligned_in_smp; + +struct mlx5_aso { + /* data path */ + u16 cc; + u16 pc; + + struct mlx5_wqe_ctrl_seg *doorbell_cseg; + struct mlx5_aso_cq cq; + + /* read only */ + struct mlx5_wq_cyc wq; + void __iomem *uar_map; + u32 sqn; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + +} ____cacheline_aligned_in_smp; + +static void mlx5_aso_free_cq(struct mlx5_aso_cq *cq) +{ + mlx5_wq_destroy(&cq->wq_ctrl); +} + +static int mlx5_aso_alloc_cq(struct mlx5_core_dev *mdev, int numa_node, + void *cqc_data, struct mlx5_aso_cq *cq) +{ + struct mlx5_core_cq *mcq = &cq->mcq; + struct mlx5_wq_param param; + int err; + u32 i; + + param.buf_numa_node = numa_node; + param.db_numa_node = numa_node; + + err = mlx5_cqwq_create(mdev, ¶m, cqc_data, &cq->wq, &cq->wq_ctrl); + if (err) + return err; + + mcq->cqe_sz = 64; + mcq->set_ci_db = cq->wq_ctrl.db.db; + mcq->arm_db = cq->wq_ctrl.db.db + 1; + + for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); + + cqe->op_own = 0xf1; + } + + cq->mdev = mdev; + + return 0; +} + +static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data) +{ + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_core_dev *mdev = cq->mdev; + struct mlx5_core_cq *mcq = &cq->mcq; + void *in, *cqc; + int inlen, eqn; + int err; + + err = mlx5_vector2eqn(mdev, 0, &eqn); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * cq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + + memcpy(cqc, cqc_data, MLX5_ST_SZ_BYTES(cqc)); + + mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); + + MLX5_SET(cqc, cqc, cq_period_mode, DIM_CQ_PERIOD_MODE_START_FROM_EQE); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); + + err = mlx5_core_create_cq(mdev, mcq, in, inlen, out, sizeof(out)); + + kvfree(in); + + return err; +} + +static void mlx5_aso_destroy_cq(struct mlx5_aso_cq *cq) +{ + mlx5_core_destroy_cq(cq->mdev, &cq->mcq); + mlx5_wq_destroy(&cq->wq_ctrl); +} + +static int mlx5_aso_create_cq(struct mlx5_core_dev *mdev, int numa_node, + struct mlx5_aso_cq *cq) +{ + void *cqc_data; + int err; + + cqc_data = kvzalloc(MLX5_ST_SZ_BYTES(cqc), GFP_KERNEL); + if (!cqc_data) + return -ENOMEM; + + MLX5_SET(cqc, cqc_data, log_cq_size, 1); + MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); + if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) + MLX5_SET(cqc, cqc_data, cqe_sz, CQE_STRIDE_128_PAD); + + err = mlx5_aso_alloc_cq(mdev, numa_node, cqc_data, cq); + if (err) { + mlx5_core_err(mdev, "Failed to alloc aso wq cq, err=%d\n", err); + goto err_out; + } + + err = create_aso_cq(cq, cqc_data); + if (err) { + mlx5_core_err(mdev, "Failed to create aso wq cq, err=%d\n", err); + goto err_free_cq; + } + + kvfree(cqc_data); + return 0; + +err_free_cq: + mlx5_aso_free_cq(cq); +err_out: + kvfree(cqc_data); + return err; +} + +static int mlx5_aso_alloc_sq(struct mlx5_core_dev *mdev, int 
numa_node, + void *sqc_data, struct mlx5_aso *sq) +{ + void *sqc_wq = MLX5_ADDR_OF(sqc, sqc_data, wq); + struct mlx5_wq_cyc *wq = &sq->wq; + struct mlx5_wq_param param; + int err; + + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + + param.db_numa_node = numa_node; + param.buf_numa_node = numa_node; + err = mlx5_wq_cyc_create(mdev, ¶m, sqc_wq, wq, &sq->wq_ctrl); + if (err) + return err; + wq->db = &wq->db[MLX5_SND_DBR]; + + return 0; +} + +static int create_aso_sq(struct mlx5_core_dev *mdev, int pdn, + void *sqc_data, struct mlx5_aso *sq) +{ + void *in, *sqc, *wq; + int inlen, err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + + sizeof(u64) * sq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + memcpy(sqc, sqc_data, MLX5_ST_SZ_BYTES(sqc)); + MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn); + + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); + + mlx5_fill_page_frag_array(&sq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_sq(mdev, in, inlen, &sq->sqn); + + kvfree(in); + + return err; +} + +static int mlx5_aso_set_sq_rdy(struct mlx5_core_dev *mdev, u32 sqn) +{ + void *in, *sqc; + int inlen, err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RST); + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY); + + err = mlx5_core_modify_sq(mdev, sqn, in); + + kvfree(in); + + return err; +} + +static int mlx5_aso_create_sq_rdy(struct mlx5_core_dev *mdev, u32 pdn, + void *sqc_data, struct mlx5_aso *sq) +{ + int err; + + err = create_aso_sq(mdev, pdn, sqc_data, sq); + if (err) + return err; + + err = mlx5_aso_set_sq_rdy(mdev, sq->sqn); + if (err) + mlx5_core_destroy_sq(mdev, sq->sqn); + + return err; +} + +static void mlx5_aso_free_sq(struct mlx5_aso *sq) +{ + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static void mlx5_aso_destroy_sq(struct mlx5_aso *sq) +{ + mlx5_core_destroy_sq(sq->cq.mdev, sq->sqn); + mlx5_aso_free_sq(sq); +} + +static int mlx5_aso_create_sq(struct mlx5_core_dev *mdev, int numa_node, + u32 pdn, struct mlx5_aso *sq) +{ + void *sqc_data, *wq; + int err; + + sqc_data = kvzalloc(MLX5_ST_SZ_BYTES(sqc), GFP_KERNEL); + if (!sqc_data) + return -ENOMEM; + + wq = MLX5_ADDR_OF(sqc, sqc_data, wq); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, pd, pdn); + MLX5_SET(wq, wq, log_wq_sz, 1); + + err = mlx5_aso_alloc_sq(mdev, numa_node, sqc_data, sq); + if (err) { + mlx5_core_err(mdev, "Failed to alloc aso wq sq, err=%d\n", err); + goto err_out; + } + + err = mlx5_aso_create_sq_rdy(mdev, pdn, sqc_data, sq); + if (err) { + mlx5_core_err(mdev, "Failed to open aso wq sq, err=%d\n", err); + goto err_free_asosq; + } + + mlx5_core_dbg(mdev, "aso sq->sqn = 0x%x\n", sq->sqn); + + kvfree(sqc_data); + return 0; + +err_free_asosq: + mlx5_aso_free_sq(sq); +err_out: + kvfree(sqc_data); + return err; +} + +struct mlx5_aso *mlx5_aso_create(struct mlx5_core_dev *mdev, u32 pdn) +{ + int numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); + struct mlx5_aso *aso; + int err; + + aso = 
kzalloc(sizeof(*aso), GFP_KERNEL); + if (!aso) + return ERR_PTR(-ENOMEM); + + err = mlx5_aso_create_cq(mdev, numa_node, &aso->cq); + if (err) + goto err_cq; + + err = mlx5_aso_create_sq(mdev, numa_node, pdn, aso); + if (err) + goto err_sq; + + return aso; + +err_sq: + mlx5_aso_destroy_cq(&aso->cq); +err_cq: + kfree(aso); + return ERR_PTR(err); +} + +void mlx5_aso_destroy(struct mlx5_aso *aso) +{ + if (IS_ERR_OR_NULL(aso)) + return; + + mlx5_aso_destroy_sq(aso); + mlx5_aso_destroy_cq(&aso->cq); + kfree(aso); +} + +void mlx5_aso_build_wqe(struct mlx5_aso *aso, u8 ds_cnt, + struct mlx5_aso_wqe *aso_wqe, + u32 obj_id, u32 opc_mode) +{ + struct mlx5_wqe_ctrl_seg *cseg = &aso_wqe->ctrl; + + cseg->opmod_idx_opcode = cpu_to_be32((opc_mode << MLX5_WQE_CTRL_WQE_OPC_MOD_SHIFT) | + (aso->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_ACCESS_ASO); + cseg->qpn_ds = cpu_to_be32((aso->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt); + cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; + cseg->general_id = cpu_to_be32(obj_id); +} + +void *mlx5_aso_get_wqe(struct mlx5_aso *aso) +{ + u16 pi; + + pi = mlx5_wq_cyc_ctr2ix(&aso->wq, aso->pc); + return mlx5_wq_cyc_get_wqe(&aso->wq, pi); +} + +void mlx5_aso_post_wqe(struct mlx5_aso *aso, bool with_data, + struct mlx5_wqe_ctrl_seg *doorbell_cseg) +{ + doorbell_cseg->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; + /* ensure wqe is visible to device before updating doorbell record */ + dma_wmb(); + + if (with_data) + aso->pc += MLX5_ASO_WQEBBS_DATA; + else + aso->pc += MLX5_ASO_WQEBBS; + *aso->wq.db = cpu_to_be32(aso->pc); + + /* ensure doorbell record is visible to device before ringing the + * doorbell + */ + wmb(); + + mlx5_write64((__be32 *)doorbell_cseg, aso->uar_map); + + /* Ensure doorbell is written on uar_page before poll_cq */ + WRITE_ONCE(doorbell_cseg, NULL); +} + +int mlx5_aso_poll_cq(struct mlx5_aso *aso, bool with_data, u32 interval_ms) +{ + struct mlx5_aso_cq *cq = &aso->cq; + struct mlx5_cqe64 *cqe; + unsigned long expires; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + + expires = jiffies + msecs_to_jiffies(interval_ms); + while (!cqe && time_is_after_jiffies(expires)) { + usleep_range(2, 10); + cqe = mlx5_cqwq_get_cqe(&cq->wq); + } + + if (!cqe) + return -ETIMEDOUT; + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + mlx5_cqwq_pop(&cq->wq); + + if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { + struct mlx5_err_cqe *err_cqe; + + mlx5_core_err(cq->mdev, "Bad OP in ASOSQ CQE: 0x%x\n", + get_cqe_opcode(cqe)); + + err_cqe = (struct mlx5_err_cqe *)cqe; + mlx5_core_err(cq->mdev, "vendor_err_synd=%x\n", + err_cqe->vendor_err_synd); + mlx5_core_err(cq->mdev, "syndrome=%x\n", + err_cqe->syndrome); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, + 16, 1, err_cqe, + sizeof(*err_cqe), false); + } + + mlx5_cqwq_update_db_record(&cq->wq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + if (with_data) + aso->cc += MLX5_ASO_WQEBBS_DATA; + else + aso->cc += MLX5_ASO_WQEBBS; + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h new file mode 100644 index 0000000..7420df0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#ifndef __MLX5_LIB_ASO_H__ +#define __MLX5_LIB_ASO_H__ + +#include +#include "mlx5_core.h" + +#define MLX5_ASO_WQEBBS \ + (DIV_ROUND_UP(sizeof(struct mlx5_aso_wqe), MLX5_SEND_WQE_BB)) +#define MLX5_ASO_WQEBBS_DATA \ + (DIV_ROUND_UP(sizeof(struct mlx5_aso_wqe_data), MLX5_SEND_WQE_BB)) +#define MLX5_WQE_CTRL_WQE_OPC_MOD_SHIFT 24 + +struct mlx5_wqe_aso_ctrl_seg { + __be32 va_h; + __be32 va_l; /* include read_enable */ + __be32 l_key; + u8 data_mask_mode; + u8 condition_1_0_operand; + u8 condition_1_0_offset; + u8 data_offset_condition_operand; + __be32 condition_0_data; + __be32 condition_0_mask; + __be32 condition_1_data; + __be32 condition_1_mask; + __be64 bitwise_data; + __be64 data_mask; +}; + +struct mlx5_wqe_aso_data_seg { + __be32 bytewise_data[16]; +}; + +struct mlx5_aso_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_aso_ctrl_seg aso_ctrl; +}; + +struct mlx5_aso_wqe_data { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_aso_ctrl_seg aso_ctrl; + struct mlx5_wqe_aso_data_seg aso_data; +}; + +enum { + MLX5_ASO_ALWAYS_FALSE, + MLX5_ASO_ALWAYS_TRUE, + MLX5_ASO_EQUAL, + MLX5_ASO_NOT_EQUAL, + MLX5_ASO_GREATER_OR_EQUAL, + MLX5_ASO_LESSER_OR_EQUAL, + MLX5_ASO_LESSER, + MLX5_ASO_GREATER, + MLX5_ASO_CYCLIC_GREATER, + MLX5_ASO_CYCLIC_LESSER, +}; + +enum { + MLX5_ASO_DATA_MASK_MODE_BITWISE_64BIT, + MLX5_ASO_DATA_MASK_MODE_BYTEWISE_64BYTE, + MLX5_ASO_DATA_MASK_MODE_CALCULATED_64BYTE, +}; + +struct mlx5_aso; + +void *mlx5_aso_get_wqe(struct mlx5_aso *aso); +void mlx5_aso_build_wqe(struct mlx5_aso *aso, u8 ds_cnt, + struct mlx5_aso_wqe *aso_wqe, + u32 obj_id, u32 opc_mode); +void mlx5_aso_post_wqe(struct mlx5_aso *aso, bool with_data, + struct mlx5_wqe_ctrl_seg *doorbell_cseg); +int mlx5_aso_poll_cq(struct mlx5_aso *aso, bool with_data, u32 interval_ms); + +struct mlx5_aso *mlx5_aso_create(struct mlx5_core_dev *mdev, u32 pdn); +void mlx5_aso_destroy(struct mlx5_aso *aso); +#endif /* __MLX5_LIB_ASO_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c new file mode 100644 index 0000000..d3a9ae8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -0,0 +1,1021 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "lib/eq.h" +#include "en.h" +#include "clock.h" + +enum { + MLX5_CYCLES_SHIFT = 23 +}; + +enum { + MLX5_PIN_MODE_IN = 0x0, + MLX5_PIN_MODE_OUT = 0x1, +}; + +enum { + MLX5_OUT_PATTERN_PULSE = 0x0, + MLX5_OUT_PATTERN_PERIODIC = 0x1, +}; + +enum { + MLX5_EVENT_MODE_DISABLE = 0x0, + MLX5_EVENT_MODE_REPETETIVE = 0x1, + MLX5_EVENT_MODE_ONCE_TILL_ARM = 0x2, +}; + +enum { + MLX5_MTPPS_FS_ENABLE = BIT(0x0), + MLX5_MTPPS_FS_PATTERN = BIT(0x2), + MLX5_MTPPS_FS_PIN_MODE = BIT(0x3), + MLX5_MTPPS_FS_TIME_STAMP = BIT(0x4), + MLX5_MTPPS_FS_OUT_PULSE_DURATION = BIT(0x5), + MLX5_MTPPS_FS_ENH_OUT_PER_ADJ = BIT(0x7), + MLX5_MTPPS_FS_NPPS_PERIOD = BIT(0x9), + MLX5_MTPPS_FS_OUT_PULSE_DURATION_NS = BIT(0xa), +}; + +static bool mlx5_real_time_mode(struct mlx5_core_dev *mdev) +{ + return (mlx5_is_real_time_rq(mdev) || mlx5_is_real_time_sq(mdev)); +} + +static bool mlx5_npps_real_time_supported(struct mlx5_core_dev *mdev) +{ + return (mlx5_real_time_mode(mdev) && + MLX5_CAP_MCAM_FEATURE(mdev, npps_period) && + MLX5_CAP_MCAM_FEATURE(mdev, out_pulse_duration_ns)); +} + +static bool mlx5_modify_mtutc_allowed(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_MCAM_FEATURE(mdev, ptpcyc2realtime_modify); +} + +static int mlx5_set_mtutc(struct mlx5_core_dev *dev, u32 *mtutc, u32 size) +{ + u32 out[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!MLX5_CAP_MCAM_REG(dev, mtutc)) + return -EOPNOTSUPP; + + return mlx5_core_access_reg(dev, mtutc, size, out, sizeof(out), + MLX5_REG_MTUTC, 0, 1); +} + +static u64 mlx5_read_time(struct mlx5_core_dev *dev, + struct ptp_system_timestamp *sts, + bool real_time) +{ + u32 timer_h, timer_h1, timer_l; + + timer_h = ioread32be(real_time ? &dev->iseg->real_time_h : + &dev->iseg->internal_timer_h); + ptp_read_system_prets(sts); + timer_l = ioread32be(real_time ? &dev->iseg->real_time_l : + &dev->iseg->internal_timer_l); + ptp_read_system_postts(sts); + timer_h1 = ioread32be(real_time ? &dev->iseg->real_time_h : + &dev->iseg->internal_timer_h); + if (timer_h != timer_h1) { + /* wrap around */ + ptp_read_system_prets(sts); + timer_l = ioread32be(real_time ? &dev->iseg->real_time_l : + &dev->iseg->internal_timer_l); + ptp_read_system_postts(sts); + } + + return real_time ? 
REAL_TIME_TO_NS(timer_h1, timer_l) : + (u64)timer_l | (u64)timer_h1 << 32; +} + +static u64 read_internal_timer(const struct cyclecounter *cc) +{ + struct mlx5_timer *timer = container_of(cc, struct mlx5_timer, cycles); + struct mlx5_clock *clock = container_of(timer, struct mlx5_clock, timer); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); + + return mlx5_read_time(mdev, NULL, false) & cc->mask; +} + +static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_clock *clock = &mdev->clock; + struct mlx5_timer *timer; + u32 sign; + + if (!clock_info) + return; + + sign = smp_load_acquire(&clock_info->sign); + smp_store_mb(clock_info->sign, + sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING); + + timer = &clock->timer; + clock_info->cycles = timer->tc.cycle_last; + clock_info->mult = timer->cycles.mult; + clock_info->nsec = timer->tc.nsec; + clock_info->frac = timer->tc.frac; + + smp_store_release(&clock_info->sign, + sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2); +} + +static void mlx5_pps_out(struct work_struct *work) +{ + struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, + out_work); + struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock, + pps_info); + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, + clock); + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + unsigned long flags; + int i; + + for (i = 0; i < clock->ptp_info.n_pins; i++) { + u64 tstart; + + write_seqlock_irqsave(&clock->lock, flags); + tstart = clock->pps_info.start[i]; + clock->pps_info.start[i] = 0; + write_sequnlock_irqrestore(&clock->lock, flags); + if (!tstart) + continue; + + MLX5_SET(mtpps_reg, in, pin, i); + MLX5_SET64(mtpps_reg, in, time_stamp, tstart); + MLX5_SET(mtpps_reg, in, field_select, MLX5_MTPPS_FS_TIME_STAMP); + mlx5_set_mtpps(mdev, in, sizeof(in)); + } +} + +static void mlx5_timestamp_overflow(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct mlx5_core_dev *mdev; + struct mlx5_timer *timer; + struct mlx5_clock *clock; + unsigned long flags; + + timer = container_of(dwork, struct mlx5_timer, overflow_work); + clock = container_of(timer, struct mlx5_clock, timer); + mdev = container_of(clock, struct mlx5_core_dev, clock); + + write_seqlock_irqsave(&clock->lock, flags); + timecounter_read(&timer->tc); + mlx5_update_clock_info_page(mdev); + write_sequnlock_irqrestore(&clock->lock, flags); + schedule_delayed_work(&timer->overflow_work, timer->overflow_period); +} + +static int mlx5_ptp_settime_real_time(struct mlx5_core_dev *mdev, + const struct timespec64 *ts) +{ + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!mlx5_modify_mtutc_allowed(mdev)) + return 0; + + if (ts->tv_sec < 0 || ts->tv_sec > U32_MAX || + ts->tv_nsec < 0 || ts->tv_nsec > NSEC_PER_SEC) + return -EINVAL; + + MLX5_SET(mtutc_reg, in, operation, MLX5_MTUTC_OPERATION_SET_TIME_IMMEDIATE); + MLX5_SET(mtutc_reg, in, utc_sec, ts->tv_sec); + MLX5_SET(mtutc_reg, in, utc_nsec, ts->tv_nsec); + + return mlx5_set_mtutc(mdev, in, sizeof(in)); +} + +static int mlx5_ptp_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; + struct mlx5_core_dev *mdev; + unsigned long flags; + int err; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + err = mlx5_ptp_settime_real_time(mdev, ts); + if (err) + return err; + + 
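+	/* The free-running timecounter is re-based as well, so readers that
+	 * are not in real-time mode observe the new epoch on their next
+	 * timecounter_cyc2time() conversion.
+	 */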
write_seqlock_irqsave(&clock->lock, flags); + timecounter_init(&timer->tc, &timer->cycles, timespec64_to_ns(ts)); + mlx5_update_clock_info_page(mdev); + write_sequnlock_irqrestore(&clock->lock, flags); + + return 0; +} + +static +struct timespec64 mlx5_ptp_gettimex_real_time(struct mlx5_core_dev *mdev, + struct ptp_system_timestamp *sts) +{ + struct timespec64 ts; + u64 time; + + time = mlx5_read_time(mdev, sts, true); + ts = ns_to_timespec64(time); + return ts; +} + +static int mlx5_ptp_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; + struct mlx5_core_dev *mdev; + unsigned long flags; + u64 cycles, ns; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + if (mlx5_real_time_mode(mdev)) { + *ts = mlx5_ptp_gettimex_real_time(mdev, sts); + goto out; + } + + write_seqlock_irqsave(&clock->lock, flags); + cycles = mlx5_read_time(mdev, sts, false); + ns = timecounter_cyc2time(&timer->tc, cycles); + write_sequnlock_irqrestore(&clock->lock, flags); + *ts = ns_to_timespec64(ns); +out: + return 0; +} + +static int mlx5_ptp_adjtime_real_time(struct mlx5_core_dev *mdev, s64 delta) +{ + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!mlx5_modify_mtutc_allowed(mdev)) + return 0; + + /* HW time adjustment range is s16. If out of range, settime instead */ + if (delta < S16_MIN || delta > S16_MAX) { + struct timespec64 ts; + s64 ns; + + ts = mlx5_ptp_gettimex_real_time(mdev, NULL); + ns = timespec64_to_ns(&ts) + delta; + ts = ns_to_timespec64(ns); + return mlx5_ptp_settime_real_time(mdev, &ts); + } + + MLX5_SET(mtutc_reg, in, operation, MLX5_MTUTC_OPERATION_ADJUST_TIME); + MLX5_SET(mtutc_reg, in, time_adjustment, delta); + + return mlx5_set_mtutc(mdev, in, sizeof(in)); +} + +static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; + struct mlx5_core_dev *mdev; + unsigned long flags; + int err; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + + err = mlx5_ptp_adjtime_real_time(mdev, delta); + if (err) + return err; + write_seqlock_irqsave(&clock->lock, flags); + timecounter_adjtime(&timer->tc, delta); + mlx5_update_clock_info_page(mdev); + write_sequnlock_irqrestore(&clock->lock, flags); + + return 0; +} + +static int mlx5_ptp_adjfreq_real_time(struct mlx5_core_dev *mdev, s32 freq) +{ + u32 in[MLX5_ST_SZ_DW(mtutc_reg)] = {}; + + if (!mlx5_modify_mtutc_allowed(mdev)) + return 0; + + MLX5_SET(mtutc_reg, in, operation, MLX5_MTUTC_OPERATION_ADJUST_FREQ_UTC); + MLX5_SET(mtutc_reg, in, freq_adjustment, freq); + + return mlx5_set_mtutc(mdev, in, sizeof(in)); +} + +static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_timer *timer = &clock->timer; + struct mlx5_core_dev *mdev; + unsigned long flags; + int neg_adj = 0; + u32 diff; + u64 adj; + int err; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + err = mlx5_ptp_adjfreq_real_time(mdev, delta); + if (err) + return err; + + if (delta < 0) { + neg_adj = 1; + delta = -delta; + } + + adj = timer->nominal_c_mult; + adj *= delta; + diff = div_u64(adj, 1000000000ULL); + + write_seqlock_irqsave(&clock->lock, flags); + timecounter_read(&timer->tc); + timer->cycles.mult = neg_adj ? 
timer->nominal_c_mult - diff : + timer->nominal_c_mult + diff; + mlx5_update_clock_info_page(mdev); + write_sequnlock_irqrestore(&clock->lock, flags); + + return 0; +} + +static int mlx5_extts_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct mlx5_core_dev, clock); + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + u32 field_select = 0; + u8 pin_mode = 0; + u8 pattern = 0; + int pin = -1; + int err = 0; + + if (!MLX5_PPS_CAP(mdev)) + return -EOPNOTSUPP; + + /* Reject requests with unsupported flags */ + if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; + + /* Reject requests to enable time stamping on both edges. */ + if ((rq->extts.flags & PTP_STRICT_FLAGS) && + (rq->extts.flags & PTP_ENABLE_FEATURE) && + (rq->extts.flags & PTP_EXTTS_EDGES) == PTP_EXTTS_EDGES) + return -EOPNOTSUPP; + + if (rq->extts.index >= clock->ptp_info.n_pins) + return -EINVAL; + + pin = ptp_find_pin(clock->ptp, PTP_PF_EXTTS, rq->extts.index); + if (pin < 0) + return -EBUSY; + + if (on) { + pin_mode = MLX5_PIN_MODE_IN; + pattern = !!(rq->extts.flags & PTP_FALLING_EDGE); + field_select = MLX5_MTPPS_FS_PIN_MODE | + MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_ENABLE; + } else { + field_select = MLX5_MTPPS_FS_ENABLE; + } + + MLX5_SET(mtpps_reg, in, pin, pin); + MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); + MLX5_SET(mtpps_reg, in, pattern, pattern); + MLX5_SET(mtpps_reg, in, enable, on); + MLX5_SET(mtpps_reg, in, field_select, field_select); + + err = mlx5_set_mtpps(mdev, in, sizeof(in)); + if (err) + return err; + + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); +} + +static u64 find_target_cycles(struct mlx5_core_dev *mdev, s64 target_ns) +{ + struct mlx5_clock *clock = &mdev->clock; + u64 cycles_now, cycles_delta; + u64 nsec_now, nsec_delta; + struct mlx5_timer *timer; + unsigned long flags; + + timer = &clock->timer; + + cycles_now = mlx5_read_time(mdev, NULL, false); + write_seqlock_irqsave(&clock->lock, flags); + nsec_now = timecounter_cyc2time(&timer->tc, cycles_now); + nsec_delta = target_ns - nsec_now; + cycles_delta = div64_u64(nsec_delta << timer->cycles.shift, + timer->cycles.mult); + write_sequnlock_irqrestore(&clock->lock, flags); + + return cycles_now + cycles_delta; +} + +static u64 perout_conf_internal_timer(struct mlx5_core_dev *mdev, s64 sec) +{ + struct timespec64 ts = {}; + s64 target_ns; + + ts.tv_sec = sec; + target_ns = timespec64_to_ns(&ts); + + return find_target_cycles(mdev, target_ns); +} + +static u64 perout_conf_real_time(s64 sec, u32 nsec) +{ + return (u64)nsec | (u64)sec << 32; +} + +static int perout_conf_1pps(struct mlx5_core_dev *mdev, struct ptp_clock_request *rq, + u64 *time_stamp, bool real_time) +{ + struct timespec64 ts; + s64 ns; + + ts.tv_nsec = rq->perout.period.nsec; + ts.tv_sec = rq->perout.period.sec; + ns = timespec64_to_ns(&ts); + + if ((ns >> 1) != 500000000LL) + return -EINVAL; + + *time_stamp = real_time ? 
perout_conf_real_time(rq->perout.start.sec, 0) : + perout_conf_internal_timer(mdev, rq->perout.start.sec); + + return 0; +} + +#define MLX5_MAX_PULSE_DURATION (BIT(__mlx5_bit_sz(mtpps_reg, out_pulse_duration_ns)) - 1) +static int mlx5_perout_conf_out_pulse_duration(struct mlx5_core_dev *mdev, + struct ptp_clock_request *rq, + u32 *out_pulse_duration_ns) +{ + struct mlx5_pps *pps_info = &mdev->clock.pps_info; + u32 out_pulse_duration; + struct timespec64 ts; + + if (rq->perout.flags & PTP_PEROUT_DUTY_CYCLE) { + ts.tv_sec = rq->perout.on.sec; + ts.tv_nsec = rq->perout.on.nsec; + out_pulse_duration = (u32)timespec64_to_ns(&ts); + } else { + /* out_pulse_duration_ns should be up to 50% of the + * pulse period as default + */ + ts.tv_sec = rq->perout.period.sec; + ts.tv_nsec = rq->perout.period.nsec; + out_pulse_duration = (u32)timespec64_to_ns(&ts) >> 1; + } + + if (out_pulse_duration < pps_info->min_out_pulse_duration_ns || + out_pulse_duration > MLX5_MAX_PULSE_DURATION) { + mlx5_core_err(mdev, "NPPS pulse duration %u is not in [%llu, %lu]\n", + out_pulse_duration, pps_info->min_out_pulse_duration_ns, + MLX5_MAX_PULSE_DURATION); + return -EINVAL; + } + *out_pulse_duration_ns = out_pulse_duration; + + return 0; +} + +static int perout_conf_npps_real_time(struct mlx5_core_dev *mdev, struct ptp_clock_request *rq, + u32 *field_select, u32 *out_pulse_duration_ns, + u64 *period, u64 *time_stamp) +{ + struct mlx5_pps *pps_info = &mdev->clock.pps_info; + struct ptp_clock_time *time = &rq->perout.start; + struct timespec64 ts; + + ts.tv_sec = rq->perout.period.sec; + ts.tv_nsec = rq->perout.period.nsec; + if (timespec64_to_ns(&ts) < pps_info->min_npps_period) { + mlx5_core_err(mdev, "NPPS period is lower than minimal npps period %llu\n", + pps_info->min_npps_period); + return -EINVAL; + } + *period = perout_conf_real_time(rq->perout.period.sec, rq->perout.period.nsec); + + if (mlx5_perout_conf_out_pulse_duration(mdev, rq, out_pulse_duration_ns)) + return -EINVAL; + + *time_stamp = perout_conf_real_time(time->sec, time->nsec); + *field_select |= MLX5_MTPPS_FS_NPPS_PERIOD | + MLX5_MTPPS_FS_OUT_PULSE_DURATION_NS; + + return 0; +} + +static bool mlx5_perout_verify_flags(struct mlx5_core_dev *mdev, unsigned int flags) +{ + return ((!mlx5_npps_real_time_supported(mdev) && flags) || + (mlx5_npps_real_time_supported(mdev) && flags & ~PTP_PEROUT_DUTY_CYCLE)); +} + +static int mlx5_perout_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev = + container_of(clock, struct mlx5_core_dev, clock); + bool rt_mode = mlx5_real_time_mode(mdev); + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + u32 out_pulse_duration_ns = 0; + u32 field_select = 0; + u64 npps_period = 0; + u64 time_stamp = 0; + u8 pin_mode = 0; + u8 pattern = 0; + int pin = -1; + int err = 0; + + if (!MLX5_PPS_CAP(mdev)) + return -EOPNOTSUPP; + + /* Reject requests with unsupported flags */ + if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) + return -EOPNOTSUPP; + + if (rq->perout.index >= clock->ptp_info.n_pins) + return -EINVAL; + + field_select = MLX5_MTPPS_FS_ENABLE; + pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT, rq->perout.index); + if (pin < 0) + return -EBUSY; + + if (on) { + bool rt_mode = mlx5_real_time_mode(mdev); + + pin_mode = MLX5_PIN_MODE_OUT; + pattern = MLX5_OUT_PATTERN_PERIODIC; + + if (rt_mode && rq->perout.start.sec > U32_MAX) + return -EINVAL; + + field_select |= MLX5_MTPPS_FS_PIN_MODE | + 
MLX5_MTPPS_FS_PATTERN | + MLX5_MTPPS_FS_TIME_STAMP; + + if (mlx5_npps_real_time_supported(mdev)) + err = perout_conf_npps_real_time(mdev, rq, &field_select, + &out_pulse_duration_ns, &npps_period, + &time_stamp); + else + err = perout_conf_1pps(mdev, rq, &time_stamp, rt_mode); + if (err) + return err; + } + + MLX5_SET(mtpps_reg, in, pin, pin); + MLX5_SET(mtpps_reg, in, pin_mode, pin_mode); + MLX5_SET(mtpps_reg, in, pattern, pattern); + MLX5_SET(mtpps_reg, in, enable, on); + MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp); + MLX5_SET(mtpps_reg, in, field_select, field_select); + MLX5_SET64(mtpps_reg, in, npps_period, npps_period); + MLX5_SET(mtpps_reg, in, out_pulse_duration_ns, out_pulse_duration_ns); + err = mlx5_set_mtpps(mdev, in, sizeof(in)); + if (err) + return err; + + if (rt_mode) + return 0; + + return mlx5_set_mtppse(mdev, pin, 0, + MLX5_EVENT_MODE_REPETETIVE & on); +} + +static int mlx5_pps_configure(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + struct mlx5_clock *clock = + container_of(ptp, struct mlx5_clock, ptp_info); + + clock->pps_info.enabled = !!on; + return 0; +} + +static int mlx5_ptp_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, + int on) +{ + switch (rq->type) { + case PTP_CLK_REQ_EXTTS: + return mlx5_extts_configure(ptp, rq, on); + case PTP_CLK_REQ_PEROUT: + return mlx5_perout_configure(ptp, rq, on); + case PTP_CLK_REQ_PPS: + return mlx5_pps_configure(ptp, rq, on); + default: + return -EOPNOTSUPP; + } + return 0; +} + +enum { + MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_IN = BIT(0), + MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_OUT = BIT(1), +}; + +static int mlx5_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, + enum ptp_pin_function func, unsigned int chan) +{ + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, + ptp_info); + + switch (func) { + case PTP_PF_NONE: + return 0; + case PTP_PF_EXTTS: + return !(clock->pps_info.pin_caps[pin] & + MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_IN); + case PTP_PF_PEROUT: + return !(clock->pps_info.pin_caps[pin] & + MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_OUT); + default: + return -EOPNOTSUPP; + } +} + +static const struct ptp_clock_info mlx5_ptp_clock_info = { + .owner = THIS_MODULE, + .name = "mlx5_ptp", + .max_adj = 100000000, + .n_alarm = 0, + .n_ext_ts = 0, + .n_per_out = 0, + .n_pins = 0, + .pps = 0, + .adjfreq = mlx5_ptp_adjfreq, + .adjtime = mlx5_ptp_adjtime, + .gettimex64 = mlx5_ptp_gettimex, + .settime64 = mlx5_ptp_settime, + .enable = NULL, + .verify = NULL, +}; + +static int mlx5_query_mtpps_pin_mode(struct mlx5_core_dev *mdev, u8 pin, + u32 *mtpps, u32 mtpps_size) +{ + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {}; + + MLX5_SET(mtpps_reg, in, pin, pin); + + return mlx5_core_access_reg(mdev, in, sizeof(in), mtpps, + mtpps_size, MLX5_REG_MTPPS, 0, 0); +} + +static int mlx5_get_pps_pin_mode(struct mlx5_clock *clock, u8 pin) +{ + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); + + u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {}; + u8 mode; + int err; + + err = mlx5_query_mtpps_pin_mode(mdev, pin, out, sizeof(out)); + if (err || !MLX5_GET(mtpps_reg, out, enable)) + return PTP_PF_NONE; + + mode = MLX5_GET(mtpps_reg, out, pin_mode); + + if (mode == MLX5_PIN_MODE_IN) + return PTP_PF_EXTTS; + else if (mode == MLX5_PIN_MODE_OUT) + return PTP_PF_PEROUT; + + return PTP_PF_NONE; +} + +static void mlx5_init_pin_config(struct mlx5_clock *clock) +{ + int i; + + if (!clock->ptp_info.n_pins) + return; + + clock->ptp_info.pin_config = + 
kcalloc(clock->ptp_info.n_pins, + sizeof(*clock->ptp_info.pin_config), + GFP_KERNEL); + if (!clock->ptp_info.pin_config) + return; + clock->ptp_info.enable = mlx5_ptp_enable; + clock->ptp_info.verify = mlx5_ptp_verify; + clock->ptp_info.pps = 1; + + for (i = 0; i < clock->ptp_info.n_pins; i++) { + snprintf(clock->ptp_info.pin_config[i].name, + sizeof(clock->ptp_info.pin_config[i].name), + "mlx5_pps%d", i); + clock->ptp_info.pin_config[i].index = i; + clock->ptp_info.pin_config[i].func = mlx5_get_pps_pin_mode(clock, i); + clock->ptp_info.pin_config[i].chan = 0; + } +} + +static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + + mlx5_query_mtpps(mdev, out, sizeof(out)); + + clock->ptp_info.n_pins = MLX5_GET(mtpps_reg, out, + cap_number_of_pps_pins); + clock->ptp_info.n_ext_ts = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_in_pins); + clock->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out, + cap_max_num_of_pps_out_pins); + + if (MLX5_CAP_MCAM_FEATURE(mdev, npps_period)) + clock->pps_info.min_npps_period = 1 << MLX5_GET(mtpps_reg, out, + cap_log_min_npps_period); + if (MLX5_CAP_MCAM_FEATURE(mdev, out_pulse_duration_ns)) + clock->pps_info.min_out_pulse_duration_ns = 1 << MLX5_GET(mtpps_reg, out, + cap_log_min_out_pulse_duration_ns); + + clock->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode); + clock->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode); + clock->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode); + clock->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode); + clock->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode); + clock->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode); + clock->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode); + clock->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode); +} + +static void ts_next_sec(struct timespec64 *ts) +{ + ts->tv_sec += 1; + ts->tv_nsec = 0; +} + +static u64 perout_conf_next_event_timer(struct mlx5_core_dev *mdev, + struct mlx5_clock *clock) +{ + struct timespec64 ts; + s64 target_ns; + + mlx5_ptp_gettimex(&clock->ptp_info, &ts, NULL); + ts_next_sec(&ts); + target_ns = timespec64_to_ns(&ts); + + return find_target_cycles(mdev, target_ns); +} + +static int mlx5_pps_event(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_clock *clock = mlx5_nb_cof(nb, struct mlx5_clock, pps_nb); + struct ptp_clock_event ptp_event; + struct mlx5_eqe *eqe = data; + int pin = eqe->data.pps.pin; + struct mlx5_core_dev *mdev; + unsigned long flags; + u64 ns; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + + switch (clock->ptp_info.pin_config[pin].func) { + case PTP_PF_EXTTS: + ptp_event.index = pin; + ptp_event.timestamp = mlx5_real_time_mode(mdev) ? 
+ mlx5_real_time_cyc2time(clock, + be64_to_cpu(eqe->data.pps.time_stamp)) : + mlx5_timecounter_cyc2time(clock, + be64_to_cpu(eqe->data.pps.time_stamp)); + if (clock->pps_info.enabled) { + ptp_event.type = PTP_CLOCK_PPSUSR; + ptp_event.pps_times.ts_real = + ns_to_timespec64(ptp_event.timestamp); + } else { + ptp_event.type = PTP_CLOCK_EXTTS; + } + /* TODOL clock->ptp can be NULL if ptp_clock_register fails */ + ptp_clock_event(clock->ptp, &ptp_event); + break; + case PTP_PF_PEROUT: + ns = perout_conf_next_event_timer(mdev, clock); + write_seqlock_irqsave(&clock->lock, flags); + clock->pps_info.start[pin] = ns; + write_sequnlock_irqrestore(&clock->lock, flags); + schedule_work(&clock->pps_info.out_work); + break; + default: + mlx5_core_err(mdev, " Unhandled clock PPS event, func %d\n", + clock->ptp_info.pin_config[pin].func); + } + + return NOTIFY_OK; +} + +static void mlx5_timecounter_init(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + struct mlx5_timer *timer = &clock->timer; + u32 dev_freq; + + dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz); + timer->cycles.read = read_internal_timer; + timer->cycles.shift = MLX5_CYCLES_SHIFT; + timer->cycles.mult = clocksource_khz2mult(dev_freq, + timer->cycles.shift); + timer->nominal_c_mult = timer->cycles.mult; + timer->cycles.mask = CLOCKSOURCE_MASK(41); + + timecounter_init(&timer->tc, &timer->cycles, + ktime_to_ns(ktime_get_real())); +} + +static void mlx5_init_overflow_period(struct mlx5_clock *clock) +{ + struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev, clock); + struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_timer *timer = &clock->timer; + u64 overflow_cycles; + u64 frac = 0; + u64 ns; + + /* Calculate period in seconds to call the overflow watchdog - to make + * sure counter is checked at least twice every wrap around. + * The period is calculated as the minimum between max HW cycles count + * (The clock source mask) and max amount of cycles that can be + * multiplied by clock multiplier where the result doesn't exceed + * 64bits. 
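+	 * The resulting cycle budget is converted to nanoseconds and then to
+	 * jiffies; if that yields zero, the watchdog is left unscheduled and
+	 * a warning is printed instead.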
+ */ + overflow_cycles = div64_u64(~0ULL >> 1, timer->cycles.mult); + overflow_cycles = min(overflow_cycles, div_u64(timer->cycles.mask, 3)); + + ns = cyclecounter_cyc2ns(&timer->cycles, overflow_cycles, + frac, &frac); + do_div(ns, NSEC_PER_SEC / HZ); + timer->overflow_period = ns; + + INIT_DELAYED_WORK(&timer->overflow_work, mlx5_timestamp_overflow); + if (timer->overflow_period) + schedule_delayed_work(&timer->overflow_work, 0); + else + mlx5_core_warn(mdev, + "invalid overflow period, overflow_work is not scheduled\n"); + + if (clock_info) + clock_info->overflow_period = timer->overflow_period; +} + +static void mlx5_init_clock_info(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + struct mlx5_ib_clock_info *info; + struct mlx5_timer *timer; + + mdev->clock_info = (struct mlx5_ib_clock_info *)get_zeroed_page(GFP_KERNEL); + if (!mdev->clock_info) { + mlx5_core_warn(mdev, "Failed to allocate IB clock info page\n"); + return; + } + + info = mdev->clock_info; + timer = &clock->timer; + + info->nsec = timer->tc.nsec; + info->cycles = timer->tc.cycle_last; + info->mask = timer->cycles.mask; + info->mult = timer->nominal_c_mult; + info->shift = timer->cycles.shift; + info->frac = timer->tc.frac; +} + +static void mlx5_init_timer_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + mlx5_timecounter_init(mdev); + mlx5_init_clock_info(mdev); + mlx5_init_overflow_period(clock); + clock->ptp_info = mlx5_ptp_clock_info; + + if (mlx5_real_time_mode(mdev)) { + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + mlx5_ptp_settime(&clock->ptp_info, &ts); + } +} + +static void mlx5_init_pps(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + if (!MLX5_PPS_CAP(mdev)) + return; + + mlx5_get_pps_caps(mdev); + mlx5_init_pin_config(clock); +} + +void mlx5_init_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) { + mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n"); + return; + } + + seqlock_init(&clock->lock); + mlx5_init_timer_clock(mdev); + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); + + /* Configure the PHC */ + clock->ptp_info = mlx5_ptp_clock_info; + + /* Initialize 1PPS data structures */ + mlx5_init_pps(mdev); + + clock->ptp = ptp_clock_register(&clock->ptp_info, + &mdev->pdev->dev); + if (IS_ERR(clock->ptp)) { + mlx5_core_warn(mdev, "ptp_clock_register failed %ld\n", + PTR_ERR(clock->ptp)); + clock->ptp = NULL; + } + + MLX5_NB_INIT(&clock->pps_nb, mlx5_pps_event, PPS_EVENT); + mlx5_eq_notifier_register(mdev, &clock->pps_nb); +} + +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) +{ + struct mlx5_clock *clock = &mdev->clock; + + if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) + return; + + mlx5_eq_notifier_unregister(mdev, &clock->pps_nb); + if (clock->ptp) { + ptp_clock_unregister(clock->ptp); + clock->ptp = NULL; + } + + cancel_work_sync(&clock->pps_info.out_work); + cancel_delayed_work_sync(&clock->timer.overflow_work); + + if (mdev->clock_info) { + free_page((unsigned long)mdev->clock_info); + mdev->clock_info = NULL; + } + + kfree(clock->ptp_info.pin_config); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h new file mode 100644 index 0000000..bd95b9f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h @@ -0,0 +1,121 @@ +/* + * Copyright 
(c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_CLOCK_H__ +#define __LIB_CLOCK_H__ + +static inline bool mlx5_is_real_time_rq(struct mlx5_core_dev *mdev) +{ + u8 rq_ts_format_cap = MLX5_CAP_GEN(mdev, rq_ts_format); + + return (rq_ts_format_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME || + rq_ts_format_cap == + MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME); +} + +static inline bool mlx5_is_real_time_sq(struct mlx5_core_dev *mdev) +{ + u8 sq_ts_format_cap = MLX5_CAP_GEN(mdev, sq_ts_format); + + return (sq_ts_format_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME || + sq_ts_format_cap == + MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME); +} + +typedef ktime_t (*cqe_ts_to_ns)(struct mlx5_clock *, u64); + +#if IS_ENABLED(CONFIG_PTP_1588_CLOCK) +void mlx5_init_clock(struct mlx5_core_dev *mdev); +void mlx5_cleanup_clock(struct mlx5_core_dev *mdev); + +static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) +{ + return mdev->clock.ptp ? 
ptp_clock_index(mdev->clock.ptp) : -1; +} + +static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + struct mlx5_timer *timer = &clock->timer; + unsigned int seq; + u64 nsec; + + do { + seq = read_seqbegin(&clock->lock); + nsec = timecounter_cyc2time(&timer->tc, timestamp); + } while (read_seqretry(&clock->lock, seq)); + + return ns_to_ktime(nsec); +} + +#define REAL_TIME_TO_NS(hi, low) (((u64)hi) * NSEC_PER_SEC + ((u64)low)) + +static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + u64 time = REAL_TIME_TO_NS(timestamp >> 32, timestamp & 0xFFFFFFFF); + + return ns_to_ktime(time); +} +#else +static inline void mlx5_init_clock(struct mlx5_core_dev *mdev) {} +static inline void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) {} +static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev) +{ + return -1; +} + +static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + return 0; +} + +static inline ktime_t mlx5_real_time_cyc2time(struct mlx5_clock *clock, + u64 timestamp) +{ + return 0; +} +#endif + +static inline cqe_ts_to_ns mlx5_rq_ts_translator(struct mlx5_core_dev *mdev) +{ + return mlx5_is_real_time_rq(mdev) ? mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time; +} + +static inline cqe_ts_to_ns mlx5_sq_ts_translator(struct mlx5_core_dev *mdev) +{ + return mlx5_is_real_time_sq(mdev) ? mlx5_real_time_cyc2time : + mlx5_timecounter_cyc2time; +} +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c new file mode 100644 index 0000000..e995f83 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2019 Mellanox Technologies. 
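+//
+// Encryption key object helpers: keys are created and destroyed through the
+// CREATE/DESTROY_GENERAL_OBJECT commands, only 128-bit and 256-bit keys are
+// accepted, and the key material is wiped from the command stack buffer once
+// the firmware call completes.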
+ +#include "mlx5_core.h" +#include "lib/mlx5.h" + +int mlx5_create_encryption_key(struct mlx5_core_dev *mdev, + void *key, u32 sz_bytes, + u32 key_type, u32 *p_key_id) +{ + u32 in[MLX5_ST_SZ_DW(create_encryption_key_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + u32 sz_bits = sz_bytes * BITS_PER_BYTE; + u8 general_obj_key_size; + u64 general_obj_types; + void *obj, *key_p; + int err; + + obj = MLX5_ADDR_OF(create_encryption_key_in, in, encryption_key_object); + key_p = MLX5_ADDR_OF(encryption_key_obj, obj, key); + + general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types); + if (!(general_obj_types & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY)) + return -EINVAL; + + switch (sz_bits) { + case 128: + general_obj_key_size = + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_128; + key_p += sz_bytes; + break; + case 256: + general_obj_key_size = + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_256; + break; + default: + return -EINVAL; + } + + memcpy(key_p, key, sz_bytes); + + MLX5_SET(encryption_key_obj, obj, key_size, general_obj_key_size); + MLX5_SET(encryption_key_obj, obj, key_type, key_type); + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY); + MLX5_SET(encryption_key_obj, obj, pd, mdev->mlx5e_res.hw_objs.pdn); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (!err) + *p_key_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + /* avoid leaking key on the stack */ + memzero_explicit(in, sizeof(in)); + + return err; +} + +void mlx5_destroy_encryption_key(struct mlx5_core_dev *mdev, u32 key_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, key_id); + + mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c new file mode 100644 index 0000000..1c1a76d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2018 Mellanox Technologies */ + +#include +#include "lib/devcom.h" +#include "lag/lag.h" + +static LIST_HEAD(devcom_list); + +#define devcom_for_each_component(priv, comp, iter) \ + for (iter = 0; \ + comp = &(priv)->components[iter], iter < MLX5_DEVCOM_NUM_COMPONENTS; \ + iter++) + +struct mlx5_devcom_component { + struct { + void *data; + } device[MLX5_DEVCOM_PORTS_SUPPORTED]; + + mlx5_devcom_event_handler_t handler; + struct rw_semaphore sem; + bool paired; +}; + +struct mlx5_devcom_list { + struct list_head list; + + struct mlx5_devcom_component components[MLX5_DEVCOM_NUM_COMPONENTS]; + struct mlx5_core_dev *devs[MLX5_DEVCOM_PORTS_SUPPORTED]; +}; + +struct mlx5_devcom { + struct mlx5_devcom_list *priv; + int idx; +}; + +static struct mlx5_devcom_list *mlx5_devcom_list_alloc(void) +{ + struct mlx5_devcom_component *comp; + struct mlx5_devcom_list *priv; + int i; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + devcom_for_each_component(priv, comp, i) + init_rwsem(&comp->sem); + + return priv; +} 
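+
+/* Allocate the per-device devcom handle; it records the shared component
+ * list (priv) and the slot (idx) this device occupies within it.
+ */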
+ +static struct mlx5_devcom *mlx5_devcom_alloc(struct mlx5_devcom_list *priv, + u8 idx) +{ + struct mlx5_devcom *devcom; + + devcom = kzalloc(sizeof(*devcom), GFP_KERNEL); + if (!devcom) + return NULL; + + devcom->priv = priv; + devcom->idx = idx; + return devcom; +} + +/* Must be called with intf_mutex held */ +struct mlx5_devcom *mlx5_devcom_register_device(struct mlx5_core_dev *dev) +{ + struct mlx5_devcom_list *priv = NULL, *iter; + struct mlx5_devcom *devcom = NULL; + bool new_priv = false; + u64 sguid0, sguid1; + int idx, i; + + if (!mlx5_core_is_pf(dev)) + return NULL; + + if (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_DEVCOM_PORTS_SUPPORTED) + return NULL; + + if (!mlx5_lag_is_supported(dev)) + return NULL; + + sguid0 = mlx5_query_nic_system_image_guid(dev); + list_for_each_entry(iter, &devcom_list, list) { + struct mlx5_core_dev *tmp_dev = NULL; + + idx = -1; + for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++) { + if (iter->devs[i]) + tmp_dev = iter->devs[i]; + else + idx = i; + } + + if (idx == -1) + continue; + + sguid1 = mlx5_query_nic_system_image_guid(tmp_dev); + if (sguid0 != sguid1) + continue; + + priv = iter; + break; + } + + if (!priv) { + priv = mlx5_devcom_list_alloc(); + if (!priv) + return ERR_PTR(-ENOMEM); + + idx = 0; + new_priv = true; + } + + priv->devs[idx] = dev; + devcom = mlx5_devcom_alloc(priv, idx); + if (!devcom) { + kfree(priv); + return ERR_PTR(-ENOMEM); + } + + if (new_priv) + list_add(&priv->list, &devcom_list); + + return devcom; +} + +/* Must be called with intf_mutex held */ +void mlx5_devcom_unregister_device(struct mlx5_devcom *devcom) +{ + struct mlx5_devcom_list *priv; + int i; + + if (IS_ERR_OR_NULL(devcom)) + return; + + priv = devcom->priv; + priv->devs[devcom->idx] = NULL; + + kfree(devcom); + + for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++) + if (priv->devs[i]) + break; + + if (i != MLX5_DEVCOM_PORTS_SUPPORTED) + return; + + list_del(&priv->list); + kfree(priv); +} + +void mlx5_devcom_register_component(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id, + mlx5_devcom_event_handler_t handler, + void *data) +{ + struct mlx5_devcom_component *comp; + + if (IS_ERR_OR_NULL(devcom)) + return; + + WARN_ON(!data); + + comp = &devcom->priv->components[id]; + down_write(&comp->sem); + comp->handler = handler; + comp->device[devcom->idx].data = data; + up_write(&comp->sem); +} + +void mlx5_devcom_unregister_component(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id) +{ + struct mlx5_devcom_component *comp; + + if (IS_ERR_OR_NULL(devcom)) + return; + + comp = &devcom->priv->components[id]; + down_write(&comp->sem); + comp->device[devcom->idx].data = NULL; + up_write(&comp->sem); +} + +int mlx5_devcom_send_event(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id, + int event, + void *event_data) +{ + struct mlx5_devcom_component *comp; + int err = -ENODEV, i; + + if (IS_ERR_OR_NULL(devcom)) + return err; + + comp = &devcom->priv->components[id]; + down_write(&comp->sem); + for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++) + if (i != devcom->idx && comp->device[i].data) { + err = comp->handler(event, comp->device[i].data, + event_data); + break; + } + + up_write(&comp->sem); + return err; +} + +void mlx5_devcom_set_paired(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id, + bool paired) +{ + struct mlx5_devcom_component *comp; + + comp = &devcom->priv->components[id]; + WARN_ON(!rwsem_is_locked(&comp->sem)); + + comp->paired = paired; +} + +bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom, + enum 
mlx5_devcom_components id) +{ + if (IS_ERR_OR_NULL(devcom)) + return false; + + return devcom->priv->components[id].paired; +} + +void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id) +{ + struct mlx5_devcom_component *comp; + int i; + + if (IS_ERR_OR_NULL(devcom)) + return NULL; + + comp = &devcom->priv->components[id]; + down_read(&comp->sem); + if (!comp->paired) { + up_read(&comp->sem); + return NULL; + } + + for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++) + if (i != devcom->idx) + break; + + return comp->device[i].data; +} + +void mlx5_devcom_release_peer_data(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id) +{ + struct mlx5_devcom_component *comp = &devcom->priv->components[id]; + + up_read(&comp->sem); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h new file mode 100644 index 0000000..94313c1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018 Mellanox Technologies */ + +#ifndef __LIB_MLX5_DEVCOM_H__ +#define __LIB_MLX5_DEVCOM_H__ + +#include + +#define MLX5_DEVCOM_PORTS_SUPPORTED 2 + +enum mlx5_devcom_components { + MLX5_DEVCOM_ESW_OFFLOADS, + + MLX5_DEVCOM_NUM_COMPONENTS, +}; + +typedef int (*mlx5_devcom_event_handler_t)(int event, + void *my_data, + void *event_data); + +struct mlx5_devcom *mlx5_devcom_register_device(struct mlx5_core_dev *dev); +void mlx5_devcom_unregister_device(struct mlx5_devcom *devcom); + +void mlx5_devcom_register_component(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id, + mlx5_devcom_event_handler_t handler, + void *data); +void mlx5_devcom_unregister_component(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id); + +int mlx5_devcom_send_event(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id, + int event, + void *event_data); + +void mlx5_devcom_set_paired(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id, + bool paired); +bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id); + +void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id); +void mlx5_devcom_release_peer_data(struct mlx5_devcom *devcom, + enum mlx5_devcom_components id); + +#endif + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c new file mode 100644 index 0000000..2417586 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/dm.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2019 Mellanox Technologies + +#include +#include + +#include "mlx5_core.h" +#include "lib/mlx5.h" + +struct mlx5_dm { + /* protect access to icm bitmask */ + spinlock_t lock; + unsigned long *steering_sw_icm_alloc_blocks; + unsigned long *header_modify_sw_icm_alloc_blocks; + unsigned long *header_modify_pattern_sw_icm_alloc_blocks; + unsigned long *header_encap_sw_icm_alloc_blocks; +}; + +struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev) +{ + u64 header_modify_pattern_icm_blocks = 0; + u64 header_sw_encap_icm_blocks = 0; + u64 header_modify_icm_blocks = 0; + u64 steering_icm_blocks = 0; + struct mlx5_dm *dm; + bool support_v2; + + if (!(MLX5_CAP_GEN_64(dev, general_obj_types) & 
MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)) + return NULL; + + dm = kzalloc(sizeof(*dm), GFP_KERNEL); + if (!dm) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&dm->lock); + + if (MLX5_CAP64_DEV_MEM(dev, steering_sw_icm_start_address)) { + steering_icm_blocks = + BIT(MLX5_CAP_DEV_MEM(dev, log_steering_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + dm->steering_sw_icm_alloc_blocks = + kcalloc(BITS_TO_LONGS(steering_icm_blocks), + sizeof(unsigned long), GFP_KERNEL); + if (!dm->steering_sw_icm_alloc_blocks) + goto err_steering; + } + + if (MLX5_CAP64_DEV_MEM(dev, header_modify_sw_icm_start_address)) { + header_modify_icm_blocks = + BIT(MLX5_CAP_DEV_MEM(dev, log_header_modify_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + dm->header_modify_sw_icm_alloc_blocks = + kcalloc(BITS_TO_LONGS(header_modify_icm_blocks), + sizeof(unsigned long), GFP_KERNEL); + if (!dm->header_modify_sw_icm_alloc_blocks) + goto err_modify_hdr; + } + + if (MLX5_CAP_DEV_MEM(dev, log_indirect_encap_sw_icm_size)) { + header_sw_encap_icm_blocks = + BIT(MLX5_CAP_DEV_MEM(dev, log_indirect_encap_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + dm->header_encap_sw_icm_alloc_blocks = + kcalloc(BITS_TO_LONGS(header_sw_encap_icm_blocks), + sizeof(unsigned long), GFP_KERNEL); + if (!dm->header_encap_sw_icm_alloc_blocks) + goto err_sw_encap; + } + + support_v2 = MLX5_CAP_FLOWTABLE_NIC_RX(dev, sw_owner_v2) && + MLX5_CAP_FLOWTABLE_NIC_TX(dev, sw_owner_v2); + + if (support_v2 && + MLX5_CAP64_DEV_MEM(dev, header_modify_pattern_sw_icm_start_address)) { + header_modify_pattern_icm_blocks = + BIT(MLX5_CAP_DEV_MEM(dev, log_header_modify_pattern_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + dm->header_modify_pattern_sw_icm_alloc_blocks = + kcalloc(BITS_TO_LONGS(header_modify_pattern_icm_blocks), + sizeof(unsigned long), GFP_KERNEL); + if (!dm->header_modify_pattern_sw_icm_alloc_blocks) + goto err_pattern; + } + + return dm; + +err_pattern: + kfree(dm->header_encap_sw_icm_alloc_blocks); + +err_sw_encap: + kfree(dm->header_modify_sw_icm_alloc_blocks); + +err_modify_hdr: + kfree(dm->steering_sw_icm_alloc_blocks); + +err_steering: + kfree(dm); + + return ERR_PTR(-ENOMEM); +} + +void mlx5_dm_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_dm *dm = dev->dm; + + if (!dev->dm) + return; + + if (dm->steering_sw_icm_alloc_blocks) { + WARN_ON(!bitmap_empty(dm->steering_sw_icm_alloc_blocks, + BIT(MLX5_CAP_DEV_MEM(dev, log_steering_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)))); + kfree(dm->steering_sw_icm_alloc_blocks); + } + + if (dm->header_modify_sw_icm_alloc_blocks) { + WARN_ON(!bitmap_empty(dm->header_modify_sw_icm_alloc_blocks, + BIT(MLX5_CAP_DEV_MEM(dev, + log_header_modify_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)))); + kfree(dm->header_modify_sw_icm_alloc_blocks); + } + + if (dm->header_encap_sw_icm_alloc_blocks) { + WARN_ON(!bitmap_empty(dm->header_encap_sw_icm_alloc_blocks, + BIT(MLX5_CAP_DEV_MEM(dev, + log_indirect_encap_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)))); + kfree(dm->header_encap_sw_icm_alloc_blocks); + } + + if (dm->header_modify_pattern_sw_icm_alloc_blocks) { + WARN_ON(!bitmap_empty(dm->header_modify_pattern_sw_icm_alloc_blocks, + BIT(MLX5_CAP_DEV_MEM(dev, + log_header_modify_pattern_sw_icm_size) - + MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)))); + kfree(dm->header_modify_pattern_sw_icm_alloc_blocks); + } + + kfree(dm); +} + +int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, + u64 length, u32 log_alignment, u16 uid, + phys_addr_t *addr, u32 *obj_id) +{ + u32 
num_blocks = DIV_ROUND_UP_ULL(length, MLX5_SW_ICM_BLOCK_SIZE(dev)); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + u32 in[MLX5_ST_SZ_DW(create_sw_icm_in)] = {}; + struct mlx5_dm *dm = dev->dm; + unsigned long *block_map; + u64 icm_start_addr; + u32 log_icm_size; + u64 align_mask; + u32 max_blocks; + u64 block_idx; + void *sw_icm; + int ret; + + if (!dev->dm) + return -EOPNOTSUPP; + + if (!length || (length & (length - 1)) || + length & (MLX5_SW_ICM_BLOCK_SIZE(dev) - 1)) + return -EINVAL; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_SW_ICM); + MLX5_SET(general_obj_in_cmd_hdr, in, uid, uid); + + switch (type) { + case MLX5_SW_ICM_TYPE_STEERING: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, steering_sw_icm_start_address); + log_icm_size = MLX5_CAP_DEV_MEM(dev, log_steering_sw_icm_size); + block_map = dm->steering_sw_icm_alloc_blocks; + break; + case MLX5_SW_ICM_TYPE_HEADER_MODIFY: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, header_modify_sw_icm_start_address); + log_icm_size = MLX5_CAP_DEV_MEM(dev, + log_header_modify_sw_icm_size); + block_map = dm->header_modify_sw_icm_alloc_blocks; + break; + case MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, + header_modify_pattern_sw_icm_start_address); + log_icm_size = MLX5_CAP_DEV_MEM(dev, + log_header_modify_pattern_sw_icm_size); + block_map = dm->header_modify_pattern_sw_icm_alloc_blocks; + break; + case MLX5_SW_ICM_TYPE_SW_ENCAP: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, + indirect_encap_sw_icm_start_address); + log_icm_size = MLX5_CAP_DEV_MEM(dev, + log_indirect_encap_sw_icm_size); + block_map = dm->header_encap_sw_icm_alloc_blocks; + break; + default: + return -EINVAL; + } + + if (!block_map) + return -EOPNOTSUPP; + + max_blocks = BIT(log_icm_size - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)); + + if (log_alignment < MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) + log_alignment = MLX5_LOG_SW_ICM_BLOCK_SIZE(dev); + align_mask = BIT(log_alignment - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) - 1; + + spin_lock(&dm->lock); + block_idx = bitmap_find_next_zero_area(block_map, max_blocks, 0, + num_blocks, align_mask); + + if (block_idx < max_blocks) + bitmap_set(block_map, + block_idx, num_blocks); + + spin_unlock(&dm->lock); + + if (block_idx >= max_blocks) + return -ENOMEM; + + sw_icm = MLX5_ADDR_OF(create_sw_icm_in, in, sw_icm); + icm_start_addr += block_idx << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev); + MLX5_SET64(sw_icm, sw_icm, sw_icm_start_addr, + icm_start_addr); + MLX5_SET(sw_icm, sw_icm, log_sw_icm_size, ilog2(length)); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) { + spin_lock(&dm->lock); + bitmap_clear(block_map, + block_idx, num_blocks); + spin_unlock(&dm->lock); + + return ret; + } + + *addr = icm_start_addr; + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_dm_sw_icm_alloc); + +int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, + u64 length, u16 uid, phys_addr_t addr, u32 obj_id) +{ + u32 num_blocks = DIV_ROUND_UP_ULL(length, MLX5_SW_ICM_BLOCK_SIZE(dev)); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + struct mlx5_dm *dm = dev->dm; + unsigned long *block_map; + u64 icm_start_addr; + u64 start_idx; + int err; + + if (!dev->dm) + return -EOPNOTSUPP; + + switch (type) { + case MLX5_SW_ICM_TYPE_STEERING: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, 
steering_sw_icm_start_address); + block_map = dm->steering_sw_icm_alloc_blocks; + break; + case MLX5_SW_ICM_TYPE_HEADER_MODIFY: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, header_modify_sw_icm_start_address); + block_map = dm->header_modify_sw_icm_alloc_blocks; + break; + case MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, + header_modify_pattern_sw_icm_start_address); + block_map = dm->header_modify_pattern_sw_icm_alloc_blocks; + break; + case MLX5_SW_ICM_TYPE_SW_ENCAP: + icm_start_addr = MLX5_CAP64_DEV_MEM(dev, + indirect_encap_sw_icm_start_address); + block_map = dm->header_encap_sw_icm_alloc_blocks; + break; + default: + return -EINVAL; + } + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_SW_ICM); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id); + MLX5_SET(general_obj_in_cmd_hdr, in, uid, uid); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + start_idx = (addr - icm_start_addr) >> MLX5_LOG_SW_ICM_BLOCK_SIZE(dev); + spin_lock(&dm->lock); + bitmap_clear(block_map, + start_idx, num_blocks); + spin_unlock(&dm->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_dm_sw_icm_dealloc); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h new file mode 100644 index 0000000..0bd8c2d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2018-2021, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __LIB_MLX5_EQ_H__ +#define __LIB_MLX5_EQ_H__ +#include +#include +#include + +#define MLX5_EQE_SIZE (sizeof(struct mlx5_eqe)) + +struct mlx5_eq_tasklet { + struct list_head list; + struct list_head process_list; + struct tasklet_struct task; + spinlock_t lock; /* lock completion tasklet list */ +}; + +struct mlx5_cq_table { + spinlock_t lock; /* protect radix tree */ + struct radix_tree_root tree; +}; + +struct mlx5_eq { + struct mlx5_frag_buf_ctrl fbc; + struct mlx5_frag_buf frag_buf; + struct mlx5_core_dev *dev; + struct mlx5_cq_table cq_table; + __be32 __iomem *doorbell; + u32 cons_index; + unsigned int vecidx; + unsigned int irqn; + u8 eqn; + struct mlx5_rsc_debug *dbg; + struct mlx5_irq *irq; +}; + +struct mlx5_eq_async { + struct mlx5_eq core; + struct notifier_block irq_nb; + spinlock_t lock; /* To avoid irq EQ handle races with resiliency flows */ +}; + +struct mlx5_eq_comp { + struct mlx5_eq core; + struct notifier_block irq_nb; + struct mlx5_eq_tasklet tasklet_ctx; + struct list_head list; +}; + +static inline u32 eq_get_size(struct mlx5_eq *eq) +{ + return eq->fbc.sz_m1 + 1; +} + +static inline struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry) +{ + return mlx5_frag_buf_get_wqe(&eq->fbc, entry); +} + +static inline struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq) +{ + struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & eq->fbc.sz_m1); + + return (eqe->owner ^ (eq->cons_index >> eq->fbc.log_sz)) & 1 ? NULL : eqe; +} + +static inline void eq_update_ci(struct mlx5_eq *eq, int arm) +{ + __be32 __iomem *addr = eq->doorbell + (arm ? 
0 : 2); + u32 val = (eq->cons_index & 0xffffff) | (eq->eqn << 24); + + __raw_writel((__force u32)cpu_to_be32(val), addr); + /* We still want ordering, just not swabbing, so add a barrier */ + mb(); +} + +int mlx5_eq_table_init(struct mlx5_core_dev *dev); +void mlx5_eq_table_cleanup(struct mlx5_core_dev *dev); +int mlx5_eq_table_create(struct mlx5_core_dev *dev); +void mlx5_eq_table_destroy(struct mlx5_core_dev *dev); + +int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq); +void mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq); +struct mlx5_eq_comp *mlx5_eqn2comp_eq(struct mlx5_core_dev *dev, int eqn); +struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev); +void mlx5_cq_tasklet_cb(struct tasklet_struct *t); +struct cpumask *mlx5_eq_comp_cpumask(struct mlx5_core_dev *dev, int ix); + +u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq_comp *eq); +void mlx5_cmd_eq_recover(struct mlx5_core_dev *dev); +int mlx5_vector2eq(struct mlx5_core_dev *dev, int vector, struct mlx5_eq_comp *eqc); +void mlx5_eq_synchronize_async_irq(struct mlx5_core_dev *dev); +void mlx5_eq_synchronize_cmd_irq(struct mlx5_core_dev *dev); + +int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); +void mlx5_eq_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); + +/* This function should only be called after mlx5_cmd_force_teardown_hca */ +void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev); +#ifdef CONFIG_RFS_ACCEL +struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev); +#endif + +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c new file mode 100644 index 0000000..df58cba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -0,0 +1,810 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2020 Mellanox Technologies. + +#include +#include +#include + +#include "lib/fs_chains.h" +#include "fs_ft_pool.h" +#include "en/mapping.h" +#include "fs_core.h" +#include "en_tc.h" + +#define chains_lock(chains) ((chains)->lock) +#define chains_ht(chains) ((chains)->chains_ht) +#define prios_ht(chains) ((chains)->prios_ht) +#define tc_default_ft(chains) ((chains)->tc_default_ft) +#define tc_end_ft(chains) ((chains)->tc_end_ft) +#define ns_to_chains_fs_prio(ns) ((ns) == MLX5_FLOW_NAMESPACE_FDB ? 
\ + FDB_TC_OFFLOAD : MLX5E_TC_PRIO) +#define FT_TBL_SZ (64 * 1024) + +struct mlx5_fs_chains { + struct mlx5_core_dev *dev; + + struct rhashtable chains_ht; + struct rhashtable prios_ht; + /* Protects above chains_ht and prios_ht */ + struct mutex lock; + + struct mlx5_flow_table *tc_default_ft; + struct mlx5_flow_table *tc_end_ft; + struct mapping_ctx *chains_mapping; + + enum mlx5_flow_namespace_type ns; + u32 group_num; + u32 flags; +}; + +struct fs_chain { + struct rhash_head node; + + u32 chain; + + int ref; + int id; + + struct mlx5_fs_chains *chains; + struct list_head prios_list; + struct mlx5_flow_handle *restore_rule; + struct mlx5_modify_hdr *miss_modify_hdr; +}; + +struct prio_key { + u32 chain; + u32 prio; + u32 level; +}; + +struct prio { + struct rhash_head node; + struct list_head list; + + struct prio_key key; + + int ref; + + struct fs_chain *chain; + struct mlx5_flow_table *ft; + struct mlx5_flow_table *next_ft; + struct mlx5_flow_group *miss_group; + struct mlx5_flow_handle *miss_rule; +}; + +static const struct rhashtable_params chain_params = { + .head_offset = offsetof(struct fs_chain, node), + .key_offset = offsetof(struct fs_chain, chain), + .key_len = sizeof_field(struct fs_chain, chain), + .automatic_shrinking = true, +}; + +static const struct rhashtable_params prio_params = { + .head_offset = offsetof(struct prio, node), + .key_offset = offsetof(struct prio, key), + .key_len = sizeof_field(struct prio, key), + .automatic_shrinking = true, +}; + +bool mlx5_chains_prios_supported(struct mlx5_fs_chains *chains) +{ + return chains->flags & MLX5_CHAINS_AND_PRIOS_SUPPORTED; +} + +bool mlx5_chains_ignore_flow_level_supported(struct mlx5_fs_chains *chains) +{ + return chains->flags & MLX5_CHAINS_IGNORE_FLOW_LEVEL_SUPPORTED; +} + +bool mlx5_chains_backwards_supported(struct mlx5_fs_chains *chains) +{ + return mlx5_chains_prios_supported(chains) && + mlx5_chains_ignore_flow_level_supported(chains); +} + +u32 mlx5_chains_get_chain_range(struct mlx5_fs_chains *chains) +{ + if (!mlx5_chains_prios_supported(chains)) + return 1; + + if (mlx5_chains_ignore_flow_level_supported(chains)) + return UINT_MAX - 1; + + /* We should get here only for eswitch case */ + return FDB_TC_MAX_CHAIN; +} + +u32 mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains) +{ + return mlx5_chains_get_chain_range(chains) + 1; +} + +u32 mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains) +{ + if (mlx5_chains_ignore_flow_level_supported(chains)) + return UINT_MAX; + + if (!chains->dev->priv.eswitch || + chains->dev->priv.eswitch->mode != MLX5_ESWITCH_OFFLOADS) + return 1; + + /* We should get here only for eswitch case */ + return FDB_TC_MAX_PRIO; +} + +static unsigned int mlx5_chains_get_level_range(struct mlx5_fs_chains *chains) +{ + if (mlx5_chains_ignore_flow_level_supported(chains)) + return UINT_MAX; + + /* Same value for FDB and NIC RX tables */ + return FDB_TC_LEVELS_PER_PRIO; +} + +void +mlx5_chains_set_end_ft(struct mlx5_fs_chains *chains, + struct mlx5_flow_table *ft) +{ + tc_end_ft(chains) = ft; +} + +static struct mlx5_flow_table * +mlx5_chains_create_table(struct mlx5_fs_chains *chains, + u32 chain, u32 prio, u32 level) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + int sz; + + if (chains->flags & MLX5_CHAINS_FT_TUNNEL_SUPPORTED) + ft_attr.flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); + + sz = (chain == mlx5_chains_get_nf_ft_chain(chains)) ? 
FT_TBL_SZ : POOL_NEXT_SIZE; + ft_attr.max_fte = sz; + + /* We use tc_default_ft(chains) as the table's next_ft till + * ignore_flow_level is allowed on FT creation and not just for FTEs. + * Instead caller should add an explicit miss rule if needed. + */ + ft_attr.next_ft = tc_default_ft(chains); + + /* The root table(chain 0, prio 1, level 0) is required to be + * connected to the previous fs_core managed prio. + * We always create it, as a managed table, in order to align with + * fs_core logic. + */ + if (!mlx5_chains_ignore_flow_level_supported(chains) || + (chain == 0 && prio == 1 && level == 0)) { + ft_attr.level = level; + ft_attr.prio = prio - 1; + ns = (chains->ns == MLX5_FLOW_NAMESPACE_FDB) ? + mlx5_get_fdb_sub_ns(chains->dev, chain) : + mlx5_get_flow_namespace(chains->dev, chains->ns); + } else { + ft_attr.flags |= MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.prio = ns_to_chains_fs_prio(chains->ns); + /* Firmware doesn't allow us to create another level 0 table, + * so we create all unmanaged tables as level 1. + * + * To connect them, we use explicit miss rules with + * ignore_flow_level. Caller is responsible to create + * these rules (if needed). + */ + ft_attr.level = 1; + ns = mlx5_get_flow_namespace(chains->dev, chains->ns); + } + + ft_attr.autogroup.num_reserved_entries = 2; + ft_attr.autogroup.max_num_groups = chains->group_num; + ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + mlx5_core_warn(chains->dev, "Failed to create chains table err %d (chain: %d, prio: %d, level: %d, size: %d)\n", + (int)PTR_ERR(ft), chain, prio, level, sz); + return ft; + } + + return ft; +} + +static int +create_chain_restore(struct fs_chain *chain) +{ + struct mlx5_eswitch *esw = chain->chains->dev->priv.eswitch; + u8 modact[MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto)] = {}; + struct mlx5_fs_chains *chains = chain->chains; + enum mlx5e_tc_attr_to_reg chain_to_reg; + struct mlx5_modify_hdr *mod_hdr; + u32 index; + int err; + + if (chain->chain == mlx5_chains_get_nf_ft_chain(chains) || + !mlx5_chains_prios_supported(chains)) + return 0; + + err = mlx5_chains_get_chain_mapping(chains, chain->chain, &index); + if (err) + return err; + if (index == MLX5_FS_DEFAULT_FLOW_TAG) { + /* we got the special default flow tag id, so we won't know + * if we actually marked the packet with the restore rule + * we create. + * + * This case isn't possible with MLX5_FS_DEFAULT_FLOW_TAG = 0. + */ + err = mlx5_chains_get_chain_mapping(chains, chain->chain, &index); + mapping_remove(chains->chains_mapping, MLX5_FS_DEFAULT_FLOW_TAG); + if (err) + return err; + } + + chain->id = index; + + if (chains->ns == MLX5_FLOW_NAMESPACE_FDB) { + chain_to_reg = CHAIN_TO_REG; + chain->restore_rule = esw_add_restore_rule(esw, chain->id); + if (IS_ERR(chain->restore_rule)) { + err = PTR_ERR(chain->restore_rule); + goto err_rule; + } + } else if (chains->ns == MLX5_FLOW_NAMESPACE_KERNEL) { + /* For NIC RX we don't need a restore rule + * since we write the metadata to reg_b + * that is passed to SW directly. + */ + chain_to_reg = NIC_CHAIN_TO_REG; + } else { + err = -EINVAL; + goto err_rule; + } + + MLX5_SET(set_action_in, modact, action_type, MLX5_ACTION_TYPE_SET); + MLX5_SET(set_action_in, modact, field, + mlx5e_tc_attr_to_reg_mappings[chain_to_reg].mfield); + MLX5_SET(set_action_in, modact, offset, + mlx5e_tc_attr_to_reg_mappings[chain_to_reg].moffset); + MLX5_SET(set_action_in, modact, length, + mlx5e_tc_attr_to_reg_mappings[chain_to_reg].mlen == 32 ? 
+ 0 : mlx5e_tc_attr_to_reg_mappings[chain_to_reg].mlen); + MLX5_SET(set_action_in, modact, data, chain->id); + mod_hdr = mlx5_modify_header_alloc(chains->dev, chains->ns, + 1, modact); + if (IS_ERR(mod_hdr)) { + err = PTR_ERR(mod_hdr); + goto err_mod_hdr; + } + chain->miss_modify_hdr = mod_hdr; + + return 0; + +err_mod_hdr: + if (!IS_ERR_OR_NULL(chain->restore_rule)) + mlx5_del_flow_rules(chain->restore_rule); +err_rule: + /* Datapath can't find this mapping, so we can safely remove it */ + mapping_remove(chains->chains_mapping, chain->id); + return err; +} + +static void destroy_chain_restore(struct fs_chain *chain) +{ + struct mlx5_fs_chains *chains = chain->chains; + + if (!chain->miss_modify_hdr) + return; + + if (chain->restore_rule) + mlx5_del_flow_rules(chain->restore_rule); + + mlx5_modify_header_dealloc(chains->dev, chain->miss_modify_hdr); + mapping_remove(chains->chains_mapping, chain->id); +} + +static struct fs_chain * +mlx5_chains_create_chain(struct mlx5_fs_chains *chains, u32 chain) +{ + struct fs_chain *chain_s = NULL; + int err; + + chain_s = kvzalloc(sizeof(*chain_s), GFP_KERNEL); + if (!chain_s) + return ERR_PTR(-ENOMEM); + + chain_s->chains = chains; + chain_s->chain = chain; + INIT_LIST_HEAD(&chain_s->prios_list); + + err = create_chain_restore(chain_s); + if (err) + goto err_restore; + + err = rhashtable_insert_fast(&chains_ht(chains), &chain_s->node, + chain_params); + if (err) + goto err_insert; + + return chain_s; + +err_insert: + destroy_chain_restore(chain_s); +err_restore: + kvfree(chain_s); + return ERR_PTR(err); +} + +static void +mlx5_chains_destroy_chain(struct fs_chain *chain) +{ + struct mlx5_fs_chains *chains = chain->chains; + + rhashtable_remove_fast(&chains_ht(chains), &chain->node, + chain_params); + + destroy_chain_restore(chain); + kvfree(chain); +} + +static struct fs_chain * +mlx5_chains_get_chain(struct mlx5_fs_chains *chains, u32 chain) +{ + struct fs_chain *chain_s; + + chain_s = rhashtable_lookup_fast(&chains_ht(chains), &chain, + chain_params); + if (!chain_s) { + chain_s = mlx5_chains_create_chain(chains, chain); + if (IS_ERR(chain_s)) + return chain_s; + } + + chain_s->ref++; + + return chain_s; +} + +static struct mlx5_flow_handle * +mlx5_chains_add_miss_rule(struct fs_chain *chain, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + struct mlx5_fs_chains *chains = chain->chains; + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act act = {}; + + act.flags = FLOW_ACT_NO_APPEND; + if (mlx5_chains_ignore_flow_level_supported(chain->chains)) + act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + + act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = next_ft; + + if (next_ft == tc_end_ft(chains) && + chain->chain != mlx5_chains_get_nf_ft_chain(chains) && + mlx5_chains_prios_supported(chains)) { + act.modify_hdr = chain->miss_modify_hdr; + act.action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + } + + return mlx5_add_flow_rules(ft, NULL, &act, &dest, 1); +} + +static int +mlx5_chains_update_prio_prevs(struct prio *prio, + struct mlx5_flow_table *next_ft) +{ + struct mlx5_flow_handle *miss_rules[FDB_TC_LEVELS_PER_PRIO + 1] = {}; + struct fs_chain *chain = prio->chain; + struct prio *pos; + int n = 0, err; + + if (prio->key.level) + return 0; + + /* Iterate in reverse order until reaching the level 0 rule of + * the previous priority, adding all the miss rules first, so we can + * revert them if any of them fails. 
+ */ + pos = prio; + list_for_each_entry_continue_reverse(pos, + &chain->prios_list, + list) { + miss_rules[n] = mlx5_chains_add_miss_rule(chain, + pos->ft, + next_ft); + if (IS_ERR(miss_rules[n])) { + err = PTR_ERR(miss_rules[n]); + goto err_prev_rule; + } + + n++; + if (!pos->key.level) + break; + } + + /* Success, delete old miss rules, and update the pointers. */ + n = 0; + pos = prio; + list_for_each_entry_continue_reverse(pos, + &chain->prios_list, + list) { + mlx5_del_flow_rules(pos->miss_rule); + + pos->miss_rule = miss_rules[n]; + pos->next_ft = next_ft; + + n++; + if (!pos->key.level) + break; + } + + return 0; + +err_prev_rule: + while (--n >= 0) + mlx5_del_flow_rules(miss_rules[n]); + + return err; +} + +static void +mlx5_chains_put_chain(struct fs_chain *chain) +{ + if (--chain->ref == 0) + mlx5_chains_destroy_chain(chain); +} + +static struct prio * +mlx5_chains_create_prio(struct mlx5_fs_chains *chains, + u32 chain, u32 prio, u32 level) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_handle *miss_rule; + struct mlx5_flow_group *miss_group; + struct mlx5_flow_table *next_ft; + struct mlx5_flow_table *ft; + struct fs_chain *chain_s; + struct list_head *pos; + struct prio *prio_s; + u32 *flow_group_in; + int err; + + chain_s = mlx5_chains_get_chain(chains, chain); + if (IS_ERR(chain_s)) + return ERR_CAST(chain_s); + + prio_s = kvzalloc(sizeof(*prio_s), GFP_KERNEL); + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!prio_s || !flow_group_in) { + err = -ENOMEM; + goto err_alloc; + } + + /* Chain's prio list is sorted by prio and level. + * And all levels of some prio point to the next prio's level 0. + * Example list (prio, level): + * (3,0)->(3,1)->(5,0)->(5,1)->(6,1)->(7,0) + * In hardware, we will we have the following pointers: + * (3,0) -> (5,0) -> (7,0) -> Slow path + * (3,1) -> (5,0) + * (5,1) -> (7,0) + * (6,1) -> (7,0) + */ + + /* Default miss for each chain: */ + next_ft = (chain == mlx5_chains_get_nf_ft_chain(chains)) ? + tc_default_ft(chains) : + tc_end_ft(chains); + list_for_each(pos, &chain_s->prios_list) { + struct prio *p = list_entry(pos, struct prio, list); + + /* exit on first pos that is larger */ + if (prio < p->key.prio || (prio == p->key.prio && + level < p->key.level)) { + /* Get next level 0 table */ + next_ft = p->key.level == 0 ? 
p->ft : p->next_ft; + break; + } + } + + ft = mlx5_chains_create_table(chains, chain, prio, level); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto err_create; + } + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, + ft->max_fte - 2); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + ft->max_fte - 1); + miss_group = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(miss_group)) { + err = PTR_ERR(miss_group); + goto err_group; + } + + /* Add miss rule to next_ft */ + miss_rule = mlx5_chains_add_miss_rule(chain_s, ft, next_ft); + if (IS_ERR(miss_rule)) { + err = PTR_ERR(miss_rule); + goto err_miss_rule; + } + + prio_s->miss_group = miss_group; + prio_s->miss_rule = miss_rule; + prio_s->next_ft = next_ft; + prio_s->chain = chain_s; + prio_s->key.chain = chain; + prio_s->key.prio = prio; + prio_s->key.level = level; + prio_s->ft = ft; + + err = rhashtable_insert_fast(&prios_ht(chains), &prio_s->node, + prio_params); + if (err) + goto err_insert; + + list_add(&prio_s->list, pos->prev); + + /* Table is ready, connect it */ + err = mlx5_chains_update_prio_prevs(prio_s, ft); + if (err) + goto err_update; + + kvfree(flow_group_in); + return prio_s; + +err_update: + list_del(&prio_s->list); + rhashtable_remove_fast(&prios_ht(chains), &prio_s->node, + prio_params); +err_insert: + mlx5_del_flow_rules(miss_rule); +err_miss_rule: + mlx5_destroy_flow_group(miss_group); +err_group: + mlx5_destroy_flow_table(ft); +err_create: +err_alloc: + kvfree(prio_s); + kvfree(flow_group_in); + mlx5_chains_put_chain(chain_s); + return ERR_PTR(err); +} + +static void +mlx5_chains_destroy_prio(struct mlx5_fs_chains *chains, + struct prio *prio) +{ + struct fs_chain *chain = prio->chain; + + WARN_ON(mlx5_chains_update_prio_prevs(prio, + prio->next_ft)); + + list_del(&prio->list); + rhashtable_remove_fast(&prios_ht(chains), &prio->node, + prio_params); + mlx5_del_flow_rules(prio->miss_rule); + mlx5_destroy_flow_group(prio->miss_group); + mlx5_destroy_flow_table(prio->ft); + mlx5_chains_put_chain(chain); + kvfree(prio); +} + +struct mlx5_flow_table * +mlx5_chains_get_table(struct mlx5_fs_chains *chains, u32 chain, u32 prio, + u32 level) +{ + struct mlx5_flow_table *prev_fts; + struct prio *prio_s; + struct prio_key key; + int l = 0; + + if ((chain > mlx5_chains_get_chain_range(chains) && + chain != mlx5_chains_get_nf_ft_chain(chains)) || + prio > mlx5_chains_get_prio_range(chains) || + level > mlx5_chains_get_level_range(chains)) + return ERR_PTR(-EOPNOTSUPP); + + /* create earlier levels for correct fs_core lookup when + * connecting tables. 
+ */ + for (l = 0; l < level; l++) { + prev_fts = mlx5_chains_get_table(chains, chain, prio, l); + if (IS_ERR(prev_fts)) { + prio_s = ERR_CAST(prev_fts); + goto err_get_prevs; + } + } + + key.chain = chain; + key.prio = prio; + key.level = level; + + mutex_lock(&chains_lock(chains)); + prio_s = rhashtable_lookup_fast(&prios_ht(chains), &key, + prio_params); + if (!prio_s) { + prio_s = mlx5_chains_create_prio(chains, chain, + prio, level); + if (IS_ERR(prio_s)) + goto err_create_prio; + } + + ++prio_s->ref; + mutex_unlock(&chains_lock(chains)); + + return prio_s->ft; + +err_create_prio: + mutex_unlock(&chains_lock(chains)); +err_get_prevs: + while (--l >= 0) + mlx5_chains_put_table(chains, chain, prio, l); + return ERR_CAST(prio_s); +} + +void +mlx5_chains_put_table(struct mlx5_fs_chains *chains, u32 chain, u32 prio, + u32 level) +{ + struct prio *prio_s; + struct prio_key key; + + key.chain = chain; + key.prio = prio; + key.level = level; + + mutex_lock(&chains_lock(chains)); + prio_s = rhashtable_lookup_fast(&prios_ht(chains), &key, + prio_params); + if (!prio_s) + goto err_get_prio; + + if (--prio_s->ref == 0) + mlx5_chains_destroy_prio(chains, prio_s); + mutex_unlock(&chains_lock(chains)); + + while (level-- > 0) + mlx5_chains_put_table(chains, chain, prio, level); + + return; + +err_get_prio: + mutex_unlock(&chains_lock(chains)); + WARN_ONCE(1, + "Couldn't find table: (chain: %d prio: %d level: %d)", + chain, prio, level); +} + +struct mlx5_flow_table * +mlx5_chains_get_tc_end_ft(struct mlx5_fs_chains *chains) +{ + return tc_end_ft(chains); +} + +struct mlx5_flow_table * +mlx5_chains_create_global_table(struct mlx5_fs_chains *chains) +{ + u32 chain, prio, level; + int err; + + if (!mlx5_chains_ignore_flow_level_supported(chains)) { + err = -EOPNOTSUPP; + + mlx5_core_warn(chains->dev, + "Couldn't create global flow table, ignore_flow_level not supported."); + goto err_ignore; + } + + chain = mlx5_chains_get_chain_range(chains), + prio = mlx5_chains_get_prio_range(chains); + level = mlx5_chains_get_level_range(chains); + + return mlx5_chains_create_table(chains, chain, prio, level); + +err_ignore: + return ERR_PTR(err); +} + +void +mlx5_chains_destroy_global_table(struct mlx5_fs_chains *chains, + struct mlx5_flow_table *ft) +{ + mlx5_destroy_flow_table(ft); +} + +static struct mlx5_fs_chains * +mlx5_chains_init(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr) +{ + struct mlx5_fs_chains *chains_priv; + u32 max_flow_counter; + int err; + + chains_priv = kzalloc(sizeof(*chains_priv), GFP_KERNEL); + if (!chains_priv) + return ERR_PTR(-ENOMEM); + + max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | + MLX5_CAP_GEN(dev, max_flow_counter_15_0); + + mlx5_core_dbg(dev, + "Init flow table chains, max counters(%d), groups(%d), max flow table size(%d)\n", + max_flow_counter, attr->max_grp_num, attr->max_ft_sz); + + chains_priv->dev = dev; + chains_priv->flags = attr->flags; + chains_priv->ns = attr->ns; + chains_priv->group_num = attr->max_grp_num; + chains_priv->chains_mapping = attr->mapping; + tc_default_ft(chains_priv) = tc_end_ft(chains_priv) = attr->default_ft; + + mlx5_core_info(dev, "Supported tc offload range - chains: %u, prios: %u\n", + mlx5_chains_get_chain_range(chains_priv), + mlx5_chains_get_prio_range(chains_priv)); + + err = rhashtable_init(&chains_ht(chains_priv), &chain_params); + if (err) + goto init_chains_ht_err; + + err = rhashtable_init(&prios_ht(chains_priv), &prio_params); + if (err) + goto init_prios_ht_err; + + 
mutex_init(&chains_lock(chains_priv)); + + return chains_priv; + +init_prios_ht_err: + rhashtable_destroy(&chains_ht(chains_priv)); +init_chains_ht_err: + kfree(chains_priv); + return ERR_PTR(err); +} + +static void +mlx5_chains_cleanup(struct mlx5_fs_chains *chains) +{ + mutex_destroy(&chains_lock(chains)); + rhashtable_destroy(&prios_ht(chains)); + rhashtable_destroy(&chains_ht(chains)); + + kfree(chains); +} + +struct mlx5_fs_chains * +mlx5_chains_create(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr) +{ + struct mlx5_fs_chains *chains; + + chains = mlx5_chains_init(dev, attr); + + return chains; +} + +void +mlx5_chains_destroy(struct mlx5_fs_chains *chains) +{ + mlx5_chains_cleanup(chains); +} + +int +mlx5_chains_get_chain_mapping(struct mlx5_fs_chains *chains, u32 chain, + u32 *chain_mapping) +{ + struct mapping_ctx *ctx = chains->chains_mapping; + struct mlx5_mapped_obj mapped_obj = {}; + + mapped_obj.type = MLX5_MAPPED_OBJ_CHAIN; + mapped_obj.chain = chain; + return mapping_add(ctx, &mapped_obj, chain_mapping); +} + +int +mlx5_chains_put_chain_mapping(struct mlx5_fs_chains *chains, u32 chain_mapping) +{ + struct mapping_ctx *ctx = chains->chains_mapping; + + return mapping_remove(ctx, chain_mapping); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h new file mode 100644 index 0000000..d50bdb2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. */ + +#ifndef __ML5_ESW_CHAINS_H__ +#define __ML5_ESW_CHAINS_H__ + +#include + +struct mlx5_fs_chains; +struct mlx5_mapped_obj; + +enum mlx5_chains_flags { + MLX5_CHAINS_AND_PRIOS_SUPPORTED = BIT(0), + MLX5_CHAINS_IGNORE_FLOW_LEVEL_SUPPORTED = BIT(1), + MLX5_CHAINS_FT_TUNNEL_SUPPORTED = BIT(2), +}; + +struct mlx5_chains_attr { + enum mlx5_flow_namespace_type ns; + u32 flags; + u32 max_ft_sz; + u32 max_grp_num; + struct mlx5_flow_table *default_ft; + struct mapping_ctx *mapping; +}; + +#if IS_ENABLED(CONFIG_MLX5_CLS_ACT) + +bool +mlx5_chains_prios_supported(struct mlx5_fs_chains *chains); +bool mlx5_chains_ignore_flow_level_supported(struct mlx5_fs_chains *chains); +bool +mlx5_chains_backwards_supported(struct mlx5_fs_chains *chains); +u32 +mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains); +u32 +mlx5_chains_get_chain_range(struct mlx5_fs_chains *chains); +u32 +mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains); + +struct mlx5_flow_table * +mlx5_chains_get_table(struct mlx5_fs_chains *chains, u32 chain, u32 prio, + u32 level); +void +mlx5_chains_put_table(struct mlx5_fs_chains *chains, u32 chain, u32 prio, + u32 level); + +struct mlx5_flow_table * +mlx5_chains_get_tc_end_ft(struct mlx5_fs_chains *chains); + +struct mlx5_flow_table * +mlx5_chains_create_global_table(struct mlx5_fs_chains *chains); +void +mlx5_chains_destroy_global_table(struct mlx5_fs_chains *chains, + struct mlx5_flow_table *ft); + +int +mlx5_chains_get_chain_mapping(struct mlx5_fs_chains *chains, u32 chain, + u32 *chain_mapping); +int +mlx5_chains_put_chain_mapping(struct mlx5_fs_chains *chains, + u32 chain_mapping); + +struct mlx5_fs_chains * +mlx5_chains_create(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr); +void mlx5_chains_destroy(struct mlx5_fs_chains *chains); + +void +mlx5_chains_set_end_ft(struct mlx5_fs_chains *chains, + struct 
mlx5_flow_table *ft); + +#else /* CONFIG_MLX5_CLS_ACT */ + +static inline bool +mlx5_chains_ignore_flow_level_supported(struct mlx5_fs_chains *chains) +{ return false; } + +static inline struct mlx5_flow_table * +mlx5_chains_get_table(struct mlx5_fs_chains *chains, u32 chain, u32 prio, + u32 level) { return ERR_PTR(-EOPNOTSUPP); } +static inline void +mlx5_chains_put_table(struct mlx5_fs_chains *chains, u32 chain, u32 prio, + u32 level) {}; + +static inline struct mlx5_flow_table * +mlx5_chains_get_tc_end_ft(struct mlx5_fs_chains *chains) { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct mlx5_fs_chains * +mlx5_chains_create(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr) +{ return NULL; } +static inline void +mlx5_chains_destroy(struct mlx5_fs_chains *chains) {}; + +#endif /* CONFIG_MLX5_CLS_ACT */ + +#endif /* __ML5_ESW_CHAINS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c new file mode 100644 index 0000000..b78f2ba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c @@ -0,0 +1,608 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. + +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "lib/fs_ttc.h" + +#define MLX5_TTC_NUM_GROUPS 3 +#define MLX5_TTC_GROUP1_SIZE (BIT(3) + MLX5_NUM_TUNNEL_TT) +#define MLX5_TTC_GROUP2_SIZE BIT(1) +#define MLX5_TTC_GROUP3_SIZE BIT(0) +#define MLX5_TTC_TABLE_SIZE (MLX5_TTC_GROUP1_SIZE +\ + MLX5_TTC_GROUP2_SIZE +\ + MLX5_TTC_GROUP3_SIZE) + +#define MLX5_INNER_TTC_NUM_GROUPS 3 +#define MLX5_INNER_TTC_GROUP1_SIZE BIT(3) +#define MLX5_INNER_TTC_GROUP2_SIZE BIT(1) +#define MLX5_INNER_TTC_GROUP3_SIZE BIT(0) +#define MLX5_INNER_TTC_TABLE_SIZE (MLX5_INNER_TTC_GROUP1_SIZE +\ + MLX5_INNER_TTC_GROUP2_SIZE +\ + MLX5_INNER_TTC_GROUP3_SIZE) + +/* L3/L4 traffic type classifier */ +struct mlx5_ttc_table { + int num_groups; + struct mlx5_flow_table *t; + struct mlx5_flow_group **g; + struct mlx5_ttc_rule rules[MLX5_NUM_TT]; + struct mlx5_flow_handle *tunnel_rules[MLX5_NUM_TUNNEL_TT]; +}; + +struct mlx5_flow_table *mlx5_get_ttc_flow_table(struct mlx5_ttc_table *ttc) +{ + return ttc->t; +} + +static void mlx5_cleanup_ttc_rules(struct mlx5_ttc_table *ttc) +{ + int i; + + for (i = 0; i < MLX5_NUM_TT; i++) { + if (!IS_ERR_OR_NULL(ttc->rules[i].rule)) { + mlx5_del_flow_rules(ttc->rules[i].rule); + ttc->rules[i].rule = NULL; + } + } + + for (i = 0; i < MLX5_NUM_TUNNEL_TT; i++) { + if (!IS_ERR_OR_NULL(ttc->tunnel_rules[i])) { + mlx5_del_flow_rules(ttc->tunnel_rules[i]); + ttc->tunnel_rules[i] = NULL; + } + } +} + +struct mlx5_etype_proto { + u16 etype; + u8 proto; +}; + +static struct mlx5_etype_proto ttc_rules[] = { + [MLX5_TT_IPV4_TCP] = { + .etype = ETH_P_IP, + .proto = IPPROTO_TCP, + }, + [MLX5_TT_IPV6_TCP] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_TCP, + }, + [MLX5_TT_IPV4_UDP] = { + .etype = ETH_P_IP, + .proto = IPPROTO_UDP, + }, + [MLX5_TT_IPV6_UDP] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_UDP, + }, + [MLX5_TT_IPV4_IPSEC_AH] = { + .etype = ETH_P_IP, + .proto = IPPROTO_AH, + }, + [MLX5_TT_IPV6_IPSEC_AH] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_AH, + }, + [MLX5_TT_IPV4_IPSEC_ESP] = { + .etype = ETH_P_IP, + .proto = IPPROTO_ESP, + }, + [MLX5_TT_IPV6_IPSEC_ESP] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_ESP, + }, + [MLX5_TT_IPV4] = { + .etype = ETH_P_IP, + .proto = 0, + }, + 
[MLX5_TT_IPV6] = { + .etype = ETH_P_IPV6, + .proto = 0, + }, + [MLX5_TT_ANY] = { + .etype = 0, + .proto = 0, + }, +}; + +static struct mlx5_etype_proto ttc_tunnel_rules[] = { + [MLX5_TT_IPV4_GRE] = { + .etype = ETH_P_IP, + .proto = IPPROTO_GRE, + }, + [MLX5_TT_IPV6_GRE] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_GRE, + }, + [MLX5_TT_IPV4_IPIP] = { + .etype = ETH_P_IP, + .proto = IPPROTO_IPIP, + }, + [MLX5_TT_IPV6_IPIP] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_IPIP, + }, + [MLX5_TT_IPV4_IPV6] = { + .etype = ETH_P_IP, + .proto = IPPROTO_IPV6, + }, + [MLX5_TT_IPV6_IPV6] = { + .etype = ETH_P_IPV6, + .proto = IPPROTO_IPV6, + }, + +}; + +u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt) +{ + return ttc_tunnel_rules[tt].proto; +} + +static bool mlx5_tunnel_proto_supported_rx(struct mlx5_core_dev *mdev, + u8 proto_type) +{ + switch (proto_type) { + case IPPROTO_GRE: + return MLX5_CAP_ETH(mdev, tunnel_stateless_gre); + case IPPROTO_IPIP: + case IPPROTO_IPV6: + return (MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip) || + MLX5_CAP_ETH(mdev, tunnel_stateless_ip_over_ip_rx)); + default: + return false; + } +} + +static bool mlx5_tunnel_any_rx_proto_supported(struct mlx5_core_dev *mdev) +{ + int tt; + + for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) { + if (mlx5_tunnel_proto_supported_rx(mdev, + ttc_tunnel_rules[tt].proto)) + return true; + } + return false; +} + +bool mlx5_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev) +{ + return (mlx5_tunnel_any_rx_proto_supported(mdev) && + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version)); +} + +static u8 mlx5_etype_to_ipv(u16 ethertype) +{ + if (ethertype == ETH_P_IP) + return 4; + + if (ethertype == ETH_P_IPV6) + return 6; + + return 0; +} + +static struct mlx5_flow_handle * +mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, + struct mlx5_flow_destination *dest, u16 etype, u8 proto) +{ + int match_ipv_outer = + MLX5_CAP_FLOWTABLE_NIC_RX(dev, + ft_field_support.outer_ip_version); + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + u8 ipv; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + if (proto) { + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, proto); + } + + ipv = mlx5_etype_to_ipv(etype); + if (match_ipv_outer && ipv) { + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ipv); + } else if (etype) { + spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype); + } + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(dev, "%s: add rule failed\n", __func__); + } + + kvfree(spec); + return err ? 
ERR_PTR(err) : rule; +} + +static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev, + struct ttc_params *params, + struct mlx5_ttc_table *ttc) +{ + struct mlx5_flow_handle **trules; + struct mlx5_ttc_rule *rules; + struct mlx5_flow_table *ft; + int tt; + int err; + + ft = ttc->t; + rules = ttc->rules; + for (tt = 0; tt < MLX5_NUM_TT; tt++) { + struct mlx5_ttc_rule *rule = &rules[tt]; + + if (test_bit(tt, params->ignore_dests)) + continue; + rule->rule = mlx5_generate_ttc_rule(dev, ft, ¶ms->dests[tt], + ttc_rules[tt].etype, + ttc_rules[tt].proto); + if (IS_ERR(rule->rule)) { + err = PTR_ERR(rule->rule); + rule->rule = NULL; + goto del_rules; + } + rule->default_dest = params->dests[tt]; + } + + if (!params->inner_ttc || !mlx5_tunnel_inner_ft_supported(dev)) + return 0; + + trules = ttc->tunnel_rules; + for (tt = 0; tt < MLX5_NUM_TUNNEL_TT; tt++) { + if (!mlx5_tunnel_proto_supported_rx(dev, + ttc_tunnel_rules[tt].proto)) + continue; + if (test_bit(tt, params->ignore_tunnel_dests)) + continue; + trules[tt] = mlx5_generate_ttc_rule(dev, ft, + ¶ms->tunnel_dests[tt], + ttc_tunnel_rules[tt].etype, + ttc_tunnel_rules[tt].proto); + if (IS_ERR(trules[tt])) { + err = PTR_ERR(trules[tt]); + trules[tt] = NULL; + goto del_rules; + } + } + + return 0; + +del_rules: + mlx5_cleanup_ttc_rules(ttc); + return err; +} + +static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc, + bool use_ipv) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ttc->g = kcalloc(MLX5_TTC_NUM_GROUPS, sizeof(*ttc->g), GFP_KERNEL); + if (!ttc->g) + return -ENOMEM; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + kfree(ttc->g); + ttc->g = NULL; + return -ENOMEM; + } + + /* L4 Group */ + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol); + if (use_ipv) + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_version); + else + MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_TTC_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + /* L3 Group */ + MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_TTC_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + /* Any Group */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_TTC_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ttc->g[ttc->num_groups]); + ttc->g[ttc->num_groups] = NULL; + kvfree(in); + + return err; +} + +static struct mlx5_flow_handle * +mlx5_generate_inner_ttc_rule(struct mlx5_core_dev *dev, + struct mlx5_flow_table *ft, + struct mlx5_flow_destination *dest, + u16 etype, u8 proto) +{ + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + u8 ipv; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + ipv = 
mlx5_etype_to_ipv(etype); + if (etype && ipv) { + spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_version); + MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_version, ipv); + } + + if (proto) { + spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_protocol, proto); + } + + rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(dev, "%s: add inner TTC rule failed\n", __func__); + } + + kvfree(spec); + return err ? ERR_PTR(err) : rule; +} + +static int mlx5_generate_inner_ttc_table_rules(struct mlx5_core_dev *dev, + struct ttc_params *params, + struct mlx5_ttc_table *ttc) +{ + struct mlx5_ttc_rule *rules; + struct mlx5_flow_table *ft; + int err; + int tt; + + ft = ttc->t; + rules = ttc->rules; + + for (tt = 0; tt < MLX5_NUM_TT; tt++) { + struct mlx5_ttc_rule *rule = &rules[tt]; + + if (test_bit(tt, params->ignore_dests)) + continue; + rule->rule = mlx5_generate_inner_ttc_rule(dev, ft, + ¶ms->dests[tt], + ttc_rules[tt].etype, + ttc_rules[tt].proto); + if (IS_ERR(rule->rule)) { + err = PTR_ERR(rule->rule); + rule->rule = NULL; + goto del_rules; + } + rule->default_dest = params->dests[tt]; + } + + return 0; + +del_rules: + + mlx5_cleanup_ttc_rules(ttc); + return err; +} + +static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + int ix = 0; + u32 *in; + int err; + u8 *mc; + + ttc->g = kcalloc(MLX5_INNER_TTC_NUM_GROUPS, sizeof(*ttc->g), + GFP_KERNEL); + if (!ttc->g) + return -ENOMEM; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + kfree(ttc->g); + ttc->g = NULL; + return -ENOMEM; + } + + /* L4 Group */ + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol); + MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_version); + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_INNER_TTC_GROUP1_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + /* L3 Group */ + MLX5_SET(fte_match_param, mc, inner_headers.ip_protocol, 0); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_INNER_TTC_GROUP2_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + /* Any Group */ + memset(in, 0, inlen); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_INNER_TTC_GROUP3_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in); + if (IS_ERR(ttc->g[ttc->num_groups])) + goto err; + ttc->num_groups++; + + kvfree(in); + return 0; + +err: + err = PTR_ERR(ttc->g[ttc->num_groups]); + ttc->g[ttc->num_groups] = NULL; + kvfree(in); + + return err; +} + +struct mlx5_ttc_table *mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev, + struct ttc_params *params) +{ + struct mlx5_ttc_table *ttc; + int err; + + ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL); + if (!ttc) + return ERR_PTR(-ENOMEM); + + WARN_ON_ONCE(params->ft_attr.max_fte); + params->ft_attr.max_fte = 
MLX5_INNER_TTC_TABLE_SIZE; + ttc->t = mlx5_create_flow_table(params->ns, ¶ms->ft_attr); + if (IS_ERR(ttc->t)) { + err = PTR_ERR(ttc->t); + kvfree(ttc); + return ERR_PTR(err); + } + + err = mlx5_create_inner_ttc_table_groups(ttc); + if (err) + goto destroy_ft; + + err = mlx5_generate_inner_ttc_table_rules(dev, params, ttc); + if (err) + goto destroy_ft; + + return ttc; + +destroy_ft: + mlx5_destroy_ttc_table(ttc); + return ERR_PTR(err); +} + +void mlx5_destroy_ttc_table(struct mlx5_ttc_table *ttc) +{ + int i; + + mlx5_cleanup_ttc_rules(ttc); + for (i = ttc->num_groups - 1; i >= 0; i--) { + if (!IS_ERR_OR_NULL(ttc->g[i])) + mlx5_destroy_flow_group(ttc->g[i]); + ttc->g[i] = NULL; + } + + kfree(ttc->g); + mlx5_destroy_flow_table(ttc->t); + kvfree(ttc); +} + +struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, + struct ttc_params *params) +{ + bool match_ipv_outer = + MLX5_CAP_FLOWTABLE_NIC_RX(dev, + ft_field_support.outer_ip_version); + struct mlx5_ttc_table *ttc; + int err; + + ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL); + if (!ttc) + return ERR_PTR(-ENOMEM); + + WARN_ON_ONCE(params->ft_attr.max_fte); + params->ft_attr.max_fte = MLX5_TTC_TABLE_SIZE; + ttc->t = mlx5_create_flow_table(params->ns, ¶ms->ft_attr); + if (IS_ERR(ttc->t)) { + err = PTR_ERR(ttc->t); + kvfree(ttc); + return ERR_PTR(err); + } + + err = mlx5_create_ttc_table_groups(ttc, match_ipv_outer); + if (err) + goto destroy_ft; + + err = mlx5_generate_ttc_table_rules(dev, params, ttc); + if (err) + goto destroy_ft; + + return ttc; + +destroy_ft: + mlx5_destroy_ttc_table(ttc); + return ERR_PTR(err); +} + +int mlx5_ttc_fwd_dest(struct mlx5_ttc_table *ttc, enum mlx5_traffic_types type, + struct mlx5_flow_destination *new_dest) +{ + return mlx5_modify_rule_destination(ttc->rules[type].rule, new_dest, + NULL); +} + +struct mlx5_flow_destination +mlx5_ttc_get_default_dest(struct mlx5_ttc_table *ttc, + enum mlx5_traffic_types type) +{ + struct mlx5_flow_destination *dest = &ttc->rules[type].default_dest; + + WARN_ONCE(dest->type != MLX5_FLOW_DESTINATION_TYPE_TIR, + "TTC[%d] default dest is not setup yet", type); + + return *dest; +} + +int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table *ttc, + enum mlx5_traffic_types type) +{ + struct mlx5_flow_destination dest = mlx5_ttc_get_default_dest(ttc, type); + + return mlx5_ttc_fwd_dest(ttc, type, &dest); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h new file mode 100644 index 0000000..85fef0c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies. 
*/ + +#ifndef __ML5_FS_TTC_H__ +#define __ML5_FS_TTC_H__ + +#include + +enum mlx5_traffic_types { + MLX5_TT_IPV4_TCP, + MLX5_TT_IPV6_TCP, + MLX5_TT_IPV4_UDP, + MLX5_TT_IPV6_UDP, + MLX5_TT_IPV4_IPSEC_AH, + MLX5_TT_IPV6_IPSEC_AH, + MLX5_TT_IPV4_IPSEC_ESP, + MLX5_TT_IPV6_IPSEC_ESP, + MLX5_TT_IPV4, + MLX5_TT_IPV6, + MLX5_TT_ANY, + MLX5_NUM_TT, + MLX5_NUM_INDIR_TIRS = MLX5_TT_ANY, +}; + +enum mlx5_tunnel_types { + MLX5_TT_IPV4_GRE, + MLX5_TT_IPV6_GRE, + MLX5_TT_IPV4_IPIP, + MLX5_TT_IPV6_IPIP, + MLX5_TT_IPV4_IPV6, + MLX5_TT_IPV6_IPV6, + MLX5_NUM_TUNNEL_TT, +}; + +struct mlx5_ttc_rule { + struct mlx5_flow_handle *rule; + struct mlx5_flow_destination default_dest; +}; + +struct mlx5_ttc_table; + +struct ttc_params { + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table_attr ft_attr; + struct mlx5_flow_destination dests[MLX5_NUM_TT]; + DECLARE_BITMAP(ignore_dests, MLX5_NUM_TT); + bool inner_ttc; + DECLARE_BITMAP(ignore_tunnel_dests, MLX5_NUM_TUNNEL_TT); + struct mlx5_flow_destination tunnel_dests[MLX5_NUM_TUNNEL_TT]; +}; + +struct mlx5_flow_table *mlx5_get_ttc_flow_table(struct mlx5_ttc_table *ttc); + +struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, + struct ttc_params *params); +void mlx5_destroy_ttc_table(struct mlx5_ttc_table *ttc); + +struct mlx5_ttc_table *mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev, + struct ttc_params *params); + +int mlx5_ttc_fwd_dest(struct mlx5_ttc_table *ttc, enum mlx5_traffic_types type, + struct mlx5_flow_destination *new_dest); +struct mlx5_flow_destination +mlx5_ttc_get_default_dest(struct mlx5_ttc_table *ttc, + enum mlx5_traffic_types type); +int mlx5_ttc_fwd_default_dest(struct mlx5_ttc_table *ttc, + enum mlx5_traffic_types type); + +bool mlx5_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev); +u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt); + +#endif /* __MLX5_FS_TTC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c new file mode 100644 index 0000000..23361a9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include +#include "mlx5_core.h" +#include "geneve.h" + +struct mlx5_geneve { + struct mlx5_core_dev *mdev; + __be16 opt_class; + u8 opt_type; + u32 obj_id; + struct mutex sync_lock; /* protect GENEVE obj operations */ + u32 refcount; +}; + +static int mlx5_geneve_tlv_option_create(struct mlx5_core_dev *mdev, + __be16 class, + u8 type, + u8 len) +{ + u32 in[MLX5_ST_SZ_DW(create_geneve_tlv_option_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + u64 general_obj_types; + void *hdr, *opt; + u16 obj_id; + int err; + + general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types); + if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT)) + return -EINVAL; + + hdr = MLX5_ADDR_OF(create_geneve_tlv_option_in, in, hdr); + opt = MLX5_ADDR_OF(create_geneve_tlv_option_in, in, geneve_tlv_opt); + + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_GENEVE_TLV_OPT); + + MLX5_SET(geneve_tlv_option, opt, option_class, be16_to_cpu(class)); + MLX5_SET(geneve_tlv_option, opt, option_type, type); + MLX5_SET(geneve_tlv_option, opt, option_data_length, len); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + return obj_id; +} + +static void mlx5_geneve_tlv_option_destroy(struct mlx5_core_dev *mdev, u16 obj_id) +{ + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_GENEVE_TLV_OPT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id); + + mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt) +{ + int res = 0; + + if (IS_ERR_OR_NULL(geneve)) + return -EOPNOTSUPP; + + mutex_lock(&geneve->sync_lock); + + if (geneve->refcount) { + if (geneve->opt_class == opt->opt_class && + geneve->opt_type == opt->type) { + /* We already have TLV options obj allocated */ + geneve->refcount++; + } else { + /* TLV options obj allocated, but its params + * do not match the new request. + * We support only one such object. + */ + mlx5_core_warn(geneve->mdev, + "Won't create Geneve TLV opt object with class:type:len = 0x%x:0x%x:%d (another class:type already exists)\n", + be16_to_cpu(opt->opt_class), + opt->type, + opt->length); + res = -EOPNOTSUPP; + goto unlock; + } + } else { + /* We don't have any TLV options obj allocated */ + + res = mlx5_geneve_tlv_option_create(geneve->mdev, + opt->opt_class, + opt->type, + opt->length); + if (res < 0) { + mlx5_core_warn(geneve->mdev, + "Failed creating Geneve TLV opt object class:type:len = 0x%x:0x%x:%d (err=%d)\n", + be16_to_cpu(opt->opt_class), + opt->type, opt->length, res); + goto unlock; + } + geneve->opt_class = opt->opt_class; + geneve->opt_type = opt->type; + geneve->obj_id = res; + geneve->refcount++; + } + +unlock: + mutex_unlock(&geneve->sync_lock); + return res; +} + +void mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve) +{ + if (IS_ERR_OR_NULL(geneve)) + return; + + mutex_lock(&geneve->sync_lock); + if (--geneve->refcount == 0) { + /* We've just removed the last user of Geneve option. + * Now delete the object in FW. 
+ */ + mlx5_geneve_tlv_option_destroy(geneve->mdev, geneve->obj_id); + + geneve->opt_class = 0; + geneve->opt_type = 0; + geneve->obj_id = 0; + } + mutex_unlock(&geneve->sync_lock); +} + +struct mlx5_geneve *mlx5_geneve_create(struct mlx5_core_dev *mdev) +{ + struct mlx5_geneve *geneve = + kzalloc(sizeof(*geneve), GFP_KERNEL); + + if (!geneve) + return ERR_PTR(-ENOMEM); + geneve->mdev = mdev; + mutex_init(&geneve->sync_lock); + + return geneve; +} + +void mlx5_geneve_destroy(struct mlx5_geneve *geneve) +{ + if (IS_ERR_OR_NULL(geneve)) + return; + + /* Lockless since we are unloading */ + if (geneve->refcount) + mlx5_geneve_tlv_option_destroy(geneve->mdev, geneve->obj_id); + + kfree(geneve); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h new file mode 100644 index 0000000..adee0cb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/geneve.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_GENEVE_H__ +#define __MLX5_GENEVE_H__ + +#include +#include + +struct mlx5_geneve; + +#ifdef CONFIG_MLX5_ESWITCH + +struct mlx5_geneve *mlx5_geneve_create(struct mlx5_core_dev *mdev); +void mlx5_geneve_destroy(struct mlx5_geneve *geneve); + +int mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt); +void mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve); + +#else /* CONFIG_MLX5_ESWITCH */ + +static inline struct mlx5_geneve +*mlx5_geneve_create(struct mlx5_core_dev *mdev) { return NULL; } +static inline void +mlx5_geneve_destroy(struct mlx5_geneve *geneve) {} +static inline int +mlx5_geneve_tlv_option_add(struct mlx5_geneve *geneve, struct geneve_opt *opt) { return 0; } +static inline void +mlx5_geneve_tlv_option_del(struct mlx5_geneve *geneve) {} + +#endif /* CONFIG_MLX5_ESWITCH */ + +#endif /* __MLX5_GENEVE_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c new file mode 100644 index 0000000..96ffc0a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" +#include "lib/mlx5.h" + +void mlx5_init_reserved_gids(struct mlx5_core_dev *dev) +{ + unsigned int tblsz = MLX5_CAP_ROCE(dev, roce_address_table_size); + + ida_init(&dev->roce.reserved_gids.ida); + dev->roce.reserved_gids.start = tblsz; + dev->roce.reserved_gids.count = 0; +} + +void mlx5_cleanup_reserved_gids(struct mlx5_core_dev *dev) +{ + WARN_ON(!ida_is_empty(&dev->roce.reserved_gids.ida)); + dev->roce.reserved_gids.start = 0; + dev->roce.reserved_gids.count = 0; + ida_destroy(&dev->roce.reserved_gids.ida); +} + +int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count) +{ + if (dev->roce.reserved_gids.start < count) { + mlx5_core_warn(dev, "GID table exhausted attempting to reserve %d more GIDs\n", + count); + return -ENOMEM; + } + if (dev->roce.reserved_gids.count + count > MLX5_MAX_RESERVED_GIDS) { + mlx5_core_warn(dev, "Unable to reserve %d more GIDs\n", count); + return -ENOMEM; + } + + dev->roce.reserved_gids.start -= count; + dev->roce.reserved_gids.count += count; + mlx5_core_dbg(dev, "Reserved %u GIDs starting at %u\n", + dev->roce.reserved_gids.count, + dev->roce.reserved_gids.start); + return 0; +} + +void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count) +{ + WARN(count > dev->roce.reserved_gids.count, "Unreserving %u GIDs when only %u reserved", + count, dev->roce.reserved_gids.count); + + dev->roce.reserved_gids.start += count; + dev->roce.reserved_gids.count -= count; + mlx5_core_dbg(dev, "%u GIDs starting at %u left reserved\n", + dev->roce.reserved_gids.count, + dev->roce.reserved_gids.start); +} + +int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index) +{ + int end = dev->roce.reserved_gids.start + + dev->roce.reserved_gids.count - 1; + int index = 0; + + index = ida_alloc_range(&dev->roce.reserved_gids.ida, + dev->roce.reserved_gids.start, end, + GFP_KERNEL); + if (index < 0) + return index; + + mlx5_core_dbg(dev, "Allocating reserved GID %u\n", index); + *gid_index = index; + return 0; +} + +void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index) +{ + mlx5_core_dbg(dev, "Freeing reserved GID %u\n", gid_index); + ida_free(&dev->roce.reserved_gids.ida, gid_index); +} + +unsigned int mlx5_core_reserved_gids_count(struct mlx5_core_dev *dev) +{ + return dev->roce.reserved_gids.count; +} +EXPORT_SYMBOL_GPL(mlx5_core_reserved_gids_count); + +int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, + u8 roce_version, u8 roce_l3_type, const u8 *gid, + const u8 *mac, bool vlan, u16 vlan_id, u8 port_num) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {}; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); + char *addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, in_addr, + source_l3_address); + void *addr_mac = MLX5_ADDR_OF(roce_addr_layout, in_addr, + source_mac_47_32); + int gidsz = MLX5_FLD_SZ_BYTES(roce_addr_layout, source_l3_address); + + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return -EINVAL; + + if (gid) { + if (vlan) { + MLX5_SET_RA(in_addr, vlan_valid, 1); + MLX5_SET_RA(in_addr, vlan_id, vlan_id); + } + + ether_addr_copy(addr_mac, mac); + 
memcpy(addr_l3_addr, gid, gidsz); + } + MLX5_SET_RA(in_addr, roce_version, roce_version); + MLX5_SET_RA(in_addr, roce_l3_type, roce_l3_type); + + if (MLX5_CAP_GEN(dev, num_vhca_ports) > 0) + MLX5_SET(set_roce_address_in, in, vhca_port_num, port_num); + + MLX5_SET(set_roce_address_in, in, roce_address_index, index); + MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + return mlx5_cmd_exec_in(dev, set_roce_address, in); +} +EXPORT_SYMBOL(mlx5_core_roce_gid_set); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c new file mode 100644 index 0000000..583dc7e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2018 Mellanox Technologies + +#include +#include "mlx5_core.h" +#include "lib/hv.h" + +static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len, + int offset, bool read) +{ + int rc = -EOPNOTSUPP; + int bytes_returned; + int block_id; + + if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len != HV_CONFIG_BLOCK_SIZE_MAX) + return -EINVAL; + + block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX; + + rc = read ? + hyperv_read_cfg_blk(dev->pdev, buf, + HV_CONFIG_BLOCK_SIZE_MAX, block_id, + &bytes_returned) : + hyperv_write_cfg_blk(dev->pdev, buf, + HV_CONFIG_BLOCK_SIZE_MAX, block_id); + + /* Make sure len bytes were read successfully */ + if (read && !rc && len != bytes_returned) + rc = -EIO; + + if (rc) { + mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, offset = %d\n", + read ? "read" : "write", rc, len, + offset); + return rc; + } + + return 0; +} + +int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len, + int offset) +{ + return mlx5_hv_config_common(dev, buf, len, offset, true); +} + +int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len, + int offset) +{ + return mlx5_hv_config_common(dev, buf, len, offset, false); +} + +int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context, + void (*block_invalidate)(void *context, + u64 block_mask)) +{ + return hyperv_reg_block_invalidate(dev->pdev, context, + block_invalidate); +} + +void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev) +{ + hyperv_reg_block_invalidate(dev->pdev, NULL, NULL); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h new file mode 100644 index 0000000..f9a4557 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __LIB_HV_H__ +#define __LIB_HV_H__ + +#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) + +#include +#include + +int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len, + int offset); +int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len, + int offset); +int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context, + void (*block_invalidate)(void *context, + u64 block_mask)); +void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev); +#endif + +#endif /* __LIB_HV_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c new file mode 100644 index 0000000..4047629 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2018 Mellanox Technologies + +#include +#include "mlx5_core.h" +#include "lib/hv.h" +#include "lib/hv_vhca.h" + +struct mlx5_hv_vhca { + struct mlx5_core_dev *dev; + struct workqueue_struct *work_queue; + struct mlx5_hv_vhca_agent *agents[MLX5_HV_VHCA_AGENT_MAX]; + struct mutex agents_lock; /* Protect agents array */ +}; + +struct mlx5_hv_vhca_work { + struct work_struct invalidate_work; + struct mlx5_hv_vhca *hv_vhca; + u64 block_mask; +}; + +struct mlx5_hv_vhca_data_block { + u16 sequence; + u16 offset; + u8 reserved[4]; + u64 data[15]; +}; + +struct mlx5_hv_vhca_agent { + enum mlx5_hv_vhca_agent_type type; + struct mlx5_hv_vhca *hv_vhca; + void *priv; + u16 seq; + void (*control)(struct mlx5_hv_vhca_agent *agent, + struct mlx5_hv_vhca_control_block *block); + void (*invalidate)(struct mlx5_hv_vhca_agent *agent, + u64 block_mask); + void (*cleanup)(struct mlx5_hv_vhca_agent *agent); +}; + +struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev) +{ + struct mlx5_hv_vhca *hv_vhca = NULL; + + hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL); + if (!hv_vhca) + return ERR_PTR(-ENOMEM); + + hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca"); + if (!hv_vhca->work_queue) { + kfree(hv_vhca); + return ERR_PTR(-ENOMEM); + } + + hv_vhca->dev = dev; + mutex_init(&hv_vhca->agents_lock); + + return hv_vhca; +} + +void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca) +{ + if (IS_ERR_OR_NULL(hv_vhca)) + return; + + destroy_workqueue(hv_vhca->work_queue); + kfree(hv_vhca); +} + +static void mlx5_hv_vhca_invalidate_work(struct work_struct *work) +{ + struct mlx5_hv_vhca_work *hwork; + struct mlx5_hv_vhca *hv_vhca; + int i; + + hwork = container_of(work, struct mlx5_hv_vhca_work, invalidate_work); + hv_vhca = hwork->hv_vhca; + + mutex_lock(&hv_vhca->agents_lock); + for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) { + struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i]; + + if (!agent || !agent->invalidate) + continue; + + if (!(BIT(agent->type) & hwork->block_mask)) + continue; + + agent->invalidate(agent, hwork->block_mask); + } + mutex_unlock(&hv_vhca->agents_lock); + + kfree(hwork); +} + +void mlx5_hv_vhca_invalidate(void *context, u64 block_mask) +{ + struct mlx5_hv_vhca *hv_vhca = (struct mlx5_hv_vhca *)context; + struct mlx5_hv_vhca_work *work; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + INIT_WORK(&work->invalidate_work, mlx5_hv_vhca_invalidate_work); + work->hv_vhca = hv_vhca; + work->block_mask = block_mask; + + queue_work(hv_vhca->work_queue, &work->invalidate_work); +} + +#define AGENT_MASK(type) (type ? 
BIT(type - 1) : 0 /* control */) + +static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca, + struct mlx5_hv_vhca_control_block *block) +{ + int i; + + for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) { + struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i]; + + if (!agent || !agent->control) + continue; + + if (!(AGENT_MASK(agent->type) & block->control)) + continue; + + agent->control(agent, block); + } +} + +static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca, + u32 *capabilities) +{ + int i; + + for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) { + struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i]; + + if (agent) + *capabilities |= AGENT_MASK(agent->type); + } +} + +static void +mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent, + u64 block_mask) +{ + struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca; + struct mlx5_core_dev *dev = hv_vhca->dev; + struct mlx5_hv_vhca_control_block *block; + u32 capabilities = 0; + int err; + + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (!block) + return; + + err = mlx5_hv_read_config(dev, block, sizeof(*block), 0); + if (err) + goto free_block; + + mlx5_hv_vhca_capabilities(hv_vhca, &capabilities); + + /* In case no capabilities, send empty block in return */ + if (!capabilities) { + memset(block, 0, sizeof(*block)); + goto write; + } + + if (block->capabilities != capabilities) + block->capabilities = capabilities; + + if (block->control & ~capabilities) + goto free_block; + + mlx5_hv_vhca_agents_control(hv_vhca, block); + block->command_ack = block->command; + +write: + mlx5_hv_write_config(dev, block, sizeof(*block), 0); + +free_block: + kfree(block); +} + +static struct mlx5_hv_vhca_agent * +mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca) +{ + return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL, + NULL, + mlx5_hv_vhca_control_agent_invalidate, + NULL, NULL); +} + +static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent *agent) +{ + mlx5_hv_vhca_agent_destroy(agent); +} + +int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca) +{ + struct mlx5_hv_vhca_agent *agent; + int err; + + if (IS_ERR_OR_NULL(hv_vhca)) + return IS_ERR_OR_NULL(hv_vhca); + + err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca, + mlx5_hv_vhca_invalidate); + if (err) + return err; + + agent = mlx5_hv_vhca_control_agent_create(hv_vhca); + if (IS_ERR_OR_NULL(agent)) { + mlx5_hv_unregister_invalidate(hv_vhca->dev); + return IS_ERR_OR_NULL(agent); + } + + hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent; + + return 0; +} + +void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca) +{ + struct mlx5_hv_vhca_agent *agent; + int i; + + if (IS_ERR_OR_NULL(hv_vhca)) + return; + + agent = hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL]; + if (agent) + mlx5_hv_vhca_control_agent_destroy(agent); + + mutex_lock(&hv_vhca->agents_lock); + for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) + WARN_ON(hv_vhca->agents[i]); + + mutex_unlock(&hv_vhca->agents_lock); + + mlx5_hv_unregister_invalidate(hv_vhca->dev); +} + +static void mlx5_hv_vhca_agents_update(struct mlx5_hv_vhca *hv_vhca) +{ + mlx5_hv_vhca_invalidate(hv_vhca, BIT(MLX5_HV_VHCA_AGENT_CONTROL)); +} + +struct mlx5_hv_vhca_agent * +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca, + enum mlx5_hv_vhca_agent_type type, + void (*control)(struct mlx5_hv_vhca_agent*, + struct mlx5_hv_vhca_control_block *block), + void (*invalidate)(struct mlx5_hv_vhca_agent*, + u64 block_mask), + void (*cleaup)(struct mlx5_hv_vhca_agent *agent), + void *priv) +{ + 
struct mlx5_hv_vhca_agent *agent; + + if (IS_ERR_OR_NULL(hv_vhca)) + return ERR_PTR(-ENOMEM); + + if (type >= MLX5_HV_VHCA_AGENT_MAX) + return ERR_PTR(-EINVAL); + + mutex_lock(&hv_vhca->agents_lock); + if (hv_vhca->agents[type]) { + mutex_unlock(&hv_vhca->agents_lock); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&hv_vhca->agents_lock); + + agent = kzalloc(sizeof(*agent), GFP_KERNEL); + if (!agent) + return ERR_PTR(-ENOMEM); + + agent->type = type; + agent->hv_vhca = hv_vhca; + agent->priv = priv; + agent->control = control; + agent->invalidate = invalidate; + agent->cleanup = cleaup; + + mutex_lock(&hv_vhca->agents_lock); + hv_vhca->agents[type] = agent; + mutex_unlock(&hv_vhca->agents_lock); + + mlx5_hv_vhca_agents_update(hv_vhca); + + return agent; +} + +void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent) +{ + struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca; + + mutex_lock(&hv_vhca->agents_lock); + + if (WARN_ON(agent != hv_vhca->agents[agent->type])) { + mutex_unlock(&hv_vhca->agents_lock); + return; + } + + hv_vhca->agents[agent->type] = NULL; + mutex_unlock(&hv_vhca->agents_lock); + + if (agent->cleanup) + agent->cleanup(agent); + + kfree(agent); + + mlx5_hv_vhca_agents_update(hv_vhca); +} + +static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent, + struct mlx5_hv_vhca_data_block *data_block, + void *src, int len, int *offset) +{ + int bytes = min_t(int, (int)sizeof(data_block->data), len); + + data_block->sequence = agent->seq; + data_block->offset = (*offset)++; + memcpy(data_block->data, src, bytes); + + return bytes; +} + +static void mlx5_hv_vhca_agent_seq_update(struct mlx5_hv_vhca_agent *agent) +{ + agent->seq++; +} + +int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent, + void *buf, int len) +{ + int offset = agent->type * HV_CONFIG_BLOCK_SIZE_MAX; + int block_offset = 0; + int total = 0; + int err; + + while (len) { + struct mlx5_hv_vhca_data_block data_block = {0}; + int bytes; + + bytes = mlx5_hv_vhca_data_block_prepare(agent, &data_block, + buf + total, + len, &block_offset); + if (!bytes) + return -ENOMEM; + + err = mlx5_hv_write_config(agent->hv_vhca->dev, &data_block, + sizeof(data_block), offset); + if (err) + return err; + + total += bytes; + len -= bytes; + } + + mlx5_hv_vhca_agent_seq_update(agent); + + return 0; +} + +void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent) +{ + return agent->priv; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h new file mode 100644 index 0000000..857ed8f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __LIB_HV_VHCA_H__ +#define __LIB_HV_VHCA_H__ + +struct mlx5_hv_vhca_agent; +struct mlx5_hv_vhca; +struct mlx5_hv_vhca_control_block; + +enum mlx5_hv_vhca_agent_type { + MLX5_HV_VHCA_AGENT_CONTROL = 0, + MLX5_HV_VHCA_AGENT_STATS = 1, + MLX5_HV_VHCA_AGENT_MAX = 32, +}; + +#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) + +struct mlx5_hv_vhca_control_block { + u32 capabilities; + u32 control; + u16 command; + u16 command_ack; + u16 version; + u16 rings; + u32 reserved1[28]; +}; + +struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev); +void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca); +int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca); +void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca); +void mlx5_hv_vhca_invalidate(void *context, u64 block_mask); + +struct mlx5_hv_vhca_agent * +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca, + enum mlx5_hv_vhca_agent_type type, + void (*control)(struct mlx5_hv_vhca_agent*, + struct mlx5_hv_vhca_control_block *block), + void (*invalidate)(struct mlx5_hv_vhca_agent*, + u64 block_mask), + void (*cleanup)(struct mlx5_hv_vhca_agent *agent), + void *context); + +void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent); +int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent, + void *buf, int len); +void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent); + +#else + +static inline struct mlx5_hv_vhca * +mlx5_hv_vhca_create(struct mlx5_core_dev *dev) +{ + return NULL; +} + +static inline void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca) +{ +} + +static inline int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca) +{ + return 0; +} + +static inline void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca) +{ +} + +static inline void mlx5_hv_vhca_invalidate(void *context, + u64 block_mask) +{ +} + +static inline struct mlx5_hv_vhca_agent * +mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca, + enum mlx5_hv_vhca_agent_type type, + void (*control)(struct mlx5_hv_vhca_agent*, + struct mlx5_hv_vhca_control_block *block), + void (*invalidate)(struct mlx5_hv_vhca_agent*, + u64 block_mask), + void (*cleanup)(struct mlx5_hv_vhca_agent *agent), + void *context) +{ + return NULL; +} + +static inline void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent) +{ +} + +static inline int +mlx5_hv_vhca_write_agent(struct mlx5_hv_vhca_agent *agent, + void *buf, int len) +{ + return 0; +} +#endif + +#endif /* __LIB_HV_VHCA_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h new file mode 100644 index 0000000..032adb2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __LIB_MLX5_H__ +#define __LIB_MLX5_H__ + +#include "mlx5_core.h" + +void mlx5_init_reserved_gids(struct mlx5_core_dev *dev); +void mlx5_cleanup_reserved_gids(struct mlx5_core_dev *dev); +int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count); +void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count); +int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index); +void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index); +int mlx5_crdump_enable(struct mlx5_core_dev *dev); +void mlx5_crdump_disable(struct mlx5_core_dev *dev); +int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); + +/* TODO move to lib/events.h */ + +#define PORT_MODULE_EVENT_MODULE_STATUS_MASK 0xF +#define PORT_MODULE_EVENT_ERROR_TYPE_MASK 0xF + +enum port_module_event_status_type { + MLX5_MODULE_STATUS_PLUGGED = 0x1, + MLX5_MODULE_STATUS_UNPLUGGED = 0x2, + MLX5_MODULE_STATUS_ERROR = 0x3, + MLX5_MODULE_STATUS_DISABLED = 0x4, + MLX5_MODULE_STATUS_NUM, +}; + +enum port_module_event_error_type { + MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED = 0x0, + MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX = 0x1, + MLX5_MODULE_EVENT_ERROR_BUS_STUCK = 0x2, + MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT = 0x3, + MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST = 0x4, + MLX5_MODULE_EVENT_ERROR_UNKNOWN_IDENTIFIER = 0x5, + MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE = 0x6, + MLX5_MODULE_EVENT_ERROR_BAD_CABLE = 0x7, + MLX5_MODULE_EVENT_ERROR_PCIE_POWER_SLOT_EXCEEDED = 0xc, + MLX5_MODULE_EVENT_ERROR_NUM, +}; + +struct mlx5_pme_stats { + u64 status_counters[MLX5_MODULE_STATUS_NUM]; + u64 error_counters[MLX5_MODULE_EVENT_ERROR_NUM]; +}; + +void mlx5_get_pme_stats(struct mlx5_core_dev *dev, struct mlx5_pme_stats *stats); +int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, void *data); + +/* Crypto */ +enum { + MLX5_ACCEL_OBJ_TLS_KEY = MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_TLS, + MLX5_ACCEL_OBJ_IPSEC_KEY = MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_IPSEC, + MLX5_ACCEL_OBJ_MACSEC_KEY = MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_MACSEC, +}; + +int mlx5_create_encryption_key(struct mlx5_core_dev *mdev, + void *key, u32 sz_bytes, + u32 key_type, u32 *p_key_id); +void mlx5_destroy_encryption_key(struct mlx5_core_dev *mdev, u32 key_id); + +static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) +{ + return devlink_net(priv_to_devlink(dev)); +} + +static inline void mlx5_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev) +{ + mdev->mlx5e_res.uplink_netdev = netdev; +} + +static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) +{ + return mdev->mlx5e_res.uplink_netdev; +} +#endif diff --git 
a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c new file mode 100644 index 0000000..8ff1631 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "lib/mpfs.h" + +/* HW L2 Table (MPFS) management */ +static int set_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index, u8 *mac) +{ + u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {}; + u8 *in_mac_addr; + + MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY); + MLX5_SET(set_l2_table_entry_in, in, table_index, index); + + in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address); + ether_addr_copy(&in_mac_addr[2], mac); + + return mlx5_cmd_exec_in(dev, set_l2_table_entry, in); +} + +static int del_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index) +{ + u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)] = {}; + + MLX5_SET(delete_l2_table_entry_in, in, opcode, MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); + MLX5_SET(delete_l2_table_entry_in, in, table_index, index); + return mlx5_cmd_exec_in(dev, delete_l2_table_entry, in); +} + +/* UC L2 table hash node */ +struct l2table_node { + struct l2addr_node node; + u32 index; /* index in HW l2 table */ + int ref_count; +}; + +struct mlx5_mpfs { + struct hlist_head hash[MLX5_L2_ADDR_HASH_SIZE]; + struct mutex lock; /* Synchronize l2 table access */ + u32 size; + unsigned long *bitmap; +}; + +static int alloc_l2table_index(struct mlx5_mpfs *l2table, u32 *ix) +{ + int err = 0; + + *ix = find_first_zero_bit(l2table->bitmap, l2table->size); + if (*ix >= l2table->size) + err = -ENOSPC; + else + __set_bit(*ix, l2table->bitmap); + + return err; +} + +static void free_l2table_index(struct mlx5_mpfs *l2table, u32 ix) +{ + __clear_bit(ix, l2table->bitmap); +} + +int mlx5_mpfs_init(struct mlx5_core_dev *dev) +{ + int l2table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table); + struct mlx5_mpfs *mpfs; + + if (!MLX5_ESWITCH_MANAGER(dev)) + return 0; + + mpfs = 
kzalloc(sizeof(*mpfs), GFP_KERNEL); + if (!mpfs) + return -ENOMEM; + + mutex_init(&mpfs->lock); + mpfs->size = l2table_size; + mpfs->bitmap = bitmap_zalloc(l2table_size, GFP_KERNEL); + if (!mpfs->bitmap) { + kfree(mpfs); + return -ENOMEM; + } + + dev->priv.mpfs = mpfs; + return 0; +} + +void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + + if (!mpfs) + return; + + WARN_ON(!hlist_empty(mpfs->hash)); + bitmap_free(mpfs->bitmap); + kfree(mpfs); +} + +int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + int err = 0; + u32 index; + + if (!mpfs) + return 0; + + mutex_lock(&mpfs->lock); + + l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node); + if (l2addr) { + l2addr->ref_count++; + goto out; + } + + err = alloc_l2table_index(mpfs, &index); + if (err) + goto out; + + l2addr = l2addr_hash_add(mpfs->hash, mac, struct l2table_node, GFP_KERNEL); + if (!l2addr) { + err = -ENOMEM; + goto hash_add_err; + } + + err = set_l2table_entry_cmd(dev, index, mac); + if (err) + goto set_table_entry_err; + + l2addr->index = index; + l2addr->ref_count = 1; + + mlx5_core_dbg(dev, "MPFS mac added %pM, index (%d)\n", mac, index); + goto out; + +set_table_entry_err: + l2addr_hash_del(l2addr); +hash_add_err: + free_l2table_index(mpfs, index); +out: + mutex_unlock(&mpfs->lock); + return err; +} +EXPORT_SYMBOL(mlx5_mpfs_add_mac); + +int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + int err = 0; + u32 index; + + if (!mpfs) + return 0; + + mutex_lock(&mpfs->lock); + + l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node); + if (!l2addr) { + err = -ENOENT; + goto unlock; + } + + if (--l2addr->ref_count > 0) + goto unlock; + + index = l2addr->index; + del_l2table_entry_cmd(dev, index); + l2addr_hash_del(l2addr); + free_l2table_index(mpfs, index); + mlx5_core_dbg(dev, "MPFS mac deleted %pM, index (%d)\n", mac, index); +unlock: + mutex_unlock(&mpfs->lock); + return err; +} +EXPORT_SYMBOL(mlx5_mpfs_del_mac); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h new file mode 100644 index 0000000..4a29354 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_MPFS_H__ +#define __MLX5_MPFS_H__ + +#include +#include + +/* L2 -mac address based- hash helpers */ +#define MLX5_L2_ADDR_HASH_SIZE (BIT(BITS_PER_BYTE)) +#define MLX5_L2_ADDR_HASH(addr) (addr[5]) + +struct l2addr_node { + struct hlist_node hlist; + u8 addr[ETH_ALEN]; +}; + +#define for_each_l2hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ + hlist_for_each_entry_safe(hn, tmp, &(hash)[i], hlist) + +#define l2addr_hash_find(hash, mac, type) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + bool found = false; \ + type *ptr = NULL; \ + \ + hlist_for_each_entry(ptr, &(hash)[ix], node.hlist) \ + if (ether_addr_equal(ptr->node.addr, mac)) {\ + found = true; \ + break; \ + } \ + if (!found) \ + ptr = NULL; \ + ptr; \ +}) + +#define l2addr_hash_add(hash, mac, type, gfp) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + type *ptr = NULL; \ + \ + ptr = kzalloc(sizeof(type), gfp); \ + if (ptr) { \ + ether_addr_copy(ptr->node.addr, mac); \ + hlist_add_head(&ptr->node.hlist, &(hash)[ix]);\ + } \ + ptr; \ +}) + +#define l2addr_hash_del(ptr) ({ \ + hlist_del(&(ptr)->node.hlist); \ + kfree(ptr); \ +}) + +#ifdef CONFIG_MLX5_MPFS +int mlx5_mpfs_init(struct mlx5_core_dev *dev); +void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev); +#else /* #ifndef CONFIG_MLX5_MPFS */ +static inline int mlx5_mpfs_init(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) {} +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c new file mode 100644 index 0000000..6b774e0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include +#include "mlx5_core.h" +#include "pci_vsc.h" + +#define MLX5_EXTRACT_C(source, offset, size) \ + ((((u32)(source)) >> (offset)) & MLX5_ONES32(size)) +#define MLX5_EXTRACT(src, start, len) \ + (((len) == 32) ? (src) : MLX5_EXTRACT_C(src, start, len)) +#define MLX5_ONES32(size) \ + ((size) ? (0xffffffff >> (32 - (size))) : 0) +#define MLX5_MASK32(offset, size) \ + (MLX5_ONES32(size) << (offset)) +#define MLX5_MERGE_C(rsrc1, rsrc2, start, len) \ + ((((rsrc2) << (start)) & (MLX5_MASK32((start), (len)))) | \ + ((rsrc1) & (~MLX5_MASK32((start), (len))))) +#define MLX5_MERGE(rsrc1, rsrc2, start, len) \ + (((len) == 32) ? 
(rsrc2) : MLX5_MERGE_C(rsrc1, rsrc2, start, len)) +#define vsc_read(dev, offset, val) \ + pci_read_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val)) +#define vsc_write(dev, offset, val) \ + pci_write_config_dword((dev)->pdev, (dev)->vsc_addr + (offset), (val)) +#define VSC_MAX_RETRIES 2048 + +enum { + VSC_CTRL_OFFSET = 0x4, + VSC_COUNTER_OFFSET = 0x8, + VSC_SEMAPHORE_OFFSET = 0xc, + VSC_ADDR_OFFSET = 0x10, + VSC_DATA_OFFSET = 0x14, + + VSC_FLAG_BIT_OFFS = 31, + VSC_FLAG_BIT_LEN = 1, + + VSC_SYND_BIT_OFFS = 30, + VSC_SYND_BIT_LEN = 1, + + VSC_ADDR_BIT_OFFS = 0, + VSC_ADDR_BIT_LEN = 30, + + VSC_SPACE_BIT_OFFS = 0, + VSC_SPACE_BIT_LEN = 16, + + VSC_SIZE_VLD_BIT_OFFS = 28, + VSC_SIZE_VLD_BIT_LEN = 1, + + VSC_STATUS_BIT_OFFS = 29, + VSC_STATUS_BIT_LEN = 3, +}; + +void mlx5_pci_vsc_init(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_pf(dev)) + return; + + dev->vsc_addr = pci_find_capability(dev->pdev, + PCI_CAP_ID_VNDR); + if (!dev->vsc_addr) + mlx5_core_warn(dev, "Failed to get valid vendor specific ID\n"); +} + +int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev) +{ + u32 counter = 0; + int retries = 0; + u32 lock_val; + int ret; + + pci_cfg_access_lock(dev->pdev); + do { + if (retries > VSC_MAX_RETRIES) { + ret = -EBUSY; + goto pci_unlock; + } + + /* Check if semaphore is already locked */ + ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val); + if (ret) + goto pci_unlock; + + if (lock_val) { + retries++; + usleep_range(1000, 2000); + continue; + } + + /* Read and write counter value, if written value is + * the same, semaphore was acquired successfully. + */ + ret = vsc_read(dev, VSC_COUNTER_OFFSET, &counter); + if (ret) + goto pci_unlock; + + ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, counter); + if (ret) + goto pci_unlock; + + ret = vsc_read(dev, VSC_SEMAPHORE_OFFSET, &lock_val); + if (ret) + goto pci_unlock; + + retries++; + } while (counter != lock_val); + + return 0; + +pci_unlock: + pci_cfg_access_unlock(dev->pdev); + return ret; +} + +int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev) +{ + int ret; + + ret = vsc_write(dev, VSC_SEMAPHORE_OFFSET, MLX5_VSC_UNLOCK); + pci_cfg_access_unlock(dev->pdev); + return ret; +} + +int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space, + u32 *ret_space_size) +{ + int ret; + u32 val = 0; + + if (!mlx5_vsc_accessible(dev)) + return -EINVAL; + + if (ret_space_size) + *ret_space_size = 0; + + /* Get a unique val */ + ret = vsc_read(dev, VSC_CTRL_OFFSET, &val); + if (ret) + goto out; + + /* Try to modify the lock */ + val = MLX5_MERGE(val, space, VSC_SPACE_BIT_OFFS, VSC_SPACE_BIT_LEN); + ret = vsc_write(dev, VSC_CTRL_OFFSET, val); + if (ret) + goto out; + + /* Verify lock was modified */ + ret = vsc_read(dev, VSC_CTRL_OFFSET, &val); + if (ret) + goto out; + + if (MLX5_EXTRACT(val, VSC_STATUS_BIT_OFFS, VSC_STATUS_BIT_LEN) == 0) + return -EINVAL; + + /* Get space max address if indicated by size valid bit */ + if (ret_space_size && + MLX5_EXTRACT(val, VSC_SIZE_VLD_BIT_OFFS, VSC_SIZE_VLD_BIT_LEN)) { + ret = vsc_read(dev, VSC_ADDR_OFFSET, &val); + if (ret) { + mlx5_core_warn(dev, "Failed to get max space size\n"); + goto out; + } + *ret_space_size = MLX5_EXTRACT(val, VSC_ADDR_BIT_OFFS, + VSC_ADDR_BIT_LEN); + } + return 0; + +out: + return ret; +} + +static int mlx5_vsc_wait_on_flag(struct mlx5_core_dev *dev, u8 expected_val) +{ + int retries = 0; + u32 flag; + int ret; + + do { + if (retries > VSC_MAX_RETRIES) + return -EBUSY; + + ret = vsc_read(dev, VSC_ADDR_OFFSET, &flag); + if (ret) + return ret; + flag = MLX5_EXTRACT(flag, 
VSC_FLAG_BIT_OFFS, VSC_FLAG_BIT_LEN); + retries++; + + if ((retries & 0xf) == 0) + usleep_range(1000, 2000); + + } while (flag != expected_val); + + return 0; +} + +static int mlx5_vsc_gw_write(struct mlx5_core_dev *dev, unsigned int address, + u32 data) +{ + int ret; + + if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS, + VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN)) + return -EINVAL; + + /* Set flag to 0x1 */ + address = MLX5_MERGE(address, 1, VSC_FLAG_BIT_OFFS, 1); + ret = vsc_write(dev, VSC_DATA_OFFSET, data); + if (ret) + goto out; + + ret = vsc_write(dev, VSC_ADDR_OFFSET, address); + if (ret) + goto out; + + /* Wait for the flag to be cleared */ + ret = mlx5_vsc_wait_on_flag(dev, 0); + +out: + return ret; +} + +static int mlx5_vsc_gw_read(struct mlx5_core_dev *dev, unsigned int address, + u32 *data) +{ + int ret; + + if (MLX5_EXTRACT(address, VSC_SYND_BIT_OFFS, + VSC_FLAG_BIT_LEN + VSC_SYND_BIT_LEN)) + return -EINVAL; + + ret = vsc_write(dev, VSC_ADDR_OFFSET, address); + if (ret) + goto out; + + ret = mlx5_vsc_wait_on_flag(dev, 1); + if (ret) + goto out; + + ret = vsc_read(dev, VSC_DATA_OFFSET, data); +out: + return ret; +} + +static int mlx5_vsc_gw_read_fast(struct mlx5_core_dev *dev, + unsigned int read_addr, + unsigned int *next_read_addr, + u32 *data) +{ + int ret; + + ret = mlx5_vsc_gw_read(dev, read_addr, data); + if (ret) + goto out; + + ret = vsc_read(dev, VSC_ADDR_OFFSET, next_read_addr); + if (ret) + goto out; + + *next_read_addr = MLX5_EXTRACT(*next_read_addr, VSC_ADDR_BIT_OFFS, + VSC_ADDR_BIT_LEN); + + if (*next_read_addr <= read_addr) + ret = -EINVAL; +out: + return ret; +} + +int mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data, + int length) +{ + unsigned int next_read_addr = 0; + unsigned int read_addr = 0; + + while (read_addr < length) { + if (mlx5_vsc_gw_read_fast(dev, read_addr, &next_read_addr, + &data[(read_addr >> 2)])) + return read_addr; + + read_addr = next_read_addr; + } + return length; +} + +int mlx5_vsc_sem_set_space(struct mlx5_core_dev *dev, u16 space, + enum mlx5_vsc_state state) +{ + u32 data, id = 0; + int ret; + + ret = mlx5_vsc_gw_set_space(dev, MLX5_SEMAPHORE_SPACE_DOMAIN, NULL); + if (ret) { + mlx5_core_warn(dev, "Failed to set gw space %d\n", ret); + return ret; + } + + if (state == MLX5_VSC_LOCK) { + /* Get a unique ID based on the counter */ + ret = vsc_read(dev, VSC_COUNTER_OFFSET, &id); + if (ret) + return ret; + } + + /* Try to modify lock */ + ret = mlx5_vsc_gw_write(dev, space, id); + if (ret) + return ret; + + /* Verify lock was modified */ + ret = mlx5_vsc_gw_read(dev, space, &data); + if (ret) + return -EINVAL; + + if (data != id) + return -EBUSY; + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h new file mode 100644 index 0000000..64272a6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/pci_vsc.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies */ + +#ifndef __MLX5_PCI_VSC_H__ +#define __MLX5_PCI_VSC_H__ + +enum mlx5_vsc_state { + MLX5_VSC_UNLOCK, + MLX5_VSC_LOCK, +}; + +enum { + MLX5_VSC_SPACE_SCAN_CRSPACE = 0x7, +}; + +void mlx5_pci_vsc_init(struct mlx5_core_dev *dev); +int mlx5_vsc_gw_lock(struct mlx5_core_dev *dev); +int mlx5_vsc_gw_unlock(struct mlx5_core_dev *dev); +int mlx5_vsc_gw_set_space(struct mlx5_core_dev *dev, u16 space, + u32 *ret_space_size); +int 
mlx5_vsc_gw_read_block_fast(struct mlx5_core_dev *dev, u32 *data, + int length); + +static inline bool mlx5_vsc_accessible(struct mlx5_core_dev *dev) +{ + return !!dev->vsc_addr; +} + +int mlx5_vsc_sem_set_space(struct mlx5_core_dev *dev, u16 space, + enum mlx5_vsc_state state); + +#endif /* __MLX5_PCI_VSC_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c new file mode 100644 index 0000000..e042e09 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include +#include +#include +#include "mlx5_core.h" +#include "lib/port_tun.h" + +struct mlx5_port_tun_entropy_flags { + bool force_supported, force_enabled; + bool calc_supported, calc_enabled; + bool gre_calc_supported, gre_calc_enabled; +}; + +static void mlx5_query_port_tun_entropy(struct mlx5_core_dev *mdev, + struct mlx5_port_tun_entropy_flags *entropy_flags) +{ + u32 out[MLX5_ST_SZ_DW(pcmr_reg)]; + /* Default values for FW which do not support MLX5_REG_PCMR */ + entropy_flags->force_supported = false; + entropy_flags->calc_supported = false; + entropy_flags->gre_calc_supported = false; + entropy_flags->force_enabled = false; + entropy_flags->calc_enabled = true; + entropy_flags->gre_calc_enabled = true; + + if (!MLX5_CAP_GEN(mdev, ports_check)) + return; + + if (mlx5_query_ports_check(mdev, out, sizeof(out))) + return; + + entropy_flags->force_supported = !!(MLX5_GET(pcmr_reg, out, entropy_force_cap)); + entropy_flags->calc_supported = !!(MLX5_GET(pcmr_reg, out, entropy_calc_cap)); + entropy_flags->gre_calc_supported = !!(MLX5_GET(pcmr_reg, out, entropy_gre_calc_cap)); + entropy_flags->force_enabled = !!(MLX5_GET(pcmr_reg, out, entropy_force)); + entropy_flags->calc_enabled = !!(MLX5_GET(pcmr_reg, out, entropy_calc)); + entropy_flags->gre_calc_enabled = !!(MLX5_GET(pcmr_reg, out, entropy_gre_calc)); +} + +static int mlx5_set_port_tun_entropy_calc(struct mlx5_core_dev *mdev, u8 enable, + u8 force) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; + int err; + + err = mlx5_query_ports_check(mdev, in, sizeof(in)); + if (err) + return err; + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, entropy_force, force); + MLX5_SET(pcmr_reg, in, entropy_calc, enable); + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + +static int mlx5_set_port_gre_tun_entropy_calc(struct mlx5_core_dev *mdev, + u8 enable, u8 force) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; + int err; + + err = mlx5_query_ports_check(mdev, in, sizeof(in)); + if (err) + return err; + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, entropy_force, force); + MLX5_SET(pcmr_reg, in, entropy_gre_calc, enable); + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + +void mlx5_init_port_tun_entropy(struct mlx5_tun_entropy *tun_entropy, + struct mlx5_core_dev *mdev) +{ + struct mlx5_port_tun_entropy_flags entropy_flags; + + tun_entropy->mdev = mdev; + mutex_init(&tun_entropy->lock); + mlx5_query_port_tun_entropy(mdev, &entropy_flags); + tun_entropy->num_enabling_entries = 0; + tun_entropy->num_disabling_entries = 0; + tun_entropy->enabled = entropy_flags.calc_supported ? 
+ entropy_flags.calc_enabled : true; +} + +static int mlx5_set_entropy(struct mlx5_tun_entropy *tun_entropy, + int reformat_type, bool enable) +{ + struct mlx5_port_tun_entropy_flags entropy_flags; + int err; + + mlx5_query_port_tun_entropy(tun_entropy->mdev, &entropy_flags); + /* Tunnel entropy calculation may be controlled either on port basis + * for all tunneling protocols or specifically for GRE protocol. + * Prioritize GRE protocol control (if capable) over global port + * configuration. + */ + if (entropy_flags.gre_calc_supported && + reformat_type == MLX5_REFORMAT_TYPE_L2_TO_NVGRE) { + if (!entropy_flags.force_supported) + return 0; + err = mlx5_set_port_gre_tun_entropy_calc(tun_entropy->mdev, + enable, !enable); + if (err) + return err; + } else if (entropy_flags.calc_supported) { + /* Other applications may change the global FW entropy + * calculations settings. Check that the current entropy value + * is the negative of the updated value. + */ + if (entropy_flags.force_enabled && + enable == entropy_flags.calc_enabled) { + mlx5_core_warn(tun_entropy->mdev, + "Unexpected entropy calc setting - expected %d", + !entropy_flags.calc_enabled); + return -EOPNOTSUPP; + } + /* GRE requires disabling entropy calculation. if there are + * enabling entries (i.e VXLAN) we cannot turn it off for them, + * thus fail. + */ + if (tun_entropy->num_enabling_entries) + return -EOPNOTSUPP; + err = mlx5_set_port_tun_entropy_calc(tun_entropy->mdev, enable, + entropy_flags.force_supported); + if (err) + return err; + tun_entropy->enabled = enable; + /* if we turn on the entropy we don't need to force it anymore */ + if (entropy_flags.force_supported && enable) { + err = mlx5_set_port_tun_entropy_calc(tun_entropy->mdev, 1, 0); + if (err) + return err; + } + } + + return 0; +} + +/* the function manages the refcount for enabling/disabling tunnel types. + * the return value indicates if the inc is successful or not, depending on + * entropy capabilities and configuration. + */ +int mlx5_tun_entropy_refcount_inc(struct mlx5_tun_entropy *tun_entropy, + int reformat_type) +{ + int err = -EOPNOTSUPP; + + mutex_lock(&tun_entropy->lock); + if ((reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN || + reformat_type == MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL) && + tun_entropy->enabled) { + /* in case entropy calculation is enabled for all tunneling + * types, it is ok for VXLAN, so approve. + * otherwise keep the error default. + */ + tun_entropy->num_enabling_entries++; + err = 0; + } else if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_NVGRE) { + /* turn off the entropy only for the first GRE rule. + * for the next rules the entropy was already disabled + * successfully. 
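The intended calling pattern around offloaded encap rules, shown as an illustrative sketch rather than driver code: example_add_gre_encap_rule() and example_install_rule() are placeholder names, only the refcount_inc()/refcount_dec() pairing reflects the API implemented here.

static int example_install_rule(void)
{
	return 0;	/* placeholder for the real encap rule setup */
}

static int example_add_gre_encap_rule(struct mlx5_tun_entropy *tun_entropy)
{
	int err;

	/* the first GRE rule turns port entropy calculation off (and forces it) */
	err = mlx5_tun_entropy_refcount_inc(tun_entropy,
					    MLX5_REFORMAT_TYPE_L2_TO_NVGRE);
	if (err)
		return err;	/* e.g. -EOPNOTSUPP if port-wide entropy cannot be disabled */

	err = example_install_rule();
	if (err)
		mlx5_tun_entropy_refcount_dec(tun_entropy,
					      MLX5_REFORMAT_TYPE_L2_TO_NVGRE);
	return err;
}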
+ */ + if (tun_entropy->num_disabling_entries == 0) + err = mlx5_set_entropy(tun_entropy, reformat_type, 0); + else + err = 0; + if (!err) + tun_entropy->num_disabling_entries++; + } + mutex_unlock(&tun_entropy->lock); + + return err; +} + +void mlx5_tun_entropy_refcount_dec(struct mlx5_tun_entropy *tun_entropy, + int reformat_type) +{ + mutex_lock(&tun_entropy->lock); + if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN) + tun_entropy->num_enabling_entries--; + else if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_NVGRE && + --tun_entropy->num_disabling_entries == 0) + mlx5_set_entropy(tun_entropy, reformat_type, 1); + mutex_unlock(&tun_entropy->lock); +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.h new file mode 100644 index 0000000..54c42a8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_PORT_TUN_H__ +#define __MLX5_PORT_TUN_H__ + +#include + +struct mlx5_tun_entropy { + struct mlx5_core_dev *mdev; + u32 num_enabling_entries; + u32 num_disabling_entries; + u8 enabled; + struct mutex lock; /* lock the entropy fields */ +}; + +void mlx5_init_port_tun_entropy(struct mlx5_tun_entropy *tun_entropy, + struct mlx5_core_dev *mdev); +int mlx5_tun_entropy_refcount_inc(struct mlx5_tun_entropy *tun_entropy, + int reformat_type); +void mlx5_tun_entropy_refcount_dec(struct mlx5_tun_entropy *tun_entropy, + int reformat_type); + +#endif /* __MLX5_PORT_TUN_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/sf.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/sf.h new file mode 100644 index 0000000..84e5683 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/sf.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies Ltd */ + +#ifndef __LIB_MLX5_SF_H__ +#define __LIB_MLX5_SF_H__ + +#include + +static inline u16 mlx5_sf_start_function_id(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, sf_base_id); +} + +#ifdef CONFIG_MLX5_SF + +static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, sf); +} + +static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev) +{ + if (!mlx5_sf_supported(dev)) + return 0; + if (MLX5_CAP_GEN(dev, max_num_sf)) + return MLX5_CAP_GEN(dev, max_num_sf); + else + return 1 << MLX5_CAP_GEN(dev, log_max_sf); +} + +#else + +static inline bool mlx5_sf_supported(const struct mlx5_core_dev *dev) +{ + return false; +} + +static inline u16 mlx5_sf_max_functions(const struct mlx5_core_dev *dev) +{ + return 0; +} + +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.c new file mode 100644 index 0000000..9b8c051 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
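A small sketch of how the SF helpers above are typically combined to validate an SF function id; the helper name is illustrative. Note the fallback inside mlx5_sf_max_functions(): firmware that does not report max_num_sf is sized as 1 << log_max_sf instead.

static bool example_sf_id_in_range(const struct mlx5_core_dev *dev, u16 fn_id)
{
	u16 base = mlx5_sf_start_function_id(dev);	/* sf_base_id capability */
	u16 max = mlx5_sf_max_functions(dev);		/* 0 when SFs are unsupported */

	return max && fn_id >= base && fn_id < base + max;
}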
*/ + +#include +#include + +#include "smfs.h" + +struct mlx5dr_matcher * +mlx5_smfs_matcher_create(struct mlx5dr_table *table, u32 priority, struct mlx5_flow_spec *spec) +{ + struct mlx5dr_match_parameters matcher_mask = {}; + + matcher_mask.match_buf = (u64 *)&spec->match_criteria; + matcher_mask.match_sz = DR_SZ_MATCH_PARAM; + + return mlx5dr_matcher_create(table, priority, spec->match_criteria_enable, &matcher_mask); +} + +void +mlx5_smfs_matcher_destroy(struct mlx5dr_matcher *matcher) +{ + mlx5dr_matcher_destroy(matcher); +} + +struct mlx5dr_table * +mlx5_smfs_table_get_from_fs_ft(struct mlx5_flow_table *ft) +{ + return mlx5dr_table_get_from_fs_ft(ft); +} + +struct mlx5dr_action * +mlx5_smfs_action_create_dest_table(struct mlx5dr_table *table) +{ + return mlx5dr_action_create_dest_table(table); +} + +struct mlx5dr_action * +mlx5_smfs_action_create_flow_counter(u32 counter_id) +{ + return mlx5dr_action_create_flow_counter(counter_id); +} + +void +mlx5_smfs_action_destroy(struct mlx5dr_action *action) +{ + mlx5dr_action_destroy(action); +} + +struct mlx5dr_rule * +mlx5_smfs_rule_create(struct mlx5dr_matcher *matcher, struct mlx5_flow_spec *spec, + size_t num_actions, struct mlx5dr_action *actions[], + u32 flow_source) +{ + struct mlx5dr_match_parameters value = {}; + + value.match_buf = (u64 *)spec->match_value; + value.match_sz = DR_SZ_MATCH_PARAM; + + return mlx5dr_rule_create(matcher, &value, num_actions, actions, flow_source); +} + +void +mlx5_smfs_rule_destroy(struct mlx5dr_rule *rule) +{ + mlx5dr_rule_destroy(rule); +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.h new file mode 100644 index 0000000..452d0df --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/smfs.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. */ + +#ifndef __MLX5_LIB_SMFS_H__ +#define __MLX5_LIB_SMFS_H__ + +#include "steering/mlx5dr.h" +#include "steering/dr_types.h" + +struct mlx5dr_matcher * +mlx5_smfs_matcher_create(struct mlx5dr_table *table, u32 priority, struct mlx5_flow_spec *spec); + +void +mlx5_smfs_matcher_destroy(struct mlx5dr_matcher *matcher); + +struct mlx5dr_table * +mlx5_smfs_table_get_from_fs_ft(struct mlx5_flow_table *ft); + +struct mlx5dr_action * +mlx5_smfs_action_create_dest_table(struct mlx5dr_table *table); + +struct mlx5dr_action * +mlx5_smfs_action_create_flow_counter(u32 counter_id); + +void +mlx5_smfs_action_destroy(struct mlx5dr_action *action); + +struct mlx5dr_rule * +mlx5_smfs_rule_create(struct mlx5dr_matcher *matcher, struct mlx5_flow_spec *spec, + size_t num_actions, struct mlx5dr_action *actions[], + u32 flow_source); + +void +mlx5_smfs_rule_destroy(struct mlx5dr_rule *rule); + +#endif /* __MLX5_LIB_SMFS_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c new file mode 100644 index 0000000..696e45e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
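An illustrative round-trip through the SMFS wrappers defined in this file, steering traffic from one flow table to another; the matcher priority, the flow_source value of 0 and the helper name are arbitrary example choices, and real users keep the matcher and actions alive for the lifetime of the rule.

static int example_smfs_roundtrip(struct mlx5_flow_table *from_ft,
				  struct mlx5_flow_table *to_ft,
				  struct mlx5_flow_spec *spec)
{
	struct mlx5dr_table *from = mlx5_smfs_table_get_from_fs_ft(from_ft);
	struct mlx5dr_table *to = mlx5_smfs_table_get_from_fs_ft(to_ft);
	struct mlx5dr_action *actions[1];
	struct mlx5dr_matcher *matcher;
	struct mlx5dr_rule *rule;
	int err = -EINVAL;

	matcher = mlx5_smfs_matcher_create(from, 0, spec);
	if (!matcher)
		return err;

	actions[0] = mlx5_smfs_action_create_dest_table(to);
	if (!actions[0])
		goto out_matcher;

	rule = mlx5_smfs_rule_create(matcher, spec, 1, actions, 0);
	if (!rule)
		goto out_action;

	err = 0;
	/* teardown in reverse creation order */
	mlx5_smfs_rule_destroy(rule);
out_action:
	mlx5_smfs_action_destroy(actions[0]);
out_matcher:
	mlx5_smfs_matcher_destroy(matcher);
	return err;
}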
*/ + +#include +#include "lib/tout.h" + +struct mlx5_timeouts { + u64 to[MAX_TIMEOUT_TYPES]; +}; + +static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = { + [MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000, + [MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000, + [MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000, + [MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2, + [MLX5_TO_FW_INIT_MS] = 2000, + [MLX5_TO_CMD_MS] = 60000, + [MLX5_TO_PCI_TOGGLE_MS] = 2000, + [MLX5_TO_HEALTH_POLL_INTERVAL_MS] = 2000, + [MLX5_TO_FULL_CRDUMP_MS] = 60000, + [MLX5_TO_FW_RESET_MS] = 60000, + [MLX5_TO_FLUSH_ON_ERROR_MS] = 2000, + [MLX5_TO_PCI_SYNC_UPDATE_MS] = 5000, + [MLX5_TO_TEARDOWN_MS] = 3000, + [MLX5_TO_FSM_REACTIVATE_MS] = 5000, + [MLX5_TO_RECLAIM_PAGES_MS] = 5000, + [MLX5_TO_RECLAIM_VFS_PAGES_MS] = 120000 +}; + +static void tout_set(struct mlx5_core_dev *dev, u64 val, enum mlx5_timeouts_types type) +{ + dev->timeouts->to[type] = val; +} + +int mlx5_tout_init(struct mlx5_core_dev *dev) +{ + int i; + + dev->timeouts = kmalloc(sizeof(*dev->timeouts), GFP_KERNEL); + if (!dev->timeouts) + return -ENOMEM; + + for (i = 0; i < MAX_TIMEOUT_TYPES; i++) + tout_set(dev, tout_def_sw_val[i], i); + + return 0; +} + +void mlx5_tout_cleanup(struct mlx5_core_dev *dev) +{ + kfree(dev->timeouts); +} + +/* Time register consists of two fields to_multiplier(time out multiplier) + * and to_value(time out value). to_value is the quantity of the time units and + * to_multiplier is the type and should be one off these four values. + * 0x0: millisecond + * 0x1: seconds + * 0x2: minutes + * 0x3: hours + * this function converts the time stored in the two register fields into + * millisecond. + */ +static u64 tout_convert_reg_field_to_ms(u32 to_mul, u32 to_val) +{ + u64 msec = to_val; + + to_mul &= 0x3; + /* convert hours/minutes/seconds to miliseconds */ + if (to_mul) + msec *= 1000 * int_pow(60, to_mul - 1); + + return msec; +} + +static u64 tout_convert_iseg_to_ms(u32 iseg_to) +{ + return tout_convert_reg_field_to_ms(iseg_to >> 29, iseg_to & 0xfffff); +} + +static bool tout_is_supported(struct mlx5_core_dev *dev) +{ + return !!ioread32be(&dev->iseg->cmd_q_init_to); +} + +void mlx5_tout_query_iseg(struct mlx5_core_dev *dev) +{ + u32 to; + + if (!tout_is_supported(dev)) + return; + + to = ioread32be(&dev->iseg->cmd_q_init_to); + tout_set(dev, tout_convert_iseg_to_ms(to), MLX5_TO_FW_INIT_MS); + + to = ioread32be(&dev->iseg->cmd_exec_to); + tout_set(dev, tout_convert_iseg_to_ms(to), MLX5_TO_CMD_MS); +} + +u64 _mlx5_tout_ms(struct mlx5_core_dev *dev, enum mlx5_timeouts_types type) +{ + return dev->timeouts->to[type]; +} + +#define MLX5_TIMEOUT_QUERY(fld, reg_out) \ + ({ \ + struct mlx5_ifc_default_timeout_bits *time_field; \ + u32 to_multi, to_value; \ + u64 to_val_ms; \ + \ + time_field = MLX5_ADDR_OF(dtor_reg, reg_out, fld); \ + to_multi = MLX5_GET(default_timeout, time_field, to_multiplier); \ + to_value = MLX5_GET(default_timeout, time_field, to_value); \ + to_val_ms = tout_convert_reg_field_to_ms(to_multi, to_value); \ + to_val_ms; \ + }) + +#define MLX5_TIMEOUT_FILL(fld, reg_out, dev, to_type, to_extra) \ + ({ \ + u64 fw_to = MLX5_TIMEOUT_QUERY(fld, reg_out); \ + tout_set(dev, fw_to + (to_extra), to_type); \ + fw_to; \ + }) + +static int tout_query_dtor(struct mlx5_core_dev *dev) +{ + u64 pcie_toggle_to_val, tear_down_to_val; + u32 out[MLX5_ST_SZ_DW(dtor_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(dtor_reg)] = {}; + int err; + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_DTOR, 0, 0); + if (err) + return err; + + 
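A worked example of the register conversion above, kept as a sketch rather than driver code: a to_multiplier of 0x2 selects minutes, so a to_value of 3 becomes 3 * 1000 * 60 = 180000 ms. The to_extra argument of MLX5_TIMEOUT_FILL() lets one timeout absorb another as slack, which is why FW_RESET below is filled with the PCI-toggle value added on top of the firmware field.

static inline u64 example_dtor_field_to_ms(void)
{
	/* to_multiplier = 0x2 (minutes), to_value = 3  ->  180000 ms */
	return tout_convert_reg_field_to_ms(0x2, 3);
}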
pcie_toggle_to_val = MLX5_TIMEOUT_FILL(pcie_toggle_to, out, dev, MLX5_TO_PCI_TOGGLE_MS, 0); + MLX5_TIMEOUT_FILL(fw_reset_to, out, dev, MLX5_TO_FW_RESET_MS, pcie_toggle_to_val); + + tear_down_to_val = MLX5_TIMEOUT_FILL(tear_down_to, out, dev, MLX5_TO_TEARDOWN_MS, 0); + MLX5_TIMEOUT_FILL(pci_sync_update_to, out, dev, MLX5_TO_PCI_SYNC_UPDATE_MS, + tear_down_to_val); + + MLX5_TIMEOUT_FILL(health_poll_to, out, dev, MLX5_TO_HEALTH_POLL_INTERVAL_MS, 0); + MLX5_TIMEOUT_FILL(full_crdump_to, out, dev, MLX5_TO_FULL_CRDUMP_MS, 0); + MLX5_TIMEOUT_FILL(flush_on_err_to, out, dev, MLX5_TO_FLUSH_ON_ERROR_MS, 0); + MLX5_TIMEOUT_FILL(fsm_reactivate_to, out, dev, MLX5_TO_FSM_REACTIVATE_MS, 0); + MLX5_TIMEOUT_FILL(reclaim_pages_to, out, dev, MLX5_TO_RECLAIM_PAGES_MS, 0); + MLX5_TIMEOUT_FILL(reclaim_vfs_pages_to, out, dev, MLX5_TO_RECLAIM_VFS_PAGES_MS, 0); + + return 0; +} + +int mlx5_tout_query_dtor(struct mlx5_core_dev *dev) +{ + if (tout_is_supported(dev)) + return tout_query_dtor(dev); + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h new file mode 100644 index 0000000..bc9e9ae --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef MLX5_TIMEOUTS_H +#define MLX5_TIMEOUTS_H + +enum mlx5_timeouts_types { + /* pre init timeouts (not read from FW) */ + MLX5_TO_FW_PRE_INIT_TIMEOUT_MS, + MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS, + MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS, + MLX5_TO_FW_PRE_INIT_WAIT_MS, + + /* init segment timeouts */ + MLX5_TO_FW_INIT_MS, + MLX5_TO_CMD_MS, + + /* DTOR timeouts */ + MLX5_TO_PCI_TOGGLE_MS, + MLX5_TO_HEALTH_POLL_INTERVAL_MS, + MLX5_TO_FULL_CRDUMP_MS, + MLX5_TO_FW_RESET_MS, + MLX5_TO_FLUSH_ON_ERROR_MS, + MLX5_TO_PCI_SYNC_UPDATE_MS, + MLX5_TO_TEARDOWN_MS, + MLX5_TO_FSM_REACTIVATE_MS, + MLX5_TO_RECLAIM_PAGES_MS, + MLX5_TO_RECLAIM_VFS_PAGES_MS, + + MAX_TIMEOUT_TYPES +}; + +struct mlx5_core_dev; +int mlx5_tout_init(struct mlx5_core_dev *dev); +void mlx5_tout_cleanup(struct mlx5_core_dev *dev); +void mlx5_tout_query_iseg(struct mlx5_core_dev *dev); +int mlx5_tout_query_dtor(struct mlx5_core_dev *dev); +u64 _mlx5_tout_ms(struct mlx5_core_dev *dev, enum mlx5_timeouts_types type); + +#define mlx5_tout_ms(dev, type) _mlx5_tout_ms(dev, MLX5_TO_##type##_MS) + +# endif /* MLX5_TIMEOUTS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c new file mode 100644 index 0000000..e3b0a13 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
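A short sketch of how callers are expected to use the mlx5_tout_ms() helper macro from lib/tout.h, assuming that header is in scope; the helper name is illustrative. The macro pastes the short timeout name into the enum so callers never spell the full MLX5_TO_*_MS identifiers.

static inline u64 example_cmd_timeout(struct mlx5_core_dev *dev)
{
	/* expands to _mlx5_tout_ms(dev, MLX5_TO_CMD_MS), 60000 ms by default */
	return mlx5_tout_ms(dev, CMD);
}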
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "vxlan.h" + +struct mlx5_vxlan { + struct mlx5_core_dev *mdev; + /* max_num_ports is usually 4, 16 buckets is more than enough */ + DECLARE_HASHTABLE(htable, 4); + struct mutex sync_lock; /* sync add/del port HW operations */ +}; + +struct mlx5_vxlan_port { + struct hlist_node hlist; + u16 udp_port; +}; + +static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) +{ + u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {}; + + MLX5_SET(add_vxlan_udp_dport_in, in, opcode, + MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT); + MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port); + return mlx5_cmd_exec_in(mdev, add_vxlan_udp_dport, in); +} + +static int mlx5_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port) +{ + u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)] = {}; + + MLX5_SET(delete_vxlan_udp_dport_in, in, opcode, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); + MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port); + return mlx5_cmd_exec_in(mdev, delete_vxlan_udp_dport, in); +} + +bool mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + bool found = false; + + if (!mlx5_vxlan_allowed(vxlan)) + return NULL; + + rcu_read_lock(); + hash_for_each_possible_rcu(vxlan->htable, vxlanp, hlist, port) + if (vxlanp->udp_port == port) { + found = true; + break; + } + rcu_read_unlock(); + + return found; +} + +static struct mlx5_vxlan_port *vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + + hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) + if (vxlanp->udp_port == port) + return vxlanp; + return NULL; +} + +int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + int ret; + + vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL); + if (!vxlanp) + return -ENOMEM; + vxlanp->udp_port = port; + + ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port); + if (ret) { + kfree(vxlanp); + return ret; + } + + mutex_lock(&vxlan->sync_lock); + hash_add_rcu(vxlan->htable, &vxlanp->hlist, port); + mutex_unlock(&vxlan->sync_lock); + + return 0; +} + +int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) +{ + struct mlx5_vxlan_port *vxlanp; + int ret = 0; 
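A sketch of the two usage sides of the table below, with illustrative helper names: the control path adds and removes UDP ports (for instance from udp_tunnel callbacks), while the datapath only asks whether a destination port is a known VXLAN port before treating a packet as encapsulated.

static bool example_is_vxlan_dport(struct mlx5_vxlan *vxlan, __be16 dport)
{
	/* lockless: mlx5_vxlan_lookup_port() walks the hash table under RCU */
	return mlx5_vxlan_lookup_port(vxlan, be16_to_cpu(dport));
}

static int example_udp_tunnel_set(struct mlx5_vxlan *vxlan, u16 port, bool add)
{
	return add ? mlx5_vxlan_add_port(vxlan, port) :
		     mlx5_vxlan_del_port(vxlan, port);
}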
+ + mutex_lock(&vxlan->sync_lock); + + vxlanp = vxlan_lookup_port(vxlan, port); + if (WARN_ON(!vxlanp)) { + ret = -ENOENT; + goto out_unlock; + } + + hash_del_rcu(&vxlanp->hlist); + synchronize_rcu(); + mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port); + kfree(vxlanp); + +out_unlock: + mutex_unlock(&vxlan->sync_lock); + return ret; +} + +struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev) +{ + struct mlx5_vxlan *vxlan; + + if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev)) + return ERR_PTR(-ENOTSUPP); + + vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL); + if (!vxlan) + return ERR_PTR(-ENOMEM); + + vxlan->mdev = mdev; + mutex_init(&vxlan->sync_lock); + hash_init(vxlan->htable); + + /* Hardware adds 4789 (IANA_VXLAN_UDP_PORT) by default */ + mlx5_vxlan_add_port(vxlan, IANA_VXLAN_UDP_PORT); + + return vxlan; +} + +void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) +{ + if (!mlx5_vxlan_allowed(vxlan)) + return; + + mlx5_vxlan_del_port(vxlan, IANA_VXLAN_UDP_PORT); + WARN_ON(!hash_empty(vxlan->htable)); + + kfree(vxlan); +} + +void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan) +{ + struct mlx5_vxlan_port *vxlanp; + struct hlist_node *tmp; + int bkt; + + if (!mlx5_vxlan_allowed(vxlan)) + return; + + hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { + /* Don't delete default UDP port added by the HW. + * Remove only user configured ports + */ + if (vxlanp->udp_port == IANA_VXLAN_UDP_PORT) + continue; + mlx5_vxlan_del_port(vxlan, vxlanp->udp_port); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h new file mode 100644 index 0000000..34ef662 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __MLX5_VXLAN_H__ +#define __MLX5_VXLAN_H__ + +#include + +struct mlx5_vxlan; +struct mlx5_vxlan_port; + +static inline u8 mlx5_vxlan_max_udp_ports(struct mlx5_core_dev *mdev) +{ + return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4; +} + +static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan) +{ + /* not allowed reason is encoded in vxlan pointer as error, + * on mlx5_vxlan_create + */ + return !IS_ERR_OR_NULL(vxlan); +} + +#if IS_ENABLED(CONFIG_VXLAN) +struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev); +void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan); +int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port); +int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port); +bool mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port); +void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan); +#else +static inline struct mlx5_vxlan* +mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-EOPNOTSUPP); } +static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; } +static inline int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } +static inline int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } +static inline bool mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) { return false; } +static inline void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan) { return; } +#endif + +#endif /* __MLX5_VXLAN_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/main.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/main.c new file mode 100644 index 0000000..683378b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -0,0 +1,2469 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_RFS_ACCEL +#include +#endif +#include +#include "mlx5_core.h" +#include "lib/eq.h" +#include "fs_core.h" +#include "lib/mpfs.h" +#include "eswitch.h" +#include "devlink.h" +#include "fw_reset.h" +#include "lib/mlx5.h" +#include "lib/tout.h" +#include "fpga/core.h" +#include "fpga/ipsec.h" +#include "accel/ipsec.h" +#include "accel/tls.h" +#include "lib/clock.h" +#include "lib/vxlan.h" +#include "lib/geneve.h" +#include "lib/devcom.h" +#include "lib/pci_vsc.h" +#include "diag/fw_tracer.h" +#include "ecpf.h" +#include "lib/hv_vhca.h" +#include "diag/rsc_dump.h" +#include "sf/vhca_event.h" +#include "sf/dev/dev.h" +#include "sf/sf.h" +#include "mlx5_devm.h" +#include "mlx5_irq.h" +#include "diag/diag_cnt.h" + +MODULE_AUTHOR("Eli Cohen "); +MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRIVER_VERSION); +MODULE_INFO(basedon, "Korg 5.17-rc4"); + +unsigned int mlx5_core_debug_mask; +module_param_named(debug_mask, mlx5_core_debug_mask, uint, 0644); +MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0"); + +static unsigned int prof_sel = MLX5_DEFAULT_PROF; +module_param_named(prof_sel, prof_sel, uint, 0444); +MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 3"); + +MODULE_ALIAS("auxiliary:mlx5_core.eth"); +MODULE_ALIAS("auxiliary:mlx5_core.eth-rep"); + +struct proc_dir_entry *mlx5_core_proc_dir; +struct proc_dir_entry *mlx5_crdump_dir; + +static u32 sw_owner_id[4]; + +enum { + MLX5_ATOMIC_REQ_MODE_BE = 0x0, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1, +}; + +#define LOG_MAX_SUPPORTED_QPS 0xff + +static struct mlx5_profile profile[] = { + [0] = { + .mask = 0, + }, + [1] = { + .mask = MLX5_PROF_MASK_QP_SIZE, + .log_max_qp = 12, + }, + [2] = { + .mask = MLX5_PROF_MASK_QP_SIZE | + MLX5_PROF_MASK_MR_CACHE, + .log_max_qp = LOG_MAX_SUPPORTED_QPS, + .mr_cache[0] = { + .size = 1500, + .limit = 750 + }, + .mr_cache[1] = { + .size = 1500, + .limit = 750 + }, + .mr_cache[2] = { + .size = 500, + .limit = 250 + }, + .mr_cache[3] = { + .size = 500, + .limit = 250 + }, + .mr_cache[4] = { + .size = 500, + .limit = 250 + }, + .mr_cache[5] = { + .size = 500, + .limit = 250 + }, + .mr_cache[6] = { + .size = 500, + .limit = 250 + }, + .mr_cache[7] = { + .size = 500, + .limit = 250 + }, + .mr_cache[8] = { + .size = 500, + .limit = 250 + }, + .mr_cache[9] = { + .size = 500, + .limit = 250 + }, + .mr_cache[10] = { + .size = 500, + .limit = 250 + }, + .mr_cache[11] = { + .size = 500, + .limit = 250 + }, + .mr_cache[12] = { + .size = 64, + .limit = 32 + }, + .mr_cache[13] = { + .size = 32, + .limit = 16 + }, + .mr_cache[14] = { + .size = 16, + .limit = 8 + }, + .mr_cache[15] = { + .size = 8, + .limit = 4 + }, + }, + [3] = { + .mask = MLX5_PROF_MASK_QP_SIZE | + MLX5_PROF_MASK_MR_CACHE , + .log_max_qp = 18, + .mr_cache[0] = { + .size = 1500, + .limit = 750 + }, + .mr_cache[1] = { + .size = 1500, + .limit = 750 + }, + .mr_cache[2] = { + .size = 500, + .limit = 250 + }, + .mr_cache[3] = { + .size = 500, + .limit = 250 + }, + .mr_cache[4] = { + .size = 500, + .limit = 250 + }, + .mr_cache[5] = { + .size = 500, + .limit = 250 + }, + .mr_cache[6] = { + .size = 500, + .limit = 250 + }, + .mr_cache[7] = { + .size = 500, + .limit = 250 + }, + .mr_cache[8] = { + .size 
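The mr_cache sizing above is selected as a whole through the prof_sel module parameter; the probe path, which is not part of this hunk, uses it as an index into profile[]. A minimal validation sketch under that assumption, with an illustrative name:

static int example_validate_prof_sel(unsigned int sel)
{
	/* MODULE_PARM_DESC above: "profile selector. Valid range 0 - 3" */
	return sel < ARRAY_SIZE(profile) ? 0 : -EINVAL;
}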
= 500, + .limit = 250 + }, + .mr_cache[9] = { + .size = 500, + .limit = 250 + }, + .mr_cache[10] = { + .size = 500, + .limit = 250 + }, + .mr_cache[11] = { + .size = 500, + .limit = 250 + }, + .mr_cache[12] = { + .size = 64, + .limit = 32 + }, + .mr_cache[13] = { + .size = 32, + .limit = 16 + }, + .mr_cache[14] = { + .size = 16, + .limit = 8 + }, + .mr_cache[15] = { + .size = 8, + .limit = 4 + }, + .mr_cache[16] = { + .size = 8, + .limit = 4 + }, + .mr_cache[17] = { + .size = 8, + .limit = 4 + }, + .mr_cache[18] = { + .size = 8, + .limit = 4 + }, + .mr_cache[19] = { + .size = 4, + .limit = 2 + }, + .mr_cache[20] = { + .size = 4, + .limit = 2 + }, + }, +}; + +static int fw_initializing(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->initializing) >> 31; +} + +static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili, + u32 warn_time_mili) +{ + unsigned long warn = jiffies + msecs_to_jiffies(warn_time_mili); + unsigned long end = jiffies + msecs_to_jiffies(max_wait_mili); + int err = 0; + + while (fw_initializing(dev)) { + if (time_after(jiffies, end) || + test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state)) { + err = -EBUSY; + break; + } + if (warn_time_mili && time_after(jiffies, warn)) { + mlx5_core_warn(dev, "Waiting for FW initialization, timeout abort in %ds\n", + jiffies_to_msecs(end - warn) / 1000); + warn = jiffies + msecs_to_jiffies(warn_time_mili); + } + msleep(mlx5_tout_ms(dev, FW_PRE_INIT_WAIT)); + } + + return err; +} + +static void mlx5_set_driver_version(struct mlx5_core_dev *dev) +{ + int driver_ver_sz = MLX5_FLD_SZ_BYTES(set_driver_version_in, + driver_version); + u8 in[MLX5_ST_SZ_BYTES(set_driver_version_in)] = {}; + int remaining_size = driver_ver_sz; + char *string; + + if (!MLX5_CAP_GEN(dev, driver_version)) + return; + + string = MLX5_ADDR_OF(set_driver_version_in, in, driver_version); + + strncpy(string, "Linux", remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, ",", remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, KBUILD_MODNAME, remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, ",", remaining_size); + + remaining_size = max_t(int, 0, driver_ver_sz - strlen(string)); + strncat(string, DRIVER_VERSION, remaining_size); + + /*Send the command*/ + MLX5_SET(set_driver_version_in, in, opcode, + MLX5_CMD_OP_SET_DRIVER_VERSION); + + mlx5_cmd_exec_in(dev, set_driver_version, in); +} + +static int set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n"); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n"); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, 2u * 1024 * 1024 * 1024); + return err; +} + +static int mlx5_pci_enable_device(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + int err = 0; + + mutex_lock(&dev->pci_status_mutex); + if (dev->pci_status == MLX5_PCI_STATUS_DISABLED) { + err = pci_enable_device(pdev); + if (!err) + dev->pci_status = MLX5_PCI_STATUS_ENABLED; + } + mutex_unlock(&dev->pci_status_mutex); + + return err; +} + +static void mlx5_pci_disable_device(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + + mutex_lock(&dev->pci_status_mutex); + if (dev->pci_status == MLX5_PCI_STATUS_ENABLED) { + 
pci_disable_device(pdev); + dev->pci_status = MLX5_PCI_STATUS_DISABLED; + } + mutex_unlock(&dev->pci_status_mutex); +} + +static int request_bar(struct pci_dev *pdev) +{ + int err = 0; + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing registers BAR, aborting\n"); + return -ENODEV; + } + + err = pci_request_regions(pdev, KBUILD_MODNAME); + if (err) + dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + + return err; +} + +static void release_bar(struct pci_dev *pdev) +{ + pci_release_regions(pdev); +} + +struct mlx5_reg_host_endianness { + u8 he; + u8 rsvd[15]; +}; + +#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) + +enum { + MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | + MLX5_DEV_CAP_FLAG_DCT, +}; + +static u16 to_fw_pkey_sz(struct mlx5_core_dev *dev, u32 size) +{ + switch (size) { + case 128: + return 0; + case 256: + return 1; + case 512: + return 2; + case 1024: + return 3; + case 2048: + return 4; + case 4096: + return 5; + default: + mlx5_core_warn(dev, "invalid pkey table size %d\n", size); + return 0; + } +} + +int mlx5_core_other_function_get_caps(struct mlx5_core_dev *dev, + u16 function_id, void *out) +{ + u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {}; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + MLX5_SET(query_hca_cap_in, in, function_id, function_id); + MLX5_SET(query_hca_cap_in, in, other_function, true); + return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); +} + +static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, + enum mlx5_cap_type cap_type, + enum mlx5_cap_mode cap_mode) +{ + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; + int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *out, *hca_caps; + u16 opmod = (cap_type << 1) | (cap_mode & 0x01); + int err; + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + err = mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); + if (err) { + mlx5_core_warn(dev, + "QUERY_HCA_CAP : type(%x) opmode(%x) Failed(%d)\n", + cap_type, cap_mode, err); + goto query_ex; + } + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, out, capability); + + switch (cap_mode) { + case HCA_CAP_OPMOD_GET_MAX: + memcpy(dev->caps.hca[cap_type]->max, hca_caps, + MLX5_UN_SZ_BYTES(hca_cap_union)); + break; + case HCA_CAP_OPMOD_GET_CUR: + memcpy(dev->caps.hca[cap_type]->cur, hca_caps, + MLX5_UN_SZ_BYTES(hca_cap_union)); + break; + default: + mlx5_core_warn(dev, + "Tried to query dev cap type(%x) with wrong opmode(%x)\n", + cap_type, cap_mode); + err = -EINVAL; + break; + } +query_ex: + kfree(out); + return err; +} + +int mlx5_core_query_special_contexts(struct mlx5_core_dev *dev) +{ + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)]; + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, + sizeof(out)); + if (err) + return err; + + dev->special_contexts.resd_lkey = MLX5_GET(query_special_contexts_out, + out, resd_lkey); + + return err; +} + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type) +{ + int ret; + + ret = 
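Once mlx5_core_get_caps(), whose body continues just below, has cached both the CUR and MAX copies of a capability group, readers use the MLX5_CAP_*() accessors instead of re-issuing QUERY_HCA_CAP. A minimal sketch with an illustrative helper name; cqe_compression is just one example of a general capability bit.

static bool example_cqe_compression_possible(struct mlx5_core_dev *dev)
{
	/* current setting, and what firmware would allow the driver to set */
	return MLX5_CAP_GEN(dev, cqe_compression) ||
	       MLX5_CAP_GEN_MAX(dev, cqe_compression);
}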
mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_CUR); + if (ret) + return ret; + return mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_MAX); +} + +static int set_caps(struct mlx5_core_dev *dev, void *in, int opmod) +{ + MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); + MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1); + return mlx5_cmd_exec_in(dev, set_hca_cap, in); +} + +static int handle_hca_cap_atomic(struct mlx5_core_dev *dev, void *set_ctx) +{ + void *set_hca_cap; + int req_endianness; + int err; + + if (!MLX5_CAP_GEN(dev, atomic)) + return 0; + + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC); + if (err) + return err; + + req_endianness = + MLX5_CAP_ATOMIC(dev, + supported_atomic_req_8B_endianness_mode_1); + + if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS) + return 0; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + + /* Set requestor to host endianness */ + MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianness_mode, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS); + + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC); +} + +static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) +{ + void *set_hca_cap; + bool do_set = false; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || + !MLX5_CAP_GEN(dev, pg)) + return 0; + + err = mlx5_core_get_caps(dev, MLX5_CAP_ODP); + if (err) + return err; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ODP]->cur, + MLX5_ST_SZ_BYTES(odp_cap)); + +#define ODP_CAP_SET_MAX(dev, field) \ + do { \ + u32 _res = MLX5_CAP_ODP_MAX(dev, field); \ + if (_res) { \ + do_set = true; \ + MLX5_SET(odp_cap, set_hca_cap, field, _res); \ + } \ + } while (0) + + ODP_CAP_SET_MAX(dev, ud_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, rc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, xrc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, xrc_odp_caps.send); + ODP_CAP_SET_MAX(dev, xrc_odp_caps.receive); + ODP_CAP_SET_MAX(dev, xrc_odp_caps.write); + ODP_CAP_SET_MAX(dev, xrc_odp_caps.read); + ODP_CAP_SET_MAX(dev, xrc_odp_caps.atomic); + ODP_CAP_SET_MAX(dev, dc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, dc_odp_caps.send); + ODP_CAP_SET_MAX(dev, dc_odp_caps.receive); + ODP_CAP_SET_MAX(dev, dc_odp_caps.write); + ODP_CAP_SET_MAX(dev, dc_odp_caps.read); + ODP_CAP_SET_MAX(dev, dc_odp_caps.atomic); + + if (!do_set) + return 0; + + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ODP); +} + +static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + &val); + if (!err) + return val.vu32; + mlx5_core_dbg(dev, "Failed to get param. 
err = %d\n", err); + return err; +} + +int mlx5_core_other_function_set_caps(struct mlx5_core_dev *dev, + const void *hca_cap_on_behalf, + u16 function_id) +{ + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + void *set_hca_cap; + void *set_ctx; + int ret; + + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + return -ENOMEM; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + memcpy(set_hca_cap, hca_cap_on_behalf, MLX5_ST_SZ_BYTES(cmd_hca_cap)); + MLX5_SET(set_hca_cap_in, set_ctx, function_id, function_id); + MLX5_SET(set_hca_cap_in, set_ctx, other_function, true); + + ret = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); + kfree(set_ctx); + return ret; +} + +bool mlx5_is_roce_on(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + int err; + + err = devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + &val); + + if (!err) + return val.vbool; + + mlx5_core_dbg(dev, "Failed to get param. err = %d\n", err); + return MLX5_CAP_GEN(dev, roce); +} +EXPORT_SYMBOL(mlx5_is_roce_on); + +static int handle_hca_cap(struct mlx5_core_dev *dev, void *set_ctx) +{ + struct mlx5_profile *prof = &dev->profile; + void *set_hca_cap; + int max_uc_list; + int err; + + err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL); + if (err) + return err; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, + capability); + memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_GENERAL]->cur, + MLX5_ST_SZ_BYTES(cmd_hca_cap)); + + mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n", + mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)), + 128); + /* we limit the size of the pkey table to 128 entries for now */ + MLX5_SET(cmd_hca_cap, set_hca_cap, pkey_table_size, + to_fw_pkey_sz(dev, 128)); + + /* Check log_max_qp from HCA caps to set in current profile */ + if (prof->log_max_qp == LOG_MAX_SUPPORTED_QPS) { + prof->log_max_qp = min_t(u8, 18, MLX5_CAP_GEN_MAX(dev, log_max_qp)); + } else if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < prof->log_max_qp) { + mlx5_core_warn(dev, "log_max_qp value in current profile is %d, changing it to HCA capability limit (%d)\n", + prof->log_max_qp, + MLX5_CAP_GEN_MAX(dev, log_max_qp)); + prof->log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp); + } + if (prof->mask & MLX5_PROF_MASK_QP_SIZE) + MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_qp, + prof->log_max_qp); + + /* disable cmdif checksum */ + MLX5_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0); + + /* enable drain sigerr */ + MLX5_SET(cmd_hca_cap, set_hca_cap, drain_sigerr, 1); + + /* Enable 4K UAR only when HCA supports it and page size is bigger + * than 4K. + */ + if (MLX5_CAP_GEN_MAX(dev, uar_4k) && PAGE_SIZE > 4096) + MLX5_SET(cmd_hca_cap, set_hca_cap, uar_4k, 1); + + MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12); + + if (MLX5_CAP_GEN_MAX(dev, cache_line_128byte)) + MLX5_SET(cmd_hca_cap, + set_hca_cap, + cache_line_128byte, + cache_line_size() >= 128 ? 
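A worked example of the log_max_qp clamp below, as an illustrative sketch: when the profile asks for LOG_MAX_SUPPORTED_QPS (0xff), the driver settles on min(18, firmware maximum), i.e. at most 2^18 = 262144 QPs, while an explicit profile value above the firmware limit is lowered to that limit with a warning.

static inline u8 example_auto_log_max_qp(struct mlx5_core_dev *dev)
{
	/* the value chosen when prof->log_max_qp == LOG_MAX_SUPPORTED_QPS */
	return min_t(u8, 18, MLX5_CAP_GEN_MAX(dev, log_max_qp));
}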
1 : 0); + + if (MLX5_CAP_GEN_MAX(dev, dct)) + MLX5_SET(cmd_hca_cap, set_hca_cap, dct, 1); + + if (MLX5_CAP_GEN_MAX(dev, pci_sync_for_fw_update_event)) + MLX5_SET(cmd_hca_cap, set_hca_cap, pci_sync_for_fw_update_event, 1); + + if (MLX5_CAP_GEN_MAX(dev, num_vhca_ports)) + MLX5_SET(cmd_hca_cap, + set_hca_cap, + num_vhca_ports, + MLX5_CAP_GEN_MAX(dev, num_vhca_ports)); + + if (MLX5_CAP_GEN_MAX(dev, release_all_pages)) + MLX5_SET(cmd_hca_cap, set_hca_cap, release_all_pages, 1); + + if (MLX5_CAP_GEN_MAX(dev, mkey_by_name)) + MLX5_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1); + + mlx5_vhca_state_cap_handle(dev, set_hca_cap); + + if (MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix)) + MLX5_SET(cmd_hca_cap, set_hca_cap, num_total_dynamic_vf_msix, + MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix)); + + if (MLX5_CAP_GEN(dev, roce_rw_supported)) + MLX5_SET(cmd_hca_cap, set_hca_cap, roce, mlx5_is_roce_on(dev)); + + max_uc_list = max_uc_list_get_devlink_param(dev); + if (max_uc_list > 0) + MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_current_uc_list, + ilog2(max_uc_list)); + + return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); +} + +/* Cached MLX5_CAP_GEN(dev, roce) can be out of sync this early in the + * boot process. + * In case RoCE cap is writable in FW and user/devlink requested to change the + * cap, we are yet to query the final state of the above cap. + * Hence, the need for this function. + * + * Returns + * True: + * 1) RoCE cap is read only in FW and already disabled + * OR: + * 2) RoCE cap is writable in FW and user/devlink requested it off. + * + * In any other case, return False. + */ +static bool is_roce_fw_disabled(struct mlx5_core_dev *dev) +{ + return (MLX5_CAP_GEN(dev, roce_rw_supported) && !mlx5_is_roce_on(dev)) || + (!MLX5_CAP_GEN(dev, roce_rw_supported) && !MLX5_CAP_GEN(dev, roce)); +} + +static int handle_hca_cap_roce(struct mlx5_core_dev *dev, void *set_ctx) +{ + void *set_hca_cap; + int err; + + if (is_roce_fw_disabled(dev)) + return 0; + + err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE); + if (err) + return err; + + if (MLX5_CAP_ROCE(dev, sw_r_roce_src_udp_port) || + !MLX5_CAP_ROCE_MAX(dev, sw_r_roce_src_udp_port)) + return 0; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ROCE]->cur, + MLX5_ST_SZ_BYTES(roce_cap)); + MLX5_SET(roce_cap, set_hca_cap, sw_r_roce_src_udp_port, 1); + + err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ROCE); + return err; +} + +static int handle_hca_cap_port_selection(struct mlx5_core_dev *dev, + void *set_ctx) +{ + void *set_hca_cap; + int err; + + if (!MLX5_CAP_GEN(dev, port_selection_cap)) + return 0; + + err = mlx5_core_get_caps(dev, MLX5_CAP_PORT_SELECTION); + if (err) + return err; + + if (MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass) || + !MLX5_CAP_PORT_SELECTION_MAX(dev, port_select_flow_table_bypass)) + return 0; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_PORT_SELECTION]->cur, + MLX5_ST_SZ_BYTES(port_selection_cap)); + MLX5_SET(port_selection_cap, set_hca_cap, port_select_flow_table_bypass, 1); + + err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION); + + return err; +} + +static int set_hca_cap(struct mlx5_core_dev *dev) +{ + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + void *set_ctx; + int err; + + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + return -ENOMEM; + + err = handle_hca_cap(dev, set_ctx); + if (err) { + mlx5_core_err(dev, 
"handle_hca_cap failed\n"); + goto out; + } + + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_atomic(dev, set_ctx); + if (err) { + mlx5_core_err(dev, "handle_hca_cap_atomic failed\n"); + goto out; + } + + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_odp(dev, set_ctx); + if (err) { + mlx5_core_err(dev, "handle_hca_cap_odp failed\n"); + goto out; + } + + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_roce(dev, set_ctx); + if (err) { + mlx5_core_err(dev, "handle_hca_cap_roce failed\n"); + goto out; + } + + memset(set_ctx, 0, set_sz); + err = handle_hca_cap_port_selection(dev, set_ctx); + if (err) { + mlx5_core_err(dev, "handle_hca_cap_port_selection failed\n"); + goto out; + } + +out: + kfree(set_ctx); + return err; +} + +static int set_hca_ctrl(struct mlx5_core_dev *dev) +{ + struct mlx5_reg_host_endianness he_in; + struct mlx5_reg_host_endianness he_out; + int err; + + if (!mlx5_core_is_pf(dev)) + return 0; + + memset(&he_in, 0, sizeof(he_in)); + he_in.he = MLX5_SET_HOST_ENDIANNESS; + err = mlx5_core_access_reg(dev, &he_in, sizeof(he_in), + &he_out, sizeof(he_out), + MLX5_REG_HOST_ENDIANNESS, 0, 1); + return err; +} + +static int mlx5_core_set_hca_defaults(struct mlx5_core_dev *dev) +{ + int ret = 0; + + /* Disable local_lb by default */ + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + ret = mlx5_nic_vport_update_local_lb(dev, false); + + return ret; +} + +static int enable_hca(struct mlx5_core_dev *dev, u16 func_id, bool ecpu) +{ + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; + + MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); + MLX5_SET(enable_hca_in, in, function_id, func_id); + MLX5_SET(enable_hca_in, in, embedded_cpu_function, ecpu); + return mlx5_cmd_exec_in(dev, enable_hca, in); +} + +int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id) +{ + return enable_hca(dev, func_id, dev->caps.embedded_cpu); +} + +int mlx5_core_enable_sf_hca(struct mlx5_core_dev *dev, u16 sf_func_id) +{ + /* When enabling SF, it doesn't matter if is enabled on ECPF or PF, + * embedded_cpu bit must be cleared as expected by device firmware. + * SF function ids are split between ECPF And PF. A given SF is for + * ECPF or for PF is decided by SF's function id by the firmware. 
+ */ + return enable_hca(dev, sf_func_id, 0); +} + +static int disable_hca(struct mlx5_core_dev *dev, u16 func_id, bool ecpu) +{ + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; + + MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); + MLX5_SET(disable_hca_in, in, function_id, func_id); + MLX5_SET(enable_hca_in, in, embedded_cpu_function, ecpu); + return mlx5_cmd_exec_in(dev, disable_hca, in); +} + +int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id) +{ + return disable_hca(dev, func_id, dev->caps.embedded_cpu); +} + +int mlx5_core_disable_sf_hca(struct mlx5_core_dev *dev, u16 sf_func_id) +{ + return disable_hca(dev, sf_func_id, 0); +} + +static int mlx5_core_set_issi(struct mlx5_core_dev *dev) +{ + u32 query_out[MLX5_ST_SZ_DW(query_issi_out)] = {}; + u32 query_in[MLX5_ST_SZ_DW(query_issi_in)] = {}; + u32 sup_issi; + int err; + + MLX5_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI); + err = mlx5_cmd_exec_inout(dev, query_issi, query_in, query_out); + if (err) { + u32 syndrome = MLX5_GET(query_issi_out, query_out, syndrome); + u8 status = MLX5_GET(query_issi_out, query_out, status); + + if (!status || syndrome == MLX5_DRIVER_SYND) { + mlx5_core_err(dev, "Failed to query ISSI err(%d) status(%d) synd(%d)\n", + err, status, syndrome); + return err; + } + + mlx5_core_warn(dev, "Query ISSI is not supported by FW, ISSI is 0\n"); + dev->issi = 0; + return 0; + } + + sup_issi = MLX5_GET(query_issi_out, query_out, supported_issi_dw0); + + if (sup_issi & (1 << 1)) { + u32 set_in[MLX5_ST_SZ_DW(set_issi_in)] = {}; + + MLX5_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI); + MLX5_SET(set_issi_in, set_in, current_issi, 1); + err = mlx5_cmd_exec_in(dev, set_issi, set_in); + if (err) { + mlx5_core_err(dev, "Failed to set ISSI to 1 err(%d)\n", + err); + return err; + } + + dev->issi = 1; + + return 0; + } else if (sup_issi & (1 << 0) || !sup_issi) { + return 0; + } + + return -EOPNOTSUPP; +} + +static ssize_t mlx5_roce_enable_show_enabled(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = container_of(device, struct pci_dev, dev); + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int ret; + + mutex_lock(&dev->roce.state_lock); + ret = dev->roce.enabled; + mutex_unlock(&dev->roce.state_lock); + + return sprintf(buf, "%d\n", ret); +} + +static ssize_t mlx5_roce_enable_set_enabled(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = container_of(device, struct pci_dev, dev); + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value value; + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret) + return -EINVAL; + + if (val && !MLX5_CAP_GEN(dev, roce)) + return -EOPNOTSUPP; + + mutex_lock(&dev->roce.state_lock); + dev->roce.enabled = val; + value.vbool = val; + devlink_param_driverinit_value_set(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + value); + mutex_unlock(&dev->roce.state_lock); + + return count; +} + +static DEVICE_ATTR(roce_enable, 0644, mlx5_roce_enable_show_enabled, + mlx5_roce_enable_set_enabled); + +static struct device_attribute *mlx5_roce_enable_dev_attrs = + &dev_attr_roce_enable; + +static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int err = 0; + + mutex_init(&dev->pci_status_mutex); + + dev->bar_addr = pci_resource_start(pdev, 0); + + err = mlx5_pci_enable_device(dev); + if (err) { + 
mlx5_core_err(dev, "Cannot enable PCI device, aborting\n"); + goto err_file; + } + + err = request_bar(pdev); + if (err) { + mlx5_core_err(dev, "error requesting BARs, aborting\n"); + goto err_disable; + } + + pci_set_master(pdev); + + err = set_dma_caps(pdev); + if (err) { + mlx5_core_err(dev, "Failed setting DMA capabilities mask, aborting\n"); + goto err_clr_master; + } + + if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + mlx5_core_dbg(dev, "Enabling pci atomics failed\n"); + + dev->iseg_base = dev->bar_addr; + dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg)); + if (!dev->iseg) { + err = -ENOMEM; + mlx5_core_err(dev, "Failed mapping initialization segment, aborting\n"); + goto err_clr_master; + } + + mlx5_pci_vsc_init(dev); + dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev); + + err = pci_save_state(pdev); + if (err) { + dev_err(&pdev->dev, "pci_save_state failed with error code: %d\n", err); + goto err_io_unmap; + } + + + return 0; + +err_io_unmap: + iounmap(dev->iseg); +err_clr_master: + pci_clear_master(dev->pdev); + release_bar(dev->pdev); +err_disable: + mlx5_pci_disable_device(dev); +err_file: + device_remove_file(&pdev->dev, mlx5_roce_enable_dev_attrs); + return err; +} + +static void mlx5_pci_close(struct mlx5_core_dev *dev) +{ + /* health work might still be active, and it needs pci bar in + * order to know the NIC state. Therefore, drain the health WQ + * before removing the pci bars + */ + mlx5_drain_health_wq(dev); + iounmap(dev->iseg); + pci_clear_master(dev->pdev); + release_bar(dev->pdev); + mlx5_pci_disable_device(dev); + device_remove_file(&dev->pdev->dev, mlx5_roce_enable_dev_attrs); +} + +static int mlx5_init_once(struct mlx5_core_dev *dev) +{ + int err; + + dev->priv.devcom = mlx5_devcom_register_device(dev); + if (IS_ERR(dev->priv.devcom)) + mlx5_core_err(dev, "failed to register with devcom (0x%p)\n", + dev->priv.devcom); + + err = mlx5_query_board_id(dev); + if (err) { + mlx5_core_err(dev, "query board id failed\n"); + goto err_devcom; + } + + err = mlx5_irq_table_init(dev); + if (err) { + mlx5_core_err(dev, "failed to initialize irq table\n"); + goto err_devcom; + } + + err = mlx5_eq_table_init(dev); + if (err) { + mlx5_core_err(dev, "failed to initialize eq\n"); + goto err_irq_cleanup; + } + + err = mlx5_events_init(dev); + if (err) { + mlx5_core_err(dev, "failed to initialize events\n"); + goto err_eq_cleanup; + } + + err = mlx5_fw_reset_init(dev); + if (err) { + mlx5_core_err(dev, "failed to initialize fw reset events\n"); + goto err_events_cleanup; + } + + mlx5_cq_debugfs_init(dev); + + mlx5_init_reserved_gids(dev); + + mlx5_init_clock(dev); + + dev->vxlan = mlx5_vxlan_create(dev); + dev->geneve = mlx5_geneve_create(dev); + + err = mlx5_init_rl_table(dev); + if (err) { + mlx5_core_err(dev, "Failed to init rate limiting\n"); + goto err_tables_cleanup; + } + + err = mlx5_mpfs_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init l2 table %d\n", err); + goto err_rl_cleanup; + } + + err = mlx5_sriov_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init sriov %d\n", err); + goto err_mpfs_cleanup; + } + + err = mlx5_mst_dump_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init mst dump %d\n", err); + goto err_mst_dump_cleanup; + } + + err = mlx5_eswitch_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init eswitch %d\n", err); + goto err_sriov_cleanup; + } + 
+ err = mlx5_fpga_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init fpga device %d\n", err); + goto err_eswitch_cleanup; + } + + err = mlx5_vhca_event_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init vhca event notifier %d\n", err); + goto err_fpga_cleanup; + } + + err = mlx5_sf_hw_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF HW table %d\n", err); + goto err_sf_hw_table_cleanup; + } + + err = mlx5_sf_table_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init SF table %d\n", err); + goto err_sf_table_cleanup; + } + + err = mlx5_devm_register(dev); + if (err) + goto err_devm; + + err = mlx5_regex_sysfs_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init regex sysfs %d\n", err); + goto err_regex; + } + + dev->dm = mlx5_dm_create(dev); + if (IS_ERR(dev->dm)) + mlx5_core_warn(dev, "Failed to init device memory%d\n", err); + + dev->tracer = mlx5_fw_tracer_create(dev); + dev->hv_vhca = mlx5_hv_vhca_create(dev); + dev->rsc_dump = mlx5_rsc_dump_create(dev); + + return 0; + +err_regex: + mlx5_devm_unregister(dev); +err_devm: + mlx5_sf_table_cleanup(dev); +err_sf_table_cleanup: + mlx5_sf_hw_table_cleanup(dev); +err_sf_hw_table_cleanup: + mlx5_vhca_event_cleanup(dev); +err_fpga_cleanup: + mlx5_fpga_cleanup(dev); +err_eswitch_cleanup: + mlx5_eswitch_cleanup(dev->priv.eswitch); +err_sriov_cleanup: + mlx5_sriov_cleanup(dev); +err_mst_dump_cleanup: + mlx5_mst_dump_cleanup(dev); +err_mpfs_cleanup: + mlx5_mpfs_cleanup(dev); +err_rl_cleanup: + mlx5_cleanup_rl_table(dev); +err_tables_cleanup: + mlx5_geneve_destroy(dev->geneve); + mlx5_vxlan_destroy(dev->vxlan); + mlx5_cq_debugfs_cleanup(dev); + mlx5_fw_reset_cleanup(dev); +err_events_cleanup: + mlx5_events_cleanup(dev); +err_eq_cleanup: + mlx5_eq_table_cleanup(dev); +err_irq_cleanup: + mlx5_irq_table_cleanup(dev); +err_devcom: + mlx5_devcom_unregister_device(dev->priv.devcom); + + return err; +} + +static void mlx5_cleanup_once(struct mlx5_core_dev *dev) +{ + mlx5_rsc_dump_destroy(dev); + mlx5_hv_vhca_destroy(dev->hv_vhca); + mlx5_fw_tracer_destroy(dev->tracer); + mlx5_dm_cleanup(dev); + mlx5_regex_sysfs_cleanup(dev); + mlx5_devm_unregister(dev); + mlx5_sf_table_cleanup(dev); + mlx5_sf_hw_table_cleanup(dev); + mlx5_vhca_event_cleanup(dev); + mlx5_fpga_cleanup(dev); + mlx5_eswitch_cleanup(dev->priv.eswitch); + mlx5_sriov_cleanup(dev); + mlx5_mst_dump_cleanup(dev); + mlx5_mpfs_cleanup(dev); + mlx5_cleanup_rl_table(dev); + mlx5_geneve_destroy(dev->geneve); + mlx5_vxlan_destroy(dev->vxlan); + mlx5_cleanup_clock(dev); + mlx5_cleanup_reserved_gids(dev); + mlx5_cq_debugfs_cleanup(dev); + mlx5_fw_reset_cleanup(dev); + mlx5_events_cleanup(dev); + mlx5_eq_table_cleanup(dev); + mlx5_irq_table_cleanup(dev); + mlx5_devcom_unregister_device(dev->priv.devcom); +} + +static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot, bool recovery) +{ + u64 timeout; + int err; + + mlx5_core_info(dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev), + fw_rev_min(dev), fw_rev_sub(dev)); + + /* Only PFs hold the relevant PCIe information for this query */ + if (mlx5_core_is_pf(dev)) + pcie_print_link_status(dev->pdev); + + /* wait for firmware to accept initialization segments configurations + */ + if (recovery) + timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT); + else + timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT); + err = wait_fw_init(dev, timeout, + mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL)); + if (err) { + mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, 
aborting\n", + timeout); + return err; + } + + err = mlx5_cmd_init(dev); + if (err) { + mlx5_core_err(dev, "Failed initializing command interface, aborting\n"); + return err; + } + + mlx5_tout_query_iseg(dev); + + err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_INIT), 0); + if (err) { + mlx5_core_err(dev, "Firmware over %llu MS in initializing state, aborting\n", + mlx5_tout_ms(dev, FW_INIT)); + goto err_cmd_cleanup; + } + + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP); + + mlx5_start_health_poll(dev); + + err = mlx5_core_enable_hca(dev, 0); + if (err) { + mlx5_core_err(dev, "enable hca failed\n"); + goto stop_health_poll; + } + + err = mlx5_core_set_issi(dev); + if (err) { + mlx5_core_err(dev, "failed to set issi\n"); + goto err_disable_hca; + } + + err = mlx5_satisfy_startup_pages(dev, 1); + if (err) { + mlx5_core_err(dev, "failed to allocate boot pages\n"); + goto err_disable_hca; + } + + err = mlx5_tout_query_dtor(dev); + if (err) { + mlx5_core_err(dev, "failed to read dtor\n"); + goto reclaim_boot_pages; + } + + err = mlx5_update_guids(dev); + if (err) + mlx5_core_err(dev, "failed to update guids. continue with default...\n"); + + err = set_hca_ctrl(dev); + if (err) { + mlx5_core_err(dev, "set_hca_ctrl failed\n"); + goto reclaim_boot_pages; + } + + err = set_hca_cap(dev); + if (err) { + mlx5_core_err(dev, "set_hca_cap failed\n"); + goto reclaim_boot_pages; + } + + err = mlx5_satisfy_startup_pages(dev, 0); + if (err) { + mlx5_core_err(dev, "failed to allocate init pages\n"); + goto reclaim_boot_pages; + } + + err = mlx5_cmd_init_hca(dev, sw_owner_id); + if (err) { + mlx5_core_err(dev, "init hca failed\n"); + goto reclaim_boot_pages; + } + + mlx5_set_driver_version(dev); + + err = mlx5_query_hca_caps(dev); + if (err) { + mlx5_core_err(dev, "query hca failed\n"); + goto reclaim_boot_pages; + } + mlx5_start_health_fw_log_up(dev); + + return 0; + +reclaim_boot_pages: + mlx5_reclaim_startup_pages(dev); +err_disable_hca: + mlx5_core_disable_hca(dev, 0); +stop_health_poll: + mlx5_stop_health_poll(dev, boot); +err_cmd_cleanup: + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); + mlx5_cmd_cleanup(dev); + + return err; +} + +static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot) +{ + int err; + + err = mlx5_cmd_teardown_hca(dev); + if (err) { + mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n"); + return err; + } + mlx5_reclaim_startup_pages(dev); + mlx5_core_disable_hca(dev, 0); + mlx5_stop_health_poll(dev, boot); + mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); + mlx5_cmd_cleanup(dev); + + return 0; +} + +static int mlx5_load(struct mlx5_core_dev *dev) +{ + int err; + + dev->priv.uar = mlx5_get_uars_page(dev); + if (IS_ERR(dev->priv.uar)) { + mlx5_core_err(dev, "Failed allocating uar, aborting\n"); + err = PTR_ERR(dev->priv.uar); + return err; + } + + mlx5_events_start(dev); + mlx5_pagealloc_start(dev); + + err = mlx5_irq_table_create(dev); + if (err) { + mlx5_core_err(dev, "Failed to alloc IRQs\n"); + goto err_irq_table; + } + + err = mlx5_eq_table_create(dev); + if (err) { + mlx5_core_err(dev, "Failed to create EQs\n"); + goto err_eq_table; + } + + err = mlx5_fw_tracer_init(dev->tracer); + if (err) { + mlx5_core_err(dev, "Failed to init FW tracer %d\n", err); + mlx5_fw_tracer_destroy(dev->tracer); + dev->tracer = NULL; + } + + mlx5_fw_reset_events_start(dev); + mlx5_hv_vhca_init(dev->hv_vhca); + + err = mlx5_rsc_dump_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init Resource dump %d\n", err); + mlx5_rsc_dump_destroy(dev); + dev->rsc_dump = NULL; + } + + err = 
mlx5_fpga_device_start(dev); + if (err) { + mlx5_core_err(dev, "fpga device start failed %d\n", err); + goto err_fpga_start; + } + + mlx5_accel_ipsec_init(dev); + + err = mlx5_accel_tls_init(dev); + if (err) { + mlx5_core_err(dev, "TLS device start failed %d\n", err); + goto err_tls_start; + } + + err = mlx5_init_fs(dev); + if (err) { + mlx5_core_err(dev, "Failed to init flow steering\n"); + goto err_fs; + } + + err = mlx5_core_set_hca_defaults(dev); + if (err) { + mlx5_core_err(dev, "Failed to set hca defaults\n"); + goto err_set_hca; + } + + mlx5_vhca_event_start(dev); + + err = mlx5_sf_hw_table_create(dev); + if (err) { + mlx5_core_err(dev, "sf table create failed %d\n", err); + goto err_vhca; + } + + err = mlx5_ec_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init embedded CPU\n"); + goto err_ec; + } + + mlx5_diag_cnt_init(dev); + + if (!mlx5_core_is_sf(dev)) + register_pcie_dev_attr_group(dev->pdev); + + mlx5_lag_add_mdev(dev); + err = mlx5_sriov_attach(dev); + if (err) { + mlx5_core_err(dev, "sriov init failed %d\n", err); + goto err_sriov; + } + + mlx5_sf_dev_table_create(dev); + + return 0; + +err_sriov: + mlx5_lag_remove_mdev(dev); + unregister_pcie_dev_attr_group(dev->pdev); + mlx5_diag_cnt_cleanup(dev); + mlx5_ec_cleanup(dev); +err_ec: + mlx5_sf_hw_table_destroy(dev); +err_vhca: + mlx5_vhca_event_stop(dev); +err_set_hca: + mlx5_cleanup_fs(dev); +err_fs: + mlx5_accel_tls_cleanup(dev); +err_tls_start: + mlx5_accel_ipsec_cleanup(dev); + mlx5_fpga_device_stop(dev); +err_fpga_start: + mlx5_rsc_dump_cleanup(dev); + mlx5_hv_vhca_cleanup(dev->hv_vhca); + mlx5_fw_reset_events_stop(dev); + mlx5_fw_tracer_cleanup(dev->tracer); + mlx5_eq_table_destroy(dev); +err_eq_table: + mlx5_irq_table_destroy(dev); +err_irq_table: + mlx5_pagealloc_stop(dev); + mlx5_events_stop(dev); + mlx5_put_uars_page(dev, dev->priv.uar); + return err; +} + +static void mlx5_unload(struct mlx5_core_dev *dev) +{ + mlx5_sf_dev_table_destroy(dev); + mlx5_eswitch_disable(dev->priv.eswitch); + mlx5_sriov_detach(dev); + mlx5_lag_remove_mdev(dev); + unregister_pcie_dev_attr_group(dev->pdev); + mlx5_ec_cleanup(dev); + mlx5_diag_cnt_cleanup(dev); + mlx5_sf_hw_table_destroy(dev); + mlx5_vhca_event_stop(dev); + mlx5_cleanup_fs(dev); + mlx5_accel_ipsec_cleanup(dev); + mlx5_accel_tls_cleanup(dev); + mlx5_fpga_device_stop(dev); + mlx5_rsc_dump_cleanup(dev); + mlx5_hv_vhca_cleanup(dev->hv_vhca); + mlx5_fw_reset_events_stop(dev); + mlx5_fw_tracer_cleanup(dev->tracer); + mlx5_eq_table_destroy(dev); + mlx5_irq_table_destroy(dev); + mlx5_pagealloc_stop(dev); + mlx5_events_stop(dev); + mlx5_put_uars_page(dev, dev->priv.uar); +} + +int mlx5_init_one(struct mlx5_core_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->intf_state_mutex); + dev->state = MLX5_DEVICE_STATE_UP; + + err = mlx5_function_setup(dev, true, false); + if (err) + goto err_function; + + dev->roce.enabled = MLX5_CAP_GEN(dev, roce); + err = mlx5_init_once(dev); + if (err) { + mlx5_core_err(dev, "sw objs init failed\n"); + goto function_teardown; + } + + err = mlx5_load(dev); + if (err) + goto err_load; + + /* the publishing of the params need to be done AFTER eq_table + * is created. 
+ */ + mlx5_devm_params_publish(dev); + set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + + err = mlx5_devlink_register(priv_to_devlink(dev)); + if (err) + goto err_devlink_reg; + + err = mlx5_register_device(dev); + if (err) + goto err_register; + + mutex_unlock(&dev->intf_state_mutex); + return 0; + +err_register: + mlx5_devlink_unregister(priv_to_devlink(dev)); +err_devlink_reg: + clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + mlx5_unload(dev); +err_load: + mlx5_cleanup_once(dev); +function_teardown: + mlx5_function_teardown(dev, true); +err_function: + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + mutex_unlock(&dev->intf_state_mutex); + return err; +} + +void mlx5_uninit_one(struct mlx5_core_dev *dev) +{ + mutex_lock(&dev->intf_state_mutex); + + mlx5_unregister_device(dev); + mlx5_devlink_unregister(priv_to_devlink(dev)); + + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { + mlx5_core_warn(dev, "%s: interface is down, NOP\n", + __func__); + mlx5_cleanup_once(dev); + goto out; + } + + clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + mlx5_unload(dev); + mlx5_cleanup_once(dev); + mlx5_function_teardown(dev, true); +out: + mutex_unlock(&dev->intf_state_mutex); +} + +int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery) +{ + int err = 0; + + mutex_lock(&dev->intf_state_mutex); + if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { + mlx5_core_warn(dev, "interface is up, NOP\n"); + goto out; + } + /* remove any previous indication of internal error */ + dev->state = MLX5_DEVICE_STATE_UP; + + if (test_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state)) { + mlx5_core_warn(dev, "device is being removed, stop load\n"); + err = -ENODEV; + goto out; + } + + err = mlx5_function_setup(dev, false, recovery); + if (err) + goto err_function; + + err = mlx5_load(dev); + if (err) + goto err_load; + + set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + + err = mlx5_attach_device(dev); + if (err) + goto err_attach; + + mutex_unlock(&dev->intf_state_mutex); + return 0; + +err_attach: + clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + mlx5_unload(dev); +err_load: + mlx5_function_teardown(dev, false); +err_function: + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; +out: + mutex_unlock(&dev->intf_state_mutex); + return err; +} + +static int mlx5_try_fast_unload(struct mlx5_core_dev *dev) +{ + bool fast_teardown = false, force_teardown = false; + bool sf_dev_allocated; + int ret = 1; + + fast_teardown = MLX5_CAP_GEN(dev, fast_teardown); + force_teardown = MLX5_CAP_GEN(dev, force_teardown); + + mlx5_core_dbg(dev, "force teardown firmware support=%d\n", force_teardown); + mlx5_core_dbg(dev, "fast teardown firmware support=%d\n", fast_teardown); + + if (!fast_teardown && !force_teardown) + return -EOPNOTSUPP; + + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_core_dbg(dev, "Device in internal error state, giving up\n"); + return -EAGAIN; + } + + sf_dev_allocated = mlx5_sf_dev_allocated(dev); + if (sf_dev_allocated) + return -EBUSY; + + /* Panic tear down fw command will stop the PCI bus communication + * with the HCA, so the health poll is no longer needed. 
+ */ + mlx5_drain_health_wq(dev); + mlx5_stop_health_poll(dev, false); + + if (mlx5_sensor_pci_not_working(dev)) { + mlx5_core_dbg(dev, "PCI interface is down, giving up\n"); + mlx5_enter_error_state(dev, true); + return -EIO; + } + + ret = mlx5_cmd_fast_teardown_hca(dev); + if (!ret) + goto succeed; + + ret = mlx5_cmd_force_teardown_hca(dev); + if (!ret) + goto succeed; + + mlx5_core_dbg(dev, "Firmware couldn't do fast unload error: %d\n", ret); + mlx5_start_health_poll(dev); + return ret; + +succeed: + mlx5_enter_error_state(dev, true); + + return 0; +} + +void mlx5_unload_one(struct mlx5_core_dev *dev) +{ + mutex_lock(&dev->intf_state_mutex); + + mlx5_detach_device(dev); + + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { + mlx5_core_warn(dev, "%s: interface is down, NOP\n", + __func__); + goto out; + } + + clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + mlx5_unload(dev); + mlx5_function_teardown(dev, false); +out: + mutex_unlock(&dev->intf_state_mutex); +} + +static const int types[] = { + MLX5_CAP_GENERAL, + MLX5_CAP_GENERAL_2, + MLX5_CAP_ETHERNET_OFFLOADS, + MLX5_CAP_IPOIB_ENHANCED_OFFLOADS, + MLX5_CAP_ODP, + MLX5_CAP_ATOMIC, + MLX5_CAP_ROCE, + MLX5_CAP_IPOIB_OFFLOADS, + MLX5_CAP_FLOW_TABLE, + MLX5_CAP_ESWITCH_FLOW_TABLE, + MLX5_CAP_ESWITCH, + MLX5_CAP_VECTOR_CALC, + MLX5_CAP_QOS, + MLX5_CAP_DEBUG, + MLX5_CAP_NVMF, + MLX5_CAP_DEV_MEM, + MLX5_CAP_DEV_EVENT, + MLX5_CAP_TLS, + MLX5_CAP_VDPA_EMULATION, + MLX5_CAP_IPSEC, + MLX5_CAP_PORT_SELECTION, + MLX5_CAP_DEV_SHAMPO, + MLX5_CAP_MACSEC, +}; + +static void mlx5_hca_caps_free(struct mlx5_core_dev *dev) +{ + int type; + int i; + + for (i = 0; i < ARRAY_SIZE(types); i++) { + type = types[i]; + kfree(dev->caps.hca[type]); + } +} + +static int mlx5_hca_caps_alloc(struct mlx5_core_dev *dev) +{ + struct mlx5_hca_cap *cap; + int type; + int i; + + for (i = 0; i < ARRAY_SIZE(types); i++) { + cap = kzalloc(sizeof(*cap), GFP_KERNEL); + if (!cap) + goto err; + type = types[i]; + dev->caps.hca[type] = cap; + } + + return 0; + +err: + mlx5_hca_caps_free(dev); + return -ENOMEM; +} + +int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) +{ + struct mlx5_priv *priv = &dev->priv; + int err; + + memcpy(&dev->profile, &profile[profile_idx], sizeof(dev->profile)); + INIT_LIST_HEAD(&priv->ctx_list); + spin_lock_init(&priv->ctx_lock); + lockdep_register_key(&dev->lock_key); + mutex_init(&dev->intf_state_mutex); + lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key); + + mutex_init(&priv->bfregs.reg_head.lock); + mutex_init(&priv->bfregs.wc_head.lock); + INIT_LIST_HEAD(&priv->bfregs.reg_head.list); + INIT_LIST_HEAD(&priv->bfregs.wc_head.list); + mutex_init(&dev->roce.state_lock); + mutex_init(&priv->alloc_mutex); + mutex_init(&priv->pgdir_mutex); + INIT_LIST_HEAD(&priv->pgdir_list); + + priv->numa_node = dev_to_node(mlx5_core_dma_dev(dev)); + priv->dbg.dbg_root = debugfs_create_dir(dev_name(dev->device), + mlx5_debugfs_root); + INIT_LIST_HEAD(&priv->traps); + + err = mlx5_tout_init(dev); + if (err) { + mlx5_core_err(dev, "Failed initializing timeouts, aborting\n"); + goto err_timeout_init; + } + + err = mlx5_health_init(dev); + if (err) + goto err_health_init; + + err = mlx5_pagealloc_init(dev); + if (err) + goto err_pagealloc_init; + + err = mlx5_adev_init(dev); + if (err) + goto err_adev_init; + + err = mlx5_hca_caps_alloc(dev); + if (err) + goto err_hca_caps; + + return 0; + +err_hca_caps: + mlx5_adev_cleanup(dev); +err_adev_init: + mlx5_pagealloc_cleanup(dev); +err_pagealloc_init: + mlx5_health_cleanup(dev); 
+err_health_init: + mlx5_tout_cleanup(dev); +err_timeout_init: + debugfs_remove(dev->priv.dbg.dbg_root); + mutex_destroy(&priv->pgdir_mutex); + mutex_destroy(&priv->alloc_mutex); + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); + lockdep_unregister_key(&dev->lock_key); + return err; +} + +void mlx5_mdev_uninit(struct mlx5_core_dev *dev) +{ + struct mlx5_priv *priv = &dev->priv; + + mlx5_hca_caps_free(dev); + mlx5_adev_cleanup(dev); + mlx5_pagealloc_cleanup(dev); + mlx5_health_cleanup(dev); + mlx5_tout_cleanup(dev); + debugfs_remove_recursive(dev->priv.dbg.dbg_root); + mutex_destroy(&priv->pgdir_mutex); + mutex_destroy(&priv->alloc_mutex); + mutex_destroy(&priv->bfregs.wc_head.lock); + mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->intf_state_mutex); + lockdep_unregister_key(&dev->lock_key); +} + +static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mlx5_core_dev *dev; + struct devlink *devlink; + int err; + + devlink = mlx5_devlink_alloc(&pdev->dev); + if (!devlink) { + dev_err(&pdev->dev, "devlink alloc failed\n"); + return -ENOMEM; + } + + err = device_create_file(&pdev->dev, mlx5_roce_enable_dev_attrs); + if (err) + goto remove_roce_file; + + dev = devlink_priv(devlink); + dev->device = &pdev->dev; + dev->pdev = pdev; + + dev->coredev_type = id->driver_data & MLX5_PCI_DEV_IS_VF ? + MLX5_COREDEV_VF : MLX5_COREDEV_PF; + + pci_set_drvdata(dev->pdev, dev); + + dev->priv.adev_idx = mlx5_adev_idx_alloc(); + if (dev->priv.adev_idx < 0) { + err = dev->priv.adev_idx; + goto adev_init_err; + } + + err = mlx5_mdev_init(dev, prof_sel); + if (err) + goto mdev_init_err; + + err = mlx5_pci_init(dev, pdev, id); + if (err) { + mlx5_core_err(dev, "mlx5_pci_init failed with error code %d\n", + err); + goto pci_init_err; + } + + dev->priv.sw_reset_lag = false; + err = mlx5_crdump_init(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_crdump_init failed with error code %d\n", err); + goto clean_crdump; + } + + err = mlx5_init_one(dev); + if (err) { + mlx5_core_err(dev, "mlx5_init_one failed with error code %d\n", + err); + goto err_init_one; + } + + err = mlx5_crdump_enable(dev); + if (err) + dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); + + pci_save_state(pdev); + devlink_register(devlink); + return 0; + +err_init_one: + mlx5_crdump_cleanup(dev); +clean_crdump: + mlx5_pci_close(dev); +pci_init_err: + mlx5_mdev_uninit(dev); +mdev_init_err: + mlx5_adev_idx_free(dev->priv.adev_idx); +adev_init_err: + device_remove_file(&pdev->dev, mlx5_roce_enable_dev_attrs); +remove_roce_file: + mlx5_devlink_free(devlink); + + return err; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct devlink *devlink = priv_to_devlink(dev); + + /* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain + * fw_reset before unregistering the devlink. + */ + mlx5_drain_fw_reset(dev); + set_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state); + if (mlx5_try_fast_unload(dev)) + dev_dbg(&dev->pdev->dev, "mlx5_try_fast_unload failed\n"); + + devlink_unregister(devlink); + mlx5_crdump_disable(dev); + mlx5_drain_health_wq(dev); + mlx5_uninit_one(dev); + mlx5_crdump_cleanup(dev); + mlx5_pci_close(dev); + mlx5_mdev_uninit(dev); + mlx5_adev_idx_free(dev->priv.adev_idx); + device_remove_file(&pdev->dev, mlx5_roce_enable_dev_attrs); + mlx5_devlink_free(devlink); +} + +#define mlx5_pci_trace(dev, fmt, ...) 
({ \ + struct mlx5_core_dev *__dev = (dev); \ + mlx5_core_info(__dev, "%s Device state = %d health sensors: %d pci_status: %d. " fmt, \ + __func__, __dev->state, mlx5_health_check_fatal_sensors(__dev), \ + __dev->pci_status, ##__VA_ARGS__); \ +}) + +static const char *result2str(enum pci_ers_result result) +{ + return result == PCI_ERS_RESULT_NEED_RESET ? "need reset" : + result == PCI_ERS_RESULT_DISCONNECT ? "disconnect" : + result == PCI_ERS_RESULT_RECOVERED ? "recovered" : + "unknown"; +} + +#ifdef CONFIG_PM +static int suspend(struct device *device) +{ + struct pci_dev *pdev = to_pci_dev(device); + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + dev_info(&pdev->dev, "suspend was called\n"); + + if (pdev->is_virtfn) + return 0; + + mlx5_unload_one(dev); + + err = pci_save_state(pdev); + if (err) { + dev_err(&pdev->dev, "pci_save_state failed with error code: %d\n", err); + return err; + } + + err = pci_enable_wake(pdev, PCI_D3hot, 0); + if (err) { + dev_err(&pdev->dev, "pci_enable_wake failed with error code: %d\n", err); + return err; + } + + mlx5_pci_disable_device(dev); + err = pci_set_power_state(pdev, PCI_D3hot); + if (err) { + dev_warn(&pdev->dev, "pci_set_power_state failed with error code: %d\n", err); + return err; + } + + return 0; +} + +static int resume(struct device *device) +{ + struct pci_dev *pdev = to_pci_dev(device); + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + dev_info(&pdev->dev, "resume was called\n"); + + if (pdev->is_virtfn) + return 0; + + err = pci_set_power_state(pdev, PCI_D0); + if (err) { + dev_warn(&pdev->dev, "pci_set_power_state failed with error code: %d\n", err); + return err; + } + + pci_restore_state(pdev); + err = pci_save_state(pdev); + if (err) { + dev_err(&pdev->dev, "pci_save_state failed with error code: %d\n", err); + return err; + } + err = mlx5_pci_enable_device(dev); + if (err) { + dev_err(&pdev->dev, "mlx5_pci_enable_device failed with error code: %d\n", err); + return err; + } + pci_set_master(pdev); + + err = mlx5_load_one(dev, false); + if (err) { + dev_err(&pdev->dev, "mlx5_load_one failed with error code: %d\n", err); + return err; + } + + return 0; +} + +static const struct dev_pm_ops mlnx_pm = { + .suspend = suspend, + .resume = resume, +}; + +#endif /* CONFIG_PM */ +static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + enum pci_ers_result res; + + mlx5_pci_trace(dev, "Enter, pci channel state = %d\n", state); + + if (pdev->is_virtfn) + return PCI_ERS_RESULT_CAN_RECOVER; + + mlx5_enter_error_state(dev, false); + mlx5_error_sw_reset(dev); + mlx5_unload_one(dev); + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); + + res = state == pci_channel_io_perm_failure ? + PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; + + mlx5_core_info(dev, "%s Done, result = %d, %s\n", __func__, res, result2str(res)); + return res; +} + +/* wait for the device to show vital signs by waiting + * for the health counter to start counting. 
+ */ +static int wait_vital(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_health *health = &dev->priv.health; + const int niter = 100; + u32 last_count = 0; + u32 count; + int i; + + for (i = 0; i < niter; i++) { + count = ioread32be(health->health_counter); + if (count && count != 0xffffffff) { + if (last_count && last_count != count) { + mlx5_core_info(dev, + "wait vital counter value 0x%x after %d iterations\n", + count, i); + return 0; + } + last_count = count; + } + msleep(50); + } + + return -ETIMEDOUT; +} + +static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev) +{ + enum pci_ers_result res = PCI_ERS_RESULT_DISCONNECT; + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + mlx5_core_info(dev, "%s was called\n", __func__); + + if (pdev->is_virtfn) + return PCI_ERS_RESULT_NEED_RESET; + + err = mlx5_pci_enable_device(dev); + if (err) { + mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n", + __func__, err); + goto out; + } + + pci_set_master(pdev); + pci_restore_state(pdev); + pci_save_state(pdev); + + err = wait_vital(pdev); + if (err) { + mlx5_core_err(dev, "%s: wait vital failed with error code: %d\n", + __func__, err); + goto out; + } + + res = PCI_ERS_RESULT_RECOVERED; +out: + mlx5_pci_trace(dev, "Done, err = %d, result = %d, %s\n", err, res, result2str(res)); + return res; +} + +static void mlx5_pci_resume(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + mlx5_pci_trace(dev, "Enter, loading driver..\n"); + + if (pdev->is_virtfn) + return; + + dev->priv.sw_reset_lag = dev->priv.lag_enabled; + err = mlx5_load_one(dev, false); + + mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err, + !err ? "recovered" : "Failed"); +} + +static const struct pci_error_handlers mlx5_err_handler = { + .error_detected = mlx5_pci_err_detected, + .slot_reset = mlx5_pci_slot_reset, + .resume = mlx5_pci_resume +}; + +static void shutdown(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + mlx5_core_info(dev, "Shutdown was called\n"); + + if (pdev->is_virtfn) + return; + + set_bit(MLX5_INTERFACE_STATE_TEARDOWN, &dev->intf_state); + err = mlx5_try_fast_unload(dev); + if (err) { + mlx5_unload_one(dev); + } else { + /* Some platforms require freeing the IRQs in the shutdown + * flow. If they aren't freed, they can't be allocated after + * kexec. There is no need to clean up the mlx5_core software + * contexts. 
+ */ + mlx5_core_eq_free_irqs(dev); + } + + mlx5_pci_disable_device(dev); +} + +static int mlx5_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + + mlx5_unload_one(dev); + + return 0; +} + +static int mlx5_resume(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + + return mlx5_load_one(dev, false); +} + +static const struct pci_device_id mlx5_core_pci_table[] = { + { PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTX4) }, + { PCI_VDEVICE(MELLANOX, 0x1014), MLX5_PCI_DEV_IS_VF}, /* ConnectX-4 VF */ + { PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTX4_LX) }, + { PCI_VDEVICE(MELLANOX, 0x1016), MLX5_PCI_DEV_IS_VF}, /* ConnectX-4LX VF */ + { PCI_VDEVICE(MELLANOX, 0x1017) }, /* ConnectX-5, PCIe 3.0 */ + { PCI_VDEVICE(MELLANOX, 0x1018), MLX5_PCI_DEV_IS_VF}, /* ConnectX-5 VF */ + { PCI_VDEVICE(MELLANOX, 0x1019) }, /* ConnectX-5 Ex */ + { PCI_VDEVICE(MELLANOX, 0x101a), MLX5_PCI_DEV_IS_VF}, /* ConnectX-5 Ex VF */ + { PCI_VDEVICE(MELLANOX, 0x101b) }, /* ConnectX-6 */ + { PCI_VDEVICE(MELLANOX, 0x101c), MLX5_PCI_DEV_IS_VF}, /* ConnectX-6 VF */ + { PCI_VDEVICE(MELLANOX, 0x101d) }, /* ConnectX-6 Dx */ + { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF}, /* ConnectX Family mlx5Gen Virtual Function */ + { PCI_VDEVICE(MELLANOX, 0x101f) }, /* ConnectX-6 LX */ + { PCI_VDEVICE(MELLANOX, 0x1021) }, /* ConnectX-7 */ + { PCI_VDEVICE(MELLANOX, 0x1023) }, /* ConnectX-8 */ + { PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */ + { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */ + { PCI_VDEVICE(MELLANOX, 0xa2d6) }, /* BlueField-2 integrated ConnectX-6 Dx network controller */ + { PCI_VDEVICE(MELLANOX, 0xa2dc) }, /* BlueField-3 integrated ConnectX-7 network controller */ + { PCI_VDEVICE(MELLANOX, 0xa2df) }, /* BlueField-4 integrated ConnectX-8 network controller */ + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); + +void mlx5_disable_device(struct mlx5_core_dev *dev) +{ + mlx5_error_sw_reset(dev); + mlx5_unload_one(dev); +} + +int mlx5_recover_device(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_sf(dev)) { + mlx5_pci_disable_device(dev); + if (mlx5_pci_slot_reset(dev->pdev) != PCI_ERS_RESULT_RECOVERED) + return -EIO; + } + + return mlx5_load_one(dev, true); +} + +static struct pci_driver mlx5_core_driver = { + .name = KBUILD_MODNAME, + .id_table = mlx5_core_pci_table, +#ifdef CONFIG_PM + .driver = { + .pm = &mlnx_pm, + }, +#endif /* CONFIG_PM */ + .probe = probe_one, + .remove = remove_one, + .suspend = mlx5_suspend, + .resume = mlx5_resume, + .shutdown = shutdown, + .err_handler = &mlx5_err_handler, + .sriov_configure = mlx5_core_sriov_configure, + .sriov_get_vf_total_msix = mlx5_sriov_get_vf_total_msix, + .sriov_set_msix_vec_count = mlx5_core_sriov_set_msix_vec_count, +}; + +static void mlx5_core_verify_params(void) +{ + if (prof_sel >= ARRAY_SIZE(profile)) { + pr_warn("mlx5_core: WARNING: Invalid module parameter prof_sel %d, valid range 0-%zu, changing back to default(%d)\n", + prof_sel, + ARRAY_SIZE(profile) - 1, + MLX5_DEFAULT_PROF); + prof_sel = MLX5_DEFAULT_PROF; + } +} + +static int mlx5_create_core_dir(void) +{ + if (!mlx5_core_proc_dir) { + mlx5_core_proc_dir = proc_mkdir(MLX5_CORE_PROC, NULL); + if (!mlx5_core_proc_dir) + return -1; + } + + mlx5_crdump_dir = proc_mkdir(MLX5_CORE_PROC_CRDUMP, mlx5_core_proc_dir); + if (!mlx5_crdump_dir) { + remove_proc_entry(MLX5_CORE_PROC, NULL); + return 
-1; + } + + return 0; +} + +static void mlx5_remove_core_dir(void) +{ + if (mlx5_core_proc_dir) { + if (mlx5_crdump_dir) + remove_proc_entry(MLX5_CORE_PROC_CRDUMP, mlx5_core_proc_dir); + remove_proc_entry(MLX5_CORE_PROC, NULL); + } +} + +static int __init init(void) +{ + int err; + + WARN_ONCE(strcmp(MLX5_ADEV_NAME, KBUILD_MODNAME), + "mlx5_core name not in sync with kernel module name"); + + get_random_bytes(&sw_owner_id, sizeof(sw_owner_id)); + + mlx5_core_verify_params(); + mlx5_fpga_ipsec_build_fs_cmds(); + mlx5_register_debugfs(); + + err = mlx5_create_core_dir(); + if (err) + goto err_debug; + + err = pci_register_driver(&mlx5_core_driver); + if (err) + goto err_core_dir; + + err = mlx5_sf_driver_register(); + if (err) + goto err_sf; + + err = mlx5e_init(); + if (err) + goto err_en; + + return 0; + +err_en: + mlx5_sf_driver_unregister(); +err_sf: + pci_unregister_driver(&mlx5_core_driver); +err_core_dir: + mlx5_remove_core_dir(); +err_debug: + mlx5_unregister_debugfs(); + return err; +} + +static void __exit cleanup(void) +{ + mlx5e_cleanup(); + mlx5_sf_driver_unregister(); + pci_unregister_driver(&mlx5_core_driver); + mlx5_remove_core_dir(); + mlx5_unregister_debugfs(); +} + +module_init(init); +module_exit(cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mcg.c new file mode 100644 index 0000000..e019d68 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mcg.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) +{ + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {}; + void *gid; + + MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); + MLX5_SET(attach_to_mcg_in, in, qpn, qpn); + gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec_in(dev, attach_to_mcg, in); +} +EXPORT_SYMBOL(mlx5_core_attach_mcg); + +int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) +{ + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {}; + void *gid; + + MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + MLX5_SET(detach_from_mcg_in, in, qpn, qpn); + gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec_in(dev, detach_from_mcg, in); +} +EXPORT_SYMBOL(mlx5_core_detach_mcg); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h new file mode 100644 index 0000000..d97d6e7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_CORE_H__ +#define __MLX5_CORE_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_VERSION "5.8-1.1.2" + +extern uint mlx5_core_debug_mask; + +#define mlx5_core_dbg(__dev, format, ...) \ + dev_dbg((__dev)->device, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_dbg_once(__dev, format, ...) \ + dev_dbg_once((__dev)->device, \ + "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_dbg_mask(__dev, mask, format, ...) \ +do { \ + if ((mask) & mlx5_core_debug_mask) \ + mlx5_core_dbg(__dev, format, ##__VA_ARGS__); \ +} while (0) + +#define mlx5_core_err(__dev, format, ...) 
\ + dev_err((__dev)->device, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_err_rl(__dev, format, ...) \ + dev_err_ratelimited((__dev)->device, \ + "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_warn(__dev, format, ...) \ + dev_warn((__dev)->device, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_warn_once(__dev, format, ...) \ + dev_warn_once((__dev)->device, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_warn_rl(__dev, format, ...) \ + dev_warn_ratelimited((__dev)->device, \ + "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +#define mlx5_core_info(__dev, format, ...) \ + dev_info((__dev)->device, format, ##__VA_ARGS__) + +#define mlx5_core_info_rl(__dev, format, ...) \ + dev_info_ratelimited((__dev)->device, \ + "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +static inline void mlx5_printk(struct mlx5_core_dev *dev, int level, const char *format, ...) +{ + struct device *device = dev->device; + struct va_format vaf; + va_list args; + + if (WARN_ONCE(level < LOGLEVEL_EMERG || level > LOGLEVEL_DEBUG, + "Level %d is out of range, set to default level\n", level)) + level = LOGLEVEL_DEFAULT; + + va_start(args, format); + vaf.fmt = format; + vaf.va = &args; + + dev_printk_emit(level, device, "%s %s: %pV", dev_driver_string(device), dev_name(device), + &vaf); + va_end(args); +} + +#define mlx5_log(__dev, level, format, ...) \ + mlx5_printk(__dev, level, "%s:%d:(pid %d): " format, \ + __func__, __LINE__, current->pid, \ + ##__VA_ARGS__) + +static inline struct device *mlx5_core_dma_dev(struct mlx5_core_dev *dev) +{ + return &dev->pdev->dev; +} + +enum { + MLX5_CMD_DATA, /* print command payload only */ + MLX5_CMD_TIME, /* print command execution time */ +}; + +enum { + MLX5_DRIVER_STATUS_ABORTED = 0xfe, + MLX5_DRIVER_SYND = 0xbadd00de, +}; + +enum mlx5_semaphore_space_address { + MLX5_SEMAPHORE_SPACE_DOMAIN = 0xA, + MLX5_SEMAPHORE_SW_RESET = 0x20, +}; + +enum mlx5_pddr_page_select { + MLX5_PDDR_OPERATIONAL_INFO_PAGE = 0x0, + MLX5_PDDR_TROUBLESHOOTING_INFO_PAGE = 0x1, + MLX5_PDDR_MODULE_INFO_PAGE = 0x3, +}; + +enum mlx5_pddr_monitor_opcodes { + MLX5_LINK_NO_ISSUE_OBSERVED = 0x0, + MLX5_LINK_PORT_CLOSED = 0x1, + MLX5_LINK_AN_FAILURE = 0x2, + MLX5_LINK_TRAINING_FAILURE = 0x5, + MLX5_LINK_LOGICAL_MISMATCH = 0x9, + MLX5_LINK_REMOTE_FAULT_INDICATION = 0xe, + MLX5_LINK_BAD_SIGNAL_INTEGRITY = 0xf, + MLX5_LINK_CABLE_COMPLIANCE_CODE_MISMATCH = 0x10, + MLX5_LINK_INTERNAL_ERR = 0x17, + MLX5_LINK_INFO_NOT_AVAIL = 0x3ff, + MLX5_LINK_CABLE_UNPLUGGED = 0x400, + MLX5_LINK_LONG_RANGE_FOR_NON_MLX_CABLE = 0x401, + MLX5_LINK_BUS_STUCK = 0x402, + MLX5_LINK_UNSUPP_EEPROM = 0x403, + MLX5_LINK_PART_NUM_LIST = 0x404, + MLX5_LINK_UNSUPP_CABLE = 0x405, + MLX5_LINK_MODULE_TEMP_SHUTDOWN = 0x406, + MLX5_LINK_SHORTED_CABLE = 0x407, + MLX5_LINK_POWER_BUDGET_EXCEEDED = 0x408, + MLX5_LINK_MNG_FORCED_DOWN = 0x409, +}; + +enum mlx5_icmd_conf_address { + MLX5_ICMD_CTRL = 0x0, /* RW */ + MLX5_ICMD_MAILBOX_SZ = 0x1000, /* RO */ + MLX5_ICMD_SYNDROME = 0x1008, /* RO */ + MLX5_ICMD_MAILBOX = 0x100000, /* RW */ +}; + +enum mlx5_icmd_ctrl_opcode { + MLX5_ICMD_ACCESS_REG = 0x9001, +}; + +enum mlx5_icmd_access_reg_id { + MLX5_ICMD_MCION = 0x9052, +}; + +enum mlx5_icmd_access_reg_method { + MLX5_ICMD_QUERY = 0x1, + MLX5_ICMD_WRITE = 0x2, 
+}; + +enum { + MLX5_ICMD_ACCESS_REG_DATA_DW_SZ = 0x2, +}; + +struct mlx5_icmd_ctrl_bits { + u16 opcode; + u8 status; + u8 busy; +} __packed; + +struct mlx5_icmd_access_reg_input_bits { + u16 constant_1_2; + u8 reserved_0[0x2]; + u16 register_id; + u8 method; + u8 constant_3; + u8 reserved_1[0x8]; + u16 len; + u8 reserved_2[0x2]; + u32 reg_data[MLX5_ICMD_ACCESS_REG_DATA_DW_SZ]; +} __packed; + +struct mlx5_icmd_access_reg_output_bits { + u8 reserved_0[0x2]; + u8 status; + u8 reserved_1[0x1]; + u16 register_id; + u8 reserved_2[0xA]; + u16 len; + u8 reserved_3[0x2]; + u32 reg_data[MLX5_ICMD_ACCESS_REG_DATA_DW_SZ]; +} __packed; + +struct mlx5_mcion_reg { + u8 reserved_0[0x1]; + u8 module; + u8 reserved_1[0x5]; + u8 module_status; +} __packed; + +#define MLX5_DEFAULT_PROF 2 + +int mlx5_query_hca_caps(struct mlx5_core_dev *dev); +int mlx5_query_board_id(struct mlx5_core_dev *dev); +int mlx5_cmd_init(struct mlx5_core_dev *dev); +void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); +void mlx5_cmd_set_state(struct mlx5_core_dev *dev, + enum mlx5_cmdif_state cmdif_state); +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id); +int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev); +int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev); +int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev); +void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force); +bool mlx5_sensor_pci_not_working(struct mlx5_core_dev *dev); +void mlx5_error_sw_reset(struct mlx5_core_dev *dev); +u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev); +int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev); +void mlx5_disable_device(struct mlx5_core_dev *dev); +int mlx5_recover_device(struct mlx5_core_dev *dev); +void mlx5_rename_comp_eq(struct mlx5_core_dev *dev, unsigned int eq_ix, + char *name); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev); +int mlx5_sriov_attach(struct mlx5_core_dev *dev); +void mlx5_sriov_detach(struct mlx5_core_dev *dev); +int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs); +int mlx5_core_sriov_set_msix_vec_count(struct pci_dev *vf, int msix_vec_count); +int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id); +int mlx5_sriov_sysfs_init(struct mlx5_core_dev *dev); +void mlx5_sriov_sysfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_create_vfs_sysfs(struct mlx5_core_dev *dev, int num_vfs); +void mlx5_destroy_vfs_sysfs(struct mlx5_core_dev *dev, int num_vfs); +int mlx5_create_vf_group_sysfs(struct mlx5_core_dev *dev, + u32 group_id, struct kobject *group_kobj); +void mlx5_destroy_vf_group_sysfs(struct mlx5_core_dev *dev, + struct kobject *group_kobj); +int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id); +int mlx5_core_enable_sf_hca(struct mlx5_core_dev *dev, u16 sf_func_id); +int mlx5_core_disable_sf_hca(struct mlx5_core_dev *dev, u16 sf_func_id); +int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *context, u32 *element_id); +int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *context, u32 element_id, + u32 modify_bitmask); +int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + u32 element_id); +int mlx5_wait_for_pages(struct mlx5_core_dev *dev, int *pages); + +void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev); +void mlx5_cmd_flush(struct mlx5_core_dev *dev); +void mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev 
*dev); + +int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, + u8 access_reg_group); +int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcap, u8 feature_group, + u8 access_reg_group); +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8 access_reg_group); +int mlx5_query_pddr_troubleshooting_info(struct mlx5_core_dev *mdev, + u16 *monitor_opcode, + u8 *status_message); + +void mlx5_lag_add_netdev(struct mlx5_core_dev *dev, struct net_device *netdev); +void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, struct net_device *netdev); +void mlx5_lag_add_mdev(struct mlx5_core_dev *dev); +void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev); +void mlx5_lag_disable_change(struct mlx5_core_dev *dev); +void mlx5_lag_enable_change(struct mlx5_core_dev *dev); + +int mlx5_events_init(struct mlx5_core_dev *dev); +void mlx5_events_cleanup(struct mlx5_core_dev *dev); +void mlx5_events_start(struct mlx5_core_dev *dev); +void mlx5_events_stop(struct mlx5_core_dev *dev); + +int mlx5_adev_idx_alloc(void); +void mlx5_adev_idx_free(int idx); +void mlx5_adev_cleanup(struct mlx5_core_dev *dev); +int mlx5_adev_init(struct mlx5_core_dev *dev); + +int mlx5_attach_device(struct mlx5_core_dev *dev); +void mlx5_attach_device_by_protocol(struct mlx5_core_dev *dev, int protocol); +void mlx5_detach_device(struct mlx5_core_dev *dev); +int mlx5_register_device(struct mlx5_core_dev *dev); +void mlx5_unregister_device(struct mlx5_core_dev *dev); +struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev); +struct mlx5_core_dev *mlx5_get_next_phys_dev_lag(struct mlx5_core_dev *dev); +void mlx5_dev_list_lock(void); +void mlx5_dev_list_unlock(void); +int mlx5_dev_list_trylock(void); + +int mlx5_eswitch_offloads_config_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw); +void mlx5_eswitch_offloads_destroy_single_fdb(struct mlx5_eswitch *master_esw, + struct mlx5_eswitch *slave_esw); +int mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw); + +int mlx5_query_mtpps(struct mlx5_core_dev *dev, u32 *mtpps, u32 mtpps_size); +int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size); +int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode); +int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode); + +struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev); +void mlx5_dm_cleanup(struct mlx5_core_dev *dev); + +#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \ + MLX5_CAP_GEN((mdev), pps_modify) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) && \ + MLX5_CAP_MCAM_FEATURE((mdev), mtpps_enh_out_per_adj)) + +int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw, + struct netlink_ext_ack *extack); +int mlx5_fw_version_query(struct mlx5_core_dev *dev, + u32 *running_ver, u32 *stored_ver); + +enum { + UNLOCK, + LOCK, + CAP_ID = 0x9, +}; + +int mlx5_pciconf_cap9_sem(struct mlx5_core_dev *dev, int state); +int mlx5_pciconf_set_addr_space(struct mlx5_core_dev *dev, u16 space); +int mlx5_pciconf_set_protected_addr_space(struct mlx5_core_dev *dev, + u32 *ret_space_size); +int mlx5_block_op_pciconf(struct mlx5_core_dev *dev, + unsigned int offset, u32 *data, + int length); +int mlx5_block_op_pciconf_fast(struct mlx5_core_dev *dev, + u32 *data, + int length); +int mlx5_mst_dump_init(struct mlx5_core_dev *dev); +int mlx5_mst_capture(struct mlx5_core_dev *dev); +u32 mlx5_mst_dump(struct mlx5_core_dev *dev, void *buff, u32 buff_sz); +void 
mlx5_mst_free_capture(struct mlx5_core_dev *dev); +void mlx5_mst_dump_cleanup(struct mlx5_core_dev *dev); + +int mlx5_icmd_access_register(struct mlx5_core_dev *dev, + int reg_id, + int method, + void *io_buff, + u32 io_buff_dw_sz); + +#ifdef CONFIG_MLX5_CORE_EN +int mlx5e_init(void); +void mlx5e_cleanup(void); +#else +static inline int mlx5e_init(void){ return 0; } +static inline void mlx5e_cleanup(void){} +#endif + +int mlx5_modify_other_hca_cap_roce(struct mlx5_core_dev *mdev, + u16 function_id, bool value); +int mlx5_get_other_hca_cap_roce(struct mlx5_core_dev *mdev, + u16 function_id, bool *value); + +static inline bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) +{ + return pci_num_vf(dev->pdev) ? true : false; +} + +/* crdump */ +struct mlx5_fw_crdump { + u32 crspace_size; + /* sync reading/freeing the data */ + struct mutex crspace_mutex; + u32 vsec_addr; + u8 *crspace; + u16 space; +}; + +int mlx5_cr_protected_capture(struct mlx5_core_dev *dev); + +#define MLX5_CORE_PROC "driver/mlx5_core" +#define MLX5_CORE_PROC_CRDUMP "crdump" +extern struct proc_dir_entry *mlx5_crdump_dir; +int mlx5_crdump_init(struct mlx5_core_dev *dev); +void mlx5_crdump_cleanup(struct mlx5_core_dev *dev); +int mlx5_fill_cr_dump(struct mlx5_core_dev *dev); + +static inline int mlx5_lag_is_lacp_owner(struct mlx5_core_dev *dev) +{ + /* LACP owner conditions: + * 1) Function is physical. + * 2) LAG is supported by FW. + * 3) LAG is managed by driver (currently the only option). + */ + return MLX5_CAP_GEN(dev, vport_group_manager) && + (MLX5_CAP_GEN(dev, num_lag_ports) > 1) && + MLX5_CAP_GEN(dev, lag_master); +} + +int mlx5_rescan_drivers_locked(struct mlx5_core_dev *dev); +static inline int mlx5_rescan_drivers(struct mlx5_core_dev *dev) +{ + int ret; + + mlx5_dev_list_lock(); + ret = mlx5_rescan_drivers_locked(dev); + mlx5_dev_list_unlock(); + return ret; +} + +void mlx5_lag_update(struct mlx5_core_dev *dev); + +enum { + MLX5_NIC_IFC_FULL = 0, + MLX5_NIC_IFC_DISABLED = 1, + MLX5_NIC_IFC_NO_DRAM_NIC = 2, + MLX5_NIC_IFC_SW_RESET = 7 +}; + +u8 mlx5_get_nic_state(struct mlx5_core_dev *dev); +void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state); + +int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx); +void mlx5_mdev_uninit(struct mlx5_core_dev *dev); + +static inline bool mlx5_core_is_sf(const struct mlx5_core_dev *dev) +{ + return dev->coredev_type == MLX5_COREDEV_SF; +} + +int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx); +void mlx5_mdev_uninit(struct mlx5_core_dev *dev); +int mlx5_init_one(struct mlx5_core_dev *dev); +void mlx5_uninit_one(struct mlx5_core_dev *dev); +void mlx5_pcie_print_link_status(struct mlx5_core_dev *dev); +void mlx5_unload_one(struct mlx5_core_dev *dev); +int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery); + +int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out); + +void mlx5_events_work_enqueue(struct mlx5_core_dev *dev, struct work_struct *work); +static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + + return MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix); +} + +bool mlx5_eth_supported(struct mlx5_core_dev *dev); +bool mlx5_rdma_supported(struct mlx5_core_dev *dev); +bool mlx5_vnet_supported(struct mlx5_core_dev *dev); +bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev); + +void mlx5_core_affinity_get(struct mlx5_core_dev *dev, struct cpumask *dev_mask); + +#endif /* __MLX5_CORE_H__ */ diff --git 
a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.c new file mode 100644 index 0000000..6df0754 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2021 Mellanox Technologies Ltd. */ + +#include +#include "mlx5_core.h" +#include "fs_core.h" +#include "eswitch.h" +#include "sf/dev/dev.h" +#include "sf/sf.h" +#include "sf/mlx5_ifc_vhca_event.h" +#include +#include "mlx5_devm.h" +#include "mlx5_esw_devm.h" +#include "mlx5_irq.h" +#include "esw/qos.h" + +static LIST_HEAD(dev_head); +/* The mutex below protects the dev_head list */ +static DEFINE_MUTEX(mlx5_mlxdevm_mutex); + +/** + * Functions to translate between mlxdevm function states and devlink fn states, + * for use by shim layer + */ +static enum devlink_port_fn_state mlxdevm_to_devlink_state(enum mlxdevm_port_fn_state state) +{ + switch (state) { + case MLXDEVM_PORT_FN_STATE_ACTIVE: + return DEVLINK_PORT_FN_STATE_ACTIVE; + case MLXDEVM_PORT_FN_STATE_INACTIVE: + default: + return DEVLINK_PORT_FN_STATE_INACTIVE; + } +} + +static enum mlxdevm_port_fn_opstate devlink_to_mlxdevm_opstate(enum devlink_port_fn_opstate state) +{ + switch (state) { + case DEVLINK_PORT_FN_OPSTATE_ATTACHED: + return MLXDEVM_PORT_FN_OPSTATE_ATTACHED; + case DEVLINK_PORT_FN_OPSTATE_DETACHED: + default: + return MLXDEVM_PORT_FN_OPSTATE_DETACHED; + } +} + +static enum mlxdevm_port_fn_state devlink_to_mlxdevm_state(enum devlink_port_fn_state state) +{ + switch (state) { + case DEVLINK_PORT_FN_STATE_ACTIVE: + return MLXDEVM_PORT_FN_STATE_ACTIVE; + case DEVLINK_PORT_FN_STATE_INACTIVE: + default: + return MLXDEVM_PORT_FN_STATE_INACTIVE; + } +} + +struct mlx5_devm_device *mlx5_devm_device_get(struct mlx5_core_dev *dev) +{ + struct mlx5_devm_device *mdevm; + + /* find the mlxdevm device associated with this core dev */ + mutex_lock(&mlx5_mlxdevm_mutex); + list_for_each_entry(mdevm, &dev_head, list) { + if (mdevm->dev == dev) { + mutex_unlock(&mlx5_mlxdevm_mutex); + return mdevm; + } + } + mutex_unlock(&mlx5_mlxdevm_mutex); + return NULL; +} + +static enum devlink_port_flavour devm2devlink_flavour(enum mlxdevm_port_flavour devm_flv) +{ + /* return a real port flavour only if pci_sf */ + switch (devm_flv) { + case MLXDEVM_PORT_FLAVOUR_PCI_SF: + return DEVLINK_PORT_FLAVOUR_PCI_SF; + default: + return DEVLINK_PORT_FLAVOUR_PHYSICAL; + } + return DEVLINK_PORT_FLAVOUR_PHYSICAL; +} + +static void dm_new_attrs2devl_new_attrs(const struct mlxdevm_port_new_attrs *new_devm, + struct devlink_port_new_attrs *new_devlink) +{ + memset(new_devlink, 0, sizeof(*new_devlink)); + new_devlink->flavour = devm2devlink_flavour(new_devm->flavour); + new_devlink->port_index = new_devm->port_index; + new_devlink->controller = new_devm->controller; + new_devlink->sfnum = new_devm->sfnum; + new_devlink->pfnum = new_devm->pfnum; + new_devlink->port_index_valid = new_devm->port_index_valid; + new_devlink->controller_valid = new_devm->controller_valid; + new_devlink->sfnum_valid = new_devm->sfnum_valid; +} + +static struct devlink *mlxdevm_to_devlink(struct mlxdevm *devm) +{ + return priv_to_devlink(container_of(devm, struct mlx5_devm_device, device)->dev); +} + +int mlx5_devm_sf_port_new(struct mlxdevm *devm_dev, + const struct mlxdevm_port_new_attrs *attrs, + struct netlink_ext_ack *extack, + unsigned int *new_port_index) +{ + struct devlink_port_new_attrs devl_attrs; + 
struct devlink *devlink; + + devlink = mlxdevm_to_devlink(devm_dev); + dm_new_attrs2devl_new_attrs(attrs, &devl_attrs); + return mlx5_devlink_sf_port_new(devlink, &devl_attrs, extack, new_port_index); +} + +int mlx5_devm_sf_port_del(struct mlxdevm *devm_dev, + unsigned int port_index, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(devm_dev); + return mlx5_devlink_sf_port_del(devlink, port_index, extack); +} + +int mlx5_devm_sf_port_fn_state_get(struct mlxdevm_port *port, + enum mlxdevm_port_fn_state *state, + enum mlxdevm_port_fn_opstate *opstate, + struct netlink_ext_ack *extack) +{ + enum devlink_port_fn_opstate dl_opstate; + enum devlink_port_fn_state dl_state; + struct devlink_port devport; + struct devlink *devlink; + int ret; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.devlink = devlink; + devport.index = port->index; + + ret = mlx5_devlink_sf_port_fn_state_get(&devport, &dl_state, &dl_opstate, extack); + if (!ret) { + *state = devlink_to_mlxdevm_state(dl_state); + *opstate = devlink_to_mlxdevm_opstate(dl_opstate); + } + return ret; +} + +int mlx5_devm_sf_port_fn_state_set(struct mlxdevm_port *port, + enum mlxdevm_port_fn_state state, + struct netlink_ext_ack *extack) +{ + enum devlink_port_fn_state dl_state; + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.devlink = devlink; + devport.index = port->index; + dl_state = mlxdevm_to_devlink_state(state); + return mlx5_devlink_sf_port_fn_state_set(&devport, dl_state, extack); +} + +int mlx5_devm_sf_port_fn_hw_addr_get(struct mlxdevm_port *port, + u8 *hw_addr, int *hw_addr_len, + struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.devlink = devlink; + devport.index = port->index; + + return mlx5_devlink_port_function_hw_addr_get(&devport, hw_addr, + hw_addr_len, extack); +} + +int mlx5_devm_sf_port_function_trust_get(struct mlxdevm_port *port, + bool *trusted, + struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.index = port->index; + return mlx5_devlink_port_function_trust_get(devlink, &devport, + trusted, extack); +} + +int mlx5_devm_sf_port_fn_hw_addr_set(struct mlxdevm_port *port, + const u8 *hw_addr, int hw_addr_len, + struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.devlink = devlink; + devport.index = port->index; + return mlx5_devlink_port_function_hw_addr_set(&devport, hw_addr, + hw_addr_len, extack); +} + +int mlx5_devm_sf_port_function_trust_set(struct mlxdevm_port *port, + bool trusted, + struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.index = port->index; + return mlx5_devlink_port_function_trust_set(devlink, &devport, + trusted, extack); +} + +static +struct mlx5_core_dev *mlx5_devm_core_dev_get(struct mlxdevm *devm_dev) +{ + struct mlx5_devm_device *mlx5_devm; + + mlx5_devm = container_of(devm_dev, struct mlx5_devm_device, device); + return mlx5_devm->dev; +} + +int 
mlx5_devm_sf_port_fn_cap_get(struct mlxdevm_port *port, + struct mlxdevm_port_fn_cap *cap, + struct netlink_ext_ack *extack) +{ + int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + struct mlx5_core_dev *parent_dev; + struct mlx5_devm_port *mlx5_port; + struct devlink *devlink; + unsigned int port_index; + void *query_ctx; + void *hca_caps; + u16 hw_fn_id; + int ret; + + query_ctx = kzalloc(query_out_sz, GFP_KERNEL); + if (!query_ctx) + return -ENOMEM; + + parent_dev = mlx5_devm_core_dev_get(port->devm); + + mlx5_port = container_of(port, struct mlx5_devm_port, port); + port_index = mlx5_port->port_index; + + devlink = mlxdevm_to_devlink(port->devm); + ret = mlx5_sf_index_to_hw_id(devlink, &hw_fn_id, port_index, extack); + if (ret) + goto out_free; + + ret = mlx5_core_other_function_get_caps(parent_dev, hw_fn_id, query_ctx); + if (ret) + goto out_free; + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); + if (MLX5_GET(cmd_hca_cap, hca_caps, roce)) + cap->roce = MLXDEVM_PORT_FN_CAP_ROCE_ENABLE; + else + cap->roce = MLXDEVM_PORT_FN_CAP_ROCE_DISABLE; + cap->roce_cap_valid = true; + + cap->max_uc_list = 1 << MLX5_GET(cmd_hca_cap, hca_caps, log_max_current_uc_list); + cap->uc_list_cap_valid = true; + +out_free: + kfree(query_ctx); + return ret; +} + +int mlx5_devm_sf_port_fn_cap_set(struct mlxdevm_port *port, + const struct mlxdevm_port_fn_cap *cap, + struct netlink_ext_ack *extack) +{ + int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + struct mlx5_core_dev *parent_dev; + struct mlx5_devm_port *mlx5_port; + struct devlink *devlink; + unsigned int port_index; + u8 cap_ilog2_val; + void *query_ctx; + void *hca_caps; + u16 hw_fn_id; + int ret; + + query_ctx = kzalloc(query_out_sz, GFP_KERNEL); + if (!query_ctx) + return -ENOMEM; + + parent_dev = mlx5_devm_core_dev_get(port->devm); + + mlx5_port = container_of(port, struct mlx5_devm_port, port); + port_index = mlx5_port->port_index; + + devlink = mlxdevm_to_devlink(port->devm); + ret = mlx5_sf_index_to_hw_id(devlink, &hw_fn_id, port_index, extack); + if (ret) + goto out_free; + + ret = mlx5_core_other_function_get_caps(parent_dev, hw_fn_id, query_ctx); + if (ret) + goto out_free; + + hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); + if (cap->roce_cap_valid) { + if (cap->roce == MLXDEVM_PORT_FN_CAP_ROCE_ENABLE) + MLX5_SET(cmd_hca_cap, hca_caps, roce, true); + else + MLX5_SET(cmd_hca_cap, hca_caps, roce, false); + } + if (cap->uc_list_cap_valid) { + /* At least one unicast mac is needed */ + if (cap->max_uc_list == 0) { + NL_SET_ERR_MSG_MOD(extack, "max_uc_macs value can not be 0."); + ret = -EOPNOTSUPP; + goto out_free; + } + /* Check if its power of 2 or not */ + if (cap->max_uc_list & (cap->max_uc_list - 1)) { + NL_SET_ERR_MSG_MOD(extack, + "Only power of 2 values are supported for max_uc_macs."); + ret = -EOPNOTSUPP; + goto out_free; + } + cap_ilog2_val = ilog2(cap->max_uc_list); + /* PRM has only 5 bits for it */ + if (cap_ilog2_val > 31) { + NL_SET_ERR_MSG_MOD(extack, "max_uc_macs value is too large."); + ret = -EOPNOTSUPP; + goto out_free; + } + MLX5_SET(cmd_hca_cap, hca_caps, log_max_current_uc_list, cap_ilog2_val); + } + ret = mlx5_core_other_function_set_caps(parent_dev, hca_caps, hw_fn_id); + +out_free: + kfree(query_ctx); + return ret; +} + +static struct mlx5_esw_rate_group * +esw_qos_find_devm_group(struct mlx5_eswitch *esw, const char *group) +{ + struct mlx5_esw_rate_group *tmp; + + if (!refcount_read(&esw->qos.refcnt)) + return NULL; + + list_for_each_entry(tmp, &esw->qos.groups, 
list) { + if (tmp->devm.name && !strcmp(tmp->devm.name, group)) + return tmp; + } + + return NULL; +} + +static +int mlx5_devlink_rate_leaf_get(struct devlink *devlink, + struct devlink_port *port, + u64 *tx_max, u64 *tx_share, char **group, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw; + struct mlx5_vport *vport; + int err; + + err = mlx5_esw_get_esw_and_vport(devlink, port, &esw, &vport, extack); + if (err) + return err; + + mutex_lock(&esw->state_lock); + if (!vport->enabled) { + NL_SET_ERR_MSG_MOD(extack, "Eswitch vport is disabled"); + err = -EOPNOTSUPP; + goto out; + } + + *tx_max = vport->qos.max_rate; + *tx_share = vport->qos.min_rate; + if (vport->qos.group) + *group = vport->qos.group->devm.name; + +out: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_devm_rate_leaf_get(struct mlxdevm_port *port, + u64 *tx_max, u64 *tx_share, char **group, + struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.index = port->index; + + return mlx5_devlink_rate_leaf_get(devlink, &devport, + tx_max, tx_share, group, extack); +} + +static +int mlx5_devlink_rate_leaf_tx_max_set(struct devlink *devlink, + struct devlink_port *port, + u64 tx_max, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw; + struct mlx5_vport *vport; + int err; + + err = mlx5_esw_get_esw_and_vport(devlink, port, &esw, &vport, extack); + if (err) + return err; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + mutex_lock(&esw->state_lock); + err = esw_qos_vport_enable(esw, vport, 0, 0, extack); + if (err) + goto unlock; + + err = esw_qos_set_vport_max_rate(esw, vport, tx_max, extack); +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_devm_rate_leaf_tx_max_set(struct mlxdevm_port *port, + u64 tx_max, struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.index = port->index; + return mlx5_devlink_rate_leaf_tx_max_set(devlink, &devport, tx_max, extack); +} + +static +int mlx5_devlink_rate_leaf_tx_share_set(struct devlink *devlink, + struct devlink_port *port, + u64 tx_share, + struct netlink_ext_ack *extack) +{ + struct mlx5_vport *vport; + struct mlx5_eswitch *esw; + int err; + + err = mlx5_esw_get_esw_and_vport(devlink, port, &esw, &vport, extack); + if (err) + return err; + + if (!mlx5_esw_allowed(esw)) + return -EPERM; + + mutex_lock(&esw->state_lock); + err = esw_qos_vport_enable(esw, vport, 0, 0, extack); + if (err) + goto unlock; + + err = esw_qos_set_vport_min_rate(esw, vport, tx_share, extack); +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_devm_rate_leaf_tx_share_set(struct mlxdevm_port *port, + u64 tx_share, struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.index = port->index; + return mlx5_devlink_rate_leaf_tx_share_set(devlink, &devport, tx_share, extack); +} + +static +int mlx5_devlink_rate_leaf_group_set(struct devlink *devlink, + struct devlink_port *port, + const char *group_name, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *curr_group, *new_group; + struct mlx5_eswitch *esw; + struct mlx5_vport *vport; + int err; + + err = mlx5_esw_get_esw_and_vport(devlink, port, &esw, &vport, extack); + if 
(err) + return err; + + curr_group = vport->qos.group; + if (!strlen(group_name)) { + err = mlx5_esw_qos_vport_update_group(vport->dev->priv.eswitch, + vport, NULL, extack); + if (!err && curr_group && curr_group->devm.name) + curr_group->num_vports--; + return err; + } + + new_group = esw_qos_find_devm_group(esw, group_name); + if (!new_group) + return -EINVAL; + + err = mlx5_esw_qos_vport_update_group(vport->dev->priv.eswitch, vport, new_group, extack); + if (!err) { + new_group->num_vports++; + if (curr_group && curr_group->devm.name) + curr_group->num_vports--; + } + return err; +} + +int mlx5_devm_rate_leaf_group_set(struct mlxdevm_port *port, + const char *group, struct netlink_ext_ack *extack) +{ + struct devlink_port devport; + struct devlink *devlink; + + devlink = mlxdevm_to_devlink(port->devm); + memset(&devport, 0, sizeof(devport)); + devport.index = port->index; + return mlx5_devlink_rate_leaf_group_set(devlink, &devport, group, extack); +} + +int mlx5_devm_rate_node_tx_share_set(struct mlxdevm *devm_dev, const char *group_name, + u64 tx_share, struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group; + struct mlx5_core_dev *dev; + struct mlx5_eswitch *esw; + struct devlink *devlink; + int err; + + devlink = mlxdevm_to_devlink(devm_dev); + dev = devlink_priv(devlink); + esw = dev->priv.eswitch; + + mutex_lock(&esw->state_lock); + group = esw_qos_find_devm_group(esw, group_name); + if (!group) { + NL_SET_ERR_MSG_MOD(extack, "Can't find node"); + err = -ENODEV; + goto unlock; + } + err = esw_qos_set_group_min_rate(esw, group, tx_share, extack); + if (!err) + group->devm.tx_share = tx_share; +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_devm_rate_node_tx_max_set(struct mlxdevm *devm_dev, const char *group_name, + u64 tx_max, struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group; + struct mlx5_core_dev *dev; + struct mlx5_eswitch *esw; + struct devlink *devlink; + int err; + + devlink = mlxdevm_to_devlink(devm_dev); + dev = devlink_priv(devlink); + esw = dev->priv.eswitch; + + mutex_lock(&esw->state_lock); + group = esw_qos_find_devm_group(esw, group_name); + if (!group) { + NL_SET_ERR_MSG_MOD(extack, "Can't find node"); + err = -ENODEV; + goto unlock; + } + err = esw_qos_set_group_max_rate(esw, group, tx_max, extack); + if (!err) + group->devm.tx_max = tx_max; +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_devm_rate_node_new(struct mlxdevm *devm_dev, const char *group_name, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group; + struct devlink *devlink; + struct mlx5_eswitch *esw; + int err = 0; + + devlink = mlxdevm_to_devlink(devm_dev); + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + mutex_lock(&esw->state_lock); + if (esw->mode != MLX5_ESWITCH_OFFLOADS) { + NL_SET_ERR_MSG_MOD(extack, + "Rate node creation supported only in switchdev mode"); + err = -EOPNOTSUPP; + goto unlock; + } + + group = esw_qos_create_rate_group(esw, MLX5_ESW_QOS_NON_SYSFS_GROUP, extack); + if (IS_ERR(group)) { + err = PTR_ERR(group); + goto unlock; + } + + group->devm.name = kstrdup(group_name, GFP_KERNEL); + err = mlxdevm_rate_group_register(devm_dev, + &group->devm); + +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +int mlx5_devm_rate_node_del(struct mlxdevm *devm_dev, const char *group_name, + struct netlink_ext_ack *extack) +{ + struct mlx5_esw_rate_group *group; + struct mlx5_eswitch *esw; + struct devlink *devlink; + int err; + + devlink = 
mlxdevm_to_devlink(devm_dev); + + esw = mlx5_devlink_eswitch_get(devlink); + if (IS_ERR(esw)) + return PTR_ERR(esw); + + mutex_lock(&esw->state_lock); + + group = esw_qos_find_devm_group(esw, group_name); + if (!group) { + NL_SET_ERR_MSG_MOD(extack, "Can't find node"); + err = -ENODEV; + goto unlock; + } + if (group->num_vports) { + err = -EBUSY; + NL_SET_ERR_MSG_MOD(extack, "Node has children. Cannot delete node."); + goto unlock; + } + mlxdevm_rate_group_unregister(devm_dev, + &group->devm); + kfree(group->devm.name); + err = esw_qos_destroy_rate_group(esw, group, extack); +unlock: + mutex_unlock(&esw->state_lock); + return err; +} + +static const struct mlxdevm_ops mlx5_devm_ops = { +#ifdef CONFIG_MLX5_ESWITCH + .port_fn_hw_addr_set = mlx5_devm_sf_port_fn_hw_addr_set, + .port_fn_hw_addr_get = mlx5_devm_sf_port_fn_hw_addr_get, + .port_new = mlx5_devm_sf_port_new, + .port_del = mlx5_devm_sf_port_del, + .port_fn_state_get = mlx5_devm_sf_port_fn_state_get, + .port_fn_state_set = mlx5_devm_sf_port_fn_state_set, + .port_fn_cap_get = mlx5_devm_sf_port_fn_cap_get, + .port_fn_cap_set = mlx5_devm_sf_port_fn_cap_set, + .rate_leaf_tx_max_set = mlx5_devm_rate_leaf_tx_max_set, + .rate_leaf_tx_share_set = mlx5_devm_rate_leaf_tx_share_set, + .rate_leaf_group_set = mlx5_devm_rate_leaf_group_set, + .rate_leaf_get = mlx5_devm_rate_leaf_get, + .rate_node_tx_max_set = mlx5_devm_rate_node_tx_max_set, + .rate_node_tx_share_set = mlx5_devm_rate_node_tx_share_set, + .rate_node_new = mlx5_devm_rate_node_new, + .rate_node_del = mlx5_devm_rate_node_del, + .port_fn_trust_set = mlx5_devm_sf_port_function_trust_set, + .port_fn_trust_get = mlx5_devm_sf_port_function_trust_get, +#endif +}; + +static int mlx5_devm_cpu_affinity_validate(struct mlxdevm *devm, u32 id, + union mlxdevm_param_value val, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = mlx5_devm_core_dev_get(devm); + u16 *arr = val.vu16arr.data; + int max_eqs_sf; + int i; + + if (!mlx5_irq_table_have_dedicated_sfs_irqs(mlx5_irq_table_get(dev))) { + NL_SET_ERR_MSG_MOD(extack, "SF doesn't have dedicated IRQs"); + return -EOPNOTSUPP; + } + + for (i = 0; i < val.vu16arr.array_len; i++) { + if (arr[i] > nr_cpu_ids || arr[i] >= num_present_cpus()) { + NL_SET_ERR_MSG_MOD(extack, "Some CPUs aren't present"); + return -ERANGE; + } + if (!cpu_online(arr[i])) { + NL_SET_ERR_MSG_MOD(extack, "Some CPUs aren't online"); + return -EINVAL; + } + } + + max_eqs_sf = min_t(int, MLX5_COMP_EQS_PER_SF, + mlx5_irq_table_get_sfs_vec(mlx5_irq_table_get(dev))); + if (i > max_eqs_sf) { + NL_SET_ERR_MSG_MOD(extack, "SF doesn't have enough IRQs"); + return -EINVAL; + } + return 0; +} + +int mlx5_devm_affinity_get_param(struct mlx5_core_dev *dev, struct cpumask *mask) +{ + struct mlx5_devm_device *mdevn_dev = mlx5_devm_device_get(dev); + union mlxdevm_param_value val; + u16 *arr = val.vu16arr.data; + int err; + int i; + + err = mlxdevm_param_driverinit_value_get(&mdevn_dev->device, + MLX5_DEVM_PARAM_ID_CPU_AFFINITY, + &val); + if (err) + goto err; + for (i = 0; i < val.vu16arr.array_len; i++) + cpumask_set_cpu(arr[i], mask); + return 0; +err: + mlx5_core_dbg(dev, "mlxdevm can't get param cpu_affinity. 
use default policy\n"); + return err; +} + +static const struct mlxdevm_param mlx5_devm_params[] = { + MLXDEVM_PARAM_DRIVER(MLX5_DEVM_PARAM_ID_CPU_AFFINITY, "cpu_affinity", + MLXDEVM_PARAM_TYPE_ARRAY_U16, + BIT(MLXDEVM_PARAM_CMODE_DRIVERINIT), NULL, NULL, + mlx5_devm_cpu_affinity_validate), +}; + +static void mlx5_devm_set_params_init_values(struct mlxdevm *devm) +{ + struct mlx5_core_dev *dev = mlx5_devm_core_dev_get(devm); + union mlxdevm_param_value value; + u16 *arr = value.vu16arr.data; + cpumask_var_t dev_mask; + int i = 0; + int cpu; + + if (!zalloc_cpumask_var(&dev_mask, GFP_KERNEL)) + return; + + mlx5_core_affinity_get(dev, dev_mask); + + memset(value.vu16arr.data, 0, sizeof(value.vu16arr.data)); + for_each_cpu(cpu, dev_mask) { + arr[i] = cpu; + i++; + } + value.vu16arr.array_len = i; + mlxdevm_param_driverinit_value_set(devm, MLX5_DEVM_PARAM_ID_CPU_AFFINITY, value); + free_cpumask_var(dev_mask); +} + +void mlx5_devm_params_publish(struct mlx5_core_dev *dev) +{ + struct mlx5_devm_device *mdevm_dev = mlx5_devm_device_get(dev); + + if (!mlx5_core_is_sf(dev)) + return; + + mlx5_devm_set_params_init_values(&mdevm_dev->device); + mlxdevm_params_publish(&mdevm_dev->device); +} + +int mlx5_devm_register(struct mlx5_core_dev *dev) +{ + struct mlx5_devm_device *mdevm_dev; + int err; + + mdevm_dev = kzalloc(sizeof(*mdevm_dev), GFP_KERNEL); + if (!mdevm_dev) + return -ENOMEM; + + mdevm_dev->dev = dev; + mdevm_dev->device.ops = &mlx5_devm_ops; + mdevm_dev->device.device = dev->device; + INIT_LIST_HEAD(&mdevm_dev->port_list); + init_rwsem(&mdevm_dev->port_list_rwsem); + mutex_lock(&mlx5_mlxdevm_mutex); + list_add(&mdevm_dev->list, &dev_head); + mutex_unlock(&mlx5_mlxdevm_mutex); + err = mlxdevm_register(&mdevm_dev->device); + if (err) + goto reg_err; + + if (mlx5_core_is_sf(dev)) + err = mlxdevm_params_register(&mdevm_dev->device, mlx5_devm_params, + ARRAY_SIZE(mlx5_devm_params)); + if (err) + goto params_reg_err; + + return 0; + +params_reg_err: + mlxdevm_unregister(&mdevm_dev->device); +reg_err: + mutex_lock(&mlx5_mlxdevm_mutex); + list_del(&mdevm_dev->list); + mutex_unlock(&mlx5_mlxdevm_mutex); + kfree(mdevm_dev); + return err; +} + +void mlx5_devm_unregister(struct mlx5_core_dev *dev) +{ + struct mlx5_devm_device *mdevm; + + mdevm = mlx5_devm_device_get(dev); + if (!mdevm) + return; + + if (mlx5_core_is_sf(dev)) + mlxdevm_params_unregister(&mdevm->device, mlx5_devm_params, + ARRAY_SIZE(mlx5_devm_params)); + + mlxdevm_unregister(&mdevm->device); + + mutex_lock(&mlx5_mlxdevm_mutex); + list_del(&mdevm->list); + mutex_unlock(&mlx5_mlxdevm_mutex); + kfree(mdevm); +} + +void mlx5_devm_rate_nodes_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_devm_device *mdevm; + + mdevm = mlx5_devm_device_get(dev); + if (!mdevm) + return; + mlxdevm_rate_nodes_destroy(&mdevm->device); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.h new file mode 100644 index 0000000..aa287db --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_devm.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies Ltd. 
*/ + +#ifndef MLX5_DEVM_H +#define MLX5_DEVM_H + +#if IS_ENABLED(CONFIG_MLXDEVM) +#include +#include + +struct mlx5_devm_device { + struct mlxdevm device; + struct list_head port_list; + struct mlx5_core_dev *dev; + struct list_head list; + struct rw_semaphore port_list_rwsem; +}; + +enum mlx5_devm_param_id { + MLX5_DEVM_PARAM_ID_CPU_AFFINITY, +}; + +struct mlx5_devm_device *mlx5_devm_device_get(struct mlx5_core_dev *dev); +int mlx5_devm_register(struct mlx5_core_dev *dev); +void mlx5_devm_unregister(struct mlx5_core_dev *dev); +void mlx5_devm_rate_nodes_destroy(struct mlx5_core_dev *dev); +int mlx5_devm_affinity_get_param(struct mlx5_core_dev *dev, struct cpumask *mask); +void mlx5_devm_params_publish(struct mlx5_core_dev *dev); + +#else +static inline int mlx5_devm_register(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_devm_unregister(struct mlx5_core_dev *dev) +{ +} + +static inline int +mlx5_devm_affinity_get_param(struct mlx5_core_dev *dev, struct cpumask *mask) +{ + return 0; +} + +static inline void mlx5_devm_params_publish(struct mlx5_core_dev *dev) +{ +} +#endif +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_esw_devm.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_esw_devm.h new file mode 100644 index 0000000..ee3e3ba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_esw_devm.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies Ltd. */ + +#ifndef MLX5_ESW_DEVM_H +#define MLX5_ESW_DEVM_H + +#include +#include +#include "mlx5_devm.h" + +#if IS_ENABLED(CONFIG_MLXDEVM) +struct mlx5_devm_port { + struct mlxdevm_port port; + struct list_head list; + unsigned int port_index; + u32 sfnum; + u16 vport_num; +}; + +int mlx5_devm_sf_port_register(struct mlx5_core_dev *dev, u16 vport_num, + u32 controller, u32 sfnum, struct devlink_port *dl_port); +void mlx5_devm_sf_port_unregister(struct mlx5_core_dev *dev, u16 vport_num); +void mlx5_devm_sf_port_type_eth_set(struct mlx5_core_dev *dev, u16 vport_num, + struct net_device *ndev); +u32 mlx5_devm_sf_vport_to_sfnum(struct mlx5_core_dev *dev, u16 vport_num); +u32 mlx5_devm_sf_vport_to_controller(struct mlx5_core_dev *dev, u16 vport_num); +#else +static inline int mlx5_devm_sf_port_register(struct mlx5_core_dev *dev, u16 vport_num, + u32 controller, u32 sfnum, + struct devlink_port *dl_port) +{ + return 0; +} + +static inline void mlx5_devm_sf_port_unregister(struct mlx5_core_dev *dev, u16 vport_num) +{ +} + +static inline void mlx5_devm_sf_port_type_eth_set(struct mlx5_core_dev *dev, u16 vport_num, + struct net_device *ndev) +{ +} +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h new file mode 100644 index 0000000..041cac8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies. 
*/ + +#ifndef __MLX5_IRQ_H__ +#define __MLX5_IRQ_H__ + +#include + +#define MLX5_COMP_EQS_PER_SF 8 + +struct mlx5_irq; + +int mlx5_irq_table_init(struct mlx5_core_dev *dev); +void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev); +int mlx5_irq_table_create(struct mlx5_core_dev *dev); +void mlx5_irq_table_destroy(struct mlx5_core_dev *dev); +int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table); +int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table); +bool mlx5_irq_table_have_dedicated_sfs_irqs(struct mlx5_irq_table *table); +struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev); + +void mlx5_irq_rename(struct mlx5_core_dev *dev, struct mlx5_irq *irq, + const char *name); +int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn, + int msix_vec_count); +int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs); + +struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev); +void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq); +struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx, + const struct cpumask *affinity); +int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs, + struct mlx5_irq **irqs); +void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs); +int mlx5_irqs_request_mask(struct mlx5_core_dev *dev, struct mlx5_irq **irqs, + struct cpumask *irqs_req_mask); +int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb); +int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb); +struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq); +int mlx5_irq_get_index(struct mlx5_irq *irq); + +struct mlx5_irq_pool; +#ifdef CONFIG_MLX5_SF +int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs, + struct mlx5_irq **irqs); +struct mlx5_irq *mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, + const struct cpumask *req_mask); +void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, struct mlx5_irq **irqs, + int num_irqs); +#else +static inline int mlx5_irq_affinity_irqs_request_auto(struct mlx5_core_dev *dev, int nirqs, + struct mlx5_irq **irqs) +{ + return -EOPNOTSUPP; +} + +static inline struct mlx5_irq * +mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, const struct cpumask *req_mask) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void mlx5_irq_affinity_irqs_release(struct mlx5_core_dev *dev, + struct mlx5_irq **irqs, int num_irqs) {} +#endif +#endif /* __MLX5_IRQ_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mr.c new file mode 100644 index 0000000..f099a08 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in, + int inlen) +{ + u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {}; + u32 mkey_index; + int err; + + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); + + err = mlx5_cmd_exec(dev, in, inlen, lout, sizeof(lout)); + if (err) + return err; + + mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index); + *mkey = MLX5_GET(create_mkey_in, in, memory_key_mkey_entry.mkey_7_0) | + mlx5_idx_to_mkey(mkey_index); + + mlx5_core_dbg(dev, "out 0x%x, mkey 0x%x\n", mkey_index, *mkey); + return 0; +} +EXPORT_SYMBOL(mlx5_core_create_mkey); + +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 mkey) +{ + u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {}; + + MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); + MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey)); + return mlx5_cmd_exec_in(dev, destroy_mkey, in); +} +EXPORT_SYMBOL(mlx5_core_destroy_mkey); + +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, u32 mkey, u32 *out, + int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {}; + + memset(out, 0, outlen); + MLX5_SET(query_mkey_in, in, opcode, MLX5_CMD_OP_QUERY_MKEY); + MLX5_SET(query_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_mkey); + +static inline u32 mlx5_get_psv(u32 *out, int psv_index) +{ + switch (psv_index) { + case 1: return MLX5_GET(create_psv_out, out, psv1_index); + case 2: return MLX5_GET(create_psv_out, out, psv2_index); + case 3: return MLX5_GET(create_psv_out, out, psv3_index); + default: return MLX5_GET(create_psv_out, out, psv0_index); + } +} + +int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, + int npsvs, u32 *sig_index) +{ + u32 out[MLX5_ST_SZ_DW(create_psv_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_psv_in)] = {}; + int i, err; + + if (npsvs > MLX5_MAX_PSVS) + return -EINVAL; + + MLX5_SET(create_psv_in, in, opcode, MLX5_CMD_OP_CREATE_PSV); + MLX5_SET(create_psv_in, in, pd, pdn); + MLX5_SET(create_psv_in, in, num_psv, npsvs); + + err = mlx5_cmd_exec_inout(dev, create_psv, in, out); + if (err) + return err; + + for (i = 0; i < npsvs; i++) + sig_index[i] = mlx5_get_psv(out, i); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_psv); + +int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num) +{ + u32 in[MLX5_ST_SZ_DW(destroy_psv_in)] = {}; + + MLX5_SET(destroy_psv_in, in, opcode, MLX5_CMD_OP_DESTROY_PSV); + MLX5_SET(destroy_psv_in, in, psvn, psv_num); + return mlx5_cmd_exec_in(dev, destroy_psv, in); +} +EXPORT_SYMBOL(mlx5_core_destroy_psv); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mst_dump.c 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mst_dump.c new file mode 100644 index 0000000..d15a99e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/mst_dump.c @@ -0,0 +1,7490 @@ +/* + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "mlx5_core.h" + +#define MLX5_EXTRACT_C(source, offset, size) \ +((((unsigned)(source)) >> (offset)) & MLX5_ONES32(size)) +#define MLX5_EXTRACT(src, start, len) \ +(((len) == 32) ? (src) : MLX5_EXTRACT_C(src, start, len)) +#define MLX5_ONES32(size) \ +((size) ? (0xffffffff >> (32 - (size))) : 0) +#define MLX5_MASK32(offset, size) \ +(MLX5_ONES32(size) << (offset)) +#define MLX5_MERGE_C(rsrc1, rsrc2, start, len) \ +((((rsrc2) << (start)) & (MLX5_MASK32((start), (len)))) | \ +((rsrc1) & (~MLX5_MASK32((start), (len))))) +#define MLX5_MERGE(rsrc1, rsrc2, start, len) \ +(((len) == 32) ? 
(rsrc2) : MLX5_MERGE_C(rsrc1, rsrc2, start, len)) + +#define MLX5_CR_SPACE_DOMAIN 0x2 +#define MLX5_ICMD_SPACE_DOMAIN 0x3 + +#define MLX5_HWID_ADDR 0xf0014 +#define MLX5_ADDR_REG 0x58 +#define MLX5_DATA_REG 0x5c + +#define MLX5_NUM_MST_OFFSETS_MT4117 3128 +#define MLX5_NUM_MST_OFFSETS_MT4115 3577 +#define MLX5_MST_DUMP_SIZE_BYTES_MT4115 (0x561e0l) +#define MLX5_MST_DUMP_SIZE_BYTES_MT4117 (0x4eed0l) + +static const unsigned long +mlx5_mst_dump_regs_mt4117[MLX5_NUM_MST_OFFSETS_MT4117][2] = {{0x000000, 16388}, + {0x010084, 1}, + {0x01008c, 2}, + {0x010180, 6}, + {0x01019c, 9}, + {0x010304, 1}, + {0x0103bc, 52}, + {0x010500, 35}, + {0x010604, 1}, + {0x01060c, 1}, + {0x010624, 1}, + {0x01062c, 1}, + {0x010644, 1}, + {0x01064c, 3}, + {0x010800, 1}, + {0x010814, 3}, + {0x010844, 1}, + {0x01084c, 4}, + {0x011000, 32}, + {0x011200, 32}, + {0x011400, 32}, + {0x011600, 32}, + {0x011800, 32}, + {0x011a00, 5}, + {0x011a24, 1}, + {0x011a2c, 1}, + {0x011a34, 3}, + {0x011a44, 1}, + {0x011a4c, 2}, + {0x012000, 20}, + {0x012088, 22}, + {0x012100, 18}, + {0x012200, 36}, + {0x012300, 3}, + {0x012310, 1}, + {0x012320, 1}, + {0x012330, 1}, + {0x012340, 1}, + {0x012350, 1}, + {0x012360, 1}, + {0x012370, 1}, + {0x012400, 166}, + {0x0126a0, 1}, + {0x012700, 32}, + {0x012800, 1}, + {0x012808, 1}, + {0x012810, 1}, + {0x012818, 1}, + {0x012820, 1}, + {0x012828, 1}, + {0x012830, 1}, + {0x012838, 1}, + {0x012850, 1}, + {0x012858, 1}, + {0x012860, 1}, + {0x012868, 1}, + {0x012870, 1}, + {0x012878, 1}, + {0x012880, 1}, + {0x012888, 1}, + {0x012890, 5}, + {0x012900, 10}, + {0x012940, 6}, + {0x012960, 6}, + {0x012980, 5}, + {0x012a00, 6}, + {0x012a40, 5}, + {0x012a80, 9}, + {0x012ac0, 19}, + {0x012b10, 2}, + {0x014000, 1}, + {0x014010, 10}, + {0x014104, 1}, + {0x01410c, 4}, + {0x014200, 1}, + {0x014210, 4}, + {0x014300, 4}, + {0x014400, 3}, + {0x014410, 1}, + {0x014444, 1}, + {0x01444c, 4}, + {0x014480, 2}, + {0x014500, 35}, + {0x014590, 1}, + {0x014600, 32}, + {0x014704, 1}, + {0x01470c, 1}, + {0x014804, 2}, + {0x014814, 3}, + {0x014824, 2}, + {0x014834, 3}, + {0x014844, 2}, + {0x014854, 5}, + {0x014a00, 4}, + {0x014a14, 2}, + {0x015004, 1}, + {0x01500c, 4}, + {0x015204, 1}, + {0x015214, 4}, + {0x015228, 1}, + {0x015244, 1}, + {0x015250, 4}, + {0x015274, 1}, + {0x01527c, 1}, + {0x0152a4, 1}, + {0x015300, 4}, + {0x015314, 1}, + {0x01531c, 3}, + {0x015404, 2}, + {0x015430, 20}, + {0x015484, 2}, + {0x015498, 5}, + {0x0154bc, 1}, + {0x0154c8, 1}, + {0x0154d4, 2}, + {0x01550c, 3}, + {0x015524, 3}, + {0x015608, 3}, + {0x015618, 4}, + {0x015680, 16}, + {0x015704, 2}, + {0x015730, 20}, + {0x015784, 3}, + {0x015794, 3}, + {0x0157c0, 6}, + {0x015800, 52}, + {0x0158d4, 1}, + {0x0158dc, 1}, + {0x0158e4, 3}, + {0x0158f4, 1}, + {0x0158fc, 52}, + {0x015b04, 2}, + {0x015b14, 3}, + {0x015b24, 2}, + {0x015b38, 6}, + {0x015b60, 5}, + {0x015c04, 1}, + {0x015cb8, 18}, + {0x015d98, 1}, + {0x015e00, 1}, + {0x015e08, 1}, + {0x015e10, 1}, + {0x016000, 32}, + {0x016100, 7}, + {0x016120, 2}, + {0x016144, 3}, + {0x016204, 1}, + {0x01620c, 1}, + {0x016224, 1}, + {0x01622c, 1}, + {0x016234, 1}, + {0x01623c, 1}, + {0x016244, 1}, + {0x01624c, 1}, + {0x016254, 1}, + {0x01625c, 2}, + {0x016270, 1}, + {0x016280, 1}, + {0x016290, 1}, + {0x0162a0, 1}, + {0x0162b0, 1}, + {0x0162c0, 1}, + {0x0162d0, 1}, + {0x0162e0, 1}, + {0x0162f0, 1}, + {0x016300, 1}, + {0x016310, 1}, + {0x016320, 1}, + {0x016330, 1}, + {0x016340, 32}, + {0x016400, 19}, + {0x016454, 7}, + {0x016484, 1}, + {0x01648c, 1}, + {0x016800, 9}, + {0x016840, 16}, + {0x017800, 1}, + {0x017828, 1}, + 
{0x017850, 1}, + {0x017878, 1}, + {0x0178a0, 12}, + {0x0179ac, 1}, + {0x0179d8, 4}, + {0x017a00, 6}, + {0x017a20, 6}, + {0x017a40, 3}, + {0x017c00, 6}, + {0x017c20, 6}, + {0x017c40, 6}, + {0x017c60, 6}, + {0x017c80, 6}, + {0x017ca0, 6}, + {0x017cc0, 6}, + {0x017ce0, 6}, + {0x017d00, 3}, + {0x017d10, 6}, + {0x018400, 11}, + {0x018430, 2}, + {0x018440, 4}, + {0x018604, 1}, + {0x018618, 2}, + {0x018640, 4}, + {0x018660, 2}, + {0x018800, 1}, + {0x018810, 4}, + {0x018844, 1}, + {0x01884c, 4}, + {0x018880, 2}, + {0x01a400, 12}, + {0x01a444, 1}, + {0x01a44c, 1}, + {0x01a800, 1}, + {0x01a814, 3}, + {0x01a844, 1}, + {0x01a84c, 4}, + {0x01c400, 7}, + {0x01c500, 8}, + {0x01c544, 1}, + {0x01c554, 3}, + {0x01c564, 3}, + {0x01c574, 3}, + {0x01c604, 1}, + {0x01c60c, 3}, + {0x01c800, 1}, + {0x01c814, 3}, + {0x01c844, 1}, + {0x01c84c, 4}, + {0x01f000, 1}, + {0x01f400, 1}, + {0x01f408, 4}, + {0x01f41c, 3}, + {0x01f500, 2}, + {0x01f800, 1}, + {0x01f814, 3}, + {0x01f844, 1}, + {0x01f84c, 4}, + {0x01f880, 3}, + {0x020004, 1}, + {0x02000c, 4}, + {0x020080, 3}, + {0x020090, 5}, + {0x020800, 16}, + {0x020900, 192}, + {0x021004, 1}, + {0x02100c, 4}, + {0x021400, 5}, + {0x021418, 5}, + {0x021480, 1}, + {0x021704, 2}, + {0x02173c, 17}, + {0x0217fc, 2}, + {0x021844, 1}, + {0x022280, 12}, + {0x022408, 6}, + {0x022444, 1}, + {0x022454, 3}, + {0x022504, 1}, + {0x02250c, 4}, + {0x022624, 1}, + {0x02262c, 3}, + {0x02263c, 1}, + {0x022804, 1}, + {0x02280c, 4}, + {0x022904, 4}, + {0x022924, 4}, + {0x024000, 36}, + {0x0240c0, 21}, + {0x024120, 11}, + {0x024200, 5}, + {0x024220, 1}, + {0x024230, 8}, + {0x024258, 1}, + {0x024260, 1}, + {0x024270, 9}, + {0x0242a0, 1}, + {0x0242b0, 4}, + {0x0242c8, 2}, + {0x024300, 5}, + {0x024318, 2}, + {0x02439c, 25}, + {0x024424, 4}, + {0x024464, 13}, + {0x0244a4, 1}, + {0x0244ac, 3}, + {0x0244c0, 2}, + {0x0244d0, 3}, + {0x0244e0, 3}, + {0x0244f0, 3}, + {0x024500, 3}, + {0x024510, 3}, + {0x024520, 3}, + {0x024530, 3}, + {0x024884, 11}, + {0x0248b4, 3}, + {0x0248c4, 1}, + {0x0248cc, 1}, + {0x0248d4, 1}, + {0x0248dc, 1}, + {0x0248f0, 2}, + {0x024908, 6}, + {0x024928, 6}, + {0x024968, 6}, + {0x024984, 3}, + {0x024994, 1}, + {0x02499c, 6}, + {0x0249b8, 7}, + {0x024a08, 6}, + {0x024a28, 6}, + {0x024a68, 6}, + {0x024a84, 1}, + {0x024a8c, 1}, + {0x024a94, 1}, + {0x024a9c, 13}, + {0x024ae0, 6}, + {0x024b00, 7}, + {0x024b20, 6}, + {0x024b40, 8}, + {0x024c00, 2}, + {0x024c24, 3}, + {0x024c34, 3}, + {0x025004, 6}, + {0x025800, 37}, + {0x025904, 1}, + {0x02590c, 1}, + {0x026000, 99}, + {0x026200, 1}, + {0x026800, 7}, + {0x026824, 6}, + {0x026840, 2}, + {0x026864, 1}, + {0x02686c, 1}, + {0x026874, 3}, + {0x026884, 1}, + {0x0268a4, 7}, + {0x026904, 1}, + {0x02690c, 4}, + {0x026940, 1}, + {0x026980, 33}, + {0x026a0c, 7}, + {0x026a30, 1}, + {0x026a44, 4}, + {0x026a60, 1}, + {0x026a70, 1}, + {0x026b00, 9}, + {0x026b44, 2}, + {0x026b68, 6}, + {0x026b84, 2}, + {0x026ba8, 14}, + {0x026c00, 16}, + {0x026c44, 1}, + {0x026c4c, 1}, + {0x026c84, 1}, + {0x026c8c, 4}, + {0x026cc4, 1}, + {0x026ccc, 4}, + {0x026d00, 2}, + {0x028800, 2}, + {0x028844, 1}, + {0x02884c, 4}, + {0x029004, 7}, + {0x029b30, 2}, + {0x029b50, 4}, + {0x02a004, 1}, + {0x02a00c, 4}, + {0x02a040, 6}, + {0x02a200, 1}, + {0x02a210, 11}, + {0x02a240, 8}, + {0x02a484, 1}, + {0x02a4c0, 16}, + {0x02a780, 1}, + {0x02a7a0, 4}, + {0x02a7c0, 1}, + {0x02a900, 1}, + {0x02aa04, 1}, + {0x02aa0c, 4}, + {0x02ab00, 40}, + {0x02aba4, 1}, + {0x02abac, 1}, + {0x02abb4, 1}, + {0x02abbc, 1}, + {0x02abc4, 1}, + {0x02abcc, 1}, + {0x02abe0, 4}, + {0x02abf4, 2}, + {0x02ac00, 
2}, + {0x02ac10, 3}, + {0x02ac44, 3}, + {0x02ad00, 1}, + {0x02ad08, 1}, + {0x02ad10, 1}, + {0x02ad18, 1}, + {0x02ad20, 1}, + {0x02ad28, 1}, + {0x02ad30, 1}, + {0x02ad38, 1}, + {0x02ad40, 1}, + {0x02ad48, 1}, + {0x02ad50, 1}, + {0x02ad58, 1}, + {0x02ad60, 1}, + {0x02ad68, 1}, + {0x02ad70, 1}, + {0x02ad78, 1}, + {0x02ad80, 1}, + {0x02ad88, 1}, + {0x02ad90, 1}, + {0x02ad98, 1}, + {0x02ada0, 1}, + {0x02ada8, 1}, + {0x02adb0, 1}, + {0x02adb8, 1}, + {0x02adc0, 1}, + {0x02adc8, 1}, + {0x02add0, 1}, + {0x02add8, 1}, + {0x02ade0, 1}, + {0x02ade8, 1}, + {0x02adf0, 1}, + {0x02adf8, 1}, + {0x02ae00, 6}, + {0x02ae20, 5}, + {0x02ae40, 4}, + {0x02c000, 6}, + {0x02c100, 5}, + {0x02c204, 1}, + {0x02c214, 3}, + {0x02c224, 1}, + {0x02c22c, 4}, + {0x02c244, 2}, + {0x02c250, 5}, + {0x02c400, 2}, + {0x02c428, 2}, + {0x02c450, 2}, + {0x02c478, 2}, + {0x02c4a0, 21}, + {0x02c5ac, 1}, + {0x02c5d8, 4}, + {0x02c600, 6}, + {0x02c620, 6}, + {0x02c640, 6}, + {0x02c660, 6}, + {0x02c680, 3}, + {0x02c800, 3}, + {0x02c820, 6}, + {0x02c840, 6}, + {0x02c860, 6}, + {0x02c880, 6}, + {0x02c8a0, 6}, + {0x02c8c0, 6}, + {0x02c8e0, 6}, + {0x02c900, 6}, + {0x02c920, 6}, + {0x02c940, 6}, + {0x02c960, 6}, + {0x02c980, 6}, + {0x02c9a0, 6}, + {0x02c9c0, 6}, + {0x02c9e0, 6}, + {0x02ca00, 6}, + {0x02ca20, 6}, + {0x02ca40, 6}, + {0x02ca60, 6}, + {0x02ca80, 6}, + {0x02caa0, 6}, + {0x02cac0, 6}, + {0x02cae0, 6}, + {0x02cb00, 6}, + {0x02cb20, 6}, + {0x02cc48, 5}, + {0x02cd00, 9}, + {0x02cd40, 18}, + {0x030004, 2}, + {0x030034, 19}, + {0x030084, 2}, + {0x0300bc, 17}, + {0x030104, 2}, + {0x030138, 27}, + {0x030310, 3}, + {0x030340, 2}, + {0x03034c, 2}, + {0x030384, 1}, + {0x0303c0, 16}, + {0x030404, 1}, + {0x03040c, 4}, + {0x030804, 1}, + {0x03080c, 4}, + {0x030c04, 1}, + {0x030c0c, 4}, + {0x030c40, 4}, + {0x031000, 11}, + {0x031100, 11}, + {0x031200, 17}, + {0x031280, 6}, + {0x031304, 1}, + {0x03130c, 5}, + {0x031400, 6}, + {0x031420, 1}, + {0x031444, 2}, + {0x031454, 3}, + {0x031464, 2}, + {0x031474, 11}, + {0x031500, 7}, + {0x031520, 6}, + {0x031540, 8}, + {0x031600, 13}, + {0x031640, 6}, + {0x031700, 2}, + {0x034200, 24}, + {0x034280, 10}, + {0x0342ac, 2}, + {0x0342c0, 6}, + {0x0342f0, 39}, + {0x034600, 24}, + {0x034680, 10}, + {0x0346ac, 2}, + {0x0346c0, 6}, + {0x0346f0, 39}, + {0x034c00, 6}, + {0x034c20, 4}, + {0x034c40, 9}, + {0x034c80, 9}, + {0x034cc0, 3}, + {0x034cd0, 3}, + {0x034e04, 2}, + {0x034e10, 4}, + {0x034e44, 1}, + {0x034e4c, 4}, + {0x034e80, 6}, + {0x034ea0, 4}, + {0x034f04, 1}, + {0x034f18, 11}, + {0x034f80, 2}, + {0x035000, 2}, + {0x035010, 3}, + {0x035044, 3}, + {0x035100, 66}, + {0x035210, 3}, + {0x035244, 3}, + {0x035300, 64}, + {0x035404, 1}, + {0x03540c, 8}, + {0x037000, 6}, + {0x03702c, 7}, + {0x037080, 10}, + {0x0370ac, 4}, + {0x037100, 4}, + {0x037200, 2}, + {0x037210, 3}, + {0x037244, 3}, + {0x037300, 70}, + {0x03742c, 7}, + {0x037480, 10}, + {0x0374ac, 4}, + {0x037500, 4}, + {0x037600, 2}, + {0x037610, 3}, + {0x037644, 3}, + {0x037700, 69}, + {0x037818, 4}, + {0x038000, 3}, + {0x038104, 3}, + {0x03813c, 2}, + {0x038150, 3}, + {0x038400, 1}, + {0x038428, 1}, + {0x038450, 1}, + {0x038478, 1}, + {0x0384a0, 20}, + {0x0385ac, 1}, + {0x0385d8, 4}, + {0x038600, 6}, + {0x038620, 6}, + {0x038640, 3}, + {0x038800, 3}, + {0x038820, 6}, + {0x038840, 6}, + {0x038860, 6}, + {0x038880, 6}, + {0x0388a0, 6}, + {0x0388c0, 6}, + {0x038900, 28}, + {0x038978, 2}, + {0x038a40, 25}, + {0x038ac0, 16}, + {0x039000, 35}, + {0x039090, 3}, + {0x039100, 35}, + {0x039190, 3}, + {0x039200, 35}, + {0x039290, 3}, + {0x039300, 35}, + {0x039390, 3}, 
+ {0x039400, 35}, + {0x039490, 3}, + {0x039500, 1}, + {0x039800, 3}, + {0x039884, 1}, + {0x0398c0, 16}, + {0x039904, 2}, + {0x039934, 20}, + {0x039a04, 2}, + {0x039a10, 4}, + {0x039a24, 2}, + {0x039a30, 4}, + {0x039a44, 2}, + {0x039a50, 4}, + {0x039a64, 2}, + {0x039a70, 8}, + {0x039c00, 7}, + {0x039c20, 6}, + {0x039c40, 8}, + {0x039d00, 11}, + {0x039d40, 11}, + {0x039d84, 1}, + {0x039dc0, 26}, + {0x039e30, 2}, + {0x039e44, 3}, + {0x039e54, 1}, + {0x039e5c, 2}, + {0x039e80, 2}, + {0x039e90, 3}, + {0x039ea4, 1}, + {0x039eac, 3}, + {0x039ec0, 3}, + {0x039f00, 9}, + {0x039f40, 22}, + {0x039fa0, 6}, + {0x039fc0, 8}, + {0x03c000, 3}, + {0x03c010, 3}, + {0x03c020, 3}, + {0x03c040, 9}, + {0x03c068, 6}, + {0x03c090, 2}, + {0x03c0a0, 3}, + {0x03c0c0, 12}, + {0x03c0f4, 1}, + {0x03c100, 2}, + {0x03c110, 3}, + {0x03c120, 1}, + {0x03c130, 11}, + {0x03c160, 2}, + {0x03c180, 4}, + {0x03c194, 3}, + {0x03c1a4, 2}, + {0x03c1b0, 4}, + {0x03c2e0, 5}, + {0x03c2f8, 2}, + {0x03c30c, 13}, + {0x03c34c, 77}, + {0x03c48c, 18}, + {0x03c500, 32}, + {0x03c800, 16}, + {0x03c84c, 18}, + {0x03c8e0, 2}, + {0x03c8ec, 3}, + {0x03ca78, 34}, + {0x03cb3c, 18}, + {0x03cb94, 3}, + {0x03cba4, 3}, + {0x03cbf0, 1}, + {0x03cbf8, 10}, + {0x03cc30, 1}, + {0x03cc44, 4}, + {0x03cc60, 1}, + {0x03cc80, 1}, + {0x03cc90, 1}, + {0x03d004, 6}, + {0x03e004, 1}, + {0x03e00c, 4}, + {0x03e404, 1}, + {0x03e40c, 4}, + {0x03e604, 1}, + {0x03e60c, 4}, + {0x03e800, 1}, + {0x03f034, 19}, + {0x03f084, 2}, + {0x03f0c0, 16}, + {0x03f200, 1}, + {0x03f210, 1}, + {0x03f300, 6}, + {0x03f320, 6}, + {0x03f380, 9}, + {0x03f3c0, 16}, + {0x050000, 1}, + {0x050008, 2}, + {0x050044, 8}, + {0x050104, 1}, + {0x050178, 34}, + {0x050204, 1}, + {0x05020c, 1}, + {0x050214, 1}, + {0x050224, 1}, + {0x05022c, 1}, + {0x050234, 1}, + {0x05023c, 1}, + {0x050244, 1}, + {0x05024c, 1}, + {0x050254, 1}, + {0x050264, 3}, + {0x050280, 2}, + {0x050290, 2}, + {0x0502b4, 1}, + {0x0502bc, 1}, + {0x0502c4, 1}, + {0x0502cc, 1}, + {0x0502d4, 1}, + {0x0502dc, 1}, + {0x0502e4, 1}, + {0x0502ec, 1}, + {0x0502f4, 1}, + {0x0502fc, 1}, + {0x050304, 1}, + {0x05030c, 1}, + {0x050314, 3}, + {0x050324, 3}, + {0x050334, 6}, + {0x050380, 32}, + {0x050404, 1}, + {0x050438, 18}, + {0x050500, 2}, + {0x050544, 1}, + {0x05054c, 4}, + {0x050584, 2}, + {0x050598, 2}, + {0x0505a4, 1}, + {0x0505b0, 4}, + {0x0505c4, 1}, + {0x0505cc, 1}, + {0x0505d4, 1}, + {0x0505e0, 1}, + {0x0505f0, 17}, + {0x050700, 2}, + {0x050800, 1}, + {0x050820, 9}, + {0x050850, 5}, + {0x050870, 5}, + {0x0508c4, 3}, + {0x0508d4, 3}, + {0x0508e4, 3}, + {0x0508f4, 4}, + {0x050910, 5}, + {0x050930, 4}, + {0x050944, 1}, + {0x05094c, 47}, + {0x050a40, 1}, + {0x050a50, 28}, + {0x050ac4, 1}, + {0x050acc, 2}, + {0x050ad8, 10}, + {0x050b04, 1}, + {0x050b0c, 1}, + {0x050b14, 1}, + {0x050b20, 9}, + {0x050b50, 4}, + {0x050b64, 1}, + {0x050b70, 6}, + {0x050b8c, 4}, + {0x050ba4, 4}, + {0x050bc0, 1}, + {0x050bd0, 1}, + {0x050be0, 1}, + {0x050bf0, 1}, + {0x050c00, 1}, + {0x050c14, 1}, + {0x050c1c, 1}, + {0x050c24, 1}, + {0x050c2c, 1}, + {0x050c34, 1}, + {0x050c3c, 1}, + {0x050c44, 1}, + {0x050c4c, 1}, + {0x050c54, 4}, + {0x050c70, 1}, + {0x050c80, 1}, + {0x050c90, 1}, + {0x050ca0, 1}, + {0x050cb0, 1}, + {0x050cc0, 1}, + {0x050cd0, 1}, + {0x050ce0, 1}, + {0x050cf0, 1}, + {0x050d00, 1}, + {0x050d10, 1}, + {0x050d20, 1}, + {0x050d30, 1}, + {0x050d40, 1}, + {0x050d50, 1}, + {0x050d60, 1}, + {0x050d70, 1}, + {0x050d80, 1}, + {0x050d90, 1}, + {0x050da0, 1}, + {0x050db0, 1}, + {0x050dc0, 1}, + {0x050dd0, 1}, + {0x050de0, 1}, + {0x050e00, 64}, + {0x052000, 192}, + 
{0x052800, 2}, + {0x053000, 51}, + {0x053400, 3}, + {0x053420, 6}, + {0x053440, 6}, + {0x053460, 6}, + {0x053480, 6}, + {0x0534a0, 6}, + {0x0534c0, 6}, + {0x0534e0, 6}, + {0x053500, 6}, + {0x053520, 6}, + {0x053540, 6}, + {0x053560, 6}, + {0x053614, 11}, + {0x053654, 11}, + {0x0536d4, 11}, + {0x053704, 1}, + {0x053740, 18}, + {0x0537a0, 10}, + {0x0537e0, 29}, + {0x053860, 10}, + {0x054204, 1}, + {0x05420c, 1}, + {0x054214, 1}, + {0x054224, 1}, + {0x05422c, 1}, + {0x054234, 1}, + {0x054244, 1}, + {0x05424c, 3}, + {0x054304, 1}, + {0x054340, 16}, + {0x0543a0, 1}, + {0x0543b0, 1}, + {0x0543c0, 1}, + {0x0543d0, 4}, + {0x054400, 8}, + {0x054440, 10}, + {0x054820, 3}, + {0x054928, 9}, + {0x054954, 4}, + {0x054970, 1}, + {0x054980, 1}, + {0x054990, 5}, + {0x0549c4, 13}, + {0x054a04, 1}, + {0x054a0c, 4}, + {0x054a40, 1}, + {0x054a50, 5}, + {0x054a70, 1}, + {0x054a80, 1}, + {0x054a90, 2}, + {0x054c04, 1}, + {0x054c0c, 4}, + {0x054c40, 1}, + {0x054c50, 4}, + {0x054d40, 1}, + {0x054d80, 7}, + {0x054e04, 1}, + {0x054e0c, 4}, + {0x054e40, 1}, + {0x054e50, 4}, + {0x054f40, 1}, + {0x054f80, 7}, + {0x055000, 11}, + {0x055050, 7}, + {0x055070, 16}, + {0x0550b8, 8}, + {0x055200, 13}, + {0x055240, 11}, + {0x055300, 16}, + {0x055400, 3}, + {0x055420, 6}, + {0x055440, 6}, + {0x055460, 6}, + {0x055480, 6}, + {0x0554a0, 6}, + {0x0554c0, 6}, + {0x0554e0, 6}, + {0x055500, 6}, + {0x055520, 6}, + {0x055540, 6}, + {0x055560, 6}, + {0x055580, 6}, + {0x0555a0, 6}, + {0x0555c0, 6}, + {0x0555e0, 6}, + {0x055600, 6}, + {0x055620, 6}, + {0x055640, 6}, + {0x055660, 6}, + {0x055680, 6}, + {0x056000, 2}, + {0x056044, 1}, + {0x05604c, 4}, + {0x056800, 2}, + {0x056844, 1}, + {0x05684c, 4}, + {0x057004, 7}, + {0x057b20, 2}, + {0x057b30, 2}, + {0x057b50, 5}, + {0x058004, 1}, + {0x05800c, 4}, + {0x058800, 7}, + {0x058820, 6}, + {0x058844, 2}, + {0x058864, 9}, + {0x058890, 1}, + {0x0588a0, 1}, + {0x0588b0, 1}, + {0x0588c0, 1}, + {0x0588d0, 1}, + {0x058904, 3}, + {0x058978, 66}, + {0x058a84, 1}, + {0x058a8c, 1}, + {0x058a94, 1}, + {0x058aa0, 2}, + {0x058ff0, 4}, + {0x059004, 1}, + {0x05900c, 4}, + {0x059080, 39}, + {0x059120, 1}, + {0x059130, 1}, + {0x059140, 1}, + {0x059150, 1}, + {0x059160, 1}, + {0x059800, 1}, + {0x059810, 1}, + {0x059820, 1}, + {0x059830, 1}, + {0x059840, 1}, + {0x059850, 1}, + {0x059860, 1}, + {0x059870, 1}, + {0x059880, 1}, + {0x059890, 1}, + {0x0598a0, 1}, + {0x0598b0, 1}, + {0x0598d8, 3}, + {0x059900, 3}, + {0x059980, 9}, + {0x0599c0, 32}, + {0x059a44, 3}, + {0x059a54, 1}, + {0x059a5c, 1}, + {0x059c00, 2}, + {0x059c28, 2}, + {0x059c50, 2}, + {0x059c78, 2}, + {0x059ca0, 18}, + {0x059dac, 1}, + {0x059dd8, 4}, + {0x059e00, 6}, + {0x059e20, 6}, + {0x059e40, 6}, + {0x059e60, 6}, + {0x059e80, 3}, + {0x05a000, 4}, + {0x05a020, 2}, + {0x05a030, 1}, + {0x05a04c, 2}, + {0x05a070, 4}, + {0x05a088, 3}, + {0x05a0a0, 1}, + {0x05a0b0, 1}, + {0x05a0c0, 1}, + {0x05a0d4, 1}, + {0x060020, 4}, + {0x060120, 4}, + {0x060144, 2}, + {0x060154, 7}, + {0x060174, 1}, + {0x06017c, 3}, + {0x06018c, 1}, + {0x0601a0, 4}, + {0x0601b8, 2}, + {0x0601c4, 2}, + {0x0601e4, 7}, + {0x060204, 1}, + {0x06020c, 4}, + {0x060240, 16}, + {0x060304, 1}, + {0x06030c, 1}, + {0x060400, 2}, + {0x060420, 3}, + {0x060430, 1}, + {0x060440, 1}, + {0x060484, 2}, + {0x0604b8, 18}, + {0x060504, 1}, + {0x06050c, 1}, + {0x060514, 1}, + {0x060900, 1}, + {0x060914, 1}, + {0x06091c, 2}, + {0x060930, 1}, + {0x060a00, 32}, + {0x060a84, 1}, + {0x060a8c, 1}, + {0x060a94, 1}, + {0x060a9c, 1}, + {0x060bf0, 3}, + {0x060c00, 11}, + {0x060c30, 3}, + {0x061004, 1}, + {0x061010, 
1}, + {0x061018, 4}, + {0x061044, 2}, + {0x061054, 3}, + {0x061100, 32}, + {0x061200, 1}, + {0x061404, 1}, + {0x06140c, 4}, + {0x061440, 2}, + {0x061468, 2}, + {0x061478, 2}, + {0x06180c, 8}, + {0x061908, 6}, + {0x061928, 6}, + {0x061968, 6}, + {0x061988, 6}, + {0x0619a8, 6}, + {0x0619e8, 14}, + {0x062008, 5}, + {0x062024, 3}, + {0x062034, 1}, + {0x06203c, 1}, + {0x062044, 1}, + {0x06204c, 1}, + {0x062054, 1}, + {0x06205c, 3}, + {0x06206c, 1}, + {0x062080, 2}, + {0x062094, 1}, + {0x06209c, 2}, + {0x0620b0, 1}, + {0x0620c0, 1}, + {0x0620d0, 1}, + {0x0620e0, 1}, + {0x0620f4, 4}, + {0x0621c0, 2}, + {0x0621e4, 3}, + {0x0621f4, 3}, + {0x062404, 1}, + {0x06240c, 4}, + {0x062800, 12}, + {0x062834, 1}, + {0x06283c, 5}, + {0x062854, 1}, + {0x062900, 3}, + {0x062914, 1}, + {0x06291c, 1}, + {0x062924, 1}, + {0x06292c, 1}, + {0x062934, 1}, + {0x06293c, 2}, + {0x062950, 3}, + {0x062960, 2}, + {0x063000, 1}, + {0x063010, 4}, + {0x063024, 1}, + {0x06302c, 1}, + {0x063034, 1}, + {0x063044, 1}, + {0x06304c, 1}, + {0x063054, 1}, + {0x06305c, 2}, + {0x063070, 1}, + {0x063080, 1}, + {0x063090, 1}, + {0x0630a4, 3}, + {0x063100, 2}, + {0x063144, 1}, + {0x06314c, 1}, + {0x063154, 1}, + {0x063164, 1}, + {0x06316c, 1}, + {0x063174, 1}, + {0x063180, 16}, + {0x063208, 1}, + {0x063210, 1}, + {0x063218, 1}, + {0x063224, 3}, + {0x063240, 10}, + {0x063300, 8}, + {0x064004, 1}, + {0x06400c, 4}, + {0x064200, 3}, + {0x064300, 1}, + {0x064308, 6}, + {0x064324, 2}, + {0x064338, 2}, + {0x064380, 2}, + {0x064394, 1}, + {0x06439c, 2}, + {0x064400, 2}, + {0x064420, 3}, + {0x064430, 1}, + {0x064440, 1}, + {0x064484, 2}, + {0x0644b8, 18}, + {0x064504, 1}, + {0x06450c, 1}, + {0x064514, 1}, + {0x064804, 1}, + {0x064884, 2}, + {0x0648c8, 14}, + {0x065018, 2}, + {0x06507c, 10}, + {0x0650c0, 16}, + {0x065104, 3}, + {0x065114, 1}, + {0x065200, 3}, + {0x065220, 6}, + {0x065240, 6}, + {0x065260, 6}, + {0x065280, 6}, + {0x065300, 1}, + {0x065f00, 2}, + {0x066000, 2}, + {0x066028, 2}, + {0x066050, 2}, + {0x066078, 2}, + {0x0660a0, 27}, + {0x0661ac, 1}, + {0x0661d8, 4}, + {0x066200, 6}, + {0x066220, 6}, + {0x066240, 6}, + {0x066260, 6}, + {0x066280, 3}, + {0x066400, 6}, + {0x066500, 17}, + {0x066550, 8}, + {0x066574, 3}, + {0x070004, 1}, + {0x07000c, 4}, + {0x070400, 7}, + {0x070420, 11}, + {0x070500, 5}, + {0x070524, 1}, + {0x07052c, 1}, + {0x070534, 1}, + {0x070540, 4}, + {0x070600, 142}, + {0x070884, 2}, + {0x0708b4, 19}, + {0x070904, 1}, + {0x070940, 16}, + {0x071000, 1}, + {0x071094, 1}, + {0x071104, 2}, + {0x071140, 16}, + {0x071204, 1}, + {0x07120c, 4}, + {0x071404, 1}, + {0x07140c, 1}, + {0x071414, 1}, + {0x07141c, 1}, + {0x071424, 1}, + {0x07142c, 1}, + {0x071434, 1}, + {0x07143c, 1}, + {0x071800, 2}, + {0x071904, 1}, + {0x071940, 16}, + {0x071a04, 1}, + {0x071a40, 16}, + {0x071b04, 1}, + {0x071b40, 16}, + {0x072000, 3}, + {0x072804, 1}, + {0x07280c, 4}, + {0x072880, 1}, + {0x072888, 3}, + {0x073000, 1}, + {0x073020, 1}, + {0x073040, 1}, + {0x073060, 1}, + {0x073080, 1}, + {0x0730a0, 1}, + {0x0730c0, 1}, + {0x0730e0, 1}, + {0x073100, 1}, + {0x073120, 1}, + {0x073140, 1}, + {0x073160, 1}, + {0x073180, 1}, + {0x0731a0, 1}, + {0x0731c0, 1}, + {0x0731e0, 1}, + {0x073200, 1}, + {0x073220, 1}, + {0x073240, 1}, + {0x073260, 1}, + {0x073280, 1}, + {0x0732a0, 1}, + {0x0732c0, 1}, + {0x0732e0, 1}, + {0x073300, 1}, + {0x073320, 1}, + {0x073340, 1}, + {0x073360, 1}, + {0x073380, 1}, + {0x0733a0, 1}, + {0x0733c0, 1}, + {0x0733e0, 1}, + {0x073800, 16}, + {0x074830, 4}, + {0x074884, 1}, + {0x074890, 4}, + {0x074900, 3}, + {0x074920, 6}, + 
{0x074940, 6}, + {0x074980, 9}, + {0x0749c0, 16}, + {0x075000, 3}, + {0x075010, 2}, + {0x075020, 4}, + {0x075100, 2}, + {0x080000, 29}, + {0x080340, 14}, + {0x08037c, 3}, + {0x08038c, 1}, + {0x0803c0, 2}, + {0x0803e4, 3}, + {0x0803f4, 3}, + {0x080404, 6}, + {0x080804, 2}, + {0x080874, 35}, + {0x081000, 129}, + {0x081210, 4}, + {0x081228, 3}, + {0x081240, 2}, + {0x081264, 2}, + {0x081274, 3}, + {0x081284, 2}, + {0x081298, 2}, + {0x0812a4, 1}, + {0x0812b4, 21}, + {0x081310, 8}, + {0x081344, 1}, + {0x08134c, 1}, + {0x081354, 1}, + {0x081364, 2}, + {0x081370, 4}, + {0x081384, 2}, + {0x081390, 4}, + {0x081404, 2}, + {0x081478, 34}, + {0x081504, 2}, + {0x081518, 14}, + {0x081580, 5}, + {0x081598, 2}, + {0x0815a4, 10}, + {0x082000, 29}, + {0x082340, 14}, + {0x08237c, 3}, + {0x08238c, 1}, + {0x0823c0, 2}, + {0x0823e4, 3}, + {0x0823f4, 3}, + {0x082404, 1}, + {0x08240c, 4}, + {0x082804, 2}, + {0x082874, 35}, + {0x082904, 2}, + {0x082974, 35}, + {0x083000, 129}, + {0x083210, 6}, + {0x083244, 2}, + {0x083254, 7}, + {0x083284, 1}, + {0x08328c, 1}, + {0x083294, 1}, + {0x0832a4, 1}, + {0x0832b4, 19}, + {0x083304, 2}, + {0x083310, 4}, + {0x083324, 2}, + {0x083330, 14}, + {0x084000, 29}, + {0x084340, 14}, + {0x08437c, 3}, + {0x08438c, 1}, + {0x0843c0, 2}, + {0x0843e4, 3}, + {0x0843f4, 3}, + {0x084404, 1}, + {0x08440c, 4}, + {0x084804, 2}, + {0x084874, 35}, + {0x084904, 2}, + {0x084974, 35}, + {0x085000, 32}, + {0x085200, 1}, + {0x085210, 7}, + {0x085240, 12}, + {0x085280, 2}, + {0x0852a4, 1}, + {0x0852b4, 3}, + {0x085304, 1}, + {0x08530c, 1}, + {0x085314, 1}, + {0x085324, 2}, + {0x085334, 3}, + {0x085344, 2}, + {0x085358, 2}, + {0x085364, 2}, + {0x085378, 2}, + {0x085384, 2}, + {0x085398, 2}, + {0x0853c0, 23}, + {0x086000, 2}, + {0x086020, 2}, + {0x086040, 1}, + {0x086400, 11}, + {0x086800, 3}, + {0x086820, 6}, + {0x086840, 6}, + {0x086860, 6}, + {0x086880, 6}, + {0x0868a0, 6}, + {0x0868c0, 6}, + {0x0868e0, 6}, + {0x086900, 9}, + {0x086940, 17}, + {0x087000, 26}, + {0x087100, 1}, + {0x087108, 1}, + {0x087110, 1}, + {0x087118, 1}, + {0x087120, 1}, + {0x087128, 1}, + {0x087130, 1}, + {0x087138, 1}, + {0x087140, 1}, + {0x087148, 1}, + {0x087150, 1}, + {0x087158, 1}, + {0x087160, 1}, + {0x087168, 1}, + {0x087170, 1}, + {0x087178, 1}, + {0x087180, 10}, + {0x0871b0, 9}, + {0x087200, 1}, + {0x087208, 1}, + {0x087210, 3}, + {0x090000, 17}, + {0x090060, 2}, + {0x09006c, 1}, + {0x090104, 1}, + {0x090140, 25}, + {0x0901a8, 2}, + {0x0901c0, 9}, + {0x0901e8, 2}, + {0x090204, 1}, + {0x090220, 24}, + {0x090300, 6}, + {0x090320, 9}, + {0x090348, 1}, + {0x090350, 1}, + {0x090400, 6}, + {0x090420, 9}, + {0x090448, 1}, + {0x090450, 1}, + {0x090500, 6}, + {0x090520, 6}, + {0x090540, 2}, + {0x090564, 2}, + {0x090578, 3}, + {0x091004, 3}, + {0x091800, 8}, + {0x091824, 2}, + {0x091830, 10}, + {0x091860, 6}, + {0x092000, 32}, + {0x093000, 1}, + {0x093020, 1}, + {0x093040, 1}, + {0x093060, 1}, + {0x093080, 1}, + {0x0930a0, 1}, + {0x0930c0, 1}, + {0x0930e0, 1}, + {0x093100, 1}, + {0x0931a0, 1}, + {0x0931c0, 1}, + {0x093200, 3}, + {0x093404, 1}, + {0x093440, 16}, + {0x093504, 1}, + {0x09353c, 28}, + {0x0935b0, 2}, + {0x0935c0, 3}, + {0x094000, 9}, + {0x094040, 19}, + {0x0940c0, 2}, + {0x094800, 1}, + {0x094828, 1}, + {0x094850, 1}, + {0x094878, 1}, + {0x0948a0, 8}, + {0x0949ac, 1}, + {0x0949d8, 4}, + {0x094a00, 6}, + {0x094a20, 6}, + {0x094a40, 3}, + {0x096000, 1}, + {0x096010, 4}, + {0x096028, 3}, + {0x096104, 1}, + {0x09610c, 7}, + {0x096204, 1}, + {0x09620c, 4}, + {0x096488, 1}, + {0x096498, 3}, + {0x0964b0, 4}, + {0x096504, 1}, 
+ {0x09650c, 4}, + {0x096584, 4}, + {0x096600, 16}, + {0x096644, 2}, + {0x096658, 10}, + {0x096684, 2}, + {0x0966bc, 51}, + {0x096800, 11}, + {0x096884, 3}, + {0x0968a0, 12}, + {0x097100, 6}, + {0x097120, 1}, + {0x0a2004, 1}, + {0x0a200c, 4}, + {0x0a2400, 2}, + {0x0a240c, 6}, + {0x0a2440, 1}, + {0x0a2450, 4}, + {0x0a2468, 3}, + {0x0a2480, 1}, + {0x0a24a0, 10}, + {0x0a24d0, 9}, + {0x0a2804, 1}, + {0x0a280c, 4}, + {0x0a2c00, 2}, + {0x0a2c0c, 2}, + {0x0a2c40, 1}, + {0x0a2c50, 4}, + {0x0a2c68, 3}, + {0x0a2c88, 2}, + {0x0a2cf0, 1}, + {0x0a3004, 1}, + {0x0a300c, 4}, + {0x0a3040, 2}, + {0x0a3064, 3}, + {0x0a3074, 3}, + {0x0a3200, 9}, + {0x0a3230, 2}, + {0x0a323c, 2}, + {0x0a3248, 4}, + {0x0a3400, 1}, + {0x0a3408, 3}, + {0x0a3418, 4}, + {0x0a3430, 2}, + {0x0a343c, 1}, + {0x0a3480, 1}, + {0x0a3490, 1}, + {0x0a3504, 1}, + {0x0a3510, 76}, + {0x0a4000, 48}, + {0x0a4100, 3}, + {0x0a4110, 6}, + {0x0a412c, 4}, + {0x0a4140, 1}, + {0x0a4304, 1}, + {0x0a4318, 10}, + {0x0a4804, 1}, + {0x0a480c, 4}, + {0x0a4840, 2}, + {0x0a4864, 3}, + {0x0a4874, 3}, + {0x0a4c04, 2}, + {0x0a4c10, 2}, + {0x0a4c1c, 6}, + {0x0a4c38, 2}, + {0x0a4c50, 8}, + {0x0a4c78, 19}, + {0x0a4d04, 2}, + {0x0a4d40, 21}, + {0x0a4da4, 1}, + {0x0a4dac, 1}, + {0x0a4db4, 1}, + {0x0a4dc0, 1}, + {0x0a5000, 14}, + {0x0a6000, 1}, + {0x0a6028, 1}, + {0x0a6050, 1}, + {0x0a6078, 1}, + {0x0a60a0, 18}, + {0x0a61ac, 1}, + {0x0a61d8, 4}, + {0x0a6200, 6}, + {0x0a6220, 6}, + {0x0a6240, 3}, + {0x0a6400, 3}, + {0x0a6420, 6}, + {0x0a6440, 6}, + {0x0a6460, 6}, + {0x0a6480, 6}, + {0x0a64a0, 6}, + {0x0a64c0, 6}, + {0x0a6500, 9}, + {0x0a6540, 18}, + {0x0a65c0, 4}, + {0x0a8000, 10}, + {0x0a802c, 15}, + {0x0a806c, 5}, + {0x0a8700, 17}, + {0x0a8750, 4}, + {0x0a8800, 4}, + {0x0a8880, 1}, + {0x0a88a0, 1}, + {0x0a88c0, 1}, + {0x0a8900, 1}, + {0x0a8960, 1}, + {0x0a8980, 4}, + {0x0a89a4, 3}, + {0x0a89c0, 1}, + {0x0a8a00, 4}, + {0x0a8ad4, 10}, + {0x0a8b00, 2}, + {0x0a8b80, 4}, + {0x0a9804, 1}, + {0x0a980c, 4}, + {0x0a9f00, 6}, + {0x0a9f20, 6}, + {0x0a9f40, 6}, + {0x0a9f60, 6}, + {0x0a9f80, 9}, + {0x0a9fc0, 16}, + {0x0aa020, 1}, + {0x0aa028, 2}, + {0x0b0000, 2}, + {0x0b0010, 4}, + {0x0b1000, 15}, + {0x0b1040, 10}, + {0x0b1080, 20}, + {0x0b1100, 2}, + {0x0b1110, 2}, + {0x0b1120, 2}, + {0x0b1160, 2}, + {0x0b116c, 1}, + {0x0b1180, 3}, + {0x0b1190, 3}, + {0x0b11a0, 3}, + {0x0b11e0, 2}, + {0x0b11ec, 1}, + {0x0b1200, 2}, + {0x0b1210, 2}, + {0x0b1260, 2}, + {0x0b126c, 1}, + {0x0b1280, 3}, + {0x0b1290, 3}, + {0x0b12e0, 2}, + {0x0b12ec, 1}, + {0x0b1300, 2}, + {0x0b1310, 2}, + {0x0b1320, 2}, + {0x0b1360, 2}, + {0x0b136c, 1}, + {0x0b1380, 3}, + {0x0b1390, 3}, + {0x0b13e0, 2}, + {0x0b13ec, 1}, + {0x0b1400, 3}, + {0x0b1410, 3}, + {0x0b1460, 2}, + {0x0b146c, 1}, + {0x0b1480, 3}, + {0x0b1490, 7}, + {0x0b14b0, 4}, + {0x0b14e0, 2}, + {0x0b14ec, 1}, + {0x0b1500, 3}, + {0x0b1510, 3}, + {0x0b1560, 2}, + {0x0b156c, 1}, + {0x0b1580, 2}, + {0x0b1590, 2}, + {0x0b15e0, 2}, + {0x0b15ec, 1}, + {0x0b1600, 3}, + {0x0b1610, 3}, + {0x0b1660, 2}, + {0x0b166c, 1}, + {0x0b1680, 3}, + {0x0b1690, 3}, + {0x0b16e0, 2}, + {0x0b16ec, 1}, + {0x0b1700, 8}, + {0x0b1760, 2}, + {0x0b176c, 1}, + {0x0b1780, 3}, + {0x0b1790, 3}, + {0x0b17e0, 2}, + {0x0b17ec, 1}, + {0x0b1800, 9}, + {0x0b1840, 16}, + {0x0b2000, 9}, + {0x0b2040, 48}, + {0x0b2104, 7}, + {0x0b2200, 9}, + {0x0b2240, 48}, + {0x0b2304, 7}, + {0x0b2400, 9}, + {0x0b2440, 48}, + {0x0b2504, 7}, + {0x0b2600, 9}, + {0x0b2640, 48}, + {0x0b2704, 7}, + {0x0b2800, 9}, + {0x0b2840, 48}, + {0x0b2904, 7}, + {0x0b2a00, 9}, + {0x0b2a40, 48}, + {0x0b2b04, 7}, + {0x0b2c00, 9}, + 
{0x0b2c40, 48}, + {0x0b2d04, 7}, + {0x0b2e00, 9}, + {0x0b2e40, 48}, + {0x0b2f04, 7}, + {0x0b3000, 65}, + {0x0b3140, 35}, + {0x0b31d0, 1}, + {0x0b4000, 132}, + {0x0b4240, 13}, + {0x0b4278, 26}, + {0x0b4300, 9}, + {0x0b4380, 32}, + {0x0b4404, 1}, + {0x0b440c, 4}, + {0x0b4444, 1}, + {0x0b444c, 4}, + {0x0b4484, 1}, + {0x0b448c, 4}, + {0x0b44c4, 3}, + {0x0b4800, 73}, + {0x0b4940, 28}, + {0x0b49c4, 5}, + {0x0b4a00, 1}, + {0x0b4a20, 16}, + {0x0b5000, 7}, + {0x0b5040, 12}, + {0x0b5100, 9}, + {0x0b5140, 16}, + {0x0b6000, 3}, + {0x0b6028, 3}, + {0x0b6050, 3}, + {0x0b6078, 3}, + {0x0b60a0, 7}, + {0x0b61ac, 1}, + {0x0b61d8, 4}, + {0x0b6200, 6}, + {0x0b6220, 6}, + {0x0b6240, 6}, + {0x0b6260, 6}, + {0x0b6280, 6}, + {0x0b62a0, 6}, + {0x0b62c0, 3}, + {0x0b7800, 3}, + {0x0b7820, 6}, + {0x0b7840, 6}, + {0x0b7860, 6}, + {0x0b7880, 6}, + {0x0b78a0, 6}, + {0x0b78c0, 6}, + {0x0b78e0, 6}, + {0x0b7900, 6}, + {0x0b7a00, 1}, + {0x0d0000, 5}, + {0x0d0038, 4}, + {0x0d0050, 4}, + {0x0d0080, 3}, + {0x0d00a0, 6}, + {0x0d00c0, 6}, + {0x0d0100, 3}, + {0x0d0120, 6}, + {0x0d0140, 6}, + {0x0d0180, 14}, + {0x0d01c0, 6}, + {0x0d1000, 2}, + {0x0d100c, 1}, + {0x0d1020, 13}, + {0x0d1058, 1}, + {0x0d1060, 6}, + {0x0d1080, 1}, + {0x0d1100, 2}, + {0x0d110c, 1}, + {0x0d1120, 13}, + {0x0d1158, 1}, + {0x0d1160, 6}, + {0x0d1180, 1}, + {0x0d1200, 2}, + {0x0d120c, 1}, + {0x0d1220, 13}, + {0x0d1258, 1}, + {0x0d1260, 6}, + {0x0d1280, 1}, + {0x0d1300, 2}, + {0x0d130c, 1}, + {0x0d1320, 13}, + {0x0d1358, 1}, + {0x0d1360, 6}, + {0x0d1380, 1}, + {0x0d1400, 3}, + {0x0d1410, 10}, + {0x0d1440, 1}, + {0x0d1450, 6}, + {0x0d1484, 6}, + {0x0d14a0, 6}, + {0x0d1504, 25}, + {0x0d1580, 14}, + {0x0d15c0, 4}, + {0x0d1600, 6}, + {0x0d1620, 6}, + {0x0d1640, 3}, + {0x0d1660, 6}, + {0x0d1700, 2}, + {0x0d170c, 3}, + {0x0d1720, 6}, + {0x0d1800, 11}, + {0x0d1830, 7}, + {0x0d1850, 7}, + {0x0d1880, 5}, + {0x0d18a0, 3}, + {0x0d18c0, 7}, + {0x0d1a00, 1}, + {0x0d1a08, 13}, + {0x0d1a40, 11}, + {0x0d1c00, 24}, + {0x0d1c64, 5}, + {0x0d1c80, 3}, + {0x0d1c90, 2}, + {0x0d1d00, 15}, + {0x0d1d40, 16}, + {0x0d1d90, 4}, + {0x0d1dc0, 12}, + {0x0d1e00, 1}, + {0x0d1e20, 1}, + {0x0d1e28, 12}, + {0x0d1e5c, 1}, + {0x0d1e80, 6}, + {0x0d1f00, 14}, + {0x0d2000, 2}, + {0x0d200c, 1}, + {0x0d2020, 13}, + {0x0d2058, 1}, + {0x0d2060, 6}, + {0x0d2080, 1}, + {0x0d2100, 2}, + {0x0d210c, 1}, + {0x0d2120, 13}, + {0x0d2158, 1}, + {0x0d2160, 6}, + {0x0d2180, 1}, + {0x0d2200, 2}, + {0x0d220c, 1}, + {0x0d2220, 13}, + {0x0d2258, 1}, + {0x0d2260, 6}, + {0x0d2280, 1}, + {0x0d2300, 2}, + {0x0d230c, 1}, + {0x0d2320, 13}, + {0x0d2358, 1}, + {0x0d2360, 6}, + {0x0d2380, 1}, + {0x0d2400, 3}, + {0x0d2410, 10}, + {0x0d2440, 1}, + {0x0d2450, 6}, + {0x0d2484, 6}, + {0x0d24a0, 6}, + {0x0d2504, 25}, + {0x0d2580, 14}, + {0x0d25c0, 4}, + {0x0d2600, 6}, + {0x0d2620, 6}, + {0x0d2640, 3}, + {0x0d2660, 6}, + {0x0d2700, 2}, + {0x0d270c, 3}, + {0x0d2720, 6}, + {0x0d2800, 11}, + {0x0d2830, 7}, + {0x0d2850, 7}, + {0x0d2880, 5}, + {0x0d28a0, 3}, + {0x0d28c0, 7}, + {0x0d2a00, 1}, + {0x0d2a08, 13}, + {0x0d2a40, 11}, + {0x0d2c00, 24}, + {0x0d2c64, 5}, + {0x0d2c80, 3}, + {0x0d2c90, 2}, + {0x0d2d00, 15}, + {0x0d2d40, 16}, + {0x0d2d90, 4}, + {0x0d2dc0, 12}, + {0x0d3000, 2}, + {0x0d3400, 1}, + {0x0d3428, 1}, + {0x0d3450, 1}, + {0x0d3478, 1}, + {0x0d34a0, 5}, + {0x0d35ac, 1}, + {0x0d35d8, 4}, + {0x0d3600, 6}, + {0x0d3620, 6}, + {0x0d3640, 3}, + {0x0d3e00, 3}, + {0x0d3e20, 6}, + {0x0d3e40, 6}, + {0x0d3e80, 3}, + {0x0d3e90, 2}, + {0x0d4000, 29}, + {0x0d4078, 4}, + {0x0d4090, 2}, + {0x0d40a0, 7}, + {0x0d40c0, 11}, + {0x0d4100, 14}, + 
{0x0d4140, 14}, + {0x0d4180, 61}, + {0x0d4278, 4}, + {0x0d4290, 2}, + {0x0d42a0, 7}, + {0x0d42c0, 11}, + {0x0d4300, 14}, + {0x0d4340, 14}, + {0x0d4380, 61}, + {0x0d4478, 4}, + {0x0d4490, 2}, + {0x0d44a0, 7}, + {0x0d44c0, 11}, + {0x0d4500, 14}, + {0x0d4540, 14}, + {0x0d4580, 61}, + {0x0d4678, 4}, + {0x0d4690, 2}, + {0x0d46a0, 7}, + {0x0d46c0, 11}, + {0x0d4700, 14}, + {0x0d4740, 14}, + {0x0d4780, 69}, + {0x0d4c00, 6}, + {0x0d4c40, 14}, + {0x0d4c80, 9}, + {0x0d4d00, 9}, + {0x0d4d2c, 1}, + {0x0d4d40, 3}, + {0x0d4d60, 1}, + {0x0d4d80, 3}, + {0x0d4e00, 2}, + {0x0d4e0c, 1}, + {0x0d4e14, 5}, + {0x0d4e2c, 1}, + {0x0d4e34, 5}, + {0x0d4e4c, 1}, + {0x0d4e54, 5}, + {0x0d4e6c, 1}, + {0x0d4e74, 5}, + {0x0d4e8c, 1}, + {0x0d4e94, 5}, + {0x0d4eac, 1}, + {0x0d4eb4, 3}, + {0x0d6000, 15}, + {0x0d6070, 3}, + {0x0d6080, 6}, + {0x0d6100, 9}, + {0x0d6204, 8}, + {0x0d6240, 13}, + {0x0d6280, 16}, + {0x0d6400, 8}, + {0x0d6424, 15}, + {0x0d6464, 15}, + {0x0d64a4, 15}, + {0x0d64e4, 30}, + {0x0d6580, 10}, + {0x0d65ac, 1}, + {0x0d65b4, 5}, + {0x0d65cc, 1}, + {0x0d65d4, 5}, + {0x0d65ec, 1}, + {0x0d65f4, 13}, + {0x0d6680, 4}, + {0x0d6694, 2}, + {0x0d66a0, 5}, + {0x0d66c0, 5}, + {0x0d66e0, 4}, + {0x0d6800, 19}, + {0x0d6850, 10}, + {0x0d6880, 19}, + {0x0d68d0, 10}, + {0x0d6900, 19}, + {0x0d6950, 10}, + {0x0d6980, 19}, + {0x0d69d0, 10}, + {0x0d6a00, 19}, + {0x0d6a50, 10}, + {0x0d6a80, 19}, + {0x0d6ad0, 10}, + {0x0d6c00, 19}, + {0x0d6c60, 6}, + {0x0d6c84, 1}, + {0x0d6c94, 8}, + {0x0d6cb8, 9}, + {0x0d6ce0, 4}, + {0x0d7000, 9}, + {0x0d7040, 16}, + {0x0d8000, 6}, + {0x0d8020, 3}, + {0x0d8030, 3}, + {0x0d8040, 6}, + {0x0d8060, 17}, + {0x0d80c0, 38}, + {0x0d8180, 2}, + {0x0d8400, 2}, + {0x0d8428, 2}, + {0x0d8450, 2}, + {0x0d8478, 2}, + {0x0d84a0, 16}, + {0x0d85ac, 1}, + {0x0d85d8, 4}, + {0x0d8600, 6}, + {0x0d8620, 6}, + {0x0d8640, 6}, + {0x0d8660, 6}, + {0x0d8680, 3}, + {0x0d8800, 2}, + {0x0d9000, 36}, + {0x0d9100, 26}, + {0x0d916c, 7}, + {0x0d91a0, 1}, + {0x0d91c0, 9}, + {0x0d91e8, 1}, + {0x0d9200, 6}, + {0x0d9220, 6}, + {0x0d9248, 4}, + {0x0d9280, 6}, + {0x0d929c, 1}, + {0x0d92a4, 2}, + {0x0d92b8, 9}, + {0x0d9304, 4}, + {0x0d9328, 3}, + {0x0d9340, 6}, + {0x0d9400, 1}, + {0x0d9408, 1}, + {0x0d9410, 2}, + {0x0d9424, 2}, + {0x0d9444, 1}, + {0x0d9480, 27}, + {0x0d9500, 6}, + {0x0d9520, 12}, + {0x0d9700, 7}, + {0x0d9744, 9}, + {0x0d976c, 2}, + {0x0d9780, 6}, + {0x0d97a0, 2}, + {0x0d9800, 36}, + {0x0d9900, 26}, + {0x0d996c, 7}, + {0x0d99a0, 1}, + {0x0d99c0, 9}, + {0x0d99e8, 1}, + {0x0d9a00, 6}, + {0x0d9a20, 6}, + {0x0d9a48, 4}, + {0x0d9a80, 6}, + {0x0d9a9c, 1}, + {0x0d9aa4, 2}, + {0x0d9ab8, 9}, + {0x0d9b04, 4}, + {0x0d9b28, 3}, + {0x0d9b40, 6}, + {0x0d9c00, 1}, + {0x0d9c08, 1}, + {0x0d9c10, 2}, + {0x0d9c24, 2}, + {0x0d9c44, 1}, + {0x0d9c80, 27}, + {0x0d9d00, 6}, + {0x0d9d20, 12}, + {0x0d9f00, 7}, + {0x0d9f44, 9}, + {0x0d9f6c, 2}, + {0x0d9f80, 6}, + {0x0d9fa0, 2}, + {0x0db000, 1028}, + {0x0dc018, 18}, + {0x0dc100, 4}, + {0x0dc118, 18}, + {0x0dc200, 12}, + {0x0dc300, 6}, + {0x0dc320, 5}, + {0x0dc340, 6}, + {0x0dc360, 5}, + {0x0dc380, 6}, + {0x0dc400, 9}, + {0x0dc440, 26}, + {0x0dc4c4, 1}, + {0x0dc4cc, 1}, + {0x0dc4d4, 1}, + {0x0dc50c, 7}, + {0x0dc544, 2}, + {0x0dc55c, 9}, + {0x0dc584, 7}, + {0x0dc5a4, 2}, + {0x0dc5b8, 2}, + {0x0dc5c4, 2}, + {0x0dc5d8, 2}, + {0x0dc600, 2}, + {0x0dcfbc, 15}, + {0x0dd000, 7}, + {0x0dd020, 6}, + {0x0dd040, 8}, + {0x0dd104, 1}, + {0x0dd10c, 1}, + {0x0dd200, 8}, + {0x0dd244, 2}, + {0x0dd268, 18}, + {0x0dd404, 1}, + {0x0dd440, 40}, + {0x0dd504, 3}, + {0x0dd514, 3}, + {0x0dd524, 3}, + {0x0dd534, 3}, + {0x0dd544, 
3}, + {0x0dd554, 3}, + {0x0dd564, 3}, + {0x0dd574, 3}, + {0x0dd584, 3}, + {0x0dd594, 3}, + {0x0dd5a4, 3}, + {0x0dd5b4, 3}, + {0x0dd604, 2}, + {0x0dd640, 16}, + {0x0dd684, 3}, + {0x0dd704, 2}, + {0x0dd740, 18}, + {0x0ddc00, 4}, + {0x0ddc80, 1}, + {0x0ddd00, 6}, + {0x0ddd20, 6}, + {0x0ddd40, 6}, + {0x0ddd80, 1}, + {0x0dde00, 3}, + {0x0dde20, 10}, + {0x0dde50, 6}, + {0x0dde80, 9}, + {0x0ddec0, 16}, + {0x0de000, 135}, + {0x0de300, 2}, + {0x0de30c, 3}, + {0x0de320, 2}, + {0x0de32c, 3}, + {0x0de340, 2}, + {0x0de34c, 3}, + {0x0de360, 2}, + {0x0de36c, 3}, + {0x0de380, 2}, + {0x0de38c, 3}, + {0x0de3a0, 2}, + {0x0de3ac, 3}, + {0x0de3c0, 2}, + {0x0de3cc, 3}, + {0x0de3e0, 2}, + {0x0de3ec, 3}, + {0x0de400, 2}, + {0x0de40c, 3}, + {0x0de420, 2}, + {0x0de42c, 3}, + {0x0de440, 2}, + {0x0de44c, 3}, + {0x0de460, 2}, + {0x0de46c, 3}, + {0x0de480, 2}, + {0x0de48c, 3}, + {0x0de4a0, 2}, + {0x0de4ac, 3}, + {0x0de4c0, 2}, + {0x0de4cc, 3}, + {0x0de4e0, 2}, + {0x0de4ec, 3}, + {0x0de500, 135}, + {0x0de800, 2}, + {0x0de80c, 3}, + {0x0de820, 2}, + {0x0de82c, 3}, + {0x0de840, 2}, + {0x0de84c, 3}, + {0x0de860, 2}, + {0x0de86c, 3}, + {0x0de880, 2}, + {0x0de88c, 3}, + {0x0de8a0, 2}, + {0x0de8ac, 3}, + {0x0de8c0, 2}, + {0x0de8cc, 3}, + {0x0de8e0, 2}, + {0x0de8ec, 3}, + {0x0de900, 2}, + {0x0de90c, 3}, + {0x0de920, 2}, + {0x0de92c, 3}, + {0x0de940, 2}, + {0x0de94c, 3}, + {0x0de960, 2}, + {0x0de96c, 3}, + {0x0de980, 2}, + {0x0de98c, 3}, + {0x0de9a0, 2}, + {0x0de9ac, 3}, + {0x0de9c0, 2}, + {0x0de9cc, 3}, + {0x0de9e0, 2}, + {0x0de9ec, 3}, + {0x0dea00, 135}, + {0x0ded00, 2}, + {0x0ded0c, 3}, + {0x0ded20, 2}, + {0x0ded2c, 3}, + {0x0ded40, 2}, + {0x0ded4c, 3}, + {0x0ded60, 2}, + {0x0ded6c, 3}, + {0x0ded80, 2}, + {0x0ded8c, 3}, + {0x0deda0, 2}, + {0x0dedac, 3}, + {0x0dedc0, 2}, + {0x0dedcc, 3}, + {0x0dede0, 2}, + {0x0dedec, 3}, + {0x0dee00, 2}, + {0x0dee0c, 3}, + {0x0dee20, 2}, + {0x0dee2c, 3}, + {0x0dee40, 2}, + {0x0dee4c, 3}, + {0x0dee60, 2}, + {0x0dee6c, 3}, + {0x0dee80, 2}, + {0x0dee8c, 3}, + {0x0deea0, 2}, + {0x0deeac, 3}, + {0x0deec0, 2}, + {0x0deecc, 3}, + {0x0deee0, 2}, + {0x0deeec, 3}, + {0x0def00, 135}, + {0x0df200, 2}, + {0x0df20c, 3}, + {0x0df220, 2}, + {0x0df22c, 3}, + {0x0df240, 2}, + {0x0df24c, 3}, + {0x0df260, 2}, + {0x0df26c, 3}, + {0x0df280, 2}, + {0x0df28c, 3}, + {0x0df2a0, 2}, + {0x0df2ac, 3}, + {0x0df2c0, 2}, + {0x0df2cc, 3}, + {0x0df2e0, 2}, + {0x0df2ec, 3}, + {0x0df300, 2}, + {0x0df30c, 3}, + {0x0df320, 2}, + {0x0df32c, 3}, + {0x0df340, 2}, + {0x0df34c, 3}, + {0x0df360, 2}, + {0x0df36c, 3}, + {0x0df380, 2}, + {0x0df38c, 3}, + {0x0df3a0, 2}, + {0x0df3ac, 3}, + {0x0df3c0, 2}, + {0x0df3cc, 3}, + {0x0df3e0, 2}, + {0x0df3ec, 3}, + {0x0df400, 135}, + {0x0df700, 2}, + {0x0df70c, 3}, + {0x0df720, 2}, + {0x0df72c, 3}, + {0x0df740, 2}, + {0x0df74c, 3}, + {0x0df760, 2}, + {0x0df76c, 3}, + {0x0df780, 2}, + {0x0df78c, 3}, + {0x0df7a0, 2}, + {0x0df7ac, 3}, + {0x0df7c0, 2}, + {0x0df7cc, 3}, + {0x0df7e0, 2}, + {0x0df7ec, 3}, + {0x0df800, 2}, + {0x0df80c, 3}, + {0x0df820, 2}, + {0x0df82c, 3}, + {0x0df840, 2}, + {0x0df84c, 3}, + {0x0df860, 2}, + {0x0df86c, 3}, + {0x0df880, 2}, + {0x0df88c, 3}, + {0x0df8a0, 2}, + {0x0df8ac, 3}, + {0x0df8c0, 2}, + {0x0df8cc, 3}, + {0x0df8e0, 2}, + {0x0df8ec, 3}, + {0x0df900, 2}, + {0x0e0000, 3}, + {0x0e0010, 4}, + {0x0e0028, 3}, + {0x0e0048, 2}, + {0x0e0058, 2}, + {0x0e0064, 32}, + {0x0e00f0, 1}, + {0x0e00fc, 35}, + {0x0e019c, 15}, + {0x0e01e0, 1}, + {0x0e01e8, 5}, + {0x0e0204, 5}, + {0x0e021c, 1}, + {0x0e0300, 16}, + {0x0e0400, 3}, + {0x0e0410, 4}, + {0x0e0428, 3}, + {0x0e0448, 2}, + {0x0e0458, 
2}, + {0x0e0464, 32}, + {0x0e04f0, 1}, + {0x0e04fc, 35}, + {0x0e059c, 15}, + {0x0e05e0, 1}, + {0x0e05e8, 5}, + {0x0e0604, 5}, + {0x0e061c, 1}, + {0x0e0700, 16}, + {0x0e0800, 3}, + {0x0e0810, 4}, + {0x0e0828, 3}, + {0x0e0848, 2}, + {0x0e0858, 2}, + {0x0e0864, 32}, + {0x0e08f0, 1}, + {0x0e08fc, 35}, + {0x0e099c, 15}, + {0x0e09e0, 1}, + {0x0e09e8, 5}, + {0x0e0a04, 5}, + {0x0e0a1c, 1}, + {0x0e0b00, 16}, + {0x0e0c00, 3}, + {0x0e0c10, 4}, + {0x0e0c28, 3}, + {0x0e0c48, 2}, + {0x0e0c58, 2}, + {0x0e0c64, 32}, + {0x0e0cf0, 1}, + {0x0e0cfc, 35}, + {0x0e0d9c, 15}, + {0x0e0de0, 1}, + {0x0e0de8, 5}, + {0x0e0e04, 5}, + {0x0e0e1c, 1}, + {0x0e0f00, 16}, + {0x0e1000, 3}, + {0x0e1010, 4}, + {0x0e1028, 3}, + {0x0e1048, 2}, + {0x0e1058, 2}, + {0x0e1064, 32}, + {0x0e10f0, 1}, + {0x0e10fc, 35}, + {0x0e119c, 15}, + {0x0e11e0, 1}, + {0x0e11e8, 5}, + {0x0e1204, 5}, + {0x0e121c, 1}, + {0x0e1300, 16}, + {0x0e1400, 3}, + {0x0e1410, 4}, + {0x0e1428, 3}, + {0x0e1448, 2}, + {0x0e1458, 2}, + {0x0e1464, 32}, + {0x0e14f0, 1}, + {0x0e14fc, 35}, + {0x0e159c, 15}, + {0x0e15e0, 1}, + {0x0e15e8, 5}, + {0x0e1604, 5}, + {0x0e161c, 1}, + {0x0e1700, 16}, + {0x0e1800, 3}, + {0x0e1810, 4}, + {0x0e1828, 3}, + {0x0e1848, 2}, + {0x0e1858, 2}, + {0x0e1864, 32}, + {0x0e18f0, 1}, + {0x0e18fc, 35}, + {0x0e199c, 15}, + {0x0e19e0, 1}, + {0x0e19e8, 5}, + {0x0e1a04, 5}, + {0x0e1a1c, 1}, + {0x0e1b00, 16}, + {0x0e1c00, 3}, + {0x0e1c10, 4}, + {0x0e1c28, 3}, + {0x0e1c48, 2}, + {0x0e1c58, 2}, + {0x0e1c64, 32}, + {0x0e1cf0, 1}, + {0x0e1cfc, 35}, + {0x0e1d9c, 15}, + {0x0e1de0, 1}, + {0x0e1de8, 5}, + {0x0e1e04, 5}, + {0x0e1e1c, 1}, + {0x0e1f00, 16}, + {0x0e20c0, 9}, + {0x0e20ec, 5}, + {0x0e2108, 3}, + {0x0e2200, 5}, + {0x0e2218, 36}, + {0x0e2300, 6}, + {0x0e2330, 4}, + {0x0e2500, 3}, + {0x0e2510, 12}, + {0x0e26e0, 6}, + {0x0e2700, 6}, + {0x0e2720, 6}, + {0x0e2740, 3}, + {0x0e2780, 6}, + {0x0e27a0, 6}, + {0x0e27c0, 3}, + {0x0e2800, 67}, + {0x0e2a00, 6}, + {0x0e2a20, 6}, + {0x0e2a40, 3}, + {0x0e2a50, 3}, + {0x0e2a60, 1}, + {0x0e2a80, 17}, + {0x0e3020, 10}, + {0x0e3070, 2}, + {0x0e3080, 2}, + {0x0e308c, 1}, + {0x0e3440, 21}, + {0x0e34e4, 13}, + {0x0e3520, 6}, + {0x0e3540, 6}, + {0x0e3560, 6}, + {0x0e3580, 6}, + {0x0e35a0, 6}, + {0x0e35c0, 6}, + {0x0e35e0, 6}, + {0x0e3600, 16}, + {0x0e3804, 3}, + {0x0e3900, 33}, + {0x0e3a00, 6}, + {0x0e3a20, 2}, + {0x0e3a30, 1}, + {0x0e3a40, 8}, + {0x0e3a64, 5}, + {0x0e3c00, 1}, + {0x0e3c28, 1}, + {0x0e3c50, 1}, + {0x0e3c78, 1}, + {0x0e3ca0, 2}, + {0x0e3dac, 1}, + {0x0e3dd8, 4}, + {0x0e3e00, 6}, + {0x0e3e20, 6}, + {0x0e3e40, 3}, + {0x0e4010, 12}, + {0x0e4044, 3}, + {0x0e4084, 2}, + {0x0e40bc, 84}, + {0x0e4240, 18}, + {0x0e45f0, 4}, + {0x0e4604, 1}, + {0x0e4640, 16}, + {0x0e46f0, 4}, + {0x0e4704, 1}, + {0x0e4740, 16}, + {0x0e5000, 8}, + {0x0e6000, 9}, + {0x0e6040, 16}, + {0x0e8000, 9}, + {0x0e8080, 6}, + {0x0e80a0, 3}, + {0x0f0000, 3}, + {0x0f0014, 11}, + {0x0f004c, 3}, + {0x0f0060, 8}, + {0x0f00f0, 3}, + {0x0f0100, 1}, + {0x0f010c, 2}, + {0x0f0118, 1}, + {0x0f0130, 4}, + {0x0f01a8, 1}, + {0x0f01c0, 2}, + {0x0f01d0, 10}, + {0x0f0200, 62}, + {0x0f0404, 9}, + {0x0f0440, 8}, + {0x0f0480, 5}, + {0x0f04b8, 21}, + {0x0f0520, 1}, + {0x0f0528, 1}, + {0x0f0540, 2}, + {0x0f0580, 4}, + {0x0f05a0, 1}, + {0x0f05c0, 8}, + {0x0f0800, 17}, + {0x0f0850, 9}, + {0x0f0880, 9}, + {0x0f0920, 4}, + {0x0f093c, 5}, + {0x0f095c, 5}, + {0x0f097c, 5}, + {0x0f099c, 1}, + {0x0f0a90, 3}, + {0x0f0c00, 128}, + {0x0f0e04, 1}, + {0x0f0e14, 9}, + {0x0f0e3c, 1}, + {0x0f1000, 16}, + {0x0f1080, 10}, + {0x0f10c0, 1}, + {0x0f10e0, 2}, + {0x0f10ec, 1}, + {0x0f10f4, 
3}, + {0x0f1400, 6}, + {0x0f1420, 6}, + {0x0f1440, 6}, + {0x0f1460, 6}, + {0x0f1480, 6}, + {0x0f14a0, 6}, + {0x0f14c0, 6}, + {0x0f14e0, 6}, + {0x0f1500, 6}, + {0x0f1520, 6}, + {0x0f1540, 6}, + {0x0f1560, 6}, + {0x0f1580, 6}, + {0x0f15a0, 3}, + {0x0f1800, 3}, + {0x0f1840, 4}, + {0x0f1854, 3}, + {0x0f1864, 3}, + {0x0f1874, 3}, + {0x0f2000, 2}, + {0x0f200c, 3}, + {0x0f2020, 10}, + {0x0f2060, 6}, + {0x0f2080, 2}, + {0x0f208c, 3}, + {0x0f20a0, 10}, + {0x0f20e0, 6}, + {0x0f2100, 2}, + {0x0f210c, 3}, + {0x0f2120, 10}, + {0x0f2160, 6}, + {0x0f2180, 2}, + {0x0f218c, 3}, + {0x0f21a0, 10}, + {0x0f21e0, 6}, + {0x0f2200, 2}, + {0x0f220c, 3}, + {0x0f2220, 10}, + {0x0f2260, 6}, + {0x0f2280, 2}, + {0x0f228c, 3}, + {0x0f22a0, 10}, + {0x0f22e0, 6}, + {0x0f2300, 2}, + {0x0f230c, 3}, + {0x0f2320, 10}, + {0x0f2360, 6}, + {0x0f2380, 2}, + {0x0f238c, 3}, + {0x0f23a0, 10}, + {0x0f23e0, 6}, + {0x0f2400, 2}, + {0x0f240c, 3}, + {0x0f2420, 10}, + {0x0f2460, 6}, + {0x0f2480, 2}, + {0x0f248c, 3}, + {0x0f24a0, 10}, + {0x0f24e0, 6}, + {0x0f2500, 2}, + {0x0f250c, 3}, + {0x0f2520, 10}, + {0x0f2560, 6}, + {0x0f2580, 2}, + {0x0f258c, 3}, + {0x0f25a0, 10}, + {0x0f25e0, 6}, + {0x0f2600, 2}, + {0x0f260c, 3}, + {0x0f2620, 10}, + {0x0f2660, 6}, + {0x0f2680, 2}, + {0x0f268c, 3}, + {0x0f26a0, 10}, + {0x0f26e0, 6}, + {0x0f2700, 2}, + {0x0f270c, 3}, + {0x0f2720, 10}, + {0x0f2760, 6}, + {0x0f2780, 2}, + {0x0f278c, 3}, + {0x0f27a0, 10}, + {0x0f27e0, 6}, + {0x0f2800, 2}, + {0x0f280c, 3}, + {0x0f2820, 10}, + {0x0f2860, 6}, + {0x0f2880, 2}, + {0x0f288c, 3}, + {0x0f28a0, 10}, + {0x0f28e0, 6}, + {0x0f2900, 2}, + {0x0f290c, 3}, + {0x0f2920, 10}, + {0x0f2960, 6}, + {0x0f2980, 2}, + {0x0f298c, 3}, + {0x0f29a0, 10}, + {0x0f29e0, 6}, + {0x0f4000, 7}, + {0x0f4020, 4}, + {0x0f4204, 1}, + {0x0f4280, 35}, + {0x0f4310, 4}, + {0x0f4404, 1}, + {0x0f4480, 34}, + {0x0f4510, 10}, + {0x0f453c, 3}, + {0x0f4800, 7}, + {0x0f4820, 4}, + {0x0f4a04, 1}, + {0x0f4a80, 35}, + {0x0f4b10, 4}, + {0x0f4c04, 1}, + {0x0f4c80, 34}, + {0x0f4d10, 10}, + {0x0f4d3c, 3}, + {0x0f5000, 7}, + {0x0f5020, 4}, + {0x0f5204, 1}, + {0x0f5280, 35}, + {0x0f5310, 4}, + {0x0f5404, 1}, + {0x0f5480, 34}, + {0x0f5510, 10}, + {0x0f553c, 3}, + {0x0f5800, 7}, + {0x0f5820, 4}, + {0x0f5a04, 1}, + {0x0f5a80, 35}, + {0x0f5b10, 4}, + {0x0f5c04, 1}, + {0x0f5c80, 34}, + {0x0f5d10, 10}, + {0x0f5d3c, 3}, + {0x100000, 1}, + {0x100008, 1}, + {0x100010, 2}, + {0x100020, 1}, + {0x100028, 1}, + {0x100030, 2}, + {0x100040, 1}, + {0x100048, 1}, + {0x100050, 2}, + {0x100060, 1}, + {0x100068, 1}, + {0x100070, 2}, + {0x100080, 21}, + {0x1000d8, 2}, + {0x100100, 21}, + {0x100158, 2}, + {0x100180, 21}, + {0x1001d8, 2}, + {0x100200, 21}, + {0x100258, 2}, + {0x100284, 1}, + {0x1003b0, 5}, + {0x100400, 13}, + {0x100440, 13}, + {0x100480, 13}, + {0x1004c0, 13}, + {0x100500, 68}, + {0x100618, 1}, + {0x100804, 1}, + {0x10080c, 4}, + {0x100820, 9}, + {0x1008a0, 24}, + {0x100920, 24}, + {0x100a00, 48}, + {0x100adc, 68}, + {0x100d00, 1}, + {0x100d08, 2}, + {0x100d80, 4}, + {0x100e00, 4}, + {0x100e20, 1}, + {0x100e28, 7}, + {0x100e48, 7}, + {0x100e68, 7}, + {0x100e88, 6}, + {0x100ebc, 9}, + {0x100f00, 6}, + {0x100f1c, 10}, + {0x100f70, 8}, + {0x100f94, 4}, + {0x101404, 1}, + {0x10141c, 26}, + {0x101504, 1}, + {0x10151c, 30}, + {0x101600, 1}, + {0x101628, 6}, + {0x101648, 6}, + {0x101680, 16}, + {0x1016e0, 16}, + {0x101780, 2}, + {0x101790, 16}, + {0x101a00, 14}, + {0x101a3c, 4}, + {0x101a50, 2}, + {0x101a60, 2}, + {0x101a70, 2}, + {0x101a80, 2}, + {0x101a90, 1}, + {0x101a9c, 11}, + {0x101b0c, 5}, + {0x101c00, 34}, + 
{0x101d00, 3}, + {0x102000, 1}, + {0x102028, 1}, + {0x102050, 1}, + {0x102078, 1}, + {0x1020a0, 5}, + {0x1021ac, 1}, + {0x1021d8, 4}, + {0x102200, 6}, + {0x102220, 6}, + {0x102240, 3}, + {0x102560, 1}, + {0x102584, 10}, + {0x1025b0, 1}, + {0x1025fc, 1}, + {0x102604, 1}, + {0x1026ec, 69}, + {0x103000, 32}, + {0x103084, 5}, + {0x1030f8, 3}, + {0x103108, 3}, + {0x103118, 7}, + {0x103144, 1}, + {0x103160, 10}, + {0x103200, 32}, + {0x103284, 5}, + {0x1032f8, 3}, + {0x103308, 3}, + {0x103318, 7}, + {0x103344, 1}, + {0x103360, 10}, + {0x103400, 32}, + {0x103484, 5}, + {0x1034f8, 3}, + {0x103508, 3}, + {0x103518, 7}, + {0x103544, 1}, + {0x103560, 10}, + {0x103600, 32}, + {0x103684, 5}, + {0x1036f8, 3}, + {0x103708, 3}, + {0x103718, 7}, + {0x103744, 1}, + {0x103760, 10}, + {0x103800, 1}, + {0x10380c, 1}, + {0x10397c, 97}, + {0x104000, 3}, + {0x104020, 3}, + {0x104040, 3}, + {0x104060, 3}, + {0x104084, 1}, + {0x104090, 4}, + {0x1040a4, 1}, + {0x1040b0, 4}, + {0x1040c4, 1}, + {0x1040d0, 4}, + {0x1040e4, 1}, + {0x1040f0, 21}, + {0x104148, 18}, + {0x1041f0, 4}, + {0x104204, 3}, + {0x104308, 9}, + {0x104330, 2}, + {0x104340, 16}, + {0x1043b4, 4}, + {0x1043c8, 4}, + {0x1043dc, 4}, + {0x1043f0, 4}, + {0x104404, 1}, + {0x104470, 36}, + {0x104504, 1}, + {0x104570, 36}, + {0x104604, 1}, + {0x104670, 36}, + {0x104704, 1}, + {0x104770, 50}, + {0x104840, 2}, + {0x10484c, 1}, + {0x104900, 1}, + {0x104908, 1}, + {0x104984, 1}, + {0x1049a0, 24}, + {0x104a04, 7}, + {0x104a24, 7}, + {0x104a44, 7}, + {0x104a64, 7}, + {0x104a84, 7}, + {0x104aa4, 7}, + {0x104ac4, 7}, + {0x104ae4, 16}, + {0x104b40, 16}, + {0x104c00, 6}, + {0x104c20, 6}, + {0x104c40, 6}, + {0x104c60, 6}, + {0x104c80, 6}, + {0x104ca0, 6}, + {0x104cc0, 6}, + {0x104ce0, 6}, + {0x104d00, 3}, + {0x104d20, 6}, + {0x104d40, 6}, + {0x105000, 448}, + {0x105704, 3}, + {0x105734, 1}, + {0x106000, 62}, + {0x106100, 35}, + {0x1061c0, 6}, + {0x1061e0, 6}, + {0x106200, 6}, + {0x106220, 6}, + {0x106240, 6}, + {0x106260, 6}, + {0x106280, 6}, + {0x1062a0, 6}, + {0x1062c0, 6}, + {0x1062e0, 6}, + {0x106300, 6}, + {0x106320, 6}, + {0x106340, 6}, + {0x106360, 6}, + {0x106380, 6}, + {0x1063a0, 6}, + {0x107010, 1}, + {0x110000, 7}, + {0x110020, 7}, + {0x110040, 5}, + {0x110060, 6}, + {0x110080, 5}, + {0x110098, 1}, + {0x1100a0, 4}, + {0x1100b8, 8}, + {0x1100e0, 6}, + {0x110200, 7}, + {0x110220, 7}, + {0x110240, 5}, + {0x110260, 6}, + {0x110280, 5}, + {0x110298, 1}, + {0x1102a0, 4}, + {0x1102b8, 8}, + {0x1102e0, 6}, + {0x110400, 7}, + {0x110420, 7}, + {0x110440, 5}, + {0x110460, 6}, + {0x110480, 5}, + {0x110498, 1}, + {0x1104a0, 4}, + {0x1104b8, 8}, + {0x1104e0, 6}, + {0x110600, 7}, + {0x110620, 7}, + {0x110640, 5}, + {0x110660, 6}, + {0x110680, 5}, + {0x110698, 1}, + {0x1106a0, 4}, + {0x1106b8, 8}, + {0x1106e0, 6}, + {0x110800, 21}, + {0x110880, 15}, + {0x1108c0, 3}, + {0x1108d0, 2}, + {0x110900, 1}, + {0x111000, 1}, + {0x111028, 1}, + {0x111050, 1}, + {0x111078, 1}, + {0x1110a0, 5}, + {0x1111ac, 1}, + {0x1111d8, 4}, + {0x111200, 6}, + {0x111220, 6}, + {0x111240, 3}, + {0x111400, 6}, + {0x111420, 3}, + {0x111480, 9}, + {0x1114c0, 25}, + {0x111540, 25}, + {0x1115c0, 25}, + {0x111640, 20}, + {0x118000, 29}, + {0x118078, 4}, + {0x118090, 2}, + {0x1180a0, 7}, + {0x1180c0, 11}, + {0x118100, 14}, + {0x118140, 14}, + {0x118180, 61}, + {0x118278, 4}, + {0x118290, 2}, + {0x1182a0, 7}, + {0x1182c0, 11}, + {0x118300, 14}, + {0x118340, 14}, + {0x118380, 61}, + {0x118478, 4}, + {0x118490, 2}, + {0x1184a0, 7}, + {0x1184c0, 11}, + {0x118500, 14}, + {0x118540, 14}, + {0x118580, 61}, + 
{0x118678, 4}, + {0x118690, 2}, + {0x1186a0, 7}, + {0x1186c0, 11}, + {0x118700, 14}, + {0x118740, 14}, + {0x118780, 69}, + {0x118c00, 6}, + {0x118c40, 14}, + {0x118c80, 9}, + {0x118d00, 9}, + {0x118d2c, 1}, + {0x118d40, 3}, + {0x118d60, 1}, + {0x118d80, 3}, + {0x118e00, 2}, + {0x118e0c, 1}, + {0x118e14, 5}, + {0x118e2c, 1}, + {0x118e34, 5}, + {0x118e4c, 1}, + {0x118e54, 5}, + {0x118e6c, 1}, + {0x118e74, 5}, + {0x118e8c, 1}, + {0x118e94, 5}, + {0x118eac, 1}, + {0x118eb4, 3}, + {0x119000, 29}, + {0x119078, 4}, + {0x119090, 2}, + {0x1190a0, 7}, + {0x1190c0, 11}, + {0x119100, 14}, + {0x119140, 14}, + {0x119180, 61}, + {0x119278, 4}, + {0x119290, 2}, + {0x1192a0, 7}, + {0x1192c0, 11}, + {0x119300, 14}, + {0x119340, 14}, + {0x119380, 61}, + {0x119478, 4}, + {0x119490, 2}, + {0x1194a0, 7}, + {0x1194c0, 11}, + {0x119500, 14}, + {0x119540, 14}, + {0x119580, 61}, + {0x119678, 4}, + {0x119690, 2}, + {0x1196a0, 7}, + {0x1196c0, 11}, + {0x119700, 14}, + {0x119740, 14}, + {0x119780, 69}, + {0x119c00, 6}, + {0x119c40, 14}, + {0x119c80, 9}, + {0x119d00, 9}, + {0x119d2c, 1}, + {0x119d40, 3}, + {0x119d60, 1}, + {0x119d80, 3}, + {0x119e00, 2}, + {0x119e0c, 1}, + {0x119e14, 5}, + {0x119e2c, 1}, + {0x119e34, 5}, + {0x119e4c, 1}, + {0x119e54, 5}, + {0x119e6c, 1}, + {0x119e74, 5}, + {0x119e8c, 1}, + {0x119e94, 5}, + {0x119eac, 1}, + {0x119eb4, 3}, + {0x11c000, 19}, + {0x11c050, 10}, + {0x11c080, 19}, + {0x11c0d0, 10}, + {0x11c100, 19}, + {0x11c150, 10}, + {0x11c180, 19}, + {0x11c1d0, 10}, + {0x11c200, 19}, + {0x11c250, 10}, + {0x11c280, 19}, + {0x11c2d0, 10}, + {0x11c300, 19}, + {0x11c350, 10}, + {0x11c380, 19}, + {0x11c3d0, 10}, + {0x11c400, 19}, + {0x11c460, 6}, + {0x11c484, 1}, + {0x11c494, 8}, + {0x11c4b8, 9}, + {0x11c4e0, 4}, + {0x11c500, 3}, + {0x11d000, 7}, + {0x11d020, 15}, + {0x11d060, 15}, + {0x11d0a0, 15}, + {0x11d0e0, 15}, + {0x11d120, 15}, + {0x11d160, 15}, + {0x11d1a0, 15}, + {0x11d1e0, 8}, + {0x11d400, 9}, + {0x11d428, 3}, + {0x11d440, 5}, + {0x11d480, 9}, + {0x11d4a8, 3}, + {0x11d4c0, 5}, + {0x11d500, 9}, + {0x11d528, 3}, + {0x11d540, 5}, + {0x11d580, 9}, + {0x11d5a8, 3}, + {0x11d5c0, 5}, + {0x11d600, 6}, + {0x11d620, 6}, + {0x11d640, 6}, + {0x11d660, 6}, + {0x11d680, 6}, + {0x11d6a0, 6}, + {0x11d6c0, 6}, + {0x11d6e0, 6}, + {0x11d708, 2}, + {0x11d718, 4}, + {0x11d734, 1}, + {0x11d73c, 4}, + {0x11d750, 4}, + {0x11d764, 1}, + {0x11d76c, 2}, + {0x11d800, 8}, + {0x11d840, 8}, + {0x11d880, 8}, + {0x11d8c0, 8}, + {0x11d900, 8}, + {0x11d940, 8}, + {0x11d988, 4}, + {0x11da00, 3}, + {0x11da18, 2}, + {0x11da24, 16}, + {0x11da80, 3}, + {0x11da98, 2}, + {0x11daa4, 16}, + {0x11db00, 3}, + {0x11db18, 2}, + {0x11db24, 16}, + {0x11db80, 3}, + {0x11db98, 2}, + {0x11dba4, 16}, + {0x11dc00, 8}, + {0x11dc40, 1}, + {0x11e000, 72}, + {0x11e200, 72}, + {0x11e400, 72}, + {0x11e600, 72}, + {0x11e800, 6}, + {0x11e820, 6}, + {0x11e840, 6}, + {0x11e860, 6}, + {0x11e880, 6}, + {0x11e8a0, 6}, + {0x11e8c0, 6}, + {0x11e8e0, 6}, + {0x11e900, 6}, + {0x11e920, 6}, + {0x11e940, 6}, + {0x11e9fc, 7}, + {0x11ea20, 6}, + {0x11ea40, 6}, + {0x11ea60, 6}, + {0x11ea80, 6}, + {0x11eaa0, 6}, + {0x11eac0, 6}, + {0x11eae0, 6}, + {0x11eb00, 6}, + {0x11eb20, 6}, + {0x11eb40, 6}, + {0x11ebfc, 7}, + {0x11ec20, 6}, + {0x11ec40, 6}, + {0x11ec60, 6}, + {0x11ec80, 6}, + {0x11eca0, 6}, + {0x11ecc0, 6}, + {0x11ece0, 6}, + {0x11ed00, 6}, + {0x11ed20, 6}, + {0x11ed40, 6}, + {0x11edfc, 7}, + {0x11ee20, 6}, + {0x11ee40, 6}, + {0x11ee60, 6}, + {0x11ee80, 6}, + {0x11eea0, 6}, + {0x11eec0, 6}, + {0x11eee0, 6}, + {0x11ef00, 6}, + {0x11ef20, 6}, + {0x11ef40, 6}, 
+ {0x11effc, 15}, + {0x11f040, 2}, + {0x11f080, 14}, + {0x11f0c0, 2}, + {0x11f100, 14}, + {0x11f140, 2}, + {0x11f180, 14}, + {0x11f1c0, 2}, + {0x11f400, 17}, + {0x11f448, 5}, + {0x11f460, 3}, + {0x11f470, 3}, + {0x11f480, 16}, + {0x11f500, 17}, + {0x11f548, 5}, + {0x11f560, 3}, + {0x11f570, 3}, + {0x11f580, 16}, + {0x11f600, 17}, + {0x11f648, 5}, + {0x11f660, 3}, + {0x11f670, 3}, + {0x11f680, 16}, + {0x11f700, 17}, + {0x11f748, 5}, + {0x11f760, 3}, + {0x11f770, 3}, + {0x11f780, 16}, + {0x11f800, 8}, + {0x11f824, 15}, + {0x11f864, 15}, + {0x11f8a4, 15}, + {0x11f8e4, 30}, + {0x11f980, 10}, + {0x11f9ac, 1}, + {0x11f9b4, 5}, + {0x11f9cc, 1}, + {0x11f9d4, 5}, + {0x11f9ec, 1}, + {0x11f9f4, 13}, + {0x11fa80, 4}, + {0x11fa94, 2}, + {0x11faa0, 5}, + {0x11fac0, 5}, + {0x11fae0, 4}, + {0x11fc04, 8}, + {0x11fc40, 13}, + {0x11fd00, 6}, + {0x11fd20, 3}, + {0x11fd30, 3}, + {0x11fd44, 1}, + {0x11fd4c, 1}, + {0x11fd54, 1}, + {0x11fd5c, 8}, + {0x11fd80, 11}, + {0x11fdc0, 3} }; + +static const unsigned long +mlx5_mst_dump_regs_mt4115[MLX5_NUM_MST_OFFSETS_MT4115][2] = {{0x000000, 16388}, + {0x010084, 1}, + {0x01008c, 2}, + {0x010180, 6}, + {0x01019c, 9}, + {0x010304, 1}, + {0x0103bc, 52}, + {0x010500, 35}, + {0x010604, 1}, + {0x01060c, 1}, + {0x010624, 1}, + {0x01062c, 1}, + {0x010644, 1}, + {0x01064c, 3}, + {0x010800, 1}, + {0x010814, 3}, + {0x010844, 1}, + {0x01084c, 4}, + {0x011000, 32}, + {0x011200, 32}, + {0x011400, 32}, + {0x011600, 32}, + {0x011800, 32}, + {0x011a00, 5}, + {0x011a24, 1}, + {0x011a2c, 1}, + {0x011a34, 3}, + {0x011a44, 1}, + {0x011a4c, 2}, + {0x012000, 20}, + {0x012088, 22}, + {0x012100, 18}, + {0x012200, 36}, + {0x012300, 3}, + {0x012310, 1}, + {0x012320, 1}, + {0x012330, 1}, + {0x012340, 1}, + {0x012350, 1}, + {0x012360, 1}, + {0x012370, 1}, + {0x012400, 166}, + {0x0126a0, 1}, + {0x012700, 32}, + {0x012800, 1}, + {0x012808, 1}, + {0x012810, 1}, + {0x012818, 1}, + {0x012820, 1}, + {0x012828, 1}, + {0x012830, 1}, + {0x012838, 1}, + {0x012850, 1}, + {0x012858, 1}, + {0x012860, 1}, + {0x012868, 1}, + {0x012870, 1}, + {0x012878, 1}, + {0x012880, 1}, + {0x012888, 1}, + {0x012890, 5}, + {0x012900, 10}, + {0x012940, 6}, + {0x012960, 6}, + {0x012980, 5}, + {0x012a00, 6}, + {0x012a40, 5}, + {0x012a80, 9}, + {0x012ac0, 19}, + {0x012b10, 2}, + {0x014000, 1}, + {0x014010, 10}, + {0x014104, 1}, + {0x01410c, 4}, + {0x014200, 1}, + {0x014210, 4}, + {0x014300, 4}, + {0x014400, 3}, + {0x014410, 1}, + {0x014444, 1}, + {0x01444c, 4}, + {0x014480, 2}, + {0x014500, 35}, + {0x014590, 1}, + {0x014600, 32}, + {0x014704, 1}, + {0x01470c, 1}, + {0x014804, 2}, + {0x014814, 3}, + {0x014824, 2}, + {0x014834, 3}, + {0x014844, 2}, + {0x014854, 5}, + {0x014a00, 4}, + {0x014a14, 2}, + {0x015004, 1}, + {0x01500c, 4}, + {0x015204, 1}, + {0x015214, 4}, + {0x015228, 1}, + {0x015244, 1}, + {0x015250, 4}, + {0x015274, 1}, + {0x01527c, 1}, + {0x0152a4, 1}, + {0x015300, 4}, + {0x015314, 1}, + {0x01531c, 3}, + {0x015404, 2}, + {0x015430, 20}, + {0x015484, 2}, + {0x015498, 5}, + {0x0154bc, 1}, + {0x0154c8, 1}, + {0x0154d4, 2}, + {0x01550c, 3}, + {0x015524, 3}, + {0x015608, 3}, + {0x015618, 4}, + {0x015680, 16}, + {0x015704, 2}, + {0x015730, 20}, + {0x015784, 3}, + {0x015794, 3}, + {0x0157c0, 6}, + {0x015800, 52}, + {0x0158d4, 1}, + {0x0158dc, 1}, + {0x0158e4, 3}, + {0x0158f4, 1}, + {0x0158fc, 52}, + {0x015b04, 2}, + {0x015b14, 3}, + {0x015b24, 2}, + {0x015b38, 6}, + {0x015b60, 5}, + {0x015c04, 1}, + {0x015cb8, 18}, + {0x015d98, 1}, + {0x015e00, 1}, + {0x015e08, 1}, + {0x015e10, 1}, + {0x016000, 32}, + {0x016100, 7}, + {0x016120, 
2}, + {0x016144, 3}, + {0x016204, 1}, + {0x01620c, 1}, + {0x016224, 1}, + {0x01622c, 1}, + {0x016234, 1}, + {0x01623c, 1}, + {0x016244, 1}, + {0x01624c, 1}, + {0x016254, 1}, + {0x01625c, 2}, + {0x016270, 1}, + {0x016280, 1}, + {0x016290, 1}, + {0x0162a0, 1}, + {0x0162b0, 1}, + {0x0162c0, 1}, + {0x0162d0, 1}, + {0x0162e0, 1}, + {0x0162f0, 1}, + {0x016300, 1}, + {0x016310, 1}, + {0x016320, 1}, + {0x016330, 1}, + {0x016340, 32}, + {0x016400, 19}, + {0x016454, 7}, + {0x016484, 1}, + {0x01648c, 1}, + {0x016800, 9}, + {0x016840, 16}, + {0x017800, 1}, + {0x017828, 1}, + {0x017850, 1}, + {0x017878, 1}, + {0x0178a0, 12}, + {0x0179ac, 1}, + {0x0179d8, 4}, + {0x017a00, 6}, + {0x017a20, 6}, + {0x017a40, 3}, + {0x017c00, 6}, + {0x017c20, 6}, + {0x017c40, 6}, + {0x017c60, 6}, + {0x017c80, 6}, + {0x017ca0, 6}, + {0x017cc0, 6}, + {0x017ce0, 6}, + {0x017d00, 3}, + {0x017d10, 5}, + {0x018400, 11}, + {0x018430, 2}, + {0x018440, 4}, + {0x018604, 1}, + {0x018618, 2}, + {0x018640, 4}, + {0x018660, 2}, + {0x018800, 1}, + {0x018810, 4}, + {0x018844, 1}, + {0x01884c, 4}, + {0x018880, 2}, + {0x01a400, 12}, + {0x01a444, 1}, + {0x01a44c, 1}, + {0x01a800, 1}, + {0x01a814, 3}, + {0x01a844, 1}, + {0x01a84c, 4}, + {0x01c400, 7}, + {0x01c500, 8}, + {0x01c544, 1}, + {0x01c554, 3}, + {0x01c564, 3}, + {0x01c574, 3}, + {0x01c604, 1}, + {0x01c60c, 3}, + {0x01c800, 1}, + {0x01c814, 3}, + {0x01c844, 1}, + {0x01c84c, 4}, + {0x01f000, 1}, + {0x01f400, 1}, + {0x01f408, 4}, + {0x01f41c, 3}, + {0x01f500, 2}, + {0x01f800, 1}, + {0x01f814, 3}, + {0x01f844, 1}, + {0x01f84c, 4}, + {0x01f880, 3}, + {0x020004, 1}, + {0x02000c, 4}, + {0x020080, 9}, + {0x020800, 16}, + {0x020900, 192}, + {0x021004, 1}, + {0x02100c, 4}, + {0x021400, 5}, + {0x021418, 5}, + {0x021480, 1}, + {0x021704, 2}, + {0x02173c, 17}, + {0x021804, 1}, + {0x02180c, 4}, + {0x021840, 2}, + {0x022280, 12}, + {0x022408, 6}, + {0x022444, 1}, + {0x022454, 3}, + {0x022504, 1}, + {0x02250c, 4}, + {0x022624, 1}, + {0x02262c, 3}, + {0x02263c, 1}, + {0x022804, 1}, + {0x02280c, 4}, + {0x022904, 4}, + {0x022924, 4}, + {0x024000, 36}, + {0x0240c0, 21}, + {0x024120, 11}, + {0x024200, 5}, + {0x024220, 1}, + {0x024230, 8}, + {0x024258, 1}, + {0x024260, 1}, + {0x024270, 9}, + {0x0242a0, 1}, + {0x0242b0, 4}, + {0x0242c8, 2}, + {0x024300, 5}, + {0x024318, 2}, + {0x02439c, 25}, + {0x024424, 4}, + {0x024464, 13}, + {0x0244a4, 1}, + {0x0244ac, 3}, + {0x0244c0, 2}, + {0x0244d0, 3}, + {0x0244e0, 3}, + {0x0244f0, 3}, + {0x024500, 3}, + {0x024510, 3}, + {0x024520, 3}, + {0x024530, 3}, + {0x024884, 11}, + {0x0248b4, 3}, + {0x0248c4, 1}, + {0x0248cc, 1}, + {0x0248d4, 1}, + {0x0248dc, 1}, + {0x0248f0, 2}, + {0x024908, 6}, + {0x024928, 6}, + {0x024968, 6}, + {0x024984, 3}, + {0x024994, 1}, + {0x02499c, 6}, + {0x0249b8, 7}, + {0x024a08, 6}, + {0x024a28, 6}, + {0x024a68, 6}, + {0x024a84, 1}, + {0x024a8c, 1}, + {0x024a94, 1}, + {0x024a9c, 13}, + {0x024ae0, 6}, + {0x024b00, 7}, + {0x024b20, 6}, + {0x024b40, 8}, + {0x024c00, 2}, + {0x024c24, 3}, + {0x024c34, 3}, + {0x025004, 6}, + {0x025800, 37}, + {0x025904, 1}, + {0x02590c, 1}, + {0x026000, 99}, + {0x026200, 1}, + {0x026800, 7}, + {0x026824, 6}, + {0x026840, 2}, + {0x026864, 1}, + {0x02686c, 1}, + {0x026874, 3}, + {0x026884, 1}, + {0x0268a4, 7}, + {0x026904, 1}, + {0x02690c, 4}, + {0x026940, 1}, + {0x026980, 33}, + {0x026a0c, 7}, + {0x026a30, 1}, + {0x026a44, 4}, + {0x026a60, 1}, + {0x026a70, 1}, + {0x026b00, 9}, + {0x026b44, 2}, + {0x026b68, 6}, + {0x026b84, 2}, + {0x026ba8, 14}, + {0x026c00, 16}, + {0x026c44, 1}, + {0x026c4c, 1}, + {0x026c84, 1}, + 
{0x026c8c, 4}, + {0x026cc4, 1}, + {0x026ccc, 4}, + {0x026d00, 2}, + {0x028800, 2}, + {0x028844, 1}, + {0x02884c, 4}, + {0x029004, 7}, + {0x029b30, 2}, + {0x029b50, 4}, + {0x02a004, 1}, + {0x02a00c, 4}, + {0x02a040, 6}, + {0x02a200, 1}, + {0x02a210, 11}, + {0x02a240, 8}, + {0x02a484, 1}, + {0x02a4c0, 16}, + {0x02a780, 1}, + {0x02a7a0, 4}, + {0x02a7c0, 1}, + {0x02a900, 1}, + {0x02aa04, 1}, + {0x02aa0c, 4}, + {0x02ab00, 40}, + {0x02aba4, 1}, + {0x02abac, 1}, + {0x02abb4, 1}, + {0x02abbc, 1}, + {0x02abc4, 1}, + {0x02abcc, 1}, + {0x02abe0, 4}, + {0x02abf4, 2}, + {0x02ac00, 2}, + {0x02ac10, 3}, + {0x02ac44, 3}, + {0x02ad00, 1}, + {0x02ad08, 1}, + {0x02ad10, 1}, + {0x02ad18, 1}, + {0x02ad20, 1}, + {0x02ad28, 1}, + {0x02ad30, 1}, + {0x02ad38, 1}, + {0x02ad40, 1}, + {0x02ad48, 1}, + {0x02ad50, 1}, + {0x02ad58, 1}, + {0x02ad60, 1}, + {0x02ad68, 1}, + {0x02ad70, 1}, + {0x02ad78, 1}, + {0x02ad80, 1}, + {0x02ad88, 1}, + {0x02ad90, 1}, + {0x02ad98, 1}, + {0x02ada0, 1}, + {0x02ada8, 1}, + {0x02adb0, 1}, + {0x02adb8, 1}, + {0x02adc0, 1}, + {0x02adc8, 1}, + {0x02add0, 1}, + {0x02add8, 1}, + {0x02ade0, 1}, + {0x02ade8, 1}, + {0x02adf0, 1}, + {0x02adf8, 1}, + {0x02ae00, 6}, + {0x02ae20, 5}, + {0x02ae40, 4}, + {0x02c000, 6}, + {0x02c100, 5}, + {0x02c204, 1}, + {0x02c214, 3}, + {0x02c224, 1}, + {0x02c22c, 4}, + {0x02c244, 2}, + {0x02c250, 5}, + {0x02c400, 2}, + {0x02c428, 2}, + {0x02c450, 2}, + {0x02c478, 2}, + {0x02c4a0, 24}, + {0x02c5ac, 1}, + {0x02c5d8, 4}, + {0x02c600, 6}, + {0x02c620, 6}, + {0x02c640, 6}, + {0x02c660, 6}, + {0x02c680, 3}, + {0x02c800, 3}, + {0x02c820, 6}, + {0x02c840, 6}, + {0x02c860, 6}, + {0x02c880, 6}, + {0x02c8a0, 6}, + {0x02c8c0, 6}, + {0x02c8e0, 6}, + {0x02c900, 6}, + {0x02c920, 6}, + {0x02c940, 6}, + {0x02c960, 6}, + {0x02c980, 6}, + {0x02c9a0, 6}, + {0x02c9c0, 6}, + {0x02c9e0, 6}, + {0x02ca00, 6}, + {0x02ca20, 6}, + {0x02ca40, 6}, + {0x02ca60, 6}, + {0x02ca80, 6}, + {0x02caa0, 6}, + {0x02cac0, 6}, + {0x02cae0, 6}, + {0x02cb00, 6}, + {0x02cb20, 6}, + {0x02cc48, 4}, + {0x02cd00, 9}, + {0x02cd40, 17}, + {0x030004, 2}, + {0x030034, 19}, + {0x030084, 2}, + {0x0300bc, 17}, + {0x030104, 2}, + {0x030138, 27}, + {0x030300, 7}, + {0x030340, 2}, + {0x03034c, 2}, + {0x030384, 1}, + {0x0303c0, 16}, + {0x030404, 1}, + {0x03040c, 4}, + {0x030804, 1}, + {0x03080c, 4}, + {0x030c04, 1}, + {0x030c0c, 4}, + {0x030c40, 4}, + {0x031000, 11}, + {0x031100, 11}, + {0x031200, 17}, + {0x031280, 6}, + {0x031304, 1}, + {0x03130c, 5}, + {0x031400, 6}, + {0x031420, 1}, + {0x031444, 2}, + {0x031454, 3}, + {0x031464, 2}, + {0x031474, 11}, + {0x031500, 7}, + {0x031520, 6}, + {0x031540, 8}, + {0x031600, 13}, + {0x031640, 6}, + {0x031700, 2}, + {0x034200, 24}, + {0x034280, 10}, + {0x0342ac, 2}, + {0x0342c0, 6}, + {0x0342f0, 39}, + {0x034600, 24}, + {0x034680, 10}, + {0x0346ac, 2}, + {0x0346c0, 6}, + {0x0346f0, 39}, + {0x034c00, 6}, + {0x034c20, 4}, + {0x034c40, 9}, + {0x034c80, 9}, + {0x034e04, 2}, + {0x034e14, 3}, + {0x034e44, 1}, + {0x034e4c, 4}, + {0x034e80, 6}, + {0x034f04, 1}, + {0x034f18, 11}, + {0x034f80, 2}, + {0x035000, 2}, + {0x035010, 3}, + {0x035044, 3}, + {0x035100, 66}, + {0x035210, 3}, + {0x035244, 3}, + {0x035300, 64}, + {0x035404, 1}, + {0x03540c, 8}, + {0x037000, 6}, + {0x03702c, 7}, + {0x037080, 10}, + {0x0370ac, 4}, + {0x037100, 4}, + {0x037200, 2}, + {0x037210, 3}, + {0x037244, 3}, + {0x037300, 70}, + {0x03742c, 7}, + {0x037480, 10}, + {0x0374ac, 4}, + {0x037500, 4}, + {0x037600, 2}, + {0x037610, 3}, + {0x037644, 3}, + {0x037700, 69}, + {0x037818, 4}, + {0x038000, 3}, + {0x038104, 3}, + 
{0x03813c, 2}, + {0x038150, 3}, + {0x038400, 2}, + {0x038428, 2}, + {0x038450, 2}, + {0x038478, 2}, + {0x0384a0, 22}, + {0x0385ac, 1}, + {0x0385d8, 4}, + {0x038600, 6}, + {0x038620, 6}, + {0x038640, 6}, + {0x038660, 6}, + {0x038680, 3}, + {0x038800, 3}, + {0x038820, 6}, + {0x038840, 6}, + {0x038860, 6}, + {0x038880, 6}, + {0x0388a0, 6}, + {0x0388c0, 6}, + {0x038900, 28}, + {0x038978, 1}, + {0x038a40, 25}, + {0x038ac0, 16}, + {0x039000, 35}, + {0x039090, 3}, + {0x039100, 35}, + {0x039190, 3}, + {0x039200, 35}, + {0x039290, 3}, + {0x039300, 35}, + {0x039390, 3}, + {0x039400, 35}, + {0x039490, 3}, + {0x039500, 1}, + {0x039800, 3}, + {0x039884, 1}, + {0x0398c0, 16}, + {0x039904, 2}, + {0x039934, 20}, + {0x039a04, 2}, + {0x039a10, 4}, + {0x039a24, 2}, + {0x039a30, 4}, + {0x039a44, 2}, + {0x039a50, 4}, + {0x039a64, 2}, + {0x039a70, 8}, + {0x039c00, 7}, + {0x039c20, 6}, + {0x039c40, 8}, + {0x039d00, 11}, + {0x039d40, 11}, + {0x039d84, 1}, + {0x039dc0, 26}, + {0x039e30, 2}, + {0x039e44, 3}, + {0x039e54, 1}, + {0x039e5c, 2}, + {0x039e80, 2}, + {0x039e90, 3}, + {0x039ea4, 1}, + {0x039eac, 3}, + {0x039ec0, 3}, + {0x039f00, 9}, + {0x039f40, 22}, + {0x039fa0, 6}, + {0x039fc0, 8}, + {0x03c000, 3}, + {0x03c010, 3}, + {0x03c020, 3}, + {0x03c040, 9}, + {0x03c068, 6}, + {0x03c090, 2}, + {0x03c0a0, 3}, + {0x03c0c0, 12}, + {0x03c0f4, 1}, + {0x03c100, 2}, + {0x03c110, 3}, + {0x03c120, 1}, + {0x03c130, 11}, + {0x03c160, 2}, + {0x03c180, 4}, + {0x03c194, 3}, + {0x03c1a4, 2}, + {0x03c1b0, 4}, + {0x03c2e0, 5}, + {0x03c2f8, 2}, + {0x03c30c, 13}, + {0x03c34c, 77}, + {0x03c48c, 18}, + {0x03c500, 32}, + {0x03c800, 16}, + {0x03c84c, 18}, + {0x03c8e0, 2}, + {0x03c8ec, 3}, + {0x03c900, 1}, + {0x03c910, 5}, + {0x03c930, 5}, + {0x03c950, 5}, + {0x03c970, 5}, + {0x03c9a4, 3}, + {0x03ca78, 34}, + {0x03cb3c, 18}, + {0x03cb94, 3}, + {0x03cba4, 3}, + {0x03cbf0, 1}, + {0x03cbf8, 10}, + {0x03cc30, 1}, + {0x03cc44, 4}, + {0x03cc60, 1}, + {0x03cc80, 1}, + {0x03cc90, 1}, + {0x03d004, 6}, + {0x03e004, 1}, + {0x03e00c, 4}, + {0x03e404, 1}, + {0x03e40c, 4}, + {0x03e604, 1}, + {0x03e60c, 4}, + {0x03e800, 1}, + {0x03f034, 19}, + {0x03f084, 2}, + {0x03f0c0, 16}, + {0x03f200, 1}, + {0x03f210, 1}, + {0x03f300, 6}, + {0x03f320, 6}, + {0x03f380, 9}, + {0x03f3c0, 16}, + {0x050000, 1}, + {0x050008, 2}, + {0x050044, 8}, + {0x050104, 1}, + {0x050178, 34}, + {0x050204, 1}, + {0x05020c, 1}, + {0x050214, 1}, + {0x050224, 1}, + {0x05022c, 1}, + {0x050234, 1}, + {0x05023c, 1}, + {0x050244, 1}, + {0x05024c, 1}, + {0x050254, 1}, + {0x050264, 3}, + {0x050280, 2}, + {0x0502b0, 2}, + {0x0502c4, 1}, + {0x0502cc, 1}, + {0x0502d4, 1}, + {0x0502dc, 1}, + {0x0502e4, 1}, + {0x0502ec, 1}, + {0x0502f4, 1}, + {0x0502fc, 1}, + {0x050304, 1}, + {0x05030c, 1}, + {0x050314, 3}, + {0x050324, 3}, + {0x050334, 6}, + {0x050380, 32}, + {0x050404, 1}, + {0x050438, 18}, + {0x050500, 2}, + {0x050544, 1}, + {0x05054c, 4}, + {0x050584, 2}, + {0x050598, 2}, + {0x0505a4, 1}, + {0x0505b0, 4}, + {0x0505c4, 1}, + {0x0505cc, 1}, + {0x0505d4, 1}, + {0x0505e0, 1}, + {0x0505f0, 17}, + {0x050700, 2}, + {0x050800, 1}, + {0x050820, 9}, + {0x050850, 5}, + {0x050870, 5}, + {0x0508c4, 3}, + {0x0508d4, 3}, + {0x0508e4, 3}, + {0x0508f4, 4}, + {0x050910, 5}, + {0x050930, 4}, + {0x050944, 1}, + {0x05094c, 57}, + {0x050a40, 1}, + {0x050a50, 28}, + {0x050ac4, 1}, + {0x050acc, 2}, + {0x050ad8, 10}, + {0x050b04, 1}, + {0x050b0c, 1}, + {0x050b14, 1}, + {0x050b20, 9}, + {0x050b50, 4}, + {0x050b64, 1}, + {0x050b70, 6}, + {0x050b8c, 4}, + {0x050ba4, 4}, + {0x050bc0, 1}, + {0x050bd0, 1}, + {0x050be0, 1}, + 
{0x050bf0, 1}, + {0x050c00, 1}, + {0x050c14, 1}, + {0x050c1c, 1}, + {0x050c24, 1}, + {0x050c2c, 1}, + {0x050c34, 1}, + {0x050c3c, 1}, + {0x050c44, 1}, + {0x050c4c, 1}, + {0x050c54, 4}, + {0x050c70, 1}, + {0x050c80, 1}, + {0x050c90, 1}, + {0x050ca0, 1}, + {0x050cb0, 1}, + {0x050cc0, 1}, + {0x050cd0, 1}, + {0x050ce0, 1}, + {0x050cf0, 1}, + {0x050d00, 1}, + {0x050d10, 1}, + {0x050d20, 1}, + {0x050d30, 1}, + {0x050d40, 1}, + {0x050d50, 1}, + {0x050d60, 1}, + {0x050d70, 1}, + {0x050d80, 1}, + {0x050d90, 1}, + {0x050da0, 1}, + {0x052000, 192}, + {0x052800, 2}, + {0x053000, 51}, + {0x053400, 3}, + {0x053420, 6}, + {0x053440, 6}, + {0x053460, 6}, + {0x053480, 6}, + {0x0534a0, 6}, + {0x0534c0, 6}, + {0x0534e0, 6}, + {0x053500, 6}, + {0x053520, 6}, + {0x053540, 6}, + {0x053560, 6}, + {0x053600, 32}, + {0x0536c0, 16}, + {0x053704, 1}, + {0x053740, 18}, + {0x0537a0, 10}, + {0x0537e0, 29}, + {0x053860, 10}, + {0x054204, 1}, + {0x05420c, 1}, + {0x054214, 1}, + {0x054224, 1}, + {0x05422c, 1}, + {0x054234, 1}, + {0x054244, 1}, + {0x05424c, 3}, + {0x054304, 1}, + {0x054340, 16}, + {0x0543a0, 1}, + {0x0543b0, 1}, + {0x0543c0, 1}, + {0x0543d0, 4}, + {0x054400, 8}, + {0x054820, 3}, + {0x054928, 9}, + {0x054954, 4}, + {0x054970, 1}, + {0x054980, 1}, + {0x054990, 3}, + {0x0549c4, 13}, + {0x054a04, 1}, + {0x054a0c, 4}, + {0x054a40, 1}, + {0x054a50, 5}, + {0x054a70, 1}, + {0x054a80, 1}, + {0x054a90, 2}, + {0x054c04, 1}, + {0x054c0c, 4}, + {0x054c40, 1}, + {0x054c50, 4}, + {0x054d40, 1}, + {0x054d80, 7}, + {0x054e04, 1}, + {0x054e0c, 4}, + {0x054e40, 1}, + {0x054e50, 4}, + {0x054f40, 1}, + {0x054f80, 7}, + {0x055000, 11}, + {0x055050, 7}, + {0x055070, 16}, + {0x0550b8, 8}, + {0x055200, 13}, + {0x055240, 11}, + {0x055300, 16}, + {0x055400, 3}, + {0x055420, 6}, + {0x055440, 6}, + {0x055460, 6}, + {0x055480, 6}, + {0x0554a0, 6}, + {0x0554c0, 6}, + {0x0554e0, 6}, + {0x055500, 6}, + {0x055520, 6}, + {0x055540, 6}, + {0x055560, 6}, + {0x055580, 6}, + {0x0555a0, 6}, + {0x0555c0, 6}, + {0x0555e0, 6}, + {0x055600, 6}, + {0x055620, 6}, + {0x055640, 6}, + {0x055660, 6}, + {0x056000, 2}, + {0x056044, 1}, + {0x05604c, 4}, + {0x056800, 2}, + {0x056844, 1}, + {0x05684c, 4}, + {0x057004, 7}, + {0x057b20, 2}, + {0x057b30, 2}, + {0x057b50, 5}, + {0x058004, 1}, + {0x05800c, 4}, + {0x058800, 7}, + {0x058820, 6}, + {0x058844, 2}, + {0x058864, 9}, + {0x058890, 1}, + {0x0588a0, 1}, + {0x0588b0, 1}, + {0x0588c0, 1}, + {0x0588d0, 1}, + {0x058904, 3}, + {0x058978, 66}, + {0x058a84, 1}, + {0x058a8c, 1}, + {0x058a94, 1}, + {0x058aa0, 2}, + {0x058ff0, 4}, + {0x059004, 1}, + {0x05900c, 4}, + {0x059080, 39}, + {0x059120, 1}, + {0x059130, 1}, + {0x059140, 1}, + {0x059150, 1}, + {0x059160, 1}, + {0x059800, 1}, + {0x059810, 1}, + {0x059820, 1}, + {0x059830, 1}, + {0x059840, 1}, + {0x059850, 1}, + {0x059860, 1}, + {0x059870, 1}, + {0x059880, 1}, + {0x059890, 1}, + {0x0598a0, 1}, + {0x0598d4, 2}, + {0x0598e0, 1}, + {0x059900, 3}, + {0x059980, 9}, + {0x0599c0, 32}, + {0x059a44, 3}, + {0x059a54, 1}, + {0x059a5c, 1}, + {0x059c00, 2}, + {0x059c28, 2}, + {0x059c50, 2}, + {0x059c78, 2}, + {0x059ca0, 20}, + {0x059dac, 1}, + {0x059dd8, 4}, + {0x059e00, 6}, + {0x059e20, 6}, + {0x059e40, 6}, + {0x059e60, 6}, + {0x059e80, 3}, + {0x05a000, 4}, + {0x05a020, 2}, + {0x05a030, 1}, + {0x05a04c, 2}, + {0x05a070, 4}, + {0x05a088, 3}, + {0x05a0a0, 1}, + {0x05a0b0, 1}, + {0x05a0c0, 1}, + {0x060020, 4}, + {0x060120, 4}, + {0x060144, 2}, + {0x060154, 7}, + {0x060174, 1}, + {0x06017c, 3}, + {0x06018c, 1}, + {0x0601a0, 4}, + {0x0601b8, 2}, + {0x0601c4, 2}, + {0x0601e4, 7}, 
+ {0x060204, 1}, + {0x06020c, 4}, + {0x060240, 16}, + {0x060304, 1}, + {0x06030c, 1}, + {0x060400, 2}, + {0x060420, 3}, + {0x060430, 1}, + {0x060440, 1}, + {0x060484, 2}, + {0x0604b8, 18}, + {0x060504, 1}, + {0x06050c, 1}, + {0x060514, 1}, + {0x060900, 1}, + {0x060914, 1}, + {0x06091c, 2}, + {0x060930, 1}, + {0x060a00, 32}, + {0x060a84, 1}, + {0x060a8c, 1}, + {0x060a94, 1}, + {0x060a9c, 1}, + {0x060bf0, 3}, + {0x060c00, 11}, + {0x060c30, 3}, + {0x061004, 1}, + {0x061010, 1}, + {0x061018, 4}, + {0x061044, 2}, + {0x061054, 3}, + {0x061100, 32}, + {0x061200, 1}, + {0x061404, 1}, + {0x06140c, 4}, + {0x061440, 2}, + {0x061468, 2}, + {0x061478, 2}, + {0x06180c, 8}, + {0x061908, 6}, + {0x061928, 6}, + {0x061968, 6}, + {0x061988, 6}, + {0x0619a8, 6}, + {0x0619e8, 14}, + {0x062008, 5}, + {0x062024, 3}, + {0x062034, 1}, + {0x06203c, 1}, + {0x062044, 1}, + {0x06204c, 1}, + {0x062054, 1}, + {0x06205c, 3}, + {0x06206c, 1}, + {0x062080, 2}, + {0x062094, 1}, + {0x06209c, 2}, + {0x0620b0, 1}, + {0x0620c0, 1}, + {0x0620d0, 1}, + {0x0620e0, 1}, + {0x0620f4, 4}, + {0x0621c0, 2}, + {0x0621e4, 3}, + {0x0621f4, 3}, + {0x062404, 1}, + {0x06240c, 4}, + {0x062800, 12}, + {0x062834, 1}, + {0x06283c, 5}, + {0x062854, 1}, + {0x062900, 3}, + {0x062914, 1}, + {0x06291c, 1}, + {0x062924, 1}, + {0x06292c, 1}, + {0x062934, 1}, + {0x06293c, 2}, + {0x062950, 3}, + {0x062960, 2}, + {0x063000, 1}, + {0x063010, 4}, + {0x063024, 1}, + {0x06302c, 1}, + {0x063034, 1}, + {0x063044, 1}, + {0x06304c, 1}, + {0x063054, 1}, + {0x06305c, 2}, + {0x063070, 1}, + {0x063080, 1}, + {0x063090, 1}, + {0x0630a4, 3}, + {0x063100, 2}, + {0x063144, 1}, + {0x06314c, 1}, + {0x063154, 1}, + {0x063164, 1}, + {0x06316c, 1}, + {0x063174, 1}, + {0x063180, 16}, + {0x063208, 1}, + {0x063210, 1}, + {0x063218, 1}, + {0x063224, 3}, + {0x063240, 10}, + {0x063300, 8}, + {0x064004, 1}, + {0x06400c, 4}, + {0x064200, 3}, + {0x064300, 1}, + {0x064308, 6}, + {0x064324, 2}, + {0x064338, 2}, + {0x064380, 2}, + {0x064394, 1}, + {0x06439c, 2}, + {0x064400, 2}, + {0x064420, 3}, + {0x064430, 1}, + {0x064440, 1}, + {0x064484, 2}, + {0x0644b8, 18}, + {0x064504, 1}, + {0x06450c, 1}, + {0x064514, 1}, + {0x064804, 1}, + {0x064884, 2}, + {0x0648c8, 14}, + {0x065018, 2}, + {0x065080, 9}, + {0x0650c0, 16}, + {0x065104, 3}, + {0x065114, 1}, + {0x065200, 3}, + {0x065220, 6}, + {0x065240, 6}, + {0x065260, 6}, + {0x065280, 6}, + {0x065300, 1}, + {0x065f00, 2}, + {0x066000, 2}, + {0x066028, 2}, + {0x066050, 2}, + {0x066078, 2}, + {0x0660a0, 24}, + {0x0661ac, 1}, + {0x0661d8, 4}, + {0x066200, 6}, + {0x066220, 6}, + {0x066240, 6}, + {0x066260, 6}, + {0x066280, 3}, + {0x066400, 6}, + {0x066500, 17}, + {0x066550, 8}, + {0x066574, 3}, + {0x070004, 1}, + {0x07000c, 4}, + {0x070400, 7}, + {0x070420, 11}, + {0x070500, 5}, + {0x070524, 1}, + {0x07052c, 1}, + {0x070534, 1}, + {0x070540, 4}, + {0x070600, 142}, + {0x070884, 2}, + {0x0708b4, 19}, + {0x070904, 1}, + {0x070940, 16}, + {0x071000, 1}, + {0x071094, 1}, + {0x071104, 2}, + {0x071140, 16}, + {0x071204, 1}, + {0x07120c, 4}, + {0x071404, 1}, + {0x07140c, 1}, + {0x071414, 1}, + {0x07141c, 1}, + {0x071424, 1}, + {0x07142c, 1}, + {0x071434, 1}, + {0x07143c, 1}, + {0x071800, 2}, + {0x071904, 1}, + {0x071940, 16}, + {0x071a04, 1}, + {0x071a40, 16}, + {0x071b04, 1}, + {0x071b40, 16}, + {0x072000, 3}, + {0x072804, 1}, + {0x07280c, 4}, + {0x072880, 1}, + {0x072888, 3}, + {0x073000, 1}, + {0x073020, 1}, + {0x073040, 1}, + {0x073060, 1}, + {0x073080, 1}, + {0x0730a0, 1}, + {0x0730c0, 1}, + {0x0730e0, 1}, + {0x073100, 1}, + {0x073120, 1}, + {0x073140, 
1}, + {0x073160, 1}, + {0x073180, 1}, + {0x0731a0, 1}, + {0x0731c0, 1}, + {0x0731e0, 1}, + {0x073200, 1}, + {0x073220, 1}, + {0x073240, 1}, + {0x073260, 1}, + {0x073280, 1}, + {0x0732a0, 1}, + {0x0732c0, 1}, + {0x0732e0, 1}, + {0x073300, 1}, + {0x073320, 1}, + {0x073340, 1}, + {0x073360, 1}, + {0x073380, 1}, + {0x0733a0, 1}, + {0x0733c0, 1}, + {0x0733e0, 1}, + {0x073800, 16}, + {0x074830, 4}, + {0x074884, 1}, + {0x074890, 4}, + {0x074900, 3}, + {0x074920, 6}, + {0x074940, 6}, + {0x074980, 9}, + {0x0749c0, 16}, + {0x074c00, 1}, + {0x074c28, 1}, + {0x074c50, 1}, + {0x074c78, 1}, + {0x074ca0, 12}, + {0x074dac, 1}, + {0x074dd8, 4}, + {0x074e00, 6}, + {0x074e20, 6}, + {0x074e40, 3}, + {0x075000, 3}, + {0x075010, 2}, + {0x075020, 3}, + {0x075100, 2}, + {0x078000, 20}, + {0x078054, 9}, + {0x07807c, 3}, + {0x07808c, 7}, + {0x078100, 5}, + {0x078118, 4}, + {0x078130, 2}, + {0x078204, 2}, + {0x078244, 15}, + {0x078284, 2}, + {0x0782c4, 17}, + {0x07830c, 4}, + {0x078320, 2}, + {0x078340, 2}, + {0x078360, 2}, + {0x078380, 2}, + {0x0783a0, 2}, + {0x0783d4, 1}, + {0x0783dc, 1}, + {0x078404, 1}, + {0x07840c, 4}, + {0x078804, 18}, + {0x078850, 4}, + {0x078884, 1}, + {0x07889c, 1}, + {0x0788a4, 23}, + {0x079000, 9}, + {0x079040, 16}, + {0x079804, 2}, + {0x079828, 6}, + {0x079844, 2}, + {0x079868, 6}, + {0x079884, 2}, + {0x0798a8, 7}, + {0x0798d0, 7}, + {0x079904, 1}, + {0x07990c, 1}, + {0x079914, 1}, + {0x079a04, 2}, + {0x079a40, 16}, + {0x079a84, 2}, + {0x079a90, 2}, + {0x079a9c, 13}, + {0x079b04, 2}, + {0x079b3c, 17}, + {0x079c04, 2}, + {0x079c40, 16}, + {0x079c84, 2}, + {0x079c98, 2}, + {0x079cc4, 13}, + {0x079cfc, 7}, + {0x079d24, 1}, + {0x079d2c, 1}, + {0x079d80, 2}, + {0x079d8c, 1}, + {0x079f00, 6}, + {0x079f20, 6}, + {0x079f40, 6}, + {0x079f60, 6}, + {0x079fa0, 3}, + {0x079fb0, 1}, + {0x07a000, 3}, + {0x07a010, 3}, + {0x07a020, 3}, + {0x07a040, 9}, + {0x07a068, 6}, + {0x07a090, 2}, + {0x07a0a0, 3}, + {0x07a0c0, 12}, + {0x07a0f4, 1}, + {0x07a100, 2}, + {0x07a110, 3}, + {0x07a120, 1}, + {0x07a130, 11}, + {0x07a160, 2}, + {0x07a180, 4}, + {0x07a194, 3}, + {0x07a1a4, 2}, + {0x07a1b0, 4}, + {0x07a2e0, 5}, + {0x07a2f8, 2}, + {0x07a30c, 13}, + {0x07a34c, 77}, + {0x07a48c, 18}, + {0x07a500, 32}, + {0x07a800, 16}, + {0x07a84c, 18}, + {0x07a8e0, 2}, + {0x07a8ec, 3}, + {0x07a900, 1}, + {0x07a910, 5}, + {0x07a930, 5}, + {0x07a950, 5}, + {0x07a970, 5}, + {0x07a9a4, 3}, + {0x07aa78, 34}, + {0x07ab3c, 18}, + {0x07ab94, 3}, + {0x07aba4, 3}, + {0x07abf0, 1}, + {0x07abf8, 10}, + {0x07ac30, 1}, + {0x07ac44, 4}, + {0x07ac60, 1}, + {0x07ac80, 1}, + {0x07ac90, 1}, + {0x07b004, 6}, + {0x080000, 29}, + {0x080340, 14}, + {0x08037c, 3}, + {0x08038c, 1}, + {0x0803c0, 2}, + {0x0803e4, 3}, + {0x0803f4, 3}, + {0x080404, 6}, + {0x080804, 2}, + {0x080874, 35}, + {0x081000, 129}, + {0x081210, 4}, + {0x081228, 3}, + {0x081240, 2}, + {0x081264, 2}, + {0x081274, 3}, + {0x081284, 2}, + {0x081298, 2}, + {0x0812a4, 1}, + {0x0812b4, 21}, + {0x081310, 8}, + {0x081344, 1}, + {0x08134c, 1}, + {0x081354, 1}, + {0x081364, 2}, + {0x081370, 4}, + {0x081384, 2}, + {0x081390, 4}, + {0x081404, 2}, + {0x081478, 34}, + {0x081504, 2}, + {0x081518, 14}, + {0x081580, 5}, + {0x081598, 2}, + {0x0815a4, 10}, + {0x082000, 29}, + {0x082340, 14}, + {0x08237c, 3}, + {0x08238c, 1}, + {0x0823c0, 2}, + {0x0823e4, 3}, + {0x0823f4, 3}, + {0x082404, 1}, + {0x08240c, 4}, + {0x082804, 2}, + {0x082874, 35}, + {0x082904, 2}, + {0x082974, 35}, + {0x083000, 129}, + {0x083210, 6}, + {0x083244, 2}, + {0x083254, 7}, + {0x083284, 1}, + {0x08328c, 1}, + {0x083294, 1}, + 
{0x0832a4, 1}, + {0x0832b4, 19}, + {0x083304, 2}, + {0x083310, 4}, + {0x083324, 2}, + {0x083330, 14}, + {0x084000, 29}, + {0x084340, 14}, + {0x08437c, 3}, + {0x08438c, 1}, + {0x0843c0, 2}, + {0x0843e4, 3}, + {0x0843f4, 3}, + {0x084404, 1}, + {0x08440c, 4}, + {0x084804, 2}, + {0x084874, 35}, + {0x084904, 2}, + {0x084974, 35}, + {0x085000, 32}, + {0x085200, 1}, + {0x085210, 7}, + {0x085240, 12}, + {0x085280, 2}, + {0x0852a4, 1}, + {0x0852b4, 3}, + {0x085304, 1}, + {0x08530c, 1}, + {0x085314, 1}, + {0x085324, 2}, + {0x085334, 3}, + {0x085344, 2}, + {0x085358, 2}, + {0x085364, 2}, + {0x085378, 2}, + {0x085384, 2}, + {0x085398, 2}, + {0x0853c0, 23}, + {0x086000, 2}, + {0x086020, 2}, + {0x086040, 1}, + {0x086400, 11}, + {0x086800, 3}, + {0x086820, 6}, + {0x086840, 6}, + {0x086860, 6}, + {0x086880, 6}, + {0x0868a0, 6}, + {0x0868c0, 6}, + {0x0868e0, 6}, + {0x086900, 9}, + {0x086940, 16}, + {0x087000, 26}, + {0x087100, 1}, + {0x087108, 1}, + {0x087110, 1}, + {0x087118, 1}, + {0x087120, 1}, + {0x087128, 1}, + {0x087130, 1}, + {0x087138, 1}, + {0x087140, 1}, + {0x087148, 1}, + {0x087150, 1}, + {0x087158, 1}, + {0x087160, 1}, + {0x087168, 1}, + {0x087170, 1}, + {0x087178, 1}, + {0x087180, 10}, + {0x0871b0, 9}, + {0x087200, 1}, + {0x087208, 1}, + {0x087210, 3}, + {0x090000, 17}, + {0x090060, 2}, + {0x09006c, 1}, + {0x090104, 1}, + {0x090140, 25}, + {0x0901a8, 2}, + {0x0901c0, 9}, + {0x0901e8, 2}, + {0x090204, 1}, + {0x090220, 24}, + {0x090300, 6}, + {0x090320, 9}, + {0x090348, 1}, + {0x090350, 1}, + {0x090400, 6}, + {0x090420, 9}, + {0x090448, 1}, + {0x090450, 1}, + {0x090500, 6}, + {0x090520, 6}, + {0x090540, 2}, + {0x090564, 2}, + {0x090578, 3}, + {0x091004, 3}, + {0x091800, 8}, + {0x091824, 2}, + {0x091830, 10}, + {0x091860, 6}, + {0x092000, 32}, + {0x093000, 1}, + {0x093020, 1}, + {0x093040, 1}, + {0x093060, 1}, + {0x093080, 1}, + {0x0930a0, 1}, + {0x0930c0, 1}, + {0x0930e0, 1}, + {0x093100, 1}, + {0x0931a0, 1}, + {0x0931c0, 1}, + {0x093200, 3}, + {0x093404, 1}, + {0x093440, 16}, + {0x093504, 1}, + {0x09353c, 28}, + {0x0935b0, 2}, + {0x0935c0, 3}, + {0x094000, 9}, + {0x094040, 19}, + {0x0940c0, 1}, + {0x094800, 1}, + {0x094828, 1}, + {0x094850, 1}, + {0x094878, 1}, + {0x0948a0, 8}, + {0x0949ac, 1}, + {0x0949d8, 4}, + {0x094a00, 6}, + {0x094a20, 6}, + {0x094a40, 3}, + {0x096000, 1}, + {0x096010, 4}, + {0x096028, 3}, + {0x096104, 1}, + {0x09610c, 7}, + {0x096204, 1}, + {0x09620c, 4}, + {0x096488, 1}, + {0x096498, 3}, + {0x0964b0, 4}, + {0x096504, 1}, + {0x09650c, 4}, + {0x096584, 4}, + {0x096600, 16}, + {0x096644, 2}, + {0x096658, 10}, + {0x096684, 2}, + {0x0966bc, 51}, + {0x096800, 11}, + {0x096884, 3}, + {0x0968a0, 12}, + {0x097100, 6}, + {0x097120, 1}, + {0x0a0000, 10}, + {0x0a0030, 4}, + {0x0a0080, 6}, + {0x0a00a0, 8}, + {0x0a0400, 7}, + {0x0a0420, 2}, + {0x0a0464, 3}, + {0x0a0480, 6}, + {0x0a04a0, 6}, + {0x0a0500, 30}, + {0x0a0580, 2}, + {0x0a0800, 14}, + {0x0a0840, 6}, + {0x0a0860, 18}, + {0x0a1004, 1}, + {0x0a100c, 4}, + {0x0a1044, 2}, + {0x0a1058, 2}, + {0x0a1064, 2}, + {0x0a1074, 6}, + {0x0a1090, 9}, + {0x0a1204, 1}, + {0x0a120c, 4}, + {0x0a1244, 2}, + {0x0a1254, 6}, + {0x0a1270, 6}, + {0x0a1300, 1}, + {0x0a1404, 1}, + {0x0a1440, 16}, + {0x0a1484, 1}, + {0x0a148c, 4}, + {0x0a14c4, 1}, + {0x0a14d0, 12}, + {0x0a1504, 1}, + {0x0a1510, 12}, + {0x0a1544, 1}, + {0x0a1550, 12}, + {0x0a1584, 1}, + {0x0a1590, 12}, + {0x0a15c4, 1}, + {0x0a15cc, 5}, + {0x0a15e4, 1}, + {0x0a15ec, 5}, + {0x0a1604, 1}, + {0x0a160c, 5}, + {0x0a1624, 1}, + {0x0a162c, 5}, + {0x0a1644, 1}, + {0x0a164c, 5}, + {0x0a1664, 1}, + 
{0x0a166c, 5}, + {0x0a1684, 1}, + {0x0a168c, 5}, + {0x0a16a4, 1}, + {0x0a16ac, 5}, + {0x0a16c4, 7}, + {0x0a16e4, 1}, + {0x0a16ec, 1}, + {0x0a16f4, 1}, + {0x0a16fc, 1}, + {0x0a1704, 17}, + {0x0a1754, 1}, + {0x0a175c, 1}, + {0x0a1764, 2}, + {0x0a1774, 3}, + {0x0a1800, 18}, + {0x0a1900, 3}, + {0x0a1948, 3}, + {0x0a1958, 6}, + {0x0a1974, 8}, + {0x0a2004, 1}, + {0x0a200c, 4}, + {0x0a2400, 2}, + {0x0a240c, 6}, + {0x0a2440, 1}, + {0x0a2450, 4}, + {0x0a2468, 3}, + {0x0a2480, 1}, + {0x0a24a0, 10}, + {0x0a24d0, 9}, + {0x0a2804, 1}, + {0x0a280c, 4}, + {0x0a2c00, 2}, + {0x0a2c0c, 2}, + {0x0a2c40, 1}, + {0x0a2c50, 4}, + {0x0a2c68, 3}, + {0x0a2c88, 2}, + {0x0a2cf0, 1}, + {0x0a3004, 1}, + {0x0a300c, 4}, + {0x0a3040, 2}, + {0x0a3064, 3}, + {0x0a3074, 4}, + {0x0a3200, 9}, + {0x0a3230, 2}, + {0x0a323c, 2}, + {0x0a3248, 4}, + {0x0a3400, 1}, + {0x0a3408, 3}, + {0x0a3418, 4}, + {0x0a3430, 2}, + {0x0a343c, 1}, + {0x0a3480, 1}, + {0x0a3490, 1}, + {0x0a3504, 1}, + {0x0a3510, 76}, + {0x0a4000, 48}, + {0x0a4100, 3}, + {0x0a4110, 6}, + {0x0a412c, 4}, + {0x0a4140, 1}, + {0x0a4304, 1}, + {0x0a4318, 10}, + {0x0a4804, 1}, + {0x0a480c, 4}, + {0x0a4840, 2}, + {0x0a4864, 3}, + {0x0a4874, 3}, + {0x0a4c04, 2}, + {0x0a4c10, 2}, + {0x0a4c1c, 6}, + {0x0a4c38, 2}, + {0x0a4c50, 8}, + {0x0a4c78, 19}, + {0x0a4d04, 2}, + {0x0a4d40, 21}, + {0x0a4da4, 1}, + {0x0a4dac, 1}, + {0x0a4db4, 1}, + {0x0a4dc0, 1}, + {0x0a5000, 14}, + {0x0a6000, 2}, + {0x0a6028, 2}, + {0x0a6050, 2}, + {0x0a6078, 2}, + {0x0a60a0, 35}, + {0x0a61ac, 1}, + {0x0a61d8, 4}, + {0x0a6200, 6}, + {0x0a6220, 6}, + {0x0a6240, 6}, + {0x0a6260, 6}, + {0x0a6280, 3}, + {0x0a6400, 3}, + {0x0a6420, 6}, + {0x0a6440, 6}, + {0x0a6460, 6}, + {0x0a6480, 6}, + {0x0a64a0, 6}, + {0x0a64c0, 6}, + {0x0a6500, 9}, + {0x0a6540, 18}, + {0x0a65c0, 4}, + {0x0a8000, 10}, + {0x0a802c, 15}, + {0x0a806c, 5}, + {0x0a8408, 5}, + {0x0a8424, 3}, + {0x0a8434, 6}, + {0x0a8450, 2}, + {0x0a845c, 5}, + {0x0a84c4, 1}, + {0x0a84cc, 4}, + {0x0a8604, 1}, + {0x0a860c, 4}, + {0x0a8700, 17}, + {0x0a8750, 4}, + {0x0a8800, 4}, + {0x0a8880, 1}, + {0x0a88a0, 1}, + {0x0a88c0, 1}, + {0x0a8900, 1}, + {0x0a8960, 1}, + {0x0a8980, 4}, + {0x0a8994, 1}, + {0x0a899c, 1}, + {0x0a89a4, 3}, + {0x0a89c0, 1}, + {0x0a8a00, 4}, + {0x0a8aa4, 1}, + {0x0a8aac, 1}, + {0x0a8ab4, 1}, + {0x0a8ad4, 10}, + {0x0a8b00, 2}, + {0x0a8b80, 4}, + {0x0a8c04, 6}, + {0x0a9000, 3}, + {0x0a9010, 21}, + {0x0a9080, 4}, + {0x0a9100, 10}, + {0x0a91f4, 3}, + {0x0a920c, 35}, + {0x0a92a4, 7}, + {0x0a930c, 39}, + {0x0a940c, 35}, + {0x0a949c, 9}, + {0x0a9500, 2}, + {0x0a950c, 3}, + {0x0a951c, 3}, + {0x0a954c, 13}, + {0x0a9804, 1}, + {0x0a980c, 4}, + {0x0a9c00, 2}, + {0x0a9c0c, 3}, + {0x0a9c1c, 2}, + {0x0a9c28, 1}, + {0x0a9c44, 1}, + {0x0a9c60, 17}, + {0x0a9d00, 1}, + {0x0a9d20, 8}, + {0x0a9d48, 3}, + {0x0a9d80, 1}, + {0x0a9dc4, 3}, + {0x0a9dd4, 3}, + {0x0a9de4, 3}, + {0x0a9df4, 4}, + {0x0a9e20, 8}, + {0x0a9e48, 3}, + {0x0a9e84, 1}, + {0x0a9e98, 1}, + {0x0a9ea0, 8}, + {0x0a9f00, 6}, + {0x0a9f20, 6}, + {0x0a9f40, 6}, + {0x0a9f60, 6}, + {0x0a9f80, 9}, + {0x0a9fc0, 31}, + {0x0aa204, 1}, + {0x0aa20c, 4}, + {0x0b0000, 2}, + {0x0b0010, 4}, + {0x0b1000, 15}, + {0x0b1040, 10}, + {0x0b1080, 20}, + {0x0b1100, 2}, + {0x0b1110, 2}, + {0x0b1120, 2}, + {0x0b1160, 2}, + {0x0b116c, 1}, + {0x0b1180, 3}, + {0x0b1190, 3}, + {0x0b11a0, 3}, + {0x0b11e0, 2}, + {0x0b11ec, 1}, + {0x0b1200, 2}, + {0x0b1210, 2}, + {0x0b1260, 2}, + {0x0b126c, 1}, + {0x0b1280, 3}, + {0x0b1290, 3}, + {0x0b12e0, 2}, + {0x0b12ec, 1}, + {0x0b1300, 2}, + {0x0b1310, 2}, + {0x0b1320, 2}, + {0x0b1360, 2}, + 
{0x0b136c, 1}, + {0x0b1380, 3}, + {0x0b1390, 3}, + {0x0b13e0, 2}, + {0x0b13ec, 1}, + {0x0b1400, 3}, + {0x0b1410, 3}, + {0x0b1460, 2}, + {0x0b146c, 1}, + {0x0b1480, 3}, + {0x0b1490, 7}, + {0x0b14b0, 4}, + {0x0b14e0, 2}, + {0x0b14ec, 1}, + {0x0b1500, 3}, + {0x0b1510, 3}, + {0x0b1560, 2}, + {0x0b156c, 1}, + {0x0b1580, 2}, + {0x0b1590, 2}, + {0x0b15e0, 2}, + {0x0b15ec, 1}, + {0x0b1600, 3}, + {0x0b1610, 3}, + {0x0b1660, 2}, + {0x0b166c, 1}, + {0x0b1680, 3}, + {0x0b1690, 3}, + {0x0b16e0, 2}, + {0x0b16ec, 1}, + {0x0b1700, 8}, + {0x0b1760, 2}, + {0x0b176c, 1}, + {0x0b1780, 3}, + {0x0b1790, 3}, + {0x0b17e0, 2}, + {0x0b17ec, 1}, + {0x0b1800, 9}, + {0x0b1840, 16}, + {0x0b2000, 2}, + {0x0b2010, 24}, + {0x0b2080, 9}, + {0x0b20c0, 18}, + {0x0b2110, 24}, + {0x0b2180, 9}, + {0x0b21c0, 18}, + {0x0b2210, 24}, + {0x0b2280, 9}, + {0x0b22c0, 18}, + {0x0b2310, 24}, + {0x0b2380, 9}, + {0x0b23c0, 18}, + {0x0b2410, 24}, + {0x0b2480, 9}, + {0x0b24c0, 18}, + {0x0b2510, 24}, + {0x0b2580, 9}, + {0x0b25c0, 18}, + {0x0b2610, 24}, + {0x0b2680, 9}, + {0x0b26c0, 18}, + {0x0b2710, 24}, + {0x0b2780, 9}, + {0x0b27c0, 16}, + {0x0b2900, 1}, + {0x0b2910, 1}, + {0x0b3000, 157}, + {0x0b3278, 26}, + {0x0b3300, 9}, + {0x0b3404, 1}, + {0x0b340c, 4}, + {0x0b3800, 73}, + {0x0b3940, 20}, + {0x0b39a0, 4}, + {0x0b39c4, 5}, + {0x0b3a00, 1}, + {0x0b3a20, 8}, + {0x0b3a44, 3}, + {0x0b4000, 7}, + {0x0b4040, 12}, + {0x0b4100, 9}, + {0x0b4140, 16}, + {0x0b5000, 2}, + {0x0b5028, 2}, + {0x0b5050, 2}, + {0x0b5078, 2}, + {0x0b50a0, 5}, + {0x0b51ac, 1}, + {0x0b51d8, 4}, + {0x0b5200, 6}, + {0x0b5220, 6}, + {0x0b5240, 6}, + {0x0b5260, 6}, + {0x0b5280, 3}, + {0x0b6800, 3}, + {0x0b6820, 6}, + {0x0b6840, 6}, + {0x0b6860, 6}, + {0x0b6880, 6}, + {0x0b68a0, 6}, + {0x0b68c0, 6}, + {0x0b68e0, 6}, + {0x0b6900, 6}, + {0x0d0000, 5}, + {0x0d0038, 4}, + {0x0d0050, 4}, + {0x0d0080, 3}, + {0x0d00a0, 6}, + {0x0d00c0, 6}, + {0x0d0100, 3}, + {0x0d0120, 6}, + {0x0d0140, 6}, + {0x0d0180, 14}, + {0x0d01c0, 6}, + {0x0d1000, 2}, + {0x0d100c, 1}, + {0x0d1020, 13}, + {0x0d1058, 1}, + {0x0d1060, 6}, + {0x0d1080, 1}, + {0x0d1100, 2}, + {0x0d110c, 1}, + {0x0d1120, 13}, + {0x0d1158, 1}, + {0x0d1160, 6}, + {0x0d1180, 1}, + {0x0d1200, 2}, + {0x0d120c, 1}, + {0x0d1220, 13}, + {0x0d1258, 1}, + {0x0d1260, 6}, + {0x0d1280, 1}, + {0x0d1300, 2}, + {0x0d130c, 1}, + {0x0d1320, 13}, + {0x0d1358, 1}, + {0x0d1360, 6}, + {0x0d1380, 1}, + {0x0d1400, 3}, + {0x0d1410, 10}, + {0x0d1440, 1}, + {0x0d1450, 6}, + {0x0d1484, 6}, + {0x0d14a0, 6}, + {0x0d1504, 25}, + {0x0d1580, 14}, + {0x0d15c0, 4}, + {0x0d1600, 6}, + {0x0d1620, 6}, + {0x0d1640, 3}, + {0x0d1660, 6}, + {0x0d1700, 2}, + {0x0d170c, 3}, + {0x0d1720, 6}, + {0x0d1800, 11}, + {0x0d1830, 7}, + {0x0d1850, 7}, + {0x0d1870, 9}, + {0x0d18a0, 2}, + {0x0d18c0, 2}, + {0x0d1a00, 1}, + {0x0d1a08, 13}, + {0x0d1a40, 11}, + {0x0d1c00, 24}, + {0x0d1c64, 5}, + {0x0d1c80, 3}, + {0x0d1c90, 2}, + {0x0d1d00, 15}, + {0x0d1d40, 16}, + {0x0d1d90, 4}, + {0x0d1dc0, 12}, + {0x0d1e00, 1}, + {0x0d1e20, 1}, + {0x0d1e28, 12}, + {0x0d1e5c, 7}, + {0x0d1f00, 14}, + {0x0d2000, 2}, + {0x0d200c, 1}, + {0x0d2020, 13}, + {0x0d2058, 1}, + {0x0d2060, 6}, + {0x0d2080, 1}, + {0x0d2100, 2}, + {0x0d210c, 1}, + {0x0d2120, 13}, + {0x0d2158, 1}, + {0x0d2160, 6}, + {0x0d2180, 1}, + {0x0d2200, 2}, + {0x0d220c, 1}, + {0x0d2220, 13}, + {0x0d2258, 1}, + {0x0d2260, 6}, + {0x0d2280, 1}, + {0x0d2300, 2}, + {0x0d230c, 1}, + {0x0d2320, 13}, + {0x0d2358, 1}, + {0x0d2360, 6}, + {0x0d2380, 1}, + {0x0d2400, 3}, + {0x0d2410, 10}, + {0x0d2440, 1}, + {0x0d2450, 6}, + {0x0d2484, 6}, + {0x0d24a0, 6}, + 
{0x0d2504, 25}, + {0x0d2580, 14}, + {0x0d25c0, 4}, + {0x0d2600, 6}, + {0x0d2620, 6}, + {0x0d2640, 3}, + {0x0d2660, 6}, + {0x0d2700, 2}, + {0x0d270c, 3}, + {0x0d2720, 6}, + {0x0d2800, 11}, + {0x0d2830, 7}, + {0x0d2850, 7}, + {0x0d2870, 9}, + {0x0d28a0, 2}, + {0x0d28c0, 2}, + {0x0d2a00, 1}, + {0x0d2a08, 13}, + {0x0d2a40, 11}, + {0x0d2c00, 24}, + {0x0d2c64, 5}, + {0x0d2c80, 3}, + {0x0d2c90, 2}, + {0x0d2d00, 15}, + {0x0d2d40, 16}, + {0x0d2d90, 4}, + {0x0d2dc0, 12}, + {0x0d2e00, 1}, + {0x0d2e20, 1}, + {0x0d2e28, 12}, + {0x0d2e5c, 7}, + {0x0d2f00, 14}, + {0x0d3000, 2}, + {0x0d3400, 1}, + {0x0d3428, 1}, + {0x0d3450, 1}, + {0x0d3478, 1}, + {0x0d34a0, 5}, + {0x0d35ac, 1}, + {0x0d35d8, 4}, + {0x0d3600, 6}, + {0x0d3620, 6}, + {0x0d3640, 3}, + {0x0d3e00, 3}, + {0x0d3e20, 6}, + {0x0d3e40, 6}, + {0x0d3e80, 3}, + {0x0d3e90, 1}, + {0x0d4000, 29}, + {0x0d4078, 4}, + {0x0d4090, 2}, + {0x0d40a0, 7}, + {0x0d40c0, 11}, + {0x0d4100, 14}, + {0x0d4140, 14}, + {0x0d4180, 61}, + {0x0d4278, 4}, + {0x0d4290, 2}, + {0x0d42a0, 7}, + {0x0d42c0, 11}, + {0x0d4300, 14}, + {0x0d4340, 14}, + {0x0d4380, 61}, + {0x0d4478, 4}, + {0x0d4490, 2}, + {0x0d44a0, 7}, + {0x0d44c0, 11}, + {0x0d4500, 14}, + {0x0d4540, 14}, + {0x0d4580, 61}, + {0x0d4678, 4}, + {0x0d4690, 2}, + {0x0d46a0, 7}, + {0x0d46c0, 11}, + {0x0d4700, 14}, + {0x0d4740, 14}, + {0x0d4780, 62}, + {0x0d487c, 1}, + {0x0d4a00, 8}, + {0x0d4a24, 15}, + {0x0d4a64, 30}, + {0x0d4b00, 4}, + {0x0d4b20, 3}, + {0x0d4c00, 6}, + {0x0d4c40, 14}, + {0x0d4c80, 9}, + {0x0d4d00, 9}, + {0x0d4d2c, 1}, + {0x0d4d40, 3}, + {0x0d4d60, 1}, + {0x0d4d80, 3}, + {0x0d4e00, 2}, + {0x0d4e0c, 1}, + {0x0d4e14, 5}, + {0x0d4e2c, 1}, + {0x0d4e34, 5}, + {0x0d4e4c, 1}, + {0x0d4e54, 5}, + {0x0d4e6c, 1}, + {0x0d4e74, 5}, + {0x0d4e8c, 1}, + {0x0d4e94, 5}, + {0x0d4eac, 1}, + {0x0d4eb4, 3}, + {0x0d5000, 29}, + {0x0d5078, 4}, + {0x0d5090, 2}, + {0x0d50a0, 7}, + {0x0d50c0, 11}, + {0x0d5100, 14}, + {0x0d5140, 14}, + {0x0d5180, 61}, + {0x0d5278, 4}, + {0x0d5290, 2}, + {0x0d52a0, 7}, + {0x0d52c0, 11}, + {0x0d5300, 14}, + {0x0d5340, 14}, + {0x0d5380, 61}, + {0x0d5478, 4}, + {0x0d5490, 2}, + {0x0d54a0, 7}, + {0x0d54c0, 11}, + {0x0d5500, 14}, + {0x0d5540, 14}, + {0x0d5580, 61}, + {0x0d5678, 4}, + {0x0d5690, 2}, + {0x0d56a0, 7}, + {0x0d56c0, 11}, + {0x0d5700, 14}, + {0x0d5740, 14}, + {0x0d5780, 62}, + {0x0d587c, 1}, + {0x0d5a00, 8}, + {0x0d5a24, 15}, + {0x0d5a64, 30}, + {0x0d5b00, 4}, + {0x0d5b20, 3}, + {0x0d5c00, 6}, + {0x0d5c40, 14}, + {0x0d5c80, 9}, + {0x0d5d00, 9}, + {0x0d5d2c, 1}, + {0x0d5d40, 3}, + {0x0d5d60, 1}, + {0x0d5d80, 3}, + {0x0d5e00, 2}, + {0x0d5e0c, 1}, + {0x0d5e14, 5}, + {0x0d5e2c, 1}, + {0x0d5e34, 5}, + {0x0d5e4c, 1}, + {0x0d5e54, 5}, + {0x0d5e6c, 1}, + {0x0d5e74, 5}, + {0x0d5e8c, 1}, + {0x0d5e94, 5}, + {0x0d5eac, 1}, + {0x0d5eb4, 3}, + {0x0d6000, 15}, + {0x0d6070, 3}, + {0x0d6080, 6}, + {0x0d6100, 9}, + {0x0d6204, 8}, + {0x0d6240, 13}, + {0x0d6280, 16}, + {0x0d6400, 8}, + {0x0d6424, 15}, + {0x0d6464, 15}, + {0x0d64a4, 15}, + {0x0d64e4, 30}, + {0x0d6580, 10}, + {0x0d65ac, 1}, + {0x0d65b4, 5}, + {0x0d65cc, 1}, + {0x0d65d4, 5}, + {0x0d65ec, 1}, + {0x0d65f4, 13}, + {0x0d6680, 7}, + {0x0d66a0, 5}, + {0x0d66c0, 5}, + {0x0d66e0, 4}, + {0x0d6800, 19}, + {0x0d6850, 10}, + {0x0d6880, 19}, + {0x0d68d0, 10}, + {0x0d6900, 19}, + {0x0d6950, 10}, + {0x0d6980, 19}, + {0x0d69d0, 10}, + {0x0d6a00, 19}, + {0x0d6a50, 10}, + {0x0d6a80, 19}, + {0x0d6ad0, 10}, + {0x0d6b00, 19}, + {0x0d6b50, 10}, + {0x0d6b80, 19}, + {0x0d6bd0, 10}, + {0x0d6c00, 19}, + {0x0d6c60, 6}, + {0x0d6c84, 1}, + {0x0d6c94, 8}, + {0x0d6cb8, 9}, + 
{0x0d6ce0, 4}, + {0x0d7000, 9}, + {0x0d7040, 16}, + {0x0d8000, 6}, + {0x0d8020, 3}, + {0x0d8030, 3}, + {0x0d8040, 6}, + {0x0d8060, 17}, + {0x0d80c0, 38}, + {0x0d8180, 1}, + {0x0d8400, 2}, + {0x0d8428, 2}, + {0x0d8450, 2}, + {0x0d8478, 2}, + {0x0d84a0, 16}, + {0x0d85ac, 1}, + {0x0d85d8, 4}, + {0x0d8600, 6}, + {0x0d8620, 6}, + {0x0d8640, 6}, + {0x0d8660, 6}, + {0x0d8680, 3}, + {0x0d8800, 2}, + {0x0d9000, 35}, + {0x0d9100, 26}, + {0x0d916c, 7}, + {0x0d91a0, 1}, + {0x0d91c0, 9}, + {0x0d91e8, 1}, + {0x0d9200, 6}, + {0x0d9220, 6}, + {0x0d9248, 4}, + {0x0d9280, 6}, + {0x0d929c, 1}, + {0x0d92a4, 2}, + {0x0d92b8, 9}, + {0x0d9304, 4}, + {0x0d9328, 3}, + {0x0d9340, 6}, + {0x0d9400, 1}, + {0x0d9408, 1}, + {0x0d9410, 2}, + {0x0d9424, 2}, + {0x0d9444, 1}, + {0x0d9480, 27}, + {0x0d9500, 6}, + {0x0d9520, 12}, + {0x0d9700, 7}, + {0x0d9744, 9}, + {0x0d976c, 2}, + {0x0d9780, 6}, + {0x0d97a0, 2}, + {0x0d9800, 35}, + {0x0d9900, 26}, + {0x0d996c, 7}, + {0x0d99a0, 1}, + {0x0d99c0, 9}, + {0x0d99e8, 1}, + {0x0d9a00, 6}, + {0x0d9a20, 6}, + {0x0d9a48, 4}, + {0x0d9a80, 6}, + {0x0d9a9c, 1}, + {0x0d9aa4, 2}, + {0x0d9ab8, 9}, + {0x0d9b04, 4}, + {0x0d9b28, 3}, + {0x0d9b40, 6}, + {0x0d9c00, 1}, + {0x0d9c08, 1}, + {0x0d9c10, 2}, + {0x0d9c24, 2}, + {0x0d9c44, 1}, + {0x0d9c80, 27}, + {0x0d9d00, 6}, + {0x0d9d20, 12}, + {0x0d9f00, 7}, + {0x0d9f44, 9}, + {0x0d9f6c, 2}, + {0x0d9f80, 6}, + {0x0d9fa0, 2}, + {0x0db000, 1028}, + {0x0dc018, 18}, + {0x0dc100, 4}, + {0x0dc118, 18}, + {0x0dc200, 12}, + {0x0dc300, 6}, + {0x0dc320, 5}, + {0x0dc340, 6}, + {0x0dc360, 5}, + {0x0dc380, 6}, + {0x0dc400, 9}, + {0x0dc440, 26}, + {0x0dc4c4, 1}, + {0x0dc4cc, 1}, + {0x0dc4d4, 1}, + {0x0dc50c, 7}, + {0x0dc544, 2}, + {0x0dc55c, 9}, + {0x0dc584, 7}, + {0x0dc5a4, 2}, + {0x0dc5b8, 2}, + {0x0dc5c4, 2}, + {0x0dc5d8, 2}, + {0x0dc600, 2}, + {0x0dcfbc, 15}, + {0x0dd000, 7}, + {0x0dd020, 6}, + {0x0dd040, 8}, + {0x0dd104, 1}, + {0x0dd10c, 1}, + {0x0dd200, 8}, + {0x0dd244, 2}, + {0x0dd268, 18}, + {0x0dd404, 1}, + {0x0dd440, 40}, + {0x0dd504, 3}, + {0x0dd514, 3}, + {0x0dd524, 3}, + {0x0dd534, 3}, + {0x0dd544, 3}, + {0x0dd554, 3}, + {0x0dd564, 3}, + {0x0dd574, 3}, + {0x0dd584, 3}, + {0x0dd594, 3}, + {0x0dd5a4, 3}, + {0x0dd5b4, 3}, + {0x0dd604, 2}, + {0x0dd640, 16}, + {0x0dd684, 3}, + {0x0dd704, 2}, + {0x0dd740, 18}, + {0x0ddc00, 4}, + {0x0ddc80, 1}, + {0x0ddd00, 6}, + {0x0ddd20, 6}, + {0x0ddd40, 6}, + {0x0ddd80, 1}, + {0x0dde00, 3}, + {0x0dde20, 10}, + {0x0dde50, 6}, + {0x0dde80, 9}, + {0x0ddec0, 16}, + {0x0de000, 123}, + {0x0de200, 2}, + {0x0de20c, 3}, + {0x0de220, 2}, + {0x0de22c, 3}, + {0x0de240, 2}, + {0x0de24c, 3}, + {0x0de260, 2}, + {0x0de26c, 3}, + {0x0de280, 2}, + {0x0de28c, 3}, + {0x0de2a0, 2}, + {0x0de2ac, 3}, + {0x0de2c0, 2}, + {0x0de2cc, 3}, + {0x0de2e0, 2}, + {0x0de2ec, 3}, + {0x0de300, 2}, + {0x0de30c, 3}, + {0x0de320, 2}, + {0x0de32c, 3}, + {0x0de340, 2}, + {0x0de34c, 3}, + {0x0de360, 2}, + {0x0de36c, 3}, + {0x0de380, 2}, + {0x0de38c, 3}, + {0x0de3a0, 2}, + {0x0de3ac, 3}, + {0x0de3c0, 2}, + {0x0de3cc, 3}, + {0x0de3e0, 2}, + {0x0de3ec, 3}, + {0x0de400, 123}, + {0x0de600, 2}, + {0x0de60c, 3}, + {0x0de620, 2}, + {0x0de62c, 3}, + {0x0de640, 2}, + {0x0de64c, 3}, + {0x0de660, 2}, + {0x0de66c, 3}, + {0x0de680, 2}, + {0x0de68c, 3}, + {0x0de6a0, 2}, + {0x0de6ac, 3}, + {0x0de6c0, 2}, + {0x0de6cc, 3}, + {0x0de6e0, 2}, + {0x0de6ec, 3}, + {0x0de700, 2}, + {0x0de70c, 3}, + {0x0de720, 2}, + {0x0de72c, 3}, + {0x0de740, 2}, + {0x0de74c, 3}, + {0x0de760, 2}, + {0x0de76c, 3}, + {0x0de780, 2}, + {0x0de78c, 3}, + {0x0de7a0, 2}, + {0x0de7ac, 3}, + {0x0de7c0, 2}, + 
{0x0de7cc, 3}, + {0x0de7e0, 2}, + {0x0de7ec, 3}, + {0x0de800, 123}, + {0x0dea00, 2}, + {0x0dea0c, 3}, + {0x0dea20, 2}, + {0x0dea2c, 3}, + {0x0dea40, 2}, + {0x0dea4c, 3}, + {0x0dea60, 2}, + {0x0dea6c, 3}, + {0x0dea80, 2}, + {0x0dea8c, 3}, + {0x0deaa0, 2}, + {0x0deaac, 3}, + {0x0deac0, 2}, + {0x0deacc, 3}, + {0x0deae0, 2}, + {0x0deaec, 3}, + {0x0deb00, 2}, + {0x0deb0c, 3}, + {0x0deb20, 2}, + {0x0deb2c, 3}, + {0x0deb40, 2}, + {0x0deb4c, 3}, + {0x0deb60, 2}, + {0x0deb6c, 3}, + {0x0deb80, 2}, + {0x0deb8c, 3}, + {0x0deba0, 2}, + {0x0debac, 3}, + {0x0debc0, 2}, + {0x0debcc, 3}, + {0x0debe0, 2}, + {0x0debec, 3}, + {0x0dec00, 123}, + {0x0dee00, 2}, + {0x0dee0c, 3}, + {0x0dee20, 2}, + {0x0dee2c, 3}, + {0x0dee40, 2}, + {0x0dee4c, 3}, + {0x0dee60, 2}, + {0x0dee6c, 3}, + {0x0dee80, 2}, + {0x0dee8c, 3}, + {0x0deea0, 2}, + {0x0deeac, 3}, + {0x0deec0, 2}, + {0x0deecc, 3}, + {0x0deee0, 2}, + {0x0deeec, 3}, + {0x0def00, 2}, + {0x0def0c, 3}, + {0x0def20, 2}, + {0x0def2c, 3}, + {0x0def40, 2}, + {0x0def4c, 3}, + {0x0def60, 2}, + {0x0def6c, 3}, + {0x0def80, 2}, + {0x0def8c, 3}, + {0x0defa0, 2}, + {0x0defac, 3}, + {0x0defc0, 2}, + {0x0defcc, 3}, + {0x0defe0, 2}, + {0x0defec, 3}, + {0x0df000, 123}, + {0x0df200, 2}, + {0x0df20c, 3}, + {0x0df220, 2}, + {0x0df22c, 3}, + {0x0df240, 2}, + {0x0df24c, 3}, + {0x0df260, 2}, + {0x0df26c, 3}, + {0x0df280, 2}, + {0x0df28c, 3}, + {0x0df2a0, 2}, + {0x0df2ac, 3}, + {0x0df2c0, 2}, + {0x0df2cc, 3}, + {0x0df2e0, 2}, + {0x0df2ec, 3}, + {0x0df300, 2}, + {0x0df30c, 3}, + {0x0df320, 2}, + {0x0df32c, 3}, + {0x0df340, 2}, + {0x0df34c, 3}, + {0x0df360, 2}, + {0x0df36c, 3}, + {0x0df380, 2}, + {0x0df38c, 3}, + {0x0df3a0, 2}, + {0x0df3ac, 3}, + {0x0df3c0, 2}, + {0x0df3cc, 3}, + {0x0df3e0, 2}, + {0x0df3ec, 3}, + {0x0df400, 2}, + {0x0e0000, 3}, + {0x0e0010, 4}, + {0x0e0028, 3}, + {0x0e0048, 2}, + {0x0e0058, 2}, + {0x0e0064, 32}, + {0x0e00f0, 1}, + {0x0e00fc, 35}, + {0x0e019c, 15}, + {0x0e01e0, 1}, + {0x0e01e8, 5}, + {0x0e0204, 5}, + {0x0e021c, 1}, + {0x0e0300, 16}, + {0x0e0400, 3}, + {0x0e0410, 4}, + {0x0e0428, 3}, + {0x0e0448, 2}, + {0x0e0458, 2}, + {0x0e0464, 32}, + {0x0e04f0, 1}, + {0x0e04fc, 35}, + {0x0e059c, 15}, + {0x0e05e0, 1}, + {0x0e05e8, 5}, + {0x0e0604, 5}, + {0x0e061c, 1}, + {0x0e0700, 16}, + {0x0e0800, 3}, + {0x0e0810, 4}, + {0x0e0828, 3}, + {0x0e0848, 2}, + {0x0e0858, 2}, + {0x0e0864, 32}, + {0x0e08f0, 1}, + {0x0e08fc, 35}, + {0x0e099c, 15}, + {0x0e09e0, 1}, + {0x0e09e8, 5}, + {0x0e0a04, 5}, + {0x0e0a1c, 1}, + {0x0e0b00, 16}, + {0x0e0c00, 3}, + {0x0e0c10, 4}, + {0x0e0c28, 3}, + {0x0e0c48, 2}, + {0x0e0c58, 2}, + {0x0e0c64, 32}, + {0x0e0cf0, 1}, + {0x0e0cfc, 35}, + {0x0e0d9c, 15}, + {0x0e0de0, 1}, + {0x0e0de8, 5}, + {0x0e0e04, 5}, + {0x0e0e1c, 1}, + {0x0e0f00, 16}, + {0x0e1000, 3}, + {0x0e1010, 4}, + {0x0e1028, 3}, + {0x0e1048, 2}, + {0x0e1058, 2}, + {0x0e1064, 32}, + {0x0e10f0, 1}, + {0x0e10fc, 35}, + {0x0e119c, 15}, + {0x0e11e0, 1}, + {0x0e11e8, 5}, + {0x0e1204, 5}, + {0x0e121c, 1}, + {0x0e1300, 16}, + {0x0e1400, 3}, + {0x0e1410, 4}, + {0x0e1428, 3}, + {0x0e1448, 2}, + {0x0e1458, 2}, + {0x0e1464, 32}, + {0x0e14f0, 1}, + {0x0e14fc, 35}, + {0x0e159c, 15}, + {0x0e15e0, 1}, + {0x0e15e8, 5}, + {0x0e1604, 5}, + {0x0e161c, 1}, + {0x0e1700, 16}, + {0x0e1800, 3}, + {0x0e1810, 4}, + {0x0e1828, 3}, + {0x0e1848, 2}, + {0x0e1858, 2}, + {0x0e1864, 32}, + {0x0e18f0, 1}, + {0x0e18fc, 35}, + {0x0e199c, 15}, + {0x0e19e0, 1}, + {0x0e19e8, 5}, + {0x0e1a04, 5}, + {0x0e1a1c, 1}, + {0x0e1b00, 16}, + {0x0e1c00, 3}, + {0x0e1c10, 4}, + {0x0e1c28, 3}, + {0x0e1c48, 2}, + {0x0e1c58, 2}, + {0x0e1c64, 32}, + 
{0x0e1cf0, 1}, + {0x0e1cfc, 35}, + {0x0e1d9c, 15}, + {0x0e1de0, 1}, + {0x0e1de8, 5}, + {0x0e1e04, 5}, + {0x0e1e1c, 1}, + {0x0e1f00, 16}, + {0x0e20c0, 8}, + {0x0e20ec, 5}, + {0x0e2108, 3}, + {0x0e2200, 5}, + {0x0e2218, 36}, + {0x0e2300, 6}, + {0x0e2330, 4}, + {0x0e2500, 3}, + {0x0e2510, 12}, + {0x0e26e0, 6}, + {0x0e2700, 6}, + {0x0e2720, 6}, + {0x0e2740, 3}, + {0x0e2780, 6}, + {0x0e27a0, 6}, + {0x0e27c0, 3}, + {0x0e2800, 67}, + {0x0e2a00, 6}, + {0x0e2a20, 6}, + {0x0e2a40, 3}, + {0x0e2a50, 3}, + {0x0e2a60, 1}, + {0x0e2a80, 17}, + {0x0e3020, 10}, + {0x0e3070, 1}, + {0x0e3080, 2}, + {0x0e308c, 1}, + {0x0e3440, 21}, + {0x0e34e4, 13}, + {0x0e3520, 6}, + {0x0e3540, 6}, + {0x0e3560, 6}, + {0x0e3580, 6}, + {0x0e35a0, 6}, + {0x0e35c0, 6}, + {0x0e35e0, 6}, + {0x0e3600, 16}, + {0x0e3804, 3}, + {0x0e3900, 33}, + {0x0e3a00, 6}, + {0x0e3a20, 2}, + {0x0e3a30, 1}, + {0x0e3a40, 8}, + {0x0e3a64, 5}, + {0x0e3c00, 1}, + {0x0e3c28, 1}, + {0x0e3c50, 1}, + {0x0e3c78, 1}, + {0x0e3ca0, 2}, + {0x0e3dac, 1}, + {0x0e3dd8, 4}, + {0x0e3e00, 6}, + {0x0e3e20, 6}, + {0x0e3e40, 3}, + {0x0e4010, 12}, + {0x0e4044, 3}, + {0x0e4084, 2}, + {0x0e40bc, 84}, + {0x0e4240, 18}, + {0x0e45f0, 4}, + {0x0e4604, 1}, + {0x0e4640, 16}, + {0x0e46f0, 4}, + {0x0e4704, 1}, + {0x0e4740, 16}, + {0x0e5000, 8}, + {0x0e6000, 9}, + {0x0e6040, 16}, + {0x0e8000, 9}, + {0x0e8080, 6}, + {0x0e80a0, 3}, + {0x0f0000, 3}, + {0x0f0014, 11}, + {0x0f004c, 3}, + {0x0f0060, 8}, + {0x0f00f0, 3}, + {0x0f0100, 1}, + {0x0f010c, 2}, + {0x0f0118, 1}, + {0x0f0130, 4}, + {0x0f0180, 3}, + {0x0f0190, 2}, + {0x0f01a8, 1}, + {0x0f01c0, 2}, + {0x0f01d0, 10}, + {0x0f0200, 61}, + {0x0f0404, 9}, + {0x0f0440, 12}, + {0x0f0480, 5}, + {0x0f04b8, 21}, + {0x0f0520, 1}, + {0x0f0528, 1}, + {0x0f0540, 2}, + {0x0f0580, 4}, + {0x0f05a0, 1}, + {0x0f05c0, 8}, + {0x0f0800, 17}, + {0x0f0850, 9}, + {0x0f0880, 9}, + {0x0f08b0, 9}, + {0x0f08e0, 9}, + {0x0f0920, 4}, + {0x0f093c, 5}, + {0x0f095c, 5}, + {0x0f097c, 5}, + {0x0f099c, 5}, + {0x0f09bc, 5}, + {0x0f09dc, 1}, + {0x0f0a90, 2}, + {0x0f0c00, 128}, + {0x0f0e04, 1}, + {0x0f0e14, 9}, + {0x0f0e3c, 1}, + {0x0f1000, 3}, + {0x0f1010, 12}, + {0x0f1080, 10}, + {0x0f10c0, 1}, + {0x0f10e0, 2}, + {0x0f10ec, 1}, + {0x0f10f4, 3}, + {0x0f1400, 6}, + {0x0f1420, 6}, + {0x0f1440, 6}, + {0x0f1460, 6}, + {0x0f1480, 6}, + {0x0f14a0, 6}, + {0x0f14c0, 6}, + {0x0f14e0, 6}, + {0x0f1500, 6}, + {0x0f1520, 6}, + {0x0f1540, 6}, + {0x0f1560, 6}, + {0x0f1580, 6}, + {0x0f15a0, 6}, + {0x0f15c0, 6}, + {0x0f15e0, 6}, + {0x0f1600, 6}, + {0x0f1620, 3}, + {0x0f1800, 3}, + {0x0f1840, 4}, + {0x0f1854, 3}, + {0x0f1864, 3}, + {0x0f1874, 3}, + {0x0f2000, 2}, + {0x0f200c, 3}, + {0x0f2020, 10}, + {0x0f2060, 6}, + {0x0f2080, 2}, + {0x0f208c, 3}, + {0x0f20a0, 10}, + {0x0f20e0, 6}, + {0x0f2100, 2}, + {0x0f210c, 3}, + {0x0f2120, 10}, + {0x0f2160, 6}, + {0x0f2180, 2}, + {0x0f218c, 3}, + {0x0f21a0, 10}, + {0x0f21e0, 6}, + {0x0f2200, 2}, + {0x0f220c, 3}, + {0x0f2220, 10}, + {0x0f2260, 6}, + {0x0f2280, 2}, + {0x0f228c, 3}, + {0x0f22a0, 10}, + {0x0f22e0, 6}, + {0x0f2300, 2}, + {0x0f230c, 3}, + {0x0f2320, 10}, + {0x0f2360, 6}, + {0x0f2380, 2}, + {0x0f238c, 3}, + {0x0f23a0, 10}, + {0x0f23e0, 6}, + {0x0f2400, 2}, + {0x0f240c, 3}, + {0x0f2420, 10}, + {0x0f2460, 6}, + {0x0f2480, 2}, + {0x0f248c, 3}, + {0x0f24a0, 10}, + {0x0f24e0, 6}, + {0x0f2500, 2}, + {0x0f250c, 3}, + {0x0f2520, 10}, + {0x0f2560, 6}, + {0x0f2580, 2}, + {0x0f258c, 3}, + {0x0f25a0, 10}, + {0x0f25e0, 6}, + {0x0f2600, 2}, + {0x0f260c, 3}, + {0x0f2620, 10}, + {0x0f2660, 6}, + {0x0f2680, 2}, + {0x0f268c, 3}, + {0x0f26a0, 10}, + {0x0f26e0, 
6}, + {0x0f2700, 2}, + {0x0f270c, 3}, + {0x0f2720, 10}, + {0x0f2760, 6}, + {0x0f2780, 2}, + {0x0f278c, 3}, + {0x0f27a0, 10}, + {0x0f27e0, 6}, + {0x0f2800, 2}, + {0x0f280c, 3}, + {0x0f2820, 10}, + {0x0f2860, 6}, + {0x0f2880, 2}, + {0x0f288c, 3}, + {0x0f28a0, 10}, + {0x0f28e0, 6}, + {0x0f2900, 2}, + {0x0f290c, 3}, + {0x0f2920, 10}, + {0x0f2960, 6}, + {0x0f2980, 2}, + {0x0f298c, 3}, + {0x0f29a0, 10}, + {0x0f29e0, 6}, + {0x0f4000, 7}, + {0x0f4020, 4}, + {0x0f4204, 1}, + {0x0f4280, 35}, + {0x0f4310, 4}, + {0x0f4404, 1}, + {0x0f4480, 34}, + {0x0f4510, 10}, + {0x0f453c, 3}, + {0x0f4800, 7}, + {0x0f4820, 4}, + {0x0f4a04, 1}, + {0x0f4a80, 35}, + {0x0f4b10, 4}, + {0x0f4c04, 1}, + {0x0f4c80, 34}, + {0x0f4d10, 10}, + {0x0f4d3c, 3}, + {0x0f5000, 7}, + {0x0f5020, 4}, + {0x0f5204, 1}, + {0x0f5280, 35}, + {0x0f5310, 4}, + {0x0f5404, 1}, + {0x0f5480, 34}, + {0x0f5510, 10}, + {0x0f553c, 3}, + {0x0f5800, 7}, + {0x0f5820, 4}, + {0x0f5a04, 1}, + {0x0f5a80, 35}, + {0x0f5b10, 4}, + {0x0f5c04, 1}, + {0x0f5c80, 34}, + {0x0f5d10, 10}, + {0x0f5d3c, 3}, + {0x0f6000, 7}, + {0x0f6020, 4}, + {0x0f6204, 1}, + {0x0f6280, 35}, + {0x0f6310, 4}, + {0x0f6404, 1}, + {0x0f6480, 34}, + {0x0f6510, 10}, + {0x0f653c, 3}, + {0x0f6800, 7}, + {0x0f6820, 4}, + {0x0f6a04, 1}, + {0x0f6a80, 35}, + {0x0f6b10, 4}, + {0x0f6c04, 1}, + {0x0f6c80, 34}, + {0x0f6d10, 10}, + {0x0f6d3c, 3}, + {0x100000, 1}, + {0x100008, 1}, + {0x100010, 2}, + {0x100020, 1}, + {0x100028, 1}, + {0x100030, 2}, + {0x100040, 1}, + {0x100048, 1}, + {0x100050, 2}, + {0x100060, 1}, + {0x100068, 1}, + {0x100070, 2}, + {0x100080, 21}, + {0x100100, 21}, + {0x100180, 21}, + {0x100200, 21}, + {0x100284, 1}, + {0x1003b0, 5}, + {0x100400, 13}, + {0x100440, 13}, + {0x100480, 13}, + {0x1004c0, 13}, + {0x100500, 68}, + {0x100618, 1}, + {0x100804, 1}, + {0x10080c, 4}, + {0x100820, 9}, + {0x1008a0, 24}, + {0x100920, 24}, + {0x100a00, 48}, + {0x100b00, 59}, + {0x100d00, 1}, + {0x100d08, 2}, + {0x100d80, 4}, + {0x100da0, 6}, + {0x100e00, 4}, + {0x100e20, 1}, + {0x100e28, 7}, + {0x100e48, 7}, + {0x100e68, 7}, + {0x100e88, 6}, + {0x100ee0, 6}, + {0x100f00, 6}, + {0x100f1c, 10}, + {0x100f70, 8}, + {0x100f94, 4}, + {0x100fc0, 6}, + {0x100fe0, 6}, + {0x101400, 16}, + {0x101444, 1}, + {0x10145c, 10}, + {0x101504, 1}, + {0x10151c, 30}, + {0x101600, 1}, + {0x101628, 6}, + {0x101648, 6}, + {0x101680, 16}, + {0x1016e0, 16}, + {0x101780, 1}, + {0x101790, 16}, + {0x101a00, 14}, + {0x101a40, 3}, + {0x101a50, 2}, + {0x101a60, 2}, + {0x101a70, 2}, + {0x101a80, 2}, + {0x101a90, 1}, + {0x101a9c, 11}, + {0x101b0c, 5}, + {0x101c00, 34}, + {0x101d00, 3}, + {0x102000, 1}, + {0x102028, 1}, + {0x102050, 1}, + {0x102078, 1}, + {0x1020a0, 7}, + {0x1021ac, 1}, + {0x1021d8, 4}, + {0x102200, 6}, + {0x102220, 6}, + {0x102240, 3}, + {0x102560, 1}, + {0x102584, 10}, + {0x1025b0, 1}, + {0x1025fc, 1}, + {0x102604, 1}, + {0x1026ec, 69}, + {0x103000, 32}, + {0x103084, 5}, + {0x1030f8, 3}, + {0x103108, 3}, + {0x103118, 7}, + {0x103144, 1}, + {0x103160, 10}, + {0x103200, 32}, + {0x103284, 5}, + {0x1032f8, 3}, + {0x103308, 3}, + {0x103318, 7}, + {0x103344, 1}, + {0x103360, 10}, + {0x103400, 32}, + {0x103484, 5}, + {0x1034f8, 3}, + {0x103508, 3}, + {0x103518, 7}, + {0x103544, 1}, + {0x103560, 10}, + {0x103600, 32}, + {0x103684, 5}, + {0x1036f8, 3}, + {0x103708, 3}, + {0x103718, 7}, + {0x103744, 1}, + {0x103760, 10}, + {0x103800, 1}, + {0x10380c, 1}, + {0x103a00, 64}, + {0x104000, 3}, + {0x104020, 3}, + {0x104040, 3}, + {0x104060, 3}, + {0x104084, 1}, + {0x104090, 4}, + {0x1040a4, 1}, + {0x1040b0, 4}, + {0x1040c4, 1}, + 
{0x1040d0, 4}, + {0x1040e4, 1}, + {0x1040f0, 21}, + {0x104148, 18}, + {0x1041f0, 6}, + {0x104308, 9}, + {0x104330, 1}, + {0x104340, 16}, + {0x1043b4, 4}, + {0x1043c8, 4}, + {0x1043dc, 4}, + {0x1043f0, 4}, + {0x104404, 1}, + {0x104470, 36}, + {0x104504, 1}, + {0x104570, 36}, + {0x104604, 1}, + {0x104670, 36}, + {0x104704, 1}, + {0x104770, 50}, + {0x104840, 2}, + {0x10484c, 1}, + {0x104900, 1}, + {0x104908, 1}, + {0x104984, 1}, + {0x1049a0, 24}, + {0x104a08, 6}, + {0x104a28, 6}, + {0x104a48, 6}, + {0x104a68, 6}, + {0x104a88, 6}, + {0x104aa8, 6}, + {0x104ac8, 6}, + {0x104ae8, 15}, + {0x104b40, 16}, + {0x104c00, 6}, + {0x104c20, 6}, + {0x104c40, 6}, + {0x104c60, 6}, + {0x104c80, 6}, + {0x104ca0, 6}, + {0x104cc0, 6}, + {0x104ce0, 6}, + {0x104d00, 3}, + {0x104d20, 6}, + {0x104d40, 6}, + {0x105000, 448}, + {0x105704, 3}, + {0x105734, 1}, + {0x106000, 62}, + {0x106100, 32}, + {0x106184, 1}, + {0x107010, 1}, + {0x110000, 4}, + {0x110014, 2}, + {0x110020, 5}, + {0x110040, 5}, + {0x110060, 6}, + {0x110080, 5}, + {0x110098, 1}, + {0x1100a0, 4}, + {0x1100b8, 8}, + {0x1100e0, 6}, + {0x110200, 4}, + {0x110214, 2}, + {0x110220, 5}, + {0x110240, 5}, + {0x110260, 6}, + {0x110280, 5}, + {0x110298, 1}, + {0x1102a0, 4}, + {0x1102b8, 8}, + {0x1102e0, 6}, + {0x110400, 4}, + {0x110414, 2}, + {0x110420, 5}, + {0x110440, 5}, + {0x110460, 6}, + {0x110480, 5}, + {0x110498, 1}, + {0x1104a0, 4}, + {0x1104b8, 8}, + {0x1104e0, 6}, + {0x110600, 4}, + {0x110614, 2}, + {0x110620, 5}, + {0x110640, 5}, + {0x110660, 6}, + {0x110680, 5}, + {0x110698, 1}, + {0x1106a0, 4}, + {0x1106b8, 8}, + {0x1106e0, 6}, + {0x110800, 21}, + {0x110880, 15}, + {0x1108c0, 3}, + {0x1108d0, 2}, + {0x110900, 1}, + {0x111000, 1}, + {0x111028, 1}, + {0x111050, 1}, + {0x111078, 1}, + {0x1110a0, 5}, + {0x1111ac, 1}, + {0x1111d8, 4}, + {0x111200, 6}, + {0x111220, 6}, + {0x111240, 3}, + {0x111400, 3}, + {0x111480, 9}, + {0x1114c0, 25}, + {0x111540, 25}, + {0x1115c0, 25}, + {0x111640, 17}, + {0x11168c, 1}, + {0x118000, 29}, + {0x118078, 4}, + {0x118090, 2}, + {0x1180a0, 7}, + {0x1180c0, 11}, + {0x118100, 14}, + {0x118140, 14}, + {0x118180, 61}, + {0x118278, 4}, + {0x118290, 2}, + {0x1182a0, 7}, + {0x1182c0, 11}, + {0x118300, 14}, + {0x118340, 14}, + {0x118380, 61}, + {0x118478, 4}, + {0x118490, 2}, + {0x1184a0, 7}, + {0x1184c0, 11}, + {0x118500, 14}, + {0x118540, 14}, + {0x118580, 61}, + {0x118678, 4}, + {0x118690, 2}, + {0x1186a0, 7}, + {0x1186c0, 11}, + {0x118700, 14}, + {0x118740, 14}, + {0x118780, 62}, + {0x11887c, 1}, + {0x118a00, 8}, + {0x118a24, 15}, + {0x118a64, 30}, + {0x118b00, 4}, + {0x118b20, 3}, + {0x118c00, 6}, + {0x118c40, 14}, + {0x118c80, 9}, + {0x118d00, 9}, + {0x118d2c, 1}, + {0x118d40, 3}, + {0x118d60, 1}, + {0x118d80, 3}, + {0x118e00, 2}, + {0x118e0c, 1}, + {0x118e14, 5}, + {0x118e2c, 1}, + {0x118e34, 5}, + {0x118e4c, 1}, + {0x118e54, 5}, + {0x118e6c, 1}, + {0x118e74, 5}, + {0x118e8c, 1}, + {0x118e94, 5}, + {0x118eac, 1}, + {0x118eb4, 3}, + {0x119000, 29}, + {0x119078, 4}, + {0x119090, 2}, + {0x1190a0, 7}, + {0x1190c0, 11}, + {0x119100, 14}, + {0x119140, 14}, + {0x119180, 61}, + {0x119278, 4}, + {0x119290, 2}, + {0x1192a0, 7}, + {0x1192c0, 11}, + {0x119300, 14}, + {0x119340, 14}, + {0x119380, 61}, + {0x119478, 4}, + {0x119490, 2}, + {0x1194a0, 7}, + {0x1194c0, 11}, + {0x119500, 14}, + {0x119540, 14}, + {0x119580, 61}, + {0x119678, 4}, + {0x119690, 2}, + {0x1196a0, 7}, + {0x1196c0, 11}, + {0x119700, 14}, + {0x119740, 14}, + {0x119780, 62}, + {0x11987c, 1}, + {0x119a00, 8}, + {0x119a24, 15}, + {0x119a64, 30}, + {0x119b00, 4}, + {0x119b20, 
3}, + {0x119c00, 6}, + {0x119c40, 14}, + {0x119c80, 9}, + {0x119d00, 9}, + {0x119d2c, 1}, + {0x119d40, 3}, + {0x119d60, 1}, + {0x119d80, 3}, + {0x119e00, 2}, + {0x119e0c, 1}, + {0x119e14, 5}, + {0x119e2c, 1}, + {0x119e34, 5}, + {0x119e4c, 1}, + {0x119e54, 5}, + {0x119e6c, 1}, + {0x119e74, 5}, + {0x119e8c, 1}, + {0x119e94, 5}, + {0x119eac, 1}, + {0x119eb4, 3}, + {0x11a000, 29}, + {0x11a078, 4}, + {0x11a090, 2}, + {0x11a0a0, 7}, + {0x11a0c0, 11}, + {0x11a100, 14}, + {0x11a140, 14}, + {0x11a180, 61}, + {0x11a278, 4}, + {0x11a290, 2}, + {0x11a2a0, 7}, + {0x11a2c0, 11}, + {0x11a300, 14}, + {0x11a340, 14}, + {0x11a380, 61}, + {0x11a478, 4}, + {0x11a490, 2}, + {0x11a4a0, 7}, + {0x11a4c0, 11}, + {0x11a500, 14}, + {0x11a540, 14}, + {0x11a580, 61}, + {0x11a678, 4}, + {0x11a690, 2}, + {0x11a6a0, 7}, + {0x11a6c0, 11}, + {0x11a700, 14}, + {0x11a740, 14}, + {0x11a780, 62}, + {0x11a87c, 1}, + {0x11aa00, 8}, + {0x11aa24, 15}, + {0x11aa64, 30}, + {0x11ab00, 4}, + {0x11ab20, 3}, + {0x11ac00, 6}, + {0x11ac40, 14}, + {0x11ac80, 9}, + {0x11ad00, 9}, + {0x11ad2c, 1}, + {0x11ad40, 3}, + {0x11ad60, 1}, + {0x11ad80, 3}, + {0x11ae00, 2}, + {0x11ae0c, 1}, + {0x11ae14, 5}, + {0x11ae2c, 1}, + {0x11ae34, 5}, + {0x11ae4c, 1}, + {0x11ae54, 5}, + {0x11ae6c, 1}, + {0x11ae74, 5}, + {0x11ae8c, 1}, + {0x11ae94, 5}, + {0x11aeac, 1}, + {0x11aeb4, 3}, + {0x11b000, 29}, + {0x11b078, 4}, + {0x11b090, 2}, + {0x11b0a0, 7}, + {0x11b0c0, 11}, + {0x11b100, 14}, + {0x11b140, 14}, + {0x11b180, 61}, + {0x11b278, 4}, + {0x11b290, 2}, + {0x11b2a0, 7}, + {0x11b2c0, 11}, + {0x11b300, 14}, + {0x11b340, 14}, + {0x11b380, 61}, + {0x11b478, 4}, + {0x11b490, 2}, + {0x11b4a0, 7}, + {0x11b4c0, 11}, + {0x11b500, 14}, + {0x11b540, 14}, + {0x11b580, 61}, + {0x11b678, 4}, + {0x11b690, 2}, + {0x11b6a0, 7}, + {0x11b6c0, 11}, + {0x11b700, 14}, + {0x11b740, 14}, + {0x11b780, 62}, + {0x11b87c, 1}, + {0x11ba00, 8}, + {0x11ba24, 15}, + {0x11ba64, 30}, + {0x11bb00, 4}, + {0x11bb20, 3}, + {0x11bc00, 6}, + {0x11bc40, 14}, + {0x11bc80, 9}, + {0x11bd00, 9}, + {0x11bd2c, 1}, + {0x11bd40, 3}, + {0x11bd60, 1}, + {0x11bd80, 3}, + {0x11be00, 2}, + {0x11be0c, 1}, + {0x11be14, 5}, + {0x11be2c, 1}, + {0x11be34, 5}, + {0x11be4c, 1}, + {0x11be54, 5}, + {0x11be6c, 1}, + {0x11be74, 5}, + {0x11be8c, 1}, + {0x11be94, 5}, + {0x11beac, 1}, + {0x11beb4, 3}, + {0x11c000, 19}, + {0x11c050, 10}, + {0x11c080, 19}, + {0x11c0d0, 10}, + {0x11c100, 19}, + {0x11c150, 10}, + {0x11c180, 19}, + {0x11c1d0, 10}, + {0x11c200, 19}, + {0x11c250, 10}, + {0x11c280, 19}, + {0x11c2d0, 10}, + {0x11c300, 19}, + {0x11c350, 10}, + {0x11c380, 19}, + {0x11c3d0, 10}, + {0x11c400, 19}, + {0x11c450, 10}, + {0x11c480, 19}, + {0x11c4d0, 10}, + {0x11c500, 19}, + {0x11c550, 10}, + {0x11c580, 19}, + {0x11c5d0, 10}, + {0x11c600, 19}, + {0x11c650, 10}, + {0x11c680, 19}, + {0x11c6d0, 10}, + {0x11c700, 19}, + {0x11c750, 10}, + {0x11c780, 19}, + {0x11c7d0, 10}, + {0x11c800, 19}, + {0x11c860, 6}, + {0x11c884, 1}, + {0x11c894, 22}, + {0x11c900, 7}, + {0x11d000, 7}, + {0x11d020, 15}, + {0x11d060, 15}, + {0x11d0a0, 15}, + {0x11d0e0, 15}, + {0x11d120, 15}, + {0x11d160, 15}, + {0x11d1a0, 15}, + {0x11d1e0, 15}, + {0x11d220, 15}, + {0x11d260, 15}, + {0x11d2a0, 15}, + {0x11d2e0, 15}, + {0x11d320, 15}, + {0x11d360, 15}, + {0x11d3a0, 15}, + {0x11d3e0, 17}, + {0x11d428, 3}, + {0x11d440, 5}, + {0x11d480, 9}, + {0x11d4a8, 3}, + {0x11d4c0, 5}, + {0x11d500, 9}, + {0x11d528, 3}, + {0x11d540, 5}, + {0x11d580, 9}, + {0x11d5a8, 3}, + {0x11d5c0, 5}, + {0x11d600, 6}, + {0x11d620, 6}, + {0x11d640, 6}, + {0x11d660, 6}, + {0x11d680, 6}, + 
{0x11d6a0, 6}, + {0x11d6c0, 6}, + {0x11d6e0, 6}, + {0x11d700, 12}, + {0x11d734, 1}, + {0x11d73c, 4}, + {0x11d750, 4}, + {0x11d764, 1}, + {0x11d800, 102}, + {0x11da00, 3}, + {0x11da10, 1}, + {0x11da18, 2}, + {0x11da24, 7}, + {0x11da50, 4}, + {0x11da80, 3}, + {0x11da90, 1}, + {0x11da98, 2}, + {0x11daa4, 7}, + {0x11dad0, 4}, + {0x11db00, 3}, + {0x11db10, 1}, + {0x11db18, 2}, + {0x11db24, 7}, + {0x11db50, 4}, + {0x11db80, 3}, + {0x11db90, 1}, + {0x11db98, 2}, + {0x11dba4, 7}, + {0x11dbd0, 4}, + {0x11dc00, 17}, + {0x11e000, 72}, + {0x11e200, 72}, + {0x11e400, 72}, + {0x11e600, 72}, + {0x11e800, 6}, + {0x11e820, 6}, + {0x11e840, 6}, + {0x11e860, 6}, + {0x11e880, 6}, + {0x11e8a0, 6}, + {0x11e8c0, 6}, + {0x11e8e0, 6}, + {0x11e900, 6}, + {0x11e920, 1}, + {0x11ea00, 6}, + {0x11ea20, 6}, + {0x11ea40, 6}, + {0x11ea60, 6}, + {0x11ea80, 6}, + {0x11eaa0, 6}, + {0x11eac0, 6}, + {0x11eae0, 6}, + {0x11eb00, 6}, + {0x11eb20, 1}, + {0x11ec00, 6}, + {0x11ec20, 6}, + {0x11ec40, 6}, + {0x11ec60, 6}, + {0x11ec80, 6}, + {0x11eca0, 6}, + {0x11ecc0, 6}, + {0x11ece0, 6}, + {0x11ed00, 6}, + {0x11ed20, 1}, + {0x11ee00, 6}, + {0x11ee20, 6}, + {0x11ee40, 6}, + {0x11ee60, 6}, + {0x11ee80, 6}, + {0x11eea0, 6}, + {0x11eec0, 6}, + {0x11eee0, 6}, + {0x11ef00, 6}, + {0x11ef20, 1}, + {0x11f000, 14}, + {0x11f040, 2}, + {0x11f080, 14}, + {0x11f0c0, 2}, + {0x11f100, 14}, + {0x11f140, 2}, + {0x11f180, 14}, + {0x11f1c0, 2}, + {0x11f400, 17}, + {0x11f448, 5}, + {0x11f460, 3}, + {0x11f470, 3}, + {0x11f480, 13}, + {0x11f4b8, 1}, + {0x11f500, 17}, + {0x11f548, 5}, + {0x11f560, 3}, + {0x11f570, 3}, + {0x11f580, 13}, + {0x11f5b8, 1}, + {0x11f600, 17}, + {0x11f648, 5}, + {0x11f660, 3}, + {0x11f670, 3}, + {0x11f680, 13}, + {0x11f6b8, 1}, + {0x11f700, 17}, + {0x11f748, 5}, + {0x11f760, 3}, + {0x11f770, 3}, + {0x11f780, 13}, + {0x11f7b8, 1}, + {0x11f800, 8}, + {0x11f824, 15}, + {0x11f864, 15}, + {0x11f8a4, 15}, + {0x11f8e4, 30}, + {0x11f980, 10}, + {0x11f9ac, 1}, + {0x11f9b4, 5}, + {0x11f9cc, 1}, + {0x11f9d4, 5}, + {0x11f9ec, 1}, + {0x11f9f4, 13}, + {0x11fa80, 7}, + {0x11faa0, 5}, + {0x11fac0, 5}, + {0x11fae0, 4}, + {0x11fc04, 8}, + {0x11fc40, 13}, + {0x11fc80, 16}, + {0x11fd00, 6}, + {0x11fd20, 3}, + {0x11fd30, 3}, + {0x11fd40, 19}, + {0x11fdc0, 3} }; + +enum { + IFC_MAX_RETRIES = 2048 +}; + +enum { + PCI_CAP_PTR = 0x34, + PCI_HDR_SIZE = 0x40, + PCI_EXT_SPACE_ADDR = 0xff, + + PCI_CTRL_OFFSET = 0x4, + PCI_COUNTER_OFFSET = 0x8, + PCI_SEMAPHORE_OFFSET = 0xc, + + PCI_ADDR_OFFSET = 0x10, + PCI_ADDR_BIT_LEN = 30, + + PCI_DATA_OFFSET = 0x14, + + PCI_FLAG_BIT_OFFS = 31, + + PCI_SPACE_BIT_OFFS = 0, + PCI_SPACE_BIT_LEN = 16, + + PCI_SIZE_VLD_BIT_OFFS = 28, + PCI_SIZE_VLD_BIT_LEN = 1, + + PCI_STATUS_BIT_OFFS = 29, + PCI_STATUS_BIT_LEN = 3, +}; + +enum{ + CX4_DEVID = 0x209, + CX4LX_DEVID = 0x20b, +}; + +struct mlx5_mst_dump { + void *on_demand_dump; + u32 dump_size; + u32 vsec_addr; + /*sync mst dump */ + struct mutex lock; +}; + +#define MLX5_PROTECTED_CR_SPCAE_DOMAIN 0x6 +#define MLX5_PROTECTED_CR_SCAN_CRSPACE 0x7 + +#define BAD_ACCESS_VAL 0xbadacce5 + +int mlx5_pciconf_set_addr_space(struct mlx5_core_dev *dev, u16 space) +{ + int ret = 0; + u32 val; + + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_CTRL_OFFSET, + &val); + if (ret) + goto out; + + val = MLX5_MERGE(val, space, PCI_SPACE_BIT_OFFS, PCI_SPACE_BIT_LEN); + ret = pci_write_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_CTRL_OFFSET, + val); + if (ret) + goto out; + + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_CTRL_OFFSET, 
+ &val); + if (ret) + goto out; + + if (MLX5_EXTRACT(val, PCI_STATUS_BIT_OFFS, PCI_STATUS_BIT_LEN) == 0) + return -EINVAL; + + if ((space == MLX5_PROTECTED_CR_SCAN_CRSPACE || space == MLX5_PROTECTED_CR_SPCAE_DOMAIN) && + (!MLX5_EXTRACT(val, PCI_SIZE_VLD_BIT_OFFS, PCI_SIZE_VLD_BIT_LEN))) { + mlx5_core_warn(dev, "Failed to get protected cr space size, valid bit not set"); + return -EINVAL; + } + + return 0; +out: + return ret; +} + +int mlx5_pciconf_set_protected_addr_space(struct mlx5_core_dev *dev, + u32 *ret_space_size) { + int ret; + + if (!ret_space_size) + return -EINVAL; + + *ret_space_size = 0; + + ret = mlx5_pciconf_set_addr_space(dev, MLX5_PROTECTED_CR_SCAN_CRSPACE); + if (ret) { + ret = mlx5_pciconf_set_addr_space(dev, MLX5_PROTECTED_CR_SPCAE_DOMAIN); + if (ret) + return ret; + dev->priv.health.crdump->space = MLX5_PROTECTED_CR_SPCAE_DOMAIN; + } else { + dev->priv.health.crdump->space = MLX5_PROTECTED_CR_SCAN_CRSPACE; + } + + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_ADDR_OFFSET, + ret_space_size); + if (ret) { + mlx5_core_warn(dev, "Failed to get read protected cr space size"); + return ret; + } + + *ret_space_size = MLX5_EXTRACT(*ret_space_size, 0, PCI_ADDR_BIT_LEN); + + return 0; +} + +int mlx5_pciconf_cap9_sem(struct mlx5_core_dev *dev, int state) +{ + u32 counter = 0; + int retries = 0; + u32 lock_val; + int ret; + + if (state == UNLOCK) { + ret = pci_write_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_SEMAPHORE_OFFSET, + UNLOCK); + if (ret) + goto out; + } else { + do { + if (retries > IFC_MAX_RETRIES) + return -EBUSY; + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_SEMAPHORE_OFFSET, + &lock_val); + if (ret) + goto out; + if (lock_val) { + retries++; + usleep_range(1000, 2000); + continue; + } + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_COUNTER_OFFSET, + &counter); + if (ret) + goto out; + ret = pci_write_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_SEMAPHORE_OFFSET, + counter); + if (ret) + goto out; + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_SEMAPHORE_OFFSET, + &lock_val); + if (ret) + goto out; + retries++; + } while (counter != lock_val); + } + return 0; +out: + return ret; +} + +static int mlx5_pciconf_wait_on_flag(struct mlx5_core_dev *dev, + u8 expected_val) +{ + int retries = 0; + u32 flag; + int ret; + + do { + if (retries > IFC_MAX_RETRIES) + return -EBUSY; + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_ADDR_OFFSET, + &flag); + flag = MLX5_EXTRACT(flag, PCI_FLAG_BIT_OFFS, 1); + retries++; + if ((retries & 0xf) == 0) + usleep_range(1000, 2000); + } while (flag != expected_val); + return 0; +} + +static int mlx5_pciconf_read(struct mlx5_core_dev *dev, + unsigned int offset, + u32 *data) +{ + u32 address; + int ret; + + if (MLX5_EXTRACT(offset, 31, 1)) + return -EINVAL; + address = MLX5_MERGE(offset, 0, PCI_FLAG_BIT_OFFS, 1); + ret = pci_write_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_ADDR_OFFSET, + address); + if (ret) + goto out; + ret = mlx5_pciconf_wait_on_flag(dev, 1); + if (ret) + goto out; + ret = pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_DATA_OFFSET, + data); +out: + return ret; +} + +static int mlx5_pciconf_read_fast(struct mlx5_core_dev *dev, + unsigned int read_addr, + unsigned int *next_read_addr, + u32 *data) +{ + int ret; + + ret = mlx5_pciconf_read(dev, read_addr, data); + if (ret) + goto out; + + ret = 
pci_read_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_ADDR_OFFSET, + next_read_addr); + if (ret) + goto out; + + *next_read_addr = MLX5_EXTRACT(*next_read_addr, 0, PCI_ADDR_BIT_LEN); + + if (*next_read_addr <= read_addr) + ret = EINVAL; +out: + return ret; +} + +static int mlx5_pciconf_write(struct mlx5_core_dev *dev, + unsigned int offset, + u32 data) +{ + u32 address; + int ret; + + if (MLX5_EXTRACT(offset, 31, 1)) + return -EINVAL; + + /* Set flag to 0x1 */ + address = MLX5_MERGE(offset, 1, PCI_FLAG_BIT_OFFS, 1); + + ret = pci_write_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_DATA_OFFSET, + data); + if (ret) + goto out; + + ret = pci_write_config_dword(dev->pdev, + dev->mst_dump->vsec_addr + + PCI_ADDR_OFFSET, + address); + if (ret) + goto out; + + /* Wait for the flag to be cleared */ + ret = mlx5_pciconf_wait_on_flag(dev, 0); + +out: + return ret; +} + +int mlx5_block_op_pciconf(struct mlx5_core_dev *dev, + unsigned int offset, u32 *data, + int length) +{ + int read = length; + int i; + + if (length % 4) + return -EINVAL; + for (i = 0; i < length; i += 4) { + if (mlx5_pciconf_read(dev, offset + i, &data[(i >> 2)])) { + read = i; + goto cleanup; + } + } +cleanup: + return read; +} + +int mlx5_block_op_pciconf_fast(struct mlx5_core_dev *dev, + u32 *data, + int length) +{ + unsigned int next_read_addr = 0; + unsigned int read_addr = 0; + int i; + + if (length % 4) + return -EINVAL; + + for (i = 0; i < (length / 4); i++) + data[i] = BAD_ACCESS_VAL; + + while (read_addr < length) { + if (mlx5_pciconf_read_fast(dev, read_addr, &next_read_addr, &data[(read_addr >> 2)])) + return read_addr; + + read_addr = next_read_addr; + } + return length; +} + +static int mlx5_read_reg_dword(struct mlx5_core_dev *dev, u32 addr, u32 *data) +{ + int ret = 0; + + ret = pci_write_config_dword(dev->pdev, MLX5_ADDR_REG, addr); + if (ret) + return ret; + ret = pci_read_config_dword(dev->pdev, MLX5_DATA_REG, data); + return ret; +} + +static int mlx5_block_op_pciconf_old(struct mlx5_core_dev *dev, + unsigned int offset, u32 *data, int length) +{ + int read = length; + int i; + + if (length % 4) + return -EINVAL; + for (i = 0; i < length ; i += 4) { + if (mlx5_read_reg_dword(dev, offset + i, &data[(i >> 2)])) { + read = i; + goto cleanup; + } + } +cleanup: + return read; +} + +static int mlx5_read4_block_new(struct mlx5_core_dev *dev, + unsigned int offset, u32 *data, int length) +{ + return mlx5_block_op_pciconf(dev, offset, data, length); +} + +static int mlx5_read4_block_old(struct mlx5_core_dev *dev, + unsigned int offset, + u32 *data, + int length) +{ + return mlx5_block_op_pciconf_old(dev, offset, data, length); +} + +int mlx5_get_vendor_cap_addr(struct mlx5_core_dev *dev) +{ + int vend_cap; + int ret; + + vend_cap = pci_find_capability(dev->pdev, CAP_ID); + if (!vend_cap) + return 0; + dev->mst_dump->vsec_addr = vend_cap; + ret = mlx5_pciconf_cap9_sem(dev, LOCK); + if (ret) { + mlx5_core_warn(dev, "pciconf_cap9_sem locking failure\n"); + return 0; + } + if (mlx5_pciconf_set_addr_space(dev, MLX5_CR_SPACE_DOMAIN)) + vend_cap = 0; + ret = mlx5_pciconf_cap9_sem(dev, UNLOCK); + if (ret) + mlx5_core_warn(dev, "pciconf_cap9_sem unlocking failure\n"); + return vend_cap; +} + +int mlx5_mst_capture(struct mlx5_core_dev *dev) +{ + unsigned long (*blocks)[2]; + void *mst_data = NULL; + u32 total_len = 0; + int block_count; + int size = 0; + u32 *mst_out; + int ret = 0; + u32 i = 0; + u32 j = 0; + u32 *data; + u32 addr; + u32 hwid; + u32 rc; + + if (!dev->mst_dump) + return -ENODEV; + + 
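+ /* Capture overview (descriptive note only, no functional change):
+  * 1) Read the hardware ID at MLX5_HWID_ADDR, either through the
+  *    vendor-specific capability (vsec_addr) path - serialized by the
+  *    capability 9 semaphore and the CR space domain selection - or,
+  *    when no vendor capability was found, through the legacy
+  *    address/data config-space register pair.
+  * 2) Select the per-device {offset, dword count} block table:
+  *    CX4_DEVID uses mlx5_mst_dump_regs_mt4115, CX4LX_DEVID uses
+  *    mlx5_mst_dump_regs_mt4117; any other ID fails with -ENODEV.
+  * 3) Read every block and store it as <address, value> dword pairs,
+  *    so each captured register costs 8 bytes in the dump and the
+  *    resulting blob is self-describing for offline parsing.
+  * The previous on-demand dump, if any, is freed and replaced.
+  */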
mutex_lock(&dev->mst_dump->lock); + if (dev->mst_dump->vsec_addr) { + ret = mlx5_pciconf_cap9_sem(dev, LOCK); + if (ret) + goto unlock; + ret = mlx5_pciconf_set_addr_space(dev, MLX5_CR_SPACE_DOMAIN); + if (ret) + goto unlock; + ret = mlx5_read4_block_new(dev, MLX5_HWID_ADDR, &hwid, 4); + mlx5_pciconf_cap9_sem(dev, UNLOCK); + } else { + ret = mlx5_read_reg_dword(dev, MLX5_HWID_ADDR, &hwid); + if (ret) + goto unlock; + } + + if (hwid == CX4LX_DEVID) { + block_count = MLX5_NUM_MST_OFFSETS_MT4117; + blocks = (unsigned long (*)[2]) mlx5_mst_dump_regs_mt4117; + total_len = MLX5_MST_DUMP_SIZE_BYTES_MT4117; + } else if (hwid == CX4_DEVID) { + block_count = MLX5_NUM_MST_OFFSETS_MT4115; + blocks = (unsigned long (*)[2])mlx5_mst_dump_regs_mt4115; + total_len = MLX5_MST_DUMP_SIZE_BYTES_MT4115; + } else { + ret = -ENODEV; + goto unlock; + } + + mst_data = kcalloc(total_len, sizeof(u32), GFP_KERNEL); + if (!mst_data) { + ret = -ENOMEM; + goto unlock; + } + mst_out = mst_data; + if (dev->mst_dump->vsec_addr) { + ret = mlx5_pciconf_cap9_sem(dev, LOCK); + if (ret) + goto unlock; + } + for (i = 0; i < block_count; i++) { + data = kcalloc(blocks[i][1], sizeof(u32), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto unlock; + } + if (dev->mst_dump->vsec_addr) { + ret = mlx5_pciconf_set_addr_space(dev, + MLX5_CR_SPACE_DOMAIN); + if (ret) + goto unlock; + rc = mlx5_read4_block_new(dev, + blocks[i][0], + (u32 *)data, + blocks[i][1] * sizeof(u32)); + } else { + rc = mlx5_read4_block_old(dev, + blocks[i][0], + (u32 *)data, + blocks[i][1] * sizeof(u32)); + } + if (blocks[i][1] * sizeof(u32) != rc) { + kfree(data); + ret = -EINVAL; + goto unlock; + } + for (j = 0; j < blocks[i][1]; j++) { + addr = blocks[i][0] + (j * sizeof(u32)); + *mst_out = addr; + mst_out++; + *mst_out = ((u32 *)data)[j]; + mst_out++; + size += 2 * sizeof(u32); + } + kfree(data); + } + + if (dev->mst_dump->vsec_addr) + mlx5_pciconf_cap9_sem(dev, UNLOCK); + + kfree(dev->mst_dump->on_demand_dump); + dev->mst_dump->on_demand_dump = mst_data; + dev->mst_dump->dump_size = size; + ret = size; +unlock: + mutex_unlock(&dev->mst_dump->lock); + if (ret < 0) + kfree(mst_data); + return ret; +} + +u32 mlx5_mst_dump(struct mlx5_core_dev *dev, void *buff, u32 buff_sz) +{ + u32 copy_sz = 0; + + if (!dev->mst_dump) + return copy_sz; + + mutex_lock(&dev->mst_dump->lock); + if (dev->mst_dump->on_demand_dump) { + copy_sz = min(buff_sz, dev->mst_dump->dump_size); + memcpy(buff, dev->mst_dump->on_demand_dump, copy_sz); + } + mutex_unlock(&dev->mst_dump->lock); + + return copy_sz; +} + +void mlx5_mst_free_capture(struct mlx5_core_dev *dev) +{ + mutex_lock(&dev->mst_dump->lock); + kfree(dev->mst_dump->on_demand_dump); + dev->mst_dump->on_demand_dump = NULL; + dev->mst_dump->dump_size = 0; + mutex_unlock(&dev->mst_dump->lock); +} + +void mlx5_mst_dump_cleanup(struct mlx5_core_dev *dev) +{ + if (!dev->mst_dump) + return; + kfree(dev->mst_dump->on_demand_dump); + dev->mst_dump->on_demand_dump = NULL; + dev->mst_dump->dump_size = 0; + kfree(dev->mst_dump); +} + +int mlx5_mst_dump_init(struct mlx5_core_dev *dev) +{ + dev->mst_dump = kzalloc(sizeof(*dev->mst_dump), GFP_KERNEL); + if (!dev->mst_dump) + return -ENOMEM; + dev->mst_dump->vsec_addr = mlx5_get_vendor_cap_addr(dev); + mutex_init(&dev->mst_dump->lock); + return 0; +} + +static int mlx5_icmd_get_max_mailbox_sz(struct mlx5_core_dev *dev, + int *max_sz) +{ + return mlx5_pciconf_read(dev, MLX5_ICMD_MAILBOX_SZ, max_sz); +} + +static int mlx5_icmd_trigger(struct mlx5_core_dev *dev, + int opcode) +{ + union { + struct 
mlx5_icmd_ctrl_bits ctrl; + u32 ctrl_in; + u32 ctrl_out; + } u; + int retries = 0; + int ret; + + memset(&u.ctrl_in, 0, sizeof(u)); + + u.ctrl.opcode = cpu_to_be16(opcode); + u.ctrl.busy = 1; + + /* Write opcode to ctrl and set busy bit */ + ret = mlx5_pciconf_write(dev, MLX5_ICMD_CTRL, cpu_to_be32(u.ctrl_in)); + if (ret) + goto out; + + /* Read back ctrl and wait for busy bit to be cleared by hardware */ + do { + if (retries > IFC_MAX_RETRIES) + return -EBUSY; + + ret = mlx5_pciconf_read(dev, MLX5_ICMD_CTRL, &u.ctrl_out); + if (ret) + goto out; + + u.ctrl_out = cpu_to_be32(u.ctrl_out); + + retries++; + if ((retries & 0xf) == 0) + usleep_range(1000, 2000); + + } while (u.ctrl.busy != 0); + + if (u.ctrl.status) + return -EINVAL; + + return 0; +out: + return ret; +} + +static int mlx5_icmd_send(struct mlx5_core_dev *dev, + int opcode, void *mailbox, int dword_sz) +{ + u32 *mail_in = mailbox; + u32 *mail_out = mailbox; + int ret; + int i; + + /* Write mailbox input */ + for (i = 0; i < dword_sz; i++) { + ret = mlx5_pciconf_write(dev, + MLX5_ICMD_MAILBOX + i * 4, + cpu_to_be32(*mail_in++)); + + if (ret) + goto out; + } + + /* Trigger the cmd */ + mlx5_icmd_trigger(dev, opcode); + + /* Read mailbox output */ + for (i = 0; i < dword_sz; i++) { + *mail_out = 0; + ret = mlx5_pciconf_read(dev, + MLX5_ICMD_MAILBOX + i * 4, + mail_out); + + if (ret) + goto out; + + *mail_out = cpu_to_be32(*mail_out); + mail_out++; + } + +out: + return ret; +} + +int mlx5_icmd_access_register(struct mlx5_core_dev *dev, + int reg_id, + int method, + void *io_buff, + u32 io_buff_dw_sz) +{ + union { + struct mlx5_icmd_access_reg_input_bits mailbox_in; + struct mlx5_icmd_access_reg_output_bits mailbox_out; + u32 b[7]; + } u; + + u32 *data_in = io_buff; + u32 *data_out = io_buff; + int ret = 0; + int max_len; + int i; + + memset(u.b, 0, sizeof(u)); + + if (!dev->mst_dump) + return -ENODEV; + + if (!dev->mst_dump->vsec_addr) + return -ENODEV; + + if (io_buff_dw_sz > MLX5_ICMD_ACCESS_REG_DATA_DW_SZ) + return -EINVAL; + + u.mailbox_in.constant_1_2 = cpu_to_be16(0x1 << 11 | 0x4); + u.mailbox_in.register_id = cpu_to_be16(reg_id); + u.mailbox_in.method = method; + u.mailbox_in.constant_3 = 0x1; + u.mailbox_in.len = cpu_to_be16(0x3 << 11 | 0x3); + + for (i = 0; i < io_buff_dw_sz; i++) + u.mailbox_in.reg_data[i] = *data_in++; + + ret = mlx5_pciconf_cap9_sem(dev, LOCK); + if (ret) + goto out; + + ret = mlx5_pciconf_set_addr_space(dev, MLX5_ICMD_SPACE_DOMAIN); + if (ret) + goto unlock; + + ret = mlx5_icmd_get_max_mailbox_sz(dev, &max_len); + if (ret) + goto unlock; + + if (unlikely(max_len < sizeof(struct mlx5_icmd_access_reg_input_bits))) + return -EINVAL; + + /* Send access_register cmd */ + ret = mlx5_icmd_send(dev, MLX5_ICMD_ACCESS_REG, u.b, sizeof(u) / 4); + if (ret) + goto unlock; + + if (u.mailbox_out.status || + u.mailbox_out.register_id != cpu_to_be16(reg_id)) { + ret = u.mailbox_out.status; + goto unlock; + } + + /* Copy the output, length field takes 10 bits and unit is dword */ + if (method == MLX5_ICMD_QUERY) + memcpy(data_out, u.mailbox_out.reg_data, + ((cpu_to_be16(u.mailbox_out.len) & 0x7FF) - 1) * 4); + +unlock: + mlx5_pciconf_cap9_sem(dev, UNLOCK); +out: + return ret; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c new file mode 100644 index 0000000..8b33548 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 
2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "lib/eq.h" +#include "lib/tout.h" +#ifdef CONFIG_MLX5_ESWITCH +#include "eswitch.h" +#endif + +enum { + MLX5_PAGES_CANT_GIVE = 0, + MLX5_PAGES_GIVE = 1, + MLX5_PAGES_TAKE = 2 +}; + +struct mlx5_pages_req { + struct mlx5_core_dev *dev; + u16 func_id; + u8 ec_function; + s32 npages; + struct work_struct work; + u8 release_all; +}; + +struct fw_page { + struct rb_node rb_node; + u64 addr; + struct page *page; + u32 function; + unsigned long bitmask; + struct list_head list; + unsigned int free_count; +}; + +enum { + MLX5_MAX_RECLAIM_TIME_MILI = 5000, + MLX5_NUM_4K_IN_PAGE = PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE, +}; + +static u32 get_function(u16 func_id, bool ec_function) +{ + return (u32)func_id | (ec_function << 16); +} + +static u32 get_ec_function(u32 function) +{ + return function >> 16; +} + +static u32 get_func_id(u32 function) +{ + return function & 0xffff; +} + +static struct rb_root *page_root_per_function(struct mlx5_core_dev *dev, u32 function) +{ + struct rb_root *root; + int err; + + root = xa_load(&dev->priv.page_root_xa, function); + if (root) + return root; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + err = xa_insert(&dev->priv.page_root_xa, function, root, GFP_KERNEL); + if (err) { + kfree(root); + return ERR_PTR(err); + } + + *root = RB_ROOT; + + return root; +} + +static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u32 function) +{ + struct rb_node *parent = NULL; + struct rb_root *root; + struct rb_node **new; + struct fw_page *nfp; + struct fw_page *tfp; + int i; + + root = page_root_per_function(dev, function); + if (IS_ERR(root)) + return PTR_ERR(root); + + new = &root->rb_node; + + while (*new) { + parent = *new; + tfp = rb_entry(parent, struct fw_page, rb_node); + if (tfp->addr < addr) + new = &parent->rb_left; + else if (tfp->addr > addr) + new = &parent->rb_right; + else + return -EEXIST; + } + + nfp = kzalloc(sizeof(*nfp), GFP_KERNEL); + if (!nfp) + return -ENOMEM; + + nfp->addr = addr; + nfp->page = page; + nfp->function = function; + 
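+ /* Each host page is handed to firmware in MLX5_ADAPTER_PAGE_SIZE (4K)
+  * chunks: the bitmask tracks which of the MLX5_NUM_4K_IN_PAGE chunks
+  * are still free (a set bit means free), and free_count caches the
+  * number of set bits so alloc_4k() and free_4k() know when to take
+  * the fw_page off the free_list or release the whole page back to
+  * the kernel.
+  */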
nfp->free_count = MLX5_NUM_4K_IN_PAGE; + for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++) + set_bit(i, &nfp->bitmask); + + rb_link_node(&nfp->rb_node, parent, new); + rb_insert_color(&nfp->rb_node, root); + list_add(&nfp->list, &dev->priv.free_list); + + return 0; +} + +static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr, + u32 function) +{ + struct fw_page *result = NULL; + struct rb_root *root; + struct rb_node *tmp; + struct fw_page *tfp; + + root = xa_load(&dev->priv.page_root_xa, function); + if (WARN_ON_ONCE(!root)) + return NULL; + + tmp = root->rb_node; + + while (tmp) { + tfp = rb_entry(tmp, struct fw_page, rb_node); + if (tfp->addr < addr) { + tmp = tmp->rb_left; + } else if (tfp->addr > addr) { + tmp = tmp->rb_right; + } else { + result = tfp; + break; + } + } + + return result; +} + +static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, + s32 *npages, int boot) +{ + u32 out[MLX5_ST_SZ_DW(query_pages_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_pages_in)] = {}; + int err; + + MLX5_SET(query_pages_in, in, opcode, MLX5_CMD_OP_QUERY_PAGES); + MLX5_SET(query_pages_in, in, op_mod, boot ? + MLX5_QUERY_PAGES_IN_OP_MOD_BOOT_PAGES : + MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES); + MLX5_SET(query_pages_in, in, embedded_cpu_function, mlx5_core_is_ecpf(dev)); + + err = mlx5_cmd_exec_inout(dev, query_pages, in, out); + if (err) + return err; + + *npages = MLX5_GET(query_pages_out, out, num_pages); + *func_id = MLX5_GET(query_pages_out, out, function_id); + + return err; +} + +static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr, u32 function) +{ + struct fw_page *fp = NULL; + struct fw_page *iter; + unsigned n; + + list_for_each_entry(iter, &dev->priv.free_list, list) { + if (iter->function != function) + continue; + fp = iter; + } + + if (list_empty(&dev->priv.free_list) || !fp) + return -ENOMEM; + + n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask)); + if (n >= MLX5_NUM_4K_IN_PAGE) { + mlx5_core_warn(dev, "alloc 4k bug: fw page = 0x%llx, n = %u, bitmask: %lu, max num of 4K pages: %d\n", + fp->addr, n, fp->bitmask, MLX5_NUM_4K_IN_PAGE); + return -ENOENT; + } + clear_bit(n, &fp->bitmask); + fp->free_count--; + if (!fp->free_count) + list_del(&fp->list); + + *addr = fp->addr + n * MLX5_ADAPTER_PAGE_SIZE; + + return 0; +} + +#define MLX5_U64_4K_PAGE_MASK ((~(u64)0U) << PAGE_SHIFT) + +static void free_fwp(struct mlx5_core_dev *dev, struct fw_page *fwp, + bool in_free_list) +{ + struct rb_root *root; + + root = xa_load(&dev->priv.page_root_xa, fwp->function); + if (WARN_ON_ONCE(!root)) + return; + + rb_erase(&fwp->rb_node, root); + if (in_free_list) + list_del(&fwp->list); + dma_unmap_page(mlx5_core_dma_dev(dev), fwp->addr & MLX5_U64_4K_PAGE_MASK, + PAGE_SIZE, DMA_BIDIRECTIONAL); + __free_page(fwp->page); + kfree(fwp); +} + +static int free_4k(struct mlx5_core_dev *dev, u64 addr, u32 function) +{ + struct fw_page *fwp; + int n; + + fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK, function); + if (!fwp) { + mlx5_core_warn_rl(dev, "page not found\n"); + return -ENOMEM; + } + n = (addr & ~MLX5_U64_4K_PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; + if (test_bit(n, &fwp->bitmask)) { + mlx5_core_warn(dev, "addr 0x%llx is already freed, n %d\n", addr, n); + return -EINVAL; + } + + fwp->free_count++; + set_bit(n, &fwp->bitmask); + if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) + free_fwp(dev, fwp, fwp->free_count != 1); + else if (fwp->free_count == 1) + list_add(&fwp->list, &dev->priv.free_list); + + return 0; +} + +static int alloc_system_page(struct mlx5_core_dev *dev, 
u32 function) +{ + struct device *device = mlx5_core_dma_dev(dev); + int nid = dev_to_node(device); + struct page *page; + u64 zero_addr = 1; + u64 addr; + int err; + + page = alloc_pages_node(nid, GFP_HIGHUSER, 0); + if (!page) { + mlx5_core_warn(dev, "failed to allocate page\n"); + return -ENOMEM; + } +map: + addr = dma_map_page(device, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(device, addr)) { + mlx5_core_warn(dev, "failed dma mapping page\n"); + err = -ENOMEM; + goto err_mapping; + } + + /* Firmware doesn't support page with physical address 0 */ + if (addr == 0) { + zero_addr = addr; + goto map; + } + + err = insert_page(dev, addr, page, function); + if (err) { + mlx5_core_err(dev, "failed to track allocated page\n"); + dma_unmap_page(device, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + } + +err_mapping: + if (err) + __free_page(page); + + if (zero_addr == 0) + dma_unmap_page(device, zero_addr, PAGE_SIZE, + DMA_BIDIRECTIONAL); + + return err; +} + +static void page_notify_fail(struct mlx5_core_dev *dev, u16 func_id, + bool ec_function) +{ + u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {}; + int err; + + MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); + MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_CANT_GIVE); + MLX5_SET(manage_pages_in, in, function_id, func_id); + MLX5_SET(manage_pages_in, in, embedded_cpu_function, ec_function); + + err = mlx5_cmd_exec_in(dev, manage_pages, in); + if (err) + mlx5_core_warn(dev, "page notify failed func_id(%d) err(%d)\n", + func_id, err); + else + mlx5_core_warn(dev, "Page allocation failure notification on func_id(%d) sent to fw\n", + func_id); +} + +#ifdef CONFIG_MLX5_ESWITCH +static int vf_obtainable_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, + int notify_fail, bool ec_function) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *evport; + bool nvme_on_behalf; + + nvme_on_behalf = !MLX5_CAP_GEN(dev, resources_on_nvme_emulation_manager) && + MLX5_CAP_GEN(dev, nvme_device_emulation_manager); + + if (nvme_on_behalf) + return npages; + + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return npages; + + evport = mlx5_eswitch_get_vport(esw, func_id); + if (IS_ERR(evport)) { + mlx5_core_dbg(dev, + "Failed giving %d pages to func:%d due to faulty vport\n", + npages, func_id); + return -EPERM; + } + + spin_lock(&evport->pg_counters_lock); + if (evport->page_limit) { + if (evport->fw_pages >= evport->page_limit) { + spin_unlock(&evport->pg_counters_lock); + mlx5_core_warn(dev, + "Failed giving %d pages to func:%d due to exceeding the limit:%d\n", + npages, func_id, evport->page_limit); + if (notify_fail) + page_notify_fail(dev, func_id, ec_function); + return -EPERM; + } + if ((evport->fw_pages + npages) > evport->page_limit) { + npages = evport->page_limit - evport->fw_pages; + mlx5_core_warn(dev, + "Giving only %d pages to func:%d due to reaching the limit:%d\n", + npages, func_id, evport->page_limit); + } + } + spin_unlock(&evport->pg_counters_lock); + return npages; +} + +static void update_pg_counters(struct mlx5_core_dev *dev, u16 func_id, int npages, bool add) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *evport; + bool nvme_on_behalf; + + nvme_on_behalf = !MLX5_CAP_GEN(dev, resources_on_nvme_emulation_manager) && + MLX5_CAP_GEN(dev, nvme_device_emulation_manager); + + if (nvme_on_behalf) // BF1 + return; + + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return; + + evport = mlx5_eswitch_get_vport(esw, func_id); + + if 
(IS_ERR(evport)) + return; + + spin_lock(&evport->pg_counters_lock); + if (add) + evport->fw_pages += npages; + else + evport->fw_pages -= npages; + spin_unlock(&evport->pg_counters_lock); +} +#endif + +static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, + int event, bool ec_function) +{ + unsigned long max_duration = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, CMD) / 2); + u32 function = get_function(func_id, ec_function); + u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0}; + int inlen = MLX5_ST_SZ_BYTES(manage_pages_in); + int notify_fail = event; + u64 addr; + int err; + u32 *in; + int i; + +#ifdef CONFIG_MLX5_ESWITCH + if (func_id) { + npages = vf_obtainable_pages(dev, func_id, npages, + notify_fail, ec_function); + if (npages <= 0) + return -EPERM; + } +#endif + inlen += npages * MLX5_FLD_SZ_BYTES(manage_pages_in, pas[0]); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + mlx5_core_warn(dev, "vzalloc failed %d\n", inlen); + goto out_free; + } + + for (i = 0; i < npages; i++) { + if (time_after(jiffies, max_duration)) { + mlx5_core_warn(dev, + "%d pages alloc time exceeded the max permitted duration\n", + npages); + err = -ENOMEM; + goto out_4k; + } +retry: + err = alloc_4k(dev, &addr, function); + if (err) { + if (err == -ENOMEM) + err = alloc_system_page(dev, function); + if (err) { + dev->priv.fw_pages_alloc_failed += (npages - i); + goto out_4k; + } + + goto retry; + } + MLX5_ARRAY_SET64(manage_pages_in, in, pas, i, addr); + } + + MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); + MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_GIVE); + MLX5_SET(manage_pages_in, in, function_id, func_id); + MLX5_SET(manage_pages_in, in, input_num_entries, npages); + MLX5_SET(manage_pages_in, in, embedded_cpu_function, ec_function); + + err = mlx5_cmd_do(dev, in, inlen, out, sizeof(out)); + if (err == -EREMOTEIO) { + notify_fail = 0; + /* if triggered by FW and failed by FW ignore */ + if (event) { + err = 0; + goto out_dropped; + } + } + err = mlx5_cmd_check(dev, err, in, out); + if (err) { + mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", + func_id, npages, err); + goto out_dropped; + } + + dev->priv.fw_pages += npages; + if (func_id) { + dev->priv.vfs_pages += npages; +#ifdef CONFIG_MLX5_ESWITCH + update_pg_counters(dev, func_id, npages, true); +#endif + } else if (mlx5_core_is_ecpf(dev) && !ec_function) { + dev->priv.host_pf_pages += npages; + } + + mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x, err %d\n", + npages, ec_function, func_id, err); + + kvfree(in); + return 0; + +out_dropped: + dev->priv.give_pages_dropped += npages; +out_4k: + for (i--; i >= 0; i--) + free_4k(dev, MLX5_GET64(manage_pages_in, in, pas[i]), function); +out_free: + kvfree(in); + if (notify_fail) + page_notify_fail(dev, func_id, ec_function); + return err; +} + +static void release_all_pages(struct mlx5_core_dev *dev, u16 func_id, + bool ec_function) +{ + u32 function = get_function(func_id, ec_function); + struct rb_root *root; + struct rb_node *p; + int npages = 0; + + root = xa_load(&dev->priv.page_root_xa, function); + if (WARN_ON_ONCE(!root)) + return; + + p = rb_first(root); + while (p) { + struct fw_page *fwp = rb_entry(p, struct fw_page, rb_node); + + p = rb_next(p); + npages += (MLX5_NUM_4K_IN_PAGE - fwp->free_count); + free_fwp(dev, fwp, fwp->free_count); + } + + dev->priv.fw_pages -= npages; + if (func_id) { + dev->priv.vfs_pages -= npages; +#ifdef CONFIG_MLX5_ESWITCH + update_pg_counters(dev, func_id, npages, false); +#endif + } 
else if (mlx5_core_is_ecpf(dev) && !ec_function) { + dev->priv.host_pf_pages -= npages; + } + + mlx5_core_dbg(dev, "npages %d, ec_function %d, func_id 0x%x\n", + npages, ec_function, func_id); +} + +static u32 fwp_fill_manage_pages_out(struct fw_page *fwp, u32 *out, u32 index, + u32 npages) +{ + u32 pages_set = 0; + unsigned int n; + + for_each_clear_bit(n, &fwp->bitmask, MLX5_NUM_4K_IN_PAGE) { + MLX5_ARRAY_SET64(manage_pages_out, out, pas, index + pages_set, + fwp->addr + (n * MLX5_ADAPTER_PAGE_SIZE)); + pages_set++; + + if (!--npages) + break; + } + + return pages_set; +} + +static int reclaim_pages_cmd(struct mlx5_core_dev *dev, + u32 *in, int in_size, u32 *out, int out_size) +{ + struct rb_root *root; + struct fw_page *fwp; + struct rb_node *p; + bool ec_function; + u32 func_id; + u32 npages; + u32 i = 0; + + if (!mlx5_cmd_is_down(dev)) + return mlx5_cmd_do(dev, in, in_size, out, out_size); + + /* No hard feelings, we want our pages back! */ + npages = MLX5_GET(manage_pages_in, in, input_num_entries); + func_id = MLX5_GET(manage_pages_in, in, function_id); + ec_function = MLX5_GET(manage_pages_in, in, embedded_cpu_function); + + root = xa_load(&dev->priv.page_root_xa, get_function(func_id, ec_function)); + if (WARN_ON_ONCE(!root)) + return -EEXIST; + + p = rb_first(root); + while (p && i < npages) { + fwp = rb_entry(p, struct fw_page, rb_node); + p = rb_next(p); + + i += fwp_fill_manage_pages_out(fwp, out, i, npages - i); + } + + MLX5_SET(manage_pages_out, out, output_num_entries, i); + return 0; +} + +static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, + int *nclaimed, bool event, bool ec_function) +{ + u32 function = get_function(func_id, ec_function); + int outlen = MLX5_ST_SZ_BYTES(manage_pages_out); + u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {}; + int num_claimed; + u32 *out; + int err; + int i; + int claimed = 0; + + if (nclaimed) + *nclaimed = 0; + + outlen += npages * MLX5_FLD_SZ_BYTES(manage_pages_out, pas[0]); + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); + MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_TAKE); + MLX5_SET(manage_pages_in, in, function_id, func_id); + MLX5_SET(manage_pages_in, in, input_num_entries, npages); + MLX5_SET(manage_pages_in, in, embedded_cpu_function, ec_function); + + mlx5_core_dbg(dev, "func 0x%x, npages %d, outlen %d\n", + func_id, npages, outlen); + err = reclaim_pages_cmd(dev, in, sizeof(in), out, outlen); + if (err) { + npages = MLX5_GET(manage_pages_in, in, input_num_entries); + dev->priv.reclaim_pages_discard += npages; + } + /* if triggered by FW event and failed by FW then ignore */ + if (event && err == -EREMOTEIO) { + err = 0; + goto out_free; + } + + err = mlx5_cmd_check(dev, err, in, out); + if (err) { + mlx5_core_err(dev, "failed reclaiming pages: err %d\n", err); + goto out_free; + } + + num_claimed = MLX5_GET(manage_pages_out, out, output_num_entries); + if (num_claimed > npages) { + mlx5_core_warn(dev, "fw returned %d, driver asked %d => corruption\n", + num_claimed, npages); + err = -EINVAL; + goto out_free; + } + + for (i = 0; i < num_claimed; i++) + if (!free_4k(dev, MLX5_GET64(manage_pages_out, out, pas[i]), function)) + claimed++; + + if (nclaimed) + *nclaimed = claimed; + + dev->priv.fw_pages -= claimed; + if (func_id) { + dev->priv.vfs_pages -= claimed; +#ifdef CONFIG_MLX5_ESWITCH + update_pg_counters(dev, func_id, npages, false); +#endif + } else if (mlx5_core_is_ecpf(dev) && !ec_function) { + 
dev->priv.host_pf_pages -= claimed; + } + +out_free: + kvfree(out); + return err; +} + +static void pages_work_handler(struct work_struct *work) +{ + struct mlx5_pages_req *req = container_of(work, struct mlx5_pages_req, work); + struct mlx5_core_dev *dev = req->dev; + int err = 0; + + if (req->release_all) + release_all_pages(dev, req->func_id, req->ec_function); + else if (req->npages < 0) + err = reclaim_pages(dev, req->func_id, -1 * req->npages, NULL, + true, req->ec_function); + else if (req->npages > 0) + err = give_pages(dev, req->func_id, req->npages, 1, req->ec_function); + + if (err) + mlx5_core_warn(dev, "%s fail %d\n", + req->npages < 0 ? "reclaim" : "give", err); + + kfree(req); +} + +enum { + EC_FUNCTION_MASK = 0x8000, + RELEASE_ALL_PAGES_MASK = 0x4000, +}; + +static int req_pages_handler(struct notifier_block *nb, + unsigned long type, void *data) +{ + struct mlx5_pages_req *req; + struct mlx5_core_dev *dev; + struct mlx5_priv *priv; + struct mlx5_eqe *eqe; + bool ec_function; + bool release_all; + u16 func_id; + s32 npages; + + priv = mlx5_nb_cof(nb, struct mlx5_priv, pg_nb); + dev = container_of(priv, struct mlx5_core_dev, priv); + eqe = data; + + func_id = be16_to_cpu(eqe->data.req_pages.func_id); + npages = be32_to_cpu(eqe->data.req_pages.num_pages); + ec_function = be16_to_cpu(eqe->data.req_pages.ec_function) & EC_FUNCTION_MASK; + release_all = be16_to_cpu(eqe->data.req_pages.ec_function) & + RELEASE_ALL_PAGES_MASK; + mlx5_core_dbg(dev, "page request for func 0x%x, npages %d, release_all %d\n", + func_id, npages, release_all); + req = kzalloc(sizeof(*req), GFP_ATOMIC); + if (!req) { + mlx5_core_warn(dev, "failed to allocate pages request\n"); + return NOTIFY_DONE; + } + + req->dev = dev; + req->func_id = func_id; + req->npages = npages; + req->ec_function = ec_function; + req->release_all = release_all; + INIT_WORK(&req->work, pages_work_handler); + queue_work(dev->priv.pg_wq, &req->work); + return NOTIFY_OK; +} + +int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot) +{ + u16 func_id; + s32 npages; + int err; + + err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot); + if (err) + return err; + + mlx5_core_dbg(dev, "requested %d %s pages for func_id 0x%x\n", + npages, boot ? "boot" : "init", func_id); + + return give_pages(dev, func_id, npages, 0, mlx5_core_is_ecpf(dev)); +} + +enum { + MLX5_BLKS_FOR_RECLAIM_PAGES = 12 +}; + +static int optimal_reclaimed_pages(void) +{ + struct mlx5_cmd_prot_block *block; + struct mlx5_cmd_layout *lay; + int ret; + + ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) - + MLX5_ST_SZ_BYTES(manage_pages_out)) / + MLX5_FLD_SZ_BYTES(manage_pages_out, pas[0]); + + return ret; +} + +static int mlx5_reclaim_root_pages(struct mlx5_core_dev *dev, + struct rb_root *root, u32 function) +{ + u64 recl_pages_to_jiffies = msecs_to_jiffies(mlx5_tout_ms(dev, RECLAIM_PAGES)); + unsigned long end = jiffies + recl_pages_to_jiffies; + + while (!RB_EMPTY_ROOT(root)) { + int nclaimed; + int err; + + err = reclaim_pages(dev, get_func_id(function), optimal_reclaimed_pages(), + &nclaimed, false, get_ec_function(function)); + if (err) { + mlx5_core_warn(dev, "failed reclaiming pages (%d) for func id 0x%x\n", + err, get_func_id(function)); + return err; + } + + if (nclaimed) + end = jiffies + recl_pages_to_jiffies; + + if (time_after(jiffies, end)) { + mlx5_core_warn(dev, "FW did not return all pages. 
giving up...\n"); + break; + } + } + + return 0; +} + +int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) +{ + struct rb_root *root; + unsigned long id; + void *entry; + + xa_for_each(&dev->priv.page_root_xa, id, entry) { + root = entry; + mlx5_reclaim_root_pages(dev, root, id); + xa_erase(&dev->priv.page_root_xa, id); + kfree(root); + } + + WARN_ON(!xa_empty(&dev->priv.page_root_xa)); + + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + dev->priv.vfs_pages = 0; + dev->priv.fw_pages = 0; + dev->priv.host_pf_pages = 0; + } + + WARN(dev->priv.fw_pages, + "FW pages counter is %d after reclaiming all pages\n", + dev->priv.fw_pages); + WARN(dev->priv.vfs_pages, + "VFs FW pages counter is %d after reclaiming all pages\n", + dev->priv.vfs_pages); + + /* Warning but don't dump stack */ + if (dev->priv.host_pf_pages) + mlx5_core_warn(dev, "External host PF FW pages counter is %d after reclaiming all pages\n", + dev->priv.host_pf_pages); + + return 0; +} + +int mlx5_pagealloc_init(struct mlx5_core_dev *dev) +{ + INIT_LIST_HEAD(&dev->priv.free_list); + dev->priv.pg_wq = create_singlethread_workqueue("mlx5_page_allocator"); + if (!dev->priv.pg_wq) + return -ENOMEM; + + xa_init(&dev->priv.page_root_xa); + mlx5_pages_debugfs_init(dev); + + return 0; +} + +void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev) +{ + mlx5_pages_debugfs_cleanup(dev); + xa_destroy(&dev->priv.page_root_xa); + destroy_workqueue(dev->priv.pg_wq); +} + +void mlx5_pagealloc_start(struct mlx5_core_dev *dev) +{ + MLX5_NB_INIT(&dev->priv.pg_nb, req_pages_handler, PAGE_REQUEST); + mlx5_eq_notifier_register(dev, &dev->priv.pg_nb); +} + +void mlx5_pagealloc_stop(struct mlx5_core_dev *dev) +{ + mlx5_eq_notifier_unregister(dev, &dev->priv.pg_nb); + flush_workqueue(dev->priv.pg_wq); +} + +int mlx5_wait_for_pages(struct mlx5_core_dev *dev, int *pages) +{ + u64 recl_vf_pages_to_jiffies = msecs_to_jiffies(mlx5_tout_ms(dev, RECLAIM_VFS_PAGES)); + unsigned long end = jiffies + recl_vf_pages_to_jiffies; + int prev_pages = *pages; + + /* In case of internal error we will free the pages manually later */ + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + mlx5_core_warn(dev, "Skipping wait for vf pages stage"); + return 0; + } + + mlx5_core_dbg(dev, "Waiting for %d pages\n", prev_pages); + while (*pages) { + if (time_after(jiffies, end)) { + mlx5_core_warn(dev, "aborting while there are %d pending pages\n", *pages); + return -ETIMEDOUT; + } + if (*pages < prev_pages) { + end = jiffies + recl_vf_pages_to_jiffies; + prev_pages = *pages; + } + msleep(50); + } + + mlx5_core_dbg(dev, "All pages received\n"); + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/params.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/params.c new file mode 100644 index 0000000..9b5faab --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/params.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +static char *guids; +module_param_named(guids, guids, charp, 0444); +MODULE_PARM_DESC(node_guid, "guids configuration. This module parameter will be obsolete!"); + +/* format: dddd:bb:vv.f-nn:nn:nn:nn:nn:nn:nn:nn:pp:pp:pp:pp:pp:pp:pp:pp:qq:qq:qq:qq:qq:qq:qq:qq, + * + * dddd:bb:vv.f are domain, bus, device, function for the device + * nn:nn:nn:nn:nn:nn:nn:nn is node guid to configure + * pp:pp:pp:pp:pp:pp:pp:pp is port 1 GUID + * qq:qq:qq:qq:qq:qq:qq:qq is port 2 GUID. this param is optional + * + * The comma indicates another record follows + */ + +static u64 extract_guid(int *g) +{ + return ((u64)g[0] << 56) | + ((u64)g[1] << 48) | + ((u64)g[2] << 40) | + ((u64)g[3] << 32) | + ((u64)g[4] << 24) | + ((u64)g[5] << 16) | + ((u64)g[6] << 8) | + (u64)g[7]; +} + +static int is_valid_len(const char *p, int *nport) +{ + int tmp; + char *x; + + x = strchr(p, ','); + if (x) + tmp = (int)(x - p); + else + tmp = strlen(p); + + switch (tmp) { + case 47: + *nport = 1; + break; + + case 71: + *nport = 2; + break; + + default: + return 0; + } + + return 1; +} + +static int get_record(const char *p, u64 *node_guid, u64 *port1_guid, + u64 *port2_guid, int *nport) +{ + int tmp[8]; + int err; + const char *guid_format = "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"; + int np; + + if (!is_valid_len(p, &np)) + return -EINVAL; + + err = sscanf(p, guid_format, tmp, tmp + 1, tmp + 2, tmp + 3, tmp + 4, + tmp + 5, tmp + 6, tmp + 7); + if (err != 8) + return -EINVAL; + + *node_guid = extract_guid(tmp); + p += 23; + if (*p != ':') + return -EINVAL; + p++; + + err = sscanf(p, guid_format, tmp, tmp + 1, tmp + 2, tmp + 3, tmp + 4, + tmp + 5, tmp + 6, tmp + 7); + if (err != 8) + return -EINVAL; + *port1_guid = extract_guid(tmp); + if (np != 2) { + *nport = np; + return 0; + } + + p += 23; + if (*p != ':') + return -EINVAL; + p++; + + err = sscanf(p, guid_format, tmp, tmp + 1, tmp + 2, tmp + 3, tmp + 4, + tmp + 5, tmp + 6, tmp + 7); + if (err != 8) + return -EINVAL; + *port2_guid = extract_guid(tmp); + *nport = np; + + return 0; +} + +int mlx5_update_guids(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + const char *devp; + char *p = guids; + u64 port1_guid = 0; + u64 port2_guid = 0; + u64 node_guid; + int nport; + int dlen; + int err; + struct 
mlx5_hca_vport_context *req; + + if (!p) + return 0; + + devp = dev_name(&pdev->dev); + dlen = strlen(devp); + while (1) { + if (dlen >= strlen(p)) + return -ENODEV; + + if (!memcmp(devp, p, dlen)) { + p += dlen; + if (*p != '-') + return -EINVAL; + p++; + break; + } + + p = strchr(p, ','); + if (!p) + return -ENODEV; + p++; + } + + err = get_record(p, &node_guid, &port1_guid, &port2_guid, &nport); + if (err) + return err; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->node_guid = node_guid; + req->port_guid = port1_guid; + req->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID | MLX5_HCA_VPORT_SEL_PORT_GUID; + err = mlx5_core_modify_hca_vport_context(dev, 0, 1, 0, req); + if (err) + goto out; + + if (nport == 2) { + req->port_guid = port2_guid; + err = mlx5_core_modify_hca_vport_context(dev, 0, 2, 0, req); + } + +out: + kfree(req); + + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c new file mode 100644 index 0000000..9561c9d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -0,0 +1,808 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include +#include +#include +#include +#include "mlx5_core.h" +#include "mlx5_irq.h" +#include "pci_irq.h" +#include "lib/sf.h" +#ifdef CONFIG_RFS_ACCEL +#include +#endif + +#define MLX5_PF_IRQ_CTRL_NUM (1) + +#define MLX5_SFS_PER_CTRL_IRQ 64 +#define MLX5_IRQ_CTRL_SF_MAX 8 +/* min num of vectors for SFs to be enabled */ +#define MLX5_IRQ_VEC_COMP_BASE_SF 2 + +#define MLX5_EQ_SHARE_IRQ_MAX_COMP (8) +#define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX) +#define MLX5_EQ_SHARE_IRQ_MIN_COMP (1) +#define MLX5_EQ_SHARE_IRQ_MIN_CTRL (4) + +struct mlx5_irq { + struct atomic_notifier_head nh; + cpumask_var_t mask; + char name[MLX5_MAX_IRQ_NAME]; + struct mlx5_irq_pool *pool; + int refcount; + u32 index; + int irqn; +}; + +struct mlx5_irq_table { + struct mlx5_irq_pool *pf_pool; + struct mlx5_irq_pool *sf_ctrl_pool; + struct mlx5_irq_pool *sf_comp_pool; +}; + +/** + * mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors + * to be ssigned to each VF. + * @dev: PF to work on + * @num_vfs: Number of enabled VFs + */ +int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs) +{ + int num_vf_msix, min_msix, max_msix; + + num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix); + if (!num_vf_msix) + return 0; + + min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size); + max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size); + + /* Limit maximum number of MSI-X vectors so the default configuration + * has some available in the pool. This will allow the user to increase + * the number of vectors in a VF without having to first size-down other + * VFs. 
+ */ + return max(min(num_vf_msix / num_vfs, max_msix / 2), min_msix); +} + +/** + * mlx5_set_msix_vec_count - Set dynamically allocated MSI-X on the VF + * @dev: PF to work on + * @function_id: Internal PCI VF function ID + * @msix_vec_count: Number of MSI-X vectors to set + */ +int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id, + int msix_vec_count) +{ + int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + void *hca_cap = NULL, *query_cap = NULL, *cap; + int num_vf_msix, min_msix, max_msix; + int ret; + + num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix); + if (!num_vf_msix) + return 0; + + if (!MLX5_CAP_GEN(dev, vport_group_manager) || !mlx5_core_is_pf(dev)) + return -EOPNOTSUPP; + + min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size); + max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size); + + if (msix_vec_count < min_msix) + return -EINVAL; + + if (msix_vec_count > max_msix) + return -EOVERFLOW; + + query_cap = kzalloc(query_sz, GFP_KERNEL); + hca_cap = kzalloc(set_sz, GFP_KERNEL); + if (!hca_cap || !query_cap) { + ret = -ENOMEM; + goto out; + } + + ret = mlx5_vport_get_other_func_cap(dev, function_id, query_cap); + if (ret) + goto out; + + cap = MLX5_ADDR_OF(set_hca_cap_in, hca_cap, capability); + memcpy(cap, MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability), + MLX5_UN_SZ_BYTES(hca_cap_union)); + MLX5_SET(cmd_hca_cap, cap, dynamic_msix_table_size, msix_vec_count); + + MLX5_SET(set_hca_cap_in, hca_cap, opcode, MLX5_CMD_OP_SET_HCA_CAP); + MLX5_SET(set_hca_cap_in, hca_cap, other_function, 1); + MLX5_SET(set_hca_cap_in, hca_cap, function_id, function_id); + + MLX5_SET(set_hca_cap_in, hca_cap, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1); + ret = mlx5_cmd_exec_in(dev, set_hca_cap, hca_cap); +out: + kfree(hca_cap); + kfree(query_cap); + return ret; +} + +static void irq_release(struct mlx5_irq *irq) +{ + struct mlx5_irq_pool *pool = irq->pool; + + xa_erase(&pool->irqs, irq->index); + /* free_irq requires that affinity_hint and rmap will be cleared + * before calling it. This is why there is asymmetry with set_rmap + * which should be called after alloc_irq but before request_irq. 
+ */ + irq_update_affinity_hint(irq->irqn, NULL); + free_cpumask_var(irq->mask); + free_irq(irq->irqn, &irq->nh); + kfree(irq); +} + +int mlx5_irq_put(struct mlx5_irq *irq) +{ + struct mlx5_irq_pool *pool = irq->pool; + int ret = 0; + + mutex_lock(&pool->lock); + irq->refcount--; + if (!irq->refcount) { + irq_release(irq); + ret = 1; + } + mutex_unlock(&pool->lock); + return ret; +} + +int mlx5_irq_read_locked(struct mlx5_irq *irq) +{ + lockdep_assert_held(&irq->pool->lock); + return irq->refcount; +} + +int mlx5_irq_get_locked(struct mlx5_irq *irq) +{ + lockdep_assert_held(&irq->pool->lock); + if (WARN_ON_ONCE(!irq->refcount)) + return 0; + irq->refcount++; + return 1; +} + +static int irq_get(struct mlx5_irq *irq) +{ + int err; + + mutex_lock(&irq->pool->lock); + err = mlx5_irq_get_locked(irq); + mutex_unlock(&irq->pool->lock); + return err; +} + +static irqreturn_t irq_int_handler(int irq, void *nh) +{ + atomic_notifier_call_chain(nh, 0, NULL); + return IRQ_HANDLED; +} + +static void irq_sf_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx) +{ + snprintf(name, MLX5_MAX_IRQ_NAME, "%s%d", pool->name, vecidx); +} + +static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx) +{ + if (!pool->xa_num_irqs.max) { + /* in case we only have a single irq for the device */ + snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_combined%d", vecidx); + return; + } + + if (vecidx == pool->xa_num_irqs.max) { + snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx); + return; + } + + snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx); +} + +struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, + const struct cpumask *affinity) +{ + struct mlx5_core_dev *dev = pool->dev; + char name[MLX5_MAX_IRQ_NAME]; + struct mlx5_irq *irq; + int err; + + irq = kzalloc(sizeof(*irq), GFP_KERNEL); + if (!irq) + return ERR_PTR(-ENOMEM); + irq->irqn = pci_irq_vector(dev->pdev, i); + if (!mlx5_irq_pool_is_sf_pool(pool)) + irq_set_name(pool, name, i); + else + irq_sf_set_name(pool, name, i); + ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh); + snprintf(irq->name, MLX5_MAX_IRQ_NAME, + "%s@pci:%s", name, pci_name(dev->pdev)); + err = request_irq(irq->irqn, irq_int_handler, 0, irq->name, + &irq->nh); + if (err) { + mlx5_core_err(dev, "Failed to request irq. err = %d\n", err); + goto err_req_irq; + } + if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) { + mlx5_core_warn(dev, "zalloc_cpumask_var failed\n"); + err = -ENOMEM; + goto err_cpumask; + } + if (affinity) { + cpumask_copy(irq->mask, affinity); + irq_set_affinity_and_hint(irq->irqn, irq->mask); + } + irq->pool = pool; + irq->refcount = 1; + irq->index = i; + err = xa_err(xa_store(&pool->irqs, irq->index, irq, GFP_KERNEL)); + if (err) { + mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n", + irq->index, err); + goto err_xa; + } + return irq; +err_xa: + irq_update_affinity_hint(irq->irqn, NULL); + free_cpumask_var(irq->mask); +err_cpumask: + free_irq(irq->irqn, &irq->nh); +err_req_irq: + kfree(irq); + return ERR_PTR(err); +} + +int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb) +{ + int ret; + + ret = irq_get(irq); + if (!ret) + /* Something very bad happens here, we are enabling EQ + * on non-existing IRQ. 
+ */ + return -ENOENT; + ret = atomic_notifier_chain_register(&irq->nh, nb); + if (ret) + mlx5_irq_put(irq); + return ret; +} + +int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb) +{ + int err = 0; + + err = atomic_notifier_chain_unregister(&irq->nh, nb); + mlx5_irq_put(irq); + return err; +} + +struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq) +{ + return irq->mask; +} + +int mlx5_irq_get_index(struct mlx5_irq *irq) +{ + return irq->index; +} + +/* irq_pool API */ + +static int irq_pool_size_get(struct mlx5_irq_pool *pool) +{ + return pool->xa_num_irqs.max - pool->xa_num_irqs.min + 1; +} + +/* requesting an irq from a given pool according to given index */ +static struct mlx5_irq * +irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx, + const struct cpumask *affinity) +{ + struct mlx5_irq *irq; + + mutex_lock(&pool->lock); + irq = xa_load(&pool->irqs, vecidx); + if (irq) { + mlx5_irq_get_locked(irq); + goto unlock; + } + irq = mlx5_irq_alloc(pool, vecidx, affinity); +unlock: + mutex_unlock(&pool->lock); + return irq; +} + +static struct mlx5_irq_pool *sf_ctrl_irq_pool_get(struct mlx5_irq_table *irq_table) +{ + return irq_table->sf_ctrl_pool; +} + +static struct mlx5_irq_pool *sf_irq_pool_get(struct mlx5_irq_table *irq_table) +{ + return irq_table->sf_comp_pool; +} + +struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev) +{ + struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev); + struct mlx5_irq_pool *pool = NULL; + + if (mlx5_core_is_sf(dev)) + pool = sf_irq_pool_get(irq_table); + + /* In some configs, there won't be a pool of SFs IRQs. Hence, returning + * the PF IRQs pool in case the SF pool doesn't exist. + */ + return pool ? pool : irq_table->pf_pool; +} + +static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev) +{ + struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev); + struct mlx5_irq_pool *pool = NULL; + + if (mlx5_core_is_sf(dev)) + pool = sf_ctrl_irq_pool_get(irq_table); + + /* In some configs, there won't be a pool of SFs IRQs. Hence, returning + * the PF IRQs pool in case the SF pool doesn't exist. + */ + return pool ? pool : irq_table->pf_pool; +} + +/** + * mlx5_irqs_release - release one or more IRQs back to the system. + * @irqs: IRQs to be released. + * @nirqs: number of IRQs to be released. + */ +static void mlx5_irqs_release(struct mlx5_irq **irqs, int nirqs) +{ + int i; + + for (i = 0; i < nirqs; i++) { + synchronize_irq(irqs[i]->irqn); + mlx5_irq_put(irqs[i]); + } +} + +/** + * mlx5_ctrl_irq_release - release a ctrl IRQ back to the system. + * @ctrl_irq: ctrl IRQ to be released. + */ +void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq) +{ + mlx5_irqs_release(&ctrl_irq, 1); +} + +/** + * mlx5_ctrl_irq_request - request a ctrl IRQ for mlx5 device. + * @dev: mlx5 device that requesting the IRQ. + * + * This function returns a pointer to IRQ, or ERR_PTR in case of error. 
+ */ +struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev) +{ + struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev); + cpumask_var_t req_mask; + struct mlx5_irq *irq; + + if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) + return ERR_PTR(-ENOMEM); + cpumask_copy(req_mask, cpu_online_mask); + if (!mlx5_irq_pool_is_sf_pool(pool)) { + /* In case we are allocating a control IRQ for PF/VF */ + if (!pool->xa_num_irqs.max) { + cpumask_clear(req_mask); + /* In case we only have a single IRQ for PF/VF */ + cpumask_set_cpu(cpumask_first(cpu_online_mask), req_mask); + } + /* Allocate the IRQ in the last index of the pool */ + irq = irq_pool_request_vector(pool, pool->xa_num_irqs.max, req_mask); + } else { + irq = mlx5_irq_affinity_request(pool, req_mask); + } + + free_cpumask_var(req_mask); + return irq; +} + +/** + * mlx5_irq_request - request an IRQ for mlx5 PF/VF device. + * @dev: mlx5 device that is requesting the IRQ. + * @vecidx: vector index of the IRQ. This argument is ignored if affinity is + * provided. + * @affinity: cpumask requested for this IRQ. + * + * This function returns a pointer to IRQ, or ERR_PTR in case of error. + */ +struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx, + const struct cpumask *affinity) +{ + struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev); + struct mlx5_irq_pool *pool; + struct mlx5_irq *irq; + + pool = irq_table->pf_pool; + irq = irq_pool_request_vector(pool, vecidx, affinity); + if (IS_ERR(irq)) + return irq; + mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n", + irq->irqn, cpumask_pr_args(affinity), + irq->refcount / MLX5_EQ_REFS_PER_IRQ); + return irq; +} + +/** + * mlx5_irqs_release_vectors - release one or more IRQs back to the system. + * @irqs: IRQs to be released. + * @nirqs: number of IRQs to be released. + */ +void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs) +{ + mlx5_irqs_release(irqs, nirqs); +} + +/** + * mlx5_irqs_request_vectors - request one or more IRQs for mlx5 device. + * @dev: mlx5 device that is requesting the IRQs. + * @cpus: CPUs array for binding the IRQs + * @nirqs: number of IRQs to request. + * @irqs: an output array of IRQ pointers. + * + * Each IRQ is bound to at most 1 CPU. + * This function requests @nirqs IRQs. + * + * This function returns the number of IRQs requested (which might be smaller than + * @nirqs), if successful, or a negative error code in case of an error. + */ +int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs, + struct mlx5_irq **irqs) +{ + cpumask_var_t req_mask; + struct mlx5_irq *irq; + int i; + + if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL)) + return -ENOMEM; + for (i = 0; i < nirqs; i++) { + cpumask_set_cpu(cpus[i], req_mask); + irq = mlx5_irq_request(dev, i, req_mask); + if (IS_ERR(irq)) + break; + cpumask_clear(req_mask); + irqs[i] = irq; + } + + free_cpumask_var(req_mask); + return i ? i : PTR_ERR(irq); +} + +static int req_mask_local_spread(unsigned int i, int node, + const struct cpumask *irqs_req_mask) +{ + int cpu; + + if (node == NUMA_NO_NODE) { + for_each_cpu_and(cpu, cpu_online_mask, irqs_req_mask) + if (i-- == 0) + return cpu; + } else { + /* NUMA first. */ + for_each_cpu_and(cpu, cpumask_of_node(node), irqs_req_mask) + if (cpu_online(cpu)) + if (i-- == 0) + return cpu; + + for_each_online_cpu(cpu) { + /* Skip NUMA nodes, done above. 
*/ + if (cpumask_test_cpu(cpu, cpumask_of_node(node))) + continue; + + if (i-- == 0) + return cpu; + } + } + WARN_ON(true); + return cpumask_first(cpu_online_mask); +} + +/** + * mlx5_irqs_request_mask - request one or more IRQs for mlx5 device. + * @dev: mlx5 device that is requesting the IRQs. + * @irqs: an output array of IRQ pointers. + * @irqs_req_mask: cpumask requested for these IRQs. + * + * Each IRQ is bound to at most 1 CPU. + * This function returns the number of IRQs requested (which might be smaller than + * cpumask_weight(@irqs_req_mask)), if successful, or a negative error code in + * case of an error. + */ +int mlx5_irqs_request_mask(struct mlx5_core_dev *dev, struct mlx5_irq **irqs, + struct cpumask *irqs_req_mask) +{ + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + struct mlx5_irq *irq; + int nirqs; + int cpu; + int i; + + /* Request an IRQ for each online CPU in the given mask */ + cpumask_and(irqs_req_mask, irqs_req_mask, cpu_online_mask); + nirqs = cpumask_weight(irqs_req_mask); + for (i = 0; i < nirqs; i++) { + /* Iterate over the mask the caller provided in a NUMA-aware fashion. + * Local CPUs are requested first, followed by non-local ones. + */ + cpu = req_mask_local_spread(i, dev->priv.numa_node, irqs_req_mask); + + if (mlx5_irq_pool_is_sf_pool(pool)) + irq = mlx5_irq_affinity_request(pool, cpumask_of(cpu)); + else + irq = mlx5_irq_request(dev, i, cpumask_of(cpu)); + if (IS_ERR(irq)) { + if (!i) + return PTR_ERR(irq); + return i; + } + irqs[i] = irq; + mlx5_core_dbg(dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", + pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), + cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), + mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); + } + return i; +} + +static struct mlx5_irq_pool * +irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name, + u32 min_threshold, u32 max_threshold) +{ + struct mlx5_irq_pool *pool = kvzalloc(sizeof(*pool), GFP_KERNEL); + + if (!pool) + return ERR_PTR(-ENOMEM); + pool->dev = dev; + mutex_init(&pool->lock); + xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC); + pool->xa_num_irqs.min = start; + pool->xa_num_irqs.max = start + size - 1; + if (name) + snprintf(pool->name, MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS, + "%s", name); + pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ; + pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ; + mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d", + name, size, start); + return pool; +} + +static void irq_pool_free(struct mlx5_irq_pool *pool) +{ + struct mlx5_irq *irq; + unsigned long index; + unsigned int cpu; + + /* There are cases in which we are destroying the irq_table before + * freeing all the IRQs, fast teardown for example. Hence, free the irqs + * which might not have been freed. 
+ */ + xa_for_each(&pool->irqs, index, irq) + irq_release(irq); + xa_destroy(&pool->irqs); + mutex_destroy(&pool->lock); + if (pool->irqs_per_cpu) + for_each_online_cpu(cpu) + WARN_ON(pool->irqs_per_cpu[cpu]); + kfree(pool->irqs_per_cpu); + kvfree(pool); +} + +static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pf_vec) +{ + struct mlx5_irq_table *table = dev->priv.irq_table; + int num_sf_ctrl_by_msix; + int num_sf_ctrl_by_sfs; + int num_sf_ctrl; + int err; + + /* init pf_pool */ + table->pf_pool = irq_pool_alloc(dev, 0, pf_vec, NULL, + MLX5_EQ_SHARE_IRQ_MIN_COMP, + MLX5_EQ_SHARE_IRQ_MAX_COMP); + if (IS_ERR(table->pf_pool)) + return PTR_ERR(table->pf_pool); + if (!mlx5_sf_max_functions(dev)) + return 0; + if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) { + mlx5_core_dbg(dev, "Not enught IRQs for SFs. SF may run at lower performance\n"); + return 0; + } + + /* init sf_ctrl_pool */ + num_sf_ctrl_by_msix = DIV_ROUND_UP(sf_vec, MLX5_COMP_EQS_PER_SF); + num_sf_ctrl_by_sfs = DIV_ROUND_UP(mlx5_sf_max_functions(dev), + MLX5_SFS_PER_CTRL_IRQ); + num_sf_ctrl = min_t(int, num_sf_ctrl_by_msix, num_sf_ctrl_by_sfs); + num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl); + table->sf_ctrl_pool = irq_pool_alloc(dev, pf_vec, num_sf_ctrl, + "mlx5_sf_ctrl", + MLX5_EQ_SHARE_IRQ_MIN_CTRL, + MLX5_EQ_SHARE_IRQ_MAX_CTRL); + if (IS_ERR(table->sf_ctrl_pool)) { + err = PTR_ERR(table->sf_ctrl_pool); + goto err_pf; + } + /* init sf_comp_pool */ + table->sf_comp_pool = irq_pool_alloc(dev, pf_vec + num_sf_ctrl, + sf_vec - num_sf_ctrl - 1, "mlx5_sf_comp", + MLX5_EQ_SHARE_IRQ_MIN_COMP, + MLX5_EQ_SHARE_IRQ_MAX_COMP); + if (IS_ERR(table->sf_comp_pool)) { + err = PTR_ERR(table->sf_comp_pool); + goto err_sf_ctrl; + } + + table->sf_comp_pool->irqs_per_cpu = kcalloc(nr_cpu_ids, sizeof(u16), GFP_KERNEL); + if (!table->sf_comp_pool->irqs_per_cpu) { + err = -ENOMEM; + goto err_irqs_per_cpu; + } + + return 0; + +err_irqs_per_cpu: + irq_pool_free(table->sf_comp_pool); +err_sf_ctrl: + irq_pool_free(table->sf_ctrl_pool); +err_pf: + irq_pool_free(table->pf_pool); + return err; +} + +static void irq_pools_destroy(struct mlx5_irq_table *table) +{ + if (table->sf_ctrl_pool) { + irq_pool_free(table->sf_comp_pool); + irq_pool_free(table->sf_ctrl_pool); + } + irq_pool_free(table->pf_pool); +} + +/* irq_table API */ + +int mlx5_irq_table_init(struct mlx5_core_dev *dev) +{ + struct mlx5_irq_table *irq_table; + + if (mlx5_core_is_sf(dev)) + return 0; + + irq_table = kvzalloc(sizeof(*irq_table), GFP_KERNEL); + if (!irq_table) + return -ENOMEM; + + dev->priv.irq_table = irq_table; + return 0; +} + +void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev) +{ + if (mlx5_core_is_sf(dev)) + return; + + kvfree(dev->priv.irq_table); +} + +int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table) +{ + if (!table->pf_pool->xa_num_irqs.max) + return 1; + return irq_pool_size_get(table->pf_pool) - MLX5_PF_IRQ_CTRL_NUM; +} + +int mlx5_irq_table_create(struct mlx5_core_dev *dev) +{ + int max_num_eq = MLX5_CAP_GEN(dev, max_num_eqs); + int num_eqs; + int total_vec; + int pf_vec; + int err; + + if (mlx5_core_is_sf(dev)) + return 0; + + if (max_num_eq) { + num_eqs = max_num_eq; + } else { + num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); + num_eqs -= MLX5_FW_RESERVED_EQS; + if (num_eqs <= 0) + return -ENOMEM; + } + + pf_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1; + pf_vec = min_t(int, pf_vec, num_eqs); + + total_vec = pf_vec; + if (mlx5_sf_max_functions(dev)) + total_vec += MLX5_IRQ_CTRL_SF_MAX + + MLX5_COMP_EQS_PER_SF * 
mlx5_sf_max_functions(dev); + + total_vec = pci_alloc_irq_vectors(dev->pdev, 1, total_vec, PCI_IRQ_MSIX); + if (total_vec < 0) + return total_vec; + pf_vec = min(pf_vec, total_vec); + + err = irq_pools_init(dev, total_vec - pf_vec, pf_vec); + if (err) + pci_free_irq_vectors(dev->pdev); + + return err; +} + +void mlx5_irq_table_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_irq_table *table = dev->priv.irq_table; + + if (mlx5_core_is_sf(dev)) + return; + + /* There are cases where IRQs still will be in used when we reaching + * to here. Hence, making sure all the irqs are released. + */ + irq_pools_destroy(table); + pci_free_irq_vectors(dev->pdev); +} + +bool mlx5_irq_table_have_dedicated_sfs_irqs(struct mlx5_irq_table *table) +{ + return table->sf_comp_pool; +} + +int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table) +{ + if (table->sf_comp_pool) + return min_t(int, num_online_cpus(), + table->sf_comp_pool->xa_num_irqs.max - + table->sf_comp_pool->xa_num_irqs.min + 1); + else + return mlx5_irq_table_get_num_comp(table); +} + +struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev) +{ +#ifdef CONFIG_MLX5_SF + if (mlx5_core_is_sf(dev)) + return dev->priv.parent_mdev->priv.irq_table; +#endif + return dev->priv.irq_table; +} + +void mlx5_irq_rename(struct mlx5_core_dev *dev, struct mlx5_irq *irq, + const char *name) +{ + char *dst_name = irq->name; + + if (!name) { + char default_name[MLX5_MAX_IRQ_NAME]; + + irq_set_name(irq->pool, default_name, irq->index); + snprintf(dst_name, MLX5_MAX_IRQ_NAME, + "%s@pci:%s", default_name, pci_name(dev->pdev)); + } else { + snprintf(dst_name, MLX5_MAX_IRQ_NAME, "%s-%d", name, + irq->index - MLX5_PF_IRQ_CTRL_NUM); + } +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h new file mode 100644 index 0000000..9ed4b72 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __PCI_IRQ_H__ +#define __PCI_IRQ_H__ + +#include + +#define MLX5_MAX_IRQ_NAME (32) +#define MLX5_FW_RESERVED_EQS 16 +/* max irq_index is 2047, so four chars */ +#define MLX5_MAX_IRQ_IDX_CHARS (4) +#define MLX5_EQ_REFS_PER_IRQ (2) + +struct mlx5_irq; + +struct mlx5_irq_pool { + char name[MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS]; + struct xa_limit xa_num_irqs; + struct mutex lock; /* sync IRQs creations */ + struct xarray irqs; + u32 max_threshold; + u32 min_threshold; + u16 *irqs_per_cpu; + struct mlx5_core_dev *dev; +}; + +struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev); +static inline bool mlx5_irq_pool_is_sf_pool(struct mlx5_irq_pool *pool) +{ + return !strncmp("mlx5_sf", pool->name, strlen("mlx5_sf")); +} + +struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, + const struct cpumask *affinity); +int mlx5_irq_get_locked(struct mlx5_irq *irq); +int mlx5_irq_read_locked(struct mlx5_irq *irq); +int mlx5_irq_put(struct mlx5_irq *irq); + +#endif /* __PCI_IRQ_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pd.c new file mode 100644 index 0000000..aabc53a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/pd.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" + +int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; + int err; + + MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); + err = mlx5_cmd_exec_inout(dev, alloc_pd, in, out); + if (!err) + *pdn = MLX5_GET(alloc_pd_out, out, pd); + return err; +} +EXPORT_SYMBOL(mlx5_core_alloc_pd); + +int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; + + MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); + MLX5_SET(dealloc_pd_in, in, pd, pdn); + return mlx5_cmd_exec_in(dev, dealloc_pd, in); +} +EXPORT_SYMBOL(mlx5_core_dealloc_pd); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/port.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/port.c new file mode 100644 index 0000000..ebe3244 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -0,0 +1,1455 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "mlx5_core.h" + +/* calling with verbose false will not print error to log */ +int mlx5_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, + void *data_out, int size_out, u16 reg_id, int arg, + int write, bool verbose) +{ + int outlen = MLX5_ST_SZ_BYTES(access_register_out) + size_out; + int inlen = MLX5_ST_SZ_BYTES(access_register_in) + size_in; + int err = -ENOMEM; + u32 *out = NULL; + u32 *in = NULL; + void *data; + + in = kvzalloc(inlen, GFP_KERNEL); + out = kvzalloc(outlen, GFP_KERNEL); + if (!in || !out) + goto out; + + data = MLX5_ADDR_OF(access_register_in, in, register_data); + memcpy(data, data_in, size_in); + + MLX5_SET(access_register_in, in, opcode, MLX5_CMD_OP_ACCESS_REG); + MLX5_SET(access_register_in, in, op_mod, !write); + MLX5_SET(access_register_in, in, argument, arg); + MLX5_SET(access_register_in, in, register_id, reg_id); + + err = mlx5_cmd_do(dev, in, inlen, out, outlen); + if (verbose) + err = mlx5_cmd_check(dev, err, in, out); + if (err) + goto out; + + data = MLX5_ADDR_OF(access_register_out, out, register_data); + memcpy(data_out, data, size_out); + +out: + kvfree(out); + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_access_reg); + +int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, + int size_in, void *data_out, int size_out, + u16 reg_id, int arg, int write) +{ + return mlx5_access_reg(dev, data_in, size_in, data_out, size_out, + reg_id, arg, write, true); +} +EXPORT_SYMBOL_GPL(mlx5_core_access_reg); + +int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, + u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(pcam_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(pcam_reg); + + MLX5_SET(pcam_reg, in, feature_group, feature_group); + MLX5_SET(pcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(dev, in, sz, pcam, sz, MLX5_REG_PCAM, 0, 0); +} + +int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcam, u8 feature_group, + u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(mcam_reg)] = {0}; + int sz = MLX5_ST_SZ_BYTES(mcam_reg); + + MLX5_SET(mcam_reg, in, feature_group, feature_group); + MLX5_SET(mcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(dev, in, sz, mcam, sz, MLX5_REG_MCAM, 0, 0); +} + +int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam, + u8 feature_group, u8 access_reg_group) +{ + u32 in[MLX5_ST_SZ_DW(qcam_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(qcam_reg); + + MLX5_SET(qcam_reg, in, feature_group, feature_group); + MLX5_SET(qcam_reg, in, access_reg_group, access_reg_group); + + return mlx5_core_access_reg(mdev, in, sz, qcam, sz, MLX5_REG_QCAM, 0, 0); +} + +struct mlx5_reg_pcap { + u8 rsvd0; + u8 port_num; + u8 rsvd1[2]; + __be32 caps_127_96; + __be32 caps_95_64; + __be32 caps_63_32; + __be32 caps_31_0; +}; + +int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps) +{ + struct mlx5_reg_pcap in; + struct mlx5_reg_pcap out; + + memset(&in, 0, sizeof(in)); + in.caps_127_96 = cpu_to_be32(caps); + in.port_num = port_num; + + return mlx5_core_access_reg(dev, &in, sizeof(in), &out, + sizeof(out), MLX5_REG_PCAP, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_caps); + +int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, + 
int ptys_size, int proto_mask, u8 local_port) +{ + u32 in[MLX5_ST_SZ_DW(ptys_reg)] = {0}; + + MLX5_SET(ptys_reg, in, local_port, local_port); + MLX5_SET(ptys_reg, in, proto_mask, proto_mask); + return mlx5_core_access_reg(dev, in, sizeof(in), ptys, + ptys_size, MLX5_REG_PTYS, 0, 0); +} +EXPORT_SYMBOL_GPL(mlx5_query_port_ptys); + +int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration) +{ + u32 in[MLX5_ST_SZ_DW(mlcr_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(mlcr_reg)]; + + MLX5_SET(mlcr_reg, in, local_port, 1); + MLX5_SET(mlcr_reg, in, beacon_duration, beacon_duration); + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MLCR, 0, 1); +} + +int mlx5_query_ib_port_oper(struct mlx5_core_dev *dev, u16 *link_width_oper, + u16 *proto_oper, u8 local_port) +{ + u32 out[MLX5_ST_SZ_DW(ptys_reg)]; + int err; + + err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB, + local_port); + if (err) + return err; + + *link_width_oper = MLX5_GET(ptys_reg, out, ib_link_width_oper); + *proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper); + + return 0; +} +EXPORT_SYMBOL(mlx5_query_ib_port_oper); + +/* This function should be used after setting a port register only */ +void mlx5_toggle_port_link(struct mlx5_core_dev *dev) +{ + enum mlx5_port_status ps; + + mlx5_query_port_admin_status(dev, &ps); + mlx5_set_port_admin_status(dev, MLX5_PORT_DOWN); + if (ps == MLX5_PORT_UP) + mlx5_set_port_admin_status(dev, MLX5_PORT_UP); +} +EXPORT_SYMBOL_GPL(mlx5_toggle_port_link); + +int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status status) +{ + u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(paos_reg)]; + + MLX5_SET(paos_reg, in, local_port, 1); + MLX5_SET(paos_reg, in, admin_status, status); + MLX5_SET(paos_reg, in, ase, 1); + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PAOS, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_admin_status); + +int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status *status) +{ + u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(paos_reg)]; + int err; + + MLX5_SET(paos_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PAOS, 0, 0); + if (err) + return err; + *status = MLX5_GET(paos_reg, out, admin_status); + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status); + +static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, u16 *admin_mtu, + u16 *max_mtu, u16 *oper_mtu, u8 port) +{ + u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; + + MLX5_SET(pmtu_reg, in, local_port, port); + mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PMTU, 0, 0); + + if (max_mtu) + *max_mtu = MLX5_GET(pmtu_reg, out, max_mtu); + if (oper_mtu) + *oper_mtu = MLX5_GET(pmtu_reg, out, oper_mtu); + if (admin_mtu) + *admin_mtu = MLX5_GET(pmtu_reg, out, admin_mtu); +} + +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port) +{ + u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pmtu_reg)]; + + MLX5_SET(pmtu_reg, in, admin_mtu, mtu); + MLX5_SET(pmtu_reg, in, local_port, port); + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PMTU, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_mtu); + +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, + u8 port) +{ + mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, port); +} +EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu); + +void mlx5_query_port_oper_mtu(struct 
mlx5_core_dev *dev, u16 *oper_mtu, + u8 port) +{ + mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, port); +} +EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu); + +int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num) +{ + u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pmlp_reg)]; + int module_mapping; + int err; + + MLX5_SET(pmlp_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_PMLP, 0, 0); + if (err) + return err; + + module_mapping = MLX5_GET(pmlp_reg, out, lane0_module_mapping); + *module_num = module_mapping & MLX5_EEPROM_IDENTIFIER_BYTE_MASK; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_module_num); + +static int mlx5_query_module_id(struct mlx5_core_dev *dev, int module_num, + u8 *module_id) +{ + u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {}; + u32 out[MLX5_ST_SZ_DW(mcia_reg)]; + int err, status; + u8 *ptr; + + MLX5_SET(mcia_reg, in, i2c_device_address, MLX5_I2C_ADDR_LOW); + MLX5_SET(mcia_reg, in, module, module_num); + MLX5_SET(mcia_reg, in, device_address, 0); + MLX5_SET(mcia_reg, in, page_number, 0); + MLX5_SET(mcia_reg, in, size, 1); + MLX5_SET(mcia_reg, in, l, 0); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCIA, 0, 0); + if (err) + return err; + + status = MLX5_GET(mcia_reg, out, status); + if (status) { + mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", + status); + return -EIO; + } + ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); + + *module_id = ptr[0]; + + return 0; +} + +static int mlx5_qsfp_eeprom_page(u16 offset) +{ + if (offset < MLX5_EEPROM_PAGE_LENGTH) + /* Addresses between 0-255 - page 00 */ + return 0; + + /* Addresses between 256 - 639 belongs to pages 01, 02 and 03 + * For example, offset = 400 belongs to page 02: + * 1 + ((400 - 256)/128) = 2 + */ + return 1 + ((offset - MLX5_EEPROM_PAGE_LENGTH) / + MLX5_EEPROM_HIGH_PAGE_LENGTH); +} + +static int mlx5_qsfp_eeprom_high_page_offset(int page_num) +{ + if (!page_num) /* Page 0 always start from low page */ + return 0; + + /* High page */ + return page_num * MLX5_EEPROM_HIGH_PAGE_LENGTH; +} + +static void mlx5_qsfp_eeprom_params_set(u16 *i2c_addr, int *page_num, u16 *offset) +{ + *i2c_addr = MLX5_I2C_ADDR_LOW; + *page_num = mlx5_qsfp_eeprom_page(*offset); + *offset -= mlx5_qsfp_eeprom_high_page_offset(*page_num); +} + +static void mlx5_sfp_eeprom_params_set(u16 *i2c_addr, int *page_num, u16 *offset) +{ + *i2c_addr = MLX5_I2C_ADDR_LOW; + *page_num = 0; + + if (*offset < MLX5_EEPROM_PAGE_LENGTH) + return; + + *i2c_addr = MLX5_I2C_ADDR_HIGH; + *offset -= MLX5_EEPROM_PAGE_LENGTH; +} + +static int mlx5_query_mcia(struct mlx5_core_dev *dev, + struct mlx5_module_eeprom_query_params *params, u8 *data) +{ + u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {}; + u32 out[MLX5_ST_SZ_DW(mcia_reg)]; + int status, err; + void *ptr; + u16 size; + + size = min_t(int, params->size, MLX5_EEPROM_MAX_BYTES); + + MLX5_SET(mcia_reg, in, l, 0); + MLX5_SET(mcia_reg, in, size, size); + MLX5_SET(mcia_reg, in, module, params->module_number); + MLX5_SET(mcia_reg, in, device_address, params->offset); + MLX5_SET(mcia_reg, in, page_number, params->page); + MLX5_SET(mcia_reg, in, i2c_device_address, params->i2c_address); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCIA, 0, 0); + if (err) + return err; + + status = MLX5_GET(mcia_reg, out, status); + if (status) { + mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", + status); + return -EIO; + } + + ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); + 
memcpy(data, ptr, size); + + return size; +} + +int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, + u16 offset, u16 size, u8 *data) +{ + struct mlx5_module_eeprom_query_params query = {0}; + u8 module_id; + int err; + + err = mlx5_query_module_num(dev, &query.module_number); + if (err) + return err; + + err = mlx5_query_module_id(dev, query.module_number, &module_id); + if (err) + return err; + + switch (module_id) { + case MLX5_MODULE_ID_SFP: + mlx5_sfp_eeprom_params_set(&query.i2c_address, &query.page, &offset); + break; + case MLX5_MODULE_ID_QSFP: + case MLX5_MODULE_ID_QSFP_PLUS: + case MLX5_MODULE_ID_QSFP28: + mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &offset); + break; + default: + mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); + return -EINVAL; + } + + if (offset + size > MLX5_EEPROM_PAGE_LENGTH) + /* Cross pages read, read until offset 256 in low page */ + size = MLX5_EEPROM_PAGE_LENGTH - offset; + + query.size = size; + query.offset = offset; + + return mlx5_query_mcia(dev, &query, data); +} +EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom); + +int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, + struct mlx5_module_eeprom_query_params *params, + u8 *data) +{ + u8 module_id; + int err; + + err = mlx5_query_module_num(dev, &params->module_number); + if (err) + return err; + + err = mlx5_query_module_id(dev, params->module_number, &module_id); + if (err) + return err; + + switch (module_id) { + case MLX5_MODULE_ID_SFP: + if (params->page > 0) + return -EINVAL; + break; + case MLX5_MODULE_ID_QSFP: + case MLX5_MODULE_ID_QSFP28: + case MLX5_MODULE_ID_QSFP_PLUS: + if (params->page > 3) + return -EINVAL; + break; + case MLX5_MODULE_ID_DSFP: + break; + default: + mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); + return -EINVAL; + } + + if (params->i2c_address != MLX5_I2C_ADDR_HIGH && + params->i2c_address != MLX5_I2C_ADDR_LOW) { + mlx5_core_err(dev, "I2C address not recognized: 0x%x\n", params->i2c_address); + return -EINVAL; + } + + return mlx5_query_mcia(dev, params, data); +} +EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom_by_page); + +static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc, + int pvlc_size, u8 local_port) +{ + u32 in[MLX5_ST_SZ_DW(pvlc_reg)] = {0}; + + MLX5_SET(pvlc_reg, in, local_port, local_port); + return mlx5_core_access_reg(dev, in, sizeof(in), pvlc, + pvlc_size, MLX5_REG_PVLC, 0, 0); +} + +int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, + u8 *vl_hw_cap, u8 local_port) +{ + u32 out[MLX5_ST_SZ_DW(pvlc_reg)]; + int err; + + err = mlx5_query_port_pvlc(dev, out, sizeof(out), local_port); + if (err) + return err; + + *vl_hw_cap = MLX5_GET(pvlc_reg, out, vl_hw_cap); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap); + +static int mlx5_query_pfcc_reg(struct mlx5_core_dev *dev, u32 *out, + u32 out_size) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + + MLX5_SET(pfcc_reg, in, local_port, 1); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + out_size, MLX5_REG_PFCC, 0, 0); +} + +int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pptx, tx_pause); + MLX5_SET(pfcc_reg, in, pprx, rx_pause); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_pause); + +int mlx5_query_port_pause(struct mlx5_core_dev *dev, + u32 *rx_pause, 
u32 *tx_pause) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + err = mlx5_query_pfcc_reg(dev, out, sizeof(out)); + if (err) + return err; + + if (rx_pause) + *rx_pause = MLX5_GET(pfcc_reg, out, pprx); + + if (tx_pause) + *tx_pause = MLX5_GET(pfcc_reg, out, pptx); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_pause); + +int mlx5_set_port_pfc_prevention(struct mlx5_core_dev *dev, + u16 pfc_preven_critical, u16 pfc_preven_minor) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pptx_mask_n, 1); + MLX5_SET(pfcc_reg, in, pprx_mask_n, 1); + MLX5_SET(pfcc_reg, in, ppan_mask_n, 1); + MLX5_SET(pfcc_reg, in, critical_stall_mask, 1); + MLX5_SET(pfcc_reg, in, minor_stall_mask, 1); + MLX5_SET(pfcc_reg, in, device_stall_critical_watermark, pfc_preven_critical); + MLX5_SET(pfcc_reg, in, device_stall_minor_watermark, pfc_preven_minor); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); +} + +int mlx5_query_port_pfc_prevention(struct mlx5_core_dev *dev, + u16 *pfc_preven_critical) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + err = mlx5_query_pfcc_reg(dev, out, sizeof(out)); + if (err) + return err; + + if (pfc_preven_critical) + *pfc_preven_critical = MLX5_GET(pfcc_reg, out, + device_stall_critical_watermark); + + return 0; +} + +int mlx5_set_port_stall_watermark(struct mlx5_core_dev *dev, + u16 stall_critical_watermark, + u16 stall_minor_watermark) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pptx_mask_n, 1); + MLX5_SET(pfcc_reg, in, pprx_mask_n, 1); + MLX5_SET(pfcc_reg, in, ppan_mask_n, 1); + MLX5_SET(pfcc_reg, in, critical_stall_mask, 1); + MLX5_SET(pfcc_reg, in, minor_stall_mask, 1); + MLX5_SET(pfcc_reg, in, device_stall_critical_watermark, + stall_critical_watermark); + MLX5_SET(pfcc_reg, in, device_stall_minor_watermark, stall_minor_watermark); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); +} + +int mlx5_query_port_stall_watermark(struct mlx5_core_dev *dev, + u16 *stall_critical_watermark, + u16 *stall_minor_watermark) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + err = mlx5_query_pfcc_reg(dev, out, sizeof(out)); + if (err) + return err; + + if (stall_critical_watermark) + *stall_critical_watermark = MLX5_GET(pfcc_reg, out, + device_stall_critical_watermark); + + if (stall_minor_watermark) + *stall_minor_watermark = MLX5_GET(pfcc_reg, out, + device_stall_minor_watermark); + + return 0; +} + +int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pfctx, pfc_en_tx); + MLX5_SET(pfcc_reg, in, pfcrx, pfc_en_rx); + MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_tx); + MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_rx); + + return mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_pfc); + +int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + err = mlx5_query_pfcc_reg(dev, out, sizeof(out)); + if (err) + return err; + + if (pfc_en_tx) + *pfc_en_tx = MLX5_GET(pfcc_reg, out, pfctx); + + if (pfc_en_rx) + *pfc_en_rx = MLX5_GET(pfcc_reg, out, pfcrx); + + return 0; +} 
+EXPORT_SYMBOL_GPL(mlx5_query_port_pfc); + +int mlx5_max_tc(struct mlx5_core_dev *mdev) +{ + u8 num_tc = MLX5_CAP_GEN(mdev, max_tc) ? : 8; + + return num_tc - 1; +} + +int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(dcbx_param)] = {0}; + + MLX5_SET(dcbx_param, in, port_number, 1); + + return mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(in), MLX5_REG_DCBX_PARAM, 0, 0); +} + +int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in) +{ + u32 out[MLX5_ST_SZ_DW(dcbx_param)]; + + MLX5_SET(dcbx_param, in, port_number, 1); + + return mlx5_core_access_reg(mdev, in, sizeof(out), out, + sizeof(out), MLX5_REG_DCBX_PARAM, 0, 1); +} + +int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc) +{ + u32 in[MLX5_ST_SZ_DW(qtct_reg)] = {0}; + u32 out[MLX5_ST_SZ_DW(qtct_reg)]; + int err; + int i; + + for (i = 0; i < 8; i++) { + if (prio_tc[i] > mlx5_max_tc(mdev)) + return -EINVAL; + + MLX5_SET(qtct_reg, in, prio, i); + MLX5_SET(qtct_reg, in, tclass, prio_tc[i]); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QTCT, 0, 1); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_set_port_prio_tc); + +int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev, + u8 prio, u8 *tc) +{ + u32 in[MLX5_ST_SZ_DW(qtct_reg)]; + u32 out[MLX5_ST_SZ_DW(qtct_reg)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(qtct_reg, in, port_number, 1); + MLX5_SET(qtct_reg, in, prio, prio); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QTCT, 0, 0); + if (!err) + *tc = MLX5_GET(qtct_reg, out, tclass); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_prio_tc); + +static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(qetc_reg)]; + + if (!MLX5_CAP_GEN(mdev, ets)) + return -EOPNOTSUPP; + + return mlx5_core_access_reg(mdev, in, inlen, out, sizeof(out), + MLX5_REG_QETCR, 0, 1); +} + +static int mlx5_query_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *out, + int outlen) +{ + u32 in[MLX5_ST_SZ_DW(qetc_reg)]; + + if (!MLX5_CAP_GEN(mdev, ets)) + return -EOPNOTSUPP; + + memset(in, 0, sizeof(in)); + return mlx5_core_access_reg(mdev, in, sizeof(in), out, outlen, + MLX5_REG_QETCR, 0, 0); +} + +int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group) +{ + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; + int i; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + MLX5_SET(qetc_reg, in, tc_configuration[i].g, 1); + MLX5_SET(qetc_reg, in, tc_configuration[i].group, tc_group[i]); + } + + return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in)); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_tc_group); + +int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev, + u8 tc, u8 *tc_group) +{ + u32 out[MLX5_ST_SZ_DW(qetc_reg)]; + void *ets_tcn_conf; + int err; + + err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out)); + if (err) + return err; + + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, + tc_configuration[tc]); + + *tc_group = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + group); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_tc_group); + +int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw) +{ + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; + int i; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + MLX5_SET(qetc_reg, in, tc_configuration[i].b, 1); + MLX5_SET(qetc_reg, in, tc_configuration[i].bw_allocation, tc_bw[i]); + } + + return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in)); +} 
+EXPORT_SYMBOL_GPL(mlx5_set_port_tc_bw_alloc); + +int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev, + u8 tc, u8 *bw_pct) +{ + u32 out[MLX5_ST_SZ_DW(qetc_reg)]; + void *ets_tcn_conf; + int err; + + err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out)); + if (err) + return err; + + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, + tc_configuration[tc]); + + *bw_pct = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + bw_allocation); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_tc_bw_alloc); + +int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_units) +{ + u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0}; + void *ets_tcn_conf; + int i; + + MLX5_SET(qetc_reg, in, port_number, 1); + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, in, tc_configuration[i]); + + MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, r, 1); + MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_units, + max_bw_units[i]); + MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_value, + max_bw_value[i]); + } + + return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in)); +} +EXPORT_SYMBOL_GPL(mlx5_modify_port_ets_rate_limit); + +int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_units) +{ + u32 out[MLX5_ST_SZ_DW(qetc_reg)]; + void *ets_tcn_conf; + int err; + int i; + + err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out)); + if (err) + return err; + + for (i = 0; i <= mlx5_max_tc(mdev); i++) { + ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, tc_configuration[i]); + + max_bw_value[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + max_bw_value); + max_bw_units[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf, + max_bw_units); + } + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_ets_rate_limit); + +int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode) +{ + u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)] = {}; + + MLX5_SET(set_wol_rol_in, in, opcode, MLX5_CMD_OP_SET_WOL_ROL); + MLX5_SET(set_wol_rol_in, in, wol_mode_valid, 1); + MLX5_SET(set_wol_rol_in, in, wol_mode, wol_mode); + return mlx5_cmd_exec_in(mdev, set_wol_rol, in); +} +EXPORT_SYMBOL_GPL(mlx5_set_port_wol); + +int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode) +{ + u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)] = {}; + int err; + + MLX5_SET(query_wol_rol_in, in, opcode, MLX5_CMD_OP_QUERY_WOL_ROL); + err = mlx5_cmd_exec_inout(mdev, query_wol_rol, in, out); + if (!err) + *wol_mode = MLX5_GET(query_wol_rol_out, out, wol_mode); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_wol); + +static int mlx5_query_pddr(struct mlx5_core_dev *mdev, + int page_select, u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(pddr_reg)] = {0}; + + if (!MLX5_CAP_PCAM_REG(mdev, pddr)) + return -EOPNOTSUPP; + + MLX5_SET(pddr_reg, in, local_port, 1); + MLX5_SET(pddr_reg, in, page_select, page_select); + + return mlx5_core_access_reg(mdev, in, sizeof(in), out, outlen, MLX5_REG_PDDR, 0, 0); +} + +int mlx5_query_pddr_troubleshooting_info(struct mlx5_core_dev *mdev, + u16 *monitor_opcode, + u8 *status_message) +{ + int outlen = MLX5_ST_SZ_BYTES(pddr_reg); + u32 out[MLX5_ST_SZ_DW(pddr_reg)] = {0}; + int err; + + err = mlx5_query_pddr(mdev, MLX5_PDDR_TROUBLESHOOTING_INFO_PAGE, + out, outlen); + if (err) + return err; + + if (monitor_opcode) + *monitor_opcode = MLX5_GET(pddr_reg, out, + page_data.pddr_troubleshooting_page.status_opcode.pddr_monitor_opcode); + + if (status_message) + strncpy(status_message, + MLX5_ADDR_OF(pddr_reg, out, 
page_data.pddr_troubleshooting_page.status_message), + MLX5_FLD_SZ_BYTES(pddr_troubleshooting_page, status_message)); + + return 0; +} + +int mlx5_query_port_cong_status(struct mlx5_core_dev *mdev, int protocol, + int priority, int *is_enable) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_status_in)]; + u32 out[MLX5_ST_SZ_DW(query_cong_status_out)]; + int err; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(query_cong_status_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_STATUS); + MLX5_SET(query_cong_status_in, in, cong_protocol, protocol); + MLX5_SET(query_cong_status_in, in, priority, priority); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (!err) + *is_enable = MLX5_GET(query_cong_status_out, out, enable); + return err; +} + +int mlx5_modify_port_cong_status(struct mlx5_core_dev *mdev, int protocol, + int priority, int enable) +{ + u32 in[MLX5_ST_SZ_DW(modify_cong_status_in)]; + u32 out[MLX5_ST_SZ_DW(modify_cong_status_out)]; + + memset(in, 0, sizeof(in)); + memset(out, 0, sizeof(out)); + + MLX5_SET(modify_cong_status_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_STATUS); + MLX5_SET(modify_cong_status_in, in, cong_protocol, protocol); + MLX5_SET(modify_cong_status_in, in, priority, priority); + MLX5_SET(modify_cong_status_in, in, enable, enable); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_query_port_cong_params(struct mlx5_core_dev *mdev, int protocol, + void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)]; + + memset(in, 0, sizeof(in)); + + MLX5_SET(query_cong_params_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_PARAMS); + MLX5_SET(query_cong_params_in, in, cong_protocol, protocol); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, out_size); +} + +int mlx5_modify_port_cong_params(struct mlx5_core_dev *mdev, + void *in, int in_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)]; + + memset(out, 0, sizeof(out)); + + return mlx5_cmd_exec(mdev, in, in_size, out, sizeof(out)); +} + + +int mlx5_query_ports_check(struct mlx5_core_dev *mdev, u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; + + MLX5_SET(pcmr_reg, in, local_port, 1); + return mlx5_core_access_reg(mdev, in, sizeof(in), out, + outlen, MLX5_REG_PCMR, 0, 0); +} + +int mlx5_set_ports_check(struct mlx5_core_dev *mdev, u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(pcmr_reg)]; + + return mlx5_core_access_reg(mdev, in, inlen, out, + sizeof(out), MLX5_REG_PCMR, 0, 1); +} + +int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable) +{ + u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0}; + int err; + + err = mlx5_query_ports_check(mdev, in, sizeof(in)); + if (err) + return err; + MLX5_SET(pcmr_reg, in, local_port, 1); + MLX5_SET(pcmr_reg, in, fcs_chk, enable); + return mlx5_set_ports_check(mdev, in, sizeof(in)); +} + +void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported, + bool *enabled) +{ + u32 out[MLX5_ST_SZ_DW(pcmr_reg)]; + /* Default values for FW which do not support MLX5_REG_PCMR */ + *supported = false; + *enabled = true; + + if (!MLX5_CAP_GEN(mdev, ports_check)) + return; + + if (mlx5_query_ports_check(mdev, out, sizeof(out))) + return; + + *supported = !!(MLX5_GET(pcmr_reg, out, fcs_cap)); + *enabled = !!(MLX5_GET(pcmr_reg, out, fcs_chk)); +} + +int mlx5_query_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size) +{ + u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + + return mlx5_core_access_reg(mdev, in, sizeof(in), mtpps, + mtpps_size, MLX5_REG_MTPPS, 0, 0); +} + +int mlx5_set_mtpps(struct mlx5_core_dev 
*mdev, u32 *mtpps, u32 mtpps_size) +{ + u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0}; + + return mlx5_core_access_reg(mdev, mtpps, mtpps_size, out, + sizeof(out), MLX5_REG_MTPPS, 0, 1); +} + +int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode) +{ + u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + int err = 0; + + MLX5_SET(mtppse_reg, in, pin, pin); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MTPPSE, 0, 0); + if (err) + return err; + + *arm = MLX5_GET(mtppse_reg, in, event_arm); + *mode = MLX5_GET(mtppse_reg, in, event_generation_mode); + + return err; +} + +int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode) +{ + u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0}; + + MLX5_SET(mtppse_reg, in, pin, pin); + MLX5_SET(mtppse_reg, in, event_arm, arm); + MLX5_SET(mtppse_reg, in, event_generation_mode, mode); + + return mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MTPPSE, 0, 1); +} + +int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + MLX5_SET(qpts_reg, in, trust_state, trust_state); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 1); + return err; +} + +int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) +{ + u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {}; + int err; + + MLX5_SET(qpts_reg, in, local_port, 1); + + err = mlx5_core_access_reg(mdev, in, sizeof(in), out, + sizeof(out), MLX5_REG_QPTS, 0, 0); + if (!err) + *trust_state = MLX5_GET(qpts_reg, out, trust_state); + + return err; +} + +int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + memcpy(in, out, sz); + MLX5_SET(qpdpm_reg, in, local_port, 1); + + /* Update the corresponding dscp entry */ + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, in, dscp[dscp]); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, prio, prio); + MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, e, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 1); + +out: + kfree(in); + kfree(out); + return err; +} + +/* dscp2prio[i]: priority that dscp i mapped to */ +#define MLX5E_SUPPORTED_DSCP 64 +int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio) +{ + int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); + void *qpdpm_dscp; + void *out; + void *in; + int err; + int i; + + in = kzalloc(sz, GFP_KERNEL); + out = kzalloc(sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(qpdpm_reg, in, local_port, 1); + err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0); + if (err) + goto out; + + for (i = 0; i < (MLX5E_SUPPORTED_DSCP); i++) { + qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, out, dscp[i]); + dscp2prio[i] = MLX5_GET16(qpdpm_dscp_reg, qpdpm_dscp, prio); + } + +out: + kfree(in); + kfree(out); + return err; +} + +static int is_valid_vf(struct mlx5_core_dev *dev, int vf) +{ + struct pci_dev *pdev = dev->pdev; + + if (vf == 1) + return 1; + + if 
(mlx5_core_is_pf(dev)) + return (vf <= pci_num_vf(pdev)) && (vf >= 1); + + return 0; +} + +int mlx5_core_query_gids(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 gid_index, + union ib_gid *gid) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_in); + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_out); + int is_group_manager; + void *out = NULL; + void *in = NULL; + union ib_gid *tmp; + int tbsz; + int nout; + int err; + + vf_num += 1; + if (!is_valid_vf(dev, vf_num)) { + mlx5_core_warn(dev, "invalid vf number %d", vf_num); + return -EINVAL; + } + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + tbsz = mlx5_get_gid_table_len(MLX5_CAP_GEN(dev, gid_table_size)); + mlx5_core_dbg(dev, "vf_num %d, index %d, gid_table_size %d\n", + vf_num, gid_index, tbsz); + + if (gid_index > tbsz && gid_index != 0xffff) + return -EINVAL; + + if (gid_index == 0xffff) + nout = tbsz; + else + nout = 1; + + out_sz += nout * sizeof(*gid); + + in = kzalloc(in_sz, GFP_KERNEL); + out = kzalloc(out_sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(query_hca_vport_gid_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_GID); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_gid_in, in, vport_number, vf_num); + MLX5_SET(query_hca_vport_gid_in, in, other_vport, 1); + } else { + err = -EPERM; + goto out; + } + } + MLX5_SET(query_hca_vport_gid_in, in, gid_index, gid_index); + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_gid_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto out; + + tmp = out + MLX5_ST_SZ_BYTES(query_hca_vport_gid_out); + gid->global.subnet_prefix = tmp->global.subnet_prefix; + gid->global.interface_id = tmp->global.interface_id; + +out: + kfree(in); + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_gids); + +int mlx5_core_query_pkeys(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 pkey_index, + u16 *pkey) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_in); + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_out); + int is_group_manager; + void *out = NULL; + void *in = NULL; + void *pkarr; + int nout; + int tbsz; + int err; + int i; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + mlx5_core_dbg(dev, "vf_num %d\n", vf_num); + + vf_num += 1; + if (!is_valid_vf(dev, vf_num)) { + mlx5_core_warn(dev, "invalid vf number %d", vf_num); + return -EINVAL; + } + + tbsz = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)); + if (pkey_index > tbsz && pkey_index != 0xffff) + return -EINVAL; + + if (pkey_index == 0xffff) + nout = tbsz; + else + nout = 1; + + out_sz += nout * MLX5_ST_SZ_BYTES(pkey); + + in = kzalloc(in_sz, GFP_KERNEL); + out = kzalloc(out_sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(query_hca_vport_pkey_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_pkey_in, in, vport_number, vf_num); + MLX5_SET(query_hca_vport_pkey_in, in, other_vport, 1); + } else { + err = -EPERM; + goto out; + } + } + MLX5_SET(query_hca_vport_pkey_in, in, pkey_index, pkey_index); + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_pkey_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto out; + + pkarr = out + MLX5_ST_SZ_BYTES(query_hca_vport_pkey_out); + for (i = 0; i < nout; i++, pkey++, pkarr += 
MLX5_ST_SZ_BYTES(pkey)) + *pkey = MLX5_GET_PR(pkey, pkarr, pkey); + +out: + kfree(in); + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_pkeys); + +int mlx5_core_query_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + u16 vf_num, + struct mlx5_hca_vport_context *rep) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); + u32 in[MLX5_ST_SZ_DW(query_hca_vport_context_in)]; + int is_group_manager; + void *out; + void *ctx; + int err; + + mlx5_core_dbg(dev, "vf_num %d\n", vf_num); + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + vf_num += 1; + if (!is_valid_vf(dev, vf_num)) { + mlx5_core_warn(dev, "invalid vf number %d", vf_num); + return -EINVAL; + } + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT); + + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_context_in, in, other_vport, 1); + MLX5_SET(query_hca_vport_context_in, in, vport_number, vf_num); + } else { + err = -EPERM; + goto ex; + } + } + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_context_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto ex; + + ctx = MLX5_ADDR_OF(query_hca_vport_context_out, out, hca_vport_context); + rep->field_select = MLX5_GET_PR(hca_vport_context, ctx, field_select); + rep->sm_virt_aware = MLX5_GET_PR(hca_vport_context, ctx, sm_virt_aware); + rep->has_smi = MLX5_GET_PR(hca_vport_context, ctx, has_smi); + rep->has_raw = MLX5_GET_PR(hca_vport_context, ctx, has_raw); + rep->policy = MLX5_GET_PR(hca_vport_context, ctx, vport_state_policy); + rep->phys_state = MLX5_GET_PR(hca_vport_context, ctx, + port_physical_state); + rep->vport_state = MLX5_GET_PR(hca_vport_context, ctx, vport_state); + rep->port_physical_state = MLX5_GET_PR(hca_vport_context, ctx, + port_physical_state); + rep->port_guid = MLX5_GET64_PR(hca_vport_context, ctx, port_guid); + rep->node_guid = MLX5_GET64_PR(hca_vport_context, ctx, node_guid); + rep->cap_mask1 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask1); + rep->cap_mask1_perm = MLX5_GET_PR(hca_vport_context, ctx, + cap_mask1_field_select); + rep->cap_mask2 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask2); + rep->cap_mask2_perm = MLX5_GET_PR(hca_vport_context, ctx, + cap_mask2_field_select); + rep->lid = MLX5_GET_PR(hca_vport_context, ctx, lid); + rep->init_type_reply = MLX5_GET_PR(hca_vport_context, ctx, + init_type_reply); + rep->lmc = MLX5_GET_PR(hca_vport_context, ctx, lmc); + rep->subnet_timeout = MLX5_GET_PR(hca_vport_context, ctx, + subnet_timeout); + rep->sm_lid = MLX5_GET_PR(hca_vport_context, ctx, sm_lid); + rep->sm_sl = MLX5_GET_PR(hca_vport_context, ctx, sm_sl); + rep->qkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx, + qkey_violation_counter); + rep->pkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx, + pkey_violation_counter); + rep->grh_required = MLX5_GET_PR(hca_vport_context, ctx, grh_required); + rep->sys_image_guid = MLX5_GET64_PR(hca_vport_context, ctx, + system_image_guid); + +ex: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_hca_vport_context); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.c new file mode 100644 index 0000000..0777be2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.c 
@@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#include "qos.h" + +#define MLX5_QOS_DEFAULT_DWRR_UID 0 + +bool mlx5_qos_is_supported(struct mlx5_core_dev *mdev) +{ + if (!MLX5_CAP_GEN(mdev, qos)) + return false; + if (!MLX5_CAP_QOS(mdev, nic_sq_scheduling)) + return false; + if (!MLX5_CAP_QOS(mdev, nic_bw_share)) + return false; + if (!MLX5_CAP_QOS(mdev, nic_rate_limit)) + return false; + return true; +} + +int mlx5_qos_max_leaf_nodes(struct mlx5_core_dev *mdev) +{ + return 1 << MLX5_CAP_QOS(mdev, log_max_qos_nic_queue_group); +} + +int mlx5_qos_create_leaf_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw); + + return mlx5_create_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, + sched_ctx, id); +} + +int mlx5_qos_create_inner_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + void *attr; + + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); + MLX5_SET(scheduling_context, sched_ctx, element_type, + SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw); + + attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); + MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR); + + return mlx5_create_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, + sched_ctx, id); +} + +int mlx5_qos_create_root_node(struct mlx5_core_dev *mdev, u32 *id) +{ + return mlx5_qos_create_inner_node(mdev, MLX5_QOS_DEFAULT_DWRR_UID, 0, 0, id); +} + +int mlx5_qos_update_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 id) +{ + u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0}; + u32 bitmask = 0; + + MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_id); + MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share); + MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_avg_bw); + + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE; + bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW; + + return mlx5_modify_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, + sched_ctx, id, bitmask); +} + +int mlx5_qos_destroy_node(struct mlx5_core_dev *mdev, u32 id) +{ + return mlx5_destroy_scheduling_element_cmd(mdev, SCHEDULING_HIERARCHY_NIC, id); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.h new file mode 100644 index 0000000..125e4e4 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/qos.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ + +#ifndef __MLX5_QOS_H +#define __MLX5_QOS_H + +#include "mlx5_core.h" + +#define MLX5_DEBUG_QOS_MASK BIT(4) + +#define qos_err(mdev, fmt, ...) 
\ + mlx5_core_err(mdev, "QoS: " fmt, ##__VA_ARGS__) +#define qos_warn(mdev, fmt, ...) \ + mlx5_core_warn(mdev, "QoS: " fmt, ##__VA_ARGS__) +#define qos_dbg(mdev, fmt, ...) \ + mlx5_core_dbg_mask(mdev, MLX5_DEBUG_QOS_MASK, "QoS: " fmt, ##__VA_ARGS__) + +bool mlx5_qos_is_supported(struct mlx5_core_dev *mdev); +int mlx5_qos_max_leaf_nodes(struct mlx5_core_dev *mdev); + +int mlx5_qos_create_leaf_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id); +int mlx5_qos_create_inner_node(struct mlx5_core_dev *mdev, u32 parent_id, + u32 bw_share, u32 max_avg_bw, u32 *id); +int mlx5_qos_create_root_node(struct mlx5_core_dev *mdev, u32 *id); +int mlx5_qos_update_node(struct mlx5_core_dev *mdev, u32 parent_id, u32 bw_share, + u32 max_avg_bw, u32 id); +int mlx5_qos_destroy_node(struct mlx5_core_dev *mdev, u32 id); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.c new file mode 100644 index 0000000..540cf05 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include +#include +#include + +#include "lib/mlx5.h" +#include "eswitch.h" +#include "fs_core.h" +#include "rdma.h" + +static void mlx5_rdma_disable_roce_steering(struct mlx5_core_dev *dev) +{ + struct mlx5_core_roce *roce = &dev->priv.roce; + + mlx5_del_flow_rules(roce->allow_rule); + mlx5_destroy_flow_group(roce->fg); + mlx5_destroy_flow_table(roce->ft); +} + +static int mlx5_rdma_enable_roce_steering(struct mlx5_core_dev *dev) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_roce *roce = &dev->priv.roce; + struct mlx5_flow_handle *flow_rule = NULL; + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + void *match_criteria; + u32 *flow_group_in; + void *misc; + int err; + + if (!(MLX5_CAP_FLOWTABLE_RDMA_RX(dev, ft_support) && + MLX5_CAP_FLOWTABLE_RDMA_RX(dev, table_miss_action_domain))) + return -EOPNOTSUPP; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) { + kvfree(flow_group_in); + return -ENOMEM; + } + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL); + if (!ns) { + mlx5_core_err(dev, "Failed to get RDMA RX namespace"); + err = -EOPNOTSUPP; + goto free; + } + + ft_attr.max_fte = 1; + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + mlx5_core_err(dev, "Failed to create RDMA RX flow table"); + err = PTR_ERR(ft); + goto free; + } + + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + MLX5_SET_TO_ONES(fte_match_param, match_criteria, + misc_parameters.source_port); + + fg = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + mlx5_core_err(dev, "Failed to create RDMA RX flow group err(%d)\n", err); + goto destroy_flow_table; + } + + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + MLX5_SET(fte_match_set_misc, misc, source_port, + dev->priv.eswitch->manager_vport); + 
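	/*
	 * The allow rule admits only RDMA RX traffic whose misc source_port
	 * matches the eswitch manager vport set above; the lines below set the
	 * corresponding source_port bits in the match criteria mask.
	 */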
misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW; + flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, NULL, 0); + if (IS_ERR(flow_rule)) { + err = PTR_ERR(flow_rule); + mlx5_core_err(dev, "Failed to add RoCE allow rule, err=%d\n", + err); + goto destroy_flow_group; + } + + kvfree(spec); + kvfree(flow_group_in); + roce->ft = ft; + roce->fg = fg; + roce->allow_rule = flow_rule; + + return 0; + +destroy_flow_group: + mlx5_destroy_flow_group(fg); +destroy_flow_table: + mlx5_destroy_flow_table(ft); +free: + kvfree(spec); + kvfree(flow_group_in); + return err; +} + +static void mlx5_rdma_del_roce_addr(struct mlx5_core_dev *dev) +{ + mlx5_core_roce_gid_set(dev, 0, 0, 0, + NULL, NULL, false, 0, 1); +} + +static void mlx5_rdma_make_default_gid(struct mlx5_core_dev *dev, union ib_gid *gid) +{ + u8 hw_id[ETH_ALEN]; + + mlx5_query_mac_address(dev, hw_id); + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + addrconf_addr_eui48(&gid->raw[8], hw_id); +} + +static int mlx5_rdma_add_roce_addr(struct mlx5_core_dev *dev) +{ + union ib_gid gid; + u8 mac[ETH_ALEN]; + + mlx5_rdma_make_default_gid(dev, &gid); + return mlx5_core_roce_gid_set(dev, 0, + MLX5_ROCE_VERSION_1, + 0, gid.raw, mac, + false, 0, 1); +} + +void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev) +{ + struct mlx5_core_roce *roce = &dev->priv.roce; + + if (!roce->ft) + return; + + mlx5_rdma_disable_roce_steering(dev); + mlx5_rdma_del_roce_addr(dev); + mlx5_nic_vport_disable_roce(dev); +} + +void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) +{ + int err; + + if (!MLX5_CAP_GEN(dev, roce)) + return; + + err = mlx5_nic_vport_enable_roce(dev); + if (err) { + mlx5_core_err(dev, "Failed to enable RoCE: %d\n", err); + return; + } + + err = mlx5_rdma_add_roce_addr(dev); + if (err) { + mlx5_core_err(dev, "Failed to add RoCE address: %d\n", err); + goto disable_roce; + } + + err = mlx5_rdma_enable_roce_steering(dev); + if (err) { + mlx5_core_err(dev, "Failed to enable RoCE steering: %d\n", err); + goto del_roce_addr; + } + + return; + +del_roce_addr: + mlx5_rdma_del_roce_addr(dev); +disable_roce: + mlx5_nic_vport_disable_roce(dev); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.h new file mode 100644 index 0000000..750cff2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rdma.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. */ + +#ifndef __MLX5_RDMA_H__ +#define __MLX5_RDMA_H__ + +#include "mlx5_core.h" + +#ifdef CONFIG_MLX5_ESWITCH + +void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev); +void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev); + +#else /* CONFIG_MLX5_ESWITCH */ + +static inline void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) {} +static inline void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev) {} + +#endif /* CONFIG_MLX5_ESWITCH */ +#endif /* __MLX5_RDMA_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rl.c new file mode 100644 index 0000000..7161220 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" + +/* Scheduling element fw management */ +int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *ctx, u32 *element_id) +{ + u32 out[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {}; + u32 in[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {}; + void *schedc; + int err; + + schedc = MLX5_ADDR_OF(create_scheduling_element_in, in, + scheduling_context); + MLX5_SET(create_scheduling_element_in, in, opcode, + MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT); + MLX5_SET(create_scheduling_element_in, in, scheduling_hierarchy, + hierarchy); + memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context)); + + err = mlx5_cmd_exec_inout(dev, create_scheduling_element, in, out); + if (err) + return err; + + *element_id = MLX5_GET(create_scheduling_element_out, out, + scheduling_element_id); + return 0; +} + +int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + void *ctx, u32 element_id, + u32 modify_bitmask) +{ + u32 in[MLX5_ST_SZ_DW(modify_scheduling_element_in)] = {}; + void *schedc; + + schedc = MLX5_ADDR_OF(modify_scheduling_element_in, in, + scheduling_context); + MLX5_SET(modify_scheduling_element_in, in, opcode, + MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT); + MLX5_SET(modify_scheduling_element_in, in, scheduling_element_id, + element_id); + MLX5_SET(modify_scheduling_element_in, in, modify_bitmask, + modify_bitmask); + MLX5_SET(modify_scheduling_element_in, in, scheduling_hierarchy, + hierarchy); + memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context)); + + return mlx5_cmd_exec_in(dev, modify_scheduling_element, in); +} + +int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, + u32 element_id) +{ + u32 in[MLX5_ST_SZ_DW(destroy_scheduling_element_in)] = {}; + + MLX5_SET(destroy_scheduling_element_in, in, opcode, + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT); + MLX5_SET(destroy_scheduling_element_in, in, scheduling_element_id, + element_id); + MLX5_SET(destroy_scheduling_element_in, in, scheduling_hierarchy, + hierarchy); + + return mlx5_cmd_exec_in(dev, destroy_scheduling_element, in); +} + +static bool mlx5_rl_are_equal_raw(struct mlx5_rl_entry *entry, void 
*rl_in, + u16 uid) +{ + return (!memcmp(entry->rl_raw, rl_in, sizeof(entry->rl_raw)) && + entry->uid == uid); +} + +/* Finds an entry where we can register the given rate + * If the rate already exists, return the entry where it is registered, + * otherwise return the first available entry. + * If the table is full, return NULL + */ +static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, + void *rl_in, u16 uid, bool dedicated) +{ + struct mlx5_rl_entry *ret_entry = NULL; + bool empty_found = false; + int i; + + lockdep_assert_held(&table->rl_lock); + WARN_ON(!table->rl_entry); + + for (i = 0; i < table->max_size; i++) { + if (dedicated) { + if (!table->rl_entry[i].refcount) + return &table->rl_entry[i]; + continue; + } + + if (table->rl_entry[i].refcount) { + if (table->rl_entry[i].dedicated) + continue; + if (mlx5_rl_are_equal_raw(&table->rl_entry[i], rl_in, + uid)) + return &table->rl_entry[i]; + } else if (!empty_found) { + empty_found = true; + ret_entry = &table->rl_entry[i]; + } + } + + return ret_entry; +} + +static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, + struct mlx5_rl_entry *entry, bool set) +{ + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {}; + void *pp_context; + + pp_context = MLX5_ADDR_OF(set_pp_rate_limit_in, in, ctx); + MLX5_SET(set_pp_rate_limit_in, in, opcode, + MLX5_CMD_OP_SET_PP_RATE_LIMIT); + MLX5_SET(set_pp_rate_limit_in, in, uid, entry->uid); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, entry->index); + if (set) + memcpy(pp_context, entry->rl_raw, sizeof(entry->rl_raw)); + return mlx5_cmd_exec_in(dev, set_pp_rate_limit, in); +} + +bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + + return (rate <= table->max_rate && rate >= table->min_rate); +} +EXPORT_SYMBOL(mlx5_rl_is_in_range); + +bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0, + struct mlx5_rate_limit *rl_1) +{ + return ((rl_0->rate == rl_1->rate) && + (rl_0->max_burst_sz == rl_1->max_burst_sz) && + (rl_0->typical_pkt_sz == rl_1->typical_pkt_sz)); +} +EXPORT_SYMBOL(mlx5_rl_are_equal); + +static int mlx5_rl_table_get(struct mlx5_rl_table *table) +{ + int i; + + lockdep_assert_held(&table->rl_lock); + + if (table->rl_entry) { + table->refcount++; + return 0; + } + + table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry), + GFP_KERNEL); + if (!table->rl_entry) + return -ENOMEM; + + /* The index represents the index in HW rate limit table + * Index 0 is reserved for unlimited rate + */ + for (i = 0; i < table->max_size; i++) + table->rl_entry[i].index = i + 1; + + table->refcount++; + return 0; +} + +static void mlx5_rl_table_put(struct mlx5_rl_table *table) +{ + lockdep_assert_held(&table->rl_lock); + if (--table->refcount) + return; + + kfree(table->rl_entry); + table->rl_entry = NULL; +} + +static void mlx5_rl_table_free(struct mlx5_core_dev *dev, struct mlx5_rl_table *table) +{ + int i; + + if (!table->rl_entry) + return; + + /* Clear all configured rates */ + for (i = 0; i < table->max_size; i++) + if (table->rl_entry[i].refcount) + mlx5_set_pp_rate_limit_cmd(dev, &table->rl_entry[i], false); + kfree(table->rl_entry); +} + +static void mlx5_rl_entry_get(struct mlx5_rl_entry *entry) +{ + entry->refcount++; +} + +static void +mlx5_rl_entry_put(struct mlx5_core_dev *dev, struct mlx5_rl_entry *entry) +{ + entry->refcount--; + if (!entry->refcount) + mlx5_set_pp_rate_limit_cmd(dev, entry, false); +} + +int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid, + 
bool dedicated_entry, u16 *index) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rl_entry *entry; + u32 rate; + int err; + + if (!table->max_size) + return -EOPNOTSUPP; + + rate = MLX5_GET(set_pp_rate_limit_context, rl_in, rate_limit); + if (!rate || !mlx5_rl_is_in_range(dev, rate)) { + mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n", + rate, table->min_rate, table->max_rate); + return -EINVAL; + } + + mutex_lock(&table->rl_lock); + err = mlx5_rl_table_get(table); + if (err) + goto out; + + entry = find_rl_entry(table, rl_in, uid, dedicated_entry); + if (!entry) { + mlx5_core_err(dev, "Max number of %u rates reached\n", + table->max_size); + err = -ENOSPC; + goto rl_err; + } + if (!entry->refcount) { + /* new rate limit */ + memcpy(entry->rl_raw, rl_in, sizeof(entry->rl_raw)); + entry->uid = uid; + err = mlx5_set_pp_rate_limit_cmd(dev, entry, true); + if (err) { + mlx5_core_err( + dev, + "Failed configuring rate limit(err %d): rate %u, max_burst_sz %u, typical_pkt_sz %u\n", + err, rate, + MLX5_GET(set_pp_rate_limit_context, rl_in, + burst_upper_bound), + MLX5_GET(set_pp_rate_limit_context, rl_in, + typical_packet_size)); + goto rl_err; + } + + entry->dedicated = dedicated_entry; + } + mlx5_rl_entry_get(entry); + *index = entry->index; + mutex_unlock(&table->rl_lock); + return 0; + +rl_err: + mlx5_rl_table_put(table); +out: + mutex_unlock(&table->rl_lock); + return err; +} +EXPORT_SYMBOL(mlx5_rl_add_rate_raw); + +void mlx5_rl_remove_rate_raw(struct mlx5_core_dev *dev, u16 index) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rl_entry *entry; + + mutex_lock(&table->rl_lock); + entry = &table->rl_entry[index - 1]; + mlx5_rl_entry_put(dev, entry); + mlx5_rl_table_put(table); + mutex_unlock(&table->rl_lock); +} +EXPORT_SYMBOL(mlx5_rl_remove_rate_raw); + +int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, + struct mlx5_rate_limit *rl) +{ + u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {}; + + MLX5_SET(set_pp_rate_limit_context, rl_raw, rate_limit, rl->rate); + MLX5_SET(set_pp_rate_limit_context, rl_raw, burst_upper_bound, + rl->max_burst_sz); + MLX5_SET(set_pp_rate_limit_context, rl_raw, typical_packet_size, + rl->typical_pkt_sz); + + return mlx5_rl_add_rate_raw(dev, rl_raw, + MLX5_CAP_QOS(dev, packet_pacing_uid) ? + MLX5_SHARED_RESOURCE_UID : 0, + false, index); +} +EXPORT_SYMBOL(mlx5_rl_add_rate); + +void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl) +{ + u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {}; + struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rl_entry *entry = NULL; + + /* 0 is a reserved value for unlimited rate */ + if (rl->rate == 0) + return; + + MLX5_SET(set_pp_rate_limit_context, rl_raw, rate_limit, rl->rate); + MLX5_SET(set_pp_rate_limit_context, rl_raw, burst_upper_bound, + rl->max_burst_sz); + MLX5_SET(set_pp_rate_limit_context, rl_raw, typical_packet_size, + rl->typical_pkt_sz); + + mutex_lock(&table->rl_lock); + entry = find_rl_entry(table, rl_raw, + MLX5_CAP_QOS(dev, packet_pacing_uid) ? 
+ MLX5_SHARED_RESOURCE_UID : 0, false); + if (!entry || !entry->refcount) { + mlx5_core_warn(dev, "Rate %u, max_burst_sz %u typical_pkt_sz %u are not configured\n", + rl->rate, rl->max_burst_sz, rl->typical_pkt_sz); + goto out; + } + mlx5_rl_entry_put(dev, entry); + mlx5_rl_table_put(table); +out: + mutex_unlock(&table->rl_lock); +} +EXPORT_SYMBOL(mlx5_rl_remove_rate); + +int mlx5_init_rl_table(struct mlx5_core_dev *dev) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + + if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) { + table->max_size = 0; + return 0; + } + + mutex_init(&table->rl_lock); + + /* First entry is reserved for unlimited rate */ + table->max_size = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1; + table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate); + table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate); + + mlx5_core_info(dev, "Rate limit: %u rates are supported, range: %uMbps to %uMbps\n", + table->max_size, + table->min_rate >> 10, + table->max_rate >> 10); + + return 0; +} + +void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + + if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) + return; + + mlx5_rl_table_free(dev, table); + mutex_destroy(&table->rl_lock); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c new file mode 100644 index 0000000..a8d75c2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/cmd.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include "priv.h" + +int mlx5_cmd_alloc_sf(struct mlx5_core_dev *dev, u16 function_id) +{ + u32 out[MLX5_ST_SZ_DW(alloc_sf_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_sf_in)] = {}; + + MLX5_SET(alloc_sf_in, in, opcode, MLX5_CMD_OP_ALLOC_SF); + MLX5_SET(alloc_sf_in, in, function_id, function_id); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_sf_out)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_sf_in)] = {}; + + MLX5_SET(dealloc_sf_in, in, opcode, MLX5_CMD_OP_DEALLOC_SF); + MLX5_SET(dealloc_sf_in, in, function_id, function_id); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_sf_enable_hca(struct mlx5_core_dev *dev, u16 func_id) +{ + u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(enable_hca_in)] = {}; + + MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); + MLX5_SET(enable_hca_in, in, function_id, func_id); + MLX5_SET(enable_hca_in, in, embedded_cpu_function, 0); + return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); +} + +int mlx5_cmd_sf_disable_hca(struct mlx5_core_dev *dev, u16 func_id) +{ + u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {}; + u32 in[MLX5_ST_SZ_DW(disable_hca_in)] = {}; + + MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA); + MLX5_SET(disable_hca_in, in, function_id, func_id); + MLX5_SET(enable_hca_in, in, embedded_cpu_function, 0); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.c new file mode 100644 index 0000000..92eb7da --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.c @@ -0,0 +1,263 @@ +#include +#include +#include "mlx5_core.h" +#include "dev.h" +#include "devlink.h" +#include + +#include "cfg_driver.h" + +struct mlx5_sf_cfg_devm { + struct mlxdevm device; + struct mlx5_sf_dev *sf_dev; +}; + +enum mlx5_devm_param_id { + MLX5_DEVM_PARAM_ID_CMPL_EQ_DEPTH, + MLX5_DEVM_PARAM_ID_ASYNC_EQ_DEPTH, + MLX5_DEVM_PARAM_ID_DISABLE_ROCE, + MLX5_DEVM_PARAM_ID_DISABLE_FC, + MLX5_DEVM_PARAM_ID_DISABLE_NETDEV, + MLX5_DEVM_PARAM_ID_MAX_CMPL_EQS, +}; + +static struct mlx5_sf_dev *mlxdevm_to_sf_dev(struct mlxdevm *devm) +{ + struct mlx5_sf_cfg_devm *sf_cfg_dev; + + sf_cfg_dev = container_of(devm, struct mlx5_sf_cfg_devm, device); + return sf_cfg_dev->sf_dev; +} + +static int mlx5_devm_cmpl_eq_depth_get(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + ctx->val.vu32 = sf_dev->cmpl_eq_depth; + return 0; +} + +static int mlx5_devm_cmpl_eq_depth_set(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + sf_dev->cmpl_eq_depth = ctx->val.vu32; + return 0; +} + +static int mlx5_devm_async_eq_depth_get(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + ctx->val.vu32 = sf_dev->async_eq_depth; + return 0; +} + +static int mlx5_devm_async_eq_depth_set(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + sf_dev->async_eq_depth = ctx->val.vu32; + return 0; +} + +static int mlx5_devm_eq_depth_validate(struct mlxdevm *devm, u32 id, + union mlxdevm_param_value val, + struct netlink_ext_ack *extack) +{ + return (val.vu32 >= 64 && val.vu32 <= 4096) ? 0 : -EINVAL; +} + +static int mlx5_devm_disable_fc_get(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + ctx->val.vbool = sf_dev->disable_fc; + return 0; +} + +static int mlx5_devm_disable_fc_set(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + sf_dev->disable_fc = ctx->val.vbool; + return 0; +} + +static int mlx5_devm_disable_netdev_get(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + ctx->val.vbool = sf_dev->disable_netdev; + return 0; +} + +static int mlx5_devm_disable_netdev_set(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + sf_dev->disable_netdev = ctx->val.vbool; + return 0; +} + +static int mlx5_devm_max_cmpl_eqs_get(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + ctx->val.vu16 = sf_dev->max_cmpl_eqs; + return 0; +} + +static int mlx5_devm_max_cmpl_eqs_set(struct mlxdevm *devm, u32 id, + struct mlxdevm_param_gset_ctx *ctx) +{ + struct mlx5_sf_dev *sf_dev = mlxdevm_to_sf_dev(devm); + + sf_dev->max_cmpl_eqs = ctx->val.vu16; + return 0; +} + +static int mlx5_devm_max_cmpl_eqs_validate(struct mlxdevm *devm, u32 id, + union mlxdevm_param_value val, + struct netlink_ext_ack *extack) +{ + return (val.vu16 != 0) ? 
0 : -EINVAL; +} + +static const struct mlxdevm_param mlx5_sf_cfg_devm_params[] = { + MLXDEVM_PARAM_DRIVER(MLX5_DEVM_PARAM_ID_CMPL_EQ_DEPTH, + "cmpl_eq_depth", MLXDEVM_PARAM_TYPE_U32, + BIT(MLXDEVM_PARAM_CMODE_RUNTIME), + mlx5_devm_cmpl_eq_depth_get, mlx5_devm_cmpl_eq_depth_set, + mlx5_devm_eq_depth_validate), + MLXDEVM_PARAM_DRIVER(MLX5_DEVM_PARAM_ID_ASYNC_EQ_DEPTH, + "async_eq_depth", MLXDEVM_PARAM_TYPE_U32, + BIT(MLXDEVM_PARAM_CMODE_RUNTIME), + mlx5_devm_async_eq_depth_get, mlx5_devm_async_eq_depth_set, + mlx5_devm_eq_depth_validate), + MLXDEVM_PARAM_DRIVER(MLX5_DEVM_PARAM_ID_DISABLE_FC, + "disable_fc", MLXDEVM_PARAM_TYPE_BOOL, + BIT(MLXDEVM_PARAM_CMODE_RUNTIME), + mlx5_devm_disable_fc_get, mlx5_devm_disable_fc_set, + NULL), + MLXDEVM_PARAM_DRIVER(MLX5_DEVM_PARAM_ID_DISABLE_NETDEV, + "disable_netdev", MLXDEVM_PARAM_TYPE_BOOL, + BIT(MLXDEVM_PARAM_CMODE_RUNTIME), + mlx5_devm_disable_netdev_get, mlx5_devm_disable_netdev_set, + NULL), + MLXDEVM_PARAM_DRIVER(MLX5_DEVM_PARAM_ID_MAX_CMPL_EQS, + "max_cmpl_eqs", MLXDEVM_PARAM_TYPE_U16, + BIT(MLXDEVM_PARAM_CMODE_RUNTIME), + mlx5_devm_max_cmpl_eqs_get, mlx5_devm_max_cmpl_eqs_set, + mlx5_devm_max_cmpl_eqs_validate), +}; + +static void mlx5_sf_cfg_devm_set_params_init_values(struct mlxdevm *devm) +{ + struct mlx5_sf_cfg_devm *sf_cfg_dev; + union mlxdevm_param_value value; + + sf_cfg_dev = container_of(devm, struct mlx5_sf_cfg_devm, device); + + value.vbool = false; + mlxdevm_param_driverinit_value_set(devm, MLX5_DEVM_PARAM_ID_DISABLE_ROCE, value); + + value.vbool = false; + mlxdevm_param_driverinit_value_set(devm, MLX5_DEVM_PARAM_ID_DISABLE_FC, value); + + value.vbool = false; + mlxdevm_param_driverinit_value_set(devm, MLX5_DEVM_PARAM_ID_DISABLE_NETDEV, value); + + value.vu32 = 0; + mlxdevm_param_driverinit_value_set(devm, MLX5_DEVM_PARAM_ID_CMPL_EQ_DEPTH, value); + + value.vu32 = 0; + mlxdevm_param_driverinit_value_set(devm, MLX5_DEVM_PARAM_ID_ASYNC_EQ_DEPTH, value); + + value.vu16 = 0; + mlxdevm_param_driverinit_value_set(devm, MLX5_DEVM_PARAM_ID_MAX_CMPL_EQS, value); + +} + +static int mlx5_sf_cfg_dev_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + struct mlx5_sf_cfg_devm *sf_cfg_dev; + struct mlxdevm *devm; + int err; + + sf_cfg_dev = kzalloc(sizeof(*sf_cfg_dev), GFP_KERNEL); + if (!sf_cfg_dev) + return -ENOMEM; + + devm = &sf_cfg_dev->device; + devm->device = &sf_dev->adev.dev; + sf_cfg_dev->sf_dev = sf_dev; + + err = mlxdevm_register(devm); + if (err) + goto err; + + err = mlxdevm_params_register(devm, mlx5_sf_cfg_devm_params, + ARRAY_SIZE(mlx5_sf_cfg_devm_params)); + if (err) + goto params_reg_err; + + mlx5_sf_cfg_devm_set_params_init_values(devm); + mlxdevm_params_publish(devm); + + dev_set_drvdata(&sf_dev->adev.dev, sf_cfg_dev); + return 0; + +params_reg_err: + mlxdevm_unregister(devm); +err: + kfree(sf_cfg_dev); + return err; +} + +static void mlx5_sf_cfg_dev_remove(struct auxiliary_device *adev) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + struct mlx5_sf_cfg_devm *sf_cfg_dev; + struct mlxdevm *devm; + + sf_cfg_dev = dev_get_drvdata(&sf_dev->adev.dev); + devm = &sf_cfg_dev->device; + mlxdevm_params_unregister(devm, mlx5_sf_cfg_devm_params, + ARRAY_SIZE(mlx5_sf_cfg_devm_params)); + mlxdevm_unregister(devm); + kfree(sf_cfg_dev); +} + +static const struct auxiliary_device_id mlx5_sf_dev_id_table[] = { + { .name = MLX5_ADEV_NAME "." 
MLX5_SF_DEV_ID_NAME, }, + { }, +}; + +static struct auxiliary_driver mlx5_sf_cfg_driver = { + .name = "sf_cfg", + .probe = mlx5_sf_cfg_dev_probe, + .remove = mlx5_sf_cfg_dev_remove, + .id_table = mlx5_sf_dev_id_table, +}; + +int mlx5_sf_cfg_driver_register(void) +{ + return auxiliary_driver_register(&mlx5_sf_cfg_driver); +} + +void mlx5_sf_cfg_driver_unregister(void) +{ + auxiliary_driver_unregister(&mlx5_sf_cfg_driver); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.h new file mode 100644 index 0000000..5a17f18 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/cfg_driver.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021 Mellanox Technologies Ltd */ + +#ifndef __MLX5_SF_CFG_DRV_H__ +#define __MLX5_SF_CFG_DRV_H__ + +#ifdef CONFIG_MLX5_SF_CFG +int mlx5_sf_cfg_driver_register(void); +void mlx5_sf_cfg_driver_unregister(void); +#else +static inline int mlx5_sf_cfg_driver_register(void) +{ + return 0; +} + +static inline void mlx5_sf_cfg_driver_unregister(void) +{ +} + +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c new file mode 100644 index 0000000..4486290 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include +#include "mlx5_core.h" +#include "dev.h" +#include "sf/vhca_event.h" +#include "sf/sf.h" +#include "sf/mlx5_ifc_vhca_event.h" +#include "ecpf.h" +#define CREATE_TRACE_POINTS +#include "diag/dev_tracepoint.h" + +struct mlx5_sf_dev_table { + struct xarray devices; + unsigned int max_sfs; + phys_addr_t base_address; + u64 sf_bar_length; + struct notifier_block nb; + struct mutex table_lock; /* Serializes sf life cycle and vhca state change handler */ + struct workqueue_struct *active_wq; + struct work_struct work; + u8 stop_active_wq:1; + struct mlx5_core_dev *dev; +}; + +static bool mlx5_sf_dev_supported(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, sf) && mlx5_vhca_event_supported(dev); +} + +bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + + return table && !xa_empty(&table->devices); +} + +static ssize_t sfnum_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct auxiliary_device *adev = container_of(dev, struct auxiliary_device, dev); + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + + return sysfs_emit(buf, "%u\n", sf_dev->sfnum); +} +static DEVICE_ATTR_RO(sfnum); + +static struct attribute *sf_device_attrs[] = { + &dev_attr_sfnum.attr, + NULL, +}; + +static const struct attribute_group sf_attr_group = { + .attrs = sf_device_attrs, +}; + +static const struct attribute_group *sf_attr_groups[2] = { + &sf_attr_group, + NULL +}; + +static void mlx5_sf_dev_release(struct device *device) +{ + struct auxiliary_device *adev = container_of(device, struct auxiliary_device, dev); + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + + mlx5_adev_idx_free(adev->id); + kfree(sf_dev); +} + +static void mlx5_sf_dev_remove(struct mlx5_core_dev *dev, struct mlx5_sf_dev *sf_dev) +{ + int id; 
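	/*
	 * Capture the auxiliary id for the tracepoint before the device is
	 * deleted; mlx5_sf_dev_release() frees both the id and sf_dev once the
	 * last device reference is dropped.
	 */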
+ + id = sf_dev->adev.id; + trace_mlx5_sf_dev_del(dev, sf_dev, id); + + auxiliary_device_delete(&sf_dev->adev); + auxiliary_device_uninit(&sf_dev->adev); +} + +static void mlx5_sf_dev_add(struct mlx5_core_dev *dev, u16 sf_index, u16 fn_id, u32 sfnum) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + struct mlx5_sf_dev *sf_dev; + struct pci_dev *pdev; + int err; + int id; + + id = mlx5_adev_idx_alloc(); + if (id < 0) { + err = id; + goto add_err; + } + + sf_dev = kzalloc(sizeof(*sf_dev), GFP_KERNEL); + if (!sf_dev) { + mlx5_adev_idx_free(id); + err = -ENOMEM; + goto add_err; + } + pdev = dev->pdev; + sf_dev->adev.id = id; + sf_dev->adev.name = MLX5_SF_DEV_ID_NAME; + sf_dev->adev.dev.release = mlx5_sf_dev_release; + sf_dev->adev.dev.parent = &pdev->dev; + sf_dev->adev.dev.groups = sf_attr_groups; + sf_dev->sfnum = sfnum; + sf_dev->parent_mdev = dev; + sf_dev->fn_id = fn_id; + + if (!table->max_sfs) { + mlx5_adev_idx_free(id); + kfree(sf_dev); + err = -EOPNOTSUPP; + goto add_err; + } + sf_dev->bar_base_addr = table->base_address + (sf_index * table->sf_bar_length); + + trace_mlx5_sf_dev_add(dev, sf_dev, id); + + err = auxiliary_device_init(&sf_dev->adev); + if (err) { + mlx5_adev_idx_free(id); + kfree(sf_dev); + goto add_err; + } + + err = auxiliary_device_add(&sf_dev->adev); + if (err) { + put_device(&sf_dev->adev.dev); + goto add_err; + } + + err = xa_insert(&table->devices, sf_index, sf_dev, GFP_KERNEL); + if (err) + goto xa_err; + return; + +xa_err: + mlx5_sf_dev_remove(dev, sf_dev); +add_err: + mlx5_core_err(dev, "SF DEV: fail device add for index=%d sfnum=%d err=%d\n", + sf_index, sfnum, err); +} + +static void mlx5_sf_dev_del(struct mlx5_core_dev *dev, struct mlx5_sf_dev *sf_dev, u16 sf_index) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + + xa_erase(&table->devices, sf_index); + mlx5_sf_dev_remove(dev, sf_dev); +} + +static int +mlx5_sf_dev_state_change_handler(struct notifier_block *nb, unsigned long event_code, void *data) +{ + struct mlx5_sf_dev_table *table = container_of(nb, struct mlx5_sf_dev_table, nb); + const struct mlx5_vhca_state_event *event = data; + struct mlx5_sf_dev *sf_dev; + u16 max_functions; + u16 sf_index; + u16 base_id; + + max_functions = mlx5_sf_max_functions(table->dev); + if (!max_functions) + return 0; + + base_id = MLX5_CAP_GEN(table->dev, sf_base_id); + if (event->function_id < base_id || event->function_id >= (base_id + max_functions)) + return 0; + + sf_index = event->function_id - base_id; + mutex_lock(&table->table_lock); + sf_dev = xa_load(&table->devices, sf_index); + switch (event->new_vhca_state) { + case MLX5_VHCA_STATE_INVALID: + case MLX5_VHCA_STATE_ALLOCATED: + if (sf_dev) + mlx5_sf_dev_del(table->dev, sf_dev, sf_index); + break; + case MLX5_VHCA_STATE_TEARDOWN_REQUEST: + if (sf_dev) + mlx5_sf_dev_del(table->dev, sf_dev, sf_index); + else + mlx5_core_err(table->dev, + "SF DEV: teardown state for invalid dev index=%d fn_id=0x%x\n", + sf_index, event->sw_function_id); + break; + case MLX5_VHCA_STATE_ACTIVE: + if (!sf_dev) + mlx5_sf_dev_add(table->dev, sf_index, event->function_id, + event->sw_function_id); + break; + default: + break; + } + + mutex_unlock(&table->table_lock); + return 0; +} + +static int mlx5_sf_dev_vhca_arm_all(struct mlx5_sf_dev_table *table) +{ + struct mlx5_core_dev *dev = table->dev; + u16 max_functions; + u16 function_id; + int err = 0; + int i; + + max_functions = mlx5_sf_max_functions(dev); + function_id = MLX5_CAP_GEN(dev, sf_base_id); + /* Arm the vhca context as the vhca event notifier 
*/ + for (i = 0; i < max_functions; i++) { + err = mlx5_vhca_event_arm(dev, function_id); + if (err) + return err; + + function_id++; + } + return 0; +} + +static void mlx5_sf_dev_add_active_work(struct work_struct *work) +{ + struct mlx5_sf_dev_table *table = container_of(work, struct mlx5_sf_dev_table, work); + u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; + struct mlx5_core_dev *dev = table->dev; + u16 max_functions; + u16 function_id; + u16 sw_func_id; + int err = 0; + u8 state; + int i; + + max_functions = mlx5_sf_max_functions(dev); + function_id = MLX5_CAP_GEN(dev, sf_base_id); + for (i = 0; i < max_functions; i++, function_id++) { + if (table->stop_active_wq) + return; + err = mlx5_cmd_query_vhca_state(dev, function_id, out, sizeof(out)); + if (err) + /* A failure of specific vhca doesn't mean others will + * fail as well. + */ + continue; + state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state); + if (state != MLX5_VHCA_STATE_ACTIVE) + continue; + + sw_func_id = MLX5_GET(query_vhca_state_out, out, vhca_state_context.sw_function_id); + mutex_lock(&table->table_lock); + /* Don't probe device which is already probe */ + if (!xa_load(&table->devices, i)) + mlx5_sf_dev_add(dev, i, function_id, sw_func_id); + /* There is a race where SF got inactive after the query + * above. e.g.: the query returns that the state of the + * SF is active, and after that the eswitch manager set it to + * inactive. + * This case cannot be managed in SW, since the probing of the + * SF is on one system, and the inactivation is on a different + * system. + * If the inactive is done after the SF perform init_hca(), + * the SF will fully probe and then removed. If it was + * done before init_hca(), the SF probe will fail. + */ + mutex_unlock(&table->table_lock); + } +} + +/* In case SFs are generated externally, probe active SFs */ +static int mlx5_sf_dev_queue_active_work(struct mlx5_sf_dev_table *table) +{ + if (MLX5_CAP_GEN(table->dev, eswitch_manager)) + return 0; /* the table is local */ + + /* Use a workqueue to probe active SFs, which are in large + * quantity and may take up to minutes to probe. 
+ */ + table->active_wq = create_singlethread_workqueue("mlx5_active_sf"); + if (!table->active_wq) + return -ENOMEM; + INIT_WORK(&table->work, &mlx5_sf_dev_add_active_work); + queue_work(table->active_wq, &table->work); + return 0; +} + +static void mlx5_sf_dev_destroy_active_work(struct mlx5_sf_dev_table *table) +{ + if (table->active_wq) { + table->stop_active_wq = true; + destroy_workqueue(table->active_wq); + } +} + +void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_dev_table *table; + unsigned int max_sfs; + int err; + + if (!mlx5_sf_dev_supported(dev) || !mlx5_vhca_event_supported(dev)) + return; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) { + err = -ENOMEM; + goto table_err; + } + + table->nb.notifier_call = mlx5_sf_dev_state_change_handler; + table->dev = dev; + if (MLX5_CAP_GEN(dev, max_num_sf)) + max_sfs = MLX5_CAP_GEN(dev, max_num_sf); + else + max_sfs = 1 << MLX5_CAP_GEN(dev, log_max_sf); + table->sf_bar_length = 1 << (MLX5_CAP_GEN(dev, log_min_sf_size) + 12); + table->base_address = pci_resource_start(dev->pdev, 2); + table->max_sfs = max_sfs; + xa_init(&table->devices); + mutex_init(&table->table_lock); + dev->priv.sf_dev_table = table; + + err = mlx5_vhca_event_notifier_register(dev, &table->nb); + if (err) + goto vhca_err; + + err = mlx5_sf_dev_queue_active_work(table); + if (err) + goto add_active_err; + + err = mlx5_sf_dev_vhca_arm_all(table); + if (err) + goto arm_err; + mlx5_core_dbg(dev, "SF DEV: max sf devices=%d\n", max_sfs); + return; + +arm_err: + mlx5_sf_dev_destroy_active_work(table); +add_active_err: + mlx5_vhca_event_notifier_unregister(dev, &table->nb); +vhca_err: + table->max_sfs = 0; + kfree(table); + dev->priv.sf_dev_table = NULL; +table_err: + mlx5_core_err(dev, "SF DEV table create err = %d\n", err); +} + +static void mlx5_sf_dev_destroy_all(struct mlx5_sf_dev_table *table) +{ + struct mlx5_sf_dev *sf_dev; + unsigned long index; + + xa_for_each(&table->devices, index, sf_dev) { + xa_erase(&table->devices, index); + mlx5_sf_dev_remove(table->dev, sf_dev); + } +} + +void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_dev_table *table = dev->priv.sf_dev_table; + + if (!table) + return; + + mlx5_sf_dev_destroy_active_work(table); + mlx5_vhca_event_notifier_unregister(dev, &table->nb); + mutex_destroy(&table->table_lock); + + /* Now that event handler is not running, it is safe to destroy + * the sf device without race. 
+ */ + mlx5_sf_dev_destroy_all(table); + + WARN_ON(!xa_empty(&table->devices)); + kfree(table); + dev->priv.sf_dev_table = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h new file mode 100644 index 0000000..0a2b34a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/dev.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_SF_DEV_H__ +#define __MLX5_SF_DEV_H__ + +#ifdef CONFIG_MLX5_SF + +#include + +#define MLX5_SF_DEV_ID_NAME "sf" + +struct mlx5_sf_dev { + struct auxiliary_device adev; + struct mlx5_core_dev *parent_mdev; + struct mlx5_core_dev *mdev; + phys_addr_t bar_base_addr; + u32 sfnum; + u16 fn_id; + +#ifdef CONFIG_MLX5_SF_CFG + u32 cmpl_eq_depth; + u32 async_eq_depth; + bool disable_fc; + bool disable_netdev; + u16 max_cmpl_eqs; +#endif +}; + +void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev); +void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev); + +int mlx5_sf_driver_register(void); +void mlx5_sf_driver_unregister(void); + +bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev); + +#else + +static inline void mlx5_sf_dev_table_create(struct mlx5_core_dev *dev) +{ +} + +static inline void mlx5_sf_dev_table_destroy(struct mlx5_core_dev *dev) +{ +} + +static inline int mlx5_sf_driver_register(void) +{ + return 0; +} + +static inline void mlx5_sf_driver_unregister(void) +{ +} + +static inline bool mlx5_sf_dev_allocated(const struct mlx5_core_dev *dev) +{ + return false; +} + +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h new file mode 100644 index 0000000..7f7c9af --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_SF_DEV_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_SF_DEV_TP_ + +#include +#include +#include "../../dev/dev.h" + +DECLARE_EVENT_CLASS(mlx5_sf_dev_template, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_sf_dev *sfdev, + int aux_id), + TP_ARGS(dev, sfdev, aux_id), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(const struct mlx5_sf_dev*, sfdev) + __field(int, aux_id) + __field(u16, hw_fn_id) + __field(u32, sfnum) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->sfdev = sfdev; + __entry->aux_id = aux_id; + __entry->hw_fn_id = sfdev->fn_id; + __entry->sfnum = sfdev->sfnum; + ), + TP_printk("(%s) sfdev=%pK aux_id=%d hw_id=0x%x sfnum=%u\n", + __get_str(devname), __entry->sfdev, + __entry->aux_id, __entry->hw_fn_id, + __entry->sfnum) +); + +DEFINE_EVENT(mlx5_sf_dev_template, mlx5_sf_dev_add, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_sf_dev *sfdev, + int aux_id), + TP_ARGS(dev, sfdev, aux_id) + ); + +DEFINE_EVENT(mlx5_sf_dev_template, mlx5_sf_dev_del, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_sf_dev *sfdev, + int aux_id), + TP_ARGS(dev, sfdev, aux_id) + ); + +#endif /* _MLX5_SF_DEV_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH sf/dev/diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE dev_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c new file mode 100644 index 0000000..63ef029 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include +#include "mlx5_core.h" +#include "dev.h" +#include "devlink.h" +#include "cfg_driver.h" + +static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + struct mlx5_core_dev *mdev; + struct devlink *devlink; + int err; + + devlink = mlx5_devlink_alloc(&adev->dev); + if (!devlink) + return -ENOMEM; + + mdev = devlink_priv(devlink); + mdev->device = &adev->dev; + mdev->pdev = sf_dev->parent_mdev->pdev; + mdev->bar_addr = sf_dev->bar_base_addr; + mdev->iseg_base = sf_dev->bar_base_addr; + mdev->coredev_type = MLX5_COREDEV_SF; + mdev->priv.parent_mdev = sf_dev->parent_mdev; + mdev->priv.adev_idx = adev->id; + sf_dev->mdev = mdev; + + err = mlx5_mdev_init(mdev, MLX5_DEFAULT_PROF); + if (err) { + mlx5_core_warn(mdev, "mlx5_mdev_init on err=%d\n", err); + goto mdev_err; + } + +#ifdef CONFIG_MLX5_SF_CFG + mdev->disable_en = sf_dev->disable_netdev; + mdev->disable_fc = sf_dev->disable_fc; + mdev->cmpl_eq_depth = sf_dev->cmpl_eq_depth; + mdev->async_eq_depth = sf_dev->async_eq_depth; + mdev->max_cmpl_eq_count = sf_dev->max_cmpl_eqs; +#endif + + mdev->iseg = ioremap(mdev->iseg_base, sizeof(*mdev->iseg)); + if (!mdev->iseg) { + mlx5_core_warn(mdev, "remap error\n"); + err = -ENOMEM; + goto remap_err; + } + + err = mlx5_init_one(mdev); + if (err) { + mlx5_core_warn(mdev, "mlx5_init_one err=%d\n", err); + goto init_one_err; + } + devlink_register(devlink); + return 0; + +init_one_err: + iounmap(mdev->iseg); +remap_err: + mlx5_mdev_uninit(mdev); +mdev_err: + 
mlx5_devlink_free(devlink); + return err; +} + +static void mlx5_sf_dev_remove(struct auxiliary_device *adev) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + struct devlink *devlink = priv_to_devlink(sf_dev->mdev); + + set_bit(MLX5_INTERFACE_STATE_TEARDOWN, &sf_dev->mdev->intf_state); + devlink_unregister(devlink); + mlx5_uninit_one(sf_dev->mdev); + + /* health work might still be active, and it needs pci bar in + * order to know the NIC state. Therefore, drain the health WQ + * before removing the pci bars + */ + mlx5_drain_health_wq(sf_dev->mdev); + iounmap(sf_dev->mdev->iseg); + mlx5_mdev_uninit(sf_dev->mdev); + mlx5_devlink_free(devlink); +} + +static void mlx5_sf_dev_shutdown(struct auxiliary_device *adev) +{ + struct mlx5_sf_dev *sf_dev = container_of(adev, struct mlx5_sf_dev, adev); + + mlx5_unload_one(sf_dev->mdev); +} + +static const struct auxiliary_device_id mlx5_sf_dev_id_table[] = { + { .name = MLX5_ADEV_NAME "." MLX5_SF_DEV_ID_NAME, }, + { }, +}; + +MODULE_DEVICE_TABLE(auxiliary, mlx5_sf_dev_id_table); + +static struct auxiliary_driver mlx5_sf_driver = { + .name = MLX5_SF_DEV_ID_NAME, + .probe = mlx5_sf_dev_probe, + .remove = mlx5_sf_dev_remove, + .shutdown = mlx5_sf_dev_shutdown, + .id_table = mlx5_sf_dev_id_table, +}; + +int mlx5_sf_driver_register(void) +{ + int err; + + err = mlx5_sf_cfg_driver_register(); + if (err) + return err; + + err = auxiliary_driver_register(&mlx5_sf_driver); + if (err) + goto err; + return 0; + +err: + mlx5_sf_cfg_driver_unregister(); + return err; +} + +void mlx5_sf_driver_unregister(void) +{ + auxiliary_driver_unregister(&mlx5_sf_driver); + mlx5_sf_cfg_driver_unregister(); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c new file mode 100644 index 0000000..0fb4d7b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include "eswitch.h" +#include "priv.h" +#include "sf/dev/dev.h" +#include "mlx5_ifc_vhca_event.h" +#include "vhca_event.h" +#include "ecpf.h" +#define CREATE_TRACE_POINTS +#include "diag/sf_tracepoint.h" + +struct mlx5_sf { + struct devlink_port dl_port; + unsigned int port_index; + u32 controller; + u16 id; + u16 hw_fn_id; + u16 hw_state; +}; + +struct mlx5_sf_table { + struct mlx5_core_dev *dev; /* To refer from notifier context. */ + struct xarray port_indices; /* port index based lookup. */ + refcount_t refcount; + struct completion disable_complete; + struct mutex sf_state_lock; /* Serializes sf state among user cmds & vhca event handler. 
*/ + struct notifier_block esw_nb; + struct notifier_block vhca_nb; + u8 ecpu: 1; +}; + +static struct mlx5_sf * +mlx5_sf_lookup_by_index(struct mlx5_sf_table *table, unsigned int port_index) +{ + return xa_load(&table->port_indices, port_index); +} + +static struct mlx5_sf * +mlx5_sf_lookup_by_function_id(struct mlx5_sf_table *table, unsigned int fn_id) +{ + unsigned long index; + struct mlx5_sf *sf; + + xa_for_each(&table->port_indices, index, sf) { + if (sf->hw_fn_id == fn_id) + return sf; + } + return NULL; +} + +static int mlx5_sf_id_insert(struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + return xa_insert(&table->port_indices, sf->port_index, sf, GFP_KERNEL); +} + +static void mlx5_sf_id_erase(struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + xa_erase(&table->port_indices, sf->port_index); +} + +static struct mlx5_sf * +mlx5_sf_alloc(struct mlx5_sf_table *table, struct mlx5_eswitch *esw, + u32 controller, u32 sfnum, struct netlink_ext_ack *extack) +{ + unsigned int dl_port_index; + struct mlx5_sf *sf; + u16 hw_fn_id; + int id_err; + int err; + + if (!mlx5_esw_offloads_controller_valid(esw, controller)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid controller number"); + return ERR_PTR(-EINVAL); + } + + id_err = mlx5_sf_hw_table_sf_alloc(table->dev, controller, sfnum); + if (id_err < 0) { + err = id_err; + goto id_err; + } + + sf = kzalloc(sizeof(*sf), GFP_KERNEL); + if (!sf) { + err = -ENOMEM; + goto alloc_err; + } + sf->id = id_err; + hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, controller, sf->id); + dl_port_index = mlx5_esw_vport_to_devlink_port_index(table->dev, hw_fn_id); + sf->port_index = dl_port_index; + sf->hw_fn_id = hw_fn_id; + sf->hw_state = MLX5_VHCA_STATE_ALLOCATED; + sf->controller = controller; + + err = mlx5_sf_id_insert(table, sf); + if (err) + goto insert_err; + + return sf; + +insert_err: + kfree(sf); +alloc_err: + mlx5_sf_hw_table_sf_free(table->dev, controller, id_err); +id_err: + if (err == -EEXIST) + NL_SET_ERR_MSG_MOD(extack, "SF already exist. Choose different sfnum"); + return ERR_PTR(err); +} + +static void mlx5_sf_free(struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + mlx5_sf_id_erase(table, sf); + mlx5_sf_hw_table_sf_free(table->dev, sf->controller, sf->id); + trace_mlx5_sf_free(table->dev, sf->port_index, sf->controller, sf->hw_fn_id); + kfree(sf); +} + +static struct mlx5_sf_table *mlx5_sf_table_try_get(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table *table = dev->priv.sf_table; + + if (!table) + return NULL; + + return refcount_inc_not_zero(&table->refcount) ? 
table : NULL; +} + +static void mlx5_sf_table_put(struct mlx5_sf_table *table) +{ + if (refcount_dec_and_test(&table->refcount)) + complete(&table->disable_complete); +} + +static enum devlink_port_fn_state mlx5_sf_to_devlink_state(u8 hw_state) +{ + switch (hw_state) { + case MLX5_VHCA_STATE_ACTIVE: + case MLX5_VHCA_STATE_IN_USE: + return DEVLINK_PORT_FN_STATE_ACTIVE; + case MLX5_VHCA_STATE_INVALID: + case MLX5_VHCA_STATE_ALLOCATED: + case MLX5_VHCA_STATE_TEARDOWN_REQUEST: + default: + return DEVLINK_PORT_FN_STATE_INACTIVE; + } +} + +static enum devlink_port_fn_opstate mlx5_sf_to_devlink_opstate(u8 hw_state) +{ + switch (hw_state) { + case MLX5_VHCA_STATE_IN_USE: + case MLX5_VHCA_STATE_TEARDOWN_REQUEST: + return DEVLINK_PORT_FN_OPSTATE_ATTACHED; + case MLX5_VHCA_STATE_INVALID: + case MLX5_VHCA_STATE_ALLOCATED: + case MLX5_VHCA_STATE_ACTIVE: + default: + return DEVLINK_PORT_FN_OPSTATE_DETACHED; + } +} + +static bool mlx5_sf_is_active(const struct mlx5_sf *sf) +{ + return sf->hw_state == MLX5_VHCA_STATE_ACTIVE || sf->hw_state == MLX5_VHCA_STATE_IN_USE; +} + +int mlx5_devlink_sf_port_fn_state_get(struct devlink_port *dl_port, + enum devlink_port_fn_state *state, + enum devlink_port_fn_opstate *opstate, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(dl_port->devlink); + struct mlx5_sf_table *table; + struct mlx5_sf *sf; + int err = 0; + + table = mlx5_sf_table_try_get(dev); + if (!table) + return -EOPNOTSUPP; + + sf = mlx5_sf_lookup_by_index(table, dl_port->index); + if (!sf) { + err = -EOPNOTSUPP; + goto sf_err; + } + mutex_lock(&table->sf_state_lock); + *state = mlx5_sf_to_devlink_state(sf->hw_state); + *opstate = mlx5_sf_to_devlink_opstate(sf->hw_state); + mutex_unlock(&table->sf_state_lock); +sf_err: + mlx5_sf_table_put(table); + return err; +} + +static int mlx5_sf_activate(struct mlx5_core_dev *dev, struct mlx5_sf *sf, + struct netlink_ext_ack *extack) +{ + int err; + + if (mlx5_sf_is_active(sf)) + return 0; + if (sf->hw_state != MLX5_VHCA_STATE_ALLOCATED) { + NL_SET_ERR_MSG_MOD(extack, "SF is inactivated but it is still attached"); + return -EBUSY; + } + + err = mlx5_cmd_sf_enable_hca(dev, sf->hw_fn_id); + if (err) + return err; + + sf->hw_state = MLX5_VHCA_STATE_ACTIVE; + trace_mlx5_sf_activate(dev, sf->port_index, sf->controller, sf->hw_fn_id); + return 0; +} + +static int mlx5_sf_deactivate(struct mlx5_core_dev *dev, struct mlx5_sf *sf) +{ + int err; + + if (!mlx5_sf_is_active(sf)) + return 0; + + err = mlx5_cmd_sf_disable_hca(dev, sf->hw_fn_id); + if (err) + return err; + + sf->hw_state = MLX5_VHCA_STATE_TEARDOWN_REQUEST; + trace_mlx5_sf_deactivate(dev, sf->port_index, sf->controller, sf->hw_fn_id); + return 0; +} + +static int mlx5_sf_state_set(struct mlx5_core_dev *dev, struct mlx5_sf_table *table, + struct mlx5_sf *sf, + enum devlink_port_fn_state state, + struct netlink_ext_ack *extack) +{ + int err = 0; + + mutex_lock(&table->sf_state_lock); + if (state == mlx5_sf_to_devlink_state(sf->hw_state)) + goto out; + if (state == DEVLINK_PORT_FN_STATE_ACTIVE) + err = mlx5_sf_activate(dev, sf, extack); + else if (state == DEVLINK_PORT_FN_STATE_INACTIVE) + err = mlx5_sf_deactivate(dev, sf); + else + err = -EINVAL; +out: + mutex_unlock(&table->sf_state_lock); + return err; +} + +int mlx5_devlink_sf_port_fn_state_set(struct devlink_port *dl_port, + enum devlink_port_fn_state state, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(dl_port->devlink); + struct mlx5_sf_table *table; + struct mlx5_sf *sf; + int err; + + table = 
mlx5_sf_table_try_get(dev); + if (!table) { + NL_SET_ERR_MSG_MOD(extack, + "Port state set is only supported in eswitch switchdev mode or SF ports are disabled."); + return -EOPNOTSUPP; + } + sf = mlx5_sf_lookup_by_index(table, dl_port->index); + if (!sf) { + err = -ENODEV; + goto out; + } + + err = mlx5_sf_state_set(dev, table, sf, state, extack); +out: + mlx5_sf_table_put(table); + return err; +} + +static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table, + const struct devlink_port_new_attrs *new_attr, + struct netlink_ext_ack *extack, + unsigned int *new_port_index) +{ + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_sf *sf; + int err; + + sf = mlx5_sf_alloc(table, esw, new_attr->controller, new_attr->sfnum, extack); + if (IS_ERR(sf)) + return PTR_ERR(sf); + + err = mlx5_esw_offloads_sf_vport_enable(esw, &sf->dl_port, sf->hw_fn_id, + new_attr->controller, new_attr->sfnum); + if (err) + goto esw_err; + *new_port_index = sf->port_index; + trace_mlx5_sf_add(dev, sf->port_index, sf->controller, sf->hw_fn_id, new_attr->sfnum); + return 0; + +esw_err: + mlx5_sf_free(table, sf); + return err; +} + +static int +mlx5_sf_new_check_attr(struct mlx5_core_dev *dev, const struct devlink_port_new_attrs *new_attr, + struct netlink_ext_ack *extack) +{ + if (new_attr->flavour != DEVLINK_PORT_FLAVOUR_PCI_SF) { + NL_SET_ERR_MSG_MOD(extack, "Driver supports only SF port addition"); + return -EOPNOTSUPP; + } + if (new_attr->port_index_valid) { + NL_SET_ERR_MSG_MOD(extack, + "Driver does not support user defined port index assignment"); + return -EOPNOTSUPP; + } + if (!new_attr->sfnum_valid) { + NL_SET_ERR_MSG_MOD(extack, + "User must provide unique sfnum. Driver does not support auto assignment"); + return -EOPNOTSUPP; + } + if (new_attr->controller_valid && new_attr->controller && + !mlx5_core_is_ecpf_esw_manager(dev)) { + NL_SET_ERR_MSG_MOD(extack, "External controller is unsupported"); + return -EOPNOTSUPP; + } + if (new_attr->pfnum != mlx5_get_dev_index(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid pfnum supplied"); + return -EOPNOTSUPP; + } + return 0; +} + +int mlx5_devlink_sf_port_new(struct devlink *devlink, + const struct devlink_port_new_attrs *new_attr, + struct netlink_ext_ack *extack, + unsigned int *new_port_index) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_sf_table *table; + int err; + + err = mlx5_sf_new_check_attr(dev, new_attr, extack); + if (err) + return err; + + table = mlx5_sf_table_try_get(dev); + if (!table) { + NL_SET_ERR_MSG_MOD(extack, + "Port add is only supported in eswitch switchdev mode or SF ports are disabled."); + return -EOPNOTSUPP; + } + err = mlx5_sf_add(dev, table, new_attr, extack, new_port_index); + mlx5_sf_table_put(table); + return err; +} + +static void mlx5_sf_dealloc(struct mlx5_sf_table *table, struct mlx5_sf *sf) +{ + if (sf->hw_state == MLX5_VHCA_STATE_ALLOCATED) { + mlx5_sf_free(table, sf); + } else if (mlx5_sf_is_active(sf)) { + /* Even if its active, it is treated as in_use because by the time, + * it is disabled here, it may getting used. So it is safe to + * always look for the event to ensure that it is recycled only after + * firmware gives confirmation that it is detached by the driver. 
+ */ + mlx5_cmd_sf_disable_hca(table->dev, sf->hw_fn_id); + mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->controller, sf->id); + kfree(sf); + } else { + mlx5_sf_hw_table_sf_deferred_free(table->dev, sf->controller, sf->id); + kfree(sf); + } +} + +int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_sf_table *table; + struct mlx5_sf *sf; + int err = 0; + + table = mlx5_sf_table_try_get(dev); + if (!table) { + NL_SET_ERR_MSG_MOD(extack, + "Port del is only supported in eswitch switchdev mode or SF ports are disabled."); + return -EOPNOTSUPP; + } + sf = mlx5_sf_lookup_by_index(table, port_index); + if (!sf) { + err = -ENODEV; + goto sf_err; + } + + mlx5_esw_offloads_sf_vport_disable(esw, sf->hw_fn_id); + mlx5_sf_id_erase(table, sf); + + mutex_lock(&table->sf_state_lock); + mlx5_sf_dealloc(table, sf); + mutex_unlock(&table->sf_state_lock); +sf_err: + mlx5_sf_table_put(table); + return err; +} + +#if IS_ENABLED(CONFIG_MLXDEVM) +int mlx5_sf_index_to_hw_id(struct devlink *devlink, u16 *hw_fn_id, unsigned int port_index, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_sf_table *table; + struct mlx5_sf *sf; + int err = 0; + + table = mlx5_sf_table_try_get(dev); + if (!table) { + NL_SET_ERR_MSG_MOD(extack, "Fail to get SF table."); + return -EOPNOTSUPP; + } + sf = mlx5_sf_lookup_by_index(table, port_index); + if (!sf) { + err = -ENODEV; + goto sf_err; + } + *hw_fn_id = sf->hw_fn_id; +sf_err: + mlx5_sf_table_put(table); + return err; +} +#endif + +static bool mlx5_sf_state_update_check(const struct mlx5_sf *sf, u8 new_state) +{ + if (sf->hw_state == MLX5_VHCA_STATE_ACTIVE && new_state == MLX5_VHCA_STATE_IN_USE) + return true; + + if (sf->hw_state == MLX5_VHCA_STATE_IN_USE && new_state == MLX5_VHCA_STATE_ACTIVE) + return true; + + if (sf->hw_state == MLX5_VHCA_STATE_TEARDOWN_REQUEST && + new_state == MLX5_VHCA_STATE_ALLOCATED) + return true; + + return false; +} + +static int mlx5_sf_vhca_event(struct notifier_block *nb, unsigned long opcode, void *data) +{ + struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, vhca_nb); + const struct mlx5_vhca_state_event *event = data; + bool update = false; + struct mlx5_sf *sf; + + table = mlx5_sf_table_try_get(table->dev); + if (!table) + return 0; + + mutex_lock(&table->sf_state_lock); + sf = mlx5_sf_lookup_by_function_id(table, event->function_id); + if (!sf) + goto sf_err; + + /* When driver is attached or detached to a function, an event + * notifies such state change. + */ + update = mlx5_sf_state_update_check(sf, event->new_vhca_state); + if (update) + sf->hw_state = event->new_vhca_state; + trace_mlx5_sf_update_state(table->dev, sf->port_index, sf->controller, + sf->hw_fn_id, sf->hw_state); +sf_err: + mutex_unlock(&table->sf_state_lock); + mlx5_sf_table_put(table); + return 0; +} + +static void mlx5_sf_table_enable(struct mlx5_sf_table *table) +{ + init_completion(&table->disable_complete); + refcount_set(&table->refcount, 1); +} + +static void mlx5_sf_deactivate_all(struct mlx5_sf_table *table) +{ + struct mlx5_eswitch *esw = table->dev->priv.eswitch; + unsigned long index; + struct mlx5_sf *sf; + + /* At this point, no new user commands can start and no vhca event can + * arrive. It is safe to destroy all user created SFs. 
+ */ + xa_for_each(&table->port_indices, index, sf) { + mlx5_esw_offloads_sf_vport_disable(esw, sf->hw_fn_id); + mlx5_sf_id_erase(table, sf); + mlx5_sf_dealloc(table, sf); + } +} + +static void mlx5_sf_table_disable(struct mlx5_sf_table *table) +{ + if (!refcount_read(&table->refcount)) + return; + + /* Balances with refcount_set; drop the reference so that new user cmd cannot start + * and new vhca event handler cannot run. + */ + mlx5_sf_table_put(table); + wait_for_completion(&table->disable_complete); + + mlx5_sf_deactivate_all(table); +} + +static int mlx5_sf_esw_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_sf_table *table = container_of(nb, struct mlx5_sf_table, esw_nb); + const struct mlx5_esw_event_info *mode = data; + + switch (mode->new_mode) { + case MLX5_ESWITCH_OFFLOADS: + mlx5_sf_table_enable(table); + break; + case MLX5_ESWITCH_LEGACY: + mlx5_sf_table_disable(table); + break; + default: + break; + } + + return 0; +} + +static bool mlx5_sf_table_supported(const struct mlx5_core_dev *dev) +{ + return dev->priv.eswitch && MLX5_ESWITCH_MANAGER(dev) && + mlx5_sf_hw_table_supported(dev); +} + +int mlx5_sf_table_init(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table *table; + int err; + + if (!mlx5_sf_table_supported(dev) || !mlx5_vhca_event_supported(dev)) + return 0; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + mutex_init(&table->sf_state_lock); + table->dev = dev; + xa_init(&table->port_indices); + dev->priv.sf_table = table; + refcount_set(&table->refcount, 0); + table->esw_nb.notifier_call = mlx5_sf_esw_event; + err = mlx5_esw_event_notifier_register(dev->priv.eswitch, &table->esw_nb); + if (err) + goto reg_err; + + table->vhca_nb.notifier_call = mlx5_sf_vhca_event; + err = mlx5_vhca_event_notifier_register(table->dev, &table->vhca_nb); + if (err) + goto vhca_err; + + return 0; + +vhca_err: + mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb); +reg_err: + mutex_destroy(&table->sf_state_lock); + kfree(table); + dev->priv.sf_table = NULL; + return err; +} + +void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_table *table = dev->priv.sf_table; + + if (!table) + return; + + mlx5_vhca_event_notifier_unregister(table->dev, &table->vhca_nb); + mlx5_esw_event_notifier_unregister(dev->priv.eswitch, &table->esw_nb); + WARN_ON(refcount_read(&table->refcount)); + mutex_destroy(&table->sf_state_lock); + WARN_ON(!xa_empty(&table->port_indices)); + kfree(table); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h new file mode 100644 index 0000000..8bf1cd9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h @@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_SF_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_SF_TP_ + +#include +#include +#include "sf/vhca_event.h" + +TRACE_EVENT(mlx5_sf_add, + TP_PROTO(const struct mlx5_core_dev *dev, + unsigned int port_index, + u32 controller, + u16 hw_fn_id, + u32 sfnum), + TP_ARGS(dev, port_index, controller, hw_fn_id, sfnum), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(unsigned int, port_index) + __field(u32, controller) + __field(u16, hw_fn_id) + __field(u32, sfnum) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->port_index = port_index; + __entry->controller = controller; + __entry->hw_fn_id = hw_fn_id; + __entry->sfnum = sfnum; + ), + TP_printk("(%s) port_index=%u controller=%u hw_id=0x%x sfnum=%u\n", + __get_str(devname), __entry->port_index, __entry->controller, + __entry->hw_fn_id, __entry->sfnum) +); + +TRACE_EVENT(mlx5_sf_free, + TP_PROTO(const struct mlx5_core_dev *dev, + unsigned int port_index, + u32 controller, + u16 hw_fn_id), + TP_ARGS(dev, port_index, controller, hw_fn_id), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(unsigned int, port_index) + __field(u32, controller) + __field(u16, hw_fn_id) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->port_index = port_index; + __entry->controller = controller; + __entry->hw_fn_id = hw_fn_id; + ), + TP_printk("(%s) port_index=%u controller=%u hw_id=0x%x\n", + __get_str(devname), __entry->port_index, __entry->controller, + __entry->hw_fn_id) +); + +TRACE_EVENT(mlx5_sf_hwc_alloc, + TP_PROTO(const struct mlx5_core_dev *dev, + u32 controller, + u16 hw_fn_id, + u32 sfnum), + TP_ARGS(dev, controller, hw_fn_id, sfnum), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(u32, controller) + __field(u16, hw_fn_id) + __field(u32, sfnum) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->controller = controller; + __entry->hw_fn_id = hw_fn_id; + __entry->sfnum = sfnum; + ), + TP_printk("(%s) controller=%u hw_id=0x%x sfnum=%u\n", + __get_str(devname), __entry->controller, __entry->hw_fn_id, + __entry->sfnum) +); + +TRACE_EVENT(mlx5_sf_hwc_free, + TP_PROTO(const struct mlx5_core_dev *dev, + u16 hw_fn_id), + TP_ARGS(dev, hw_fn_id), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(u16, hw_fn_id) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->hw_fn_id = hw_fn_id; + ), + TP_printk("(%s) hw_id=0x%x\n", __get_str(devname), __entry->hw_fn_id) +); + +TRACE_EVENT(mlx5_sf_hwc_deferred_free, + TP_PROTO(const struct mlx5_core_dev *dev, + u16 hw_fn_id), + TP_ARGS(dev, hw_fn_id), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(u16, hw_fn_id) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->hw_fn_id = hw_fn_id; + ), + TP_printk("(%s) hw_id=0x%x\n", __get_str(devname), __entry->hw_fn_id) +); + +DECLARE_EVENT_CLASS(mlx5_sf_state_template, + TP_PROTO(const struct mlx5_core_dev *dev, + u32 port_index, + u32 controller, + u16 hw_fn_id), + TP_ARGS(dev, port_index, controller, hw_fn_id), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(unsigned int, port_index) + __field(u32, controller) + __field(u16, hw_fn_id)), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->port_index = port_index; + __entry->controller = controller; + __entry->hw_fn_id = hw_fn_id; + ), + TP_printk("(%s) port_index=%u 
controller=%u hw_id=0x%x\n", + __get_str(devname), __entry->port_index, __entry->controller, + __entry->hw_fn_id) +); + +DEFINE_EVENT(mlx5_sf_state_template, mlx5_sf_activate, + TP_PROTO(const struct mlx5_core_dev *dev, + u32 port_index, + u32 controller, + u16 hw_fn_id), + TP_ARGS(dev, port_index, controller, hw_fn_id) + ); + +DEFINE_EVENT(mlx5_sf_state_template, mlx5_sf_deactivate, + TP_PROTO(const struct mlx5_core_dev *dev, + u32 port_index, + u32 controller, + u16 hw_fn_id), + TP_ARGS(dev, port_index, controller, hw_fn_id) + ); + +TRACE_EVENT(mlx5_sf_update_state, + TP_PROTO(const struct mlx5_core_dev *dev, + unsigned int port_index, + u32 controller, + u16 hw_fn_id, + u8 state), + TP_ARGS(dev, port_index, controller, hw_fn_id, state), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(unsigned int, port_index) + __field(u32, controller) + __field(u16, hw_fn_id) + __field(u8, state) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->port_index = port_index; + __entry->controller = controller; + __entry->hw_fn_id = hw_fn_id; + __entry->state = state; + ), + TP_printk("(%s) port_index=%u controller=%u hw_id=0x%x state=%u\n", + __get_str(devname), __entry->port_index, __entry->controller, + __entry->hw_fn_id, __entry->state) +); + +#endif /* _MLX5_SF_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH sf/diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE sf_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h new file mode 100644 index 0000000..fd814a1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_SF_VHCA_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_SF_VHCA_TP_ + +#include +#include +#include "sf/vhca_event.h" + +TRACE_EVENT(mlx5_sf_vhca_event, + TP_PROTO(const struct mlx5_core_dev *dev, + const struct mlx5_vhca_state_event *event), + TP_ARGS(dev, event), + TP_STRUCT__entry(__string(devname, dev_name(dev->device)) + __field(u16, hw_fn_id) + __field(u32, sfnum) + __field(u8, vhca_state) + ), + TP_fast_assign(__assign_str(devname, dev_name(dev->device)); + __entry->hw_fn_id = event->function_id; + __entry->sfnum = event->sw_function_id; + __entry->vhca_state = event->new_vhca_state; + ), + TP_printk("(%s) hw_id=0x%x sfnum=%u vhca_state=%d\n", + __get_str(devname), __entry->hw_fn_id, + __entry->sfnum, __entry->vhca_state) +); + +#endif /* _MLX5_SF_VHCA_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH sf/diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE vhca_tracepoint +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c new file mode 100644 index 0000000..17aa348 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ +#include +#include "vhca_event.h" +#include "priv.h" +#include "sf.h" +#include "mlx5_ifc_vhca_event.h" +#include "ecpf.h" +#include "mlx5_core.h" +#include "eswitch.h" +#include "diag/sf_tracepoint.h" + +struct mlx5_sf_hw { + u32 usr_sfnum; + u8 allocated: 1; + u8 pending_delete: 1; +}; + +struct mlx5_sf_hwc_table { + struct mlx5_sf_hw *sfs; + int max_fn; + u16 start_fn_id; +}; + +enum mlx5_sf_hwc_index { + MLX5_SF_HWC_LOCAL, + MLX5_SF_HWC_EXTERNAL, + MLX5_SF_HWC_MAX, +}; + +struct mlx5_sf_hw_table { + struct mlx5_core_dev *dev; + struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. 
*/ + struct notifier_block vhca_nb; + struct mlx5_sf_hwc_table hwc[MLX5_SF_HWC_MAX]; +}; + +static struct mlx5_sf_hwc_table * +mlx5_sf_controller_to_hwc(struct mlx5_core_dev *dev, u32 controller) +{ + int idx = !!controller; + + return &dev->priv.sf_hw_table->hwc[idx]; +} + +u16 mlx5_sf_sw_to_hw_id(struct mlx5_core_dev *dev, u32 controller, u16 sw_id) +{ + struct mlx5_sf_hwc_table *hwc; + + hwc = mlx5_sf_controller_to_hwc(dev, controller); + return hwc->start_fn_id + sw_id; +} + +static u16 mlx5_sf_hw_to_sw_id(struct mlx5_sf_hwc_table *hwc, u16 hw_id) +{ + return hw_id - hwc->start_fn_id; +} + +static struct mlx5_sf_hwc_table * +mlx5_sf_table_fn_to_hwc(struct mlx5_sf_hw_table *table, u16 fn_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(table->hwc); i++) { + if (table->hwc[i].max_fn && + fn_id >= table->hwc[i].start_fn_id && + fn_id < (table->hwc[i].start_fn_id + table->hwc[i].max_fn)) + return &table->hwc[i]; + } + return NULL; +} + +static int mlx5_sf_hw_table_id_alloc(struct mlx5_sf_hw_table *table, u32 controller, + u32 usr_sfnum) +{ + struct mlx5_sf_hwc_table *hwc; + int free_idx = -1; + int i; + + hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + if (!hwc->sfs) + return -ENOSPC; + + for (i = 0; i < hwc->max_fn; i++) { + if (!hwc->sfs[i].allocated && free_idx == -1) { + free_idx = i; + continue; + } + + if (hwc->sfs[i].allocated && hwc->sfs[i].usr_sfnum == usr_sfnum) + return -EEXIST; + } + + if (free_idx == -1) + return -ENOSPC; + + hwc->sfs[free_idx].usr_sfnum = usr_sfnum; + hwc->sfs[free_idx].allocated = true; + return free_idx; +} + +static void mlx5_sf_hw_table_id_free(struct mlx5_sf_hw_table *table, u32 controller, int id) +{ + struct mlx5_sf_hwc_table *hwc; + + hwc = mlx5_sf_controller_to_hwc(table->dev, controller); + hwc->sfs[id].allocated = false; + hwc->sfs[id].pending_delete = false; +} + +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr_sfnum) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + u16 hw_fn_id; + int sw_id; + int err; + + if (!table) + return -EOPNOTSUPP; + + mutex_lock(&table->table_lock); + sw_id = mlx5_sf_hw_table_id_alloc(table, controller, usr_sfnum); + if (sw_id < 0) { + err = sw_id; + goto exist_err; + } + + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, sw_id); + err = mlx5_cmd_alloc_sf(dev, hw_fn_id); + if (err) + goto err; + + err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, usr_sfnum); + if (err) + goto vhca_err; + + if (controller) { + /* If this SF is for external controller, SF manager + * needs to arm firmware to receive the events. 
+ */ + err = mlx5_vhca_event_arm(dev, hw_fn_id); + if (err) + goto vhca_err; + } + + trace_mlx5_sf_hwc_alloc(dev, controller, hw_fn_id, usr_sfnum); + mutex_unlock(&table->table_lock); + return sw_id; + +vhca_err: + mlx5_cmd_dealloc_sf(dev, hw_fn_id); +err: + mlx5_sf_hw_table_id_free(table, controller, sw_id); +exist_err: + mutex_unlock(&table->table_lock); + return err; +} + +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + u16 hw_fn_id; + + mutex_lock(&table->table_lock); + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, id); + mlx5_cmd_dealloc_sf(dev, hw_fn_id); + mlx5_sf_hw_table_id_free(table, controller, id); + mutex_unlock(&table->table_lock); +} + +static void mlx5_sf_hw_table_hwc_sf_free(struct mlx5_core_dev *dev, + struct mlx5_sf_hwc_table *hwc, int idx) +{ + mlx5_cmd_dealloc_sf(dev, hwc->start_fn_id + idx); + hwc->sfs[idx].allocated = false; + hwc->sfs[idx].pending_delete = false; + trace_mlx5_sf_hwc_free(dev, hwc->start_fn_id + idx); +} + +void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u32 controller, u16 id) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; + struct mlx5_sf_hwc_table *hwc; + u16 hw_fn_id; + u8 state; + int err; + + hw_fn_id = mlx5_sf_sw_to_hw_id(dev, controller, id); + hwc = mlx5_sf_controller_to_hwc(dev, controller); + mutex_lock(&table->table_lock); + err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out)); + if (err) + goto err; + state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state); + if (state == MLX5_VHCA_STATE_ALLOCATED) { + mlx5_cmd_dealloc_sf(dev, hw_fn_id); + hwc->sfs[id].allocated = false; + } else { + hwc->sfs[id].pending_delete = true; + trace_mlx5_sf_hwc_deferred_free(dev, hw_fn_id); + } +err: + mutex_unlock(&table->table_lock); +} + +static void mlx5_sf_hw_table_hwc_dealloc_all(struct mlx5_core_dev *dev, + struct mlx5_sf_hwc_table *hwc) +{ + int i; + + for (i = 0; i < hwc->max_fn; i++) { + if (hwc->sfs[i].allocated) + mlx5_sf_hw_table_hwc_sf_free(dev, hwc, i); + } +} + +static void mlx5_sf_hw_table_dealloc_all(struct mlx5_sf_hw_table *table) +{ + mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_EXTERNAL]); + mlx5_sf_hw_table_hwc_dealloc_all(table->dev, &table->hwc[MLX5_SF_HWC_LOCAL]); +} + +static int mlx5_sf_hw_table_hwc_init(struct mlx5_sf_hwc_table *hwc, u16 max_fn, u16 base_id) +{ + struct mlx5_sf_hw *sfs; + + if (!max_fn) + return 0; + + sfs = kcalloc(max_fn, sizeof(*sfs), GFP_KERNEL); + if (!sfs) + return -ENOMEM; + + hwc->sfs = sfs; + hwc->max_fn = max_fn; + hwc->start_fn_id = base_id; + return 0; +} + +static void mlx5_sf_hw_table_hwc_cleanup(struct mlx5_sf_hwc_table *hwc) +{ + kfree(hwc->sfs); +} + +int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_hw_table *table; + u16 max_ext_fn = 0; + u16 ext_base_id = 0; + u16 max_fn = 0; + u16 base_id; + int err; + + if (!mlx5_vhca_event_supported(dev)) + return 0; + + if (mlx5_sf_supported(dev)) + max_fn = mlx5_sf_max_functions(dev); + + err = mlx5_esw_sf_max_hpf_functions(dev, &max_ext_fn, &ext_base_id); + if (err) + return err; + + if (!max_fn && !max_ext_fn) + return 0; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + mutex_init(&table->table_lock); + table->dev = dev; + dev->priv.sf_hw_table = table; + + base_id = mlx5_sf_start_function_id(dev); + err = mlx5_sf_hw_table_hwc_init(&table->hwc[MLX5_SF_HWC_LOCAL], 
max_fn, base_id); + if (err) + goto table_err; + + err = mlx5_sf_hw_table_hwc_init(&table->hwc[MLX5_SF_HWC_EXTERNAL], + max_ext_fn, ext_base_id); + if (err) + goto ext_err; + + mlx5_core_dbg(dev, "SF HW table: max sfs = %d, ext sfs = %d\n", max_fn, max_ext_fn); + return 0; + +ext_err: + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]); +table_err: + mutex_destroy(&table->table_lock); + kfree(table); + return err; +} + +void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + + if (!table) + return; + + mutex_destroy(&table->table_lock); + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_EXTERNAL]); + mlx5_sf_hw_table_hwc_cleanup(&table->hwc[MLX5_SF_HWC_LOCAL]); + kfree(table); +} + +static int mlx5_sf_hw_vhca_event(struct notifier_block *nb, unsigned long opcode, void *data) +{ + struct mlx5_sf_hw_table *table = container_of(nb, struct mlx5_sf_hw_table, vhca_nb); + const struct mlx5_vhca_state_event *event = data; + struct mlx5_sf_hwc_table *hwc; + struct mlx5_sf_hw *sf_hw; + u16 sw_id; + + if (event->new_vhca_state != MLX5_VHCA_STATE_ALLOCATED) + return 0; + + hwc = mlx5_sf_table_fn_to_hwc(table, event->function_id); + if (!hwc) + return 0; + + sw_id = mlx5_sf_hw_to_sw_id(hwc, event->function_id); + sf_hw = &hwc->sfs[sw_id]; + + mutex_lock(&table->table_lock); + /* SF driver notified through firmware that SF is finally detached. + * Hence recycle the sf hardware id for reuse. + */ + if (sf_hw->allocated && sf_hw->pending_delete) + mlx5_sf_hw_table_hwc_sf_free(table->dev, hwc, sw_id); + mutex_unlock(&table->table_lock); + return 0; +} + +int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + + if (!table) + return 0; + + table->vhca_nb.notifier_call = mlx5_sf_hw_vhca_event; + return mlx5_vhca_event_notifier_register(dev, &table->vhca_nb); +} + +void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_sf_hw_table *table = dev->priv.sf_hw_table; + + if (!table) + return; + + mlx5_vhca_event_notifier_unregister(dev, &table->vhca_nb); + /* Dealloc SFs whose firmware event has been missed. 
*/ + mlx5_sf_hw_table_dealloc_all(table); +} + +bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev) +{ + return !!dev->priv.sf_hw_table; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h new file mode 100644 index 0000000..4fc8701 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/mlx5_ifc_vhca_event.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_IFC_VHCA_EVENT_H__ +#define __MLX5_IFC_VHCA_EVENT_H__ + +enum mlx5_ifc_vhca_state { + MLX5_VHCA_STATE_INVALID = 0x0, + MLX5_VHCA_STATE_ALLOCATED = 0x1, + MLX5_VHCA_STATE_ACTIVE = 0x2, + MLX5_VHCA_STATE_IN_USE = 0x3, + MLX5_VHCA_STATE_TEARDOWN_REQUEST = 0x4, +}; + +struct mlx5_ifc_vhca_state_context_bits { + u8 arm_change_event[0x1]; + u8 reserved_at_1[0xb]; + u8 vhca_state[0x4]; + u8 reserved_at_10[0x10]; + + u8 sw_function_id[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_query_vhca_state_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_vhca_state_context_bits vhca_state_context; +}; + +struct mlx5_ifc_query_vhca_state_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 embedded_cpu_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_vhca_state_field_select_bits { + u8 reserved_at_0[0x1e]; + u8 sw_function_id[0x1]; + u8 arm_change_event[0x1]; +}; + +struct mlx5_ifc_modify_vhca_state_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_vhca_state_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 embedded_cpu_function[0x1]; + u8 reserved_at_41[0xf]; + u8 function_id[0x10]; + + struct mlx5_ifc_vhca_state_field_select_bits vhca_state_field_select; + + struct mlx5_ifc_vhca_state_context_bits vhca_state_context; +}; + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h new file mode 100644 index 0000000..7114f3f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/priv.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_SF_PRIV_H__ +#define __MLX5_SF_PRIV_H__ + +#include + +int mlx5_cmd_alloc_sf(struct mlx5_core_dev *dev, u16 function_id); +int mlx5_cmd_dealloc_sf(struct mlx5_core_dev *dev, u16 function_id); + +int mlx5_cmd_sf_enable_hca(struct mlx5_core_dev *dev, u16 func_id); +int mlx5_cmd_sf_disable_hca(struct mlx5_core_dev *dev, u16 func_id); + +u16 mlx5_sf_sw_to_hw_id(struct mlx5_core_dev *dev, u32 controller, u16 sw_id); + +int mlx5_sf_hw_table_sf_alloc(struct mlx5_core_dev *dev, u32 controller, u32 usr_sfnum); +void mlx5_sf_hw_table_sf_free(struct mlx5_core_dev *dev, u32 controller, u16 id); +void mlx5_sf_hw_table_sf_deferred_free(struct mlx5_core_dev *dev, u32 controller, u16 id); +bool mlx5_sf_hw_table_supported(const struct mlx5_core_dev *dev); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h new file mode 100644 index 0000000..ae31309 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/sf.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_SF_H__ +#define __MLX5_SF_H__ + +#include +#include "lib/sf.h" + +#ifdef CONFIG_MLX5_SF_MANAGER +int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev); +void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev); + +int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev); +void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev); + +int mlx5_sf_table_init(struct mlx5_core_dev *dev); +void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev); + +int mlx5_devlink_sf_port_new(struct devlink *devlink, + const struct devlink_port_new_attrs *add_attr, + struct netlink_ext_ack *extack, + unsigned int *new_port_index); +int mlx5_devlink_sf_port_del(struct devlink *devlink, unsigned int port_index, + struct netlink_ext_ack *extack); +int mlx5_devlink_sf_port_fn_state_get(struct devlink_port *dl_port, + enum devlink_port_fn_state *state, + enum devlink_port_fn_opstate *opstate, + struct netlink_ext_ack *extack); +int mlx5_devlink_sf_port_fn_state_set(struct devlink_port *dl_port, + enum devlink_port_fn_state state, + struct netlink_ext_ack *extack); +#if IS_ENABLED(CONFIG_MLXDEVM) +int mlx5_sf_index_to_hw_id(struct devlink *devlink, u16 *hw_fn_id, unsigned int port_index, + struct netlink_ext_ack *extack); +#endif +#else + +static inline int mlx5_sf_hw_table_init(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_sf_hw_table_cleanup(struct mlx5_core_dev *dev) +{ +} + +static inline int mlx5_sf_hw_table_create(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_sf_hw_table_destroy(struct mlx5_core_dev *dev) +{ +} + +static inline int mlx5_sf_table_init(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_sf_table_cleanup(struct mlx5_core_dev *dev) +{ +} + +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c new file mode 100644 index 0000000..d908fba --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#include +#include "mlx5_ifc_vhca_event.h" +#include "mlx5_core.h" +#include "vhca_event.h" +#include "ecpf.h" +#define CREATE_TRACE_POINTS +#include "diag/vhca_tracepoint.h" + +struct mlx5_vhca_state_notifier { + struct mlx5_core_dev *dev; + struct mlx5_nb nb; + struct blocking_notifier_head n_head; +}; + +struct mlx5_vhca_event_work { + struct work_struct work; + struct mlx5_vhca_state_notifier *notifier; + struct mlx5_vhca_state_event event; +}; + +int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, u32 *out, u32 outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_vhca_state_in)] = {}; + + MLX5_SET(query_vhca_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_STATE); + MLX5_SET(query_vhca_state_in, in, function_id, function_id); + MLX5_SET(query_vhca_state_in, in, embedded_cpu_function, 0); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen); +} + +static int mlx5_cmd_modify_vhca_state(struct mlx5_core_dev *dev, u16 function_id, + u32 *in, u32 inlen) +{ + u32 
out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {}; + + MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE); + MLX5_SET(modify_vhca_state_in, in, function_id, function_id); + MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, 0); + + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} + +int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, u32 sw_fn_id) +{ + u32 out[MLX5_ST_SZ_DW(modify_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {}; + + MLX5_SET(modify_vhca_state_in, in, opcode, MLX5_CMD_OP_MODIFY_VHCA_STATE); + MLX5_SET(modify_vhca_state_in, in, function_id, function_id); + MLX5_SET(modify_vhca_state_in, in, embedded_cpu_function, 0); + MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.sw_function_id, 1); + MLX5_SET(modify_vhca_state_in, in, vhca_state_context.sw_function_id, sw_fn_id); + + return mlx5_cmd_exec_inout(dev, modify_vhca_state, in, out); +} + +int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id) +{ + u32 in[MLX5_ST_SZ_DW(modify_vhca_state_in)] = {}; + + MLX5_SET(modify_vhca_state_in, in, vhca_state_context.arm_change_event, 1); + MLX5_SET(modify_vhca_state_in, in, vhca_state_field_select.arm_change_event, 1); + + return mlx5_cmd_modify_vhca_state(dev, function_id, in, sizeof(in)); +} + +static void +mlx5_vhca_event_notify(struct mlx5_core_dev *dev, struct mlx5_vhca_state_event *event) +{ + u32 out[MLX5_ST_SZ_DW(query_vhca_state_out)] = {}; + int err; + + err = mlx5_cmd_query_vhca_state(dev, event->function_id, out, sizeof(out)); + if (err) + return; + + event->sw_function_id = MLX5_GET(query_vhca_state_out, out, + vhca_state_context.sw_function_id); + event->new_vhca_state = MLX5_GET(query_vhca_state_out, out, + vhca_state_context.vhca_state); + + mlx5_vhca_event_arm(dev, event->function_id); + trace_mlx5_sf_vhca_event(dev, event); + + blocking_notifier_call_chain(&dev->priv.vhca_state_notifier->n_head, 0, event); +} + +static void mlx5_vhca_state_work_handler(struct work_struct *_work) +{ + struct mlx5_vhca_event_work *work = container_of(_work, struct mlx5_vhca_event_work, work); + struct mlx5_vhca_state_notifier *notifier = work->notifier; + struct mlx5_core_dev *dev = notifier->dev; + + mlx5_vhca_event_notify(dev, &work->event); + kfree(work); +} + +static int +mlx5_vhca_state_change_notifier(struct notifier_block *nb, unsigned long type, void *data) +{ + struct mlx5_vhca_state_notifier *notifier = + mlx5_nb_cof(nb, struct mlx5_vhca_state_notifier, nb); + struct mlx5_vhca_event_work *work; + struct mlx5_eqe *eqe = data; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return NOTIFY_DONE; + INIT_WORK(&work->work, &mlx5_vhca_state_work_handler); + work->notifier = notifier; + work->event.function_id = be16_to_cpu(eqe->data.vhca_state.function_id); + mlx5_events_work_enqueue(notifier->dev, &work->work); + return NOTIFY_OK; +} + +void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap) +{ + if (!mlx5_vhca_event_supported(dev)) + return; + + MLX5_SET(cmd_hca_cap, set_hca_cap, vhca_state, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_allocated, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_active, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_in_use, 1); + MLX5_SET(cmd_hca_cap, set_hca_cap, event_on_vhca_state_teardown_request, 1); +} + +int mlx5_vhca_event_init(struct mlx5_core_dev *dev) +{ + struct mlx5_vhca_state_notifier *notifier; + + if (!mlx5_vhca_event_supported(dev)) + return 0; + + notifier 
= kzalloc(sizeof(*notifier), GFP_KERNEL); + if (!notifier) + return -ENOMEM; + + dev->priv.vhca_state_notifier = notifier; + notifier->dev = dev; + BLOCKING_INIT_NOTIFIER_HEAD(¬ifier->n_head); + MLX5_NB_INIT(¬ifier->nb, mlx5_vhca_state_change_notifier, VHCA_STATE_CHANGE); + return 0; +} + +void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev) +{ + if (!mlx5_vhca_event_supported(dev)) + return; + + kfree(dev->priv.vhca_state_notifier); + dev->priv.vhca_state_notifier = NULL; +} + +void mlx5_vhca_event_start(struct mlx5_core_dev *dev) +{ + struct mlx5_vhca_state_notifier *notifier; + + if (!dev->priv.vhca_state_notifier) + return; + + notifier = dev->priv.vhca_state_notifier; + mlx5_eq_notifier_register(dev, ¬ifier->nb); +} + +void mlx5_vhca_event_stop(struct mlx5_core_dev *dev) +{ + struct mlx5_vhca_state_notifier *notifier; + + if (!dev->priv.vhca_state_notifier) + return; + + notifier = dev->priv.vhca_state_notifier; + mlx5_eq_notifier_unregister(dev, ¬ifier->nb); +} + +int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + if (!dev->priv.vhca_state_notifier) + return -EOPNOTSUPP; + return blocking_notifier_chain_register(&dev->priv.vhca_state_notifier->n_head, nb); +} + +void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&dev->priv.vhca_state_notifier->n_head, nb); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h new file mode 100644 index 0000000..013cdfe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sf/vhca_event.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 Mellanox Technologies Ltd */ + +#ifndef __MLX5_VHCA_EVENT_H__ +#define __MLX5_VHCA_EVENT_H__ + +#ifdef CONFIG_MLX5_SF + +struct mlx5_vhca_state_event { + u16 function_id; + u16 sw_function_id; + u8 new_vhca_state; +}; + +static inline bool mlx5_vhca_event_supported(const struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN_MAX(dev, vhca_state); +} + +void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap); +int mlx5_vhca_event_init(struct mlx5_core_dev *dev); +void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev); +void mlx5_vhca_event_start(struct mlx5_core_dev *dev); +void mlx5_vhca_event_stop(struct mlx5_core_dev *dev); +int mlx5_vhca_event_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb); +void mlx5_vhca_event_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb); +int mlx5_modify_vhca_sw_id(struct mlx5_core_dev *dev, u16 function_id, u32 sw_fn_id); +int mlx5_vhca_event_arm(struct mlx5_core_dev *dev, u16 function_id); +int mlx5_cmd_query_vhca_state(struct mlx5_core_dev *dev, u16 function_id, + u32 *out, u32 outlen); +#else + +static inline void mlx5_vhca_state_cap_handle(struct mlx5_core_dev *dev, void *set_hca_cap) +{ +} + +static inline int mlx5_vhca_event_init(struct mlx5_core_dev *dev) +{ + return 0; +} + +static inline void mlx5_vhca_event_cleanup(struct mlx5_core_dev *dev) +{ +} + +static inline void mlx5_vhca_event_start(struct mlx5_core_dev *dev) +{ +} + +static inline void mlx5_vhca_event_stop(struct mlx5_core_dev *dev) +{ +} + +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov.c new file 
mode 100644 index 0000000..4695874 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2014, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "mlx5_core.h" +#include "mlx5_irq.h" +#include "eswitch.h" + +static int sriov_restore_guids(struct mlx5_core_dev *dev, int vf) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_hca_vport_context *in; + int err = 0; + + /* Restore sriov guid and policy settings */ + if (sriov->vfs_ctx[vf].node_guid || + sriov->vfs_ctx[vf].port_guid || + sriov->vfs_ctx[vf].policy != MLX5_POLICY_INVALID) { + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->node_guid = sriov->vfs_ctx[vf].node_guid; + in->port_guid = sriov->vfs_ctx[vf].port_guid; + in->policy = sriov->vfs_ctx[vf].policy; + in->field_select = + !!(in->port_guid) * MLX5_HCA_VPORT_SEL_PORT_GUID | + !!(in->node_guid) * MLX5_HCA_VPORT_SEL_NODE_GUID | + !!(in->policy) * MLX5_HCA_VPORT_SEL_STATE_POLICY; + + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, vf + 1, in); + if (err) + mlx5_core_warn(dev, "modify vport context failed, unable to restore VF %d settings\n", vf); + + kfree(in); + } + + return err; +} + +static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err, vf, num_msix_count; + + if (!MLX5_ESWITCH_MANAGER(dev)) + goto enable_vfs_hca; + + err = mlx5_eswitch_enable(dev->priv.eswitch, num_vfs); + if (err) { + mlx5_core_warn(dev, + "failed to enable eswitch SRIOV (%d)\n", err); + return err; + } + +enable_vfs_hca: + err = mlx5_create_vfs_sysfs(dev, num_vfs); + if (err) { + mlx5_core_warn(dev, "failed to create SRIOV sysfs (%d)\n", err); +#ifdef CONFIG_MLX5_CORE_EN + if (MLX5_ESWITCH_MANAGER(dev)) + mlx5_eswitch_disable(dev->priv.eswitch); +#endif + return err; + } + + num_msix_count = mlx5_get_default_msix_vec_count(dev, num_vfs); + for (vf = 0; vf < num_vfs; vf++) { + err = mlx5_core_enable_hca(dev, vf + 1); + if (err) { + mlx5_core_warn(dev, "failed to enable VF %d (%d)\n", vf, err); + continue; + } + + err = mlx5_set_msix_vec_count(dev, vf + 
1, num_msix_count); + if (err) { + mlx5_core_warn(dev, + "failed to set MSI-X vector counts VF %d, err %d\n", + vf, err); + continue; + } + + sriov->vfs_ctx[vf].enabled = 1; + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) { + err = sriov_restore_guids(dev, vf); + if (err) { + mlx5_core_warn(dev, + "failed to restore VF %d settings, err %d\n", + vf, err); + continue; + } + } + mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); + } + + return 0; +} + +static void +mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; + int vf; + + for (vf = num_vfs - 1; vf >= 0; vf--) { + if (!sriov->vfs_ctx[vf].enabled) + continue; + err = mlx5_core_disable_hca(dev, vf + 1); + if (err) { + mlx5_core_warn(dev, "failed to disable VF %d\n", vf); + continue; + } + sriov->vfs_ctx[vf].enabled = 0; + } + + mlx5_eswitch_disable_sriov(dev->priv.eswitch, clear_vf); + + mlx5_destroy_vfs_sysfs(dev, num_vfs); + + if (mlx5_wait_for_pages(dev, &dev->priv.vfs_pages)) + mlx5_core_warn(dev, "timeout reclaiming VFs pages\n"); +} + +static int mlx5_sriov_enable(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int err; + + if (num_vfs && pci_num_vf(dev->pdev)) { + if (num_vfs == pci_num_vf(dev->pdev)) + return 0; + + mlx5_core_warn(dev, + "VFs already enabled. Disable before enabling %d VFs\n", + num_vfs); + return -EBUSY; + } + + err = mlx5_device_enable_sriov(dev, num_vfs); + if (err) { + mlx5_core_warn(dev, "mlx5_device_enable_sriov failed : %d\n", err); + return err; + } + + err = pci_enable_sriov(pdev, num_vfs); + if (err) { + mlx5_core_warn(dev, "pci_enable_sriov failed : %d\n", err); + mlx5_device_disable_sriov(dev, num_vfs, true); + } + return err; +} + +static void mlx5_sriov_disable(struct pci_dev *pdev) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + int num_vfs = pci_num_vf(dev->pdev); + + pci_disable_sriov(pdev); + mlx5_device_disable_sriov(dev, num_vfs, true); +} + +int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err = 0; + + mlx5_core_dbg(dev, "requested num_vfs %d\n", num_vfs); + + if (num_vfs) + err = mlx5_sriov_enable(pdev, num_vfs); + else + mlx5_sriov_disable(pdev); + + if (!err) + sriov->num_vfs = num_vfs; + return err ? err : num_vfs; +} + +int mlx5_core_sriov_set_msix_vec_count(struct pci_dev *vf, int msix_vec_count) +{ + struct pci_dev *pf = pci_physfn(vf); + struct mlx5_core_sriov *sriov; + struct mlx5_core_dev *dev; + int num_vf_msix, id; + + dev = pci_get_drvdata(pf); + num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix); + if (!num_vf_msix) + return -EOPNOTSUPP; + + if (!msix_vec_count) + msix_vec_count = + mlx5_get_default_msix_vec_count(dev, pci_num_vf(pf)); + + sriov = &dev->priv.sriov; + + /* Reversed translation of PCI VF function number to the internal + * function_id, which exists in the name of virtfn symlink. 
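+ * For example, the VF reached through the PF's virtfn2 symlink has VF
+ * index 2 and is addressed below as function_id 3 (index + 1), the same
+ * vport numbering used when the vectors were first assigned in
+ * mlx5_device_enable_sriov().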
+ */ + for (id = 0; id < pci_num_vf(pf); id++) { + if (!sriov->vfs_ctx[id].enabled) + continue; + + if (vf->devfn == pci_iov_virtfn_devfn(pf, id)) + break; + } + + if (id == pci_num_vf(pf) || !sriov->vfs_ctx[id].enabled) + return -EINVAL; + + return mlx5_set_msix_vec_count(dev, id + 1, msix_vec_count); +} + +int mlx5_sriov_attach(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_pf(dev) || !pci_num_vf(dev->pdev)) + return 0; + + /* If sriov VFs exist in PCI level, enable them in device level */ + return mlx5_device_enable_sriov(dev, pci_num_vf(dev->pdev)); +} + +void mlx5_sriov_detach(struct mlx5_core_dev *dev) +{ + if (!mlx5_core_is_pf(dev)) + return; + + mlx5_device_disable_sriov(dev, pci_num_vf(dev->pdev), false); +} + +static u16 mlx5_get_max_vfs(struct mlx5_core_dev *dev) +{ + u16 host_total_vfs; + const u32 *out; + + if (mlx5_core_is_ecpf_esw_manager(dev)) { + out = mlx5_esw_query_functions(dev); + + /* Old FW doesn't support getting total_vfs from esw func + * but supports getting it from pci_sriov. + */ + if (IS_ERR(out)) + goto done; + host_total_vfs = MLX5_GET(query_esw_functions_out, out, + host_params_context.host_total_vfs); + kvfree(out); + if (host_total_vfs) + return host_total_vfs; + } + +done: + return pci_sriov_get_totalvfs(dev->pdev); +} + +int mlx5_sriov_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct pci_dev *pdev = dev->pdev; + int total_vfs; + int err; + + if (!mlx5_core_is_pf(dev)) + return 0; + + total_vfs = pci_sriov_get_totalvfs(pdev); + sriov->max_vfs = mlx5_get_max_vfs(dev); + sriov->num_vfs = pci_num_vf(pdev); + sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL); + if (!sriov->vfs_ctx) + return -ENOMEM; + + err = mlx5_sriov_sysfs_init(dev); + if (err) { + mlx5_core_warn(dev, "failed to init SRIOV sysfs (%d)\n", err); + kfree(sriov->vfs_ctx); + return err; + } + + return 0; +} + +void mlx5_sriov_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + + if (!mlx5_core_is_pf(dev)) + return; + + mlx5_sriov_sysfs_cleanup(dev); + kfree(sriov->vfs_ctx); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov_sysfs.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov_sysfs.c new file mode 100644 index 0000000..f173e94 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/sriov_sysfs.c @@ -0,0 +1,1589 @@ +/* + * Copyright (c) 2014, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "eswitch.h" +#ifdef CONFIG_MLX5_ESWITCH +#include "esw/vf_meter.h" +#include "esw/qos.h" +#endif +#include "esw/legacy.h" + +struct vf_attributes { + struct attribute attr; + ssize_t (*show)(struct mlx5_sriov_vf *, struct vf_attributes *, + char *buf); + ssize_t (*store)(struct mlx5_sriov_vf *, struct vf_attributes *, + const char *buf, size_t count); +}; + +static ssize_t vf_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct vf_attributes *ga = + container_of(attr, struct vf_attributes, attr); + struct mlx5_sriov_vf *g = container_of(kobj, struct mlx5_sriov_vf, kobj); + + if (!ga->show) + return -EIO; + + return ga->show(g, ga, buf); +} + +static ssize_t vf_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct vf_attributes *ga = + container_of(attr, struct vf_attributes, attr); + struct mlx5_sriov_vf *g = container_of(kobj, struct mlx5_sriov_vf, kobj); + + if (!ga->store) + return -EIO; + + return ga->store(g, ga, buf, size); +} + +static ssize_t vf_paging_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct vf_attributes *ga = + container_of(attr, struct vf_attributes, attr); + struct mlx5_sriov_vf *g = container_of(kobj, struct mlx5_sriov_vf, page_kobj); + + if (!ga->show) + return -EIO; + + return ga->show(g, ga, buf); +} + +static ssize_t vf_paging_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct vf_attributes *ga = + container_of(attr, struct vf_attributes, attr); + struct mlx5_sriov_vf *g = container_of(kobj, struct mlx5_sriov_vf, page_kobj); + + if (!ga->store) + return -EIO; + + return ga->store(g, ga, buf, size); +} + +struct vf_group_attributes { + struct attribute attr; + ssize_t (*show)(struct mlx5_esw_rate_group *, struct vf_group_attributes *, + char *buf); + ssize_t (*store)(struct mlx5_esw_rate_group *, struct vf_group_attributes *, + const char *buf, size_t count); +}; + +static ssize_t vf_group_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct vf_group_attributes *ga = + container_of(attr, struct vf_group_attributes, attr); + struct mlx5_esw_rate_group *g = container_of(kobj, struct mlx5_esw_rate_group, kobj); + + if (!ga->show) + return -EIO; + + return ga->show(g, ga, buf); +} + +static ssize_t vf_group_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct vf_group_attributes *ga = + container_of(attr, struct vf_group_attributes, attr); + struct mlx5_esw_rate_group *g = container_of(kobj, struct mlx5_esw_rate_group, kobj); + + if (!ga->store) + return -EIO; + + return ga->store(g, ga, buf, size); +} + +static ssize_t max_tx_rate_group_show(struct mlx5_esw_rate_group *g, + struct vf_group_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set VF group max rate\n"); +} + +static ssize_t max_tx_rate_group_store(struct 
mlx5_esw_rate_group *g, + struct vf_group_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + u32 max_rate; + int err; + + err = sscanf(buf, "%u", &max_rate); + if (err != 1) + return -EINVAL; + + err = mlx5_esw_qos_set_sysfs_group_max_rate(esw, g, max_rate); + + return err ? err : count; +} + +static ssize_t min_tx_rate_group_show(struct mlx5_esw_rate_group *g, + struct vf_group_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set VF group min rate\n"); +} + +static ssize_t min_tx_rate_group_store(struct mlx5_esw_rate_group *g, + struct vf_group_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + u32 min_rate; + int err; + + err = sscanf(buf, "%u", &min_rate); + if (err != 1) + return -EINVAL; + + err = mlx5_esw_qos_set_sysfs_group_min_rate(esw, g, min_rate); + + return err ? err : count; +} + +static ssize_t port_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + struct mlx5_core_dev *dev = g->dev; + union ib_gid gid; + int err; + u8 *p; + + err = mlx5_core_query_gids(dev, 1, 1, g->vf, 0 , &gid); + if (err) { + mlx5_core_warn(dev, "failed to query gid at index 0 for vf %d\n", g->vf); + return err; + } + + p = &gid.raw[8]; + err = sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]); + return err; +} + +static ssize_t port_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_vf_context *vfs_ctx = dev->priv.sriov.vfs_ctx; + struct mlx5_hca_vport_context *in; + u64 guid = 0; + int err; + int tmp[8]; + int i; + + err = sscanf(buf, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5], &tmp[6], &tmp[7]); + if (err != 8) + return -EINVAL; + + for (i = 0; i < 8; i++) + guid += ((u64)tmp[i] << ((7 - i) * 8)); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; + in->port_guid = guid; + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, g->vf + 1, in); + kfree(in); + if (err) + return err; + + vfs_ctx[g->vf].port_guid = guid; + + return count; +} + +static int show_hca_node_guid(struct mlx5_core_dev *dev, u16 vf, + __be64 *node_guid) +{ + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_core_query_hca_vport_context(dev, 1, 1, vf, rep); + if (err) + goto free; + + *node_guid = cpu_to_be64(rep->node_guid); + + return 0; + +free: + kfree(rep); + return err; +} + +static int show_nic_node_guid(struct mlx5_core_dev *dev, u16 vf, + __be64 *node_guid) +{ + int err; + + err = mlx5_query_nic_vport_node_guid(dev, vf + 1, node_guid); + if (!err) + *node_guid = cpu_to_be64(*node_guid); + + return err; +} + +static ssize_t node_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + struct mlx5_core_dev *dev = g->dev; + __be64 guid; + + int err; + u8 *p; + + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) + err = show_hca_node_guid(dev, g->vf, &guid); + else if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + err = show_nic_node_guid(dev, g->vf, &guid); + else + return -ENOTSUPP; + + if (err) { + mlx5_core_warn(dev, "failed to query node guid for vf %d (%d)\n", + g->vf, err); + return err; + } + + p = 
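+ /* Both GUID attributes use the same representation as port_store() and
+ * node_store(): eight colon-separated hex bytes, most significant byte
+ * first, e.g. 00:11:22:33:44:55:66:77.
+ */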
(u8 *)&guid; + err = sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]); + + return err; +} + +static int modify_hca_node_guid(struct mlx5_core_dev *dev, u16 vf, + u64 node_guid) +{ + struct mlx5_vf_context *vfs_ctx = dev->priv.sriov.vfs_ctx; + struct mlx5_hca_vport_context *in; + int err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; + in->node_guid = node_guid; + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].node_guid = node_guid; + kfree(in); + + return err; +} + +static int modify_nic_node_guid(struct mlx5_core_dev *dev, u16 vf, + u64 node_guid) +{ + return mlx5_modify_nic_vport_node_guid(dev, vf + 1, node_guid); +} + +static ssize_t node_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + u64 guid = 0; + int err; + int tmp[8]; + int i; + + err = sscanf(buf, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5], &tmp[6], &tmp[7]); + if (err != 8) + return -EINVAL; + + for (i = 0; i < 8; i++) + guid += ((u64)tmp[i] << ((7 - i) * 8)); + + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) + err = modify_hca_node_guid(dev, g->vf, guid); + else if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + err = modify_nic_node_guid(dev, g->vf, guid); + else + return -ENOTSUPP; + + if (err) { + mlx5_core_warn(dev, "failed to modify node guid for vf %d (%d)\n", + g->vf, err); + return err; + } + + return count; +} + +static const char *policy_str(enum port_state_policy policy) +{ + switch (policy) { + case MLX5_POLICY_DOWN: return "Down\n"; + case MLX5_POLICY_UP: return "Up\n"; + case MLX5_POLICY_FOLLOW: return "Follow\n"; + default: return "Invalid policy\n"; + } +} + +static ssize_t policy_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_hca_vport_context *rep; + const char *p = NULL; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_core_query_hca_vport_context(dev, 1, 1, g->vf, rep); + if (err) { + mlx5_core_warn(dev, "failed to query port policy for vf %d (%d)\n", + g->vf, err); + goto free; + } + p = policy_str(rep->policy); + strcpy(buf, p); + +free: + kfree(rep); + return p ? 
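+ /* The policy attribute holds the VF port-state policy and accepts the
+ * strings "Down", "Up" or "Follow" (parsed by strpolicy() below), e.g.
+ *   echo Follow > /sys/bus/pci/devices/<PF>/sriov/<vf>/policy
+ */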
strlen(p) : err; +} + +static int strpolicy(const char *buf, enum port_state_policy *policy) +{ + if (sysfs_streq(buf, "Down")) { + *policy = MLX5_POLICY_DOWN; + return 0; + } + + if (sysfs_streq(buf, "Up")) { + *policy = MLX5_POLICY_UP; + return 0; + } + + if (sysfs_streq(buf, "Follow")) { + *policy = MLX5_POLICY_FOLLOW; + return 0; + } + return -EINVAL; +} + +static ssize_t policy_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_vf_context *vfs_ctx = dev->priv.sriov.vfs_ctx; + struct mlx5_hca_vport_context *in; + enum port_state_policy policy; + int err; + + err = strpolicy(buf, &policy); + if (err) + return err; + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->policy = policy; + in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, g->vf + 1, in); + kfree(in); + if (err) + return err; + + vfs_ctx[g->vf].policy = policy; + + return count; +} + +#ifdef CONFIG_MLX5_ESWITCH +/* ETH SRIOV SYSFS */ +static ssize_t mac_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set VF Mac Address\n"); +} + +static ssize_t mac_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + u8 mac[ETH_ALEN]; + int err; + + err = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]); + if (err == 6) + goto set_mac; + + if (sysfs_streq(buf, "Random")) + eth_random_addr(mac); + else + return -EINVAL; + +set_mac: + err = mlx5_eswitch_set_vport_mac(dev->priv.eswitch, g->vf + 1, mac); + return err ? err : count; +} + +static ssize_t vlan_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set VF Vlan," + " Qos, and optionally Vlan Protocol (default 802.1Q)\n"); +} + +static ssize_t vlan_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + char vproto_ext[5] = {'\0'}; + __be16 vlan_proto; + u16 vlan_id; + u8 qos; + int err; + + err = sscanf(buf, "%hu:%hhu:802.%4s", &vlan_id, &qos, vproto_ext); + if (err == 3) { + if ((strcmp(vproto_ext, "1AD") == 0) || + (strcmp(vproto_ext, "1ad") == 0)) + vlan_proto = htons(ETH_P_8021AD); + else if ((strcmp(vproto_ext, "1Q") == 0) || + (strcmp(vproto_ext, "1q") == 0)) + vlan_proto = htons(ETH_P_8021Q); + else + return -EINVAL; + } else { + err = sscanf(buf, "%hu:%hhu", &vlan_id, &qos); + if (err != 2) + return -EINVAL; + vlan_proto = htons(ETH_P_8021Q); + } + + err = mlx5_eswitch_set_vport_vlan(dev->priv.eswitch, g->vf + 1, + vlan_id, qos, vlan_proto); + return err ? 
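+ /* Accepted formats: "<vid>:<qos>" for 802.1Q, or "<vid>:<qos>:802.1Q" /
+ * "<vid>:<qos>:802.1ad" to select the VLAN protocol explicitly, e.g.
+ *   echo 100:0:802.1ad > /sys/bus/pci/devices/<PF>/sriov/<vf>/vlan
+ */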
err : count; +} + +static const char *vlan_proto_str(u16 vlan, u8 qos, __be16 vlan_proto) +{ + if (!vlan && !qos) + return "N/A"; + + switch (vlan_proto) { + case htons(ETH_P_8021AD): return "802.1ad"; + case htons(ETH_P_8021Q): return "802.1Q"; + default: return "Invalid vlan protocol"; + } +} + +static ssize_t spoofcheck_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to enable|disable VF SpoofCheck\n" + ); +} + +static ssize_t spoofcheck_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, + size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + bool settings; + int err; + + if (sysfs_streq(buf, "ON")) + settings = true; + else if (sysfs_streq(buf, "OFF")) + settings = false; + else + return -EINVAL; + + err = mlx5_eswitch_set_vport_spoofchk(dev->priv.eswitch, g->vf + 1, settings); + return err ? err : count; +} + +static ssize_t trust_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to trust|untrust VF\n" + ); +} + +static ssize_t trust_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, + size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + bool settings; + int err; + + if (sysfs_streq(buf, "ON")) + settings = true; + else if (sysfs_streq(buf, "OFF")) + settings = false; + else + return -EINVAL; + + err = mlx5_eswitch_set_vport_trust(dev->priv.eswitch, g->vf + 1, settings); + return err ? err : count; +} + +static ssize_t link_state_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, "usage: write to set VF State\n"); +} + +static ssize_t link_state_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, + size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + enum port_state_policy policy; + int err; + + err = strpolicy(buf, &policy); + if (err) + return err; + + err = mlx5_eswitch_set_vport_state(dev->priv.eswitch, g->vf + 1, policy); + return err ? err : count; +} + +static ssize_t max_tx_rate_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set VF max rate\n"); +} + +static ssize_t max_tx_rate_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + u32 max_tx_rate; + u32 min_tx_rate; + int err; + + mutex_lock(&esw->state_lock); + min_tx_rate = evport->qos.min_rate; + mutex_unlock(&esw->state_lock); + + err = sscanf(buf, "%u", &max_tx_rate); + if (err != 1) + return -EINVAL; + + err = mlx5_eswitch_set_vport_rate(dev->priv.eswitch, g->vf + 1, + max_tx_rate, min_tx_rate); + return err ? err : count; +} + +static ssize_t group_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set VF vport group\n", + MLX5_ESW_QOS_SYSFS_GROUP_MAX_ID); +} + +static ssize_t group_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + u32 group_id; + int err; + + err = sscanf(buf, "%u", &group_id); + if (err != 1) + return -EINVAL; + + if (group_id > MLX5_ESW_QOS_SYSFS_GROUP_MAX_ID) + return -EINVAL; + + err = mlx5_esw_qos_vport_update_sysfs_group(esw, g->vf + 1, group_id); + + return err ? 
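+ /* Valid group ids are 0..MLX5_ESW_QOS_SYSFS_GROUP_MAX_ID; rate groups
+ * also show up under sriov/groups/<id>/ with their own min_tx_rate,
+ * max_tx_rate and config attributes (see mlx5_create_vf_group_sysfs()).
+ */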
err : count; +} + +static ssize_t min_tx_rate_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set VF min rate\n"); +} + +static ssize_t min_tx_rate_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + u32 min_tx_rate; + u32 max_tx_rate; + int err; + + mutex_lock(&esw->state_lock); + max_tx_rate = evport->qos.max_rate; + mutex_unlock(&esw->state_lock); + + err = sscanf(buf, "%u", &min_tx_rate); + if (err != 1) + return -EINVAL; + + err = mlx5_eswitch_set_vport_rate(dev->priv.eswitch, g->vf + 1, + max_tx_rate, min_tx_rate); + return err ? err : count; +} + +static ssize_t min_pf_tx_rate_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + return sprintf(buf, + "usage: write to set PF min rate\n"); +} + +static ssize_t min_pf_tx_rate_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + u32 min_tx_rate; + u32 max_tx_rate; + int err; + + mutex_lock(&esw->state_lock); + max_tx_rate = evport->qos.max_rate; + mutex_unlock(&esw->state_lock); + + err = sscanf(buf, "%u", &min_tx_rate); + if (err != 1) + return -EINVAL; + + err = mlx5_eswitch_set_vport_rate(dev->priv.eswitch, g->vf, + max_tx_rate, min_tx_rate); + return err ? err : count; +} + +#define _sprintf(p, buf, format, arg...) \ + ((PAGE_SIZE - (int)(p - buf)) <= 0 ? 0 : \ + scnprintf(p, PAGE_SIZE - (int)(p - buf), format, ## arg)) + +static ssize_t trunk_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + char *buf) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + u16 vlan_id = 0; + char *ret = buf; + + mutex_lock(&esw->state_lock); + if (!!bitmap_weight(evport->info.vlan_trunk_8021q_bitmap, VLAN_N_VID)) { + ret += _sprintf(ret, buf, "Allowed 802.1Q VLANs:"); + for_each_set_bit(vlan_id, evport->info.vlan_trunk_8021q_bitmap, + VLAN_N_VID) + ret += _sprintf(ret, buf, " %d", vlan_id); + ret += _sprintf(ret, buf, "\n"); + } + mutex_unlock(&esw->state_lock); + + return (ssize_t)(ret - buf); +} + +static ssize_t trunk_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, + size_t count) +{ + struct mlx5_core_dev *dev = g->dev; + u16 start_vid, end_vid; + char op[5]; + int err; + + err = sscanf(buf, "%4s %hu %hu", op, &start_vid, &end_vid); + if (err != 3) + return -EINVAL; + + if (!strcmp(op, "add")) + err = mlx5_eswitch_add_vport_trunk_range(dev->priv.eswitch, + g->vf + 1, + start_vid, end_vid); + else if (!strcmp(op, "rem")) + err = mlx5_eswitch_del_vport_trunk_range(dev->priv.eswitch, + g->vf + 1, + start_vid, end_vid); + else + return -EINVAL; + + return err ? 
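+ /* VGT+ trunk ranges are edited as "add <start_vid> <end_vid>" or
+ * "rem <start_vid> <end_vid>", e.g.
+ *   echo "add 10 20" > /sys/bus/pci/devices/<PF>/sriov/<vf>/trunk
+ */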
err : count; +} + +static ssize_t config_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + struct mlx5_vport *evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + struct mlx5_vport_info *ivi; + char *p = buf; + + if (!esw && MLX5_CAP_GEN(esw->dev, vport_group_manager) && mlx5_core_is_pf(esw->dev)) + return -EPERM; + + if (IS_ERR(evport)) + return PTR_ERR(evport); + + mutex_lock(&esw->state_lock); + ivi = &evport->info; + p += _sprintf(p, buf, "VF : %d\n", g->vf); + p += _sprintf(p, buf, "MAC : %pM\n", ivi->mac); + p += _sprintf(p, buf, "VLAN : %d\n", ivi->vlan); + p += _sprintf(p, buf, "QoS : %d\n", ivi->qos); + p += _sprintf(p, buf, "VLAN Proto : %s\n", + vlan_proto_str(ivi->vlan, ivi->qos, ivi->vlan_proto)); + p += _sprintf(p, buf, "SpoofCheck : %s\n", ivi->spoofchk ? "ON" : "OFF"); + p += _sprintf(p, buf, "Trust : %s\n", ivi->trusted ? "ON" : "OFF"); + p += _sprintf(p, buf, "LinkState : %s", policy_str(ivi->link_state)); + + if (evport->qos.enabled) { + p += _sprintf(p, buf, "MinTxRate : %d\n", evport->qos.min_rate); + p += _sprintf(p, buf, "MaxTxRate : %d\n", evport->qos.max_rate); + if (evport->qos.group) + p += _sprintf(p, buf, "RateGroup : %d\n", + evport->qos.group->group_id); + else + p += _sprintf(p, buf, "RateGroup : 0\n"); + } else { + p += _sprintf(p, buf, "MinTxRate : 0\nMaxTxRate : 0\nRateGroup : 0\n"); + } + + p += _sprintf(p, buf, "VGT+ : %s\n", + !!bitmap_weight(ivi->vlan_trunk_8021q_bitmap, + VLAN_N_VID) ? "ON" : "OFF"); + p += _sprintf(p, buf, "RateGroup : %d\n", ivi->group); + mutex_unlock(&esw->state_lock); + + return (ssize_t)(p - buf); +} + +static ssize_t config_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, size_t count) +{ + return -ENOTSUPP; +} + +static ssize_t config_group_show(struct mlx5_esw_rate_group *g, + struct vf_group_attributes *oa, + char *buf) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + char *p = buf; + + if (!esw && MLX5_CAP_GEN(esw->dev, vport_group_manager) && + mlx5_core_is_pf(esw->dev)) + return -EPERM; + + mutex_lock(&esw->state_lock); + p += _sprintf(p, buf, "Num VFs : %d\n", g->num_vports); + p += _sprintf(p, buf, "MaxRate : %d\n", g->max_rate); + p += _sprintf(p, buf, "MinRate : %d\n", g->min_rate); + p += _sprintf(p, buf, "BWShare(Indirect cfg) : %d\n", g->bw_share); + mutex_unlock(&esw->state_lock); + + return (ssize_t)(p - buf); +} + +static ssize_t config_group_store(struct mlx5_esw_rate_group *g, + struct vf_group_attributes *oa, + const char *buf, size_t count) +{ + return -ENOTSUPP; +} + +static ssize_t stats_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + struct ifla_vf_stats_backport ifi_backport; + struct mlx5_core_dev *dev = g->dev; + struct mlx5_vport *vport = mlx5_eswitch_get_vport(dev->priv.eswitch, g->vf + 1); + struct ifla_vf_stats ifi; + struct mlx5_vport_drop_stats stats = {}; + int err; + char *p = buf; + + err = mlx5_eswitch_get_vport_stats(dev->priv.eswitch, g->vf + 1, &ifi); + if (err) + return -EINVAL; + + err = mlx5_eswitch_get_vport_stats_backport(dev->priv.eswitch, g->vf + 1, &ifi_backport); + if (err) + return -EINVAL; + + err = mlx5_esw_query_vport_drop_stats(dev, vport, &stats); + if (err) + return -EINVAL; + + p += _sprintf(p, buf, "tx_packets : %llu\n", ifi.tx_packets); + p += _sprintf(p, buf, "tx_bytes : %llu\n", ifi.tx_bytes); + p += _sprintf(p, buf, "tx_dropped : %llu\n", stats.tx_dropped); + p += 
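+ /* The remaining lines mix the regular vport counters (ifi), the
+ * backport-only tx_broadcast/tx_multicast counters and the rx/tx drop
+ * counters queried just above.
+ */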
_sprintf(p, buf, "rx_packets : %llu\n", ifi.rx_packets); + p += _sprintf(p, buf, "rx_bytes : %llu\n", ifi.rx_bytes); + p += _sprintf(p, buf, "rx_broadcast : %llu\n", ifi.broadcast); + p += _sprintf(p, buf, "rx_multicast : %llu\n", ifi.multicast); + p += _sprintf(p, buf, "tx_broadcast : %llu\n", ifi_backport.tx_broadcast); + p += _sprintf(p, buf, "tx_multicast : %llu\n", ifi_backport.tx_multicast); + p += _sprintf(p, buf, "rx_dropped : %llu\n", stats.rx_dropped); + + return (ssize_t)(p - buf); +} + +static ssize_t stats_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + return -ENOTSUPP; +} + +static ssize_t vf_meter_common_show(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, char *buf, + int data_type) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + u64 data; + int err; + + err = mlx5_eswitch_get_vf_meter_data(esw, g->vf + 1, data_type, + g->meter_type.rx_tx, + g->meter_type.xps, &data); + if (err) + return err; + + return sprintf(buf, "%lld\n", data); +} + +static ssize_t vf_meter_common_store(struct mlx5_sriov_vf *g, + struct vf_attributes *oa, + const char *buf, size_t count, + int data_type) +{ + struct mlx5_core_dev *dev = g->dev; + struct mlx5_eswitch *esw = dev->priv.eswitch; + s64 data; + int err; + + err = kstrtos64(buf, 10, &data); + if (err) + return err; + + if (data < 0) + return -EINVAL; + + err = mlx5_eswitch_set_vf_meter_data(esw, g->vf + 1, data_type, + g->meter_type.rx_tx, + g->meter_type.xps, data); + + return err ? err : count; +} + +static ssize_t rate_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + return vf_meter_common_show(g, oa, buf, MLX5_RATE_LIMIT_DATA_RATE); +} + +static ssize_t rate_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + return vf_meter_common_store(g, oa, buf, count, MLX5_RATE_LIMIT_DATA_RATE); +} + +static ssize_t burst_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + return vf_meter_common_show(g, oa, buf, MLX5_RATE_LIMIT_DATA_BURST); +} + +static ssize_t burst_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + return vf_meter_common_store(g, oa, buf, count, MLX5_RATE_LIMIT_DATA_BURST); +} + +static ssize_t bytes_dropped_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + return vf_meter_common_show(g, oa, buf, MLX5_RATE_LIMIT_DATA_BYTES_DROPPED); +} + +static ssize_t bytes_dropped_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + return -EOPNOTSUPP; +} + +static ssize_t packets_dropped_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + return vf_meter_common_show(g, oa, buf, MLX5_RATE_LIMIT_DATA_PACKETS_DROPPED); +} + +static ssize_t packets_dropped_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + return -EOPNOTSUPP; +} + +static ssize_t page_limit_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + struct mlx5_eswitch *esw = g->dev->priv.eswitch; + struct mlx5_vport *evport; + u32 page_limit; + + evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + spin_lock(&evport->pg_counters_lock); + page_limit = evport->page_limit; + spin_unlock(&evport->pg_counters_lock); + return sprintf(buf, "%u\n", page_limit); +} + +static ssize_t page_limit_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + struct mlx5_eswitch *esw = 
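+ /* page_limit and num_pages live in the per-VF paging_control/ directory:
+ * num_pages mirrors evport->fw_pages, the number of FW pages currently
+ * accounted to this VF, while page_limit is a writable bound kept in
+ * evport->page_limit.
+ */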
g->dev->priv.eswitch; + struct mlx5_vport *evport; + u32 limit; + int err; + + evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + err = sscanf(buf, "%u", &limit); + if (err != 1) + return -EINVAL; + spin_lock(&evport->pg_counters_lock); + evport->page_limit = limit; + spin_unlock(&evport->pg_counters_lock); + return count; +} + +static ssize_t num_pages_show(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + char *buf) +{ + struct mlx5_eswitch *esw = g->dev->priv.eswitch; + struct mlx5_vport *evport; + u32 fw_pages; + + evport = mlx5_eswitch_get_vport(esw, g->vf + 1); + spin_lock(&evport->pg_counters_lock); + fw_pages = evport->fw_pages; + spin_unlock(&evport->pg_counters_lock); + return sprintf(buf, "%u\n", fw_pages); +} + +static ssize_t num_pages_store(struct mlx5_sriov_vf *g, struct vf_attributes *oa, + const char *buf, size_t count) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_MLX5_ESWITCH */ + +static ssize_t num_vf_store(struct device *device, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = container_of(device, struct pci_dev, dev); + int req_vfs; + int err; + + if (kstrtoint(buf, 0, &req_vfs) || req_vfs < 0 || + req_vfs > pci_sriov_get_totalvfs(pdev)) + return -EINVAL; + + err = mlx5_core_sriov_configure(pdev, req_vfs); + if (err < 0) + return err; + + return count; +} + +static ssize_t num_vf_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = container_of(device, struct pci_dev, dev); + struct mlx5_core_dev *dev = pci_get_drvdata(pdev); + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + + return sprintf(buf, "%d\n", sriov->num_vfs); +} + +static DEVICE_ATTR(mlx5_num_vfs, 0600, num_vf_show, num_vf_store); + +static const struct sysfs_ops vf_sysfs_ops = { + .show = vf_attr_show, + .store = vf_attr_store, +}; + +static const struct sysfs_ops vf_paging_ops = { + .show = vf_paging_attr_show, + .store = vf_paging_attr_store, +}; + +static const struct sysfs_ops vf_group_sysfs_ops = { + .show = vf_group_attr_show, + .store = vf_group_attr_store, +}; + +#define VF_RATE_GROUP_ATTR(_name) struct vf_group_attributes vf_group_attr_##_name = \ + __ATTR(_name, 0644, _name##_group_show, _name##_group_store) +#define VF_ATTR(_name) struct vf_attributes vf_attr_##_name = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +VF_ATTR(node); +VF_ATTR(port); +VF_ATTR(policy); + +#ifdef CONFIG_MLX5_ESWITCH +VF_ATTR(mac); +VF_ATTR(vlan); +VF_ATTR(link_state); +VF_ATTR(spoofcheck); +VF_ATTR(trust); +VF_ATTR(page_limit); +VF_ATTR(num_pages); +VF_ATTR(max_tx_rate); +VF_ATTR(min_tx_rate); +VF_ATTR(config); +VF_ATTR(trunk); +VF_ATTR(stats); +VF_ATTR(group); +VF_RATE_GROUP_ATTR(min_tx_rate); +VF_RATE_GROUP_ATTR(max_tx_rate); +VF_RATE_GROUP_ATTR(config); + +static struct attribute *vf_eth_attrs[] = { + &vf_attr_node.attr, + &vf_attr_mac.attr, + &vf_attr_vlan.attr, + &vf_attr_link_state.attr, + &vf_attr_spoofcheck.attr, + &vf_attr_trust.attr, + &vf_attr_max_tx_rate.attr, + &vf_attr_min_tx_rate.attr, + &vf_attr_config.attr, + &vf_attr_trunk.attr, + &vf_attr_stats.attr, + &vf_attr_group.attr, + NULL +}; + +static struct attribute *vf_group_attrs[] = { + &vf_group_attr_max_tx_rate.attr, + &vf_group_attr_min_tx_rate.attr, + &vf_group_attr_config.attr, + NULL +}; + +static struct attribute *vf_paging_attrs[] = { + &vf_attr_page_limit.attr, + &vf_attr_num_pages.attr, + NULL +}; + +static struct kobj_type vf_type_eth = { + .sysfs_ops = &vf_sysfs_ops, + .default_attrs = vf_eth_attrs +}; + +static struct kobj_type 
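+ /* Resulting layout under the PF PCI device (e.g.
+ * /sys/bus/pci/devices/0000:08:00.0/):
+ *   mlx5_num_vfs
+ *   sriov/<vf>/         node/port/policy (IB) or mac, vlan, link_state,
+ *                       spoofcheck, trust, *_tx_rate, config, trunk, stats,
+ *                       group, plus paging_control/ and meters/ where supported
+ *   sriov/pf/           min_tx_rate
+ *   sriov/groups/<id>/  max_tx_rate, min_tx_rate, config
+ */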
vf_paging = { + .sysfs_ops = &vf_paging_ops, + .default_attrs = vf_paging_attrs +}; + +static struct kobj_type vf_group = { + .sysfs_ops = &vf_group_sysfs_ops, + .default_attrs = vf_group_attrs +}; + +static struct vf_attributes pf_attr_min_pf_tx_rate = \ + __ATTR(min_tx_rate, 0644, min_pf_tx_rate_show, min_pf_tx_rate_store); + +static struct attribute *pf_eth_attrs[] = { + &pf_attr_min_pf_tx_rate.attr, + NULL, +}; + +static struct kobj_type pf_type_eth = { + .sysfs_ops = &vf_sysfs_ops, + .default_attrs = pf_eth_attrs +}; + +VF_ATTR(rate); +VF_ATTR(burst); +VF_ATTR(bytes_dropped); +VF_ATTR(packets_dropped); + +static struct attribute *vf_meters_eth_attrs[] = { + &vf_attr_rate.attr, + &vf_attr_burst.attr, + &vf_attr_bytes_dropped.attr, + &vf_attr_packets_dropped.attr, + NULL +}; + +static struct kobj_type vf_meters_type_eth = { + .sysfs_ops = &vf_sysfs_ops, + .default_attrs = vf_meters_eth_attrs +}; +#endif /* CONFIG_MLX5_ESWITCH */ + +static struct attribute *vf_ib_attrs[] = { + &vf_attr_node.attr, + &vf_attr_port.attr, + &vf_attr_policy.attr, + NULL +}; + +static struct kobj_type vf_type_ib = { + .sysfs_ops = &vf_sysfs_ops, + .default_attrs = vf_ib_attrs +}; + +static struct device_attribute *mlx5_class_attributes[] = { + &dev_attr_mlx5_num_vfs, +}; + +int mlx5_sriov_sysfs_init(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct device *device = &dev->pdev->dev; + int err; + int i; + + sriov->config = kobject_create_and_add("sriov", &device->kobj); + if (!sriov->config) + return -ENOMEM; + +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) { + sriov->groups_config = kobject_create_and_add("groups", + sriov->config); + if (!sriov->groups_config) { + err = -ENOMEM; + goto err_groups; + } + } +#endif + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { + err = device_create_file(device, mlx5_class_attributes[i]); + if (err) + goto err_attr; + } + + return 0; + +err_attr: + if (sriov->groups_config) { + kobject_put(sriov->groups_config); + sriov->groups_config = NULL; + } + +err_groups: + kobject_put(sriov->config); + sriov->config = NULL; + return err; +} + +void mlx5_sriov_sysfs_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct device *device = &dev->pdev->dev; + int i; + + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) + device_remove_file(device, mlx5_class_attributes[i]); + + if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) + kobject_put(sriov->groups_config); + kobject_put(sriov->config); + sriov->config = NULL; +} + +int mlx5_create_vf_group_sysfs(struct mlx5_core_dev *dev, + u32 group_id, struct kobject *group_kobj) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + int err; + +#ifdef CONFIG_MLX5_ESWITCH + err = kobject_init_and_add(group_kobj, &vf_group, sriov->groups_config, + "%d", group_id); + if (err) + return err; + + kobject_uevent(group_kobj, KOBJ_ADD); +#endif + + return 0; +} + +void mlx5_destroy_vf_group_sysfs(struct mlx5_core_dev *dev, + struct kobject *group_kobj) +{ +#ifdef CONFIG_MLX5_ESWITCH + kobject_put(group_kobj); +#endif +} + + +#ifdef CONFIG_MLX5_ESWITCH +static void mlx5_destroy_vfs_sysfs_meters(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_sriov_vf_meters *meters; + struct mlx5_sriov_vf *vf; + int i, j; + + for (i = 0; i < num_vfs; i++) { + vf = &sriov->vfs[i]; + + meters = vf->meters; + if (!meters) + break; + + for (j = 0; j < 4; j++) + 
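+ /* the four per-VF meters are {rx,tx} x {bps,pps}, created in
+ * mlx5_create_vfs_sysfs_meters()
+ */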
kobject_put(&meters->meters[j].kobj); + + kobject_put(meters->rx_kobj); + kobject_put(meters->tx_kobj); + kobject_put(meters->kobj); + + kfree(meters); + } +} + +static int mlx5_init_vfs_sysfs_init_meter(struct mlx5_sriov_vf *vf, + struct mlx5_sriov_vf_meters *meters, + struct mlx5_sriov_vf *meter, + int rx_tx, int xps) +{ + struct kobject *parent; + int err; + + if (rx_tx == MLX5_RATE_LIMIT_TX) + parent = meters->tx_kobj; + else + parent = meters->rx_kobj; + + err = kobject_init_and_add(&meter->kobj, &vf_meters_type_eth, parent, + (xps == MLX5_RATE_LIMIT_PPS) ? "pps" : "bps"); + if (err) + return err; + + meter->dev = vf->dev; + meter->vf = vf->vf; + meter->meter_type.rx_tx = rx_tx; + meter->meter_type.xps = xps; + + return 0; +} + +int mlx5_create_vfs_sysfs_meters(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_sriov_vf_meters *meters; + struct mlx5_sriov_vf *vf; + int err, i; + + if (!(MLX5_CAP_GEN_64(dev, general_obj_types) & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO)) + return 0; + + if (!(MLX5_CAP_QOS(dev, flow_meter_reg_id) & 0x20)) { + mlx5_core_warn(dev, "Metadata reg C5 can't be used for flow meter.\n"); + return 0; + } + + if (!MLX5_CAP_ESW_EGRESS_ACL(dev, execute_aso)) + return 0; + + for (i = 0; i < num_vfs; i++) { + vf = &sriov->vfs[i]; + + meters = kzalloc(sizeof(*meters), GFP_KERNEL); + if (!meters) { + err = -ENOMEM; + goto err_vf_meters; + } + + meters->kobj = kobject_create_and_add("meters", &vf->kobj); + if (!meters->kobj) { + err = -EINVAL; + goto err_vf_meters; + } + + meters->rx_kobj = kobject_create_and_add("rx", meters->kobj); + if (!meters->rx_kobj) { + err = -EINVAL; + goto err_vf_meters; + } + + meters->tx_kobj = kobject_create_and_add("tx", meters->kobj); + if (!meters->tx_kobj) { + err = -EINVAL; + goto err_vf_meters; + } + + err = mlx5_init_vfs_sysfs_init_meter(vf, meters, + &meters->meters[0], + MLX5_RATE_LIMIT_RX, + MLX5_RATE_LIMIT_BPS); + if (err) + goto err_vf_meters; + + err = mlx5_init_vfs_sysfs_init_meter(vf, meters, + &meters->meters[1], + MLX5_RATE_LIMIT_RX, + MLX5_RATE_LIMIT_PPS); + if (err) + goto err_put_meter_0; + + err = mlx5_init_vfs_sysfs_init_meter(vf, meters, + &meters->meters[2], + MLX5_RATE_LIMIT_TX, + MLX5_RATE_LIMIT_BPS); + if (err) + goto err_put_meter_1; + + err = mlx5_init_vfs_sysfs_init_meter(vf, meters, + &meters->meters[3], + MLX5_RATE_LIMIT_TX, + MLX5_RATE_LIMIT_PPS); + if (err) + goto err_put_meter_2; + + vf->meters = meters; + } + + return 0; + +err_put_meter_2: + kobject_put(&meters->meters[2].kobj); +err_put_meter_1: + kobject_put(&meters->meters[1].kobj); +err_put_meter_0: + kobject_put(&meters->meters[0].kobj); +err_vf_meters: + kobject_put(meters->rx_kobj); + kobject_put(meters->tx_kobj); + kobject_put(meters->kobj); + + kfree(meters); + + mlx5_destroy_vfs_sysfs_meters(dev, num_vfs); + + return err; +} +#endif + +int mlx5_create_vfs_sysfs(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_sriov_vf *tmp; + static struct kobj_type *sysfs; + int err; + int vf; + +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + sysfs = &vf_type_eth; + else +#endif + sysfs = &vf_type_ib; + + sriov->vfs = kcalloc(num_vfs + 1, sizeof(*sriov->vfs), GFP_KERNEL); + if (!sriov->vfs) + return -ENOMEM; + + for (vf = 0; vf < num_vfs; vf++) { + tmp = &sriov->vfs[vf]; + tmp->dev = dev; + tmp->vf = vf; + err = kobject_init_and_add(&tmp->kobj, sysfs, sriov->config, + "%d", vf); + if (err) + 
goto err_vf; + + kobject_uevent(&tmp->kobj, KOBJ_ADD); + } +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) { + tmp = &sriov->vfs[vf]; + tmp->dev = dev; + tmp->vf = 0; + err = kobject_init_and_add(&tmp->kobj, &pf_type_eth, + sriov->config, "%s", "pf"); + if (err) { + --vf; + goto err_vf; + } + + kobject_uevent(&tmp->kobj, KOBJ_ADD); + } +#endif + +#ifdef CONFIG_MLX5_ESWITCH + err = mlx5_create_vfs_sysfs_meters(dev, num_vfs); + if (err) { + --vf; + goto err_vf; + } +#endif + +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) + return 0; + + for (vf = 0; vf < num_vfs; vf++) { + tmp = &sriov->vfs[vf]; + err = kobject_init_and_add(&tmp->page_kobj, &vf_paging, &tmp->kobj, + "paging_control"); + if (err) + goto err_vf; + + kobject_uevent(&tmp->page_kobj, KOBJ_ADD); + } +#endif + + return 0; + +err_vf: + for (; vf >= 0; vf--) { + tmp = &sriov->vfs[vf]; + kobject_put(&tmp->kobj); + } + + kfree(sriov->vfs); + sriov->vfs = NULL; + return err; +} + +void mlx5_destroy_vfs_sysfs(struct mlx5_core_dev *dev, int num_vfs) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_sriov_vf *tmp; + int vf; + +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) { + for (vf = 0; vf < num_vfs; vf++) { + tmp = &sriov->vfs[vf]; + kobject_put(&tmp->page_kobj); + } + } +#endif + +#ifdef CONFIG_MLX5_ESWITCH + mlx5_destroy_vfs_sysfs_meters(dev, num_vfs); +#endif + +#ifdef CONFIG_MLX5_ESWITCH + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH && num_vfs) { + tmp = &sriov->vfs[num_vfs]; + kobject_put(&tmp->kobj); + } +#endif + + for (vf = 0; vf < num_vfs; vf++) { + tmp = &sriov->vfs[vf]; + kobject_put(&tmp->kobj); + } + + kfree(sriov->vfs); + sriov->vfs = NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/Makefile new file mode 100644 index 0000000..c78512e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +subdir-ccflags-y += -I$(src)/.. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c new file mode 100644 index 0000000..3a4cbf6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -0,0 +1,2017 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include "dr_types.h" +#include "dr_ste.h" + +enum dr_action_domain { + DR_ACTION_DOMAIN_NIC_INGRESS, + DR_ACTION_DOMAIN_NIC_EGRESS, + DR_ACTION_DOMAIN_FDB_INGRESS, + DR_ACTION_DOMAIN_FDB_EGRESS, + DR_ACTION_DOMAIN_MAX, +}; + +enum dr_action_valid_state { + DR_ACTION_STATE_ERR, + DR_ACTION_STATE_NO_ACTION, + DR_ACTION_STATE_ENCAP, + DR_ACTION_STATE_DECAP, + DR_ACTION_STATE_MODIFY_HDR, + DR_ACTION_STATE_POP_VLAN, + DR_ACTION_STATE_PUSH_VLAN, + DR_ACTION_STATE_NON_TERM, + DR_ACTION_STATE_TERM, + DR_ACTION_STATE_ASO, + DR_ACTION_STATE_MAX, +}; + +static const char * const action_type_to_str[] = { + [DR_ACTION_TYP_TNL_L2_TO_L2] = "DR_ACTION_TYP_TNL_L2_TO_L2", + [DR_ACTION_TYP_L2_TO_TNL_L2] = "DR_ACTION_TYP_L2_TO_TNL_L2", + [DR_ACTION_TYP_TNL_L3_TO_L2] = "DR_ACTION_TYP_TNL_L3_TO_L2", + [DR_ACTION_TYP_L2_TO_TNL_L3] = "DR_ACTION_TYP_L2_TO_TNL_L3", + [DR_ACTION_TYP_DROP] = "DR_ACTION_TYP_DROP", + [DR_ACTION_TYP_QP] = "DR_ACTION_TYP_QP", + [DR_ACTION_TYP_FT] = "DR_ACTION_TYP_FT", + [DR_ACTION_TYP_CTR] = "DR_ACTION_TYP_CTR", + [DR_ACTION_TYP_TAG] = "DR_ACTION_TYP_TAG", + [DR_ACTION_TYP_MODIFY_HDR] = "DR_ACTION_TYP_MODIFY_HDR", + [DR_ACTION_TYP_VPORT] = "DR_ACTION_TYP_VPORT", + [DR_ACTION_TYP_POP_VLAN] = "DR_ACTION_TYP_POP_VLAN", + [DR_ACTION_TYP_PUSH_VLAN] = "DR_ACTION_TYP_PUSH_VLAN", + [DR_ACTION_TYP_SAMPLER] = "DR_ACTION_TYP_SAMPLER", + [DR_ACTION_TYP_INSERT_HDR] = "DR_ACTION_TYP_INSERT_HDR", + [DR_ACTION_TYP_ASO_FLOW_METER] = "DR_ACTION_TYP_ASO_FLOW_METER", + [DR_ACTION_TYP_REMOVE_HDR] = "DR_ACTION_TYP_REMOVE_HDR", + [DR_ACTION_TYP_MAX] = "DR_ACTION_UNKNOWN", +}; + +static const char *dr_action_id_to_str(enum mlx5dr_action_type action_id) +{ + if (action_id > DR_ACTION_TYP_MAX) + action_id = DR_ACTION_TYP_MAX; + return action_type_to_str[action_id]; +} + +static const enum dr_action_valid_state +next_action_state[DR_ACTION_DOMAIN_MAX][DR_ACTION_STATE_MAX][DR_ACTION_TYP_MAX] = { + [DR_ACTION_DOMAIN_NIC_INGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_DECAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + 
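+ /* Combinations not listed for a given state are zero-initialized and
+ * therefore decode to DR_ACTION_STATE_ERR, i.e. that action type is not
+ * accepted after the current one in this domain.
+ */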
[DR_ACTION_STATE_ENCAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_POP_VLAN] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_PUSH_VLAN] = { + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_ASO] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, + [DR_ACTION_DOMAIN_NIC_EGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + 
[DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_DECAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_ENCAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_POP_VLAN] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_PUSH_VLAN] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_ASO] = { + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MODIFY_HDR] 
= DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ASO, + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, + [DR_ACTION_DOMAIN_FDB_INGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_DECAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_ENCAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_POP_VLAN] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_PUSH_VLAN] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + 
[DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_ASO] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, + [DR_ACTION_DOMAIN_FDB_EGRESS] = { + [DR_ACTION_STATE_NO_ACTION] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_DECAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_ENCAP] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_MODIFY_HDR] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + 
[DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_POP_VLAN] = { + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_PUSH_VLAN] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_NON_TERM] = { + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_INSERT_HDR] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_REMOVE_HDR] = DR_ACTION_STATE_DECAP, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_POP_VLAN] = DR_ACTION_STATE_POP_VLAN, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_ASO_FLOW_METER] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_ASO] = { + [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_ENCAP, + [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_PUSH_VLAN] = DR_ACTION_STATE_PUSH_VLAN, + [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_ASO, + }, + [DR_ACTION_STATE_TERM] = { + [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, + }, + }, +}; + +static int +dr_action_reformat_to_action_type(enum mlx5dr_action_reformat_type reformat_type, + enum mlx5dr_action_type *action_type) +{ + switch (reformat_type) { + case DR_ACTION_REFORMAT_TYP_TNL_L2_TO_L2: + *action_type = DR_ACTION_TYP_TNL_L2_TO_L2; + break; + case DR_ACTION_REFORMAT_TYP_L2_TO_TNL_L2: + *action_type = DR_ACTION_TYP_L2_TO_TNL_L2; + break; + case DR_ACTION_REFORMAT_TYP_TNL_L3_TO_L2: + *action_type = DR_ACTION_TYP_TNL_L3_TO_L2; + break; + case DR_ACTION_REFORMAT_TYP_L2_TO_TNL_L3: + *action_type = DR_ACTION_TYP_L2_TO_TNL_L3; + break; + case DR_ACTION_REFORMAT_TYP_INSERT_HDR: + *action_type = DR_ACTION_TYP_INSERT_HDR; + break; + case DR_ACTION_REFORMAT_TYP_REMOVE_HDR: + *action_type = DR_ACTION_TYP_REMOVE_HDR; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* Apply the actions on the rule STE array starting from the last_ste. + * Actions might require more than one STE, new_num_stes will return + * the new size of the STEs array, rule with actions. 
+ */ +static void dr_actions_apply(struct mlx5dr_domain *dmn, + enum mlx5dr_domain_nic_type nic_type, + u8 *action_type_set, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *new_num_stes) +{ + struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx; + u32 added_stes = 0; + + if (nic_type == DR_DOMAIN_NIC_TYPE_RX) + mlx5dr_ste_set_actions_rx(ste_ctx, dmn, action_type_set, + last_ste, attr, &added_stes); + else + mlx5dr_ste_set_actions_tx(ste_ctx, dmn, action_type_set, + last_ste, attr, &added_stes); + + *new_num_stes += added_stes; +} + +static enum dr_action_domain +dr_action_get_action_domain(enum mlx5dr_domain_type domain, + enum mlx5dr_domain_nic_type nic_type) +{ + switch (domain) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + return DR_ACTION_DOMAIN_NIC_INGRESS; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + return DR_ACTION_DOMAIN_NIC_EGRESS; + case MLX5DR_DOMAIN_TYPE_FDB: + if (nic_type == DR_DOMAIN_NIC_TYPE_RX) + return DR_ACTION_DOMAIN_FDB_INGRESS; + return DR_ACTION_DOMAIN_FDB_EGRESS; + default: + WARN_ON(true); + return DR_ACTION_DOMAIN_MAX; + } +} + +static +int dr_action_validate_and_get_next_state(enum dr_action_domain action_domain, + u32 action_type, + u32 *state) +{ + u32 cur_state = *state; + + /* Check action state machine is valid */ + *state = next_action_state[action_domain][cur_state][action_type]; + + if (*state == DR_ACTION_STATE_ERR) + return -EOPNOTSUPP; + + return 0; +} + +static int dr_action_handle_cs_recalc(struct mlx5dr_domain *dmn, + struct mlx5dr_action *dest_action, + u64 *final_icm_addr) +{ + int ret; + + switch (dest_action->action_type) { + case DR_ACTION_TYP_FT: + /* Allow destination flow table only if table is a terminating + * table, since there is an *assumption* that in such case FW + * will recalculate the CS. + */ + if (dest_action->dest_tbl->is_fw_tbl) { + *final_icm_addr = dest_action->dest_tbl->fw_tbl.rx_icm_addr; + } else { + mlx5dr_dbg(dmn, + "Destination FT should be terminating when modify TTL is used\n"); + return -EINVAL; + } + break; + + case DR_ACTION_TYP_VPORT: + /* If destination is vport we will get the FW flow table + * that recalculates the CS and forwards to the vport. + */ + ret = mlx5dr_domain_get_recalc_cs_ft_addr(dest_action->vport->dmn, + dest_action->vport->caps->num, + final_icm_addr); + if (ret) { + mlx5dr_err(dmn, "Failed to get FW cs recalc flow table\n"); + return ret; + } + break; + + default: + break; + } + + return 0; +} + +static void dr_action_modify_ttl_adjust(struct mlx5dr_domain *dmn, + struct mlx5dr_ste_actions_attr *attr, + bool rx_rule, + bool *recalc_cs_required) +{ + *recalc_cs_required = false; + + /* if device supports csum recalculation - no adjustment needed */ + if (mlx5dr_ste_supp_ttl_cs_recalc(&dmn->info.caps)) + return; + + /* no adjustment needed on TX rules */ + if (!rx_rule) + return; + + if (!MLX5_CAP_ESW_FLOWTABLE(dmn->mdev, fdb_ipv4_ttl_modify)) { + /* Ignore the modify TTL action. + * It is always kept as last HW action. + */ + attr->modify_actions--; + return; + } + + if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB) + /* Due to a HW bug on some devices, modifying TTL on RX flows + * will cause an incorrect checksum calculation. In such cases + * we will use a FW table to recalculate the checksum. 
+ */ + *recalc_cs_required = true; +} + +static void dr_action_print_sequence(struct mlx5dr_domain *dmn, + struct mlx5dr_action *actions[], + int last_idx) +{ + int i; + + for (i = 0; i <= last_idx; i++) + mlx5dr_err(dmn, "< %s (%d) > ", + dr_action_id_to_str(actions[i]->action_type), + actions[i]->action_type); +} + +#define WITH_VLAN_NUM_HW_ACTIONS 6 + +int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_action *actions[], + u32 num_actions, + u8 *ste_arr, + u32 *new_hw_ste_arr_sz) +{ + struct mlx5dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + bool rx_rule = nic_dmn->type == DR_DOMAIN_NIC_TYPE_RX; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + u8 action_type_set[DR_ACTION_TYP_MAX] = {}; + struct mlx5dr_ste_actions_attr attr = {}; + struct mlx5dr_action *dest_action = NULL; + u32 state = DR_ACTION_STATE_NO_ACTION; + enum dr_action_domain action_domain; + bool recalc_cs_required = false; + u8 *last_ste; + int i, ret; + + attr.gvmi = dmn->info.caps.gvmi; + attr.hit_gvmi = dmn->info.caps.gvmi; + attr.final_icm_addr = nic_dmn->default_icm_addr; + action_domain = dr_action_get_action_domain(dmn->type, nic_dmn->type); + + for (i = 0; i < num_actions; i++) { + struct mlx5dr_action_dest_tbl *dest_tbl; + struct mlx5dr_icm_chunk *chunk; + struct mlx5dr_action *action; + int max_actions_type = 1; + u32 action_type; + + action = actions[i]; + action_type = action->action_type; + + switch (action_type) { + case DR_ACTION_TYP_DROP: + attr.final_icm_addr = nic_dmn->drop_icm_addr; + break; + case DR_ACTION_TYP_FT: + dest_action = action; + dest_tbl = action->dest_tbl; + if (!dest_tbl->is_fw_tbl) { + if (dest_tbl->tbl->dmn != dmn) { + mlx5dr_err(dmn, + "Destination table belongs to a different domain\n"); + return -EINVAL; + } + if (dest_tbl->tbl->level <= matcher->tbl->level) { + mlx5_core_dbg_once(dmn->mdev, + "Connecting table to a lower/same level destination table\n"); + mlx5dr_dbg(dmn, + "Connecting table at level %d to a destination table at level %d\n", + matcher->tbl->level, + dest_tbl->tbl->level); + } + chunk = rx_rule ? dest_tbl->tbl->rx.s_anchor->chunk : + dest_tbl->tbl->tx.s_anchor->chunk; + attr.final_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(chunk); + } else { + struct mlx5dr_cmd_query_flow_table_details output; + int ret; + + /* get the relevant addresses */ + if (!action->dest_tbl->fw_tbl.rx_icm_addr) { + ret = mlx5dr_cmd_query_flow_table(dmn->mdev, + dest_tbl->fw_tbl.type, + dest_tbl->fw_tbl.id, + &output); + if (!ret) { + dest_tbl->fw_tbl.tx_icm_addr = + output.sw_owner_icm_root_1; + dest_tbl->fw_tbl.rx_icm_addr = + output.sw_owner_icm_root_0; + } else { + mlx5dr_err(dmn, + "Failed mlx5_cmd_query_flow_table ret: %d\n", + ret); + return ret; + } + } + attr.final_icm_addr = rx_rule ? 
+ dest_tbl->fw_tbl.rx_icm_addr : + dest_tbl->fw_tbl.tx_icm_addr; + } + break; + case DR_ACTION_TYP_QP: + mlx5dr_info(dmn, "Domain doesn't support QP\n"); + return -EOPNOTSUPP; + case DR_ACTION_TYP_CTR: + attr.ctr_id = action->ctr->ctr_id + + action->ctr->offset; + break; + case DR_ACTION_TYP_TAG: + attr.flow_tag = action->flow_tag->flow_tag; + break; + case DR_ACTION_TYP_TNL_L2_TO_L2: + break; + case DR_ACTION_TYP_TNL_L3_TO_L2: + attr.decap_index = action->rewrite->index; + attr.decap_actions = action->rewrite->num_of_actions; + attr.decap_with_vlan = + attr.decap_actions == WITH_VLAN_NUM_HW_ACTIONS; + if (action->rewrite->arg) + attr.decap_args_index = mlx5dr_arg_get_object_id(action->rewrite->arg); + break; + case DR_ACTION_TYP_MODIFY_HDR: + attr.modify_actions = action->rewrite->num_of_actions; + + if (action->rewrite->modify_ttl) + dr_action_modify_ttl_adjust(dmn, &attr, rx_rule, + &recalc_cs_required); + + if (!action->rewrite->single_action_opt) { + attr.modify_index = action->rewrite->index; + if (action->rewrite->arg) + attr.args_index = mlx5dr_arg_get_object_id(action->rewrite->arg); + } else { + attr.single_modify_action = action->rewrite->data; + } + break; + case DR_ACTION_TYP_L2_TO_TNL_L2: + case DR_ACTION_TYP_L2_TO_TNL_L3: + if (rx_rule && + !(dmn->ste_ctx->actions_caps & DR_STE_CTX_ACTION_CAP_RX_ENCAP)) { + mlx5dr_info(dmn, "Device doesn't support Encap on RX\n"); + return -EOPNOTSUPP; + } + attr.reformat.size = action->reformat->size; + attr.reformat.id = action->reformat->id; + break; + case DR_ACTION_TYP_SAMPLER: + attr.final_icm_addr = rx_rule ? action->sampler->rx_icm_addr : + action->sampler->tx_icm_addr; + break; + case DR_ACTION_TYP_VPORT: + attr.hit_gvmi = action->vport->caps->vhca_gvmi; + dest_action = action; + attr.final_icm_addr = rx_rule ? 
+ action->vport->caps->icm_address_rx : + action->vport->caps->icm_address_tx; + break; + case DR_ACTION_TYP_POP_VLAN: + if (!rx_rule && !(dmn->ste_ctx->actions_caps & + DR_STE_CTX_ACTION_CAP_TX_POP)) { + mlx5dr_dbg(dmn, "Device doesn't support POP VLAN action on TX\n"); + return -EOPNOTSUPP; + } + + max_actions_type = MLX5DR_MAX_VLANS; + attr.vlans.count++; + break; + case DR_ACTION_TYP_PUSH_VLAN: + if (rx_rule && !(dmn->ste_ctx->actions_caps & + DR_STE_CTX_ACTION_CAP_RX_PUSH)) { + mlx5dr_dbg(dmn, "Device doesn't support PUSH VLAN action on RX\n"); + return -EOPNOTSUPP; + } + + max_actions_type = MLX5DR_MAX_VLANS; + if (attr.vlans.count == MLX5DR_MAX_VLANS) { + mlx5dr_dbg(dmn, "Max VLAN push/pop count exceeded\n"); + return -EINVAL; + } + + attr.vlans.headers[attr.vlans.count++] = action->push_vlan->vlan_hdr; + break; + case DR_ACTION_TYP_INSERT_HDR: + case DR_ACTION_TYP_REMOVE_HDR: + attr.reformat.size = action->reformat->size; + attr.reformat.id = action->reformat->id; + attr.reformat.param_0 = action->reformat->param_0; + attr.reformat.param_1 = action->reformat->param_1; + break; + case DR_ACTION_TYP_ASO_FLOW_METER: + attr.aso_flow_meter.obj_id = action->aso->obj_id; + attr.aso_flow_meter.offset = action->aso->offset; + attr.aso_flow_meter.dest_reg_id = action->aso->dest_reg_id; + attr.aso_flow_meter.init_color = action->aso->init_color; + break; + default: + mlx5dr_err(dmn, "Unsupported action type %d\n", action_type); + return -EINVAL; + } + + /* Check action duplication */ + if (++action_type_set[action_type] > max_actions_type) { + mlx5dr_err(dmn, "Action type %d supports only max %d time(s)\n", + action_type, max_actions_type); + return -EINVAL; + } + + /* Check action state machine is valid */ + if (dr_action_validate_and_get_next_state(action_domain, + action_type, + &state)) { + mlx5dr_err(dmn, "Invalid action (gvmi: %d, is_rx: %d) sequence provided:", + attr.gvmi, rx_rule); + dr_action_print_sequence(dmn, actions, i); + return -EOPNOTSUPP; + } + } + + *new_hw_ste_arr_sz = nic_matcher->num_of_builders; + last_ste = ste_arr + DR_STE_SIZE * (nic_matcher->num_of_builders - 1); + + if (recalc_cs_required && dest_action) { + ret = dr_action_handle_cs_recalc(dmn, dest_action, &attr.final_icm_addr); + if (ret) { + mlx5dr_err(dmn, + "Failed to handle checksum recalculation err %d\n", + ret); + return ret; + } + } + + dr_actions_apply(dmn, + nic_dmn->type, + action_type_set, + last_ste, + &attr, + new_hw_ste_arr_sz); + + return 0; +} + +static unsigned int action_size[DR_ACTION_TYP_MAX] = { + [DR_ACTION_TYP_TNL_L2_TO_L2] = sizeof(struct mlx5dr_action_reformat), + [DR_ACTION_TYP_L2_TO_TNL_L2] = sizeof(struct mlx5dr_action_reformat), + [DR_ACTION_TYP_TNL_L3_TO_L2] = sizeof(struct mlx5dr_action_rewrite), + [DR_ACTION_TYP_L2_TO_TNL_L3] = sizeof(struct mlx5dr_action_reformat), + [DR_ACTION_TYP_FT] = sizeof(struct mlx5dr_action_dest_tbl), + [DR_ACTION_TYP_CTR] = sizeof(struct mlx5dr_action_ctr), + [DR_ACTION_TYP_TAG] = sizeof(struct mlx5dr_action_flow_tag), + [DR_ACTION_TYP_MODIFY_HDR] = sizeof(struct mlx5dr_action_rewrite), + [DR_ACTION_TYP_VPORT] = sizeof(struct mlx5dr_action_vport), + [DR_ACTION_TYP_PUSH_VLAN] = sizeof(struct mlx5dr_action_push_vlan), + [DR_ACTION_TYP_INSERT_HDR] = sizeof(struct mlx5dr_action_reformat), + [DR_ACTION_TYP_REMOVE_HDR] = sizeof(struct mlx5dr_action_reformat), + [DR_ACTION_TYP_SAMPLER] = sizeof(struct mlx5dr_action_sampler), + [DR_ACTION_TYP_ASO_FLOW_METER] = sizeof(struct mlx5dr_action_aso_flow_meter), +}; + +static struct mlx5dr_action * 
+dr_action_create_generic(enum mlx5dr_action_type action_type) +{ + struct mlx5dr_action *action; + int extra_size; + + if (action_type < DR_ACTION_TYP_MAX) + extra_size = action_size[action_type]; + else + return NULL; + + action = kzalloc(sizeof(*action) + extra_size, GFP_KERNEL); + if (!action) + return NULL; + + action->action_type = action_type; + refcount_set(&action->refcount, 1); + action->data = action + 1; + + return action; +} + +struct mlx5dr_action *mlx5dr_action_create_drop(void) +{ + return dr_action_create_generic(DR_ACTION_TYP_DROP); +} + +struct mlx5dr_action * +mlx5dr_action_create_dest_table_num(struct mlx5dr_domain *dmn, u32 table_num) +{ + struct mlx5dr_action *action; + + action = dr_action_create_generic(DR_ACTION_TYP_FT); + if (!action) + return NULL; + + action->dest_tbl->is_fw_tbl = true; + action->dest_tbl->fw_tbl.dmn = dmn; + action->dest_tbl->fw_tbl.id = table_num; + action->dest_tbl->fw_tbl.type = FS_FT_FDB; + refcount_inc(&dmn->refcount); + + return action; +} + +struct mlx5dr_action * +mlx5dr_action_create_dest_table(struct mlx5dr_table *tbl) +{ + struct mlx5dr_action *action; + + refcount_inc(&tbl->refcount); + + action = dr_action_create_generic(DR_ACTION_TYP_FT); + if (!action) + goto dec_ref; + + action->dest_tbl->tbl = tbl; + + return action; + +dec_ref: + refcount_dec(&tbl->refcount); + return NULL; +} + +struct mlx5dr_action * +mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_action_dest *dests, + u32 num_of_dests, + bool ignore_flow_level, + u32 flow_source) +{ + struct mlx5dr_cmd_flow_destination_hw_info *hw_dests; + struct mlx5dr_action **ref_actions; + struct mlx5dr_action *action; + bool reformat_req = false; + u32 num_of_ref = 0; + u32 ref_act_cnt; + int ret; + int i; + + if (dmn->type != MLX5DR_DOMAIN_TYPE_FDB) { + mlx5dr_err(dmn, "Multiple destination support is for FDB only\n"); + return NULL; + } + + hw_dests = kcalloc(num_of_dests, sizeof(*hw_dests), GFP_KERNEL); + if (!hw_dests) + return NULL; + + if (unlikely(check_mul_overflow(num_of_dests, 2u, &ref_act_cnt))) + goto free_hw_dests; + + ref_actions = kcalloc(ref_act_cnt, sizeof(*ref_actions), GFP_KERNEL); + if (!ref_actions) + goto free_hw_dests; + + for (i = 0; i < num_of_dests; i++) { + struct mlx5dr_action *reformat_action = dests[i].reformat; + struct mlx5dr_action *dest_action = dests[i].dest; + + ref_actions[num_of_ref++] = dest_action; + + switch (dest_action->action_type) { + case DR_ACTION_TYP_VPORT: + hw_dests[i].vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + hw_dests[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT; + hw_dests[i].vport.num = dest_action->vport->caps->num; + hw_dests[i].vport.vhca_id = dest_action->vport->caps->vhca_gvmi; + if (reformat_action) { + reformat_req = true; + hw_dests[i].vport.reformat_id = + reformat_action->reformat->id; + ref_actions[num_of_ref++] = reformat_action; + hw_dests[i].vport.flags |= MLX5_FLOW_DEST_VPORT_REFORMAT_ID; + } + break; + + case DR_ACTION_TYP_FT: + hw_dests[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + if (dest_action->dest_tbl->is_fw_tbl) + hw_dests[i].ft_id = dest_action->dest_tbl->fw_tbl.id; + else + hw_dests[i].ft_id = dest_action->dest_tbl->tbl->table_id; + break; + + default: + mlx5dr_dbg(dmn, "Invalid multiple destinations action\n"); + goto free_ref_actions; + } + } + + action = dr_action_create_generic(DR_ACTION_TYP_FT); + if (!action) + goto free_ref_actions; + + ret = mlx5dr_fw_create_md_tbl(dmn, + hw_dests, + num_of_dests, + reformat_req, + &action->dest_tbl->fw_tbl.id, + 
&action->dest_tbl->fw_tbl.group_id, + ignore_flow_level, + flow_source); + if (ret) + goto free_action; + + refcount_inc(&dmn->refcount); + + for (i = 0; i < num_of_ref; i++) + refcount_inc(&ref_actions[i]->refcount); + + action->dest_tbl->is_fw_tbl = true; + action->dest_tbl->fw_tbl.dmn = dmn; + action->dest_tbl->fw_tbl.type = FS_FT_FDB; + action->dest_tbl->fw_tbl.ref_actions = ref_actions; + action->dest_tbl->fw_tbl.num_of_ref_actions = num_of_ref; + + kfree(hw_dests); + + return action; + +free_action: + kfree(action); +free_ref_actions: + kfree(ref_actions); +free_hw_dests: + kfree(hw_dests); + return NULL; +} + +struct mlx5dr_action * +mlx5dr_action_create_dest_flow_fw_table(struct mlx5dr_domain *dmn, + struct mlx5_flow_table *ft) +{ + struct mlx5dr_action *action; + + action = dr_action_create_generic(DR_ACTION_TYP_FT); + if (!action) + return NULL; + + action->dest_tbl->is_fw_tbl = 1; + action->dest_tbl->fw_tbl.type = ft->type; + action->dest_tbl->fw_tbl.id = ft->id; + action->dest_tbl->fw_tbl.dmn = dmn; + + refcount_inc(&dmn->refcount); + + return action; +} + +struct mlx5dr_action * +mlx5dr_action_create_flow_counter(u32 counter_id) +{ + struct mlx5dr_action *action; + + action = dr_action_create_generic(DR_ACTION_TYP_CTR); + if (!action) + return NULL; + + action->ctr->ctr_id = counter_id; + + return action; +} + +struct mlx5dr_action *mlx5dr_action_create_tag(u32 tag_value) +{ + struct mlx5dr_action *action; + + action = dr_action_create_generic(DR_ACTION_TYP_TAG); + if (!action) + return NULL; + + action->flow_tag->flow_tag = tag_value & 0xffffff; + + return action; +} + +struct mlx5dr_action * +mlx5dr_action_create_flow_sampler(struct mlx5dr_domain *dmn, u32 sampler_id) +{ + struct mlx5dr_action *action; + u64 icm_rx, icm_tx; + int ret; + + ret = mlx5dr_cmd_query_flow_sampler(dmn->mdev, sampler_id, + &icm_rx, &icm_tx); + if (ret) + return NULL; + + action = dr_action_create_generic(DR_ACTION_TYP_SAMPLER); + if (!action) + return NULL; + + action->sampler->dmn = dmn; + action->sampler->sampler_id = sampler_id; + action->sampler->rx_icm_addr = icm_rx; + action->sampler->tx_icm_addr = icm_tx; + + refcount_inc(&dmn->refcount); + return action; +} + +static int +dr_action_verify_reformat_params(enum mlx5dr_action_type reformat_type, + struct mlx5dr_domain *dmn, + u8 reformat_param_0, + u8 reformat_param_1, + size_t data_sz, + void *data) +{ + if (reformat_type == DR_ACTION_TYP_INSERT_HDR) { + if ((!data && data_sz) || (data && !data_sz) || + MLX5_CAP_GEN_2(dmn->mdev, max_reformat_insert_size) < data_sz || + MLX5_CAP_GEN_2(dmn->mdev, max_reformat_insert_offset) < reformat_param_1) { + mlx5dr_dbg(dmn, "Invalid reformat parameters for INSERT_HDR\n"); + goto out_err; + } + } else if (reformat_type == DR_ACTION_TYP_REMOVE_HDR) { + if (data || + MLX5_CAP_GEN_2(dmn->mdev, max_reformat_remove_size) < data_sz || + MLX5_CAP_GEN_2(dmn->mdev, max_reformat_remove_offset) < reformat_param_1) { + mlx5dr_dbg(dmn, "Invalid reformat parameters for REMOVE_HDR\n"); + goto out_err; + } + } else if (reformat_param_0 || reformat_param_1 || + reformat_type > DR_ACTION_TYP_REMOVE_HDR) { + mlx5dr_dbg(dmn, "Invalid reformat parameters\n"); + goto out_err; + } + + if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB) + return 0; + + if (dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX) { + if (reformat_type != DR_ACTION_TYP_TNL_L2_TO_L2 && + reformat_type != DR_ACTION_TYP_TNL_L3_TO_L2) { + mlx5dr_dbg(dmn, "Action reformat type not support on RX domain\n"); + goto out_err; + } + } else if (dmn->type == MLX5DR_DOMAIN_TYPE_NIC_TX) { + 
if (reformat_type != DR_ACTION_TYP_L2_TO_TNL_L2 && + reformat_type != DR_ACTION_TYP_L2_TO_TNL_L3) { + mlx5dr_dbg(dmn, "Action reformat type not support on TX domain\n"); + goto out_err; + } + } + + return 0; + +out_err: + return -EINVAL; +} + +static int +dr_action_create_reformat_action(struct mlx5dr_domain *dmn, + u8 reformat_param_0, u8 reformat_param_1, + size_t data_sz, void *data, + struct mlx5dr_action *action) +{ + u32 reformat_id; + int ret; + + switch (action->action_type) { + case DR_ACTION_TYP_L2_TO_TNL_L2: + case DR_ACTION_TYP_L2_TO_TNL_L3: + { + enum mlx5_reformat_ctx_type rt; + + if (action->action_type == DR_ACTION_TYP_L2_TO_TNL_L2) + rt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL; + else + rt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + + ret = mlx5dr_cmd_create_reformat_ctx(dmn->mdev, rt, 0, 0, + data_sz, data, + &reformat_id); + if (ret) + return ret; + + action->reformat->id = reformat_id; + action->reformat->size = data_sz; + return 0; + } + case DR_ACTION_TYP_TNL_L2_TO_L2: + { + return 0; + } + case DR_ACTION_TYP_TNL_L3_TO_L2: + { + u8 *hw_actions = kzalloc(MLX5DR_ACTION_CACHE_LINE_SIZE, + GFP_KERNEL); + if (!hw_actions) + return -ENOMEM; + + ret = mlx5dr_ste_set_action_decap_l3_list(dmn->ste_ctx, + data, data_sz, + hw_actions, + MLX5DR_ACTION_CACHE_LINE_SIZE, + &action->rewrite->num_of_actions); + if (ret) { + mlx5dr_dbg(dmn, "Failed creating decap l3 action list\n"); + kfree(hw_actions); + return ret; + } + + action->rewrite->data = hw_actions; + action->rewrite->dmn = dmn; + + ret = mlx5dr_ste_alloc_modify_hdr(action); + if (ret) { + mlx5dr_dbg(dmn, "Failed prepare reformat data\n"); + kfree(hw_actions); + return ret; + } + return 0; + } + case DR_ACTION_TYP_INSERT_HDR: + ret = mlx5dr_cmd_create_reformat_ctx(dmn->mdev, + MLX5_REFORMAT_TYPE_INSERT_HDR, + reformat_param_0, + reformat_param_1, + data_sz, data, + &reformat_id); + if (ret) + return ret; + + action->reformat->id = reformat_id; + action->reformat->size = data_sz; + action->reformat->param_0 = reformat_param_0; + action->reformat->param_1 = reformat_param_1; + return 0; + case DR_ACTION_TYP_REMOVE_HDR: + action->reformat->id = 0; + action->reformat->size = data_sz; + action->reformat->param_0 = reformat_param_0; + action->reformat->param_1 = reformat_param_1; + return 0; + default: + mlx5dr_info(dmn, "Reformat type is not supported %d\n", action->action_type); + return -EINVAL; + } +} + +#define CVLAN_ETHERTYPE 0x8100 +#define SVLAN_ETHERTYPE 0x88a8 + +struct mlx5dr_action *mlx5dr_action_create_pop_vlan(void) +{ + return dr_action_create_generic(DR_ACTION_TYP_POP_VLAN); +} + +struct mlx5dr_action *mlx5dr_action_create_push_vlan(struct mlx5dr_domain *dmn, + __be32 vlan_hdr) +{ + u32 vlan_hdr_h = ntohl(vlan_hdr); + u16 ethertype = vlan_hdr_h >> 16; + struct mlx5dr_action *action; + + if (ethertype != SVLAN_ETHERTYPE && ethertype != CVLAN_ETHERTYPE) { + mlx5dr_dbg(dmn, "Invalid vlan ethertype\n"); + return NULL; + } + + action = dr_action_create_generic(DR_ACTION_TYP_PUSH_VLAN); + if (!action) + return NULL; + + action->push_vlan->vlan_hdr = vlan_hdr_h; + return action; +} + +struct mlx5dr_action * +mlx5dr_action_create_packet_reformat(struct mlx5dr_domain *dmn, + enum mlx5dr_action_reformat_type reformat_type, + u8 reformat_param_0, + u8 reformat_param_1, + size_t data_sz, + void *data) +{ + enum mlx5dr_action_type action_type; + struct mlx5dr_action *action; + int ret; + + refcount_inc(&dmn->refcount); + + /* General checks */ + ret = dr_action_reformat_to_action_type(reformat_type, &action_type); + if (ret) { + 
mlx5dr_dbg(dmn, "Invalid reformat_type provided\n"); + goto dec_ref; + } + + ret = dr_action_verify_reformat_params(action_type, dmn, + reformat_param_0, reformat_param_1, + data_sz, data); + if (ret) + goto dec_ref; + + action = dr_action_create_generic(action_type); + if (!action) + goto dec_ref; + + action->reformat->dmn = dmn; + + ret = dr_action_create_reformat_action(dmn, + reformat_param_0, + reformat_param_1, + data_sz, + data, + action); + if (ret) { + mlx5dr_dbg(dmn, "Failed creating reformat action %d\n", ret); + goto free_action; + } + + return action; + +free_action: + kfree(action); +dec_ref: + refcount_dec(&dmn->refcount); + return NULL; +} + +static int +dr_action_modify_sw_to_hw_add(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct mlx5dr_ste_action_modify_field **ret_hw_info) +{ + const struct mlx5dr_ste_action_modify_field *hw_action_info; + u8 max_length; + u16 sw_field; + u32 data; + + /* Get SW modify action data */ + sw_field = MLX5_GET(set_action_in, sw_action, field); + data = MLX5_GET(set_action_in, sw_action, data); + + /* Convert SW data to HW modify action format */ + hw_action_info = mlx5dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, sw_field); + if (!hw_action_info) { + mlx5dr_dbg(dmn, "Modify add action invalid field given\n"); + return -EINVAL; + } + + max_length = hw_action_info->end - hw_action_info->start + 1; + + mlx5dr_ste_set_action_add(dmn->ste_ctx, + hw_action, + hw_action_info->hw_field, + hw_action_info->start, + max_length, + data); + + *ret_hw_info = hw_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw_set(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct mlx5dr_ste_action_modify_field **ret_hw_info) +{ + const struct mlx5dr_ste_action_modify_field *hw_action_info; + u8 offset, length, max_length; + u16 sw_field; + u32 data; + + /* Get SW modify action data */ + length = MLX5_GET(set_action_in, sw_action, length); + offset = MLX5_GET(set_action_in, sw_action, offset); + sw_field = MLX5_GET(set_action_in, sw_action, field); + data = MLX5_GET(set_action_in, sw_action, data); + + /* Convert SW data to HW modify action format */ + hw_action_info = mlx5dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, sw_field); + if (!hw_action_info) { + mlx5dr_dbg(dmn, "Modify set action invalid field given\n"); + return -EINVAL; + } + + /* PRM defines that length zero specific length of 32bits */ + length = length ? 
length : 32; + + max_length = hw_action_info->end - hw_action_info->start + 1; + + if (length + offset > max_length) { + mlx5dr_dbg(dmn, "Modify action length + offset exceeds limit\n"); + return -EINVAL; + } + + mlx5dr_ste_set_action_set(dmn->ste_ctx, + hw_action, + hw_action_info->hw_field, + hw_action_info->start + offset, + length, + data); + + *ret_hw_info = hw_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw_copy(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct mlx5dr_ste_action_modify_field **ret_dst_hw_info, + const struct mlx5dr_ste_action_modify_field **ret_src_hw_info) +{ + u8 src_offset, dst_offset, src_max_length, dst_max_length, length; + const struct mlx5dr_ste_action_modify_field *hw_dst_action_info; + const struct mlx5dr_ste_action_modify_field *hw_src_action_info; + u16 src_field, dst_field; + + /* Get SW modify action data */ + src_field = MLX5_GET(copy_action_in, sw_action, src_field); + dst_field = MLX5_GET(copy_action_in, sw_action, dst_field); + src_offset = MLX5_GET(copy_action_in, sw_action, src_offset); + dst_offset = MLX5_GET(copy_action_in, sw_action, dst_offset); + length = MLX5_GET(copy_action_in, sw_action, length); + + /* Convert SW data to HW modify action format */ + hw_src_action_info = mlx5dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, src_field); + hw_dst_action_info = mlx5dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, dst_field); + if (!hw_src_action_info || !hw_dst_action_info) { + mlx5dr_dbg(dmn, "Modify copy action invalid field given\n"); + return -EINVAL; + } + + /* PRM defines that length zero specific length of 32bits */ + length = length ? length : 32; + + src_max_length = hw_src_action_info->end - + hw_src_action_info->start + 1; + dst_max_length = hw_dst_action_info->end - + hw_dst_action_info->start + 1; + + if (length + src_offset > src_max_length || + length + dst_offset > dst_max_length) { + mlx5dr_dbg(dmn, "Modify action length + offset exceeds limit\n"); + return -EINVAL; + } + + mlx5dr_ste_set_action_copy(dmn->ste_ctx, + hw_action, + hw_dst_action_info->hw_field, + hw_dst_action_info->start + dst_offset, + length, + hw_src_action_info->hw_field, + hw_src_action_info->start + src_offset); + + *ret_dst_hw_info = hw_dst_action_info; + *ret_src_hw_info = hw_src_action_info; + + return 0; +} + +static int +dr_action_modify_sw_to_hw(struct mlx5dr_domain *dmn, + __be64 *sw_action, + __be64 *hw_action, + const struct mlx5dr_ste_action_modify_field **ret_dst_hw_info, + const struct mlx5dr_ste_action_modify_field **ret_src_hw_info) +{ + u8 action; + int ret; + + *hw_action = 0; + *ret_src_hw_info = NULL; + + /* Get SW modify action type */ + action = MLX5_GET(set_action_in, sw_action, action_type); + + switch (action) { + case MLX5_ACTION_TYPE_SET: + ret = dr_action_modify_sw_to_hw_set(dmn, sw_action, + hw_action, + ret_dst_hw_info); + break; + + case MLX5_ACTION_TYPE_ADD: + ret = dr_action_modify_sw_to_hw_add(dmn, sw_action, + hw_action, + ret_dst_hw_info); + break; + + case MLX5_ACTION_TYPE_COPY: + ret = dr_action_modify_sw_to_hw_copy(dmn, sw_action, + hw_action, + ret_dst_hw_info, + ret_src_hw_info); + break; + + default: + mlx5dr_info(dmn, "Unsupported action_type for modify action\n"); + ret = -EOPNOTSUPP; + } + + return ret; +} + +static int +dr_action_modify_check_set_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + u16 sw_field = MLX5_GET(set_action_in, sw_action, field); + struct mlx5dr_domain *dmn = action->rewrite->dmn; + + if (sw_field == 
MLX5_ACTION_IN_FIELD_METADATA_REG_A) { + action->rewrite->allow_rx = 0; + if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_TX) { + mlx5dr_dbg(dmn, "Unsupported field %d for RX/FDB set action\n", + sw_field); + return -EINVAL; + } + } else if (sw_field == MLX5_ACTION_IN_FIELD_METADATA_REG_B) { + action->rewrite->allow_tx = 0; + if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_RX) { + mlx5dr_dbg(dmn, "Unsupported field %d for TX/FDB set action\n", + sw_field); + return -EINVAL; + } + } + + if (!action->rewrite->allow_rx && !action->rewrite->allow_tx) { + mlx5dr_dbg(dmn, "Modify SET actions not supported on both RX and TX\n"); + return -EINVAL; + } + + return 0; +} + +static int +dr_action_modify_check_add_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + u16 sw_field = MLX5_GET(set_action_in, sw_action, field); + struct mlx5dr_domain *dmn = action->rewrite->dmn; + + if (sw_field != MLX5_ACTION_IN_FIELD_OUT_IP_TTL && + sw_field != MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT && + sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM && + sw_field != MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM) { + mlx5dr_dbg(dmn, "Unsupported field %d for add action\n", + sw_field); + return -EINVAL; + } + + return 0; +} + +static int +dr_action_modify_check_copy_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + struct mlx5dr_domain *dmn = action->rewrite->dmn; + u16 sw_fields[2]; + int i; + + sw_fields[0] = MLX5_GET(copy_action_in, sw_action, src_field); + sw_fields[1] = MLX5_GET(copy_action_in, sw_action, dst_field); + + for (i = 0; i < 2; i++) { + if (sw_fields[i] == MLX5_ACTION_IN_FIELD_METADATA_REG_A) { + action->rewrite->allow_rx = 0; + if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_TX) { + mlx5dr_dbg(dmn, "Unsupported field %d for RX/FDB set action\n", + sw_fields[i]); + return -EINVAL; + } + } else if (sw_fields[i] == MLX5_ACTION_IN_FIELD_METADATA_REG_B) { + action->rewrite->allow_tx = 0; + if (dmn->type != MLX5DR_DOMAIN_TYPE_NIC_RX) { + mlx5dr_dbg(dmn, "Unsupported field %d for TX/FDB set action\n", + sw_fields[i]); + return -EINVAL; + } + } + } + + if (!action->rewrite->allow_rx && !action->rewrite->allow_tx) { + mlx5dr_dbg(dmn, "Modify copy actions not supported on both RX and TX\n"); + return -EINVAL; + } + + return 0; +} + +static int +dr_action_modify_check_field_limitation(struct mlx5dr_action *action, + const __be64 *sw_action) +{ + struct mlx5dr_domain *dmn = action->rewrite->dmn; + u8 action_type; + int ret; + + action_type = MLX5_GET(set_action_in, sw_action, action_type); + + switch (action_type) { + case MLX5_ACTION_TYPE_SET: + ret = dr_action_modify_check_set_field_limitation(action, + sw_action); + break; + + case MLX5_ACTION_TYPE_ADD: + ret = dr_action_modify_check_add_field_limitation(action, + sw_action); + break; + + case MLX5_ACTION_TYPE_COPY: + ret = dr_action_modify_check_copy_field_limitation(action, + sw_action); + break; + + default: + mlx5dr_info(dmn, "Unsupported action %d modify action\n", + action_type); + ret = -EOPNOTSUPP; + } + + return ret; +} + +static bool +dr_action_modify_check_is_ttl_modify(const void *sw_action) +{ + u16 sw_field = MLX5_GET(set_action_in, sw_action, field); + + return sw_field == MLX5_ACTION_IN_FIELD_OUT_IP_TTL; +} + +static int dr_actions_convert_modify_header(struct mlx5dr_action *action, + u32 max_hw_actions, + u32 num_sw_actions, + __be64 sw_actions[], + __be64 hw_actions[], + u32 *num_hw_actions, + bool *modify_ttl) +{ + const struct mlx5dr_ste_action_modify_field *hw_dst_action_info; + const struct 
mlx5dr_ste_action_modify_field *hw_src_action_info; + struct mlx5dr_domain *dmn = action->rewrite->dmn; + __be64 *modify_ttl_sw_action = NULL; + int ret, i, hw_idx = 0; + __be64 *sw_action; + __be64 hw_action; + u16 hw_field = 0; + u32 l3_type = 0; + u32 l4_type = 0; + + *modify_ttl = false; + + action->rewrite->allow_rx = 1; + action->rewrite->allow_tx = 1; + + for (i = 0; i < num_sw_actions || modify_ttl_sw_action; i++) { + /* modify TTL is handled separately, as a last action */ + if (i == num_sw_actions) { + sw_action = modify_ttl_sw_action; + modify_ttl_sw_action = NULL; + } else { + sw_action = &sw_actions[i]; + } + + ret = dr_action_modify_check_field_limitation(action, + sw_action); + if (ret) + return ret; + + if (!(*modify_ttl) && + dr_action_modify_check_is_ttl_modify(sw_action)) { + modify_ttl_sw_action = sw_action; + *modify_ttl = true; + continue; + } + + /* Convert SW action to HW action */ + ret = dr_action_modify_sw_to_hw(dmn, + sw_action, + &hw_action, + &hw_dst_action_info, + &hw_src_action_info); + if (ret) + return ret; + + /* Due to a HW limitation we cannot modify 2 different L3 types */ + if (l3_type && hw_dst_action_info->l3_type && + hw_dst_action_info->l3_type != l3_type) { + mlx5dr_dbg(dmn, "Action list can't support two different L3 types\n"); + return -EINVAL; + } + if (hw_dst_action_info->l3_type) + l3_type = hw_dst_action_info->l3_type; + + /* Due to a HW limitation we cannot modify two different L4 types */ + if (l4_type && hw_dst_action_info->l4_type && + hw_dst_action_info->l4_type != l4_type) { + mlx5dr_dbg(dmn, "Action list can't support two different L4 types\n"); + return -EINVAL; + } + if (hw_dst_action_info->l4_type) + l4_type = hw_dst_action_info->l4_type; + + /* HW reads and executes two actions at once this means we + * need to create a gap if two actions access the same field + */ + if ((hw_idx % 2) && (hw_field == hw_dst_action_info->hw_field || + (hw_src_action_info && + hw_field == hw_src_action_info->hw_field))) { + /* Check if after gap insertion the total number of HW + * modify actions doesn't exceeds the limit + */ + hw_idx++; + if (hw_idx >= max_hw_actions) { + mlx5dr_dbg(dmn, "Modify header action number exceeds HW limit\n"); + return -EINVAL; + } + } + hw_field = hw_dst_action_info->hw_field; + + hw_actions[hw_idx] = hw_action; + hw_idx++; + } + + /* if the resulting HW actions list is empty, add NOP action */ + if (!hw_idx) + hw_idx++; + + *num_hw_actions = hw_idx; + + return 0; +} + +static int dr_action_create_modify_action(struct mlx5dr_domain *dmn, + size_t actions_sz, + __be64 actions[], + struct mlx5dr_action *action) +{ + u32 max_hw_actions; + u32 num_hw_actions; + u32 num_sw_actions; + __be64 *hw_actions; + bool modify_ttl; + int ret; + + num_sw_actions = actions_sz / DR_MODIFY_ACTION_SIZE; + max_hw_actions = mlx5dr_icm_pool_chunk_size_to_entries(DR_CHUNK_SIZE_16); + + if (num_sw_actions > max_hw_actions) { + mlx5dr_dbg(dmn, "Max number of actions %d exceeds limit %d\n", + num_sw_actions, max_hw_actions); + return -EINVAL; + } + + hw_actions = kcalloc(1, max_hw_actions * DR_MODIFY_ACTION_SIZE, GFP_KERNEL); + if (!hw_actions) + return -ENOMEM; + + ret = dr_actions_convert_modify_header(action, + max_hw_actions, + num_sw_actions, + actions, + hw_actions, + &num_hw_actions, + &modify_ttl); + if (ret) + goto free_hw_actions; + + action->rewrite->modify_ttl = modify_ttl; + action->rewrite->data = (u8 *)hw_actions; + action->rewrite->num_of_actions = num_hw_actions; + + if (num_hw_actions == 1 && + dmn->info.caps.sw_format_ver >= 
MLX5_STEERING_FORMAT_CONNECTX_6DX) { + action->rewrite->single_action_opt = true; + } else { + action->rewrite->single_action_opt = false; + ret = mlx5dr_ste_alloc_modify_hdr(action); + if (ret) + goto free_hw_actions; + } + + return 0; + +free_hw_actions: + kfree(hw_actions); + return ret; +} + +struct mlx5dr_action * +mlx5dr_action_create_modify_header(struct mlx5dr_domain *dmn, + u32 flags, + size_t actions_sz, + __be64 actions[]) +{ + struct mlx5dr_action *action; + int ret = 0; + + refcount_inc(&dmn->refcount); + + if (actions_sz % DR_MODIFY_ACTION_SIZE) { + mlx5dr_dbg(dmn, "Invalid modify actions size provided\n"); + goto dec_ref; + } + + action = dr_action_create_generic(DR_ACTION_TYP_MODIFY_HDR); + if (!action) + goto dec_ref; + + action->rewrite->dmn = dmn; + + ret = dr_action_create_modify_action(dmn, + actions_sz, + actions, + action); + if (ret) { + mlx5dr_dbg(dmn, "Failed creating modify header action %d\n", ret); + goto free_action; + } + + return action; + +free_action: + kfree(action); +dec_ref: + refcount_dec(&dmn->refcount); + return NULL; +} + +struct mlx5dr_action * +mlx5dr_action_create_dest_vport(struct mlx5dr_domain *dmn, + u16 vport, u8 vhca_id_valid, + u16 vhca_id) +{ + struct mlx5dr_cmd_vport_cap *vport_cap; + struct mlx5dr_domain *vport_dmn; + struct mlx5dr_action *action; + u8 peer_vport; + + peer_vport = vhca_id_valid && (vhca_id != dmn->info.caps.gvmi); + vport_dmn = peer_vport ? dmn->peer_dmn : dmn; + if (!vport_dmn) { + mlx5dr_dbg(dmn, "No peer vport domain for given vhca_id\n"); + return NULL; + } + + if (vport_dmn->type != MLX5DR_DOMAIN_TYPE_FDB) { + mlx5dr_dbg(dmn, "Domain doesn't support vport actions\n"); + return NULL; + } + + vport_cap = mlx5dr_domain_get_vport_cap(vport_dmn, vport); + if (!vport_cap) { + mlx5dr_err(dmn, + "Failed to get vport 0x%x caps - vport is disabled or invalid\n", + vport); + return NULL; + } + + action = dr_action_create_generic(DR_ACTION_TYP_VPORT); + if (!action) + return NULL; + + action->vport->dmn = vport_dmn; + action->vport->caps = vport_cap; + + return action; +} + +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action) +{ + return action->reformat->id; +} + +struct mlx5dr_action * +mlx5dr_action_create_aso_flow_meter(struct mlx5dr_domain *dmn, + u32 obj_id, u8 dest_reg_id, + u8 init_color, u8 meter_idx) +{ + struct mlx5dr_action *action; + + if (init_color > MLX5_FLOW_METER_COLOR_UNDEFINED) + return NULL; + + action = dr_action_create_generic(DR_ACTION_TYP_ASO_FLOW_METER); + if (!action) + return NULL; + + action->aso->obj_id = obj_id; + action->aso->offset = meter_idx; + action->aso->dest_reg_id = dest_reg_id; + action->aso->init_color = init_color; + action->aso->dmn = dmn; + + refcount_inc(&dmn->refcount); + + return action; +} + +struct mlx5dr_action * +mlx5dr_action_create_aso(struct mlx5dr_domain *dmn, u32 obj_id, + u8 dest_reg_id, u8 aso_type, + u8 init_color, u8 meter_id) +{ + struct mlx5dr_action *action; + + if (aso_type != MLX5_EXE_ASO_FLOW_METER) + return NULL; + + if (init_color > MLX5_FLOW_METER_COLOR_UNDEFINED) + return NULL; + + action = dr_action_create_generic(DR_ACTION_TYP_ASO_FLOW_METER); + if (!action) + return NULL; + + action->aso->obj_id = obj_id; + action->aso->offset = meter_id; + action->aso->dest_reg_id = dest_reg_id; + action->aso->init_color = init_color; + action->aso->dmn = dmn; + + refcount_inc(&dmn->refcount); + + return action; +} + +int mlx5dr_action_destroy(struct mlx5dr_action *action) +{ + if (WARN_ON_ONCE(refcount_read(&action->refcount) > 1)) + return -EBUSY; + + 
switch (action->action_type) { + case DR_ACTION_TYP_FT: + if (action->dest_tbl->is_fw_tbl) + refcount_dec(&action->dest_tbl->fw_tbl.dmn->refcount); + else + refcount_dec(&action->dest_tbl->tbl->refcount); + + if (action->dest_tbl->is_fw_tbl && + action->dest_tbl->fw_tbl.num_of_ref_actions) { + struct mlx5dr_action **ref_actions; + int i; + + ref_actions = action->dest_tbl->fw_tbl.ref_actions; + for (i = 0; i < action->dest_tbl->fw_tbl.num_of_ref_actions; i++) + refcount_dec(&ref_actions[i]->refcount); + + kfree(ref_actions); + + mlx5dr_fw_destroy_md_tbl(action->dest_tbl->fw_tbl.dmn, + action->dest_tbl->fw_tbl.id, + action->dest_tbl->fw_tbl.group_id); + } + break; + case DR_ACTION_TYP_TNL_L2_TO_L2: + case DR_ACTION_TYP_REMOVE_HDR: + refcount_dec(&action->reformat->dmn->refcount); + break; + case DR_ACTION_TYP_TNL_L3_TO_L2: + mlx5dr_ste_free_modify_hdr(action); + refcount_dec(&action->rewrite->dmn->refcount); + break; + case DR_ACTION_TYP_L2_TO_TNL_L2: + case DR_ACTION_TYP_L2_TO_TNL_L3: + case DR_ACTION_TYP_INSERT_HDR: + mlx5dr_cmd_destroy_reformat_ctx((action->reformat->dmn)->mdev, + action->reformat->id); + refcount_dec(&action->reformat->dmn->refcount); + break; + case DR_ACTION_TYP_MODIFY_HDR: + if (action->rewrite->single_action_opt) + kfree(action->rewrite->data); + else + mlx5dr_ste_free_modify_hdr(action); + refcount_dec(&action->rewrite->dmn->refcount); + break; + case DR_ACTION_TYP_SAMPLER: + refcount_dec(&action->sampler->dmn->refcount); + break; + case DR_ACTION_TYP_ASO_FLOW_METER: + refcount_dec(&action->aso->dmn->refcount); + break; + default: + break; + } + + kfree(action); + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_buddy.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_buddy.c new file mode 100644 index 0000000..fe228d9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_buddy.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 - 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006 - 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. + */ + +#include "dr_types.h" + +int mlx5dr_buddy_init(struct mlx5dr_icm_buddy_mem *buddy, + unsigned int max_order) +{ + int i; + + buddy->max_order = max_order; + + INIT_LIST_HEAD(&buddy->list_node); + + buddy->bitmap = kcalloc(buddy->max_order + 1, + sizeof(*buddy->bitmap), + GFP_KERNEL); + buddy->num_free = kcalloc(buddy->max_order + 1, + sizeof(*buddy->num_free), + GFP_KERNEL); + + if (!buddy->bitmap || !buddy->num_free) + goto err_free_all; + + /* Allocating max_order bitmaps, one for each order */ + + for (i = 0; i <= buddy->max_order; ++i) { + unsigned int size = 1 << (buddy->max_order - i); + + buddy->bitmap[i] = bitmap_zalloc(size, GFP_KERNEL); + if (!buddy->bitmap[i]) + goto err_out_free_each_bit_per_order; + } + + /* In the beginning, we have only one order that is available for + * use (the biggest one), so mark the first bit in both bitmaps. 
+	 */
+
+	bitmap_set(buddy->bitmap[buddy->max_order], 0, 1);
+
+	buddy->num_free[buddy->max_order] = 1;
+
+	return 0;
+
+err_out_free_each_bit_per_order:
+	for (i = 0; i <= buddy->max_order; ++i)
+		bitmap_free(buddy->bitmap[i]);
+
+err_free_all:
+	kfree(buddy->num_free);
+	kfree(buddy->bitmap);
+	return -ENOMEM;
+}
+
+void mlx5dr_buddy_cleanup(struct mlx5dr_icm_buddy_mem *buddy)
+{
+	int i;
+
+	list_del(&buddy->list_node);
+
+	for (i = 0; i <= buddy->max_order; ++i)
+		bitmap_free(buddy->bitmap[i]);
+
+	kfree(buddy->num_free);
+	kfree(buddy->bitmap);
+}
+
+static int dr_buddy_find_free_seg(struct mlx5dr_icm_buddy_mem *buddy,
+				  unsigned int start_order,
+				  unsigned int *segment,
+				  unsigned int *order)
+{
+	unsigned int seg, order_iter, m;
+
+	for (order_iter = start_order;
+	     order_iter <= buddy->max_order; ++order_iter) {
+		if (!buddy->num_free[order_iter])
+			continue;
+
+		m = 1 << (buddy->max_order - order_iter);
+		seg = find_first_bit(buddy->bitmap[order_iter], m);
+
+		if (WARN(seg >= m,
+			 "ICM Buddy: failed finding free mem for order %d\n",
+			 order_iter))
+			return -ENOMEM;
+
+		break;
+	}
+
+	if (order_iter > buddy->max_order)
+		return -ENOMEM;
+
+	*segment = seg;
+	*order = order_iter;
+	return 0;
+}
+
+/**
+ * mlx5dr_buddy_alloc_mem() - Update second level bitmap.
+ * @buddy: Buddy to update.
+ * @order: Order of the buddy to update.
+ * @segment: Segment number.
+ *
+ * This function finds the first area of the ICM memory managed by this buddy.
+ * It uses the data structures of the buddy system in order to find the first
+ * area of free place, starting from the current order till the maximum order
+ * in the system.
+ *
+ * Return: 0 when segment is set, non-zero error status otherwise.
+ *
+ * The function returns the location (segment) in the whole buddy ICM memory
+ * area - the index of the memory segment that is available for use.
+ */
+int mlx5dr_buddy_alloc_mem(struct mlx5dr_icm_buddy_mem *buddy,
+			   unsigned int order,
+			   unsigned int *segment)
+{
+	unsigned int seg, order_iter;
+	int err;
+
+	err = dr_buddy_find_free_seg(buddy, order, &seg, &order_iter);
+	if (err)
+		return err;
+
+	bitmap_clear(buddy->bitmap[order_iter], seg, 1);
+	--buddy->num_free[order_iter];
+
+	/* If we found free memory in some order that is bigger than the
+	 * required order, we need to split every order between the required
+	 * order and the order that we found into two parts, and mark accordingly.
+	 */
+	while (order_iter > order) {
+		--order_iter;
+		seg <<= 1;
+		bitmap_set(buddy->bitmap[order_iter], seg ^ 1, 1);
+		++buddy->num_free[order_iter];
+	}
+
+	seg <<= order;
+	*segment = seg;
+
+	return 0;
+}
+
+void mlx5dr_buddy_free_mem(struct mlx5dr_icm_buddy_mem *buddy,
+			   unsigned int seg, unsigned int order)
+{
+	seg >>= order;
+
+	/* Whenever a segment is free,
+	 * the mem is added to the buddy that gave it.
+	 */
+	while (test_bit(seg ^ 1, buddy->bitmap[order])) {
+		bitmap_clear(buddy->bitmap[order], seg ^ 1, 1);
+		--buddy->num_free[order];
+		seg >>= 1;
+		++order;
+	}
+	bitmap_set(buddy->bitmap[order], seg, 1);
+
+	++buddy->num_free[order];
+}
+
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
new file mode 100644
index 0000000..0796396
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -0,0 +1,875 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies.
*/ + +#include "dr_types.h" + +int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, + bool other_vport, + u16 vport_number, + u64 *icm_address_rx, + u64 *icm_address_tx) +{ + u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_esw_vport_context_in)] = {}; + int err; + + MLX5_SET(query_esw_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT); + MLX5_SET(query_esw_vport_context_in, in, other_vport, other_vport); + MLX5_SET(query_esw_vport_context_in, in, vport_number, vport_number); + + err = mlx5_cmd_exec_inout(mdev, query_esw_vport_context, in, out); + if (err) + return err; + + *icm_address_rx = + MLX5_GET64(query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_rx); + *icm_address_tx = + MLX5_GET64(query_esw_vport_context_out, out, + esw_vport_context.sw_steering_vport_icm_address_tx); + return 0; +} + +int mlx5dr_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_vport, + u16 vport_number, u16 *gvmi) +{ + u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; + int out_size; + void *out; + int err; + + out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); + out = kzalloc(out_size, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, other_function, other_vport); + MLX5_SET(query_hca_cap_in, in, function_id, vport_number); + MLX5_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out); + if (err) { + kfree(out); + return err; + } + + *gvmi = MLX5_GET(query_hca_cap_out, out, capability.cmd_hca_cap.vhca_id); + + kfree(out); + return 0; +} + +int mlx5dr_cmd_query_esw_caps(struct mlx5_core_dev *mdev, + struct mlx5dr_esw_caps *caps) +{ + caps->drop_icm_address_rx = + MLX5_CAP64_ESW_FLOWTABLE(mdev, + sw_steering_fdb_action_drop_icm_address_rx); + caps->drop_icm_address_tx = + MLX5_CAP64_ESW_FLOWTABLE(mdev, + sw_steering_fdb_action_drop_icm_address_tx); + caps->uplink_icm_address_rx = + MLX5_CAP64_ESW_FLOWTABLE(mdev, + sw_steering_uplink_icm_address_rx); + caps->uplink_icm_address_tx = + MLX5_CAP64_ESW_FLOWTABLE(mdev, + sw_steering_uplink_icm_address_tx); + caps->sw_owner_v2 = MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, sw_owner_v2); + if (!caps->sw_owner_v2) + caps->sw_owner = MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, sw_owner); + + return 0; +} + +static int dr_cmd_query_nic_vport_roce_en(struct mlx5_core_dev *mdev, + u16 vport, bool *roce_en) +{ + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; + int err; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(query_nic_vport_context_in, in, other_vport, !!vport); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + *roce_en = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.roce_en); + return 0; +} + +int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, + struct mlx5dr_cmd_caps *caps) +{ + bool roce_en; + int err; + + caps->prio_tag_required = MLX5_CAP_GEN(mdev, prio_tag_required); + caps->eswitch_manager = MLX5_CAP_GEN(mdev, eswitch_manager); + caps->gvmi = MLX5_CAP_GEN(mdev, vhca_id); + caps->flex_protocols = MLX5_CAP_GEN(mdev, flex_parser_protocols); + caps->sw_format_ver = MLX5_CAP_GEN(mdev, steering_format_version); + + if 
(MLX5_CAP_GEN(mdev, roce)) { + err = dr_cmd_query_nic_vport_roce_en(mdev, 0, &roce_en); + if (err) + return err; + + caps->roce_caps.roce_en = roce_en; + caps->roce_caps.fl_rc_qp_when_roce_disabled = + MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_disabled); + caps->roce_caps.fl_rc_qp_when_roce_enabled = + MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_enabled); + } + + caps->isolate_vl_tc = MLX5_CAP_GEN(mdev, isolate_vl_tc_new); + + caps->support_modify_argument = MLX5_CAP_GEN_64(mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT; + + if (caps->support_modify_argument) { + caps->log_header_modify_argument_granularity = + MLX5_CAP_GEN(mdev, log_header_modify_argument_granularity); + caps->log_header_modify_argument_max_alloc = + MLX5_CAP_GEN(mdev, log_header_modify_argument_max_alloc); + } + + /* geneve_tlv_option_0_exist is the indication of + * STE support for lookup type flex_parser_ok + */ + caps->flex_parser_ok_bits_supp = + MLX5_CAP_FLOWTABLE(mdev, + flow_table_properties_nic_receive.ft_field_support.geneve_tlv_option_0_exist); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED) { + caps->flex_parser_id_icmp_dw0 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw0); + caps->flex_parser_id_icmp_dw1 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw1); + } + + if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V6_ENABLED) { + caps->flex_parser_id_icmpv6_dw0 = + MLX5_CAP_GEN(mdev, flex_parser_id_icmpv6_dw0); + caps->flex_parser_id_icmpv6_dw1 = + MLX5_CAP_GEN(mdev, flex_parser_id_icmpv6_dw1); + } + + if (caps->flex_protocols & MLX5_FLEX_PARSER_GENEVE_TLV_OPTION_0_ENABLED) + caps->flex_parser_id_geneve_tlv_option_0 = + MLX5_CAP_GEN(mdev, flex_parser_id_geneve_tlv_option_0); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED) + caps->flex_parser_id_mpls_over_gre = + MLX5_CAP_GEN(mdev, flex_parser_id_outer_first_mpls_over_gre); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED) + caps->flex_parser_id_mpls_over_udp = + MLX5_CAP_GEN(mdev, flex_parser_id_outer_first_mpls_over_udp_label); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_DW_0_ENABLED) + caps->flex_parser_id_gtpu_dw_0 = + MLX5_CAP_GEN(mdev, flex_parser_id_gtpu_dw_0); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_TEID_ENABLED) + caps->flex_parser_id_gtpu_teid = + MLX5_CAP_GEN(mdev, flex_parser_id_gtpu_teid); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_DW_2_ENABLED) + caps->flex_parser_id_gtpu_dw_2 = + MLX5_CAP_GEN(mdev, flex_parser_id_gtpu_dw_2); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_FIRST_EXT_DW_0_ENABLED) + caps->flex_parser_id_gtpu_first_ext_dw_0 = + MLX5_CAP_GEN(mdev, flex_parser_id_gtpu_first_ext_dw_0); + + caps->nic_rx_drop_address = + MLX5_CAP64_FLOWTABLE(mdev, sw_steering_nic_rx_action_drop_icm_address); + caps->nic_tx_drop_address = + MLX5_CAP64_FLOWTABLE(mdev, sw_steering_nic_tx_action_drop_icm_address); + caps->nic_tx_allow_address = + MLX5_CAP64_FLOWTABLE(mdev, sw_steering_nic_tx_action_allow_icm_address); + + caps->rx_sw_owner_v2 = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, sw_owner_v2); + caps->tx_sw_owner_v2 = MLX5_CAP_FLOWTABLE_NIC_TX(mdev, sw_owner_v2); + + if (!caps->rx_sw_owner_v2) + caps->rx_sw_owner = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, sw_owner); + if (!caps->tx_sw_owner_v2) + caps->tx_sw_owner = MLX5_CAP_FLOWTABLE_NIC_TX(mdev, sw_owner); + + caps->max_ft_level = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, max_ft_level); + + caps->log_icm_size = MLX5_CAP_DEV_MEM(mdev, log_steering_sw_icm_size); + caps->hdr_modify_icm_addr = + MLX5_CAP64_DEV_MEM(mdev, 
header_modify_sw_icm_start_address); + + caps->log_modify_pattern_icm_size = + MLX5_CAP_DEV_MEM(mdev, log_header_modify_pattern_sw_icm_size); + caps->hdr_modify_pattern_icm_addr = + MLX5_CAP64_DEV_MEM(mdev, header_modify_pattern_sw_icm_start_address); + + caps->roce_min_src_udp = MLX5_CAP_ROCE(mdev, r_roce_min_src_udp_port); + + caps->is_ecpf = mlx5_core_is_ecpf_esw_manager(mdev); + + return 0; +} + +int mlx5dr_cmd_query_flow_table(struct mlx5_core_dev *dev, + enum fs_flow_table_type type, + u32 table_id, + struct mlx5dr_cmd_query_flow_table_details *output) +{ + u32 out[MLX5_ST_SZ_DW(query_flow_table_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_flow_table_in)] = {}; + int err; + + MLX5_SET(query_flow_table_in, in, opcode, + MLX5_CMD_OP_QUERY_FLOW_TABLE); + + MLX5_SET(query_flow_table_in, in, table_type, type); + MLX5_SET(query_flow_table_in, in, table_id, table_id); + + err = mlx5_cmd_exec_inout(dev, query_flow_table, in, out); + if (err) + return err; + + output->status = MLX5_GET(query_flow_table_out, out, status); + output->level = MLX5_GET(query_flow_table_out, out, flow_table_context.level); + + output->sw_owner_icm_root_1 = MLX5_GET64(query_flow_table_out, out, + flow_table_context.sw_owner_icm_root_1); + output->sw_owner_icm_root_0 = MLX5_GET64(query_flow_table_out, out, + flow_table_context.sw_owner_icm_root_0); + + return 0; +} + +int mlx5dr_cmd_query_flow_sampler(struct mlx5_core_dev *dev, + u32 sampler_id, + u64 *rx_icm_addr, + u64 *tx_icm_addr) +{ + u32 out[MLX5_ST_SZ_DW(query_sampler_obj_out)] = {}; + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + void *attr; + int ret; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_SAMPLER); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, sampler_id); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + attr = MLX5_ADDR_OF(query_sampler_obj_out, out, sampler_object); + + *rx_icm_addr = MLX5_GET64(sampler_obj, attr, + sw_steering_icm_address_rx); + *tx_icm_addr = MLX5_GET64(sampler_obj, attr, + sw_steering_icm_address_tx); + + return 0; +} + +int mlx5dr_cmd_sync_steering(struct mlx5_core_dev *mdev) +{ + u32 in[MLX5_ST_SZ_DW(sync_steering_in)] = {}; + + /* Skip SYNC in case the device is internal error state. 
+ * Besides a device error, this also happens when we're + * in fast teardown + */ + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + + MLX5_SET(sync_steering_in, in, opcode, MLX5_CMD_OP_SYNC_STEERING); + + return mlx5_cmd_exec_in(mdev, sync_steering, in); +} + +int mlx5dr_cmd_set_fte_modify_and_vport(struct mlx5_core_dev *mdev, + u32 table_type, + u32 table_id, + u32 group_id, + u32 modify_header_id, + u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(set_fte_out)] = {}; + void *in_flow_context; + unsigned int inlen; + void *in_dests; + u32 *in; + int err; + + inlen = MLX5_ST_SZ_BYTES(set_fte_in) + + 1 * MLX5_ST_SZ_BYTES(dest_format_struct); /* One destination only */ + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); + MLX5_SET(set_fte_in, in, table_type, table_type); + MLX5_SET(set_fte_in, in, table_id, table_id); + + in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); + MLX5_SET(flow_context, in_flow_context, group_id, group_id); + MLX5_SET(flow_context, in_flow_context, modify_header_id, modify_header_id); + MLX5_SET(flow_context, in_flow_context, destination_list_size, 1); + MLX5_SET(flow_context, in_flow_context, action, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR); + + in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination); + MLX5_SET(dest_format_struct, in_dests, destination_type, + MLX5_FLOW_DESTINATION_TYPE_VPORT); + MLX5_SET(dest_format_struct, in_dests, destination_id, vport); + + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + kvfree(in); + + return err; +} + +int mlx5dr_cmd_del_flow_table_entry(struct mlx5_core_dev *mdev, + u32 table_type, + u32 table_id) +{ + u32 in[MLX5_ST_SZ_DW(delete_fte_in)] = {}; + + MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); + MLX5_SET(delete_fte_in, in, table_type, table_type); + MLX5_SET(delete_fte_in, in, table_id, table_id); + + return mlx5_cmd_exec_in(mdev, delete_fte, in); +} + +int mlx5dr_cmd_alloc_modify_header(struct mlx5_core_dev *mdev, + u32 table_type, + u8 num_of_actions, + u64 *actions, + u32 *modify_header_id) +{ + u32 out[MLX5_ST_SZ_DW(alloc_modify_header_context_out)] = {}; + void *p_actions; + u32 inlen; + u32 *in; + int err; + + inlen = MLX5_ST_SZ_BYTES(alloc_modify_header_context_in) + + num_of_actions * sizeof(u64); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(alloc_modify_header_context_in, in, opcode, + MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT); + MLX5_SET(alloc_modify_header_context_in, in, table_type, table_type); + MLX5_SET(alloc_modify_header_context_in, in, num_of_actions, num_of_actions); + p_actions = MLX5_ADDR_OF(alloc_modify_header_context_in, in, actions); + memcpy(p_actions, actions, num_of_actions * sizeof(u64)); + + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + if (err) + goto out; + + *modify_header_id = MLX5_GET(alloc_modify_header_context_out, out, + modify_header_id); +out: + kvfree(in); + return err; +} + +int mlx5dr_cmd_dealloc_modify_header(struct mlx5_core_dev *mdev, + u32 modify_header_id) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_modify_header_context_in)] = {}; + + MLX5_SET(dealloc_modify_header_context_in, in, opcode, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); + MLX5_SET(dealloc_modify_header_context_in, in, modify_header_id, + modify_header_id); + + return mlx5_cmd_exec_in(mdev, dealloc_modify_header_context, in); +} + +int mlx5dr_cmd_create_empty_flow_group(struct mlx5_core_dev 
*mdev, + u32 table_type, + u32 table_id, + u32 *group_id) +{ + u32 out[MLX5_ST_SZ_DW(create_flow_group_out)] = {}; + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_GROUP); + MLX5_SET(create_flow_group_in, in, table_type, table_type); + MLX5_SET(create_flow_group_in, in, table_id, table_id); + + err = mlx5_cmd_exec_inout(mdev, create_flow_group, in, out); + if (err) + goto out; + + *group_id = MLX5_GET(create_flow_group_out, out, group_id); + +out: + kvfree(in); + return err; +} + +int mlx5dr_cmd_destroy_flow_group(struct mlx5_core_dev *mdev, + u32 table_type, + u32 table_id, + u32 group_id) +{ + u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)] = {}; + + MLX5_SET(destroy_flow_group_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); + MLX5_SET(destroy_flow_group_in, in, table_type, table_type); + MLX5_SET(destroy_flow_group_in, in, table_id, table_id); + MLX5_SET(destroy_flow_group_in, in, group_id, group_id); + + return mlx5_cmd_exec_in(mdev, destroy_flow_group, in); +} + +int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, + struct mlx5dr_cmd_create_flow_table_attr *attr, + u64 *fdb_rx_icm_addr, + u32 *table_id) +{ + u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {}; + u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {}; + void *ft_mdev; + int err; + + MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); + MLX5_SET(create_flow_table_in, in, table_type, attr->table_type); + MLX5_SET(create_flow_table_in, in, uid, attr->uid); + + ft_mdev = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); + MLX5_SET(flow_table_context, ft_mdev, termination_table, attr->term_tbl); + MLX5_SET(flow_table_context, ft_mdev, sw_owner, attr->sw_owner); + MLX5_SET(flow_table_context, ft_mdev, level, attr->level); + + if (attr->sw_owner) { + /* icm_addr_0 used for FDB RX / NIC TX / NIC_RX + * icm_addr_1 used for FDB TX + */ + if (attr->table_type == MLX5_FLOW_TABLE_TYPE_NIC_RX) { + MLX5_SET64(flow_table_context, ft_mdev, + sw_owner_icm_root_0, attr->icm_addr_rx); + } else if (attr->table_type == MLX5_FLOW_TABLE_TYPE_NIC_TX) { + MLX5_SET64(flow_table_context, ft_mdev, + sw_owner_icm_root_0, attr->icm_addr_tx); + } else if (attr->table_type == MLX5_FLOW_TABLE_TYPE_FDB) { + MLX5_SET64(flow_table_context, ft_mdev, + sw_owner_icm_root_0, attr->icm_addr_rx); + MLX5_SET64(flow_table_context, ft_mdev, + sw_owner_icm_root_1, attr->icm_addr_tx); + } + } + + MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en, + attr->decap_en); + MLX5_SET(create_flow_table_in, in, flow_table_context.reformat_en, + attr->reformat_en); + + err = mlx5_cmd_exec_inout(mdev, create_flow_table, in, out); + if (err) + return err; + + *table_id = MLX5_GET(create_flow_table_out, out, table_id); + if (!attr->sw_owner && attr->table_type == MLX5_FLOW_TABLE_TYPE_FDB && + fdb_rx_icm_addr) + *fdb_rx_icm_addr = + (u64)MLX5_GET(create_flow_table_out, out, icm_address_31_0) | + (u64)MLX5_GET(create_flow_table_out, out, icm_address_39_32) << 32 | + (u64)MLX5_GET(create_flow_table_out, out, icm_address_63_40) << 40; + + return 0; +} + +int mlx5dr_cmd_destroy_flow_table(struct mlx5_core_dev *mdev, + u32 table_id, + u32 table_type) +{ + u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)] = {}; + + MLX5_SET(destroy_flow_table_in, in, opcode, + MLX5_CMD_OP_DESTROY_FLOW_TABLE); + MLX5_SET(destroy_flow_table_in, in, table_type, table_type); + 
MLX5_SET(destroy_flow_table_in, in, table_id, table_id); + + return mlx5_cmd_exec_in(mdev, destroy_flow_table, in); +} + +int mlx5dr_cmd_create_reformat_ctx(struct mlx5_core_dev *mdev, + enum mlx5_reformat_ctx_type rt, + u8 reformat_param_0, + u8 reformat_param_1, + size_t reformat_size, + void *reformat_data, + u32 *reformat_id) +{ + u32 out[MLX5_ST_SZ_DW(alloc_packet_reformat_context_out)] = {}; + size_t inlen, cmd_data_sz, cmd_total_sz; + void *prctx; + void *pdata; + void *in; + int err; + + cmd_total_sz = MLX5_ST_SZ_BYTES(alloc_packet_reformat_context_in); + cmd_data_sz = MLX5_FLD_SZ_BYTES(alloc_packet_reformat_context_in, + packet_reformat_context.reformat_data); + inlen = ALIGN(cmd_total_sz + reformat_size - cmd_data_sz, 4); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(alloc_packet_reformat_context_in, in, opcode, + MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT); + + prctx = MLX5_ADDR_OF(alloc_packet_reformat_context_in, in, packet_reformat_context); + pdata = MLX5_ADDR_OF(packet_reformat_context_in, prctx, reformat_data); + + MLX5_SET(packet_reformat_context_in, prctx, reformat_type, rt); + MLX5_SET(packet_reformat_context_in, prctx, reformat_param_0, reformat_param_0); + MLX5_SET(packet_reformat_context_in, prctx, reformat_param_1, reformat_param_1); + MLX5_SET(packet_reformat_context_in, prctx, reformat_data_size, reformat_size); + if (reformat_data && reformat_size) + memcpy(pdata, reformat_data, reformat_size); + + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + if (err) + return err; + + *reformat_id = MLX5_GET(alloc_packet_reformat_context_out, out, packet_reformat_id); + kvfree(in); + + return err; +} + +void mlx5dr_cmd_destroy_reformat_ctx(struct mlx5_core_dev *mdev, + u32 reformat_id) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_packet_reformat_context_in)] = {}; + + MLX5_SET(dealloc_packet_reformat_context_in, in, opcode, + MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT); + MLX5_SET(dealloc_packet_reformat_context_in, in, packet_reformat_id, + reformat_id); + + mlx5_cmd_exec_in(mdev, dealloc_packet_reformat_context, in); +} + +int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num, + u16 index, struct mlx5dr_cmd_gid_attr *attr) +{ + u32 out[MLX5_ST_SZ_DW(query_roce_address_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_roce_address_in)] = {}; + int err; + + MLX5_SET(query_roce_address_in, in, opcode, + MLX5_CMD_OP_QUERY_ROCE_ADDRESS); + + MLX5_SET(query_roce_address_in, in, roce_address_index, index); + MLX5_SET(query_roce_address_in, in, vhca_port_num, vhca_port_num); + + err = mlx5_cmd_exec_inout(mdev, query_roce_address, in, out); + if (err) + return err; + + memcpy(&attr->gid, + MLX5_ADDR_OF(query_roce_address_out, + out, roce_address.source_l3_address), + sizeof(attr->gid)); + memcpy(attr->mac, + MLX5_ADDR_OF(query_roce_address_out, out, + roce_address.source_mac_47_32), + sizeof(attr->mac)); + + if (MLX5_GET(query_roce_address_out, out, + roce_address.roce_version) == MLX5_ROCE_VERSION_2) + attr->roce_ver = MLX5_ROCE_VERSION_2; + else + attr->roce_ver = MLX5_ROCE_VERSION_1; + + return 0; +} + +int mlx5dr_cmd_create_modify_header_arg(struct mlx5_core_dev *dev, + u16 log_obj_range, u32 pd, + u32 *obj_id) +{ + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + u32 in[MLX5_ST_SZ_DW(create_modify_header_arg_in)] = {}; + void *attr; + int ret; + + attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, hdr); + MLX5_SET(general_obj_in_cmd_hdr, attr, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, attr, 
obj_type, + MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT); + MLX5_SET(general_obj_in_cmd_hdr, attr, log_obj_range, + log_obj_range); + + attr = MLX5_ADDR_OF(create_modify_header_arg_in, in, arg); + MLX5_SET(modify_header_arg, attr, access_pd, pd); + + ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (ret) + return ret; + + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + return 0; +} + +void mlx5dr_cmd_destroy_modify_header_arg(struct mlx5_core_dev *dev, + u32 obj_id) +{ + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id); + + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5dr_cmd_set_extended_dest(struct mlx5_core_dev *dev, + struct mlx5dr_cmd_fte_info *fte, + bool *extended_dest) +{ + int fw_log_max_fdb_encap_uplink = MLX5_CAP_ESW(dev, log_max_fdb_encap_uplink); + int num_fwd_destinations = 0; + int num_encap = 0; + int i; + + *extended_dest = false; + if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) + return 0; + for (i = 0; i < fte->dests_size; i++) { + if (fte->dest_arr[i].type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + if ((fte->dest_arr[i].type == MLX5_FLOW_DESTINATION_TYPE_VPORT || + fte->dest_arr[i].type == MLX5_FLOW_DESTINATION_TYPE_UPLINK) && + fte->dest_arr[i].vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID) + num_encap++; + num_fwd_destinations++; + } + + if (num_fwd_destinations > 1 && num_encap > 0) + *extended_dest = true; + + if (*extended_dest && !fw_log_max_fdb_encap_uplink) { + mlx5_core_warn(dev, "FW does not support extended destination"); + return -EOPNOTSUPP; + } + if (num_encap > (1 << fw_log_max_fdb_encap_uplink)) { + mlx5_core_warn(dev, "FW does not support more than %d encaps", + 1 << fw_log_max_fdb_encap_uplink); + return -EOPNOTSUPP; + } + + return 0; +} + +int mlx5dr_cmd_set_fte(struct mlx5_core_dev *dev, + int opmod, int modify_mask, + struct mlx5dr_cmd_ft_info *ft, + u32 group_id, + struct mlx5dr_cmd_fte_info *fte) +{ + u32 out[MLX5_ST_SZ_DW(set_fte_out)] = {}; + void *in_flow_context, *vlan; + bool extended_dest = false; + void *in_match_value; + unsigned int inlen; + int dst_cnt_size; + void *in_dests; + u32 *in; + int err; + int i; + + if (mlx5dr_cmd_set_extended_dest(dev, fte, &extended_dest)) + return -EOPNOTSUPP; + + if (!extended_dest) + dst_cnt_size = MLX5_ST_SZ_BYTES(dest_format_struct); + else + dst_cnt_size = MLX5_ST_SZ_BYTES(extended_dest_format); + + inlen = MLX5_ST_SZ_BYTES(set_fte_in) + fte->dests_size * dst_cnt_size; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); + MLX5_SET(set_fte_in, in, op_mod, opmod); + MLX5_SET(set_fte_in, in, modify_enable_mask, modify_mask); + MLX5_SET(set_fte_in, in, table_type, ft->type); + MLX5_SET(set_fte_in, in, table_id, ft->id); + MLX5_SET(set_fte_in, in, flow_index, fte->index); + MLX5_SET(set_fte_in, in, ignore_flow_level, fte->ignore_flow_level); + if (ft->vport) { + MLX5_SET(set_fte_in, in, vport_number, ft->vport); + MLX5_SET(set_fte_in, in, other_vport, 1); + } + + in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context); + MLX5_SET(flow_context, in_flow_context, group_id, group_id); + + MLX5_SET(flow_context, in_flow_context, flow_tag, + 
fte->flow_context.flow_tag); + MLX5_SET(flow_context, in_flow_context, flow_source, + fte->flow_context.flow_source); + + MLX5_SET(flow_context, in_flow_context, extended_destination, + extended_dest); + if (extended_dest) { + u32 action; + + action = fte->action.action & + ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + MLX5_SET(flow_context, in_flow_context, action, action); + } else { + MLX5_SET(flow_context, in_flow_context, action, + fte->action.action); + if (fte->action.pkt_reformat) + MLX5_SET(flow_context, in_flow_context, packet_reformat_id, + fte->action.pkt_reformat->id); + } + if (fte->action.modify_hdr) + MLX5_SET(flow_context, in_flow_context, modify_header_id, + fte->action.modify_hdr->id); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[0].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[0].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[0].prio); + + vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan_2); + + MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[1].ethtype); + MLX5_SET(vlan, vlan, vid, fte->action.vlan[1].vid); + MLX5_SET(vlan, vlan, prio, fte->action.vlan[1].prio); + + in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, + match_value); + memcpy(in_match_value, fte->val, sizeof(u32) * MLX5_ST_SZ_DW_MATCH_PARAM); + + in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination); + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + int list_size = 0; + + for (i = 0; i < fte->dests_size; i++) { + unsigned int id, type = fte->dest_arr[i].type; + + if (type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + switch (type) { + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM: + id = fte->dest_arr[i].ft_num; + type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + id = fte->dest_arr[i].ft_id; + break; + case MLX5_FLOW_DESTINATION_TYPE_UPLINK: + case MLX5_FLOW_DESTINATION_TYPE_VPORT: + if (type == MLX5_FLOW_DESTINATION_TYPE_VPORT) { + id = fte->dest_arr[i].vport.num; + MLX5_SET(dest_format_struct, in_dests, + destination_eswitch_owner_vhca_id_valid, + !!(fte->dest_arr[i].vport.flags & + MLX5_FLOW_DEST_VPORT_VHCA_ID)); + } else { + id = 0; + MLX5_SET(dest_format_struct, in_dests, + destination_eswitch_owner_vhca_id_valid, 1); + } + MLX5_SET(dest_format_struct, in_dests, + destination_eswitch_owner_vhca_id, + fte->dest_arr[i].vport.vhca_id); + if (extended_dest && (fte->dest_arr[i].vport.flags & + MLX5_FLOW_DEST_VPORT_REFORMAT_ID)) { + MLX5_SET(dest_format_struct, in_dests, + packet_reformat, + !!(fte->dest_arr[i].vport.flags & + MLX5_FLOW_DEST_VPORT_REFORMAT_ID)); + MLX5_SET(extended_dest_format, in_dests, + packet_reformat_id, + fte->dest_arr[i].vport.reformat_id); + } + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + id = fte->dest_arr[i].sampler_id; + break; + default: + id = fte->dest_arr[i].tir_num; + } + + MLX5_SET(dest_format_struct, in_dests, destination_type, + type); + MLX5_SET(dest_format_struct, in_dests, destination_id, id); + in_dests += dst_cnt_size; + list_size++; + } + + MLX5_SET(flow_context, in_flow_context, destination_list_size, + list_size); + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev, + log_max_flow_counter, + ft->type)); + int list_size = 0; + + for (i = 0; i < fte->dests_size; i++) { + if (fte->dest_arr[i].type != + MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + MLX5_SET(flow_counter_list, 
in_dests, flow_counter_id, + fte->dest_arr[i].counter_id); + in_dests += dst_cnt_size; + list_size++; + } + if (list_size > max_list_size) { + err = -EINVAL; + goto err_out; + } + + MLX5_SET(flow_context, in_flow_context, flow_counter_list_size, + list_size); + } + + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +err_out: + kvfree(in); + return err; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c new file mode 100644 index 0000000..73f7639 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c @@ -0,0 +1,658 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +#include +#include +#include +#include "dr_types.h" + +#define DR_DBG_PTR_TO_ID(p) ((u64)(uintptr_t)(p) & 0xFFFFFFFFULL) + +enum dr_dump_rec_type { + DR_DUMP_REC_TYPE_DOMAIN = 3000, + DR_DUMP_REC_TYPE_DOMAIN_INFO_FLEX_PARSER = 3001, + DR_DUMP_REC_TYPE_DOMAIN_INFO_DEV_ATTR = 3002, + DR_DUMP_REC_TYPE_DOMAIN_INFO_VPORT = 3003, + DR_DUMP_REC_TYPE_DOMAIN_INFO_CAPS = 3004, + DR_DUMP_REC_TYPE_DOMAIN_SEND_RING = 3005, + + DR_DUMP_REC_TYPE_TABLE = 3100, + DR_DUMP_REC_TYPE_TABLE_RX = 3101, + DR_DUMP_REC_TYPE_TABLE_TX = 3102, + + DR_DUMP_REC_TYPE_MATCHER = 3200, + DR_DUMP_REC_TYPE_MATCHER_MASK_DEPRECATED = 3201, + DR_DUMP_REC_TYPE_MATCHER_RX = 3202, + DR_DUMP_REC_TYPE_MATCHER_TX = 3203, + DR_DUMP_REC_TYPE_MATCHER_BUILDER = 3204, + DR_DUMP_REC_TYPE_MATCHER_MASK = 3205, + + DR_DUMP_REC_TYPE_RULE = 3300, + DR_DUMP_REC_TYPE_RULE_RX_ENTRY_V0 = 3301, + DR_DUMP_REC_TYPE_RULE_TX_ENTRY_V0 = 3302, + DR_DUMP_REC_TYPE_RULE_RX_ENTRY_V1 = 3303, + DR_DUMP_REC_TYPE_RULE_TX_ENTRY_V1 = 3304, + + DR_DUMP_REC_TYPE_ACTION_ENCAP_L2 = 3400, + DR_DUMP_REC_TYPE_ACTION_ENCAP_L3 = 3401, + DR_DUMP_REC_TYPE_ACTION_MODIFY_HDR = 3402, + DR_DUMP_REC_TYPE_ACTION_DROP = 3403, + DR_DUMP_REC_TYPE_ACTION_QP = 3404, + DR_DUMP_REC_TYPE_ACTION_FT = 3405, + DR_DUMP_REC_TYPE_ACTION_CTR = 3406, + DR_DUMP_REC_TYPE_ACTION_TAG = 3407, + DR_DUMP_REC_TYPE_ACTION_VPORT = 3408, + DR_DUMP_REC_TYPE_ACTION_DECAP_L2 = 3409, + DR_DUMP_REC_TYPE_ACTION_DECAP_L3 = 3410, + DR_DUMP_REC_TYPE_ACTION_DEVX_TIR = 3411, + DR_DUMP_REC_TYPE_ACTION_PUSH_VLAN = 3412, + DR_DUMP_REC_TYPE_ACTION_POP_VLAN = 3413, + DR_DUMP_REC_TYPE_ACTION_SAMPLER = 3415, + DR_DUMP_REC_TYPE_ACTION_INSERT_HDR = 3420, + DR_DUMP_REC_TYPE_ACTION_REMOVE_HDR = 3421 +}; + +void mlx5dr_dbg_tbl_add(struct mlx5dr_table *tbl) +{ + mutex_lock(&tbl->dmn->dump_info.dbg_mutex); + list_add_tail(&tbl->dbg_node, &tbl->dmn->dbg_tbl_list); + mutex_unlock(&tbl->dmn->dump_info.dbg_mutex); +} + +void mlx5dr_dbg_tbl_del(struct mlx5dr_table *tbl) +{ + mutex_lock(&tbl->dmn->dump_info.dbg_mutex); + list_del(&tbl->dbg_node); + mutex_unlock(&tbl->dmn->dump_info.dbg_mutex); +} + +void mlx5dr_dbg_rule_add(struct mlx5dr_rule *rule) +{ + struct mlx5dr_domain *dmn = rule->matcher->tbl->dmn; + + mutex_lock(&dmn->dump_info.dbg_mutex); + list_add_tail(&rule->dbg_node, &rule->matcher->dbg_rule_list); + mutex_unlock(&dmn->dump_info.dbg_mutex); +} + +void mlx5dr_dbg_rule_del(struct mlx5dr_rule *rule) +{ + struct mlx5dr_domain *dmn = rule->matcher->tbl->dmn; + + mutex_lock(&dmn->dump_info.dbg_mutex); + list_del(&rule->dbg_node); + mutex_unlock(&dmn->dump_info.dbg_mutex); +} + +static u64 dr_dump_icm_to_idx(u64 icm_addr) +{ + return (icm_addr >> 6) & 0xffffffff; +} + +#define DR_HEX_SIZE 256 + +static void 
+dr_dump_hex_print(char hex[DR_HEX_SIZE], char *src, u32 size) +{ + if (WARN_ON_ONCE(DR_HEX_SIZE < 2 * size + 1)) + size = DR_HEX_SIZE / 2 - 1; /* truncate */ + + bin2hex(hex, src, size); + hex[2 * size] = 0; /* NULL-terminate */ +} + +static int +dr_dump_rule_action_mem(struct seq_file *file, const u64 rule_id, + struct mlx5dr_rule_action_member *action_mem) +{ + struct mlx5dr_action *action = action_mem->action; + const u64 action_id = DR_DBG_PTR_TO_ID(action); + + switch (action->action_type) { + case DR_ACTION_TYP_DROP: + seq_printf(file, "%d,0x%llx,0x%llx\n", + DR_DUMP_REC_TYPE_ACTION_DROP, action_id, rule_id); + break; + case DR_ACTION_TYP_FT: + if (action->dest_tbl->is_fw_tbl) + seq_printf(file, "%d,0x%llx,0x%llx,0x%x,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_FT, action_id, + rule_id, action->dest_tbl->fw_tbl.id, + -1); + else + seq_printf(file, "%d,0x%llx,0x%llx,0x%x,0x%llx\n", + DR_DUMP_REC_TYPE_ACTION_FT, action_id, + rule_id, action->dest_tbl->tbl->table_id, + DR_DBG_PTR_TO_ID(action->dest_tbl->tbl)); + + break; + case DR_ACTION_TYP_CTR: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_CTR, action_id, rule_id, + action->ctr->ctr_id + action->ctr->offset); + break; + case DR_ACTION_TYP_TAG: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_TAG, action_id, rule_id, + action->flow_tag->flow_tag); + break; + case DR_ACTION_TYP_MODIFY_HDR: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x,%d\n", + DR_DUMP_REC_TYPE_ACTION_MODIFY_HDR, action_id, + rule_id, action->rewrite->index, + action->rewrite->single_action_opt); + break; + case DR_ACTION_TYP_VPORT: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_VPORT, action_id, rule_id, + action->vport->caps->num); + break; + case DR_ACTION_TYP_TNL_L2_TO_L2: + seq_printf(file, "%d,0x%llx,0x%llx\n", + DR_DUMP_REC_TYPE_ACTION_DECAP_L2, action_id, + rule_id); + break; + case DR_ACTION_TYP_TNL_L3_TO_L2: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_DECAP_L3, action_id, + rule_id, action->rewrite->index); + break; + case DR_ACTION_TYP_L2_TO_TNL_L2: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_ENCAP_L2, action_id, + rule_id, action->reformat->id); + break; + case DR_ACTION_TYP_L2_TO_TNL_L3: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_ENCAP_L3, action_id, + rule_id, action->reformat->id); + break; + case DR_ACTION_TYP_POP_VLAN: + seq_printf(file, "%d,0x%llx,0x%llx\n", + DR_DUMP_REC_TYPE_ACTION_POP_VLAN, action_id, + rule_id); + break; + case DR_ACTION_TYP_PUSH_VLAN: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_PUSH_VLAN, action_id, + rule_id, action->push_vlan->vlan_hdr); + break; + case DR_ACTION_TYP_INSERT_HDR: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x,0x%x,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_INSERT_HDR, action_id, + rule_id, action->reformat->id, + action->reformat->param_0, + action->reformat->param_1); + break; + case DR_ACTION_TYP_REMOVE_HDR: + seq_printf(file, "%d,0x%llx,0x%llx,0x%x,0x%x,0x%x\n", + DR_DUMP_REC_TYPE_ACTION_REMOVE_HDR, action_id, + rule_id, action->reformat->id, + action->reformat->param_0, + action->reformat->param_1); + break; + case DR_ACTION_TYP_SAMPLER: + seq_printf(file, + "%d,0x%llx,0x%llx,0x%x,0x%x,0x%x,0x%llx,0x%llx\n", + DR_DUMP_REC_TYPE_ACTION_SAMPLER, action_id, rule_id, + 0, 0, action->sampler->sampler_id, + action->sampler->rx_icm_addr, + action->sampler->tx_icm_addr); + break; + default: + return 0; + } + + return 0; +} + +static int 
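+/* Note (added comment): each rule STE is emitted as one CSV record:
+ * rec_type,icm_index,rule_id,<hex dump of the reduced HW STE>
+ */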
+dr_dump_rule_mem(struct seq_file *file, struct mlx5dr_ste *ste, + bool is_rx, const u64 rule_id, u8 format_ver) +{ + char hw_ste_dump[DR_HEX_SIZE]; + u32 mem_rec_type; + + if (format_ver == MLX5_STEERING_FORMAT_CONNECTX_5) { + mem_rec_type = is_rx ? DR_DUMP_REC_TYPE_RULE_RX_ENTRY_V0 : + DR_DUMP_REC_TYPE_RULE_TX_ENTRY_V0; + } else { + mem_rec_type = is_rx ? DR_DUMP_REC_TYPE_RULE_RX_ENTRY_V1 : + DR_DUMP_REC_TYPE_RULE_TX_ENTRY_V1; + } + + dr_dump_hex_print(hw_ste_dump, (char *)mlx5dr_ste_get_hw_ste(ste), + DR_STE_SIZE_REDUCED); + + seq_printf(file, "%d,0x%llx,0x%llx,%s\n", mem_rec_type, + dr_dump_icm_to_idx(mlx5dr_ste_get_icm_addr(ste)), rule_id, + hw_ste_dump); + + return 0; +} + +static int +dr_dump_rule_rx_tx(struct seq_file *file, struct mlx5dr_rule_rx_tx *rule_rx_tx, + bool is_rx, const u64 rule_id, u8 format_ver) +{ + struct mlx5dr_ste *ste_arr[DR_RULE_MAX_STES + DR_ACTION_MAX_STES]; + struct mlx5dr_ste *curr_ste = rule_rx_tx->last_rule_ste; + int ret, i; + + if (mlx5dr_rule_get_reverse_rule_members(ste_arr, curr_ste, &i)) + return 0; + + while (i--) { + ret = dr_dump_rule_mem(file, ste_arr[i], is_rx, rule_id, + format_ver); + if (ret < 0) + return ret; + } + + return 0; +} + +static int dr_dump_rule(struct seq_file *file, struct mlx5dr_rule *rule) +{ + struct mlx5dr_rule_action_member *action_mem; + const u64 rule_id = DR_DBG_PTR_TO_ID(rule); + struct mlx5dr_rule_rx_tx *rx = &rule->rx; + struct mlx5dr_rule_rx_tx *tx = &rule->tx; + u8 format_ver; + int ret; + + format_ver = rule->matcher->tbl->dmn->info.caps.sw_format_ver; + + seq_printf(file, "%d,0x%llx,0x%llx\n", DR_DUMP_REC_TYPE_RULE, rule_id, + DR_DBG_PTR_TO_ID(rule->matcher)); + + if (rx->nic_matcher) { + ret = dr_dump_rule_rx_tx(file, rx, true, rule_id, format_ver); + if (ret < 0) + return ret; + } + + if (tx->nic_matcher) { + ret = dr_dump_rule_rx_tx(file, tx, false, rule_id, format_ver); + if (ret < 0) + return ret; + } + + list_for_each_entry(action_mem, &rule->rule_actions_list, list) { + ret = dr_dump_rule_action_mem(file, rule_id, action_mem); + if (ret < 0) + return ret; + } + + return 0; +} + +static int +dr_dump_matcher_mask(struct seq_file *file, struct mlx5dr_match_param *mask, + u8 criteria, const u64 matcher_id) +{ + char dump[DR_HEX_SIZE]; + + seq_printf(file, "%d,0x%llx,", DR_DUMP_REC_TYPE_MATCHER_MASK, + matcher_id); + + if (criteria & DR_MATCHER_CRITERIA_OUTER) { + dr_dump_hex_print(dump, (char *)&mask->outer, sizeof(mask->outer)); + seq_printf(file, "%s,", dump); + } else { + seq_puts(file, ","); + } + + if (criteria & DR_MATCHER_CRITERIA_INNER) { + dr_dump_hex_print(dump, (char *)&mask->inner, sizeof(mask->inner)); + seq_printf(file, "%s,", dump); + } else { + seq_puts(file, ","); + } + + if (criteria & DR_MATCHER_CRITERIA_MISC) { + dr_dump_hex_print(dump, (char *)&mask->misc, sizeof(mask->misc)); + seq_printf(file, "%s,", dump); + } else { + seq_puts(file, ","); + } + + if (criteria & DR_MATCHER_CRITERIA_MISC2) { + dr_dump_hex_print(dump, (char *)&mask->misc2, sizeof(mask->misc2)); + seq_printf(file, "%s,", dump); + } else { + seq_puts(file, ","); + } + + if (criteria & DR_MATCHER_CRITERIA_MISC3) { + dr_dump_hex_print(dump, (char *)&mask->misc3, sizeof(mask->misc3)); + seq_printf(file, "%s\n", dump); + } else { + seq_puts(file, ",\n"); + } + + return 0; +} + +static int +dr_dump_matcher_builder(struct seq_file *file, struct mlx5dr_ste_build *builder, + u32 index, bool is_rx, const u64 matcher_id) +{ + seq_printf(file, "%d,0x%llx,%d,%d,0x%x\n", + DR_DUMP_REC_TYPE_MATCHER_BUILDER, matcher_id, index, is_rx, + 
builder->lu_type); + + return 0; +} + +static int +dr_dump_matcher_rx_tx(struct seq_file *file, bool is_rx, + struct mlx5dr_matcher_rx_tx *matcher_rx_tx, + const u64 matcher_id) +{ + enum dr_dump_rec_type rec_type; + u64 s_icm_addr, e_icm_addr; + int i, ret; + + rec_type = is_rx ? DR_DUMP_REC_TYPE_MATCHER_RX : + DR_DUMP_REC_TYPE_MATCHER_TX; + + s_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(matcher_rx_tx->s_htbl->chunk); + e_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(matcher_rx_tx->e_anchor->chunk); + seq_printf(file, "%d,0x%llx,0x%llx,%d,0x%llx,0x%llx\n", + rec_type, DR_DBG_PTR_TO_ID(matcher_rx_tx), + matcher_id, matcher_rx_tx->num_of_builders, + dr_dump_icm_to_idx(s_icm_addr), + dr_dump_icm_to_idx(e_icm_addr)); + + for (i = 0; i < matcher_rx_tx->num_of_builders; i++) { + ret = dr_dump_matcher_builder(file, + &matcher_rx_tx->ste_builder[i], + i, is_rx, matcher_id); + if (ret < 0) + return ret; + } + + return 0; +} + +static int +dr_dump_matcher(struct seq_file *file, struct mlx5dr_matcher *matcher) +{ + struct mlx5dr_matcher_rx_tx *rx = &matcher->rx; + struct mlx5dr_matcher_rx_tx *tx = &matcher->tx; + u64 matcher_id; + int ret; + + matcher_id = DR_DBG_PTR_TO_ID(matcher); + + seq_printf(file, "%d,0x%llx,0x%llx,%d\n", DR_DUMP_REC_TYPE_MATCHER, + matcher_id, DR_DBG_PTR_TO_ID(matcher->tbl), matcher->prio); + + ret = dr_dump_matcher_mask(file, &matcher->mask, + matcher->match_criteria, matcher_id); + if (ret < 0) + return ret; + + if (rx->nic_tbl) { + ret = dr_dump_matcher_rx_tx(file, true, rx, matcher_id); + if (ret < 0) + return ret; + } + + if (tx->nic_tbl) { + ret = dr_dump_matcher_rx_tx(file, false, tx, matcher_id); + if (ret < 0) + return ret; + } + + return 0; +} + +static int +dr_dump_matcher_all(struct seq_file *file, struct mlx5dr_matcher *matcher) +{ + struct mlx5dr_rule *rule; + int ret; + + ret = dr_dump_matcher(file, matcher); + if (ret < 0) + return ret; + + list_for_each_entry(rule, &matcher->dbg_rule_list, dbg_node) { + ret = dr_dump_rule(file, rule); + if (ret < 0) + return ret; + } + + return 0; +} + +static int +dr_dump_table_rx_tx(struct seq_file *file, bool is_rx, + struct mlx5dr_table_rx_tx *table_rx_tx, + const u64 table_id) +{ + enum dr_dump_rec_type rec_type; + u64 s_icm_addr; + + rec_type = is_rx ? 
DR_DUMP_REC_TYPE_TABLE_RX : + DR_DUMP_REC_TYPE_TABLE_TX; + + s_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(table_rx_tx->s_anchor->chunk); + seq_printf(file, "%d,0x%llx,0x%llx\n", rec_type, table_id, + dr_dump_icm_to_idx(s_icm_addr)); + + return 0; +} + +static int dr_dump_table(struct seq_file *file, struct mlx5dr_table *table) +{ + struct mlx5dr_table_rx_tx *rx = &table->rx; + struct mlx5dr_table_rx_tx *tx = &table->tx; + int ret; + + seq_printf(file, "%d,0x%llx,0x%llx,%d,%d\n", DR_DUMP_REC_TYPE_TABLE, + DR_DBG_PTR_TO_ID(table), DR_DBG_PTR_TO_ID(table->dmn), + table->table_type, table->level); + + if (rx->nic_dmn) { + ret = dr_dump_table_rx_tx(file, true, rx, + DR_DBG_PTR_TO_ID(table)); + if (ret < 0) + return ret; + } + + if (tx->nic_dmn) { + ret = dr_dump_table_rx_tx(file, false, tx, + DR_DBG_PTR_TO_ID(table)); + if (ret < 0) + return ret; + } + return 0; +} + +static int dr_dump_table_all(struct seq_file *file, struct mlx5dr_table *tbl) +{ + struct mlx5dr_matcher *matcher; + int ret; + + ret = dr_dump_table(file, tbl); + if (ret < 0) + return ret; + + list_for_each_entry(matcher, &tbl->matcher_list, list_node) { + ret = dr_dump_matcher_all(file, matcher); + if (ret < 0) + return ret; + } + return 0; +} + +static int +dr_dump_send_ring(struct seq_file *file, struct mlx5dr_send_ring *ring, + const u64 domain_id) +{ + seq_printf(file, "%d,0x%llx,0x%llx,0x%x,0x%x\n", + DR_DUMP_REC_TYPE_DOMAIN_SEND_RING, DR_DBG_PTR_TO_ID(ring), + domain_id, ring->cq->mcq.cqn, ring->qp->qpn); + return 0; +} + +static int +dr_dump_domain_info_flex_parser(struct seq_file *file, + const char *flex_parser_name, + const u8 flex_parser_value, + const u64 domain_id) +{ + seq_printf(file, "%d,0x%llx,%s,0x%x\n", + DR_DUMP_REC_TYPE_DOMAIN_INFO_FLEX_PARSER, domain_id, + flex_parser_name, flex_parser_value); + return 0; +} + +static int +dr_dump_domain_info_caps(struct seq_file *file, struct mlx5dr_cmd_caps *caps, + const u64 domain_id) +{ + struct mlx5dr_cmd_vport_cap *vport_caps; + unsigned long i, vports_num; + + xa_for_each(&caps->vports.vports_caps_xa, vports_num, vport_caps) + ; /* count the number of vports in xarray */ + + seq_printf(file, "%d,0x%llx,0x%x,0x%llx,0x%llx,0x%x,%lu,%d\n", + DR_DUMP_REC_TYPE_DOMAIN_INFO_CAPS, domain_id, caps->gvmi, + caps->nic_rx_drop_address, caps->nic_tx_drop_address, + caps->flex_protocols, vports_num, caps->eswitch_manager); + + xa_for_each(&caps->vports.vports_caps_xa, i, vport_caps) { + vport_caps = xa_load(&caps->vports.vports_caps_xa, i); + + seq_printf(file, "%d,0x%llx,%lu,0x%x,0x%llx,0x%llx\n", + DR_DUMP_REC_TYPE_DOMAIN_INFO_VPORT, domain_id, i, + vport_caps->vport_gvmi, vport_caps->icm_address_rx, + vport_caps->icm_address_tx); + } + return 0; +} + +static int +dr_dump_domain_info(struct seq_file *file, struct mlx5dr_domain_info *info, + const u64 domain_id) +{ + int ret; + + ret = dr_dump_domain_info_caps(file, &info->caps, domain_id); + if (ret < 0) + return ret; + + ret = dr_dump_domain_info_flex_parser(file, "icmp_dw0", + info->caps.flex_parser_id_icmp_dw0, + domain_id); + if (ret < 0) + return ret; + + ret = dr_dump_domain_info_flex_parser(file, "icmp_dw1", + info->caps.flex_parser_id_icmp_dw1, + domain_id); + if (ret < 0) + return ret; + + ret = dr_dump_domain_info_flex_parser(file, "icmpv6_dw0", + info->caps.flex_parser_id_icmpv6_dw0, + domain_id); + if (ret < 0) + return ret; + + ret = dr_dump_domain_info_flex_parser(file, "icmpv6_dw1", + info->caps.flex_parser_id_icmpv6_dw1, + domain_id); + if (ret < 0) + return ret; + + return 0; +} + +static int 
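+/* Note (added comment): top-level domain record format:
+ * rec_type,domain_id,domain_type,gvmi,supp_sw_steering,<PCI device name>,
+ * followed by the caps, vport, flex-parser and send-ring records.
+ */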
+dr_dump_domain(struct seq_file *file, struct mlx5dr_domain *dmn) +{ + u64 domain_id = DR_DBG_PTR_TO_ID(dmn); + int ret; + + seq_printf(file, "%d,0x%llx,%d,0%x,%d,%s\n", DR_DUMP_REC_TYPE_DOMAIN, + domain_id, dmn->type, dmn->info.caps.gvmi, + dmn->info.supp_sw_steering, pci_name(dmn->mdev->pdev)); + + ret = dr_dump_domain_info(file, &dmn->info, domain_id); + if (ret < 0) + return ret; + + if (dmn->info.supp_sw_steering) { + ret = dr_dump_send_ring(file, dmn->send_ring, domain_id); + if (ret < 0) + return ret; + } + + return 0; +} + +static int dr_dump_domain_all(struct seq_file *file, struct mlx5dr_domain *dmn) +{ + struct mlx5dr_table *tbl; + int ret; + + mutex_lock(&dmn->dump_info.dbg_mutex); + mlx5dr_domain_lock(dmn); + + ret = dr_dump_domain(file, dmn); + if (ret < 0) + goto unlock_mutex; + + list_for_each_entry(tbl, &dmn->dbg_tbl_list, dbg_node) { + ret = dr_dump_table_all(file, tbl); + if (ret < 0) + break; + } + +unlock_mutex: + mlx5dr_domain_unlock(dmn); + mutex_unlock(&dmn->dump_info.dbg_mutex); + return ret; +} + +static int dr_dump_show(struct seq_file *file, void *priv) +{ + return dr_dump_domain_all(file, file->private); +} +DEFINE_SHOW_ATTRIBUTE(dr_dump); + +void mlx5dr_dbg_init_dump(struct mlx5dr_domain *dmn) +{ + struct mlx5_core_dev *dev = dmn->mdev; + char file_name[128]; + + if (dmn->type != MLX5DR_DOMAIN_TYPE_FDB) { + mlx5_core_warn(dev, + "Steering dump is not supported for NIC RX/TX domains\n"); + return; + } + + dmn->dump_info.steering_debugfs = + debugfs_create_dir("steering", dev->priv.dbg.dbg_root); + dmn->dump_info.fdb_debugfs = + debugfs_create_dir("fdb", dmn->dump_info.steering_debugfs); + + sprintf(file_name, "dmn_%p", dmn); + debugfs_create_file(file_name, 0444, dmn->dump_info.fdb_debugfs, + dmn, &dr_dump_fops); + + INIT_LIST_HEAD(&dmn->dbg_tbl_list); + mutex_init(&dmn->dump_info.dbg_mutex); +} + +void mlx5dr_dbg_uninit_dump(struct mlx5dr_domain *dmn) +{ + debugfs_remove_recursive(dmn->dump_info.steering_debugfs); + mutex_destroy(&dmn->dump_info.dbg_mutex); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.h new file mode 100644 index 0000000..def6cf8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +struct mlx5dr_dbg_dump_info { + struct mutex dbg_mutex; /* protect dbg lists */ + struct dentry *steering_debugfs; + struct dentry *fdb_debugfs; +}; + +void mlx5dr_dbg_init_dump(struct mlx5dr_domain *dmn); +void mlx5dr_dbg_uninit_dump(struct mlx5dr_domain *dmn); +void mlx5dr_dbg_tbl_add(struct mlx5dr_table *tbl); +void mlx5dr_dbg_tbl_del(struct mlx5dr_table *tbl); +void mlx5dr_dbg_rule_add(struct mlx5dr_rule *rule); +void mlx5dr_dbg_rule_del(struct mlx5dr_rule *rule); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c new file mode 100644 index 0000000..7063bd9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include +#include +#include "dr_types.h" + +#define DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, dmn_type) \ + ((dmn)->info.caps.dmn_type##_sw_owner || \ + ((dmn)->info.caps.dmn_type##_sw_owner_v2 && \ + (dmn)->info.caps.sw_format_ver <= MLX5_STEERING_FORMAT_CONNECTX_7)) + +static bool dr_domain_is_support_modify_hdr_cache(struct mlx5dr_domain *dmn) +{ + return dmn->info.caps.support_modify_argument; +} + +static int dr_domain_init_modify_header_resources(struct mlx5dr_domain *dmn) +{ + if (!dr_domain_is_support_modify_hdr_cache(dmn)) + return 0; + + dmn->modify_header_ptrn_icm_pool = + mlx5dr_icm_pool_create(dmn, DR_ICM_TYPE_MODIFY_HDR_PTRN); + if (!dmn->modify_header_ptrn_icm_pool) { + mlx5dr_err(dmn, "Couldn't get modify-header-pattern memory\n"); + return -ENOMEM; + } + /* create argument pool */ + dmn->modify_header_arg_pool_mngr = mlx5dr_arg_pool_mngr_create(dmn); + if (!dmn->modify_header_arg_pool_mngr) + goto free_modify_header_pattern; + + return 0; + +free_modify_header_pattern: + mlx5dr_icm_pool_destroy(dmn->modify_header_ptrn_icm_pool); + return -ENOMEM; +} + +static void dr_domain_destroy_modify_header_resources(struct mlx5dr_domain *dmn) +{ + if (!dr_domain_is_support_modify_hdr_cache(dmn)) + return; + + mlx5dr_icm_pool_destroy(dmn->modify_header_ptrn_icm_pool); + mlx5dr_arg_pool_mngr_destroy(dmn->modify_header_arg_pool_mngr); +} + +static void dr_domain_init_csum_recalc_fts(struct mlx5dr_domain *dmn) +{ + /* Per vport cached FW FT for checksum recalculation, this + * recalculation is needed due to a HW bug in STEv0. + */ + xa_init(&dmn->csum_fts_xa); +} + +static void dr_domain_uninit_csum_recalc_fts(struct mlx5dr_domain *dmn) +{ + struct mlx5dr_fw_recalc_cs_ft *recalc_cs_ft; + unsigned long i; + + xa_for_each(&dmn->csum_fts_xa, i, recalc_cs_ft) { + if (recalc_cs_ft) + mlx5dr_fw_destroy_recalc_cs_ft(dmn, recalc_cs_ft); + } + + xa_destroy(&dmn->csum_fts_xa); +} + +int mlx5dr_domain_get_recalc_cs_ft_addr(struct mlx5dr_domain *dmn, + u16 vport_num, + u64 *rx_icm_addr) +{ + struct mlx5dr_fw_recalc_cs_ft *recalc_cs_ft; + int ret; + + recalc_cs_ft = xa_load(&dmn->csum_fts_xa, vport_num); + if (!recalc_cs_ft) { + /* Table hasn't been created yet */ + recalc_cs_ft = mlx5dr_fw_create_recalc_cs_ft(dmn, vport_num); + if (!recalc_cs_ft) + return -EINVAL; + + ret = xa_err(xa_store(&dmn->csum_fts_xa, vport_num, + recalc_cs_ft, GFP_KERNEL)); + if (ret) + return ret; + } + + *rx_icm_addr = recalc_cs_ft->rx_icm_addr; + + return 0; +} + +static bool dr_domain_check_hw_basic_requirement_caps(struct mlx5dr_domain *dmn) +{ + if (dmn->info.caps.sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX && + !dr_domain_is_support_modify_hdr_cache(dmn)) { + return false; + } + + return true; +} + +static bool dr_domain_is_supp_sw_steering(struct mlx5dr_domain *dmn) +{ + if (!dr_domain_check_hw_basic_requirement_caps(dmn)) + return false; + + switch (dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + return DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, rx); + case MLX5DR_DOMAIN_TYPE_NIC_TX: + return DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, tx); + case MLX5DR_DOMAIN_TYPE_FDB: + return DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, fdb); + default: + return false; + } +} + +static int dr_domain_init_mem_resources(struct mlx5dr_domain *dmn) +{ + int ret; + + dmn->chunks_kmem_cache = kmem_cache_create("mlx5_dr_chunks", + sizeof(struct mlx5dr_icm_chunk), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dmn->chunks_kmem_cache) { + mlx5dr_err(dmn, "Couldn't create chunks kmem_cache\n"); + return -ENOMEM; + } + + dmn->htbls_kmem_cache = 
kmem_cache_create("mlx5_dr_htbls", + sizeof(struct mlx5dr_ste_htbl), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dmn->htbls_kmem_cache) { + mlx5dr_err(dmn, "Couldn't create hash tables kmem_cache\n"); + ret = -ENOMEM; + goto free_chunks_kmem_cache; + } + + dmn->ste_icm_pool = mlx5dr_icm_pool_create(dmn, DR_ICM_TYPE_STE); + if (!dmn->ste_icm_pool) { + mlx5dr_err(dmn, "Couldn't get icm memory\n"); + ret = -ENOMEM; + goto free_htbls_kmem_cache; + } + + dmn->action_icm_pool = mlx5dr_icm_pool_create(dmn, DR_ICM_TYPE_MODIFY_ACTION); + if (!dmn->action_icm_pool) { + mlx5dr_err(dmn, "Couldn't get action icm memory\n"); + ret = -ENOMEM; + goto free_ste_icm_pool; + } + + ret = mlx5dr_send_info_pool_create(dmn); + if (ret) { + mlx5dr_err(dmn, "Couldn't create send info pool\n"); + goto free_action_icm_pool; + } + + ret = dr_domain_init_modify_header_resources(dmn); + if (ret) { + mlx5dr_err(dmn, "Couldn't create modify-header-resources\n"); + goto free_send_info_pool; + } + + return 0; + +free_send_info_pool: + mlx5dr_send_info_pool_destroy(dmn); +free_action_icm_pool: + mlx5dr_icm_pool_destroy(dmn->action_icm_pool); +free_ste_icm_pool: + mlx5dr_icm_pool_destroy(dmn->ste_icm_pool); +free_htbls_kmem_cache: + kmem_cache_destroy(dmn->htbls_kmem_cache); +free_chunks_kmem_cache: + kmem_cache_destroy(dmn->chunks_kmem_cache); + + return ret; +} + +static void dr_domain_uninit_mem_resources(struct mlx5dr_domain *dmn) +{ + dr_domain_destroy_modify_header_resources(dmn); + mlx5dr_send_info_pool_destroy(dmn); + mlx5dr_icm_pool_destroy(dmn->action_icm_pool); + mlx5dr_icm_pool_destroy(dmn->ste_icm_pool); + kmem_cache_destroy(dmn->htbls_kmem_cache); + kmem_cache_destroy(dmn->chunks_kmem_cache); +} + +static int dr_domain_init_resources(struct mlx5dr_domain *dmn) +{ + int ret; + + dmn->ste_ctx = mlx5dr_ste_get_ctx(dmn->info.caps.sw_format_ver); + if (!dmn->ste_ctx) { + mlx5dr_err(dmn, "SW Steering on this device is unsupported\n"); + return -EOPNOTSUPP; + } + + ret = mlx5_core_alloc_pd(dmn->mdev, &dmn->pdn); + if (ret) { + mlx5dr_err(dmn, "Couldn't allocate PD, ret: %d", ret); + return ret; + } + + dmn->uar = mlx5_get_uars_page(dmn->mdev); + if (IS_ERR(dmn->uar)) { + mlx5dr_err(dmn, "Couldn't allocate UAR\n"); + ret = PTR_ERR(dmn->uar); + goto clean_pd; + } + + ret = dr_domain_init_mem_resources(dmn); + if (ret) { + mlx5dr_err(dmn, "Couldn't create domain memory resources\n"); + goto clean_uar; + } + + ret = mlx5dr_send_ring_alloc(dmn); + if (ret) { + mlx5dr_err(dmn, "Couldn't create send-ring\n"); + goto clean_mem_resources; + } + + return 0; + +clean_mem_resources: + dr_domain_uninit_mem_resources(dmn); +clean_uar: + mlx5_put_uars_page(dmn->mdev, dmn->uar); +clean_pd: + mlx5_core_dealloc_pd(dmn->mdev, dmn->pdn); + + return ret; +} + +static void dr_domain_uninit_resources(struct mlx5dr_domain *dmn) +{ + mlx5dr_send_ring_free(dmn, dmn->send_ring); + dr_domain_uninit_mem_resources(dmn); + mlx5_put_uars_page(dmn->mdev, dmn->uar); + mlx5_core_dealloc_pd(dmn->mdev, dmn->pdn); +} + +static void dr_domain_fill_uplink_caps(struct mlx5dr_domain *dmn, + struct mlx5dr_cmd_vport_cap *uplink_vport) +{ + struct mlx5dr_esw_caps *esw_caps = &dmn->info.caps.esw_caps; + + uplink_vport->num = MLX5_VPORT_UPLINK; + uplink_vport->icm_address_rx = esw_caps->uplink_icm_address_rx; + uplink_vport->icm_address_tx = esw_caps->uplink_icm_address_tx; + uplink_vport->vport_gvmi = 0; + uplink_vport->vhca_gvmi = dmn->info.caps.gvmi; +} + +static int dr_domain_query_vport(struct mlx5dr_domain *dmn, + u16 vport_number, + bool other_vport, + struct 
mlx5dr_cmd_vport_cap *vport_caps) +{ + int ret; + + ret = mlx5dr_cmd_query_esw_vport_context(dmn->mdev, + other_vport, + vport_number, + &vport_caps->icm_address_rx, + &vport_caps->icm_address_tx); + if (ret) + return ret; + + ret = mlx5dr_cmd_query_gvmi(dmn->mdev, + other_vport, + vport_number, + &vport_caps->vport_gvmi); + if (ret) + return ret; + + vport_caps->num = vport_number; + vport_caps->vhca_gvmi = dmn->info.caps.gvmi; + + return 0; +} + +static int dr_domain_query_esw_mngr(struct mlx5dr_domain *dmn) +{ + return dr_domain_query_vport(dmn, 0, false, + &dmn->info.caps.vports.esw_manager_caps); +} + +static void dr_domain_query_uplink(struct mlx5dr_domain *dmn) +{ + dr_domain_fill_uplink_caps(dmn, &dmn->info.caps.vports.uplink_caps); +} + +static struct mlx5dr_cmd_vport_cap * +dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport) +{ + struct mlx5dr_cmd_caps *caps = &dmn->info.caps; + struct mlx5dr_cmd_vport_cap *vport_caps; + int ret; + + vport_caps = kvzalloc(sizeof(*vport_caps), GFP_KERNEL); + if (!vport_caps) + return NULL; + + ret = dr_domain_query_vport(dmn, vport, true, vport_caps); + if (ret) { + kvfree(vport_caps); + return NULL; + } + + ret = xa_insert(&caps->vports.vports_caps_xa, vport, + vport_caps, GFP_KERNEL); + if (ret) { + mlx5dr_dbg(dmn, "Couldn't insert new vport into xarray (%d)\n", ret); + kvfree(vport_caps); + return ERR_PTR(ret); + } + + return vport_caps; +} + +static bool dr_domain_is_esw_mgr_vport(struct mlx5dr_domain *dmn, u16 vport) +{ + struct mlx5dr_cmd_caps *caps = &dmn->info.caps; + + return (caps->is_ecpf && vport == MLX5_VPORT_ECPF) || + (!caps->is_ecpf && vport == 0); +} + +struct mlx5dr_cmd_vport_cap * +mlx5dr_domain_get_vport_cap(struct mlx5dr_domain *dmn, u16 vport) +{ + struct mlx5dr_cmd_caps *caps = &dmn->info.caps; + struct mlx5dr_cmd_vport_cap *vport_caps; + + if (dr_domain_is_esw_mgr_vport(dmn, vport)) + return &caps->vports.esw_manager_caps; + + if (vport == MLX5_VPORT_UPLINK) + return &caps->vports.uplink_caps; + +vport_load: + vport_caps = xa_load(&caps->vports.vports_caps_xa, vport); + if (vport_caps) + return vport_caps; + + vport_caps = dr_domain_add_vport_cap(dmn, vport); + if (PTR_ERR(vport_caps) == -EBUSY) + /* caps were already stored by another thread */ + goto vport_load; + + return vport_caps; +} + +static void dr_domain_clear_vports(struct mlx5dr_domain *dmn) +{ + struct mlx5dr_cmd_vport_cap *vport_caps; + unsigned long i; + + xa_for_each(&dmn->info.caps.vports.vports_caps_xa, i, vport_caps) { + vport_caps = xa_erase(&dmn->info.caps.vports.vports_caps_xa, i); + kvfree(vport_caps); + } +} + +static int dr_domain_query_fdb_caps(struct mlx5_core_dev *mdev, + struct mlx5dr_domain *dmn) +{ + int ret; + + if (!dmn->info.caps.eswitch_manager) + return -EOPNOTSUPP; + + ret = mlx5dr_cmd_query_esw_caps(mdev, &dmn->info.caps.esw_caps); + if (ret) + return ret; + + dmn->info.caps.fdb_sw_owner = dmn->info.caps.esw_caps.sw_owner; + dmn->info.caps.fdb_sw_owner_v2 = dmn->info.caps.esw_caps.sw_owner_v2; + dmn->info.caps.esw_rx_drop_address = dmn->info.caps.esw_caps.drop_icm_address_rx; + dmn->info.caps.esw_tx_drop_address = dmn->info.caps.esw_caps.drop_icm_address_tx; + + xa_init(&dmn->info.caps.vports.vports_caps_xa); + + /* Query eswitch manager and uplink vports only. Rest of the + * vports (vport 0, VFs and SFs) will be queried dynamically. 
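+	 * They are cached in vports_caps_xa on first lookup via
+	 * mlx5dr_domain_get_vport_cap().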
+ */ + + ret = dr_domain_query_esw_mngr(dmn); + if (ret) { + mlx5dr_err(dmn, "Failed to query eswitch manager vport caps (err: %d)", ret); + goto free_vports_caps_xa; + } + + dr_domain_query_uplink(dmn); + + return 0; + +free_vports_caps_xa: + xa_destroy(&dmn->info.caps.vports.vports_caps_xa); + + return ret; +} + +static int dr_domain_caps_init(struct mlx5_core_dev *mdev, + struct mlx5dr_domain *dmn) +{ + struct mlx5dr_cmd_vport_cap *vport_cap; + int ret; + + if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) { + mlx5dr_err(dmn, "Failed to allocate domain, bad link type\n"); + return -EOPNOTSUPP; + } + + ret = mlx5dr_cmd_query_device(mdev, &dmn->info.caps); + if (ret) + return ret; + + ret = dr_domain_query_fdb_caps(mdev, dmn); + if (ret) + return ret; + + if (!dr_domain_is_supp_sw_steering(dmn)) + return -ENOTSUPP; + + switch (dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + dmn->info.rx.type = DR_DOMAIN_NIC_TYPE_RX; + dmn->info.rx.default_icm_addr = dmn->info.caps.nic_rx_drop_address; + dmn->info.rx.drop_icm_addr = dmn->info.caps.nic_rx_drop_address; + break; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + dmn->info.tx.type = DR_DOMAIN_NIC_TYPE_TX; + dmn->info.tx.default_icm_addr = dmn->info.caps.nic_tx_allow_address; + dmn->info.tx.drop_icm_addr = dmn->info.caps.nic_tx_drop_address; + break; + case MLX5DR_DOMAIN_TYPE_FDB: + dmn->info.rx.type = DR_DOMAIN_NIC_TYPE_RX; + dmn->info.tx.type = DR_DOMAIN_NIC_TYPE_TX; + vport_cap = &dmn->info.caps.vports.esw_manager_caps; + + dmn->info.tx.default_icm_addr = vport_cap->icm_address_tx; + dmn->info.rx.default_icm_addr = vport_cap->icm_address_rx; + dmn->info.rx.drop_icm_addr = dmn->info.caps.esw_rx_drop_address; + dmn->info.tx.drop_icm_addr = dmn->info.caps.esw_tx_drop_address; + break; + default: + mlx5dr_err(dmn, "Invalid domain\n"); + ret = -EINVAL; + break; + } + + dmn->info.supp_sw_steering = true; + return ret; +} + +static void dr_domain_caps_uninit(struct mlx5dr_domain *dmn) +{ + dr_domain_clear_vports(dmn); + xa_destroy(&dmn->info.caps.vports.vports_caps_xa); +} + +struct mlx5dr_domain * +mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type) +{ + struct mlx5dr_domain *dmn; + int ret; + + if (type > MLX5DR_DOMAIN_TYPE_FDB) + return NULL; + + dmn = kzalloc(sizeof(*dmn), GFP_KERNEL); + if (!dmn) + return NULL; + + dmn->mdev = mdev; + dmn->type = type; + refcount_set(&dmn->refcount, 1); + mutex_init(&dmn->info.rx.mutex); + mutex_init(&dmn->info.tx.mutex); + mutex_init(&dmn->modify_hdr_mutex); + + if (dr_domain_caps_init(mdev, dmn)) { + mlx5dr_err(dmn, "Failed init domain, no caps\n"); + goto free_domain; + } + + dmn->info.max_log_action_icm_sz = DR_CHUNK_SIZE_4K; + dmn->info.max_log_sw_icm_sz = min_t(u32, DR_CHUNK_SIZE_1024K, + dmn->info.caps.log_icm_size); + dmn->info.max_log_modify_hdr_pattern_icm_sz = + min_t(u32, DR_CHUNK_SIZE_4K, + dmn->info.caps.log_modify_pattern_icm_size); + + if (!dmn->info.supp_sw_steering) { + mlx5dr_err(dmn, "SW steering is not supported\n"); + goto uninit_caps; + } + + /* Allocate resources */ + ret = dr_domain_init_resources(dmn); + if (ret) { + mlx5dr_err(dmn, "Failed init domain resources\n"); + goto uninit_caps; + } + + dr_domain_init_csum_recalc_fts(dmn); + mlx5dr_dbg_init_dump(dmn); + INIT_LIST_HEAD(&dmn->modify_hdr_list); + + return dmn; + +uninit_caps: + dr_domain_caps_uninit(dmn); +free_domain: + kfree(dmn); + return NULL; +} + +/* Assure synchronization of the device steering tables with updates made by SW + * insertion. 
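+ * MLX5DR_DOMAIN_SYNC_FLAGS_SW drains the SW send ring, while
+ * MLX5DR_DOMAIN_SYNC_FLAGS_HW issues the SYNC_STEERING command to the device.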
+ */ +int mlx5dr_domain_sync(struct mlx5dr_domain *dmn, u32 flags) +{ + int ret = 0; + + if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_SW) { + mlx5dr_domain_lock(dmn); + ret = mlx5dr_send_ring_force_drain(dmn); + mlx5dr_domain_unlock(dmn); + if (ret) { + mlx5dr_err(dmn, "Force drain failed flags: %d, ret: %d\n", + flags, ret); + return ret; + } + } + + if (flags & MLX5DR_DOMAIN_SYNC_FLAGS_HW) + ret = mlx5dr_cmd_sync_steering(dmn->mdev); + + return ret; +} + +int mlx5dr_domain_destroy(struct mlx5dr_domain *dmn) +{ + if (WARN_ON_ONCE(refcount_read(&dmn->refcount) > 1)) + return -EBUSY; + + /* make sure resources are not used by the hardware */ + mlx5dr_cmd_sync_steering(dmn->mdev); + mlx5dr_dbg_uninit_dump(dmn); + dr_domain_uninit_csum_recalc_fts(dmn); + dr_domain_uninit_resources(dmn); + dr_domain_caps_uninit(dmn); + mutex_destroy(&dmn->info.tx.mutex); + mutex_destroy(&dmn->info.rx.mutex); + mutex_destroy(&dmn->modify_hdr_mutex); + kfree(dmn); + return 0; +} + +void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, + struct mlx5dr_domain *peer_dmn) +{ + mlx5dr_domain_lock(dmn); + + if (dmn->peer_dmn) + refcount_dec(&dmn->peer_dmn->refcount); + + dmn->peer_dmn = peer_dmn; + + if (dmn->peer_dmn) + refcount_inc(&dmn->peer_dmn->refcount); + + mlx5dr_domain_unlock(dmn); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c new file mode 100644 index 0000000..f05ef0c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_fw.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include +#include "dr_types.h" + +struct mlx5dr_fw_recalc_cs_ft * +mlx5dr_fw_create_recalc_cs_ft(struct mlx5dr_domain *dmn, u16 vport_num) +{ + struct mlx5dr_cmd_create_flow_table_attr ft_attr = {}; + struct mlx5dr_fw_recalc_cs_ft *recalc_cs_ft; + u32 table_id, group_id, modify_hdr_id; + u64 rx_icm_addr, modify_ttl_action; + int ret; + + recalc_cs_ft = kzalloc(sizeof(*recalc_cs_ft), GFP_KERNEL); + if (!recalc_cs_ft) + return NULL; + + ft_attr.table_type = MLX5_FLOW_TABLE_TYPE_FDB; + ft_attr.level = dmn->info.caps.max_ft_level - 1; + ft_attr.term_tbl = true; + + ret = mlx5dr_cmd_create_flow_table(dmn->mdev, + &ft_attr, + &rx_icm_addr, + &table_id); + if (ret) { + mlx5dr_err(dmn, "Failed creating TTL W/A FW flow table %d\n", ret); + goto free_ttl_tbl; + } + + ret = mlx5dr_cmd_create_empty_flow_group(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + table_id, &group_id); + if (ret) { + mlx5dr_err(dmn, "Failed creating TTL W/A FW flow group %d\n", ret); + goto destroy_flow_table; + } + + /* Modify TTL action by adding zero to trigger CS recalculation */ + modify_ttl_action = 0; + MLX5_SET(set_action_in, &modify_ttl_action, action_type, MLX5_ACTION_TYPE_ADD); + MLX5_SET(set_action_in, &modify_ttl_action, field, MLX5_ACTION_IN_FIELD_OUT_IP_TTL); + + ret = mlx5dr_cmd_alloc_modify_header(dmn->mdev, MLX5_FLOW_TABLE_TYPE_FDB, 1, + &modify_ttl_action, + &modify_hdr_id); + if (ret) { + mlx5dr_err(dmn, "Failed modify header TTL %d\n", ret); + goto destroy_flow_group; + } + + ret = mlx5dr_cmd_set_fte_modify_and_vport(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + table_id, group_id, modify_hdr_id, + vport_num); + if (ret) { + mlx5dr_err(dmn, "Failed setting TTL W/A flow table entry %d\n", ret); + goto dealloc_modify_header; + } + + recalc_cs_ft->modify_hdr_id = modify_hdr_id; + recalc_cs_ft->rx_icm_addr = rx_icm_addr; + 
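+	/* keep the FW object IDs for mlx5dr_fw_destroy_recalc_cs_ft() */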
recalc_cs_ft->table_id = table_id; + recalc_cs_ft->group_id = group_id; + + return recalc_cs_ft; + +dealloc_modify_header: + mlx5dr_cmd_dealloc_modify_header(dmn->mdev, modify_hdr_id); +destroy_flow_group: + mlx5dr_cmd_destroy_flow_group(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + table_id, group_id); +destroy_flow_table: + mlx5dr_cmd_destroy_flow_table(dmn->mdev, table_id, MLX5_FLOW_TABLE_TYPE_FDB); +free_ttl_tbl: + kfree(recalc_cs_ft); + return NULL; +} + +void mlx5dr_fw_destroy_recalc_cs_ft(struct mlx5dr_domain *dmn, + struct mlx5dr_fw_recalc_cs_ft *recalc_cs_ft) +{ + mlx5dr_cmd_del_flow_table_entry(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + recalc_cs_ft->table_id); + mlx5dr_cmd_dealloc_modify_header(dmn->mdev, recalc_cs_ft->modify_hdr_id); + mlx5dr_cmd_destroy_flow_group(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + recalc_cs_ft->table_id, + recalc_cs_ft->group_id); + mlx5dr_cmd_destroy_flow_table(dmn->mdev, + recalc_cs_ft->table_id, + MLX5_FLOW_TABLE_TYPE_FDB); + + kfree(recalc_cs_ft); +} + +int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_cmd_flow_destination_hw_info *dest, + int num_dest, + bool reformat_req, + u32 *tbl_id, + u32 *group_id, + bool ignore_flow_level, + u32 flow_source) +{ + struct mlx5dr_cmd_create_flow_table_attr ft_attr = {}; + struct mlx5dr_cmd_fte_info fte_info = {}; + u32 val[MLX5_ST_SZ_DW_MATCH_PARAM] = {}; + struct mlx5dr_cmd_ft_info ft_info = {}; + int ret; + + ft_attr.table_type = MLX5_FLOW_TABLE_TYPE_FDB; + ft_attr.level = min_t(int, dmn->info.caps.max_ft_level - 2, + MLX5_FT_MAX_MULTIPATH_LEVEL); + ft_attr.reformat_en = reformat_req; + ft_attr.decap_en = reformat_req; + + ret = mlx5dr_cmd_create_flow_table(dmn->mdev, &ft_attr, NULL, tbl_id); + if (ret) { + mlx5dr_err(dmn, "Failed creating multi dest FW flow table %d\n", ret); + return ret; + } + + ret = mlx5dr_cmd_create_empty_flow_group(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + *tbl_id, group_id); + if (ret) { + mlx5dr_err(dmn, "Failed creating multi dest FW flow group %d\n", ret); + goto free_flow_table; + } + + ft_info.id = *tbl_id; + ft_info.type = FS_FT_FDB; + fte_info.action.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + fte_info.dests_size = num_dest; + fte_info.val = val; + fte_info.dest_arr = dest; + fte_info.ignore_flow_level = ignore_flow_level; + fte_info.flow_context.flow_source = flow_source; + + ret = mlx5dr_cmd_set_fte(dmn->mdev, 0, 0, &ft_info, *group_id, &fte_info); + if (ret) { + mlx5dr_err(dmn, "Failed setting fte into table %d\n", ret); + goto free_flow_group; + } + + return 0; + +free_flow_group: + mlx5dr_cmd_destroy_flow_group(dmn->mdev, MLX5_FLOW_TABLE_TYPE_FDB, + *tbl_id, *group_id); +free_flow_table: + mlx5dr_cmd_destroy_flow_table(dmn->mdev, *tbl_id, + MLX5_FLOW_TABLE_TYPE_FDB); + return ret; +} + +void mlx5dr_fw_destroy_md_tbl(struct mlx5dr_domain *dmn, + u32 tbl_id, u32 group_id) +{ + mlx5dr_cmd_del_flow_table_entry(dmn->mdev, FS_FT_FDB, tbl_id); + mlx5dr_cmd_destroy_flow_group(dmn->mdev, + MLX5_FLOW_TABLE_TYPE_FDB, + tbl_id, group_id); + mlx5dr_cmd_destroy_flow_table(dmn->mdev, tbl_id, + MLX5_FLOW_TABLE_TYPE_FDB); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c new file mode 100644 index 0000000..1432015 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_icm_pool.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include "dr_types.h" + +#define DR_ICM_MODIFY_HDR_ALIGN_BASE 64 +#define DR_ICM_MODIFY_HDR_GRANULARITY_4K 12 +#define DR_ICM_POOL_HOT_MEMORY_FRACTION 4 + +struct mlx5dr_icm_dm { + u32 obj_id; + enum mlx5_sw_icm_type type; + phys_addr_t addr; + size_t length; +}; + +struct mlx5dr_icm_mr { + u32 mkey; + struct mlx5dr_icm_dm dm; + struct mlx5dr_domain *dmn; + size_t length; + u64 icm_start_addr; +}; + +static int dr_icm_create_dm_mkey(struct mlx5_core_dev *mdev, + u32 pd, u64 length, u64 start_addr, int mode, + u32 *mkey) +{ + u32 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {}; + void *mkc; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, access_mode_1_0, mode); + MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + if (mode == MLX5_MKC_ACCESS_MODE_SW_ICM) { + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + } + + MLX5_SET64(mkc, mkc, len, length); + MLX5_SET(mkc, mkc, pd, pd); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, start_addr); + + return mlx5_core_create_mkey(mdev, mkey, in, inlen); +} + +u64 mlx5dr_icm_pool_get_chunk_mr_addr(struct mlx5dr_icm_chunk *chunk) +{ + u32 offset = mlx5dr_icm_pool_dm_type_to_entry_size(chunk->buddy_mem->pool->icm_type); + + return (u64)offset * chunk->seg; +} + +u32 mlx5dr_icm_pool_get_chunk_rkey(struct mlx5dr_icm_chunk *chunk) +{ + return chunk->buddy_mem->icm_mr->mkey; +} + +u64 mlx5dr_icm_pool_get_chunk_icm_addr(struct mlx5dr_icm_chunk *chunk) +{ + u32 size = mlx5dr_icm_pool_dm_type_to_entry_size(chunk->buddy_mem->pool->icm_type); + + return (u64)chunk->buddy_mem->icm_mr->icm_start_addr + size * chunk->seg; +} + +u32 mlx5dr_icm_pool_get_chunk_byte_size(struct mlx5dr_icm_chunk *chunk) +{ + return mlx5dr_icm_pool_chunk_size_to_byte(chunk->size, + chunk->buddy_mem->pool->icm_type); +} + +u32 mlx5dr_icm_pool_get_chunk_num_of_entries(struct mlx5dr_icm_chunk *chunk) +{ + return mlx5dr_icm_pool_chunk_size_to_entries(chunk->size); +} + +static struct mlx5dr_icm_mr * +dr_icm_pool_mr_create(struct mlx5dr_icm_pool *pool) +{ + struct mlx5_core_dev *mdev = pool->dmn->mdev; + enum mlx5_sw_icm_type dm_type = 0; + struct mlx5dr_icm_mr *icm_mr; + size_t log_align_base = 0; + int err; + + icm_mr = kvzalloc(sizeof(*icm_mr), GFP_KERNEL); + if (!icm_mr) + return NULL; + + icm_mr->dmn = pool->dmn; + + icm_mr->dm.length = mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, + pool->icm_type); + + switch (pool->icm_type) { + case DR_ICM_TYPE_STE: + dm_type = MLX5_SW_ICM_TYPE_STEERING; + log_align_base = ilog2(icm_mr->dm.length); + break; + case DR_ICM_TYPE_MODIFY_ACTION: + dm_type = MLX5_SW_ICM_TYPE_HEADER_MODIFY; + /* Align base is 64B */ + log_align_base = ilog2(DR_ICM_MODIFY_HDR_ALIGN_BASE); + break; + case DR_ICM_TYPE_MODIFY_HDR_PTRN: + dm_type = MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN; + /* Align base is 64B */ + log_align_base = ilog2(DR_ICM_MODIFY_HDR_ALIGN_BASE); + break; + default: + WARN_ON(pool->icm_type); + } + + icm_mr->dm.type = dm_type; + + err = mlx5_dm_sw_icm_alloc(mdev, icm_mr->dm.type, icm_mr->dm.length, + log_align_base, 0, &icm_mr->dm.addr, + &icm_mr->dm.obj_id); + if (err) { + mlx5dr_err(pool->dmn, "Failed to allocate SW ICM memory, err (%d)\n", err); + goto free_icm_mr; + } + + /* Register device memory */ + err = dr_icm_create_dm_mkey(mdev, pool->dmn->pdn, + icm_mr->dm.length, + icm_mr->dm.addr, + MLX5_MKC_ACCESS_MODE_SW_ICM, + &icm_mr->mkey); + if (err) { + 
mlx5dr_err(pool->dmn, "Failed to create SW ICM MKEY, err (%d)\n", err); + goto free_dm; + } + + icm_mr->icm_start_addr = icm_mr->dm.addr; + + if (icm_mr->icm_start_addr & (BIT(log_align_base) - 1)) { + mlx5dr_err(pool->dmn, "Failed to get Aligned ICM mem (asked: %zu)\n", + log_align_base); + goto free_mkey; + } + + return icm_mr; + +free_mkey: + mlx5_core_destroy_mkey(mdev, icm_mr->mkey); +free_dm: + mlx5_dm_sw_icm_dealloc(mdev, icm_mr->dm.type, icm_mr->dm.length, 0, + icm_mr->dm.addr, icm_mr->dm.obj_id); +free_icm_mr: + kvfree(icm_mr); + return NULL; +} + +static void dr_icm_pool_mr_destroy(struct mlx5dr_icm_mr *icm_mr) +{ + struct mlx5_core_dev *mdev = icm_mr->dmn->mdev; + struct mlx5dr_icm_dm *dm = &icm_mr->dm; + + mlx5_core_destroy_mkey(mdev, icm_mr->mkey); + mlx5_dm_sw_icm_dealloc(mdev, dm->type, dm->length, 0, + dm->addr, dm->obj_id); + kvfree(icm_mr); +} + +static int dr_icm_buddy_get_ste_size(struct mlx5dr_icm_buddy_mem *buddy) +{ + /* We support only one type of STE size, both for ConnectX-5 and later + * devices. Once the support for match STE which has a larger tag is + * added (32B instead of 16B), the STE size for devices later than + * ConnectX-5 needs to account for that. + */ + return DR_STE_SIZE_REDUCED; +} + +static void dr_icm_chunk_ste_init(struct mlx5dr_icm_chunk *chunk, int offset) +{ + int num_of_entries = mlx5dr_icm_pool_get_chunk_num_of_entries(chunk); + struct mlx5dr_icm_buddy_mem *buddy = chunk->buddy_mem; + int ste_size = dr_icm_buddy_get_ste_size(buddy); + int index = offset / DR_STE_SIZE; + + chunk->ste_arr = &buddy->ste_arr[index]; + chunk->miss_list = &buddy->miss_list[index]; + chunk->hw_ste_arr = buddy->hw_ste_arr + index * ste_size; + + memset(chunk->hw_ste_arr, 0, num_of_entries * ste_size); + memset(chunk->ste_arr, 0, + num_of_entries * sizeof(chunk->ste_arr[0])); +} + +static int dr_icm_buddy_init_ste_cache(struct mlx5dr_icm_buddy_mem *buddy) +{ + int num_of_entries = + mlx5dr_icm_pool_chunk_size_to_entries(buddy->pool->max_log_chunk_sz); + + buddy->ste_arr = kvcalloc(num_of_entries, + sizeof(struct mlx5dr_ste), GFP_KERNEL); + if (!buddy->ste_arr) + return -ENOMEM; + + /* Preallocate full STE size on non-ConnectX-5 devices since + * we need to support both full and reduced with the same cache. 
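+ * (dr_icm_buddy_get_ste_size() above currently returns DR_STE_SIZE_REDUCED
+ * for every device; see the note in that helper.)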
+ */ + buddy->hw_ste_arr = kvcalloc(num_of_entries, + dr_icm_buddy_get_ste_size(buddy), GFP_KERNEL); + if (!buddy->hw_ste_arr) + goto free_ste_arr; + + buddy->miss_list = kvmalloc(num_of_entries * sizeof(struct list_head), GFP_KERNEL); + if (!buddy->miss_list) + goto free_hw_ste_arr; + + return 0; + +free_hw_ste_arr: + kvfree(buddy->hw_ste_arr); +free_ste_arr: + kvfree(buddy->ste_arr); + return -ENOMEM; +} + +static void dr_icm_buddy_cleanup_ste_cache(struct mlx5dr_icm_buddy_mem *buddy) +{ + kvfree(buddy->ste_arr); + kvfree(buddy->hw_ste_arr); + kvfree(buddy->miss_list); +} + +static int dr_icm_buddy_create(struct mlx5dr_icm_pool *pool) +{ + struct mlx5dr_icm_buddy_mem *buddy; + struct mlx5dr_icm_mr *icm_mr; + + icm_mr = dr_icm_pool_mr_create(pool); + if (!icm_mr) + return -ENOMEM; + + buddy = kvzalloc(sizeof(*buddy), GFP_KERNEL); + if (!buddy) + goto free_mr; + + if (mlx5dr_buddy_init(buddy, pool->max_log_chunk_sz)) + goto err_free_buddy; + + buddy->icm_mr = icm_mr; + buddy->pool = pool; + + if (pool->icm_type == DR_ICM_TYPE_STE) { + /* Reduce allocations by preallocating and reusing the STE structures */ + if (dr_icm_buddy_init_ste_cache(buddy)) + goto err_cleanup_buddy; + } + + /* add it to the -start- of the list in order to search in it first */ + list_add(&buddy->list_node, &pool->buddy_mem_list); + + return 0; + +err_cleanup_buddy: + mlx5dr_buddy_cleanup(buddy); +err_free_buddy: + kvfree(buddy); +free_mr: + dr_icm_pool_mr_destroy(icm_mr); + return -ENOMEM; +} + +static void dr_icm_buddy_destroy(struct mlx5dr_icm_buddy_mem *buddy) +{ + dr_icm_pool_mr_destroy(buddy->icm_mr); + + mlx5dr_buddy_cleanup(buddy); + + if (buddy->pool->icm_type == DR_ICM_TYPE_STE) + dr_icm_buddy_cleanup_ste_cache(buddy); + + kvfree(buddy); +} + +static struct mlx5dr_icm_chunk * +dr_icm_chunk_create(enum mlx5dr_icm_type icm_type, + enum mlx5dr_icm_chunk_size chunk_size, + struct mlx5dr_icm_buddy_mem *buddy_mem_pool, + unsigned int seg) +{ + struct kmem_cache *chunks_cache = buddy_mem_pool->pool->chunks_kmem_cache; + struct mlx5dr_icm_chunk *chunk; + int offset; + + chunk = kmem_cache_alloc(chunks_cache, GFP_KERNEL); + if (!chunk) + return NULL; + + offset = mlx5dr_icm_pool_dm_type_to_entry_size(icm_type) * seg; + + chunk->seg = seg; + chunk->size = chunk_size; + chunk->buddy_mem = buddy_mem_pool; + + if (icm_type == DR_ICM_TYPE_STE) + dr_icm_chunk_ste_init(chunk, offset); + + buddy_mem_pool->used_memory += mlx5dr_icm_pool_get_chunk_byte_size(chunk); + + return chunk; +} + +static bool dr_icm_pool_is_sync_required(struct mlx5dr_icm_pool *pool) +{ + int allow_hot_size; + + /* sync when hot memory reaches half of the pool size */ + allow_hot_size = + mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, + pool->icm_type) / + DR_ICM_POOL_HOT_MEMORY_FRACTION; + + return pool->hot_memory_size > allow_hot_size; +} + +static void dr_icm_pool_clear_hot_chunks_arr(struct mlx5dr_icm_pool *pool) +{ + struct mlx5dr_icm_hot_chunk *hot_chunk; + u32 i, num_entries; + + for (i = 0; i < pool->hot_chunks_num; i++) { + hot_chunk = &pool->hot_chunks_arr[i]; + num_entries = mlx5dr_icm_pool_chunk_size_to_entries(hot_chunk->size); + mlx5dr_buddy_free_mem(hot_chunk->buddy_mem, + hot_chunk->seg, ilog2(num_entries)); + hot_chunk->buddy_mem->used_memory -= + mlx5dr_icm_pool_chunk_size_to_byte(hot_chunk->size, + pool->icm_type); + } + + pool->hot_chunks_num = 0; + pool->hot_memory_size = 0; +} + +static int dr_icm_pool_sync_all_buddy_pools(struct mlx5dr_icm_pool *pool) +{ + struct mlx5dr_icm_buddy_mem *buddy, *tmp_buddy; + int err; + 
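+ /* Sync the device first so it no longer references the hot chunks' ICM,
+ * then return the hot chunks to their buddy allocators and free any
+ * STE buddy that is left completely unused.
+ */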
+ err = mlx5dr_cmd_sync_steering(pool->dmn->mdev); + if (err) { + mlx5dr_err(pool->dmn, "Failed to sync to HW (err: %d)\n", err); + return err; + } + + dr_icm_pool_clear_hot_chunks_arr(pool); + + list_for_each_entry_safe(buddy, tmp_buddy, &pool->buddy_mem_list, list_node) { + if (!buddy->used_memory && pool->icm_type == DR_ICM_TYPE_STE) + dr_icm_buddy_destroy(buddy); + } + + return 0; +} + +static int dr_icm_handle_buddies_get_mem(struct mlx5dr_icm_pool *pool, + enum mlx5dr_icm_chunk_size chunk_size, + struct mlx5dr_icm_buddy_mem **buddy, + unsigned int *seg) +{ + struct mlx5dr_icm_buddy_mem *buddy_mem_pool; + bool new_mem = false; + int err; + +alloc_buddy_mem: + /* find the next free place from the buddy list */ + list_for_each_entry(buddy_mem_pool, &pool->buddy_mem_list, list_node) { + err = mlx5dr_buddy_alloc_mem(buddy_mem_pool, + chunk_size, seg); + if (!err) + goto found; + + if (WARN_ON(new_mem)) { + /* We have new memory pool, first in the list */ + mlx5dr_err(pool->dmn, + "No memory for order: %d\n", + chunk_size); + goto out; + } + } + + /* no more available allocators in that pool, create new */ + err = dr_icm_buddy_create(pool); + if (err) { + mlx5dr_err(pool->dmn, + "Failed creating buddy for order %d\n", + chunk_size); + goto out; + } + + /* mark we have new memory, first in list */ + new_mem = true; + goto alloc_buddy_mem; + +found: + *buddy = buddy_mem_pool; +out: + return err; +} + +/* Allocate an ICM chunk, each chunk holds a piece of ICM memory and + * also memory used for HW STE management for optimizations. + */ +struct mlx5dr_icm_chunk * +mlx5dr_icm_alloc_chunk(struct mlx5dr_icm_pool *pool, + enum mlx5dr_icm_chunk_size chunk_size) +{ + struct mlx5dr_icm_chunk *chunk = NULL; + struct mlx5dr_icm_buddy_mem *buddy; + unsigned int seg; + int ret; + + if (chunk_size > pool->max_log_chunk_sz) + return NULL; + + mutex_lock(&pool->mutex); + /* find mem, get back the relevant buddy pool and seg in that mem */ + ret = dr_icm_handle_buddies_get_mem(pool, chunk_size, &buddy, &seg); + if (ret) + goto out; + + chunk = dr_icm_chunk_create(pool->icm_type, chunk_size, buddy, seg); + if (!chunk) + goto out_err; + + goto out; + +out_err: + mlx5dr_buddy_free_mem(buddy, seg, chunk_size); +out: + mutex_unlock(&pool->mutex); + return chunk; +} + +void mlx5dr_icm_free_chunk(struct mlx5dr_icm_chunk *chunk) +{ + struct mlx5dr_icm_buddy_mem *buddy = chunk->buddy_mem; + struct mlx5dr_icm_pool *pool = buddy->pool; + struct mlx5dr_icm_hot_chunk *hot_chunk; + struct kmem_cache *chunks_cache; + + chunks_cache = pool->chunks_kmem_cache; + + /* move the chunk to the waiting chunks array, AKA "hot" memory */ + mutex_lock(&pool->mutex); + + pool->hot_memory_size += mlx5dr_icm_pool_get_chunk_byte_size(chunk); + + hot_chunk = &pool->hot_chunks_arr[pool->hot_chunks_num++]; + hot_chunk->buddy_mem = chunk->buddy_mem; + hot_chunk->seg = chunk->seg; + hot_chunk->size = chunk->size; + + kmem_cache_free(chunks_cache, chunk); + + /* Check if we have chunks that are waiting for sync-ste */ + if (dr_icm_pool_is_sync_required(pool)) + dr_icm_pool_sync_all_buddy_pools(pool); + + mutex_unlock(&pool->mutex); +} + +struct mlx5dr_icm_pool *mlx5dr_icm_pool_create(struct mlx5dr_domain *dmn, + enum mlx5dr_icm_type icm_type) +{ + u32 num_of_chunks, entry_size, max_hot_size; + struct mlx5dr_icm_pool *pool = NULL; + + pool = kvzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + pool->dmn = dmn; + pool->icm_type = icm_type; + pool->chunks_kmem_cache = dmn->chunks_kmem_cache; + + 
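 /* A pool serves a single ICM type: allocations come from a list of
+ * buddy allocators, and freed chunks are parked in hot_chunks_arr
+ * until the next device sync makes their ICM reusable.
+ */
+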
INIT_LIST_HEAD(&pool->buddy_mem_list); + mutex_init(&pool->mutex); + + switch (icm_type) { + case DR_ICM_TYPE_STE: + pool->max_log_chunk_sz = dmn->info.max_log_sw_icm_sz; + break; + case DR_ICM_TYPE_MODIFY_ACTION: + pool->max_log_chunk_sz = dmn->info.max_log_action_icm_sz; + break; + case DR_ICM_TYPE_MODIFY_HDR_PTRN: + pool->max_log_chunk_sz = dmn->info.max_log_modify_hdr_pattern_icm_sz; + break; + default: + WARN_ON(icm_type); + } + + entry_size = mlx5dr_icm_pool_dm_type_to_entry_size(pool->icm_type); + + max_hot_size = mlx5dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, + pool->icm_type) / + DR_ICM_POOL_HOT_MEMORY_FRACTION; + + num_of_chunks = DIV_ROUND_UP(max_hot_size, entry_size) + 1; + + pool->hot_chunks_arr = kvcalloc(num_of_chunks, + sizeof(struct mlx5dr_icm_hot_chunk), + GFP_KERNEL); + if (!pool->hot_chunks_arr) + goto free_pool; + + return pool; + +free_pool: + kvfree(pool); + return NULL; +} + +void mlx5dr_icm_pool_destroy(struct mlx5dr_icm_pool *pool) +{ + struct mlx5dr_icm_buddy_mem *buddy, *tmp_buddy; + + dr_icm_pool_clear_hot_chunks_arr(pool); + + list_for_each_entry_safe(buddy, tmp_buddy, &pool->buddy_mem_list, list_node) + dr_icm_buddy_destroy(buddy); + + kvfree(pool->hot_chunks_arr); + mutex_destroy(&pool->mutex); + kvfree(pool); +} + +struct dr_arg_pool { + enum mlx5dr_arg_chunk_size log_chunk_size; + struct mlx5dr_domain *dmn; + struct list_head free_list; + struct mutex mutex; /* protect arg pool */ +}; + +struct mlx5dr_arg_pool_mngr { + struct mlx5dr_domain *dmn; + struct dr_arg_pool *pools[DR_ARG_CHUNK_SIZE_MAX]; +}; + +static int dr_arg_add_new_objects_to_pool(struct dr_arg_pool *pool) +{ + struct mlx5dr_arg_object *arg_obj, *tmp_arg; + struct list_head cur_list; + u16 object_range; + int num_of_objects; + u32 obj_id = 0; + int ret; + int i; + + INIT_LIST_HEAD(&cur_list); + + object_range = pool->dmn->info.caps.log_header_modify_argument_granularity; + + object_range = + max_t(u32, + pool->dmn->info.caps.log_header_modify_argument_granularity, + DR_ICM_MODIFY_HDR_GRANULARITY_4K); + object_range = + min_t(u32, + pool->dmn->info.caps.log_header_modify_argument_max_alloc, + object_range); + + if (pool->log_chunk_size > object_range) { + mlx5dr_err(pool->dmn, + "Required chunk size (%d) is not supported\n", + pool->log_chunk_size); + return -ENOMEM; + } + + num_of_objects = (1 << (object_range - pool->log_chunk_size)); + /* Only one general object per range */ + ret = mlx5dr_cmd_create_modify_header_arg(pool->dmn->mdev, + object_range, + pool->dmn->pdn, + &obj_id); + if (ret) { + mlx5dr_err(pool->dmn, + "Failed allocating object with range: %d:\n", + object_range); + return ret; + } + + for (i = 0; i < num_of_objects; i++) { + arg_obj = kvzalloc(sizeof(*arg_obj), GFP_KERNEL); + if (!arg_obj) + goto clean_arg_obj; + + arg_obj->log_chunk_size = pool->log_chunk_size; + + list_add_tail(&arg_obj->list_node, &cur_list); + + arg_obj->obj_id = obj_id; + arg_obj->obj_offset = i * (1 << pool->log_chunk_size); + } + + list_splice_tail_init(&cur_list, &pool->free_list); + return 0; + +clean_arg_obj: + list_for_each_entry_safe(arg_obj, tmp_arg, &cur_list, list_node) { + list_del(&arg_obj->list_node); + kvfree(arg_obj); + } + mlx5dr_cmd_destroy_modify_header_arg(pool->dmn->mdev, obj_id); + return -ENOMEM; +} + +static struct dr_arg_pool *dr_arg_pool_create(struct mlx5dr_domain *dmn, + enum mlx5dr_arg_chunk_size chunk_size) +{ + struct dr_arg_pool *pool; + + pool = kvzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + pool->dmn = dmn; + + 
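 /* Each arg pool hands out modify-header argument objects of a single
+ * log_chunk_size; dr_arg_add_new_objects_to_pool() carves them out of
+ * one larger FW argument range allocation.
+ */
+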
INIT_LIST_HEAD(&pool->free_list); + mutex_init(&pool->mutex); + + pool->log_chunk_size = chunk_size; + if (dr_arg_add_new_objects_to_pool(pool)) + goto free_pool; + + return pool; + +free_pool: + kvfree(pool); + + return NULL; +} + +static void dr_arg_pool_destroy(struct dr_arg_pool *pool) +{ + struct mlx5dr_arg_object *tmp_arg; + struct mlx5dr_arg_object *arg_obj; + + list_for_each_entry_safe(arg_obj, tmp_arg, &pool->free_list, list_node) { + list_del(&arg_obj->list_node); + if (!arg_obj->obj_offset) /* the first in range */ + mlx5dr_cmd_destroy_modify_header_arg(pool->dmn->mdev, + arg_obj->obj_id); + kvfree(arg_obj); + } + + mutex_destroy(&pool->mutex); + kvfree(pool); +} + +static struct mlx5dr_arg_object *dr_arg_get_obj_from_pool(struct dr_arg_pool *pool) +{ + struct mlx5dr_arg_object *arg_obj = NULL; + int ret; + + mutex_lock(&pool->mutex); + if (list_empty(&pool->free_list)) { + ret = dr_arg_add_new_objects_to_pool(pool); + if (ret) + goto out; + } + + arg_obj = list_first_entry_or_null(&pool->free_list, + struct mlx5dr_arg_object, + list_node); + WARN(!arg_obj, "couldn't get dr arg obj from pool"); + + if (arg_obj) + list_del_init(&arg_obj->list_node); + +out: + mutex_unlock(&pool->mutex); + return arg_obj; +} + +static void dr_arg_put_obj_in_pool(struct dr_arg_pool *pool, + struct mlx5dr_arg_object *arg_obj) +{ + mutex_lock(&pool->mutex); + list_add(&arg_obj->list_node, &pool->free_list); + mutex_unlock(&pool->mutex); +} + +void mlx5dr_arg_put_obj(struct mlx5dr_domain *dmn, + struct mlx5dr_arg_object *arg_obj) +{ + return dr_arg_put_obj_in_pool( + dmn->modify_header_arg_pool_mngr->pools[arg_obj->log_chunk_size], + arg_obj); +} + +struct mlx5dr_arg_object *mlx5dr_arg_get_obj(struct mlx5dr_domain *dmn, + enum mlx5dr_arg_chunk_size size) +{ + if (size >= DR_ARG_CHUNK_SIZE_MAX) + return NULL; + + return dr_arg_get_obj_from_pool(dmn->modify_header_arg_pool_mngr->pools[size]); +} + +uint32_t mlx5dr_arg_get_object_id(struct mlx5dr_arg_object *arg_obj) +{ + return (arg_obj->obj_id + arg_obj->obj_offset); +} + +struct mlx5dr_arg_pool_mngr *mlx5dr_arg_pool_mngr_create(struct mlx5dr_domain *dmn) +{ + struct mlx5dr_arg_pool_mngr *pool_mngr; + int i; + + pool_mngr = kvzalloc(sizeof(*pool_mngr), GFP_KERNEL); + if (!pool_mngr) + return NULL; + + pool_mngr->dmn = dmn; + + for (i = 0; i <= DR_ARG_CHUNK_SIZE_MAX - 1; i++) { + pool_mngr->pools[i] = dr_arg_pool_create(dmn, i); + if (!pool_mngr->pools[i]) + goto clean_pools; + } + + return pool_mngr; + +clean_pools: + for (i--; i >= 0; i--) + dr_arg_pool_destroy(pool_mngr->pools[i]); + kvfree(pool_mngr); + return NULL; +} + +void mlx5dr_arg_pool_mngr_destroy(struct mlx5dr_arg_pool_mngr *pool_mngr) +{ + int i; + + for (i = 0; i < DR_ARG_CHUNK_SIZE_MAX; i++) + dr_arg_pool_destroy(pool_mngr->pools[i]); + + kvfree(pool_mngr); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c new file mode 100644 index 0000000..0726848 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -0,0 +1,1108 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include "dr_types.h" + +static bool dr_mask_is_smac_set(struct mlx5dr_match_spec *spec) +{ + return (spec->smac_47_16 || spec->smac_15_0); +} + +static bool dr_mask_is_dmac_set(struct mlx5dr_match_spec *spec) +{ + return (spec->dmac_47_16 || spec->dmac_15_0); +} + +static bool dr_mask_is_l3_base_set(struct mlx5dr_match_spec *spec) +{ + return (spec->ip_protocol || spec->frag || spec->tcp_flags || + spec->ip_ecn || spec->ip_dscp); +} + +static bool dr_mask_is_tcp_udp_base_set(struct mlx5dr_match_spec *spec) +{ + return (spec->tcp_sport || spec->tcp_dport || + spec->udp_sport || spec->udp_dport); +} + +static bool dr_mask_is_ipv4_set(struct mlx5dr_match_spec *spec) +{ + return (spec->dst_ip_31_0 || spec->src_ip_31_0); +} + +static bool dr_mask_is_ipv4_5_tuple_set(struct mlx5dr_match_spec *spec) +{ + return (dr_mask_is_l3_base_set(spec) || + dr_mask_is_tcp_udp_base_set(spec) || + dr_mask_is_ipv4_set(spec)); +} + +static bool dr_mask_is_eth_l2_tnl_set(struct mlx5dr_match_misc *misc) +{ + return misc->vxlan_vni; +} + +static bool dr_mask_is_ttl_set(struct mlx5dr_match_spec *spec) +{ + return spec->ttl_hoplimit; +} + +static bool dr_mask_is_ipv4_ihl_set(struct mlx5dr_match_spec *spec) +{ + return spec->ipv4_ihl; +} + +#define DR_MASK_IS_L2_DST(_spec, _misc, _inner_outer) (_spec.first_vid || \ + (_spec).first_cfi || (_spec).first_prio || (_spec).cvlan_tag || \ + (_spec).svlan_tag || (_spec).dmac_47_16 || (_spec).dmac_15_0 || \ + (_spec).ethertype || (_spec).ip_version || \ + (_misc)._inner_outer##_second_vid || \ + (_misc)._inner_outer##_second_cfi || \ + (_misc)._inner_outer##_second_prio || \ + (_misc)._inner_outer##_second_cvlan_tag || \ + (_misc)._inner_outer##_second_svlan_tag) + +#define DR_MASK_IS_ETH_L4_SET(_spec, _misc, _inner_outer) ( \ + dr_mask_is_l3_base_set(&(_spec)) || \ + dr_mask_is_tcp_udp_base_set(&(_spec)) || \ + dr_mask_is_ttl_set(&(_spec)) || \ + (_misc)._inner_outer##_ipv6_flow_label) + +#define DR_MASK_IS_ETH_L4_MISC_SET(_misc3, _inner_outer) ( \ + (_misc3)._inner_outer##_tcp_seq_num || \ + (_misc3)._inner_outer##_tcp_ack_num) + +#define DR_MASK_IS_FIRST_MPLS_SET(_misc2, _inner_outer) ( \ + (_misc2)._inner_outer##_first_mpls_label || \ + (_misc2)._inner_outer##_first_mpls_exp || \ + (_misc2)._inner_outer##_first_mpls_s_bos || \ + (_misc2)._inner_outer##_first_mpls_ttl) + +static bool dr_mask_is_tnl_gre_set(struct mlx5dr_match_misc *misc) +{ + return (misc->gre_key_h || misc->gre_key_l || + misc->gre_protocol || misc->gre_c_present || + misc->gre_k_present || misc->gre_s_present); +} + +#define DR_MASK_IS_OUTER_MPLS_OVER_GRE_SET(_misc) (\ + (_misc)->outer_first_mpls_over_gre_label || \ + (_misc)->outer_first_mpls_over_gre_exp || \ + (_misc)->outer_first_mpls_over_gre_s_bos || \ + (_misc)->outer_first_mpls_over_gre_ttl) + +#define DR_MASK_IS_OUTER_MPLS_OVER_UDP_SET(_misc) (\ + (_misc)->outer_first_mpls_over_udp_label || \ + (_misc)->outer_first_mpls_over_udp_exp || \ + (_misc)->outer_first_mpls_over_udp_s_bos || \ + (_misc)->outer_first_mpls_over_udp_ttl) + +static bool +dr_mask_is_vxlan_gpe_set(struct mlx5dr_match_misc3 *misc3) +{ + return (misc3->outer_vxlan_gpe_vni || + misc3->outer_vxlan_gpe_next_protocol || + misc3->outer_vxlan_gpe_flags); +} + +static bool +dr_matcher_supp_vxlan_gpe(struct mlx5dr_cmd_caps *caps) +{ + return (caps->sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX) || + (caps->flex_protocols & MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED); +} + +static bool +dr_mask_is_tnl_vxlan_gpe(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return 
dr_mask_is_vxlan_gpe_set(&mask->misc3) && + dr_matcher_supp_vxlan_gpe(&dmn->info.caps); +} + +static bool dr_mask_is_tnl_geneve_set(struct mlx5dr_match_misc *misc) +{ + return misc->geneve_vni || + misc->geneve_oam || + misc->geneve_protocol_type || + misc->geneve_opt_len; +} + +static bool dr_mask_is_tnl_geneve_tlv_opt(struct mlx5dr_match_misc3 *misc3) +{ + return misc3->geneve_tlv_option_0_data; +} + +static bool +dr_matcher_supp_flex_parser_ok(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_parser_ok_bits_supp; +} + +static bool dr_mask_is_tnl_geneve_tlv_opt_exist_set(struct mlx5dr_match_misc *misc, + struct mlx5dr_domain *dmn) +{ + return dr_matcher_supp_flex_parser_ok(&dmn->info.caps) && + misc->geneve_tlv_option_0_exist; +} + +static bool +dr_matcher_supp_tnl_geneve(struct mlx5dr_cmd_caps *caps) +{ + return (caps->sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX) || + (caps->flex_protocols & MLX5_FLEX_PARSER_GENEVE_ENABLED); +} + +static bool +dr_mask_is_tnl_geneve(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return dr_mask_is_tnl_geneve_set(&mask->misc) && + dr_matcher_supp_tnl_geneve(&dmn->info.caps); +} + +static bool dr_mask_is_tnl_gtpu_set(struct mlx5dr_match_misc3 *misc3) +{ + return misc3->gtpu_msg_flags || misc3->gtpu_msg_type || misc3->gtpu_teid; +} + +static bool dr_matcher_supp_tnl_gtpu(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_ENABLED; +} + +static bool dr_mask_is_tnl_gtpu(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return dr_mask_is_tnl_gtpu_set(&mask->misc3) && + dr_matcher_supp_tnl_gtpu(&dmn->info.caps); +} + +static int dr_matcher_supp_tnl_gtpu_dw_0(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_DW_0_ENABLED; +} + +static bool dr_mask_is_tnl_gtpu_dw_0(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return mask->misc3.gtpu_dw_0 && + dr_matcher_supp_tnl_gtpu_dw_0(&dmn->info.caps); +} + +static int dr_matcher_supp_tnl_gtpu_teid(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_TEID_ENABLED; +} + +static bool dr_mask_is_tnl_gtpu_teid(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return mask->misc3.gtpu_teid && + dr_matcher_supp_tnl_gtpu_teid(&dmn->info.caps); +} + +static int dr_matcher_supp_tnl_gtpu_dw_2(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_DW_2_ENABLED; +} + +static bool dr_mask_is_tnl_gtpu_dw_2(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return mask->misc3.gtpu_dw_2 && + dr_matcher_supp_tnl_gtpu_dw_2(&dmn->info.caps); +} + +static int dr_matcher_supp_tnl_gtpu_first_ext(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_FIRST_EXT_DW_0_ENABLED; +} + +static bool dr_mask_is_tnl_gtpu_first_ext(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return mask->misc3.gtpu_first_ext_dw_0 && + dr_matcher_supp_tnl_gtpu_first_ext(&dmn->info.caps); +} + +static bool dr_mask_is_tnl_gtpu_flex_parser_0(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + struct mlx5dr_cmd_caps *caps = &dmn->info.caps; + + return (dr_is_flex_parser_0_id(caps->flex_parser_id_gtpu_dw_0) && + dr_mask_is_tnl_gtpu_dw_0(mask, dmn)) || + (dr_is_flex_parser_0_id(caps->flex_parser_id_gtpu_teid) && + dr_mask_is_tnl_gtpu_teid(mask, dmn)) || + (dr_is_flex_parser_0_id(caps->flex_parser_id_gtpu_dw_2) && + dr_mask_is_tnl_gtpu_dw_2(mask, dmn)) || + 
(dr_is_flex_parser_0_id(caps->flex_parser_id_gtpu_first_ext_dw_0) && + dr_mask_is_tnl_gtpu_first_ext(mask, dmn)); +} + +static bool dr_mask_is_tnl_gtpu_flex_parser_1(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + struct mlx5dr_cmd_caps *caps = &dmn->info.caps; + + return (dr_is_flex_parser_1_id(caps->flex_parser_id_gtpu_dw_0) && + dr_mask_is_tnl_gtpu_dw_0(mask, dmn)) || + (dr_is_flex_parser_1_id(caps->flex_parser_id_gtpu_teid) && + dr_mask_is_tnl_gtpu_teid(mask, dmn)) || + (dr_is_flex_parser_1_id(caps->flex_parser_id_gtpu_dw_2) && + dr_mask_is_tnl_gtpu_dw_2(mask, dmn)) || + (dr_is_flex_parser_1_id(caps->flex_parser_id_gtpu_first_ext_dw_0) && + dr_mask_is_tnl_gtpu_first_ext(mask, dmn)); +} + +static bool dr_mask_is_tnl_gtpu_any(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return dr_mask_is_tnl_gtpu_flex_parser_0(mask, dmn) || + dr_mask_is_tnl_gtpu_flex_parser_1(mask, dmn) || + dr_mask_is_tnl_gtpu(mask, dmn); +} + +static int dr_matcher_supp_icmp_v4(struct mlx5dr_cmd_caps *caps) +{ + return (caps->sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX) || + (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED); +} + +static int dr_matcher_supp_icmp_v6(struct mlx5dr_cmd_caps *caps) +{ + return (caps->sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX) || + (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V6_ENABLED); +} + +static bool dr_mask_is_icmpv6_set(struct mlx5dr_match_misc3 *misc3) +{ + return (misc3->icmpv6_type || misc3->icmpv6_code || + misc3->icmpv6_header_data); +} + +static bool dr_mask_is_icmp(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + if (DR_MASK_IS_ICMPV4_SET(&mask->misc3)) + return dr_matcher_supp_icmp_v4(&dmn->info.caps); + else if (dr_mask_is_icmpv6_set(&mask->misc3)) + return dr_matcher_supp_icmp_v6(&dmn->info.caps); + + return false; +} + +static bool dr_mask_is_wqe_metadata_set(struct mlx5dr_match_misc2 *misc2) +{ + return misc2->metadata_reg_a; +} + +static bool dr_mask_is_reg_c_0_3_set(struct mlx5dr_match_misc2 *misc2) +{ + return (misc2->metadata_reg_c_0 || misc2->metadata_reg_c_1 || + misc2->metadata_reg_c_2 || misc2->metadata_reg_c_3); +} + +static bool dr_mask_is_reg_c_4_7_set(struct mlx5dr_match_misc2 *misc2) +{ + return (misc2->metadata_reg_c_4 || misc2->metadata_reg_c_5 || + misc2->metadata_reg_c_6 || misc2->metadata_reg_c_7); +} + +static bool dr_mask_is_gvmi_or_qpn_set(struct mlx5dr_match_misc *misc) +{ + return (misc->source_sqn || misc->source_port); +} + +static bool dr_mask_is_flex_parser_id_0_3_set(u32 flex_parser_id, + u32 flex_parser_value) +{ + if (flex_parser_id) + return flex_parser_id <= DR_STE_MAX_FLEX_0_ID; + + /* Using flex_parser 0 means that id is zero, thus value must be set. 
*/ + return flex_parser_value; +} + +static bool dr_mask_is_flex_parser_0_3_set(struct mlx5dr_match_misc4 *misc4) +{ + return (dr_mask_is_flex_parser_id_0_3_set(misc4->prog_sample_field_id_0, + misc4->prog_sample_field_value_0) || + dr_mask_is_flex_parser_id_0_3_set(misc4->prog_sample_field_id_1, + misc4->prog_sample_field_value_1) || + dr_mask_is_flex_parser_id_0_3_set(misc4->prog_sample_field_id_2, + misc4->prog_sample_field_value_2) || + dr_mask_is_flex_parser_id_0_3_set(misc4->prog_sample_field_id_3, + misc4->prog_sample_field_value_3)); +} + +static bool dr_mask_is_flex_parser_id_4_7_set(u32 flex_parser_id) +{ + return flex_parser_id > DR_STE_MAX_FLEX_0_ID && + flex_parser_id <= DR_STE_MAX_FLEX_1_ID; +} + +static bool dr_mask_is_flex_parser_4_7_set(struct mlx5dr_match_misc4 *misc4) +{ + return (dr_mask_is_flex_parser_id_4_7_set(misc4->prog_sample_field_id_0) || + dr_mask_is_flex_parser_id_4_7_set(misc4->prog_sample_field_id_1) || + dr_mask_is_flex_parser_id_4_7_set(misc4->prog_sample_field_id_2) || + dr_mask_is_flex_parser_id_4_7_set(misc4->prog_sample_field_id_3)); +} + +static int dr_matcher_supp_tnl_mpls_over_gre(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED; +} + +static bool dr_mask_is_tnl_mpls_over_gre(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return DR_MASK_IS_OUTER_MPLS_OVER_GRE_SET(&mask->misc2) && + dr_matcher_supp_tnl_mpls_over_gre(&dmn->info.caps); +} + +static int dr_matcher_supp_tnl_mpls_over_udp(struct mlx5dr_cmd_caps *caps) +{ + return caps->flex_protocols & MLX5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED; +} + +static bool dr_mask_is_tnl_mpls_over_udp(struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn) +{ + return DR_MASK_IS_OUTER_MPLS_OVER_UDP_SET(&mask->misc2) && + dr_matcher_supp_tnl_mpls_over_udp(&dmn->info.caps); +} + +static bool dr_mask_is_tnl_header_0_1_set(struct mlx5dr_match_misc5 *misc5) +{ + return misc5->tunnel_header_0 || misc5->tunnel_header_1; +} + +int mlx5dr_matcher_select_builders(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + enum mlx5dr_ipv outer_ipv, + enum mlx5dr_ipv inner_ipv) +{ + nic_matcher->ste_builder = + nic_matcher->ste_builder_arr[outer_ipv][inner_ipv]; + nic_matcher->num_of_builders = + nic_matcher->num_of_builders_arr[outer_ipv][inner_ipv]; + + if (!nic_matcher->num_of_builders) { + mlx5dr_dbg(matcher->tbl->dmn, + "Rule not supported on this matcher due to IP related fields\n"); + return -EINVAL; + } + + return 0; +} + +static int dr_matcher_set_ste_builders(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + enum mlx5dr_ipv outer_ipv, + enum mlx5dr_ipv inner_ipv) +{ + struct mlx5dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx; + struct mlx5dr_match_param mask = {}; + bool allow_empty_match = false; + struct mlx5dr_ste_build *sb; + bool inner, rx; + int idx = 0; + int ret, i; + + sb = nic_matcher->ste_builder_arr[outer_ipv][inner_ipv]; + rx = nic_dmn->type == DR_DOMAIN_NIC_TYPE_RX; + + /* Create a temporary mask to track and clear used mask fields */ + if (matcher->match_criteria & DR_MATCHER_CRITERIA_OUTER) + mask.outer = matcher->mask.outer; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC) + mask.misc = matcher->mask.misc; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_INNER) + mask.inner = matcher->mask.inner; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC2) 
+ mask.misc2 = matcher->mask.misc2; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC3) + mask.misc3 = matcher->mask.misc3; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC4) + mask.misc4 = matcher->mask.misc4; + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC5) + mask.misc5 = matcher->mask.misc5; + + ret = mlx5dr_ste_build_pre_check(dmn, matcher->match_criteria, + &matcher->mask, NULL); + if (ret) + return ret; + + /* Optimize RX pipe by reducing source port match, since + * the FDB RX part is connected only to the wire. + */ + if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB && + rx && mask.misc.source_port) { + mask.misc.source_port = 0; + mask.misc.source_eswitch_owner_vhca_id = 0; + allow_empty_match = true; + } + + /* Outer */ + if (matcher->match_criteria & (DR_MATCHER_CRITERIA_OUTER | + DR_MATCHER_CRITERIA_MISC | + DR_MATCHER_CRITERIA_MISC2 | + DR_MATCHER_CRITERIA_MISC3 | + DR_MATCHER_CRITERIA_MISC5)) { + inner = false; + + if (dr_mask_is_wqe_metadata_set(&mask.misc2)) + mlx5dr_ste_build_general_purpose(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_reg_c_0_3_set(&mask.misc2)) + mlx5dr_ste_build_register_0(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_reg_c_4_7_set(&mask.misc2)) + mlx5dr_ste_build_register_1(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_gvmi_or_qpn_set(&mask.misc) && + (dmn->type == MLX5DR_DOMAIN_TYPE_FDB || + dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX)) { + mlx5dr_ste_build_src_gvmi_qpn(ste_ctx, &sb[idx++], + &mask, dmn, inner, rx); + } + + if (dr_mask_is_smac_set(&mask.outer) && + dr_mask_is_dmac_set(&mask.outer)) { + mlx5dr_ste_build_eth_l2_src_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); + } + + if (dr_mask_is_smac_set(&mask.outer)) + mlx5dr_ste_build_eth_l2_src(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_L2_DST(mask.outer, mask.misc, outer)) + mlx5dr_ste_build_eth_l2_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (outer_ipv == DR_RULE_IPV6) { + if (DR_MASK_IS_DST_IP_SET(&mask.outer)) + mlx5dr_ste_build_eth_l3_ipv6_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_SRC_IP_SET(&mask.outer)) + mlx5dr_ste_build_eth_l3_ipv6_src(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_ETH_L4_SET(mask.outer, mask.misc, outer)) + mlx5dr_ste_build_eth_ipv6_l3_l4(ste_ctx, &sb[idx++], + &mask, inner, rx); + } else { + if (dr_mask_is_ipv4_5_tuple_set(&mask.outer)) + mlx5dr_ste_build_eth_l3_ipv4_5_tuple(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_ttl_set(&mask.outer) || + dr_mask_is_ipv4_ihl_set(&mask.outer)) + mlx5dr_ste_build_eth_l3_ipv4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); + } + + if (dr_mask_is_tnl_vxlan_gpe(&mask, dmn)) + mlx5dr_ste_build_tnl_vxlan_gpe(ste_ctx, &sb[idx++], + &mask, inner, rx); + else if (dr_mask_is_tnl_geneve(&mask, dmn)) { + mlx5dr_ste_build_tnl_geneve(ste_ctx, &sb[idx++], + &mask, inner, rx); + if (dr_mask_is_tnl_geneve_tlv_opt(&mask.misc3)) + mlx5dr_ste_build_tnl_geneve_tlv_opt(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + if (dr_mask_is_tnl_geneve_tlv_opt_exist_set(&mask.misc, dmn)) + mlx5dr_ste_build_tnl_geneve_tlv_opt_exist(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + } else if (dr_mask_is_tnl_gtpu_any(&mask, dmn)) { + if (dr_mask_is_tnl_gtpu_flex_parser_0(&mask, dmn)) + mlx5dr_ste_build_tnl_gtpu_flex_parser_0(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + + if (dr_mask_is_tnl_gtpu_flex_parser_1(&mask, dmn)) + mlx5dr_ste_build_tnl_gtpu_flex_parser_1(ste_ctx, &sb[idx++], 
+ &mask, &dmn->info.caps, + inner, rx); + + if (dr_mask_is_tnl_gtpu(&mask, dmn)) + mlx5dr_ste_build_tnl_gtpu(ste_ctx, &sb[idx++], + &mask, inner, rx); + } else if (dr_mask_is_tnl_header_0_1_set(&mask.misc5)) { + mlx5dr_ste_build_tnl_header_0_1(ste_ctx, &sb[idx++], + &mask, inner, rx); + } + + if (DR_MASK_IS_ETH_L4_MISC_SET(mask.misc3, outer)) + mlx5dr_ste_build_eth_l4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_FIRST_MPLS_SET(mask.misc2, outer)) + mlx5dr_ste_build_mpls(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_tnl_mpls_over_gre(&mask, dmn)) + mlx5dr_ste_build_tnl_mpls_over_gre(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + else if (dr_mask_is_tnl_mpls_over_udp(&mask, dmn)) + mlx5dr_ste_build_tnl_mpls_over_udp(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + + if (dr_mask_is_icmp(&mask, dmn)) + mlx5dr_ste_build_icmp(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + + if (dr_mask_is_tnl_gre_set(&mask.misc)) + mlx5dr_ste_build_tnl_gre(ste_ctx, &sb[idx++], + &mask, inner, rx); + } + + /* Inner */ + if (matcher->match_criteria & (DR_MATCHER_CRITERIA_INNER | + DR_MATCHER_CRITERIA_MISC | + DR_MATCHER_CRITERIA_MISC2 | + DR_MATCHER_CRITERIA_MISC3)) { + inner = true; + + if (dr_mask_is_eth_l2_tnl_set(&mask.misc)) + mlx5dr_ste_build_eth_l2_tnl(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_smac_set(&mask.inner) && + dr_mask_is_dmac_set(&mask.inner)) { + mlx5dr_ste_build_eth_l2_src_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); + } + + if (dr_mask_is_smac_set(&mask.inner)) + mlx5dr_ste_build_eth_l2_src(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_L2_DST(mask.inner, mask.misc, inner)) + mlx5dr_ste_build_eth_l2_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (inner_ipv == DR_RULE_IPV6) { + if (DR_MASK_IS_DST_IP_SET(&mask.inner)) + mlx5dr_ste_build_eth_l3_ipv6_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_SRC_IP_SET(&mask.inner)) + mlx5dr_ste_build_eth_l3_ipv6_src(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_ETH_L4_SET(mask.inner, mask.misc, inner)) + mlx5dr_ste_build_eth_ipv6_l3_l4(ste_ctx, &sb[idx++], + &mask, inner, rx); + } else { + if (dr_mask_is_ipv4_5_tuple_set(&mask.inner)) + mlx5dr_ste_build_eth_l3_ipv4_5_tuple(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_ttl_set(&mask.inner) || + dr_mask_is_ipv4_ihl_set(&mask.inner)) + mlx5dr_ste_build_eth_l3_ipv4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); + } + + if (DR_MASK_IS_ETH_L4_MISC_SET(mask.misc3, inner)) + mlx5dr_ste_build_eth_l4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_FIRST_MPLS_SET(mask.misc2, inner)) + mlx5dr_ste_build_mpls(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_tnl_mpls_over_gre(&mask, dmn)) + mlx5dr_ste_build_tnl_mpls_over_gre(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + else if (dr_mask_is_tnl_mpls_over_udp(&mask, dmn)) + mlx5dr_ste_build_tnl_mpls_over_udp(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); + } + + if (matcher->match_criteria & DR_MATCHER_CRITERIA_MISC4) { + if (dr_mask_is_flex_parser_0_3_set(&mask.misc4)) + mlx5dr_ste_build_flex_parser_0(ste_ctx, &sb[idx++], + &mask, false, rx); + + if (dr_mask_is_flex_parser_4_7_set(&mask.misc4)) + mlx5dr_ste_build_flex_parser_1(ste_ctx, &sb[idx++], + &mask, false, rx); + } + + /* Empty matcher, takes all */ + if ((!idx && allow_empty_match) || + matcher->match_criteria == DR_MATCHER_CRITERIA_EMPTY) + mlx5dr_ste_build_empty_always_hit(&sb[idx++], 
rx); + + if (idx == 0) { + mlx5dr_err(dmn, "Cannot generate any valid rules from mask\n"); + return -EINVAL; + } + + /* Check that all mask fields were consumed */ + for (i = 0; i < sizeof(struct mlx5dr_match_param); i++) { + if (((u8 *)&mask)[i] != 0) { + mlx5dr_dbg(dmn, "Mask contains unsupported parameters\n"); + return -EOPNOTSUPP; + } + } + + nic_matcher->ste_builder = sb; + nic_matcher->num_of_builders_arr[outer_ipv][inner_ipv] = idx; + + return 0; +} + +static int dr_nic_matcher_connect(struct mlx5dr_domain *dmn, + struct mlx5dr_matcher_rx_tx *curr_nic_matcher, + struct mlx5dr_matcher_rx_tx *next_nic_matcher, + struct mlx5dr_matcher_rx_tx *prev_nic_matcher) +{ + struct mlx5dr_table_rx_tx *nic_tbl = curr_nic_matcher->nic_tbl; + struct mlx5dr_domain_rx_tx *nic_dmn = nic_tbl->nic_dmn; + struct mlx5dr_htbl_connect_info info; + struct mlx5dr_ste_htbl *prev_htbl; + int ret; + + /* Connect end anchor hash table to next_htbl or to the default address */ + if (next_nic_matcher) { + info.type = CONNECT_HIT; + info.hit_next_htbl = next_nic_matcher->s_htbl; + } else { + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_tbl->default_icm_addr; + } + ret = mlx5dr_ste_htbl_init_and_postsend(dmn, nic_dmn, + curr_nic_matcher->e_anchor, + &info, info.type == CONNECT_HIT); + if (ret) + return ret; + + /* Connect start hash table to end anchor */ + info.type = CONNECT_MISS; + info.miss_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(curr_nic_matcher->e_anchor->chunk); + ret = mlx5dr_ste_htbl_init_and_postsend(dmn, nic_dmn, + curr_nic_matcher->s_htbl, + &info, false); + if (ret) + return ret; + + /* Connect previous hash table to matcher start hash table */ + if (prev_nic_matcher) + prev_htbl = prev_nic_matcher->e_anchor; + else + prev_htbl = nic_tbl->s_anchor; + + info.type = CONNECT_HIT; + info.hit_next_htbl = curr_nic_matcher->s_htbl; + ret = mlx5dr_ste_htbl_init_and_postsend(dmn, nic_dmn, prev_htbl, + &info, true); + if (ret) + return ret; + + /* Update the pointing ste and next hash table */ + curr_nic_matcher->s_htbl->pointing_ste = prev_htbl->chunk->ste_arr; + prev_htbl->chunk->ste_arr[0].next_htbl = curr_nic_matcher->s_htbl; + + if (next_nic_matcher) { + next_nic_matcher->s_htbl->pointing_ste = + curr_nic_matcher->e_anchor->chunk->ste_arr; + curr_nic_matcher->e_anchor->chunk->ste_arr[0].next_htbl = + next_nic_matcher->s_htbl; + } + + return 0; +} + +int mlx5dr_matcher_add_to_tbl_nic(struct mlx5dr_domain *dmn, + struct mlx5dr_matcher_rx_tx *nic_matcher) +{ + struct mlx5dr_matcher_rx_tx *next_nic_matcher, *prev_nic_matcher, *tmp_nic_matcher; + struct mlx5dr_table_rx_tx *nic_tbl = nic_matcher->nic_tbl; + bool first = true; + int ret; + + /* If the nic matcher is already on its parent nic table list, + * then it is already connected to the chain of nic matchers. 
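+ * Otherwise the matcher is linked by priority: it is inserted right
+ * before the first existing nic matcher whose prio is greater than or
+ * equal to its own.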
+ */ + if (!list_empty(&nic_matcher->list_node)) + return 0; + + next_nic_matcher = NULL; + list_for_each_entry(tmp_nic_matcher, &nic_tbl->nic_matcher_list, list_node) { + if (tmp_nic_matcher->prio >= nic_matcher->prio) { + next_nic_matcher = tmp_nic_matcher; + break; + } + first = false; + } + + prev_nic_matcher = NULL; + if (next_nic_matcher && !first) + prev_nic_matcher = list_prev_entry(next_nic_matcher, list_node); + else if (!first) + prev_nic_matcher = list_last_entry(&nic_tbl->nic_matcher_list, + struct mlx5dr_matcher_rx_tx, + list_node); + + ret = dr_nic_matcher_connect(dmn, nic_matcher, + next_nic_matcher, prev_nic_matcher); + if (ret) + return ret; + + if (prev_nic_matcher) + list_add(&nic_matcher->list_node, &prev_nic_matcher->list_node); + else if (next_nic_matcher) + list_add_tail(&nic_matcher->list_node, &next_nic_matcher->list_node); + else + list_add(&nic_matcher->list_node, &nic_matcher->nic_tbl->nic_matcher_list); + + return ret; +} + +static void dr_matcher_uninit_nic(struct mlx5dr_matcher_rx_tx *nic_matcher) +{ + mlx5dr_htbl_put(nic_matcher->s_htbl); + mlx5dr_htbl_put(nic_matcher->e_anchor); +} + +static void dr_matcher_uninit_fdb(struct mlx5dr_matcher *matcher) +{ + dr_matcher_uninit_nic(&matcher->rx); + dr_matcher_uninit_nic(&matcher->tx); +} + +static void dr_matcher_uninit(struct mlx5dr_matcher *matcher) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + + switch (dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + dr_matcher_uninit_nic(&matcher->rx); + break; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + dr_matcher_uninit_nic(&matcher->tx); + break; + case MLX5DR_DOMAIN_TYPE_FDB: + dr_matcher_uninit_fdb(matcher); + break; + default: + WARN_ON(true); + break; + } +} + +static int dr_matcher_set_all_ste_builders(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + + dr_matcher_set_ste_builders(matcher, nic_matcher, DR_RULE_IPV4, DR_RULE_IPV4); + dr_matcher_set_ste_builders(matcher, nic_matcher, DR_RULE_IPV4, DR_RULE_IPV6); + dr_matcher_set_ste_builders(matcher, nic_matcher, DR_RULE_IPV6, DR_RULE_IPV4); + dr_matcher_set_ste_builders(matcher, nic_matcher, DR_RULE_IPV6, DR_RULE_IPV6); + + if (!nic_matcher->ste_builder) { + mlx5dr_err(dmn, "Cannot generate IPv4 or IPv6 rules with given mask\n"); + return -EINVAL; + } + + return 0; +} + +static int dr_matcher_init_nic(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + int ret; + + nic_matcher->prio = matcher->prio; + INIT_LIST_HEAD(&nic_matcher->list_node); + + ret = dr_matcher_set_all_ste_builders(matcher, nic_matcher); + if (ret) + return ret; + + nic_matcher->e_anchor = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + MLX5DR_STE_LU_TYPE_DONT_CARE, + 0); + if (!nic_matcher->e_anchor) + return -ENOMEM; + + nic_matcher->s_htbl = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + nic_matcher->ste_builder[0].lu_type, + nic_matcher->ste_builder[0].byte_mask); + if (!nic_matcher->s_htbl) { + ret = -ENOMEM; + goto free_e_htbl; + } + + /* make sure the tables exist while empty */ + mlx5dr_htbl_get(nic_matcher->s_htbl); + mlx5dr_htbl_get(nic_matcher->e_anchor); + + return 0; + +free_e_htbl: + mlx5dr_ste_htbl_free(nic_matcher->e_anchor); + return ret; +} + +static int dr_matcher_init_fdb(struct mlx5dr_matcher *matcher) +{ + int ret; + + ret = dr_matcher_init_nic(matcher, &matcher->rx); + if (ret) + return ret; + + ret = dr_matcher_init_nic(matcher, 
&matcher->tx); + if (ret) + goto uninit_nic_rx; + + return 0; + +uninit_nic_rx: + dr_matcher_uninit_nic(&matcher->rx); + return ret; +} + +static int dr_matcher_copy_param(struct mlx5dr_matcher *matcher, + struct mlx5dr_match_parameters *mask) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_match_parameters consumed_mask; + int i, ret = 0; + + if (matcher->match_criteria >= DR_MATCHER_CRITERIA_MAX) { + mlx5dr_err(dmn, "Invalid match criteria attribute\n"); + return -EINVAL; + } + + if (mask) { + if (mask->match_sz > DR_SZ_MATCH_PARAM) { + mlx5dr_err(dmn, "Invalid match size attribute\n"); + return -EINVAL; + } + + consumed_mask.match_buf = kzalloc(mask->match_sz, GFP_KERNEL); + if (!consumed_mask.match_buf) + return -ENOMEM; + + consumed_mask.match_sz = mask->match_sz; + memcpy(consumed_mask.match_buf, mask->match_buf, mask->match_sz); + mlx5dr_ste_copy_param(matcher->match_criteria, + &matcher->mask, &consumed_mask, true); + + /* Check that all mask data was consumed */ + for (i = 0; i < consumed_mask.match_sz; i++) { + if (!((u8 *)consumed_mask.match_buf)[i]) + continue; + + mlx5dr_dbg(dmn, + "Match param mask contains unsupported parameters\n"); + ret = -EOPNOTSUPP; + break; + } + + kfree(consumed_mask.match_buf); + } + + return ret; +} + +static int dr_matcher_init(struct mlx5dr_matcher *matcher, + struct mlx5dr_match_parameters *mask) +{ + struct mlx5dr_table *tbl = matcher->tbl; + struct mlx5dr_domain *dmn = tbl->dmn; + int ret; + + ret = dr_matcher_copy_param(matcher, mask); + if (ret) + return ret; + + switch (dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + matcher->rx.nic_tbl = &tbl->rx; + ret = dr_matcher_init_nic(matcher, &matcher->rx); + break; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + matcher->tx.nic_tbl = &tbl->tx; + ret = dr_matcher_init_nic(matcher, &matcher->tx); + break; + case MLX5DR_DOMAIN_TYPE_FDB: + matcher->rx.nic_tbl = &tbl->rx; + matcher->tx.nic_tbl = &tbl->tx; + ret = dr_matcher_init_fdb(matcher); + break; + default: + WARN_ON(true); + ret = -EINVAL; + } + + return ret; +} + +static void dr_matcher_add_to_dbg_list(struct mlx5dr_matcher *matcher) +{ + mutex_lock(&matcher->tbl->dmn->dump_info.dbg_mutex); + list_add(&matcher->list_node, &matcher->tbl->matcher_list); + mutex_unlock(&matcher->tbl->dmn->dump_info.dbg_mutex); +} + +static void dr_matcher_remove_from_dbg_list(struct mlx5dr_matcher *matcher) +{ + mutex_lock(&matcher->tbl->dmn->dump_info.dbg_mutex); + list_del(&matcher->list_node); + mutex_unlock(&matcher->tbl->dmn->dump_info.dbg_mutex); +} + +struct mlx5dr_matcher * +mlx5dr_matcher_create(struct mlx5dr_table *tbl, + u32 priority, + u8 match_criteria_enable, + struct mlx5dr_match_parameters *mask) +{ + struct mlx5dr_matcher *matcher; + int ret; + + refcount_inc(&tbl->refcount); + + matcher = kzalloc(sizeof(*matcher), GFP_KERNEL); + if (!matcher) + goto dec_ref; + + matcher->tbl = tbl; + matcher->prio = priority; + matcher->match_criteria = match_criteria_enable; + refcount_set(&matcher->refcount, 1); + INIT_LIST_HEAD(&matcher->list_node); + INIT_LIST_HEAD(&matcher->dbg_rule_list); + + mlx5dr_domain_lock(tbl->dmn); + + ret = dr_matcher_init(matcher, mask); + if (ret) + goto free_matcher; + + dr_matcher_add_to_dbg_list(matcher); + + mlx5dr_domain_unlock(tbl->dmn); + + return matcher; + +free_matcher: + mlx5dr_domain_unlock(tbl->dmn); + kfree(matcher); +dec_ref: + refcount_dec(&tbl->refcount); + return NULL; +} + +static int dr_matcher_disconnect_nic(struct mlx5dr_domain *dmn, + struct mlx5dr_table_rx_tx *nic_tbl, + struct mlx5dr_matcher_rx_tx 
*next_nic_matcher, + struct mlx5dr_matcher_rx_tx *prev_nic_matcher) +{ + struct mlx5dr_domain_rx_tx *nic_dmn = nic_tbl->nic_dmn; + struct mlx5dr_htbl_connect_info info; + struct mlx5dr_ste_htbl *prev_anchor; + + if (prev_nic_matcher) + prev_anchor = prev_nic_matcher->e_anchor; + else + prev_anchor = nic_tbl->s_anchor; + + /* Connect previous anchor hash table to next matcher or to the default address */ + if (next_nic_matcher) { + info.type = CONNECT_HIT; + info.hit_next_htbl = next_nic_matcher->s_htbl; + next_nic_matcher->s_htbl->pointing_ste = prev_anchor->chunk->ste_arr; + prev_anchor->chunk->ste_arr[0].next_htbl = next_nic_matcher->s_htbl; + } else { + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_tbl->default_icm_addr; + prev_anchor->chunk->ste_arr[0].next_htbl = NULL; + } + + return mlx5dr_ste_htbl_init_and_postsend(dmn, nic_dmn, prev_anchor, + &info, true); +} + +int mlx5dr_matcher_remove_from_tbl_nic(struct mlx5dr_domain *dmn, + struct mlx5dr_matcher_rx_tx *nic_matcher) +{ + struct mlx5dr_matcher_rx_tx *prev_nic_matcher, *next_nic_matcher; + struct mlx5dr_table_rx_tx *nic_tbl = nic_matcher->nic_tbl; + int ret; + + /* If the nic matcher is not on its parent nic table list, + * then it is detached - no need to disconnect it. + */ + if (list_empty(&nic_matcher->list_node)) + return 0; + + if (list_is_last(&nic_matcher->list_node, &nic_tbl->nic_matcher_list)) + next_nic_matcher = NULL; + else + next_nic_matcher = list_next_entry(nic_matcher, list_node); + + if (nic_matcher->list_node.prev == &nic_tbl->nic_matcher_list) + prev_nic_matcher = NULL; + else + prev_nic_matcher = list_prev_entry(nic_matcher, list_node); + + ret = dr_matcher_disconnect_nic(dmn, nic_tbl, next_nic_matcher, prev_nic_matcher); + if (ret) + return ret; + + list_del_init(&nic_matcher->list_node); + return 0; +} + +int mlx5dr_matcher_destroy(struct mlx5dr_matcher *matcher) +{ + struct mlx5dr_table *tbl = matcher->tbl; + + if (WARN_ON_ONCE(refcount_read(&matcher->refcount) > 1)) + return -EBUSY; + + mlx5dr_domain_lock(tbl->dmn); + + dr_matcher_remove_from_dbg_list(matcher); + dr_matcher_uninit(matcher); + refcount_dec(&matcher->tbl->refcount); + + mlx5dr_domain_unlock(tbl->dmn); + kfree(matcher); + + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c new file mode 100644 index 0000000..74f6872 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c @@ -0,0 +1,1341 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include "dr_types.h" + +#define DR_RULE_MAX_STE_CHAIN (DR_RULE_MAX_STES + DR_ACTION_MAX_STES) + +static int dr_rule_append_to_miss_list(struct mlx5dr_domain *dmn, + enum mlx5dr_domain_nic_type nic_type, + struct mlx5dr_ste *new_last_ste, + struct list_head *miss_list, + struct list_head *send_list) +{ + struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx; + struct mlx5dr_ste_send_info *ste_info_last; + struct mlx5dr_ste *last_ste; + + /* The new entry will be inserted after the last */ + last_ste = list_last_entry(miss_list, struct mlx5dr_ste, miss_list_node); + WARN_ON(!last_ste); + + ste_info_last = mlx5dr_send_info_alloc(dmn, nic_type); + if (!ste_info_last) + return -ENOMEM; + + mlx5dr_ste_set_miss_addr(ste_ctx, mlx5dr_ste_get_hw_ste(last_ste), + mlx5dr_ste_get_icm_addr(new_last_ste)); + list_add_tail(&new_last_ste->miss_list_node, miss_list); + + mlx5dr_send_fill_and_append_ste_send_info(last_ste, DR_STE_SIZE_CTRL, + 0, mlx5dr_ste_get_hw_ste(last_ste), + ste_info_last, send_list, true); + + return 0; +} + +static struct mlx5dr_ste * +dr_rule_create_collision_htbl(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + u8 *hw_ste) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx; + struct mlx5dr_ste_htbl *new_htbl; + struct mlx5dr_ste *ste; + u64 icm_addr; + + /* Create new table for miss entry */ + new_htbl = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + MLX5DR_STE_LU_TYPE_DONT_CARE, + 0); + if (!new_htbl) { + mlx5dr_dbg(dmn, "Failed allocating collision table\n"); + return NULL; + } + + /* One and only entry, never grows */ + ste = new_htbl->chunk->ste_arr; + icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk); + mlx5dr_ste_set_miss_addr(ste_ctx, hw_ste, icm_addr); + mlx5dr_htbl_get(new_htbl); + + return ste; +} + +static struct mlx5dr_ste * +dr_rule_create_collision_entry(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + u8 *hw_ste, + struct mlx5dr_ste *orig_ste) +{ + struct mlx5dr_ste *ste; + + ste = dr_rule_create_collision_htbl(matcher, nic_matcher, hw_ste); + if (!ste) { + mlx5dr_dbg(matcher->tbl->dmn, "Failed creating collision entry\n"); + return NULL; + } + + ste->ste_chain_location = orig_ste->ste_chain_location; + ste->htbl->pointing_ste = orig_ste->htbl->pointing_ste; + + /* In collision entry, all members share the same miss_list_head */ + ste->htbl->chunk->miss_list = mlx5dr_ste_get_miss_list(orig_ste); + + /* Next table */ + if (mlx5dr_ste_create_next_htbl(matcher, nic_matcher, ste, hw_ste, + DR_CHUNK_SIZE_1)) { + mlx5dr_dbg(matcher->tbl->dmn, "Failed allocating table\n"); + goto free_tbl; + } + + return ste; + +free_tbl: + mlx5dr_ste_free(ste, matcher, nic_matcher); + return NULL; +} + +static int +dr_rule_handle_one_ste_in_update_list(struct mlx5dr_ste_send_info *ste_info, + struct mlx5dr_domain *dmn) +{ + int ret; + + list_del(&ste_info->send_list); + + /* Copy data to ste, only reduced size or control, the last 16B (mask) + * is already written to the hw. 
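+ * (DR_STE_SIZE_CTRL covers only the STE control portion, while
+ * DR_STE_SIZE_REDUCED covers control + tag, i.e. everything except the
+ * trailing mask.)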
+ */ + if (ste_info->size == DR_STE_SIZE_CTRL) + memcpy(mlx5dr_ste_get_hw_ste(ste_info->ste), + ste_info->data, DR_STE_SIZE_CTRL); + else + memcpy(mlx5dr_ste_get_hw_ste(ste_info->ste), + ste_info->data, DR_STE_SIZE_REDUCED); + + ret = mlx5dr_send_postsend_ste(dmn, ste_info->ste, ste_info->data, + ste_info->size, ste_info->offset); + if (ret) + goto out; + +out: + mlx5dr_send_info_free(ste_info); + return ret; +} + +static int dr_rule_send_update_list(struct list_head *send_ste_list, + struct mlx5dr_domain *dmn, + bool is_reverse) +{ + struct mlx5dr_ste_send_info *ste_info, *tmp_ste_info; + int ret; + + if (is_reverse) { + list_for_each_entry_safe_reverse(ste_info, tmp_ste_info, + send_ste_list, send_list) { + ret = dr_rule_handle_one_ste_in_update_list(ste_info, + dmn); + if (ret) + return ret; + } + } else { + list_for_each_entry_safe(ste_info, tmp_ste_info, + send_ste_list, send_list) { + ret = dr_rule_handle_one_ste_in_update_list(ste_info, + dmn); + if (ret) + return ret; + } + } + + return 0; +} + +static struct mlx5dr_ste * +dr_rule_find_ste_in_miss_list(struct list_head *miss_list, u8 *hw_ste) +{ + struct mlx5dr_ste *ste; + + if (list_empty(miss_list)) + return NULL; + + /* Check if hw_ste is present in the list */ + list_for_each_entry(ste, miss_list, miss_list_node) { + if (mlx5dr_ste_equal_tag(mlx5dr_ste_get_hw_ste(ste), hw_ste)) + return ste; + } + + return NULL; +} + +static struct mlx5dr_ste * +dr_rule_rehash_handle_collision(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct list_head *update_list, + struct mlx5dr_ste *col_ste, + u8 *hw_ste) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste *new_ste; + int ret; + + new_ste = dr_rule_create_collision_htbl(matcher, nic_matcher, hw_ste); + if (!new_ste) + return NULL; + + /* Update collision pointing STE */ + new_ste->htbl->pointing_ste = col_ste->htbl->pointing_ste; + + /* In collision entry, all members share the same miss_list_head */ + new_ste->htbl->chunk->miss_list = mlx5dr_ste_get_miss_list(col_ste); + + /* Update the previous from the list */ + ret = dr_rule_append_to_miss_list(dmn, nic_matcher->nic_tbl->nic_dmn->type, + new_ste, mlx5dr_ste_get_miss_list(col_ste), + update_list); + if (ret) { + mlx5dr_dbg(dmn, "Failed update dup entry\n"); + goto err_exit; + } + + return new_ste; + +err_exit: + mlx5dr_ste_free(new_ste, matcher, nic_matcher); + return NULL; +} + +static void dr_rule_rehash_copy_ste_ctrl(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste *cur_ste, + struct mlx5dr_ste *new_ste) +{ + new_ste->next_htbl = cur_ste->next_htbl; + new_ste->ste_chain_location = cur_ste->ste_chain_location; + + if (new_ste->next_htbl) + new_ste->next_htbl->pointing_ste = new_ste; + + /* We need to copy the refcount since this ste + * may have been traversed several times + */ + new_ste->refcount = cur_ste->refcount; + + /* Link old STEs rule to the new ste */ + mlx5dr_rule_set_last_member(cur_ste->rule_rx_tx, new_ste, false); +} + +static struct mlx5dr_ste * +dr_rule_rehash_copy_ste(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste *cur_ste, + struct mlx5dr_ste_htbl *new_htbl, + struct list_head *update_list) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_send_info *ste_info; + bool use_update_list = false; + u8 hw_ste[DR_STE_SIZE] = {}; + struct mlx5dr_ste *new_ste; + u64 icm_addr; + int new_idx; + u8 sb_idx; + + /* Copy STE mask from the matcher */ + sb_idx = 
cur_ste->ste_chain_location - 1; + mlx5dr_ste_set_bit_mask(hw_ste, nic_matcher->ste_builder[sb_idx].bit_mask); + + /* Copy STE control and tag */ + icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk); + memcpy(hw_ste, mlx5dr_ste_get_hw_ste(cur_ste), DR_STE_SIZE_REDUCED); + mlx5dr_ste_set_miss_addr(dmn->ste_ctx, hw_ste, icm_addr); + + new_idx = mlx5dr_ste_calc_hash_index(hw_ste, new_htbl); + new_ste = &new_htbl->chunk->ste_arr[new_idx]; + + if (mlx5dr_ste_is_not_used(new_ste)) { + mlx5dr_htbl_get(new_htbl); + list_add_tail(&new_ste->miss_list_node, + mlx5dr_ste_get_miss_list(new_ste)); + } else { + new_ste = dr_rule_rehash_handle_collision(matcher, + nic_matcher, + update_list, + new_ste, + hw_ste); + if (!new_ste) { + mlx5dr_dbg(dmn, "Failed adding collision entry, index: %d\n", + new_idx); + return NULL; + } + new_htbl->ctrl.num_of_collisions++; + use_update_list = true; + } + + memcpy(mlx5dr_ste_get_hw_ste(new_ste), hw_ste, DR_STE_SIZE_REDUCED); + + new_htbl->ctrl.num_of_valid_entries++; + + if (use_update_list) { + ste_info = mlx5dr_send_info_alloc(dmn, + nic_matcher->nic_tbl->nic_dmn->type); + if (!ste_info) + goto err_exit; + + mlx5dr_send_fill_and_append_ste_send_info(new_ste, DR_STE_SIZE, 0, + hw_ste, ste_info, + update_list, true); + } + + dr_rule_rehash_copy_ste_ctrl(matcher, nic_matcher, cur_ste, new_ste); + + return new_ste; + +err_exit: + mlx5dr_ste_free(new_ste, matcher, nic_matcher); + return NULL; +} + +static int dr_rule_rehash_copy_miss_list(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct list_head *cur_miss_list, + struct mlx5dr_ste_htbl *new_htbl, + struct list_head *update_list) +{ + struct mlx5dr_ste *tmp_ste, *cur_ste, *new_ste; + + if (list_empty(cur_miss_list)) + return 0; + + list_for_each_entry_safe(cur_ste, tmp_ste, cur_miss_list, miss_list_node) { + new_ste = dr_rule_rehash_copy_ste(matcher, + nic_matcher, + cur_ste, + new_htbl, + update_list); + if (!new_ste) + goto err_insert; + + list_del(&cur_ste->miss_list_node); + mlx5dr_htbl_put(cur_ste->htbl); + } + return 0; + +err_insert: + mlx5dr_err(matcher->tbl->dmn, "Fatal error during resize\n"); + WARN_ON(true); + return -EINVAL; +} + +static int dr_rule_rehash_copy_htbl(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste_htbl *cur_htbl, + struct mlx5dr_ste_htbl *new_htbl, + struct list_head *update_list) +{ + struct mlx5dr_ste *cur_ste; + int cur_entries; + int err = 0; + int i; + + cur_entries = mlx5dr_icm_pool_chunk_size_to_entries(cur_htbl->chunk->size); + + if (cur_entries < 1) { + mlx5dr_dbg(matcher->tbl->dmn, "Invalid number of entries\n"); + return -EINVAL; + } + + for (i = 0; i < cur_entries; i++) { + cur_ste = &cur_htbl->chunk->ste_arr[i]; + if (mlx5dr_ste_is_not_used(cur_ste)) /* Empty, nothing to copy */ + continue; + + err = dr_rule_rehash_copy_miss_list(matcher, + nic_matcher, + mlx5dr_ste_get_miss_list(cur_ste), + new_htbl, + update_list); + if (err) + goto clean_copy; + + /* In order to decrease the number of allocated ste_send_info + * structs, send the current table row now. 
+ */ + err = dr_rule_send_update_list(update_list, matcher->tbl->dmn, false); + if (err) { + mlx5dr_dbg(matcher->tbl->dmn, "Failed updating table to HW\n"); + goto clean_copy; + } + } + +clean_copy: + return err; +} + +static struct mlx5dr_ste_htbl * +dr_rule_rehash_htbl(struct mlx5dr_rule *rule, + struct mlx5dr_rule_rx_tx *nic_rule, + struct mlx5dr_ste_htbl *cur_htbl, + u8 ste_location, + struct list_head *update_list, + enum mlx5dr_icm_chunk_size new_size) +{ + struct mlx5dr_ste_send_info *del_ste_info, *tmp_ste_info; + struct mlx5dr_matcher *matcher = rule->matcher; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_matcher_rx_tx *nic_matcher; + struct mlx5dr_ste_send_info *ste_info; + struct mlx5dr_htbl_connect_info info; + struct mlx5dr_domain_rx_tx *nic_dmn; + u8 formatted_ste[DR_STE_SIZE] = {}; + LIST_HEAD(rehash_table_send_list); + struct mlx5dr_ste *ste_to_update; + struct mlx5dr_ste_htbl *new_htbl; + int err; + + nic_matcher = nic_rule->nic_matcher; + nic_dmn = nic_matcher->nic_tbl->nic_dmn; + + ste_info = mlx5dr_send_info_alloc(dmn, + nic_matcher->nic_tbl->nic_dmn->type); + if (!ste_info) + return NULL; + + new_htbl = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool, + new_size, + cur_htbl->lu_type, + cur_htbl->byte_mask); + if (!new_htbl) { + mlx5dr_err(dmn, "Failed to allocate new hash table\n"); + goto free_ste_info; + } + + /* Write new table to HW */ + info.type = CONNECT_MISS; + info.miss_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk); + mlx5dr_ste_set_formatted_ste(dmn->ste_ctx, + dmn->info.caps.gvmi, + nic_dmn->type, + new_htbl, + formatted_ste, + &info); + + new_htbl->pointing_ste = cur_htbl->pointing_ste; + new_htbl->pointing_ste->next_htbl = new_htbl; + err = dr_rule_rehash_copy_htbl(matcher, + nic_matcher, + cur_htbl, + new_htbl, + &rehash_table_send_list); + if (err) + goto free_new_htbl; + + if (mlx5dr_send_postsend_htbl(dmn, new_htbl, formatted_ste, + nic_matcher->ste_builder[ste_location - 1].bit_mask)) { + mlx5dr_err(dmn, "Failed writing table to HW\n"); + goto free_new_htbl; + } + + /* Writing to the hw is done in regular order of rehash_table_send_list, + * in order to have the origin data written before the miss address of + * collision entries, if exists. 
+ */ + if (dr_rule_send_update_list(&rehash_table_send_list, dmn, false)) { + mlx5dr_err(dmn, "Failed updating table to HW\n"); + goto free_ste_list; + } + + /* Connect previous hash table to current */ + if (ste_location == 1) { + /* The previous table is an anchor, anchors size is always one STE */ + struct mlx5dr_ste_htbl *prev_htbl = cur_htbl->pointing_ste->htbl; + + /* On matcher s_anchor we keep an extra refcount */ + mlx5dr_htbl_get(new_htbl); + mlx5dr_htbl_put(cur_htbl); + + nic_matcher->s_htbl = new_htbl; + + /* It is safe to operate dr_ste_set_hit_addr on the hw_ste here + * (48B len) which works only on first 32B + */ + mlx5dr_ste_set_hit_addr(dmn->ste_ctx, + prev_htbl->chunk->hw_ste_arr, + mlx5dr_icm_pool_get_chunk_icm_addr(new_htbl->chunk), + mlx5dr_icm_pool_get_chunk_num_of_entries(new_htbl->chunk)); + + ste_to_update = &prev_htbl->chunk->ste_arr[0]; + } else { + mlx5dr_ste_set_hit_addr_by_next_htbl(dmn->ste_ctx, + mlx5dr_ste_get_hw_ste(cur_htbl->pointing_ste), + new_htbl); + ste_to_update = cur_htbl->pointing_ste; + } + + mlx5dr_send_fill_and_append_ste_send_info(ste_to_update, DR_STE_SIZE_CTRL, + 0, mlx5dr_ste_get_hw_ste(ste_to_update), + ste_info, update_list, false); + + return new_htbl; + +free_ste_list: + /* Clean all ste_info's from the new table */ + list_for_each_entry_safe(del_ste_info, tmp_ste_info, + &rehash_table_send_list, send_list) { + list_del(&del_ste_info->send_list); + mlx5dr_send_info_free(del_ste_info); + } + +free_new_htbl: + mlx5dr_ste_htbl_free(new_htbl); +free_ste_info: + mlx5dr_send_info_free(ste_info); + mlx5dr_info(dmn, "Failed creating rehash table\n"); + return NULL; +} + +static struct mlx5dr_ste_htbl *dr_rule_rehash(struct mlx5dr_rule *rule, + struct mlx5dr_rule_rx_tx *nic_rule, + struct mlx5dr_ste_htbl *cur_htbl, + u8 ste_location, + struct list_head *update_list) +{ + struct mlx5dr_domain *dmn = rule->matcher->tbl->dmn; + enum mlx5dr_icm_chunk_size new_size; + + new_size = mlx5dr_icm_next_higher_chunk(cur_htbl->chunk->size); + new_size = min_t(u32, new_size, dmn->info.max_log_sw_icm_sz); + + if (new_size == cur_htbl->chunk->size) + return NULL; /* Skip rehash, we already at the max size */ + + return dr_rule_rehash_htbl(rule, nic_rule, cur_htbl, ste_location, + update_list, new_size); +} + +static struct mlx5dr_ste * +dr_rule_handle_collision(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste *ste, + u8 *hw_ste, + struct list_head *miss_list, + struct list_head *send_list) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_send_info *ste_info; + struct mlx5dr_ste *new_ste; + + ste_info = mlx5dr_send_info_alloc(dmn, + nic_matcher->nic_tbl->nic_dmn->type); + if (!ste_info) + return NULL; + + new_ste = dr_rule_create_collision_entry(matcher, nic_matcher, hw_ste, ste); + if (!new_ste) + goto free_send_info; + + if (dr_rule_append_to_miss_list(dmn, nic_matcher->nic_tbl->nic_dmn->type, + new_ste, miss_list, send_list)) { + mlx5dr_dbg(dmn, "Failed to update prev miss_list\n"); + goto err_exit; + } + + mlx5dr_send_fill_and_append_ste_send_info(new_ste, DR_STE_SIZE, 0, hw_ste, + ste_info, send_list, false); + + ste->htbl->ctrl.num_of_collisions++; + ste->htbl->ctrl.num_of_valid_entries++; + + return new_ste; + +err_exit: + mlx5dr_ste_free(new_ste, matcher, nic_matcher); +free_send_info: + mlx5dr_send_info_free(ste_info); + return NULL; +} + +static void dr_rule_remove_action_members(struct mlx5dr_rule *rule) +{ + struct mlx5dr_rule_action_member *action_mem; + struct 
mlx5dr_rule_action_member *tmp; + + list_for_each_entry_safe(action_mem, tmp, &rule->rule_actions_list, list) { + list_del(&action_mem->list); + refcount_dec(&action_mem->action->refcount); + kvfree(action_mem); + } +} + +static int dr_rule_add_action_members(struct mlx5dr_rule *rule, + size_t num_actions, + struct mlx5dr_action *actions[]) +{ + struct mlx5dr_rule_action_member *action_mem; + int i; + + for (i = 0; i < num_actions; i++) { + action_mem = kvzalloc(sizeof(*action_mem), GFP_KERNEL); + if (!action_mem) + goto free_action_members; + + action_mem->action = actions[i]; + INIT_LIST_HEAD(&action_mem->list); + list_add_tail(&action_mem->list, &rule->rule_actions_list); + refcount_inc(&action_mem->action->refcount); + } + + return 0; + +free_action_members: + dr_rule_remove_action_members(rule); + return -ENOMEM; +} + +void mlx5dr_rule_set_last_member(struct mlx5dr_rule_rx_tx *nic_rule, + struct mlx5dr_ste *ste, + bool force) +{ + /* Update rule member is usually done for the last STE or during rule + * creation to recover from mid-creation failure (for this peruse the + * force flag is used) + */ + if (ste->next_htbl && !force) + return; + + /* Update is required since each rule keeps track of its last STE */ + ste->rule_rx_tx = nic_rule; + nic_rule->last_rule_ste = ste; +} + +static struct mlx5dr_ste *dr_rule_get_pointed_ste(struct mlx5dr_ste *curr_ste) +{ + struct mlx5dr_ste *first_ste; + + first_ste = list_first_entry(mlx5dr_ste_get_miss_list(curr_ste), + struct mlx5dr_ste, miss_list_node); + + return first_ste->htbl->pointing_ste; +} + +int mlx5dr_rule_get_reverse_rule_members(struct mlx5dr_ste **ste_arr, + struct mlx5dr_ste *curr_ste, + int *num_of_stes) +{ + bool first = false; + + *num_of_stes = 0; + + if (!curr_ste) + return -ENOENT; + + /* Iterate from last to first */ + while (!first) { + first = curr_ste->ste_chain_location == 1; + ste_arr[*num_of_stes] = curr_ste; + *num_of_stes += 1; + curr_ste = dr_rule_get_pointed_ste(curr_ste); + } + + return 0; +} + +static void dr_rule_clean_rule_members(struct mlx5dr_rule *rule, + struct mlx5dr_rule_rx_tx *nic_rule) +{ + struct mlx5dr_ste *ste_arr[DR_RULE_MAX_STES + DR_ACTION_MAX_STES]; + struct mlx5dr_ste *curr_ste = nic_rule->last_rule_ste; + int i; + + if (mlx5dr_rule_get_reverse_rule_members(ste_arr, curr_ste, &i)) + return; + + while (i--) + mlx5dr_ste_put(ste_arr[i], rule->matcher, nic_rule->nic_matcher); +} + +static u16 dr_get_bits_per_mask(u16 byte_mask) +{ + u16 bits = 0; + + while (byte_mask) { + byte_mask = byte_mask & (byte_mask - 1); + bits++; + } + + return bits; +} + +static bool dr_rule_need_enlarge_hash(struct mlx5dr_ste_htbl *htbl, + struct mlx5dr_domain *dmn, + struct mlx5dr_domain_rx_tx *nic_dmn) +{ + struct mlx5dr_ste_htbl_ctrl *ctrl = &htbl->ctrl; + int threshold; + + if (dmn->info.max_log_sw_icm_sz <= htbl->chunk->size) + return false; + + if (!mlx5dr_ste_htbl_may_grow(htbl)) + return false; + + if (dr_get_bits_per_mask(htbl->byte_mask) * BITS_PER_BYTE <= htbl->chunk->size) + return false; + + threshold = mlx5dr_ste_htbl_increase_threshold(htbl); + if (ctrl->num_of_collisions >= threshold && + (ctrl->num_of_valid_entries - ctrl->num_of_collisions) >= threshold) + return true; + + return false; +} + +static int dr_rule_handle_action_stes(struct mlx5dr_rule *rule, + struct mlx5dr_rule_rx_tx *nic_rule, + struct list_head *send_ste_list, + struct mlx5dr_ste *last_ste, + u8 *hw_ste_arr, + u32 new_hw_ste_arr_sz) +{ + struct mlx5dr_matcher_rx_tx *nic_matcher = nic_rule->nic_matcher; + struct mlx5dr_ste_send_info 
*ste_info_arr[DR_ACTION_MAX_STES]; + u8 num_of_builders = nic_matcher->num_of_builders; + struct mlx5dr_matcher *matcher = rule->matcher; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + u8 *curr_hw_ste, *prev_hw_ste; + struct mlx5dr_ste *action_ste; + int i, k; + + /* Two cases: + * 1. num_of_builders is equal to new_hw_ste_arr_sz, the action in the ste + * 2. num_of_builders is less then new_hw_ste_arr_sz, new ste was added + * to support the action. + */ + + for (i = num_of_builders, k = 0; i < new_hw_ste_arr_sz; i++, k++) { + curr_hw_ste = hw_ste_arr + i * DR_STE_SIZE; + prev_hw_ste = (i == 0) ? curr_hw_ste : hw_ste_arr + ((i - 1) * DR_STE_SIZE); + action_ste = dr_rule_create_collision_htbl(matcher, + nic_matcher, + curr_hw_ste); + if (!action_ste) + return -ENOMEM; + + mlx5dr_ste_get(action_ste); + + action_ste->htbl->pointing_ste = last_ste; + last_ste->next_htbl = action_ste->htbl; + last_ste = action_ste; + + /* While free ste we go over the miss list, so add this ste to the list */ + list_add_tail(&action_ste->miss_list_node, + mlx5dr_ste_get_miss_list(action_ste)); + + ste_info_arr[k] = mlx5dr_send_info_alloc(dmn, + nic_matcher->nic_tbl->nic_dmn->type); + if (!ste_info_arr[k]) + goto err_exit; + + /* Point current ste to the new action */ + mlx5dr_ste_set_hit_addr_by_next_htbl(dmn->ste_ctx, + prev_hw_ste, + action_ste->htbl); + + mlx5dr_rule_set_last_member(nic_rule, action_ste, true); + + mlx5dr_send_fill_and_append_ste_send_info(action_ste, DR_STE_SIZE, 0, + curr_hw_ste, + ste_info_arr[k], + send_ste_list, false); + } + + last_ste->next_htbl = NULL; + + return 0; + +err_exit: + mlx5dr_ste_put(action_ste, matcher, nic_matcher); + return -ENOMEM; +} + +static int dr_rule_handle_empty_entry(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste_htbl *cur_htbl, + struct mlx5dr_ste *ste, + u8 ste_location, + u8 *hw_ste, + struct list_head *miss_list, + struct list_head *send_list) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_send_info *ste_info; + u64 icm_addr; + + /* Take ref on table, only on first time this ste is used */ + mlx5dr_htbl_get(cur_htbl); + + /* new entry -> new branch */ + list_add_tail(&ste->miss_list_node, miss_list); + + icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk); + mlx5dr_ste_set_miss_addr(dmn->ste_ctx, hw_ste, icm_addr); + + ste->ste_chain_location = ste_location; + + ste_info = mlx5dr_send_info_alloc(dmn, + nic_matcher->nic_tbl->nic_dmn->type); + if (!ste_info) + goto clean_ste_setting; + + if (mlx5dr_ste_create_next_htbl(matcher, + nic_matcher, + ste, + hw_ste, + DR_CHUNK_SIZE_1)) { + mlx5dr_dbg(dmn, "Failed allocating table\n"); + goto clean_ste_info; + } + + cur_htbl->ctrl.num_of_valid_entries++; + + mlx5dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE, 0, hw_ste, + ste_info, send_list, false); + + return 0; + +clean_ste_info: + mlx5dr_send_info_free(ste_info); +clean_ste_setting: + list_del_init(&ste->miss_list_node); + mlx5dr_htbl_put(cur_htbl); + + return -ENOMEM; +} + +static struct mlx5dr_ste * +dr_rule_handle_ste_branch(struct mlx5dr_rule *rule, + struct mlx5dr_rule_rx_tx *nic_rule, + struct list_head *send_ste_list, + struct mlx5dr_ste_htbl *cur_htbl, + u8 *hw_ste, + u8 ste_location, + struct mlx5dr_ste_htbl **put_htbl) +{ + struct mlx5dr_matcher *matcher = rule->matcher; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_matcher_rx_tx *nic_matcher; + struct mlx5dr_domain_rx_tx *nic_dmn; + struct mlx5dr_ste_htbl *new_htbl; + 
struct mlx5dr_ste *matched_ste; + struct list_head *miss_list; + bool skip_rehash = false; + struct mlx5dr_ste *ste; + int index; + + nic_matcher = nic_rule->nic_matcher; + nic_dmn = nic_matcher->nic_tbl->nic_dmn; + +again: + index = mlx5dr_ste_calc_hash_index(hw_ste, cur_htbl); + miss_list = &cur_htbl->chunk->miss_list[index]; + ste = &cur_htbl->chunk->ste_arr[index]; + + if (mlx5dr_ste_is_not_used(ste)) { + if (dr_rule_handle_empty_entry(matcher, nic_matcher, cur_htbl, + ste, ste_location, + hw_ste, miss_list, + send_ste_list)) + return NULL; + } else { + /* Hash table index in use, check if this ste is in the miss list */ + matched_ste = dr_rule_find_ste_in_miss_list(miss_list, hw_ste); + if (matched_ste) { + /* If it is last STE in the chain, and has the same tag + * it means that all the previous stes are the same, + * if so, this rule is duplicated. + */ + if (!mlx5dr_ste_is_last_in_rule(nic_matcher, ste_location)) + return matched_ste; + + mlx5dr_dbg(dmn, "Duplicate rule inserted\n"); + } + + if (!skip_rehash && dr_rule_need_enlarge_hash(cur_htbl, dmn, nic_dmn)) { + /* Hash table index in use, try to resize of the hash */ + skip_rehash = true; + + /* Hold the table till we update. + * Release in dr_rule_create_rule() + */ + *put_htbl = cur_htbl; + mlx5dr_htbl_get(cur_htbl); + + new_htbl = dr_rule_rehash(rule, nic_rule, cur_htbl, + ste_location, send_ste_list); + if (!new_htbl) { + mlx5dr_err(dmn, "Failed creating rehash table, htbl-log_size: %d\n", + cur_htbl->chunk->size); + mlx5dr_htbl_put(cur_htbl); + } else { + cur_htbl = new_htbl; + } + goto again; + } else { + /* Hash table index in use, add another collision (miss) */ + ste = dr_rule_handle_collision(matcher, + nic_matcher, + ste, + hw_ste, + miss_list, + send_ste_list); + if (!ste) { + mlx5dr_dbg(dmn, "failed adding collision entry, index: %d\n", + index); + return NULL; + } + } + } + return ste; +} + +static bool dr_rule_cmp_value_to_mask(u8 *mask, u8 *value, + u32 s_idx, u32 e_idx) +{ + u32 i; + + for (i = s_idx; i < e_idx; i++) { + if (value[i] & ~mask[i]) { + pr_info("Rule parameters contains a value not specified by mask\n"); + return false; + } + } + return true; +} + +static bool dr_rule_verify(struct mlx5dr_matcher *matcher, + struct mlx5dr_match_parameters *value, + struct mlx5dr_match_param *param) +{ + u8 match_criteria = matcher->match_criteria; + size_t value_size = value->match_sz; + u8 *mask_p = (u8 *)&matcher->mask; + u8 *param_p = (u8 *)param; + u32 s_idx, e_idx; + + if (!value_size || + (value_size > DR_SZ_MATCH_PARAM || (value_size % sizeof(u32)))) { + mlx5dr_err(matcher->tbl->dmn, "Rule parameters length is incorrect\n"); + return false; + } + + mlx5dr_ste_copy_param(matcher->match_criteria, param, value, false); + + if (match_criteria & DR_MATCHER_CRITERIA_OUTER) { + s_idx = offsetof(struct mlx5dr_match_param, outer); + e_idx = min(s_idx + sizeof(param->outer), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + mlx5dr_err(matcher->tbl->dmn, "Rule outer parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_MISC) { + s_idx = offsetof(struct mlx5dr_match_param, misc); + e_idx = min(s_idx + sizeof(param->misc), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + mlx5dr_err(matcher->tbl->dmn, "Rule misc parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_INNER) { + s_idx = offsetof(struct 
mlx5dr_match_param, inner); + e_idx = min(s_idx + sizeof(param->inner), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + mlx5dr_err(matcher->tbl->dmn, "Rule inner parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_MISC2) { + s_idx = offsetof(struct mlx5dr_match_param, misc2); + e_idx = min(s_idx + sizeof(param->misc2), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + mlx5dr_err(matcher->tbl->dmn, "Rule misc2 parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_MISC3) { + s_idx = offsetof(struct mlx5dr_match_param, misc3); + e_idx = min(s_idx + sizeof(param->misc3), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + mlx5dr_err(matcher->tbl->dmn, "Rule misc3 parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_MISC4) { + s_idx = offsetof(struct mlx5dr_match_param, misc4); + e_idx = min(s_idx + sizeof(param->misc4), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + mlx5dr_err(matcher->tbl->dmn, + "Rule misc4 parameters contains a value not specified by mask\n"); + return false; + } + } + + if (match_criteria & DR_MATCHER_CRITERIA_MISC5) { + s_idx = offsetof(struct mlx5dr_match_param, misc5); + e_idx = min(s_idx + sizeof(param->misc5), value_size); + + if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) { + mlx5dr_err(matcher->tbl->dmn, "Rule misc5 parameters contains a value not specified by mask\n"); + return false; + } + } + return true; +} + +static int dr_rule_destroy_rule_nic(struct mlx5dr_rule *rule, + struct mlx5dr_rule_rx_tx *nic_rule) +{ + /* Check if this nic rule was actually created, or was it skipped + * and only the other type of the RX/TX nic rule was created. 
+ */ + if (!nic_rule->last_rule_ste) + return 0; + + mlx5dr_domain_nic_lock(nic_rule->nic_matcher->nic_tbl->nic_dmn); + dr_rule_clean_rule_members(rule, nic_rule); + + nic_rule->nic_matcher->rules--; + if (!nic_rule->nic_matcher->rules) + mlx5dr_matcher_remove_from_tbl_nic(rule->matcher->tbl->dmn, + nic_rule->nic_matcher); + + mlx5dr_domain_nic_unlock(nic_rule->nic_matcher->nic_tbl->nic_dmn); + + return 0; +} + +static int dr_rule_destroy_rule_fdb(struct mlx5dr_rule *rule) +{ + dr_rule_destroy_rule_nic(rule, &rule->rx); + dr_rule_destroy_rule_nic(rule, &rule->tx); + return 0; +} + +static int dr_rule_destroy_rule(struct mlx5dr_rule *rule) +{ + struct mlx5dr_domain *dmn = rule->matcher->tbl->dmn; + + mlx5dr_dbg_rule_del(rule); + + switch (dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + dr_rule_destroy_rule_nic(rule, &rule->rx); + break; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + dr_rule_destroy_rule_nic(rule, &rule->tx); + break; + case MLX5DR_DOMAIN_TYPE_FDB: + dr_rule_destroy_rule_fdb(rule); + break; + default: + return -EINVAL; + } + + dr_rule_remove_action_members(rule); + kfree(rule); + return 0; +} + +static enum mlx5dr_ipv dr_rule_get_ipv(struct mlx5dr_match_spec *spec) +{ + if (spec->ip_version == 6 || spec->ethertype == ETH_P_IPV6) + return DR_RULE_IPV6; + + return DR_RULE_IPV4; +} + +static bool dr_rule_skip(enum mlx5dr_domain_type domain, + enum mlx5dr_domain_nic_type nic_type, + struct mlx5dr_match_param *mask, + struct mlx5dr_match_param *value, + u32 flow_source) +{ + bool rx = nic_type == DR_DOMAIN_NIC_TYPE_RX; + + if (domain != MLX5DR_DOMAIN_TYPE_FDB) + return false; + + if (mask->misc.source_port) { + if (rx && value->misc.source_port != MLX5_VPORT_UPLINK) + return true; + + if (!rx && value->misc.source_port == MLX5_VPORT_UPLINK) + return true; + } + + if (rx && flow_source == MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT) + return true; + + if (!rx && flow_source == MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK) + return true; + + return false; +} + +static int +dr_rule_create_rule_nic(struct mlx5dr_rule *rule, + struct mlx5dr_rule_rx_tx *nic_rule, + struct mlx5dr_match_param *param, + size_t num_actions, + struct mlx5dr_action *actions[]) +{ + u8 hw_ste_arr[DR_RULE_MAX_STE_CHAIN * DR_STE_SIZE] = {}; + struct mlx5dr_ste_send_info *ste_info, *tmp_ste_info; + struct mlx5dr_matcher *matcher = rule->matcher; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_matcher_rx_tx *nic_matcher; + struct mlx5dr_domain_rx_tx *nic_dmn; + struct mlx5dr_ste_htbl *htbl = NULL; + struct mlx5dr_ste_htbl *cur_htbl; + struct mlx5dr_ste *ste = NULL; + LIST_HEAD(send_ste_list); + u32 new_hw_ste_arr_sz; + int ret, i; + + nic_matcher = nic_rule->nic_matcher; + nic_dmn = nic_matcher->nic_tbl->nic_dmn; + + if (dr_rule_skip(dmn->type, nic_dmn->type, &matcher->mask, param, + rule->flow_source)) + return 0; + + mlx5dr_domain_nic_lock(nic_dmn); + + ret = mlx5dr_matcher_add_to_tbl_nic(dmn, nic_matcher); + if (ret) + goto free_hw_ste; + + ret = mlx5dr_matcher_select_builders(matcher, + nic_matcher, + dr_rule_get_ipv(¶m->outer), + dr_rule_get_ipv(¶m->inner)); + if (ret) + goto remove_from_nic_tbl; + + /* Set the tag values inside the ste array */ + ret = mlx5dr_ste_build_ste_arr(matcher, nic_matcher, param, hw_ste_arr); + if (ret) + goto remove_from_nic_tbl; + + /* Set the actions values/addresses inside the ste array */ + ret = mlx5dr_actions_build_ste_arr(matcher, nic_matcher, actions, + num_actions, hw_ste_arr, + &new_hw_ste_arr_sz); + if (ret) + goto remove_from_nic_tbl; + + cur_htbl = nic_matcher->s_htbl; + 
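dr_rule_skip() above is what keeps an FDB rule from being installed on both sides when only one direction can ever see the traffic: a rule that matches on a source port is kept on the RX nic rule only if that port is the uplink, on the TX nic rule only if it is a local vport, and the flow_source hint can exclude one direction outright. A minimal standalone C sketch of that decision, with the driver's enums and match structures replaced by hypothetical flat parameters (the direction comments reflect this reading of the code, not driver documentation):

#include <stdbool.h>
#include <stdio.h>

enum flow_source_hint { SRC_ANY, SRC_LOCAL_VPORT, SRC_UPLINK };

static bool rule_skip(bool is_fdb, bool rx, bool match_on_source_port,
                      bool source_port_is_uplink, enum flow_source_hint src)
{
        if (!is_fdb)
                return false;                /* NIC RX/TX domains never skip */

        if (match_on_source_port) {
                if (rx && !source_port_is_uplink)
                        return true;         /* RX side only keeps uplink-sourced rules */
                if (!rx && source_port_is_uplink)
                        return true;         /* TX side only keeps vport-sourced rules */
        }

        if (rx && src == SRC_LOCAL_VPORT)
                return true;
        if (!rx && src == SRC_UPLINK)
                return true;

        return false;
}

int main(void)
{
        /* A rule matching on a local (non-uplink) source port: the TX nic
         * rule is installed, the RX nic rule is skipped.
         */
        printf("skip rx: %d\n", rule_skip(true, true,  true, false, SRC_ANY));
        printf("skip tx: %d\n", rule_skip(true, false, true, false, SRC_ANY));
        return 0;
}

In the FDB path both dr_rule_create_rule_nic() calls are made, so one of them may return early through this check before ever taking the nic lock.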
+ /* Go over the array of STEs, and build dr_ste accordingly. + * The loop is over only the builders which are equal or less to the + * number of stes, in case we have actions that lives in other stes. + */ + for (i = 0; i < nic_matcher->num_of_builders; i++) { + /* Calculate CRC and keep new ste entry */ + u8 *cur_hw_ste_ent = hw_ste_arr + (i * DR_STE_SIZE); + + ste = dr_rule_handle_ste_branch(rule, + nic_rule, + &send_ste_list, + cur_htbl, + cur_hw_ste_ent, + i + 1, + &htbl); + if (!ste) { + mlx5dr_err(dmn, "Failed creating next branch\n"); + ret = -ENOENT; + goto free_rule; + } + + cur_htbl = ste->next_htbl; + + mlx5dr_ste_get(ste); + mlx5dr_rule_set_last_member(nic_rule, ste, true); + } + + /* Connect actions */ + ret = dr_rule_handle_action_stes(rule, nic_rule, &send_ste_list, + ste, hw_ste_arr, new_hw_ste_arr_sz); + if (ret) { + mlx5dr_dbg(dmn, "Failed apply actions\n"); + goto free_rule; + } + ret = dr_rule_send_update_list(&send_ste_list, dmn, true); + if (ret) { + mlx5dr_err(dmn, "Failed sending ste!\n"); + goto free_rule; + } + + if (htbl) + mlx5dr_htbl_put(htbl); + + nic_matcher->rules++; + + mlx5dr_domain_nic_unlock(nic_dmn); + + return 0; + +free_rule: + dr_rule_clean_rule_members(rule, nic_rule); + /* Clean all ste_info's */ + list_for_each_entry_safe(ste_info, tmp_ste_info, &send_ste_list, send_list) { + list_del(&ste_info->send_list); + mlx5dr_send_info_free(ste_info); + } + +remove_from_nic_tbl: + if (!nic_matcher->rules) + mlx5dr_matcher_remove_from_tbl_nic(dmn, nic_matcher); + +free_hw_ste: + mlx5dr_domain_nic_unlock(nic_dmn); + return ret; +} + +static int +dr_rule_create_rule_fdb(struct mlx5dr_rule *rule, + struct mlx5dr_match_param *param, + size_t num_actions, + struct mlx5dr_action *actions[]) +{ + struct mlx5dr_match_param copy_param = {}; + int ret; + + /* Copy match_param since they will be consumed during the first + * nic_rule insertion. 
+ */ + memcpy(©_param, param, sizeof(struct mlx5dr_match_param)); + + ret = dr_rule_create_rule_nic(rule, &rule->rx, param, + num_actions, actions); + if (ret) + return ret; + + ret = dr_rule_create_rule_nic(rule, &rule->tx, ©_param, + num_actions, actions); + if (ret) + goto destroy_rule_nic_rx; + + return 0; + +destroy_rule_nic_rx: + dr_rule_destroy_rule_nic(rule, &rule->rx); + return ret; +} + +static struct mlx5dr_rule * +dr_rule_create_rule(struct mlx5dr_matcher *matcher, + struct mlx5dr_match_parameters *value, + size_t num_actions, + struct mlx5dr_action *actions[], + u32 flow_source) +{ + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_match_param param = {}; + struct mlx5dr_rule *rule; + int ret; + + if (!dr_rule_verify(matcher, value, ¶m)) + return NULL; + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) + return NULL; + + rule->matcher = matcher; + rule->flow_source = flow_source; + INIT_LIST_HEAD(&rule->rule_actions_list); + + ret = dr_rule_add_action_members(rule, num_actions, actions); + if (ret) + goto free_rule; + + switch (dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + rule->rx.nic_matcher = &matcher->rx; + ret = dr_rule_create_rule_nic(rule, &rule->rx, ¶m, + num_actions, actions); + break; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + rule->tx.nic_matcher = &matcher->tx; + ret = dr_rule_create_rule_nic(rule, &rule->tx, ¶m, + num_actions, actions); + break; + case MLX5DR_DOMAIN_TYPE_FDB: + rule->rx.nic_matcher = &matcher->rx; + rule->tx.nic_matcher = &matcher->tx; + ret = dr_rule_create_rule_fdb(rule, ¶m, + num_actions, actions); + break; + default: + ret = -EINVAL; + break; + } + + if (ret) + goto remove_action_members; + + INIT_LIST_HEAD(&rule->dbg_node); + mlx5dr_dbg_rule_add(rule); + return rule; + +remove_action_members: + dr_rule_remove_action_members(rule); +free_rule: + kfree(rule); + mlx5dr_err(dmn, "Failed creating rule\n"); + return NULL; +} + +struct mlx5dr_rule *mlx5dr_rule_create(struct mlx5dr_matcher *matcher, + struct mlx5dr_match_parameters *value, + size_t num_actions, + struct mlx5dr_action *actions[], + u32 flow_source) +{ + struct mlx5dr_rule *rule; + + refcount_inc(&matcher->refcount); + + rule = dr_rule_create_rule(matcher, value, num_actions, actions, flow_source); + if (!rule) + refcount_dec(&matcher->refcount); + + return rule; +} + +int mlx5dr_rule_destroy(struct mlx5dr_rule *rule) +{ + struct mlx5dr_matcher *matcher = rule->matcher; + int ret; + + ret = dr_rule_destroy_rule(rule); + if (!ret) + refcount_dec(&matcher->refcount); + + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c new file mode 100644 index 0000000..9c9e4a8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -0,0 +1,1353 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include +#include "dr_types.h" + +#define QUEUE_SIZE 128 +#define SIGNAL_PER_DIV_QUEUE 16 +#define TH_NUMS_TO_DRAIN 2 +#define DR_SEND_INFO_POOL_SIZE 1000 + +enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 }; + +struct dr_data_seg { + u64 addr; + u32 length; + u32 lkey; + unsigned int send_flags; +}; + +enum send_info_type { + WRITE_ICM = 0, + GTA_ARG = 1, +}; + +struct postsend_info { + enum send_info_type type; + struct dr_data_seg write; + struct dr_data_seg read; + u64 remote_addr; + u32 rkey; +}; + +struct dr_qp_rtr_attr { + struct mlx5dr_cmd_gid_attr dgid_attr; + enum ib_mtu mtu; + u32 qp_num; + u16 port_num; + u8 min_rnr_timer; + u8 sgid_index; + u16 udp_src_port; + u8 fl:1; +}; + +struct dr_qp_rts_attr { + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; +}; + +struct dr_qp_init_attr { + u32 cqn; + u32 pdn; + u32 max_send_wr; + struct mlx5_uars_page *uar; + u8 isolate_vl_tc:1; +}; + +struct mlx5dr_send_info_pool_obj { + struct mlx5dr_ste_send_info ste_send_info; + struct mlx5dr_send_info_pool *pool; + struct list_head list_node; +}; + +struct mlx5dr_send_info_pool { + struct list_head free_list; +}; + +static int dr_send_info_pool_fill(struct mlx5dr_send_info_pool *pool) +{ + struct mlx5dr_send_info_pool_obj *pool_obj, *tmp_pool_obj; + int i; + + for (i = 0; i < DR_SEND_INFO_POOL_SIZE; i++) { + pool_obj = kzalloc(sizeof(*pool_obj), GFP_KERNEL); + if (!pool_obj) + goto clean_pool; + + pool_obj->pool = pool; + list_add_tail(&pool_obj->list_node, &pool->free_list); + } + + return 0; + +clean_pool: + list_for_each_entry_safe(pool_obj, tmp_pool_obj, &pool->free_list, list_node) { + list_del(&pool_obj->list_node); + kfree(pool_obj); + } + + return -ENOMEM; +} + +static void dr_send_info_pool_destroy(struct mlx5dr_send_info_pool *pool) +{ + struct mlx5dr_send_info_pool_obj *pool_obj, *tmp_pool_obj; + + list_for_each_entry_safe(pool_obj, tmp_pool_obj, &pool->free_list, list_node) { + list_del(&pool_obj->list_node); + kfree(pool_obj); + } + + kfree(pool); +} + +void mlx5dr_send_info_pool_destroy(struct mlx5dr_domain *dmn) +{ + dr_send_info_pool_destroy(dmn->send_info_pool_tx); + dr_send_info_pool_destroy(dmn->send_info_pool_rx); +} + +static struct mlx5dr_send_info_pool *dr_send_info_pool_create(void) +{ + struct mlx5dr_send_info_pool *pool; + int ret; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + INIT_LIST_HEAD(&pool->free_list); + + ret = dr_send_info_pool_fill(pool); + if (ret) { + kfree(pool); + return NULL; + } + + return pool; +} + +int mlx5dr_send_info_pool_create(struct mlx5dr_domain *dmn) +{ + dmn->send_info_pool_rx = dr_send_info_pool_create(); + if (!dmn->send_info_pool_rx) + return -ENOMEM; + + dmn->send_info_pool_tx = dr_send_info_pool_create(); + if (!dmn->send_info_pool_tx) { + dr_send_info_pool_destroy(dmn->send_info_pool_rx); + return -ENOMEM; + } + + return 0; +} + +struct mlx5dr_ste_send_info +*mlx5dr_send_info_alloc(struct mlx5dr_domain *dmn, + enum mlx5dr_domain_nic_type nic_type) +{ + struct mlx5dr_send_info_pool_obj *pool_obj = NULL; + struct mlx5dr_send_info_pool *pool; + int ret; + + pool = nic_type == DR_DOMAIN_NIC_TYPE_RX ? 
dmn->send_info_pool_rx : + dmn->send_info_pool_tx; + + if (unlikely(list_empty(&pool->free_list))) { + ret = dr_send_info_pool_fill(pool); + if (ret) + return NULL; + } + + pool_obj = list_first_entry_or_null(&pool->free_list, + struct mlx5dr_send_info_pool_obj, + list_node); + + if (likely(pool_obj)) { + list_del_init(&pool_obj->list_node); + } else { + WARN_ONCE(!pool_obj, "Failed getting ste send info obj from pool"); + return NULL; + } + + return &pool_obj->ste_send_info; +} + +void mlx5dr_send_info_free(struct mlx5dr_ste_send_info *ste_send_info) +{ + struct mlx5dr_send_info_pool_obj *pool_obj; + + pool_obj = container_of(ste_send_info, + struct mlx5dr_send_info_pool_obj, + ste_send_info); + + list_add(&pool_obj->list_node, &pool_obj->pool->free_list); +} + +static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64) +{ + unsigned int idx; + u8 opcode; + + opcode = get_cqe_opcode(cqe64); + if (opcode == MLX5_CQE_REQ_ERR) { + idx = be16_to_cpu(cqe64->wqe_counter) & + (dr_cq->qp->sq.wqe_cnt - 1); + dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1; + } else if (opcode == MLX5_CQE_RESP_ERR) { + ++dr_cq->qp->sq.cc; + } else { + idx = be16_to_cpu(cqe64->wqe_counter) & + (dr_cq->qp->sq.wqe_cnt - 1); + dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1; + + return CQ_OK; + } + + return CQ_POLL_ERR; +} + +static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq) +{ + struct mlx5_cqe64 *cqe64; + int err; + + cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq); + if (!cqe64) { + if (unlikely(dr_cq->mdev->state == + MLX5_DEVICE_STATE_INTERNAL_ERROR)) { + mlx5_core_dbg_once(dr_cq->mdev, + "Polling CQ while device is shutting down\n"); + return CQ_POLL_ERR; + } + return CQ_EMPTY; + } + + mlx5_cqwq_pop(&dr_cq->wq); + err = dr_parse_cqe(dr_cq, cqe64); + mlx5_cqwq_update_db_record(&dr_cq->wq); + + return err; +} + +static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne) +{ + int npolled; + int err = 0; + + for (npolled = 0; npolled < ne; ++npolled) { + err = dr_cq_poll_one(dr_cq); + if (err != CQ_OK) + break; + } + + return err == CQ_POLL_ERR ? 
err : npolled; +} + +static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, + struct dr_qp_init_attr *attr) +{ + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; + struct mlx5_wq_param wqp; + struct mlx5dr_qp *dr_qp; + int inlen; + void *qpc; + void *in; + int err; + + dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL); + if (!dr_qp) + return NULL; + + wqp.buf_numa_node = mdev->priv.numa_node; + wqp.db_numa_node = mdev->priv.numa_node; + + dr_qp->rq.pc = 0; + dr_qp->rq.cc = 0; + dr_qp->rq.wqe_cnt = 256; + dr_qp->sq.pc = 0; + dr_qp->sq.cc = 0; + dr_qp->sq.head = 0; + dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr); + + MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4); + MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt)); + MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt)); + err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq, + &dr_qp->wq_ctrl); + if (err) { + mlx5_core_warn(mdev, "Can't create QP WQ\n"); + goto err_wq; + } + + dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt, + sizeof(dr_qp->sq.wqe_head[0]), + GFP_KERNEL); + + if (!dr_qp->sq.wqe_head) { + mlx5_core_warn(mdev, "Can't allocate wqe head\n"); + goto err_wqe_head; + } + + inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * + dr_qp->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_in; + } + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc); + MLX5_SET(qpc, qpc, pd, attr->pdn); + MLX5_SET(qpc, qpc, uar_page, attr->uar->index); + MLX5_SET(qpc, qpc, log_page_size, + dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); + MLX5_SET(qpc, qpc, cqn_snd, attr->cqn); + MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn); + MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt)); + MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); + MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt)); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); + MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma); + if (MLX5_CAP_GEN(mdev, cqe_version) == 1) + MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); + mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, + in, pas)); + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn); + kvfree(in); + if (err) + goto err_in; + dr_qp->uar = attr->uar; + + return dr_qp; + +err_in: + kfree(dr_qp->sq.wqe_head); +err_wqe_head: + mlx5_wq_destroy(&dr_qp->wq_ctrl); +err_wq: + kfree(dr_qp); + return NULL; +} + +static void dr_destroy_qp(struct mlx5_core_dev *mdev, + struct mlx5dr_qp *dr_qp) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn); + mlx5_cmd_exec_in(mdev, destroy_qp, in); + + kfree(dr_qp->sq.wqe_head); + mlx5_wq_destroy(&dr_qp->wq_ctrl); + kfree(dr_qp); +} + +static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl) +{ + dma_wmb(); + *dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff); + + /* After wmb() the hw aware of new work */ + wmb(); + + mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET); +} + 
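dr_cmd_notify_hw() above publishes already-built WQEs by updating the doorbell record and then writing to the UAR, and the surrounding SQ/CQ code addresses those WQEs with free-running counters masked into a power-of-two ring: dr_create_rc_qp() rounds max_send_wr up with roundup_pow_of_two(), and dr_parse_cqe() recovers the completed slot with wqe_counter & (wqe_cnt - 1). A small self-contained sketch of that indexing scheme, using illustrative names rather than the driver's:

#include <stdio.h>

static unsigned int roundup_pow2(unsigned int v)
{
        unsigned int r = 1;

        while (r < v)
                r <<= 1;
        return r;
}

int main(void)
{
        unsigned int wqe_cnt = roundup_pow2(100); /* e.g. max_send_wr = 100 -> 128 slots */
        unsigned int pc = 0;                      /* free-running producer counter */

        for (int i = 0; i < 300; i++) {
                unsigned int idx = pc & (wqe_cnt - 1); /* slot index, never out of range */

                if (i % 100 == 0)
                        printf("pc=%u -> idx=%u\n", pc, idx);
                pc++;
        }
        return 0;
}

In the driver the producer counter advances by however many 64-byte basic blocks the posted WQE occupies (DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB)) rather than by one, but because the counter is free-running the masking works the same way.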
+static void +dr_rdma_handle_flow_access_arg_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, + u32 remote_addr, + struct dr_data_seg *data_seg, + int *size) +{ + struct mlx5_wqe_header_modify_argument_update_seg *wq_arg_seg; + struct mlx5_wqe_flow_update_ctrl_seg *wq_flow_seg; + + wq_ctrl->general_id = cpu_to_be32(remote_addr); + wq_flow_seg = (void *)(wq_ctrl + 1); + + /* mlx5_wqe_flow_update_ctrl_seg - all reserved */ + memset(wq_flow_seg, 0, sizeof(*wq_flow_seg)); + wq_arg_seg = (void *)(wq_flow_seg + 1); + + memcpy(wq_arg_seg->argument_list, + (void *)data_seg->addr, + data_seg->length); + + *size = sizeof(*wq_ctrl) / 16 + /* WQE ctrl segment */ + sizeof(*wq_flow_seg) / 16 + /* WQE flow update ctrl seg - reserved */ + sizeof(*wq_arg_seg) / 16; /* WQE hdr modify arg seg - data */ +} + +static void +dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, + u64 remote_addr, + u32 rkey, + struct dr_data_seg *data_seg, + unsigned int *size) +{ + struct mlx5_wqe_raddr_seg *wq_raddr; + struct mlx5_wqe_data_seg *wq_dseg; + + wq_raddr = (void *)(wq_ctrl + 1); + + wq_raddr->raddr = cpu_to_be64(remote_addr); + wq_raddr->rkey = cpu_to_be32(rkey); + wq_raddr->reserved = 0; + + wq_dseg = (void *)(wq_raddr + 1); + + wq_dseg->byte_count = cpu_to_be32(data_seg->length); + wq_dseg->lkey = cpu_to_be32(data_seg->lkey); + wq_dseg->addr = cpu_to_be64(data_seg->addr); + + *size = sizeof(*wq_ctrl) / 16 + /* WQE ctrl segment */ + sizeof(*wq_dseg) / 16 + /* WQE data segment */ + sizeof(*wq_raddr) / 16; /* WQE remote addr segment */ +} + +static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *wq_ctrl, + struct dr_data_seg *data_seg) +{ + wq_ctrl->signature = 0; + wq_ctrl->rsvd[0] = 0; + wq_ctrl->rsvd[1] = 0; + wq_ctrl->fm_ce_se = data_seg->send_flags & IB_SEND_SIGNALED ? 
+ MLX5_WQE_CTRL_CQ_UPDATE : 0; + wq_ctrl->imm = 0; +} + +static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr, + u32 rkey, struct dr_data_seg *data_seg, + u32 opcode, bool notify_hw) +{ + struct mlx5_wqe_ctrl_seg *wq_ctrl; + int opcode_mod = 0; + unsigned int size; + unsigned int idx; + + idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1); + + wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx); + dr_set_ctrl_seg(wq_ctrl, data_seg); + + switch (opcode) { + case MLX5_OPCODE_RDMA_READ: + case MLX5_OPCODE_RDMA_WRITE: + dr_rdma_handle_icm_write_segments(wq_ctrl, remote_addr, + rkey, data_seg, &size); + break; + case MLX5_OPCODE_FLOW_TBL_ACCESS: + opcode_mod = MLX5_CMD_OP_MOD_UPDATE_HEADER_MODIFY_ARGUMENT; + dr_rdma_handle_flow_access_arg_segments(wq_ctrl, remote_addr, + data_seg, &size); + break; + default: + WARN(true, "illegal opcode %d", opcode); + return; + } + + /* -------------------------------------------------------- + * |opcode_mod (8 bit)|wqe_index (16 bits)| opcod (8 bits)| + * -------------------------------------------------------- + */ + wq_ctrl->opmod_idx_opcode = + cpu_to_be32((opcode_mod << 24) | + ((dr_qp->sq.pc & 0xffff) << 8) | + opcode); + wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8); + + dr_qp->sq.pc += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + dr_qp->sq.wqe_head[idx] = dr_qp->sq.head++; + + if (notify_hw) + dr_cmd_notify_hw(dr_qp, wq_ctrl); +} + +static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info) +{ + if (send_info->type == WRITE_ICM) { + /* false, because we delay the post_send_db till the coming READ */ + dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey, + &send_info->write, MLX5_OPCODE_RDMA_WRITE, false); + /* true, because we send WRITE + READ together */ + dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey, + &send_info->read, MLX5_OPCODE_RDMA_READ, true); + } else { /* GTA_ARG */ + dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey, + &send_info->write, MLX5_OPCODE_FLOW_TBL_ACCESS, true); + } + +} + +/** + * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent + * with send_list parameters: + * + * @ste: The data that attached to this specific ste + * @size: of data to write + * @offset: of the data from start of the hw_ste entry + * @data: data + * @ste_info: ste to be sent with send_list + * @send_list: to append into it + * @copy_data: if true indicates that the data should be kept because + * it's not backuped any where (like in re-hash). + * if false, it lets the data to be updated after + * it was added to the list. + */ +void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size, + u16 offset, u8 *data, + struct mlx5dr_ste_send_info *ste_info, + struct list_head *send_list, + bool copy_data) +{ + ste_info->size = size; + ste_info->ste = ste; + ste_info->offset = offset; + + if (copy_data) { + memcpy(ste_info->data_cont, data, size); + ste_info->data = ste_info->data_cont; + } else { + ste_info->data = data; + } + + list_add_tail(&ste_info->send_list, send_list); +} + +/* The function tries to consume one wc each time, unless the queue is full, in + * that case, which means that the hw is behind the sw in a full queue len + * the function will drain the cq till it empty. 
+ */ +static int dr_handle_pending_wc(struct mlx5dr_domain *dmn, + struct mlx5dr_send_ring *send_ring) +{ + bool is_drain = false; + int ne; + + if (send_ring->pending_wqe < send_ring->signal_th) + return 0; + + /* Queue is full start drain it */ + if (send_ring->pending_wqe >= + dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN) + is_drain = true; + + do { + ne = dr_poll_cq(send_ring->cq, 1); + if (unlikely(ne < 0)) { + mlx5_core_warn_once(dmn->mdev, "SMFS QPN 0x%x is disabled/limited", + send_ring->qp->qpn); + send_ring->err_state = true; + return ne; + } else if (ne == 1) { + send_ring->pending_wqe -= send_ring->signal_th; + } + } while (ne == 1 || + (is_drain && send_ring->pending_wqe >= send_ring->signal_th)); + + return 0; +} + +static void dr_fill_write_args_segs(struct mlx5dr_send_ring *send_ring, + struct postsend_info *send_info) +{ + send_ring->pending_wqe++; + + if (send_ring->pending_wqe % send_ring->signal_th == 0) + send_info->write.send_flags = IB_SEND_SIGNALED; + else + send_info->write.send_flags = 0; +} + +static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, + struct mlx5dr_send_ring *send_ring, + struct postsend_info *send_info) +{ + u32 buff_offset; + + if (send_info->write.length > dmn->info.max_inline_size) { + buff_offset = (send_ring->tx_head & + (dmn->send_ring->signal_th - 1)) * + send_ring->max_post_send_size; + /* Copy to ring mr */ + memcpy(send_ring->buf + buff_offset, + (void *)(uintptr_t)send_info->write.addr, + send_info->write.length); + send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset; + send_info->write.lkey = send_ring->mr->mkey; + + send_ring->tx_head++; + } + + send_ring->pending_wqe++; + + if (send_ring->pending_wqe % send_ring->signal_th == 0) + send_info->write.send_flags |= IB_SEND_SIGNALED; + else + send_info->write.send_flags = 0; + + send_ring->pending_wqe++; + send_info->read.length = send_info->write.length; + /* Read into the sync buffer */ + send_info->read.addr = (uintptr_t)send_ring->sync_mr->dma_addr; + send_info->read.lkey = send_ring->sync_mr->mkey; + + if (send_ring->pending_wqe % send_ring->signal_th == 0) + send_info->read.send_flags |= IB_SEND_SIGNALED; + else + send_info->read.send_flags = 0; +} + +static void dr_fill_data_segs(struct mlx5dr_domain *dmn, + struct mlx5dr_send_ring *send_ring, + struct postsend_info *send_info) +{ + if (send_info->type == WRITE_ICM) + dr_fill_write_icm_segs(dmn, send_ring, send_info); + else /* args */ + dr_fill_write_args_segs(send_ring, send_info); +} + +static int dr_postsend_icm_data(struct mlx5dr_domain *dmn, + struct postsend_info *send_info) +{ + struct mlx5dr_send_ring *send_ring = dmn->send_ring; + int ret; + + if (unlikely(dmn->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR || + send_ring->err_state)) { + mlx5_core_dbg_once(dmn->mdev, + "Skipping post send: QP err state: %d, device state: %d\n", + send_ring->err_state, dmn->mdev->state); + return 0; + } + + spin_lock(&send_ring->lock); + + ret = dr_handle_pending_wc(dmn, send_ring); + if (ret) + goto out_unlock; + + dr_fill_data_segs(dmn, send_ring, send_info); + dr_post_send(send_ring->qp, send_info); + +out_unlock: + spin_unlock(&send_ring->lock); + return ret; +} + +static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn, + struct mlx5dr_ste_htbl *htbl, + u8 **data, + u32 *byte_size, + int *iterations, + int *num_stes) +{ + u32 chunk_byte_size = mlx5dr_icm_pool_get_chunk_byte_size(htbl->chunk); + int alloc_size; + + if (chunk_byte_size > dmn->send_ring->max_post_send_size) { + *iterations = 
chunk_byte_size / dmn->send_ring->max_post_send_size; + *byte_size = dmn->send_ring->max_post_send_size; + alloc_size = *byte_size; + *num_stes = *byte_size / DR_STE_SIZE; + } else { + *iterations = 1; + *num_stes = mlx5dr_icm_pool_get_chunk_num_of_entries(htbl->chunk); + alloc_size = *num_stes * DR_STE_SIZE; + } + + *data = kvzalloc(alloc_size, GFP_KERNEL); + if (!*data) + return -ENOMEM; + + return 0; +} + +/** + * mlx5dr_send_postsend_ste: write size bytes into offset from the hw cm. + * + * @dmn: Domain + * @ste: The ste struct that contains the data (at + * least part of it) + * @data: The real data to send size data + * @size: for writing. + * @offset: The offset from the icm mapped data to + * start write to this for write only part of the + * buffer. + * + * Return: 0 on success. + */ +int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste, + u8 *data, u16 size, u16 offset) +{ + struct postsend_info send_info = {}; + + mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size); + + send_info.write.addr = (uintptr_t)data; + send_info.write.length = size; + send_info.write.lkey = 0; + send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset; + send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(ste->htbl->chunk); + + return dr_postsend_icm_data(dmn, &send_info); +} + +int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn, + struct mlx5dr_ste_htbl *htbl, + u8 *formatted_ste, u8 *mask) +{ + u32 byte_size = mlx5dr_icm_pool_get_chunk_byte_size(htbl->chunk); + int num_stes_per_iter; + int iterations; + u8 *data; + int ret; + int i; + int j; + + ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size, + &iterations, &num_stes_per_iter); + if (ret) + return ret; + + mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE); + + /* Send the data iteration times */ + for (i = 0; i < iterations; i++) { + u32 ste_index = i * (byte_size / DR_STE_SIZE); + struct postsend_info send_info = {}; + + /* Copy all ste's on the data buffer + * need to add the bit_mask + */ + for (j = 0; j < num_stes_per_iter; j++) { + struct mlx5dr_ste *ste = &htbl->chunk->ste_arr[ste_index + j]; + u32 ste_off = j * DR_STE_SIZE; + + if (mlx5dr_ste_is_not_used(ste)) { + memcpy(data + ste_off, + formatted_ste, DR_STE_SIZE); + } else { + /* Copy data */ + memcpy(data + ste_off, + htbl->chunk->hw_ste_arr + + DR_STE_SIZE_REDUCED * (ste_index + j), + DR_STE_SIZE_REDUCED); + /* Copy bit_mask */ + memcpy(data + ste_off + DR_STE_SIZE_REDUCED, + mask, DR_STE_SIZE_MASK); + /* Only when we have mask we need to re-arrange the STE */ + mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, + data + (j * DR_STE_SIZE), + DR_STE_SIZE); + } + } + + send_info.write.addr = (uintptr_t)data; + send_info.write.length = byte_size; + send_info.write.lkey = 0; + send_info.remote_addr = + mlx5dr_ste_get_mr_addr(htbl->chunk->ste_arr + ste_index); + send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(htbl->chunk); + + ret = dr_postsend_icm_data(dmn, &send_info); + if (ret) + goto out_free; + } + +out_free: + kvfree(data); + return ret; +} + +/* Initialize htble with default STEs */ +int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn, + struct mlx5dr_ste_htbl *htbl, + u8 *ste_init_data, + bool update_hw_ste) +{ + u32 byte_size = mlx5dr_icm_pool_get_chunk_byte_size(htbl->chunk); + int iterations; + int num_stes; + u8 *copy_dst; + u8 *data; + int ret; + int i; + + ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size, + &iterations, &num_stes); + if (ret) + return ret; + + if (update_hw_ste) { + /* 
Copy the reduced STE to hash table ste_arr */ + for (i = 0; i < num_stes; i++) { + copy_dst = htbl->chunk->hw_ste_arr + i * DR_STE_SIZE_REDUCED; + memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED); + } + } + + mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE); + + /* Copy the same STE on the data buffer */ + for (i = 0; i < num_stes; i++) { + copy_dst = data + i * DR_STE_SIZE; + memcpy(copy_dst, ste_init_data, DR_STE_SIZE); + } + + /* Send the data iteration times */ + for (i = 0; i < iterations; i++) { + u8 ste_index = i * (byte_size / DR_STE_SIZE); + struct postsend_info send_info = {}; + + send_info.write.addr = (uintptr_t)data; + send_info.write.length = byte_size; + send_info.write.lkey = 0; + send_info.remote_addr = + mlx5dr_ste_get_mr_addr(htbl->chunk->ste_arr + ste_index); + send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(htbl->chunk); + + ret = dr_postsend_icm_data(dmn, &send_info); + if (ret) + goto out_free; + } + +out_free: + kvfree(data); + return ret; +} + +int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action) +{ + struct postsend_info send_info = {}; + int ret; + + send_info.write.addr = (uintptr_t)action->rewrite->data; + send_info.write.length = action->rewrite->num_of_actions * + DR_MODIFY_ACTION_SIZE; + send_info.write.lkey = 0; + send_info.remote_addr = + mlx5dr_icm_pool_get_chunk_mr_addr(action->rewrite->chunk); + send_info.rkey = mlx5dr_icm_pool_get_chunk_rkey(action->rewrite->chunk); + + ret = dr_postsend_icm_data(dmn, &send_info); + + return ret; +} + +int mlx5dr_send_postsend_args(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action) +{ + int data_len, iter = 0, cur_sent; + u64 addr; + int ret; + + addr = (uintptr_t)action->rewrite->data; + data_len = action->rewrite->num_of_actions * DR_MODIFY_ACTION_SIZE; + + do { + struct postsend_info send_info = {}; + + send_info.type = GTA_ARG; + send_info.write.addr = addr; + cur_sent = min_t(u32, data_len, MLX5DR_ACTION_CACHE_LINE_SIZE); + send_info.write.length = cur_sent; + send_info.write.lkey = 0; + send_info.remote_addr = + mlx5dr_arg_get_object_id(action->rewrite->arg) + iter; + + ret = dr_postsend_icm_data(dmn, &send_info); + if (ret) + goto out; + + iter++; + addr += cur_sent; + data_len -= cur_sent; + } while (data_len > 0); + +out: + return ret; +} + +static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev, + struct mlx5dr_qp *dr_qp, + int port) +{ + u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; + void *qpc; + + qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port); + MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + MLX5_SET(qpc, qpc, rre, 1); + MLX5_SET(qpc, qpc, rwe, 1); + + MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn); + + return mlx5_cmd_exec_in(mdev, rst2init_qp, in); +} + +static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev, + struct mlx5dr_qp *dr_qp, + struct dr_qp_rts_attr *attr) +{ + u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; + void *qpc; + + qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc); + + MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn); + + MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); + MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry); + MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ + + MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn); + + return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in); +} + +static int 
dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev, + struct mlx5dr_qp *dr_qp, + struct dr_qp_rtr_attr *attr) +{ + u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; + void *qpc; + + qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc); + + MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn); + + MLX5_SET(qpc, qpc, mtu, attr->mtu); + MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1); + MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num); + memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), + attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac)); + memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid)); + MLX5_SET(qpc, qpc, primary_address_path.src_addr_index, + attr->sgid_index); + + if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2) + MLX5_SET(qpc, qpc, primary_address_path.udp_sport, + attr->udp_src_port); + + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num); + MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl); + MLX5_SET(qpc, qpc, min_rnr_nak, 1); + + MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn); + + return mlx5_cmd_exec_in(mdev, init2rtr_qp, in); +} + +static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps) +{ + /* Check whether RC RoCE QP creation with force loopback is allowed. + * There are two separate capability bits for this: + * - force loopback when RoCE is enabled + * - force loopback when RoCE is disabled + */ + return ((caps->roce_caps.roce_en && + caps->roce_caps.fl_rc_qp_when_roce_enabled) || + (!caps->roce_caps.roce_en && + caps->roce_caps.fl_rc_qp_when_roce_disabled)); +} + +static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) +{ + struct mlx5dr_qp *dr_qp = dmn->send_ring->qp; + struct dr_qp_rts_attr rts_attr = {}; + struct dr_qp_rtr_attr rtr_attr = {}; + enum ib_mtu mtu = IB_MTU_1024; + u16 gid_index = 0; + int port = 1; + int ret; + + /* Init */ + ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port); + if (ret) { + mlx5dr_err(dmn, "Failed modify QP rst2init\n"); + return ret; + } + + /* RTR */ + rtr_attr.mtu = mtu; + rtr_attr.qp_num = dr_qp->qpn; + rtr_attr.min_rnr_timer = 12; + rtr_attr.port_num = port; + rtr_attr.udp_src_port = dmn->info.caps.roce_min_src_udp; + + /* If QP creation with force loopback is allowed, then there + * is no need for GID index when creating the QP. + * Otherwise we query GID attributes and use GID index. 
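+	 * With force loopback the traffic never leaves the device, so no SGID entry is needed.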
+ */ + rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps); + if (!rtr_attr.fl) { + ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index, + &rtr_attr.dgid_attr); + if (ret) + return ret; + + rtr_attr.sgid_index = gid_index; + } + + ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr); + if (ret) { + mlx5dr_err(dmn, "Failed modify QP init2rtr\n"); + return ret; + } + + /* RTS */ + rts_attr.timeout = 14; + rts_attr.retry_cnt = 7; + rts_attr.rnr_retry = 7; + + ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr); + if (ret) { + mlx5dr_err(dmn, "Failed modify QP rtr2rts\n"); + return ret; + } + + return 0; +} + +static void dr_cq_complete(struct mlx5_core_cq *mcq, + struct mlx5_eqe *eqe) +{ + pr_err("CQ completion CQ: #%u\n", mcq->cqn); +} + +static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, + struct mlx5_uars_page *uar, + size_t ncqe) +{ + u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {}; + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_wq_param wqp; + struct mlx5_cqe64 *cqe; + struct mlx5dr_cq *cq; + int inlen, err, eqn; + void *cqc, *in; + __be64 *pas; + int vector; + u32 i; + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return NULL; + + ncqe = roundup_pow_of_two(ncqe); + MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe)); + + wqp.buf_numa_node = mdev->priv.numa_node; + wqp.db_numa_node = mdev->priv.numa_node; + + err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq, + &cq->wq_ctrl); + if (err) + goto out; + + for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { + cqe = mlx5_cqwq_get_wqe(&cq->wq, i); + cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK; + } + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * cq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + goto err_cqwq; + + vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); + err = mlx5_vector2eqn(mdev, vector, &eqn); + if (err) { + kvfree(in); + goto err_cqwq; + } + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe)); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); + + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); + mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas); + + cq->mcq.comp = dr_cq_complete; + + err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); + kvfree(in); + + if (err) + goto err_cqwq; + + cq->mcq.cqe_sz = 64; + cq->mcq.set_ci_db = cq->wq_ctrl.db.db; + cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; + *cq->mcq.set_ci_db = 0; + + /* set no-zero value, in order to avoid the HW to run db-recovery on + * CQ that used in polling mode. 
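+	 * This CQ is never armed by the driver; completions are reaped by polling only.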
+ */ + *cq->mcq.arm_db = cpu_to_be32(2 << 28); + + cq->mcq.vector = 0; + cq->mcq.uar = uar; + cq->mdev = mdev; + + return cq; + +err_cqwq: + mlx5_wq_destroy(&cq->wq_ctrl); +out: + kfree(cq); + return NULL; +} + +static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq) +{ + mlx5_core_destroy_cq(mdev, &cq->mcq); + mlx5_wq_destroy(&cq->wq_ctrl); + kfree(cq); +} + +static int dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, u32 *mkey) +{ + u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {}; + void *mkc; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in)); +} + +static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev, + u32 pdn, void *buf, size_t size) +{ + struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL); + struct device *dma_device; + dma_addr_t dma_addr; + int err; + + if (!mr) + return NULL; + + dma_device = mlx5_core_dma_dev(mdev); + dma_addr = dma_map_single(dma_device, buf, size, + DMA_BIDIRECTIONAL); + err = dma_mapping_error(dma_device, dma_addr); + if (err) { + mlx5_core_warn(mdev, "Can't dma buf\n"); + kfree(mr); + return NULL; + } + + err = dr_create_mkey(mdev, pdn, &mr->mkey); + if (err) { + mlx5_core_warn(mdev, "Can't create mkey\n"); + dma_unmap_single(dma_device, dma_addr, size, + DMA_BIDIRECTIONAL); + kfree(mr); + return NULL; + } + + mr->dma_addr = dma_addr; + mr->size = size; + mr->addr = buf; + + return mr; +} + +static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr) +{ + mlx5_core_destroy_mkey(mdev, mr->mkey); + dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size, + DMA_BIDIRECTIONAL); + kfree(mr); +} + +int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) +{ + struct dr_qp_init_attr init_attr = {}; + int cq_size; + int size; + int ret; + + dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL); + if (!dmn->send_ring) + return -ENOMEM; + + cq_size = QUEUE_SIZE + 1; + dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size); + if (!dmn->send_ring->cq) { + mlx5dr_err(dmn, "Failed creating CQ\n"); + ret = -ENOMEM; + goto free_send_ring; + } + + init_attr.cqn = dmn->send_ring->cq->mcq.cqn; + init_attr.pdn = dmn->pdn; + init_attr.uar = dmn->uar; + init_attr.max_send_wr = QUEUE_SIZE; + + /* Isolated VL is applicable only if force loopback is supported */ + if (dr_send_allow_fl(&dmn->info.caps)) + init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc; + + spin_lock_init(&dmn->send_ring->lock); + + dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr); + if (!dmn->send_ring->qp) { + mlx5dr_err(dmn, "Failed creating QP\n"); + ret = -ENOMEM; + goto clean_cq; + } + + dmn->send_ring->cq->qp = dmn->send_ring->qp; + + dmn->info.max_send_wr = QUEUE_SIZE; + dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data, + DR_STE_SIZE); + + dmn->send_ring->signal_th = dmn->info.max_send_wr / + SIGNAL_PER_DIV_QUEUE; + + /* Prepare qp to be used */ + ret = dr_prepare_qp_to_rts(dmn); + if (ret) + goto clean_qp; + + dmn->send_ring->max_post_send_size = + mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K, + DR_ICM_TYPE_STE); + + /* Allocating the max size as a buffer for writing */ + size = dmn->send_ring->signal_th * 
dmn->send_ring->max_post_send_size; + dmn->send_ring->buf = kzalloc(size, GFP_KERNEL); + if (!dmn->send_ring->buf) { + ret = -ENOMEM; + goto clean_qp; + } + + dmn->send_ring->buf_size = size; + + dmn->send_ring->mr = dr_reg_mr(dmn->mdev, + dmn->pdn, dmn->send_ring->buf, size); + if (!dmn->send_ring->mr) { + ret = -ENOMEM; + goto free_mem; + } + + dmn->send_ring->sync_buff = kzalloc(dmn->send_ring->max_post_send_size, + GFP_KERNEL); + if (!dmn->send_ring->sync_buff) { + ret = -ENOMEM; + goto clean_mr; + } + + dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev, + dmn->pdn, dmn->send_ring->sync_buff, + dmn->send_ring->max_post_send_size); + if (!dmn->send_ring->sync_mr) { + ret = -ENOMEM; + goto free_sync_mem; + } + + return 0; + +free_sync_mem: + kfree(dmn->send_ring->sync_buff); +clean_mr: + dr_dereg_mr(dmn->mdev, dmn->send_ring->mr); +free_mem: + kfree(dmn->send_ring->buf); +clean_qp: + dr_destroy_qp(dmn->mdev, dmn->send_ring->qp); +clean_cq: + dr_destroy_cq(dmn->mdev, dmn->send_ring->cq); +free_send_ring: + kfree(dmn->send_ring); + + return ret; +} + +void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn, + struct mlx5dr_send_ring *send_ring) +{ + dr_destroy_qp(dmn->mdev, send_ring->qp); + dr_destroy_cq(dmn->mdev, send_ring->cq); + dr_dereg_mr(dmn->mdev, send_ring->sync_mr); + dr_dereg_mr(dmn->mdev, send_ring->mr); + kfree(send_ring->buf); + kfree(dmn->send_ring->sync_buff); + kfree(send_ring); +} + +int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn) +{ + struct mlx5dr_send_ring *send_ring = dmn->send_ring; + struct postsend_info send_info = {}; + u8 data[DR_STE_SIZE]; + int num_of_sends_req; + int ret; + int i; + + /* Sending this amount of requests makes sure we will get drain */ + num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2; + + /* Send fake requests forcing the last to be signaled */ + send_info.write.addr = (uintptr_t)data; + send_info.write.length = DR_STE_SIZE; + send_info.write.lkey = 0; + /* Using the sync_mr in order to write/read */ + send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr; + send_info.rkey = send_ring->sync_mr->mkey; + + for (i = 0; i < num_of_sends_req; i++) { + ret = dr_postsend_icm_data(dmn, &send_info); + if (ret) + return ret; + } + + spin_lock(&send_ring->lock); + ret = dr_handle_pending_wc(dmn, send_ring); + spin_unlock(&send_ring->lock); + + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c new file mode 100644 index 0000000..ac69397 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -0,0 +1,1415 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#include +#include +#include "dr_ste.h" + +struct dr_hw_ste_format { + u8 ctrl[DR_STE_SIZE_CTRL]; + u8 tag[DR_STE_SIZE_TAG]; + u8 mask[DR_STE_SIZE_MASK]; +}; + +static u32 dr_ste_crc32_calc(const void *input_data, size_t length) +{ + u32 crc = crc32(0, input_data, length); + + return (__force u32)htonl(crc); +} + +bool mlx5dr_ste_supp_ttl_cs_recalc(struct mlx5dr_cmd_caps *caps) +{ + return caps->sw_format_ver > MLX5_STEERING_FORMAT_CONNECTX_5; +} + +u32 mlx5dr_ste_calc_hash_index(u8 *hw_ste_p, struct mlx5dr_ste_htbl *htbl) +{ + u32 num_entries = mlx5dr_icm_pool_get_chunk_num_of_entries(htbl->chunk); + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + u8 masked[DR_STE_SIZE_TAG] = {}; + u32 crc32, index; + u16 bit; + int i; + + /* Don't calculate CRC if the result is predicted */ + if (num_entries == 1 || htbl->byte_mask == 0) + return 0; + + /* Mask tag using byte mask, bit per byte */ + bit = 1 << (DR_STE_SIZE_TAG - 1); + for (i = 0; i < DR_STE_SIZE_TAG; i++) { + if (htbl->byte_mask & bit) + masked[i] = hw_ste->tag[i]; + + bit = bit >> 1; + } + + crc32 = dr_ste_crc32_calc(masked, DR_STE_SIZE_TAG); + index = crc32 & (num_entries - 1); + + return index; +} + +u16 mlx5dr_ste_conv_bit_to_byte_mask(u8 *bit_mask) +{ + u16 byte_mask = 0; + int i; + + for (i = 0; i < DR_STE_SIZE_MASK; i++) { + byte_mask = byte_mask << 1; + if (bit_mask[i] == 0xff) + byte_mask |= 1; + } + return byte_mask; +} + +static u8 *dr_ste_get_tag(u8 *hw_ste_p) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + + return hw_ste->tag; +} + +void mlx5dr_ste_set_bit_mask(u8 *hw_ste_p, u8 *bit_mask) +{ + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; + + memcpy(hw_ste->mask, bit_mask, DR_STE_SIZE_MASK); +} + +static void dr_ste_set_always_hit(struct dr_hw_ste_format *hw_ste) +{ + memset(&hw_ste->tag, 0, sizeof(hw_ste->tag)); + memset(&hw_ste->mask, 0, sizeof(hw_ste->mask)); +} + +static void dr_ste_set_always_miss(struct dr_hw_ste_format *hw_ste) +{ + hw_ste->tag[0] = 0xdc; + hw_ste->mask[0] = 0; +} + +void mlx5dr_ste_set_miss_addr(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste_p, u64 miss_addr) +{ + ste_ctx->set_miss_addr(hw_ste_p, miss_addr); +} + +static void dr_ste_always_miss_addr(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste, u64 miss_addr) +{ + ste_ctx->set_next_lu_type(hw_ste, MLX5DR_STE_LU_TYPE_DONT_CARE); + ste_ctx->set_miss_addr(hw_ste, miss_addr); + dr_ste_set_always_miss((struct dr_hw_ste_format *)hw_ste); +} + +void mlx5dr_ste_set_hit_addr(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste, u64 icm_addr, u32 ht_size) +{ + ste_ctx->set_hit_addr(hw_ste, icm_addr, ht_size); +} + +u64 mlx5dr_ste_get_icm_addr(struct mlx5dr_ste *ste) +{ + u64 base_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(ste->htbl->chunk); + u32 index = ste - ste->htbl->chunk->ste_arr; + + return base_icm_addr + DR_STE_SIZE * index; +} + +u64 mlx5dr_ste_get_mr_addr(struct mlx5dr_ste *ste) +{ + u32 index = ste - ste->htbl->chunk->ste_arr; + + return mlx5dr_icm_pool_get_chunk_mr_addr(ste->htbl->chunk) + DR_STE_SIZE * index; +} + +u8 *mlx5dr_ste_get_hw_ste(struct mlx5dr_ste *ste) +{ + u64 index = ste - ste->htbl->chunk->ste_arr; + + return ste->htbl->chunk->hw_ste_arr + DR_STE_SIZE_REDUCED * index; +} + +struct list_head *mlx5dr_ste_get_miss_list(struct mlx5dr_ste *ste) +{ + u32 index = ste - ste->htbl->chunk->ste_arr; + + return &ste->htbl->chunk->miss_list[index]; +} + +static void dr_ste_always_hit_htbl(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste, + struct mlx5dr_ste_htbl 
*next_htbl) +{ + struct mlx5dr_icm_chunk *chunk = next_htbl->chunk; + + ste_ctx->set_byte_mask(hw_ste, next_htbl->byte_mask); + ste_ctx->set_next_lu_type(hw_ste, next_htbl->lu_type); + ste_ctx->set_hit_addr(hw_ste, mlx5dr_icm_pool_get_chunk_icm_addr(chunk), + mlx5dr_icm_pool_get_chunk_num_of_entries(chunk)); + + dr_ste_set_always_hit((struct dr_hw_ste_format *)hw_ste); +} + +bool mlx5dr_ste_is_last_in_rule(struct mlx5dr_matcher_rx_tx *nic_matcher, + u8 ste_location) +{ + return ste_location == nic_matcher->num_of_builders; +} + +/* Replace relevant fields, except of: + * htbl - keep the origin htbl + * miss_list + list - already took the src from the list. + * icm_addr/mr_addr - depends on the hosting table. + * + * Before: + * | a | -> | b | -> | c | -> + * + * After: + * | a | -> | c | -> + * While the data that was in b copied to a. + */ +static void dr_ste_replace(struct mlx5dr_ste *dst, struct mlx5dr_ste *src) +{ + memcpy(mlx5dr_ste_get_hw_ste(dst), mlx5dr_ste_get_hw_ste(src), + DR_STE_SIZE_REDUCED); + dst->next_htbl = src->next_htbl; + if (dst->next_htbl) + dst->next_htbl->pointing_ste = dst; + + dst->refcount = src->refcount; +} + +/* Free ste which is the head and the only one in miss_list */ +static void +dr_ste_remove_head_ste(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste *ste, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste_send_info *ste_info_head, + struct list_head *send_ste_list, + struct mlx5dr_ste_htbl *stats_tbl) +{ + u8 tmp_data_ste[DR_STE_SIZE] = {}; + u64 miss_addr; + + miss_addr = mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk); + + /* Use temp ste because dr_ste_always_miss_addr + * touches bit_mask area which doesn't exist at ste->hw_ste. + * Need to use a full-sized (DR_STE_SIZE) hw_ste. + */ + memcpy(tmp_data_ste, mlx5dr_ste_get_hw_ste(ste), DR_STE_SIZE_REDUCED); + dr_ste_always_miss_addr(ste_ctx, tmp_data_ste, miss_addr); + memcpy(mlx5dr_ste_get_hw_ste(ste), tmp_data_ste, DR_STE_SIZE_REDUCED); + + list_del_init(&ste->miss_list_node); + + /* Write full STE size in order to have "always_miss" */ + mlx5dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE, + 0, tmp_data_ste, + ste_info_head, + send_ste_list, + true /* Copy data */); + + stats_tbl->ctrl.num_of_valid_entries--; +} + +/* Free ste which is the head but NOT the only one in miss_list: + * |_ste_| --> |_next_ste_| -->|__| -->|__| -->/0 + */ +static void +dr_ste_replace_head_ste(struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste *ste, + struct mlx5dr_ste *next_ste, + struct mlx5dr_ste_send_info *ste_info_head, + struct list_head *send_ste_list, + struct mlx5dr_ste_htbl *stats_tbl) + +{ + struct mlx5dr_ste_htbl *next_miss_htbl; + u8 hw_ste[DR_STE_SIZE] = {}; + int sb_idx; + + next_miss_htbl = next_ste->htbl; + + /* Remove from the miss_list the next_ste before copy */ + list_del_init(&next_ste->miss_list_node); + + /* Move data from next into ste */ + dr_ste_replace(ste, next_ste); + + /* Update the rule on STE change */ + mlx5dr_rule_set_last_member(next_ste->rule_rx_tx, ste, false); + + /* Copy all 64 hw_ste bytes */ + memcpy(hw_ste, mlx5dr_ste_get_hw_ste(ste), DR_STE_SIZE_REDUCED); + sb_idx = ste->ste_chain_location - 1; + mlx5dr_ste_set_bit_mask(hw_ste, + nic_matcher->ste_builder[sb_idx].bit_mask); + + /* Del the htbl that contains the next_ste. + * The origin htbl stay with the same number of entries. 
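+	 * mlx5dr_htbl_put() below drops next_ste's reference; the table itself is only freed once its refcount reaches zero.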
+ */ + mlx5dr_htbl_put(next_miss_htbl); + + mlx5dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE, + 0, hw_ste, + ste_info_head, + send_ste_list, + true /* Copy data */); + + stats_tbl->ctrl.num_of_collisions--; + stats_tbl->ctrl.num_of_valid_entries--; +} + +/* Free ste that is located in the middle of the miss list: + * |__| -->|_prev_ste_|->|_ste_|-->|_next_ste_| + */ +static void dr_ste_remove_middle_ste(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste *ste, + struct mlx5dr_ste_send_info *ste_info, + struct list_head *send_ste_list, + struct mlx5dr_ste_htbl *stats_tbl) +{ + struct mlx5dr_ste *prev_ste; + u64 miss_addr; + + prev_ste = list_prev_entry(ste, miss_list_node); + if (WARN_ON(!prev_ste)) + return; + + miss_addr = ste_ctx->get_miss_addr(mlx5dr_ste_get_hw_ste(ste)); + ste_ctx->set_miss_addr(mlx5dr_ste_get_hw_ste(prev_ste), miss_addr); + + mlx5dr_send_fill_and_append_ste_send_info(prev_ste, DR_STE_SIZE_CTRL, 0, + mlx5dr_ste_get_hw_ste(prev_ste), + ste_info, send_ste_list, + true /* Copy data*/); + + list_del_init(&ste->miss_list_node); + + stats_tbl->ctrl.num_of_valid_entries--; + stats_tbl->ctrl.num_of_collisions--; +} + +void mlx5dr_ste_free(struct mlx5dr_ste *ste, + struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher) +{ + struct mlx5dr_ste_send_info *cur_ste_info, *tmp_ste_info; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx; + struct mlx5dr_ste_send_info ste_info_head; + struct mlx5dr_ste *next_ste, *first_ste; + bool put_on_origin_table = true; + struct mlx5dr_ste_htbl *stats_tbl; + LIST_HEAD(send_ste_list); + + first_ste = list_first_entry(mlx5dr_ste_get_miss_list(ste), + struct mlx5dr_ste, miss_list_node); + stats_tbl = first_ste->htbl; + + /* Two options: + * 1. ste is head: + * a. head ste is the only ste in the miss list + * b. head ste is not the only ste in the miss-list + * 2. 
ste is not head + */ + if (first_ste == ste) { /* Ste is the head */ + struct mlx5dr_ste *last_ste; + + last_ste = list_last_entry(mlx5dr_ste_get_miss_list(ste), + struct mlx5dr_ste, miss_list_node); + if (last_ste == first_ste) + next_ste = NULL; + else + next_ste = list_next_entry(ste, miss_list_node); + + if (!next_ste) { + /* One and only entry in the list */ + dr_ste_remove_head_ste(ste_ctx, ste, + nic_matcher, + &ste_info_head, + &send_ste_list, + stats_tbl); + } else { + /* First but not only entry in the list */ + dr_ste_replace_head_ste(nic_matcher, ste, + next_ste, &ste_info_head, + &send_ste_list, stats_tbl); + put_on_origin_table = false; + } + } else { /* Ste in the middle of the list */ + dr_ste_remove_middle_ste(ste_ctx, ste, + &ste_info_head, &send_ste_list, + stats_tbl); + } + + /* Update HW */ + list_for_each_entry_safe(cur_ste_info, tmp_ste_info, + &send_ste_list, send_list) { + list_del(&cur_ste_info->send_list); + mlx5dr_send_postsend_ste(dmn, cur_ste_info->ste, + cur_ste_info->data, cur_ste_info->size, + cur_ste_info->offset); + } + + if (put_on_origin_table) + mlx5dr_htbl_put(ste->htbl); +} + +bool mlx5dr_ste_equal_tag(void *src, void *dst) +{ + struct dr_hw_ste_format *s_hw_ste = (struct dr_hw_ste_format *)src; + struct dr_hw_ste_format *d_hw_ste = (struct dr_hw_ste_format *)dst; + + return !memcmp(s_hw_ste->tag, d_hw_ste->tag, DR_STE_SIZE_TAG); +} + +void mlx5dr_ste_set_hit_addr_by_next_htbl(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste, + struct mlx5dr_ste_htbl *next_htbl) +{ + u64 icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(next_htbl->chunk); + u32 num_entries = + mlx5dr_icm_pool_get_chunk_num_of_entries(next_htbl->chunk); + + ste_ctx->set_hit_addr(hw_ste, icm_addr, num_entries); +} + +void mlx5dr_ste_prepare_for_postsend(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste_p, u32 ste_size) +{ + if (ste_ctx->prepare_for_postsend) + ste_ctx->prepare_for_postsend(hw_ste_p, ste_size); +} + +/* Init one ste as a pattern for ste data array */ +void mlx5dr_ste_set_formatted_ste(struct mlx5dr_ste_ctx *ste_ctx, + u16 gvmi, + enum mlx5dr_domain_nic_type nic_type, + struct mlx5dr_ste_htbl *htbl, + u8 *formatted_ste, + struct mlx5dr_htbl_connect_info *connect_info) +{ + bool is_rx = nic_type == DR_DOMAIN_NIC_TYPE_RX; + u8 tmp_hw_ste[DR_STE_SIZE] = {0}; + + ste_ctx->ste_init(formatted_ste, htbl->lu_type, is_rx, gvmi); + + /* Use temp ste because dr_ste_always_miss_addr/hit_htbl + * touches bit_mask area which doesn't exist at ste->hw_ste. + * Need to use a full-sized (DR_STE_SIZE) hw_ste. 
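+	 * Only DR_STE_SIZE_REDUCED bytes are copied back into formatted_ste afterwards.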
+ */ + memcpy(tmp_hw_ste, formatted_ste, DR_STE_SIZE_REDUCED); + if (connect_info->type == CONNECT_HIT) + dr_ste_always_hit_htbl(ste_ctx, tmp_hw_ste, + connect_info->hit_next_htbl); + else + dr_ste_always_miss_addr(ste_ctx, tmp_hw_ste, + connect_info->miss_icm_addr); + memcpy(formatted_ste, tmp_hw_ste, DR_STE_SIZE_REDUCED); +} + +int mlx5dr_ste_htbl_init_and_postsend(struct mlx5dr_domain *dmn, + struct mlx5dr_domain_rx_tx *nic_dmn, + struct mlx5dr_ste_htbl *htbl, + struct mlx5dr_htbl_connect_info *connect_info, + bool update_hw_ste) +{ + u8 formatted_ste[DR_STE_SIZE] = {}; + + mlx5dr_ste_set_formatted_ste(dmn->ste_ctx, + dmn->info.caps.gvmi, + nic_dmn->type, + htbl, + formatted_ste, + connect_info); + + return mlx5dr_send_postsend_formatted_htbl(dmn, htbl, formatted_ste, update_hw_ste); +} + +int mlx5dr_ste_create_next_htbl(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste *ste, + u8 *cur_hw_ste, + enum mlx5dr_icm_chunk_size log_table_size) +{ + struct mlx5dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx; + struct mlx5dr_htbl_connect_info info; + struct mlx5dr_ste_htbl *next_htbl; + + if (!mlx5dr_ste_is_last_in_rule(nic_matcher, ste->ste_chain_location)) { + u16 next_lu_type; + u16 byte_mask; + + next_lu_type = ste_ctx->get_next_lu_type(cur_hw_ste); + byte_mask = ste_ctx->get_byte_mask(cur_hw_ste); + + next_htbl = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool, + log_table_size, + next_lu_type, + byte_mask); + if (!next_htbl) { + mlx5dr_dbg(dmn, "Failed allocating table\n"); + return -ENOMEM; + } + + /* Write new table to HW */ + info.type = CONNECT_MISS; + info.miss_icm_addr = + mlx5dr_icm_pool_get_chunk_icm_addr(nic_matcher->e_anchor->chunk); + if (mlx5dr_ste_htbl_init_and_postsend(dmn, nic_dmn, next_htbl, + &info, false)) { + mlx5dr_info(dmn, "Failed writing table to HW\n"); + goto free_table; + } + + mlx5dr_ste_set_hit_addr_by_next_htbl(ste_ctx, + cur_hw_ste, next_htbl); + ste->next_htbl = next_htbl; + next_htbl->pointing_ste = ste; + } + + return 0; + +free_table: + mlx5dr_ste_htbl_free(next_htbl); + return -ENOENT; +} + +struct mlx5dr_ste_htbl *mlx5dr_ste_htbl_alloc(struct mlx5dr_icm_pool *pool, + enum mlx5dr_icm_chunk_size chunk_size, + u16 lu_type, u16 byte_mask) +{ + struct kmem_cache *htbls_cache = pool->dmn->htbls_kmem_cache; + struct mlx5dr_icm_chunk *chunk; + struct mlx5dr_ste_htbl *htbl; + u32 num_entries; + int i; + + htbl = kmem_cache_alloc(htbls_cache, GFP_KERNEL); + if (!htbl) + return NULL; + + chunk = mlx5dr_icm_alloc_chunk(pool, chunk_size); + if (!chunk) + goto out_free_htbl; + + htbl->chunk = chunk; + htbl->lu_type = lu_type; + htbl->byte_mask = byte_mask; + htbl->refcount = 0; + htbl->pointing_ste = NULL; + htbl->ctrl.num_of_valid_entries = 0; + htbl->ctrl.num_of_collisions = 0; + num_entries = mlx5dr_icm_pool_get_chunk_num_of_entries(chunk); + + for (i = 0; i < num_entries; i++) { + struct mlx5dr_ste *ste = &chunk->ste_arr[i]; + + ste->htbl = htbl; + ste->refcount = 0; + INIT_LIST_HEAD(&ste->miss_list_node); + INIT_LIST_HEAD(&chunk->miss_list[i]); + } + + return htbl; + +out_free_htbl: + kmem_cache_free(htbls_cache, htbl); + return NULL; +} + +int mlx5dr_ste_htbl_free(struct mlx5dr_ste_htbl *htbl) +{ + struct kmem_cache *htbls_cache = + htbl->chunk->buddy_mem->pool->dmn->htbls_kmem_cache; + + if (htbl->refcount) + return -EBUSY; + + mlx5dr_icm_free_chunk(htbl->chunk); + kmem_cache_free(htbls_cache, htbl); + return 0; +} + 
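+/* The wrappers below dispatch to the HW-specific STE context
+ * (v0 for ConnectX-5, v1 for ConnectX-6 Dx, v2 for ConnectX-7;
+ * see mlx5dr_ste_get_ctx() at the end of this file).
+ */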
+void mlx5dr_ste_set_actions_tx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, + u8 *action_type_set, + u8 *hw_ste_arr, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) +{ + ste_ctx->set_actions_tx(dmn, action_type_set, ste_ctx->actions_caps, + hw_ste_arr, attr, added_stes); +} + +void mlx5dr_ste_set_actions_rx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, + u8 *action_type_set, + u8 *hw_ste_arr, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) +{ + ste_ctx->set_actions_rx(dmn, action_type_set, ste_ctx->actions_caps, + hw_ste_arr, attr, added_stes); +} + +const struct mlx5dr_ste_action_modify_field * +mlx5dr_ste_conv_modify_hdr_sw_field(struct mlx5dr_ste_ctx *ste_ctx, u16 sw_field) +{ + const struct mlx5dr_ste_action_modify_field *hw_field; + + if (sw_field >= ste_ctx->modify_field_arr_sz) + return NULL; + + hw_field = &ste_ctx->modify_field_arr[sw_field]; + if (!hw_field->end && !hw_field->start) + return NULL; + + return hw_field; +} + +void mlx5dr_ste_set_action_set(struct mlx5dr_ste_ctx *ste_ctx, + __be64 *hw_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) +{ + ste_ctx->set_action_set((u8 *)hw_action, + hw_field, shifter, length, data); +} + +void mlx5dr_ste_set_action_add(struct mlx5dr_ste_ctx *ste_ctx, + __be64 *hw_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) +{ + ste_ctx->set_action_add((u8 *)hw_action, + hw_field, shifter, length, data); +} + +void mlx5dr_ste_set_action_copy(struct mlx5dr_ste_ctx *ste_ctx, + __be64 *hw_action, + u8 dst_hw_field, + u8 dst_shifter, + u8 dst_len, + u8 src_hw_field, + u8 src_shifter) +{ + ste_ctx->set_action_copy((u8 *)hw_action, + dst_hw_field, dst_shifter, dst_len, + src_hw_field, src_shifter); +} + +int mlx5dr_ste_set_action_decap_l3_list(struct mlx5dr_ste_ctx *ste_ctx, + void *data, u32 data_sz, + u8 *hw_action, u32 hw_action_sz, + u16 *used_hw_action_num) +{ + /* Only Ethernet frame is supported, with VLAN (18) or without (14) */ + if (data_sz != HDR_LEN_L2 && data_sz != HDR_LEN_L2_W_VLAN) + return -EINVAL; + + return ste_ctx->set_action_decap_l3_list(data, data_sz, + hw_action, hw_action_sz, + used_hw_action_num); +} + +int mlx5dr_ste_alloc_modify_hdr(struct mlx5dr_action *action) +{ + u32 dynamic_chunck_size; + + dynamic_chunck_size = ilog2(roundup_pow_of_two(action->rewrite->num_of_actions)); + + /* HW modify action index granularity is at least 64B */ + dynamic_chunck_size = max_t(u32, dynamic_chunck_size, + DR_CHUNK_SIZE_8); + + return action->rewrite->dmn->ste_ctx->alloc_modify_hdr_chunk(action, + dynamic_chunck_size); +} + +void mlx5dr_ste_free_modify_hdr(struct mlx5dr_action *action) +{ + return action->rewrite->dmn->ste_ctx->dealloc_modify_hdr_chunk(action); +} + +static int dr_ste_build_pre_check_spec(struct mlx5dr_domain *dmn, + struct mlx5dr_match_spec *spec) +{ + if (spec->ip_version) { + if (spec->ip_version != 0xf) { + mlx5dr_err(dmn, + "Partial ip_version mask with src/dst IP is not supported\n"); + return -EINVAL; + } + } else if (spec->ethertype != 0xffff && + (DR_MASK_IS_SRC_IP_SET(spec) || DR_MASK_IS_DST_IP_SET(spec))) { + mlx5dr_err(dmn, + "Partial/no ethertype mask with src/dst IP is not supported\n"); + return -EINVAL; + } + + return 0; +} + +int mlx5dr_ste_build_pre_check(struct mlx5dr_domain *dmn, + u8 match_criteria, + struct mlx5dr_match_param *mask, + struct mlx5dr_match_param *value) +{ + if (value) + return 0; + + if (match_criteria & DR_MATCHER_CRITERIA_MISC) { + if (mask->misc.source_port && mask->misc.source_port != 0xffff) { + 
mlx5dr_err(dmn, + "Partial mask source_port is not supported\n"); + return -EINVAL; + } + if (mask->misc.source_eswitch_owner_vhca_id && + mask->misc.source_eswitch_owner_vhca_id != 0xffff) { + mlx5dr_err(dmn, + "Partial mask source_eswitch_owner_vhca_id is not supported\n"); + return -EINVAL; + } + } + + if ((match_criteria & DR_MATCHER_CRITERIA_OUTER) && + dr_ste_build_pre_check_spec(dmn, &mask->outer)) + return -EINVAL; + + if ((match_criteria & DR_MATCHER_CRITERIA_INNER) && + dr_ste_build_pre_check_spec(dmn, &mask->inner)) + return -EINVAL; + + return 0; +} + +int mlx5dr_ste_build_ste_arr(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_match_param *value, + u8 *ste_arr) +{ + struct mlx5dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; + bool is_rx = nic_dmn->type == DR_DOMAIN_NIC_TYPE_RX; + struct mlx5dr_domain *dmn = matcher->tbl->dmn; + struct mlx5dr_ste_ctx *ste_ctx = dmn->ste_ctx; + struct mlx5dr_ste_build *sb; + int ret, i; + + ret = mlx5dr_ste_build_pre_check(dmn, matcher->match_criteria, + &matcher->mask, value); + if (ret) + return ret; + + sb = nic_matcher->ste_builder; + for (i = 0; i < nic_matcher->num_of_builders; i++) { + ste_ctx->ste_init(ste_arr, + sb->lu_type, + is_rx, + dmn->info.caps.gvmi); + + mlx5dr_ste_set_bit_mask(ste_arr, sb->bit_mask); + + ret = sb->ste_build_tag_func(value, sb, dr_ste_get_tag(ste_arr)); + if (ret) + return ret; + + /* Connect the STEs */ + if (i < (nic_matcher->num_of_builders - 1)) { + /* Need the next builder for these fields, + * not relevant for the last ste in the chain. + */ + sb++; + ste_ctx->set_next_lu_type(ste_arr, sb->lu_type); + ste_ctx->set_byte_mask(ste_arr, sb->byte_mask); + } + ste_arr += DR_STE_SIZE; + } + return 0; +} + +#define IFC_GET_CLR(typ, p, fld, clear) ({ \ + void *__p = (p); \ + u32 __t = MLX5_GET(typ, __p, fld); \ + if (clear) \ + MLX5_SET(typ, __p, fld, 0); \ + __t; \ +}) + +#define memcpy_and_clear(to, from, len, clear) ({ \ + void *__to = (to), *__from = (from); \ + size_t __len = (len); \ + memcpy(__to, __from, __len); \ + if (clear) \ + memset(__from, 0, __len); \ +}) + +static void dr_ste_copy_mask_misc(char *mask, struct mlx5dr_match_misc *spec, bool clr) +{ + spec->gre_c_present = IFC_GET_CLR(fte_match_set_misc, mask, gre_c_present, clr); + spec->gre_k_present = IFC_GET_CLR(fte_match_set_misc, mask, gre_k_present, clr); + spec->gre_s_present = IFC_GET_CLR(fte_match_set_misc, mask, gre_s_present, clr); + spec->source_vhca_port = IFC_GET_CLR(fte_match_set_misc, mask, source_vhca_port, clr); + spec->source_sqn = IFC_GET_CLR(fte_match_set_misc, mask, source_sqn, clr); + + spec->source_port = IFC_GET_CLR(fte_match_set_misc, mask, source_port, clr); + spec->source_eswitch_owner_vhca_id = + IFC_GET_CLR(fte_match_set_misc, mask, source_eswitch_owner_vhca_id, clr); + + spec->outer_second_prio = IFC_GET_CLR(fte_match_set_misc, mask, outer_second_prio, clr); + spec->outer_second_cfi = IFC_GET_CLR(fte_match_set_misc, mask, outer_second_cfi, clr); + spec->outer_second_vid = IFC_GET_CLR(fte_match_set_misc, mask, outer_second_vid, clr); + spec->inner_second_prio = IFC_GET_CLR(fte_match_set_misc, mask, inner_second_prio, clr); + spec->inner_second_cfi = IFC_GET_CLR(fte_match_set_misc, mask, inner_second_cfi, clr); + spec->inner_second_vid = IFC_GET_CLR(fte_match_set_misc, mask, inner_second_vid, clr); + + spec->outer_second_cvlan_tag = + IFC_GET_CLR(fte_match_set_misc, mask, outer_second_cvlan_tag, clr); + spec->inner_second_cvlan_tag = + IFC_GET_CLR(fte_match_set_misc, 
mask, inner_second_cvlan_tag, clr); + spec->outer_second_svlan_tag = + IFC_GET_CLR(fte_match_set_misc, mask, outer_second_svlan_tag, clr); + spec->inner_second_svlan_tag = + IFC_GET_CLR(fte_match_set_misc, mask, inner_second_svlan_tag, clr); + spec->gre_protocol = IFC_GET_CLR(fte_match_set_misc, mask, gre_protocol, clr); + + spec->gre_key_h = IFC_GET_CLR(fte_match_set_misc, mask, gre_key.nvgre.hi, clr); + spec->gre_key_l = IFC_GET_CLR(fte_match_set_misc, mask, gre_key.nvgre.lo, clr); + + spec->vxlan_vni = IFC_GET_CLR(fte_match_set_misc, mask, vxlan_vni, clr); + + spec->geneve_vni = IFC_GET_CLR(fte_match_set_misc, mask, geneve_vni, clr); + spec->geneve_tlv_option_0_exist = + IFC_GET_CLR(fte_match_set_misc, mask, geneve_tlv_option_0_exist, clr); + spec->geneve_oam = IFC_GET_CLR(fte_match_set_misc, mask, geneve_oam, clr); + + spec->outer_ipv6_flow_label = + IFC_GET_CLR(fte_match_set_misc, mask, outer_ipv6_flow_label, clr); + + spec->inner_ipv6_flow_label = + IFC_GET_CLR(fte_match_set_misc, mask, inner_ipv6_flow_label, clr); + + spec->geneve_opt_len = IFC_GET_CLR(fte_match_set_misc, mask, geneve_opt_len, clr); + spec->geneve_protocol_type = + IFC_GET_CLR(fte_match_set_misc, mask, geneve_protocol_type, clr); + + spec->bth_dst_qp = IFC_GET_CLR(fte_match_set_misc, mask, bth_dst_qp, clr); +} + +static void dr_ste_copy_mask_spec(char *mask, struct mlx5dr_match_spec *spec, bool clr) +{ + __be32 raw_ip[4]; + + spec->smac_47_16 = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, smac_47_16, clr); + + spec->smac_15_0 = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, smac_15_0, clr); + spec->ethertype = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ethertype, clr); + + spec->dmac_47_16 = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, dmac_47_16, clr); + + spec->dmac_15_0 = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, dmac_15_0, clr); + spec->first_prio = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, first_prio, clr); + spec->first_cfi = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, first_cfi, clr); + spec->first_vid = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, first_vid, clr); + + spec->ip_protocol = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ip_protocol, clr); + spec->ip_dscp = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ip_dscp, clr); + spec->ip_ecn = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ip_ecn, clr); + spec->cvlan_tag = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, cvlan_tag, clr); + spec->svlan_tag = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, svlan_tag, clr); + spec->frag = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, frag, clr); + spec->ip_version = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ip_version, clr); + spec->tcp_flags = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, tcp_flags, clr); + spec->tcp_sport = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, tcp_sport, clr); + spec->tcp_dport = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, tcp_dport, clr); + + spec->ipv4_ihl = MLX5_GET(fte_match_set_lyr_2_4, mask, ipv4_ihl); + spec->ttl_hoplimit = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ttl_hoplimit, clr); + + spec->udp_sport = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, udp_sport, clr); + spec->udp_dport = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, udp_dport, clr); + + memcpy_and_clear(raw_ip, MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + sizeof(raw_ip), clr); + + spec->src_ip_127_96 = be32_to_cpu(raw_ip[0]); + spec->src_ip_95_64 = be32_to_cpu(raw_ip[1]); + spec->src_ip_63_32 = be32_to_cpu(raw_ip[2]); + spec->src_ip_31_0 = be32_to_cpu(raw_ip[3]); + + memcpy_and_clear(raw_ip, MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask, + 
dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + sizeof(raw_ip), clr); + + spec->dst_ip_127_96 = be32_to_cpu(raw_ip[0]); + spec->dst_ip_95_64 = be32_to_cpu(raw_ip[1]); + spec->dst_ip_63_32 = be32_to_cpu(raw_ip[2]); + spec->dst_ip_31_0 = be32_to_cpu(raw_ip[3]); +} + +static void dr_ste_copy_mask_misc2(char *mask, struct mlx5dr_match_misc2 *spec, bool clr) +{ + spec->outer_first_mpls_label = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls.mpls_label, clr); + spec->outer_first_mpls_exp = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls.mpls_exp, clr); + spec->outer_first_mpls_s_bos = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls.mpls_s_bos, clr); + spec->outer_first_mpls_ttl = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls.mpls_ttl, clr); + spec->inner_first_mpls_label = + IFC_GET_CLR(fte_match_set_misc2, mask, inner_first_mpls.mpls_label, clr); + spec->inner_first_mpls_exp = + IFC_GET_CLR(fte_match_set_misc2, mask, inner_first_mpls.mpls_exp, clr); + spec->inner_first_mpls_s_bos = + IFC_GET_CLR(fte_match_set_misc2, mask, inner_first_mpls.mpls_s_bos, clr); + spec->inner_first_mpls_ttl = + IFC_GET_CLR(fte_match_set_misc2, mask, inner_first_mpls.mpls_ttl, clr); + spec->outer_first_mpls_over_gre_label = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_gre.mpls_label, clr); + spec->outer_first_mpls_over_gre_exp = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_gre.mpls_exp, clr); + spec->outer_first_mpls_over_gre_s_bos = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_gre.mpls_s_bos, clr); + spec->outer_first_mpls_over_gre_ttl = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_gre.mpls_ttl, clr); + spec->outer_first_mpls_over_udp_label = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_udp.mpls_label, clr); + spec->outer_first_mpls_over_udp_exp = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_udp.mpls_exp, clr); + spec->outer_first_mpls_over_udp_s_bos = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_udp.mpls_s_bos, clr); + spec->outer_first_mpls_over_udp_ttl = + IFC_GET_CLR(fte_match_set_misc2, mask, outer_first_mpls_over_udp.mpls_ttl, clr); + spec->metadata_reg_c_7 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_7, clr); + spec->metadata_reg_c_6 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_6, clr); + spec->metadata_reg_c_5 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_5, clr); + spec->metadata_reg_c_4 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_4, clr); + spec->metadata_reg_c_3 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_3, clr); + spec->metadata_reg_c_2 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_2, clr); + spec->metadata_reg_c_1 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_1, clr); + spec->metadata_reg_c_0 = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_c_0, clr); + spec->metadata_reg_a = IFC_GET_CLR(fte_match_set_misc2, mask, metadata_reg_a, clr); +} + +static void dr_ste_copy_mask_misc3(char *mask, struct mlx5dr_match_misc3 *spec, bool clr) +{ + spec->inner_tcp_seq_num = IFC_GET_CLR(fte_match_set_misc3, mask, inner_tcp_seq_num, clr); + spec->outer_tcp_seq_num = IFC_GET_CLR(fte_match_set_misc3, mask, outer_tcp_seq_num, clr); + spec->inner_tcp_ack_num = IFC_GET_CLR(fte_match_set_misc3, mask, inner_tcp_ack_num, clr); + spec->outer_tcp_ack_num = IFC_GET_CLR(fte_match_set_misc3, mask, outer_tcp_ack_num, clr); + spec->outer_vxlan_gpe_vni = + IFC_GET_CLR(fte_match_set_misc3, mask, 
outer_vxlan_gpe_vni, clr); + spec->outer_vxlan_gpe_next_protocol = + IFC_GET_CLR(fte_match_set_misc3, mask, outer_vxlan_gpe_next_protocol, clr); + spec->outer_vxlan_gpe_flags = + IFC_GET_CLR(fte_match_set_misc3, mask, outer_vxlan_gpe_flags, clr); + spec->icmpv4_header_data = IFC_GET_CLR(fte_match_set_misc3, mask, icmp_header_data, clr); + spec->icmpv6_header_data = + IFC_GET_CLR(fte_match_set_misc3, mask, icmpv6_header_data, clr); + spec->icmpv4_type = IFC_GET_CLR(fte_match_set_misc3, mask, icmp_type, clr); + spec->icmpv4_code = IFC_GET_CLR(fte_match_set_misc3, mask, icmp_code, clr); + spec->icmpv6_type = IFC_GET_CLR(fte_match_set_misc3, mask, icmpv6_type, clr); + spec->icmpv6_code = IFC_GET_CLR(fte_match_set_misc3, mask, icmpv6_code, clr); + spec->geneve_tlv_option_0_data = + IFC_GET_CLR(fte_match_set_misc3, mask, geneve_tlv_option_0_data, clr); + spec->gtpu_teid = IFC_GET_CLR(fte_match_set_misc3, mask, gtpu_teid, clr); + spec->gtpu_msg_flags = IFC_GET_CLR(fte_match_set_misc3, mask, gtpu_msg_flags, clr); + spec->gtpu_msg_type = IFC_GET_CLR(fte_match_set_misc3, mask, gtpu_msg_type, clr); + spec->gtpu_dw_0 = IFC_GET_CLR(fte_match_set_misc3, mask, gtpu_dw_0, clr); + spec->gtpu_dw_2 = IFC_GET_CLR(fte_match_set_misc3, mask, gtpu_dw_2, clr); + spec->gtpu_first_ext_dw_0 = + IFC_GET_CLR(fte_match_set_misc3, mask, gtpu_first_ext_dw_0, clr); +} + +static void dr_ste_copy_mask_misc4(char *mask, struct mlx5dr_match_misc4 *spec, bool clr) +{ + spec->prog_sample_field_id_0 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_id_0, clr); + spec->prog_sample_field_value_0 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_value_0, clr); + spec->prog_sample_field_id_1 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_id_1, clr); + spec->prog_sample_field_value_1 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_value_1, clr); + spec->prog_sample_field_id_2 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_id_2, clr); + spec->prog_sample_field_value_2 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_value_2, clr); + spec->prog_sample_field_id_3 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_id_3, clr); + spec->prog_sample_field_value_3 = + IFC_GET_CLR(fte_match_set_misc4, mask, prog_sample_field_value_3, clr); +} + +static void dr_ste_copy_mask_misc5(char *mask, struct mlx5dr_match_misc5 *spec, bool clr) +{ + spec->macsec_tag_0 = + IFC_GET_CLR(fte_match_set_misc5, mask, macsec_tag_0, clr); + spec->macsec_tag_1 = + IFC_GET_CLR(fte_match_set_misc5, mask, macsec_tag_1, clr); + spec->macsec_tag_2 = + IFC_GET_CLR(fte_match_set_misc5, mask, macsec_tag_2, clr); + spec->macsec_tag_3 = + IFC_GET_CLR(fte_match_set_misc5, mask, macsec_tag_3, clr); + spec->tunnel_header_0 = + IFC_GET_CLR(fte_match_set_misc5, mask, tunnel_header_0, clr); + spec->tunnel_header_1 = + IFC_GET_CLR(fte_match_set_misc5, mask, tunnel_header_1, clr); + spec->tunnel_header_2 = + IFC_GET_CLR(fte_match_set_misc5, mask, tunnel_header_2, clr); + spec->tunnel_header_3 = + IFC_GET_CLR(fte_match_set_misc5, mask, tunnel_header_3, clr); +} + +void mlx5dr_ste_copy_param(u8 match_criteria, + struct mlx5dr_match_param *set_param, + struct mlx5dr_match_parameters *mask, + bool clr) +{ + u8 tail_param[MLX5_ST_SZ_BYTES(fte_match_set_lyr_2_4)] = {}; + u8 *data = (u8 *)mask->match_buf; + size_t param_location; + void *buff; + + if (match_criteria & DR_MATCHER_CRITERIA_OUTER) { + if (mask->match_sz < sizeof(struct mlx5dr_match_spec)) { + memcpy(tail_param, data, mask->match_sz); + buff 
= tail_param; + } else { + buff = mask->match_buf; + } + dr_ste_copy_mask_spec(buff, &set_param->outer, clr); + } + param_location = sizeof(struct mlx5dr_match_spec); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC) { + if (mask->match_sz < param_location + + sizeof(struct mlx5dr_match_misc)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc(buff, &set_param->misc, clr); + } + param_location += sizeof(struct mlx5dr_match_misc); + + if (match_criteria & DR_MATCHER_CRITERIA_INNER) { + if (mask->match_sz < param_location + + sizeof(struct mlx5dr_match_spec)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_spec(buff, &set_param->inner, clr); + } + param_location += sizeof(struct mlx5dr_match_spec); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC2) { + if (mask->match_sz < param_location + + sizeof(struct mlx5dr_match_misc2)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc2(buff, &set_param->misc2, clr); + } + + param_location += sizeof(struct mlx5dr_match_misc2); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC3) { + if (mask->match_sz < param_location + + sizeof(struct mlx5dr_match_misc3)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc3(buff, &set_param->misc3, clr); + } + + param_location += sizeof(struct mlx5dr_match_misc3); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC4) { + if (mask->match_sz < param_location + + sizeof(struct mlx5dr_match_misc4)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc4(buff, &set_param->misc4, clr); + } + + param_location += sizeof(struct mlx5dr_match_misc4); + + if (match_criteria & DR_MATCHER_CRITERIA_MISC5) { + if (mask->match_sz < param_location + + sizeof(struct mlx5dr_match_misc5)) { + memcpy(tail_param, data + param_location, + mask->match_sz - param_location); + buff = tail_param; + } else { + buff = data + param_location; + } + dr_ste_copy_mask_misc5(buff, &set_param->misc5, clr); + } +} + +void mlx5dr_ste_build_eth_l2_src_dst(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l2_src_dst_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l3_ipv6_dst(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l3_ipv6_dst_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l3_ipv6_src(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l3_ipv6_src_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l3_ipv4_5_tuple(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l3_ipv4_5_tuple_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l2_src(struct 
mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l2_src_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l2_dst(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l2_dst_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l2_tnl(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l2_tnl_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l3_ipv4_misc(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l3_ipv4_misc_init(sb, mask); +} + +void mlx5dr_ste_build_eth_ipv6_l3_l4(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_ipv6_l3_l4_init(sb, mask); +} + +static int dr_ste_build_empty_always_hit_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + return 0; +} + +void mlx5dr_ste_build_empty_always_hit(struct mlx5dr_ste_build *sb, bool rx) +{ + sb->rx = rx; + sb->lu_type = MLX5DR_STE_LU_TYPE_DONT_CARE; + sb->byte_mask = 0; + sb->ste_build_tag_func = &dr_ste_build_empty_always_hit_tag; +} + +void mlx5dr_ste_build_mpls(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_mpls_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_gre(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_tnl_gre_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_mpls_over_gre(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + sb->caps = caps; + return ste_ctx->build_tnl_mpls_over_gre_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_mpls_over_udp(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + sb->caps = caps; + return ste_ctx->build_tnl_mpls_over_udp_init(sb, mask); +} + +void mlx5dr_ste_build_icmp(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + sb->caps = caps; + ste_ctx->build_icmp_init(sb, mask); +} + +void mlx5dr_ste_build_general_purpose(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_general_purpose_init(sb, mask); +} + +void mlx5dr_ste_build_eth_l4_misc(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_eth_l4_misc_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_vxlan_gpe(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct 
mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_tnl_vxlan_gpe_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_geneve(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_tnl_geneve_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_geneve_tlv_opt(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx) +{ + sb->rx = rx; + sb->caps = caps; + sb->inner = inner; + ste_ctx->build_tnl_geneve_tlv_opt_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_geneve_tlv_opt_exist(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx) +{ + if (!ste_ctx->build_tnl_geneve_tlv_opt_exist_init) + return; + + sb->rx = rx; + sb->caps = caps; + sb->inner = inner; + ste_ctx->build_tnl_geneve_tlv_opt_exist_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_gtpu(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_tnl_gtpu_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_gtpu_flex_parser_0(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx) +{ + sb->rx = rx; + sb->caps = caps; + sb->inner = inner; + ste_ctx->build_tnl_gtpu_flex_parser_0_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_gtpu_flex_parser_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx) +{ + sb->rx = rx; + sb->caps = caps; + sb->inner = inner; + ste_ctx->build_tnl_gtpu_flex_parser_1_init(sb, mask); +} + +void mlx5dr_ste_build_register_0(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_register_0_init(sb, mask); +} + +void mlx5dr_ste_build_register_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_register_1_init(sb, mask); +} + +void mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn, + bool inner, bool rx) +{ + /* Set vhca_id_valid before we reset source_eswitch_owner_vhca_id */ + sb->vhca_id_valid = mask->misc.source_eswitch_owner_vhca_id; + + sb->rx = rx; + sb->dmn = dmn; + sb->inner = inner; + ste_ctx->build_src_gvmi_qpn_init(sb, mask); +} + +void mlx5dr_ste_build_flex_parser_0(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_flex_parser_0_init(sb, mask); +} + +void mlx5dr_ste_build_flex_parser_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + sb->inner = inner; + ste_ctx->build_flex_parser_1_init(sb, mask); +} + +void mlx5dr_ste_build_tnl_header_0_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx) +{ + sb->rx = rx; + 
sb->inner = inner; + ste_ctx->build_tnl_header_0_1_init(sb, mask); +} + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx(u8 version) +{ + if (version == MLX5_STEERING_FORMAT_CONNECTX_5) + return mlx5dr_ste_get_ctx_v0(); + else if (version == MLX5_STEERING_FORMAT_CONNECTX_6DX) + return mlx5dr_ste_get_ctx_v1(); + else if (version == MLX5_STEERING_FORMAT_CONNECTX_7) + return mlx5dr_ste_get_ctx_v2(); + + return NULL; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h new file mode 100644 index 0000000..a5ef86f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. */ + +#ifndef _DR_STE_ +#define _DR_STE_ + +#include "dr_types.h" + +#define STE_IPV4 0x1 +#define STE_IPV6 0x2 +#define STE_TCP 0x1 +#define STE_UDP 0x2 +#define STE_SPI 0x3 +#define IP_VERSION_IPV4 0x4 +#define IP_VERSION_IPV6 0x6 +#define STE_SVLAN 0x1 +#define STE_CVLAN 0x2 +#define HDR_LEN_L2_MACS 0xC +#define HDR_LEN_L2_VLAN 0x4 +#define HDR_LEN_L2_ETHER 0x2 +#define HDR_LEN_L2 (HDR_LEN_L2_MACS + HDR_LEN_L2_ETHER) +#define HDR_LEN_L2_W_VLAN (HDR_LEN_L2 + HDR_LEN_L2_VLAN) + +/* Set to STE a specific value using DR_STE_SET */ +#define DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, value) do { \ + if ((spec)->s_fname) { \ + MLX5_SET(ste_##lookup_type, tag, t_fname, value); \ + (spec)->s_fname = 0; \ + } \ +} while (0) + +/* Set to STE spec->s_fname to tag->t_fname set spec->s_fname as used */ +#define DR_STE_SET_TAG(lookup_type, tag, t_fname, spec, s_fname) \ + DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, spec->s_fname) + +/* Set to STE -1 to tag->t_fname and set spec->s_fname as used */ +#define DR_STE_SET_ONES(lookup_type, tag, t_fname, spec, s_fname) \ + DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, -1) + +#define DR_STE_SET_TCP_FLAGS(lookup_type, tag, spec) do { \ + MLX5_SET(ste_##lookup_type, tag, tcp_ns, !!((spec)->tcp_flags & (1 << 8))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_cwr, !!((spec)->tcp_flags & (1 << 7))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_ece, !!((spec)->tcp_flags & (1 << 6))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_urg, !!((spec)->tcp_flags & (1 << 5))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_ack, !!((spec)->tcp_flags & (1 << 4))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_psh, !!((spec)->tcp_flags & (1 << 3))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_rst, !!((spec)->tcp_flags & (1 << 2))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_syn, !!((spec)->tcp_flags & (1 << 1))); \ + MLX5_SET(ste_##lookup_type, tag, tcp_fin, !!((spec)->tcp_flags & (1 << 0))); \ +} while (0) + +#define DR_STE_SET_MPLS(lookup_type, mask, in_out, tag) do { \ + struct mlx5dr_match_misc2 *_mask = mask; \ + u8 *_tag = tag; \ + DR_STE_SET_TAG(lookup_type, _tag, mpls0_label, _mask, \ + in_out##_first_mpls_label);\ + DR_STE_SET_TAG(lookup_type, _tag, mpls0_s_bos, _mask, \ + in_out##_first_mpls_s_bos); \ + DR_STE_SET_TAG(lookup_type, _tag, mpls0_exp, _mask, \ + in_out##_first_mpls_exp); \ + DR_STE_SET_TAG(lookup_type, _tag, mpls0_ttl, _mask, \ + in_out##_first_mpls_ttl); \ +} while (0) + +#define DR_STE_SET_FLEX_PARSER_FIELD(tag, fname, caps, spec) do { \ + u8 parser_id = (caps)->flex_parser_id_##fname; \ + u8 *parser_ptr = dr_ste_calc_flex_parser_offset(tag, parser_id); \ + *(__be32 *)parser_ptr = 
cpu_to_be32((spec)->fname);\ + (spec)->fname = 0;\ +} while (0) + +#define DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(_misc) (\ + (_misc)->outer_first_mpls_over_gre_label || \ + (_misc)->outer_first_mpls_over_gre_exp || \ + (_misc)->outer_first_mpls_over_gre_s_bos || \ + (_misc)->outer_first_mpls_over_gre_ttl) + +#define DR_STE_IS_OUTER_MPLS_OVER_UDP_SET(_misc) (\ + (_misc)->outer_first_mpls_over_udp_label || \ + (_misc)->outer_first_mpls_over_udp_exp || \ + (_misc)->outer_first_mpls_over_udp_s_bos || \ + (_misc)->outer_first_mpls_over_udp_ttl) + +enum dr_ste_action_modify_type_l3 { + DR_STE_ACTION_MDFY_TYPE_L3_NONE = 0x0, + DR_STE_ACTION_MDFY_TYPE_L3_IPV4 = 0x1, + DR_STE_ACTION_MDFY_TYPE_L3_IPV6 = 0x2, +}; + +enum dr_ste_action_modify_type_l4 { + DR_STE_ACTION_MDFY_TYPE_L4_NONE = 0x0, + DR_STE_ACTION_MDFY_TYPE_L4_TCP = 0x1, + DR_STE_ACTION_MDFY_TYPE_L4_UDP = 0x2, +}; + +enum { + HDR_MPLS_OFFSET_LABEL = 12, + HDR_MPLS_OFFSET_EXP = 9, + HDR_MPLS_OFFSET_S_BOS = 8, + HDR_MPLS_OFFSET_TTL = 0, +}; + +u16 mlx5dr_ste_conv_bit_to_byte_mask(u8 *bit_mask); + +static inline u8 * +dr_ste_calc_flex_parser_offset(u8 *tag, u8 parser_id) +{ + /* Calculate tag byte offset based on flex parser id */ + return tag + 4 * (3 - (parser_id % 4)); +} + +#define DR_STE_CTX_BUILDER(fname) \ + ((*build_##fname##_init)(struct mlx5dr_ste_build *sb, \ + struct mlx5dr_match_param *mask)) + +struct mlx5dr_ste_ctx { + /* Builders */ + void DR_STE_CTX_BUILDER(eth_l2_src_dst); + void DR_STE_CTX_BUILDER(eth_l3_ipv6_src); + void DR_STE_CTX_BUILDER(eth_l3_ipv6_dst); + void DR_STE_CTX_BUILDER(eth_l3_ipv4_5_tuple); + void DR_STE_CTX_BUILDER(eth_l2_src); + void DR_STE_CTX_BUILDER(eth_l2_dst); + void DR_STE_CTX_BUILDER(eth_l2_tnl); + void DR_STE_CTX_BUILDER(eth_l3_ipv4_misc); + void DR_STE_CTX_BUILDER(eth_ipv6_l3_l4); + void DR_STE_CTX_BUILDER(mpls); + void DR_STE_CTX_BUILDER(tnl_gre); + void DR_STE_CTX_BUILDER(tnl_mpls); + void DR_STE_CTX_BUILDER(tnl_mpls_over_gre); + void DR_STE_CTX_BUILDER(tnl_mpls_over_udp); + void DR_STE_CTX_BUILDER(icmp); + void DR_STE_CTX_BUILDER(general_purpose); + void DR_STE_CTX_BUILDER(eth_l4_misc); + void DR_STE_CTX_BUILDER(tnl_vxlan_gpe); + void DR_STE_CTX_BUILDER(tnl_geneve); + void DR_STE_CTX_BUILDER(tnl_geneve_tlv_opt); + void DR_STE_CTX_BUILDER(tnl_geneve_tlv_opt_exist); + void DR_STE_CTX_BUILDER(register_0); + void DR_STE_CTX_BUILDER(register_1); + void DR_STE_CTX_BUILDER(src_gvmi_qpn); + void DR_STE_CTX_BUILDER(flex_parser_0); + void DR_STE_CTX_BUILDER(flex_parser_1); + void DR_STE_CTX_BUILDER(tnl_gtpu); + void DR_STE_CTX_BUILDER(tnl_header_0_1); + void DR_STE_CTX_BUILDER(tnl_gtpu_flex_parser_0); + void DR_STE_CTX_BUILDER(tnl_gtpu_flex_parser_1); + + /* Getters and Setters */ + void (*ste_init)(u8 *hw_ste_p, u16 lu_type, + bool is_rx, u16 gvmi); + void (*set_next_lu_type)(u8 *hw_ste_p, u16 lu_type); + u16 (*get_next_lu_type)(u8 *hw_ste_p); + void (*set_miss_addr)(u8 *hw_ste_p, u64 miss_addr); + u64 (*get_miss_addr)(u8 *hw_ste_p); + void (*set_hit_addr)(u8 *hw_ste_p, u64 icm_addr, u32 ht_size); + void (*set_byte_mask)(u8 *hw_ste_p, u16 byte_mask); + u16 (*get_byte_mask)(u8 *hw_ste_p); + + /* Actions */ + u32 actions_caps; + void (*set_actions_rx)(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *hw_ste_arr, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes); + void (*set_actions_tx)(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *hw_ste_arr, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes); + u32 modify_field_arr_sz; + const struct 
mlx5dr_ste_action_modify_field *modify_field_arr; + void (*set_action_set)(u8 *hw_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data); + void (*set_action_add)(u8 *hw_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data); + void (*set_action_copy)(u8 *hw_action, + u8 dst_hw_field, + u8 dst_shifter, + u8 dst_len, + u8 src_hw_field, + u8 src_shifter); + int (*set_action_decap_l3_list)(void *data, + u32 data_sz, + u8 *hw_action, + u32 hw_action_sz, + u16 *used_hw_action_num); + int (*alloc_modify_hdr_chunk)(struct mlx5dr_action *action, + u32 chunck_size); + void (*dealloc_modify_hdr_chunk)(struct mlx5dr_action *action); + + /* Send */ + void (*prepare_for_postsend)(u8 *hw_ste_p, u32 ste_size); +}; + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v0(void); +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v1(void); +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v2(void); + +#endif /* _DR_STE_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c new file mode 100644 index 0000000..01d4d34 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c @@ -0,0 +1,1995 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. */ + +#include <linux/types.h> +#include <linux/crc32.h> +#include "dr_ste.h" + +#define SVLAN_ETHERTYPE 0x88a8 +#define DR_STE_ENABLE_FLOW_TAG BIT(31) + +enum dr_ste_v0_entry_type { + DR_STE_TYPE_TX = 1, + DR_STE_TYPE_RX = 2, + DR_STE_TYPE_MODIFY_PKT = 6, +}; + +enum dr_ste_v0_action_tunl { + DR_STE_TUNL_ACTION_NONE = 0, + DR_STE_TUNL_ACTION_ENABLE = 1, + DR_STE_TUNL_ACTION_DECAP = 2, + DR_STE_TUNL_ACTION_L3_DECAP = 3, + DR_STE_TUNL_ACTION_POP_VLAN = 4, +}; + +enum dr_ste_v0_action_type { + DR_STE_ACTION_TYPE_PUSH_VLAN = 1, + DR_STE_ACTION_TYPE_ENCAP_L3 = 3, + DR_STE_ACTION_TYPE_ENCAP = 4, +}; + +enum dr_ste_v0_action_mdfy_op { + DR_STE_ACTION_MDFY_OP_COPY = 0x1, + DR_STE_ACTION_MDFY_OP_SET = 0x2, + DR_STE_ACTION_MDFY_OP_ADD = 0x3, +}; + +#define DR_STE_CALC_LU_TYPE(lookup_type, rx, inner) \ + ((inner) ? DR_STE_V0_LU_TYPE_##lookup_type##_I : \ + (rx) ?
DR_STE_V0_LU_TYPE_##lookup_type##_D : \ + DR_STE_V0_LU_TYPE_##lookup_type##_O) + +enum { + DR_STE_V0_LU_TYPE_NOP = 0x00, + DR_STE_V0_LU_TYPE_SRC_GVMI_AND_QP = 0x05, + DR_STE_V0_LU_TYPE_ETHL2_TUNNELING_I = 0x0a, + DR_STE_V0_LU_TYPE_ETHL2_DST_O = 0x06, + DR_STE_V0_LU_TYPE_ETHL2_DST_I = 0x07, + DR_STE_V0_LU_TYPE_ETHL2_DST_D = 0x1b, + DR_STE_V0_LU_TYPE_ETHL2_SRC_O = 0x08, + DR_STE_V0_LU_TYPE_ETHL2_SRC_I = 0x09, + DR_STE_V0_LU_TYPE_ETHL2_SRC_D = 0x1c, + DR_STE_V0_LU_TYPE_ETHL2_SRC_DST_O = 0x36, + DR_STE_V0_LU_TYPE_ETHL2_SRC_DST_I = 0x37, + DR_STE_V0_LU_TYPE_ETHL2_SRC_DST_D = 0x38, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_DST_O = 0x0d, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_DST_I = 0x0e, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_DST_D = 0x1e, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_SRC_O = 0x0f, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_SRC_I = 0x10, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_SRC_D = 0x1f, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x11, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x12, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_5_TUPLE_D = 0x20, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_MISC_O = 0x29, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_MISC_I = 0x2a, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_MISC_D = 0x2b, + DR_STE_V0_LU_TYPE_ETHL4_O = 0x13, + DR_STE_V0_LU_TYPE_ETHL4_I = 0x14, + DR_STE_V0_LU_TYPE_ETHL4_D = 0x21, + DR_STE_V0_LU_TYPE_ETHL4_MISC_O = 0x2c, + DR_STE_V0_LU_TYPE_ETHL4_MISC_I = 0x2d, + DR_STE_V0_LU_TYPE_ETHL4_MISC_D = 0x2e, + DR_STE_V0_LU_TYPE_MPLS_FIRST_O = 0x15, + DR_STE_V0_LU_TYPE_MPLS_FIRST_I = 0x24, + DR_STE_V0_LU_TYPE_MPLS_FIRST_D = 0x25, + DR_STE_V0_LU_TYPE_GRE = 0x16, + DR_STE_V0_LU_TYPE_FLEX_PARSER_0 = 0x22, + DR_STE_V0_LU_TYPE_FLEX_PARSER_1 = 0x23, + DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x19, + DR_STE_V0_LU_TYPE_GENERAL_PURPOSE = 0x18, + DR_STE_V0_LU_TYPE_STEERING_REGISTERS_0 = 0x2f, + DR_STE_V0_LU_TYPE_STEERING_REGISTERS_1 = 0x30, + DR_STE_V0_LU_TYPE_TUNNEL_HEADER = 0x34, + DR_STE_V0_LU_TYPE_DONT_CARE = MLX5DR_STE_LU_TYPE_DONT_CARE, +}; + +enum { + DR_STE_V0_ACTION_MDFY_FLD_L2_0 = 0, + DR_STE_V0_ACTION_MDFY_FLD_L2_1 = 1, + DR_STE_V0_ACTION_MDFY_FLD_L2_2 = 2, + DR_STE_V0_ACTION_MDFY_FLD_L3_0 = 3, + DR_STE_V0_ACTION_MDFY_FLD_L3_1 = 4, + DR_STE_V0_ACTION_MDFY_FLD_L3_2 = 5, + DR_STE_V0_ACTION_MDFY_FLD_L3_3 = 6, + DR_STE_V0_ACTION_MDFY_FLD_L3_4 = 7, + DR_STE_V0_ACTION_MDFY_FLD_L4_0 = 8, + DR_STE_V0_ACTION_MDFY_FLD_L4_1 = 9, + DR_STE_V0_ACTION_MDFY_FLD_MPLS = 10, + DR_STE_V0_ACTION_MDFY_FLD_L2_TNL_0 = 11, + DR_STE_V0_ACTION_MDFY_FLD_REG_0 = 12, + DR_STE_V0_ACTION_MDFY_FLD_REG_1 = 13, + DR_STE_V0_ACTION_MDFY_FLD_REG_2 = 14, + DR_STE_V0_ACTION_MDFY_FLD_REG_3 = 15, + DR_STE_V0_ACTION_MDFY_FLD_L4_2 = 16, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_0 = 17, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_1 = 18, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_2 = 19, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_3 = 20, + DR_STE_V0_ACTION_MDFY_FLD_L2_TNL_1 = 21, + DR_STE_V0_ACTION_MDFY_FLD_METADATA = 22, + DR_STE_V0_ACTION_MDFY_FLD_RESERVED = 23, +}; + +static const struct mlx5dr_ste_action_modify_field dr_ste_v0_action_modify_field_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_1, .start = 16, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_2, .start = 32, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_0, .start = 16, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_0, .start = 0, .end = 
15, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_1, .start = 0, .end = 5, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 48, .end = 56, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_1, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_1, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_3, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_4, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_4, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_2, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_A] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_METADATA, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_B] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_METADATA, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_0, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_1] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_2] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_1, .start = 32, .end = 63, + }, + 
[MLX5_ACTION_IN_FIELD_METADATA_REG_C_3] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_4] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_2, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_5] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_2, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_1, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_2, .start = 0, .end = 15, + }, +}; + +static void dr_ste_v0_set_entry_type(u8 *hw_ste_p, u8 entry_type) +{ + MLX5_SET(ste_general, hw_ste_p, entry_type, entry_type); +} + +static u8 dr_ste_v0_get_entry_type(u8 *hw_ste_p) +{ + return MLX5_GET(ste_general, hw_ste_p, entry_type); +} + +static void dr_ste_v0_set_miss_addr(u8 *hw_ste_p, u64 miss_addr) +{ + u64 index = miss_addr >> 6; + + /* Miss address for TX and RX STEs located in the same offsets */ + MLX5_SET(ste_rx_steering_mult, hw_ste_p, miss_address_39_32, index >> 26); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, miss_address_31_6, index); +} + +static u64 dr_ste_v0_get_miss_addr(u8 *hw_ste_p) +{ + u64 index = + ((u64)MLX5_GET(ste_rx_steering_mult, hw_ste_p, miss_address_31_6) | + ((u64)MLX5_GET(ste_rx_steering_mult, hw_ste_p, miss_address_39_32)) << 26); + + return index << 6; +} + +static void dr_ste_v0_set_byte_mask(u8 *hw_ste_p, u16 byte_mask) +{ + MLX5_SET(ste_general, hw_ste_p, byte_mask, byte_mask); +} + +static u16 dr_ste_v0_get_byte_mask(u8 *hw_ste_p) +{ + return MLX5_GET(ste_general, hw_ste_p, byte_mask); +} + +static void dr_ste_v0_set_lu_type(u8 *hw_ste_p, u16 lu_type) +{ + MLX5_SET(ste_general, hw_ste_p, entry_sub_type, lu_type); +} + +static void dr_ste_v0_set_next_lu_type(u8 *hw_ste_p, u16 lu_type) +{ + MLX5_SET(ste_general, hw_ste_p, next_lu_type, lu_type); +} + +static u16 dr_ste_v0_get_next_lu_type(u8 *hw_ste_p) +{ + return MLX5_GET(ste_general, hw_ste_p, next_lu_type); +} + +static void dr_ste_v0_set_hit_gvmi(u8 *hw_ste_p, u16 gvmi) +{ + MLX5_SET(ste_general, hw_ste_p, next_table_base_63_48, gvmi); +} + +static void dr_ste_v0_set_hit_addr(u8 *hw_ste_p, u64 icm_addr, u32 ht_size) +{ + u64 index = (icm_addr >> 5) | ht_size; + + MLX5_SET(ste_general, hw_ste_p, next_table_base_39_32_size, index >> 27); + MLX5_SET(ste_general, hw_ste_p, next_table_base_31_5_size, index); +} + +static void dr_ste_v0_init_full(u8 *hw_ste_p, u16 lu_type, + enum dr_ste_v0_entry_type entry_type, u16 gvmi) +{ + dr_ste_v0_set_entry_type(hw_ste_p, entry_type); + dr_ste_v0_set_lu_type(hw_ste_p, lu_type); + dr_ste_v0_set_next_lu_type(hw_ste_p, MLX5DR_STE_LU_TYPE_DONT_CARE); + + /* Set GVMI once, this is the same for RX/TX + * bits 63_48 of next table base / miss address encode the next GVMI + */ + MLX5_SET(ste_rx_steering_mult, hw_ste_p, gvmi, gvmi); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, next_table_base_63_48, gvmi); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, miss_address_63_48, gvmi); +} + +static void dr_ste_v0_init(u8 *hw_ste_p, u16 lu_type, + bool is_rx, u16 gvmi) +{ + enum dr_ste_v0_entry_type entry_type; + + entry_type = is_rx ? 
DR_STE_TYPE_RX : DR_STE_TYPE_TX; + dr_ste_v0_init_full(hw_ste_p, lu_type, entry_type, gvmi); +} + +static void dr_ste_v0_rx_set_flow_tag(u8 *hw_ste_p, u32 flow_tag) +{ + MLX5_SET(ste_rx_steering_mult, hw_ste_p, qp_list_pointer, + DR_STE_ENABLE_FLOW_TAG | flow_tag); +} + +static void dr_ste_v0_set_counter_id(u8 *hw_ste_p, u32 ctr_id) +{ + /* This can be used for both rx_steering_mult and for sx_transmit */ + MLX5_SET(ste_rx_steering_mult, hw_ste_p, counter_trigger_15_0, ctr_id); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, counter_trigger_23_16, ctr_id >> 16); +} + +static void dr_ste_v0_set_go_back_bit(u8 *hw_ste_p) +{ + MLX5_SET(ste_sx_transmit, hw_ste_p, go_back, 1); +} + +static void dr_ste_v0_set_tx_push_vlan(u8 *hw_ste_p, u32 vlan_hdr, + bool go_back) +{ + MLX5_SET(ste_sx_transmit, hw_ste_p, action_type, + DR_STE_ACTION_TYPE_PUSH_VLAN); + MLX5_SET(ste_sx_transmit, hw_ste_p, encap_pointer_vlan_data, vlan_hdr); + /* Due to HW limitation we need to set this bit, otherwise reformat + + * push vlan will not work. + */ + if (go_back) + dr_ste_v0_set_go_back_bit(hw_ste_p); +} + +static void dr_ste_v0_set_tx_encap(void *hw_ste_p, u32 reformat_id, + int size, bool encap_l3) +{ + MLX5_SET(ste_sx_transmit, hw_ste_p, action_type, + encap_l3 ? DR_STE_ACTION_TYPE_ENCAP_L3 : DR_STE_ACTION_TYPE_ENCAP); + /* The hardware expects here size in words (2 byte) */ + MLX5_SET(ste_sx_transmit, hw_ste_p, action_description, size / 2); + MLX5_SET(ste_sx_transmit, hw_ste_p, encap_pointer_vlan_data, reformat_id); +} + +static void dr_ste_v0_set_rx_decap(u8 *hw_ste_p) +{ + MLX5_SET(ste_rx_steering_mult, hw_ste_p, tunneling_action, + DR_STE_TUNL_ACTION_DECAP); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, fail_on_error, 1); +} + +static void dr_ste_v0_set_rx_pop_vlan(u8 *hw_ste_p) +{ + MLX5_SET(ste_rx_steering_mult, hw_ste_p, tunneling_action, + DR_STE_TUNL_ACTION_POP_VLAN); +} + +static void dr_ste_v0_set_rx_decap_l3(u8 *hw_ste_p, bool vlan) +{ + MLX5_SET(ste_rx_steering_mult, hw_ste_p, tunneling_action, + DR_STE_TUNL_ACTION_L3_DECAP); + MLX5_SET(ste_modify_packet, hw_ste_p, action_description, vlan ? 1 : 0); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, fail_on_error, 1); +} + +static void dr_ste_v0_set_rewrite_actions(u8 *hw_ste_p, u16 num_of_actions, + u32 re_write_index) +{ + MLX5_SET(ste_modify_packet, hw_ste_p, number_of_re_write_actions, + num_of_actions); + MLX5_SET(ste_modify_packet, hw_ste_p, header_re_write_actions_pointer, + re_write_index); +} + +static void dr_ste_v0_arr_init_next(u8 **last_ste, + u32 *added_stes, + enum dr_ste_v0_entry_type entry_type, + u16 gvmi) +{ + (*added_stes)++; + *last_ste += DR_STE_SIZE; + dr_ste_v0_init_full(*last_ste, MLX5DR_STE_LU_TYPE_DONT_CARE, + entry_type, gvmi); +} + +static void +dr_ste_v0_set_actions_tx(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) +{ + bool encap = action_type_set[DR_ACTION_TYP_L2_TO_TNL_L2] || + action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]; + + /* We want to make sure the modify header comes before L2 + * encapsulation. 
The reason for that is that we support + * modify headers for outer headers only + */ + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR] && attr->modify_actions) { + dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); + dr_ste_v0_set_rewrite_actions(last_ste, + attr->modify_actions, + attr->modify_index); + } + + if (action_type_set[DR_ACTION_TYP_PUSH_VLAN]) { + int i; + + for (i = 0; i < attr->vlans.count; i++) { + if (i || action_type_set[DR_ACTION_TYP_MODIFY_HDR]) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_TX, + attr->gvmi); + + dr_ste_v0_set_tx_push_vlan(last_ste, + attr->vlans.headers[i], + encap); + } + } + + if (encap) { + /* Modify header and encapsulation require a different STEs. + * Since modify header STE format doesn't support encapsulation + * tunneling_action. + */ + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR] || + action_type_set[DR_ACTION_TYP_PUSH_VLAN]) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_TX, + attr->gvmi); + + dr_ste_v0_set_tx_encap(last_ste, + attr->reformat.id, + attr->reformat.size, + action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]); + /* Whenever prio_tag_required enabled, we can be sure that the + * previous table (ACL) already push vlan to our packet, + * And due to HW limitation we need to set this bit, otherwise + * push vlan + reformat will not work. + */ + if (MLX5_CAP_GEN(dmn->mdev, prio_tag_required)) + dr_ste_v0_set_go_back_bit(last_ste); + } + + if (action_type_set[DR_ACTION_TYP_CTR]) + dr_ste_v0_set_counter_id(last_ste, attr->ctr_id); + + dr_ste_v0_set_hit_gvmi(last_ste, attr->hit_gvmi); + dr_ste_v0_set_hit_addr(last_ste, attr->final_icm_addr, 1); +} + +static void +dr_ste_v0_set_actions_rx(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) +{ + if (action_type_set[DR_ACTION_TYP_CTR]) + dr_ste_v0_set_counter_id(last_ste, attr->ctr_id); + + if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) { + dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); + dr_ste_v0_set_rx_decap_l3(last_ste, attr->decap_with_vlan); + dr_ste_v0_set_rewrite_actions(last_ste, + attr->decap_actions, + attr->decap_index); + } + + if (action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2]) + dr_ste_v0_set_rx_decap(last_ste); + + if (action_type_set[DR_ACTION_TYP_POP_VLAN]) { + int i; + + for (i = 0; i < attr->vlans.count; i++) { + if (i || + action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2] || + action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_RX, + attr->gvmi); + + dr_ste_v0_set_rx_pop_vlan(last_ste); + } + } + + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR] && attr->modify_actions) { + if (dr_ste_v0_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_MODIFY_PKT, + attr->gvmi); + else + dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); + + dr_ste_v0_set_rewrite_actions(last_ste, + attr->modify_actions, + attr->modify_index); + } + + if (action_type_set[DR_ACTION_TYP_TAG]) { + if (dr_ste_v0_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_RX, + attr->gvmi); + + dr_ste_v0_rx_set_flow_tag(last_ste, attr->flow_tag); + } + + dr_ste_v0_set_hit_gvmi(last_ste, attr->hit_gvmi); + dr_ste_v0_set_hit_addr(last_ste, attr->final_icm_addr, 1); +} + +static void dr_ste_v0_set_action_set(u8 *hw_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) +{ + length = 
(length == 32) ? 0 : length; + MLX5_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + MLX5_SET(dr_action_hw_set, hw_action, destination_field_code, hw_field); + MLX5_SET(dr_action_hw_set, hw_action, destination_left_shifter, shifter); + MLX5_SET(dr_action_hw_set, hw_action, destination_length, length); + MLX5_SET(dr_action_hw_set, hw_action, inline_data, data); +} + +static void dr_ste_v0_set_action_add(u8 *hw_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) +{ + length = (length == 32) ? 0 : length; + MLX5_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_ADD); + MLX5_SET(dr_action_hw_set, hw_action, destination_field_code, hw_field); + MLX5_SET(dr_action_hw_set, hw_action, destination_left_shifter, shifter); + MLX5_SET(dr_action_hw_set, hw_action, destination_length, length); + MLX5_SET(dr_action_hw_set, hw_action, inline_data, data); +} + +static void dr_ste_v0_set_action_copy(u8 *hw_action, + u8 dst_hw_field, + u8 dst_shifter, + u8 dst_len, + u8 src_hw_field, + u8 src_shifter) +{ + MLX5_SET(dr_action_hw_copy, hw_action, opcode, DR_STE_ACTION_MDFY_OP_COPY); + MLX5_SET(dr_action_hw_copy, hw_action, destination_field_code, dst_hw_field); + MLX5_SET(dr_action_hw_copy, hw_action, destination_left_shifter, dst_shifter); + MLX5_SET(dr_action_hw_copy, hw_action, destination_length, dst_len); + MLX5_SET(dr_action_hw_copy, hw_action, source_field_code, src_hw_field); + MLX5_SET(dr_action_hw_copy, hw_action, source_left_shifter, src_shifter); +} + +#define DR_STE_DECAP_L3_MIN_ACTION_NUM 5 + +static int +dr_ste_v0_set_action_decap_l3_list(void *data, u32 data_sz, + u8 *hw_action, u32 hw_action_sz, + u16 *used_hw_action_num) +{ + struct mlx5_ifc_l2_hdr_bits *l2_hdr = data; + u32 hw_action_num; + int required_actions; + u32 hdr_fld_4b; + u16 hdr_fld_2b; + u16 vlan_type; + bool vlan; + + vlan = (data_sz != HDR_LEN_L2); + hw_action_num = hw_action_sz / MLX5_ST_SZ_BYTES(dr_action_hw_set); + required_actions = DR_STE_DECAP_L3_MIN_ACTION_NUM + !!vlan; + + if (hw_action_num < required_actions) + return -ENOMEM; + + /* dmac_47_16 */ + MLX5_SET(dr_action_hw_set, hw_action, + opcode, DR_STE_ACTION_MDFY_OP_SET); + MLX5_SET(dr_action_hw_set, hw_action, + destination_length, 0); + MLX5_SET(dr_action_hw_set, hw_action, + destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_0); + MLX5_SET(dr_action_hw_set, hw_action, + destination_left_shifter, 16); + hdr_fld_4b = MLX5_GET(l2_hdr, l2_hdr, dmac_47_16); + MLX5_SET(dr_action_hw_set, hw_action, + inline_data, hdr_fld_4b); + hw_action += MLX5_ST_SZ_BYTES(dr_action_hw_set); + + /* smac_47_16 */ + MLX5_SET(dr_action_hw_set, hw_action, + opcode, DR_STE_ACTION_MDFY_OP_SET); + MLX5_SET(dr_action_hw_set, hw_action, + destination_length, 0); + MLX5_SET(dr_action_hw_set, hw_action, + destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_1); + MLX5_SET(dr_action_hw_set, hw_action, destination_left_shifter, 16); + hdr_fld_4b = (MLX5_GET(l2_hdr, l2_hdr, smac_31_0) >> 16 | + MLX5_GET(l2_hdr, l2_hdr, smac_47_32) << 16); + MLX5_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_4b); + hw_action += MLX5_ST_SZ_BYTES(dr_action_hw_set); + + /* dmac_15_0 */ + MLX5_SET(dr_action_hw_set, hw_action, + opcode, DR_STE_ACTION_MDFY_OP_SET); + MLX5_SET(dr_action_hw_set, hw_action, + destination_length, 16); + MLX5_SET(dr_action_hw_set, hw_action, + destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_0); + MLX5_SET(dr_action_hw_set, hw_action, + destination_left_shifter, 0); + hdr_fld_2b = MLX5_GET(l2_hdr, l2_hdr, dmac_15_0); + 
MLX5_SET(dr_action_hw_set, hw_action, + inline_data, hdr_fld_2b); + hw_action += MLX5_ST_SZ_BYTES(dr_action_hw_set); + + /* ethertype + (optional) vlan */ + MLX5_SET(dr_action_hw_set, hw_action, + opcode, DR_STE_ACTION_MDFY_OP_SET); + MLX5_SET(dr_action_hw_set, hw_action, + destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_2); + MLX5_SET(dr_action_hw_set, hw_action, + destination_left_shifter, 32); + if (!vlan) { + hdr_fld_2b = MLX5_GET(l2_hdr, l2_hdr, ethertype); + MLX5_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_2b); + MLX5_SET(dr_action_hw_set, hw_action, destination_length, 16); + } else { + hdr_fld_2b = MLX5_GET(l2_hdr, l2_hdr, ethertype); + vlan_type = hdr_fld_2b == SVLAN_ETHERTYPE ? DR_STE_SVLAN : DR_STE_CVLAN; + hdr_fld_2b = MLX5_GET(l2_hdr, l2_hdr, vlan); + hdr_fld_4b = (vlan_type << 16) | hdr_fld_2b; + MLX5_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_4b); + MLX5_SET(dr_action_hw_set, hw_action, destination_length, 18); + } + hw_action += MLX5_ST_SZ_BYTES(dr_action_hw_set); + + /* smac_15_0 */ + MLX5_SET(dr_action_hw_set, hw_action, + opcode, DR_STE_ACTION_MDFY_OP_SET); + MLX5_SET(dr_action_hw_set, hw_action, + destination_length, 16); + MLX5_SET(dr_action_hw_set, hw_action, + destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_1); + MLX5_SET(dr_action_hw_set, hw_action, + destination_left_shifter, 0); + hdr_fld_2b = MLX5_GET(l2_hdr, l2_hdr, smac_31_0); + MLX5_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_2b); + hw_action += MLX5_ST_SZ_BYTES(dr_action_hw_set); + + if (vlan) { + MLX5_SET(dr_action_hw_set, hw_action, + opcode, DR_STE_ACTION_MDFY_OP_SET); + hdr_fld_2b = MLX5_GET(l2_hdr, l2_hdr, vlan_type); + MLX5_SET(dr_action_hw_set, hw_action, + inline_data, hdr_fld_2b); + MLX5_SET(dr_action_hw_set, hw_action, + destination_length, 16); + MLX5_SET(dr_action_hw_set, hw_action, + destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_2); + MLX5_SET(dr_action_hw_set, hw_action, + destination_left_shifter, 0); + } + + *used_hw_action_num = required_actions; + + return 0; +} + +static void +dr_ste_v0_build_eth_l2_src_dst_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, dmac_15_0, mask, dmac_15_0); + + if (mask->smac_47_16 || mask->smac_15_0) { + MLX5_SET(ste_eth_l2_src_dst, bit_mask, smac_47_32, + mask->smac_47_16 >> 16); + MLX5_SET(ste_eth_l2_src_dst, bit_mask, smac_31_0, + mask->smac_47_16 << 16 | mask->smac_15_0); + mask->smac_47_16 = 0; + mask->smac_15_0 = 0; + } + + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_ONES(eth_l2_src_dst, bit_mask, l3_type, mask, ip_version); + + if (mask->cvlan_tag) { + MLX5_SET(ste_eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + } else if (mask->svlan_tag) { + MLX5_SET(ste_eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); + mask->svlan_tag = 0; + } +} + +static int +dr_ste_v0_build_eth_l2_src_dst_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_15_0, spec, dmac_15_0); + + if (spec->smac_47_16 || spec->smac_15_0) { + MLX5_SET(ste_eth_l2_src_dst, tag, smac_47_32, + spec->smac_47_16 >> 16); + MLX5_SET(ste_eth_l2_src_dst, tag, smac_31_0, + spec->smac_47_16 << 16 | spec->smac_15_0); + spec->smac_47_16 = 0; + spec->smac_15_0 = 0; + } + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + MLX5_SET(ste_eth_l2_src_dst, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + MLX5_SET(ste_eth_l2_src_dst, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + return -EINVAL; + } + } + + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_priority, spec, first_prio); + + if (spec->cvlan_tag) { + MLX5_SET(ste_eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + MLX5_SET(ste_eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + return 0; +} + +static void +dr_ste_v0_build_eth_l2_src_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_src_dst_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC_DST, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_src_dst_tag; +} + +static int +dr_ste_v0_build_eth_l3_ipv6_dst_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_127_96, spec, dst_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_95_64, spec, dst_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_63_32, spec, dst_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_31_0, spec, dst_ip_31_0); + + return 0; +} + +static void +dr_ste_v0_build_eth_l3_ipv6_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv6_dst_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_DST, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv6_dst_tag; +} + +static int +dr_ste_v0_build_eth_l3_ipv6_src_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_127_96, spec, src_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_95_64, spec, src_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_63_32, spec, src_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_31_0, spec, src_ip_31_0); + + return 0; +} + +static void +dr_ste_v0_build_eth_l3_ipv6_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv6_src_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_SRC, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv6_src_tag; +} + +static int +dr_ste_v0_build_eth_l3_ipv4_5_tuple_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_address, spec, dst_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_address, spec, src_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, ecn, spec, ip_ecn); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void +dr_ste_v0_build_eth_l3_ipv4_5_tuple_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv4_5_tuple_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_5_TUPLE, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv4_5_tuple_tag; +} + +static void +dr_ste_v0_build_eth_l2_src_or_dst_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + struct mlx5dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_TAG(eth_l2_src, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_src, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_TAG(eth_l2_src, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_ONES(eth_l2_src, bit_mask, l3_type, mask, ip_version); + + if (mask->svlan_tag || mask->cvlan_tag) { + MLX5_SET(ste_eth_l2_src, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } + + if (inner) { + if (misc_mask->inner_second_cvlan_tag || + misc_mask->inner_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src, bit_mask, second_vlan_qualifier, -1); + misc_mask->inner_second_cvlan_tag = 0; + misc_mask->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src, bit_mask, + second_vlan_id, misc_mask, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src, bit_mask, + second_cfi, misc_mask, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src, bit_mask, + second_priority, misc_mask, inner_second_prio); + } else { + if (misc_mask->outer_second_cvlan_tag || + misc_mask->outer_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src, bit_mask, second_vlan_qualifier, -1); + misc_mask->outer_second_cvlan_tag = 0; + misc_mask->outer_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src, bit_mask, + second_vlan_id, misc_mask, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src, bit_mask, + second_cfi, misc_mask, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src, bit_mask, + second_priority, misc_mask, outer_second_prio); + } +} + +static int +dr_ste_v0_build_eth_l2_src_or_dst_tag(struct mlx5dr_match_param *value, + bool inner, u8 *tag) +{ + struct mlx5dr_match_spec *spec = inner ? 
&value->inner : &value->outer; + struct mlx5dr_match_misc *misc_spec = &value->misc; + + DR_STE_SET_TAG(eth_l2_src, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_src, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_src, tag, l3_ethertype, spec, ethertype); + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + MLX5_SET(ste_eth_l2_src, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + MLX5_SET(ste_eth_l2_src, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + return -EINVAL; + } + } + + if (spec->cvlan_tag) { + MLX5_SET(ste_eth_l2_src, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + MLX5_SET(ste_eth_l2_src, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (inner) { + if (misc_spec->inner_second_cvlan_tag) { + MLX5_SET(ste_eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->inner_second_cvlan_tag = 0; + } else if (misc_spec->inner_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, inner_second_prio); + } else { + if (misc_spec->outer_second_cvlan_tag) { + MLX5_SET(ste_eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->outer_second_cvlan_tag = 0; + } else if (misc_spec->outer_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->outer_second_svlan_tag = 0; + } + DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, outer_second_prio); + } + + return 0; +} + +static void +dr_ste_v0_build_eth_l2_src_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src, bit_mask, smac_47_16, mask, smac_47_16); + DR_STE_SET_TAG(eth_l2_src, bit_mask, smac_15_0, mask, smac_15_0); + + dr_ste_v0_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int +dr_ste_v0_build_eth_l2_src_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src, tag, smac_47_16, spec, smac_47_16); + DR_STE_SET_TAG(eth_l2_src, tag, smac_15_0, spec, smac_15_0); + + return dr_ste_v0_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void +dr_ste_v0_build_eth_l2_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_src_bit_mask(mask, sb->inner, sb->bit_mask); + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_src_tag; +} + +static void +dr_ste_v0_build_eth_l2_dst_bit_mask(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = sb->inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst, bit_mask, dmac_15_0, mask, dmac_15_0); + + dr_ste_v0_build_eth_l2_src_or_dst_bit_mask(value, sb->inner, bit_mask); +} + +static int +dr_ste_v0_build_eth_l2_dst_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst, tag, dmac_15_0, spec, dmac_15_0); + + return dr_ste_v0_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void +dr_ste_v0_build_eth_l2_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_dst_bit_mask(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_DST, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_dst_tag; +} + +static void +dr_ste_v0_build_eth_l2_tnl_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? &value->inner : &value->outer; + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, dmac_15_0, mask, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_ONES(eth_l2_tnl, bit_mask, l3_type, mask, ip_version); + + if (misc->vxlan_vni) { + MLX5_SET(ste_eth_l2_tnl, bit_mask, + l2_tunneling_network_id, (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (mask->svlan_tag || mask->cvlan_tag) { + MLX5_SET(ste_eth_l2_tnl, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } +} + +static int +dr_ste_v0_build_eth_l2_tnl_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_15_0, spec, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_tnl, tag, l3_ethertype, spec, ethertype); + + if (misc->vxlan_vni) { + MLX5_SET(ste_eth_l2_tnl, tag, l2_tunneling_network_id, + (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (spec->cvlan_tag) { + MLX5_SET(ste_eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + MLX5_SET(ste_eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + MLX5_SET(ste_eth_l2_tnl, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + MLX5_SET(ste_eth_l2_tnl, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + return -EINVAL; + } + } + + return 0; +} + +static void +dr_ste_v0_build_eth_l2_tnl_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_tnl_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_ETHL2_TUNNELING_I; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_tnl_tag; +} + +static int +dr_ste_v0_build_eth_l3_ipv4_misc_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_misc, tag, time_to_live, spec, ttl_hoplimit); + DR_STE_SET_TAG(eth_l3_ipv4_misc, tag, ihl, spec, ipv4_ihl); + + return 0; +} + +static void +dr_ste_v0_build_eth_l3_ipv4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_MISC, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv4_misc_tag; +} + +static int +dr_ste_v0_build_eth_ipv6_l3_l4_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l4, tag, src_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l4, tag, src_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l4, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l4, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l4, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l4, tag, ecn, spec, ip_ecn); + DR_STE_SET_TAG(eth_l4, tag, ipv6_hop_limit, spec, ttl_hoplimit); + + if (sb->inner) + DR_STE_SET_TAG(eth_l4, tag, flow_label, misc, inner_ipv6_flow_label); + else + DR_STE_SET_TAG(eth_l4, tag, flow_label, misc, outer_ipv6_flow_label); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l4, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void +dr_ste_v0_build_eth_ipv6_l3_l4_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_ipv6_l3_l4_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_ipv6_l3_l4_tag; +} + +static int +dr_ste_v0_build_mpls_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + if (sb->inner) + DR_STE_SET_MPLS(mpls, misc2, inner, tag); + else + DR_STE_SET_MPLS(mpls, misc2, outer, tag); + + return 0; +} + +static void +dr_ste_v0_build_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(MPLS_FIRST, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_mpls_tag; +} + +static int +dr_ste_v0_build_tnl_gre_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(gre, tag, gre_protocol, misc, gre_protocol); + + DR_STE_SET_TAG(gre, tag, gre_k_present, misc, gre_k_present); + DR_STE_SET_TAG(gre, tag, gre_key_h, misc, gre_key_h); + DR_STE_SET_TAG(gre, tag, gre_key_l, misc, gre_key_l); + + DR_STE_SET_TAG(gre, tag, gre_c_present, misc, gre_c_present); + + DR_STE_SET_TAG(gre, tag, gre_s_present, misc, gre_s_present); + + return 0; +} + +static void +dr_ste_v0_build_tnl_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_tnl_gre_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_GRE; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_gre_tag; +} + +static int +dr_ste_v0_build_tnl_mpls_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc_2 = &value->misc2; + u32 mpls_hdr; + + if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc_2)) { + mpls_hdr = misc_2->outer_first_mpls_over_gre_label << HDR_MPLS_OFFSET_LABEL; + misc_2->outer_first_mpls_over_gre_label = 0; + mpls_hdr |= misc_2->outer_first_mpls_over_gre_exp << HDR_MPLS_OFFSET_EXP; + misc_2->outer_first_mpls_over_gre_exp = 0; + mpls_hdr |= misc_2->outer_first_mpls_over_gre_s_bos << HDR_MPLS_OFFSET_S_BOS; + misc_2->outer_first_mpls_over_gre_s_bos = 0; + mpls_hdr |= misc_2->outer_first_mpls_over_gre_ttl << HDR_MPLS_OFFSET_TTL; + misc_2->outer_first_mpls_over_gre_ttl = 0; + 
} else { + mpls_hdr = misc_2->outer_first_mpls_over_udp_label << HDR_MPLS_OFFSET_LABEL; + misc_2->outer_first_mpls_over_udp_label = 0; + mpls_hdr |= misc_2->outer_first_mpls_over_udp_exp << HDR_MPLS_OFFSET_EXP; + misc_2->outer_first_mpls_over_udp_exp = 0; + mpls_hdr |= misc_2->outer_first_mpls_over_udp_s_bos << HDR_MPLS_OFFSET_S_BOS; + misc_2->outer_first_mpls_over_udp_s_bos = 0; + mpls_hdr |= misc_2->outer_first_mpls_over_udp_ttl << HDR_MPLS_OFFSET_TTL; + misc_2->outer_first_mpls_over_udp_ttl = 0; + } + + MLX5_SET(ste_flex_parser_0, tag, flex_parser_3, mpls_hdr); + return 0; +} + +static void +dr_ste_v0_build_tnl_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_tnl_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_mpls_tag; +} + +static int +dr_ste_v0_build_tnl_mpls_over_udp_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + u8 *parser_ptr; + u8 parser_id; + u32 mpls_hdr; + + mpls_hdr = misc2->outer_first_mpls_over_udp_label << HDR_MPLS_OFFSET_LABEL; + misc2->outer_first_mpls_over_udp_label = 0; + mpls_hdr |= misc2->outer_first_mpls_over_udp_exp << HDR_MPLS_OFFSET_EXP; + misc2->outer_first_mpls_over_udp_exp = 0; + mpls_hdr |= misc2->outer_first_mpls_over_udp_s_bos << HDR_MPLS_OFFSET_S_BOS; + misc2->outer_first_mpls_over_udp_s_bos = 0; + mpls_hdr |= misc2->outer_first_mpls_over_udp_ttl << HDR_MPLS_OFFSET_TTL; + misc2->outer_first_mpls_over_udp_ttl = 0; + + parser_id = sb->caps->flex_parser_id_mpls_over_udp; + parser_ptr = dr_ste_calc_flex_parser_offset(tag, parser_id); + *(__be32 *)parser_ptr = cpu_to_be32(mpls_hdr); + + return 0; +} + +static void +dr_ste_v0_build_tnl_mpls_over_udp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_tnl_mpls_over_udp_tag(mask, sb, sb->bit_mask); + /* STEs with lookup type FLEX_PARSER_{0/1} includes + * flex parsers_{0-3}/{4-7} respectively. + */ + sb->lu_type = sb->caps->flex_parser_id_mpls_over_udp > DR_STE_MAX_FLEX_0_ID ? 
+ DR_STE_V0_LU_TYPE_FLEX_PARSER_1 : + DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_mpls_over_udp_tag; +} + +static int +dr_ste_v0_build_tnl_mpls_over_gre_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + u8 *parser_ptr; + u8 parser_id; + u32 mpls_hdr; + + mpls_hdr = misc2->outer_first_mpls_over_gre_label << HDR_MPLS_OFFSET_LABEL; + misc2->outer_first_mpls_over_gre_label = 0; + mpls_hdr |= misc2->outer_first_mpls_over_gre_exp << HDR_MPLS_OFFSET_EXP; + misc2->outer_first_mpls_over_gre_exp = 0; + mpls_hdr |= misc2->outer_first_mpls_over_gre_s_bos << HDR_MPLS_OFFSET_S_BOS; + misc2->outer_first_mpls_over_gre_s_bos = 0; + mpls_hdr |= misc2->outer_first_mpls_over_gre_ttl << HDR_MPLS_OFFSET_TTL; + misc2->outer_first_mpls_over_gre_ttl = 0; + + parser_id = sb->caps->flex_parser_id_mpls_over_gre; + parser_ptr = dr_ste_calc_flex_parser_offset(tag, parser_id); + *(__be32 *)parser_ptr = cpu_to_be32(mpls_hdr); + + return 0; +} + +static void +dr_ste_v0_build_tnl_mpls_over_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_tnl_mpls_over_gre_tag(mask, sb, sb->bit_mask); + + /* STEs with lookup type FLEX_PARSER_{0/1} includes + * flex parsers_{0-3}/{4-7} respectively. + */ + sb->lu_type = sb->caps->flex_parser_id_mpls_over_gre > DR_STE_MAX_FLEX_0_ID ? + DR_STE_V0_LU_TYPE_FLEX_PARSER_1 : + DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_mpls_over_gre_tag; +} + +#define ICMP_TYPE_OFFSET_FIRST_DW 24 +#define ICMP_CODE_OFFSET_FIRST_DW 16 + +static int +dr_ste_v0_build_icmp_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc_3 = &value->misc3; + u32 *icmp_header_data; + int dw0_location; + int dw1_location; + u8 *parser_ptr; + u8 *icmp_type; + u8 *icmp_code; + bool is_ipv4; + u32 icmp_hdr; + + is_ipv4 = DR_MASK_IS_ICMPV4_SET(misc_3); + if (is_ipv4) { + icmp_header_data = &misc_3->icmpv4_header_data; + icmp_type = &misc_3->icmpv4_type; + icmp_code = &misc_3->icmpv4_code; + dw0_location = sb->caps->flex_parser_id_icmp_dw0; + dw1_location = sb->caps->flex_parser_id_icmp_dw1; + } else { + icmp_header_data = &misc_3->icmpv6_header_data; + icmp_type = &misc_3->icmpv6_type; + icmp_code = &misc_3->icmpv6_code; + dw0_location = sb->caps->flex_parser_id_icmpv6_dw0; + dw1_location = sb->caps->flex_parser_id_icmpv6_dw1; + } + + parser_ptr = dr_ste_calc_flex_parser_offset(tag, dw0_location); + icmp_hdr = (*icmp_type << ICMP_TYPE_OFFSET_FIRST_DW) | + (*icmp_code << ICMP_CODE_OFFSET_FIRST_DW); + *(__be32 *)parser_ptr = cpu_to_be32(icmp_hdr); + *icmp_code = 0; + *icmp_type = 0; + + parser_ptr = dr_ste_calc_flex_parser_offset(tag, dw1_location); + *(__be32 *)parser_ptr = cpu_to_be32(*icmp_header_data); + *icmp_header_data = 0; + + return 0; +} + +static void +dr_ste_v0_build_icmp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + u8 parser_id; + bool is_ipv4; + + dr_ste_v0_build_icmp_tag(mask, sb, sb->bit_mask); + + /* STEs with lookup type FLEX_PARSER_{0/1} includes + * flex parsers_{0-3}/{4-7} respectively. + */ + is_ipv4 = DR_MASK_IS_ICMPV4_SET(&mask->misc3); + parser_id = is_ipv4 ? sb->caps->flex_parser_id_icmp_dw0 : + sb->caps->flex_parser_id_icmpv6_dw0; + sb->lu_type = parser_id > DR_STE_MAX_FLEX_0_ID ? 
+ DR_STE_V0_LU_TYPE_FLEX_PARSER_1 : + DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_icmp_tag; +} + +static int +dr_ste_v0_build_general_purpose_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc_2 = &value->misc2; + + DR_STE_SET_TAG(general_purpose, tag, general_purpose_lookup_field, + misc_2, metadata_reg_a); + + return 0; +} + +static void +dr_ste_v0_build_general_purpose_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_general_purpose_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_GENERAL_PURPOSE; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_general_purpose_tag; +} + +static int +dr_ste_v0_build_eth_l4_misc_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + + if (sb->inner) { + DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, inner_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, inner_tcp_ack_num); + } else { + DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, outer_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, outer_tcp_ack_num); + } + + return 0; +} + +static void +dr_ste_v0_build_eth_l4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_eth_l4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4_MISC, sb->rx, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l4_misc_tag; +} + +static int +dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_flags, misc3, + outer_vxlan_gpe_flags); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_next_protocol, misc3, + outer_vxlan_gpe_next_protocol); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_vni, misc3, + outer_vxlan_gpe_vni); + + return 0; +} + +static void +dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_tag(mask, sb, sb->bit_mask); + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_tag; +} + +static int +dr_ste_v0_build_flex_parser_tnl_geneve_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_protocol_type, misc, geneve_protocol_type); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_oam, misc, geneve_oam); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_opt_len, misc, geneve_opt_len); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_vni, misc, geneve_vni); + + return 0; +} + +static void +dr_ste_v0_build_flex_parser_tnl_geneve_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_flex_parser_tnl_geneve_tag(mask, sb, sb->bit_mask); + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = 
mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tnl_geneve_tag; +} + +static int +dr_ste_v0_build_register_0_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_0, tag, register_0_h, misc2, metadata_reg_c_0); + DR_STE_SET_TAG(register_0, tag, register_0_l, misc2, metadata_reg_c_1); + DR_STE_SET_TAG(register_0, tag, register_1_h, misc2, metadata_reg_c_2); + DR_STE_SET_TAG(register_0, tag, register_1_l, misc2, metadata_reg_c_3); + + return 0; +} + +static void +dr_ste_v0_build_register_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_register_0_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_STEERING_REGISTERS_0; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_register_0_tag; +} + +static int +dr_ste_v0_build_register_1_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_1, tag, register_2_h, misc2, metadata_reg_c_4); + DR_STE_SET_TAG(register_1, tag, register_2_l, misc2, metadata_reg_c_5); + DR_STE_SET_TAG(register_1, tag, register_3_h, misc2, metadata_reg_c_6); + DR_STE_SET_TAG(register_1, tag, register_3_l, misc2, metadata_reg_c_7); + + return 0; +} + +static void +dr_ste_v0_build_register_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_register_1_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_STEERING_REGISTERS_1; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_register_1_tag; +} + +static void +dr_ste_v0_build_src_gvmi_qpn_bit_mask(struct mlx5dr_match_param *value, + u8 *bit_mask) +{ + struct mlx5dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_ONES(src_gvmi_qp, bit_mask, source_gvmi, misc_mask, source_port); + DR_STE_SET_ONES(src_gvmi_qp, bit_mask, source_qp, misc_mask, source_sqn); + misc_mask->source_eswitch_owner_vhca_id = 0; +} + +static int +dr_ste_v0_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc *misc = &value->misc; + struct mlx5dr_cmd_vport_cap *vport_cap; + struct mlx5dr_domain *dmn = sb->dmn; + struct mlx5dr_domain *vport_dmn; + u8 *bit_mask = sb->bit_mask; + bool source_gvmi_set; + + DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn); + + if (sb->vhca_id_valid) { + /* Find port GVMI based on the eswitch_owner_vhca_id */ + if (misc->source_eswitch_owner_vhca_id == dmn->info.caps.gvmi) + vport_dmn = dmn; + else if (dmn->peer_dmn && (misc->source_eswitch_owner_vhca_id == + dmn->peer_dmn->info.caps.gvmi)) + vport_dmn = dmn->peer_dmn; + else + return -EINVAL; + + misc->source_eswitch_owner_vhca_id = 0; + } else { + vport_dmn = dmn; + } + + source_gvmi_set = MLX5_GET(ste_src_gvmi_qp, bit_mask, source_gvmi); + if (source_gvmi_set) { + vport_cap = mlx5dr_domain_get_vport_cap(vport_dmn, + misc->source_port); + if (!vport_cap) { + mlx5dr_err(dmn, "Vport 0x%x is disabled or invalid\n", + misc->source_port); + return -EINVAL; + } + + if (vport_cap->vport_gvmi) + MLX5_SET(ste_src_gvmi_qp, tag, source_gvmi, vport_cap->vport_gvmi); + + misc->source_port = 0; + } + + return 0; +} + +static void +dr_ste_v0_build_src_gvmi_qpn_init(struct mlx5dr_ste_build *sb, + struct 
mlx5dr_match_param *mask) +{ + dr_ste_v0_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_SRC_GVMI_AND_QP; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_src_gvmi_qpn_tag; +} + +static void dr_ste_v0_set_flex_parser(u32 *misc4_field_id, + u32 *misc4_field_value, + bool *parser_is_used, + u8 *tag) +{ + u32 id = *misc4_field_id; + u8 *parser_ptr; + + if (id >= DR_NUM_OF_FLEX_PARSERS || parser_is_used[id]) + return; + + parser_is_used[id] = true; + parser_ptr = dr_ste_calc_flex_parser_offset(tag, id); + + *(__be32 *)parser_ptr = cpu_to_be32(*misc4_field_value); + *misc4_field_id = 0; + *misc4_field_value = 0; +} + +static int dr_ste_v0_build_flex_parser_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc4 *misc_4_mask = &value->misc4; + bool parser_is_used[DR_NUM_OF_FLEX_PARSERS] = {}; + + dr_ste_v0_set_flex_parser(&misc_4_mask->prog_sample_field_id_0, + &misc_4_mask->prog_sample_field_value_0, + parser_is_used, tag); + + dr_ste_v0_set_flex_parser(&misc_4_mask->prog_sample_field_id_1, + &misc_4_mask->prog_sample_field_value_1, + parser_is_used, tag); + + dr_ste_v0_set_flex_parser(&misc_4_mask->prog_sample_field_id_2, + &misc_4_mask->prog_sample_field_value_2, + parser_is_used, tag); + + dr_ste_v0_set_flex_parser(&misc_4_mask->prog_sample_field_id_3, + &misc_4_mask->prog_sample_field_value_3, + parser_is_used, tag); + + return 0; +} + +static void dr_ste_v0_build_flex_parser_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + dr_ste_v0_build_flex_parser_tag(mask, sb, sb->bit_mask); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tag; +} + +static void dr_ste_v0_build_flex_parser_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_1; + dr_ste_v0_build_flex_parser_tag(mask, sb, sb->bit_mask); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tag; +} + +static int +dr_ste_v0_build_flex_parser_tnl_geneve_tlv_opt_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + u8 parser_id = sb->caps->flex_parser_id_geneve_tlv_option_0; + u8 *parser_ptr = dr_ste_calc_flex_parser_offset(tag, parser_id); + + MLX5_SET(ste_flex_parser_0, parser_ptr, flex_parser_3, + misc3->geneve_tlv_option_0_data); + misc3->geneve_tlv_option_0_data = 0; + + return 0; +} + +static void +dr_ste_v0_build_flex_parser_tnl_geneve_tlv_opt_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_flex_parser_tnl_geneve_tlv_opt_tag(mask, sb, sb->bit_mask); + + /* STEs with lookup type FLEX_PARSER_{0/1} includes + * flex parsers_{0-3}/{4-7} respectively. + */ + sb->lu_type = sb->caps->flex_parser_id_geneve_tlv_option_0 > 3 ? 
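+ /* Same split as for the other flex-parser fields: GENEVE TLV option 0
+  * data held in parsers 0-3 is matched via FLEX_PARSER_0, in parsers
+  * 4-7 via FLEX_PARSER_1.
+  */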
+ DR_STE_V0_LU_TYPE_FLEX_PARSER_1 : + DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tnl_geneve_tlv_opt_tag; +} + +static int dr_ste_v0_build_flex_parser_tnl_gtpu_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_msg_flags, misc3, + gtpu_msg_flags); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_msg_type, misc3, + gtpu_msg_type); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_teid, misc3, + gtpu_teid); + + return 0; +} + +static void dr_ste_v0_build_flex_parser_tnl_gtpu_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_flex_parser_tnl_gtpu_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tnl_gtpu_tag; +} + +static int +dr_ste_v0_build_tnl_gtpu_flex_parser_0_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + if (dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_0, sb->caps, &value->misc3); + if (dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_teid)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_teid, sb->caps, &value->misc3); + if (dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_dw_2)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_2, sb->caps, &value->misc3); + if (dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_first_ext_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_first_ext_dw_0, sb->caps, &value->misc3); + return 0; +} + +static void +dr_ste_v0_build_tnl_gtpu_flex_parser_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_tnl_gtpu_flex_parser_0_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_gtpu_flex_parser_0_tag; +} + +static int +dr_ste_v0_build_tnl_gtpu_flex_parser_1_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_0, sb->caps, &value->misc3); + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_teid)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_teid, sb->caps, &value->misc3); + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_dw_2)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_2, sb->caps, &value->misc3); + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_first_ext_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_first_ext_dw_0, sb->caps, &value->misc3); + return 0; +} + +static void +dr_ste_v0_build_tnl_gtpu_flex_parser_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v0_build_tnl_gtpu_flex_parser_1_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_1; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_gtpu_flex_parser_1_tag; +} + +static int dr_ste_v0_build_tnl_header_0_1_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + uint8_t *tag) +{ + struct mlx5dr_match_misc5 *misc5 = &value->misc5; + + DR_STE_SET_TAG(tunnel_header, tag, tunnel_header_0, misc5, 
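+ /* generic tunnel match: header dwords 0/1 are taken as-is from misc5 */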
tunnel_header_0); + DR_STE_SET_TAG(tunnel_header, tag, tunnel_header_1, misc5, tunnel_header_1); + + return 0; +} + +static void dr_ste_v0_build_tnl_header_0_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + sb->lu_type = DR_STE_V0_LU_TYPE_TUNNEL_HEADER; + dr_ste_v0_build_tnl_header_0_1_tag(mask, sb, sb->bit_mask); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_header_0_1_tag; +} + +static int +dr_ste_v0_alloc_modify_hdr_chunk(struct mlx5dr_action *action, + u32 chunck_size) +{ + int ret; + + action->rewrite->chunk = + mlx5dr_icm_alloc_chunk(action->rewrite->dmn->action_icm_pool, + chunck_size); + if (!action->rewrite->chunk) + return -ENOMEM; + + action->rewrite->index = (mlx5dr_icm_pool_get_chunk_icm_addr(action->rewrite->chunk) - + action->rewrite->dmn->info.caps.hdr_modify_icm_addr) / + MLX5DR_ACTION_CACHE_LINE_SIZE; + + ret = mlx5dr_send_postsend_action(action->rewrite->dmn, action); + if (ret) + goto free_chunk; + + return 0; + +free_chunk: + mlx5dr_icm_free_chunk(action->rewrite->chunk); + return -ENOMEM; +} + +static void dr_ste_v0_dealloc_modify_hdr_chunk(struct mlx5dr_action *action) +{ + mlx5dr_icm_free_chunk(action->rewrite->chunk); + kfree(action->rewrite->data); +} + +static struct mlx5dr_ste_ctx ste_ctx_v0 = { + /* Builders */ + .build_eth_l2_src_dst_init = &dr_ste_v0_build_eth_l2_src_dst_init, + .build_eth_l3_ipv6_src_init = &dr_ste_v0_build_eth_l3_ipv6_src_init, + .build_eth_l3_ipv6_dst_init = &dr_ste_v0_build_eth_l3_ipv6_dst_init, + .build_eth_l3_ipv4_5_tuple_init = &dr_ste_v0_build_eth_l3_ipv4_5_tuple_init, + .build_eth_l2_src_init = &dr_ste_v0_build_eth_l2_src_init, + .build_eth_l2_dst_init = &dr_ste_v0_build_eth_l2_dst_init, + .build_eth_l2_tnl_init = &dr_ste_v0_build_eth_l2_tnl_init, + .build_eth_l3_ipv4_misc_init = &dr_ste_v0_build_eth_l3_ipv4_misc_init, + .build_eth_ipv6_l3_l4_init = &dr_ste_v0_build_eth_ipv6_l3_l4_init, + .build_mpls_init = &dr_ste_v0_build_mpls_init, + .build_tnl_gre_init = &dr_ste_v0_build_tnl_gre_init, + .build_tnl_mpls_init = &dr_ste_v0_build_tnl_mpls_init, + .build_tnl_mpls_over_udp_init = &dr_ste_v0_build_tnl_mpls_over_udp_init, + .build_tnl_mpls_over_gre_init = &dr_ste_v0_build_tnl_mpls_over_gre_init, + .build_icmp_init = &dr_ste_v0_build_icmp_init, + .build_general_purpose_init = &dr_ste_v0_build_general_purpose_init, + .build_eth_l4_misc_init = &dr_ste_v0_build_eth_l4_misc_init, + .build_tnl_vxlan_gpe_init = &dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_init, + .build_tnl_geneve_init = &dr_ste_v0_build_flex_parser_tnl_geneve_init, + .build_tnl_geneve_tlv_opt_init = &dr_ste_v0_build_flex_parser_tnl_geneve_tlv_opt_init, + .build_register_0_init = &dr_ste_v0_build_register_0_init, + .build_register_1_init = &dr_ste_v0_build_register_1_init, + .build_src_gvmi_qpn_init = &dr_ste_v0_build_src_gvmi_qpn_init, + .build_flex_parser_0_init = &dr_ste_v0_build_flex_parser_0_init, + .build_flex_parser_1_init = &dr_ste_v0_build_flex_parser_1_init, + .build_tnl_gtpu_init = &dr_ste_v0_build_flex_parser_tnl_gtpu_init, + .build_tnl_header_0_1_init = &dr_ste_v0_build_tnl_header_0_1_init, + .build_tnl_gtpu_flex_parser_0_init = &dr_ste_v0_build_tnl_gtpu_flex_parser_0_init, + .build_tnl_gtpu_flex_parser_1_init = &dr_ste_v0_build_tnl_gtpu_flex_parser_1_init, + + /* Getters and Setters */ + .ste_init = &dr_ste_v0_init, + .set_next_lu_type = &dr_ste_v0_set_next_lu_type, + .get_next_lu_type = &dr_ste_v0_get_next_lu_type, + .set_miss_addr = &dr_ste_v0_set_miss_addr, + 
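+ /* The remaining hooks expose the v0 hit/miss address and byte-mask
+  * encodings to the rest of the SW steering code.
+  */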
.get_miss_addr = &dr_ste_v0_get_miss_addr, + .set_hit_addr = &dr_ste_v0_set_hit_addr, + .set_byte_mask = &dr_ste_v0_set_byte_mask, + .get_byte_mask = &dr_ste_v0_get_byte_mask, + + /* Actions */ + .actions_caps = DR_STE_CTX_ACTION_CAP_NONE, + .set_actions_rx = &dr_ste_v0_set_actions_rx, + .set_actions_tx = &dr_ste_v0_set_actions_tx, + .modify_field_arr_sz = ARRAY_SIZE(dr_ste_v0_action_modify_field_arr), + .modify_field_arr = dr_ste_v0_action_modify_field_arr, + .set_action_set = &dr_ste_v0_set_action_set, + .set_action_add = &dr_ste_v0_set_action_add, + .set_action_copy = &dr_ste_v0_set_action_copy, + .set_action_decap_l3_list = &dr_ste_v0_set_action_decap_l3_list, + .alloc_modify_hdr_chunk = &dr_ste_v0_alloc_modify_hdr_chunk, + .dealloc_modify_hdr_chunk = &dr_ste_v0_dealloc_modify_hdr_chunk, +}; + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v0(void) +{ + return &ste_ctx_v0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c new file mode 100644 index 0000000..5b63d9b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c @@ -0,0 +1,2483 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. */ + +#include +#include "mlx5_ifc_dr_ste_v1.h" +#include "dr_ste.h" + +#define DR_STE_CALC_DFNR_TYPE(lookup_type, inner) \ + ((inner) ? DR_STE_V1_LU_TYPE_##lookup_type##_I : \ + DR_STE_V1_LU_TYPE_##lookup_type##_O) + +enum dr_ste_v1_entry_format { + DR_STE_V1_TYPE_BWC_BYTE = 0x0, + DR_STE_V1_TYPE_BWC_DW = 0x1, + DR_STE_V1_TYPE_MATCH = 0x2, +}; + +/* Lookup type is built from 2B: [ Definer mode 1B ][ Definer index 1B ] */ +enum { + DR_STE_V1_LU_TYPE_NOP = 0x0000, + DR_STE_V1_LU_TYPE_ETHL2_TNL = 0x0002, + DR_STE_V1_LU_TYPE_IBL3_EXT = 0x0102, + DR_STE_V1_LU_TYPE_ETHL2_O = 0x0003, + DR_STE_V1_LU_TYPE_IBL4 = 0x0103, + DR_STE_V1_LU_TYPE_ETHL2_I = 0x0004, + DR_STE_V1_LU_TYPE_SRC_QP_GVMI = 0x0104, + DR_STE_V1_LU_TYPE_ETHL2_SRC_O = 0x0005, + DR_STE_V1_LU_TYPE_ETHL2_HEADERS_O = 0x0105, + DR_STE_V1_LU_TYPE_ETHL2_SRC_I = 0x0006, + DR_STE_V1_LU_TYPE_ETHL2_HEADERS_I = 0x0106, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x0007, + DR_STE_V1_LU_TYPE_IPV6_DES_O = 0x0107, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x0008, + DR_STE_V1_LU_TYPE_IPV6_DES_I = 0x0108, + DR_STE_V1_LU_TYPE_ETHL4_O = 0x0009, + DR_STE_V1_LU_TYPE_IPV6_SRC_O = 0x0109, + DR_STE_V1_LU_TYPE_ETHL4_I = 0x000a, + DR_STE_V1_LU_TYPE_IPV6_SRC_I = 0x010a, + DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_O = 0x000b, + DR_STE_V1_LU_TYPE_MPLS_O = 0x010b, + DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_I = 0x000c, + DR_STE_V1_LU_TYPE_MPLS_I = 0x010c, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_O = 0x000d, + DR_STE_V1_LU_TYPE_GRE = 0x010d, + DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x000e, + DR_STE_V1_LU_TYPE_GENERAL_PURPOSE = 0x010e, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_I = 0x000f, + DR_STE_V1_LU_TYPE_STEERING_REGISTERS_0 = 0x010f, + DR_STE_V1_LU_TYPE_STEERING_REGISTERS_1 = 0x0110, + DR_STE_V1_LU_TYPE_FLEX_PARSER_OK = 0x0011, + DR_STE_V1_LU_TYPE_FLEX_PARSER_0 = 0x0111, + DR_STE_V1_LU_TYPE_FLEX_PARSER_1 = 0x0112, + DR_STE_V1_LU_TYPE_ETHL4_MISC_O = 0x0113, + DR_STE_V1_LU_TYPE_ETHL4_MISC_I = 0x0114, + DR_STE_V1_LU_TYPE_INVALID = 0x00ff, + DR_STE_V1_LU_TYPE_DONT_CARE = MLX5DR_STE_LU_TYPE_DONT_CARE, +}; + +enum dr_ste_v1_header_anchors { + DR_STE_HEADER_ANCHOR_START_OUTER = 0x00, + DR_STE_HEADER_ANCHOR_1ST_VLAN = 0x02, + DR_STE_HEADER_ANCHOR_IPV6_IPV4 = 
0x07, + DR_STE_HEADER_ANCHOR_INNER_MAC = 0x13, + DR_STE_HEADER_ANCHOR_INNER_IPV6_IPV4 = 0x19, +}; + +enum dr_ste_v1_action_size { + DR_STE_ACTION_SINGLE_SZ = 4, + DR_STE_ACTION_DOUBLE_SZ = 8, + DR_STE_ACTION_TRIPLE_SZ = 12, +}; + +enum dr_ste_v1_action_insert_ptr_attr { + DR_STE_V1_ACTION_INSERT_PTR_ATTR_NONE = 0, /* Regular push header (e.g. push vlan) */ + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP = 1, /* Encapsulation / Tunneling */ + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ESP = 2, /* IPsec */ +}; + +enum dr_ste_v1_action_id { + DR_STE_V1_ACTION_ID_NOP = 0x00, + DR_STE_V1_ACTION_ID_COPY = 0x05, + DR_STE_V1_ACTION_ID_SET = 0x06, + DR_STE_V1_ACTION_ID_ADD = 0x07, + DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE = 0x08, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER = 0x09, + DR_STE_V1_ACTION_ID_INSERT_INLINE = 0x0a, + DR_STE_V1_ACTION_ID_INSERT_POINTER = 0x0b, + DR_STE_V1_ACTION_ID_FLOW_TAG = 0x0c, + DR_STE_V1_ACTION_ID_QUEUE_ID_SEL = 0x0d, + DR_STE_V1_ACTION_ID_ACCELERATED_LIST = 0x0e, + DR_STE_V1_ACTION_ID_MODIFY_LIST = 0x0f, + DR_STE_V1_ACTION_ID_ASO = 0x12, + DR_STE_V1_ACTION_ID_TRAILER = 0x13, + DR_STE_V1_ACTION_ID_COUNTER_ID = 0x14, + DR_STE_V1_ACTION_ID_MAX = 0x21, + /* use for special cases */ + DR_STE_V1_ACTION_ID_SPECIAL_ENCAP_L3 = 0x22, +}; + +enum { + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, + DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, + DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, + DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, + DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, + DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, + DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, + DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, + DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, + DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, + DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_0 = 0x6f, + DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_1 = 0x70, + DR_STE_V1_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, + DR_STE_V1_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_0 = 0x8c, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_1 = 0x8d, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_0 = 0x8e, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_1 = 0x8f, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_0 = 0x90, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_1 = 0x91, +}; + +enum dr_ste_v1_aso_ctx_type { + DR_STE_V1_ASO_CTX_TYPE_POLICERS = 0x2, +}; + +static const struct mlx5dr_ste_action_modify_field dr_ste_v1_action_modify_field_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + 
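+  /* As in every entry of this table: .hw_field picks the destination
+   * dword used by the SET/ADD/COPY rewrite actions, .start/.end give
+   * the bit range inside it, and l3_type/l4_type (where present) mark
+   * the entry as IPv4/IPv6- or TCP/UDP-specific.
+   */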
.hw_field = DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0, .start = 18, .end = 23, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_1, .start = 16, .end = 24, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_A] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_GNRL_PURPOSE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_B] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_METADATA_2_CQE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_1] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_2] = { + .hw_field = 
DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_3] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_1_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_4] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_5] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_2, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_31_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_47_32] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_CFG_HDR_0_0, .start = 0, .end = 15, + }, +}; + +static void dr_ste_v1_set_entry_type(u8 *hw_ste_p, u8 entry_type) +{ + MLX5_SET(ste_match_bwc_v1, hw_ste_p, entry_format, entry_type); +} + +static void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr) +{ + u64 index = miss_addr >> 6; + + MLX5_SET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32, index >> 26); + MLX5_SET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6, index); +} + +static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p) +{ + u64 index = + ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) | + ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32)) << 26); + + return index << 6; +} + +static void dr_ste_v1_set_byte_mask(u8 *hw_ste_p, u16 byte_mask) +{ + MLX5_SET(ste_match_bwc_v1, hw_ste_p, byte_mask, byte_mask); +} + +static u16 dr_ste_v1_get_byte_mask(u8 *hw_ste_p) +{ + return MLX5_GET(ste_match_bwc_v1, hw_ste_p, byte_mask); +} + +static void dr_ste_v1_set_lu_type(u8 *hw_ste_p, u16 lu_type) +{ + MLX5_SET(ste_match_bwc_v1, hw_ste_p, entry_format, lu_type >> 8); + MLX5_SET(ste_match_bwc_v1, hw_ste_p, match_definer_ctx_idx, lu_type & 0xFF); +} + +static void dr_ste_v1_set_next_lu_type(u8 *hw_ste_p, u16 lu_type) +{ + MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_entry_format, lu_type >> 8); + MLX5_SET(ste_match_bwc_v1, hw_ste_p, hash_definer_ctx_idx, lu_type & 0xFF); +} + +static u16 dr_ste_v1_get_next_lu_type(u8 *hw_ste_p) +{ + u8 mode = MLX5_GET(ste_match_bwc_v1, hw_ste_p, next_entry_format); + u8 index = MLX5_GET(ste_match_bwc_v1, hw_ste_p, hash_definer_ctx_idx); + + return (mode << 8 | index); +} + +static void dr_ste_v1_set_hit_gvmi(u8 *hw_ste_p, u16 gvmi) +{ + MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_table_base_63_48, gvmi); +} + +static void dr_ste_v1_set_hit_addr(u8 *hw_ste_p, u64 icm_addr, u32 ht_size) +{ + u64 index = (icm_addr >> 5) | ht_size; + + MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_table_base_39_32_size, index >> 27); + MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_table_base_31_5_size, index); +} + +static void dr_ste_v1_init(u8 *hw_ste_p, u16 lu_type, + bool is_rx, u16 gvmi) +{ + dr_ste_v1_set_lu_type(hw_ste_p, lu_type); + dr_ste_v1_set_next_lu_type(hw_ste_p, MLX5DR_STE_LU_TYPE_DONT_CARE); + + MLX5_SET(ste_match_bwc_v1, hw_ste_p, gvmi, gvmi); + MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_table_base_63_48, gvmi); + MLX5_SET(ste_match_bwc_v1, hw_ste_p, miss_address_63_48, gvmi); +} + +static void dr_ste_v1_prepare_for_postsend(u8 *hw_ste_p, + u32 ste_size) +{ + u8 *tag = hw_ste_p + DR_STE_SIZE_CTRL; 
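+ /* Layout at this point is [ CTRL | TAG | MASK ]; for a full-sized STE
+  * the equally sized tag and mask are swapped in place below before the
+  * STE is posted to the device.
+  */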
+ u8 *mask = tag + DR_STE_SIZE_TAG; + u8 tmp_tag[DR_STE_SIZE_TAG] = {}; + + if (ste_size == DR_STE_SIZE_CTRL) + return; + + WARN_ON(ste_size != DR_STE_SIZE); + + /* Backup tag */ + memcpy(tmp_tag, tag, DR_STE_SIZE_TAG); + + /* Swap mask and tag both are the same size */ + memcpy(tag, mask, DR_STE_SIZE_MASK); + memcpy(mask, tmp_tag, DR_STE_SIZE_TAG); +} + +static void dr_ste_v1_set_rx_flow_tag(u8 *s_action, u32 flow_tag) +{ + MLX5_SET(ste_single_action_flow_tag_v1, s_action, action_id, + DR_STE_V1_ACTION_ID_FLOW_TAG); + MLX5_SET(ste_single_action_flow_tag_v1, s_action, flow_tag, flow_tag); +} + +static void dr_ste_v1_set_counter_id(u8 *hw_ste_p, u32 ctr_id) +{ + MLX5_SET(ste_match_bwc_v1, hw_ste_p, counter_id, ctr_id); +} + +static void dr_ste_v1_set_reparse(u8 *hw_ste_p) +{ + MLX5_SET(ste_match_bwc_v1, hw_ste_p, reparse, 1); +} + +static void dr_ste_v1_set_encap(u8 *hw_ste_p, u8 *d_action, + u32 reformat_id, int size) +{ + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_POINTER); + /* The hardware expects here size in words (2 byte) */ + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, size, size / 2); + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, pointer, reformat_id); + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, attributes, + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP); + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_insert_hdr(u8 *hw_ste_p, u8 *d_action, + u32 reformat_id, + u8 anchor, u8 offset, + int size) +{ + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, + action_id, DR_STE_V1_ACTION_ID_INSERT_POINTER); + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, start_anchor, anchor); + + /* The hardware expects here size and offset in words (2 byte) */ + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, size, size / 2); + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, start_offset, offset / 2); + + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, pointer, reformat_id); + MLX5_SET(ste_double_action_insert_with_ptr_v1, d_action, attributes, + DR_STE_V1_ACTION_INSERT_PTR_ATTR_NONE); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_remove_hdr(u8 *hw_ste_p, u8 *s_action, + u8 anchor, u8 offset, + int size) +{ + MLX5_SET(ste_single_action_remove_header_size_v1, s_action, + action_id, DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE); + MLX5_SET(ste_single_action_remove_header_size_v1, s_action, start_anchor, anchor); + + /* The hardware expects here size and offset in words (2 byte) */ + MLX5_SET(ste_single_action_remove_header_size_v1, s_action, remove_size, size / 2); + MLX5_SET(ste_single_action_remove_header_size_v1, s_action, start_offset, offset / 2); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_push_vlan(u8 *hw_ste_p, u8 *d_action, + u32 vlan_hdr) +{ + MLX5_SET(ste_double_action_insert_with_inline_v1, d_action, + action_id, DR_STE_V1_ACTION_ID_INSERT_INLINE); + /* The hardware expects offset to vlan header in words (2 byte) */ + MLX5_SET(ste_double_action_insert_with_inline_v1, d_action, + start_offset, HDR_LEN_L2_MACS >> 1); + MLX5_SET(ste_double_action_insert_with_inline_v1, d_action, + inline_data, vlan_hdr); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_pop_vlan(u8 *hw_ste_p, u8 *s_action, u8 vlans_num) +{ + MLX5_SET(ste_single_action_remove_header_size_v1, s_action, + action_id, DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE); + MLX5_SET(ste_single_action_remove_header_size_v1, s_action, + start_anchor, 
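+ /* pop vlan is a remove-by-size anchored at the first VLAN header */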
DR_STE_HEADER_ANCHOR_1ST_VLAN); + /* The hardware expects here size in words (2 byte) */ + MLX5_SET(ste_single_action_remove_header_size_v1, s_action, + remove_size, (HDR_LEN_L2_VLAN >> 1) * vlans_num); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_encap_l3(u8 *hw_ste_p, + u8 *frst_s_action, + u8 *scnd_d_action, + u32 reformat_id, + int size) +{ + /* Remove L2 headers */ + MLX5_SET(ste_single_action_remove_header_v1, frst_s_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + MLX5_SET(ste_single_action_remove_header_v1, frst_s_action, end_anchor, + DR_STE_HEADER_ANCHOR_IPV6_IPV4); + + /* Encapsulate with given reformat ID */ + MLX5_SET(ste_double_action_insert_with_ptr_v1, scnd_d_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_POINTER); + /* The hardware expects here size in words (2 byte) */ + MLX5_SET(ste_double_action_insert_with_ptr_v1, scnd_d_action, size, size / 2); + MLX5_SET(ste_double_action_insert_with_ptr_v1, scnd_d_action, pointer, reformat_id); + MLX5_SET(ste_double_action_insert_with_ptr_v1, scnd_d_action, attributes, + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_rx_decap(u8 *hw_ste_p, u8 *s_action) +{ + MLX5_SET(ste_single_action_remove_header_v1, s_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + MLX5_SET(ste_single_action_remove_header_v1, s_action, decap, 1); + MLX5_SET(ste_single_action_remove_header_v1, s_action, vni_to_cqe, 1); + MLX5_SET(ste_single_action_remove_header_v1, s_action, end_anchor, + DR_STE_HEADER_ANCHOR_INNER_MAC); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_accelerated_rewrite_actions(u8 *hw_ste_p, + u8 *d_action, + u16 num_of_actions, + u32 re_write_index, + u32 re_write_args, + u8 *action_data) +{ + if (action_data) { + memcpy(d_action, action_data, DR_MODIFY_ACTION_SIZE); + } else { + MLX5_SET(ste_double_action_accelerated_modify_action_list, d_action, + action_id, DR_STE_V1_ACTION_ID_ACCELERATED_LIST); + MLX5_SET(ste_double_action_accelerated_modify_action_list, d_action, + modify_actions_pattern_pointer, re_write_index); + MLX5_SET(ste_double_action_accelerated_modify_action_list, d_action, + number_of_modify_actions, num_of_actions); + MLX5_SET(ste_double_action_accelerated_modify_action_list, d_action, + modify_actions_argument_pointer, re_write_args); + } + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_aso_flow_meter(u8 *d_action, + u32 object_id, + u32 offset, + u8 dest_reg_id, + u8 init_color) +{ + MLX5_SET(ste_double_action_aso_v1, d_action, action_id, + DR_STE_V1_ACTION_ID_ASO); + MLX5_SET(ste_double_action_aso_v1, d_action, aso_context_number, + object_id + (offset / MLX5DR_ASO_FLOW_METER_NUM_PER_OBJ)); + /* Convert reg_c index to HW 64bit index */ + MLX5_SET(ste_double_action_aso_v1, d_action, dest_reg_id, + (dest_reg_id - 1) / 2); + MLX5_SET(ste_double_action_aso_v1, d_action, aso_context_type, + DR_STE_V1_ASO_CTX_TYPE_POLICERS); + MLX5_SET(ste_double_action_aso_v1, d_action, flow_meter.line_id, + offset % MLX5DR_ASO_FLOW_METER_NUM_PER_OBJ); + MLX5_SET(ste_double_action_aso_v1, d_action, flow_meter.initial_color, + init_color); +} + +static void dr_ste_v1_arr_init_next_match(u8 **last_ste, + u32 *added_stes, + u16 gvmi) +{ + u8 *action; + + (*added_stes)++; + *last_ste += DR_STE_SIZE; + dr_ste_v1_init(*last_ste, MLX5DR_STE_LU_TYPE_DONT_CARE, 0, gvmi); + dr_ste_v1_set_entry_type(*last_ste, DR_STE_V1_TYPE_MATCH); + + action = MLX5_ADDR_OF(ste_mask_and_match_v1, *last_ste, 
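+ /* action area of the freshly chained MATCH STE; callers re-point their
+  * action cursor here and reset the remaining budget to
+  * DR_STE_ACTION_TRIPLE_SZ.
+  */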
action); + memset(action, 0, MLX5_FLD_SZ_BYTES(ste_mask_and_match_v1, action)); +} + +static void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) +{ + u8 *action = MLX5_ADDR_OF(ste_match_bwc_v1, last_ste, action); + u8 action_sz = DR_STE_ACTION_DOUBLE_SZ; + bool allow_modify_hdr = true; + bool allow_encap = true; + + if (action_type_set[DR_ACTION_TYP_POP_VLAN]) { + if (action_sz < DR_STE_ACTION_SINGLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, + attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, + last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_pop_vlan(last_ste, action, attr->vlans.count); + action_sz -= DR_STE_ACTION_SINGLE_SZ; + action += DR_STE_ACTION_SINGLE_SZ; + + /* Check if vlan_pop and modify_hdr on same STE is supported */ + if (!(actions_caps & DR_STE_CTX_ACTION_CAP_POP_MDFY)) + allow_modify_hdr = false; + } + + if (action_type_set[DR_ACTION_TYP_CTR]) + dr_ste_v1_set_counter_id(last_ste, attr->ctr_id); + + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) { + if (!allow_modify_hdr || action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, + attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, + last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_accelerated_rewrite_actions(last_ste, action, + attr->modify_actions, + attr->modify_index, + attr->args_index, + attr->single_modify_action); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + allow_encap = false; + } + + if (action_type_set[DR_ACTION_TYP_PUSH_VLAN]) { + int i; + + for (i = 0; i < attr->vlans.count; i++) { + if (action_sz < DR_STE_ACTION_DOUBLE_SZ || !allow_encap) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + allow_encap = true; + } + dr_ste_v1_set_push_vlan(last_ste, action, + attr->vlans.headers[i]); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + } + } + + if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L2]) { + if (!allow_encap || action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + allow_encap = true; + } + dr_ste_v1_set_encap(last_ste, action, + attr->reformat.id, + attr->reformat.size); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + } else if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]) { + u8 *d_action; + + if (action_sz < DR_STE_ACTION_TRIPLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + d_action = action + DR_STE_ACTION_SINGLE_SZ; + + dr_ste_v1_set_encap_l3(last_ste, + action, d_action, + attr->reformat.id, + attr->reformat.size); + action_sz -= DR_STE_ACTION_TRIPLE_SZ; + action += DR_STE_ACTION_TRIPLE_SZ; + } else if (action_type_set[DR_ACTION_TYP_INSERT_HDR]) { + if (!allow_encap || action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_insert_hdr(last_ste, action, + 
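+ /* insert-with-pointer at the caller-supplied anchor (param_0) and
+  * offset (param_1), unlike the plain encap above which needs neither.
+  */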
attr->reformat.id, + attr->reformat.param_0, + attr->reformat.param_1, + attr->reformat.size); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + } else if (action_type_set[DR_ACTION_TYP_REMOVE_HDR]) { + if (action_sz < DR_STE_ACTION_SINGLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_remove_hdr(last_ste, action, + attr->reformat.param_0, + attr->reformat.param_1, + attr->reformat.size); + action_sz -= DR_STE_ACTION_SINGLE_SZ; + action += DR_STE_ACTION_SINGLE_SZ; + } + + if (action_type_set[DR_ACTION_TYP_ASO_FLOW_METER]) { + if (action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_aso_flow_meter(action, + attr->aso_flow_meter.obj_id, + attr->aso_flow_meter.offset, + attr->aso_flow_meter.dest_reg_id, + attr->aso_flow_meter.init_color); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + } + + dr_ste_v1_set_hit_gvmi(last_ste, attr->hit_gvmi); + dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1); +} + +static void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) +{ + u8 *action = MLX5_ADDR_OF(ste_match_bwc_v1, last_ste, action); + u8 action_sz = DR_STE_ACTION_DOUBLE_SZ; + bool allow_modify_hdr = true; + bool allow_ctr = true; + + if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) { + dr_ste_v1_set_accelerated_rewrite_actions(last_ste, action, + attr->decap_actions, + attr->decap_index, + attr->decap_args_index, + NULL); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + allow_modify_hdr = false; + allow_ctr = false; + } else if (action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2]) { + dr_ste_v1_set_rx_decap(last_ste, action); + action_sz -= DR_STE_ACTION_SINGLE_SZ; + action += DR_STE_ACTION_SINGLE_SZ; + allow_modify_hdr = false; + allow_ctr = false; + } + + if (action_type_set[DR_ACTION_TYP_TAG]) { + if (action_sz < DR_STE_ACTION_SINGLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + allow_modify_hdr = true; + allow_ctr = true; + } + dr_ste_v1_set_rx_flow_tag(action, attr->flow_tag); + action_sz -= DR_STE_ACTION_SINGLE_SZ; + action += DR_STE_ACTION_SINGLE_SZ; + } + + if (action_type_set[DR_ACTION_TYP_POP_VLAN]) { + if (action_sz < DR_STE_ACTION_SINGLE_SZ || + !allow_modify_hdr) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + + dr_ste_v1_set_pop_vlan(last_ste, action, attr->vlans.count); + action_sz -= DR_STE_ACTION_SINGLE_SZ; + action += DR_STE_ACTION_SINGLE_SZ; + allow_ctr = false; + + /* Check if vlan_pop and modify_hdr on same STE is supported */ + if (!(actions_caps & DR_STE_CTX_ACTION_CAP_POP_MDFY)) + allow_modify_hdr = false; + } + + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) { + /* Modify header and decapsulation must use different STEs */ + if (!allow_modify_hdr || action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = 
MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + allow_modify_hdr = true; + allow_ctr = true; + } + dr_ste_v1_set_accelerated_rewrite_actions(last_ste, action, + attr->modify_actions, + attr->modify_index, + attr->args_index, + attr->single_modify_action); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + } + + if (action_type_set[DR_ACTION_TYP_PUSH_VLAN]) { + int i; + + for (i = 0; i < attr->vlans.count; i++) { + if (action_sz < DR_STE_ACTION_DOUBLE_SZ || + !allow_modify_hdr) { + dr_ste_v1_arr_init_next_match(&last_ste, + added_stes, + attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, + last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_push_vlan(last_ste, action, + attr->vlans.headers[i]); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + } + } + + if (action_type_set[DR_ACTION_TYP_CTR]) { + /* Counter action set after decap and before insert_hdr + * to exclude decaped / encaped header respectively. + */ + if (!allow_ctr) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + allow_modify_hdr = true; + } + dr_ste_v1_set_counter_id(last_ste, attr->ctr_id); + allow_ctr = false; + } + + if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L2]) { + if (action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_encap(last_ste, action, + attr->reformat.id, + attr->reformat.size); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + allow_modify_hdr = false; + } else if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]) { + u8 *d_action; + + if (action_sz < DR_STE_ACTION_TRIPLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + + d_action = action + DR_STE_ACTION_SINGLE_SZ; + + dr_ste_v1_set_encap_l3(last_ste, + action, d_action, + attr->reformat.id, + attr->reformat.size); + action_sz -= DR_STE_ACTION_TRIPLE_SZ; + allow_modify_hdr = false; + } else if (action_type_set[DR_ACTION_TYP_INSERT_HDR]) { + /* Modify header, decap, and encap must use different STEs */ + if (!allow_modify_hdr || action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_insert_hdr(last_ste, action, + attr->reformat.id, + attr->reformat.param_0, + attr->reformat.param_1, + attr->reformat.size); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + allow_modify_hdr = false; + } else if (action_type_set[DR_ACTION_TYP_REMOVE_HDR]) { + if (action_sz < DR_STE_ACTION_SINGLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match_v1, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + allow_modify_hdr = true; + allow_ctr = true; + } + dr_ste_v1_set_remove_hdr(last_ste, action, + attr->reformat.param_0, + attr->reformat.param_1, + attr->reformat.size); + action_sz -= DR_STE_ACTION_SINGLE_SZ; + action += DR_STE_ACTION_SINGLE_SZ; + } + + if (action_type_set[DR_ACTION_TYP_ASO_FLOW_METER]) { + if 
(action_sz < DR_STE_ACTION_DOUBLE_SZ) { + dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi); + action = MLX5_ADDR_OF(ste_mask_and_match, last_ste, action); + action_sz = DR_STE_ACTION_TRIPLE_SZ; + } + dr_ste_v1_set_aso_flow_meter(action, + attr->aso_flow_meter.obj_id, + attr->aso_flow_meter.offset, + attr->aso_flow_meter.dest_reg_id, + attr->aso_flow_meter.init_color); + action_sz -= DR_STE_ACTION_DOUBLE_SZ; + action += DR_STE_ACTION_DOUBLE_SZ; + } + + dr_ste_v1_set_hit_gvmi(last_ste, attr->hit_gvmi); + dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1); +} + +static void dr_ste_v1_set_action_set(u8 *d_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) +{ + shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; + MLX5_SET(ste_double_action_set_v1, d_action, action_id, DR_STE_V1_ACTION_ID_SET); + MLX5_SET(ste_double_action_set_v1, d_action, destination_dw_offset, hw_field); + MLX5_SET(ste_double_action_set_v1, d_action, destination_left_shifter, shifter); + MLX5_SET(ste_double_action_set_v1, d_action, destination_length, length); + MLX5_SET(ste_double_action_set_v1, d_action, inline_data, data); +} + +static void dr_ste_v1_set_action_add(u8 *d_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) +{ + shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; + MLX5_SET(ste_double_action_add_v1, d_action, action_id, DR_STE_V1_ACTION_ID_ADD); + MLX5_SET(ste_double_action_add_v1, d_action, destination_dw_offset, hw_field); + MLX5_SET(ste_double_action_add_v1, d_action, destination_left_shifter, shifter); + MLX5_SET(ste_double_action_add_v1, d_action, destination_length, length); + MLX5_SET(ste_double_action_add_v1, d_action, add_value, data); +} + +static void dr_ste_v1_set_action_copy(u8 *d_action, + u8 dst_hw_field, + u8 dst_shifter, + u8 dst_len, + u8 src_hw_field, + u8 src_shifter) +{ + dst_shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; + src_shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; + MLX5_SET(ste_double_action_copy_v1, d_action, action_id, DR_STE_V1_ACTION_ID_COPY); + MLX5_SET(ste_double_action_copy_v1, d_action, destination_dw_offset, dst_hw_field); + MLX5_SET(ste_double_action_copy_v1, d_action, destination_left_shifter, dst_shifter); + MLX5_SET(ste_double_action_copy_v1, d_action, destination_length, dst_len); + MLX5_SET(ste_double_action_copy_v1, d_action, source_dw_offset, src_hw_field); + MLX5_SET(ste_double_action_copy_v1, d_action, source_right_shifter, src_shifter); +} + +#define DR_STE_DECAP_L3_ACTION_NUM 8 +#define DR_STE_L2_HDR_MAX_SZ 20 + +static int dr_ste_v1_set_action_decap_l3_list(void *data, + u32 data_sz, + u8 *hw_action, + u32 hw_action_sz, + u16 *used_hw_action_num) +{ + u8 padded_data[DR_STE_L2_HDR_MAX_SZ] = {}; + void *data_ptr = padded_data; + u16 used_actions = 0; + u32 inline_data_sz; + u32 i; + + if (hw_action_sz / DR_STE_ACTION_DOUBLE_SZ < DR_STE_DECAP_L3_ACTION_NUM) + return -EINVAL; + + inline_data_sz = + MLX5_FLD_SZ_BYTES(ste_double_action_insert_with_inline_v1, inline_data); + + /* Add an alignment padding */ + memcpy(padded_data + data_sz % inline_data_sz, data, data_sz); + + /* Remove L2L3 outer headers */ + MLX5_SET(ste_single_action_remove_header_v1, hw_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + MLX5_SET(ste_single_action_remove_header_v1, hw_action, decap, 1); + MLX5_SET(ste_single_action_remove_header_v1, hw_action, vni_to_cqe, 1); + MLX5_SET(ste_single_action_remove_header_v1, hw_action, end_anchor, + DR_STE_HEADER_ANCHOR_INNER_IPV6_IPV4); + hw_action += DR_STE_ACTION_DOUBLE_SZ; + used_actions++; /* Remove 
and NOP are a single double action */ + + /* Point to the last dword of the header */ + data_ptr += (data_sz / inline_data_sz) * inline_data_sz; + + /* Add the new header using inline action 4Byte at a time, the header + * is added in reversed order to the beginning of the packet to avoid + * incorrect parsing by the HW. Since header is 14B or 18B an extra + * two bytes are padded and later removed. + */ + for (i = 0; i < data_sz / inline_data_sz + 1; i++) { + void *addr_inline; + + MLX5_SET(ste_double_action_insert_with_inline_v1, hw_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_INLINE); + /* The hardware expects here offset to words (2 bytes) */ + MLX5_SET(ste_double_action_insert_with_inline_v1, hw_action, start_offset, 0); + + /* Copy bytes one by one to avoid endianness problem */ + addr_inline = MLX5_ADDR_OF(ste_double_action_insert_with_inline_v1, + hw_action, inline_data); + memcpy(addr_inline, data_ptr - i * inline_data_sz, inline_data_sz); + hw_action += DR_STE_ACTION_DOUBLE_SZ; + used_actions++; + } + + /* Remove first 2 extra bytes */ + MLX5_SET(ste_single_action_remove_header_size_v1, hw_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE); + MLX5_SET(ste_single_action_remove_header_size_v1, hw_action, start_offset, 0); + /* The hardware expects here size in words (2 bytes) */ + MLX5_SET(ste_single_action_remove_header_size_v1, hw_action, remove_size, 1); + used_actions++; + + *used_hw_action_num = used_actions; + + return 0; +} + +static void dr_ste_v1_build_eth_l2_src_dst_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, dmac_15_0, mask, dmac_15_0); + + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, smac_47_16, mask, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, smac_15_0, mask, smac_15_0); + + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_ONES(eth_l2_src_dst_v1, bit_mask, l3_type, mask, ip_version); + + if (mask->cvlan_tag) { + MLX5_SET(ste_eth_l2_src_dst_v1, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + } else if (mask->svlan_tag) { + MLX5_SET(ste_eth_l2_src_dst_v1, bit_mask, first_vlan_qualifier, -1); + mask->svlan_tag = 0; + } +} + +static int dr_ste_v1_build_eth_l2_src_dst_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
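+ /* The same tag builder serves both inner and outer headers; sb->inner
+  * selects which half of the match value is consumed here.
+  */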
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, dmac_15_0, spec, dmac_15_0); + + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, smac_47_16, spec, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, smac_15_0, spec, smac_15_0); + + if (spec->ip_version == IP_VERSION_IPV4) { + MLX5_SET(ste_eth_l2_src_dst_v1, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + MLX5_SET(ste_eth_l2_src_dst_v1, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else if (spec->ip_version) { + return -EINVAL; + } + + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, first_priority, spec, first_prio); + + if (spec->cvlan_tag) { + MLX5_SET(ste_eth_l2_src_dst_v1, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + MLX5_SET(ste_eth_l2_src_dst_v1, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + return 0; +} + +static void dr_ste_v1_build_eth_l2_src_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_src_dst_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL2_SRC_DST, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_src_dst_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv6_dst_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_127_96, spec, dst_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_95_64, spec, dst_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_63_32, spec, dst_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_31_0, spec, dst_ip_31_0); + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv6_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv6_dst_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(IPV6_DES, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv6_dst_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv6_src_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_127_96, spec, src_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_95_64, spec, src_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_63_32, spec, src_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_31_0, spec, src_ip_31_0); + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv6_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv6_src_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(IPV6_SRC, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv6_src_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
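+ /* Note below that TCP and UDP ports are written into the same
+  * source/destination port fields of this tag.
+  */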
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, destination_address, spec, dst_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, source_address, spec, src_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, destination_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, destination_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, source_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, source_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, ecn, spec, ip_ecn); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple_v1, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv4_5_tuple_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL3_IPV4_5_TUPLE, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag; +} + +static void dr_ste_v1_build_eth_l2_src_or_dst_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? &value->inner : &value->outer; + struct mlx5dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_ONES(eth_l2_src_v1, bit_mask, l3_type, mask, ip_version); + + if (mask->svlan_tag || mask->cvlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } + + if (inner) { + if (misc_mask->inner_second_cvlan_tag || + misc_mask->inner_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, bit_mask, second_vlan_qualifier, -1); + misc_mask->inner_second_cvlan_tag = 0; + misc_mask->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, + second_vlan_id, misc_mask, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, + second_cfi, misc_mask, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, + second_priority, misc_mask, inner_second_prio); + } else { + if (misc_mask->outer_second_cvlan_tag || + misc_mask->outer_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, bit_mask, second_vlan_qualifier, -1); + misc_mask->outer_second_cvlan_tag = 0; + misc_mask->outer_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, + second_vlan_id, misc_mask, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, + second_cfi, misc_mask, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, + second_priority, misc_mask, outer_second_prio); + } +} + +static int dr_ste_v1_build_eth_l2_src_or_dst_tag(struct mlx5dr_match_param *value, + bool inner, u8 *tag) +{ + struct mlx5dr_match_spec *spec = inner ? 
&value->inner : &value->outer; + struct mlx5dr_match_misc *misc_spec = &value->misc; + + DR_STE_SET_TAG(eth_l2_src_v1, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src_v1, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_src_v1, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_src_v1, tag, l3_ethertype, spec, ethertype); + + if (spec->ip_version == IP_VERSION_IPV4) { + MLX5_SET(ste_eth_l2_src_v1, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + MLX5_SET(ste_eth_l2_src_v1, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else if (spec->ip_version) { + return -EINVAL; + } + + if (spec->cvlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (inner) { + if (misc_spec->inner_second_cvlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->inner_second_cvlan_tag = 0; + } else if (misc_spec->inner_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_vlan_id, misc_spec, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_cfi, misc_spec, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_priority, misc_spec, inner_second_prio); + } else { + if (misc_spec->outer_second_cvlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->outer_second_cvlan_tag = 0; + } else if (misc_spec->outer_second_svlan_tag) { + MLX5_SET(ste_eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->outer_second_svlan_tag = 0; + } + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_vlan_id, misc_spec, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_cfi, misc_spec, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_priority, misc_spec, outer_second_prio); + } + + return 0; +} + +static void dr_ste_v1_build_eth_l2_src_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, smac_47_16, mask, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, smac_15_0, mask, smac_15_0); + + dr_ste_v1_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int dr_ste_v1_build_eth_l2_src_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_v1, tag, smac_47_16, spec, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_v1, tag, smac_15_0, spec, smac_15_0); + + return dr_ste_v1_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void dr_ste_v1_build_eth_l2_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_src_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL2_SRC, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_src_tag; +} + +static void dr_ste_v1_build_eth_l2_dst_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst_v1, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst_v1, bit_mask, dmac_15_0, mask, dmac_15_0); + + dr_ste_v1_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int dr_ste_v1_build_eth_l2_dst_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst_v1, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst_v1, tag, dmac_15_0, spec, dmac_15_0); + + return dr_ste_v1_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void dr_ste_v1_build_eth_l2_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_dst_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL2, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_dst_tag; +} + +static void dr_ste_v1_build_eth_l2_tnl_bit_mask(struct mlx5dr_match_param *value, + bool inner, u8 *bit_mask) +{ + struct mlx5dr_match_spec *mask = inner ? &value->inner : &value->outer; + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, dmac_15_0, mask, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_ONES(eth_l2_tnl_v1, bit_mask, l3_type, mask, ip_version); + + if (misc->vxlan_vni) { + MLX5_SET(ste_eth_l2_tnl_v1, bit_mask, + l2_tunneling_network_id, (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (mask->svlan_tag || mask->cvlan_tag) { + MLX5_SET(ste_eth_l2_tnl_v1, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } +} + +static int dr_ste_v1_build_eth_l2_tnl_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, dmac_15_0, spec, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, l3_ethertype, spec, ethertype); + + if (misc->vxlan_vni) { + MLX5_SET(ste_eth_l2_tnl_v1, tag, l2_tunneling_network_id, + (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (spec->cvlan_tag) { + MLX5_SET(ste_eth_l2_tnl_v1, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + MLX5_SET(ste_eth_l2_tnl_v1, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (spec->ip_version == IP_VERSION_IPV4) { + MLX5_SET(ste_eth_l2_tnl_v1, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + MLX5_SET(ste_eth_l2_tnl_v1, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else if (spec->ip_version) { + return -EINVAL; + } + + return 0; +} + +static void dr_ste_v1_build_eth_l2_tnl_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_tnl_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_ETHL2_TNL; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_tnl_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv4_misc_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_misc_v1, tag, time_to_live, spec, ttl_hoplimit); + DR_STE_SET_TAG(eth_l3_ipv4_misc_v1, tag, ihl, spec, ipv4_ihl); + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL3_IPV4_MISC, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv4_misc_tag; +} + +static int dr_ste_v1_build_eth_ipv6_l3_l4_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l4_v1, tag, dst_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l4_v1, tag, src_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l4_v1, tag, dst_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l4_v1, tag, src_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l4_v1, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l4_v1, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l4_v1, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l4_v1, tag, ecn, spec, ip_ecn); + DR_STE_SET_TAG(eth_l4_v1, tag, ipv6_hop_limit, spec, ttl_hoplimit); + + if (sb->inner) + DR_STE_SET_TAG(eth_l4_v1, tag, flow_label, misc, inner_ipv6_flow_label); + else + DR_STE_SET_TAG(eth_l4_v1, tag, flow_label, misc, outer_ipv6_flow_label); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l4_v1, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void dr_ste_v1_build_eth_ipv6_l3_l4_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_ipv6_l3_l4_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL4, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_ipv6_l3_l4_tag; +} + +static int dr_ste_v1_build_mpls_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + if (sb->inner) + DR_STE_SET_MPLS(mpls_v1, misc2, inner, tag); + else + DR_STE_SET_MPLS(mpls_v1, misc2, outer, tag); + + return 0; +} + +static void dr_ste_v1_build_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(MPLS, sb->inner); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_mpls_tag; +} + +static int dr_ste_v1_build_tnl_gre_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(gre_v1, tag, gre_protocol, misc, gre_protocol); + DR_STE_SET_TAG(gre_v1, tag, gre_k_present, misc, gre_k_present); + DR_STE_SET_TAG(gre_v1, tag, gre_key_h, misc, gre_key_h); + DR_STE_SET_TAG(gre_v1, tag, gre_key_l, misc, gre_key_l); + + DR_STE_SET_TAG(gre_v1, tag, gre_c_present, misc, gre_c_present); + DR_STE_SET_TAG(gre_v1, tag, gre_s_present, misc, gre_s_present); + + return 0; +} + +static void dr_ste_v1_build_tnl_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_tnl_gre_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_GRE; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_gre_tag; +} + +static int dr_ste_v1_build_tnl_mpls_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc2)) { + DR_STE_SET_TAG(mpls_v1, tag, mpls0_label, + misc2, outer_first_mpls_over_gre_label); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_exp, + misc2, outer_first_mpls_over_gre_exp); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_s_bos, + misc2, outer_first_mpls_over_gre_s_bos); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_ttl, + misc2, outer_first_mpls_over_gre_ttl); + } else { + DR_STE_SET_TAG(mpls_v1, tag, mpls0_label, + misc2, outer_first_mpls_over_udp_label); + + DR_STE_SET_TAG(mpls_v1, tag, 
mpls0_exp, + misc2, outer_first_mpls_over_udp_exp); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_s_bos, + misc2, outer_first_mpls_over_udp_s_bos); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_ttl, + misc2, outer_first_mpls_over_udp_ttl); + } + + return 0; +} + +static void dr_ste_v1_build_tnl_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_tnl_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_MPLS_I; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_mpls_tag; +} + +static int dr_ste_v1_build_tnl_mpls_over_udp_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + u8 *parser_ptr; + u8 parser_id; + u32 mpls_hdr; + + mpls_hdr = misc2->outer_first_mpls_over_udp_label << HDR_MPLS_OFFSET_LABEL; + misc2->outer_first_mpls_over_udp_label = 0; + mpls_hdr |= misc2->outer_first_mpls_over_udp_exp << HDR_MPLS_OFFSET_EXP; + misc2->outer_first_mpls_over_udp_exp = 0; + mpls_hdr |= misc2->outer_first_mpls_over_udp_s_bos << HDR_MPLS_OFFSET_S_BOS; + misc2->outer_first_mpls_over_udp_s_bos = 0; + mpls_hdr |= misc2->outer_first_mpls_over_udp_ttl << HDR_MPLS_OFFSET_TTL; + misc2->outer_first_mpls_over_udp_ttl = 0; + + parser_id = sb->caps->flex_parser_id_mpls_over_udp; + parser_ptr = dr_ste_calc_flex_parser_offset(tag, parser_id); + *(__be32 *)parser_ptr = cpu_to_be32(mpls_hdr); + + return 0; +} + +static void dr_ste_v1_build_tnl_mpls_over_udp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_tnl_mpls_over_udp_tag(mask, sb, sb->bit_mask); + + /* STEs with lookup type FLEX_PARSER_{0/1} includes + * flex parsers_{0-3}/{4-7} respectively. + */ + sb->lu_type = sb->caps->flex_parser_id_mpls_over_udp > DR_STE_MAX_FLEX_0_ID ? + DR_STE_V1_LU_TYPE_FLEX_PARSER_1 : + DR_STE_V1_LU_TYPE_FLEX_PARSER_0; + + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_mpls_over_udp_tag; +} + +static int dr_ste_v1_build_tnl_mpls_over_gre_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + u8 *parser_ptr; + u8 parser_id; + u32 mpls_hdr; + + mpls_hdr = misc2->outer_first_mpls_over_gre_label << HDR_MPLS_OFFSET_LABEL; + misc2->outer_first_mpls_over_gre_label = 0; + mpls_hdr |= misc2->outer_first_mpls_over_gre_exp << HDR_MPLS_OFFSET_EXP; + misc2->outer_first_mpls_over_gre_exp = 0; + mpls_hdr |= misc2->outer_first_mpls_over_gre_s_bos << HDR_MPLS_OFFSET_S_BOS; + misc2->outer_first_mpls_over_gre_s_bos = 0; + mpls_hdr |= misc2->outer_first_mpls_over_gre_ttl << HDR_MPLS_OFFSET_TTL; + misc2->outer_first_mpls_over_gre_ttl = 0; + + parser_id = sb->caps->flex_parser_id_mpls_over_gre; + parser_ptr = dr_ste_calc_flex_parser_offset(tag, parser_id); + *(__be32 *)parser_ptr = cpu_to_be32(mpls_hdr); + + return 0; +} + +static void dr_ste_v1_build_tnl_mpls_over_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_tnl_mpls_over_gre_tag(mask, sb, sb->bit_mask); + + /* STEs with lookup type FLEX_PARSER_{0/1} includes + * flex parsers_{0-3}/{4-7} respectively. + */ + sb->lu_type = sb->caps->flex_parser_id_mpls_over_gre > DR_STE_MAX_FLEX_0_ID ? 
+ DR_STE_V1_LU_TYPE_FLEX_PARSER_1 : + DR_STE_V1_LU_TYPE_FLEX_PARSER_0; + + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_mpls_over_gre_tag; +} + +static int dr_ste_v1_build_icmp_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + bool is_ipv4 = DR_MASK_IS_ICMPV4_SET(misc3); + u32 *icmp_header_data; + u8 *icmp_type; + u8 *icmp_code; + + if (is_ipv4) { + icmp_header_data = &misc3->icmpv4_header_data; + icmp_type = &misc3->icmpv4_type; + icmp_code = &misc3->icmpv4_code; + } else { + icmp_header_data = &misc3->icmpv6_header_data; + icmp_type = &misc3->icmpv6_type; + icmp_code = &misc3->icmpv6_code; + } + + MLX5_SET(ste_icmp_v1, tag, icmp_header_data, *icmp_header_data); + MLX5_SET(ste_icmp_v1, tag, icmp_type, *icmp_type); + MLX5_SET(ste_icmp_v1, tag, icmp_code, *icmp_code); + + *icmp_header_data = 0; + *icmp_type = 0; + *icmp_code = 0; + + return 0; +} + +static void dr_ste_v1_build_icmp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_icmp_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_ETHL4_MISC_O; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_icmp_tag; +} + +static int dr_ste_v1_build_general_purpose_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(general_purpose, tag, general_purpose_lookup_field, + misc2, metadata_reg_a); + + return 0; +} + +static void dr_ste_v1_build_general_purpose_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_general_purpose_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_GENERAL_PURPOSE; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_general_purpose_tag; +} + +static int dr_ste_v1_build_eth_l4_misc_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + + if (sb->inner) { + DR_STE_SET_TAG(eth_l4_misc_v1, tag, seq_num, misc3, inner_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc_v1, tag, ack_num, misc3, inner_tcp_ack_num); + } else { + DR_STE_SET_TAG(eth_l4_misc_v1, tag, seq_num, misc3, outer_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc_v1, tag, ack_num, misc3, outer_tcp_ack_num); + } + + return 0; +} + +static void dr_ste_v1_build_eth_l4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_eth_l4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_ETHL4_MISC_O; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l4_misc_tag; +} + +static int +dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_flags, misc3, + outer_vxlan_gpe_flags); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_next_protocol, misc3, + outer_vxlan_gpe_next_protocol); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_vni, misc3, + outer_vxlan_gpe_vni); + + return 0; +} + +static void +dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param 
*mask) +{ + dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag; +} + +static int +dr_ste_v1_build_flex_parser_tnl_geneve_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_protocol_type, misc, geneve_protocol_type); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_oam, misc, geneve_oam); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_opt_len, misc, geneve_opt_len); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_vni, misc, geneve_vni); + + return 0; +} + +static void +dr_ste_v1_build_flex_parser_tnl_geneve_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_flex_parser_tnl_geneve_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_geneve_tag; +} + +static int dr_ste_v1_build_tnl_header_0_1_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + uint8_t *tag) +{ + struct mlx5dr_match_misc5 *misc5 = &value->misc5; + + DR_STE_SET_TAG(tunnel_header, tag, tunnel_header_0, misc5, tunnel_header_0); + DR_STE_SET_TAG(tunnel_header, tag, tunnel_header_1, misc5, tunnel_header_1); + + return 0; +} + +static void dr_ste_v1_build_tnl_header_0_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; + dr_ste_v1_build_tnl_header_0_1_tag(mask, sb, sb->bit_mask); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_header_0_1_tag; +} + +static int dr_ste_v1_build_register_0_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_0, tag, register_0_h, misc2, metadata_reg_c_0); + DR_STE_SET_TAG(register_0, tag, register_0_l, misc2, metadata_reg_c_1); + DR_STE_SET_TAG(register_0, tag, register_1_h, misc2, metadata_reg_c_2); + DR_STE_SET_TAG(register_0, tag, register_1_l, misc2, metadata_reg_c_3); + + return 0; +} + +static void dr_ste_v1_build_register_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_register_0_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_STEERING_REGISTERS_0; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_register_0_tag; +} + +static int dr_ste_v1_build_register_1_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_1, tag, register_2_h, misc2, metadata_reg_c_4); + DR_STE_SET_TAG(register_1, tag, register_2_l, misc2, metadata_reg_c_5); + DR_STE_SET_TAG(register_1, tag, register_3_h, misc2, metadata_reg_c_6); + DR_STE_SET_TAG(register_1, tag, register_3_l, misc2, metadata_reg_c_7); + + return 0; +} + +static void dr_ste_v1_build_register_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_register_1_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_STEERING_REGISTERS_1; + sb->byte_mask 
= mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_register_1_tag; +} + +static void dr_ste_v1_build_src_gvmi_qpn_bit_mask(struct mlx5dr_match_param *value, + u8 *bit_mask) +{ + struct mlx5dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_gvmi, misc_mask, source_port); + DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_qp, misc_mask, source_sqn); + misc_mask->source_eswitch_owner_vhca_id = 0; +} + +static int dr_ste_v1_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc *misc = &value->misc; + struct mlx5dr_cmd_vport_cap *vport_cap; + struct mlx5dr_domain *dmn = sb->dmn; + struct mlx5dr_domain *vport_dmn; + u8 *bit_mask = sb->bit_mask; + + DR_STE_SET_TAG(src_gvmi_qp_v1, tag, source_qp, misc, source_sqn); + + if (sb->vhca_id_valid) { + /* Find port GVMI based on the eswitch_owner_vhca_id */ + if (misc->source_eswitch_owner_vhca_id == dmn->info.caps.gvmi) + vport_dmn = dmn; + else if (dmn->peer_dmn && (misc->source_eswitch_owner_vhca_id == + dmn->peer_dmn->info.caps.gvmi)) + vport_dmn = dmn->peer_dmn; + else + return -EINVAL; + + misc->source_eswitch_owner_vhca_id = 0; + } else { + vport_dmn = dmn; + } + + if (!MLX5_GET(ste_src_gvmi_qp_v1, bit_mask, source_gvmi)) + return 0; + + vport_cap = mlx5dr_domain_get_vport_cap(vport_dmn, misc->source_port); + if (!vport_cap) { + mlx5dr_err(dmn, "Vport 0x%x is disabled or invalid\n", + misc->source_port); + return -EINVAL; + } + + if (vport_cap->vport_gvmi) + MLX5_SET(ste_src_gvmi_qp_v1, tag, source_gvmi, vport_cap->vport_gvmi); + + misc->source_port = 0; + return 0; +} + +static void dr_ste_v1_build_src_gvmi_qpn_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_SRC_QP_GVMI; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_src_gvmi_qpn_tag; +} + +static void dr_ste_v1_set_flex_parser(u32 *misc4_field_id, + u32 *misc4_field_value, + bool *parser_is_used, + u8 *tag) +{ + u32 id = *misc4_field_id; + u8 *parser_ptr; + + if (id >= DR_NUM_OF_FLEX_PARSERS || parser_is_used[id]) + return; + + parser_is_used[id] = true; + parser_ptr = dr_ste_calc_flex_parser_offset(tag, id); + + *(__be32 *)parser_ptr = cpu_to_be32(*misc4_field_value); + *misc4_field_id = 0; + *misc4_field_value = 0; +} + +static int dr_ste_v1_build_felx_parser_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc4 *misc_4_mask = &value->misc4; + bool parser_is_used[DR_NUM_OF_FLEX_PARSERS] = {}; + + dr_ste_v1_set_flex_parser(&misc_4_mask->prog_sample_field_id_0, + &misc_4_mask->prog_sample_field_value_0, + parser_is_used, tag); + + dr_ste_v1_set_flex_parser(&misc_4_mask->prog_sample_field_id_1, + &misc_4_mask->prog_sample_field_value_1, + parser_is_used, tag); + + dr_ste_v1_set_flex_parser(&misc_4_mask->prog_sample_field_id_2, + &misc_4_mask->prog_sample_field_value_2, + parser_is_used, tag); + + dr_ste_v1_set_flex_parser(&misc_4_mask->prog_sample_field_id_3, + &misc_4_mask->prog_sample_field_value_3, + parser_is_used, tag); + + return 0; +} + +static void dr_ste_v1_build_flex_parser_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_0; + dr_ste_v1_build_felx_parser_tag(mask, sb, sb->bit_mask); + sb->byte_mask = 
mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_felx_parser_tag; +} + +static void dr_ste_v1_build_flex_parser_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_1; + dr_ste_v1_build_felx_parser_tag(mask, sb, sb->bit_mask); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_felx_parser_tag; +} + +static int +dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + u8 parser_id = sb->caps->flex_parser_id_geneve_tlv_option_0; + u8 *parser_ptr = dr_ste_calc_flex_parser_offset(tag, parser_id); + + MLX5_SET(ste_flex_parser_0, parser_ptr, flex_parser_3, + misc3->geneve_tlv_option_0_data); + misc3->geneve_tlv_option_0_data = 0; + + return 0; +} + +static void +dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_tag(mask, sb, sb->bit_mask); + + /* STEs with lookup type FLEX_PARSER_{0/1} includes + * flex parsers_{0-3}/{4-7} respectively. + */ + sb->lu_type = sb->caps->flex_parser_id_geneve_tlv_option_0 > 3 ? + DR_STE_V1_LU_TYPE_FLEX_PARSER_1 : + DR_STE_V1_LU_TYPE_FLEX_PARSER_0; + + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_tag; +} + +static int +dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + uint8_t *tag) +{ + u8 parser_id = sb->caps->flex_parser_id_geneve_tlv_option_0; + struct mlx5dr_match_misc *misc = &value->misc; + + if (misc->geneve_tlv_option_0_exist) { + MLX5_SET(ste_flex_parser_ok, tag, flex_parsers_ok, 1 << parser_id); + misc->geneve_tlv_option_0_exist = 0; + } + + return 0; +} + +static void +dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_OK; + dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_tag(mask, sb, sb->bit_mask); + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_tag; +} + +static int dr_ste_v1_build_flex_parser_tnl_gtpu_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + struct mlx5dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, gtpu_msg_flags, misc3, gtpu_msg_flags); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, gtpu_msg_type, misc3, gtpu_msg_type); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, gtpu_teid, misc3, gtpu_teid); + + return 0; +} + +static void dr_ste_v1_build_flex_parser_tnl_gtpu_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_flex_parser_tnl_gtpu_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_gtpu_tag; +} + +static int +dr_ste_v1_build_tnl_gtpu_flex_parser_0_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + if (dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_0, sb->caps, &value->misc3); + if 
(dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_teid)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_teid, sb->caps, &value->misc3); + if (dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_dw_2)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_2, sb->caps, &value->misc3); + if (dr_is_flex_parser_0_id(sb->caps->flex_parser_id_gtpu_first_ext_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_first_ext_dw_0, sb->caps, &value->misc3); + return 0; +} + +static void +dr_ste_v1_build_tnl_gtpu_flex_parser_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_tnl_gtpu_flex_parser_0_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_0; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_gtpu_flex_parser_0_tag; +} + +static int +dr_ste_v1_build_tnl_gtpu_flex_parser_1_tag(struct mlx5dr_match_param *value, + struct mlx5dr_ste_build *sb, + u8 *tag) +{ + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_0, sb->caps, &value->misc3); + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_teid)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_teid, sb->caps, &value->misc3); + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_dw_2)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_dw_2, sb->caps, &value->misc3); + if (dr_is_flex_parser_1_id(sb->caps->flex_parser_id_gtpu_first_ext_dw_0)) + DR_STE_SET_FLEX_PARSER_FIELD(tag, gtpu_first_ext_dw_0, sb->caps, &value->misc3); + return 0; +} + +static void +dr_ste_v1_build_tnl_gtpu_flex_parser_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) +{ + dr_ste_v1_build_tnl_gtpu_flex_parser_1_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_1; + sb->byte_mask = mlx5dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_gtpu_flex_parser_1_tag; +} + +/* Cache structure and functions */ +static bool dr_ste_v1_compare_modify_hdr(size_t cur_num_of_actions, + __be64 cur_hw_actions[], + size_t num_of_actions, + __be64 hw_actions[]) +{ + int i; + + if (cur_num_of_actions != num_of_actions) + return false; + + for (i = 0; i < num_of_actions; i++) { + u8 action_id = + MLX5_GET(ste_double_action_add, &hw_actions[i], action_id); + + if (action_id == DR_STE_V1_ACTION_ID_COPY) { + if (hw_actions[i] != cur_hw_actions[i]) + return false; + } else { + if ((__be32)hw_actions[i] != + (__be32)cur_hw_actions[i]) + return false; + } + } + + return true; +} + +static bool dr_ste_v1_compare_reformat_hdr(size_t cur_num_of_actions, + u8 cur_hw_action[], + size_t num_of_actions, + u8 hw_action[]) +{ + /* The only check we have is according to the number + * of actions, and this was already done prior this call. 
+ */ + return true; +} + +static bool dr_ste_v1_compare_pattern(enum mlx5dr_action_type cur_type, + size_t cur_num_of_actions, + u8 cur_hw_action[], + enum mlx5dr_action_type type, + size_t num_of_actions, + u8 hw_action[]) +{ + if ((cur_num_of_actions != num_of_actions) || (cur_type != type)) + return false; + + switch (type) { + case DR_ACTION_TYP_MODIFY_HDR: + return dr_ste_v1_compare_modify_hdr(cur_num_of_actions, + (__be64 *)cur_hw_action, + num_of_actions, + (__be64 *)hw_action); + case DR_ACTION_TYP_TNL_L3_TO_L2: + return dr_ste_v1_compare_reformat_hdr(cur_num_of_actions, + cur_hw_action, + num_of_actions, + hw_action); + default: + WARN(true, "Illegal action type - %d", type); + return false; + } +} + +struct dr_cached_pattern { + enum mlx5dr_action_type type; + struct { + struct mlx5dr_icm_chunk *chunk; + u8 *data; + u16 num_of_actions; + u32 index; + } rewrite_data; + refcount_t refcount; + struct list_head list; +}; + +static struct dr_cached_pattern * +dr_ste_v1_find_cached_pattern(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action) +{ + struct dr_cached_pattern *tmp_cached_action; + struct dr_cached_pattern *cached_action; + + list_for_each_entry_safe(cached_action, tmp_cached_action, + &dmn->modify_hdr_list, list) { + if (dr_ste_v1_compare_pattern(cached_action->type, + cached_action->rewrite_data.num_of_actions, + cached_action->rewrite_data.data, + action->action_type, + action->rewrite->num_of_actions, + action->rewrite->data)) + return cached_action; + } + + return NULL; +} + +static struct dr_cached_pattern * +dr_ste_v1_get_cached_pattern(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action) +{ + struct dr_cached_pattern *cached_action; + + cached_action = dr_ste_v1_find_cached_pattern(dmn, action); + if (cached_action) { + /* LRU: move it to be first in the list */ + list_del(&cached_action->list); + list_add_tail(&cached_action->list, &dmn->modify_hdr_list); + refcount_inc(&cached_action->refcount); + } + + return cached_action; +} + +static void dr_ste_v1_put_cached_pattern(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action) +{ + struct dr_cached_pattern *cached_pattern; + + cached_pattern = dr_ste_v1_find_cached_pattern(dmn, action); + if (!cached_pattern) { + WARN(true, "Cached pattern not found"); + return; + } + + if (!refcount_dec_and_test(&cached_pattern->refcount)) + return; + + list_del(&cached_pattern->list); + + kfree(action->rewrite->data); + mlx5dr_icm_free_chunk(action->rewrite->chunk); + + kfree(cached_pattern); +} + +static int dr_ste_v1_add_pattern_to_cache(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action) +{ + struct dr_cached_pattern *cached_pattern; + + cached_pattern = kzalloc(sizeof(*cached_pattern), GFP_KERNEL); + if (!cached_pattern) + return -ENOMEM; + + cached_pattern->type = action->action_type; + cached_pattern->rewrite_data.chunk = action->rewrite->chunk; + cached_pattern->rewrite_data.index = action->rewrite->index; + cached_pattern->rewrite_data.num_of_actions = + action->rewrite->num_of_actions; + cached_pattern->rewrite_data.data = action->rewrite->data; + + list_add_tail(&cached_pattern->list, &dmn->modify_hdr_list); + + refcount_set(&cached_pattern->refcount, 1); + + return 0; +} + +static enum mlx5dr_arg_chunk_size +dr_get_arg_size(struct mlx5dr_action *action) +{ + if (action->rewrite->num_of_actions <= 8) + return DR_ARG_CHUNK_SIZE_1; + if (action->rewrite->num_of_actions <= 16) + return DR_ARG_CHUNK_SIZE_2; + if (action->rewrite->num_of_actions <= 32) + return DR_ARG_CHUNK_SIZE_3; + + return 
DR_ARG_CHUNK_SIZE_MAX; +} + +static int dr_ste_v1_alloc_modify_hdr_arg(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action) +{ + int ret; + + action->rewrite->arg = mlx5dr_arg_get_obj(dmn, dr_get_arg_size(action)); + if (!action->rewrite->arg) { + mlx5dr_err(dmn, "Failed allocating args object for modify header\n"); + return -ENOMEM; + } + + /* write it into the hw */ + ret = mlx5dr_send_postsend_args(dmn, action); + if (ret) { + mlx5dr_err(dmn, "Failed writing args object\n"); + goto put_obj; + } + + return 0; + +put_obj: + mlx5dr_arg_put_obj(dmn, action->rewrite->arg); + return ret; +} + +static int dr_ste_v1_alloc_modify_hdr_chunk(struct mlx5dr_action *action, + u32 chunck_size) +{ + struct mlx5dr_domain *dmn = action->rewrite->dmn; + struct dr_cached_pattern *cached_pattern; + int ret; + + if (!dmn->modify_header_ptrn_icm_pool) + return -ENOTSUPP; + + ret = dr_ste_v1_alloc_modify_hdr_arg(dmn, action); + if (ret) { + mlx5dr_err(dmn, "Failed allocating args for modify header\n"); + return -ENOMEM; + } + + mutex_lock(&dmn->modify_hdr_mutex); + + cached_pattern = dr_ste_v1_get_cached_pattern(dmn, action); + if (cached_pattern) { + /* no use the current one, use the cached */ + kfree(action->rewrite->data); + + action->rewrite->chunk = cached_pattern->rewrite_data.chunk; + action->rewrite->index = cached_pattern->rewrite_data.index; + action->rewrite->data = cached_pattern->rewrite_data.data; + + } else { + u64 *hw_actions; + int i; + + action->rewrite->chunk = + mlx5dr_icm_alloc_chunk(dmn->modify_header_ptrn_icm_pool, + chunck_size); + if (!action->rewrite->chunk) { + ret = -ENOMEM; + goto put_arg; + } + + hw_actions = (u64 *)action->rewrite->data; + + /* Here we mask the pattern data to create a valid pattern + * since we do an OR operation between the arg and pattern + * This should be fixed in the future on to keep the data valid */ + for (i = 0; i < action->rewrite->num_of_actions; i++) { + u8 action_id = MLX5_GET(ste_double_action_add, &hw_actions[i], action_id); + + if (action_id == DR_STE_V1_ACTION_ID_SET || + action_id == DR_STE_V1_ACTION_ID_ADD || + action_id == DR_STE_V1_ACTION_ID_INSERT_INLINE) + MLX5_SET(ste_double_action_set, &hw_actions[i], inline_data, 0); + } + + action->rewrite->index = (mlx5dr_icm_pool_get_chunk_icm_addr(action->rewrite->chunk) - + dmn->info.caps.hdr_modify_pattern_icm_addr) / + MLX5DR_ACTION_CACHE_LINE_SIZE; + + ret = mlx5dr_send_postsend_action(dmn, action); + if (ret) + goto clean_chunk; + + ret = dr_ste_v1_add_pattern_to_cache(dmn, action); + if (ret) { + mlx5dr_err(dmn, "Failed adding to cache\n"); + goto clean_chunk; + } + } + + mutex_unlock(&dmn->modify_hdr_mutex); + + return 0; + +clean_chunk: + mlx5dr_icm_free_chunk(action->rewrite->chunk); +put_arg: + mutex_unlock(&dmn->modify_hdr_mutex); + mlx5dr_arg_put_obj(action->rewrite->dmn, action->rewrite->arg); + + return ret; +} + +static void dr_ste_v1_dealloc_modify_hdr_chunk(struct mlx5dr_action *action) +{ + struct mlx5dr_domain *dmn = action->rewrite->dmn; + + mutex_lock(&dmn->modify_hdr_mutex); + dr_ste_v1_put_cached_pattern(action->rewrite->dmn, action); + mutex_unlock(&dmn->modify_hdr_mutex); + + mlx5dr_arg_put_obj(action->rewrite->dmn, action->rewrite->arg); +} + +static struct mlx5dr_ste_ctx ste_ctx_v1 = { + /* Builders */ + .build_eth_l2_src_dst_init = &dr_ste_v1_build_eth_l2_src_dst_init, + .build_eth_l3_ipv6_src_init = &dr_ste_v1_build_eth_l3_ipv6_src_init, + .build_eth_l3_ipv6_dst_init = &dr_ste_v1_build_eth_l3_ipv6_dst_init, + .build_eth_l3_ipv4_5_tuple_init = 
&dr_ste_v1_build_eth_l3_ipv4_5_tuple_init, + .build_eth_l2_src_init = &dr_ste_v1_build_eth_l2_src_init, + .build_eth_l2_dst_init = &dr_ste_v1_build_eth_l2_dst_init, + .build_eth_l2_tnl_init = &dr_ste_v1_build_eth_l2_tnl_init, + .build_eth_l3_ipv4_misc_init = &dr_ste_v1_build_eth_l3_ipv4_misc_init, + .build_eth_ipv6_l3_l4_init = &dr_ste_v1_build_eth_ipv6_l3_l4_init, + .build_mpls_init = &dr_ste_v1_build_mpls_init, + .build_tnl_gre_init = &dr_ste_v1_build_tnl_gre_init, + .build_tnl_mpls_init = &dr_ste_v1_build_tnl_mpls_init, + .build_tnl_mpls_over_udp_init = &dr_ste_v1_build_tnl_mpls_over_udp_init, + .build_tnl_mpls_over_gre_init = &dr_ste_v1_build_tnl_mpls_over_gre_init, + .build_icmp_init = &dr_ste_v1_build_icmp_init, + .build_general_purpose_init = &dr_ste_v1_build_general_purpose_init, + .build_eth_l4_misc_init = &dr_ste_v1_build_eth_l4_misc_init, + .build_tnl_vxlan_gpe_init = &dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init, + .build_tnl_geneve_init = &dr_ste_v1_build_flex_parser_tnl_geneve_init, + .build_tnl_geneve_tlv_opt_init = &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_init, + .build_tnl_geneve_tlv_opt_exist_init = &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_init, + .build_register_0_init = &dr_ste_v1_build_register_0_init, + .build_register_1_init = &dr_ste_v1_build_register_1_init, + .build_src_gvmi_qpn_init = &dr_ste_v1_build_src_gvmi_qpn_init, + .build_flex_parser_0_init = &dr_ste_v1_build_flex_parser_0_init, + .build_flex_parser_1_init = &dr_ste_v1_build_flex_parser_1_init, + .build_tnl_gtpu_init = &dr_ste_v1_build_flex_parser_tnl_gtpu_init, + .build_tnl_header_0_1_init = &dr_ste_v1_build_tnl_header_0_1_init, + .build_tnl_gtpu_flex_parser_0_init = &dr_ste_v1_build_tnl_gtpu_flex_parser_0_init, + .build_tnl_gtpu_flex_parser_1_init = &dr_ste_v1_build_tnl_gtpu_flex_parser_1_init, + + /* Getters and Setters */ + .ste_init = &dr_ste_v1_init, + .set_next_lu_type = &dr_ste_v1_set_next_lu_type, + .get_next_lu_type = &dr_ste_v1_get_next_lu_type, + .set_miss_addr = &dr_ste_v1_set_miss_addr, + .get_miss_addr = &dr_ste_v1_get_miss_addr, + .set_hit_addr = &dr_ste_v1_set_hit_addr, + .set_byte_mask = &dr_ste_v1_set_byte_mask, + .get_byte_mask = &dr_ste_v1_get_byte_mask, + /* Actions */ + .actions_caps = DR_STE_CTX_ACTION_CAP_TX_POP | + DR_STE_CTX_ACTION_CAP_RX_PUSH | + DR_STE_CTX_ACTION_CAP_RX_ENCAP | + DR_STE_CTX_ACTION_CAP_POP_MDFY, + .set_actions_rx = &dr_ste_v1_set_actions_rx, + .set_actions_tx = &dr_ste_v1_set_actions_tx, + .modify_field_arr_sz = ARRAY_SIZE(dr_ste_v1_action_modify_field_arr), + .modify_field_arr = dr_ste_v1_action_modify_field_arr, + .set_action_set = &dr_ste_v1_set_action_set, + .set_action_add = &dr_ste_v1_set_action_add, + .set_action_copy = &dr_ste_v1_set_action_copy, + .set_action_decap_l3_list = &dr_ste_v1_set_action_decap_l3_list, + .alloc_modify_hdr_chunk = &dr_ste_v1_alloc_modify_hdr_chunk, + .dealloc_modify_hdr_chunk = &dr_ste_v1_dealloc_modify_hdr_chunk, + /* Send */ + .prepare_for_postsend = &dr_ste_v1_prepare_for_postsend, +}; + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v1(void) +{ + return &ste_ctx_v1; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c new file mode 100644 index 0000000..dd4af8c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022 NVIDIA 
CORPORATION. All rights reserved. */ + +#include +#include "dr_ste.h" + +enum { + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, + DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, + DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, + DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, + DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, + DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, + DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, + DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, + DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, + DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, + DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0 = 0x6f, + DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1 = 0x70, + DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, + DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0 = 0x90, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1 = 0x91, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0 = 0x92, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1 = 0x93, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0 = 0x94, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1 = 0x95, +}; + +static const struct mlx5dr_ste_action_modify_field dr_ste_v2_action_modify_field_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 18, .end = 23, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1, .start = 16, .end = 24, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + 
[MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_A] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_B] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_1] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_2] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_3] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_5] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_47_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0, .start = 0, .end = 15, + }, +}; + +static struct mlx5dr_ste_ctx ste_ctx_v2; + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v2(void) +{ + if (!ste_ctx_v2.actions_caps) { + /* struct initialization required only for 
the first time */ + ste_ctx_v2 = *mlx5dr_ste_get_ctx_v1(); + ste_ctx_v2.actions_caps = DR_STE_CTX_ACTION_CAP_TX_POP | + DR_STE_CTX_ACTION_CAP_RX_PUSH | + DR_STE_CTX_ACTION_CAP_RX_ENCAP; + ste_ctx_v2.modify_field_arr = dr_ste_v2_action_modify_field_arr; + ste_ctx_v2.modify_field_arr_sz = ARRAY_SIZE(dr_ste_v2_action_modify_field_arr); + } + + return &ste_ctx_v2; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c new file mode 100644 index 0000000..eb81759 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "dr_types.h" + +static int dr_table_set_miss_action_nic(struct mlx5dr_domain *dmn, + struct mlx5dr_table_rx_tx *nic_tbl, + struct mlx5dr_action *action) +{ + struct mlx5dr_matcher_rx_tx *last_nic_matcher = NULL; + struct mlx5dr_htbl_connect_info info; + struct mlx5dr_ste_htbl *last_htbl; + struct mlx5dr_icm_chunk *chunk; + int ret; + + if (!list_empty(&nic_tbl->nic_matcher_list)) + last_nic_matcher = list_last_entry(&nic_tbl->nic_matcher_list, + struct mlx5dr_matcher_rx_tx, + list_node); + + if (last_nic_matcher) + last_htbl = last_nic_matcher->e_anchor; + else + last_htbl = nic_tbl->s_anchor; + + if (action) { + chunk = nic_tbl->nic_dmn->type == DR_DOMAIN_NIC_TYPE_RX ? + action->dest_tbl->tbl->rx.s_anchor->chunk : + action->dest_tbl->tbl->tx.s_anchor->chunk; + nic_tbl->default_icm_addr = mlx5dr_icm_pool_get_chunk_icm_addr(chunk); + } else { + nic_tbl->default_icm_addr = nic_tbl->nic_dmn->default_icm_addr; + } + + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_tbl->default_icm_addr; + + ret = mlx5dr_ste_htbl_init_and_postsend(dmn, nic_tbl->nic_dmn, + last_htbl, &info, true); + if (ret) + mlx5dr_dbg(dmn, "Failed to set NIC RX/TX miss action, ret %d\n", ret); + + return ret; +} + +int mlx5dr_table_set_miss_action(struct mlx5dr_table *tbl, + struct mlx5dr_action *action) +{ + int ret; + + if (action && action->action_type != DR_ACTION_TYP_FT) + return -EOPNOTSUPP; + + mlx5dr_domain_lock(tbl->dmn); + + if (tbl->dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX || + tbl->dmn->type == MLX5DR_DOMAIN_TYPE_FDB) { + ret = dr_table_set_miss_action_nic(tbl->dmn, &tbl->rx, action); + if (ret) + goto out; + } + + if (tbl->dmn->type == MLX5DR_DOMAIN_TYPE_NIC_TX || + tbl->dmn->type == MLX5DR_DOMAIN_TYPE_FDB) { + ret = dr_table_set_miss_action_nic(tbl->dmn, &tbl->tx, action); + if (ret) + goto out; + } + + /* Release old action */ + if (tbl->miss_action) + refcount_dec(&tbl->miss_action->refcount); + + /* Set new miss action */ + tbl->miss_action = action; + if (tbl->miss_action) + refcount_inc(&action->refcount); + +out: + mlx5dr_domain_unlock(tbl->dmn); + return ret; +} + +static void dr_table_uninit_nic(struct mlx5dr_table_rx_tx *nic_tbl) +{ + mlx5dr_htbl_put(nic_tbl->s_anchor); +} + +static void dr_table_uninit_fdb(struct mlx5dr_table *tbl) +{ + dr_table_uninit_nic(&tbl->rx); + dr_table_uninit_nic(&tbl->tx); +} + +static void dr_table_uninit(struct mlx5dr_table *tbl) +{ + mlx5dr_domain_lock(tbl->dmn); + + switch (tbl->dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + dr_table_uninit_nic(&tbl->rx); + break; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + dr_table_uninit_nic(&tbl->tx); + break; + case MLX5DR_DOMAIN_TYPE_FDB: + dr_table_uninit_fdb(tbl); + break; + default: + WARN_ON(true); + break; + } + + 
mlx5dr_domain_unlock(tbl->dmn); +} + +static int dr_table_init_nic(struct mlx5dr_domain *dmn, + struct mlx5dr_table_rx_tx *nic_tbl) +{ + struct mlx5dr_domain_rx_tx *nic_dmn = nic_tbl->nic_dmn; + struct mlx5dr_htbl_connect_info info; + int ret; + + INIT_LIST_HEAD(&nic_tbl->nic_matcher_list); + + nic_tbl->default_icm_addr = nic_dmn->default_icm_addr; + + nic_tbl->s_anchor = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool, + DR_CHUNK_SIZE_1, + MLX5DR_STE_LU_TYPE_DONT_CARE, + 0); + if (!nic_tbl->s_anchor) { + mlx5dr_err(dmn, "Failed allocating htbl\n"); + return -ENOMEM; + } + + info.type = CONNECT_MISS; + info.miss_icm_addr = nic_dmn->default_icm_addr; + ret = mlx5dr_ste_htbl_init_and_postsend(dmn, nic_dmn, + nic_tbl->s_anchor, + &info, true); + if (ret) { + mlx5dr_err(dmn, "Failed int and send htbl\n"); + goto free_s_anchor; + } + + mlx5dr_htbl_get(nic_tbl->s_anchor); + + return 0; + +free_s_anchor: + mlx5dr_ste_htbl_free(nic_tbl->s_anchor); + return ret; +} + +static int dr_table_init_fdb(struct mlx5dr_table *tbl) +{ + int ret; + + ret = dr_table_init_nic(tbl->dmn, &tbl->rx); + if (ret) + return ret; + + ret = dr_table_init_nic(tbl->dmn, &tbl->tx); + if (ret) + goto destroy_rx; + + return 0; + +destroy_rx: + dr_table_uninit_nic(&tbl->rx); + return ret; +} + +static int dr_table_init(struct mlx5dr_table *tbl) +{ + int ret = 0; + + INIT_LIST_HEAD(&tbl->matcher_list); + + mlx5dr_domain_lock(tbl->dmn); + + switch (tbl->dmn->type) { + case MLX5DR_DOMAIN_TYPE_NIC_RX: + tbl->table_type = MLX5_FLOW_TABLE_TYPE_NIC_RX; + tbl->rx.nic_dmn = &tbl->dmn->info.rx; + ret = dr_table_init_nic(tbl->dmn, &tbl->rx); + break; + case MLX5DR_DOMAIN_TYPE_NIC_TX: + tbl->table_type = MLX5_FLOW_TABLE_TYPE_NIC_TX; + tbl->tx.nic_dmn = &tbl->dmn->info.tx; + ret = dr_table_init_nic(tbl->dmn, &tbl->tx); + break; + case MLX5DR_DOMAIN_TYPE_FDB: + tbl->table_type = MLX5_FLOW_TABLE_TYPE_FDB; + tbl->rx.nic_dmn = &tbl->dmn->info.rx; + tbl->tx.nic_dmn = &tbl->dmn->info.tx; + ret = dr_table_init_fdb(tbl); + break; + default: + WARN_ON(true); + break; + } + + mlx5dr_domain_unlock(tbl->dmn); + + return ret; +} + +static int dr_table_destroy_sw_owned_tbl(struct mlx5dr_table *tbl) +{ + return mlx5dr_cmd_destroy_flow_table(tbl->dmn->mdev, + tbl->table_id, + tbl->table_type); +} + +static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl, u16 uid) +{ + bool en_encap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT); + bool en_decap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); + struct mlx5dr_cmd_create_flow_table_attr ft_attr = {}; + u64 icm_addr_rx = 0; + u64 icm_addr_tx = 0; + int ret; + + if (tbl->rx.s_anchor) + icm_addr_rx = mlx5dr_icm_pool_get_chunk_icm_addr(tbl->rx.s_anchor->chunk); + + if (tbl->tx.s_anchor) + icm_addr_tx = mlx5dr_icm_pool_get_chunk_icm_addr(tbl->tx.s_anchor->chunk); + + ft_attr.table_type = tbl->table_type; + ft_attr.icm_addr_rx = icm_addr_rx; + ft_attr.icm_addr_tx = icm_addr_tx; + ft_attr.level = tbl->dmn->info.caps.max_ft_level - 1; + ft_attr.sw_owner = true; + ft_attr.decap_en = en_decap; + ft_attr.reformat_en = en_encap; + ft_attr.uid = uid; + + ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, &ft_attr, + NULL, &tbl->table_id); + + return ret; +} + +struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, + u32 flags, u16 uid) +{ + struct mlx5dr_table *tbl; + int ret; + + refcount_inc(&dmn->refcount); + + tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); + if (!tbl) + goto dec_ref; + + tbl->dmn = dmn; + tbl->level = level; + tbl->flags = flags; + refcount_set(&tbl->refcount, 1); + + 
ret = dr_table_init(tbl); + if (ret) + goto free_tbl; + + ret = dr_table_create_sw_owned_tbl(tbl, uid); + if (ret) + goto uninit_tbl; + + INIT_LIST_HEAD(&tbl->dbg_node); + mlx5dr_dbg_tbl_add(tbl); + return tbl; + +uninit_tbl: + dr_table_uninit(tbl); +free_tbl: + kfree(tbl); +dec_ref: + refcount_dec(&dmn->refcount); + return NULL; +} + +int mlx5dr_table_destroy(struct mlx5dr_table *tbl) +{ + int ret; + + if (WARN_ON_ONCE(refcount_read(&tbl->refcount) > 1)) + return -EBUSY; + + mlx5dr_dbg_tbl_del(tbl); + ret = dr_table_destroy_sw_owned_tbl(tbl); + if (ret) + mlx5dr_err(tbl->dmn, "Failed to destroy sw owned table\n"); + + dr_table_uninit(tbl); + + if (tbl->miss_action) + refcount_dec(&tbl->miss_action->refcount); + + refcount_dec(&tbl->dmn->refcount); + kfree(tbl); + + return ret; +} + +u32 mlx5dr_table_get_id(struct mlx5dr_table *tbl) +{ + return tbl->table_id; +} + +struct mlx5dr_table *mlx5dr_table_get_from_fs_ft(struct mlx5_flow_table *ft) +{ + return ft->fs_dr_table.dr_table; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h new file mode 100644 index 0000000..0b40133 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -0,0 +1,1586 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019, Mellanox Technologies */ + +#ifndef _DR_TYPES_ +#define _DR_TYPES_ + +#include +#include +#include "fs_core.h" +#include "wq.h" +#include "lib/mlx5.h" +#include "mlx5_ifc_dr.h" +#include "mlx5dr.h" +#include "dr_dbg.h" + +#define DR_RULE_MAX_STES 18 +#define DR_ACTION_MAX_STES 6 +#define DR_STE_SVLAN 0x1 +#define DR_STE_CVLAN 0x2 +#define DR_SZ_MATCH_PARAM (MLX5_ST_SZ_DW_MATCH_PARAM * 4) +#define DR_NUM_OF_FLEX_PARSERS 8 +#define DR_STE_MAX_FLEX_0_ID 3 +#define DR_STE_MAX_FLEX_1_ID 7 + +#define mlx5dr_err(dmn, arg...) mlx5_core_err((dmn)->mdev, ##arg) +#define mlx5dr_info(dmn, arg...) mlx5_core_info((dmn)->mdev, ##arg) +#define mlx5dr_dbg(dmn, arg...) 
mlx5_core_dbg((dmn)->mdev, ##arg) + +static inline bool dr_is_flex_parser_0_id(u8 parser_id) +{ + return parser_id <= DR_STE_MAX_FLEX_0_ID; +} + +static inline bool dr_is_flex_parser_1_id(u8 parser_id) +{ + return parser_id > DR_STE_MAX_FLEX_0_ID; +} + +enum mlx5dr_icm_chunk_size { + DR_CHUNK_SIZE_1, + DR_CHUNK_SIZE_MIN = DR_CHUNK_SIZE_1, /* keep updated when changing */ + DR_CHUNK_SIZE_2, + DR_CHUNK_SIZE_4, + DR_CHUNK_SIZE_8, + DR_CHUNK_SIZE_16, + DR_CHUNK_SIZE_32, + DR_CHUNK_SIZE_64, + DR_CHUNK_SIZE_128, + DR_CHUNK_SIZE_256, + DR_CHUNK_SIZE_512, + DR_CHUNK_SIZE_1K, + DR_CHUNK_SIZE_2K, + DR_CHUNK_SIZE_4K, + DR_CHUNK_SIZE_8K, + DR_CHUNK_SIZE_16K, + DR_CHUNK_SIZE_32K, + DR_CHUNK_SIZE_64K, + DR_CHUNK_SIZE_128K, + DR_CHUNK_SIZE_256K, + DR_CHUNK_SIZE_512K, + DR_CHUNK_SIZE_1024K, + DR_CHUNK_SIZE_2048K, + DR_CHUNK_SIZE_MAX, +}; + +enum mlx5dr_icm_type { + DR_ICM_TYPE_STE, + DR_ICM_TYPE_MODIFY_ACTION, + DR_ICM_TYPE_MODIFY_HDR_PTRN, +}; + +static inline enum mlx5dr_icm_chunk_size +mlx5dr_icm_next_higher_chunk(enum mlx5dr_icm_chunk_size chunk) +{ + chunk += 2; + if (chunk < DR_CHUNK_SIZE_MAX) + return chunk; + + return DR_CHUNK_SIZE_MAX; +} + +enum { + DR_STE_SIZE = 64, + DR_STE_SIZE_CTRL = 32, + DR_STE_SIZE_TAG = 16, + DR_STE_SIZE_MASK = 16, + DR_STE_SIZE_REDUCED = DR_STE_SIZE - DR_STE_SIZE_MASK, +}; + +enum mlx5dr_ste_ctx_action_cap { + DR_STE_CTX_ACTION_CAP_NONE = 0, + DR_STE_CTX_ACTION_CAP_TX_POP = 1 << 0, + DR_STE_CTX_ACTION_CAP_RX_PUSH = 1 << 1, + DR_STE_CTX_ACTION_CAP_RX_ENCAP = 1 << 2, + DR_STE_CTX_ACTION_CAP_POP_MDFY = 1 << 3, +}; + +enum { + DR_MODIFY_ACTION_SIZE = 8, +}; + +enum mlx5dr_matcher_criteria { + DR_MATCHER_CRITERIA_EMPTY = 0, + DR_MATCHER_CRITERIA_OUTER = 1 << 0, + DR_MATCHER_CRITERIA_MISC = 1 << 1, + DR_MATCHER_CRITERIA_INNER = 1 << 2, + DR_MATCHER_CRITERIA_MISC2 = 1 << 3, + DR_MATCHER_CRITERIA_MISC3 = 1 << 4, + DR_MATCHER_CRITERIA_MISC4 = 1 << 5, + DR_MATCHER_CRITERIA_MISC5 = 1 << 6, + DR_MATCHER_CRITERIA_MAX = 1 << 7, +}; + +enum mlx5dr_action_type { + DR_ACTION_TYP_TNL_L2_TO_L2, + DR_ACTION_TYP_L2_TO_TNL_L2, + DR_ACTION_TYP_TNL_L3_TO_L2, + DR_ACTION_TYP_L2_TO_TNL_L3, + DR_ACTION_TYP_DROP, + DR_ACTION_TYP_QP, + DR_ACTION_TYP_FT, + DR_ACTION_TYP_CTR, + DR_ACTION_TYP_TAG, + DR_ACTION_TYP_MODIFY_HDR, + DR_ACTION_TYP_VPORT, + DR_ACTION_TYP_POP_VLAN, + DR_ACTION_TYP_PUSH_VLAN, + DR_ACCELERATED_MODIFY_ACTION_LIST, + DR_ACTION_TYP_INSERT_HDR, + DR_ACTION_TYP_REMOVE_HDR, + DR_ACTION_TYP_SAMPLER, + DR_ACTION_TYP_ASO_FLOW_METER, + DR_ACTION_TYP_MAX, +}; + +enum mlx5dr_ipv { + DR_RULE_IPV4, + DR_RULE_IPV6, + DR_RULE_IPV_MAX, +}; + +struct mlx5dr_icm_pool; +struct mlx5dr_icm_chunk; +struct mlx5dr_icm_buddy_mem; +struct mlx5dr_ste_htbl; +struct mlx5dr_match_param; +struct mlx5dr_cmd_caps; +struct mlx5dr_rule_rx_tx; +struct mlx5dr_matcher_rx_tx; +struct mlx5dr_ste_ctx; +struct mlx5dr_arg_pool_mngr; +struct mlx5dr_send_info_pool; + +struct mlx5dr_ste { + /* refcount: indicates the num of rules that using this ste */ + u32 refcount; + + /* this ste is part of a rule, located in ste's chain */ + u8 ste_chain_location; + + /* attached to the miss_list head at each htbl entry */ + struct list_head miss_list_node; + + /* this ste is member of htbl */ + struct mlx5dr_ste_htbl *htbl; + + struct mlx5dr_ste_htbl *next_htbl; + + /* The rule this STE belongs to */ + struct mlx5dr_rule_rx_tx *rule_rx_tx; +}; + +struct mlx5dr_ste_htbl_ctrl { + /* total number of valid entries belonging to this hash table. 
This + * includes the non collision and collision entries + */ + unsigned int num_of_valid_entries; + + /* total number of collisions entries attached to this table */ + unsigned int num_of_collisions; +}; + +struct mlx5dr_ste_htbl { + u16 lu_type; + u16 byte_mask; + u32 refcount; + struct mlx5dr_icm_chunk *chunk; + struct mlx5dr_ste *pointing_ste; + struct mlx5dr_ste_htbl_ctrl ctrl; +}; + +struct mlx5dr_ste_send_info { + struct mlx5dr_ste *ste; + struct list_head send_list; + u16 size; + u16 offset; + u8 data_cont[DR_STE_SIZE]; + u8 *data; +}; + +void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size, + u16 offset, u8 *data, + struct mlx5dr_ste_send_info *ste_info, + struct list_head *send_list, + bool copy_data); + +struct mlx5dr_ste_build { + u8 inner:1; + u8 rx:1; + u8 vhca_id_valid:1; + struct mlx5dr_domain *dmn; + struct mlx5dr_cmd_caps *caps; + u16 lu_type; + u16 byte_mask; + u8 bit_mask[DR_STE_SIZE_MASK]; + int (*ste_build_tag_func)(struct mlx5dr_match_param *spec, + struct mlx5dr_ste_build *sb, + u8 *tag); +}; + +struct mlx5dr_ste_htbl * +mlx5dr_ste_htbl_alloc(struct mlx5dr_icm_pool *pool, + enum mlx5dr_icm_chunk_size chunk_size, + u16 lu_type, u16 byte_mask); + +int mlx5dr_ste_htbl_free(struct mlx5dr_ste_htbl *htbl); + +static inline void mlx5dr_htbl_put(struct mlx5dr_ste_htbl *htbl) +{ + htbl->refcount--; + if (!htbl->refcount) + mlx5dr_ste_htbl_free(htbl); +} + +static inline void mlx5dr_htbl_get(struct mlx5dr_ste_htbl *htbl) +{ + htbl->refcount++; +} + +/* STE utils */ +u32 mlx5dr_ste_calc_hash_index(u8 *hw_ste_p, struct mlx5dr_ste_htbl *htbl); +void mlx5dr_ste_set_miss_addr(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste, u64 miss_addr); +void mlx5dr_ste_set_hit_addr(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste, u64 icm_addr, u32 ht_size); +void mlx5dr_ste_set_hit_addr_by_next_htbl(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste, + struct mlx5dr_ste_htbl *next_htbl); +void mlx5dr_ste_set_bit_mask(u8 *hw_ste_p, u8 *bit_mask); +bool mlx5dr_ste_is_last_in_rule(struct mlx5dr_matcher_rx_tx *nic_matcher, + u8 ste_location); +u64 mlx5dr_ste_get_icm_addr(struct mlx5dr_ste *ste); +u64 mlx5dr_ste_get_mr_addr(struct mlx5dr_ste *ste); +struct list_head *mlx5dr_ste_get_miss_list(struct mlx5dr_ste *ste); + +#define MLX5DR_MAX_VLANS 2 + +struct mlx5dr_action_aso_flow_meter_info { + u32 obj_id; + u32 offset; + u8 dest_reg_id; + u8 init_color; +}; + +struct mlx5dr_ste_actions_attr { + u32 modify_index; + u16 modify_actions; + u8 *single_modify_action; + u32 decap_index; + u16 decap_actions; + u32 args_index; + u32 decap_args_index; + u8 decap_with_vlan:1; + u64 final_icm_addr; + u32 flow_tag; + u32 ctr_id; + u16 gvmi; + u16 hit_gvmi; + struct { + u32 id; + u32 size; + u8 param_0; + u8 param_1; + } reformat; + struct { + int count; + u32 headers[MLX5DR_MAX_VLANS]; + } vlans; + + struct { + u32 obj_id; + u32 offset; + u8 dest_reg_id; + u8 init_color; + } aso_flow_meter; +}; + +void mlx5dr_ste_set_actions_rx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, + u8 *action_type_set, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes); +void mlx5dr_ste_set_actions_tx(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_domain *dmn, + u8 *action_type_set, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes); + +void mlx5dr_ste_set_action_set(struct mlx5dr_ste_ctx *ste_ctx, + __be64 *hw_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data); +void mlx5dr_ste_set_action_add(struct mlx5dr_ste_ctx *ste_ctx, + __be64 *hw_action, 
+ u8 hw_field, + u8 shifter, + u8 length, + u32 data); +void mlx5dr_ste_set_action_copy(struct mlx5dr_ste_ctx *ste_ctx, + __be64 *hw_action, + u8 dst_hw_field, + u8 dst_shifter, + u8 dst_len, + u8 src_hw_field, + u8 src_shifter); +int mlx5dr_ste_set_action_decap_l3_list(struct mlx5dr_ste_ctx *ste_ctx, + void *data, + u32 data_sz, + u8 *hw_action, + u32 hw_action_sz, + u16 *used_hw_action_num); +int mlx5dr_ste_alloc_modify_hdr(struct mlx5dr_action *action); +void mlx5dr_ste_free_modify_hdr(struct mlx5dr_action *action); + +#define MLX5DR_ACTION_CACHE_LINE_SIZE 64 + +/* modify-header arg pool */ +enum mlx5dr_arg_chunk_size { + DR_ARG_CHUNK_SIZE_1, + DR_ARG_CHUNK_SIZE_MIN = DR_ARG_CHUNK_SIZE_1, /* keep updated when changing */ + DR_ARG_CHUNK_SIZE_2, + DR_ARG_CHUNK_SIZE_3, + DR_ARG_CHUNK_SIZE_MAX, +}; + +struct mlx5dr_arg_object { + u32 obj_id; + u32 obj_offset; + struct list_head list_node; + enum mlx5dr_arg_chunk_size log_chunk_size; +}; + +void mlx5dr_arg_put_obj(struct mlx5dr_domain *dmn, + struct mlx5dr_arg_object *arg_obj); +struct mlx5dr_arg_object *mlx5dr_arg_get_obj(struct mlx5dr_domain *dmn, + enum mlx5dr_arg_chunk_size size); +u32 mlx5dr_arg_get_object_id(struct mlx5dr_arg_object *arg_obj); +struct mlx5dr_arg_pool_mngr *mlx5dr_arg_pool_mngr_create(struct mlx5dr_domain *dmn); +void mlx5dr_arg_pool_mngr_destroy(struct mlx5dr_arg_pool_mngr *pool_mngr); + +const struct mlx5dr_ste_action_modify_field * +mlx5dr_ste_conv_modify_hdr_sw_field(struct mlx5dr_ste_ctx *ste_ctx, u16 sw_field); + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx(u8 version); +void mlx5dr_ste_free(struct mlx5dr_ste *ste, + struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher); +static inline void mlx5dr_ste_put(struct mlx5dr_ste *ste, + struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher) +{ + ste->refcount--; + if (!ste->refcount) + mlx5dr_ste_free(ste, matcher, nic_matcher); +} + +/* initial as 0, increased only when ste appears in a new rule */ +static inline void mlx5dr_ste_get(struct mlx5dr_ste *ste) +{ + ste->refcount++; +} + +static inline bool mlx5dr_ste_is_not_used(struct mlx5dr_ste *ste) +{ + return !ste->refcount; +} + +bool mlx5dr_ste_equal_tag(void *src, void *dst); +int mlx5dr_ste_create_next_htbl(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_ste *ste, + u8 *cur_hw_ste, + enum mlx5dr_icm_chunk_size log_table_size); + +/* STE build functions */ +int mlx5dr_ste_build_pre_check(struct mlx5dr_domain *dmn, + u8 match_criteria, + struct mlx5dr_match_param *mask, + struct mlx5dr_match_param *value); +int mlx5dr_ste_build_ste_arr(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_match_param *value, + u8 *ste_arr); +void mlx5dr_ste_build_eth_l2_src_dst(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *builder, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l3_ipv4_5_tuple(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l3_ipv4_misc(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l3_ipv6_dst(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l3_ipv6_src(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param 
*mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l2_src(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l2_dst(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l2_tnl(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_ipv6_l3_l4(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_eth_l4_misc(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_gre(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_mpls(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_mpls(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_mpls_over_gre(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_mpls_over_udp(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx); +void mlx5dr_ste_build_icmp(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_vxlan_gpe(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_geneve(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_geneve_tlv_opt(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_geneve_tlv_opt_exist(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_gtpu(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_gtpu_flex_parser_0(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_gtpu_flex_parser_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_cmd_caps *caps, + bool inner, bool rx); +void mlx5dr_ste_build_tnl_header_0_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_general_purpose(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_register_0(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct 
mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_register_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + struct mlx5dr_domain *dmn, + bool inner, bool rx); +void mlx5dr_ste_build_flex_parser_0(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_flex_parser_1(struct mlx5dr_ste_ctx *ste_ctx, + struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask, + bool inner, bool rx); +void mlx5dr_ste_build_empty_always_hit(struct mlx5dr_ste_build *sb, bool rx); + +/* Actions utils */ +int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + struct mlx5dr_action *actions[], + u32 num_actions, + u8 *ste_arr, + u32 *new_hw_ste_arr_sz); + +struct mlx5dr_match_spec { + u32 smac_47_16; /* Source MAC address of incoming packet */ + /* Incoming packet Ethertype - this is the Ethertype + * following the last VLAN tag of the packet + */ + u32 smac_15_0:16; /* Source MAC address of incoming packet */ + u32 ethertype:16; + + u32 dmac_47_16; /* Destination MAC address of incoming packet */ + + u32 dmac_15_0:16; /* Destination MAC address of incoming packet */ + /* Priority of first VLAN tag in the incoming packet. + * Valid only when cvlan_tag==1 or svlan_tag==1 + */ + u32 first_prio:3; + /* CFI bit of first VLAN tag in the incoming packet. + * Valid only when cvlan_tag==1 or svlan_tag==1 + */ + u32 first_cfi:1; + /* VLAN ID of first VLAN tag in the incoming packet. + * Valid only when cvlan_tag==1 or svlan_tag==1 + */ + u32 first_vid:12; + + u32 ip_protocol:8; /* IP protocol */ + /* Differentiated Services Code Point derived from + * Traffic Class/TOS field of IPv6/v4 + */ + u32 ip_dscp:6; + /* Explicit Congestion Notification derived from + * Traffic Class/TOS field of IPv6/v4 + */ + u32 ip_ecn:2; + /* The first vlan in the packet is c-vlan (0x8100). + * cvlan_tag and svlan_tag cannot be set together + */ + u32 cvlan_tag:1; + /* The first vlan in the packet is s-vlan (0x8a88). + * cvlan_tag and svlan_tag cannot be set together + */ + u32 svlan_tag:1; + u32 frag:1; /* Packet is an IP fragment */ + u32 ip_version:4; /* IP version */ + /* TCP flags. ;Bit 0: FIN;Bit 1: SYN;Bit 2: RST;Bit 3: PSH;Bit 4: ACK; + * Bit 5: URG;Bit 6: ECE;Bit 7: CWR;Bit 8: NS + */ + u32 tcp_flags:9; + + /* TCP source port.;tcp and udp sport/dport are mutually exclusive */ + u32 tcp_sport:16; + /* TCP destination port. 
+ * tcp and udp sport/dport are mutually exclusive + */ + u32 tcp_dport:16; + + u32 reserved_auto1:20; + u32 ipv4_ihl:4; + u32 ttl_hoplimit:8; + + /* UDP source port.;tcp and udp sport/dport are mutually exclusive */ + u32 udp_sport:16; + /* UDP destination port.;tcp and udp sport/dport are mutually exclusive */ + u32 udp_dport:16; + + /* IPv6 source address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 src_ip_127_96; + /* IPv6 source address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 src_ip_95_64; + /* IPv6 source address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 src_ip_63_32; + /* IPv6 source address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 src_ip_31_0; + /* IPv6 destination address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 dst_ip_127_96; + /* IPv6 destination address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 dst_ip_95_64; + /* IPv6 destination address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 dst_ip_63_32; + /* IPv6 destination address of incoming packets + * For IPv4 address use bits 31:0 (rest of the bits are reserved) + * This field should be qualified by an appropriate ethertype + */ + u32 dst_ip_31_0; +}; + +struct mlx5dr_match_misc { + /* used with GRE, checksum exist when gre_c_present == 1 */ + u32 gre_c_present:1; + u32 reserved_auto1:1; + /* used with GRE, key exist when gre_k_present == 1 */ + u32 gre_k_present:1; + /* used with GRE, sequence number exist when gre_s_present == 1 */ + u32 gre_s_present:1; + u32 source_vhca_port:4; + u32 source_sqn:24; /* Source SQN */ + + u32 source_eswitch_owner_vhca_id:16; + /* Source port.;0xffff determines wire port */ + u32 source_port:16; + + /* Priority of second VLAN tag in the outer header of the incoming packet. + * Valid only when outer_second_cvlan_tag ==1 or outer_second_svlan_tag ==1 + */ + u32 outer_second_prio:3; + /* CFI bit of first VLAN tag in the outer header of the incoming packet. + * Valid only when outer_second_cvlan_tag ==1 or outer_second_svlan_tag ==1 + */ + u32 outer_second_cfi:1; + /* VLAN ID of first VLAN tag the outer header of the incoming packet. + * Valid only when outer_second_cvlan_tag ==1 or outer_second_svlan_tag ==1 + */ + u32 outer_second_vid:12; + /* Priority of second VLAN tag in the inner header of the incoming packet. + * Valid only when inner_second_cvlan_tag ==1 or inner_second_svlan_tag ==1 + */ + u32 inner_second_prio:3; + /* CFI bit of first VLAN tag in the inner header of the incoming packet. + * Valid only when inner_second_cvlan_tag ==1 or inner_second_svlan_tag ==1 + */ + u32 inner_second_cfi:1; + /* VLAN ID of first VLAN tag the inner header of the incoming packet. 
+ * Valid only when inner_second_cvlan_tag ==1 or inner_second_svlan_tag ==1 + */ + u32 inner_second_vid:12; + + u32 outer_second_cvlan_tag:1; + u32 inner_second_cvlan_tag:1; + /* The second vlan in the outer header of the packet is c-vlan (0x8100). + * outer_second_cvlan_tag and outer_second_svlan_tag cannot be set together + */ + u32 outer_second_svlan_tag:1; + /* The second vlan in the inner header of the packet is c-vlan (0x8100). + * inner_second_cvlan_tag and inner_second_svlan_tag cannot be set together + */ + u32 inner_second_svlan_tag:1; + /* The second vlan in the outer header of the packet is s-vlan (0x8a88). + * outer_second_cvlan_tag and outer_second_svlan_tag cannot be set together + */ + u32 reserved_auto2:12; + /* The second vlan in the inner header of the packet is s-vlan (0x8a88). + * inner_second_cvlan_tag and inner_second_svlan_tag cannot be set together + */ + u32 gre_protocol:16; /* GRE Protocol (outer) */ + + u32 gre_key_h:24; /* GRE Key[31:8] (outer) */ + u32 gre_key_l:8; /* GRE Key [7:0] (outer) */ + + u32 vxlan_vni:24; /* VXLAN VNI (outer) */ + u32 reserved_auto3:8; + + u32 geneve_vni:24; /* GENEVE VNI field (outer) */ + u32 reserved_auto4:6; + u32 geneve_tlv_option_0_exist:1; + u32 geneve_oam:1; /* GENEVE OAM field (outer) */ + + u32 reserved_auto5:12; + u32 outer_ipv6_flow_label:20; /* Flow label of incoming IPv6 packet (outer) */ + + u32 reserved_auto6:12; + u32 inner_ipv6_flow_label:20; /* Flow label of incoming IPv6 packet (inner) */ + + u32 reserved_auto7:10; + u32 geneve_opt_len:6; /* GENEVE OptLen (outer) */ + u32 geneve_protocol_type:16; /* GENEVE protocol type (outer) */ + + u32 reserved_auto8:8; + u32 bth_dst_qp:24; /* Destination QP in BTH header */ + + u32 reserved_auto9; + u32 outer_esp_spi; + u32 reserved_auto10[3]; +}; + +struct mlx5dr_match_misc2 { + u32 outer_first_mpls_label:20; /* First MPLS LABEL (outer) */ + u32 outer_first_mpls_exp:3; /* First MPLS EXP (outer) */ + u32 outer_first_mpls_s_bos:1; /* First MPLS S_BOS (outer) */ + u32 outer_first_mpls_ttl:8; /* First MPLS TTL (outer) */ + + u32 inner_first_mpls_label:20; /* First MPLS LABEL (inner) */ + u32 inner_first_mpls_exp:3; /* First MPLS EXP (inner) */ + u32 inner_first_mpls_s_bos:1; /* First MPLS S_BOS (inner) */ + u32 inner_first_mpls_ttl:8; /* First MPLS TTL (inner) */ + + u32 outer_first_mpls_over_gre_label:20; /* last MPLS LABEL (outer) */ + u32 outer_first_mpls_over_gre_exp:3; /* last MPLS EXP (outer) */ + u32 outer_first_mpls_over_gre_s_bos:1; /* last MPLS S_BOS (outer) */ + u32 outer_first_mpls_over_gre_ttl:8; /* last MPLS TTL (outer) */ + + u32 outer_first_mpls_over_udp_label:20; /* last MPLS LABEL (outer) */ + u32 outer_first_mpls_over_udp_exp:3; /* last MPLS EXP (outer) */ + u32 outer_first_mpls_over_udp_s_bos:1; /* last MPLS S_BOS (outer) */ + u32 outer_first_mpls_over_udp_ttl:8; /* last MPLS TTL (outer) */ + + u32 metadata_reg_c_7; /* metadata_reg_c_7 */ + u32 metadata_reg_c_6; /* metadata_reg_c_6 */ + u32 metadata_reg_c_5; /* metadata_reg_c_5 */ + u32 metadata_reg_c_4; /* metadata_reg_c_4 */ + u32 metadata_reg_c_3; /* metadata_reg_c_3 */ + u32 metadata_reg_c_2; /* metadata_reg_c_2 */ + u32 metadata_reg_c_1; /* metadata_reg_c_1 */ + u32 metadata_reg_c_0; /* metadata_reg_c_0 */ + u32 metadata_reg_a; /* metadata_reg_a */ + u32 reserved_auto1[3]; +}; + +struct mlx5dr_match_misc3 { + u32 inner_tcp_seq_num; + u32 outer_tcp_seq_num; + u32 inner_tcp_ack_num; + u32 outer_tcp_ack_num; + + u32 reserved_auto1:8; + u32 outer_vxlan_gpe_vni:24; + + u32 outer_vxlan_gpe_next_protocol:8; + 
u32 outer_vxlan_gpe_flags:8; + u32 reserved_auto2:16; + + u32 icmpv4_header_data; + u32 icmpv6_header_data; + + u8 icmpv4_type; + u8 icmpv4_code; + u8 icmpv6_type; + u8 icmpv6_code; + + u32 geneve_tlv_option_0_data; + + u32 gtpu_teid; + + u8 gtpu_msg_type; + u8 gtpu_msg_flags; + u32 reserved_auto3:16; + + u32 gtpu_dw_2; + u32 gtpu_first_ext_dw_0; + u32 gtpu_dw_0; + u32 reserved_auto4; +}; + +struct mlx5dr_match_misc4 { + u32 prog_sample_field_value_0; + u32 prog_sample_field_id_0; + u32 prog_sample_field_value_1; + u32 prog_sample_field_id_1; + u32 prog_sample_field_value_2; + u32 prog_sample_field_id_2; + u32 prog_sample_field_value_3; + u32 prog_sample_field_id_3; + u32 reserved_auto1[8]; +}; + +struct mlx5dr_match_misc5 { + u32 macsec_tag_0; + u32 macsec_tag_1; + u32 macsec_tag_2; + u32 macsec_tag_3; + u32 tunnel_header_0; + u32 tunnel_header_1; + u32 tunnel_header_2; + u32 tunnel_header_3; +}; + +struct mlx5dr_match_param { + struct mlx5dr_match_spec outer; + struct mlx5dr_match_misc misc; + struct mlx5dr_match_spec inner; + struct mlx5dr_match_misc2 misc2; + struct mlx5dr_match_misc3 misc3; + struct mlx5dr_match_misc4 misc4; + struct mlx5dr_match_misc5 misc5; +}; + +#define DR_MASK_IS_ICMPV4_SET(_misc3) ((_misc3)->icmpv4_type || \ + (_misc3)->icmpv4_code || \ + (_misc3)->icmpv4_header_data) + +#define DR_MASK_IS_SRC_IP_SET(_spec) ((_spec)->src_ip_127_96 || \ + (_spec)->src_ip_95_64 || \ + (_spec)->src_ip_63_32 || \ + (_spec)->src_ip_31_0) + +#define DR_MASK_IS_DST_IP_SET(_spec) ((_spec)->dst_ip_127_96 || \ + (_spec)->dst_ip_95_64 || \ + (_spec)->dst_ip_63_32 || \ + (_spec)->dst_ip_31_0) + +struct mlx5dr_esw_caps { + u64 drop_icm_address_rx; + u64 drop_icm_address_tx; + u64 uplink_icm_address_rx; + u64 uplink_icm_address_tx; + u8 sw_owner:1; + u8 sw_owner_v2:1; +}; + +struct mlx5dr_cmd_vport_cap { + u16 vport_gvmi; + u16 vhca_gvmi; + u16 num; + u64 icm_address_rx; + u64 icm_address_tx; +}; + +struct mlx5dr_roce_cap { + u8 roce_en:1; + u8 fl_rc_qp_when_roce_disabled:1; + u8 fl_rc_qp_when_roce_enabled:1; +}; + +struct mlx5dr_vports { + struct mlx5dr_cmd_vport_cap esw_manager_caps; + struct mlx5dr_cmd_vport_cap uplink_caps; + struct xarray vports_caps_xa; +}; + +struct mlx5dr_cmd_caps { + u16 gvmi; + u64 nic_rx_drop_address; + u64 nic_tx_drop_address; + u64 nic_tx_allow_address; + u64 esw_rx_drop_address; + u64 esw_tx_drop_address; + u32 log_icm_size; + u64 hdr_modify_icm_addr; + u32 log_modify_pattern_icm_size; + u64 hdr_modify_pattern_icm_addr; + u32 flex_protocols; + u8 flex_parser_id_icmp_dw0; + u8 flex_parser_id_icmp_dw1; + u8 flex_parser_id_icmpv6_dw0; + u8 flex_parser_id_icmpv6_dw1; + u8 flex_parser_id_geneve_tlv_option_0; + u8 flex_parser_id_mpls_over_gre; + u8 flex_parser_id_mpls_over_udp; + u8 flex_parser_id_gtpu_dw_0; + u8 flex_parser_id_gtpu_teid; + u8 flex_parser_id_gtpu_dw_2; + u8 flex_parser_id_gtpu_first_ext_dw_0; + u8 flex_parser_ok_bits_supp; + u8 max_ft_level; + u16 roce_min_src_udp; + u8 sw_format_ver; + bool eswitch_manager; + bool rx_sw_owner; + bool tx_sw_owner; + bool fdb_sw_owner; + u8 rx_sw_owner_v2:1; + u8 tx_sw_owner_v2:1; + u8 fdb_sw_owner_v2:1; + struct mlx5dr_esw_caps esw_caps; + struct mlx5dr_vports vports; + bool prio_tag_required; + struct mlx5dr_roce_cap roce_caps; + u8 is_ecpf:1; + u8 isolate_vl_tc:1; + u16 log_header_modify_argument_granularity; + u16 log_header_modify_argument_max_alloc; + bool support_modify_argument; +}; + +enum mlx5dr_domain_nic_type { + DR_DOMAIN_NIC_TYPE_RX, + DR_DOMAIN_NIC_TYPE_TX, +}; + +struct mlx5dr_domain_rx_tx { + u64 
drop_icm_addr; + u64 default_icm_addr; + enum mlx5dr_domain_nic_type type; + struct mutex mutex; /* protect rx/tx domain */ +}; + +struct mlx5dr_domain_info { + bool supp_sw_steering; + u32 max_inline_size; + u32 max_send_wr; + u32 max_log_sw_icm_sz; + u32 max_log_action_icm_sz; + u32 max_log_modify_hdr_pattern_icm_sz; + struct mlx5dr_domain_rx_tx rx; + struct mlx5dr_domain_rx_tx tx; + struct mlx5dr_cmd_caps caps; +}; + +struct mlx5dr_domain { + struct mlx5dr_domain *peer_dmn; + struct mlx5_core_dev *mdev; + u32 pdn; + struct mlx5_uars_page *uar; + enum mlx5dr_domain_type type; + refcount_t refcount; + struct mlx5dr_icm_pool *ste_icm_pool; + struct mlx5dr_icm_pool *action_icm_pool; + struct mlx5dr_icm_pool *modify_header_ptrn_icm_pool; + struct mlx5dr_arg_pool_mngr *modify_header_arg_pool_mngr; + struct mlx5dr_send_info_pool *send_info_pool_rx; + struct mlx5dr_send_info_pool *send_info_pool_tx; + struct mlx5dr_send_ring *send_ring; + struct mlx5dr_domain_info info; + struct xarray csum_fts_xa; + struct mlx5dr_ste_ctx *ste_ctx; + struct list_head dbg_tbl_list; + struct mlx5dr_dbg_dump_info dump_info; + struct kmem_cache *chunks_kmem_cache; + struct kmem_cache *htbls_kmem_cache; + /* cache for modify_header */ + struct list_head modify_hdr_list; + struct mutex modify_hdr_mutex; /* protects modify hdr resources */ +}; + +struct mlx5dr_table_rx_tx { + struct mlx5dr_ste_htbl *s_anchor; + struct mlx5dr_domain_rx_tx *nic_dmn; + u64 default_icm_addr; + struct list_head nic_matcher_list; +}; + +struct mlx5dr_table { + struct mlx5dr_domain *dmn; + struct mlx5dr_table_rx_tx rx; + struct mlx5dr_table_rx_tx tx; + u32 level; + u32 table_type; + u32 table_id; + u32 flags; + struct list_head matcher_list; + struct mlx5dr_action *miss_action; + refcount_t refcount; + struct list_head dbg_node; +}; + +struct mlx5dr_matcher_rx_tx { + struct mlx5dr_ste_htbl *s_htbl; + struct mlx5dr_ste_htbl *e_anchor; + struct mlx5dr_ste_build *ste_builder; + struct mlx5dr_ste_build ste_builder_arr[DR_RULE_IPV_MAX] + [DR_RULE_IPV_MAX] + [DR_RULE_MAX_STES]; + u8 num_of_builders; + u8 num_of_builders_arr[DR_RULE_IPV_MAX][DR_RULE_IPV_MAX]; + u64 default_icm_addr; + struct mlx5dr_table_rx_tx *nic_tbl; + u32 prio; + struct list_head list_node; + u32 rules; +}; + +struct mlx5dr_matcher { + struct mlx5dr_table *tbl; + struct mlx5dr_matcher_rx_tx rx; + struct mlx5dr_matcher_rx_tx tx; + struct list_head list_node; /* Used for both matchers and dbg managing */ + u32 prio; + struct mlx5dr_match_param mask; + u8 match_criteria; + refcount_t refcount; + struct list_head dbg_rule_list; +}; + +struct mlx5dr_ste_action_modify_field { + u16 hw_field; + u8 start; + u8 end; + u8 l3_type; + u8 l4_type; +}; + +struct mlx5dr_action_rewrite { + struct mlx5dr_domain *dmn; + struct mlx5dr_icm_chunk *chunk; + u8 *data; + bool single_action_opt; + u16 num_of_actions; + u32 index; + u8 allow_rx:1; + u8 allow_tx:1; + u8 modify_ttl:1; + union { /* hw specific */ + struct mlx5dr_arg_object *arg; + }; +}; + +struct mlx5dr_action_reformat { + struct mlx5dr_domain *dmn; + u32 id; + u32 size; + u8 param_0; + u8 param_1; +}; + +struct mlx5dr_action_sampler { + struct mlx5dr_domain *dmn; + u64 rx_icm_addr; + u64 tx_icm_addr; + u32 sampler_id; +}; + +struct mlx5dr_action_dest_tbl { + u8 is_fw_tbl:1; + union { + struct mlx5dr_table *tbl; + struct { + struct mlx5dr_domain *dmn; + u32 id; + u32 group_id; + enum fs_flow_table_type type; + u64 rx_icm_addr; + u64 tx_icm_addr; + struct mlx5dr_action **ref_actions; + u32 num_of_ref_actions; + } fw_tbl; + }; +}; + +struct 
mlx5dr_action_ctr { + u32 ctr_id; + u32 offset; +}; + +struct mlx5dr_action_vport { + struct mlx5dr_domain *dmn; + struct mlx5dr_cmd_vport_cap *caps; +}; + +struct mlx5dr_action_push_vlan { + u32 vlan_hdr; /* tpid_pcp_dei_vid */ +}; + +struct mlx5dr_action_flow_tag { + u32 flow_tag; +}; + +struct mlx5dr_rule_action_member { + struct mlx5dr_action *action; + struct list_head list; +}; + +struct mlx5dr_action_aso_flow_meter { + struct mlx5dr_domain *dmn; + u32 obj_id; + u32 offset; + u8 dest_reg_id; + u8 init_color; +}; + +struct mlx5dr_action { + enum mlx5dr_action_type action_type; + refcount_t refcount; + + union { + void *data; + struct mlx5dr_action_rewrite *rewrite; + struct mlx5dr_action_reformat *reformat; + struct mlx5dr_action_sampler *sampler; + struct mlx5dr_action_dest_tbl *dest_tbl; + struct mlx5dr_action_ctr *ctr; + struct mlx5dr_action_vport *vport; + struct mlx5dr_action_push_vlan *push_vlan; + struct mlx5dr_action_flow_tag *flow_tag; + struct mlx5dr_action_aso_flow_meter *aso; + }; +}; + +enum mlx5dr_connect_type { + CONNECT_HIT = 1, + CONNECT_MISS = 2, +}; + +struct mlx5dr_htbl_connect_info { + enum mlx5dr_connect_type type; + union { + struct mlx5dr_ste_htbl *hit_next_htbl; + u64 miss_icm_addr; + }; +}; + +struct mlx5dr_rule_rx_tx { + struct mlx5dr_matcher_rx_tx *nic_matcher; + struct mlx5dr_ste *last_rule_ste; +}; + +struct mlx5dr_rule { + struct mlx5dr_matcher *matcher; + struct mlx5dr_rule_rx_tx rx; + struct mlx5dr_rule_rx_tx tx; + struct list_head rule_actions_list; + struct list_head dbg_node; + u32 flow_source; +}; + +void mlx5dr_rule_set_last_member(struct mlx5dr_rule_rx_tx *nic_rule, + struct mlx5dr_ste *ste, + bool force); +int mlx5dr_rule_get_reverse_rule_members(struct mlx5dr_ste **ste_arr, + struct mlx5dr_ste *curr_ste, + int *num_of_stes); + +struct mlx5dr_icm_hot_chunk { + struct mlx5dr_icm_buddy_mem *buddy_mem; + unsigned int seg; + enum mlx5dr_icm_chunk_size size; +}; + +struct mlx5dr_icm_chunk { + struct mlx5dr_icm_buddy_mem *buddy_mem; + + /* indicates the index of this chunk in the whole memory, + * used for deleting the chunk from the buddy + */ + unsigned int seg; + enum mlx5dr_icm_chunk_size size; + + /* Memory optimisation */ + struct mlx5dr_ste *ste_arr; + u8 *hw_ste_arr; + struct list_head *miss_list; +}; + +struct mlx5dr_icm_pool { + enum mlx5dr_icm_type icm_type; + enum mlx5dr_icm_chunk_size max_log_chunk_sz; + struct mlx5dr_domain *dmn; + struct kmem_cache *chunks_kmem_cache; + + /* memory management */ + struct mutex mutex; /* protect the ICM pool and ICM buddy */ + struct list_head buddy_mem_list; + + /* Hardware may be accessing this memory but at some future, + * undetermined time, it might cease to do so. + * sync_ste command sets them free. 
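+	 * A chunk released through mlx5dr_icm_free_chunk() is therefore only
+	 * parked in this array first; its memory is returned to the buddy
+	 * allocator once a later mlx5dr_cmd_sync_steering() has guaranteed
+	 * that no STE still references it (conceptually: free, accumulate as
+	 * a hot chunk, sync, then reuse).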
+ */ + struct mlx5dr_icm_hot_chunk *hot_chunks_arr; + u32 hot_chunks_num; + u64 hot_memory_size; +}; + +static inline void mlx5dr_domain_nic_lock(struct mlx5dr_domain_rx_tx *nic_dmn) +{ + mutex_lock(&nic_dmn->mutex); +} + +static inline void mlx5dr_domain_nic_unlock(struct mlx5dr_domain_rx_tx *nic_dmn) +{ + mutex_unlock(&nic_dmn->mutex); +} + +static inline void mlx5dr_domain_lock(struct mlx5dr_domain *dmn) +{ + mlx5dr_domain_nic_lock(&dmn->info.rx); + mlx5dr_domain_nic_lock(&dmn->info.tx); +} + +static inline void mlx5dr_domain_unlock(struct mlx5dr_domain *dmn) +{ + mlx5dr_domain_nic_unlock(&dmn->info.tx); + mlx5dr_domain_nic_unlock(&dmn->info.rx); +} + +int mlx5dr_matcher_add_to_tbl_nic(struct mlx5dr_domain *dmn, + struct mlx5dr_matcher_rx_tx *nic_matcher); +int mlx5dr_matcher_remove_from_tbl_nic(struct mlx5dr_domain *dmn, + struct mlx5dr_matcher_rx_tx *nic_matcher); + +int mlx5dr_matcher_select_builders(struct mlx5dr_matcher *matcher, + struct mlx5dr_matcher_rx_tx *nic_matcher, + enum mlx5dr_ipv outer_ipv, + enum mlx5dr_ipv inner_ipv); + +u64 mlx5dr_icm_pool_get_chunk_mr_addr(struct mlx5dr_icm_chunk *chunk); +u32 mlx5dr_icm_pool_get_chunk_rkey(struct mlx5dr_icm_chunk *chunk); +u64 mlx5dr_icm_pool_get_chunk_icm_addr(struct mlx5dr_icm_chunk *chunk); +u32 mlx5dr_icm_pool_get_chunk_num_of_entries(struct mlx5dr_icm_chunk *chunk); +u32 mlx5dr_icm_pool_get_chunk_byte_size(struct mlx5dr_icm_chunk *chunk); +u8 *mlx5dr_ste_get_hw_ste(struct mlx5dr_ste *ste); + +static inline int +mlx5dr_icm_pool_dm_type_to_entry_size(enum mlx5dr_icm_type icm_type) +{ + if (icm_type == DR_ICM_TYPE_STE) + return DR_STE_SIZE; + + return DR_MODIFY_ACTION_SIZE; +} + +static inline u32 +mlx5dr_icm_pool_chunk_size_to_entries(enum mlx5dr_icm_chunk_size chunk_size) +{ + return 1 << chunk_size; +} + +static inline int +mlx5dr_icm_pool_chunk_size_to_byte(enum mlx5dr_icm_chunk_size chunk_size, + enum mlx5dr_icm_type icm_type) +{ + int num_of_entries; + int entry_size; + + entry_size = mlx5dr_icm_pool_dm_type_to_entry_size(icm_type); + num_of_entries = mlx5dr_icm_pool_chunk_size_to_entries(chunk_size); + + return entry_size * num_of_entries; +} + +static inline int +mlx5dr_ste_htbl_increase_threshold(struct mlx5dr_ste_htbl *htbl) +{ + int num_of_entries = + mlx5dr_icm_pool_chunk_size_to_entries(htbl->chunk->size); + + /* Threshold is 50%, one is added to table of size 1 */ + return (num_of_entries + 1) / 2; +} + +static inline bool +mlx5dr_ste_htbl_may_grow(struct mlx5dr_ste_htbl *htbl) +{ + if (htbl->chunk->size == DR_CHUNK_SIZE_MAX - 1 || !htbl->byte_mask) + return false; + + return true; +} + +struct mlx5dr_cmd_vport_cap * +mlx5dr_domain_get_vport_cap(struct mlx5dr_domain *dmn, u16 vport); + +struct mlx5dr_cmd_query_flow_table_details { + u8 status; + u8 level; + u64 sw_owner_icm_root_1; + u64 sw_owner_icm_root_0; +}; + +struct mlx5dr_cmd_create_flow_table_attr { + u32 table_type; + u16 uid; + u64 icm_addr_rx; + u64 icm_addr_tx; + u8 level; + bool sw_owner; + bool term_tbl; + bool decap_en; + bool reformat_en; +}; + +/* internal API functions */ +int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, + struct mlx5dr_cmd_caps *caps); +int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, + bool other_vport, u16 vport_number, + u64 *icm_address_rx, + u64 *icm_address_tx); +int mlx5dr_cmd_query_gvmi(struct mlx5_core_dev *mdev, + bool other_vport, u16 vport_number, u16 *gvmi); +int mlx5dr_cmd_query_esw_caps(struct mlx5_core_dev *mdev, + struct mlx5dr_esw_caps *caps); +int mlx5dr_cmd_query_flow_sampler(struct 
mlx5_core_dev *dev, + u32 sampler_id, + u64 *rx_icm_addr, + u64 *tx_icm_addr); +int mlx5dr_cmd_sync_steering(struct mlx5_core_dev *mdev); +int mlx5dr_cmd_set_fte_modify_and_vport(struct mlx5_core_dev *mdev, + u32 table_type, + u32 table_id, + u32 group_id, + u32 modify_header_id, + u16 vport_id); +int mlx5dr_cmd_del_flow_table_entry(struct mlx5_core_dev *mdev, + u32 table_type, + u32 table_id); +int mlx5dr_cmd_alloc_modify_header(struct mlx5_core_dev *mdev, + u32 table_type, + u8 num_of_actions, + u64 *actions, + u32 *modify_header_id); +int mlx5dr_cmd_dealloc_modify_header(struct mlx5_core_dev *mdev, + u32 modify_header_id); +int mlx5dr_cmd_create_empty_flow_group(struct mlx5_core_dev *mdev, + u32 table_type, + u32 table_id, + u32 *group_id); +int mlx5dr_cmd_destroy_flow_group(struct mlx5_core_dev *mdev, + u32 table_type, + u32 table_id, + u32 group_id); +int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev, + struct mlx5dr_cmd_create_flow_table_attr *attr, + u64 *fdb_rx_icm_addr, + u32 *table_id); +int mlx5dr_cmd_destroy_flow_table(struct mlx5_core_dev *mdev, + u32 table_id, + u32 table_type); +int mlx5dr_cmd_query_flow_table(struct mlx5_core_dev *dev, + enum fs_flow_table_type type, + u32 table_id, + struct mlx5dr_cmd_query_flow_table_details *output); +int mlx5dr_cmd_create_reformat_ctx(struct mlx5_core_dev *mdev, + enum mlx5_reformat_ctx_type rt, + u8 reformat_param_0, + u8 reformat_param_1, + size_t reformat_size, + void *reformat_data, + u32 *reformat_id); +void mlx5dr_cmd_destroy_reformat_ctx(struct mlx5_core_dev *mdev, + u32 reformat_id); + +struct mlx5dr_cmd_gid_attr { + u8 gid[16]; + u8 mac[6]; + u32 roce_ver; +}; + +struct mlx5dr_cmd_qp_create_attr { + u32 page_id; + u32 pdn; + u32 cqn; + u32 pm_state; + u32 service_type; + u32 buff_umem_id; + u32 db_umem_id; + u32 sq_wqe_cnt; + u32 rq_wqe_cnt; + u32 rq_wqe_shift; + u8 isolate_vl_tc:1; + u16 log_header_modify_argument_granularity; + bool support_modify_argument; +}; + +int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num, + u16 index, struct mlx5dr_cmd_gid_attr *attr); + +int mlx5dr_cmd_create_modify_header_arg(struct mlx5_core_dev *dev, + u16 log_obj_range, u32 pd, + u32 *obj_id); +void mlx5dr_cmd_destroy_modify_header_arg(struct mlx5_core_dev *dev, + u32 obj_id); + +struct mlx5dr_icm_pool *mlx5dr_icm_pool_create(struct mlx5dr_domain *dmn, + enum mlx5dr_icm_type icm_type); +void mlx5dr_icm_pool_destroy(struct mlx5dr_icm_pool *pool); + +struct mlx5dr_icm_chunk * +mlx5dr_icm_alloc_chunk(struct mlx5dr_icm_pool *pool, + enum mlx5dr_icm_chunk_size chunk_size); +void mlx5dr_icm_free_chunk(struct mlx5dr_icm_chunk *chunk); + +void mlx5dr_ste_prepare_for_postsend(struct mlx5dr_ste_ctx *ste_ctx, + u8 *hw_ste_p, u32 ste_size); +int mlx5dr_ste_htbl_init_and_postsend(struct mlx5dr_domain *dmn, + struct mlx5dr_domain_rx_tx *nic_dmn, + struct mlx5dr_ste_htbl *htbl, + struct mlx5dr_htbl_connect_info *connect_info, + bool update_hw_ste); +void mlx5dr_ste_set_formatted_ste(struct mlx5dr_ste_ctx *ste_ctx, + u16 gvmi, + enum mlx5dr_domain_nic_type nic_type, + struct mlx5dr_ste_htbl *htbl, + u8 *formatted_ste, + struct mlx5dr_htbl_connect_info *connect_info); +void mlx5dr_ste_copy_param(u8 match_criteria, + struct mlx5dr_match_param *set_param, + struct mlx5dr_match_parameters *mask, + bool clear); + +struct mlx5dr_qp { + struct mlx5_core_dev *mdev; + struct mlx5_wq_qp wq; + struct mlx5_uars_page *uar; + struct mlx5_wq_ctrl wq_ctrl; + u32 qpn; + struct { + unsigned int head; + unsigned int pc; + unsigned int cc; + unsigned 
int size; + unsigned int *wqe_head; + unsigned int wqe_cnt; + } sq; + struct { + unsigned int pc; + unsigned int cc; + unsigned int size; + unsigned int wqe_cnt; + } rq; + int max_inline_data; +}; + +struct mlx5dr_cq { + struct mlx5_core_dev *mdev; + struct mlx5_cqwq wq; + struct mlx5_wq_ctrl wq_ctrl; + struct mlx5_core_cq mcq; + struct mlx5dr_qp *qp; +}; + +struct mlx5dr_mr { + struct mlx5_core_dev *mdev; + u32 mkey; + dma_addr_t dma_addr; + void *addr; + size_t size; +}; + +#define MAX_SEND_CQE 64 + +struct mlx5dr_send_ring { + struct mlx5dr_cq *cq; + struct mlx5dr_qp *qp; + struct mlx5dr_mr *mr; + /* How much wqes are waiting for completion */ + u32 pending_wqe; + /* Signal request per this trash hold value */ + u16 signal_th; + /* Each post_send_size less than max_post_send_size */ + u32 max_post_send_size; + /* manage the send queue */ + u32 tx_head; + void *buf; + u32 buf_size; + u8 *sync_buff; + struct mlx5dr_mr *sync_mr; + spinlock_t lock; /* Protect the data path of the send ring */ + bool err_state; /* send_ring is not usable in err state */ +}; + +int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn); +void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn, + struct mlx5dr_send_ring *send_ring); +int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn); +int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, + struct mlx5dr_ste *ste, + u8 *data, + u16 size, + u16 offset); +int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn, + struct mlx5dr_ste_htbl *htbl, + u8 *formatted_ste, u8 *mask); +int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn, + struct mlx5dr_ste_htbl *htbl, + u8 *ste_init_data, + bool update_hw_ste); +int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action); +int mlx5dr_send_postsend_args(struct mlx5dr_domain *dmn, + struct mlx5dr_action *action); + +int mlx5dr_send_info_pool_create(struct mlx5dr_domain *dmn); +void mlx5dr_send_info_pool_destroy(struct mlx5dr_domain *dmn); +struct mlx5dr_ste_send_info *mlx5dr_send_info_alloc(struct mlx5dr_domain *dmn, + enum mlx5dr_domain_nic_type nic_type); +void mlx5dr_send_info_free(struct mlx5dr_ste_send_info *ste_send_info); + +struct mlx5dr_cmd_ft_info { + u32 id; + u16 vport; + enum fs_flow_table_type type; +}; + +struct mlx5dr_cmd_flow_destination_hw_info { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + u32 ft_num; + u32 ft_id; + u32 counter_id; + u32 sampler_id; + struct { + u16 num; + u16 vhca_id; + u32 reformat_id; + u8 flags; + } vport; + }; +}; + +struct mlx5dr_cmd_fte_info { + u32 dests_size; + u32 index; + struct mlx5_flow_context flow_context; + u32 *val; + struct mlx5_flow_act action; + struct mlx5dr_cmd_flow_destination_hw_info *dest_arr; + bool ignore_flow_level; +}; + +int mlx5dr_cmd_set_fte(struct mlx5_core_dev *dev, + int opmod, int modify_mask, + struct mlx5dr_cmd_ft_info *ft, + u32 group_id, + struct mlx5dr_cmd_fte_info *fte); + +bool mlx5dr_ste_supp_ttl_cs_recalc(struct mlx5dr_cmd_caps *caps); + +struct mlx5dr_fw_recalc_cs_ft { + u64 rx_icm_addr; + u32 table_id; + u32 group_id; + u32 modify_hdr_id; +}; + +struct mlx5dr_fw_recalc_cs_ft * +mlx5dr_fw_create_recalc_cs_ft(struct mlx5dr_domain *dmn, u16 vport_num); +void mlx5dr_fw_destroy_recalc_cs_ft(struct mlx5dr_domain *dmn, + struct mlx5dr_fw_recalc_cs_ft *recalc_cs_ft); +int mlx5dr_domain_get_recalc_cs_ft_addr(struct mlx5dr_domain *dmn, + u16 vport_num, + u64 *rx_icm_addr); +int mlx5dr_fw_create_md_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_cmd_flow_destination_hw_info *dest, + 
int num_dest, + bool reformat_req, + u32 *tbl_id, + u32 *group_id, + bool ignore_flow_level, + u32 flow_source); +void mlx5dr_fw_destroy_md_tbl(struct mlx5dr_domain *dmn, u32 tbl_id, + u32 group_id); +#endif /* _DR_TYPES_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c new file mode 100644 index 0000000..2032689 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -0,0 +1,817 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies */ + +#include +#include "mlx5_core.h" +#include "fs_core.h" +#include "fs_cmd.h" +#include "mlx5dr.h" +#include "fs_dr.h" + +static bool mlx5_dr_is_fw_table(u32 flags) +{ + if (flags & MLX5_FLOW_TABLE_TERMINATION) + return true; + + return false; +} + +static int mlx5_cmd_dr_update_root_ft(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 underlay_qpn, + bool disconnect) +{ + return mlx5_fs_cmd_get_fw_cmds()->update_root_ft(ns, ft, underlay_qpn, + disconnect); +} + +static int set_miss_action(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + struct mlx5dr_action *old_miss_action; + struct mlx5dr_action *action = NULL; + struct mlx5dr_table *next_tbl; + int err; + + next_tbl = next_ft ? next_ft->fs_dr_table.dr_table : NULL; + if (next_tbl) { + action = mlx5dr_action_create_dest_table(next_tbl); + if (!action) + return -EINVAL; + } + old_miss_action = ft->fs_dr_table.miss_action; + err = mlx5dr_table_set_miss_action(ft->fs_dr_table.dr_table, action); + if (err && action) { + err = mlx5dr_action_destroy(action); + if (err) + mlx5_core_err(ns->dev, + "Failed to destroy action (%d)\n", err); + action = NULL; + } + ft->fs_dr_table.miss_action = action; + if (old_miss_action) { + err = mlx5dr_action_destroy(old_miss_action); + if (err) + mlx5_core_err(ns->dev, "Failed to destroy action (%d)\n", + err); + } + + return err; +} + +static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_flow_table *next_ft) +{ + struct mlx5dr_table *tbl; + u32 flags; + int err; + + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft, + ft_attr, + next_ft); + flags = ft->flags; + /* turn off encap/decap if not supported for sw-str by fw */ + if (!MLX5_CAP_FLOWTABLE(ns->dev, sw_owner_reformat_supported)) + flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); + + tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags, + ft_attr->uid); + if (!tbl) { + mlx5_core_err(ns->dev, "Failed creating dr flow_table\n"); + return -EINVAL; + } + + ft->fs_dr_table.dr_table = tbl; + ft->id = mlx5dr_table_get_id(tbl); + + if (next_ft) { + err = set_miss_action(ns, ft, next_ft); + if (err) { + mlx5dr_table_destroy(tbl); + ft->fs_dr_table.dr_table = NULL; + return err; + } + } + + ft->max_fte = INT_MAX; + + return 0; +} + +static int mlx5_cmd_dr_destroy_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft) +{ + struct mlx5dr_action *action = ft->fs_dr_table.miss_action; + int err; + + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_table(ns, ft); + + err = mlx5dr_table_destroy(ft->fs_dr_table.dr_table); + if (err) { + mlx5_core_err(ns->dev, 
"Failed to destroy flow_table (%d)\n", + err); + return err; + } + if (action) { + err = mlx5dr_action_destroy(action); + if (err) { + mlx5_core_err(ns->dev, "Failed to destroy action(%d)\n", + err); + return err; + } + } + + return err; +} + +static int mlx5_cmd_dr_modify_flow_table(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_table *next_ft) +{ + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->modify_flow_table(ns, ft, next_ft); + + return set_miss_action(ns, ft, next_ft); +} + +static int mlx5_cmd_dr_create_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + u32 *in, + struct mlx5_flow_group *fg) +{ + struct mlx5dr_matcher *matcher; + u32 priority = MLX5_GET(create_flow_group_in, in, + start_flow_index); + u8 match_criteria_enable = MLX5_GET(create_flow_group_in, + in, + match_criteria_enable); + struct mlx5dr_match_parameters mask; + + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->create_flow_group(ns, ft, in, + fg); + + mask.match_buf = MLX5_ADDR_OF(create_flow_group_in, + in, match_criteria); + mask.match_sz = sizeof(fg->mask.match_criteria); + + matcher = mlx5dr_matcher_create(ft->fs_dr_table.dr_table, + priority, + match_criteria_enable, + &mask); + if (!matcher) { + mlx5_core_err(ns->dev, "Failed creating matcher\n"); + return -EINVAL; + } + + fg->fs_dr_matcher.dr_matcher = matcher; + return 0; +} + +static int mlx5_cmd_dr_destroy_flow_group(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg) +{ + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->destroy_flow_group(ns, ft, fg); + + return mlx5dr_matcher_destroy(fg->fs_dr_matcher.dr_matcher); +} + +static struct mlx5dr_action *create_vport_action(struct mlx5dr_domain *domain, + struct mlx5_flow_rule *dst) +{ + struct mlx5_flow_destination *dest_attr = &dst->dest_attr; + + return mlx5dr_action_create_dest_vport(domain, dest_attr->vport.num, + dest_attr->vport.flags & + MLX5_FLOW_DEST_VPORT_VHCA_ID, + dest_attr->vport.vhca_id); +} + +static struct mlx5dr_action *create_uplink_action(struct mlx5dr_domain *domain, + struct mlx5_flow_rule *dst) +{ + struct mlx5_flow_destination *dest_attr = &dst->dest_attr; + + return mlx5dr_action_create_dest_vport(domain, MLX5_VPORT_UPLINK, 1, + dest_attr->vport.vhca_id); +} + +static struct mlx5dr_action *create_ft_action(struct mlx5dr_domain *domain, + struct mlx5_flow_rule *dst) +{ + struct mlx5_flow_table *dest_ft = dst->dest_attr.ft; + + if (mlx5_dr_is_fw_table(dest_ft->flags)) + return mlx5dr_action_create_dest_flow_fw_table(domain, dest_ft); + return mlx5dr_action_create_dest_table(dest_ft->fs_dr_table.dr_table); +} + +static struct mlx5dr_action *create_action_push_vlan(struct mlx5dr_domain *domain, + struct mlx5_fs_vlan *vlan) +{ + u16 n_ethtype = vlan->ethtype; + u8 prio = vlan->prio; + u16 vid = vlan->vid; + u32 vlan_hdr; + + vlan_hdr = (u32)n_ethtype << 16 | (u32)(prio) << 12 | (u32)vid; + return mlx5dr_action_create_push_vlan(domain, htonl(vlan_hdr)); +} + +static bool contain_vport_reformat_action(struct mlx5_flow_rule *dst) +{ + return (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_VPORT || + dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_UPLINK) && + dst->dest_attr.vport.flags & MLX5_FLOW_DEST_VPORT_REFORMAT_ID; +} + +/* We want to support a rule with 32 destinations, which means we need to + * account for 32 destinations plus usually a counter plus one more action + * for a multi-destination 
flow table. + */ +#define MLX5_FLOW_CONTEXT_ACTION_MAX 34 +static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + struct fs_fte *fte) +{ + struct mlx5dr_domain *domain = ns->fs_dr_domain.dr_domain; + struct mlx5dr_action_dest *term_actions; + struct mlx5dr_match_parameters params; + struct mlx5_core_dev *dev = ns->dev; + struct mlx5dr_action **fs_dr_actions; + struct mlx5dr_action *tmp_action; + struct mlx5dr_action **actions; + bool delay_encap_set = false; + struct mlx5dr_rule *rule; + struct mlx5_flow_rule *dst; + int fs_dr_num_actions = 0; + int num_term_actions = 0; + int num_actions = 0; + size_t match_sz; + int err = 0; + int i; + + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->create_fte(ns, ft, group, fte); + + actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, sizeof(*actions), + GFP_KERNEL); + if (!actions) { + err = -ENOMEM; + goto out_err; + } + + fs_dr_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, + sizeof(*fs_dr_actions), GFP_KERNEL); + if (!fs_dr_actions) { + err = -ENOMEM; + goto free_actions_alloc; + } + + term_actions = kcalloc(MLX5_FLOW_CONTEXT_ACTION_MAX, + sizeof(*term_actions), GFP_KERNEL); + if (!term_actions) { + err = -ENOMEM; + goto free_fs_dr_actions_alloc; + } + + match_sz = sizeof(fte->val); + + /* Drop reformat action bit if destination vport set with reformat */ + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + list_for_each_entry(dst, &fte->node.children, node.list) { + if (!contain_vport_reformat_action(dst)) + continue; + + fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + break; + } + } + + /* The order of the actions are must to be keep, only the following + * order is supported by SW steering: + * TX: modify header -> push vlan -> encap + * RX: decap -> pop vlan -> modify header + */ + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_DECAP) { + enum mlx5dr_action_reformat_type decap_type = + DR_ACTION_REFORMAT_TYP_TNL_L2_TO_L2; + + tmp_action = mlx5dr_action_create_packet_reformat(domain, + decap_type, + 0, 0, 0, + NULL); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) { + bool is_decap = fte->action.pkt_reformat->reformat_type == + MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + + if (is_decap) + actions[num_actions++] = + fte->action.pkt_reformat->action.dr_action; + else + delay_encap_set = true; + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP) { + tmp_action = + mlx5dr_action_create_pop_vlan(); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2) { + tmp_action = + mlx5dr_action_create_pop_vlan(); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + actions[num_actions++] = + fte->action.modify_hdr->action.dr_action; + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) { + tmp_action = create_action_push_vlan(domain, &fte->action.vlan[0]); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = 
tmp_action; + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2) { + tmp_action = create_action_push_vlan(domain, &fte->action.vlan[1]); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + + if (delay_encap_set) + actions[num_actions++] = + fte->action.pkt_reformat->action.dr_action; + + /* The order of the actions below is not important */ + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { + tmp_action = mlx5dr_action_create_drop(); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + term_actions[num_term_actions++].dest = tmp_action; + } + + if (fte->flow_context.flow_tag) { + tmp_action = + mlx5dr_action_create_tag(fte->flow_context.flow_tag); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + list_for_each_entry(dst, &fte->node.children, node.list) { + enum mlx5_flow_destination_type type = dst->dest_attr.type; + u32 id; + + if (num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX || + num_term_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX || + fs_dr_num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + + if (type == MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + switch (type) { + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE: + tmp_action = create_ft_action(domain, dst); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + term_actions[num_term_actions++].dest = tmp_action; + break; + case MLX5_FLOW_DESTINATION_TYPE_UPLINK: + case MLX5_FLOW_DESTINATION_TYPE_VPORT: + tmp_action = type == MLX5_FLOW_DESTINATION_TYPE_VPORT ? 
+ create_vport_action(domain, dst) : + create_uplink_action(domain, dst); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + term_actions[num_term_actions].dest = tmp_action; + + if (dst->dest_attr.vport.flags & + MLX5_FLOW_DEST_VPORT_REFORMAT_ID) + term_actions[num_term_actions].reformat = + dst->dest_attr.vport.pkt_reformat->action.dr_action; + + num_term_actions++; + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM: + id = dst->dest_attr.ft_num; + tmp_action = mlx5dr_action_create_dest_table_num(domain, + id); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + term_actions[num_term_actions++].dest = tmp_action; + break; + case MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER: + id = dst->dest_attr.sampler_id; + tmp_action = mlx5dr_action_create_flow_sampler(domain, + id); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + term_actions[num_term_actions++].dest = tmp_action; + break; + default: + err = -EOPNOTSUPP; + goto free_actions; + } + } + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + list_for_each_entry(dst, &fte->node.children, node.list) { + u32 id; + + if (dst->dest_attr.type != + MLX5_FLOW_DESTINATION_TYPE_COUNTER) + continue; + + if (num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX || + fs_dr_num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + + id = dst->dest_attr.counter_id; + tmp_action = + mlx5dr_action_create_flow_counter(id); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + } + + if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO) { + if (fte->action.exe_aso.type != MLX5_EXE_ASO_FLOW_METER) { + err = -EOPNOTSUPP; + goto free_actions; + } + + tmp_action = + mlx5dr_action_create_aso(domain, + fte->action.exe_aso.object_id, + fte->action.exe_aso.return_reg_id, + fte->action.exe_aso.type, + fte->action.exe_aso.flow_meter.init_color, + fte->action.exe_aso.flow_meter.meter_idx); + if (!tmp_action) { + err = -ENOMEM; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + + params.match_sz = match_sz; + params.match_buf = (u64 *)fte->val; + if (num_term_actions == 1) { + if (term_actions->reformat) { + if (num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + actions[num_actions++] = term_actions->reformat; + } + + if (num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + actions[num_actions++] = term_actions->dest; + } else if (num_term_actions > 1) { + bool ignore_flow_level = + !!(fte->action.flags & FLOW_ACT_IGNORE_FLOW_LEVEL); + u32 flow_source = fte->flow_context.flow_source; + + if (num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX || + fs_dr_num_actions >= MLX5_FLOW_CONTEXT_ACTION_MAX) { + err = -EOPNOTSUPP; + goto free_actions; + } + tmp_action = mlx5dr_action_create_mult_dest_tbl(domain, + term_actions, + num_term_actions, + ignore_flow_level, + flow_source); + if (!tmp_action) { + err = -EOPNOTSUPP; + goto free_actions; + } + fs_dr_actions[fs_dr_num_actions++] = tmp_action; + actions[num_actions++] = tmp_action; + } + + rule = mlx5dr_rule_create(group->fs_dr_matcher.dr_matcher, + &params, + num_actions, + actions, + fte->flow_context.flow_source); + if (!rule) { +
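+ /* mlx5dr_rule_create() signals failure by returning NULL, so report -EINVAL and release every SW steering action created above via the free_actions unwind path */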
err = -EINVAL; + goto free_actions; + } + + kfree(term_actions); + kfree(actions); + + fte->fs_dr_rule.dr_rule = rule; + fte->fs_dr_rule.num_actions = fs_dr_num_actions; + fte->fs_dr_rule.dr_actions = fs_dr_actions; + + return 0; + +free_actions: + /* Free in reverse order to handle action dependencies */ + for (i = fs_dr_num_actions - 1; i >= 0; i--) + if (!IS_ERR_OR_NULL(fs_dr_actions[i])) + mlx5dr_action_destroy(fs_dr_actions[i]); + + kfree(term_actions); +free_fs_dr_actions_alloc: + kfree(fs_dr_actions); +free_actions_alloc: + kfree(actions); +out_err: + mlx5_core_err(dev, "Failed to create dr rule err(%d)\n", err); + return err; +} + +static int mlx5_cmd_dr_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat_params *params, + enum mlx5_flow_namespace_type namespace, + struct mlx5_pkt_reformat *pkt_reformat) +{ + struct mlx5dr_domain *dr_domain = ns->fs_dr_domain.dr_domain; + struct mlx5dr_action *action; + int dr_reformat; + + switch (params->type) { + case MLX5_REFORMAT_TYPE_L2_TO_VXLAN: + case MLX5_REFORMAT_TYPE_L2_TO_NVGRE: + case MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + dr_reformat = DR_ACTION_REFORMAT_TYP_L2_TO_TNL_L2; + break; + case MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + dr_reformat = DR_ACTION_REFORMAT_TYP_TNL_L3_TO_L2; + break; + case MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + dr_reformat = DR_ACTION_REFORMAT_TYP_L2_TO_TNL_L3; + break; + case MLX5_REFORMAT_TYPE_INSERT_HDR: + dr_reformat = DR_ACTION_REFORMAT_TYP_INSERT_HDR; + break; + case MLX5_REFORMAT_TYPE_REMOVE_HDR: + dr_reformat = DR_ACTION_REFORMAT_TYP_REMOVE_HDR; + break; + default: + mlx5_core_err(ns->dev, "Packet-reformat not supported(%d)\n", + params->type); + return -EOPNOTSUPP; + } + + action = mlx5dr_action_create_packet_reformat(dr_domain, + dr_reformat, + params->param_0, + params->param_1, + params->size, + params->data); + if (!action) { + mlx5_core_err(ns->dev, "Failed allocating packet-reformat action\n"); + return -EINVAL; + } + + pkt_reformat->sw_owned = true; + pkt_reformat->action.dr_action = action; + + return 0; +} + +static void mlx5_cmd_dr_packet_reformat_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_pkt_reformat *pkt_reformat) +{ + mlx5dr_action_destroy(pkt_reformat->action.dr_action); +} + +static int mlx5_cmd_dr_modify_header_alloc(struct mlx5_flow_root_namespace *ns, + u8 namespace, u8 num_actions, + void *modify_actions, + struct mlx5_modify_hdr *modify_hdr) +{ + struct mlx5dr_domain *dr_domain = ns->fs_dr_domain.dr_domain; + struct mlx5dr_action *action; + size_t actions_sz; + + actions_sz = MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto) * + num_actions; + action = mlx5dr_action_create_modify_header(dr_domain, 0, + actions_sz, + modify_actions); + if (!action) { + mlx5_core_err(ns->dev, "Failed allocating modify-header action\n"); + return -EINVAL; + } + + modify_hdr->sw_owned = true; + modify_hdr->action.dr_action = action; + + return 0; +} + +static void mlx5_cmd_dr_modify_header_dealloc(struct mlx5_flow_root_namespace *ns, + struct mlx5_modify_hdr *modify_hdr) +{ + mlx5dr_action_destroy(modify_hdr->action.dr_action); +} + +static int +mlx5_cmd_dr_destroy_match_definer(struct mlx5_flow_root_namespace *ns, + int definer_id) +{ + return -EOPNOTSUPP; +} + +static int mlx5_cmd_dr_create_match_definer(struct mlx5_flow_root_namespace *ns, + u16 format_id, u32 *match_mask) +{ + return -EOPNOTSUPP; +} + +static int mlx5_cmd_dr_delete_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct fs_fte *fte) +{ + struct mlx5_fs_dr_rule *rule = 
&fte->fs_dr_rule; + int err; + int i; + + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->delete_fte(ns, ft, fte); + + err = mlx5dr_rule_destroy(rule->dr_rule); + if (err) + return err; + + /* Free in reverse order to handle action dependencies */ + for (i = rule->num_actions - 1; i >= 0; i--) + if (!IS_ERR_OR_NULL(rule->dr_actions[i])) + mlx5dr_action_destroy(rule->dr_actions[i]); + + kfree(rule->dr_actions); + return 0; +} + +static int mlx5_cmd_dr_update_fte(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_table *ft, + struct mlx5_flow_group *group, + int modify_mask, + struct fs_fte *fte) +{ + struct fs_fte fte_tmp = {}; + int ret; + + if (mlx5_dr_is_fw_table(ft->flags)) + return mlx5_fs_cmd_get_fw_cmds()->update_fte(ns, ft, group, modify_mask, fte); + + /* Backup current dr rule details */ + fte_tmp.fs_dr_rule = fte->fs_dr_rule; + memset(&fte->fs_dr_rule, 0, sizeof(struct mlx5_fs_dr_rule)); + + /* First add the new updated rule, then delete the old rule */ + ret = mlx5_cmd_dr_create_fte(ns, ft, group, fte); + if (ret) + goto restore_fte; + + ret = mlx5_cmd_dr_delete_fte(ns, ft, &fte_tmp); + WARN_ONCE(ret, "dr update fte duplicate rule deletion failed\n"); + return ret; + +restore_fte: + fte->fs_dr_rule = fte_tmp.fs_dr_rule; + return ret; +} + +static int mlx5_cmd_dr_set_peer(struct mlx5_flow_root_namespace *ns, + struct mlx5_flow_root_namespace *peer_ns) +{ + struct mlx5dr_domain *peer_domain = NULL; + + if (peer_ns) + peer_domain = peer_ns->fs_dr_domain.dr_domain; + mlx5dr_domain_set_peer(ns->fs_dr_domain.dr_domain, + peer_domain); + return 0; +} + +static int mlx5_cmd_dr_create_ns(struct mlx5_flow_root_namespace *ns) +{ + ns->fs_dr_domain.dr_domain = + mlx5dr_domain_create(ns->dev, + MLX5DR_DOMAIN_TYPE_FDB); + if (!ns->fs_dr_domain.dr_domain) { + mlx5_core_err(ns->dev, "Failed to create dr flow namespace\n"); + return -EOPNOTSUPP; + } + return 0; +} + +static int mlx5_cmd_dr_destroy_ns(struct mlx5_flow_root_namespace *ns) +{ + return mlx5dr_domain_destroy(ns->fs_dr_domain.dr_domain); +} + +bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) +{ + return mlx5dr_is_supported(dev); +} + +static const struct mlx5_flow_cmds mlx5_flow_cmds_dr = { + .create_flow_table = mlx5_cmd_dr_create_flow_table, + .destroy_flow_table = mlx5_cmd_dr_destroy_flow_table, + .modify_flow_table = mlx5_cmd_dr_modify_flow_table, + .create_flow_group = mlx5_cmd_dr_create_flow_group, + .destroy_flow_group = mlx5_cmd_dr_destroy_flow_group, + .create_fte = mlx5_cmd_dr_create_fte, + .update_fte = mlx5_cmd_dr_update_fte, + .delete_fte = mlx5_cmd_dr_delete_fte, + .update_root_ft = mlx5_cmd_dr_update_root_ft, + .packet_reformat_alloc = mlx5_cmd_dr_packet_reformat_alloc, + .packet_reformat_dealloc = mlx5_cmd_dr_packet_reformat_dealloc, + .modify_header_alloc = mlx5_cmd_dr_modify_header_alloc, + .modify_header_dealloc = mlx5_cmd_dr_modify_header_dealloc, + .create_match_definer = mlx5_cmd_dr_create_match_definer, + .destroy_match_definer = mlx5_cmd_dr_destroy_match_definer, + .set_peer = mlx5_cmd_dr_set_peer, + .create_ns = mlx5_cmd_dr_create_ns, + .destroy_ns = mlx5_cmd_dr_destroy_ns, +}; + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void) +{ + return &mlx5_flow_cmds_dr; +} + +u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + return mlx5dr_action_get_pkt_reformat_id(pkt_reformat->action.dr_action); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h new file mode 100644 index 0000000..bb1a613 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + * Copyright (c) 2019 Mellanox Technologies + */ + +#ifndef _MLX5_FS_DR_ +#define _MLX5_FS_DR_ + +#include "mlx5dr.h" + +struct mlx5_flow_root_namespace; +struct fs_fte; + +struct mlx5_fs_dr_action { + struct mlx5dr_action *dr_action; +}; + +struct mlx5_fs_dr_ns { + struct mlx5_dr_ns *dr_ns; +}; + +struct mlx5_fs_dr_rule { + struct mlx5dr_rule *dr_rule; + /* Only actions created by fs_dr */ + struct mlx5dr_action **dr_actions; + int num_actions; +}; + +struct mlx5_fs_dr_domain { + struct mlx5dr_domain *dr_domain; +}; + +struct mlx5_fs_dr_matcher { + struct mlx5dr_matcher *dr_matcher; +}; + +struct mlx5_fs_dr_table { + struct mlx5dr_table *dr_table; + struct mlx5dr_action *miss_action; +}; + +#ifdef CONFIG_MLX5_SW_STEERING + +bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev); + +const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void); + +u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat); + +#else + +static inline const struct mlx5_flow_cmds *mlx5_fs_cmd_get_dr_cmds(void) +{ + return NULL; +} + +static inline bool mlx5_fs_dr_is_supported(struct mlx5_core_dev *dev) +{ + return false; +} + +static inline u32 mlx5_fs_dr_action_get_pkt_reformat_id(struct mlx5_pkt_reformat *pkt_reformat) +{ + WARN_ON(true); + + return 0; +} + +#endif /* CONFIG_MLX5_SW_STEERING */ +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h new file mode 100644 index 0000000..81c3451 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h @@ -0,0 +1,661 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019, Mellanox Technologies */ + +#ifndef MLX5_IFC_DR_H +#define MLX5_IFC_DR_H + +enum { + MLX5DR_STE_LU_TYPE_DONT_CARE = 0x0f, +}; + +struct mlx5_ifc_ste_general_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 reserved_at_60[0xa0]; + u8 tag_value[0x60]; + u8 bit_mask[0x60]; +}; + +struct mlx5_ifc_ste_sx_transmit_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_5c[0x2]; + u8 next_table_rank[0x2]; + + u8 sx_wire[0x1]; + u8 sx_func_lb[0x1]; + u8 sx_sniffer[0x1]; + u8 sx_wire_enable[0x1]; + u8 sx_func_lb_enable[0x1]; + u8 sx_sniffer_enable[0x1]; + u8 action_type[0x3]; + u8 reserved_at_69[0x1]; + u8 action_description[0x6]; + u8 gvmi[0x10]; + + u8 encap_pointer_vlan_data[0x20]; + + u8 loopback_syndome_en[0x8]; + u8 loopback_syndome[0x8]; + u8 counter_trigger[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 go_back[0x1]; + u8 match_polarity[0x1]; + u8 
mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_rx_steering_mult_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_[0x2]; + u8 next_table_rank[0x2]; + + u8 member_count[0x10]; + u8 gvmi[0x10]; + + u8 qp_list_pointer[0x20]; + + u8 reserved_at_a0[0x1]; + u8 tunneling_action[0x3]; + u8 action_description[0x4]; + u8 reserved_at_a8[0x8]; + u8 counter_trigger_15_0[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x08]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 fail_on_error[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_modify_packet_bits { + u8 entry_type[0x4]; + u8 reserved_at_4[0x4]; + u8 entry_sub_type[0x8]; + u8 byte_mask[0x10]; + + u8 next_table_base_63_48[0x10]; + u8 next_lu_type[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 linear_hash_enable[0x1]; + u8 reserved_at_[0x2]; + u8 next_table_rank[0x2]; + + u8 number_of_re_write_actions[0x10]; + u8 gvmi[0x10]; + + u8 header_re_write_actions_pointer[0x20]; + + u8 reserved_at_a0[0x1]; + u8 tunneling_action[0x3]; + u8 action_description[0x4]; + u8 reserved_at_a8[0x8]; + u8 counter_trigger_15_0[0x10]; + + u8 miss_address_63_48[0x10]; + u8 counter_trigger_23_16[0x08]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 learning_point[0x1]; + u8 fail_on_error[0x1]; + u8 match_polarity[0x1]; + u8 mask_mode[0x1]; + u8 miss_rank[0x2]; +}; + +struct mlx5_ifc_ste_mask_and_match_bits { + u8 entry_format[0x8]; + u8 counter_id[0x18]; + + u8 miss_address_63_48[0x10]; + u8 match_definer_ctx_idx[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 reserved_at_5a[0x1]; + u8 match_polarity[0x1]; + u8 reparse[0x1]; + u8 reserved_at_5d[0x3]; + + u8 next_table_base_63_48[0x10]; + u8 hash_definer_ctx_idx[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 hash_type[0x2]; + u8 hash_after_actions[0x1]; + u8 reserved_at_9e[0x2]; + + u8 action[0x60]; +}; + +struct mlx5_ifc_ste_eth_l2_src_bits { + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 qp_type[0x2]; + u8 ethertype_filter[0x1]; + u8 reserved_at_43[0x1]; + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 reserved_at_48[0x4]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_52[0x2]; + u8 first_vlan_id[0xc]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 reserved_at_68[0x4]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_dst_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 qp_type[0x2]; + u8 ethertype_filter[0x1]; + u8 reserved_at_43[0x1]; + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 reserved_at_48[0x4]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_52[0x2]; + u8 first_vlan_id[0xc]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 reserved_at_68[0x4]; + 
u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_src_dst_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 smac_47_32[0x10]; + + u8 smac_31_0[0x20]; + + u8 sx_sniffer[0x1]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 port[0x1]; + u8 l3_type[0x2]; + u8 reserved_at_66[0x6]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_bits { + u8 destination_address[0x20]; + + u8 source_address[0x20]; + + u8 source_port[0x10]; + u8 destination_port[0x10]; + + u8 fragmented[0x1]; + u8 first_fragment[0x1]; + u8 reserved_at_62[0x2]; + u8 reserved_at_64[0x1]; + u8 ecn[0x2]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 dscp[0x6]; + u8 reserved_at_76[0x2]; + u8 protocol[0x8]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv6_dst_bits { + u8 dst_ip_127_96[0x20]; + + u8 dst_ip_95_64[0x20]; + + u8 dst_ip_63_32[0x20]; + + u8 dst_ip_31_0[0x20]; +}; + +struct mlx5_ifc_ste_eth_l2_tnl_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 l2_tunneling_network_id[0x20]; + + u8 ip_fragmented[0x1]; + u8 tcp_syn[0x1]; + u8 encp_type[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 reserved_at_6c[0x3]; + u8 gre_key_flag[0x1]; + u8 first_vlan_qualifier[0x2]; + u8 reserved_at_72[0x2]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv6_src_bits { + u8 src_ip_127_96[0x20]; + + u8 src_ip_95_64[0x20]; + + u8 src_ip_63_32[0x20]; + + u8 src_ip_31_0[0x20]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_misc_bits { + u8 version[0x4]; + u8 ihl[0x4]; + u8 reserved_at_8[0x8]; + u8 total_length[0x10]; + + u8 identification[0x10]; + u8 flags[0x3]; + u8 fragment_offset[0xd]; + + u8 time_to_live[0x8]; + u8 reserved_at_48[0x8]; + u8 checksum[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_eth_l4_bits { + u8 fragmented[0x1]; + u8 first_fragment[0x1]; + u8 reserved_at_2[0x6]; + u8 protocol[0x8]; + u8 dst_port[0x10]; + + u8 ipv6_version[0x4]; + u8 reserved_at_24[0x1]; + u8 ecn[0x2]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 src_port[0x10]; + + u8 ipv6_payload_length[0x10]; + u8 ipv6_hop_limit[0x8]; + u8 dscp[0x6]; + u8 reserved_at_5e[0x2]; + + u8 tcp_data_offset[0x4]; + u8 reserved_at_64[0x8]; + u8 flow_label[0x14]; +}; + +struct mlx5_ifc_ste_eth_l4_misc_bits { + u8 checksum[0x10]; + u8 length[0x10]; + + u8 seq_num[0x20]; + + u8 ack_num[0x20]; + + u8 urgent_pointer[0x10]; + u8 window_size[0x10]; +}; + +struct mlx5_ifc_ste_mpls_bits { + u8 mpls0_label[0x14]; + u8 mpls0_exp[0x3]; + u8 mpls0_s_bos[0x1]; + u8 mpls0_ttl[0x8]; + + u8 mpls1_label[0x20]; + + u8 mpls2_label[0x20]; + + u8 reserved_at_60[0x16]; + u8 mpls4_s_bit[0x1]; + u8 mpls4_qualifier[0x1]; + u8 mpls3_s_bit[0x1]; + u8 mpls3_qualifier[0x1]; + u8 mpls2_s_bit[0x1]; + u8 mpls2_qualifier[0x1]; + u8 mpls1_s_bit[0x1]; + u8 mpls1_qualifier[0x1]; + u8 mpls0_s_bit[0x1]; + u8 mpls0_qualifier[0x1]; +}; + +struct mlx5_ifc_ste_register_0_bits { + u8 register_0_h[0x20]; + + u8 register_0_l[0x20]; + + u8 register_1_h[0x20]; + + u8 register_1_l[0x20]; +}; + +struct mlx5_ifc_ste_register_1_bits { + u8 
register_2_h[0x20]; + + u8 register_2_l[0x20]; + + u8 register_3_h[0x20]; + + u8 register_3_l[0x20]; +}; + +struct mlx5_ifc_ste_gre_bits { + u8 gre_c_present[0x1]; + u8 reserved_at_30[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 strict_src_route[0x1]; + u8 recur[0x3]; + u8 flags[0x5]; + u8 version[0x3]; + u8 gre_protocol[0x10]; + + u8 checksum[0x10]; + u8 offset[0x10]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 seq_num[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_0_bits { + u8 flex_parser_3[0x20]; + + u8 flex_parser_2[0x20]; + + u8 flex_parser_1[0x20]; + + u8 flex_parser_0[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_1_bits { + u8 flex_parser_7[0x20]; + + u8 flex_parser_6[0x20]; + + u8 flex_parser_5[0x20]; + + u8 flex_parser_4[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_ok_bits { + u8 flex_parser_3[0x20]; + u8 flex_parser_2[0x20]; + u8 flex_parsers_ok[0x8]; + u8 reserved_at_48[0x18]; + u8 flex_parser_0[0x20]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_bits { + u8 flex_parser_tunneling_header_63_32[0x20]; + + u8 flex_parser_tunneling_header_31_0[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_vxlan_gpe_bits { + u8 outer_vxlan_gpe_flags[0x8]; + u8 reserved_at_8[0x10]; + u8 outer_vxlan_gpe_next_protocol[0x8]; + + u8 outer_vxlan_gpe_vni[0x18]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_geneve_bits { + u8 reserved_at_0[0x2]; + u8 geneve_opt_len[0x6]; + u8 geneve_oam[0x1]; + u8 reserved_at_9[0x7]; + u8 geneve_protocol_type[0x10]; + + u8 geneve_vni[0x18]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_flex_parser_tnl_gtpu_bits { + u8 reserved_at_0[0x5]; + u8 gtpu_msg_flags[0x3]; + u8 gtpu_msg_type[0x8]; + u8 reserved_at_10[0x10]; + + u8 gtpu_teid[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_tunnel_header_bits { + u8 tunnel_header_0[0x20]; + + u8 tunnel_header_1[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_ste_general_purpose_bits { + u8 general_purpose_lookup_field[0x20]; + + u8 reserved_at_20[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_src_gvmi_qp_bits { + u8 loopback_syndrome[0x8]; + u8 reserved_at_8[0x8]; + u8 source_gvmi[0x10]; + + u8 reserved_at_20[0x5]; + u8 force_lb[0x1]; + u8 functional_lb[0x1]; + u8 source_is_requestor[0x1]; + u8 source_qp[0x18]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_double_action_set_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_accelerated_modify_action_list_bits { + u8 action_id[0x8]; + u8 modify_actions_pattern_pointer[0x18]; + + u8 number_of_modify_actions[0x8]; + u8 modify_actions_argument_pointer[0x18]; +}; + +enum { + MLX5DR_ASO_FIRST_HIT_NUM_PER_OBJ = 512, + MLX5DR_ASO_FLOW_METER_NUM_PER_OBJ = 2, + MLX5DR_ASO_CT_NUM_PER_OBJ = 1, +}; + +struct mlx5_ifc_ste_aso_flow_meter_action_bits { + u8 reserved_at_0[0xc]; + u8 action[0x1]; + u8 initial_color[0x2]; + u8 line_id[0x1]; +}; + +struct mlx5_ifc_ste_double_action_aso_v1_bits { + u8 action_id[0x8]; + u8 aso_context_number[0x18]; + + u8 dest_reg_id[0x2]; + u8 change_ordering_tag[0x1]; + u8 aso_check_ordering[0x1]; + u8 aso_context_type[0x4]; + u8 reserved_at_28[0x8]; + union { + u8 aso_fields[0x10]; + struct mlx5_ifc_ste_aso_flow_meter_action_bits 
flow_meter; + }; +}; + +struct mlx5_ifc_ste_double_action_add_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 add_value[0x20]; +}; + +struct mlx5_ifc_l2_hdr_bits { + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 smac_47_32[0x10]; + + u8 smac_31_0[0x20]; + + u8 ethertype[0x10]; + u8 vlan_type[0x10]; + + u8 vlan[0x10]; + u8 reserved_at_90[0x10]; +}; + +/* Both HW set and HW add share the same HW format with different opcodes */ +struct mlx5_ifc_dr_action_hw_set_bits { + u8 opcode[0x8]; + u8 destination_field_code[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x3]; + u8 destination_length[0x5]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_dr_action_hw_copy_bits { + u8 opcode[0x8]; + u8 destination_field_code[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 reserved_at_20[0x8]; + u8 source_field_code[0x8]; + u8 reserved_at_30[0x2]; + u8 source_left_shifter[0x6]; + u8 reserved_at_38[0x8]; +}; + +#endif /* MLX5_IFC_DR_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr_ste_v1.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr_ste_v1.h new file mode 100644 index 0000000..34c2bd1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr_ste_v1.h @@ -0,0 +1,434 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. */ + +#ifndef MLX5_IFC_DR_STE_V1_H +#define MLX5_IFC_DR_STE_V1_H + +enum mlx5_ifc_ste_v1_modify_hdr_offset { + MLX5_MODIFY_HEADER_V1_QW_OFFSET = 0x20, +}; + +struct mlx5_ifc_ste_single_action_flow_tag_v1_bits { + u8 action_id[0x8]; + u8 flow_tag[0x18]; +}; + +struct mlx5_ifc_ste_single_action_modify_list_v1_bits { + u8 action_id[0x8]; + u8 num_of_modify_actions[0x8]; + u8 modify_actions_ptr[0x10]; +}; + +struct mlx5_ifc_ste_single_action_remove_header_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 reserved_at_10[0x2]; + u8 end_anchor[0x6]; + u8 reserved_at_18[0x4]; + u8 decap[0x1]; + u8 vni_to_cqe[0x1]; + u8 qos_profile[0x2]; +}; + +struct mlx5_ifc_ste_single_action_remove_header_size_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 outer_l4_remove[0x1]; + u8 reserved_at_11[0x1]; + u8 start_offset[0x7]; + u8 reserved_at_18[0x1]; + u8 remove_size[0x6]; +}; + +struct mlx5_ifc_ste_double_action_copy_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_17[0x2]; + u8 destination_length[0x6]; + + u8 reserved_at_20[0x8]; + u8 source_dw_offset[0x8]; + u8 reserved_at_30[0x2]; + u8 source_right_shifter[0x6]; + u8 reserved_at_38[0x8]; +}; + +struct mlx5_ifc_ste_double_action_set_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_add_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 add_value[0x20]; +}; + +struct 
mlx5_ifc_ste_double_action_insert_with_inline_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 start_offset[0x7]; + u8 reserved_at_17[0x9]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_insert_with_ptr_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 start_offset[0x7]; + u8 size[0x6]; + u8 attributes[0x3]; + + u8 pointer[0x20]; +}; + +struct mlx5_ifc_ste_double_action_modify_action_list_v1_bits { + u8 action_id[0x8]; + u8 modify_actions_pattern_pointer[0x18]; + + u8 number_of_modify_actions[0x8]; + u8 modify_actions_argument_pointer[0x18]; +}; + +struct mlx5_ifc_ste_match_bwc_v1_bits { + u8 entry_format[0x8]; + u8 counter_id[0x18]; + + u8 miss_address_63_48[0x10]; + u8 match_definer_ctx_idx[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 reserved_at_5a[0x1]; + u8 match_polarity[0x1]; + u8 reparse[0x1]; + u8 reserved_at_5d[0x3]; + + u8 next_table_base_63_48[0x10]; + u8 hash_definer_ctx_idx[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 hash_type[0x2]; + u8 hash_after_actions[0x1]; + u8 reserved_at_9e[0x2]; + + u8 byte_mask[0x10]; + u8 next_entry_format[0x1]; + u8 mask_mode[0x1]; + u8 gvmi[0xe]; + + u8 action[0x40]; +}; + +struct mlx5_ifc_ste_mask_and_match_v1_bits { + u8 entry_format[0x8]; + u8 counter_id[0x18]; + + u8 miss_address_63_48[0x10]; + u8 match_definer_ctx_idx[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 reserved_at_5a[0x1]; + u8 match_polarity[0x1]; + u8 reparse[0x1]; + u8 reserved_at_5d[0x3]; + + u8 next_table_base_63_48[0x10]; + u8 hash_definer_ctx_idx[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 hash_type[0x2]; + u8 hash_after_actions[0x1]; + u8 reserved_at_9e[0x2]; + + u8 action[0x60]; +}; + +struct mlx5_ifc_ste_eth_l2_src_v1_bits { + u8 reserved_at_0[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_loopback[0x1]; + u8 ip_fragmented[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x6]; + u8 tcp_syn[0x1]; + u8 reserved_at_67[0x3]; + u8 force_loopback[0x1]; + u8 l2_ok[0x1]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 second_vlan_qualifier[0x2]; + + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_dst_v1_bits { + u8 reserved_at_0[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_lb[0x1]; + u8 ip_fragmented[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x6]; + u8 tcp_syn[0x1]; + u8 reserved_at_67[0x3]; + u8 force_lb[0x1]; + u8 l2_ok[0x1]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l2_src_dst_v1_bits { + u8 dmac_47_16[0x20]; + + u8 smac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 reserved_at_50[0x2]; + u8 functional_lb[0x1]; + u8 reserved_at_53[0x5]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 reserved_at_5c[0x2]; + u8 first_vlan_qualifier[0x2]; + + u8 first_priority[0x3]; + 
u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + u8 smac_15_0[0x10]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_v1_bits { + u8 source_address[0x20]; + + u8 destination_address[0x20]; + + u8 source_port[0x10]; + u8 destination_port[0x10]; + + u8 reserved_at_60[0x4]; + u8 l4_ok[0x1]; + u8 l3_ok[0x1]; + u8 fragmented[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 dscp[0x6]; + u8 ecn[0x2]; + u8 protocol[0x8]; +}; + +struct mlx5_ifc_ste_eth_l2_tnl_v1_bits { + u8 l2_tunneling_network_id[0x20]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x3]; + u8 ip_fragmented[0x1]; + u8 reserved_at_64[0x2]; + u8 encp_type[0x2]; + u8 reserved_at_68[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; +}; + +struct mlx5_ifc_ste_eth_l3_ipv4_misc_v1_bits { + u8 identification[0x10]; + u8 flags[0x3]; + u8 fragment_offset[0xd]; + + u8 total_length[0x10]; + u8 checksum[0x10]; + + u8 version[0x4]; + u8 ihl[0x4]; + u8 time_to_live[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x1c]; + u8 voq_internal_prio[0x4]; +}; + +struct mlx5_ifc_ste_eth_l4_v1_bits { + u8 ipv6_version[0x4]; + u8 reserved_at_4[0x4]; + u8 dscp[0x6]; + u8 ecn[0x2]; + u8 ipv6_hop_limit[0x8]; + u8 protocol[0x8]; + + u8 src_port[0x10]; + u8 dst_port[0x10]; + + u8 first_fragment[0x1]; + u8 reserved_at_41[0xb]; + u8 flow_label[0x14]; + + u8 tcp_data_offset[0x4]; + u8 l4_ok[0x1]; + u8 l3_ok[0x1]; + u8 fragmented[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 ipv6_paylen[0x10]; +}; + +struct mlx5_ifc_ste_eth_l4_misc_v1_bits { + u8 window_size[0x10]; + u8 urgent_pointer[0x10]; + + u8 ack_num[0x20]; + + u8 seq_num[0x20]; + + u8 length[0x10]; + u8 checksum[0x10]; +}; + +struct mlx5_ifc_ste_mpls_v1_bits { + u8 reserved_at_0[0x15]; + u8 mpls_ok[0x1]; + u8 mpls4_s_bit[0x1]; + u8 mpls4_qualifier[0x1]; + u8 mpls3_s_bit[0x1]; + u8 mpls3_qualifier[0x1]; + u8 mpls2_s_bit[0x1]; + u8 mpls2_qualifier[0x1]; + u8 mpls1_s_bit[0x1]; + u8 mpls1_qualifier[0x1]; + u8 mpls0_s_bit[0x1]; + u8 mpls0_qualifier[0x1]; + + u8 mpls0_label[0x14]; + u8 mpls0_exp[0x3]; + u8 mpls0_s_bos[0x1]; + u8 mpls0_ttl[0x8]; + + u8 mpls1_label[0x20]; + + u8 mpls2_label[0x20]; +}; + +struct mlx5_ifc_ste_gre_v1_bits { + u8 gre_c_present[0x1]; + u8 reserved_at_1[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 strict_src_route[0x1]; + u8 recur[0x3]; + u8 flags[0x5]; + u8 version[0x3]; + u8 gre_protocol[0x10]; + + u8 reserved_at_20[0x20]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_src_gvmi_qp_v1_bits { + u8 loopback_synd[0x8]; + u8 reserved_at_8[0x7]; + u8 functional_lb[0x1]; + u8 source_gvmi[0x10]; + + u8 force_lb[0x1]; + u8 reserved_at_21[0x1]; + u8 source_is_requestor[0x1]; + u8 reserved_at_23[0x5]; + u8 source_qp[0x18]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_icmp_v1_bits { + u8 icmp_payload_data[0x20]; + + u8 icmp_header_data[0x20]; + + u8 icmp_type[0x8]; + u8 icmp_code[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x20]; +}; + +#endif /* MLX5_IFC_DR_STE_V1_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h new file mode 100644 index 0000000..5887388 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019, Mellanox Technologies */ + +#ifndef _MLX5DR_H_ +#define _MLX5DR_H_ + +struct mlx5dr_domain; +struct mlx5dr_table; +struct mlx5dr_matcher; +struct mlx5dr_rule; +struct mlx5dr_action; + +enum mlx5dr_domain_type { + MLX5DR_DOMAIN_TYPE_NIC_RX, + MLX5DR_DOMAIN_TYPE_NIC_TX, + MLX5DR_DOMAIN_TYPE_FDB, +}; + +enum mlx5dr_domain_sync_flags { + MLX5DR_DOMAIN_SYNC_FLAGS_SW = 1 << 0, + MLX5DR_DOMAIN_SYNC_FLAGS_HW = 1 << 1, +}; + +enum mlx5dr_action_reformat_type { + DR_ACTION_REFORMAT_TYP_TNL_L2_TO_L2, + DR_ACTION_REFORMAT_TYP_L2_TO_TNL_L2, + DR_ACTION_REFORMAT_TYP_TNL_L3_TO_L2, + DR_ACTION_REFORMAT_TYP_L2_TO_TNL_L3, + DR_ACTION_REFORMAT_TYP_INSERT_HDR, + DR_ACTION_REFORMAT_TYP_REMOVE_HDR, +}; + +struct mlx5dr_match_parameters { + size_t match_sz; + u64 *match_buf; /* Device spec format */ +}; + +struct mlx5dr_action_dest { + struct mlx5dr_action *dest; + struct mlx5dr_action *reformat; +}; + +struct mlx5dr_domain * +mlx5dr_domain_create(struct mlx5_core_dev *mdev, enum mlx5dr_domain_type type); + +int mlx5dr_domain_destroy(struct mlx5dr_domain *domain); + +int mlx5dr_domain_sync(struct mlx5dr_domain *domain, u32 flags); + +void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn, + struct mlx5dr_domain *peer_dmn); + +struct mlx5dr_table * +mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags, + u16 uid); + +struct mlx5dr_table * +mlx5dr_table_get_from_fs_ft(struct mlx5_flow_table *ft); + +int mlx5dr_table_destroy(struct mlx5dr_table *table); + +u32 mlx5dr_table_get_id(struct mlx5dr_table *table); + +struct mlx5dr_matcher * +mlx5dr_matcher_create(struct mlx5dr_table *table, + u32 priority, + u8 match_criteria_enable, + struct mlx5dr_match_parameters *mask); + +int mlx5dr_matcher_destroy(struct mlx5dr_matcher *matcher); + +struct mlx5dr_rule * +mlx5dr_rule_create(struct mlx5dr_matcher *matcher, + struct mlx5dr_match_parameters *value, + size_t num_actions, + struct mlx5dr_action *actions[], + u32 flow_source); + +int mlx5dr_rule_destroy(struct mlx5dr_rule *rule); + +int mlx5dr_table_set_miss_action(struct mlx5dr_table *tbl, + struct mlx5dr_action *action); + +struct mlx5dr_action * +mlx5dr_action_create_dest_table_num(struct mlx5dr_domain *dmn, u32 table_num); + +struct mlx5dr_action * +mlx5dr_action_create_dest_table(struct mlx5dr_table *table); + +struct mlx5dr_action * +mlx5dr_action_create_dest_flow_fw_table(struct mlx5dr_domain *domain, + struct mlx5_flow_table *ft); + +struct mlx5dr_action * +mlx5dr_action_create_dest_vport(struct mlx5dr_domain *domain, + u16 vport, u8 vhca_id_valid, + u16 vhca_id); + +struct mlx5dr_action * +mlx5dr_action_create_mult_dest_tbl(struct mlx5dr_domain *dmn, + struct mlx5dr_action_dest *dests, + u32 num_of_dests, + bool ignore_flow_level, + u32 flow_source); + +struct mlx5dr_action *mlx5dr_action_create_drop(void); + +struct mlx5dr_action *mlx5dr_action_create_tag(u32 tag_value); + +struct mlx5dr_action * +mlx5dr_action_create_flow_sampler(struct mlx5dr_domain *dmn, u32 sampler_id); + +struct mlx5dr_action * +mlx5dr_action_create_flow_counter(u32 counter_id); + +struct mlx5dr_action * +mlx5dr_action_create_packet_reformat(struct mlx5dr_domain *dmn, + enum mlx5dr_action_reformat_type reformat_type, + u8 reformat_param_0, + u8 
reformat_param_1, + size_t data_sz, + void *data); + +struct mlx5dr_action * +mlx5dr_action_create_modify_header(struct mlx5dr_domain *domain, + u32 flags, + size_t actions_sz, + __be64 actions[]); + +struct mlx5dr_action *mlx5dr_action_create_pop_vlan(void); + +struct mlx5dr_action * +mlx5dr_action_create_push_vlan(struct mlx5dr_domain *domain, __be32 vlan_hdr); + +u32 mlx5dr_action_get_pkt_reformat_id(struct mlx5dr_action *action); + +struct mlx5dr_action * +mlx5dr_action_create_aso(struct mlx5dr_domain *dmn, + u32 obj_id, + u8 return_reg_id, + u8 aso_type, + u8 init_color, + u8 meter_id); + +int mlx5dr_action_destroy(struct mlx5dr_action *action); + +static inline bool +mlx5dr_is_supported(struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, roce) && + (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner) || + (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner_v2) && + (MLX5_CAP_GEN(dev, steering_format_version) <= + MLX5_STEERING_FORMAT_CONNECTX_7))); +} + +/* buddy functions & structure */ + +struct mlx5dr_icm_mr; + +struct mlx5dr_icm_buddy_mem { + unsigned long **bitmap; + unsigned int *num_free; + u32 max_order; + struct list_head list_node; + struct mlx5dr_icm_mr *icm_mr; + struct mlx5dr_icm_pool *pool; + + /* Amount of memory in used chunks - HW may be accessing this memory */ + u64 used_memory; + + /* Memory optimisation */ + struct mlx5dr_ste *ste_arr; + struct list_head *miss_list; + u8 *hw_ste_arr; +}; + +int mlx5dr_buddy_init(struct mlx5dr_icm_buddy_mem *buddy, + unsigned int max_order); +void mlx5dr_buddy_cleanup(struct mlx5dr_icm_buddy_mem *buddy); +int mlx5dr_buddy_alloc_mem(struct mlx5dr_icm_buddy_mem *buddy, + unsigned int order, + unsigned int *segment); +void mlx5dr_buddy_free_mem(struct mlx5dr_icm_buddy_mem *buddy, + unsigned int seg, unsigned int order); + +#endif /* _MLX5DR_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/transobj.c new file mode 100644 index 0000000..b6931bb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "mlx5_core.h" +#include + +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {}; + int err; + + MLX5_SET(alloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); + + err = mlx5_cmd_exec_inout(dev, alloc_transport_domain, in, out); + if (!err) + *tdn = MLX5_GET(alloc_transport_domain_out, out, + transport_domain); + + return err; +} +EXPORT_SYMBOL(mlx5_core_alloc_transport_domain); + +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {}; + + MLX5_SET(dealloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); + mlx5_cmd_exec_in(dev, dealloc_transport_domain, in); +} +EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain); + +int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) +{ + u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {}; + int err; + + MLX5_SET(create_rq_in, in, opcode, MLX5_CMD_OP_CREATE_RQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *rqn = MLX5_GET(create_rq_out, out, rqn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_rq); + +int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in) +{ + MLX5_SET(modify_rq_in, in, rqn, rqn); + MLX5_SET(modify_rq_in, in, opcode, MLX5_CMD_OP_MODIFY_RQ); + + return mlx5_cmd_exec_in(dev, modify_rq, in); +} +EXPORT_SYMBOL(mlx5_core_modify_rq); + +void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; + + MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); + MLX5_SET(destroy_rq_in, in, rqn, rqn); + mlx5_cmd_exec_in(dev, destroy_rq, in); +} +EXPORT_SYMBOL(mlx5_core_destroy_rq); + +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {}; + + MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ); + MLX5_SET(query_rq_in, in, rqn, rqn); + + return mlx5_cmd_exec_inout(dev, query_rq, in, out); +} +EXPORT_SYMBOL(mlx5_core_query_rq); + +int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn) +{ + u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {}; + int err; + + MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *sqn = MLX5_GET(create_sq_out, out, sqn); + + return err; +} + +int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in) +{ + MLX5_SET(modify_sq_in, in, sqn, sqn); + MLX5_SET(modify_sq_in, in, opcode, MLX5_CMD_OP_MODIFY_SQ); + return mlx5_cmd_exec_in(dev, modify_sq, in); +} +EXPORT_SYMBOL(mlx5_core_modify_sq); + +void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {}; + + MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ); + MLX5_SET(destroy_sq_in, in, sqn, sqn); + mlx5_cmd_exec_in(dev, destroy_sq, in); +} + +int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {}; + + MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ); + MLX5_SET(query_sq_in, in, sqn, sqn); + return mlx5_cmd_exec_inout(dev, query_sq, in, out); +} +EXPORT_SYMBOL(mlx5_core_query_sq); + +int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state) +{ + void *out; + void *sqc; + int inlen; + int err; + + 
inlen = MLX5_ST_SZ_BYTES(query_sq_out); + out = kvzalloc(inlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_sq(dev, sqn, out); + if (err) + goto out; + + sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); + *state = MLX5_GET(sqc, sqc, state); + +out: + kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_sq_state); + +int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, u32 *tirn) +{ + u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {}; + int err; + + MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR); + err = mlx5_cmd_exec_inout(dev, create_tir, in, out); + if (!err) + *tirn = MLX5_GET(create_tir_out, out, tirn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_tir); + +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in) +{ + MLX5_SET(modify_tir_in, in, tirn, tirn); + MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR); + return mlx5_cmd_exec_in(dev, modify_tir, in); +} + +void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; + + MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, in, tirn, tirn); + mlx5_cmd_exec_in(dev, destroy_tir, in); +} +EXPORT_SYMBOL(mlx5_core_destroy_tir); + +int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, u32 *tisn) +{ + u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {}; + int err; + + MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS); + err = mlx5_cmd_exec_inout(dev, create_tis, in, out); + if (!err) + *tisn = MLX5_GET(create_tis_out, out, tisn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_tis); + +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in) +{ + MLX5_SET(modify_tis_in, in, tisn, tisn); + MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); + + return mlx5_cmd_exec_in(dev, modify_tis, in); +} +EXPORT_SYMBOL(mlx5_core_modify_tis); + +void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {}; + + MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); + MLX5_SET(destroy_tis_in, in, tisn, tisn); + mlx5_cmd_exec_in(dev, destroy_tis, in); +} +EXPORT_SYMBOL(mlx5_core_destroy_tis); + +int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, + u32 *rqtn) +{ + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {}; + int err; + + MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); + err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); + if (!err) + *rqtn = MLX5_GET(create_rqt_out, out, rqtn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_rqt); + +int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {}; + + MLX5_SET(modify_rqt_in, in, rqtn, rqtn); + MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); + return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); +} + +void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; + + MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); + mlx5_cmd_exec_in(dev, destroy_rqt, in); +} +EXPORT_SYMBOL(mlx5_core_destroy_rqt); + +static int mlx5_hairpin_create_rq(struct mlx5_core_dev *mdev, + struct mlx5_hairpin_params *params, u32 *rqn) +{ + u32 in[MLX5_ST_SZ_DW(create_rq_in)] = {0}; + void *rqc, *wq; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + + MLX5_SET(rqc, rqc, hairpin, 1); + MLX5_SET(rqc, rqc, 
state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, counter_set_id, params->q_counter); + + MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size); + MLX5_SET(wq, wq, log_hairpin_num_packets, params->log_num_packets); + + return mlx5_core_create_rq(mdev, in, MLX5_ST_SZ_BYTES(create_rq_in), rqn); +} + +static int mlx5_hairpin_create_sq(struct mlx5_core_dev *mdev, + struct mlx5_hairpin_params *params, u32 *sqn) +{ + u32 in[MLX5_ST_SZ_DW(create_sq_in)] = {0}; + void *sqc, *wq; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + MLX5_SET(sqc, sqc, hairpin, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + + MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size); + MLX5_SET(wq, wq, log_hairpin_num_packets, params->log_num_packets); + + return mlx5_core_create_sq(mdev, in, MLX5_ST_SZ_BYTES(create_sq_in), sqn); +} + +static int mlx5_hairpin_create_queues(struct mlx5_hairpin *hp, + struct mlx5_hairpin_params *params) +{ + int i, j, err; + + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_create_rq(hp->func_mdev, params, &hp->rqn[i]); + if (err) + goto out_err_rq; + } + + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_create_sq(hp->peer_mdev, params, &hp->sqn[i]); + if (err) + goto out_err_sq; + } + + return 0; + +out_err_sq: + for (j = 0; j < i; j++) + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[j]); + i = hp->num_channels; +out_err_rq: + for (j = 0; j < i; j++) + mlx5_core_destroy_rq(hp->func_mdev, hp->rqn[j]); + return err; +} + +static void mlx5_hairpin_destroy_queues(struct mlx5_hairpin *hp) +{ + int i; + + for (i = 0; i < hp->num_channels; i++) { + mlx5_core_destroy_rq(hp->func_mdev, hp->rqn[i]); + if (!hp->peer_gone) + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]); + } +} + +static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn, + int curr_state, int next_state, + u16 peer_vhca, u32 peer_sq) +{ + u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {}; + void *rqc; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + if (next_state == MLX5_RQC_STATE_RDY) { + MLX5_SET(rqc, rqc, hairpin_peer_sq, peer_sq); + MLX5_SET(rqc, rqc, hairpin_peer_vhca, peer_vhca); + } + + MLX5_SET(modify_rq_in, in, rq_state, curr_state); + MLX5_SET(rqc, rqc, state, next_state); + + return mlx5_core_modify_rq(func_mdev, rqn, in); +} + +static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, + int curr_state, int next_state, + u16 peer_vhca, u32 peer_rq) +{ + u32 in[MLX5_ST_SZ_DW(modify_sq_in)] = {0}; + void *sqc; + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + + if (next_state == MLX5_SQC_STATE_RDY) { + MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq); + MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca); + } + + MLX5_SET(modify_sq_in, in, sq_state, curr_state); + MLX5_SET(sqc, sqc, state, next_state); + + return mlx5_core_modify_sq(peer_mdev, sqn, in); +} + +static int mlx5_hairpin_pair_queues(struct mlx5_hairpin *hp) +{ + int i, j, err; + + /* set peer SQs */ + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], + MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY, + MLX5_CAP_GEN(hp->func_mdev, vhca_id), hp->rqn[i]); + if (err) + goto err_modify_sq; + } + + /* set func RQs */ + for (i = 0; i < hp->num_channels; i++) { + err = mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i], + MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY, + MLX5_CAP_GEN(hp->peer_mdev, vhca_id), hp->sqn[i]); + if (err) + goto err_modify_rq; + } + + return 0; + +err_modify_rq: + for (j = 0; j < i; j++) + 
mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[j], MLX5_RQC_STATE_RDY, + MLX5_RQC_STATE_RST, 0, 0); + i = hp->num_channels; +err_modify_sq: + for (j = 0; j < i; j++) + mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[j], MLX5_SQC_STATE_RDY, + MLX5_SQC_STATE_RST, 0, 0); + return err; +} + +static void mlx5_hairpin_unpair_peer_sq(struct mlx5_hairpin *hp) +{ + int i; + + for (i = 0; i < hp->num_channels; i++) + mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY, + MLX5_SQC_STATE_RST, 0, 0); +} + +static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp) +{ + int i; + + /* unset func RQs */ + for (i = 0; i < hp->num_channels; i++) + mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i], MLX5_RQC_STATE_RDY, + MLX5_RQC_STATE_RST, 0, 0); + /* unset peer SQs */ + if (!hp->peer_gone) + mlx5_hairpin_unpair_peer_sq(hp); +} + +struct mlx5_hairpin * +mlx5_core_hairpin_create(struct mlx5_core_dev *func_mdev, + struct mlx5_core_dev *peer_mdev, + struct mlx5_hairpin_params *params) +{ + struct mlx5_hairpin *hp; + int size, err; + + size = sizeof(*hp) + params->num_channels * 2 * sizeof(u32); + hp = kzalloc(size, GFP_KERNEL); + if (!hp) + return ERR_PTR(-ENOMEM); + + hp->func_mdev = func_mdev; + hp->peer_mdev = peer_mdev; + hp->num_channels = params->num_channels; + + hp->rqn = (void *)hp + sizeof(*hp); + hp->sqn = hp->rqn + params->num_channels; + + /* alloc and pair func --> peer hairpin */ + err = mlx5_hairpin_create_queues(hp, params); + if (err) + goto err_create_queues; + + err = mlx5_hairpin_pair_queues(hp); + if (err) + goto err_pair_queues; + + return hp; + +err_pair_queues: + mlx5_hairpin_destroy_queues(hp); +err_create_queues: + kfree(hp); + return ERR_PTR(err); +} + +void mlx5_core_hairpin_destroy(struct mlx5_hairpin *hp) +{ + mlx5_hairpin_unpair_queues(hp); + mlx5_hairpin_destroy_queues(hp); + kfree(hp); +} + +void mlx5_core_hairpin_clear_dead_peer(struct mlx5_hairpin *hp) +{ + int i; + + mlx5_hairpin_unpair_peer_sq(hp); + + /* destroy peer SQ */ + for (i = 0; i < hp->num_channels; i++) + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]); + + hp->peer_gone = true; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/uar.c new file mode 100644 index 0000000..eb93f72 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -0,0 +1,407 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include "mlx5_core.h" + +static int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn) +{ + u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_uar_in)] = {}; + int err; + + MLX5_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR); + err = mlx5_cmd_exec_inout(dev, alloc_uar, in, out); + if (err) + return err; + + *uarn = MLX5_GET(alloc_uar_out, out, uar); + return 0; +} + +static int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_uar_in)] = {}; + + MLX5_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR); + MLX5_SET(dealloc_uar_in, in, uar, uarn); + return mlx5_cmd_exec_in(dev, dealloc_uar, in); +} + +static int uars_per_sys_page(struct mlx5_core_dev *mdev) +{ + if (MLX5_CAP_GEN(mdev, uar_4k)) + return MLX5_CAP_GEN(mdev, num_of_uars_per_page); + + return 1; +} + +static u64 uar2pfn(struct mlx5_core_dev *mdev, u32 index) +{ + u32 system_page_index; + + if (MLX5_CAP_GEN(mdev, uar_4k)) + system_page_index = index >> (PAGE_SHIFT - MLX5_ADAPTER_PAGE_SHIFT); + else + system_page_index = index; + + return (mdev->bar_addr >> PAGE_SHIFT) + system_page_index; +} + +static void up_rel_func(struct kref *kref) +{ + struct mlx5_uars_page *up = container_of(kref, struct mlx5_uars_page, ref_count); + + list_del(&up->list); + iounmap(up->map); + if (mlx5_cmd_free_uar(up->mdev, up->index)) + mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index); + bitmap_free(up->reg_bitmap); + bitmap_free(up->fp_bitmap); + kfree(up); +} + +static struct mlx5_uars_page *alloc_uars_page(struct mlx5_core_dev *mdev, + bool map_wc) +{ + struct mlx5_uars_page *up; + int err = -ENOMEM; + phys_addr_t pfn; + int bfregs; + int i; + + bfregs = uars_per_sys_page(mdev) * MLX5_BFREGS_PER_UAR; + up = kzalloc(sizeof(*up), GFP_KERNEL); + if (!up) + return ERR_PTR(err); + + up->mdev = mdev; + up->reg_bitmap = bitmap_zalloc(bfregs, GFP_KERNEL); + if (!up->reg_bitmap) + goto error1; + + up->fp_bitmap = bitmap_zalloc(bfregs, GFP_KERNEL); + if (!up->fp_bitmap) + goto error1; + + for (i = 0; i < bfregs; i++) + if ((i % MLX5_BFREGS_PER_UAR) < MLX5_NON_FP_BFREGS_PER_UAR) + set_bit(i, up->reg_bitmap); + else + set_bit(i, up->fp_bitmap); + + up->bfregs = bfregs; + up->fp_avail = bfregs * MLX5_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR; + up->reg_avail = bfregs * MLX5_NON_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR; + + err = mlx5_cmd_alloc_uar(mdev, &up->index); + if (err) { + mlx5_core_warn(mdev, "mlx5_cmd_alloc_uar() failed, %d\n", err); + goto error1; + } + + pfn = uar2pfn(mdev, up->index); + if (map_wc) { + up->map = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!up->map) { + err = -EAGAIN; + goto error2; + } + } else { + up->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!up->map) { + err = -ENOMEM; + goto error2; + } + } + kref_init(&up->ref_count); + mlx5_core_dbg(mdev, "allocated UAR page: index %d, total bfregs %d\n", + up->index, up->bfregs); + return up; + +error2: + if (mlx5_cmd_free_uar(mdev, up->index)) + mlx5_core_warn(mdev, "failed to free uar index %d\n", up->index); +error1: + bitmap_free(up->fp_bitmap); + bitmap_free(up->reg_bitmap); + kfree(up); + return ERR_PTR(err); +} + +struct mlx5_uars_page *mlx5_get_uars_page(struct 
mlx5_core_dev *mdev) +{ + struct mlx5_uars_page *ret; + + mutex_lock(&mdev->priv.bfregs.reg_head.lock); + if (!list_empty(&mdev->priv.bfregs.reg_head.list)) { + ret = list_first_entry(&mdev->priv.bfregs.reg_head.list, + struct mlx5_uars_page, list); + kref_get(&ret->ref_count); + goto out; + } + ret = alloc_uars_page(mdev, false); + if (IS_ERR(ret)) + goto out; + list_add(&ret->list, &mdev->priv.bfregs.reg_head.list); +out: + mutex_unlock(&mdev->priv.bfregs.reg_head.lock); + + return ret; +} +EXPORT_SYMBOL(mlx5_get_uars_page); + +void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up) +{ + mutex_lock(&mdev->priv.bfregs.reg_head.lock); + kref_put(&up->ref_count, up_rel_func); + mutex_unlock(&mdev->priv.bfregs.reg_head.lock); +} +EXPORT_SYMBOL(mlx5_put_uars_page); + +static unsigned long map_offset(struct mlx5_core_dev *mdev, int dbi) +{ + /* return the offset in bytes from the start of the page to the + * blue flame area of the UAR + */ + return dbi / MLX5_BFREGS_PER_UAR * MLX5_ADAPTER_PAGE_SIZE + + (dbi % MLX5_BFREGS_PER_UAR) * + (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) + MLX5_BF_OFFSET; +} + +static int alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg, + bool map_wc, bool fast_path) +{ + struct mlx5_bfreg_data *bfregs; + struct mlx5_uars_page *up; + struct list_head *head; + unsigned long *bitmap; + unsigned int *avail; + struct mutex *lock; /* pointer to right mutex */ + int dbi; + + bfregs = &mdev->priv.bfregs; + if (map_wc) { + head = &bfregs->wc_head.list; + lock = &bfregs->wc_head.lock; + } else { + head = &bfregs->reg_head.list; + lock = &bfregs->reg_head.lock; + } + mutex_lock(lock); + if (list_empty(head)) { + up = alloc_uars_page(mdev, map_wc); + if (IS_ERR(up)) { + mutex_unlock(lock); + return PTR_ERR(up); + } + list_add(&up->list, head); + } else { + up = list_entry(head->next, struct mlx5_uars_page, list); + kref_get(&up->ref_count); + } + if (fast_path) { + bitmap = up->fp_bitmap; + avail = &up->fp_avail; + } else { + bitmap = up->reg_bitmap; + avail = &up->reg_avail; + } + dbi = find_first_bit(bitmap, up->bfregs); + clear_bit(dbi, bitmap); + (*avail)--; + if (!(*avail)) + list_del(&up->list); + + bfreg->map = up->map + map_offset(mdev, dbi); + bfreg->up = up; + bfreg->wc = map_wc; + bfreg->index = up->index + dbi / MLX5_BFREGS_PER_UAR; + mutex_unlock(lock); + + return 0; +} + +int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg, + bool map_wc, bool fast_path) +{ + int err; + + err = alloc_bfreg(mdev, bfreg, map_wc, fast_path); + if (!err) + return 0; + + if (err == -EAGAIN && map_wc) + return alloc_bfreg(mdev, bfreg, false, fast_path); + + return err; +} +EXPORT_SYMBOL(mlx5_alloc_bfreg); + +static unsigned int addr_to_dbi_in_syspage(struct mlx5_core_dev *dev, + struct mlx5_uars_page *up, + struct mlx5_sq_bfreg *bfreg) +{ + unsigned int uar_idx; + unsigned int bfreg_idx; + unsigned int bf_reg_size; + + bf_reg_size = 1 << MLX5_CAP_GEN(dev, log_bf_reg_size); + + uar_idx = (bfreg->map - up->map) >> MLX5_ADAPTER_PAGE_SHIFT; + bfreg_idx = (((uintptr_t)bfreg->map % MLX5_ADAPTER_PAGE_SIZE) - MLX5_BF_OFFSET) / bf_reg_size; + + return uar_idx * MLX5_BFREGS_PER_UAR + bfreg_idx; +} + +void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg) +{ + struct mlx5_bfreg_data *bfregs; + struct mlx5_uars_page *up; + struct mutex *lock; /* pointer to right mutex */ + unsigned int dbi; + bool fp; + unsigned int *avail; + unsigned long *bitmap; + struct list_head *head; + + bfregs = &mdev->priv.bfregs; + if 
(bfreg->wc) { + head = &bfregs->wc_head.list; + lock = &bfregs->wc_head.lock; + } else { + head = &bfregs->reg_head.list; + lock = &bfregs->reg_head.lock; + } + up = bfreg->up; + dbi = addr_to_dbi_in_syspage(mdev, up, bfreg); + fp = (dbi % MLX5_BFREGS_PER_UAR) >= MLX5_NON_FP_BFREGS_PER_UAR; + if (fp) { + avail = &up->fp_avail; + bitmap = up->fp_bitmap; + } else { + avail = &up->reg_avail; + bitmap = up->reg_bitmap; + } + mutex_lock(lock); + (*avail)++; + set_bit(dbi, bitmap); + if (*avail == 1) + list_add_tail(&up->list, head); + + kref_put(&up->ref_count, up_rel_func); + mutex_unlock(lock); +} +EXPORT_SYMBOL(mlx5_free_bfreg); + +static int mlx5_get_pcie_dev_link_caps(struct pci_dev *pdev, + enum pci_bus_speed *speed, + enum pcie_link_width *width) +{ + u32 lnkcap1, lnkcap2; + int err1, err2; + +#define PCI_EXP_LNKCAP_MLW_SHIFT 4 /* start of MLW mask in link capabilities */ + + *speed = PCI_SPEED_UNKNOWN; + *width = PCIE_LNK_WIDTH_UNKNOWN; + + err1 = pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, + &lnkcap1); + err2 = pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP2, + &lnkcap2); + + if (err1 && err2) + return err1 ? err1 : err2; + + if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */ + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) + *speed = PCIE_SPEED_8_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + if (!err1 && lnkcap1) { + *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> + PCI_EXP_LNKCAP_MLW_SHIFT; + if (*speed == PCI_SPEED_UNKNOWN) { /* pre-r3.0 */ + if (lnkcap1 & PCI_EXP_LNKCAP_SLS_8_0GB) + *speed = PCIE_SPEED_8_0GT; + else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB) + *speed = PCIE_SPEED_5_0GT; + else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB) + *speed = PCIE_SPEED_2_5GT; + } + } + + return 0; +} + +void mlx5_pcie_print_link_status(struct mlx5_core_dev *dev) +{ + enum pcie_link_width width, width_cap; + enum pci_bus_speed speed, speed_cap; + int err; + +#define PCIE_SPEED_STR(speed) \ + (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \ + speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \ + speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : \ + "Unknown") + + if (mlx5_get_pcie_dev_link_caps(dev->pdev, &speed_cap, &width_cap)) + return; + + err = pcie_bandwidth_available(dev->pdev, NULL, &speed, &width); + if (err || speed == PCI_SPEED_UNKNOWN || + width == PCIE_LNK_WIDTH_UNKNOWN) + return; + + if (width != width_cap) + mlx5_core_warn( + dev, + "PCIe width is lower than device's capability\n"); + if (speed != speed_cap) + mlx5_core_warn( + dev, + "PCIe speed is slower than device's capability\n"); + + mlx5_core_info(dev, "PCIe link speed is %s, device supports %s\n", + PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap)); + mlx5_core_info(dev, "PCIe link width is x%d, device supports x%d\n", + width, width_cap); +} +EXPORT_SYMBOL(mlx5_pcie_print_link_status); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/vport.c new file mode 100644 index 0000000..054cb68 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -0,0 +1,1221 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include "mlx5_core.h" +#include "sf/sf.h" +#include "eswitch.h" + +/* Mutex to hold while enabling or disabling RoCE */ +static DEFINE_MUTEX(mlx5_roce_en_lock); + +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_state_in, in, other_vport, 1); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return 0; + + return MLX5_GET(query_vport_state_out, out, state); +} + +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 other_vport, u8 state) +{ + u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {}; + + MLX5_SET(modify_vport_state_in, in, opcode, + MLX5_CMD_OP_MODIFY_VPORT_STATE); + MLX5_SET(modify_vport_state_in, in, op_mod, opmod); + MLX5_SET(modify_vport_state_in, in, vport_number, vport); + MLX5_SET(modify_vport_state_in, in, other_vport, other_vport); + MLX5_SET(modify_vport_state_in, in, admin_state, state); + + return mlx5_cmd_exec_in(mdev, modify_vport_state, in); +} + +static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, + u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + return mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); +} + +int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 *min_inline) +{ + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; + int err; + + err = mlx5_query_nic_vport_context(mdev, vport, out); + if (!err) + *min_inline = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.min_wqe_inline_mode); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline); + +void mlx5_query_min_inline(struct mlx5_core_dev *mdev, + 
u8 *min_inline_mode) +{ + switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) { + case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT: + if (!mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode)) + break; + fallthrough; + case MLX5_CAP_INLINE_MODE_L2: + *min_inline_mode = MLX5_INLINE_MODE_L2; + break; + case MLX5_CAP_INLINE_MODE_NOT_REQUIRED: + *min_inline_mode = MLX5_INLINE_MODE_NONE; + break; + } +} +EXPORT_SYMBOL_GPL(mlx5_query_min_inline); + +int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 min_inline) +{ + u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {}; + void *nic_vport_ctx; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.min_inline, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + MLX5_SET(nic_vport_context, nic_vport_ctx, + min_wqe_inline_mode, min_inline); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + return mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); +} + +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, bool other, u8 *addr) +{ + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; + u8 *out_addr; + int err; + + out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context.permanent_address); + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(query_nic_vport_context_in, in, other_vport, other); + + err = mlx5_cmd_exec_inout(mdev, query_nic_vport_context, in, out); + if (!err) + ether_addr_copy(addr, &out_addr[2]); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address); + +int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr) +{ + return mlx5_query_nic_vport_mac_address(mdev, 0, false, addr); +} +EXPORT_SYMBOL_GPL(mlx5_query_mac_address); + +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, const u8 *addr) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + void *nic_vport_ctx; + u8 *perm_mac; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.permanent_address, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + perm_mac = MLX5_ADDR_OF(nic_vport_context, nic_vport_ctx, + permanent_address); + + ether_addr_copy(&perm_mac[2], addr); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); + + kvfree(in); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address); + +int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (!err) + *mtu = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.mtu); + + kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mtu); + +int mlx5_modify_nic_vport_mtu(struct 
mlx5_core_dev *mdev, u16 mtu) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, mtu); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mtu); + +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + u16 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0}; + void *nic_vport_ctx; + int max_list_size; + int req_list_size; + int out_sz; + void *out; + int err; + int i; + + req_list_size = *list_size; + + max_list_size = list_type == MLX5_NVPRT_LIST_TYPE_UC ? + 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(dev, log_max_current_mc_list); + + if (req_list_size > max_list_size) { + mlx5_core_warn(dev, "Requested list size (%d) > (%d) max_list_size\n", + req_list_size, max_list_size); + req_list_size = max_list_size; + } + + out_sz = MLX5_ST_SZ_BYTES(query_nic_vport_context_in) + + req_list_size * MLX5_ST_SZ_BYTES(mac_address_layout); + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, list_type); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context); + req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx, + allowed_list_size); + + *list_size = req_list_size; + for (i = 0; i < req_list_size; i++) { + u8 *mac_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]) + 2; + ether_addr_copy(addr_list[i], mac_addr); + } +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_list); + +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {}; + void *nic_vport_ctx; + int max_list_size; + int in_sz; + void *in; + int err; + int i; + + max_list_size = list_type == MLX5_NVPRT_LIST_TYPE_UC ? 
+ 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) : + 1 << MLX5_CAP_GEN(dev, log_max_current_mc_list); + + if (list_size > max_list_size) + return -ENOSPC; + + in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + list_size * MLX5_ST_SZ_BYTES(mac_address_layout); + + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, + field_select.addresses_list, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in, + nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_type, list_type); + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_size, list_size); + + for (i = 0; i < list_size; i++) { + u8 *curr_mac = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]) + 2; + ether_addr_copy(curr_mac, addr_list[i]); + } + + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_list); + +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, u32 vport, + unsigned long *vlans) +{ + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)]; + void *nic_vport_ctx; + int req_list_size; + int out_sz; + void *out; + int err; + int i; + + req_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list); + out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + req_list_size * MLX5_ST_SZ_BYTES(vlan_layout); + + memset(in, 0, sizeof(in)); + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, + MLX5_NVPRT_LIST_TYPE_VLAN); + MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + + if (vport) + MLX5_SET(query_nic_vport_context_in, in, other_vport, 1); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); + if (err) + goto out; + + nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out, + nic_vport_context); + req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx, + allowed_list_size); + + for (i = 0; i < req_list_size; i++) { + void *vlan_addr = MLX5_ADDR_OF(nic_vport_context, + nic_vport_ctx, + current_uc_mac_address[i]); + bitmap_set(vlans, MLX5_GET(vlan_layout, vlan_addr, vlan), 1); + } +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_vlans); + +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)]; + void *nic_vport_ctx; + int max_list_size; + int in_sz; + void *in; + int err; + int i; + + max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list); + + if (list_size > max_list_size) + return -ENOSPC; + + in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) + + list_size * MLX5_ST_SZ_BYTES(vlan_layout); + + memset(out, 0, sizeof(out)); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + MLX5_SET(modify_nic_vport_context_in, in, + field_select.addresses_list, 1); + + nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in, + nic_vport_context); + + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_type, MLX5_NVPRT_LIST_TYPE_VLAN); + MLX5_SET(nic_vport_context, nic_vport_ctx, + allowed_list_size, list_size); + + for (i = 0; i < list_size; i++) { + void *vlan_addr = MLX5_ADDR_OF(nic_vport_context, + 
nic_vport_ctx, + current_uc_mac_address[i]); + MLX5_SET(vlan_layout, vlan_addr, vlan, vlans[i]); + } + + err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans); + +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; + + *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.system_image_guid); +out: + kvfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid); + +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u32 vport, + u64 *node_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, vport, out); + + *node_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.node_guid); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); + +int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u16 vport, u64 node_guid) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *nic_vport_context; + void *in; + int err; + + if (!MLX5_CAP_GEN(mdev, vport_group_manager)) + return -EACCES; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.node_guid, 1); + MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1); + + nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in, + in, nic_vport_context); + MLX5_SET64(nic_vport_context, nic_vport_context, node_guid, node_guid); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); + + kvfree(in); + + return err; +} + +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out); + + *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.qkey_violation_counter); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr); + +int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 gid_index, + union ib_gid *gid) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_in); + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_out); + int is_group_manager; + void *out = NULL; + void *in = NULL; + union ib_gid *tmp; + int tbsz; + int nout; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + tbsz = mlx5_get_gid_table_len(MLX5_CAP_GEN(dev, gid_table_size)); + mlx5_core_dbg(dev, "vf_num %d, index %d, gid_table_size %d\n", + vf_num, gid_index, tbsz); + + if (gid_index > tbsz && gid_index != 0xffff) + return -EINVAL; + + if (gid_index == 0xffff) + nout = tbsz; + else + nout = 1; + + out_sz += nout * sizeof(*gid); + + in = kzalloc(in_sz, GFP_KERNEL); + out = kzalloc(out_sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + 
MLX5_SET(query_hca_vport_gid_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_GID); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_gid_in, in, vport_number, vf_num); + MLX5_SET(query_hca_vport_gid_in, in, other_vport, 1); + } else { + err = -EPERM; + goto out; + } + } + MLX5_SET(query_hca_vport_gid_in, in, gid_index, gid_index); + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_gid_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto out; + + tmp = out + MLX5_ST_SZ_BYTES(query_hca_vport_gid_out); + gid->global.subnet_prefix = tmp->global.subnet_prefix; + gid->global.interface_id = tmp->global.interface_id; + +out: + kfree(in); + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_gid); + +int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport, + u8 port_num, u16 vf_num, u16 pkey_index, + u16 *pkey) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_in); + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_out); + int is_group_manager; + void *out = NULL; + void *in = NULL; + void *pkarr; + int nout; + int tbsz; + int err; + int i; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + + tbsz = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)); + if (pkey_index > tbsz && pkey_index != 0xffff) + return -EINVAL; + + if (pkey_index == 0xffff) + nout = tbsz; + else + nout = 1; + + out_sz += nout * MLX5_ST_SZ_BYTES(pkey); + + in = kzalloc(in_sz, GFP_KERNEL); + out = kzalloc(out_sz, GFP_KERNEL); + if (!in || !out) { + err = -ENOMEM; + goto out; + } + + MLX5_SET(query_hca_vport_pkey_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_pkey_in, in, vport_number, vf_num); + MLX5_SET(query_hca_vport_pkey_in, in, other_vport, 1); + } else { + err = -EPERM; + goto out; + } + } + MLX5_SET(query_hca_vport_pkey_in, in, pkey_index, pkey_index); + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_pkey_in, in, port_num, port_num); + + err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz); + if (err) + goto out; + + pkarr = MLX5_ADDR_OF(query_hca_vport_pkey_out, out, pkey); + for (i = 0; i < nout; i++, pkey++, pkarr += MLX5_ST_SZ_BYTES(pkey)) + *pkey = MLX5_GET_PR(pkey, pkarr, pkey); + +out: + kfree(in); + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_pkey); + +int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + u16 vf_num, + struct mlx5_hca_vport_context *rep) +{ + int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); + int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {}; + int is_group_manager; + void *out; + void *ctx; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + + out = kzalloc(out_sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + MLX5_SET(query_hca_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT); + + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_hca_vport_context_in, in, other_vport, 1); + MLX5_SET(query_hca_vport_context_in, in, vport_number, vf_num); + } else { + err = -EPERM; + goto ex; + } + } + + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_hca_vport_context_in, in, port_num, port_num); + + err = mlx5_cmd_exec_inout(dev, query_hca_vport_context, in, out); + if (err) + goto ex; + + ctx = MLX5_ADDR_OF(query_hca_vport_context_out, out, hca_vport_context); + rep->field_select = MLX5_GET_PR(hca_vport_context, ctx, field_select); + 
rep->sm_virt_aware = MLX5_GET_PR(hca_vport_context, ctx, sm_virt_aware); + rep->has_smi = MLX5_GET_PR(hca_vport_context, ctx, has_smi); + rep->has_raw = MLX5_GET_PR(hca_vport_context, ctx, has_raw); + rep->policy = MLX5_GET_PR(hca_vport_context, ctx, vport_state_policy); + rep->phys_state = MLX5_GET_PR(hca_vport_context, ctx, + port_physical_state); + rep->vport_state = MLX5_GET_PR(hca_vport_context, ctx, vport_state); + rep->port_physical_state = MLX5_GET_PR(hca_vport_context, ctx, + port_physical_state); + rep->port_guid = MLX5_GET64_PR(hca_vport_context, ctx, port_guid); + rep->node_guid = MLX5_GET64_PR(hca_vport_context, ctx, node_guid); + rep->cap_mask1 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask1); + rep->cap_mask1_perm = MLX5_GET_PR(hca_vport_context, ctx, + cap_mask1_field_select); + rep->cap_mask2 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask2); + rep->cap_mask2_perm = MLX5_GET_PR(hca_vport_context, ctx, + cap_mask2_field_select); + rep->lid = MLX5_GET_PR(hca_vport_context, ctx, lid); + rep->init_type_reply = MLX5_GET_PR(hca_vport_context, ctx, + init_type_reply); + rep->lmc = MLX5_GET_PR(hca_vport_context, ctx, lmc); + rep->subnet_timeout = MLX5_GET_PR(hca_vport_context, ctx, + subnet_timeout); + rep->sm_lid = MLX5_GET_PR(hca_vport_context, ctx, sm_lid); + rep->sm_sl = MLX5_GET_PR(hca_vport_context, ctx, sm_sl); + rep->qkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx, + qkey_violation_counter); + rep->pkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx, + pkey_violation_counter); + rep->grh_required = MLX5_GET_PR(hca_vport_context, ctx, grh_required); + rep->sys_image_guid = MLX5_GET64_PR(hca_vport_context, ctx, + system_image_guid); + +ex: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_context); + +int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, + u64 *sys_image_guid) +{ + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep); + if (!err) + *sys_image_guid = rep->sys_image_guid; + + kfree(rep); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_system_image_guid); + +int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, + u64 *node_guid) +{ + struct mlx5_hca_vport_context *rep; + int err; + + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (!rep) + return -ENOMEM; + + err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep); + if (!err) + *node_guid = rep->node_guid; + + kfree(rep); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_node_guid); + +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u16 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, vport, out); + if (err) + goto out; + + *promisc_uc = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_uc); + *promisc_mc = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_mc); + *promisc_all = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.promisc_all); + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_promisc); + +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int 
err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.promisc, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_uc, promisc_uc); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_mc, promisc_mc); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.promisc_all, promisc_all); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); + + kvfree(in); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); + +enum { + UC_LOCAL_LB, + MC_LOCAL_LB +}; + +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + bool disable_local_lb; + void *in; + int err; + + if (!MLX5_CAP_GEN(mdev, disable_local_lb_mc) && + !MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + return 0; + + mdev->local_lb.driver_state = enable; + disable_local_lb = mdev->local_lb.user_force_disable || !enable; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_mc_local_lb, disable_local_lb); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_uc_local_lb, disable_local_lb); + + if (MLX5_CAP_GEN(mdev, disable_local_lb_mc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_mc_local_lb, 1); + + if (MLX5_CAP_GEN(mdev, disable_local_lb_uc)) + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_uc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); + + if (!err) + mlx5_core_dbg(mdev, "%s local_lb\n", + enable ? 
"enable" : "disable"); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_update_local_lb); + +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int value; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; + + value = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_mc_local_lb) << MC_LOCAL_LB; + + value |= MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_uc_local_lb) << UC_LOCAL_LB; + + *status = !value; + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_query_local_lb); + +enum mlx5_vport_roce_state { + MLX5_VPORT_ROCE_DISABLED = 0, + MLX5_VPORT_ROCE_ENABLED = 1, +}; + +static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, + enum mlx5_vport_roce_state state) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en, + state); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in); + + kvfree(in); + + return err; +} + +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev) +{ + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (!mdev->roce.roce_en) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); + + if (!err) + mdev->roce.roce_en++; + mutex_unlock(&mlx5_roce_en_lock); + + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce); + +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) +{ + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (mdev->roce.roce_en) { + mdev->roce.roce_en--; + if (mdev->roce.roce_en == 0) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); + + if (err) + mdev->roce.roce_en++; + } + mutex_unlock(&mlx5_roce_en_lock); + return err; +} +EXPORT_SYMBOL(mlx5_nic_vport_disable_roce); + +int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, + int vf, u8 port_num, void *out) +{ + int in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in); + int is_group_manager; + void *in; + int err; + + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = kvzalloc(in_sz, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + return err; + } + + MLX5_SET(query_vport_counter_in, in, opcode, + MLX5_CMD_OP_QUERY_VPORT_COUNTER); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(query_vport_counter_in, in, other_vport, 1); + MLX5_SET(query_vport_counter_in, in, vport_number, vf + 1); + } else { + err = -EPERM; + goto free; + } + } + if (MLX5_CAP_GEN(dev, num_ports) == 2) + MLX5_SET(query_vport_counter_in, in, port_num, port_num); + + err = mlx5_cmd_exec_inout(dev, query_vport_counter, in, out); +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_query_vport_counter); + +int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, + u8 other_vport, u64 *rx_discard_vport_down, + u64 *tx_discard_vport_down) +{ + u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; + int err; + + MLX5_SET(query_vnic_env_in, in, opcode, + MLX5_CMD_OP_QUERY_VNIC_ENV); + MLX5_SET(query_vnic_env_in, in, op_mod, 
0); + MLX5_SET(query_vnic_env_in, in, vport_number, vport); + MLX5_SET(query_vnic_env_in, in, other_vport, other_vport); + + err = mlx5_cmd_exec_inout(mdev, query_vnic_env, in, out); + if (err) + return err; + + *rx_discard_vport_down = MLX5_GET64(query_vnic_env_out, out, + vport_env.receive_discard_vport_down); + *tx_discard_vport_down = MLX5_GET64(query_vnic_env_out, out, + vport_env.transmit_discard_vport_down); + return 0; +} + +int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, + u8 other_vport, u8 port_num, + int vf, + struct mlx5_hca_vport_context *req) +{ + int in_sz = MLX5_ST_SZ_BYTES(modify_hca_vport_context_in); + int is_group_manager; + void *ctx; + void *in; + int err; + + mlx5_core_dbg(dev, "vf %d\n", vf); + is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager); + in = kzalloc(in_sz, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_hca_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT); + if (other_vport) { + if (is_group_manager) { + MLX5_SET(modify_hca_vport_context_in, in, other_vport, 1); + MLX5_SET(modify_hca_vport_context_in, in, vport_number, vf); + } else { + err = -EPERM; + goto ex; + } + } + + if (MLX5_CAP_GEN(dev, num_ports) > 1) + MLX5_SET(modify_hca_vport_context_in, in, port_num, port_num); + + ctx = MLX5_ADDR_OF(modify_hca_vport_context_in, in, hca_vport_context); + MLX5_SET(hca_vport_context, ctx, field_select, req->field_select); + if (req->field_select & MLX5_HCA_VPORT_SEL_STATE_POLICY) + MLX5_SET(hca_vport_context, ctx, vport_state_policy, + req->policy); + if (req->field_select & MLX5_HCA_VPORT_SEL_PORT_GUID) + MLX5_SET64(hca_vport_context, ctx, port_guid, req->port_guid); + if (req->field_select & MLX5_HCA_VPORT_SEL_NODE_GUID) + MLX5_SET64(hca_vport_context, ctx, node_guid, req->node_guid); + MLX5_SET(hca_vport_context, ctx, cap_mask1, req->cap_mask1); + MLX5_SET(hca_vport_context, ctx, cap_mask1_field_select, + req->cap_mask1_perm); + err = mlx5_cmd_exec_in(dev, modify_hca_vport_context, in); +ex: + kfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_modify_hca_vport_context); + +int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, + struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + err = mlx5_nic_vport_enable_roce(port_mdev); + if (err) + goto free; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, + MLX5_CAP_GEN(master_mdev, vhca_id)); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliation_criteria, + MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria)); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(port_mdev, modify_nic_vport_context, in); + if (err) + mlx5_nic_vport_disable_roce(port_mdev); + +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_affiliate_multiport); + +int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, 0); + MLX5_SET(modify_nic_vport_context_in, in, + 
nic_vport_context.affiliation_criteria, 0); + MLX5_SET(modify_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); + + err = mlx5_cmd_exec_in(port_mdev, modify_nic_vport_context, in); + if (!err) + mlx5_nic_vport_disable_roce(port_mdev); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_unaffiliate_multiport); + +u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev) +{ + int port_type_cap = MLX5_CAP_GEN(mdev, port_type); + u64 tmp; + int err; + + if (mdev->sys_image_guid) + return mdev->sys_image_guid; + + if (port_type_cap == MLX5_CAP_PORT_TYPE_ETH) + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + else + err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); + + mdev->sys_image_guid = err ? 0 : tmp; + + return mdev->sys_image_guid; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid); + +int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out) +{ + u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); + u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)] = {}; + + MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + MLX5_SET(query_hca_cap_in, in, op_mod, opmod); + MLX5_SET(query_hca_cap_in, in, function_id, function_id); + MLX5_SET(query_hca_cap_in, in, other_function, true); + return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.c new file mode 100644 index 0000000..3091dd0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "wq.h" +#include "mlx5_core.h" + +int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_cyc *wq, + struct mlx5_wq_ctrl *wq_ctrl) +{ + u8 log_wq_stride = MLX5_GET(wq, wqc, log_wq_stride); + u8 log_wq_sz = MLX5_GET(wq, wqc, log_wq_sz); + struct mlx5_frag_buf_ctrl *fbc = &wq->fbc; + int err; + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + wq->db = wq_ctrl->db.db; + + err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_wq_sz, log_wq_stride), + &wq_ctrl->buf, param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err); + goto err_db_free; + } + + mlx5_init_fbc(wq_ctrl->buf.frags, log_wq_stride, log_wq_sz, fbc); + wq->sz = mlx5_wq_cyc_get_size(wq); + + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +void mlx5_wq_cyc_wqe_dump(struct mlx5_wq_cyc *wq, u16 ix, u8 nstrides) +{ + size_t len; + void *wqe; + + if (!net_ratelimit()) + return; + + nstrides = max_t(u8, nstrides, 1); + + len = nstrides << wq->fbc.log_stride; + wqe = mlx5_wq_cyc_get_wqe(wq, ix); + + pr_info("WQE DUMP: WQ size %d WQ cur size %d, WQE index 0x%x, len: %zu\n", + mlx5_wq_cyc_get_size(wq), wq->cur_sz, ix, len); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, wqe, len, false); +} + +void mlx5_wq_cyc_reset(struct mlx5_wq_cyc *wq) +{ + wq->wqe_ctr = 0; + wq->cur_sz = 0; + mlx5_wq_cyc_update_db_record(wq); +} + +int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *qpc, struct mlx5_wq_qp *wq, + struct mlx5_wq_ctrl *wq_ctrl) +{ + u8 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride) + 4; + u8 log_rq_sz = MLX5_GET(qpc, qpc, log_rq_size); + u8 log_sq_stride = ilog2(MLX5_SEND_WQE_BB); + u8 log_sq_sz = MLX5_GET(qpc, qpc, log_sq_size); + + u32 rq_byte_size; + int err; + + + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + err = mlx5_frag_buf_alloc_node(mdev, + wq_get_byte_sz(log_rq_sz, log_rq_stride) + + wq_get_byte_sz(log_sq_sz, log_sq_stride), + &wq_ctrl->buf, param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err); + goto err_db_free; + } + + mlx5_init_fbc(wq_ctrl->buf.frags, log_rq_stride, log_rq_sz, &wq->rq.fbc); + + rq_byte_size = wq_get_byte_sz(log_rq_sz, log_rq_stride); + + if (rq_byte_size < PAGE_SIZE) { + /* SQ starts within the same page of the RQ */ + u16 sq_strides_offset = rq_byte_size / MLX5_SEND_WQE_BB; + + mlx5_init_fbc_offset(wq_ctrl->buf.frags, + log_sq_stride, log_sq_sz, sq_strides_offset, + &wq->sq.fbc); + } else { + u16 rq_npages = rq_byte_size >> PAGE_SHIFT; + + mlx5_init_fbc(wq_ctrl->buf.frags + rq_npages, + log_sq_stride, log_sq_sz, &wq->sq.fbc); + } + + wq->rq.db = &wq_ctrl->db.db[MLX5_RCV_DBR]; + wq->sq.db = &wq_ctrl->db.db[MLX5_SND_DBR]; + + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *cqc, struct mlx5_cqwq *wq, + struct mlx5_wq_ctrl *wq_ctrl) +{ + /* CQE_STRIDE_128 and CQE_STRIDE_128_PAD both mean 128B stride */ + u8 log_wq_stride = MLX5_GET(cqc, cqc, cqe_sz) == CQE_STRIDE_64 ? 
6 : 7; + u8 log_wq_sz = MLX5_GET(cqc, cqc, log_cq_size); + int err; + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + wq->db = wq_ctrl->db.db; + + err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_wq_sz, log_wq_stride), + &wq_ctrl->buf, + param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", + err); + goto err_db_free; + } + + mlx5_init_fbc(wq_ctrl->buf.frags, log_wq_stride, log_wq_sz, &wq->fbc); + + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +static void mlx5_wq_ll_init_list(struct mlx5_wq_ll *wq) +{ + struct mlx5_wqe_srq_next_seg *next_seg; + int i; + + for (i = 0; i < wq->fbc.sz_m1; i++) { + next_seg = mlx5_wq_ll_get_wqe(wq, i); + next_seg->next_wqe_index = cpu_to_be16(i + 1); + } + next_seg = mlx5_wq_ll_get_wqe(wq, i); + wq->tail_next = &next_seg->next_wqe_index; +} + +int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_ll *wq, + struct mlx5_wq_ctrl *wq_ctrl) +{ + u8 log_wq_stride = MLX5_GET(wq, wqc, log_wq_stride); + u8 log_wq_sz = MLX5_GET(wq, wqc, log_wq_sz); + struct mlx5_frag_buf_ctrl *fbc = &wq->fbc; + int err; + + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err); + return err; + } + + wq->db = wq_ctrl->db.db; + + err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_wq_sz, log_wq_stride), + &wq_ctrl->buf, param->buf_numa_node); + if (err) { + mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err); + goto err_db_free; + } + + mlx5_init_fbc(wq_ctrl->buf.frags, log_wq_stride, log_wq_sz, fbc); + + mlx5_wq_ll_init_list(wq); + wq_ctrl->mdev = mdev; + + return 0; + +err_db_free: + mlx5_db_free(mdev, &wq_ctrl->db); + + return err; +} + +void mlx5_wq_ll_reset(struct mlx5_wq_ll *wq) +{ + wq->head = 0; + wq->wqe_ctr = 0; + wq->cur_sz = 0; + mlx5_wq_ll_init_list(wq); + mlx5_wq_ll_update_db_record(wq); +} + +void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl) +{ + mlx5_frag_buf_free(wq_ctrl->mdev, &wq_ctrl->buf); + mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db); +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.h new file mode 100644 index 0000000..e5c4dcd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlx5/core/wq.h @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_WQ_H__ +#define __MLX5_WQ_H__ + +#include +#include +#include + +struct mlx5_wq_param { + int buf_numa_node; + int db_numa_node; +}; + +struct mlx5_wq_ctrl { + struct mlx5_core_dev *mdev; + struct mlx5_frag_buf buf; + struct mlx5_db db; +}; + +struct mlx5_wq_cyc { + struct mlx5_frag_buf_ctrl fbc; + __be32 *db; + u16 sz; + u16 wqe_ctr; + u16 cur_sz; +}; + +struct mlx5_wq_qp { + struct mlx5_wq_cyc rq; + struct mlx5_wq_cyc sq; +}; + +struct mlx5_cqwq { + struct mlx5_frag_buf_ctrl fbc; + __be32 *db; + u32 cc; /* consumer counter */ +}; + +struct mlx5_wq_ll { + struct mlx5_frag_buf_ctrl fbc; + __be32 *db; + __be16 *tail_next; + u16 head; + u16 wqe_ctr; + u16 cur_sz; +}; + +int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_cyc *wq, + struct mlx5_wq_ctrl *wq_ctrl); +void mlx5_wq_cyc_wqe_dump(struct mlx5_wq_cyc *wq, u16 ix, u8 nstrides); +void mlx5_wq_cyc_reset(struct mlx5_wq_cyc *wq); + +int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *qpc, struct mlx5_wq_qp *wq, + struct mlx5_wq_ctrl *wq_ctrl); + +int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *cqc, struct mlx5_cqwq *wq, + struct mlx5_wq_ctrl *wq_ctrl); + +int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, + void *wqc, struct mlx5_wq_ll *wq, + struct mlx5_wq_ctrl *wq_ctrl); +void mlx5_wq_ll_reset(struct mlx5_wq_ll *wq); + +void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl); + +static inline u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq) +{ + return (u32)wq->fbc.sz_m1 + 1; +} + +static inline int mlx5_wq_cyc_is_full(struct mlx5_wq_cyc *wq) +{ + return wq->cur_sz == wq->sz; +} + +static inline int mlx5_wq_cyc_missing(struct mlx5_wq_cyc *wq) +{ + return wq->sz - wq->cur_sz; +} + +static inline int mlx5_wq_cyc_is_empty(struct mlx5_wq_cyc *wq) +{ + return !wq->cur_sz; +} + +static inline void mlx5_wq_cyc_push(struct mlx5_wq_cyc *wq) +{ + wq->wqe_ctr++; + wq->cur_sz++; +} + +static inline void mlx5_wq_cyc_push_n(struct mlx5_wq_cyc *wq, u8 n) +{ + wq->wqe_ctr += n; + wq->cur_sz += n; +} + +static inline void mlx5_wq_cyc_pop(struct mlx5_wq_cyc *wq) +{ + wq->cur_sz--; +} + +static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq) +{ + *wq->db = cpu_to_be32(wq->wqe_ctr); +} + +static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr) +{ + return ctr & wq->fbc.sz_m1; +} + +static inline u16 mlx5_wq_cyc_get_head(struct mlx5_wq_cyc *wq) +{ + return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr); +} + +static inline u16 mlx5_wq_cyc_get_tail(struct mlx5_wq_cyc *wq) +{ + return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr - wq->cur_sz); +} + +static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix) +{ + return mlx5_frag_buf_get_wqe(&wq->fbc, ix); +} + +static inline u16 mlx5_wq_cyc_get_contig_wqebbs(struct mlx5_wq_cyc *wq, u16 ix) +{ + return mlx5_frag_buf_get_idx_last_contig_stride(&wq->fbc, ix) - ix + 1; +} + +static inline int mlx5_wq_cyc_cc_bigger(u16 cc1, u16 cc2) 
+{ + int equal = (cc1 == cc2); + int smaller = 0x8000 & (cc1 - cc2); + + return !equal && !smaller; +} + +static inline u16 mlx5_wq_cyc_get_counter(struct mlx5_wq_cyc *wq) +{ + return wq->wqe_ctr; +} + +static inline u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq) +{ + return wq->fbc.sz_m1 + 1; +} + +static inline u8 mlx5_cqwq_get_log_stride_size(struct mlx5_cqwq *wq) +{ + return wq->fbc.log_stride; +} + +static inline u32 mlx5_cqwq_ctr2ix(struct mlx5_cqwq *wq, u32 ctr) +{ + return ctr & wq->fbc.sz_m1; +} + +static inline u32 mlx5_cqwq_get_ci(struct mlx5_cqwq *wq) +{ + return mlx5_cqwq_ctr2ix(wq, wq->cc); +} + +static inline struct mlx5_cqe64 *mlx5_cqwq_get_wqe(struct mlx5_cqwq *wq, u32 ix) +{ + struct mlx5_cqe64 *cqe = mlx5_frag_buf_get_wqe(&wq->fbc, ix); + + /* For 128B CQEs the data is in the last 64B */ + cqe += wq->fbc.log_stride == 7; + + return cqe; +} + +static inline u32 mlx5_cqwq_get_ctr_wrap_cnt(struct mlx5_cqwq *wq, u32 ctr) +{ + return ctr >> wq->fbc.log_sz; +} + +static inline u32 mlx5_cqwq_get_wrap_cnt(struct mlx5_cqwq *wq) +{ + return mlx5_cqwq_get_ctr_wrap_cnt(wq, wq->cc); +} + +static inline void mlx5_cqwq_pop(struct mlx5_cqwq *wq) +{ + wq->cc++; +} + +static inline void mlx5_cqwq_update_db_record(struct mlx5_cqwq *wq) +{ + *wq->db = cpu_to_be32(wq->cc & 0xffffff); +} + +static inline struct mlx5_cqe64 *mlx5_cqwq_get_cqe(struct mlx5_cqwq *wq) +{ + u32 ci = mlx5_cqwq_get_ci(wq); + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, ci); + u8 cqe_ownership_bit = cqe->op_own & MLX5_CQE_OWNER_MASK; + u8 sw_ownership_val = mlx5_cqwq_get_wrap_cnt(wq) & 1; + + if (cqe_ownership_bit != sw_ownership_val) + return NULL; + + /* ensure cqe content is read after cqe ownership bit */ + dma_rmb(); + + return cqe; +} + +static inline u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq) +{ + return (u32)wq->fbc.sz_m1 + 1; +} + +static inline int mlx5_wq_ll_is_full(struct mlx5_wq_ll *wq) +{ + return wq->cur_sz == wq->fbc.sz_m1; +} + +static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq) +{ + return !wq->cur_sz; +} + +static inline int mlx5_wq_ll_missing(struct mlx5_wq_ll *wq) +{ + return wq->fbc.sz_m1 - wq->cur_sz; +} + +static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix) +{ + return mlx5_frag_buf_get_wqe(&wq->fbc, ix); +} + +static inline u16 mlx5_wq_ll_get_wqe_next_ix(struct mlx5_wq_ll *wq, u16 ix) +{ + struct mlx5_wqe_srq_next_seg *wqe = mlx5_wq_ll_get_wqe(wq, ix); + + return be16_to_cpu(wqe->next_wqe_index); +} + +static inline void mlx5_wq_ll_push(struct mlx5_wq_ll *wq, u16 head_next) +{ + wq->head = head_next; + wq->wqe_ctr++; + wq->cur_sz++; +} + +static inline void mlx5_wq_ll_pop(struct mlx5_wq_ll *wq, __be16 ix, + __be16 *next_tail_next) +{ + *wq->tail_next = ix; + wq->tail_next = next_tail_next; + wq->cur_sz--; +} + +static inline void mlx5_wq_ll_update_db_record(struct mlx5_wq_ll *wq) +{ + *wq->db = cpu_to_be32(wq->wqe_ctr); +} + +static inline u16 mlx5_wq_ll_get_head(struct mlx5_wq_ll *wq) +{ + return wq->head; +} + +static inline u16 mlx5_wq_ll_get_counter(struct mlx5_wq_ll *wq) +{ + return wq->wqe_ctr; +} + +#endif /* __MLX5_WQ_H__ */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Kconfig new file mode 100644 index 0000000..c339f3c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Mellanox firmware flash library configuration +# + +config MLXFW + tristate "Mellanox 
Technologies firmware flash module" + help + This driver supports Mellanox Technologies Firmware + flashing common logic. + + To compile this driver as a module, choose M here: the + module will be called mlxfw. + select XZ_DEC + select NET_DEVLINK diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Makefile new file mode 100644 index 0000000..36007cd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_MLXFW) += mlxfw.o +mlxfw-objs := mlxfw_fsm.o mlxfw_mfa2_tlv_multi.o mlxfw_mfa2.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h new file mode 100644 index 0000000..e6475ea --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ +/* Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved */ + +#ifndef _MLXFW_H +#define _MLXFW_H + +#include +#include +#include +#include + +struct mlxfw_dev { + const struct mlxfw_dev_ops *ops; + const char *psid; + u16 psid_size; + struct devlink *devlink; +}; + +static inline +struct device *mlxfw_dev_dev(struct mlxfw_dev *mlxfw_dev) +{ + return devlink_to_dev(mlxfw_dev->devlink); +} + +#define MLXFW_PRFX "mlxfw: " + +#define mlxfw_info(mlxfw_dev, fmt, ...) \ + dev_info(mlxfw_dev_dev(mlxfw_dev), MLXFW_PRFX fmt, ## __VA_ARGS__) +#define mlxfw_err(mlxfw_dev, fmt, ...) \ + dev_err(mlxfw_dev_dev(mlxfw_dev), MLXFW_PRFX fmt, ## __VA_ARGS__) +#define mlxfw_dbg(mlxfw_dev, fmt, ...) \ + dev_dbg(mlxfw_dev_dev(mlxfw_dev), MLXFW_PRFX fmt, ## __VA_ARGS__) + +enum mlxfw_fsm_state { + MLXFW_FSM_STATE_IDLE, + MLXFW_FSM_STATE_LOCKED, + MLXFW_FSM_STATE_INITIALIZE, + MLXFW_FSM_STATE_DOWNLOAD, + MLXFW_FSM_STATE_VERIFY, + MLXFW_FSM_STATE_APPLY, + MLXFW_FSM_STATE_ACTIVATE, +}; + +enum mlxfw_fsm_state_err { + MLXFW_FSM_STATE_ERR_OK, + MLXFW_FSM_STATE_ERR_ERROR, + MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR, + MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE, + MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY, + MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED, + MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED, + MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE, + MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT, + MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET, + MLXFW_FSM_STATE_ERR_MAX, +}; + +enum mlxfw_fsm_reactivate_status { + MLXFW_FSM_REACTIVATE_STATUS_OK, + MLXFW_FSM_REACTIVATE_STATUS_BUSY, + MLXFW_FSM_REACTIVATE_STATUS_PROHIBITED_FW_VER_ERR, + MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_COPY_FAILED, + MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_ERASE_FAILED, + MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_RESTORE_FAILED, + MLXFW_FSM_REACTIVATE_STATUS_CANDIDATE_FW_DEACTIVATION_FAILED, + MLXFW_FSM_REACTIVATE_STATUS_FW_ALREADY_ACTIVATED, + MLXFW_FSM_REACTIVATE_STATUS_ERR_DEVICE_RESET_REQUIRED, + MLXFW_FSM_REACTIVATE_STATUS_ERR_FW_PROGRAMMING_NEEDED, + MLXFW_FSM_REACTIVATE_STATUS_MAX, +}; + +struct mlxfw_dev_ops { + int (*component_query)(struct mlxfw_dev *mlxfw_dev, u16 component_index, + u32 *p_max_size, u8 *p_align_bits, + u16 *p_max_write_size); + + int (*fsm_lock)(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle); + + int (*fsm_component_update)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index, u32 component_size); + + int (*fsm_block_download)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u8 *data, u16 size, u32 
offset); + + int (*fsm_component_verify)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + u16 component_index); + + int (*fsm_activate)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); + + int (*fsm_reactivate)(struct mlxfw_dev *mlxfw_dev, u8 *status); + + int (*fsm_query_state)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + enum mlxfw_fsm_state *fsm_state, + enum mlxfw_fsm_state_err *fsm_state_err); + + void (*fsm_cancel)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); + + void (*fsm_release)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); +}; + +#if IS_REACHABLE(CONFIG_MLXFW) +int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, + const struct firmware *firmware, + struct netlink_ext_ack *extack); +#else +static inline +int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, + const struct firmware *firmware, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} +#endif + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c new file mode 100644 index 0000000..6f7a1ef --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "mlxfw: " fmt + +#include +#include +#include + +#include "mlxfw.h" +#include "mlxfw_mfa2.h" + +#define MLXFW_FSM_STATE_WAIT_CYCLE_MS 200 +#define MLXFW_FSM_STATE_WAIT_TIMEOUT_MS 30000 +#define MLXFW_FSM_STATE_WAIT_ROUNDS \ + (MLXFW_FSM_STATE_WAIT_TIMEOUT_MS / MLXFW_FSM_STATE_WAIT_CYCLE_MS) +#define MLXFW_FSM_MAX_COMPONENT_SIZE (10 * (1 << 20)) + +static const int mlxfw_fsm_state_errno[] = { + [MLXFW_FSM_STATE_ERR_ERROR] = -EIO, + [MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR] = -EBADMSG, + [MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE] = -ENOENT, + [MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY] = -ENOKEY, + [MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED] = -EACCES, + [MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED] = -EKEYREVOKED, + [MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE] = -EKEYREJECTED, + [MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT] = -ENOEXEC, + [MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET] = -EBUSY, + [MLXFW_FSM_STATE_ERR_MAX] = -EINVAL +}; + +#define MLXFW_ERR_PRFX "Firmware flash failed: " +#define MLXFW_ERR_MSG(fwdev, extack, msg, err) do { \ + mlxfw_err(fwdev, "%s, err (%d)\n", MLXFW_ERR_PRFX msg, err); \ + NL_SET_ERR_MSG_MOD(extack, MLXFW_ERR_PRFX msg); \ +} while (0) + +static int mlxfw_fsm_state_err(struct mlxfw_dev *mlxfw_dev, + struct netlink_ext_ack *extack, + enum mlxfw_fsm_state_err err) +{ + enum mlxfw_fsm_state_err fsm_state_err; + + fsm_state_err = min_t(enum mlxfw_fsm_state_err, err, + MLXFW_FSM_STATE_ERR_MAX); + + switch (fsm_state_err) { + case MLXFW_FSM_STATE_ERR_ERROR: + MLXFW_ERR_MSG(mlxfw_dev, extack, "general error", err); + break; + case MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR: + MLXFW_ERR_MSG(mlxfw_dev, extack, "component hash mismatch", err); + break; + case MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE: + MLXFW_ERR_MSG(mlxfw_dev, extack, "component not applicable", err); + break; + case MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY: + MLXFW_ERR_MSG(mlxfw_dev, extack, "unknown key", err); + break; + case MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED: + MLXFW_ERR_MSG(mlxfw_dev, extack, "authentication failed", err); + break; + case MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED: + MLXFW_ERR_MSG(mlxfw_dev, extack, "component was not signed", 
err); + break; + case MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE: + MLXFW_ERR_MSG(mlxfw_dev, extack, "key not applicable", err); + break; + case MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT: + MLXFW_ERR_MSG(mlxfw_dev, extack, "bad format", err); + break; + case MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET: + MLXFW_ERR_MSG(mlxfw_dev, extack, "pending reset", err); + break; + case MLXFW_FSM_STATE_ERR_OK: + case MLXFW_FSM_STATE_ERR_MAX: + MLXFW_ERR_MSG(mlxfw_dev, extack, "unknown error", err); + break; + } + + return mlxfw_fsm_state_errno[fsm_state_err]; +}; + +static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + enum mlxfw_fsm_state fsm_state, + struct netlink_ext_ack *extack) +{ + enum mlxfw_fsm_state_err fsm_state_err; + enum mlxfw_fsm_state curr_fsm_state; + int times; + int err; + + times = MLXFW_FSM_STATE_WAIT_ROUNDS; +retry: + err = mlxfw_dev->ops->fsm_query_state(mlxfw_dev, fwhandle, + &curr_fsm_state, &fsm_state_err); + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, "FSM state query failed", err); + return err; + } + + if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) + return mlxfw_fsm_state_err(mlxfw_dev, extack, fsm_state_err); + + if (curr_fsm_state != fsm_state) { + if (--times == 0) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Timeout reached on FSM state change", -ETIMEDOUT); + return -ETIMEDOUT; + } + msleep(MLXFW_FSM_STATE_WAIT_CYCLE_MS); + goto retry; + } + return 0; +} + +static int +mlxfw_fsm_reactivate_err(struct mlxfw_dev *mlxfw_dev, + struct netlink_ext_ack *extack, u8 err) +{ + enum mlxfw_fsm_reactivate_status status; + +#define MXFW_REACT_PRFX "Reactivate FSM: " +#define MLXFW_REACT_ERR(msg, err) \ + MLXFW_ERR_MSG(mlxfw_dev, extack, MXFW_REACT_PRFX msg, err) + + status = min_t(enum mlxfw_fsm_reactivate_status, err, + MLXFW_FSM_REACTIVATE_STATUS_MAX); + + switch (status) { + case MLXFW_FSM_REACTIVATE_STATUS_BUSY: + MLXFW_REACT_ERR("busy", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_PROHIBITED_FW_VER_ERR: + MLXFW_REACT_ERR("prohibited fw ver", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_COPY_FAILED: + MLXFW_REACT_ERR("first page copy failed", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_ERASE_FAILED: + MLXFW_REACT_ERR("first page erase failed", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_FIRST_PAGE_RESTORE_FAILED: + MLXFW_REACT_ERR("first page restore failed", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_CANDIDATE_FW_DEACTIVATION_FAILED: + MLXFW_REACT_ERR("candidate fw deactivation failed", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_ERR_DEVICE_RESET_REQUIRED: + MLXFW_REACT_ERR("device reset required", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_ERR_FW_PROGRAMMING_NEEDED: + MLXFW_REACT_ERR("fw programming needed", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_FW_ALREADY_ACTIVATED: + MLXFW_REACT_ERR("fw already activated", err); + break; + case MLXFW_FSM_REACTIVATE_STATUS_OK: + case MLXFW_FSM_REACTIVATE_STATUS_MAX: + MLXFW_REACT_ERR("unexpected error", err); + break; + } + return -EREMOTEIO; +}; + +static int mlxfw_fsm_reactivate(struct mlxfw_dev *mlxfw_dev, + struct netlink_ext_ack *extack, + bool *supported) +{ + u8 status; + int err; + + if (!mlxfw_dev->ops->fsm_reactivate) + return 0; + + err = mlxfw_dev->ops->fsm_reactivate(mlxfw_dev, &status); + if (err == -EOPNOTSUPP) { + *supported = false; + return 0; + } + + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not reactivate firmware flash", err); + return err; + } + + if (status == MLXFW_FSM_REACTIVATE_STATUS_OK || + status == 
MLXFW_FSM_REACTIVATE_STATUS_FW_ALREADY_ACTIVATED) + return 0; + + return mlxfw_fsm_reactivate_err(mlxfw_dev, extack, status); +} + +static void mlxfw_status_notify(struct mlxfw_dev *mlxfw_dev, + const char *msg, const char *comp_name, + u32 done_bytes, u32 total_bytes) +{ + devlink_flash_update_status_notify(mlxfw_dev->devlink, msg, comp_name, + done_bytes, total_bytes); +} + +#define MLXFW_ALIGN_DOWN(x, align_bits) ((x) & ~((1 << (align_bits)) - 1)) +#define MLXFW_ALIGN_UP(x, align_bits) \ + MLXFW_ALIGN_DOWN((x) + ((1 << (align_bits)) - 1), (align_bits)) + +static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, + u32 fwhandle, + struct mlxfw_mfa2_component *comp, + bool reactivate_supp, + struct netlink_ext_ack *extack) +{ + u16 comp_max_write_size; + u8 comp_align_bits; + u32 comp_max_size; + char comp_name[8]; + u16 block_size; + u8 *block_ptr; + u32 offset; + int err; + + sprintf(comp_name, "%u", comp->index); + + err = mlxfw_dev->ops->component_query(mlxfw_dev, comp->index, + &comp_max_size, &comp_align_bits, + &comp_max_write_size); + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, "FSM component query failed", err); + return err; + } + + comp_max_size = min_t(u32, comp_max_size, MLXFW_FSM_MAX_COMPONENT_SIZE); + if (comp->data_size > comp_max_size) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Component size is bigger than limit", -EINVAL); + return -EINVAL; + } + + comp_max_write_size = MLXFW_ALIGN_DOWN(comp_max_write_size, + comp_align_bits); + + mlxfw_dbg(mlxfw_dev, "Component update\n"); + mlxfw_status_notify(mlxfw_dev, "Updating component", comp_name, 0, 0); + err = mlxfw_dev->ops->fsm_component_update(mlxfw_dev, fwhandle, + comp->index, + comp->data_size); + if (err) { + if (!reactivate_supp) + MLXFW_ERR_MSG(mlxfw_dev, extack, + "FSM component update failed, FW reactivate is not supported", + err); + else + MLXFW_ERR_MSG(mlxfw_dev, extack, + "FSM component update failed", err); + return err; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, + MLXFW_FSM_STATE_DOWNLOAD, extack); + if (err) + goto err_out; + + mlxfw_dbg(mlxfw_dev, "Component download\n"); + mlxfw_status_notify(mlxfw_dev, "Downloading component", + comp_name, 0, comp->data_size); + for (offset = 0; + offset < MLXFW_ALIGN_UP(comp->data_size, comp_align_bits); + offset += comp_max_write_size) { + block_ptr = comp->data + offset; + block_size = (u16) min_t(u32, comp->data_size - offset, + comp_max_write_size); + err = mlxfw_dev->ops->fsm_block_download(mlxfw_dev, fwhandle, + block_ptr, block_size, + offset); + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Component download failed", err); + goto err_out; + } + mlxfw_status_notify(mlxfw_dev, "Downloading component", + comp_name, offset + block_size, + comp->data_size); + } + + mlxfw_dbg(mlxfw_dev, "Component verify\n"); + mlxfw_status_notify(mlxfw_dev, "Verifying component", comp_name, 0, 0); + err = mlxfw_dev->ops->fsm_component_verify(mlxfw_dev, fwhandle, + comp->index); + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "FSM component verify failed", err); + goto err_out; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, + MLXFW_FSM_STATE_LOCKED, extack); + if (err) + goto err_out; + return 0; + +err_out: + mlxfw_dev->ops->fsm_cancel(mlxfw_dev, fwhandle); + return err; +} + +static int mlxfw_flash_components(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, + struct mlxfw_mfa2_file *mfa2_file, + bool reactivate_supp, + struct netlink_ext_ack *extack) +{ + u32 component_count; + int err; + int i; + + err = mlxfw_mfa2_file_component_count(mfa2_file, 
mlxfw_dev->psid, + mlxfw_dev->psid_size, + &component_count); + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not find device PSID in MFA2 file", err); + return err; + } + + for (i = 0; i < component_count; i++) { + struct mlxfw_mfa2_component *comp; + + comp = mlxfw_mfa2_file_component_get(mfa2_file, mlxfw_dev->psid, + mlxfw_dev->psid_size, i); + if (IS_ERR(comp)) { + err = PTR_ERR(comp); + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Failed to get MFA2 component", err); + return err; + } + + mlxfw_info(mlxfw_dev, "Flashing component type %d\n", + comp->index); + err = mlxfw_flash_component(mlxfw_dev, fwhandle, comp, + reactivate_supp, extack); + mlxfw_mfa2_file_component_put(comp); + if (err) + return err; + } + return 0; +} + +int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, + const struct firmware *firmware, + struct netlink_ext_ack *extack) +{ + struct mlxfw_mfa2_file *mfa2_file; + bool reactivate_supp = true; + u32 fwhandle; + int err; + + if (!mlxfw_mfa2_check(firmware)) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Firmware file is not MFA2", -EINVAL); + return -EINVAL; + } + + mfa2_file = mlxfw_mfa2_file_init(firmware); + if (IS_ERR(mfa2_file)) { + err = PTR_ERR(mfa2_file); + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Failed to initialize MFA2 firmware file", err); + return err; + } + + mlxfw_info(mlxfw_dev, "Initialize firmware flash process\n"); + mlxfw_status_notify(mlxfw_dev, "Initializing firmware flash process", + NULL, 0, 0); + err = mlxfw_dev->ops->fsm_lock(mlxfw_dev, &fwhandle); + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not lock the firmware FSM", err); + goto err_fsm_lock; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, + MLXFW_FSM_STATE_LOCKED, extack); + if (err) + goto err_state_wait_idle_to_locked; + + err = mlxfw_fsm_reactivate(mlxfw_dev, extack, &reactivate_supp); + if (err) + goto err_fsm_reactivate; + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, + MLXFW_FSM_STATE_LOCKED, extack); + if (err) + goto err_state_wait_reactivate_to_locked; + + err = mlxfw_flash_components(mlxfw_dev, fwhandle, mfa2_file, + reactivate_supp, extack); + if (err) + goto err_flash_components; + + mlxfw_dbg(mlxfw_dev, "Activate image\n"); + mlxfw_status_notify(mlxfw_dev, "Activating image", NULL, 0, 0); + err = mlxfw_dev->ops->fsm_activate(mlxfw_dev, fwhandle); + if (err) { + MLXFW_ERR_MSG(mlxfw_dev, extack, + "Could not activate the downloaded image", err); + goto err_fsm_activate; + } + + err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, + MLXFW_FSM_STATE_LOCKED, extack); + if (err) + goto err_state_wait_activate_to_locked; + + mlxfw_dbg(mlxfw_dev, "Handle release\n"); + mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle); + + mlxfw_info(mlxfw_dev, "Firmware flash done\n"); + mlxfw_status_notify(mlxfw_dev, "Firmware flash done", NULL, 0, 0); + mlxfw_mfa2_file_fini(mfa2_file); + return 0; + +err_state_wait_activate_to_locked: +err_fsm_activate: +err_flash_components: +err_state_wait_reactivate_to_locked: +err_fsm_reactivate: +err_state_wait_idle_to_locked: + mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle); +err_fsm_lock: + mlxfw_mfa2_file_fini(mfa2_file); + return err; +} +EXPORT_SYMBOL(mlxfw_firmware_flash); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Yotam Gigi "); +MODULE_DESCRIPTION("Mellanox firmware flash lib"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c new file mode 100644 index 0000000..5789101 --- /dev/null +++ 
b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "mlxfw_mfa2: " fmt + +#include +#include +#include +#include +#include +#include "mlxfw_mfa2.h" +#include "mlxfw_mfa2_file.h" +#include "mlxfw_mfa2_tlv.h" +#include "mlxfw_mfa2_format.h" +#include "mlxfw_mfa2_tlv_multi.h" + +/* MFA2 FILE + * +----------------------------------+ + * | MFA2 finger print | + * +----------------------------------+ + * | package descriptor multi_tlv | + * | +------------------------------+ | +-----------------+ + * | | package descriptor tlv +-----> |num_devices=n | + * | +------------------------------+ | |num_components=m | + * +----------------------------------+ |CB offset | + * | device descriptor multi_tlv | |... | + * | +------------------------------+ | | | + * | | PSID tlv | | +-----------------+ + * | +------------------------------+ | + * | | component index tlv | | + * | +------------------------------+ | + * +----------------------------------+ + * | component descriptor multi_tlv | + * | +------------------------------+ | +-----------------+ + * | | component descriptor tlv +-----> |Among others: | + * | +------------------------------+ | |CB offset=o | + * +----------------------------------+ |comp index=i | + * | | |... | + * | | | | + * | | +-----------------+ + * | COMPONENT BLOCK (CB) | + * | | + * | | + * | | + * +----------------------------------+ + * + * On the top level, an MFA2 file contains: + * - Fingerprint + * - Several multi_tlvs (TLVs of type MLXFW_MFA2_TLV_MULTI, as defined in + * mlxfw_mfa2_format.h) + * - Compresses content block + * + * The first multi_tlv + * ------------------- + * The first multi TLV is treated as package descriptor, and expected to have a + * first TLV child of type MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR which contains all + * the global information needed to parse the file. Among others, it contains + * the number of device descriptors and component descriptor following this + * multi TLV. + * + * The device descriptor multi_tlv + * ------------------------------- + * The multi TLVs following the package descriptor are treated as device + * descriptor, and are expected to have the following children: + * - PSID TLV child of type MLXFW_MFA2_TLV_PSID containing that device PSID. + * - Component index of type MLXFW_MFA2_TLV_COMPONENT_PTR that contains that + * device component index. + * + * The component descriptor multi_tlv + * ---------------------------------- + * The multi TLVs following the device descriptor multi TLVs are treated as + * component descriptor, and are expected to have a first child of type + * MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR that contains mostly the component index, + * needed for the flash process and the offset to the binary within the + * component block. 
+ */ + +static const u8 mlxfw_mfa2_fingerprint[] = "MLNX.MFA2.XZ.00!"; +static const int mlxfw_mfa2_fingerprint_len = + sizeof(mlxfw_mfa2_fingerprint) - 1; + +static const u8 mlxfw_mfa2_comp_magic[] = "#BIN.COMPONENT!#"; +static const int mlxfw_mfa2_comp_magic_len = sizeof(mlxfw_mfa2_comp_magic) - 1; + +bool mlxfw_mfa2_check(const struct firmware *fw) +{ + if (fw->size < sizeof(mlxfw_mfa2_fingerprint)) + return false; + + return memcmp(fw->data, mlxfw_mfa2_fingerprint, + mlxfw_mfa2_fingerprint_len) == 0; +} + +static bool +mlxfw_mfa2_tlv_multi_validate(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 idx; + + /* Check that all children are valid */ + mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) { + if (!tlv) { + pr_err("Multi has invalid child"); + return false; + } + } + return true; +} + +static bool +mlxfw_mfa2_file_dev_validate(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *dev_tlv, + u16 dev_idx) +{ + const struct mlxfw_mfa2_tlv_component_ptr *cptr; + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv_psid *psid; + const struct mlxfw_mfa2_tlv *tlv; + u16 cptr_count; + u16 cptr_idx; + int err; + + pr_debug("Device %d\n", dev_idx); + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, dev_tlv); + if (!multi) { + pr_err("Device %d is not a valid TLV error\n", dev_idx); + return false; + } + + if (!mlxfw_mfa2_tlv_multi_validate(mfa2_file, multi)) + return false; + + /* Validate the device has PSID tlv */ + tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, multi, + MLXFW_MFA2_TLV_PSID, 0); + if (!tlv) { + pr_err("Device %d does not have PSID\n", dev_idx); + return false; + } + + psid = mlxfw_mfa2_tlv_psid_get(mfa2_file, tlv); + if (!psid) { + pr_err("Device %d PSID TLV is not valid\n", dev_idx); + return false; + } + + print_hex_dump_debug(" -- Device PSID ", DUMP_PREFIX_NONE, 16, 16, + psid->psid, be16_to_cpu(tlv->len), true); + + /* Validate the device has COMPONENT_PTR */ + err = mlxfw_mfa2_tlv_multi_child_count(mfa2_file, multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + &cptr_count); + if (err) + return false; + + if (cptr_count == 0) { + pr_err("Device %d has no components\n", dev_idx); + return false; + } + + for (cptr_idx = 0; cptr_idx < cptr_count; cptr_idx++) { + tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + cptr_idx); + if (!tlv) + return false; + + cptr = mlxfw_mfa2_tlv_component_ptr_get(mfa2_file, tlv); + if (!cptr) { + pr_err("Device %d COMPONENT_PTR TLV is not valid\n", + dev_idx); + return false; + } + + pr_debug(" -- Component index %d\n", + be16_to_cpu(cptr->component_index)); + } + return true; +} + +static bool +mlxfw_mfa2_file_comp_validate(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *comp_tlv, + u16 comp_idx) +{ + const struct mlxfw_mfa2_tlv_component_descriptor *cdesc; + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv *tlv; + + pr_debug("Component %d\n", comp_idx); + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, comp_tlv); + if (!multi) { + pr_err("Component %d is not a valid TLV error\n", comp_idx); + return false; + } + + if (!mlxfw_mfa2_tlv_multi_validate(mfa2_file, multi)) + return false; + + /* Check that component have COMPONENT_DESCRIPTOR as first child */ + tlv = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi); + if (!tlv) { + pr_err("Component descriptor %d multi TLV error\n", comp_idx); + return false; + } + + cdesc = 
mlxfw_mfa2_tlv_component_descriptor_get(mfa2_file, tlv); + if (!cdesc) { + pr_err("Component %d does not have a valid descriptor\n", + comp_idx); + return false; + } + pr_debug(" -- Component type %d\n", be16_to_cpu(cdesc->identifier)); + pr_debug(" -- Offset 0x%llx and size %d\n", + ((u64) be32_to_cpu(cdesc->cb_offset_h) << 32) + | be32_to_cpu(cdesc->cb_offset_l), be32_to_cpu(cdesc->size)); + + return true; +} + +static bool mlxfw_mfa2_file_validate(const struct mlxfw_mfa2_file *mfa2_file) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 idx; + + pr_debug("Validating file\n"); + + /* check that all the devices exist */ + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, mfa2_file->first_dev, + mfa2_file->dev_count) { + if (!tlv) { + pr_err("Device TLV error\n"); + return false; + } + + /* Check each device */ + if (!mlxfw_mfa2_file_dev_validate(mfa2_file, tlv, idx)) + return false; + } + + /* check that all the components exist */ + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, mfa2_file->first_component, + mfa2_file->component_count) { + if (!tlv) { + pr_err("Device TLV error\n"); + return false; + } + + /* Check each component */ + if (!mlxfw_mfa2_file_comp_validate(mfa2_file, tlv, idx)) + return false; + } + return true; +} + +struct mlxfw_mfa2_file *mlxfw_mfa2_file_init(const struct firmware *fw) +{ + const struct mlxfw_mfa2_tlv_package_descriptor *pd; + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv *multi_child; + const struct mlxfw_mfa2_tlv *first_tlv; + struct mlxfw_mfa2_file *mfa2_file; + const void *first_tlv_ptr; + const void *cb_top_ptr; + + mfa2_file = kzalloc(sizeof(*mfa2_file), GFP_KERNEL); + if (!mfa2_file) + return ERR_PTR(-ENOMEM); + + mfa2_file->fw = fw; + first_tlv_ptr = fw->data + NLA_ALIGN(mlxfw_mfa2_fingerprint_len); + first_tlv = mlxfw_mfa2_tlv_get(mfa2_file, first_tlv_ptr); + if (!first_tlv) { + pr_err("Could not parse package descriptor TLV\n"); + goto err_out; + } + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, first_tlv); + if (!multi) { + pr_err("First TLV is not of valid multi type\n"); + goto err_out; + } + + multi_child = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi); + if (!multi_child) + goto err_out; + + pd = mlxfw_mfa2_tlv_package_descriptor_get(mfa2_file, multi_child); + if (!pd) { + pr_err("Could not parse package descriptor TLV\n"); + goto err_out; + } + + mfa2_file->first_dev = mlxfw_mfa2_tlv_next(mfa2_file, first_tlv); + if (!mfa2_file->first_dev) { + pr_err("First device TLV is not valid\n"); + goto err_out; + } + + mfa2_file->dev_count = be16_to_cpu(pd->num_devices); + mfa2_file->first_component = mlxfw_mfa2_tlv_advance(mfa2_file, + mfa2_file->first_dev, + mfa2_file->dev_count); + mfa2_file->component_count = be16_to_cpu(pd->num_components); + mfa2_file->cb = fw->data + NLA_ALIGN(be32_to_cpu(pd->cb_offset)); + if (!mlxfw_mfa2_valid_ptr(mfa2_file, mfa2_file->cb)) { + pr_err("Component block is out side the file\n"); + goto err_out; + } + mfa2_file->cb_archive_size = be32_to_cpu(pd->cb_archive_size); + cb_top_ptr = mfa2_file->cb + mfa2_file->cb_archive_size - 1; + if (!mlxfw_mfa2_valid_ptr(mfa2_file, cb_top_ptr)) { + pr_err("Component block size is too big\n"); + goto err_out; + } + + if (!mlxfw_mfa2_file_validate(mfa2_file)) + goto err_out; + return mfa2_file; +err_out: + kfree(mfa2_file); + return ERR_PTR(-EINVAL); +} + +static const struct mlxfw_mfa2_tlv_multi * +mlxfw_mfa2_tlv_dev_get(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, u16 psid_size) +{ + const struct mlxfw_mfa2_tlv_psid *tlv_psid; + const struct 
mlxfw_mfa2_tlv_multi *dev_multi; + const struct mlxfw_mfa2_tlv *dev_tlv; + const struct mlxfw_mfa2_tlv *tlv; + u32 idx; + + /* for each device tlv */ + mlxfw_mfa2_tlv_foreach(mfa2_file, dev_tlv, idx, mfa2_file->first_dev, + mfa2_file->dev_count) { + if (!dev_tlv) + return NULL; + + dev_multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, dev_tlv); + if (!dev_multi) + return NULL; + + /* find psid child and compare */ + tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, dev_multi, + MLXFW_MFA2_TLV_PSID, 0); + if (!tlv) + return NULL; + if (be16_to_cpu(tlv->len) != psid_size) + continue; + + tlv_psid = mlxfw_mfa2_tlv_psid_get(mfa2_file, tlv); + if (!tlv_psid) + return NULL; + + if (memcmp(psid, tlv_psid->psid, psid_size) == 0) + return dev_multi; + } + + return NULL; +} + +int mlxfw_mfa2_file_component_count(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, u32 psid_size, + u32 *p_count) +{ + const struct mlxfw_mfa2_tlv_multi *dev_multi; + u16 count; + int err; + + dev_multi = mlxfw_mfa2_tlv_dev_get(mfa2_file, psid, psid_size); + if (!dev_multi) + return -EINVAL; + + err = mlxfw_mfa2_tlv_multi_child_count(mfa2_file, dev_multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + &count); + if (err) + return err; + + *p_count = count; + return 0; +} + +static int mlxfw_mfa2_xz_dec_run(struct xz_dec *xz_dec, struct xz_buf *xz_buf, + bool *finished) +{ + enum xz_ret xz_ret; + + xz_ret = xz_dec_run(xz_dec, xz_buf); + + switch (xz_ret) { + case XZ_STREAM_END: + *finished = true; + return 0; + case XZ_OK: + *finished = false; + return 0; + case XZ_MEM_ERROR: + pr_err("xz no memory\n"); + return -ENOMEM; + case XZ_DATA_ERROR: + pr_err("xz file corrupted\n"); + return -EINVAL; + case XZ_FORMAT_ERROR: + pr_err("xz format not found\n"); + return -EINVAL; + case XZ_OPTIONS_ERROR: + pr_err("unsupported xz option\n"); + return -EINVAL; + case XZ_MEMLIMIT_ERROR: + pr_err("xz dictionary too small\n"); + return -EINVAL; + default: + pr_err("xz error %d\n", xz_ret); + return -EINVAL; + } +} + +static int mlxfw_mfa2_file_cb_offset_xz(const struct mlxfw_mfa2_file *mfa2_file, + off_t off, size_t size, u8 *buf) +{ + struct xz_dec *xz_dec; + struct xz_buf dec_buf; + off_t curr_off = 0; + bool finished; + int err; + + xz_dec = xz_dec_init(XZ_DYNALLOC, (u32) -1); + if (!xz_dec) + return -EINVAL; + + dec_buf.in_size = mfa2_file->cb_archive_size; + dec_buf.in = mfa2_file->cb; + dec_buf.in_pos = 0; + dec_buf.out = buf; + + /* decode up to the offset */ + do { + dec_buf.out_pos = 0; + dec_buf.out_size = min_t(size_t, size, off - curr_off); + if (dec_buf.out_size == 0) + break; + + err = mlxfw_mfa2_xz_dec_run(xz_dec, &dec_buf, &finished); + if (err) + goto out; + if (finished) { + pr_err("xz section too short\n"); + err = -EINVAL; + goto out; + } + curr_off += dec_buf.out_pos; + } while (curr_off != off); + + /* decode the needed section */ + dec_buf.out_pos = 0; + dec_buf.out_size = size; + err = mlxfw_mfa2_xz_dec_run(xz_dec, &dec_buf, &finished); +out: + xz_dec_end(xz_dec); + return err; +} + +static const struct mlxfw_mfa2_tlv_component_descriptor * +mlxfw_mfa2_file_component_tlv_get(const struct mlxfw_mfa2_file *mfa2_file, + u16 comp_index) +{ + const struct mlxfw_mfa2_tlv_multi *multi; + const struct mlxfw_mfa2_tlv *multi_child; + const struct mlxfw_mfa2_tlv *comp_tlv; + + if (comp_index > mfa2_file->component_count) + return NULL; + + comp_tlv = mlxfw_mfa2_tlv_advance(mfa2_file, mfa2_file->first_component, + comp_index); + if (!comp_tlv) + return NULL; + + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, comp_tlv); + if (!multi) + return 
NULL; + + multi_child = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi); + if (!multi_child) + return NULL; + + return mlxfw_mfa2_tlv_component_descriptor_get(mfa2_file, multi_child); +} + +struct mlxfw_mfa2_comp_data { + struct mlxfw_mfa2_component comp; + u8 buff[]; +}; + +static const struct mlxfw_mfa2_tlv_component_descriptor * +mlxfw_mfa2_file_component_find(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, int psid_size, + int component_index) +{ + const struct mlxfw_mfa2_tlv_component_ptr *cptr; + const struct mlxfw_mfa2_tlv_multi *dev_multi; + const struct mlxfw_mfa2_tlv *cptr_tlv; + u16 comp_idx; + + dev_multi = mlxfw_mfa2_tlv_dev_get(mfa2_file, psid, psid_size); + if (!dev_multi) + return NULL; + + cptr_tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, dev_multi, + MLXFW_MFA2_TLV_COMPONENT_PTR, + component_index); + if (!cptr_tlv) + return NULL; + + cptr = mlxfw_mfa2_tlv_component_ptr_get(mfa2_file, cptr_tlv); + if (!cptr) + return NULL; + + comp_idx = be16_to_cpu(cptr->component_index); + return mlxfw_mfa2_file_component_tlv_get(mfa2_file, comp_idx); +} + +struct mlxfw_mfa2_component * +mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, int psid_size, + int component_index) +{ + const struct mlxfw_mfa2_tlv_component_descriptor *comp; + struct mlxfw_mfa2_comp_data *comp_data; + u32 comp_buf_size; + off_t cb_offset; + u32 comp_size; + int err; + + comp = mlxfw_mfa2_file_component_find(mfa2_file, psid, psid_size, + component_index); + if (!comp) + return ERR_PTR(-EINVAL); + + cb_offset = (u64) be32_to_cpu(comp->cb_offset_h) << 32 | + be32_to_cpu(comp->cb_offset_l); + comp_size = be32_to_cpu(comp->size); + comp_buf_size = comp_size + mlxfw_mfa2_comp_magic_len; + + comp_data = vzalloc(sizeof(*comp_data) + comp_buf_size); + if (!comp_data) + return ERR_PTR(-ENOMEM); + comp_data->comp.data_size = comp_size; + comp_data->comp.index = be16_to_cpu(comp->identifier); + err = mlxfw_mfa2_file_cb_offset_xz(mfa2_file, cb_offset, comp_buf_size, + comp_data->buff); + if (err) { + pr_err("Component could not be reached in CB\n"); + goto err_out; + } + + if (memcmp(comp_data->buff, mlxfw_mfa2_comp_magic, + mlxfw_mfa2_comp_magic_len) != 0) { + pr_err("Component has wrong magic\n"); + err = -EINVAL; + goto err_out; + } + + comp_data->comp.data = comp_data->buff + mlxfw_mfa2_comp_magic_len; + return &comp_data->comp; +err_out: + vfree(comp_data); + return ERR_PTR(err); +} + +void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *comp) +{ + const struct mlxfw_mfa2_comp_data *comp_data; + + comp_data = container_of(comp, struct mlxfw_mfa2_comp_data, comp); + vfree(comp_data); +} + +void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file) +{ + kfree(mfa2_file); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h new file mode 100644 index 0000000..5bba6ad --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ +/* Copyright (c) 2017-2019 Mellanox Technologies. 
All rights reserved */ + +#ifndef _MLXFW_MFA2_H +#define _MLXFW_MFA2_H + +#include +#include "mlxfw.h" + +struct mlxfw_mfa2_component { + u16 index; + u32 data_size; + u8 *data; +}; + +struct mlxfw_mfa2_file; + +bool mlxfw_mfa2_check(const struct firmware *fw); + +struct mlxfw_mfa2_file *mlxfw_mfa2_file_init(const struct firmware *fw); + +int mlxfw_mfa2_file_component_count(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, u32 psid_size, + u32 *p_count); + +struct mlxfw_mfa2_component * +mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file, + const char *psid, int psid_size, + int component_index); + +void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *component); + +void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h new file mode 100644 index 0000000..874c0a2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ +/* Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved */ + +#ifndef _MLXFW_MFA2_FILE_H +#define _MLXFW_MFA2_FILE_H + +#include +#include + +struct mlxfw_mfa2_file { + const struct firmware *fw; + const struct mlxfw_mfa2_tlv *first_dev; + u16 dev_count; + const struct mlxfw_mfa2_tlv *first_component; + u16 component_count; + const void *cb; /* components block */ + u32 cb_archive_size; /* size of compressed components block */ +}; + +static inline bool mlxfw_mfa2_valid_ptr(const struct mlxfw_mfa2_file *mfa2_file, + const void *ptr) +{ + const void *valid_to = mfa2_file->fw->data + mfa2_file->fw->size; + const void *valid_from = mfa2_file->fw->data; + + return ptr > valid_from && ptr < valid_to; +} + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h new file mode 100644 index 0000000..b001e52 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ +/* Copyright (c) 2017-2019 Mellanox Technologies. 
All rights reserved */ + +#ifndef _MLXFW_MFA2_FORMAT_H +#define _MLXFW_MFA2_FORMAT_H + +#include "mlxfw_mfa2_file.h" +#include "mlxfw_mfa2_tlv.h" + +enum mlxfw_mfa2_tlv_type { + MLXFW_MFA2_TLV_MULTI_PART = 0x01, + MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR = 0x02, + MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR = 0x04, + MLXFW_MFA2_TLV_COMPONENT_PTR = 0x22, + MLXFW_MFA2_TLV_PSID = 0x2A, +}; + +enum mlxfw_mfa2_compression_type { + MLXFW_MFA2_COMPRESSION_TYPE_NONE, + MLXFW_MFA2_COMPRESSION_TYPE_XZ, +}; + +struct mlxfw_mfa2_tlv_package_descriptor { + __be16 num_components; + __be16 num_devices; + __be32 cb_offset; + __be32 cb_archive_size; + __be32 cb_size_h; + __be32 cb_size_l; + u8 padding[3]; + u8 cv_compression; + __be32 user_data_offset; +} __packed; + +MLXFW_MFA2_TLV(package_descriptor, struct mlxfw_mfa2_tlv_package_descriptor, + MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR); + +struct mlxfw_mfa2_tlv_multi { + __be16 num_extensions; + __be16 total_len; +} __packed; + +MLXFW_MFA2_TLV(multi, struct mlxfw_mfa2_tlv_multi, + MLXFW_MFA2_TLV_MULTI_PART); + +struct mlxfw_mfa2_tlv_psid { + u8 psid[0]; +} __packed; + +MLXFW_MFA2_TLV_VARSIZE(psid, struct mlxfw_mfa2_tlv_psid, + MLXFW_MFA2_TLV_PSID); + +struct mlxfw_mfa2_tlv_component_ptr { + __be16 storage_id; + __be16 component_index; + __be32 storage_address; +} __packed; + +MLXFW_MFA2_TLV(component_ptr, struct mlxfw_mfa2_tlv_component_ptr, + MLXFW_MFA2_TLV_COMPONENT_PTR); + +struct mlxfw_mfa2_tlv_component_descriptor { + __be16 pldm_classification; + __be16 identifier; + __be32 cb_offset_h; + __be32 cb_offset_l; + __be32 size; +} __packed; + +MLXFW_MFA2_TLV(component_descriptor, struct mlxfw_mfa2_tlv_component_descriptor, + MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR); + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h new file mode 100644 index 0000000..2014a5d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ +/* Copyright (c) 2017-2019 Mellanox Technologies. 
All rights reserved */ + +#ifndef _MLXFW_MFA2_TLV_H +#define _MLXFW_MFA2_TLV_H + +#include +#include "mlxfw_mfa2_file.h" + +struct mlxfw_mfa2_tlv { + u8 version; + u8 type; + __be16 len; + u8 data[]; +} __packed; + +static inline const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_get(const struct mlxfw_mfa2_file *mfa2_file, const void *ptr) +{ + if (!mlxfw_mfa2_valid_ptr(mfa2_file, ptr) || + !mlxfw_mfa2_valid_ptr(mfa2_file, ptr + sizeof(struct mlxfw_mfa2_tlv))) + return NULL; + return ptr; +} + +static inline const void * +mlxfw_mfa2_tlv_payload_get(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *tlv, u8 payload_type, + size_t payload_size, bool varsize) +{ + void *tlv_top; + + tlv_top = (void *) tlv + be16_to_cpu(tlv->len) - 1; + if (!mlxfw_mfa2_valid_ptr(mfa2_file, tlv) || + !mlxfw_mfa2_valid_ptr(mfa2_file, tlv_top)) + return NULL; + if (tlv->type != payload_type) + return NULL; + if (varsize && (be16_to_cpu(tlv->len) < payload_size)) + return NULL; + if (!varsize && (be16_to_cpu(tlv->len) != payload_size)) + return NULL; + + return tlv->data; +} + +#define MLXFW_MFA2_TLV(name, payload_type, tlv_type) \ +static inline const payload_type * \ +mlxfw_mfa2_tlv_ ## name ## _get(const struct mlxfw_mfa2_file *mfa2_file, \ + const struct mlxfw_mfa2_tlv *tlv) \ +{ \ + return mlxfw_mfa2_tlv_payload_get(mfa2_file, tlv, \ + tlv_type, sizeof(payload_type), \ + false); \ +} + +#define MLXFW_MFA2_TLV_VARSIZE(name, payload_type, tlv_type) \ +static inline const payload_type * \ +mlxfw_mfa2_tlv_ ## name ## _get(const struct mlxfw_mfa2_file *mfa2_file, \ + const struct mlxfw_mfa2_tlv *tlv) \ +{ \ + return mlxfw_mfa2_tlv_payload_get(mfa2_file, tlv, \ + tlv_type, sizeof(payload_type), \ + true); \ +} + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c new file mode 100644 index 0000000..418a60d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2017-2019 Mellanox Technologies. 
All rights reserved */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) "MFA2: " fmt + +#include "mlxfw_mfa2_tlv_multi.h" +#include + +#define MLXFW_MFA2_TLV_TOTAL_SIZE(tlv) \ + NLA_ALIGN(sizeof(*(tlv)) + be16_to_cpu((tlv)->len)) + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi) +{ + size_t multi_len; + + multi_len = NLA_ALIGN(sizeof(struct mlxfw_mfa2_tlv_multi)); + return mlxfw_mfa2_tlv_get(mfa2_file, (void *) multi + multi_len); +} + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *tlv) +{ + const struct mlxfw_mfa2_tlv_multi *multi; + u16 tlv_len; + void *next; + + tlv_len = MLXFW_MFA2_TLV_TOTAL_SIZE(tlv); + + if (tlv->type == MLXFW_MFA2_TLV_MULTI_PART) { + multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv); + tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len)); + } + + next = (void *) tlv + tlv_len; + return mlxfw_mfa2_tlv_get(mfa2_file, next); +} + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_advance(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *from_tlv, u16 count) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 idx; + + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, from_tlv, count) + if (!tlv) + return NULL; + return tlv; +} + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child_find(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, u16 index) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 skip = 0; + u16 idx; + + mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) { + if (!tlv) { + pr_err("TLV parsing error\n"); + return NULL; + } + if (tlv->type == type) + if (skip++ == index) + return tlv; + } + return NULL; +} + +int mlxfw_mfa2_tlv_multi_child_count(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, + u16 *p_count) +{ + const struct mlxfw_mfa2_tlv *tlv; + u16 count = 0; + u16 idx; + + mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) { + if (!tlv) { + pr_err("TLV parsing error\n"); + return -EINVAL; + } + + if (tlv->type == type) + count++; + } + *p_count = count; + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h new file mode 100644 index 0000000..633284e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ +/* Copyright (c) 2017-2019 Mellanox Technologies. 
All rights reserved */ + +#ifndef _MLXFW_MFA2_TLV_MULTI_H +#define _MLXFW_MFA2_TLV_MULTI_H + +#include "mlxfw_mfa2_tlv.h" +#include "mlxfw_mfa2_format.h" +#include "mlxfw_mfa2_file.h" + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi); + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *tlv); + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_advance(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv *from_tlv, u16 count); + +const struct mlxfw_mfa2_tlv * +mlxfw_mfa2_tlv_multi_child_find(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, u16 index); + +int mlxfw_mfa2_tlv_multi_child_count(const struct mlxfw_mfa2_file *mfa2_file, + const struct mlxfw_mfa2_tlv_multi *multi, + enum mlxfw_mfa2_tlv_type type, + u16 *p_count); + +#define mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, from_tlv, count) \ + for (idx = 0, tlv = from_tlv; idx < (count); \ + idx++, tlv = mlxfw_mfa2_tlv_next(mfa2_file, tlv)) + +#define mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) \ + mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, \ + mlxfw_mfa2_tlv_multi_child(mfa2_file, multi), \ + be16_to_cpu(multi->num_extensions) + 1) +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/Makefile new file mode 100644 index 0000000..f160aa5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -0,0 +1,3 @@ +obj-m += mlxsw_spectrum.o + +mlxsw_spectrum-y := spectrum_main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/spectrum_main.c b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/spectrum_main.c new file mode 100644 index 0000000..5f7449a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/net/ethernet/mellanox/mlxsw/spectrum_main.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "mlxsw_spectrum" +#define DRV_VERSION "1.0" + +MODULE_AUTHOR("Mohammad Kabat"); +MODULE_DESCRIPTION("mlxsw_spectrum dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init spectrum_init(void) +{ + return 0; +} + +static void __exit spectrum_cleanup(void) +{ +} + +module_init(spectrum_init); +module_exit(spectrum_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/nvme/Kconfig new file mode 100644 index 0000000..87ae409 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +menu "NVME Support" + +source "drivers/nvme/host/Kconfig" +source "drivers/nvme/target/Kconfig" + +endmenu diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/nvme/Makefile new file mode 100644 index 0000000..946ab6c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Retpoline support: check if this is the right architecture and that +# the kernel does not support it already. +# Alternatively, if we are called from the main mlnx-ofa build system, +# CONFIG_RETPOLINE will be set by the configure script, however +# subdir-ccflags-y will be set by the toplevel Makefile. +ifneq (,$(findstring $(ARCH),i386 x86_64)) + ifndef CONFIG_RETPOLINE + ifneq (,$(shell awk 'BEGIN {if ($(VERSION).$(PATCHLEVEL) < 4.15) {print 1}}' /dev/null | head -1) +kconfig_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/kconfig.h 2> /dev/null | head -1) + +ifneq ($(kconfig_h),) +KCONFIG_H = -include $(kconfig_h) +endif + +V ?= 0 + +# GCC earlier than 4.6.0 will build modules which require 'mcount', +# and this symbol will not be available in the kernel if the kernel was +# compiled with GCC 4.6.0 and above. +# therefore, to prevent unknown symbol issues we disable function tracing. +# +CC = $(CROSS_COMPILE)gcc +CPP = $(CC) -E + +CPP_MAJOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f1) +CPP_MINOR := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f2) +CPP_PATCH := $(shell $(CPP) -dumpversion 2>&1 | cut -d'.' -f3) +# Assumes that major, minor, and patch cannot exceed 999 +CPP_VERS := $(shell expr 0$(CPP_MAJOR) \* 1000000 + 0$(CPP_MINOR) \* 1000 + 0$(CPP_PATCH)) +compile_h=$(shell /bin/ls -1 $(K_BUILD)/include/*/compile.h 2> /dev/null | head -1) +ifneq ($(compile_h),) +KERNEL_GCC_MAJOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f1) +KERNEL_GCC_MINOR := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f2) +KERNEL_GCC_PATCH := $(shell grep LINUX_COMPILER $(compile_h) | sed -r -e 's/.*gcc version ([0-9\.\-]*) .*/\1/g' | cut -d'.' -f3) +KERNEL_GCC_VER := $(shell expr 0$(KERNEL_GCC_MAJOR) \* 1000000 + 0$(KERNEL_GCC_MINOR) \* 1000 + 0$(KERNEL_GCC_PATCH)) +ifneq ($(shell if [ $(CPP_VERS) -lt 4006000 ] && [ $(KERNEL_GCC_VER) -ge 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC newer than 4.6.0, while the current GCC is older than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) 
+override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +ifneq ($(shell if [ $(CPP_VERS) -ge 4006000 ] && [ $(KERNEL_GCC_VER) -lt 4006000 ]; then \ + echo "YES"; else echo ""; fi),) +$(info Warning: The kernel was compiled with GCC older than 4.6.0, while the current GCC is newer than 4.6.0, Disabling function tracing to prevent unknown symbol issues...) +override WITH_MAKE_PARAMS += CONFIG_FUNCTION_TRACER= CONFIG_HAVE_FENTRY= +endif +endif + +ifneq ($(shell if (echo $(KVER) | grep -qE 'uek'); then \ + echo "YES"; else echo ""; fi),) +override WITH_MAKE_PARAMS += ctf-dir=$(CWD)/.ctf +endif + +ifeq ($(CONFIG_GPU_DIRECT_STORAGE),y) +override WITH_MAKE_PARAMS += CONFIG_GPU_DIRECT_STORAGE=y +endif + +ifeq ($(CONFIG_NVME_HOST_WITHOUT_FC),m) +MY_NVMF_KCONFIGS := CONFIG_NVME_HOST_WITHOUT_FC=m \ + CONFIG_NVME_TARGET= \ + CONFIG_NVME_TARGET_LOOP= \ + CONFIG_NVME_TARGET_RDMA= \ + CONFIG_NVME_TARGET_FC= \ + CONFIG_NVME_TARGET_FCLOOP= \ + CONFIG_NVME_TARGET_DUMMY= \ + CONFIG_NVME_CORE=m \ + CONFIG_BLK_DEV_NVME=m \ + CONFIG_NVME_FABRICS=m \ + CONFIG_NVME_FC= \ + CONFIG_NVME_RDMA=m \ + CONFIG_NVME_MULTIPATH=y \ + CONFIG_NVME_HOST_DUMMY= +else +MY_NVMF_KCONFIGS := CONFIG_NVME_HOST_WITHOUT_FC= \ + CONFIG_NVME_TARGET=m \ + CONFIG_NVME_TARGET_LOOP=m \ + CONFIG_NVME_TARGET_RDMA=m \ + CONFIG_NVME_TARGET_FC=m \ + CONFIG_NVME_TARGET_FCLOOP=m \ + CONFIG_NVME_TARGET_DUMMY= \ + CONFIG_NVME_CORE=m \ + CONFIG_BLK_DEV_NVME=m \ + CONFIG_NVME_FABRICS=m \ + CONFIG_NVME_FC=m \ + CONFIG_NVME_RDMA=m \ + CONFIG_NVME_MULTIPATH=y \ + CONFIG_NVME_HOST_DUMMY= +endif + +EXTRA_CFLAGS += -DCONFIG_NVME_MULTIPATH=1 + +name := mlnx-nvme +VERSION=$(shell grep "define _version" $(name).spec | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +RELEASE=$(shell grep "define _release" $(name).spec | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') +PACKAGE := $(name)-$(VERSION) +SHELL = /bin/bash +rpmspec := $(name).spec +rpmroot = $(PWD)/rpm-dist/ +rpmopts = --nodeps --buildroot='$(rpmroot)/_rpm' --define '_source_filedigest_algorithm md5' --define '_binary_filedigest_algorithm md5' +rpmmacros =\ + --define='_topdir $(rpmroot)'\ + --define='_rpmdir $(rpmroot)'\ + --define='_srcrpmdir $(rpmroot)'\ + --define='_sourcedir $(rpmroot)'\ + --define='_specdir $(PWD)' +override WITH_MAKE_PARAMS += KBUILD_EXTRA_SYMBOLS=$(OFA)/Module.symvers + +LINUXINCLUDE=\ + $(EXTRA_CFLAGS) \ + -include $(autoconf_h) \ + $(KCONFIG_H) \ + -include $(OFA)/include/linux/compat-2.6.h \ + -I$(PWD) \ + -I$(OFA)/include \ + -I$(OFA)/include/uapi \ + $(BACKPORT_INCLUDES) \ + $$(if $$(CONFIG_XEN),-D__XEN_INTERFACE_VERSION__=$$(CONFIG_XEN_INTERFACE_VERSION)) \ + $$(if $$(CONFIG_XEN),-I$$(srctree)/arch/x86/include/mach-xen) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + -Iinclude \ + -I$$(srctree)/arch/$$(SRCARCH)/include/uapi \ + -Iarch/$$(SRCARCH)/include/generated/uapi \ + -I$$(srctree)/include \ + -I$$(srctree)/include/uapi \ + -Iinclude/generated/uapi \ + $$(if $$(KBUILD_SRC),-Iinclude2 -I$$(srctree)/include) \ + -I$$(srctree)/arch/$$(SRCARCH)/include \ + -Iarch/$$(SRCARCH)/include/generated \ + # + +default: +# compile with ofed driver + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) $(WITH_MAKE_PARAMS) \ + $(MY_NVMF_KCONFIGS) \ + CONFIG_DTRACE= \ + CONFIG_CTF= \ + LINUXINCLUDE='$(LINUXINCLUDE)' \ + modules + +install: + make -C $(K_BUILD) O=$(K_OBJ) M=$(shell pwd) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) $(WITH_MAKE_PARAMS) modules_install + if [ 
! -n "$(INSTALL_MOD_PATH)" ]; then /sbin/depmod $(KVER);fi; + +rpmcheck: + @which rpmbuild &> /dev/null; \ + if [ $$? -ne 0 ]; then \ + echo "*** This make target requires an rpm-based linux distribution."; \ + (exit 1); exit 1; \ + fi + -mkdir -p $(rpmroot)/BUILD + +srcrpm: dist rpmcheck $(rpmspec) + -rpmbuild -bs --define 'src_release $(RELEASE)' $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +binrpm: rpmcheck $(rpmspec) + -rpmbuild -bb $(rpmmacros) $(rpmopts) $(rpmspec); \ + if [ $$? -ne 0 ]; then \ + (exit 1); exit 1; \ + fi + +dist: + mkdir -p $(rpmroot)/$(PACKAGE)/ + cp {$(rpmspec),makefile,Makefile,dkms.conf,Module.supported} $(rpmroot)/$(PACKAGE)/ + cp common.postinst $(rpmroot)/$(PACKAGE)/ + cp -r target $(rpmroot)/$(PACKAGE)/ + cp -r host $(rpmroot)/$(PACKAGE)/ + cp -r lpfc $(rpmroot)/$(PACKAGE)/ + cp -r debian $(rpmroot)/$(PACKAGE)/ + cp -r tools $(rpmroot)/$(PACKAGE)/ + cd $(rpmroot) && tar czf $(PACKAGE).tgz $(PACKAGE) + cd $(rpmroot) && tar czf $(name)_$(VERSION).orig.tar.gz $(PACKAGE) + +clean: + rm -f *.o + rm -f *.ko *.ko.gz + rm -f *.mod.c + rm -f Module*.symvers modules*.order + +distclean: clean + @rm -rf $(PWD)/rpm-dist + rm -f makefile *.spec + +all: clean distclean dist srcrpm binrpm diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/autogen.sh b/src/mlnx-ofa_kernel-5.8/drivers/nvme/autogen.sh new file mode 100755 index 0000000..fb4410b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/autogen.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +name=mlnx-nvme +version=$(grep "define _version" ${name}_spec_ | sed -e 's/.*_version //' | sed -e 's/}//' | sed -e 's/\s*//g') +release=$(grep "define _release" ${name}_spec_ | sed -e 's/.*_release //' | sed -e 's/}//' | sed -e 's/\s*//g') + +/bin/cp -f ${name}_spec_ ${name}.spec +/bin/cp -f _makefile_ makefile +/bin/sed -i -r "s/^$name \(([0-9.-]+)\) (.*)/$name \($version-$release\) \2/" debian/changelog + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/common.postinst b/src/mlnx-ofa_kernel-5.8/drivers/nvme/common.postinst new file mode 100755 index 0000000..bbf9aad --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/common.postinst @@ -0,0 +1,296 @@ +#!/bin/sh +# Copyright (C) 2002-2005 Flavio Stanchina +# Copyright (C) 2005-2006 Aric Cyr +# Copyright (C) 2007 Mario Limonciello +# Copyright (C) 2009 Alberto Milone + +set -e + +uname_s=$(uname -s) + +_get_kernel_dir() { + KVER=$1 + case ${uname_s} in + Linux) DIR="/lib/modules/$KVER/build" ;; + GNU/kFreeBSD) DIR="/usr/src/kfreebsd-headers-$KVER/sys" ;; + esac + echo $DIR +} + +_check_kernel_dir() { + DIR=$(_get_kernel_dir $1) + case ${uname_s} in + Linux) test -e $DIR/include ;; + GNU/kFreeBSD) test -e $DIR/kern && test -e $DIR/conf/kmod.mk ;; + *) return 1 ;; + esac + return $? +} + +# Check the existence of a kernel named as $1 +_is_kernel_name_correct() { + CORRECT="no" + KERNEL_NAME=$1 + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + if [ "${KERNEL}" = "${KERNEL_NAME}" ]; then + CORRECT="yes" + break + fi + done + + echo $CORRECT +} + + +# Get the most recent kernel on Debian based systems. This keeps +# into account both the version and the ABI. If the current kernel +# is the most recent kernel then the function will print a null string. 
+_get_newest_kernel_debian() { + NEWEST_KERNEL= + NEWEST_VERSION= + NEWEST_ABI= + + for kernel in /boot/config-*; do + KERNEL=${kernel#*-} + KERNEL_VERSION=${KERNEL%%-*} + ABI=${KERNEL#*-} + ABI=${ABI%%-*} + + if [ -z "$NEWEST_KERNEL" ]; then + # The 1st time get a version which is bigger than $1 + COMPARE_TO=$1 + else + # Get the biggest version + COMPARE_TO="$NEWEST_VERSION-$NEWEST_ABI" + fi + + # if $kernel is greater than $COMPARE_TO + if [ `dpkg --compare-versions "$KERNEL_VERSION-$ABI" gt "$COMPARE_TO" && echo "yes" || \ + echo "no"` = "yes" ]; then + NEWEST_KERNEL=$KERNEL + NEWEST_VERSION=$KERNEL_VERSION + NEWEST_ABI=$ABI + fi + done + + echo "$NEWEST_KERNEL" +} + +# Get the most recent kernel in Rhel based systems. If the current kernel +# is the most recent kernel then the function will print a null string. +_get_newest_kernel_rhel() { + NEWEST_KERNEL= + + LAST_INSTALLED_KERNEL=$(rpm -q --whatprovides kernel --last | grep kernel -m1 | cut -f1 -d' ') + + LIK_FORMATTED_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{VERSION}-%{RELEASE}.%{ARCH}\n") + + if [ `echo $LIK_FORMATTED_NAME | grep 2.6 >/dev/null` ]; then + # Fedora and Suse + NEWEST_KERNEL=$LIK_FORMATTED_NAME + else + # Hack for Mandriva where $LIK_FORMATTED_NAME is broken + LIK_NAME=$(rpm -q $LAST_INSTALLED_KERNEL --queryformat="%{NAME}\n") + LIK_TYPE=${LIK_NAME#kernel-} + LIK_TYPE=${LIK_TYPE%%-*} + LIK_STRIPPED=${LIK_NAME#kernel-} + LIK_STRIPPED=${LIK_STRIPPED#$LIK_TYPE-} + LIK_STRIPPED_BASE=${LIK_STRIPPED%%-*} + LIK_STRIPPED_END=${LIK_STRIPPED#$LIK_STRIPPED_BASE-} + LIK_FINAL=$LIK_STRIPPED_BASE-$LIK_TYPE-$LIK_STRIPPED_END + + NEWEST_KERNEL=$LIK_FINAL + fi + + echo $NEWEST_KERNEL +} + +# Get the newest kernel on Debian and Rhel based systems. +get_newest_kernel() { + NEWEST_KERNEL= + # Try Debian first as rpm can be installed in Debian based distros + if [ -e /usr/bin/dpkg ]; then + # If DEB based + CURRENT_KERNEL=$1 + CURRENT_VERSION=${CURRENT_KERNEL%%-*} + CURRENT_ABI=${CURRENT_KERNEL#*-} + CURRENT_FLAVOUR=${CURRENT_ABI#*-} + CURRENT_ABI=${CURRENT_ABI%%-*} + NEWEST_KERNEL=$(_get_newest_kernel_debian "$CURRENT_VERSION-$CURRENT_ABI") + + elif [ `which rpm >/dev/null` ]; then + # If RPM based + NEWEST_KERNEL=$(_get_newest_kernel_rhel) + fi + + # Make sure that kernel name that we extracted corresponds to an installed + # kernel + if [ -n "$NEWEST_KERNEL" ] && [ `_is_kernel_name_correct $NEWEST_KERNEL` = "no" ]; then + NEWEST_KERNEL= + fi + + echo $NEWEST_KERNEL +} + +NAME=$1 +VERSION=$2 +TARBALL_ROOT=$3 +ARCH=$4 +UPGRADE=$5 + +if [ -z "$NAME" ] || [ -z "$VERSION" ]; then + echo "Need NAME, and VERSION defined" + echo "ARCH is optional" + exit 1 +fi + +KERNELS=$(ls /lib/modules/ 2>/dev/null || true) +CURRENT_KERNEL=$(uname -r) + +#We never want to keep an older version side by side to prevent conflicts +if [ -e "/var/lib/dkms/$NAME/$VERSION" ]; then + echo "Removing old $NAME-$VERSION DKMS files..." + dkms remove -m $NAME -v $VERSION --all +fi + +#Load new files, by source package and by tarball +if [ -f "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz" ]; then + if ! dkms ldtarball --archive "$TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz"; then + echo "" + echo "" + echo "Unable to load DKMS tarball $TARBALL_ROOT/$NAME-$VERSION.dkms.tar.gz." + echo "Common causes include: " + echo " - You must be using DKMS 2.1.0.0 or later to support binaries only" + echo " distribution specific archives." 
+ echo " - Corrupt distribution specific archive" + echo "" + echo "" + exit 2 + fi +elif [ -d "/usr/src/$NAME-$VERSION" ]; then + echo "Loading new $NAME-$VERSION DKMS files..." + dkms add -m $NAME -v $VERSION > /dev/null +fi + +# On 1st installation, let us look for a directory +# in /lib/modules which matches `uname -r`. If none +# is found it is possible that buildd is being used +# and that uname -r is giving us the name of the +# kernel used by the buildd machine. +# +# If this is the case we try to build the kernel +# module for each kernel which has a directory in +# /lib/modules. Furthermore we will have to tell +# DKMS which architecture it should build the module +# for (e.g. if the buildd machine is using a +# 2.6.24-23-xen 64bit kernel). +# +# NOTE: if the headers are not installed then the +# module won't be built, as usual +if [ -z "$UPGRADE" ]; then + echo "First Installation: checking all kernels..." + for KERNEL in $KERNELS; do + if [ ${KERNEL} = ${CURRENT_KERNEL} ]; then + # Kernel found + KERNELS=$CURRENT_KERNEL + break + fi + done +else + KERNELS=$CURRENT_KERNEL +fi + +# Here we look for the most recent kernel so that we can +# build the module for it (in addition to doing it for the +# current kernel. +NEWEST_KERNEL=$(get_newest_kernel "$KERNELS") + +# If the current kernel doesn't come from the host of a chroot +if [ `_is_kernel_name_correct $CURRENT_KERNEL` = "yes" ]; then + # See if it's worth building the module for both the newest kernel + # and for the current kernel + if [ -n "$NEWEST_KERNEL" ] && [ ${CURRENT_KERNEL} != ${NEWEST_KERNEL} ]; then + echo "Building for $CURRENT_KERNEL and $NEWEST_KERNEL" + KERNELS="$CURRENT_KERNEL $NEWEST_KERNEL" + else + echo "Building only for $CURRENT_KERNEL" + fi +# The current kernel is not useful as it's the host's +else + echo "It is likely that $CURRENT_KERNEL belongs to a chroot's host" + + # Let's use only the newest kernel + if [ -n "$NEWEST_KERNEL" ]; then + KERNELS="$NEWEST_KERNEL" + echo "Building only for $NEWEST_KERNEL" + fi +fi + +if [ -n "$ARCH" ]; then + if which lsb_release >/dev/null && [ $(lsb_release -s -i) = "Ubuntu" ]; then + case $ARCH in + amd64) + ARCH="x86_64" + ;; + lpia|i?86) + ARCH="i686" + ;; + esac + fi + echo "Building for architecture $ARCH" + ARCH="-a $ARCH" +fi + +for KERNEL in $KERNELS; do + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + if [ `echo $KERNEL | grep -c "BOOT"` -gt 0 ]; then + echo "" + echo "Module build and install for $KERNEL was skipped as " + echo "it is a BOOT variant" + continue + fi + + + #if the module isn't yet built, try to build it + if [ `echo $dkms_status | grep -c ": built"` -eq 0 ]; then + if [ ! -L /var/lib/dkms/$NAME/$VERSION/source ]; then + echo "This package appears to be a binaries-only package" + echo " you will not be able to build against kernel $KERNEL" + echo " since the package source was not provided" + continue + fi + if _check_kernel_dir $KERNEL; then + echo "Building initial module for $KERNEL" + set +e + dkms build -m $NAME -v $VERSION -k $KERNEL $ARCH > /dev/null + rc=$? + case $rc in + 9) + set -e + echo "Skipped." + continue + ;; + 0) + set -e + echo "Done." + ;; + *) + exit $rc + ;; + esac + dkms_status=`dkms status -m $NAME -v $VERSION -k $KERNEL $ARCH` + else + echo "Module build for the currently running kernel was skipped since the" + echo "kernel source for this kernel does not seem to be installed." 
+ fi + fi + + #if the module is built (either pre-built or just now), install it + if [ `echo $dkms_status | grep -c ": built"` -eq 1 ] && + [ `echo $dkms_status | grep -c ": installed"` -eq 0 ]; then + dkms install -m $NAME -v $VERSION -k $KERNEL $ARCH --force + fi +done + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/changelog b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/changelog new file mode 100644 index 0000000..f8dbea9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/changelog @@ -0,0 +1,5 @@ +mlnx-nvme (0.0-0) unstable; urgency=low + + * Initial release. + + -- Alaa Hleihel Sun, 21 Aug 2016 10:30:53 +0200 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/compat b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/compat new file mode 100644 index 0000000..45a4fb7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/compat @@ -0,0 +1 @@ +8 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control new file mode 100644 index 0000000..32fb2f1 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control @@ -0,0 +1,17 @@ +Source: mlnx-nvme +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, dkms +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: mlnx-nvme-dkms +Section: kernel +Architecture: all +Depends: dkms, make, mlnx-ofed-kernel-dkms, ${misc:Depends} +Recommends: linux-headers-arm64 | linux-headers-powerpc | linux-headers-ppc64 | linux-headers-ppc64le | linux-headers-amd64 | linux-headers | linux-headers-generic +Description: DKMS support for nvme kernel module + This package provides integration with the DKMS infrastructure for automatically building out of tree kernel modules. + . + This package contains the source to be built with dkms. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control.no_dkms b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control.no_dkms new file mode 100644 index 0000000..8949031 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/control.no_dkms @@ -0,0 +1,14 @@ +Source: mlnx-nvme +Section: kernel +Priority: optional +Maintainer: Mellanox Technologies +Build-Depends: debhelper (>= 8.0.0), autotools-dev, bzip2, make +Standards-Version: 1.0 +Homepage: http://www.mellanox.com + +Package: mlnx-nvme-modules +Section: kernel +Architecture: any +Depends: mlnx-ofed-kernel-modules, ${misc:Depends} +Description: nvme kernel module + This package provides the binary code for the nvme-rdma kernel module. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/copyright b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/copyright new file mode 100644 index 0000000..53aa878 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/copyright @@ -0,0 +1,19 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ + +Files: * +Copyright: Copyright 2017 Mellanox Technologies +License: GPL-2 + Mellanox OFED (MLNX_OFED) Software distributed under the terms of the GNU General Public License ("GPL") version 2 as published by the Free Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.postinst b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.postinst new file mode 100755 index 0000000..a24dd4a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.postinst @@ -0,0 +1,43 @@ +#!/bin/sh +set -e + +# Get the package version +NAME=mlnx-nvme +PACKAGE_NAME=$NAME-dkms +CVERSION=`dpkg-query -W -f='${Version}' $PACKAGE_NAME | awk -F "-" '{print $1}' | cut -d\: -f2` +ARCH=`uname -m` + +dkms_configure () { + POSTINST="/usr/src/$NAME-$CVERSION/common.postinst" + if [ -f "$POSTINST" ]; then + "$POSTINST" "$NAME" "$CVERSION" "/usr/share/$PACKAGE_NAME" "$ARCH" "$2" + return $? + fi + echo "WARNING: $POSTINST does not exist." >&2 + echo "ERROR: DKMS version is too old and $PACKAGE_NAME was not" >&2 + echo "built with legacy DKMS support." >&2 + echo "You must either rebuild $PACKAGE_NAME with legacy postinst" >&2 + echo "support or upgrade DKMS to a more current version." >&2 + return 1 +} + +case "$1" in + configure) + dkms_configure + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.prerm b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.prerm new file mode 100755 index 0000000..72b90f5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/mlnx-nvme-dkms.prerm @@ -0,0 +1,11 @@ +#!/bin/sh +set -e + +# Get the package version +package=mlnx-nvme +version=`dpkg-query -W -f='${Version}' "$package-dkms" \ + | sed -e 's/[+-].*//'` + +dkms remove -m "$package" -v "$version" --all || true + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/rules b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/rules new file mode 100755 index 0000000..ea71491 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/rules @@ -0,0 +1,109 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. +# +# This version is for a hypothetical package that can build a kernel modules +# architecture-dependant package via make-kpkg, as well as an +# architecture-independent module source package, and other packages +# either dep/indep for things like common files or userspace components +# needed for the kernel modules. + +# Uncomment this to turn on verbose mode. 
+#export DH_VERBOSE=1 + +WITH_DKMS ?= 1 +WITH_MOD_SIGN ?= 0 +MLXNUMC = $(shell grep ^processor /proc/cpuinfo | wc -l) +NJOBS ?= $(shell if [ $(MLXNUMC) -lt 16 ]; then echo $(MLXNUMC); else echo 16; fi) + +pname:=mlnx-nvme +psource:=$(pname)-source +ifeq ($(WITH_DKMS),1) +pdkms:=$(pname)-dkms +else +pdkms:=$(pname)-modules +endif + +pversion := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-.\+/\1/p') +prel := $(shell dpkg-parsechangelog | sed -n 's/^Version: *\([^-]\+\)-\(.\+\)/\2/p') + +export INSTALL_MOD_DIR:=updates +export INSTALL_MOD_PATH:=$(CURDIR)/debian/$(pdkms) + +DIST_NAME := $(shell lsb_release -si) +DIST_RELEASE := $(DIST_NAME)/$(shell lsb_release -sc) + + +KVER ?= $(shell uname -r) +KVER1 = $(shell echo $(KVER) | sed -e 's/_/-/g') +K_BUILD ?= "/lib/modules/$(KVER)/build" + +%: +ifeq ($(WITH_DKMS),1) + dh $@ --with dkms +else + dh $@ +endif + +override_dh_auto_clean: + +override_dh_auto_configure: + +override_dh_auto_build: +ifneq ($(WITH_DKMS),1) + @echo Building for $(KVER) + make clean || true + make -j$(NJOBS) KVER=$(KVER) K_BUILD=$(K_BUILD) CONFIG_NVME_HOST_WITHOUT_FC= +endif + +override_dh_auto_test: + +override_dh_auto_install: +ifneq ($(WITH_DKMS),1) + make install INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) KERNELRELEASE=$(KVER) KVER=$(KVER) K_BUILD=$(K_BUILD) CONFIG_NVME_HOST_WITHOUT_FC= + find $(INSTALL_MOD_PATH) \( -type f -a -name "modules.*" \) -delete +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif + + # For dkms +ifeq ($(WITH_DKMS),1) + dh_installdirs -p$(pdkms) usr/src/$(pname)-$(pversion) + cp makefile debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp Makefile debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp dkms.conf debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp common.postinst debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a target debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a host debian/$(pdkms)/usr/src/$(pname)-$(pversion) + cp -a lpfc debian/$(pdkms)/usr/src/$(pname)-$(pversion) + + # Force DKMS to install our modules. + # This is mostly needed for modules that do not have a version number info, as DKMS + # will compare their srcversion field, which does not really say which module is newer. 
+ dh_installdirs -p$(pdkms) usr/share/dkms/modules_to_force_install/ + echo "$(pname)" > debian/$(pdkms)/usr/share/dkms/modules_to_force_install/$(pname).force +endif + +override_dh_installinit: + + +ifneq ($(WITH_DKMS),1) +override_dh_gencontrol: + dh_gencontrol -- -v$(pversion)-$(prel).kver.$(KVER1) +endif + +ifneq ($(MLNX_KO_NO_STRIP),1) +ifneq ($(WITH_DKMS),1) +override_dh_strip: + dh_strip + find debian -name '*.ko' | xargs strip -g +ifeq ($(WITH_MOD_SIGN),1) + tools/sign-modules $(INSTALL_MOD_PATH)/lib/modules/ $(K_BUILD) +endif +endif +endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/source/format b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/dkms.conf b/src/mlnx-ofa_kernel-5.8/drivers/nvme/dkms.conf new file mode 100644 index 0000000..0eddb77 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/dkms.conf @@ -0,0 +1,62 @@ +# DKMS module name and version +PACKAGE_NAME="mlnx-nvme" +PACKAGE_VERSION="4.0" + +kernelver=${kernelver:-$(uname -r)} +kernel_source_dir=${kernel_source_dir:-/lib/modules/$kernelver/build} + +# Module name, source and destination directories, and build command-line +# +# host +# +BUILT_MODULE_NAME[0]="nvme-core" +BUILT_MODULE_LOCATION[0]="./host/" +DEST_MODULE_LOCATION[0]="/kernel/../updates/" + +BUILT_MODULE_NAME[1]="nvme" +BUILT_MODULE_LOCATION[1]="./host/" +DEST_MODULE_LOCATION[1]="/kernel/../updates/" + +BUILT_MODULE_NAME[2]="nvme-fabrics" +BUILT_MODULE_LOCATION[2]="./host/" +DEST_MODULE_LOCATION[2]="/kernel/../updates/" + +BUILT_MODULE_NAME[3]="nvme-rdma" +BUILT_MODULE_LOCATION[3]="./host/" +DEST_MODULE_LOCATION[3]="/kernel/../updates/" + +BUILT_MODULE_NAME[4]="nvme-fc" +BUILT_MODULE_LOCATION[4]="./host/" +DEST_MODULE_LOCATION[4]="/kernel/../updates/" + +# +# target +# +BUILT_MODULE_NAME[5]="nvmet" +BUILT_MODULE_LOCATION[5]="./target/" +DEST_MODULE_LOCATION[5]="/kernel/../updates/" + +BUILT_MODULE_NAME[6]="nvme-loop" +BUILT_MODULE_LOCATION[6]="./target/" +DEST_MODULE_LOCATION[6]="/kernel/../updates/" + +BUILT_MODULE_NAME[7]="nvmet-rdma" +BUILT_MODULE_LOCATION[7]="./target/" +DEST_MODULE_LOCATION[7]="/kernel/../updates/" + +BUILT_MODULE_NAME[8]="nvmet-fc" +BUILT_MODULE_LOCATION[8]="./target/" +DEST_MODULE_LOCATION[8]="/kernel/../updates/" + +BUILT_MODULE_NAME[9]="nvme-fcloop" +BUILT_MODULE_LOCATION[9]="./target/" +DEST_MODULE_LOCATION[9]="/kernel/../updates/" + +MAKE="make -j`MLXNUMC=$(grep ^processor /proc/cpuinfo | wc -l) && echo $(($MLXNUMC<16?$MLXNUMC:16))` KVER=$kernelver K_BUILD=$kernel_source_dir CONFIG_NVME_HOST_WITHOUT_FC=" + +# Cleanup command-line +CLEAN="make clean" + +# disable autoinstall since this module depends on mlnx-ofed-kernel-dkms +# mlnx-ofed-kernel-dkms will build this module on POST_INSTALL +AUTOINSTALL= diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Kconfig new file mode 100644 index 0000000..dc0450c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Kconfig @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: GPL-2.0-only +config NVME_CORE + tristate + select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY + +config BLK_DEV_NVME + tristate "NVM Express block device" + depends on PCI && BLOCK + select NVME_CORE + help + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. 
If you know you + don't have one of these, it is safe to answer N. + + To compile this driver as a module, choose M here: the + module will be called nvme. + +config NVME_MULTIPATH + bool "NVMe multipath support" + depends on NVME_CORE + help + This option enables support for multipath access to NVMe + subsystems. If this option is enabled only a single + /dev/nvmeXnY device will show up for each NVMe namespace, + even if it is accessible through multiple controllers. + +config NVME_HWMON + bool "NVMe hardware monitoring" + depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON) + help + This provides support for NVMe hardware monitoring. If enabled, + a hardware monitoring device will be created for each NVMe drive + in the system. + +config NVME_FABRICS + select NVME_CORE + tristate + +config NVME_RDMA + tristate "NVM Express over Fabrics RDMA host driver" + depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK + select NVME_FABRICS + select SG_POOL + help + This provides support for the NVMe over Fabrics protocol using + the RDMA (Infiniband, RoCE, iWarp) transport. This allows you + to use remote block devices exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. + +config NVME_FC + tristate "NVM Express over Fabrics FC host driver" + depends on BLOCK + depends on HAS_DMA + select NVME_FABRICS + select SG_POOL + help + This provides support for the NVMe over Fabrics protocol using + the FC transport. This allows you to use remote block devices + exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. + +config NVME_TCP + tristate "NVM Express over Fabrics TCP host driver" + depends on INET + depends on BLOCK + select NVME_FABRICS + select CRYPTO + select CRYPTO_CRC32C + help + This provides support for the NVMe over Fabrics protocol using + the TCP transport. This allows you to use remote block devices + exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. 
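Editorial note, not part of the vendor patch: the Kconfig help above defers NVMe over Fabrics setup to the nvme-cli tool. A minimal usage sketch follows, assuming an RDMA-capable path to a remote target; the IP address, port, and subsystem NQN are placeholder values, not taken from this source tree.

    # Discover subsystems exposed by a remote NVMe-oF target over RDMA
    nvme discover -t rdma -a 192.168.1.10 -s 4420
    # Connect to one subsystem by NQN; a /dev/nvmeXnY block device appears on success
    nvme connect -t rdma -n nqn.2016-06.io.example:subsys1 -a 192.168.1.10 -s 4420
    # Inspect the attached controllers, then tear down the association when finished
    nvme list
    nvme disconnect -n nqn.2016-06.io.example:subsys1

The kernel side of this flow is provided by the modules selected earlier in this patch's makefile (nvme-fabrics and nvme-rdma built as modules via MY_NVMF_KCONFIGS).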
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Makefile new file mode 100644 index 0000000..3340ca2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/Makefile @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I$(src) +ifeq ($(CONFIG_GPU_DIRECT_STORAGE),y) +ccflags-y += -DCONFIG_NVFS +endif + +obj-$(CONFIG_NVME_CORE) += nvme-core.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o +obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o +obj-$(CONFIG_NVME_FC) += nvme-fc.o +obj-$(CONFIG_NVME_TCP) += nvme-tcp.o +ifeq ($(CONFIG_BLK_DEV_NVME),m) +obj-$(CONFIG_COMPAT_NVME_SNAP_VFIO_PCI) += nvme_snap_vfio_pci.o +endif + +obj-$(CONFIG_NVME_HOST_DUMMY) += nvme-rdma.o + +# --with-nvmf-host-rdma-only +# dummy +obj-$(CONFIG_NVME_HOST_WITHOUT_FC) += nvme-fc.o + +ifeq ($(CONFIG_NVME_HOST_DUMMY),m) +nvme-rdma-y := nvme-rdma_dummy.o +else +nvme-core-y := core.o ioctl.o +nvme-core-$(CONFIG_TRACING) += trace.o +nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o +nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o +nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o + +nvme-y += pci.o +ifeq ($(CONFIG_GPU_DIRECT_STORAGE),y) +nvme-y += nvfs-dma.o +endif +ifeq ($(CONFIG_COMPAT_NVME_SNAP_VFIO_PCI),m) +nvme-y += passthru.o +endif + +nvme-fabrics-y += fabrics.o + +nvme-rdma-y += rdma.o +ifeq ($(CONFIG_GPU_DIRECT_STORAGE),y) +nvme-rdma-y += nvfs-rdma.o +endif + +ifeq ($(CONFIG_NVME_HOST_WITHOUT_FC),m) +nvme-fc-y += nvme-fc_dummy.o +else +nvme-fc-y += fc.o +endif + +nvme-tcp-y += tcp.o + +endif + +# these 2 drivers depend on nvme-fc, so replace them with dummy drivers +ifneq ($(filter m,$(CONFIG_NVME_FC) $(CONFIG_NVME_HOST_WITHOUT_FC)),) +obj-$(CONFIG_SCSI_QLA_FC) += qla2xxx.o +qla2xxx-y := qla2xxx_dummy.o + +obj-$(CONFIG_TCM_QLA2XXX) += tcm_qla2xxx.o +tcm_qla2xxx-y := tcm_qla2xxx_dummy.o +endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/core.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/core.c new file mode 100644 index 0000000..b540b33 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/core.c @@ -0,0 +1,4891 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#define NVME_MINORS (1U << MINORBITS) + +unsigned int admin_timeout = 60; +module_param(admin_timeout, uint, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); +EXPORT_SYMBOL_GPL(admin_timeout); + +unsigned int nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, uint, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); +EXPORT_SYMBOL_GPL(nvme_io_timeout); + +static unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + +static u8 nvme_max_retries = 5; +module_param_named(max_retries, nvme_max_retries, byte, 0644); +MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); + +static unsigned long default_ps_max_latency_us = 100000; +module_param(default_ps_max_latency_us, ulong, 0644); +MODULE_PARM_DESC(default_ps_max_latency_us, + "max power saving latency for new devices; use PM QOS to change per device"); + +static bool force_apst; +module_param(force_apst, bool, 0644); +MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); + +static unsigned long apst_primary_timeout_ms = 100; +module_param(apst_primary_timeout_ms, ulong, 0644); +MODULE_PARM_DESC(apst_primary_timeout_ms, + "primary APST timeout in ms"); + +static unsigned long apst_secondary_timeout_ms = 2000; +module_param(apst_secondary_timeout_ms, ulong, 0644); +MODULE_PARM_DESC(apst_secondary_timeout_ms, + "secondary APST timeout in ms"); + +static unsigned long apst_primary_latency_tol_us = 15000; +module_param(apst_primary_latency_tol_us, ulong, 0644); +MODULE_PARM_DESC(apst_primary_latency_tol_us, + "primary APST latency tolerance in us"); + +static unsigned long apst_secondary_latency_tol_us = 100000; +module_param(apst_secondary_latency_tol_us, ulong, 0644); +MODULE_PARM_DESC(apst_secondary_latency_tol_us, + "secondary APST latency tolerance in us"); + +static bool streams; +module_param(streams, bool, 0644); +MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); + +/* + * nvme_wq - hosts nvme related works that are not reset or delete + * nvme_reset_wq - hosts nvme reset works + * nvme_delete_wq - hosts nvme delete works + * + * nvme_wq will host works such as scan, aen handling, fw activation, + * keep-alive, periodic reconnects etc. nvme_reset_wq + * runs reset works which also flush works hosted on nvme_wq for + * serialization purposes. nvme_delete_wq host controller deletion + * works which flush reset works for serialization. 
+ */ +struct workqueue_struct *nvme_wq; +EXPORT_SYMBOL_GPL(nvme_wq); + +struct workqueue_struct *nvme_reset_wq; +EXPORT_SYMBOL_GPL(nvme_reset_wq); + +struct workqueue_struct *nvme_delete_wq; +EXPORT_SYMBOL_GPL(nvme_delete_wq); + +static LIST_HEAD(nvme_subsystems); +static DEFINE_MUTEX(nvme_subsystems_lock); + +static DEFINE_IDA(nvme_instance_ida); +static dev_t nvme_ctrl_base_chr_devt; +static struct class *nvme_class; +static struct class *nvme_subsys_class; + +static DEFINE_IDA(nvme_ns_chr_minor_ida); +static dev_t nvme_ns_chr_devt; +static struct class *nvme_ns_chr_class; + +static void nvme_put_subsystem(struct nvme_subsystem *subsys); +static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, + unsigned nsid); +static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, + struct nvme_command *cmd); + +void nvme_queue_scan(struct nvme_ctrl *ctrl) +{ + /* + * Only new queue scan work when admin and IO queues are both alive + */ + if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) + queue_work(nvme_wq, &ctrl->scan_work); +} + +/* + * Use this function to proceed with scheduling reset_work for a controller + * that had previously been set to the resetting state. This is intended for + * code paths that can't be interrupted by other reset attempts. A hot removal + * may prevent this from succeeding. + */ +int nvme_try_sched_reset(struct nvme_ctrl *ctrl) +{ + if (ctrl->state != NVME_CTRL_RESETTING) + return -EBUSY; + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_try_sched_reset); + +static void nvme_failfast_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, failfast_work); + + if (ctrl->state != NVME_CTRL_CONNECTING) + return; + + set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); + dev_info(ctrl->device, "failfast expired\n"); + nvme_kick_requeue_lists(ctrl); +} + +static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1) + return; + + schedule_delayed_work(&ctrl->failfast_work, + ctrl->opts->fast_io_fail_tmo * HZ); +} + +static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts) + return; + + cancel_delayed_work_sync(&ctrl->failfast_work); + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); +} + + +int nvme_reset_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_reset_ctrl); + +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_reset_ctrl(ctrl); + if (!ret) { + flush_work(&ctrl->reset_work); + if (ctrl->state != NVME_CTRL_LIVE) + ret = -ENETRESET; + } + + return ret; +} + +static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) +{ + dev_info(ctrl->device, + "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl)); + + flush_work(&ctrl->reset_work); + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); + ctrl->ops->delete_ctrl(ctrl); + nvme_uninit_ctrl(ctrl); +} + +static void nvme_delete_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, delete_work); + + nvme_do_delete_ctrl(ctrl); +} + +int nvme_delete_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; +} 
+EXPORT_SYMBOL_GPL(nvme_delete_ctrl); + +static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +{ + /* + * Keep a reference until nvme_do_delete_ctrl() complete, + * since ->delete_ctrl can free the controller. + */ + nvme_get_ctrl(ctrl); + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + nvme_do_delete_ctrl(ctrl); + nvme_put_ctrl(ctrl); +} + +static blk_status_t nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return BLK_STS_OK; + case NVME_SC_CAP_EXCEEDED: + return BLK_STS_NOSPC; + case NVME_SC_LBA_RANGE: + case NVME_SC_CMD_INTERRUPTED: + case NVME_SC_NS_NOT_READY: + return BLK_STS_TARGET; + case NVME_SC_BAD_ATTRIBUTES: + case NVME_SC_ONCS_NOT_SUPPORTED: + case NVME_SC_INVALID_OPCODE: + case NVME_SC_INVALID_FIELD: + case NVME_SC_INVALID_NS: + return BLK_STS_NOTSUPP; + case NVME_SC_WRITE_FAULT: + case NVME_SC_READ_ERROR: + case NVME_SC_UNWRITTEN_BLOCK: + case NVME_SC_ACCESS_DENIED: + case NVME_SC_READ_ONLY: + case NVME_SC_COMPARE_FAILED: + return BLK_STS_MEDIUM; + case NVME_SC_GUARD_CHECK: + case NVME_SC_APPTAG_CHECK: + case NVME_SC_REFTAG_CHECK: + case NVME_SC_INVALID_PI: + return BLK_STS_PROTECTION; + case NVME_SC_RESERVATION_CONFLICT: + return BLK_STS_NEXUS; + case NVME_SC_HOST_PATH_ERROR: + return BLK_STS_TRANSPORT; + case NVME_SC_ZONE_TOO_MANY_ACTIVE: + return BLK_STS_ZONE_ACTIVE_RESOURCE; + case NVME_SC_ZONE_TOO_MANY_OPEN: + return BLK_STS_ZONE_OPEN_RESOURCE; + default: + return BLK_STS_IOERR; + } +} + +static void nvme_retry_req(struct request *req) +{ + unsigned long delay = 0; + u16 crd; + + /* The mask and shift result must be <= 3 */ + crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; + if (crd) + delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100; + + nvme_req(req)->retries++; + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, delay); +} + +enum nvme_disposition { + COMPLETE, + RETRY, + FAILOVER, +}; + +static inline enum nvme_disposition nvme_decide_disposition(struct request *req) +{ + if (likely(nvme_req(req)->status == 0)) + return COMPLETE; + + if (blk_noretry_request(req) || + (nvme_req(req)->status & NVME_SC_DNR) || + nvme_req(req)->retries >= nvme_max_retries) + return COMPLETE; + + if (req->cmd_flags & REQ_NVME_MPATH) { + if (nvme_is_path_error(nvme_req(req)->status) || + blk_queue_dying(req->q)) + return FAILOVER; + } else { + if (blk_queue_dying(req->q)) + return COMPLETE; + } + + return RETRY; +} + +static inline void nvme_end_req_zoned(struct request *req) +{ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); +} + +static inline void nvme_end_req(struct request *req) +{ + blk_status_t status = nvme_error_status(nvme_req(req)->status); + + nvme_end_req_zoned(req); + nvme_trace_bio_complete(req); + blk_mq_end_request(req, status); +} + +void nvme_complete_rq(struct request *req) +{ + trace_nvme_complete_rq(req); + nvme_cleanup_cmd(req); + + if (nvme_req(req)->ctrl->kas) + nvme_req(req)->ctrl->comp_seen = true; + + switch (nvme_decide_disposition(req)) { + case COMPLETE: + nvme_end_req(req); + return; + case RETRY: + nvme_retry_req(req); + return; + case FAILOVER: + nvme_failover_req(req); + return; + } +} +EXPORT_SYMBOL_GPL(nvme_complete_rq); + +void nvme_complete_batch_req(struct request *req) +{ + trace_nvme_complete_rq(req); + nvme_cleanup_cmd(req); + nvme_end_req_zoned(req); +} +EXPORT_SYMBOL_GPL(nvme_complete_batch_req); + +/* + * Called to unwind from ->queue_rq 
on a failed command submission so that the + * multipathing code gets called to potentially failover to another path. + * The caller needs to unwind all transport specific resource allocations and + * must return propagate the return value. + */ +blk_status_t nvme_host_path_error(struct request *req) +{ + nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; + blk_mq_set_request_complete(req); + nvme_complete_rq(req); + return BLK_STS_OK; +} +EXPORT_SYMBOL_GPL(nvme_host_path_error); + +bool nvme_cancel_request(struct request *req, void *data, bool reserved) +{ + dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, + "Cancelling I/O %d", req->tag); + + /* don't abort one completed request */ + if (blk_mq_request_completed(req)) + return true; + + nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; + nvme_req(req)->flags |= NVME_REQ_CANCELLED; + blk_mq_complete_request(req); + return true; +} +EXPORT_SYMBOL_GPL(nvme_cancel_request); + +void nvme_cancel_tagset(struct nvme_ctrl *ctrl) +{ + if (ctrl->tagset) { + blk_mq_tagset_busy_iter(ctrl->tagset, + nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->tagset); + } +} +EXPORT_SYMBOL_GPL(nvme_cancel_tagset); + +void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl) +{ + if (ctrl->admin_tagset) { + blk_mq_tagset_busy_iter(ctrl->admin_tagset, + nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); + } +} +EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset); + +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state) +{ + enum nvme_ctrl_state old_state; + unsigned long flags; + bool changed = false; + + spin_lock_irqsave(&ctrl->lock, flags); + + old_state = ctrl->state; + switch (new_state) { + case NVME_CTRL_LIVE: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + changed = true; + fallthrough; + default: + break; + } + break; + case NVME_CTRL_RESETTING: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + changed = true; + fallthrough; + default: + break; + } + break; + case NVME_CTRL_CONNECTING: + switch (old_state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RESETTING: + changed = true; + fallthrough; + default: + break; + } + break; + case NVME_CTRL_DELETING: + switch (old_state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + changed = true; + fallthrough; + default: + break; + } + break; + case NVME_CTRL_DELETING_NOIO: + switch (old_state) { + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + changed = true; + fallthrough; + default: + break; + } + break; + case NVME_CTRL_DEAD: + switch (old_state) { + case NVME_CTRL_DELETING: + changed = true; + fallthrough; + default: + break; + } + break; + default: + break; + } + + if (changed) { + ctrl->state = new_state; + wake_up_all(&ctrl->state_wq); + } + + spin_unlock_irqrestore(&ctrl->lock, flags); + if (!changed) + return false; + + if (ctrl->state == NVME_CTRL_LIVE) { + if (old_state == NVME_CTRL_CONNECTING) + nvme_stop_failfast_work(ctrl); + nvme_kick_requeue_lists(ctrl); + } else if (ctrl->state == NVME_CTRL_CONNECTING && + old_state == NVME_CTRL_RESETTING) { + nvme_start_failfast_work(ctrl); + } + return changed; +} +EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); + +/* + * Returns true for sink states that can't ever transition back to live. 
+ */ +static bool nvme_state_terminal(struct nvme_ctrl *ctrl) +{ + switch (ctrl->state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + return false; + case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: + case NVME_CTRL_DEAD: + return true; + default: + WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state); + return true; + } +} + +/* + * Waits for the controller state to be resetting, or returns false if it is + * not possible to ever transition to that state. + */ +bool nvme_wait_reset(struct nvme_ctrl *ctrl) +{ + wait_event(ctrl->state_wq, + nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || + nvme_state_terminal(ctrl)); + return ctrl->state == NVME_CTRL_RESETTING; +} +EXPORT_SYMBOL_GPL(nvme_wait_reset); + +static void nvme_free_ns_head(struct kref *ref) +{ + struct nvme_ns_head *head = + container_of(ref, struct nvme_ns_head, ref); + + nvme_mpath_remove_disk(head); + ida_simple_remove(&head->subsys->ns_ida, head->instance); + cleanup_srcu_struct(&head->srcu); + nvme_put_subsystem(head->subsys); + kfree(head); +} + +bool nvme_tryget_ns_head(struct nvme_ns_head *head) +{ + return kref_get_unless_zero(&head->ref); +} + +void nvme_put_ns_head(struct nvme_ns_head *head) +{ + kref_put(&head->ref, nvme_free_ns_head); +} + +static void nvme_free_ns(struct kref *kref) +{ + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + + put_disk(ns->disk); + nvme_put_ns_head(ns->head); + nvme_put_ctrl(ns->ctrl); + kfree(ns); +} + +static inline bool nvme_get_ns(struct nvme_ns *ns) +{ + return kref_get_unless_zero(&ns->kref); +} + +void nvme_put_ns(struct nvme_ns *ns) +{ + kref_put(&ns->kref, nvme_free_ns); +} +EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); + +static inline void nvme_clear_nvme_request(struct request *req) +{ + nvme_req(req)->status = 0; + nvme_req(req)->retries = 0; + nvme_req(req)->flags = 0; + req->rq_flags |= RQF_DONTPREP; +} + +static inline unsigned int nvme_req_op(struct nvme_command *cmd) +{ + return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; +} + +static inline void nvme_init_request(struct request *req, + struct nvme_command *cmd) +{ + if (req->q->queuedata) + req->timeout = NVME_IO_TIMEOUT; + else /* no queuedata implies admin queue */ + req->timeout = NVME_ADMIN_TIMEOUT; + + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= ~NVME_CMD_SGL_ALL; + + req->cmd_flags |= REQ_FAILFAST_DRIVER; + if (req->mq_hctx->type == HCTX_TYPE_POLL) + req->cmd_flags |= REQ_POLLED; + nvme_clear_nvme_request(req); + memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); +} + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags) +{ + struct request *req; + + req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); + return req; +} +EXPORT_SYMBOL_GPL(nvme_alloc_request); + +static struct request *nvme_alloc_request_qid(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +{ + struct request *req; + + req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, + qid ? qid - 1 : 0); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); + return req; +} + +/* + * For something we're not in a state to send to the device the default action + * is to busy it and retry it after the controller state is recovered. However, + * if the controller is deleting or if anything is marked for failfast or + * nvme multipath it is immediately failed. 
+ * + * Note: commands used to initialize the controller will be marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ +blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *rq) +{ + if (ctrl->state != NVME_CTRL_DELETING_NOIO && + ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DEAD && + !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && + !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + return nvme_host_path_error(rq); +} +EXPORT_SYMBOL_GPL(nvme_fail_nonready_command); + +bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + struct nvme_request *req = nvme_req(rq); + + /* + * currently we have a problem sending passthru commands + * on the admin_q if the controller is not LIVE because we can't + * make sure that they are going out after the admin connect, + * controller enable and/or other commands in the initialization + * sequence. until the controller will be LIVE, fail with + * BLK_STS_RESOURCE so that they will be rescheduled. + */ + if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD)) + return false; + + if (ctrl->ops->flags & NVME_F_FABRICS) { + /* + * Only allow commands on a live queue, except for the connect + * command, which is require to set the queue live in the + * appropinquate states. + */ + switch (ctrl->state) { + case NVME_CTRL_CONNECTING: + if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && + req->cmd->fabrics.fctype == nvme_fabrics_type_connect) + return true; + break; + default: + break; + case NVME_CTRL_DEAD: + return false; + } + } + + return queue_live; +} +EXPORT_SYMBOL_GPL(__nvme_check_ready); + +static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) +{ + struct nvme_command c = { }; + + c.directive.opcode = nvme_admin_directive_send; + c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); + c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; + c.directive.dtype = NVME_DIR_IDENTIFY; + c.directive.tdtype = NVME_DIR_STREAMS; + c.directive.endir = enable ? 
NVME_DIR_ENDIR : 0; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); +} + +static int nvme_disable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, false); +} + +static int nvme_enable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, true); +} + +static int nvme_get_stream_params(struct nvme_ctrl *ctrl, + struct streams_directive_params *s, u32 nsid) +{ + struct nvme_command c = { }; + + memset(s, 0, sizeof(*s)); + + c.directive.opcode = nvme_admin_directive_recv; + c.directive.nsid = cpu_to_le32(nsid); + c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s))); + c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; + c.directive.dtype = NVME_DIR_STREAMS; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); +} + +static int nvme_configure_directives(struct nvme_ctrl *ctrl) +{ + struct streams_directive_params s; + int ret; + + if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) + return 0; + if (!streams) + return 0; + + ret = nvme_enable_streams(ctrl); + if (ret) + return ret; + + ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); + if (ret) + goto out_disable_stream; + + ctrl->nssa = le16_to_cpu(s.nssa); + if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { + dev_info(ctrl->device, "too few streams (%u) available\n", + ctrl->nssa); + goto out_disable_stream; + } + + ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); + return 0; + +out_disable_stream: + nvme_disable_streams(ctrl); + return ret; +} + +/* + * Check if 'req' has a write hint associated with it. If it does, assign + * a valid namespace stream to the write. + */ +static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, + struct request *req, u16 *control, + u32 *dsmgmt) +{ + enum rw_hint streamid = req->write_hint; + + if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) + streamid = 0; + else { + streamid--; + if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) + return; + + *control |= NVME_RW_DTYPE_STREAMS; + *dsmgmt |= streamid << 16; + } + + if (streamid < ARRAY_SIZE(req->q->write_hints)) + req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; +} + +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); +} + +static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; + struct nvme_dsm_range *range; + struct bio *bio; + + /* + * Some devices do not consider the DSM 'Number of Ranges' field when + * determining how much data to DMA. Always allocate memory for maximum + * number of segments to prevent device reading beyond end of buffer. + */ + static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES; + + range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN); + if (!range) { + /* + * If we fail allocation our range, fallback to the controller + * discard page. If that's also busy, it's safe to return + * busy, as we know we can make progress once that's freed. 
+ */ + if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) + return BLK_STS_RESOURCE; + + range = page_address(ns->ctrl->discard_page); + } + + __rq_for_each_bio(bio, req) { + u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); + u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; + + if (n < segments) { + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + } + n++; + } + + if (WARN_ON_ONCE(n != segments)) { + if (virt_to_page(range) == ns->ctrl->discard_page) + clear_bit_unlock(0, &ns->ctrl->discard_page_busy); + else + kfree(range); + return BLK_STS_IOERR; + } + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->dsm.nr = cpu_to_le32(segments - 1); + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + req->special_vec.bv_page = virt_to_page(range); + req->special_vec.bv_offset = offset_in_page(range); + req->special_vec.bv_len = alloc_size; + req->rq_flags |= RQF_SPECIAL_PAYLOAD; + + return BLK_STS_OK; +} + +static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + return nvme_setup_discard(ns, req, cmnd); + + cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; + cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->write_zeroes.slba = + cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + cmnd->write_zeroes.length = + cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (nvme_ns_has_pi(ns)) { + cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT); + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + cmnd->write_zeroes.reftag = + cpu_to_le32(t10_pi_ref_tag(req)); + break; + } + } + + return BLK_STS_OK; +} + +static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + cmnd->rw.opcode = op; + cmnd->rw.flags = 0; + cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); + cmnd->rw.rsvd2 = 0; + cmnd->rw.metadata = 0; + cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + cmnd->rw.reftag = 0; + cmnd->rw.apptag = 0; + cmnd->rw.appmask = 0; + + if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) + nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); + + if (ns->ms) { + /* + * If formated with metadata, the block layer always provides a + * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else + * we enable the PRACT bit for protection information or set the + * namespace capacity to zero to prevent any I/O. 
+ */ + if (!blk_integrity_rq(req)) { + if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) + return BLK_STS_NOTSUPP; + control |= NVME_RW_PRINFO_PRACT; + } + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + if (op == nvme_cmd_zone_append) + control |= NVME_RW_APPEND_PIREMAP; + cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); + break; + } + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + return 0; +} + +void nvme_cleanup_cmd(struct request *req) +{ + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; + + if (req->special_vec.bv_page == ctrl->discard_page) + clear_bit_unlock(0, &ctrl->discard_page_busy); + else + kfree(bvec_virt(&req->special_vec)); + } +} +EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); + +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) +{ + struct nvme_command *cmd = nvme_req(req)->cmd; + blk_status_t ret = BLK_STS_OK; + + if (!(req->rq_flags & RQF_DONTPREP)) + nvme_clear_nvme_request(req); + + switch (req_op(req)) { + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + /* these are setup prior to execution in nvme_init_request() */ + break; + case REQ_OP_FLUSH: + nvme_setup_flush(ns, cmd); + break; + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); + break; + case REQ_OP_ZONE_OPEN: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); + break; + case REQ_OP_ZONE_FINISH: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; + case REQ_OP_WRITE_ZEROES: + ret = nvme_setup_write_zeroes(ns, req, cmd); + break; + case REQ_OP_DISCARD: + ret = nvme_setup_discard(ns, req, cmd); + break; + case REQ_OP_READ: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); + break; + case REQ_OP_WRITE: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); + break; + default: + WARN_ON_ONCE(1); + return BLK_STS_IOERR; + } + + cmd->common.command_id = nvme_cid(req); + trace_nvme_setup_cmd(req, cmd); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_setup_cmd); + +/* + * Return values: + * 0: success + * >0: nvme controller's cqe status response + * <0: kernel error in lieu of controller response + */ +static int nvme_execute_rq(struct gendisk *disk, struct request *rq, + bool at_head) +{ + blk_status_t status; + + status = blk_execute_rq(rq, at_head); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + return -EINTR; + if (nvme_req(rq)->status) + return nvme_req(rq)->status; + return blk_status_to_errno(status); +} + +/* + * Returns 0 on success. 
If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + union nvme_result *result, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) +{ + struct request *req; + int ret; + + if (qid == NVME_QID_ANY) + req = nvme_alloc_request(q, cmd, flags); + else + req = nvme_alloc_request_qid(q, cmd, flags, qid); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (timeout) + req->timeout = timeout; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + if (ret) + goto out; + } + + ret = nvme_execute_rq(NULL, req, at_head); + if (result && ret >= 0) + *result = nvme_req(req)->result; + out: + blk_mq_free_request(req); + return ret; +} +EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, + NVME_QID_ANY, 0, 0); +} +EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); + +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) + dev_warn_once(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); + + return effects; +} +EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = nvme_command_effects(ctrl, ns, opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & NVME_CMD_EFFECTS_CSE_MASK) { + mutex_lock(&ctrl->scan_lock); + mutex_lock(&ctrl->subsys->lock); + nvme_mpath_start_freeze(ctrl->subsys); + nvme_mpath_wait_freeze(ctrl->subsys); + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, + struct nvme_command *cmd, int status) +{ + if (effects & NVME_CMD_EFFECTS_CSE_MASK) { + nvme_unfreeze(ctrl); + nvme_mpath_unfreeze(ctrl->subsys); + mutex_unlock(&ctrl->subsys->lock); + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); + mutex_unlock(&ctrl->scan_lock); + } + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_ctrl_finish(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { + nvme_queue_scan(ctrl); + flush_work(&ctrl->scan_work); + } + + switch (cmd->common.opcode) { + case nvme_admin_set_features: + switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) { + case NVME_FEAT_KATO: + /* + * Keep alive commands interval on the host should be + * updated when KATO is modified by Set Features + * commands. 
+ */ + if (!status) + nvme_update_keep_alive(ctrl, cmd); + break; + default: + break; + } + break; + default: + break; + } +} + +int nvme_execute_passthru_rq(struct request *rq) +{ + struct nvme_command *cmd = nvme_req(rq)->cmd; + struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + u32 effects; + int ret; + + effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + ret = nvme_execute_rq(disk, rq, false); + if (effects) /* nothing to be done for zero cmd effects */ + nvme_passthru_end(ctrl, effects, cmd, ret); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); + +/* + * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: + * + * The host should send Keep Alive commands at half of the Keep Alive Timeout + * accounting for transport roundtrip times [..]. + */ +static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) +{ + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2); +} + +static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) +{ + struct nvme_ctrl *ctrl = rq->end_io_data; + unsigned long flags; + bool startka = false; + + blk_mq_free_request(rq); + + if (status) { + dev_err(ctrl->device, + "failed nvme_keep_alive_end_io error=%d\n", + status); + return; + } + + ctrl->comp_seen = false; + spin_lock_irqsave(&ctrl->lock, flags); + if (ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_CONNECTING) + startka = true; + spin_unlock_irqrestore(&ctrl->lock, flags); + if (startka) + nvme_queue_keep_alive_work(ctrl); +} + +static void nvme_keep_alive_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, ka_work); + bool comp_seen = ctrl->comp_seen; + struct request *rq; + + if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { + dev_dbg(ctrl->device, + "reschedule traffic based keep-alive timer\n"); + ctrl->comp_seen = false; + nvme_queue_keep_alive_work(ctrl); + return; + } + + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (IS_ERR(rq)) { + /* allocation failure, reset the controller */ + dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq)); + nvme_reset_ctrl(ctrl); + return; + } + + rq->timeout = ctrl->kato * HZ; + rq->end_io_data = ctrl; + blk_execute_rq_nowait(rq, false, nvme_keep_alive_end_io); +} + +static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + nvme_queue_keep_alive_work(ctrl); +} + +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + cancel_delayed_work_sync(&ctrl->ka_work); +} +EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); + +static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, + struct nvme_command *cmd) +{ + unsigned int new_kato = + DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000); + + dev_info(ctrl->device, + "keep alive interval updated from %u ms to %u ms\n", + ctrl->kato * 1000 / 2, new_kato * 1000 / 2); + + nvme_stop_keep_alive(ctrl); + ctrl->kato = new_kato; + nvme_start_keep_alive(ctrl); +} + +/* + * In NVMe 1.0 the CNS field was just a binary controller or namespace + * flag, thus sending any new CNS opcodes has a big chance of not working. + * Qemu unfortunately had that bug after reporting a 1.1 version compliance + * (but not for any later version). 
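+ *
+ * nvme_ctrl_limited_cns() below therefore treats a controller as unable
+ * to handle newer CNS values when it reports a version below 1.1, or
+ * below 1.2 when the NVME_QUIRK_IDENTIFY_CNS quirk is set.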
+ */ +static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl) +{ + if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) + return ctrl->vs < NVME_VS(1, 2, 0); + return ctrl->vs < NVME_VS(1, 1, 0); +} + +static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CTRL; + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, + struct nvme_ns_id_desc *cur, bool *csi_seen) +{ + const char *warn_str = "ctrl returned bogus length:"; + void *data = cur; + + switch (cur->nidt) { + case NVME_NIDT_EUI64: + if (cur->nidl != NVME_NIDT_EUI64_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n", + warn_str, cur->nidl); + return -1; + } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_EUI64_LEN; + memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); + return NVME_NIDT_EUI64_LEN; + case NVME_NIDT_NGUID: + if (cur->nidl != NVME_NIDT_NGUID_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n", + warn_str, cur->nidl); + return -1; + } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_NGUID_LEN; + memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); + return NVME_NIDT_NGUID_LEN; + case NVME_NIDT_UUID: + if (cur->nidl != NVME_NIDT_UUID_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n", + warn_str, cur->nidl); + return -1; + } + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) + return NVME_NIDT_UUID_LEN; + uuid_copy(&ids->uuid, data + sizeof(*cur)); + return NVME_NIDT_UUID_LEN; + case NVME_NIDT_CSI: + if (cur->nidl != NVME_NIDT_CSI_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", + warn_str, cur->nidl); + return -1; + } + memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); + *csi_seen = true; + return NVME_NIDT_CSI_LEN; + default: + /* Skip unknown types */ + return cur->nidl; + } +} + +static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids) +{ + struct nvme_command c = { }; + bool csi_seen = false; + int status, pos, len; + void *data; + + if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl)) + return 0; + if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) + return 0; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; + + data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, + NVME_IDENTIFY_DATA_SIZE); + if (status) { + dev_warn(ctrl->device, + "Identify Descriptors failed (nsid=%u, status=0x%x)\n", + nsid, status); + goto free_data; + } + + for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { + struct nvme_ns_id_desc *cur = data + pos; + + if (cur->nidl == 0) + break; + + len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); + if (len < 0) + break; + + len += sizeof(*cur); + } + + if (nvme_multi_css(ctrl) && !csi_seen) { + dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", + nsid); + status = -EINVAL; + } + +free_data: + kfree(data); + return status; +} + +static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids, 
struct nvme_id_ns **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS; + + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); + if (error) { + dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); + goto out_free_id; + } + + error = NVME_SC_INVALID_NS | NVME_SC_DNR; + if ((*id)->ncap == 0) /* namespace not allocated or attached */ + goto out_free_id; + + + if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) { + dev_info(ctrl->device, + "Ignoring bogus Namespace Identifiers\n"); + } else { + if (ctrl->vs >= NVME_VS(1, 1, 0) && + !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); + if (ctrl->vs >= NVME_VS(1, 2, 0) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); + } + + return 0; + +out_free_id: + kfree(*id); + return error; +} + +static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, u32 *result) +{ + union nvme_result res = { 0 }; + struct nvme_command c = { }; + int ret; + + c.features.opcode = op; + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, + buffer, buflen, 0, NVME_QID_ANY, 0, 0); + if (ret >= 0 && result) + *result = le32_to_cpu(res.u32); + return ret; +} + +int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result) +{ + return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, + buflen, result); +} +EXPORT_SYMBOL_GPL(nvme_set_features); + +int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result) +{ + return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer, + buflen, result); +} +EXPORT_SYMBOL_GPL(nvme_get_features); + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, + &result); + if (status < 0) + return status; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be only way to fix them up. 
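+ *
+ * The Number of Queues feature is 0's based: the request packs
+ * (*count - 1) into both NSQR (bits 15:0) and NCQR (bits 31:16) of
+ * dword11, and the completion result reports the allocated counts the
+ * same way. For example, asking for 8 queues sends 0x00070007; a
+ * controller granting only 4 returns 0x00030003, so nr_io_queues
+ * becomes min(3, 3) + 1 = 4 and *count is clamped to 4.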
+ */ + if (status > 0) { + dev_err(ctrl->device, "Could not set queue count (%d)\n", status); + *count = 0; + } else { + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_set_queue_count); + +#define NVME_AEN_SUPPORTED \ + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ + NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) + +static void nvme_enable_aen(struct nvme_ctrl *ctrl) +{ + u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED; + int status; + + if (!supported_aens) + return; + + status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens, + NULL, 0, &result); + if (status) + dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", + supported_aens); + + queue_work(nvme_wq, &ctrl->async_event_work); +} + +static int nvme_ns_open(struct nvme_ns *ns) +{ + + /* should never be called due to GENHD_FL_HIDDEN */ + if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head))) + goto fail; + if (!nvme_get_ns(ns)) + goto fail; + if (!try_module_get(ns->ctrl->ops->module)) + goto fail_put_ns; + + return 0; + +fail_put_ns: + nvme_put_ns(ns); +fail: + return -ENXIO; +} + +static void nvme_ns_release(struct nvme_ns *ns) +{ + + module_put(ns->ctrl->ops->module); + nvme_put_ns(ns); +} + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_ns_open(bdev->bd_disk->private_data); +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_ns_release(disk->private_data); +} + +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + return 0; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, + u32 max_integrity_segments) +{ + struct blk_integrity integrity = { }; + + switch (pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity.profile = &t10_pi_type3_crc; + integrity.tag_size = sizeof(u16) + sizeof(u32); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity.profile = &t10_pi_type1_crc; + integrity.tag_size = sizeof(u16); + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + break; + default: + integrity.profile = NULL; + break; + } + integrity.tuple_size = ms; + blk_integrity_register(disk, &integrity); + blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); +} +#else +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, + u32 max_integrity_segments) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + struct request_queue *queue = disk->queue; + u32 size = queue_logical_block_size(queue); + + if (ctrl->max_discard_sectors == 0) { + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); + return; + } + + if (ctrl->nr_streams && ns->sws && ns->sgs) + size *= ns->sws * ns->sgs; + + BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < + NVME_DSM_MAX_RANGES); + + queue->limits.discard_alignment = 0; + queue->limits.discard_granularity = size; + + /* If discard is already enabled, don't reset queue limits */ + if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) + return; + + blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); + blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); + + if (ctrl->quirks & 
NVME_QUIRK_DEALLOCATE_ZEROES) + blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); +} + +static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) +{ + return uuid_equal(&a->uuid, &b->uuid) && + memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && + a->csi == b->csi; +} + +static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u32 *phys_bs, u32 *io_opt) +{ + struct streams_directive_params s; + int ret; + + if (!ctrl->nr_streams) + return 0; + + ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); + if (ret) + return ret; + + ns->sws = le32_to_cpu(s.sws); + ns->sgs = le16_to_cpu(s.sgs); + + if (ns->sws) { + *phys_bs = ns->sws * (1 << ns->lba_shift); + if (ns->sgs) + *io_opt = *phys_bs * ns->sgs; + } + + return 0; +} + +static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + + /* + * The PI implementation requires the metadata size to be equal to the + * t10 pi tuple size. + */ + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + if (ns->ms == sizeof(struct t10_pi_tuple)) + ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + else + ns->pi_type = 0; + + ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); + if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) + return; + + if (ctrl->ops->flags & NVME_F_FABRICS) { + /* + * The NVMe over Fabrics specification only supports metadata as + * part of the extended data LBA. We rely on HCA/HBA support to + * remap the separate metadata buffer from the block layer. + */ + if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) + return; + + ns->features |= NVME_NS_EXT_LBAS; + + /* + * The current fabrics transport drivers support namespace + * metadata formats only if nvme_ns_has_pi() returns true. + * Suppress support for all other formats so the namespace will + * have a 0 capacity and not be usable through the block stack. + * + * Note, this check will need to be modified if any drivers + * gain the ability to use other metadata formats. + */ + if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns)) + ns->features |= NVME_NS_METADATA_SUPPORTED; + } else { + /* + * For PCIe controllers, we can't easily remap the separate + * metadata buffer from the block layer and thus require a + * separate metadata buffer for block layer metadata/PI support. + * We allow extended LBAs for the passthrough interface, though. 
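+ *
+ * In short: formats with interleaved (extended) LBAs are flagged
+ * NVME_NS_EXT_LBAS, while formats with a separate metadata buffer are
+ * flagged NVME_NS_METADATA_SUPPORTED and rely on block layer
+ * metadata/PI support for that buffer.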
+ */ + if (id->flbas & NVME_NS_FLBAS_META_EXT) + ns->features |= NVME_NS_EXT_LBAS; + else + ns->features |= NVME_NS_METADATA_SUPPORTED; + } +} + +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; + + if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; + + max_segments = min_not_zero(max_segments, ctrl->max_segments); + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); + } + blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); + blk_queue_dma_alignment(q, 3); + blk_queue_write_cache(q, vwc, vwc); +} + +static void nvme_update_disk_info(struct gendisk *disk, + struct nvme_ns *ns, struct nvme_id_ns *id) +{ + sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); + unsigned short bs = 1 << ns->lba_shift; + u32 atomic_bs, phys_bs, io_opt = 0; + + /* + * The block layer can't support LBA sizes larger than the page size + * yet, so catch this early and don't allow block I/O. + */ + if (ns->lba_shift > PAGE_SHIFT) { + capacity = 0; + bs = (1 << 9); + } + + blk_integrity_unregister(disk); + + atomic_bs = phys_bs = bs; + nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt); + if (id->nabo == 0) { + /* + * Bit 1 indicates whether NAWUPF is defined for this namespace + * and whether it should be used instead of AWUPF. If NAWUPF == + * 0 then AWUPF must be used instead. + */ + if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; + else + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + } + + if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { + /* NPWG = Namespace Preferred Write Granularity */ + phys_bs = bs * (1 + le16_to_cpu(id->npwg)); + /* NOWS = Namespace Optimal Write Size */ + io_opt = bs * (1 + le16_to_cpu(id->nows)); + } + + blk_queue_logical_block_size(disk->queue, bs); + /* + * Linux filesystems assume writing a single physical block is + * an atomic operation. Hence limit the physical block size to the + * value of the Atomic Write Unit Power Fail parameter. + */ + blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); + blk_queue_io_min(disk->queue, phys_bs); + blk_queue_io_opt(disk->queue, io_opt); + + /* + * Register a metadata profile for PI, or the plain non-integrity NVMe + * metadata masquerading as Type 0 if supported, otherwise reject block + * I/O to namespaces with metadata except when the namespace supports + * PI, as it can strip/insert in that case. 
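+ *
+ * Concretely: a blk_integrity profile is registered below when
+ * CONFIG_BLK_DEV_INTEGRITY is enabled and the format is one we support
+ * through the block layer; metadata formats without PI are instead
+ * given a zero capacity, while PI-capable formats stay usable because
+ * the controller can strip/insert the PI itself.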
+ */ + if (ns->ms) { + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + (ns->features & NVME_NS_METADATA_SUPPORTED)) + nvme_init_integrity(disk, ns->ms, ns->pi_type, + ns->ctrl->max_integrity_segments); + else if (!nvme_ns_has_pi(ns)) + capacity = 0; + } + + set_capacity_and_notify(disk, capacity); + + nvme_config_discard(disk, ns); + blk_queue_max_write_zeroes_sectors(disk->queue, + ns->ctrl->max_zeroes_sectors); +} + +static inline bool nvme_first_scan(struct gendisk *disk) +{ + /* nvme_alloc_ns() scans the disk prior to adding it */ + return !disk_live(disk); +} + +static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + u32 iob; + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && + is_power_of_2(ctrl->max_hw_sectors)) + iob = ctrl->max_hw_sectors; + else + iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); + + if (!iob) + return; + + if (!is_power_of_2(iob)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring unaligned IO boundary:%u\n", + ns->disk->disk_name, iob); + return; + } + + if (blk_queue_is_zoned(ns->disk->queue)) { + if (nvme_first_scan(ns->disk)) + pr_warn("%s: ignoring zoned namespace IO boundary\n", + ns->disk->disk_name); + return; + } + + blk_queue_chunk_sectors(ns->queue, iob); +} + +static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + int ret; + + blk_mq_freeze_queue(ns->disk->queue); + ns->lba_shift = id->lbaf[lbaf].ds; + nvme_set_queue_limits(ns->ctrl, ns->queue); + + nvme_configure_metadata(ns, id); + nvme_set_chunk_sectors(ns, id); + nvme_update_disk_info(ns->disk, ns, id); + + if (ns->head->ids.csi == NVME_CSI_ZNS) { + ret = nvme_update_zone_info(ns, lbaf); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } + } + + set_disk_ro(ns->disk, (id->nsattr & NVME_NS_ATTR_RO) || + test_bit(NVME_NS_FORCE_RO, &ns->flags)); + set_bit(NVME_NS_READY, &ns->flags); + blk_mq_unfreeze_queue(ns->disk->queue); + + if (blk_queue_is_zoned(ns->queue)) { + ret = nvme_revalidate_zones(ns); + if (ret && !nvme_first_scan(ns->disk)) + goto out; + } + + if (nvme_ns_head_multipath(ns->head)) { + blk_mq_freeze_queue(ns->head->disk->queue); + nvme_update_disk_info(ns->head->disk, ns, id); + set_disk_ro(ns->head->disk, + (id->nsattr & NVME_NS_ATTR_RO) || + test_bit(NVME_NS_FORCE_RO, &ns->flags)); + nvme_mpath_revalidate_paths(ns); + blk_stack_limits(&ns->head->disk->queue->limits, + &ns->queue->limits, 0); + disk_update_readahead(ns->head->disk); + blk_mq_unfreeze_queue(ns->head->disk->queue); + } + + ret = 0; +out: + /* + * If probing fails due an unsupported feature, hide the block device, + * but still allow other access. 
+ */ + if (ret == -ENODEV) { + ns->disk->flags |= GENHD_FL_HIDDEN; + set_bit(NVME_NS_READY, &ns->flags); + ret = 0; + } + return ret; +} + +static char nvme_pr_type(enum pr_type type) +{ + switch (type) { + case PR_WRITE_EXCLUSIVE: + return 1; + case PR_EXCLUSIVE_ACCESS: + return 2; + case PR_WRITE_EXCLUSIVE_REG_ONLY: + return 3; + case PR_EXCLUSIVE_ACCESS_REG_ONLY: + return 4; + case PR_WRITE_EXCLUSIVE_ALL_REGS: + return 5; + case PR_EXCLUSIVE_ACCESS_ALL_REGS: + return 6; + default: + return 0; + } +}; + +static int nvme_send_ns_head_pr_command(struct block_device *bdev, + struct nvme_command *c, u8 data[16]) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EWOULDBLOCK; + + if (ns) { + c->common.nsid = cpu_to_le32(ns->head->ns_id); + ret = nvme_submit_sync_cmd(ns->queue, c, data, 16); + } + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c, + u8 data[16]) +{ + c->common.nsid = cpu_to_le32(ns->head->ns_id); + return nvme_submit_sync_cmd(ns->queue, c, data, 16); +} + +static int nvme_pr_command(struct block_device *bdev, u32 cdw10, + u64 key, u64 sa_key, u8 op) +{ + struct nvme_command c = { }; + u8 data[16] = { 0, }; + + put_unaligned_le64(key, &data[0]); + put_unaligned_le64(sa_key, &data[8]); + + c.common.opcode = op; + c.common.cdw10 = cpu_to_le32(cdw10); + + if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && + bdev->bd_disk->fops == &nvme_ns_head_ops) + return nvme_send_ns_head_pr_command(bdev, &c, data); + return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, data); +} + +static int nvme_pr_register(struct block_device *bdev, u64 old, + u64 new, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = old ? 2 : 0; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); +} + +static int nvme_pr_reserve(struct block_device *bdev, u64 key, + enum pr_type type, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = nvme_pr_type(type) << 8; + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); +} + +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, + enum pr_type type, bool abort) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); + + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); +} + +static int nvme_pr_clear(struct block_device *bdev, u64 key) +{ + u32 cdw10 = 1 | (key ? 1 << 3 : 0); + + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); +} + +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 
1 << 3 : 0); + + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); +} + +const struct pr_ops nvme_pr_ops = { + .pr_register = nvme_pr_register, + .pr_reserve = nvme_pr_reserve, + .pr_release = nvme_pr_release, + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, +}; + +#ifdef CONFIG_BLK_SED_OPAL +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send) +{ + struct nvme_ctrl *ctrl = data; + struct nvme_command cmd = { }; + + if (send) + cmd.common.opcode = nvme_admin_security_send; + else + cmd.common.opcode = nvme_admin_security_recv; + cmd.common.nsid = 0; + cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); + cmd.common.cdw11 = cpu_to_le32(len); + + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, + NVME_QID_ANY, 1, 0); +} +EXPORT_SYMBOL_GPL(nvme_sec_submit); +#endif /* CONFIG_BLK_SED_OPAL */ + +#ifdef CONFIG_BLK_DEV_ZONED +static int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb, + data); +} +#else +#define nvme_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ + +static const struct block_device_operations nvme_bdev_ops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, + .pr_ops = &nvme_pr_ops, +}; + +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if (csts == ~0) + return -ENODEV; + if ((csts & NVME_CSTS_RDY) == bit) + break; + + usleep_range(1000, 2000); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->device, + "Device not ready; aborting %s, CSTS=0x%x\n", + enabled ? "initialisation" : "reset", csts); + return -ENODEV; + } + } + + return ret; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! 
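+ *
+ * Clearing CC.EN makes the controller take CSTS.RDY back to 0;
+ * nvme_wait_ready() above polls CSTS for that transition with a timeout
+ * derived from CAP.TO (reported in 500 ms units).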
+ */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) + msleep(NVME_QUIRK_DELAY_AMOUNT); + + ret = nvme_wait_ready(ctrl, ctrl->cap, false); + nvme_stop_keep_alive(ctrl); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_disable_ctrl); + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned dev_page_min; + int ret; + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (ret) { + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); + return ret; + } + dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; + + if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { + dev_err(ctrl->device, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); + return -ENODEV; + } + + if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) + ctrl->ctrl_config = NVME_CC_CSS_CSI; + else + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + ret = nvme_wait_ready(ctrl, ctrl->cap, true); + if (ret) + return ret; + + nvme_start_keep_alive(ctrl); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_enable_ctrl); + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->device, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + nvme_stop_keep_alive(ctrl); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); + +static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) +{ + __le64 ts; + int ret; + + if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) + return 0; + + ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); + ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), + NULL); + if (ret) + dev_warn_once(ctrl->device, + "could not set timestamp (%d)\n", ret); + return ret; +} + +static int nvme_configure_acre(struct nvme_ctrl *ctrl) +{ + struct nvme_feat_host_behavior *host; + int ret; + + /* Don't bother enabling the feature if retry delay is not reported */ + if (!ctrl->crdt[0]) + return 0; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return 0; + + host->acre = NVME_ENABLE_ACRE; + ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + kfree(host); + return ret; +} + +/* + * The function checks whether the given total (exlat + enlat) latency of + * a power state allows the latter to be used as an APST transition target. + * It does so by comparing the latency to the primary and secondary latency + * tolerances defined by module params. 
If there's a match, the corresponding + * timeout value is returned and the matching tolerance index (1 or 2) is + * reported. + */ +static bool nvme_apst_get_transition_time(u64 total_latency, + u64 *transition_time, unsigned *last_index) +{ + if (total_latency <= apst_primary_latency_tol_us) { + if (*last_index == 1) + return false; + *last_index = 1; + *transition_time = apst_primary_timeout_ms; + return true; + } + if (apst_secondary_timeout_ms && + total_latency <= apst_secondary_latency_tol_us) { + if (*last_index <= 2) + return false; + *last_index = 2; + *transition_time = apst_secondary_timeout_ms; + return true; + } + return false; +} + +/* + * APST (Autonomous Power State Transition) lets us program a table of power + * state transitions that the controller will perform automatically. + * + * Depending on module params, one of the two supported techniques will be used: + * + * - If the parameters provide explicit timeouts and tolerances, they will be + * used to build a table with up to 2 non-operational states to transition to. + * The default parameter values were selected based on the values used by + * Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic + * regeneration of the APST table in the event of switching between external + * and battery power, the timeouts and tolerances reflect a compromise + * between values used by Microsoft for AC and battery scenarios. + * - If not, we'll configure the table with a simple heuristic: we are willing + * to spend at most 2% of the time transitioning between power states. + * Therefore, when running in any given state, we will enter the next + * lower-power non-operational state after waiting 50 * (enlat + exlat) + * microseconds, as long as that state's exit latency is under the requested + * maximum latency. + * + * We will not autonomously enter any non-operational state for which the total + * latency exceeds ps_max_latency_us. + * + * Users can set ps_max_latency_us to zero to turn off APST. + */ +static int nvme_configure_apst(struct nvme_ctrl *ctrl) +{ + struct nvme_feat_auto_pst *table; + unsigned apste = 0; + u64 max_lat_us = 0; + __le64 target = 0; + int max_ps = -1; + int state; + int ret; + unsigned last_lt_index = UINT_MAX; + + /* + * If APST isn't supported or if we haven't been initialized yet, + * then don't do anything. + */ + if (!ctrl->apsta) + return 0; + + if (ctrl->npss > 31) { + dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); + return 0; + } + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return 0; + + if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { + /* Turn off APST. */ + dev_dbg(ctrl->device, "APST disabled\n"); + goto done; + } + + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more power. NPSS, + * despite the name, is the index of the lowest-power state, not the + * number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, exit_latency_us, transition_ms; + + if (target) + table->entries[state] = target; + + /* + * Don't allow transitions to the deepest state if it's quirked + * off. + */ + if (state == ctrl->npss && + (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) + continue; + + /* + * Is this state a useful non-operational state for higher-power + * states to autonomously transition to? 
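+ * (Only states flagged NVME_PS_FLAGS_NON_OP_STATE qualify, and only if
+ * their exit latency fits within ps_max_latency_us.) Each table entry
+ * built below encodes the idle transition power state in bits 7:3 and
+ * the idle time prior to transition, in milliseconds, in bits 31:8,
+ * i.e. (state << 3) | (transition_ms << 8).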
+ */ + if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE)) + continue; + + exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat); + if (exit_latency_us > ctrl->ps_max_latency_us) + continue; + + total_latency_us = exit_latency_us + + le32_to_cpu(ctrl->psd[state].entry_lat); + + /* + * This state is good. It can be used as the APST idle target + * for higher power states. + */ + if (apst_primary_timeout_ms && apst_primary_latency_tol_us) { + if (!nvme_apst_get_transition_time(total_latency_us, + &transition_ms, &last_lt_index)) + continue; + } else { + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + } + + target = cpu_to_le64((state << 3) | (transition_ms << 8)); + if (max_ps == -1) + max_ps = state; + if (total_latency_us > max_lat_us) + max_lat_us = total_latency_us; + } + + if (max_ps == -1) + dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); + else + dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", + max_ps, max_lat_us, (int)sizeof(*table), table); + apste = 1; + +done: + ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, + table, sizeof(*table), NULL); + if (ret) + dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); + kfree(table); + return ret; +} + +static void nvme_set_latency_tolerance(struct device *dev, s32 val) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + u64 latency; + + switch (val) { + case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: + case PM_QOS_LATENCY_ANY: + latency = U64_MAX; + break; + + default: + latency = val; + } + + if (ctrl->ps_max_latency_us != latency) { + ctrl->ps_max_latency_us = latency; + if (ctrl->state == NVME_CTRL_LIVE) + nvme_configure_apst(ctrl); + } +} + +struct nvme_core_quirk_entry { + /* + * NVMe model and firmware strings are padded with spaces. For + * simplicity, strings in the quirk table are padded with NULLs + * instead. + */ + u16 vid; + const char *mn; + const char *fr; + unsigned long quirks; +}; + +static const struct nvme_core_quirk_entry core_quirks[] = { + { + /* + * This Toshiba device seems to die using any APST states. See: + * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11 + */ + .vid = 0x1179, + .mn = "THNSF5256GPUK TOSHIBA", + .quirks = NVME_QUIRK_NO_APST, + }, + { + /* + * This LiteON CL1-3D*-Q11 firmware version has a race + * condition associated with actions related to suspend to idle + * LiteON has resolved the problem in future firmware + */ + .vid = 0x14a4, + .fr = "22301111", + .quirks = NVME_QUIRK_SIMPLE_SUSPEND, + }, + { + /* + * This Kioxia CD6-V Series / HPE PE8030 device times out and + * aborts I/O during any load, but more easily reproducible + * with discards (fstrim). + * + * The device is left in a state where it is also not possible + * to use "nvme set-feature" to disable APST, but booting with + * nvme_core.default_ps_max_latency=0 works. + */ + .vid = 0x1e0f, + .mn = "KCD6XVUL6T40", + .quirks = NVME_QUIRK_NO_APST, + } +}; + +/* match is null-terminated but idstr is space-padded. 
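+ * The model (mn) and firmware revision (fr) fields of the Identify
+ * Controller data are fixed-size, space-padded ASCII, so the comparison
+ * checks the prefix and then only accepts trailing spaces.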
*/ +static bool string_matches(const char *idstr, const char *match, size_t len) +{ + size_t matchlen; + + if (!match) + return true; + + matchlen = strlen(match); + WARN_ON_ONCE(matchlen > len); + + if (memcmp(idstr, match, matchlen)) + return false; + + for (; matchlen < len; matchlen++) + if (idstr[matchlen] != ' ') + return false; + + return true; +} + +static bool quirk_matches(const struct nvme_id_ctrl *id, + const struct nvme_core_quirk_entry *q) +{ + return q->vid == le16_to_cpu(id->vid) && + string_matches(id->mn, q->mn, sizeof(id->mn)) && + string_matches(id->fr, q->fr, sizeof(id->fr)); +} + +static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) +{ + size_t nqnlen; + int off; + + if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { + nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); + if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { + strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); + return; + } + + if (ctrl->vs >= NVME_VS(1, 2, 1)) + dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); + } + + /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ + off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, + "nqn.2014.08.org.nvmexpress:%04x%04x", + le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); + memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); + off += sizeof(id->sn); + memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); + off += sizeof(id->mn); + memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); +} + +static void nvme_release_subsystem(struct device *dev) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + if (subsys->instance >= 0) + ida_simple_remove(&nvme_instance_ida, subsys->instance); + kfree(subsys); +} + +static void nvme_destroy_subsystem(struct kref *ref) +{ + struct nvme_subsystem *subsys = + container_of(ref, struct nvme_subsystem, ref); + + mutex_lock(&nvme_subsystems_lock); + list_del(&subsys->entry); + mutex_unlock(&nvme_subsystems_lock); + + ida_destroy(&subsys->ns_ida); + device_del(&subsys->dev); + put_device(&subsys->dev); +} + +static void nvme_put_subsystem(struct nvme_subsystem *subsys) +{ + kref_put(&subsys->ref, nvme_destroy_subsystem); +} + +static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) +{ + struct nvme_subsystem *subsys; + + lockdep_assert_held(&nvme_subsystems_lock); + + /* + * Fail matches for discovery subsystems. This results + * in each discovery controller bound to a unique subsystem. + * This avoids issues with validating controller values + * that can only be true when there is a single unique subsystem. + * There may be multiple and completely independent entities + * that provide discovery controllers. 
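+ * (NVME_DISC_SUBSYS_NAME is the well-known discovery NQN,
+ * "nqn.2014-08.org.nvmexpress.discovery".)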
+ */ + if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) + return NULL; + + list_for_each_entry(subsys, &nvme_subsystems, entry) { + if (strcmp(subsys->subnqn, subsysnqn)) + continue; + if (!kref_get_unless_zero(&subsys->ref)) + continue; + return subsys; + } + + return NULL; +} + +#define SUBSYS_ATTR_RO(_name, _mode, _show) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, NULL) + +static ssize_t nvme_subsys_show_nqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return sysfs_emit(buf, "%s\n", subsys->subnqn); +} +static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); + +static ssize_t nvme_subsys_show_type(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + switch (subsys->subtype) { + case NVME_NQN_DISC: + return sysfs_emit(buf, "discovery\n"); + case NVME_NQN_NVME: + return sysfs_emit(buf, "nvm\n"); + default: + return sysfs_emit(buf, "reserved\n"); + } +} +static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type); + +#define nvme_subsys_show_str_function(field) \ +static ssize_t subsys_##field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_subsystem *subsys = \ + container_of(dev, struct nvme_subsystem, dev); \ + return sysfs_emit(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ +} \ +static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); + +nvme_subsys_show_str_function(model); +nvme_subsys_show_str_function(serial); +nvme_subsys_show_str_function(firmware_rev); + +static struct attribute *nvme_subsys_attrs[] = { + &subsys_attr_model.attr, + &subsys_attr_serial.attr, + &subsys_attr_firmware_rev.attr, + &subsys_attr_subsysnqn.attr, + &subsys_attr_subsystype.attr, +#ifdef CONFIG_NVME_MULTIPATH + &subsys_attr_iopolicy.attr, +#endif + NULL, +}; + +static const struct attribute_group nvme_subsys_attrs_group = { + .attrs = nvme_subsys_attrs, +}; + +static const struct attribute_group *nvme_subsys_attrs_groups[] = { + &nvme_subsys_attrs_group, + NULL, +}; + +static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl) +{ + return ctrl->opts && ctrl->opts->discovery_nqn; +} + +static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, + struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + struct nvme_ctrl *tmp; + + lockdep_assert_held(&nvme_subsystems_lock); + + list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { + if (nvme_state_terminal(tmp)) + continue; + + if (tmp->cntlid == ctrl->cntlid) { + dev_err(ctrl->device, + "Duplicate cntlid %u with %s, subsys %s, rejecting\n", + ctrl->cntlid, dev_name(tmp->device), + subsys->subnqn); + return false; + } + + if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || + nvme_discovery_ctrl(ctrl)) + continue; + + dev_err(ctrl->device, + "Subsystem does not support multiple controllers\n"); + return false; + } + + return true; +} + +static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + struct nvme_subsystem *subsys, *found; + int ret; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return -ENOMEM; + + subsys->instance = -1; + mutex_init(&subsys->lock); + kref_init(&subsys->ref); + INIT_LIST_HEAD(&subsys->ctrls); + INIT_LIST_HEAD(&subsys->nsheads); + nvme_init_subnqn(subsys, ctrl, id); + memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); + memcpy(subsys->model, id->mn, 
sizeof(subsys->model)); + memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); + subsys->vendor_id = le16_to_cpu(id->vid); + subsys->cmic = id->cmic; + + /* Versions prior to 1.4 don't necessarily report a valid type */ + if (id->cntrltype == NVME_CTRL_DISC || + !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME)) + subsys->subtype = NVME_NQN_DISC; + else + subsys->subtype = NVME_NQN_NVME; + + if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) { + dev_err(ctrl->device, + "Subsystem %s is not a discovery controller", + subsys->subnqn); + kfree(subsys); + return -EINVAL; + } + subsys->awupf = le16_to_cpu(id->awupf); + nvme_mpath_default_iopolicy(subsys); + + subsys->dev.class = nvme_subsys_class; + subsys->dev.release = nvme_release_subsystem; + subsys->dev.groups = nvme_subsys_attrs_groups; + dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); + device_initialize(&subsys->dev); + + mutex_lock(&nvme_subsystems_lock); + found = __nvme_find_get_subsystem(subsys->subnqn); + if (found) { + put_device(&subsys->dev); + subsys = found; + + if (!nvme_validate_cntlid(subsys, ctrl, id)) { + ret = -EINVAL; + goto out_put_subsystem; + } + } else { + ret = device_add(&subsys->dev); + if (ret) { + dev_err(ctrl->device, + "failed to register subsystem device.\n"); + put_device(&subsys->dev); + goto out_unlock; + } + ida_init(&subsys->ns_ida); + list_add_tail(&subsys->entry, &nvme_subsystems); + } + + ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device)); + if (ret) { + dev_err(ctrl->device, + "failed to create sysfs link from subsystem.\n"); + goto out_put_subsystem; + } + + if (!found) + subsys->instance = ctrl->instance; + ctrl->subsys = subsys; + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&nvme_subsystems_lock); + return 0; + +out_put_subsystem: + nvme_put_subsystem(subsys); +out_unlock: + mutex_unlock(&nvme_subsystems_lock); + return ret; +} + +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, + void *log, size_t size, u64 offset) +{ + struct nvme_command c = { }; + u32 dwlen = nvme_bytes_to_numd(size); + + c.get_log_page.opcode = nvme_admin_get_log_page; + c.get_log_page.nsid = cpu_to_le32(nsid); + c.get_log_page.lid = log_page; + c.get_log_page.lsp = lsp; + c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); + c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); + c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); + c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + c.get_log_page.csi = csi; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, + struct nvme_effects_log **log) +{ + struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi); + int ret; + + if (cel) + goto out; + + cel = kzalloc(sizeof(*cel), GFP_KERNEL); + if (!cel) + return -ENOMEM; + + ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi, + cel, sizeof(*cel), 0); + if (ret) { + kfree(cel); + return ret; + } + + xa_store(&ctrl->cels, csi, cel, GFP_KERNEL); +out: + *log = cel; + return 0; +} + +static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units) +{ + u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val; + + if (check_shl_overflow(1U, units + page_shift - 9, &val)) + return UINT_MAX; + return val; +} + +static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_nvm *id; + int ret; + + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { + 
ctrl->max_discard_sectors = UINT_MAX; + ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; + } else { + ctrl->max_discard_sectors = 0; + ctrl->max_discard_segments = 0; + } + + /* + * Even though NVMe spec explicitly states that MDTS is not applicable + * to the write-zeroes, we are cautious and limit the size to the + * controllers max_hw_sectors value, which is based on the MDTS field + * and possibly other limiting factors. + */ + if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && + !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) + ctrl->max_zeroes_sectors = ctrl->max_hw_sectors; + else + ctrl->max_zeroes_sectors = 0; + + if (nvme_ctrl_limited_cns(ctrl)) + return 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return 0; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_NVM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (ret) + goto free_data; + + if (id->dmrl) + ctrl->max_discard_segments = id->dmrl; + if (id->dmrsl) + ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl); + if (id->wzsl) + ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); + +free_data: + kfree(id); + return ret; +} + +static int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u32 max_hw_sectors; + bool prev_apst_enabled; + int ret; + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); + if (ret < 0) + goto out_free; + } + + if (!(ctrl->ops->flags & NVME_F_FABRICS)) + ctrl->cntlid = le16_to_cpu(id->cntlid); + + if (!ctrl->identified) { + unsigned int i; + + ret = nvme_init_subsystem(ctrl, id); + if (ret) + goto out_free; + + /* + * Check for quirks. Quirk can depend on firmware version, + * so, in principle, the set of quirks present can change + * across a reset. As a possible future enhancement, we + * could re-scan for quirks every time we reinitialize + * the device, but we'd have to make sure that the driver + * behaves intelligently if the quirks change. 
+ */ + for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { + if (quirk_matches(id, &core_quirks[i])) + ctrl->quirks |= core_quirks[i].quirks; + } + } + + if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { + dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); + ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; + } + + ctrl->crdt[0] = le16_to_cpu(id->crdt1); + ctrl->crdt[1] = le16_to_cpu(id->crdt2); + ctrl->crdt[2] = le16_to_cpu(id->crdt3); + + ctrl->oacs = le16_to_cpu(id->oacs); + ctrl->oncs = le16_to_cpu(id->oncs); + ctrl->mtfa = le16_to_cpu(id->mtfa); + ctrl->oaes = le32_to_cpu(id->oaes); + ctrl->wctemp = le16_to_cpu(id->wctemp); + ctrl->cctemp = le16_to_cpu(id->cctemp); + + atomic_set(&ctrl->abort_limit, id->acl + 1); + ctrl->vwc = id->vwc; + if (id->mdts) + max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts); + else + max_hw_sectors = UINT_MAX; + ctrl->max_hw_sectors = + min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); + + nvme_set_queue_limits(ctrl, ctrl->admin_q); + ctrl->sgls = le32_to_cpu(id->sgls); + ctrl->kas = le16_to_cpu(id->kas); + ctrl->max_namespaces = le32_to_cpu(id->mnan); + ctrl->ctratt = le32_to_cpu(id->ctratt); + + if (id->rtd3e) { + /* us -> s */ + u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; + + ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, + shutdown_timeout, 60); + + if (ctrl->shutdown_timeout != shutdown_timeout) + dev_info(ctrl->device, + "Shutdown timeout set to %u seconds\n", + ctrl->shutdown_timeout); + } else + ctrl->shutdown_timeout = shutdown_timeout; + + ctrl->npss = id->npss; + ctrl->apsta = id->apsta; + prev_apst_enabled = ctrl->apst_enabled; + if (ctrl->quirks & NVME_QUIRK_NO_APST) { + if (force_apst && id->apsta) { + dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); + ctrl->apst_enabled = true; + } else { + ctrl->apst_enabled = false; + } + } else { + ctrl->apst_enabled = id->apsta; + } + memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); + + if (ctrl->ops->flags & NVME_F_FABRICS) { + ctrl->icdoff = le16_to_cpu(id->icdoff); + ctrl->ioccsz = le32_to_cpu(id->ioccsz); + ctrl->iorcsz = le32_to_cpu(id->iorcsz); + ctrl->maxcmd = le16_to_cpu(id->maxcmd); + + /* + * In fabrics we need to verify the cntlid matches the + * admin connect + */ + if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { + dev_err(ctrl->device, + "Mismatching cntlid: Connect %u vs Identify " + "%u, rejecting\n", + ctrl->cntlid, le16_to_cpu(id->cntlid)); + ret = -EINVAL; + goto out_free; + } + + if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) { + dev_err(ctrl->device, + "keep-alive support is mandatory for fabrics\n"); + ret = -EINVAL; + goto out_free; + } + } else { + ctrl->hmpre = le32_to_cpu(id->hmpre); + ctrl->hmmin = le32_to_cpu(id->hmmin); + ctrl->hmminds = le32_to_cpu(id->hmminds); + ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); + } + + ret = nvme_mpath_init_identify(ctrl, id); + if (ret < 0) + goto out_free; + + if (ctrl->apst_enabled && !prev_apst_enabled) + dev_pm_qos_expose_latency_tolerance(ctrl->device); + else if (!ctrl->apst_enabled && prev_apst_enabled) + dev_pm_qos_hide_latency_tolerance(ctrl->device); + +out_free: + kfree(id); + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. 
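+ *
+ * Besides Identify, this also (re)applies the features that depend on
+ * it: APST, timestamp, directives and ACRE, plus hwmon registration on
+ * the first initialization of a non-discovery controller.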
+ */ +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); + return ret; + } + + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + + if (ctrl->vs >= NVME_VS(1, 1, 0)) + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); + + ret = nvme_init_identify(ctrl); + if (ret) + return ret; + + ret = nvme_init_non_mdts_limits(ctrl); + if (ret < 0) + return ret; + + ret = nvme_configure_apst(ctrl); + if (ret < 0) + return ret; + + ret = nvme_configure_timestamp(ctrl); + if (ret < 0) + return ret; + + ret = nvme_configure_directives(ctrl); + if (ret < 0) + return ret; + + ret = nvme_configure_acre(ctrl); + if (ret < 0) + return ret; + + if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { + ret = nvme_hwmon_init(ctrl); + if (ret < 0) + return ret; + } + + ctrl->identified = true; + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish); + +static int nvme_dev_open(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); + + switch (ctrl->state) { + case NVME_CTRL_LIVE: + break; + default: + return -EWOULDBLOCK; + } + + nvme_get_ctrl(ctrl); + if (!try_module_get(ctrl->ops->module)) { + nvme_put_ctrl(ctrl); + return -EINVAL; + } + + file->private_data = ctrl; + return 0; +} + +static int nvme_dev_release(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); + + module_put(ctrl->ops->module); + nvme_put_ctrl(ctrl); + return 0; +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = nvme_reset_ctrl_sync(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static ssize_t nvme_sysfs_rescan(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + nvme_queue_scan(ctrl); + return count; +} +static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); + +static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops == &nvme_bdev_ops) + return nvme_get_ns_from_dev(dev)->head; + else + return disk->private_data; +} + +static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns_head *head = dev_to_ns_head(dev); + struct nvme_ns_ids *ids = &head->ids; + struct nvme_subsystem *subsys = head->subsys; + int serial_len = sizeof(subsys->serial); + int model_len = sizeof(subsys->model); + + if (!uuid_is_null(&ids->uuid)) + return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); + + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); + + if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return sysfs_emit(buf, "eui.%8phN\n", ids->eui64); + + while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || + subsys->serial[serial_len - 1] == '\0')) + serial_len--; + while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || + 
subsys->model[model_len - 1] == '\0')) + model_len--; + + return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + serial_len, subsys->serial, model_len, subsys->model, + head->ns_id); +} +static DEVICE_ATTR_RO(wwid); + +static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); +} +static DEVICE_ATTR_RO(nguid); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; + + /* For backward compatibility expose the NGUID to userspace if + * we have no UUID set + */ + if (uuid_is_null(&ids->uuid)) { + dev_warn_ratelimited(dev, + "No UUID available providing old NGUID\n"); + return sysfs_emit(buf, "%pU\n", ids->nguid); + } + return sysfs_emit(buf, "%pU\n", &ids->uuid); +} +static DEVICE_ATTR_RO(uuid); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); +} +static DEVICE_ATTR_RO(eui); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id); +} +static DEVICE_ATTR_RO(nsid); + +static struct attribute *nvme_ns_id_attrs[] = { + &dev_attr_wwid.attr, + &dev_attr_uuid.attr, + &dev_attr_nguid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, +#ifdef CONFIG_NVME_MULTIPATH + &dev_attr_ana_grpid.attr, + &dev_attr_ana_state.attr, +#endif + NULL, +}; + +static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; + + if (a == &dev_attr_uuid.attr) { + if (uuid_is_null(&ids->uuid) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } + if (a == &dev_attr_nguid.attr) { + if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return 0; + } +#ifdef CONFIG_NVME_MULTIPATH + if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { + if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ + return 0; + if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) + return 0; + } +#endif + return a->mode; +} + +static const struct attribute_group nvme_ns_id_attr_group = { + .attrs = nvme_ns_id_attrs, + .is_visible = nvme_ns_id_attrs_are_visible, +}; + +const struct attribute_group *nvme_ns_id_attr_groups[] = { + &nvme_ns_id_attr_group, + NULL, +}; + +#define nvme_show_str_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sysfs_emit(buf, "%.*s\n", \ + (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); + +#define nvme_show_int_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sysfs_emit(buf, "%d\n", ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_int_function(cntlid); +nvme_show_int_function(numa_node); +nvme_show_int_function(queue_count); 
+nvme_show_int_function(sqsize); +nvme_show_int_function(kato); + +static ssize_t nvme_sysfs_delete(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (device_remove_file_self(dev, attr)) + nvme_delete_ctrl_sync(ctrl); + return count; +} +static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); + +static ssize_t nvme_sysfs_show_transport(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%s\n", ctrl->ops->name); +} +static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); + +static ssize_t nvme_sysfs_show_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + static const char *const state_name[] = { + [NVME_CTRL_NEW] = "new", + [NVME_CTRL_LIVE] = "live", + [NVME_CTRL_RESETTING] = "resetting", + [NVME_CTRL_CONNECTING] = "connecting", + [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", + [NVME_CTRL_DEAD] = "dead", + }; + + if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && + state_name[ctrl->state]) + return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); + + return sysfs_emit(buf, "unknown state\n"); +} + +static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); + +static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn); +} +static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); + +static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn); +} +static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); + +static ssize_t nvme_sysfs_show_hostid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id); +} +static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); + +static ssize_t nvme_sysfs_show_address(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); +} +static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); + +static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (ctrl->opts->max_reconnects == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ctrl_loss_tmo, err; + + err = kstrtoint(buf, 10, &ctrl_loss_tmo); + if (err) + return -EINVAL; + + if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + return count; +} +static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); + 
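[Editor's aside, not part of the patch: the ctrl_loss_tmo attribute above converts a timeout in seconds into a reconnect budget. The store path computes opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, reconnect_delay), and the show path reports max_reconnects * reconnect_delay, so the value read back is the written timeout rounded up to a whole number of reconnect periods, or "off" when -1 disables the limit. A minimal userspace sketch of that arithmetic, assuming only the usual round-up-division definition of DIV_ROUND_UP:]

#include <stdio.h>

/* Userspace stand-in for the kernel macro; rounds an integer division up. */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int reconnect_delay = 10;	/* seconds between reconnect attempts */
	int ctrl_loss_tmo = 25;		/* value written to ctrl_loss_tmo */
	int max_reconnects;

	if (ctrl_loss_tmo < 0)
		max_reconnects = -1;	/* "off": retry forever */
	else
		max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, reconnect_delay);

	/* Mirrors the show path: 25s rounds up to 3 attempts, reported as 30s. */
	printf("max_reconnects=%d reported_tmo=%d\n",
	       max_reconnects, max_reconnects * reconnect_delay);
	return 0;
}

[This rounding is why a written ctrl_loss_tmo that is not a multiple of reconnect_delay reads back larger than the value written.]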
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->reconnect_delay == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + unsigned int v; + int err; + + err = kstrtou32(buf, 10, &v); + if (err) + return err; + + ctrl->opts->reconnect_delay = v; + return count; +} +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, + nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); + +static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->fast_io_fail_tmo == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo); +} + +static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int fast_io_fail_tmo, err; + + err = kstrtoint(buf, 10, &fast_io_fail_tmo); + if (err) + return -EINVAL; + + if (fast_io_fail_tmo < 0) + opts->fast_io_fail_tmo = -1; + else + opts->fast_io_fail_tmo = fast_io_fail_tmo; + return count; +} +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_rescan_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + &dev_attr_cntlid.attr, + &dev_attr_delete_controller.attr, + &dev_attr_transport.attr, + &dev_attr_subsysnqn.attr, + &dev_attr_address.attr, + &dev_attr_state.attr, + &dev_attr_numa_node.attr, + &dev_attr_queue_count.attr, + &dev_attr_sqsize.attr, + &dev_attr_hostnqn.attr, + &dev_attr_hostid.attr, + &dev_attr_ctrl_loss_tmo.attr, + &dev_attr_reconnect_delay.attr, + &dev_attr_fast_io_fail_tmo.attr, + &dev_attr_kato.attr, + NULL +}; + +static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) + return 0; + if (a == &dev_attr_address.attr && !ctrl->ops->get_address) + return 0; + if (a == &dev_attr_hostnqn.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_hostid.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) + return 0; + + return a->mode; +} + +static const struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, + .is_visible = nvme_dev_attrs_are_visible, +}; + +static const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + +static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, + unsigned nsid) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (h->ns_id != nsid) + continue; + if 
(!list_empty(&h->list) && nvme_tryget_ns_head(h)) + return h; + } + + return NULL; +} + +static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys, + struct nvme_ns_ids *ids) +{ + bool has_uuid = !uuid_is_null(&ids->uuid); + bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid)); + bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid)) + return -EINVAL; + if (has_nguid && + memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0) + return -EINVAL; + if (has_eui64 && + memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0) + return -EINVAL; + } + + return 0; +} + +static void nvme_cdev_rel(struct device *dev) +{ + ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(dev->devt)); +} + +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) +{ + cdev_device_del(cdev, cdev_device); + put_device(cdev_device); +} + +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner) +{ + int minor, ret; + + minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) + return minor; + cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); + cdev_device->class = nvme_ns_chr_class; + cdev_device->release = nvme_cdev_rel; + device_initialize(cdev_device); + cdev_init(cdev, fops); + cdev->owner = owner; + ret = cdev_device_add(cdev, cdev_device); + if (ret) + put_device(cdev_device); + + return ret; +} + +static int nvme_ns_chr_open(struct inode *inode, struct file *file) +{ + return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev)); +} + +static int nvme_ns_chr_release(struct inode *inode, struct file *file) +{ + nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev)); + return 0; +} + +static const struct file_operations nvme_ns_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_chr_open, + .release = nvme_ns_chr_release, + .unlocked_ioctl = nvme_ns_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_cdev(struct nvme_ns *ns) +{ + int ret; + + ns->cdev_device.parent = ns->ctrl->device; + ret = dev_set_name(&ns->cdev_device, "ng%dn%d", + ns->ctrl->instance, ns->head->instance); + if (ret) + return ret; + + return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, + ns->ctrl->ops->module); +} + +static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_ns_ids *ids) +{ + struct nvme_ns_head *head; + size_t size = sizeof(*head); + int ret = -ENOMEM; + +#ifdef CONFIG_NVME_MULTIPATH + size += num_possible_nodes() * sizeof(struct nvme_ns *); +#endif + + head = kzalloc(size, GFP_KERNEL); + if (!head) + goto out; + ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_head; + head->instance = ret; + INIT_LIST_HEAD(&head->list); + ret = init_srcu_struct(&head->srcu); + if (ret) + goto out_ida_remove; + head->subsys = ctrl->subsys; + head->ns_id = nsid; + head->ids = *ids; + kref_init(&head->ref); + + ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids); + if (ret) { + dev_err(ctrl->device, + "duplicate IDs for nsid %d\n", nsid); + goto out_cleanup_srcu; + } + + if (head->ids.csi) { + ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); + if (ret) + goto out_cleanup_srcu; + } else + head->effects = ctrl->effects; + + ret = 
nvme_mpath_alloc_disk(ctrl, head); + if (ret) + goto out_cleanup_srcu; + + list_add_tail(&head->entry, &ctrl->subsys->nsheads); + + kref_get(&ctrl->subsys->ref); + + return head; +out_cleanup_srcu: + cleanup_srcu_struct(&head->srcu); +out_ida_remove: + ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); +out_free_head: + kfree(head); +out: + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); + return ERR_PTR(ret); +} + +static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, + struct nvme_ns_ids *ids, bool is_shared) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_ns_head *head = NULL; + int ret = 0; + + mutex_lock(&ctrl->subsys->lock); + head = nvme_find_ns_head(ctrl->subsys, nsid); + if (!head) { + head = nvme_alloc_ns_head(ctrl, nsid, ids); + if (IS_ERR(head)) { + ret = PTR_ERR(head); + goto out_unlock; + } + head->shared = is_shared; + } else { + ret = -EINVAL; + if (!is_shared || !head->shared) { + dev_err(ctrl->device, + "Duplicate unshared namespace %d\n", nsid); + goto out_put_ns_head; + } + if (!nvme_ns_ids_equal(&head->ids, ids)) { + dev_err(ctrl->device, + "IDs don't match for shared namespace %d\n", + nsid); + goto out_put_ns_head; + } + } + + list_add_tail_rcu(&ns->siblings, &head->list); + ns->head = head; + mutex_unlock(&ctrl->subsys->lock); + return 0; + +out_put_ns_head: + nvme_put_ns_head(head); +out_unlock: + mutex_unlock(&ctrl->subsys->lock); + return ret; +} + +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns, *ret = NULL; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->ns_id == nsid) { + if (!nvme_get_ns(ns)) + continue; + ret = ns; + break; + } + if (ns->head->ns_id > nsid) + break; + } + up_read(&ctrl->namespaces_rwsem); + return ret; +} +EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); + +/* + * Add the namespace to the controller list while keeping the list ordered. + */ +static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) +{ + struct nvme_ns *tmp; + + list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) { + if (tmp->head->ns_id < ns->head->ns_id) { + list_add(&ns->list, &tmp->list); + return; + } + } + list_add(&ns->list, &ns->ctrl->namespaces); +} + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, + struct nvme_ns_ids *ids) +{ + struct nvme_ns *ns; + struct gendisk *disk; + struct nvme_id_ns *id; + int node = ctrl->numa_node; + + if (nvme_identify_ns(ctrl, nsid, ids, &id)) + return; + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) + goto out_free_id; + + disk = blk_mq_alloc_disk(ctrl->tagset, ns); + if (IS_ERR(disk)) + goto out_free_ns; + disk->fops = &nvme_bdev_ops; + disk->private_data = ns; + + ns->disk = disk; + ns->queue = disk->queue; + + if (ctrl->opts && ctrl->opts->data_digest) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); + + blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); + if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) + blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); + + ns->ctrl = ctrl; + kref_init(&ns->kref); + + if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) + goto out_cleanup_disk; + + /* + * Without the multipath code enabled, multiple controller per + * subsystems are visible as devices and thus we cannot use the + * subsystem instance. 
+ */ + if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, + ns->head->instance); + + if (nvme_update_ns_info(ns, id)) + goto out_unlink_ns; + + down_write(&ctrl->namespaces_rwsem); + nvme_ns_add_to_ctrl_list(ns); + up_write(&ctrl->namespaces_rwsem); + nvme_get_ctrl(ctrl); + + if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups)) + goto out_cleanup_ns_from_list; + + if (!nvme_ns_head_multipath(ns->head)) + nvme_add_ns_cdev(ns); + + nvme_mpath_add_disk(ns, id); + nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); + kfree(id); + + return; + + out_cleanup_ns_from_list: + nvme_put_ctrl(ctrl); + down_write(&ctrl->namespaces_rwsem); + list_del_init(&ns->list); + up_write(&ctrl->namespaces_rwsem); + out_unlink_ns: + mutex_lock(&ctrl->subsys->lock); + list_del_rcu(&ns->siblings); + if (list_empty(&ns->head->list)) + list_del_init(&ns->head->entry); + mutex_unlock(&ctrl->subsys->lock); + nvme_put_ns_head(ns->head); + out_cleanup_disk: + blk_cleanup_disk(disk); + out_free_ns: + kfree(ns); + out_free_id: + kfree(id); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + bool last_path = false; + + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) + return; + + clear_bit(NVME_NS_READY, &ns->flags); + set_capacity(ns->disk, 0); + nvme_fault_inject_fini(&ns->fault_inject); + + /* + * Ensure that !NVME_NS_READY is seen by other threads to prevent + * this ns going back into current_path. + */ + synchronize_srcu(&ns->head->srcu); + + /* wait for concurrent submissions */ + if (nvme_mpath_clear_current_path(ns)) + synchronize_srcu(&ns->head->srcu); + + mutex_lock(&ns->ctrl->subsys->lock); + list_del_rcu(&ns->siblings); + if (list_empty(&ns->head->list)) { + list_del_init(&ns->head->entry); + last_path = true; + } + mutex_unlock(&ns->ctrl->subsys->lock); + + /* guarantee not available in head->list */ + synchronize_rcu(); + + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); + del_gendisk(ns->disk); + blk_cleanup_queue(ns->queue); + + down_write(&ns->ctrl->namespaces_rwsem); + list_del_init(&ns->list); + up_write(&ns->ctrl->namespaces_rwsem); + + if (last_path) + nvme_mpath_shutdown_disk(ns->head); + nvme_put_ns(ns); +} + +static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) +{ + struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid); + + if (ns) { + nvme_ns_remove(ns); + nvme_put_ns(ns); + } +} + +static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) +{ + struct nvme_id_ns *id; + int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) + goto out; + + ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id); + if (ret) + goto out; + + ret = NVME_SC_INVALID_NS | NVME_SC_DNR; + if (!nvme_ns_ids_equal(&ns->head->ids, ids)) { + dev_err(ns->ctrl->device, + "identifiers changed for nsid %d\n", ns->head->ns_id); + goto out_free_id; + } + + ret = nvme_update_ns_info(ns, id); + +out_free_id: + kfree(id); +out: + /* + * Only remove the namespace if we got a fatal error back from the + * device, otherwise ignore the error and just move on. + * + * TODO: we should probably schedule a delayed retry here. 
+ */ + if (ret > 0 && (ret & NVME_SC_DNR)) + nvme_ns_remove(ns); +} + +static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns_ids ids = { }; + struct nvme_ns *ns; + + if (nvme_identify_ns_descs(ctrl, nsid, &ids)) + return; + + ns = nvme_find_get_ns(ctrl, nsid); + if (ns) { + nvme_validate_ns(ns, &ids); + nvme_put_ns(ns); + return; + } + + switch (ids.csi) { + case NVME_CSI_NVM: + nvme_alloc_ns(ctrl, nsid, &ids); + break; + case NVME_CSI_ZNS: + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + dev_warn(ctrl->device, + "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", + nsid); + break; + } + if (!nvme_multi_css(ctrl)) { + dev_warn(ctrl->device, + "command set not reported for nsid: %d\n", + nsid); + break; + } + nvme_alloc_ns(ctrl, nsid, &ids); + break; + default: + dev_warn(ctrl->device, "unknown csi %u for nsid %u\n", + ids.csi, nsid); + break; + } +} + +static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, + unsigned nsid) +{ + struct nvme_ns *ns, *next; + LIST_HEAD(rm_list); + + down_write(&ctrl->namespaces_rwsem); + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { + if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) + list_move_tail(&ns->list, &rm_list); + } + up_write(&ctrl->namespaces_rwsem); + + list_for_each_entry_safe(ns, next, &rm_list, list) + nvme_ns_remove(ns); + +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) +{ + const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32); + __le32 *ns_list; + u32 prev = 0; + int ret = 0, i; + + if (nvme_ctrl_limited_cns(ctrl)) + return -EOPNOTSUPP; + + ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (;;) { + struct nvme_command cmd = { + .identify.opcode = nvme_admin_identify, + .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST, + .identify.nsid = cpu_to_le32(prev), + }; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, + NVME_IDENTIFY_DATA_SIZE); + if (ret) { + dev_warn(ctrl->device, + "Identify NS List failed (status=0x%x)\n", ret); + goto free; + } + + for (i = 0; i < nr_entries; i++) { + u32 nsid = le32_to_cpu(ns_list[i]); + + if (!nsid) /* end of the list? */ + goto out; + nvme_validate_or_alloc_ns(ctrl, nsid); + while (++prev < nsid) + nvme_ns_remove_by_nsid(ctrl, prev); + } + } + out: + nvme_remove_invalid_namespaces(ctrl, prev); + free: + kfree(ns_list); + return ret; +} + +static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u32 nn, i; + + if (nvme_identify_ctrl(ctrl, &id)) + return; + nn = le32_to_cpu(id->nn); + kfree(id); + + for (i = 1; i <= nn; i++) + nvme_validate_or_alloc_ns(ctrl, i); + + nvme_remove_invalid_namespaces(ctrl, nn); +} + +static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) +{ + size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); + __le32 *log; + int error; + + log = kzalloc(log_size, GFP_KERNEL); + if (!log) + return; + + /* + * We need to read the log to clear the AEN, but we don't want to rely + * on it for the changed namespace information as userspace could have + * raced with us in reading the log page, which could cause us to miss + * updates. 
+ */ + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, + NVME_CSI_NVM, log, log_size, 0); + if (error) + dev_warn(ctrl->device, + "reading changed ns log failed: %d\n", error); + + kfree(log); +} + +static void nvme_scan_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, scan_work); + + /* No tagset on a live ctrl means IO queues could not created */ + if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) + return; + + if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { + dev_info(ctrl->device, "rescanning namespaces.\n"); + nvme_clear_changed_ns_log(ctrl); + } + + mutex_lock(&ctrl->scan_lock); + if (nvme_scan_ns_list(ctrl) != 0) + nvme_scan_ns_sequential(ctrl); + mutex_unlock(&ctrl->scan_lock); +} + +/* + * This function iterates the namespace list unlocked to allow recovery from + * controller failure. It is up to the caller to ensure the namespace list is + * not modified by scan work while this function is executing. + */ +void nvme_remove_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns, *next; + LIST_HEAD(ns_list); + + /* + * make sure to requeue I/O to all namespaces as these + * might result from the scan itself and must complete + * for the scan_work to make progress + */ + nvme_mpath_clear_ctrl_paths(ctrl); + + /* prevent racing with ns scanning */ + flush_work(&ctrl->scan_work); + + /* + * The dead states indicates the controller was not gracefully + * disconnected. In that case, we won't be able to flush any data while + * removing the namespaces' disks; fail all the queues now to avoid + * potentially having to clean up the failed sync later. + */ + if (ctrl->state == NVME_CTRL_DEAD) + nvme_kill_queues(ctrl); + + /* this is a no-op when called from the controller reset handler */ + nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); + + down_write(&ctrl->namespaces_rwsem); + list_splice_init(&ctrl->namespaces, &ns_list); + up_write(&ctrl->namespaces_rwsem); + + list_for_each_entry_safe(ns, next, &ns_list, list) + nvme_ns_remove(ns); +} +EXPORT_SYMBOL_GPL(nvme_remove_namespaces); + +static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret; + + ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name); + if (ret) + return ret; + + if (opts) { + ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_TRSVCID=%s", + opts->trsvcid ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", + opts->host_traddr ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_IFACE=%s", + opts->host_iface ?: "none"); + } + return ret; +} + +static void nvme_aen_uevent(struct nvme_ctrl *ctrl) +{ + char *envp[2] = { NULL, NULL }; + u32 aen_result = ctrl->aen_result; + + ctrl->aen_result = 0; + if (!aen_result) + return; + + envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); + if (!envp[0]) + return; + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); + kfree(envp[0]); +} + +static void nvme_async_event_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, async_event_work); + + nvme_aen_uevent(ctrl); + + /* + * The transport drivers must guarantee AER submission here is safe by + * flushing ctrl async_event_work after changing the controller state + * from LIVE 
and before freeing the admin queue. + */ + if (ctrl->state == NVME_CTRL_LIVE) + ctrl->ops->submit_async_event(ctrl); +} + +static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) +{ + + u32 csts; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) + return false; + + if (csts == ~0) + return false; + + return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); +} + +static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) +{ + struct nvme_fw_slot_info_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return; + + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, + log, sizeof(*log), 0)) + dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); + kfree(log); +} + +static void nvme_fw_act_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, + struct nvme_ctrl, fw_act_work); + unsigned long fw_act_timeout; + + if (ctrl->mtfa) + fw_act_timeout = jiffies + + msecs_to_jiffies(ctrl->mtfa * 100); + else + fw_act_timeout = jiffies + + msecs_to_jiffies(admin_timeout * 1000); + + nvme_stop_queues(ctrl); + while (nvme_ctrl_pp_status(ctrl)) { + if (time_after(jiffies, fw_act_timeout)) { + dev_warn(ctrl->device, + "Fw activation timeout, reset controller\n"); + nvme_try_sched_reset(ctrl); + return; + } + msleep(100); + } + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) + return; + + nvme_start_queues(ctrl); + /* read FW slot information to clear the AER */ + nvme_get_fw_slot_info(ctrl); +} + +static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) +{ + u32 aer_notice_type = (result & 0xff00) >> 8; + + trace_nvme_async_event(ctrl, aer_notice_type); + + switch (aer_notice_type) { + case NVME_AER_NOTICE_NS_CHANGED: + set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events); + nvme_queue_scan(ctrl); + break; + case NVME_AER_NOTICE_FW_ACT_STARTING: + /* + * We are (ab)using the RESETTING state to prevent subsequent + * recovery actions from interfering with the controller's + * firmware activation. 
+ */ + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + queue_work(nvme_wq, &ctrl->fw_act_work); + break; +#ifdef CONFIG_NVME_MULTIPATH + case NVME_AER_NOTICE_ANA: + if (!ctrl->ana_log_buf) + break; + queue_work(nvme_wq, &ctrl->ana_work); + break; +#endif + case NVME_AER_NOTICE_DISC_CHANGED: + ctrl->aen_result = result; + break; + default: + dev_warn(ctrl->device, "async event result %08x\n", result); + } +} + +void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, + volatile union nvme_result *res) +{ + u32 result = le32_to_cpu(res->u32); + u32 aer_type = result & 0x07; + + if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) + return; + + switch (aer_type) { + case NVME_AER_NOTICE: + nvme_handle_aen_notice(ctrl, result); + break; + case NVME_AER_ERROR: + case NVME_AER_SMART: + case NVME_AER_CSS: + case NVME_AER_VS: + trace_nvme_async_event(ctrl, aer_type); + ctrl->aen_result = result; + break; + default: + break; + } + queue_work(nvme_wq, &ctrl->async_event_work); +} +EXPORT_SYMBOL_GPL(nvme_complete_async_event); + +void nvme_stop_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_mpath_stop(ctrl); + nvme_stop_failfast_work(ctrl); + flush_work(&ctrl->async_event_work); + cancel_work_sync(&ctrl->fw_act_work); + if (ctrl->ops->stop_ctrl) + ctrl->ops->stop_ctrl(ctrl); +} +EXPORT_SYMBOL_GPL(nvme_stop_ctrl); + +void nvme_start_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_enable_aen(ctrl); + + if (ctrl->queue_count > 1) { + nvme_queue_scan(ctrl); + nvme_start_queues(ctrl); + } +} +EXPORT_SYMBOL_GPL(nvme_start_ctrl); + +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_hwmon_exit(ctrl); + nvme_fault_inject_fini(&ctrl->fault_inject); + dev_pm_qos_hide_latency_tolerance(ctrl->device); + cdev_device_del(&ctrl->cdev, ctrl->device); + nvme_put_ctrl(ctrl); +} +EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); + +static void nvme_free_cels(struct nvme_ctrl *ctrl) +{ + struct nvme_effects_log *cel; + unsigned long i; + + xa_for_each(&ctrl->cels, i, cel) { + xa_erase(&ctrl->cels, i); + kfree(cel); + } + + xa_destroy(&ctrl->cels); +} + +static void nvme_free_ctrl(struct device *dev) +{ + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvme_subsystem *subsys = ctrl->subsys; + + if (!subsys || ctrl->instance != subsys->instance) + ida_simple_remove(&nvme_instance_ida, ctrl->instance); + + nvme_free_cels(ctrl); + nvme_mpath_uninit(ctrl); + __free_page(ctrl->discard_page); + + if (subsys) { + mutex_lock(&nvme_subsystems_lock); + list_del(&ctrl->subsys_entry); + sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); + mutex_unlock(&nvme_subsystems_lock); + } + + ctrl->ops->free_ctrl(ctrl); + + if (subsys) + nvme_put_subsystem(subsys); +} + +/* + * Initialize a NVMe controller structures. This needs to be called during + * earliest initialization so that we have the initialized structured around + * during probing. 
+ */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks) +{ + int ret; + + ctrl->state = NVME_CTRL_NEW; + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); + spin_lock_init(&ctrl->lock); + mutex_init(&ctrl->scan_lock); + INIT_LIST_HEAD(&ctrl->namespaces); + xa_init(&ctrl->cels); + init_rwsem(&ctrl->namespaces_rwsem); + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->quirks = quirks; + ctrl->numa_node = NUMA_NO_NODE; + INIT_WORK(&ctrl->scan_work, nvme_scan_work); + INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); + INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); + INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); + init_waitqueue_head(&ctrl->state_wq); + + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work); + memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); + ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + + BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > + PAGE_SIZE); + ctrl->discard_page = alloc_page(GFP_KERNEL); + if (!ctrl->discard_page) { + ret = -ENOMEM; + goto out; + } + + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out; + ctrl->instance = ret; + + device_initialize(&ctrl->ctrl_device); + ctrl->device = &ctrl->ctrl_device; + ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), + ctrl->instance); + ctrl->device->class = nvme_class; + ctrl->device->parent = ctrl->dev; + ctrl->device->groups = nvme_dev_attr_groups; + ctrl->device->release = nvme_free_ctrl; + dev_set_drvdata(ctrl->device, ctrl); + ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); + if (ret) + goto out_release_instance; + + nvme_get_ctrl(ctrl); + cdev_init(&ctrl->cdev, &nvme_dev_fops); + ctrl->cdev.owner = ops->module; + ret = cdev_device_add(&ctrl->cdev, ctrl->device); + if (ret) + goto out_free_name; + + /* + * Initialize latency tolerance controls. The sysfs files won't + * be visible to userspace unless the device actually supports APST. + */ + ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; + dev_pm_qos_update_user_latency_tolerance(ctrl->device, + min(default_ps_max_latency_us, (unsigned long)S32_MAX)); + + nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); + nvme_mpath_init_ctrl(ctrl); + + return 0; +out_free_name: + nvme_put_ctrl(ctrl); + kfree_const(ctrl->device->kobj.name); +out_release_instance: + ida_simple_remove(&nvme_instance_ida, ctrl->instance); +out: + if (ctrl->discard_page) + __free_page(ctrl->discard_page); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_init_ctrl); + +static void nvme_start_ns_queue(struct nvme_ns *ns) +{ + if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags)) + blk_mq_unquiesce_queue(ns->queue); +} + +static void nvme_stop_ns_queue(struct nvme_ns *ns) +{ + if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) + blk_mq_quiesce_queue(ns->queue); + else + blk_mq_wait_quiesce_done(ns->queue); +} + +/* + * Prepare a queue for teardown. + * + * This must forcibly unquiesce queues to avoid blocking dispatch, and only set + * the capacity to 0 after that to avoid blocking dispatchers that may be + * holding bd_butex. This will end buffered writers dirtying pages that can't + * be synced. 
+ */ +static void nvme_set_queue_dying(struct nvme_ns *ns) +{ + if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + return; + + blk_mark_disk_dead(ns->disk); + nvme_start_ns_queue(ns); + + set_capacity_and_notify(ns->disk, 0); +} + +/** + * nvme_kill_queues(): Ends all namespace queues + * @ctrl: the dead controller that needs to end + * + * Call this function when the driver determines it is unable to get the + * controller in a state capable of servicing IO. + */ +void nvme_kill_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + + /* Forcibly unquiesce queues to avoid blocking dispatch */ + if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) + nvme_start_admin_queue(ctrl); + + list_for_each_entry(ns, &ctrl->namespaces, list) + nvme_set_queue_dying(ns); + + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_kill_queues); + +void nvme_unfreeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_unfreeze_queue(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_unfreeze); + +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); + if (timeout <= 0) + break; + } + up_read(&ctrl->namespaces_rwsem); + return timeout; +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); + +void nvme_wait_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_freeze_queue_wait(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_wait_freeze); + +void nvme_start_freeze(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_freeze_queue_start(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_start_freeze); + +void nvme_stop_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + nvme_stop_ns_queue(ns); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_stop_queues); + +void nvme_start_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + nvme_start_ns_queue(ns); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_start_queues); + +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +{ + if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) + blk_mq_quiesce_queue(ctrl->admin_q); + else + blk_mq_wait_quiesce_done(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); + +void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +{ + if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) + blk_mq_unquiesce_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_start_admin_queue); + +void nvme_sync_io_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_sync_queue(ns->queue); + up_read(&ctrl->namespaces_rwsem); +} +EXPORT_SYMBOL_GPL(nvme_sync_io_queues); + +static inline bool disk_is_nvme(struct gendisk *disk) +{ + if (!disk_to_dev(disk)->parent) + return false; + + return 
disk_to_dev(disk)->parent->class == nvme_class; +} + +struct nvme_ns *disk_to_nvme_ns(struct gendisk *disk) +{ + struct nvme_ns *ns = NULL; + +#ifdef CONFIG_NVME_MULTIPATH + if (disk->fops == &nvme_ns_head_ops) + ns = nvme_find_path(disk->private_data); + else if (disk_is_nvme(disk)) + ns = disk->private_data; +#else + if (disk_is_nvme(disk)) + ns = disk->private_data; +#endif + + return ns; +} +EXPORT_SYMBOL_GPL(disk_to_nvme_ns); + +void nvme_sync_queues(struct nvme_ctrl *ctrl) +{ + nvme_sync_io_queues(ctrl); + if (ctrl->admin_q) + blk_sync_queue(ctrl->admin_q); +} +EXPORT_SYMBOL_GPL(nvme_sync_queues); + +struct nvme_ctrl *nvme_ctrl_from_file(struct file *file) +{ + if (file->f_op != &nvme_dev_fops) + return NULL; + return file->private_data; +} +EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU); + +/* + * Check we didn't inadvertently grow the command structure sizes: + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_identify) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); + BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); + BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); +} + + +static int __init nvme_core_init(void) +{ + int result = -ENOMEM; + + _nvme_check_size(); + + nvme_wq = alloc_workqueue("nvme-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_wq) + goto out; + + nvme_reset_wq = alloc_workqueue("nvme-reset-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_reset_wq) + goto destroy_wq; + + nvme_delete_wq = alloc_workqueue("nvme-delete-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_delete_wq) + goto destroy_reset_wq; + + result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0, + NVME_MINORS, "nvme"); + if (result < 0) + goto destroy_delete_wq; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + nvme_class->dev_uevent = nvme_class_uevent; + + nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); + if (IS_ERR(nvme_subsys_class)) { + result = PTR_ERR(nvme_subsys_class); + goto destroy_class; + } + + result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, + "nvme-generic"); + if (result < 0) + goto destroy_subsys_class; + + nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic"); + if (IS_ERR(nvme_ns_chr_class)) { + result = PTR_ERR(nvme_ns_chr_class); + goto unregister_generic_ns; + } + + return 0; + +unregister_generic_ns: + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); 
+destroy_subsys_class: + class_destroy(nvme_subsys_class); +destroy_class: + class_destroy(nvme_class); +unregister_chrdev: + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); +destroy_delete_wq: + destroy_workqueue(nvme_delete_wq); +destroy_reset_wq: + destroy_workqueue(nvme_reset_wq); +destroy_wq: + destroy_workqueue(nvme_wq); +out: + return result; +} + +static void __exit nvme_core_exit(void) +{ + class_destroy(nvme_ns_chr_class); + class_destroy(nvme_subsys_class); + class_destroy(nvme_class); + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); + destroy_workqueue(nvme_delete_wq); + destroy_workqueue(nvme_reset_wq); + destroy_workqueue(nvme_wq); + ida_destroy(&nvme_ns_chr_minor_ida); + ida_destroy(&nvme_instance_ida); +} + +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_core_init); +module_exit(nvme_core_exit); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.c new file mode 100644 index 0000000..cf42468 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.c @@ -0,0 +1,1205 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include "nvme.h" +#include "fabrics.h" + +static LIST_HEAD(nvmf_transports); +static DECLARE_RWSEM(nvmf_transports_rwsem); + +static LIST_HEAD(nvmf_hosts); +static DEFINE_MUTEX(nvmf_hosts_mutex); + +static struct nvmf_host *nvmf_default_host; + +static struct nvmf_host *__nvmf_host_find(const char *hostnqn) +{ + struct nvmf_host *host; + + list_for_each_entry(host, &nvmf_hosts, list) { + if (!strcmp(host->nqn, hostnqn)) + return host; + } + + return NULL; +} + +static struct nvmf_host *nvmf_host_add(const char *hostnqn) +{ + struct nvmf_host *host; + + mutex_lock(&nvmf_hosts_mutex); + host = __nvmf_host_find(hostnqn); + if (host) { + kref_get(&host->ref); + goto out_unlock; + } + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + goto out_unlock; + + kref_init(&host->ref); + strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + + list_add_tail(&host->list, &nvmf_hosts); +out_unlock: + mutex_unlock(&nvmf_hosts_mutex); + return host; +} + +static struct nvmf_host *nvmf_host_default(void) +{ + struct nvmf_host *host; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return NULL; + + kref_init(&host->ref); + uuid_gen(&host->id); + snprintf(host->nqn, NVMF_NQN_SIZE, + "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id); + + mutex_lock(&nvmf_hosts_mutex); + list_add_tail(&host->list, &nvmf_hosts); + mutex_unlock(&nvmf_hosts_mutex); + + return host; +} + +static void nvmf_host_destroy(struct kref *ref) +{ + struct nvmf_host *host = container_of(ref, struct nvmf_host, ref); + + mutex_lock(&nvmf_hosts_mutex); + list_del(&host->list); + mutex_unlock(&nvmf_hosts_mutex); + + kfree(host); +} + +static void nvmf_host_put(struct nvmf_host *host) +{ + if (host) + kref_put(&host->ref, nvmf_host_destroy); +} + +/** + * nvmf_get_address() - Get address/port + * @ctrl: Host NVMe controller instance which we got the address + * @buf: OUTPUT parameter that will contain the address/port + * @size: buffer size + */ +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + int len = 0; + + if (ctrl->opts->mask & NVMF_OPT_TRADDR) + len += 
scnprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + if (ctrl->opts->mask & NVMF_OPT_TRSVCID) + len += scnprintf(buf + len, size - len, "%strsvcid=%s", + (len) ? "," : "", ctrl->opts->trsvcid); + if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) + len += scnprintf(buf + len, size - len, "%shost_traddr=%s", + (len) ? "," : "", ctrl->opts->host_traddr); + if (ctrl->opts->mask & NVMF_OPT_HOST_IFACE) + len += scnprintf(buf + len, size - len, "%shost_iface=%s", + (len) ? "," : "", ctrl->opts->host_iface); + len += scnprintf(buf + len, size - len, "\n"); + + return len; +} +EXPORT_SYMBOL_GPL(nvmf_get_address); + +/** + * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 32-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + struct nvme_command cmd; + union nvme_result res; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(res.u64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read32); + +/** + * nvmf_reg_read64() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 64-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + struct nvme_command cmd = { }; + union nvme_result res; + int ret; + + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.attrib = 1; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(res.u64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? 
ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read64); + +/** + * nvmf_reg_write32() - NVMe Fabrics "Property Write" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: Input parameter that contains the value to be + * written to the property. + * + * Used by the NVMe host system to write a 32-bit capsule property value + * to an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful write + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + struct nvme_command cmd = { }; + int ret; + + cmd.prop_set.opcode = nvme_fabrics_command; + cmd.prop_set.fctype = nvme_fabrics_type_property_set; + cmd.prop_set.attrib = 0; + cmd.prop_set.offset = cpu_to_le32(off); + cmd.prop_set.value = cpu_to_le64(val); + + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + if (unlikely(ret)) + dev_err(ctrl->device, + "Property Set error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_write32); + +/** + * nvmf_log_connect_error() - Error-parsing-diagnostic print out function for + * connect() errors. + * @ctrl: The specific /dev/nvmeX device that had the error. + * @errval: Error code to be decoded in a more human-friendly + * printout. + * @offset: For use with the NVMe error code + * NVME_SC_CONNECT_INVALID_PARAM. + * @cmd: This is the SQE portion of a submission capsule. + * @data: This is the "Data" portion of a submission capsule. 
+ */ +static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, + int errval, int offset, struct nvme_command *cmd, + struct nvmf_connect_data *data) +{ + int err_sctype = errval & ~NVME_SC_DNR; + + switch (err_sctype) { + case (NVME_SC_CONNECT_INVALID_PARAM): + if (offset >> 16) { + char *inv_data = "Connect Invalid Data Parameter"; + + switch (offset & 0xffff) { + case (offsetof(struct nvmf_connect_data, cntlid)): + dev_err(ctrl->device, + "%s, cntlid: %d\n", + inv_data, data->cntlid); + break; + case (offsetof(struct nvmf_connect_data, hostnqn)): + dev_err(ctrl->device, + "%s, hostnqn \"%s\"\n", + inv_data, data->hostnqn); + break; + case (offsetof(struct nvmf_connect_data, subsysnqn)): + dev_err(ctrl->device, + "%s, subsysnqn \"%s\"\n", + inv_data, data->subsysnqn); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_data, offset & 0xffff); + break; + } + } else { + char *inv_sqe = "Connect Invalid SQE Parameter"; + + switch (offset) { + case (offsetof(struct nvmf_connect_command, qid)): + dev_err(ctrl->device, + "%s, qid %d\n", + inv_sqe, cmd->connect.qid); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_sqe, offset); + } + } + break; + case NVME_SC_CONNECT_INVALID_HOST: + dev_err(ctrl->device, + "Connect for subsystem %s is not allowed, hostnqn: %s\n", + data->subsysnqn, data->hostnqn); + break; + case NVME_SC_CONNECT_CTRL_BUSY: + dev_err(ctrl->device, + "Connect command failed: controller is busy or not available\n"); + break; + case NVME_SC_CONNECT_FORMAT: + dev_err(ctrl->device, + "Connect incompatible format: %d", + cmd->connect.recfmt); + break; + case NVME_SC_HOST_PATH_ERROR: + dev_err(ctrl->device, + "Connect command failed: host path error\n"); + break; + default: + dev_err(ctrl->device, + "Connect command failed, error wo/DNR bit: %d\n", + err_sctype); + break; + } +} + +/** + * nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to request + * a new NVMe controller allocation on the target + * system and establish an NVMe Admin connection to + * that controller. + * + * This function enables an NVMe host device to request a new allocation of + * an NVMe controller resource on a target system as well establish a + * fabrics-protocol connection of the NVMe Admin queue between the + * host system device and the allocated NVMe controller on the + * target system via a NVMe Fabrics "Connect" command. 
+ * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + * + */ +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) +{ + struct nvme_command cmd = { }; + union nvme_result res; + struct nvmf_connect_data *data; + int ret; + + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = 0; + cmd.connect.sqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); + + /* + * Set keep-alive timeout in seconds granularity (ms * 1000) + */ + cmd.connect.kato = cpu_to_le32(ctrl->kato * 1000); + + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + uuid_copy(&data->hostid, &ctrl->opts->host->id); + data->cntlid = cpu_to_le16(0xffff); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, + data, sizeof(*data), 0, NVME_QID_ANY, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), + &cmd, data); + goto out_free_data; + } + + ctrl->cntlid = le16_to_cpu(res.u16); + +out_free_data: + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); + +/** + * nvmf_connect_io_queue() - NVMe Fabrics I/O Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to establish an + * NVMe I/O queue connection to the already allocated NVMe + * controller on the target system. + * @qid: NVMe I/O queue number for the new I/O connection between + * host and target (note qid == 0 is illegal as this is + * the Admin queue, per NVMe standard). + * + * This function issues a fabrics-protocol connection + * of a NVMe I/O queue (via NVMe Fabrics "Connect" command) + * between the host system device and the allocated NVMe controller + * on the target system. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_command cmd = { }; + struct nvmf_connect_data *data; + union nvme_result res; + int ret; + + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = cpu_to_le16(qid); + cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + + if (ctrl->opts->disable_sqflow) + cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + uuid_copy(&data->hostid, &ctrl->opts->host->id); + data->cntlid = cpu_to_le16(ctrl->cntlid); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, + data, sizeof(*data), 0, qid, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), + &cmd, data); + } + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); + +bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) +{ + if (ctrl->opts->max_reconnects == -1 || + ctrl->nr_reconnects < ctrl->opts->max_reconnects) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(nvmf_should_reconnect); + +/** + * nvmf_register_transport() - NVMe Fabrics Library registration function. + * @ops: Transport ops instance to be registered to the + * common fabrics library. 
+ * + * API function that registers the type of specific transport fabric + * being implemented to the common NVMe fabrics library. Part of + * the overall init sequence of starting up a fabrics driver. + */ +int nvmf_register_transport(struct nvmf_transport_ops *ops) +{ + if (!ops->create_ctrl) + return -EINVAL; + + down_write(&nvmf_transports_rwsem); + list_add_tail(&ops->entry, &nvmf_transports); + up_write(&nvmf_transports_rwsem); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmf_register_transport); + +/** + * nvmf_unregister_transport() - NVMe Fabrics Library unregistration function. + * @ops: Transport ops instance to be unregistered from the + * common fabrics library. + * + * Fabrics API function that unregisters the type of specific transport + * fabric being implemented from the common NVMe fabrics library. + * Part of the overall exit sequence of unloading the implemented driver. + */ +void nvmf_unregister_transport(struct nvmf_transport_ops *ops) +{ + down_write(&nvmf_transports_rwsem); + list_del(&ops->entry); + up_write(&nvmf_transports_rwsem); +} +EXPORT_SYMBOL_GPL(nvmf_unregister_transport); + +static struct nvmf_transport_ops *nvmf_lookup_transport( + struct nvmf_ctrl_options *opts) +{ + struct nvmf_transport_ops *ops; + + lockdep_assert_held(&nvmf_transports_rwsem); + + list_for_each_entry(ops, &nvmf_transports, entry) { + if (strcmp(ops->name, opts->transport) == 0) + return ops; + } + + return NULL; +} + +static const match_table_t opt_tokens = { + { NVMF_OPT_TRANSPORT, "transport=%s" }, + { NVMF_OPT_TRADDR, "traddr=%s" }, + { NVMF_OPT_TRSVCID, "trsvcid=%s" }, + { NVMF_OPT_NQN, "nqn=%s" }, + { NVMF_OPT_QUEUE_SIZE, "queue_size=%d" }, + { NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" }, + { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, + { NVMF_OPT_CTRL_LOSS_TMO, "ctrl_loss_tmo=%d" }, + { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, + { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, + { NVMF_OPT_HOST_IFACE, "host_iface=%s" }, + { NVMF_OPT_HOST_ID, "hostid=%s" }, + { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, + { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, + { NVMF_OPT_HDR_DIGEST, "hdr_digest" }, + { NVMF_OPT_DATA_DIGEST, "data_digest" }, + { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, + { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, + { NVMF_OPT_TOS, "tos=%d" }, + { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, + { NVMF_OPT_DISCOVERY, "discovery" }, + { NVMF_OPT_ERR, NULL } +}; + +static int nvmf_parse_options(struct nvmf_ctrl_options *opts, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + size_t nqnlen = 0; + int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; + uuid_t hostid; + + /* Set defaults */ + opts->queue_size = NVMF_DEF_QUEUE_SIZE; + opts->nr_io_queues = num_online_cpus(); + opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + opts->kato = 0; + opts->duplicate_connect = false; + opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO; + opts->hdr_digest = false; + opts->data_digest = false; + opts->tos = -1; /* < 0 == use transport default */ + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + uuid_gen(&hostid); + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + opts->mask |= token; + switch (token) { + case NVMF_OPT_TRANSPORT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->transport); + opts->transport = p; + break; + case NVMF_OPT_NQN: + p = 
match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->subsysnqn); + opts->subsysnqn = p; + nqnlen = strlen(opts->subsysnqn); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + opts->subsysnqn, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->discovery_nqn = + !(strcmp(opts->subsysnqn, + NVME_DISC_SUBSYS_NAME)); + break; + case NVMF_OPT_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->traddr); + opts->traddr = p; + break; + case NVMF_OPT_TRSVCID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->trsvcid); + opts->trsvcid = p; + break; + case NVMF_OPT_QUEUE_SIZE: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < NVMF_MIN_QUEUE_SIZE || + token > NVMF_MAX_QUEUE_SIZE) { + pr_err("Invalid queue_size %d\n", token); + ret = -EINVAL; + goto out; + } + opts->queue_size = token; + break; + case NVMF_OPT_NR_IO_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid number of IOQs %d\n", token); + ret = -EINVAL; + goto out; + } + if (opts->discovery_nqn) { + pr_debug("Ignoring nr_io_queues value for discovery controller\n"); + break; + } + + opts->nr_io_queues = min_t(unsigned int, + num_online_cpus(), token); + break; + case NVMF_OPT_KATO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token < 0) { + pr_err("Invalid keep_alive_tmo %d\n", token); + ret = -EINVAL; + goto out; + } else if (token == 0 && !opts->discovery_nqn) { + /* Allowed for debug */ + pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); + } + opts->kato = token; + break; + case NVMF_OPT_CTRL_LOSS_TMO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token < 0) + pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n"); + ctrl_loss_tmo = token; + break; + case NVMF_OPT_FAIL_FAST_TMO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token >= 0) + pr_warn("I/O fail on reconnect controller after %d sec\n", + token); + else + token = -1; + + opts->fast_io_fail_tmo = token; + break; + case NVMF_OPT_HOSTNQN: + if (opts->host) { + pr_err("hostnqn already user-assigned: %s\n", + opts->host->nqn); + ret = -EADDRINUSE; + goto out; + } + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + nqnlen = strlen(p); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + p, NVMF_NQN_SIZE); + kfree(p); + ret = -EINVAL; + goto out; + } + opts->host = nvmf_host_add(p); + kfree(p); + if (!opts->host) { + ret = -ENOMEM; + goto out; + } + break; + case NVMF_OPT_RECONNECT_DELAY: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid reconnect_delay %d\n", token); + ret = -EINVAL; + goto out; + } + opts->reconnect_delay = token; + break; + case NVMF_OPT_HOST_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->host_traddr); + opts->host_traddr = p; + break; + case NVMF_OPT_HOST_IFACE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->host_iface); + opts->host_iface = p; + break; + case NVMF_OPT_HOST_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = uuid_parse(p, &hostid); + if (ret) { + pr_err("Invalid hostid %s\n", p); + ret = -EINVAL; + kfree(p); + goto out; + } + kfree(p); + break; + case NVMF_OPT_DUP_CONNECT: + opts->duplicate_connect = true; + break; + case 
NVMF_OPT_DISABLE_SQFLOW: + opts->disable_sqflow = true; + break; + case NVMF_OPT_HDR_DIGEST: + opts->hdr_digest = true; + break; + case NVMF_OPT_DATA_DIGEST: + opts->data_digest = true; + break; + case NVMF_OPT_NR_WRITE_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_write_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_write_queues = token; + break; + case NVMF_OPT_NR_POLL_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid nr_poll_queues %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_poll_queues = token; + break; + case NVMF_OPT_TOS: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < 0) { + pr_err("Invalid type of service %d\n", token); + ret = -EINVAL; + goto out; + } + if (token > 255) { + pr_warn("Clamping type of service to 255\n"); + token = 255; + } + opts->tos = token; + break; + case NVMF_OPT_DISCOVERY: + opts->discovery_nqn = true; + break; + default: + pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", + p); + ret = -EINVAL; + goto out; + } + } + + if (opts->discovery_nqn) { + opts->nr_io_queues = 0; + opts->nr_write_queues = 0; + opts->nr_poll_queues = 0; + opts->duplicate_connect = true; + } else { + if (!opts->kato) + opts->kato = NVME_DEFAULT_KATO; + } + if (ctrl_loss_tmo < 0) { + opts->max_reconnects = -1; + } else { + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + if (ctrl_loss_tmo < opts->fast_io_fail_tmo) + pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", + opts->fast_io_fail_tmo, ctrl_loss_tmo); + } + + if (!opts->host) { + kref_get(&nvmf_default_host->ref); + opts->host = nvmf_default_host; + } + + uuid_copy(&opts->host->id, &hostid); + +out: + kfree(options); + return ret; +} + +static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, + unsigned int required_opts) +{ + if ((opts->mask & required_opts) != required_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if ((opt_tokens[i].token & required_opts) && + !(opt_tokens[i].token & opts->mask)) { + pr_warn("missing parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + if (!nvmf_ctlr_matches_baseopts(ctrl, opts) || + strcmp(opts->traddr, ctrl->opts->traddr) || + strcmp(opts->trsvcid, ctrl->opts->trsvcid)) + return false; + + /* + * Checking the local address is rough. In most cases, none is specified + * and the host port is selected by the stack. + * + * Assume no match if: + * - local address is specified and address is not the same + * - local address is not specified but remote is, or vice versa + * (admin using specific host_traddr when it matters). 
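+ *
+ * Illustrative example (addresses are made up): an existing controller
+ * created with host_traddr=192.168.1.10 only matches a new request that
+ * also specifies host_traddr=192.168.1.10; a request that omits
+ * host_traddr does not match it, and vice versa.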
+ */ + if ((opts->mask & NVMF_OPT_HOST_TRADDR) && + (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) { + if (strcmp(opts->host_traddr, ctrl->opts->host_traddr)) + return false; + } else if ((opts->mask & NVMF_OPT_HOST_TRADDR) || + (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) { + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(nvmf_ip_options_match); + +static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, + unsigned int allowed_opts) +{ + if (opts->mask & ~allowed_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if ((opt_tokens[i].token & opts->mask) && + (opt_tokens[i].token & ~allowed_opts)) { + pr_warn("invalid parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +void nvmf_free_options(struct nvmf_ctrl_options *opts) +{ + nvmf_host_put(opts->host); + kfree(opts->transport); + kfree(opts->traddr); + kfree(opts->trsvcid); + kfree(opts->subsysnqn); + kfree(opts->host_traddr); + kfree(opts->host_iface); + kfree(opts); +} +EXPORT_SYMBOL_GPL(nvmf_free_options); + +#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) +#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ + NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\ + NVMF_OPT_FAIL_FAST_TMO) + +static struct nvme_ctrl * +nvmf_create_ctrl(struct device *dev, const char *buf) +{ + struct nvmf_ctrl_options *opts; + struct nvmf_transport_ops *ops; + struct nvme_ctrl *ctrl; + int ret; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return ERR_PTR(-ENOMEM); + + ret = nvmf_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + + request_module("nvme-%s", opts->transport); + + /* + * Check the generic options first as we need a valid transport for + * the lookup below. Then clear the generic flags so that transport + * drivers don't have to care about them. 
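+ *
+ * (For reference: the "generic" flags referred to here are
+ * NVMF_OPT_TRANSPORT and NVMF_OPT_NQN, i.e. NVMF_REQUIRED_OPTS as
+ * defined above.)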
+ */ + ret = nvmf_check_required_opts(opts, NVMF_REQUIRED_OPTS); + if (ret) + goto out_free_opts; + opts->mask &= ~NVMF_REQUIRED_OPTS; + + down_read(&nvmf_transports_rwsem); + ops = nvmf_lookup_transport(opts); + if (!ops) { + pr_info("no handler found for transport %s.\n", + opts->transport); + ret = -EINVAL; + goto out_unlock; + } + + if (!try_module_get(ops->module)) { + ret = -EBUSY; + goto out_unlock; + } + up_read(&nvmf_transports_rwsem); + + ret = nvmf_check_required_opts(opts, ops->required_opts); + if (ret) + goto out_module_put; + ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | + ops->allowed_opts | ops->required_opts); + if (ret) + goto out_module_put; + + ctrl = ops->create_ctrl(dev, opts); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_module_put; + } + + module_put(ops->module); + return ctrl; + +out_module_put: + module_put(ops->module); + goto out_free_opts; +out_unlock: + up_read(&nvmf_transports_rwsem); +out_free_opts: + nvmf_free_options(opts); + return ERR_PTR(ret); +} + +static struct class *nvmf_class; +static struct device *nvmf_device; +static DEFINE_MUTEX(nvmf_dev_mutex); + +static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *pos) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl; + const char *buf; + int ret = 0; + + if (count > PAGE_SIZE) + return -ENOMEM; + + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + mutex_lock(&nvmf_dev_mutex); + if (seq_file->private) { + ret = -EINVAL; + goto out_unlock; + } + + ctrl = nvmf_create_ctrl(nvmf_device, buf); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + seq_file->private = ctrl; + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + kfree(buf); + return ret ? ret : count; +} + +static void __nvmf_concat_opt_tokens(struct seq_file *seq_file) +{ + const struct match_token *tok; + int idx; + + /* + * Add dummy entries for instance and cntlid to + * signal an invalid/non-existing controller + */ + seq_puts(seq_file, "instance=-1,cntlid=-1"); + for (idx = 0; idx < ARRAY_SIZE(opt_tokens); idx++) { + tok = &opt_tokens[idx]; + if (tok->token == NVMF_OPT_ERR) + continue; + seq_puts(seq_file, ","); + seq_puts(seq_file, tok->pattern); + } + seq_puts(seq_file, "\n"); +} + +static int nvmf_dev_show(struct seq_file *seq_file, void *private) +{ + struct nvme_ctrl *ctrl; + + mutex_lock(&nvmf_dev_mutex); + ctrl = seq_file->private; + if (!ctrl) { + __nvmf_concat_opt_tokens(seq_file); + goto out_unlock; + } + + seq_printf(seq_file, "instance=%d,cntlid=%d\n", + ctrl->instance, ctrl->cntlid); + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + return 0; +} + +static int nvmf_dev_open(struct inode *inode, struct file *file) +{ + /* + * The miscdevice code initializes file->private_data, but doesn't + * make use of it later. 
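+ *
+ * For reference, a minimal user-space sketch of driving this misc device
+ * (the transport, address and NQN values below are hypothetical; user-space
+ * tooling such as nvme-cli performs the same write-then-read sequence on a
+ * single open file descriptor):
+ *
+ *	fd = open("/dev/nvme-fabrics", O_RDWR);
+ *	write(fd, "transport=rdma,traddr=192.168.1.10,trsvcid=4420,"
+ *		  "nqn=nqn.2016-06.io.example:subsys1", len);
+ *	read(fd, buf, sizeof(buf));
+ *
+ * On success the read reports "instance=%d,cntlid=%d" for the controller
+ * created by the preceding write; a descriptor that has not created a
+ * controller reports instance=-1,cntlid=-1 plus the supported options.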
+ */
+	file->private_data = NULL;
+	return single_open(file, nvmf_dev_show, NULL);
+}
+
+static int nvmf_dev_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq_file = file->private_data;
+	struct nvme_ctrl *ctrl = seq_file->private;
+
+	if (ctrl)
+		nvme_put_ctrl(ctrl);
+	return single_release(inode, file);
+}
+
+static const struct file_operations nvmf_dev_fops = {
+	.owner		= THIS_MODULE,
+	.write		= nvmf_dev_write,
+	.read		= seq_read,
+	.open		= nvmf_dev_open,
+	.release	= nvmf_dev_release,
+};
+
+static struct miscdevice nvmf_misc = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "nvme-fabrics",
+	.fops		= &nvmf_dev_fops,
+};
+
+static int __init nvmf_init(void)
+{
+	int ret;
+
+	nvmf_default_host = nvmf_host_default();
+	if (!nvmf_default_host)
+		return -ENOMEM;
+
+	nvmf_class = class_create(THIS_MODULE, "nvme-fabrics");
+	if (IS_ERR(nvmf_class)) {
+		pr_err("couldn't register class nvme-fabrics\n");
+		ret = PTR_ERR(nvmf_class);
+		goto out_free_host;
+	}
+
+	nvmf_device =
+		device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
+	if (IS_ERR(nvmf_device)) {
+		pr_err("couldn't create nvme-fabrics device!\n");
+		ret = PTR_ERR(nvmf_device);
+		goto out_destroy_class;
+	}
+
+	ret = misc_register(&nvmf_misc);
+	if (ret) {
+		pr_err("couldn't register misc device: %d\n", ret);
+		goto out_destroy_device;
+	}
+
+	return 0;
+
+out_destroy_device:
+	device_destroy(nvmf_class, MKDEV(0, 0));
+out_destroy_class:
+	class_destroy(nvmf_class);
+out_free_host:
+	nvmf_host_put(nvmf_default_host);
+	return ret;
+}
+
+static void __exit nvmf_exit(void)
+{
+	misc_deregister(&nvmf_misc);
+	device_destroy(nvmf_class, MKDEV(0, 0));
+	class_destroy(nvmf_class);
+	nvmf_host_put(nvmf_default_host);
+
+	BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
+}
+
+MODULE_LICENSE("GPL v2");
+
+module_init(nvmf_init);
+module_exit(nvmf_exit);
diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.h
new file mode 100644
index 0000000..1e3a09c
--- /dev/null
+++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fabrics.h
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NVMe over Fabrics common host code.
+ * Copyright (c) 2015-2016 HGST, a Western Digital Company.
+ */
+#ifndef _NVME_FABRICS_H
+#define _NVME_FABRICS_H 1
+
+#include
+#include
+
+#define NVMF_MIN_QUEUE_SIZE	16
+#define NVMF_MAX_QUEUE_SIZE	1024
+#define NVMF_DEF_QUEUE_SIZE	128
+#define NVMF_DEF_RECONNECT_DELAY	10
+/* default to 600 seconds of reconnect attempts before giving up */
+#define NVMF_DEF_CTRL_LOSS_TMO	600
+/* default is -1: the fail fast mechanism is disabled */
+#define NVMF_DEF_FAIL_FAST_TMO	-1
+
+/*
+ * Reserve one command for internal usage. This command is used for sending
+ * the connect command, as well as for the keep alive command on the admin
+ * queue once live.
+ */
+#define NVMF_RESERVED_TAGS	1
+
+/*
+ * Define a host as seen by the target. We allocate one at boot, but also
+ * allow overriding it when creating controllers. This is both to provide
+ * persistence of the Host NQN over multiple boots, and to allow using
+ * multiple ones, for example in a container scenario.
Because we must not + * use different Host NQNs with the same Host ID we generate a Host ID and + * use this structure to keep track of the relation between the two. + */ +struct nvmf_host { + struct kref ref; + struct list_head list; + char nqn[NVMF_NQN_SIZE]; + uuid_t id; +}; + +/** + * enum nvmf_parsing_opts - used to define the sysfs parsing options used. + */ +enum { + NVMF_OPT_ERR = 0, + NVMF_OPT_TRANSPORT = 1 << 0, + NVMF_OPT_NQN = 1 << 1, + NVMF_OPT_TRADDR = 1 << 2, + NVMF_OPT_TRSVCID = 1 << 3, + NVMF_OPT_QUEUE_SIZE = 1 << 4, + NVMF_OPT_NR_IO_QUEUES = 1 << 5, + NVMF_OPT_TL_RETRY_COUNT = 1 << 6, + NVMF_OPT_KATO = 1 << 7, + NVMF_OPT_HOSTNQN = 1 << 8, + NVMF_OPT_RECONNECT_DELAY = 1 << 9, + NVMF_OPT_HOST_TRADDR = 1 << 10, + NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, + NVMF_OPT_HOST_ID = 1 << 12, + NVMF_OPT_DUP_CONNECT = 1 << 13, + NVMF_OPT_DISABLE_SQFLOW = 1 << 14, + NVMF_OPT_HDR_DIGEST = 1 << 15, + NVMF_OPT_DATA_DIGEST = 1 << 16, + NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, + NVMF_OPT_NR_POLL_QUEUES = 1 << 18, + NVMF_OPT_TOS = 1 << 19, + NVMF_OPT_FAIL_FAST_TMO = 1 << 20, + NVMF_OPT_HOST_IFACE = 1 << 21, + NVMF_OPT_DISCOVERY = 1 << 22, +}; + +/** + * struct nvmf_ctrl_options - Used to hold the options specified + * with the parsing opts enum. + * @mask: Used by the fabrics library to parse through sysfs options + * on adding a NVMe controller. + * @transport: Holds the fabric transport "technology name" (for a lack of + * better description) that will be used by an NVMe controller + * being added. + * @subsysnqn: Hold the fully qualified NQN subystem name (format defined + * in the NVMe specification, "NVMe Qualified Names"). + * @traddr: The transport-specific TRADDR field for a port on the + * subsystem which is adding a controller. + * @trsvcid: The transport-specific TRSVCID field for a port on the + * subsystem which is adding a controller. + * @host_traddr: A transport-specific field identifying the NVME host port + * to use for the connection to the controller. + * @host_iface: A transport-specific field identifying the NVME host + * interface to use for the connection to the controller. + * @queue_size: Number of IO queue elements. + * @nr_io_queues: Number of controller IO queues that will be established. + * @reconnect_delay: Time between two consecutive reconnect attempts. + * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. + * @kato: Keep-alive timeout. + * @host: Virtual NVMe host, contains the NQN and Host ID. 
+ * @max_reconnects: maximum number of allowed reconnect attempts before removing
+ *	the controller, (-1) means reconnect forever, zero means remove
+ *	immediately;
+ * @disable_sqflow: disable controller sq flow control
+ * @hdr_digest: generate/verify header digest (TCP)
+ * @data_digest: generate/verify data digest (TCP)
+ * @nr_write_queues: number of queues for write I/O
+ * @nr_poll_queues: number of queues for polling I/O
+ * @tos: type of service
+ * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
+ */
+struct nvmf_ctrl_options {
+	unsigned		mask;
+	char			*transport;
+	char			*subsysnqn;
+	char			*traddr;
+	char			*trsvcid;
+	char			*host_traddr;
+	char			*host_iface;
+	size_t			queue_size;
+	unsigned int		nr_io_queues;
+	unsigned int		reconnect_delay;
+	bool			discovery_nqn;
+	bool			duplicate_connect;
+	unsigned int		kato;
+	struct nvmf_host	*host;
+	int			max_reconnects;
+	bool			disable_sqflow;
+	bool			hdr_digest;
+	bool			data_digest;
+	unsigned int		nr_write_queues;
+	unsigned int		nr_poll_queues;
+	int			tos;
+	int			fast_io_fail_tmo;
+};
+
+/*
+ * struct nvmf_transport_ops - used to register a specific
+ *			fabric implementation of NVMe fabrics.
+ * @entry:		Used by the fabrics library to add the new
+ *			registration entry to its linked-list internal tree.
+ * @module:		Transport module reference
+ * @name:		Name of the NVMe fabric driver implementation.
+ * @required_opts:	sysfs command-line options that must be specified
+ *			when adding a new NVMe controller.
+ * @allowed_opts:	sysfs command-line options that can be specified
+ *			when adding a new NVMe controller.
+ * @create_ctrl():	function pointer that points to a non-NVMe
+ *			implementation-specific fabric technology
+ *			that would go into starting up that fabric
+ *			for the purpose of connection to an NVMe controller
+ *			using that fabric technology.
+ *
+ * Notes:
+ *	1. At minimum, 'required_opts' and 'allowed_opts' should
+ *	   be set to the same enum parsing options defined earlier.
+ *	2. create_ctrl() must be defined (even if it does nothing)
+ *	3. struct nvmf_transport_ops must be statically allocated in the
+ *	   module's .bss section so that a pure module_get on @module
+ *	   prevents the memory from being freed.
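+ *
+ * Minimal registration sketch (a hypothetical "foo" transport; the real
+ * users of this API are the rdma, tcp, fc and loop transports):
+ *
+ *	static struct nvmf_transport_ops nvme_foo_transport = {
+ *		.name		= "foo",
+ *		.module		= THIS_MODULE,
+ *		.required_opts	= NVMF_OPT_TRADDR,
+ *		.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_CTRL_LOSS_TMO,
+ *		.create_ctrl	= nvme_foo_create_ctrl,
+ *	};
+ *
+ * The transport then calls nvmf_register_transport(&nvme_foo_transport)
+ * from its module init routine and nvmf_unregister_transport() from its
+ * module exit routine.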
+ */ +struct nvmf_transport_ops { + struct list_head entry; + struct module *module; + const char *name; + int required_opts; + int allowed_opts; + struct nvme_ctrl *(*create_ctrl)(struct device *dev, + struct nvmf_ctrl_options *opts); +}; + +static inline bool +nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + if (ctrl->state == NVME_CTRL_DELETING || + ctrl->state == NVME_CTRL_DELETING_NOIO || + ctrl->state == NVME_CTRL_DEAD || + strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || + strcmp(opts->host->nqn, ctrl->opts->host->nqn) || + memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) + return false; + + return true; +} + +static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsys) + return ctrl->opts->subsysnqn; + return ctrl->subsys->subnqn; +} + +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +int nvmf_register_transport(struct nvmf_transport_ops *ops); +void nvmf_unregister_transport(struct nvmf_transport_ops *ops); +void nvmf_free_options(struct nvmf_ctrl_options *opts); +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); +bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); +bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, + struct nvmf_ctrl_options *opts); + +#endif /* _NVME_FABRICS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fault_inject.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fault_inject.c new file mode 100644 index 0000000..83d2e68 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fault_inject.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fault injection support for nvme. 
+ * + * Copyright (c) 2018, Oracle and/or its affiliates + */ + +#include +#include "nvme.h" + +static DECLARE_FAULT_ATTR(fail_default_attr); +/* optional fault injection attributes boot time option: + * nvme_core.fail_request=,,, + */ +static char *fail_request; +module_param(fail_request, charp, 0000); + +void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, + const char *dev_name) +{ + struct dentry *dir, *parent; + struct fault_attr *attr = &fault_inj->attr; + + /* set default fault injection attribute */ + if (fail_request) + setup_fault_attr(&fail_default_attr, fail_request); + + /* create debugfs directory and attribute */ + parent = debugfs_create_dir(dev_name, NULL); + if (!parent) { + pr_warn("%s: failed to create debugfs directory\n", dev_name); + return; + } + + *attr = fail_default_attr; + dir = fault_create_debugfs_attr("fault_inject", parent, attr); + if (IS_ERR(dir)) { + pr_warn("%s: failed to create debugfs attr\n", dev_name); + debugfs_remove_recursive(parent); + return; + } + fault_inj->parent = parent; + + /* create debugfs for status code and dont_retry */ + fault_inj->status = NVME_SC_INVALID_OPCODE; + fault_inj->dont_retry = true; + debugfs_create_x16("status", 0600, dir, &fault_inj->status); + debugfs_create_bool("dont_retry", 0600, dir, &fault_inj->dont_retry); +} + +void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject) +{ + /* remove debugfs directories */ + debugfs_remove_recursive(fault_inject->parent); +} + +void nvme_should_fail(struct request *req) +{ + struct gendisk *disk = req->q->disk; + struct nvme_fault_inject *fault_inject = NULL; + u16 status; + + if (disk) { + struct nvme_ns *ns = disk->private_data; + + if (ns) + fault_inject = &ns->fault_inject; + else + WARN_ONCE(1, "No namespace found for request\n"); + } else { + fault_inject = &nvme_req(req)->ctrl->fault_inject; + } + + if (fault_inject && should_fail(&fault_inject->attr, 1)) { + /* inject status code and DNR bit */ + status = fault_inject->status; + if (fault_inject->dont_retry) + status |= NVME_SC_DNR; + nvme_req(req)->status = status; + } +} +EXPORT_SYMBOL_GPL(nvme_should_fail); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.c new file mode 100644 index 0000000..7acf64a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.c @@ -0,0 +1,4052 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include "nvme.h" +#include "fabrics.h" +#include +#include +#include "fc.h" +#include +#include + +/* *************************** Data Structures/Defines ****************** */ + + +enum nvme_fc_queue_flags { + NVME_FC_Q_CONNECTED = 0, + NVME_FC_Q_LIVE, +}; + +#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ +#define NVME_FC_DEFAULT_RECONNECT_TMO 2 /* delay between reconnects + * when connected and a + * connection failure. 
+ */ + +struct nvme_fc_queue { + struct nvme_fc_ctrl *ctrl; + struct device *dev; + struct blk_mq_hw_ctx *hctx; + void *lldd_handle; + size_t cmnd_capsule_len; + u32 qnum; + u32 rqcnt; + u32 seqno; + + u64 connection_id; + atomic_t csn; + + unsigned long flags; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + +enum nvme_fcop_flags { + FCOP_FLAGS_TERMIO = (1 << 0), + FCOP_FLAGS_AEN = (1 << 1), +}; + +struct nvmefc_ls_req_op { + struct nvmefc_ls_req ls_req; + + struct nvme_fc_rport *rport; + struct nvme_fc_queue *queue; + struct request *rq; + u32 flags; + + int ls_error; + struct completion ls_done; + struct list_head lsreq_list; /* rport->ls_req_list */ + bool req_queued; +}; + +struct nvmefc_ls_rcv_op { + struct nvme_fc_rport *rport; + struct nvmefc_ls_rsp *lsrsp; + union nvmefc_ls_requests *rqstbuf; + union nvmefc_ls_responses *rspbuf; + u16 rqstdatalen; + bool handled; + dma_addr_t rspdma; + struct list_head lsrcv_list; /* rport->ls_rcv_list */ +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + +enum nvme_fcpop_state { + FCPOP_STATE_UNINIT = 0, + FCPOP_STATE_IDLE = 1, + FCPOP_STATE_ACTIVE = 2, + FCPOP_STATE_ABORTED = 3, + FCPOP_STATE_COMPLETE = 4, +}; + +struct nvme_fc_fcp_op { + struct nvme_request nreq; /* + * nvme/host/core.c + * requires this to be + * the 1st element in the + * private structure + * associated with the + * request. + */ + struct nvmefc_fcp_req fcp_req; + + struct nvme_fc_ctrl *ctrl; + struct nvme_fc_queue *queue; + struct request *rq; + + atomic_t state; + u32 flags; + u32 rqno; + u32 nents; + + struct nvme_fc_cmd_iu cmd_iu; + struct nvme_fc_ersp_iu rsp_iu; +}; + +struct nvme_fcp_op_w_sgl { + struct nvme_fc_fcp_op op; + struct scatterlist sgl[NVME_INLINE_SG_CNT]; + uint8_t priv[]; +}; + +struct nvme_fc_lport { + struct nvme_fc_local_port localport; + + struct ida endp_cnt; + struct list_head port_list; /* nvme_fc_port_list */ + struct list_head endp_list; + struct device *dev; /* physical device for dma */ + struct nvme_fc_port_template *ops; + struct kref ref; + atomic_t act_rport_cnt; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + +struct nvme_fc_rport { + struct nvme_fc_remote_port remoteport; + + struct list_head endp_list; /* for lport->endp_list */ + struct list_head ctrl_list; + struct list_head ls_req_list; + struct list_head ls_rcv_list; + struct list_head disc_list; + struct device *dev; /* physical device for dma */ + struct nvme_fc_lport *lport; + spinlock_t lock; + struct kref ref; + atomic_t act_ctrl_cnt; + unsigned long dev_loss_end; + struct work_struct lsrcv_work; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + +/* fc_ctrl flags values - specified as bit positions */ +#define ASSOC_ACTIVE 0 +#define ASSOC_FAILED 1 +#define FCCTRL_TERMIO 2 + +struct nvme_fc_ctrl { + spinlock_t lock; + struct nvme_fc_queue *queues; + struct device *dev; + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + u32 cnum; + + bool ioq_live; + u64 association_id; + struct nvmefc_ls_rcv_op *rcv_disconn; + + struct list_head ctrl_list; /* rport->ctrl_list */ + + struct blk_mq_tag_set admin_tag_set; + struct blk_mq_tag_set tag_set; + + struct work_struct ioerr_work; + struct delayed_work connect_work; + + struct kref ref; + unsigned long flags; + u32 iocnt; + wait_queue_head_t ioabort_wait; + + struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS]; + + struct nvme_ctrl ctrl; +}; + +static inline struct nvme_fc_ctrl * +to_fc_ctrl(struct nvme_ctrl *ctrl) +{ + return 
container_of(ctrl, struct nvme_fc_ctrl, ctrl); +} + +static inline struct nvme_fc_lport * +localport_to_lport(struct nvme_fc_local_port *portptr) +{ + return container_of(portptr, struct nvme_fc_lport, localport); +} + +static inline struct nvme_fc_rport * +remoteport_to_rport(struct nvme_fc_remote_port *portptr) +{ + return container_of(portptr, struct nvme_fc_rport, remoteport); +} + +static inline struct nvmefc_ls_req_op * +ls_req_to_lsop(struct nvmefc_ls_req *lsreq) +{ + return container_of(lsreq, struct nvmefc_ls_req_op, ls_req); +} + +static inline struct nvme_fc_fcp_op * +fcp_req_to_fcp_op(struct nvmefc_fcp_req *fcpreq) +{ + return container_of(fcpreq, struct nvme_fc_fcp_op, fcp_req); +} + + + +/* *************************** Globals **************************** */ + + +static DEFINE_SPINLOCK(nvme_fc_lock); + +static LIST_HEAD(nvme_fc_lport_list); +static DEFINE_IDA(nvme_fc_local_port_cnt); +static DEFINE_IDA(nvme_fc_ctrl_cnt); + +static struct workqueue_struct *nvme_fc_wq; + +static bool nvme_fc_waiting_to_unload; +static DECLARE_COMPLETION(nvme_fc_unload_proceed); + +/* + * These items are short-term. They will eventually be moved into + * a generic FC class. See comments in module init. + */ +static struct device *fc_udev_device; + +static void nvme_fc_complete_rq(struct request *rq); + +/* *********************** FC-NVME Port Management ************************ */ + +static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, + struct nvme_fc_queue *, unsigned int); + +static void nvme_fc_handle_ls_rqst_work(struct work_struct *work); + + +static void +nvme_fc_free_lport(struct kref *ref) +{ + struct nvme_fc_lport *lport = + container_of(ref, struct nvme_fc_lport, ref); + unsigned long flags; + + WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&lport->endp_list)); + + /* remove from transport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&lport->port_list); + if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list)) + complete(&nvme_fc_unload_proceed); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); + ida_destroy(&lport->endp_cnt); + + put_device(lport->dev); + + kfree(lport); +} + +static void +nvme_fc_lport_put(struct nvme_fc_lport *lport) +{ + kref_put(&lport->ref, nvme_fc_free_lport); +} + +static int +nvme_fc_lport_get(struct nvme_fc_lport *lport) +{ + return kref_get_unless_zero(&lport->ref); +} + + +static struct nvme_fc_lport * +nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *ops, + struct device *dev) +{ + struct nvme_fc_lport *lport; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + if (lport->localport.node_name != pinfo->node_name || + lport->localport.port_name != pinfo->port_name) + continue; + + if (lport->dev != dev) { + lport = ERR_PTR(-EXDEV); + goto out_done; + } + + if (lport->localport.port_state != FC_OBJSTATE_DELETED) { + lport = ERR_PTR(-EEXIST); + goto out_done; + } + + if (!nvme_fc_lport_get(lport)) { + /* + * fails if ref cnt already 0. 
If so, + * act as if lport already deleted + */ + lport = NULL; + goto out_done; + } + + /* resume the lport */ + + lport->ops = ops; + lport->localport.port_role = pinfo->port_role; + lport->localport.port_id = pinfo->port_id; + lport->localport.port_state = FC_OBJSTATE_ONLINE; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; + } + + lport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return lport; +} + +/** + * nvme_fc_register_localport - transport entry point called by an + * LLDD to register the existence of a NVME + * host FC port. + * @pinfo: pointer to information about the port to be registered + * @template: LLDD entrypoints and operational parameters for the port + * @dev: physical hardware device node port corresponds to. Will be + * used for DMA mappings + * @portptr: pointer to a local port pointer. Upon success, the routine + * will allocate a nvme_fc_local_port structure and place its + * address in the local port pointer. Upon failure, local port + * pointer will be set to 0. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *template, + struct device *dev, + struct nvme_fc_local_port **portptr) +{ + struct nvme_fc_lport *newrec; + unsigned long flags; + int ret, idx; + + if (!template->localport_delete || !template->remoteport_delete || + !template->ls_req || !template->fcp_io || + !template->ls_abort || !template->fcp_abort || + !template->max_hw_queues || !template->max_sgl_segments || + !template->max_dif_sgl_segments || !template->dma_boundary) { + ret = -EINVAL; + goto out_reghost_failed; + } + + /* + * look to see if there is already a localport that had been + * deregistered and in the process of waiting for all the + * references to fully be removed. If the references haven't + * expired, we can simply re-enable the localport. Remoteports + * and controller reconnections should resume naturally. 
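+ *
+ * (For orientation: an LLDD typically calls nvme_fc_register_localport()
+ * at HBA probe time, nvme_fc_register_remoteport() as NVMe-capable remote
+ * ports are discovered, and the matching unregister calls on teardown; the
+ * discovery uevent raised for new remote ports lets user space, e.g. the
+ * nvme-cli udev rules, trigger the actual connects.)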
+ */ + newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev); + + /* found an lport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_reghost_failed; + + /* found existing lport, which was resumed */ + } else if (newrec) { + *portptr = &newrec->localport; + return 0; + } + + /* nothing found - allocate a new localport struct */ + + newrec = kmalloc((sizeof(*newrec) + template->local_priv_sz), + GFP_KERNEL); + if (!newrec) { + ret = -ENOMEM; + goto out_reghost_failed; + } + + idx = ida_simple_get(&nvme_fc_local_port_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_fail_kfree; + } + + if (!get_device(dev) && dev) { + ret = -ENODEV; + goto out_ida_put; + } + + INIT_LIST_HEAD(&newrec->port_list); + INIT_LIST_HEAD(&newrec->endp_list); + kref_init(&newrec->ref); + atomic_set(&newrec->act_rport_cnt, 0); + newrec->ops = template; + newrec->dev = dev; + ida_init(&newrec->endp_cnt); + if (template->local_priv_sz) + newrec->localport.private = &newrec[1]; + else + newrec->localport.private = NULL; + newrec->localport.node_name = pinfo->node_name; + newrec->localport.port_name = pinfo->port_name; + newrec->localport.port_role = pinfo->port_role; + newrec->localport.port_id = pinfo->port_id; + newrec->localport.port_state = FC_OBJSTATE_ONLINE; + newrec->localport.port_num = idx; + + spin_lock_irqsave(&nvme_fc_lock, flags); + list_add_tail(&newrec->port_list, &nvme_fc_lport_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + if (dev) + dma_set_seg_boundary(dev, template->dma_boundary); + + *portptr = &newrec->localport; + return 0; + +out_ida_put: + ida_simple_remove(&nvme_fc_local_port_cnt, idx); +out_fail_kfree: + kfree(newrec); +out_reghost_failed: + *portptr = NULL; + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_fc_register_localport); + +/** + * nvme_fc_unregister_localport - transport entry point called by an + * LLDD to deregister/remove a previously + * registered a NVME host FC port. + * @portptr: pointer to the (registered) local port that is to be deregistered. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. 
+ */ +int +nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) +{ + struct nvme_fc_lport *lport = localport_to_lport(portptr); + unsigned long flags; + + if (!portptr) + return -EINVAL; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&nvme_fc_lock, flags); + return -EINVAL; + } + portptr->port_state = FC_OBJSTATE_DELETED; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + if (atomic_read(&lport->act_rport_cnt) == 0) + lport->ops->localport_delete(&lport->localport); + + nvme_fc_lport_put(lport); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); + +/* + * TRADDR strings, per FC-NVME are fixed format: + * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters + * udev event will only differ by prefix of what field is + * being specified: + * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters + * 19 + 43 + null_fudge = 64 characters + */ +#define FCNVME_TRADDR_LENGTH 64 + +static void +nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, + struct nvme_fc_rport *rport) +{ + char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/ + char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/ + char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL }; + + if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY)) + return; + + snprintf(hostaddr, sizeof(hostaddr), + "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx", + lport->localport.node_name, lport->localport.port_name); + snprintf(tgtaddr, sizeof(tgtaddr), + "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx", + rport->remoteport.node_name, rport->remoteport.port_name); + kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); +} + +static void +nvme_fc_free_rport(struct kref *ref) +{ + struct nvme_fc_rport *rport = + container_of(ref, struct nvme_fc_rport, ref); + struct nvme_fc_lport *lport = + localport_to_lport(rport->remoteport.localport); + unsigned long flags; + + WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&rport->ctrl_list)); + + /* remove from lport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&rport->endp_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + WARN_ON(!list_empty(&rport->disc_list)); + ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); + + kfree(rport); + + nvme_fc_lport_put(lport); +} + +static void +nvme_fc_rport_put(struct nvme_fc_rport *rport) +{ + kref_put(&rport->ref, nvme_fc_free_rport); +} + +static int +nvme_fc_rport_get(struct nvme_fc_rport *rport) +{ + return kref_get_unless_zero(&rport->ref); +} + +static void +nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) +{ + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_CONNECTING: + /* + * As all reconnects were suppressed, schedule a + * connect. + */ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: connectivity re-established. " + "Attempting reconnect\n", ctrl->cnum); + + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0); + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will naturally occur after the reset completes. 
+ */ + break; + + default: + /* no action to take - let it delete */ + break; + } +} + +static struct nvme_fc_rport * +nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport, + struct nvme_fc_port_info *pinfo) +{ + struct nvme_fc_rport *rport; + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (rport->remoteport.node_name != pinfo->node_name || + rport->remoteport.port_name != pinfo->port_name) + continue; + + if (!nvme_fc_rport_get(rport)) { + rport = ERR_PTR(-ENOLCK); + goto out_done; + } + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + spin_lock_irqsave(&rport->lock, flags); + + /* has it been unregistered */ + if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) { + /* means lldd called us twice */ + spin_unlock_irqrestore(&rport->lock, flags); + nvme_fc_rport_put(rport); + return ERR_PTR(-ESTALE); + } + + rport->remoteport.port_role = pinfo->port_role; + rport->remoteport.port_id = pinfo->port_id; + rport->remoteport.port_state = FC_OBJSTATE_ONLINE; + rport->dev_loss_end = 0; + + /* + * kick off a reconnect attempt on all associations to the + * remote port. A successful reconnects will resume i/o. + */ + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) + nvme_fc_resume_controller(ctrl); + + spin_unlock_irqrestore(&rport->lock, flags); + + return rport; + } + + rport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return rport; +} + +static inline void +__nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport, + struct nvme_fc_port_info *pinfo) +{ + if (pinfo->dev_loss_tmo) + rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; +} + +/** + * nvme_fc_register_remoteport - transport entry point called by an + * LLDD to register the existence of a NVME + * subsystem FC port on its fabric. + * @localport: pointer to the (registered) local port that the remote + * subsystem port is connected to. + * @pinfo: pointer to information about the port to be registered + * @portptr: pointer to a remote port pointer. Upon success, the routine + * will allocate a nvme_fc_remote_port structure and place its + * address in the remote port pointer. Upon failure, remote port + * pointer will be set to 0. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, + struct nvme_fc_port_info *pinfo, + struct nvme_fc_remote_port **portptr) +{ + struct nvme_fc_lport *lport = localport_to_lport(localport); + struct nvme_fc_rport *newrec; + unsigned long flags; + int ret, idx; + + if (!nvme_fc_lport_get(lport)) { + ret = -ESHUTDOWN; + goto out_reghost_failed; + } + + /* + * look to see if there is already a remoteport that is waiting + * for a reconnect (within dev_loss_tmo) with the same WWN's. + * If so, transition to it and reconnect. 
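+ * (This reconnect window is the one armed by nvme_fc_unregister_remoteport(),
+ * which records dev_loss_end = jiffies + dev_loss_tmo * HZ when the remote
+ * port goes away.)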
+ */ + newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo); + + /* found an rport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_lport_put; + + /* found existing rport, which was resumed */ + } else if (newrec) { + nvme_fc_lport_put(lport); + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; + return 0; + } + + /* nothing found - allocate a new remoteport struct */ + + newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), + GFP_KERNEL); + if (!newrec) { + ret = -ENOMEM; + goto out_lport_put; + } + + idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_kfree_rport; + } + + INIT_LIST_HEAD(&newrec->endp_list); + INIT_LIST_HEAD(&newrec->ctrl_list); + INIT_LIST_HEAD(&newrec->ls_req_list); + INIT_LIST_HEAD(&newrec->disc_list); + kref_init(&newrec->ref); + atomic_set(&newrec->act_ctrl_cnt, 0); + spin_lock_init(&newrec->lock); + newrec->remoteport.localport = &lport->localport; + INIT_LIST_HEAD(&newrec->ls_rcv_list); + newrec->dev = lport->dev; + newrec->lport = lport; + if (lport->ops->remote_priv_sz) + newrec->remoteport.private = &newrec[1]; + else + newrec->remoteport.private = NULL; + newrec->remoteport.port_role = pinfo->port_role; + newrec->remoteport.node_name = pinfo->node_name; + newrec->remoteport.port_name = pinfo->port_name; + newrec->remoteport.port_id = pinfo->port_id; + newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; + newrec->remoteport.port_num = idx; + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + INIT_WORK(&newrec->lsrcv_work, nvme_fc_handle_ls_rqst_work); + + spin_lock_irqsave(&nvme_fc_lock, flags); + list_add_tail(&newrec->endp_list, &lport->endp_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + nvme_fc_signal_discovery_scan(lport, newrec); + + *portptr = &newrec->remoteport; + return 0; + +out_kfree_rport: + kfree(newrec); +out_lport_put: + nvme_fc_lport_put(lport); +out_reghost_failed: + *portptr = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); + +static int +nvme_fc_abort_lsops(struct nvme_fc_rport *rport) +{ + struct nvmefc_ls_req_op *lsop; + unsigned long flags; + +restart: + spin_lock_irqsave(&rport->lock, flags); + + list_for_each_entry(lsop, &rport->ls_req_list, lsreq_list) { + if (!(lsop->flags & FCOP_FLAGS_TERMIO)) { + lsop->flags |= FCOP_FLAGS_TERMIO; + spin_unlock_irqrestore(&rport->lock, flags); + rport->lport->ops->ls_abort(&rport->lport->localport, + &rport->remoteport, + &lsop->ls_req); + goto restart; + } + } + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} + +static void +nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) +{ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. Awaiting " + "Reconnect", ctrl->cnum); + + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + /* + * Schedule a controller reset. The reset will terminate the + * association and schedule the reconnect timer. Reconnects + * will be attempted until either the ctlr_loss_tmo + * (max_retries * connect_delay) expires or the remoteport's + * dev_loss_tmo expires. + */ + if (nvme_reset_ctrl(&ctrl->ctrl)) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Couldn't schedule reset.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + break; + + case NVME_CTRL_CONNECTING: + /* + * The association has already been terminated and the + * controller is attempting reconnects. 
No need to do anything + * futher. Reconnects will be attempted until either the + * ctlr_loss_tmo (max_retries * connect_delay) expires or the + * remoteport's dev_loss_tmo expires. + */ + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will kick in naturally after the association is + * terminated. + */ + break; + + case NVME_CTRL_DELETING: + case NVME_CTRL_DELETING_NOIO: + default: + /* no action to take - let it delete */ + break; + } +} + +/** + * nvme_fc_unregister_remoteport - transport entry point called by an + * LLDD to deregister/remove a previously + * registered a NVME subsystem FC port. + * @portptr: pointer to the (registered) remote port that is to be + * deregistered. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + if (!portptr) + return -EINVAL; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + portptr->port_state = FC_OBJSTATE_DELETED; + + rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ); + + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + /* if dev_loss_tmo==0, dev loss is immediate */ + if (!portptr->dev_loss_tmo) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } else + nvme_fc_ctrl_connectivity_loss(ctrl); + } + + spin_unlock_irqrestore(&rport->lock, flags); + + nvme_fc_abort_lsops(rport); + + if (atomic_read(&rport->act_ctrl_cnt) == 0) + rport->lport->ops->remoteport_delete(portptr); + + /* + * release the reference, which will allow, if all controllers + * go away, which should only occur after dev_loss_tmo occurs, + * for the rport to be torn down. + */ + nvme_fc_rport_put(rport); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); + +/** + * nvme_fc_rescan_remoteport - transport entry point called by an + * LLDD to request a nvme device rescan. + * @remoteport: pointer to the (registered) remote port that is to be + * rescanned. + * + * Returns: N/A + */ +void +nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(remoteport); + + nvme_fc_signal_discovery_scan(rport->lport, rport); +} +EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); + +int +nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, + u32 dev_loss_tmo) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + + /* a dev_loss_tmo of 0 (immediate) is allowed to be set */ + rport->remoteport.dev_loss_tmo = dev_loss_tmo; + + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); + + +/* *********************** FC-NVME DMA Handling **************************** */ + +/* + * The fcloop device passes in a NULL device pointer. Real LLD's will + * pass in a valid device pointer. 
If NULL is passed to the dma mapping + * routines, depending on the platform, it may or may not succeed, and + * may crash. + * + * As such: + * Wrapper all the dma routines and check the dev pointer. + * + * If simple mappings (return just a dma address, we'll noop them, + * returning a dma address of 0. + * + * On more complex mappings (dma_map_sg), a pseudo routine fills + * in the scatter list, setting all dma addresses to 0. + */ + +static inline dma_addr_t +fc_dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction dir) +{ + return dev ? dma_map_single(dev, ptr, size, dir) : (dma_addr_t)0L; +} + +static inline int +fc_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dev ? dma_mapping_error(dev, dma_addr) : 0; +} + +static inline void +fc_dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_single(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_device(dev, addr, size, dir); +} + +/* pseudo dma_map_sg call */ +static int +fc_map_sg(struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + int i; + + WARN_ON(nents == 0 || sg[0].length == 0); + + for_each_sg(sg, s, nents, i) { + s->dma_address = 0L; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + s->dma_length = s->length; +#endif + } + return nents; +} + +static inline int +fc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + return dev ? 
dma_map_sg(dev, sg, nents, dir) : fc_map_sg(sg, nents); +} + +static inline void +fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_sg(dev, sg, nents, dir); +} + +/* *********************** FC-NVME LS Handling **************************** */ + +static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); +static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *); + +static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); + +static void +__nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop) +{ + struct nvme_fc_rport *rport = lsop->rport; + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (!lsop->req_queued) { + spin_unlock_irqrestore(&rport->lock, flags); + return; + } + + list_del(&lsop->lsreq_list); + + lsop->req_queued = false; + + spin_unlock_irqrestore(&rport->lock, flags); + + fc_dma_unmap_single(rport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); + + nvme_fc_rport_put(rport); +} + +static int +__nvme_fc_send_ls_req(struct nvme_fc_rport *rport, + struct nvmefc_ls_req_op *lsop, + void (*done)(struct nvmefc_ls_req *req, int status)) +{ + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + unsigned long flags; + int ret = 0; + + if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return -ECONNREFUSED; + + if (!nvme_fc_rport_get(rport)) + return -ESHUTDOWN; + + lsreq->done = done; + lsop->rport = rport; + lsop->req_queued = false; + INIT_LIST_HEAD(&lsop->lsreq_list); + init_completion(&lsop->ls_done); + + lsreq->rqstdma = fc_dma_map_single(rport->dev, lsreq->rqstaddr, + lsreq->rqstlen + lsreq->rsplen, + DMA_BIDIRECTIONAL); + if (fc_dma_mapping_error(rport->dev, lsreq->rqstdma)) { + ret = -EFAULT; + goto out_putrport; + } + lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen; + + spin_lock_irqsave(&rport->lock, flags); + + list_add_tail(&lsop->lsreq_list, &rport->ls_req_list); + + lsop->req_queued = true; + + spin_unlock_irqrestore(&rport->lock, flags); + + ret = rport->lport->ops->ls_req(&rport->lport->localport, + &rport->remoteport, lsreq); + if (ret) + goto out_unlink; + + return 0; + +out_unlink: + lsop->ls_error = ret; + spin_lock_irqsave(&rport->lock, flags); + lsop->req_queued = false; + list_del(&lsop->lsreq_list); + spin_unlock_irqrestore(&rport->lock, flags); + fc_dma_unmap_single(rport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); +out_putrport: + nvme_fc_rport_put(rport); + + return ret; +} + +static void +nvme_fc_send_ls_req_done(struct nvmefc_ls_req *lsreq, int status) +{ + struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq); + + lsop->ls_error = status; + complete(&lsop->ls_done); +} + +static int +nvme_fc_send_ls_req(struct nvme_fc_rport *rport, struct nvmefc_ls_req_op *lsop) +{ + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + struct fcnvme_ls_rjt *rjt = lsreq->rspaddr; + int ret; + + ret = __nvme_fc_send_ls_req(rport, lsop, nvme_fc_send_ls_req_done); + + if (!ret) { + /* + * No timeout/not interruptible as we need the struct + * to exist until the lldd calls us back. Thus mandate + * wait until driver calls back. lldd responsible for + * the timeout action + */ + wait_for_completion(&lsop->ls_done); + + __nvme_fc_finish_ls_req(lsop); + + ret = lsop->ls_error; + } + + if (ret) + return ret; + + /* ACC or RJT payload ? 
*/ + if (rjt->w0.ls_cmd == FCNVME_LS_RJT) + return -ENXIO; + + return 0; +} + +static int +nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport, + struct nvmefc_ls_req_op *lsop, + void (*done)(struct nvmefc_ls_req *req, int status)) +{ + /* don't wait for completion */ + + return __nvme_fc_send_ls_req(rport, lsop, done); +} + +static int +nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, u16 qsize, u16 ersp_ratio) +{ + struct nvmefc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + struct fcnvme_ls_cr_assoc_rqst *assoc_rqst; + struct fcnvme_ls_cr_assoc_acc *assoc_acc; + unsigned long flags; + int ret, fcret = 0; + + lsop = kzalloc((sizeof(*lsop) + + sizeof(*assoc_rqst) + sizeof(*assoc_acc) + + ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL); + if (!lsop) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: send Create Association failed: ENOMEM\n", + ctrl->cnum); + ret = -ENOMEM; + goto out_no_memory; + } + + assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *)&lsop[1]; + assoc_acc = (struct fcnvme_ls_cr_assoc_acc *)&assoc_rqst[1]; + lsreq = &lsop->ls_req; + if (ctrl->lport->ops->lsrqst_priv_sz) + lsreq->private = &assoc_acc[1]; + else + lsreq->private = NULL; + + assoc_rqst->w0.ls_cmd = FCNVME_LS_CREATE_ASSOCIATION; + assoc_rqst->desc_list_len = + cpu_to_be32(sizeof(struct fcnvme_lsdesc_cr_assoc_cmd)); + + assoc_rqst->assoc_cmd.desc_tag = + cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD); + assoc_rqst->assoc_cmd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_cr_assoc_cmd)); + + assoc_rqst->assoc_cmd.ersp_ratio = cpu_to_be16(ersp_ratio); + assoc_rqst->assoc_cmd.sqsize = cpu_to_be16(qsize - 1); + /* Linux supports only Dynamic controllers */ + assoc_rqst->assoc_cmd.cntlid = cpu_to_be16(0xffff); + uuid_copy(&assoc_rqst->assoc_cmd.hostid, &ctrl->ctrl.opts->host->id); + strncpy(assoc_rqst->assoc_cmd.hostnqn, ctrl->ctrl.opts->host->nqn, + min(FCNVME_ASSOC_HOSTNQN_LEN, NVMF_NQN_SIZE)); + strncpy(assoc_rqst->assoc_cmd.subnqn, ctrl->ctrl.opts->subsysnqn, + min(FCNVME_ASSOC_SUBNQN_LEN, NVMF_NQN_SIZE)); + + lsop->queue = queue; + lsreq->rqstaddr = assoc_rqst; + lsreq->rqstlen = sizeof(*assoc_rqst); + lsreq->rspaddr = assoc_acc; + lsreq->rsplen = sizeof(*assoc_acc); + lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; + + ret = nvme_fc_send_ls_req(ctrl->rport, lsop); + if (ret) + goto out_free_buffer; + + /* process connect LS completion */ + + /* validate the ACC response */ + if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC) + fcret = VERR_LSACC; + else if (assoc_acc->hdr.desc_list_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_cr_assoc_acc))) + fcret = VERR_CR_ASSOC_ACC_LEN; + else if (assoc_acc->hdr.rqst.desc_tag != + cpu_to_be32(FCNVME_LSDESC_RQST)) + fcret = VERR_LSDESC_RQST; + else if (assoc_acc->hdr.rqst.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst))) + fcret = VERR_LSDESC_RQST_LEN; + else if (assoc_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_ASSOCIATION) + fcret = VERR_CR_ASSOC; + else if (assoc_acc->associd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) + fcret = VERR_ASSOC_ID; + else if (assoc_acc->associd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id))) + fcret = VERR_ASSOC_ID_LEN; + else if (assoc_acc->connectid.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CONN_ID)) + fcret = VERR_CONN_ID; + else if (assoc_acc->connectid.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_conn_id))) + fcret = VERR_CONN_ID_LEN; + + if (fcret) { + ret = -EBADF; + dev_err(ctrl->dev, + "q %d Create Association LS failed: 
%s\n", + queue->qnum, validation_errors[fcret]); + } else { + spin_lock_irqsave(&ctrl->lock, flags); + ctrl->association_id = + be64_to_cpu(assoc_acc->associd.association_id); + queue->connection_id = + be64_to_cpu(assoc_acc->connectid.connection_id); + set_bit(NVME_FC_Q_CONNECTED, &queue->flags); + spin_unlock_irqrestore(&ctrl->lock, flags); + } + +out_free_buffer: + kfree(lsop); +out_no_memory: + if (ret) + dev_err(ctrl->dev, + "queue %d connect admin queue failed (%d).\n", + queue->qnum, ret); + return ret; +} + +static int +nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, + u16 qsize, u16 ersp_ratio) +{ + struct nvmefc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + struct fcnvme_ls_cr_conn_rqst *conn_rqst; + struct fcnvme_ls_cr_conn_acc *conn_acc; + int ret, fcret = 0; + + lsop = kzalloc((sizeof(*lsop) + + sizeof(*conn_rqst) + sizeof(*conn_acc) + + ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL); + if (!lsop) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: send Create Connection failed: ENOMEM\n", + ctrl->cnum); + ret = -ENOMEM; + goto out_no_memory; + } + + conn_rqst = (struct fcnvme_ls_cr_conn_rqst *)&lsop[1]; + conn_acc = (struct fcnvme_ls_cr_conn_acc *)&conn_rqst[1]; + lsreq = &lsop->ls_req; + if (ctrl->lport->ops->lsrqst_priv_sz) + lsreq->private = (void *)&conn_acc[1]; + else + lsreq->private = NULL; + + conn_rqst->w0.ls_cmd = FCNVME_LS_CREATE_CONNECTION; + conn_rqst->desc_list_len = cpu_to_be32( + sizeof(struct fcnvme_lsdesc_assoc_id) + + sizeof(struct fcnvme_lsdesc_cr_conn_cmd)); + + conn_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); + conn_rqst->associd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id)); + conn_rqst->associd.association_id = cpu_to_be64(ctrl->association_id); + conn_rqst->connect_cmd.desc_tag = + cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD); + conn_rqst->connect_cmd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_cr_conn_cmd)); + conn_rqst->connect_cmd.ersp_ratio = cpu_to_be16(ersp_ratio); + conn_rqst->connect_cmd.qid = cpu_to_be16(queue->qnum); + conn_rqst->connect_cmd.sqsize = cpu_to_be16(qsize - 1); + + lsop->queue = queue; + lsreq->rqstaddr = conn_rqst; + lsreq->rqstlen = sizeof(*conn_rqst); + lsreq->rspaddr = conn_acc; + lsreq->rsplen = sizeof(*conn_acc); + lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; + + ret = nvme_fc_send_ls_req(ctrl->rport, lsop); + if (ret) + goto out_free_buffer; + + /* process connect LS completion */ + + /* validate the ACC response */ + if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC) + fcret = VERR_LSACC; + else if (conn_acc->hdr.desc_list_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc))) + fcret = VERR_CR_CONN_ACC_LEN; + else if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST)) + fcret = VERR_LSDESC_RQST; + else if (conn_acc->hdr.rqst.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst))) + fcret = VERR_LSDESC_RQST_LEN; + else if (conn_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_CONNECTION) + fcret = VERR_CR_CONN; + else if (conn_acc->connectid.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CONN_ID)) + fcret = VERR_CONN_ID; + else if (conn_acc->connectid.desc_len != + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_conn_id))) + fcret = VERR_CONN_ID_LEN; + + if (fcret) { + ret = -EBADF; + dev_err(ctrl->dev, + "q %d Create I/O Connection LS failed: %s\n", + queue->qnum, validation_errors[fcret]); + } else { + queue->connection_id = + be64_to_cpu(conn_acc->connectid.connection_id); + set_bit(NVME_FC_Q_CONNECTED, 
&queue->flags); + } + +out_free_buffer: + kfree(lsop); +out_no_memory: + if (ret) + dev_err(ctrl->dev, + "queue %d connect I/O queue failed (%d).\n", + queue->qnum, ret); + return ret; +} + +static void +nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status) +{ + struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq); + + __nvme_fc_finish_ls_req(lsop); + + /* fc-nvme initiator doesn't care about success or failure of cmd */ + + kfree(lsop); +} + +/* + * This routine sends a FC-NVME LS to disconnect (aka terminate) + * the FC-NVME Association. Terminating the association also + * terminates the FC-NVME connections (per queue, both admin and io + * queues) that are part of the association. E.g. things are torn + * down, and the related FC-NVME Association ID and Connection IDs + * become invalid. + * + * The behavior of the fc-nvme initiator is such that it's + * understanding of the association and connections will implicitly + * be torn down. The action is implicit as it may be due to a loss of + * connectivity with the fc-nvme target, so you may never get a + * response even if you tried. As such, the action of this routine + * is to asynchronously send the LS, ignore any results of the LS, and + * continue on with terminating the association. If the fc-nvme target + * is present and receives the LS, it too can tear down. + */ +static void +nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) +{ + struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst; + struct fcnvme_ls_disconnect_assoc_acc *discon_acc; + struct nvmefc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + int ret; + + lsop = kzalloc((sizeof(*lsop) + + sizeof(*discon_rqst) + sizeof(*discon_acc) + + ctrl->lport->ops->lsrqst_priv_sz), GFP_KERNEL); + if (!lsop) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: send Disconnect Association " + "failed: ENOMEM\n", + ctrl->cnum); + return; + } + + discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *)&lsop[1]; + discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1]; + lsreq = &lsop->ls_req; + if (ctrl->lport->ops->lsrqst_priv_sz) + lsreq->private = (void *)&discon_acc[1]; + else + lsreq->private = NULL; + + nvmefc_fmt_lsreq_discon_assoc(lsreq, discon_rqst, discon_acc, + ctrl->association_id); + + ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop, + nvme_fc_disconnect_assoc_done); + if (ret) + kfree(lsop); +} + +static void +nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +{ + struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private; + struct nvme_fc_rport *rport = lsop->rport; + struct nvme_fc_lport *lport = rport->lport; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + list_del(&lsop->lsrcv_list); + spin_unlock_irqrestore(&rport->lock, flags); + + fc_dma_sync_single_for_cpu(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + fc_dma_unmap_single(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + + kfree(lsop); + + nvme_fc_rport_put(rport); +} + +static void +nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop) +{ + struct nvme_fc_rport *rport = lsop->rport; + struct nvme_fc_lport *lport = rport->lport; + struct fcnvme_ls_rqst_w0 *w0 = &lsop->rqstbuf->w0; + int ret; + + fc_dma_sync_single_for_device(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + + ret = lport->ops->xmt_ls_rsp(&lport->localport, &rport->remoteport, + lsop->lsrsp); + if (ret) { + dev_warn(lport->dev, + "LLDD rejected LS RSP xmt: LS %d status %d\n", + w0->ls_cmd, ret); + nvme_fc_xmt_ls_rsp_done(lsop->lsrsp); + 
return; + } +} + +static struct nvme_fc_ctrl * +nvme_fc_match_disconn_ls(struct nvme_fc_rport *rport, + struct nvmefc_ls_rcv_op *lsop) +{ + struct fcnvme_ls_disconnect_assoc_rqst *rqst = + &lsop->rqstbuf->rq_dis_assoc; + struct nvme_fc_ctrl *ctrl, *ret = NULL; + struct nvmefc_ls_rcv_op *oldls = NULL; + u64 association_id = be64_to_cpu(rqst->associd.association_id); + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + if (!nvme_fc_ctrl_get(ctrl)) + continue; + spin_lock(&ctrl->lock); + if (association_id == ctrl->association_id) { + oldls = ctrl->rcv_disconn; + ctrl->rcv_disconn = lsop; + ret = ctrl; + } + spin_unlock(&ctrl->lock); + if (ret) + /* leave the ctrl get reference */ + break; + nvme_fc_ctrl_put(ctrl); + } + + spin_unlock_irqrestore(&rport->lock, flags); + + /* transmit a response for anything that was pending */ + if (oldls) { + dev_info(rport->lport->dev, + "NVME-FC{%d}: Multiple Disconnect Association " + "LS's received\n", ctrl->cnum); + /* overwrite good response with bogus failure */ + oldls->lsrsp->rsplen = nvme_fc_format_rjt(oldls->rspbuf, + sizeof(*oldls->rspbuf), + rqst->w0.ls_cmd, + FCNVME_RJT_RC_UNAB, + FCNVME_RJT_EXP_NONE, 0); + nvme_fc_xmt_ls_rsp(oldls); + } + + return ret; +} + +/* + * returns true to mean LS handled and ls_rsp can be sent + * returns false to defer ls_rsp xmt (will be done as part of + * association termination) + */ +static bool +nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop) +{ + struct nvme_fc_rport *rport = lsop->rport; + struct fcnvme_ls_disconnect_assoc_rqst *rqst = + &lsop->rqstbuf->rq_dis_assoc; + struct fcnvme_ls_disconnect_assoc_acc *acc = + &lsop->rspbuf->rsp_dis_assoc; + struct nvme_fc_ctrl *ctrl = NULL; + int ret = 0; + + memset(acc, 0, sizeof(*acc)); + + ret = nvmefc_vldt_lsreq_discon_assoc(lsop->rqstdatalen, rqst); + if (!ret) { + /* match an active association */ + ctrl = nvme_fc_match_disconn_ls(rport, lsop); + if (!ctrl) + ret = VERR_NO_ASSOC; + } + + if (ret) { + dev_info(rport->lport->dev, + "Disconnect LS failed: %s\n", + validation_errors[ret]); + lsop->lsrsp->rsplen = nvme_fc_format_rjt(acc, + sizeof(*acc), rqst->w0.ls_cmd, + (ret == VERR_NO_ASSOC) ? + FCNVME_RJT_RC_INV_ASSOC : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return true; + } + + /* format an ACCept response */ + + lsop->lsrsp->rsplen = sizeof(*acc); + + nvme_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_disconnect_assoc_acc)), + FCNVME_LS_DISCONNECT_ASSOC); + + /* + * the transmit of the response will occur after the exchanges + * for the association have been ABTS'd by + * nvme_fc_delete_association(). + */ + + /* fail the association */ + nvme_fc_error_recovery(ctrl, "Disconnect Association LS received"); + + /* release the reference taken by nvme_fc_match_disconn_ls() */ + nvme_fc_ctrl_put(ctrl); + + return false; +} + +/* + * Actual Processing routine for received FC-NVME LS Requests from the LLD + * returns true if a response should be sent afterward, false if rsp will + * be sent asynchronously. + */ +static bool +nvme_fc_handle_ls_rqst(struct nvmefc_ls_rcv_op *lsop) +{ + struct fcnvme_ls_rqst_w0 *w0 = &lsop->rqstbuf->w0; + bool ret = true; + + lsop->lsrsp->nvme_fc_private = lsop; + lsop->lsrsp->rspbuf = lsop->rspbuf; + lsop->lsrsp->rspdma = lsop->rspdma; + lsop->lsrsp->done = nvme_fc_xmt_ls_rsp_done; + /* Be preventative. 
handlers will later set to valid length */ + lsop->lsrsp->rsplen = 0; + + /* + * handlers: + * parse request input, execute the request, and format the + * LS response + */ + switch (w0->ls_cmd) { + case FCNVME_LS_DISCONNECT_ASSOC: + ret = nvme_fc_ls_disconnect_assoc(lsop); + break; + case FCNVME_LS_DISCONNECT_CONN: + lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, + sizeof(*lsop->rspbuf), w0->ls_cmd, + FCNVME_RJT_RC_UNSUP, FCNVME_RJT_EXP_NONE, 0); + break; + case FCNVME_LS_CREATE_ASSOCIATION: + case FCNVME_LS_CREATE_CONNECTION: + lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, + sizeof(*lsop->rspbuf), w0->ls_cmd, + FCNVME_RJT_RC_LOGIC, FCNVME_RJT_EXP_NONE, 0); + break; + default: + lsop->lsrsp->rsplen = nvme_fc_format_rjt(lsop->rspbuf, + sizeof(*lsop->rspbuf), w0->ls_cmd, + FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); + break; + } + + return(ret); +} + +static void +nvme_fc_handle_ls_rqst_work(struct work_struct *work) +{ + struct nvme_fc_rport *rport = + container_of(work, struct nvme_fc_rport, lsrcv_work); + struct fcnvme_ls_rqst_w0 *w0; + struct nvmefc_ls_rcv_op *lsop; + unsigned long flags; + bool sendrsp; + +restart: + sendrsp = true; + spin_lock_irqsave(&rport->lock, flags); + list_for_each_entry(lsop, &rport->ls_rcv_list, lsrcv_list) { + if (lsop->handled) + continue; + + lsop->handled = true; + if (rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + sendrsp = nvme_fc_handle_ls_rqst(lsop); + } else { + spin_unlock_irqrestore(&rport->lock, flags); + w0 = &lsop->rqstbuf->w0; + lsop->lsrsp->rsplen = nvme_fc_format_rjt( + lsop->rspbuf, + sizeof(*lsop->rspbuf), + w0->ls_cmd, + FCNVME_RJT_RC_UNAB, + FCNVME_RJT_EXP_NONE, 0); + } + if (sendrsp) + nvme_fc_xmt_ls_rsp(lsop); + goto restart; + } + spin_unlock_irqrestore(&rport->lock, flags); +} + +/** + * nvme_fc_rcv_ls_req - transport entry point called by an LLDD + * upon the reception of a NVME LS request. + * + * The nvme-fc layer will copy payload to an internal structure for + * processing. As such, upon completion of the routine, the LLDD may + * immediately free/reuse the LS request buffer passed in the call. + * + * If this routine returns error, the LLDD should abort the exchange. + * + * @portptr: pointer to the (registered) remote port that the LS + * was received from. The remoteport is associated with + * a specific localport. + * @lsrsp: pointer to a nvmefc_ls_rsp response structure to be + * used to reference the exchange corresponding to the LS + * when issuing an ls response. + * @lsreqbuf: pointer to the buffer containing the LS Request + * @lsreqbuf_len: length, in bytes, of the received LS request + */ +int +nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, + struct nvmefc_ls_rsp *lsrsp, + void *lsreqbuf, u32 lsreqbuf_len) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_lport *lport = rport->lport; + struct fcnvme_ls_rqst_w0 *w0 = (struct fcnvme_ls_rqst_w0 *)lsreqbuf; + struct nvmefc_ls_rcv_op *lsop; + unsigned long flags; + int ret; + + nvme_fc_rport_get(rport); + + /* validate there's a routine to transmit a response */ + if (!lport->ops->xmt_ls_rsp) { + dev_info(lport->dev, + "RCV %s LS failed: no LLDD xmt_ls_rsp\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -EINVAL; + goto out_put; + } + + if (lsreqbuf_len > sizeof(union nvmefc_ls_requests)) { + dev_info(lport->dev, + "RCV %s LS failed: payload too large\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? 
+ nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -E2BIG; + goto out_put; + } + + lsop = kzalloc(sizeof(*lsop) + + sizeof(union nvmefc_ls_requests) + + sizeof(union nvmefc_ls_responses), + GFP_KERNEL); + if (!lsop) { + dev_info(lport->dev, + "RCV %s LS failed: No memory\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -ENOMEM; + goto out_put; + } + lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1]; + lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1]; + + lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf, + sizeof(*lsop->rspbuf), + DMA_TO_DEVICE); + if (fc_dma_mapping_error(lport->dev, lsop->rspdma)) { + dev_info(lport->dev, + "RCV %s LS failed: DMA mapping failure\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + ret = -EFAULT; + goto out_free; + } + + lsop->rport = rport; + lsop->lsrsp = lsrsp; + + memcpy(lsop->rqstbuf, lsreqbuf, lsreqbuf_len); + lsop->rqstdatalen = lsreqbuf_len; + + spin_lock_irqsave(&rport->lock, flags); + if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + ret = -ENOTCONN; + goto out_unmap; + } + list_add_tail(&lsop->lsrcv_list, &rport->ls_rcv_list); + spin_unlock_irqrestore(&rport->lock, flags); + + schedule_work(&rport->lsrcv_work); + + return 0; + +out_unmap: + fc_dma_unmap_single(lport->dev, lsop->rspdma, + sizeof(*lsop->rspbuf), DMA_TO_DEVICE); +out_free: + kfree(lsop); +out_put: + nvme_fc_rport_put(rport); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_fc_rcv_ls_req); + + +/* *********************** NVME Ctrl Routines **************************** */ + +static void +__nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_fcp_op *op) +{ + fc_dma_unmap_single(ctrl->lport->dev, op->fcp_req.rspdma, + sizeof(op->rsp_iu), DMA_FROM_DEVICE); + fc_dma_unmap_single(ctrl->lport->dev, op->fcp_req.cmddma, + sizeof(op->cmd_iu), DMA_TO_DEVICE); + + atomic_set(&op->state, FCPOP_STATE_UNINIT); +} + +static void +nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx) +{ + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + + return __nvme_fc_exit_request(set->driver_data, op); +} + +static int +__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) +{ + unsigned long flags; + int opstate; + + spin_lock_irqsave(&ctrl->lock, flags); + opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); + if (opstate != FCPOP_STATE_ACTIVE) + atomic_set(&op->state, opstate); + else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) { + op->flags |= FCOP_FLAGS_TERMIO; + ctrl->iocnt++; + } + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (opstate != FCPOP_STATE_ACTIVE) + return -ECANCELED; + + ctrl->lport->ops->fcp_abort(&ctrl->lport->localport, + &ctrl->rport->remoteport, + op->queue->lldd_handle, + &op->fcp_req); + + return 0; +} + +static void +nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops; + int i; + + /* ensure we've initialized the ops once */ + if (!(aen_op->flags & FCOP_FLAGS_AEN)) + return; + + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) + __nvme_fc_abort_op(ctrl, aen_op); +} + +static inline void +__nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_fcp_op *op, int opstate) +{ + unsigned long flags; + + if (opstate == FCPOP_STATE_ABORTED) { + spin_lock_irqsave(&ctrl->lock, flags); + if (test_bit(FCCTRL_TERMIO, &ctrl->flags) && + op->flags & FCOP_FLAGS_TERMIO) { + if (!--ctrl->iocnt) + 
wake_up(&ctrl->ioabort_wait); + } + spin_unlock_irqrestore(&ctrl->lock, flags); + } +} + +static void +nvme_fc_ctrl_ioerr_work(struct work_struct *work) +{ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, ioerr_work); + + nvme_fc_error_recovery(ctrl, "transport detected io error"); +} + +static void +nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) +{ + struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req); + struct request *rq = op->rq; + struct nvmefc_fcp_req *freq = &op->fcp_req; + struct nvme_fc_ctrl *ctrl = op->ctrl; + struct nvme_fc_queue *queue = op->queue; + struct nvme_completion *cqe = &op->rsp_iu.cqe; + struct nvme_command *sqe = &op->cmd_iu.sqe; + __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); + union nvme_result result; + bool terminate_assoc = true; + int opstate; + + /* + * WARNING: + * The current linux implementation of a nvme controller + * allocates a single tag set for all io queues and sizes + * the io queues to fully hold all possible tags. Thus, the + * implementation does not reference or care about the sqhd + * value as it never needs to use the sqhd/sqtail pointers + * for submission pacing. + * + * This affects the FC-NVME implementation in two ways: + * 1) As the value doesn't matter, we don't need to waste + * cycles extracting it from ERSPs and stamping it in the + * cases where the transport fabricates CQEs on successful + * completions. + * 2) The FC-NVME implementation requires that delivery of + * ERSP completions are to go back to the nvme layer in order + * relative to the rsn, such that the sqhd value will always + * be "in order" for the nvme layer. As the nvme layer in + * linux doesn't care about sqhd, there's no need to return + * them in order. + * + * Additionally: + * As the core nvme layer in linux currently does not look at + * every field in the cqe - in cases where the FC transport must + * fabricate a CQE, the following fields will not be set as they + * are not referenced: + * cqe.sqid, cqe.sqhd, cqe.command_id + * + * Failure or error of an individual i/o, in a transport + * detected fashion unrelated to the nvme completion status, + * potentially cause the initiator and target sides to get out + * of sync on SQ head/tail (aka outstanding io count allowed). + * Per FC-NVME spec, failure of an individual command requires + * the connection to be terminated, which in turn requires the + * association to be terminated. + */ + + opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE); + + fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, + sizeof(op->rsp_iu), DMA_FROM_DEVICE); + + if (opstate == FCPOP_STATE_ABORTED) + status = cpu_to_le16(NVME_SC_HOST_ABORTED_CMD << 1); + else if (freq->status) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to lldd error %d\n", + ctrl->cnum, freq->status); + } + + /* + * For the linux implementation, if we have an unsuccesful + * status, they blk-mq layer can typically be called with the + * non-zero status and the content of the cqe isn't important. + */ + if (status) + goto done; + + /* + * command completed successfully relative to the wire + * protocol. However, validate anything received and + * extract the status and result from the cqe (create it + * where necessary). 
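+ * Three response lengths are treated as valid below: no payload,
+ * the 12-byte all-zeros RSP, and a full ERSP IU carrying the CQE.
+ * Any other length is reported as a host path error, which in turn
+ * terminates the association.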
+ */ + + switch (freq->rcv_rsplen) { + + case 0: + case NVME_FC_SIZEOF_ZEROS_RSP: + /* + * No response payload or 12 bytes of payload (which + * should all be zeros) are considered successful and + * no payload in the CQE by the transport. + */ + if (freq->transferred_length != + be32_to_cpu(op->cmd_iu.data_len)) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad transfer " + "length: %d vs expected %d\n", + ctrl->cnum, freq->transferred_length, + be32_to_cpu(op->cmd_iu.data_len)); + goto done; + } + result.u64 = 0; + break; + + case sizeof(struct nvme_fc_ersp_iu): + /* + * The ERSP IU contains a full completion with CQE. + * Validate ERSP IU and look at cqe. + */ + if (unlikely(be16_to_cpu(op->rsp_iu.iu_len) != + (freq->rcv_rsplen / 4) || + be32_to_cpu(op->rsp_iu.xfrd_len) != + freq->transferred_length || + op->rsp_iu.ersp_result || + sqe->common.command_id != cqe->command_id)) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad NVMe_ERSP: " + "iu len %d, xfr len %d vs %d, status code " + "%d, cmdid %d vs %d\n", + ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len), + be32_to_cpu(op->rsp_iu.xfrd_len), + freq->transferred_length, + op->rsp_iu.ersp_result, + sqe->common.command_id, + cqe->command_id); + goto done; + } + result = cqe->result; + status = cqe->status; + break; + + default: + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu " + "len %d\n", + ctrl->cnum, freq->rcv_rsplen); + goto done; + } + + terminate_assoc = false; + +done: + if (op->flags & FCOP_FLAGS_AEN) { + nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + atomic_set(&op->state, FCPOP_STATE_IDLE); + op->flags = FCOP_FLAGS_AEN; /* clear other flags */ + nvme_fc_ctrl_put(ctrl); + goto check_error; + } + + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + if (!nvme_try_complete_req(rq, status, result)) + nvme_fc_complete_rq(rq); + +check_error: + if (terminate_assoc && ctrl->ctrl.state != NVME_CTRL_RESETTING) + queue_work(nvme_reset_wq, &ctrl->ioerr_work); +} + +static int +__nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op, + struct request *rq, u32 rqno) +{ + struct nvme_fcp_op_w_sgl *op_w_sgl = + container_of(op, typeof(*op_w_sgl), op); + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + int ret = 0; + + memset(op, 0, sizeof(*op)); + op->fcp_req.cmdaddr = &op->cmd_iu; + op->fcp_req.cmdlen = sizeof(op->cmd_iu); + op->fcp_req.rspaddr = &op->rsp_iu; + op->fcp_req.rsplen = sizeof(op->rsp_iu); + op->fcp_req.done = nvme_fc_fcpio_done; + op->ctrl = ctrl; + op->queue = queue; + op->rq = rq; + op->rqno = rqno; + + cmdiu->format_id = NVME_CMD_FORMAT_ID; + cmdiu->fc_id = NVME_CMD_FC_ID; + cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); + if (queue->qnum) + cmdiu->rsv_cat = fccmnd_set_cat_css(0, + (NVME_CC_CSS_NVM >> NVME_CC_CSS_SHIFT)); + else + cmdiu->rsv_cat = fccmnd_set_cat_admin(0); + + op->fcp_req.cmddma = fc_dma_map_single(ctrl->lport->dev, + &op->cmd_iu, sizeof(op->cmd_iu), DMA_TO_DEVICE); + if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) { + dev_err(ctrl->dev, + "FCP Op failed - cmdiu dma mapping failed.\n"); + ret = -EFAULT; + goto out_on_error; + } + + op->fcp_req.rspdma = fc_dma_map_single(ctrl->lport->dev, + &op->rsp_iu, sizeof(op->rsp_iu), + DMA_FROM_DEVICE); + 
if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) { + dev_err(ctrl->dev, + "FCP Op failed - rspiu dma mapping failed.\n"); + ret = -EFAULT; + } + + atomic_set(&op->state, FCPOP_STATE_IDLE); +out_on_error: + return ret; +} + +static int +nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct nvme_fc_ctrl *ctrl = set->driver_data; + struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; + int res; + + res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++); + if (res) + return res; + op->op.fcp_req.first_sgl = op->sgl; + op->op.fcp_req.private = &op->priv[0]; + nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &op->op.cmd_iu.sqe; + return res; +} + +static int +nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_fcp_op *aen_op; + struct nvme_fc_cmd_iu *cmdiu; + struct nvme_command *sqe; + void *private = NULL; + int i, ret; + + aen_op = ctrl->aen_ops; + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { + if (ctrl->lport->ops->fcprqst_priv_sz) { + private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, + GFP_KERNEL); + if (!private) + return -ENOMEM; + } + + cmdiu = &aen_op->cmd_iu; + sqe = &cmdiu->sqe; + ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], + aen_op, (struct request *)NULL, + (NVME_AQ_BLK_MQ_DEPTH + i)); + if (ret) { + kfree(private); + return ret; + } + + aen_op->flags = FCOP_FLAGS_AEN; + aen_op->fcp_req.private = private; + + memset(sqe, 0, sizeof(*sqe)); + sqe->common.opcode = nvme_admin_async_event; + /* Note: core layer may overwrite the sqe.command_id value */ + sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i; + } + return 0; +} + +static void +nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_fcp_op *aen_op; + int i; + + cancel_work_sync(&ctrl->ctrl.async_event_work); + aen_op = ctrl->aen_ops; + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { + __nvme_fc_exit_request(ctrl, aen_op); + + kfree(aen_op->fcp_req.private); + aen_op->fcp_req.private = NULL; + } +} + +static inline void +__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl, + unsigned int qidx) +{ + struct nvme_fc_queue *queue = &ctrl->queues[qidx]; + + hctx->driver_data = queue; + queue->hctx = hctx; +} + +static int +nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_fc_ctrl *ctrl = data; + + __nvme_fc_init_hctx(hctx, ctrl, hctx_idx + 1); + + return 0; +} + +static int +nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_fc_ctrl *ctrl = data; + + __nvme_fc_init_hctx(hctx, ctrl, hctx_idx); + + return 0; +} + +static void +nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx) +{ + struct nvme_fc_queue *queue; + + queue = &ctrl->queues[idx]; + memset(queue, 0, sizeof(*queue)); + queue->ctrl = ctrl; + queue->qnum = idx; + atomic_set(&queue->csn, 0); + queue->dev = ctrl->dev; + + if (idx > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command); + + /* + * Considered whether we should allocate buffers for all SQEs + * and CQEs and dma map them - mapping their respective entries + * into the request structures (kernel vm addr and dma address) + * thus the driver could use the buffers/mappings directly. 
+ * It only makes sense if the LLDD would use them for its + * messaging api. It's very unlikely most adapter api's would use + * a native NVME sqe/cqe. More reasonable if FC-NVME IU payload + * structures were used instead. + */ +} + +/* + * This routine terminates a queue at the transport level. + * The transport has already ensured that all outstanding ios on + * the queue have been terminated. + * The transport will send a Disconnect LS request to terminate + * the queue's connection. Termination of the admin queue will also + * terminate the association at the target. + */ +static void +nvme_fc_free_queue(struct nvme_fc_queue *queue) +{ + if (!test_and_clear_bit(NVME_FC_Q_CONNECTED, &queue->flags)) + return; + + clear_bit(NVME_FC_Q_LIVE, &queue->flags); + /* + * Current implementation never disconnects a single queue. + * It always terminates a whole association. So there is never + * a disconnect(queue) LS sent to the target. + */ + + queue->connection_id = 0; + atomic_set(&queue->csn, 0); +} + +static void +__nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, unsigned int qidx) +{ + if (ctrl->lport->ops->delete_queue) + ctrl->lport->ops->delete_queue(&ctrl->lport->localport, qidx, + queue->lldd_handle); + queue->lldd_handle = NULL; +} + +static void +nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_fc_free_queue(&ctrl->queues[i]); +} + +static int +__nvme_fc_create_hw_queue(struct nvme_fc_ctrl *ctrl, + struct nvme_fc_queue *queue, unsigned int qidx, u16 qsize) +{ + int ret = 0; + + queue->lldd_handle = NULL; + if (ctrl->lport->ops->create_queue) + ret = ctrl->lport->ops->create_queue(&ctrl->lport->localport, + qidx, qsize, &queue->lldd_handle); + + return ret; +} + +static void +nvme_fc_delete_hw_io_queues(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_queue *queue = &ctrl->queues[ctrl->ctrl.queue_count - 1]; + int i; + + for (i = ctrl->ctrl.queue_count - 1; i >= 1; i--, queue--) + __nvme_fc_delete_hw_queue(ctrl, queue, i); +} + +static int +nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) +{ + struct nvme_fc_queue *queue = &ctrl->queues[1]; + int i, ret; + + for (i = 1; i < ctrl->ctrl.queue_count; i++, queue++) { + ret = __nvme_fc_create_hw_queue(ctrl, queue, i, qsize); + if (ret) + goto delete_queues; + } + + return 0; + +delete_queues: + for (; i > 0; i--) + __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[i], i); + return ret; +} + +static int +nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize, + (qsize / 5)); + if (ret) + break; + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + break; + + set_bit(NVME_FC_Q_LIVE, &ctrl->queues[i].flags); + } + + return ret; +} + +static void +nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_fc_init_queue(ctrl, i); +} + +static void +nvme_fc_ctrl_free(struct kref *ref) +{ + struct nvme_fc_ctrl *ctrl = + container_of(ref, struct nvme_fc_ctrl, ref); + unsigned long flags; + + if (ctrl->ctrl.tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + + /* remove from rport list */ + spin_lock_irqsave(&ctrl->rport->lock, flags); + list_del(&ctrl->ctrl_list); + spin_unlock_irqrestore(&ctrl->rport->lock, flags); + + nvme_start_admin_queue(&ctrl->ctrl); + 
blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + + kfree(ctrl->queues); + + put_device(ctrl->dev); + nvme_fc_rport_put(ctrl->rport); + + ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); + if (ctrl->ctrl.opts) + nvmf_free_options(ctrl->ctrl.opts); + kfree(ctrl); +} + +static void +nvme_fc_ctrl_put(struct nvme_fc_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvme_fc_ctrl_free); +} + +static int +nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl) +{ + return kref_get_unless_zero(&ctrl->ref); +} + +/* + * All accesses from nvme core layer done - can now free the + * controller. Called after last nvme_put_ctrl() call + */ +static void +nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) +{ + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + + WARN_ON(nctrl != &ctrl->ctrl); + + nvme_fc_ctrl_put(ctrl); +} + +/* + * This routine is used by the transport when it needs to find active + * io on a queue that is to be terminated. The transport uses + * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke + * this routine to kill them on a 1 by 1 basis. + * + * As FC allocates FC exchange for each io, the transport must contact + * the LLDD to terminate the exchange, thus releasing the FC exchange. + * After terminating the exchange the LLDD will call the transport's + * normal io done path for the request, but it will have an aborted + * status. The done path will return the io request back to the block + * layer with an error status. + */ +static bool +nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) +{ + struct nvme_ctrl *nctrl = data; + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); + + op->nreq.flags |= NVME_REQ_CANCELLED; + __nvme_fc_abort_op(ctrl, op); + return true; +} + +/* + * This routine runs through all outstanding commands on the association + * and aborts them. This routine is typically be called by the + * delete_association routine. It is also called due to an error during + * reconnect. In that scenario, it is most likely a command that initializes + * the controller, including fabric Connect commands on io queues, that + * may have timed out or failed thus the io must be killed for the connect + * thread to see the error. + */ +static void +__nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) +{ + int q; + + /* + * if aborting io, the queues are no longer good, mark them + * all as not live. + */ + if (ctrl->ctrl.queue_count > 1) { + for (q = 1; q < ctrl->ctrl.queue_count; q++) + clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[q].flags); + } + clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags); + + /* + * If io queues are present, stop them and terminate all outstanding + * ios on them. As FC allocates FC exchange for each io, the + * transport must contact the LLDD to terminate the exchange, + * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() + * to tell us what io's are busy and invoke a transport routine + * to kill them with the LLDD. After terminating the exchange + * the LLDD will call the transport's normal io done path, but it + * will have an aborted status. The done path will return the + * io requests back to the block layer as part of normal completions + * (but with error status). 
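+ * The per-request routine used here is nvme_fc_terminate_exchange(),
+ * which marks the request NVME_REQ_CANCELLED and calls
+ * __nvme_fc_abort_op() so the LLDD aborts the exchange through its
+ * fcp_abort() handler.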
+ */ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); + if (start_queues) + nvme_start_queues(&ctrl->ctrl); + } + + /* + * Other transports, which don't have link-level contexts bound + * to sqe's, would try to gracefully shutdown the controller by + * writing the registers for shutdown and polling (call + * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially + * just aborted and we will wait on those contexts, and given + * there was no indication of how live the controlelr is on the + * link, don't send more io to create more contexts for the + * shutdown. Let the controller fail via keepalive failure if + * its still present. + */ + + /* + * clean up the admin queue. Same thing as above. + */ + nvme_stop_keep_alive(&ctrl->ctrl); + nvme_stop_admin_queue(&ctrl->ctrl); + blk_sync_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); +} + +static void +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) +{ + /* + * if an error (io timeout, etc) while (re)connecting, the remote + * port requested terminating of the association (disconnect_ls) + * or an error (timeout or abort) occurred on an io while creating + * the controller. Abort any ios on the association and let the + * create_association error path resolve things. + */ + if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { + __nvme_fc_abort_outstanding_ios(ctrl, true); + set_bit(ASSOC_FAILED, &ctrl->flags); + return; + } + + /* Otherwise, only proceed if in LIVE state - e.g. on first error */ + if (ctrl->ctrl.state != NVME_CTRL_LIVE) + return; + + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: transport association event: %s\n", + ctrl->cnum, errmsg); + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: resetting controller\n", ctrl->cnum); + + nvme_reset_ctrl(&ctrl->ctrl); +} + +static enum blk_eh_timer_return +nvme_fc_timeout(struct request *rq, bool reserved) +{ + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + struct nvme_fc_ctrl *ctrl = op->ctrl; + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + struct nvme_command *sqe = &cmdiu->sqe; + + /* + * Attempt to abort the offending command. Command completion + * will detect the aborted io and will fail the connection. + */ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d w10/11: " + "x%08x/x%08x\n", + ctrl->cnum, op->queue->qnum, sqe->common.opcode, + sqe->connect.fctype, sqe->common.cdw10, sqe->common.cdw11); + if (__nvme_fc_abort_op(ctrl, op)) + nvme_fc_error_recovery(ctrl, "io timeout abort failed"); + + /* + * the io abort has been initiated. Have the reset timer + * restarted and the abort completion will complete the io + * shortly. Avoids a synchronous wait while the abort finishes. 
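+ * Returning BLK_EH_RESET_TIMER keeps the request with the driver;
+ * the io is completed later from nvme_fc_fcpio_done() once the LLDD
+ * reports the aborted exchange.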
+ */ + return BLK_EH_RESET_TIMER; +} + +static int +nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, + struct nvme_fc_fcp_op *op) +{ + struct nvmefc_fcp_req *freq = &op->fcp_req; + int ret; + + freq->sg_cnt = 0; + + if (!blk_rq_nr_phys_segments(rq)) + return 0; + + freq->sg_table.sgl = freq->first_sgl; + ret = sg_alloc_table_chained(&freq->sg_table, + blk_rq_nr_phys_segments(rq), freq->sg_table.sgl, + NVME_INLINE_SG_CNT); + if (ret) + return -ENOMEM; + + op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); + WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); + freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, + op->nents, rq_dma_dir(rq)); + if (unlikely(freq->sg_cnt <= 0)) { + sg_free_table_chained(&freq->sg_table, NVME_INLINE_SG_CNT); + freq->sg_cnt = 0; + return -EFAULT; + } + + /* + * TODO: blk_integrity_rq(rq) for DIF + */ + return 0; +} + +static void +nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, + struct nvme_fc_fcp_op *op) +{ + struct nvmefc_fcp_req *freq = &op->fcp_req; + + if (!freq->sg_cnt) + return; + + fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, + rq_dma_dir(rq)); + + sg_free_table_chained(&freq->sg_table, NVME_INLINE_SG_CNT); + + freq->sg_cnt = 0; +} + +/* + * In FC, the queue is a logical thing. At transport connect, the target + * creates its "queue" and returns a handle that is to be given to the + * target whenever it posts something to the corresponding SQ. When an + * SQE is sent on a SQ, FC effectively considers the SQE, or rather the + * command contained within the SQE, an io, and assigns a FC exchange + * to it. The SQE and the associated SQ handle are sent in the initial + * CMD IU sents on the exchange. All transfers relative to the io occur + * as part of the exchange. The CQE is the last thing for the io, + * which is transferred (explicitly or implicitly) with the RSP IU + * sent on the exchange. After the CQE is received, the FC exchange is + * terminaed and the Exchange may be used on a different io. + * + * The transport to LLDD api has the transport making a request for a + * new fcp io request to the LLDD. The LLDD then allocates a FC exchange + * resource and transfers the command. The LLDD will then process all + * steps to complete the io. Upon completion, the transport done routine + * is called. + * + * So - while the operation is outstanding to the LLDD, there is a link + * level FC exchange resource that is also outstanding. This must be + * considered in all cleanup operations. 
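+ * In the routine below, that request to the LLDD is made through
+ * lport->ops->fcp_io(); the LLDD signals completion back through
+ * fcp_req->done, which __nvme_fc_init_request() points at
+ * nvme_fc_fcpio_done().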
+ */ +static blk_status_t +nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, + struct nvme_fc_fcp_op *op, u32 data_len, + enum nvmefc_fcp_datadir io_dir) +{ + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + struct nvme_command *sqe = &cmdiu->sqe; + int ret, opstate; + + /* + * before attempting to send the io, check to see if we believe + * the target device is present + */ + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return BLK_STS_RESOURCE; + + if (!nvme_fc_ctrl_get(ctrl)) + return BLK_STS_IOERR; + + /* format the FC-NVME CMD IU and fcp_req */ + cmdiu->connection_id = cpu_to_be64(queue->connection_id); + cmdiu->data_len = cpu_to_be32(data_len); + switch (io_dir) { + case NVMEFC_FCP_WRITE: + cmdiu->flags = FCNVME_CMD_FLAGS_WRITE; + break; + case NVMEFC_FCP_READ: + cmdiu->flags = FCNVME_CMD_FLAGS_READ; + break; + case NVMEFC_FCP_NODATA: + cmdiu->flags = 0; + break; + } + op->fcp_req.payload_length = data_len; + op->fcp_req.io_dir = io_dir; + op->fcp_req.transferred_length = 0; + op->fcp_req.rcv_rsplen = 0; + op->fcp_req.status = NVME_SC_SUCCESS; + op->fcp_req.sqid = cpu_to_le16(queue->qnum); + + /* + * validate per fabric rules, set fields mandated by fabric spec + * as well as those by FC-NVME spec. + */ + WARN_ON_ONCE(sqe->common.metadata); + sqe->common.flags |= NVME_CMD_SGL_METABUF; + + /* + * format SQE DPTR field per FC-NVME rules: + * type=0x5 Transport SGL Data Block Descriptor + * subtype=0xA Transport-specific value + * address=0 + * length=length of the data series + */ + sqe->rw.dptr.sgl.type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; + sqe->rw.dptr.sgl.length = cpu_to_le32(data_len); + sqe->rw.dptr.sgl.addr = 0; + + if (!(op->flags & FCOP_FLAGS_AEN)) { + ret = nvme_fc_map_data(ctrl, op->rq, op); + if (ret < 0) { + nvme_cleanup_cmd(op->rq); + nvme_fc_ctrl_put(ctrl); + if (ret == -ENOMEM || ret == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; + } + } + + fc_dma_sync_single_for_device(ctrl->lport->dev, op->fcp_req.cmddma, + sizeof(op->cmd_iu), DMA_TO_DEVICE); + + atomic_set(&op->state, FCPOP_STATE_ACTIVE); + + if (!(op->flags & FCOP_FLAGS_AEN)) + blk_mq_start_request(op->rq); + + cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn)); + ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport, + &ctrl->rport->remoteport, + queue->lldd_handle, &op->fcp_req); + + if (ret) { + /* + * If the lld fails to send the command is there an issue with + * the csn value? If the command that fails is the Connect, + * no - as the connection won't be live. If it is a command + * post-connect, it's possible a gap in csn may be created. + * Does this matter? As Linux initiators don't send fused + * commands, no. The gap would exist, but as there's nothing + * that depends on csn order to be delivered on the target + * side, it shouldn't hurt. It would be difficult for a + * target to even detect the csn gap as it has no idea when the + * cmd with the csn was supposed to arrive. 
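+ * Note that the csn was already advanced by atomic_inc_return()
+ * above; the error path below only unwinds the op state, data
+ * mapping and controller reference, and makes no attempt to rewind
+ * the csn.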
+ */ + opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + + if (!(op->flags & FCOP_FLAGS_AEN)) { + nvme_fc_unmap_data(ctrl, op->rq, op); + nvme_cleanup_cmd(op->rq); + } + + nvme_fc_ctrl_put(ctrl); + + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE && + ret != -EBUSY) + return BLK_STS_IOERR; + + return BLK_STS_RESOURCE; + } + + return BLK_STS_OK; +} + +static blk_status_t +nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_fc_queue *queue = hctx->driver_data; + struct nvme_fc_ctrl *ctrl = queue->ctrl; + struct request *rq = bd->rq; + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + enum nvmefc_fcp_datadir io_dir; + bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); + u32 data_len; + blk_status_t ret; + + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || + !nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); + + ret = nvme_setup_cmd(ns, rq); + if (ret) + return ret; + + /* + * nvme core doesn't quite treat the rq opaquely. Commands such + * as WRITE ZEROES will return a non-zero rq payload_bytes yet + * there is no actual payload to be transferred. + * To get it right, key data transmission on there being 1 or + * more physical segments in the sg list. If there is no + * physical segments, there is no payload. + */ + if (blk_rq_nr_phys_segments(rq)) { + data_len = blk_rq_payload_bytes(rq); + io_dir = ((rq_data_dir(rq) == WRITE) ? + NVMEFC_FCP_WRITE : NVMEFC_FCP_READ); + } else { + data_len = 0; + io_dir = NVMEFC_FCP_NODATA; + } + + + return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); +} + +static void +nvme_fc_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); + struct nvme_fc_fcp_op *aen_op; + blk_status_t ret; + + if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) + return; + + aen_op = &ctrl->aen_ops[0]; + + ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, + NVMEFC_FCP_NODATA); + if (ret) + dev_err(ctrl->ctrl.device, + "failed async event work\n"); +} + +static void +nvme_fc_complete_rq(struct request *rq) +{ + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); + struct nvme_fc_ctrl *ctrl = op->ctrl; + + atomic_set(&op->state, FCPOP_STATE_IDLE); + op->flags &= ~FCOP_FLAGS_TERMIO; + + nvme_fc_unmap_data(ctrl, rq, op); + nvme_complete_rq(rq); + nvme_fc_ctrl_put(ctrl); +} + +static int nvme_fc_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_fc_ctrl *ctrl = set->driver_data; + int i; + + for (i = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + if (!map->nr_queues) { + WARN_ON(i == HCTX_TYPE_DEFAULT); + continue; + } + + /* Call LLDD map queue functionality if defined */ + if (ctrl->lport->ops->map_queues) + ctrl->lport->ops->map_queues(&ctrl->lport->localport, + map); + else + blk_mq_map_queues(map); + } + return 0; +} + +static const struct blk_mq_ops nvme_fc_mq_ops = { + .queue_rq = nvme_fc_queue_rq, + .complete = nvme_fc_complete_rq, + .init_request = nvme_fc_init_request, + .exit_request = nvme_fc_exit_request, + .init_hctx = nvme_fc_init_hctx, + .timeout = nvme_fc_timeout, + .map_queues = nvme_fc_map_queues, +}; + +static int +nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; + int ret; + + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + 
ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) { + dev_info(ctrl->ctrl.device, + "set_queue_count failed: %d\n", ret); + return ret; + } + + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (!nr_io_queues) + return 0; + + nvme_fc_init_io_queues(ctrl); + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_fc_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; + ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; + ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, + ctrl->lport->ops->fcprqst_priv_sz); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + return ret; + + ctrl->ctrl.tagset = &ctrl->tag_set; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + + ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_cleanup_blk_queue; + + ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_delete_hw_queues; + + ctrl->ioq_live = true; + + return 0; + +out_delete_hw_queues: + nvme_fc_delete_hw_io_queues(ctrl); +out_cleanup_blk_queue: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_fc_free_io_queues(ctrl); + + /* force put free routine to ignore io queues */ + ctrl->ctrl.tagset = NULL; + + return ret; +} + +static int +nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + u32 prior_ioq_cnt = ctrl->ctrl.queue_count - 1; + unsigned int nr_io_queues; + int ret; + + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) { + dev_info(ctrl->ctrl.device, + "set_queue_count failed: %d\n", ret); + return ret; + } + + if (!nr_io_queues && prior_ioq_cnt) { + dev_info(ctrl->ctrl.device, + "Fail Reconnect: At least 1 io queue " + "required (was %d)\n", prior_ioq_cnt); + return -ENOSPC; + } + + ctrl->ctrl.queue_count = nr_io_queues + 1; + /* check for io queues existing */ + if (ctrl->ctrl.queue_count == 1) + return 0; + + if (prior_ioq_cnt != nr_io_queues) { + dev_info(ctrl->ctrl.device, + "reconnect: revising io queue count from %d to %d\n", + prior_ioq_cnt, nr_io_queues); + blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); + } + + ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_free_io_queues; + + ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1); + if (ret) + goto out_delete_hw_queues; + + return 0; + +out_delete_hw_queues: + nvme_fc_delete_hw_io_queues(ctrl); +out_free_io_queues: + nvme_fc_free_io_queues(ctrl); + return ret; +} + +static void +nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + + atomic_inc(&lport->act_rport_cnt); +} + +static void +nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + cnt = atomic_dec_return(&lport->act_rport_cnt); + if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED) + 
lport->ops->localport_delete(&lport->localport); +} + +static int +nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + u32 cnt; + + if (test_and_set_bit(ASSOC_ACTIVE, &ctrl->flags)) + return 1; + + cnt = atomic_inc_return(&rport->act_ctrl_cnt); + if (cnt == 1) + nvme_fc_rport_active_on_lport(rport); + + return 0; +} + +static int +nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + /* clearing of ctrl->flags ASSOC_ACTIVE bit is in association delete */ + + cnt = atomic_dec_return(&rport->act_ctrl_cnt); + if (cnt == 0) { + if (rport->remoteport.port_state == FC_OBJSTATE_DELETED) + lport->ops->remoteport_delete(&rport->remoteport); + nvme_fc_rport_inactive_on_lport(rport); + } + + return 0; +} + +/* + * This routine restarts the controller on the host side, and + * on the link side, recreates the controller association. + */ +static int +nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + struct nvmefc_ls_rcv_op *disls = NULL; + unsigned long flags; + int ret; + bool changed; + + ++ctrl->ctrl.nr_reconnects; + + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return -ENODEV; + + if (nvme_fc_ctlr_active_on_rport(ctrl)) + return -ENOTUNIQ; + + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: create association : host wwpn 0x%016llx " + " rport wwpn 0x%016llx: NQN \"%s\"\n", + ctrl->cnum, ctrl->lport->localport.port_name, + ctrl->rport->remoteport.port_name, ctrl->ctrl.opts->subsysnqn); + + clear_bit(ASSOC_FAILED, &ctrl->flags); + + /* + * Create the admin queue + */ + + ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, + NVME_AQ_DEPTH); + if (ret) + goto out_free_queue; + + ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], + NVME_AQ_DEPTH, (NVME_AQ_DEPTH / 4)); + if (ret) + goto out_delete_hw_queue; + + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + if (ret) + goto out_disconnect_admin_queue; + + set_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags); + + /* + * Check controller capabilities + * + * todo:- add code to check if ctrl attributes changed from + * prior connection values + */ + + ret = nvme_enable_ctrl(&ctrl->ctrl); + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) + goto out_disconnect_admin_queue; + + ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; + ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << + (ilog2(SZ_4K) - 9); + + nvme_start_admin_queue(&ctrl->ctrl); + + ret = nvme_init_ctrl_finish(&ctrl->ctrl); + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) + goto out_disable_ctrl; + + /* sanity checks */ + + /* FC-NVME does not have other data in the capsule */ + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n", + ctrl->ctrl.icdoff); + ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_disable_ctrl; + } + + /* FC-NVME supports normal SGL Data Block Descriptors */ + if (!nvme_ctrl_sgl_supported(&ctrl->ctrl)) { + dev_err(ctrl->ctrl.device, + "Mandatory sgls are not supported!\n"); + ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_disconnect_admin_queue; + } + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, reducing " + "to maxcmd\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->queue_size > ctrl->ctrl.sqsize + 
1) { + /* warn if sqsize is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl sqsize %u, reducing " + "to sqsize\n", + opts->queue_size, ctrl->ctrl.sqsize + 1); + opts->queue_size = ctrl->ctrl.sqsize + 1; + } + + ret = nvme_fc_init_aen_ops(ctrl); + if (ret) + goto out_term_aen_ops; + + /* + * Create the io queues + */ + + if (ctrl->ctrl.queue_count > 1) { + if (!ctrl->ioq_live) + ret = nvme_fc_create_io_queues(ctrl); + else + ret = nvme_fc_recreate_io_queues(ctrl); + } + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) + goto out_term_aen_ops; + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + + ctrl->ctrl.nr_reconnects = 0; + + if (changed) + nvme_start_ctrl(&ctrl->ctrl); + + return 0; /* Success */ + +out_term_aen_ops: + nvme_fc_term_aen_ops(ctrl); +out_disable_ctrl: + nvme_disable_ctrl(&ctrl->ctrl); +out_disconnect_admin_queue: + /* send a Disconnect(association) LS to fc-nvme target */ + nvme_fc_xmt_disconnect_assoc(ctrl); + spin_lock_irqsave(&ctrl->lock, flags); + ctrl->association_id = 0; + disls = ctrl->rcv_disconn; + ctrl->rcv_disconn = NULL; + spin_unlock_irqrestore(&ctrl->lock, flags); + if (disls) + nvme_fc_xmt_ls_rsp(disls); +out_delete_hw_queue: + __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); +out_free_queue: + nvme_fc_free_queue(&ctrl->queues[0]); + clear_bit(ASSOC_ACTIVE, &ctrl->flags); + nvme_fc_ctlr_inactive_on_rport(ctrl); + + return ret; +} + + +/* + * This routine stops operation of the controller on the host side. + * On the host os stack side: Admin and IO queues are stopped, + * outstanding ios on them terminated via FC ABTS. + * On the link side: the association is terminated. + */ +static void +nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) +{ + struct nvmefc_ls_rcv_op *disls = NULL; + unsigned long flags; + + if (!test_and_clear_bit(ASSOC_ACTIVE, &ctrl->flags)) + return; + + spin_lock_irqsave(&ctrl->lock, flags); + set_bit(FCCTRL_TERMIO, &ctrl->flags); + ctrl->iocnt = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + + __nvme_fc_abort_outstanding_ios(ctrl, false); + + /* kill the aens as they are a separate path */ + nvme_fc_abort_aen_ops(ctrl); + + /* wait for all io that had to be aborted */ + spin_lock_irq(&ctrl->lock); + wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock); + clear_bit(FCCTRL_TERMIO, &ctrl->flags); + spin_unlock_irq(&ctrl->lock); + + nvme_fc_term_aen_ops(ctrl); + + /* + * send a Disconnect(association) LS to fc-nvme target + * Note: could have been sent at top of process, but + * cleaner on link traffic if after the aborts complete. + * Note: if association doesn't exist, association_id will be 0 + */ + if (ctrl->association_id) + nvme_fc_xmt_disconnect_assoc(ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + ctrl->association_id = 0; + disls = ctrl->rcv_disconn; + ctrl->rcv_disconn = NULL; + spin_unlock_irqrestore(&ctrl->lock, flags); + if (disls) + /* + * if a Disconnect Request was waiting for a response, send + * now that all ABTS's have been issued (and are complete). 
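+ * nvme_fc_xmt_ls_rsp() syncs the prebuilt response buffer for the
+ * device and hands it to the LLDD through its xmt_ls_rsp() handler.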
+ */ + nvme_fc_xmt_ls_rsp(disls); + + if (ctrl->ctrl.tagset) { + nvme_fc_delete_hw_io_queues(ctrl); + nvme_fc_free_io_queues(ctrl); + } + + __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); + nvme_fc_free_queue(&ctrl->queues[0]); + + /* re-enable the admin_q so anything new can fast fail */ + nvme_start_admin_queue(&ctrl->ctrl); + + /* resume the io queues so that things will fast fail */ + nvme_start_queues(&ctrl->ctrl); + + nvme_fc_ctlr_inactive_on_rport(ctrl); +} + +static void +nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + + cancel_work_sync(&ctrl->ioerr_work); + cancel_delayed_work_sync(&ctrl->connect_work); + /* + * kill the association on the link side. this will block + * waiting for io to terminate + */ + nvme_fc_delete_association(ctrl); +} + +static void +nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) +{ + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_remote_port *portptr = &rport->remoteport; + unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; + bool recon = true; + + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) + return; + + if (portptr->port_state == FC_OBJSTATE_ONLINE) { + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", + ctrl->cnum, status); + if (status > 0 && (status & NVME_SC_DNR)) + recon = false; + } else if (time_after_eq(jiffies, rport->dev_loss_end)) + recon = false; + + if (recon && nvmf_should_reconnect(&ctrl->ctrl)) { + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Reconnect attempt in %ld " + "seconds\n", + ctrl->cnum, recon_delay / HZ); + else if (time_after(jiffies + recon_delay, rport->dev_loss_end)) + recon_delay = rport->dev_loss_end - jiffies; + + queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); + } else { + if (portptr->port_state == FC_OBJSTATE_ONLINE) { + if (status > 0 && (status & NVME_SC_DNR)) + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: reconnect failure\n", + ctrl->cnum); + else + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Max reconnect attempts " + "(%d) reached.\n", + ctrl->cnum, ctrl->ctrl.nr_reconnects); + } else + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: dev_loss_tmo (%d) expired " + "while waiting for remoteport connectivity.\n", + ctrl->cnum, min_t(int, portptr->dev_loss_tmo, + (ctrl->ctrl.opts->max_reconnects * + ctrl->ctrl.opts->reconnect_delay))); + WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); + } +} + +static void +nvme_fc_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); + + nvme_stop_ctrl(&ctrl->ctrl); + + /* will block will waiting for io to terminate */ + nvme_fc_delete_association(ctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: error_recovery: Couldn't change state " + "to CONNECTING\n", ctrl->cnum); + + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: failed to schedule connect " + "after reset\n", ctrl->cnum); + } else { + flush_delayed_work(&ctrl->connect_work); + } + } else { + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN); + } +} + + +static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { + .name = "fc", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = 
nvmf_reg_write32, + .free_ctrl = nvme_fc_nvme_ctrl_freed, + .submit_async_event = nvme_fc_submit_async_event, + .delete_ctrl = nvme_fc_delete_ctrl, + .get_address = nvmf_get_address, +}; + +static void +nvme_fc_connect_ctrl_work(struct work_struct *work) +{ + int ret; + + struct nvme_fc_ctrl *ctrl = + container_of(to_delayed_work(work), + struct nvme_fc_ctrl, connect_work); + + ret = nvme_fc_create_association(ctrl); + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller connect complete\n", + ctrl->cnum); +} + + +static const struct blk_mq_ops nvme_fc_admin_mq_ops = { + .queue_rq = nvme_fc_queue_rq, + .complete = nvme_fc_complete_rq, + .init_request = nvme_fc_init_request, + .exit_request = nvme_fc_exit_request, + .init_hctx = nvme_fc_init_admin_hctx, + .timeout = nvme_fc_timeout, +}; + + +/* + * Fails a controller request if it matches an existing controller + * (association) with the same tuple: + * + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_fc_existing_controller(struct nvme_fc_rport *rport, + struct nvmf_ctrl_options *opts) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&rport->lock, flags); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts); + if (found) + break; + } + spin_unlock_irqrestore(&rport->lock, flags); + + return found; +} + +static struct nvme_ctrl * +nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, + struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + int ret, idx, ctrl_loss_tmo; + + if (!(rport->remoteport.port_role & + (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) { + ret = -EBADR; + goto out_fail; + } + + if (!opts->duplicate_connect && + nvme_fc_existing_controller(rport, opts)) { + ret = -EALREADY; + goto out_fail; + } + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) { + ret = -ENOMEM; + goto out_fail; + } + + idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_free_ctrl; + } + + /* + * if ctrl_loss_tmo is being enforced and the default reconnect delay + * is being used, change to a shorter reconnect delay for FC. 
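As a quick illustration of the recalculation performed just below: the overall ctrl_loss_tmo budget is kept roughly constant while the per-attempt delay is shortened, so the number of attempts grows. This is only an arithmetic sketch; the 10s default reconnect delay, 2s FC default, and 600s loss timeout are assumed typical values, not taken from this patch.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int reconnect_delay = 10;	/* assumed NVMF_DEF_RECONNECT_DELAY */
	int max_reconnects = 60;	/* e.g. from a 600s ctrl_loss_tmo */
	int ctrl_loss_tmo = max_reconnects * reconnect_delay;

	reconnect_delay = 2;		/* assumed NVME_FC_DEFAULT_RECONNECT_TMO */
	max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, reconnect_delay);

	printf("delay=%ds attempts=%d budget=%ds\n",
	       reconnect_delay, max_reconnects,
	       max_reconnects * reconnect_delay);
	return 0;
}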
+ */ + if (opts->max_reconnects != -1 && + opts->reconnect_delay == NVMF_DEF_RECONNECT_DELAY && + opts->reconnect_delay > NVME_FC_DEFAULT_RECONNECT_TMO) { + ctrl_loss_tmo = opts->max_reconnects * opts->reconnect_delay; + opts->reconnect_delay = NVME_FC_DEFAULT_RECONNECT_TMO; + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + } + + ctrl->ctrl.opts = opts; + ctrl->ctrl.nr_reconnects = 0; + if (lport->dev) + ctrl->ctrl.numa_node = dev_to_node(lport->dev); + else + ctrl->ctrl.numa_node = NUMA_NO_NODE; + INIT_LIST_HEAD(&ctrl->ctrl_list); + ctrl->lport = lport; + ctrl->rport = rport; + ctrl->dev = lport->dev; + ctrl->cnum = idx; + ctrl->ioq_live = false; + init_waitqueue_head(&ctrl->ioabort_wait); + + get_device(ctrl->dev); + kref_init(&ctrl->ref); + + INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); + INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); + INIT_WORK(&ctrl->ioerr_work, nvme_fc_ctrl_ioerr_work); + spin_lock_init(&ctrl->lock); + + /* io queue count */ + ctrl->ctrl.queue_count = min_t(unsigned int, + opts->nr_io_queues, + lport->ops->max_hw_queues); + ctrl->ctrl.queue_count++; /* +1 for admin queue */ + + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + ctrl->ctrl.cntlid = 0xffff; + + ret = -ENOMEM; + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, + sizeof(struct nvme_fc_queue), GFP_KERNEL); + if (!ctrl->queues) + goto out_free_ida; + + nvme_fc_init_queue(ctrl, 0); + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; + ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS; + ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; + ctrl->admin_tag_set.cmd_size = + struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, + ctrl->lport->ops->fcprqst_priv_sz); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; + + ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (ret) + goto out_free_queues; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + ret = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_admin_tag_set; + } + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + ret = PTR_ERR(ctrl->ctrl.admin_q); + goto out_cleanup_fabrics_q; + } + + /* + * Would have been nice to init io queues tag set as well. + * However, we require interaction from the controller + * for max io queue count before we can do so. + * Defer this to the connect path. 
+ */ + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0); + if (ret) + goto out_cleanup_admin_q; + + /* at this point, teardown path changes to ref counting on nvme ctrl */ + + spin_lock_irqsave(&rport->lock, flags); + list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); + spin_unlock_irqrestore(&rport->lock, flags); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) || + !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum); + goto fail_ctrl; + } + + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: failed to schedule initial connect\n", + ctrl->cnum); + goto fail_ctrl; + } + + flush_delayed_work(&ctrl->connect_work); + + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", + ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl)); + + return &ctrl->ctrl; + +fail_ctrl: + nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); + cancel_work_sync(&ctrl->ioerr_work); + cancel_work_sync(&ctrl->ctrl.reset_work); + cancel_delayed_work_sync(&ctrl->connect_work); + + ctrl->ctrl.opts = NULL; + + /* initiate nvme ctrl ref counting teardown */ + nvme_uninit_ctrl(&ctrl->ctrl); + + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + + /* as we're past the point where we transition to the ref + * counting teardown path, if we return a bad pointer here, + * the calling routine, thinking it's prior to the + * transition, will do an rport put. Since the teardown + * path also does a rport put, we do an extra get here to + * so proper order/teardown happens. + */ + nvme_fc_rport_get(rport); + + return ERR_PTR(-EIO); + +out_cleanup_admin_q: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); +out_free_admin_tag_set: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_queues: + kfree(ctrl->queues); +out_free_ida: + put_device(ctrl->dev); + ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); +out_free_ctrl: + kfree(ctrl); +out_fail: + /* exit via here doesn't follow ctlr ref points */ + return ERR_PTR(ret); +} + + +struct nvmet_fc_traddr { + u64 nn; + u64 pn; +}; + +static int +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) +{ + u64 token64; + + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; + + return 0; +} + +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. 
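For reference, the two traddr spellings accepted by the routine below are "nn-0x<16 hex digits>:pn-0x<16 hex digits>" and "nn-<16 hex digits>:pn-<16 hex digits>". A minimal userspace sketch of the same 0x-prefix trick follows; the helper name and sample WWNs are invented for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Prefix the 16 hex digits with "0x" so strtoull(..., 0) picks base 16. */
static int parse_wwn(const char *hex16, unsigned long long *val)
{
	char buf[2 + 16 + 1];
	char *end;

	snprintf(buf, sizeof(buf), "0x%.16s", hex16);
	*val = strtoull(buf, &end, 0);
	return (strlen(buf) == 2 + 16 && *end == '\0') ? 0 : -1;
}

int main(void)
{
	/* e.g. traddr=nn-0x20000090fa942779:pn-0x10000090fa942779 */
	unsigned long long nn, pn;

	if (parse_wwn("20000090fa942779", &nn) ||
	    parse_wwn("10000090fa942779", &pn))
		return 1;
	printf("nn=0x%016llx pn=0x%016llx\n", nn, pn);
	return 0;
}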
+ */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; + + /* validate if string is one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; + + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; + + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; +} + +static struct nvme_ctrl * +nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) +{ + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + struct nvme_ctrl *ctrl; + struct nvmet_fc_traddr laddr = { 0L, 0L }; + struct nvmet_fc_traddr raddr = { 0L, 0L }; + unsigned long flags; + int ret; + + ret = nvme_fc_parse_traddr(&raddr, opts->traddr, NVMF_TRADDR_SIZE); + if (ret || !raddr.nn || !raddr.pn) + return ERR_PTR(-EINVAL); + + ret = nvme_fc_parse_traddr(&laddr, opts->host_traddr, NVMF_TRADDR_SIZE); + if (ret || !laddr.nn || !laddr.pn) + return ERR_PTR(-EINVAL); + + /* find the host and remote ports to connect together */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + if (lport->localport.node_name != laddr.nn || + lport->localport.port_name != laddr.pn || + lport->localport.port_state != FC_OBJSTATE_ONLINE) + continue; + + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (rport->remoteport.node_name != raddr.nn || + rport->remoteport.port_name != raddr.pn || + rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + continue; + + /* if fail to get reference fall through. Will error */ + if (!nvme_fc_rport_get(rport)) + break; + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + ctrl = nvme_fc_init_ctrl(dev, opts, lport, rport); + if (IS_ERR(ctrl)) + nvme_fc_rport_put(rport); + return ctrl; + } + } + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + pr_warn("%s: %s - %s combination not found\n", + __func__, opts->traddr, opts->host_traddr); + return ERR_PTR(-ENOENT); +} + + +static struct nvmf_transport_ops nvme_fc_transport = { + .name = "fc", + .module = THIS_MODULE, + .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, + .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, + .create_ctrl = nvme_fc_create_ctrl, +}; + +/* Arbitrary successive failures max. 
With lots of subsystems could be high */ +#define DISCOVERY_MAX_FAIL 20 + +static ssize_t nvme_fc_nvme_discovery_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned long flags; + LIST_HEAD(local_disc_list); + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + int failcnt = 0; + + spin_lock_irqsave(&nvme_fc_lock, flags); +restart: + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (!nvme_fc_lport_get(lport)) + continue; + if (!nvme_fc_rport_get(rport)) { + /* + * This is a temporary condition. Upon restart + * this rport will be gone from the list. + * + * Revert the lport put and retry. Anything + * added to the list already will be skipped (as + * they are no longer list_empty). Loops should + * resume at rports that were not yet seen. + */ + nvme_fc_lport_put(lport); + + if (failcnt++ < DISCOVERY_MAX_FAIL) + goto restart; + + pr_err("nvme_discovery: too many reference " + "failures\n"); + goto process_local_list; + } + if (list_empty(&rport->disc_list)) + list_add_tail(&rport->disc_list, + &local_disc_list); + } + } + +process_local_list: + while (!list_empty(&local_disc_list)) { + rport = list_first_entry(&local_disc_list, + struct nvme_fc_rport, disc_list); + list_del_init(&rport->disc_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + lport = rport->lport; + /* signal discovery. Won't hurt if it repeats */ + nvme_fc_signal_discovery_scan(lport, rport); + nvme_fc_rport_put(rport); + nvme_fc_lport_put(lport); + + spin_lock_irqsave(&nvme_fc_lock, flags); + } + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return count; +} + +static DEVICE_ATTR(nvme_discovery, 0200, NULL, nvme_fc_nvme_discovery_store); + +#ifdef CONFIG_BLK_CGROUP_FC_APPID +/* Parse the cgroup id from a buf and return the length of cgrpid */ +static int fc_parse_cgrpid(const char *buf, u64 *id) +{ + char cgrp_id[16+1]; + int cgrpid_len, j; + + memset(cgrp_id, 0x0, sizeof(cgrp_id)); + for (cgrpid_len = 0, j = 0; cgrpid_len < 17; cgrpid_len++) { + if (buf[cgrpid_len] != ':') + cgrp_id[cgrpid_len] = buf[cgrpid_len]; + else { + j = 1; + break; + } + } + if (!j) + return -EINVAL; + if (kstrtou64(cgrp_id, 16, id) < 0) + return -EINVAL; + return cgrpid_len; +} + +/* + * fc_update_appid: Parse and update the appid in the blkcg associated with + * cgroupid. 
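A standalone sketch of the "<cgroup id in hex>:<application id>" split that fc_parse_cgrpid() above and fc_update_appid() below perform; the sample input string and buffer sizes here are illustrative only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char buf[] = "1a2b3c4d:backup_app";	/* made-up example */
	const char *colon = strchr(buf, ':');
	char cgrp_hex[16 + 1] = { 0 };
	unsigned long long cgrp_id;

	/* at most 16 hex characters may precede the ':' separator */
	if (!colon || (size_t)(colon - buf) > 16)
		return 1;

	memcpy(cgrp_hex, buf, colon - buf);
	cgrp_id = strtoull(cgrp_hex, NULL, 16);
	printf("cgrpid=0x%llx appid=\"%s\"\n", cgrp_id, colon + 1);
	return 0;
}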
+ * @buf: buf contains both cgrpid and appid info + * @count: size of the buffer + */ +static int fc_update_appid(const char *buf, size_t count) +{ + u64 cgrp_id; + int appid_len = 0; + int cgrpid_len = 0; + char app_id[FC_APPID_LEN]; + int ret = 0; + + if (buf[count-1] == '\n') + count--; + + if ((count > (16+1+FC_APPID_LEN)) || (!strchr(buf, ':'))) + return -EINVAL; + + cgrpid_len = fc_parse_cgrpid(buf, &cgrp_id); + if (cgrpid_len < 0) + return -EINVAL; + appid_len = count - cgrpid_len - 1; + if (appid_len > FC_APPID_LEN) + return -EINVAL; + + memset(app_id, 0x0, sizeof(app_id)); + memcpy(app_id, &buf[cgrpid_len+1], appid_len); + ret = blkcg_set_fc_appid(app_id, cgrp_id, sizeof(app_id)); + if (ret < 0) + return ret; + return count; +} + +static ssize_t fc_appid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int ret = 0; + + ret = fc_update_appid(buf, count); + if (ret < 0) + return -EINVAL; + return count; +} +static DEVICE_ATTR(appid_store, 0200, NULL, fc_appid_store); +#endif /* CONFIG_BLK_CGROUP_FC_APPID */ + +static struct attribute *nvme_fc_attrs[] = { + &dev_attr_nvme_discovery.attr, +#ifdef CONFIG_BLK_CGROUP_FC_APPID + &dev_attr_appid_store.attr, +#endif + NULL +}; + +static const struct attribute_group nvme_fc_attr_group = { + .attrs = nvme_fc_attrs, +}; + +static const struct attribute_group *nvme_fc_attr_groups[] = { + &nvme_fc_attr_group, + NULL +}; + +static struct class fc_class = { + .name = "fc", + .dev_groups = nvme_fc_attr_groups, + .owner = THIS_MODULE, +}; + +static int __init nvme_fc_init_module(void) +{ + int ret; + + nvme_fc_wq = alloc_workqueue("nvme_fc_wq", WQ_MEM_RECLAIM, 0); + if (!nvme_fc_wq) + return -ENOMEM; + + /* + * NOTE: + * It is expected that in the future the kernel will combine + * the FC-isms that are currently under scsi and now being + * added to by NVME into a new standalone FC class. The SCSI + * and NVME protocols and their devices would be under this + * new FC class. + * + * As we need something to post FC-specific udev events to, + * specifically for nvme probe events, start by creating the + * new device class. When the new standalone FC class is + * put in place, this code will move to a more generic + * location for the class. 
+ */ + ret = class_register(&fc_class); + if (ret) { + pr_err("couldn't register class fc\n"); + goto out_destroy_wq; + } + + /* + * Create a device for the FC-centric udev events + */ + fc_udev_device = device_create(&fc_class, NULL, MKDEV(0, 0), NULL, + "fc_udev_device"); + if (IS_ERR(fc_udev_device)) { + pr_err("couldn't create fc_udev device!\n"); + ret = PTR_ERR(fc_udev_device); + goto out_destroy_class; + } + + ret = nvmf_register_transport(&nvme_fc_transport); + if (ret) + goto out_destroy_device; + + return 0; + +out_destroy_device: + device_destroy(&fc_class, MKDEV(0, 0)); +out_destroy_class: + class_unregister(&fc_class); +out_destroy_wq: + destroy_workqueue(nvme_fc_wq); + + return ret; +} + +static void +nvme_fc_delete_controllers(struct nvme_fc_rport *rport) +{ + struct nvme_fc_ctrl *ctrl; + + spin_lock(&rport->lock); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: transport unloading: deleting ctrl\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + spin_unlock(&rport->lock); +} + +static void +nvme_fc_cleanup_for_unload(void) +{ + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + list_for_each_entry(rport, &lport->endp_list, endp_list) { + nvme_fc_delete_controllers(rport); + } + } +} + +static void __exit nvme_fc_exit_module(void) +{ + unsigned long flags; + bool need_cleanup = false; + + spin_lock_irqsave(&nvme_fc_lock, flags); + nvme_fc_waiting_to_unload = true; + if (!list_empty(&nvme_fc_lport_list)) { + need_cleanup = true; + nvme_fc_cleanup_for_unload(); + } + spin_unlock_irqrestore(&nvme_fc_lock, flags); + if (need_cleanup) { + pr_info("%s: waiting for ctlr deletes\n", __func__); + wait_for_completion(&nvme_fc_unload_proceed); + pr_info("%s: ctrl deletes complete\n", __func__); + } + + nvmf_unregister_transport(&nvme_fc_transport); + + ida_destroy(&nvme_fc_local_port_cnt); + ida_destroy(&nvme_fc_ctrl_cnt); + + device_destroy(&fc_class, MKDEV(0, 0)); + class_unregister(&fc_class); + destroy_workqueue(nvme_fc_wq); +} + +module_init(nvme_fc_init_module); +module_exit(nvme_fc_exit_module); + +MODULE_LICENSE("GPL v2"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.h new file mode 100644 index 0000000..05ce566 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/fc.h @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2016, Avago Technologies + */ + +#ifndef _NVME_FC_TRANSPORT_H +#define _NVME_FC_TRANSPORT_H 1 + + +/* + * Common definitions between the nvme_fc (host) transport and + * nvmet_fc (target) transport implementation. 
+ */ + +/* + * ****************** FC-NVME LS HANDLING ****************** + */ + +union nvmefc_ls_requests { + struct fcnvme_ls_rqst_w0 w0; + struct fcnvme_ls_cr_assoc_rqst rq_cr_assoc; + struct fcnvme_ls_cr_conn_rqst rq_cr_conn; + struct fcnvme_ls_disconnect_assoc_rqst rq_dis_assoc; + struct fcnvme_ls_disconnect_conn_rqst rq_dis_conn; +} __aligned(128); /* alignment for other things alloc'd with */ + +union nvmefc_ls_responses { + struct fcnvme_ls_rjt rsp_rjt; + struct fcnvme_ls_cr_assoc_acc rsp_cr_assoc; + struct fcnvme_ls_cr_conn_acc rsp_cr_conn; + struct fcnvme_ls_disconnect_assoc_acc rsp_dis_assoc; + struct fcnvme_ls_disconnect_conn_acc rsp_dis_conn; +} __aligned(128); /* alignment for other things alloc'd with */ + +static inline void +nvme_fc_format_rsp_hdr(void *buf, u8 ls_cmd, __be32 desc_len, u8 rqst_ls_cmd) +{ + struct fcnvme_ls_acc_hdr *acc = buf; + + acc->w0.ls_cmd = ls_cmd; + acc->desc_list_len = desc_len; + acc->rqst.desc_tag = cpu_to_be32(FCNVME_LSDESC_RQST); + acc->rqst.desc_len = + fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)); + acc->rqst.w0.ls_cmd = rqst_ls_cmd; +} + +static inline int +nvme_fc_format_rjt(void *buf, u16 buflen, u8 ls_cmd, + u8 reason, u8 explanation, u8 vendor) +{ + struct fcnvme_ls_rjt *rjt = buf; + + nvme_fc_format_rsp_hdr(buf, FCNVME_LSDESC_RQST, + fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_rjt)), + ls_cmd); + rjt->rjt.desc_tag = cpu_to_be32(FCNVME_LSDESC_RJT); + rjt->rjt.desc_len = fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rjt)); + rjt->rjt.reason_code = reason; + rjt->rjt.reason_explanation = explanation; + rjt->rjt.vendor = vendor; + + return sizeof(struct fcnvme_ls_rjt); +} + +/* Validation Error indexes into the string table below */ +enum { + VERR_NO_ERROR = 0, + VERR_CR_ASSOC_LEN = 1, + VERR_CR_ASSOC_RQST_LEN = 2, + VERR_CR_ASSOC_CMD = 3, + VERR_CR_ASSOC_CMD_LEN = 4, + VERR_ERSP_RATIO = 5, + VERR_ASSOC_ALLOC_FAIL = 6, + VERR_QUEUE_ALLOC_FAIL = 7, + VERR_CR_CONN_LEN = 8, + VERR_CR_CONN_RQST_LEN = 9, + VERR_ASSOC_ID = 10, + VERR_ASSOC_ID_LEN = 11, + VERR_NO_ASSOC = 12, + VERR_CONN_ID = 13, + VERR_CONN_ID_LEN = 14, + VERR_INVAL_CONN = 15, + VERR_CR_CONN_CMD = 16, + VERR_CR_CONN_CMD_LEN = 17, + VERR_DISCONN_LEN = 18, + VERR_DISCONN_RQST_LEN = 19, + VERR_DISCONN_CMD = 20, + VERR_DISCONN_CMD_LEN = 21, + VERR_DISCONN_SCOPE = 22, + VERR_RS_LEN = 23, + VERR_RS_RQST_LEN = 24, + VERR_RS_CMD = 25, + VERR_RS_CMD_LEN = 26, + VERR_RS_RCTL = 27, + VERR_RS_RO = 28, + VERR_LSACC = 29, + VERR_LSDESC_RQST = 30, + VERR_LSDESC_RQST_LEN = 31, + VERR_CR_ASSOC = 32, + VERR_CR_ASSOC_ACC_LEN = 33, + VERR_CR_CONN = 34, + VERR_CR_CONN_ACC_LEN = 35, + VERR_DISCONN = 36, + VERR_DISCONN_ACC_LEN = 37, +}; + +static char *validation_errors[] = { + "OK", + "Bad CR_ASSOC Length", + "Bad CR_ASSOC Rqst Length", + "Not CR_ASSOC Cmd", + "Bad CR_ASSOC Cmd Length", + "Bad Ersp Ratio", + "Association Allocation Failed", + "Queue Allocation Failed", + "Bad CR_CONN Length", + "Bad CR_CONN Rqst Length", + "Not Association ID", + "Bad Association ID Length", + "No Association", + "Not Connection ID", + "Bad Connection ID Length", + "Invalid Connection ID", + "Not CR_CONN Cmd", + "Bad CR_CONN Cmd Length", + "Bad DISCONN Length", + "Bad DISCONN Rqst Length", + "Not DISCONN Cmd", + "Bad DISCONN Cmd Length", + "Bad Disconnect Scope", + "Bad RS Length", + "Bad RS Rqst Length", + "Not RS Cmd", + "Bad RS Cmd Length", + "Bad RS R_CTL", + "Bad RS Relative Offset", + "Not LS_ACC", + "Not LSDESC_RQST", + "Bad LSDESC_RQST Length", + "Not CR_ASSOC Rqst", + "Bad CR_ASSOC ACC Length", + "Not 
CR_CONN Rqst", + "Bad CR_CONN ACC Length", + "Not Disconnect Rqst", + "Bad Disconnect ACC Length", +}; + +#define NVME_FC_LAST_LS_CMD_VALUE FCNVME_LS_DISCONNECT_CONN + +static char *nvmefc_ls_names[] = { + "Reserved (0)", + "RJT (1)", + "ACC (2)", + "Create Association", + "Create Connection", + "Disconnect Association", + "Disconnect Connection", +}; + +static inline void +nvmefc_fmt_lsreq_discon_assoc(struct nvmefc_ls_req *lsreq, + struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst, + struct fcnvme_ls_disconnect_assoc_acc *discon_acc, + u64 association_id) +{ + lsreq->rqstaddr = discon_rqst; + lsreq->rqstlen = sizeof(*discon_rqst); + lsreq->rspaddr = discon_acc; + lsreq->rsplen = sizeof(*discon_acc); + lsreq->timeout = NVME_FC_LS_TIMEOUT_SEC; + + discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT_ASSOC; + discon_rqst->desc_list_len = cpu_to_be32( + sizeof(struct fcnvme_lsdesc_assoc_id) + + sizeof(struct fcnvme_lsdesc_disconn_cmd)); + + discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); + discon_rqst->associd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id)); + + discon_rqst->associd.association_id = cpu_to_be64(association_id); + + discon_rqst->discon_cmd.desc_tag = cpu_to_be32( + FCNVME_LSDESC_DISCONN_CMD); + discon_rqst->discon_cmd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_disconn_cmd)); +} + +static inline int +nvmefc_vldt_lsreq_discon_assoc(u32 rqstlen, + struct fcnvme_ls_disconnect_assoc_rqst *rqst) +{ + int ret = 0; + + if (rqstlen < sizeof(struct fcnvme_ls_disconnect_assoc_rqst)) + ret = VERR_DISCONN_LEN; + else if (rqst->desc_list_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_disconnect_assoc_rqst))) + ret = VERR_DISCONN_RQST_LEN; + else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) + ret = VERR_ASSOC_ID; + else if (rqst->associd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id))) + ret = VERR_ASSOC_ID_LEN; + else if (rqst->discon_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD)) + ret = VERR_DISCONN_CMD; + else if (rqst->discon_cmd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_disconn_cmd))) + ret = VERR_DISCONN_CMD_LEN; + /* + * As the standard changed on the LS, check if old format and scope + * something other than Association (e.g. 0). 
+ */ + else if (rqst->discon_cmd.rsvd8[0]) + ret = VERR_DISCONN_SCOPE; + + return ret; +} + +#endif /* _NVME_FC_TRANSPORT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/hwmon.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/hwmon.c new file mode 100644 index 0000000..0a586d7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/hwmon.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express hardware monitoring support + * Copyright (c) 2019, Guenter Roeck + */ + +#include +#include +#include + +#include "nvme.h" + +struct nvme_hwmon_data { + struct nvme_ctrl *ctrl; + struct nvme_smart_log log; + struct mutex read_lock; +}; + +static int nvme_get_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under, + long *temp) +{ + unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT; + u32 status; + int ret; + + if (under) + threshold |= NVME_TEMP_THRESH_TYPE_UNDER; + + ret = nvme_get_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0, + &status); + if (ret > 0) + return -EIO; + if (ret < 0) + return ret; + *temp = kelvin_to_millicelsius(status & NVME_TEMP_THRESH_MASK); + + return 0; +} + +static int nvme_set_temp_thresh(struct nvme_ctrl *ctrl, int sensor, bool under, + long temp) +{ + unsigned int threshold = sensor << NVME_TEMP_THRESH_SELECT_SHIFT; + int ret; + + temp = millicelsius_to_kelvin(temp); + threshold |= clamp_val(temp, 0, NVME_TEMP_THRESH_MASK); + + if (under) + threshold |= NVME_TEMP_THRESH_TYPE_UNDER; + + ret = nvme_set_features(ctrl, NVME_FEAT_TEMP_THRESH, threshold, NULL, 0, + NULL); + if (ret > 0) + return -EIO; + + return ret; +} + +static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) +{ + return nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, + NVME_CSI_NVM, &data->log, sizeof(data->log), 0); +} + +static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct nvme_hwmon_data *data = dev_get_drvdata(dev); + struct nvme_smart_log *log = &data->log; + int temp; + int err; + + /* + * First handle attributes which don't require us to read + * the smart log. 
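The hwmon read/write callbacks here translate between the Kelvin values NVMe reports and the millidegree-Celsius units hwmon expects. A rough standalone sketch of that mapping follows; the helpers mimic, but are not, the kernel's kelvin_to_millicelsius()/millicelsius_to_kelvin().

#include <stdio.h>

static long kelvin_to_mc(long k)   { return k * 1000 - 273150; }
static long mc_to_kelvin(long mc)  { return (mc + 273150 + 500) / 1000; }

int main(void)
{
	long kelvin = 313;	/* a plausible composite temperature reading */
	long mc = kelvin_to_mc(kelvin);

	/* round-trips back to the same Kelvin value */
	printf("%ld K -> %ld mC -> %ld K\n", kelvin, mc, mc_to_kelvin(mc));
	return 0;
}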
+ */ + switch (attr) { + case hwmon_temp_max: + return nvme_get_temp_thresh(data->ctrl, channel, false, val); + case hwmon_temp_min: + return nvme_get_temp_thresh(data->ctrl, channel, true, val); + case hwmon_temp_crit: + *val = kelvin_to_millicelsius(data->ctrl->cctemp); + return 0; + default: + break; + } + + mutex_lock(&data->read_lock); + err = nvme_hwmon_get_smart_log(data); + if (err) + goto unlock; + + switch (attr) { + case hwmon_temp_input: + if (!channel) + temp = get_unaligned_le16(log->temperature); + else + temp = le16_to_cpu(log->temp_sensor[channel - 1]); + *val = kelvin_to_millicelsius(temp); + break; + case hwmon_temp_alarm: + *val = !!(log->critical_warning & NVME_SMART_CRIT_TEMPERATURE); + break; + default: + err = -EOPNOTSUPP; + break; + } +unlock: + mutex_unlock(&data->read_lock); + return err; +} + +static int nvme_hwmon_write(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long val) +{ + struct nvme_hwmon_data *data = dev_get_drvdata(dev); + + switch (attr) { + case hwmon_temp_max: + return nvme_set_temp_thresh(data->ctrl, channel, false, val); + case hwmon_temp_min: + return nvme_set_temp_thresh(data->ctrl, channel, true, val); + default: + break; + } + + return -EOPNOTSUPP; +} + +static const char * const nvme_hwmon_sensor_names[] = { + "Composite", + "Sensor 1", + "Sensor 2", + "Sensor 3", + "Sensor 4", + "Sensor 5", + "Sensor 6", + "Sensor 7", + "Sensor 8", +}; + +static int nvme_hwmon_read_string(struct device *dev, + enum hwmon_sensor_types type, u32 attr, + int channel, const char **str) +{ + *str = nvme_hwmon_sensor_names[channel]; + return 0; +} + +static umode_t nvme_hwmon_is_visible(const void *_data, + enum hwmon_sensor_types type, + u32 attr, int channel) +{ + const struct nvme_hwmon_data *data = _data; + + switch (attr) { + case hwmon_temp_crit: + if (!channel && data->ctrl->cctemp) + return 0444; + break; + case hwmon_temp_max: + case hwmon_temp_min: + if ((!channel && data->ctrl->wctemp) || + (channel && data->log.temp_sensor[channel - 1])) { + if (data->ctrl->quirks & + NVME_QUIRK_NO_TEMP_THRESH_CHANGE) + return 0444; + return 0644; + } + break; + case hwmon_temp_alarm: + if (!channel) + return 0444; + break; + case hwmon_temp_input: + case hwmon_temp_label: + if (!channel || data->log.temp_sensor[channel - 1]) + return 0444; + break; + default: + break; + } + return 0; +} + +static const struct hwmon_channel_info *nvme_hwmon_info[] = { + HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ), + HWMON_CHANNEL_INFO(temp, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_CRIT | HWMON_T_LABEL | HWMON_T_ALARM, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL, + HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_MIN | + HWMON_T_LABEL), + NULL +}; + +static const struct hwmon_ops nvme_hwmon_ops = { + .is_visible = nvme_hwmon_is_visible, + .read = nvme_hwmon_read, + .read_string = nvme_hwmon_read_string, + .write = nvme_hwmon_write, +}; + +static const struct hwmon_chip_info nvme_hwmon_chip_info = { + .ops = &nvme_hwmon_ops, + .info = nvme_hwmon_info, +}; + +int nvme_hwmon_init(struct nvme_ctrl *ctrl) +{ + struct device *dev = ctrl->device; + struct 
nvme_hwmon_data *data; + struct device *hwmon; + int err; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return 0; + + data->ctrl = ctrl; + mutex_init(&data->read_lock); + + err = nvme_hwmon_get_smart_log(data); + if (err) { + dev_warn(dev, "Failed to read smart log (error %d)\n", err); + kfree(data); + return err; + } + + hwmon = hwmon_device_register_with_info(dev, "nvme", + data, &nvme_hwmon_chip_info, + NULL); + if (IS_ERR(hwmon)) { + dev_warn(dev, "Failed to instantiate hwmon device\n"); + kfree(data); + return PTR_ERR(hwmon); + } + ctrl->hwmon_device = hwmon; + return 0; +} + +void nvme_hwmon_exit(struct nvme_ctrl *ctrl) +{ + if (ctrl->hwmon_device) { + struct nvme_hwmon_data *data = + dev_get_drvdata(ctrl->hwmon_device); + + hwmon_device_unregister(ctrl->hwmon_device); + ctrl->hwmon_device = NULL; + kfree(data); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/ioctl.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/ioctl.c new file mode 100644 index 0000000..2231496 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/ioctl.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2011-2014, Intel Corporation. + * Copyright (c) 2017-2021 Christoph Hellwig. + */ +#include /* for force_successful_syscall_return */ +#include +#include "nvme.h" + +/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + +static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, + unsigned len, u32 seed, bool write) +{ + struct bio_integrity_payload *bip; + int ret = -ENOMEM; + void *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + goto out; + + ret = -EFAULT; + if (write && copy_from_user(buf, ubuf, len)) + goto out_free_meta; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) + return buf; + ret = -ENOMEM; +out_free_meta: + kfree(buf); +out: + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u64 *result, unsigned timeout) +{ + bool write = nvme_is_write(cmd); + struct nvme_ns *ns = q->queuedata; + struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (timeout) + req->timeout = timeout; + nvme_req(req)->flags |= NVME_REQ_USERCMD; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + if (bdev) + bio_set_dev(bio, bdev); + if (bdev && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); + goto out_unmap; + } + req->cmd_flags |= REQ_INTEGRITY; + } + } + + ret = nvme_execute_passthru_rq(req); + if (result) + *result = le64_to_cpu(nvme_req(req)->result.u64); + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + kfree(meta); + out_unmap: + if (bio) + blk_rq_unmap_user(bio); + out: + blk_mq_free_request(req); + return ret; +} + + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + if (io.flags) + return -EINVAL; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + + if ((io.control & NVME_RW_PRINFO_PRACT) && + ns->ms == sizeof(struct t10_pi_tuple)) { + /* + * Protection information is stripped/inserted by the + * controller. + */ + if (nvme_to_user_ptr(io.metadata)) + return -EINVAL; + meta_len = 0; + metadata = NULL; + } else { + meta_len = (io.nblocks + 1) * ns->ms; + metadata = nvme_to_user_ptr(io.metadata); + } + + if (ns->features & NVME_NS_EXT_LBAS) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->head->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return nvme_submit_user_cmd(ns->queue, &c, + nvme_to_user_ptr(io.addr), length, + metadata, meta_len, lower_32_bits(io.slba), NULL, 0); +} + +static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, + struct nvme_ns *ns, __u32 nsid) +{ + if (ns && nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u)" + "of namespace\n", + current->comm, nsid, ns->head->ns_id); + return false; + } + + return true; +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + u64 result; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = 
cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &result, timeout); + + if (status >= 0) { + if (put_user(result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd64 __user *ucmd) +{ + struct nvme_passthru_cmd64 cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) + return -EINVAL; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &cmd.result, timeout); + + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static bool is_ctrl_ioctl(unsigned int cmd) +{ + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; + if (is_sed_ioctl(cmd)) + return true; + return false; +} + +static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + default: + return sed_ioctl(ctrl->opal_dev, cmd, argp); + } +} + +#ifdef COMPAT_FOR_U64_ALIGNMENT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) +#endif /* COMPAT_FOR_U64_ALIGNMENT */ + +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->head->ns_id; + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, argp); + /* + * struct nvme_user_io can have different padding on some 32-bit ABIs. + * Just accept the compat version as all fields that are used are the + * same size and at the same offset. 
+ */ +#ifdef COMPAT_FOR_U64_ALIGNMENT + case NVME_IOCTL_SUBMIT_IO32: +#endif + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, argp); + case NVME_IOCTL_IO64_CMD: + return nvme_user_cmd64(ns->ctrl, ns, argp); + default: + return -ENOTTY; + } +} + +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +{ + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); + return nvme_ns_ioctl(ns, cmd, arg); +} + +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + return __nvme_ioctl(ns, cmd, (void __user *)arg); +} + +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = + container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + + return __nvme_ioctl(ns, cmd, (void __user *)arg); +} + +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp, struct nvme_ns_head *head, int srcu_idx) + __releases(&head->srcu) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + nvme_get_ctrl(ns->ctrl); + srcu_read_unlock(&head->srcu, srcu_idx); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + + nvme_put_ctrl(ctrl); + return ret; +} + +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + int srcu_idx, ret = -EWOULDBLOCK; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (!ns) + goto out_unlock; + + /* + * Handle ioctls that apply to the controller instead of the namespace + * seperately and drop the ns SRCU reference early. This avoids a + * deadlock when deleting namespaces using the passthrough interface. 
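As a loose userspace analogy for the locking rule described above (a pthread rwlock standing in for the head's SRCU; this is not how the kernel code itself is structured): the per-path read side is released before work that may have to wait for namespace removal, because that removal in turn waits for all readers to drain.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t head_lock = PTHREAD_RWLOCK_INITIALIZER;

/* stands in for a controller-wide ioctl that may trigger ns deletion */
static void ctrl_scope_ioctl(void)
{
	pthread_rwlock_wrlock(&head_lock);	/* waits for every reader */
	puts("controller-wide work done");
	pthread_rwlock_unlock(&head_lock);
}

int main(void)
{
	pthread_rwlock_rdlock(&head_lock);
	puts("path looked up under the read lock");
	pthread_rwlock_unlock(&head_lock);	/* drop it before blocking work */

	ctrl_scope_ioctl();			/* safe: no read side held */
	return 0;
}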
+ */ + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + + ret = nvme_ns_ioctl(ns, cmd, argp); +out_unlock: + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cdev *cdev = file_inode(file)->i_cdev; + struct nvme_ns_head *head = + container_of(cdev, struct nvme_ns_head, cdev); + void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + int srcu_idx, ret = -EWOULDBLOCK; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (!ns) + goto out_unlock; + + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + + ret = nvme_ns_ioctl(ns, cmd, argp); +out_unlock: + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} +#endif /* CONFIG_NVME_MULTIPATH */ + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + down_read(&ctrl->namespaces_rwsem); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->device, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->device, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + up_read(&ctrl->namespaces_rwsem); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + up_read(&ctrl->namespaces_rwsem); + return ret; +} + +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->device, "resetting controller\n"); + return nvme_reset_ctrl_sync(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; + default: + return -ENOTTY; + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/multipath.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/multipath.c new file mode 100644 index 0000000..ff77523 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/multipath.c @@ -0,0 +1,921 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2017-2018 Christoph Hellwig. 
+ */ + +#include +#include +#include +#include "nvme.h" + +static bool multipath = true; +module_param(multipath, bool, 0444); +MODULE_PARM_DESC(multipath, + "turn on native support for multiple controllers per subsystem"); + +static const char *nvme_iopolicy_names[] = { + [NVME_IOPOLICY_NUMA] = "numa", + [NVME_IOPOLICY_RR] = "round-robin", +}; + +static int iopolicy = NVME_IOPOLICY_NUMA; + +static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp) +{ + if (!val) + return -EINVAL; + if (!strncmp(val, "numa", 4)) + iopolicy = NVME_IOPOLICY_NUMA; + else if (!strncmp(val, "round-robin", 11)) + iopolicy = NVME_IOPOLICY_RR; + else + return -EINVAL; + + return 0; +} + +static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp) +{ + return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]); +} + +module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy, + &iopolicy, 0644); +MODULE_PARM_DESC(iopolicy, + "Default multipath I/O policy; 'numa' (default) or 'round-robin'"); + +void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) +{ + subsys->iopolicy = iopolicy; +} + +void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + list_for_each_entry(h, &subsys->nsheads, entry) + if (h->disk) + blk_mq_unfreeze_queue(h->disk->queue); +} + +void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + list_for_each_entry(h, &subsys->nsheads, entry) + if (h->disk) + blk_mq_freeze_queue_wait(h->disk->queue); +} + +void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + list_for_each_entry(h, &subsys->nsheads, entry) + if (h->disk) + blk_freeze_queue_start(h->disk->queue); +} + +/* + * If multipathing is enabled we need to always use the subsystem instance + * number for numbering our devices to avoid conflicts between subsystems that + * have multiple controllers and thus use the multipath-aware subsystem node + * and those that have a single controller and use the controller node + * directly. + */ +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) +{ + if (!multipath) + return false; + if (!ns->head->disk) { + sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, + ns->head->instance); + return true; + } + sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, + ns->ctrl->instance, ns->head->instance); + *flags = GENHD_FL_HIDDEN; + return true; +} + +void nvme_failover_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + u16 status = nvme_req(req)->status & 0x7ff; + unsigned long flags; + struct bio *bio; + + nvme_mpath_clear_current_path(ns); + + /* + * If we got back an ANA error, we know the controller is alive but not + * ready to serve this namespace. Kick of a re-read of the ANA + * information page, and just try any other available path for now. 
+ */ + if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); + } + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + for (bio = req->bio; bio; bio = bio->bi_next) { + bio_set_dev(bio, ns->head->disk->part0); + if (bio->bi_opf & REQ_POLLED) { + bio->bi_opf &= ~REQ_POLLED; + bio->bi_cookie = BLK_QC_T_NONE; + } + } + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + + blk_mq_end_request(req, 0); + kblockd_schedule_work(&ns->head->requeue_work); +} + +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (!ns->head->disk) + continue; + kblockd_schedule_work(&ns->head->requeue_work); + if (ctrl->state == NVME_CTRL_LIVE) + disk_uevent(ns->head->disk, KOBJ_CHANGE); + } + up_read(&ctrl->namespaces_rwsem); +} + +static const char *nvme_ana_state_names[] = { + [0] = "invalid state", + [NVME_ANA_OPTIMIZED] = "optimized", + [NVME_ANA_NONOPTIMIZED] = "non-optimized", + [NVME_ANA_INACCESSIBLE] = "inaccessible", + [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", + [NVME_ANA_CHANGE] = "change", +}; + +bool nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + bool changed = false; + int node; + + if (!head) + goto out; + + for_each_node(node) { + if (ns == rcu_access_pointer(head->current_path[node])) { + rcu_assign_pointer(head->current_path[node], NULL); + changed = true; + } + } +out: + return changed; +} + +void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + nvme_mpath_clear_current_path(ns); + kblockd_schedule_work(&ns->head->requeue_work); + } + up_read(&ctrl->namespaces_rwsem); +} + +void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + sector_t capacity = get_capacity(head->disk); + int node; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (capacity != get_capacity(ns->disk)) + clear_bit(NVME_NS_READY, &ns->flags); + } + + for_each_node(node) + rcu_assign_pointer(head->current_path[node], NULL); +} + +static bool nvme_path_is_disabled(struct nvme_ns *ns) +{ + /* + * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should + * still be able to complete assuming that the controller is connected. + * Otherwise it will fail immediately and return to the requeue list. 
+ */ + if (ns->ctrl->state != NVME_CTRL_LIVE && + ns->ctrl->state != NVME_CTRL_DELETING) + return true; + if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + !test_bit(NVME_NS_READY, &ns->flags)) + return true; + return false; +} + +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) +{ + int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; + struct nvme_ns *found = NULL, *fallback = NULL, *ns; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (nvme_path_is_disabled(ns)) + continue; + + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) + distance = node_distance(node, ns->ctrl->numa_node); + else + distance = LOCAL_DISTANCE; + + switch (ns->ana_state) { + case NVME_ANA_OPTIMIZED: + if (distance < found_distance) { + found_distance = distance; + found = ns; + } + break; + case NVME_ANA_NONOPTIMIZED: + if (distance < fallback_distance) { + fallback_distance = distance; + fallback = ns; + } + break; + default: + break; + } + } + + if (!found) + found = fallback; + if (found) + rcu_assign_pointer(head->current_path[node], found); + return found; +} + +static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, + struct nvme_ns *ns) +{ + ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, + siblings); + if (ns) + return ns; + return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); +} + +static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, + int node, struct nvme_ns *old) +{ + struct nvme_ns *ns, *found = NULL; + + if (list_is_singular(&head->list)) { + if (nvme_path_is_disabled(old)) + return NULL; + return old; + } + + for (ns = nvme_next_ns(head, old); + ns && ns != old; + ns = nvme_next_ns(head, ns)) { + if (nvme_path_is_disabled(ns)) + continue; + + if (ns->ana_state == NVME_ANA_OPTIMIZED) { + found = ns; + goto out; + } + if (ns->ana_state == NVME_ANA_NONOPTIMIZED) + found = ns; + } + + /* + * The loop above skips the current path for round-robin semantics. + * Fall back to the current path if either: + * - no other optimized path found and current is optimized, + * - no other usable path found and current is usable. 
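A toy model of the wrap-around walk used by the round-robin selector above, over a plain array rather than the RCU-protected sibling list; the path names are invented.

#include <stdio.h>

/* wrap back to the list head after the last entry, like nvme_next_ns() */
static int next_idx(int cur, int n)
{
	return (cur + 1) % n;
}

int main(void)
{
	const char *paths[] = { "pathA", "pathB", "pathC" };
	int cur = 1, i;

	/* visit every *other* path exactly once before falling back */
	for (i = next_idx(cur, 3); i != cur; i = next_idx(i, 3))
		printf("consider %s\n", paths[i]);
	return 0;
}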
+ */ + if (!nvme_path_is_disabled(old) && + (old->ana_state == NVME_ANA_OPTIMIZED || + (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) + return old; + + if (!found) + return NULL; +out: + rcu_assign_pointer(head->current_path[node], found); + return found; +} + +static inline bool nvme_path_is_optimized(struct nvme_ns *ns) +{ + return ns->ctrl->state == NVME_CTRL_LIVE && + ns->ana_state == NVME_ANA_OPTIMIZED; +} + +inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) +{ + int node = numa_node_id(); + struct nvme_ns *ns; + + ns = srcu_dereference(head->current_path[node], &head->srcu); + if (unlikely(!ns)) + return __nvme_find_path(head, node); + + if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) + return nvme_round_robin_path(head, node, ns); + if (unlikely(!nvme_path_is_optimized(ns))) + return __nvme_find_path(head, node); + return ns; +} + +static bool nvme_available_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) + continue; + switch (ns->ctrl->state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + /* fallthru */ + return true; + default: + break; + } + } + return false; +} + +static void nvme_ns_head_submit_bio(struct bio *bio) +{ + struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; + struct device *dev = disk_to_dev(head->disk); + struct nvme_ns *ns; + int srcu_idx; + + /* + * The namespace might be going away and the bio might be moved to a + * different queue via blk_steal_bios(), so we need to use the bio_split + * pool from the original queue to allocate the bvecs from. + */ + blk_queue_split(&bio); + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (likely(ns)) { + bio_set_dev(bio, ns->disk->part0); + bio->bi_opf |= REQ_NVME_MPATH; + trace_block_bio_remap(bio, disk_devt(ns->head->disk), + bio->bi_iter.bi_sector); + submit_bio_noacct(bio); + } else if (nvme_available_path(head)) { + dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); + + spin_lock_irq(&head->requeue_lock); + bio_list_add(&head->requeue_list, bio); + spin_unlock_irq(&head->requeue_lock); + } else { + dev_warn_ratelimited(dev, "no available path - failing I/O\n"); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } + + srcu_read_unlock(&head->srcu, srcu_idx); +} + +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_ns_head *head = disk->private_data; + struct nvme_ns *ns; + int srcu_idx, ret = -EWOULDBLOCK; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (ns) + ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} +#else +#define nvme_ns_head_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .submit_bio = nvme_ns_head_submit_bio, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ns_head_ioctl, + .getgeo = nvme_getgeo, + .report_zones = 
nvme_ns_head_report_zones, + .pr_ops = &nvme_pr_ops, +}; + +static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) +{ + return container_of(cdev, struct nvme_ns_head, cdev); +} + +static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) +{ + if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) + return -ENXIO; + return 0; +} + +static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) +{ + nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); + return 0; +} + +static const struct file_operations nvme_ns_head_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_chr_open, + .release = nvme_ns_head_chr_release, + .unlocked_ioctl = nvme_ns_head_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) +{ + int ret; + + head->cdev_device.parent = &head->subsys->dev; + ret = dev_set_name(&head->cdev_device, "ng%dn%d", + head->subsys->instance, head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&head->cdev, &head->cdev_device, + &nvme_ns_head_chr_fops, THIS_MODULE); + return ret; +} + +static void nvme_requeue_work(struct work_struct *work) +{ + struct nvme_ns_head *head = + container_of(work, struct nvme_ns_head, requeue_work); + struct bio *bio, *next; + + spin_lock_irq(&head->requeue_lock); + next = bio_list_get(&head->requeue_list); + spin_unlock_irq(&head->requeue_lock); + + while ((bio = next) != NULL) { + next = bio->bi_next; + bio->bi_next = NULL; + + submit_bio_noacct(bio); + } +} + +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) +{ + bool vwc = false; + + mutex_init(&head->lock); + bio_list_init(&head->requeue_list); + spin_lock_init(&head->requeue_lock); + INIT_WORK(&head->requeue_work, nvme_requeue_work); + + /* + * Add a multipath node if the subsystems supports multiple controllers. + * We also do this for private namespaces as the namespace sharing data could + * change after a rescan. + */ + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) + return 0; + + head->disk = blk_alloc_disk(ctrl->numa_node); + if (!head->disk) + return -ENOMEM; + head->disk->fops = &nvme_ns_head_ops; + head->disk->private_data = head; + sprintf(head->disk->disk_name, "nvme%dn%d", + ctrl->subsys->instance, head->instance); + + blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + /* + * This assumes all controllers that refer to a namespace either + * support poll queues or not. That is not a strict guarantee, + * but if the assumption is wrong the effect is only suboptimal + * performance but not correctness problem. + */ + if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL && + ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); + + /* set to a default value of 512 until the disk is validated */ + blk_queue_logical_block_size(head->disk->queue, 512); + blk_set_stacking_limits(&head->disk->queue->limits); + + /* we need to propagate up the VMC settings */ + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(head->disk->queue, vwc, vwc); + return 0; +} + +static void nvme_mpath_set_live(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + int rc; + + if (!head->disk) + return; + + /* + * test_and_set_bit() is used because it is protecting against two nvme + * paths simultaneously calling device_add_disk() on the same namespace + * head. 
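+ * The first caller that sets NVME_NSHEAD_DISK_LIVE registers the gendisk
+ * and the per-head char device; later callers skip registration and go
+ * straight to updating the per-node current path below.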
+ */ + if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + rc = device_add_disk(&head->subsys->dev, head->disk, + nvme_ns_id_attr_groups); + if (rc) { + clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags); + return; + } + nvme_add_ns_head_cdev(head); + } + + mutex_lock(&head->lock); + if (nvme_path_is_optimized(ns)) { + int node, srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + for_each_node(node) + __nvme_find_path(head, node); + srcu_read_unlock(&head->srcu, srcu_idx); + } + mutex_unlock(&head->lock); + + synchronize_srcu(&head->srcu); + kblockd_schedule_work(&head->requeue_work); +} + +static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, + int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, + void *)) +{ + void *base = ctrl->ana_log_buf; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); + int error, i; + + lockdep_assert_held(&ctrl->ana_lock); + + for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { + struct nvme_ana_group_desc *desc = base + offset; + u32 nr_nsids; + size_t nsid_buf_size; + + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) + return -EINVAL; + + nr_nsids = le32_to_cpu(desc->nnsids); + nsid_buf_size = flex_array_size(desc, nsids, nr_nsids); + + if (WARN_ON_ONCE(desc->grpid == 0)) + return -EINVAL; + if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state == 0)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) + return -EINVAL; + + offset += sizeof(*desc); + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) + return -EINVAL; + + error = cb(ctrl, desc, data); + if (error) + return error; + + offset += nsid_buf_size; + } + + return 0; +} + +static inline bool nvme_state_is_live(enum nvme_ana_state state) +{ + return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; +} + +static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, + struct nvme_ns *ns) +{ + ns->ana_grpid = le32_to_cpu(desc->grpid); + ns->ana_state = desc->state; + clear_bit(NVME_NS_ANA_PENDING, &ns->flags); + + if (nvme_state_is_live(ns->ana_state)) + nvme_mpath_set_live(ns); +} + +static int nvme_update_ana_state(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; + unsigned *nr_change_groups = data; + struct nvme_ns *ns; + + dev_dbg(ctrl->device, "ANA group %d: %s.\n", + le32_to_cpu(desc->grpid), + nvme_ana_state_names[desc->state]); + + if (desc->state == NVME_ANA_CHANGE) + (*nr_change_groups)++; + + if (!nr_nsids) + return 0; + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + unsigned nsid; +again: + nsid = le32_to_cpu(desc->nsids[n]); + if (ns->head->ns_id < nsid) + continue; + if (ns->head->ns_id == nsid) + nvme_update_ns_ana_state(desc, ns); + if (++n == nr_nsids) + break; + if (ns->head->ns_id > nsid) + goto again; + } + up_read(&ctrl->namespaces_rwsem); + return 0; +} + +static int nvme_read_ana_log(struct nvme_ctrl *ctrl) +{ + u32 nr_change_groups = 0; + int error; + + mutex_lock(&ctrl->ana_lock); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, + ctrl->ana_log_buf, ctrl->ana_log_size, 0); + if (error) { + dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); + goto out_unlock; + } + + error = nvme_parse_ana_log(ctrl, &nr_change_groups, + nvme_update_ana_state); + if (error) + goto out_unlock; + + /* + * In theory we should have an ANATT timer per group as they might enter + *
the change state at different times. But that is a lot of overhead + * just to protect against a target that keeps entering new changes + * states while never finishing previous ones. But we'll still + * eventually time out once all groups are in change state, so this + * isn't a big deal. + * + * We also double the ANATT value to provide some slack for transports + * or AEN processing overhead. + */ + if (nr_change_groups) + mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); + else + del_timer_sync(&ctrl->anatt_timer); +out_unlock: + mutex_unlock(&ctrl->ana_lock); + return error; +} + +static void nvme_ana_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + + if (ctrl->state != NVME_CTRL_LIVE) + return; + + nvme_read_ana_log(ctrl); +} + +static void nvme_anatt_timeout(struct timer_list *t) +{ + struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); + + dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); + nvme_reset_ctrl(ctrl); +} + +void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ + if (!nvme_ctrl_use_ana(ctrl)) + return; + del_timer_sync(&ctrl->anatt_timer); + cancel_work_sync(&ctrl->ana_work); +} + +#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, _store) + +static ssize_t nvme_subsys_iopolicy_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return sysfs_emit(buf, "%s\n", + nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); +} + +static ssize_t nvme_subsys_iopolicy_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + int i; + + for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { + if (sysfs_streq(buf, nvme_iopolicy_names[i])) { + WRITE_ONCE(subsys->iopolicy, i); + return count; + } + } + + return -EINVAL; +} +SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, + nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); + +static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); +} +DEVICE_ATTR_RO(ana_grpid); + +static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); +} +DEVICE_ATTR_RO(ana_state); + +static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + struct nvme_ana_group_desc *dst = data; + + if (desc->grpid != dst->grpid) + return 0; + + *dst = *desc; + return -ENXIO; /* just break out of the loop */ +} + +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + if (nvme_ctrl_use_ana(ns->ctrl)) { + struct nvme_ana_group_desc desc = { + .grpid = id->anagrpid, + .state = 0, + }; + + mutex_lock(&ns->ctrl->ana_lock); + ns->ana_grpid = le32_to_cpu(id->anagrpid); + nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); + mutex_unlock(&ns->ctrl->ana_lock); + if (desc.state) { + /* found the group desc: update */ + nvme_update_ns_ana_state(&desc, ns); + } else { + /* group desc not found: trigger a re-read */ + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); + } + } else { + ns->ana_state = 
NVME_ANA_OPTIMIZED; + nvme_mpath_set_live(ns); + } + + if (blk_queue_stable_writes(ns->queue) && ns->head->disk) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, + ns->head->disk->queue); +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(ns->queue) && ns->head->disk) + ns->head->disk->queue->nr_zones = ns->queue->nr_zones; +#endif +} + +void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + kblockd_schedule_work(&head->requeue_work); + if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + nvme_cdev_del(&head->cdev, &head->cdev_device); + del_gendisk(head->disk); + } +} + +void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + blk_mark_disk_dead(head->disk); + /* make sure all pending bios are cleaned up */ + kblockd_schedule_work(&head->requeue_work); + flush_work(&head->requeue_work); + blk_cleanup_disk(head->disk); +} + +void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) +{ + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + INIT_WORK(&ctrl->ana_work, nvme_ana_work); +} + +int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; + size_t ana_log_size; + int error = 0; + + /* check if multipath is enabled and we have the capability */ + if (!multipath || !ctrl->subsys || + !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) + return 0; + + if (!ctrl->max_namespaces || + ctrl->max_namespaces > le32_to_cpu(id->nn)) { + dev_err(ctrl->device, + "Invalid MNAN value %u\n", ctrl->max_namespaces); + return -EINVAL; + } + + ctrl->anacap = id->anacap; + ctrl->anatt = id->anatt; + ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); + ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); + + ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + + ctrl->max_namespaces * sizeof(__le32); + if (ana_log_size > max_transfer_size) { + dev_err(ctrl->device, + "ANA log page size (%zd) larger than MDTS (%zd).\n", + ana_log_size, max_transfer_size); + dev_err(ctrl->device, "disabling ANA support.\n"); + goto out_uninit; + } + if (ana_log_size > ctrl->ana_log_size) { + nvme_mpath_stop(ctrl); + nvme_mpath_uninit(ctrl); + ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + return -ENOMEM; + } + ctrl->ana_log_size = ana_log_size; + error = nvme_read_ana_log(ctrl); + if (error) + goto out_uninit; + return 0; + +out_uninit: + nvme_mpath_uninit(ctrl); + return error; +} + +void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ + kfree(ctrl->ana_log_buf); + ctrl->ana_log_buf = NULL; + ctrl->ana_log_size = 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.c new file mode 100644 index 0000000..2653427 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2021 NVIDIA Corporation. 
+ */ + +#ifdef CONFIG_NVFS +#define MODULE_PREFIX nvme_v1 +#include "nvfs.h" + +struct nvfs_dma_rw_ops *nvfs_ops = NULL; + +atomic_t nvfs_shutdown = ATOMIC_INIT(1); + +DEFINE_PER_CPU(long, nvfs_n_ops); + +// must have for compatability +#define NVIDIA_FS_COMPAT_FT(ops) \ + (NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops)) + +// protected via nvfs_module_mutex +int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops) +{ + if (NVIDIA_FS_COMPAT_FT(ops)) { + nvfs_ops = ops; + atomic_set(&nvfs_shutdown, 0); + return 0; + } else + return -ENOTSUPP; + +} +EXPORT_SYMBOL(REGISTER_FUNC); + +// protected via nvfs_module_mutex +void UNREGISTER_FUNC (void) +{ + (void) atomic_cmpxchg(&nvfs_shutdown, 0, 1); + do{ + msleep(NVFS_HOLD_TIME_MS); + } while (nvfs_count_ops()); + nvfs_ops = NULL; +} +EXPORT_SYMBOL(UNREGISTER_FUNC); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.h new file mode 100644 index 0000000..abed616 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-dma.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2021 NVIDIA Corporation. + */ + +#ifndef NVFS_DMA_H +#define NVFS_DMA_H + +static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd); + +static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmd, int entries); + +static bool nvme_nvfs_unmap_data(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + enum dma_data_direction dma_dir = rq_dma_dir(req); + + if (!iod || !iod->nents) + return false; + if (iod->sg && !is_pci_p2pdma_page(sg_page(iod->sg)) && + !blk_integrity_rq(req) && + !iod->dma_len && + nvfs_ops != NULL) { + int count; + count = nvfs_ops->nvfs_dma_unmap_sg(dev->dev, iod->sg, iod->nents, + dma_dir); + + if (!count) + return false; + + nvfs_put_ops(); + return true; + } + return false; +} + +static blk_status_t nvme_nvfs_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd, bool *is_nvfs_io) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_dma_dir(req); + blk_status_t ret = BLK_STS_RESOURCE; + int nr_mapped; + + nr_mapped = 0; + *is_nvfs_io = false; + + if (!blk_integrity_rq(req) && nvfs_get_ops()) { + iod->dma_len = 0; + iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); + if (!iod->sg) { + nvfs_put_ops(); + return BLK_STS_RESOURCE; + } + + sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); + // associates bio pages to scatterlist + iod->nents = nvfs_ops->nvfs_blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) { + mempool_free(iod->sg, dev->iod_mempool); + nvfs_put_ops(); + return BLK_STS_IOERR; // reset to original ret + } + *is_nvfs_io = true; + + if (unlikely((iod->nents == NVFS_IO_ERR))) { + pr_err("%s: failed to map sg_nents=:%d\n", __func__, iod->nents); + mempool_free(iod->sg, dev->iod_mempool); + nvfs_put_ops(); + return BLK_STS_IOERR; + } + + nr_mapped = nvfs_ops->nvfs_dma_map_sg_attrs(dev->dev, + iod->sg, + iod->nents, + dma_dir, + DMA_ATTR_NO_WARN); + + if (unlikely((nr_mapped == NVFS_IO_ERR))) { + mempool_free(iod->sg, dev->iod_mempool); + nvfs_put_ops(); + pr_err("%s: failed to dma map sglist=:%d\n", __func__, iod->nents); + return BLK_STS_IOERR; + } + + if (unlikely(nr_mapped == NVFS_CPU_REQ)) { + mempool_free(iod->sg, dev->iod_mempool); + 
nvfs_put_ops(); + BUG(); + } + + iod->use_sgl = nvme_pci_use_sgls(dev, req); + if (iod->use_sgl) { // TBD: not tested on SGL mode supporting drive + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); + } else { + // push dma address to hw registers + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + } + + if (ret != BLK_STS_OK) { + nvme_nvfs_unmap_data(dev, req); + mempool_free(iod->sg, dev->iod_mempool); + } + return ret; + } + return ret; +} + +#endif /* NVFS_DMA_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.c new file mode 100644 index 0000000..0fb7ac2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.c @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2021 NVIDIA Corporation. + */ + +#ifdef CONFIG_NVFS +#define MODULE_PREFIX nvme_rdma_v1 +#include "nvfs.h" + +struct nvfs_dma_rw_ops *nvfs_ops = NULL; + +atomic_t nvfs_shutdown = ATOMIC_INIT(1); + +DEFINE_PER_CPU(long, nvfs_n_ops); + +// must have for compatability +#define NVIDIA_FS_COMPAT_FT(ops) \ + (NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops)) + +// protected via nvfs_module_mutex +int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops) +{ + if (NVIDIA_FS_COMPAT_FT(ops)) { + nvfs_ops = ops; + atomic_set(&nvfs_shutdown, 0); + return 0; + } else + return -ENOTSUPP; + +} +EXPORT_SYMBOL(REGISTER_FUNC); + +// protected via nvfs_module_mutex +void UNREGISTER_FUNC (void) +{ + (void) atomic_cmpxchg(&nvfs_shutdown, 0, 1); + do{ + msleep(NVFS_HOLD_TIME_MS); + } while (nvfs_count_ops()); + nvfs_ops = NULL; +} +EXPORT_SYMBOL(UNREGISTER_FUNC); +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.h new file mode 100644 index 0000000..7649b95 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs-rdma.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2021 NVIDIA Corporation. 
+ */ + +#ifndef NVFS_RDMA_H +#define NVFS_RDMA_H + +#define DEV queue->device->dev->dma_device +#define SGL req->data_sgl.sg_table.sgl + +static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count); + +static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c); + + +static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count); + +static bool nvme_rdma_nvfs_unmap_data(struct nvme_rdma_queue *queue, + struct request *rq) + +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + enum dma_data_direction dma_dir = rq_dma_dir(rq); + int count; + + if (!blk_integrity_rq(rq) && nvfs_ops != NULL) { + count = nvfs_ops->nvfs_dma_unmap_sg(DEV, SGL, req->data_sgl.nents, + dma_dir); + if (count) { + nvfs_put_ops(); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + return true; + } + } + return false; +} + +static int nvme_rdma_nvfs_map_data(struct nvme_rdma_queue *queue, struct request *rq, + struct nvme_command *cmnd, bool *is_nvfs_io) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + enum dma_data_direction dma_dir = rq_dma_dir(rq); + int count, ret = 0; + + *is_nvfs_io = false; + count = 0; + if (!blk_integrity_rq(rq) && nvfs_get_ops()) { + // associates bio pages to scatterlist + count = nvfs_ops->nvfs_blk_rq_map_sg(rq->q, rq , SGL); + if (!count) { + nvfs_put_ops(); + return 0; // fall to cpu path + } + + *is_nvfs_io = true; + if (unlikely((count == NVFS_IO_ERR))) { + nvfs_put_ops(); + pr_err("%s: failed to map sg_nents=:%d\n", __func__, req->data_sgl.nents); + ret = -EIO; + goto out_free_table; + } + req->data_sgl.nents = count; + + count = nvfs_ops->nvfs_dma_map_sg_attrs(DEV, + SGL, + req->data_sgl.nents, + dma_dir, + DMA_ATTR_NO_WARN); + + if (unlikely((count == NVFS_IO_ERR))) { + nvfs_put_ops(); + ret = -EIO; + goto out_free_table; + } + + if (unlikely(count == NVFS_CPU_REQ)) { + nvfs_put_ops(); + BUG(); + return -EIO; + } + + if (count <= dev->num_inline_segments) { + if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + queue->ctrl->use_inline_data && + blk_rq_payload_bytes(rq) <= + nvme_rdma_inline_data_size(queue)) { + ret = nvme_rdma_map_sg_inline(queue, req, cmnd, count); + goto out; + } + + if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + ret = nvme_rdma_map_sg_single(queue, req, cmnd); + goto out; + } + } + + ret = nvme_rdma_map_sg_fr(queue, req, cmnd, count); +out: + if (unlikely(ret)) { + nvme_rdma_nvfs_unmap_data(queue, rq); + } + + return ret; + } else { + // Fall to CPU path + return 0; + } + +out_free_table: + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + return ret; +} + +#endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs.h new file mode 100644 index 0000000..665abc8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvfs.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2021 NVIDIA Corporation. 
+ */ + +#ifndef NVFS_H +#define NVFS_H + +#include +#include +#include +#include +#include +#include +#include + +#define REGSTR2(x) x##_register_nvfs_dma_ops +#define REGSTR(x) REGSTR2(x) + +#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops +#define UNREGSTR(x) UNREGSTR2(x) + +#define REGISTER_FUNC REGSTR(MODULE_PREFIX) +#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX) + +#define NVFS_IO_ERR -1 +#define NVFS_CPU_REQ -2 + +#define NVFS_HOLD_TIME_MS 1000 + +extern struct nvfs_dma_rw_ops *nvfs_ops; + +extern atomic_t nvfs_shutdown; + +DECLARE_PER_CPU(long, nvfs_n_ops); + +static inline long nvfs_count_ops(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nvfs_n_ops, i); + return sum; +} + +static inline bool nvfs_get_ops(void) +{ + if (nvfs_ops && !atomic_read(&nvfs_shutdown)) { + this_cpu_inc(nvfs_n_ops); + return true; + } + return false; +} + +static inline void nvfs_put_ops(void) +{ + this_cpu_dec(nvfs_n_ops); +} + +struct nvfs_dma_rw_ops { + unsigned long long ft_bmap; // feature bitmap + + int (*nvfs_blk_rq_map_sg) (struct request_queue *q, + struct request *req, + struct scatterlist *sglist); + + int (*nvfs_dma_map_sg_attrs) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs); + + int (*nvfs_dma_unmap_sg) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir); + + bool (*nvfs_is_gpu_page) (struct page *page); + + unsigned int (*nvfs_gpu_index) (struct page *page); + + unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int gpu_index); +}; + +// feature list for dma_ops, values indicate bit pos +enum ft_bits { + nvfs_ft_prep_sglist = 1ULL << 0, + nvfs_ft_map_sglist = 1ULL << 1, + nvfs_ft_is_gpu_page = 1ULL << 2, + nvfs_ft_device_priority = 1ULL << 3, +}; + +// check features for use in registration with vendor drivers +#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) ((ops)->ft_bmap & nvfs_ft_prep_sglist) +#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) ((ops)->ft_bmap & nvfs_ft_map_sglist) +#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) ((ops)->ft_bmap & nvfs_ft_is_gpu_page) +#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) ((ops)->ft_bmap & nvfs_ft_device_priority) + +int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops); + +void UNREGISTER_FUNC (void); + +#endif /* NVFS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-core_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-core_dummy.c new file mode 100644 index 0000000..5784c7a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-core_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "nvme-core" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 13, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvme-core dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvme_core_init(void) +{ + return 0; +} + +static void __exit nvme_core_cleanup(void) +{ +} + +module_init(nvme_core_init); +module_exit(nvme_core_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fabrics_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fabrics_dummy.c new file mode 100644 index 0000000..ec999ca --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fabrics_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "nvme-fabrics" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 13, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvme-fabrics dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvme_fabrics_init(void) +{ + return 0; +} + +static void __exit nvme_fabrics_cleanup(void) +{ +} + +module_init(nvme_fabrics_init); +module_exit(nvme_fabrics_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fc_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fc_dummy.c new file mode 100644 index 0000000..fe3b1b6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-fc_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "nvme-fc" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "July 27, 2017" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvme-fc dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvme_fc_init(void) +{ + return 0; +} + +static void __exit nvme_fc_cleanup(void) +{ +} + +module_init(nvme_fc_init); +module_exit(nvme_fc_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-rdma_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-rdma_dummy.c new file mode 100644 index 0000000..3ba86d3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme-rdma_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "nvme-rdma" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 13, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvme-rdma dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvme_rdma_init(void) +{ + return 0; +} + +static void __exit nvme_rdma_cleanup(void) +{ +} + +module_init(nvme_rdma_init); +module_exit(nvme_rdma_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme.h new file mode 100644 index 0000000..58a7983 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme.h @@ -0,0 +1,940 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2011-2014, Intel Corporation. + */ + +#ifndef _NVME_H +#define _NVME_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern unsigned int nvme_io_timeout; +#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) + +extern unsigned int admin_timeout; +#define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) + +#define NVME_DEFAULT_KATO 5 + +#ifdef CONFIG_ARCH_NO_SG_CHAIN +#define NVME_INLINE_SG_CNT 0 +#define NVME_INLINE_METADATA_SG_CNT 0 +#else +#define NVME_INLINE_SG_CNT 2 +#define NVME_INLINE_METADATA_SG_CNT 1 +#endif + +/* + * Default to a 4K page size, with the intention to update this + * path in the future to accommodate architectures with differing + * kernel and IO page sizes. + */ +#define NVME_CTRL_PAGE_SHIFT 12 +#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT) + +extern struct workqueue_struct *nvme_wq; +extern struct workqueue_struct *nvme_reset_wq; +extern struct workqueue_struct *nvme_delete_wq; + +/* + * List of workarounds for devices that required behavior not specified in + * the standard. + */ +enum nvme_quirks { + /* + * Prefers I/O aligned to a stripe size specified in a vendor + * specific Identify field. + */ + NVME_QUIRK_STRIPE_SIZE = (1 << 0), + + /* + * The controller doesn't handle Identify value others than 0 or 1 + * correctly. 
+ */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), + + /* + * The controller deterministically returns O's on reads to + * logical blocks that deallocate was called on. + */ + NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2), + + /* + * The controller needs a delay before starts checking the device + * readiness, which is done by reading the NVME_CSTS_RDY bit. + */ + NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3), + + /* + * APST should not be used. + */ + NVME_QUIRK_NO_APST = (1 << 4), + + /* + * The deepest sleep state should not be used. + */ + NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), + + /* + * Set MEDIUM priority on SQ creation + */ + NVME_QUIRK_MEDIUM_PRIO_SQ = (1 << 7), + + /* + * Ignore device provided subnqn. + */ + NVME_QUIRK_IGNORE_DEV_SUBNQN = (1 << 8), + + /* + * Broken Write Zeroes. + */ + NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), + + /* + * Force simple suspend/resume path. + */ + NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10), + + /* + * Use only one interrupt vector for all queues + */ + NVME_QUIRK_SINGLE_VECTOR = (1 << 11), + + /* + * Use non-standard 128 bytes SQEs. + */ + NVME_QUIRK_128_BYTES_SQES = (1 << 12), + + /* + * Prevent tag overlap between queues + */ + NVME_QUIRK_SHARED_TAGS = (1 << 13), + + /* + * Don't change the value of the temperature threshold feature + */ + NVME_QUIRK_NO_TEMP_THRESH_CHANGE = (1 << 14), + + /* + * The controller doesn't handle the Identify Namespace + * Identification Descriptor list subcommand despite claiming + * NVMe 1.3 compliance. + */ + NVME_QUIRK_NO_NS_DESC_LIST = (1 << 15), + + /* + * The controller does not properly handle DMA addresses over + * 48 bits. + */ + NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16), + + /* + * The controller requires the command_id value be be limited, so skip + * encoding the generation sequence number. + */ + NVME_QUIRK_SKIP_CID_GEN = (1 << 17), + + /* + * Reports garbage in the namespace identifiers (eui64, nguid, uuid). + */ + NVME_QUIRK_BOGUS_NID = (1 << 18), +}; + +/* + * Common request structure for NVMe passthrough. All drivers must have + * this structure as the first member of their request-private data. + */ +struct nvme_request { + struct nvme_command *cmd; + union nvme_result result; + u8 genctr; + u8 retries; + u8 flags; + u16 status; + struct nvme_ctrl *ctrl; +}; + +/* + * Mark a bio as coming in through the mpath node. + */ +#define REQ_NVME_MPATH REQ_DRV + +enum { + NVME_REQ_CANCELLED = (1 << 0), + NVME_REQ_USERCMD = (1 << 1), +}; + +static inline struct nvme_request *nvme_req(struct request *req) +{ + return blk_mq_rq_to_pdu(req); +} + +static inline u16 nvme_req_qid(struct request *req) +{ + if (!req->q->queuedata) + return 0; + + return req->mq_hctx->queue_num + 1; +} + +/* The below value is the specific amount of delay needed before checking + * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the + * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was + * found empirically. + */ +#define NVME_QUIRK_DELAY_AMOUNT 2300 + +/* + * enum nvme_ctrl_state: Controller state + * + * @NVME_CTRL_NEW: New controller just allocated, initial state + * @NVME_CTRL_LIVE: Controller is connected and I/O capable + * @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset) + * @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the + * transport + * @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion) + * @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not + * disabled/failed immediately. 
This state comes + * after all async event processing took place and + * before ns removal and the controller deletion + * progress + * @NVME_CTRL_DEAD: Controller is non-present/unresponsive during + * shutdown or removal. In this case we forcibly + * kill all inflight I/O as they have no chance to + * complete + */ +enum nvme_ctrl_state { + NVME_CTRL_NEW, + NVME_CTRL_LIVE, + NVME_CTRL_RESETTING, + NVME_CTRL_CONNECTING, + NVME_CTRL_DELETING, + NVME_CTRL_DELETING_NOIO, + NVME_CTRL_DEAD, +}; + +struct nvme_fault_inject { +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + struct fault_attr attr; + struct dentry *parent; + bool dont_retry; /* DNR, do not retry */ + u16 status; /* status code */ +#endif +}; + +struct nvme_ctrl { + bool comp_seen; + enum nvme_ctrl_state state; + bool identified; + spinlock_t lock; + struct mutex scan_lock; + const struct nvme_ctrl_ops *ops; + struct request_queue *admin_q; + struct request_queue *connect_q; + struct request_queue *fabrics_q; + struct device *dev; + int instance; + int numa_node; + struct blk_mq_tag_set *tagset; + struct blk_mq_tag_set *admin_tagset; + struct list_head namespaces; + struct rw_semaphore namespaces_rwsem; + struct device ctrl_device; + struct device *device; /* char device */ +#ifdef CONFIG_NVME_HWMON + struct device *hwmon_device; +#endif + struct cdev cdev; + struct work_struct reset_work; + struct work_struct delete_work; + wait_queue_head_t state_wq; + + struct nvme_subsystem *subsys; + struct list_head subsys_entry; + + struct opal_dev *opal_dev; + + char name[12]; + u16 cntlid; + + u32 ctrl_config; + u16 mtfa; + u32 queue_count; + + u64 cap; + u32 max_hw_sectors; + u32 max_segments; + u32 max_integrity_segments; + u32 max_discard_sectors; + u32 max_discard_segments; + u32 max_zeroes_sectors; +#ifdef CONFIG_BLK_DEV_ZONED + u32 max_zone_append; +#endif + u16 crdt[3]; + u16 oncs; + u16 oacs; + u16 nssa; + u16 nr_streams; + u16 sqsize; + u32 max_namespaces; + atomic_t abort_limit; + u8 vwc; + u32 vs; + u32 sgls; + u16 kas; + u8 npss; + u8 apsta; + u16 wctemp; + u16 cctemp; + u32 oaes; + u32 aen_result; + u32 ctratt; + unsigned int shutdown_timeout; + unsigned int kato; + bool subsystem; + unsigned long quirks; + struct nvme_id_power_state psd[32]; + struct nvme_effects_log *effects; + struct xarray cels; + struct work_struct scan_work; + struct work_struct async_event_work; + struct delayed_work ka_work; + struct delayed_work failfast_work; + struct nvme_command ka_cmd; + struct work_struct fw_act_work; + unsigned long events; + +#ifdef CONFIG_NVME_MULTIPATH + /* asymmetric namespace access: */ + u8 anacap; + u8 anatt; + u32 anagrpmax; + u32 nanagrpid; + struct mutex ana_lock; + struct nvme_ana_rsp_hdr *ana_log_buf; + size_t ana_log_size; + struct timer_list anatt_timer; + struct work_struct ana_work; +#endif + + /* Power saving configuration */ + u64 ps_max_latency_us; + bool apst_enabled; + + /* PCIe only: */ + u32 hmpre; + u32 hmmin; + u32 hmminds; + u16 hmmaxd; + + /* Fabrics only */ + u32 ioccsz; + u32 iorcsz; + u16 icdoff; + u16 maxcmd; + int nr_reconnects; + unsigned long flags; +#define NVME_CTRL_FAILFAST_EXPIRED 0 +#define NVME_CTRL_ADMIN_Q_STOPPED 1 + struct nvmf_ctrl_options *opts; + + struct page *discard_page; + unsigned long discard_page_busy; + + struct nvme_fault_inject fault_inject; +}; + +enum nvme_iopolicy { + NVME_IOPOLICY_NUMA, + NVME_IOPOLICY_RR, +}; + +struct nvme_subsystem { + int instance; + struct device dev; + /* + * Because we unregister the device on the last put we need + * a separate refcount. 
+ */ + struct kref ref; + struct list_head entry; + struct mutex lock; + struct list_head ctrls; + struct list_head nsheads; + char subnqn[NVMF_NQN_SIZE]; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u8 cmic; + enum nvme_subsys_type subtype; + u16 vendor_id; + u16 awupf; /* 0's based awupf value. */ + struct ida ns_ida; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_iopolicy iopolicy; +#endif +}; + +/* + * Container structure for uniqueue namespace identifiers. + */ +struct nvme_ns_ids { + u8 eui64[8]; + u8 nguid[16]; + uuid_t uuid; + u8 csi; +}; + +/* + * Anchor structure for namespaces. There is one for each namespace in a + * NVMe subsystem that any of our controllers can see, and the namespace + * structure for each controller is chained of it. For private namespaces + * there is a 1:1 relation to our namespace structures, that is ->list + * only ever has a single entry for private namespaces. + */ +struct nvme_ns_head { + struct list_head list; + struct srcu_struct srcu; + struct nvme_subsystem *subsys; + unsigned ns_id; + struct nvme_ns_ids ids; + struct list_head entry; + struct kref ref; + bool shared; + int instance; + struct nvme_effects_log *effects; + + struct cdev cdev; + struct device cdev_device; + + struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH + struct bio_list requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; + struct mutex lock; + unsigned long flags; +#define NVME_NSHEAD_DISK_LIVE 0 + struct nvme_ns __rcu *current_path[]; +#endif +}; + +static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) +{ + return IS_ENABLED(CONFIG_NVME_MULTIPATH) && head->disk; +} + +enum nvme_ns_features { + NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ + NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ +}; + +struct nvme_ns { + struct list_head list; + + struct nvme_ctrl *ctrl; + struct request_queue *queue; + struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_ana_state ana_state; + u32 ana_grpid; +#endif + struct list_head siblings; + struct kref kref; + struct nvme_ns_head *head; + + int lba_shift; + u16 ms; + u16 sgs; + u32 sws; + u8 pi_type; +#ifdef CONFIG_BLK_DEV_ZONED + u64 zsze; +#endif + unsigned long features; + unsigned long flags; +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 +#define NVME_NS_ANA_PENDING 2 +#define NVME_NS_FORCE_RO 3 +#define NVME_NS_READY 4 +#define NVME_NS_STOPPED 5 + + struct cdev cdev; + struct device cdev_device; + + struct nvme_fault_inject fault_inject; + +}; + +/* NVMe ns supports metadata actions by the controller (generate/strip) */ +static inline bool nvme_ns_has_pi(struct nvme_ns *ns) +{ + return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); +} + +struct nvme_ctrl_ops { + const char *name; + struct module *module; + unsigned int flags; +#define NVME_F_FABRICS (1 << 0) +#define NVME_F_METADATA_SUPPORTED (1 << 1) +#define NVME_F_PCI_P2PDMA (1 << 2) + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); + void (*free_ctrl)(struct nvme_ctrl *ctrl); + void (*submit_async_event)(struct nvme_ctrl *ctrl); + void (*delete_ctrl)(struct nvme_ctrl *ctrl); + void (*stop_ctrl)(struct nvme_ctrl *ctrl); + int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); +}; + +/* + * nvme command_id is constructed as such: + * | xxxx | xxxxxxxxxxxx | + * gen request tag + */ +#define nvme_genctr_mask(gen) (gen & 0xf) 
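+/*
+ * For example (illustrative): genctr 0x3 and block-layer tag 0x2a encode
+ * to command_id 0x302a via nvme_cid() below; nvme_genctr_from_cid() and
+ * nvme_tag_from_cid() recover 0x3 and 0x2a from it on completion.
+ */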
+#define nvme_cid_install_genctr(gen) (nvme_genctr_mask(gen) << 12) +#define nvme_genctr_from_cid(cid) ((cid & 0xf000) >> 12) +#define nvme_tag_from_cid(cid) (cid & 0xfff) + +static inline u16 nvme_cid(struct request *rq) +{ + return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag; +} + +static inline struct request *nvme_find_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + u8 genctr = nvme_genctr_from_cid(command_id); + u16 tag = nvme_tag_from_cid(command_id); + struct request *rq; + + rq = blk_mq_tag_to_rq(tags, tag); + if (unlikely(!rq)) { + pr_err("could not locate request for tag %#x\n", + tag); + return NULL; + } + if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) { + dev_err(nvme_req(rq)->ctrl->device, + "request %#x genctr mismatch (got %#x expected %#x)\n", + tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr)); + return NULL; + } + return rq; +} + +static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id)); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, + const char *dev_name); +void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject); +void nvme_should_fail(struct request *req); +#else +static inline void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, + const char *dev_name) +{ +} +static inline void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inj) +{ +} +static inline void nvme_should_fail(struct request *req) {} +#endif + +static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsystem) + return -ENOTTY; + return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); +} + +/* + * Convert a 512B sector number to a device logical block number. + */ +static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector) +{ + return sector >> (ns->lba_shift - SECTOR_SHIFT); +} + +/* + * Convert a device logical block number to a 512B sector number. + */ +static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba) +{ + return lba << (ns->lba_shift - SECTOR_SHIFT); +} + +/* + * Convert byte length to nvme's 0-based num dwords + */ +static inline u32 nvme_bytes_to_numd(size_t len) +{ + return (len >> 2) - 1; +} + +static inline bool nvme_is_ana_error(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + return true; + default: + return false; + } +} + +static inline bool nvme_is_path_error(u16 status) +{ + /* check for a status code type of 'path related status' */ + return (status & 0x700) == 0x300; +} + +/* + * Fill in the status and result information from the CQE, and then figure out + * if blk-mq will need to use IPI magic to complete the request, and if yes do + * so. If not let the caller complete the request without an indirect function + * call. 
+ */ +static inline bool nvme_try_complete_req(struct request *req, __le16 status, + union nvme_result result) +{ + struct nvme_request *rq = nvme_req(req); + struct nvme_ctrl *ctrl = rq->ctrl; + + if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN)) + rq->genctr++; + + rq->status = le16_to_cpu(status) >> 1; + rq->result = result; + /* inject error when permitted by fault injection framework */ + nvme_should_fail(req); + if (unlikely(blk_should_fake_timeout(req->q))) + return true; + return blk_mq_complete_request_remote(req); +} + +static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) +{ + get_device(ctrl->device); +} + +static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + put_device(ctrl->device); +} + +static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) +{ + return !qid && + nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH; +} + +void nvme_complete_rq(struct request *req); +void nvme_complete_batch_req(struct request *req); + +static __always_inline void nvme_complete_batch(struct io_comp_batch *iob, + void (*fn)(struct request *rq)) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + fn(req); + nvme_complete_batch_req(req); + } + blk_mq_end_request_batch(iob); +} + +blk_status_t nvme_host_path_error(struct request *req); +bool nvme_cancel_request(struct request *req, void *data, bool reserved); +void nvme_cancel_tagset(struct nvme_ctrl *ctrl); +void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); +bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state); +bool nvme_wait_reset(struct nvme_ctrl *ctrl); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_start_ctrl(struct nvme_ctrl *ctrl); +void nvme_stop_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); + +void nvme_remove_namespaces(struct nvme_ctrl *ctrl); + +int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, + bool send); + +void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, + volatile union nvme_result *res); + +void nvme_stop_queues(struct nvme_ctrl *ctrl); +void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); +void nvme_start_admin_queue(struct nvme_ctrl *ctrl); +void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_sync_queues(struct nvme_ctrl *ctrl); +void nvme_sync_io_queues(struct nvme_ctrl *ctrl); +void nvme_unfreeze(struct nvme_ctrl *ctrl); +void nvme_wait_freeze(struct nvme_ctrl *ctrl); +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +void nvme_start_freeze(struct nvme_ctrl *ctrl); + +#define NVME_QID_ANY -1 +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags); +void nvme_cleanup_cmd(struct request *req); +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req); +blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *req); +bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live); + +static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + if (likely(ctrl->state == NVME_CTRL_LIVE)) + return true; + if (ctrl->ops->flags & NVME_F_FABRICS && + ctrl->state == 
NVME_CTRL_DELETING) + return queue_live; + return __nvme_check_ready(ctrl, rq, queue_live); +} +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buf, unsigned bufflen); +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + union nvme_result *result, void *buffer, unsigned bufflen, + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); +int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result); +int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, + unsigned int dword11, void *buffer, size_t buflen, + u32 *result); +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); +int nvme_try_sched_reset(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl(struct nvme_ctrl *ctrl); +void nvme_queue_scan(struct nvme_ctrl *ctrl); +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, + void *log, size_t size, u64 offset); +bool nvme_tryget_ns_head(struct nvme_ns_head *head); +void nvme_put_ns_head(struct nvme_ns_head *head); +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner); +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device); +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); + +extern const struct attribute_group *nvme_ns_id_attr_groups[]; +extern const struct pr_ops nvme_pr_ops; +extern const struct block_device_operations nvme_ns_head_ops; + +struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); +#ifdef CONFIG_NVME_MULTIPATH +static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return ctrl->ana_log_buf != NULL; +} + +void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); +void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); +void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); +void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags); +void nvme_failover_req(struct request *req); +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); +int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); +void nvme_mpath_uninit(struct nvme_ctrl *ctrl); +void nvme_mpath_stop(struct nvme_ctrl *ctrl); +bool nvme_mpath_clear_current_path(struct nvme_ns *ns); +void nvme_mpath_revalidate_paths(struct nvme_ns *ns); +void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); +void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); + +static inline void nvme_trace_bio_complete(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + + 
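+	/* Only bios submitted through the multipath node (REQ_NVME_MPATH) are traced here, against the head disk's queue. */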
if (req->cmd_flags & REQ_NVME_MPATH) + trace_block_bio_complete(ns->head->disk->queue, req->bio); +} + +extern struct device_attribute dev_attr_ana_grpid; +extern struct device_attribute dev_attr_ana_state; +extern struct device_attribute subsys_attr_iopolicy; + +#else +static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return false; +} +static inline bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, + int *flags) +{ + return false; +} +static inline void nvme_failover_req(struct request *req) +{ +} +static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head) +{ + return 0; +} +static inline void nvme_mpath_add_disk(struct nvme_ns *ns, + struct nvme_id_ns *id) +{ +} +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ +} +static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ + return false; +} +static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) +{ +} +static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_trace_bio_complete(struct request *req) +{ +} +static inline void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) +{ + if (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) + dev_warn(ctrl->device, +"Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); + return 0; +} +static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ +} +static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ +} +static inline void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) +{ +} +static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) +{ +} +static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) +{ +} +static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) +{ +} +#endif /* CONFIG_NVME_MULTIPATH */ + +int nvme_revalidate_zones(struct nvme_ns *ns); +int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); +#ifdef CONFIG_BLK_DEV_ZONED +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action); +#else +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action) +{ + return BLK_STS_NOTSUPP; +} + +static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) +{ + dev_warn(ns->ctrl->device, + "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); + return -EPROTONOSUPPORT; +} +#endif + +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + return dev_to_disk(dev)->private_data; +} + +#ifdef CONFIG_NVME_HWMON +int nvme_hwmon_init(struct nvme_ctrl *ctrl); +void nvme_hwmon_exit(struct nvme_ctrl *ctrl); +#else +static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl) +{ + return 0; +} + +static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl) +{ +} +#endif + +static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) +{ + return ctrl->sgls & ((1 << 0) | (1 << 1)); +} + +struct nvme_ns *disk_to_nvme_ns(struct gendisk *disk); +u32 nvme_command_effects(struct 
nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode); +int nvme_execute_passthru_rq(struct request *rq); +struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); +struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +void nvme_put_ns(struct nvme_ns *ns); + +static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + +#endif /* _NVME_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_dummy.c new file mode 100644 index 0000000..12a2c4e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "nvme" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 13, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvme dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvme_init(void) +{ + return 0; +} + +static void __exit nvme_cleanup(void) +{ +} + +module_init(nvme_init); +module_exit(nvme_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_snap_vfio_pci.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_snap_vfio_pci.c new file mode 100644 index 0000000..66025f6 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/nvme_snap_vfio_pci.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + * Author: Max Gurtovoy + */ + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct mlx5_nvme_vfio_pci_device; + +struct mlx5_nvme_pinned_iova_phys { + struct rb_node node; + struct vfio_iommu_iova iommu_iova; +}; + +struct nvme_snap_notifier_block { + struct notifier_block nb; + struct mlx5_nvme_vfio_pci_device *mvdev; +}; + +struct mlx5_nvme_vfio_pci_device { + struct vfio_pci_core_device vdev; + bool efficient_cap; + + /* only valid for efficient capable */ + struct nvme_snap_notifier_block *nb; +}; + +#define SNAP_NVME_ADMIN_VENDOR_IOVA_MGMT 0xC4U +#define SNAP_NVME_IOVA_MGMT_OPM_MAP_RANGE 0x00U +#define SNAP_NVME_IOVA_MGMT_OPM_UNMAP_RANGE 0x01U +#define SNAP_NVME_IOVA_MGMT_OPM_UNMAP_ALL_RANGES 0x02U +#define SNAP_NVME_IOVA_MGMT_DW10_FID_SHIFT 0x0000U +#define SNAP_NVME_IOVA_MGMT_DW10_FID_MASK 0xFFFFU +#define SNAP_NVME_IOVA_MGMT_DW10_OPM_SHIFT 0x0010U +#define SNAP_NVME_IOVA_MGMT_DW10_OPM_MASK 0x000F0000U +#define SNAP_NVME_IOVA_MGMT_DW15_SZU_SHIFT 0x0000U +#define SNAP_NVME_IOVA_MGMT_DW15_SZU_MASK 0x000FU +#define SNAP_NVME_IOVA_MGMT_DW15_SZ_SHIFT 0x0004U +#define SNAP_NVME_IOVA_MGMT_DW15_SZ_MASK 0xFFFFFFF0U + +static void init_iova_mgmt_cmd(struct nvme_command *cmd, unsigned int fid, + unsigned int opm, unsigned int szu, + unsigned int sz, u64 siova, u64 tiova) +{ + u32 dw10, dw11, dw12, dw13, dw14, dw15; + + dw10 = fid << SNAP_NVME_IOVA_MGMT_DW10_FID_SHIFT & + SNAP_NVME_IOVA_MGMT_DW10_FID_MASK; + dw10 |= opm << SNAP_NVME_IOVA_MGMT_DW10_OPM_SHIFT & + SNAP_NVME_IOVA_MGMT_DW10_OPM_MASK; + dw11 = (u32)siova; + dw12 = (u32)(siova >> 32); + dw13 = (u32)tiova; + dw14 = (u32)(tiova >> 32); + dw15 = szu << SNAP_NVME_IOVA_MGMT_DW15_SZU_SHIFT & + SNAP_NVME_IOVA_MGMT_DW15_SZU_MASK; + dw15 |= sz << SNAP_NVME_IOVA_MGMT_DW15_SZ_SHIFT & + SNAP_NVME_IOVA_MGMT_DW15_SZ_MASK; + + cmd->common.opcode = SNAP_NVME_ADMIN_VENDOR_IOVA_MGMT; + cmd->common.flags = 0; + cmd->common.nsid = 0; + cmd->common.metadata = 0; + cmd->common.cdw2[0] = 0; + cmd->common.cdw2[1] = 0; + cmd->common.dptr.prp1 = 0; + cmd->common.dptr.prp2 = 0; + cmd->common.cdw10 = cpu_to_le32(dw10); + cmd->common.cdw11 = cpu_to_le32(dw11); + cmd->common.cdw12 = cpu_to_le32(dw12); + cmd->common.cdw13 = cpu_to_le32(dw13); + cmd->common.cdw14 = cpu_to_le32(dw14); + cmd->common.cdw15 = cpu_to_le32(dw15); +} + +static int mlx5_nvme_vfio_pci_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct nvme_snap_notifier_block *snap_nb = + container_of(nb, struct nvme_snap_notifier_block, nb); + struct mlx5_nvme_vfio_pci_device *mvdev = snap_nb->mvdev; + struct vfio_iommu_iova *iommu_iova; + int fid = mvdev->vdev.pdev->is_physfn ? + 0 : (pci_iov_vf_id(mvdev->vdev.pdev) + 1); + struct pci_dev *pdev; + struct nvme_command cmd; + int status; + unsigned int opm; + + /* Vendor drivers MUST unpin pages in response to an invalidation. 
*/ + if (action != VFIO_IOMMU_NOTIFY_IOVA_UNPIN && + action != VFIO_IOMMU_NOTIFY_IOVA_PIN) + return NOTIFY_DONE; + + pdev = mvdev->vdev.pdev->physfn; + if (pdev) { + iommu_iova = data; + if (action == VFIO_IOMMU_NOTIFY_IOVA_UNPIN) + opm = SNAP_NVME_IOVA_MGMT_OPM_UNMAP_RANGE; + else + opm = SNAP_NVME_IOVA_MGMT_OPM_MAP_RANGE; + + init_iova_mgmt_cmd(&cmd, fid, opm, 0, + iommu_iova->size / 0x1000U, iommu_iova->iova, + iommu_iova->phys); + status = nvme_pdev_admin_passthru_sync(pdev, &cmd, NULL, 0, 0); + if (!status) + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void +nvme_snap_unregister_efficient_notifier(struct mlx5_nvme_vfio_pci_device *mvdev) +{ + struct nvme_snap_notifier_block *nb = mvdev->nb; + int fid = mvdev->vdev.pdev->is_physfn ? + 0 : (pci_iov_vf_id(mvdev->vdev.pdev) + 1); + struct pci_dev *pdev; + struct nvme_command cmd; + + vfio_unregister_notifier(mvdev->vdev.vdev.dev, VFIO_IOMMU_NOTIFY, + &nb->nb); + kfree(nb); + mvdev->nb = NULL; + + pdev = mvdev->vdev.pdev->physfn; + if (pdev) { + init_iova_mgmt_cmd(&cmd, fid, + SNAP_NVME_IOVA_MGMT_OPM_UNMAP_ALL_RANGES, 0, + 0, 0, 0); + nvme_pdev_admin_passthru_sync(pdev, &cmd, NULL, 0, 0); + } +} + +static int +nvme_snap_register_efficient_notifier(struct mlx5_nvme_vfio_pci_device *mvdev) +{ + struct nvme_snap_notifier_block *nb; + unsigned long events; + int ret; + + nb = kzalloc(sizeof(*nb), GFP_KERNEL); + if (!nb) + return -ENOMEM; + + mvdev->nb = nb; + nb->mvdev = mvdev; + + events = VFIO_IOMMU_NOTIFY_IOVA_UNPIN | VFIO_IOMMU_NOTIFY_IOVA_PIN; + nb->nb.notifier_call = mlx5_nvme_vfio_pci_notifier; + ret = vfio_register_notifier(mvdev->vdev.vdev.dev, VFIO_IOMMU_NOTIFY, + &events, &nb->nb); + if (ret) + goto out_free; + + ret = vfio_notify_iova_map(mvdev->vdev.vdev.dev); + if (ret) + goto unregister; + return 0; + +unregister: + vfio_unregister_notifier(mvdev->vdev.vdev.dev, VFIO_IOMMU_NOTIFY, + &nb->nb); +out_free: + kfree(nb); + mvdev->nb = NULL; + return ret; +} + +static int nvme_snap_vfio_pci_open_device(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct mlx5_nvme_vfio_pci_device *mvdev = + container_of(vdev, struct mlx5_nvme_vfio_pci_device, vdev); + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + vfio_pci_core_finish_enable(vdev); + + if (!mvdev->efficient_cap) + goto out; + + ret = nvme_snap_register_efficient_notifier(mvdev); + if (ret) + goto out_err; + +out: + return 0; + +out_err: + vfio_pci_core_disable(vdev); + return ret; +} + +static void nvme_snap_vfio_pci_close_device(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct mlx5_nvme_vfio_pci_device *mvdev = + container_of(vdev, struct mlx5_nvme_vfio_pci_device, vdev); + + if (mvdev->efficient_cap) + nvme_snap_unregister_efficient_notifier(mvdev); + + vfio_pci_core_close_device(core_vdev); +} + +static const struct vfio_device_ops mlx5_nvme_vfio_pci_ops = { + .name = "nvme-snap-vfio-pci", + .open_device = nvme_snap_vfio_pci_open_device, + .close_device = nvme_snap_vfio_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, +}; + +static void init_identify_ctrl_cmd(struct nvme_command *cmd) +{ + cmd->identify.opcode = nvme_admin_identify; + cmd->identify.cns = NVME_ID_CNS_CTRL; +} + +static int 
nvme_snap_fill_caps(struct mlx5_nvme_vfio_pci_device *mvdev) +{ + struct nvme_command cmd = {}; + struct pci_dev *pdev; + struct nvme_id_ctrl *id_ctrl; + int ret; + + pdev = mvdev->vdev.pdev->physfn; + if (!pdev) + return 0; + + id_ctrl = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!id_ctrl) + return -ENOMEM; + + init_identify_ctrl_cmd(&cmd); + ret = nvme_pdev_admin_passthru_sync(pdev, &cmd, id_ctrl, + sizeof(struct nvme_id_ctrl), 0); + if (ret) { + kfree(id_ctrl); + return ret; + } + + if ((le16_to_cpu(id_ctrl->immts) & NVME_IMMTS_UNMAP_RANGED) && + (le16_to_cpu(id_ctrl->immts) & NVME_IMMTS_UNMAP_ALL) && + id_ctrl->imms) { + mvdev->efficient_cap = true; + dev_info(&mvdev->vdev.pdev->dev, "Efficient DMA is supported"); + } + + kfree(id_ctrl); + + return 0; +} + +static int mlx5_nvme_vfio_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct mlx5_nvme_vfio_pci_device *mvdev; + int ret; + + mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL); + if (!mvdev) + return -ENOMEM; + + vfio_pci_core_init_device(&mvdev->vdev, pdev, &mlx5_nvme_vfio_pci_ops); + + if (pdev->is_virtfn) { + ret = nvme_snap_fill_caps(mvdev); + if (ret) + goto out_free; + } + + ret = vfio_pci_core_register_device(&mvdev->vdev); + if (ret) + goto out_free; + + dev_set_drvdata(&pdev->dev, mvdev); + + return 0; + +out_free: + vfio_pci_core_uninit_device(&mvdev->vdev); + kfree(mvdev); + return ret; +} + +static void mlx5_nvme_vfio_pci_remove(struct pci_dev *pdev) +{ + struct mlx5_nvme_vfio_pci_device *mvdev = dev_get_drvdata(&pdev->dev); + + if (mvdev->efficient_cap) + WARN_ON(mvdev->nb); + + vfio_pci_core_unregister_device(&mvdev->vdev); + vfio_pci_core_uninit_device(&mvdev->vdev); + kfree(mvdev); +} + +static const struct pci_device_id mlx5_nvme_vfio_pci_table[] = { + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x6001) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx5_nvme_vfio_pci_table); + +static struct pci_driver mlx5_nvme_vfio_pci_driver = { + .name = "nvme-snap-vfio-pci", + .id_table = mlx5_nvme_vfio_pci_table, + .probe = mlx5_nvme_vfio_pci_probe, + .remove = mlx5_nvme_vfio_pci_remove, + .err_handler = &vfio_pci_core_err_handlers, +}; + +static void __exit nvme_snap_vfio_pci_cleanup(void) +{ + pci_unregister_driver(&mlx5_nvme_vfio_pci_driver); +} + +static int __init nvme_snap_vfio_pci_init(void) +{ + return pci_register_driver(&mlx5_nvme_vfio_pci_driver); +} + +module_init(nvme_snap_vfio_pci_init); +module_exit(nvme_snap_vfio_pci_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Max Gurtovoy "); +MODULE_AUTHOR("Israel Rukshin "); +MODULE_DESCRIPTION( + "NVMe SNAP VFIO PCI - User Level meta-driver for Mellanox NVMe SNAP device family"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.c new file mode 100644 index 0000000..9acc404 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include "nvme.h" +#include "passthru.h" + +static int __nvme_execute_rq(struct gendisk *disk, struct request *rq, + bool at_head) +{ + blk_status_t status; + + status = blk_execute_rq(disk, rq, at_head); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + return -EINTR; + if (nvme_req(rq)->status) + return nvme_req(rq)->status; + return blk_status_to_errno(status); +} + +int nvme_admin_passthru_sync(struct nvme_ctrl *ctrl, struct nvme_command *cmd, + void *buffer, unsigned int bufflen, + unsigned int timeout_ms) +{ + struct request *req; + int ret; + + req = nvme_alloc_request(ctrl->admin_q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (timeout_ms) + req->timeout = msecs_to_jiffies(timeout_ms); + nvme_req(req)->flags |= NVME_REQ_USERCMD; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(ctrl->admin_q, req, buffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + } + + ret = __nvme_execute_rq(NULL, req, false); + +out: + blk_mq_free_request(req); + return ret; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.h new file mode 100644 index 0000000..66e81db --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/passthru.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _NVME_PASSTHRU_H +#define _NVME_PASSTHRU_H + +#include + +int nvme_admin_passthru_sync(struct nvme_ctrl *ctrl, struct nvme_command *cmd, + void *buffer, unsigned int bufflen, + unsigned int timeout_ms); + +#endif /* _NVME_PASSTHRU_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/pci.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/pci.c new file mode 100644 index 0000000..c930ab7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/pci.c @@ -0,0 +1,3829 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "nvme.h" +#ifdef CONFIG_COMPAT_NVME_SNAP_VFIO_PCI +#include "passthru.h" +#endif + +#ifdef CONFIG_NVFS +#include "nvfs.h" +#endif + +#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) +#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) + +#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) + +/* + * These can be higher, but we need to ensure that any command doesn't + * require an sg allocation that needs more than a page of data. + */ +#define NVME_MAX_KB_SZ 4096 +#define NVME_MAX_SEGS 127 + +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + +static bool use_cmb_sqes = true; +module_param(use_cmb_sqes, bool, 0444); +MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); + +static unsigned int max_host_mem_size_mb = 128; +module_param(max_host_mem_size_mb, uint, 0444); +MODULE_PARM_DESC(max_host_mem_size_mb, + "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); + +static unsigned int sgl_threshold = SZ_32K; +module_param(sgl_threshold, uint, 0644); +MODULE_PARM_DESC(sgl_threshold, + "Use SGLs when average request segment size is larger or equal to " + "this size. 
Use 0 to disable SGLs."); + +#define NVME_PCI_MIN_QUEUE_SIZE 2 +#define NVME_PCI_MAX_QUEUE_SIZE 4095 +static int io_queue_depth_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops io_queue_depth_ops = { + .set = io_queue_depth_set, + .get = param_get_uint, +}; + +static unsigned int io_queue_depth = 1024; +module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096"); + +static int io_queue_count_set(const char *val, const struct kernel_param *kp) +{ + unsigned int n; + int ret; + + ret = kstrtouint(val, 10, &n); + if (ret != 0 || n > num_possible_cpus()) + return -EINVAL; + return param_set_uint(val, kp); +} + +static const struct kernel_param_ops io_queue_count_ops = { + .set = io_queue_count_set, + .get = param_get_uint, +}; + +static unsigned int write_queues; +module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644); +MODULE_PARM_DESC(write_queues, + "Number of queues to use for writes. If not set, reads and writes " + "will share a queue set."); + +static unsigned int poll_queues; +module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644); +MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); + +static bool noacpi; +module_param(noacpi, bool, 0444); +MODULE_PARM_DESC(noacpi, "disable acpi bios quirks"); + +static int num_p2p_queues_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops num_p2p_queues_ops = { + .set = num_p2p_queues_set, + .get = param_get_int, +}; + +static unsigned int num_p2p_queues = 0; +module_param_cb(num_p2p_queues, &num_p2p_queues_ops, &num_p2p_queues, S_IRUGO); +MODULE_PARM_DESC(num_p2p_queues, + "number of I/O queues to create for peer-to-peer data transfer per pci function (Default: 0)"); + +struct nvme_dev; +struct nvme_queue; + +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled); +static int nvme_suspend_queue(struct nvme_queue *nvmeq); +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid); +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid); +static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. 
+ */ +struct nvme_dev { + struct nvme_queue *queues; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; + u32 __iomem *dbs; + struct device *dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + unsigned online_queues; + unsigned max_qid; + unsigned io_queues[HCTX_MAX_TYPES]; + unsigned int num_vecs; + u32 q_depth; + int io_sqes; + u32 db_stride; + void __iomem *bar; + unsigned long bar_mapped_size; + struct work_struct remove_work; + struct mutex shutdown_lock; + bool subsystem; + u64 cmb_size; + bool cmb_use_sqes; + u32 cmbsz; + u32 cmbloc; + struct nvme_ctrl ctrl; + unsigned num_p2p_queues; + u32 last_ps; + bool hmb; + + mempool_t *iod_mempool; + + /* shadow doorbell buffer support: */ + u32 *dbbuf_dbs; + dma_addr_t dbbuf_dbs_dma_addr; + u32 *dbbuf_eis; + dma_addr_t dbbuf_eis_dma_addr; + + /* host memory buffer support: */ + u64 host_mem_size; + u32 nr_host_mem_descs; + dma_addr_t host_mem_descs_dma; + struct nvme_host_mem_buf_desc *host_mem_descs; + void **host_mem_desc_bufs; + unsigned int nr_allocated_queues; + unsigned int nr_write_queues; + unsigned int nr_poll_queues; + + bool attrs_added; +}; + +static int io_queue_depth_set(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE, + NVME_PCI_MAX_QUEUE_SIZE); +} + +static int num_p2p_queues_set(const char *val, const struct kernel_param *kp) +{ + unsigned n = 0; + int ret; + + ret = kstrtouint(val, 0, &n); + if (ret != 0 || n > 65534) + return -EINVAL; + + return param_set_uint(val, kp); +} + +static inline unsigned int sq_idx(unsigned int qid, u32 stride) +{ + return qid * 2 * stride; +} + +static inline unsigned int cq_idx(unsigned int qid, u32 stride) +{ + return (qid * 2 + 1) * stride; +} + +static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_dev, ctrl); +} + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct nvme_dev *dev; + spinlock_t sq_lock; + void *sq_cmds; + /* only used for poll queues: */ + spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; + struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + u32 __iomem *q_db; + u32 q_depth; + u16 cq_vector; + u16 sq_tail; + u16 last_sq_tail; + u16 cq_head; + u16 qid; + u8 cq_phase; + u8 sqes; + unsigned long flags; +#define NVMEQ_ENABLED 0 +#define NVMEQ_SQ_CMB 1 +#define NVMEQ_DELETE_ERROR 2 +#define NVMEQ_POLLED 3 + u32 *dbbuf_sq_db; + u32 *dbbuf_cq_db; + u32 *dbbuf_sq_ei; + u32 *dbbuf_cq_ei; + struct completion delete_done; + + /* p2p */ + bool p2p; + struct nvme_peer_resource resource; +}; + +/* + * The nvme_iod describes the data in an I/O. + * + * The sg pointer contains the list of PRP/SGL chunk allocations in addition + * to the actual struct scatterlist. + */ +struct nvme_iod { + struct nvme_request req; + struct nvme_command cmd; + struct nvme_queue *nvmeq; + bool use_sgl; + int aborted; + int npages; /* In the PRP list. 
0 means small pool in use */ + int nents; /* Used in scatterlist */ + dma_addr_t first_dma; + unsigned int dma_len; /* length of single DMA segment mapping */ + dma_addr_t meta_dma; + struct scatterlist *sg; +}; + +static int nvme_peer_init_resource(struct nvme_queue *nvmeq, + enum nvme_peer_resource_mask mask, + void (* stop_master_peer)(void *priv), void *dd_data) +{ + struct nvme_dev *dev = nvmeq->dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); + int qid = nvmeq->qid; + int ret = 0; + + if (mask & NVME_PEER_SQT_DBR) + /* Calculation from NVMe 1.2.1 SPEC */ + nvmeq->resource.sqt_dbr_addr = pci_bus_address(pdev, 0) + (0x1000 + ((2 * (qid)) * (4 << NVME_CAP_STRIDE(dev->ctrl.cap)))); + + if (mask & NVME_PEER_CQH_DBR) + /* Calculation from NVMe 1.2.1 SPEC */ + nvmeq->resource.cqh_dbr_addr = pci_bus_address(pdev, 0) + (0x1000 + ((2 * (qid) + 1) * (4 << NVME_CAP_STRIDE(dev->ctrl.cap)))); + + if (mask & NVME_PEER_SQ_PAS) + nvmeq->resource.sq_dma_addr = nvmeq->sq_dma_addr; + + if (mask & NVME_PEER_CQ_PAS) + nvmeq->resource.cq_dma_addr = nvmeq->cq_dma_addr; + + if (mask & NVME_PEER_SQ_SZ) + nvmeq->resource.nvme_sq_size = SQ_SIZE(nvmeq); + + if (mask & NVME_PEER_CQ_SZ) + nvmeq->resource.nvme_cq_size = CQ_SIZE(nvmeq); + + if (mask & NVME_PEER_MEM_LOG_PG_SZ) + nvmeq->resource.memory_log_page_size = __ffs(NVME_CTRL_PAGE_SIZE >> 12); /* memory_log_page_size is in 4K granularity */ + + nvmeq->resource.flags = NVME_QUEUE_PHYS_CONTIG; + nvmeq->resource.stop_master_peer = stop_master_peer; + nvmeq->resource.dd_data = dd_data; + + return ret; +} + +void nvme_peer_flush_resource(struct nvme_peer_resource *resource, bool restart) +{ + struct nvme_queue *nvmeq = container_of(resource, struct nvme_queue, + resource); + + mutex_lock(&resource->lock); + resource->stop_master_peer = NULL; + resource->dd_data = NULL; + mutex_unlock(&resource->lock); + + if (restart) { + nvme_suspend_queue(nvmeq); + adapter_delete_sq(nvmeq->dev, nvmeq->qid); + adapter_delete_cq(nvmeq->dev, nvmeq->qid); + } +} +EXPORT_SYMBOL_GPL(nvme_peer_flush_resource); + +void nvme_peer_put_resource(struct nvme_peer_resource *resource, bool restart) +{ + struct nvme_queue *nvmeq = container_of(resource, struct nvme_queue, + resource); + mutex_lock(&resource->lock); + resource->in_use = false; + mutex_unlock(&resource->lock); + + // TODO: create/destroy on demand + + /* Restart the queue for future usage */ + if (restart) + nvme_create_queue(nvmeq, nvmeq->qid, false); +} +EXPORT_SYMBOL_GPL(nvme_peer_put_resource); + +struct nvme_peer_resource *nvme_peer_get_resource(struct pci_dev *pdev, + enum nvme_peer_resource_mask mask, + void (* stop_master_peer)(void *priv), void *dd_data) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + struct nvme_queue *nvmeq; + unsigned i; + int ret; + + if (!dev) + return NULL; + + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = &dev->queues[i]; + if (nvmeq->p2p) { + mutex_lock(&nvmeq->resource.lock); + if (!nvmeq->resource.in_use) { + /* + * In case the creation of the queue failed at + * nvme_peer_put_resource(), retry to create it. 
+ */ + if (!test_bit(NVMEQ_ENABLED, &nvmeq->flags)) { + ret = nvme_create_queue(nvmeq, + nvmeq->qid, + false); + dev_info(dev->ctrl.device, + "Recreate qid %d ret = %d\n", + nvmeq->qid, ret); + if (ret) { + mutex_unlock(&nvmeq->resource.lock); + continue; + } + } + + ret = nvme_peer_init_resource(nvmeq, mask, + stop_master_peer, + dd_data); + if (!ret) { + nvmeq->resource.in_use = true; + mutex_unlock(&nvmeq->resource.lock); + return &nvmeq->resource; + } + } + mutex_unlock(&nvmeq->resource.lock); + } + } + + return NULL; +} +EXPORT_SYMBOL_GPL(nvme_peer_get_resource); + +static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) +{ + return dev->nr_allocated_queues * 8 * dev->db_stride; +} + +static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) +{ + unsigned int mem_size = nvme_dbbuf_size(dev); + + if (dev->dbbuf_dbs) { + /* + * Clear the dbbuf memory so the driver doesn't observe stale + * values from the previous instantiation. + */ + memset(dev->dbbuf_dbs, 0, mem_size); + memset(dev->dbbuf_eis, 0, mem_size); + return 0; + } + + dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, + &dev->dbbuf_dbs_dma_addr, + GFP_KERNEL); + if (!dev->dbbuf_dbs) + return -ENOMEM; + dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, + &dev->dbbuf_eis_dma_addr, + GFP_KERNEL); + if (!dev->dbbuf_eis) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; + return -ENOMEM; + } + + return 0; +} + +static void nvme_dbbuf_dma_free(struct nvme_dev *dev) +{ + unsigned int mem_size = nvme_dbbuf_size(dev); + + if (dev->dbbuf_dbs) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; + } + if (dev->dbbuf_eis) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_eis, dev->dbbuf_eis_dma_addr); + dev->dbbuf_eis = NULL; + } +} + +static void nvme_dbbuf_init(struct nvme_dev *dev, + struct nvme_queue *nvmeq, int qid) +{ + if (!dev->dbbuf_dbs || !qid) + return; + + nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)]; +} + +static void nvme_dbbuf_free(struct nvme_queue *nvmeq) +{ + if (!nvmeq->qid) + return; + + nvmeq->dbbuf_sq_db = NULL; + nvmeq->dbbuf_cq_db = NULL; + nvmeq->dbbuf_sq_ei = NULL; + nvmeq->dbbuf_cq_ei = NULL; +} + +static void nvme_dbbuf_set(struct nvme_dev *dev) +{ + struct nvme_command c = { }; + unsigned int i; + + if (!dev->dbbuf_dbs) + return; + + c.dbbuf.opcode = nvme_admin_dbbuf; + c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr); + c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr); + + if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { + dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); + /* Free memory and continue on */ + nvme_dbbuf_dma_free(dev); + + for (i = 1; i <= dev->online_queues; i++) + nvme_dbbuf_free(&dev->queues[i]); + } +} + +static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) +{ + return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old); +} + +/* Update dbbuf and return true if an MMIO is required */ +static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, + volatile u32 *dbbuf_ei) +{ + if (dbbuf_db) { + u16 old_value; + + /* + * Ensure that the queue is written before updating + * the doorbell in memory + */ + wmb(); + + old_value = *dbbuf_db; + *dbbuf_db = value; + + /* + * Ensure 
that the doorbell is updated before reading the event + * index from memory. The controller needs to provide similar + * ordering to ensure the envent index is updated before reading + * the doorbell. + */ + mb(); + + if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) + return false; + } + + return true; +} + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_pci_npages_prp(void) +{ + unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +/* + * Calculates the number of pages needed for the SGL segments. For example a 4k + * page can accommodate 256 SGL descriptors. + */ +static int nvme_pci_npages_sgl(void) +{ + return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), + PAGE_SIZE); +} + +static size_t nvme_pci_iod_alloc_size(void) +{ + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); + + return sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; +} + +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = &dev->queues[0]; + + WARN_ON(hctx_idx != 0); + WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + + hctx->driver_data = nvmeq; + return 0; +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; + + WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); + hctx->driver_data = nvmeq; + return 0; +} + +static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct nvme_dev *dev = set->driver_data; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; + struct nvme_queue *nvmeq = &dev->queues[queue_idx]; + + BUG_ON(!nvmeq); + iod->nvmeq = nvmeq; + + nvme_req(req)->ctrl = &dev->ctrl; + nvme_req(req)->cmd = &iod->cmd; + return 0; +} + +static int queue_irq_offset(struct nvme_dev *dev) +{ + /* if we have more than 1 vec, admin queue offsets us by 1 */ + if (dev->num_vecs > 1) + return 1; + + return 0; +} + +static int nvme_pci_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_dev *dev = set->driver_data; + int i, qoff, offset; + + offset = queue_irq_offset(dev); + for (i = 0, qoff = 0; i < set->nr_maps; i++) { + struct blk_mq_queue_map *map = &set->map[i]; + + map->nr_queues = dev->io_queues[i]; + if (!map->nr_queues) { + BUG_ON(i == HCTX_TYPE_DEFAULT); + continue; + } + + /* + * The poll queue(s) doesn't have an IRQ (and hence IRQ + * affinity), so use the regular blk-mq cpu mapping + */ + map->queue_offset = qoff; + if (i != HCTX_TYPE_POLL && offset) + blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + else + blk_mq_map_queues(map); + qoff += map->nr_queues; + offset += map->nr_queues; + } + + return 0; +} + +/* + * Write sq tail if we are asked to, or if the next command would wrap. 
+ */ +static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq) +{ + if (!write_sq) { + u16 next_tail = nvmeq->sq_tail + 1; + + if (next_tail == nvmeq->q_depth) + next_tail = 0; + if (next_tail != nvmeq->last_sq_tail) + return; + } + + if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, + nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) + writel(nvmeq->sq_tail, nvmeq->q_db); + nvmeq->last_sq_tail = nvmeq->sq_tail; +} + +static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd) +{ + memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), + absolute_pointer(cmd), sizeof(*cmd)); + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; +} + +static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + spin_lock(&nvmeq->sq_lock); + if (nvmeq->sq_tail != nvmeq->last_sq_tail) + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); +} + +static void **nvme_pci_iod_list(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); +} + +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + int nseg = blk_rq_nr_phys_segments(req); + unsigned int avg_seg_size; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); + + if (!nvme_ctrl_sgl_supported(&dev->ctrl)) + return false; + if (!iod->nvmeq->qid) + return false; + if (!sgl_threshold || avg_seg_size < sgl_threshold) + return false; + return true; +} + +static void nvme_free_prps(struct nvme_dev *dev, struct request *req) +{ + const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + dma_addr_t dma_addr = iod->first_dma; + int i; + + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = nvme_pci_iod_list(req)[i]; + dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); + + dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); + dma_addr = next_dma_addr; + } +} + +static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) +{ + const int last_sg = SGES_PER_PAGE - 1; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + dma_addr_t dma_addr = iod->first_dma; + int i; + + for (i = 0; i < iod->npages; i++) { + struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; + dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); + + dma_pool_free(dev->prp_page_pool, sg_list, dma_addr); + dma_addr = next_dma_addr; + } +} + +#ifdef CONFIG_NVFS +#include "nvfs-dma.h" +#endif + +static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + +#ifdef CONFIG_NVFS + if (nvme_nvfs_unmap_data(dev, req)) + return; +#endif + if (is_pci_p2pdma_page(sg_page(iod->sg))) + pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, + rq_dma_dir(req)); + else + dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); +} + +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + if (iod->dma_len) { + dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, + rq_dma_dir(req)); + return; + } + + WARN_ON_ONCE(!iod->nents); + + nvme_unmap_sg(dev, req); + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + iod->first_dma); + else if (iod->use_sgl) + nvme_free_sgls(dev, req); + else + nvme_free_prps(dev, req); + mempool_free(iod->sg, dev->iod_mempool); +} + +static void 
nvme_print_sgl(struct scatterlist *sgl, int nents) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + dma_addr_t phys = sg_phys(sg); + pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " + "dma_address:%pad dma_length:%d\n", + i, &phys, sg->offset, sg->length, &sg_dma_address(sg), + sg_dma_len(sg)); + } +} + +static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct dma_pool *pool; + int length = blk_rq_payload_bytes(req); + struct scatterlist *sg = iod->sg; + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); + __le64 *prp_list; + void **list = nvme_pci_iod_list(req); + dma_addr_t prp_dma; + int nprps, i; + + length -= (NVME_CTRL_PAGE_SIZE - offset); + if (length <= 0) { + iod->first_dma = 0; + goto done; + } + + dma_len -= (NVME_CTRL_PAGE_SIZE - offset); + if (dma_len) { + dma_addr += (NVME_CTRL_PAGE_SIZE - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= NVME_CTRL_PAGE_SIZE) { + iod->first_dma = dma_addr; + goto done; + } + + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) { + iod->first_dma = dma_addr; + iod->npages = -1; + return BLK_STS_RESOURCE; + } + list[0] = prp_list; + iod->first_dma = prp_dma; + i = 0; + for (;;) { + if (i == NVME_CTRL_PAGE_SIZE >> 3) { + __le64 *old_prp_list = prp_list; + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) + goto free_prps; + list[iod->npages++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= NVME_CTRL_PAGE_SIZE; + dma_addr += NVME_CTRL_PAGE_SIZE; + length -= NVME_CTRL_PAGE_SIZE; + if (length <= 0) + break; + if (dma_len > 0) + continue; + if (unlikely(dma_len < 0)) + goto bad_sgl; + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } +done: + cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); + return BLK_STS_OK; +free_prps: + nvme_free_prps(dev, req); + return BLK_STS_RESOURCE; +bad_sgl: + WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), + "Invalid SGL for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), iod->nents); + return BLK_STS_IOERR; +} + +static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, + struct scatterlist *sg) +{ + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; +} + +static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, + dma_addr_t dma_addr, int entries) +{ + sge->addr = cpu_to_le64(dma_addr); + if (entries < SGES_PER_PAGE) { + sge->length = cpu_to_le32(entries * sizeof(*sge)); + sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; + } else { + sge->length = cpu_to_le32(PAGE_SIZE); + sge->type = NVME_SGL_FMT_SEG_DESC << 4; + } +} + +static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmd, int entries) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct dma_pool *pool; + struct nvme_sgl_desc *sg_list; + struct scatterlist *sg = iod->sg; + dma_addr_t 
sgl_dma; + int i = 0; + + /* setting the transfer type as SGL */ + cmd->flags = NVME_CMD_SGL_METABUF; + + if (entries == 1) { + nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + return BLK_STS_OK; + } + + if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) { + iod->npages = -1; + return BLK_STS_RESOURCE; + } + + nvme_pci_iod_list(req)[0] = sg_list; + iod->first_dma = sgl_dma; + + nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); + + do { + if (i == SGES_PER_PAGE) { + struct nvme_sgl_desc *old_sg_desc = sg_list; + struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) + goto free_sgls; + + i = 0; + nvme_pci_iod_list(req)[iod->npages++] = sg_list; + sg_list[i++] = *link; + nvme_pci_sgl_set_seg(link, sgl_dma, entries); + } + + nvme_pci_sgl_set_data(&sg_list[i++], sg); + sg = sg_next(sg); + } while (--entries > 0); + + return BLK_STS_OK; +free_sgls: + nvme_free_sgls(dev, req); + return BLK_STS_RESOURCE; +} + +static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd, + struct bio_vec *bv) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; + + iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); + if (dma_mapping_error(dev->dev, iod->first_dma)) + return BLK_STS_RESOURCE; + iod->dma_len = bv->bv_len; + + cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); + if (bv->bv_len > first_prp_len) + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); + return BLK_STS_OK; +} + +static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd, + struct bio_vec *bv) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); + if (dma_mapping_error(dev->dev, iod->first_dma)) + return BLK_STS_RESOURCE; + iod->dma_len = bv->bv_len; + + cmnd->flags = NVME_CMD_SGL_METABUF; + cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); + cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); + cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; + return BLK_STS_OK; +} + +static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + blk_status_t ret = BLK_STS_RESOURCE; + int nr_mapped; + +#ifdef CONFIG_NVFS + bool is_nvfs_io = false; + ret = nvme_nvfs_map_data(dev, req, cmnd, &is_nvfs_io); + if (is_nvfs_io) + return ret; +#endif + + if (blk_rq_nr_phys_segments(req) == 1) { + struct bio_vec bv = req_bvec(req); + + if (!is_pci_p2pdma_page(bv.bv_page)) { + if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) + return nvme_setup_prp_simple(dev, req, + &cmnd->rw, &bv); + + if (iod->nvmeq->qid && sgl_threshold && + nvme_ctrl_sgl_supported(&dev->ctrl)) + return nvme_setup_sgl_simple(dev, req, + &cmnd->rw, &bv); + } + } + + iod->dma_len = 0; + iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); + if (!iod->sg) + return BLK_STS_RESOURCE; + sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + if (!iod->nents) + goto out_free_sg; + + if (is_pci_p2pdma_page(sg_page(iod->sg))) + nr_mapped = 
pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, + iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); + else + nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, + rq_dma_dir(req), DMA_ATTR_NO_WARN); + if (!nr_mapped) + goto out_free_sg; + + iod->use_sgl = nvme_pci_use_sgls(dev, req); + if (iod->use_sgl) + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); + else + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + if (ret != BLK_STS_OK) + goto out_unmap_sg; + return BLK_STS_OK; + +out_unmap_sg: + nvme_unmap_sg(dev, req); +out_free_sg: + mempool_free(iod->sg, dev->iod_mempool); + return ret; +} + +static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req), + rq_dma_dir(req), 0); + if (dma_mapping_error(dev->dev, iod->meta_dma)) + return BLK_STS_IOERR; + cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); + return BLK_STS_OK; +} + +static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + blk_status_t ret; + + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + + ret = nvme_setup_cmd(req->q->queuedata, req); + if (ret) + return ret; + + if (blk_rq_nr_phys_segments(req)) { + ret = nvme_map_data(dev, req, &iod->cmd); + if (ret) + goto out_free_cmd; + } + + if (blk_integrity_rq(req)) { + ret = nvme_map_metadata(dev, req, &iod->cmd); + if (ret) + goto out_unmap_data; + } + + blk_mq_start_request(req); + return BLK_STS_OK; +out_unmap_data: + nvme_unmap_data(dev, req); +out_free_cmd: + nvme_cleanup_cmd(req); + return ret; +} + +/* + * NOTE: ns is NULL when called on the admin queue. + */ +static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + struct request *req = bd->rq; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + blk_status_t ret; + + /* + * We should not need to do this, but we're still using this to + * ensure we can drain requests on a dying queue. + */ + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) + return BLK_STS_IOERR; + + if (unlikely(!nvme_check_ready(&dev->ctrl, req, true))) + return nvme_fail_nonready_command(&dev->ctrl, req); + + ret = nvme_prep_rq(dev, req); + if (unlikely(ret)) + return ret; + spin_lock(&nvmeq->sq_lock); + nvme_sq_copy_cmd(nvmeq, &iod->cmd); + nvme_write_sq_db(nvmeq, bd->last); + spin_unlock(&nvmeq->sq_lock); + return BLK_STS_OK; +} + +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist) +{ + spin_lock(&nvmeq->sq_lock); + while (!rq_list_empty(*rqlist)) { + struct request *req = rq_list_pop(rqlist); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + nvme_sq_copy_cmd(nvmeq, &iod->cmd); + } + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); +} + +static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) +{ + /* + * We should not need to do this, but we're still using this to + * ensure we can drain requests on a dying queue. 
+ */ + if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) + return false; + if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) + return false; + + req->mq_hctx->tags->rqs[req->tag] = req; + return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; +} + +static void nvme_queue_rqs(struct request **rqlist) +{ + struct request *req, *next, *prev = NULL; + struct request *requeue_list = NULL; + + rq_list_for_each_safe(rqlist, req, next) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + + if (!nvme_prep_rq_batch(nvmeq, req)) { + /* detach 'req' and add to remainder list */ + rq_list_move(rqlist, &requeue_list, req, prev); + + req = prev; + if (!req) + continue; + } + + if (!next || req->mq_hctx != next->mq_hctx) { + /* detach rest of list, and submit */ + req->rq_next = NULL; + nvme_submit_cmds(nvmeq, rqlist); + *rqlist = next; + prev = NULL; + } else + prev = req; + } + + *rqlist = requeue_list; +} + +static __always_inline void nvme_pci_unmap_rq(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = iod->nvmeq->dev; + + if (blk_integrity_rq(req)) + dma_unmap_page(dev->dev, iod->meta_dma, + rq_integrity_vec(req)->bv_len, rq_data_dir(req)); + if (blk_rq_nr_phys_segments(req)) + nvme_unmap_data(dev, req); +} + +static void nvme_pci_complete_rq(struct request *req) +{ + nvme_pci_unmap_rq(req); + nvme_complete_rq(req); +} + +static void nvme_pci_complete_batch(struct io_comp_batch *iob) +{ + nvme_complete_batch(iob, nvme_pci_unmap_rq); +} + +/* We read the CQE phase first to check if the rest of the entry is valid */ +static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) +{ + struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head]; + + return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase; +} + +static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) +{ + u16 head = nvmeq->cq_head; + + if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, + nvmeq->dbbuf_cq_ei)) + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); +} + +static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) +{ + if (!nvmeq->qid) + return nvmeq->dev->admin_tagset.tags[0]; + return nvmeq->dev->tagset.tags[nvmeq->qid - 1]; +} + +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, + struct io_comp_batch *iob, u16 idx) +{ + struct nvme_completion *cqe = &nvmeq->cqes[idx]; + __u16 command_id = READ_ONCE(cqe->command_id); + struct request *req; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
+ */ + if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) { + nvme_complete_async_event(&nvmeq->dev->ctrl, + cqe->status, &cqe->result); + return; + } + + req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); + if (unlikely(!req)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + command_id, le16_to_cpu(cqe->sq_id)); + return; + } + + trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); + if (!nvme_try_complete_req(req, cqe->status, cqe->result) && + !blk_mq_add_to_batch(req, iob, nvme_req(req)->status, + nvme_pci_complete_batch)) + nvme_pci_complete_rq(req); +} + +static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) +{ + u32 tmp = nvmeq->cq_head + 1; + + if (tmp == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase ^= 1; + } else { + nvmeq->cq_head = tmp; + } +} + +static inline int nvme_poll_cq(struct nvme_queue *nvmeq, + struct io_comp_batch *iob) +{ + int found = 0; + + while (nvme_cqe_pending(nvmeq)) { + found++; + /* + * load-load control dependency between phase and the rest of + * the cqe requires a full read memory barrier + */ + dma_rmb(); + nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head); + nvme_update_cq_head(nvmeq); + } + + if (found) + nvme_ring_cq_doorbell(nvmeq); + return found; +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + DEFINE_IO_COMP_BATCH(iob); + + if (nvme_poll_cq(nvmeq, &iob)) { + if (!rq_list_empty(iob.req_list)) + nvme_pci_complete_batch(&iob); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + + if (nvme_cqe_pending(nvmeq)) + return IRQ_WAKE_THREAD; + return IRQ_NONE; +} + +/* + * Poll for completions for any interrupt driven queue + * Can be called from any context. + */ +static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) +{ + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); + + if (nvmeq->p2p) + return; + + WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); + + disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + nvme_poll_cq(nvmeq, NULL); + enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); +} + +static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + bool found; + + if (!nvme_cqe_pending(nvmeq)) + return 0; + + spin_lock(&nvmeq->cq_poll_lock); + found = nvme_poll_cq(nvmeq, iob); + spin_unlock(&nvmeq->cq_poll_lock); + + return found; +} + +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + struct nvme_queue *nvmeq = &dev->queues[0]; + struct nvme_command c = { }; + + c.common.opcode = nvme_admin_async_event; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; + + spin_lock(&nvmeq->sq_lock); + nvme_sq_copy_cmd(nvmeq, &c); + nvme_write_sq_db(nvmeq, true); + spin_unlock(&nvmeq->sq_lock); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + struct nvme_command c = { }; + + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq, s16 vector) +{ + struct nvme_command c = { }; + int flags = NVME_QUEUE_PHYS_CONTIG; + + if (!test_bit(NVMEQ_POLLED, &nvmeq->flags) && !nvmeq->p2p) + flags |= NVME_CQ_IRQ_ENABLED; + + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. 
+ */ + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(vector); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_ctrl *ctrl = &dev->ctrl; + struct nvme_command c = { }; + int flags = NVME_QUEUE_PHYS_CONTIG; + + /* + * Some drives have a bug that auto-enables WRRU if MEDIUM isn't + * set. Since URGENT priority is zeroes, it makes all queues + * URGENT. + */ + if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ) + flags |= NVME_SQ_PRIO_MEDIUM; + + /* + * Note: we (ab)use the fact that the prp fields survive if no data + * is attached to the request. + */ + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static void abort_endio(struct request *req, blk_status_t error) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + + dev_warn(nvmeq->dev->ctrl.device, + "Abort status: 0x%x", nvme_req(req)->status); + atomic_inc(&nvmeq->dev->ctrl.abort_limit); + blk_mq_free_request(req); +} + +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset/reinit ongoing, we shouldn't reset again. */ + switch (dev->ctrl.state) { + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + return false; + default: + break; + } + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + return true; +} + +static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) +{ + /* Read a config register to help see what died. 
*/ + u16 pci_status; + int result; + + result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, + &pci_status); + if (result == PCIBIOS_SUCCESSFUL) + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", + csts, pci_status); + else + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", + csts, result); + + if (csts != ~0) + return; + + dev_warn(dev->ctrl.device, + "Does your device have a faulty power saving mode enabled?\n"); + dev_warn(dev->ctrl.device, + "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n"); +} + +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + struct nvme_dev *dev = nvmeq->dev; + struct request *abort_req; + struct nvme_command cmd = { }; + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + mb(); + if (pci_channel_offline(to_pci_dev(dev->dev))) + return BLK_EH_RESET_TIMER; + + /* + * Reset immediately if the controller is failed + */ + if (nvme_should_reset(dev, csts)) { + nvme_warn_reset(dev, csts); + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + return BLK_EH_DONE; + } + + /* + * Did we miss an interrupt? + */ + if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) + nvme_poll(req->mq_hctx, NULL); + else + nvme_poll_irqdisable(nvmeq); + + if (blk_mq_request_completed(req)) { + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, completion polled\n", + req->tag, nvmeq->qid); + return BLK_EH_DONE; + } + + /* + * Shutdown immediately if controller times out while starting. The + * reset work will see the pci device disabled when it gets the forced + * cancellation error. All outstanding requests are completed on + * shutdown, so we return BLK_EH_DONE. + */ + switch (dev->ctrl.state) { + case NVME_CTRL_CONNECTING: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + fallthrough; + case NVME_CTRL_DELETING: + dev_warn_ratelimited(dev->ctrl.device, + "I/O %d QID %d timeout, disable controller\n", + req->tag, nvmeq->qid); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; + nvme_dev_disable(dev, true); + return BLK_EH_DONE; + case NVME_CTRL_RESETTING: + return BLK_EH_RESET_TIMER; + default: + break; + } + + /* + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. 
+ */ + if (!nvmeq->qid || iod->aborted) { + dev_warn(dev->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); + nvme_req(req)->flags |= NVME_REQ_CANCELLED; + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + + return BLK_EH_DONE; + } + + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + iod->aborted = 1; + + cmd.abort.opcode = nvme_admin_abort_cmd; + cmd.abort.cid = nvme_cid(req); + cmd.abort.sqid = cpu_to_le16(nvmeq->qid); + + dev_warn(nvmeq->dev->ctrl.device, + "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); + + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, + BLK_MQ_REQ_NOWAIT); + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + abort_req->end_io_data = NULL; + blk_execute_rq_nowait(abort_req, false, abort_endio); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; +} + +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + if (!nvmeq->sq_cmds) + return; + + if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { + pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev), + nvmeq->sq_cmds, SQ_SIZE(nvmeq)); + } else { + dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + } +} + +static void nvme_free_queues(struct nvme_dev *dev, int lowest) +{ + int i; + + for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { + dev->ctrl.queue_count--; + nvme_free_queue(&dev->queues[i]); + } +} + +/** + * nvme_suspend_queue - put queue into suspended state + * @nvmeq: queue to suspend + */ +static int nvme_suspend_queue(struct nvme_queue *nvmeq) +{ + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) + return 1; + + /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ + mb(); + + if(nvmeq->p2p) { + mutex_lock(&nvmeq->resource.lock); + if (nvmeq->resource.in_use && nvmeq->resource.stop_master_peer) { + mutex_unlock(&nvmeq->resource.lock); + nvmeq->resource.stop_master_peer(nvmeq->resource.dd_data); + } else + mutex_unlock(&nvmeq->resource.lock); + } + + nvmeq->dev->online_queues--; + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + nvme_stop_admin_queue(&nvmeq->dev->ctrl); + if (!nvmeq->p2p && !test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) + pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); + return 0; +} + +static void nvme_suspend_io_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->ctrl.queue_count - 1; i > 0; i--) + nvme_suspend_queue(&dev->queues[i]); +} + +static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) +{ + struct nvme_queue *nvmeq = &dev->queues[0]; + + if (shutdown) + nvme_shutdown_ctrl(&dev->ctrl); + else + nvme_disable_ctrl(&dev->ctrl); + + nvme_poll_irqdisable(nvmeq); +} + +/* + * Called only on a device that has been disabled and after all other threads + * that can check this device's completion queues have synced, except + * nvme_poll(). This is the last chance for the driver to see a natural + * completion before nvme_cancel_request() terminates all incomplete requests. 
+ */ +static void nvme_reap_pending_cqes(struct nvme_dev *dev) +{ + int i; + + for (i = dev->ctrl.queue_count - 1; i > 0; i--) { + if (dev->queues[i].p2p) + continue; + spin_lock(&dev->queues[i].cq_poll_lock); + nvme_poll_cq(&dev->queues[i], NULL); + spin_unlock(&dev->queues[i].cq_poll_lock); + } +} + +static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, + int entry_size) +{ + int q_depth = dev->q_depth; + unsigned q_size_aligned = roundup(q_depth * entry_size, + NVME_CTRL_PAGE_SIZE); + + if (q_size_aligned * nr_io_queues > dev->cmb_size) { + u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); + + mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE); + q_depth = div_u64(mem_per_q, entry_size); + + /* + * Ensure the reduced q_depth is above some threshold where it + * would be better to map queues in system memory with the + * original depth + */ + if (q_depth < 64) + return -ENOMEM; + } + + return q_depth; +} + +static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, + int qid) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { + nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq)); + if (nvmeq->sq_cmds) { + nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, + nvmeq->sq_cmds); + if (nvmeq->sq_dma_addr) { + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); + return 0; + } + + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq)); + } + } + + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + return -ENOMEM; + return 0; +} + +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) +{ + struct nvme_queue *nvmeq = &dev->queues[qid]; + + if (dev->ctrl.queue_count > qid) + return 0; + + nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES; + nvmeq->q_depth = depth; + nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto free_nvmeq; + + if (nvme_alloc_sq_cmds(dev, nvmeq, qid)) + goto free_cqdma; + + nvmeq->dev = dev; + spin_lock_init(&nvmeq->sq_lock); + spin_lock_init(&nvmeq->cq_poll_lock); + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + nvmeq->qid = qid; + nvmeq->p2p = qid > (dev->max_qid - dev->num_p2p_queues); + if (nvmeq->p2p) + mutex_init(&nvmeq->resource.lock); + dev->ctrl.queue_count++; + + return 0; + + free_cqdma: + dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + return -ENOMEM; +} + +static int queue_request_irq(struct nvme_queue *nvmeq) +{ + struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); + int nr = nvmeq->dev->ctrl.instance; + + if (use_threaded_interrupts) { + return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check, + nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid); + } else { + return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq, + NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid); + } +} + +static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +{ + struct nvme_dev *dev = nvmeq->dev; + + nvmeq->sq_tail = 0; + nvmeq->last_sq_tail = 0; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq)); + nvme_dbbuf_init(dev, nvmeq, qid); + dev->online_queues++; + wmb(); /* ensure the first interrupt sees the initialization */ +} + +/* + * Try getting shutdown_lock while setting up IO queues. 
+ */ +static int nvme_setup_io_queues_trylock(struct nvme_dev *dev) +{ + /* + * Give up if the lock is being held by nvme_dev_disable. + */ + if (!mutex_trylock(&dev->shutdown_lock)) + return -ENODEV; + + /* + * Controller is in wrong state, fail early. + */ + if (dev->ctrl.state != NVME_CTRL_CONNECTING) { + mutex_unlock(&dev->shutdown_lock); + return -ENODEV; + } + + return 0; +} + +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) +{ + struct nvme_dev *dev = nvmeq->dev; + int result; + u16 vector = 0; + + clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); + + /* + * A queue's vector matches the queue identifier unless the controller + * has only one vector available. + */ + if (!polled && !nvmeq->p2p) + vector = dev->num_vecs == 1 ? 0 : qid; + else if (polled) + set_bit(NVMEQ_POLLED, &nvmeq->flags); + + result = adapter_alloc_cq(dev, qid, nvmeq, vector); + if (result) + return result; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + return result; + if (result) + goto release_cq; + + nvmeq->cq_vector = vector; + + if (nvmeq->p2p) { + if (!mutex_trylock(&dev->shutdown_lock)) + result = -ENODEV; + } else { + result = nvme_setup_io_queues_trylock(dev); + } + if (result) + return result; + nvme_init_queue(nvmeq, qid); + if (!nvmeq->p2p && !polled) { + result = queue_request_irq(nvmeq); + if (result < 0) + goto release_sq; + } + + set_bit(NVMEQ_ENABLED, &nvmeq->flags); + mutex_unlock(&dev->shutdown_lock); + return result; + +release_sq: + dev->online_queues--; + mutex_unlock(&dev->shutdown_lock); + adapter_delete_sq(dev, qid); +release_cq: + adapter_delete_cq(dev, qid); + return result; +} + +static const struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_queue_rq, + .complete = nvme_pci_complete_rq, + .init_hctx = nvme_admin_init_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static const struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .queue_rqs = nvme_queue_rqs, + .complete = nvme_pci_complete_rq, + .commit_rqs = nvme_commit_rqs, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .map_queues = nvme_pci_map_queues, + .timeout = nvme_timeout, + .poll = nvme_poll, +}; + +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + /* + * If the controller was reset during removal, it's possible + * user requests may be waiting on a stopped queue. Start the + * queue to flush these to completion. 
+ */ + nvme_start_admin_queue(&dev->ctrl); + blk_cleanup_queue(dev->ctrl.admin_q); + blk_mq_free_tag_set(&dev->admin_tagset); + } +} + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->ctrl.admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + + dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; + dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev->ctrl.numa_node; + dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); + dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + dev->ctrl.admin_tagset = &dev->admin_tagset; + + dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->ctrl.admin_q)) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + if (!blk_get_queue(dev->ctrl.admin_q)) { + nvme_dev_remove_admin(dev); + dev->ctrl.admin_q = NULL; + return -ENODEV; + } + } else + nvme_start_admin_queue(&dev->ctrl); + + return 0; +} + +static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); +} + +static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (size <= dev->bar_mapped_size) + return 0; + if (size > pci_resource_len(pdev, 0)) + return -ENOMEM; + if (dev->bar) + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (!dev->bar) { + dev->bar_mapped_size = 0; + return -ENOMEM; + } + dev->bar_mapped_size = size; + dev->dbs = dev->bar + NVME_REG_DBS; + + return 0; +} + +static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + struct nvme_queue *nvmeq; + + result = nvme_remap_bar(dev, db_bar_size(dev, 0)); + if (result < 0) + return result; + + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? 
+ NVME_CAP_NSSRC(dev->ctrl.cap) : 0; + + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); + + result = nvme_disable_ctrl(&dev->ctrl); + if (result < 0) + return result; + + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); + if (result) + return result; + + dev->ctrl.numa_node = dev_to_node(dev->dev); + + nvmeq = &dev->queues[0]; + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + writel(aqa, dev->bar + NVME_REG_AQA); + lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); + + result = nvme_enable_ctrl(&dev->ctrl); + if (result) + return result; + + nvmeq->cq_vector = 0; + nvme_init_queue(nvmeq, 0); + result = queue_request_irq(nvmeq); + if (result) { + dev->online_queues--; + return result; + } + + set_bit(NVMEQ_ENABLED, &nvmeq->flags); + return result; +} + +static int nvme_create_io_queues(struct nvme_dev *dev) +{ + unsigned i, max, rw_queues; + int ret = 0; + + for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { + if (nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; + break; + } + } + + max = min(dev->max_qid, dev->ctrl.queue_count - 1); + if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) { + rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] + + dev->io_queues[HCTX_TYPE_READ]; + } else { + rw_queues = max; + } + + for (i = dev->online_queues; i <= max; i++) { + bool polled = i > rw_queues && !dev->queues[i].p2p; + + ret = nvme_create_queue(&dev->queues[i], i, polled); + if (ret) + break; + } + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. + */ + return ret >= 0 ? 0 : ret; +} + +static ssize_t nvme_num_p2p_queues_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + unsigned num_p2p_queues = ndev->online_queues > 1 ? 
+ ndev->num_p2p_queues : 0; + + return scnprintf(buf, PAGE_SIZE, "%u\n", num_p2p_queues); +} +static DEVICE_ATTR(num_p2p_queues, S_IRUGO, nvme_num_p2p_queues_show, NULL); + +static u64 nvme_cmb_size_unit(struct nvme_dev *dev) +{ + u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; + + return 1ULL << (12 + 4 * szu); +} + +static u32 nvme_cmb_size(struct nvme_dev *dev) +{ + return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; +} + +static void nvme_map_cmb(struct nvme_dev *dev) +{ + u64 size, offset; + resource_size_t bar_size; + struct pci_dev *pdev = to_pci_dev(dev->dev); + int bar; + + if (dev->cmb_size) + return; + + if (NVME_CAP_CMBS(dev->ctrl.cap)) + writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC); + + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); + if (!dev->cmbsz) + return; + dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); + + size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); + offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); + bar = NVME_CMB_BIR(dev->cmbloc); + bar_size = pci_resource_len(pdev, bar); + + if (offset > bar_size) + return; + + /* + * Tell the controller about the host side address mapping the CMB, + * and enable CMB decoding for the NVMe 1.4+ scheme: + */ + if (NVME_CAP_CMBS(dev->ctrl.cap)) { + hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE | + (pci_bus_address(pdev, bar) + offset), + dev->bar + NVME_REG_CMBMSC); + } + + /* + * Controllers may support a CMB size larger than their BAR, + * for example, due to being behind a bridge. Reduce the CMB to + * the reported size of the BAR + */ + if (size > bar_size - offset) + size = bar_size - offset; + + if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { + dev_warn(dev->ctrl.device, + "failed to register the CMB\n"); + return; + } + + dev->cmb_size = size; + dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS); + + if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == + (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) + pci_p2pmem_publish(pdev, true); +} + +static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) +{ + u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT; + u64 dma_addr = dev->host_mem_descs_dma; + struct nvme_command c = { }; + int ret; + + c.features.opcode = nvme_admin_set_features; + c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); + c.features.dword11 = cpu_to_le32(bits); + c.features.dword12 = cpu_to_le32(host_mem_size); + c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); + c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); + c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); + + ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); + if (ret) { + dev_warn(dev->ctrl.device, + "failed to set host mem (err %d, flags %#x).\n", + ret, bits); + } else + dev->hmb = bits & NVME_HOST_MEM_ENABLE; + + return ret; +} + +static void nvme_free_host_mem(struct nvme_dev *dev) +{ + int i; + + for (i = 0; i < dev->nr_host_mem_descs; i++) { + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; + size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE; + + dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], + le64_to_cpu(desc->addr), + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); + } + + kfree(dev->host_mem_desc_bufs); + dev->host_mem_desc_bufs = NULL; + dma_free_coherent(dev->dev, + dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), + dev->host_mem_descs, dev->host_mem_descs_dma); + dev->host_mem_descs = NULL; + dev->nr_host_mem_descs = 0; +} + +static int __nvme_alloc_host_mem(struct 
nvme_dev *dev, u64 preferred, + u32 chunk_size) +{ + struct nvme_host_mem_buf_desc *descs; + u32 max_entries, len; + dma_addr_t descs_dma; + int i = 0; + void **bufs; + u64 size, tmp; + + tmp = (preferred + chunk_size - 1); + do_div(tmp, chunk_size); + max_entries = tmp; + + if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) + max_entries = dev->ctrl.hmmaxd; + + descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs), + &descs_dma, GFP_KERNEL); + if (!descs) + goto out; + + bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); + if (!bufs) + goto out_free_descs; + + for (size = 0; size < preferred && i < max_entries; size += len) { + dma_addr_t dma_addr; + + len = min_t(u64, chunk_size, preferred - size); + bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); + if (!bufs[i]) + break; + + descs[i].addr = cpu_to_le64(dma_addr); + descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE); + i++; + } + + if (!size) + goto out_free_bufs; + + dev->nr_host_mem_descs = i; + dev->host_mem_size = size; + dev->host_mem_descs = descs; + dev->host_mem_descs_dma = descs_dma; + dev->host_mem_desc_bufs = bufs; + return 0; + +out_free_bufs: + while (--i >= 0) { + size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; + + dma_free_attrs(dev->dev, size, bufs[i], + le64_to_cpu(descs[i].addr), + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); + } + + kfree(bufs); +out_free_descs: + dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs, + descs_dma); +out: + dev->host_mem_descs = NULL; + return -ENOMEM; +} + +static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) +{ + u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); + u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); + u64 chunk_size; + + /* start big and work our way down */ + for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { + if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { + if (!min || dev->host_mem_size >= min) + return 0; + nvme_free_host_mem(dev); + } + } + + return -ENOMEM; +} + +static int nvme_setup_host_mem(struct nvme_dev *dev) +{ + u64 max = (u64)max_host_mem_size_mb * SZ_1M; + u64 preferred = (u64)dev->ctrl.hmpre * 4096; + u64 min = (u64)dev->ctrl.hmmin * 4096; + u32 enable_bits = NVME_HOST_MEM_ENABLE; + int ret; + + preferred = min(preferred, max); + if (min > max) { + dev_warn(dev->ctrl.device, + "min host memory (%lld MiB) above limit (%d MiB).\n", + min >> ilog2(SZ_1M), max_host_mem_size_mb); + nvme_free_host_mem(dev); + return 0; + } + + /* + * If we already have a buffer allocated check if we can reuse it. 
+ */ + if (dev->host_mem_descs) { + if (dev->host_mem_size >= min) + enable_bits |= NVME_HOST_MEM_RETURN; + else + nvme_free_host_mem(dev); + } + + if (!dev->host_mem_descs) { + if (nvme_alloc_host_mem(dev, min, preferred)) { + dev_warn(dev->ctrl.device, + "failed to allocate host memory buffer.\n"); + return 0; /* controller must work without HMB */ + } + + dev_info(dev->ctrl.device, + "allocated %lld MiB host memory buffer.\n", + dev->host_mem_size >> ilog2(SZ_1M)); + } + + ret = nvme_set_host_mem(dev, enable_bits); + if (ret) + nvme_free_host_mem(dev); + return ret; +} + +static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + ndev->cmbloc, ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmb); + +static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbloc); +} +static DEVICE_ATTR_RO(cmbloc); + +static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmbsz); + +static ssize_t hmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%d\n", ndev->hmb); +} + +static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + bool new; + int ret; + + if (strtobool(buf, &new) < 0) + return -EINVAL; + + if (new == ndev->hmb) + return count; + + if (new) { + ret = nvme_setup_host_mem(ndev); + } else { + ret = nvme_set_host_mem(ndev, 0); + if (!ret) + nvme_free_host_mem(ndev); + } + + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR_RW(hmb); + +static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct nvme_ctrl *ctrl = + dev_get_drvdata(container_of(kobj, struct device, kobj)); + struct nvme_dev *dev = to_nvme_dev(ctrl); + + if (a == &dev_attr_cmb.attr || + a == &dev_attr_cmbloc.attr || + a == &dev_attr_cmbsz.attr) { + if (!dev->cmbsz) + return 0; + } + if (a == &dev_attr_hmb.attr && !ctrl->hmpre) + return 0; + + return a->mode; +} + +static struct attribute *nvme_pci_attrs[] = { + &dev_attr_cmb.attr, + &dev_attr_cmbloc.attr, + &dev_attr_cmbsz.attr, + &dev_attr_hmb.attr, + NULL, +}; + +static const struct attribute_group nvme_pci_attr_group = { + .attrs = nvme_pci_attrs, + .is_visible = nvme_pci_attrs_are_visible, +}; + +/* + * nirqs is the number of interrupts available for write and read + * queues. The core already reserved an interrupt for the admin queue. + */ +static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) +{ + struct nvme_dev *dev = affd->priv; + unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues; + + /* + * If there is no interrupt available for queues, ensure that + * the default queue is set to 1. The affinity set size is + * also set to one, but the irq core ignores it for this case. + * + * If only one interrupt is available or 'write_queue' == 0, combine + * write and read queues. + * + * If 'write_queues' > 0, ensure it leaves room for at least one read + * queue. 
*/ + if (!nrirqs) { + nrirqs = 1; + nr_read_queues = 0; + } else if (nrirqs == 1 || !nr_write_queues) { + nr_read_queues = 0; + } else if (nr_write_queues >= nrirqs) { + nr_read_queues = 1; + } else { + nr_read_queues = nrirqs - nr_write_queues; + } + + dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; + affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; + dev->io_queues[HCTX_TYPE_READ] = nr_read_queues; + affd->set_size[HCTX_TYPE_READ] = nr_read_queues; + affd->nr_sets = nr_read_queues ? 2 : 1; +} + +static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + struct irq_affinity affd = { + .pre_vectors = 1, + .calc_sets = nvme_calc_irq_sets, + .priv = dev, + }; + unsigned int irq_queues, poll_queues; + + /* + * Poll queues don't need interrupts, but we need at least one I/O queue + * left over for non-polled I/O. + */ + poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1); + dev->io_queues[HCTX_TYPE_POLL] = poll_queues; + + /* + * Initialize for the single interrupt case, will be updated in + * nvme_calc_irq_sets(). + */ + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; + dev->io_queues[HCTX_TYPE_READ] = 0; + + /* + * We need interrupts for the admin queue and each non-polled I/O queue, + * but some Apple controllers require all queues to use the first + * vector. + */ + irq_queues = 1; + if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)) + irq_queues += (nr_io_queues - poll_queues - dev->num_p2p_queues); + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); +} + +static void nvme_disable_io_queues(struct nvme_dev *dev) +{ + if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq)) + __nvme_disable_io_queues(dev, nvme_admin_delete_cq); +} + +static unsigned int nvme_max_io_queues(struct nvme_dev *dev) +{ + /* + * If tags are shared with admin queue (Apple bug), then + * make sure we only use one IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + return 1; + return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; +} + +static int nvme_setup_io_queues(struct nvme_dev *dev) +{ + struct nvme_queue *adminq = &dev->queues[0]; + struct pci_dev *pdev = to_pci_dev(dev->dev); + unsigned int nr_io_queues; + unsigned long size; + int result; + + /* + * Sample the module parameters once at reset time so that we have + * stable values to work with. + */ + dev->nr_write_queues = write_queues; + dev->nr_poll_queues = poll_queues; + + nr_io_queues = dev->nr_allocated_queues - 1 + dev->num_p2p_queues; + + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) + return result; + + if (nr_io_queues == 0) + return 0; + + /* + * Free IRQ resources as soon as the NVMEQ_ENABLED bit transitions + * from set to unset. If there is a window where it is truly freed, + * pci_free_irq_vectors() jumping into that window will crash. + * Take the lock to avoid racing with pci_free_irq_vectors() in the + * nvme_dev_disable() path.
*/ + result = nvme_setup_io_queues_trylock(dev); + if (result) + return result; + if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) + pci_free_irq(pdev, 0, adminq); + + if (dev->cmb_use_sqes) { + result = nvme_cmb_qdepth(dev, nr_io_queues, + sizeof(struct nvme_command)); + if (result > 0) + dev->q_depth = result; + else + dev->cmb_use_sqes = false; + } + + if (dev->num_p2p_queues) { + if (nr_io_queues <= dev->num_p2p_queues) { + if (nr_io_queues > 1) + dev->num_p2p_queues = nr_io_queues - 1; /* dedicate only one I/O queue for non-P2P use */ + else + dev->num_p2p_queues = 0; /* dedicate the only I/O queue for non-P2P use */ + } + } + + do { + size = db_bar_size(dev, nr_io_queues); + result = nvme_remap_bar(dev, size); + if (!result) + break; + if (dev->num_p2p_queues) + dev->num_p2p_queues--; + if (!--nr_io_queues) { + result = -ENOMEM; + goto out_unlock; + } + } while (1); + adminq->q_db = dev->dbs; + + retry: + /* Deregister the admin queue's interrupt */ + if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags)) + pci_free_irq(pdev, 0, adminq); + + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. + */ + pci_free_irq_vectors(pdev); + + result = nvme_setup_irqs(dev, nr_io_queues); + if (result <= 0) { + result = -EIO; + goto out_unlock; + } + + dev->num_vecs = result; + result = max(result - 1 + dev->num_p2p_queues, 1u); + dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; + + /* + * Should investigate if there's a performance win from allocating + * more queues than interrupt vectors; it might allow the submission + * path to scale better, even if the receive path is limited by the + * number of interrupts. + */ + result = queue_request_irq(adminq); + if (result) + goto out_unlock; + set_bit(NVMEQ_ENABLED, &adminq->flags); + mutex_unlock(&dev->shutdown_lock); + + result = nvme_create_io_queues(dev); + if (result || dev->online_queues < 2) + return result; + + if (dev->online_queues - 1 < dev->max_qid) { + nr_io_queues = dev->online_queues - 1; + nvme_disable_io_queues(dev); + result = nvme_setup_io_queues_trylock(dev); + if (result) + return result; + nvme_suspend_io_queues(dev); + goto retry; + } + dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev->io_queues[HCTX_TYPE_DEFAULT], + dev->io_queues[HCTX_TYPE_READ], + dev->io_queues[HCTX_TYPE_POLL]); + return 0; +out_unlock: + mutex_unlock(&dev->shutdown_lock); + return result; +} + +static void nvme_del_queue_end(struct request *req, blk_status_t error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + blk_mq_free_request(req); + complete(&nvmeq->delete_done); +} + +static void nvme_del_cq_end(struct request *req, blk_status_t error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + if (error) + set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); + + nvme_del_queue_end(req, error); +} + +static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) +{ + struct request_queue *q = nvmeq->dev->ctrl.admin_q; + struct request *req; + struct nvme_command cmd = { }; + + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); + + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->end_io_data = nvmeq; + + init_completion(&nvmeq->delete_done); + blk_execute_rq_nowait(req, false, opcode == nvme_admin_delete_cq ?
+ nvme_del_cq_end : nvme_del_queue_end); + return 0; +} + +static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) +{ + int nr_queues = dev->online_queues - 1, sent = 0; + unsigned long timeout; + + retry: + timeout = NVME_ADMIN_TIMEOUT; + while (nr_queues > 0) { + if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) + break; + nr_queues--; + sent++; + } + while (sent) { + struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent]; + + timeout = wait_for_completion_io_timeout(&nvmeq->delete_done, + timeout); + if (timeout == 0) + return false; + + sent--; + if (nr_queues) + goto retry; + } + return true; +} + +static void nvme_dev_add(struct nvme_dev *dev) +{ + int ret; + unsigned nr_hw_queues = dev->online_queues - 1 - dev->num_p2p_queues; + + if (!dev->ctrl.tagset) { + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = nr_hw_queues; + dev->tagset.nr_maps = 2; /* default + read */ + if (dev->io_queues[HCTX_TYPE_POLL]) + dev->tagset.nr_maps++; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev->ctrl.numa_node; + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, + BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = sizeof(struct nvme_iod); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + /* + * Some Apple controllers requires tags to be unique + * across admin and IO queue, so reserve the first 32 + * tags of the IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + dev->tagset.reserved_tags = NVME_AQ_DEPTH; + + ret = blk_mq_alloc_tag_set(&dev->tagset); + if (ret) { + dev_warn(dev->ctrl.device, + "IO queues tagset allocation failed %d\n", ret); + return; + } + dev->ctrl.tagset = &dev->tagset; + } else { + blk_mq_update_nr_hw_queues(&dev->tagset, nr_hw_queues); + + /* Free previously allocated queues that are no longer usable */ + nvme_free_queues(dev, dev->online_queues); + } + + nvme_dbbuf_set(dev); +} + +static int nvme_pci_enable(struct nvme_dev *dev) +{ + int result = -ENOMEM; + struct pci_dev *pdev = to_pci_dev(dev->dev); + int dma_address_bits = 64; + + if (pci_enable_device_mem(pdev)) + return result; + + pci_set_master(pdev); + + if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48) + dma_address_bits = 48; + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits))) + goto disable; + + if (readl(dev->bar + NVME_REG_CSTS) == -1) { + result = -ENODEV; + goto disable; + } + + /* + * Some devices and/or platforms don't advertise or work with INTx + * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll + * adjust this later. + */ + result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (result < 0) + return result; + + dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + + dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, + io_queue_depth); + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ + dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); + dev->dbs = dev->bar + 4096; + + /* + * Some Apple controllers require a non-standard SQE size. + * Interestingly they also seem to ignore the CC:IOSQES register + * so we don't bother updating it here. + */ + if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) + dev->io_sqes = 7; + else + dev->io_sqes = NVME_NVM_IOSQES; + + /* + * Temporary fix for the Apple controller found in the MacBook8,1 and + * some MacBook7,1 to avoid controller resets and data loss. 
+ */ + if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { + dev->q_depth = 2; + dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " + "set queue depth=%u to work around controller resets\n", + dev->q_depth); + } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && + (pdev->device == 0xa821 || pdev->device == 0xa822) && + NVME_CAP_MQES(dev->ctrl.cap) == 0) { + dev->q_depth = 64; + dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " + "set queue depth=%u\n", dev->q_depth); + } + + /* + * Controllers with the shared tags quirk need the IO queue to be + * big enough so that we get 32 tags for the admin queue + */ + if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && + (dev->q_depth < (NVME_AQ_DEPTH + 2))) { + dev->q_depth = NVME_AQ_DEPTH + 2; + dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", + dev->q_depth); + } + + + nvme_map_cmb(dev); + + pci_enable_pcie_error_reporting(pdev); + pci_save_state(pdev); + return 0; + + disable: + pci_disable_device(pdev); + return result; +} + +static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->bar) + iounmap(dev->bar); + pci_release_mem_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + pci_free_irq_vectors(pdev); + + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } +} + +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) +{ + bool dead = true, freeze = false; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + mutex_lock(&dev->shutdown_lock); + if (pci_is_enabled(pdev)) { + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + if (dev->ctrl.state == NVME_CTRL_LIVE || + dev->ctrl.state == NVME_CTRL_RESETTING) { + freeze = true; + nvme_start_freeze(&dev->ctrl); + } + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || + pdev->error_state != pci_channel_io_normal); + } + + /* + * Give the controller a chance to complete all entered requests if + * doing a safe shutdown. + */ + if (!dead && shutdown && freeze) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + + nvme_stop_queues(&dev->ctrl); + + if (!dead && dev->ctrl.queue_count > 0) { + nvme_disable_io_queues(dev); + nvme_disable_admin_queue(dev, shutdown); + } + nvme_suspend_io_queues(dev); + nvme_suspend_queue(&dev->queues[0]); + nvme_pci_disable(dev); + nvme_reap_pending_cqes(dev); + + blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_wait_completed_request(&dev->tagset); + blk_mq_tagset_wait_completed_request(&dev->admin_tagset); + + /* + * The driver will not be starting up queues again if shutting down so + * must flush all entered requests to their failed completion to avoid + * deadlocking blk-mq hot-cpu notifier. 
+ */ + if (shutdown) { + nvme_start_queues(&dev->ctrl); + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) + nvme_start_admin_queue(&dev->ctrl); + } + mutex_unlock(&dev->shutdown_lock); +} + +static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) +{ + if (!nvme_wait_reset(&dev->ctrl)) + return -EBUSY; + nvme_dev_disable(dev, shutdown); + return 0; +} + +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); +} + +static void nvme_free_tagset(struct nvme_dev *dev) +{ + if (dev->tagset.tags) + blk_mq_free_tag_set(&dev->tagset); + dev->ctrl.tagset = NULL; +} + +static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + + nvme_dbbuf_dma_free(dev); + nvme_free_tagset(dev); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); + free_opal_dev(dev->ctrl.opal_dev); + mempool_destroy(dev->iod_mempool); + put_device(dev->dev); + kfree(dev->queues); + kfree(dev); +} + +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) +{ + /* + * Set state to deleting now to avoid blocking nvme_wait_reset(), which + * may be holding this pci_dev's device lock. + */ + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_get_ctrl(&dev->ctrl); + nvme_dev_disable(dev, false); + nvme_kill_queues(&dev->ctrl); + if (!queue_work(nvme_wq, &dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); +} + +static void nvme_reset_work(struct work_struct *work) +{ + struct nvme_dev *dev = + container_of(work, struct nvme_dev, ctrl.reset_work); + bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); + int result; + + if (dev->ctrl.state != NVME_CTRL_RESETTING) { + dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", + dev->ctrl.state); + result = -ENODEV; + goto out; + } + + /* + * If we're called to reset a live controller first shut it down before + * moving on. + */ + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) + nvme_dev_disable(dev, false); + nvme_sync_queues(&dev->ctrl); + + mutex_lock(&dev->shutdown_lock); + result = nvme_pci_enable(dev); + if (result) + goto out_unlock; + + result = nvme_pci_configure_admin_queue(dev); + if (result) + goto out_unlock; + + result = nvme_alloc_admin_tags(dev); + if (result) + goto out_unlock; + + /* + * Limit the max command size to prevent iod->sg allocations going + * over a single page. + */ + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); + dev->ctrl.max_segments = NVME_MAX_SEGS; + + /* + * Don't limit the IOMMU merged segment size. + */ + dma_set_max_seg_size(dev->dev, 0xffffffff); + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); + + mutex_unlock(&dev->shutdown_lock); + + /* + * Introduce CONNECTING state from nvme-fc/rdma transports to mark the + * initializing procedure here. 
+ */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller CONNECTING\n"); + result = -EBUSY; + goto out; + } + + /* + * We do not support an SGL for metadata (yet), so we are limited to a + * single integrity segment for the separate metadata pointer. + */ + dev->ctrl.max_integrity_segments = 1; + + result = nvme_init_ctrl_finish(&dev->ctrl); + if (result) + goto out; + + if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!dev->ctrl.opal_dev) + dev->ctrl.opal_dev = + init_opal_dev(&dev->ctrl, &nvme_sec_submit); + else if (was_suspend) + opal_unlock_from_suspend(dev->ctrl.opal_dev); + } else { + free_opal_dev(dev->ctrl.opal_dev); + dev->ctrl.opal_dev = NULL; + } + + if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { + result = nvme_dbbuf_dma_alloc(dev); + if (result) + dev_warn(dev->dev, + "unable to allocate dma for dbbuf\n"); + } + + if (dev->ctrl.hmpre) { + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out; + } + + result = nvme_setup_io_queues(dev); + if (result) + goto out; + + /* + * Keep the controller around but remove all namespaces if we don't have + * any working I/O queue. + */ + if (dev->online_queues < 2) { + dev_warn(dev->ctrl.device, "IO queues not created\n"); + nvme_kill_queues(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + nvme_free_tagset(dev); + } else { + nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); + nvme_dev_add(dev); + nvme_unfreeze(&dev->ctrl); + } + + /* + * If only admin queue live, keep it to do further investigation or + * recovery. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, + "failed to mark controller live state\n"); + result = -ENODEV; + goto out; + } + + if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group)) + dev->attrs_added = true; + + nvme_start_ctrl(&dev->ctrl); + return; + + out_unlock: + mutex_unlock(&dev->shutdown_lock); + out: + if (result) + dev_warn(dev->ctrl.device, + "Removing after probe failure status: %d\n", result); + nvme_remove_dead_ctrl(dev); +} + +static void nvme_remove_dead_ctrl_work(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pci_get_drvdata(pdev)) + device_release_driver(&pdev->dev); + nvme_put_ctrl(&dev->ctrl); +} + +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + *val = readl(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev); + + return snprintf(buf, size, "%s\n", dev_name(&pdev->dev)); +} + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .name = "pcie", + .module = THIS_MODULE, + .flags = NVME_F_METADATA_SUPPORTED | + NVME_F_PCI_P2PDMA, + .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, + .free_ctrl = nvme_pci_free_ctrl, + .submit_async_event = nvme_pci_submit_async_event, + .get_address = nvme_pci_get_address, +}; + +struct pci_dev *nvme_find_pdev_from_bdev(struct block_device *bdev) +{ + struct 
nvme_ns *ns = disk_to_nvme_ns(bdev->bd_disk); + + if (!ns || ns->ctrl->ops != &nvme_pci_ctrl_ops) + return NULL; + + return to_pci_dev(to_nvme_dev(ns->ctrl)->dev); +} +EXPORT_SYMBOL_GPL(nvme_find_pdev_from_bdev); + +unsigned nvme_find_ns_id_from_bdev(struct block_device *bdev) +{ + struct nvme_ns *ns = disk_to_nvme_ns(bdev->bd_disk); + + if (!ns) + return 0; + + return ns->head->ns_id; +} +EXPORT_SYMBOL_GPL(nvme_find_ns_id_from_bdev); + +static int nvme_dev_map(struct nvme_dev *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pci_request_mem_regions(pdev, "nvme")) + return -ENODEV; + + if (nvme_remap_bar(dev, NVME_REG_DBS + 4096)) + goto release; + + return 0; + release: + pci_release_mem_regions(pdev); + return -ENODEV; +} + +static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) +{ + if (pdev->vendor == 0x144d && pdev->device == 0xa802) { + /* + * Several Samsung devices seem to drop off the PCIe bus + * randomly when APST is on and uses the deepest sleep state. + * This has been observed on a Samsung "SM951 NVMe SAMSUNG + * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD + * 950 PRO 256GB", but it seems to be restricted to two Dell + * laptops. + */ + if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") && + (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") || + dmi_match(DMI_PRODUCT_NAME, "Precision 5510"))) + return NVME_QUIRK_NO_DEEPEST_PS; + } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) { + /* + * Samsung SSD 960 EVO drops off the PCIe bus after system + * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as + * within few minutes after bootup on a Coffee Lake board - + * ASUS PRIME Z370-A + */ + if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") && + (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") || + dmi_match(DMI_BOARD_NAME, "PRIME Z370-A"))) + return NVME_QUIRK_NO_APST; + } else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 || + pdev->device == 0xa808 || pdev->device == 0xa809)) || + (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) { + /* + * Forcing to use host managed nvme power settings for + * lowest idle power with quick resume latency on + * Samsung and Toshiba SSDs based on suspend behavior + * on Coffee Lake board for LENOVO C640 + */ + if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) && + dmi_match(DMI_BOARD_NAME, "LNVNB161216")) + return NVME_QUIRK_SIMPLE_SUSPEND; + } + + return 0; +} + +static void nvme_async_probe(void *data, async_cookie_t cookie) +{ + struct nvme_dev *dev = data; + + flush_work(&dev->ctrl.reset_work); + flush_work(&dev->ctrl.scan_work); + nvme_put_ctrl(&dev->ctrl); +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int node, result = -ENOMEM; + struct nvme_dev *dev; + unsigned long quirks = id->driver_data; + size_t alloc_size; + + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, first_memory_node); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + if (!dev) + return -ENOMEM; + + dev->nr_write_queues = write_queues; + dev->nr_poll_queues = poll_queues; + dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1; + dev->queues = kcalloc_node(dev->nr_allocated_queues + num_p2p_queues, + sizeof(struct nvme_queue), GFP_KERNEL, node); + if (!dev->queues) + goto free; + + dev->num_p2p_queues = num_p2p_queues; + dev->dev = get_device(&pdev->dev); + pci_set_drvdata(pdev, dev); + + result = nvme_dev_map(dev); + if (result) + goto put_pci; + + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, 
nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); + + result = nvme_setup_prp_pools(dev); + if (result) + goto unmap; + + quirks |= check_vendor_combination_bug(pdev); + + if (!noacpi && acpi_storage_d3(&pdev->dev)) { + /* + * Some systems use a bios work around to ask for D3 on + * platforms that support kernel managed suspend. + */ + dev_info(&pdev->dev, + "platform quirk: setting simple suspend\n"); + quirks |= NVME_QUIRK_SIMPLE_SUSPEND; + } + + /* + * Double check that our mempool alloc size will cover the biggest + * command we support. + */ + alloc_size = nvme_pci_iod_alloc_size(); + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + + dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, + mempool_kfree, + (void *) alloc_size, + GFP_KERNEL, node); + if (!dev->iod_mempool) { + result = -ENOMEM; + goto release_pools; + } + + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + quirks); + if (result) + goto release_mempool; + + /* + * We populate sysfs to report how many P2P queues can be created in total. + * Note that we add the P2P attribute to the nvme_ctrl kobj, which removes + * the need to remove it on exit. Since nvme_dev_attrs_group has no name, we + * can pass NULL as the final argument to sysfs_add_file_to_group(). + */ + if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, + &dev_attr_num_p2p_queues.attr, NULL)) + dev_warn(dev->ctrl.device, + "failed to add sysfs attribute for num P2P queues\n"); + + dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + + nvme_reset_ctrl(&dev->ctrl); + async_schedule(nvme_async_probe, dev); + + return 0; + + release_mempool: + mempool_destroy(dev->iod_mempool); + release_pools: + nvme_release_prp_pools(dev); + unmap: + nvme_dev_unmap(dev); + put_pci: + put_device(dev->dev); + free: + kfree(dev->queues); + kfree(dev); + return result; +} + +static void nvme_reset_prepare(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * We don't need to check the return value from waiting for the reset + * state as pci_dev device lock is held, making it impossible to race + * with ->remove(). + */ + nvme_disable_prepare_reset(dev, false); + nvme_sync_queues(&dev->ctrl); +} + +static void nvme_reset_done(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + if (!nvme_try_sched_reset(&dev->ctrl)) + flush_work(&dev->ctrl.reset_work); +} + +static void nvme_shutdown(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + nvme_disable_prepare_reset(dev, true); +} + +static void nvme_remove_attrs(struct nvme_dev *dev) +{ + if (dev->attrs_added) + sysfs_remove_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group); +} + +/* + * The driver's remove may be called on a device in a partially initialized + * state. This function must not have any dependencies on the device state in + * order to proceed.
+ */ +static void nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + pci_set_drvdata(pdev, NULL); + + if (!pci_device_is_present(pdev)) { + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + nvme_dev_disable(dev, true); + } + + flush_work(&dev->ctrl.reset_work); + nvme_stop_ctrl(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + nvme_dev_disable(dev, true); + nvme_remove_attrs(dev); + nvme_free_host_mem(dev); + nvme_dev_remove_admin(dev); + nvme_free_queues(dev, 0); + nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); + nvme_uninit_ctrl(&dev->ctrl); +} + +#ifdef CONFIG_PM_SLEEP +static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps) +{ + return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps); +} + +static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps) +{ + return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL); +} + +static int nvme_resume(struct device *dev) +{ + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + struct nvme_ctrl *ctrl = &ndev->ctrl; + + if (ndev->last_ps == U32_MAX || + nvme_set_power_state(ctrl, ndev->last_ps) != 0) + goto reset; + if (ctrl->hmpre && nvme_setup_host_mem(ndev)) + goto reset; + + return 0; +reset: + return nvme_try_sched_reset(ctrl); +} + +static int nvme_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + struct nvme_ctrl *ctrl = &ndev->ctrl; + int ret = -EBUSY; + + ndev->last_ps = U32_MAX; + + /* + * The platform does not remove power for a kernel managed suspend so + * use host managed nvme power settings for lowest idle power if + * possible. This should have quicker resume latency than a full device + * shutdown. But if the firmware is involved after the suspend or the + * device does not support any non-default power states, shut down the + * device fully. + * + * If ASPM is not enabled for the device, shut down the device and allow + * the PCI bus layer to put it into D3 in order to take the PCIe link + * down, so as to allow the platform to achieve its minimum low-power + * state (which may not be possible if the link is up). + */ + if (pm_suspend_via_firmware() || !ctrl->npss || + !pcie_aspm_enabled(pdev) || + (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) + return nvme_disable_prepare_reset(ndev, true); + + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + nvme_sync_queues(ctrl); + + if (ctrl->state != NVME_CTRL_LIVE) + goto unfreeze; + + /* + * Host memory access may not be successful in a system suspend state, + * but the specification allows the controller to access memory in a + * non-operational power state. + */ + if (ndev->hmb) { + ret = nvme_set_host_mem(ndev, 0); + if (ret < 0) + goto unfreeze; + } + + ret = nvme_get_power_state(ctrl, &ndev->last_ps); + if (ret < 0) + goto unfreeze; + + /* + * A saved state prevents pci pm from generically controlling the + * device's power. If we're using protocol specific settings, we don't + * want pci interfering. + */ + pci_save_state(pdev); + + ret = nvme_set_power_state(ctrl, ctrl->npss); + if (ret < 0) + goto unfreeze; + + if (ret) { + /* discard the saved state */ + pci_load_saved_state(pdev, NULL); + + /* + * Clearing npss forces a controller reset on resume. The + * correct value will be rediscovered then. 
+ */ + ret = nvme_disable_prepare_reset(ndev, true); + ctrl->npss = 0; + } +unfreeze: + nvme_unfreeze(ctrl); + return ret; +} + +static int nvme_simple_suspend(struct device *dev) +{ + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + + return nvme_disable_prepare_reset(ndev, true); +} + +static int nvme_simple_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + return nvme_try_sched_reset(&ndev->ctrl); +} + +static const struct dev_pm_ops nvme_dev_pm_ops = { + .suspend = nvme_suspend, + .resume = nvme_resume, + .freeze = nvme_simple_suspend, + .thaw = nvme_simple_resume, + .poweroff = nvme_simple_suspend, + .restore = nvme_simple_resume, +}; +#endif /* CONFIG_PM_SLEEP */ + +static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * A frozen channel requires a reset. When detected, this method will + * shutdown the controller to quiesce. The controller will be restarted + * after the slot reset through driver's slot_reset callback. + */ + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(dev->ctrl.device, + "frozen state error detected, reset controller\n"); + nvme_dev_disable(dev, false); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(dev->ctrl.device, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + dev_info(dev->ctrl.device, "restart after slot reset\n"); + pci_restore_state(pdev); + nvme_reset_ctrl(&dev->ctrl); + return PCI_ERS_RESULT_RECOVERED; +} + +static void nvme_error_resume(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + flush_work(&dev->ctrl.reset_work); +} + +static const struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, + .reset_prepare = nvme_reset_prepare, + .reset_done = nvme_reset_done, +}; + +static const struct pci_device_id nvme_id_table[] = { + { PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */ + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */ + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DEALLOCATE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_MEDIUM_PRIO_SQ | + NVME_QUIRK_NO_TEMP_THRESH_CHANGE | + NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_DISABLE_WRITE_ZEROES | + NVME_QUIRK_BOGUS_NID, }, + { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ + 
.driver_data = NVME_QUIRK_NO_NS_DESC_LIST | + NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | + NVME_QUIRK_NO_NS_DESC_LIST, }, + { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | + NVME_QUIRK_DISABLE_WRITE_ZEROES| + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x1987, 0x5012), /* Phison E12 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ + .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x1cc1, 0x33f8), /* ADATA IM2P33F8ABR1 1 TB */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN | + NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, + { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x1c5c, 0x174a), /* SK Hynix P31 SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x1d97, 0x2263), /* SPCC */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x144d, 0xa80b), /* Samsung PM9B1 256G and 512G */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x144d, 0xa809), /* Samsung MZALQ256HBJD 256G */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x1cc4, 0x6303), /* UMIS RPJTJ512MGE1QDY 512G */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x1cc4, 0x6302), /* UMIS RPJTJ256MGE1QDY 256G */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x2262), /* KINGSTON SKC2000 NVMe SSD */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x1e4B, 0x1002), /* MAXIO MAP1002 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1e4B, 0x1202), /* MAXIO MAP1202 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 
0xcd02), + .driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), + .driver_data = NVME_QUIRK_SINGLE_VECTOR }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_128_BYTES_SQES | + NVME_QUIRK_SHARED_TAGS | + NVME_QUIRK_SKIP_CID_GEN }, + + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = nvme_remove, + .shutdown = nvme_shutdown, +#ifdef CONFIG_PM_SLEEP + .driver = { + .pm = &nvme_dev_pm_ops, + }, +#endif + .sriov_configure = pci_sriov_configure_simple, + .err_handler = &nvme_err_handler, +}; + +#ifdef CONFIG_COMPAT_NVME_SNAP_VFIO_PCI +int nvme_pdev_admin_passthru_sync(struct pci_dev *pdev, + struct nvme_command *cmd, void *buffer, + unsigned int bufflen, unsigned int timeout_ms) +{ + struct nvme_dev *dev; + int ret; + + if (pdev->driver != &nvme_driver) + return -EINVAL; + + dev = pci_get_drvdata(pdev); + if (!dev) + return -EINVAL; + + nvme_get_ctrl(&dev->ctrl); + ret = nvme_admin_passthru_sync(&dev->ctrl, cmd, buffer, bufflen, + timeout_ms); + nvme_put_ctrl(&dev->ctrl); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_pdev_admin_passthru_sync); +#endif + +static int __init nvme_init(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); + + return pci_register_driver(&nvme_driver); +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + flush_workqueue(nvme_wq); +} + +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_init); +module_exit(nvme_exit); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/qla2xxx_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/qla2xxx_dummy.c new file mode 100644 index 0000000..4768bcf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/qla2xxx_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "qla2xxx" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "February 09, 2020" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("qla2xxx dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init qla2xxx_init(void) +{ + return 0; +} + +static void __exit qla2xxx_cleanup(void) +{ +} + +module_init(qla2xxx_init); +module_exit(qla2xxx_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/rdma.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/rdma.c new file mode 100644 index 0000000..99c329c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/rdma.c @@ -0,0 +1,2581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics RDMA host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + +#ifdef CONFIG_NVFS +#include "nvfs.h" +#endif + +#define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */ + +#define NVME_RDMA_MAX_SEGMENTS 256 + +#define NVME_RDMA_MAX_INLINE_SEGMENTS 4 + +#define NVME_RDMA_DATA_SGL_SIZE \ + (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT) +#define NVME_RDMA_METADATA_SGL_SIZE \ + (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT) + +struct nvme_rdma_device { + struct ib_device *dev; + struct ib_pd *pd; + struct kref ref; + struct list_head entry; + unsigned int num_inline_segments; +}; + +struct nvme_rdma_qe { + struct ib_cqe cqe; + void *data; + u64 dma; +}; + +struct nvme_rdma_sgl { + int nents; + struct sg_table sg_table; +}; + +struct nvme_rdma_queue; +struct nvme_rdma_request { + struct nvme_request req; + struct ib_mr *mr; + struct nvme_rdma_qe sqe; + union nvme_result result; + __le16 status; + refcount_t ref; + struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; + u32 num_sge; + struct ib_reg_wr reg_wr; + struct ib_cqe reg_cqe; + struct nvme_rdma_queue *queue; + struct nvme_rdma_sgl data_sgl; + struct nvme_rdma_sgl *metadata_sgl; + bool use_sig_mr; +}; + +enum nvme_rdma_queue_flags { + NVME_RDMA_Q_ALLOCATED = 0, + NVME_RDMA_Q_LIVE = 1, + NVME_RDMA_Q_TR_READY = 2, +}; + +struct nvme_rdma_queue { + struct nvme_rdma_qe *rsp_ring; + int queue_size; + size_t cmnd_capsule_len; + struct nvme_rdma_ctrl *ctrl; + struct nvme_rdma_device *device; + struct ib_cq *ib_cq; + struct ib_qp *qp; + + unsigned long flags; + struct rdma_cm_id *cm_id; + int cm_error; + struct completion cm_done; + bool pi_support; + int cq_size; + struct mutex queue_lock; +}; + +struct nvme_rdma_ctrl { + /* read only in the hot path */ + struct nvme_rdma_queue *queues; + + /* other member variables */ + struct blk_mq_tag_set tag_set; + struct work_struct err_work; + + struct nvme_rdma_qe async_event_sqe; + + struct delayed_work reconnect_work; + + struct list_head list; + + struct blk_mq_tag_set admin_tag_set; + struct nvme_rdma_device *device; + + u32 max_fr_pages; + + 
struct sockaddr_storage addr; + struct sockaddr_storage src_addr; + + struct nvme_ctrl ctrl; + bool use_inline_data; + u32 io_queues[HCTX_MAX_TYPES]; +}; + +static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_rdma_ctrl, ctrl); +} + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static LIST_HEAD(nvme_rdma_ctrl_list); +static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); + +/* + * Disabling this option makes small I/O goes faster, but is fundamentally + * unsafe. With it turned off we will have to register a global rkey that + * allows read and write access to all physical memory. + */ +static bool register_always = true; +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvme_rdma_complete_rq(struct request *rq); + +static const struct blk_mq_ops nvme_rdma_mq_ops; +static const struct blk_mq_ops nvme_rdma_admin_mq_ops; + +static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue) +{ + return nvme_rdma_queue_idx(queue) > + queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] + + queue->ctrl->io_queues[HCTX_TYPE_READ]; +} + +static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir); + kfree(qe->data); +} + +static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + qe->data = kzalloc(capsule_size, GFP_KERNEL); + if (!qe->data) + return -ENOMEM; + + qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); + if (ib_dma_mapping_error(ibdev, qe->dma)) { + kfree(qe->data); + qe->data = NULL; + return -ENOMEM; + } + + return 0; +} + +static void nvme_rdma_free_ring(struct ib_device *ibdev, + struct nvme_rdma_qe *ring, size_t ib_queue_size, + size_t capsule_size, enum dma_data_direction dir) +{ + int i; + + for (i = 0; i < ib_queue_size; i++) + nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir); + kfree(ring); +} + +static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, + size_t ib_queue_size, size_t capsule_size, + enum dma_data_direction dir) +{ + struct nvme_rdma_qe *ring; + int i; + + ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL); + if (!ring) + return NULL; + + /* + * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue + * lifetime. It's safe, since any chage in the underlying RDMA device + * will issue error recovery and queue re-creation. 
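nvme_rdma_alloc_ring() above pairs each element allocation with a partial unwind: if element i fails to allocate, only the i elements already set up are released via the out_free_ring label just below. A minimal userspace sketch of the same idiom, with plain calloc() standing in for the QE allocation and DMA mapping (the demo_* names are illustrative, not part of the patch):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for one receive-queue element. */
struct demo_qe {
        void *data;
};

static void demo_free_ring(struct demo_qe *ring, size_t n)
{
        size_t i;

        if (!ring)
                return;
        for (i = 0; i < n; i++)
                free(ring[i].data);
        free(ring);
}

static struct demo_qe *demo_alloc_ring(size_t n, size_t capsule_size)
{
        struct demo_qe *ring = calloc(n, sizeof(*ring));
        size_t i;

        if (!ring)
                return NULL;

        for (i = 0; i < n; i++) {
                ring[i].data = calloc(1, capsule_size);
                if (!ring[i].data) {
                        /* Unwind only the elements that were allocated. */
                        demo_free_ring(ring, i);
                        return NULL;
                }
        }
        return ring;
}

int main(void)
{
        struct demo_qe *ring = demo_alloc_ring(128, 16);

        printf("ring %s\n", ring ? "allocated" : "failed");
        demo_free_ring(ring, 128);
        return 0;
}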
+ */ + for (i = 0; i < ib_queue_size; i++) { + if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) + goto out_free_ring; + } + + return ring; + +out_free_ring: + nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir); + return NULL; +} + +static void nvme_rdma_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %s (%d)\n", + ib_event_msg(event->event), event->event); + +} + +static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) +{ + int ret; + + ret = wait_for_completion_interruptible(&queue->cm_done); + if (ret) + return ret; + WARN_ON_ONCE(queue->cm_error > 0); + return queue->cm_error; +} + +static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) +{ + struct nvme_rdma_device *dev = queue->device; + struct ib_qp_init_attr init_attr; + int ret; + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.event_handler = nvme_rdma_qp_event; + /* +1 for drain */ + init_attr.cap.max_send_wr = factor * queue->queue_size + 1; + /* +1 for drain */ + init_attr.cap.max_recv_wr = queue->queue_size + 1; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1 + dev->num_inline_segments; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + init_attr.send_cq = queue->ib_cq; + init_attr.recv_cq = queue->ib_cq; + if (queue->pi_support) + init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + init_attr.qp_context = queue; + + ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); + + queue->qp = queue->cm_id->qp; + return ret; +} + +static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + kfree(req->sqe.data); +} + +static int nvme_rdma_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + + nvme_req(rq)->ctrl = &ctrl->ctrl; + req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL); + if (!req->sqe.data) + return -ENOMEM; + + /* metadata nvme_rdma_sgl struct is located after command's data SGL */ + if (queue->pi_support) + req->metadata_sgl = (void *)nvme_req(rq) + + sizeof(struct nvme_rdma_request) + + NVME_RDMA_DATA_SGL_SIZE; + + req->queue = queue; + nvme_req(rq)->cmd = req->sqe.data; + + return 0; +} + +static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static void nvme_rdma_free_dev(struct kref *ref) +{ + struct nvme_rdma_device *ndev = + container_of(ref, struct nvme_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + ib_dealloc_pd(ndev->pd); + kfree(ndev); +} + +static void nvme_rdma_dev_put(struct nvme_rdma_device *dev) +{ + kref_put(&dev->ref, nvme_rdma_free_dev); +} + +static int nvme_rdma_dev_get(struct nvme_rdma_device *dev) +{ + return kref_get_unless_zero(&dev->ref); +} + +static struct nvme_rdma_device * +nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvme_rdma_device *ndev; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->dev->node_guid == cm_id->device->node_guid && + nvme_rdma_dev_get(ndev)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->dev = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->dev, + register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (!(ndev->dev->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS)) { + dev_err(&ndev->dev->dev, + "Memory registrations not supported.\n"); + goto out_free_pd; + } + + ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS, + ndev->dev->attrs.max_send_sge - 1); + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue) +{ + if (nvme_rdma_poll_queue(queue)) + ib_free_cq(queue->ib_cq); + else + ib_cq_pool_put(queue->ib_cq, queue->cq_size); +} + +static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev; + struct ib_device *ibdev; + + if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags)) + return; + + dev = queue->device; + ibdev = dev->dev; + + if (queue->pi_support) + ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs); + ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); + + /* + * The cm_id object might have been destroyed during RDMA connection + * establishment error flow to avoid getting other cma events, thus + * the destruction of the QP shouldn't use rdma_cm API. 
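nvme_rdma_find_get_device() above is a look-up-or-create cache: entries are matched by node GUID under a mutex, reused only if their reference count can still be taken, and otherwise allocated and inserted. A compact userspace sketch of that pattern, simplified to an integer key and a plain counter in place of the kref (all demo_* names are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_dev {
        unsigned long long guid;        /* lookup key, like node_guid */
        int refs;                       /* stand-in for the kref      */
        struct demo_dev *next;
};

static struct demo_dev *dev_list;
static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* Take a reference only if the object is still live (refs > 0). */
static int demo_dev_get(struct demo_dev *d)
{
        if (d->refs <= 0)
                return 0;
        d->refs++;
        return 1;
}

static struct demo_dev *demo_find_get_dev(unsigned long long guid)
{
        struct demo_dev *d;

        pthread_mutex_lock(&dev_lock);
        for (d = dev_list; d; d = d->next) {
                if (d->guid == guid && demo_dev_get(d))
                        goto out;               /* reuse cached entry */
        }

        d = calloc(1, sizeof(*d));              /* not found: create it */
        if (d) {
                d->guid = guid;
                d->refs = 1;
                d->next = dev_list;
                dev_list = d;
        }
out:
        pthread_mutex_unlock(&dev_lock);
        return d;
}

int main(void)
{
        struct demo_dev *a = demo_find_get_dev(0x1234);
        struct demo_dev *b = demo_find_get_dev(0x1234);

        printf("same device reused: %s, refs=%d\n",
               a == b ? "yes" : "no", a ? a->refs : 0);
        return 0;
}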
+ */ + ib_destroy_qp(queue->qp); + nvme_rdma_free_cq(queue); + + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + + nvme_rdma_dev_put(dev); +} + +static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support) +{ + u32 max_page_list_len; + + if (pi_support) + max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len; + else + max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len; + + return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1); +} + +static int nvme_rdma_create_cq(struct ib_device *ibdev, + struct nvme_rdma_queue *queue) +{ + int ret, comp_vector, idx = nvme_rdma_queue_idx(queue); + enum ib_poll_context poll_ctx; + + /* + * Spread I/O queues completion vectors according their queue index. + * Admin queues can always go on completion vector 0. + */ + comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; + + /* Polling queues need direct cq polling context */ + if (nvme_rdma_poll_queue(queue)) { + poll_ctx = IB_POLL_DIRECT; + queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size, + comp_vector, poll_ctx); + } else { + poll_ctx = IB_POLL_SOFTIRQ; + queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size, + comp_vector, poll_ctx); + } + + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + return ret; + } + + return 0; +} + +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) +{ + struct ib_device *ibdev; + const int send_wr_factor = 3; /* MR, SEND, INV */ + const int cq_factor = send_wr_factor + 1; /* + RECV */ + int ret, pages_per_mr; + + queue->device = nvme_rdma_find_get_device(queue->cm_id); + if (!queue->device) { + dev_err(queue->cm_id->device->dev.parent, + "no client data found!\n"); + return -ECONNREFUSED; + } + ibdev = queue->device->dev; + + /* +1 for ib_stop_cq */ + queue->cq_size = cq_factor * queue->queue_size + 1; + + ret = nvme_rdma_create_cq(ibdev, queue); + if (ret) + goto out_put_dev; + + ret = nvme_rdma_create_qp(queue, send_wr_factor); + if (ret) + goto out_destroy_ib_cq; + + queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + if (!queue->rsp_ring) { + ret = -ENOMEM; + goto out_destroy_qp; + } + + /* + * Currently we don't use SG_GAPS MR's so if the first entry is + * misaligned we'll end up using two entries for a single data page, + * so one additional entry is required. 
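The completion-vector choice in nvme_rdma_create_cq() above is simple modulo spreading: the admin queue (index 0) lands on vector 0, and I/O queue index i maps to (i - 1) % num_comp_vectors, so the first I/O queue shares the admin queue's vector. A tiny standalone sketch that prints the resulting assignment (the vector count is an arbitrary example value):

#include <stdio.h>

/* Mirror of the spreading rule used in nvme_rdma_create_cq(). */
static int demo_comp_vector(int queue_idx, int num_comp_vectors)
{
        return (queue_idx == 0 ? 0 : queue_idx - 1) % num_comp_vectors;
}

int main(void)
{
        int num_comp_vectors = 4;       /* example device capability */
        int idx;

        for (idx = 0; idx <= 8; idx++)
                printf("queue %d -> completion vector %d\n",
                       idx, demo_comp_vector(idx, num_comp_vectors));
        return 0;
}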
+ */ + pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1; + ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, + queue->queue_size, + IB_MR_TYPE_MEM_REG, + pages_per_mr, 0); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "failed to initialize MR pool sized %d for QID %d\n", + queue->queue_size, nvme_rdma_queue_idx(queue)); + goto out_destroy_ring; + } + + if (queue->pi_support) { + ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs, + queue->queue_size, IB_MR_TYPE_INTEGRITY, + pages_per_mr, pages_per_mr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "failed to initialize PI MR pool sized %d for QID %d\n", + queue->queue_size, nvme_rdma_queue_idx(queue)); + goto out_destroy_mr_pool; + } + } + + set_bit(NVME_RDMA_Q_TR_READY, &queue->flags); + + return 0; + +out_destroy_mr_pool: + ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs); +out_destroy_ring: + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); +out_destroy_qp: + rdma_destroy_qp(queue->cm_id); +out_destroy_ib_cq: + nvme_rdma_free_cq(queue); +out_put_dev: + nvme_rdma_dev_put(queue->device); + return ret; +} + +static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, + int idx, size_t queue_size) +{ + struct nvme_rdma_queue *queue; + struct sockaddr *src_addr = NULL; + int ret; + + queue = &ctrl->queues[idx]; + mutex_init(&queue->queue_lock); + queue->ctrl = ctrl; + if (idx && ctrl->ctrl.max_integrity_segments) + queue->pi_support = true; + else + queue->pi_support = false; + init_completion(&queue->cm_done); + + if (idx > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command); + + queue->queue_size = queue_size; + + queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(queue->cm_id)) { + dev_info(ctrl->ctrl.device, + "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); + ret = PTR_ERR(queue->cm_id); + goto out_destroy_mutex; + } + + if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + src_addr = (struct sockaddr *)&ctrl->src_addr; + + queue->cm_error = -ETIMEDOUT; + ret = rdma_resolve_addr(queue->cm_id, src_addr, + (struct sockaddr *)&ctrl->addr, + NVME_RDMA_CM_TIMEOUT_MS); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + ret = nvme_rdma_wait_for_cm(queue); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma connection establishment failed (%d)\n", ret); + goto out_destroy_cm_id; + } + + set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); + + return 0; + +out_destroy_cm_id: + rdma_destroy_id(queue->cm_id); + nvme_rdma_destroy_queue_ib(queue); +out_destroy_mutex: + mutex_destroy(&queue->queue_lock); + return ret; +} + +static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + +static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + mutex_lock(&queue->queue_lock); + if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) + __nvme_rdma_stop_queue(queue); + mutex_unlock(&queue->queue_lock); +} + +static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) +{ + if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) + return; + + rdma_destroy_id(queue->cm_id); + nvme_rdma_destroy_queue_ib(queue); + mutex_destroy(&queue->queue_lock); +} + +static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < 
ctrl->ctrl.queue_count; i++) + nvme_rdma_free_queue(&ctrl->queues[i]); +} + +static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) + nvme_rdma_stop_queue(&ctrl->queues[i]); +} + +static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) +{ + struct nvme_rdma_queue *queue = &ctrl->queues[idx]; + int ret; + + if (idx) + ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); + else + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + + if (!ret) { + set_bit(NVME_RDMA_Q_LIVE, &queue->flags); + } else { + if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) + __nvme_rdma_stop_queue(queue); + dev_info(ctrl->ctrl.device, + "failed to connect queue: %d ret=%d\n", idx, ret); + } + return ret; +} + +static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvme_rdma_start_queue(ctrl, i); + if (ret) + goto out_stop_queues; + } + + return 0; + +out_stop_queues: + for (i--; i >= 1; i--) + nvme_rdma_stop_queue(&ctrl->queues[i]); + return ret; +} + +static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + struct ib_device *ibdev = ctrl->device->dev; + unsigned int nr_io_queues, nr_default_queues; + unsigned int nr_read_queues, nr_poll_queues; + int i, ret; + + nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors, + min(opts->nr_io_queues, num_online_cpus())); + nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors, + min(opts->nr_write_queues, num_online_cpus())); + nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus()); + nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues; + + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret) + return ret; + + if (nr_io_queues == 0) { + dev_err(ctrl->ctrl.device, + "unable to set any I/O queues\n"); + return -ENOMEM; + } + + ctrl->ctrl.queue_count = nr_io_queues + 1; + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", nr_io_queues); + + if (opts->nr_write_queues && nr_read_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues; + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(nr_default_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. 
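The read/default/poll split that starts above and continues just below hands out dedicated default queues only once the read queues are fully satisfied, and gives poll queues whatever is left. A standalone sketch of the same arithmetic, simplified by assuming the requested counts have already been clamped to CPU and vector limits as the driver does earlier (demo names are illustrative):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static void demo_split(unsigned int granted, unsigned int want_read,
                       unsigned int want_default, unsigned int want_poll)
{
        unsigned int q_read = 0, q_default = 0, q_poll = 0;

        if (want_default && want_read < granted) {
                /* separate read/write queues */
                q_read = want_read;
                granted -= q_read;
                q_default = MIN(want_default, granted);
                granted -= q_default;
        } else {
                /* shared read/write queues */
                q_default = MIN(want_read, granted);
                granted -= q_default;
        }

        if (want_poll && granted)
                q_poll = MIN(want_poll, granted);

        printf("default=%u read=%u poll=%u\n", q_default, q_read, q_poll);
}

int main(void)
{
        demo_split(8, 4, 4, 2);         /* 8 granted: dedicated defaults */
        demo_split(3, 4, 4, 2);         /* too few: shared queues only   */
        return 0;
}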
+ */ + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(nr_read_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(nr_poll_queues, nr_io_queues); + } + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvme_rdma_alloc_queue(ctrl, i, + ctrl->ctrl.sqsize + 1); + if (ret) + goto out_free_queues; + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_rdma_free_queue(&ctrl->queues[i]); + + return ret; +} + +static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_admin_mq_ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = nctrl->numa_node; + set->cmd_size = sizeof(struct nvme_rdma_request) + + NVME_RDMA_DATA_SGL_SIZE; + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = NVME_ADMIN_TIMEOUT; + set->flags = BLK_MQ_F_NO_SCHED; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_rdma_mq_ops; + set->queue_depth = nctrl->sqsize + 1; + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = nctrl->numa_node; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_rdma_request) + + NVME_RDMA_DATA_SGL_SIZE; + if (nctrl->max_integrity_segments) + set->cmd_size += sizeof(struct nvme_rdma_sgl) + + NVME_RDMA_METADATA_SGL_SIZE; + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); + + return set; +} + +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + if (remove) { + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); + } + if (ctrl->async_event_sqe.data) { + cancel_work_sync(&ctrl->ctrl.async_event_work); + nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + ctrl->async_event_sqe.data = NULL; + } + nvme_rdma_free_queue(&ctrl->queues[0]); +} + +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool new) +{ + bool pi_capable = false; + int error; + + error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + if (error) + return error; + + ctrl->device = ctrl->queues[0].device; + ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev); + + /* T10-PI support */ + if (ctrl->device->dev->attrs.device_cap_flags & + IB_DEVICE_INTEGRITY_HANDOVER) + pi_capable = true; + + ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev, + pi_capable); + + /* + * Bind the async event SQE DMA mapping to the admin queue lifetime. + * It's safe, since any chage in the underlying RDMA device will issue + * error recovery and queue re-creation. 
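ctrl->max_fr_pages computed just above feeds the max_segments and max_hw_sectors limits set a few lines further down: one 4 KiB MR page-list entry per segment, converted to 512-byte block-layer sectors. A small sketch of that conversion, assuming the same 4 KiB MR page size (SZ_4K) used here and the 256-page NVME_RDMA_MAX_SEGMENTS cap defined earlier in this file:

#include <stdio.h>

#define SECTOR_SHIFT    9       /* 512-byte block-layer sectors */
#define MR_PAGE_SHIFT   12      /* 4 KiB MR page size (SZ_4K)   */

/* pages-per-MR -> largest single I/O, expressed in 512B sectors */
static unsigned int demo_max_hw_sectors(unsigned int max_fr_pages)
{
        return max_fr_pages << (MR_PAGE_SHIFT - SECTOR_SHIFT);
}

int main(void)
{
        unsigned int max_fr_pages = 256;        /* NVME_RDMA_MAX_SEGMENTS */

        printf("max_fr_pages=%u -> max_hw_sectors=%u (%u KiB per I/O)\n",
               max_fr_pages, demo_max_hw_sectors(max_fr_pages),
               demo_max_hw_sectors(max_fr_pages) / 2);
        return 0;
}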
+ */ + error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + if (error) + goto out_free_queue; + + if (new) { + ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); + if (IS_ERR(ctrl->ctrl.admin_tagset)) { + error = PTR_ERR(ctrl->ctrl.admin_tagset); + goto out_free_async_qe; + } + + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + error = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_tagset; + } + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_cleanup_fabrics_q; + } + } + + error = nvme_rdma_start_queue(ctrl, 0); + if (error) + goto out_cleanup_queue; + + error = nvme_enable_ctrl(&ctrl->ctrl); + if (error) + goto out_stop_queue; + + ctrl->ctrl.max_segments = ctrl->max_fr_pages; + ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9); + if (pi_capable) + ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages; + else + ctrl->ctrl.max_integrity_segments = 0; + + nvme_start_admin_queue(&ctrl->ctrl); + + error = nvme_init_ctrl_finish(&ctrl->ctrl); + if (error) + goto out_quiesce_queue; + + return 0; + +out_quiesce_queue: + if (new) + nvme_shutdown_ctrl(&ctrl->ctrl); + else + nvme_disable_ctrl(&ctrl->ctrl); + + nvme_stop_admin_queue(&ctrl->ctrl); + blk_sync_queue(ctrl->ctrl.admin_q); +out_stop_queue: + nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_cancel_admin_tagset(&ctrl->ctrl); +out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.fabrics_q); +out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); +out_free_async_qe: + if (ctrl->async_event_sqe.data) { + nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + ctrl->async_event_sqe.data = NULL; + } +out_free_queue: + nvme_rdma_free_queue(&ctrl->queues[0]); + return error; +} + +static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + if (remove) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(ctrl->ctrl.tagset); + } + nvme_rdma_free_io_queues(ctrl); +} + +static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) +{ + int ret; + + ret = nvme_rdma_alloc_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false); + if (IS_ERR(ctrl->ctrl.tagset)) { + ret = PTR_ERR(ctrl->ctrl.tagset); + goto out_free_io_queues; + } + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + } + + ret = nvme_rdma_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + if (!new) { + nvme_start_queues(&ctrl->ctrl); + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. 
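The nvme_wait_freeze_timeout() call above boils down to a bounded wait: keep checking a condition until it holds or the deadline passes, and report which one happened so the caller can take the -ENODEV failure used just below. A self-contained userspace approximation of that shape, with the NVMe freeze bookkeeping replaced by a plain flag (the helper names and 1 ms polling interval are assumptions, not the driver's internals):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static long long now_ms(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

/* Poll *flag until it becomes true or timeout_ms elapses. */
static bool demo_wait_flag_timeout(const volatile bool *flag, long timeout_ms)
{
        long long deadline = now_ms() + timeout_ms;
        struct timespec nap = { .tv_nsec = 1000000 };   /* 1 ms */

        while (!*flag) {
                if (now_ms() >= deadline)
                        return false;   /* caller treats this as "stuck" */
                nanosleep(&nap, NULL);
        }
        return true;
}

int main(void)
{
        volatile bool frozen = false;

        if (!demo_wait_flag_timeout(&frozen, 50))
                printf("timed out waiting for freeze -> fail setup (-ENODEV)\n");
        return 0;
}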
+ */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } + blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, + ctrl->ctrl.queue_count - 1); + nvme_unfreeze(&ctrl->ctrl); + } + + return 0; + +out_wait_freeze_timed_out: + nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); +out_cleanup_connect_q: + nvme_cancel_tagset(&ctrl->ctrl); + if (new) + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->ctrl.tagset); +out_free_io_queues: + nvme_rdma_free_io_queues(ctrl); + return ret; +} + +static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + nvme_stop_admin_queue(&ctrl->ctrl); + blk_sync_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_cancel_admin_tagset(&ctrl->ctrl); + if (remove) + nvme_start_admin_queue(&ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl, remove); +} + +static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + if (ctrl->ctrl.queue_count > 1) { + nvme_start_freeze(&ctrl->ctrl); + nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); + nvme_cancel_tagset(&ctrl->ctrl); + if (remove) + nvme_start_queues(&ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, remove); + } +} + +static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->reconnect_work); +} + +static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl->queues); + kfree(ctrl); +} + +static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_LIVE); + return; + } + + if (nvmf_should_reconnect(&ctrl->ctrl)) { + dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n", + ctrl->ctrl.opts->reconnect_delay); + queue_delayed_work(nvme_wq, &ctrl->reconnect_work, + ctrl->ctrl.opts->reconnect_delay * HZ); + } else { + nvme_delete_ctrl(&ctrl->ctrl); + } +} + +static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) +{ + int ret; + bool changed; + + ret = nvme_rdma_configure_admin_queue(ctrl, new); + if (ret) + return ret; + + if (ctrl->ctrl.icdoff) { + ret = -EOPNOTSUPP; + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + goto disable_ctrl; + } + + if (!(ctrl->ctrl.sgls & (1 << 2))) { + ret = -EOPNOTSUPP; + dev_err(ctrl->ctrl.device, + "Mandatory keyed sgls are not supported!\n"); + goto disable_ctrl; + } + + if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) { + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); + } + + if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) { + dev_warn(ctrl->ctrl.device, + "ctrl sqsize %u > max queue size %u, clamping down\n", + ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE); + ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1; + } + + if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { + dev_warn(ctrl->ctrl.device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + 
ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); + ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; + } + + if (ctrl->ctrl.sgls & (1 << 20)) + ctrl->use_inline_data = true; + + if (ctrl->ctrl.queue_count > 1) { + ret = nvme_rdma_configure_io_queues(ctrl, new); + if (ret) + goto disable_ctrl; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + if (!changed) { + /* + * state change failure is ok if we started ctrl delete, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); + WARN_ON_ONCE(new); + ret = -EINVAL; + goto destroy_io; + } + + nvme_start_ctrl(&ctrl->ctrl); + return 0; + +destroy_io: + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + nvme_sync_io_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); + nvme_cancel_tagset(&ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, new); + } +disable_ctrl: + if (new) + nvme_shutdown_ctrl(&ctrl->ctrl); + else + nvme_disable_ctrl(&ctrl->ctrl); + + nvme_stop_admin_queue(&ctrl->ctrl); + blk_sync_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_cancel_admin_tagset(&ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl, new); + return ret; +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + + ++ctrl->ctrl.nr_reconnects; + + if (nvme_rdma_setup_ctrl(ctrl, false)) + goto requeue; + + dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", + ctrl->ctrl.nr_reconnects); + + ctrl->ctrl.nr_reconnects = 0; + + return; + +requeue: + dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", + ctrl->ctrl.nr_reconnects); + nvme_rdma_reconnect_or_remove(ctrl); +} + +static void nvme_rdma_error_recovery_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, err_work); + + nvme_stop_keep_alive(&ctrl->ctrl); + flush_work(&ctrl->ctrl.async_event_work); + nvme_rdma_teardown_io_queues(ctrl, false); + nvme_start_queues(&ctrl->ctrl); + nvme_rdma_teardown_admin_queue(ctrl, false); + nvme_start_admin_queue(&ctrl->ctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && + ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); + return; + } + + nvme_rdma_reconnect_or_remove(ctrl); +} + +static int nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) { + if (ctrl->ctrl.state == NVME_CTRL_RESETTING) + return 0; + else + return -EAGAIN; + } + + dev_warn(ctrl->ctrl.device, "starting error recovery\n"); + queue_work(nvme_reset_wq, &ctrl->err_work); + return 0; +} + +static void nvme_rdma_end_request(struct nvme_rdma_request *req) +{ + struct request *rq = blk_mq_rq_from_pdu(req); + + if (!refcount_dec_and_test(&req->ref)) + return; + if (!nvme_try_complete_req(rq, req->status, req->result)) + nvme_rdma_complete_rq(rq); +} + +static int nvme_rdma_wr_error(struct nvme_rdma_ctrl *ctrl, struct ib_wc *wc, + const char *op) +{ + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + dev_info(ctrl->ctrl.device, + "%s for CQE 0x%p failed with status %s (%d)\n", + op, wc->wr_cqe, + ib_wc_status_msg(wc->status), wc->status); + return nvme_rdma_error_recovery(ctrl); +} + +static void 
nvme_rdma_req_error(struct nvme_rdma_request *req, struct ib_wc *wc, + const char *op) +{ + struct nvme_rdma_queue *queue = req->queue; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + if (nvme_rdma_wr_error(ctrl, wc, op) && + wc->status != IB_WC_WR_FLUSH_ERR) { + req->status = cpu_to_le16((NVME_SC_ABORT_REQ | + NVME_SC_DNR) << 1); + nvme_try_complete_req(blk_mq_rq_from_pdu(req), req->status, + req->result); + } +} + +static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_request *req = + container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); + + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_req_error(req, wc, "MEMREG"); +} + +static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_request *req = + container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); + + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_req_error(req, wc, "LOCAL_INV"); + else + nvme_rdma_end_request(req); +} + +static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req) +{ + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .next = NULL, + .num_sge = 0, + .send_flags = IB_SEND_SIGNALED, + .ex.invalidate_rkey = req->mr->rkey, + }; + + req->reg_cqe.done = nvme_rdma_inv_rkey_done; + wr.wr_cqe = &req->reg_cqe; + + return ib_post_send(queue->qp, &wr, NULL); +} + +#ifdef CONFIG_NVFS +#include "nvfs-rdma.h" +#endif + +static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, + struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + struct list_head *pool = &queue->qp->rdma_mrs; + + if (!blk_rq_nr_phys_segments(rq)) + return; + + if (blk_integrity_rq(rq)) { + ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, rq_dma_dir(rq)); + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); + } + + if (req->use_sig_mr) + pool = &queue->qp->sig_mrs; + + if (req->mr) { + ib_mr_pool_put(queue->qp, pool, req->mr); + req->mr = NULL; + } + +#ifdef CONFIG_NVFS + if (nvme_rdma_nvfs_unmap_data(queue, rq)) + return; +#endif + + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); +} + +static int nvme_rdma_set_sg_null(struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = 0; + put_unaligned_le24(0, sg->length); + put_unaligned_le32(0, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + struct ib_sge *sge = &req->sge[1]; + struct scatterlist *sgl; + u32 len = 0; + int i; + + for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) { + sge->addr = sg_dma_address(sgl); + sge->length = sg_dma_len(sgl); + sge->lkey = queue->device->pd->local_dma_lkey; + len += sge->length; + sge++; + } + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(len); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; + + req->num_sge += count; + return 0; +} + +static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = 
cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl)); + put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length); + put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + + req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs); + if (WARN_ON_ONCE(!req->mr)) + return -EAGAIN; + + /* + * Align the MR to a 4K page size to match the ctrl page size and + * the block virtual boundary. + */ + nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL, + SZ_4K); + if (unlikely(nr < count)) { + ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr); + req->mr = NULL; + if (nr < 0) + return nr; + return -EINVAL; + } + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + req->reg_cqe.done = nvme_rdma_memreg_done; + memset(&req->reg_wr, 0, sizeof(req->reg_wr)); + req->reg_wr.wr.opcode = IB_WR_REG_MR; + req->reg_wr.wr.wr_cqe = &req->reg_cqe; + req->reg_wr.wr.num_sge = 0; + req->reg_wr.mr = req->mr; + req->reg_wr.key = req->mr->rkey; + req->reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + sg->addr = cpu_to_le64(req->mr->iova); + put_unaligned_le24(req->mr->length, sg->length); + put_unaligned_le32(req->mr->rkey, sg->key); + sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_INVALIDATE; + + return 0; +} + +static void nvme_rdma_set_sig_domain(struct blk_integrity *bi, + struct nvme_command *cmd, struct ib_sig_domain *domain, + u16 control, u8 pi_type) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = 1 << bi->interval_exp; + domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); + if (control & NVME_RW_PRINFO_PRCHK_REF) + domain->sig.dif.ref_remap = true; + + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_escape = true; + if (pi_type == NVME_NS_DPS_PI_TYPE3) + domain->sig.dif.ref_escape = true; +} + +static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi, + struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs, + u8 pi_type) +{ + u16 control = le16_to_cpu(cmd->rw.control); + + memset(sig_attrs, 0, sizeof(*sig_attrs)); + if (control & NVME_RW_PRINFO_PRACT) { + /* for WRITE_INSERT/READ_STRIP no memory domain */ + sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE; + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, + pi_type); + /* Clear the PRACT bit since HCA will generate/verify the PI */ + control &= ~NVME_RW_PRINFO_PRACT; + cmd->rw.control = cpu_to_le16(control); + } else { + /* for WRITE_PASS/READ_PASS both wire/memory domains exist */ + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, + pi_type); + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + } +} + +static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask) +{ + *mask = 0; + if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF) + *mask |= IB_SIG_CHECK_REFTAG; + if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD) + *mask |= IB_SIG_CHECK_GUARD; +} + +static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_request *req = + container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe); + + if 
(unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_req_error(req, wc, "SIG"); +} + +static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count, int pi_count) +{ + struct nvme_rdma_sgl *sgl = &req->data_sgl; + struct ib_reg_wr *wr = &req->reg_wr; + struct request *rq = blk_mq_rq_from_pdu(req); + struct nvme_ns *ns = rq->q->queuedata; + struct bio *bio = rq->bio; + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + + req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs); + if (WARN_ON_ONCE(!req->mr)) + return -EAGAIN; + + nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL, + req->metadata_sgl->sg_table.sgl, pi_count, NULL, + SZ_4K); + if (unlikely(nr)) + goto mr_put; + + nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c, + req->mr->sig_attrs, ns->pi_type); + nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask); + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + req->reg_cqe.done = nvme_rdma_sig_done; + memset(wr, 0, sizeof(*wr)); + wr->wr.opcode = IB_WR_REG_MR_INTEGRITY; + wr->wr.wr_cqe = &req->reg_cqe; + wr->wr.num_sge = 0; + wr->wr.send_flags = 0; + wr->mr = req->mr; + wr->key = req->mr->rkey; + wr->access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + sg->addr = cpu_to_le64(req->mr->iova); + put_unaligned_le24(req->mr->length, sg->length); + put_unaligned_le32(req->mr->rkey, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + + return 0; + +mr_put: + ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr); + req->mr = NULL; + if (nr < 0) + return nr; + return -EINVAL; +} + +static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, + struct request *rq, struct nvme_command *c) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int pi_count = 0; + int count, ret; + + req->num_sge = 1; + refcount_set(&req->ref, 2); /* send and recv completions */ + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_nr_phys_segments(rq)) + return nvme_rdma_set_sg_null(c); + + req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1); + ret = sg_alloc_table_chained(&req->data_sgl.sg_table, + blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl, + NVME_INLINE_SG_CNT); + if (ret) + return -ENOMEM; + +#ifdef CONFIG_NVFS + { + bool is_nvfs_io = false; + ret = nvme_rdma_nvfs_map_data(queue, rq, c, &is_nvfs_io); + if (is_nvfs_io) + return ret; + } +#endif + + req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, + req->data_sgl.sg_table.sgl); + + count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, + req->data_sgl.nents, rq_dma_dir(rq)); + if (unlikely(count <= 0)) { + ret = -EIO; + goto out_free_table; + } + + if (blk_integrity_rq(rq)) { + req->metadata_sgl->sg_table.sgl = + (struct scatterlist *)(req->metadata_sgl + 1); + ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table, + blk_rq_count_integrity_sg(rq->q, rq->bio), + req->metadata_sgl->sg_table.sgl, + NVME_INLINE_METADATA_SG_CNT); + if (unlikely(ret)) { + ret = -ENOMEM; + goto out_unmap_sg; + } + + req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q, + rq->bio, req->metadata_sgl->sg_table.sgl); + pi_count = ib_dma_map_sg(ibdev, + req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, + rq_dma_dir(rq)); + if (unlikely(pi_count <= 0)) { + ret = -EIO; + goto out_free_pi_table; + } + } + + if (req->use_sig_mr) { + ret = nvme_rdma_map_sg_pi(queue, req, c, 
count, pi_count); + goto out; + } + + if (count <= dev->num_inline_segments) { + if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + queue->ctrl->use_inline_data && + blk_rq_payload_bytes(rq) <= + nvme_rdma_inline_data_size(queue)) { + ret = nvme_rdma_map_sg_inline(queue, req, c, count); + goto out; + } + + if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + ret = nvme_rdma_map_sg_single(queue, req, c); + goto out; + } + } + + ret = nvme_rdma_map_sg_fr(queue, req, c, count); +out: + if (unlikely(ret)) + goto out_unmap_pi_sg; + + return 0; + +out_unmap_pi_sg: + if (blk_integrity_rq(rq)) + ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, rq_dma_dir(rq)); +out_free_pi_table: + if (blk_integrity_rq(rq)) + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); +out_unmap_sg: + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); +out_free_table: + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + return ret; +} + +static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_request *req = + container_of(qe, struct nvme_rdma_request, sqe); + + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_req_error(req, wc, "SEND"); + else + nvme_rdma_end_request(req); +} + +static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, + struct ib_send_wr *first) +{ + struct ib_send_wr wr; + int ret; + + sge->addr = qe->dma; + sge->length = sizeof(struct nvme_command); + sge->lkey = queue->device->pd->local_dma_lkey; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = sge; + wr.num_sge = num_sge; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + if (first) + first->next = ≀ + else + first = ≀ + + ret = ib_post_send(queue->qp, first, NULL); + if (unlikely(ret)) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe) +{ + struct ib_recv_wr wr; + struct ib_sge list; + int ret; + + list.addr = qe->dma; + list.length = sizeof(struct nvme_completion); + list.lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_recv_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + + ret = ib_post_recv(queue->qp, &wr, NULL); + if (unlikely(ret)) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) +{ + u32 queue_idx = nvme_rdma_queue_idx(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_queue *queue = wc->qp->qp_context; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(queue->ctrl, wc, "ASYNC"); +} + +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + struct ib_device *dev = queue->device->dev; + struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe; + struct nvme_command *cmd = sqe->data; + struct ib_sge sge; + int ret; + + 
ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); + + memset(cmd, 0, sizeof(*cmd)); + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_rdma_set_sg_null(cmd); + + sqe->cqe.done = nvme_rdma_async_done; + + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), + DMA_TO_DEVICE); + + ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL); + WARN_ON_ONCE(ret); +} + +static struct nvme_rdma_request *nvme_rdma_comp_to_req( + struct nvme_rdma_queue *queue, struct nvme_completion *cqe) +{ + struct request *rq; + struct nvme_rdma_request *req; + + rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "got bad command_id %#x on QP %#x\n", + cqe->command_id, queue->qp->qp_num); + nvme_rdma_error_recovery(queue->ctrl); + return NULL; + } + req = blk_mq_rq_to_pdu(rq); + + req->status = cqe->status; + req->result = cqe->result; + + return req; +} +static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc) +{ + struct nvme_rdma_request *req; + int ret = 0; + + req = nvme_rdma_comp_to_req(queue, cqe); + if (!req) { + nvme_rdma_error_recovery(queue->ctrl); + return ret; + } + + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + if (unlikely(!req->mr || + wc->ex.invalidate_rkey != req->mr->rkey)) { + dev_err(queue->ctrl->ctrl.device, + "Bogus remote invalidation for rkey %#x\n", + req->mr ? req->mr->rkey : 0); + nvme_rdma_error_recovery(queue->ctrl); + } + } else if (req->mr) { + int ret; + + ret = nvme_rdma_inv_rkey(queue, req); + if (unlikely(ret < 0)) { + dev_err(queue->ctrl->ctrl.device, + "Queueing INV WR for rkey %#x failed (%d)\n", + req->mr->rkey, ret); + nvme_rdma_error_recovery(queue->ctrl); + } + /* the local invalidation completion will end the request */ + return ret; + } + + nvme_rdma_end_request(req); + + return ret; +} + +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_queue *queue = wc->qp->qp_context; + struct ib_device *ibdev = queue->device->dev; + struct nvme_completion *cqe = qe->data; + const size_t len = sizeof(struct nvme_completion); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(queue->ctrl, wc, "RECV"); + return; + } + + /* sanity checking for received data length */ + if (unlikely(wc->byte_len < len)) { + dev_err(queue->ctrl->ctrl.device, + "Unexpected nvme completion length(%d)\n", wc->byte_len); + nvme_rdma_error_recovery(queue->ctrl); + return; + } + + ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
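The special-casing described above hinges on the command_id convention visible in nvme_rdma_submit_async_event(): the AEN command is posted on the admin queue with command_id set to NVME_AQ_BLK_MQ_DEPTH, above the tag range used by regular admin requests, so it can be recognized without a struct request. A minimal sketch of that classification; the 32/1 depth split is taken from the common NVMe host definitions and should be treated as an assumption here:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_AQ_DEPTH           32      /* NVME_AQ_DEPTH                */
#define DEMO_NR_AERS            1       /* async event slots reserved   */
#define DEMO_AQ_BLK_MQ_DEPTH    (DEMO_AQ_DEPTH - DEMO_NR_AERS)

/* Completions on the admin queue with an out-of-range tag are AENs. */
static bool demo_is_aen(unsigned int qid, unsigned short command_id)
{
        return qid == 0 && command_id >= DEMO_AQ_BLK_MQ_DEPTH;
}

int main(void)
{
        printf("qid=0 cid=%d -> %s\n", DEMO_AQ_BLK_MQ_DEPTH,
               demo_is_aen(0, DEMO_AQ_BLK_MQ_DEPTH) ? "AEN" : "regular");
        printf("qid=1 cid=%d -> %s\n", DEMO_AQ_BLK_MQ_DEPTH,
               demo_is_aen(1, DEMO_AQ_BLK_MQ_DEPTH) ? "AEN" : "regular");
        return 0;
}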
+ */ + if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue), + cqe->command_id))) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + else + nvme_rdma_process_nvme_rsp(queue, cqe, wc); + ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); + + nvme_rdma_post_recv(queue, qe); +} + +static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) +{ + int ret, i; + + for (i = 0; i < queue->queue_size; i++) { + ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]); + if (ret) + return ret; + } + + return 0; +} + +static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, + struct rdma_cm_event *ev) +{ + struct rdma_cm_id *cm_id = queue->cm_id; + int status = ev->status; + const char *rej_msg; + const struct nvme_rdma_cm_rej *rej_data; + u8 rej_data_len; + + rej_msg = rdma_reject_msg(cm_id, status); + rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len); + + if (rej_data && rej_data_len >= sizeof(u16)) { + u16 sts = le16_to_cpu(rej_data->sts); + + dev_err(queue->ctrl->ctrl.device, + "Connect rejected: status %d (%s) nvme status %d (%s).\n", + status, rej_msg, sts, nvme_rdma_cm_msg(sts)); + } else { + dev_err(queue->ctrl->ctrl.device, + "Connect rejected: status %d (%s).\n", status, rej_msg); + } + + return -ECONNRESET; +} + +static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_ctrl *ctrl = &queue->ctrl->ctrl; + int ret; + + ret = nvme_rdma_create_queue_ib(queue); + if (ret) + return ret; + + if (ctrl->opts->tos >= 0) + rdma_set_service_type(queue->cm_id, ctrl->opts->tos); + ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS); + if (ret) { + dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n", + queue->cm_error); + goto out_destroy_queue; + } + + return 0; + +out_destroy_queue: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_req priv = { }; + int ret; + + param.qp_num = queue->qp->qp_num; + param.flow_control = 1; + + param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom; + /* maximum retry count */ + param.retry_count = 7; + param.rnr_retry_count = 7; + param.private_data = &priv; + param.private_data_len = sizeof(priv); + + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); + /* + * set the admin queue depth to the minimum size + * specified by the Fabrics standard. + */ + if (priv.qid == 0) { + priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH); + priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); + } else { + /* + * current interpretation of the fabrics spec + * is at minimum you make hrqsize sqsize+1, or a + * 1's based representation of sqsize. 
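As the comment in nvme_rdma_route_resolved() above explains, the connect private data carries a 1's-based hrqsize and a 0's-based hsqsize, with the admin queue pinned to the Fabrics minimum depth. A compact sketch of the values that end up being sent for the admin queue versus an I/O queue; the field names mirror the ones used here, and the AQ depth of 32 (NVME_AQ_DEPTH) is an assumption:

#include <stdio.h>
#include <stdint.h>

#define DEMO_AQ_DEPTH   32      /* NVME_AQ_DEPTH */

struct demo_cm_req {            /* trimmed view of nvme_rdma_cm_req */
        uint16_t qid;
        uint16_t hrqsize;       /* host receive queue size, 1's based */
        uint16_t hsqsize;       /* host send queue size, 0's based    */
};

static struct demo_cm_req demo_fill_priv(uint16_t qid, uint16_t queue_size,
                                         uint16_t sqsize)
{
        struct demo_cm_req p = { .qid = qid };

        if (qid == 0) {
                p.hrqsize = DEMO_AQ_DEPTH;
                p.hsqsize = DEMO_AQ_DEPTH - 1;
        } else {
                p.hrqsize = queue_size;         /* sqsize + 1 */
                p.hsqsize = sqsize;
        }
        return p;
}

int main(void)
{
        struct demo_cm_req admin = demo_fill_priv(0, 0, 0);
        struct demo_cm_req io = demo_fill_priv(3, 128, 127);

        printf("admin: hrqsize=%u hsqsize=%u\n", admin.hrqsize, admin.hsqsize);
        printf("io q3: hrqsize=%u hsqsize=%u\n", io.hrqsize, io.hsqsize);
        return 0;
}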
+ */ + priv.hrqsize = cpu_to_le16(queue->queue_size); + priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); + } + + ret = rdma_connect_locked(queue->cm_id, ¶m); + if (ret) { + dev_err(ctrl->ctrl.device, + "rdma_connect_locked failed (%d).\n", ret); + return ret; + } + + return 0; +} + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct nvme_rdma_queue *queue = cm_id->context; + int cm_error = 0; + + dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n", + rdma_event_msg(ev->event), ev->event, + ev->status, cm_id); + + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cm_error = nvme_rdma_addr_resolved(queue); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cm_error = nvme_rdma_route_resolved(queue); + break; + case RDMA_CM_EVENT_ESTABLISHED: + queue->cm_error = nvme_rdma_conn_established(queue); + /* complete cm_done regardless of success/failure */ + complete(&queue->cm_done); + return 0; + case RDMA_CM_EVENT_REJECTED: + cm_error = nvme_rdma_conn_rejected(queue, ev); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_ADDR_ERROR: + dev_dbg(queue->ctrl->ctrl.device, + "CM error event %d\n", ev->event); + cm_error = -ECONNRESET; + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + dev_dbg(queue->ctrl->ctrl.device, + "disconnect received - connection closed\n"); + nvme_rdma_error_recovery(queue->ctrl); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* device removal is handled via the ib_client API */ + break; + default: + dev_err(queue->ctrl->ctrl.device, + "Unexpected RDMA CM event (%d)\n", ev->event); + nvme_rdma_error_recovery(queue->ctrl); + break; + } + + if (cm_error) { + queue->cm_error = cm_error; + complete(&queue->cm_done); + } + + return 0; +} + +static void nvme_rdma_complete_timed_out(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + + nvme_rdma_stop_queue(queue); + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } +} + +static enum blk_eh_timer_return +nvme_rdma_timeout(struct request *rq, bool reserved) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", + rq->tag, nvme_rdma_queue_idx(queue)); + + if (ctrl->ctrl.state != NVME_CTRL_LIVE) { + /* + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. + */ + nvme_rdma_complete_timed_out(rq); + return BLK_EH_DONE; + } + + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. 
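The timeout handler above has exactly two outcomes: outside the LIVE state the request is completed on the spot, and in LIVE it is handed to the error-recovery work while the timer is re-armed. A condensed sketch of that decision, with the controller-state and return-value names shortened from the ones used in this file:

#include <stdio.h>

enum demo_ctrl_state { DEMO_NEW, DEMO_LIVE, DEMO_RESETTING,
                       DEMO_CONNECTING, DEMO_DELETING };
enum demo_eh_ret { DEMO_EH_DONE, DEMO_EH_RESET_TIMER };

/* Mirrors the branch in nvme_rdma_timeout(). */
static enum demo_eh_ret demo_timeout(enum demo_ctrl_state state)
{
        if (state != DEMO_LIVE) {
                /* teardown/setup path: complete the request right away */
                return DEMO_EH_DONE;
        }
        /* normal path: kick error recovery, let it finish the request */
        return DEMO_EH_RESET_TIMER;
}

int main(void)
{
        printf("LIVE       -> %s\n", demo_timeout(DEMO_LIVE) ==
               DEMO_EH_RESET_TIMER ? "RESET_TIMER" : "DONE");
        printf("CONNECTING -> %s\n", demo_timeout(DEMO_CONNECTING) ==
               DEMO_EH_RESET_TIMER ? "RESET_TIMER" : "DONE");
        return 0;
}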
+ */ + nvme_rdma_error_recovery(ctrl); + return BLK_EH_RESET_TIMER; +} + +static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_rdma_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_qe *sqe = &req->sqe; + struct nvme_command *c = nvme_req(rq)->cmd; + struct ib_device *dev; + bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); + blk_status_t ret; + int err; + + WARN_ON_ONCE(rq->tag < 0); + + if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); + + dev = queue->device->dev; + + req->sqe.dma = ib_dma_map_single(dev, req->sqe.data, + sizeof(struct nvme_command), + DMA_TO_DEVICE); + err = ib_dma_mapping_error(dev, req->sqe.dma); + if (unlikely(err)) + return BLK_STS_RESOURCE; + + ib_dma_sync_single_for_cpu(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + ret = nvme_setup_cmd(ns, rq); + if (ret) + goto unmap_qe; + + blk_mq_start_request(rq); + + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + queue->pi_support && + (c->common.opcode == nvme_cmd_write || + c->common.opcode == nvme_cmd_read) && + nvme_ns_has_pi(ns)) + req->use_sig_mr = true; + else + req->use_sig_mr = false; + + err = nvme_rdma_map_data(queue, rq, c); + if (unlikely(err < 0)) { + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", err); + goto err; + } + + sqe->cqe.done = nvme_rdma_send_done; + + ib_dma_sync_single_for_device(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + req->mr ? &req->reg_wr.wr : NULL); + if (unlikely(err)) + goto err_unmap; + + return BLK_STS_OK; + +err_unmap: + nvme_rdma_unmap_data(queue, rq); +err: + if (err == -EIO) + ret = nvme_host_path_error(rq); + else if (err == -ENOMEM || err == -EAGAIN) + ret = BLK_STS_RESOURCE; + else + ret = BLK_STS_IOERR; + nvme_cleanup_cmd(rq); +unmap_qe: + ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); + return ret; +} + +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct nvme_rdma_queue *queue = hctx->driver_data; + + return ib_process_cq_direct(queue->ib_cq, -1); +} + +static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req) +{ + struct request *rq = blk_mq_rq_from_pdu(req); + struct ib_mr_status mr_status; + int ret; + + ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + nvme_req(rq)->status = NVME_SC_INVALID_PI; + return; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + nvme_req(rq)->status = NVME_SC_GUARD_CHECK; + break; + case IB_SIG_BAD_REFTAG: + nvme_req(rq)->status = NVME_SC_REFTAG_CHECK; + break; + case IB_SIG_BAD_APPTAG: + nvme_req(rq)->status = NVME_SC_APPTAG_CHECK; + break; + } + pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, mr_status.sig_err.expected, + mr_status.sig_err.actual); + } +} + +static void nvme_rdma_complete_rq(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct ib_device *ibdev = queue->device->dev; + + if (req->use_sig_mr) + nvme_rdma_check_pi_status(req); + + 
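	/*
	 * (Editorial note: completion unwinds the submission path in reverse
	 * order - MR/SGL teardown in nvme_rdma_unmap_data(), then the DMA
	 * unmap of the command capsule, and only then nvme_complete_rq().)
	 */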
nvme_rdma_unmap_data(queue, rq); + ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); + nvme_complete_rq(rq); +} + +static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_READ]; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* shared read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], + ctrl->device->dev, 0); + blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], + ctrl->device->dev, 0); + + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->io_queues[HCTX_TYPE_POLL]; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + + dev_info(ctrl->ctrl.device, + "mapped %d/%d/%d default/read/poll queues.\n", + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); + + return 0; +} + +static const struct blk_mq_ops nvme_rdma_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .init_hctx = nvme_rdma_init_hctx, + .timeout = nvme_rdma_timeout, + .map_queues = nvme_rdma_map_queues, + .poll = nvme_rdma_poll, +}; + +static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .init_hctx = nvme_rdma_init_admin_hctx, + .timeout = nvme_rdma_timeout, +}; + +static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) +{ + nvme_rdma_teardown_io_queues(ctrl, shutdown); + nvme_stop_admin_queue(&ctrl->ctrl); + if (shutdown) + nvme_shutdown_ctrl(&ctrl->ctrl); + else + nvme_disable_ctrl(&ctrl->ctrl); + nvme_rdma_teardown_admin_queue(ctrl, shutdown); +} + +static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); +} + +static void nvme_rdma_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = + container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); + + nvme_stop_ctrl(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl, false); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + + if (nvme_rdma_setup_ctrl(ctrl, false)) + goto out_fail; + + return; + +out_fail: + ++ctrl->ctrl.nr_reconnects; + nvme_rdma_reconnect_or_remove(ctrl); +} + +static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { + .name = "rdma", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED, + .reg_read32 = nvmf_reg_read32, + 
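	/*
	 * (Editorial worked example for nvme_rdma_map_queues() above, not
	 * part of the patch: if io_queues[] held DEFAULT=2, READ=4, POLL=2,
	 * the READ map would start at queue_offset 2, the POLL map at 6,
	 * and the driver would log "mapped 2/4/2 default/read/poll queues".)
	 */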
.reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_rdma_free_ctrl, + .submit_async_event = nvme_rdma_submit_async_event, + .delete_ctrl = nvme_rdma_delete_ctrl, + .get_address = nvmf_get_address, + .stop_ctrl = nvme_rdma_stop_ctrl, +}; + +/* + * Fails a connection request if it matches an existing controller + * (association) with the same tuple: + * + * + * if local address is not specified in the request, it will match an + * existing controller with all the other parameters the same and no + * local port address specified as well. + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + found = nvmf_ip_options_match(&ctrl->ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + return found; +} + +static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + int ret; + bool changed; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + if (!(opts->mask & NVMF_OPT_TRSVCID)) { + opts->trsvcid = + kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL); + if (!opts->trsvcid) { + ret = -ENOMEM; + goto out_free_ctrl; + } + opts->mask |= NVMF_OPT_TRSVCID; + } + + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->traddr, opts->trsvcid, &ctrl->addr); + if (ret) { + pr_err("malformed address passed: %s:%s\n", + opts->traddr, opts->trsvcid); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->host_traddr, NULL, &ctrl->src_addr); + if (ret) { + pr_err("malformed src address passed: %s\n", + opts->host_traddr); + goto out_free_ctrl; + } + } + + if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + + INIT_DELAYED_WORK(&ctrl->reconnect_work, + nvme_rdma_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); + + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + ret = -ENOMEM; + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_free_ctrl; + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, + 0 /* no quirks, we're perfect! 
*/); + if (ret) + goto out_kfree_queues; + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); + WARN_ON_ONCE(!changed); + + ret = nvme_rdma_setup_ctrl(ctrl, true); + if (ret) + goto out_uninit_ctrl; + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", + nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + return &ctrl->ctrl; + +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_rdma_transport = { + .name = "rdma", + .module = THIS_MODULE, + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS, + .create_ctrl = nvme_rdma_create_ctrl, +}; + +static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvme_rdma_ctrl *ctrl; + struct nvme_rdma_device *ndev; + bool found = false; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->dev == ib_device) { + found = true; + break; + } + } + mutex_unlock(&device_list_mutex); + + if (!found) + return; + + /* Delete all controllers using this device */ + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + if (ctrl->device->dev != ib_device) + continue; + nvme_delete_ctrl(&ctrl->ctrl); + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + flush_workqueue(nvme_delete_wq); +} + +static struct ib_client nvme_rdma_ib_client = { + .name = "nvme_rdma", + .remove = nvme_rdma_remove_one +}; + +static int __init nvme_rdma_init_module(void) +{ + int ret; + + ret = ib_register_client(&nvme_rdma_ib_client); + if (ret) + return ret; + + ret = nvmf_register_transport(&nvme_rdma_transport); + if (ret) + goto err_unreg_client; + + return 0; + +err_unreg_client: + ib_unregister_client(&nvme_rdma_ib_client); + return ret; +} + +static void __exit nvme_rdma_cleanup_module(void) +{ + struct nvme_rdma_ctrl *ctrl; + + nvmf_unregister_transport(&nvme_rdma_transport); + ib_unregister_client(&nvme_rdma_ib_client); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_rdma_ctrl_mutex); + flush_workqueue(nvme_delete_wq); +} + +module_init(nvme_rdma_init_module); +module_exit(nvme_rdma_cleanup_module); + +MODULE_LICENSE("GPL v2"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcm_qla2xxx_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcm_qla2xxx_dummy.c new file mode 100644 index 0000000..df6cd71 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcm_qla2xxx_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "tcm_qla2xxx" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "February 09, 2020" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("tcm_qla2xxx dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init tcm_qla2xxx_init(void) +{ + return 0; +} + +static void __exit tcm_qla2xxx_cleanup(void) +{ +} + +module_init(tcm_qla2xxx_init); +module_exit(tcm_qla2xxx_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcp.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcp.c new file mode 100644 index 0000000..35301fb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/tcp.c @@ -0,0 +1,2693 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP host. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme.h" +#include "fabrics.h" + +struct nvme_tcp_queue; + +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. 
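 *
 * (Editorial usage note, assuming the module is built as nvme_tcp: the
 * priority can be set at load time, e.g. "modprobe nvme_tcp so_priority=1",
 * or changed later via /sys/module/nvme_tcp/parameters/so_priority, since
 * the parameter below is registered with mode 0644.)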
+ */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); + +enum nvme_tcp_send_state { + NVME_TCP_SEND_CMD_PDU = 0, + NVME_TCP_SEND_H2C_PDU, + NVME_TCP_SEND_DATA, + NVME_TCP_SEND_DDGST, +}; + +struct nvme_tcp_request { + struct nvme_request req; + void *pdu; + struct nvme_tcp_queue *queue; + u32 data_len; + u32 pdu_len; + u32 pdu_sent; + u32 h2cdata_left; + u32 h2cdata_offset; + u16 ttag; + __le16 status; + struct list_head entry; + struct llist_node lentry; + __le32 ddgst; + + struct bio *curr_bio; + struct iov_iter iter; + + /* send state */ + size_t offset; + size_t data_sent; + enum nvme_tcp_send_state state; +}; + +enum nvme_tcp_queue_flags { + NVME_TCP_Q_ALLOCATED = 0, + NVME_TCP_Q_LIVE = 1, + NVME_TCP_Q_POLLING = 2, +}; + +enum nvme_tcp_recv_state { + NVME_TCP_RECV_PDU = 0, + NVME_TCP_RECV_DATA, + NVME_TCP_RECV_DDGST, +}; + +struct nvme_tcp_ctrl; +struct nvme_tcp_queue { + struct socket *sock; + struct work_struct io_work; + int io_cpu; + + struct mutex queue_lock; + struct mutex send_mutex; + struct llist_head req_list; + struct list_head send_list; + + /* recv state */ + void *pdu; + int pdu_remaining; + int pdu_offset; + size_t data_remaining; + size_t ddgst_remaining; + unsigned int nr_cqe; + + /* send state */ + struct nvme_tcp_request *request; + + int queue_size; + u32 maxh2cdata; + size_t cmnd_capsule_len; + struct nvme_tcp_ctrl *ctrl; + unsigned long flags; + bool rd_enabled; + + bool hdr_digest; + bool data_digest; + struct ahash_request *rcv_hash; + struct ahash_request *snd_hash; + __le32 exp_ddgst; + __le32 recv_ddgst; + + struct page_frag_cache pf_cache; + + void (*state_change)(struct sock *); + void (*data_ready)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvme_tcp_ctrl { + /* read only in the hot path */ + struct nvme_tcp_queue *queues; + struct blk_mq_tag_set tag_set; + + /* other member variables */ + struct list_head list; + struct blk_mq_tag_set admin_tag_set; + struct sockaddr_storage addr; + struct sockaddr_storage src_addr; + struct nvme_ctrl ctrl; + + struct work_struct err_work; + struct delayed_work connect_work; + struct nvme_tcp_request async_req; + u32 io_queues[HCTX_MAX_TYPES]; +}; + +static LIST_HEAD(nvme_tcp_ctrl_list); +static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); +static struct workqueue_struct *nvme_tcp_wq; +static const struct blk_mq_ops nvme_tcp_mq_ops; +static const struct blk_mq_ops nvme_tcp_admin_mq_ops; +static int nvme_tcp_try_send(struct nvme_tcp_queue *queue); + +static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_tcp_ctrl, ctrl); +} + +static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) +{ + u32 queue_idx = nvme_tcp_queue_id(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) +{ + return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) +{ + return req == &req->queue->ctrl->async_req; +} + +static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) +{ + struct request *rq; + + if (unlikely(nvme_tcp_async_req(req))) + return false; /* async events don't have a request */ + + rq = blk_mq_rq_from_pdu(req); + + return rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(req->queue); +} + +static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_page; +} + +static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) +{ + return req->iter.bvec->bv_offset + req->iter.iov_offset; +} + +static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) +{ + return min_t(size_t, iov_iter_single_seg_count(&req->iter), + req->pdu_len - req->pdu_sent); +} + +static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req) +{ + return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ? + req->pdu_len - req->pdu_sent : 0; +} + +static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, + int len) +{ + return nvme_tcp_pdu_data_left(req) <= len; +} + +static void nvme_tcp_init_iter(struct nvme_tcp_request *req, + unsigned int dir) +{ + struct request *rq = blk_mq_rq_from_pdu(req); + struct bio_vec *vec; + unsigned int size; + int nr_bvec; + size_t offset; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + vec = &rq->special_vec; + nr_bvec = 1; + size = blk_rq_payload_bytes(rq); + offset = 0; + } else { + struct bio *bio = req->curr_bio; + struct bvec_iter bi; + struct bio_vec bv; + + vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + nr_bvec = 0; + bio_for_each_bvec(bv, bio, bi) { + nr_bvec++; + } + size = bio->bi_iter.bi_size; + offset = bio->bi_iter.bi_bvec_done; + } + + iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size); + req->iter.iov_offset = offset; +} + +static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, + int len) +{ + req->data_sent += len; + req->pdu_sent += len; + iov_iter_advance(&req->iter, len); + if (!iov_iter_count(&req->iter) && + req->data_sent < req->data_len) { + req->curr_bio = req->curr_bio->bi_next; + nvme_tcp_init_iter(req, WRITE); + } +} + +static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue) +{ + int ret; + + /* drain the send queue as much as we can... */ + do { + ret = nvme_tcp_try_send(queue); + } while (ret > 0); +} + +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !list_empty(&queue->send_list) || + !llist_empty(&queue->req_list); +} + +static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, + bool sync, bool last) +{ + struct nvme_tcp_queue *queue = req->queue; + bool empty; + + empty = llist_add(&req->lentry, &queue->req_list) && + list_empty(&queue->send_list) && !queue->request; + + /* + * if we're the first on the send_list and we can try to send + * directly, otherwise queue io_work. Also, only do that if we + * are on the same cpu, so we don't introduce contention. 
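	 *
	 * (Editorial note: requests first land on the lockless req_list via
	 * llist_add(); nvme_tcp_process_req_list() later drains that llist
	 * into send_list, which the send path only touches with send_mutex
	 * held, so submitters never contend on a lock here.)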
+ */ + if (queue->io_cpu == raw_smp_processor_id() && + sync && empty && mutex_trylock(&queue->send_mutex)) { + nvme_tcp_send_all(queue); + mutex_unlock(&queue->send_mutex); + } + + if (last && nvme_tcp_queue_more(queue)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + struct llist_node *node; + + for (node = llist_del_all(&queue->req_list); node; node = node->next) { + req = llist_entry(node, struct nvme_tcp_request, lentry); + list_add(&req->entry, &queue->send_list); + } +} + +static inline struct nvme_tcp_request * +nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (!req) { + nvme_tcp_process_req_list(queue); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (unlikely(!req)) + return NULL; + } + + list_del(&req->entry); + return req; +} + +static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, + __le32 *dgst) +{ + ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); + crypto_ahash_final(hash); +} + +static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, + struct page *page, off_t off, size_t len) +{ + struct scatterlist sg; + + sg_init_marker(&sg, 1); + sg_set_page(&sg, page, len, off); + ahash_request_set_crypt(hash, &sg, NULL, len); + crypto_ahash_update(hash); +} + +static inline void nvme_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, + void *pdu, size_t pdu_len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: header digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + dev_err(queue->ctrl->ctrl.device, + "header digest error: recv %#x expected %#x\n", + le32_to_cpu(recv_digest), le32_to_cpu(exp_digest)); + return -EIO; + } + + return 0; +} + +static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvme_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d: data digest flag is cleared\n", + nvme_tcp_queue_id(queue)); + return -EPROTO; + } + crypto_ahash_init(queue->rcv_hash); + + return 0; +} + +static void nvme_tcp_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + page_frag_free(req->pdu); +} + +static int nvme_tcp_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu; + int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; + struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + req->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!req->pdu) + return -ENOMEM; + + pdu = req->pdu; + req->queue = queue; + nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &pdu->cmd; + + return 0; +} + +static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; + + hctx->driver_data = queue; + return 0; +} + +static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + + hctx->driver_data = queue; + return 0; +} + +static enum nvme_tcp_recv_state +nvme_tcp_recv_state(struct nvme_tcp_queue *queue) +{ + return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU : + (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST : + NVME_TCP_RECV_DATA; +} + +static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue) +{ + queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu_offset = 0; + queue->data_remaining = -1; + queue->ddgst_remaining = 0; +} + +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return; + + dev_warn(ctrl->device, "starting error recovery\n"); + queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); +} + +static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, + struct nvme_completion *cqe) +{ + struct nvme_tcp_request *req; + struct request *rq; + + rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "got bad cqe.command_id %#x on queue %d\n", + cqe->command_id, nvme_tcp_queue_id(queue)); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EINVAL; + } + + req = blk_mq_rq_to_pdu(rq); + if (req->status == cpu_to_le16(NVME_SC_SUCCESS)) + req->status = cqe->status; + + if (!nvme_try_complete_req(rq, req->status, cqe->result)) + nvme_complete_rq(rq); + queue->nr_cqe++; + + return 0; +} + +static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, + struct nvme_tcp_data_pdu *pdu) +{ + struct request *rq; + + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "got bad c2hdata.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); + return -ENOENT; + } + + if (!blk_rq_payload_bytes(rq)) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x unexpected data\n", + nvme_tcp_queue_id(queue), rq->tag); + return -EIO; + } + + queue->data_remaining = le32_to_cpu(pdu->data_length); + + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS && + unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x SUCCESS set but not last PDU\n", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EPROTO; + } + + return 0; +} + +static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, + struct nvme_tcp_rsp_pdu *pdu) +{ + struct nvme_completion *cqe = &pdu->cqe; + int ret = 0; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. 
We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue), + cqe->command_id))) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + else + ret = nvme_tcp_process_nvme_cqe(queue, cqe); + + return ret; +} + +static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_queue *queue = req->queue; + struct request *rq = blk_mq_rq_from_pdu(req); + u32 h2cdata_sent = req->pdu_len; + u8 hdgst = nvme_tcp_hdgst_len(queue); + u8 ddgst = nvme_tcp_ddgst_len(queue); + + req->state = NVME_TCP_SEND_H2C_PDU; + req->offset = 0; + req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata); + req->pdu_sent = 0; + req->h2cdata_left -= req->pdu_len; + req->h2cdata_offset += h2cdata_sent; + + memset(data, 0, sizeof(*data)); + data->hdr.type = nvme_tcp_h2c_data; + if (!req->h2cdata_left) + data->hdr.flags = NVME_TCP_F_DATA_LAST; + if (queue->hdr_digest) + data->hdr.flags |= NVME_TCP_F_HDGST; + if (queue->data_digest) + data->hdr.flags |= NVME_TCP_F_DDGST; + data->hdr.hlen = sizeof(*data); + data->hdr.pdo = data->hdr.hlen + hdgst; + data->hdr.plen = + cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); + data->ttag = req->ttag; + data->command_id = nvme_cid(rq); + data->data_offset = cpu_to_le32(req->h2cdata_offset); + data->data_length = cpu_to_le32(req->pdu_len); +} + +static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, + struct nvme_tcp_r2t_pdu *pdu) +{ + struct nvme_tcp_request *req; + struct request *rq; + u32 r2t_length = le32_to_cpu(pdu->r2t_length); + u32 r2t_offset = le32_to_cpu(pdu->r2t_offset); + + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "got bad r2t.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); + return -ENOENT; + } + req = blk_mq_rq_to_pdu(rq); + + if (unlikely(!r2t_length)) { + dev_err(queue->ctrl->ctrl.device, + "req %d r2t len is %u, probably a bug...\n", + rq->tag, r2t_length); + return -EPROTO; + } + + if (unlikely(req->data_sent + r2t_length > req->data_len)) { + dev_err(queue->ctrl->ctrl.device, + "req %d r2t len %u exceeded data len %u (%zu sent)\n", + rq->tag, r2t_length, req->data_len, req->data_sent); + return -EPROTO; + } + + if (unlikely(r2t_offset < req->data_sent)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t offset %u (expected %zu)\n", + rq->tag, r2t_offset, req->data_sent); + return -EPROTO; + } + + req->pdu_len = 0; + req->h2cdata_left = r2t_length; + req->h2cdata_offset = r2t_offset; + req->ttag = pdu->ttag; + + nvme_tcp_setup_h2c_data_pdu(req); + nvme_tcp_queue_request(req, false, true); + + return 0; +} + +static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) +{ + struct nvme_tcp_hdr *hdr; + char *pdu = queue->pdu; + size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); + int ret; + + ret = skb_copy_bits(skb, *offset, + &pdu[queue->pdu_offset], rcv_len); + if (unlikely(ret)) + return ret; + + queue->pdu_remaining -= rcv_len; + queue->pdu_offset += rcv_len; + *offset += rcv_len; + *len -= rcv_len; + if (queue->pdu_remaining) + return 0; + + hdr = queue->pdu; + if (queue->hdr_digest) { + ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); + if (unlikely(ret)) + return ret; + } + + + if (queue->data_digest) { + ret = nvme_tcp_check_ddgst(queue, queue->pdu); + if 
(unlikely(ret)) + return ret; + } + + switch (hdr->type) { + case nvme_tcp_c2h_data: + return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); + case nvme_tcp_rsp: + nvme_tcp_init_recv_ctx(queue); + return nvme_tcp_handle_comp(queue, (void *)queue->pdu); + case nvme_tcp_r2t: + nvme_tcp_init_recv_ctx(queue); + return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); + default: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; + } +} + +static inline void nvme_tcp_end_request(struct request *rq, u16 status) +{ + union nvme_result res = {}; + + if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) + nvme_complete_rq(rq); +} + +static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) +{ + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; + struct request *rq = + nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + while (true) { + int recv_len, ret; + + recv_len = min_t(size_t, *len, queue->data_remaining); + if (!recv_len) + break; + + if (!iov_iter_count(&req->iter)) { + req->curr_bio = req->curr_bio->bi_next; + + /* + * If we don`t have any bios it means that controller + * sent more data than we requested, hence error + */ + if (!req->curr_bio) { + dev_err(queue->ctrl->ctrl.device, + "queue %d no space in request %#x", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_init_recv_ctx(queue); + return -EIO; + } + nvme_tcp_init_iter(req, READ); + } + + /* we can read only from what is left in this bio */ + recv_len = min_t(size_t, recv_len, + iov_iter_count(&req->iter)); + + if (queue->data_digest) + ret = skb_copy_and_hash_datagram_iter(skb, *offset, + &req->iter, recv_len, queue->rcv_hash); + else + ret = skb_copy_datagram_iter(skb, *offset, + &req->iter, recv_len); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "queue %d failed to copy request %#x data", + nvme_tcp_queue_id(queue), rq->tag); + return ret; + } + + *len -= recv_len; + *offset += recv_len; + queue->data_remaining -= recv_len; + } + + if (!queue->data_remaining) { + if (queue->data_digest) { + nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); + queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; + } else { + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { + nvme_tcp_end_request(rq, + le16_to_cpu(req->status)); + queue->nr_cqe++; + } + nvme_tcp_init_recv_ctx(queue); + } + } + + return 0; +} + +static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int *offset, size_t *len) +{ + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; + char *ddgst = (char *)&queue->recv_ddgst; + size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); + off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; + int ret; + + ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len); + if (unlikely(ret)) + return ret; + + queue->ddgst_remaining -= recv_len; + *offset += recv_len; + *len -= recv_len; + if (queue->ddgst_remaining) + return 0; + + if (queue->recv_ddgst != queue->exp_ddgst) { + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR); + + dev_err(queue->ctrl->ctrl.device, + "data digest error: recv %#x expected %#x\n", + le32_to_cpu(queue->recv_ddgst), + le32_to_cpu(queue->exp_ddgst)); + } + + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { + struct request *rq = 
nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + nvme_tcp_end_request(rq, le16_to_cpu(req->status)); + queue->nr_cqe++; + } + + nvme_tcp_init_recv_ctx(queue); + return 0; +} + +static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct nvme_tcp_queue *queue = desc->arg.data; + size_t consumed = len; + int result; + + while (len) { + switch (nvme_tcp_recv_state(queue)) { + case NVME_TCP_RECV_PDU: + result = nvme_tcp_recv_pdu(queue, skb, &offset, &len); + break; + case NVME_TCP_RECV_DATA: + result = nvme_tcp_recv_data(queue, skb, &offset, &len); + break; + case NVME_TCP_RECV_DDGST: + result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len); + break; + default: + result = -EFAULT; + } + if (result) { + dev_err(queue->ctrl->ctrl.device, + "receive failed: %d\n", result); + queue->rd_enabled = false; + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return result; + } + } + + return consumed; +} + +static void nvme_tcp_data_ready(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && queue->rd_enabled) && + !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvme_tcp_write_space(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue && sk_stream_is_writeable(sk))) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvme_tcp_state_change(struct sock *sk) +{ + struct nvme_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + break; + default: + dev_info(queue->ctrl->ctrl.device, + "queue %d socket state %d\n", + nvme_tcp_queue_id(queue), sk->sk_state); + } + + queue->state_change(sk); +done: + read_unlock_bh(&sk->sk_callback_lock); +} + +static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) +{ + queue->request = NULL; +} + +static void nvme_tcp_fail_request(struct nvme_tcp_request *req) +{ + if (nvme_tcp_async_req(req)) { + union nvme_result res = {}; + + nvme_complete_async_event(&req->queue->ctrl->ctrl, + cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res); + } else { + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), + NVME_SC_HOST_PATH_ERROR); + } +} + +static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + int req_data_len = req->data_len; + u32 h2cdata_left = req->h2cdata_left; + + while (true) { + struct page *page = nvme_tcp_req_cur_page(req); + size_t offset = nvme_tcp_req_cur_offset(req); + size_t len = nvme_tcp_req_cur_length(req); + bool last = nvme_tcp_pdu_last_send(req, len); + int req_data_sent = req->data_sent; + int ret, flags = MSG_DONTWAIT; + + if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) + flags |= MSG_EOR; + else + flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + + if (sendpage_ok(page)) { + ret = kernel_sendpage(queue->sock, page, offset, len, + flags); + } else { + ret = sock_no_sendpage(queue->sock, page, offset, len, + 
flags); + } + if (ret <= 0) + return ret; + + if (queue->data_digest) + nvme_tcp_ddgst_update(queue->snd_hash, page, + offset, ret); + + /* + * update the request iterator except for the last payload send + * in the request where we don't want to modify it as we may + * compete with the RX path completing the request. + */ + if (req_data_sent + ret < req_data_len) + nvme_tcp_advance_req(req, ret); + + /* fully successful last send in current PDU */ + if (last && ret == len) { + if (queue->data_digest) { + nvme_tcp_ddgst_final(queue->snd_hash, + &req->ddgst); + req->state = NVME_TCP_SEND_DDGST; + req->offset = 0; + } else { + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); + } + return 1; + } + } + return -EAGAIN; +} + +static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + bool inline_data = nvme_tcp_has_inline_data(req); + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) + hdgst - req->offset; + int flags = MSG_DONTWAIT; + int ret; + + if (inline_data || nvme_tcp_queue_more(queue)) + flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + else + flags |= MSG_EOR; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, flags); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + if (inline_data) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + } else { + nvme_tcp_done_send_req(queue); + } + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_data_pdu *pdu = req->pdu; + u8 hdgst = nvme_tcp_hdgst_len(queue); + int len = sizeof(*pdu) - req->offset + hdgst; + int ret; + + if (queue->hdr_digest && !req->offset) + nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + + if (!req->h2cdata_left) + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); + else + ret = sock_no_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE); + if (unlikely(ret <= 0)) + return ret; + + len -= ret; + if (!len) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) + crypto_ahash_init(queue->snd_hash); + return 1; + } + req->offset += ret; + + return -EAGAIN; +} + +static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) +{ + struct nvme_tcp_queue *queue = req->queue; + size_t offset = req->offset; + u32 h2cdata_left = req->h2cdata_left; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = (u8 *)&req->ddgst + req->offset, + .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset + }; + + if (nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + if (offset + ret == NVME_TCP_DIGEST_LENGTH) { + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); + return 1; + } + + req->offset += ret; + return -EAGAIN; +} + +static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + int ret = 1; + + if 
(!queue->request) { + queue->request = nvme_tcp_fetch_request(queue); + if (!queue->request) + return 0; + } + req = queue->request; + + if (req->state == NVME_TCP_SEND_CMD_PDU) { + ret = nvme_tcp_try_send_cmd_pdu(req); + if (ret <= 0) + goto done; + if (!nvme_tcp_has_inline_data(req)) + return ret; + } + + if (req->state == NVME_TCP_SEND_H2C_PDU) { + ret = nvme_tcp_try_send_data_pdu(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DATA) { + ret = nvme_tcp_try_send_data(req); + if (ret <= 0) + goto done; + } + + if (req->state == NVME_TCP_SEND_DDGST) + ret = nvme_tcp_try_send_ddgst(req); +done: + if (ret == -EAGAIN) { + ret = 0; + } else if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", ret); + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + } + return ret; +} + +static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + struct sock *sk = sock->sk; + read_descriptor_t rd_desc; + int consumed; + + rd_desc.arg.data = queue; + rd_desc.count = 1; + lock_sock(sk); + queue->nr_cqe = 0; + consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + release_sock(sk); + return consumed; +} + +static void nvme_tcp_io_work(struct work_struct *w) +{ + struct nvme_tcp_queue *queue = + container_of(w, struct nvme_tcp_queue, io_work); + unsigned long deadline = jiffies + msecs_to_jiffies(1); + + do { + bool pending = false; + int result; + + if (mutex_trylock(&queue->send_mutex)) { + result = nvme_tcp_try_send(queue); + mutex_unlock(&queue->send_mutex); + if (result > 0) + pending = true; + else if (unlikely(result < 0)) + break; + } + + result = nvme_tcp_try_recv(queue); + if (result > 0) + pending = true; + else if (unlikely(result < 0)) + return; + + if (!pending || !queue->rd_enabled) + return; + + } while (!time_after(jiffies, deadline)); /* quota is exhausted */ + + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + +static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_request *async = &ctrl->async_req; + + page_frag_free(async->pdu); +} + +static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) +{ + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_request *async = &ctrl->async_req; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + async->pdu = page_frag_alloc(&queue->pf_cache, + sizeof(struct nvme_tcp_cmd_pdu) + hdgst, + GFP_KERNEL | __GFP_ZERO); + if (!async->pdu) + return -ENOMEM; + + async->queue = &ctrl->queues[0]; + return 0; +} + +static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct 
page *page; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + return; + + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); + + if (queue->pf_cache.va) { + page = virt_to_head_page(queue->pf_cache.va); + __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); + queue->pf_cache.va = NULL; + } + sock_release(queue->sock); + kfree(queue->pdu); + mutex_destroy(&queue->send_mutex); + mutex_destroy(&queue->queue_lock); +} + +static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq; + struct nvme_tcp_icresp_pdu *icresp; + struct msghdr msg = {}; + struct kvec iov; + bool ctrl_hdgst, ctrl_ddgst; + u32 maxh2cdata; + int ret; + + icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); + if (!icreq) + return -ENOMEM; + + icresp = kzalloc(sizeof(*icresp), GFP_KERNEL); + if (!icresp) { + ret = -ENOMEM; + goto free_icreq; + } + + icreq->hdr.type = nvme_tcp_icreq; + icreq->hdr.hlen = sizeof(*icreq); + icreq->hdr.pdo = 0; + icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen); + icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icreq->maxr2t = 0; /* single inflight r2t supported */ + icreq->hpda = 0; /* no alignment constraint */ + if (queue->hdr_digest) + icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icreq; + iov.iov_len = sizeof(*icreq); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_icresp; + + memset(&msg, 0, sizeof(msg)); + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (ret < 0) + goto free_icresp; + + ret = -EINVAL; + if (icresp->hdr.type != nvme_tcp_icresp) { + pr_err("queue %d: bad type returned %d\n", + nvme_tcp_queue_id(queue), icresp->hdr.type); + goto free_icresp; + } + + if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) { + pr_err("queue %d: bad pdu length returned %d\n", + nvme_tcp_queue_id(queue), icresp->hdr.plen); + goto free_icresp; + } + + if (icresp->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv returned %d\n", + nvme_tcp_queue_id(queue), icresp->pfv); + goto free_icresp; + } + + ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if ((queue->data_digest && !ctrl_ddgst) || + (!queue->data_digest && ctrl_ddgst)) { + pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->data_digest ? "enabled" : "disabled", + ctrl_ddgst ? "enabled" : "disabled"); + goto free_icresp; + } + + ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE); + if ((queue->hdr_digest && !ctrl_hdgst) || + (!queue->hdr_digest && ctrl_hdgst)) { + pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n", + nvme_tcp_queue_id(queue), + queue->hdr_digest ? "enabled" : "disabled", + ctrl_hdgst ? 
"enabled" : "disabled"); + goto free_icresp; + } + + if (icresp->cpda != 0) { + pr_err("queue %d: unsupported cpda returned %d\n", + nvme_tcp_queue_id(queue), icresp->cpda); + goto free_icresp; + } + + maxh2cdata = le32_to_cpu(icresp->maxdata); + if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) { + pr_err("queue %d: invalid maxh2cdata returned %u\n", + nvme_tcp_queue_id(queue), maxh2cdata); + goto free_icresp; + } + queue->maxh2cdata = maxh2cdata; + + ret = 0; +free_icresp: + kfree(icresp); +free_icreq: + kfree(icreq); + return ret; +} + +static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) +{ + return nvme_tcp_queue_id(queue) == 0; +} + +static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT]; +} + +static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; +} + +static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + !nvme_tcp_read_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ] + + ctrl->io_queues[HCTX_TYPE_POLL]; +} + +static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + int n = 0; + + if (nvme_tcp_default_queue(queue)) + n = qid - 1; + else if (nvme_tcp_read_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; + else if (nvme_tcp_poll_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - + ctrl->io_queues[HCTX_TYPE_READ] - 1; + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); +} + +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, + int qid, size_t queue_size) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + int ret, rcv_pdu_size; + + mutex_init(&queue->queue_lock); + queue->ctrl = ctrl; + init_llist_head(&queue->req_list); + INIT_LIST_HEAD(&queue->send_list); + mutex_init(&queue->send_mutex); + INIT_WORK(&queue->io_work, nvme_tcp_io_work); + queue->queue_size = queue_size; + + if (qid > 0) + queue->cmnd_capsule_len = nctrl->ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command) + + NVME_TCP_ADMIN_CCSZ; + + ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &queue->sock); + if (ret) { + dev_err(nctrl->device, + "failed to create socket: %d\n", ret); + goto err_destroy_mutex; + } + + /* Single syn retry */ + tcp_sock_set_syncnt(queue->sock->sk, 1); + + /* Set TCP no delay */ + tcp_sock_set_nodelay(queue->sock->sk); + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. 
+ */ + sock_no_linger(queue->sock->sk); + + if (so_priority > 0) + sock_set_priority(queue->sock->sk, so_priority); + + /* Set socket type of service */ + if (nctrl->opts->tos >= 0) + ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); + + /* Set 10 seconds timeout for icresp recvmsg */ + queue->sock->sk->sk_rcvtimeo = 10 * HZ; + + queue->sock->sk->sk_allocation = GFP_ATOMIC; + nvme_tcp_set_queue_io_cpu(queue); + queue->request = NULL; + queue->data_remaining = 0; + queue->ddgst_remaining = 0; + queue->pdu_remaining = 0; + queue->pdu_offset = 0; + sk_set_memalloc(queue->sock->sk); + + if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, + sizeof(ctrl->src_addr)); + if (ret) { + dev_err(nctrl->device, + "failed to bind queue %d socket %d\n", + qid, ret); + goto err_sock; + } + } + + if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) { + char *iface = nctrl->opts->host_iface; + sockptr_t optval = KERNEL_SOCKPTR(iface); + + ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE, + optval, strlen(iface)); + if (ret) { + dev_err(nctrl->device, + "failed to bind to interface %s queue %d err %d\n", + iface, qid, ret); + goto err_sock; + } + } + + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; + if (queue->hdr_digest || queue->data_digest) { + ret = nvme_tcp_alloc_crypto(queue); + if (ret) { + dev_err(nctrl->device, + "failed to allocate queue %d crypto\n", qid); + goto err_sock; + } + } + + rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); + if (!queue->pdu) { + ret = -ENOMEM; + goto err_crypto; + } + + dev_dbg(nctrl->device, "connecting queue %d\n", + nvme_tcp_queue_id(queue)); + + ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, + sizeof(ctrl->addr), 0); + if (ret) { + dev_err(nctrl->device, + "failed to connect socket: %d\n", ret); + goto err_rcv_pdu; + } + + ret = nvme_tcp_init_connection(queue); + if (ret) + goto err_init_connect; + + queue->rd_enabled = true; + set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); + nvme_tcp_init_recv_ctx(queue); + + write_lock_bh(&queue->sock->sk->sk_callback_lock); + queue->sock->sk->sk_user_data = queue; + queue->state_change = queue->sock->sk->sk_state_change; + queue->data_ready = queue->sock->sk->sk_data_ready; + queue->write_space = queue->sock->sk->sk_write_space; + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; + queue->sock->sk->sk_state_change = nvme_tcp_state_change; + queue->sock->sk->sk_write_space = nvme_tcp_write_space; +#ifdef CONFIG_NET_RX_BUSY_POLL + queue->sock->sk->sk_ll_usec = 1; +#endif + write_unlock_bh(&queue->sock->sk->sk_callback_lock); + + return 0; + +err_init_connect: + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +err_rcv_pdu: + kfree(queue->pdu); +err_crypto: + if (queue->hdr_digest || queue->data_digest) + nvme_tcp_free_crypto(queue); +err_sock: + sock_release(queue->sock); + queue->sock = NULL; +err_destroy_mutex: + mutex_destroy(&queue->send_mutex); + mutex_destroy(&queue->queue_lock); + return ret; +} + +static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = NULL; + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void 
__nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) +{ + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + nvme_tcp_restore_sock_calls(queue); + cancel_work_sync(&queue->io_work); +} + +static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + + mutex_lock(&queue->queue_lock); + if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) + __nvme_tcp_stop_queue(queue); + mutex_unlock(&queue->queue_lock); +} + +static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + int ret; + + if (idx) + ret = nvmf_connect_io_queue(nctrl, idx); + else + ret = nvmf_connect_admin_queue(nctrl); + + if (!ret) { + set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); + } else { + if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags)) + __nvme_tcp_stop_queue(&ctrl->queues[idx]); + dev_err(nctrl->device, + "failed to connect queue: %d ret=%d\n", idx, ret); + } + return ret; +} + +static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, + bool admin) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct blk_mq_tag_set *set; + int ret; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_admin_mq_ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = nctrl->numa_node; + set->flags = BLK_MQ_F_BLOCKING; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = NVME_ADMIN_TIMEOUT; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_mq_ops; + set->queue_depth = nctrl->sqsize + 1; + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = nctrl->numa_node; + set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + set->cmd_size = sizeof(struct nvme_tcp_request); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = nctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2; + } + + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); + + return set; +} + +static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) +{ + if (to_tcp_ctrl(ctrl)->async_req.pdu) { + cancel_work_sync(&ctrl->async_event_work); + nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); + to_tcp_ctrl(ctrl)->async_req.pdu = NULL; + } + + nvme_tcp_free_queue(ctrl, 0); +} + +static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_free_queue(ctrl, i); +} + +static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_stop_queue(ctrl, i); +} + +static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_start_queue(ctrl, i); + if (ret) + goto out_stop_queues; + } + + return 0; + +out_stop_queues: + for (i--; i >= 1; i--) + nvme_tcp_stop_queue(ctrl, i); + return ret; +} + +static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + if (ret) + return ret; + + ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); + if (ret) + goto out_free_queue; + + return 0; + +out_free_queue: + nvme_tcp_free_queue(ctrl, 0); + return ret; +} + +static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_tcp_alloc_queue(ctrl, i, + ctrl->sqsize + 1); + if (ret) + goto out_free_queues; + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_tcp_free_queue(ctrl, i); + + return ret; +} + +static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) +{ + unsigned int nr_io_queues; + + nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); + + return nr_io_queues; +} + +static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, + unsigned int nr_io_queues) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvmf_ctrl_options *opts = nctrl->opts; + + if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_write_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. 
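+		 * For example (illustrative values only): with nr_io_queues=8 and
+		 * no write queues requested, all 8 queues land in
+		 * HCTX_TYPE_DEFAULT and reads share them.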
+ */ + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_io_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(opts->nr_poll_queues, nr_io_queues); + } +} + +static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) +{ + unsigned int nr_io_queues; + int ret; + + nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + ret = nvme_set_queue_count(ctrl, &nr_io_queues); + if (ret) + return ret; + + if (nr_io_queues == 0) { + dev_err(ctrl->device, + "unable to set any I/O queues\n"); + return -ENOMEM; + } + + ctrl->queue_count = nr_io_queues + 1; + dev_info(ctrl->device, + "creating %d I/O queues.\n", nr_io_queues); + + nvme_tcp_set_io_queues(ctrl, nr_io_queues); + + return __nvme_tcp_alloc_io_queues(ctrl); +} + +static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_io_queues(ctrl); + if (remove) { + blk_cleanup_queue(ctrl->connect_q); + blk_mq_free_tag_set(ctrl->tagset); + } + nvme_tcp_free_io_queues(ctrl); +} + +static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) +{ + int ret; + + ret = nvme_tcp_alloc_io_queues(ctrl); + if (ret) + return ret; + + if (new) { + ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false); + if (IS_ERR(ctrl->tagset)) { + ret = PTR_ERR(ctrl->tagset); + goto out_free_io_queues; + } + + ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ctrl->connect_q)) { + ret = PTR_ERR(ctrl->connect_q); + goto out_free_tag_set; + } + } + + ret = nvme_tcp_start_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + if (!new) { + nvme_start_queues(ctrl); + if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. 
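+			 * (A zero return from nvme_wait_freeze_timeout() means some
+			 * I/O queues still had requests in flight after NVME_IO_TIMEOUT.)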
+ */ + ret = -ENODEV; + goto out_wait_freeze_timed_out; + } + blk_mq_update_nr_hw_queues(ctrl->tagset, + ctrl->queue_count - 1); + nvme_unfreeze(ctrl); + } + + return 0; + +out_wait_freeze_timed_out: + nvme_stop_queues(ctrl); + nvme_sync_io_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); +out_cleanup_connect_q: + nvme_cancel_tagset(ctrl); + if (new) + blk_cleanup_queue(ctrl->connect_q); +out_free_tag_set: + if (new) + blk_mq_free_tag_set(ctrl->tagset); +out_free_io_queues: + nvme_tcp_free_io_queues(ctrl); + return ret; +} + +static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) +{ + nvme_tcp_stop_queue(ctrl, 0); + if (remove) { + blk_cleanup_queue(ctrl->admin_q); + blk_cleanup_queue(ctrl->fabrics_q); + blk_mq_free_tag_set(ctrl->admin_tagset); + } + nvme_tcp_free_admin_queue(ctrl); +} + +static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) +{ + int error; + + error = nvme_tcp_alloc_admin_queue(ctrl); + if (error) + return error; + + if (new) { + ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true); + if (IS_ERR(ctrl->admin_tagset)) { + error = PTR_ERR(ctrl->admin_tagset); + goto out_free_queue; + } + + ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->fabrics_q)) { + error = PTR_ERR(ctrl->fabrics_q); + goto out_free_tagset; + } + + ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->admin_q)) { + error = PTR_ERR(ctrl->admin_q); + goto out_cleanup_fabrics_q; + } + } + + error = nvme_tcp_start_queue(ctrl, 0); + if (error) + goto out_cleanup_queue; + + error = nvme_enable_ctrl(ctrl); + if (error) + goto out_stop_queue; + + nvme_start_admin_queue(ctrl); + + error = nvme_init_ctrl_finish(ctrl); + if (error) + goto out_quiesce_queue; + + return 0; + +out_quiesce_queue: + nvme_stop_admin_queue(ctrl); + blk_sync_queue(ctrl->admin_q); +out_stop_queue: + nvme_tcp_stop_queue(ctrl, 0); + nvme_cancel_admin_tagset(ctrl); +out_cleanup_queue: + if (new) + blk_cleanup_queue(ctrl->admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->fabrics_q); +out_free_tagset: + if (new) + blk_mq_free_tag_set(ctrl->admin_tagset); +out_free_queue: + nvme_tcp_free_admin_queue(ctrl); + return error; +} + +static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, + bool remove) +{ + nvme_stop_admin_queue(ctrl); + blk_sync_queue(ctrl->admin_q); + nvme_tcp_stop_queue(ctrl, 0); + nvme_cancel_admin_tagset(ctrl); + if (remove) + nvme_start_admin_queue(ctrl); + nvme_tcp_destroy_admin_queue(ctrl, remove); +} + +static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, + bool remove) +{ + if (ctrl->queue_count <= 1) + return; + nvme_stop_admin_queue(ctrl); + nvme_start_freeze(ctrl); + nvme_stop_queues(ctrl); + nvme_sync_io_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); + nvme_cancel_tagset(ctrl); + if (remove) + nvme_start_queues(ctrl); + nvme_tcp_destroy_io_queues(ctrl, remove); +} + +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || + ctrl->state == NVME_CTRL_LIVE); + return; + } + + if (nvmf_should_reconnect(ctrl)) { + dev_info(ctrl->device, "Reconnecting in %d seconds...\n", + ctrl->opts->reconnect_delay); + queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work, + ctrl->opts->reconnect_delay * HZ); + } else { + dev_info(ctrl->device, "Removing controller...\n"); + nvme_delete_ctrl(ctrl); + } +} + +static int nvme_tcp_setup_ctrl(struct 
nvme_ctrl *ctrl, bool new) +{ + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret; + + ret = nvme_tcp_configure_admin_queue(ctrl, new); + if (ret) + return ret; + + if (ctrl->icdoff) { + ret = -EOPNOTSUPP; + dev_err(ctrl->device, "icdoff is not supported!\n"); + goto destroy_admin; + } + + if (!nvme_ctrl_sgl_supported(ctrl)) { + ret = -EOPNOTSUPP; + dev_err(ctrl->device, "Mandatory sgls are not supported!\n"); + goto destroy_admin; + } + + if (opts->queue_size > ctrl->sqsize + 1) + dev_warn(ctrl->device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + opts->queue_size, ctrl->sqsize + 1); + + if (ctrl->sqsize + 1 > ctrl->maxcmd) { + dev_warn(ctrl->device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->sqsize + 1, ctrl->maxcmd); + ctrl->sqsize = ctrl->maxcmd - 1; + } + + if (ctrl->queue_count > 1) { + ret = nvme_tcp_configure_io_queues(ctrl, new); + if (ret) + goto destroy_admin; + } + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { + /* + * state change failure is ok if we started ctrl delete, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); + WARN_ON_ONCE(new); + ret = -EINVAL; + goto destroy_io; + } + + nvme_start_ctrl(ctrl); + return 0; + +destroy_io: + if (ctrl->queue_count > 1) { + nvme_stop_queues(ctrl); + nvme_sync_io_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); + nvme_cancel_tagset(ctrl); + nvme_tcp_destroy_io_queues(ctrl, new); + } +destroy_admin: + nvme_stop_admin_queue(ctrl); + blk_sync_queue(ctrl->admin_q); + nvme_tcp_stop_queue(ctrl, 0); + nvme_cancel_admin_tagset(ctrl); + nvme_tcp_destroy_admin_queue(ctrl, new); + return ret; +} + +static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work), + struct nvme_tcp_ctrl, connect_work); + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + + ++ctrl->nr_reconnects; + + if (nvme_tcp_setup_ctrl(ctrl, false)) + goto requeue; + + dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n", + ctrl->nr_reconnects); + + ctrl->nr_reconnects = 0; + + return; + +requeue: + dev_info(ctrl->device, "Failed reconnect attempt %d\n", + ctrl->nr_reconnects); + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_error_recovery_work(struct work_struct *work) +{ + struct nvme_tcp_ctrl *tcp_ctrl = container_of(work, + struct nvme_tcp_ctrl, err_work); + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + + nvme_stop_keep_alive(ctrl); + flush_work(&ctrl->async_event_work); + nvme_tcp_teardown_io_queues(ctrl, false); + /* unquiesce to fail fast pending requests */ + nvme_start_queues(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + nvme_start_admin_queue(ctrl); + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); + return; + } + + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) +{ + nvme_tcp_teardown_io_queues(ctrl, shutdown); + nvme_stop_admin_queue(ctrl); + if (shutdown) + nvme_shutdown_ctrl(ctrl); + else + nvme_disable_ctrl(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, shutdown); +} + +static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl) +{ + nvme_tcp_teardown_ctrl(ctrl, true); +} + +static void nvme_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl 
*ctrl = + container_of(work, struct nvme_ctrl, reset_work); + + nvme_stop_ctrl(ctrl); + nvme_tcp_teardown_ctrl(ctrl, false); + + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && + ctrl->state != NVME_CTRL_DELETING_NOIO); + return; + } + + if (nvme_tcp_setup_ctrl(ctrl, false)) + goto out_fail; + + return; + +out_fail: + ++ctrl->nr_reconnects; + nvme_tcp_reconnect_or_remove(ctrl); +} + +static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) +{ + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); +} + +static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_tcp_ctrl_mutex); + + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl->queues); + kfree(ctrl); +} + +static void nvme_tcp_set_sg_null(struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = 0; + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue, + struct nvme_command *c, u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; +} + +static void nvme_tcp_set_sg_host_data(struct nvme_command *c, + u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | + NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg); + struct nvme_tcp_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu; + struct nvme_command *cmd = &pdu->cmd; + u8 hdgst = nvme_tcp_hdgst_len(queue); + + memset(pdu, 0, sizeof(*pdu)); + pdu->hdr.type = nvme_tcp_cmd; + if (queue->hdr_digest) + pdu->hdr.flags |= NVME_TCP_F_HDGST; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_tcp_set_sg_null(cmd); + + ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU; + ctrl->async_req.offset = 0; + ctrl->async_req.curr_bio = NULL; + ctrl->async_req.data_len = 0; + + nvme_tcp_queue_request(&ctrl->async_req, true, true); +} + +static void nvme_tcp_complete_timed_out(struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; + + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } +} + +static enum blk_eh_timer_return +nvme_tcp_timeout(struct request *rq, bool reserved) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + + dev_warn(ctrl->device, + "queue %d: timeout request %#x type %d\n", + nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); + + if (ctrl->state != 
NVME_CTRL_LIVE) { + /* + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. + */ + nvme_tcp_complete_timed_out(rq); + return BLK_EH_DONE; + } + + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ + nvme_tcp_error_recovery(ctrl); + return BLK_EH_RESET_TIMER; +} + +static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, + struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_command *c = &pdu->cmd; + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_nr_phys_segments(rq)) + nvme_tcp_set_sg_null(c); + else if (rq_data_dir(rq) == WRITE && + req->data_len <= nvme_tcp_inline_data_size(queue)) + nvme_tcp_set_sg_inline(queue, c, req->data_len); + else + nvme_tcp_set_sg_host_data(c, req->data_len); + + return 0; +} + +static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, + struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_queue *queue = req->queue; + u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; + blk_status_t ret; + + ret = nvme_setup_cmd(ns, rq); + if (ret) + return ret; + + req->state = NVME_TCP_SEND_CMD_PDU; + req->status = cpu_to_le16(NVME_SC_SUCCESS); + req->offset = 0; + req->data_sent = 0; + req->pdu_len = 0; + req->pdu_sent = 0; + req->h2cdata_left = 0; + req->data_len = blk_rq_nr_phys_segments(rq) ? + blk_rq_payload_bytes(rq) : 0; + req->curr_bio = rq->bio; + if (req->curr_bio && req->data_len) + nvme_tcp_init_iter(req, rq_data_dir(rq)); + + if (rq_data_dir(rq) == WRITE && + req->data_len <= nvme_tcp_inline_data_size(queue)) + req->pdu_len = req->data_len; + + pdu->hdr.type = nvme_tcp_cmd; + pdu->hdr.flags = 0; + if (queue->hdr_digest) + pdu->hdr.flags |= NVME_TCP_F_HDGST; + if (queue->data_digest && req->pdu_len) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; + ddgst = nvme_tcp_ddgst_len(queue); + } + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = req->pdu_len ? 
pdu->hdr.hlen + hdgst : 0; + pdu->hdr.plen = + cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst); + + ret = nvme_tcp_map_data(queue, rq); + if (unlikely(ret)) { + nvme_cleanup_cmd(rq); + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", ret); + return ret; + } + + return 0; +} + +static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_tcp_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags); + blk_status_t ret; + + if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); + + ret = nvme_tcp_setup_cmd_pdu(ns, rq); + if (unlikely(ret)) + return ret; + + blk_mq_start_request(rq); + + nvme_tcp_queue_request(req, true, bd->last); + + return BLK_STS_OK; +} + +static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_READ]; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* shared read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->io_queues[HCTX_TYPE_POLL]; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + + dev_info(ctrl->ctrl.device, + "mapped %d/%d/%d default/read/poll queues.\n", + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); + + return 0; +} + +static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + struct sock *sk = queue->sock->sk; + + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return 0; + + set_bit(NVME_TCP_Q_POLLING, &queue->flags); + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) + sk_busy_loop(sk, true); + nvme_tcp_try_recv(queue); + clear_bit(NVME_TCP_Q_POLLING, &queue->flags); + return queue->nr_cqe; +} + +static const struct blk_mq_ops nvme_tcp_mq_ops = { + .queue_rq = nvme_tcp_queue_rq, + .commit_rqs = nvme_tcp_commit_rqs, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_init_request, + .exit_request = nvme_tcp_exit_request, + .init_hctx = nvme_tcp_init_hctx, + .timeout = nvme_tcp_timeout, + .map_queues = 
nvme_tcp_map_queues, + .poll = nvme_tcp_poll, +}; + +static const struct blk_mq_ops nvme_tcp_admin_mq_ops = { + .queue_rq = nvme_tcp_queue_rq, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_init_request, + .exit_request = nvme_tcp_exit_request, + .init_hctx = nvme_tcp_init_admin_hctx, + .timeout = nvme_tcp_timeout, +}; + +static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { + .name = "tcp", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_tcp_free_ctrl, + .submit_async_event = nvme_tcp_submit_async_event, + .delete_ctrl = nvme_tcp_delete_ctrl, + .get_address = nvmf_get_address, + .stop_ctrl = nvme_tcp_stop_ctrl, +}; + +static bool +nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) { + found = nvmf_ip_options_match(&ctrl->ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_tcp_ctrl_mutex); + + return found; +} + +static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ctrl->list); + ctrl->ctrl.opts = opts; + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + + INIT_DELAYED_WORK(&ctrl->connect_work, + nvme_tcp_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); + + if (!(opts->mask & NVMF_OPT_TRSVCID)) { + opts->trsvcid = + kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL); + if (!opts->trsvcid) { + ret = -ENOMEM; + goto out_free_ctrl; + } + opts->mask |= NVMF_OPT_TRSVCID; + } + + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->traddr, opts->trsvcid, &ctrl->addr); + if (ret) { + pr_err("malformed address passed: %s:%s\n", + opts->traddr, opts->trsvcid); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->host_traddr, NULL, &ctrl->src_addr); + if (ret) { + pr_err("malformed src address passed: %s\n", + opts->host_traddr); + goto out_free_ctrl; + } + } + + if (opts->mask & NVMF_OPT_HOST_IFACE) { + if (!__dev_get_by_name(&init_net, opts->host_iface)) { + pr_err("invalid interface passed: %s\n", + opts->host_iface); + ret = -ENODEV; + goto out_free_ctrl; + } + } + + if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) { + ret = -ENOMEM; + goto out_free_ctrl; + } + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0); + if (ret) + goto out_kfree_queues; + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + WARN_ON_ONCE(1); + ret = -EINTR; + goto out_uninit_ctrl; + } + + ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true); + if (ret) + goto out_uninit_ctrl; + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", + nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr); + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); + mutex_unlock(&nvme_tcp_ctrl_mutex); + + return 
&ctrl->ctrl; + +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_tcp_transport = { + .name = "tcp", + .module = THIS_MODULE, + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | + NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE, + .create_ctrl = nvme_tcp_create_ctrl, +}; + +static int __init nvme_tcp_init_module(void) +{ + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!nvme_tcp_wq) + return -ENOMEM; + + nvmf_register_transport(&nvme_tcp_transport); + return 0; +} + +static void __exit nvme_tcp_cleanup_module(void) +{ + struct nvme_tcp_ctrl *ctrl; + + nvmf_unregister_transport(&nvme_tcp_transport); + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_tcp_ctrl_mutex); + flush_workqueue(nvme_delete_wq); + + destroy_workqueue(nvme_tcp_wq); +} + +module_init(nvme_tcp_init_module); +module_exit(nvme_tcp_cleanup_module); + +MODULE_LICENSE("GPL v2"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.c new file mode 100644 index 0000000..2a89c5a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + */ + +#include +#include "trace.h" + +static const char *nvme_trace_delete_sq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 sqid = get_unaligned_le16(cdw10); + + trace_seq_printf(p, "sqid=%u", sqid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 sqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 sq_flags = get_unaligned_le16(cdw10 + 4); + u16 cqid = get_unaligned_le16(cdw10 + 6); + + + trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u", + sqid, qsize, sq_flags, cqid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_delete_cq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 cqid = get_unaligned_le16(cdw10); + + trace_seq_printf(p, "cqid=%u", cqid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 cqid = get_unaligned_le16(cdw10); + u16 qsize = get_unaligned_le16(cdw10 + 2); + u16 cq_flags = get_unaligned_le16(cdw10 + 4); + u16 irq_vector = get_unaligned_le16(cdw10 + 6); + + trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u", + cqid, qsize, cq_flags, irq_vector); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 cns = cdw10[0]; + u16 ctrlid = get_unaligned_le16(cdw10 + 2); + + trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char 
*nvme_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_admin_get_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sel = cdw10[1] & 0x7; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_get_lba_status(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 mndw = get_unaligned_le32(cdw10 + 8); + u16 rl = get_unaligned_le16(cdw10 + 12); + u8 atype = cdw10[15]; + + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", + slba, mndw, rl, atype); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_admin_format_nvm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 lbaf = cdw10[0] & 0xF; + u8 mset = (cdw10[0] >> 4) & 0x1; + u8 pi = (cdw10[0] >> 5) & 0x7; + u8 pil = cdw10[1] & 0x1; + u8 ses = (cdw10[1] >> 1) & 0x7; + + trace_seq_printf(p, "lbaf=%u, mset=%u, pi=%u, pil=%u, ses=%u", + lbaf, mset, pi, pil, ses); + + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u16 length = get_unaligned_le16(cdw10 + 8); + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + slba, length, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "nr=%u, attributes=%u", + get_unaligned_le32(cdw10), + get_unaligned_le32(cdw10 + 4)); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u8 zsa = cdw10[12]; + u8 all = cdw10[13]; + + trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 numd = get_unaligned_le32(cdw10 + 8); + u8 zra = cdw10[12]; + u8 zrasf = cdw10[13]; + u8 pr = cdw10[14]; + + trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u", + slba, numd, zra, zrasf, pr); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "cdw10=%*ph", 24, cdw10); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_admin_delete_sq: + return nvme_trace_delete_sq(p, cdw10); + case nvme_admin_create_sq: + return nvme_trace_create_sq(p, cdw10); + case nvme_admin_delete_cq: + return 
nvme_trace_delete_cq(p, cdw10); + case nvme_admin_create_cq: + return nvme_trace_create_cq(p, cdw10); + case nvme_admin_identify: + return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvme_trace_admin_set_features(p, cdw10); + case nvme_admin_get_features: + return nvme_trace_admin_get_features(p, cdw10); + case nvme_admin_get_lba_status: + return nvme_trace_get_lba_status(p, cdw10); + case nvme_admin_format_nvm: + return nvme_trace_admin_format_nvm(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} + +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + case nvme_cmd_zone_append: + return nvme_trace_read_write(p, cdw10); + case nvme_cmd_dsm: + return nvme_trace_dsm(p, cdw10); + case nvme_cmd_zone_mgmt_send: + return nvme_trace_zone_mgmt_send(p, cdw10); + case nvme_cmd_zone_mgmt_recv: + return nvme_trace_zone_mgmt_recv(p, cdw10); + default: + return nvme_trace_common(p, cdw10); + } +} + +static const char *nvme_trace_fabrics_property_set(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + u64 value = get_unaligned_le64(spc + 8); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx", + attrib, ofst, value); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvme_trace_fabrics_connect(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 recfmt = get_unaligned_le16(spc); + u16 qid = get_unaligned_le16(spc + 2); + u16 sqsize = get_unaligned_le16(spc + 4); + u8 cattr = spc[6]; + u32 kato = get_unaligned_le32(spc + 8); + + trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u", + recfmt, qid, sqsize, cattr, kato); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "specific=%*ph", 24, spc); + trace_seq_putc(p, 0); + return ret; +} + +const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, + u8 fctype, u8 *spc) +{ + switch (fctype) { + case nvme_fabrics_type_property_set: + return nvme_trace_fabrics_property_set(p, spc); + case nvme_fabrics_type_connect: + return nvme_trace_fabrics_connect(p, spc); + case nvme_fabrics_type_property_get: + return nvme_trace_fabrics_property_get(p, spc); + default: + return nvme_trace_fabrics_common(p, spc); + } +} + +const char *nvme_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (*name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} + +EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.h new file mode 100644 index 0000000..fe8c900 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/trace.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NVM Express device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + */ + +#undef TRACE_SYSTEM +#define 
TRACE_SYSTEM nvme + +#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NVME_H + +#include +#include +#include + +#include "nvme.h" + +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype, + u8 *spc); + +#define parse_nvme_cmd(qid, opcode, fctype, cdw10) \ + ((opcode) == nvme_fabrics_command ? \ + nvme_trace_parse_fabrics_cmd(p, fctype, cdw10) : \ + ((qid) ? \ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10))) + +const char *nvme_trace_disk_name(struct trace_seq *p, char *name); +#define __print_disk_name(name) \ + nvme_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline void __assign_disk_name(char *name, struct gendisk *disk) +{ + if (disk) + memcpy(name, disk->disk_name, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); +} +#endif + +TRACE_EVENT(nvme_setup_cmd, + TP_PROTO(struct request *req, struct nvme_command *cmd), + TP_ARGS(req, cmd), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(int, ctrl_id) + __field(int, qid) + __field(u8, opcode) + __field(u8, flags) + __field(u8, fctype) + __field(u16, cid) + __field(u32, nsid) + __field(bool, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __entry->qid = nvme_req_qid(req); + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = !!blk_integrity_rq(req); + __entry->fctype = cmd->fabrics.fctype; + __assign_disk_name(__entry->disk, req->q->disk); + memcpy(__entry->cdw10, &cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%x, cmd=(%s %s)", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->nsid, + __entry->flags, __entry->metadata, + show_opcode_name(__entry->qid, __entry->opcode, + __entry->fctype), + parse_nvme_cmd(__entry->qid, __entry->opcode, + __entry->fctype, __entry->cdw10)) +); + +TRACE_EVENT(nvme_complete_rq, + TP_PROTO(struct request *req), + TP_ARGS(req), + TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) + __field(int, ctrl_id) + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u8, retries) + __field(u8, flags) + __field(u16, status) + ), + TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __entry->qid = nvme_req_qid(req); + __entry->cid = nvme_req(req)->cmd->common.command_id; + __entry->result = le64_to_cpu(nvme_req(req)->result.u64); + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; + __assign_disk_name(__entry->disk, req->q->disk); + ), + TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->result, + __entry->retries, __entry->flags, __entry->status) + +); + +#define aer_name(aer) { aer, #aer } + +TRACE_EVENT(nvme_async_event, + TP_PROTO(struct nvme_ctrl *ctrl, u32 result), + TP_ARGS(ctrl, result), + TP_STRUCT__entry( + __field(int, ctrl_id) + __field(u32, result) + ), + TP_fast_assign( + __entry->ctrl_id = ctrl->instance; + __entry->result = result; + ), + 
TP_printk("nvme%d: NVME_AEN=%#08x [%s]", + __entry->ctrl_id, __entry->result, + __print_symbolic(__entry->result, + aer_name(NVME_AER_NOTICE_NS_CHANGED), + aer_name(NVME_AER_NOTICE_ANA), + aer_name(NVME_AER_NOTICE_FW_ACT_STARTING), + aer_name(NVME_AER_NOTICE_DISC_CHANGED), + aer_name(NVME_AER_ERROR), + aer_name(NVME_AER_SMART), + aer_name(NVME_AER_CSS), + aer_name(NVME_AER_VS)) + ) +); + +#undef aer_name + +TRACE_EVENT(nvme_sq, + TP_PROTO(struct request *req, __le16 sq_head, int sq_tail), + TP_ARGS(req, sq_head, sq_tail), + TP_STRUCT__entry( + __field(int, ctrl_id) + __array(char, disk, DISK_NAME_LEN) + __field(int, qid) + __field(u16, sq_head) + __field(u16, sq_tail) + ), + TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __assign_disk_name(__entry->disk, req->q->disk); + __entry->qid = nvme_req_qid(req); + __entry->sq_head = le16_to_cpu(sq_head); + __entry->sq_tail = sq_tail; + ), + TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->sq_head, __entry->sq_tail + ) +); + +#endif /* _TRACE_NVME_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/zns.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/zns.c new file mode 100644 index 0000000..9f81beb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/host/zns.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. + */ + +#include +#include +#include "nvme.h" + +int nvme_revalidate_zones(struct nvme_ns *ns) +{ + struct request_queue *q = ns->queue; + int ret; + + ret = blk_revalidate_disk_zones(ns->disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); + return ret; +} + +static int nvme_set_max_append(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_zns *id; + int status; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (status) { + kfree(id); + return status; + } + + if (id->zasl) + ctrl->max_zone_append = 1 << (id->zasl + 3); + else + ctrl->max_zone_append = ctrl->max_hw_sectors; + kfree(id); + return 0; +} + +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) +{ + struct nvme_effects_log *log = ns->head->effects; + struct request_queue *q = ns->queue; + struct nvme_command c = { }; + struct nvme_id_ns_zns *id; + int status; + + /* Driver requires zone append support */ + if ((le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + NVME_CMD_EFFECTS_CSUPP)) { + if (test_and_clear_bit(NVME_NS_FORCE_RO, &ns->flags)) + dev_warn(ns->ctrl->device, + "Zone Append supported for zoned namespace:%d. Remove read-only mode\n", + ns->head->ns_id); + } else { + set_bit(NVME_NS_FORCE_RO, &ns->flags); + dev_warn(ns->ctrl->device, + "Zone Append not supported for zoned namespace:%d. 
Forcing to read-only mode\n", + ns->head->ns_id); + } + + /* Lazily query controller append limit for the first zoned namespace */ + if (!ns->ctrl->max_zone_append) { + status = nvme_set_max_append(ns->ctrl); + if (status) + return status; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(ns->head->ns_id); + c.identify.cns = NVME_ID_CNS_CS_NS; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id)); + if (status) + goto free_data; + + /* + * We currently do not handle devices requiring any of the zoned + * operation characteristics. + */ + if (id->zoc) { + dev_warn(ns->ctrl->device, + "zone operations:%x not supported for namespace:%u\n", + le16_to_cpu(id->zoc), ns->head->ns_id); + status = -ENODEV; + goto free_data; + } + + ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze)); + if (!is_power_of_2(ns->zsze)) { + dev_warn(ns->ctrl->device, + "invalid zone size:%llu for namespace:%u\n", + ns->zsze, ns->head->ns_id); + status = -ENODEV; + goto free_data; + } + + blk_queue_set_zoned(ns->disk, BLK_ZONED_HM); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); + blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); +free_data: + kfree(id); + return status; +} + +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ns->disk->queue; + size_t bufsize; + void *buf; + + const size_t min_bufsize = sizeof(struct nvme_zone_report) + + sizeof(struct nvme_zone_descriptor); + + nr_zones = min_t(unsigned int, nr_zones, + get_capacity(ns->disk) >> ilog2(ns->zsze)); + + bufsize = sizeof(struct nvme_zone_report) + + nr_zones * sizeof(struct nvme_zone_descriptor); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + while (bufsize >= min_bufsize) { + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + return NULL; +} + +static int nvme_zone_parse_entry(struct nvme_ns *ns, + struct nvme_zone_descriptor *entry, + unsigned int idx, report_zones_cb cb, + void *data) +{ + struct blk_zone zone = { }; + + if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { + dev_err(ns->ctrl->device, "invalid zone type %#x\n", + entry->zt); + return -EINVAL; + } + + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone.cond = entry->zs >> 4; + zone.len = ns->zsze; + zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap)); + zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba)); + if (zone.cond == BLK_ZONE_COND_FULL) + zone.wp = zone.start + zone.len; + else + zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)); + + return cb(&zone, idx, data); +} + +int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_zone_report *report; + struct nvme_command c = { }; + int ret, zone_idx = 0; + unsigned int nz, i; + size_t buflen; + + if (ns->head->ids.csi != NVME_CSI_ZNS) + return -EINVAL; + + report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen); + if (!report) + return -ENOMEM; + + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + 
c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + + sector &= ~(ns->zsze - 1); + while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { + memset(report, 0, buflen); + + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) { + if (ret > 0) + ret = -EIO; + goto out_free; + } + + nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones); + if (!nz) + break; + + for (i = 0; i < nz && zone_idx < nr_zones; i++) { + ret = nvme_zone_parse_entry(ns, &report->entries[i], + zone_idx, cb, data); + if (ret) + goto out_free; + zone_idx++; + } + + sector += ns->zsze * nz; + } + + if (zone_idx > 0) + ret = zone_idx; + else + ret = -EINVAL; +out_free: + kvfree(report); + return ret; +} + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *c, enum nvme_zone_mgmt_action action) +{ + memset(c, 0, sizeof(*c)); + + c->zms.opcode = nvme_cmd_zone_mgmt_send; + c->zms.nsid = cpu_to_le32(ns->head->ns_id); + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + c->zms.zsa = action; + + if (req_op(req) == REQ_OP_ZONE_RESET_ALL) + c->zms.select_all = 1; + + return BLK_STS_OK; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/Makefile new file mode 100644 index 0000000..658f47f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/Makefile @@ -0,0 +1,11 @@ +# +# Build this dummy module only when building real nvme core module +# + +ifneq ($(CONFIG_NVME_HOST_DUMMY),m) + +obj-$(CONFIG_SCSI_LPFC) := lpfc.o + +lpfc-y := lpfc_dummy.o + +endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/lpfc_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/lpfc_dummy.c new file mode 100644 index 0000000..fb13319 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/lpfc/lpfc_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "lpfc" +#define PFX DRV_NAME ": " +#define DRV_VERSION "4.1" +#define DRV_RELDATE "June 05, 2017" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("lpfc dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init lpfc_init(void) +{ + return 0; +} + +static void __exit lpfc_cleanup(void) +{ +} + +module_init(lpfc_init); +module_exit(lpfc_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/mlnx-nvme_spec_ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/mlnx-nvme_spec_ new file mode 100644 index 0000000..4bd47ec --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/mlnx-nvme_spec_ @@ -0,0 +1,226 @@ +# +# Copyright (c) 2016 Mellanox Technologies. All rights reserved. +# +# This Software is licensed under one of the following licenses: +# +# 1) under the terms of the "Common Public License 1.0" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/cpl.php. +# +# 2) under the terms of the "The BSD License" a copy of which is +# available from the Open Source Initiative, see +# http://www.opensource.org/licenses/bsd-license.php. +# +# 3) under the terms of the "GNU General Public License (GPL) Version 2" a +# copy of which is available from the Open Source Initiative, see +# http://www.opensource.org/licenses/gpl-license.php. +# +# Licensee has the right to choose one of the above licenses. +# +# Redistributions of source code must retain the above copyright +# notice and one of the license notices. +# +# Redistributions in binary form must reproduce both the above copyright +# notice, one of the license notices in the documentation +# and/or other materials provided with the distribution. +# +# + +%{!?_name: %define _name mlnx-nvme} +%{!?_version: %define _version 4.0} +%{!?_release: %define _release 0} + +# KMP is disabled by default +%{!?KMP: %global KMP 0} + +%{!?NVME_HOST_WITHOUT_FC: %global NVME_HOST_WITHOUT_FC 0} + +# take kernel version or default to uname -r +%{!?KVERSION: %global KVERSION %(uname -r)} +%global kernel_version %{KVERSION} +%global krelver %(echo -n %{KVERSION} | sed -e 's/-/_/g') +# take path to kernel sources if provided, otherwise look in default location (for non KMP rpms). +%{!?K_SRC: %global K_SRC /lib/modules/%{KVERSION}/build} + +# define release version +%{!?src_release: %global src_release %{_release}_%{krelver}} +%if "%{KMP}" != "1" +%global _release1 %{src_release} +%else +%global _release1 %{_release} +%endif +%global _kmp_rel %{_release1}%{?_kmp_build_num}%{?_dist} + +Summary: %{_name} Driver +Name: %{_name} +Version: %{_version} +Release: %{_release1}%{?_dist} +License: GPLv2 +Url: http://www.mellanox.com +Group: System Environment/Base +Source: %{_name}-%{_version}.tgz +BuildRoot: %{?build_root:%{build_root}}%{!?build_root:/var/tmp/OFED} +Vendor: Mellanox Technologies +%description +%{name} kernel modules + +# build KMP rpms? 
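+# (Hypothetical example: KMP keeps its default of 0 unless the build passes
+#  it in, e.g. "rpmbuild --define 'KMP 1' ...".)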
+%if "%{KMP}" == "1" +%global kernel_release() $(make -s -C %{1} kernelrelease M=$PWD) +BuildRequires: %kernel_module_package_buildreqs +%(mkdir -p %{buildroot}) +%(echo '%defattr (-,root,root)' > %{buildroot}/file_list) +%(echo '/lib/modules/%2-%1' >> %{buildroot}/file_list) +%(echo '%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*-%1.conf' >> %{buildroot}/file_list) +%{kernel_module_package -f %{buildroot}/file_list -r %{_kmp_rel} } +%else +%global kernel_source() %{K_SRC} +%global kernel_release() %{KVERSION} +%global flavors_to_build default +%endif + +# +# setup module sign scripts if paths to the keys are given +# +%global WITH_MOD_SIGN %(if ( test -f "$MODULE_SIGN_PRIV_KEY" && test -f "$MODULE_SIGN_PUB_KEY" ); \ + then \ + echo -n '1'; \ + else \ + echo -n '0'; fi) + +%if "%{WITH_MOD_SIGN}" == "1" +# call module sign script +%global __modsign_install_post \ + %{_builddir}/%{name}-%{version}/source/tools/sign-modules %{buildroot}/lib/modules/ %{kernel_source default} || exit 1 \ +%{nil} + +%global __debug_package 1 +%global buildsubdir %{name}-%{version} +# Disgusting hack alert! We need to ensure we sign modules *after* all +# invocations of strip occur, which is in __debug_install_post if +# find-debuginfo.sh runs, and __os_install_post if not. +# +%global __spec_install_post \ + %{?__debug_package:%{__debug_install_post}} \ + %{__arch_install_post} \ + %{__os_install_post} \ + %{__modsign_install_post} \ +%{nil} + +%endif # end of setup module sign scripts +# + +%if "%{_vendor}" == "suse" +%debug_package +%endif + +# set modules dir +%if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") +%if 0%{?fedora} +%global install_mod_dir updates/%{name} +%else +%global install_mod_dir extra/%{name} +%endif +%endif + +%if "%{_vendor}" == "suse" +%global install_mod_dir updates/%{name} +%endif + +%{!?install_mod_dir: %global install_mod_dir updates/%{name}} + +%prep +%setup +set -- * +mkdir source +mv "$@" source/ +mkdir obj + +%build +export EXTRA_CFLAGS='-DVERSION=\"%version\"' +export INSTALL_MOD_DIR=%{install_mod_dir} +export CONF_OPTIONS="%{configure_options}" +for flavor in %{flavors_to_build}; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + export LIB_MOD_DIR=/lib/modules/$KVER/$INSTALL_MOD_DIR + rm -rf obj/$flavor + cp -r source obj/$flavor + cd $PWD/obj/$flavor +%if "%{NVME_HOST_WITHOUT_FC}" == "1" + make CONFIG_NVME_HOST_WITHOUT_FC=m +%else + make CONFIG_NVME_HOST_WITHOUT_FC= +%endif + cd - +done + +%install +export INSTALL_MOD_PATH=%{buildroot} +export INSTALL_MOD_DIR=%{install_mod_dir} +export PREFIX=%{_prefix} +for flavor in %flavors_to_build; do + export K_BUILD=%{kernel_source $flavor} + export KVER=%{kernel_release $K_BUILD} + cd $PWD/obj/$flavor +%if "%{NVME_HOST_WITHOUT_FC}" == "1" + make install KERNELRELEASE=$KVER CONFIG_NVME_HOST_WITHOUT_FC=m +%else + make install KERNELRELEASE=$KVER CONFIG_NVME_HOST_WITHOUT_FC= +%endif + # Cleanup unnecessary kernel-generated module dependency files. + find $INSTALL_MOD_PATH/lib/modules -iname 'modules.*' -exec rm {} \; + cd - +done + +# Set the module(s) to be executable, so that they will be stripped when packaged. 
+find %{buildroot} \( -type f -name '*.ko' -o -name '*ko.gz' \) -exec %{__chmod} u+x \{\} \; + +%{__install} -d %{buildroot}%{_sysconfdir}/depmod.d/ +for module in `find %{buildroot}/ -name '*.ko' -o -name '*.ko.gz' | sort` +do +ko_name=${module##*/} +mod_name=${ko_name/.ko*/} +mod_path=${module/*\/%{name}} +mod_path=${mod_path/\/${ko_name}} +%if "%{_vendor}" == "suse" + for flavor in %{flavors_to_build}; do + if [[ $module =~ $flavor ]] || [ "X%{KMP}" != "X1" ];then + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}-$flavor.conf + fi + done +%else + %if 0%{?fedora} + echo "override ${mod_name} * updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %else + %if "%{_vendor}" == "redhat" || ("%{_vendor}" == "openEuler") + echo "override ${mod_name} * weak-updates/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif + echo "override ${mod_name} * extra/%{name}${mod_path}" >> %{buildroot}%{_sysconfdir}/depmod.d/zz02-%{name}-${mod_name}.conf + %endif +%endif +done + + +%clean +rm -rf %{buildroot} + +%post +if [ $1 -ge 1 ]; then # This package is being installed or reinstalled + /sbin/depmod %{KVERSION} +fi +# END of post + +%postun +/sbin/depmod %{KVERSION} + +%if "%{KMP}" != "1" +%files +%defattr(-,root,root,-) +/lib/modules/%{KVERSION}/%{install_mod_dir}/ +%config(noreplace) %{_sysconfdir}/depmod.d/zz02-%{name}-*.conf +%endif + +%changelog +* Sun Aug 21 2016 Alaa Hleihel +- Initial packaging diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Kconfig b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Kconfig new file mode 100644 index 0000000..973561c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Kconfig @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config NVME_TARGET + tristate "NVMe Target support" + depends on BLOCK + depends on CONFIGFS_FS + select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY + select SGL_ALLOC + help + This enabled target side support for the NVMe protocol, that is + it allows the Linux kernel to implement NVMe subsystems and + controllers and export Linux block devices as NVMe namespaces. + You need to select at least one of the transports below to make this + functionality useful. + + To configure the NVMe target you probably want to use the nvmetcli + tool from http://git.infradead.org/users/hch/nvmetcli.git. + +config NVME_TARGET_PASSTHRU + bool "NVMe Target Passthrough support" + depends on NVME_TARGET + depends on NVME_CORE=y || NVME_CORE=NVME_TARGET + help + This enables target side NVMe passthru controller support for the + NVMe Over Fabrics protocol. It allows for hosts to manage and + directly access an actual NVMe controller residing on the target + side, including executing Vendor Unique Commands. + + If unsure, say N. + +config NVME_TARGET_LOOP + tristate "NVMe loopback device support" + depends on NVME_TARGET + select NVME_FABRICS + select SG_POOL + help + This enables the NVMe loopback device support, which can be useful + to test NVMe host and target side features. + + If unsure, say N. + +config NVME_TARGET_RDMA + tristate "NVMe over Fabrics RDMA target support" + depends on INFINIBAND && INFINIBAND_ADDR_TRANS + depends on NVME_TARGET + select SGL_ALLOC + help + This enables the NVMe RDMA target support, which allows exporting NVMe + devices over RDMA. + + If unsure, say N. 
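
The Kconfig help above points to nvmetcli for configuration. As a purely illustrative sketch (not part of this patch), the same RDMA target can also be wired up by hand through the configfs attributes implemented in drivers/nvme/target/configfs.c further down; the subsystem NQN, backing device, and address below are made-up placeholders:

    # Sketch only: export a block device over NVMe-oF/RDMA via the nvmet configfs tree.
    # "testnqn", /dev/nvme0n1 and 192.168.0.10 are placeholder values.
    modprobe nvmet
    modprobe nvmet-rdma
    cd /sys/kernel/config/nvmet
    mkdir subsystems/testnqn
    echo 1 > subsystems/testnqn/attr_allow_any_host
    mkdir subsystems/testnqn/namespaces/1
    echo -n /dev/nvme0n1 > subsystems/testnqn/namespaces/1/device_path
    echo 1 > subsystems/testnqn/namespaces/1/enable
    mkdir ports/1
    echo rdma > ports/1/addr_trtype
    echo ipv4 > ports/1/addr_adrfam
    echo 192.168.0.10 > ports/1/addr_traddr
    echo 4420 > ports/1/addr_trsvcid
    ln -s /sys/kernel/config/nvmet/subsystems/testnqn ports/1/subsystems/testnqn

An initiator can then reach this port with "nvme discover -t rdma -a 192.168.0.10 -s 4420" followed by "nvme connect"; teardown removes the port symlink and directories in the reverse order.
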
+ +config NVME_TARGET_FC + tristate "NVMe over Fabrics FC target driver" + depends on NVME_TARGET + depends on HAS_DMA + select SGL_ALLOC + help + This enables the NVMe FC target support, which allows exporting NVMe + devices over FC. + + If unsure, say N. + +config NVME_TARGET_FCLOOP + tristate "NVMe over Fabrics FC Transport Loopback Test driver" + depends on NVME_TARGET + select NVME_FABRICS + select SG_POOL + depends on NVME_FC + depends on NVME_TARGET_FC + help + This enables the NVMe FC loopback test support, which can be useful + to test NVMe-FC transport interfaces. + + If unsure, say N. + +config NVME_TARGET_TCP + tristate "NVMe over Fabrics TCP target support" + depends on INET + depends on NVME_TARGET + help + This enables the NVMe TCP target support, which allows exporting NVMe + devices over TCP. + + If unsure, say N. diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Makefile new file mode 100644 index 0000000..226e070 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/Makefile @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I$(src) +obj-$(CONFIG_NVME_TARGET) += nvmet.o +obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o +obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o +obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o +obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o +obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o + +obj-$(CONFIG_NVME_TARGET_DUMMY) += nvmet-rdma.o + +# --with-nvmf-host-rdma-only +# dummy +obj-$(CONFIG_NVME_HOST_WITHOUT_FC) += nvmet.o +obj-$(CONFIG_NVME_HOST_WITHOUT_FC) += nvme-loop.o +obj-$(CONFIG_NVME_HOST_WITHOUT_FC) += nvmet-rdma.o +obj-$(CONFIG_NVME_HOST_WITHOUT_FC) += nvmet-fc.o +obj-$(CONFIG_NVME_HOST_WITHOUT_FC) += nvme-fcloop.o + +ifeq ($(CONFIG_NVME_TARGET_DUMMY),m) +nvmet-rdma-y += nvmet-rdma_dummy.o +else +ifeq ($(CONFIG_NVME_HOST_WITHOUT_FC),m) +nvmet-y += nvmet_dummy.o +nvme-loop-y += nvme-loop_dummy.o +nvmet-rdma-y += nvmet-rdma_dummy.o +nvmet-fc-y += nvmet-fc_dummy.o +nvme-fcloop-y += nvme-fcloop_dummy.o +else +nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ + discovery.o io-cmd-file.o io-cmd-bdev.o +nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o +nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o +nvme-loop-y += loop.o +nvmet-rdma-y += rdma.o +nvmet-fc-y += fc.o +nvme-fcloop-y += fcloop.o +nvmet-tcp-y += tcp.o +nvmet-$(CONFIG_TRACING) += trace.o +endif +endif diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/admin-cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/admin-cmd.c new file mode 100644 index 0000000..d7df068 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/admin-cmd.c @@ -0,0 +1,1078 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe admin command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. 
+ */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include + +#include +#include +#include "nvmet.h" + +u32 nvmet_get_log_page_len(struct nvme_command *cmd) +{ + u32 len = le16_to_cpu(cmd->get_log_page.numdu); + + len <<= 16; + len += le16_to_cpu(cmd->get_log_page.numdl); + /* NUMD is a 0's based value */ + len += 1; + len *= sizeof(u32); + + return len; +} + +static u32 nvmet_feat_data_len(struct nvmet_req *req, u32 cdw10) +{ + switch (cdw10 & 0xff) { + case NVME_FEAT_HOST_ID: + return sizeof(req->sq->ctrl->hostid); + default: + return 0; + } +} + +u64 nvmet_get_log_page_offset(struct nvme_command *cmd) +{ + return le64_to_cpu(cmd->get_log_page.lpo); +} + +static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) +{ + nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->transfer_len)); +} + +static void nvmet_execute_get_log_page_error(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + unsigned long flags; + off_t offset = 0; + u64 slot; + u64 i; + + spin_lock_irqsave(&ctrl->error_lock, flags); + slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS; + + for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) { + if (nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], + sizeof(struct nvme_error_slot))) + break; + + if (slot == 0) + slot = NVMET_ERROR_LOG_SLOTS - 1; + else + slot--; + offset += sizeof(struct nvme_error_slot); + } + spin_unlock_irqrestore(&ctrl->error_lock, flags); + nvmet_req_complete(req, 0); +} + +static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u64 host_reads, host_writes, data_units_read, data_units_written; + u16 status; + + status = nvmet_req_find_ns(req); + if (status) + return status; + + /* we don't have the right data for file backed ns */ + if (!req->ns->bdev) + return NVME_SC_SUCCESS; + + host_reads = part_stat_read(req->ns->bdev, ios[READ]); + data_units_read = + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000); + host_writes = part_stat_read(req->ns->bdev, ios[WRITE]); + data_units_written = + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + + return NVME_SC_SUCCESS; +} + +static u16 nvmet_get_smart_log_all(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u64 host_reads = 0, host_writes = 0; + u64 data_units_read = 0, data_units_written = 0; + struct nvmet_ns *ns; + struct nvmet_ctrl *ctrl; + unsigned long idx; + + ctrl = req->sq->ctrl; + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + /* we don't have the right data for file backed ns */ + if (!ns->bdev) + continue; + host_reads += part_stat_read(ns->bdev, ios[READ]); + data_units_read += DIV_ROUND_UP( + part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes += part_stat_read(ns->bdev, ios[WRITE]); + data_units_written += DIV_ROUND_UP( + part_stat_read(ns->bdev, sectors[WRITE]), 1000); + } + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + + return NVME_SC_SUCCESS; +} + +static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) +{ + struct nvme_smart_log *log; + u16 status = 
NVME_SC_INTERNAL; + unsigned long flags; + + if (req->transfer_len != sizeof(*log)) + goto out; + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + goto out; + + if (req->cmd->get_log_page.nsid == cpu_to_le32(NVME_NSID_ALL)) + status = nvmet_get_smart_log_all(req, log); + else + status = nvmet_get_smart_log_nsid(req, log); + if (status) + goto out_free_log; + + spin_lock_irqsave(&req->sq->ctrl->error_lock, flags); + put_unaligned_le64(req->sq->ctrl->err_counter, + &log->num_err_log_entries); + spin_unlock_irqrestore(&req->sq->ctrl->error_lock, flags); + + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); +out_free_log: + kfree(log); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) +{ + log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0); + + log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0); +} + +static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log) +{ + log->iocs[nvme_cmd_zone_append] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_zone_mgmt_send] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_zone_mgmt_recv] = cpu_to_le32(1 << 0); +} + +static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) +{ + struct nvme_effects_log *log; + u16 status = NVME_SC_SUCCESS; + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) { + status = NVME_SC_INTERNAL; + goto out; + } + + switch (req->cmd->get_log_page.csi) { + case NVME_CSI_NVM: + nvmet_get_cmd_effects_nvm(log); + break; + case NVME_CSI_ZNS: + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + status = NVME_SC_INVALID_IO_CMD_SET; + goto free; + } + nvmet_get_cmd_effects_nvm(log); + nvmet_get_cmd_effects_zns(log); + break; + default: + status = NVME_SC_INVALID_LOG_PAGE; + goto free; + } + + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); +free: + kfree(log); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = NVME_SC_INTERNAL; + size_t len; + + if (req->transfer_len != NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32)) + goto out; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_changed_ns == U32_MAX) + len = sizeof(__le32); + else + len = ctrl->nr_changed_ns * sizeof(__le32); + status = nvmet_copy_to_sgl(req, 0, ctrl->changed_ns_list, len); + if (!status) + status = nvmet_zero_sgl(req, len, req->transfer_len - len); + ctrl->nr_changed_ns = 0; + nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR); + mutex_unlock(&ctrl->lock); +out: + nvmet_req_complete(req, status); +} + +static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, + struct nvme_ana_group_desc *desc) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + unsigned long idx; + u32 count = 0; + + if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { + xa_for_each(&ctrl->subsys->namespaces, idx, ns) + if (ns->anagrpid == grpid) + desc->nsids[count++] = cpu_to_le32(ns->nsid); + } + + desc->grpid = 
cpu_to_le32(grpid); + desc->nnsids = cpu_to_le32(count); + desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt); + desc->state = req->port->ana_state[grpid]; + memset(desc->rsvd17, 0, sizeof(desc->rsvd17)); + return struct_size(desc, nsids, count); +} + +static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) +{ + struct nvme_ana_rsp_hdr hdr = { 0, }; + struct nvme_ana_group_desc *desc; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */ + size_t len; + u32 grpid; + u16 ngrps = 0; + u16 status; + + status = NVME_SC_INTERNAL; + desc = kmalloc(struct_size(desc, nsids, NVMET_MAX_NAMESPACES), + GFP_KERNEL); + if (!desc) + goto out; + + down_read(&nvmet_ana_sem); + for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) { + if (!nvmet_ana_group_enabled[grpid]) + continue; + len = nvmet_format_ana_group(req, grpid, desc); + status = nvmet_copy_to_sgl(req, offset, desc, len); + if (status) + break; + offset += len; + ngrps++; + } + for ( ; grpid <= NVMET_MAX_ANAGRPS; grpid++) { + if (nvmet_ana_group_enabled[grpid]) + ngrps++; + } + + hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); + hdr.ngrps = cpu_to_le16(ngrps); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_ANA_CHANGE); + up_read(&nvmet_ana_sem); + + kfree(desc); + + /* copy the header last once we know the number of groups */ + status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr)); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_get_log_page(struct nvmet_req *req) +{ + if (!nvmet_check_transfer_len(req, nvmet_get_log_page_len(req->cmd))) + return; + + switch (req->cmd->get_log_page.lid) { + case NVME_LOG_ERROR: + return nvmet_execute_get_log_page_error(req); + case NVME_LOG_SMART: + return nvmet_execute_get_log_page_smart(req); + case NVME_LOG_FW_SLOT: + /* + * We only support a single firmware slot which always is + * active, so we can zero out the whole firmware slot log and + * still claim to fully implement this mandatory log page. + */ + return nvmet_execute_get_log_page_noop(req); + case NVME_LOG_CHANGED_NS: + return nvmet_execute_get_log_changed_ns(req); + case NVME_LOG_CMD_EFFECTS: + return nvmet_execute_get_log_cmd_effects_ns(req); + case NVME_LOG_ANA: + return nvmet_execute_get_log_page_ana(req); + } + pr_debug("unhandled lid %d on qid %d\n", + req->cmd->get_log_page.lid, req->sq->qid); + req->error_loc = offsetof(struct nvme_get_log_page_command, lid); + nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); +} + +static bool nvmet_is_write_zeroes(struct nvmet_ctrl *ctrl) +{ + struct nvmet_ns *ns; + unsigned long idx; + + xa_for_each(&ctrl->subsys->namespaces, idx, ns) + if (!bdev_write_zeroes_sectors(ns->bdev)) + return false; + return true; +} + +static void nvmet_execute_identify_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_subsys *subsys = ctrl->subsys; + struct nvme_id_ctrl *id; + u32 cmd_capsule_size; + u16 status = 0; + + if (!subsys->subsys_discovered) { + mutex_lock(&subsys->lock); + subsys->subsys_discovered = true; + mutex_unlock(&subsys->lock); + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* XXX: figure out how to assign real vendors IDs. 
*/ + id->vid = 0; + id->ssvid = 0; + + memcpy(id->sn, ctrl->subsys->serial, NVMET_SN_MAX_SIZE); + memcpy_and_pad(id->mn, sizeof(id->mn), subsys->model_number, + strlen(subsys->model_number), ' '); + memcpy_and_pad(id->fr, sizeof(id->fr), + UTS_RELEASE, strlen(UTS_RELEASE), ' '); + + id->rab = 6; + + if (nvmet_is_disc_subsys(ctrl->subsys)) + id->cntrltype = NVME_CTRL_DISC; + else + id->cntrltype = NVME_CTRL_IO; + + /* + * XXX: figure out how we can assign a IEEE OUI, but until then + * the safest is to leave it as zeroes. + */ + + /* we support multiple ports, multiples hosts and ANA: */ + id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL | + NVME_CTRL_CMIC_ANA; + + /* Limit MDTS according to transport capability + * + * limit the data transfer size in offload case according to device + * capability. + * */ + if (req->port->offload) + id->mdts = ctrl->ops->peer_to_peer_mdts(req->port); + else if (ctrl->ops->get_mdts) + id->mdts = ctrl->ops->get_mdts(ctrl); + else + id->mdts = 0; + + if (req->port->offload && req->port->offload_passthrough_sqe_rw) + id->ovsncs |= cpu_to_le32(NVME_OVSNCS_PASSTHROUGH_SQE_RW); + + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* XXX: figure out what to do about RTD3R/RTD3 */ + id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); + id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT | + (ctrl->subsys->offloadble ? 0 : NVME_CTRL_ATTR_TBKAS)); + + id->oacs = 0; + + /* + * We don't really have a practical limit on the number of abort + * comands. But we don't do anything useful for abort either, so + * no point in allowing more abort commands than the spec requires. + */ + id->acl = 3; + + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* first slot is read-only, only one slot supported */ + id->frmw = (1 << 0) | (1 << 1); + id->lpa = (1 << 0) | (1 << 1) | (1 << 2); + id->elpe = NVMET_ERROR_LOG_SLOTS - 1; + id->npss = 0; + + /* We support keep-alive timeout in granularity of seconds */ + id->kas = cpu_to_le16(NVMET_KAS); + + id->sqes = (0x6 << 4) | 0x6; + id->cqes = (0x4 << 4) | 0x4; + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); + id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); + if (!req->port->offload || nvmet_is_write_zeroes(ctrl)) + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | + NVME_CTRL_ONCS_WRITE_ZEROES); + else + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM); + + /* XXX: don't report vwc if the underlying device is write through */ + id->vwc = NVME_CTRL_VWC_PRESENT; + + /* + * We can't support atomic writes bigger than a LBA without support + * from the backend device. + */ + id->awun = 0; + id->awupf = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->flags & NVMF_KEYED_SGLS) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); + + /* + * Max command capsule size is sqe + in-capsule data size. + * Disable in-capsule data for Metadata capable controllers. 
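+ * (IOCCSZ is expressed in 16-byte units: a bare 64-byte SQE gives 64 / 16 = 4,
+ * the spec minimum, and with, say, 8 KiB of in-capsule data it becomes
+ * (64 + 8192) / 16 = 516. The 8 KiB figure is only an illustrative value,
+ * not a default taken from this driver.)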
+ */ + cmd_capsule_size = sizeof(struct nvme_command); + if (!ctrl->pi_support) + cmd_capsule_size += ctrl->sqe_inline_size; + id->ioccsz = cpu_to_le32(cmd_capsule_size / 16); + + /* Max response capsule size is cqe */ + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4); + id->anatt = 10; /* random value */ + id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS); + id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS); + + /* + * Meh, we don't really support any power state. Fake up the same + * values that qemu does. + */ + id->psd[0].max_power = cpu_to_le16(0x9c4); + id->psd[0].entry_lat = cpu_to_le32(0x10); + id->psd[0].exit_lat = cpu_to_le32(0x4); + + id->nwpc = 1 << 0; /* write protect and no write protect */ + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ns(struct nvmet_req *req) +{ + struct nvme_id_ns *id; + u16 status; + + if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { + req->error_loc = offsetof(struct nvme_identify, nsid); + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* return an all zeroed buffer if we can't find an active namespace */ + status = nvmet_req_find_ns(req); + if (status) { + status = 0; + goto done; + } + + nvmet_ns_revalidate(req->ns); + + /* + * nuse = ncap = nsze isn't always true, but we have no way to find + * that out from the underlying device. + */ + id->ncap = id->nsze = + cpu_to_le64(req->ns->size >> req->ns->blksize_shift); + switch (req->port->ana_state[req->ns->anagrpid]) { + case NVME_ANA_INACCESSIBLE: + case NVME_ANA_PERSISTENT_LOSS: + break; + default: + id->nuse = id->nsze; + break; + } + + if (req->ns->bdev) + nvmet_bdev_set_limits(req->ns->bdev, id); + + /* + * We just provide a single LBA format that matches what the + * underlying device reports. + */ + id->nlbaf = 0; + id->flbas = 0; + + /* + * Our namespace might always be shared. Not just with other + * controllers, but also with any other user of the block device. 
+ */ + id->nmic = NVME_NS_NMIC_SHARED; + id->anagrpid = cpu_to_le32(req->ns->anagrpid); + + memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid)); + + id->lbaf[0].ds = req->ns->blksize_shift; + + if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns)) { + id->dpc = NVME_NS_DPC_PI_FIRST | NVME_NS_DPC_PI_LAST | + NVME_NS_DPC_PI_TYPE1 | NVME_NS_DPC_PI_TYPE2 | + NVME_NS_DPC_PI_TYPE3; + id->mc = NVME_MC_EXTENDED_LBA; + id->dps = req->ns->pi_type; + id->flbas = NVME_NS_FLBAS_META_EXT; + id->lbaf[0].ms = cpu_to_le16(req->ns->metadata_size); + } + + if (req->ns->readonly) + id->nsattr |= (1 << 0); +done: + if (!status) + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_nslist(struct nvmet_req *req) +{ + static const int buf_size = NVME_IDENTIFY_DATA_SIZE; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + unsigned long idx; + u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); + __le32 *list; + u16 status = 0; + int i = 0; + + list = kzalloc(buf_size, GFP_KERNEL); + if (!list) { + status = NVME_SC_INTERNAL; + goto out; + } + + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + if (ns->nsid <= min_nsid) + continue; + list[i++] = cpu_to_le32(ns->nsid); + if (i == buf_size / sizeof(__le32)) + break; + } + + status = nvmet_copy_to_sgl(req, 0, list, buf_size); + + kfree(list); +out: + nvmet_req_complete(req, status); +} + +static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, + void *id, off_t *off) +{ + struct nvme_ns_id_desc desc = { + .nidt = type, + .nidl = len, + }; + u16 status; + + status = nvmet_copy_to_sgl(req, *off, &desc, sizeof(desc)); + if (status) + return status; + *off += sizeof(desc); + + status = nvmet_copy_to_sgl(req, *off, id, len); + if (status) + return status; + *off += len; + + return 0; +} + +static void nvmet_execute_identify_desclist(struct nvmet_req *req) +{ + off_t off = 0; + u16 status; + + status = nvmet_req_find_ns(req); + if (status) + goto out; + + if (memchr_inv(&req->ns->uuid, 0, sizeof(req->ns->uuid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID, + NVME_NIDT_UUID_LEN, + &req->ns->uuid, &off); + if (status) + goto out; + } + if (memchr_inv(req->ns->nguid, 0, sizeof(req->ns->nguid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_NGUID, + NVME_NIDT_NGUID_LEN, + &req->ns->nguid, &off); + if (status) + goto out; + } + + status = nvmet_copy_ns_identifier(req, NVME_NIDT_CSI, + NVME_NIDT_CSI_LEN, + &req->ns->csi, &off); + if (status) + goto out; + + if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, + off) != NVME_IDENTIFY_DATA_SIZE - off) + status = NVME_SC_INTERNAL | NVME_SC_DNR; + +out: + nvmet_req_complete(req, status); +} + +static bool nvmet_handle_identify_desclist(struct nvmet_req *req) +{ + switch (req->cmd->identify.csi) { + case NVME_CSI_NVM: + nvmet_execute_identify_desclist(req); + return true; + case NVME_CSI_ZNS: + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + nvmet_execute_identify_desclist(req); + return true; + } + return false; + default: + return false; + } +} + +static void nvmet_execute_identify(struct nvmet_req *req) +{ + if (!nvmet_check_transfer_len(req, NVME_IDENTIFY_DATA_SIZE)) + return; + + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_NS: + switch (req->cmd->identify.csi) { + case NVME_CSI_NVM: + return nvmet_execute_identify_ns(req); + default: + break; + } + break; + case NVME_ID_CNS_CS_NS: + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + switch 
(req->cmd->identify.csi) { + case NVME_CSI_ZNS: + return nvmet_execute_identify_cns_cs_ns(req); + default: + break; + } + } + break; + case NVME_ID_CNS_CTRL: + switch (req->cmd->identify.csi) { + case NVME_CSI_NVM: + return nvmet_execute_identify_ctrl(req); + } + break; + case NVME_ID_CNS_CS_CTRL: + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: + return nvmet_execute_identify_cns_cs_ctrl(req); + default: + break; + } + } + break; + case NVME_ID_CNS_NS_ACTIVE_LIST: + switch (req->cmd->identify.csi) { + case NVME_CSI_NVM: + return nvmet_execute_identify_nslist(req); + default: + break; + } + break; + case NVME_ID_CNS_NS_DESC_LIST: + if (nvmet_handle_identify_desclist(req) == true) + return; + break; + } + + nvmet_req_cns_error_complete(req); +} + +/* + * A "minimum viable" abort implementation: the command is mandatory in the + * spec, but we are not required to do any useful work. We couldn't really + * do a useful abort, so don't bother even with waiting for the command + * to be exectuted and return immediately telling the command to abort + * wasn't found. + */ +static void nvmet_execute_abort(struct nvmet_req *req) +{ + if (!nvmet_check_transfer_len(req, 0)) + return; + nvmet_set_result(req, 1); + nvmet_req_complete(req, 0); +} + +static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req) +{ + u16 status; + + if (req->ns->file) + status = nvmet_file_flush(req); + else + status = nvmet_bdev_flush(req); + + if (status) + pr_err("write protect flush failed nsid: %u\n", req->ns->nsid); + return status; +} + +static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) +{ + u32 write_protect = le32_to_cpu(req->cmd->common.cdw11); + struct nvmet_subsys *subsys = nvmet_req_subsys(req); + u16 status; + + status = nvmet_req_find_ns(req); + if (status) + return status; + + mutex_lock(&subsys->lock); + switch (write_protect) { + case NVME_NS_WRITE_PROTECT: + req->ns->readonly = true; + status = nvmet_write_protect_flush_sync(req); + if (status) + req->ns->readonly = false; + break; + case NVME_NS_NO_WRITE_PROTECT: + req->ns->readonly = false; + status = 0; + break; + default: + break; + } + + if (!status) + nvmet_ns_changed(subsys, req->ns->nsid); + mutex_unlock(&subsys->lock); + return status; +} + +u16 nvmet_set_feat_kato(struct nvmet_req *req) +{ + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + + nvmet_stop_keep_alive_timer(req->sq->ctrl); + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_start_keep_alive_timer(req->sq->ctrl); + + nvmet_set_result(req, req->sq->ctrl->kato); + + return 0; +} + +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) +{ + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + + if (val32 & ~mask) { + req->error_loc = offsetof(struct nvme_common_command, cdw11); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); + nvmet_set_result(req, val32); + + return 0; +} + +void nvmet_execute_set_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = nvmet_req_subsys(req); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); + u16 status = 0; + u16 nsqr; + u16 ncqr; + + if (!nvmet_check_transfer_len(req, 0)) + return; + + switch (cdw10 & 0xff) { + case NVME_FEAT_NUM_QUEUES: + ncqr = (cdw11 >> 16) & 0xffff; + nsqr = cdw11 & 0xffff; + if (ncqr == 0xffff || nsqr == 0xffff) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + nvmet_set_result(req, + (subsys->max_qid - 1) | 
((subsys->max_qid - 1) << 16)); + break; + case NVME_FEAT_KATO: + status = nvmet_set_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL); + break; + case NVME_FEAT_HOST_ID: + status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + break; + case NVME_FEAT_WRITE_PROTECT: + status = nvmet_set_feat_write_protect(req); + break; + default: + req->error_loc = offsetof(struct nvme_common_command, cdw10); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = nvmet_req_subsys(req); + u32 result; + + result = nvmet_req_find_ns(req); + if (result) + return result; + + mutex_lock(&subsys->lock); + if (req->ns->readonly == true) + result = NVME_NS_WRITE_PROTECT; + else + result = NVME_NS_NO_WRITE_PROTECT; + nvmet_set_result(req, result); + mutex_unlock(&subsys->lock); + + return 0; +} + +void nvmet_get_feat_kato(struct nvmet_req *req) +{ + nvmet_set_result(req, req->sq->ctrl->kato * 1000); +} + +void nvmet_get_feat_async_event(struct nvmet_req *req) +{ + nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); +} + +void nvmet_execute_get_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = nvmet_req_subsys(req); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u16 status = 0; + + if (!nvmet_check_transfer_len(req, nvmet_feat_data_len(req, cdw10))) + return; + + switch (cdw10 & 0xff) { + /* + * These features are mandatory in the spec, but we don't + * have a useful way to implement them. We'll eventually + * need to come up with some fake values for these. + */ +#if 0 + case NVME_FEAT_ARBITRATION: + break; + case NVME_FEAT_POWER_MGMT: + break; + case NVME_FEAT_TEMP_THRESH: + break; + case NVME_FEAT_ERR_RECOVERY: + break; + case NVME_FEAT_IRQ_COALESCE: + break; + case NVME_FEAT_IRQ_CONFIG: + break; + case NVME_FEAT_WRITE_ATOMIC: + break; +#endif + case NVME_FEAT_ASYNC_EVENT: + nvmet_get_feat_async_event(req); + break; + case NVME_FEAT_VOLATILE_WC: + nvmet_set_result(req, 1); + break; + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); + break; + case NVME_FEAT_KATO: + nvmet_get_feat_kato(req); + break; + case NVME_FEAT_HOST_ID: + /* need 128-bit host identifier flag */ + if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) { + req->error_loc = + offsetof(struct nvme_common_command, cdw11); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, + sizeof(req->sq->ctrl->hostid)); + break; + case NVME_FEAT_WRITE_PROTECT: + status = nvmet_get_feat_write_protect(req); + break; + default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +void nvmet_execute_async_event(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + if (!nvmet_check_transfer_len(req, 0)) + return; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR); + return; + } + ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req; + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +void nvmet_execute_keep_alive(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; + + if 
(!nvmet_check_transfer_len(req, 0)) + return; + + if (!ctrl->kato) { + status = NVME_SC_KA_TIMEOUT_INVALID; + goto out; + } + + pr_debug("ctrl %d update keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); +out: + nvmet_req_complete(req, status); +} + +u16 nvmet_parse_admin_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + u16 ret; + + if (nvme_is_fabrics(cmd)) + return nvmet_parse_fabrics_cmd(req); + if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) + return nvmet_parse_discovery_cmd(req); + + ret = nvmet_check_ctrl_status(req); + if (unlikely(ret)) + return ret; + + if (nvmet_is_passthru_req(req)) + return nvmet_parse_passthru_admin_cmd(req); + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->execute = nvmet_execute_get_log_page; + return 0; + case nvme_admin_identify: + req->execute = nvmet_execute_identify; + return 0; + case nvme_admin_abort_cmd: + req->execute = nvmet_execute_abort; + return 0; + case nvme_admin_set_features: + req->execute = nvmet_execute_set_features; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_get_features; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + return 0; + default: + return nvmet_report_invalid_opcode(req); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/configfs.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/configfs.c new file mode 100644 index 0000000..b2efb6d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/configfs.c @@ -0,0 +1,2277 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Configfs interface for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" + +static const struct config_item_type nvmet_host_type; +static const struct config_item_type nvmet_subsys_type; + +static LIST_HEAD(nvmet_ports_list); +struct list_head *nvmet_ports = &nvmet_ports_list; + +struct nvmet_type_name_map { + u8 type; + const char *name; +}; + +static struct nvmet_type_name_map nvmet_transport[] = { + { NVMF_TRTYPE_RDMA, "rdma" }, + { NVMF_TRTYPE_FC, "fc" }, + { NVMF_TRTYPE_TCP, "tcp" }, + { NVMF_TRTYPE_LOOP, "loop" }, +}; + +static const struct nvmet_type_name_map nvmet_addr_family[] = { + { NVMF_ADDR_FAMILY_PCI, "pcie" }, + { NVMF_ADDR_FAMILY_IP4, "ipv4" }, + { NVMF_ADDR_FAMILY_IP6, "ipv6" }, + { NVMF_ADDR_FAMILY_IB, "ib" }, + { NVMF_ADDR_FAMILY_FC, "fc" }, + { NVMF_ADDR_FAMILY_LOOP, "loop" }, +}; + +static bool nvmet_is_port_enabled(struct nvmet_port *p, const char *caller) +{ + if (p->enabled) + pr_err("Disable port '%u' before changing attribute in %s\n", + le16_to_cpu(p->disc_addr.portid), caller); + return p->enabled; +} + +/* + * nvmet_port Generic ConfigFS definitions. + * Used in any place in the ConfigFS tree that refers to an address. 
+ */ +static ssize_t nvmet_addr_adrfam_show(struct config_item *item, char *page) +{ + u8 adrfam = to_nvmet_port(item)->disc_addr.adrfam; + int i; + + for (i = 1; i < ARRAY_SIZE(nvmet_addr_family); i++) { + if (nvmet_addr_family[i].type == adrfam) + return sprintf(page, "%s\n", nvmet_addr_family[i].name); + } + + return sprintf(page, "\n"); +} + +static ssize_t nvmet_addr_adrfam_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int i; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + for (i = 1; i < ARRAY_SIZE(nvmet_addr_family); i++) { + if (sysfs_streq(page, nvmet_addr_family[i].name)) + goto found; + } + + pr_err("Invalid value '%s' for adrfam\n", page); + return -EINVAL; + +found: + port->disc_addr.adrfam = nvmet_addr_family[i].type; + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_adrfam); + +static ssize_t nvmet_addr_portid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", + le16_to_cpu(port->disc_addr.portid)); +} + +static ssize_t nvmet_addr_portid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 portid = 0; + + if (kstrtou16(page, 0, &portid)) { + pr_err("Invalid value '%s' for portid\n", page); + return -EINVAL; + } + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + port->disc_addr.portid = cpu_to_le16(portid); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_portid); + +static ssize_t nvmet_addr_traddr_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.traddr); +} + +static ssize_t nvmet_addr_traddr_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRADDR_SIZE) { + pr_err("Invalid value '%s' for traddr\n", page); + return -EINVAL; + } + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + if (sscanf(page, "%s\n", port->disc_addr.traddr) != 1) + return -EINVAL; + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_traddr); + +static const struct nvmet_type_name_map nvmet_addr_treq[] = { + { NVMF_TREQ_NOT_SPECIFIED, "not specified" }, + { NVMF_TREQ_REQUIRED, "required" }, + { NVMF_TREQ_NOT_REQUIRED, "not required" }, +}; + +static ssize_t nvmet_addr_treq_show(struct config_item *item, char *page) +{ + u8 treq = to_nvmet_port(item)->disc_addr.treq & + NVME_TREQ_SECURE_CHANNEL_MASK; + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_addr_treq); i++) { + if (treq == nvmet_addr_treq[i].type) + return sprintf(page, "%s\n", nvmet_addr_treq[i].name); + } + + return sprintf(page, "\n"); +} + +static ssize_t nvmet_addr_treq_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK; + int i; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + for (i = 0; i < ARRAY_SIZE(nvmet_addr_treq); i++) { + if (sysfs_streq(page, nvmet_addr_treq[i].name)) + goto found; + } + + pr_err("Invalid value '%s' for treq\n", page); + return -EINVAL; + +found: + treq |= nvmet_addr_treq[i].type; + port->disc_addr.treq = treq; + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_treq); + +static ssize_t nvmet_addr_trsvcid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = 
to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.trsvcid); +} + +static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRSVCID_SIZE) { + pr_err("Invalid value '%s' for trsvcid\n", page); + return -EINVAL; + } + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + if (sscanf(page, "%s\n", port->disc_addr.trsvcid) != 1) + return -EINVAL; + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_trsvcid); + +static ssize_t nvmet_param_inline_data_size_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size); +} + +static ssize_t nvmet_param_inline_data_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int ret; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + ret = kstrtoint(page, 0, &port->inline_data_size); + if (ret) { + pr_err("Invalid value '%s' for inline_data_size\n", page); + return -EINVAL; + } + return count; +} + +CONFIGFS_ATTR(nvmet_, param_inline_data_size); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static ssize_t nvmet_param_pi_enable_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", port->pi_enable); +} + +static ssize_t nvmet_param_pi_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + bool val; + + if (strtobool(page, &val)) + return -EINVAL; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + port->pi_enable = val; + return count; +} + +CONFIGFS_ATTR(nvmet_, param_pi_enable); +#endif + +static ssize_t nvmet_addr_trtype_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_transport); i++) { + if (port->disc_addr.trtype == nvmet_transport[i].type) + return sprintf(page, "%s\n", nvmet_transport[i].name); + } + + return sprintf(page, "\n"); +} + +static void nvmet_port_init_tsas_rdma(struct nvmet_port *port) +{ + port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED; + port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED; + port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM; +} + +static ssize_t nvmet_addr_trtype_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int i; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + for (i = 0; i < ARRAY_SIZE(nvmet_transport); i++) { + if (sysfs_streq(page, nvmet_transport[i].name)) + goto found; + } + + pr_err("Invalid value '%s' for trtype\n", page); + return -EINVAL; + +found: + memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE); + port->disc_addr.trtype = nvmet_transport[i].type; + if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA) + nvmet_port_init_tsas_rdma(port); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_trtype); + +static ssize_t nvmet_addr_tractive_show(struct config_item *item, char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return sprintf(page, "%d\n", nvmet_is_port_active(port)); +} + +CONFIGFS_ATTR_RO(nvmet_, addr_tractive); + +static ssize_t nvmet_param_offload_queues_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return 
snprintf(page, PAGE_SIZE, "%d\n", port->offload_queues); +} + +static ssize_t nvmet_param_offload_queues_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u32 offload_queues; + int ret; + + if (port->enabled) { + pr_err("Cannot modify offload_queues while enabled\n"); + pr_err("Disable the port before modifying\n"); + return -EACCES; + } + + ret = kstrtou32(page, 0, &offload_queues); + if (ret || !offload_queues || offload_queues > num_possible_cpus()) { + pr_err("Invalid value '%s' for offload_queues\n", page); + return -EINVAL; + } + port->offload_queues = offload_queues; + + return count; +} + +CONFIGFS_ATTR(nvmet_, param_offload_queues); + +static ssize_t nvmet_param_offload_srq_size_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%zu\n", port->offload_srq_size); +} + +static ssize_t nvmet_param_offload_srq_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u32 offload_srq_size; + int ret; + + if (port->enabled) { + pr_err("Cannot modify offload_srq_size while enabled\n"); + pr_err("Disable the port before modifying\n"); + return -EACCES; + } + + ret = kstrtou32(page, 0, &offload_srq_size); + if (ret || offload_srq_size < 256) { + pr_err("Invalid value '%s' for offload_srq_size, should >= 256\n", page); + return -EINVAL; + } + port->offload_srq_size = offload_srq_size; + + return count; +} + +CONFIGFS_ATTR(nvmet_, param_offload_srq_size); + +static ssize_t nvmet_param_offload_queue_size_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%u\n", port->offload_queue_size); +} + +static ssize_t nvmet_param_offload_queue_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u32 offload_queue_size; + int ret; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + ret = kstrtou32(page, 0, &offload_queue_size); + if (ret) { + pr_err("Invalid value '%s' for offload_queue_size\n", page); + return -EINVAL; + } + + if (!is_power_of_2(offload_queue_size)){ + pr_err("offload_queue_size is not power of 2\n"); + return -EINVAL; + } + port->offload_queue_size = offload_queue_size; + + return count; +} + +CONFIGFS_ATTR(nvmet_, param_offload_queue_size); + +static ssize_t nvmet_param_offload_passthrough_sqe_rw_show( + struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", + port->offload_passthrough_sqe_rw); +} + +static ssize_t nvmet_param_offload_passthrough_sqe_rw_store( + struct config_item *item, const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + bool enable; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + if (strtobool(page, &enable)) + return -EINVAL; + + port->offload_passthrough_sqe_rw = enable; + + return count; +} + +CONFIGFS_ATTR(nvmet_, param_offload_passthrough_sqe_rw); + +/* + * Namespace structures & file operation functions below + */ +static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path); +} + +static ssize_t nvmet_ns_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys 
= ns->subsys; + size_t len; + int ret; + + mutex_lock(&subsys->lock); + ret = -EBUSY; + if (ns->enabled) + goto out_unlock; + + ret = -EINVAL; + len = strcspn(page, "\n"); + if (!len) + goto out_unlock; + + kfree(ns->device_path); + ret = -ENOMEM; + ns->device_path = kmemdup_nul(page, len, GFP_KERNEL); + if (!ns->device_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + return count; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +CONFIGFS_ATTR(nvmet_ns_, device_path); + +#ifdef CONFIG_PCI_P2PDMA +static ssize_t nvmet_ns_p2pmem_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return pci_p2pdma_enable_show(page, ns->p2p_dev, ns->use_p2pmem); +} + +static ssize_t nvmet_ns_p2pmem_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct pci_dev *p2p_dev = NULL; + bool use_p2pmem; + int ret = count; + int error; + + mutex_lock(&ns->subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + error = pci_p2pdma_enable_store(page, &p2p_dev, &use_p2pmem); + if (error) { + ret = error; + goto out_unlock; + } + + ns->use_p2pmem = use_p2pmem; + pci_dev_put(ns->p2p_dev); + ns->p2p_dev = p2p_dev; + +out_unlock: + mutex_unlock(&ns->subsys->lock); + + return ret; +} + +CONFIGFS_ATTR(nvmet_ns_, p2pmem); +#endif /* CONFIG_PCI_P2PDMA */ + +static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); +} + +static ssize_t nvmet_ns_device_uuid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret = 0; + + mutex_lock(&subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + if (uuid_parse(page, &ns->uuid)) + ret = -EINVAL; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_uuid); + +static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); +} + +static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + u8 nguid[16]; + const char *p = page; + int i; + int ret = 0; + + mutex_lock(&subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + for (i = 0; i < 16; i++) { + if (p + 2 > page + count) { + ret = -EINVAL; + goto out_unlock; + } + if (!isxdigit(p[0]) || !isxdigit(p[1])) { + ret = -EINVAL; + goto out_unlock; + } + + nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]); + p += 2; + + if (*p == '-' || *p == ':') + p++; + } + + memcpy(&ns->nguid, nguid, sizeof(nguid)); +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? 
ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_nguid); + +static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid); +} + +static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + u32 oldgrpid, newgrpid; + int ret; + + ret = kstrtou32(page, 0, &newgrpid); + if (ret) + return ret; + + if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS) + return -EINVAL; + + down_write(&nvmet_ana_sem); + oldgrpid = ns->anagrpid; + nvmet_ana_group_enabled[newgrpid]++; + ns->anagrpid = newgrpid; + nvmet_ana_group_enabled[oldgrpid]--; + nvmet_ana_chgcnt++; + up_write(&nvmet_ana_sem); + + nvmet_send_ana_event(ns->subsys, NULL); + return count; +} + +CONFIGFS_ATTR(nvmet_ns_, ana_grpid); + +static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled); +} + +static ssize_t nvmet_ns_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_ns_enable(ns); + else + nvmet_ns_disable(ns); + + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, enable); + +static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io); +} + +static ssize_t nvmet_ns_buffered_io_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool val; + + if (strtobool(page, &val)) + return -EINVAL; + + mutex_lock(&ns->subsys->lock); + if (ns->enabled) { + pr_err("disable ns before setting buffered_io value.\n"); + mutex_unlock(&ns->subsys->lock); + return -EINVAL; + } + + ns->buffered_io = val; + mutex_unlock(&ns->subsys->lock); + return count; +} + +CONFIGFS_ATTR(nvmet_ns_, buffered_io); + +static ssize_t nvmet_ns_offload_cmd_tmo_us_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%d\n", to_nvmet_ns(item)->offload_cmd_tmo_us); +} + +static ssize_t nvmet_ns_offload_cmd_tmo_us_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + u32 offload_cmd_tmo_us; + int ret; + + ret = kstrtou32(page, 0, &offload_cmd_tmo_us); + if (ret) + return ret; + + mutex_lock(&ns->subsys->lock); + if (ns->enabled) { + pr_err("disable ns before setting offload_cmd_tmo_us value.\n"); + mutex_unlock(&ns->subsys->lock); + return -EINVAL; + } + + ns->offload_cmd_tmo_us = offload_cmd_tmo_us; + mutex_unlock(&ns->subsys->lock); + return count; +} + +CONFIGFS_ATTR(nvmet_ns_, offload_cmd_tmo_us); + +static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool val; + + if (strtobool(page, &val)) + return -EINVAL; + + if (!val) + return -EINVAL; + + mutex_lock(&ns->subsys->lock); + if (!ns->enabled) { + pr_err("enable ns before revalidate.\n"); + mutex_unlock(&ns->subsys->lock); + return -EINVAL; + } + nvmet_ns_revalidate(ns); + mutex_unlock(&ns->subsys->lock); + return count; +} + +CONFIGFS_ATTR_WO(nvmet_ns_, revalidate_size); + +/* + * Offload Namespace attributes and functions below + */ +static ssize_t +nvmet_ns_offload_cmds(struct nvmet_ns *ns, char *page, + u64 (*offload_cmds)(struct nvmet_ns *ns)) +{ + struct nvmet_subsys *subsys = 
ns->subsys; + bool valid = false; + u64 cmds; + + mutex_lock(&subsys->lock); + if (subsys->offloadble && offload_cmds && ns->enabled) { + cmds = offload_cmds(ns); + valid = true; + } + mutex_unlock(&subsys->lock); + + if (valid) + return sprintf(page, "%llu\n", cmds); + else + return sprintf(page, "%d\n", -1); +} + +static ssize_t +nvmet_ns_offload_read_cmds_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_read_cmds); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_read_cmds); + +static ssize_t +nvmet_ns_offload_read_blocks_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_read_blocks); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_read_blocks); + +static ssize_t +nvmet_ns_offload_write_cmds_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_write_cmds); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_write_cmds); + +static ssize_t +nvmet_ns_offload_write_blocks_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_write_blocks); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_write_blocks); + +static ssize_t +nvmet_ns_offload_write_inline_cmds_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_write_inline_cmds); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_write_inline_cmds); + +static ssize_t +nvmet_ns_offload_flush_cmds_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_flush_cmds); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_flush_cmds); + +static ssize_t +nvmet_ns_offload_error_cmds_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_error_cmds); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_error_cmds); + +static ssize_t +nvmet_ns_offload_backend_error_cmds_show(struct config_item *item, char *page) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + return nvmet_ns_offload_cmds(ns, page, + ns->subsys->offload_ns_backend_error_cmds); +} +CONFIGFS_ATTR_RO(nvmet_ns_, offload_backend_error_cmds); + +static struct configfs_attribute *nvmet_ns_attrs[] = { + &nvmet_ns_attr_device_path, + &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_device_uuid, + &nvmet_ns_attr_ana_grpid, + &nvmet_ns_attr_enable, + &nvmet_ns_attr_offload_read_cmds, + &nvmet_ns_attr_offload_read_blocks, + &nvmet_ns_attr_offload_write_cmds, + &nvmet_ns_attr_offload_write_blocks, + &nvmet_ns_attr_offload_write_inline_cmds, + &nvmet_ns_attr_offload_flush_cmds, + &nvmet_ns_attr_offload_error_cmds, + &nvmet_ns_attr_offload_backend_error_cmds, + &nvmet_ns_attr_offload_cmd_tmo_us, + &nvmet_ns_attr_buffered_io, + &nvmet_ns_attr_revalidate_size, +#ifdef CONFIG_PCI_P2PDMA + &nvmet_ns_attr_p2pmem, +#endif + NULL, +}; + +static void nvmet_ns_release(struct config_item *item) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + nvmet_ns_free(ns); +} + +static struct configfs_item_operations nvmet_ns_item_ops = { + .release = nvmet_ns_release, +}; + +static const struct config_item_type nvmet_ns_type = { + .ct_item_ops = &nvmet_ns_item_ops, + .ct_attrs = 
nvmet_ns_attrs, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_offload_ctx_traddr_show(struct config_item *item, char *page) +{ + struct nvmet_offload_ctx *ctx = to_nvmet_offload_ctx(item); + + return snprintf(page, PAGE_SIZE, "%s\n", ctx->port->disc_addr.traddr); +} +CONFIGFS_ATTR_RO(nvmet_offload_ctx_, traddr); + +static ssize_t nvmet_offload_ctx_trsvcid_show(struct config_item *item, char *page) +{ + struct nvmet_offload_ctx *ctx = to_nvmet_offload_ctx(item); + + return snprintf(page, PAGE_SIZE, "%s\n", ctx->port->disc_addr.trsvcid); +} +CONFIGFS_ATTR_RO(nvmet_offload_ctx_, trsvcid); + +static ssize_t +nvmet_offload_ctx_stat_show(struct config_item *item, char *page) +{ + struct nvmet_offload_ctx *ctx = to_nvmet_offload_ctx(item); + struct nvmet_subsys *subsys = ctx->ns->subsys; + bool valid = false; + struct nvmet_ns_counters counters; + + mutex_lock(&subsys->lock); + if (subsys->offloadble && subsys->offload_query_counters && + ctx->ns->enabled) { + subsys->offload_query_counters(ctx->ctx, &counters); + valid = true; + } + mutex_unlock(&subsys->lock); + + if (!valid) + return sprintf(page, "%d\n", -1); + + return sprintf(page, "read_cmd | read_blocks | write_cmd" + " | write_blocks | write_inline_cmd | flush_cmd | error_cmd" + " | backend_error_cmd | last_read_latency_0.1_usec" + " | last_write_latency_0.1_usec | queue_depth\n" + "%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t" + "%llu\n", + counters.num_read_cmd, counters.num_read_blocks, + counters.num_write_cmd, counters.num_write_blocks, + counters.num_write_inline_cmd, counters.num_flush_cmd, + counters.num_error_cmd, counters.num_backend_error_cmd, + counters.last_read_latency, counters.last_write_latency, + counters.queue_depth); +} +CONFIGFS_ATTR_RO(nvmet_offload_ctx_, stat); + +static struct configfs_attribute *nvmet_offload_ctx_attrs[] = { + &nvmet_offload_ctx_attr_traddr, + &nvmet_offload_ctx_attr_trsvcid, + &nvmet_offload_ctx_attr_stat, + NULL, +}; + +static const struct config_item_type nvmet_offload_ctx_type = { + .ct_attrs = nvmet_offload_ctx_attrs, + .ct_owner = THIS_MODULE, +}; + +static const struct config_item_type nvmet_offload_ctxs_type = { + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ns_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item); + struct nvmet_ns *ns; + int ret; + u32 nsid; + + ret = kstrtou32(name, 0, &nsid); + if (ret) + goto out; + + ret = -EINVAL; + if (nsid == 0 || nsid == NVME_NSID_ALL) { + pr_err("invalid nsid %#x", nsid); + goto out; + } + + ret = -ENOMEM; + ns = nvmet_ns_alloc(subsys, nsid); + if (!ns) + goto out; + config_group_init_type_name(&ns->group, name, &nvmet_ns_type); + + config_group_init_type_name(&ns->offload_ctxs_group, + "offload_ctxs", &nvmet_offload_ctxs_type); + configfs_add_default_group(&ns->offload_ctxs_group, + &ns->group); + + pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn); + + return &ns->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_namespaces_group_ops = { + .make_group = nvmet_ns_make, +}; + +static const struct config_item_type nvmet_namespaces_type = { + .ct_group_ops = &nvmet_namespaces_group_ops, + .ct_owner = THIS_MODULE, +}; + +#ifdef CONFIG_NVME_TARGET_PASSTHRU + +static ssize_t nvmet_passthru_device_path_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return snprintf(page, PAGE_SIZE, "%s\n", 
subsys->passthru_ctrl_path); +} + +static ssize_t nvmet_passthru_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + size_t len; + int ret; + + mutex_lock(&subsys->lock); + + ret = -EBUSY; + if (subsys->passthru_ctrl) + goto out_unlock; + + ret = -EINVAL; + len = strcspn(page, "\n"); + if (!len) + goto out_unlock; + + kfree(subsys->passthru_ctrl_path); + ret = -ENOMEM; + subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL); + if (!subsys->passthru_ctrl_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + + return count; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} +CONFIGFS_ATTR(nvmet_passthru_, device_path); + +static ssize_t nvmet_passthru_enable_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + + return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0); +} + +static ssize_t nvmet_passthru_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_passthru_ctrl_enable(subsys); + else + nvmet_passthru_ctrl_disable(subsys); + + return ret ? ret : count; +} +CONFIGFS_ATTR(nvmet_passthru_, enable); + +static ssize_t nvmet_passthru_admin_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->admin_timeout); +} + +static ssize_t nvmet_passthru_admin_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->admin_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, admin_timeout); + +static ssize_t nvmet_passthru_io_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->io_timeout); +} + +static ssize_t nvmet_passthru_io_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->io_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, io_timeout); + +static ssize_t nvmet_passthru_clear_ids_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->clear_ids); +} + +static ssize_t nvmet_passthru_clear_ids_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int clear_ids; + + if (kstrtouint(page, 0, &clear_ids)) + return -EINVAL; + subsys->clear_ids = clear_ids; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, clear_ids); + +static struct configfs_attribute *nvmet_passthru_attrs[] = { + &nvmet_passthru_attr_device_path, + &nvmet_passthru_attr_enable, + &nvmet_passthru_attr_admin_timeout, + &nvmet_passthru_attr_io_timeout, + &nvmet_passthru_attr_clear_ids, + NULL, +}; + +static const struct config_item_type nvmet_passthru_type = { + .ct_attrs = nvmet_passthru_attrs, + .ct_owner = THIS_MODULE, +}; + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ + config_group_init_type_name(&subsys->passthru_group, + "passthru", &nvmet_passthru_type); + configfs_add_default_group(&subsys->passthru_group, + 
&subsys->group); +} + +#else /* CONFIG_NVME_TARGET_PASSTHRU */ + +static void nvmet_add_passthru_group(struct nvmet_subsys *subsys) +{ +} + +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + +static int nvmet_port_subsys_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys; + struct nvmet_subsys_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_subsys_type) { + pr_err("can only link subsystems into the subsystems dir.!\n"); + return -EINVAL; + } + subsys = to_subsys(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->subsys = subsys; + + down_write(&nvmet_config_sem); + ret = -EEXIST; + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto out_free_link; + } + + if (list_empty(&port->subsystems)) { + ret = nvmet_enable_port(port, subsys); + if (ret) + goto out_free_link; + } else if (port->offload && !port->many_offload_subsys_support) { + /* + * This limitation exists only in 1.0 spec. + * Spec 1.1 solved it by passing CNTLID in private data format. + */ + pr_err("Offloaded port restricted to have one subsytem enabled\n"); + ret = -EINVAL; + goto out_free_link; + } else if (port->offload != subsys->offloadble) { + pr_err("can only link subsystems to ports with the same offloadble polarity\n"); + ret = -EINVAL; + goto out_free_link; + } + + list_add_tail(&link->entry, &port->subsystems); + /* We initialize offload subsystem attrs in the first port */ + if (!subsys->num_ports && subsys->offloadble) + nvmet_init_offload_subsystem_port_attrs(port, subsys); + subsys->num_ports++; + nvmet_port_disc_changed(port, subsys); + + up_write(&nvmet_config_sem); + return 0; + +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static void nvmet_port_subsys_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys = to_subsys(target); + struct nvmet_subsys_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto found; + } + up_write(&nvmet_config_sem); + return; + +found: + list_del(&p->entry); + nvmet_port_del_ctrls(port, subsys); + nvmet_port_disc_changed(port, subsys); + + if (list_empty(&port->subsystems)) + nvmet_disable_port(port); + p->subsys->num_ports--; + /* We un-initialize offload subsystem attrs in the last port */ + if (!subsys->num_ports && subsys->offloadble) + nvmet_uninit_offload_subsystem_port_attrs(subsys); + up_write(&nvmet_config_sem); + kfree(p); +} + +static struct configfs_item_operations nvmet_port_subsys_item_ops = { + .allow_link = nvmet_port_subsys_allow_link, + .drop_link = nvmet_port_subsys_drop_link, +}; + +static const struct config_item_type nvmet_port_subsys_type = { + .ct_item_ops = &nvmet_port_subsys_item_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_allowed_hosts_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host; + struct nvmet_host_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_host_type) { + pr_err("can only link hosts into the allowed_hosts directory!\n"); + return -EINVAL; + } + + host = to_host(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->host = host; + + down_write(&nvmet_config_sem); + ret = 
-EINVAL; + if (subsys->allow_any_host) { + pr_err("can't add hosts when allow_any_host is set!\n"); + goto out_free_link; + } + + ret = -EEXIST; + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto out_free_link; + } + list_add_tail(&link->entry, &subsys->hosts); + nvmet_subsys_disc_changed(subsys, host); + + up_write(&nvmet_config_sem); + return 0; +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static void nvmet_allowed_hosts_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host = to_host(target); + struct nvmet_host_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto found; + } + up_write(&nvmet_config_sem); + return; + +found: + list_del(&p->entry); + nvmet_subsys_disc_changed(subsys, host); + + up_write(&nvmet_config_sem); + kfree(p); +} + +static struct configfs_item_operations nvmet_allowed_hosts_item_ops = { + .allow_link = nvmet_allowed_hosts_allow_link, + .drop_link = nvmet_allowed_hosts_drop_link, +}; + +static const struct config_item_type nvmet_allowed_hosts_type = { + .ct_item_ops = &nvmet_allowed_hosts_item_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + to_subsys(item)->allow_any_host); +} + +static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool allow_any_host; + int ret = 0; + + if (strtobool(page, &allow_any_host)) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (allow_any_host && !list_empty(&subsys->hosts)) { + pr_err("Can't set allow_any_host when explicit hosts are set!\n"); + ret = -EINVAL; + goto out_unlock; + } + + if (subsys->allow_any_host != allow_any_host) { + subsys->allow_any_host = allow_any_host; + nvmet_subsys_disc_changed(subsys, NULL); + } + +out_unlock: + up_write(&nvmet_config_sem); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); + +static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + if (NVME_TERTIARY(subsys->ver)) + return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + + return snprintf(page, PAGE_SIZE, "%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver)); +} + +static ssize_t +nvmet_subsys_attr_version_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + int major, minor, tertiary = 0; + int ret; + + if (subsys->subsys_discovered) { + if (NVME_TERTIARY(subsys->ver)) + pr_err("Can't set version number. %llu.%llu.%llu is already assigned\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + else + pr_err("Can't set version number. 
%llu.%llu is already assigned\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver)); + return -EINVAL; + } + + /* passthru subsystems use the underlying controller's version */ + if (nvmet_is_passthru_subsys(subsys)) + return -EINVAL; + + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); + if (ret != 2 && ret != 3) + return -EINVAL; + + subsys->ver = NVME_VS(major, minor, tertiary); + + return count; +} + +static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_version_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_version); + +/* See Section 1.5 of NVMe 1.4 */ +static bool nvmet_is_ascii(const char c) +{ + return c >= 0x20 && c <= 0x7e; +} + +static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return snprintf(page, PAGE_SIZE, "%.*s\n", + NVMET_SN_MAX_SIZE, subsys->serial); +} + +static ssize_t +nvmet_subsys_attr_serial_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + int pos, len = strcspn(page, "\n"); + + if (subsys->subsys_discovered) { + pr_err("Can't set serial number. %s is already assigned\n", + subsys->serial); + return -EINVAL; + } + + if (!len || len > NVMET_SN_MAX_SIZE) { + pr_err("Serial Number can not be empty or exceed %d Bytes\n", + NVMET_SN_MAX_SIZE); + return -EINVAL; + } + + for (pos = 0; pos < len; pos++) { + if (!nvmet_is_ascii(page[pos])) { + pr_err("Serial Number must contain only ASCII strings\n"); + return -EINVAL; + } + } + + memcpy_and_pad(subsys->serial, NVMET_SN_MAX_SIZE, page, len, ' '); + + return count; +} + +static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_serial_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_serial); + +static ssize_t nvmet_subsys_attr_cntlid_min_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_min); +} + +static ssize_t nvmet_subsys_attr_cntlid_min_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_min; + + if (sscanf(page, "%hu\n", &cntlid_min) != 1) + return -EINVAL; + + if (cntlid_min == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (cntlid_min >= to_subsys(item)->cntlid_max) + goto out_unlock; + to_subsys(item)->cntlid_min = cntlid_min; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_min); + +static ssize_t nvmet_subsys_attr_cntlid_max_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_max); +} + +static ssize_t nvmet_subsys_attr_cntlid_max_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_max; + + if (sscanf(page, "%hu\n", &cntlid_max) != 1) + return -EINVAL; + + if (cntlid_max == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (cntlid_max <= 
to_subsys(item)->cntlid_min) + goto out_unlock; + to_subsys(item)->cntlid_max = cntlid_max; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max); + +static ssize_t nvmet_subsys_attr_model_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return snprintf(page, PAGE_SIZE, "%s\n", subsys->model_number); +} + +static ssize_t nvmet_subsys_attr_model_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + int pos = 0, len; + + if (subsys->subsys_discovered) { + pr_err("Can't set model number. %s is already assigned\n", + subsys->model_number); + return -EINVAL; + } + + len = strcspn(page, "\n"); + if (!len) + return -EINVAL; + + if (len > NVMET_MN_MAX_SIZE) { + pr_err("Model number size can not exceed %d Bytes\n", + NVMET_MN_MAX_SIZE); + return -EINVAL; + } + + for (pos = 0; pos < len; pos++) { + if (!nvmet_is_ascii(page[pos])) + return -EINVAL; + } + + subsys->model_number = kmemdup_nul(page, len, GFP_KERNEL); + if (!subsys->model_number) + return -ENOMEM; + return count; +} + +static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_model_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_model); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", to_subsys(item)->pi_support); +} + +static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool pi_enable; + + if (strtobool(page, &pi_enable)) + return -EINVAL; + + subsys->pi_support = pi_enable; + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable); +#endif + +static ssize_t +nvmet_subsys_attr_offload_subsys_unknown_ns_cmds_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool valid = false; + u64 unknown_cmds; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + if (subsys->offloadble && subsys->offload_subsys_unknown_ns_cmds) { + unknown_cmds = subsys->offload_subsys_unknown_ns_cmds(subsys); + valid = true; + } + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + if (valid) + return snprintf(page, PAGE_SIZE, "%llu\n", unknown_cmds); + else + return snprintf(page, PAGE_SIZE, "%d\n", -1); +} +CONFIGFS_ATTR_RO(nvmet_subsys_, attr_offload_subsys_unknown_ns_cmds); + +static ssize_t nvmet_subsys_attr_offload_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + to_subsys(item)->offloadble); +} + +static ssize_t nvmet_subsys_attr_offload_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool offload; + struct nvmet_ns *ns; + int ret = 0; + unsigned long idx; + + if (strtobool(page, &offload)) + return -EINVAL; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + if (subsys->offloadble == offload) + goto out_unlock; + + if (!list_empty(&subsys->ctrls)) { + pr_err("Can't update offload polarity with active controller!\n"); + ret = -EBUSY; + goto 
out_unlock; + } + + if (subsys->num_ports) { + pr_err("Can't update offload polarity with active port!\n"); + ret = -EBUSY; + goto out_unlock; + } + + if (offload) { + xa_for_each(&subsys->namespaces, idx, ns) { + ns->pdev = nvme_find_pdev_from_bdev(ns->bdev); + if (!ns->pdev) { + pr_err("Couldn't find nvme pci device from device %s\n", + ns->device_path); + ret = -EINVAL; + goto out_unlock; + } + } + } + + xa_for_each(&subsys->namespaces, idx, ns) { + if (offload) { + pci_dev_get(ns->pdev); + } else { + WARN_ON_ONCE(!ns->pdev); + pci_dev_put(ns->pdev); + ns->pdev = NULL; + } + } + + subsys->offloadble = offload; +out_unlock: + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + return ret ? ret : count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_offload); + +static struct configfs_attribute *nvmet_subsys_attrs[] = { + &nvmet_subsys_attr_attr_allow_any_host, + &nvmet_subsys_attr_attr_version, + &nvmet_subsys_attr_attr_serial, + &nvmet_subsys_attr_attr_cntlid_min, + &nvmet_subsys_attr_attr_cntlid_max, + &nvmet_subsys_attr_attr_model, +#ifdef CONFIG_BLK_DEV_INTEGRITY + &nvmet_subsys_attr_attr_pi_enable, +#endif + &nvmet_subsys_attr_attr_offload, + &nvmet_subsys_attr_attr_offload_subsys_unknown_ns_cmds, + NULL, +}; + +/* + * Subsystem structures & folder operation functions below + */ +static void nvmet_subsys_release(struct config_item *item) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + nvmet_subsys_del_ctrls(subsys); + nvmet_subsys_put(subsys); +} + +static struct configfs_item_operations nvmet_subsys_item_ops = { + .release = nvmet_subsys_release, +}; + +static const struct config_item_type nvmet_subsys_type = { + .ct_item_ops = &nvmet_subsys_item_ops, + .ct_attrs = nvmet_subsys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_subsys_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys; + + if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) { + pr_err("can't create discovery subsystem through configfs\n"); + return ERR_PTR(-EINVAL); + } + + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); + if (IS_ERR(subsys)) + return ERR_CAST(subsys); + + config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type); + + config_group_init_type_name(&subsys->namespaces_group, + "namespaces", &nvmet_namespaces_type); + configfs_add_default_group(&subsys->namespaces_group, &subsys->group); + + config_group_init_type_name(&subsys->allowed_hosts_group, + "allowed_hosts", &nvmet_allowed_hosts_type); + configfs_add_default_group(&subsys->allowed_hosts_group, + &subsys->group); + + nvmet_add_passthru_group(subsys); + + return &subsys->group; +} + +static struct configfs_group_operations nvmet_subsystems_group_ops = { + .make_group = nvmet_subsys_make, +}; + +static const struct config_item_type nvmet_subsystems_type = { + .ct_group_ops = &nvmet_subsystems_group_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_referral_enable_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled); +} + +static ssize_t nvmet_referral_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); + struct nvmet_port *port = to_nvmet_port(item); + bool enable; + + if (strtobool(page, &enable)) + goto inval; + + if (enable) + nvmet_referral_enable(parent, port); + else + nvmet_referral_disable(parent, port); + + return count; +inval: + pr_err("Invalid value '%s' for enable\n", page); + return 
-EINVAL; +} + +CONFIGFS_ATTR(nvmet_referral_, enable); + +/* + * Discovery Service subsystem definitions + */ +static struct configfs_attribute *nvmet_referral_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_portid, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + &nvmet_referral_attr_enable, + NULL, +}; + +static void nvmet_referral_notify(struct config_group *group, + struct config_item *item) +{ + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); + struct nvmet_port *port = to_nvmet_port(item); + + nvmet_referral_disable(parent, port); +} + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + kfree(port); +} + +static struct configfs_item_operations nvmet_referral_item_ops = { + .release = nvmet_referral_release, +}; + +static const struct config_item_type nvmet_referral_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = nvmet_referral_attrs, + .ct_item_ops = &nvmet_referral_item_ops, +}; + +static struct config_group *nvmet_referral_make( + struct config_group *group, const char *name) +{ + struct nvmet_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + config_group_init_type_name(&port->group, name, &nvmet_referral_type); + + return &port->group; +} + +static struct configfs_group_operations nvmet_referral_group_ops = { + .make_group = nvmet_referral_make, + .disconnect_notify = nvmet_referral_notify, +}; + +static const struct config_item_type nvmet_referrals_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &nvmet_referral_group_ops, +}; + +static struct nvmet_type_name_map nvmet_ana_state[] = { + { NVME_ANA_OPTIMIZED, "optimized" }, + { NVME_ANA_NONOPTIMIZED, "non-optimized" }, + { NVME_ANA_INACCESSIBLE, "inaccessible" }, + { NVME_ANA_PERSISTENT_LOSS, "persistent-loss" }, + { NVME_ANA_CHANGE, "change" }, +}; + +static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item, + char *page) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + enum nvme_ana_state state = grp->port->ana_state[grp->grpid]; + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_ana_state); i++) { + if (state == nvmet_ana_state[i].type) + return sprintf(page, "%s\n", nvmet_ana_state[i].name); + } + + return sprintf(page, "\n"); +} + +static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + enum nvme_ana_state *ana_state = grp->port->ana_state; + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_ana_state); i++) { + if (sysfs_streq(page, nvmet_ana_state[i].name)) + goto found; + } + + pr_err("Invalid value '%s' for ana_state\n", page); + return -EINVAL; + +found: + down_write(&nvmet_ana_sem); + ana_state[grp->grpid] = (enum nvme_ana_state) nvmet_ana_state[i].type; + nvmet_ana_chgcnt++; + up_write(&nvmet_ana_sem); + nvmet_port_send_ana_event(grp->port); + return count; +} + +CONFIGFS_ATTR(nvmet_ana_group_, ana_state); + +static struct configfs_attribute *nvmet_ana_group_attrs[] = { + &nvmet_ana_group_attr_ana_state, + NULL, +}; + +static void nvmet_ana_group_release(struct config_item *item) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + + if (grp == &grp->port->ana_default_group) + return; + + down_write(&nvmet_ana_sem); + grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE; + nvmet_ana_group_enabled[grp->grpid]--; + up_write(&nvmet_ana_sem); + + 
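The ana_state attribute above accepts exactly the strings listed in nvmet_ana_state[] ("optimized", "non-optimized", "inaccessible", "persistent-loss", "change") and is normally driven from user space through configfs. A minimal user-space sketch, assuming configfs is mounted at /sys/kernel/config and that port 1 with ANA group 2 already exists (paths are illustrative, not taken from this patch):

/* Hypothetical userspace helper: set and read back an nvmet ANA group state. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *path =
                "/sys/kernel/config/nvmet/ports/1/ana_groups/2/ana_state";
        char buf[32] = "";
        int fd = open(path, O_RDWR);

        if (fd < 0) {
                perror("open ana_state");
                return 1;
        }
        /* Accepted values mirror nvmet_ana_state[]: "optimized",
         * "non-optimized", "inaccessible", "persistent-loss", "change". */
        if (write(fd, "inaccessible\n", 13) < 0)
                perror("write ana_state");
        lseek(fd, 0, SEEK_SET);
        if (read(fd, buf, sizeof(buf) - 1) > 0)
                printf("ana_state is now: %s", buf);
        close(fd);
        return 0;
}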
nvmet_port_send_ana_event(grp->port); + kfree(grp); +} + +static struct configfs_item_operations nvmet_ana_group_item_ops = { + .release = nvmet_ana_group_release, +}; + +static const struct config_item_type nvmet_ana_group_type = { + .ct_item_ops = &nvmet_ana_group_item_ops, + .ct_attrs = nvmet_ana_group_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ana_groups_make_group( + struct config_group *group, const char *name) +{ + struct nvmet_port *port = ana_groups_to_port(&group->cg_item); + struct nvmet_ana_group *grp; + u32 grpid; + int ret; + + ret = kstrtou32(name, 0, &grpid); + if (ret) + goto out; + + ret = -EINVAL; + if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS) + goto out; + + ret = -ENOMEM; + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + goto out; + grp->port = port; + grp->grpid = grpid; + + down_write(&nvmet_ana_sem); + nvmet_ana_group_enabled[grpid]++; + up_write(&nvmet_ana_sem); + + nvmet_port_send_ana_event(grp->port); + + config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type); + return &grp->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_ana_groups_group_ops = { + .make_group = nvmet_ana_groups_make_group, +}; + +static const struct config_item_type nvmet_ana_groups_type = { + .ct_group_ops = &nvmet_ana_groups_group_ops, + .ct_owner = THIS_MODULE, +}; + +/* + * Ports definitions. + */ +static void nvmet_port_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + /* Let inflight controllers teardown complete */ + flush_scheduled_work(); + list_del(&port->global_entry); + + kfree(port->ana_state); + kfree(port); +} + +static struct configfs_attribute *nvmet_port_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + &nvmet_attr_addr_tractive, + &nvmet_attr_param_inline_data_size, + &nvmet_attr_param_offload_queues, + &nvmet_attr_param_offload_srq_size, + &nvmet_attr_param_offload_queue_size, + &nvmet_attr_param_offload_passthrough_sqe_rw, +#ifdef CONFIG_BLK_DEV_INTEGRITY + &nvmet_attr_param_pi_enable, +#endif + NULL, +}; + +static struct configfs_item_operations nvmet_port_item_ops = { + .release = nvmet_port_release, +}; + +static const struct config_item_type nvmet_port_type = { + .ct_attrs = nvmet_port_attrs, + .ct_item_ops = &nvmet_port_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ports_make(struct config_group *group, + const char *name) +{ + struct nvmet_port *port; + u16 portid; + u32 i; + + if (kstrtou16(name, 0, &portid)) + return ERR_PTR(-EINVAL); + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1, + sizeof(*port->ana_state), GFP_KERNEL); + if (!port->ana_state) { + kfree(port); + return ERR_PTR(-ENOMEM); + } + + for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) { + if (i == NVMET_DEFAULT_ANA_GRPID) + port->ana_state[1] = NVME_ANA_OPTIMIZED; + else + port->ana_state[i] = NVME_ANA_INACCESSIBLE; + } + + list_add(&port->global_entry, &nvmet_ports_list); + + INIT_LIST_HEAD(&port->entry); + INIT_LIST_HEAD(&port->subsystems); + INIT_LIST_HEAD(&port->referrals); + port->inline_data_size = -1; /* < 0 == let the transport choose */ + port->offload_queues = 1; + port->offload_srq_size = 1024; + port->offload_queue_size = NVMET_QUEUE_SIZE; + + port->disc_addr.portid = cpu_to_le16(portid); + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX; + 
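nvmet_ports_make() above only allocates the port and fills in defaults (inline_data_size = -1 so the transport chooses, one offload queue, a 1024-entry offload SRQ); the transport address and the subsystem link are written later from user space. A hedged user-space sketch of that flow, assuming an RDMA port "1" and an already created subsystem "testnqn" (NQN, address and service id are illustrative):

/* Hypothetical userspace sketch: create nvmet port 1, set its RDMA address
 * attributes, then expose subsystem "testnqn" on it via a configfs symlink. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static int put(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0) {
                perror(path);
                return -1;
        }
        n = write(fd, val, strlen(val));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        const char *port = "/sys/kernel/config/nvmet/ports/1";
        char path[256];

        mkdir(port, 0755);              /* triggers nvmet_ports_make("1") */

        snprintf(path, sizeof(path), "%s/addr_trtype", port);
        put(path, "rdma");
        snprintf(path, sizeof(path), "%s/addr_adrfam", port);
        put(path, "ipv4");
        snprintf(path, sizeof(path), "%s/addr_traddr", port);
        put(path, "192.168.0.10");
        snprintf(path, sizeof(path), "%s/addr_trsvcid", port);
        put(path, "4420");

        /* The link triggers nvmet_port_subsys_allow_link(), which enables
         * the port on first use and checks the offload polarity. */
        snprintf(path, sizeof(path), "%s/subsystems/testnqn", port);
        symlink("/sys/kernel/config/nvmet/subsystems/testnqn", path);
        return 0;
}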
port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW; + config_group_init_type_name(&port->group, name, &nvmet_port_type); + + config_group_init_type_name(&port->subsys_group, + "subsystems", &nvmet_port_subsys_type); + configfs_add_default_group(&port->subsys_group, &port->group); + + config_group_init_type_name(&port->referrals_group, + "referrals", &nvmet_referrals_type); + configfs_add_default_group(&port->referrals_group, &port->group); + + config_group_init_type_name(&port->ana_groups_group, + "ana_groups", &nvmet_ana_groups_type); + configfs_add_default_group(&port->ana_groups_group, &port->group); + + port->ana_default_group.port = port; + port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID; + config_group_init_type_name(&port->ana_default_group.group, + __stringify(NVMET_DEFAULT_ANA_GRPID), + &nvmet_ana_group_type); + configfs_add_default_group(&port->ana_default_group.group, + &port->ana_groups_group); + + return &port->group; +} + +static struct configfs_group_operations nvmet_ports_group_ops = { + .make_group = nvmet_ports_make, +}; + +static const struct config_item_type nvmet_ports_type = { + .ct_group_ops = &nvmet_ports_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_subsystems_group; +static struct config_group nvmet_ports_group; + +static void nvmet_host_release(struct config_item *item) +{ + struct nvmet_host *host = to_host(item); + + kfree(host); +} + +static struct configfs_item_operations nvmet_host_item_ops = { + .release = nvmet_host_release, +}; + +static const struct config_item_type nvmet_host_type = { + .ct_item_ops = &nvmet_host_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_hosts_make_group(struct config_group *group, + const char *name) +{ + struct nvmet_host *host; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&host->group, name, &nvmet_host_type); + + return &host->group; +} + +static struct configfs_group_operations nvmet_hosts_group_ops = { + .make_group = nvmet_hosts_make_group, +}; + +static const struct config_item_type nvmet_hosts_type = { + .ct_group_ops = &nvmet_hosts_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_hosts_group; + +static const struct config_item_type nvmet_root_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nvmet_configfs_subsystem = { + .su_group = { + .cg_item = { + .ci_namebuf = "nvmet", + .ci_type = &nvmet_root_type, + }, + }, +}; + +void nvmet_offload_ctx_configfs_del(struct nvmet_offload_ctx *ctx) +{ + if (d_inode(ctx->group.cg_item.ci_dentry)) { + pr_info("Removing offload ctx %d from configfs\n", ctx->id); + configfs_unregister_group(&ctx->group); + } +} +EXPORT_SYMBOL_GPL(nvmet_offload_ctx_configfs_del); + +int nvmet_offload_ctx_configfs_create(struct nvmet_offload_ctx *ctx) +{ + int res = 0; + char name[CONFIGFS_ITEM_NAME_LEN]; + + sprintf(name, "%d", ctx->id); + pr_info("Adding offload ctx %s to configfs\n", name); + + config_group_init_type_name(&ctx->group, name, &nvmet_offload_ctx_type); + + res = configfs_register_group(&ctx->ns->offload_ctxs_group, + &ctx->group); + if (res) + pr_err("failed to register configfs group for offload ctx %s\n", + name); + + return res; +} +EXPORT_SYMBOL_GPL(nvmet_offload_ctx_configfs_create); + +int __init nvmet_init_configfs(void) +{ + int ret; + + config_group_init(&nvmet_configfs_subsystem.su_group); + mutex_init(&nvmet_configfs_subsystem.su_mutex); + + 
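nvmet_init_configfs() follows the stock kernel configfs pattern: initialize the root group and su_mutex, attach default child groups, then register the subsystem. A stripped-down sketch of the same call sequence for a hypothetical "demo" subsystem with one default "items" group (not part of nvmet, shown only to make the pattern explicit):

/* Minimal configfs-subsystem sketch mirroring the registration pattern used
 * by nvmet_init_configfs(); the names "demo" and "items" are hypothetical. */
#include <linux/module.h>
#include <linux/configfs.h>

static const struct config_item_type demo_items_type = {
        .ct_owner = THIS_MODULE,
};

static const struct config_item_type demo_root_type = {
        .ct_owner = THIS_MODULE,
};

static struct config_group demo_items_group;

static struct configfs_subsystem demo_subsys = {
        .su_group = {
                .cg_item = {
                        .ci_namebuf = "demo",
                        .ci_type = &demo_root_type,
                },
        },
};

static int __init demo_init(void)
{
        config_group_init(&demo_subsys.su_group);
        mutex_init(&demo_subsys.su_mutex);

        /* Default groups appear automatically once the root is registered. */
        config_group_init_type_name(&demo_items_group, "items",
                                    &demo_items_type);
        configfs_add_default_group(&demo_items_group, &demo_subsys.su_group);

        return configfs_register_subsystem(&demo_subsys);
}

static void __exit demo_exit(void)
{
        configfs_unregister_subsystem(&demo_subsys);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");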
config_group_init_type_name(&nvmet_subsystems_group, + "subsystems", &nvmet_subsystems_type); + configfs_add_default_group(&nvmet_subsystems_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_ports_group, + "ports", &nvmet_ports_type); + configfs_add_default_group(&nvmet_ports_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_hosts_group, + "hosts", &nvmet_hosts_type); + configfs_add_default_group(&nvmet_hosts_group, + &nvmet_configfs_subsystem.su_group); + + ret = configfs_register_subsystem(&nvmet_configfs_subsystem); + if (ret) { + pr_err("configfs_register_subsystem: %d\n", ret); + return ret; + } + + return 0; +} + +void __exit nvmet_exit_configfs(void) +{ + configfs_unregister_subsystem(&nvmet_configfs_subsystem); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/core.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/core.c new file mode 100644 index 0000000..1796aaf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/core.c @@ -0,0 +1,1863 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common code for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "trace.h" + +#include "nvmet.h" + +struct workqueue_struct *buffered_io_wq; +struct workqueue_struct *zbd_wq; +static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; +static DEFINE_IDA(cntlid_ida); + +/* + * This read/write semaphore is used to synchronize access to configuration + * information on a target system that will result in discovery log page + * information change for at least one host. + * The full list of resources to protected by this semaphore is: + * + * - subsystems list + * - per-subsystem allowed hosts list + * - allow_any_host subsystem attribute + * - nvmet_genctr + * - the nvmet_transports array + * + * When updating any of those lists/structures write lock should be obtained, + * while when reading (popolating discovery log page or checking host-subsystem + * link) read lock is obtained to allow concurrent reads. 
+ */ +DECLARE_RWSEM(nvmet_config_sem); + +u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; +u64 nvmet_ana_chgcnt; +DECLARE_RWSEM(nvmet_ana_sem); + +inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) +{ + switch (errno) { + case 0: + return NVME_SC_SUCCESS; + case -ENOSPC: + req->error_loc = offsetof(struct nvme_rw_command, length); + return NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + case -EREMOTEIO: + req->error_loc = offsetof(struct nvme_rw_command, slba); + return NVME_SC_LBA_RANGE | NVME_SC_DNR; + case -EOPNOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + return NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + default: + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case -ENODATA: + req->error_loc = offsetof(struct nvme_rw_command, nsid); + return NVME_SC_ACCESS_DENIED; + case -EIO: + fallthrough; + default: + req->error_loc = offsetof(struct nvme_common_command, opcode); + return NVME_SC_INTERNAL | NVME_SC_DNR; + } +} + +u16 nvmet_report_invalid_opcode(struct nvmet_req *req) +{ + pr_debug("unhandled cmd %d on qid %d\n", req->cmd->common.opcode, + req->sq->qid); + + req->error_loc = offsetof(struct nvme_common_command, opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len) +{ + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } + return 0; +} + +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) +{ + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } + return 0; +} + +u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) +{ + if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } + return 0; +} + +static u32 nvmet_max_nsid(struct nvmet_subsys *subsys) +{ + struct nvmet_ns *cur; + unsigned long idx; + u32 nsid = 0; + + xa_for_each(&subsys->namespaces, idx, cur) + nsid = cur->nsid; + + return nsid; +} + +static u32 nvmet_async_event_result(struct nvmet_async_event *aen) +{ + return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); +} + +static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl) +{ + struct nvmet_req *req; + + mutex_lock(&ctrl->lock); + while (ctrl->nr_async_event_cmds) { + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR); + mutex_lock(&ctrl->lock); + } + mutex_unlock(&ctrl->lock); +} + +static void nvmet_async_events_process(struct nvmet_ctrl *ctrl) +{ + struct nvmet_async_event *aen; + struct nvmet_req *req; + + mutex_lock(&ctrl->lock); + while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) { + aen = list_first_entry(&ctrl->async_events, + struct nvmet_async_event, entry); + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + nvmet_set_result(req, nvmet_async_event_result(aen)); + + list_del(&aen->entry); + kfree(aen); + + mutex_unlock(&ctrl->lock); + 
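nvmet_async_event_result() above packs the AEN completion dword the way the NVMe specification lays it out: event type in bits 02:00, event information in bits 15:08, log page identifier in bits 23:16. A standalone illustration using spec values rather than the kernel's enum names (the macro names below are local to this example):

/* Illustration of the AEN result packing done by nvmet_async_event_result().
 * Numeric values follow the NVMe specification, not the kernel headers. */
#include <stdint.h>
#include <stdio.h>

#define AE_TYPE_NOTICE          0x2     /* Asynchronous Event Type: Notice */
#define AE_INFO_NS_CHANGED      0x0     /* Namespace Attribute Changed */
#define LOG_CHANGED_NS_LIST     0x04    /* Changed Namespace List log page */

static uint32_t aen_result(uint8_t type, uint8_t info, uint8_t log_page)
{
        return type | (uint32_t)info << 8 | (uint32_t)log_page << 16;
}

int main(void)
{
        /* The host sees this value in the completion of its AER command and
         * then fetches the Changed Namespace List log page. */
        printf("NS-changed AEN result dword: 0x%08x\n",
               aen_result(AE_TYPE_NOTICE, AE_INFO_NS_CHANGED,
                          LOG_CHANGED_NS_LIST));
        return 0;
}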
trace_nvmet_async_event(ctrl, req->cqe->result.u32); + nvmet_req_complete(req, 0); + mutex_lock(&ctrl->lock); + } + mutex_unlock(&ctrl->lock); +} + +static void nvmet_async_events_free(struct nvmet_ctrl *ctrl) +{ + struct nvmet_async_event *aen, *tmp; + + mutex_lock(&ctrl->lock); + list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) { + list_del(&aen->entry); + kfree(aen); + } + mutex_unlock(&ctrl->lock); +} + +static void nvmet_async_event_work(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, async_event_work); + + nvmet_async_events_process(ctrl); +} + +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page) +{ + struct nvmet_async_event *aen; + + aen = kmalloc(sizeof(*aen), GFP_KERNEL); + if (!aen) + return; + + aen->event_type = event_type; + aen->event_info = event_info; + aen->log_page = log_page; + + mutex_lock(&ctrl->lock); + list_add_tail(&aen->entry, &ctrl->async_events); + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid) +{ + u32 i; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES) + goto out_unlock; + + for (i = 0; i < ctrl->nr_changed_ns; i++) { + if (ctrl->changed_ns_list[i] == nsid) + goto out_unlock; + } + + if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) { + ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff); + ctrl->nr_changed_ns = U32_MAX; + goto out_unlock; + } + + ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid; +out_unlock: + mutex_unlock(&ctrl->lock); +} + +void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ctrl *ctrl; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR)) + continue; + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_NS_CHANGED, + NVME_LOG_CHANGED_NS); + } +} + +void nvmet_send_ana_event(struct nvmet_subsys *subsys, + struct nvmet_port *port) +{ + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (port && ctrl->port != port) + continue; + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE)) + continue; + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_ANA, NVME_LOG_ANA); + } + mutex_unlock(&subsys->lock); +} + +void nvmet_port_send_ana_event(struct nvmet_port *port) +{ + struct nvmet_subsys_link *p; + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) + nvmet_send_ana_event(p->subsys, port); + up_read(&nvmet_config_sem); +} + +int nvmet_register_transport(const struct nvmet_fabrics_ops *ops) +{ + int ret = 0; + + down_write(&nvmet_config_sem); + if (nvmet_transports[ops->type]) + ret = -EINVAL; + else + nvmet_transports[ops->type] = ops; + up_write(&nvmet_config_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_register_transport); + +void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops) +{ + down_write(&nvmet_config_sem); + nvmet_transports[ops->type] = NULL; + up_write(&nvmet_config_sem); +} +EXPORT_SYMBOL_GPL(nvmet_unregister_transport); + +void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys) +{ + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if 
(ctrl->port == port) + ctrl->ops->delete_ctrl(ctrl); + } + mutex_unlock(&subsys->lock); +} + +static bool nvmet_peer_to_peer_capable(struct nvmet_port *port) +{ + const struct nvmet_fabrics_ops *ops; + + lockdep_assert_held(&nvmet_config_sem); + + ops = nvmet_transports[port->disc_addr.trtype]; + if (ops->peer_to_peer_capable && + ops->install_queue && + ops->create_offload_ctrl && + ops->destroy_offload_ctrl && + ops->enable_offload_ns && + ops->disable_offload_ns && + ops->peer_to_peer_sqe_inline_size && + ops->peer_to_peer_mdts && + ops->offload_subsys_unknown_ns_cmds && + ops->offload_ns_read_cmds && + ops->offload_ns_read_blocks && + ops->offload_ns_write_cmds && + ops->offload_ns_write_blocks && + ops->offload_ns_write_inline_cmds && + ops->offload_ns_flush_cmds && + ops->offload_ns_error_cmds && + ops->offload_ns_backend_error_cmds && + ops->offload_query_counters && + ops->check_subsys_match_offload_port) + return ops->peer_to_peer_capable(port); + + return false; +} + +void nvmet_init_offload_subsystem_port_attrs(struct nvmet_port *port, + struct nvmet_subsys *subsys) +{ + const struct nvmet_fabrics_ops *ops = port->tr_ops; + + lockdep_assert_held(&nvmet_config_sem); + WARN_ON_ONCE(subsys->num_ports || !subsys->offloadble); + + if (!subsys->offload_subsys_unknown_ns_cmds) + subsys->offload_subsys_unknown_ns_cmds = + ops->offload_subsys_unknown_ns_cmds; + if (!subsys->offload_ns_read_cmds) + subsys->offload_ns_read_cmds = + ops->offload_ns_read_cmds; + if (!subsys->offload_ns_read_blocks) + subsys->offload_ns_read_blocks = + ops->offload_ns_read_blocks; + if (!subsys->offload_ns_write_cmds) + subsys->offload_ns_write_cmds = + ops->offload_ns_write_cmds; + if (!subsys->offload_ns_write_blocks) + subsys->offload_ns_write_blocks = + ops->offload_ns_write_blocks; + if (!subsys->offload_ns_write_inline_cmds) + subsys->offload_ns_write_inline_cmds = + ops->offload_ns_write_inline_cmds; + if (!subsys->offload_ns_flush_cmds) + subsys->offload_ns_flush_cmds = + ops->offload_ns_flush_cmds; + if (!subsys->offload_ns_error_cmds) + subsys->offload_ns_error_cmds = + ops->offload_ns_error_cmds; + if (!subsys->offload_ns_backend_error_cmds) + subsys->offload_ns_backend_error_cmds = + ops->offload_ns_backend_error_cmds; + if (!subsys->offload_query_counters) + subsys->offload_query_counters = + ops->offload_query_counters; +} + +void nvmet_uninit_offload_subsystem_port_attrs(struct nvmet_subsys *subsys) +{ + lockdep_assert_held(&nvmet_config_sem); + WARN_ON_ONCE(subsys->num_ports || !subsys->offloadble); + + subsys->offload_ns_backend_error_cmds = NULL; + subsys->offload_ns_error_cmds = NULL; + subsys->offload_ns_flush_cmds = NULL; + subsys->offload_ns_write_inline_cmds = NULL; + subsys->offload_ns_write_blocks = NULL; + subsys->offload_ns_write_cmds = NULL; + subsys->offload_ns_read_blocks = NULL; + subsys->offload_ns_read_cmds = NULL; + subsys->offload_subsys_unknown_ns_cmds = NULL; + subsys->offload_query_counters = NULL; +} + +int nvmet_enable_port(struct nvmet_port *port, struct nvmet_subsys *subsys) +{ + const struct nvmet_fabrics_ops *ops; + int ret; + + lockdep_assert_held(&nvmet_config_sem); + + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + up_write(&nvmet_config_sem); + request_module("nvmet-transport-%d", port->disc_addr.trtype); + down_write(&nvmet_config_sem); + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + pr_err("transport type %d not supported\n", + port->disc_addr.trtype); + return -EINVAL; + } + } + + if (!try_module_get(ops->owner)) + return 
-EINVAL; + + /* + * If the user requested PI support and the transport isn't pi capable, + * don't enable the port. + */ + if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) { + pr_err("T10-PI is not supported by transport type %d\n", + port->disc_addr.trtype); + ret = -EINVAL; + goto out_module_put; + } + + if (port->pi_enable && subsys->offloadble) { + pr_err("T10-PI is not supported on offloadble subsystem\n"); + ret = -EINVAL; + goto out_module_put; + } + + ret = ops->add_port(port); + if (ret) + goto out_module_put; + + if (subsys->offloadble && !nvmet_peer_to_peer_capable(port)) { + ret = -EINVAL; + goto out_remove_port; + } + + if (subsys->offloadble && + !ops->check_subsys_match_offload_port(port, subsys)) { + ret = -EINVAL; + goto out_remove_port; + } + + /* If the transport didn't set inline_data_size, then disable it. */ + if (port->inline_data_size < 0) + port->inline_data_size = 0; + + port->enabled = true; + port->offload = subsys->offloadble; + port->tr_ops = ops; + return 0; + + +out_remove_port: + ops->remove_port(port); +out_module_put: + module_put(ops->owner); + return ret; +} + +void nvmet_disable_port(struct nvmet_port *port) +{ + + lockdep_assert_held(&nvmet_config_sem); + + port->enabled = false; + port->tr_ops->remove_port(port); + module_put(port->tr_ops->owner); + port->tr_ops = NULL; + + port->offload = false; +} + +bool nvmet_is_port_active(struct nvmet_port *port) +{ + if (port->tr_ops && port->tr_ops->is_port_active) + return port->tr_ops->is_port_active(port); + + return port->enabled; +} + +static void nvmet_keep_alive_timer(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvmet_ctrl, ka_work); + bool reset_tbkas = ctrl->reset_tbkas; + + ctrl->reset_tbkas = false; + if (reset_tbkas) { + pr_debug("ctrl %d reschedule traffic based keep-alive timer\n", + ctrl->cntlid); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + return; + } + + pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", + ctrl->cntlid, ctrl->kato); + + nvmet_ctrl_fatal_error(ctrl); +} + +void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + pr_debug("ctrl %d start keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); + + cancel_delayed_work_sync(&ctrl->ka_work); +} + +u16 nvmet_req_find_ns(struct nvmet_req *req) +{ + u32 nsid = le32_to_cpu(req->cmd->common.nsid); + + req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid); + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); + return NVME_SC_INVALID_NS | NVME_SC_DNR; + } + + percpu_ref_get(&req->ns->ref); + return NVME_SC_SUCCESS; +} + +static void nvmet_destroy_namespace(struct percpu_ref *ref) +{ + struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref); + + complete(&ns->disable_done); +} + +void nvmet_put_namespace(struct nvmet_ns *ns) +{ + percpu_ref_put(&ns->ref); +} + +static void nvmet_ns_dev_disable(struct nvmet_ns *ns) +{ + nvmet_bdev_ns_disable(ns); + nvmet_file_ns_disable(ns); +} + +static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns) +{ + int ret; + struct pci_dev *p2p_dev; + + if (!ns->use_p2pmem) + return 0; + + if (!ns->bdev) { + pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n"); + return 
-EINVAL; + } + + if (!blk_queue_pci_p2pdma(ns->bdev->bd_disk->queue)) { + pr_err("peer-to-peer DMA is not supported by the driver of %s\n", + ns->device_path); + return -EINVAL; + } + + if (ns->p2p_dev) { + ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true); + if (ret < 0) + return -EINVAL; + } else { + /* + * Right now we just check that there is p2pmem available so + * we can report an error to the user right away if there + * is not. We'll find the actual device to use once we + * setup the controller when the port's device is available. + */ + + p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns)); + if (!p2p_dev) { + pr_err("no peer-to-peer memory is available for %s\n", + ns->device_path); + return -EINVAL; + } + + pci_dev_put(p2p_dev); + } + + return 0; +} + +/* + * Note: ctrl->subsys->lock should be held when calling this function + */ +static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl, + struct nvmet_ns *ns) +{ + struct device *clients[2]; + struct pci_dev *p2p_dev; + int ret; + + if (!ctrl->p2p_client || !ns->use_p2pmem) + return; + + if (ns->p2p_dev) { + ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true); + if (ret < 0) + return; + + p2p_dev = pci_dev_get(ns->p2p_dev); + } else { + clients[0] = ctrl->p2p_client; + clients[1] = nvmet_ns_dev(ns); + + p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients)); + if (!p2p_dev) { + pr_err("no peer-to-peer memory is available that's supported by %s and %s\n", + dev_name(ctrl->p2p_client), ns->device_path); + return; + } + } + + ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev); + if (ret < 0) + pci_dev_put(p2p_dev); + + pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev), + ns->nsid); +} + +void nvmet_ns_revalidate(struct nvmet_ns *ns) +{ + loff_t oldsize = ns->size; + + if (ns->bdev) + nvmet_bdev_ns_revalidate(ns); + else + nvmet_file_ns_revalidate(ns); + + if (oldsize != ns->size) + nvmet_ns_changed(ns->subsys, ns->nsid); +} + +static int nvmet_offload_ns_enable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_port *port; + struct nvmet_subsys_link *s; + struct nvmet_ctrl *ctrl; + int ret = 0; + + list_for_each_entry(port, nvmet_ports, global_entry) { + list_for_each_entry(s, &port->subsystems, entry) { + if (s->subsys != subsys) + continue; + if (!port->tr_ops->check_subsys_match_offload_port(port, + subsys)) + return -EINVAL; + } + } + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->offload_ctrl) { + ret = ctrl->ops->enable_offload_ns(ctrl, ns); + if (ret) + return ret; + } + } + + return ret; +} + + +int nvmet_ns_enable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + int ret; + + mutex_lock(&subsys->lock); + ret = 0; + + if (nvmet_is_passthru_subsys(subsys)) { + pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); + goto out_unlock; + } + + if (ns->enabled) + goto out_unlock; + + ret = -EMFILE; + if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) + goto out_unlock; + + ret = nvmet_bdev_ns_enable(ns); + if (ret == -ENOTBLK) + ret = nvmet_file_ns_enable(ns); + if (ret) + goto out_unlock; + + ret = nvmet_p2pmem_ns_enable(ns); + if (ret) + goto out_dev_disable; + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_p2pmem_ns_add_p2p(ctrl, ns); + + if (subsys->offloadble) { + ns->pdev = nvme_find_pdev_from_bdev(ns->bdev); + if (!ns->pdev) { + pr_err("Couldn't find nvme pci device from device %s\n", + ns->device_path); + ret = -EINVAL; + 
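nvmet_p2pmem_ns_enable() and nvmet_p2pmem_ns_add_p2p() above use the PCI peer-to-peer DMA helpers in a check-then-pick fashion: validate a user-supplied provider with pci_p2pdma_distance(), or probe for any reachable provider with pci_p2pmem_find(). A small sketch of that pattern, where "provider" and "client" are placeholders (for example the namespace's backing NVMe device and the RDMA HCA):

/* Sketch of the p2pmem availability check pattern used by the nvmet code
 * around here; not a drop-in replacement for either function. */
#include <linux/errno.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>

static int check_p2pmem(struct pci_dev *provider, struct device *client)
{
        struct pci_dev *found;

        if (provider) {
                /* Explicit provider: just make sure the DMA distance allows
                 * peer-to-peer transactions between the two devices. */
                return pci_p2pdma_distance(provider, client, true) < 0 ?
                        -EINVAL : 0;
        }

        /* No provider given: see whether any p2pmem publisher can reach the
         * client at all; the actual device is chosen later, per controller. */
        found = pci_p2pmem_find(client);
        if (!found)
                return -EINVAL;
        pci_dev_put(found);     /* only probing, drop the reference */
        return 0;
}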
goto out_dev_put; + } + pci_dev_get(ns->pdev); + } + + ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, + 0, GFP_KERNEL); + if (ret) + goto out_pdev_put; + + ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL); + if (ret) + goto out_restore_subsys_maxnsid; + + subsys->nr_namespaces++; + + if (ns->pdev) { + ret = nvmet_offload_ns_enable(ns); + if (ret) + goto out_remove_list; + } + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = ns->nsid; + + nvmet_ns_changed(subsys, ns->nsid); + + ns->enabled = true; + ret = 0; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +out_remove_list: + subsys->nr_namespaces--; + xa_erase(&ns->subsys->namespaces, ns->nsid); +out_restore_subsys_maxnsid: + subsys->max_nsid = nvmet_max_nsid(subsys); + percpu_ref_exit(&ns->ref); +out_pdev_put: + if (ns->pdev) { + pci_dev_put(ns->pdev); + ns->pdev = NULL; + } +out_dev_put: + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); +out_dev_disable: + nvmet_ns_dev_disable(ns); + goto out_unlock; +} + +void nvmet_ns_disable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + if (!ns->enabled) + goto out_unlock; + + ns->enabled = false; + xa_erase(&ns->subsys->namespaces, ns->nsid); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); + + mutex_unlock(&subsys->lock); + + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. 
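The comment above describes the namespace teardown ordering; the underlying mechanism is the usual percpu_ref kill/wait/exit sequence paired with a completion in the release callback. A minimal sketch of that lifecycle on a made-up demo_obj structure (illustrative only, not nvmet code):

/* Sketch of the percpu_ref lifecycle used for nvmet namespaces and queues:
 * init with a release callback, kill, wait for the last reference, exit. */
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/completion.h>
#include <linux/slab.h>

struct demo_obj {
        struct percpu_ref ref;
        struct completion gone;
};

static void demo_obj_release(struct percpu_ref *ref)
{
        struct demo_obj *obj = container_of(ref, struct demo_obj, ref);

        complete(&obj->gone);           /* last reference dropped */
}

static struct demo_obj *demo_obj_create(void)
{
        struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

        if (!obj)
                return NULL;
        init_completion(&obj->gone);
        if (percpu_ref_init(&obj->ref, demo_obj_release, 0, GFP_KERNEL)) {
                kfree(obj);
                return NULL;
        }
        return obj;
}

static void demo_obj_destroy(struct demo_obj *obj)
{
        percpu_ref_kill(&obj->ref);      /* no new percpu_ref_tryget_live() */
        wait_for_completion(&obj->gone); /* all outstanding refs dropped */
        percpu_ref_exit(&obj->ref);
        kfree(obj);
}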
+ */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + + mutex_lock(&subsys->lock); + if (ns->pdev) { + struct nvmet_ctrl *tmp; + + list_for_each_entry_safe(ctrl, tmp, &subsys->ctrls, + subsys_entry) { + if (ctrl->offload_ctrl) { + mutex_unlock(&subsys->lock); + ctrl->ops->disable_offload_ns(ctrl, ns); + mutex_lock(&subsys->lock); + } + } + } + + subsys->nr_namespaces--; + nvmet_ns_changed(subsys, ns->nsid); + nvmet_ns_dev_disable(ns); + if (ns->pdev) { + pci_dev_put(ns->pdev); + ns->pdev = NULL; + } +out_unlock: + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + nvmet_ns_disable(ns); + + down_write(&nvmet_ana_sem); + nvmet_ana_group_enabled[ns->anagrpid]--; + up_write(&nvmet_ana_sem); + + kfree(ns->device_path); + kfree(ns); +} + +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ns *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + + init_completion(&ns->disable_done); + + ns->nsid = nsid; + ns->subsys = subsys; + + down_write(&nvmet_ana_sem); + ns->anagrpid = NVMET_DEFAULT_ANA_GRPID; + nvmet_ana_group_enabled[ns->anagrpid]++; + up_write(&nvmet_ana_sem); + + uuid_gen(&ns->uuid); + ns->buffered_io = false; + ns->csi = NVME_CSI_NVM; + ns->offload_cmd_tmo_us = NVMET_DEFAULT_CMD_TIMEOUT_USEC; + + return ns; +} + +static void nvmet_update_sq_head(struct nvmet_req *req) +{ + if (req->sq->size) { + u32 old_sqhd, new_sqhd; + + do { + old_sqhd = req->sq->sqhd; + new_sqhd = (old_sqhd + 1) % req->sq->size; + } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != + old_sqhd); + } + req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); +} + +static void nvmet_set_error(struct nvmet_req *req, u16 status) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_error_slot *new_error_slot; + unsigned long flags; + + req->cqe->status = cpu_to_le16(status << 1); + + if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC) + return; + + spin_lock_irqsave(&ctrl->error_lock, flags); + ctrl->err_counter++; + new_error_slot = + &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS]; + + new_error_slot->error_count = cpu_to_le64(ctrl->err_counter); + new_error_slot->sqid = cpu_to_le16(req->sq->qid); + new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id); + new_error_slot->status_field = cpu_to_le16(status << 1); + new_error_slot->param_error_location = cpu_to_le16(req->error_loc); + new_error_slot->lba = cpu_to_le64(req->error_slba); + new_error_slot->nsid = req->cmd->common.nsid; + spin_unlock_irqrestore(&ctrl->error_lock, flags); + + /* set the more bit for this request */ + req->cqe->status |= cpu_to_le16(1 << 14); +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + struct nvmet_ns *ns = req->ns; + + if (!req->sq->sqhd_disabled) + nvmet_update_sq_head(req); + req->cqe->sq_id = cpu_to_le16(req->sq->qid); + req->cqe->command_id = req->cmd->common.command_id; + + if (unlikely(status)) + nvmet_set_error(req, status); + + trace_nvmet_req_complete(req); + + req->ops->queue_response(req); + if (ns) + nvmet_put_namespace(ns); +} + +void nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + __nvmet_req_complete(req, status); + percpu_ref_put(&req->sq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_req_complete); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, + u16 qid, u16 size) +{ + cq->qid = qid; + cq->size = size; +} + +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + u16 
qid, u16 size) +{ + sq->sqhd = 0; + sq->qid = qid; + sq->size = size; + + ctrl->sqs[qid] = sq; +} + +static void nvmet_confirm_sq(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->confirm_done); +} + +void nvmet_sq_destroy(struct nvmet_sq *sq) +{ + struct nvmet_ctrl *ctrl = sq->ctrl; + + /* + * If this is the admin queue, complete all AERs so that our + * queue doesn't have outstanding requests on it. + */ + if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq) + nvmet_async_events_failall(ctrl); + percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq); + wait_for_completion(&sq->confirm_done); + wait_for_completion(&sq->free_done); + percpu_ref_exit(&sq->ref); + + if (ctrl) { + /* + * The teardown flow may take some time, and the host may not + * send us keep-alive during this period, hence reset the + * traffic based keep-alive timer so we don't trigger a + * controller teardown as a result of a keep-alive expiration. + */ + ctrl->reset_tbkas = true; + sq->ctrl->sqs[sq->qid] = NULL; + nvmet_ctrl_put(ctrl); + sq->ctrl = NULL; /* allows reusing the queue later */ + } +} +EXPORT_SYMBOL_GPL(nvmet_sq_destroy); + +static void nvmet_sq_free(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->free_done); +} + +int nvmet_sq_init(struct nvmet_sq *sq) +{ + int ret; + + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + return ret; + } + init_completion(&sq->free_done); + init_completion(&sq->confirm_done); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_sq_init); + +static inline u16 nvmet_check_ana_state(struct nvmet_port *port, + struct nvmet_ns *ns) +{ + enum nvme_ana_state state = port->ana_state[ns->anagrpid]; + + if (unlikely(state == NVME_ANA_INACCESSIBLE)) + return NVME_SC_ANA_INACCESSIBLE; + if (unlikely(state == NVME_ANA_PERSISTENT_LOSS)) + return NVME_SC_ANA_PERSISTENT_LOSS; + if (unlikely(state == NVME_ANA_CHANGE)) + return NVME_SC_ANA_TRANSITION; + return 0; +} + +static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req) +{ + if (unlikely(req->ns->readonly)) { + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_flush: + break; + default: + return NVME_SC_NS_WRITE_PROTECTED; + } + } + + return 0; +} + +static u16 nvmet_parse_io_cmd(struct nvmet_req *req) +{ + u16 ret; + + ret = nvmet_check_ctrl_status(req); + if (unlikely(ret)) + return ret; + + if (nvmet_is_passthru_req(req)) + return nvmet_parse_passthru_io_cmd(req); + + ret = nvmet_req_find_ns(req); + if (unlikely(ret)) + return ret; + + ret = nvmet_check_ana_state(req->port, req->ns); + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); + return ret; + } + ret = nvmet_io_cmd_check_access(req); + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); + return ret; + } + + switch (req->ns->csi) { + case NVME_CSI_NVM: + if (req->ns->file) + return nvmet_file_parse_io_cmd(req); + return nvmet_bdev_parse_io_cmd(req); + case NVME_CSI_ZNS: + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + return nvmet_bdev_zns_parse_io_cmd(req); + return NVME_SC_INVALID_IO_CMD_SET; + default: + return NVME_SC_INVALID_IO_CMD_SET; + } +} + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops) +{ + u8 flags = req->cmd->common.flags; + u16 status; + + req->cq = cq; + req->sq = sq; + req->ops = ops; + req->sg = NULL; + req->metadata_sg = 
NULL; + req->sg_cnt = 0; + req->metadata_sg_cnt = 0; + req->transfer_len = 0; + req->metadata_len = 0; + req->cqe->status = 0; + req->cqe->sq_head = 0; + req->ns = NULL; + req->error_loc = NVMET_NO_ERROR_LOC; + req->error_slba = 0; + + /* no support for fused commands yet */ + if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + req->error_loc = offsetof(struct nvme_common_command, flags); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + /* + * For fabrics, PSDT field shall describe metadata pointer (MPTR) that + * contains an address of a single contiguous physical buffer that is + * byte aligned. + */ + if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { + req->error_loc = offsetof(struct nvme_common_command, flags); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + if (unlikely(!req->sq->ctrl)) + /* will return an error for any non-connect command: */ + status = nvmet_parse_connect_cmd(req); + else if (likely(req->sq->qid != 0)) + status = nvmet_parse_io_cmd(req); + else + status = nvmet_parse_admin_cmd(req); + + if (status) + goto fail; + + trace_nvmet_req_init(req, req->cmd); + + if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + if (sq->ctrl) + sq->ctrl->reset_tbkas = true; + + return true; + +fail: + __nvmet_req_complete(req, status); + return false; +} +EXPORT_SYMBOL_GPL(nvmet_req_init); + +void nvmet_req_uninit(struct nvmet_req *req) +{ + percpu_ref_put(&req->sq->ref); + if (req->ns) + nvmet_put_namespace(req->ns); +} +EXPORT_SYMBOL_GPL(nvmet_req_uninit); + +bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len) +{ + if (unlikely(len != req->transfer_len)) { + req->error_loc = offsetof(struct nvme_common_command, dptr); + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(nvmet_check_transfer_len); + +bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len) +{ + if (unlikely(data_len > req->transfer_len)) { + req->error_loc = offsetof(struct nvme_common_command, dptr); + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + return false; + } + + return true; +} + +static unsigned int nvmet_data_transfer_len(struct nvmet_req *req) +{ + return req->transfer_len - req->metadata_len; +} + +static int nvmet_req_alloc_p2pmem_sgls(struct pci_dev *p2p_dev, + struct nvmet_req *req) +{ + req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt, + nvmet_data_transfer_len(req)); + if (!req->sg) + goto out_err; + + if (req->metadata_len) { + req->metadata_sg = pci_p2pmem_alloc_sgl(p2p_dev, + &req->metadata_sg_cnt, req->metadata_len); + if (!req->metadata_sg) + goto out_free_sg; + } + + req->p2p_dev = p2p_dev; + + return 0; +out_free_sg: + pci_p2pmem_free_sgl(req->p2p_dev, req->sg); +out_err: + return -ENOMEM; +} + +static struct pci_dev *nvmet_req_find_p2p_dev(struct nvmet_req *req) +{ + if (!IS_ENABLED(CONFIG_PCI_P2PDMA) || + !req->sq->ctrl || !req->sq->qid || !req->ns) + return NULL; + return radix_tree_lookup(&req->sq->ctrl->p2p_ns_map, req->ns->nsid); +} + +int nvmet_req_alloc_sgls(struct nvmet_req *req) +{ + struct pci_dev *p2p_dev = nvmet_req_find_p2p_dev(req); + + if (p2p_dev && !nvmet_req_alloc_p2pmem_sgls(p2p_dev, req)) + return 0; + + req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL, + &req->sg_cnt); + if (unlikely(!req->sg)) + goto out; + + if (req->metadata_len) { + req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL, + 
&req->metadata_sg_cnt); + if (unlikely(!req->metadata_sg)) + goto out_free; + } + + return 0; +out_free: + sgl_free(req->sg); +out: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls); + +void nvmet_req_free_sgls(struct nvmet_req *req) +{ + if (req->p2p_dev) { + pci_p2pmem_free_sgl(req->p2p_dev, req->sg); + if (req->metadata_sg) + pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg); + req->p2p_dev = NULL; + } else { + sgl_free(req->sg); + if (req->metadata_sg) + sgl_free(req->metadata_sg); + } + + req->sg = NULL; + req->metadata_sg = NULL; + req->sg_cnt = 0; + req->metadata_sg_cnt = 0; +} +EXPORT_SYMBOL_GPL(nvmet_req_free_sgls); + +static inline bool nvmet_cc_en(u32 cc) +{ + return (cc >> NVME_CC_EN_SHIFT) & 0x1; +} + +static inline u8 nvmet_cc_css(u32 cc) +{ + return (cc >> NVME_CC_CSS_SHIFT) & 0x7; +} + +static inline u8 nvmet_cc_mps(u32 cc) +{ + return (cc >> NVME_CC_MPS_SHIFT) & 0xf; +} + +static inline u8 nvmet_cc_ams(u32 cc) +{ + return (cc >> NVME_CC_AMS_SHIFT) & 0x7; +} + +static inline u8 nvmet_cc_shn(u32 cc) +{ + return (cc >> NVME_CC_SHN_SHIFT) & 0x3; +} + +static inline u8 nvmet_cc_iosqes(u32 cc) +{ + return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; +} + +static inline u8 nvmet_cc_iocqes(u32 cc) +{ + return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; +} + +static inline bool nvmet_css_supported(u8 cc_css) +{ + switch (cc_css <<= NVME_CC_CSS_SHIFT) { + case NVME_CC_CSS_NVM: + case NVME_CC_CSS_CSI: + return true; + default: + return false; + } +} + +static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + /* + * Only I/O controllers should verify iosqes,iocqes. + * Strictly speaking, the spec says a discovery controller + * should verify iosqes,iocqes are zeroed, however that + * would break backwards compatibility, so don't enforce it. + */ + if (!nvmet_is_disc_subsys(ctrl->subsys) && + (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || + nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + if (nvmet_cc_mps(ctrl->cc) != 0 || + nvmet_cc_ams(ctrl->cc) != 0 || + !nvmet_css_supported(nvmet_cc_css(ctrl->cc))) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + ctrl->csts = NVME_CSTS_RDY; + + /* + * Controllers that are not yet enabled should not really enforce the + * keep alive timeout, but we still want to track a timeout and cleanup + * in case a host died before it enabled the controller. Hence, simply + * reset the keep alive timer when the controller is enabled. + */ + if (ctrl->kato) + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); +} + +static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + /* XXX: tear down queues? 
*/ + ctrl->csts &= ~NVME_CSTS_RDY; + ctrl->cc = 0; +} + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new) +{ + u32 old; + + mutex_lock(&ctrl->lock); + old = ctrl->cc; + ctrl->cc = new; + + if (nvmet_cc_en(new) && !nvmet_cc_en(old)) + nvmet_start_ctrl(ctrl); + if (!nvmet_cc_en(new) && nvmet_cc_en(old)) + nvmet_clear_ctrl(ctrl); + if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) { + nvmet_clear_ctrl(ctrl); + ctrl->csts |= NVME_CSTS_SHST_CMPLT; + } + if (!nvmet_cc_shn(new) && nvmet_cc_shn(old)) + ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; + mutex_unlock(&ctrl->lock); +} + +static void nvmet_init_cap(struct nvmet_ctrl *ctrl) +{ + /* command sets supported: NVMe command set: */ + ctrl->cap = (1ULL << 37); + /* Controller supports one or more I/O Command Sets */ + ctrl->cap |= (1ULL << 43); + /* CC.EN timeout in 500msec units: */ + ctrl->cap |= (15ULL << 24); + /* maximum queue entries supported: */ + if (ctrl->ops->get_max_queue_size) + ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1; + else + ctrl->cap |= NVMET_QUEUE_SIZE - 1; + + if (nvmet_is_passthru_subsys(ctrl->subsys)) + nvmet_passthrough_override_cap(ctrl); +} + +struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, + const char *hostnqn, u16 cntlid, + struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = NULL; + struct nvmet_subsys *subsys; + + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); + goto out; + } + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->cntlid == cntlid) { + if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) { + pr_warn("hostnqn mismatch.\n"); + continue; + } + if (!kref_get_unless_zero(&ctrl->ref)) + continue; + + /* ctrl found */ + goto found; + } + } + + ctrl = NULL; /* ctrl not found */ + pr_warn("could not find controller %d for subsys %s / host %s\n", + cntlid, subsysnqn, hostnqn); + req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); + +found: + mutex_unlock(&subsys->lock); + nvmet_subsys_put(subsys); +out: + return ctrl; +} + +u16 nvmet_check_ctrl_status(struct nvmet_req *req) +{ + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("got cmd %d while CC.EN == 0 on qid = %d\n", + req->cmd->common.opcode, req->sq->qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n", + req->cmd->common.opcode, req->sq->qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + return 0; +} + +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) +{ + struct nvmet_host_link *p; + + lockdep_assert_held(&nvmet_config_sem); + + if (subsys->allow_any_host) + return true; + + if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */ + return true; + + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), hostnqn)) + return true; + } + + return false; +} + +/* + * Note: ctrl->subsys->lock should be held when calling this function + */ +static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, + struct nvmet_req *req) +{ + struct nvmet_ns *ns; + unsigned long idx; + + if (!req->p2p_client) + return; + + ctrl->p2p_client = get_device(req->p2p_client); + + xa_for_each(&ctrl->subsys->namespaces, idx, ns) + nvmet_p2pmem_ns_add_p2p(ctrl, ns); +} + +/* + * Note: ctrl->subsys->lock should be held when calling this function + */ 
+static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl) +{ + struct radix_tree_iter iter; + void __rcu **slot; + + radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0) + pci_dev_put(radix_tree_deref_slot(slot)); + + put_device(ctrl->p2p_client); +} + +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + int ret; + u16 status; + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); + req->error_loc = offsetof(struct nvme_common_command, dptr); + goto out; + } + + down_read(&nvmet_config_sem); + if (!nvmet_host_allowed(subsys, hostnqn)) { + pr_info("connect by host %s for subsystem %s not allowed\n", + hostnqn, subsysnqn); + req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); + up_read(&nvmet_config_sem); + status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_common_command, dptr); + goto out_put_subsystem; + } + up_read(&nvmet_config_sem); + + status = NVME_SC_INTERNAL; + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + goto out_put_subsystem; + mutex_init(&ctrl->lock); + + ctrl->port = req->port; + ctrl->ops = req->ops; + +#ifdef CONFIG_NVME_TARGET_PASSTHRU + /* By default, set loop targets to clear IDS by default */ + if (ctrl->port->disc_addr.trtype == NVMF_TRTYPE_LOOP) + subsys->clear_ids = 1; +#endif + + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); + INIT_LIST_HEAD(&ctrl->async_events); + INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL); + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); + INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer); + + memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); + memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + + kref_init(&ctrl->ref); + ctrl->subsys = subsys; + nvmet_init_cap(ctrl); + WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL); + + ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES, + sizeof(__le32), GFP_KERNEL); + if (!ctrl->changed_ns_list) + goto out_free_ctrl; + + ctrl->sqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_sq *), + GFP_KERNEL); + if (!ctrl->sqs) + goto out_free_changed_ns_list; + + if (subsys->cntlid_min > subsys->cntlid_max) + goto out_free_sqs; + + ret = ida_simple_get(&cntlid_ida, + subsys->cntlid_min, subsys->cntlid_max, + GFP_KERNEL); + if (ret < 0) { + status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + goto out_free_sqs; + } + ctrl->cntlid = ret; + + /* + * Discovery controllers may use some arbitrary high value + * in order to cleanup stale discovery sessions + */ + if (nvmet_is_disc_subsys(ctrl->subsys) && !kato) + kato = NVMET_DISC_KATO_MS; + + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + + ctrl->err_counter = 0; + spin_lock_init(&ctrl->error_lock); + + if (ctrl->port->offload) + ctrl->sqe_inline_size = ctrl->ops->peer_to_peer_sqe_inline_size(ctrl); + else + ctrl->sqe_inline_size = req->port->inline_data_size; + + nvmet_start_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); 
+ list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + nvmet_setup_p2p_ns_map(ctrl, req); + mutex_unlock(&subsys->lock); + + *ctrlp = ctrl; + return 0; + +out_free_sqs: + kfree(ctrl->sqs); +out_free_changed_ns_list: + kfree(ctrl->changed_ns_list); +out_free_ctrl: + kfree(ctrl); +out_put_subsystem: + nvmet_subsys_put(subsys); +out: + return status; +} + +static void nvmet_ctrl_free(struct kref *ref) +{ + struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); + struct nvmet_subsys *subsys = ctrl->subsys; + + mutex_lock(&subsys->lock); + nvmet_release_p2p_ns_map(ctrl); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + + if (ctrl->offload_ctrl) + ctrl->ops->destroy_offload_ctrl(ctrl); + + nvmet_stop_keep_alive_timer(ctrl); + + flush_work(&ctrl->async_event_work); + cancel_work_sync(&ctrl->fatal_err_work); + + ida_simple_remove(&cntlid_ida, ctrl->cntlid); + + nvmet_async_events_free(ctrl); + kfree(ctrl->sqs); + kfree(ctrl->changed_ns_list); + kfree(ctrl); + + nvmet_subsys_put(subsys); +} + +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvmet_ctrl_free); +} + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) +{ + mutex_lock(&ctrl->lock); + if (!(ctrl->csts & NVME_CSTS_CFS)) { + ctrl->csts |= NVME_CSTS_CFS; + schedule_work(&ctrl->fatal_err_work); + } + mutex_unlock(&ctrl->lock); +} +EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn) +{ + struct nvmet_subsys_link *p; + + if (!port) + return NULL; + + if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) { + if (!kref_get_unless_zero(&nvmet_disc_subsys->ref)) + return NULL; + return nvmet_disc_subsys; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (!strncmp(p->subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&p->subsys->ref)) + break; + up_read(&nvmet_config_sem); + return p->subsys; + } + } + up_read(&nvmet_config_sem); + return NULL; +} + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type) +{ + struct nvmet_subsys *subsys; + char serial[NVMET_SN_MAX_SIZE / 2]; + int ret; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return ERR_PTR(-ENOMEM); + + subsys->ver = NVMET_DEFAULT_VS; + /* generate a random serial number as our controllers are ephemeral: */ + get_random_bytes(&serial, sizeof(serial)); + bin2hex(subsys->serial, &serial, sizeof(serial)); + + subsys->model_number = kstrdup(NVMET_DEFAULT_CTRL_MODEL, GFP_KERNEL); + if (!subsys->model_number) { + ret = -ENOMEM; + goto free_subsys; + } + + switch (type) { + case NVME_NQN_NVME: + subsys->max_qid = NVMET_NR_QUEUES; + break; + case NVME_NQN_DISC: + case NVME_NQN_CURR: + subsys->max_qid = 0; + break; + default: + pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); + ret = -EINVAL; + goto free_mn; + } + subsys->type = type; + subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, + GFP_KERNEL); + if (!subsys->subsysnqn) { + ret = -ENOMEM; + goto free_mn; + } + subsys->cntlid_min = NVME_CNTLID_MIN; + subsys->cntlid_max = NVME_CNTLID_MAX; + kref_init(&subsys->ref); + + mutex_init(&subsys->lock); + xa_init(&subsys->namespaces); + INIT_LIST_HEAD(&subsys->ctrls); + INIT_LIST_HEAD(&subsys->hosts); + + return subsys; + +free_mn: + kfree(subsys->model_number); +free_subsys: + kfree(subsys); + return ERR_PTR(ret); +} + +static void nvmet_subsys_free(struct kref *ref) +{ + struct nvmet_subsys *subsys = + 
container_of(ref, struct nvmet_subsys, ref); + + WARN_ON_ONCE(!xa_empty(&subsys->namespaces)); + + xa_destroy(&subsys->namespaces); + nvmet_passthru_subsys_free(subsys); + + kfree(subsys->subsysnqn); + kfree(subsys->model_number); + kfree(subsys); +} + +void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys) +{ + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + ctrl->ops->delete_ctrl(ctrl); + mutex_unlock(&subsys->lock); +} + +void nvmet_subsys_put(struct nvmet_subsys *subsys) +{ + kref_put(&subsys->ref, nvmet_subsys_free); +} + +static int __init nvmet_init(void) +{ + int error; + + nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1; + + zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0); + if (!zbd_wq) + return -ENOMEM; + + buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", + WQ_MEM_RECLAIM, 0); + if (!buffered_io_wq) { + error = -ENOMEM; + goto out_free_zbd_work_queue; + } + + error = nvmet_init_discovery(); + if (error) + goto out_free_work_queue; + + error = nvmet_init_configfs(); + if (error) + goto out_exit_discovery; + return 0; + +out_exit_discovery: + nvmet_exit_discovery(); +out_free_work_queue: + destroy_workqueue(buffered_io_wq); +out_free_zbd_work_queue: + destroy_workqueue(zbd_wq); + return error; +} + +static void __exit nvmet_exit(void) +{ + nvmet_exit_configfs(); + nvmet_exit_discovery(); + ida_destroy(&cntlid_ida); + destroy_workqueue(buffered_io_wq); + destroy_workqueue(zbd_wq); + + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); +} + +module_init(nvmet_init); +module_exit(nvmet_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/discovery.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/discovery.c new file mode 100644 index 0000000..964d97b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/discovery.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Discovery service for the NVMe over Fabrics target. + * Copyright (C) 2016 Intel Corporation. All rights reserved. 
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/slab.h>
+#include <generated/utsrelease.h>
+#include "nvmet.h"
+
+struct nvmet_subsys *nvmet_disc_subsys;
+
+static u64 nvmet_genctr;
+
+static void __nvmet_disc_changed(struct nvmet_port *port,
+				 struct nvmet_ctrl *ctrl)
+{
+	if (ctrl->port != port)
+		return;
+
+	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE))
+		return;
+
+	nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+			      NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC);
+}
+
+void nvmet_port_disc_changed(struct nvmet_port *port,
+			     struct nvmet_subsys *subsys)
+{
+	struct nvmet_ctrl *ctrl;
+
+	lockdep_assert_held(&nvmet_config_sem);
+	nvmet_genctr++;
+
+	mutex_lock(&nvmet_disc_subsys->lock);
+	list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
+		if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn))
+			continue;
+
+		__nvmet_disc_changed(port, ctrl);
+	}
+	mutex_unlock(&nvmet_disc_subsys->lock);
+
+	/* If transport can signal change, notify transport */
+	if (port->tr_ops && port->tr_ops->discovery_chg)
+		port->tr_ops->discovery_chg(port);
+}
+
+static void __nvmet_subsys_disc_changed(struct nvmet_port *port,
+					struct nvmet_subsys *subsys,
+					struct nvmet_host *host)
+{
+	struct nvmet_ctrl *ctrl;
+
+	mutex_lock(&nvmet_disc_subsys->lock);
+	list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
+		if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn))
+			continue;
+
+		__nvmet_disc_changed(port, ctrl);
+	}
+	mutex_unlock(&nvmet_disc_subsys->lock);
+}
+
+void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
+			       struct nvmet_host *host)
+{
+	struct nvmet_port *port;
+	struct nvmet_subsys_link *s;
+
+	lockdep_assert_held(&nvmet_config_sem);
+	nvmet_genctr++;
+
+	list_for_each_entry(port, nvmet_ports, global_entry)
+		list_for_each_entry(s, &port->subsystems, entry) {
+			if (s->subsys != subsys)
+				continue;
+			__nvmet_subsys_disc_changed(port, subsys, host);
+		}
+}
+
+void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
+{
+	down_write(&nvmet_config_sem);
+	if (list_empty(&port->entry)) {
+		list_add_tail(&port->entry, &parent->referrals);
+		port->enabled = true;
+		nvmet_port_disc_changed(parent, NULL);
+	}
+	up_write(&nvmet_config_sem);
+}
+
+void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port)
+{
+	down_write(&nvmet_config_sem);
+	if (!list_empty(&port->entry)) {
+		port->enabled = false;
+		list_del_init(&port->entry);
+		nvmet_port_disc_changed(parent, NULL);
+	}
+	up_write(&nvmet_config_sem);
+}
+
+static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
+		struct nvmet_port *port, char *subsys_nqn, char *traddr,
+		u8 type, u32 numrec)
+{
+	struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec];
+
+	e->trtype = port->disc_addr.trtype;
+	e->adrfam = port->disc_addr.adrfam;
+	e->treq = port->disc_addr.treq;
+	e->portid = port->disc_addr.portid;
+	/* we support only dynamic controllers */
+	e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC);
+	e->asqsz = cpu_to_le16(NVME_AQ_DEPTH);
+	e->subtype = type;
+	memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
+	memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE);
+	memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
+	strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
+}
+
+/*
+ * nvmet_set_disc_traddr - set a correct discovery log entry traddr
+ *
+ * IP based transports (e.g RDMA) can listen on "any" ipv4/ipv6 addresses
+ * (INADDR_ANY or IN6ADDR_ANY_INIT).
The discovery log page traddr reply + * must not contain that "any" IP address. If the transport implements + * .disc_traddr, use it. this callback will set the discovery traddr + * from the req->port address in case the port in question listens + * "any" IP address. + */ +static void nvmet_set_disc_traddr(struct nvmet_req *req, struct nvmet_port *port, + char *traddr) +{ + if (req->ops->disc_traddr) + req->ops->disc_traddr(req, port, traddr); + else + memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); +} + +static size_t discovery_log_entries(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_subsys_link *p; + struct nvmet_port *r; + size_t entries = 1; + + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) + continue; + entries++; + } + list_for_each_entry(r, &req->port->referrals, entry) + entries++; + return entries; +} + +static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) +{ + const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_disc_rsp_page_hdr *hdr; + u64 offset = nvmet_get_log_page_offset(req->cmd); + size_t data_len = nvmet_get_log_page_len(req->cmd); + size_t alloc_len; + struct nvmet_subsys_link *p; + struct nvmet_port *r; + u32 numrec = 0; + u16 status = 0; + void *buffer; + char traddr[NVMF_TRADDR_SIZE]; + + if (!nvmet_check_transfer_len(req, data_len)) + return; + + if (req->cmd->get_log_page.lid != NVME_LOG_DISC) { + req->error_loc = + offsetof(struct nvme_get_log_page_command, lid); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + + /* Spec requires dword aligned offsets */ + if (offset & 0x3) { + req->error_loc = + offsetof(struct nvme_get_log_page_command, lpo); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + + /* + * Make sure we're passing at least a buffer of response header size. + * If host provided data len is less than the header size, only the + * number of bytes requested by host will be sent to host. 
+ */ + down_read(&nvmet_config_sem); + alloc_len = sizeof(*hdr) + entry_size * discovery_log_entries(req); + buffer = kzalloc(alloc_len, GFP_KERNEL); + if (!buffer) { + up_read(&nvmet_config_sem); + status = NVME_SC_INTERNAL; + goto out; + } + hdr = buffer; + + nvmet_set_disc_traddr(req, req->port, traddr); + + nvmet_format_discovery_entry(hdr, req->port, + nvmet_disc_subsys->subsysnqn, + traddr, NVME_NQN_CURR, numrec); + numrec++; + + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) + continue; + + nvmet_format_discovery_entry(hdr, req->port, + p->subsys->subsysnqn, traddr, + NVME_NQN_NVME, numrec); + numrec++; + } + + list_for_each_entry(r, &req->port->referrals, entry) { + nvmet_format_discovery_entry(hdr, r, + NVME_DISC_SUBSYS_NAME, + r->disc_addr.traddr, + NVME_NQN_DISC, numrec); + numrec++; + } + + hdr->genctr = cpu_to_le64(nvmet_genctr); + hdr->numrec = cpu_to_le64(numrec); + hdr->recfmt = cpu_to_le16(0); + + nvmet_clear_aen_bit(req, NVME_AEN_BIT_DISC_CHANGE); + + up_read(&nvmet_config_sem); + + status = nvmet_copy_to_sgl(req, 0, buffer + offset, data_len); + kfree(buffer); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_disc_identify(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + + if (!nvmet_check_transfer_len(req, NVME_IDENTIFY_DATA_SIZE)) + return; + + if (req->cmd->identify.cns != NVME_ID_CNS_CTRL) { + req->error_loc = offsetof(struct nvme_identify, cns); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + memcpy(id->sn, ctrl->subsys->serial, NVMET_SN_MAX_SIZE); + memset(id->fr, ' ', sizeof(id->fr)); + memcpy_and_pad(id->mn, sizeof(id->mn), ctrl->subsys->model_number, + strlen(ctrl->subsys->model_number), ' '); + memcpy_and_pad(id->fr, sizeof(id->fr), + UTS_RELEASE, strlen(UTS_RELEASE), ' '); + + id->cntrltype = NVME_CTRL_DISC; + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + id->lpa = (1 << 2); + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->flags & NVMF_KEYED_SGLS) + id->sgls |= cpu_to_le32(1 << 2); + if (req->port->inline_data_size) + id->sgls |= cpu_to_le32(1 << 20); + + id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); + + strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_disc_set_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u16 stat; + + if (!nvmet_check_transfer_len(req, 0)) + return; + + switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + stat = nvmet_set_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + stat = nvmet_set_feat_async_event(req, + NVMET_DISC_AEN_CFG_OPTIONAL); + break; + default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + +static void nvmet_execute_disc_get_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u16 stat = 0; + + if (!nvmet_check_transfer_len(req, 0)) + return; + 
+ switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + nvmet_get_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + nvmet_get_feat_async_event(req); + break; + default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + +u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("got cmd %d while not ready\n", + cmd->common.opcode); + req->error_loc = + offsetof(struct nvme_common_command, opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_set_features: + req->execute = nvmet_execute_disc_set_features; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_disc_get_features; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + return 0; + case nvme_admin_get_log_page: + req->execute = nvmet_execute_disc_get_log_page; + return 0; + case nvme_admin_identify: + req->execute = nvmet_execute_disc_identify; + return 0; + default: + pr_debug("unhandled cmd %d\n", cmd->common.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + +} + +int __init nvmet_init_discovery(void) +{ + nvmet_disc_subsys = + nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_CURR); + return PTR_ERR_OR_ZERO(nvmet_disc_subsys); +} + +void nvmet_exit_discovery(void) +{ + nvmet_subsys_put(nvmet_disc_subsys); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fabrics-cmd.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fabrics-cmd.c new file mode 100644 index 0000000..35f7292 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fabrics-cmd.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe Fabrics command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. 
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blkdev.h>
+#include "nvmet.h"
+
+static void nvmet_execute_prop_set(struct nvmet_req *req)
+{
+	u64 val = le64_to_cpu(req->cmd->prop_set.value);
+	u16 status = 0;
+
+	if (!nvmet_check_transfer_len(req, 0))
+		return;
+
+	if (req->cmd->prop_set.attrib & 1) {
+		req->error_loc =
+			offsetof(struct nvmf_property_set_command, attrib);
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
+	}
+
+	switch (le32_to_cpu(req->cmd->prop_set.offset)) {
+	case NVME_REG_CC:
+		nvmet_update_cc(req->sq->ctrl, val);
+		break;
+	default:
+		req->error_loc =
+			offsetof(struct nvmf_property_set_command, offset);
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+out:
+	nvmet_req_complete(req, status);
+}
+
+static void nvmet_execute_prop_get(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	u16 status = 0;
+	u64 val = 0;
+
+	if (!nvmet_check_transfer_len(req, 0))
+		return;
+
+	if (req->cmd->prop_get.attrib & 1) {
+		switch (le32_to_cpu(req->cmd->prop_get.offset)) {
+		case NVME_REG_CAP:
+			val = ctrl->cap;
+			break;
+		default:
+			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+			break;
+		}
+	} else {
+		switch (le32_to_cpu(req->cmd->prop_get.offset)) {
+		case NVME_REG_VS:
+			val = ctrl->subsys->ver;
+			break;
+		case NVME_REG_CC:
+			val = ctrl->cc;
+			break;
+		case NVME_REG_CSTS:
+			val = ctrl->csts;
+			break;
+		default:
+			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+			break;
+		}
+	}
+
+	if (status && req->cmd->prop_get.attrib & 1) {
+		req->error_loc =
+			offsetof(struct nvmf_property_get_command, offset);
+	} else {
+		req->error_loc =
+			offsetof(struct nvmf_property_get_command, attrib);
+	}
+
+	req->cqe->result.u64 = cpu_to_le64(val);
+	nvmet_req_complete(req, status);
+}
+
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
+{
+	struct nvme_command *cmd = req->cmd;
+
+	switch (cmd->fabrics.fctype) {
+	case nvme_fabrics_type_property_set:
+		req->execute = nvmet_execute_prop_set;
+		break;
+	case nvme_fabrics_type_property_get:
+		req->execute = nvmet_execute_prop_get;
+		break;
+	default:
+		pr_debug("received unknown capsule type 0x%x\n",
+			cmd->fabrics.fctype);
+		req->error_loc = offsetof(struct nvmf_common_command, fctype);
+		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+	}
+
+	return 0;
+}
+
+static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
+{
+	struct nvmf_connect_command *c = &req->cmd->connect;
+	u16 qid = le16_to_cpu(c->qid);
+	u16 sqsize = le16_to_cpu(c->sqsize);
+	struct nvmet_ctrl *old;
+	u16 mqes = NVME_CAP_MQES(ctrl->cap);
+	u16 ret;
+
+	if (!sqsize) {
+		pr_warn("queue size zero!\n");
+		req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
+		req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize);
+		ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+		goto err;
+	}
+
+	if (ctrl->sqs[qid] != NULL) {
+		pr_warn("qid %u has already been created\n", qid);
+		req->error_loc = offsetof(struct nvmf_connect_command, qid);
+		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+	}
+
+	if (sqsize > mqes) {
+		pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n",
+			sqsize, mqes, ctrl->cntlid);
+		req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
+		req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize);
+		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+	}
+
+	old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
+	if (old) {
+		pr_warn("queue already connected!\n");
+		req->error_loc = offsetof(struct nvmf_connect_command, opcode);
+		return NVME_SC_CONNECT_CTRL_BUSY |
NVME_SC_DNR; + } + + /* note: convert queue size from 0's-based value to 1's-based value */ + nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); + + if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) { + req->sq->sqhd_disabled = true; + req->cqe->sq_head = cpu_to_le16(0xffff); + } + + if (ctrl->ops->install_queue) { + ret = ctrl->ops->install_queue(req->sq); + if (ret) { + pr_err("failed to install queue %d cntlid %d ret %x\n", + qid, ctrl->cntlid, ret); + ctrl->sqs[qid] = NULL; + goto err; + } + } + + return 0; + +err: + req->sq->ctrl = NULL; + return ret; +} + +static void nvmet_execute_admin_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 status = 0; + + if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data))) + return; + + d = kmalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto complete; + } + + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if (status) + goto out; + + /* zero out initial completion result, assign values as needed */ + req->cqe->result.u32 = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + req->error_loc = offsetof(struct nvmf_connect_command, recfmt); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { + pr_warn("connect attempt for invalid controller ID %#x\n", + d->cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); + goto out; + } + + status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, + le32_to_cpu(c->kato), &ctrl); + if (status) + goto out; + + ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; + + uuid_copy(&ctrl->hostid, &d->hostid); + + status = nvmet_install_queue(ctrl, req); + if (status) { + nvmet_ctrl_put(ctrl); + goto out; + } + + pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n", + nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", + ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, + ctrl->pi_support ? 
" T10-PI is enabled" : ""); + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + +out: + kfree(d); +complete: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_io_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl; + u16 qid = le16_to_cpu(c->qid); + u16 status = 0; + + if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data))) + return; + + d = kmalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto complete; + } + + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if (status) + goto out; + + /* zero out initial completion result, assign values as needed */ + req->cqe->result.u32 = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + ctrl = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), req); + if (!ctrl) { + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + goto out; + } + + if (unlikely(qid > ctrl->subsys->max_qid)) { + pr_warn("invalid queue id (%d)\n", qid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(qid); + goto out_ctrl_put; + } + + /* + * create offloaded ctrl for P2P I/O when receiving first successful + * I/O connect and destroy it when freeing the controller + */ + if (req->port->offload && !ctrl->offload_ctrl) { + status = ctrl->ops->create_offload_ctrl(ctrl); + if (status) { + status = NVME_SC_INTERNAL | NVME_SC_DNR; + goto out_ctrl_put; + } + } + + status = nvmet_install_queue(ctrl, req); + if (status) + goto out_ctrl_put; + + /* pass back cntlid for successful completion */ + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + + pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + +out: + kfree(d); +complete: + nvmet_req_complete(req, status); + return; + +out_ctrl_put: + nvmet_ctrl_put(ctrl); + goto out; +} + +u16 nvmet_parse_connect_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (!nvme_is_fabrics(cmd)) { + pr_debug("invalid command 0x%x on unconnected queue.\n", + cmd->fabrics.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { + pr_debug("invalid capsule type 0x%x on unconnected queue.\n", + cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + if (cmd->connect.qid == 0) + req->execute = nvmet_execute_admin_connect; + else + req->execute = nvmet_execute_io_connect; + return 0; +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fc.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fc.c new file mode 100644 index 0000000..800a2da --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fc.c @@ -0,0 +1,2948 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. 
+ */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" +#include +#include +#include "../host/fc.h" + + +/* *************************** Data Structures/Defines ****************** */ + + +#define NVMET_LS_CTX_COUNT 256 + +struct nvmet_fc_tgtport; +struct nvmet_fc_tgt_assoc; + +struct nvmet_fc_ls_iod { /* for an LS RQST RCV */ + struct nvmefc_ls_rsp *lsrsp; + struct nvmefc_tgt_fcp_req *fcpreq; /* only if RS */ + + struct list_head ls_rcv_list; /* tgtport->ls_rcv_list */ + + struct nvmet_fc_tgtport *tgtport; + struct nvmet_fc_tgt_assoc *assoc; + void *hosthandle; + + union nvmefc_ls_requests *rqstbuf; + union nvmefc_ls_responses *rspbuf; + u16 rqstdatalen; + dma_addr_t rspdma; + + struct scatterlist sg[2]; + + struct work_struct work; +} __aligned(sizeof(unsigned long long)); + +struct nvmet_fc_ls_req_op { /* for an LS RQST XMT */ + struct nvmefc_ls_req ls_req; + + struct nvmet_fc_tgtport *tgtport; + void *hosthandle; + + int ls_error; + struct list_head lsreq_list; /* tgtport->ls_req_list */ + bool req_queued; +}; + + +/* desired maximum for a single sequence - if sg list allows it */ +#define NVMET_FC_MAX_SEQ_LENGTH (256 * 1024) + +enum nvmet_fcp_datadir { + NVMET_FCP_NODATA, + NVMET_FCP_WRITE, + NVMET_FCP_READ, + NVMET_FCP_ABORTED, +}; + +struct nvmet_fc_fcp_iod { + struct nvmefc_tgt_fcp_req *fcpreq; + + struct nvme_fc_cmd_iu cmdiubuf; + struct nvme_fc_ersp_iu rspiubuf; + dma_addr_t rspdma; + struct scatterlist *next_sg; + struct scatterlist *data_sg; + int data_sg_cnt; + u32 offset; + enum nvmet_fcp_datadir io_dir; + bool active; + bool abort; + bool aborted; + bool writedataactive; + spinlock_t flock; + + struct nvmet_req req; + struct work_struct defer_work; + + struct nvmet_fc_tgtport *tgtport; + struct nvmet_fc_tgt_queue *queue; + + struct list_head fcp_list; /* tgtport->fcp_list */ +}; + +struct nvmet_fc_tgtport { + struct nvmet_fc_target_port fc_target_port; + + struct list_head tgt_list; /* nvmet_fc_target_list */ + struct device *dev; /* dev for dma mapping */ + struct nvmet_fc_target_template *ops; + + struct nvmet_fc_ls_iod *iod; + spinlock_t lock; + struct list_head ls_rcv_list; + struct list_head ls_req_list; + struct list_head ls_busylist; + struct list_head assoc_list; + struct list_head host_list; + struct ida assoc_cnt; + struct nvmet_fc_port_entry *pe; + struct kref ref; + u32 max_sg_cnt; +}; + +struct nvmet_fc_port_entry { + struct nvmet_fc_tgtport *tgtport; + struct nvmet_port *port; + u64 node_name; + u64 port_name; + struct list_head pe_list; +}; + +struct nvmet_fc_defer_fcp_req { + struct list_head req_list; + struct nvmefc_tgt_fcp_req *fcp_req; +}; + +struct nvmet_fc_tgt_queue { + bool ninetypercent; + u16 qid; + u16 sqsize; + u16 ersp_ratio; + __le16 sqhd; + atomic_t connected; + atomic_t sqtail; + atomic_t zrspcnt; + atomic_t rsn; + spinlock_t qlock; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvmet_fc_tgt_assoc *assoc; + struct list_head fod_list; + struct list_head pending_cmd_list; + struct list_head avail_defer_list; + struct workqueue_struct *work_q; + struct kref ref; + struct rcu_head rcu; + struct nvmet_fc_fcp_iod fod[]; /* array of fcp_iods */ +} __aligned(sizeof(unsigned long long)); + +struct nvmet_fc_hostport { + struct nvmet_fc_tgtport *tgtport; + void *hosthandle; + struct list_head host_list; + struct kref ref; + u8 invalid; +}; + +struct nvmet_fc_tgt_assoc { + u64 association_id; + u32 a_id; + atomic_t 
terminating; + struct nvmet_fc_tgtport *tgtport; + struct nvmet_fc_hostport *hostport; + struct nvmet_fc_ls_iod *rcv_disconn; + struct list_head a_list; + struct nvmet_fc_tgt_queue __rcu *queues[NVMET_NR_QUEUES + 1]; + struct kref ref; + struct work_struct del_work; + struct rcu_head rcu; +}; + + +static inline int +nvmet_fc_iodnum(struct nvmet_fc_ls_iod *iodptr) +{ + return (iodptr - iodptr->tgtport->iod); +} + +static inline int +nvmet_fc_fodnum(struct nvmet_fc_fcp_iod *fodptr) +{ + return (fodptr - fodptr->queue->fod); +} + + +/* + * Association and Connection IDs: + * + * Association ID will have random number in upper 6 bytes and zero + * in lower 2 bytes + * + * Connection IDs will be Association ID with QID or'd in lower 2 bytes + * + * note: Association ID = Connection ID for queue 0 + */ +#define BYTES_FOR_QID sizeof(u16) +#define BYTES_FOR_QID_SHIFT (BYTES_FOR_QID * 8) +#define NVMET_FC_QUEUEID_MASK ((u64)((1 << BYTES_FOR_QID_SHIFT) - 1)) + +static inline u64 +nvmet_fc_makeconnid(struct nvmet_fc_tgt_assoc *assoc, u16 qid) +{ + return (assoc->association_id | qid); +} + +static inline u64 +nvmet_fc_getassociationid(u64 connectionid) +{ + return connectionid & ~NVMET_FC_QUEUEID_MASK; +} + +static inline u16 +nvmet_fc_getqueueid(u64 connectionid) +{ + return (u16)(connectionid & NVMET_FC_QUEUEID_MASK); +} + +static inline struct nvmet_fc_tgtport * +targetport_to_tgtport(struct nvmet_fc_target_port *targetport) +{ + return container_of(targetport, struct nvmet_fc_tgtport, + fc_target_port); +} + +static inline struct nvmet_fc_fcp_iod * +nvmet_req_to_fod(struct nvmet_req *nvme_req) +{ + return container_of(nvme_req, struct nvmet_fc_fcp_iod, req); +} + + +/* *************************** Globals **************************** */ + + +static DEFINE_SPINLOCK(nvmet_fc_tgtlock); + +static LIST_HEAD(nvmet_fc_target_list); +static DEFINE_IDA(nvmet_fc_tgtport_cnt); +static LIST_HEAD(nvmet_fc_portentry_list); + + +static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work); +static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work); +static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc); +static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); +static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue); +static int nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue); +static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); +static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); +static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod); +static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc); +static void nvmet_fc_xmt_ls_rsp(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod); + + +/* *********************** FC-NVME DMA Handling **************************** */ + +/* + * The fcloop device passes in a NULL device pointer. Real LLD's will + * pass in a valid device pointer. If NULL is passed to the dma mapping + * routines, depending on the platform, it may or may not succeed, and + * may crash. + * + * As such: + * Wrapper all the dma routines and check the dev pointer. + * + * If simple mappings (return just a dma address, we'll noop them, + * returning a dma address of 0. + * + * On more complex mappings (dma_map_sg), a pseudo routine fills + * in the scatter list, setting all dma addresses to 0. + */ + +static inline dma_addr_t +fc_dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction dir) +{ + return dev ? 
dma_map_single(dev, ptr, size, dir) : (dma_addr_t)0L; +} + +static inline int +fc_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dev ? dma_mapping_error(dev, dma_addr) : 0; +} + +static inline void +fc_dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_single(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_cpu(dev, addr, size, dir); +} + +static inline void +fc_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + if (dev) + dma_sync_single_for_device(dev, addr, size, dir); +} + +/* pseudo dma_map_sg call */ +static int +fc_map_sg(struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + int i; + + WARN_ON(nents == 0 || sg[0].length == 0); + + for_each_sg(sg, s, nents, i) { + s->dma_address = 0L; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + s->dma_length = s->length; +#endif + } + return nents; +} + +static inline int +fc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + return dev ? dma_map_sg(dev, sg, nents, dir) : fc_map_sg(sg, nents); +} + +static inline void +fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + if (dev) + dma_unmap_sg(dev, sg, nents, dir); +} + + +/* ********************** FC-NVME LS XMT Handling ************************* */ + + +static void +__nvmet_fc_finish_ls_req(struct nvmet_fc_ls_req_op *lsop) +{ + struct nvmet_fc_tgtport *tgtport = lsop->tgtport; + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + + if (!lsop->req_queued) { + spin_unlock_irqrestore(&tgtport->lock, flags); + return; + } + + list_del(&lsop->lsreq_list); + + lsop->req_queued = false; + + spin_unlock_irqrestore(&tgtport->lock, flags); + + fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); + + nvmet_fc_tgtport_put(tgtport); +} + +static int +__nvmet_fc_send_ls_req(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_req_op *lsop, + void (*done)(struct nvmefc_ls_req *req, int status)) +{ + struct nvmefc_ls_req *lsreq = &lsop->ls_req; + unsigned long flags; + int ret = 0; + + if (!tgtport->ops->ls_req) + return -EOPNOTSUPP; + + if (!nvmet_fc_tgtport_get(tgtport)) + return -ESHUTDOWN; + + lsreq->done = done; + lsop->req_queued = false; + INIT_LIST_HEAD(&lsop->lsreq_list); + + lsreq->rqstdma = fc_dma_map_single(tgtport->dev, lsreq->rqstaddr, + lsreq->rqstlen + lsreq->rsplen, + DMA_BIDIRECTIONAL); + if (fc_dma_mapping_error(tgtport->dev, lsreq->rqstdma)) { + ret = -EFAULT; + goto out_puttgtport; + } + lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen; + + spin_lock_irqsave(&tgtport->lock, flags); + + list_add_tail(&lsop->lsreq_list, &tgtport->ls_req_list); + + lsop->req_queued = true; + + spin_unlock_irqrestore(&tgtport->lock, flags); + + ret = tgtport->ops->ls_req(&tgtport->fc_target_port, lsop->hosthandle, + lsreq); + if (ret) + goto out_unlink; + + return 0; + +out_unlink: + lsop->ls_error = ret; + spin_lock_irqsave(&tgtport->lock, flags); + lsop->req_queued = false; + list_del(&lsop->lsreq_list); + spin_unlock_irqrestore(&tgtport->lock, flags); + fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma, + (lsreq->rqstlen + lsreq->rsplen), + DMA_BIDIRECTIONAL); +out_puttgtport: + 
nvmet_fc_tgtport_put(tgtport); + + return ret; +} + +static int +nvmet_fc_send_ls_req_async(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_req_op *lsop, + void (*done)(struct nvmefc_ls_req *req, int status)) +{ + /* don't wait for completion */ + + return __nvmet_fc_send_ls_req(tgtport, lsop, done); +} + +static void +nvmet_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status) +{ + struct nvmet_fc_ls_req_op *lsop = + container_of(lsreq, struct nvmet_fc_ls_req_op, ls_req); + + __nvmet_fc_finish_ls_req(lsop); + + /* fc-nvme target doesn't care about success or failure of cmd */ + + kfree(lsop); +} + +/* + * This routine sends a FC-NVME LS to disconnect (aka terminate) + * the FC-NVME Association. Terminating the association also + * terminates the FC-NVME connections (per queue, both admin and io + * queues) that are part of the association. E.g. things are torn + * down, and the related FC-NVME Association ID and Connection IDs + * become invalid. + * + * The behavior of the fc-nvme target is such that it's + * understanding of the association and connections will implicitly + * be torn down. The action is implicit as it may be due to a loss of + * connectivity with the fc-nvme host, so the target may never get a + * response even if it tried. As such, the action of this routine + * is to asynchronously send the LS, ignore any results of the LS, and + * continue on with terminating the association. If the fc-nvme host + * is present and receives the LS, it too can tear down. + */ +static void +nvmet_fc_xmt_disconnect_assoc(struct nvmet_fc_tgt_assoc *assoc) +{ + struct nvmet_fc_tgtport *tgtport = assoc->tgtport; + struct fcnvme_ls_disconnect_assoc_rqst *discon_rqst; + struct fcnvme_ls_disconnect_assoc_acc *discon_acc; + struct nvmet_fc_ls_req_op *lsop; + struct nvmefc_ls_req *lsreq; + int ret; + + /* + * If ls_req is NULL or no hosthandle, it's an older lldd and no + * message is normal. Otherwise, send unless the hostport has + * already been invalidated by the lldd. 
+ */ + if (!tgtport->ops->ls_req || !assoc->hostport || + assoc->hostport->invalid) + return; + + lsop = kzalloc((sizeof(*lsop) + + sizeof(*discon_rqst) + sizeof(*discon_acc) + + tgtport->ops->lsrqst_priv_sz), GFP_KERNEL); + if (!lsop) { + dev_info(tgtport->dev, + "{%d:%d} send Disconnect Association failed: ENOMEM\n", + tgtport->fc_target_port.port_num, assoc->a_id); + return; + } + + discon_rqst = (struct fcnvme_ls_disconnect_assoc_rqst *)&lsop[1]; + discon_acc = (struct fcnvme_ls_disconnect_assoc_acc *)&discon_rqst[1]; + lsreq = &lsop->ls_req; + if (tgtport->ops->lsrqst_priv_sz) + lsreq->private = (void *)&discon_acc[1]; + else + lsreq->private = NULL; + + lsop->tgtport = tgtport; + lsop->hosthandle = assoc->hostport->hosthandle; + + nvmefc_fmt_lsreq_discon_assoc(lsreq, discon_rqst, discon_acc, + assoc->association_id); + + ret = nvmet_fc_send_ls_req_async(tgtport, lsop, + nvmet_fc_disconnect_assoc_done); + if (ret) { + dev_info(tgtport->dev, + "{%d:%d} XMT Disconnect Association failed: %d\n", + tgtport->fc_target_port.port_num, assoc->a_id, ret); + kfree(lsop); + } +} + + +/* *********************** FC-NVME Port Management ************************ */ + + +static int +nvmet_fc_alloc_ls_iodlist(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_iod *iod; + int i; + + iod = kcalloc(NVMET_LS_CTX_COUNT, sizeof(struct nvmet_fc_ls_iod), + GFP_KERNEL); + if (!iod) + return -ENOMEM; + + tgtport->iod = iod; + + for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) { + INIT_WORK(&iod->work, nvmet_fc_handle_ls_rqst_work); + iod->tgtport = tgtport; + list_add_tail(&iod->ls_rcv_list, &tgtport->ls_rcv_list); + + iod->rqstbuf = kzalloc(sizeof(union nvmefc_ls_requests) + + sizeof(union nvmefc_ls_responses), + GFP_KERNEL); + if (!iod->rqstbuf) + goto out_fail; + + iod->rspbuf = (union nvmefc_ls_responses *)&iod->rqstbuf[1]; + + iod->rspdma = fc_dma_map_single(tgtport->dev, iod->rspbuf, + sizeof(*iod->rspbuf), + DMA_TO_DEVICE); + if (fc_dma_mapping_error(tgtport->dev, iod->rspdma)) + goto out_fail; + } + + return 0; + +out_fail: + kfree(iod->rqstbuf); + list_del(&iod->ls_rcv_list); + for (iod--, i--; i >= 0; iod--, i--) { + fc_dma_unmap_single(tgtport->dev, iod->rspdma, + sizeof(*iod->rspbuf), DMA_TO_DEVICE); + kfree(iod->rqstbuf); + list_del(&iod->ls_rcv_list); + } + + kfree(iod); + + return -EFAULT; +} + +static void +nvmet_fc_free_ls_iodlist(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_iod *iod = tgtport->iod; + int i; + + for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) { + fc_dma_unmap_single(tgtport->dev, + iod->rspdma, sizeof(*iod->rspbuf), + DMA_TO_DEVICE); + kfree(iod->rqstbuf); + list_del(&iod->ls_rcv_list); + } + kfree(tgtport->iod); +} + +static struct nvmet_fc_ls_iod * +nvmet_fc_alloc_ls_iod(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_ls_iod *iod; + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + iod = list_first_entry_or_null(&tgtport->ls_rcv_list, + struct nvmet_fc_ls_iod, ls_rcv_list); + if (iod) + list_move_tail(&iod->ls_rcv_list, &tgtport->ls_busylist); + spin_unlock_irqrestore(&tgtport->lock, flags); + return iod; +} + + +static void +nvmet_fc_free_ls_iod(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + list_move(&iod->ls_rcv_list, &tgtport->ls_rcv_list); + spin_unlock_irqrestore(&tgtport->lock, flags); +} + +static void +nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_fcp_iod *fod = 
queue->fod; + int i; + + for (i = 0; i < queue->sqsize; fod++, i++) { + INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work); + fod->tgtport = tgtport; + fod->queue = queue; + fod->active = false; + fod->abort = false; + fod->aborted = false; + fod->fcpreq = NULL; + list_add_tail(&fod->fcp_list, &queue->fod_list); + spin_lock_init(&fod->flock); + + fod->rspdma = fc_dma_map_single(tgtport->dev, &fod->rspiubuf, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); + if (fc_dma_mapping_error(tgtport->dev, fod->rspdma)) { + list_del(&fod->fcp_list); + for (fod--, i--; i >= 0; fod--, i--) { + fc_dma_unmap_single(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), + DMA_TO_DEVICE); + fod->rspdma = 0L; + list_del(&fod->fcp_list); + } + + return; + } + } +} + +static void +nvmet_fc_destroy_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_fcp_iod *fod = queue->fod; + int i; + + for (i = 0; i < queue->sqsize; fod++, i++) { + if (fod->rspdma) + fc_dma_unmap_single(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); + } +} + +static struct nvmet_fc_fcp_iod * +nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_fcp_iod *fod; + + lockdep_assert_held(&queue->qlock); + + fod = list_first_entry_or_null(&queue->fod_list, + struct nvmet_fc_fcp_iod, fcp_list); + if (fod) { + list_del(&fod->fcp_list); + fod->active = true; + /* + * no queue reference is taken, as it was taken by the + * queue lookup just prior to the allocation. The iod + * will "inherit" that reference. + */ + } + return fod; +} + + +static void +nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_tgt_queue *queue, + struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + + /* + * put all admin cmds on hw queue id 0. All io commands go to + * the respective hw queue based on a modulo basis + */ + fcpreq->hwqid = queue->qid ? 
+ ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; + + nvmet_fc_handle_fcp_rqst(tgtport, fod); +} + +static void +nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work) +{ + struct nvmet_fc_fcp_iod *fod = + container_of(work, struct nvmet_fc_fcp_iod, defer_work); + + /* Submit deferred IO for processing */ + nvmet_fc_queue_fcp_req(fod->tgtport, fod->queue, fod->fcpreq); + +} + +static void +nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + struct nvmet_fc_defer_fcp_req *deferfcp; + unsigned long flags; + + fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); + + fcpreq->nvmet_fc_private = NULL; + + fod->active = false; + fod->abort = false; + fod->aborted = false; + fod->writedataactive = false; + fod->fcpreq = NULL; + + tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq); + + /* release the queue lookup reference on the completed IO */ + nvmet_fc_tgt_q_put(queue); + + spin_lock_irqsave(&queue->qlock, flags); + deferfcp = list_first_entry_or_null(&queue->pending_cmd_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (!deferfcp) { + list_add_tail(&fod->fcp_list, &fod->queue->fod_list); + spin_unlock_irqrestore(&queue->qlock, flags); + return; + } + + /* Re-use the fod for the next pending cmd that was deferred */ + list_del(&deferfcp->req_list); + + fcpreq = deferfcp->fcp_req; + + /* deferfcp can be reused for another IO at a later date */ + list_add_tail(&deferfcp->req_list, &queue->avail_defer_list); + + spin_unlock_irqrestore(&queue->qlock, flags); + + /* Save NVME CMD IO in fod */ + memcpy(&fod->cmdiubuf, fcpreq->rspaddr, fcpreq->rsplen); + + /* Setup new fcpreq to be processed */ + fcpreq->rspaddr = NULL; + fcpreq->rsplen = 0; + fcpreq->nvmet_fc_private = fod; + fod->fcpreq = fcpreq; + fod->active = true; + + /* inform LLDD IO is now being processed */ + tgtport->ops->defer_rcv(&tgtport->fc_target_port, fcpreq); + + /* + * Leave the queue lookup get reference taken when + * fod was originally allocated. 
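+ * The deferred command is picked up below by queueing defer_work,
+ * which re-dispatches it through nvmet_fc_queue_fcp_req() in
+ * workqueue context.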
+ */ + + queue_work(queue->work_q, &fod->defer_work); +} + +static struct nvmet_fc_tgt_queue * +nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, + u16 qid, u16 sqsize) +{ + struct nvmet_fc_tgt_queue *queue; + int ret; + + if (qid > NVMET_NR_QUEUES) + return NULL; + + queue = kzalloc(struct_size(queue, fod, sqsize), GFP_KERNEL); + if (!queue) + return NULL; + + if (!nvmet_fc_tgt_a_get(assoc)) + goto out_free_queue; + + queue->work_q = alloc_workqueue("ntfc%d.%d.%d", 0, 0, + assoc->tgtport->fc_target_port.port_num, + assoc->a_id, qid); + if (!queue->work_q) + goto out_a_put; + + queue->qid = qid; + queue->sqsize = sqsize; + queue->assoc = assoc; + INIT_LIST_HEAD(&queue->fod_list); + INIT_LIST_HEAD(&queue->avail_defer_list); + INIT_LIST_HEAD(&queue->pending_cmd_list); + atomic_set(&queue->connected, 0); + atomic_set(&queue->sqtail, 0); + atomic_set(&queue->rsn, 1); + atomic_set(&queue->zrspcnt, 0); + spin_lock_init(&queue->qlock); + kref_init(&queue->ref); + + nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue); + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_fail_iodlist; + + WARN_ON(assoc->queues[qid]); + rcu_assign_pointer(assoc->queues[qid], queue); + + return queue; + +out_fail_iodlist: + nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue); + destroy_workqueue(queue->work_q); +out_a_put: + nvmet_fc_tgt_a_put(assoc); +out_free_queue: + kfree(queue); + return NULL; +} + + +static void +nvmet_fc_tgt_queue_free(struct kref *ref) +{ + struct nvmet_fc_tgt_queue *queue = + container_of(ref, struct nvmet_fc_tgt_queue, ref); + + rcu_assign_pointer(queue->assoc->queues[queue->qid], NULL); + + nvmet_fc_destroy_fcp_iodlist(queue->assoc->tgtport, queue); + + nvmet_fc_tgt_a_put(queue->assoc); + + destroy_workqueue(queue->work_q); + + kfree_rcu(queue, rcu); +} + +static void +nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue) +{ + kref_put(&queue->ref, nvmet_fc_tgt_queue_free); +} + +static int +nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue) +{ + return kref_get_unless_zero(&queue->ref); +} + + +static void +nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) +{ + struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport; + struct nvmet_fc_fcp_iod *fod = queue->fod; + struct nvmet_fc_defer_fcp_req *deferfcp, *tempptr; + unsigned long flags; + int i; + bool disconnect; + + disconnect = atomic_xchg(&queue->connected, 0); + + /* if not connected, nothing to do */ + if (!disconnect) + return; + + spin_lock_irqsave(&queue->qlock, flags); + /* abort outstanding io's */ + for (i = 0; i < queue->sqsize; fod++, i++) { + if (fod->active) { + spin_lock(&fod->flock); + fod->abort = true; + /* + * only call lldd abort routine if waiting for + * writedata. other outstanding ops should finish + * on their own. 
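+ * The fod->abort flag set above is honored by the op completion
+ * path, which terminates the command once the LLDD finishes the
+ * operation it currently owns.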
+ */ + if (fod->writedataactive) { + fod->aborted = true; + spin_unlock(&fod->flock); + tgtport->ops->fcp_abort( + &tgtport->fc_target_port, fod->fcpreq); + } else + spin_unlock(&fod->flock); + } + } + + /* Cleanup defer'ed IOs in queue */ + list_for_each_entry_safe(deferfcp, tempptr, &queue->avail_defer_list, + req_list) { + list_del(&deferfcp->req_list); + kfree(deferfcp); + } + + for (;;) { + deferfcp = list_first_entry_or_null(&queue->pending_cmd_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (!deferfcp) + break; + + list_del(&deferfcp->req_list); + spin_unlock_irqrestore(&queue->qlock, flags); + + tgtport->ops->defer_rcv(&tgtport->fc_target_port, + deferfcp->fcp_req); + + tgtport->ops->fcp_abort(&tgtport->fc_target_port, + deferfcp->fcp_req); + + tgtport->ops->fcp_req_release(&tgtport->fc_target_port, + deferfcp->fcp_req); + + /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + + kfree(deferfcp); + + spin_lock_irqsave(&queue->qlock, flags); + } + spin_unlock_irqrestore(&queue->qlock, flags); + + flush_workqueue(queue->work_q); + + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_fc_tgt_q_put(queue); +} + +static struct nvmet_fc_tgt_queue * +nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, + u64 connection_id) +{ + struct nvmet_fc_tgt_assoc *assoc; + struct nvmet_fc_tgt_queue *queue; + u64 association_id = nvmet_fc_getassociationid(connection_id); + u16 qid = nvmet_fc_getqueueid(connection_id); + + if (qid > NVMET_NR_QUEUES) + return NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { + if (association_id == assoc->association_id) { + queue = rcu_dereference(assoc->queues[qid]); + if (queue && + (!atomic_read(&queue->connected) || + !nvmet_fc_tgt_q_get(queue))) + queue = NULL; + rcu_read_unlock(); + return queue; + } + } + rcu_read_unlock(); + return NULL; +} + +static void +nvmet_fc_hostport_free(struct kref *ref) +{ + struct nvmet_fc_hostport *hostport = + container_of(ref, struct nvmet_fc_hostport, ref); + struct nvmet_fc_tgtport *tgtport = hostport->tgtport; + unsigned long flags; + + spin_lock_irqsave(&tgtport->lock, flags); + list_del(&hostport->host_list); + spin_unlock_irqrestore(&tgtport->lock, flags); + if (tgtport->ops->host_release && hostport->invalid) + tgtport->ops->host_release(hostport->hosthandle); + kfree(hostport); + nvmet_fc_tgtport_put(tgtport); +} + +static void +nvmet_fc_hostport_put(struct nvmet_fc_hostport *hostport) +{ + kref_put(&hostport->ref, nvmet_fc_hostport_free); +} + +static int +nvmet_fc_hostport_get(struct nvmet_fc_hostport *hostport) +{ + return kref_get_unless_zero(&hostport->ref); +} + +static void +nvmet_fc_free_hostport(struct nvmet_fc_hostport *hostport) +{ + /* if LLDD not implemented, leave as NULL */ + if (!hostport || !hostport->hosthandle) + return; + + nvmet_fc_hostport_put(hostport); +} + +static struct nvmet_fc_hostport * +nvmet_fc_match_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) +{ + struct nvmet_fc_hostport *host; + + lockdep_assert_held(&tgtport->lock); + + list_for_each_entry(host, &tgtport->host_list, host_list) { + if (host->hosthandle == hosthandle && !host->invalid) { + if (nvmet_fc_hostport_get(host)) + return (host); + } + } + + return NULL; +} + +static struct nvmet_fc_hostport * +nvmet_fc_alloc_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) +{ + struct nvmet_fc_hostport *newhost, *match = NULL; + unsigned long flags; + + /* if LLDD not implemented, leave as NULL */ + if (!hosthandle) + return NULL; + + /* + * take 
reference for what will be the newly allocated hostport if + * we end up using a new allocation + */ + if (!nvmet_fc_tgtport_get(tgtport)) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&tgtport->lock, flags); + match = nvmet_fc_match_hostport(tgtport, hosthandle); + spin_unlock_irqrestore(&tgtport->lock, flags); + + if (match) { + /* no new allocation - release reference */ + nvmet_fc_tgtport_put(tgtport); + return match; + } + + newhost = kzalloc(sizeof(*newhost), GFP_KERNEL); + if (!newhost) { + /* no new allocation - release reference */ + nvmet_fc_tgtport_put(tgtport); + return ERR_PTR(-ENOMEM); + } + + spin_lock_irqsave(&tgtport->lock, flags); + match = nvmet_fc_match_hostport(tgtport, hosthandle); + if (match) { + /* new allocation not needed */ + kfree(newhost); + newhost = match; + /* no new allocation - release reference */ + nvmet_fc_tgtport_put(tgtport); + } else { + newhost->tgtport = tgtport; + newhost->hosthandle = hosthandle; + INIT_LIST_HEAD(&newhost->host_list); + kref_init(&newhost->ref); + + list_add_tail(&newhost->host_list, &tgtport->host_list); + } + spin_unlock_irqrestore(&tgtport->lock, flags); + + return newhost; +} + +static void +nvmet_fc_delete_assoc(struct work_struct *work) +{ + struct nvmet_fc_tgt_assoc *assoc = + container_of(work, struct nvmet_fc_tgt_assoc, del_work); + + nvmet_fc_delete_target_assoc(assoc); + nvmet_fc_tgt_a_put(assoc); +} + +static struct nvmet_fc_tgt_assoc * +nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) +{ + struct nvmet_fc_tgt_assoc *assoc, *tmpassoc; + unsigned long flags; + u64 ran; + int idx; + bool needrandom = true; + + assoc = kzalloc(sizeof(*assoc), GFP_KERNEL); + if (!assoc) + return NULL; + + idx = ida_simple_get(&tgtport->assoc_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) + goto out_free_assoc; + + if (!nvmet_fc_tgtport_get(tgtport)) + goto out_ida; + + assoc->hostport = nvmet_fc_alloc_hostport(tgtport, hosthandle); + if (IS_ERR(assoc->hostport)) + goto out_put; + + assoc->tgtport = tgtport; + assoc->a_id = idx; + INIT_LIST_HEAD(&assoc->a_list); + kref_init(&assoc->ref); + INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); + atomic_set(&assoc->terminating, 0); + + while (needrandom) { + get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); + ran = ran << BYTES_FOR_QID_SHIFT; + + spin_lock_irqsave(&tgtport->lock, flags); + needrandom = false; + list_for_each_entry(tmpassoc, &tgtport->assoc_list, a_list) { + if (ran == tmpassoc->association_id) { + needrandom = true; + break; + } + } + if (!needrandom) { + assoc->association_id = ran; + list_add_tail_rcu(&assoc->a_list, &tgtport->assoc_list); + } + spin_unlock_irqrestore(&tgtport->lock, flags); + } + + return assoc; + +out_put: + nvmet_fc_tgtport_put(tgtport); +out_ida: + ida_simple_remove(&tgtport->assoc_cnt, idx); +out_free_assoc: + kfree(assoc); + return NULL; +} + +static void +nvmet_fc_target_assoc_free(struct kref *ref) +{ + struct nvmet_fc_tgt_assoc *assoc = + container_of(ref, struct nvmet_fc_tgt_assoc, ref); + struct nvmet_fc_tgtport *tgtport = assoc->tgtport; + struct nvmet_fc_ls_iod *oldls; + unsigned long flags; + + /* Send Disconnect now that all i/o has completed */ + nvmet_fc_xmt_disconnect_assoc(assoc); + + nvmet_fc_free_hostport(assoc->hostport); + spin_lock_irqsave(&tgtport->lock, flags); + list_del_rcu(&assoc->a_list); + oldls = assoc->rcv_disconn; + spin_unlock_irqrestore(&tgtport->lock, flags); + /* if pending Rcv Disconnect Association LS, send rsp now */ + if (oldls) + nvmet_fc_xmt_ls_rsp(tgtport, oldls); + 
ida_simple_remove(&tgtport->assoc_cnt, assoc->a_id); + dev_info(tgtport->dev, + "{%d:%d} Association freed\n", + tgtport->fc_target_port.port_num, assoc->a_id); + kfree_rcu(assoc, rcu); + nvmet_fc_tgtport_put(tgtport); +} + +static void +nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc) +{ + kref_put(&assoc->ref, nvmet_fc_target_assoc_free); +} + +static int +nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc) +{ + return kref_get_unless_zero(&assoc->ref); +} + +static void +nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) +{ + struct nvmet_fc_tgtport *tgtport = assoc->tgtport; + struct nvmet_fc_tgt_queue *queue; + int i, terminating; + + terminating = atomic_xchg(&assoc->terminating, 1); + + /* if already terminating, do nothing */ + if (terminating) + return; + + + for (i = NVMET_NR_QUEUES; i >= 0; i--) { + rcu_read_lock(); + queue = rcu_dereference(assoc->queues[i]); + if (!queue) { + rcu_read_unlock(); + continue; + } + + if (!nvmet_fc_tgt_q_get(queue)) { + rcu_read_unlock(); + continue; + } + rcu_read_unlock(); + nvmet_fc_delete_target_queue(queue); + nvmet_fc_tgt_q_put(queue); + } + + dev_info(tgtport->dev, + "{%d:%d} Association deleted\n", + tgtport->fc_target_port.port_num, assoc->a_id); + + nvmet_fc_tgt_a_put(assoc); +} + +static struct nvmet_fc_tgt_assoc * +nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport, + u64 association_id) +{ + struct nvmet_fc_tgt_assoc *assoc; + struct nvmet_fc_tgt_assoc *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { + if (association_id == assoc->association_id) { + ret = assoc; + if (!nvmet_fc_tgt_a_get(assoc)) + ret = NULL; + break; + } + } + rcu_read_unlock(); + + return ret; +} + +static void +nvmet_fc_portentry_bind(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_port_entry *pe, + struct nvmet_port *port) +{ + lockdep_assert_held(&nvmet_fc_tgtlock); + + pe->tgtport = tgtport; + tgtport->pe = pe; + + pe->port = port; + port->priv = pe; + + pe->node_name = tgtport->fc_target_port.node_name; + pe->port_name = tgtport->fc_target_port.port_name; + INIT_LIST_HEAD(&pe->pe_list); + + list_add_tail(&pe->pe_list, &nvmet_fc_portentry_list); +} + +static void +nvmet_fc_portentry_unbind(struct nvmet_fc_port_entry *pe) +{ + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + if (pe->tgtport) + pe->tgtport->pe = NULL; + list_del(&pe->pe_list); + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); +} + +/* + * called when a targetport deregisters. Breaks the relationship + * with the nvmet port, but leaves the port_entry in place so that + * re-registration can resume operation. + */ +static void +nvmet_fc_portentry_unbind_tgt(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_port_entry *pe; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + pe = tgtport->pe; + if (pe) + pe->tgtport = NULL; + tgtport->pe = NULL; + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); +} + +/* + * called when a new targetport is registered. Looks in the + * existing nvmet port_entries to see if the nvmet layer is + * configured for the targetport's wwn's. (the targetport existed, + * nvmet configured, the lldd unregistered the tgtport, and is now + * reregistering the same targetport). If so, set the nvmet port + * port entry on the targetport. 
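+ * This allows the nvmet port binding to survive an LLDD
+ * unregister/re-register cycle for the same WWNs.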
+ */ +static void +nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_port_entry *pe; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_for_each_entry(pe, &nvmet_fc_portentry_list, pe_list) { + if (tgtport->fc_target_port.node_name == pe->node_name && + tgtport->fc_target_port.port_name == pe->port_name) { + WARN_ON(pe->tgtport); + tgtport->pe = pe; + pe->tgtport = tgtport; + break; + } + } + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); +} + +/** + * nvme_fc_register_targetport - transport entry point called by an + * LLDD to register the existence of a local + * NVME subystem FC port. + * @pinfo: pointer to information about the port to be registered + * @template: LLDD entrypoints and operational parameters for the port + * @dev: physical hardware device node port corresponds to. Will be + * used for DMA mappings + * @portptr: pointer to a local port pointer. Upon success, the routine + * will allocate a nvme_fc_local_port structure and place its + * address in the local port pointer. Upon failure, local port + * pointer will be set to NULL. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, + struct nvmet_fc_target_template *template, + struct device *dev, + struct nvmet_fc_target_port **portptr) +{ + struct nvmet_fc_tgtport *newrec; + unsigned long flags; + int ret, idx; + + if (!template->xmt_ls_rsp || !template->fcp_op || + !template->fcp_abort || + !template->fcp_req_release || !template->targetport_delete || + !template->max_hw_queues || !template->max_sgl_segments || + !template->max_dif_sgl_segments || !template->dma_boundary) { + ret = -EINVAL; + goto out_regtgt_failed; + } + + newrec = kzalloc((sizeof(*newrec) + template->target_priv_sz), + GFP_KERNEL); + if (!newrec) { + ret = -ENOMEM; + goto out_regtgt_failed; + } + + idx = ida_simple_get(&nvmet_fc_tgtport_cnt, 0, 0, GFP_KERNEL); + if (idx < 0) { + ret = -ENOSPC; + goto out_fail_kfree; + } + + if (!get_device(dev) && dev) { + ret = -ENODEV; + goto out_ida_put; + } + + newrec->fc_target_port.node_name = pinfo->node_name; + newrec->fc_target_port.port_name = pinfo->port_name; + if (template->target_priv_sz) + newrec->fc_target_port.private = &newrec[1]; + else + newrec->fc_target_port.private = NULL; + newrec->fc_target_port.port_id = pinfo->port_id; + newrec->fc_target_port.port_num = idx; + INIT_LIST_HEAD(&newrec->tgt_list); + newrec->dev = dev; + newrec->ops = template; + spin_lock_init(&newrec->lock); + INIT_LIST_HEAD(&newrec->ls_rcv_list); + INIT_LIST_HEAD(&newrec->ls_req_list); + INIT_LIST_HEAD(&newrec->ls_busylist); + INIT_LIST_HEAD(&newrec->assoc_list); + INIT_LIST_HEAD(&newrec->host_list); + kref_init(&newrec->ref); + ida_init(&newrec->assoc_cnt); + newrec->max_sg_cnt = template->max_sgl_segments; + + ret = nvmet_fc_alloc_ls_iodlist(newrec); + if (ret) { + ret = -ENOMEM; + goto out_free_newrec; + } + + nvmet_fc_portentry_rebind_tgt(newrec); + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_add_tail(&newrec->tgt_list, &nvmet_fc_target_list); + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + *portptr = &newrec->fc_target_port; + return 0; + +out_free_newrec: + put_device(dev); +out_ida_put: + ida_simple_remove(&nvmet_fc_tgtport_cnt, idx); +out_fail_kfree: + kfree(newrec); +out_regtgt_failed: + *portptr = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_fc_register_targetport); + + +static void 
+nvmet_fc_free_tgtport(struct kref *ref) +{ + struct nvmet_fc_tgtport *tgtport = + container_of(ref, struct nvmet_fc_tgtport, ref); + struct device *dev = tgtport->dev; + unsigned long flags; + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_del(&tgtport->tgt_list); + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + nvmet_fc_free_ls_iodlist(tgtport); + + /* let the LLDD know we've finished tearing it down */ + tgtport->ops->targetport_delete(&tgtport->fc_target_port); + + ida_simple_remove(&nvmet_fc_tgtport_cnt, + tgtport->fc_target_port.port_num); + + ida_destroy(&tgtport->assoc_cnt); + + kfree(tgtport); + + put_device(dev); +} + +static void +nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport) +{ + kref_put(&tgtport->ref, nvmet_fc_free_tgtport); +} + +static int +nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport) +{ + return kref_get_unless_zero(&tgtport->ref); +} + +static void +__nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport) +{ + struct nvmet_fc_tgt_assoc *assoc; + + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { + if (!nvmet_fc_tgt_a_get(assoc)) + continue; + if (!schedule_work(&assoc->del_work)) + /* already deleting - release local reference */ + nvmet_fc_tgt_a_put(assoc); + } + rcu_read_unlock(); +} + +/** + * nvmet_fc_invalidate_host - transport entry point called by an LLDD + * to remove references to a hosthandle for LS's. + * + * The nvmet-fc layer ensures that any references to the hosthandle + * on the targetport are forgotten (set to NULL). The LLDD will + * typically call this when a login with a remote host port has been + * lost, thus LS's for the remote host port are no longer possible. + * + * If an LS request is outstanding to the targetport/hosthandle (or + * issued concurrently with the call to invalidate the host), the + * LLDD is responsible for terminating/aborting the LS and completing + * the LS request. It is recommended that these terminations/aborts + * occur after calling to invalidate the host handle to avoid additional + * retries by the nvmet-fc transport. The nvmet-fc transport may + * continue to reference host handle while it cleans up outstanding + * NVME associations. The nvmet-fc transport will call the + * ops->host_release() callback to notify the LLDD that all references + * are complete and the related host handle can be recovered. + * Note: if there are no references, the callback may be called before + * the invalidate host call returns. + * + * @target_port: pointer to the (registered) target port that a prior + * LS was received on and which supplied the transport the + * hosthandle. + * @hosthandle: the handle (pointer) that represents the host port + * that no longer has connectivity and that LS's should + * no longer be directed to. 
+ */ +void +nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port, + void *hosthandle) +{ + struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + struct nvmet_fc_tgt_assoc *assoc, *next; + unsigned long flags; + bool noassoc = true; + + spin_lock_irqsave(&tgtport->lock, flags); + list_for_each_entry_safe(assoc, next, + &tgtport->assoc_list, a_list) { + if (!assoc->hostport || + assoc->hostport->hosthandle != hosthandle) + continue; + if (!nvmet_fc_tgt_a_get(assoc)) + continue; + assoc->hostport->invalid = 1; + noassoc = false; + if (!schedule_work(&assoc->del_work)) + /* already deleting - release local reference */ + nvmet_fc_tgt_a_put(assoc); + } + spin_unlock_irqrestore(&tgtport->lock, flags); + + /* if there's nothing to wait for - call the callback */ + if (noassoc && tgtport->ops->host_release) + tgtport->ops->host_release(hosthandle); +} +EXPORT_SYMBOL_GPL(nvmet_fc_invalidate_host); + +/* + * nvmet layer has called to terminate an association + */ +static void +nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_fc_tgtport *tgtport, *next; + struct nvmet_fc_tgt_assoc *assoc; + struct nvmet_fc_tgt_queue *queue; + unsigned long flags; + bool found_ctrl = false; + + /* this is a bit ugly, but don't want to make locks layered */ + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_for_each_entry_safe(tgtport, next, &nvmet_fc_target_list, + tgt_list) { + if (!nvmet_fc_tgtport_get(tgtport)) + continue; + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + rcu_read_lock(); + list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) { + queue = rcu_dereference(assoc->queues[0]); + if (queue && queue->nvme_sq.ctrl == ctrl) { + if (nvmet_fc_tgt_a_get(assoc)) + found_ctrl = true; + break; + } + } + rcu_read_unlock(); + + nvmet_fc_tgtport_put(tgtport); + + if (found_ctrl) { + if (!schedule_work(&assoc->del_work)) + /* already deleting - release local reference */ + nvmet_fc_tgt_a_put(assoc); + return; + } + + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + } + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); +} + +/** + * nvme_fc_unregister_targetport - transport entry point called by an + * LLDD to deregister/remove a previously + * registered a local NVME subsystem FC port. + * @target_port: pointer to the (registered) target port that is to be + * deregistered. + * + * Returns: + * a completion status. Must be 0 upon success; a negative errno + * (ex: -ENXIO) upon failure. + */ +int +nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port) +{ + struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + + nvmet_fc_portentry_unbind_tgt(tgtport); + + /* terminate any outstanding associations */ + __nvmet_fc_free_assocs(tgtport); + + /* + * should terminate LS's as well. However, LS's will be generated + * at the tail end of association termination, so they likely don't + * exist yet. And even if they did, it's worthwhile to just let + * them finish and targetport ref counting will clean things up. 
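+ * (Outstanding LS traffic is covered by targetport reference
+ * counting, as noted above, so the final free is deferred until
+ * it completes.)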
+ */ + + nvmet_fc_tgtport_put(tgtport); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_fc_unregister_targetport); + + +/* ********************** FC-NVME LS RCV Handling ************************* */ + + +static void +nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_cr_assoc_rqst *rqst = &iod->rqstbuf->rq_cr_assoc; + struct fcnvme_ls_cr_assoc_acc *acc = &iod->rspbuf->rsp_cr_assoc; + struct nvmet_fc_tgt_queue *queue; + int ret = 0; + + memset(acc, 0, sizeof(*acc)); + + /* + * FC-NVME spec changes. There are initiators sending different + * lengths as padding sizes for Create Association Cmd descriptor + * was incorrect. + * Accept anything of "minimum" length. Assume format per 1.15 + * spec (with HOSTID reduced to 16 bytes), ignore how long the + * trailing pad length is. + */ + if (iod->rqstdatalen < FCNVME_LSDESC_CRA_RQST_MINLEN) + ret = VERR_CR_ASSOC_LEN; + else if (be32_to_cpu(rqst->desc_list_len) < + FCNVME_LSDESC_CRA_RQST_MIN_LISTLEN) + ret = VERR_CR_ASSOC_RQST_LEN; + else if (rqst->assoc_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) + ret = VERR_CR_ASSOC_CMD; + else if (be32_to_cpu(rqst->assoc_cmd.desc_len) < + FCNVME_LSDESC_CRA_CMD_DESC_MIN_DESCLEN) + ret = VERR_CR_ASSOC_CMD_LEN; + else if (!rqst->assoc_cmd.ersp_ratio || + (be16_to_cpu(rqst->assoc_cmd.ersp_ratio) >= + be16_to_cpu(rqst->assoc_cmd.sqsize))) + ret = VERR_ERSP_RATIO; + + else { + /* new association w/ admin queue */ + iod->assoc = nvmet_fc_alloc_target_assoc( + tgtport, iod->hosthandle); + if (!iod->assoc) + ret = VERR_ASSOC_ALLOC_FAIL; + else { + queue = nvmet_fc_alloc_target_queue(iod->assoc, 0, + be16_to_cpu(rqst->assoc_cmd.sqsize)); + if (!queue) + ret = VERR_QUEUE_ALLOC_FAIL; + } + } + + if (ret) { + dev_err(tgtport->dev, + "Create Association LS failed: %s\n", + validation_errors[ret]); + iod->lsrsp->rsplen = nvme_fc_format_rjt(acc, + sizeof(*acc), rqst->w0.ls_cmd, + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return; + } + + queue->ersp_ratio = be16_to_cpu(rqst->assoc_cmd.ersp_ratio); + atomic_set(&queue->connected, 1); + queue->sqhd = 0; /* best place to init value */ + + dev_info(tgtport->dev, + "{%d:%d} Association created\n", + tgtport->fc_target_port.port_num, iod->assoc->a_id); + + /* format a response */ + + iod->lsrsp->rsplen = sizeof(*acc); + + nvme_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_cr_assoc_acc)), + FCNVME_LS_CREATE_ASSOCIATION); + acc->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID); + acc->associd.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id)); + acc->associd.association_id = + cpu_to_be64(nvmet_fc_makeconnid(iod->assoc, 0)); + acc->connectid.desc_tag = cpu_to_be32(FCNVME_LSDESC_CONN_ID); + acc->connectid.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_conn_id)); + acc->connectid.connection_id = acc->associd.association_id; +} + +static void +nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_cr_conn_rqst *rqst = &iod->rqstbuf->rq_cr_conn; + struct fcnvme_ls_cr_conn_acc *acc = &iod->rspbuf->rsp_cr_conn; + struct nvmet_fc_tgt_queue *queue; + int ret = 0; + + memset(acc, 0, sizeof(*acc)); + + if (iod->rqstdatalen < sizeof(struct fcnvme_ls_cr_conn_rqst)) + ret = VERR_CR_CONN_LEN; + else if (rqst->desc_list_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_cr_conn_rqst))) + ret = VERR_CR_CONN_RQST_LEN; + else if (rqst->associd.desc_tag != 
cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) + ret = VERR_ASSOC_ID; + else if (rqst->associd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_assoc_id))) + ret = VERR_ASSOC_ID_LEN; + else if (rqst->connect_cmd.desc_tag != + cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD)) + ret = VERR_CR_CONN_CMD; + else if (rqst->connect_cmd.desc_len != + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_cr_conn_cmd))) + ret = VERR_CR_CONN_CMD_LEN; + else if (!rqst->connect_cmd.ersp_ratio || + (be16_to_cpu(rqst->connect_cmd.ersp_ratio) >= + be16_to_cpu(rqst->connect_cmd.sqsize))) + ret = VERR_ERSP_RATIO; + + else { + /* new io queue */ + iod->assoc = nvmet_fc_find_target_assoc(tgtport, + be64_to_cpu(rqst->associd.association_id)); + if (!iod->assoc) + ret = VERR_NO_ASSOC; + else { + queue = nvmet_fc_alloc_target_queue(iod->assoc, + be16_to_cpu(rqst->connect_cmd.qid), + be16_to_cpu(rqst->connect_cmd.sqsize)); + if (!queue) + ret = VERR_QUEUE_ALLOC_FAIL; + + /* release get taken in nvmet_fc_find_target_assoc */ + nvmet_fc_tgt_a_put(iod->assoc); + } + } + + if (ret) { + dev_err(tgtport->dev, + "Create Connection LS failed: %s\n", + validation_errors[ret]); + iod->lsrsp->rsplen = nvme_fc_format_rjt(acc, + sizeof(*acc), rqst->w0.ls_cmd, + (ret == VERR_NO_ASSOC) ? + FCNVME_RJT_RC_INV_ASSOC : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return; + } + + queue->ersp_ratio = be16_to_cpu(rqst->connect_cmd.ersp_ratio); + atomic_set(&queue->connected, 1); + queue->sqhd = 0; /* best place to init value */ + + /* format a response */ + + iod->lsrsp->rsplen = sizeof(*acc); + + nvme_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, + fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)), + FCNVME_LS_CREATE_CONNECTION); + acc->connectid.desc_tag = cpu_to_be32(FCNVME_LSDESC_CONN_ID); + acc->connectid.desc_len = + fcnvme_lsdesc_len( + sizeof(struct fcnvme_lsdesc_conn_id)); + acc->connectid.connection_id = + cpu_to_be64(nvmet_fc_makeconnid(iod->assoc, + be16_to_cpu(rqst->connect_cmd.qid))); +} + +/* + * Returns true if the LS response is to be transmit + * Returns false if the LS response is to be delayed + */ +static int +nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_disconnect_assoc_rqst *rqst = + &iod->rqstbuf->rq_dis_assoc; + struct fcnvme_ls_disconnect_assoc_acc *acc = + &iod->rspbuf->rsp_dis_assoc; + struct nvmet_fc_tgt_assoc *assoc = NULL; + struct nvmet_fc_ls_iod *oldls = NULL; + unsigned long flags; + int ret = 0; + + memset(acc, 0, sizeof(*acc)); + + ret = nvmefc_vldt_lsreq_discon_assoc(iod->rqstdatalen, rqst); + if (!ret) { + /* match an active association - takes an assoc ref if !NULL */ + assoc = nvmet_fc_find_target_assoc(tgtport, + be64_to_cpu(rqst->associd.association_id)); + iod->assoc = assoc; + if (!assoc) + ret = VERR_NO_ASSOC; + } + + if (ret || !assoc) { + dev_err(tgtport->dev, + "Disconnect LS failed: %s\n", + validation_errors[ret]); + iod->lsrsp->rsplen = nvme_fc_format_rjt(acc, + sizeof(*acc), rqst->w0.ls_cmd, + (ret == VERR_NO_ASSOC) ? 
+ FCNVME_RJT_RC_INV_ASSOC : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); + return true; + } + + /* format a response */ + + iod->lsrsp->rsplen = sizeof(*acc); + + nvme_fc_format_rsp_hdr(acc, FCNVME_LS_ACC, + fcnvme_lsdesc_len( + sizeof(struct fcnvme_ls_disconnect_assoc_acc)), + FCNVME_LS_DISCONNECT_ASSOC); + + /* release get taken in nvmet_fc_find_target_assoc */ + nvmet_fc_tgt_a_put(assoc); + + /* + * The rules for LS response says the response cannot + * go back until ABTS's have been sent for all outstanding + * I/O and a Disconnect Association LS has been sent. + * So... save off the Disconnect LS to send the response + * later. If there was a prior LS already saved, replace + * it with the newer one and send a can't perform reject + * on the older one. + */ + spin_lock_irqsave(&tgtport->lock, flags); + oldls = assoc->rcv_disconn; + assoc->rcv_disconn = iod; + spin_unlock_irqrestore(&tgtport->lock, flags); + + nvmet_fc_delete_target_assoc(assoc); + + if (oldls) { + dev_info(tgtport->dev, + "{%d:%d} Multiple Disconnect Association LS's " + "received\n", + tgtport->fc_target_port.port_num, assoc->a_id); + /* overwrite good response with bogus failure */ + oldls->lsrsp->rsplen = nvme_fc_format_rjt(oldls->rspbuf, + sizeof(*iod->rspbuf), + /* ok to use rqst, LS is same */ + rqst->w0.ls_cmd, + FCNVME_RJT_RC_UNAB, + FCNVME_RJT_EXP_NONE, 0); + nvmet_fc_xmt_ls_rsp(tgtport, oldls); + } + + return false; +} + + +/* *********************** NVME Ctrl Routines **************************** */ + + +static void nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req); + +static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops; + +static void +nvmet_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) +{ + struct nvmet_fc_ls_iod *iod = lsrsp->nvme_fc_private; + struct nvmet_fc_tgtport *tgtport = iod->tgtport; + + fc_dma_sync_single_for_cpu(tgtport->dev, iod->rspdma, + sizeof(*iod->rspbuf), DMA_TO_DEVICE); + nvmet_fc_free_ls_iod(tgtport, iod); + nvmet_fc_tgtport_put(tgtport); +} + +static void +nvmet_fc_xmt_ls_rsp(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + int ret; + + fc_dma_sync_single_for_device(tgtport->dev, iod->rspdma, + sizeof(*iod->rspbuf), DMA_TO_DEVICE); + + ret = tgtport->ops->xmt_ls_rsp(&tgtport->fc_target_port, iod->lsrsp); + if (ret) + nvmet_fc_xmt_ls_rsp_done(iod->lsrsp); +} + +/* + * Actual processing routine for received FC-NVME LS Requests from the LLD + */ +static void +nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_ls_iod *iod) +{ + struct fcnvme_ls_rqst_w0 *w0 = &iod->rqstbuf->rq_cr_assoc.w0; + bool sendrsp = true; + + iod->lsrsp->nvme_fc_private = iod; + iod->lsrsp->rspbuf = iod->rspbuf; + iod->lsrsp->rspdma = iod->rspdma; + iod->lsrsp->done = nvmet_fc_xmt_ls_rsp_done; + /* Be preventative. 
handlers will later set to valid length */ + iod->lsrsp->rsplen = 0; + + iod->assoc = NULL; + + /* + * handlers: + * parse request input, execute the request, and format the + * LS response + */ + switch (w0->ls_cmd) { + case FCNVME_LS_CREATE_ASSOCIATION: + /* Creates Association and initial Admin Queue/Connection */ + nvmet_fc_ls_create_association(tgtport, iod); + break; + case FCNVME_LS_CREATE_CONNECTION: + /* Creates an IO Queue/Connection */ + nvmet_fc_ls_create_connection(tgtport, iod); + break; + case FCNVME_LS_DISCONNECT_ASSOC: + /* Terminate a Queue/Connection or the Association */ + sendrsp = nvmet_fc_ls_disconnect(tgtport, iod); + break; + default: + iod->lsrsp->rsplen = nvme_fc_format_rjt(iod->rspbuf, + sizeof(*iod->rspbuf), w0->ls_cmd, + FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); + } + + if (sendrsp) + nvmet_fc_xmt_ls_rsp(tgtport, iod); +} + +/* + * Actual processing routine for received FC-NVME LS Requests from the LLD + */ +static void +nvmet_fc_handle_ls_rqst_work(struct work_struct *work) +{ + struct nvmet_fc_ls_iod *iod = + container_of(work, struct nvmet_fc_ls_iod, work); + struct nvmet_fc_tgtport *tgtport = iod->tgtport; + + nvmet_fc_handle_ls_rqst(tgtport, iod); +} + + +/** + * nvmet_fc_rcv_ls_req - transport entry point called by an LLDD + * upon the reception of a NVME LS request. + * + * The nvmet-fc layer will copy payload to an internal structure for + * processing. As such, upon completion of the routine, the LLDD may + * immediately free/reuse the LS request buffer passed in the call. + * + * If this routine returns error, the LLDD should abort the exchange. + * + * @target_port: pointer to the (registered) target port the LS was + * received on. + * @hosthandle: pointer to the host specific data, gets stored in iod. + * @lsrsp: pointer to a lsrsp structure to be used to reference + * the exchange corresponding to the LS. + * @lsreqbuf: pointer to the buffer containing the LS Request + * @lsreqbuf_len: length, in bytes, of the received LS request + */ +int +nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *target_port, + void *hosthandle, + struct nvmefc_ls_rsp *lsrsp, + void *lsreqbuf, u32 lsreqbuf_len) +{ + struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + struct nvmet_fc_ls_iod *iod; + struct fcnvme_ls_rqst_w0 *w0 = (struct fcnvme_ls_rqst_w0 *)lsreqbuf; + + if (lsreqbuf_len > sizeof(union nvmefc_ls_requests)) { + dev_info(tgtport->dev, + "RCV %s LS failed: payload too large (%d)\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : "", + lsreqbuf_len); + return -E2BIG; + } + + if (!nvmet_fc_tgtport_get(tgtport)) { + dev_info(tgtport->dev, + "RCV %s LS failed: target deleting\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); + return -ESHUTDOWN; + } + + iod = nvmet_fc_alloc_ls_iod(tgtport); + if (!iod) { + dev_info(tgtport->dev, + "RCV %s LS failed: context allocation failed\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? 
+ nvmefc_ls_names[w0->ls_cmd] : ""); + nvmet_fc_tgtport_put(tgtport); + return -ENOENT; + } + + iod->lsrsp = lsrsp; + iod->fcpreq = NULL; + memcpy(iod->rqstbuf, lsreqbuf, lsreqbuf_len); + iod->rqstdatalen = lsreqbuf_len; + iod->hosthandle = hosthandle; + + schedule_work(&iod->work); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_fc_rcv_ls_req); + + +/* + * ********************** + * Start of FCP handling + * ********************** + */ + +static int +nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) +{ + struct scatterlist *sg; + unsigned int nent; + + sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent); + if (!sg) + goto out; + + fod->data_sg = sg; + fod->data_sg_cnt = nent; + fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent, + ((fod->io_dir == NVMET_FCP_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE)); + /* note: write from initiator perspective */ + fod->next_sg = fod->data_sg; + + return 0; + +out: + return NVME_SC_INTERNAL; +} + +static void +nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) +{ + if (!fod->data_sg || !fod->data_sg_cnt) + return; + + fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt, + ((fod->io_dir == NVMET_FCP_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE)); + sgl_free(fod->data_sg); + fod->data_sg = NULL; + fod->data_sg_cnt = 0; +} + + +static bool +queue_90percent_full(struct nvmet_fc_tgt_queue *q, u32 sqhd) +{ + u32 sqtail, used; + + /* egad, this is ugly. And sqtail is just a best guess */ + sqtail = atomic_read(&q->sqtail) % q->sqsize; + + used = (sqtail < sqhd) ? (sqtail + q->sqsize - sqhd) : (sqtail - sqhd); + return ((used * 10) >= (((u32)(q->sqsize - 1) * 9))); +} + +/* + * Prep RSP payload. + * May be a NVMET_FCOP_RSP or NVMET_FCOP_READDATA_RSP op + */ +static void +nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvme_fc_ersp_iu *ersp = &fod->rspiubuf; + struct nvme_common_command *sqe = &fod->cmdiubuf.sqe.common; + struct nvme_completion *cqe = &ersp->cqe; + u32 *cqewd = (u32 *)cqe; + bool send_ersp = false; + u32 rsn, rspcnt, xfr_length; + + if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) + xfr_length = fod->req.transfer_len; + else + xfr_length = fod->offset; + + /* + * check to see if we can send a 0's rsp. + * Note: to send a 0's response, the NVME-FC host transport will + * recreate the CQE. The host transport knows: sq id, SQHD (last + * seen in an ersp), and command_id. Thus it will create a + * zero-filled CQE with those known fields filled in. Transport + * must send an ersp for any condition where the cqe won't match + * this. + * + * Here are the FC-NVME mandated cases where we must send an ersp: + * every N responses, where N=ersp_ratio + * force fabric commands to send ersp's (not in FC-NVME but good + * practice) + * normal cmds: any time status is non-zero, or status is zero + * but words 0 or 1 are non-zero. 
+ * the SQ is 90% or more full + * the cmd is a fused command + * transferred data length not equal to cmd iu length + */ + rspcnt = atomic_inc_return(&fod->queue->zrspcnt); + if (!(rspcnt % fod->queue->ersp_ratio) || + nvme_is_fabrics((struct nvme_command *) sqe) || + xfr_length != fod->req.transfer_len || + (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || + (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || + queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) + send_ersp = true; + + /* re-set the fields */ + fod->fcpreq->rspaddr = ersp; + fod->fcpreq->rspdma = fod->rspdma; + + if (!send_ersp) { + memset(ersp, 0, NVME_FC_SIZEOF_ZEROS_RSP); + fod->fcpreq->rsplen = NVME_FC_SIZEOF_ZEROS_RSP; + } else { + ersp->iu_len = cpu_to_be16(sizeof(*ersp)/sizeof(u32)); + rsn = atomic_inc_return(&fod->queue->rsn); + ersp->rsn = cpu_to_be32(rsn); + ersp->xfrd_len = cpu_to_be32(xfr_length); + fod->fcpreq->rsplen = sizeof(*ersp); + } + + fc_dma_sync_single_for_device(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); +} + +static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq); + +static void +nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + + /* + * if an ABTS was received or we issued the fcp_abort early + * don't call abort routine again. + */ + /* no need to take lock - lock was taken earlier to get here */ + if (!fod->aborted) + tgtport->ops->fcp_abort(&tgtport->fc_target_port, fcpreq); + + nvmet_fc_free_fcp_iod(fod->queue, fod); +} + +static void +nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + int ret; + + fod->fcpreq->op = NVMET_FCOP_RSP; + fod->fcpreq->timeout = 0; + + nvmet_fc_prep_fcp_rsp(tgtport, fod); + + ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq); + if (ret) + nvmet_fc_abort_op(tgtport, fod); +} + +static void +nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod, u8 op) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct scatterlist *sg = fod->next_sg; + unsigned long flags; + u32 remaininglen = fod->req.transfer_len - fod->offset; + u32 tlen = 0; + int ret; + + fcpreq->op = op; + fcpreq->offset = fod->offset; + fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; + + /* + * for next sequence: + * break at a sg element boundary + * attempt to keep sequence length capped at + * NVMET_FC_MAX_SEQ_LENGTH but allow sequence to + * be longer if a single sg element is larger + * than that amount. This is done to avoid creating + * a new sg list to use for the tgtport api. + */ + fcpreq->sg = sg; + fcpreq->sg_cnt = 0; + while (tlen < remaininglen && + fcpreq->sg_cnt < tgtport->max_sg_cnt && + tlen + sg_dma_len(sg) < NVMET_FC_MAX_SEQ_LENGTH) { + fcpreq->sg_cnt++; + tlen += sg_dma_len(sg); + sg = sg_next(sg); + } + if (tlen < remaininglen && fcpreq->sg_cnt == 0) { + fcpreq->sg_cnt++; + tlen += min_t(u32, sg_dma_len(sg), remaininglen); + sg = sg_next(sg); + } + if (tlen < remaininglen) + fod->next_sg = sg; + else + fod->next_sg = NULL; + + fcpreq->transfer_length = tlen; + fcpreq->transferred_length = 0; + fcpreq->fcp_error = 0; + fcpreq->rsplen = 0; + + /* + * If the last READDATA request: check if LLDD supports + * combined xfr with response. 
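+ * When supported, the response is prepared now and the op is
+ * switched to NVMET_FCOP_READDATA_RSP so the LLDD sends the data
+ * and the response together.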
+ */ + if ((op == NVMET_FCOP_READDATA) && + ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) && + (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { + fcpreq->op = NVMET_FCOP_READDATA_RSP; + nvmet_fc_prep_fcp_rsp(tgtport, fod); + } + + ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq); + if (ret) { + /* + * should be ok to set w/o lock as its in the thread of + * execution (not an async timer routine) and doesn't + * contend with any clearing action + */ + fod->abort = true; + + if (op == NVMET_FCOP_WRITEDATA) { + spin_lock_irqsave(&fod->flock, flags); + fod->writedataactive = false; + spin_unlock_irqrestore(&fod->flock, flags); + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); + } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ { + fcpreq->fcp_error = ret; + fcpreq->transferred_length = 0; + nvmet_fc_xmt_fcp_op_done(fod->fcpreq); + } + } +} + +static inline bool +__nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + + /* if in the middle of an io and we need to tear down */ + if (abort) { + if (fcpreq->op == NVMET_FCOP_WRITEDATA) { + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); + return true; + } + + nvmet_fc_abort_op(tgtport, fod); + return true; + } + + return false; +} + +/* + * actual done handler for FCP operations when completed by the lldd + */ +static void +nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + unsigned long flags; + bool abort; + + spin_lock_irqsave(&fod->flock, flags); + abort = fod->abort; + fod->writedataactive = false; + spin_unlock_irqrestore(&fod->flock, flags); + + switch (fcpreq->op) { + + case NVMET_FCOP_WRITEDATA: + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; + if (fcpreq->fcp_error || + fcpreq->transferred_length != fcpreq->transfer_length) { + spin_lock_irqsave(&fod->flock, flags); + fod->abort = true; + spin_unlock_irqrestore(&fod->flock, flags); + + nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); + return; + } + + fod->offset += fcpreq->transferred_length; + if (fod->offset != fod->req.transfer_len) { + spin_lock_irqsave(&fod->flock, flags); + fod->writedataactive = true; + spin_unlock_irqrestore(&fod->flock, flags); + + /* transfer the next chunk */ + nvmet_fc_transfer_fcp_data(tgtport, fod, + NVMET_FCOP_WRITEDATA); + return; + } + + /* data transfer complete, resume with nvmet layer */ + fod->req.execute(&fod->req); + break; + + case NVMET_FCOP_READDATA: + case NVMET_FCOP_READDATA_RSP: + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; + if (fcpreq->fcp_error || + fcpreq->transferred_length != fcpreq->transfer_length) { + nvmet_fc_abort_op(tgtport, fod); + return; + } + + /* success */ + + if (fcpreq->op == NVMET_FCOP_READDATA_RSP) { + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + nvmet_fc_free_fcp_iod(fod->queue, fod); + return; + } + + fod->offset += fcpreq->transferred_length; + if (fod->offset != fod->req.transfer_len) { + /* transfer the next chunk */ + nvmet_fc_transfer_fcp_data(tgtport, fod, + NVMET_FCOP_READDATA); + return; + } + + /* data transfer complete, send response */ + + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + + nvmet_fc_xmt_fcp_rsp(tgtport, fod); + + break; + + case NVMET_FCOP_RSP: + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; + nvmet_fc_free_fcp_iod(fod->queue, fod); + break; + + default: + break; + 
} +} + +static void +nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + + nvmet_fc_fod_op_done(fod); +} + +/* + * actual completion handler after execution by the nvmet layer + */ +static void +__nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod, int status) +{ + struct nvme_common_command *sqe = &fod->cmdiubuf.sqe.common; + struct nvme_completion *cqe = &fod->rspiubuf.cqe; + unsigned long flags; + bool abort; + + spin_lock_irqsave(&fod->flock, flags); + abort = fod->abort; + spin_unlock_irqrestore(&fod->flock, flags); + + /* if we have a CQE, snoop the last sq_head value */ + if (!status) + fod->queue->sqhd = cqe->sq_head; + + if (abort) { + nvmet_fc_abort_op(tgtport, fod); + return; + } + + /* if an error handling the cmd post initial parsing */ + if (status) { + /* fudge up a failed CQE status for our transport error */ + memset(cqe, 0, sizeof(*cqe)); + cqe->sq_head = fod->queue->sqhd; /* echo last cqe sqhd */ + cqe->sq_id = cpu_to_le16(fod->queue->qid); + cqe->command_id = sqe->command_id; + cqe->status = cpu_to_le16(status); + } else { + + /* + * try to push the data even if the SQE status is non-zero. + * There may be a status where data still was intended to + * be moved + */ + if ((fod->io_dir == NVMET_FCP_READ) && (fod->data_sg_cnt)) { + /* push the data over before sending rsp */ + nvmet_fc_transfer_fcp_data(tgtport, fod, + NVMET_FCOP_READDATA); + return; + } + + /* writes & no data - fall thru */ + } + + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + + nvmet_fc_xmt_fcp_rsp(tgtport, fod); +} + + +static void +nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req) +{ + struct nvmet_fc_fcp_iod *fod = nvmet_req_to_fod(nvme_req); + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + + __nvmet_fc_fcp_nvme_cmd_done(tgtport, fod, 0); +} + + +/* + * Actual processing routine for received FC-NVME I/O Requests from the LLD + */ +static void +nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvme_fc_cmd_iu *cmdiu = &fod->cmdiubuf; + u32 xfrlen = be32_to_cpu(cmdiu->data_len); + int ret; + + /* + * Fused commands are currently not supported in the linux + * implementation. + * + * As such, the implementation of the FC transport does not + * look at the fused commands and order delivery to the upper + * layer until we have both based on csn. + */ + + fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; + + if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { + fod->io_dir = NVMET_FCP_WRITE; + if (!nvme_is_write(&cmdiu->sqe)) + goto transport_error; + } else if (cmdiu->flags & FCNVME_CMD_FLAGS_READ) { + fod->io_dir = NVMET_FCP_READ; + if (nvme_is_write(&cmdiu->sqe)) + goto transport_error; + } else { + fod->io_dir = NVMET_FCP_NODATA; + if (xfrlen) + goto transport_error; + } + + fod->req.cmd = &fod->cmdiubuf.sqe; + fod->req.cqe = &fod->rspiubuf.cqe; + if (tgtport->pe) + fod->req.port = tgtport->pe->port; + + /* clear any response payload */ + memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); + + fod->data_sg = NULL; + fod->data_sg_cnt = 0; + + ret = nvmet_req_init(&fod->req, + &fod->queue->nvme_cq, + &fod->queue->nvme_sq, + &nvmet_fc_tgt_fcp_ops); + if (!ret) { + /* bad SQE content or invalid ctrl state */ + /* nvmet layer has already called op done to send rsp. 
*/ + return; + } + + fod->req.transfer_len = xfrlen; + + /* keep a running counter of tail position */ + atomic_inc(&fod->queue->sqtail); + + if (fod->req.transfer_len) { + ret = nvmet_fc_alloc_tgt_pgs(fod); + if (ret) { + nvmet_req_complete(&fod->req, ret); + return; + } + } + fod->req.sg = fod->data_sg; + fod->req.sg_cnt = fod->data_sg_cnt; + fod->offset = 0; + + if (fod->io_dir == NVMET_FCP_WRITE) { + /* pull the data over before invoking nvmet layer */ + nvmet_fc_transfer_fcp_data(tgtport, fod, NVMET_FCOP_WRITEDATA); + return; + } + + /* + * Reads or no data: + * + * can invoke the nvmet_layer now. If read data, cmd completion will + * push the data + */ + fod->req.execute(&fod->req); + return; + +transport_error: + nvmet_fc_abort_op(tgtport, fod); +} + +/** + * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD + * upon the reception of a NVME FCP CMD IU. + * + * Pass a FC-NVME FCP CMD IU received from the FC link to the nvmet-fc + * layer for processing. + * + * The nvmet_fc layer allocates a local job structure (struct + * nvmet_fc_fcp_iod) from the queue for the io and copies the + * CMD IU buffer to the job structure. As such, on a successful + * completion (returns 0), the LLDD may immediately free/reuse + * the CMD IU buffer passed in the call. + * + * However, in some circumstances, due to the packetized nature of FC + * and the api of the FC LLDD which may issue a hw command to send the + * response, but the LLDD may not get the hw completion for that command + * and upcall the nvmet_fc layer before a new command may be + * asynchronously received - its possible for a command to be received + * before the LLDD and nvmet_fc have recycled the job structure. It gives + * the appearance of more commands received than fits in the sq. + * To alleviate this scenario, a temporary queue is maintained in the + * transport for pending LLDD requests waiting for a queue job structure. + * In these "overrun" cases, a temporary queue element is allocated + * the LLDD request and CMD iu buffer information remembered, and the + * routine returns a -EOVERFLOW status. Subsequently, when a queue job + * structure is freed, it is immediately reallocated for anything on the + * pending request list. The LLDDs defer_rcv() callback is called, + * informing the LLDD that it may reuse the CMD IU buffer, and the io + * is then started normally with the transport. + * + * The LLDD, when receiving an -EOVERFLOW completion status, is to treat + * the completion as successful but must not reuse the CMD IU buffer + * until the LLDD's defer_rcv() callback has been called for the + * corresponding struct nvmefc_tgt_fcp_req pointer. + * + * If there is any other condition in which an error occurs, the + * transport will return a non-zero status indicating the error. + * In all cases other than -EOVERFLOW, the transport has not accepted the + * request and the LLDD should abort the exchange. + * + * @target_port: pointer to the (registered) target port the FCP CMD IU + * was received on. + * @fcpreq: pointer to a fcpreq request structure to be used to reference + * the exchange corresponding to the FCP Exchange. 
+ * @cmdiubuf: pointer to the buffer containing the FCP CMD IU + * @cmdiubuf_len: length, in bytes, of the received FCP CMD IU + */ +int +nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, + struct nvmefc_tgt_fcp_req *fcpreq, + void *cmdiubuf, u32 cmdiubuf_len) +{ + struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port); + struct nvme_fc_cmd_iu *cmdiu = cmdiubuf; + struct nvmet_fc_tgt_queue *queue; + struct nvmet_fc_fcp_iod *fod; + struct nvmet_fc_defer_fcp_req *deferfcp; + unsigned long flags; + + /* validate iu, so the connection id can be used to find the queue */ + if ((cmdiubuf_len != sizeof(*cmdiu)) || + (cmdiu->format_id != NVME_CMD_FORMAT_ID) || + (cmdiu->fc_id != NVME_CMD_FC_ID) || + (be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4))) + return -EIO; + + queue = nvmet_fc_find_target_queue(tgtport, + be64_to_cpu(cmdiu->connection_id)); + if (!queue) + return -ENOTCONN; + + /* + * note: reference taken by find_target_queue + * After successful fod allocation, the fod will inherit the + * ownership of that reference and will remove the reference + * when the fod is freed. + */ + + spin_lock_irqsave(&queue->qlock, flags); + + fod = nvmet_fc_alloc_fcp_iod(queue); + if (fod) { + spin_unlock_irqrestore(&queue->qlock, flags); + + fcpreq->nvmet_fc_private = fod; + fod->fcpreq = fcpreq; + + memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len); + + nvmet_fc_queue_fcp_req(tgtport, queue, fcpreq); + + return 0; + } + + if (!tgtport->ops->defer_rcv) { + spin_unlock_irqrestore(&queue->qlock, flags); + /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + return -ENOENT; + } + + deferfcp = list_first_entry_or_null(&queue->avail_defer_list, + struct nvmet_fc_defer_fcp_req, req_list); + if (deferfcp) { + /* Just re-use one that was previously allocated */ + list_del(&deferfcp->req_list); + } else { + spin_unlock_irqrestore(&queue->qlock, flags); + + /* Now we need to dynamically allocate one */ + deferfcp = kmalloc(sizeof(*deferfcp), GFP_KERNEL); + if (!deferfcp) { + /* release the queue lookup reference */ + nvmet_fc_tgt_q_put(queue); + return -ENOMEM; + } + spin_lock_irqsave(&queue->qlock, flags); + } + + /* For now, use rspaddr / rsplen to save payload information */ + fcpreq->rspaddr = cmdiubuf; + fcpreq->rsplen = cmdiubuf_len; + deferfcp->fcp_req = fcpreq; + + /* defer processing till a fod becomes available */ + list_add_tail(&deferfcp->req_list, &queue->pending_cmd_list); + + /* NOTE: the queue lookup reference is still valid */ + + spin_unlock_irqrestore(&queue->qlock, flags); + + return -EOVERFLOW; +} +EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req); + +/** + * nvmet_fc_rcv_fcp_abort - transport entry point called by an LLDD + * upon the reception of an ABTS for a FCP command + * + * Notify the transport that an ABTS has been received for a FCP command + * that had been given to the transport via nvmet_fc_rcv_fcp_req(). The + * LLDD believes the command is still being worked on + * (template_ops->fcp_req_release() has not been called). + * + * The transport will wait for any outstanding work (an op to the LLDD, + * which the lldd should complete with error due to the ABTS; or the + * completion from the nvmet layer of the nvme command), then will + * stop processing and call the nvmet_fc_rcv_fcp_req() callback to + * return the i/o context to the LLDD. The LLDD may send the BA_ACC + * to the ABTS either after return from this function (assuming any + * outstanding op work has been terminated) or upon the callback being + * called. 
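+ * If the command has already been completed and its context released,
+ * the abort notification is ignored.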
+ * + * @target_port: pointer to the (registered) target port the FCP CMD IU + * was received on. + * @fcpreq: pointer to the fcpreq request structure that corresponds + * to the exchange that received the ABTS. + */ +void +nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port, + struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + struct nvmet_fc_tgt_queue *queue; + unsigned long flags; + + if (!fod || fod->fcpreq != fcpreq) + /* job appears to have already completed, ignore abort */ + return; + + queue = fod->queue; + + spin_lock_irqsave(&queue->qlock, flags); + if (fod->active) { + /* + * mark as abort. The abort handler, invoked upon completion + * of any work, will detect the aborted status and do the + * callback. + */ + spin_lock(&fod->flock); + fod->abort = true; + fod->aborted = true; + spin_unlock(&fod->flock); + } + spin_unlock_irqrestore(&queue->qlock, flags); +} +EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort); + + +struct nvmet_fc_traddr { + u64 nn; + u64 pn; +}; + +static int +__nvme_fc_parse_u64(substring_t *sstr, u64 *val) +{ + u64 token64; + + if (match_u64(sstr, &token64)) + return -EINVAL; + *val = token64; + + return 0; +} + +/* + * This routine validates and extracts the WWN's from the TRADDR string. + * As kernel parsers need the 0x to determine number base, universally + * build string to parse with 0x prefix before parsing name strings. + */ +static int +nvme_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf, size_t blen) +{ + char name[2 + NVME_FC_TRADDR_HEXNAMELEN + 1]; + substring_t wwn = { name, &name[sizeof(name)-1] }; + int nnoffset, pnoffset; + + /* validate if string is one of the 2 allowed formats */ + if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH && + !strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET], + "pn-0x", NVME_FC_TRADDR_OXNNLEN)) { + nnoffset = NVME_FC_TRADDR_OXNNLEN; + pnoffset = NVME_FC_TRADDR_MAX_PN_OFFSET + + NVME_FC_TRADDR_OXNNLEN; + } else if ((strnlen(buf, blen) == NVME_FC_TRADDR_MINLENGTH && + !strncmp(buf, "nn-", NVME_FC_TRADDR_NNLEN) && + !strncmp(&buf[NVME_FC_TRADDR_MIN_PN_OFFSET], + "pn-", NVME_FC_TRADDR_NNLEN))) { + nnoffset = NVME_FC_TRADDR_NNLEN; + pnoffset = NVME_FC_TRADDR_MIN_PN_OFFSET + NVME_FC_TRADDR_NNLEN; + } else + goto out_einval; + + name[0] = '0'; + name[1] = 'x'; + name[2 + NVME_FC_TRADDR_HEXNAMELEN] = 0; + + memcpy(&name[2], &buf[nnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->nn)) + goto out_einval; + + memcpy(&name[2], &buf[pnoffset], NVME_FC_TRADDR_HEXNAMELEN); + if (__nvme_fc_parse_u64(&wwn, &traddr->pn)) + goto out_einval; + + return 0; + +out_einval: + pr_warn("%s: bad traddr string\n", __func__); + return -EINVAL; +} + +static int +nvmet_fc_add_port(struct nvmet_port *port) +{ + struct nvmet_fc_tgtport *tgtport; + struct nvmet_fc_port_entry *pe; + struct nvmet_fc_traddr traddr = { 0L, 0L }; + unsigned long flags; + int ret; + + /* validate the address info */ + if ((port->disc_addr.trtype != NVMF_TRTYPE_FC) || + (port->disc_addr.adrfam != NVMF_ADDR_FAMILY_FC)) + return -EINVAL; + + /* map the traddr address info to a target port */ + + ret = nvme_fc_parse_traddr(&traddr, port->disc_addr.traddr, + sizeof(port->disc_addr.traddr)); + if (ret) + return ret; + + pe = kzalloc(sizeof(*pe), GFP_KERNEL); + if (!pe) + return -ENOMEM; + + ret = -ENXIO; + spin_lock_irqsave(&nvmet_fc_tgtlock, flags); + list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) { + if 
((tgtport->fc_target_port.node_name == traddr.nn) && + (tgtport->fc_target_port.port_name == traddr.pn)) { + /* a FC port can only be 1 nvmet port id */ + if (!tgtport->pe) { + nvmet_fc_portentry_bind(tgtport, pe, port); + ret = 0; + } else + ret = -EALREADY; + break; + } + } + spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags); + + if (ret) + kfree(pe); + + return ret; +} + +static void +nvmet_fc_remove_port(struct nvmet_port *port) +{ + struct nvmet_fc_port_entry *pe = port->priv; + + nvmet_fc_portentry_unbind(pe); + + kfree(pe); +} + +static void +nvmet_fc_discovery_chg(struct nvmet_port *port) +{ + struct nvmet_fc_port_entry *pe = port->priv; + struct nvmet_fc_tgtport *tgtport = pe->tgtport; + + if (tgtport && tgtport->ops->discovery_event) + tgtport->ops->discovery_event(&tgtport->fc_target_port); +} + +static const struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_FC, + .msdbd = 1, + .add_port = nvmet_fc_add_port, + .remove_port = nvmet_fc_remove_port, + .queue_response = nvmet_fc_fcp_nvme_cmd_done, + .delete_ctrl = nvmet_fc_delete_ctrl, + .discovery_chg = nvmet_fc_discovery_chg, +}; + +static int __init nvmet_fc_init_module(void) +{ + return nvmet_register_transport(&nvmet_fc_tgt_fcp_ops); +} + +static void __exit nvmet_fc_exit_module(void) +{ + /* sanity check - all lports should be removed */ + if (!list_empty(&nvmet_fc_target_list)) + pr_warn("%s: targetport list not empty\n", __func__); + + nvmet_unregister_transport(&nvmet_fc_tgt_fcp_ops); + + ida_destroy(&nvmet_fc_tgtport_cnt); +} + +module_init(nvmet_fc_init_module); +module_exit(nvmet_fc_exit_module); + +MODULE_LICENSE("GPL v2"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fcloop.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fcloop.c new file mode 100644 index 0000000..f55979e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/fcloop.c @@ -0,0 +1,1654 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. 
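For reference, nvme_fc_parse_traddr() above accepts either of the two spellings shown below (the WWN values are made-up examples); both resolve to traddr->nn = 0x20000090fa5357dd and traddr->pn = 0x10000090fa5357dd:

    nn-0x20000090fa5357dd:pn-0x10000090fa5357dd
    nn-20000090fa5357dd:pn-10000090fa5357dd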
+ */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include + +#include "../host/nvme.h" +#include "../target/nvmet.h" +#include +#include + + +enum { + NVMF_OPT_ERR = 0, + NVMF_OPT_WWNN = 1 << 0, + NVMF_OPT_WWPN = 1 << 1, + NVMF_OPT_ROLES = 1 << 2, + NVMF_OPT_FCADDR = 1 << 3, + NVMF_OPT_LPWWNN = 1 << 4, + NVMF_OPT_LPWWPN = 1 << 5, +}; + +struct fcloop_ctrl_options { + int mask; + u64 wwnn; + u64 wwpn; + u32 roles; + u32 fcaddr; + u64 lpwwnn; + u64 lpwwpn; +}; + +static const match_table_t opt_tokens = { + { NVMF_OPT_WWNN, "wwnn=%s" }, + { NVMF_OPT_WWPN, "wwpn=%s" }, + { NVMF_OPT_ROLES, "roles=%d" }, + { NVMF_OPT_FCADDR, "fcaddr=%x" }, + { NVMF_OPT_LPWWNN, "lpwwnn=%s" }, + { NVMF_OPT_LPWWPN, "lpwwpn=%s" }, + { NVMF_OPT_ERR, NULL } +}; + +static int fcloop_verify_addr(substring_t *s) +{ + size_t blen = s->to - s->from + 1; + + if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 || + strncmp(s->from, "0x", 2)) + return -EINVAL; + + return 0; +} + +static int +fcloop_parse_options(struct fcloop_ctrl_options *opts, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + u64 token64; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + opts->mask |= token; + switch (token) { + case NVMF_OPT_WWNN: + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->wwnn = token64; + break; + case NVMF_OPT_WWPN: + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->wwpn = token64; + break; + case NVMF_OPT_ROLES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out_free_options; + } + opts->roles = token; + break; + case NVMF_OPT_FCADDR: + if (match_hex(args, &token)) { + ret = -EINVAL; + goto out_free_options; + } + opts->fcaddr = token; + break; + case NVMF_OPT_LPWWNN: + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->lpwwnn = token64; + break; + case NVMF_OPT_LPWWPN: + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + opts->lpwwpn = token64; + break; + default: + pr_warn("unknown parameter or missing value '%s'\n", p); + ret = -EINVAL; + goto out_free_options; + } + } + +out_free_options: + kfree(options); + return ret; +} + + +static int +fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + u64 token64; + + *nname = -1; + *pname = -1; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + switch (token) { + case NVMF_OPT_WWNN: + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + *nname = token64; + break; + case NVMF_OPT_WWPN: + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { + ret = -EINVAL; + goto out_free_options; + } + *pname = token64; + break; + default: + pr_warn("unknown parameter or missing value '%s'\n", p); + ret = -EINVAL; + goto out_free_options; + } + } + +out_free_options: + kfree(options); + + if (!ret) { + if (*nname == -1) + return -EINVAL; + if 
(*pname == -1) + return -EINVAL; + } + + return ret; +} + + +#define LPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN) + +#define RPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN | \ + NVMF_OPT_LPWWNN | NVMF_OPT_LPWWPN) + +#define TGTPORT_OPTS (NVMF_OPT_WWNN | NVMF_OPT_WWPN) + + +static DEFINE_SPINLOCK(fcloop_lock); +static LIST_HEAD(fcloop_lports); +static LIST_HEAD(fcloop_nports); + +struct fcloop_lport { + struct nvme_fc_local_port *localport; + struct list_head lport_list; + struct completion unreg_done; +}; + +struct fcloop_lport_priv { + struct fcloop_lport *lport; +}; + +struct fcloop_rport { + struct nvme_fc_remote_port *remoteport; + struct nvmet_fc_target_port *targetport; + struct fcloop_nport *nport; + struct fcloop_lport *lport; + spinlock_t lock; + struct list_head ls_list; + struct work_struct ls_work; +}; + +struct fcloop_tport { + struct nvmet_fc_target_port *targetport; + struct nvme_fc_remote_port *remoteport; + struct fcloop_nport *nport; + struct fcloop_lport *lport; + spinlock_t lock; + struct list_head ls_list; + struct work_struct ls_work; +}; + +struct fcloop_nport { + struct fcloop_rport *rport; + struct fcloop_tport *tport; + struct fcloop_lport *lport; + struct list_head nport_list; + struct kref ref; + u64 node_name; + u64 port_name; + u32 port_role; + u32 port_id; +}; + +struct fcloop_lsreq { + struct nvmefc_ls_req *lsreq; + struct nvmefc_ls_rsp ls_rsp; + int lsdir; /* H2T or T2H */ + int status; + struct list_head ls_list; /* fcloop_rport->ls_list */ +}; + +struct fcloop_rscn { + struct fcloop_tport *tport; + struct work_struct work; +}; + +enum { + INI_IO_START = 0, + INI_IO_ACTIVE = 1, + INI_IO_ABORTED = 2, + INI_IO_COMPLETED = 3, +}; + +struct fcloop_fcpreq { + struct fcloop_tport *tport; + struct nvmefc_fcp_req *fcpreq; + spinlock_t reqlock; + u16 status; + u32 inistate; + bool active; + bool aborted; + struct kref ref; + struct work_struct fcp_rcv_work; + struct work_struct abort_rcv_work; + struct work_struct tio_done_work; + struct nvmefc_tgt_fcp_req tgt_fcp_req; +}; + +struct fcloop_ini_fcpreq { + struct nvmefc_fcp_req *fcpreq; + struct fcloop_fcpreq *tfcp_req; + spinlock_t inilock; +}; + +static inline struct fcloop_lsreq * +ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp) +{ + return container_of(lsrsp, struct fcloop_lsreq, ls_rsp); +} + +static inline struct fcloop_fcpreq * +tgt_fcp_req_to_fcpreq(struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + return container_of(tgt_fcpreq, struct fcloop_fcpreq, tgt_fcp_req); +} + + +static int +fcloop_create_queue(struct nvme_fc_local_port *localport, + unsigned int qidx, u16 qsize, + void **handle) +{ + *handle = localport; + return 0; +} + +static void +fcloop_delete_queue(struct nvme_fc_local_port *localport, + unsigned int idx, void *handle) +{ +} + +static void +fcloop_rport_lsrqst_work(struct work_struct *work) +{ + struct fcloop_rport *rport = + container_of(work, struct fcloop_rport, ls_work); + struct fcloop_lsreq *tls_req; + + spin_lock(&rport->lock); + for (;;) { + tls_req = list_first_entry_or_null(&rport->ls_list, + struct fcloop_lsreq, ls_list); + if (!tls_req) + break; + + list_del(&tls_req->ls_list); + spin_unlock(&rport->lock); + + tls_req->lsreq->done(tls_req->lsreq, tls_req->status); + /* + * callee may free memory containing tls_req. + * do not reference lsreq after this. 
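Tying the parsing above together: fcloop_verify_addr() only accepts WWN values written as "0x" followed by exactly 16 hex digits, and the *_OPTS masks name the keys each port type must supply. Illustrative option strings (all values made up) are:

    local or target port:
        wwnn=0x20000090fa5357dd,wwpn=0x10000090fa5357dd
    remote port (must also name the local port it attaches to):
        wwnn=0x20000090fa5357de,wwpn=0x10000090fa5357de,lpwwnn=0x20000090fa5357dd,lpwwpn=0x10000090fa5357dd

roles= (decimal) and fcaddr= (hex) are optional extras handled by fcloop_parse_options().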
+ */ + + spin_lock(&rport->lock); + } + spin_unlock(&rport->lock); +} + +static int +fcloop_h2t_ls_req(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + struct nvmefc_ls_req *lsreq) +{ + struct fcloop_lsreq *tls_req = lsreq->private; + struct fcloop_rport *rport = remoteport->private; + int ret = 0; + + tls_req->lsreq = lsreq; + INIT_LIST_HEAD(&tls_req->ls_list); + + if (!rport->targetport) { + tls_req->status = -ECONNREFUSED; + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); + return ret; + } + + tls_req->status = 0; + ret = nvmet_fc_rcv_ls_req(rport->targetport, rport, + &tls_req->ls_rsp, + lsreq->rqstaddr, lsreq->rqstlen); + + return ret; +} + +static int +fcloop_h2t_xmt_ls_rsp(struct nvmet_fc_target_port *targetport, + struct nvmefc_ls_rsp *lsrsp) +{ + struct fcloop_lsreq *tls_req = ls_rsp_to_lsreq(lsrsp); + struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_tport *tport = targetport->private; + struct nvme_fc_remote_port *remoteport = tport->remoteport; + struct fcloop_rport *rport; + + memcpy(lsreq->rspaddr, lsrsp->rspbuf, + ((lsreq->rsplen < lsrsp->rsplen) ? + lsreq->rsplen : lsrsp->rsplen)); + + lsrsp->done(lsrsp); + + if (remoteport) { + rport = remoteport->private; + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); + } + + return 0; +} + +static void +fcloop_tport_lsrqst_work(struct work_struct *work) +{ + struct fcloop_tport *tport = + container_of(work, struct fcloop_tport, ls_work); + struct fcloop_lsreq *tls_req; + + spin_lock(&tport->lock); + for (;;) { + tls_req = list_first_entry_or_null(&tport->ls_list, + struct fcloop_lsreq, ls_list); + if (!tls_req) + break; + + list_del(&tls_req->ls_list); + spin_unlock(&tport->lock); + + tls_req->lsreq->done(tls_req->lsreq, tls_req->status); + /* + * callee may free memory containing tls_req. + * do not reference lsreq after this. + */ + + spin_lock(&tport->lock); + } + spin_unlock(&tport->lock); +} + +static int +fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle, + struct nvmefc_ls_req *lsreq) +{ + struct fcloop_lsreq *tls_req = lsreq->private; + struct fcloop_tport *tport = targetport->private; + int ret = 0; + + /* + * hosthandle should be the dst.rport value. + * hosthandle ignored as fcloop currently is + * 1:1 tgtport vs remoteport + */ + tls_req->lsreq = lsreq; + INIT_LIST_HEAD(&tls_req->ls_list); + + if (!tport->remoteport) { + tls_req->status = -ECONNREFUSED; + spin_lock(&tport->lock); + list_add_tail(&tport->ls_list, &tls_req->ls_list); + spin_unlock(&tport->lock); + schedule_work(&tport->ls_work); + return ret; + } + + tls_req->status = 0; + ret = nvme_fc_rcv_ls_req(tport->remoteport, &tls_req->ls_rsp, + lsreq->rqstaddr, lsreq->rqstlen); + + return ret; +} + +static int +fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + struct nvmefc_ls_rsp *lsrsp) +{ + struct fcloop_lsreq *tls_req = ls_rsp_to_lsreq(lsrsp); + struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_rport *rport = remoteport->private; + struct nvmet_fc_target_port *targetport = rport->targetport; + struct fcloop_tport *tport; + + memcpy(lsreq->rspaddr, lsrsp->rspbuf, + ((lsreq->rsplen < lsrsp->rsplen) ? 
+ lsreq->rsplen : lsrsp->rsplen)); + lsrsp->done(lsrsp); + + if (targetport) { + tport = targetport->private; + spin_lock(&tport->lock); + list_add_tail(&tport->ls_list, &tls_req->ls_list); + spin_unlock(&tport->lock); + schedule_work(&tport->ls_work); + } + + return 0; +} + +static void +fcloop_t2h_host_release(void *hosthandle) +{ + /* host handle ignored for now */ +} + +/* + * Simulate reception of RSCN and converting it to a initiator transport + * call to rescan a remote port. + */ +static void +fcloop_tgt_rscn_work(struct work_struct *work) +{ + struct fcloop_rscn *tgt_rscn = + container_of(work, struct fcloop_rscn, work); + struct fcloop_tport *tport = tgt_rscn->tport; + + if (tport->remoteport) + nvme_fc_rescan_remoteport(tport->remoteport); + kfree(tgt_rscn); +} + +static void +fcloop_tgt_discovery_evt(struct nvmet_fc_target_port *tgtport) +{ + struct fcloop_rscn *tgt_rscn; + + tgt_rscn = kzalloc(sizeof(*tgt_rscn), GFP_KERNEL); + if (!tgt_rscn) + return; + + tgt_rscn->tport = tgtport->private; + INIT_WORK(&tgt_rscn->work, fcloop_tgt_rscn_work); + + schedule_work(&tgt_rscn->work); +} + +static void +fcloop_tfcp_req_free(struct kref *ref) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(ref, struct fcloop_fcpreq, ref); + + kfree(tfcp_req); +} + +static void +fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req) +{ + kref_put(&tfcp_req->ref, fcloop_tfcp_req_free); +} + +static int +fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req) +{ + return kref_get_unless_zero(&tfcp_req->ref); +} + +static void +fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, + struct fcloop_fcpreq *tfcp_req, int status) +{ + struct fcloop_ini_fcpreq *inireq = NULL; + + if (fcpreq) { + inireq = fcpreq->private; + spin_lock(&inireq->inilock); + inireq->tfcp_req = NULL; + spin_unlock(&inireq->inilock); + + fcpreq->status = status; + fcpreq->done(fcpreq); + } + + /* release original io reference on tgt struct */ + fcloop_tfcp_req_put(tfcp_req); +} + +static bool drop_fabric_opcode; +#define DROP_OPCODE_MASK 0x00FF +/* fabrics opcode will have a bit set above 1st byte */ +static int drop_opcode = -1; +static int drop_instance; +static int drop_amount; +static int drop_current_cnt; + +/* + * Routine to parse io and determine if the io is to be dropped. + * Returns: + * 0 if io is not obstructed + * 1 if io was dropped + */ +static int check_for_drop(struct fcloop_fcpreq *tfcp_req) +{ + struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvme_fc_cmd_iu *cmdiu = fcpreq->cmdaddr; + struct nvme_command *sqe = &cmdiu->sqe; + + if (drop_opcode == -1) + return 0; + + pr_info("%s: seq opcd x%02x fctype x%02x: drop F %s op x%02x " + "inst %d start %d amt %d\n", + __func__, sqe->common.opcode, sqe->fabrics.fctype, + drop_fabric_opcode ? 
"y" : "n", + drop_opcode, drop_current_cnt, drop_instance, drop_amount); + + if ((drop_fabric_opcode && + (sqe->common.opcode != nvme_fabrics_command || + sqe->fabrics.fctype != drop_opcode)) || + (!drop_fabric_opcode && sqe->common.opcode != drop_opcode)) + return 0; + + if (++drop_current_cnt >= drop_instance) { + if (drop_current_cnt >= drop_instance + drop_amount) + drop_opcode = -1; + return 1; + } + + return 0; +} + +static void +fcloop_fcp_recv_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, fcp_rcv_work); + struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + int ret = 0; + bool aborted = false; + + spin_lock_irq(&tfcp_req->reqlock); + switch (tfcp_req->inistate) { + case INI_IO_START: + tfcp_req->inistate = INI_IO_ACTIVE; + break; + case INI_IO_ABORTED: + aborted = true; + break; + default: + spin_unlock_irq(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + spin_unlock_irq(&tfcp_req->reqlock); + + if (unlikely(aborted)) + ret = -ECANCELED; + else { + if (likely(!check_for_drop(tfcp_req))) + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req, + fcpreq->cmdaddr, fcpreq->cmdlen); + else + pr_info("%s: dropped command ********\n", __func__); + } + if (ret) + fcloop_call_host_done(fcpreq, tfcp_req, ret); + + return; +} + +static void +fcloop_fcp_abort_recv_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, abort_rcv_work); + struct nvmefc_fcp_req *fcpreq; + bool completed = false; + + spin_lock_irq(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + switch (tfcp_req->inistate) { + case INI_IO_ABORTED: + break; + case INI_IO_COMPLETED: + completed = true; + break; + default: + spin_unlock_irq(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + spin_unlock_irq(&tfcp_req->reqlock); + + if (unlikely(completed)) { + /* remove reference taken in original abort downcall */ + fcloop_tfcp_req_put(tfcp_req); + return; + } + + if (tfcp_req->tport->targetport) + nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, + &tfcp_req->tgt_fcp_req); + + spin_lock_irq(&tfcp_req->reqlock); + tfcp_req->fcpreq = NULL; + spin_unlock_irq(&tfcp_req->reqlock); + + fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); + /* call_host_done releases reference for abort downcall */ +} + +/* + * FCP IO operation done by target completion. + * call back up initiator "done" flows. 
+ */ +static void +fcloop_tgt_fcprqst_done_work(struct work_struct *work) +{ + struct fcloop_fcpreq *tfcp_req = + container_of(work, struct fcloop_fcpreq, tio_done_work); + struct nvmefc_fcp_req *fcpreq; + + spin_lock_irq(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + tfcp_req->inistate = INI_IO_COMPLETED; + spin_unlock_irq(&tfcp_req->reqlock); + + fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); +} + + +static int +fcloop_fcp_req(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + void *hw_queue_handle, + struct nvmefc_fcp_req *fcpreq) +{ + struct fcloop_rport *rport = remoteport->private; + struct fcloop_ini_fcpreq *inireq = fcpreq->private; + struct fcloop_fcpreq *tfcp_req; + + if (!rport->targetport) + return -ECONNREFUSED; + + tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_ATOMIC); + if (!tfcp_req) + return -ENOMEM; + + inireq->fcpreq = fcpreq; + inireq->tfcp_req = tfcp_req; + spin_lock_init(&inireq->inilock); + + tfcp_req->fcpreq = fcpreq; + tfcp_req->tport = rport->targetport->private; + tfcp_req->inistate = INI_IO_START; + spin_lock_init(&tfcp_req->reqlock); + INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work); + INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work); + INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work); + kref_init(&tfcp_req->ref); + + schedule_work(&tfcp_req->fcp_rcv_work); + + return 0; +} + +static void +fcloop_fcp_copy_data(u8 op, struct scatterlist *data_sg, + struct scatterlist *io_sg, u32 offset, u32 length) +{ + void *data_p, *io_p; + u32 data_len, io_len, tlen; + + io_p = sg_virt(io_sg); + io_len = io_sg->length; + + for ( ; offset; ) { + tlen = min_t(u32, offset, io_len); + offset -= tlen; + io_len -= tlen; + if (!io_len) { + io_sg = sg_next(io_sg); + io_p = sg_virt(io_sg); + io_len = io_sg->length; + } else + io_p += tlen; + } + + data_p = sg_virt(data_sg); + data_len = data_sg->length; + + for ( ; length; ) { + tlen = min_t(u32, io_len, data_len); + tlen = min_t(u32, tlen, length); + + if (op == NVMET_FCOP_WRITEDATA) + memcpy(data_p, io_p, tlen); + else + memcpy(io_p, data_p, tlen); + + length -= tlen; + + io_len -= tlen; + if ((!io_len) && (length)) { + io_sg = sg_next(io_sg); + io_p = sg_virt(io_sg); + io_len = io_sg->length; + } else + io_p += tlen; + + data_len -= tlen; + if ((!data_len) && (length)) { + data_sg = sg_next(data_sg); + data_p = sg_virt(data_sg); + data_len = data_sg->length; + } else + data_p += tlen; + } +} + +static int +fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + struct nvmefc_fcp_req *fcpreq; + u32 rsplen = 0, xfrlen = 0; + int fcp_err = 0, active, aborted; + u8 op = tgt_fcpreq->op; + + spin_lock_irq(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + active = tfcp_req->active; + aborted = tfcp_req->aborted; + tfcp_req->active = true; + spin_unlock_irq(&tfcp_req->reqlock); + + if (unlikely(active)) + /* illegal - call while i/o active */ + return -EALREADY; + + if (unlikely(aborted)) { + /* target transport has aborted i/o prior */ + spin_lock_irq(&tfcp_req->reqlock); + tfcp_req->active = false; + spin_unlock_irq(&tfcp_req->reqlock); + tgt_fcpreq->transferred_length = 0; + tgt_fcpreq->fcp_error = -ECANCELED; + tgt_fcpreq->done(tgt_fcpreq); + return 0; + } + + /* + * if fcpreq is NULL, the I/O has been aborted (from + * initiator side). For the target side, act as if all is well + * but don't actually move data. 
+ */ + + switch (op) { + case NVMET_FCOP_WRITEDATA: + xfrlen = tgt_fcpreq->transfer_length; + if (fcpreq) { + fcloop_fcp_copy_data(op, tgt_fcpreq->sg, + fcpreq->first_sgl, tgt_fcpreq->offset, + xfrlen); + fcpreq->transferred_length += xfrlen; + } + break; + + case NVMET_FCOP_READDATA: + case NVMET_FCOP_READDATA_RSP: + xfrlen = tgt_fcpreq->transfer_length; + if (fcpreq) { + fcloop_fcp_copy_data(op, tgt_fcpreq->sg, + fcpreq->first_sgl, tgt_fcpreq->offset, + xfrlen); + fcpreq->transferred_length += xfrlen; + } + if (op == NVMET_FCOP_READDATA) + break; + + /* Fall-Thru to RSP handling */ + fallthrough; + + case NVMET_FCOP_RSP: + if (fcpreq) { + rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ? + fcpreq->rsplen : tgt_fcpreq->rsplen); + memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen); + if (rsplen < tgt_fcpreq->rsplen) + fcp_err = -E2BIG; + fcpreq->rcv_rsplen = rsplen; + fcpreq->status = 0; + } + tfcp_req->status = 0; + break; + + default: + fcp_err = -EINVAL; + break; + } + + spin_lock_irq(&tfcp_req->reqlock); + tfcp_req->active = false; + spin_unlock_irq(&tfcp_req->reqlock); + + tgt_fcpreq->transferred_length = xfrlen; + tgt_fcpreq->fcp_error = fcp_err; + tgt_fcpreq->done(tgt_fcpreq); + + return 0; +} + +static void +fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + + /* + * mark aborted only in case there were 2 threads in transport + * (one doing io, other doing abort) and only kills ops posted + * after the abort request + */ + spin_lock_irq(&tfcp_req->reqlock); + tfcp_req->aborted = true; + spin_unlock_irq(&tfcp_req->reqlock); + + tfcp_req->status = NVME_SC_INTERNAL; + + /* + * nothing more to do. If io wasn't active, the transport should + * immediately call the req_release. If it was active, the op + * will complete, and the lldd should call req_release. + */ +} + +static void +fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + + schedule_work(&tfcp_req->tio_done_work); +} + +static void +fcloop_h2t_ls_abort(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + struct nvmefc_ls_req *lsreq) +{ +} + +static void +fcloop_t2h_ls_abort(struct nvmet_fc_target_port *targetport, + void *hosthandle, struct nvmefc_ls_req *lsreq) +{ +} + +static void +fcloop_fcp_abort(struct nvme_fc_local_port *localport, + struct nvme_fc_remote_port *remoteport, + void *hw_queue_handle, + struct nvmefc_fcp_req *fcpreq) +{ + struct fcloop_ini_fcpreq *inireq = fcpreq->private; + struct fcloop_fcpreq *tfcp_req; + bool abortio = true; + + spin_lock(&inireq->inilock); + tfcp_req = inireq->tfcp_req; + if (tfcp_req) + fcloop_tfcp_req_get(tfcp_req); + spin_unlock(&inireq->inilock); + + if (!tfcp_req) + /* abort has already been called */ + return; + + /* break initiator/target relationship for io */ + spin_lock_irq(&tfcp_req->reqlock); + switch (tfcp_req->inistate) { + case INI_IO_START: + case INI_IO_ACTIVE: + tfcp_req->inistate = INI_IO_ABORTED; + break; + case INI_IO_COMPLETED: + abortio = false; + break; + default: + spin_unlock_irq(&tfcp_req->reqlock); + WARN_ON(1); + return; + } + spin_unlock_irq(&tfcp_req->reqlock); + + if (abortio) + /* leave the reference while the work item is scheduled */ + WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work)); + else { + /* + * as the io has already had the done callback made, + * nothing more to do. 
So release the reference taken above + */ + fcloop_tfcp_req_put(tfcp_req); + } +} + +static void +fcloop_nport_free(struct kref *ref) +{ + struct fcloop_nport *nport = + container_of(ref, struct fcloop_nport, ref); + unsigned long flags; + + spin_lock_irqsave(&fcloop_lock, flags); + list_del(&nport->nport_list); + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(nport); +} + +static void +fcloop_nport_put(struct fcloop_nport *nport) +{ + kref_put(&nport->ref, fcloop_nport_free); +} + +static int +fcloop_nport_get(struct fcloop_nport *nport) +{ + return kref_get_unless_zero(&nport->ref); +} + +static void +fcloop_localport_delete(struct nvme_fc_local_port *localport) +{ + struct fcloop_lport_priv *lport_priv = localport->private; + struct fcloop_lport *lport = lport_priv->lport; + + /* release any threads waiting for the unreg to complete */ + complete(&lport->unreg_done); +} + +static void +fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) +{ + struct fcloop_rport *rport = remoteport->private; + + flush_work(&rport->ls_work); + fcloop_nport_put(rport->nport); +} + +static void +fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) +{ + struct fcloop_tport *tport = targetport->private; + + flush_work(&tport->ls_work); + fcloop_nport_put(tport->nport); +} + +#define FCLOOP_HW_QUEUES 4 +#define FCLOOP_SGL_SEGS 256 +#define FCLOOP_DMABOUND_4G 0xFFFFFFFF + +static struct nvme_fc_port_template fctemplate = { + .localport_delete = fcloop_localport_delete, + .remoteport_delete = fcloop_remoteport_delete, + .create_queue = fcloop_create_queue, + .delete_queue = fcloop_delete_queue, + .ls_req = fcloop_h2t_ls_req, + .fcp_io = fcloop_fcp_req, + .ls_abort = fcloop_h2t_ls_abort, + .fcp_abort = fcloop_fcp_abort, + .xmt_ls_rsp = fcloop_t2h_xmt_ls_rsp, + .max_hw_queues = FCLOOP_HW_QUEUES, + .max_sgl_segments = FCLOOP_SGL_SEGS, + .max_dif_sgl_segments = FCLOOP_SGL_SEGS, + .dma_boundary = FCLOOP_DMABOUND_4G, + /* sizes of additional private data for data structures */ + .local_priv_sz = sizeof(struct fcloop_lport_priv), + .remote_priv_sz = sizeof(struct fcloop_rport), + .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), + .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq), +}; + +static struct nvmet_fc_target_template tgttemplate = { + .targetport_delete = fcloop_targetport_delete, + .xmt_ls_rsp = fcloop_h2t_xmt_ls_rsp, + .fcp_op = fcloop_fcp_op, + .fcp_abort = fcloop_tgt_fcp_abort, + .fcp_req_release = fcloop_fcp_req_release, + .discovery_event = fcloop_tgt_discovery_evt, + .ls_req = fcloop_t2h_ls_req, + .ls_abort = fcloop_t2h_ls_abort, + .host_release = fcloop_t2h_host_release, + .max_hw_queues = FCLOOP_HW_QUEUES, + .max_sgl_segments = FCLOOP_SGL_SEGS, + .max_dif_sgl_segments = FCLOOP_SGL_SEGS, + .dma_boundary = FCLOOP_DMABOUND_4G, + /* optional features */ + .target_features = 0, + /* sizes of additional private data for data structures */ + .target_priv_sz = sizeof(struct fcloop_tport), + .lsrqst_priv_sz = sizeof(struct fcloop_lsreq), +}; + +static ssize_t +fcloop_create_local_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_fc_port_info pinfo; + struct fcloop_ctrl_options *opts; + struct nvme_fc_local_port *localport; + struct fcloop_lport *lport; + struct fcloop_lport_priv *lport_priv; + unsigned long flags; + int ret = -ENOMEM; + + lport = kzalloc(sizeof(*lport), GFP_KERNEL); + if (!lport) + return -ENOMEM; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + goto out_free_lport; + + ret = 
fcloop_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + /* everything there ? */ + if ((opts->mask & LPORT_OPTS) != LPORT_OPTS) { + ret = -EINVAL; + goto out_free_opts; + } + + memset(&pinfo, 0, sizeof(pinfo)); + pinfo.node_name = opts->wwnn; + pinfo.port_name = opts->wwpn; + pinfo.port_role = opts->roles; + pinfo.port_id = opts->fcaddr; + + ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport); + if (!ret) { + /* success */ + lport_priv = localport->private; + lport_priv->lport = lport; + + lport->localport = localport; + INIT_LIST_HEAD(&lport->lport_list); + + spin_lock_irqsave(&fcloop_lock, flags); + list_add_tail(&lport->lport_list, &fcloop_lports); + spin_unlock_irqrestore(&fcloop_lock, flags); + } + +out_free_opts: + kfree(opts); +out_free_lport: + /* free only if we're going to fail */ + if (ret) + kfree(lport); + + return ret ? ret : count; +} + + +static void +__unlink_local_port(struct fcloop_lport *lport) +{ + list_del(&lport->lport_list); +} + +static int +__wait_localport_unreg(struct fcloop_lport *lport) +{ + int ret; + + init_completion(&lport->unreg_done); + + ret = nvme_fc_unregister_localport(lport->localport); + + wait_for_completion(&lport->unreg_done); + + kfree(lport); + + return ret; +} + + +static ssize_t +fcloop_delete_local_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct fcloop_lport *tlport, *lport = NULL; + u64 nodename, portname; + unsigned long flags; + int ret; + + ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); + if (ret) + return ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tlport, &fcloop_lports, lport_list) { + if (tlport->localport->node_name == nodename && + tlport->localport->port_name == portname) { + lport = tlport; + __unlink_local_port(lport); + break; + } + } + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (!lport) + return -ENOENT; + + ret = __wait_localport_unreg(lport); + + return ret ? ret : count; +} + +static struct fcloop_nport * +fcloop_alloc_nport(const char *buf, size_t count, bool remoteport) +{ + struct fcloop_nport *newnport, *nport = NULL; + struct fcloop_lport *tmplport, *lport = NULL; + struct fcloop_ctrl_options *opts; + unsigned long flags; + u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS; + int ret; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return NULL; + + ret = fcloop_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + /* everything there ? 
*/ + if ((opts->mask & opts_mask) != opts_mask) { + ret = -EINVAL; + goto out_free_opts; + } + + newnport = kzalloc(sizeof(*newnport), GFP_KERNEL); + if (!newnport) + goto out_free_opts; + + INIT_LIST_HEAD(&newnport->nport_list); + newnport->node_name = opts->wwnn; + newnport->port_name = opts->wwpn; + if (opts->mask & NVMF_OPT_ROLES) + newnport->port_role = opts->roles; + if (opts->mask & NVMF_OPT_FCADDR) + newnport->port_id = opts->fcaddr; + kref_init(&newnport->ref); + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tmplport, &fcloop_lports, lport_list) { + if (tmplport->localport->node_name == opts->wwnn && + tmplport->localport->port_name == opts->wwpn) + goto out_invalid_opts; + + if (tmplport->localport->node_name == opts->lpwwnn && + tmplport->localport->port_name == opts->lpwwpn) + lport = tmplport; + } + + if (remoteport) { + if (!lport) + goto out_invalid_opts; + newnport->lport = lport; + } + + list_for_each_entry(nport, &fcloop_nports, nport_list) { + if (nport->node_name == opts->wwnn && + nport->port_name == opts->wwpn) { + if ((remoteport && nport->rport) || + (!remoteport && nport->tport)) { + nport = NULL; + goto out_invalid_opts; + } + + fcloop_nport_get(nport); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (remoteport) + nport->lport = lport; + if (opts->mask & NVMF_OPT_ROLES) + nport->port_role = opts->roles; + if (opts->mask & NVMF_OPT_FCADDR) + nport->port_id = opts->fcaddr; + goto out_free_newnport; + } + } + + list_add_tail(&newnport->nport_list, &fcloop_nports); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + kfree(opts); + return newnport; + +out_invalid_opts: + spin_unlock_irqrestore(&fcloop_lock, flags); +out_free_newnport: + kfree(newnport); +out_free_opts: + kfree(opts); + return nport; +} + +static ssize_t +fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_fc_remote_port *remoteport; + struct fcloop_nport *nport; + struct fcloop_rport *rport; + struct nvme_fc_port_info pinfo; + int ret; + + nport = fcloop_alloc_nport(buf, count, true); + if (!nport) + return -EIO; + + memset(&pinfo, 0, sizeof(pinfo)); + pinfo.node_name = nport->node_name; + pinfo.port_name = nport->port_name; + pinfo.port_role = nport->port_role; + pinfo.port_id = nport->port_id; + + ret = nvme_fc_register_remoteport(nport->lport->localport, + &pinfo, &remoteport); + if (ret || !remoteport) { + fcloop_nport_put(nport); + return ret; + } + + /* success */ + rport = remoteport->private; + rport->remoteport = remoteport; + rport->targetport = (nport->tport) ? 
nport->tport->targetport : NULL; + if (nport->tport) { + nport->tport->remoteport = remoteport; + nport->tport->lport = nport->lport; + } + rport->nport = nport; + rport->lport = nport->lport; + nport->rport = rport; + spin_lock_init(&rport->lock); + INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work); + INIT_LIST_HEAD(&rport->ls_list); + + return count; +} + + +static struct fcloop_rport * +__unlink_remote_port(struct fcloop_nport *nport) +{ + struct fcloop_rport *rport = nport->rport; + + if (rport && nport->tport) + nport->tport->remoteport = NULL; + nport->rport = NULL; + + return rport; +} + +static int +__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport) +{ + if (!rport) + return -EALREADY; + + return nvme_fc_unregister_remoteport(rport->remoteport); +} + +static ssize_t +fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct fcloop_nport *nport = NULL, *tmpport; + static struct fcloop_rport *rport; + u64 nodename, portname; + unsigned long flags; + int ret; + + ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); + if (ret) + return ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tmpport, &fcloop_nports, nport_list) { + if (tmpport->node_name == nodename && + tmpport->port_name == portname && tmpport->rport) { + nport = tmpport; + rport = __unlink_remote_port(nport); + break; + } + } + + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (!nport) + return -ENOENT; + + ret = __remoteport_unreg(nport, rport); + + return ret ? ret : count; +} + +static ssize_t +fcloop_create_target_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvmet_fc_target_port *targetport; + struct fcloop_nport *nport; + struct fcloop_tport *tport; + struct nvmet_fc_port_info tinfo; + int ret; + + nport = fcloop_alloc_nport(buf, count, false); + if (!nport) + return -EIO; + + tinfo.node_name = nport->node_name; + tinfo.port_name = nport->port_name; + tinfo.port_id = nport->port_id; + + ret = nvmet_fc_register_targetport(&tinfo, &tgttemplate, NULL, + &targetport); + if (ret) { + fcloop_nport_put(nport); + return ret; + } + + /* success */ + tport = targetport->private; + tport->targetport = targetport; + tport->remoteport = (nport->rport) ? 
nport->rport->remoteport : NULL; + if (nport->rport) + nport->rport->targetport = targetport; + tport->nport = nport; + tport->lport = nport->lport; + nport->tport = tport; + spin_lock_init(&tport->lock); + INIT_WORK(&tport->ls_work, fcloop_tport_lsrqst_work); + INIT_LIST_HEAD(&tport->ls_list); + + return count; +} + + +static struct fcloop_tport * +__unlink_target_port(struct fcloop_nport *nport) +{ + struct fcloop_tport *tport = nport->tport; + + if (tport && nport->rport) + nport->rport->targetport = NULL; + nport->tport = NULL; + + return tport; +} + +static int +__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport) +{ + if (!tport) + return -EALREADY; + + return nvmet_fc_unregister_targetport(tport->targetport); +} + +static ssize_t +fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct fcloop_nport *nport = NULL, *tmpport; + struct fcloop_tport *tport = NULL; + u64 nodename, portname; + unsigned long flags; + int ret; + + ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf); + if (ret) + return ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + list_for_each_entry(tmpport, &fcloop_nports, nport_list) { + if (tmpport->node_name == nodename && + tmpport->port_name == portname && tmpport->tport) { + nport = tmpport; + tport = __unlink_target_port(nport); + break; + } + } + + spin_unlock_irqrestore(&fcloop_lock, flags); + + if (!nport) + return -ENOENT; + + ret = __targetport_unreg(nport, tport); + + return ret ? ret : count; +} + +static ssize_t +fcloop_set_cmd_drop(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int opcode; + int starting, amount; + + if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) + return -EBADRQC; + + drop_current_cnt = 0; + drop_fabric_opcode = (opcode & ~DROP_OPCODE_MASK) ? true : false; + drop_opcode = (opcode & DROP_OPCODE_MASK); + drop_instance = starting; + /* the check to drop routine uses instance + count to know when + * to end. Thus, if dropping 1 instance, count should be 0. + * so subtract 1 from the count. + */ + drop_amount = amount - 1; + + pr_info("%s: DROP: Starting at instance %d of%s opcode x%x drop +%d " + "instances\n", + __func__, drop_instance, drop_fabric_opcode ? 
" fabric" : "", + drop_opcode, drop_amount); + + return count; +} + + +static DEVICE_ATTR(add_local_port, 0200, NULL, fcloop_create_local_port); +static DEVICE_ATTR(del_local_port, 0200, NULL, fcloop_delete_local_port); +static DEVICE_ATTR(add_remote_port, 0200, NULL, fcloop_create_remote_port); +static DEVICE_ATTR(del_remote_port, 0200, NULL, fcloop_delete_remote_port); +static DEVICE_ATTR(add_target_port, 0200, NULL, fcloop_create_target_port); +static DEVICE_ATTR(del_target_port, 0200, NULL, fcloop_delete_target_port); +static DEVICE_ATTR(set_cmd_drop, 0200, NULL, fcloop_set_cmd_drop); + +static struct attribute *fcloop_dev_attrs[] = { + &dev_attr_add_local_port.attr, + &dev_attr_del_local_port.attr, + &dev_attr_add_remote_port.attr, + &dev_attr_del_remote_port.attr, + &dev_attr_add_target_port.attr, + &dev_attr_del_target_port.attr, + &dev_attr_set_cmd_drop.attr, + NULL +}; + +static const struct attribute_group fclopp_dev_attrs_group = { + .attrs = fcloop_dev_attrs, +}; + +static const struct attribute_group *fcloop_dev_attr_groups[] = { + &fclopp_dev_attrs_group, + NULL, +}; + +static struct class *fcloop_class; +static struct device *fcloop_device; + + +static int __init fcloop_init(void) +{ + int ret; + + fcloop_class = class_create(THIS_MODULE, "fcloop"); + if (IS_ERR(fcloop_class)) { + pr_err("couldn't register class fcloop\n"); + ret = PTR_ERR(fcloop_class); + return ret; + } + + fcloop_device = device_create_with_groups( + fcloop_class, NULL, MKDEV(0, 0), NULL, + fcloop_dev_attr_groups, "ctl"); + if (IS_ERR(fcloop_device)) { + pr_err("couldn't create ctl device!\n"); + ret = PTR_ERR(fcloop_device); + goto out_destroy_class; + } + + get_device(fcloop_device); + + return 0; + +out_destroy_class: + class_destroy(fcloop_class); + return ret; +} + +static void __exit fcloop_exit(void) +{ + struct fcloop_lport *lport = NULL; + struct fcloop_nport *nport = NULL; + struct fcloop_tport *tport; + struct fcloop_rport *rport; + unsigned long flags; + int ret; + + spin_lock_irqsave(&fcloop_lock, flags); + + for (;;) { + nport = list_first_entry_or_null(&fcloop_nports, + typeof(*nport), nport_list); + if (!nport) + break; + + tport = __unlink_target_port(nport); + rport = __unlink_remote_port(nport); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + ret = __targetport_unreg(nport, tport); + if (ret) + pr_warn("%s: Failed deleting target port\n", __func__); + + ret = __remoteport_unreg(nport, rport); + if (ret) + pr_warn("%s: Failed deleting remote port\n", __func__); + + spin_lock_irqsave(&fcloop_lock, flags); + } + + for (;;) { + lport = list_first_entry_or_null(&fcloop_lports, + typeof(*lport), lport_list); + if (!lport) + break; + + __unlink_local_port(lport); + + spin_unlock_irqrestore(&fcloop_lock, flags); + + ret = __wait_localport_unreg(lport); + if (ret) + pr_warn("%s: Failed deleting local port\n", __func__); + + spin_lock_irqsave(&fcloop_lock, flags); + } + + spin_unlock_irqrestore(&fcloop_lock, flags); + + put_device(fcloop_device); + + device_destroy(fcloop_class, MKDEV(0, 0)); + class_destroy(fcloop_class); +} + +module_init(fcloop_init); +module_exit(fcloop_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-bdev.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-bdev.c new file mode 100644 index 0000000..cc4b760 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-bdev.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe I/O command implementation. 
+ * Copyright (c) 2015-2016 HGST, a Western Digital Company. + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include "nvmet.h" + +void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) +{ + const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; + /* Number of logical blocks per physical block. */ + const u32 lpp = ql->physical_block_size / ql->logical_block_size; + /* Logical blocks per physical block, 0's based. */ + const __le16 lpp0b = to0based(lpp); + + /* + * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, + * NAWUPF, and NACWU are defined for this namespace and should be + * used by the host for this namespace instead of the AWUN, AWUPF, + * and ACWU fields in the Identify Controller data structure. If + * any of these fields are zero that means that the corresponding + * field from the identify controller data structure should be used. + */ + id->nsfeat |= 1 << 1; + id->nawun = lpp0b; + id->nawupf = lpp0b; + id->nacwu = lpp0b; + + /* + * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and + * NOWS are defined for this namespace and should be used by + * the host for I/O optimization. + */ + id->nsfeat |= 1 << 4; + /* NPWG = Namespace Preferred Write Granularity. 0's based */ + id->npwg = lpp0b; + /* NPWA = Namespace Preferred Write Alignment. 0's based */ + id->npwa = id->npwg; + /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ + id->npdg = to0based(ql->discard_granularity / ql->logical_block_size); + /* NPDG = Namespace Preferred Deallocate Alignment */ + id->npda = id->npdg; + /* NOWS = Namespace Optimal Write Size */ + id->nows = to0based(ql->io_opt / ql->logical_block_size); +} + +void nvmet_bdev_ns_disable(struct nvmet_ns *ns) +{ + if (ns->bdev) { + blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ); + ns->bdev = NULL; + } +} + +static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) +{ + struct blk_integrity *bi = bdev_get_integrity(ns->bdev); + + if (bi) { + ns->metadata_size = bi->tuple_size; + if (bi->profile == &t10_pi_type1_crc) + ns->pi_type = NVME_NS_DPS_PI_TYPE1; + else if (bi->profile == &t10_pi_type3_crc) + ns->pi_type = NVME_NS_DPS_PI_TYPE3; + else + /* Unsupported metadata type */ + ns->metadata_size = 0; + } +} + +int nvmet_bdev_ns_enable(struct nvmet_ns *ns) +{ + int ret; + + ns->bdev = blkdev_get_by_path(ns->device_path, + FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(ns->bdev)) { + ret = PTR_ERR(ns->bdev); + if (ret != -ENOTBLK) { + pr_err("failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); + } + ns->bdev = NULL; + return ret; + } + ns->size = bdev_nr_bytes(ns->bdev); + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + ns->pi_type = 0; + ns->metadata_size = 0; + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10)) + nvmet_bdev_ns_enable_integrity(ns); + + if (bdev_is_zoned(ns->bdev)) { + if (!nvmet_bdev_zns_enable(ns)) { + nvmet_bdev_ns_disable(ns); + return -EINVAL; + } + ns->csi = NVME_CSI_ZNS; + } + + return 0; +} + +void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns) +{ + ns->size = bdev_nr_bytes(ns->bdev); +} + +u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) +{ + u16 status = NVME_SC_SUCCESS; + + if (likely(blk_sts == BLK_STS_OK)) + return status; + /* + * Right now there exists M : 1 mapping between block layer error + * to the NVMe status code (see nvme_error_status()). 
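A worked example for nvmet_bdev_set_limits() above, with assumed device values: 512-byte logical and 4096-byte physical blocks give lpp = 4096 / 512 = 8, so nawun/nawupf/nacwu and npwg/npwa are reported as the 0's-based value 7; a 4096-byte discard_granularity likewise yields npdg = npda = to0based(8) = 7.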
For consistency, + * when we reverse map we use most appropriate NVMe Status code from + * the group of the NVMe staus codes used in the nvme_error_status(). + */ + switch (blk_sts) { + case BLK_STS_NOSPC: + status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, length); + break; + case BLK_STS_TARGET: + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, slba); + break; + case BLK_STS_NOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case BLK_STS_MEDIUM: + status = NVME_SC_ACCESS_DENIED; + req->error_loc = offsetof(struct nvme_rw_command, nsid); + break; + case BLK_STS_IOERR: + default: + status = NVME_SC_INTERNAL | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_common_command, opcode); + } + + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->error_slba = le64_to_cpu(req->cmd->rw.slba); + break; + case nvme_cmd_write_zeroes: + req->error_slba = + le64_to_cpu(req->cmd->write_zeroes.slba); + break; + default: + req->error_slba = 0; + } + return status; +} + +static void nvmet_bio_done(struct bio *bio) +{ + struct nvmet_req *req = bio->bi_private; + + nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status)); + nvmet_req_bio_put(req, bio); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, + struct sg_mapping_iter *miter) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip; + int rc; + size_t resid, len; + + bi = bdev_get_integrity(req->ns->bdev); + if (unlikely(!bi)) { + pr_err("Unable to locate bio_integrity\n"); + return -ENODEV; + } + + bip = bio_integrity_alloc(bio, GFP_NOIO, + bio_max_segs(req->metadata_sg_cnt)); + if (IS_ERR(bip)) { + pr_err("Unable to allocate bio_integrity_payload\n"); + return PTR_ERR(bip); + } + + bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); + /* virtual start sector must be in integrity interval units */ + bip_set_seed(bip, bio->bi_iter.bi_sector >> + (bi->interval_exp - SECTOR_SHIFT)); + + resid = bip->bip_iter.bi_size; + while (resid > 0 && sg_miter_next(miter)) { + len = min_t(size_t, miter->length, resid); + rc = bio_integrity_add_page(bio, miter->page, len, + offset_in_page(miter->addr)); + if (unlikely(rc != len)) { + pr_err("bio_integrity_add_page() failed; %d\n", rc); + sg_miter_stop(miter); + return -ENOMEM; + } + + resid -= len; + if (len < miter->length) + miter->consumed -= miter->length - len; + } + sg_miter_stop(miter); + + return 0; +} +#else +static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, + struct sg_mapping_iter *miter) +{ + return -EINVAL; +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvmet_bdev_execute_rw(struct nvmet_req *req) +{ + unsigned int sg_cnt = req->sg_cnt; + struct bio *bio; + struct scatterlist *sg; + struct blk_plug plug; + sector_t sector; + int op, i, rc; + struct sg_mapping_iter prot_miter; + unsigned int iter_flags; + unsigned int total_len = nvmet_rw_data_len(req) + req->metadata_len; + + if (!nvmet_check_transfer_len(req, total_len)) + return; + + if (!req->sg_cnt) { + nvmet_req_complete(req, 0); + return; + } + + if (req->cmd->rw.opcode == nvme_cmd_write) { + op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if 
(req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + op |= REQ_FUA; + iter_flags = SG_MITER_TO_SG; + } else { + op = REQ_OP_READ; + iter_flags = SG_MITER_FROM_SG; + } + + if (is_pci_p2pdma_page(sg_page(req->sg))) + op |= REQ_NOMERGE; + + sector = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); + + if (nvmet_use_inline_bvec(req)) { + bio = &req->b.inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + } else { + bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); + } + bio_set_dev(bio, req->ns->bdev); + bio->bi_iter.bi_sector = sector; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio->bi_opf = op; + + blk_start_plug(&plug); + if (req->metadata_len) + sg_miter_start(&prot_miter, req->metadata_sg, + req->metadata_sg_cnt, iter_flags); + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) + != sg->length) { + struct bio *prev = bio; + + if (req->metadata_len) { + rc = nvmet_bdev_alloc_bip(req, bio, + &prot_miter); + if (unlikely(rc)) { + bio_io_error(bio); + return; + } + } + + bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); + bio_set_dev(bio, req->ns->bdev); + bio->bi_iter.bi_sector = sector; + bio->bi_opf = op; + + bio_chain(bio, prev); + submit_bio(prev); + } + + sector += sg->length >> 9; + sg_cnt--; + } + + if (req->metadata_len) { + rc = nvmet_bdev_alloc_bip(req, bio, &prot_miter); + if (unlikely(rc)) { + bio_io_error(bio); + return; + } + } + + submit_bio(bio); + blk_finish_plug(&plug); +} + +static void nvmet_bdev_execute_flush(struct nvmet_req *req) +{ + struct bio *bio = &req->b.inline_bio; + + if (!nvmet_check_transfer_len(req, 0)) + return; + + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + bio_set_dev(bio, req->ns->bdev); + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + submit_bio(bio); +} + +u16 nvmet_bdev_flush(struct nvmet_req *req) +{ + if (blkdev_issue_flush(req->ns->bdev)) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + +static u16 nvmet_bdev_discard_range(struct nvmet_req *req, + struct nvme_dsm_range *range, struct bio **bio) +{ + struct nvmet_ns *ns = req->ns; + int ret; + + ret = __blkdev_issue_discard(ns->bdev, + nvmet_lba_to_sect(ns, range->slba), + le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), + GFP_KERNEL, 0, bio); + if (ret && ret != -EOPNOTSUPP) { + req->error_slba = le64_to_cpu(range->slba); + return errno_to_nvme_status(req, ret); + } + return NVME_SC_SUCCESS; +} + +static void nvmet_bdev_execute_discard(struct nvmet_req *req) +{ + struct nvme_dsm_range range; + struct bio *bio = NULL; + int i; + u16 status; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (status) + break; + + status = nvmet_bdev_discard_range(req, &range, &bio); + if (status) + break; + } + + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + if (status) + bio_io_error(bio); + else + submit_bio(bio); + } else { + nvmet_req_complete(req, status); + } +} + +static void nvmet_bdev_execute_dsm(struct nvmet_req *req) +{ + if (!nvmet_check_data_len_lte(req, nvmet_dsm_len(req))) + return; + + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_bdev_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + nvmet_req_complete(req, 0); + return; + } +} + +static void nvmet_bdev_execute_write_zeroes(struct 
nvmet_req *req) +{ + struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes; + struct bio *bio = NULL; + sector_t sector; + sector_t nr_sector; + int ret; + + if (!nvmet_check_transfer_len(req, 0)) + return; + + sector = nvmet_lba_to_sect(req->ns, write_zeroes->slba); + nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << + (req->ns->blksize_shift - 9)); + + ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, + GFP_KERNEL, &bio, 0); + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + submit_bio(bio); + } else { + nvmet_req_complete(req, errno_to_nvme_status(req, ret)); + } +} + +u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) +{ + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_bdev_execute_rw; + if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns)) + req->metadata_len = nvmet_rw_metadata_len(req); + return 0; + case nvme_cmd_flush: + req->execute = nvmet_bdev_execute_flush; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_bdev_execute_dsm; + return 0; + case nvme_cmd_write_zeroes: + req->execute = nvmet_bdev_execute_write_zeroes; + return 0; + default: + return nvmet_report_invalid_opcode(req); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-file.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-file.c new file mode 100644 index 0000000..af96d90 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/io-cmd-file.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe Over Fabrics Target File I/O commands implementation. + * Copyright (c) 2017-2018 Western Digital Corporation or its + * affiliates. + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include "nvmet.h" + +#define NVMET_MAX_MPOOL_BVEC 16 +#define NVMET_MIN_MPOOL_OBJ 16 + +int nvmet_file_ns_revalidate(struct nvmet_ns *ns) +{ + struct kstat stat; + int ret; + + ret = vfs_getattr(&ns->file->f_path, &stat, STATX_SIZE, + AT_STATX_FORCE_SYNC); + if (!ret) + ns->size = stat.size; + return ret; +} + +void nvmet_file_ns_disable(struct nvmet_ns *ns) +{ + if (ns->file) { + if (ns->buffered_io) + flush_workqueue(buffered_io_wq); + mempool_destroy(ns->bvec_pool); + ns->bvec_pool = NULL; + kmem_cache_destroy(ns->bvec_cache); + ns->bvec_cache = NULL; + fput(ns->file); + ns->file = NULL; + } +} + +int nvmet_file_ns_enable(struct nvmet_ns *ns) +{ + int flags = O_RDWR | O_LARGEFILE; + int ret; + + if (!ns->buffered_io) + flags |= O_DIRECT; + + ns->file = filp_open(ns->device_path, flags, 0); + if (IS_ERR(ns->file)) { + ret = PTR_ERR(ns->file); + pr_err("failed to open file %s: (%d)\n", + ns->device_path, ret); + ns->file = NULL; + return ret; + } + + ret = nvmet_file_ns_revalidate(ns); + if (ret) + goto err; + + /* + * i_blkbits can be greater than the universally accepted upper bound, + * so make sure we export a sane namespace lba_shift. 
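For example (assumed filesystem), a file on a filesystem using 64 KiB blocks has i_blkbits = 16, but the min_t() below still caps the exported blksize_shift at 12, i.e. the namespace advertises 4 KiB logical blocks.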
+ */ + ns->blksize_shift = min_t(u8, + file_inode(ns->file)->i_blkbits, 12); + + ns->bvec_cache = kmem_cache_create("nvmet-bvec", + NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ns->bvec_cache) { + ret = -ENOMEM; + goto err; + } + + ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab, + mempool_free_slab, ns->bvec_cache); + + if (!ns->bvec_pool) { + ret = -ENOMEM; + goto err; + } + + return ret; +err: + ns->size = 0; + ns->blksize_shift = 0; + nvmet_file_ns_disable(ns); + return ret; +} + +static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg) +{ + bv->bv_page = sg_page(sg); + bv->bv_offset = sg->offset; + bv->bv_len = sg->length; +} + +static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, + unsigned long nr_segs, size_t count, int ki_flags) +{ + struct kiocb *iocb = &req->f.iocb; + ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter); + struct iov_iter iter; + int rw; + + if (req->cmd->rw.opcode == nvme_cmd_write) { + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + ki_flags |= IOCB_DSYNC; + call_iter = req->ns->file->f_op->write_iter; + rw = WRITE; + } else { + call_iter = req->ns->file->f_op->read_iter; + rw = READ; + } + + iov_iter_bvec(&iter, rw, req->f.bvec, nr_segs, count); + + iocb->ki_pos = pos; + iocb->ki_filp = req->ns->file; + iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); + + return call_iter(iocb, &iter); +} + +static void nvmet_file_io_done(struct kiocb *iocb, long ret) +{ + struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); + u16 status = NVME_SC_SUCCESS; + + if (req->f.bvec != req->inline_bvec) { + if (likely(req->f.mpool_alloc == false)) + kfree(req->f.bvec); + else + mempool_free(req->f.bvec, req->ns->bvec_pool); + } + + if (unlikely(ret != req->transfer_len)) + status = errno_to_nvme_status(req, ret); + nvmet_req_complete(req, status); +} + +static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) +{ + ssize_t nr_bvec = req->sg_cnt; + unsigned long bv_cnt = 0; + bool is_sync = false; + size_t len = 0, total_len = 0; + ssize_t ret = 0; + loff_t pos; + int i; + struct scatterlist *sg; + + if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC) + is_sync = true; + + pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; + if (unlikely(pos + req->transfer_len > req->ns->size)) { + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); + return true; + } + + memset(&req->f.iocb, 0, sizeof(struct kiocb)); + for_each_sg(req->sg, sg, req->sg_cnt, i) { + nvmet_file_init_bvec(&req->f.bvec[bv_cnt], sg); + len += req->f.bvec[bv_cnt].bv_len; + total_len += req->f.bvec[bv_cnt].bv_len; + bv_cnt++; + + WARN_ON_ONCE((nr_bvec - 1) < 0); + + if (unlikely(is_sync) && + (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) { + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len, 0); + if (ret < 0) + goto complete; + + pos += len; + bv_cnt = 0; + len = 0; + } + nr_bvec--; + } + + if (WARN_ON_ONCE(total_len != req->transfer_len)) { + ret = -EIO; + goto complete; + } + + if (unlikely(is_sync)) { + ret = total_len; + goto complete; + } + + /* + * A NULL ki_complete ask for synchronous execution, which we want + * for the IOCB_NOWAIT case. 
+ */ + if (!(ki_flags & IOCB_NOWAIT)) + req->f.iocb.ki_complete = nvmet_file_io_done; + + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags); + + switch (ret) { + case -EIOCBQUEUED: + return true; + case -EAGAIN: + if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT))) + goto complete; + return false; + case -EOPNOTSUPP: + /* + * For file systems returning error -EOPNOTSUPP, handle + * IOCB_NOWAIT error case separately and retry without + * IOCB_NOWAIT. + */ + if ((ki_flags & IOCB_NOWAIT)) + return false; + break; + } + +complete: + nvmet_file_io_done(&req->f.iocb, ret); + return true; +} + +static void nvmet_file_buffered_io_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + + nvmet_file_execute_io(req, 0); +} + +static void nvmet_file_submit_buffered_io(struct nvmet_req *req) +{ + INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); + queue_work(buffered_io_wq, &req->f.work); +} + +static void nvmet_file_execute_rw(struct nvmet_req *req) +{ + ssize_t nr_bvec = req->sg_cnt; + + if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req))) + return; + + if (!req->sg_cnt || !nr_bvec) { + nvmet_req_complete(req, 0); + return; + } + + if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) + req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), + GFP_KERNEL); + else + req->f.bvec = req->inline_bvec; + + if (unlikely(!req->f.bvec)) { + /* fallback under memory pressure */ + req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); + req->f.mpool_alloc = true; + } else + req->f.mpool_alloc = false; + + if (req->ns->buffered_io) { + if (likely(!req->f.mpool_alloc) && + (req->ns->file->f_mode & FMODE_NOWAIT) && + nvmet_file_execute_io(req, IOCB_NOWAIT)) + return; + nvmet_file_submit_buffered_io(req); + } else + nvmet_file_execute_io(req, 0); +} + +u16 nvmet_file_flush(struct nvmet_req *req) +{ + return errno_to_nvme_status(req, vfs_fsync(req->ns->file, 1)); +} + +static void nvmet_file_flush_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + + nvmet_req_complete(req, nvmet_file_flush(req)); +} + +static void nvmet_file_execute_flush(struct nvmet_req *req) +{ + if (!nvmet_check_transfer_len(req, 0)) + return; + INIT_WORK(&req->f.work, nvmet_file_flush_work); + schedule_work(&req->f.work); +} + +static void nvmet_file_execute_discard(struct nvmet_req *req) +{ + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + struct nvme_dsm_range range; + loff_t offset, len; + u16 status = 0; + int ret; + int i; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (status) + break; + + offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; + len = le32_to_cpu(range.nlb); + len <<= req->ns->blksize_shift; + if (offset + len > req->ns->size) { + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, -ENOSPC); + break; + } + + ret = vfs_fallocate(req->ns->file, mode, offset, len); + if (ret && ret != -EOPNOTSUPP) { + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, ret); + break; + } + } + + nvmet_req_complete(req, status); +} + +static void nvmet_file_dsm_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_file_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + 
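+ /*
+ * IDR/IDW dataset attributes are treated as no-ops here: the request
+ * is completed with status 0 (success) without touching the backing
+ * file.
+ */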
nvmet_req_complete(req, 0); + return; + } +} + +static void nvmet_file_execute_dsm(struct nvmet_req *req) +{ + if (!nvmet_check_data_len_lte(req, nvmet_dsm_len(req))) + return; + INIT_WORK(&req->f.work, nvmet_file_dsm_work); + schedule_work(&req->f.work); +} + +static void nvmet_file_write_zeroes_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes; + int mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE; + loff_t offset; + loff_t len; + int ret; + + offset = le64_to_cpu(write_zeroes->slba) << req->ns->blksize_shift; + len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << + req->ns->blksize_shift); + + if (unlikely(offset + len > req->ns->size)) { + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); + return; + } + + ret = vfs_fallocate(req->ns->file, mode, offset, len); + nvmet_req_complete(req, ret < 0 ? errno_to_nvme_status(req, ret) : 0); +} + +static void nvmet_file_execute_write_zeroes(struct nvmet_req *req) +{ + if (!nvmet_check_transfer_len(req, 0)) + return; + INIT_WORK(&req->f.work, nvmet_file_write_zeroes_work); + schedule_work(&req->f.work); +} + +u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) +{ + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_file_execute_rw; + return 0; + case nvme_cmd_flush: + req->execute = nvmet_file_execute_flush; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_file_execute_dsm; + return 0; + case nvme_cmd_write_zeroes: + req->execute = nvmet_file_execute_write_zeroes; + return 0; + default: + return nvmet_report_invalid_opcode(req); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/loop.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/loop.c new file mode 100644 index 0000000..dc2f595 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/loop.c @@ -0,0 +1,744 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics loopback device. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. 
+ */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include "nvmet.h" +#include "../host/nvme.h" +#include "../host/fabrics.h" + +#define NVME_LOOP_MAX_SEGMENTS 256 + +struct nvme_loop_iod { + struct nvme_request nvme_req; + struct nvme_command cmd; + struct nvme_completion cqe; + struct nvmet_req req; + struct nvme_loop_queue *queue; + struct work_struct work; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +struct nvme_loop_ctrl { + struct nvme_loop_queue *queues; + + struct blk_mq_tag_set admin_tag_set; + + struct list_head list; + struct blk_mq_tag_set tag_set; + struct nvme_loop_iod async_event_iod; + struct nvme_ctrl ctrl; + + struct nvmet_port *port; +}; + +static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_loop_ctrl, ctrl); +} + +enum nvme_loop_queue_flags { + NVME_LOOP_Q_LIVE = 0, +}; + +struct nvme_loop_queue { + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvme_loop_ctrl *ctrl; + unsigned long flags; +}; + +static LIST_HEAD(nvme_loop_ports); +static DEFINE_MUTEX(nvme_loop_ports_mutex); + +static LIST_HEAD(nvme_loop_ctrl_list); +static DEFINE_MUTEX(nvme_loop_ctrl_mutex); + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req); +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl); + +static const struct nvmet_fabrics_ops nvme_loop_ops; + +static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static void nvme_loop_complete_rq(struct request *req) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + + sg_free_table_chained(&iod->sg_table, NVME_INLINE_SG_CNT); + nvme_complete_rq(req); +} + +static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue) +{ + u32 queue_idx = nvme_loop_queue_idx(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static void nvme_loop_queue_response(struct nvmet_req *req) +{ + struct nvme_loop_queue *queue = + container_of(req->sq, struct nvme_loop_queue, nvme_sq); + struct nvme_completion *cqe = req->cqe; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
+ */ + if (unlikely(nvme_is_aen_req(nvme_loop_queue_idx(queue), + cqe->command_id))) { + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, + &cqe->result); + } else { + struct request *rq; + + rq = nvme_find_rq(nvme_loop_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "got bad command_id %#x on queue %d\n", + cqe->command_id, nvme_loop_queue_idx(queue)); + return; + } + + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) + nvme_loop_complete_rq(rq); + } +} + +static void nvme_loop_execute_work(struct work_struct *work) +{ + struct nvme_loop_iod *iod = + container_of(work, struct nvme_loop_iod, work); + + iod->req.execute(&iod->req); +} + +static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_loop_queue *queue = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + bool queue_ready = test_bit(NVME_LOOP_Q_LIVE, &queue->flags); + blk_status_t ret; + + if (!nvme_check_ready(&queue->ctrl->ctrl, req, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, req); + + ret = nvme_setup_cmd(ns, req); + if (ret) + return ret; + + blk_mq_start_request(req); + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + iod->req.port = queue->ctrl->port; + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, + &queue->nvme_sq, &nvme_loop_ops)) + return BLK_STS_OK; + + if (blk_rq_nr_phys_segments(req)) { + iod->sg_table.sgl = iod->first_sgl; + if (sg_alloc_table_chained(&iod->sg_table, + blk_rq_nr_phys_segments(req), + iod->sg_table.sgl, NVME_INLINE_SG_CNT)) { + nvme_cleanup_cmd(req); + return BLK_STS_RESOURCE; + } + + iod->req.sg = iod->sg_table.sgl; + iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.transfer_len = blk_rq_payload_bytes(req); + } + + schedule_work(&iod->work); + return BLK_STS_OK; +} + +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); + struct nvme_loop_queue *queue = &ctrl->queues[0]; + struct nvme_loop_iod *iod = &ctrl->async_event_iod; + + memset(&iod->cmd, 0, sizeof(iod->cmd)); + iod->cmd.common.opcode = nvme_admin_async_event; + iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, + &nvme_loop_ops)) { + dev_err(ctrl->ctrl.device, "failed async event work\n"); + return; + } + + schedule_work(&iod->work); +} + +static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl, + struct nvme_loop_iod *iod, unsigned int queue_idx) +{ + iod->req.cmd = &iod->cmd; + iod->req.cqe = &iod->cqe; + iod->queue = &ctrl->queues[queue_idx]; + INIT_WORK(&iod->work, nvme_loop_execute_work); + return 0; +} + +static int nvme_loop_init_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_loop_ctrl *ctrl = set->driver_data; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + + nvme_req(req)->ctrl = &ctrl->ctrl; + nvme_req(req)->cmd = &iod->cmd; + return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), + (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0); +} + +static struct lock_class_key loop_hctx_fq_lock_key; + +static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); + + /* + * flush_end_io() can be called recursively for us, so use our own + * lock class key for avoiding lockdep possible recursive locking, + * then we can remove the dynamically allocated lock class for each + * flush queue, that way may cause horrible boot delay. + */ + blk_mq_hctx_set_fq_lock_class(hctx, &loop_hctx_fq_lock_key); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static const struct blk_mq_ops nvme_loop_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_hctx, +}; + +static const struct blk_mq_ops nvme_loop_admin_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_admin_hctx, +}; + +static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) + return; + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); +} + +static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + } + ctrl->ctrl.queue_count = 1; +} + +static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; + int ret, i; + + nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); + if (ret || !nr_io_queues) + return ret; + + dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); + + for (i = 1; i <= nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.queue_count++; + } + + return 0; + +out_destroy_queues: + nvme_loop_destroy_io_queues(ctrl); + return ret; +} + +static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->ctrl.queue_count; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + return ret; + set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); + } + + return 0; +} + +static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + int error; + + 
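+ /*
+ * Admin queue bring-up: fill in the admin tag set template and init
+ * the nvmet submission queue, allocate the tag set and the fabrics and
+ * admin request queues, then run the fabrics connect / controller
+ * enable sequence. The error labels below unwind in reverse order.
+ */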
memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; + ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS; + ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + NVME_INLINE_SG_CNT * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; + + ctrl->queues[0].ctrl = ctrl; + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); + if (error) + return error; + ctrl->ctrl.queue_count = 1; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_free_sq; + ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + error = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_tagset; + } + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_cleanup_fabrics_q; + } + /* reset stopped state for the fresh admin queue */ + clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags); + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); + + error = nvme_enable_ctrl(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + + nvme_start_admin_queue(&ctrl->ctrl); + + error = nvme_init_ctrl_finish(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + return 0; + +out_cleanup_queue: + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_sq: + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + return error; +} + +static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl, bool shutdown) +{ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); + nvme_loop_destroy_io_queues(ctrl); + } + + nvme_stop_admin_queue(&ctrl->ctrl); + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); + nvme_loop_destroy_admin_queue(ctrl); +} + +static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) +{ + nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl), true); +} + +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { + if (ctrl->ctrl.cntlid == nctrl->cntlid) + nvme_delete_ctrl(&ctrl->ctrl); + } + mutex_unlock(&nvme_loop_ctrl_mutex); +} + +static void nvme_loop_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = + container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); + int ret; + + nvme_stop_ctrl(&ctrl->ctrl); + nvme_loop_shutdown_ctrl(ctrl, false); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + if (ctrl->ctrl.state != NVME_CTRL_DELETING && 
+ ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO) + /* state change failure for non-deleted ctrl? */ + WARN_ON_ONCE(1); + return; + } + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_disable; + + ret = nvme_loop_init_io_queues(ctrl); + if (ret) + goto out_destroy_admin; + + ret = nvme_loop_connect_io_queues(ctrl); + if (ret) + goto out_destroy_io; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) + WARN_ON_ONCE(1); + + nvme_start_ctrl(&ctrl->ctrl); + + return; + +out_destroy_io: + nvme_loop_destroy_io_queues(ctrl); +out_destroy_admin: + nvme_disable_ctrl(&ctrl->ctrl); + nvme_loop_destroy_admin_queue(ctrl); +out_disable: + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + nvme_uninit_ctrl(&ctrl->ctrl); +} + +static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { + .name = "loop", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_loop_free_ctrl, + .submit_async_event = nvme_loop_submit_async_event, + .delete_ctrl = nvme_loop_delete_ctrl_host, + .get_address = nvmf_get_address, +}; + +static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) +{ + int ret; + + ret = nvme_loop_init_io_queues(ctrl); + if (ret) + return ret; + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_loop_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; + ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; + ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + NVME_INLINE_SG_CNT * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tagset; + } + + ret = nvme_loop_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); +out_destroy_queues: + nvme_loop_destroy_io_queues(ctrl); + return ret; +} + +static struct nvmet_port *nvme_loop_find_port(struct nvme_ctrl *ctrl) +{ + struct nvmet_port *p, *found = NULL; + + mutex_lock(&nvme_loop_ports_mutex); + list_for_each_entry(p, &nvme_loop_ports, entry) { + /* if no transport address is specified use the first port */ + if ((ctrl->opts->mask & NVMF_OPT_TRADDR) && + strcmp(ctrl->opts->traddr, p->disc_addr.traddr)) + continue; + found = p; + break; + } + mutex_unlock(&nvme_loop_ports_mutex); + return found; +} + +static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_loop_ctrl *ctrl; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, + 0 /* no quirks, we're perfect! 
*/); + if (ret) { + kfree(ctrl); + goto out; + } + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) + WARN_ON_ONCE(1); + + ret = -ENOMEM; + + ctrl->ctrl.sqsize = opts->queue_size - 1; + ctrl->ctrl.kato = opts->kato; + ctrl->port = nvme_loop_find_port(&ctrl->ctrl); + + ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_free_queues; + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_loop_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0); + + dev_info(ctrl->ctrl.device, + "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) + WARN_ON_ONCE(1); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + nvme_start_ctrl(&ctrl->ctrl); + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_disable_ctrl(&ctrl->ctrl); + nvme_loop_destroy_admin_queue(ctrl); +out_free_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +out: + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +} + +static int nvme_loop_add_port(struct nvmet_port *port) +{ + mutex_lock(&nvme_loop_ports_mutex); + list_add_tail(&port->entry, &nvme_loop_ports); + mutex_unlock(&nvme_loop_ports_mutex); + return 0; +} + +static void nvme_loop_remove_port(struct nvmet_port *port) +{ + mutex_lock(&nvme_loop_ports_mutex); + list_del_init(&port->entry); + mutex_unlock(&nvme_loop_ports_mutex); + + /* + * Ensure any ctrls that are in the process of being + * deleted are in fact deleted before we return + * and free the port. This is to prevent active + * ctrls from using a port after it's freed. 
+ */ + flush_workqueue(nvme_delete_wq); +} + +static const struct nvmet_fabrics_ops nvme_loop_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_LOOP, + .add_port = nvme_loop_add_port, + .remove_port = nvme_loop_remove_port, + .queue_response = nvme_loop_queue_response, + .delete_ctrl = nvme_loop_delete_ctrl, +}; + +static struct nvmf_transport_ops nvme_loop_transport = { + .name = "loop", + .module = THIS_MODULE, + .create_ctrl = nvme_loop_create_ctrl, + .allowed_opts = NVMF_OPT_TRADDR, +}; + +static int __init nvme_loop_init_module(void) +{ + int ret; + + ret = nvmet_register_transport(&nvme_loop_ops); + if (ret) + return ret; + + ret = nvmf_register_transport(&nvme_loop_transport); + if (ret) + nvmet_unregister_transport(&nvme_loop_ops); + + return ret; +} + +static void __exit nvme_loop_cleanup_module(void) +{ + struct nvme_loop_ctrl *ctrl, *next; + + nvmf_unregister_transport(&nvme_loop_transport); + nvmet_unregister_transport(&nvme_loop_ops); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_loop_ctrl_mutex); + + flush_workqueue(nvme_delete_wq); +} + +module_init(nvme_loop_init_module); +module_exit(nvme_loop_cleanup_module); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-fcloop_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-fcloop_dummy.c new file mode 100644 index 0000000..a23bcd7 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-fcloop_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "nvme-fcloop" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "July 27, 2017" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvme-fcloop dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvme_fcloop_init(void) +{ + return 0; +} + +static void __exit nvme_fcloop_cleanup(void) +{ +} + +module_init(nvme_fcloop_init); +module_exit(nvme_fcloop_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-loop_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-loop_dummy.c new file mode 100644 index 0000000..94724ec --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvme-loop_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "nvme-loop" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvme-loop dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvme_loop_init(void) +{ + return 0; +} + +static void __exit nvme_loop_cleanup(void) +{ +} + +module_init(nvme_loop_init); +module_exit(nvme_loop_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-fc_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-fc_dummy.c new file mode 100644 index 0000000..cdcdfff --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-fc_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "nvmet-fc" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "July 27, 2017" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvmet-fc dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvmet_fc_init(void) +{ + return 0; +} + +static void __exit nvmet_fc_cleanup(void) +{ +} + +module_init(nvmet_fc_init); +module_exit(nvmet_fc_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-rdma_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-rdma_dummy.c new file mode 100644 index 0000000..6866e57 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet-rdma_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "nvmet-rdma" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvmet-rdma dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvmet_rdma_init(void) +{ + return 0; +} + +static void __exit nvmet_rdma_cleanup(void) +{ +} + +module_init(nvmet_rdma_init); +module_exit(nvmet_rdma_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet.h new file mode 100644 index 0000000..4b526b2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet.h @@ -0,0 +1,756 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + */ + +#ifndef _NVMET_H +#define _NVMET_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0) + +#define NVMET_ASYNC_EVENTS 4 +#define NVMET_ERROR_LOG_SLOTS 128 +#define NVMET_NO_ERROR_LOC ((u16)-1) +#define NVMET_DEFAULT_CTRL_MODEL "Linux" +#define NVMET_MN_MAX_SIZE 40 +#define NVMET_SN_MAX_SIZE 20 + +/* + * Supported optional AENs: + */ +#define NVMET_AEN_CFG_OPTIONAL \ + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) +#define NVMET_DISC_AEN_CFG_OPTIONAL \ + (NVME_AEN_CFG_DISC_CHANGE) + +/* + * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): + */ +#define NVMET_AEN_CFG_ALL \ + (NVME_SMART_CRIT_SPARE | NVME_SMART_CRIT_TEMPERATURE | \ + NVME_SMART_CRIT_RELIABILITY | NVME_SMART_CRIT_MEDIA | \ + NVME_SMART_CRIT_VOLATILE_MEMORY | NVMET_AEN_CFG_OPTIONAL) + +/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM + * The 16 bit shift is to set IATTR bit to 1, which means offending + * offset starts in the data section of connect() + */ +#define IPO_IATTR_CONNECT_DATA(x) \ + (cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x)))) +#define IPO_IATTR_CONNECT_SQE(x) \ + (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) + +struct nvmet_ns { + struct percpu_ref ref; + struct block_device *bdev; + struct pci_dev *pdev; + struct file *file; + bool readonly; + u32 nsid; + u32 blksize_shift; + loff_t size; + u8 nguid[16]; + uuid_t uuid; + u32 anagrpid; + + bool buffered_io; + bool enabled; + struct nvmet_subsys *subsys; + const char *device_path; + + struct config_group device_group; + struct config_group group; + struct config_group offload_ctxs_group; + + struct completion disable_done; + mempool_t *bvec_pool; + struct kmem_cache *bvec_cache; + + u32 offload_cmd_tmo_us; + int use_p2pmem; + struct pci_dev *p2p_dev; + int pi_type; + int metadata_size; + u8 csi; +}; + +static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ns, group); +} + +static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns) +{ + return ns->bdev ? 
disk_to_dev(ns->bdev->bd_disk) : NULL; +} + +struct nvmet_cq { + u16 qid; + u16 size; +}; + +struct nvmet_sq { + struct nvmet_ctrl *ctrl; + struct percpu_ref ref; + u16 qid; + u16 size; + u32 sqhd; + bool sqhd_disabled; + struct completion free_done; + struct completion confirm_done; +}; + +struct nvmet_ana_group { + struct config_group group; + struct nvmet_port *port; + u32 grpid; +}; + +static inline struct nvmet_ana_group *to_ana_group(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ana_group, + group); +} + +/** + * struct nvmet_port - Common structure to keep port + * information for the target. + * @entry: Entry into referrals or transport list. + * @disc_addr: Address information is stored in a format defined + * for a discovery log page entry. + * @group: ConfigFS group for this element's folder. + * @priv: Private data for the transport. + */ +struct nvmet_port { + struct list_head entry; + struct nvmf_disc_rsp_page_entry disc_addr; + struct config_group group; + struct config_group subsys_group; + struct list_head subsystems; + struct config_group referrals_group; + struct list_head referrals; + struct list_head global_entry; + struct config_group ana_groups_group; + struct nvmet_ana_group ana_default_group; + enum nvme_ana_state *ana_state; + void *priv; + bool enabled; + bool offload; + int inline_data_size; + const struct nvmet_fabrics_ops *tr_ops; + bool pi_enable; + bool many_offload_subsys_support; + u32 offload_queues; + u32 offload_queue_size; + size_t offload_srq_size; + bool offload_passthrough_sqe_rw; +}; + +static inline struct nvmet_port *to_nvmet_port(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + group); +} + +static inline struct nvmet_port *ana_groups_to_port( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + ana_groups_group); +} + +struct nvmet_ctrl { + struct nvmet_subsys *subsys; + struct nvmet_sq **sqs; + + bool reset_tbkas; + + struct mutex lock; + u64 cap; + u32 cc; + u32 csts; + + uuid_t hostid; + u16 cntlid; + u32 kato; + + struct nvmet_port *port; + + u32 aen_enabled; + unsigned long aen_masked; + struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; + unsigned int nr_async_event_cmds; + struct list_head async_events; + struct work_struct async_event_work; + + struct list_head subsys_entry; + struct kref ref; + struct delayed_work ka_work; + struct work_struct fatal_err_work; + + const struct nvmet_fabrics_ops *ops; + + __le32 *changed_ns_list; + u32 nr_changed_ns; + + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + + unsigned int sqe_inline_size; + struct device *p2p_client; + struct radix_tree_root p2p_ns_map; + + spinlock_t error_lock; + u64 err_counter; + struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; + bool pi_support; + void *offload_ctrl; +}; + +struct nvmet_offload_ctx { + void *ctx; + struct nvmet_port *port; + struct nvmet_ns *ns; + int id; + struct config_group group; +}; + +struct nvmet_ns_counters { + u64 num_read_cmd; + u64 num_read_blocks; + u64 num_write_cmd; + u64 num_write_blocks; + u64 num_write_inline_cmd; + u64 num_flush_cmd; + u64 num_error_cmd; + u64 num_backend_error_cmd; + u64 last_read_latency; + u64 last_write_latency; + u64 queue_depth; +}; + +struct nvmet_subsys { + enum nvme_subsys_type type; + + struct mutex lock; + struct kref ref; + + struct xarray namespaces; + unsigned int nr_namespaces; + u32 max_nsid; + u16 cntlid_min; + u16 cntlid_max; + + struct 
list_head ctrls; + + struct list_head hosts; + bool allow_any_host; + + u16 max_qid; + + u64 ver; + char serial[NVMET_SN_MAX_SIZE]; + bool subsys_discovered; + char *subsysnqn; + bool pi_support; + + struct config_group group; + + struct config_group namespaces_group; + struct config_group allowed_hosts_group; + + bool offloadble; + unsigned int num_ports; + u64 (*offload_subsys_unknown_ns_cmds)(struct nvmet_subsys *subsys); + u64 (*offload_ns_read_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_read_blocks)(struct nvmet_ns *ns); + u64 (*offload_ns_write_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_write_blocks)(struct nvmet_ns *ns); + u64 (*offload_ns_write_inline_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_flush_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_error_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_backend_error_cmds)(struct nvmet_ns *ns); + void (*offload_query_counters)(void *ctx, + struct nvmet_ns_counters *counters); + + char *model_number; + +#ifdef CONFIG_NVME_TARGET_PASSTHRU + struct nvme_ctrl *passthru_ctrl; + char *passthru_ctrl_path; + struct config_group passthru_group; + unsigned int admin_timeout; + unsigned int io_timeout; + unsigned int clear_ids; +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + +#ifdef CONFIG_BLK_DEV_ZONED + u8 zasl; +#endif /* CONFIG_BLK_DEV_ZONED */ +}; + +static inline struct nvmet_offload_ctx *to_nvmet_offload_ctx(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_offload_ctx, + group); +} + +static inline struct nvmet_subsys *to_subsys(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, group); +} + +static inline struct nvmet_subsys *namespaces_to_subsys( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, + namespaces_group); +} + +struct nvmet_host { + struct config_group group; +}; + +static inline struct nvmet_host *to_host(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_host, group); +} + +static inline char *nvmet_host_name(struct nvmet_host *host) +{ + return config_item_name(&host->group.cg_item); +} + +struct nvmet_host_link { + struct list_head entry; + struct nvmet_host *host; +}; + +struct nvmet_subsys_link { + struct list_head entry; + struct nvmet_subsys *subsys; +}; + +struct nvmet_req; +struct nvmet_fabrics_ops { + struct module *owner; + unsigned int type; + unsigned int msdbd; + unsigned int flags; +#define NVMF_KEYED_SGLS (1 << 0) +#define NVMF_METADATA_SUPPORTED (1 << 1) + bool (*peer_to_peer_capable)(struct nvmet_port *port); + int (*create_offload_ctrl)(struct nvmet_ctrl *ctrl); + void (*destroy_offload_ctrl)(struct nvmet_ctrl *ctrl); + int (*enable_offload_ns)(struct nvmet_ctrl *ctrl, struct nvmet_ns *ns); + void (*disable_offload_ns)(struct nvmet_ctrl *ctrl, + struct nvmet_ns *ns); + unsigned int (*peer_to_peer_sqe_inline_size)(struct nvmet_ctrl *ctrl); + u8 (*peer_to_peer_mdts)(struct nvmet_port *port); + u64 (*offload_subsys_unknown_ns_cmds)(struct nvmet_subsys *subsys); + u64 (*offload_ns_read_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_read_blocks)(struct nvmet_ns *ns); + u64 (*offload_ns_write_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_write_blocks)(struct nvmet_ns *ns); + u64 (*offload_ns_write_inline_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_flush_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_error_cmds)(struct nvmet_ns *ns); + u64 (*offload_ns_backend_error_cmds)(struct nvmet_ns *ns); + bool (*check_subsys_match_offload_port)(struct 
nvmet_port *port, + struct nvmet_subsys *subsys); + bool (*is_port_active)(struct nvmet_port *port); + void (*queue_response)(struct nvmet_req *req); + int (*add_port)(struct nvmet_port *port); + void (*remove_port)(struct nvmet_port *port); + void (*delete_ctrl)(struct nvmet_ctrl *ctrl); + void (*disc_traddr)(struct nvmet_req *req, + struct nvmet_port *port, char *traddr); + u16 (*install_queue)(struct nvmet_sq *nvme_sq); + void (*discovery_chg)(struct nvmet_port *port); + void (*offload_query_counters)(void *ctx, + struct nvmet_ns_counters *counters); + u8 (*get_mdts)(const struct nvmet_ctrl *ctrl); + u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl); +}; + +#define NVMET_MAX_INLINE_BIOVEC 8 +#define NVMET_MAX_INLINE_DATA_LEN NVMET_MAX_INLINE_BIOVEC * PAGE_SIZE + +struct nvmet_req { + struct nvme_command *cmd; + struct nvme_completion *cqe; + struct nvmet_sq *sq; + struct nvmet_cq *cq; + struct nvmet_ns *ns; + struct scatterlist *sg; + struct scatterlist *metadata_sg; + struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + union { + struct { + struct bio inline_bio; + } b; + struct { + bool mpool_alloc; + struct kiocb iocb; + struct bio_vec *bvec; + struct work_struct work; + } f; + struct { + struct bio inline_bio; + struct request *rq; + struct work_struct work; + bool use_workqueue; + } p; +#ifdef CONFIG_BLK_DEV_ZONED + struct { + struct bio inline_bio; + struct work_struct zmgmt_work; + } z; +#endif /* CONFIG_BLK_DEV_ZONED */ + }; + int sg_cnt; + int metadata_sg_cnt; + /* data length as parsed from the SGL descriptor: */ + size_t transfer_len; + size_t metadata_len; + + struct nvmet_port *port; + + void (*execute)(struct nvmet_req *req); + const struct nvmet_fabrics_ops *ops; + + struct pci_dev *p2p_dev; + struct device *p2p_client; + u16 error_loc; + u64 error_slba; +}; + +extern struct workqueue_struct *buffered_io_wq; +extern struct workqueue_struct *zbd_wq; + +static inline void nvmet_set_result(struct nvmet_req *req, u32 result) +{ + req->cqe->result.u32 = cpu_to_le32(result); +} + +/* + * NVMe command writes actually are DMA reads for us on the target side. + */ +static inline enum dma_data_direction +nvmet_data_dir(struct nvmet_req *req) +{ + return nvme_is_write(req->cmd) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +struct nvmet_async_event { + struct list_head entry; + u8 event_type; + u8 event_info; + u8 log_page; +}; + +static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn) +{ + int rae = le32_to_cpu(req->cmd->common.cdw10) & 1 << 15; + + if (!rae) + clear_bit(bn, &req->sq->ctrl->aen_masked); +} + +static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn) +{ + if (!(READ_ONCE(ctrl->aen_enabled) & (1 << bn))) + return true; + return test_and_set_bit(bn, &ctrl->aen_masked); +} + +void nvmet_get_feat_kato(struct nvmet_req *req); +void nvmet_get_feat_async_event(struct nvmet_req *req); +u16 nvmet_set_feat_kato(struct nvmet_req *req); +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); +void nvmet_execute_async_event(struct nvmet_req *req); +void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl); +void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl); + +u16 nvmet_parse_connect_cmd(struct nvmet_req *req); +void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); +u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); +u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); +u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req); +u16 nvmet_parse_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); +u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); +void nvmet_req_uninit(struct nvmet_req *req); +bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len); +bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len); +void nvmet_req_complete(struct nvmet_req *req, u16 status); +int nvmet_req_alloc_sgls(struct nvmet_req *req); +void nvmet_req_free_sgls(struct nvmet_req *req); + +void nvmet_execute_set_features(struct nvmet_req *req); +void nvmet_execute_get_features(struct nvmet_req *req); +void nvmet_execute_keep_alive(struct nvmet_req *req); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, + u16 size); +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, + u16 size); +void nvmet_sq_destroy(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq); + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); +struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, + const char *hostnqn, u16 cntlid, + struct nvmet_req *req); +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); +u16 nvmet_check_ctrl_status(struct nvmet_req *req); + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type); +void nvmet_subsys_put(struct nvmet_subsys *subsys); +void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys); + +u16 nvmet_req_find_ns(struct nvmet_req *req); +void nvmet_put_namespace(struct nvmet_ns *ns); +int nvmet_ns_enable(struct nvmet_ns *ns); +void nvmet_ns_disable(struct nvmet_ns *ns); +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); +void nvmet_ns_free(struct nvmet_ns *ns); + +void nvmet_send_ana_event(struct nvmet_subsys *subsys, + struct nvmet_port *port); +void nvmet_port_send_ana_event(struct nvmet_port *port); + +int nvmet_register_transport(const struct nvmet_fabrics_ops *ops); +void nvmet_unregister_transport(const struct 
nvmet_fabrics_ops *ops); + +void nvmet_port_del_ctrls(struct nvmet_port *port, + struct nvmet_subsys *subsys); + +int nvmet_enable_port(struct nvmet_port *port, struct nvmet_subsys *subsys); +void nvmet_disable_port(struct nvmet_port *port); +bool nvmet_is_port_active(struct nvmet_port *port); + +void nvmet_init_offload_subsystem_port_attrs(struct nvmet_port *port, + struct nvmet_subsys *subsys); +void nvmet_uninit_offload_subsystem_port_attrs(struct nvmet_subsys *subsys); + +int nvmet_offload_ctx_configfs_create(struct nvmet_offload_ctx *ctx); +void nvmet_offload_ctx_configfs_del(struct nvmet_offload_ctx *ctx); + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len); +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, + size_t len); +u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len); + +u32 nvmet_get_log_page_len(struct nvme_command *cmd); +u64 nvmet_get_log_page_offset(struct nvme_command *cmd); + +extern struct list_head *nvmet_ports; +void nvmet_port_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys); +void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, + struct nvmet_host *host); +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page); + +#define NVMET_QUEUE_SIZE 1024 +#define NVMET_NR_QUEUES 128 +#define NVMET_MAX_CMD NVMET_QUEUE_SIZE + +/* + * Nice round number that makes a list of nsids fit into a page. + * Should become tunable at some point in the future. + */ +#define NVMET_MAX_NAMESPACES 1024 + +/* + * 0 is not a valid ANA group ID, so we start numbering at 1. + * + * ANA Group 1 exists without manual intervention, has namespaces assigned to it + * by default, and is available in an optimized state through all ports. 
+ */ +#define NVMET_MAX_ANAGRPS 128 +#define NVMET_DEFAULT_ANA_GRPID 1 +#define NVMET_DEFAULT_CMD_TIMEOUT_USEC 30000000 + +#define NVMET_KAS 10 +#define NVMET_DISC_KATO_MS 120000 + +int __init nvmet_init_configfs(void); +void __exit nvmet_exit_configfs(void); + +int __init nvmet_init_discovery(void); +void nvmet_exit_discovery(void); + +extern struct nvmet_subsys *nvmet_disc_subsys; +extern struct rw_semaphore nvmet_config_sem; + +extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; +extern u64 nvmet_ana_chgcnt; +extern struct rw_semaphore nvmet_ana_sem; + +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn); + +int nvmet_bdev_ns_enable(struct nvmet_ns *ns); +int nvmet_file_ns_enable(struct nvmet_ns *ns); +void nvmet_bdev_ns_disable(struct nvmet_ns *ns); +void nvmet_file_ns_disable(struct nvmet_ns *ns); +u16 nvmet_bdev_flush(struct nvmet_req *req); +u16 nvmet_file_flush(struct nvmet_req *req); +void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid); +void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns); +int nvmet_file_ns_revalidate(struct nvmet_ns *ns); +void nvmet_ns_revalidate(struct nvmet_ns *ns); +u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts); + +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns); +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req); +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req); +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req); +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req); +void nvmet_bdev_execute_zone_append(struct nvmet_req *req); + +static inline u32 nvmet_rw_data_len(struct nvmet_req *req) +{ + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << + req->ns->blksize_shift; +} + +static inline u32 nvmet_rw_metadata_len(struct nvmet_req *req) +{ + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return 0; + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) * + req->ns->metadata_size; +} + +static inline u32 nvmet_dsm_len(struct nvmet_req *req) +{ + return (le32_to_cpu(req->cmd->dsm.nr) + 1) * + sizeof(struct nvme_dsm_range); +} + +static inline struct nvmet_subsys *nvmet_req_subsys(struct nvmet_req *req) +{ + return req->sq->ctrl->subsys; +} + +static inline bool nvmet_is_disc_subsys(struct nvmet_subsys *subsys) +{ + return subsys->type != NVME_NQN_NVME; +} + +#ifdef CONFIG_NVME_TARGET_PASSTHRU +void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys); +int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); +void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys); +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req); +static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys) +{ + return subsys->passthru_ctrl; +} +#else /* CONFIG_NVME_TARGET_PASSTHRU */ +static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys) +{ +} +static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ +} +static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys) +{ + return NULL; +} +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + +static inline bool nvmet_is_passthru_req(struct nvmet_req *req) +{ + return nvmet_is_passthru_subsys(nvmet_req_subsys(req)); +} + +void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl); + +u16 errno_to_nvme_status(struct nvmet_req *req, 
int errno); +u16 nvmet_report_invalid_opcode(struct nvmet_req *req); + +/* Convert a 32-bit number to a 16-bit 0's based number */ +static inline __le16 to0based(u32 a) +{ + return cpu_to_le16(max(1U, min(1U << 16, a)) - 1); +} + +static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns) +{ + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return false; + return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple); +} + +static inline __le64 nvmet_sect_to_lba(struct nvmet_ns *ns, sector_t sect) +{ + return cpu_to_le64(sect >> (ns->blksize_shift - SECTOR_SHIFT)); +} + +static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba) +{ + return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT); +} + +static inline bool nvmet_use_inline_bvec(struct nvmet_req *req) +{ + return req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN && + req->sg_cnt <= NVMET_MAX_INLINE_BIOVEC; +} + +static inline void nvmet_req_cns_error_complete(struct nvmet_req *req) +{ + pr_debug("unhandled identify cns %d on qid %d\n", + req->cmd->identify.cns, req->sq->qid); + req->error_loc = offsetof(struct nvme_identify, cns); + nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); +} + +static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio) +{ + if (bio != &req->b.inline_bio) + bio_put(bio); +} + +#endif /* _NVMET_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet_dummy.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet_dummy.c new file mode 100644 index 0000000..c8ba4b9 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/nvmet_dummy.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#define DRV_NAME "nvmet" +#define PFX DRV_NAME ": " +#define DRV_VERSION "2.0.0" +#define DRV_RELDATE "November 14, 2016" + +MODULE_AUTHOR("Alaa Hleihel"); +MODULE_DESCRIPTION("nvmet dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init nvmet_init(void) +{ + return 0; +} + +static void __exit nvmet_cleanup(void) +{ +} + +module_init(nvmet_init); +module_exit(nvmet_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/passthru.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/passthru.c new file mode 100644 index 0000000..ee942fe --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/passthru.c @@ -0,0 +1,653 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe Over Fabrics Target Passthrough command implementation. + * + * Copyright (c) 2017-2018 Western Digital Corporation or its + * affiliates. + * Copyright (c) 2019-2020, Eideticom Inc. + * + */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + +#include "../host/nvme.h" +#include "nvmet.h" + +MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU); + +/* + * xarray to maintain one passthru subsystem per nvme controller. + */ +static DEFINE_XARRAY(passthru_subsystems); + +void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl) +{ + /* + * Multiple command set support can only be declared if the underlying + * controller actually supports it. + */ + if (!nvme_multi_css(ctrl->subsys->passthru_ctrl)) + ctrl->cap &= ~(1ULL << 43); +} + +static u16 nvmet_passthru_override_id_descs(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = NVME_SC_SUCCESS; + int pos, len; + bool csi_seen = false; + void *data; + u8 csi; + + if (!ctrl->subsys->clear_ids) + return status; + + data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!data) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, data, NVME_IDENTIFY_DATA_SIZE); + if (status) + goto out_free; + + for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { + struct nvme_ns_id_desc *cur = data + pos; + + if (cur->nidl == 0) + break; + if (cur->nidt == NVME_NIDT_CSI) { + memcpy(&csi, cur + 1, NVME_NIDT_CSI_LEN); + csi_seen = true; + break; + } + len = sizeof(struct nvme_ns_id_desc) + cur->nidl; + } + + memset(data, 0, NVME_IDENTIFY_DATA_SIZE); + if (csi_seen) { + struct nvme_ns_id_desc *cur = data; + + cur->nidt = NVME_NIDT_CSI; + cur->nidl = NVME_NIDT_CSI_LEN; + memcpy(cur + 1, &csi, NVME_NIDT_CSI_LEN); + } + status = nvmet_copy_to_sgl(req, 0, data, NVME_IDENTIFY_DATA_SIZE); +out_free: + kfree(data); + return status; +} + +static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl; + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ctrl *id; + unsigned int max_hw_sectors; + int page_shift; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id)); + if (status) + goto out_free; + + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* + * The passthru NVMe driver may have a limit on the number of segments + * which depends on the host's memory fragementation. To solve this, + * ensure mdts is limited to the pages equal to the number of segments. 
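+ *
+ * Worked example (assuming 4 KiB pages and an MPSMIN of 0): a passthru
+ * controller reporting max_segments = 128 yields 128 << (12 - 9) = 1024
+ * sectors, i.e. 512 KiB; ilog2(1024) + 9 - 12 = 7, so mdts is reported
+ * as 2^7 units of the 4 KiB CAP.MPSMIN page size = 512 KiB. The
+ * BIO_MAX_VECS clamp below only lowers this further when the result
+ * exceeds BIO_MAX_VECS pages worth of data.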
+ */ + max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9), + pctrl->max_hw_sectors); + + /* + * nvmet_passthru_map_sg is limitted to using a single bio so limit + * the mdts based on BIO_MAX_VECS as well + */ + max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9), + max_hw_sectors); + + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + + id->mdts = ilog2(max_hw_sectors) + 9 - page_shift; + + id->acl = 3; + /* + * We export aerl limit for the fabrics controller, update this when + * passthru based aerl support is added. + */ + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* emulate kas as most of the PCIe ctrl don't have a support for kas */ + id->kas = cpu_to_le16(NVMET_KAS); + + /* don't support host memory buffer */ + id->hmpre = 0; + id->hmmin = 0; + + id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes); + id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + /* don't support fuse commands */ + id->fuses = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->flags & NVMF_KEYED_SGLS) + id->sgls |= cpu_to_le32(1 << 2); + if (req->port->inline_data_size) + id->sgls |= cpu_to_le32(1 << 20); + + /* + * When passsthru controller is setup using nvme-loop transport it will + * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in + * the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl() + * code path with duplicate ctr subsynqn. In order to prevent that we + * mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn. + */ + memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn)); + + /* use fabric id-ctrl values */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + req->port->inline_data_size) / 16); + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* Support multipath connections with fabrics */ + id->cmic |= 1 << 1; + + /* Disable reservations, see nvmet_parse_passthru_io_cmd() */ + id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl)); + +out_free: + kfree(id); + return status; +} + +static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req) +{ + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ns *id; + int i; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns)); + if (status) + goto out_free; + + for (i = 0; i < (id->nlbaf + 1); i++) + if (id->lbaf[i].ms) + memset(&id->lbaf[i], 0, sizeof(id->lbaf[i])); + + id->flbas = id->flbas & ~(1 << 4); + + /* + * Presently the NVMEof target code does not support sending + * metadata, so we must disable it here. This should be updated + * once target starts supporting metadata. 
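+ *
+ * The loop above already wiped every LBA format that carries a metadata
+ * size, and clearing bit 4 of FLBAS drops the extended-LBA (inline
+ * metadata) setting, so the namespace is advertised as metadata-free.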
+ */ + id->mc = 0; + + if (req->sq->ctrl->subsys->clear_ids) { + memset(id->nguid, 0, NVME_NIDT_NGUID_LEN); + memset(id->eui64, 0, NVME_NIDT_EUI64_LEN); + } + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + +out_free: + kfree(id); + return status; +} + +static void nvmet_passthru_execute_cmd_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); + struct request *rq = req->p.rq; + int status; + + status = nvme_execute_passthru_rq(rq); + + if (status == NVME_SC_SUCCESS && + req->cmd->common.opcode == nvme_admin_identify) { + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + nvmet_passthru_override_id_ctrl(req); + break; + case NVME_ID_CNS_NS: + nvmet_passthru_override_id_ns(req); + break; + case NVME_ID_CNS_NS_DESC_LIST: + nvmet_passthru_override_id_descs(req); + break; + } + } else if (status < 0) + status = NVME_SC_INTERNAL; + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, status); + blk_mq_free_request(rq); +} + +static void nvmet_passthru_req_done(struct request *rq, + blk_status_t blk_status) +{ + struct nvmet_req *req = rq->end_io_data; + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, nvme_req(rq)->status); + blk_mq_free_request(rq); +} + +static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) +{ + struct scatterlist *sg; + struct bio *bio; + int i; + + if (req->sg_cnt > BIO_MAX_VECS) + return -EINVAL; + + if (nvmet_use_inline_bvec(req)) { + bio = &req->p.inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + } else { + bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt)); + bio->bi_end_io = bio_put; + } + bio->bi_opf = req_op(rq); + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, + sg->offset) < sg->length) { + nvmet_req_bio_put(req, bio); + return -EINVAL; + } + } + + blk_rq_bio_prep(rq, bio, req->sg_cnt); + + return 0; +} + +static void nvmet_passthru_execute_cmd(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl; + struct request_queue *q = ctrl->admin_q; + struct nvme_ns *ns = NULL; + struct request *rq = NULL; + unsigned int timeout; + u32 effects; + u16 status; + int ret; + + if (likely(req->sq->qid != 0)) { + u32 nsid = le32_to_cpu(req->cmd->common.nsid); + + ns = nvme_find_get_ns(ctrl, nsid); + if (unlikely(!ns)) { + pr_err("failed to get passthru ns nsid:%u\n", nsid); + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + q = ns->queue; + timeout = nvmet_req_subsys(req)->io_timeout; + } else { + timeout = nvmet_req_subsys(req)->admin_timeout; + } + + rq = nvme_alloc_request(q, req->cmd, 0); + if (IS_ERR(rq)) { + status = NVME_SC_INTERNAL; + goto out_put_ns; + } + + if (timeout) + rq->timeout = timeout; + + if (req->sg_cnt) { + ret = nvmet_passthru_map_sg(req, rq); + if (unlikely(ret)) { + status = NVME_SC_INTERNAL; + goto out_put_req; + } + } + + /* + * If there are effects for the command we are about to execute, or + * an end_req function we need to use nvme_execute_passthru_rq() + * synchronously in a work item seeing the end_req function and + * nvme_passthru_end() can't be called in the request done callback + * which is typically in interrupt context. 
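+ *
+ * The same work item path is also taken when use_workqueue was set at
+ * parse time (e.g. for identify), so that the override helpers above can
+ * rewrite the returned data before the request is completed.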
+ */ + effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode); + if (req->p.use_workqueue || effects) { + INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work); + req->p.rq = rq; + schedule_work(&req->p.work); + } else { + rq->end_io_data = req; + blk_execute_rq_nowait(rq, false, nvmet_passthru_req_done); + } + + if (ns) + nvme_put_ns(ns); + + return; + +out_put_req: + blk_mq_free_request(rq); +out_put_ns: + if (ns) + nvme_put_ns(ns); +out: + nvmet_req_complete(req, status); +} + +/* + * We need to emulate set host behaviour to ensure that any requested + * behaviour of the target's host matches the requested behaviour + * of the device's host and fail otherwise. + */ +static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl; + struct nvme_feat_host_behavior *host; + u16 status = NVME_SC_INTERNAL; + int ret; + + host = kzalloc(sizeof(*host) * 2, GFP_KERNEL); + if (!host) + goto out_complete_req; + + ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + if (ret) + goto out_free_host; + + status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host)); + if (status) + goto out_free_host; + + if (memcmp(&host[0], &host[1], sizeof(host[0]))) { + pr_warn("target host has requested different behaviour from the local host\n"); + status = NVME_SC_INTERNAL; + } + +out_free_host: + kfree(host); +out_complete_req: + nvmet_req_complete(req, status); +} + +static u16 nvmet_setup_passthru_command(struct nvmet_req *req) +{ + req->p.use_workqueue = false; + req->execute = nvmet_passthru_execute_cmd; + return NVME_SC_SUCCESS; +} + +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + + switch (req->cmd->common.opcode) { + case nvme_cmd_resv_register: + case nvme_cmd_resv_report: + case nvme_cmd_resv_acquire: + case nvme_cmd_resv_release: + /* + * Reservations cannot be supported properly because the + * underlying device has no way of differentiating different + * hosts that connect via fabrics. This could potentially be + * emulated in the future if regular targets grow support for + * this feature. + */ + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return nvmet_setup_passthru_command(req); +} + +/* + * Only features that are emulated or specifically allowed in the list are + * passed down to the controller. This function implements the allow list for + * both get and set features. + */ +static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_LBA_RANGE: + case NVME_FEAT_TEMP_THRESH: + case NVME_FEAT_ERR_RECOVERY: + case NVME_FEAT_VOLATILE_WC: + case NVME_FEAT_WRITE_ATOMIC: + case NVME_FEAT_AUTO_PST: + case NVME_FEAT_TIMESTAMP: + case NVME_FEAT_HCTM: + case NVME_FEAT_NOPSC: + case NVME_FEAT_RRL: + case NVME_FEAT_PLM_CONFIG: + case NVME_FEAT_PLM_WINDOW: + case NVME_FEAT_HOST_BEHAVIOR: + case NVME_FEAT_SANITIZE: + case NVME_FEAT_VENDOR_START ... 
NVME_FEAT_VENDOR_END: + return nvmet_setup_passthru_command(req); + + case NVME_FEAT_ASYNC_EVENT: + /* There is no support for forwarding ASYNC events */ + case NVME_FEAT_IRQ_COALESCE: + case NVME_FEAT_IRQ_CONFIG: + /* The IRQ settings will not apply to the target controller */ + case NVME_FEAT_HOST_MEM_BUF: + /* + * Any HMB that's set will not be passed through and will + * not work as expected + */ + case NVME_FEAT_SW_PROGRESS: + /* + * The Pre-Boot Software Load Count doesn't make much + * sense for a target to export + */ + case NVME_FEAT_RESV_MASK: + case NVME_FEAT_RESV_PERSIST: + /* No reservations, see nvmet_parse_passthru_io_cmd() */ + default: + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} + +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + + /* + * Passthru all vendor specific commands + */ + if (req->cmd->common.opcode >= nvme_admin_vendor_start) + return nvmet_setup_passthru_command(req); + + switch (req->cmd->common.opcode) { + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + return NVME_SC_SUCCESS; + case nvme_admin_keep_alive: + /* + * Most PCIe ctrls don't support keep alive cmd, we route keep + * alive to the non-passthru mode. In future please change this + * code when PCIe ctrls with keep alive support available. + */ + req->execute = nvmet_execute_keep_alive; + return NVME_SC_SUCCESS; + case nvme_admin_set_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_set_features; + return NVME_SC_SUCCESS; + case NVME_FEAT_HOST_BEHAVIOR: + req->execute = nvmet_passthru_set_host_behaviour; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_get_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_get_features; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_identify: + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + case NVME_ID_CNS_CS_CTRL: + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + } + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + case NVME_ID_CNS_NS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + case NVME_ID_CNS_CS_NS: + switch (req->cmd->identify.csi) { + case NVME_CSI_ZNS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + } + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + default: + return nvmet_setup_passthru_command(req); + } + case nvme_admin_get_log_page: + return nvmet_setup_passthru_command(req); + default: + /* Reject commands not in the allowlist above */ + return nvmet_report_invalid_opcode(req); + } +} + +int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys) +{ + struct nvme_ctrl *ctrl; + struct file *file; + int ret = -EINVAL; + void *old; + + mutex_lock(&subsys->lock); + if (!subsys->passthru_ctrl_path) + 
goto out_unlock; + if (subsys->passthru_ctrl) + goto out_unlock; + + if (subsys->nr_namespaces) { + pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); + goto out_unlock; + } + + file = filp_open(subsys->passthru_ctrl_path, O_RDWR, 0); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto out_unlock; + } + + ctrl = nvme_ctrl_from_file(file); + if (!ctrl) { + pr_err("failed to open nvme controller %s\n", + subsys->passthru_ctrl_path); + + goto out_put_file; + } + + old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL, + subsys, GFP_KERNEL); + if (xa_is_err(old)) { + ret = xa_err(old); + goto out_put_file; + } + + if (old) + goto out_put_file; + + subsys->passthru_ctrl = ctrl; + subsys->ver = ctrl->vs; + + if (subsys->ver < NVME_VS(1, 2, 1)) { + pr_warn("nvme controller version is too old: %llu.%llu.%llu, advertising 1.2.1\n", + NVME_MAJOR(subsys->ver), NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + subsys->ver = NVME_VS(1, 2, 1); + } + nvme_get_ctrl(ctrl); + __module_get(subsys->passthru_ctrl->ops->module); + ret = 0; + +out_put_file: + filp_close(file, NULL); +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ + if (subsys->passthru_ctrl) { + xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid); + module_put(subsys->passthru_ctrl->ops->module); + nvme_put_ctrl(subsys->passthru_ctrl); + } + subsys->passthru_ctrl = NULL; + subsys->ver = NVMET_DEFAULT_VS; +} + +void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys) +{ + mutex_lock(&subsys->lock); + __nvmet_passthru_ctrl_disable(subsys); + mutex_unlock(&subsys->lock); +} + +void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys) +{ + mutex_lock(&subsys->lock); + __nvmet_passthru_ctrl_disable(subsys); + mutex_unlock(&subsys->lock); + kfree(subsys->passthru_ctrl_path); +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma.c new file mode 100644 index 0000000..dd6d556 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma.c @@ -0,0 +1,2279 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics RDMA target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. 
+ */ +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include "nvmet.h" +#include "rdma_offload.h" + +/* + * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data + */ +#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_MAX_INLINE_SGE 4 +#define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) + +/* Assume mpsmin == device_page_size == 4KB */ +#define NVMET_RDMA_MAX_MDTS 8 +#define NVMET_RDMA_MAX_METADATA_MDTS 5 + +struct nvmet_rdma_srq; + +struct nvmet_rdma_cmd { + struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; + struct ib_cqe cqe; + struct ib_recv_wr wr; + struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; + struct nvme_command *nvme_cmd; + struct nvmet_rdma_queue *queue; + struct nvmet_rdma_srq *nsrq; +}; + +enum { + NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), + NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), +}; + +struct nvmet_rdma_rsp { + struct ib_sge send_sge; + struct ib_cqe send_cqe; + struct ib_send_wr send_wr; + + struct nvmet_rdma_cmd *cmd; + struct nvmet_rdma_queue *queue; + + struct ib_cqe read_cqe; + struct ib_cqe write_cqe; + struct rdma_rw_ctx rw; + + struct nvmet_req req; + + bool allocated; + u8 n_rdma; + u32 flags; + u32 invalidate_rkey; + + struct list_head wait_list; + struct list_head free_list; +}; + +enum nvmet_rdma_queue_state { + NVMET_RDMA_Q_CONNECTING, + NVMET_RDMA_Q_LIVE, + NVMET_RDMA_Q_DISCONNECTING, +}; + +struct nvmet_rdma_queue { + struct rdma_cm_id *cm_id; + struct ib_qp *qp; + struct nvmet_port *port; + struct ib_cq *cq; + atomic_t sq_wr_avail; + struct nvmet_rdma_device *dev; + struct nvmet_rdma_srq *nsrq; + spinlock_t state_lock; + enum nvmet_rdma_queue_state state; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + struct nvmet_rdma_rsp *rsps; + struct list_head free_rsps; + spinlock_t rsps_lock; + struct nvmet_rdma_cmd *cmds; + + struct work_struct release_work; + struct work_struct disconnect_work; + struct list_head rsp_wait_list; + struct list_head rsp_wr_wait_list; + spinlock_t rsp_wr_wait_lock; + + int idx; + int host_qid; + int comp_vector; + int recv_queue_size; + int send_queue_size; + + struct list_head queue_list; + + bool offload; + struct nvmet_rdma_xrq *xrq; +}; + +struct nvmet_rdma_port { + struct nvmet_port *nport; + struct sockaddr_storage addr; + struct rdma_cm_id *cm_id; + __be64 node_guid; + struct list_head entry; + struct delayed_work repair_work; +}; + +struct nvmet_rdma_srq { + struct ib_srq *srq; + struct nvmet_rdma_cmd *cmds; + struct nvmet_rdma_device *ndev; +}; + +struct nvmet_rdma_device { + struct ib_device *device; + struct ib_pd *pd; + struct nvmet_rdma_srq **srqs; + int srq_count; + size_t srq_size; + struct kref ref; + struct list_head entry; + int inline_data_size; + int inline_page_count; + bool rts2rts_qp_rmp; +}; + +static bool nvmet_rdma_use_srq; +module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); +MODULE_PARM_DESC(use_srq, "Use shared receive queue."); + +static int srq_size_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops srq_size_ops = { + .set = srq_size_set, + .get = param_get_int, +}; + +static int nvmet_rdma_srq_size = 1024; +module_param_cb(srq_size, &srq_size_ops, &nvmet_rdma_srq_size, 0644); +MODULE_PARM_DESC(srq_size, "set Shared Receive Queue (SRQ) size, should >= 
256 (default: 1024)"); + +static unsigned long long nvmet_rdma_offload_mem_start = 0; +module_param_named(offload_mem_start, nvmet_rdma_offload_mem_start, ullong, 0444); +MODULE_PARM_DESC(offload_mem_start, + "Start address of the memory dedicated for P2P data transfer. If not set, the driver will allocate 1MB staging buffer per offload context." + "Using bigger staging buffer will improve performance. Must be contiguous and aligned to" __stringify(PAGE_SIZE) "(default:0)"); + +static unsigned int nvmet_rdma_offload_mem_size_mb = 0; +module_param_named(offload_mem_size, nvmet_rdma_offload_mem_size_mb, uint, 0444); +MODULE_PARM_DESC(offload_mem_size, "Total dedicated memory size (in MiB) for P2P data transfers. The result of (offload_buffer_size * number offload context created) must not" + " exceed this value. Only used if offload_mem_start param is set (default:0)"); + +static int offload_buffer_size_set(const char *val, + const struct kernel_param *kp); +static const struct kernel_param_ops offload_buffer_size_ops = { + .set = offload_buffer_size_set, + .get = param_get_int, +}; + +static int nvmet_rdma_offload_buffer_size_mb = 128; +module_param_cb(offload_buffer_size, &offload_buffer_size_ops, + &nvmet_rdma_offload_buffer_size_mb, 0644); +MODULE_PARM_DESC(offload_buffer_size, "Staging buffer size (in MiB) per offload context. For static staging buffer, the result of (offload_buffer_size * number offload context created)" + " must not exceed offload_mem_size and only used if offload_mem_start and offload_mem_size params are set. Should be >= 8 (default:128)"); + +static DEFINE_IDA(nvmet_rdma_queue_ida); +static LIST_HEAD(nvmet_rdma_queue_list); +static DEFINE_MUTEX(nvmet_rdma_queue_mutex); + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static LIST_HEAD(port_list); +static DEFINE_MUTEX(port_list_mutex); + +static LIST_HEAD(nvmet_rdma_xrq_list); +static DEFINE_MUTEX(nvmet_rdma_xrq_mutex); +static struct nvmet_rdma_staging_buf_pool nvmet_rdma_st_pool; +static DEFINE_IDA(nvmet_rdma_bectrl_ida); + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); +static void nvmet_rdma_free_dev(struct kref *ref); +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r); +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r); + +static const struct nvmet_fabrics_ops nvmet_rdma_ops; + +static int srq_size_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (ret != 0 || n < 256) + return -EINVAL; + + return param_set_int(val, kp); +} + +static int num_pages(int len) +{ + return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); +} + +static int offload_buffer_size_set(const char *val, + const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 0, &n); + if (ret != 0 || n < 8) + return -EINVAL; + + return param_set_int(val, kp); +} + +static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) +{ + return nvme_is_write(rsp->req.cmd) && + rsp->req.transfer_len && + !(rsp->flags 
& NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) +{ + return !nvme_is_write(rsp->req.cmd) && + rsp->req.transfer_len && + !rsp->req.cqe->status && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline struct nvmet_rdma_rsp * +nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_rsp *rsp; + unsigned long flags; + + spin_lock_irqsave(&queue->rsps_lock, flags); + + rsp = list_first_entry_or_null(&queue->free_rsps, + struct nvmet_rdma_rsp, free_list); + if (likely(rsp)) + list_del(&rsp->free_list); + spin_unlock_irqrestore(&queue->rsps_lock, flags); + + if (unlikely(!rsp)) { + int ret; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (unlikely(!rsp)) + return NULL; + ret = nvmet_rdma_alloc_rsp(queue->dev, rsp); + if (unlikely(ret)) { + kfree(rsp); + return NULL; + } + + rsp->allocated = true; + } + + return rsp; +} + +static inline void +nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + + if (unlikely(rsp->allocated)) { + nvmet_rdma_free_rsp(rsp->queue->dev, rsp); + kfree(rsp); + return; + } + + spin_lock_irqsave(&rsp->queue->rsps_lock, flags); + list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); + spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); +} + +static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + int i; + + if (!ndev->inline_data_size) + return; + + sg = c->inline_sg; + sge = &c->sge[1]; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } +} + +static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + struct page *pg; + int len; + int i; + + if (!ndev->inline_data_size) + return 0; + + sg = c->inline_sg; + sg_init_table(sg, ndev->inline_page_count); + sge = &c->sge[1]; + len = ndev->inline_data_size; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) + goto out_err; + sg_assign_page(sg, pg); + sge->addr = ib_dma_map_page(ndev->device, + pg, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, sge->addr)) + goto out_err; + sge->length = min_t(int, len, PAGE_SIZE); + sge->lkey = ndev->pd->local_dma_lkey; + len -= sge->length; + } + + return 0; +out_err: + for (; i >= 0; i--, sg--, sge--) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } + return -ENOMEM; +} + +static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + /* NVMe command / RDMA RECV */ + c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL); + if (!c->nvme_cmd) + goto out; + + c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[0].addr)) + goto out_free_cmd; + + c->sge[0].length = sizeof(*c->nvme_cmd); + c->sge[0].lkey = ndev->pd->local_dma_lkey; + + if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c)) + goto out_unmap_cmd; + + c->cqe.done = nvmet_rdma_recv_done; + + c->wr.wr_cqe = &c->cqe; + c->wr.sg_list = c->sge; + c->wr.num_sge = admin ? 
1 : ndev->inline_page_count + 1; + + return 0; + +out_unmap_cmd: + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); +out_free_cmd: + kfree(c->nvme_cmd); + +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + if (!admin) + nvmet_rdma_free_inline_pages(ndev, c); + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + kfree(c->nvme_cmd); +} + +static struct nvmet_rdma_cmd * +nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, + int nr_cmds, bool admin) +{ + struct nvmet_rdma_cmd *cmds; + int ret = -EINVAL, i; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin); + if (ret) + goto out_free; + } + + return cmds; + +out_free: + while (--i >= 0) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +out: + return ERR_PTR(ret); +} + +static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin) +{ + int i; + + for (i = 0; i < nr_cmds; i++) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +} + +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + /* NVMe CQE / RDMA SEND */ + r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL); + if (!r->req.cqe) + goto out; + + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe, + sizeof(*r->req.cqe), DMA_TO_DEVICE); + if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) + goto out_free_rsp; + + if (!ib_uses_virt_dma(ndev->device)) + r->req.p2p_client = &ndev->device->dev; + r->send_sge.length = sizeof(*r->req.cqe); + r->send_sge.lkey = ndev->pd->local_dma_lkey; + + r->send_cqe.done = nvmet_rdma_send_done; + + r->send_wr.wr_cqe = &r->send_cqe; + r->send_wr.sg_list = &r->send_sge; + r->send_wr.num_sge = 1; + r->send_wr.send_flags = IB_SEND_SIGNALED; + + /* Data In / RDMA READ */ + r->read_cqe.done = nvmet_rdma_read_data_done; + /* Data Out / RDMA WRITE */ + r->write_cqe.done = nvmet_rdma_write_data_done; + + return 0; + +out_free_rsp: + kfree(r->req.cqe); +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + ib_dma_unmap_single(ndev->device, r->send_sge.addr, + sizeof(*r->req.cqe), DMA_TO_DEVICE); + kfree(r->req.cqe); +} + +static int +nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int nr_rsps = queue->recv_queue_size * 2; + int ret = -EINVAL, i; + + queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + GFP_KERNEL); + if (!queue->rsps) + goto out; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + ret = nvmet_rdma_alloc_rsp(ndev, rsp); + if (ret) + goto out_free; + + list_add_tail(&rsp->free_list, &queue->free_rsps); + } + + return 0; + +out_free: + while (--i >= 0) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +out: + return ret; +} + +static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int i, nr_rsps = queue->recv_queue_size * 2; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + 
kfree(queue->rsps); +} + +static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmd) +{ + int ret; + + ib_dma_sync_single_for_device(ndev->device, + cmd->sge[0].addr, cmd->sge[0].length, + DMA_FROM_DEVICE); + + if (cmd->nsrq) + ret = ib_post_srq_recv(cmd->nsrq->srq, &cmd->wr, NULL); + else + ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL); + + if (unlikely(ret)) + pr_err("post_recv cmd failed\n"); + + return ret; +} + +static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) +{ + spin_lock(&queue->rsp_wr_wait_lock); + while (!list_empty(&queue->rsp_wr_wait_list)) { + struct nvmet_rdma_rsp *rsp; + bool ret; + + rsp = list_entry(queue->rsp_wr_wait_list.next, + struct nvmet_rdma_rsp, wait_list); + list_del(&rsp->wait_list); + + spin_unlock(&queue->rsp_wr_wait_lock); + ret = nvmet_rdma_execute_command(rsp); + spin_lock(&queue->rsp_wr_wait_lock); + + if (!ret) { + list_add(&rsp->wait_list, &queue->rsp_wr_wait_list); + break; + } + } + spin_unlock(&queue->rsp_wr_wait_lock); +} + +static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr) +{ + struct ib_mr_status mr_status; + int ret; + u16 status = 0; + + ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); + if (ret) { + pr_err("ib_check_mr_status failed, ret %d\n", ret); + return NVME_SC_INVALID_PI; + } + + if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) { + switch (mr_status.sig_err.err_type) { + case IB_SIG_BAD_GUARD: + status = NVME_SC_GUARD_CHECK; + break; + case IB_SIG_BAD_REFTAG: + status = NVME_SC_REFTAG_CHECK; + break; + case IB_SIG_BAD_APPTAG: + status = NVME_SC_APPTAG_CHECK; + break; + } + pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n", + mr_status.sig_err.err_type, + mr_status.sig_err.expected, + mr_status.sig_err.actual); + } + + return status; +} + +static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi, + struct nvme_command *cmd, struct ib_sig_domain *domain, + u16 control, u8 pi_type) +{ + domain->sig_type = IB_SIG_TYPE_T10_DIF; + domain->sig.dif.bg_type = IB_T10DIF_CRC; + domain->sig.dif.pi_interval = 1 << bi->interval_exp; + domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag); + if (control & NVME_RW_PRINFO_PRCHK_REF) + domain->sig.dif.ref_remap = true; + + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); + domain->sig.dif.app_escape = true; + if (pi_type == NVME_NS_DPS_PI_TYPE3) + domain->sig.dif.ref_escape = true; +} + +static void nvmet_rdma_set_sig_attrs(struct nvmet_req *req, + struct ib_sig_attrs *sig_attrs) +{ + struct nvme_command *cmd = req->cmd; + u16 control = le16_to_cpu(cmd->rw.control); + u8 pi_type = req->ns->pi_type; + struct blk_integrity *bi; + + bi = bdev_get_integrity(req->ns->bdev); + + memset(sig_attrs, 0, sizeof(*sig_attrs)); + + if (control & NVME_RW_PRINFO_PRACT) { + /* for WRITE_INSERT/READ_STRIP no wire domain */ + sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + /* Clear the PRACT bit since HCA will generate/verify the PI */ + control &= ~NVME_RW_PRINFO_PRACT; + cmd->rw.control = cpu_to_le16(control); + /* PI is added by the HW */ + req->transfer_len += req->metadata_len; + } else { + /* for WRITE_PASS/READ_PASS both wire/memory domains exist */ + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control, + pi_type); + nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control, + pi_type); + } + + if (control & NVME_RW_PRINFO_PRCHK_REF) + 
sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG; + if (control & NVME_RW_PRINFO_PRCHK_GUARD) + sig_attrs->check_mask |= IB_SIG_CHECK_GUARD; + if (control & NVME_RW_PRINFO_PRCHK_APP) + sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG; +} + +static int nvmet_rdma_rw_ctx_init(struct nvmet_rdma_rsp *rsp, u64 addr, u32 key, + struct ib_sig_attrs *sig_attrs) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct nvmet_req *req = &rsp->req; + int ret; + + if (req->metadata_len) + ret = rdma_rw_ctx_signature_init(&rsp->rw, cm_id->qp, + cm_id->port_num, req->sg, req->sg_cnt, + req->metadata_sg, req->metadata_sg_cnt, sig_attrs, + addr, key, nvmet_data_dir(req)); + else + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + req->sg, req->sg_cnt, 0, addr, key, + nvmet_data_dir(req)); + + return ret; +} + +static void nvmet_rdma_rw_ctx_destroy(struct nvmet_rdma_rsp *rsp) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct nvmet_req *req = &rsp->req; + + if (req->metadata_len) + rdma_rw_ctx_destroy_signature(&rsp->rw, cm_id->qp, + cm_id->port_num, req->sg, req->sg_cnt, + req->metadata_sg, req->metadata_sg_cnt, + nvmet_data_dir(req)); + else + rdma_rw_ctx_destroy(&rsp->rw, cm_id->qp, cm_id->port_num, + req->sg, req->sg_cnt, nvmet_data_dir(req)); +} + +static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + + if (rsp->n_rdma) + nvmet_rdma_rw_ctx_destroy(rsp); + + if (rsp->req.sg != rsp->cmd->inline_sg) + nvmet_req_free_sgls(&rsp->req); + + if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) + nvmet_rdma_process_wr_wait_list(queue); + + nvmet_rdma_put_rsp(rsp); +} + +static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue) +{ + if (queue->nvme_sq.ctrl) { + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + } else { + /* + * we didn't setup the controller yet in case + * of admin connect error, just disconnect and + * cleanup the queue + */ + nvmet_rdma_queue_disconnect(queue); + } +} + +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + + nvmet_rdma_release_rsp(rsp); + + if (unlikely(wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR)) { + pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } +} + +static void nvmet_rdma_queue_response(struct nvmet_req *req) +{ + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct ib_send_wr *first_wr; + + if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; + rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; + } else { + rsp->send_wr.opcode = IB_WR_SEND; + } + + if (nvmet_rdma_need_data_out(rsp)) { + if (rsp->req.metadata_len) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, &rsp->write_cqe, NULL); + else + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + } else { + first_wr = &rsp->send_wr; + } + + nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); + + ib_dma_sync_single_for_device(rsp->queue->dev->device, + rsp->send_sge.addr, rsp->send_sge.length, + DMA_TO_DEVICE); + + if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) { + pr_err("sending cmd response 
failed\n"); + nvmet_rdma_release_rsp(rsp); + } +} + +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + u16 status = 0; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_rw_ctx_destroy(rsp); + nvmet_req_uninit(&rsp->req); + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + if (rsp->req.metadata_len) + status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); + nvmet_rdma_rw_ctx_destroy(rsp); + + if (unlikely(status)) + nvmet_req_complete(&rsp->req, status); + else + rsp->req.execute(&rsp->req); +} + +static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, write_cqe); + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u16 status; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_rw_ctx_destroy(rsp); + nvmet_req_uninit(&rsp->req); + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA WRITE for CQE failed with status %s (%d).\n", + ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + /* + * Upon RDMA completion check the signature status + * - if succeeded send good NVMe response + * - if failed send bad NVMe response with appropriate error + */ + status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr); + if (unlikely(status)) + rsp->req.cqe->status = cpu_to_le16(status << 1); + nvmet_rdma_rw_ctx_destroy(rsp); + + if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } +} + +static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, + u64 off) +{ + int sg_count = num_pages(len); + struct scatterlist *sg; + int i; + + sg = rsp->cmd->inline_sg; + for (i = 0; i < sg_count; i++, sg++) { + if (i < sg_count - 1) + sg_unmark_end(sg); + else + sg_mark_end(sg); + sg->offset = off; + sg->length = min_t(int, len, PAGE_SIZE - off); + len -= sg->length; + if (!i) + off = 0; + } + + rsp->req.sg = rsp->cmd->inline_sg; + rsp->req.sg_cnt = sg_count; +} + +static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl; + u64 off = le64_to_cpu(sgl->addr); + u32 len = le32_to_cpu(sgl->length); + + if (!nvme_is_write(rsp->req.cmd)) { + rsp->req.error_loc = + offsetof(struct nvme_common_command, opcode); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + if (off + len > rsp->queue->dev->inline_data_size) { + pr_err("invalid inline data offset!\n"); + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + } + + /* no data command? 
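+ * A zero-length inline descriptor just means the command carries no
+ * payload, so there is nothing to map.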
*/ + if (!len) + return 0; + + nvmet_rdma_use_inline_sg(rsp, len, off); + rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + rsp->req.transfer_len += len; + return 0; +} + +static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, + struct nvme_keyed_sgl_desc *sgl, bool invalidate) +{ + u64 addr = le64_to_cpu(sgl->addr); + u32 key = get_unaligned_le32(sgl->key); + struct ib_sig_attrs sig_attrs; + int ret; + + rsp->req.transfer_len = get_unaligned_le24(sgl->length); + + /* no data command? */ + if (!rsp->req.transfer_len) + return 0; + + if (rsp->req.metadata_len) + nvmet_rdma_set_sig_attrs(&rsp->req, &sig_attrs); + + ret = nvmet_req_alloc_sgls(&rsp->req); + if (unlikely(ret < 0)) + goto error_out; + + ret = nvmet_rdma_rw_ctx_init(rsp, addr, key, &sig_attrs); + if (unlikely(ret < 0)) + goto error_out; + rsp->n_rdma += ret; + + if (invalidate) { + rsp->invalidate_rkey = key; + rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; + } + + return 0; + +error_out: + rsp->req.transfer_len = 0; + return NVME_SC_INTERNAL; +} + +static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl; + + switch (sgl->type >> 4) { + case NVME_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_OFFSET: + return nvmet_rdma_map_sgl_inline(rsp); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + case NVME_KEY_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, true); + case NVME_SGL_FMT_ADDRESS: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + default: + pr_err("invalid SGL type: %#x\n", sgl->type); + rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); + return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + } +} + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + if (unlikely(atomic_sub_return(1 + rsp->n_rdma, + &queue->sq_wr_avail) < 0)) { + pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n", + 1 + rsp->n_rdma, queue->idx, + queue->nvme_sq.ctrl->cntlid); + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + return false; + } + + if (nvmet_rdma_need_data_in(rsp)) { + if (rdma_rw_ctx_post(&rsp->rw, queue->qp, + queue->cm_id->port_num, &rsp->read_cqe, NULL)) + nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); + } else { + rsp->req.execute(&rsp->req); + } + + return true; +} + +static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *cmd) +{ + u16 status; + + ib_dma_sync_single_for_cpu(queue->dev->device, + cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length, + DMA_FROM_DEVICE); + ib_dma_sync_single_for_cpu(queue->dev->device, + cmd->send_sge.addr, cmd->send_sge.length, + DMA_TO_DEVICE); + + if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_rdma_ops)) + return; + + status = nvmet_rdma_map_sgl(cmd); + if (status) + goto out_err; + + if (unlikely(!nvmet_rdma_execute_command(cmd))) { + spin_lock(&queue->rsp_wr_wait_lock); + list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list); + spin_unlock(&queue->rsp_wr_wait_lock); + } + + return; + +out_err: + nvmet_req_complete(&cmd->req, status); 
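+ /*
+ * Note: when the send queue has no room, the command is not failed but
+ * parked on rsp_wr_wait_list above; nvmet_rdma_process_wr_wait_list()
+ * re-runs it once outstanding work requests complete and free up slots.
+ */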
+} + +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_cmd *cmd = + container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); + struct nvmet_rdma_queue *queue = wc->qp->qp_context; + struct nvmet_rdma_rsp *rsp; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("RECV for CQE 0x%p failed with status %s (%d)\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), + wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + if (unlikely(wc->byte_len < sizeof(struct nvme_command))) { + pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n"); + nvmet_rdma_error_comp(queue); + return; + } + + cmd->queue = queue; + rsp = nvmet_rdma_get_rsp(queue); + if (unlikely(!rsp)) { + /* + * we get here only under memory pressure, + * silently drop and have the host retry + * as we can't even fail it. + */ + nvmet_rdma_post_recv(queue->dev, cmd); + return; + } + rsp->queue = queue; + rsp->cmd = cmd; + rsp->flags = 0; + rsp->req.cmd = cmd->nvme_cmd; + rsp->req.port = queue->port; + rsp->n_rdma = 0; + + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return; + } + + nvmet_rdma_handle_command(queue, rsp); +} + +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_srq *nsrq) +{ + nvmet_rdma_free_cmds(nsrq->ndev, nsrq->cmds, nsrq->ndev->srq_size, + false); + ib_destroy_srq(nsrq->srq); + + kfree(nsrq); +} + +static void nvmet_rdma_destroy_srqs(struct nvmet_rdma_device *ndev) +{ + int i; + + if (!ndev->srqs) + return; + + for (i = 0; i < ndev->srq_count; i++) + nvmet_rdma_destroy_srq(ndev->srqs[i]); + + kfree(ndev->srqs); +} + +static struct nvmet_rdma_srq * +nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +{ + struct ib_srq_init_attr srq_attr = { NULL, }; + size_t srq_size = ndev->srq_size; + struct nvmet_rdma_srq *nsrq; + struct ib_srq *srq; + int ret, i; + + nsrq = kzalloc(sizeof(*nsrq), GFP_KERNEL); + if (!nsrq) + return ERR_PTR(-ENOMEM); + + srq_attr.attr.max_wr = srq_size; + srq_attr.attr.max_sge = 1 + ndev->inline_page_count; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + srq = ib_create_srq(ndev->pd, &srq_attr); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto out_free; + } + + nsrq->cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(nsrq->cmds)) { + ret = PTR_ERR(nsrq->cmds); + goto out_destroy_srq; + } + + nsrq->srq = srq; + nsrq->ndev = ndev; + + for (i = 0; i < srq_size; i++) { + nsrq->cmds[i].nsrq = nsrq; + ret = nvmet_rdma_post_recv(ndev, &nsrq->cmds[i]); + if (ret) + goto out_free_cmds; + } + + return nsrq; + +out_free_cmds: + nvmet_rdma_free_cmds(ndev, nsrq->cmds, srq_size, false); +out_destroy_srq: + ib_destroy_srq(srq); +out_free: + kfree(nsrq); + return ERR_PTR(ret); +} + +static int nvmet_rdma_init_srqs(struct nvmet_rdma_device *ndev) +{ + int i, ret; + + if (!ndev->device->attrs.max_srq_wr || !ndev->device->attrs.max_srq) { + /* + * If SRQs aren't supported we just go ahead and use normal + * non-shared receive queues. 
+ */ + pr_info("SRQ requested but not supported.\n"); + return 0; + } + + ndev->srq_size = min(ndev->device->attrs.max_srq_wr, + nvmet_rdma_srq_size); + ndev->srq_count = min(ndev->device->num_comp_vectors, + ndev->device->attrs.max_srq); + + ndev->srqs = kcalloc(ndev->srq_count, sizeof(*ndev->srqs), GFP_KERNEL); + if (!ndev->srqs) + return -ENOMEM; + + for (i = 0; i < ndev->srq_count; i++) { + ndev->srqs[i] = nvmet_rdma_init_srq(ndev); + if (IS_ERR(ndev->srqs[i])) { + ret = PTR_ERR(ndev->srqs[i]); + goto err_srq; + } + } + + return 0; + +err_srq: + while (--i >= 0) + nvmet_rdma_destroy_srq(ndev->srqs[i]); + kfree(ndev->srqs); + return ret; +} + +static void nvmet_rdma_free_dev(struct kref *ref) +{ + struct nvmet_rdma_device *ndev = + container_of(ref, struct nvmet_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + nvmet_rdma_destroy_srqs(ndev); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static struct nvmet_rdma_device * +nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvmet_rdma_port *port = cm_id->context; + struct nvmet_port *nport = port->nport; + struct nvmet_rdma_device *ndev; + int inline_page_count; + int inline_sge_count; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device->node_guid == cm_id->device->node_guid && + kref_get_unless_zero(&ndev->ref)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + inline_page_count = num_pages(nport->inline_data_size); + inline_sge_count = max(cm_id->device->attrs.max_sge_rd, + cm_id->device->attrs.max_recv_sge) - 1; + if (inline_page_count > inline_sge_count) { + pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", + nport->inline_data_size, cm_id->device->name, + inline_sge_count * PAGE_SIZE); + nport->inline_data_size = inline_sge_count * PAGE_SIZE; + inline_page_count = inline_sge_count; + } + ndev->inline_data_size = nport->inline_data_size; + ndev->inline_page_count = inline_page_count; + + if (nport->pi_enable && !(cm_id->device->attrs.device_cap_flags & + IB_DEVICE_INTEGRITY_HANDOVER)) { + pr_warn("T10-PI is not supported by device %s. Disabling it\n", + cm_id->device->name); + nport->pi_enable = false; + } + + ndev->device = cm_id->device; + kref_init(&ndev->ref); + + if (nport->offload_srq_size > ndev->device->attrs.max_srq_wr) { + pr_warn("offload_srq_size %zu cannot be supported by device %s. 
Reducing to %d.\n", + nport->offload_srq_size, cm_id->device->name, + ndev->device->attrs.max_srq_wr); + nport->offload_srq_size = ndev->device->attrs.max_srq_wr; + } + + ndev->pd = ib_alloc_pd(ndev->device, 0); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + ndev->rts2rts_qp_rmp = nport->many_offload_subsys_support; + + if (nvmet_rdma_use_srq || (nport->offload && ndev->rts2rts_qp_rmp)) { + ret = nvmet_rdma_init_srqs(ndev); + if (ret) + goto out_free_pd; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + pr_debug("added %s.\n", ndev->device->name); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) +{ + struct ib_qp_init_attr qp_attr = { }; + struct nvmet_rdma_device *ndev = queue->dev; + int nr_cqe, ret, i, factor; + + if (queue->offload && !ndev->rts2rts_qp_rmp) { + ret = nvmet_rdma_find_get_xrq(queue, NULL); + if (ret) { + pr_err("failed to get XRQ for queue (%d)\n", + queue->host_qid); + goto out; + } + } + + /* + * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. + */ + nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; + + queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1, + queue->comp_vector, IB_POLL_WORKQUEUE); + if (IS_ERR(queue->cq)) { + ret = PTR_ERR(queue->cq); + pr_err("failed to create CQ cqe= %d ret= %d\n", + nr_cqe + 1, ret); + goto err_destroy_xrq; + } + + qp_attr.qp_context = queue; + qp_attr.event_handler = nvmet_rdma_qp_event; + qp_attr.send_cq = queue->cq; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + /* +1 for drain */ + qp_attr.cap.max_send_wr = queue->send_queue_size + 1; + factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num, + 1 << NVMET_RDMA_MAX_MDTS); + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor; + qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, + ndev->device->attrs.max_send_sge); + + if (queue->xrq) { + qp_attr.srq = queue->xrq->ofl_srq->srq; + qp_attr.recv_cq = NULL; + } else if (queue->nsrq) { + qp_attr.srq = queue->nsrq->srq; + qp_attr.recv_cq = queue->cq; + } else { + /* +1 for drain */ + qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; + qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; + qp_attr.recv_cq = queue->cq; + } + + if (queue->port->pi_enable && queue->host_qid) + qp_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN; + + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); + if (ret) { + pr_err("failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + queue->qp = queue->cm_id->qp; + + atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, + qp_attr.cap.max_send_wr, queue->cm_id); + + if (!queue->nsrq && !queue->xrq) { + for (i = 0; i < queue->recv_queue_size; i++) { + queue->cmds[i].queue = queue; + ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + if (ret) + goto err_destroy_qp; + } + } + +out: + return ret; + +err_destroy_qp: + rdma_destroy_qp(queue->cm_id); +err_destroy_cq: + ib_cq_pool_put(queue->cq, nr_cqe + 1); +err_destroy_xrq: + if (queue->xrq) + kref_put(&queue->xrq->ref, nvmet_rdma_destroy_xrq); + goto out; +} + +static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) +{ + /* todo: bug #974802 - need to debug with fw */ + ib_drain_qp(queue->qp); + if 
(queue->cm_id) + rdma_destroy_id(queue->cm_id); + ib_destroy_qp(queue->qp); + ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * + queue->send_queue_size + 1); + if (queue->xrq) + kref_put(&queue->xrq->ref, nvmet_rdma_destroy_xrq); +} + +static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) +{ + pr_debug("freeing queue %d\n", queue->idx); + + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_rdma_destroy_queue_ib(queue); + if (!queue->nsrq && !queue->offload) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } + nvmet_rdma_free_rsps(queue); + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + kfree(queue); +} + +static void nvmet_rdma_release_queue_work(struct work_struct *w) +{ + struct nvmet_rdma_queue *queue = + container_of(w, struct nvmet_rdma_queue, release_work); + struct nvmet_rdma_device *dev = queue->dev; + + nvmet_rdma_free_queue(queue); + + kref_put(&dev->ref, nvmet_rdma_free_dev); +} + +static void nvmet_rdma_disconnect_queue_work(struct work_struct *w) +{ + struct nvmet_rdma_queue *queue = + container_of(w, struct nvmet_rdma_queue, disconnect_work); + + if (queue) + nvmet_rdma_queue_disconnect(queue); +} + +static int +nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, + struct nvmet_rdma_queue *queue) +{ + struct nvme_rdma_cm_req *req; + + req = (struct nvme_rdma_cm_req *)conn->private_data; + if (!req || conn->private_data_len == 0) + return NVME_RDMA_CM_INVALID_LEN; + + if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0) + return NVME_RDMA_CM_INVALID_RECFMT; + + queue->host_qid = le16_to_cpu(req->qid); + + /* + * req->hsqsize corresponds to our recv queue size plus 1 + * req->hrqsize corresponds to our send queue size + */ + queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; + queue->send_queue_size = le16_to_cpu(req->hrqsize); + + if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH) + return NVME_RDMA_CM_INVALID_HSQSIZE; + + /* XXX: Should we enforce some kind of max for IO queues? */ + + return 0; +} + +static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, + enum nvme_rdma_cm_status status) +{ + struct nvme_rdma_cm_rej rej; + + pr_debug("rejecting connect request: status %d (%s)\n", + status, nvme_rdma_cm_msg(status)); + + rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + rej.sts = cpu_to_le16(status); + + return rdma_reject(cm_id, (void *)&rej, sizeof(rej), + IB_CM_REJ_CONSUMER_DEFINED); +} + +static struct nvmet_rdma_queue * +nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, + struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_port *port = cm_id->context; + struct nvmet_rdma_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_reject; + } + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_queue; + } + + ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue); + if (ret) + goto out_destroy_sq; + + /* + * Schedules the actual release because calling rdma_destroy_id from + * inside a CM callback would trigger a deadlock. (great API design..) 
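+ * The deferred release (nvmet_rdma_release_queue_work above) runs from
+ * the system workqueue, where tearing down the queue and its cm_id is
+ * safe.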
+ */ + INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); + INIT_WORK(&queue->disconnect_work, nvmet_rdma_disconnect_queue_work); + queue->dev = ndev; + queue->cm_id = cm_id; + queue->port = port->nport; + queue->offload = queue->port->offload && queue->host_qid; + + spin_lock_init(&queue->state_lock); + queue->state = NVMET_RDMA_Q_CONNECTING; + INIT_LIST_HEAD(&queue->rsp_wait_list); + INIT_LIST_HEAD(&queue->rsp_wr_wait_list); + spin_lock_init(&queue->rsp_wr_wait_lock); + INIT_LIST_HEAD(&queue->free_rsps); + spin_lock_init(&queue->rsps_lock); + INIT_LIST_HEAD(&queue->queue_list); + + queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_destroy_sq; + } + + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + queue->comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + + ret = nvmet_rdma_alloc_rsps(queue); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_ida_remove; + } + + if (ndev->srqs) { + queue->nsrq = ndev->srqs[queue->comp_vector % ndev->srq_count]; + } else if(!queue->nsrq && !queue->offload) { + queue->cmds = nvmet_rdma_alloc_cmds(ndev, + queue->recv_queue_size, + !queue->host_qid); + if (IS_ERR(queue->cmds)) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_responses; + } + } + + ret = nvmet_rdma_create_queue_ib(queue); + if (ret) { + pr_err("%s: creating RDMA queue failed (%d).\n", + __func__, ret); + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_cmds; + } + + return queue; + +out_free_cmds: + if (!queue->nsrq && !queue->offload) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } +out_free_responses: + nvmet_rdma_free_rsps(queue); +out_ida_remove: + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); +out_destroy_sq: + nvmet_sq_destroy(&queue->nvme_sq); +out_free_queue: + kfree(queue); +out_reject: + nvmet_rdma_cm_reject(cm_id, ret); + return NULL; +} + +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) +{ + struct nvmet_rdma_queue *queue = priv; + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(queue->cm_id, event->event); + break; + case IB_EXP_EVENT_XRQ_QP_ERR: + pr_err("queue %p received IB QP event: %s (%d)\n", + queue, ib_event_msg(event->event), event->event); + schedule_work(&queue->disconnect_work); + break; + case IB_EVENT_QP_LAST_WQE_REACHED: + pr_debug("received IB QP event: %s (%d)\n", + ib_event_msg(event->event), event->event); + break; + default: + pr_err("received IB QP event: %s (%d)\n", + ib_event_msg(event->event), event->event); + break; + } +} + +static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue, + struct rdma_conn_param *p) +{ + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_rep priv = { }; + int ret = -ENOMEM; + + param.rnr_retry_count = 7; + if (queue->offload) + param.min_rnr_timer = IB_RNR_TIMER_000_01; + + param.flow_control = 1; + param.initiator_depth = min_t(u8, p->initiator_depth, + queue->dev->device->attrs.max_qp_init_rd_atom); + param.private_data = &priv; + param.private_data_len = sizeof(priv); + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.crqsize = cpu_to_le16(queue->recv_queue_size); + + ret = rdma_accept(cm_id, ¶m); + if (ret) + pr_err("rdma_accept failed (error code = %d)\n", ret); + + return ret; +} + +static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct 
nvmet_rdma_device *ndev; + struct nvmet_rdma_queue *queue; + int ret = -EINVAL; + + ndev = nvmet_rdma_find_get_device(cm_id); + if (!ndev) { + nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC); + return -ECONNREFUSED; + } + + queue = nvmet_rdma_alloc_queue(ndev, cm_id, event); + if (!queue) { + ret = -ENOMEM; + goto put_device; + } + + if (queue->host_qid == 0) { + /* Let inflight controller teardown complete */ + flush_scheduled_work(); + } + + ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); + if (ret) { + /* + * Don't destroy the cm_id in free path, as we implicitly + * destroy the cm_id here with non-zero ret code. + */ + queue->cm_id = NULL; + goto free_queue; + } + + mutex_lock(&nvmet_rdma_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + return 0; + +free_queue: + nvmet_rdma_free_queue(queue); +put_device: + kref_put(&ndev->ref, nvmet_rdma_free_dev); + + return ret; +} + +static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_CONNECTING) { + pr_warn("trying to establish a connected queue\n"); + goto out_unlock; + } + queue->state = NVMET_RDMA_Q_LIVE; + + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *cmd; + + cmd = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, wait_list); + list_del(&cmd->wait_list); + + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_handle_command(queue, cmd); + spin_lock_irqsave(&queue->state_lock, flags); + } + +out_unlock: + spin_unlock_irqrestore(&queue->state_lock, flags); +} + +static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + unsigned long flags; + + pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state); + + spin_lock_irqsave(&queue->state_lock, flags); + switch (queue->state) { + case NVMET_RDMA_Q_CONNECTING: + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *rsp; + + rsp = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, + wait_list); + list_del(&rsp->wait_list); + nvmet_rdma_put_rsp(rsp); + } + fallthrough; + case NVMET_RDMA_Q_LIVE: + queue->state = NVMET_RDMA_Q_DISCONNECTING; + disconnect = true; + break; + case NVMET_RDMA_Q_DISCONNECTING: + break; + } + spin_unlock_irqrestore(&queue->state_lock, flags); + + if (disconnect) { + rdma_disconnect(queue->cm_id); + schedule_work(&queue->release_work); + } +} + +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) { + list_del_init(&queue->queue_list); + disconnect = true; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + if (disconnect) + __nvmet_rdma_queue_disconnect(queue); +} + +static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + pr_err("failed to connect queue %d\n", queue->idx); + schedule_work(&queue->release_work); +} + +static void nvmet_rdma_destroy_xrqs(struct nvmet_port *nport) +{ + struct nvmet_rdma_xrq *xrq, *next; + + mutex_lock(&nvmet_rdma_xrq_mutex); + list_for_each_entry_safe(xrq, next, &nvmet_rdma_xrq_list, entry) { + if 
(xrq->port == nport) { + /* + * nvmet_rdma_destroy_xrq lock nvmet_rdma_xrq_mutex too, + * so we need to unlock it to avoid a deadlock. + */ + mutex_unlock(&nvmet_rdma_xrq_mutex); + kref_put(&xrq->ref, nvmet_rdma_destroy_xrq); + mutex_lock(&nvmet_rdma_xrq_mutex); + } + } + mutex_unlock(&nvmet_rdma_xrq_mutex); +} + +static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue = NULL; + int ret = 0; + + if (cm_id->qp) + queue = cm_id->qp->qp_context; + + pr_debug("%s (%d): status %d id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cm_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = nvmet_rdma_queue_connect(cm_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + nvmet_rdma_queue_established(queue); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + if (!queue) { + struct nvmet_rdma_port *port = cm_id->context; + + schedule_delayed_work(&port->repair_work, 0); + break; + } + fallthrough; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + nvmet_rdma_queue_disconnect(queue); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + break; /* handled by nvmet_rdma_remove_one */ + case RDMA_CM_EVENT_REJECTED: + pr_debug("Connection rejected: %s\n", + rdma_reject_msg(cm_id, event->status)); + fallthrough; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + nvmet_rdma_queue_connect_fail(cm_id, queue); + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + +static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_queue *queue; + +restart: + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl == ctrl) { + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + __nvmet_rdma_queue_disconnect(queue); + goto restart; + } + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static void nvmet_rdma_destroy_port_queues(struct nvmet_rdma_port *port) +{ + struct nvmet_rdma_queue *queue, *tmp; + struct nvmet_port *nport = port->nport; + + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { + if (queue->port != nport) + continue; + + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) +{ + struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); + struct nvmet_port *nport = port->nport; + + if (nport->offload && cm_id) + nvmet_rdma_destroy_xrqs(nport); + + port->cm_id = NULL; + + if (cm_id) + rdma_destroy_id(cm_id); + + /* + * Destroy the remaining queues, which are not belong to any + * controller yet. Do it here after the RDMA-CM was destroyed + * guarantees that no new queue will be created. + */ + nvmet_rdma_destroy_port_queues(port); +} + +static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) +{ + struct sockaddr *addr = (struct sockaddr *)&port->addr; + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + pr_err("CM ID creation failed\n"); + return PTR_ERR(cm_id); + } + + /* + * Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. 
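+ * rdma_set_afonly(cm_id, 1) restricts this listener to the address
+ * family it is bound to (much like the IPV6_V6ONLY socket option), so
+ * IPv4 and IPv6 listeners can share the same port number.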
+ */ + ret = rdma_set_afonly(cm_id, 1); + if (ret) { + pr_err("rdma_set_afonly failed (%d)\n", ret); + goto out_destroy_id; + } + + ret = rdma_bind_addr(cm_id, addr); + if (ret) { + pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret); + goto out_destroy_id; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + pr_err("listening to %pISpcs failed (%d)\n", addr, ret); + goto out_destroy_id; + } + + port->cm_id = cm_id; + if (cm_id->device) { + port->node_guid = cm_id->device->node_guid; + port->nport->many_offload_subsys_support = + cm_id->device->attrs.device_cap_flags & + IB_DEVICE_QP_MODIFY_RMP; + } + + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(port->nport->disc_addr.portid), addr); + + return 0; + +out_destroy_id: + rdma_destroy_id(cm_id); + return ret; +} + +static void nvmet_rdma_repair_port_work(struct work_struct *w) +{ + struct nvmet_rdma_port *port = container_of(to_delayed_work(w), + struct nvmet_rdma_port, repair_work); + int ret; + + nvmet_rdma_disable_port(port); + ret = nvmet_rdma_enable_port(port); + if (ret) + schedule_delayed_work(&port->repair_work, 5 * HZ); +} + +static int nvmet_rdma_add_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port; + __kernel_sa_family_t af; + int ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + nport->priv = port; + port->nport = nport; + INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work); + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto out_free_port; + } + + if (nport->inline_data_size < 0) { + nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + nport->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto out_free_port; + } + + ret = nvmet_rdma_enable_port(port); + if (ret) + goto out_free_port; + + mutex_lock(&port_list_mutex); + list_add_tail(&port->entry, &port_list); + mutex_unlock(&port_list_mutex); + + return 0; + +out_free_port: + kfree(port); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + + mutex_lock(&port_list_mutex); + list_del(&port->entry); + mutex_unlock(&port_list_mutex); + + cancel_delayed_work_sync(&port->repair_work); + nvmet_rdma_disable_port(port); + kfree(port); +} + +static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, + struct nvmet_port *nport, char *traddr) +{ + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; + + if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *req_cm_id = rsp->queue->cm_id; + struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr; + + sprintf(traddr, "%pISc", addr); + } else { + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); + } +} + +static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) +{ + 
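+ /* A controller created with T10-PI support advertises a smaller MDTS. */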
if (ctrl->pi_support) + return NVMET_RDMA_MAX_METADATA_MDTS; + return NVMET_RDMA_MAX_MDTS; +} + +static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) +{ + return NVME_RDMA_MAX_QUEUE_SIZE; +} + +static bool nvmet_rdma_is_port_active(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + + return port->cm_id ? true : false; +} + +static const struct nvmet_fabrics_ops nvmet_rdma_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_RDMA, + .msdbd = 1, + .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, + .add_port = nvmet_rdma_add_port, + .is_port_active = nvmet_rdma_is_port_active, + .remove_port = nvmet_rdma_remove_port, + .peer_to_peer_capable = nvmet_rdma_peer_to_peer_capable, + .install_queue = nvmet_rdma_install_offload_queue, + .create_offload_ctrl = nvmet_rdma_create_offload_ctrl, + .enable_offload_ns = nvmet_rdma_enable_offload_ns, + .disable_offload_ns = nvmet_rdma_disable_offload_ns, + .peer_to_peer_sqe_inline_size = nvmet_rdma_peer_to_peer_sqe_inline_size, + .peer_to_peer_mdts = nvmet_rdma_peer_to_peer_mdts, + .offload_subsys_unknown_ns_cmds = nvmet_rdma_offload_subsys_unknown_ns_cmds, + .offload_ns_read_cmds = nvmet_rdma_offload_ns_read_cmds, + .offload_ns_read_blocks = nvmet_rdma_offload_ns_read_blocks, + .offload_ns_write_cmds = nvmet_rdma_offload_ns_write_cmds, + .offload_ns_write_blocks = nvmet_rdma_offload_ns_write_blocks, + .offload_ns_write_inline_cmds = nvmet_rdma_offload_ns_write_inline_cmds, + .offload_ns_flush_cmds = nvmet_rdma_offload_ns_flush_cmds, + .offload_ns_error_cmds = nvmet_rdma_offload_ns_error_cmds, + .offload_ns_backend_error_cmds = nvmet_rdma_offload_ns_backend_error_cmds, + .offload_query_counters = nvmet_rdma_offload_query_counters, + .check_subsys_match_offload_port = nvmet_rdma_check_subsys_match_offload_port, + .destroy_offload_ctrl = nvmet_rdma_destroy_offload_ctrl, + .queue_response = nvmet_rdma_queue_response, + .delete_ctrl = nvmet_rdma_delete_ctrl, + .disc_traddr = nvmet_rdma_disc_port_addr, + .get_mdts = nvmet_rdma_get_mdts, + .get_max_queue_size = nvmet_rdma_get_max_queue_size, +}; + +static int nvmet_rdma_add_one(struct ib_device *ib_device) +{ + struct nvmet_rdma_port *port, *n; + + mutex_lock(&port_list_mutex); + list_for_each_entry_safe(port, n, &port_list, entry) { + if (port->node_guid != ib_device->node_guid) + continue; + + schedule_delayed_work(&port->repair_work, HZ); + } + mutex_unlock(&port_list_mutex); + + return 0; +} + +static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvmet_rdma_queue *queue, *tmp; + struct nvmet_rdma_port *port, *n; + struct nvmet_rdma_device *ndev; + bool found = false; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device == ib_device) { + found = true; + break; + } + } + mutex_unlock(&device_list_mutex); + + /* + * IB Device that is used by nvmet controllers is being removed, + * delete all queues using this device. 
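+ * Listening ports bound to this device are disabled as well, and any
+ * scheduled release work is flushed before the device goes away.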
+ */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { + if (queue->dev->device != ib_device) + continue; + + pr_info("Removing queue %d\n", queue->idx); + list_del_init(&queue->queue_list); + __nvmet_rdma_queue_disconnect(queue); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + mutex_lock(&port_list_mutex); + list_for_each_entry_safe(port, n, &port_list, entry) { + if (port->node_guid != ib_device->node_guid) + continue; + + nvmet_rdma_disable_port(port); + } + mutex_unlock(&port_list_mutex); + + if (found) + flush_scheduled_work(); +} + +static struct ib_client nvmet_rdma_ib_client = { + .name = "nvmet_rdma", + .add = nvmet_rdma_add_one, + .remove = nvmet_rdma_remove_one +}; + +static int __init nvmet_rdma_init(void) +{ + struct nvmet_rdma_staging_buf *st, *tmp; + int ret; + + memset(&nvmet_rdma_st_pool, 0, + sizeof(struct nvmet_rdma_staging_buf_pool)); + + INIT_LIST_HEAD(&nvmet_rdma_st_pool.list); + + if (nvmet_rdma_offload_mem_start && nvmet_rdma_offload_mem_size_mb && + nvmet_rdma_offload_buffer_size_mb && + nvmet_rdma_offload_mem_size_mb >= nvmet_rdma_offload_buffer_size_mb) { + ret = nvmet_rdma_init_st_pool(&nvmet_rdma_st_pool, + nvmet_rdma_offload_mem_start, + nvmet_rdma_offload_mem_size_mb, + nvmet_rdma_offload_buffer_size_mb); + if (ret) + return ret; + } + + ret = ib_register_client(&nvmet_rdma_ib_client); + if (ret) + goto error; + + ret = nvmet_register_transport(&nvmet_rdma_ops); + if (ret) + goto err_ib_client; + + return 0; + +err_ib_client: + ib_unregister_client(&nvmet_rdma_ib_client); + +error: + list_for_each_entry_safe(st, tmp, &nvmet_rdma_st_pool.list, entry) { + list_del(&st->entry); + nvmet_rdma_free_st_buff(st); + } + + return ret; +} + +static void __exit nvmet_rdma_exit(void) +{ + struct nvmet_rdma_staging_buf *st, *tmp; + + nvmet_unregister_transport(&nvmet_rdma_ops); + ib_unregister_client(&nvmet_rdma_ib_client); + WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list)); + ida_destroy(&nvmet_rdma_queue_ida); + ida_destroy(&nvmet_rdma_bectrl_ida); + + mutex_lock(&nvmet_rdma_xrq_mutex); + list_for_each_entry_safe(st, tmp, &nvmet_rdma_st_pool.list, entry) { + list_del(&st->entry); + nvmet_rdma_free_st_buff(st); + } + mutex_unlock(&nvmet_rdma_xrq_mutex); +} + +module_init(nvmet_rdma_init); +module_exit(nvmet_rdma_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ + +#include "rdma_offload.c" diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.c new file mode 100644 index 0000000..61bd39a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.c @@ -0,0 +1,1127 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, see . 
+ */ + +#include "rdma_offload.h" + +static unsigned int +__nvmet_rdma_peer_to_peer_sqe_inline_size(struct ib_nvmf_caps *nvmf_caps, + struct nvmet_port *nport); +static int nvmet_rdma_attach_xrq(struct nvmet_rdma_xrq *xrq, + struct nvmet_ctrl *ctrl); + +static int nvmet_rdma_fill_srq_nvmf_attrs(struct ib_srq_init_attr *srq_attr, + struct nvmet_rdma_xrq *xrq) +{ + struct ib_nvmf_caps *nvmf_caps = &xrq->ndev->device->attrs.nvmf_caps; + unsigned int sqe_inline_size = __nvmet_rdma_peer_to_peer_sqe_inline_size(nvmf_caps, xrq->port); + + srq_attr->ext.nvmf.type = IB_NVMF_READ_WRITE_FLUSH_OFFLOAD; + if (xrq->port->offload_passthrough_sqe_rw) + srq_attr->ext.nvmf.passthrough_sqe_rw_service_en = 1; + srq_attr->ext.nvmf.log_max_namespace = ilog2(nvmf_caps->max_namespace); + srq_attr->ext.nvmf.cmd_size = (sizeof(struct nvme_command) + sqe_inline_size) / 16; + srq_attr->ext.nvmf.data_offset = 0; + srq_attr->ext.nvmf.log_max_io_size = ilog2(nvmf_caps->max_io_sz); + srq_attr->ext.nvmf.nvme_memory_log_page_size = 0; + srq_attr->ext.nvmf.nvme_queue_size = clamp_t(u32, + xrq->port->offload_queue_size, + nvmf_caps->min_queue_sz, + nvmf_caps->max_queue_sz); + srq_attr->ext.nvmf.staging_buffer_number_of_pages = xrq->st->num_pages; + srq_attr->ext.nvmf.staging_buffer_log_page_size = ilog2(xrq->st->page_size >> 12); //4k granularity in PRM + srq_attr->ext.nvmf.staging_buffer_pas = kzalloc(sizeof(dma_addr_t) * xrq->st->num_pages, GFP_KERNEL); + if (!srq_attr->ext.nvmf.staging_buffer_pas) + return -ENOMEM; + + return 0; +} + +static void nvmet_rdma_free_st_buff(struct nvmet_rdma_staging_buf *st) +{ + if (st->dynamic) + kfree(st->staging_pages); + kfree(st->staging_dma_addrs); + kfree(st); +} + +/** + * Called with nvmet_rdma_xrq_mutex held + **/ +static void nvmet_rdma_release_st_buff(struct nvmet_rdma_staging_buf *st) +{ + if (st->dynamic) + nvmet_rdma_free_st_buff(st); + else + list_add_tail(&st->entry, &nvmet_rdma_st_pool.list); +} + +static struct nvmet_rdma_staging_buf *nvmet_rdma_alloc_st_buff(u16 num_pages, + unsigned int page_size_mb, bool dynamic) +{ + struct nvmet_rdma_staging_buf *st; + + st = kzalloc(sizeof(struct nvmet_rdma_staging_buf), GFP_KERNEL); + if (!st) + return NULL; + + st->staging_dma_addrs = kzalloc(sizeof(dma_addr_t) * num_pages, GFP_KERNEL); + if (!st->staging_dma_addrs) + goto free_st; + + if (dynamic) { + /* only in dynamic allocation we use virtual addresses too */ + st->staging_pages = kzalloc(sizeof(void*) * num_pages, GFP_KERNEL); + if (!st->staging_pages) + goto free_st_dma_addrs; + } + + st->num_pages = num_pages; + st->page_size = page_size_mb * SZ_1M; + st->dynamic = dynamic; + + return st; + +free_st_dma_addrs: + kfree(st->staging_dma_addrs); +free_st: + kfree(st); + + return NULL; +} + +static void nvmet_rdma_destroy_xrq(struct kref *ref) +{ + struct nvmet_rdma_xrq *xrq = + container_of(ref, struct nvmet_rdma_xrq, ref); + struct nvmet_rdma_device *ndev = xrq->ndev; + struct nvmet_rdma_staging_buf *st = xrq->st; + int i; + + pr_info("destroying XRQ %p port %p\n", xrq, xrq->port); + + mutex_lock(&nvmet_rdma_xrq_mutex); + if (!list_empty(&xrq->entry)) + list_del_init(&xrq->entry); + mutex_unlock(&nvmet_rdma_xrq_mutex); + + /* TODO: check if need to reduce refcound on pdev */ + nvmet_rdma_free_cmds(ndev, xrq->ofl_srq->cmds, xrq->ofl_srq_size, false); + ib_destroy_srq(xrq->ofl_srq->srq); + if (st->dynamic) { + for (i = 0 ; i < st->num_pages ; i++) + dma_free_coherent(ndev->device->dma_device, + st->page_size, st->staging_pages[i], + st->staging_dma_addrs[i]); + } + + 
ib_free_cq(xrq->cq); + nvmet_rdma_release_st_buff(st); + kfree(xrq->ofl_srq); + kfree(xrq); + kref_put(&ndev->ref, nvmet_rdma_free_dev); +} + +static int nvmet_rdma_init_xrq(struct nvmet_rdma_queue *queue, + struct nvmet_subsys *subsys) +{ + struct ib_srq_init_attr srq_attr = { NULL, }; + struct ib_srq *srq; + int ret, i, j; + struct nvmet_rdma_xrq *xrq; + struct nvmet_rdma_staging_buf *st; + struct nvmet_port *port = queue->port; + size_t srq_size = port->offload_srq_size; + struct nvmet_rdma_device *ndev = queue->dev; + + xrq = kzalloc(sizeof(*xrq), GFP_KERNEL); + if (!xrq) + return -ENOMEM; + + xrq->ofl_srq = kzalloc(sizeof(*xrq->ofl_srq), GFP_KERNEL); + if (!xrq->ofl_srq) { + ret = -ENOMEM; + goto free_xrq; + } + + kref_init(&xrq->ref); + mutex_init(&xrq->offload_ctrl_mutex); + INIT_LIST_HEAD(&xrq->be_ctrls_list); + mutex_init(&xrq->be_mutex); + xrq->ndev = ndev; + xrq->port = port; + xrq->subsys = subsys; + + if (!list_empty(&nvmet_rdma_st_pool.list)) { + st = list_first_entry(&nvmet_rdma_st_pool.list, + struct nvmet_rdma_staging_buf, entry); + list_del(&st->entry); + } else { + u16 num_pages = nvmet_rdma_offload_buffer_size_mb / + NVMET_DYNAMIC_STAGING_BUFFER_PAGE_SIZE_MB; + st = nvmet_rdma_alloc_st_buff(num_pages, + NVMET_DYNAMIC_STAGING_BUFFER_PAGE_SIZE_MB, + true); + } + if (!st) { + ret = -ENOMEM; + goto free_ofl_srq; + } + xrq->st = st; + + pr_info("using %s staging buffer %p\n", + st->dynamic ? "dynamic" : "static", st); + + /* This CQ is not associated to a specific queue */ + xrq->cq = ib_alloc_cq(ndev->device, NULL, 4096, 0, IB_POLL_WORKQUEUE); + if (IS_ERR(xrq->cq)) { + ret = PTR_ERR(xrq->cq); + pr_err("failed to create CQ for xrq cqe= %d ret= %d\n", + 4096, ret); + goto free_xrq_st; + } + + srq_attr.attr.max_wr = srq_size; + srq_attr.attr.max_sge = 1 + ndev->inline_page_count; + srq_attr.srq_type = IB_EXP_SRQT_NVMF; + if (nvmet_rdma_fill_srq_nvmf_attrs(&srq_attr, xrq)) { + ret = -ENOMEM; + goto free_xrq_cq; + } + + for (i = 0 ; i < st->num_pages ; i++) { + if (st->dynamic) { + st->staging_pages[i] = + dma_alloc_coherent(ndev->device->dma_device, + st->page_size, + &st->staging_dma_addrs[i], + GFP_KERNEL); + if (!st->staging_pages[i]) { + ret = -ENOMEM; + goto release_st_buf; + } + } + memcpy(&srq_attr.ext.nvmf.staging_buffer_pas[i], + &st->staging_dma_addrs[i], sizeof(dma_addr_t)); + } + + srq_attr.ext.cq = xrq->cq; + srq = ib_create_srq(ndev->pd, &srq_attr); + if (IS_ERR(srq)) { + pr_err("failed to create xrq SRQ"); + ret = PTR_ERR(srq); + goto release_st_buf; + } + + xrq->ofl_srq->cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(xrq->ofl_srq->cmds)) { + ret = PTR_ERR(xrq->ofl_srq->cmds); + goto out_destroy_srq; + } + + if (!kref_get_unless_zero(&ndev->ref)) { + ret = -EINVAL; + goto out_free_cmds; + } + + xrq->ofl_srq->srq = srq; + xrq->ofl_srq->ndev = ndev; + xrq->ofl_srq_size = srq_size; + st->xrq = xrq; + xrq->nvme_queue_depth = srq_attr.ext.nvmf.nvme_queue_size; + queue->xrq = xrq; + + for (i = 0; i < srq_size; i++) { + xrq->ofl_srq->cmds[i].queue = queue; + xrq->ofl_srq->cmds[i].nsrq = xrq->ofl_srq; + ret = nvmet_rdma_post_recv(ndev, &xrq->ofl_srq->cmds[i]); + if (ret) { + pr_err("initial post_recv failed on XRQ 0x%p\n", srq); + goto out_kref_put; + } + } + + kfree(srq_attr.ext.nvmf.staging_buffer_pas); + + return 0; + +out_kref_put: + kref_put(&ndev->ref, nvmet_rdma_free_dev); +out_free_cmds: + nvmet_rdma_free_cmds(ndev, xrq->ofl_srq->cmds, srq_size, false); +out_destroy_srq: + ib_destroy_srq(srq); +release_st_buf: + if (st->dynamic) { + for (j = 0 
; j < i ; j++) + dma_free_coherent(ndev->device->dma_device, + st->page_size, st->staging_pages[j], + st->staging_dma_addrs[j]); + } + kfree(srq_attr.ext.nvmf.staging_buffer_pas); +free_xrq_cq: + ib_free_cq(xrq->cq); +free_xrq_st: + nvmet_rdma_release_st_buff(st); +free_ofl_srq: + kfree(xrq->ofl_srq); +free_xrq: + kfree(xrq); + + return ret; +} + +static int nvmet_rdma_find_get_xrq(struct nvmet_rdma_queue *queue, + struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_xrq *xrq; + struct nvmet_subsys *subsys = ctrl ? ctrl->subsys : NULL; + int active_xrq = 0; + uint min = UINT_MAX; + int ret = 0; + + mutex_lock(&nvmet_rdma_xrq_mutex); + list_for_each_entry(xrq, &nvmet_rdma_xrq_list, entry) { + if (xrq->port == queue->port && + (!subsys || xrq->subsys == subsys)) { + active_xrq++; + if (kref_read(&xrq->ref) && kref_read(&xrq->ref) < min) + min = kref_read(&xrq->ref); + } + } + + list_for_each_entry(xrq, &nvmet_rdma_xrq_list, entry) { + if (xrq->port == queue->port && + (!subsys || xrq->subsys == subsys) && + active_xrq == queue->port->offload_queues && + kref_read(&xrq->ref) == min && + kref_get_unless_zero(&xrq->ref)) { + queue->xrq = xrq; + goto out_unlock; + } + } + + ret = nvmet_rdma_init_xrq(queue, subsys); + if (ret) + goto out_unlock; + + kref_get(&queue->xrq->ref); + list_add_tail(&queue->xrq->entry, &nvmet_rdma_xrq_list); + if (ctrl) + ret = nvmet_rdma_attach_xrq(queue->xrq, ctrl); + +out_unlock: + mutex_unlock(&nvmet_rdma_xrq_mutex); + return ret; +} + +static u16 nvmet_rdma_install_offload_queue(struct nvmet_sq *sq) +{ + struct nvmet_rdma_queue *queue = + container_of(sq, struct nvmet_rdma_queue, nvme_sq); + int qp_attr_mask = IB_QP_STATE | IB_QP_OFFLOAD_TYPE; + struct ib_qp_attr attr; + int ret; + + if (!queue->offload) + return 0; + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTS; + attr.offload_type = IB_QP_OFFLOAD_NVMF; + + if (!queue->xrq) { + WARN_ON_ONCE(!queue->dev->rts2rts_qp_rmp); + ret = nvmet_rdma_find_get_xrq(queue, sq->ctrl); + if (ret) { + pr_err("failed to get XRQ for queue (%d)\n", + queue->host_qid); + return NVME_SC_INTERNAL | NVME_SC_DNR; + } + qp_attr_mask |= IB_QP_RMPN_XRQN; + attr.rmpn_xrqn = queue->xrq->ofl_srq->srq->ext.xrc.srq_num; + } + + ret = ib_modify_qp(queue->cm_id->qp, &attr, qp_attr_mask); + if (ret) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + +static void nvmet_rdma_free_be_ctrl(struct nvmet_rdma_backend_ctrl *be_ctrl) +{ + lockdep_assert_held(&be_ctrl->xrq->be_mutex); + list_del_init(&be_ctrl->entry); + be_ctrl->xrq->nr_be_ctrls--; + ida_simple_remove(&nvmet_rdma_bectrl_ida, be_ctrl->offload_ctx.id); + nvmet_offload_ctx_configfs_del(&be_ctrl->offload_ctx); + + if (be_ctrl->ibns) + ib_detach_nvmf_ns(be_ctrl->ibns); + if (be_ctrl->ofl) + nvme_peer_flush_resource(be_ctrl->ofl, be_ctrl->restart); + if (be_ctrl->ibctrl) + ib_destroy_nvmf_backend_ctrl(be_ctrl->ibctrl); + if (be_ctrl->ofl) + nvme_peer_put_resource(be_ctrl->ofl, be_ctrl->restart); + kref_put(&be_ctrl->xrq->ref, nvmet_rdma_destroy_xrq); + kfree(be_ctrl); +} + +static void nvmet_rdma_release_be_ctrl(struct nvmet_rdma_backend_ctrl *be_ctrl) +{ + struct nvmet_rdma_xrq *xrq = be_ctrl->xrq; + + mutex_lock(&xrq->be_mutex); + if (!list_empty(&be_ctrl->entry)) + nvmet_rdma_free_be_ctrl(be_ctrl); + mutex_unlock(&xrq->be_mutex); +} + +static void nvmet_rdma_stop_master_peer(void *priv) +{ + struct nvmet_rdma_backend_ctrl *be_ctrl = priv; + + dev_info(&be_ctrl->pdev->dev, + "Stopping master peer (be_ctrl %p)\n", be_ctrl); + + be_ctrl->restart = false; + 
nvmet_rdma_release_be_ctrl(be_ctrl); +} + +static void nvmet_rdma_backend_ctrl_event(struct ib_event *event, void *priv) +{ + struct nvmet_rdma_backend_ctrl *be_ctrl = priv; + + switch (event->event) { + case IB_EVENT_XRQ_NVMF_BACKEND_CTRL_PCI_ERR: + be_ctrl->restart = false; + schedule_work(&be_ctrl->release_work); + break; + case IB_EVENT_XRQ_NVMF_BACKEND_CTRL_TO_ERR: + be_ctrl->restart = true; + schedule_work(&be_ctrl->release_work); + break; + default: + break; + } + + dev_err(&be_ctrl->pdev->dev, + "received IB Backend ctrl event: %s (%d) be_ctrl %p id %d\n", + ib_event_msg(event->event), event->event, be_ctrl, + be_ctrl->offload_ctx.id); +} + +static int nvmet_rdma_init_be_ctrl_attr(struct ib_nvmf_backend_ctrl_init_attr *attr, + struct nvmet_rdma_backend_ctrl *be_ctrl, + struct ib_nvmf_caps *nvmf_caps) +{ + struct nvme_peer_resource *ofl = be_ctrl->ofl; + unsigned int nvme_cq_depth, nvme_sq_depth; + + nvme_sq_depth = ofl->nvme_sq_size / sizeof(struct nvme_command); + nvme_cq_depth = ofl->nvme_cq_size / sizeof(struct nvme_completion); + + if (nvme_sq_depth != be_ctrl->xrq->nvme_queue_depth) { + pr_err("nvme SQ depth for offload is %u, actual is %u\n", + be_ctrl->xrq->nvme_queue_depth, nvme_sq_depth); + return -EINVAL; + } + + if (nvme_cq_depth != be_ctrl->xrq->nvme_queue_depth) { + pr_err("nvme CQ depth for offload is %u, actual is %u\n", + be_ctrl->xrq->nvme_queue_depth, nvme_cq_depth); + return -EINVAL; + } + + memset(attr, 0, sizeof(*attr)); + + attr->be_context = be_ctrl; + attr->event_handler = nvmet_rdma_backend_ctrl_event; + attr->cq_page_offset = 0; + attr->sq_page_offset = 0; + attr->cq_log_page_size = ilog2(max_t(u32, ofl->nvme_cq_size >> 12, 1)); + attr->sq_log_page_size = ilog2(max_t(u32, ofl->nvme_sq_size >> 12, 1)); + attr->initial_cqh_db_value = 0; + attr->initial_sqt_db_value = 0; + if (nvmf_caps->min_cmd_timeout_us && nvmf_caps->max_cmd_timeout_us) + attr->cmd_timeout_us = clamp_t(u32, + be_ctrl->ns->offload_cmd_tmo_us, + nvmf_caps->min_cmd_timeout_us, + nvmf_caps->max_cmd_timeout_us); + else + attr->cmd_timeout_us = 0; + if (be_ctrl->ns->offload_cmd_tmo_us != attr->cmd_timeout_us) { + pr_warn("Clamping offload_cmd_tmo_us to %u\n", + attr->cmd_timeout_us); + be_ctrl->ns->offload_cmd_tmo_us = attr->cmd_timeout_us; + } + attr->cqh_dbr_addr = ofl->cqh_dbr_addr; + attr->sqt_dbr_addr = ofl->sqt_dbr_addr; + attr->cq_pas = ofl->cq_dma_addr; + attr->sq_pas = ofl->sq_dma_addr; + + return 0; +} + +static void nvmet_rdma_init_ns_attr(struct ib_nvmf_ns_init_attr *attr, + u32 frontend_namespace, + u32 backend_namespace, + u16 lba_data_size, + u16 backend_ctrl_id) +{ + memset(attr, 0, sizeof(*attr)); + + attr->frontend_namespace = frontend_namespace; + attr->backend_namespace = backend_namespace; + attr->lba_data_size = lba_data_size; + attr->backend_ctrl_id = backend_ctrl_id; +} + +static void nvmet_release_backend_ctrl_work(struct work_struct *w) +{ + struct nvmet_rdma_backend_ctrl *be_ctrl = + container_of(w, struct nvmet_rdma_backend_ctrl, release_work); + + nvmet_rdma_release_be_ctrl(be_ctrl); +} + +static u8 nvmet_rdma_peer_to_peer_mdts(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; + + /* we assume ctrl page_size is 4K */ + return ilog2(cm_id->device->attrs.nvmf_caps.max_io_sz / SZ_4K); +} + +static struct nvmet_rdma_backend_ctrl * +nvmet_rdma_create_be_ctrl(struct nvmet_rdma_xrq *xrq, + struct nvmet_ns *ns) +{ + struct nvmet_rdma_backend_ctrl *be_ctrl; + struct ib_nvmf_backend_ctrl_init_attr init_attr; + 
struct ib_nvmf_ns_init_attr ns_init_attr; + struct ib_nvmf_caps *nvmf_caps = &xrq->ndev->device->attrs.nvmf_caps; + int p2p_mdts = nvmet_rdma_peer_to_peer_mdts(xrq->port); + int mdts = ilog2(queue_max_hw_sectors(ns->bdev->bd_disk->queue) >> 3); + int err; + unsigned be_nsid; + + if (p2p_mdts > mdts) { + pr_err("Device %s MDTS %d is smaller than P2P MDTS %d\n", + ns->device_path, mdts, p2p_mdts); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&xrq->be_mutex); + if (xrq->nr_be_ctrls == nvmf_caps->max_be_ctrl) { + pr_err("Reached max number of supported be ctrl per XRQ (%u)\n", + nvmf_caps->max_be_ctrl); + mutex_unlock(&xrq->be_mutex); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&xrq->be_mutex); + + be_ctrl = kzalloc(sizeof(*be_ctrl), GFP_KERNEL); + if (!be_ctrl) { + err = -ENOMEM; + goto out_err; + } + + INIT_WORK(&be_ctrl->release_work, + nvmet_release_backend_ctrl_work); + + kref_get(&xrq->ref); + + be_ctrl->ofl = nvme_peer_get_resource(ns->pdev, + NVME_PEER_SQT_DBR | + NVME_PEER_CQH_DBR | + NVME_PEER_SQ_PAS | + NVME_PEER_CQ_PAS | + NVME_PEER_SQ_SZ | + NVME_PEER_CQ_SZ | + NVME_PEER_MEM_LOG_PG_SZ, + nvmet_rdma_stop_master_peer, be_ctrl); + if (!be_ctrl->ofl) { + dev_err(&ns->pdev->dev, + "Failed to get peer resource xrq=%p be_ctrl=%p\n", + xrq, be_ctrl); + err = -ENODEV; + goto out_free_be_ctrl; + } + be_ctrl->restart = true; + be_ctrl->pdev = ns->pdev; + be_ctrl->ns = ns; + be_ctrl->xrq = xrq; + + err = nvmet_rdma_init_be_ctrl_attr(&init_attr, be_ctrl, nvmf_caps); + if (err) + goto out_put_resource; + + be_ctrl->ibctrl = ib_create_nvmf_backend_ctrl(xrq->ofl_srq->srq, &init_attr); + if (IS_ERR(be_ctrl->ibctrl)) { + dev_err(&ns->pdev->dev, + "Failed to create nvmf backend ctrl xrq=%p\n", xrq); + err = PTR_ERR(be_ctrl->ibctrl); + goto out_put_resource; + } + + be_nsid = nvme_find_ns_id_from_bdev(ns->bdev); + if (!be_nsid) { + err = -ENODEV; + goto out_destroy_be_ctrl; + } + + nvmet_rdma_init_ns_attr(&ns_init_attr, ns->nsid, be_nsid, + max_t(u16, 1 << ns->blksize_shift, 512), + be_ctrl->ibctrl->id); + be_ctrl->ibns = ib_attach_nvmf_ns(be_ctrl->ibctrl, &ns_init_attr); + if (IS_ERR(be_ctrl->ibns)) { + dev_err(&ns->pdev->dev, + "Failed to attach nvmf ns xrq=%p be_ctrl=%p\n", + xrq, be_ctrl); + err = PTR_ERR(be_ctrl->ibns); + goto out_destroy_be_ctrl; + } + + be_ctrl->offload_ctx.ctx = be_ctrl; + be_ctrl->offload_ctx.port = xrq->port; + be_ctrl->offload_ctx.ns = ns; + be_ctrl->offload_ctx.id = ida_simple_get(&nvmet_rdma_bectrl_ida, 0, 0, + GFP_KERNEL); + if (be_ctrl->offload_ctx.id < 0) { + err = -ENOMEM; + goto out_detach_ns; + } + + err = nvmet_offload_ctx_configfs_create(&be_ctrl->offload_ctx); + if (err) + goto out_ida_remove; + + mutex_lock(&xrq->be_mutex); + list_add_tail(&be_ctrl->entry, &xrq->be_ctrls_list); + xrq->nr_be_ctrls++; + mutex_unlock(&xrq->be_mutex); + + return be_ctrl; + +out_ida_remove: + ida_simple_remove(&nvmet_rdma_bectrl_ida, be_ctrl->offload_ctx.id); +out_detach_ns: + ib_detach_nvmf_ns(be_ctrl->ibns); +out_destroy_be_ctrl: + ib_destroy_nvmf_backend_ctrl(be_ctrl->ibctrl); +out_put_resource: + /* + * Flush the resource after destoying the backend controller is safe + * here, since there is no traffic on it. 
+ */ + nvme_peer_flush_resource(be_ctrl->ofl, true); + nvme_peer_put_resource(be_ctrl->ofl, true); +out_free_be_ctrl: + kref_put(&xrq->ref, nvmet_rdma_destroy_xrq); + kfree(be_ctrl); +out_err: + return ERR_PTR(err); +} + +/** + * Passing ns == NULL will destroy all the be ctrs for the given XRQ + **/ +static void nvmet_rdma_free_be_ctrls(struct nvmet_rdma_xrq *xrq, + struct nvmet_ns *ns) +{ + struct nvmet_rdma_backend_ctrl *be_ctrl, *next; + + mutex_lock(&xrq->be_mutex); + list_for_each_entry_safe(be_ctrl, next, &xrq->be_ctrls_list, entry) + if (!ns || be_ctrl->ns == ns) + nvmet_rdma_free_be_ctrl(be_ctrl); + mutex_unlock(&xrq->be_mutex); +} + +static bool nvmet_rdma_ns_attached_to_xrq(struct nvmet_rdma_xrq *xrq, + struct nvmet_ns *ns) +{ + struct nvmet_rdma_backend_ctrl *be_ctrl; + bool found = false; + + mutex_lock(&xrq->be_mutex); + list_for_each_entry(be_ctrl, &xrq->be_ctrls_list, entry) { + if (be_ctrl->ns == ns) { + found = true; + break; + } + } + mutex_unlock(&xrq->be_mutex); + + return found; +} + +static int nvmet_rdma_enable_offload_ns(struct nvmet_ctrl *ctrl, + struct nvmet_ns *ns) +{ + struct nvmet_rdma_xrq *xrq; + struct nvmet_rdma_backend_ctrl *be_ctrl; + struct nvmet_rdma_offload_ctrl *offload_ctrl = ctrl->offload_ctrl; + struct nvmet_rdma_offload_ctx *offload_ctx; + int err = 0; + + mutex_lock(&offload_ctrl->ctx_mutex); + list_for_each_entry(offload_ctx, &offload_ctrl->ctx_list, entry) { + xrq = offload_ctx->xrq; + if (!nvmet_rdma_ns_attached_to_xrq(xrq, ns)) { + be_ctrl = nvmet_rdma_create_be_ctrl(xrq, ns); + if (IS_ERR(be_ctrl)) { + err = PTR_ERR(be_ctrl); + goto out_free; + } + } + } + mutex_unlock(&offload_ctrl->ctx_mutex); + + return 0; + +out_free: + list_for_each_entry(offload_ctx, &offload_ctrl->ctx_list, entry) + nvmet_rdma_free_be_ctrls(offload_ctx->xrq, ns); + mutex_unlock(&offload_ctrl->ctx_mutex); + + return err; +} + +static void nvmet_rdma_disable_offload_ns(struct nvmet_ctrl *ctrl, + struct nvmet_ns *ns) +{ + struct nvmet_rdma_offload_ctrl *offload_ctrl = ctrl->offload_ctrl; + struct nvmet_rdma_offload_ctx *offload_ctx; + + mutex_lock(&offload_ctrl->ctx_mutex); + list_for_each_entry(offload_ctx, &offload_ctrl->ctx_list, entry) + nvmet_rdma_free_be_ctrls(offload_ctx->xrq, ns); + mutex_unlock(&offload_ctrl->ctx_mutex); +} + +/** + * Called with offload_ctrl ctx_mutex held + **/ +static void nvmet_rdma_free_offload_ctx(struct nvmet_rdma_offload_ctx *offload_ctx) +{ + struct nvmet_rdma_xrq *xrq = offload_ctx->xrq; + + mutex_lock(&xrq->offload_ctrl_mutex); + xrq->offload_ctrls_cnt--; + if (!xrq->offload_ctrls_cnt) + nvmet_rdma_free_be_ctrls(xrq, NULL); + list_del_init(&offload_ctx->entry); + mutex_unlock(&xrq->offload_ctrl_mutex); + kfree(offload_ctx); +} + +static void nvmet_rdma_free_offload_ctxs(struct nvmet_rdma_offload_ctrl *offload_ctrl) +{ + struct nvmet_rdma_offload_ctx *offload_ctx, *next; + + mutex_lock(&offload_ctrl->ctx_mutex); + list_for_each_entry_safe(offload_ctx, next, &offload_ctrl->ctx_list, entry) + nvmet_rdma_free_offload_ctx(offload_ctx); + mutex_unlock(&offload_ctrl->ctx_mutex); +} + +static int nvmet_rdma_attach_xrq(struct nvmet_rdma_xrq *xrq, + struct nvmet_ctrl *ctrl) +{ + struct nvmet_ns *ns; + struct nvmet_rdma_offload_ctx *offload_ctx; + unsigned long idx; + struct nvmet_rdma_backend_ctrl *be_ctrl; + struct nvmet_rdma_offload_ctrl *offload_ctrl = ctrl->offload_ctrl; + int err; + + offload_ctx = kzalloc(sizeof(*offload_ctx), GFP_KERNEL); + if (!offload_ctx) + return -ENOMEM; + offload_ctx->xrq = xrq; + if (!xrq->subsys) + xrq->subsys = 
ctrl->subsys; + + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + if (!nvmet_rdma_ns_attached_to_xrq(xrq, ns)) { + be_ctrl = nvmet_rdma_create_be_ctrl(xrq, ns); + if (IS_ERR(be_ctrl)) { + err = PTR_ERR(be_ctrl); + goto out_free_offload_ctx; + } + } + } + + mutex_lock(&xrq->offload_ctrl_mutex); + xrq->offload_ctrls_cnt++; + mutex_unlock(&xrq->offload_ctrl_mutex); + mutex_lock(&offload_ctrl->ctx_mutex); + list_add_tail(&offload_ctx->entry, &offload_ctrl->ctx_list); + mutex_unlock(&offload_ctrl->ctx_mutex); + + return 0; + +out_free_offload_ctx: + nvmet_rdma_free_be_ctrls(xrq, NULL); + kfree(offload_ctx); + + return err; +} + +static int nvmet_rdma_create_offload_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_xrq *xrq; + struct nvmet_rdma_offload_ctrl *offload_ctrl; + int err; + + offload_ctrl = kzalloc(sizeof(*offload_ctrl), GFP_KERNEL); + if (!offload_ctrl) + return -ENOMEM; + + ctrl->offload_ctrl = offload_ctrl; + INIT_LIST_HEAD(&offload_ctrl->ctx_list); + mutex_init(&offload_ctrl->ctx_mutex); + + mutex_lock(&nvmet_rdma_xrq_mutex); + list_for_each_entry(xrq, &nvmet_rdma_xrq_list, entry) { + if (xrq->port == ctrl->port && + (!xrq->subsys || xrq->subsys == ctrl->subsys)) { + err = nvmet_rdma_attach_xrq(xrq, ctrl); + if (err) + goto out_free; + } + } + mutex_unlock(&nvmet_rdma_xrq_mutex); + + return 0; + +out_free: + nvmet_rdma_free_offload_ctxs(offload_ctrl); + kfree(offload_ctrl); + mutex_unlock(&nvmet_rdma_xrq_mutex); + ctrl->offload_ctrl = NULL; + + return err; +} + +static void nvmet_rdma_destroy_offload_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_offload_ctrl *offload_ctrl = ctrl->offload_ctrl; + + ctrl->offload_ctrl = NULL; + nvmet_rdma_free_offload_ctxs(offload_ctrl); + kfree(offload_ctrl); +} + +static u64 +nvmet_rdma_offload_subsys_unknown_ns_cmds(struct nvmet_subsys *subsys) +{ + struct nvmet_rdma_xrq *xrq; + struct ib_srq_attr attr; + u64 unknown_cmds = 0; + int ret; + + mutex_lock(&nvmet_rdma_xrq_mutex); + list_for_each_entry(xrq, &nvmet_rdma_xrq_list, entry) { + if (xrq->subsys == subsys) { + memset(&attr, 0, sizeof(attr)); + ret = ib_query_srq(xrq->ofl_srq->srq, &attr); + if (!ret) + unknown_cmds += attr.nvmf.cmd_unknown_namespace_cnt; + } + } + mutex_unlock(&nvmet_rdma_xrq_mutex); + + return unknown_cmds; +} + +static void +nvmet_rdma_update_counters(struct ib_nvmf_ns_attr *attr, + struct nvmet_ns_counters *counters) +{ + counters->num_read_cmd = attr->num_read_cmd; + counters->num_read_blocks = attr->num_read_blocks; + counters->num_write_cmd = attr->num_write_cmd; + counters->num_write_blocks = attr->num_write_blocks; + counters->num_write_inline_cmd = attr->num_write_inline_cmd; + counters->num_flush_cmd = attr->num_flush_cmd; + counters->num_error_cmd = attr->num_error_cmd; + counters->num_backend_error_cmd = attr->num_backend_error_cmd; + counters->last_read_latency = attr->last_read_latency; + counters->last_write_latency = attr->last_write_latency; + counters->queue_depth = attr->queue_depth; +} + +static void +nvmet_rdma_offload_query_counters(void *ctx, struct nvmet_ns_counters *counters) +{ + struct nvmet_rdma_backend_ctrl *be_ctrl = ctx; + struct ib_nvmf_ns_attr attr; + int ret; + + memset(counters, 0, sizeof(*counters)); + memset(&attr, 0, sizeof(attr)); + ret = ib_query_nvmf_ns(be_ctrl->ibns, &attr); + if (ret) { + pr_err("Failed to query counters"); + return; + } + nvmet_rdma_update_counters(&attr, counters); +} + +static u64 +nvmet_rdma_query_ns_counter(struct nvmet_ns *ns, + enum nvmet_rdma_offload_ns_counter counter) +{ + struct 
nvmet_rdma_xrq *xrq; + struct nvmet_rdma_backend_ctrl *be_ctrl; + struct ib_nvmf_ns_attr attr; + u64 cmds = 0; + int ret; + + mutex_lock(&nvmet_rdma_xrq_mutex); + list_for_each_entry(xrq, &nvmet_rdma_xrq_list, entry) { + if (xrq->subsys == ns->subsys) { + mutex_lock(&xrq->be_mutex); + list_for_each_entry(be_ctrl, &xrq->be_ctrls_list, entry) { + if (be_ctrl->ns == ns) { + memset(&attr, 0, sizeof(attr)); + ret = ib_query_nvmf_ns(be_ctrl->ibns, &attr); + if (!ret) { + switch (counter) { + case NVMET_RDMA_OFFLOAD_NS_READ_CMDS: + cmds += attr.num_read_cmd; + break; + case NVMET_RDMA_OFFLOAD_NS_READ_BLOCKS: + cmds += attr.num_read_blocks; + break; + case NVMET_RDMA_OFFLOAD_NS_WRITE_CMDS: + cmds += attr.num_write_cmd; + break; + case NVMET_RDMA_OFFLOAD_NS_WRITE_BLOCKS: + cmds += attr.num_write_blocks; + break; + case NVMET_RDMA_OFFLOAD_NS_WRITE_INLINE_CMDS: + cmds += attr.num_write_inline_cmd; + break; + case NVMET_RDMA_OFFLOAD_NS_FLUSH_CMDS: + cmds += attr.num_flush_cmd; + break; + case NVMET_RDMA_OFFLOAD_NS_ERROR_CMDS: + cmds += attr.num_error_cmd; + break; + case NVMET_RDMA_OFFLOAD_NS_BACKEND_ERROR_CMDS: + cmds += attr.num_backend_error_cmd; + break; + default: + pr_err("received unknown counter for offloaded namespace query (%d)\n", + counter); + break; + } + } + } + } + mutex_unlock(&xrq->be_mutex); + } + } + mutex_unlock(&nvmet_rdma_xrq_mutex); + + return cmds; +} + +static u64 nvmet_rdma_offload_ns_read_cmds(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_READ_CMDS); +} + +static u64 nvmet_rdma_offload_ns_read_blocks(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_READ_BLOCKS); +} + +static u64 nvmet_rdma_offload_ns_write_cmds(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_WRITE_CMDS); +} + +static u64 nvmet_rdma_offload_ns_write_blocks(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_WRITE_BLOCKS); +} + +static u64 nvmet_rdma_offload_ns_write_inline_cmds(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_WRITE_INLINE_CMDS); +} + +static u64 nvmet_rdma_offload_ns_flush_cmds(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_FLUSH_CMDS); + +} + +static u64 nvmet_rdma_offload_ns_error_cmds(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_ERROR_CMDS); +} + +static u64 nvmet_rdma_offload_ns_backend_error_cmds(struct nvmet_ns *ns) +{ + return nvmet_rdma_query_ns_counter(ns, + NVMET_RDMA_OFFLOAD_NS_BACKEND_ERROR_CMDS); +} + +static unsigned int __nvmet_rdma_peer_to_peer_sqe_inline_size(struct ib_nvmf_caps *nvmf_caps, + struct nvmet_port *nport) +{ + unsigned int sqe_inline_size = nport->inline_data_size; + int p2p_sqe_inline_size = (nvmf_caps->max_cmd_size * 16) - sizeof(struct nvme_command); + + if (p2p_sqe_inline_size >= 0) + sqe_inline_size = min_t(unsigned int, + p2p_sqe_inline_size, + sqe_inline_size); + + return sqe_inline_size; +} + +static unsigned int nvmet_rdma_peer_to_peer_sqe_inline_size(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_port *port = ctrl->port->priv; + struct rdma_cm_id *cm_id = port->cm_id; + struct ib_nvmf_caps *nvmf_caps = &cm_id->device->attrs.nvmf_caps; + + return __nvmet_rdma_peer_to_peer_sqe_inline_size(nvmf_caps, ctrl->port); +} + +static bool nvmet_rdma_peer_to_peer_capable(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = 
port->cm_id; + struct ib_nvmf_caps *nvmf_caps = &cm_id->device->attrs.nvmf_caps; + + if (!(cm_id->device->attrs.device_cap_flags & + IB_DEVICE_NVMF_TARGET_OFFLOAD)) + return false; + + if (!nvmf_caps->passthrough_sqe_rw_service && + nport->offload_passthrough_sqe_rw) { + pr_err("Offload passthrough is not supported by device %s\n", + cm_id->device->name); + return false; + } + + return true; +} + +static bool nvmet_rdma_check_subsys_match_offload_port(struct nvmet_port *nport, + struct nvmet_subsys *subsys) +{ + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; + struct ib_nvmf_caps *nvmf_caps = &cm_id->device->attrs.nvmf_caps; + struct nvmet_ns *ns; + unsigned long idx; + + if (nvmf_caps->max_frontend_nsid) { + xa_for_each(&subsys->namespaces, idx, ns) { + if (ns->nsid > nvmf_caps->max_frontend_nsid) { + pr_err("Reached maximal namespace ID (%u/%u)\n", + ns->nsid, nvmf_caps->max_frontend_nsid); + return false; + } + } + } + + if (subsys->nr_namespaces > nvmf_caps->max_namespace) { + pr_err("Reached max number of ns per offload subsys (%u/%u)\n", + subsys->nr_namespaces, nvmf_caps->max_namespace); + return false; + } + + /* + * Assume number of namespaces equals number of backend controllers, + * because in the current implementation a backend controller is created + * for each namespace. + */ + if (subsys->nr_namespaces > nvmf_caps->max_be_ctrl) { + pr_err("Reached max number of supported be ctrl per XRQ (%u)\n", + nvmf_caps->max_be_ctrl); + return false; + } + + return true; +} + +static int nvmet_rdma_init_st_pool(struct nvmet_rdma_staging_buf_pool *pool, + unsigned long long mem_start, + unsigned int mem_size, + unsigned int buffer_size) +{ + struct nvmet_rdma_staging_buf *st, *tmp; + int i, err = -EINVAL; + int size = mem_size / buffer_size; + unsigned long start_pfn, end_pfn; + + if (!PAGE_ALIGNED(mem_start)) + goto out; + + start_pfn = PFN_DOWN(mem_start); + end_pfn = PFN_DOWN(mem_start + mem_size * SZ_1M); + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) + goto out; + } + + for (i = 0; i < size; i++) { + st = nvmet_rdma_alloc_st_buff(1, buffer_size, false); + if (!st) { + err = -ENOMEM; + goto error; + } + st->staging_dma_addrs[0] = mem_start + i * buffer_size * SZ_1M; + pr_debug("pool_entry=%d staging_buffer_address=0x%llx\n", i, st->staging_dma_addrs[0]); + list_add_tail(&st->entry, &pool->list); + pool->size++; + } + + pr_info("offload_mem_start=0x%llx pool_size=%d, buf_size=%u\n", + mem_start, + pool->size, + buffer_size); + + return 0; + +error: + list_for_each_entry_safe(st, tmp, &pool->list, entry) { + list_del(&st->entry); + nvmet_rdma_free_st_buff(st); + } +out: + return err; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.h new file mode 100644 index 0000000..847161e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/rdma_offload.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, see . + */ + +#ifndef _RDMA_OFFLOAD_H +#define _RDMA_OFFLOAD_H + +#include +#include "nvmet.h" + +#define NVMET_DYNAMIC_STAGING_BUFFER_PAGE_SIZE_MB 1 + +struct nvmet_rdma_xrq; +struct nvmet_rdma_device; +struct nvmet_rdma_cmd; +struct nvmet_rdma_queue queue; +struct nvmet_rdma_srq; + +enum nvmet_rdma_offload_ns_counter { + NVMET_RDMA_OFFLOAD_NS_READ_CMDS, + NVMET_RDMA_OFFLOAD_NS_READ_BLOCKS, + NVMET_RDMA_OFFLOAD_NS_WRITE_CMDS, + NVMET_RDMA_OFFLOAD_NS_WRITE_BLOCKS, + NVMET_RDMA_OFFLOAD_NS_WRITE_INLINE_CMDS, + NVMET_RDMA_OFFLOAD_NS_FLUSH_CMDS, + NVMET_RDMA_OFFLOAD_NS_ERROR_CMDS, + NVMET_RDMA_OFFLOAD_NS_BACKEND_ERROR_CMDS, +}; + +struct nvmet_rdma_backend_ctrl { + struct ib_nvmf_ctrl *ibctrl; + struct ib_nvmf_ns *ibns; + struct nvmet_ns *ns; + struct pci_dev *pdev; + struct list_head entry; + struct nvme_peer_resource *ofl; + struct nvmet_rdma_xrq *xrq; + struct work_struct release_work; + /* Restart the nvme queue for future usage */ + bool restart; + struct nvmet_offload_ctx offload_ctx; +}; + +struct nvmet_rdma_offload_ctx { + struct nvmet_rdma_xrq *xrq; + struct list_head entry; +}; + +struct nvmet_rdma_offload_ctrl { + struct list_head ctx_list; + struct mutex ctx_mutex; +}; + +struct nvmet_rdma_staging_buf_pool { + struct list_head list; + int size; +}; + +struct nvmet_rdma_staging_buf { + void **staging_pages; + dma_addr_t *staging_dma_addrs; + u16 num_pages; + unsigned int page_size; // in Bytes + struct list_head entry; + bool dynamic; + struct nvmet_rdma_xrq *xrq; +}; + +struct nvmet_rdma_xrq { + struct nvmet_rdma_device *ndev; + struct nvmet_port *port; + struct nvmet_subsys *subsys; + int offload_ctrls_cnt; + struct mutex offload_ctrl_mutex; + struct list_head be_ctrls_list; + int nr_be_ctrls; + struct mutex be_mutex; + struct nvmet_rdma_srq *ofl_srq; + size_t ofl_srq_size; + struct ib_cq *cq; + struct nvmet_rdma_staging_buf *st; + struct kref ref; + struct list_head entry; + unsigned int nvme_queue_depth; +}; + +static void nvmet_rdma_free_st_buff(struct nvmet_rdma_staging_buf *st); +static void nvmet_rdma_destroy_xrq(struct kref *ref); +static int nvmet_rdma_find_get_xrq(struct nvmet_rdma_queue *queue, + struct nvmet_ctrl *ctrl); +static u16 nvmet_rdma_install_offload_queue(struct nvmet_sq *sq); +static int nvmet_rdma_create_offload_ctrl(struct nvmet_ctrl *ctrl); +static void nvmet_rdma_destroy_offload_ctrl(struct nvmet_ctrl *ctrl); +static int nvmet_rdma_enable_offload_ns(struct nvmet_ctrl *ctrl, + struct nvmet_ns *ns); +static void nvmet_rdma_disable_offload_ns(struct nvmet_ctrl *ctrl, + struct nvmet_ns *ns); +static bool nvmet_rdma_peer_to_peer_capable(struct nvmet_port *nport); +static bool nvmet_rdma_check_subsys_match_offload_port(struct nvmet_port *nport, + struct nvmet_subsys *subsys); +static unsigned int nvmet_rdma_peer_to_peer_sqe_inline_size(struct nvmet_ctrl *ctrl); +static u8 nvmet_rdma_peer_to_peer_mdts(struct nvmet_port *nport); +static u64 nvmet_rdma_offload_subsys_unknown_ns_cmds(struct nvmet_subsys *subsys); +static u64 nvmet_rdma_offload_ns_read_cmds(struct nvmet_ns *ns); +static u64 nvmet_rdma_offload_ns_read_blocks(struct nvmet_ns *ns); +static u64 nvmet_rdma_offload_ns_write_cmds(struct nvmet_ns *ns); +static u64 nvmet_rdma_offload_ns_write_blocks(struct nvmet_ns *ns); +static u64 nvmet_rdma_offload_ns_write_inline_cmds(struct nvmet_ns *ns); +static u64 
nvmet_rdma_offload_ns_flush_cmds(struct nvmet_ns *ns); +static u64 nvmet_rdma_offload_ns_error_cmds(struct nvmet_ns *ns); +static u64 nvmet_rdma_offload_ns_backend_error_cmds(struct nvmet_ns *ns); +static void nvmet_rdma_offload_query_counters(void *ctx, + struct nvmet_ns_counters *counters); +static int nvmet_rdma_init_st_pool(struct nvmet_rdma_staging_buf_pool *pool, + unsigned long long mem_start, + unsigned int mem_size, + unsigned int buffer_size); + +#endif /* _RDMA_OFFLOAD_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/tcp.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/tcp.c new file mode 100644 index 0000000..55d315f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/tcp.c @@ -0,0 +1,1879 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP target. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" + +#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) + +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. + */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority"); + +/* Define a time period (in usecs) that io_work() shall sample an activated + * queue before determining it to be idle. This optional module behavior + * can enable NIC solutions that support socket optimized packet processing + * using advanced interrupt moderation techniques. 
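+ * A value of 0 (the default) disables the deadline: io_work() then only
+ * requeues itself while send/receive operations are still making progress.
+ * For example, idle_poll_period_usecs=1000 keeps io_work() polling the
+ * socket for roughly 1ms after the last observed activity before falling
+ * back to the sk_data_ready()/sk_write_space() callbacks.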
+ */ +static int idle_poll_period_usecs; +module_param(idle_poll_period_usecs, int, 0644); +MODULE_PARM_DESC(idle_poll_period_usecs, + "nvmet tcp io_work poll till idle time period in usecs"); + +#define NVMET_TCP_RECV_BUDGET 8 +#define NVMET_TCP_SEND_BUDGET 8 +#define NVMET_TCP_IO_WORK_BUDGET 64 + +enum nvmet_tcp_send_state { + NVMET_TCP_SEND_DATA_PDU, + NVMET_TCP_SEND_DATA, + NVMET_TCP_SEND_R2T, + NVMET_TCP_SEND_DDGST, + NVMET_TCP_SEND_RESPONSE +}; + +enum nvmet_tcp_recv_state { + NVMET_TCP_RECV_PDU, + NVMET_TCP_RECV_DATA, + NVMET_TCP_RECV_DDGST, + NVMET_TCP_RECV_ERR, +}; + +enum { + NVMET_TCP_F_INIT_FAILED = (1 << 0), +}; + +struct nvmet_tcp_cmd { + struct nvmet_tcp_queue *queue; + struct nvmet_req req; + + struct nvme_tcp_cmd_pdu *cmd_pdu; + struct nvme_tcp_rsp_pdu *rsp_pdu; + struct nvme_tcp_data_pdu *data_pdu; + struct nvme_tcp_r2t_pdu *r2t_pdu; + + u32 rbytes_done; + u32 wbytes_done; + + u32 pdu_len; + u32 pdu_recv; + int sg_idx; + int nr_mapped; + struct msghdr recv_msg; + struct kvec *iov; + u32 flags; + + struct list_head entry; + struct llist_node lentry; + + /* send state */ + u32 offset; + struct scatterlist *cur_sg; + enum nvmet_tcp_send_state state; + + __le32 exp_ddgst; + __le32 recv_ddgst; +}; + +enum nvmet_tcp_queue_state { + NVMET_TCP_Q_CONNECTING, + NVMET_TCP_Q_LIVE, + NVMET_TCP_Q_DISCONNECTING, +}; + +struct nvmet_tcp_queue { + struct socket *sock; + struct nvmet_tcp_port *port; + struct work_struct io_work; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + /* send state */ + struct nvmet_tcp_cmd *cmds; + unsigned int nr_cmds; + struct list_head free_list; + struct llist_head resp_list; + struct list_head resp_send_list; + int send_list_len; + struct nvmet_tcp_cmd *snd_cmd; + + /* recv state */ + int offset; + int left; + enum nvmet_tcp_recv_state rcv_state; + struct nvmet_tcp_cmd *cmd; + union nvme_tcp_pdu pdu; + + /* digest state */ + bool hdr_digest; + bool data_digest; + struct ahash_request *snd_hash; + struct ahash_request *rcv_hash; + + unsigned long poll_end; + + spinlock_t state_lock; + enum nvmet_tcp_queue_state state; + + struct sockaddr_storage sockaddr; + struct sockaddr_storage sockaddr_peer; + struct work_struct release_work; + + int idx; + struct list_head queue_list; + + struct nvmet_tcp_cmd connect; + + struct page_frag_cache pf_cache; + + void (*data_ready)(struct sock *); + void (*state_change)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvmet_tcp_port { + struct socket *sock; + struct work_struct accept_work; + struct nvmet_port *nport; + struct sockaddr_storage addr; + void (*data_ready)(struct sock *); +}; + +static DEFINE_IDA(nvmet_tcp_queue_ida); +static LIST_HEAD(nvmet_tcp_queue_list); +static DEFINE_MUTEX(nvmet_tcp_queue_mutex); + +static struct workqueue_struct *nvmet_tcp_wq; +static const struct nvmet_fabrics_ops nvmet_tcp_ops; +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); +static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); +static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd); + +static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(!queue->nr_cmds)) { + /* We didn't allocate cmds yet, send 0xffff */ + return USHRT_MAX; + } + + return cmd - queue->cmds; +} + +static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && + cmd->rbytes_done < cmd->req.transfer_len; +} + +static inline bool 
nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) +{ + return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status; +} + +static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) +{ + return !nvme_is_write(cmd->req.cmd) && + cmd->req.transfer_len > 0 && + !cmd->req.cqe->status; +} + +static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && cmd->pdu_len && + !cmd->rbytes_done; +} + +static inline struct nvmet_tcp_cmd * +nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd; + + cmd = list_first_entry_or_null(&queue->free_list, + struct nvmet_tcp_cmd, entry); + if (!cmd) + return NULL; + list_del_init(&cmd->entry); + + cmd->rbytes_done = cmd->wbytes_done = 0; + cmd->pdu_len = 0; + cmd->pdu_recv = 0; + cmd->iov = NULL; + cmd->flags = 0; + return cmd; +} + +static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(cmd == &cmd->queue->connect)) + return; + + list_add_tail(&cmd->entry, &cmd->queue->free_list); +} + +static inline int queue_cpu(struct nvmet_tcp_queue *queue) +{ + return queue->sock->sk->sk_incoming_cpu; +} + +static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline void nvmet_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, + void *pdu, size_t len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + pr_err("queue %d: header digest enabled but no header digest\n", + queue->idx); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + pr_err("queue %d: header digest error: recv %#x expected %#x\n", + queue->idx, le32_to_cpu(recv_digest), + le32_to_cpu(exp_digest)); + return -EPROTO; + } + + return 0; +} + +static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvmet_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + (hdr->flags & NVME_TCP_F_HDGST ? 
digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + pr_err("queue %d: data digest flag is cleared\n", queue->idx); + return -EPROTO; + } + + return 0; +} + +static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) +{ + WARN_ON(unlikely(cmd->nr_mapped > 0)); + + kfree(cmd->iov); + sgl_free(cmd->req.sg); + cmd->iov = NULL; + cmd->req.sg = NULL; +} + +static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct scatterlist *sg; + int i; + + sg = &cmd->req.sg[cmd->sg_idx]; + + for (i = 0; i < cmd->nr_mapped; i++) + kunmap(sg_page(&sg[i])); + + cmd->nr_mapped = 0; +} + +static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct kvec *iov = cmd->iov; + struct scatterlist *sg; + u32 length, offset, sg_offset; + + length = cmd->pdu_len; + cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); + offset = cmd->rbytes_done; + cmd->sg_idx = offset / PAGE_SIZE; + sg_offset = offset % PAGE_SIZE; + sg = &cmd->req.sg[cmd->sg_idx]; + + while (length) { + u32 iov_len = min_t(u32, length, sg->length - sg_offset); + + iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; + iov->iov_len = iov_len; + + length -= iov_len; + sg = sg_next(sg); + iov++; + sg_offset = 0; + } + + iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, + cmd->nr_mapped, cmd->pdu_len); +} + +static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) +{ + queue->rcv_state = NVMET_TCP_RECV_ERR; + if (queue->nvme_sq.ctrl) + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + else + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +} + +static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) +{ + if (status == -EPIPE || status == -ECONNRESET) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + else + nvmet_tcp_fatal_error(queue); +} + +static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; + u32 len = le32_to_cpu(sgl->length); + + if (!len) + return 0; + + if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_OFFSET)) { + if (!nvme_is_write(cmd->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (len > cmd->req.port->inline_data_size) + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + cmd->pdu_len = len; + } + cmd->req.transfer_len += len; + + cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt); + if (!cmd->req.sg) + return NVME_SC_INTERNAL; + cmd->cur_sg = cmd->req.sg; + + if (nvmet_tcp_has_data_in(cmd)) { + cmd->iov = kmalloc_array(cmd->req.sg_cnt, + sizeof(*cmd->iov), GFP_KERNEL); + if (!cmd->iov) + goto err; + } + + return 0; +err: + nvmet_tcp_free_cmd_buffers(cmd); + return NVME_SC_INTERNAL; +} + +static void nvmet_tcp_calc_ddgst(struct ahash_request *hash, + struct nvmet_tcp_cmd *cmd) +{ + ahash_request_set_crypt(hash, cmd->req.sg, + (void *)&cmd->exp_ddgst, cmd->req.transfer_len); + crypto_ahash_digest(hash); +} + +static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_DATA_PDU; + + pdu->hdr.type = nvme_tcp_c2h_data; + pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ? 
+ NVME_TCP_F_DATA_SUCCESS : 0); + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = pdu->hdr.hlen + hdgst; + pdu->hdr.plen = + cpu_to_le32(pdu->hdr.hlen + hdgst + + cmd->req.transfer_len + ddgst); + pdu->command_id = cmd->req.cqe->command_id; + pdu->data_length = cpu_to_le32(cmd->req.transfer_len); + pdu->data_offset = cpu_to_le32(cmd->wbytes_done); + + if (queue->data_digest) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; + nvmet_tcp_calc_ddgst(queue->snd_hash, cmd); + } + + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_R2T; + + pdu->hdr.type = nvme_tcp_r2t; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + pdu->command_id = cmd->req.cmd->common.command_id; + pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd); + pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done); + pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_RESPONSE; + + pdu->hdr.type = nvme_tcp_rsp; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) +{ + struct llist_node *node; + struct nvmet_tcp_cmd *cmd; + + for (node = llist_del_all(&queue->resp_list); node; node = node->next) { + cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); + list_add(&cmd->entry, &queue->resp_send_list); + queue->send_list_len++; + } +} + +static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue) +{ + queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (!queue->snd_cmd) { + nvmet_tcp_process_resp_list(queue); + queue->snd_cmd = + list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (unlikely(!queue->snd_cmd)) + return NULL; + } + + list_del_init(&queue->snd_cmd->entry); + queue->send_list_len--; + + if (nvmet_tcp_need_data_out(queue->snd_cmd)) + nvmet_setup_c2h_data_pdu(queue->snd_cmd); + else if (nvmet_tcp_need_data_in(queue->snd_cmd)) + nvmet_setup_r2t_pdu(queue->snd_cmd); + else + nvmet_setup_response_pdu(queue->snd_cmd); + + return queue->snd_cmd; +} + +static void nvmet_tcp_queue_response(struct nvmet_req *req) +{ + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + struct nvme_sgl_desc *sgl; + u32 len; + + if (unlikely(cmd == queue->cmd)) { + sgl = &cmd->req.cmd->common.dptr.sgl; + len = le32_to_cpu(sgl->length); + + /* + * Wait for inline data before processing the response. + * Avoid using helpers, this might happen before + * nvmet_req_init is completed. 
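+ * The response is not dropped: it is queued again once the in-capsule
+ * payload has been drained from the socket (see the NVMET_TCP_F_INIT_FAILED
+ * handling in nvmet_tcp_execute_request()), at which point the check
+ * below no longer applies.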
+ */ + if (queue->rcv_state == NVMET_TCP_RECV_PDU && + len && len <= cmd->req.port->inline_data_size && + nvme_is_write(cmd->req.cmd)) + return; + } + + llist_add(&cmd->lentry, &queue->resp_list); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); +} + +static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED)) + nvmet_tcp_queue_response(&cmd->req); + else + cmd->req.execute(&cmd->req); +} + +static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst; + int ret; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu), + offset_in_page(cmd->data_pdu) + cmd->offset, + left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); + if (ret <= 0) + return ret; + + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->state = NVMET_TCP_SEND_DATA; + cmd->offset = 0; + return 1; +} + +static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + int ret; + + while (cmd->cur_sg) { + struct page *page = sg_page(cmd->cur_sg); + u32 left = cmd->cur_sg->length - cmd->offset; + int flags = MSG_DONTWAIT; + + if ((!last_in_batch && cmd->queue->send_list_len) || + cmd->wbytes_done + left < cmd->req.transfer_len || + queue->data_digest || !queue->nvme_sq.sqhd_disabled) + flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + + ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset, + left, flags); + if (ret <= 0) + return ret; + + cmd->offset += ret; + cmd->wbytes_done += ret; + + /* Done with sg?*/ + if (cmd->offset == cmd->cur_sg->length) { + cmd->cur_sg = sg_next(cmd->cur_sg); + cmd->offset = 0; + } + } + + if (queue->data_digest) { + cmd->state = NVMET_TCP_SEND_DDGST; + cmd->offset = 0; + } else { + if (queue->nvme_sq.sqhd_disabled) { + cmd->queue->snd_cmd = NULL; + nvmet_tcp_put_cmd(cmd); + } else { + nvmet_setup_response_pdu(cmd); + } + } + + if (queue->nvme_sq.sqhd_disabled) + nvmet_tcp_free_cmd_buffers(cmd); + + return 1; + +} + +static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, + bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu), + offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + nvmet_tcp_free_cmd_buffers(cmd); + cmd->queue->snd_cmd = NULL; + nvmet_tcp_put_cmd(cmd); + return 1; +} + +static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu), + offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->queue->snd_cmd = NULL; + return 1; +} + +static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) +{ + struct 
nvmet_tcp_queue *queue = cmd->queue; + int left = NVME_TCP_DIGEST_LENGTH - cmd->offset; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset, + .iov_len = left + }; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + if (queue->nvme_sq.sqhd_disabled) { + cmd->queue->snd_cmd = NULL; + nvmet_tcp_put_cmd(cmd); + } else { + nvmet_setup_response_pdu(cmd); + } + return 1; +} + +static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, + bool last_in_batch) +{ + struct nvmet_tcp_cmd *cmd = queue->snd_cmd; + int ret = 0; + + if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) { + cmd = nvmet_tcp_fetch_cmd(queue); + if (unlikely(!cmd)) + return 0; + } + + if (cmd->state == NVMET_TCP_SEND_DATA_PDU) { + ret = nvmet_try_send_data_pdu(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DATA) { + ret = nvmet_try_send_data(cmd, last_in_batch); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DDGST) { + ret = nvmet_try_send_ddgst(cmd, last_in_batch); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_R2T) { + ret = nvmet_try_send_r2t(cmd, last_in_batch); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_RESPONSE) + ret = nvmet_try_send_response(cmd, last_in_batch); + +done_send: + if (ret < 0) { + if (ret == -EAGAIN) + return 0; + return ret; + } + + return 1; +} + +static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue, + int budget, int *sends) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_send_one(queue, i == budget - 1); + if (unlikely(ret < 0)) { + nvmet_tcp_socket_error(queue, ret); + goto done; + } else if (ret == 0) { + break; + } + (*sends)++; + } +done: + return ret; +} + +static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) +{ + queue->offset = 0; + queue->left = sizeof(struct nvme_tcp_hdr); + queue->cmd = NULL; + queue->rcv_state = NVMET_TCP_RECV_PDU; +} + +static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + + +static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; + struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp; + struct msghdr msg = {}; + struct kvec iov; + int ret; + + if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { + pr_err("bad nvme-tcp pdu length (%d)\n", + 
le32_to_cpu(icreq->hdr.plen)); + nvmet_tcp_fatal_error(queue); + } + + if (icreq->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv); + return -EPROTO; + } + + if (icreq->hpda != 0) { + pr_err("queue %d: unsupported hpda %d\n", queue->idx, + icreq->hpda); + return -EPROTO; + } + + queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); + queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if (queue->hdr_digest || queue->data_digest) { + ret = nvmet_tcp_alloc_crypto(queue); + if (ret) + return ret; + } + + memset(icresp, 0, sizeof(*icresp)); + icresp->hdr.type = nvme_tcp_icresp; + icresp->hdr.hlen = sizeof(*icresp); + icresp->hdr.pdo = 0; + icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); + icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */ + icresp->cpda = 0; + if (queue->hdr_digest) + icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_crypto; + + queue->state = NVMET_TCP_Q_LIVE; + nvmet_prepare_receive_pdu(queue); + return 0; +free_crypto: + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + return ret; +} + +static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) +{ + size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); + int ret; + + /* + * This command has not been processed yet, hence we are trying to + * figure out if there is still pending data left to receive. If + * we don't, we can simply prepare for the next pdu and bail out, + * otherwise we will need to prepare a buffer and receive the + * stale data before continuing forward. 
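+ * The command is marked NVMET_TCP_F_INIT_FAILED below, so once the
+ * stale payload has been received nvmet_tcp_execute_request() queues
+ * the already-prepared error response instead of executing the command.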
+ */ + if (!nvme_is_write(cmd->req.cmd) || !data_len || + data_len > cmd->req.port->inline_data_size) { + nvmet_prepare_receive_pdu(queue); + return; + } + + ret = nvmet_tcp_map_data(cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + nvmet_tcp_fatal_error(queue); + return; + } + + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(cmd); + cmd->flags |= NVMET_TCP_F_INIT_FAILED; +} + +static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_data_pdu *data = &queue->pdu.data; + struct nvmet_tcp_cmd *cmd; + + if (likely(queue->nr_cmds)) + cmd = &queue->cmds[data->ttag]; + else + cmd = &queue->connect; + + if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { + pr_err("ttag %u unexpected data offset %u (expected %u)\n", + data->ttag, le32_to_cpu(data->data_offset), + cmd->rbytes_done); + /* FIXME: use path and transport errors */ + nvmet_req_complete(&cmd->req, + NVME_SC_INVALID_FIELD | NVME_SC_DNR); + return -EPROTO; + } + + cmd->pdu_len = le32_to_cpu(data->data_length); + cmd->pdu_recv = 0; + nvmet_tcp_map_pdu_iovec(cmd); + queue->cmd = cmd; + queue->rcv_state = NVMET_TCP_RECV_DATA; + + return 0; +} + +static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd; + struct nvmet_req *req; + int ret; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + if (hdr->type != nvme_tcp_icreq) { + pr_err("unexpected pdu type (%d) before icreq\n", + hdr->type); + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } + return nvmet_tcp_handle_icreq(queue); + } + + if (hdr->type == nvme_tcp_h2c_data) { + ret = nvmet_tcp_handle_h2c_data_pdu(queue); + if (unlikely(ret)) + return ret; + return 0; + } + + queue->cmd = nvmet_tcp_get_cmd(queue); + if (unlikely(!queue->cmd)) { + /* This should never happen */ + pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", + queue->idx, queue->nr_cmds, queue->send_list_len, + nvme_cmd->common.opcode); + nvmet_tcp_fatal_error(queue); + return -ENOMEM; + } + + req = &queue->cmd->req; + memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); + + if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_tcp_ops))) { + pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", + req->cmd, req->cmd->common.command_id, + req->cmd->common.opcode, + le32_to_cpu(req->cmd->common.dptr.sgl.length)); + + nvmet_tcp_handle_req_failure(queue, queue->cmd, req); + return 0; + } + + ret = nvmet_tcp_map_data(queue->cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + if (nvmet_tcp_has_inline_data(queue->cmd)) + nvmet_tcp_fatal_error(queue); + else + nvmet_req_complete(req, ret); + ret = -EAGAIN; + goto out; + } + + if (nvmet_tcp_need_data_in(queue->cmd)) { + if (nvmet_tcp_has_inline_data(queue->cmd)) { + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(queue->cmd); + return 0; + } + /* send back R2T */ + nvmet_tcp_queue_response(&queue->cmd->req); + goto out; + } + + queue->cmd->req.execute(&queue->cmd->req); +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static const u8 nvme_tcp_pdu_sizes[] = { + [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu), + [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu), + [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu), +}; + +static inline u8 nvmet_tcp_pdu_size(u8 type) +{ + size_t idx = type; + + return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) && + 
nvme_tcp_pdu_sizes[idx]) ? + nvme_tcp_pdu_sizes[idx] : 0; +} + +static inline bool nvmet_tcp_pdu_valid(u8 type) +{ + switch (type) { + case nvme_tcp_icreq: + case nvme_tcp_cmd: + case nvme_tcp_h2c_data: + /* fallthru */ + return true; + } + + return false; +} + +static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + int len; + struct kvec iov; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + +recv: + iov.iov_base = (void *)&queue->pdu + queue->offset; + iov.iov_len = queue->left; + len = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(len < 0)) + return len; + + queue->offset += len; + queue->left -= len; + if (queue->left) + return -EAGAIN; + + if (queue->offset == sizeof(struct nvme_tcp_hdr)) { + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { + pr_err("unexpected pdu type %d\n", hdr->type); + nvmet_tcp_fatal_error(queue); + return -EIO; + } + + if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) { + pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen); + return -EIO; + } + + queue->left = hdr->hlen - queue->offset + hdgst; + goto recv; + } + + if (queue->hdr_digest && + nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + if (queue->data_digest && + nvmet_tcp_check_ddgst(queue, &queue->pdu)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + return nvmet_tcp_done_recv_pdu(queue); +} + +static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + + nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd); + queue->offset = 0; + queue->left = NVME_TCP_DIGEST_LENGTH; + queue->rcv_state = NVMET_TCP_RECV_DDGST; +} + +static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + + while (msg_data_left(&cmd->recv_msg)) { + ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, + cmd->recv_msg.msg_flags); + if (ret <= 0) + return ret; + + cmd->pdu_recv += ret; + cmd->rbytes_done += ret; + } + + nvmet_tcp_unmap_pdu_iovec(cmd); + if (queue->data_digest) { + nvmet_tcp_prep_recv_ddgst(cmd); + return 0; + } + + if (cmd->rbytes_done == cmd->req.transfer_len) + nvmet_tcp_execute_request(cmd); + + nvmet_prepare_receive_pdu(queue); + return 0; +} + +static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = (void *)&cmd->recv_ddgst + queue->offset, + .iov_len = queue->left + }; + + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(ret < 0)) + return ret; + + queue->offset += ret; + queue->left -= ret; + if (queue->left) + return -EAGAIN; + + if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) { + pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n", + queue->idx, cmd->req.cmd->common.command_id, + queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), + le32_to_cpu(cmd->exp_ddgst)); + nvmet_tcp_finish_cmd(cmd); + nvmet_tcp_fatal_error(queue); + ret = -EPROTO; + goto out; + } + + if (cmd->rbytes_done == cmd->req.transfer_len) + nvmet_tcp_execute_request(cmd); + + ret = 0; +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue) +{ + int result = 0; + + if 
(unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR)) + return 0; + + if (queue->rcv_state == NVMET_TCP_RECV_PDU) { + result = nvmet_tcp_try_recv_pdu(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DATA) { + result = nvmet_tcp_try_recv_data(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DDGST) { + result = nvmet_tcp_try_recv_ddgst(queue); + if (result != 0) + goto done_recv; + } + +done_recv: + if (result < 0) { + if (result == -EAGAIN) + return 0; + return result; + } + return 1; +} + +static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue, + int budget, int *recvs) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_recv_one(queue); + if (unlikely(ret < 0)) { + nvmet_tcp_socket_error(queue, ret); + goto done; + } else if (ret == 0) { + break; + } + (*recvs)++; + } +done: + return ret; +} + +static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) +{ + spin_lock(&queue->state_lock); + if (queue->state != NVMET_TCP_Q_DISCONNECTING) { + queue->state = NVMET_TCP_Q_DISCONNECTING; + schedule_work(&queue->release_work); + } + spin_unlock(&queue->state_lock); +} + +static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue) +{ + queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs); +} + +static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue, + int ops) +{ + if (!idle_poll_period_usecs) + return false; + + if (ops) + nvmet_tcp_arm_queue_deadline(queue); + + return !time_after(jiffies, queue->poll_end); +} + +static void nvmet_tcp_io_work(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, io_work); + bool pending; + int ret, ops = 0; + + do { + pending = false; + + ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops); + if (ret > 0) + pending = true; + else if (ret < 0) + return; + + ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops); + if (ret > 0) + pending = true; + else if (ret < 0) + return; + + } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); + + /* + * Requeue the worker if idle deadline period is in progress or any + * ops activity was recorded during the do-while loop above. 
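+ * Otherwise the queue is treated as idle and the next wakeup comes from
+ * the socket callbacks (nvmet_tcp_data_ready()/nvmet_tcp_write_space()).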
+ */ + if (nvmet_tcp_check_queue_deadline(queue, ops) || pending) + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); +} + +static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *c) +{ + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + c->queue = queue; + c->req.port = queue->port->nport; + + c->cmd_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->cmd_pdu) + return -ENOMEM; + c->req.cmd = &c->cmd_pdu->cmd; + + c->rsp_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->rsp_pdu) + goto out_free_cmd; + c->req.cqe = &c->rsp_pdu->cqe; + + c->data_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->data_pdu) + goto out_free_rsp; + + c->r2t_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->r2t_pdu) + goto out_free_data; + + c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + + list_add_tail(&c->entry, &queue->free_list); + + return 0; +out_free_data: + page_frag_free(c->data_pdu); +out_free_rsp: + page_frag_free(c->rsp_pdu); +out_free_cmd: + page_frag_free(c->cmd_pdu); + return -ENOMEM; +} + +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c) +{ + page_frag_free(c->r2t_pdu); + page_frag_free(c->data_pdu); + page_frag_free(c->rsp_pdu); + page_frag_free(c->cmd_pdu); +} + +static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmds; + int i, ret = -EINVAL, nr_cmds = queue->nr_cmds; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_tcp_alloc_cmd(queue, cmds + i); + if (ret) + goto out_free; + } + + queue->cmds = cmds; + + return 0; +out_free: + while (--i >= 0) + nvmet_tcp_free_cmd(cmds + i); + kfree(cmds); +out: + return ret; +} + +static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmds = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++) + nvmet_tcp_free_cmd(cmds + i); + + nvmet_tcp_free_cmd(&queue->connect); + kfree(cmds); +} + +static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) +{ + nvmet_req_uninit(&cmd->req); + nvmet_tcp_unmap_pdu_iovec(cmd); + nvmet_tcp_free_cmd_buffers(cmd); +} + +static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++, cmd++) { + if (nvmet_tcp_need_data_in(cmd)) + nvmet_req_uninit(&cmd->req); + + nvmet_tcp_unmap_pdu_iovec(cmd); + nvmet_tcp_free_cmd_buffers(cmd); + } + + if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { + /* failed in connect */ + nvmet_tcp_finish_cmd(&queue->connect); + } +} + +static void nvmet_tcp_release_queue_work(struct work_struct *w) +{ + struct page *page; + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, release_work); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + 
nvmet_tcp_restore_socket_callbacks(queue); + cancel_work_sync(&queue->io_work); + /* stop accepting incoming data */ + queue->rcv_state = NVMET_TCP_RECV_ERR; + + nvmet_tcp_uninit_data_in_cmds(queue); + nvmet_sq_destroy(&queue->nvme_sq); + cancel_work_sync(&queue->io_work); + sock_release(queue->sock); + nvmet_tcp_free_cmds(queue); + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + + page = virt_to_head_page(queue->pf_cache.va); + __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); + kfree(queue); +} + +static void nvmet_tcp_data_ready(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue)) + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_write_space(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (unlikely(!queue)) + goto out; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + queue->write_space(sk); + goto out; + } + + if (sk_stream_is_writeable(sk)) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + } +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_state_change(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_FIN_WAIT2: + case TCP_LAST_ACK: + break; + case TCP_FIN_WAIT1: + case TCP_CLOSE_WAIT: + case TCP_CLOSE: + /* FALLTHRU */ + nvmet_tcp_schedule_release_queue(queue); + break; + default: + pr_warn("queue %d unhandled state %d\n", + queue->idx, sk->sk_state); + } +done: + read_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + struct inet_sock *inet = inet_sk(sock->sk); + int ret; + + ret = kernel_getsockname(sock, + (struct sockaddr *)&queue->sockaddr); + if (ret < 0) + return ret; + + ret = kernel_getpeername(sock, + (struct sockaddr *)&queue->sockaddr_peer); + if (ret < 0) + return ret; + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. 
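+ * sock_no_linger() arms SO_LINGER with a zero timeout, so closing the
+ * socket performs an abortive close (RST) and discards any unsent data
+ * rather than attempting an orderly FIN shutdown.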
+ */ + sock_no_linger(sock->sk); + + if (so_priority > 0) + sock_set_priority(sock->sk, so_priority); + + /* Set socket type of service */ + if (inet->rcv_tos > 0) + ip_sock_set_tos(sock->sk, inet->rcv_tos); + + ret = 0; + write_lock_bh(&sock->sk->sk_callback_lock); + if (sock->sk->sk_state != TCP_ESTABLISHED) { + /* + * If the socket is already closing, don't even start + * consuming it + */ + ret = -ENOTCONN; + } else { + sock->sk->sk_user_data = queue; + queue->data_ready = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = nvmet_tcp_data_ready; + queue->state_change = sock->sk->sk_state_change; + sock->sk->sk_state_change = nvmet_tcp_state_change; + queue->write_space = sock->sk->sk_write_space; + sock->sk->sk_write_space = nvmet_tcp_write_space; + if (idle_poll_period_usecs) + nvmet_tcp_arm_queue_deadline(queue); + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + } + write_unlock_bh(&sock->sk->sk_callback_lock); + + return ret; +} + +static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, + struct socket *newsock) +{ + struct nvmet_tcp_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return -ENOMEM; + + INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); + INIT_WORK(&queue->io_work, nvmet_tcp_io_work); + queue->sock = newsock; + queue->port = port; + queue->nr_cmds = 0; + spin_lock_init(&queue->state_lock); + queue->state = NVMET_TCP_Q_CONNECTING; + INIT_LIST_HEAD(&queue->free_list); + init_llist_head(&queue->resp_list); + INIT_LIST_HEAD(&queue->resp_send_list); + + queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = queue->idx; + goto out_free_queue; + } + + ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); + if (ret) + goto out_ida_remove; + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_connect; + + nvmet_prepare_receive_pdu(queue); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + ret = nvmet_tcp_set_queue_sock(queue); + if (ret) + goto out_destroy_sq; + + return 0; +out_destroy_sq: + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + nvmet_sq_destroy(&queue->nvme_sq); +out_free_connect: + nvmet_tcp_free_cmd(&queue->connect); +out_ida_remove: + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); +out_free_queue: + kfree(queue); + return ret; +} + +static void nvmet_tcp_accept_work(struct work_struct *w) +{ + struct nvmet_tcp_port *port = + container_of(w, struct nvmet_tcp_port, accept_work); + struct socket *newsock; + int ret; + + while (true) { + ret = kernel_accept(port->sock, &newsock, O_NONBLOCK); + if (ret < 0) { + if (ret != -EAGAIN) + pr_warn("failed to accept err=%d\n", ret); + return; + } + ret = nvmet_tcp_alloc_queue(port, newsock); + if (ret) { + pr_err("failed to allocate queue\n"); + sock_release(newsock); + } + } +} + +static void nvmet_tcp_listen_data_ready(struct sock *sk) +{ + struct nvmet_tcp_port *port; + + read_lock_bh(&sk->sk_callback_lock); + port = sk->sk_user_data; + if (!port) + goto out; + + if (sk->sk_state == TCP_LISTEN) + schedule_work(&port->accept_work); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_add_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port; + __kernel_sa_family_t af; + int ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + switch (nport->disc_addr.adrfam) { + 
case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto err_port; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto err_port; + } + + port->nport = nport; + INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); + if (port->nport->inline_data_size < 0) + port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; + + ret = sock_create(port->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &port->sock); + if (ret) { + pr_err("failed to create a socket\n"); + goto err_port; + } + + port->sock->sk->sk_user_data = port; + port->data_ready = port->sock->sk->sk_data_ready; + port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + sock_set_reuseaddr(port->sock->sk); + tcp_sock_set_nodelay(port->sock->sk); + if (so_priority > 0) + sock_set_priority(port->sock->sk, so_priority); + + ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, + sizeof(port->addr)); + if (ret) { + pr_err("failed to bind port socket %d\n", ret); + goto err_sock; + } + + ret = kernel_listen(port->sock, 128); + if (ret) { + pr_err("failed to listen %d on port sock\n", ret); + goto err_sock; + } + + nport->priv = port; + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(nport->disc_addr.portid), &port->addr); + + return 0; + +err_sock: + sock_release(port->sock); +err_port: + kfree(port); + return ret; +} + +static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port) +{ + struct nvmet_tcp_queue *queue; + + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + if (queue->port == port) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); +} + +static void nvmet_tcp_remove_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port = nport->priv; + + write_lock_bh(&port->sock->sk->sk_callback_lock); + port->sock->sk->sk_data_ready = port->data_ready; + port->sock->sk->sk_user_data = NULL; + write_unlock_bh(&port->sock->sk->sk_callback_lock); + cancel_work_sync(&port->accept_work); + /* + * Destroy the remaining queues, which are not belong to any + * controller yet. 
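+ * Shutting a socket down forces its state to change, which invokes
+ * nvmet_tcp_state_change() and schedules the per-queue release work
+ * that performs the actual teardown.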
+ */ + nvmet_tcp_destroy_port_queues(port); + + sock_release(port->sock); + kfree(port); +} + +static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_tcp_queue *queue; + + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + if (queue->nvme_sq.ctrl == ctrl) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); +} + +static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) +{ + struct nvmet_tcp_queue *queue = + container_of(sq, struct nvmet_tcp_queue, nvme_sq); + + if (sq->qid == 0) { + /* Let inflight controller teardown complete */ + flush_scheduled_work(); + } + + queue->nr_cmds = sq->size * 2; + if (nvmet_tcp_alloc_cmds(queue)) + return NVME_SC_INTERNAL; + return 0; +} + +static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, + struct nvmet_port *nport, char *traddr) +{ + struct nvmet_tcp_port *port = nport->priv; + + if (inet_addr_is_any((struct sockaddr *)&port->addr)) { + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + + sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr); + } else { + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); + } +} + +static const struct nvmet_fabrics_ops nvmet_tcp_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_TCP, + .msdbd = 1, + .add_port = nvmet_tcp_add_port, + .remove_port = nvmet_tcp_remove_port, + .queue_response = nvmet_tcp_queue_response, + .delete_ctrl = nvmet_tcp_delete_ctrl, + .install_queue = nvmet_tcp_install_queue, + .disc_traddr = nvmet_tcp_disc_port_addr, +}; + +static int __init nvmet_tcp_init(void) +{ + int ret; + + nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0); + if (!nvmet_tcp_wq) + return -ENOMEM; + + ret = nvmet_register_transport(&nvmet_tcp_ops); + if (ret) + goto err; + + return 0; +err: + destroy_workqueue(nvmet_tcp_wq); + return ret; +} + +static void __exit nvmet_tcp_exit(void) +{ + struct nvmet_tcp_queue *queue; + + nvmet_unregister_transport(&nvmet_tcp_ops); + + flush_scheduled_work(); + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); + flush_scheduled_work(); + + destroy_workqueue(nvmet_tcp_wq); +} + +module_init(nvmet_tcp_init); +module_exit(nvmet_tcp_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.c new file mode 100644 index 0000000..bff454d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express target device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + */ + +#include +#include "trace.h" + +static const char *nvmet_trace_admin_identify(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 cns = cdw10[0]; + u16 ctrlid = get_unaligned_le16(cdw10 + 2); + + trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_admin_get_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sel = cdw10[1] & 0x7; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, 
cdw11); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_get_lba_status(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 mndw = get_unaligned_le32(cdw10 + 8); + u16 rl = get_unaligned_le16(cdw10 + 12); + u8 atype = cdw10[15]; + + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", + slba, mndw, rl, atype); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u16 length = get_unaligned_le16(cdw10 + 8); + u16 control = get_unaligned_le16(cdw10 + 10); + u32 dsmgmt = get_unaligned_le32(cdw10 + 12); + u32 reftag = get_unaligned_le32(cdw10 + 16); + + trace_seq_printf(p, + "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u", + slba, length, control, dsmgmt, reftag); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_dsm(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "nr=%u, attributes=%u", + get_unaligned_le32(cdw10), + get_unaligned_le32(cdw10 + 4)); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_common(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "cdw10=%*ph", 24, cdw10); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_admin_identify: + return nvmet_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvmet_trace_admin_set_features(p, cdw10); + case nvme_admin_get_features: + return nvmet_trace_admin_get_features(p, cdw10); + case nvme_admin_get_lba_status: + return nvmet_trace_get_lba_status(p, cdw10); + default: + return nvmet_trace_common(p, cdw10); + } +} + +const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, + u8 opcode, u8 *cdw10) +{ + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + return nvmet_trace_read_write(p, cdw10); + case nvme_cmd_dsm: + return nvmet_trace_dsm(p, cdw10); + default: + return nvmet_trace_common(p, cdw10); + } +} + +static const char *nvmet_trace_fabrics_property_set(struct trace_seq *p, + u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + u64 value = get_unaligned_le64(spc + 8); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx", + attrib, ofst, value); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_connect(struct trace_seq *p, + u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 recfmt = get_unaligned_le16(spc); + u16 qid = get_unaligned_le16(spc + 2); + u16 sqsize = get_unaligned_le16(spc + 4); + u8 cattr = spc[6]; + u32 kato = get_unaligned_le32(spc + 8); + + trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u", + recfmt, qid, sqsize, cattr, kato); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_property_get(struct 
trace_seq *p, + u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "specific=%*ph", 24, spc); + trace_seq_putc(p, 0); + return ret; +} + +const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, + u8 fctype, u8 *spc) +{ + switch (fctype) { + case nvme_fabrics_type_property_set: + return nvmet_trace_fabrics_property_set(p, spc); + case nvme_fabrics_type_connect: + return nvmet_trace_fabrics_connect(p, spc); + case nvme_fabrics_type_property_get: + return nvmet_trace_fabrics_property_get(p, spc); + default: + return nvmet_trace_fabrics_common(p, spc); + } +} + +const char *nvmet_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (*name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl) +{ + const char *ret = trace_seq_buffer_ptr(p); + + /* + * XXX: We don't know the controller instance before executing the + * connect command itself because the connect command for the admin + * queue will not provide the cntlid which will be allocated in this + * command. In case of io queues, the controller instance will be + * mapped by the extra data of the connect command. + * If we can know the extra data of the connect command in this stage, + * we can update this print statement later. + */ + if (ctrl) + trace_seq_printf(p, "%d", ctrl->cntlid); + else + trace_seq_printf(p, "_"); + trace_seq_putc(p, 0); + + return ret; +} + diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.h b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.h new file mode 100644 index 0000000..6109b38 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/trace.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NVM Express target device driver tracepoints + * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH + * + * This is entirely based on drivers/nvme/host/trace.h + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nvmet + +#if !defined(_TRACE_NVMET_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NVMET_H + +#include +#include +#include + +#include "nvmet.h" + +const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype, + u8 *spc); + +#define parse_nvme_cmd(qid, opcode, fctype, cdw10) \ + ((opcode) == nvme_fabrics_command ? \ + nvmet_trace_parse_fabrics_cmd(p, fctype, cdw10) : \ + (qid ? 
\ + nvmet_trace_parse_nvm_cmd(p, opcode, cdw10) : \ + nvmet_trace_parse_admin_cmd(p, opcode, cdw10))) + +const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl); +#define __print_ctrl_name(ctrl) \ + nvmet_trace_ctrl_name(p, ctrl) + +const char *nvmet_trace_disk_name(struct trace_seq *p, char *name); +#define __print_disk_name(name) \ + nvmet_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req) +{ + return req->sq->ctrl; +} + +static inline void __assign_req_name(char *name, struct nvmet_req *req) +{ + if (!req->ns) { + memset(name, 0, DISK_NAME_LEN); + return; + } + + strncpy(name, req->ns->device_path, + min_t(size_t, DISK_NAME_LEN, strlen(req->ns->device_path))); +} +#endif + +TRACE_EVENT(nvmet_req_init, + TP_PROTO(struct nvmet_req *req, struct nvme_command *cmd), + TP_ARGS(req, cmd), + TP_STRUCT__entry( + __field(struct nvme_command *, cmd) + __field(struct nvmet_ctrl *, ctrl) + __array(char, disk, DISK_NAME_LEN) + __field(int, qid) + __field(u16, cid) + __field(u8, opcode) + __field(u8, fctype) + __field(u8, flags) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) + ), + TP_fast_assign( + __entry->cmd = cmd; + __entry->ctrl = nvmet_req_to_ctrl(req); + __assign_req_name(__entry->disk, req); + __entry->qid = req->sq->qid; + __entry->cid = cmd->common.command_id; + __entry->opcode = cmd->common.opcode; + __entry->fctype = cmd->fabrics.fctype; + __entry->flags = cmd->common.flags; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, &cmd->common.cdw10, + sizeof(__entry->cdw10)); + ), + TP_printk("nvmet%s: %sqid=%d, cmdid=%u, nsid=%u, flags=%#x, " + "meta=%#llx, cmd=(%s, %s)", + __print_ctrl_name(__entry->ctrl), + __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->nsid, + __entry->flags, __entry->metadata, + show_opcode_name(__entry->qid, __entry->opcode, + __entry->fctype), + parse_nvme_cmd(__entry->qid, __entry->opcode, + __entry->fctype, __entry->cdw10)) +); + +TRACE_EVENT(nvmet_req_complete, + TP_PROTO(struct nvmet_req *req), + TP_ARGS(req), + TP_STRUCT__entry( + __field(struct nvmet_ctrl *, ctrl) + __array(char, disk, DISK_NAME_LEN) + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u16, status) + ), + TP_fast_assign( + __entry->ctrl = nvmet_req_to_ctrl(req); + __entry->qid = req->cq->qid; + __entry->cid = req->cqe->command_id; + __entry->result = le64_to_cpu(req->cqe->result.u64); + __entry->status = le16_to_cpu(req->cqe->status) >> 1; + __assign_req_name(__entry->disk, req); + ), + TP_printk("nvmet%s: %sqid=%d, cmdid=%u, res=%#llx, status=%#x", + __print_ctrl_name(__entry->ctrl), + __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->result, __entry->status) + +); + +#define aer_name(aer) { aer, #aer } + +TRACE_EVENT(nvmet_async_event, + TP_PROTO(struct nvmet_ctrl *ctrl, __le32 result), + TP_ARGS(ctrl, result), + TP_STRUCT__entry( + __field(int, ctrl_id) + __field(u32, result) + ), + TP_fast_assign( + __entry->ctrl_id = ctrl->cntlid; + __entry->result = (le32_to_cpu(result) & 0xff00) >> 8; + ), + TP_printk("nvmet%d: NVME_AEN=%#08x [%s]", + __entry->ctrl_id, __entry->result, + __print_symbolic(__entry->result, + aer_name(NVME_AER_NOTICE_NS_CHANGED), + aer_name(NVME_AER_NOTICE_ANA), + aer_name(NVME_AER_NOTICE_FW_ACT_STARTING), + aer_name(NVME_AER_NOTICE_DISC_CHANGED), + aer_name(NVME_AER_ERROR), + 
aer_name(NVME_AER_SMART), + aer_name(NVME_AER_CSS), + aer_name(NVME_AER_VS)) + ) +); +#undef aer_name + +#endif /* _TRACE_NVMET_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/zns.c b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/zns.c new file mode 100644 index 0000000..927198b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/target/zns.c @@ -0,0 +1,625 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe ZNS-ZBD command implementation. + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include "nvmet.h" + +/* + * We set the Memory Page Size Minimum (MPSMIN) for target controller to 0 + * which gets added by 12 in the nvme_enable_ctrl() which results in 2^12 = 4k + * as page_shift value. When calculating the ZASL use shift by 12. + */ +#define NVMET_MPSMIN_SHIFT 12 + +static inline u8 nvmet_zasl(unsigned int zone_append_sects) +{ + /* + * Zone Append Size Limit (zasl) is expressed as a power of 2 value + * with the minimum memory page size (i.e. 12) as unit. + */ + return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - 9)); +} + +static int validate_conv_zones_cb(struct blk_zone *z, + unsigned int i, void *data) +{ + if (z->type == BLK_ZONE_TYPE_CONVENTIONAL) + return -EOPNOTSUPP; + return 0; +} + +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) +{ + struct request_queue *q = ns->bdev->bd_disk->queue; + u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q)); + struct gendisk *bd_disk = ns->bdev->bd_disk; + int ret; + + if (ns->subsys->zasl) { + if (ns->subsys->zasl > zasl) + return false; + } + ns->subsys->zasl = zasl; + + /* + * Generic zoned block devices may have a smaller last zone which is + * not supported by ZNS. Exclude zoned drives that have such smaller + * last zone. + */ + if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1)) + return false; + /* + * ZNS does not define a conventional zone type. If the underlying + * device has a bitmap set indicating the existence of conventional + * zones, reject the device. Otherwise, use report zones to detect if + * the device has conventional zones. 
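+	 * Note that the conv_zones_bitmap test is only a fast path; drivers
+	 * that never populate the bitmap are still covered by the report
+	 * zones scan below, where validate_conv_zones_cb() rejects any
+	 * conventional zone it finds.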
+ */ + if (ns->bdev->bd_disk->queue->conv_zones_bitmap) + return false; + + ret = blkdev_report_zones(ns->bdev, 0, blkdev_nr_zones(bd_disk), + validate_conv_zones_cb, NULL); + if (ret < 0) + return false; + + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + return true; +} + +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req) +{ + u8 zasl = req->sq->ctrl->subsys->zasl; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl_zns *id; + u16 status; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + if (ctrl->ops->get_mdts) + id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl); + else + id->zasl = zasl; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) +{ + struct nvme_id_ns_zns *id_zns; + u64 zsze; + u16 status; + u32 mar, mor; + + if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { + req->error_loc = offsetof(struct nvme_identify, nsid); + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL); + if (!id_zns) { + status = NVME_SC_INTERNAL; + goto out; + } + + status = nvmet_req_find_ns(req); + if (status) + goto done; + + if (!bdev_is_zoned(req->ns->bdev)) { + req->error_loc = offsetof(struct nvme_identify, nsid); + goto done; + } + + nvmet_ns_revalidate(req->ns); + zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >> + req->ns->blksize_shift; + id_zns->lbafe[0].zsze = cpu_to_le64(zsze); + + mor = bdev_max_open_zones(req->ns->bdev); + if (!mor) + mor = U32_MAX; + else + mor--; + id_zns->mor = cpu_to_le32(mor); + + mar = bdev_max_active_zones(req->ns->bdev); + if (!mar) + mar = U32_MAX; + else + mar--; + id_zns->mar = cpu_to_le32(mar); + +done: + status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns)); + kfree(id_zns); +out: + nvmet_req_complete(req, status); +} + +static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req) +{ + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); + u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2; + + if (sect >= get_capacity(req->ns->bdev->bd_disk)) { + req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, slba); + return NVME_SC_LBA_RANGE | NVME_SC_DNR; + } + + if (out_bufsize < sizeof(struct nvme_zone_report)) { + req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, numd); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) { + req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + switch (req->cmd->zmr.pr) { + case 0: + case 1: + break; + default: + req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + switch (req->cmd->zmr.zrasf) { + case NVME_ZRASF_ZONE_REPORT_ALL: + case NVME_ZRASF_ZONE_STATE_EMPTY: + case NVME_ZRASF_ZONE_STATE_IMP_OPEN: + case NVME_ZRASF_ZONE_STATE_EXP_OPEN: + case NVME_ZRASF_ZONE_STATE_CLOSED: + case NVME_ZRASF_ZONE_STATE_FULL: + case NVME_ZRASF_ZONE_STATE_READONLY: + case NVME_ZRASF_ZONE_STATE_OFFLINE: + break; + default: + req->error_loc = + offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + return NVME_SC_SUCCESS; +} + +struct nvmet_report_zone_data { + struct nvmet_req *req; + u64 out_buf_offset; + u64 out_nr_zones; + u64 nr_zones; + u8 zrasf; +}; + +static int 
nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned i, void *d) +{ + static const unsigned int nvme_zrasf_to_blk_zcond[] = { + [NVME_ZRASF_ZONE_STATE_EMPTY] = BLK_ZONE_COND_EMPTY, + [NVME_ZRASF_ZONE_STATE_IMP_OPEN] = BLK_ZONE_COND_IMP_OPEN, + [NVME_ZRASF_ZONE_STATE_EXP_OPEN] = BLK_ZONE_COND_EXP_OPEN, + [NVME_ZRASF_ZONE_STATE_CLOSED] = BLK_ZONE_COND_CLOSED, + [NVME_ZRASF_ZONE_STATE_READONLY] = BLK_ZONE_COND_READONLY, + [NVME_ZRASF_ZONE_STATE_FULL] = BLK_ZONE_COND_FULL, + [NVME_ZRASF_ZONE_STATE_OFFLINE] = BLK_ZONE_COND_OFFLINE, + }; + struct nvmet_report_zone_data *rz = d; + + if (rz->zrasf != NVME_ZRASF_ZONE_REPORT_ALL && + z->cond != nvme_zrasf_to_blk_zcond[rz->zrasf]) + return 0; + + if (rz->nr_zones < rz->out_nr_zones) { + struct nvme_zone_descriptor zdesc = { }; + u16 status; + + zdesc.zcap = nvmet_sect_to_lba(rz->req->ns, z->capacity); + zdesc.zslba = nvmet_sect_to_lba(rz->req->ns, z->start); + zdesc.wp = nvmet_sect_to_lba(rz->req->ns, z->wp); + zdesc.za = z->reset ? 1 << 2 : 0; + zdesc.zs = z->cond << 4; + zdesc.zt = z->type; + + status = nvmet_copy_to_sgl(rz->req, rz->out_buf_offset, &zdesc, + sizeof(zdesc)); + if (status) + return -EINVAL; + + rz->out_buf_offset += sizeof(zdesc); + } + + rz->nr_zones++; + + return 0; +} + +static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req) +{ + unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); + + return blkdev_nr_zones(req->ns->bdev->bd_disk) - + (sect >> ilog2(bdev_zone_sectors(req->ns->bdev))); +} + +static unsigned long get_nr_zones_from_buf(struct nvmet_req *req, u32 bufsize) +{ + if (bufsize <= sizeof(struct nvme_zone_report)) + return 0; + + return (bufsize - sizeof(struct nvme_zone_report)) / + sizeof(struct nvme_zone_descriptor); +} + +static void nvmet_bdev_zone_zmgmt_recv_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work); + sector_t start_sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); + unsigned long req_slba_nr_zones = nvmet_req_nr_zones_from_slba(req); + u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2; + __le64 nr_zones; + u16 status; + int ret; + struct nvmet_report_zone_data rz_data = { + .out_nr_zones = get_nr_zones_from_buf(req, out_bufsize), + /* leave the place for report zone header */ + .out_buf_offset = sizeof(struct nvme_zone_report), + .zrasf = req->cmd->zmr.zrasf, + .nr_zones = 0, + .req = req, + }; + + status = nvmet_bdev_validate_zone_mgmt_recv(req); + if (status) + goto out; + + if (!req_slba_nr_zones) { + status = NVME_SC_SUCCESS; + goto out; + } + + ret = blkdev_report_zones(req->ns->bdev, start_sect, req_slba_nr_zones, + nvmet_bdev_report_zone_cb, &rz_data); + if (ret < 0) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* + * When partial bit is set nr_zones must indicate the number of zone + * descriptors actually transferred. 
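+	 * That is the smaller of the number of matching zones and the number
+	 * of descriptors that fit in the host buffer. Without the partial
+	 * bit, nr_zones reports every matching zone even when not all
+	 * descriptors were copied to the host buffer.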
+ */ + if (req->cmd->zmr.pr) + rz_data.nr_zones = min(rz_data.nr_zones, rz_data.out_nr_zones); + + nr_zones = cpu_to_le64(rz_data.nr_zones); + status = nvmet_copy_to_sgl(req, 0, &nr_zones, sizeof(nr_zones)); + +out: + nvmet_req_complete(req, status); +} + +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req) +{ + INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zone_zmgmt_recv_work); + queue_work(zbd_wq, &req->z.zmgmt_work); +} + +static inline enum req_opf zsa_req_op(u8 zsa) +{ + switch (zsa) { + case NVME_ZONE_OPEN: + return REQ_OP_ZONE_OPEN; + case NVME_ZONE_CLOSE: + return REQ_OP_ZONE_CLOSE; + case NVME_ZONE_FINISH: + return REQ_OP_ZONE_FINISH; + case NVME_ZONE_RESET: + return REQ_OP_ZONE_RESET; + default: + return REQ_OP_LAST; + } +} + +static u16 blkdev_zone_mgmt_errno_to_nvme_status(int ret) +{ + switch (ret) { + case 0: + return NVME_SC_SUCCESS; + case -EINVAL: + case -EIO: + return NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR; + default: + return NVME_SC_INTERNAL; + } +} + +struct nvmet_zone_mgmt_send_all_data { + unsigned long *zbitmap; + struct nvmet_req *req; +}; + +static int zmgmt_send_scan_cb(struct blk_zone *z, unsigned i, void *d) +{ + struct nvmet_zone_mgmt_send_all_data *data = d; + + switch (zsa_req_op(data->req->cmd->zms.zsa)) { + case REQ_OP_ZONE_OPEN: + switch (z->cond) { + case BLK_ZONE_COND_CLOSED: + break; + default: + return 0; + } + break; + case REQ_OP_ZONE_CLOSE: + switch (z->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + break; + default: + return 0; + } + break; + case REQ_OP_ZONE_FINISH: + switch (z->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + break; + default: + return 0; + } + break; + default: + return -EINVAL; + } + + set_bit(i, data->zbitmap); + + return 0; +} + +static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req) +{ + struct block_device *bdev = req->ns->bdev; + unsigned int nr_zones = blkdev_nr_zones(bdev->bd_disk); + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = NULL; + sector_t sector = 0; + int ret; + struct nvmet_zone_mgmt_send_all_data d = { + .req = req, + }; + + d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)), + GFP_NOIO, q->node); + if (!d.zbitmap) { + ret = -ENOMEM; + goto out; + } + + /* Scan and build bitmap of the eligible zones */ + ret = blkdev_report_zones(bdev, 0, nr_zones, zmgmt_send_scan_cb, &d); + if (ret != nr_zones) { + if (ret > 0) + ret = -EIO; + goto out; + } else { + /* We scanned all the zones */ + ret = 0; + } + + while (sector < get_capacity(bdev->bd_disk)) { + if (test_bit(blk_queue_zone_no(q, sector), d.zbitmap)) { + bio = blk_next_bio(bio, 0, GFP_KERNEL); + bio->bi_opf = zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC; + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + /* This may take a while, so be nice to others */ + cond_resched(); + } + sector += blk_queue_zone_sectors(q); + } + + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + +out: + kfree(d.zbitmap); + + return blkdev_zone_mgmt_errno_to_nvme_status(ret); +} + +static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req) +{ + int ret; + + switch (zsa_req_op(req->cmd->zms.zsa)) { + case REQ_OP_ZONE_RESET: + ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0, + get_capacity(req->ns->bdev->bd_disk), + GFP_KERNEL); + if (ret < 0) + return blkdev_zone_mgmt_errno_to_nvme_status(ret); + break; + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return 
nvmet_bdev_zone_mgmt_emulate_all(req); + default: + /* this is needed to quiet compiler warning */ + req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + return NVME_SC_SUCCESS; +} + +static void nvmet_bdev_zmgmt_send_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work); + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba); + enum req_opf op = zsa_req_op(req->cmd->zms.zsa); + struct block_device *bdev = req->ns->bdev; + sector_t zone_sectors = bdev_zone_sectors(bdev); + u16 status = NVME_SC_SUCCESS; + int ret; + + if (op == REQ_OP_LAST) { + req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa); + status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_SC_DNR; + goto out; + } + + /* when select all bit is set slba field is ignored */ + if (req->cmd->zms.select_all) { + status = nvmet_bdev_execute_zmgmt_send_all(req); + goto out; + } + + if (sect >= get_capacity(bdev->bd_disk)) { + req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba); + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + goto out; + } + + if (sect & (zone_sectors - 1)) { + req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + + ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL); + if (ret < 0) + status = blkdev_zone_mgmt_errno_to_nvme_status(ret); + +out: + nvmet_req_complete(req, status); +} + +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req) +{ + INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zmgmt_send_work); + queue_work(zbd_wq, &req->z.zmgmt_work); +} + +static void nvmet_bdev_zone_append_bio_done(struct bio *bio) +{ + struct nvmet_req *req = bio->bi_private; + + if (bio->bi_status == BLK_STS_OK) { + req->cqe->result.u64 = + nvmet_sect_to_lba(req->ns, bio->bi_iter.bi_sector); + } + + nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status)); + nvmet_req_bio_put(req, bio); +} + +void nvmet_bdev_execute_zone_append(struct nvmet_req *req) +{ + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); + u16 status = NVME_SC_SUCCESS; + unsigned int total_len = 0; + struct scatterlist *sg; + struct bio *bio; + int sg_cnt; + + /* Request is completed on len mismatch in nvmet_check_transter_len() */ + if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req))) + return; + + if (!req->sg_cnt) { + nvmet_req_complete(req, 0); + return; + } + + if (sect >= get_capacity(req->ns->bdev->bd_disk)) { + req->error_loc = offsetof(struct nvme_rw_command, slba); + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + goto out; + } + + if (sect & (bdev_zone_sectors(req->ns->bdev) - 1)) { + req->error_loc = offsetof(struct nvme_rw_command, slba); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; + } + + if (nvmet_use_inline_bvec(req)) { + bio = &req->z.inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + } else { + bio = bio_alloc(GFP_KERNEL, req->sg_cnt); + } + + bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; + bio->bi_end_io = nvmet_bdev_zone_append_bio_done; + bio_set_dev(bio, req->ns->bdev); + bio->bi_iter.bi_sector = sect; + bio->bi_private = req; + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + bio->bi_opf |= REQ_FUA; + + for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) { + struct page *p = sg_page(sg); + unsigned int l = sg->length; + unsigned int o = sg->offset; + unsigned int ret; + + ret = bio_add_zone_append_page(bio, p, l, o); + if (ret != sg->length) { 
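+			/*
+			 * bio_add_zone_append_page() returns the number of
+			 * bytes it accepted; a short add means the segment
+			 * could not be added to the zone append bio (e.g. it
+			 * would exceed the zone append limits), so fail the
+			 * command.
+			 */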
+ status = NVME_SC_INTERNAL; + goto out_put_bio; + } + total_len += sg->length; + } + + if (total_len != nvmet_rw_data_len(req)) { + status = NVME_SC_INTERNAL | NVME_SC_DNR; + goto out_put_bio; + } + + submit_bio(bio); + return; + +out_put_bio: + nvmet_req_bio_put(req, bio); +out: + nvmet_req_complete(req, status); +} + +u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->common.opcode) { + case nvme_cmd_zone_append: + req->execute = nvmet_bdev_execute_zone_append; + return 0; + case nvme_cmd_zone_mgmt_recv: + req->execute = nvmet_bdev_execute_zone_mgmt_recv; + return 0; + case nvme_cmd_zone_mgmt_send: + req->execute = nvmet_bdev_execute_zone_mgmt_send; + return 0; + default: + return nvmet_bdev_parse_io_cmd(req); + } +} diff --git a/src/mlnx-ofa_kernel-5.8/drivers/nvme/tools/sign-modules b/src/mlnx-ofa_kernel-5.8/drivers/nvme/tools/sign-modules new file mode 100755 index 0000000..b790769 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/nvme/tools/sign-modules @@ -0,0 +1,58 @@ +#! /bin/bash + +moddir=$1; shift +KBUILD=$1; shift + +SOURCES_DIR= +case "$KBUILD" in + *linux-obj*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + */usr/src/linux-*-obj/*) + SOURCES_DIR=$(readlink -f $KBUILD 2>/dev/null | sed -e 's/-obj.*//g') + ;; + *) + SOURCES_DIR=$(readlink -f ${KBUILD/build/source}) + ;; +esac +if [ ! -e "$SOURCES_DIR" ]; then + SOURCES_DIR=$KBUILD +fi + +SIGN_FILE= +if [ -e "${KBUILD}/scripts/sign-file" ]; then + SIGN_FILE="${KBUILD}/scripts/sign-file" +elif [ -e "${SOURCES_DIR}/scripts/sign-file" ]; then + SIGN_FILE="${SOURCES_DIR}/scripts/sign-file" +else + echo "Error: Sign tool does not exist at '$KBUILD' or '$SOURCES_DIR' !" >&2 + exit 1 +fi +echo "Found Sign tool at: '${SIGN_FILE}'" + +if [ ! -e "${MODULE_SIGN_PRIV_KEY}" ]; then + echo "Error: MODULE_SIGN_PRIV_KEY is not set to valid path!" >&2 + exit 1 +fi +if [ ! -e "${MODULE_SIGN_PUB_KEY}" ]; then + echo "Error: MODULE_SIGN_PUB_KEY is not set to valid path!" >&2 + exit 1 +fi + +modules=`find $moddir -name '*.ko' -o -name '*.ko.gz'` +for mod in $modules +do + dir=`dirname $mod` + file=`basename $mod` + + ${SIGN_FILE} sha256 ${MODULE_SIGN_PRIV_KEY} ${MODULE_SIGN_PUB_KEY} ${dir}/${file} + rm -f ${dir}/${file}.{sig,dig} +done + +RANDOMMOD=$(find $moddir -type f -name '*.ko' -o -name '*.ko.gz' | sort -R | tail -n 1) +if [ "~Module signature appended~" != "$(tail -c 28 $RANDOMMOD)" ]; then + echo "*** Modules are unsigned! 
***" + exit 1 +fi + +exit 0 diff --git a/src/mlnx-ofa_kernel-5.8/drivers/scsi/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/scsi/Makefile new file mode 100644 index 0000000..e13f5ff --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/scsi/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SCSI_SRP_ATTRS) += scsi_transport_srp.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_priv.h b/src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_priv.h new file mode 100644 index 0000000..a0ee31d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_priv.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _SCSI_PRIV_H +#define _SCSI_PRIV_H + +#include +#include +#include + +struct bsg_device; +struct request_queue; +struct request; +struct scsi_cmnd; +struct scsi_device; +struct scsi_target; +struct scsi_host_template; +struct Scsi_Host; +struct scsi_nl_hdr; + +#define SCSI_CMD_RETRIES_NO_LIMIT -1 + +/* + * Scsi Error Handler Flags + */ +#define SCSI_EH_ABORT_SCHEDULED 0x0002 /* Abort has been scheduled */ + +#define SCSI_SENSE_VALID(scmd) \ + (((scmd)->sense_buffer[0] & 0x70) == 0x70) + +/* hosts.c */ +extern int scsi_init_hosts(void); +extern void scsi_exit_hosts(void); + +/* scsi.c */ +int scsi_init_sense_cache(struct Scsi_Host *shost); +void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd); +#ifdef CONFIG_SCSI_LOGGING +void scsi_log_send(struct scsi_cmnd *cmd); +void scsi_log_completion(struct scsi_cmnd *cmd, int disposition); +#else +static inline void scsi_log_send(struct scsi_cmnd *cmd) + { }; +static inline void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) + { }; +#endif + +/* scsi_devinfo.c */ + +/* list of keys for the lists */ +enum scsi_devinfo_key { + SCSI_DEVINFO_GLOBAL = 0, + SCSI_DEVINFO_SPI, +}; + +extern blist_flags_t scsi_get_device_flags(struct scsi_device *sdev, + const unsigned char *vendor, + const unsigned char *model); +extern blist_flags_t scsi_get_device_flags_keyed(struct scsi_device *sdev, + const unsigned char *vendor, + const unsigned char *model, + enum scsi_devinfo_key key); +extern int scsi_dev_info_list_add_keyed(int compatible, char *vendor, + char *model, char *strflags, + blist_flags_t flags, + enum scsi_devinfo_key key); +extern int scsi_dev_info_list_del_keyed(char *vendor, char *model, + enum scsi_devinfo_key key); +extern int scsi_dev_info_add_list(enum scsi_devinfo_key key, const char *name); +extern int scsi_dev_info_remove_list(enum scsi_devinfo_key key); + +extern int __init scsi_init_devinfo(void); +extern void scsi_exit_devinfo(void); + +/* scsi_error.c */ +extern void scmd_eh_abort_handler(struct work_struct *work); +extern enum blk_eh_timer_return scsi_times_out(struct request *req); +extern int scsi_error_handler(void *host); +extern enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *cmd); +extern void scsi_eh_wakeup(struct Scsi_Host *shost); +extern void scsi_eh_scmd_add(struct scsi_cmnd *); +void scsi_eh_ready_devs(struct Scsi_Host *shost, + struct list_head *work_q, + struct list_head *done_q); +int scsi_eh_get_sense(struct list_head *work_q, + struct list_head *done_q); +int scsi_noretry_cmd(struct scsi_cmnd *scmd); +void scsi_eh_done(struct scsi_cmnd *scmd); + +/* scsi_lib.c */ +extern int scsi_maybe_unblock_host(struct scsi_device *sdev); +extern void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd); +extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason); +extern void scsi_io_completion(struct scsi_cmnd *, unsigned int); +extern void scsi_run_host_queues(struct 
Scsi_Host *shost); +extern void scsi_requeue_run_queue(struct work_struct *work); +extern void scsi_start_queue(struct scsi_device *sdev); +extern int scsi_mq_setup_tags(struct Scsi_Host *shost); +extern void scsi_mq_free_tags(struct kref *kref); +extern void scsi_exit_queue(void); +extern void scsi_evt_thread(struct work_struct *work); + +/* scsi_proc.c */ +#ifdef CONFIG_SCSI_PROC_FS +extern void scsi_proc_hostdir_add(struct scsi_host_template *); +extern void scsi_proc_hostdir_rm(struct scsi_host_template *); +extern void scsi_proc_host_add(struct Scsi_Host *); +extern void scsi_proc_host_rm(struct Scsi_Host *); +extern int scsi_init_procfs(void); +extern void scsi_exit_procfs(void); +#else +# define scsi_proc_hostdir_add(sht) do { } while (0) +# define scsi_proc_hostdir_rm(sht) do { } while (0) +# define scsi_proc_host_add(shost) do { } while (0) +# define scsi_proc_host_rm(shost) do { } while (0) +# define scsi_init_procfs() (0) +# define scsi_exit_procfs() do { } while (0) +#endif /* CONFIG_PROC_FS */ + +/* scsi_scan.c */ +void scsi_enable_async_suspend(struct device *dev); +extern int scsi_complete_async_scans(void); +extern int scsi_scan_host_selected(struct Scsi_Host *, unsigned int, + unsigned int, u64, enum scsi_scan_mode); +extern void scsi_forget_host(struct Scsi_Host *); +extern void scsi_rescan_device(struct device *); + +/* scsi_sysctl.c */ +#ifdef CONFIG_SYSCTL +extern int scsi_init_sysctl(void); +extern void scsi_exit_sysctl(void); +#else +# define scsi_init_sysctl() (0) +# define scsi_exit_sysctl() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +/* scsi_sysfs.c */ +extern int scsi_sysfs_add_sdev(struct scsi_device *); +extern int scsi_sysfs_add_host(struct Scsi_Host *); +extern int scsi_sysfs_register(void); +extern void scsi_sysfs_unregister(void); +extern void scsi_sysfs_device_initialize(struct scsi_device *); +extern int scsi_sysfs_target_initialize(struct scsi_device *); +extern struct scsi_transport_template blank_transport_template; +extern void __scsi_remove_device(struct scsi_device *); + +extern struct bus_type scsi_bus_type; +extern const struct attribute_group *scsi_shost_groups[]; + +/* scsi_netlink.c */ +#ifdef CONFIG_SCSI_NETLINK +extern void scsi_netlink_init(void); +extern void scsi_netlink_exit(void); +extern struct sock *scsi_nl_sock; +#else +static inline void scsi_netlink_init(void) {} +static inline void scsi_netlink_exit(void) {} +#endif + +/* scsi_pm.c */ +#ifdef CONFIG_PM +extern const struct dev_pm_ops scsi_bus_pm_ops; + +extern void scsi_autopm_get_target(struct scsi_target *); +extern void scsi_autopm_put_target(struct scsi_target *); +extern int scsi_autopm_get_host(struct Scsi_Host *); +extern void scsi_autopm_put_host(struct Scsi_Host *); +#else +static inline void scsi_autopm_get_target(struct scsi_target *t) {} +static inline void scsi_autopm_put_target(struct scsi_target *t) {} +static inline int scsi_autopm_get_host(struct Scsi_Host *h) { return 0; } +static inline void scsi_autopm_put_host(struct Scsi_Host *h) {} +#endif /* CONFIG_PM */ + +/* scsi_dh.c */ +#ifdef CONFIG_SCSI_DH +void scsi_dh_add_device(struct scsi_device *sdev); +void scsi_dh_release_device(struct scsi_device *sdev); +#else +static inline void scsi_dh_add_device(struct scsi_device *sdev) { } +static inline void scsi_dh_release_device(struct scsi_device *sdev) { } +#endif + +struct bsg_device *scsi_bsg_register_queue(struct scsi_device *sdev); + +extern int scsi_device_max_queue_depth(struct scsi_device *sdev); + +/* + * internal scsi timeout functions: for use by 
mid-layer and transport + * classes. + */ + +#define SCSI_DEVICE_BLOCK_MAX_TIMEOUT 600 /* units in seconds */ + +#endif /* _SCSI_PRIV_H */ diff --git a/src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_transport_srp.c b/src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_transport_srp.c new file mode 100644 index 0000000..0e2991d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/scsi/scsi_transport_srp.c @@ -0,0 +1,905 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SCSI RDMA (SRP) transport class + * + * Copyright (C) 2007 FUJITA Tomonori + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "scsi_priv.h" + +struct srp_host_attrs { + atomic_t next_port_id; +}; +#define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) + +#define SRP_HOST_ATTRS 0 +#define SRP_RPORT_ATTRS 8 + +struct srp_internal { + struct scsi_transport_template t; + struct srp_function_template *f; + + struct device_attribute *host_attrs[SRP_HOST_ATTRS + 1]; + + struct device_attribute *rport_attrs[SRP_RPORT_ATTRS + 1]; + struct transport_container rport_attr_cont; +}; + +static int scsi_is_srp_rport(const struct device *dev); + +#define to_srp_internal(tmpl) container_of(tmpl, struct srp_internal, t) + +#define dev_to_rport(d) container_of(d, struct srp_rport, dev) +#define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent) +static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) +{ + return dev_to_shost(r->dev.parent); +} + +static int find_child_rport(struct device *dev, void *data) +{ + struct device **child = data; + + if (scsi_is_srp_rport(dev)) { + WARN_ON_ONCE(*child); + *child = dev; + } + return 0; +} + +static inline struct srp_rport *shost_to_rport(struct Scsi_Host *shost) +{ + struct device *child = NULL; + + WARN_ON_ONCE(device_for_each_child(&shost->shost_gendev, &child, + find_child_rport) < 0); + return child ? dev_to_rport(child) : NULL; +} + +/** + * srp_tmo_valid() - check timeout combination validity + * @reconnect_delay: Reconnect delay in seconds. + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds. + * @dev_loss_tmo: Device loss timeout in seconds. + * + * The combination of the timeout parameters must be such that SCSI commands + * are finished in a reasonable time. Hence do not allow the fast I/O fail + * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT nor allow dev_loss_tmo to + * exceed that limit if failing I/O fast has been disabled. Furthermore, these + * parameters must be such that multipath can detect failed paths timely. + * Hence do not allow all three parameters to be disabled simultaneously. 
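+ *
+ * As an illustration (the numbers are arbitrary): reconnect_delay=20,
+ * fast_io_fail_tmo=15 and dev_loss_tmo=600 passes every check below, while
+ * fast_io_fail_tmo=600 with dev_loss_tmo=600 is rejected since failing I/O
+ * fast must happen strictly before the device loss timeout expires.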
+ */ +int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, long dev_loss_tmo) +{ + if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + return -EINVAL; + if (reconnect_delay == 0) + return -EINVAL; + if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (fast_io_fail_tmo < 0 && + dev_loss_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (dev_loss_tmo >= LONG_MAX / HZ) + return -EINVAL; + if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 && + fast_io_fail_tmo >= dev_loss_tmo) + return -EINVAL; + if (fast_io_fail_tmo > 0 && reconnect_delay > 0 && + fast_io_fail_tmo >= reconnect_delay) + return -EINVAL; + if (fast_io_fail_tmo < 0 && reconnect_delay > 0) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(srp_tmo_valid); + +static int srp_host_setup(struct transport_container *tc, struct device *dev, + struct device *cdev) +{ + struct Scsi_Host *shost = dev_to_shost(dev); + struct srp_host_attrs *srp_host = to_srp_host_attrs(shost); + + atomic_set(&srp_host->next_port_id, 0); + return 0; +} + +static DECLARE_TRANSPORT_CLASS(srp_host_class, "srp_host", srp_host_setup, + NULL, NULL); + +static DECLARE_TRANSPORT_CLASS(srp_rport_class, "srp_remote_ports", + NULL, NULL, NULL); + +static ssize_t +show_srp_rport_id(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + return sprintf(buf, "%16phC\n", rport->port_id); +} + +static DEVICE_ATTR(port_id, S_IRUGO, show_srp_rport_id, NULL); + +static const struct { + u32 value; + char *name; +} srp_rport_role_names[] = { + {SRP_RPORT_ROLE_INITIATOR, "SRP Initiator"}, + {SRP_RPORT_ROLE_TARGET, "SRP Target"}, +}; + +static ssize_t +show_srp_rport_roles(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int i; + char *name = NULL; + + for (i = 0; i < ARRAY_SIZE(srp_rport_role_names); i++) + if (srp_rport_role_names[i].value == rport->roles) { + name = srp_rport_role_names[i].name; + break; + } + return sprintf(buf, "%s\n", name ? : "unknown"); +} + +static DEVICE_ATTR(roles, S_IRUGO, show_srp_rport_roles, NULL); + +static ssize_t store_srp_rport_delete(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + struct Scsi_Host *shost = dev_to_shost(dev); + struct srp_internal *i = to_srp_internal(shost->transportt); + + if (i->f->rport_delete) { + i->f->rport_delete(rport); + return count; + } else { + return -ENOSYS; + } +} + +static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete); + +static ssize_t show_srp_rport_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + static const char *const state_name[] = { + [SRP_RPORT_RUNNING] = "running", + [SRP_RPORT_BLOCKED] = "blocked", + [SRP_RPORT_FAIL_FAST] = "fail-fast", + [SRP_RPORT_LOST] = "lost", + }; + struct srp_rport *rport = transport_class_to_srp_rport(dev); + enum srp_rport_state state = rport->state; + + return sprintf(buf, "%s\n", + (unsigned)state < ARRAY_SIZE(state_name) ? + state_name[state] : "???"); +} + +static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL); + +static ssize_t srp_show_tmo(char *buf, int tmo) +{ + return tmo >= 0 ? 
sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n"); +} + +int srp_parse_tmo(int *tmo, const char *buf) +{ + int res = 0; + + if (strncmp(buf, "off", 3) != 0) + res = kstrtoint(buf, 0, tmo); + else + *tmo = -1; + + return res; +} +EXPORT_SYMBOL(srp_parse_tmo); + +static ssize_t show_reconnect_delay(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->reconnect_delay); +} + +static ssize_t store_reconnect_delay(struct device *dev, + struct device_attribute *attr, + const char *buf, const size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res, delay; + + res = srp_parse_tmo(&delay, buf); + if (res) + goto out; + res = srp_tmo_valid(delay, rport->fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + + if (rport->reconnect_delay <= 0 && delay > 0 && + rport->state != SRP_RPORT_RUNNING) { + queue_delayed_work(system_long_wq, &rport->reconnect_work, + delay * HZ); + } else if (delay <= 0) { + cancel_delayed_work(&rport->reconnect_work); + } + rport->reconnect_delay = delay; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay, + store_reconnect_delay); + +static ssize_t show_failed_reconnects(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return sprintf(buf, "%d\n", rport->failed_reconnects); +} + +static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL); + +static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->fast_io_fail_tmo); +} + +static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int fast_io_fail_tmo; + + res = srp_parse_tmo(&fast_io_fail_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + rport->fast_io_fail_tmo = fast_io_fail_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_fast_io_fail_tmo, + store_srp_rport_fast_io_fail_tmo); + +static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->dev_loss_tmo); +} + +static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int dev_loss_tmo; + + res = srp_parse_tmo(&dev_loss_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo, + dev_loss_tmo); + if (res) + goto out; + rport->dev_loss_tmo = dev_loss_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_dev_loss_tmo, + store_srp_rport_dev_loss_tmo); + +static int srp_rport_set_state(struct srp_rport *rport, + enum srp_rport_state new_state) +{ + enum srp_rport_state old_state = rport->state; + + lockdep_assert_held(&rport->mutex); + + switch (new_state) { + case SRP_RPORT_RUNNING: 
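+		/* Every state except SRP_RPORT_LOST may go (back) to RUNNING. */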
+ switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_BLOCKED: + switch (old_state) { + case SRP_RPORT_RUNNING: + break; + default: + goto invalid; + } + break; + case SRP_RPORT_FAIL_FAST: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_LOST: + break; + } + rport->state = new_state; + return 0; + +invalid: + return -EINVAL; +} + +/** + * srp_reconnect_work() - reconnect and schedule a new attempt if necessary + * @work: Work structure used for scheduling this operation. + */ +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, reconnect_work); + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, res; + + res = srp_reconnect_rport(rport); + if (res != 0) { + shost_printk(KERN_ERR, shost, + "reconnect attempt %d failed (%d)\n", + ++rport->failed_reconnects, res); + delay = rport->reconnect_delay * + min(100, max(1, rport->failed_reconnects - 10)); + if (delay > 0) + queue_delayed_work(system_long_wq, + &rport->reconnect_work, delay * HZ); + } +} + +/* + * scsi_target_block() must have been called before this function is + * called to guarantee that no .queuecommand() calls are in progress. + */ +static void __rport_fail_io_fast(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i; + + lockdep_assert_held(&rport->mutex); + + if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST)) + return; + + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + + /* Involve the LLD if possible to terminate all I/O on the rport. */ + i = to_srp_internal(shost->transportt); + if (i->f->terminate_rport_io) + i->f->terminate_rport_io(rport); +} + +/** + * rport_fast_io_fail_timedout() - fast I/O failure timeout handler + * @work: Work structure used for scheduling this operation. + */ +static void rport_fast_io_fail_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, fast_io_fail_work); + struct Scsi_Host *shost = rport_to_shost(rport); + + pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + mutex_unlock(&rport->mutex); +} + +/** + * rport_dev_loss_timedout() - device loss timeout handler + * @work: Work structure used for scheduling this operation. 
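+ *
+ * Marks the rport as lost, unblocks its SCSI target (moving its devices to
+ * the transport-offline state) and asks the LLD to delete the remote port.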
+ */ +static void rport_dev_loss_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, dev_loss_work); + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_info("dev_loss_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0); + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + mutex_unlock(&rport->mutex); + + i->f->rport_delete(rport); +} + +static void __srp_start_tl_fail_timers(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, fast_io_fail_tmo, dev_loss_tmo; + + lockdep_assert_held(&rport->mutex); + + delay = rport->reconnect_delay; + fast_io_fail_tmo = rport->fast_io_fail_tmo; + dev_loss_tmo = rport->dev_loss_tmo; + pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev), + rport->state); + + if (rport->state == SRP_RPORT_LOST) + return; + if (delay > 0) + queue_delayed_work(system_long_wq, &rport->reconnect_work, + 1UL * delay * HZ); + if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) && + srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { + pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev), + rport->state); + scsi_target_block(&shost->shost_gendev); + if (fast_io_fail_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->fast_io_fail_work, + 1UL * fast_io_fail_tmo * HZ); + if (dev_loss_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->dev_loss_work, + 1UL * dev_loss_tmo * HZ); + } +} + +/** + * srp_start_tl_fail_timers() - start the transport layer failure timers + * @rport: SRP target port. + * + * Start the transport layer fast I/O failure and device loss timers. Do not + * modify a timer that was already started. + */ +void srp_start_tl_fail_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + __srp_start_tl_fail_timers(rport); + mutex_unlock(&rport->mutex); +} +EXPORT_SYMBOL(srp_start_tl_fail_timers); + +/** + * srp_reconnect_rport() - reconnect to an SRP target port + * @rport: SRP target port. + * + * Blocks SCSI command queueing before invoking reconnect() such that + * queuecommand() won't be invoked concurrently with reconnect() from outside + * the SCSI EH. This is important since a reconnect() implementation may + * reallocate resources needed by queuecommand(). + * + * Notes: + * - This function neither waits until outstanding requests have finished nor + * tries to abort these. It is the responsibility of the reconnect() + * function to finish outstanding commands before reconnecting to the target + * port. + * - It is the responsibility of the caller to ensure that the resources + * reallocated by the reconnect() function won't be used while this function + * is in progress. One possible strategy is to invoke this function from + * the context of the SCSI EH thread only. Another possible strategy is to + * lock the rport mutex inside each SCSI LLD callback that can be invoked by + * the SCSI EH (the scsi_host_template.eh_*() functions and also the + * scsi_host_template.queuecommand() function). 
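+ *
+ * Returns 0 if reconnecting succeeded and a negative value otherwise, e.g.
+ * -ENODEV once the rport has been marked lost.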
+ */ +int srp_reconnect_rport(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + struct scsi_device *sdev; + int res; + + pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev)); + + res = mutex_lock_interruptible(&rport->mutex); + if (res) + goto out; + if (rport->state != SRP_RPORT_FAIL_FAST && rport->state != SRP_RPORT_LOST) + /* + * sdev state must be SDEV_TRANSPORT_OFFLINE, transition + * to SDEV_BLOCK is illegal. Calling scsi_target_unblock() + * later is ok though, scsi_internal_device_unblock_nowait() + * treats SDEV_TRANSPORT_OFFLINE like SDEV_BLOCK. + */ + scsi_target_block(&shost->shost_gendev); + res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV; + pr_debug("%s (state %d): transport.reconnect() returned %d\n", + dev_name(&shost->shost_gendev), rport->state, res); + if (res == 0) { + cancel_delayed_work(&rport->fast_io_fail_work); + cancel_delayed_work(&rport->dev_loss_work); + + rport->failed_reconnects = 0; + srp_rport_set_state(rport, SRP_RPORT_RUNNING); + scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); + /* + * If the SCSI error handler has offlined one or more devices, + * invoking scsi_target_unblock() won't change the state of + * these devices into running so do that explicitly. + */ + shost_for_each_device(sdev, shost) { + mutex_lock(&sdev->state_mutex); + if (sdev->sdev_state == SDEV_OFFLINE) + sdev->sdev_state = SDEV_RUNNING; + mutex_unlock(&sdev->state_mutex); + } + } else if (rport->state == SRP_RPORT_RUNNING) { + /* + * srp_reconnect_rport() has been invoked with fast_io_fail + * and dev_loss off. Mark the port as failed and start the TL + * failure timers if these had not yet been started. + */ + __rport_fail_io_fast(rport); + __srp_start_tl_fail_timers(rport); + } else if (rport->state != SRP_RPORT_BLOCKED) { + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + } + mutex_unlock(&rport->mutex); + +out: + return res; +} +EXPORT_SYMBOL(srp_reconnect_rport); + +/** + * srp_timed_out() - SRP transport intercept of the SCSI timeout EH + * @scmd: SCSI command. + * + * If a timeout occurs while an rport is in the blocked state, ask the SCSI + * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core + * handle the timeout (BLK_EH_DONE). + * + * Note: This function is called from soft-IRQ context and with the request + * queue lock held. + */ +enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) +{ + struct scsi_device *sdev = scmd->device; + struct Scsi_Host *shost = sdev->host; + struct srp_internal *i = to_srp_internal(shost->transportt); + struct srp_rport *rport = shost_to_rport(shost); + + pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev)); + return rport && rport->fast_io_fail_tmo < 0 && + rport->dev_loss_tmo < 0 && + i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? 
+ BLK_EH_RESET_TIMER : BLK_EH_DONE; +} +EXPORT_SYMBOL(srp_timed_out); + +static void srp_rport_release(struct device *dev) +{ + struct srp_rport *rport = dev_to_rport(dev); + + put_device(dev->parent); + kfree(rport); +} + +static int scsi_is_srp_rport(const struct device *dev) +{ + return dev->release == srp_rport_release; +} + +static int srp_rport_match(struct attribute_container *cont, + struct device *dev) +{ + struct Scsi_Host *shost; + struct srp_internal *i; + + if (!scsi_is_srp_rport(dev)) + return 0; + + shost = dev_to_shost(dev->parent); + if (!shost->transportt) + return 0; + if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) + return 0; + + i = to_srp_internal(shost->transportt); + return &i->rport_attr_cont.ac == cont; +} + +static int srp_host_match(struct attribute_container *cont, struct device *dev) +{ + struct Scsi_Host *shost; + struct srp_internal *i; + + if (!scsi_is_host_device(dev)) + return 0; + + shost = dev_to_shost(dev); + if (!shost->transportt) + return 0; + if (shost->transportt->host_attrs.ac.class != &srp_host_class.class) + return 0; + + i = to_srp_internal(shost->transportt); + return &i->t.host_attrs.ac == cont; +} + +/** + * srp_rport_get() - increment rport reference count + * @rport: SRP target port. + */ +void srp_rport_get(struct srp_rport *rport) +{ + get_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_get); + +/** + * srp_rport_put() - decrement rport reference count + * @rport: SRP target port. + */ +void srp_rport_put(struct srp_rport *rport) +{ + put_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_put); + +/** + * srp_rport_add - add a SRP remote port to the device hierarchy + * @shost: scsi host the remote port is connected to. + * @ids: The port id for the remote port. + * + * Publishes a port to the rest of the system. + */ +struct srp_rport *srp_rport_add(struct Scsi_Host *shost, + struct srp_rport_identifiers *ids) +{ + struct srp_rport *rport; + struct device *parent = &shost->shost_gendev; + struct srp_internal *i = to_srp_internal(shost->transportt); + int id, ret; + + rport = kzalloc(sizeof(*rport), GFP_KERNEL); + if (!rport) + return ERR_PTR(-ENOMEM); + + mutex_init(&rport->mutex); + + device_initialize(&rport->dev); + + rport->dev.parent = get_device(parent); + rport->dev.release = srp_rport_release; + + memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); + rport->roles = ids->roles; + + if (i->f->reconnect) + rport->reconnect_delay = i->f->reconnect_delay ? + *i->f->reconnect_delay : 10; + INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work); + rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? + *i->f->fast_io_fail_tmo : 15; + rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; + INIT_DELAYED_WORK(&rport->fast_io_fail_work, + rport_fast_io_fail_timedout); + INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout); + + id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id); + dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id); + + transport_setup_device(&rport->dev); + + ret = device_add(&rport->dev); + if (ret) { + transport_destroy_device(&rport->dev); + put_device(&rport->dev); + return ERR_PTR(ret); + } + + transport_add_device(&rport->dev); + transport_configure_device(&rport->dev); + + return rport; +} +EXPORT_SYMBOL_GPL(srp_rport_add); + +/** + * srp_rport_del - remove a SRP remote port + * @rport: SRP remote port to remove + * + * Removes the specified SRP remote port. 
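+ *
+ * After device_del() the final put_device() drops the rport's own reference;
+ * the rport memory is freed from srp_rport_release() once the last reference
+ * is gone.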
+ */ +void srp_rport_del(struct srp_rport *rport) +{ + struct device *dev = &rport->dev; + + transport_remove_device(dev); + device_del(dev); + transport_destroy_device(dev); + + put_device(dev); +} +EXPORT_SYMBOL_GPL(srp_rport_del); + +static int do_srp_rport_del(struct device *dev, void *data) +{ + if (scsi_is_srp_rport(dev)) + srp_rport_del(dev_to_rport(dev)); + return 0; +} + +/** + * srp_remove_host - tear down a Scsi_Host's SRP data structures + * @shost: Scsi Host that is torn down + * + * Removes all SRP remote ports for a given Scsi_Host. + * Must be called just before scsi_remove_host for SRP HBAs. + */ +void srp_remove_host(struct Scsi_Host *shost) +{ + device_for_each_child(&shost->shost_gendev, NULL, do_srp_rport_del); +} +EXPORT_SYMBOL_GPL(srp_remove_host); + +/** + * srp_stop_rport_timers - stop the transport layer recovery timers + * @rport: SRP remote port for which to stop the timers. + * + * Must be called after srp_remove_host() and scsi_remove_host(). The caller + * must hold a reference on the rport (rport->dev) and on the SCSI host + * (rport->dev.parent). + */ +void srp_stop_rport_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + srp_rport_set_state(rport, SRP_RPORT_LOST); + mutex_unlock(&rport->mutex); + + cancel_delayed_work_sync(&rport->reconnect_work); + cancel_delayed_work_sync(&rport->fast_io_fail_work); + cancel_delayed_work_sync(&rport->dev_loss_work); +} +EXPORT_SYMBOL_GPL(srp_stop_rport_timers); + +/** + * srp_attach_transport - instantiate SRP transport template + * @ft: SRP transport class function template + */ +struct scsi_transport_template * +srp_attach_transport(struct srp_function_template *ft) +{ + int count; + struct srp_internal *i; + + i = kzalloc(sizeof(*i), GFP_KERNEL); + if (!i) + return NULL; + + i->t.host_size = sizeof(struct srp_host_attrs); + i->t.host_attrs.ac.attrs = &i->host_attrs[0]; + i->t.host_attrs.ac.class = &srp_host_class.class; + i->t.host_attrs.ac.match = srp_host_match; + i->host_attrs[0] = NULL; + transport_container_register(&i->t.host_attrs); + + i->rport_attr_cont.ac.attrs = &i->rport_attrs[0]; + i->rport_attr_cont.ac.class = &srp_rport_class.class; + i->rport_attr_cont.ac.match = srp_rport_match; + + count = 0; + i->rport_attrs[count++] = &dev_attr_port_id; + i->rport_attrs[count++] = &dev_attr_roles; + if (ft->has_rport_state) { + i->rport_attrs[count++] = &dev_attr_state; + i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; + i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; + } + if (ft->reconnect) { + i->rport_attrs[count++] = &dev_attr_reconnect_delay; + i->rport_attrs[count++] = &dev_attr_failed_reconnects; + } + if (ft->rport_delete) + i->rport_attrs[count++] = &dev_attr_delete; + i->rport_attrs[count++] = NULL; + BUG_ON(count > ARRAY_SIZE(i->rport_attrs)); + + transport_container_register(&i->rport_attr_cont); + + i->f = ft; + + return &i->t; +} +EXPORT_SYMBOL_GPL(srp_attach_transport); + +/** + * srp_release_transport - release SRP transport template instance + * @t: transport template instance + */ +void srp_release_transport(struct scsi_transport_template *t) +{ + struct srp_internal *i = to_srp_internal(t); + + transport_container_unregister(&i->t.host_attrs); + transport_container_unregister(&i->rport_attr_cont); + + kfree(i); +} +EXPORT_SYMBOL_GPL(srp_release_transport); + +static __init int srp_transport_init(void) +{ + int ret; + + ret = transport_class_register(&srp_host_class); + if (ret) + return ret; + ret = 
transport_class_register(&srp_rport_class); + if (ret) + goto unregister_host_class; + + return 0; +unregister_host_class: + transport_class_unregister(&srp_host_class); + return ret; +} + +static void __exit srp_transport_exit(void) +{ + transport_class_unregister(&srp_host_class); + transport_class_unregister(&srp_rport_class); +} + +MODULE_AUTHOR("FUJITA Tomonori"); +MODULE_DESCRIPTION("SRP Transport Attributes"); +MODULE_LICENSE("GPL"); + +module_init(srp_transport_init); +module_exit(srp_transport_exit); diff --git a/src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/Makefile b/src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/Makefile new file mode 100644 index 0000000..c47ca75 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/Makefile @@ -0,0 +1,3 @@ +obj-m += mlx5_vdpa.o + +mlx5_vdpa-y := vdpa_main.o diff --git a/src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/vdpa_main.c b/src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/vdpa_main.c new file mode 100644 index 0000000..d58a347 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/drivers/vdpa/mlx5/vdpa_main.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "mlx5_vdpa" +#define DRV_VERSION "2.31" + +MODULE_AUTHOR("Mohammad Kabat"); +MODULE_DESCRIPTION("mlx5_vdpa dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init vdpa_init(void) +{ + return 0; +} + +static void __exit vdpa_cleanup(void) +{ +} + +module_init(vdpa_init); +module_exit(vdpa_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/fs/cifs/Makefile b/src/mlnx-ofa_kernel-5.8/fs/cifs/Makefile new file mode 100644 index 0000000..c4f8f39 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/fs/cifs/Makefile @@ -0,0 +1,3 @@ +obj-m += cifs.o + +cifs-y := cifs_main.o diff --git a/src/mlnx-ofa_kernel-5.8/fs/cifs/cifs_main.c b/src/mlnx-ofa_kernel-5.8/fs/cifs/cifs_main.c new file mode 100644 index 0000000..65ade02 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/fs/cifs/cifs_main.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define DRV_NAME "cifs" +#define DRV_VERSION "2.31" + +MODULE_AUTHOR("Mohammad Kabat"); +MODULE_DESCRIPTION("cifs dummy kernel module"); +MODULE_LICENSE("Dual BSD/GPL"); +#ifdef RETPOLINE_MLNX +MODULE_INFO(retpoline, "Y"); +#endif +MODULE_VERSION(DRV_VERSION); + +static int __init cifs_init(void) +{ + return 0; +} + +static void __exit cifs_cleanup(void) +{ +} + +module_init(cifs_init); +module_exit(cifs_cleanup); diff --git a/src/mlnx-ofa_kernel-5.8/include/asm-generic/bug.h b/src/mlnx-ofa_kernel-5.8/include/asm-generic/bug.h new file mode 100644 index 0000000..b8d9a9a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/asm-generic/bug.h @@ -0,0 +1,12 @@ +#ifndef _COMPAT_ASM_GENERIC_BUG_H +#define _COMPAT_ASM_GENERIC_BUG_H + +#include "../../compat/config.h" + +#include_next + +#ifndef CUT_HERE +#define CUT_HERE "------------[ cut here ]------------\n" +#endif + +#endif /* _COMPAT_ASM_GENERIC_BUG_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/auxiliary_bus.h b/src/mlnx-ofa_kernel-5.8/include/linux/auxiliary_bus.h new file mode 100644 index 0000000..de21d9d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/auxiliary_bus.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2019-2020 Intel Corporation + * + * Please see Documentation/driver-api/auxiliary_bus.rst for more information. + */ + +#ifndef _AUXILIARY_BUS_H_ +#define _AUXILIARY_BUS_H_ + +#include +#include + +/** + * DOC: DEVICE_LIFESPAN + * + * The registering driver is the entity that allocates memory for the + * auxiliary_device and registers it on the auxiliary bus. It is important to + * note that, as opposed to the platform bus, the registering driver is wholly + * responsible for the management of the memory used for the device object. + * + * To be clear the memory for the auxiliary_device is freed in the release() + * callback defined by the registering driver. The registering driver should + * only call auxiliary_device_delete() and then auxiliary_device_uninit() when + * it is done with the device. The release() function is then automatically + * called if and when other code releases their reference to the devices. 
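+ *
+ * As a sketch of that pattern (the struct and function names here are purely
+ * illustrative, not part of the API), a parent object embedding the
+ * auxiliary_device is typically freed from its release() callback:
+ *
+ * .. code-block:: c
+ *
+ *	struct my_parent {
+ *		struct auxiliary_device adev;
+ *		struct my_shared_state *shared;
+ *	};
+ *
+ *	static void my_parent_release(struct device *dev)
+ *	{
+ *		struct my_parent *parent =
+ *			container_of(dev, struct my_parent, adev.dev);
+ *
+ *		kfree(parent);
+ *	}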
+ * + * A parent object, defined in the shared header file, contains the + * auxiliary_device. It also contains a pointer to the shared object(s), which + * also is defined in the shared header. Both the parent object and the shared + * object(s) are allocated by the registering driver. This layout allows the + * auxiliary_driver's registering module to perform a container_of() call to go + * from the pointer to the auxiliary_device, that is passed during the call to + * the auxiliary_driver's probe function, up to the parent object, and then + * have access to the shared object(s). + * + * The memory for the shared object(s) must have a lifespan equal to, or + * greater than, the lifespan of the memory for the auxiliary_device. The + * auxiliary_driver should only consider that the shared object is valid as + * long as the auxiliary_device is still registered on the auxiliary bus. It + * is up to the registering driver to manage (e.g. free or keep available) the + * memory for the shared object beyond the life of the auxiliary_device. + * + * The registering driver must unregister all auxiliary devices before its own + * driver.remove() is completed. An easy way to ensure this is to use the + * devm_add_action_or_reset() call to register a function against the parent + * device which unregisters the auxiliary device object(s). + * + * Finally, any operations which operate on the auxiliary devices must continue + * to function (if only to return an error) after the registering driver + * unregisters the auxiliary device. + */ + +/** + * struct auxiliary_device - auxiliary device object. + * @dev: Device, + * The release and parent fields of the device structure must be filled + * in + * @name: Match name found by the auxiliary device driver, + * @id: unique identitier if multiple devices of the same name are exported, + * + * An auxiliary_device represents a part of its parent device's functionality. + * It is given a name that, combined with the registering drivers + * KBUILD_MODNAME, creates a match_name that is used for driver binding, and an + * id that combined with the match_name provide a unique name to register with + * the bus subsystem. For example, a driver registering an auxiliary device is + * named 'foo_mod.ko' and the subdevice is named 'foo_dev'. The match name is + * therefore 'foo_mod.foo_dev'. + * + * Registering an auxiliary_device is a three-step process. + * + * First, a 'struct auxiliary_device' needs to be defined or allocated for each + * sub-device desired. The name, id, dev.release, and dev.parent fields of + * this structure must be filled in as follows. + * + * The 'name' field is to be given a name that is recognized by the auxiliary + * driver. If two auxiliary_devices with the same match_name, eg + * "foo_mod.foo_dev", are registered onto the bus, they must have unique id + * values (e.g. "x" and "y") so that the registered devices names are + * "foo_mod.foo_dev.x" and "foo_mod.foo_dev.y". If match_name + id are not + * unique, then the device_add fails and generates an error message. + * + * The auxiliary_device.dev.type.release or auxiliary_device.dev.release must + * be populated with a non-NULL pointer to successfully register the + * auxiliary_device. This release call is where resources associated with the + * auxiliary device must be free'ed. Because once the device is placed on the + * bus the parent driver can not tell what other code may have a reference to + * this data. + * + * The auxiliary_device.dev.parent should be set. 
Typically to the registering + * drivers device. + * + * Second, call auxiliary_device_init(), which checks several aspects of the + * auxiliary_device struct and performs a device_initialize(). After this step + * completes, any error state must have a call to auxiliary_device_uninit() in + * its resolution path. + * + * The third and final step in registering an auxiliary_device is to perform a + * call to auxiliary_device_add(), which sets the name of the device and adds + * the device to the bus. + * + * .. code-block:: c + * + * #define MY_DEVICE_NAME "foo_dev" + * + * ... + * + * struct auxiliary_device *my_aux_dev = my_aux_dev_alloc(xxx); + * + * // Step 1: + * my_aux_dev->name = MY_DEVICE_NAME; + * my_aux_dev->id = my_unique_id_alloc(xxx); + * my_aux_dev->dev.release = my_aux_dev_release; + * my_aux_dev->dev.parent = my_dev; + * + * // Step 2: + * if (auxiliary_device_init(my_aux_dev)) + * goto fail; + * + * // Step 3: + * if (auxiliary_device_add(my_aux_dev)) { + * auxiliary_device_uninit(my_aux_dev); + * goto fail; + * } + * + * ... + * + * + * Unregistering an auxiliary_device is a two-step process to mirror the + * register process. First call auxiliary_device_delete(), then call + * auxiliary_device_uninit(). + * + * .. code-block:: c + * + * auxiliary_device_delete(my_dev->my_aux_dev); + * auxiliary_device_uninit(my_dev->my_aux_dev); + */ +struct auxiliary_device { + struct device dev; + const char *name; + u32 id; +}; + +/** + * struct auxiliary_driver - Definition of an auxiliary bus driver + * @probe: Called when a matching device is added to the bus. + * @remove: Called when device is removed from the bus. + * @shutdown: Called at shut-down time to quiesce the device. + * @suspend: Called to put the device to sleep mode. Usually to a power state. + * @resume: Called to bring a device from sleep mode. + * @name: Driver name. + * @driver: Core driver structure. + * @id_table: Table of devices this driver should match on the bus. + * + * Auxiliary drivers follow the standard driver model convention, where + * discovery/enumeration is handled by the core, and drivers provide probe() + * and remove() methods. They support power management and shutdown + * notifications using the standard conventions. + * + * Auxiliary drivers register themselves with the bus by calling + * auxiliary_driver_register(). The id_table contains the match_names of + * auxiliary devices that a driver can bind with. + * + * .. 
code-block:: c + * + * static const struct auxiliary_device_id my_auxiliary_id_table[] = { + * { .name = "foo_mod.foo_dev" }, + * {}, + * }; + * + * MODULE_DEVICE_TABLE(auxiliary, my_auxiliary_id_table); + * + * struct auxiliary_driver my_drv = { + * .name = "myauxiliarydrv", + * .id_table = my_auxiliary_id_table, + * .probe = my_drv_probe, + * .remove = my_drv_remove + * }; + */ +struct auxiliary_driver { + int (*probe)(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id); + void (*remove)(struct auxiliary_device *auxdev); + void (*shutdown)(struct auxiliary_device *auxdev); + int (*suspend)(struct auxiliary_device *auxdev, pm_message_t state); + int (*resume)(struct auxiliary_device *auxdev); + const char *name; + struct device_driver driver; + const struct auxiliary_device_id *id_table; +}; + +static inline void *auxiliary_get_drvdata(struct auxiliary_device *auxdev) +{ + return dev_get_drvdata(&auxdev->dev); +} + +static inline void auxiliary_set_drvdata(struct auxiliary_device *auxdev, void *data) +{ + dev_set_drvdata(&auxdev->dev, data); +} + +static inline struct auxiliary_device *to_auxiliary_dev(struct device *dev) +{ + return container_of(dev, struct auxiliary_device, dev); +} + +static inline struct auxiliary_driver *to_auxiliary_drv(struct device_driver *drv) +{ + return container_of(drv, struct auxiliary_driver, driver); +} + +int auxiliary_device_init(struct auxiliary_device *auxdev); +int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname); +#define auxiliary_device_add(auxdev) __auxiliary_device_add(auxdev, KBUILD_MODNAME) + +static inline void auxiliary_device_uninit(struct auxiliary_device *auxdev) +{ + put_device(&auxdev->dev); +} + +static inline void auxiliary_device_delete(struct auxiliary_device *auxdev) +{ + device_del(&auxdev->dev); +} + +int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module *owner, + const char *modname); +#define auxiliary_driver_register(auxdrv) \ + __auxiliary_driver_register(auxdrv, THIS_MODULE, KBUILD_MODNAME) + +void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); + +/** + * module_auxiliary_driver() - Helper macro for registering an auxiliary driver + * @__auxiliary_driver: auxiliary driver struct + * + * Helper macro for auxiliary drivers which do not do anything special in + * module init/exit. This eliminates a lot of boilerplate. Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + * + * .. 
code-block:: c + * + * module_auxiliary_driver(my_drv); + */ +#define module_auxiliary_driver(__auxiliary_driver) \ + module_driver(__auxiliary_driver, auxiliary_driver_register, auxiliary_driver_unregister) + +struct auxiliary_device *auxiliary_find_device(struct device *start, + const void *data, + int (*match)(struct device *dev, const void *data)); + +#endif /* _AUXILIARY_BUS_H_ */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/bit.h b/src/mlnx-ofa_kernel-5.8/include/linux/bit.h new file mode 100644 index 0000000..cb0cfdd --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/bit.h @@ -0,0 +1,14 @@ +#ifndef _COMPAT_LINUX_BIT_H +#define _COMPAT_LINUX_BIT_H + +/* Include the autogenerated header file */ +#include "../../compat/config.h" + +#include_next + +#ifndef GENMASK +#define GENMASK(h, l) \ + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) +#endif + +#endif /* _COMPAT_LINUX_BIT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/bitfield.h b/src/mlnx-ofa_kernel-5.8/include/linux/bitfield.h new file mode 100644 index 0000000..591326e --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/bitfield.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2014 Felix Fietkau + * Copyright (C) 2004 - 2009 Ivo van Doorn + */ + +#ifndef _COMPAT_LINUX_BITFIELD_H +#define _COMAPAT_LINUX_BITFIELD_H + +#include "../../compat/config.h" + +#ifdef HAVE_BITFIELD_H +#include_next +#else +#include +#include + +/* + * Bitfield access macros + * + * FIELD_{GET,PREP} macros take as first parameter shifted mask + * from which they extract the base mask and shift amount. + * Mask must be a compilation time constant. + * + * Example: + * + * #define REG_FIELD_A GENMASK(6, 0) + * #define REG_FIELD_B BIT(7) + * #define REG_FIELD_C GENMASK(15, 8) + * #define REG_FIELD_D GENMASK(31, 16) + * + * Get: + * a = FIELD_GET(REG_FIELD_A, reg); + * b = FIELD_GET(REG_FIELD_B, reg); + * + * Set: + * reg = FIELD_PREP(REG_FIELD_A, 1) | + * FIELD_PREP(REG_FIELD_B, 0) | + * FIELD_PREP(REG_FIELD_C, c) | + * FIELD_PREP(REG_FIELD_D, 0x40); + * + * Modify: + * reg &= ~REG_FIELD_C; + * reg |= FIELD_PREP(REG_FIELD_C, c); + */ + +#define __bf_shf(x) (__builtin_ffsll(x) - 1) + +#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ + ({ \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ + _pfx "mask is not constant"); \ + BUILD_BUG_ON_MSG((_mask) == 0, _pfx "mask is zero"); \ + BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ + ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ + _pfx "value too large for the field"); \ + BUILD_BUG_ON_MSG((_mask) > (typeof(_reg))~0ull, \ + _pfx "type of reg too small for mask"); \ + __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ + (1ULL << __bf_shf(_mask))); \ + }) + +/** + * FIELD_MAX() - produce the maximum value representable by a field + * @_mask: shifted mask defining the field's length and position + * + * FIELD_MAX() returns the maximum value that can be held in the field + * specified by @_mask. + */ +#define FIELD_MAX(_mask) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \ + (typeof(_mask))((_mask) >> __bf_shf(_mask)); \ + }) + +/** + * FIELD_FIT() - check if value fits in the field + * @_mask: shifted mask defining the field's length and position + * @_val: value to test against the field + * + * Return: true if @_val can fit inside @_mask, false if @_val is too big. 
+ */ +#define FIELD_FIT(_mask, _val) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \ + !((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \ + }) + +/** + * FIELD_PREP() - prepare a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP() masks and shifts up the value. The result should + * be combined with other fields of the bitfield using logical OR. + */ +#define FIELD_PREP(_mask, _val) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ + ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + }) + +/** + * FIELD_GET() - extract a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_reg: value of entire bitfield + * + * FIELD_GET() extracts the field specified by @_mask from the + * bitfield passed in as @_reg by masking and shifting it down. + */ +#define FIELD_GET(_mask, _reg) \ + ({ \ + __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ + (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + }) + +extern void __compiletime_error("value doesn't fit into mask") +__field_overflow(void); +extern void __compiletime_error("bad bitfield mask") +__bad_mask(void); +static __always_inline u64 field_multiplier(u64 field) +{ + if ((field | (field - 1)) & ((field | (field - 1)) + 1)) + __bad_mask(); + return field & -field; +} +static __always_inline u64 field_mask(u64 field) +{ + return field / field_multiplier(field); +} +#define field_max(field) ((typeof(field))field_mask(field)) +#define ____MAKE_OP(type,base,to,from) \ +static __always_inline __##type type##_encode_bits(base v, base field) \ +{ \ + if (__builtin_constant_p(v) && (v & ~field_mask(field))) \ + __field_overflow(); \ + return to((v & field_mask(field)) * field_multiplier(field)); \ +} \ +static __always_inline __##type type##_replace_bits(__##type old, \ + base val, base field) \ +{ \ + return (old & ~to(field)) | type##_encode_bits(val, field); \ +} \ +static __always_inline void type##p_replace_bits(__##type *p, \ + base val, base field) \ +{ \ + *p = (*p & ~to(field)) | type##_encode_bits(val, field); \ +} \ +static __always_inline base type##_get_bits(__##type v, base field) \ +{ \ + return (from(v) & field)/field_multiplier(field); \ +} +#define __MAKE_OP(size) \ + ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \ + ____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu) \ + ____MAKE_OP(u##size,u##size,,) +____MAKE_OP(u8,u8,,) +__MAKE_OP(16) +__MAKE_OP(32) +__MAKE_OP(64) +#undef __MAKE_OP +#undef ____MAKE_OP + +#endif /* HAVE_BITFIELD_H */ +#endif /* _COMPAT_LINUX_BITFIELD_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/bitmap.h b/src/mlnx-ofa_kernel-5.8/include/linux/bitmap.h new file mode 100644 index 0000000..7e2cf1d --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/bitmap.h @@ -0,0 +1,41 @@ +#ifndef _COMPAT_LINUX_BITMAP_H +#define _COMPAT_LINUX_BITMAP_H + +#include "../../compat/config.h" + +#include_next + + +#ifndef HAVE_BITMAP_KZALLOC +#define bitmap_alloc LINUX_BACKPORT(bitmap_alloc) +extern unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); +#define bitmap_zalloc LINUX_BACKPORT(bitmap_zalloc) +extern unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); +#endif + +#ifndef HAVE_BITMAP_FREE +#define bitmap_free LINUX_BACKPORT(bitmap_free) +extern void bitmap_free(const unsigned long *bitmap); +#endif + +#ifndef HAVE_BITMAP_FROM_ARR32 +#if BITS_PER_LONG == 64 +extern void 
bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, + unsigned int nbits); +#else + +static inline void bitmap_copy_clear_tail(unsigned long *dst, + const unsigned long *src, unsigned int nbits) +{ + bitmap_copy(dst, src, nbits); + if (nbits % BITS_PER_LONG) + dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); +} + +#define bitmap_from_arr32(bitmap, buf, nbits) \ + bitmap_copy_clear_tail((unsigned long *) (bitmap), \ + (const unsigned long *) (buf), (nbits)) +#endif +#endif + +#endif /* _COMPAT_LINUX_BITMAP_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/bitops.h b/src/mlnx-ofa_kernel-5.8/include/linux/bitops.h new file mode 100644 index 0000000..b22ded2 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/bitops.h @@ -0,0 +1,17 @@ +#ifndef _COMPAT_LINUX_BITOPS_H +#define _COMPAT_LINUX_BITOPS_H + +/* Include the autogenerated header file */ +#include "../../compat/config.h" + +#include_next + +#ifndef BIT_ULL +#define BIT_ULL(nr) (1ULL << (nr)) +#endif + +#ifndef BITS_PER_TYPE +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#endif + +#endif /* _COMPAT_LINUX_BITOPS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/bits.h b/src/mlnx-ofa_kernel-5.8/include/linux/bits.h new file mode 100644 index 0000000..57b62fa --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/bits.h @@ -0,0 +1,24 @@ +#ifndef _COMPAT_LINUX_BITS_H +#define _COMPAT_LINUX_BITS_H + +/* Include the autogenerated header file */ +#include "../../compat/config.h" + +#ifdef HAVE_BITS_H +#include_next +#endif + +#include + +#ifndef GENMASK +#define GENMASK(h, l) \ + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) +#endif + +#ifndef GENMASK_ULL +#define GENMASK_ULL(h, l) \ + (((~ULL(0)) - (ULL(1) << (l)) + 1) & \ + (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) +#endif + +#endif /* _COMPAT_LINUX_BITS_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-pci.h b/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-pci.h new file mode 100644 index 0000000..77649a3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-pci.h @@ -0,0 +1,43 @@ +#ifndef _COMPAT_LINUX_BLK_MQ_PCI_H +#define _COMPAT_LINUX_BLK_MQ_PCI_H 1 + +#include "../../compat/config.h" + +#ifdef HAVE_BLK_MQ_PCI_H +#include_next +#endif + +#if defined(HAVE_BLK_MQ_OPS_MAP_QUEUES) && \ + !defined(HAVE_BLK_MQ_PCI_MAP_QUEUES_3_ARGS) && \ + defined(HAVE_PCI_IRQ_GET_AFFINITY) + +#include +#include + +static inline +int __blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, + int offset) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = pci_irq_get_affinity(pdev, queue + offset); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; + +fallback: + WARN_ON_ONCE(set->nr_hw_queues > 1); + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; + return 0; +} +#endif + +#endif /* _COMPAT_LINUX_BLK_MQ_PCI_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-rdma.h b/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-rdma.h new file mode 100644 index 0000000..48e9b00 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq-rdma.h @@ -0,0 +1,20 @@ +#ifndef _LINUX_BLK_MQ_RDMA_H +#define _LINUX_BLK_MQ_RDMA_H + +#include "../../compat/config.h" + +#ifdef HAVE_BLK_MQ_TAG_SET_HAS_MAP +struct blk_mq_tag_set; +struct ib_device; + +#define blk_mq_rdma_map_queues LINUX_BACKPORT(blk_mq_rdma_map_queues) +#ifdef HAVE_BLK_MQ_RDMA_MAP_QUEUES_MAP +int 
blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, + struct ib_device *dev, int first_vec); +#else +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec); +#endif +#endif /* HAVE_BLK_MQ_TAG_SET_HAS_MAP */ + +#endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq.h b/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq.h new file mode 100644 index 0000000..1c9f5ea --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/blk-mq.h @@ -0,0 +1,172 @@ +#ifndef _COMPAT_LINUX_BLK_MQ_H +#define _COMPAT_LINUX_BLK_MQ_H + +#include "../../compat/config.h" + +#include_next +#ifndef HAVE_BLK_MQ_TAGSET_WAIT_COMPLETED_REQUEST +#include +#endif + +#ifndef HAVE_BLK_MQ_MAP_QUEUES +int blk_mq_map_queues(struct blk_mq_tag_set *set); +#endif + +#ifndef HAVE_BLK_MQ_FREEZE_QUEUE_WAIT_TIMEOUT +static inline int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, + unsigned long timeout) +{ + return wait_event_timeout(q->mq_freeze_wq, +#ifdef HAVE_REQUEST_QUEUE_Q_USAGE_COUNTER + percpu_ref_is_zero(&q->q_usage_counter), +#else + percpu_ref_is_zero(&q->mq_usage_counter), +#endif + timeout); +} +#endif + +#ifndef HAVE_BLK_MQ_FREEZE_QUEUE_WAIT +static inline void blk_mq_freeze_queue_wait(struct request_queue *q) +{ +#ifdef HAVE_REQUEST_QUEUE_Q_USAGE_COUNTER + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); +#else + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); +#endif +} +#endif + +#if !defined(HAVE_BLK_MQ_TAGSET_BUSY_ITER) && \ + defined(HAVE_BLK_MQ_ALL_TAG_BUSY_ITER) +static inline void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, + busy_tag_iter_fn *fn, void *priv) +{ + int i; + + for (i = 0; i < tagset->nr_hw_queues; i++) { + if (tagset->tags && tagset->tags[i]) + blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv); + } +} +#endif + +#if !defined(HAVE_REQUEST_TO_QC_T) && defined(HAVE_BLK_TYPES_REQ_HIPRI) +static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag != -1) + return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); + + return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) | + BLK_QC_T_INTERNAL; +} +#endif + +#ifndef HAVE_BLK_STATUS_T + +typedef int blk_status_t; +#define BLK_STS_OK BLK_MQ_RQ_QUEUE_OK +#define BLK_STS_RESOURCE BLK_MQ_RQ_QUEUE_BUSY +#define BLK_STS_IOERR BLK_MQ_RQ_QUEUE_ERROR + +#define BLK_STS_NOSPC -ENOSPC +#define BLK_STS_NOTSUPP -EOPNOTSUPP +#define BLK_STS_MEDIUM -ENODATA +#define BLK_STS_TIMEOUT -ETIMEDOUT +#define BLK_STS_TRANSPORT -ENOLINK +#define BLK_STS_TARGET -EREMOTEIO +#define BLK_STS_NEXUS -EBADE +#define BLK_STS_PROTECTION -EILSEQ + +static inline int blk_status_to_errno(blk_status_t status) +{ + return status; +} + +#endif /* HAVE_BLK_STATUS_T */ + +#ifndef HAVE_BLK_PATH_ERROR +static inline bool blk_path_error(blk_status_t error) +{ + switch (error) { + case BLK_STS_NOTSUPP: + case BLK_STS_NOSPC: + case BLK_STS_TARGET: + case BLK_STS_NEXUS: + case BLK_STS_MEDIUM: + case BLK_STS_PROTECTION: + return false; + } + + /* Anything else could be a path failure, so should be retried */ + return true; +} +#endif + +#ifdef HAVE_MQ_RQ_STATE +#ifndef HAVE_BLK_MQ_SET_REQUEST_COMPLETE +static inline void blk_mq_set_request_complete(struct request *rq) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); +} +#endif + +#ifndef HAVE_BLK_MQ_REQUEST_COMPLETED +static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->state); +} + +static inline int 
blk_mq_request_completed(struct request *rq) +{ + return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; +} +#endif +#endif /* HAVE_MQ_RQ_STATE */ + +#ifndef HAVE_BLK_MQ_TAGSET_WAIT_COMPLETED_REQUEST +#ifdef HAVE_MQ_RQ_STATE +#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_3_PARAMS +static inline bool blk_mq_tagset_count_completed_rqs(struct request *rq, + void *data, bool reserved) +#elif defined HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL_2_PARAMS +static inline bool blk_mq_tagset_count_completed_rqs(struct request *rq, + void *data) +#else +static inline void blk_mq_tagset_count_completed_rqs(struct request *rq, + void *data, bool reserved) +#endif +{ + unsigned *count = data; + + if (blk_mq_request_completed(rq)) + (*count)++; +#ifdef HAVE_BLK_MQ_BUSY_TAG_ITER_FN_BOOL + return true; +#endif +} + +static inline void +blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) +{ + while (true) { + unsigned count = 0; + + blk_mq_tagset_busy_iter(tagset, + blk_mq_tagset_count_completed_rqs, &count); + if (!count) + break; + msleep(5); + } +} +#else +static inline void +blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) +{ + msleep(100); +} +#endif /* HAVE_MQ_RQ_STATE */ +#endif /* HAVE_BLK_MQ_TAGSET_WAIT_COMPLETED_REQUEST */ + +#endif /* _COMPAT_LINUX_BLK_MQ_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/blk_types.h b/src/mlnx-ofa_kernel-5.8/include/linux/blk_types.h new file mode 100644 index 0000000..1097163 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/blk_types.h @@ -0,0 +1,12 @@ +#ifndef _COMPAT_LINUX_BLK_TYPES_H +#define _COMPAT_LINUX_BLK_TYPES_H + +#include "../../compat/config.h" + +#include_next + +#ifndef HAVE_BLK_MQ_REQ_FLAGS_T +typedef __u32 __bitwise blk_mq_req_flags_t; +#endif + +#endif /* _COMPAT_LINUX_BLK_TYPES_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/blkdev.h b/src/mlnx-ofa_kernel-5.8/include/linux/blkdev.h new file mode 100644 index 0000000..a7a7912 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/blkdev.h @@ -0,0 +1,95 @@ +#ifndef _COMPAT_LINUX_BLKDEV_H +#define _COMPAT_LINUX_BLKDEV_H + +#include "../../compat/config.h" + +#include_next + +#ifndef SECTOR_SHIFT +#define SECTOR_SHIFT 9 +#endif +#ifndef SECTOR_SIZE +#define SECTOR_SIZE (1 << SECTOR_SHIFT) +#endif + +#ifndef rq_dma_dir +#ifdef HAVE_OP_IS_WRITE +#define rq_dma_dir(rq) \ + (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) +#else +#define rq_dma_dir(rq) \ + (rq_data_dir(rq) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE) +#endif +#endif + +#ifndef HAVE_BLK_RQ_IS_PASSTHROUGH +static inline bool blk_rq_is_passthrough(struct request *rq) +{ + return rq->cmd_type != REQ_TYPE_FS; +} +#endif + +#ifndef HAVE_BLK_QUEUE_WRITE_CACHE +#ifdef HAVE_QUEUE_FLAG_WC_FUA +static inline void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) +{ + spin_lock_irq(q->queue_lock); + if (wc) + queue_flag_set(QUEUE_FLAG_WC, q); + else + queue_flag_clear(QUEUE_FLAG_WC, q); + if (fua) + queue_flag_set(QUEUE_FLAG_FUA, q); + else + queue_flag_clear(QUEUE_FLAG_FUA, q); + spin_unlock_irq(q->queue_lock); + + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); +} +#else +static inline void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) +{ +} +#endif +#endif + +#ifndef HAVE_BLK_QUEUE_FLAG_SET +static inline void blk_queue_flag_set(unsigned int flag, struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_set(flag, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static inline void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(flag, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static inline bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) +{ + unsigned long flags; + bool res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(flag, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +#endif + +#ifndef HAVE_QUEUE_FLAG_PCI_P2PDMA +static inline unsigned int blk_queue_pci_p2pdma(struct request_queue *q) +{ + return 0; +} +#endif + +#endif /* _COMPAT_LINUX_BLKDEV_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/bpf.h b/src/mlnx-ofa_kernel-5.8/include/linux/bpf.h new file mode 100644 index 0000000..03a91c0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/bpf.h @@ -0,0 +1,56 @@ +#ifndef _COMPAT_LINUX_BPF_H +#define _COMPAT_LINUX_BPF_H + +#include "../../compat/config.h" + +#ifdef HAVE_LINUX_BPF_H +#include_next + +#if defined(HAVE_XDP_CONVERT_TO_XDP_FRAME) && \ + defined(HAVE_XDP_REDIRECT) && \ + defined(HAVE_NDO_XDP) && \ + defined(HAVE_NDO_XDP_XMIT) +#define HAVE_XDP +#else +#undef HAVE_XDP +#endif + +#if defined(HAVE_XDP_CONVERT_TO_XDP_FRAME) && \ + defined(HAVE_XDP_REDIRECT) && \ + defined(HAVE_NDO_XDP_EXTENDED) +#define HAVE_XDP_EXTENDED +#else +#undef HAVE_XDP_EXTENDED +#endif + + +/*Note - if you use HAVE_XDP_ENABLE define you should include in file you use this define*/ +#if defined(HAVE_XDP) || defined(HAVE_XDP_EXTENDED) +#define HAVE_XDP_ENABLE +#else +#undef HAVE_XDP_ENABLE +#endif + +#ifdef HAVE_XDP_SUPPORT +#ifndef HAVE_BPF_PROG_INC_EXPORTED +#define bpf_prog_inc LINUX_BACKPORT(bpf_prog_inc) +static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return bpf_prog_add(prog, 1); +} +#endif +#endif/* HAVE_XDP_SUPPORT */ + +#ifndef HAVE_BPF_PROG_SUB +struct bpf_prog; +#define bpf_prog_sub LINUX_BACKPORT(bpf_prog_sub) +void bpf_prog_sub(struct bpf_prog *prog, int i); +#endif + +#endif /* HAVE_LINUX_BPF_H */ + +#ifndef XDP_PACKET_HEADROOM +#define XDP_PACKET_HEADROOM 256 +#endif + +#endif /* _COMPAT_LINUX_BPF_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/bpf_trace.h b/src/mlnx-ofa_kernel-5.8/include/linux/bpf_trace.h new file mode 100644 index 0000000..6acf46b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/bpf_trace.h @@ -0,0 +1,11 @@ +#ifndef 
_COMPAT_LINUX_BPF_TRACE_H +#define _COMPAT_LINUX_BPF_TRACE_H + +#include "../../compat/config.h" + +#ifdef HAVE_LINUX_BPF_TRACE_H +#include +#include_next +#endif + +#endif /* _COMPAT_LINUX_BPF_TRACE_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/build_bug.h b/src/mlnx-ofa_kernel-5.8/include/linux/build_bug.h new file mode 100644 index 0000000..6c316eb --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/build_bug.h @@ -0,0 +1,17 @@ +#ifndef _COMPAT_LINUX_BUILD_BUG_H +#define _COMPAT_LINUX_BUILD_BUG_H + +/* Include the autogenerated header file */ +#include "../../compat/config.h" + +#ifdef HAVE_BUILD_BUG_H +#include_next +#endif + +/* Force a compilation error if a constant expression is not a power of 2 */ +#ifndef __BUILD_BUG_ON_NOT_POWER_OF_2 +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON(((n) & ((n) - 1)) != 0) +#endif + +#endif /* _COMPAT_LINUX_BUILD_BUG_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/cdev.h b/src/mlnx-ofa_kernel-5.8/include/linux/cdev.h new file mode 100644 index 0000000..758e53c --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/cdev.h @@ -0,0 +1,48 @@ +#ifndef _COMPAT_LINUX_CDEV_H +#define _COMPAT_LINUX_CDEV_H + +#include "../../compat/config.h" + +#include_next + +#ifndef HAVE_CDEV_SET_PARENT +#include + +#define cdev_set_parent LINUX_BACKPORT(cdev_set_parent) +static inline void cdev_set_parent(struct cdev *p, struct kobject *kobj) +{ + WARN_ON(!kobj->state_initialized); + p->kobj.parent = kobj; +} + +#define cdev_device_add LINUX_BACKPORT(cdev_device_add) +static inline int cdev_device_add(struct cdev *cdev, struct device *dev) +{ + int rc = 0; + + if (dev->devt) { + cdev_set_parent(cdev, &dev->kobj); + + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + } + + rc = device_add(dev); + if (rc) + cdev_del(cdev); + + return rc; +} + +#define cdev_device_del LINUX_BACKPORT(cdev_device_del) +static inline void cdev_device_del(struct cdev *cdev, struct device *dev) +{ + device_del(dev); + if (dev->devt) + cdev_del(cdev); +} + +#endif /* HAVE_CDEV_SET_PARENT */ + +#endif /* _COMPAT_LINUX_CDEV_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/cgroup_rdma.h b/src/mlnx-ofa_kernel-5.8/include/linux/cgroup_rdma.h new file mode 100644 index 0000000..4ba1ef3 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/cgroup_rdma.h @@ -0,0 +1,10 @@ +#ifndef _COMPAT_LINUX_CGROUP_RDMA_H +#define _COMPAT_LINUX_CGROUP_RDMA_H + +#include "../../compat/config.h" + +#ifdef HAVE_CGROUP_RDMA_H +#include_next +#endif + +#endif /* _COMPAT_LINUX_CGROUP_RDMA_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat-2.6.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat-2.6.h new file mode 100644 index 0000000..e9828e0 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat-2.6.h @@ -0,0 +1,81 @@ +#ifndef LINUX_26_COMPAT_H +#define LINUX_26_COMPAT_H + +#define LINUX_BACKPORT(__sym) backport_ ##__sym + +/* Include the autogenerated header file */ +#include "../../compat/config.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The define overwriting module_init is based on the original module_init + * which looks like this: + * #define module_init(initfn) \ + * static inline initcall_t __inittest(void) \ + * { return initfn; } \ + * int init_module(void) __attribute__((alias(#initfn))); + * + * To the call to the initfn we added the symbol dependency on compat + * to make sure that compat.ko gets loaded for any compat modules. 
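As an illustration of the override described above (foo_init is a hypothetical driver entry point, and the expansion shown only paraphrases the #define that follows), a consumer module keeps the usual idiom; only the generated init path changes:

/* What a backported driver writes, unchanged: */
static int __init foo_init(void)
{
        return 0;
}
module_init(foo_init);

/* Roughly what the compat module_init() below expands this into: */
static int __init __init_backport(void)
{
        backport_dependency_symbol();   /* pulls in a symbol from compat.ko */
        return foo_init();
}
int init_module(void) __copy(foo_init) __attribute__((alias("__init_backport")));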
+ */ +void backport_dependency_symbol(void); + +#ifndef __has_attribute +# define __has_attribute(x) __GCC4_has_attribute_##x +# define __GCC4_has_attribute___assume_aligned__ (__GNUC_MINOR__ >= 9) +# define __GCC4_has_attribute___copy__ 0 +# define __GCC4_has_attribute___designated_init__ 0 +# define __GCC4_has_attribute___externally_visible__ 1 +# define __GCC4_has_attribute___noclone__ 1 +# define __GCC4_has_attribute___nonstring__ 0 +# define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) +#endif + +#ifndef __GCC4_has_attribute___copy__ +# define __GCC4_has_attribute___copy__ 0 +#endif + +#if __has_attribute(__copy__) +# define __copy(symbol) __attribute__((__copy__(symbol))) +#else +# define __copy(symbol) +#endif + +#undef module_init +#define module_init(initfn) \ + static int __init __init_backport(void) \ + { \ + backport_dependency_symbol(); \ + return initfn(); \ + } \ + int init_module(void) __copy(initfn) __attribute__((alias("__init_backport"))); + + +/* + * Each compat file represents compatibility code for new kernel + * code introduced for *that* kernel revision. + */ + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,9,0)) +#include +#include +#include +#include +#include +#include +#endif /* LINUX_VERSION_CODE > KERNEL_VERSION(3,9,0) */ + + +#ifndef HAVE_ELFCOREHDR_ADDR_EXPORTED +#define elfcorehdr_addr LINUX_BACKPORT(elfcorehdr_addr) +#endif + +#endif /* LINUX_26_COMPAT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.10.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.10.h new file mode 100644 index 0000000..66c2d89 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.10.h @@ -0,0 +1,8 @@ +#ifndef LINUX_3_10_COMPAT_H +#define LINUX_3_10_COMPAT_H + +#include +#define random32() prandom_u32() + + +#endif /* LINUX_3_10_COMPAT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.12.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.12.h new file mode 100644 index 0000000..db4844f --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.12.h @@ -0,0 +1,16 @@ +#ifndef LINUX_3_12_COMPAT_H +#define LINUX_3_12_COMPAT_H + +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0)) + +#include + +#ifndef HAVE_UDP4_HWCSUM +#define udp4_hwcsum LINUX_BACKPORT(udp4_hwcsum) +void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); +#endif + +#endif /* (LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0)) */ +#endif /* LINUX_3_12_COMPAT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.15.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.15.h new file mode 100644 index 0000000..bb7ec03 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat-3.15.h @@ -0,0 +1,13 @@ +#ifndef LINUX_3_15_COMPAT_H +#define LINUX_3_15_COMPAT_H + +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0)) + +#define kvfree LINUX_BACKPORT(kvfree) +extern void kvfree(const void *addr); + +#endif /* (LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0)) */ + +#endif /* LINUX_3_15_COMPAT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.0.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.0.h new file mode 100644 index 0000000..b3c37aa --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.0.h @@ -0,0 +1,26 @@ +#ifndef LINUX_4_0_COMPAT_H +#define LINUX_4_0_COMPAT_H + +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0)) +#include + +#define kstrdup_const LINUX_BACKPORT(kstrdup_const) +static inline const char *kstrdup_const(const char *s, gfp_t gfp) +{ + return kstrdup(s, gfp); +} + +#ifndef 
HAVE_KFREE_CONST +#include +#include + +static inline void kfree_const(const void *x) +{ + kfree(x); +} +#endif +#endif /* (LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0)) */ + +#endif /* LINUX_4_0_COMPAT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.1.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.1.h new file mode 100644 index 0000000..2f9db39 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.1.h @@ -0,0 +1,23 @@ +#ifndef LINUX_4_1_COMPAT_H +#define LINUX_4_1_COMPAT_H + +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)) + +#include + +#define cpumask_local_spread LINUX_BACKPORT(cpumask_local_spread) + +#if NR_CPUS == 1 +static inline unsigned int cpumask_local_spread(unsigned int i, int node) +{ + return 0; +} +#else +unsigned int cpumask_local_spread(unsigned int i, int node); +#endif + +#endif /* (LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)) */ + +#endif /* LINUX_4_1_COMPAT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.10.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.10.h new file mode 100644 index 0000000..c3dd8c5 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat-4.10.h @@ -0,0 +1,102 @@ +#ifndef LINUX_4_10_COMPAT_H +#define LINUX_4_10_COMPAT_H + +#include + +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX3 +#define PCI_DEVICE_ID_MELLANOX_CONNECTX3 0x1003 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX3_PRO +#define PCI_DEVICE_ID_MELLANOX_CONNECTX3_PRO 0x1007 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTIB +#define PCI_DEVICE_ID_MELLANOX_CONNECTIB 0x1011 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX4 +#define PCI_DEVICE_ID_MELLANOX_CONNECTX4 0x1013 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX4_LX +#define PCI_DEVICE_ID_MELLANOX_CONNECTX4_LX 0x1015 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_TAVOR +#define PCI_DEVICE_ID_MELLANOX_TAVOR 0x5a44 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE +#define PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE 0x5a46 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_SINAI_OLD +#define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_SINAI +#define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT +#define PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT 0x6278 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_ARBEL +#define PCI_DEVICE_ID_MELLANOX_ARBEL 0x6282 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_HERMON_SDR +#define PCI_DEVICE_ID_MELLANOX_HERMON_SDR 0x6340 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_HERMON_DDR +#define PCI_DEVICE_ID_MELLANOX_HERMON_DDR 0x634a +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_HERMON_QDR +#define PCI_DEVICE_ID_MELLANOX_HERMON_QDR 0x6354 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_HERMON_EN +#define PCI_DEVICE_ID_MELLANOX_HERMON_EN 0x6368 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX_EN +#define PCI_DEVICE_ID_MELLANOX_CONNECTX_EN 0x6372 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_HERMON_DDR_GEN2 +#define PCI_DEVICE_ID_MELLANOX_HERMON_DDR_GEN2 0x6732 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_HERMON_QDR_GEN2 +#define PCI_DEVICE_ID_MELLANOX_HERMON_QDR_GEN2 0x673c +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_5_GEN2 +#define PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_5_GEN2 0x6746 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_HERMON_EN_GEN2 +#define PCI_DEVICE_ID_MELLANOX_HERMON_EN_GEN2 0x6750 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_T_GEN2 +#define PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_T_GEN2 0x675a +#endif + 
+#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_GEN2 +#define PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_GEN2 0x6764 +#endif + +#ifndef PCI_DEVICE_ID_MELLANOX_CONNECTX2 +#define PCI_DEVICE_ID_MELLANOX_CONNECTX2 0x676e +#endif + +#endif /* LINUX_4_10_COMPAT_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compat_fix.h b/src/mlnx-ofa_kernel-5.8/include/linux/compat_fix.h new file mode 100644 index 0000000..9c7018b --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compat_fix.h @@ -0,0 +1,55 @@ +#ifndef COMPAT_FIX_H +#define COMPAT_FIX_H + +#include "../../compat/config.h" + +#ifdef CONFIG_MLX5_ESWITCH +#ifdef CONFIG_COMPAT_CLS_FLOWER_MOD +#ifndef CONFIG_NET_SCHED_NEW +#define HAVE_UNSUPPORTED_NDO_SETUP_TC 1 +#endif +#endif + +#ifdef CONFIG_COMPAT_TCF_TUNNEL_KEY_MOD +#define HAVE_NET_TC_ACT_TC_TUNNEL_KEY_H 1 +#define HAVE_TCF_TUNNEL_INFO 1 +#endif + +#ifdef CONFIG_COMPAT_TCF_VLAN_MOD +#define HAVE_IS_TCF_VLAN 1 +#endif +#else /* CONFIG_MLX5_ESWITCH */ +#undef CONFIG_COMPAT_IP_TUNNELS +#undef CONFIG_COMPAT_TCF_GACT +#undef CONFIG_COMPAT_FLOW_DISSECTOR +#undef CONFIG_COMPAT_CLS_FLOWER_MOD +#undef CONFIG_COMPAT_TCF_TUNNEL_KEY_MOD +#undef CONFIG_COMPAT_TCF_VLAN_MOD +#endif + +#ifdef CONFIG_COMPAT_TCF_PEDIT_MOD +#define HAVE_TCF_PEDIT_TCFP_KEYS_EX 1 +#endif + +#if defined(HAVE_TC_CLS_OFFLOAD_HANDLE) && defined(CONFIG_COMPAT_RHEL_JD) +#define CONFIG_COMPAT_SKIP_OFFLOAD_HANDLE 1 +#endif + +#ifdef HAVE_FLOW_CLS_OFFLOAD +#define HAVE_TC_CLS_OFFLOAD_EXTACK 1 +#define HAVE_TC_CLSFLOWER_STATS 1 +#define HAVE_TC_CLS_FLOWER_OFFLOAD_HAS_STATS_FIELD 1 +#define HAVE_TC_CLS_FLOWER_OFFLOAD_COMMON 1 +#endif + +#if defined(HAVE_TC_CLS_FLOWER_OFFLOAD_COMMON) && \ + defined(HAVE_IS_TCF_GACT_GOTO_CHAIN) && \ + defined(HAVE_FLOWER_MULTI_MASK) +#define CONFIG_COMPAT_PRIO_CHAIN_SUPPORT 1 +#endif + +#if defined(HAVE___TC_INDR_BLOCK_CB_REGISTER) || defined(HAVE___FLOW_INDR_BLOCK_CB_REGISTER) || defined(HAVE_FLOW_BLOCK_CB) +#define HAVE_TC_INDR_API 1 +#endif + +#endif /* COMPAT_FIX_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compiler-clang.h b/src/mlnx-ofa_kernel-5.8/include/linux/compiler-clang.h new file mode 100644 index 0000000..4970f7a --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compiler-clang.h @@ -0,0 +1,26 @@ +#ifndef _COMPAT_LINUX_COMPILER_CLANG_H +#define _COMPAT_LINUX_COMPILER_CLANG_H + +#include "../../compat/config.h" + +#include_next + +#ifndef HAVE_LINUX_OVERFLOW_H + +/* + + * Not all versions of clang implement the the type-generic versions + + * of the builtin overflow checkers. Fortunately, clang implements + + * __has_builtin allowing us to avoid awkward version + + * checks. Unfortunately, we don't know which version of gcc clang + + * pretends to be, so the macro may or may not be defined. 
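A sketch of how a feature flag like the COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW define set just below is typically consumed (my_check_add_overflow is illustrative, and the open-coded fallback is only valid for unsigned operands): when the builtin is available it is used directly, otherwise the check is open-coded.

#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
#define my_check_add_overflow(a, b, d)  __builtin_add_overflow(a, b, d)
#else
#define my_check_add_overflow(a, b, d)  ({      \
        *(d) = (a) + (b);                       \
        *(d) < (a);     /* wrapped around? */   \
})
#endif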
+ + */ +#undef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +#if __has_builtin(__builtin_mul_overflow) && \ + __has_builtin(__builtin_add_overflow) && \ + __has_builtin(__builtin_sub_overflow) +#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 +#endif + +#endif /* HAVE_LINUX_OVERFLOW_H */ + +#endif /* _COMPAT_LINUX_COMPILER_CLANG_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compiler-gcc.h b/src/mlnx-ofa_kernel-5.8/include/linux/compiler-gcc.h new file mode 100644 index 0000000..9e9e314 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compiler-gcc.h @@ -0,0 +1,26 @@ +#ifndef _COMPAT_LINUX_COMPILER_GCC_H +#define _COMPAT_LINUX_COMPILER_GCC_H + +#include "../../compat/config.h" + +#include_next + +#ifndef fallthrough +# define fallthrough do {} while (0) /* fallthrough */ +#endif + +#ifndef GCC_VERSION +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#endif + +#ifndef HAVE_LINUX_OVERFLOW_H + +#if GCC_VERSION >= 50100 +#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 +#endif + +#endif /* HAVE_LINUX_OVERFLOW_H */ + +#endif /* _COMPAT_LINUX_COMPILER_GCC_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compiler-intel.h b/src/mlnx-ofa_kernel-5.8/include/linux/compiler-intel.h new file mode 100644 index 0000000..da26894 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compiler-intel.h @@ -0,0 +1,17 @@ +#ifndef _COMPAT_LINUX_COMPILER_INTEL_H +#define _COMPAT_LINUX_COMPILER_INTEL_H + +#include "../../compat/config.h" + +#include_next + +#ifndef HAVE_LINUX_OVERFLOW_H + +/* + * icc defines __GNUC__, but does not implement the builtin overflow checkers. + */ +#undef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW + +#endif /* HAVE_LINUX_OVERFLOW_H */ + +#endif /* _COMPAT_LINUX_COMPILER_INTEL_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compiler.h b/src/mlnx-ofa_kernel-5.8/include/linux/compiler.h new file mode 100644 index 0000000..c490cf8 --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compiler.h @@ -0,0 +1,54 @@ +#ifndef _COMPAT_LINUX_COMPILER_H +#define _COMPAT_LINUX_COMPILER_H + +#include "../../compat/config.h" + +#include_next +#include + +#ifndef OPTIMIZER_HIDE_VAR +/* Make the optimizer believe the variable can be manipulated arbitrarily. 
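A hedged usage sketch for the macro defined just below (checksum_loop() is illustrative, not taken from this patch): hiding the accumulator after each step forces the compiler to treat its value as unknown, so the per-iteration work cannot be folded away or specialised.

#include <linux/compiler.h>
#include <linux/types.h>

static u32 checksum_loop(const u32 *buf, int n)
{
        u32 sum = 0;
        int i;

        for (i = 0; i < n; i++) {
                sum += buf[i];
                /* Compiler must assume 'sum' may have changed arbitrarily here. */
                OPTIMIZER_HIDE_VAR(sum);
        }
        return sum;
}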
*/ +#define OPTIMIZER_HIDE_VAR(var) \ + __asm__ ("" : "=r" (var) : "0" (var)) +#endif + +#ifndef HAVE_CONST_READ_ONCE_SIZE +#define __read_once_size LINUX_BACKPORT(__read_once_size) +static __always_inline void __read_once_size(const volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(__u8 *)res = *(volatile __u8 *)p; break; + case 2: *(__u16 *)res = *(volatile __u16 *)p; break; + case 4: *(__u32 *)res = *(volatile __u32 *)p; break; +#ifdef CONFIG_64BIT + case 8: *(__u64 *)res = *(volatile __u64 *)p; break; +#endif + default: + barrier(); + __builtin_memcpy((void *)res, (const void *)p, size); + barrier(); + } +} +#endif + +#ifndef __percpu +#define __percpu +#endif + +#ifndef __aligned +#define __aligned(x) __attribute__((aligned(x))) +#endif + +#ifndef READ_ONCE +#define READ_ONCE(val) ACCESS_ONCE(val) +#elif !defined (HAVE_CONST_READ_ONCE_SIZE) +#undef READ_ONCE +#define READ_ONCE(x) \ + ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; }) +#endif + +#ifndef WRITE_ONCE +#define WRITE_ONCE(var, val) { ACCESS_ONCE(var) = val; } +#endif + +#endif /* _COMPAT_LINUX_COMPILER_H */ diff --git a/src/mlnx-ofa_kernel-5.8/include/linux/compiler_attributes.h b/src/mlnx-ofa_kernel-5.8/include/linux/compiler_attributes.h new file mode 100644 index 0000000..80256cf --- /dev/null +++ b/src/mlnx-ofa_kernel-5.8/include/linux/compiler_attributes.h @@ -0,0 +1,28 @@ +#ifndef _COMPAT_LINUX_COMPILER_ATTRIBUTES_H +#define _COMPAT_LINUX_COMPILER_ATTRIBUTES_H + +#include "../../compat/config.h" + +#include_next +#include + +#ifndef __GCC4_has_attribute___fallthrough__ +# define __GCC4_has_attribute___fallthrough__ 0 + /* Add the pseudo keyword 'fallthrough' so case statement blocks + * must end with any of these keywords: + * break; + * fallthrough; + * goto